Merge pull request #19 from jerjou/master

Refactor some json parsers into a common class.
This commit is contained in:
Jay Goel 2020-08-09 09:11:23 -04:00 committed by GitHub
commit 51b132e742
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 69 additions and 217 deletions

View file

@ -35,4 +35,4 @@ def getParser(domain):
if not parser:
return None
return parser()
return parser(domain)

View file

@ -1,37 +1,4 @@
import json
from parsers.recipe import WpJsonRecipe
from parsers.recipe import Recipe
class Bowlofdelicious(Recipe):
def get_json_recipe(self, d):
recipe = {}
for r in d['@graph']:
if not isinstance(r['@type'], str):
continue
if r['@type'].lower() != 'recipe':
continue
recipe['name'] = r['name']
recipe['description'] = r['description']
recipe['ingredients'] = r['recipeIngredient']
recipe['instructions'] = [i['text'] for i in r['recipeInstructions']]
recipe['image'] = r['image'][0]
return recipe
def Parse(self, url):
recipe = {}
recipe['url'] = url
recipe['source'] = 'bowlofdelicious.com'
soup = self.fetch_soup(url)
result = soup.find('script', {'type': 'application/ld+json'})
d = json.loads(result.contents[0])
parsed_recipe = self.get_json_recipe(d)
recipe.update(parsed_recipe)
return recipe
class Bowlofdelicious(WpJsonRecipe):
pass

View file

@ -1,37 +1,4 @@
import json
from parsers.recipe import WpJsonRecipe
from parsers.recipe import Recipe
class Gimmesomeoven(Recipe):
def get_json_recipe(self, d):
recipe = {}
for r in d['@graph']:
if not isinstance(r['@type'], str):
continue
if r['@type'].lower() != 'recipe':
continue
recipe['name'] = r['name']
recipe['description'] = r['description']
recipe['ingredients'] = r['recipeIngredient']
recipe['instructions'] = [i['text'] for i in r['recipeInstructions']]
recipe['image'] = r['image'][0]
return recipe
def Parse(self, url):
recipe = {}
recipe['url'] = url
recipe['source'] = 'gimmesomeoven.com'
soup = self.fetch_soup(url)
result = soup.find('script', {'type': 'application/ld+json'})
d = json.loads(result.contents[0])
parsed_recipe = self.get_json_recipe(d)
recipe.update(parsed_recipe)
return recipe
class Gimmesomeoven(WpJsonRecipe):
pass

View file

@ -1,37 +1,4 @@
import json
from parsers.recipe import WpJsonRecipe
from parsers.recipe import Recipe
class Hostthetoast(Recipe):
def get_json_recipe(self, d):
recipe = {}
for r in d['@graph']:
if not isinstance(r['@type'], str):
continue
if r['@type'].lower() != 'recipe':
continue
recipe['name'] = r['name']
recipe['description'] = r['description']
recipe['ingredients'] = r['recipeIngredient']
recipe['instructions'] = [i['text'] for i in r['recipeInstructions']]
recipe['image'] = r['image'][0]
return recipe
def Parse(self, url):
recipe = {}
recipe['url'] = url
recipe['source'] = 'hostthetoast.com'
soup = self.fetch_soup(url)
result = soup.find('script', {'type': 'application/ld+json'})
d = json.loads(result.contents[0])
parsed_recipe = self.get_json_recipe(d)
recipe.update(parsed_recipe)
return recipe
class Hostthetoast(WpJsonRecipe):
pass

View file

@ -1,37 +1,4 @@
import json
from parsers.recipe import WpJsonRecipe
from parsers.recipe import Recipe
class Lovingitvegan(Recipe):
def get_json_recipe(self, d):
recipe = {}
for r in d['@graph']:
if not isinstance(r['@type'], str):
continue
if r['@type'].lower() != 'recipe':
continue
recipe['name'] = r['name']
recipe['description'] = r['description']
recipe['ingredients'] = r['recipeIngredient']
recipe['instructions'] = [i['text'] for i in r['recipeInstructions']]
recipe['image'] = r['image'][0]
return recipe
def Parse(self, url):
recipe = {}
recipe['url'] = url
recipe['source'] = 'lovingitvegan.com'
soup = self.fetch_soup(url)
result = soup.find('script', {'type': 'application/ld+json'})
d = json.loads(result.contents[0])
parsed_recipe = self.get_json_recipe(d)
recipe.update(parsed_recipe)
return recipe
class Lovingitvegan(WpJsonRecipe):
pass

View file

@ -1,37 +1,4 @@
import json
from parsers.recipe import WpJsonRecipe
from parsers.recipe import Recipe
class Minimalistbaker(Recipe):
def get_json_recipe(self, d):
recipe = {}
for r in d['@graph']:
if not isinstance(r['@type'], str):
continue
if r['@type'].lower() != 'recipe':
continue
recipe['name'] = r['name']
recipe['description'] = r['description']
recipe['ingredients'] = r['recipeIngredient']
recipe['instructions'] = [i['text'] for i in r['recipeInstructions']]
recipe['image'] = r['image'][0]
return recipe
def Parse(self, url):
recipe = {}
recipe['url'] = url
recipe['source'] = 'minimalistbaker.com'
soup = self.fetch_soup(url)
result = soup.find('script', {'type': 'application/ld+json'})
d = json.loads(result.contents[0])
parsed_recipe = self.get_json_recipe(d)
recipe.update(parsed_recipe)
return recipe
class Minimalistbaker(WpJsonRecipe):
pass

View file

@ -1,9 +1,13 @@
import json
import requests
from bs4 import BeautifulSoup
class Recipe(object):
def __init__(self, domain):
pass
def fetch_html(self, url):
#fd = open('allrecipes3.html', 'r')
#return fd.read()
@ -13,5 +17,44 @@ class Recipe(object):
def fetch_soup(self, url):
html = self.fetch_html(url)
soup = BeautifulSoup(html)
return soup
soup = BeautifulSoup(html, features='lxml')
return soup
class WpJsonRecipe(Recipe):
    """Some wordpress sites provide the recipe in a convenient json format.

    The recipe is read from the first ``<script type="application/ld+json">``
    element of the page and reduced to a plain dict with the keys ``name``,
    ``description``, ``ingredients``, ``instructions`` and ``image``.
    """

    def __init__(self, domain):
        """Store ``domain``; ``Parse`` reports it as the recipe's ``source``."""
        super().__init__(domain)
        self.domain = domain

    def fetch_json(self, url):
        """Fetch ``url`` and return the recipe dict extracted from its JSON-LD.

        Returns an empty dict when the page has no (or an empty) ld+json
        script, instead of failing on a ``None`` lookup result.
        """
        soup = self.fetch_soup(url)
        result = soup.find('script', {'type': 'application/ld+json'})
        if result is None or not result.contents:
            return {}
        return self.get_json_recipe(json.loads(result.contents[0]))

    @staticmethod
    def _is_recipe(node_type):
        """Return True if a JSON-LD ``@type`` value names a Recipe.

        ``@type`` may be a single string or a list of strings.
        """
        if isinstance(node_type, str):
            return node_type.lower() == 'recipe'
        if isinstance(node_type, (list, tuple)):
            return any(
                isinstance(t, str) and t.lower() == 'recipe'
                for t in node_type)
        return False

    @staticmethod
    def _first_image(image):
        """Pick one image out of the ``image`` property.

        A list yields its first element unchanged, a plain string is returned
        as is (indexing it would only give its first character), and a dict
        yields its ``url`` entry. Anything else, including a missing or empty
        value, gives ``None``.
        """
        if isinstance(image, str):
            return image
        if isinstance(image, dict):
            return image.get('url')
        if isinstance(image, (list, tuple)) and image:
            return image[0]
        return None

    @classmethod
    def _instruction_texts(cls, instructions):
        """Flatten ``recipeInstructions`` into a list of step texts.

        Accepts a single string, or a list whose items are strings, dicts
        with a ``text`` entry, or dicts grouping further steps under
        ``itemListElement``.
        """
        if isinstance(instructions, (str, dict)):
            instructions = [instructions]
        texts = []
        for step in instructions or []:
            if isinstance(step, str):
                texts.append(step)
            elif isinstance(step, dict):
                if 'text' in step:
                    texts.append(step['text'])
                elif 'itemListElement' in step:
                    texts.extend(
                        cls._instruction_texts(step['itemListElement']))
        return texts

    def get_json_recipe(self, d):
        """Extract the recipe fields from a decoded JSON-LD document ``d``.

        ``d`` may be a dict with an ``@graph`` list, a bare node dict, or a
        list of nodes. Nodes whose ``@type`` is not a Recipe are skipped; if
        several Recipe nodes are present, later ones overwrite earlier ones.

        Returns:
            A dict with ``name``, ``description``, ``ingredients``,
            ``instructions`` and ``image``, or an empty dict when no Recipe
            node is found.

        Raises:
            KeyError: if a Recipe node has no ``name``.
        """
        if isinstance(d, dict):
            # Without an @graph wrapper the document itself is the only node.
            nodes = d.get('@graph', [d])
            if isinstance(nodes, dict):
                nodes = [nodes]
        elif isinstance(d, list):
            nodes = d
        else:
            nodes = []

        recipe = {}
        for r in nodes:
            if not isinstance(r, dict) or not self._is_recipe(r.get('@type')):
                continue
            recipe['name'] = r['name']
            recipe['description'] = r.get('description', '')
            recipe['ingredients'] = r.get('recipeIngredient', [])
            recipe['instructions'] = self._instruction_texts(
                r.get('recipeInstructions', []))
            recipe['image'] = self._first_image(r.get('image'))
        return recipe

    def Parse(self, url):
        """Return the recipe at ``url`` as a dict.

        The result always carries ``url`` and ``source`` (the domain given to
        the constructor), updated with whatever ``fetch_json`` extracted.
        """
        recipe = {
            'url': url,
            'source': self.domain,
        }
        parsed_recipe = self.fetch_json(url)
        recipe.update(parsed_recipe)
        return recipe

View file

@ -1,39 +1,13 @@
import json
from parsers.recipe import Recipe
class Thewoksoflife(Recipe):
from parsers.recipe import WpJsonRecipe
class Thewoksoflife(WpJsonRecipe):
def get_json_recipe(self, d):
recipe = {}
for r in d['@graph']:
if not isinstance(r['@type'], str):
continue
if r['@type'].lower() != 'recipe':
continue
recipe['name'] = r['name']
recipe['description'] = r['description']
recipe['ingredients'] = [
ingredient.replace('((', '(').replace('))', ')')
for ingredient in r.get('recipeIngredient', [])]
recipe['instructions'] = [i['text'] for i in r['recipeInstructions']]
recipe['image'] = r['image'][0]
return recipe
def Parse(self, url):
recipe = {}
recipe['url'] = url
recipe['source'] = 'thewoksoflife.com'
soup = self.fetch_soup(url)
result = soup.find('script', {'type': 'application/ld+json'})
d = json.loads(result.contents[0])
parsed_recipe = self.get_json_recipe(d)
recipe.update(parsed_recipe)
recipe = super().get_json_recipe(d)
# thewoksoflife.com for some reason has double parentheses in its
# ingredients. Remove them.
recipe['ingredients'] = [
ingredient.replace('((', '(').replace('))', ')')
for ingredient in recipe.get('ingredients', [])]
return recipe