Merge pull request #19 from jerjou/master
Refactor some json parsers into a common class.
This commit is contained in:
commit
51b132e742
8 changed files with 69 additions and 217 deletions
|
@ -35,4 +35,4 @@ def getParser(domain):
|
|||
if not parser:
|
||||
return None
|
||||
|
||||
return parser()
|
||||
return parser(domain)
|
||||
|
|
|
@ -1,37 +1,4 @@
|
|||
import json
|
||||
from parsers.recipe import WpJsonRecipe
|
||||
|
||||
from parsers.recipe import Recipe
|
||||
|
||||
class Bowlofdelicious(Recipe):
|
||||
|
||||
def get_json_recipe(self, d):
|
||||
recipe = {}
|
||||
for r in d['@graph']:
|
||||
if not isinstance(r['@type'], str):
|
||||
continue
|
||||
|
||||
if r['@type'].lower() != 'recipe':
|
||||
continue
|
||||
|
||||
recipe['name'] = r['name']
|
||||
recipe['description'] = r['description']
|
||||
recipe['ingredients'] = r['recipeIngredient']
|
||||
recipe['instructions'] = [i['text'] for i in r['recipeInstructions']]
|
||||
recipe['image'] = r['image'][0]
|
||||
|
||||
return recipe
|
||||
|
||||
def Parse(self, url):
|
||||
recipe = {}
|
||||
recipe['url'] = url
|
||||
recipe['source'] = 'bowlofdelicious.com'
|
||||
|
||||
soup = self.fetch_soup(url)
|
||||
|
||||
result = soup.find('script', {'type': 'application/ld+json'})
|
||||
|
||||
d = json.loads(result.contents[0])
|
||||
parsed_recipe = self.get_json_recipe(d)
|
||||
recipe.update(parsed_recipe)
|
||||
|
||||
return recipe
|
||||
class Bowlofdelicious(WpJsonRecipe):
    """Recipe parser that relies entirely on the inherited WpJsonRecipe logic."""
|
||||
|
|
|
@ -1,37 +1,4 @@
|
|||
import json
|
||||
from parsers.recipe import WpJsonRecipe
|
||||
|
||||
from parsers.recipe import Recipe
|
||||
|
||||
class Gimmesomeoven(Recipe):
|
||||
|
||||
def get_json_recipe(self, d):
|
||||
recipe = {}
|
||||
for r in d['@graph']:
|
||||
if not isinstance(r['@type'], str):
|
||||
continue
|
||||
|
||||
if r['@type'].lower() != 'recipe':
|
||||
continue
|
||||
|
||||
recipe['name'] = r['name']
|
||||
recipe['description'] = r['description']
|
||||
recipe['ingredients'] = r['recipeIngredient']
|
||||
recipe['instructions'] = [i['text'] for i in r['recipeInstructions']]
|
||||
recipe['image'] = r['image'][0]
|
||||
|
||||
return recipe
|
||||
|
||||
def Parse(self, url):
|
||||
recipe = {}
|
||||
recipe['url'] = url
|
||||
recipe['source'] = 'gimmesomeoven.com'
|
||||
|
||||
soup = self.fetch_soup(url)
|
||||
|
||||
result = soup.find('script', {'type': 'application/ld+json'})
|
||||
|
||||
d = json.loads(result.contents[0])
|
||||
parsed_recipe = self.get_json_recipe(d)
|
||||
recipe.update(parsed_recipe)
|
||||
|
||||
return recipe
|
||||
class Gimmesomeoven(WpJsonRecipe):
    """Recipe parser that relies entirely on the inherited WpJsonRecipe logic."""
|
||||
|
|
|
@ -1,37 +1,4 @@
|
|||
import json
|
||||
from parsers.recipe import WpJsonRecipe
|
||||
|
||||
from parsers.recipe import Recipe
|
||||
|
||||
class Hostthetoast(Recipe):
|
||||
|
||||
def get_json_recipe(self, d):
|
||||
recipe = {}
|
||||
for r in d['@graph']:
|
||||
if not isinstance(r['@type'], str):
|
||||
continue
|
||||
|
||||
if r['@type'].lower() != 'recipe':
|
||||
continue
|
||||
|
||||
recipe['name'] = r['name']
|
||||
recipe['description'] = r['description']
|
||||
recipe['ingredients'] = r['recipeIngredient']
|
||||
recipe['instructions'] = [i['text'] for i in r['recipeInstructions']]
|
||||
recipe['image'] = r['image'][0]
|
||||
|
||||
return recipe
|
||||
|
||||
def Parse(self, url):
|
||||
recipe = {}
|
||||
recipe['url'] = url
|
||||
recipe['source'] = 'hostthetoast.com'
|
||||
|
||||
soup = self.fetch_soup(url)
|
||||
|
||||
result = soup.find('script', {'type': 'application/ld+json'})
|
||||
|
||||
d = json.loads(result.contents[0])
|
||||
parsed_recipe = self.get_json_recipe(d)
|
||||
recipe.update(parsed_recipe)
|
||||
|
||||
return recipe
|
||||
class Hostthetoast(WpJsonRecipe):
    """Recipe parser that relies entirely on the inherited WpJsonRecipe logic."""
|
||||
|
|
|
@ -1,37 +1,4 @@
|
|||
import json
|
||||
from parsers.recipe import WpJsonRecipe
|
||||
|
||||
from parsers.recipe import Recipe
|
||||
|
||||
class Lovingitvegan(Recipe):
|
||||
|
||||
def get_json_recipe(self, d):
|
||||
recipe = {}
|
||||
for r in d['@graph']:
|
||||
if not isinstance(r['@type'], str):
|
||||
continue
|
||||
|
||||
if r['@type'].lower() != 'recipe':
|
||||
continue
|
||||
|
||||
recipe['name'] = r['name']
|
||||
recipe['description'] = r['description']
|
||||
recipe['ingredients'] = r['recipeIngredient']
|
||||
recipe['instructions'] = [i['text'] for i in r['recipeInstructions']]
|
||||
recipe['image'] = r['image'][0]
|
||||
|
||||
return recipe
|
||||
|
||||
def Parse(self, url):
|
||||
recipe = {}
|
||||
recipe['url'] = url
|
||||
recipe['source'] = 'lovingitvegan.com'
|
||||
|
||||
soup = self.fetch_soup(url)
|
||||
|
||||
result = soup.find('script', {'type': 'application/ld+json'})
|
||||
|
||||
d = json.loads(result.contents[0])
|
||||
parsed_recipe = self.get_json_recipe(d)
|
||||
recipe.update(parsed_recipe)
|
||||
|
||||
return recipe
|
||||
class Lovingitvegan(WpJsonRecipe):
    """Recipe parser that relies entirely on the inherited WpJsonRecipe logic."""
|
||||
|
|
|
@ -1,37 +1,4 @@
|
|||
import json
|
||||
from parsers.recipe import WpJsonRecipe
|
||||
|
||||
from parsers.recipe import Recipe
|
||||
|
||||
class Minimalistbaker(Recipe):
|
||||
|
||||
def get_json_recipe(self, d):
|
||||
recipe = {}
|
||||
for r in d['@graph']:
|
||||
if not isinstance(r['@type'], str):
|
||||
continue
|
||||
|
||||
if r['@type'].lower() != 'recipe':
|
||||
continue
|
||||
|
||||
recipe['name'] = r['name']
|
||||
recipe['description'] = r['description']
|
||||
recipe['ingredients'] = r['recipeIngredient']
|
||||
recipe['instructions'] = [i['text'] for i in r['recipeInstructions']]
|
||||
recipe['image'] = r['image'][0]
|
||||
|
||||
return recipe
|
||||
|
||||
def Parse(self, url):
|
||||
recipe = {}
|
||||
recipe['url'] = url
|
||||
recipe['source'] = 'minimalistbaker.com'
|
||||
|
||||
soup = self.fetch_soup(url)
|
||||
|
||||
result = soup.find('script', {'type': 'application/ld+json'})
|
||||
|
||||
d = json.loads(result.contents[0])
|
||||
parsed_recipe = self.get_json_recipe(d)
|
||||
recipe.update(parsed_recipe)
|
||||
|
||||
return recipe
|
||||
class Minimalistbaker(WpJsonRecipe):
    """Recipe parser that relies entirely on the inherited WpJsonRecipe logic."""
|
||||
|
|
|
@ -1,9 +1,13 @@
|
|||
import json
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
class Recipe(object):
|
||||
|
||||
def __init__(self, domain):
|
||||
pass
|
||||
|
||||
def fetch_html(self, url):
|
||||
#fd = open('allrecipes3.html', 'r')
|
||||
#return fd.read()
|
||||
|
@ -13,5 +17,44 @@ class Recipe(object):
|
|||
|
||||
def fetch_soup(self, url):
|
||||
html = self.fetch_html(url)
|
||||
soup = BeautifulSoup(html)
|
||||
return soup
|
||||
soup = BeautifulSoup(html, features='lxml')
|
||||
return soup
|
||||
|
||||
class WpJsonRecipe(Recipe):
    """Some wordpress sites provide the recipe in a convenient json format.

    The recipe is read from the first ``<script type="application/ld+json">``
    element of the page; its ``@graph`` list is scanned for nodes whose
    ``@type`` is ``Recipe`` (compared case-insensitively).
    """

    def __init__(self, domain):
        """Store *domain*; it is reported as the ``source`` of parsed recipes."""
        super().__init__(domain)
        self.domain = domain

    def fetch_json(self, url):
        """Fetch *url* and return the recipe dict extracted from its JSON-LD.

        Only the first ``application/ld+json`` script on the page is read.
        NOTE(review): ``soup.find`` yields ``None`` when the page has no such
        script, in which case the attribute access below raises
        ``AttributeError`` -- confirm whether callers want a clearer error.
        """
        soup = self.fetch_soup(url)
        result = soup.find('script', {'type': 'application/ld+json'})
        return self.get_json_recipe(json.loads(result.contents[0]))

    @staticmethod
    def _first_image(image):
        """Return a single image entry from a JSON-LD ``image`` value."""
        # A bare URL string must be kept whole: indexing it with [0] would
        # return only its first character.
        if isinstance(image, str):
            return image
        return image[0]

    def get_json_recipe(self, d):
        """Extract the recipe fields from the decoded JSON-LD document *d*.

        Returns a dict with ``name``, ``description``, ``ingredients``,
        ``instructions`` and ``image``. The dict is empty when ``d['@graph']``
        holds no ``Recipe`` node; when it holds several, later nodes overwrite
        the fields of earlier ones.

        Raises ``KeyError`` when *d* has no ``@graph`` key or a recipe node
        lacks one of the fields read below.
        """
        recipe = {}
        for r in d['@graph']:
            # Nodes without '@type', or whose '@type' is not a plain string
            # (e.g. a list of types), are not considered.
            node_type = r.get('@type')
            if not isinstance(node_type, str):
                continue

            if node_type.lower() != 'recipe':
                continue

            recipe['name'] = r['name']
            recipe['description'] = r['description']
            recipe['ingredients'] = r['recipeIngredient']
            recipe['instructions'] = [i['text'] for i in r['recipeInstructions']]
            recipe['image'] = self._first_image(r['image'])

        return recipe

    def Parse(self, url):
        """Return the recipe found at *url* as a dict.

        The result always carries ``url`` and ``source`` (the domain given at
        construction); the fields returned by ``fetch_json`` are merged on top.
        """
        recipe = {
            'url': url,
            'source': self.domain,
        }

        parsed_recipe = self.fetch_json(url)
        recipe.update(parsed_recipe)

        return recipe
|
||||
|
|
|
@ -1,39 +1,13 @@
|
|||
import json
|
||||
|
||||
from parsers.recipe import Recipe
|
||||
|
||||
class Thewoksoflife(Recipe):
|
||||
from parsers.recipe import WpJsonRecipe
|
||||
|
||||
class Thewoksoflife(WpJsonRecipe):
|
||||
def get_json_recipe(self, d):
|
||||
recipe = {}
|
||||
for r in d['@graph']:
|
||||
if not isinstance(r['@type'], str):
|
||||
continue
|
||||
|
||||
if r['@type'].lower() != 'recipe':
|
||||
continue
|
||||
|
||||
recipe['name'] = r['name']
|
||||
recipe['description'] = r['description']
|
||||
recipe['ingredients'] = [
|
||||
ingredient.replace('((', '(').replace('))', ')')
|
||||
for ingredient in r.get('recipeIngredient', [])]
|
||||
recipe['instructions'] = [i['text'] for i in r['recipeInstructions']]
|
||||
recipe['image'] = r['image'][0]
|
||||
|
||||
return recipe
|
||||
|
||||
def Parse(self, url):
|
||||
recipe = {}
|
||||
recipe['url'] = url
|
||||
recipe['source'] = 'thewoksoflife.com'
|
||||
|
||||
soup = self.fetch_soup(url)
|
||||
|
||||
result = soup.find('script', {'type': 'application/ld+json'})
|
||||
|
||||
d = json.loads(result.contents[0])
|
||||
parsed_recipe = self.get_json_recipe(d)
|
||||
recipe.update(parsed_recipe)
|
||||
|
||||
recipe = super().get_json_recipe(d)
|
||||
# thewoksoflife.com for some reason has double parentheses in its
|
||||
# ingredients. Remove them.
|
||||
recipe['ingredients'] = [
|
||||
ingredient.replace('((', '(').replace('))', ')')
|
||||
for ingredient in recipe.get('ingredients', [])]
|
||||
return recipe
|
||||
|
|
Loading…
Reference in a new issue