mirror of
https://github.com/TandoorRecipes/recipes.git
synced 2026-01-01 04:10:06 -05:00
updated to handle new behavior of recipe_scrapers
This commit is contained in:
@@ -17,53 +17,84 @@ def get_from_scraper(scrape, space):
|
|||||||
|
|
||||||
recipe_json = {}
|
recipe_json = {}
|
||||||
try:
|
try:
|
||||||
recipe_json['name'] = parse_name(scrape.title() or scrape.schema.data.get('name') or '')
|
recipe_json['name'] = parse_name(scrape.title() or None)
|
||||||
except (TypeError, AttributeError,ElementNotFoundInHtml):
|
except (TypeError, AttributeError, ElementNotFoundInHtml, NotImplementedError):
|
||||||
recipe_json['name'] = ''
|
recipe_json['name'] = None
|
||||||
|
if not recipe_json['name']:
|
||||||
|
try:
|
||||||
|
recipe_json['name'] = scrape.schema.data.get('name') or ''
|
||||||
|
except Exception:
|
||||||
|
recipe_json['name'] = ''
|
||||||
|
|
||||||
try:
|
try:
|
||||||
description = scrape.schema.data.get("description") or ''
|
description = scrape.schema.data.get("description") or ''
|
||||||
except (AttributeError,ElementNotFoundInHtml):
|
except (AttributeError, ElementNotFoundInHtml, NotImplementedError, SchemaOrgException):
|
||||||
description = ''
|
description = ''
|
||||||
|
|
||||||
recipe_json['description'] = parse_description(description)
|
recipe_json['description'] = parse_description(description)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
servings = scrape.yields()
|
servings = scrape.yields() or None
|
||||||
servings = int(re.findall(r'\b\d+\b', servings)[0])
|
except Exception:
|
||||||
except (AttributeError,ElementNotFoundInHtml, ValueError, IndexError):
|
servings = None
|
||||||
servings = 1
|
if not servings:
|
||||||
|
try:
|
||||||
|
servings = scrape.schema.data.get('recipeYield') or 1
|
||||||
|
except Exception:
|
||||||
|
servings = 1
|
||||||
|
if type(servings) != int:
|
||||||
|
try:
|
||||||
|
servings = int(re.findall(r'\b\d+\b', servings)[0])
|
||||||
|
except Exception:
|
||||||
|
servings = 1
|
||||||
recipe_json['servings'] = servings
|
recipe_json['servings'] = servings
|
||||||
|
|
||||||
try:
|
try:
|
||||||
recipe_json['prepTime'] = get_minutes(scrape.schema.data.get("prepTime")) or 0
|
recipe_json['prepTime'] = get_minutes(scrape.schema.data.get("prepTime")) or 0
|
||||||
except (AttributeError, ElementNotFoundInHtml):
|
except Exception:
|
||||||
recipe_json['prepTime'] = 0
|
recipe_json['prepTime'] = 0
|
||||||
try:
|
try:
|
||||||
recipe_json['cookTime'] = get_minutes(scrape.schema.data.get("cookTime")) or 0
|
recipe_json['cookTime'] = get_minutes(scrape.schema.data.get("cookTime")) or 0
|
||||||
except (AttributeError, ElementNotFoundInHtml):
|
except Exception:
|
||||||
recipe_json['cookTime'] = 0
|
recipe_json['cookTime'] = 0
|
||||||
if recipe_json['cookTime'] + recipe_json['prepTime'] == 0:
|
if recipe_json['cookTime'] + recipe_json['prepTime'] == 0:
|
||||||
try:
|
try:
|
||||||
recipe_json['prepTime'] = get_minutes(scrape.total_time()) or 0
|
recipe_json['prepTime'] = get_minutes(scrape.total_time()) or 0
|
||||||
except (AttributeError,ElementNotFoundInHtml):
|
except Exception:
|
||||||
pass
|
try:
|
||||||
|
get_minutes(scrape.schema.data.get("totalTime")) or 0
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
try:
|
try:
|
||||||
recipe_json['image'] = parse_image(scrape.image()) or ''
|
recipe_json['image'] = parse_image(scrape.image()) or None
|
||||||
except (AttributeError,ElementNotFoundInHtml, TypeError, SchemaOrgException):
|
except Exception:
|
||||||
recipe_json['image'] = ''
|
recipe_json['image'] = None
|
||||||
|
if not recipe_json['image']:
|
||||||
|
try:
|
||||||
|
recipe_json['image'] = parse_image(scrape.schema.data.get('image')) or ''
|
||||||
|
except Exception:
|
||||||
|
recipe_json['image'] = ''
|
||||||
|
|
||||||
keywords = []
|
keywords = []
|
||||||
try:
|
try:
|
||||||
if scrape.schema.data.get("keywords"):
|
if scrape.schema.data.get("keywords"):
|
||||||
keywords += listify_keywords(scrape.schema.data.get("keywords"))
|
keywords += listify_keywords(scrape.schema.data.get("keywords"))
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
try:
|
||||||
if scrape.schema.data.get('recipeCategory'):
|
if scrape.schema.data.get('recipeCategory'):
|
||||||
keywords += listify_keywords(scrape.schema.data.get("recipeCategory"))
|
keywords += listify_keywords(scrape.schema.data.get("recipeCategory"))
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
try:
|
||||||
if scrape.schema.data.get('recipeCuisine'):
|
if scrape.schema.data.get('recipeCuisine'):
|
||||||
keywords += listify_keywords(scrape.schema.data.get("recipeCuisine"))
|
keywords += listify_keywords(scrape.schema.data.get("recipeCuisine"))
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
try:
|
||||||
recipe_json['keywords'] = parse_keywords(list(set(map(str.casefold, keywords))), space)
|
recipe_json['keywords'] = parse_keywords(list(set(map(str.casefold, keywords))), space)
|
||||||
except (AttributeError,ElementNotFoundInHtml):
|
except AttributeError:
|
||||||
recipe_json['keywords'] = keywords
|
recipe_json['keywords'] = keywords
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@@ -104,12 +135,12 @@ def get_from_scraper(scrape, space):
|
|||||||
}
|
}
|
||||||
)
|
)
|
||||||
recipe_json['recipeIngredient'] = ingredients
|
recipe_json['recipeIngredient'] = ingredients
|
||||||
except (AttributeError,ElementNotFoundInHtml):
|
except Exception:
|
||||||
recipe_json['recipeIngredient'] = ingredients
|
recipe_json['recipeIngredient'] = ingredients
|
||||||
|
|
||||||
try:
|
try:
|
||||||
recipe_json['recipeInstructions'] = parse_instructions(scrape.instructions())
|
recipe_json['recipeInstructions'] = parse_instructions(scrape.instructions())
|
||||||
except (AttributeError,ElementNotFoundInHtml):
|
except Exception:
|
||||||
recipe_json['recipeInstructions'] = ""
|
recipe_json['recipeInstructions'] = ""
|
||||||
|
|
||||||
if scrape.url:
|
if scrape.url:
|
||||||
@@ -222,6 +253,8 @@ def parse_instructions(instructions):
|
|||||||
|
|
||||||
def parse_image(image):
|
def parse_image(image):
|
||||||
# check if list of images is returned, take first if so
|
# check if list of images is returned, take first if so
|
||||||
|
if not image:
|
||||||
|
return None
|
||||||
if type(image) == list:
|
if type(image) == list:
|
||||||
for pic in image:
|
for pic in image:
|
||||||
if (type(pic) == str) and (pic[:4] == 'http'):
|
if (type(pic) == str) and (pic[:4] == 'http'):
|
||||||
|
|||||||
@@ -30,7 +30,7 @@ def text_scraper(text, url=None):
|
|||||||
url=None
|
url=None
|
||||||
):
|
):
|
||||||
self.wild_mode = False
|
self.wild_mode = False
|
||||||
self.exception_handling = None # TODO add new method here, old one was deprecated
|
# self.exception_handling = None # TODO add new method here, old one was deprecated
|
||||||
self.meta_http_equiv = False
|
self.meta_http_equiv = False
|
||||||
self.soup = BeautifulSoup(page_data, "html.parser")
|
self.soup = BeautifulSoup(page_data, "html.parser")
|
||||||
self.url = url
|
self.url = url
|
||||||
|
|||||||
@@ -863,7 +863,7 @@ DELISH = {
|
|||||||
"servings": 6,
|
"servings": 6,
|
||||||
"prepTime": 10,
|
"prepTime": 10,
|
||||||
"cookTime": 0,
|
"cookTime": 0,
|
||||||
"image": '',
|
"image": 'https://hips.hearstapps.com/hmg-prod.s3.amazonaws.com/images/delish-cheesy-asparagus-horizontal-7-1536094595.png',
|
||||||
"keywords": [
|
"keywords": [
|
||||||
{
|
{
|
||||||
"id": 2211187,
|
"id": 2211187,
|
||||||
|
|||||||
Reference in New Issue
Block a user