Updated to handle the new behavior of recipe_scrapers

This commit is contained in:
smilerz
2021-05-02 15:44:27 -05:00
parent 90e1e69dac
commit e345d2eb39
3 changed files with 53 additions and 20 deletions

View File

@@ -17,53 +17,84 @@ def get_from_scraper(scrape, space):
recipe_json = {} recipe_json = {}
try: try:
recipe_json['name'] = parse_name(scrape.title() or scrape.schema.data.get('name') or '') recipe_json['name'] = parse_name(scrape.title() or None)
except (TypeError, AttributeError,ElementNotFoundInHtml): except (TypeError, AttributeError, ElementNotFoundInHtml, NotImplementedError):
recipe_json['name'] = '' recipe_json['name'] = None
if not recipe_json['name']:
try:
recipe_json['name'] = scrape.schema.data.get('name') or ''
except Exception:
recipe_json['name'] = ''
try: try:
description = scrape.schema.data.get("description") or '' description = scrape.schema.data.get("description") or ''
except (AttributeError,ElementNotFoundInHtml): except (AttributeError, ElementNotFoundInHtml, NotImplementedError, SchemaOrgException):
description = '' description = ''
recipe_json['description'] = parse_description(description) recipe_json['description'] = parse_description(description)
try: try:
servings = scrape.yields() servings = scrape.yields() or None
servings = int(re.findall(r'\b\d+\b', servings)[0]) except Exception:
except (AttributeError,ElementNotFoundInHtml, ValueError, IndexError): servings = None
servings = 1 if not servings:
try:
servings = scrape.schema.data.get('recipeYield') or 1
except Exception:
servings = 1
if type(servings) != int:
try:
servings = int(re.findall(r'\b\d+\b', servings)[0])
except Exception:
servings = 1
recipe_json['servings'] = servings recipe_json['servings'] = servings
try: try:
recipe_json['prepTime'] = get_minutes(scrape.schema.data.get("prepTime")) or 0 recipe_json['prepTime'] = get_minutes(scrape.schema.data.get("prepTime")) or 0
except (AttributeError, ElementNotFoundInHtml): except Exception:
recipe_json['prepTime'] = 0 recipe_json['prepTime'] = 0
try: try:
recipe_json['cookTime'] = get_minutes(scrape.schema.data.get("cookTime")) or 0 recipe_json['cookTime'] = get_minutes(scrape.schema.data.get("cookTime")) or 0
except (AttributeError, ElementNotFoundInHtml): except Exception:
recipe_json['cookTime'] = 0 recipe_json['cookTime'] = 0
if recipe_json['cookTime'] + recipe_json['prepTime'] == 0: if recipe_json['cookTime'] + recipe_json['prepTime'] == 0:
try: try:
recipe_json['prepTime'] = get_minutes(scrape.total_time()) or 0 recipe_json['prepTime'] = get_minutes(scrape.total_time()) or 0
except (AttributeError,ElementNotFoundInHtml): except Exception:
pass try:
get_minutes(scrape.schema.data.get("totalTime")) or 0
except Exception:
pass
try: try:
recipe_json['image'] = parse_image(scrape.image()) or '' recipe_json['image'] = parse_image(scrape.image()) or None
except (AttributeError,ElementNotFoundInHtml, TypeError, SchemaOrgException): except Exception:
recipe_json['image'] = '' recipe_json['image'] = None
if not recipe_json['image']:
try:
recipe_json['image'] = parse_image(scrape.schema.data.get('image')) or ''
except Exception:
recipe_json['image'] = ''
keywords = [] keywords = []
try: try:
if scrape.schema.data.get("keywords"): if scrape.schema.data.get("keywords"):
keywords += listify_keywords(scrape.schema.data.get("keywords")) keywords += listify_keywords(scrape.schema.data.get("keywords"))
except Exception:
pass
try:
if scrape.schema.data.get('recipeCategory'): if scrape.schema.data.get('recipeCategory'):
keywords += listify_keywords(scrape.schema.data.get("recipeCategory")) keywords += listify_keywords(scrape.schema.data.get("recipeCategory"))
except Exception:
pass
try:
if scrape.schema.data.get('recipeCuisine'): if scrape.schema.data.get('recipeCuisine'):
keywords += listify_keywords(scrape.schema.data.get("recipeCuisine")) keywords += listify_keywords(scrape.schema.data.get("recipeCuisine"))
except Exception:
pass
try:
recipe_json['keywords'] = parse_keywords(list(set(map(str.casefold, keywords))), space) recipe_json['keywords'] = parse_keywords(list(set(map(str.casefold, keywords))), space)
except (AttributeError,ElementNotFoundInHtml): except AttributeError:
recipe_json['keywords'] = keywords recipe_json['keywords'] = keywords
try: try:
@@ -104,12 +135,12 @@ def get_from_scraper(scrape, space):
} }
) )
recipe_json['recipeIngredient'] = ingredients recipe_json['recipeIngredient'] = ingredients
except (AttributeError,ElementNotFoundInHtml): except Exception:
recipe_json['recipeIngredient'] = ingredients recipe_json['recipeIngredient'] = ingredients
try: try:
recipe_json['recipeInstructions'] = parse_instructions(scrape.instructions()) recipe_json['recipeInstructions'] = parse_instructions(scrape.instructions())
except (AttributeError,ElementNotFoundInHtml): except Exception:
recipe_json['recipeInstructions'] = "" recipe_json['recipeInstructions'] = ""
if scrape.url: if scrape.url:
@@ -222,6 +253,8 @@ def parse_instructions(instructions):
def parse_image(image): def parse_image(image):
# check if list of images is returned, take first if so # check if list of images is returned, take first if so
if not image:
return None
if type(image) == list: if type(image) == list:
for pic in image: for pic in image:
if (type(pic) == str) and (pic[:4] == 'http'): if (type(pic) == str) and (pic[:4] == 'http'):

View File

@@ -30,7 +30,7 @@ def text_scraper(text, url=None):
url=None url=None
): ):
self.wild_mode = False self.wild_mode = False
self.exception_handling = None # TODO add new method here, old one was deprecated # self.exception_handling = None # TODO add new method here, old one was deprecated
self.meta_http_equiv = False self.meta_http_equiv = False
self.soup = BeautifulSoup(page_data, "html.parser") self.soup = BeautifulSoup(page_data, "html.parser")
self.url = url self.url = url

View File

@@ -863,7 +863,7 @@ DELISH = {
"servings": 6, "servings": 6,
"prepTime": 10, "prepTime": 10,
"cookTime": 0, "cookTime": 0,
"image": '', "image": 'https://hips.hearstapps.com/hmg-prod.s3.amazonaws.com/images/delish-cheesy-asparagus-horizontal-7-1536094595.png',
"keywords": [ "keywords": [
{ {
"id": 2211187, "id": 2211187,