From e345d2eb3914bd5b3132dfe50816023f149cdac9 Mon Sep 17 00:00:00 2001 From: smilerz Date: Sun, 2 May 2021 15:44:27 -0500 Subject: [PATCH] updated to handle new behavior of recipe_scrapers --- cookbook/helper/recipe_url_import.py | 69 ++++++++++++++++++++-------- cookbook/helper/scrapers/scrapers.py | 2 +- cookbook/tests/other/_recipes.py | 2 +- 3 files changed, 53 insertions(+), 20 deletions(-) diff --git a/cookbook/helper/recipe_url_import.py b/cookbook/helper/recipe_url_import.py index 508137750..55aa38bb6 100644 --- a/cookbook/helper/recipe_url_import.py +++ b/cookbook/helper/recipe_url_import.py @@ -17,53 +17,84 @@ def get_from_scraper(scrape, space): recipe_json = {} try: - recipe_json['name'] = parse_name(scrape.title() or scrape.schema.data.get('name') or '') - except (TypeError, AttributeError,ElementNotFoundInHtml): - recipe_json['name'] = '' + recipe_json['name'] = parse_name(scrape.title() or None) + except (TypeError, AttributeError, ElementNotFoundInHtml, NotImplementedError): + recipe_json['name'] = None + if not recipe_json['name']: + try: + recipe_json['name'] = scrape.schema.data.get('name') or '' + except Exception: + recipe_json['name'] = '' try: description = scrape.schema.data.get("description") or '' - except (AttributeError,ElementNotFoundInHtml): + except (AttributeError, ElementNotFoundInHtml, NotImplementedError, SchemaOrgException): description = '' recipe_json['description'] = parse_description(description) try: - servings = scrape.yields() - servings = int(re.findall(r'\b\d+\b', servings)[0]) - except (AttributeError,ElementNotFoundInHtml, ValueError, IndexError): - servings = 1 + servings = scrape.yields() or None + except Exception: + servings = None + if not servings: + try: + servings = scrape.schema.data.get('recipeYield') or 1 + except Exception: + servings = 1 + if type(servings) != int: + try: + servings = int(re.findall(r'\b\d+\b', servings)[0]) + except Exception: + servings = 1 recipe_json['servings'] = servings try: 
recipe_json['prepTime'] = get_minutes(scrape.schema.data.get("prepTime")) or 0 - except (AttributeError, ElementNotFoundInHtml): + except Exception: recipe_json['prepTime'] = 0 try: recipe_json['cookTime'] = get_minutes(scrape.schema.data.get("cookTime")) or 0 - except (AttributeError, ElementNotFoundInHtml): + except Exception: recipe_json['cookTime'] = 0 if recipe_json['cookTime'] + recipe_json['prepTime'] == 0: try: recipe_json['prepTime'] = get_minutes(scrape.total_time()) or 0 - except (AttributeError,ElementNotFoundInHtml): - pass + except Exception: + try: + recipe_json['prepTime'] = get_minutes(scrape.schema.data.get("totalTime")) or 0 + except Exception: + pass try: - recipe_json['image'] = parse_image(scrape.image()) or '' - except (AttributeError,ElementNotFoundInHtml, TypeError, SchemaOrgException): - recipe_json['image'] = '' + recipe_json['image'] = parse_image(scrape.image()) or None + except Exception: + recipe_json['image'] = None + if not recipe_json['image']: + try: + recipe_json['image'] = parse_image(scrape.schema.data.get('image')) or '' + except Exception: + recipe_json['image'] = '' keywords = [] try: if scrape.schema.data.get("keywords"): keywords += listify_keywords(scrape.schema.data.get("keywords")) + except Exception: + pass + try: if scrape.schema.data.get('recipeCategory'): keywords += listify_keywords(scrape.schema.data.get("recipeCategory")) + except Exception: + pass + try: if scrape.schema.data.get('recipeCuisine'): keywords += listify_keywords(scrape.schema.data.get("recipeCuisine")) + except Exception: + pass + try: recipe_json['keywords'] = parse_keywords(list(set(map(str.casefold, keywords))), space) - except (AttributeError,ElementNotFoundInHtml): + except AttributeError: recipe_json['keywords'] = keywords try: @@ -104,12 +135,12 @@ def get_from_scraper(scrape, space): } ) recipe_json['recipeIngredient'] = ingredients - except (AttributeError,ElementNotFoundInHtml): + except Exception: recipe_json['recipeIngredient'] = ingredients try: 
recipe_json['recipeInstructions'] = parse_instructions(scrape.instructions()) - except (AttributeError,ElementNotFoundInHtml): + except Exception: recipe_json['recipeInstructions'] = "" if scrape.url: @@ -222,6 +253,8 @@ def parse_instructions(instructions): def parse_image(image): # check if list of images is returned, take first if so + if not image: + return None if type(image) == list: for pic in image: if (type(pic) == str) and (pic[:4] == 'http'): diff --git a/cookbook/helper/scrapers/scrapers.py b/cookbook/helper/scrapers/scrapers.py index 98807d186..8b0ae78b8 100644 --- a/cookbook/helper/scrapers/scrapers.py +++ b/cookbook/helper/scrapers/scrapers.py @@ -30,7 +30,7 @@ def text_scraper(text, url=None): url=None ): self.wild_mode = False - self.exception_handling = None # TODO add new method here, old one was deprecated + # self.exception_handling = None # TODO add new method here, old one was deprecated self.meta_http_equiv = False self.soup = BeautifulSoup(page_data, "html.parser") self.url = url diff --git a/cookbook/tests/other/_recipes.py b/cookbook/tests/other/_recipes.py index 95b9b18f5..adcf3bfa6 100644 --- a/cookbook/tests/other/_recipes.py +++ b/cookbook/tests/other/_recipes.py @@ -863,7 +863,7 @@ DELISH = { "servings": 6, "prepTime": 10, "cookTime": 0, - "image": '', + "image": 'https://hips.hearstapps.com/hmg-prod.s3.amazonaws.com/images/delish-cheesy-asparagus-horizontal-7-1536094595.png', "keywords": [ { "id": 2211187,