diff --git a/cookbook/helper/recipe_html_import.py b/cookbook/helper/recipe_html_import.py index b8553beac..3b06dc80d 100644 --- a/cookbook/helper/recipe_html_import.py +++ b/cookbook/helper/recipe_html_import.py @@ -68,7 +68,7 @@ def get_recipe_from_source(text, url, space): 'servings': '', 'prepTime': '', 'cookTime': '' - } + } recipe_tree = [] parse_list = [] html_data = [] @@ -77,6 +77,9 @@ def get_recipe_from_source(text, url, space): try: parse_list.append(remove_graph(json.loads(text))) + if not url and 'url' in parse_list[0]: + url = parse_list[0]['url'] + scrape = text_scraper("", url=url) except JSONDecodeError: soup = BeautifulSoup(text, "html.parser") @@ -84,6 +87,8 @@ def get_recipe_from_source(text, url, space): images += get_images_from_source(soup, url) for el in soup.find_all('script', type='application/ld+json'): el = remove_graph(el) + if not url and 'url' in el: + url = el['url'] if type(el) == list: for le in el: parse_list.append(le) @@ -96,15 +101,6 @@ def get_recipe_from_source(text, url, space): parse_list.append(le) elif type(el) == dict: parse_list.append(el) - - # if a url was not provided, try to find one in the first document - if not url and len(parse_list) > 0: - if 'url' in parse_list[0]: - url = parse_list[0]['url'] - - if type(text) == dict: - scrape = text_scraper("", url=url) - elif type(text) == str: scrape = text_scraper(text, url=url) recipe_json = helper.get_from_scraper(scrape, space) diff --git a/cookbook/helper/recipe_url_import.py b/cookbook/helper/recipe_url_import.py index 366068e75..6efa82e7a 100644 --- a/cookbook/helper/recipe_url_import.py +++ b/cookbook/helper/recipe_url_import.py @@ -6,6 +6,7 @@ from isodate.isoerror import ISO8601Error from cookbook.helper.ingredient_parser import parse as parse_single_ingredient from cookbook.models import Keyword from django.utils.dateparse import parse_duration +from recipe_scrapers._schemaorg import SchemaOrgException from recipe_scrapers._utils import get_minutes, 
normalize_string @@ -13,7 +14,10 @@ def get_from_scraper(scrape, space): # converting the scrape_me object to the existing json format based on ld+json recipe_json = {} - recipe_json['name'] = scrape.title() + try: + recipe_json['name'] = scrape.title() + except TypeError: + recipe_json['name'] = '' try: description = scrape.schema.data.get("description") or '' @@ -21,7 +25,7 @@ except AttributeError: description = '' - recipe_json['description'] = normalize_string(description) + recipe_json['description'] = parse_description(description) try: servings = scrape.yields() @@ -40,7 +44,7 @@ try: recipe_json['image'] = parse_image(scrape.image()) or '' - except (AttributeError, TypeError): + except (AttributeError, TypeError, SchemaOrgException): recipe_json['image'] = '' keywords = [] @@ -181,6 +185,14 @@ return ingredients +def parse_description(description): + description = re.sub(r'\n\s*\n', '\n\n', description) + description = re.sub(' +', ' ', description) + description = re.sub('</p>', '\n', description) + description = re.sub('<[^<]+?>', '', description) + return normalize_string(description) + + def parse_instructions(instructions): instruction_text = ''