diff --git a/cookbook/helper/recipe_url_import.py b/cookbook/helper/recipe_url_import.py index 5316c8a6b..235ab05d7 100644 --- a/cookbook/helper/recipe_url_import.py +++ b/cookbook/helper/recipe_url_import.py @@ -3,6 +3,8 @@ import re from isodate import parse_duration as iso_parse_duration from isodate.isoerror import ISO8601Error +import microdata +from bs4 import BeautifulSoup from cookbook.helper.ingredient_parser import parse as parse_single_ingredient from cookbook.models import Keyword from django.utils.dateparse import parse_duration @@ -14,18 +16,53 @@ from recipe_scrapers._utils import get_minutes def get_from_scraper(scrape, space): # converting the scrape_me object to the existing json format based on ld+json - recipe_json = {} - try: - recipe_json['name'] = parse_name(scrape.title() or scrape.schema.data.get('name') or '') - except (TypeError, AttributeError): - recipe_json['name'] = '' + # first try finding ld+json as its most common + for ld in soup.find_all('script', type='application/ld+json'): + try: + ld_json = json.loads(ld.string.replace('\n', '')) + if type(ld_json) != list: + ld_json = [ld_json] + + for ld_json_item in ld_json: + # recipes type might be wrapped in @graph type + if '@graph' in ld_json_item: + for x in ld_json_item['@graph']: + if '@type' in x and x['@type'] == 'Recipe': + ld_json_item = x + + if ('@type' in ld_json_item and ld_json_item['@type'] == 'Recipe'): + return JsonResponse(find_recipe_json(ld_json_item, url)) + except JSONDecodeError: + return JsonResponse( + { + 'error': True, + 'msg': _('The requested site provided malformed data and cannot be read.') # noqa: E501 + }, + status=400) + + # now try to find microdata + items = microdata.get_items(html_text) + for i in items: + md_json = json.loads(i.json()) + if 'schema.org/Recipe' in str(md_json['type']): + return JsonResponse(find_recipe_json(md_json['properties'], url)) + + return JsonResponse( + { + 'error': True, + 'msg': _('The requested site does not provide any 
recognized data format to import the recipe from.') # noqa: E501 + }, + status=400) + + +def find_recipe_json(ld_json, url): + ld_json['name'] = parse_name(ld_json['name']) - try: - description = scrape.schema.data.get("description") or '' except AttributeError: description = '' - recipe_json['description'] = parse_description(description) + if 'recipeIngredient' in ld_json: + ld_json['recipeIngredient'] = parse_ingredients(ld_json['recipeIngredient']) try: servings = scrape.yields() @@ -47,78 +84,45 @@ def get_from_scraper(scrape, space): recipe_json['prepTime'] = get_minutes(scrape.total_time()) or 0 except AttributeError: pass + keywords = [] + if 'keywords' in ld_json: + keywords += listify_keywords(ld_json['keywords']) + if 'recipeCategory' in ld_json: + keywords += listify_keywords(ld_json['recipeCategory']) + if 'recipeCuisine' in ld_json: + keywords += listify_keywords(ld_json['keywords']) + ld_json['keywords'] = parse_keywords(list(set(map(str.casefold, keywords)))) + if 'recipeInstructions' in ld_json: + ld_json['recipeInstructions'] = parse_instructions(ld_json['recipeInstructions']) + + if 'image' in ld_json: + ld_json['image'] = parse_image(ld_json['image']) + + if 'cookTime' in ld_json: + ld_json['cookTime'] = parse_cooktime(ld_json['cookTime']) + + if 'prepTime' in ld_json: + ld_json['prepTime'] = parse_cooktime(ld_json['prepTime']) + + ld_json['servings'] = 1 try: - recipe_json['image'] = parse_image(scrape.image()) or '' - except (AttributeError, TypeError, SchemaOrgException): - recipe_json['image'] = '' + if 'recipeYield' in ld_json: + if type(ld_json['recipeYield']) == str: + ld_json['servings'] = int(re.findall(r'\b\d+\b', ld_json['recipeYield'])[0]) + elif type(ld_json['recipeYield']) == list: + ld_json['servings'] = int(re.findall(r'\b\d+\b', ld_json['recipeYield'][0])[0]) + except Exception as e: + print(e) - for x in ld_json['recipeIngredient']: - if x.replace(' ', '') != '': - x = x.replace('½', "0.5").replace('¼', "0.25").replace('¾', 
"0.75") - try: - amount, unit, ingredient, note = parse_ingredient(x) - if ingredient: - ingredients.append( - { - 'amount': amount, - 'unit': { - 'text': unit, - 'id': random.randrange(10000, 99999) - }, - 'ingredient': { - 'text': ingredient, - 'id': random.randrange(10000, 99999) - }, - 'note': note, - 'original': x - } - ) - except Exception: - ingredients.append( - { - 'amount': amount, - 'unit': { - 'text': unit, - 'id': random.randrange(10000, 99999) - }, - 'ingredient': { - 'text': ingredient, - 'id': random.randrange(10000, 99999) - }, - 'note': note, - 'original': x - } - ) - except Exception: - ingredients.append( - { - 'amount': 0, - 'unit': { - 'text': '', - 'id': random.randrange(10000, 99999) - }, - 'ingredient': { - 'text': x, - 'id': random.randrange(10000, 99999) - }, - 'note': '', - 'original': x - } - ) - recipe_json['recipeIngredient'] = ingredients - except AttributeError: - recipe_json['recipeIngredient'] = ingredients + for key in list(ld_json): + if key not in [ + 'prepTime', 'cookTime', 'image', 'recipeInstructions', + 'keywords', 'name', 'recipeIngredient', 'servings' + ]: + ld_json.pop(key, None) - try: - recipe_json['recipeInstructions'] = parse_instructions(scrape.instructions()) - except AttributeError: - recipe_json['recipeInstructions'] = "" - - if scrape.url: - recipe_json['url'] = scrape.url - recipe_json['recipeInstructions'] += "\n\nImported from " + scrape.url - return recipe_json + return ld_json def parse_name(name): @@ -127,17 +131,11 @@ def parse_name(name): name = name[0] except Exception: name = 'ERROR' - return normalize_string(name) + return name def parse_ingredients(ingredients): # some pages have comma separated ingredients in a single array entry - try: - if type(ingredients[0]) == dict: - return ingredients - except (KeyError, IndexError): - pass - if (len(ingredients) == 1 and type(ingredients) == list): ingredients = ingredients[0].split(',') elif type(ingredients) == str: @@ -153,7 +151,6 @@ def 
parse_ingredients(ingredients): for x in ingredients: if x.replace(' ', '') != '': - x = x.replace('½', "0.5").replace('¼', "0.25").replace('¾', "0.75") try: amount, unit, ingredient, note = parse_single_ingredient(x) if ingredient: @@ -195,10 +192,6 @@ def parse_ingredients(ingredients): return ingredients -def parse_description(description): - return normalize_string(description) - - def parse_instructions(instructions): instruction_text = '' @@ -220,98 +213,69 @@ def parse_instructions(instructions): instruction_text += str(i) instructions = instruction_text - return normalize_string(instructions) + instructions = re.sub(r'\n\s*\n', '\n\n', instructions) + instructions = re.sub(' +', ' ', instructions) + instructions = instructions.replace('<p>', '') + instructions = instructions.replace('</p>', '') + return instruction_text def parse_image(image): # check if list of images is returned, take first if so - if type(image) == list: - for pic in image: - if (type(pic) == str) and (pic[:4] == 'http'): - image = pic - elif 'url' in pic: - image = pic['url'] - elif type(image) == dict: - if 'url' in image: - image = image['url'] + if (type(image)) == list: + if type(image[0]) == str: + image = image[0] + elif 'url' in image[0]: + image = image[0]['url'] # ignore relative image paths - if image[:4] != 'http': + if 'http' not in image: image = '' return image -def parse_servings(servings): - if type(servings) == str: - try: - servings = int(re.search(r'\d+', servings).group()) - except AttributeError: - servings = 1 - elif type(servings) == list: - try: - servings = int(re.findall(r'\b\d+\b', servings[0])[0]) - except KeyError: - servings = 1 - return servings - - def parse_cooktime(cooktime): - if type(cooktime) not in [int, float]: - try: - cooktime = float(re.search(r'\d+', cooktime).group()) - except (ValueError, AttributeError): - try: - cooktime = round(iso_parse_duration(cooktime).seconds / 60) - except ISO8601Error: - try: - if (type(cooktime) == list and len(cooktime) > 0): - cooktime = cooktime[0] - cooktime = round(parse_duration(cooktime).seconds / 60) - except AttributeError: - cooktime = 0 - + try: + if (type(cooktime) == list and len(cooktime) > 0): + cooktime = cooktime[0] + cooktime = round(parse_duration(cooktime).seconds / 60) + except TypeError: + cooktime = 0 + if type(cooktime) != int or float: + cooktime = 0 return cooktime def parse_preptime(preptime): - if type(preptime) not in [int, float]: - try: - preptime = float(re.search(r'\d+', preptime).group()) - except ValueError: - try: - preptime = round(iso_parse_duration(preptime).seconds / 60) - except ISO8601Error: - try: - if (type(preptime) == list and len(preptime) > 0): - preptime = preptime[0] - preptime = round(parse_duration(preptime).seconds / 60) - except AttributeError: -
preptime = 0 - + try: + if (type(preptime) == list and len(preptime) > 0): + preptime = preptime[0] + preptime = round( + parse_duration( + preptime + ).seconds / 60 + ) + except TypeError: + preptime = 0 + if type(preptime) != int or float: + preptime = 0 return preptime -def parse_keywords(keyword_json, space): +def parse_keywords(keyword_json): keywords = [] # keywords as list for kw in keyword_json: - kw = normalize_string(kw) - if len(kw) != 0: - if k := Keyword.objects.filter(name=kw, space=space).first(): - keywords.append({'id': str(k.id), 'text': str(k)}) - else: - keywords.append({'id': random.randrange(1111111, 9999999, 1), 'text': kw}) + if k := Keyword.objects.filter(name=kw).first(): + keywords.append({'id': str(k.id), 'text': str(k)}) + else: + keywords.append({'id': random.randrange(1111111, 9999999, 1), 'text': kw}) return keywords def listify_keywords(keyword_list): # keywords as string - try: - if type(keyword_list[0]) == dict: - return keyword_list - except (KeyError, IndexError): - pass if type(keyword_list) == str: keyword_list = keyword_list.split(',')