diff --git a/cookbook/helper/recipe_html_import.py b/cookbook/helper/recipe_html_import.py index 3b06dc80d..19ef40143 100644 --- a/cookbook/helper/recipe_html_import.py +++ b/cookbook/helper/recipe_html_import.py @@ -1,16 +1,16 @@ import json -import re +from json.decoder import JSONDecodeError from bs4 import BeautifulSoup from bs4.element import Tag +# from cookbook.helper.ingredient_parser import parse as parse_ingredient from cookbook.helper import recipe_url_import as helper -from cookbook.helper.scrapers.scrapers import text_scraper -from json import JSONDecodeError -from recipe_scrapers._utils import get_host_name, normalize_string -from urllib.parse import unquote -def get_recipe_from_source(text, url, space): +# %% + +# %% +def get_from_raw(text): def build_node(k, v): if isinstance(v, dict): node = { @@ -26,8 +26,8 @@ def get_recipe_from_source(text, url, space): } else: node = { - 'name': k + ": " + normalize_string(str(v)), - 'value': normalize_string(str(v)) + 'name': k + ": " + str(v), + 'value': str(v) } return node @@ -52,14 +52,13 @@ def get_recipe_from_source(text, url, space): kid_list.append(build_node(k, v)) else: kid_list.append({ - 'name': normalize_string(str(kid)), - 'value': normalize_string(str(kid)) + 'name': kid, + 'value': kid }) return kid_list recipe_json = { 'name': '', - 'url': '', 'description': '', 'image': '', 'keywords': [], @@ -68,51 +67,26 @@ def get_recipe_from_source(text, url, space): 'servings': '', 'prepTime': '', 'cookTime': '' - } + } recipe_tree = [] + temp_tree = [] parse_list = [] - html_data = [] - images = [] - text = unquote(text) try: - parse_list.append(remove_graph(json.loads(text))) - if not url and 'url' in parse_list[0]: - url = parse_list[0]['url'] - scrape = text_scraper("", url=url) - + parse_list.append(json.loads(text)) except JSONDecodeError: soup = BeautifulSoup(text, "html.parser") - html_data = get_from_html(soup) - images += get_images_from_source(soup, url) for el in soup.find_all('script', type='application/ld+json'): - el = remove_graph(el) - if not url and 'url' in el: - url = el['url'] - if type(el) == list: - for le in el: - parse_list.append(le) - elif type(el) == dict: - parse_list.append(el) + parse_list.append(el) for el in soup.find_all(type='application/json'): - el = remove_graph(el) - if type(el) == list: - for le in el: - parse_list.append(le) - elif type(el) == dict: - parse_list.append(el) - scrape = text_scraper(text, url=url) - - recipe_json = helper.get_from_scraper(scrape, space) + parse_list.append(el) + # first try finding ld+json as its most common for el in parse_list: - temp_tree = [] - if isinstance(el, Tag): - try: - el = json.loads(el.string) - except TypeError: - continue + if isinstance(el, Tag): + el = json.loads(el.string) + for k, v in el.items(): if isinstance(v, dict): node = { @@ -128,66 +102,22 @@ def get_recipe_from_source(text, url, space): } else: node = { - 'name': k + ": " + normalize_string(str(v)), - 'value': normalize_string(str(v)) + 'name': k + ": " + str(v), + 'value': str(v) } temp_tree.append(node) - - if '@type' in el and el['@type'] == 'Recipe': + if ('@type' in el and el['@type'] == 'Recipe'): + recipe_json = helper.find_recipe_json(el, None) recipe_tree += [{'name': 'ld+json', 'children': temp_tree}] else: recipe_tree += [{'name': 'json', 'children': temp_tree}] - return recipe_json, recipe_tree, html_data, images + temp_tree = [] + # overide keyword structure from dict to list + kws = [] + for kw in recipe_json['keywords']: + kws.append(kw['text']) + recipe_json['keywords'] = kws -def get_from_html(soup): - INVISIBLE_ELEMS = ('style', 'script', 'head', 'title') - html = [] - for s in soup.strings: - if ((s.parent.name not in INVISIBLE_ELEMS) and (len(s.strip()) > 0)): - html.append(s) - return html - - -def get_images_from_source(soup, url): - sources = ['src', 'srcset', 'data-src'] - images = [] - img_tags = soup.find_all('img') - if url: - site = get_host_name(url) - prot = url.split(':')[0] - - urls = [] - for img in img_tags: - for src in sources: - try: - urls.append(img[src]) - except KeyError: - pass - - for u in urls: - u = u.split('?')[0] - filename = re.search(r'/([\w_-]+[.](jpg|jpeg|gif|png))$', u) - if filename: - if (('http' not in u) and (url)): - # sometimes an image source can be relative - # if it is provide the base url - u = '{}://{}{}'.format(prot, site, u) - if 'http' in u: - images.append(u) - return images - - -def remove_graph(el): - # recipes type might be wrapped in @graph type - if isinstance(el, Tag): - try: - el = json.loads(el.string) - if '@graph' in el: - for x in el['@graph']: - if '@type' in x and x['@type'] == 'Recipe': - el = x - except TypeError: - pass - return el + return recipe_json, recipe_tree diff --git a/cookbook/helper/recipe_url_import.py b/cookbook/helper/recipe_url_import.py index 58dc8d36f..24516f9f2 100644 --- a/cookbook/helper/recipe_url_import.py +++ b/cookbook/helper/recipe_url_import.py @@ -1,5 +1,6 @@ import random import re +from json import JSONDecodeError from isodate import parse_duration as iso_parse_duration from isodate.isoerror import ISO8601Error @@ -63,6 +64,8 @@ def find_recipe_json(ld_json, url): if 'recipeIngredient' in ld_json: ld_json['recipeIngredient'] = parse_ingredients(ld_json['recipeIngredient']) + else: + ld_json['recipeIngredient'] = "" try: servings = scrape.yields() @@ -90,22 +93,40 @@ def find_recipe_json(ld_json, url): if 'recipeCategory' in ld_json: keywords += listify_keywords(ld_json['recipeCategory']) if 'recipeCuisine' in ld_json: - keywords += listify_keywords(ld_json['keywords']) - ld_json['keywords'] = parse_keywords(list(set(map(str.casefold, keywords)))) + keywords += listify_keywords(ld_json['recipeCuisine']) + try: + ld_json['keywords'] = parse_keywords(list(set(map(str.casefold, keywords)))) + except TypeError: + pass if 'recipeInstructions' in ld_json: ld_json['recipeInstructions'] = parse_instructions(ld_json['recipeInstructions']) + else: + ld_json['recipeInstructions'] = "" if 'image' in ld_json: ld_json['image'] = parse_image(ld_json['image']) + else: + ld_json['image'] = "" + + if 'description' not in ld_json: + ld_json['description'] = "" if 'cookTime' in ld_json: ld_json['cookTime'] = parse_cooktime(ld_json['cookTime']) + else: + ld_json['cookTime'] = 0 if 'prepTime' in ld_json: ld_json['prepTime'] = parse_cooktime(ld_json['prepTime']) + else: + ld_json['prepTime'] = 0 - ld_json['servings'] = 1 + if 'servings' in ld_json: + if type(ld_json['servings']) == str: + ld_json['servings'] = int(re.search(r'\d+', ld_json['servings']).group()) + else: + ld_json['servings'] = 1 try: if 'recipeYield' in ld_json: if type(ld_json['recipeYield']) == str: @@ -118,7 +139,7 @@ def find_recipe_json(ld_json, url): for key in list(ld_json): if key not in [ 'prepTime', 'cookTime', 'image', 'recipeInstructions', - 'keywords', 'name', 'recipeIngredient', 'servings' + 'keywords', 'name', 'recipeIngredient', 'servings', 'description' ]: ld_json.pop(key, None) @@ -136,6 +157,12 @@ def parse_name(name): def parse_ingredients(ingredients): # some pages have comma separated ingredients in a single array entry + try: + if type(ingredients[0]) == dict: + return ingredients + except (KeyError, IndexError): + pass + if (len(ingredients) == 1 and type(ingredients) == list): ingredients = ingredients[0].split(',') elif type(ingredients) == str: @@ -216,50 +243,59 @@ def parse_instructions(instructions): instructions = re.sub(r'\n\s*\n', '\n\n', instructions) instructions = re.sub(' +', ' ', instructions) - instructions = instructions.replace('
', '') - instructions = instructions.replace('
', '') - return instruction_text + instructions = re.sub('', '\n', instructions) + instructions = re.sub('<[^<]+?>', '', instructions) + return instructions def parse_image(image): # check if list of images is returned, take first if so - if (type(image)) == list: - if type(image[0]) == str: - image = image[0] - elif 'url' in image[0]: - image = image[0]['url'] + if type(image) == list: + for pic in image: + if (type(pic) == str) and (pic[:4] == 'http'): + image = pic + elif 'url' in pic: + image = pic['url'] # ignore relative image paths - if 'http' not in image: + if image[:4] != 'http': image = '' return image def parse_cooktime(cooktime): - try: - if (type(cooktime) == list and len(cooktime) > 0): - cooktime = cooktime[0] - cooktime = round(parse_duration(cooktime).seconds / 60) - except TypeError: - cooktime = 0 - if type(cooktime) != int or float: - cooktime = 0 + if type(cooktime) not in [int, float]: + try: + cooktime = float(re.search(r'\d+', cooktime).group()) + except (ValueError, AttributeError): + try: + cooktime = round(iso_parse_duration(cooktime).seconds / 60) + except ISO8601Error: + try: + if (type(cooktime) == list and len(cooktime) > 0): + cooktime = cooktime[0] + cooktime = round(parse_duration(cooktime).seconds / 60) + except AttributeError: + cooktime = 0 + return cooktime def parse_preptime(preptime): - try: - if (type(preptime) == list and len(preptime) > 0): - preptime = preptime[0] - preptime = round( - parse_duration( - preptime - ).seconds / 60 - ) - except TypeError: - preptime = 0 - if type(preptime) != int or float: - preptime = 0 + if type(preptime) not in [int, float]: + try: + preptime = float(re.search(r'\d+', preptime).group()) + except ValueError: + try: + preptime = round(iso_parse_duration(preptime).seconds / 60) + except ISO8601Error: + try: + if (type(preptime) == list and len(preptime) > 0): + preptime = preptime[0] + preptime = round(parse_duration(preptime).seconds / 60) + except AttributeError: + preptime = 0 + return preptime @@ -277,6 +313,11 @@ def parse_keywords(keyword_json): def listify_keywords(keyword_list): # keywords as string + try: + if type(keyword_list[0]) == dict: + return keyword_list + except KeyError: + pass if type(keyword_list) == str: keyword_list = keyword_list.split(',') diff --git a/cookbook/templates/import_json.html b/cookbook/templates/import_json.html index e71e77d50..ce7780560 100644 --- a/cookbook/templates/import_json.html +++ b/cookbook/templates/import_json.html @@ -1,3 +1,4 @@ + {% extends "base.html" %} {% load crispy_forms_filters %} {% load i18n %} @@ -24,7 +25,7 @@