diff --git a/cookbook/helper/recipe_html_import.py b/cookbook/helper/recipe_html_import.py
index 62a057e2f..95f115b76 100644
--- a/cookbook/helper/recipe_html_import.py
+++ b/cookbook/helper/recipe_html_import.py
@@ -1,191 +1,191 @@
-import json
-import re
-from json import JSONDecodeError
-from urllib.parse import unquote
+# import json
+# import re
+# from json import JSONDecodeError
+# from urllib.parse import unquote

-from bs4 import BeautifulSoup
-from bs4.element import Tag
-from recipe_scrapers import scrape_html, scrape_me
-from recipe_scrapers._exceptions import NoSchemaFoundInWildMode
-from recipe_scrapers._utils import get_host_name, normalize_string
+# from bs4 import BeautifulSoup
+# from bs4.element import Tag
+# from recipe_scrapers import scrape_html, scrape_me
+# from recipe_scrapers._exceptions import NoSchemaFoundInWildMode
+# from recipe_scrapers._utils import get_host_name, normalize_string

-from cookbook.helper import recipe_url_import as helper
-from cookbook.helper.scrapers.scrapers import text_scraper
+# from cookbook.helper import recipe_url_import as helper
+# from cookbook.helper.scrapers.scrapers import text_scraper


-def get_recipe_from_source(text, url, request):
-    def build_node(k, v):
-        if isinstance(v, dict):
-            node = {
-                'name': k,
-                'value': k,
-                'children': get_children_dict(v)
-            }
-        elif isinstance(v, list):
-            node = {
-                'name': k,
-                'value': k,
-                'children': get_children_list(v)
-            }
-        else:
-            node = {
-                'name': k + ": " + normalize_string(str(v)),
-                'value': normalize_string(str(v))
-            }
-        return node
+# def get_recipe_from_source(text, url, request):
+#     def build_node(k, v):
+#         if isinstance(v, dict):
+#             node = {
+#                 'name': k,
+#                 'value': k,
+#                 'children': get_children_dict(v)
+#             }
+#         elif isinstance(v, list):
+#             node = {
+#                 'name': k,
+#                 'value': k,
+#                 'children': get_children_list(v)
+#             }
+#         else:
+#             node = {
+#                 'name': k + ": " + normalize_string(str(v)),
+#                 'value': normalize_string(str(v))
+#             }
+#         return node

-    def get_children_dict(children):
-        kid_list = []
-        for k, v in children.items():
-            kid_list.append(build_node(k, v))
-        return kid_list
+#     def get_children_dict(children):
+#         kid_list = []
+#         for k, v in children.items():
+#             kid_list.append(build_node(k, v))
+#         return kid_list

-    def get_children_list(children):
-        kid_list = []
-        for kid in children:
-            if type(kid) == list:
-                node = {
-                    'name': "unknown list",
-                    'value': "unknown list",
-                    'children': get_children_list(kid)
-                }
-                kid_list.append(node)
-            elif type(kid) == dict:
-                for k, v in kid.items():
-                    kid_list.append(build_node(k, v))
-            else:
-                kid_list.append({
-                    'name': normalize_string(str(kid)),
-                    'value': normalize_string(str(kid))
-                })
-        return kid_list
+#     def get_children_list(children):
+#         kid_list = []
+#         for kid in children:
+#             if type(kid) == list:
+#                 node = {
+#                     'name': "unknown list",
+#                     'value': "unknown list",
+#                     'children': get_children_list(kid)
+#                 }
+#                 kid_list.append(node)
+#             elif type(kid) == dict:
+#                 for k, v in kid.items():
+#                     kid_list.append(build_node(k, v))
+#             else:
+#                 kid_list.append({
+#                     'name': normalize_string(str(kid)),
+#                     'value': normalize_string(str(kid))
+#                 })
+#         return kid_list

-    recipe_tree = []
-    parse_list = []
-    soup = BeautifulSoup(text, "html.parser")
-    html_data = get_from_html(soup)
-    images = get_images_from_source(soup, url)
-    text = unquote(text)
-    scrape = None
+#     recipe_tree = []
+#     parse_list = []
+#     soup = BeautifulSoup(text, "html.parser")
+#     html_data = get_from_html(soup)
+#     images = get_images_from_source(soup, url)
+#     text = unquote(text)
+#     scrape = None

-    if url and not text:
-        try:
-            scrape = scrape_me(url_path=url, wild_mode=True)
-        except(NoSchemaFoundInWildMode):
-            pass
+#     if url and not text:
+#         try:
+#             scrape = scrape_me(url_path=url, wild_mode=True)
+#         except(NoSchemaFoundInWildMode):
+#             pass

-    if not scrape:
-        try:
-            parse_list.append(remove_graph(json.loads(text)))
-            if not url and 'url' in parse_list[0]:
-                url = parse_list[0]['url']
-            scrape = text_scraper("", url=url)
+#     if not scrape:
+#         try:
+#             parse_list.append(remove_graph(json.loads(text)))
+#             if not url and 'url' in parse_list[0]:
+#                 url = parse_list[0]['url']
+#             scrape = text_scraper("", url=url)

-        except JSONDecodeError:
-            for el in soup.find_all('script', type='application/ld+json'):
-                el = remove_graph(el)
-                if not url and 'url' in el:
-                    url = el['url']
-                if type(el) == list:
-                    for le in el:
-                        parse_list.append(le)
-                elif type(el) == dict:
-                    parse_list.append(el)
-            for el in soup.find_all(type='application/json'):
-                el = remove_graph(el)
-                if type(el) == list:
-                    for le in el:
-                        parse_list.append(le)
-                elif type(el) == dict:
-                    parse_list.append(el)
-            scrape = text_scraper(text, url=url)
+#         except JSONDecodeError:
+#             for el in soup.find_all('script', type='application/ld+json'):
+#                 el = remove_graph(el)
+#                 if not url and 'url' in el:
+#                     url = el['url']
+#                 if type(el) == list:
+#                     for le in el:
+#                         parse_list.append(le)
+#                 elif type(el) == dict:
+#                     parse_list.append(el)
+#             for el in soup.find_all(type='application/json'):
+#                 el = remove_graph(el)
+#                 if type(el) == list:
+#                     for le in el:
+#                         parse_list.append(le)
+#                 elif type(el) == dict:
+#                     parse_list.append(el)
+#             scrape = text_scraper(text, url=url)

-    recipe_json = helper.get_from_scraper(scrape, request)
+#     recipe_json = helper.get_from_scraper(scrape, request)

-    # TODO: DEPRECATE recipe_tree & html_data. first validate it isn't used anywhere
-    for el in parse_list:
-        temp_tree = []
-        if isinstance(el, Tag):
-            try:
-                el = json.loads(el.string)
-            except TypeError:
-                continue
+#     # TODO: DEPRECATE recipe_tree & html_data. first validate it isn't used anywhere
+#     for el in parse_list:
+#         temp_tree = []
+#         if isinstance(el, Tag):
+#             try:
+#                 el = json.loads(el.string)
+#             except TypeError:
+#                 continue

-        for k, v in el.items():
-            if isinstance(v, dict):
-                node = {
-                    'name': k,
-                    'value': k,
-                    'children': get_children_dict(v)
-                }
-            elif isinstance(v, list):
-                node = {
-                    'name': k,
-                    'value': k,
-                    'children': get_children_list(v)
-                }
-            else:
-                node = {
-                    'name': k + ": " + normalize_string(str(v)),
-                    'value': normalize_string(str(v))
-                }
-            temp_tree.append(node)
+#         for k, v in el.items():
+#             if isinstance(v, dict):
+#                 node = {
+#                     'name': k,
+#                     'value': k,
+#                     'children': get_children_dict(v)
+#                 }
+#             elif isinstance(v, list):
+#                 node = {
+#                     'name': k,
+#                     'value': k,
+#                     'children': get_children_list(v)
+#                 }
+#             else:
+#                 node = {
+#                     'name': k + ": " + normalize_string(str(v)),
+#                     'value': normalize_string(str(v))
+#                 }
+#             temp_tree.append(node)

-        if '@type' in el and el['@type'] == 'Recipe':
-            recipe_tree += [{'name': 'ld+json', 'children': temp_tree}]
-        else:
-            recipe_tree += [{'name': 'json', 'children': temp_tree}]
+#         if '@type' in el and el['@type'] == 'Recipe':
+#             recipe_tree += [{'name': 'ld+json', 'children': temp_tree}]
+#         else:
+#             recipe_tree += [{'name': 'json', 'children': temp_tree}]

-    return recipe_json, recipe_tree, html_data, images
+#     return recipe_json, recipe_tree, html_data, images


-def get_from_html(soup):
-    INVISIBLE_ELEMS = ('style', 'script', 'head', 'title')
-    html = []
-    for s in soup.strings:
-        if ((s.parent.name not in INVISIBLE_ELEMS) and (len(s.strip()) > 0)):
-            html.append(s)
-    return html
+# def get_from_html(soup):
+#     INVISIBLE_ELEMS = ('style', 'script', 'head', 'title')
+#     html = []
+#     for s in soup.strings:
+#         if ((s.parent.name not in INVISIBLE_ELEMS) and (len(s.strip()) > 0)):
+#             html.append(s)
+#     return html


-def get_images_from_source(soup, url):
-    sources = ['src', 'srcset', 'data-src']
-    images = []
-    img_tags = soup.find_all('img')
-    if url:
-        site = get_host_name(url)
-        prot = url.split(':')[0]
+# def get_images_from_source(soup, url):
+#     sources = ['src', 'srcset', 'data-src']
+#     images = []
+#     img_tags = soup.find_all('img')
+#     if url:
+#         site = get_host_name(url)
+#         prot = url.split(':')[0]

-    urls = []
-    for img in img_tags:
-        for src in sources:
-            try:
-                urls.append(img[src])
-            except KeyError:
-                pass
+#     urls = []
+#     for img in img_tags:
+#         for src in sources:
+#             try:
+#                 urls.append(img[src])
+#             except KeyError:
+#                 pass

-    for u in urls:
-        u = u.split('?')[0]
-        filename = re.search(r'/([\w_-]+[.](jpg|jpeg|gif|png))$', u)
-        if filename:
-            if (('http' not in u) and (url)):
-                # sometimes an image source can be relative
-                # if it is provide the base url
-                u = '{}://{}{}'.format(prot, site, u)
-            if 'http' in u:
-                images.append(u)
-    return images
+#     for u in urls:
+#         u = u.split('?')[0]
+#         filename = re.search(r'/([\w_-]+[.](jpg|jpeg|gif|png))$', u)
+#         if filename:
+#             if (('http' not in u) and (url)):
+#                 # sometimes an image source can be relative
+#                 # if it is provide the base url
+#                 u = '{}://{}{}'.format(prot, site, u)
+#             if 'http' in u:
+#                 images.append(u)
+#     return images


-def remove_graph(el):
-    # recipes type might be wrapped in @graph type
-    if isinstance(el, Tag):
-        try:
-            el = json.loads(el.string)
-            if '@graph' in el:
-                for x in el['@graph']:
-                    if '@type' in x and x['@type'] == 'Recipe':
-                        el = x
-        except (TypeError, JSONDecodeError):
-            pass
-    return el
+# def remove_graph(el):
+#     # recipes type might be wrapped in @graph type
+#     if isinstance(el, Tag):
+#         try:
+#             el = json.loads(el.string)
+#             if '@graph' in el:
+#                 for x in el['@graph']:
+#                     if '@type' in x and x['@type'] == 'Recipe':
+#                         el = x
+#         except (TypeError, JSONDecodeError):
+#             pass
+#     return el
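Note: the module above is only commented out, not deleted, so any remaining `get_recipe_from_source` caller has to be migrated by hand. A minimal sketch of the replacement flow, assuming the helpers introduced elsewhere in this diff (`text_scraper`, `get_from_scraper`, `get_images_from_soup`) keep the signatures shown below; `import_from_html` itself is illustrative, not part of the PR:

```python
# Sketch of the flow that replaces get_recipe_from_source().
from cookbook.helper import recipe_url_import as helper
from cookbook.helper.recipe_url_import import get_images_from_soup
from cookbook.helper.scrapers.scrapers import text_scraper


def import_from_html(html, url, request):
    scrape = text_scraper(text=html, url=url)  # schema-aware scraper over raw HTML
    recipe_json = helper.get_from_scraper(scrape, request)  # normalized recipe dict
    images = list(dict.fromkeys(get_images_from_soup(scrape.soup, url)))  # de-duplicated image URLs
    return recipe_json, images
```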
diff --git a/cookbook/helper/recipe_url_import.py b/cookbook/helper/recipe_url_import.py
index aa3cc5cff..cec57e729 100644
--- a/cookbook/helper/recipe_url_import.py
+++ b/cookbook/helper/recipe_url_import.py
@@ -1,21 +1,19 @@
 import random
 import re
 from html import unescape
-
-from pytube import YouTube
 from unicodedata import decomposition

 from django.utils.dateparse import parse_duration
 from django.utils.translation import gettext as _
 from isodate import parse_duration as iso_parse_duration
 from isodate.isoerror import ISO8601Error
-from recipe_scrapers._utils import get_minutes
+from pytube import YouTube
+from recipe_scrapers._utils import get_host_name, get_minutes

 from cookbook.helper import recipe_url_import as helper
 from cookbook.helper.ingredient_parser import IngredientParser
 from cookbook.models import Keyword
-
 # from recipe_scrapers._utils import get_minutes ## temporary until/unless upstream incorporates get_minutes() PR
@@ -369,3 +367,32 @@ def iso_duration_to_minutes(string):
         string
     ).groupdict()
     return int(match['days'] or 0) * 24 * 60 + int(match['hours'] or 0) * 60 + int(match['minutes'] or 0)
+
+
+def get_images_from_soup(soup, url):
+    sources = ['src', 'srcset', 'data-src']
+    images = []
+    img_tags = soup.find_all('img')
+    if url:
+        site = get_host_name(url)
+        prot = url.split(':')[0]
+
+    urls = []
+    for img in img_tags:
+        for src in sources:
+            try:
+                urls.append(img[src])
+            except KeyError:
+                pass
+
+    for u in urls:
+        u = u.split('?')[0]
+        filename = re.search(r'/([\w_-]+[.](jpg|jpeg|gif|png))$', u)
+        if filename:
+            if (('http' not in u) and (url)):
+                # sometimes an image source can be relative
+                # if it is provide the base url
+                u = '{}://{}{}'.format(prot, site, u)
+            if 'http' in u:
+                images.append(u)
+    return images
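`get_images_from_soup` is a verbatim move of the old `get_images_from_source`: it still keeps only absolute `jpg`/`jpeg`/`gif`/`png` URLs, strips query strings before the extension check, and resolves relative paths against the page host. A quick illustration (markup and URLs invented):

```python
from bs4 import BeautifulSoup

from cookbook.helper.recipe_url_import import get_images_from_soup

html = '<img src="/media/cake.jpg"><img src="https://cdn.example.com/pie.png?w=300">'
soup = BeautifulSoup(html, "html.parser")

# /media/cake.jpg gets the page's protocol and host prepended; the query
# string on pie.png is dropped before the extension check
print(get_images_from_soup(soup, "https://example.com/recipes/1"))
# ['https://example.com/media/cake.jpg', 'https://cdn.example.com/pie.png']
```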
diff --git a/cookbook/helper/scrapers/scrapers.py b/cookbook/helper/scrapers/scrapers.py
index 94e3daea0..7d6c08b15 100644
--- a/cookbook/helper/scrapers/scrapers.py
+++ b/cookbook/helper/scrapers/scrapers.py
@@ -27,17 +27,17 @@ def text_scraper(text, url=None):
     class TextScraper(scraper_class):
         def __init__(
             self,
-            page_data,
-            url=None
+            html=None,
+            url=None,
         ):
             self.wild_mode = False
             self.meta_http_equiv = False
-            self.soup = BeautifulSoup(page_data, "html.parser")
+            self.soup = BeautifulSoup(html, "html.parser")
             self.url = url
             self.recipe = None
             try:
-                self.schema = SchemaOrg(page_data)
+                self.schema = SchemaOrg(html)
             except (JSONDecodeError, AttributeError):
                 pass

-    return TextScraper(text, url)
+    return TextScraper(url=url, html=text)
diff --git a/cookbook/integration/cookbookapp.py b/cookbook/integration/cookbookapp.py
index f22e9d45d..7ff50ab62 100644
--- a/cookbook/integration/cookbookapp.py
+++ b/cookbook/integration/cookbookapp.py
@@ -10,8 +10,8 @@ import validators
 import yaml

 from cookbook.helper.ingredient_parser import IngredientParser
-from cookbook.helper.recipe_html_import import get_recipe_from_source
-from cookbook.helper.recipe_url_import import iso_duration_to_minutes
+from cookbook.helper.recipe_url_import import get_from_scraper, get_images_from_soup, iso_duration_to_minutes
+from cookbook.helper.scrapers.scrapers import text_scraper
 from cookbook.integration.integration import Integration
 from cookbook.models import Ingredient, Keyword, Recipe, Step

@@ -24,7 +24,10 @@ class CookBookApp(Integration):

     def get_recipe_from_file(self, file):
         recipe_html = file.getvalue().decode("utf-8")
-        recipe_json, recipe_tree, html_data, images = get_recipe_from_source(recipe_html, 'CookBookApp', self.request)
+        # recipe_json, recipe_tree, html_data, images = get_recipe_from_source(recipe_html, 'CookBookApp', self.request)
+        scrape = text_scraper(text=recipe_html)
+        recipe_json = get_from_scraper(scrape, self.request)
+        images = list(dict.fromkeys(get_images_from_soup(scrape.soup, None)))

         recipe = Recipe.objects.create(
             name=recipe_json['name'].strip(),
diff --git a/cookbook/integration/copymethat.py b/cookbook/integration/copymethat.py
index 7a2a532f9..2a9c56521 100644
--- a/cookbook/integration/copymethat.py
+++ b/cookbook/integration/copymethat.py
@@ -3,10 +3,9 @@ from io import BytesIO
 from zipfile import ZipFile

 from bs4 import BeautifulSoup
-
 from django.utils.translation import gettext as _
+
 from cookbook.helper.ingredient_parser import IngredientParser
-from cookbook.helper.recipe_html_import import get_recipe_from_source
 from cookbook.helper.recipe_url_import import iso_duration_to_minutes, parse_servings
 from cookbook.integration.integration import Integration
 from cookbook.models import Ingredient, Keyword, Recipe, Step
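With the `TextScraper` constructor now keyword-based (`html=None, url=None`), callers pass the page body explicitly. The two call patterns this diff settles on, sketched with placeholder URL and HTML:

```python
# The two scraper entry points after this refactor (values are placeholders).
from recipe_scrapers import scrape_me
from recipe_scrapers._exceptions import NoSchemaFoundInWildMode

from cookbook.helper.scrapers.scrapers import text_scraper

# 1) live URL: let recipe_scrapers fetch the page, wild mode as fallback
try:
    scrape = scrape_me(url_path="https://example.com/recipes/1", wild_mode=True)
except NoSchemaFoundInWildMode:
    scrape = None

# 2) HTML already in hand (file import, bookmarklet, pasted source)
scrape = text_scraper(text="<html>...</html>", url="https://example.com/recipes/1")
```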
diff --git a/cookbook/views/api.py b/cookbook/views/api.py
index 4325fe385..54df51bfb 100644
--- a/cookbook/views/api.py
+++ b/cookbook/views/api.py
@@ -5,6 +5,8 @@ import re
 import traceback
 import uuid
 from collections import OrderedDict
+from json import JSONDecodeError
+from urllib.parse import unquote
 from zipfile import ZipFile

 import requests
@@ -26,6 +28,8 @@ from django.utils.translation import gettext as _
 from django_scopes import scopes_disabled
 from icalendar import Calendar, Event
 from PIL import UnidentifiedImageError
+from recipe_scrapers import scrape_html, scrape_me
+from recipe_scrapers._exceptions import NoSchemaFoundInWildMode
 from requests.exceptions import MissingSchema
 from rest_framework import decorators, status, viewsets
 from rest_framework.authtoken.models import Token
@@ -40,6 +44,7 @@ from rest_framework.throttling import AnonRateThrottle
 from rest_framework.viewsets import ViewSetMixin
 from treebeard.exceptions import InvalidMoveToDescendant, InvalidPosition, PathOverflow

+from cookbook.helper import recipe_url_import as helper
 from cookbook.helper.HelperFunctions import str2bool
 from cookbook.helper.image_processing import handle_image
 from cookbook.helper.ingredient_parser import IngredientParser
@@ -47,9 +52,9 @@ from cookbook.helper.permission_helper import (CustomIsAdmin, CustomIsGuest, Cus
                                                CustomIsOwnerReadOnly, CustomIsShare, CustomIsShared,
                                                CustomIsSpaceOwner, CustomIsUser, group_required,
                                                is_space_owner, switch_user_active_space)
-from cookbook.helper.recipe_html_import import get_recipe_from_source
 from cookbook.helper.recipe_search import RecipeFacet, RecipeSearch, old_search
-from cookbook.helper.recipe_url_import import get_from_youtube_scraper
+from cookbook.helper.recipe_url_import import get_from_youtube_scraper, get_images_from_soup
+from cookbook.helper.scrapers.scrapers import text_scraper
 from cookbook.helper.shopping_helper import RecipeShoppingEditor, shopping_helper
 from cookbook.models import (Automation, BookmarkletImport, CookLog, CustomFilter, ExportLog, Food,
                              FoodInheritField, ImportLog, Ingredient, InviteLink, Keyword, MealPlan,
@@ -1116,69 +1121,79 @@ def recipe_from_source(request):
             - url: url to use for importing recipe
             - data: if no url is given recipe is imported from provided source data
             - (optional) bookmarklet: id of bookmarklet import to use, overrides URL and data attributes
-    :return: JsonResponse containing the parsed json, original html,json and images
+    :return: JsonResponse containing the parsed json and images
     """
+    scrape = None
     serializer = RecipeFromSourceSerializer(data=request.data)
     if serializer.is_valid():
-        # headers to use for request to external sites - DEPRECATE
-        external_request_headers = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7"}

         if (b_pk := serializer.validated_data.get('bookmarklet', None)) and (bookmarklet := BookmarkletImport.objects.filter(pk=b_pk).first()):
             serializer.validated_data['url'] = bookmarklet.url
             serializer.validated_data['data'] = bookmarklet.html
             bookmarklet.delete()

-        elif not 'url' in serializer.validated_data and not 'data' in serializer.validated_data:
+        url = serializer.validated_data.get('url', None)
+        data = unquote(serializer.validated_data.get('data', ''))
+        if not url and not data:
             return Response({
                 'error': True,
                 'msg': _('Nothing to do.')
             }, status=status.HTTP_400_BAD_REQUEST)
-        # in manual mode request complete page to return it later
-        elif 'url' in serializer.validated_data and serializer.validated_data['url'] != '':
-            if re.match('^(https?://)?(www\.youtube\.com|youtu\.be)/.+$', serializer.validated_data['url']):
-                if validators.url(serializer.validated_data['url'], public=True):
+        elif url and not data:
+            if re.match(r'^(https?://)?(www\.youtube\.com|youtu\.be)/.+$', url):
+                if validators.url(url, public=True):
                     return Response({
-                        'recipe_json': get_from_youtube_scraper(serializer.validated_data['url'], request),
-                        'recipe_tree': '',
-                        'recipe_html': '',
+                        'recipe_json': get_from_youtube_scraper(url, request),
+                        # 'recipe_tree': '',
+                        # 'recipe_html': '',
                         'recipe_images': [],
                     }, status=status.HTTP_200_OK)
-            #######
-            # this section is redundant to scrape_me. REFACTOR to catch errors from scrape_me
-            try:
-                if validators.url(serializer.validated_data['url'], public=True):
-                    requests.get(serializer.validated_data['url'], headers=external_request_headers).content
-                else:
+            else:
+                try:
+                    if validators.url(url, public=True):
+                        scrape = scrape_me(url_path=url, wild_mode=True)
+
+                    else:
+                        return Response({
+                            'error': True,
+                            'msg': _('Invalid Url')
+                        }, status=status.HTTP_400_BAD_REQUEST)
+                except NoSchemaFoundInWildMode:
+                    pass
+                except requests.exceptions.ConnectionError:
                     return Response({
                         'error': True,
-                        'msg': _('Invalid Url')
+                        'msg': _('Connection Refused.')
                     }, status=status.HTTP_400_BAD_REQUEST)
-            except requests.exceptions.ConnectionError:
-                return Response({
-                    'error': True,
-                    'msg': _('Connection Refused.')
-                }, status=status.HTTP_400_BAD_REQUEST)
-            except requests.exceptions.MissingSchema:
-                return Response({
-                    'error': True,
-                    'msg': _('Bad URL Schema.')
-                }, status=status.HTTP_400_BAD_REQUEST)
-            #######
+                except requests.exceptions.MissingSchema:
+                    return Response({
+                        'error': True,
+                        'msg': _('Bad URL Schema.')
+                    }, status=status.HTTP_400_BAD_REQUEST)
+        else:
+            try:
+                json.loads(data)  # bare JSON paste: validate it, then
+                data = f'<script type="application/ld+json">{data}</script>'  # wrap so SchemaOrg can find it
+            except JSONDecodeError:
+                pass
+            scrape = text_scraper(text=data, url=url)
+            if not url and (found_url := scrape.schema.data.get('url', None)):
+                scrape = text_scraper(text=data, url=found_url)

-        recipe_json, recipe_tree, recipe_html, recipe_images = get_recipe_from_source(serializer.validated_data['data'], serializer.validated_data['url'], request)
-        if len(recipe_tree) == 0 and len(recipe_json) == 0:
+        if scrape:
+            return Response({
+                'recipe_json': helper.get_from_scraper(scrape, request),
+                # 'recipe_tree': recipe_tree,
+                # 'recipe_html': recipe_html,
+                'recipe_images': list(dict.fromkeys(get_images_from_soup(scrape.soup, url))),
+            }, status=status.HTTP_200_OK)
+
+        else:
             return Response({
                 'error': True,
                 'msg': _('No usable data could be found.')
             }, status=status.HTTP_400_BAD_REQUEST)
-        else:
-            return Response({
-                'recipe_json': recipe_json,
-                'recipe_tree': recipe_tree,
-                'recipe_html': recipe_html,
-                'recipe_images': list(dict.fromkeys(recipe_images)),
-            }, status=status.HTTP_200_OK)

     else:
         return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)
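The `else` branch above has to cope with clients posting bare ld+json instead of a full page: `SchemaOrg` only discovers recipes inside `<script type="application/ld+json">` blocks, hence the wrap before handing the data to `text_scraper`. A standalone sketch of that step:

```python
# Why raw JSON gets wrapped before scraping (standalone sketch):
# SchemaOrg only finds recipes inside ld+json <script> blocks.
import json

data = '{"@context": "https://schema.org", "@type": "Recipe", "name": "Pancakes"}'
try:
    json.loads(data)  # only wrap input that actually parses as JSON
    data = f'<script type="application/ld+json">{data}</script>'
except json.JSONDecodeError:
    pass  # plain HTML goes to text_scraper unchanged
```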
diff --git a/vue/src/apps/ImportView/ImportView.vue b/vue/src/apps/ImportView/ImportView.vue
index 407664929..4b2ed0ffd 100644
--- a/vue/src/apps/ImportView/ImportView.vue
+++ b/vue/src/apps/ImportView/ImportView.vue
@@ -461,8 +461,8 @@ export default {
             recent_urls: [],
             source_data: '',
             recipe_json: undefined,
-            recipe_html: undefined,
-            recipe_tree: undefined,
+            // recipe_html: undefined,
+            // recipe_tree: undefined,
             recipe_images: [],
             imported_recipes: [],
             failed_imports: [],
@@ -593,9 +593,9 @@ export default {
             }

             // reset all variables
-            this.recipe_html = undefined
+            // this.recipe_html = undefined
             this.recipe_json = undefined
-            this.recipe_tree = undefined
+            // this.recipe_tree = undefined
             this.recipe_images = []

             // load recipe
@@ -621,8 +621,8 @@ export default {
                     return x
                 })

-                this.recipe_tree = response.data['recipe_tree'];
-                this.recipe_html = response.data['recipe_html'];
+                // this.recipe_tree = response.data['recipe_tree'];
+                // this.recipe_html = response.data['recipe_html'];
                 this.recipe_images = response.data['recipe_images'] !== undefined ? response.data['recipe_images'] : [];

                 if (!silent) {
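With `recipe_tree` and `recipe_html` dropped on both the backend and the frontend, a successful `recipe_from_source` response reduces to two keys (key names per the `Response` above; values illustrative):

```python
# Shape of a successful /api/recipe-from-source/ payload after this PR:
{
    "recipe_json": {"name": "Pancakes", "steps": ["..."]},  # from get_from_scraper()
    "recipe_images": ["https://example.com/media/cake.jpg"],  # de-duplicated image URLs
}
```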