deprecate get_recipe_from_source

smilerz committed 2022-07-07 15:09:22 -05:00
parent b1c0334947
commit e40b73f420
7 changed files with 272 additions and 228 deletions

View File

@@ -1,191 +1,191 @@
-import json
-import re
-from json import JSONDecodeError
-from urllib.parse import unquote
+# import json
+# import re
+# from json import JSONDecodeError
+# from urllib.parse import unquote
 
-from bs4 import BeautifulSoup
-from bs4.element import Tag
-from recipe_scrapers import scrape_html, scrape_me
-from recipe_scrapers._exceptions import NoSchemaFoundInWildMode
-from recipe_scrapers._utils import get_host_name, normalize_string
+# from bs4 import BeautifulSoup
+# from bs4.element import Tag
+# from recipe_scrapers import scrape_html, scrape_me
+# from recipe_scrapers._exceptions import NoSchemaFoundInWildMode
+# from recipe_scrapers._utils import get_host_name, normalize_string
 
-from cookbook.helper import recipe_url_import as helper
-from cookbook.helper.scrapers.scrapers import text_scraper
+# from cookbook.helper import recipe_url_import as helper
+# from cookbook.helper.scrapers.scrapers import text_scraper
 
 
-def get_recipe_from_source(text, url, request):
-    def build_node(k, v):
-        if isinstance(v, dict):
-            node = {
-                'name': k,
-                'value': k,
-                'children': get_children_dict(v)
-            }
-        elif isinstance(v, list):
-            node = {
-                'name': k,
-                'value': k,
-                'children': get_children_list(v)
-            }
-        else:
-            node = {
-                'name': k + ": " + normalize_string(str(v)),
-                'value': normalize_string(str(v))
-            }
-        return node
-    def get_children_dict(children):
-        kid_list = []
-        for k, v in children.items():
-            kid_list.append(build_node(k, v))
-        return kid_list
-    def get_children_list(children):
-        kid_list = []
-        for kid in children:
-            if type(kid) == list:
-                node = {
-                    'name': "unknown list",
-                    'value': "unknown list",
-                    'children': get_children_list(kid)
-                }
-                kid_list.append(node)
-            elif type(kid) == dict:
-                for k, v in kid.items():
-                    kid_list.append(build_node(k, v))
-            else:
-                kid_list.append({
-                    'name': normalize_string(str(kid)),
-                    'value': normalize_string(str(kid))
-                })
-        return kid_list
-    recipe_tree = []
-    parse_list = []
-    soup = BeautifulSoup(text, "html.parser")
-    html_data = get_from_html(soup)
-    images = get_images_from_source(soup, url)
-    text = unquote(text)
-    scrape = None
-    if url and not text:
-        try:
-            scrape = scrape_me(url_path=url, wild_mode=True)
-        except(NoSchemaFoundInWildMode):
-            pass
-    if not scrape:
-        try:
-            parse_list.append(remove_graph(json.loads(text)))
-            if not url and 'url' in parse_list[0]:
-                url = parse_list[0]['url']
-            scrape = text_scraper("<script type='application/ld+json'>" + text + "</script>", url=url)
-        except JSONDecodeError:
-            for el in soup.find_all('script', type='application/ld+json'):
-                el = remove_graph(el)
-                if not url and 'url' in el:
-                    url = el['url']
-                if type(el) == list:
-                    for le in el:
-                        parse_list.append(le)
-                elif type(el) == dict:
-                    parse_list.append(el)
-            for el in soup.find_all(type='application/json'):
-                el = remove_graph(el)
-                if type(el) == list:
-                    for le in el:
-                        parse_list.append(le)
-                elif type(el) == dict:
-                    parse_list.append(el)
-            scrape = text_scraper(text, url=url)
-    recipe_json = helper.get_from_scraper(scrape, request)
-    # TODO: DEPRECATE recipe_tree & html_data. first validate it isn't used anywhere
-    for el in parse_list:
-        temp_tree = []
-        if isinstance(el, Tag):
-            try:
-                el = json.loads(el.string)
-            except TypeError:
-                continue
-        for k, v in el.items():
-            if isinstance(v, dict):
-                node = {
-                    'name': k,
-                    'value': k,
-                    'children': get_children_dict(v)
-                }
-            elif isinstance(v, list):
-                node = {
-                    'name': k,
-                    'value': k,
-                    'children': get_children_list(v)
-                }
-            else:
-                node = {
-                    'name': k + ": " + normalize_string(str(v)),
-                    'value': normalize_string(str(v))
-                }
-            temp_tree.append(node)
-        if '@type' in el and el['@type'] == 'Recipe':
-            recipe_tree += [{'name': 'ld+json', 'children': temp_tree}]
-        else:
-            recipe_tree += [{'name': 'json', 'children': temp_tree}]
-    return recipe_json, recipe_tree, html_data, images
+# def get_recipe_from_source(text, url, request):
+#     def build_node(k, v):
+#         if isinstance(v, dict):
+#             node = {
+#                 'name': k,
+#                 'value': k,
+#                 'children': get_children_dict(v)
+#             }
+#         elif isinstance(v, list):
+#             node = {
+#                 'name': k,
+#                 'value': k,
+#                 'children': get_children_list(v)
+#             }
+#         else:
+#             node = {
+#                 'name': k + ": " + normalize_string(str(v)),
+#                 'value': normalize_string(str(v))
+#             }
+#         return node
+#     def get_children_dict(children):
+#         kid_list = []
+#         for k, v in children.items():
+#             kid_list.append(build_node(k, v))
+#         return kid_list
+#     def get_children_list(children):
+#         kid_list = []
+#         for kid in children:
+#             if type(kid) == list:
+#                 node = {
+#                     'name': "unknown list",
+#                     'value': "unknown list",
+#                     'children': get_children_list(kid)
+#                 }
+#                 kid_list.append(node)
+#             elif type(kid) == dict:
+#                 for k, v in kid.items():
+#                     kid_list.append(build_node(k, v))
+#             else:
+#                 kid_list.append({
+#                     'name': normalize_string(str(kid)),
+#                     'value': normalize_string(str(kid))
+#                 })
+#         return kid_list
+#     recipe_tree = []
+#     parse_list = []
+#     soup = BeautifulSoup(text, "html.parser")
+#     html_data = get_from_html(soup)
+#     images = get_images_from_source(soup, url)
+#     text = unquote(text)
+#     scrape = None
+#     if url and not text:
+#         try:
+#             scrape = scrape_me(url_path=url, wild_mode=True)
+#         except(NoSchemaFoundInWildMode):
+#             pass
+#     if not scrape:
+#         try:
+#             parse_list.append(remove_graph(json.loads(text)))
+#             if not url and 'url' in parse_list[0]:
+#                 url = parse_list[0]['url']
+#             scrape = text_scraper("<script type='application/ld+json'>" + text + "</script>", url=url)
+#         except JSONDecodeError:
+#             for el in soup.find_all('script', type='application/ld+json'):
+#                 el = remove_graph(el)
+#                 if not url and 'url' in el:
+#                     url = el['url']
+#                 if type(el) == list:
+#                     for le in el:
+#                         parse_list.append(le)
+#                 elif type(el) == dict:
+#                     parse_list.append(el)
+#             for el in soup.find_all(type='application/json'):
+#                 el = remove_graph(el)
+#                 if type(el) == list:
+#                     for le in el:
+#                         parse_list.append(le)
+#                 elif type(el) == dict:
+#                     parse_list.append(el)
+#             scrape = text_scraper(text, url=url)
+#     recipe_json = helper.get_from_scraper(scrape, request)
+#     # TODO: DEPRECATE recipe_tree & html_data. first validate it isn't used anywhere
+#     for el in parse_list:
+#         temp_tree = []
+#         if isinstance(el, Tag):
+#             try:
+#                 el = json.loads(el.string)
+#             except TypeError:
+#                 continue
+#         for k, v in el.items():
+#             if isinstance(v, dict):
+#                 node = {
+#                     'name': k,
+#                     'value': k,
+#                     'children': get_children_dict(v)
+#                 }
+#             elif isinstance(v, list):
+#                 node = {
+#                     'name': k,
+#                     'value': k,
+#                     'children': get_children_list(v)
+#                 }
+#             else:
+#                 node = {
+#                     'name': k + ": " + normalize_string(str(v)),
+#                     'value': normalize_string(str(v))
+#                 }
+#             temp_tree.append(node)
+#         if '@type' in el and el['@type'] == 'Recipe':
+#             recipe_tree += [{'name': 'ld+json', 'children': temp_tree}]
+#         else:
+#             recipe_tree += [{'name': 'json', 'children': temp_tree}]
+#     return recipe_json, recipe_tree, html_data, images
 
 
-def get_from_html(soup):
-    INVISIBLE_ELEMS = ('style', 'script', 'head', 'title')
-    html = []
-    for s in soup.strings:
-        if ((s.parent.name not in INVISIBLE_ELEMS) and (len(s.strip()) > 0)):
-            html.append(s)
-    return html
+# def get_from_html(soup):
+#     INVISIBLE_ELEMS = ('style', 'script', 'head', 'title')
+#     html = []
+#     for s in soup.strings:
+#         if ((s.parent.name not in INVISIBLE_ELEMS) and (len(s.strip()) > 0)):
+#             html.append(s)
+#     return html
 
 
-def get_images_from_source(soup, url):
-    sources = ['src', 'srcset', 'data-src']
-    images = []
-    img_tags = soup.find_all('img')
-    if url:
-        site = get_host_name(url)
-        prot = url.split(':')[0]
-    urls = []
-    for img in img_tags:
-        for src in sources:
-            try:
-                urls.append(img[src])
-            except KeyError:
-                pass
-    for u in urls:
-        u = u.split('?')[0]
-        filename = re.search(r'/([\w_-]+[.](jpg|jpeg|gif|png))$', u)
-        if filename:
-            if (('http' not in u) and (url)):
-                # sometimes an image source can be relative
-                # if it is provide the base url
-                u = '{}://{}{}'.format(prot, site, u)
-            if 'http' in u:
-                images.append(u)
-    return images
+# def get_images_from_source(soup, url):
+#     sources = ['src', 'srcset', 'data-src']
+#     images = []
+#     img_tags = soup.find_all('img')
+#     if url:
+#         site = get_host_name(url)
+#         prot = url.split(':')[0]
+#     urls = []
+#     for img in img_tags:
+#         for src in sources:
+#             try:
+#                 urls.append(img[src])
+#             except KeyError:
+#                 pass
+#     for u in urls:
+#         u = u.split('?')[0]
+#         filename = re.search(r'/([\w_-]+[.](jpg|jpeg|gif|png))$', u)
+#         if filename:
+#             if (('http' not in u) and (url)):
+#                 # sometimes an image source can be relative
+#                 # if it is provide the base url
+#                 u = '{}://{}{}'.format(prot, site, u)
+#             if 'http' in u:
+#                 images.append(u)
+#     return images
 
 
-def remove_graph(el):
-    # recipes type might be wrapped in @graph type
-    if isinstance(el, Tag):
-        try:
-            el = json.loads(el.string)
-            if '@graph' in el:
-                for x in el['@graph']:
-                    if '@type' in x and x['@type'] == 'Recipe':
-                        el = x
-        except (TypeError, JSONDecodeError):
-            pass
-    return el
+# def remove_graph(el):
+#     # recipes type might be wrapped in @graph type
+#     if isinstance(el, Tag):
+#         try:
+#             el = json.loads(el.string)
+#             if '@graph' in el:
+#                 for x in el['@graph']:
+#                     if '@type' in x and x['@type'] == 'Recipe':
+#                         el = x
+#         except (TypeError, JSONDecodeError):
+#             pass
+#     return el
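For reference, a minimal sketch of the path that replaces the deprecated helper, assuming the post-commit signatures shown in this diff (text_scraper(text=..., url=...), helper.get_from_scraper(scrape, request) and get_images_from_soup(soup, url)); import_recipe_from_html is a hypothetical wrapper name, not part of the commit:

from cookbook.helper import recipe_url_import as helper
from cookbook.helper.recipe_url_import import get_images_from_soup
from cookbook.helper.scrapers.scrapers import text_scraper


def import_recipe_from_html(html, url, request):
    # build a schema.org-aware scraper over the raw page source
    scrape = text_scraper(text=html, url=url)
    # map the scraped schema onto Tandoor's internal recipe json
    recipe_json = helper.get_from_scraper(scrape, request)
    # collect candidate images, de-duplicated while preserving order
    images = list(dict.fromkeys(get_images_from_soup(scrape.soup, url)))
    return recipe_json, images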

View File

@@ -1,21 +1,19 @@
 import random
 import re
 from html import unescape
-from pytube import YouTube
 from unicodedata import decomposition
 
 from django.utils.dateparse import parse_duration
 from django.utils.translation import gettext as _
 from isodate import parse_duration as iso_parse_duration
 from isodate.isoerror import ISO8601Error
-from recipe_scrapers._utils import get_minutes
+from pytube import YouTube
+from recipe_scrapers._utils import get_host_name, get_minutes
 
 from cookbook.helper import recipe_url_import as helper
 from cookbook.helper.ingredient_parser import IngredientParser
 from cookbook.models import Keyword
 
 # from recipe_scrapers._utils import get_minutes  ## temporary until/unless upstream incorporates get_minutes() PR
@@ -369,3 +367,32 @@ def iso_duration_to_minutes(string):
         string
     ).groupdict()
     return int(match['days'] or 0) * 24 * 60 + int(match['hours'] or 0) * 60 + int(match['minutes'] or 0)
+
+
+def get_images_from_soup(soup, url):
+    sources = ['src', 'srcset', 'data-src']
+    images = []
+    img_tags = soup.find_all('img')
+    if url:
+        site = get_host_name(url)
+        prot = url.split(':')[0]
+
+    urls = []
+    for img in img_tags:
+        for src in sources:
+            try:
+                urls.append(img[src])
+            except KeyError:
+                pass
+
+    for u in urls:
+        u = u.split('?')[0]
+        filename = re.search(r'/([\w_-]+[.](jpg|jpeg|gif|png))$', u)
+        if filename:
+            if (('http' not in u) and (url)):
+                # sometimes an image source can be relative
+                # if it is provide the base url
+                u = '{}://{}{}'.format(prot, site, u)
+            if 'http' in u:
+                images.append(u)
+    return images
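A hypothetical round trip through the relocated get_images_from_soup() helper; the HTML snippet and host are invented for illustration:

from bs4 import BeautifulSoup

from cookbook.helper.recipe_url_import import get_images_from_soup

html = (
    '<img src="/media/pancakes.jpg?width=800">'
    '<img data-src="https://example.com/img/toast.png">'
    '<img src="data:image/gif;base64,R0lGOD">'
)
soup = BeautifulSoup(html, "html.parser")
images = get_images_from_soup(soup, "https://example.com/recipe/1")
# query strings are stripped, relative sources are resolved against the page
# host, and sources that do not end in an image filename (the data: URI) drop out:
# ['https://example.com/media/pancakes.jpg', 'https://example.com/img/toast.png']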

View File

@@ -27,17 +27,17 @@ def text_scraper(text, url=None):
     class TextScraper(scraper_class):
         def __init__(
             self,
-            page_data,
-            url=None
+            html=None,
+            url=None,
         ):
             self.wild_mode = False
             self.meta_http_equiv = False
-            self.soup = BeautifulSoup(page_data, "html.parser")
+            self.soup = BeautifulSoup(html, "html.parser")
             self.url = url
             self.recipe = None
             try:
-                self.schema = SchemaOrg(page_data)
+                self.schema = SchemaOrg(html)
             except (JSONDecodeError, AttributeError):
                 pass
-    return TextScraper(text, url)
+    return TextScraper(url=url, html=text)
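Call sites now pass the page source by keyword to match the renamed parameter. A small sketch, assuming a page whose ld+json parses cleanly (the markup is invented; the scrape.schema.data access mirrors its use in the api.py hunk below):

from cookbook.helper.scrapers.scrapers import text_scraper

page = (
    "<html><body><script type='application/ld+json'>"
    '{"@context": "https://schema.org", "@type": "Recipe", "name": "Pancakes"}'
    "</script></body></html>"
)
scrape = text_scraper(text=page, url="https://example.com/recipe/1")
scrape.soup                     # BeautifulSoup tree of the raw source
scrape.schema.data.get('name')  # 'Pancakes', when the schema parsed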

View File

@@ -10,8 +10,9 @@ import validators
 import yaml
 
+from cookbook.helper import recipe_url_import as helper
 from cookbook.helper.ingredient_parser import IngredientParser
-from cookbook.helper.recipe_html_import import get_recipe_from_source
-from cookbook.helper.recipe_url_import import iso_duration_to_minutes
+from cookbook.helper.recipe_url_import import get_images_from_soup, iso_duration_to_minutes
+from cookbook.helper.scrapers.scrapers import text_scraper
 from cookbook.integration.integration import Integration
 from cookbook.models import Ingredient, Keyword, Recipe, Step
@@ -24,7 +24,10 @@ class CookBookApp(Integration):
     def get_recipe_from_file(self, file):
         recipe_html = file.getvalue().decode("utf-8")
-        recipe_json, recipe_tree, html_data, images = get_recipe_from_source(recipe_html, 'CookBookApp', self.request)
+        # recipe_json, recipe_tree, html_data, images = get_recipe_from_source(recipe_html, 'CookBookApp', self.request)
+        scrape = text_scraper(text=recipe_html)
+        recipe_json = helper.get_from_scraper(scrape, self.request)
+        images = list(dict.fromkeys(get_images_from_soup(scrape.soup, None)))
 
         recipe = Recipe.objects.create(
             name=recipe_json['name'].strip(),
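The list(dict.fromkeys(...)) idiom used for the image list relies on dicts preserving insertion order (Python 3.7+), so it de-duplicates while keeping the first occurrence of each URL:

urls = ['https://ex.test/a.jpg', 'https://ex.test/b.jpg', 'https://ex.test/a.jpg']
list(dict.fromkeys(urls))  # ['https://ex.test/a.jpg', 'https://ex.test/b.jpg']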

View File

@@ -3,10 +3,9 @@ from io import BytesIO
 from zipfile import ZipFile
 
 from bs4 import BeautifulSoup
 from django.utils.translation import gettext as _
 
 from cookbook.helper.ingredient_parser import IngredientParser
-from cookbook.helper.recipe_html_import import get_recipe_from_source
 from cookbook.helper.recipe_url_import import iso_duration_to_minutes, parse_servings
 from cookbook.integration.integration import Integration
 from cookbook.models import Ingredient, Keyword, Recipe, Step

View File

@@ -5,6 +5,8 @@ import re
 import traceback
 import uuid
 from collections import OrderedDict
+from json import JSONDecodeError
+from urllib.parse import unquote
 from zipfile import ZipFile
 
 import requests
@@ -26,6 +28,8 @@ from django.utils.translation import gettext as _
 from django_scopes import scopes_disabled
 from icalendar import Calendar, Event
 from PIL import UnidentifiedImageError
+from recipe_scrapers import scrape_html, scrape_me
+from recipe_scrapers._exceptions import NoSchemaFoundInWildMode
 from requests.exceptions import MissingSchema
 from rest_framework import decorators, status, viewsets
 from rest_framework.authtoken.models import Token
@@ -40,6 +44,7 @@ from rest_framework.throttling import AnonRateThrottle
 from rest_framework.viewsets import ViewSetMixin
 from treebeard.exceptions import InvalidMoveToDescendant, InvalidPosition, PathOverflow
 
+from cookbook.helper import recipe_url_import as helper
 from cookbook.helper.HelperFunctions import str2bool
 from cookbook.helper.image_processing import handle_image
 from cookbook.helper.ingredient_parser import IngredientParser
@@ -47,9 +52,9 @@ from cookbook.helper.permission_helper import (CustomIsAdmin, CustomIsGuest, Cus
                                                CustomIsOwnerReadOnly, CustomIsShare, CustomIsShared,
                                                CustomIsSpaceOwner, CustomIsUser, group_required,
                                                is_space_owner, switch_user_active_space)
-from cookbook.helper.recipe_html_import import get_recipe_from_source
 from cookbook.helper.recipe_search import RecipeFacet, RecipeSearch, old_search
-from cookbook.helper.recipe_url_import import get_from_youtube_scraper
+from cookbook.helper.recipe_url_import import get_from_youtube_scraper, get_images_from_soup
+from cookbook.helper.scrapers.scrapers import text_scraper
 from cookbook.helper.shopping_helper import RecipeShoppingEditor, shopping_helper
 from cookbook.models import (Automation, BookmarkletImport, CookLog, CustomFilter, ExportLog, Food,
                              FoodInheritField, ImportLog, Ingredient, InviteLink, Keyword, MealPlan,
@@ -1116,69 +1121,79 @@ def recipe_from_source(request):
             - url: url to use for importing recipe
             - data: if no url is given recipe is imported from provided source data
             - (optional) bookmarklet: id of bookmarklet import to use, overrides URL and data attributes
-    :return: JsonResponse containing the parsed json, original html,json and images
+    :return: JsonResponse containing the parsed json and images
     """
+    scrape = None
     serializer = RecipeFromSourceSerializer(data=request.data)
     if serializer.is_valid():
-        # headers to use for request to external sites - DEPRECATE
-        external_request_headers = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7"}
         if (b_pk := serializer.validated_data.get('bookmarklet', None)) and (bookmarklet := BookmarkletImport.objects.filter(pk=b_pk).first()):
             serializer.validated_data['url'] = bookmarklet.url
             serializer.validated_data['data'] = bookmarklet.html
             bookmarklet.delete()
-        elif not 'url' in serializer.validated_data and not 'data' in serializer.validated_data:
+        url = serializer.validated_data.get('url', None)
+        data = unquote(serializer.validated_data.get('data', None))
+        if not url and not data:
             return Response({
                 'error': True,
                 'msg': _('Nothing to do.')
             }, status=status.HTTP_400_BAD_REQUEST)
-        # in manual mode request complete page to return it later
-        elif 'url' in serializer.validated_data and serializer.validated_data['url'] != '':
-            if re.match('^(https?://)?(www\.youtube\.com|youtu\.be)/.+$', serializer.validated_data['url']):
-                if validators.url(serializer.validated_data['url'], public=True):
+        elif url and not data:
+            if re.match('^(https?://)?(www\.youtube\.com|youtu\.be)/.+$', url):
+                if validators.url(url, public=True):
                     return Response({
-                        'recipe_json': get_from_youtube_scraper(serializer.validated_data['url'], request),
-                        'recipe_tree': '',
-                        'recipe_html': '',
+                        'recipe_json': get_from_youtube_scraper(url, request),
+                        # 'recipe_tree': '',
+                        # 'recipe_html': '',
                         'recipe_images': [],
                     }, status=status.HTTP_200_OK)
-            #######
-            # this section is redundant to scrape_me. REFACTOR to catch errors from scrape_me
-            try:
-                if validators.url(serializer.validated_data['url'], public=True):
-                    requests.get(serializer.validated_data['url'], headers=external_request_headers).content
-                else:
-                    return Response({
-                        'error': True,
-                        'msg': _('Invalid Url')
-                    }, status=status.HTTP_400_BAD_REQUEST)
-            except requests.exceptions.ConnectionError:
-                return Response({
-                    'error': True,
-                    'msg': _('Connection Refused.')
-                }, status=status.HTTP_400_BAD_REQUEST)
-            except requests.exceptions.MissingSchema:
-                return Response({
-                    'error': True,
-                    'msg': _('Bad URL Schema.')
-                }, status=status.HTTP_400_BAD_REQUEST)
-            #######
-        recipe_json, recipe_tree, recipe_html, recipe_images = get_recipe_from_source(serializer.validated_data['data'], serializer.validated_data['url'], request)
-        if len(recipe_tree) == 0 and len(recipe_json) == 0:
+            else:
+                try:
+                    if validators.url(url, public=True):
+                        scrape = scrape_me(url_path=url, wild_mode=True)
+                    else:
+                        return Response({
+                            'error': True,
+                            'msg': _('Invalid Url')
+                        }, status=status.HTTP_400_BAD_REQUEST)
+                except NoSchemaFoundInWildMode:
+                    pass
+                except requests.exceptions.ConnectionError:
+                    return Response({
+                        'error': True,
+                        'msg': _('Connection Refused.')
+                    }, status=status.HTTP_400_BAD_REQUEST)
+                except requests.exceptions.MissingSchema:
+                    return Response({
+                        'error': True,
+                        'msg': _('Bad URL Schema.')
+                    }, status=status.HTTP_400_BAD_REQUEST)
+        else:
+            try:
+                json.loads(data)
+                data = "<script type='application/ld+json'>" + data + "</script>"
+            except JSONDecodeError:
+                pass
+            scrape = text_scraper(text=data, url=url)
+            if not url and (found_url := scrape.schema.data.get('url', None)):
+                scrape = text_scraper(text=data, url=found_url)
+        if scrape:
+            return Response({
+                'recipe_json': helper.get_from_scraper(scrape, request),
+                # 'recipe_tree': recipe_tree,
+                # 'recipe_html': recipe_html,
+                'recipe_images': list(dict.fromkeys(get_images_from_soup(scrape.soup, url))),
+            }, status=status.HTTP_200_OK)
+        else:
             return Response({
                 'error': True,
                 'msg': _('No usable data could be found.')
             }, status=status.HTTP_400_BAD_REQUEST)
-        else:
-            return Response({
-                'recipe_json': recipe_json,
-                'recipe_tree': recipe_tree,
-                'recipe_html': recipe_html,
-                'recipe_images': list(dict.fromkeys(recipe_images)),
-            }, status=status.HTTP_200_OK)
     else:
         return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)
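A hypothetical client call against the reworked endpoint; the /api/recipe-from-source/ path and the token header are assumptions drawn from Tandoor's REST conventions, not from this diff. Note the payload no longer carries recipe_tree or recipe_html:

import requests

resp = requests.post(
    "https://tandoor.example/api/recipe-from-source/",
    json={"url": "https://example.com/best-pancakes"},
    headers={"Authorization": "Token <api-token>"},
)
payload = resp.json()
payload["recipe_json"]    # parsed recipe, ready for the import preview
payload["recipe_images"]  # ordered, de-duplicated candidate image urls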

View File

@@ -461,8 +461,8 @@ export default {
             recent_urls: [],
             source_data: '',
             recipe_json: undefined,
-            recipe_html: undefined,
-            recipe_tree: undefined,
+            // recipe_html: undefined,
+            // recipe_tree: undefined,
             recipe_images: [],
             imported_recipes: [],
             failed_imports: [],
@@ -593,9 +593,9 @@ export default {
             }
             // reset all variables
-            this.recipe_html = undefined
+            // this.recipe_html = undefined
             this.recipe_json = undefined
-            this.recipe_tree = undefined
+            // this.recipe_tree = undefined
             this.recipe_images = []
             // load recipe
@@ -621,8 +621,8 @@ export default {
                 return x
             })
-            this.recipe_tree = response.data['recipe_tree'];
-            this.recipe_html = response.data['recipe_html'];
+            // this.recipe_tree = response.data['recipe_tree'];
+            // this.recipe_html = response.data['recipe_html'];
             this.recipe_images = response.data['recipe_images'] !== undefined ? response.data['recipe_images'] : [];
             if (!silent) {