deprecate get_recipe_from_source

This commit is contained in:
smilerz
2022-07-07 15:09:22 -05:00
parent b1c0334947
commit e40b73f420
7 changed files with 272 additions and 228 deletions

View File

@@ -1,191 +1,191 @@
import json
import re
from json import JSONDecodeError
from urllib.parse import unquote
# import json
# import re
# from json import JSONDecodeError
# from urllib.parse import unquote
from bs4 import BeautifulSoup
from bs4.element import Tag
from recipe_scrapers import scrape_html, scrape_me
from recipe_scrapers._exceptions import NoSchemaFoundInWildMode
from recipe_scrapers._utils import get_host_name, normalize_string
# from bs4 import BeautifulSoup
# from bs4.element import Tag
# from recipe_scrapers import scrape_html, scrape_me
# from recipe_scrapers._exceptions import NoSchemaFoundInWildMode
# from recipe_scrapers._utils import get_host_name, normalize_string
from cookbook.helper import recipe_url_import as helper
from cookbook.helper.scrapers.scrapers import text_scraper
# from cookbook.helper import recipe_url_import as helper
# from cookbook.helper.scrapers.scrapers import text_scraper
def get_recipe_from_source(text, url, request):
def build_node(k, v):
if isinstance(v, dict):
node = {
'name': k,
'value': k,
'children': get_children_dict(v)
}
elif isinstance(v, list):
node = {
'name': k,
'value': k,
'children': get_children_list(v)
}
else:
node = {
'name': k + ": " + normalize_string(str(v)),
'value': normalize_string(str(v))
}
return node
# def get_recipe_from_source(text, url, request):
# def build_node(k, v):
# if isinstance(v, dict):
# node = {
# 'name': k,
# 'value': k,
# 'children': get_children_dict(v)
# }
# elif isinstance(v, list):
# node = {
# 'name': k,
# 'value': k,
# 'children': get_children_list(v)
# }
# else:
# node = {
# 'name': k + ": " + normalize_string(str(v)),
# 'value': normalize_string(str(v))
# }
# return node
def get_children_dict(children):
kid_list = []
for k, v in children.items():
kid_list.append(build_node(k, v))
return kid_list
# def get_children_dict(children):
# kid_list = []
# for k, v in children.items():
# kid_list.append(build_node(k, v))
# return kid_list
def get_children_list(children):
kid_list = []
for kid in children:
if type(kid) == list:
node = {
'name': "unknown list",
'value': "unknown list",
'children': get_children_list(kid)
}
kid_list.append(node)
elif type(kid) == dict:
for k, v in kid.items():
kid_list.append(build_node(k, v))
else:
kid_list.append({
'name': normalize_string(str(kid)),
'value': normalize_string(str(kid))
})
return kid_list
# def get_children_list(children):
# kid_list = []
# for kid in children:
# if type(kid) == list:
# node = {
# 'name': "unknown list",
# 'value': "unknown list",
# 'children': get_children_list(kid)
# }
# kid_list.append(node)
# elif type(kid) == dict:
# for k, v in kid.items():
# kid_list.append(build_node(k, v))
# else:
# kid_list.append({
# 'name': normalize_string(str(kid)),
# 'value': normalize_string(str(kid))
# })
# return kid_list
recipe_tree = []
parse_list = []
soup = BeautifulSoup(text, "html.parser")
html_data = get_from_html(soup)
images = get_images_from_source(soup, url)
text = unquote(text)
scrape = None
# recipe_tree = []
# parse_list = []
# soup = BeautifulSoup(text, "html.parser")
# html_data = get_from_html(soup)
# images = get_images_from_source(soup, url)
# text = unquote(text)
# scrape = None
if url and not text:
try:
scrape = scrape_me(url_path=url, wild_mode=True)
except(NoSchemaFoundInWildMode):
pass
# if url and not text:
# try:
# scrape = scrape_me(url_path=url, wild_mode=True)
# except(NoSchemaFoundInWildMode):
# pass
if not scrape:
try:
parse_list.append(remove_graph(json.loads(text)))
if not url and 'url' in parse_list[0]:
url = parse_list[0]['url']
scrape = text_scraper("<script type='application/ld+json'>" + text + "</script>", url=url)
# if not scrape:
# try:
# parse_list.append(remove_graph(json.loads(text)))
# if not url and 'url' in parse_list[0]:
# url = parse_list[0]['url']
# scrape = text_scraper("<script type='application/ld+json'>" + text + "</script>", url=url)
except JSONDecodeError:
for el in soup.find_all('script', type='application/ld+json'):
el = remove_graph(el)
if not url and 'url' in el:
url = el['url']
if type(el) == list:
for le in el:
parse_list.append(le)
elif type(el) == dict:
parse_list.append(el)
for el in soup.find_all(type='application/json'):
el = remove_graph(el)
if type(el) == list:
for le in el:
parse_list.append(le)
elif type(el) == dict:
parse_list.append(el)
scrape = text_scraper(text, url=url)
# except JSONDecodeError:
# for el in soup.find_all('script', type='application/ld+json'):
# el = remove_graph(el)
# if not url and 'url' in el:
# url = el['url']
# if type(el) == list:
# for le in el:
# parse_list.append(le)
# elif type(el) == dict:
# parse_list.append(el)
# for el in soup.find_all(type='application/json'):
# el = remove_graph(el)
# if type(el) == list:
# for le in el:
# parse_list.append(le)
# elif type(el) == dict:
# parse_list.append(el)
# scrape = text_scraper(text, url=url)
recipe_json = helper.get_from_scraper(scrape, request)
# recipe_json = helper.get_from_scraper(scrape, request)
# TODO: DEPRECATE recipe_tree & html_data. first validate it isn't used anywhere
for el in parse_list:
temp_tree = []
if isinstance(el, Tag):
try:
el = json.loads(el.string)
except TypeError:
continue
# # TODO: DEPRECATE recipe_tree & html_data. first validate it isn't used anywhere
# for el in parse_list:
# temp_tree = []
# if isinstance(el, Tag):
# try:
# el = json.loads(el.string)
# except TypeError:
# continue
for k, v in el.items():
if isinstance(v, dict):
node = {
'name': k,
'value': k,
'children': get_children_dict(v)
}
elif isinstance(v, list):
node = {
'name': k,
'value': k,
'children': get_children_list(v)
}
else:
node = {
'name': k + ": " + normalize_string(str(v)),
'value': normalize_string(str(v))
}
temp_tree.append(node)
# for k, v in el.items():
# if isinstance(v, dict):
# node = {
# 'name': k,
# 'value': k,
# 'children': get_children_dict(v)
# }
# elif isinstance(v, list):
# node = {
# 'name': k,
# 'value': k,
# 'children': get_children_list(v)
# }
# else:
# node = {
# 'name': k + ": " + normalize_string(str(v)),
# 'value': normalize_string(str(v))
# }
# temp_tree.append(node)
if '@type' in el and el['@type'] == 'Recipe':
recipe_tree += [{'name': 'ld+json', 'children': temp_tree}]
else:
recipe_tree += [{'name': 'json', 'children': temp_tree}]
# if '@type' in el and el['@type'] == 'Recipe':
# recipe_tree += [{'name': 'ld+json', 'children': temp_tree}]
# else:
# recipe_tree += [{'name': 'json', 'children': temp_tree}]
return recipe_json, recipe_tree, html_data, images
# return recipe_json, recipe_tree, html_data, images
def get_from_html(soup):
INVISIBLE_ELEMS = ('style', 'script', 'head', 'title')
html = []
for s in soup.strings:
if ((s.parent.name not in INVISIBLE_ELEMS) and (len(s.strip()) > 0)):
html.append(s)
return html
# def get_from_html(soup):
# INVISIBLE_ELEMS = ('style', 'script', 'head', 'title')
# html = []
# for s in soup.strings:
# if ((s.parent.name not in INVISIBLE_ELEMS) and (len(s.strip()) > 0)):
# html.append(s)
# return html
def get_images_from_source(soup, url):
sources = ['src', 'srcset', 'data-src']
images = []
img_tags = soup.find_all('img')
if url:
site = get_host_name(url)
prot = url.split(':')[0]
# def get_images_from_source(soup, url):
# sources = ['src', 'srcset', 'data-src']
# images = []
# img_tags = soup.find_all('img')
# if url:
# site = get_host_name(url)
# prot = url.split(':')[0]
urls = []
for img in img_tags:
for src in sources:
try:
urls.append(img[src])
except KeyError:
pass
# urls = []
# for img in img_tags:
# for src in sources:
# try:
# urls.append(img[src])
# except KeyError:
# pass
for u in urls:
u = u.split('?')[0]
filename = re.search(r'/([\w_-]+[.](jpg|jpeg|gif|png))$', u)
if filename:
if (('http' not in u) and (url)):
# sometimes an image source can be relative
# if it is provide the base url
u = '{}://{}{}'.format(prot, site, u)
if 'http' in u:
images.append(u)
return images
# for u in urls:
# u = u.split('?')[0]
# filename = re.search(r'/([\w_-]+[.](jpg|jpeg|gif|png))$', u)
# if filename:
# if (('http' not in u) and (url)):
# # sometimes an image source can be relative
# # if it is provide the base url
# u = '{}://{}{}'.format(prot, site, u)
# if 'http' in u:
# images.append(u)
# return images
def remove_graph(el):
# recipes type might be wrapped in @graph type
if isinstance(el, Tag):
try:
el = json.loads(el.string)
if '@graph' in el:
for x in el['@graph']:
if '@type' in x and x['@type'] == 'Recipe':
el = x
except (TypeError, JSONDecodeError):
pass
return el
# def remove_graph(el):
# # recipes type might be wrapped in @graph type
# if isinstance(el, Tag):
# try:
# el = json.loads(el.string)
# if '@graph' in el:
# for x in el['@graph']:
# if '@type' in x and x['@type'] == 'Recipe':
# el = x
# except (TypeError, JSONDecodeError):
# pass
# return el

View File

@@ -1,21 +1,19 @@
import random
import re
from html import unescape
from pytube import YouTube
from unicodedata import decomposition
from django.utils.dateparse import parse_duration
from django.utils.translation import gettext as _
from isodate import parse_duration as iso_parse_duration
from isodate.isoerror import ISO8601Error
from recipe_scrapers._utils import get_minutes
from pytube import YouTube
from recipe_scrapers._utils import get_host_name, get_minutes
from cookbook.helper import recipe_url_import as helper
from cookbook.helper.ingredient_parser import IngredientParser
from cookbook.models import Keyword
# from recipe_scrapers._utils import get_minutes ## temporary until/unless upstream incorporates get_minutes() PR
@@ -369,3 +367,32 @@ def iso_duration_to_minutes(string):
string
).groupdict()
return int(match['days'] or 0) * 24 * 60 + int(match['hours'] or 0) * 60 + int(match['minutes'] or 0)
def get_images_from_soup(soup, url):
sources = ['src', 'srcset', 'data-src']
images = []
img_tags = soup.find_all('img')
if url:
site = get_host_name(url)
prot = url.split(':')[0]
urls = []
for img in img_tags:
for src in sources:
try:
urls.append(img[src])
except KeyError:
pass
for u in urls:
u = u.split('?')[0]
filename = re.search(r'/([\w_-]+[.](jpg|jpeg|gif|png))$', u)
if filename:
if (('http' not in u) and (url)):
# sometimes an image source can be relative
# if it is provide the base url
u = '{}://{}{}'.format(prot, site, u)
if 'http' in u:
images.append(u)
return images

View File

@@ -27,17 +27,17 @@ def text_scraper(text, url=None):
class TextScraper(scraper_class):
def __init__(
self,
page_data,
url=None
html=None,
url=None,
):
self.wild_mode = False
self.meta_http_equiv = False
self.soup = BeautifulSoup(page_data, "html.parser")
self.soup = BeautifulSoup(html, "html.parser")
self.url = url
self.recipe = None
try:
self.schema = SchemaOrg(page_data)
self.schema = SchemaOrg(html)
except (JSONDecodeError, AttributeError):
pass
return TextScraper(text, url)
return TextScraper(url=url, html=text)

View File

@@ -10,8 +10,8 @@ import validators
import yaml
from cookbook.helper.ingredient_parser import IngredientParser
from cookbook.helper.recipe_html_import import get_recipe_from_source
from cookbook.helper.recipe_url_import import iso_duration_to_minutes
from cookbook.helper.recipe_url_import import get_images_from_soup, iso_duration_to_minutes
from cookbook.helper.scrapers.scrapers import text_scraper
from cookbook.integration.integration import Integration
from cookbook.models import Ingredient, Keyword, Recipe, Step
@@ -24,7 +24,10 @@ class CookBookApp(Integration):
def get_recipe_from_file(self, file):
recipe_html = file.getvalue().decode("utf-8")
recipe_json, recipe_tree, html_data, images = get_recipe_from_source(recipe_html, 'CookBookApp', self.request)
# recipe_json, recipe_tree, html_data, images = get_recipe_from_source(recipe_html, 'CookBookApp', self.request)
scrape = text_scraper(text=data)
recipe_json = helper.get_from_scraper(scrape, request)
images = list(dict.fromkeys(get_images_from_soup(scrape.soup, url)))
recipe = Recipe.objects.create(
name=recipe_json['name'].strip(),

View File

@@ -3,10 +3,9 @@ from io import BytesIO
from zipfile import ZipFile
from bs4 import BeautifulSoup
from django.utils.translation import gettext as _
from cookbook.helper.ingredient_parser import IngredientParser
from cookbook.helper.recipe_html_import import get_recipe_from_source
from cookbook.helper.recipe_url_import import iso_duration_to_minutes, parse_servings
from cookbook.integration.integration import Integration
from cookbook.models import Ingredient, Keyword, Recipe, Step

View File

@@ -5,6 +5,8 @@ import re
import traceback
import uuid
from collections import OrderedDict
from json import JSONDecodeError
from urllib.parse import unquote
from zipfile import ZipFile
import requests
@@ -26,6 +28,8 @@ from django.utils.translation import gettext as _
from django_scopes import scopes_disabled
from icalendar import Calendar, Event
from PIL import UnidentifiedImageError
from recipe_scrapers import scrape_html, scrape_me
from recipe_scrapers._exceptions import NoSchemaFoundInWildMode
from requests.exceptions import MissingSchema
from rest_framework import decorators, status, viewsets
from rest_framework.authtoken.models import Token
@@ -40,6 +44,7 @@ from rest_framework.throttling import AnonRateThrottle
from rest_framework.viewsets import ViewSetMixin
from treebeard.exceptions import InvalidMoveToDescendant, InvalidPosition, PathOverflow
from cookbook.helper import recipe_url_import as helper
from cookbook.helper.HelperFunctions import str2bool
from cookbook.helper.image_processing import handle_image
from cookbook.helper.ingredient_parser import IngredientParser
@@ -47,9 +52,9 @@ from cookbook.helper.permission_helper import (CustomIsAdmin, CustomIsGuest, Cus
CustomIsOwnerReadOnly, CustomIsShare, CustomIsShared,
CustomIsSpaceOwner, CustomIsUser, group_required,
is_space_owner, switch_user_active_space)
from cookbook.helper.recipe_html_import import get_recipe_from_source
from cookbook.helper.recipe_search import RecipeFacet, RecipeSearch, old_search
from cookbook.helper.recipe_url_import import get_from_youtube_scraper
from cookbook.helper.recipe_url_import import get_from_youtube_scraper, get_images_from_soup
from cookbook.helper.scrapers.scrapers import text_scraper
from cookbook.helper.shopping_helper import RecipeShoppingEditor, shopping_helper
from cookbook.models import (Automation, BookmarkletImport, CookLog, CustomFilter, ExportLog, Food,
FoodInheritField, ImportLog, Ingredient, InviteLink, Keyword, MealPlan,
@@ -1116,69 +1121,79 @@ def recipe_from_source(request):
- url: url to use for importing recipe
- data: if no url is given recipe is imported from provided source data
- (optional) bookmarklet: id of bookmarklet import to use, overrides URL and data attributes
:return: JsonResponse containing the parsed json, original html,json and images
:return: JsonResponse containing the parsed json and images
"""
scrape = None
serializer = RecipeFromSourceSerializer(data=request.data)
if serializer.is_valid():
# headers to use for request to external sites - DEPRECATE
external_request_headers = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7"}
if (b_pk := serializer.validated_data.get('bookmarklet', None)) and (bookmarklet := BookmarkletImport.objects.filter(pk=b_pk).first()):
serializer.validated_data['url'] = bookmarklet.url
serializer.validated_data['data'] = bookmarklet.html
bookmarklet.delete()
elif not 'url' in serializer.validated_data and not 'data' in serializer.validated_data:
url = serializer.validated_data.get('url', None)
data = unquote(serializer.validated_data.get('data', None))
if not url and not data:
return Response({
'error': True,
'msg': _('Nothing to do.')
}, status=status.HTTP_400_BAD_REQUEST)
# in manual mode request complete page to return it later
elif 'url' in serializer.validated_data and serializer.validated_data['url'] != '':
if re.match('^(https?://)?(www\.youtube\.com|youtu\.be)/.+$', serializer.validated_data['url']):
if validators.url(serializer.validated_data['url'], public=True):
elif url and not data:
if re.match('^(https?://)?(www\.youtube\.com|youtu\.be)/.+$', url):
if validators.url(url, public=True):
return Response({
'recipe_json': get_from_youtube_scraper(serializer.validated_data['url'], request),
'recipe_tree': '',
'recipe_html': '',
'recipe_json': get_from_youtube_scraper(url, request),
# 'recipe_tree': '',
# 'recipe_html': '',
'recipe_images': [],
}, status=status.HTTP_200_OK)
#######
# this section is redundant to scrape_me. REFACTOR to catch errors from scrape_me
try:
if validators.url(serializer.validated_data['url'], public=True):
requests.get(serializer.validated_data['url'], headers=external_request_headers).content
else:
else:
try:
if validators.url(url, public=True):
scrape = scrape_me(url_path=url, wild_mode=True)
else:
return Response({
'error': True,
'msg': _('Invalid Url')
}, status=status.HTTP_400_BAD_REQUEST)
except NoSchemaFoundInWildMode:
pass
except requests.exceptions.ConnectionError:
return Response({
'error': True,
'msg': _('Invalid Url')
'msg': _('Connection Refused.')
}, status=status.HTTP_400_BAD_REQUEST)
except requests.exceptions.ConnectionError:
return Response({
'error': True,
'msg': _('Connection Refused.')
}, status=status.HTTP_400_BAD_REQUEST)
except requests.exceptions.MissingSchema:
return Response({
'error': True,
'msg': _('Bad URL Schema.')
}, status=status.HTTP_400_BAD_REQUEST)
#######
except requests.exceptions.MissingSchema:
return Response({
'error': True,
'msg': _('Bad URL Schema.')
}, status=status.HTTP_400_BAD_REQUEST)
else:
try:
json.loads(data)
data = "<script type='application/ld+json'>" + data + "</script>"
except JSONDecodeError:
pass
scrape = text_scraper(text=data, url=url)
if not url and (found_url := scrape.schema.data.get('url', None)):
scrape = text_scraper(text=data, url=found_url)
recipe_json, recipe_tree, recipe_html, recipe_images = get_recipe_from_source(serializer.validated_data['data'], serializer.validated_data['url'], request)
if len(recipe_tree) == 0 and len(recipe_json) == 0:
if scrape:
return Response({
'recipe_json': helper.get_from_scraper(scrape, request),
# 'recipe_tree': recipe_tree,
# 'recipe_html': recipe_html,
'recipe_images': list(dict.fromkeys(get_images_from_soup(scrape.soup, url))),
}, status=status.HTTP_200_OK)
else:
return Response({
'error': True,
'msg': _('No usable data could be found.')
}, status=status.HTTP_400_BAD_REQUEST)
else:
return Response({
'recipe_json': recipe_json,
'recipe_tree': recipe_tree,
'recipe_html': recipe_html,
'recipe_images': list(dict.fromkeys(recipe_images)),
}, status=status.HTTP_200_OK)
else:
return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)