Merge pull request #1917 from smilerz/bookmarklet_fix

Bookmarklet fix
vabene1111 authored 2022-07-11 14:28:08 +02:00, committed via GitHub
8 changed files with 327 additions and 271 deletions

View File

@@ -1,189 +1,191 @@
-import json
-import re
-from json import JSONDecodeError
-from urllib.parse import unquote
+# import json
+# import re
+# from json import JSONDecodeError
+# from urllib.parse import unquote
 
-from bs4 import BeautifulSoup
-from bs4.element import Tag
-from recipe_scrapers import scrape_html, scrape_me
-from recipe_scrapers._exceptions import NoSchemaFoundInWildMode
-from recipe_scrapers._utils import get_host_name, normalize_string
+# from bs4 import BeautifulSoup
+# from bs4.element import Tag
+# from recipe_scrapers import scrape_html, scrape_me
+# from recipe_scrapers._exceptions import NoSchemaFoundInWildMode
+# from recipe_scrapers._utils import get_host_name, normalize_string
 
-from cookbook.helper import recipe_url_import as helper
-from cookbook.helper.scrapers.scrapers import text_scraper
+# from cookbook.helper import recipe_url_import as helper
+# from cookbook.helper.scrapers.scrapers import text_scraper
 
-def get_recipe_from_source(text, url, request):
-    def build_node(k, v):
-        if isinstance(v, dict):
-            node = {
-                'name': k,
-                'value': k,
-                'children': get_children_dict(v)
-            }
-        elif isinstance(v, list):
-            node = {
-                'name': k,
-                'value': k,
-                'children': get_children_list(v)
-            }
-        else:
-            node = {
-                'name': k + ": " + normalize_string(str(v)),
-                'value': normalize_string(str(v))
-            }
-        return node
+# def get_recipe_from_source(text, url, request):
+#     def build_node(k, v):
+#         if isinstance(v, dict):
+#             node = {
+#                 'name': k,
+#                 'value': k,
+#                 'children': get_children_dict(v)
+#             }
+#         elif isinstance(v, list):
+#             node = {
+#                 'name': k,
+#                 'value': k,
+#                 'children': get_children_list(v)
+#             }
+#         else:
+#             node = {
+#                 'name': k + ": " + normalize_string(str(v)),
+#                 'value': normalize_string(str(v))
+#             }
+#         return node
 
-    def get_children_dict(children):
-        kid_list = []
-        for k, v in children.items():
-            kid_list.append(build_node(k, v))
-        return kid_list
+#     def get_children_dict(children):
+#         kid_list = []
+#         for k, v in children.items():
+#             kid_list.append(build_node(k, v))
+#         return kid_list
 
-    def get_children_list(children):
-        kid_list = []
-        for kid in children:
-            if type(kid) == list:
-                node = {
-                    'name': "unknown list",
-                    'value': "unknown list",
-                    'children': get_children_list(kid)
-                }
-                kid_list.append(node)
-            elif type(kid) == dict:
-                for k, v in kid.items():
-                    kid_list.append(build_node(k, v))
-            else:
-                kid_list.append({
-                    'name': normalize_string(str(kid)),
-                    'value': normalize_string(str(kid))
-                })
-        return kid_list
+#     def get_children_list(children):
+#         kid_list = []
+#         for kid in children:
+#             if type(kid) == list:
+#                 node = {
+#                     'name': "unknown list",
+#                     'value': "unknown list",
+#                     'children': get_children_list(kid)
+#                 }
+#                 kid_list.append(node)
+#             elif type(kid) == dict:
+#                 for k, v in kid.items():
+#                     kid_list.append(build_node(k, v))
+#             else:
+#                 kid_list.append({
+#                     'name': normalize_string(str(kid)),
+#                     'value': normalize_string(str(kid))
+#                 })
+#         return kid_list
 
-    recipe_tree = []
-    parse_list = []
-    soup = BeautifulSoup(text, "html.parser")
-    html_data = get_from_html(soup)
-    images = get_images_from_source(soup, url)
-    text = unquote(text)
-    scrape = None
+#     recipe_tree = []
+#     parse_list = []
+#     soup = BeautifulSoup(text, "html.parser")
+#     html_data = get_from_html(soup)
+#     images = get_images_from_source(soup, url)
+#     text = unquote(text)
+#     scrape = None
 
-    if url:
-        try:
-            scrape = scrape_me(url_path=url, wild_mode=True)
-        except(NoSchemaFoundInWildMode):
-            pass
-    if not scrape:
-        try:
-            parse_list.append(remove_graph(json.loads(text)))
-            if not url and 'url' in parse_list[0]:
-                url = parse_list[0]['url']
-            scrape = text_scraper("<script type='application/ld+json'>" + text + "</script>", url=url)
+#     if url and not text:
+#         try:
+#             scrape = scrape_me(url_path=url, wild_mode=True)
+#         except(NoSchemaFoundInWildMode):
+#             pass
-        except JSONDecodeError:
-            for el in soup.find_all('script', type='application/ld+json'):
-                el = remove_graph(el)
-                if not url and 'url' in el:
-                    url = el['url']
-                if type(el) == list:
-                    for le in el:
-                        parse_list.append(le)
-                elif type(el) == dict:
-                    parse_list.append(el)
-            for el in soup.find_all(type='application/json'):
-                el = remove_graph(el)
-                if type(el) == list:
-                    for le in el:
-                        parse_list.append(le)
-                elif type(el) == dict:
-                    parse_list.append(el)
-            scrape = text_scraper(text, url=url)
+#     if not scrape:
+#         try:
+#             parse_list.append(remove_graph(json.loads(text)))
+#             if not url and 'url' in parse_list[0]:
+#                 url = parse_list[0]['url']
+#             scrape = text_scraper("<script type='application/ld+json'>" + text + "</script>", url=url)
 
-    recipe_json = helper.get_from_scraper(scrape, request)
+#         except JSONDecodeError:
+#             for el in soup.find_all('script', type='application/ld+json'):
+#                 el = remove_graph(el)
+#                 if not url and 'url' in el:
+#                     url = el['url']
+#                 if type(el) == list:
+#                     for le in el:
+#                         parse_list.append(le)
+#                 elif type(el) == dict:
+#                     parse_list.append(el)
+#             for el in soup.find_all(type='application/json'):
+#                 el = remove_graph(el)
+#                 if type(el) == list:
+#                     for le in el:
+#                         parse_list.append(le)
+#                 elif type(el) == dict:
+#                     parse_list.append(el)
+#             scrape = text_scraper(text, url=url)
 
-    for el in parse_list:
-        temp_tree = []
-        if isinstance(el, Tag):
-            try:
-                el = json.loads(el.string)
-            except TypeError:
-                continue
+#     recipe_json = helper.get_from_scraper(scrape, request)
 
-        for k, v in el.items():
-            if isinstance(v, dict):
-                node = {
-                    'name': k,
-                    'value': k,
-                    'children': get_children_dict(v)
-                }
-            elif isinstance(v, list):
-                node = {
-                    'name': k,
-                    'value': k,
-                    'children': get_children_list(v)
-                }
-            else:
-                node = {
-                    'name': k + ": " + normalize_string(str(v)),
-                    'value': normalize_string(str(v))
-                }
-            temp_tree.append(node)
+# # TODO: DEPRECATE recipe_tree & html_data. first validate it isn't used anywhere
+#     for el in parse_list:
+#         temp_tree = []
+#         if isinstance(el, Tag):
+#             try:
+#                 el = json.loads(el.string)
+#             except TypeError:
+#                 continue
 
-        if '@type' in el and el['@type'] == 'Recipe':
-            recipe_tree += [{'name': 'ld+json', 'children': temp_tree}]
-        else:
-            recipe_tree += [{'name': 'json', 'children': temp_tree}]
+#         for k, v in el.items():
+#             if isinstance(v, dict):
+#                 node = {
+#                     'name': k,
+#                     'value': k,
+#                     'children': get_children_dict(v)
+#                 }
+#             elif isinstance(v, list):
+#                 node = {
+#                     'name': k,
+#                     'value': k,
+#                     'children': get_children_list(v)
+#                 }
+#             else:
+#                 node = {
+#                     'name': k + ": " + normalize_string(str(v)),
+#                     'value': normalize_string(str(v))
+#                 }
+#             temp_tree.append(node)
 
-    return recipe_json, recipe_tree, html_data, images
+#         if '@type' in el and el['@type'] == 'Recipe':
+#             recipe_tree += [{'name': 'ld+json', 'children': temp_tree}]
+#         else:
+#             recipe_tree += [{'name': 'json', 'children': temp_tree}]
+
+#     return recipe_json, recipe_tree, html_data, images
 
-def get_from_html(soup):
-    INVISIBLE_ELEMS = ('style', 'script', 'head', 'title')
-    html = []
-    for s in soup.strings:
-        if ((s.parent.name not in INVISIBLE_ELEMS) and (len(s.strip()) > 0)):
-            html.append(s)
-    return html
+# def get_from_html(soup):
+#     INVISIBLE_ELEMS = ('style', 'script', 'head', 'title')
+#     html = []
+#     for s in soup.strings:
+#         if ((s.parent.name not in INVISIBLE_ELEMS) and (len(s.strip()) > 0)):
+#             html.append(s)
+#     return html
 
-def get_images_from_source(soup, url):
-    sources = ['src', 'srcset', 'data-src']
-    images = []
-    img_tags = soup.find_all('img')
-    if url:
-        site = get_host_name(url)
-        prot = url.split(':')[0]
+# def get_images_from_source(soup, url):
+#     sources = ['src', 'srcset', 'data-src']
+#     images = []
+#     img_tags = soup.find_all('img')
+#     if url:
+#         site = get_host_name(url)
+#         prot = url.split(':')[0]
 
-    urls = []
-    for img in img_tags:
-        for src in sources:
-            try:
-                urls.append(img[src])
-            except KeyError:
-                pass
+#     urls = []
+#     for img in img_tags:
+#         for src in sources:
+#             try:
+#                 urls.append(img[src])
+#             except KeyError:
+#                 pass
 
-    for u in urls:
-        u = u.split('?')[0]
-        filename = re.search(r'/([\w_-]+[.](jpg|jpeg|gif|png))$', u)
-        if filename:
-            if (('http' not in u) and (url)):
-                # sometimes an image source can be relative
-                # if it is provide the base url
-                u = '{}://{}{}'.format(prot, site, u)
-            if 'http' in u:
-                images.append(u)
-    return images
+#     for u in urls:
+#         u = u.split('?')[0]
+#         filename = re.search(r'/([\w_-]+[.](jpg|jpeg|gif|png))$', u)
+#         if filename:
+#             if (('http' not in u) and (url)):
+#                 # sometimes an image source can be relative
+#                 # if it is provide the base url
+#                 u = '{}://{}{}'.format(prot, site, u)
+#             if 'http' in u:
+#                 images.append(u)
+#     return images
 
-def remove_graph(el):
-    # recipes type might be wrapped in @graph type
-    if isinstance(el, Tag):
-        try:
-            el = json.loads(el.string)
-            if '@graph' in el:
-                for x in el['@graph']:
-                    if '@type' in x and x['@type'] == 'Recipe':
-                        el = x
-        except (TypeError, JSONDecodeError):
-            pass
-    return el
+# def remove_graph(el):
+#     # recipes type might be wrapped in @graph type
+#     if isinstance(el, Tag):
+#         try:
+#             el = json.loads(el.string)
+#             if '@graph' in el:
+#                 for x in el['@graph']:
+#                     if '@type' in x and x['@type'] == 'Recipe':
+#                         el = x
+#         except (TypeError, JSONDecodeError):
+#             pass
+#     return el
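For context, a worked example of the @graph unwrapping in remove_graph above (hypothetical markup, not from the PR; the function only unwraps when handed a BeautifulSoup Tag whose string is valid JSON):

    from bs4 import BeautifulSoup

    html = """<script type='application/ld+json'>
    {"@context": "https://schema.org",
     "@graph": [{"@type": "WebSite", "name": "Example"},
                {"@type": "Recipe", "name": "Apple Pie"}]}
    </script>"""
    tag = BeautifulSoup(html, "html.parser").find('script')
    # remove_graph(tag) json.loads the tag contents and, because a '@graph'
    # wrapper is present, returns the inner {"@type": "Recipe", ...} dict,
    # so downstream parsing sees a plain Recipe node.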

View File

@@ -1,21 +1,19 @@
 import random
 import re
 from html import unescape
-from pytube import YouTube
 from unicodedata import decomposition
 
 from django.utils.dateparse import parse_duration
 from django.utils.translation import gettext as _
 from isodate import parse_duration as iso_parse_duration
 from isodate.isoerror import ISO8601Error
-from recipe_scrapers._utils import get_minutes
+from pytube import YouTube
+from recipe_scrapers._utils import get_host_name, get_minutes
 
 from cookbook.helper import recipe_url_import as helper
 from cookbook.helper.ingredient_parser import IngredientParser
 from cookbook.models import Keyword
+# from recipe_scrapers._utils import get_minutes ## temporary until/unless upstream incorporates get_minutes() PR
@@ -369,3 +367,32 @@ def iso_duration_to_minutes(string):
         string
     ).groupdict()
     return int(match['days'] or 0) * 24 * 60 + int(match['hours'] or 0) * 60 + int(match['minutes'] or 0)
+
+
+def get_images_from_soup(soup, url):
+    sources = ['src', 'srcset', 'data-src']
+    images = []
+    img_tags = soup.find_all('img')
+    if url:
+        site = get_host_name(url)
+        prot = url.split(':')[0]
+
+    urls = []
+    for img in img_tags:
+        for src in sources:
+            try:
+                urls.append(img[src])
+            except KeyError:
+                pass
+
+    for u in urls:
+        u = u.split('?')[0]
+        filename = re.search(r'/([\w_-]+[.](jpg|jpeg|gif|png))$', u)
+        if filename:
+            if (('http' not in u) and (url)):
+                # sometimes an image source can be relative
+                # if it is provide the base url
+                u = '{}://{}{}'.format(prot, site, u)
+            if 'http' in u:
+                images.append(u)
+    return images
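Two worked examples for the helpers in this hunk (hypothetical inputs): the minutes arithmetic in the context lines above, and the relative-source branch of the new get_images_from_soup.

    iso_duration_to_minutes('P1DT2H30M')  # 1*24*60 + 2*60 + 30 = 1590

    from bs4 import BeautifulSoup
    soup = BeautifulSoup('<img src="/media/pie.jpg"><img src="https://cdn.example.org/crust.png?w=200">', 'html.parser')
    get_images_from_soup(soup, 'https://example.org/recipes/pie')
    # -> ['https://example.org/media/pie.jpg', 'https://cdn.example.org/crust.png']
    # query strings are stripped first; '/media/pie.jpg' is relative, so the
    # page's protocol and host are prepended; sources whose path does not end
    # in jpg/jpeg/gif/png are dropped.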

View File

@@ -1,6 +1,7 @@
-from bs4 import BeautifulSoup
 from json import JSONDecodeError
-from recipe_scrapers import SCRAPERS
+
+from bs4 import BeautifulSoup
+from recipe_scrapers import SCRAPERS, get_host_name
 from recipe_scrapers._factory import SchemaScraperFactory
 from recipe_scrapers._schemaorg import SchemaOrg
@@ -15,22 +16,28 @@ SCRAPERS.update(CUSTOM_SCRAPERS)
 def text_scraper(text, url=None):
+    scraper_class = SchemaScraperFactory.SchemaScraper
     domain = None
     if url:
         domain = get_host_name(url)
     if domain in SCRAPERS:
         scraper_class = SCRAPERS[domain]
-    else:
-        scraper_class = SchemaScraperFactory.SchemaScraper
 
     class TextScraper(scraper_class):
         def __init__(
             self,
-            page_data,
-            url=None
+            html=None,
+            url=None,
         ):
             self.wild_mode = False
             self.meta_http_equiv = False
-            self.soup = BeautifulSoup(page_data, "html.parser")
+            self.soup = BeautifulSoup(html, "html.parser")
             self.url = url
             self.recipe = None
             try:
-                self.schema = SchemaOrg(page_data)
+                self.schema = SchemaOrg(html)
             except (JSONDecodeError, AttributeError):
                 pass
 
-    return TextScraper(text, url)
+    return TextScraper(url=url, html=text)
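Callers now pass the markup by keyword (html=) rather than positionally; a minimal usage sketch with a made-up payload:

    ld_json = '{"@type": "Recipe", "name": "Apple Pie"}'
    scrape = text_scraper(text="<script type='application/ld+json'>" + ld_json + "</script>",
                          url='https://example.org/apple-pie')
    # scrape.schema holds the SchemaOrg parse of the embedded ld+json; a
    # site-specific scraper class is used when get_host_name(url) matches an
    # entry in SCRAPERS, otherwise the generic SchemaScraperFactory.SchemaScraper.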

View File

@@ -10,8 +10,8 @@ import validators
 import yaml
 
 from cookbook.helper.ingredient_parser import IngredientParser
 from cookbook.helper.recipe_html_import import get_recipe_from_source
-from cookbook.helper.recipe_url_import import iso_duration_to_minutes
+from cookbook.helper.recipe_url_import import get_images_from_soup, iso_duration_to_minutes
 from cookbook.helper.scrapers.scrapers import text_scraper
 from cookbook.integration.integration import Integration
 from cookbook.models import Ingredient, Keyword, Recipe, Step
@@ -24,7 +24,10 @@ class CookBookApp(Integration):
 
     def get_recipe_from_file(self, file):
         recipe_html = file.getvalue().decode("utf-8")
-        recipe_json, recipe_tree, html_data, images = get_recipe_from_source(recipe_html, 'CookBookApp', self.request)
+        # recipe_json, recipe_tree, html_data, images = get_recipe_from_source(recipe_html, 'CookBookApp', self.request)
+        scrape = text_scraper(text=data)
+        recipe_json = helper.get_from_scraper(scrape, request)
+        images = list(dict.fromkeys(get_images_from_soup(scrape.soup, url)))
 
         recipe = Recipe.objects.create(
             name=recipe_json['name'].strip(),
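Note that the three added lines reference data, request and url, none of which are defined in get_recipe_from_file's scope; the surrounding code only has recipe_html and self.request, and a file import carries no source URL. A hedged sketch of the presumably intended wiring (assuming the same recipe_url_import-as-helper alias that api.py uses; this is not the committed code):

    scrape = text_scraper(text=recipe_html)
    recipe_json = helper.get_from_scraper(scrape, self.request)  # helper = cookbook.helper.recipe_url_import (assumption)
    images = list(dict.fromkeys(get_images_from_soup(scrape.soup, None)))  # no base url for a file import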

View File

@@ -3,10 +3,9 @@ from io import BytesIO
 from zipfile import ZipFile
 
 from bs4 import BeautifulSoup
 from django.utils.translation import gettext as _
 
 from cookbook.helper.ingredient_parser import IngredientParser
-from cookbook.helper.recipe_html_import import get_recipe_from_source
 from cookbook.helper.recipe_url_import import iso_duration_to_minutes, parse_servings
 from cookbook.integration.integration import Integration
 from cookbook.models import Ingredient, Keyword, Recipe, Step

View File

@@ -1,12 +1,11 @@
 import traceback
-from datetime import timedelta, datetime
+from datetime import datetime, timedelta
 from decimal import Decimal
 from gettext import gettext as _
 from html import escape
 from smtplib import SMTPException
 
-from PIL import Image
-from django.contrib.auth.models import User, Group
+from django.contrib.auth.models import Group, User
 from django.core.mail import send_mail
 from django.db.models import Avg, Q, QuerySet, Sum
 from django.http import BadHeaderError
@@ -14,6 +13,7 @@ from django.urls import reverse
 from django.utils import timezone
 from django_scopes import scopes_disabled
 from drf_writable_nested import UniqueFieldsMixin, WritableNestedModelSerializer
+from PIL import Image
 from rest_framework import serializers
 from rest_framework.exceptions import NotFound, ValidationError
@@ -22,14 +22,14 @@ from cookbook.helper.HelperFunctions import str2bool
 from cookbook.helper.permission_helper import above_space_limit
 from cookbook.helper.shopping_helper import RecipeShoppingEditor
 from cookbook.models import (Automation, BookmarkletImport, Comment, CookLog, CustomFilter,
-                             ExportLog, Food, FoodInheritField, ImportLog, Ingredient, Keyword,
-                             MealPlan, MealType, NutritionInformation, Recipe, RecipeBook,
+                             ExportLog, Food, FoodInheritField, ImportLog, Ingredient, InviteLink,
+                             Keyword, MealPlan, MealType, NutritionInformation, Recipe, RecipeBook,
                              RecipeBookEntry, RecipeImport, ShareLink, ShoppingList,
-                             ShoppingListEntry, ShoppingListRecipe, Step, Storage, Supermarket,
-                             SupermarketCategory, SupermarketCategoryRelation, Sync, SyncLog, Unit,
-                             UserFile, UserPreference, ViewLog, Space, UserSpace, InviteLink)
+                             ShoppingListEntry, ShoppingListRecipe, Space, Step, Storage,
+                             Supermarket, SupermarketCategory, SupermarketCategoryRelation, Sync,
+                             SyncLog, Unit, UserFile, UserPreference, UserSpace, ViewLog)
 from cookbook.templatetags.custom_tags import markdown
-from recipes.settings import MEDIA_URL, AWS_ENABLED
+from recipes.settings import AWS_ENABLED, MEDIA_URL
class ExtendedRecipeMixin(serializers.ModelSerializer):
@@ -193,7 +193,8 @@ class SpaceSerializer(WritableNestedModelSerializer):
     class Meta:
         model = Space
-        fields = ('id', 'name', 'created_by', 'created_at', 'message', 'max_recipes', 'max_file_storage_mb', 'max_users', 'allow_sharing', 'demo', 'food_inherit', 'show_facet_count', 'user_count', 'recipe_count', 'file_size_mb',)
+        fields = ('id', 'name', 'created_by', 'created_at', 'message', 'max_recipes', 'max_file_storage_mb', 'max_users',
+                  'allow_sharing', 'demo', 'food_inherit', 'show_facet_count', 'user_count', 'recipe_count', 'file_size_mb',)
         read_only_fields = ('id', 'created_by', 'created_at', 'max_recipes', 'max_file_storage_mb', 'max_users', 'allow_sharing', 'demo',)
@@ -815,7 +816,7 @@ class RecipeBookEntrySerializer(serializers.ModelSerializer):
         book = validated_data['book']
         recipe = validated_data['recipe']
         if not book.get_owner() == self.context['request'].user and not self.context[
-            'request'].user in book.get_shared():
+                'request'].user in book.get_shared():
             raise NotFound(detail=None, code=None)
         obj, created = RecipeBookEntry.objects.get_or_create(book=book, recipe=recipe)
         return obj
@@ -871,11 +872,11 @@ class ShoppingListRecipeSerializer(serializers.ModelSerializer):
         value = value.quantize(
             Decimal(1)) if value == value.to_integral() else value.normalize()  # strips trailing zero
         return (
-                obj.name
-                or getattr(obj.mealplan, 'title', None)
-                or (d := getattr(obj.mealplan, 'date', None)) and ': '.join([obj.mealplan.recipe.name, str(d)])
-                or obj.recipe.name
-        ) + f' ({value:.2g})'
+            obj.name
+            or getattr(obj.mealplan, 'title', None)
+            or (d := getattr(obj.mealplan, 'date', None)) and ': '.join([obj.mealplan.recipe.name, str(d)])
+            or obj.recipe.name
+        ) + f' ({value:.2g})'
 
     def update(self, instance, validated_data):
         # TODO remove once old shopping list
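The or-chain above (only re-indented in this hunk) picks the first truthy label and appends the normalized amount; worked fallbacks with hypothetical objects:

    # with value == Decimal('2'), the suffix is ' (2)':
    # obj.name == 'Weekly prep'                         -> 'Weekly prep (2)'
    # no name, obj.mealplan.title == 'Sunday lunch'     -> 'Sunday lunch (2)'
    # no title, obj.mealplan.date == date(2022, 7, 11)  -> 'Lasagna: 2022-07-11 (2)'
    # no mealplan                                       -> 'Lasagna (2)'  # obj.recipe.name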
@@ -1232,6 +1233,6 @@ class FoodShoppingUpdateSerializer(serializers.ModelSerializer):
 # non model serializers
 class RecipeFromSourceSerializer(serializers.Serializer):
-    url = serializers.CharField(max_length=4096, required=False, allow_null=True)
+    url = serializers.CharField(max_length=4096, required=False, allow_null=True, allow_blank=True)
     data = serializers.CharField(required=False, allow_null=True, allow_blank=True)
     bookmarklet = serializers.IntegerField(required=False, allow_null=True, )
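With allow_blank=True now on url (matching data), all three request shapes the endpoint below accepts validate cleanly; illustrative payloads:

    RecipeFromSourceSerializer(data={'url': 'https://example.org/apple-pie'})  # scrape the page
    RecipeFromSourceSerializer(data={'url': '', 'data': '<html>...</html>'})   # raw source; a blank url no longer fails validation
    RecipeFromSourceSerializer(data={'bookmarklet': 42})                       # stored bookmarklet import; overrides url/data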

View File

@@ -5,20 +5,20 @@ import re
 import traceback
 import uuid
 from collections import OrderedDict
+from json import JSONDecodeError
+from urllib.parse import unquote
 from zipfile import ZipFile
 
 import requests
 import validators
-from PIL import UnidentifiedImageError
 from annoying.decorators import ajax_request
 from annoying.functions import get_object_or_None
 from django.contrib import messages
-from django.contrib.auth.models import User, Group
+from django.contrib.auth.models import Group, User
 from django.contrib.postgres.search import TrigramSimilarity
 from django.core.exceptions import FieldError, ValidationError
 from django.core.files import File
-from django.db.models import (Case, Count, Exists, OuterRef, ProtectedError, Q,
-                              Subquery, Value, When)
+from django.db.models import Case, Count, Exists, OuterRef, ProtectedError, Q, Subquery, Value, When
 from django.db.models.fields.related import ForeignObjectRel
 from django.db.models.functions import Coalesce, Lower
 from django.http import FileResponse, HttpResponse, JsonResponse
@@ -27,6 +27,9 @@ from django.urls import reverse
 from django.utils.translation import gettext as _
 from django_scopes import scopes_disabled
 from icalendar import Calendar, Event
+from PIL import UnidentifiedImageError
+from recipe_scrapers import scrape_html, scrape_me
+from recipe_scrapers._exceptions import NoSchemaFoundInWildMode
 from requests.exceptions import MissingSchema
 from rest_framework import decorators, status, viewsets
 from rest_framework.authtoken.models import Token
@@ -41,43 +44,47 @@ from rest_framework.throttling import AnonRateThrottle
 from rest_framework.viewsets import ViewSetMixin
 from treebeard.exceptions import InvalidMoveToDescendant, InvalidPosition, PathOverflow
 
+from cookbook.helper import recipe_url_import as helper
 from cookbook.helper.HelperFunctions import str2bool
 from cookbook.helper.image_processing import handle_image
 from cookbook.helper.ingredient_parser import IngredientParser
 from cookbook.helper.permission_helper import (CustomIsAdmin, CustomIsGuest, CustomIsOwner,
-                                               CustomIsShare, CustomIsShared, CustomIsUser,
-                                               group_required, CustomIsSpaceOwner, switch_user_active_space, is_space_owner, CustomIsOwnerReadOnly)
-from cookbook.helper.recipe_html_import import get_recipe_from_source
+                                               CustomIsOwnerReadOnly, CustomIsShare, CustomIsShared,
+                                               CustomIsSpaceOwner, CustomIsUser, group_required,
+                                               is_space_owner, switch_user_active_space)
 from cookbook.helper.recipe_search import RecipeFacet, RecipeSearch, old_search
-from cookbook.helper.recipe_url_import import get_from_youtube_scraper
+from cookbook.helper.recipe_url_import import get_from_youtube_scraper, get_images_from_soup
+from cookbook.helper.scrapers.scrapers import text_scraper
 from cookbook.helper.shopping_helper import RecipeShoppingEditor, shopping_helper
 from cookbook.models import (Automation, BookmarkletImport, CookLog, CustomFilter, ExportLog, Food,
-                             FoodInheritField, ImportLog, Ingredient, Keyword, MealPlan, MealType,
-                             Recipe, RecipeBook, RecipeBookEntry, ShareLink, ShoppingList,
-                             ShoppingListEntry, ShoppingListRecipe, Step, Storage, Supermarket,
-                             SupermarketCategory, SupermarketCategoryRelation, Sync, SyncLog, Unit,
-                             UserFile, UserPreference, ViewLog, Space, UserSpace, InviteLink)
+                             FoodInheritField, ImportLog, Ingredient, InviteLink, Keyword, MealPlan,
+                             MealType, Recipe, RecipeBook, RecipeBookEntry, ShareLink, ShoppingList,
+                             ShoppingListEntry, ShoppingListRecipe, Space, Step, Storage,
+                             Supermarket, SupermarketCategory, SupermarketCategoryRelation, Sync,
+                             SyncLog, Unit, UserFile, UserPreference, UserSpace, ViewLog)
 from cookbook.provider.dropbox import Dropbox
 from cookbook.provider.local import Local
 from cookbook.provider.nextcloud import Nextcloud
 from cookbook.schemas import FilterSchema, QueryParam, QueryParamAutoSchema, TreeSchema
-from cookbook.serializer import (AutomationSerializer, BookmarkletImportSerializer,
-                                 CookLogSerializer, CustomFilterSerializer, ExportLogSerializer,
+from cookbook.serializer import (AutomationSerializer, BookmarkletImportListSerializer,
+                                 BookmarkletImportSerializer, CookLogSerializer,
+                                 CustomFilterSerializer, ExportLogSerializer,
                                  FoodInheritFieldSerializer, FoodSerializer,
-                                 FoodShoppingUpdateSerializer, ImportLogSerializer,
-                                 IngredientSerializer, KeywordSerializer, MealPlanSerializer,
+                                 FoodShoppingUpdateSerializer, GroupSerializer, ImportLogSerializer,
+                                 IngredientSerializer, IngredientSimpleSerializer,
+                                 InviteLinkSerializer, KeywordSerializer, MealPlanSerializer,
                                  MealTypeSerializer, RecipeBookEntrySerializer,
-                                 RecipeBookSerializer, RecipeImageSerializer,
-                                 RecipeOverviewSerializer, RecipeSerializer,
+                                 RecipeBookSerializer, RecipeFromSourceSerializer,
+                                 RecipeImageSerializer, RecipeOverviewSerializer, RecipeSerializer,
                                  RecipeShoppingUpdateSerializer, RecipeSimpleSerializer,
                                  ShoppingListAutoSyncSerializer, ShoppingListEntrySerializer,
                                  ShoppingListRecipeSerializer, ShoppingListSerializer,
-                                 StepSerializer, StorageSerializer,
+                                 SpaceSerializer, StepSerializer, StorageSerializer,
                                  SupermarketCategoryRelationSerializer,
                                  SupermarketCategorySerializer, SupermarketSerializer,
                                  SyncLogSerializer, SyncSerializer, UnitSerializer,
                                  UserFileSerializer, UserNameSerializer, UserPreferenceSerializer,
-                                 ViewLogSerializer, IngredientSimpleSerializer, BookmarkletImportListSerializer, RecipeFromSourceSerializer, SpaceSerializer, UserSpaceSerializer, GroupSerializer, InviteLinkSerializer)
+                                 UserSpaceSerializer, ViewLogSerializer)
 from recipes import settings
@@ -713,7 +720,7 @@ class RecipeViewSet(viewsets.ModelViewSet):
                           'Query string matched (fuzzy) against recipe name. In the future also fulltext search.')),
                QueryParam(name='keywords', description=_(
                    'ID of keyword a recipe should have. For multiple repeat parameter. Equivalent to keywords_or'),
-                   qtype='int'),
+                          qtype='int'),
                QueryParam(name='keywords_or',
                           description=_('Keyword IDs, repeat for multiple. Return recipes with any of the keywords'),
                           qtype='int'),
@@ -1114,69 +1121,79 @@ def recipe_from_source(request):
     - url: url to use for importing recipe
     - data: if no url is given recipe is imported from provided source data
     - (optional) bookmarklet: id of bookmarklet import to use, overrides URL and data attributes
-    :return: JsonResponse containing the parsed json, original html,json and images
+    :return: JsonResponse containing the parsed json and images
     """
+    scrape = None
     serializer = RecipeFromSourceSerializer(data=request.data)
     if serializer.is_valid():
-        try:
-            if bookmarklet := BookmarkletImport.objects.filter(pk=serializer.validated_data['bookmarklet']).first():
-                serializer.validated_data['url'] = bookmarklet.url
-                serializer.validated_data['data'] = bookmarklet.html
-                bookmarklet.delete()
-        except KeyError:
-            pass
-
-        # headers to use for request to external sites
-        external_request_headers = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7"}
-
-        if not 'url' in serializer.validated_data and not 'data' in serializer.validated_data:
+        if (b_pk := serializer.validated_data.get('bookmarklet', None)) and (bookmarklet := BookmarkletImport.objects.filter(pk=b_pk).first()):
+            serializer.validated_data['url'] = bookmarklet.url
+            serializer.validated_data['data'] = bookmarklet.html
+            bookmarklet.delete()
+
+        url = serializer.validated_data.get('url', None)
+        data = unquote(serializer.validated_data.get('data', None))
+        if not url and not data:
             return Response({
                 'error': True,
                 'msg': _('Nothing to do.')
             }, status=status.HTTP_400_BAD_REQUEST)
-
-        # in manual mode request complete page to return it later
-        if 'url' in serializer.validated_data:
-            if re.match('^(https?://)?(www\.youtube\.com|youtu\.be)/.+$', serializer.validated_data['url']):
-                if validators.url(serializer.validated_data['url'], public=True):
+        elif url and not data:
+            if re.match('^(https?://)?(www\.youtube\.com|youtu\.be)/.+$', url):
+                if validators.url(url, public=True):
                     return Response({
-                        'recipe_json': get_from_youtube_scraper(serializer.validated_data['url'], request),
-                        'recipe_tree': '',
-                        'recipe_html': '',
+                        'recipe_json': get_from_youtube_scraper(url, request),
+                        # 'recipe_tree': '',
+                        # 'recipe_html': '',
                         'recipe_images': [],
                     }, status=status.HTTP_200_OK)
-            try:
-                if validators.url(serializer.validated_data['url'], public=True):
-                    serializer.validated_data['data'] = requests.get(serializer.validated_data['url'], headers=external_request_headers).content
-                else:
-                    return Response({
-                        'error': True,
-                        'msg': _('Invalid Url')
-                    }, status=status.HTTP_400_BAD_REQUEST)
-            except requests.exceptions.ConnectionError:
-                return Response({
-                    'error': True,
-                    'msg': _('Connection Refused.')
-                }, status=status.HTTP_400_BAD_REQUEST)
-            except requests.exceptions.MissingSchema:
-                return Response({
-                    'error': True,
-                    'msg': _('Bad URL Schema.')
-                }, status=status.HTTP_400_BAD_REQUEST)
+            else:
+                try:
+                    if validators.url(url, public=True):
+                        scrape = scrape_me(url_path=url, wild_mode=True)
+                    else:
+                        return Response({
+                            'error': True,
+                            'msg': _('Invalid Url')
+                        }, status=status.HTTP_400_BAD_REQUEST)
+                except NoSchemaFoundInWildMode:
+                    pass
+                except requests.exceptions.ConnectionError:
+                    return Response({
+                        'error': True,
+                        'msg': _('Connection Refused.')
+                    }, status=status.HTTP_400_BAD_REQUEST)
+                except requests.exceptions.MissingSchema:
+                    return Response({
+                        'error': True,
+                        'msg': _('Bad URL Schema.')
+                    }, status=status.HTTP_400_BAD_REQUEST)
+        else:
+            try:
+                json.loads(data)
+                data = "<script type='application/ld+json'>" + data + "</script>"
+            except JSONDecodeError:
+                pass
+            scrape = text_scraper(text=data, url=url)
+            if not url and (found_url := scrape.schema.data.get('url', None)):
+                scrape = text_scraper(text=data, url=found_url)
 
-        recipe_json, recipe_tree, recipe_html, recipe_images = get_recipe_from_source(serializer.validated_data['data'], serializer.validated_data['url'], request)
-        if len(recipe_tree) == 0 and len(recipe_json) == 0:
+        if scrape:
+            return Response({
+                'recipe_json': helper.get_from_scraper(scrape, request),
+                # 'recipe_tree': recipe_tree,
+                # 'recipe_html': recipe_html,
+                'recipe_images': list(dict.fromkeys(get_images_from_soup(scrape.soup, url))),
+            }, status=status.HTTP_200_OK)
+        else:
             return Response({
                 'error': True,
                 'msg': _('No usable data could be found.')
             }, status=status.HTTP_400_BAD_REQUEST)
-        else:
-            return Response({
-                'recipe_json': recipe_json,
-                'recipe_tree': recipe_tree,
-                'recipe_html': recipe_html,
-                'recipe_images': list(dict.fromkeys(recipe_images)),
-            }, status=status.HTTP_200_OK)
     else:
         return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)
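Taken together, the rewritten view funnels every input into a single scrape object and one response path; a condensed sketch of the new control flow (same names as above):

    # neither url nor data    -> 400 _('Nothing to do.')
    # url only                -> YouTube fast path, else scrape_me(url_path=url, wild_mode=True)
    # data (with/without url) -> text_scraper(text=data, url=url or url recovered from scrape.schema)
    # scrape truthy           -> 200 {'recipe_json': helper.get_from_scraper(scrape, request),
    #                                 'recipe_images': list(dict.fromkeys(get_images_from_soup(scrape.soup, url)))}
    # otherwise               -> 400 _('No usable data could be found.')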