From e23d514d894f299d003767bbdb829d2c6c495e0d Mon Sep 17 00:00:00 2001
From: smilerz
Date: Wed, 6 Jul 2022 16:16:53 -0500
Subject: [PATCH 1/4] fix bookmarklet

---
 cookbook/helper/recipe_html_import.py | 13 +++---
 cookbook/helper/scrapers/scrapers.py  | 13 ++++--
 cookbook/serializer.py                | 35 ++++++++--------
 cookbook/views/api.py                 | 59 +++++++++++++--------------
 4 files changed, 65 insertions(+), 55 deletions(-)

diff --git a/cookbook/helper/recipe_html_import.py b/cookbook/helper/recipe_html_import.py
index 1b5d37ad2..48dc4c119 100644
--- a/cookbook/helper/recipe_html_import.py
+++ b/cookbook/helper/recipe_html_import.py
@@ -6,7 +6,7 @@ from urllib.parse import unquote
 from bs4 import BeautifulSoup
 from bs4.element import Tag
 from recipe_scrapers import scrape_html, scrape_me
-from recipe_scrapers._exceptions import NoSchemaFoundInWildMode
+from recipe_scrapers._exceptions import NoSchemaFoundInWildMode, WebsiteNotImplementedError
 from recipe_scrapers._utils import get_host_name, normalize_string
 
 from cookbook.helper import recipe_url_import as helper
@@ -68,11 +68,14 @@ def get_recipe_from_source(text, url, request):
     text = unquote(text)
     scrape = None
 
-    if url:
+    if url and not text:
         try:
-            scrape = scrape_me(url_path=url, wild_mode=True)
-        except(NoSchemaFoundInWildMode):
-            pass
+            scrape = scrape_me(url_path=url)
+        except WebsiteNotImplementedError:
+            try:
+                scrape = scrape_me(url_path=url, wild_mode=True)
+            except(NoSchemaFoundInWildMode):
+                pass
     if not scrape:
         try:
             parse_list.append(remove_graph(json.loads(text)))
diff --git a/cookbook/helper/scrapers/scrapers.py b/cookbook/helper/scrapers/scrapers.py
index eb93cc2c2..94e3daea0 100644
--- a/cookbook/helper/scrapers/scrapers.py
+++ b/cookbook/helper/scrapers/scrapers.py
@@ -1,6 +1,7 @@
-from bs4 import BeautifulSoup
 from json import JSONDecodeError
-from recipe_scrapers import SCRAPERS
+
+from bs4 import BeautifulSoup
+from recipe_scrapers import SCRAPERS, get_host_name
 from recipe_scrapers._factory import SchemaScraperFactory
 from recipe_scrapers._schemaorg import SchemaOrg
 
@@ -15,7 +16,13 @@ SCRAPERS.update(CUSTOM_SCRAPERS)
 
 
 def text_scraper(text, url=None):
-    scraper_class = SchemaScraperFactory.SchemaScraper
+    domain = None
+    if url:
+        domain = get_host_name(url)
+    if domain in SCRAPERS:
+        scraper_class = SCRAPERS[domain]
+    else:
+        scraper_class = SchemaScraperFactory.SchemaScraper
 
     class TextScraper(scraper_class):
         def __init__(
diff --git a/cookbook/serializer.py b/cookbook/serializer.py
index 1e386c5dd..a4b8b2fe9 100644
--- a/cookbook/serializer.py
+++ b/cookbook/serializer.py
@@ -1,12 +1,11 @@
 import traceback
-from datetime import timedelta, datetime
+from datetime import datetime, timedelta
 from decimal import Decimal
 from gettext import gettext as _
 from html import escape
 from smtplib import SMTPException
 
-from PIL import Image
-from django.contrib.auth.models import User, Group
+from django.contrib.auth.models import Group, User
 from django.core.mail import send_mail
 from django.db.models import Avg, Q, QuerySet, Sum
 from django.http import BadHeaderError
@@ -14,6 +13,7 @@ from django.urls import reverse
 from django.utils import timezone
 from django_scopes import scopes_disabled
 from drf_writable_nested import UniqueFieldsMixin, WritableNestedModelSerializer
+from PIL import Image
 from rest_framework import serializers
 from rest_framework.exceptions import NotFound, ValidationError
 
@@ -22,14 +22,14 @@ from cookbook.helper.HelperFunctions import str2bool
 from cookbook.helper.permission_helper import above_space_limit
 from cookbook.helper.shopping_helper import RecipeShoppingEditor
 from cookbook.models import (Automation, BookmarkletImport, Comment, CookLog, CustomFilter,
-                             ExportLog, Food, FoodInheritField, ImportLog, Ingredient, Keyword,
-                             MealPlan, MealType, NutritionInformation, Recipe, RecipeBook,
+                             ExportLog, Food, FoodInheritField, ImportLog, Ingredient, InviteLink,
+                             Keyword, MealPlan, MealType, NutritionInformation, Recipe, RecipeBook,
                              RecipeBookEntry, RecipeImport, ShareLink, ShoppingList,
-                             ShoppingListEntry, ShoppingListRecipe, Step, Storage, Supermarket,
-                             SupermarketCategory, SupermarketCategoryRelation, Sync, SyncLog, Unit,
-                             UserFile, UserPreference, ViewLog, Space, UserSpace, InviteLink)
+                             ShoppingListEntry, ShoppingListRecipe, Space, Step, Storage,
+                             Supermarket, SupermarketCategory, SupermarketCategoryRelation, Sync,
+                             SyncLog, Unit, UserFile, UserPreference, UserSpace, ViewLog)
 from cookbook.templatetags.custom_tags import markdown
-from recipes.settings import MEDIA_URL, AWS_ENABLED
+from recipes.settings import AWS_ENABLED, MEDIA_URL
 
 
 class ExtendedRecipeMixin(serializers.ModelSerializer):
@@ -193,7 +193,8 @@ class SpaceSerializer(WritableNestedModelSerializer):
 
     class Meta:
         model = Space
-        fields = ('id', 'name', 'created_by', 'created_at', 'message', 'max_recipes', 'max_file_storage_mb', 'max_users', 'allow_sharing', 'demo', 'food_inherit', 'show_facet_count', 'user_count', 'recipe_count', 'file_size_mb',)
+        fields = ('id', 'name', 'created_by', 'created_at', 'message', 'max_recipes', 'max_file_storage_mb', 'max_users',
+                  'allow_sharing', 'demo', 'food_inherit', 'show_facet_count', 'user_count', 'recipe_count', 'file_size_mb',)
         read_only_fields = ('id', 'created_by', 'created_at', 'max_recipes', 'max_file_storage_mb', 'max_users', 'allow_sharing', 'demo',)
@@ -815,7 +816,7 @@ class RecipeBookEntrySerializer(serializers.ModelSerializer):
         book = validated_data['book']
         recipe = validated_data['recipe']
         if not book.get_owner() == self.context['request'].user and not self.context[
-            'request'].user in book.get_shared():
+                'request'].user in book.get_shared():
             raise NotFound(detail=None, code=None)
         obj, created = RecipeBookEntry.objects.get_or_create(book=book, recipe=recipe)
         return obj
@@ -871,11 +872,11 @@ class ShoppingListRecipeSerializer(serializers.ModelSerializer):
             value = value.quantize(
                 Decimal(1)) if value == value.to_integral() else value.normalize()  # strips trailing zero
         return (
-            obj.name
-            or getattr(obj.mealplan, 'title', None)
-            or (d := getattr(obj.mealplan, 'date', None)) and ': '.join([obj.mealplan.recipe.name, str(d)])
-            or obj.recipe.name
-        ) + f' ({value:.2g})'
+                   obj.name
+                   or getattr(obj.mealplan, 'title', None)
+                   or (d := getattr(obj.mealplan, 'date', None)) and ': '.join([obj.mealplan.recipe.name, str(d)])
+                   or obj.recipe.name
+               ) + f' ({value:.2g})'
 
     def update(self, instance, validated_data):
         # TODO remove once old shopping list
@@ -1232,6 +1233,6 @@ class FoodShoppingUpdateSerializer(serializers.ModelSerializer):
 
 # non model serializers
 class RecipeFromSourceSerializer(serializers.Serializer):
-    url = serializers.CharField(max_length=4096, required=False, allow_null=True)
+    url = serializers.CharField(max_length=4096, required=False, allow_null=True, allow_blank=True)
     data = serializers.CharField(required=False, allow_null=True, allow_blank=True)
     bookmarklet = serializers.IntegerField(required=False, allow_null=True, )
diff --git a/cookbook/views/api.py b/cookbook/views/api.py
index 7a49261be..9b077b5c1 100644
--- a/cookbook/views/api.py
+++ b/cookbook/views/api.py
@@ -9,16 +9,14 @@ from zipfile import ZipFile
 
 import requests
 import validators
-from PIL import UnidentifiedImageError
 from annoying.decorators import ajax_request
 from annoying.functions import get_object_or_None
 from django.contrib import messages
-from django.contrib.auth.models import User, Group
+from django.contrib.auth.models import Group, User
 from django.contrib.postgres.search import TrigramSimilarity
 from django.core.exceptions import FieldError, ValidationError
 from django.core.files import File
-from django.db.models import (Case, Count, Exists, OuterRef, ProtectedError, Q,
-                              Subquery, Value, When)
+from django.db.models import Case, Count, Exists, OuterRef, ProtectedError, Q, Subquery, Value, When
 from django.db.models.fields.related import ForeignObjectRel
 from django.db.models.functions import Coalesce, Lower
 from django.http import FileResponse, HttpResponse, JsonResponse
@@ -27,6 +25,7 @@ from django.urls import reverse
 from django.utils.translation import gettext as _
 from django_scopes import scopes_disabled
 from icalendar import Calendar, Event
+from PIL import UnidentifiedImageError
 from requests.exceptions import MissingSchema
 from rest_framework import decorators, status, viewsets
 from rest_framework.authtoken.models import Token
@@ -45,39 +44,42 @@ from cookbook.helper.HelperFunctions import str2bool
 from cookbook.helper.image_processing import handle_image
 from cookbook.helper.ingredient_parser import IngredientParser
 from cookbook.helper.permission_helper import (CustomIsAdmin, CustomIsGuest, CustomIsOwner,
-                                               CustomIsShare, CustomIsShared, CustomIsUser,
-                                               group_required, CustomIsSpaceOwner, switch_user_active_space, is_space_owner, CustomIsOwnerReadOnly)
+                                               CustomIsOwnerReadOnly, CustomIsShare, CustomIsShared,
+                                               CustomIsSpaceOwner, CustomIsUser, group_required,
+                                               is_space_owner, switch_user_active_space)
 from cookbook.helper.recipe_html_import import get_recipe_from_source
 from cookbook.helper.recipe_search import RecipeFacet, RecipeSearch, old_search
 from cookbook.helper.recipe_url_import import get_from_youtube_scraper
 from cookbook.helper.shopping_helper import RecipeShoppingEditor, shopping_helper
 from cookbook.models import (Automation, BookmarkletImport, CookLog, CustomFilter, ExportLog, Food,
-                             FoodInheritField, ImportLog, Ingredient, Keyword, MealPlan, MealType,
-                             Recipe, RecipeBook, RecipeBookEntry, ShareLink, ShoppingList,
-                             ShoppingListEntry, ShoppingListRecipe, Step, Storage, Supermarket,
-                             SupermarketCategory, SupermarketCategoryRelation, Sync, SyncLog, Unit,
-                             UserFile, UserPreference, ViewLog, Space, UserSpace, InviteLink)
+                             FoodInheritField, ImportLog, Ingredient, InviteLink, Keyword, MealPlan,
+                             MealType, Recipe, RecipeBook, RecipeBookEntry, ShareLink, ShoppingList,
+                             ShoppingListEntry, ShoppingListRecipe, Space, Step, Storage,
+                             Supermarket, SupermarketCategory, SupermarketCategoryRelation, Sync,
+                             SyncLog, Unit, UserFile, UserPreference, UserSpace, ViewLog)
 from cookbook.provider.dropbox import Dropbox
 from cookbook.provider.local import Local
 from cookbook.provider.nextcloud import Nextcloud
 from cookbook.schemas import FilterSchema, QueryParam, QueryParamAutoSchema, TreeSchema
-from cookbook.serializer import (AutomationSerializer, BookmarkletImportSerializer,
-                                 CookLogSerializer, CustomFilterSerializer, ExportLogSerializer,
+from cookbook.serializer import (AutomationSerializer, BookmarkletImportListSerializer,
+                                 BookmarkletImportSerializer, CookLogSerializer,
+                                 CustomFilterSerializer, ExportLogSerializer,
                                  FoodInheritFieldSerializer, FoodSerializer,
-                                 FoodShoppingUpdateSerializer, ImportLogSerializer,
-                                 IngredientSerializer, KeywordSerializer, MealPlanSerializer,
+                                 FoodShoppingUpdateSerializer, GroupSerializer, ImportLogSerializer,
+                                 IngredientSerializer, IngredientSimpleSerializer,
+                                 InviteLinkSerializer, KeywordSerializer, MealPlanSerializer,
                                  MealTypeSerializer, RecipeBookEntrySerializer,
-                                 RecipeBookSerializer, RecipeImageSerializer,
-                                 RecipeOverviewSerializer, RecipeSerializer,
+                                 RecipeBookSerializer, RecipeFromSourceSerializer,
+                                 RecipeImageSerializer, RecipeOverviewSerializer, RecipeSerializer,
                                  RecipeShoppingUpdateSerializer, RecipeSimpleSerializer,
                                  ShoppingListAutoSyncSerializer, ShoppingListEntrySerializer,
                                  ShoppingListRecipeSerializer, ShoppingListSerializer,
-                                 StepSerializer, StorageSerializer,
+                                 SpaceSerializer, StepSerializer, StorageSerializer,
                                  SupermarketCategoryRelationSerializer, SupermarketCategorySerializer,
                                  SupermarketSerializer, SyncLogSerializer, SyncSerializer,
                                  UnitSerializer, UserFileSerializer, UserNameSerializer, UserPreferenceSerializer,
-                                 ViewLogSerializer, IngredientSimpleSerializer, BookmarkletImportListSerializer, RecipeFromSourceSerializer, SpaceSerializer, UserSpaceSerializer, GroupSerializer, InviteLinkSerializer)
+                                 UserSpaceSerializer, ViewLogSerializer)
 from recipes import settings
@@ -713,7 +715,7 @@ class RecipeViewSet(viewsets.ModelViewSet):
             'Query string matched (fuzzy) against recipe name. In the future also fulltext search.')),
         QueryParam(name='keywords', description=_(
             'ID of keyword a recipe should have. For multiple repeat parameter. Equivalent to keywords_or'),
-            qtype='int'),
+                   qtype='int'),
         QueryParam(name='keywords_or',
                    description=_('Keyword IDs, repeat for multiple. Return recipes with any of the keywords'),
                    qtype='int'),
@@ -1118,25 +1120,22 @@ def recipe_from_source(request):
     """
     serializer = RecipeFromSourceSerializer(data=request.data)
     if serializer.is_valid():
-        try:
-            if bookmarklet := BookmarkletImport.objects.filter(pk=serializer.validated_data['bookmarklet']).first():
-                serializer.validated_data['url'] = bookmarklet.url
-                serializer.validated_data['data'] = bookmarklet.html
-                bookmarklet.delete()
-        except KeyError:
-            pass
-
         # headers to use for request to external sites
         external_request_headers = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7"}
 
-        if not 'url' in serializer.validated_data and not 'data' in serializer.validated_data:
+        if (b_pk := serializer.validated_data.get('bookmarklet', None)) and (bookmarklet := BookmarkletImport.objects.filter(pk=b_pk).first()):
+            serializer.validated_data['url'] = bookmarklet.url
+            serializer.validated_data['data'] = bookmarklet.html
+            bookmarklet.delete()
+
+        elif not 'url' in serializer.validated_data and not 'data' in serializer.validated_data:
             return Response({
                 'error': True,
                 'msg': _('Nothing to do.')
             }, status=status.HTTP_400_BAD_REQUEST)
 
         # in manual mode request complete page to return it later
-        if 'url' in serializer.validated_data:
+        elif 'url' in serializer.validated_data and serializer.validated_data['url'] != '':
             if re.match('^(https?://)?(www\.youtube\.com|youtu\.be)/.+$', serializer.validated_data['url']):
                 if validators.url(serializer.validated_data['url'], public=True):
                     return Response({
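
PATCH 1/4 rewires the bookmarklet path: the view now resolves the posted `bookmarklet` id first, copies the stored `url` and `html` from the `BookmarkletImport` row into the request payload, and deletes the row so each capture is consumed exactly once. The `allow_blank=True` added to the serializer's `url` field belongs to the same fix; the `!= ''` guard in the view suggests the bookmarklet can post an empty-string URL that the stricter field definition used to reject. A minimal client-side sketch of the flow, assuming the view is routed at `/api/recipe-from-source/` (the route path is an assumption here, not shown in the patch) and that authentication is already configured on the session:

    import requests

    BASE = "https://recipes.example.com"  # hypothetical instance URL

    def import_from_bookmarklet(session: requests.Session, bookmarklet_id: int) -> dict:
        # 'bookmarklet' wins over 'url'/'data': the view overwrites both
        # from the stored BookmarkletImport row, then deletes the row.
        resp = session.post(f"{BASE}/api/recipe-from-source/",
                            json={"bookmarklet": bookmarklet_id})
        resp.raise_for_status()
        return resp.json()["recipe_json"]
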
From 25a41bd293873febcb57febdd0fb378852cc7086 Mon Sep 17 00:00:00 2001
From: smilerz
Date: Thu, 7 Jul 2022 06:43:07 -0500
Subject: [PATCH 2/4] reverting scraper to just using wildmode

---
 cookbook/helper/recipe_html_import.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/cookbook/helper/recipe_html_import.py b/cookbook/helper/recipe_html_import.py
index 48dc4c119..c97629ef9 100644
--- a/cookbook/helper/recipe_html_import.py
+++ b/cookbook/helper/recipe_html_import.py
@@ -6,7 +6,7 @@ from urllib.parse import unquote
 from bs4 import BeautifulSoup
 from bs4.element import Tag
 from recipe_scrapers import scrape_html, scrape_me
-from recipe_scrapers._exceptions import NoSchemaFoundInWildMode, WebsiteNotImplementedError
+from recipe_scrapers._exceptions import NoSchemaFoundInWildMode
 from recipe_scrapers._utils import get_host_name, normalize_string
 
 from cookbook.helper import recipe_url_import as helper
@@ -70,12 +70,9 @@ def get_recipe_from_source(text, url, request):
 
     if url and not text:
         try:
-            scrape = scrape_me(url_path=url)
-        except WebsiteNotImplementedError:
-            try:
-                scrape = scrape_me(url_path=url, wild_mode=True)
-            except(NoSchemaFoundInWildMode):
-                pass
+            scrape = scrape_me(url_path=url, wild_mode=True)
+        except(NoSchemaFoundInWildMode):
+            pass
     if not scrape:
         try:
             parse_list.append(remove_graph(json.loads(text)))
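
PATCH 2/4 drops the two-step `scrape_me` call introduced in the previous commit: wild mode already handles sites without a dedicated scraper class by parsing whatever schema.org/Recipe markup the page embeds, so the extra `WebsiteNotImplementedError` round-trip bought nothing. The resulting behaviour, as a standalone sketch using only the calls visible in the diff:

    from recipe_scrapers import scrape_me
    from recipe_scrapers._exceptions import NoSchemaFoundInWildMode

    def try_wild_scrape(url: str):
        # wild_mode=True parses any page carrying schema.org/Recipe markup,
        # whether or not recipe-scrapers ships a scraper for the site.
        try:
            return scrape_me(url_path=url, wild_mode=True)
        except NoSchemaFoundInWildMode:
            return None  # caller falls back to raw text / JSON parsing
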
From b1c0334947d6b6c8c0fd59efff70ebda3984729e Mon Sep 17 00:00:00 2001
From: smilerz
Date: Thu, 7 Jul 2022 07:50:57 -0500
Subject: [PATCH 3/4] quick hack to allow scraper to work correctly

---
 cookbook/helper/recipe_html_import.py | 2 ++
 cookbook/views/api.py                 | 7 +++++--
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/cookbook/helper/recipe_html_import.py b/cookbook/helper/recipe_html_import.py
index c97629ef9..62a057e2f 100644
--- a/cookbook/helper/recipe_html_import.py
+++ b/cookbook/helper/recipe_html_import.py
@@ -73,6 +73,7 @@ def get_recipe_from_source(text, url, request):
             scrape = scrape_me(url_path=url, wild_mode=True)
         except(NoSchemaFoundInWildMode):
             pass
+
     if not scrape:
         try:
             parse_list.append(remove_graph(json.loads(text)))
@@ -101,6 +102,7 @@ def get_recipe_from_source(text, url, request):
 
     recipe_json = helper.get_from_scraper(scrape, request)
 
+    # TODO: DEPRECATE recipe_tree & html_data. first validate it isn't used anywhere
     for el in parse_list:
         temp_tree = []
         if isinstance(el, Tag):
diff --git a/cookbook/views/api.py b/cookbook/views/api.py
index 9b077b5c1..4325fe385 100644
--- a/cookbook/views/api.py
+++ b/cookbook/views/api.py
@@ -1120,7 +1120,7 @@ def recipe_from_source(request):
     """
     serializer = RecipeFromSourceSerializer(data=request.data)
     if serializer.is_valid():
-        # headers to use for request to external sites
+        # headers to use for request to external sites - DEPRECATE
        external_request_headers = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7"}
 
         if (b_pk := serializer.validated_data.get('bookmarklet', None)) and (bookmarklet := BookmarkletImport.objects.filter(pk=b_pk).first()):
@@ -1144,9 +1144,11 @@ def recipe_from_source(request):
                         'recipe_html': '',
                         'recipe_images': [],
                     }, status=status.HTTP_200_OK)
+            #######
+            # this section is redundant to scrape_me. REFACTOR to catch errors from scrape_me
             try:
                 if validators.url(serializer.validated_data['url'], public=True):
-                    serializer.validated_data['data'] = requests.get(serializer.validated_data['url'], headers=external_request_headers).content
+                    requests.get(serializer.validated_data['url'], headers=external_request_headers).content
                 else:
                     return Response({
                         'error': True,
@@ -1162,6 +1164,7 @@ def recipe_from_source(request):
                 'error': True,
                 'msg': _('Bad URL Schema.')
             }, status=status.HTTP_400_BAD_REQUEST)
+            #######
 
             recipe_json, recipe_tree, recipe_html, recipe_images = get_recipe_from_source(serializer.validated_data['data'], serializer.validated_data['url'], request)
             if len(recipe_tree) == 0 and len(recipe_json) == 0:
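
PATCH 3/4 keeps the page pre-fetch in place but discards its result: the bare `requests.get(...).content` expression no longer overwrites `serializer.validated_data['data']`, so the scraper fetches the page itself, and the `#######` markers fence off the code the REFACTOR note wants folded into the `scrape_me` call. A sketch of what that consolidation could look like (PATCH 4/4 below implements essentially this inside the view); the error strings mirror the responses in the diff:

    import requests
    from recipe_scrapers import scrape_me
    from recipe_scrapers._exceptions import NoSchemaFoundInWildMode

    def scrape_with_errors(url: str):
        # Let scrape_me perform the HTTP request and surface transport
        # errors directly, instead of pre-fetching with requests.get().
        try:
            return scrape_me(url_path=url, wild_mode=True), None
        except NoSchemaFoundInWildMode:
            return None, 'No usable data could be found.'
        except requests.exceptions.ConnectionError:
            return None, 'Connection Refused.'
        except requests.exceptions.MissingSchema:
            return None, 'Bad URL Schema.'
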
From e40b73f420564dd927bd692f6d4df1055e30de07 Mon Sep 17 00:00:00 2001
From: smilerz
Date: Thu, 7 Jul 2022 15:09:22 -0500
Subject: [PATCH 4/4] deprecate get_recipe_from_source

---
 cookbook/helper/recipe_html_import.py  | 336 ++++++++++++-------------
 cookbook/helper/recipe_url_import.py   |  35 ++-
 cookbook/helper/scrapers/scrapers.py   |  10 +-
 cookbook/integration/cookbookapp.py    |   9 +-
 cookbook/integration/copymethat.py     |   3 +-
 cookbook/views/api.py                  |  95 ++++---
 vue/src/apps/ImportView/ImportView.vue |  12 +-
 7 files changed, 272 insertions(+), 228 deletions(-)

diff --git a/cookbook/helper/recipe_html_import.py b/cookbook/helper/recipe_html_import.py
index 62a057e2f..95f115b76 100644
--- a/cookbook/helper/recipe_html_import.py
+++ b/cookbook/helper/recipe_html_import.py
@@ -1,191 +1,191 @@
-import json
-import re
-from json import JSONDecodeError
-from urllib.parse import unquote
+# import json
+# import re
+# from json import JSONDecodeError
+# from urllib.parse import unquote
 
-from bs4 import BeautifulSoup
-from bs4.element import Tag
-from recipe_scrapers import scrape_html, scrape_me
-from recipe_scrapers._exceptions import NoSchemaFoundInWildMode
-from recipe_scrapers._utils import get_host_name, normalize_string
+# from bs4 import BeautifulSoup
+# from bs4.element import Tag
+# from recipe_scrapers import scrape_html, scrape_me
+# from recipe_scrapers._exceptions import NoSchemaFoundInWildMode
+# from recipe_scrapers._utils import get_host_name, normalize_string
 
-from cookbook.helper import recipe_url_import as helper
-from cookbook.helper.scrapers.scrapers import text_scraper
+# from cookbook.helper import recipe_url_import as helper
+# from cookbook.helper.scrapers.scrapers import text_scraper
 
 
-def get_recipe_from_source(text, url, request):
-    def build_node(k, v):
-        if isinstance(v, dict):
-            node = {
-                'name': k,
-                'value': k,
-                'children': get_children_dict(v)
-            }
-        elif isinstance(v, list):
-            node = {
-                'name': k,
-                'value': k,
-                'children': get_children_list(v)
-            }
-        else:
-            node = {
-                'name': k + ": " + normalize_string(str(v)),
-                'value': normalize_string(str(v))
-            }
-        return node
+# def get_recipe_from_source(text, url, request):
+#     def build_node(k, v):
+#         if isinstance(v, dict):
+#             node = {
+#                 'name': k,
+#                 'value': k,
+#                 'children': get_children_dict(v)
+#             }
+#         elif isinstance(v, list):
+#             node = {
+#                 'name': k,
+#                 'value': k,
+#                 'children': get_children_list(v)
+#             }
+#         else:
+#             node = {
+#                 'name': k + ": " + normalize_string(str(v)),
+#                 'value': normalize_string(str(v))
+#             }
+#         return node
 
-    def get_children_dict(children):
-        kid_list = []
-        for k, v in children.items():
-            kid_list.append(build_node(k, v))
-        return kid_list
+#     def get_children_dict(children):
+#         kid_list = []
+#         for k, v in children.items():
+#             kid_list.append(build_node(k, v))
+#         return kid_list
 
-    def get_children_list(children):
-        kid_list = []
-        for kid in children:
-            if type(kid) == list:
-                node = {
-                    'name': "unknown list",
-                    'value': "unknown list",
-                    'children': get_children_list(kid)
-                }
-                kid_list.append(node)
-            elif type(kid) == dict:
-                for k, v in kid.items():
-                    kid_list.append(build_node(k, v))
-            else:
-                kid_list.append({
-                    'name': normalize_string(str(kid)),
-                    'value': normalize_string(str(kid))
-                })
-        return kid_list
+#     def get_children_list(children):
+#         kid_list = []
+#         for kid in children:
+#             if type(kid) == list:
+#                 node = {
+#                     'name': "unknown list",
+#                     'value': "unknown list",
+#                     'children': get_children_list(kid)
+#                 }
+#                 kid_list.append(node)
+#             elif type(kid) == dict:
+#                 for k, v in kid.items():
+#                     kid_list.append(build_node(k, v))
+#             else:
+#                 kid_list.append({
+#                     'name': normalize_string(str(kid)),
+#                     'value': normalize_string(str(kid))
+#                 })
+#         return kid_list
 
-    recipe_tree = []
-    parse_list = []
-    soup = BeautifulSoup(text, "html.parser")
-    html_data = get_from_html(soup)
-    images = get_images_from_source(soup, url)
-    text = unquote(text)
-    scrape = None
+#     recipe_tree = []
+#     parse_list = []
+#     soup = BeautifulSoup(text, "html.parser")
+#     html_data = get_from_html(soup)
+#     images = get_images_from_source(soup, url)
+#     text = unquote(text)
+#     scrape = None
 
-    if url and not text:
-        try:
-            scrape = scrape_me(url_path=url, wild_mode=True)
-        except(NoSchemaFoundInWildMode):
-            pass
+#     if url and not text:
+#         try:
+#             scrape = scrape_me(url_path=url, wild_mode=True)
+#         except(NoSchemaFoundInWildMode):
+#             pass
 
-    if not scrape:
-        try:
-            parse_list.append(remove_graph(json.loads(text)))
-            if not url and 'url' in parse_list[0]:
-                url = parse_list[0]['url']
-            scrape = text_scraper("", url=url)
+#     if not scrape:
+#         try:
+#             parse_list.append(remove_graph(json.loads(text)))
+#             if not url and 'url' in parse_list[0]:
+#                 url = parse_list[0]['url']
+#             scrape = text_scraper("", url=url)
 
-        except JSONDecodeError:
-            for el in soup.find_all('script', type='application/ld+json'):
-                el = remove_graph(el)
-                if not url and 'url' in el:
-                    url = el['url']
-                if type(el) == list:
-                    for le in el:
-                        parse_list.append(le)
-                elif type(el) == dict:
-                    parse_list.append(el)
-            for el in soup.find_all(type='application/json'):
-                el = remove_graph(el)
-                if type(el) == list:
-                    for le in el:
-                        parse_list.append(le)
-                elif type(el) == dict:
-                    parse_list.append(el)
-            scrape = text_scraper(text, url=url)
+#         except JSONDecodeError:
+#             for el in soup.find_all('script', type='application/ld+json'):
+#                 el = remove_graph(el)
+#                 if not url and 'url' in el:
+#                     url = el['url']
+#                 if type(el) == list:
+#                     for le in el:
+#                         parse_list.append(le)
+#                 elif type(el) == dict:
+#                     parse_list.append(el)
+#             for el in soup.find_all(type='application/json'):
+#                 el = remove_graph(el)
+#                 if type(el) == list:
+#                     for le in el:
+#                         parse_list.append(le)
+#                 elif type(el) == dict:
+#                     parse_list.append(el)
+#             scrape = text_scraper(text, url=url)
 
-    recipe_json = helper.get_from_scraper(scrape, request)
+#     recipe_json = helper.get_from_scraper(scrape, request)
 
-    # TODO: DEPRECATE recipe_tree & html_data. first validate it isn't used anywhere
-    for el in parse_list:
-        temp_tree = []
-        if isinstance(el, Tag):
-            try:
-                el = json.loads(el.string)
-            except TypeError:
-                continue
+#     # TODO: DEPRECATE recipe_tree & html_data. first validate it isn't used anywhere
+#     for el in parse_list:
+#         temp_tree = []
+#         if isinstance(el, Tag):
+#             try:
+#                 el = json.loads(el.string)
+#             except TypeError:
+#                 continue
 
-        for k, v in el.items():
-            if isinstance(v, dict):
-                node = {
-                    'name': k,
-                    'value': k,
-                    'children': get_children_dict(v)
-                }
-            elif isinstance(v, list):
-                node = {
-                    'name': k,
-                    'value': k,
-                    'children': get_children_list(v)
-                }
-            else:
-                node = {
-                    'name': k + ": " + normalize_string(str(v)),
-                    'value': normalize_string(str(v))
-                }
-            temp_tree.append(node)
+#         for k, v in el.items():
+#             if isinstance(v, dict):
+#                 node = {
+#                     'name': k,
+#                     'value': k,
+#                     'children': get_children_dict(v)
+#                 }
+#             elif isinstance(v, list):
+#                 node = {
+#                     'name': k,
+#                     'value': k,
+#                     'children': get_children_list(v)
+#                 }
+#             else:
+#                 node = {
+#                     'name': k + ": " + normalize_string(str(v)),
+#                     'value': normalize_string(str(v))
+#                 }
+#             temp_tree.append(node)
 
-        if '@type' in el and el['@type'] == 'Recipe':
-            recipe_tree += [{'name': 'ld+json', 'children': temp_tree}]
-        else:
-            recipe_tree += [{'name': 'json', 'children': temp_tree}]
+#         if '@type' in el and el['@type'] == 'Recipe':
+#             recipe_tree += [{'name': 'ld+json', 'children': temp_tree}]
+#         else:
+#             recipe_tree += [{'name': 'json', 'children': temp_tree}]
 
-    return recipe_json, recipe_tree, html_data, images
+#     return recipe_json, recipe_tree, html_data, images
 
 
-def get_from_html(soup):
-    INVISIBLE_ELEMS = ('style', 'script', 'head', 'title')
-    html = []
-    for s in soup.strings:
-        if ((s.parent.name not in INVISIBLE_ELEMS) and (len(s.strip()) > 0)):
-            html.append(s)
-    return html
+# def get_from_html(soup):
+#     INVISIBLE_ELEMS = ('style', 'script', 'head', 'title')
+#     html = []
+#     for s in soup.strings:
+#         if ((s.parent.name not in INVISIBLE_ELEMS) and (len(s.strip()) > 0)):
+#             html.append(s)
+#     return html
 
 
-def get_images_from_source(soup, url):
-    sources = ['src', 'srcset', 'data-src']
-    images = []
-    img_tags = soup.find_all('img')
-    if url:
-        site = get_host_name(url)
-        prot = url.split(':')[0]
+# def get_images_from_source(soup, url):
+#     sources = ['src', 'srcset', 'data-src']
+#     images = []
+#     img_tags = soup.find_all('img')
+#     if url:
+#         site = get_host_name(url)
+#         prot = url.split(':')[0]
 
-    urls = []
-    for img in img_tags:
-        for src in sources:
-            try:
-                urls.append(img[src])
-            except KeyError:
-                pass
+#     urls = []
+#     for img in img_tags:
+#         for src in sources:
+#             try:
+#                 urls.append(img[src])
+#             except KeyError:
+#                 pass
 
-    for u in urls:
-        u = u.split('?')[0]
-        filename = re.search(r'/([\w_-]+[.](jpg|jpeg|gif|png))$', u)
-        if filename:
-            if (('http' not in u) and (url)):
-                # sometimes an image source can be relative
-                # if it is provide the base url
-                u = '{}://{}{}'.format(prot, site, u)
-            if 'http' in u:
-                images.append(u)
-    return images
+#     for u in urls:
+#         u = u.split('?')[0]
+#         filename = re.search(r'/([\w_-]+[.](jpg|jpeg|gif|png))$', u)
+#         if filename:
+#             if (('http' not in u) and (url)):
+#                 # sometimes an image source can be relative
+#                 # if it is provide the base url
+#                 u = '{}://{}{}'.format(prot, site, u)
+#             if 'http' in u:
+#                 images.append(u)
+#     return images
 
 
-def remove_graph(el):
-    # recipes type might be wrapped in @graph type
-    if isinstance(el, Tag):
-        try:
-            el = json.loads(el.string)
-            if '@graph' in el:
-                for x in el['@graph']:
-                    if '@type' in x and x['@type'] == 'Recipe':
-                        el = x
-        except (TypeError, JSONDecodeError):
-            pass
-    return el
+# def remove_graph(el):
+#     # recipes type might be wrapped in @graph type
+#     if isinstance(el, Tag):
+#         try:
+#             el = json.loads(el.string)
+#             if '@graph' in el:
+#                 for x in el['@graph']:
+#                     if '@type' in x and x['@type'] == 'Recipe':
+#                         el = x
+#         except (TypeError, JSONDecodeError):
+#             pass
+#     return el
diff --git a/cookbook/helper/recipe_url_import.py b/cookbook/helper/recipe_url_import.py
index aa3cc5cff..cec57e729 100644
--- a/cookbook/helper/recipe_url_import.py
+++ b/cookbook/helper/recipe_url_import.py
@@ -1,21 +1,19 @@
 import random
 import re
 from html import unescape
-
-from pytube import YouTube
 from unicodedata import decomposition
 
 from django.utils.dateparse import parse_duration
 from django.utils.translation import gettext as _
 from isodate import parse_duration as iso_parse_duration
 from isodate.isoerror import ISO8601Error
-from recipe_scrapers._utils import get_minutes
+from pytube import YouTube
+from recipe_scrapers._utils import get_host_name, get_minutes
 
 from cookbook.helper import recipe_url_import as helper
 from cookbook.helper.ingredient_parser import IngredientParser
 from cookbook.models import Keyword
 
-
 # from recipe_scrapers._utils import get_minutes ## temporary until/unless upstream incorporates get_minutes() PR
@@ -369,3 +367,32 @@ def iso_duration_to_minutes(string):
                       string
                       ).groupdict()
     return int(match['days'] or 0) * 24 * 60 + int(match['hours'] or 0) * 60 + int(match['minutes'] or 0)
+
+
+def get_images_from_soup(soup, url):
+    sources = ['src', 'srcset', 'data-src']
+    images = []
+    img_tags = soup.find_all('img')
+    if url:
+        site = get_host_name(url)
+        prot = url.split(':')[0]
+
+    urls = []
+    for img in img_tags:
+        for src in sources:
+            try:
+                urls.append(img[src])
+            except KeyError:
+                pass
+
+    for u in urls:
+        u = u.split('?')[0]
+        filename = re.search(r'/([\w_-]+[.](jpg|jpeg|gif|png))$', u)
+        if filename:
+            if (('http' not in u) and (url)):
+                # sometimes an image source can be relative
+                # if it is provide the base url
+                u = '{}://{}{}'.format(prot, site, u)
+            if 'http' in u:
+                images.append(u)
+    return images
diff --git a/cookbook/helper/scrapers/scrapers.py b/cookbook/helper/scrapers/scrapers.py
index 94e3daea0..7d6c08b15 100644
--- a/cookbook/helper/scrapers/scrapers.py
+++ b/cookbook/helper/scrapers/scrapers.py
@@ -27,17 +27,17 @@ def text_scraper(text, url=None):
     class TextScraper(scraper_class):
         def __init__(
             self,
-            page_data,
-            url=None
+            html=None,
+            url=None,
         ):
             self.wild_mode = False
             self.meta_http_equiv = False
-            self.soup = BeautifulSoup(page_data, "html.parser")
+            self.soup = BeautifulSoup(html, "html.parser")
             self.url = url
             self.recipe = None
             try:
-                self.schema = SchemaOrg(page_data)
+                self.schema = SchemaOrg(html)
             except (JSONDecodeError, AttributeError):
                 pass
 
-    return TextScraper(text, url)
+    return TextScraper(url=url, html=text)
diff --git a/cookbook/integration/cookbookapp.py b/cookbook/integration/cookbookapp.py
index f22e9d45d..7ff50ab62 100644
--- a/cookbook/integration/cookbookapp.py
+++ b/cookbook/integration/cookbookapp.py
@@ -10,8 +10,8 @@ import validators
 import yaml
 
 from cookbook.helper.ingredient_parser import IngredientParser
-from cookbook.helper.recipe_html_import import get_recipe_from_source
-from cookbook.helper.recipe_url_import import iso_duration_to_minutes
+from cookbook.helper.recipe_url_import import get_images_from_soup, iso_duration_to_minutes
+from cookbook.helper.scrapers.scrapers import text_scraper
 from cookbook.integration.integration import Integration
 from cookbook.models import Ingredient, Keyword, Recipe, Step
 
@@ -24,7 +24,10 @@ class CookBookApp(Integration):
 
     def get_recipe_from_file(self, file):
         recipe_html = file.getvalue().decode("utf-8")
-        recipe_json, recipe_tree, html_data, images = get_recipe_from_source(recipe_html, 'CookBookApp', self.request)
+        # recipe_json, recipe_tree, html_data, images = get_recipe_from_source(recipe_html, 'CookBookApp', self.request)
+        scrape = text_scraper(text=data)
+        recipe_json = helper.get_from_scraper(scrape, request)
+        images = list(dict.fromkeys(get_images_from_soup(scrape.soup, url)))
 
         recipe = Recipe.objects.create(
             name=recipe_json['name'].strip(),
diff --git a/cookbook/integration/copymethat.py b/cookbook/integration/copymethat.py
index 7a2a532f9..2a9c56521 100644
--- a/cookbook/integration/copymethat.py
+++ b/cookbook/integration/copymethat.py
@@ -3,10 +3,9 @@ from io import BytesIO
 from zipfile import ZipFile
 
 from bs4 import BeautifulSoup
-
 from django.utils.translation import gettext as _
+
 from cookbook.helper.ingredient_parser import IngredientParser
-from cookbook.helper.recipe_html_import import get_recipe_from_source
 from cookbook.helper.recipe_url_import import iso_duration_to_minutes, parse_servings
 from cookbook.integration.integration import Integration
 from cookbook.models import Ingredient, Keyword, Recipe, Step
diff --git a/cookbook/views/api.py b/cookbook/views/api.py
index 4325fe385..54df51bfb 100644
--- a/cookbook/views/api.py
+++ b/cookbook/views/api.py
@@ -5,6 +5,8 @@ import re
 import traceback
 import uuid
 from collections import OrderedDict
+from json import JSONDecodeError
+from urllib.parse import unquote
 from zipfile import ZipFile
 
 import requests
@@ -26,6 +28,8 @@ from django.utils.translation import gettext as _
 from django_scopes import scopes_disabled
 from icalendar import Calendar, Event
 from PIL import UnidentifiedImageError
+from recipe_scrapers import scrape_html, scrape_me
+from recipe_scrapers._exceptions import NoSchemaFoundInWildMode
 from requests.exceptions import MissingSchema
 from rest_framework import decorators, status, viewsets
 from rest_framework.authtoken.models import Token
@@ -40,6 +44,7 @@ from rest_framework.throttling import AnonRateThrottle
 from rest_framework.viewsets import ViewSetMixin
 from treebeard.exceptions import InvalidMoveToDescendant, InvalidPosition, PathOverflow
 
+from cookbook.helper import recipe_url_import as helper
 from cookbook.helper.HelperFunctions import str2bool
 from cookbook.helper.image_processing import handle_image
 from cookbook.helper.ingredient_parser import IngredientParser
@@ -47,9 +52,9 @@ from cookbook.helper.permission_helper import (CustomIsAdmin, CustomIsGuest, CustomIsOwner,
                                                CustomIsOwnerReadOnly, CustomIsShare, CustomIsShared,
                                                CustomIsSpaceOwner, CustomIsUser, group_required,
                                                is_space_owner, switch_user_active_space)
-from cookbook.helper.recipe_html_import import get_recipe_from_source
 from cookbook.helper.recipe_search import RecipeFacet, RecipeSearch, old_search
-from cookbook.helper.recipe_url_import import get_from_youtube_scraper
+from cookbook.helper.recipe_url_import import get_from_youtube_scraper, get_images_from_soup
+from cookbook.helper.scrapers.scrapers import text_scraper
 from cookbook.helper.shopping_helper import RecipeShoppingEditor, shopping_helper
 from cookbook.models import (Automation, BookmarkletImport, CookLog, CustomFilter, ExportLog, Food,
                              FoodInheritField, ImportLog, Ingredient, InviteLink, Keyword, MealPlan,
@@ -1116,69 +1121,79 @@ def recipe_from_source(request):
     - url: url to use for importing recipe
     - data: if no url is given recipe is imported from provided source data
     - (optional) bookmarklet: id of bookmarklet import to use, overrides URL and data attributes
-    :return: JsonResponse containing the parsed json, original html,json and images
+    :return: JsonResponse containing the parsed json and images
     """
+    scrape = None
     serializer = RecipeFromSourceSerializer(data=request.data)
     if serializer.is_valid():
-        # headers to use for request to external sites - DEPRECATE
-        external_request_headers = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7"}
-
         if (b_pk := serializer.validated_data.get('bookmarklet', None)) and (bookmarklet := BookmarkletImport.objects.filter(pk=b_pk).first()):
             serializer.validated_data['url'] = bookmarklet.url
             serializer.validated_data['data'] = bookmarklet.html
             bookmarklet.delete()
 
-        elif not 'url' in serializer.validated_data and not 'data' in serializer.validated_data:
+        url = serializer.validated_data.get('url', None)
+        data = unquote(serializer.validated_data.get('data', None))
+        if not url and not data:
             return Response({
                 'error': True,
                 'msg': _('Nothing to do.')
             }, status=status.HTTP_400_BAD_REQUEST)
 
-        # in manual mode request complete page to return it later
-        elif 'url' in serializer.validated_data and serializer.validated_data['url'] != '':
-            if re.match('^(https?://)?(www\.youtube\.com|youtu\.be)/.+$', serializer.validated_data['url']):
-                if validators.url(serializer.validated_data['url'], public=True):
+        elif url and not data:
+            if re.match('^(https?://)?(www\.youtube\.com|youtu\.be)/.+$', url):
+                if validators.url(url, public=True):
                     return Response({
-                        'recipe_json': get_from_youtube_scraper(serializer.validated_data['url'], request),
-                        'recipe_tree': '',
-                        'recipe_html': '',
+                        'recipe_json': get_from_youtube_scraper(url, request),
+                        # 'recipe_tree': '',
+                        # 'recipe_html': '',
                         'recipe_images': [],
                     }, status=status.HTTP_200_OK)
-            #######
-            # this section is redundant to scrape_me. REFACTOR to catch errors from scrape_me
-            try:
-                if validators.url(serializer.validated_data['url'], public=True):
-                    requests.get(serializer.validated_data['url'], headers=external_request_headers).content
-                else:
+            else:
+                try:
+                    if validators.url(url, public=True):
+                        scrape = scrape_me(url_path=url, wild_mode=True)
+
+                    else:
+                        return Response({
+                            'error': True,
+                            'msg': _('Invalid Url')
+                        }, status=status.HTTP_400_BAD_REQUEST)
+                except NoSchemaFoundInWildMode:
+                    pass
+                except requests.exceptions.ConnectionError:
                     return Response({
                         'error': True,
-                        'msg': _('Invalid Url')
+                        'msg': _('Connection Refused.')
                     }, status=status.HTTP_400_BAD_REQUEST)
-            except requests.exceptions.ConnectionError:
-                return Response({
-                    'error': True,
-                    'msg': _('Connection Refused.')
-                }, status=status.HTTP_400_BAD_REQUEST)
-            except requests.exceptions.MissingSchema:
-                return Response({
-                    'error': True,
-                    'msg': _('Bad URL Schema.')
-                }, status=status.HTTP_400_BAD_REQUEST)
-            #######
+                except requests.exceptions.MissingSchema:
+                    return Response({
+                        'error': True,
+                        'msg': _('Bad URL Schema.')
+                    }, status=status.HTTP_400_BAD_REQUEST)
+        else:
+            try:
+                json.loads(data)
+                data = ""
+            except JSONDecodeError:
+                pass
+            scrape = text_scraper(text=data, url=url)
+            if not url and (found_url := scrape.schema.data.get('url', None)):
+                scrape = text_scraper(text=data, url=found_url)
 
-            recipe_json, recipe_tree, recipe_html, recipe_images = get_recipe_from_source(serializer.validated_data['data'], serializer.validated_data['url'], request)
-            if len(recipe_tree) == 0 and len(recipe_json) == 0:
+        if scrape:
+            return Response({
+                'recipe_json': helper.get_from_scraper(scrape, request),
+                # 'recipe_tree': recipe_tree,
+                # 'recipe_html': recipe_html,
+                'recipe_images': list(dict.fromkeys(get_images_from_soup(scrape.soup, url))),
+            }, status=status.HTTP_200_OK)
+
+        else:
             return Response({
                 'error': True,
                 'msg': _('No usable data could be found.')
             }, status=status.HTTP_400_BAD_REQUEST)
-        else:
-            return Response({
-                'recipe_json': recipe_json,
-                'recipe_tree': recipe_tree,
-                'recipe_html': recipe_html,
-                'recipe_images': list(dict.fromkeys(recipe_images)),
-            }, status=status.HTTP_200_OK)
     else:
         return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)
diff --git a/vue/src/apps/ImportView/ImportView.vue b/vue/src/apps/ImportView/ImportView.vue
index 407664929..4b2ed0ffd 100644
--- a/vue/src/apps/ImportView/ImportView.vue
+++ b/vue/src/apps/ImportView/ImportView.vue
@@ -461,8 +461,8 @@ export default {
             recent_urls: [],
             source_data: '',
             recipe_json: undefined,
-            recipe_html: undefined,
-            recipe_tree: undefined,
+            // recipe_html: undefined,
+            // recipe_tree: undefined,
             recipe_images: [],
             imported_recipes: [],
             failed_imports: [],
@@ -593,9 +593,9 @@ export default {
             }
 
             // reset all variables
-            this.recipe_html = undefined
+            // this.recipe_html = undefined
             this.recipe_json = undefined
-            this.recipe_tree = undefined
+            // this.recipe_tree = undefined
             this.recipe_images = []
 
             // load recipe
@@ -621,8 +621,8 @@ export default {
                         return x
                     })
 
-                    this.recipe_tree = response.data['recipe_tree'];
-                    this.recipe_html = response.data['recipe_html'];
+                    // this.recipe_tree = response.data['recipe_tree'];
+                    // this.recipe_html = response.data['recipe_html'];
                    this.recipe_images = response.data['recipe_images'] !== undefined ? response.data['recipe_images'] : [];
 
                    if (!silent) {
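
PATCH 4/4 retires `get_recipe_from_source` entirely: the helper is commented out, the view builds a scraper via `text_scraper` and converts it with `helper.get_from_scraper`, image extraction moves into recipe_url_import.py as `get_images_from_soup`, and the API response drops the deprecated `recipe_tree`/`recipe_html` fields (mirrored by the commented-out bindings in ImportView.vue). The CookBookApp hunk sketches the same pipeline but still refers to `data`, `request` and `url`, names not bound in that method; a self-contained sketch of the intended call sequence with those bindings made explicit, under the assumption that it runs where a request object and the file's HTML string are available:

    from cookbook.helper import recipe_url_import as helper
    from cookbook.helper.recipe_url_import import get_images_from_soup
    from cookbook.helper.scrapers.scrapers import text_scraper

    def parse_recipe_html(html, request, url=None):
        # text_scraper builds a BeautifulSoup tree and a SchemaOrg parser
        # from the raw HTML; get_from_scraper turns that into recipe JSON.
        scrape = text_scraper(text=html, url=url)
        recipe_json = helper.get_from_scraper(scrape, request)
        # dict.fromkeys() de-duplicates image URLs while preserving order.
        images = list(dict.fromkeys(get_images_from_soup(scrape.soup, url)))
        return recipe_json, images
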