From a02582e9f8f329d05483e94df02a0f4c420a2bac Mon Sep 17 00:00:00 2001 From: smilerz Date: Tue, 9 Jul 2024 08:01:39 -0500 Subject: [PATCH] rebase --- cookbook/helper/scrapers/cooksillustrated.py | 68 ------------------- cookbook/helper/scrapers/scrapers.py | 43 ------------ cookbook/integration/cookbookapp.py | 4 +- cookbook/tests/api/test_api_import_log.py | 2 +- .../other/test_recipe_full_text_search.py | 8 +-- cookbook/tests/other/test_url_import.py | 41 +++++------ cookbook/views/api.py | 7 +- 7 files changed, 28 insertions(+), 145 deletions(-) delete mode 100644 cookbook/helper/scrapers/cooksillustrated.py delete mode 100644 cookbook/helper/scrapers/scrapers.py diff --git a/cookbook/helper/scrapers/cooksillustrated.py b/cookbook/helper/scrapers/cooksillustrated.py deleted file mode 100644 index e1e54a97a..000000000 --- a/cookbook/helper/scrapers/cooksillustrated.py +++ /dev/null @@ -1,68 +0,0 @@ -import json -from recipe_scrapers._abstract import AbstractScraper - - -class CooksIllustrated(AbstractScraper): - @classmethod - def host(cls, site='cooksillustrated'): - return { - 'cooksillustrated': f"{site}.com", - 'americastestkitchen': f"{site}.com", - 'cookscountry': f"{site}.com", - }.get(site) - - def title(self): - return self.schema.title() - - def image(self): - return self.schema.image() - - def total_time(self): - if not self.recipe: - self.get_recipe() - return self.recipe['recipeTimeNote'] - - def yields(self): - if not self.recipe: - self.get_recipe() - return self.recipe['yields'] - - def ingredients(self): - if not self.recipe: - self.get_recipe() - ingredients = [] - for group in self.recipe['ingredientGroups']: - ingredients += group['fields']['recipeIngredientItems'] - return [ - "{} {} {}{}".format( - i['fields']['qty'] or '', - i['fields']['measurement'] or '', - i['fields']['ingredient']['fields']['title'] or '', - i['fields']['postText'] or '' - ) - for i in ingredients - ] - - def instructions(self): - if not self.recipe: - 
self.get_recipe() - if self.recipe.get('headnote', False): - i = ['Note: ' + self.recipe.get('headnote', '')] - else: - i = [] - return "\n".join( - i - + [self.recipe.get('whyThisWorks', '')] - + [ - instruction['fields']['content'] - for instruction in self.recipe['instructions'] - ] - ) - - def nutrients(self): - raise NotImplementedError("This should be implemented.") - - def get_recipe(self): - j = json.loads(self.soup.find(type='application/json').string) - name = list(j['props']['initialState']['content']['documents'])[0] - self.recipe = j['props']['initialState']['content']['documents'][name] diff --git a/cookbook/helper/scrapers/scrapers.py b/cookbook/helper/scrapers/scrapers.py deleted file mode 100644 index 0cf01333b..000000000 --- a/cookbook/helper/scrapers/scrapers.py +++ /dev/null @@ -1,43 +0,0 @@ -from json import JSONDecodeError - -from bs4 import BeautifulSoup -from recipe_scrapers import SCRAPERS, get_host_name -from recipe_scrapers._factory import SchemaScraperFactory -from recipe_scrapers._schemaorg import SchemaOrg - -from .cooksillustrated import CooksIllustrated - -CUSTOM_SCRAPERS = { - CooksIllustrated.host(site="cooksillustrated"): CooksIllustrated, - CooksIllustrated.host(site="americastestkitchen"): CooksIllustrated, - CooksIllustrated.host(site="cookscountry"): CooksIllustrated, -} -SCRAPERS.update(CUSTOM_SCRAPERS) - - -def text_scraper(text, url=None): - domain = None - if url: - domain = get_host_name(url) - if domain in SCRAPERS: - scraper_class = SCRAPERS[domain] - else: - scraper_class = SchemaScraperFactory.SchemaScraper - - class TextScraper(scraper_class): - def __init__( - self, - html=None, - url=None, - ): - self.supported_only = False - self.meta_http_equiv = False - self.soup = BeautifulSoup(html, "html.parser") - self.url = url - self.recipe = None - try: - self.schema = SchemaOrg(html) - except (JSONDecodeError, AttributeError): - pass - - return TextScraper(url=url, html=text) diff --git 
a/cookbook/integration/cookbookapp.py b/cookbook/integration/cookbookapp.py index 21d9d7f30..a9b5ac132 100644 --- a/cookbook/integration/cookbookapp.py +++ b/cookbook/integration/cookbookapp.py @@ -7,7 +7,7 @@ import validators from cookbook.helper.ingredient_parser import IngredientParser from cookbook.helper.recipe_url_import import (get_from_scraper, get_images_from_soup, iso_duration_to_minutes) -from cookbook.helper.scrapers.scrapers import text_scraper +from recipe_scrapers import scrape_html from cookbook.integration.integration import Integration from cookbook.models import Ingredient, Recipe, Step @@ -20,7 +20,7 @@ class CookBookApp(Integration): def get_recipe_from_file(self, file): recipe_html = file.getvalue().decode("utf-8") - scrape = text_scraper(text=recipe_html) + scrape = scrape_html(html=recipe_html, org_url="https://cookbookapp.import", supported_only=False) recipe_json = get_from_scraper(scrape, self.request) images = list(dict.fromkeys(get_images_from_soup(scrape.soup, None))) diff --git a/cookbook/tests/api/test_api_import_log.py b/cookbook/tests/api/test_api_import_log.py index 4bfd7ec7c..bae21eaed 100644 --- a/cookbook/tests/api/test_api_import_log.py +++ b/cookbook/tests/api/test_api_import_log.py @@ -51,7 +51,7 @@ def test_list_space(obj_1, obj_2, u1_s1, u1_s2, space_2): ['g1_s2', 403], ['u1_s2', 404], ['a1_s2', 404], -]) +], ids=str) def test_update(arg, request, obj_1): c = request.getfixturevalue(arg[0]) r = c.patch( diff --git a/cookbook/tests/other/test_recipe_full_text_search.py b/cookbook/tests/other/test_recipe_full_text_search.py index 9118c64dd..c8e34e750 100644 --- a/cookbook/tests/other/test_recipe_full_text_search.py +++ b/cookbook/tests/other/test_recipe_full_text_search.py @@ -273,12 +273,12 @@ def test_search_units(found_recipe, recipes, u1_s1, space_1): ('fuzzy_lookups', True), ('fuzzy_lookups', False) ], [('unaccent', True), ('unaccent', False)] -), indirect=['user1']) +), indirect=['user1'], ids=str) 
@pytest.mark.parametrize("found_recipe, param_type", [ ({'unit': True}, 'unit'), ({'keyword': True}, 'keyword'), ({'food': True}, 'food'), -], indirect=['found_recipe']) +], indirect=['found_recipe'], ids=str) def test_fuzzy_lookup(found_recipe, recipes, param_type, user1, space_1): with scope(space=space_1): list_url = f'api:{param_type}-list' @@ -306,14 +306,14 @@ def test_fuzzy_lookup(found_recipe, recipes, param_type, user1, space_1): ('istartswith', True), ('istartswith', False), ], [('unaccent', True), ('unaccent', False)] -), indirect=['user1']) +), indirect=['user1'], ids=str) @pytest.mark.parametrize("found_recipe", [ ({'name': True}), ({'description': True}), ({'instruction': True}), ({'keyword': True}), ({'food': True}), -], indirect=['found_recipe']) +], indirect=['found_recipe'], ids=str) # user array contains: user client, expected count of search, expected count of mispelled search, search string, mispelled search string, user search preferences def test_search_string(found_recipe, recipes, user1, space_1): with scope(space=space_1): diff --git a/cookbook/tests/other/test_url_import.py b/cookbook/tests/other/test_url_import.py index ae4677c0f..176ca5110 100644 --- a/cookbook/tests/other/test_url_import.py +++ b/cookbook/tests/other/test_url_import.py @@ -19,6 +19,23 @@ DATA_DIR = "cookbook/tests/other/test_data/" # plus the test that previously existed # plus the custom scraper that was created # plus any specific defects discovered along the way +RECIPES = [ + ALLRECIPES, + AMERICAS_TEST_KITCHEN, + CHEF_KOCH, + CHEF_KOCH2, # test for empty ingredient in ingredient_parser + COOKPAD, + COOKS_COUNTRY, + DELISH, + FOOD_NETWORK, + GIALLOZAFFERANO, + JOURNAL_DES_FEMMES, + MADAME_DESSERT, # example of json only source + MARMITON, + TASTE_OF_HOME, + THE_SPRUCE_EATS, # example of non-json recipes_scraper + TUDOGOSTOSO, +] @pytest.mark.parametrize("arg", [ @@ -32,29 +49,7 @@ def test_import_permission(arg, request): assert 
c.get(reverse(IMPORT_SOURCE_URL)).status_code == arg[1] -@pytest.mark.parametrize("arg", [ - ALLRECIPES, - # test of custom scraper ATK - AMERICAS_TEST_KITCHEN, - CHEF_KOCH, - # test for empty ingredient in ingredient_parser - CHEF_KOCH2, - COOKPAD, - # test of custom scraper ATK - COOKS_COUNTRY, - DELISH, - FOOD_NETWORK, - GIALLOZAFFERANO, - JOURNAL_DES_FEMMES, - # example of recipes_scraper in with wildmode - # example of json only source - MADAME_DESSERT, - MARMITON, - TASTE_OF_HOME, - # example of non-json recipes_scraper - THE_SPRUCE_EATS, # TODO seems to be broken in recipe scrapers - TUDOGOSTOSO, -]) +@pytest.mark.parametrize("arg", RECIPES, ids=[x['file'][0] for x in RECIPES]) def test_recipe_import(arg, u1_s1): url = arg['url'] for f in list(arg['file']): # url and files get popped later diff --git a/cookbook/views/api.py b/cookbook/views/api.py index 3c2d596b2..734e5333b 100644 --- a/cookbook/views/api.py +++ b/cookbook/views/api.py @@ -63,7 +63,6 @@ from cookbook.helper.permission_helper import ( ) from cookbook.helper.recipe_search import RecipeSearch from cookbook.helper.recipe_url_import import clean_dict, get_from_youtube_scraper, get_images_from_soup -from cookbook.helper.scrapers.scrapers import text_scraper from cookbook.helper.shopping_helper import RecipeShoppingEditor, shopping_helper from cookbook.models import (Automation, BookmarkletImport, CookLog, CustomFilter, ExportLog, Food, FoodInheritField, FoodProperty, ImportLog, Ingredient, InviteLink, @@ -1457,9 +1456,9 @@ class RecipeUrlImportView(APIView): data = "" except JSONDecodeError: pass - scrape = text_scraper(text=data, url=url) - if not url and (found_url := scrape.schema.data.get('url', None)): - scrape = text_scraper(text=data, url=found_url) + scrape = scrape_html(html=data, org_url=url, supported_only=False) + if not url and (found_url := scrape.schema.data.get('url', 'https://urlnotfound.none')): + scrape = scrape_html(html=data, org_url=found_url, supported_only=False) if scrape: 
return Response({