From a02582e9f8f329d05483e94df02a0f4c420a2bac Mon Sep 17 00:00:00 2001 From: smilerz Date: Tue, 9 Jul 2024 08:01:39 -0500 Subject: [PATCH] rebase --- cookbook/helper/scrapers/cooksillustrated.py | 68 ------------------- cookbook/helper/scrapers/scrapers.py | 43 ------------ cookbook/integration/cookbookapp.py | 4 +- cookbook/tests/api/test_api_import_log.py | 2 +- .../other/test_recipe_full_text_search.py | 8 +-- cookbook/tests/other/test_url_import.py | 41 +++++------ cookbook/views/api.py | 7 +- 7 files changed, 28 insertions(+), 145 deletions(-) delete mode 100644 cookbook/helper/scrapers/cooksillustrated.py delete mode 100644 cookbook/helper/scrapers/scrapers.py diff --git a/cookbook/helper/scrapers/cooksillustrated.py b/cookbook/helper/scrapers/cooksillustrated.py deleted file mode 100644 index e1e54a97a..000000000 --- a/cookbook/helper/scrapers/cooksillustrated.py +++ /dev/null @@ -1,68 +0,0 @@ -import json -from recipe_scrapers._abstract import AbstractScraper - - -class CooksIllustrated(AbstractScraper): - @classmethod - def host(cls, site='cooksillustrated'): - return { - 'cooksillustrated': f"{site}.com", - 'americastestkitchen': f"{site}.com", - 'cookscountry': f"{site}.com", - }.get(site) - - def title(self): - return self.schema.title() - - def image(self): - return self.schema.image() - - def total_time(self): - if not self.recipe: - self.get_recipe() - return self.recipe['recipeTimeNote'] - - def yields(self): - if not self.recipe: - self.get_recipe() - return self.recipe['yields'] - - def ingredients(self): - if not self.recipe: - self.get_recipe() - ingredients = [] - for group in self.recipe['ingredientGroups']: - ingredients += group['fields']['recipeIngredientItems'] - return [ - "{} {} {}{}".format( - i['fields']['qty'] or '', - i['fields']['measurement'] or '', - i['fields']['ingredient']['fields']['title'] or '', - i['fields']['postText'] or '' - ) - for i in ingredients - ] - - def instructions(self): - if not self.recipe: - 
self.get_recipe() - if self.recipe.get('headnote', False): - i = ['Note: ' + self.recipe.get('headnote', '')] - else: - i = [] - return "\n".join( - i - + [self.recipe.get('whyThisWorks', '')] - + [ - instruction['fields']['content'] - for instruction in self.recipe['instructions'] - ] - ) - - def nutrients(self): - raise NotImplementedError("This should be implemented.") - - def get_recipe(self): - j = json.loads(self.soup.find(type='application/json').string) - name = list(j['props']['initialState']['content']['documents'])[0] - self.recipe = j['props']['initialState']['content']['documents'][name] diff --git a/cookbook/helper/scrapers/scrapers.py b/cookbook/helper/scrapers/scrapers.py deleted file mode 100644 index 0cf01333b..000000000 --- a/cookbook/helper/scrapers/scrapers.py +++ /dev/null @@ -1,43 +0,0 @@ -from json import JSONDecodeError - -from bs4 import BeautifulSoup -from recipe_scrapers import SCRAPERS, get_host_name -from recipe_scrapers._factory import SchemaScraperFactory -from recipe_scrapers._schemaorg import SchemaOrg - -from .cooksillustrated import CooksIllustrated - -CUSTOM_SCRAPERS = { - CooksIllustrated.host(site="cooksillustrated"): CooksIllustrated, - CooksIllustrated.host(site="americastestkitchen"): CooksIllustrated, - CooksIllustrated.host(site="cookscountry"): CooksIllustrated, -} -SCRAPERS.update(CUSTOM_SCRAPERS) - - -def text_scraper(text, url=None): - domain = None - if url: - domain = get_host_name(url) - if domain in SCRAPERS: - scraper_class = SCRAPERS[domain] - else: - scraper_class = SchemaScraperFactory.SchemaScraper - - class TextScraper(scraper_class): - def __init__( - self, - html=None, - url=None, - ): - self.supported_only = False - self.meta_http_equiv = False - self.soup = BeautifulSoup(html, "html.parser") - self.url = url - self.recipe = None - try: - self.schema = SchemaOrg(html) - except (JSONDecodeError, AttributeError): - pass - - return TextScraper(url=url, html=text) diff --git 
a/cookbook/integration/cookbookapp.py b/cookbook/integration/cookbookapp.py index 21d9d7f30..a9b5ac132 100644 --- a/cookbook/integration/cookbookapp.py +++ b/cookbook/integration/cookbookapp.py @@ -7,7 +7,7 @@ import validators from cookbook.helper.ingredient_parser import IngredientParser from cookbook.helper.recipe_url_import import (get_from_scraper, get_images_from_soup, iso_duration_to_minutes) -from cookbook.helper.scrapers.scrapers import text_scraper +from recipe_scrapers import scrape_html from cookbook.integration.integration import Integration from cookbook.models import Ingredient, Recipe, Step @@ -20,7 +20,7 @@ class CookBookApp(Integration): def get_recipe_from_file(self, file): recipe_html = file.getvalue().decode("utf-8") - scrape = text_scraper(text=recipe_html) + scrape = scrape_html(html=recipe_html, org_url="https://cookbookapp.import", supported_only=False) recipe_json = get_from_scraper(scrape, self.request) images = list(dict.fromkeys(get_images_from_soup(scrape.soup, None))) diff --git a/cookbook/tests/api/test_api_import_log.py b/cookbook/tests/api/test_api_import_log.py index 4bfd7ec7c..bae21eaed 100644 --- a/cookbook/tests/api/test_api_import_log.py +++ b/cookbook/tests/api/test_api_import_log.py @@ -51,7 +51,7 @@ def test_list_space(obj_1, obj_2, u1_s1, u1_s2, space_2): ['g1_s2', 403], ['u1_s2', 404], ['a1_s2', 404], -]) +], ids=str) def test_update(arg, request, obj_1): c = request.getfixturevalue(arg[0]) r = c.patch( diff --git a/cookbook/tests/other/test_recipe_full_text_search.py b/cookbook/tests/other/test_recipe_full_text_search.py index 9118c64dd..c8e34e750 100644 --- a/cookbook/tests/other/test_recipe_full_text_search.py +++ b/cookbook/tests/other/test_recipe_full_text_search.py @@ -273,12 +273,12 @@ def test_search_units(found_recipe, recipes, u1_s1, space_1): ('fuzzy_lookups', True), ('fuzzy_lookups', False) ], [('unaccent', True), ('unaccent', False)] -), indirect=['user1']) +), indirect=['user1'], ids=str) 
@pytest.mark.parametrize("found_recipe, param_type", [ ({'unit': True}, 'unit'), ({'keyword': True}, 'keyword'), ({'food': True}, 'food'), -], indirect=['found_recipe']) +], indirect=['found_recipe'], ids=str) def test_fuzzy_lookup(found_recipe, recipes, param_type, user1, space_1): with scope(space=space_1): list_url = f'api:{param_type}-list' @@ -306,14 +306,14 @@ def test_fuzzy_lookup(found_recipe, recipes, param_type, user1, space_1): ('istartswith', True), ('istartswith', False), ], [('unaccent', True), ('unaccent', False)] -), indirect=['user1']) +), indirect=['user1'], ids=str) @pytest.mark.parametrize("found_recipe", [ ({'name': True}), ({'description': True}), ({'instruction': True}), ({'keyword': True}), ({'food': True}), -], indirect=['found_recipe']) +], indirect=['found_recipe'], ids=str) # user array contains: user client, expected count of search, expected count of mispelled search, search string, mispelled search string, user search preferences def test_search_string(found_recipe, recipes, user1, space_1): with scope(space=space_1): diff --git a/cookbook/tests/other/test_url_import.py b/cookbook/tests/other/test_url_import.py index ae4677c0f..176ca5110 100644 --- a/cookbook/tests/other/test_url_import.py +++ b/cookbook/tests/other/test_url_import.py @@ -19,6 +19,23 @@ DATA_DIR = "cookbook/tests/other/test_data/" # plus the test that previously existed # plus the custom scraper that was created # plus any specific defects discovered along the way +RECIPES = [ + ALLRECIPES, + AMERICAS_TEST_KITCHEN, + CHEF_KOCH, + CHEF_KOCH2, # test for empty ingredient in ingredient_parser + COOKPAD, + COOKS_COUNTRY, + DELISH, + FOOD_NETWORK, + GIALLOZAFFERANO, + JOURNAL_DES_FEMMES, + MADAME_DESSERT, # example of json only source + MARMITON, + TASTE_OF_HOME, + THE_SPRUCE_EATS, # example of non-json recipes_scraper + TUDOGOSTOSO, +] @pytest.mark.parametrize("arg", [ @@ -32,29 +49,7 @@ def test_import_permission(arg, request): assert 
c.get(reverse(IMPORT_SOURCE_URL)).status_code == arg[1] -@pytest.mark.parametrize("arg", [ - ALLRECIPES, - # test of custom scraper ATK - AMERICAS_TEST_KITCHEN, - CHEF_KOCH, - # test for empty ingredient in ingredient_parser - CHEF_KOCH2, - COOKPAD, - # test of custom scraper ATK - COOKS_COUNTRY, - DELISH, - FOOD_NETWORK, - GIALLOZAFFERANO, - JOURNAL_DES_FEMMES, - # example of recipes_scraper in with wildmode - # example of json only source - MADAME_DESSERT, - MARMITON, - TASTE_OF_HOME, - # example of non-json recipes_scraper - THE_SPRUCE_EATS, # TODO seems to be broken in recipe scrapers - TUDOGOSTOSO, -]) +@pytest.mark.parametrize("arg", RECIPES, ids=[x['file'][0] for x in RECIPES]) def test_recipe_import(arg, u1_s1): url = arg['url'] for f in list(arg['file']): # url and files get popped later diff --git a/cookbook/views/api.py b/cookbook/views/api.py index 3c2d596b2..734e5333b 100644 --- a/cookbook/views/api.py +++ b/cookbook/views/api.py @@ -63,7 +63,6 @@ from cookbook.helper.permission_helper import ( ) from cookbook.helper.recipe_search import RecipeSearch from cookbook.helper.recipe_url_import import clean_dict, get_from_youtube_scraper, get_images_from_soup -from cookbook.helper.scrapers.scrapers import text_scraper from cookbook.helper.shopping_helper import RecipeShoppingEditor, shopping_helper from cookbook.models import (Automation, BookmarkletImport, CookLog, CustomFilter, ExportLog, Food, FoodInheritField, FoodProperty, ImportLog, Ingredient, InviteLink, @@ -1457,9 +1456,9 @@ class RecipeUrlImportView(APIView): data = "" except JSONDecodeError: pass - scrape = text_scraper(text=data, url=url) - if not url and (found_url := scrape.schema.data.get('url', None)): - scrape = text_scraper(text=data, url=found_url) + scrape = scrape_html(html=data, org_url=url, supported_only=False) + if not url and (found_url := scrape.schema.data.get('url', 'https://urlnotfound.none')): + scrape = scrape_html(html=data, org_url=found_url, supported_only=False) if scrape: 
return Response({