mirror of https://github.com/TandoorRecipes/recipes.git (synced 2026-01-03 21:37:49 -05:00)
@@ -1,68 +0,0 @@
-import json
-from recipe_scrapers._abstract import AbstractScraper
-
-
-class CooksIllustrated(AbstractScraper):
-    @classmethod
-    def host(cls, site='cooksillustrated'):
-        return {
-            'cooksillustrated': f"{site}.com",
-            'americastestkitchen': f"{site}.com",
-            'cookscountry': f"{site}.com",
-        }.get(site)
-
-    def title(self):
-        return self.schema.title()
-
-    def image(self):
-        return self.schema.image()
-
-    def total_time(self):
-        if not self.recipe:
-            self.get_recipe()
-        return self.recipe['recipeTimeNote']
-
-    def yields(self):
-        if not self.recipe:
-            self.get_recipe()
-        return self.recipe['yields']
-
-    def ingredients(self):
-        if not self.recipe:
-            self.get_recipe()
-        ingredients = []
-        for group in self.recipe['ingredientGroups']:
-            ingredients += group['fields']['recipeIngredientItems']
-        return [
-            "{} {} {}{}".format(
-                i['fields']['qty'] or '',
-                i['fields']['measurement'] or '',
-                i['fields']['ingredient']['fields']['title'] or '',
-                i['fields']['postText'] or ''
-            )
-            for i in ingredients
-        ]
-
-    def instructions(self):
-        if not self.recipe:
-            self.get_recipe()
-        if self.recipe.get('headnote', False):
-            i = ['Note: ' + self.recipe.get('headnote', '')]
-        else:
-            i = []
-        return "\n".join(
-            i
-            + [self.recipe.get('whyThisWorks', '')]
-            + [
-                instruction['fields']['content']
-                for instruction in self.recipe['instructions']
-            ]
-        )
-
-    def nutrients(self):
-        raise NotImplementedError("This should be implemented.")
-
-    def get_recipe(self):
-        j = json.loads(self.soup.find(type='application/json').string)
-        name = list(j['props']['initialState']['content']['documents'])[0]
-        self.recipe = j['props']['initialState']['content']['documents'][name]
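For orientation: this deleted custom scraper never reads schema.org data for the recipe body. Its get_recipe() loads the Next.js state blob that these sites embed in a <script type="application/json"> tag and takes the first entry under documents. A minimal sketch of that lookup against a hypothetical inline page (the HTML and key names here are illustrative, modeled on the code above):

import json

from bs4 import BeautifulSoup

# Hypothetical page fragment mimicking the embedded Next.js state.
html = (
    '<script type="application/json">'
    '{"props": {"initialState": {"content": {"documents": {'
    '"recipe_123": {"yields": "Serves 4", "recipeTimeNote": "45 minutes"}'
    '}}}}}'
    '</script>'
)

soup = BeautifulSoup(html, "html.parser")
blob = json.loads(soup.find(type="application/json").string)
documents = blob["props"]["initialState"]["content"]["documents"]
recipe = documents[list(documents)[0]]  # take the first (and only) document
print(recipe["yields"], "/", recipe["recipeTimeNote"])  # Serves 4 / 45 minutes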
@@ -1,43 +0,0 @@
-from json import JSONDecodeError
-
-from bs4 import BeautifulSoup
-from recipe_scrapers import SCRAPERS, get_host_name
-from recipe_scrapers._factory import SchemaScraperFactory
-from recipe_scrapers._schemaorg import SchemaOrg
-
-from .cooksillustrated import CooksIllustrated
-
-CUSTOM_SCRAPERS = {
-    CooksIllustrated.host(site="cooksillustrated"): CooksIllustrated,
-    CooksIllustrated.host(site="americastestkitchen"): CooksIllustrated,
-    CooksIllustrated.host(site="cookscountry"): CooksIllustrated,
-}
-SCRAPERS.update(CUSTOM_SCRAPERS)
-
-
-def text_scraper(text, url=None):
-    domain = None
-    if url:
-        domain = get_host_name(url)
-    if domain in SCRAPERS:
-        scraper_class = SCRAPERS[domain]
-    else:
-        scraper_class = SchemaScraperFactory.SchemaScraper
-
-    class TextScraper(scraper_class):
-        def __init__(
-            self,
-            html=None,
-            url=None,
-        ):
-            self.supported_only = False
-            self.meta_http_equiv = False
-            self.soup = BeautifulSoup(html, "html.parser")
-            self.url = url
-            self.recipe = None
-            try:
-                self.schema = SchemaOrg(html)
-            except (JSONDecodeError, AttributeError):
-                pass
-
-    return TextScraper(url=url, html=text)
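This deleted factory built a throwaway TextScraper subclass: a site-specific scraper (via the SCRAPERS registry, including the custom CooksIllustrated entries above) when the URL's host was known, otherwise the generic schema.org scraper. That is what let Tandoor parse already-downloaded HTML without a network round trip. A hypothetical usage sketch, assuming this module's imports and that SchemaOrg picks up the inline JSON-LD:

# Hypothetical snippet of pre-fetched HTML carrying schema.org JSON-LD.
html = (
    '<script type="application/ld+json">'
    '{"@context": "https://schema.org", "@type": "Recipe", "name": "Pancakes"}'
    '</script>'
)

# No registered host for example.org, so TextScraper subclasses the
# generic SchemaScraperFactory.SchemaScraper and parses schema.org only.
scraper = text_scraper(text=html, url="https://example.org/pancakes")
print(scraper.schema.data.get("name"))  # should print "Pancakes"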
@@ -7,7 +7,7 @@ import validators
 from cookbook.helper.ingredient_parser import IngredientParser
 from cookbook.helper.recipe_url_import import (get_from_scraper, get_images_from_soup,
                                                iso_duration_to_minutes)
-from cookbook.helper.scrapers.scrapers import text_scraper
+from recipe_scrapers import scrape_html
 from cookbook.integration.integration import Integration
 from cookbook.models import Ingredient, Recipe, Step
@@ -20,7 +20,7 @@ class CookBookApp(Integration):
     def get_recipe_from_file(self, file):
         recipe_html = file.getvalue().decode("utf-8")

-        scrape = text_scraper(text=recipe_html)
+        scrape = scrape_html(html=recipe_html, org_url="https://cookbookapp.import", supported_only=False)
         recipe_json = get_from_scraper(scrape, self.request)
         images = list(dict.fromkeys(get_images_from_soup(scrape.soup, None)))

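This is the pattern the rest of the changeset follows: recipe_scrapers.scrape_html() replaces the deleted text_scraper factory. It still parses offline HTML, but it expects an origin URL, hence the placeholder https://cookbookapp.import, and supported_only=False keeps the old fallback to generic schema.org parsing for unknown hosts. A sketch of the call on its own, with a hypothetical file name standing in for the exported CookBookApp file:

from recipe_scrapers import scrape_html

# "recipe.html" is a hypothetical stand-in for the uploaded export file.
with open("recipe.html", encoding="utf-8") as f:
    scrape = scrape_html(html=f.read(), org_url="https://cookbookapp.import",
                         supported_only=False)
print(scrape.title())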
@@ -51,7 +51,7 @@ def test_list_space(obj_1, obj_2, u1_s1, u1_s2, space_2):
     ['g1_s2', 403],
     ['u1_s2', 404],
     ['a1_s2', 404],
-])
+], ids=str)
 def test_update(arg, request, obj_1):
     c = request.getfixturevalue(arg[0])
     r = c.patch(
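The ids=str additions in this and the following hunks are cosmetic but useful: pytest derives each parametrized test's id from str(param) instead of an opaque arg0, arg1, ... counter, so failures name the actual case. A self-contained sketch of the effect:

import pytest

@pytest.mark.parametrize("arg", [['u1_s2', 404], ['a1_s2', 404]], ids=str)
def test_ids_demo(arg):
    # Collected as test_ids_demo[['u1_s2', 404]] and test_ids_demo[['a1_s2', 404]]
    # instead of test_ids_demo[arg0] and test_ids_demo[arg1].
    assert arg[1] == 404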
@@ -273,12 +273,12 @@ def test_search_units(found_recipe, recipes, u1_s1, space_1):
     ('fuzzy_lookups', True), ('fuzzy_lookups', False)
     ],
     [('unaccent', True), ('unaccent', False)]
-), indirect=['user1'])
+), indirect=['user1'], ids=str)
 @pytest.mark.parametrize("found_recipe, param_type", [
     ({'unit': True}, 'unit'),
     ({'keyword': True}, 'keyword'),
     ({'food': True}, 'food'),
-], indirect=['found_recipe'])
+], indirect=['found_recipe'], ids=str)
 def test_fuzzy_lookup(found_recipe, recipes, param_type, user1, space_1):
     with scope(space=space_1):
         list_url = f'api:{param_type}-list'
@@ -306,14 +306,14 @@ def test_fuzzy_lookup(found_recipe, recipes, param_type, user1, space_1):
     ('istartswith', True), ('istartswith', False),
     ],
     [('unaccent', True), ('unaccent', False)]
-), indirect=['user1'])
+), indirect=['user1'], ids=str)
 @pytest.mark.parametrize("found_recipe", [
     ({'name': True}),
     ({'description': True}),
     ({'instruction': True}),
     ({'keyword': True}),
     ({'food': True}),
-], indirect=['found_recipe'])
+], indirect=['found_recipe'], ids=str)
 # user array contains: user client, expected count of search, expected count of misspelled search, search string, misspelled search string, user search preferences
 def test_search_string(found_recipe, recipes, user1, space_1):
     with scope(space=space_1):
@@ -19,6 +19,23 @@ DATA_DIR = "cookbook/tests/other/test_data/"
 # plus the test that previously existed
 # plus the custom scraper that was created
 # plus any specific defects discovered along the way
+RECIPES = [
+    ALLRECIPES,
+    AMERICAS_TEST_KITCHEN,
+    CHEF_KOCH,
+    CHEF_KOCH2,  # test for empty ingredient in ingredient_parser
+    COOKPAD,
+    COOKS_COUNTRY,
+    DELISH,
+    FOOD_NETWORK,
+    GIALLOZAFFERANO,
+    JOURNAL_DES_FEMMES,
+    MADAME_DESSERT,  # example of json only source
+    MARMITON,
+    TASTE_OF_HOME,
+    THE_SPRUCE_EATS,  # example of non-json recipes_scraper
+    TUDOGOSTOSO,
+]


 @pytest.mark.parametrize("arg", [
@@ -32,29 +49,7 @@ def test_import_permission(arg, request):
     assert c.get(reverse(IMPORT_SOURCE_URL)).status_code == arg[1]


-@pytest.mark.parametrize("arg", [
-    ALLRECIPES,
-    # test of custom scraper ATK
-    AMERICAS_TEST_KITCHEN,
-    CHEF_KOCH,
-    # test for empty ingredient in ingredient_parser
-    CHEF_KOCH2,
-    COOKPAD,
-    # test of custom scraper ATK
-    COOKS_COUNTRY,
-    DELISH,
-    FOOD_NETWORK,
-    GIALLOZAFFERANO,
-    JOURNAL_DES_FEMMES,
-    # example of recipes_scraper in with wildmode
-    # example of json only source
-    MADAME_DESSERT,
-    MARMITON,
-    TASTE_OF_HOME,
-    # example of non-json recipes_scraper
-    THE_SPRUCE_EATS,  # TODO seems to be broken in recipe scrapers
-    TUDOGOSTOSO,
-])
+@pytest.mark.parametrize("arg", RECIPES, ids=[x['file'][0] for x in RECIPES])
 def test_recipe_import(arg, u1_s1):
     url = arg['url']
     for f in list(arg['file']):  # url and files get popped later
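The duplicated inline list is gone: RECIPES (defined in the previous hunk) is reused, and ids=[x['file'][0] for x in RECIPES] names each test case after the entry's first test-data file. A sketch of the shape this assumes, with hypothetical entries since the real constants live in the test-data module; the dict keys match the arg['url'] and arg['file'] accesses in the test body above:

# Each RECIPES entry is a dict with at least 'url' and 'file' keys
# (values here are hypothetical).
RECIPES = [
    {"url": "https://www.allrecipes.com/recipe/1/", "file": ["allrecipes.html"]},
    {"url": "https://www.delish.com/recipe/2/", "file": ["delish.html"]},
]
print([x["file"][0] for x in RECIPES])  # ['allrecipes.html', 'delish.html']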
@@ -63,7 +63,6 @@ from cookbook.helper.permission_helper import (
 )
 from cookbook.helper.recipe_search import RecipeSearch
 from cookbook.helper.recipe_url_import import clean_dict, get_from_youtube_scraper, get_images_from_soup
-from cookbook.helper.scrapers.scrapers import text_scraper
 from cookbook.helper.shopping_helper import RecipeShoppingEditor, shopping_helper
 from cookbook.models import (Automation, BookmarkletImport, CookLog, CustomFilter, ExportLog, Food,
                              FoodInheritField, FoodProperty, ImportLog, Ingredient, InviteLink,
@@ -1437,7 +1436,10 @@ class RecipeUrlImportView(APIView):
         else:
             try:
                 if validators.url(url, public=True):
-                    html = requests.get(url).content
+                    html = requests.get(
+                        url,
+                        headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0"}
+                    ).content
                     scrape = scrape_html(org_url=url, html=html, supported_only=False)
                 else:
                     return Response({'error': True, 'msg': _('Invalid Url')}, status=status.HTTP_400_BAD_REQUEST)
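The explicit User-Agent is presumably there because some recipe sites reject the default python-requests/x.y identifier; the request now presents itself as a desktop Firefox build. An equivalent standalone sketch, with a hypothetical URL:

import requests

UA = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) "
      "Gecko/20100101 Firefox/86.0")
html = requests.get("https://example.org/recipe", headers={"User-Agent": UA}).content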
@@ -1457,9 +1459,9 @@
                 data = "<script type='application/ld+json'>" + json.dumps(data_json) + "</script>"
             except JSONDecodeError:
                 pass
-            scrape = text_scraper(text=data, url=url)
-            if not url and (found_url := scrape.schema.data.get('url', None)):
-                scrape = text_scraper(text=data, url=found_url)
+            scrape = scrape_html(html=data, org_url=url, supported_only=False)
+            if not url and (found_url := scrape.schema.data.get('url', 'https://urlnotfound.none')):
+                scrape = scrape_html(html=data, org_url=found_url, supported_only=False)

         if scrape:
             return Response({
@@ -30,7 +30,7 @@ Jinja2==3.1.4
 django-webpack-loader==3.0.1
 git+https://github.com/BITSOLVER/django-js-reverse@071e304fd600107bc64bbde6f2491f1fe049ec82
 django-allauth==0.61.1
-recipe-scrapers==15.0.0-rc3
+recipe-scrapers==15.0.0
 django-scopes==2.0.0
 django-treebeard==4.7
 django-cors-headers==4.3.1