This commit is contained in:
smilerz
2024-07-09 08:01:39 -05:00
parent c4ff29beda
commit a02582e9f8
7 changed files with 28 additions and 145 deletions

View File

@@ -1,68 +0,0 @@
import json
from recipe_scrapers._abstract import AbstractScraper
class CooksIllustrated(AbstractScraper):
@classmethod
def host(cls, site='cooksillustrated'):
return {
'cooksillustrated': f"{site}.com",
'americastestkitchen': f"{site}.com",
'cookscountry': f"{site}.com",
}.get(site)
def title(self):
return self.schema.title()
def image(self):
return self.schema.image()
def total_time(self):
if not self.recipe:
self.get_recipe()
return self.recipe['recipeTimeNote']
def yields(self):
if not self.recipe:
self.get_recipe()
return self.recipe['yields']
def ingredients(self):
if not self.recipe:
self.get_recipe()
ingredients = []
for group in self.recipe['ingredientGroups']:
ingredients += group['fields']['recipeIngredientItems']
return [
"{} {} {}{}".format(
i['fields']['qty'] or '',
i['fields']['measurement'] or '',
i['fields']['ingredient']['fields']['title'] or '',
i['fields']['postText'] or ''
)
for i in ingredients
]
def instructions(self):
if not self.recipe:
self.get_recipe()
if self.recipe.get('headnote', False):
i = ['Note: ' + self.recipe.get('headnote', '')]
else:
i = []
return "\n".join(
i
+ [self.recipe.get('whyThisWorks', '')]
+ [
instruction['fields']['content']
for instruction in self.recipe['instructions']
]
)
def nutrients(self):
raise NotImplementedError("This should be implemented.")
def get_recipe(self):
j = json.loads(self.soup.find(type='application/json').string)
name = list(j['props']['initialState']['content']['documents'])[0]
self.recipe = j['props']['initialState']['content']['documents'][name]

View File

@@ -1,43 +0,0 @@
from json import JSONDecodeError
from bs4 import BeautifulSoup
from recipe_scrapers import SCRAPERS, get_host_name
from recipe_scrapers._factory import SchemaScraperFactory
from recipe_scrapers._schemaorg import SchemaOrg
from .cooksillustrated import CooksIllustrated
CUSTOM_SCRAPERS = {
CooksIllustrated.host(site="cooksillustrated"): CooksIllustrated,
CooksIllustrated.host(site="americastestkitchen"): CooksIllustrated,
CooksIllustrated.host(site="cookscountry"): CooksIllustrated,
}
SCRAPERS.update(CUSTOM_SCRAPERS)
def text_scraper(text, url=None):
domain = None
if url:
domain = get_host_name(url)
if domain in SCRAPERS:
scraper_class = SCRAPERS[domain]
else:
scraper_class = SchemaScraperFactory.SchemaScraper
class TextScraper(scraper_class):
def __init__(
self,
html=None,
url=None,
):
self.supported_only = False
self.meta_http_equiv = False
self.soup = BeautifulSoup(html, "html.parser")
self.url = url
self.recipe = None
try:
self.schema = SchemaOrg(html)
except (JSONDecodeError, AttributeError):
pass
return TextScraper(url=url, html=text)

View File

@@ -7,7 +7,7 @@ import validators
from cookbook.helper.ingredient_parser import IngredientParser
from cookbook.helper.recipe_url_import import (get_from_scraper, get_images_from_soup,
iso_duration_to_minutes)
from cookbook.helper.scrapers.scrapers import text_scraper
from recipe_scrapers import scrape_html
from cookbook.integration.integration import Integration
from cookbook.models import Ingredient, Recipe, Step
@@ -20,7 +20,7 @@ class CookBookApp(Integration):
def get_recipe_from_file(self, file):
recipe_html = file.getvalue().decode("utf-8")
scrape = text_scraper(text=recipe_html)
scrape = scrape_html(html=recipe_html, org_url="https://cookbookapp.import", supported_only=False)
recipe_json = get_from_scraper(scrape, self.request)
images = list(dict.fromkeys(get_images_from_soup(scrape.soup, None)))

View File

@@ -51,7 +51,7 @@ def test_list_space(obj_1, obj_2, u1_s1, u1_s2, space_2):
['g1_s2', 403],
['u1_s2', 404],
['a1_s2', 404],
])
], ids=str)
def test_update(arg, request, obj_1):
c = request.getfixturevalue(arg[0])
r = c.patch(

View File

@@ -273,12 +273,12 @@ def test_search_units(found_recipe, recipes, u1_s1, space_1):
('fuzzy_lookups', True), ('fuzzy_lookups', False)
],
[('unaccent', True), ('unaccent', False)]
), indirect=['user1'])
), indirect=['user1'], ids=str)
@pytest.mark.parametrize("found_recipe, param_type", [
({'unit': True}, 'unit'),
({'keyword': True}, 'keyword'),
({'food': True}, 'food'),
], indirect=['found_recipe'])
], indirect=['found_recipe'], ids=str)
def test_fuzzy_lookup(found_recipe, recipes, param_type, user1, space_1):
with scope(space=space_1):
list_url = f'api:{param_type}-list'
@@ -306,14 +306,14 @@ def test_fuzzy_lookup(found_recipe, recipes, param_type, user1, space_1):
('istartswith', True), ('istartswith', False),
],
[('unaccent', True), ('unaccent', False)]
), indirect=['user1'])
), indirect=['user1'], ids=str)
@pytest.mark.parametrize("found_recipe", [
({'name': True}),
({'description': True}),
({'instruction': True}),
({'keyword': True}),
({'food': True}),
], indirect=['found_recipe'])
], indirect=['found_recipe'], ids=str)
# user array contains: user client, expected count of search, expected count of mispelled search, search string, mispelled search string, user search preferences
def test_search_string(found_recipe, recipes, user1, space_1):
with scope(space=space_1):

View File

@@ -19,6 +19,23 @@ DATA_DIR = "cookbook/tests/other/test_data/"
# plus the test that previously existed
# plus the custom scraper that was created
# plus any specific defects discovered along the way
RECIPES = [
ALLRECIPES,
AMERICAS_TEST_KITCHEN,
CHEF_KOCH,
CHEF_KOCH2, # test for empty ingredient in ingredient_parser
COOKPAD,
COOKS_COUNTRY,
DELISH,
FOOD_NETWORK,
GIALLOZAFFERANO,
JOURNAL_DES_FEMMES,
MADAME_DESSERT, # example of json only source
MARMITON,
TASTE_OF_HOME,
THE_SPRUCE_EATS, # example of non-json recipes_scraper
TUDOGOSTOSO,
]
@pytest.mark.parametrize("arg", [
@@ -32,29 +49,7 @@ def test_import_permission(arg, request):
assert c.get(reverse(IMPORT_SOURCE_URL)).status_code == arg[1]
@pytest.mark.parametrize("arg", [
ALLRECIPES,
# test of custom scraper ATK
AMERICAS_TEST_KITCHEN,
CHEF_KOCH,
# test for empty ingredient in ingredient_parser
CHEF_KOCH2,
COOKPAD,
# test of custom scraper ATK
COOKS_COUNTRY,
DELISH,
FOOD_NETWORK,
GIALLOZAFFERANO,
JOURNAL_DES_FEMMES,
# example of recipes_scraper in with wildmode
# example of json only source
MADAME_DESSERT,
MARMITON,
TASTE_OF_HOME,
# example of non-json recipes_scraper
THE_SPRUCE_EATS, # TODO seems to be broken in recipe scrapers
TUDOGOSTOSO,
])
@pytest.mark.parametrize("arg", RECIPES, ids=[x['file'][0] for x in RECIPES])
def test_recipe_import(arg, u1_s1):
url = arg['url']
for f in list(arg['file']): # url and files get popped later

View File

@@ -63,7 +63,6 @@ from cookbook.helper.permission_helper import (
)
from cookbook.helper.recipe_search import RecipeSearch
from cookbook.helper.recipe_url_import import clean_dict, get_from_youtube_scraper, get_images_from_soup
from cookbook.helper.scrapers.scrapers import text_scraper
from cookbook.helper.shopping_helper import RecipeShoppingEditor, shopping_helper
from cookbook.models import (Automation, BookmarkletImport, CookLog, CustomFilter, ExportLog, Food,
FoodInheritField, FoodProperty, ImportLog, Ingredient, InviteLink,
@@ -1457,9 +1456,9 @@ class RecipeUrlImportView(APIView):
data = "<script type='application/ld+json'>" + json.dumps(data_json) + "</script>"
except JSONDecodeError:
pass
scrape = text_scraper(text=data, url=url)
if not url and (found_url := scrape.schema.data.get('url', None)):
scrape = text_scraper(text=data, url=found_url)
scrape = scrape_html(html=data, org_url=url, supported_only=False)
if not url and (found_url := scrape.schema.data.get('url', 'https://urlnotfound.none')):
scrape = scrape_html(text=data, url=found_url, supported_only=False)
if scrape:
return Response({