Merge branch 'develop' of https://github.com/TandoorRecipes/recipes into develop

Kaibu
2022-05-11 17:10:18 +02:00
28 changed files with 71 additions and 50 deletions

View File

@@ -9,6 +9,8 @@ from recipe_scrapers._utils import get_host_name, normalize_string
 from cookbook.helper import recipe_url_import as helper
 from cookbook.helper.scrapers.scrapers import text_scraper
+from recipe_scrapers import scrape_me
+from recipe_scrapers._exceptions import NoSchemaFoundInWildMode
 
 def get_recipe_from_source(text, url, request):
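
Both added imports come from the recipe_scrapers package this module already depends on. A minimal sketch of the wild-mode call they enable, with a hypothetical helper name (wild_mode asks recipe_scrapers to fall back to generic schema.org parsing when no site-specific scraper matches the URL):

    from recipe_scrapers import scrape_me
    from recipe_scrapers._exceptions import NoSchemaFoundInWildMode

    def try_wild_scrape(url):
        # Returns a scraper on success, None when the page exposes no
        # recognizable schema.org recipe markup.
        try:
            return scrape_me(url_path=url, wild_mode=True)
        except NoSchemaFoundInWildMode:
            return None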
@@ -63,34 +65,41 @@ def get_recipe_from_source(text, url, request):
     html_data = []
     images = []
     text = unquote(text)
-    try:
-        parse_list.append(remove_graph(json.loads(text)))
-        if not url and 'url' in parse_list[0]:
-            url = parse_list[0]['url']
-        scrape = text_scraper("<script type='application/ld+json'>" + text + "</script>", url=url)
-    except JSONDecodeError:
-        soup = BeautifulSoup(text, "html.parser")
-        html_data = get_from_html(soup)
-        images += get_images_from_source(soup, url)
-        for el in soup.find_all('script', type='application/ld+json'):
-            el = remove_graph(el)
-            if not url and 'url' in el:
-                url = el['url']
-            if type(el) == list:
-                for le in el:
-                    parse_list.append(le)
-            elif type(el) == dict:
-                parse_list.append(el)
-        for el in soup.find_all(type='application/json'):
-            el = remove_graph(el)
-            if type(el) == list:
-                for le in el:
-                    parse_list.append(le)
-            elif type(el) == dict:
-                parse_list.append(el)
-        scrape = text_scraper(text, url=url)
+    scrape = None
+    if url:
+        try:
+            scrape = scrape_me(url_path=url, wild_mode=True)
+        except(NoSchemaFoundInWildMode):
+            pass
+    if not scrape:
+        try:
+            parse_list.append(remove_graph(json.loads(text)))
+            if not url and 'url' in parse_list[0]:
+                url = parse_list[0]['url']
+            scrape = text_scraper("<script type='application/ld+json'>" + text + "</script>", url=url)
+        except JSONDecodeError:
+            soup = BeautifulSoup(text, "html.parser")
+            html_data = get_from_html(soup)
+            images += get_images_from_source(soup, url)
+            for el in soup.find_all('script', type='application/ld+json'):
+                el = remove_graph(el)
+                if not url and 'url' in el:
+                    url = el['url']
+                if type(el) == list:
+                    for le in el:
+                        parse_list.append(le)
+                elif type(el) == dict:
+                    parse_list.append(el)
+            for el in soup.find_all(type='application/json'):
+                el = remove_graph(el)
+                if type(el) == list:
+                    for le in el:
+                        parse_list.append(le)
+                elif type(el) == dict:
+                    parse_list.append(el)
+            scrape = text_scraper(text, url=url)
 
     recipe_json = helper.get_from_scraper(scrape, request)
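
The JSONDecodeError branch treats the input as HTML and harvests every application/ld+json block it can find. The extraction step in isolation, with made-up markup:

    import json
    from bs4 import BeautifulSoup

    html = '<script type="application/ld+json">{"@type": "Recipe", "name": "Soup"}</script>'
    soup = BeautifulSoup(html, "html.parser")
    for el in soup.find_all('script', type='application/ld+json'):
        data = json.loads(el.string)  # each matching tag wraps standalone JSON
        print(data['name'])           # -> Soup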

View File

@@ -114,7 +114,14 @@ def get_from_scraper(scrape, request):
     except Exception:
         pass
-    if source_url := scrape.url:
+    try:
+        source_url = scrape.canonical_url()
+    except Exception:
+        try:
+            source_url = scrape.url
+        except Exception:
+            pass
+    if source_url:
         recipe_json['source_url'] = source_url
         try:
             keywords.append(source_url.replace('http://', '').replace('https://', '').split('/')[0])
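
canonical_url() comes from the scraper's schema data and can raise on pages without it, hence the nested fallbacks before the plain truthiness check. The same preference order as a standalone helper (pick_source_url is a hypothetical name):

    def pick_source_url(scrape):
        # Prefer the declared canonical URL, fall back to the fetch URL,
        # and give up with None so callers can test truthiness safely.
        try:
            return scrape.canonical_url()
        except Exception:
            try:
                return scrape.url
            except Exception:
                return None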
@@ -129,9 +136,11 @@ def get_from_scraper(scrape, request):
     ingredient_parser = IngredientParser(request, True)
 
     recipe_json['steps'] = []
-    for i in parse_instructions(scrape.instructions()):
-        recipe_json['steps'].append({'instruction': i, 'ingredients': [], })
+    try:
+        for i in parse_instructions(scrape.instructions()):
+            recipe_json['steps'].append({'instruction': i, 'ingredients': [], })
+    except Exception:
+        pass
 
     if len(recipe_json['steps']) == 0:
         recipe_json['steps'].append({'instruction': '', 'ingredients': [], })
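
Wrapping the loop means a scraper whose instructions() call fails no longer aborts the whole import; the length check after it then guarantees at least one (empty) step. A standalone illustration with a stubbed-out scraper call (parse_instructions is omitted here for brevity):

    recipe_json = {'steps': []}

    def instructions():
        raise NotImplementedError  # stand-in for a scraper missing instructions

    try:
        for i in instructions():
            recipe_json['steps'].append({'instruction': i, 'ingredients': []})
    except Exception:
        pass

    if len(recipe_json['steps']) == 0:
        recipe_json['steps'].append({'instruction': '', 'ingredients': []})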

View File

@@ -1,6 +1,6 @@
 from bs4 import BeautifulSoup
 from json import JSONDecodeError
-from recipe_scrapers import SCRAPERS, get_host_name
+from recipe_scrapers import SCRAPERS
 from recipe_scrapers._factory import SchemaScraperFactory
 from recipe_scrapers._schemaorg import SchemaOrg
@@ -15,13 +15,7 @@ SCRAPERS.update(CUSTOM_SCRAPERS)
 def text_scraper(text, url=None):
-    domain = None
-    if url:
-        domain = get_host_name(url)
-    if domain in SCRAPERS:
-        scraper_class = SCRAPERS[domain]
-    else:
-        scraper_class = SchemaScraperFactory.SchemaScraper
+    scraper_class = SchemaScraperFactory.SchemaScraper
 
     class TextScraper(scraper_class):
         def __init__(
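
With the domain lookup gone, text_scraper always builds TextScraper on the generic schema.org scraper rather than a site-specific class, so raw text with embedded ld+json is parsed the same way regardless of origin. A reduced, hypothetical sketch of that subclassing pattern (MinimalTextScraper and its body are illustrative, not the real TextScraper):

    from bs4 import BeautifulSoup
    from recipe_scrapers._factory import SchemaScraperFactory

    class MinimalTextScraper(SchemaScraperFactory.SchemaScraper):
        # Skip the parent's HTTP fetch and parse supplied markup directly;
        # the attribute names below are assumptions for illustration.
        def __init__(self, page_data, url=None):
            self.soup = BeautifulSoup(page_data, "html.parser")
            self.url = url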

View File

@@ -27,7 +27,7 @@ class Paprika(Integration):
         recipe.description = '' if len(recipe_json['description'].strip()) > 500 else recipe_json['description'].strip()
 
         try:
-            if 'servings' in recipe_json['servings']:
+            if 'servings' in recipe_json:
                 recipe.servings = parse_servings(recipe_json['servings'])
                 recipe.servings_text = parse_servings_text(recipe_json['servings'])
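
The changed condition is a genuine bug fix, not a style tweak: the old form tested for the substring 'servings' inside the value recipe_json['servings'], which raises KeyError when the key is absent and is False for plain values like '4'. A standalone illustration:

    recipe_json = {'servings': '4'}

    assert 'servings' in recipe_json                  # membership test on the keys
    assert 'servings' not in recipe_json['servings']  # substring test on the value
    # With the key absent, the old form raises KeyError instead.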

View File

@@ -78,7 +78,11 @@ class Plantoeat(Integration):
         current_recipe = ''
 
         for fl in file.readlines():
-            line = fl.decode("windows-1250")
+            try:
+                line = fl.decode("utf-8")
+            except UnicodeDecodeError:
+                line = fl.decode("windows-1250")
 
             if line.startswith('--------------'):
                 if current_recipe != '':
                     recipe_list.append(current_recipe)
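
Decoding every line as windows-1250 silently mangles UTF-8 exports, so the fix tries UTF-8 first and only falls back for legacy files. A self-contained demonstration with made-up sample bytes:

    raw = 'Gulasz z papryką'.encode('windows-1250')  # legacy-encoded export line

    try:
        line = raw.decode('utf-8')        # 0xb9 is invalid UTF-8, so this raises
    except UnicodeDecodeError:
        line = raw.decode('windows-1250')

    print(line)  # -> Gulasz z papryką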

Binary file not shown.

Binary file not shown.