mirror of
https://github.com/TandoorRecipes/recipes.git
synced 2025-12-24 02:39:20 -05:00
improved website parser
This commit is contained in:
@@ -18,7 +18,7 @@ def get_from_html(html_text, url):
|
||||
# first try finding ld+json as it's the most common
|
||||
for ld in soup.find_all('script', type='application/ld+json'):
|
||||
try:
|
||||
ld_json = json.loads(ld.string)
|
||||
ld_json = json.loads(ld.string.replace('\n', ''))
|
||||
if type(ld_json) != list:
|
||||
ld_json = [ld_json]
|
||||
|
||||
@@ -31,8 +31,8 @@ def get_from_html(html_text, url):
|
||||
|
||||
if '@type' in ld_json_item and ld_json_item['@type'] == 'Recipe':
|
||||
return find_recipe_json(ld_json_item, url)
|
||||
except JSONDecodeError:
|
||||
JsonResponse({'error': True, 'msg': _('The requested site provided malformed data and cannot be read.')}, status=400)
|
||||
except JSONDecodeError as e:
|
||||
return JsonResponse({'error': True, 'msg': _('The requested site provided malformed data and cannot be read.')}, status=400)
|
||||
|
||||
# now try to find microdata
|
||||
items = microdata.get_items(html_text)
|
||||
|
||||
@@ -12,7 +12,7 @@ class TestEditsRecipe(TestBase):
|
||||
{'file': 'cookbook/tests/resources/websites/ld_json_2.html', 'result_length': 1450},
|
||||
{'file': 'cookbook/tests/resources/websites/ld_json_3.html', 'result_length': 1545},
|
||||
{'file': 'cookbook/tests/resources/websites/ld_json_4.html', 'result_length': 1657},
|
||||
{'file': 'cookbook/tests/resources/websites/ld_json_invalid.html', 'result_length': 115},
|
||||
{'file': 'cookbook/tests/resources/websites/ld_json_invalid.html', 'result_length': 88},
|
||||
{'file': 'cookbook/tests/resources/websites/ld_json_itemList.html', 'result_length': 3131},
|
||||
{'file': 'cookbook/tests/resources/websites/ld_json_multiple.html', 'result_length': 1546},
|
||||
{'file': 'cookbook/tests/resources/websites/micro_data_1.html', 'result_length': 1022},
|
||||
|
||||
Reference in New Issue
Block a user