improved website parser

This commit is contained in:
vabene1111
2020-08-26 11:37:59 +02:00
parent 78be002134
commit 0b948618f3
2 changed files with 4 additions and 4 deletions

View File

@@ -18,7 +18,7 @@ def get_from_html(html_text, url):
# first try finding ld+json, as it's the most common format
for ld in soup.find_all('script', type='application/ld+json'):
try:
-ld_json = json.loads(ld.string)
+ld_json = json.loads(ld.string.replace('\n', ''))
if type(ld_json) != list:
ld_json = [ld_json]
@@ -31,8 +31,8 @@ def get_from_html(html_text, url):
if '@type' in ld_json_item and ld_json_item['@type'] == 'Recipe':
return find_recipe_json(ld_json_item, url)
-except JSONDecodeError:
-JsonResponse({'error': True, 'msg': _('The requested site provided malformed data and cannot be read.')}, status=400)
+except JSONDecodeError as e:
+return JsonResponse({'error': True, 'msg': _('The requested site provided malformed data and cannot be read.')}, status=400)
# now try to find microdata
items = microdata.get_items(html_text)

View File

@@ -12,7 +12,7 @@ class TestEditsRecipe(TestBase):
{'file': 'cookbook/tests/resources/websites/ld_json_2.html', 'result_length': 1450},
{'file': 'cookbook/tests/resources/websites/ld_json_3.html', 'result_length': 1545},
{'file': 'cookbook/tests/resources/websites/ld_json_4.html', 'result_length': 1657},
-{'file': 'cookbook/tests/resources/websites/ld_json_invalid.html', 'result_length': 115},
+{'file': 'cookbook/tests/resources/websites/ld_json_invalid.html', 'result_length': 88},
{'file': 'cookbook/tests/resources/websites/ld_json_itemList.html', 'result_length': 3131},
{'file': 'cookbook/tests/resources/websites/ld_json_multiple.html', 'result_length': 1546},
{'file': 'cookbook/tests/resources/websites/micro_data_1.html', 'result_length': 1022},