helper/recipe_url_import

This commit is contained in:
Tobias Lindenberg
2021-01-10 13:57:06 +01:00
parent 986bda0c81
commit 1ad468e652

View File

@@ -1,18 +1,16 @@
import json import json
import random import random
import re import re
import unicodedata
from json import JSONDecodeError from json import JSONDecodeError
import microdata import microdata
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from cookbook.helper.ingredient_parser import parse as parse_ingredient
from cookbook.models import Keyword
from django.http import JsonResponse from django.http import JsonResponse
from django.utils.dateparse import parse_duration from django.utils.dateparse import parse_duration
from django.utils.translation import gettext as _ from django.utils.translation import gettext as _
from cookbook.models import Keyword
from cookbook.helper.ingredient_parser import parse as parse_ingredient
def get_from_html(html_text, url): def get_from_html(html_text, url):
soup = BeautifulSoup(html_text, "html.parser") soup = BeautifulSoup(html_text, "html.parser")
@@ -31,10 +29,16 @@ def get_from_html(html_text, url):
if '@type' in x and x['@type'] == 'Recipe': if '@type' in x and x['@type'] == 'Recipe':
ld_json_item = x ld_json_item = x
if '@type' in ld_json_item and ld_json_item['@type'] == 'Recipe': if ('@type' in ld_json_item
and ld_json_item['@type'] == 'Recipe'):
return find_recipe_json(ld_json_item, url) return find_recipe_json(ld_json_item, url)
except JSONDecodeError as e: except JSONDecodeError:
return JsonResponse({'error': True, 'msg': _('The requested site provided malformed data and cannot be read.')}, status=400) return JsonResponse(
{
'error': True,
'msg': _('The requested site provided malformed data and cannot be read.') # noqa: E501
},
status=400)
# now try to find microdata # now try to find microdata
items = microdata.get_items(html_text) items = microdata.get_items(html_text)
@@ -43,14 +47,19 @@ def get_from_html(html_text, url):
if 'schema.org/Recipe' in str(md_json['type']): if 'schema.org/Recipe' in str(md_json['type']):
return find_recipe_json(md_json['properties'], url) return find_recipe_json(md_json['properties'], url)
return JsonResponse({'error': True, 'msg': _('The requested site does not provide any recognized data format to import the recipe from.')}, status=400) return JsonResponse(
{
'error': True,
'msg': _('The requested site does not provide any recognized data format to import the recipe from.') # noqa: E501
},
status=400)
def find_recipe_json(ld_json, url): def find_recipe_json(ld_json, url):
if type(ld_json['name']) == list: if type(ld_json['name']) == list:
try: try:
ld_json['name'] = ld_json['name'][0] ld_json['name'] = ld_json['name'][0]
except: except Exception:
ld_json['name'] = 'ERROR' ld_json['name'] = 'ERROR'
# some sites use ingredients instead of recipeIngredients # some sites use ingredients instead of recipeIngredients
@@ -59,8 +68,9 @@ def find_recipe_json(ld_json, url):
if 'recipeIngredient' in ld_json: if 'recipeIngredient' in ld_json:
# some pages have comma separated ingredients in a single array entry # some pages have comma separated ingredients in a single array entry
if len(ld_json['recipeIngredient']) == 1 and len(ld_json['recipeIngredient'][0]) > 30: if (len(ld_json['recipeIngredient']) == 1
ld_json['recipeIngredient'] = ld_json['recipeIngredient'][0].split(',') and len(ld_json['recipeIngredient'][0]) > 30):
ld_json['recipeIngredient'] = ld_json['recipeIngredient'][0].split(',') # noqa: E501
for x in ld_json['recipeIngredient']: for x in ld_json['recipeIngredient']:
if '\n' in x: if '\n' in x:
@@ -71,13 +81,41 @@ def find_recipe_json(ld_json, url):
ingredients = [] ingredients = []
for x in ld_json['recipeIngredient']: for x in ld_json['recipeIngredient']:
if x.replace(' ','') != '': if x.replace(' ', '') != '':
try: try:
amount, unit, ingredient, note = parse_ingredient(x) amount, unit, ingredient, note = parse_ingredient(x)
if ingredient: if ingredient:
ingredients.append({'amount': amount, 'unit': {'text': unit, 'id': random.randrange(10000, 99999)}, 'ingredient': {'text': ingredient, 'id': random.randrange(10000, 99999)}, "note": note, 'original': x}) ingredients.append(
except: {
ingredients.append({'amount': 0, 'unit': {'text': "", 'id': random.randrange(10000, 99999)}, 'ingredient': {'text': x, 'id': random.randrange(10000, 99999)}, "note": "", 'original': x}) 'amount': amount,
'unit': {
'text': unit,
'id': random.randrange(10000, 99999)
},
'ingredient': {
'text': ingredient,
'id': random.randrange(10000, 99999)
},
'note': note,
'original': x
}
)
except Exception:
ingredients.append(
{
'amount': 0,
'unit': {
'text': '',
'id': random.randrange(10000, 99999)
},
'ingredient': {
'text': x,
'id': random.randrange(10000, 99999)
},
'note': '',
'original': x
}
)
ld_json['recipeIngredient'] = ingredients ld_json['recipeIngredient'] = ingredients
else: else:
@@ -91,7 +129,9 @@ def find_recipe_json(ld_json, url):
ld_json['keywords'] = ld_json['keywords'].split(',') ld_json['keywords'] = ld_json['keywords'].split(',')
# keywords as string in list # keywords as string in list
if type(ld_json['keywords']) == list and len(ld_json['keywords']) == 1 and ',' in ld_json['keywords'][0]: if (type(ld_json['keywords']) == list
and len(ld_json['keywords']) == 1
and ',' in ld_json['keywords'][0]):
ld_json['keywords'] = ld_json['keywords'][0].split(',') ld_json['keywords'] = ld_json['keywords'][0].split(',')
# keywords as list # keywords as list
@@ -126,10 +166,10 @@ def find_recipe_json(ld_json, url):
instructions += str(i) instructions += str(i)
ld_json['recipeInstructions'] = instructions ld_json['recipeInstructions'] = instructions
ld_json['recipeInstructions'] = re.sub(r'\n\s*\n', '\n\n', ld_json['recipeInstructions']) ld_json['recipeInstructions'] = re.sub(r'\n\s*\n', '\n\n', ld_json['recipeInstructions']) # noqa: E501
ld_json['recipeInstructions'] = re.sub(' +', ' ', ld_json['recipeInstructions']) ld_json['recipeInstructions'] = re.sub(' +', ' ', ld_json['recipeInstructions']) # noqa: E501
ld_json['recipeInstructions'] = ld_json['recipeInstructions'].replace('<p>', '') ld_json['recipeInstructions'] = ld_json['recipeInstructions'].replace('<p>', '') # noqa: E501
ld_json['recipeInstructions'] = ld_json['recipeInstructions'].replace('</p>', '') ld_json['recipeInstructions'] = ld_json['recipeInstructions'].replace('</p>', '') # noqa: E501
else: else:
ld_json['recipeInstructions'] = '' ld_json['recipeInstructions'] = ''
@@ -149,9 +189,14 @@ def find_recipe_json(ld_json, url):
if 'cookTime' in ld_json: if 'cookTime' in ld_json:
try: try:
if type(ld_json['cookTime']) == list and len(ld_json['cookTime']) > 0: if (type(ld_json['cookTime']) == list
and len(ld_json['cookTime']) > 0):
ld_json['cookTime'] = ld_json['cookTime'][0] ld_json['cookTime'] = ld_json['cookTime'][0]
ld_json['cookTime'] = round(parse_duration(ld_json['cookTime']).seconds / 60) ld_json['cookTime'] = round(
parse_duration(
ld_json['cookTime']
).seconds / 60
)
except TypeError: except TypeError:
ld_json['cookTime'] = 0 ld_json['cookTime'] = 0
else: else:
@@ -159,16 +204,24 @@ def find_recipe_json(ld_json, url):
if 'prepTime' in ld_json: if 'prepTime' in ld_json:
try: try:
if type(ld_json['prepTime']) == list and len(ld_json['prepTime']) > 0: if (type(ld_json['prepTime']) == list
and len(ld_json['prepTime']) > 0):
ld_json['prepTime'] = ld_json['prepTime'][0] ld_json['prepTime'] = ld_json['prepTime'][0]
ld_json['prepTime'] = round(parse_duration(ld_json['prepTime']).seconds / 60) ld_json['prepTime'] = round(
parse_duration(
ld_json['prepTime']
).seconds / 60
)
except TypeError: except TypeError:
ld_json['prepTime'] = 0 ld_json['prepTime'] = 0
else: else:
ld_json['prepTime'] = 0 ld_json['prepTime'] = 0
for key in list(ld_json): for key in list(ld_json):
if key not in ['prepTime', 'cookTime', 'image', 'recipeInstructions', 'keywords', 'name', 'recipeIngredient']: if key not in [
'prepTime', 'cookTime', 'image', 'recipeInstructions',
'keywords', 'name', 'recipeIngredient'
]:
ld_json.pop(key, None) ld_json.pop(key, None)
return JsonResponse(ld_json) return JsonResponse(ld_json)