Merge pull request #277 from l0c4lh057/master

Improve text to ingredient parsing
This commit is contained in:
vabene1111
2020-12-26 13:52:24 +01:00
committed by GitHub
2 changed files with 138 additions and 33 deletions

View File

@@ -0,0 +1,131 @@
import unicodedata
import string
def parse_fraction(x):
    """Convert a textual fraction into a float.

    Two spellings are understood:

    * a single Unicode vulgar-fraction character (for example U+00BD,
      "one half", or U+2152, "one tenth");
    * an ASCII fraction written as ``numerator/denominator``, e.g. ``3/4``.

    :param x: the string to interpret
    :return: the value of the fraction as a float
    :raises ValueError: if ``x`` is not a fraction in one of the two forms
        above, if a part of it is not an integer, or if the denominator
        is zero
    """
    # unicodedata.decomposition() only accepts a single character
    decomposition = unicodedata.decomposition(x) if len(x) == 1 else ''
    if 'fraction' in decomposition:
        # The decomposition looks like '<fraction> 0031 2044 0032': a tag
        # followed by hexadecimal code points. Decode all of them instead
        # of reading fixed positions, so that a multi-digit denominator
        # (U+2152 is 1/10) is read completely and a lone numerator
        # (U+215F is just '1' plus the slash) ends up as a ValueError
        # rather than an IndexError.
        code_points = decomposition.split()[1:]
        decoded = ''.join(chr(int(code_point, 16)) for code_point in code_points)
        frac_split = decoded.split('\u2044')  # U+2044 FRACTION SLASH
    else:
        frac_split = x.split('/')

    if len(frac_split) != 2:
        raise ValueError('not a fraction: {!r}'.format(x))
    try:
        # int() raises ValueError itself for empty or non-numeric parts
        return int(frac_split[0]) / int(frac_split[1])
    except ZeroDivisionError:
        raise ValueError('fraction with zero denominator: {!r}'.format(x)) from None
def parse_amount(x):
    """Split a single token such as ``'2.5kg'`` into amount and unit.

    The token must start with either a number or a single fraction
    character. A number may use ``.`` or ``,`` as decimal separator or be
    an ASCII fraction such as ``1/2``; it may be followed directly by one
    fraction character whose value is added to the amount. Whatever
    remains after the amount is returned as the unit.

    :param x: the token to parse (no surrounding whitespace expected)
    :return: tuple ``(amount, unit)`` with ``amount`` as float and ``unit``
        as a possibly empty string
    :raises ValueError: if ``x`` is empty or does not start with an amount
    """
    if not x:
        raise ValueError('cannot parse an amount from an empty string')

    # Find where the leading number stops. A separator ('.', ',' or '/')
    # only belongs to the number when a digit follows it, so a trailing
    # '.' or ',' is left for the unit.
    end = 0
    while end < len(x) and (
        x[end] in string.digits
        or (x[end] in '.,/' and end + 1 < len(x) and x[end + 1] in string.digits)
    ):
        end += 1

    if end > 0:
        number = x[:end]
        if '/' in number:
            # ASCII fraction such as '1/2'; without this the amount would
            # be read as 1 and '/2' would end up in the unit
            amount = parse_fraction(number)
        else:
            amount = float(number.replace(',', '.'))
        may_have_trailing_fraction = True
    else:
        # no leading digits: the first character has to be a fraction
        amount = parse_fraction(x[0])
        end = 1
        may_have_trailing_fraction = False

    unit = ''
    if end < len(x):
        if may_have_trailing_fraction:
            try:
                # a fraction character right after the number is added to it
                amount += parse_fraction(x[end])
                end += 1
            except ValueError:
                # not a fraction, so the unit starts here
                pass
        unit = x[end:]
    return amount, unit
def parse_ingredient_with_comma(tokens):
    """Split tokens into ingredient and note at the first trailing comma.

    Everything up to and including the first token that ends with a comma
    forms the ingredient (with that comma removed); the remaining tokens
    form the note. Without such a token, all tokens are the ingredient
    and the note is empty.

    :param tokens: list of words
    :return: tuple ``(ingredient, note)`` of space-joined strings
    """
    # index of the first word that ends in a comma, if there is one
    comma_index = next(
        (index for index, word in enumerate(tokens) if word.endswith(',')),
        None,
    )
    if comma_index is None:
        return ' '.join(tokens), ''

    head = ' '.join(tokens[:comma_index + 1])
    tail = ' '.join(tokens[comma_index + 1:])
    # drop the comma that terminates the ingredient part
    return head[:-1], tail
def parse_ingredient(tokens):
    """Split ingredient tokens into the ingredient name and a note.

    A bracketed group at the end of the list is taken as the note, e.g.
    ``['flour', '(sifted)']`` gives ``('flour', 'sifted')``. In all other
    cases the split is delegated to ``parse_ingredient_with_comma``.

    :param tokens: list of words
    :return: tuple ``(ingredient, note)``; both empty for an empty list
    :raises ValueError: if the last token ends with ``)`` and the search
        for a token starting with ``(`` reaches the first token, i.e. the
        group spans the whole list or no opening token exists. Callers
        that pass a slice of a longer list can treat this as a sign that
        the slice was cut in the wrong place.
    """
    if not tokens:
        return '', ''

    last = tokens[-1]
    if not last.endswith(')'):
        return parse_ingredient_with_comma(tokens)
    if '(' in last and not last.startswith('('):
        # the bracket opens and closes inside the last word, as in
        # 'sugar(brown)', so it does not delimit a trailing note
        return parse_ingredient_with_comma(tokens)

    # walk backwards to the token that opens the bracket group
    start = len(tokens) - 1
    while start > 0 and not tokens[start].startswith('('):
        start -= 1
    if start == 0:
        # either the group starts at the first token or no token opens it;
        # both leave nothing usable as ingredient
        raise ValueError('no ingredient in front of the bracketed note')

    # strip the enclosing brackets from the note
    note = ' '.join(tokens[start:])[1:-1]
    ingredient = ' '.join(tokens[:start])
    return ingredient, note
def parse(x):
    """Parse one line of ingredient text.

    The line is split on whitespace. The first token is tried as amount
    (optionally with the unit attached, e.g. ``200g``), the second as an
    additional fraction (``2 1/2``), the following token as unit, and the
    rest as ingredient plus note. Whenever one of these assumptions fails,
    the tokens involved are treated as part of the ingredient instead.

    Examples: ``'2 1/2 cups flour'`` gives ``(2.5, 'cups', 'flour', '')``
    and ``'200g sugar'`` gives ``(200.0, 'g', 'sugar', '')``.

    :param x: the ingredient text
    :return: tuple ``(amount, unit, ingredient, note)``; ``amount`` is 0
        when no amount was found, the strings are stripped and may be
        empty. Empty or whitespace-only input yields ``(0, '', '', '')``.
    :raises ValueError: if even the last fallback, parsing all tokens as
        ingredient, is rejected by ``parse_ingredient``
    """
    # default values
    amount = 0
    unit = ''
    ingredient = ''
    note = ''

    tokens = x.split()
    if not tokens:
        # nothing to parse; avoids an IndexError on tokens[0] below
        return amount, unit, ingredient, note

    if len(tokens) == 1:
        # a single word can only be the ingredient
        ingredient = tokens[0]
    else:
        try:
            # the first token has to be the amount
            amount, unit = parse_amount(tokens[0])
            if len(tokens) > 2:
                try:
                    if unit != '':
                        # The unit was attached to the amount, so the
                        # second token cannot be a fraction. Raising here
                        # jumps to the handler below, which contains
                        # exactly the logic needed for that case.
                        raise ValueError
                    # second token as fraction, for '2 1/2' style amounts
                    amount += parse_fraction(tokens[1])
                    # a unit is assumed never to end with a comma
                    if len(tokens) > 3 and not tokens[2].endswith(','):
                        try:
                            # third token as unit, the rest as ingredient
                            ingredient, note = parse_ingredient(tokens[3:])
                            unit = tokens[2]
                        except ValueError:
                            # that split failed: no unit after all
                            ingredient, note = parse_ingredient(tokens[2:])
                    else:
                        ingredient, note = parse_ingredient(tokens[2:])
                except ValueError:
                    # second token is not part of the amount
                    if not tokens[1].endswith(','):
                        try:
                            # second token as unit, the rest as ingredient
                            ingredient, note = parse_ingredient(tokens[2:])
                            unit = tokens[1]
                        except ValueError:
                            # that split failed: no unit after all
                            ingredient, note = parse_ingredient(tokens[1:])
                    else:
                        ingredient, note = parse_ingredient(tokens[1:])
            else:
                # exactly two tokens: amount followed by the ingredient
                ingredient = tokens[1]
        except ValueError:
            # no usable amount (or no usable split after it): take all
            # tokens as ingredient
            ingredient, note = parse_ingredient(tokens)
    return amount, unit.strip(), ingredient.strip(), note.strip()

View File

@@ -11,6 +11,7 @@ from django.utils.dateparse import parse_duration
from django.utils.translation import gettext as _
from cookbook.models import Keyword
from cookbook.helper.ingredient_parser import parse as parse_ingredient
def get_from_html(html_text, url):
@@ -70,39 +71,12 @@ def find_recipe_json(ld_json, url):
ingredients = []
for x in ld_json['recipeIngredient']:
ingredient_split = x.split()
ingredient = None
amount = 0
unit = ''
if len(ingredient_split) > 2:
ingredient = " ".join(ingredient_split[2:])
unit = ingredient_split[1]
try:
if 'fraction' in unicodedata.decomposition(ingredient_split[0]):
frac_split = unicodedata.decomposition(ingredient_split[0]).split()
amount = round(float((frac_split[1]).replace('003', '')) / float((frac_split[3]).replace('003', '')), 3)
else:
raise TypeError
except TypeError: # raised by unicodedata.decomposition if there was no unicode character in parsed data
try:
amount = float(ingredient_split[0].replace(',', '.'))
except ValueError:
amount = 0
ingredient = " ".join(ingredient_split)
if len(ingredient_split) == 2:
ingredient = " ".join(ingredient_split[1:])
unit = ''
try:
amount = float(ingredient_split[0].replace(',', '.'))
except ValueError:
amount = 0
ingredient = " ".join(ingredient_split)
if len(ingredient_split) == 1:
ingredient = " ".join(ingredient_split)
if ingredient:
ingredients.append({'amount': amount, 'unit': {'text': unit, 'id': random.randrange(10000, 99999)}, 'ingredient': {'text': ingredient, 'id': random.randrange(10000, 99999)}, 'original': x})
try:
amount, unit, ingredient, note = parse_ingredient(x)
if ingredient:
ingredients.append({'amount': amount, 'unit': {'text': unit, 'id': random.randrange(10000, 99999)}, 'ingredient': {'text': ingredient, 'id': random.randrange(10000, 99999)}, "note": note, 'original': x})
except:
pass
ld_json['recipeIngredient'] = ingredients
else: