From cffa731106dafc84b273c5e75435b90db146a9a7 Mon Sep 17 00:00:00 2001 From: Andrew Jayne Date: Sun, 7 Aug 2022 20:51:45 +0100 Subject: [PATCH] fix: ingredient parsing for non-latin languages Before this change the ingredient string for non-latin languages was not being parsed into the correct amount or units when the food is found at the start of the ingredient string. This was because the regex being used was restricted to latin characters. With this change the amount and units are correctly parsed from such a string. Fixes https://github.com/TandoorRecipes/recipes/issues/1983 --- cookbook/helper/ingredient_parser.py | 4 ++-- cookbook/tests/other/test_ingredient_parser.py | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/cookbook/helper/ingredient_parser.py b/cookbook/helper/ingredient_parser.py index 8a4fc30ac..468f1128f 100644 --- a/cookbook/helper/ingredient_parser.py +++ b/cookbook/helper/ingredient_parser.py @@ -221,8 +221,8 @@ class IngredientParser: # some people/languages put amount and unit at the end of the ingredient string # if something like this is detected move it to the beginning so the parser can handle it - if len(ingredient) < 1000 and re.search(r'^([A-z])+(.)*[1-9](\d)*\s([A-z])+', ingredient): - match = re.search(r'[1-9](\d)*\s([A-z])+', ingredient) + if len(ingredient) < 1000 and re.search(r'^([^\W\d_])+(.)*[1-9](\d)*\s*([^\W\d_])+', ingredient): + match = re.search(r'[1-9](\d)*\s*([^\W\d_])+', ingredient) print(f'reording from {ingredient} to {ingredient[match.start():match.end()] + " " + ingredient.replace(ingredient[match.start():match.end()], "")}') ingredient = ingredient[match.start():match.end()] + ' ' + ingredient.replace(ingredient[match.start():match.end()], '') diff --git a/cookbook/tests/other/test_ingredient_parser.py b/cookbook/tests/other/test_ingredient_parser.py index 90d5f0b79..d61cbc693 100644 --- a/cookbook/tests/other/test_ingredient_parser.py +++ b/cookbook/tests/other/test_ingredient_parser.py @@ -66,7 +66,9 @@ def test_ingredient_parser(): 1.0, 'Lorem', 'ipsum', 'dolor sit amet consetetur sadipscing elitr sed diam nonumy eirmod tempor invidunt ut l Lorem ipsum dolor sit amet consetetur sadipscing elitr sed diam nonumy eirmod tempor invidunt ut l'), "1 LoremipsumdolorsitametconsetetursadipscingelitrseddiamnonumyeirmodtemporinviduntutlLoremipsumdolorsitametconsetetursadipscingelitrseddiamnonumyeirmodtemporinviduntutl": ( 1.0, None, 'LoremipsumdolorsitametconsetetursadipscingelitrseddiamnonumyeirmodtemporinviduntutlLoremipsumdolorsitametconsetetursadipscingeli', - 'LoremipsumdolorsitametconsetetursadipscingelitrseddiamnonumyeirmodtemporinviduntutlLoremipsumdolorsitametconsetetursadipscingelitrseddiamnonumyeirmodtemporinviduntutl') + 'LoremipsumdolorsitametconsetetursadipscingelitrseddiamnonumyeirmodtemporinviduntutlLoremipsumdolorsitametconsetetursadipscingelitrseddiamnonumyeirmodtemporinviduntutl'), + "砂糖 50g": (50, "g", "砂糖", ""), + "卵 4個": (4, "個", "卵", "") } # for German you could say that if an ingredient does not have