From be24e25ae46cecf1d7a5c47b4121b92ab55614b1 Mon Sep 17 00:00:00 2001 From: Marcus Wolschon Date: Fri, 13 Jan 2023 21:19:49 +0100 Subject: [PATCH 1/9] #1552 Import Recipes from Cookidoo --- cookbook/helper/scrapers/cookidoo.py | 67 ++++++++++++++++++++++++++++ cookbook/helper/scrapers/scrapers.py | 4 ++ requirements.txt | 5 ++- 3 files changed, 74 insertions(+), 2 deletions(-) create mode 100644 cookbook/helper/scrapers/cookidoo.py diff --git a/cookbook/helper/scrapers/cookidoo.py b/cookbook/helper/scrapers/cookidoo.py new file mode 100644 index 000000000..0906465e0 --- /dev/null +++ b/cookbook/helper/scrapers/cookidoo.py @@ -0,0 +1,67 @@ +from recipe_scrapers._abstract import AbstractScraper +from gettext import gettext as _ + + +class Cookidoo(AbstractScraper): + + def normalize_instruction(self, instruction): + if instruction is None: + return "" + # handle Thermomix-specific instructions that happen in nearly every receipe on Cookidoo + return instruction \ + .replace("", "**") \ + .replace("", "**") \ + .replace("", _('Linkslauf')) \ + .replace("", _('Kochlöffel')) \ + .replace("", _('Kneten')) \ + .replace("Andicken ", _('Andicken')) \ + .replace("Erwärmen ", _('Erwärmen')) \ + .replace("Fermentieren ", _('Fermentieren')) \ + .replace("Rühraufsatz einsetzen", "**Rühraufsatz einsetzen**") \ + .replace("Rühraufsatz entfernen", "**Rühraufsatz entfernen**") + + def instructions(self): + instructions = self.schema.data.get("recipeInstructions") or "" + + if isinstance(instructions, list): + instructions_gist = [] + step_number = 1 + for schema_instruction_item in instructions: + instructions_gist += self.extract_instructions_text(schema_instruction_item, "#", step_number) + step_number = step_number + 1 + + # join all steps into a recipe + return "".join(self.normalize_instruction(instruction) + for instruction in instructions_gist) + + return instructions + + def extract_instructions_text(self, schema_item, prefix, start_step_number): + step_number = start_step_number + step_format = "\n\n" + prefix + _("Step {}") + "\n\n{}" + section_format = "\n\n{}\n\n" + instructions_gist = [] + if type(schema_item) is str: + instructions_gist.append(step_format.format(step_number, schema_item)) + step_number = step_number + 1 + elif schema_item.get("@type") == "HowToStep": + if schema_item.get("name", False): + # some sites have duplicated name and text properties (1:1) + # others have name same as text but truncated to X chars. + # ignore name in these cases and add the name value only if it's different from the text + if not schema_item.get("text").startswith( + schema_item.get("name").rstrip(".") + ): + instructions_gist.append(step_format.format(step_number, schema_item.get("name"))) + instructions_gist.append(step_format.format(step_number, schema_item.get("text"))) + elif schema_item.get("@type") == "HowToSection": + section_name = schema_item.get("name") or schema_item.get("Name") or _("Instructions") + instructions_gist.append(section_format.format(section_name)) + step_number = 1 + for item in schema_item.get("itemListElement"): + instructions_gist += self.extract_instructions_text(item, "#" + prefix, step_number) + step_number = step_number + 1 + return instructions_gist + + def ingredients(self): + return self.schema.ingredients() diff --git a/cookbook/helper/scrapers/scrapers.py b/cookbook/helper/scrapers/scrapers.py index 7d6c08b15..01dfa374f 100644 --- a/cookbook/helper/scrapers/scrapers.py +++ b/cookbook/helper/scrapers/scrapers.py @@ -6,11 +6,15 @@ from recipe_scrapers._factory import SchemaScraperFactory from recipe_scrapers._schemaorg import SchemaOrg from .cooksillustrated import CooksIllustrated +from .cookidoo import Cookidoo CUSTOM_SCRAPERS = { CooksIllustrated.host(site="cooksillustrated"): CooksIllustrated, CooksIllustrated.host(site="americastestkitchen"): CooksIllustrated, CooksIllustrated.host(site="cookscountry"): CooksIllustrated, + "cookidoo.de": Cookidoo, + "cookidoo.at": Cookidoo, + "cookidoo.ch": Cookidoo, } SCRAPERS.update(CUSTOM_SCRAPERS) diff --git a/requirements.txt b/requirements.txt index cc565492c..bbd4df7a3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -40,8 +40,9 @@ django-storages==1.13.2 boto3==1.26.41 django-prometheus==2.2.0 django-hCaptcha==0.2.0 -python-ldap==3.4.3 -django-auth-ldap==4.1.0 +#python-ldap==3.4.3 +django-python3-ldap +#django-auth-ldap==4.1.0 pytest-factoryboy==2.5.0 pyppeteer==1.0.2 validators==0.20.0 From 33c634c0e20c752b11c73ac1133d2f9d1b08c8e0 Mon Sep 17 00:00:00 2001 From: Marcus Wolschon Date: Fri, 13 Jan 2023 21:22:17 +0100 Subject: [PATCH 2/9] #1552 Import Recipes from Cookidoo --- requirements.txt | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index bbd4df7a3..cc565492c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -40,9 +40,8 @@ django-storages==1.13.2 boto3==1.26.41 django-prometheus==2.2.0 django-hCaptcha==0.2.0 -#python-ldap==3.4.3 -django-python3-ldap -#django-auth-ldap==4.1.0 +python-ldap==3.4.3 +django-auth-ldap==4.1.0 pytest-factoryboy==2.5.0 pyppeteer==1.0.2 validators==0.20.0 From 77feb0db3a60902200f88f8966fe6fc42f8b17e3 Mon Sep 17 00:00:00 2001 From: Marcus Wolschon Date: Fri, 13 Jan 2023 21:31:49 +0100 Subject: [PATCH 3/9] #1552 Import Recipes from Cookidoo --- cookbook/helper/scrapers/cookidoo.py | 6 +++--- requirements.txt | 5 +++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/cookbook/helper/scrapers/cookidoo.py b/cookbook/helper/scrapers/cookidoo.py index 0906465e0..08cdc9072 100644 --- a/cookbook/helper/scrapers/cookidoo.py +++ b/cookbook/helper/scrapers/cookidoo.py @@ -45,16 +45,16 @@ class Cookidoo(AbstractScraper): instructions_gist.append(step_format.format(step_number, schema_item)) step_number = step_number + 1 elif schema_item.get("@type") == "HowToStep": + # steps make up simple recipes or a section of a more complex recipe if schema_item.get("name", False): - # some sites have duplicated name and text properties (1:1) - # others have name same as text but truncated to X chars. - # ignore name in these cases and add the name value only if it's different from the text + # name may be the text in full or truncated if not schema_item.get("text").startswith( schema_item.get("name").rstrip(".") ): instructions_gist.append(step_format.format(step_number, schema_item.get("name"))) instructions_gist.append(step_format.format(step_number, schema_item.get("text"))) elif schema_item.get("@type") == "HowToSection": + # complex recipes are made up of named sections that are made up of steps section_name = schema_item.get("name") or schema_item.get("Name") or _("Instructions") instructions_gist.append(section_format.format(section_name)) step_number = 1 diff --git a/requirements.txt b/requirements.txt index cc565492c..bbd4df7a3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -40,8 +40,9 @@ django-storages==1.13.2 boto3==1.26.41 django-prometheus==2.2.0 django-hCaptcha==0.2.0 -python-ldap==3.4.3 -django-auth-ldap==4.1.0 +#python-ldap==3.4.3 +django-python3-ldap +#django-auth-ldap==4.1.0 pytest-factoryboy==2.5.0 pyppeteer==1.0.2 validators==0.20.0 From a4bf967f6561d260ba454d4a3fe4cf91113a1ea6 Mon Sep 17 00:00:00 2001 From: Marcus Wolschon Date: Fri, 13 Jan 2023 21:33:01 +0100 Subject: [PATCH 4/9] #1552 Import Recipes from Cookidoo --- requirements.txt | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index bbd4df7a3..cc565492c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -40,9 +40,8 @@ django-storages==1.13.2 boto3==1.26.41 django-prometheus==2.2.0 django-hCaptcha==0.2.0 -#python-ldap==3.4.3 -django-python3-ldap -#django-auth-ldap==4.1.0 +python-ldap==3.4.3 +django-auth-ldap==4.1.0 pytest-factoryboy==2.5.0 pyppeteer==1.0.2 validators==0.20.0 From 5a0f07a6b2304c047931f3b67372aca04da7a503 Mon Sep 17 00:00:00 2001 From: Marcus Wolschon Date: Fri, 13 Jan 2023 22:21:31 +0100 Subject: [PATCH 5/9] handle steps --- cookbook/helper/scrapers/cookidoo.py | 31 ++++++++++++---------------- 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/cookbook/helper/scrapers/cookidoo.py b/cookbook/helper/scrapers/cookidoo.py index 08cdc9072..99f1ea084 100644 --- a/cookbook/helper/scrapers/cookidoo.py +++ b/cookbook/helper/scrapers/cookidoo.py @@ -25,25 +25,22 @@ class Cookidoo(AbstractScraper): if isinstance(instructions, list): instructions_gist = [] - step_number = 1 for schema_instruction_item in instructions: - instructions_gist += self.extract_instructions_text(schema_instruction_item, "#", step_number) - step_number = step_number + 1 + # combine lists of instructions per section into a flat list + instructions_gist += self.extract_instructions_text(schema_instruction_item, "",) - # join all steps into a recipe - return "".join(self.normalize_instruction(instruction) - for instruction in instructions_gist) + steps = [] + for instruction in instructions_gist: + steps.append(self.normalize_instruction(instruction)) + + return steps return instructions - def extract_instructions_text(self, schema_item, prefix, start_step_number): - step_number = start_step_number - step_format = "\n\n" + prefix + _("Step {}") + "\n\n{}" - section_format = "\n\n{}\n\n" + def extract_instructions_text(self, schema_item, prefix): instructions_gist = [] if type(schema_item) is str: - instructions_gist.append(step_format.format(step_number, schema_item)) - step_number = step_number + 1 + instructions_gist.append(prefix + schema_item) elif schema_item.get("@type") == "HowToStep": # steps make up simple recipes or a section of a more complex recipe if schema_item.get("name", False): @@ -51,16 +48,14 @@ class Cookidoo(AbstractScraper): if not schema_item.get("text").startswith( schema_item.get("name").rstrip(".") ): - instructions_gist.append(step_format.format(step_number, schema_item.get("name"))) - instructions_gist.append(step_format.format(step_number, schema_item.get("text"))) + instructions_gist.append(schema_item.get("name")) + instructions_gist.append(schema_item.get("text")) elif schema_item.get("@type") == "HowToSection": # complex recipes are made up of named sections that are made up of steps section_name = schema_item.get("name") or schema_item.get("Name") or _("Instructions") - instructions_gist.append(section_format.format(section_name)) - step_number = 1 + instructions_gist.append("**" + section_name + "**") for item in schema_item.get("itemListElement"): - instructions_gist += self.extract_instructions_text(item, "#" + prefix, step_number) - step_number = step_number + 1 + instructions_gist += self.extract_instructions_text(item, "#" + prefix) return instructions_gist def ingredients(self): From 54d0b70f01f925742df238a5b3a306811fffb6e3 Mon Sep 17 00:00:00 2001 From: Marcus Wolschon Date: Sun, 15 Jan 2023 12:55:03 +0100 Subject: [PATCH 6/9] #1552 use instructions_list() instead of instructions() and move Thermomix-Handling to clean_instruction_string() --- cookbook/helper/recipe_url_import.py | 18 +++++++- cookbook/helper/scrapers/cookidoo.py | 62 ---------------------------- cookbook/helper/scrapers/scrapers.py | 4 -- 3 files changed, 17 insertions(+), 67 deletions(-) delete mode 100644 cookbook/helper/scrapers/cookidoo.py diff --git a/cookbook/helper/recipe_url_import.py b/cookbook/helper/recipe_url_import.py index 3091023ad..7c94a43ef 100644 --- a/cookbook/helper/recipe_url_import.py +++ b/cookbook/helper/recipe_url_import.py @@ -138,7 +138,7 @@ def get_from_scraper(scrape, request): recipe_json['steps'] = [] try: - for i in parse_instructions(scrape.instructions()): + for i in parse_instructions(scrape.instructions_list()): recipe_json['steps'].append({'instruction': i, 'ingredients': [], }) except Exception: pass @@ -248,6 +248,22 @@ def clean_instruction_string(instruction): normalized_string = normalize_string(instruction) normalized_string = normalized_string.replace('\n', ' \n') normalized_string = normalized_string.replace(' \n \n', '\n\n') + + # handle unsupported, special UTF8 character in Thermomix-specific instructions, + # that happen in nearly every receipe on Cookidoo, Zaubertopf Club, Rezeptwelt + # and in thermomix-spefici recipes on many other sites + return normalized_string \ + .replace("", "**") \ + .replace("", "**") \ + .replace("", _('Linkslauf')) \ + .replace("", _('Kochlöffel')) \ + .replace("", _('Kneten')) \ + .replace("Andicken ", _('Andicken')) \ + .replace("Erwärmen ", _('Erwärmen')) \ + .replace("Fermentieren ", _('Fermentieren')) \ + .replace("Rühraufsatz einsetzen", "**Rühraufsatz einsetzen**") \ + .replace("Rühraufsatz entfernen", "**Rühraufsatz entfernen**") + return normalized_string diff --git a/cookbook/helper/scrapers/cookidoo.py b/cookbook/helper/scrapers/cookidoo.py deleted file mode 100644 index 99f1ea084..000000000 --- a/cookbook/helper/scrapers/cookidoo.py +++ /dev/null @@ -1,62 +0,0 @@ -from recipe_scrapers._abstract import AbstractScraper -from gettext import gettext as _ - - -class Cookidoo(AbstractScraper): - - def normalize_instruction(self, instruction): - if instruction is None: - return "" - # handle Thermomix-specific instructions that happen in nearly every receipe on Cookidoo - return instruction \ - .replace("", "**") \ - .replace("", "**") \ - .replace("", _('Linkslauf')) \ - .replace("", _('Kochlöffel')) \ - .replace("", _('Kneten')) \ - .replace("Andicken ", _('Andicken')) \ - .replace("Erwärmen ", _('Erwärmen')) \ - .replace("Fermentieren ", _('Fermentieren')) \ - .replace("Rühraufsatz einsetzen", "**Rühraufsatz einsetzen**") \ - .replace("Rühraufsatz entfernen", "**Rühraufsatz entfernen**") - - def instructions(self): - instructions = self.schema.data.get("recipeInstructions") or "" - - if isinstance(instructions, list): - instructions_gist = [] - for schema_instruction_item in instructions: - # combine lists of instructions per section into a flat list - instructions_gist += self.extract_instructions_text(schema_instruction_item, "",) - - steps = [] - for instruction in instructions_gist: - steps.append(self.normalize_instruction(instruction)) - - return steps - - return instructions - - def extract_instructions_text(self, schema_item, prefix): - instructions_gist = [] - if type(schema_item) is str: - instructions_gist.append(prefix + schema_item) - elif schema_item.get("@type") == "HowToStep": - # steps make up simple recipes or a section of a more complex recipe - if schema_item.get("name", False): - # name may be the text in full or truncated - if not schema_item.get("text").startswith( - schema_item.get("name").rstrip(".") - ): - instructions_gist.append(schema_item.get("name")) - instructions_gist.append(schema_item.get("text")) - elif schema_item.get("@type") == "HowToSection": - # complex recipes are made up of named sections that are made up of steps - section_name = schema_item.get("name") or schema_item.get("Name") or _("Instructions") - instructions_gist.append("**" + section_name + "**") - for item in schema_item.get("itemListElement"): - instructions_gist += self.extract_instructions_text(item, "#" + prefix) - return instructions_gist - - def ingredients(self): - return self.schema.ingredients() diff --git a/cookbook/helper/scrapers/scrapers.py b/cookbook/helper/scrapers/scrapers.py index 01dfa374f..7d6c08b15 100644 --- a/cookbook/helper/scrapers/scrapers.py +++ b/cookbook/helper/scrapers/scrapers.py @@ -6,15 +6,11 @@ from recipe_scrapers._factory import SchemaScraperFactory from recipe_scrapers._schemaorg import SchemaOrg from .cooksillustrated import CooksIllustrated -from .cookidoo import Cookidoo CUSTOM_SCRAPERS = { CooksIllustrated.host(site="cooksillustrated"): CooksIllustrated, CooksIllustrated.host(site="americastestkitchen"): CooksIllustrated, CooksIllustrated.host(site="cookscountry"): CooksIllustrated, - "cookidoo.de": Cookidoo, - "cookidoo.at": Cookidoo, - "cookidoo.ch": Cookidoo, } SCRAPERS.update(CUSTOM_SCRAPERS) From 4c71c5b088542cbe15238c663d25ebcff598ae0d Mon Sep 17 00:00:00 2001 From: Marcus Wolschon Date: Sun, 15 Jan 2023 13:03:51 +0100 Subject: [PATCH 7/9] fix typo --- cookbook/helper/recipe_url_import.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cookbook/helper/recipe_url_import.py b/cookbook/helper/recipe_url_import.py index 7c94a43ef..838c13e58 100644 --- a/cookbook/helper/recipe_url_import.py +++ b/cookbook/helper/recipe_url_import.py @@ -251,7 +251,7 @@ def clean_instruction_string(instruction): # handle unsupported, special UTF8 character in Thermomix-specific instructions, # that happen in nearly every receipe on Cookidoo, Zaubertopf Club, Rezeptwelt - # and in thermomix-spefici recipes on many other sites + # and in Thermomix-specific recipes on many other sites return normalized_string \ .replace("", "**") \ .replace("", "**") \ @@ -264,8 +264,6 @@ def clean_instruction_string(instruction): .replace("Rühraufsatz einsetzen", "**Rühraufsatz einsetzen**") \ .replace("Rühraufsatz entfernen", "**Rühraufsatz entfernen**") - return normalized_string - def parse_instructions(instructions): """ From f43ef3ad59e27fd0f1f4c66b6723afe66e8818ce Mon Sep 17 00:00:00 2001 From: Marcus Wolschon Date: Sun, 15 Jan 2023 13:04:29 +0100 Subject: [PATCH 8/9] another typo --- cookbook/helper/recipe_url_import.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cookbook/helper/recipe_url_import.py b/cookbook/helper/recipe_url_import.py index 838c13e58..9026c5580 100644 --- a/cookbook/helper/recipe_url_import.py +++ b/cookbook/helper/recipe_url_import.py @@ -250,7 +250,7 @@ def clean_instruction_string(instruction): normalized_string = normalized_string.replace(' \n \n', '\n\n') # handle unsupported, special UTF8 character in Thermomix-specific instructions, - # that happen in nearly every receipe on Cookidoo, Zaubertopf Club, Rezeptwelt + # that happen in nearly every recipe on Cookidoo, Zaubertopf Club, Rezeptwelt # and in Thermomix-specific recipes on many other sites return normalized_string \ .replace("", "**") \ From bce44866c23a624b6c34c9884da7834e1b2a4df5 Mon Sep 17 00:00:00 2001 From: Marcus Wolschon Date: Mon, 30 Jan 2023 16:49:38 +0100 Subject: [PATCH 9/9] add translations, go back to using scraper.instructions() --- cookbook/helper/recipe_url_import.py | 27 ++++++++++++---------- cookbook/locale/de/LC_MESSAGES/django.po | 29 ++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 12 deletions(-) diff --git a/cookbook/helper/recipe_url_import.py b/cookbook/helper/recipe_url_import.py index 9026c5580..02f930856 100644 --- a/cookbook/helper/recipe_url_import.py +++ b/cookbook/helper/recipe_url_import.py @@ -138,7 +138,7 @@ def get_from_scraper(scrape, request): recipe_json['steps'] = [] try: - for i in parse_instructions(scrape.instructions_list()): + for i in parse_instructions(scrape.instructions()): recipe_json['steps'].append({'instruction': i, 'ingredients': [], }) except Exception: pass @@ -245,7 +245,13 @@ def parse_description(description): def clean_instruction_string(instruction): - normalized_string = normalize_string(instruction) + # handle HTML tags that can be converted to markup + normalized_string = instruction \ + .replace("", "**") \ + .replace("", "**") \ + .replace("", "**") \ + .replace("", "**") + normalized_string = normalize_string(normalized_string) normalized_string = normalized_string.replace('\n', ' \n') normalized_string = normalized_string.replace(' \n \n', '\n\n') @@ -253,16 +259,13 @@ def clean_instruction_string(instruction): # that happen in nearly every recipe on Cookidoo, Zaubertopf Club, Rezeptwelt # and in Thermomix-specific recipes on many other sites return normalized_string \ - .replace("", "**") \ - .replace("", "**") \ - .replace("", _('Linkslauf')) \ - .replace("", _('Kochlöffel')) \ - .replace("", _('Kneten')) \ - .replace("Andicken ", _('Andicken')) \ - .replace("Erwärmen ", _('Erwärmen')) \ - .replace("Fermentieren ", _('Fermentieren')) \ - .replace("Rühraufsatz einsetzen", "**Rühraufsatz einsetzen**") \ - .replace("Rühraufsatz entfernen", "**Rühraufsatz entfernen**") + .replace("", _('reverse rotation')) \ + .replace("", _('careful rotation')) \ + .replace("", _('knead')) \ + .replace("Andicken ", _('thicken')) \ + .replace("Erwärmen ", _('warm up')) \ + .replace("Fermentieren ", _('ferment')) \ + .replace("Sous-vide ", _("sous-vide")) def parse_instructions(instructions): diff --git a/cookbook/locale/de/LC_MESSAGES/django.po b/cookbook/locale/de/LC_MESSAGES/django.po index 4eab5983f..d7737b367 100644 --- a/cookbook/locale/de/LC_MESSAGES/django.po +++ b/cookbook/locale/de/LC_MESSAGES/django.po @@ -542,6 +542,35 @@ msgstr "Du hast mehr Benutzer in Deinem Space als erlaubt." msgid "One of queryset or hash_key must be provided" msgstr "Es muss die Abfrage oder der Hash_Key angeben werden" +#: .\cookbook\helper\recipe_url_import.py:265 +msgid "reverse rotation" +msgstr "Linkslauf" + +#: .\cookbook\helper\recipe_url_import.py:266 +msgid "careful rotation" +msgstr "Kochlöffel" + +#: .\cookbook\helper\recipe_url_import.py:267 +msgid "knead" +msgstr "Kneten" + +#: .\cookbook\helper\recipe_url_import.py:268 +msgid "thicken" +msgstr "Andicken" + +#: .\cookbook\helper\recipe_url_import.py:269 +msgid "warm up" +msgstr "Erwärmen" + +#: .\cookbook\helper\recipe_url_import.py:270 +msgid "ferment" +msgstr "Fermentieren" + +#: .\cookbook\helper\recipe_url_import.py:271 +msgid "sous-vide" +msgstr "Sous-vide" + + #: .\cookbook\helper\shopping_helper.py:152 msgid "You must supply a servings size" msgstr "Sie müssen eine Portionsgröße angeben"