Merge pull request #1917 from smilerz/bookmarklet_fix

Bookmarklet fix
vabene1111 authored 2022-07-11 14:28:08 +02:00, committed by GitHub
8 changed files with 327 additions and 271 deletions

View File

@@ -1,189 +1,191 @@
-import json
-import re
-from json import JSONDecodeError
-from urllib.parse import unquote
+# import json
+# import re
+# from json import JSONDecodeError
+# from urllib.parse import unquote
-from bs4 import BeautifulSoup
-from bs4.element import Tag
-from recipe_scrapers import scrape_html, scrape_me
-from recipe_scrapers._exceptions import NoSchemaFoundInWildMode
-from recipe_scrapers._utils import get_host_name, normalize_string
+# from bs4 import BeautifulSoup
+# from bs4.element import Tag
+# from recipe_scrapers import scrape_html, scrape_me
+# from recipe_scrapers._exceptions import NoSchemaFoundInWildMode
+# from recipe_scrapers._utils import get_host_name, normalize_string
-from cookbook.helper import recipe_url_import as helper
-from cookbook.helper.scrapers.scrapers import text_scraper
+# from cookbook.helper import recipe_url_import as helper
+# from cookbook.helper.scrapers.scrapers import text_scraper
-def get_recipe_from_source(text, url, request):
-    def build_node(k, v):
-        if isinstance(v, dict):
-            node = {
-                'name': k,
-                'value': k,
-                'children': get_children_dict(v)
-            }
-        elif isinstance(v, list):
-            node = {
-                'name': k,
-                'value': k,
-                'children': get_children_list(v)
-            }
-        else:
-            node = {
-                'name': k + ": " + normalize_string(str(v)),
-                'value': normalize_string(str(v))
-            }
-        return node
+# def get_recipe_from_source(text, url, request):
+#     def build_node(k, v):
+#         if isinstance(v, dict):
+#             node = {
+#                 'name': k,
+#                 'value': k,
+#                 'children': get_children_dict(v)
+#             }
+#         elif isinstance(v, list):
+#             node = {
+#                 'name': k,
+#                 'value': k,
+#                 'children': get_children_list(v)
+#             }
+#         else:
+#             node = {
+#                 'name': k + ": " + normalize_string(str(v)),
+#                 'value': normalize_string(str(v))
+#             }
+#         return node
-    def get_children_dict(children):
-        kid_list = []
-        for k, v in children.items():
-            kid_list.append(build_node(k, v))
-        return kid_list
+#     def get_children_dict(children):
+#         kid_list = []
+#         for k, v in children.items():
+#             kid_list.append(build_node(k, v))
+#         return kid_list
-    def get_children_list(children):
-        kid_list = []
-        for kid in children:
-            if type(kid) == list:
-                node = {
-                    'name': "unknown list",
-                    'value': "unknown list",
-                    'children': get_children_list(kid)
-                }
-                kid_list.append(node)
-            elif type(kid) == dict:
-                for k, v in kid.items():
-                    kid_list.append(build_node(k, v))
-            else:
-                kid_list.append({
-                    'name': normalize_string(str(kid)),
-                    'value': normalize_string(str(kid))
-                })
-        return kid_list
+#     def get_children_list(children):
+#         kid_list = []
+#         for kid in children:
+#             if type(kid) == list:
+#                 node = {
+#                     'name': "unknown list",
+#                     'value': "unknown list",
+#                     'children': get_children_list(kid)
+#                 }
+#                 kid_list.append(node)
+#             elif type(kid) == dict:
+#                 for k, v in kid.items():
+#                     kid_list.append(build_node(k, v))
+#             else:
+#                 kid_list.append({
+#                     'name': normalize_string(str(kid)),
+#                     'value': normalize_string(str(kid))
+#                 })
+#         return kid_list
-    recipe_tree = []
-    parse_list = []
-    soup = BeautifulSoup(text, "html.parser")
-    html_data = get_from_html(soup)
-    images = get_images_from_source(soup, url)
-    text = unquote(text)
-    scrape = None
+#     recipe_tree = []
+#     parse_list = []
+#     soup = BeautifulSoup(text, "html.parser")
+#     html_data = get_from_html(soup)
+#     images = get_images_from_source(soup, url)
+#     text = unquote(text)
+#     scrape = None
-    if url:
-        try:
-            scrape = scrape_me(url_path=url, wild_mode=True)
-        except(NoSchemaFoundInWildMode):
-            pass
-    if not scrape:
-        try:
-            parse_list.append(remove_graph(json.loads(text)))
-            if not url and 'url' in parse_list[0]:
-                url = parse_list[0]['url']
-            scrape = text_scraper("<script type='application/ld+json'>" + text + "</script>", url=url)
+#     if url and not text:
+#         try:
+#             scrape = scrape_me(url_path=url, wild_mode=True)
+#         except(NoSchemaFoundInWildMode):
+#             pass
-        except JSONDecodeError:
-            for el in soup.find_all('script', type='application/ld+json'):
-                el = remove_graph(el)
-                if not url and 'url' in el:
-                    url = el['url']
-                if type(el) == list:
-                    for le in el:
-                        parse_list.append(le)
-                elif type(el) == dict:
-                    parse_list.append(el)
-            for el in soup.find_all(type='application/json'):
-                el = remove_graph(el)
-                if type(el) == list:
-                    for le in el:
-                        parse_list.append(le)
-                elif type(el) == dict:
-                    parse_list.append(el)
-            scrape = text_scraper(text, url=url)
+#     if not scrape:
+#         try:
+#             parse_list.append(remove_graph(json.loads(text)))
+#             if not url and 'url' in parse_list[0]:
+#                 url = parse_list[0]['url']
+#             scrape = text_scraper("<script type='application/ld+json'>" + text + "</script>", url=url)
-    recipe_json = helper.get_from_scraper(scrape, request)
+#         except JSONDecodeError:
+#             for el in soup.find_all('script', type='application/ld+json'):
+#                 el = remove_graph(el)
+#                 if not url and 'url' in el:
+#                     url = el['url']
+#                 if type(el) == list:
+#                     for le in el:
+#                         parse_list.append(le)
+#                 elif type(el) == dict:
+#                     parse_list.append(el)
+#             for el in soup.find_all(type='application/json'):
+#                 el = remove_graph(el)
+#                 if type(el) == list:
+#                     for le in el:
+#                         parse_list.append(le)
+#                 elif type(el) == dict:
+#                     parse_list.append(el)
+#             scrape = text_scraper(text, url=url)
-    for el in parse_list:
-        temp_tree = []
-        if isinstance(el, Tag):
-            try:
-                el = json.loads(el.string)
-            except TypeError:
-                continue
+#     recipe_json = helper.get_from_scraper(scrape, request)
-        for k, v in el.items():
-            if isinstance(v, dict):
-                node = {
-                    'name': k,
-                    'value': k,
-                    'children': get_children_dict(v)
-                }
-            elif isinstance(v, list):
-                node = {
-                    'name': k,
-                    'value': k,
-                    'children': get_children_list(v)
-                }
-            else:
-                node = {
-                    'name': k + ": " + normalize_string(str(v)),
-                    'value': normalize_string(str(v))
-                }
-            temp_tree.append(node)
+#     # TODO: DEPRECATE recipe_tree & html_data. first validate it isn't used anywhere
+#     for el in parse_list:
+#         temp_tree = []
+#         if isinstance(el, Tag):
+#             try:
+#                 el = json.loads(el.string)
+#             except TypeError:
+#                 continue
-        if '@type' in el and el['@type'] == 'Recipe':
-            recipe_tree += [{'name': 'ld+json', 'children': temp_tree}]
-        else:
-            recipe_tree += [{'name': 'json', 'children': temp_tree}]
+#         for k, v in el.items():
+#             if isinstance(v, dict):
+#                 node = {
+#                     'name': k,
+#                     'value': k,
+#                     'children': get_children_dict(v)
+#                 }
+#             elif isinstance(v, list):
+#                 node = {
+#                     'name': k,
+#                     'value': k,
+#                     'children': get_children_list(v)
+#                 }
+#             else:
+#                 node = {
+#                     'name': k + ": " + normalize_string(str(v)),
+#                     'value': normalize_string(str(v))
+#                 }
+#             temp_tree.append(node)
-    return recipe_json, recipe_tree, html_data, images
+#         if '@type' in el and el['@type'] == 'Recipe':
+#             recipe_tree += [{'name': 'ld+json', 'children': temp_tree}]
+#         else:
+#             recipe_tree += [{'name': 'json', 'children': temp_tree}]
+#     return recipe_json, recipe_tree, html_data, images
-def get_from_html(soup):
-    INVISIBLE_ELEMS = ('style', 'script', 'head', 'title')
-    html = []
-    for s in soup.strings:
-        if ((s.parent.name not in INVISIBLE_ELEMS) and (len(s.strip()) > 0)):
-            html.append(s)
-    return html
+# def get_from_html(soup):
+#     INVISIBLE_ELEMS = ('style', 'script', 'head', 'title')
+#     html = []
+#     for s in soup.strings:
+#         if ((s.parent.name not in INVISIBLE_ELEMS) and (len(s.strip()) > 0)):
+#             html.append(s)
+#     return html
-def get_images_from_source(soup, url):
-    sources = ['src', 'srcset', 'data-src']
-    images = []
-    img_tags = soup.find_all('img')
-    if url:
-        site = get_host_name(url)
-        prot = url.split(':')[0]
+# def get_images_from_source(soup, url):
+#     sources = ['src', 'srcset', 'data-src']
+#     images = []
+#     img_tags = soup.find_all('img')
+#     if url:
+#         site = get_host_name(url)
+#         prot = url.split(':')[0]
-    urls = []
-    for img in img_tags:
-        for src in sources:
-            try:
-                urls.append(img[src])
-            except KeyError:
-                pass
+#     urls = []
+#     for img in img_tags:
+#         for src in sources:
+#             try:
+#                 urls.append(img[src])
+#             except KeyError:
+#                 pass
-    for u in urls:
-        u = u.split('?')[0]
-        filename = re.search(r'/([\w_-]+[.](jpg|jpeg|gif|png))$', u)
-        if filename:
-            if (('http' not in u) and (url)):
-                # sometimes an image source can be relative
-                # if it is provide the base url
-                u = '{}://{}{}'.format(prot, site, u)
-            if 'http' in u:
-                images.append(u)
-    return images
+#     for u in urls:
+#         u = u.split('?')[0]
+#         filename = re.search(r'/([\w_-]+[.](jpg|jpeg|gif|png))$', u)
+#         if filename:
+#             if (('http' not in u) and (url)):
+#                 # sometimes an image source can be relative
+#                 # if it is provide the base url
+#                 u = '{}://{}{}'.format(prot, site, u)
+#             if 'http' in u:
+#                 images.append(u)
+#     return images
-def remove_graph(el):
-    # recipes type might be wrapped in @graph type
-    if isinstance(el, Tag):
-        try:
-            el = json.loads(el.string)
-            if '@graph' in el:
-                for x in el['@graph']:
-                    if '@type' in x and x['@type'] == 'Recipe':
-                        el = x
-        except (TypeError, JSONDecodeError):
-            pass
-    return el
+# def remove_graph(el):
+#     # recipes type might be wrapped in @graph type
+#     if isinstance(el, Tag):
+#         try:
+#             el = json.loads(el.string)
+#             if '@graph' in el:
+#                 for x in el['@graph']:
+#                     if '@type' in x and x['@type'] == 'Recipe':
+#                         el = x
+#         except (TypeError, JSONDecodeError):
+#             pass
+#     return el
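
Note on the logic retired above: schema.org publishers frequently nest the Recipe object inside an @graph array next to WebSite or BreadcrumbList nodes, which is why remove_graph unwraps the JSON-LD before any field is read. A minimal, self-contained sketch of that unwrapping; the sample payload is invented for illustration:

    import json

    # Hypothetical ld+json document: the Recipe hides inside '@graph'.
    payload = json.loads('''
    {
      "@context": "https://schema.org",
      "@graph": [
        {"@type": "WebSite", "name": "Example Blog"},
        {"@type": "Recipe", "name": "Pancakes", "recipeIngredient": ["2 eggs", "250 g flour"]}
      ]
    }
    ''')

    def unwrap_graph(el):
        # Same idea as remove_graph above: prefer the Recipe node if present.
        if '@graph' in el:
            for x in el['@graph']:
                if '@type' in x and x['@type'] == 'Recipe':
                    return x
        return el

    print(unwrap_graph(payload)['name'])  # -> Pancakes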

View File

@@ -1,21 +1,19 @@
 import random
 import re
 from html import unescape
-from pytube import YouTube
-from unicodedata import decomposition
 from django.utils.dateparse import parse_duration
 from django.utils.translation import gettext as _
 from isodate import parse_duration as iso_parse_duration
 from isodate.isoerror import ISO8601Error
-from recipe_scrapers._utils import get_minutes
+from pytube import YouTube
+from recipe_scrapers._utils import get_host_name, get_minutes
 from cookbook.helper import recipe_url_import as helper
 from cookbook.helper.ingredient_parser import IngredientParser
 from cookbook.models import Keyword
+# from recipe_scrapers._utils import get_minutes ## temporary until/unless upstream incorporates get_minutes() PR
@@ -369,3 +367,32 @@ def iso_duration_to_minutes(string):
         string
     ).groupdict()
     return int(match['days'] or 0) * 24 * 60 + int(match['hours'] or 0) * 60 + int(match['minutes'] or 0)
+
+
+def get_images_from_soup(soup, url):
+    sources = ['src', 'srcset', 'data-src']
+    images = []
+    img_tags = soup.find_all('img')
+    if url:
+        site = get_host_name(url)
+        prot = url.split(':')[0]
+
+    urls = []
+    for img in img_tags:
+        for src in sources:
+            try:
+                urls.append(img[src])
+            except KeyError:
+                pass
+
+    for u in urls:
+        u = u.split('?')[0]
+        filename = re.search(r'/([\w_-]+[.](jpg|jpeg|gif|png))$', u)
+        if filename:
+            if (('http' not in u) and (url)):
+                # sometimes an image source can be relative
+                # if it is provide the base url
+                u = '{}://{}{}'.format(prot, site, u)
+            if 'http' in u:
+                images.append(u)
+    return images
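
The get_images_from_soup helper added here collects src, srcset and data-src attributes, keeps only jpg/jpeg/gif/png filenames after stripping query strings, and rebuilds absolute URLs for relative paths from the page's scheme and host. A rough usage sketch; the HTML sample is invented, and the import path assumes this hunk lands in cookbook.helper.recipe_url_import, as the surrounding imports suggest:

    from bs4 import BeautifulSoup

    from cookbook.helper.recipe_url_import import get_images_from_soup  # assumed module path

    html = '''
    <img src="/media/cake.jpg">
    <img data-src="https://cdn.example.com/pie.png?w=800">
    <img src="icons/spacer.svg">
    '''
    soup = BeautifulSoup(html, 'html.parser')

    images = get_images_from_soup(soup, 'https://example.com/recipes/cake')
    # -> ['https://example.com/media/cake.jpg', 'https://cdn.example.com/pie.png']
    # The .svg is dropped (extension not in the allow-list) and the ?w=800
    # query string is stripped before the filename check; the relative path
    # is rebuilt from the page's scheme and host.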

View File

@@ -1,6 +1,7 @@
-from bs4 import BeautifulSoup
 from json import JSONDecodeError
-from recipe_scrapers import SCRAPERS
+from bs4 import BeautifulSoup
+from recipe_scrapers import SCRAPERS, get_host_name
 from recipe_scrapers._factory import SchemaScraperFactory
 from recipe_scrapers._schemaorg import SchemaOrg
@@ -15,22 +16,28 @@ SCRAPERS.update(CUSTOM_SCRAPERS)
 def text_scraper(text, url=None):
-    scraper_class = SchemaScraperFactory.SchemaScraper
     domain = None
     if url:
         domain = get_host_name(url)
     if domain in SCRAPERS:
         scraper_class = SCRAPERS[domain]
+    else:
+        scraper_class = SchemaScraperFactory.SchemaScraper

     class TextScraper(scraper_class):
         def __init__(
             self,
-            page_data,
-            url=None
+            html=None,
+            url=None,
         ):
             self.wild_mode = False
             self.meta_http_equiv = False
-            self.soup = BeautifulSoup(page_data, "html.parser")
+            self.soup = BeautifulSoup(html, "html.parser")
             self.url = url
             self.recipe = None
             try:
-                self.schema = SchemaOrg(page_data)
+                self.schema = SchemaOrg(html)
             except (JSONDecodeError, AttributeError):
                 pass

-    return TextScraper(text, url)
+    return TextScraper(url=url, html=text)
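
The signature change matters to callers: TextScraper now takes html and url as keyword arguments instead of the old positional page_data, matching the keyword-based constructors that newer recipe_scrapers releases appear to expect. A short usage sketch; the markup is invented, the import path is the one used in the first file above, and scraper.title() assumes the stock schema.org accessor from recipe_scrapers:

    from cookbook.helper.scrapers.scrapers import text_scraper

    html = '''
    <html><body><script type="application/ld+json">
    {"@context": "https://schema.org", "@type": "Recipe",
     "name": "Lemonade", "recipeIngredient": ["4 lemons", "1 cup sugar"]}
    </script></body></html>
    '''

    # text is still the first positional argument of text_scraper itself;
    # internally it is now forwarded as TextScraper(url=url, html=text).
    scraper = text_scraper(html, url='https://example.com/lemonade')
    print(scraper.title())  # -> Lemonade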