From bfaed434cc65bc69880bc9988b7629e5111edc85 Mon Sep 17 00:00:00 2001
From: smilerz <smilerz@gmail.com>
Date: Wed, 3 Mar 2021 21:08:34 -0600
Subject: [PATCH] refactored url_import to use recipe-scraper

---
 cookbook/helper/recipe_url_import.py | 144 ++++++++++++++++++++++-----
 cookbook/templates/url_import.html   |   6 ++
 cookbook/views/api.py                |  32 +++++-
 requirements.txt                     |   1 +
 4 files changed, 157 insertions(+), 26 deletions(-)

diff --git a/cookbook/helper/recipe_url_import.py b/cookbook/helper/recipe_url_import.py
index 139b20ab2..e40ec9e45 100644
--- a/cookbook/helper/recipe_url_import.py
+++ b/cookbook/helper/recipe_url_import.py
@@ -10,6 +10,7 @@ from cookbook.models import Keyword
 from django.http import JsonResponse
 from django.utils.dateparse import parse_duration
 from django.utils.translation import gettext as _
+from recipe_scrapers import _utils
 
 
 def get_from_html(html_text, url):
@@ -69,8 +70,10 @@ def find_recipe_json(ld_json, url):
     if 'recipeIngredient' in ld_json:
         # some pages have comma separated ingredients in a single array entry
         if (len(ld_json['recipeIngredient']) == 1
-                and len(ld_json['recipeIngredient'][0]) > 30):
+                and type(ld_json['recipeIngredient']) == list):
             ld_json['recipeIngredient'] = ld_json['recipeIngredient'][0].split(',')  # noqa: E501
+        elif type(ld_json['recipeIngredient']) == str:
+            ld_json['recipeIngredient'] = ld_json['recipeIngredient'].split(',')
 
         for x in ld_json['recipeIngredient']:
             if '\n' in x:
@@ -122,28 +125,7 @@ def find_recipe_json(ld_json, url):
         ld_json['recipeIngredient'] = []
 
     if 'keywords' in ld_json:
-        keywords = []
-
-        # keywords as string
-        if type(ld_json['keywords']) == str:
-            ld_json['keywords'] = ld_json['keywords'].split(',')
-
-        # keywords as string in list
-        if (type(ld_json['keywords']) == list
-                and len(ld_json['keywords']) == 1
-                and ',' in ld_json['keywords'][0]):
-            ld_json['keywords'] = ld_json['keywords'][0].split(',')
-
-        # keywords as list
-        for kw in ld_json['keywords']:
-            if k := Keyword.objects.filter(name=kw).first():
-                keywords.append({'id': str(k.id), 'text': str(k).strip()})
-            else:
-                keywords.append({'id': random.randrange(1111111, 9999999, 1), 'text': kw.strip()})
-
-        ld_json['keywords'] = keywords
-    else:
-        ld_json['keywords'] = []
+        ld_json['keywords'] = parse_keywords(listify_keywords(ld_json['keywords']))
 
     if 'recipeInstructions' in ld_json:
         instructions = ''
@@ -218,6 +200,7 @@ def find_recipe_json(ld_json, url):
     else:
         ld_json['prepTime'] = 0
 
+    ld_json['servings'] = 1
     try:
         if 'recipeYield' in ld_json:
             if type(ld_json['recipeYield']) == str:
@@ -226,7 +209,6 @@ def find_recipe_json(ld_json, url):
                 ld_json['servings'] = int(re.findall(r'\b\d+\b', ld_json['recipeYield'][0])[0])
     except Exception as e:
         print(e)
-        ld_json['servings'] = 1
 
     for key in list(ld_json):
         if key not in [
@@ -236,3 +218,117 @@ def find_recipe_json(ld_json, url):
             ld_json.pop(key, None)
 
     return ld_json
+
+
+def get_from_scraper(scrape):
+    # converting the scrape_me object to the existing json format based on ld+json
+
+    recipe_json = {}
+    recipe_json['name'] = scrape.title()
+
+    recipe_json['description'] = ''
+    description = scrape.schema.data.get("description")
+    description += "\n\nImported from " + scrape.url
+    recipe_json['description'] = description
+
+    try:
+        servings = scrape.yields()
+        servings = int(re.findall(r'\b\d+\b', servings)[0])
+    except (AttributeError, ValueError):
+        servings = 1
+    recipe_json['servings'] = servings
+
+    recipe_json['prepTime'] = _utils.get_minutes(scrape.schema.data.get("prepTime")) or 0
+    recipe_json['cookTime'] = _utils.get_minutes(scrape.schema.data.get("cookTime")) or 0
+    if recipe_json['cookTime'] + recipe_json['prepTime'] == 0:
+        try:
+            recipe_json['prepTime'] = scrape.total_time()
+        except AttributeError:
+            pass
+
+    try:
+        recipe_json['image'] = scrape.image()
+    except AttributeError:
+        pass
+
+    keywords = []
+    if scrape.schema.data.get("keywords"):
+        keywords += listify_keywords(scrape.schema.data.get("keywords"))
+    if scrape.schema.data.get('recipeCategory'):
+        keywords += listify_keywords(scrape.schema.data.get("recipeCategory"))
+    if scrape.schema.data.get('recipeCuisine'):
+        keywords += listify_keywords(scrape.schema.data.get("recipeCuisine"))
+    recipe_json['keywords'] = parse_keywords(list(set(map(str.casefold, keywords))))
+
+    try:
+        ingredients = []
+        for x in scrape.ingredients():
+            try:
+                amount, unit, ingredient, note = parse_ingredient(x)
+                if ingredient:
+                    ingredients.append(
+                        {
+                            'amount': amount,
+                            'unit': {
+                                'text': unit,
+                                'id': random.randrange(10000, 99999)
+                            },
+                            'ingredient': {
+                                'text': ingredient,
+                                'id': random.randrange(10000, 99999)
+                            },
+                            'note': note,
+                            'original': x
+                        }
+                    )
+            except Exception:
+                ingredients.append(
+                    {
+                        'amount': 0,
+                        'unit': {
+                            'text': '',
+                            'id': random.randrange(10000, 99999)
+                        },
+                        'ingredient': {
+                            'text': x,
+                            'id': random.randrange(10000, 99999)
+                        },
+                        'note': '',
+                        'original': x
+                    }
+                )
+        recipe_json['recipeIngredient'] = ingredients
+    except AttributeError:
+        recipe_json['recipeIngredient'] = ingredients
+
+    try:
+        recipe_json['recipeInstructions'] = scrape.instructions()
+    except AttributeError:
+        recipe_json['recipeInstructions'] = ""
+
+    return recipe_json
+
+
+def parse_keywords(keyword_json):
+    keywords = []
+    # keywords as list
+    for kw in keyword_json:
+        if k := Keyword.objects.filter(name=kw).first():
+            keywords.append({'id': str(k.id), 'text': str(k)})
+        else:
+            keywords.append({'id': random.randrange(1111111, 9999999, 1), 'text': kw})
+
+    return keywords
+
+
+def listify_keywords(keyword_list):
+    # keywords as string
+    if type(keyword_list) == str:
+        keyword_list = keyword_list.split(',')
+
+    # keywords as string in list
+    if (type(keyword_list) == list
+            and len(keyword_list) == 1
+            and ',' in keyword_list[0]):
+        keyword_list = keyword_list[0].split(',')
+    return [x.strip() for x in keyword_list]
diff --git a/cookbook/templates/url_import.html b/cookbook/templates/url_import.html
index 0fdc81141..caade2615 100644
--- a/cookbook/templates/url_import.html
+++ b/cookbook/templates/url_import.html
@@ -45,6 +45,12 @@
                     <input id="id_name" class="form-control" v-model="recipe_data.name">
                 </div>
 
+                <div class="form-group">
+                    <label for="id_description">{% trans 'Recipe Description' %}</label>
+                    <textarea id="id_description" class="form-control" v-model="recipe_data.description"
+                              rows="4"></textarea>
+                </div>
+
                 <div class="row">
                     <div class="col col-md-6" v-if="recipe_data.image !== ''">
                         <img v-bind:src="recipe_data.image" alt="{% trans 'Recipe Image' %}"
diff --git a/cookbook/views/api.py b/cookbook/views/api.py
index b6036ab30..4feeca0ba 100644
--- a/cookbook/views/api.py
+++ b/cookbook/views/api.py
@@ -9,7 +9,7 @@ from annoying.functions import get_object_or_None
 from django.contrib import messages
 from django.contrib.auth.models import User
 from django.core import management
-from django.core.exceptions import FieldError
+from django.core.exceptions import FieldError, ValidationError
 from django.core.files import File
 from django.db.models import Q
 from django.http import FileResponse, HttpResponse, JsonResponse
@@ -32,7 +32,7 @@ from cookbook.helper.permission_helper import (CustomIsAdmin, CustomIsGuest,
                                                CustomIsOwner, CustomIsShare,
                                                CustomIsShared, CustomIsUser,
                                                group_required)
-from cookbook.helper.recipe_url_import import get_from_html
+from cookbook.helper.recipe_url_import import get_from_html, get_from_scraper
 from cookbook.models import (CookLog, Food, Ingredient, Keyword, MealPlan,
                              MealType, Recipe, RecipeBook, ShoppingList,
                              ShoppingListEntry, ShoppingListRecipe, Step,
@@ -54,6 +54,7 @@ from cookbook.serializer import (FoodSerializer, IngredientSerializer,
                                  UserNameSerializer, UserPreferenceSerializer,
                                  ViewLogSerializer, CookLogSerializer, RecipeBookEntrySerializer, RecipeOverviewSerializer, SupermarketSerializer)
 from recipes.settings import DEMO
+from recipe_scrapers import scrape_me, WebsiteNotImplementedError, NoSchemaFoundInWildMode
 
 
 class StandardFilterMixin(ViewSetMixin):
@@ -498,6 +499,33 @@ def get_plan_ical(request, from_date, to_date):
 def recipe_from_url(request):
     url = request.POST['url']
 
+    try:
+        scrape = scrape_me(url)
+    except WebsiteNotImplementedError:
+        try:
+            scrape = scrape_me(url, wild_mode=True)
+        except NoSchemaFoundInWildMode:
+            return JsonResponse(
+                {
+                    'error': True,
+                    'msg': _('The requested site provided malformed data and cannot be read.')  # noqa: E501
+                },
+                status=400)
+    except ConnectionError:
+        return JsonResponse(
+            {
+                'error': True,
+                'msg': _('The requested page could not be found.')
+            },
+            status=400
+        )
+    return JsonResponse(get_from_scraper(scrape))
+
+
+@group_required('user')
+def recipe_from_url_old(request):
+    url = request.POST['url']
+
     headers = {
         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36'  # noqa: E501
     }
diff --git a/requirements.txt b/requirements.txt
index 61dce5f96..4b7ff6cbf 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -31,3 +31,4 @@ Jinja2==2.11.3
 django-webpack-loader==0.7.0
 django-js-reverse==0.9.1
 django-allauth==0.44.0
+recipe-scrapers==12.2.0