scrapehero-code · vatsalunadkat · Sep 24, 2023
diff --git a/app.py b/app.py
@@ -5,7 +5,8 @@
 app = Flask(__name__)
 extractor = selectorlib.Extractor.from_yaml_file('selectors.yml')
 
-def scrape(url):    
+
+def scrape(url):
     headers = {
         'authority': 'www.amazon.com',
         'pragma': 'no-cache',
@@ -20,18 +21,30 @@ def scrape(url):
         'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
     }
 
+    # Use your own data here. Copy the cookie values from the browser's inspect tab.
+    cookies = {
+        "lc-acbin": "en_IN",
+        "i18n-prefs": "INR",
+        "session-id": "YOUR_SESSION_ID",
+        "session-id-time": "YOUR_SESSION_ID_TIME",
+        "session-token": "YOUR_SESSION_TOKEN",
+        "ubid-acbin": "YOUR_UBID"}
+
     # Download the page using requests
-    print("Downloading %s"%url)
-    r = requests.get(url, headers=headers)
+    print("Downloading %s" % url)
+    r = requests.get(url, headers=headers, cookies=cookies)
     # Simple check to check if page was blocked (Usually 503)
     if r.status_code > 500:
         if "To discuss automated access to Amazon data please contact" in r.text:
-            print("Page %s was blocked by Amazon. Please try using better proxies\n"%url)
+            print(
+                "Page %s was blocked by Amazon. Please try using better proxies\n" % url)
         else:
-            print("Page %s must have been blocked by Amazon as the status code was %d"%(url,r.status_code))
+            print("Page %s must have been blocked by Amazon as the status code was %d" % (
+                url, r.status_code))
         return None
-    # Pass the HTML of the page and create 
-    data = extractor.extract(r.text,base_url=url)
+
+    # Pass the HTML of the page to the extractor to create the structured data
+    data = extractor.extract(r.text, base_url=url)
     reviews = []
     for r in data['reviews']:
         r["product"] = data["product_title"]
@@ -41,7 +54,8 @@ def scrape(url):
                 r['verified_purchase'] = True
             else:
                 r['verified_purchase'] = False
-        r['rating'] = r['rating'].split(' out of')[0]
+        if r['rating']:
+            r['rating'] = r['rating'].split(' out of')[0]
         date_posted = r['date'].split('on ')[-1]
         if r['images']:
             r['images'] = "\n".join(r['images'])
@@ -53,13 +67,15 @@ def scrape(url):
     data['histogram'] = histogram
     data['average_rating'] = float(data['average_rating'].split(' out')[0])
     data['reviews'] = reviews
-    data['number_of_reviews'] = int(data['number_of_reviews'].split('  customer')[0])
-    return data 
-
+    data['number_of_reviews'] = int(
+        data['number_of_reviews'].split(' global rating')[0].replace(',', ''))
+    return data
+
+
 @app.route('/')
 def api():
-    url = request.args.get('url',None)
+    url = request.args.get('url', None)
     if url:
         data = scrape(url)
         return jsonify(data)
-    return jsonify({'error':'URL to scrape is not provided'}),400
+    return jsonify({'error': 'URL to scrape is not provided'}), 400