From 53d588ef718a15313581d524269e07bd87439236 Mon Sep 17 00:00:00 2001
From: Vatsal Unadkat <31386890+vatsalunadkat@users.noreply.github.com>
Date: Sun, 24 Sep 2023 19:50:28 +0530
Subject: [PATCH] Added cookies + Fixed issues with rating and number of reviews

---
 app.py | 42 +++++++++++++++++++++++++++++-------------
 1 file changed, 29 insertions(+), 13 deletions(-)

diff --git a/app.py b/app.py
index c4b67e4..5835ad3 100644
--- a/app.py
+++ b/app.py
@@ -5,7 +5,8 @@
 app = Flask(__name__)
 extractor = selectorlib.Extractor.from_yaml_file('selectors.yml')
 
-def scrape(url):
+
+def scrape(url):
     headers = {
         'authority': 'www.amazon.com',
         'pragma': 'no-cache',
@@ -20,18 +21,30 @@ def scrape(url):
         'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
     }
 
+    # Use your own data here. Check the cookies from the inspect tab.
+    cookies = {
+        "lc-acbin": "en_IN",
+        "i18n-prefs": "INR",
+        "session-id": "YOUR_SESSION_ID",
+        "session-id-time": "YOUR_SESSION_ID_TIME",
+        "session-token": "YOUR_SESSION_TOKEN",
+        "ubid-acbin": "YOUR_UBID"}
+
     # Download the page using requests
-    print("Downloading %s"%url)
-    r = requests.get(url, headers=headers)
+    print("Downloading %s" % url)
+    r = requests.get(url, headers=headers, cookies=cookies)
     # Simple check to check if page was blocked (Usually 503)
     if r.status_code > 500:
         if "To discuss automated access to Amazon data please contact" in r.text:
-            print("Page %s was blocked by Amazon. Please try using better proxies\n"%url)
+            print(
+                "Page %s was blocked by Amazon. Please try using better proxies\n" % url)
         else:
-            print("Page %s must have been blocked by Amazon as the status code was %d"%(url,r.status_code))
+            print("Page %s must have been blocked by Amazon as the status code was %d" % (
+                url, r.status_code))
         return None
-    # Pass the HTML of the page and create
-    data = extractor.extract(r.text,base_url=url)
+
+    # Pass the HTML of the page and create
+    data = extractor.extract(r.text, base_url=url)
     reviews = []
     for r in data['reviews']:
         r["product"] = data["product_title"]
@@ -41,7 +54,8 @@ def scrape(url):
             r['verified_purchase'] = True
         else:
             r['verified_purchase'] = False
-        r['rating'] = r['rating'].split(' out of')[0]
+        if r['rating']:
+            r['rating'] = r['rating'].split(' out of')[0]
         date_posted = r['date'].split('on ')[-1]
         if r['images']:
             r['images'] = "\n".join(r['images'])
@@ -53,13 +67,15 @@ def scrape(url):
     data['histogram'] = histogram
     data['average_rating'] = float(data['average_rating'].split(' out')[0])
     data['reviews'] = reviews
-    data['number_of_reviews'] = int(data['number_of_reviews'].split(' customer')[0])
-    return data
-
+    data['number_of_reviews'] = int(
+        data['number_of_reviews'].split(' global rating')[0].replace(',', ''))
+    return data
+
+
 @app.route('/')
 def api():
-    url = request.args.get('url',None)
+    url = request.args.get('url', None)
     if url:
         data = scrape(url)
         return jsonify(data)
-    return jsonify({'error':'URL to scrape is not provided'}),400
\ No newline at end of file
+    return jsonify({'error': 'URL to scrape is not provided'}), 400
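
A note on the two parsing fixes in this patch. The minimal sketch below walks through the string handling; the sample strings are illustrative of the formats the split tokens imply, not captured Amazon output:

# Rating text arrives as e.g. "4.0 out of 5 stars"; some reviews carry no
# rating element at all, hence the new truthiness guard before splitting.
rating = "4.0 out of 5 stars"
if rating:
    rating = rating.split(' out of')[0]   # -> "4.0"

# The review count is now labelled "global ratings" rather than the old
# "customer reviews", and can contain a thousands separator, so the comma
# must be stripped before int() can parse it.
number_of_reviews = "1,234 global ratings"
count = int(number_of_reviews.split(' global rating')[0].replace(',', ''))  # -> 1234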
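
The extractor itself is built from selectors.yml, which this patch does not touch. For orientation, here is a hypothetical sketch of the shape selectorlib expects: the top-level field names are the ones app.py reads, but every css value is a placeholder, not the project's real selector.

import selectorlib

# Hypothetical selectors.yml sketch; only the field names come from app.py.
# Fields such as histogram follow the same pattern and are omitted here.
yaml_string = """
product_title:
    css: 'CSS_SELECTOR_HERE'
    type: Text
average_rating:
    css: 'CSS_SELECTOR_HERE'
    type: Text
number_of_reviews:
    css: 'CSS_SELECTOR_HERE'
    type: Text
reviews:
    css: 'CSS_SELECTOR_HERE'
    multiple: true
    children:
        rating:
            css: 'CSS_SELECTOR_HERE'
            type: Text
        date:
            css: 'CSS_SELECTOR_HERE'
            type: Text
        verified_purchase:
            css: 'CSS_SELECTOR_HERE'
            type: Text
        images:
            css: 'CSS_SELECTOR_HERE'
            multiple: true
            type: Link
"""
extractor = selectorlib.Extractor.from_yaml_string(yaml_string)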
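
Once the app is running, the / route can be exercised as below; a minimal sketch assuming Flask's default development address (127.0.0.1:5000) and a placeholder review-page URL:

import requests

# Assumes the Flask dev server is running on its default 127.0.0.1:5000;
# the Amazon review-page URL below is a placeholder, not a real listing.
resp = requests.get(
    "http://127.0.0.1:5000/",
    params={"url": "https://www.amazon.com/product-reviews/EXAMPLE_ASIN/"},
)
print(resp.status_code)
print(resp.json())

Calling the route with no url parameter returns the {'error': ...} payload with status 400, matching the last hunk of the patch.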