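"""Price-comparison scrapers for Amazon SG, Shopee SG and Lazada SG.

Amazon and Lazada are fetched through rotating proxies (see proxy_tester);
Shopee is rendered in a Selenium-driven Chrome session; get_soup_splash
offers an alternative route through a local Splash renderer.
"""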
from bs4 import BeautifulSoup
import requests
from retry import retry
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from time import sleep
from proxy_tester import reading_proxy, test_proxy_production, get_response
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',}
def get_soup_splash(product, site):
    # render the page through a local Splash instance before parsing
    url = get_link(product, site)
    response = requests.get("http://localhost:8050/render.html", headers=headers, params={"url": url, "wait": 2})
    soup = BeautifulSoup(response.text, 'lxml')
    return soup
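# get_soup rotates through the proxies listed in tested_proxies.csv (produced
# by proxy_tester) and returns the first page that fetches successfully, or
# None once every proxy has failed.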
def get_soup(search, site):
    url = get_link(search, site)
    print(url)
    for proxy in reading_proxy("tested_proxies.csv"):
        print("-------------------------")
        response = get_response(proxy, url)
        if response is not None:
            return BeautifulSoup(response.text, 'lxml')
    return None
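# Lazada's /tag/ endpoint is requested with ajax=true, which returns JSON
# instead of HTML; every other site just gets the raw keyword appended.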
def get_link(search, site):
    if site == "https://www.lazada.sg/tag/":
        search = search.strip().lower().replace(" ", "-")
        return f"{site}{search}?ajax=true&page=1"
    else:
        return f"{site}{search.strip().lower()}"
def get_keywords(string):
    return string.strip().lower()
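# Sponsored results are skipped. @retry() (from the retry library) re-runs the
# scrape on any exception, e.g. when a proxy returns a page these selectors miss.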
@retry()
def amazon_scraper(product):
    site = "https://www.amazon.sg/s?k="
    soup = get_soup(product, site)
    if soup is None:
        print("error: no working proxy")
        return None
    items = soup.find_all("div", class_="a-section a-spacing-base")
    if not items:
        print("error: no items found/end of the list of items")
        return None
    productData = []
    for item in items:
        sponsored = item.find("span", class_="aok-inline-block s-sponsored-label-info-icon")
        if sponsored is not None:
            continue  # skip sponsored listings
        data = {}
        data["name"] = item.find("span", class_="a-size-base-plus a-color-base a-text-normal").text
        data["price"] = item.find("span", class_="a-offscreen").text.replace("S$", "")
        directory = item.find("a", class_="a-size-base a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal").attrs["href"]
        data["url"] = f"https://www.amazon.sg{directory}"
        prime = item.find("i", class_="a-icon a-icon-prime a-icon-medium")
        if prime is not None:
            # the extra label only appears on international Prime listings
            international = item.find("span", class_="a-color-base puis-light-weight-text")
            location = "Prime and International" if international is not None else "Prime and Singapore"
        else:
            location = "Non Prime"
        data["location"] = location
        productData.append(data)
    return productData
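# Shopee builds its result grid with JavaScript, so plain requests gets an
# empty shell; a real Chrome session is driven instead. The obfuscated class
# names (Sh+UIZ, ie3A+n, ZEgDH9, zGGwiV) are likely to change when the site
# is rebuilt, so expect to refresh them periodically.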
def shopee_scraper(product):
    # create object for chrome options
    chrome_options = Options()
    site = "https://shopee.sg/search?keyword="
    base_url = get_link(product, site)
    # set chrome driver options to disable any popups from the website
    # to find the local path for the chrome profile, open the chrome browser
    # and type "chrome://version" in the address bar
    chrome_options.add_argument('--disable-notifications')
    chrome_options.add_argument('--start-maximized')
    chrome_options.add_argument('user-data-dir=C:\\Users\\darkk\\AppData\\Local\\Google\\Chrome\\User Data\\Default')
    # disable the message "Chrome is being controlled by automated test software"
    chrome_options.add_argument('--disable-infobars')
    # pass the value 1 to allow notifications and 2 to block them
    chrome_options.add_experimental_option("prefs", {
        "profile.default_content_setting_values.notifications": 2
    })
    # invoke the webdriver
    browser = webdriver.Chrome(executable_path=r'C:\Yu Quan\web_scraper\chromedriver.exe',
                               options=chrome_options)
    browser.get(base_url)
    delay = 5  # seconds
    try:
        # wait until at least one result tile has rendered before grabbing the DOM
        WebDriverWait(browser, delay).until(
            EC.presence_of_element_located((By.CLASS_NAME, "shopee-search-item-result__item")))
        print("Page is ready")
        sleep(5)
        html = browser.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
        soup = BeautifulSoup(html, "html.parser")
        items = soup.find_all("div", class_="col-xs-2-4 shopee-search-item-result__item")
        productData = []
        count = 5
        # inspect only the first few results, skipping ads and off-topic hits
        for item in items[:count]:
            ad = item.find('div', class_='Sh+UIZ')
            name = item.find('div', class_='ie3A+n bM+7UW Cve6sh').text
            if ad is None and get_keywords(product) in name.lower():
                data = {}
                data["name"] = name
                data["price"] = item.find('span', class_='ZEgDH9').text
                data["location"] = item.find('div', class_='zGGwiV').text
                directory = item.find("a").attrs["href"]
                data["url"] = f"https://shopee.sg{directory}"
                productData.append(data)
        print(productData)
        return productData
    except TimeoutException:
        print("error: page took too long to load")
        return None
    finally:
        browser.quit()
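# Lazada's tag endpoint (see get_link) answers with JSON rather than HTML;
# the listings sit under mods.listItems, so no HTML parsing is needed here.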
def lazada_scraper(product):
    site = "https://www.lazada.sg/tag/"
    url = get_link(product, site)
    print(url)
    response_json = None
    for proxy in reading_proxy("tested_proxies.csv"):
        response = get_response(proxy, url)
        print(response)
        if response is not None:
            try:
                response_json = response.json()
                print(response_json)
                break
            except ValueError:
                print("no json")
    if response_json is None:
        print("error in proxy")
        return None
    items = response_json["mods"]["listItems"]
    productData = []
    count = 5
    for item in items[:count]:
        data = get_data(item["name"], item["priceShow"], item["itemUrl"].lstrip("/"), item["location"])
        productData.append(data)
    print(productData)
    return productData
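# get_data normalises one listing into the dict shape all three scrapers return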
def get_data(name, price, link, location):
    data = {}
    data["name"] = name
    data["price"] = price
    data["location"] = location
    data["url"] = link
    return data
if __name__ == "__main__":
    lazada_scraper("airpods pro")
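    # The other scrapers take the same search string; running them assumes a
    # populated tested_proxies.csv (Amazon) and a local chromedriver plus the
    # Chrome profile path above (Shopee):
    # amazon_scraper("airpods pro")
    # shopee_scraper("airpods pro")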