-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathportableapps_com_crawl.py
70 lines (58 loc) · 2.7 KB
/
portableapps_com_crawl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import re
from lxml.html import fromstring
import progressbar
import crawler
class PortableAppsCrawler(crawler.Crawler):
    """Crawl portableapps.com and download the installers it links to.

    The HTTP session (``self.sess``), ``self.base_url``, ``get_document``,
    ``xpath_selector``, ``file_name`` and ``download_file`` are provided by
    ``crawler.Crawler``, which is defined outside this file.
    NOTE(review): presumably ``get_document`` fetches a page and refreshes
    ``self.xpath_selector``, and ``download_file`` saves under
    ``self.file_name`` -- confirm against ``crawler.Crawler``.
    """

    # Captures the text between "a=" and the last "&s" of a download URL;
    # that text is used as the stem of the saved file name.
    _FILE_NAME_RE = re.compile(r"a=(.*)&s")

    def __init__(self):
        """Initialise the base crawler for https://portableapps.com."""
        crawler.Crawler.__init__(self, "https://portableapps.com")
        # Relative links to app pages; filled by extract_all_app_links().
        self.app_links = None

    def login(self, username='username', password='password'):
        """Submit the site's login form with the given credentials.

        Args:
            username: Value sent as the form's ``name`` field.
            password: Value sent as the form's ``pass`` field.

        The defaults are the placeholder values that used to be hard-coded
        here, so existing ``login()`` calls behave as before.
        """
        login_url = self.base_url + "/user/login"
        login_page = fromstring(self.sess.get(login_url).content)
        build_ids = login_page.xpath('//input[@name="form_build_id"]/@value')

        payload = {'name': username, 'pass': password}
        if build_ids:
            # xpath() returns a list; send the single token, and send it
            # under the same field name it was read from on the form.
            payload['form_build_id'] = build_ids[0]
        payload['form_id'] = "user_login"
        self.sess.post(login_url, data=payload)

    def fetch_executables(self, count, skip_count=0):
        """Download up to ``count`` installers, skipping the first ``skip_count`` apps.

        Args:
            count: Maximum number of app pages to process.
            skip_count: Number of leading app links to ignore.

        Returns nothing. If ``skip_count`` exceeds the number of available
        app links, nothing is processed or printed. Failures on individual
        apps do not abort the run; they are reported after the summary.
        """
        self.extract_all_app_links()
        if skip_count > len(self.app_links):
            return

        # Slicing clamps the upper bound, so no manual count adjustment is needed.
        self.app_links = self.app_links[skip_count:skip_count + count]
        total = len(self.app_links)
        count_downloaded = 0
        failures = []

        with progressbar.ProgressBar(max_value=total) as bar:
            for done in range(total):
                # The selected links are consumed from the end of the list.
                current_app_link = self.app_links.pop()
                try:
                    if self._download_app(current_app_link):
                        count_downloaded += 1
                except Exception as err:
                    # Best-effort crawl: remember the failure and keep going.
                    failures.append((current_app_link, err))
                # Advance the bar for every processed app, including skipped ones.
                bar.update(done + 1)

        print("Successfully downloaded " + str(count_downloaded) + " samples.")
        if failures:
            print("Failed to download " + str(len(failures)) + " apps:")
            for failed_link, err in failures:
                print("  " + str(failed_link) + ": " + repr(err))

    def _download_app(self, app_link):
        """Download the installer of one app page; return False if the page offers none."""
        self.get_document(self.base_url + app_link)
        download_links = self.xpath_selector.xpath('//div[@class="download-box"]//a/@href')
        if not download_links:
            return False

        direct_link = self._resolve_direct_link(download_links[0])
        match = self._FILE_NAME_RE.search(direct_link)
        if match is None:
            raise ValueError("Cannot derive a file name from " + str(direct_link))

        self.file_name = match.group(1) + ".exe"
        self.download_file(self.base_url + direct_link)
        return True

    def _resolve_direct_link(self, download_link):
        """Return the direct download link, following the intermediate page if needed."""
        if download_link.endswith(".exe") and "redirect" in download_link:
            return download_link

        # Otherwise the link leads to another page that holds the real link.
        self.get_document(self.base_url + download_link)
        candidates = self.xpath_selector.xpath('//article[1]//a[1]/@href')
        if not candidates:
            raise LookupError("No direct download link found on " + str(download_link))
        return candidates[0]

    def extract_all_app_links(self):
        """Load the ``/apps`` page and store every app-directory link in ``self.app_links``."""
        self.get_document(self.base_url + "/apps")
        self.app_links = self.xpath_selector.xpath('//ul[@class="appdirectory"]//a/@href')
# Script driver: these statements run at module level, so they execute on
# import as well as when the file is run directly.
c = PortableAppsCrawler()
# Submit the login form before crawling.
c.login()
# count=500, skip_count=250: ignore the first 250 app links and process up
# to the 500 that follow.
c.fetch_executables(500, 250)