-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCrawler.py
49 lines (41 loc) · 1.37 KB
/
Crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# Importing required libraries
import urllib.request,urllib.error,urllib.parse
from xlwt import Workbook
from bs4 import BeautifulSoup
from datetime import datetime
# Prompt until the user supplies a non-blank URL. Surrounding whitespace is
# stripped so that an input of only spaces is treated as empty instead of
# being passed on as an invalid URL.
url = ""
while not url:
    url = input("Enter site to get links\n").strip()

# Accumulates the href values found on the page.
links = []
try:
    # Fetch the page with urllib; the context manager closes the HTTP
    # response even if reading it fails.
    with urllib.request.urlopen(url) as response:
        html_data = response.read()
    # Parse the downloaded markup with BeautifulSoup's 'html.parser' backend.
    soup = BeautifulSoup(html_data, 'html.parser')
    # Collect the href value of every anchor tag that actually has one.
    links.extend(tag['href'] for tag in soup.find_all('a', href=True))
except Exception:
    # Best-effort script: any fetch or parse failure is reported as a bad URL.
    # Catching Exception (not a bare except) lets Ctrl-C and SystemExit
    # propagate instead of being reported as a URL problem.
    print("Please check the URL properly")
# The legacy .xls format produced by xlwt holds at most 65536 rows per sheet.
MAX_XLS_ROWS = 65536

if not links:
    print("No links to fetch")
else:
    # Build a workbook with a single 'Links' sheet: one href per row, column 0.
    wb = Workbook()
    sheet1 = wb.add_sheet('Links')
    # Writing past the row limit would raise before the file is saved, so
    # only the rows that fit are written and the remainder is reported.
    for row, link in enumerate(links[:MAX_XLS_ROWS]):
        sheet1.write(row, 0, link)
    skipped = len(links) - MAX_XLS_ROWS
    if skipped > 0:
        print("Skipped {} links beyond the {}-row .xls limit".format(
            skipped, MAX_XLS_ROWS))
    # The file name carries the current local time (HH-MM-SS) so successive
    # runs do not overwrite each other.
    current_time = datetime.now().strftime("%H-%M-%S")
    wb.save('links for ' + current_time + '.xls')
    print("Done writing data to excel sheet")
#