-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathweb_scraper.py
76 lines (58 loc) · 2.3 KB
/
web_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
from requests import get
from itertools import count
from bs4 import BeautifulSoup
import os, sys
def main():
    """Run the scraper end to end for the URL given on the command line.

    The URL is read from ``sys.argv[1]``, downloaded with ``request``,
    reduced to text with ``scrape`` and written to disk with ``save``.
    The path of the written file is printed when everything succeeded.

    Raises:
        SystemExit: If no URL argument was supplied; a usage message is
            written to stderr and the exit status is non-zero.
    """
    # Without this guard a missing argument surfaces as a bare IndexError
    # traceback, which tells the user nothing about how to call the script.
    if len(sys.argv) < 2:
        prog = sys.argv[0] if sys.argv else 'web_scraper.py'
        sys.exit('Usage: ' + prog + ' <url>')

    res = request(sys.argv[1])
    content = scrape(res)
    path = save(content)
    print('\nDone! Scraped content has generated at ' + path)
def request(url, timeout=30):
    """Download the page at *url* and return the response object.

    Args:
        url: Address of the page to fetch, as typed on the terminal.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would block the script indefinitely.

    Returns:
        The response returned by ``get``, with its ``encoding`` attribute
        forced to ``'utf-8'``.

    Raises:
        SystemExit: With status 1, after printing a message, when the
            request fails (invalid URL, connection error, timeout, ...).
    """
    # Imported locally so this function is self-contained; the module itself
    # only imports ``get`` from requests.
    from requests.exceptions import RequestException

    print('\nLoading...')
    try:
        res = get(url, timeout=timeout)
    except RequestException:
        # Only request failures are treated as "inaccessible"; Ctrl-C and
        # programming errors are no longer swallowed by a bare ``except``.
        print('\nLink is inaccessible.')
        sys.exit(1)
    res.encoding = 'utf-8'
    return res
def scrape(res):
    """Extract the article text from a downloaded page.

    ``res.content`` is parsed as HTML and the text of every element carrying
    the attribute ``data-el="text"`` is collected, each followed by a blank
    line. The header produced by ``write_head`` is placed in front of it.

    If no such element is found, an explanation is printed and the program
    terminates through ``sys.exit()``.
    """
    page = BeautifulSoup(res.content, 'html.parser')
    paragraphs = page.find_all(attrs={'data-el': 'text'})
    body = ''.join(node.get_text() + '\n\n' for node in paragraphs)

    # Nothing matched: tell the user why and stop.
    if not body:
        print(
            '\nThe webpage content is inaccessible. This may because:'
            '\n1. The news service requires subscription'
            '\n2. The news webpage does not use a conventional data tag.'
            '\n3. This is not a news webpage.'
            '\nPlease try other scraper service.'
        )
        sys.exit()

    return write_head(page) + body
def save(content):
    """Write *content* to a new UTF-8 text file and return the file's path.

    The target name is obtained once from ``filename('content')`` and that
    same path is both opened and returned, so the reported location always
    matches the file that was actually written.
    """
    path = filename('content')
    with open(path, 'w', encoding='utf-8') as f:
        f.write(content)
    return path
def write_head(soup):
    """Build the header block (title, author, date) for the scraped text.

    Three elements are looked up on *soup* by attribute, in this order:
    ``data-qa="headline-text"``, ``data-qa="author-name"`` and
    ``data-testid="display-date"``. The results are passed through
    ``error_check`` and joined into the header string.
    """
    # One attribute filter per header field, in output order.
    selectors = (
        {"data-qa": "headline-text"},
        {"data-qa": "author-name"},
        {"data-testid": "display-date"},
    )
    found = [soup.find(attrs=selector) for selector in selectors]
    title, byline, date = error_check(found)
    return ''.join((
        'Title: ', title,
        '\n\nAuthor: ', byline,
        '\n\nDate: ', date,
        '\n\n\n\n',
    ))
def filename(filename):
    """Return a ``.txt`` path based on *filename* that does not exist yet.

    ``filename + '.txt'`` is returned when nothing exists at that path.
    Otherwise the numbers 1, 2, 3, ... are appended to *filename* in turn
    and the first path that does not exist is returned.

    Note that inside this function the parameter shadows the function's own
    name; the parameter name is kept because callers may pass it by keyword.
    """
    path = filename + '.txt'
    if not os.path.exists(path):
        return path

    # ``count(1)`` supplies the numeric suffixes; the loop ends at the first
    # free name.
    for suffix in count(1):
        path = filename + str(suffix) + '.txt'
        if not os.path.exists(path):
            return path
def error_check(List):
    """Replace every element of *List* with its text, or a placeholder.

    Each element is replaced in place by the result of its ``get_text()``
    method. Elements that have no such method, such as ``None``, are
    replaced by the string ``'Unknown'``.

    Args:
        List: Mutable sequence of objects; it is modified in place.

    Returns:
        The same *List* object, now containing only strings.
    """
    for index, item in enumerate(List):
        try:
            List[index] = item.get_text()
        except AttributeError:
            # Only "this is not an element" is treated as missing data;
            # anything else (including Ctrl-C) propagates to the caller.
            List[index] = 'Unknown'
    return List
# Run the scraper only when this file is executed as a script, so that
# importing the module does not start a download.
if __name__ == '__main__':
    main()