-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun.py
96 lines (84 loc) · 4.22 KB
/
run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#coding: utf-8
"""
Scrape an HTML file to extract relevant parts
The scraping configuration is a parameter to each of its public methods.
DEPENDENCIES:
* lxml
* re
"""
import sys
import time
from art import *
from lxml import html
import re
# Show an ASCII-art banner as soon as the module is executed; text2art is
# presumably supplied by the star-import of the `art` package -- TODO confirm.
print(text2art("web scraper 1.1"))
# NOTE(review): the condition is the constant True, so this branch always runs.
if True :
# Opens the try statement whose handler is the `except Exception` clause after
# the function definitions.
# NOTE(review): indentation was lost in this copy, so the exact extent of the
# try body should be confirmed against the original file.
try:
def scrape(fd, conf):
    """Read all markup from the file-like object ``fd`` and scrape it.

    ``conf`` is handed through unchanged to ``scrapes``; the return value is
    whatever ``scrapes`` returns.
    """
    markup = fd.read()
    return scrapes(markup, conf)
def scrapes(html_string, conf):
    """Parse ``html_string`` and extract the fields described by ``conf``.

    The markup is parsed with ``html.fromstring`` and both the parsed tree and
    the original string are passed to ``process``, whose result is returned.
    """
    return process(html.fromstring(html_string), html_string, conf)
def _decode_item(item, encoding):
    """Return ``item`` decoded with ``encoding`` if it is bytes, else unchanged."""
    if isinstance(item, bytes):
        return item.decode(encoding)
    # Text (or any non-bytes value) is already decoded; leave it alone.
    return item


def process(html_tree, html_string, conf):
    """Extract every field described in ``conf`` from a parsed document.

    Args:
        html_tree: Parsed document; only its ``xpath(expression)`` method is
            used, and only for fields that define ``xpath``.
        html_string: Raw markup searched with ``re.findall`` for fields that
            define ``regexp``.
        conf: Mapping of field name to a spec mapping. Recognised spec keys:
            ``xpath`` or ``regexp`` (``xpath`` wins when both are present),
            ``encoding`` (used to decode bytes values), ``transf`` (iterable
            of callables applied in order) and ``default`` (used when the
            field ends up as ``None``).

    Returns:
        dict mapping every field name in ``conf`` to its scraped value. List
        results are real lists (never lazy ``map`` objects). A field whose
        transformation raised, or that has neither ``xpath`` nor ``regexp``,
        is ``None`` unless the spec provides ``default``.
    """
    result = {}
    for field in conf:
        spec = conf[field]

        # Extract the raw field content with xpath or regexp.
        scraped = None
        if 'xpath' in spec:
            scraped = html_tree.xpath(spec['xpath'])
            if isinstance(scraped, list):  # element list
                try:
                    scraped = [node.text for node in scraped]
                except AttributeError:
                    # The items have no ``.text`` attribute (e.g. the xpath
                    # selected strings); keep the list exactly as returned.
                    pass
        elif 'regexp' in spec:
            scraped = re.findall(spec['regexp'], html_string)

        if scraped is not None:
            # Decode the field if a character encoding is defined.
            encoding = spec['encoding'] if 'encoding' in spec else None
            if encoding is not None:
                try:
                    if isinstance(scraped, list):
                        scraped = [_decode_item(v, encoding) for v in scraped]
                    else:
                        scraped = _decode_item(scraped, encoding)
                except (UnicodeError, LookupError, TypeError) as e:
                    # Best effort: report and keep the undecoded value.
                    print("Error decoding %s field: %s" % (field, e))

            # Apply transformations (if defined), chaining each result into
            # the next function.
            if 'transf' in spec:
                for func in spec['transf']:
                    is_list = isinstance(scraped, list)
                    try:
                        if is_list:
                            # Build the list eagerly so that errors raised by
                            # ``func`` surface inside this try block.
                            scraped = [func(v) for v in scraped]
                        else:
                            scraped = func(scraped)
                    except Exception as e:
                        if is_list:
                            print("Error applying function %s to element list: %s" % (func, e))
                        else:
                            print("Error applying func %s to field value %s" % (func, scraped))
                        scraped = None
                        break  # don't include the erroneous field

        # Nothing was extracted, or a transformation failed: use the default.
        if scraped is None and 'default' in spec:
            scraped = spec['default']
        result[field] = scraped
    return result
# Handler for the try statement opened before the function definitions.
# NOTE(review): indentation is missing in this copy, so it is unclear how many
# of the following lines belong to the handler and how many run
# unconditionally -- confirm against the original file.
except Exception as e:
print("Loading:")
#animation = ["10%", "20%", "30%", "40%", "50%", "60%", "70%", "80%", "90%", "100%"]
# Ten progress-bar frames, each with one more filled cell than the previous one.
animation = ["[■□□□□□□□□□]","[■■□□□□□□□□]", "[■■■□□□□□□□]", "[■■■■□□□□□□]", "[■■■■■□□□□□]", "[■■■■■■□□□□]", "[■■■■■■■□□□]", "[■■■■■■■■□□]", "[■■■■■■■■■□]", "[■■■■■■■■■■]"]
# Draw the frames one after another on the same terminal line.
for i in range(len(animation)):
# Pause 0.2 seconds before showing each frame.
time.sleep(0.2)
# "\r" returns the cursor to the start of the line so the new frame overwrites
# the previous one; i is always below len(animation), so the modulo leaves it unchanged.
sys.stdout.write("\r" + animation[i % len(animation)])
# Flush explicitly because the frame is written without a trailing newline.
sys.stdout.flush()
# Writes "\n" followed by print's own line ending.
print("\n")