-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathDownloader.py
161 lines (138 loc) · 5.27 KB
/
Downloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import os
import re
import time
import Logging
# Only every N-th wget progress line is written to the log (see Downloader.run).
kWgetProgressLineMod = 10
# Passed to wget as --tries; Downloader.run notes that retries are also handled separately.
kWgetNumTries = 3
# Passed to wget as --timeout.
kWgetTimeout = 10 # in secs
# Filename stems longer than this are truncated (with "...") in the log prefix.
kMaxFilenameLenPrint = 15
def _wget_is_progress_line(line):
"""
:param str line:
:rtype: bool
"""
parts = line.split()
if not parts:
return False
first = parts[0]
if not first:
return False
first = first[:-1] # "K" or so is at the end
try:
int(first)
except ValueError:
return False
return True
def convert_to_unicode(value):
    """
    Return ``value`` as text. Text is passed through unchanged; bytes are
    decoded as UTF-8, falling back to ISO-8859-1 if they are not valid UTF-8.

    :param str|bytes value:
    :rtype: str
    """
    if isinstance(value, str):
        return value
    assert isinstance(value, bytes), "expected str or bytes, got %r" % type(value)
    try:
        return value.decode("utf-8")
    except UnicodeError:
        # ISO-8859-1 maps every possible byte to a code point, so this
        # fallback cannot fail and no further fallbacks are needed.
        return value.decode("iso-8859-1")
class DownloadFatalError(Exception):
    """
    Permanent download failure, e.g. the file no longer exists.
    Retrying is pointless.
    """
class DownloadTemporaryError(Exception):
    """
    Transient download failure: the file may still become accessible
    at some later point.
    """
class Downloader:
    """
    Downloads a single URL by running ``wget`` as a subprocess and
    following its output line by line.
    """

    # A non-progress output line starting with one of these is a fatal error.
    _FATAL_LINE_PREFIXES = ("No such file ", "No such directory ")
    # A non-progress output line containing one of these is a fatal error.
    _FATAL_LINE_MARKERS = ("404 Not Found", "416 Requested Range")

    def __init__(self, url):
        """
        :param str url:
        """
        self.url = url
        # time.time() of the most recent wget output line; None until then.
        self.last_output_time = None

    def __repr__(self):
        return "Downloader(%r)" % self.url

    def describe_state(self):
        """
        :return: human-readable description of how long ago wget last printed something
        :rtype: str
        """
        if self.last_output_time is None:
            return "not started"
        return "last output %f secs ago" % (time.time() - self.last_output_time,)

    def __str__(self):
        return "%r, %s" % (self, self.describe_state())

    @staticmethod
    def _short_filename(url):
        """
        :param url:
        :return: basename of the URL, with an over-long stem truncated for log output
        :rtype: str
        """
        stem, ext = os.path.splitext(os.path.basename(str(url)))
        if len(stem) > kMaxFilenameLenPrint:
            stem = stem[:kMaxFilenameLenPrint] + "..."
        return stem + ext

    @classmethod
    def _is_fatal_line(cls, line):
        """
        :param str line: a wget output line
        :return: whether the line reports an error that makes retrying pointless
        :rtype: bool
        """
        if line.startswith(cls._FATAL_LINE_PREFIXES):
            return True
        return any(marker in line for marker in cls._FATAL_LINE_MARKERS)

    def run(self):
        """
        Run wget for ``self.url`` and block until it has finished.

        :raises DownloadFatalError: wget reported a fatal error (missing file or
            directory, HTTP 404/416), or it failed without creating the target file
        :raises DownloadTemporaryError: wget failed after the target file was created
        """
        from subprocess import Popen, PIPE, STDOUT

        url = self.url
        print_prefix = "wget (%s)" % self._short_filename(url)
        Logging.log("%s: start download %s" % (print_prefix, url))

        args = ["wget",
                "--continue",
                "--no-check-certificate",  # SSL errors ignored, like in list-dir
                "--force-directories",
                "--directory-prefix", "downloads/",
                "--progress=dot:mega",  # see also the progress handling below
                "--tries=%i" % kWgetNumTries,  # note that we also do our own retry-handling
                "--timeout=%i" % kWgetTimeout,
                str(url)]
        Logging.log(" ".join(map(repr, args)))

        # The output parsing below matches English messages and UTF-8 quotes,
        # presumably the reason for forcing this locale.
        env = os.environ.copy()
        env["LANG"] = env["LC"] = env["LC_ALL"] = "en_US.UTF-8"

        target_filename = None
        progress_line_idx = 0
        with open(os.devnull, "rb") as devnull:
            p = Popen(args, stdin=devnull, stdout=PIPE, stderr=STDOUT, bufsize=0, env=env)
            try:
                while True:
                    raw_line = p.stdout.readline()
                    if not raw_line:
                        # EOF: wget closed its output. Reading until here (instead of
                        # stopping once the process has exited) keeps trailing lines.
                        break
                    self.last_output_time = time.time()
                    line = convert_to_unicode(raw_line).rstrip()
                    stripped = line.strip()
                    if not line:
                        continue  # Cleanup output a bit.
                    if _wget_is_progress_line(line):
                        if progress_line_idx % kWgetProgressLineMod == 0:
                            Logging.log("%s progress: %s" % (print_prefix, line))
                        progress_line_idx += 1
                    elif stripped.startswith(("=>", "Saving to:")):  # => ‘downloads/...’
                        m = re.match(r"\s*(?:=>|Saving to:) ‘(.*)’", line)
                        assert m, "%s: unexpected target filename pattern: %r" % (self, line)
                        target_filename = m.group(1)
                        Logging.log("%s: target filename: %s" % (print_prefix, target_filename))
                    else:
                        Logging.log("%s: %s" % (print_prefix, line))
                        # The only good way to check for certain errors.
                        if self._is_fatal_line(line):
                            p.kill()
                            raise DownloadFatalError("error: " + line)
                p.wait()
            finally:
                # Never leave a running or unreaped wget behind, whatever happened above.
                if p.poll() is None:
                    p.kill()
                    p.wait()
                p.stdout.close()

        if progress_line_idx > 0:
            assert target_filename
            if os.path.basename(target_filename) != ".listing":
                assert os.path.exists(target_filename)

        if p.returncode != 0:
            if not target_filename or not os.path.exists(target_filename):
                # Just skip it. We might later add it again.
                raise DownloadFatalError("return code %i; download not started" % p.returncode)
            raise DownloadTemporaryError("return code %i; download started" % p.returncode)

        Logging.log("%s done." % print_prefix)