""" Transforms audiobook content from Librivox into an LPF packaged W3C Audiobook.
Copyright 2020 European Digital Reading Lab. All rights reserved.
Use of this source code is governed by a BSD-style license
available in the project repository on Github.
Creator: Laurent Le Meur, EDRLab, 2020
"""
""" sample content: download from https://librivox.org/compilation-de-poemes-001-by-various/ """
import io, os, sys
import argparse
from lxml import etree
import jsons
import json, jsonschema
from zipfile import ZipFile
import shutil
# Publication class
class Publication:
"""A W3C Publication, init limited to required properties"""
_context = ["https://schema.org","https://www.w3.org/ns/pub-context"]
conformsTo = "https://www.w3/org/TR/audiobooks/"
type = "Audiobook"
readingOrder = []
def __init__(self, name, id):
self.name = name
self.id = id
# Link class
class Link:
"""A W3C Link"""
def __init__(self, url):
self.url = url
# MediaType is a global dictionary of image content-types
MediaType = {
".jpg": "image/jpeg",
".jpeg": "image/jpeg",
".png": "image.png",
".gif": "image/gif",
".webp": "image/webp"
}
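# For reference, a minimal sketch of the publication.json produced below (field values are
# illustrative, not taken from a real feed):
# {
#   "@context": ["https://schema.org", "https://www.w3.org/ns/pub-context"],
#   "conformsTo": "https://www.w3.org/TR/audiobooks/",
#   "type": "Audiobook",
#   "name": "Compilation de poèmes - 001",
#   "id": "https://librivox.org/compilation-de-poemes-001-by-various/",
#   "readingOrder": [
#     {"url": "poem_01.mp3", "name": "Poem 01", "encodingFormat": "audio/mpeg", "duration": "PT00H02M31S"}
#   ],
#   "resources": [{"url": "cover.jpg", "encodingFormat": "image/jpeg", "rel": "cover"}]
# }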
def getStrProperty(item, xpath, nsmap=None):
"""Get a string property found at the location specified by an XPath locator.
The xpath can point to xml elements or attributes.
In Internet Archive rss feeds, each text node inside an element is a CDATA
(which is bad) with extra spaces (which is also bad), therefore the string
must be cleaned before being returned """
values = item.xpath(xpath, namespaces=nsmap)
if len(values) == 0:
return ""
return values[0].text.strip() if isinstance(values[0], etree._Element) else values[0]
def getStrProperties(item, xpath, nsmap=None):
"""Get an array of string properties found at the location specified by an XPath locator.
The xpath can point to xml elements or attributes."""
array = []
values = item.xpath(xpath, namespaces=nsmap)
if len(values) > 0:
for v in values:
array.append(v.text.strip() if isinstance(v, etree._Element) else v)
return array
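# A worked example of the two helpers above (the element content is illustrative): given
#   <title><![CDATA[ Compilation de poèmes - 001 ]]></title>
# getStrProperty(xdoc, "/rss/channel/title") strips the padding and returns
# "Compilation de poèmes - 001", while getStrProperties() returns one such cleaned
# string per matched node (e.g. one per itunes:category/@text attribute).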
def findFileWithExtension(filePath, extensions):
"""Find a file name in the given path with the given extension"""
try:
fileNames = os.listdir(filePath)
except:
print("Folder", filePath, " not found")
return
for fileName in fileNames:
ext = os.path.splitext(fileName)
if ext[1] in extensions:
return fileName
return None
def convertRSSFeedToW3PM(filePath, rssFileName, zipFileName, coverFileName):
"""Convert a RSS feed from the Internet Archive to a W3C Publication Manifest"""
rssPath = os.path.join(filePath, rssFileName)
# get the list of files in the zip, will be used to control manifest entries vs zip entries
zipPath = os.path.join(filePath, zipFileName)
with ZipFile(zipPath, mode='r') as zip:
filesInZip = {item for item in zip.namelist()}
filesInManifest = set()
# parse the source XML
parser = etree.XMLParser(remove_comments=True)
try:
xdoc =etree.parse(rssPath, parser)
rss = xdoc.getroot()
except (etree.XMLSyntaxError):
print("Cannot parse this file, XML syntax error")
return None
# map required info from the rss structure to a new W3C Publication Manifest structure
title = getStrProperty(xdoc, "/rss/channel/title")
id = getStrProperty(xdoc, "/rss/channel/link")
if title == None or id == None:
return None
pub = Publication(title, id)
# map useful metadata from the rss structure to the W3C Publication
pub.url = getStrProperty(xdoc, "/rss/channel/atom:link[@rel='self']/@href", nsmap=rss.nsmap)
pub.author = getStrProperty(xdoc, "/rss/channel/itunes:author", nsmap=rss.nsmap)
    pub.publisher = getStrProperty(xdoc, "/rss/channel/itunes:owner/itunes:name", nsmap=rss.nsmap)
    pub._description = getStrProperty(xdoc, "/rss/channel/description")
    pub._subjects = getStrProperties(xdoc, "/rss/channel//itunes:category/@text", nsmap=rss.nsmap)
    # loop through rss items, create a reading order
    for item in rss.iter("item"):
        url = getStrProperty(item, "media:content/@url", nsmap=rss.nsmap)
        if not url:
            continue
        # add the item to a set, will be used to check manifest entries vs zip entries
        filesInManifest.add(os.path.basename(url))
        # the url becomes a file name inside the zip (files are at the root of the archive)
        track = Link(os.path.basename(url))
        track.name = getStrProperty(item, "title")
        track.encodingFormat = getStrProperty(item, "media:content/@type", nsmap=rss.nsmap)
        # convert duration from hh:mm:ss to PThhHmmMssS
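        # a worked example of the conversion below: an itunes:duration of "00:02:31"
        # becomes the ISO 8601 duration string "PT00H02M31S"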
        duration = getStrProperty(item, "itunes:duration", nsmap=rss.nsmap).split(":")
        if len(duration) == 3:
            track.duration = "PT" + duration[0] + "H" + duration[1] + "M" + duration[2] + "S"
        pub.readingOrder.append(track)
    # compare filesInManifest and filesInZip
    # error if the manifest references files absent from the zip
    #print(filesInManifest)
    #print(filesInZip)
    diffFiles1 = filesInManifest - filesInZip
    if diffFiles1:
        print("The manifest references files absent from the zip")
        print(diffFiles1)
        return None
    # warning if there are un-referenced files in the zip
    diffFiles2 = filesInZip - filesInManifest
    if diffFiles2:
        print("The zip archive contains files un-referenced in the manifest")
        print(diffFiles2)
    # add the cover as a resource
    if coverFileName:
        pub.resources = []
        cover = Link(coverFileName)
        ext = os.path.splitext(coverFileName)[1]
        cover.encodingFormat = MediaType[ext]
        cover.rel = "cover"
        pub.resources.append(cover)
    # a Python attribute can't be named with an @ or a namespace prefix,
    # therefore the corresponding JSON keys are produced by replacing substrings
    # ensure_ascii=False keeps accented characters from being json-escaped (e.g. '\u00e9' for 'é')
    # indent could be used as a pretty-print param (add "indent": 2 to the jdkwargs dict if needed)
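    # e.g. '"_context": [...]' in the serialized json becomes '"@context": [...]'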
    jpub = jsons.dumps(pub, jdkwargs={"ensure_ascii": False})
    jpub = jpub.replace("_context", "@context")
    jpub = jpub.replace("_description", "dcterms:description")
    jpub = jpub.replace("_subjects", "dcterms:subject")
    return jpub
def validateManifest(jpub):
"""Validate a W3C Publication Manifest vs the corresponding schema"""
# de-serialize the publication manifest
pub = json.loads(jpub)
# set the relative path of the W3C schemas (relative to the python code)
schema_search_path = "schema"
    ok = False
    try:
        # load the different W3C sub-schemas in a dictionary
        schemastore = {}
        schema = None
        fnames = os.listdir(schema_search_path)
        for fname in fnames:
            fpath = os.path.join(schema_search_path, fname)
            if fpath.endswith(".json"):
                with open(fpath, "r") as schemaf:
                    schema = json.load(schemaf)
                    if "$id" in schema:
                        schemastore[schema["$id"]] = schema
        # create a resolver for all $ref present in the dictionary
        resolver = jsonschema.RefResolver("", "", schemastore)
        # validate the json structure vs the json schema
        jsonschema.validate(pub, schema, resolver=resolver, format_checker=jsonschema.FormatChecker())
        print("Validation succeeded")
        ok = True
    except jsonschema.exceptions.ValidationError as err:
        print("Validation failed")
        print(err.message)
    return ok
def saveInLPF(jpub, filePath, zipFileName, coverFileName):
"""Save a json manifest, cover and audio tracks into a new zip archive, check the zip integrity"""
baseName = os.path.splitext(zipFileName)[0]
lpfPath = os.path.join(filePath, baseName + ".lpf")
audioPath = os.path.join(filePath, zipFileName)
coverPath = os.path.join(filePath, coverFileName) if coverFileName else None
# create a new zip, use ZIP_STORED
# note: the first version was using append but resulted in checksum error with the Go zip reader
with ZipFile(lpfPath, mode='w') as zip:
# add the json manifest
zip.writestr("publication.json", jpub)
# add (optionally) the cover
if coverFileName:
zip.write(coverPath, coverFileName)
# add audio tracks from the librivox archive
# no need to look inside subfolders
with ZipFile(audioPath, mode='r') as audio:
for track in audio.namelist():
if sys.version_info >= (3, 6):
with audio.open(track) as from_item:
with zip.open(track, 'w') as to_item:
shutil.copyfileobj(from_item, to_item)
else:
# warning, memory intensive
zip.writestr(track, audio.read(track))
# check the integrity of the zip archive
if zip.testzip():
print("Zip integrity check failed")
def finalizeTransformation(filePath, rssFileName, zipFileName, coverFileName):
"""Delete the source files"""
os.remove(os.path.join(filePath, rssFileName))
os.remove(os.path.join(filePath, zipFileName))
if coverFileName:
os.remove(os.path.join(filePath, coverFileName))
def main():
    parser = argparse.ArgumentParser(description="Internet Archive rss feed to W3C Publication Manifest")
    parser.add_argument("filePath", help="path to the xml and zip file")
    args = parser.parse_args()
    # find the rss manifest
    rssFileName = findFileWithExtension(args.filePath, {".xml"})
    if rssFileName is None:
        print("The source xml file wasn't found")
        return
    # find the zip archive
    zipFileName = findFileWithExtension(args.filePath, {".zip"})
    if zipFileName is None:
        print("The source zip file wasn't found")
        return
    # find the optional cover
    extensions = set(MediaType)
    coverFileName = findFileWithExtension(args.filePath, extensions)
    # log
    print("rss file: ", rssFileName)
    print("zip file: ", zipFileName)
    print("cover file: ", coverFileName)
    # convert the rss feed into a W3C Publication Manifest
    jpub = convertRSSFeedToW3PM(args.filePath, rssFileName, zipFileName, coverFileName)
    if jpub is None:
        return
    """
    # save the json file (only for debug purpose)
    with open("sample/publication.json", "w") as jsonf:
        jsonf.write(jpub)
    """
    # validate the json manifest vs the corresponding schema
    ok = validateManifest(jpub)
    if not ok:
        return
    # save the json manifest, cover and audio tracks into a .lpf file
    saveInLPF(jpub, args.filePath, zipFileName, coverFileName)
    # finalize the transformation
    #finalizeTransformation(args.filePath, rssFileName, zipFileName, coverFileName)
    # final message
    print("lpf packaged publication generated")
if __name__ == "__main__":
    main()