forked from OpenSourceOrg/licenses
-
Notifications
You must be signed in to change notification settings - Fork 3
/
compile.py
executable file
·120 lines (98 loc) · 3.52 KB
/
compile.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/usr/bin/env python3
# Copyright (c) 2015, Paul R. Tagliamonte <[email protected]>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import os
import os.path
import sys
import json
import validictory
import audit
# Refuse to run on anything older than Python 3. Passing a string to
# sys.exit() writes it to stderr and exits with status 1, so shells, CI jobs
# and makefiles see the failure instead of a "successful" exit status of 0.
if sys.version_info < (3,):
    sys.exit("This program is Python 3 only")
def cleanup(obj):
    """
    Make sure every list-valued field exists on `obj`.

    Any of "identifiers", "links", "other_names", "keywords" or "text" that
    is absent is set to a new empty list. `obj` is modified in place and
    also returned, so the function can be used with `map`.
    """
    list_fields = ("identifiers", "links", "other_names", "keywords", "text")
    for field in list_fields:
        # setdefault only inserts when the field is missing; the [] literal
        # is evaluated per call, so fields never share one list object.
        obj.setdefault(field, [])
    return obj
def load_file(path):
    """
    Load a JSON file and yield out all top level elements from the JSON
    list in the file, each one passed through `cleanup` first.

    The file is decoded as UTF-8 regardless of the platform's locale, and
    it is closed again before the first element is yielded.
    """
    # JSON exchanged between systems is UTF-8; relying on the locale default
    # would mis-decode non-ASCII text on platforms with another default.
    with open(path, encoding="utf-8") as fd:
        records = json.load(fd)
    # The document is fully parsed at this point, so there is no reason to
    # keep the file handle open while the consumer iterates.
    yield from map(cleanup, records)
def merge_into(root, new):
    """
    Merge the document `new` into the accumulated document `root`.

    List-valued fields ("identifiers", "links", "other_names", "keywords",
    "text") are concatenated onto whatever `root` already holds. Every other
    key may be supplied by one document only. The "id" key of `new` is used
    for error reporting and is not copied onto `root`.

    `root` is updated in place and returned; `new` is left unmodified.

    Raises KeyError if `new` has no "id", and ValueError if a non-list key
    of `new` is already present on `root`.
    """
    list_keys = ("identifiers", "links", "other_names", "keywords", "text")
    # Read the id rather than popping it, so the caller's document is not
    # altered as a side effect of merging.
    id_ = new['id']
    for key, value in new.items():
        if key == 'id':
            continue
        if key in list_keys:
            if key in root:
                root[key] += value
            else:
                # Store a copy: otherwise a later `+=` on root would also
                # grow the list still owned by the source document.
                root[key] = list(value) if isinstance(value, list) else value
            continue
        if key in root:
            raise ValueError("Fatal error: Key \"{}\" present on multiple "
                             "documents ({})".format(key, id_))
        root[key] = value
    return root
def merge_stream(stream):
    """
    Combine all documents in `stream` that share an "id" into one document
    per id, using `merge_into`.

    Raises ValueError when a second document for an id also carries a
    "name". Returns the values view of the id -> merged-document mapping.
    """
    by_id = {}
    for doc in stream:
        doc_id = doc['id']
        if doc_id in by_id:
            target = by_id[doc_id]
        else:
            # First document seen for this id: start from a bare stub.
            target = {"id": doc_id}
        if 'name' in doc and 'name' in target:
            raise ValueError("`name` given on two objects")
        by_id[doc_id] = merge_into(target, doc)
    return by_id.values()
def validate(stream):
    """
    Check every element of `stream` and yield the ones that pass.

    Each element is validated with validictory against the schema loaded
    from "schema/license.json" (relative to the current working directory).
    A missing-required-field failure is reported on stdout and re-raised.
    ValueError is raised for an id that was already seen in the stream and
    for an element without a "name".

    This is a generator, so the schema file is only read once iteration
    starts.
    """
    with open("schema/license.json", 'r') as schema_file:
        schema = json.load(schema_file)

    known_ids = set()
    for record in stream:
        try:
            validictory.validate(record, schema)
        except validictory.validator.RequiredFieldValidationError:
            # Say which record failed before letting the error propagate.
            print("Failure to validate {id}".format(**record))
            raise

        record_id = record['id']
        if record_id in known_ids:
            raise ValueError("Duplicate ID in stream")
        known_ids.add(record_id)

        if 'name' not in record:
            raise ValueError("Object {id} missing a name attribute".format(**record))
        yield record
def stream_licenses(path="./licenses"):
    """
    Given a path, walk all the JSON in the directory, and yield back all the
    license data blobs from each.

    Directories and files are visited in sorted name order, so the sequence
    of yielded blobs is the same on every platform and filesystem.
    """
    for dirpath, dirnames, filenames in os.walk(path):
        # os.walk lists entries in arbitrary order. Sorting dirnames in place
        # steers the (top-down) walk; sorting filenames fixes the order in
        # which documents are loaded, and with it the order of merged lists.
        dirnames.sort()
        for filename in sorted(filenames):
            if not filename.endswith(".json"):
                continue
            yield from load_file(os.path.join(dirpath, filename))
def load_licenses(path="./licenses", output="licenses.json"):
    """
    Build the combined license file and print an audit report for it.

    The documents found under `path` are merged, validated and sorted by
    "id", then written to `output` as a single JSON list with sorted keys.
    The number of records is printed, after which `output` is handed to
    `audit.audit` and the resulting report to `audit.display_report`.
    """
    def by_id(record):
        return record['id']

    records = validate(merge_stream(stream_licenses(path=path)))
    data = sorted(records, key=by_id)

    with open(output, 'w') as fd:
        json.dump(data, fd, sort_keys=True)

    print("{len} records written out".format(len=len(data)))

    audit.display_report(report=audit.audit(path=output))
if __name__ == "__main__":
    # Command-line arguments, if any, are passed positionally and so map
    # onto load_licenses(path, output).
    cli_args = sys.argv[1:]
    load_licenses(*cli_args)