-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
141 lines (111 loc) · 2.97 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import csv
import json
def can_be_a_int(s: str) -> bool:
    """Return True when ``int(s)`` succeeds, False otherwise.

    Only the errors ``int`` raises for unusable input are treated as
    "not an int": ``ValueError`` (unparsable text), ``TypeError``
    (unsupported types such as ``None``) and ``OverflowError`` (e.g. an
    infinite float). Anything else, notably ``KeyboardInterrupt`` and
    ``SystemExit``, propagates instead of being silently swallowed.
    """
    try:
        int(s)
    except (TypeError, ValueError, OverflowError):
        return False
    return True
def is_valid_year(y: str) -> bool:
    """Return True when *y* is a non-empty string of ASCII digits that ``int`` accepts.

    Non-string values are reported as invalid rather than raising
    ``AttributeError``; ``csv.DictReader`` supplies ``None`` for columns
    missing from a short row, so such values can reach this check.
    """
    if not isinstance(y, str):
        return False
    # For ASCII-only text, isdigit() is true only for "0"-"9", which also
    # satisfies isdecimal() and isnumeric(), so those checks are implied.
    # The int() round-trip is kept as the final gate.
    return y.isascii() and y.isdigit() and can_be_a_int(y)
def is_valid_rank(r: str) -> bool:
    """Return True when *r* is a non-empty string of ASCII digits that ``int`` accepts.

    Non-string values are reported as invalid rather than raising
    ``AttributeError``; ``csv.DictReader`` supplies ``None`` for columns
    missing from a short row, so such values can reach this check.
    """
    if not isinstance(r, str):
        return False
    # For ASCII-only text, isdigit() is true only for "0"-"9", which also
    # satisfies isdecimal() and isnumeric(), so those checks are implied.
    # The int() round-trip is kept as the final gate.
    return r.isascii() and r.isdigit() and can_be_a_int(r)
def can_be_a_float(x: str) -> bool:
    """Return True when ``float(x)`` succeeds, False otherwise.

    On failure the message ``Not a float`` is printed to stdout before
    returning False. Only the errors ``float`` raises for unusable input
    (``ValueError``, ``TypeError``, ``OverflowError``) count as failure;
    ``KeyboardInterrupt`` and ``SystemExit`` propagate instead of being
    silently swallowed.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        print("Not a float")
        return False
    return True
def is_valid_sales(s: str) -> bool:
    """Return True when *s* is an ASCII, non-negative decimal string ``float`` accepts.

    Accepted shape: digits with at most one ``.`` (the first dot is
    removed before the ``isdigit`` check), so signs, exponents and a lone
    ``.`` are rejected.

    Non-string values are reported as invalid rather than raising
    ``AttributeError``; ``csv.DictReader`` supplies ``None`` for columns
    missing from a short row, so such values can reach this check.
    """
    if not isinstance(s, str):
        return False
    ascii_only = s.isascii()
    digits_with_one_point = s.replace('.', '', 1).isdigit()
    # Called unconditionally (not short-circuited), as before, so whatever
    # the helper emits for unparsable text is unchanged.
    parses = can_be_a_float(s)
    return ascii_only and digits_with_one_point and parses
# Module-level accumulator of rejected rows; is_valid_row appends one
# {'reason': ..., 'entry': ...} dict here for every row it rejects.
invalids = []
def is_valid_row(row: dict) -> bool:
    """Validate one CSV row, recording the first failure in ``invalids``.

    Checks run in a fixed order: ``year``, then ``rank``, then the sales
    columns ``na_sales``, ``eu_sales``, ``jp_sales``, ``other_sales`` and
    ``global_sales``. At the first failing check a
    ``{'reason': ..., 'entry': row}`` dict is appended to the module-level
    ``invalids`` list and False is returned; later fields are not checked.
    Returns True when every check passes.
    """
    def reject(reason: str) -> bool:
        # Record why this row was dropped, alongside the row itself.
        invalids.append({'reason': reason, 'entry': row})
        return False

    if not is_valid_year(row["year"]):
        return reject(f"invalid year of {row['year']}")

    if not is_valid_rank(row["rank"]):
        return reject(f"invalid rank of {row['rank']}")

    sales_columns = ('na_sales', 'eu_sales', 'jp_sales', 'other_sales', 'global_sales')
    for k in sales_columns:
        if not is_valid_sales(row[k]):
            return reject(f"invalid {k} value of {row[k]}")

    return True
def cleanup_data(l: list) -> list:
    """Return a new list holding only the rows that ``is_valid_row`` accepts.

    Input order is preserved. Rejected rows are recorded by
    ``is_valid_row`` itself as a side effect of the check.
    """
    return [row for row in l if is_valid_row(row)]
def parse_row(row: dict) -> dict:
    """Build a typed record from a row of string values.

    ``name``, ``publisher``, ``genre`` and ``platform`` are copied as-is,
    ``rank`` and ``year`` are converted with ``int``, and the five
    ``*_sales`` columns are converted with ``float``. Any other keys in
    *row* are ignored. Raises ``KeyError`` for a missing column and
    ``ValueError`` for a value that does not convert.
    """
    text_fields = ('name', 'publisher', 'genre', 'platform')
    int_fields = ('rank', 'year')
    float_fields = ('eu_sales', 'global_sales', 'na_sales', 'other_sales', 'jp_sales')

    parsed = {field: row[field] for field in text_fields}
    for field in int_fields:
        parsed[field] = int(row[field])
    for field in float_fields:
        parsed[field] = float(row[field])
    return parsed
def parse_data(dataset: list) -> list:
    """Return a new list with ``parse_row`` applied to each row, in order."""
    return [parse_row(row) for row in dataset]
def jsonify(dataset: list) -> str:
    """Serialize *dataset* to a single-line JSON string.

    Object keys are sorted, non-ASCII characters are escaped, and NaN or
    infinite floats are rejected (``json.dumps`` raises ``ValueError``
    because ``allow_nan`` is False).
    """
    dump_options = {
        "ensure_ascii": True,
        "check_circular": True,
        "allow_nan": False,
        "sort_keys": True,
        "indent": None,
    }
    return json.dumps(dataset, **dump_options)
# Script body: load ./dataset.csv, split rows into valid/invalid, and write
# the typed valid rows and the rejected rows out as JSON.

# Every CSV record as a plain dict keyed by the header row.
l = list()
# Opened read-only ("r", not "r+") since the file is never written, and with
# newline="" as the csv module documentation requires so that newlines
# embedded in quoted fields are interpreted correctly.
# NOTE(review): text encoding is left at the platform default, as before —
# confirm the dataset's actual encoding.
with open("./dataset.csv", "r", newline="") as datafile:
    dictReader = csv.DictReader(
        datafile,
        delimiter=',',
        quotechar='"',
    )
    for row in dictReader:
        l.append(dict(row))

# Validate first (this is what fills `invalids`), then convert and serialize.
data_json = jsonify(parse_data(cleanup_data(l)))

with open("./clean-dataset.json", "w") as output:
    output.write(data_json)

# Must run after the cleanup pass above, which populates `invalids`.
with open("./invalids.json", "w") as output:
    output.write(jsonify(invalids))