forked from CSSEGISandData/COVID-19
-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathcheck.py
113 lines (100 loc) · 3.81 KB
/
check.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
from os import (
mkdir, makedirs, replace, listdir, rmdir, symlink, rename, remove, environ,
walk, lstat, chmod, stat, readlink )
from os.path import (
basename, join, isfile, isdir, islink, relpath, abspath, dirname, split,
getsize, splitext )
from covid19ru.defs import ( COVID19RU_ROOT, COVID19RU_TSROOT )
from pandas import DataFrame, read_csv, notnull
from datetime import datetime
from typing import ( Any, List, Dict, Tuple, NamedTuple, Optional )
from collections import defaultdict
class Error(NamedTuple):
    """A single problem found by the checker."""
    file: str  # path of the offending file
    text: str  # human-readable description of the problem
class CheckerState:
    """Mutable state carried from one checked file to the next."""

    def __init__(self) -> None:
        # Last successfully loaded data frame, keyed by the integer format id.
        # The defaultdict yields None for a format that has no stored frame,
        # so readers can test `prev is None` without a membership check.
        self.prev: Dict[int, Optional[DataFrame]] = defaultdict(lambda: None)
def filedate(filepath: str) -> datetime:
    """Return the date encoded in the file name of *filepath*.

    The base name without its extension must have the form ``MM-DD-YYYY``
    (for example ``03-22-2020.csv``); directories in the path are ignored.

    Raises:
        ValueError: if the name does not match the ``%m-%d-%Y`` pattern.
    """
    stem, _extension = splitext(basename(filepath))
    return datetime.strptime(stem, "%m-%d-%Y")
def is_format1(f: str) -> bool:
    """Return True if the file is dated before 2020-03-22.

    The checker treats such files as format 1. The date is taken from the
    file name via `filedate`, which raises ValueError on malformed names.
    """
    return filedate(f) < datetime(2020, 3, 22)
def is_format2(f: str) -> bool:
    """Return True if the file is dated 2020-03-22 or later.

    The checker treats such files as format 2. The date is taken from the
    file name via `filedate`, which raises ValueError on malformed names.
    """
    return filedate(f) >= datetime(2020, 3, 22)
def is_format2_buggy(f: str) -> bool:
    """Return True if the file is dated exactly 2020-03-22.

    NOTE(review): the name suggests the first format-2 file has a known
    defect; what that defect is cannot be seen from this code - confirm.
    The date is taken from the file name via `filedate`.
    """
    return filedate(f) == datetime(2020, 3, 22)
def filter_ru(df: DataFrame) -> DataFrame:
    """Return the rows of *df* that describe a Russian region.

    A row is kept when its ``Country_Region`` equals ``'Russia'`` and its
    ``Province_State`` is not null, so rows for Russia that carry no region
    name are dropped. *df* itself is not modified.
    """
    is_russia = df['Country_Region'] == 'Russia'
    has_region = notnull(df['Province_State'])
    return df[is_russia & has_region]
def _check_against_previous(ru: DataFrame, prev_ru: DataFrame) -> bool:
    """Validate today's Russian rows *ru* against the previous ones *prev_ru*.

    Returns True if *ru* contains at least one region absent from *prev_ru*.

    Raises:
        ValueError: if the number of regions shrank, if any counter column
            holds a negative or missing value, or if a counter decreased for
            a region present in both frames.
    """
    prev_regions = len(prev_ru.index)
    num_regions = len(ru.index)
    if num_regions < prev_regions:
        pr = sorted(prev_ru['Province_State'].tolist())
        nr = sorted(ru['Province_State'].tolist())
        # Show which regions disappeared before reporting the failure.
        print(
            f"\n"
            f"Prev regions: {pr}"
            f"\n\n"
            f"New regions: {nr}"
            f"\n\n"
            f"Diff: {set(pr)-set(nr)}"
        )
        raise ValueError(
            f"Number of regioins decreased! "
            f"{num_regions} < {prev_regions}. "
        )

    # Every row must carry a non-negative value; NaN fails `>= 0` and is
    # therefore reported as ill-formed as well.
    for column, label in (('Confirmed', 'confirmed'),
                          ('Deaths', 'deaths'),
                          ('Recovered', 'recovered')):
        if len(ru[ru[column] >= 0].index) != num_regions:
            raise ValueError(f'ill-formed {label}')

    new_regions = False
    for _, row in ru.iterrows():
        region = row['Province_State']
        # Look the region up among the previous *Russian* rows only, so a
        # same-named province of another country is never compared.
        p = prev_ru[prev_ru['Province_State'] == region]
        if len(p.index) == 0:
            new_regions = True
            continue
        prow = p.iloc[0]
        # `not (a >= b)` rather than `a < b`: a NaN on either side must fail.
        if not (row['Confirmed'] >= prow['Confirmed']):
            raise ValueError(
                f"Confirmed decreased for {region} from {prow['Confirmed']} to {row['Confirmed']}")
        if not (row['Deaths'] >= prow['Deaths']):
            raise ValueError(
                f"Resurrected in {region}?? {prow['Deaths']} -> {row['Deaths']}")
        if not (row['Recovered'] >= prow['Recovered']):
            raise ValueError(
                f"Recovered decreased in {region} from {prow['Recovered']} to {row['Recovered']} (oh no!)")
    return new_regions


def check_file(filepath:str, cs:CheckerState)->List[Error]:
    """Check one daily report and return the problems found in it.

    Progress is printed to stdout on a single line. Format-1 files are only
    parsed; format-2 files are compared with the previous format-2 frame
    stored in *cs*. Validation uses explicit exceptions rather than `assert`
    so the checks still run under ``python -O``.

    Args:
        filepath: path to a CSV file whose name encodes its date.
        cs: checker state; on success the loaded frame is stored in
            ``cs.prev`` for the file's format, on failure that slot is reset
            to None so the next file is not compared with stale data.

    Returns:
        An empty list on success, otherwise a one-element list holding an
        `Error` with the file path and the failure message.

    Raises:
        KeyboardInterrupt: re-raised after the state slot is reset.
    """
    # Default slot for the handlers below if failure precedes format detection.
    fmt = 1
    try:
        fn = basename(filepath)
        print(f'Checking {fn}', end='')
        df = read_csv(filepath)
        if is_format1(filepath):
            fmt = 1
            print('.....skipping', end='')
        elif is_format2(filepath):
            fmt = 2
            # Reuse the frame loaded above instead of parsing the file again.
            ru = filter_ru(df)
            prev = cs.prev[fmt]
            if prev is not None:
                if _check_against_previous(ru, filter_ru(prev)):
                    print('.....newregions', end='')
            else:
                print('.....noprev', end='')
        else:
            raise ValueError('Unknown format')
        cs.prev[fmt] = df
        print('.....OK')
        return []
    except KeyboardInterrupt:
        # Not an Exception subclass, so it needs its own handler.
        cs.prev[fmt] = None
        raise
    except Exception as e:
        cs.prev[fmt] = None
        print('.....ERROR')
        return [Error(filepath, str(e))]
def _chronological_key(filename: str) -> Tuple[datetime, str]:
    """Sort key ordering report files by the date in their name.

    Names that do not parse as a date sort last (ties broken by name); they
    are still handed to `check_file`, which reports them as errors.
    """
    try:
        return (filedate(filename), filename)
    except ValueError:
        return (datetime.max, filename)


def check_all(root:str=COVID19RU_ROOT, tsroot:str=COVID19RU_TSROOT)->List[Error]:
    """Check every CSV report under *root* and return all problems found.

    Args:
        root: directory tree that is walked for files ending in ``csv``.
        tsroot: directory expected to contain
            ``time_series_covid19_confirmed_RU.csv``.

    Returns:
        The errors reported by `check_file`, in the order files were checked.

    Any exception raised while reading the time-series file propagates to
    the caller.
    """
    cs = CheckerState()
    errors: List[Error] = []
    for dirpath, dirs, filenames in walk(abspath(root), topdown=True):
        # With topdown=True, sorting in place fixes the traversal order.
        dirs.sort()
        csv_names = [name for name in filenames if name.endswith('csv')]
        # Each file is compared with the previous one, so they must be
        # visited chronologically. A plain name sort is wrong across years
        # for MM-DD-YYYY names ('01-01-2021' < '03-22-2020').
        for filename in sorted(csv_names, key=_chronological_key):
            filepath = abspath(join(dirpath, filename))
            errors.extend(check_file(filepath, cs))
    # The result is discarded: presumably a smoke test that the time-series
    # file exists and parses (verify intent with the author).
    read_csv(join(tsroot, 'time_series_covid19_confirmed_RU.csv'))
    return errors