#!/usr/bin/python
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import json
import os
import random
import shutil
import sys
import time
from datetime import datetime
import pandas as pd
from azure.storage.blob import BlockBlobService
from dotenv import load_dotenv
from requests import get, post
load_dotenv()
def build_keys_json_object(keys, blob_name, anchor_key, ground_truth_value,
                           extracted_value, confidence, issuer_name,
                           actual_accuracy, extracted_page_number):
    """
    Builds the json entry for a single extracted field
    :param keys: The dict of per-file prediction results built so far
    :param blob_name: The name of the file we are processing
    :param anchor_key: The field we are looking for
    :param ground_truth_value: The ground truth value for the field in question
    :param extracted_value: The value extracted from the invoice
    :param confidence: The confidence score of the extracted value
    :param issuer_name: The unique identifier of the form issuer
    :param actual_accuracy: The score we have inferred by comparing with the GT data
    :param extracted_page_number: The document page number the value was extracted from
    :return: The keys dict with the new entry appended
    """
keys[issuer_name + ':' + blob_name].append({
'key': anchor_key,
'groundTruthValue': ground_truth_value,
'extractedValue': extracted_value,
'confidence': confidence,
'actualAccuracy': actual_accuracy,
'pageNumber': extracted_page_number
})
return keys
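
# Illustrative only - for a hypothetical issuer '123456789' and file
# 'invoice1.pdf', a single entry in the returned dict might look like this
# (all values are made up):
#
#   keys['123456789:invoice1.pdf'] == [{
#       'key': 'InvoiceNumber',
#       'groundTruthValue': 'INV-001',
#       'extractedValue': 'INV001',
#       'confidence': 0.98,
#       'actualAccuracy': 1.0,
#       'pageNumber': '1'
#   }]
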
def extract_anchor_key_value(anchor_key, anchor_key_value):
    """
    Applies per-field formatting to a value extracted by Form Recognizer
    :param anchor_key: The key field we are processing
    :param anchor_key_value: The value of the key
    :return: The post-processed value
    """
# TODO add custom logic and formatting to fields to be extracted
if anchor_key == 'InvoiceNumber':
# TODO basic formatting - add your own
anchor_key_value = anchor_key_value.upper()
anchor_key_value = anchor_key_value.strip("$")
anchor_key_value = anchor_key_value.replace("-", "")
anchor_key_value = anchor_key_value.replace(".", "")
anchor_key_value = anchor_key_value.replace("'", "")
# TODO Add any post-processing here
anchor_key_value = str(anchor_key_value.replace(",", "")).lower().strip()
anchor_key_value = anchor_key_value.replace(" ", "")
anchor_key_value = anchor_key_value.replace("/", "")
anchor_key_value = anchor_key_value.replace("-", "")
return anchor_key_value
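
# Example (illustrative): extract_anchor_key_value('InvoiceNumber', "IN-V 01.2/3")
# returns 'inv0123' - the InvoiceNumber branch upper-cases and strips separators,
# then the shared post-processing lower-cases and removes spaces and slashes.
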
def gt_preprocessing(anchor_key, dfGTRow):
    """
    Simple formatting of the Ground Truth value so it can be compared with the
    extracted value
    :param anchor_key: Key field we are extracting
    :param dfGTRow: The Ground Truth dataframe record for the file we are processing
    :return: The normalised Ground Truth value
    """
# TODO add logic here for multi-page fields
gt_key_value = str(dfGTRow.iloc[0][anchor_key]).lower().strip()
return gt_key_value
def download_input_files_from_blob_storage(
blob_service,
container_name,
input_folder_path,
ext='pdf',
num_sample=40):
"""
:param blob_service: The blob service client
:param container_name: The container we are working with
:param input_folder_path: Folder path that contains the files for processing
:param ext: Document extension - pdf
:param num_sample: How many files we are sampling
:return: list of files to process
"""
blobs = blob_service.list_blobs(container_name)
blob_names = []
for blob in blobs:
if not blob.name.endswith(ext):
continue
        length = blob_service.get_blob_properties(
            container_name, blob.name).properties.content_length
        # TODO at the time of writing 4MB is the file size limit for a single form - amend if needed
        if length >= 4000000:
            continue
blob_names.append(blob.name)
print(f"\t Blob name: {blob.name}")
random.seed(16371580834230)
sample_ok = False
sample_decrement_counter = 0
# Now we sample files for prediction
    while not sample_ok:
        try:
            little_blob_names = random.sample(blob_names, k=num_sample - sample_decrement_counter)
            sample_ok = True
        except ValueError as not_enough_samples:
            # random.sample raises ValueError when k exceeds the population size
            print(f"{not_enough_samples} - reducing sample size by 1")
            sample_decrement_counter += 1
for blob_name in little_blob_names:
print(f'Sampling file {blob_name}')
blob_service.get_blob_to_path(container_name, blob_name, file_path=input_folder_path + '/' + blob_name)
input_doc_files = [f for f in os.listdir(input_folder_path) if f.endswith(ext)]
return input_doc_files
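
# Example usage (illustrative - assumes a valid client and an existing local folder):
#
#   blob_service = BlockBlobService(account_name='myaccount', account_key='<key>')
#   files = download_input_files_from_blob_storage(
#       blob_service, 'issuer1-train', '/tmp/issuer1-train', ext='pdf', num_sample=10)
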
def process_folder_and_predict(
keys,
input_folder_path,
ground_truth_df,
model_id,
issuer_name,
input_doc_files,
key_field_names,
region,
subscription_key,
):
"""
Iterate through our storage accounts, download, correlate with Ground Truth and invoke downstream
functions
:param keys: Our data structure to store the results of every file prediction
:param input_folder_path: The folder we are processing
:param ground_truth_df: The dataframe with our ground truth
:param model_id: The model associated with the files we are predicting
:param issuer_name: The unique identifier of the form being processed
:param input_doc_files: List of files to predict
:param key_field_names: The fields we want to extract
:param region: The region where Form Recognizer is deployed
:param subscription_key: The Form Recognizer key
:return:
"""
    # Let's get the fields we want to extract into a list
    anchor_keys = key_field_names.split(',')
for input_file_name in input_doc_files:
fieldcount = 0
field_match_count = 0
keys[issuer_name + ':' + input_file_name] = []
try:
before_call_time = datetime.now().strftime("%H:%M:%S")
# Call to the Form Recognizer service
resp = form_recognizerv2_analyse(region,
subscription_key,
model_id, input_file_name, input_folder_path)
after_call_time = datetime.now().strftime("%H:%M:%S")
            short_file_name = os.path.splitext(input_file_name)[0]
print(f'Searching for GT record {short_file_name}')
# TODO add your file name identifier here from your Ground Truth
df_gt_row = ground_truth_df[ground_truth_df['FILENAME'] == short_file_name]
# loop through anchor keys and identify what was extracted by Form Recognizer
for anchor_key in anchor_keys:
anchor_key = anchor_key.strip()
fields = resp['analyzeResult']['documentResults'][0]['fields']
if anchor_key in fields:
fieldcount += 1
anchor_key_value = str(fields[anchor_key]['text'])
anchor_key_page_num = str(fields[anchor_key]['page'])
confidence = fields[anchor_key]['confidence']
if anchor_key == 'TOTAL':
# TODO add custom total formatting here
anchor_key_value = anchor_key_value.replace(",", "")
# TODO add your custom formatting here if required
"""
anchor_key_value = extract_anchor_key_value(
anchor_key,
anchor_key_value)
"""
# TODO add your custom formatting/normalisation of your Ground Truth here
gt_original_value = str(df_gt_row.iloc[0][anchor_key])
gt_key_value = gt_preprocessing(anchor_key, df_gt_row)
# Does the post processed predicted field match the preprocessed ground truth
if anchor_key_value.lower().strip() == gt_key_value:
field_match_count += 1
print(f'{input_file_name} {anchor_key} Ground Truth: {gt_key_value.upper()} Extracted:'
f' {anchor_key_value.upper()}')
actual_accuracy = field_match_count / fieldcount
# Add key extraction to the output json
keys = build_keys_json_object(keys, input_file_name,
anchor_key, gt_original_value.strip(),
anchor_key_value.strip(),
confidence,
issuer_name,
actual_accuracy,
anchor_key_page_num)
after_processing_time = datetime.now().strftime("%H:%M:%S")
print(f"start call: {before_call_time} "
f"end_call_time: {after_call_time} "
f"end processing: {after_processing_time}")
except Exception as e:
exc_type, _, exc_tb = sys.exc_info()
fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
print(f'Predict Error {e} {exc_type} {fname} {exc_tb.tb_lineno}')
return keys
def get_prediction(region, subscription_key, blob_sas_url, model_id, predict_type):
"""Gets a prediction for a document with the Form Recognizer supervised model"""
print(f"MODEL ID : {model_id}")
headers = {
"Content-Type": "application/pdf",
"Ocp-Apim-Subscription-Key": subscription_key,
}
url = f"https://{region}.api.cognitive.microsoft.com/formrecognizer/v2.0/custom/models/{model_id}/analyze?includeTextDetails=True"
result = None
try:
f = get(blob_sas_url)
resp = post(url=url, data=f.content, headers=headers)
if resp.status_code == 202:
status_url = resp.headers['Operation-Location']
print(f"Invoice analyze submitted. Operation Location: {status_url}")
headers = {"Ocp-Apim-Subscription-Key": subscription_key}
resp = get(url=status_url, headers=headers)
print(resp.json())
count = 0
max_retry = 100
while (count < max_retry and (resp.status_code == 200 or resp.status_code == 429) and (
resp.json()['status'] == 'running' or resp.json()['status'] == 'notStarted')):
resp = get(url=status_url, headers=headers)
time.sleep(1)
count += 1
                print(resp.json()['status'])
            if resp.json()['status'] == 'succeeded':
                result = resp.json()['analyzeResult']
            else:
                print(f"Analysis did not succeed: {resp.json()['status']}")
else:
print(f"Error during analysis: {str(resp.text)}")
except Exception as e:
print(f"Error analyzing invoice : {e}")
prediction = {}
    if result is not None:
try:
prediction['readResults'] = result['readResults']
if predict_type == 'supervised':
prediction['fields'] = []
                for key in result['documentResults'][0]['fields'].keys():
                    f = result['documentResults'][0]['fields'][key]
                    if f is not None:
                        field = {
                            'label': key,
                            'text': f['text'],
                            'confidence': f['confidence'],
                            'boundingBox': f['boundingBox']
                        }
                        prediction['fields'].append(field)
else:
prediction['keyValuePairs'] = result['pageResults'][0]['keyValuePairs']
except Exception as e:
print(f"Prediction is invalid: {e}")
return prediction
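
# Illustrative shape of the prediction returned for a supervised model
# (values are made up):
#
#   {
#       'readResults': [...],
#       'fields': [
#           {'label': 'InvoiceNumber', 'text': 'INV001',
#            'confidence': 0.98, 'boundingBox': [...]}
#       ]
#   }
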
def form_recognizerv2_analyse(region, subscription_key, model_id, file_name, file_name_path):
"""
Analyses a document with the Form Recognizer supervised model
:param base_url: Prefix url for service
:param subscription_key: CogSvc key
:param model_id: Model associated with the document to predict
:param file_name: File name we a predicting
:param file_name_path: Path for file we are predicting
:return: Prediction json response object
"""
headers = {
"Ocp-Apim-Subscription-Key": subscription_key,
"Content-Type": "application/pdf"
}
print(f'Evaluating against model_id {model_id}')
url = f"https://{region}.api.cognitive.microsoft.com/formrecognizer/v2.0/custom/models/{model_id}/analyze?includeTextDetails=True"
print(f'Predict {file_name} {file_name_path}')
try:
        # Use a context manager so the file handle is always closed
        with open(file_name_path + '/' + file_name, 'rb') as doc_file:
            files = {'file': (file_name, doc_file, 'application/pdf', {'Expires': '0'})}
            resp = post(url=url, files=files, headers=headers)
if resp.status_code == 202:
status_url = resp.headers['Operation-Location']
headers = {"Ocp-Apim-Subscription-Key": subscription_key}
resp = get(url=status_url, headers=headers)
            # Poll until the operation terminates - also stop on 'failed' so we
            # do not loop forever, and back off between polls
            while resp.json()['status'] not in ('succeeded', 'failed'):
                time.sleep(1)
                resp = get(url=status_url, headers=headers)
            return resp.json()
else:
print(f"Error predicting {resp.text}")
except Exception as e:
exc_type, _, exc_tb = sys.exc_info()
fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
print(f'Predict error {e} {exc_type} {fname} {exc_tb.tb_lineno}')
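
# Example (illustrative) of consuming the response, mirroring how
# process_folder_and_predict reads it - region, key, model id and paths here
# are placeholders:
#
#   resp = form_recognizerv2_analyse('westeurope', '<key>', '<model-id>',
#                                    'invoice1.pdf', '/tmp/issuer1-train')
#   fields = resp['analyzeResult']['documentResults'][0]['fields']
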
def get_ground_truth():
"""
TODO Add code to retrieve the ground truth from your datastore
:return: Data frame with the Ground Truth
"""
df = None
models_df = None
try:
# TODO load your Ground Truth file
df = pd.read_pickle(Config.GROUND_TRUTH_PATH, compression=None)
# TODO load your model/issuer lookup
models_df = pd.read_csv(Config.MODEL_LOOKUP, delimiter=',', compression=None)
except Exception as e:
exc_type, _, exc_tb = sys.exc_info()
fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
print(f'Error loading files {e} {exc_type} {fname} {exc_tb.tb_lineno}')
return df, models_df
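
# Note: the rest of this script assumes the Ground Truth dataframe has a
# 'FILENAME' column plus one column per key field (see process_folder_and_predict),
# and that the model lookup dataframe has a 'modelId' column plus your unique
# issuer key column (see main).
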
class Config:
"""
Read from .env
"""
TRAINING_END_POINT = os.environ.get("TRAINING_END_POINT") # FR Training endpoint
ANALYZE_END_POINT = os.environ.get("ANALYZE_END_POINT") # OCR endpoint
SUBSCRIPTION_KEY = os.environ.get("SUBSCRIPTION_KEY") # CogSvc key
STORAGE_ACCOUNT_NAME = os.environ.get("STORAGE_ACCOUNT_NAME") # Account name for storage
STORAGE_KEY = os.environ.get("STORAGE_KEY") # The key for the storage account
KEY_FIELD_NAMES = os.environ.get("KEY_FIELD_NAMES") # The fields to be extracted e.g. invoicenumber,date,total
SAS_PREFIX = os.environ.get("SAS_PREFIX") # First part of storage account
SAS = os.environ.get("SAS") # SAS for storage train
SAS_TEST = os.environ.get("SAS") # SAS for storage test
RUN_FOR_SINGLE_ISSUER = os.environ.get("RUN_FOR_SINGLE_ISSUER") # If true process only this issuer
DOC_EXT = os.environ.get("DOC_EXT")
    LANGUAGE_CODE = os.environ.get("LANGUAGE_CODE")  # The language we invoke Read OCR in (only 'en' supported at the time of writing)
GROUND_TRUTH_PATH = os.environ.get("GROUND_TRUTH_PATH") # This is the path to our Ground Truth
LOCAL_WORKING_DIR = os.environ.get(
"LOCAL_WORKING_DIR") # The local temporary directory to which we write and remove
CONTAINER_SUFFIX = os.environ.get(
"CONTAINER_SUFFIX") # The suffix name of the containers that store the training datasets
LIMIT_TRAINING_SET = os.environ.get("LIMIT_TRAINING_SET") # For testing models by file qty trained on
COUNTRY_CODE = os.environ.get("COUNTRY_CODE") # Our country code if needed
MODEL_ID = os.environ.get("MODEL_ID") # Run for a single model
MODEL_LOOKUP = os.environ.get("MODEL_LOOKUP") # The issuer to modelId lookup
TRAIN_TEST = os.environ.get("TRAIN_TEST") # Suffixes train or test to container name
SAMPLE_NUMBER = os.environ.get("SAMPLE_NUMBER") # Sample number of files for prediction
REGION = os.environ.get("REGION") # The region Form Recognizer and OCR are deployed
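
# A minimal example .env for this script might look like the following
# (all values are placeholders, not real credentials):
#
#   SUBSCRIPTION_KEY=<your-form-recognizer-key>
#   REGION=westeurope
#   STORAGE_ACCOUNT_NAME=mystorageaccount
#   STORAGE_KEY=<your-storage-key>
#   KEY_FIELD_NAMES=InvoiceNumber,TOTAL
#   CONTAINER_SUFFIX=data
#   TRAIN_TEST=train
#   SAMPLE_NUMBER=40
#   DOC_EXT=pdf
#   LOCAL_WORKING_DIR=/tmp/fr
#   GROUND_TRUTH_PATH=/tmp/ground_truth.pkl
#   MODEL_LOOKUP=/tmp/model_lookup.csv
#   RUN_FOR_SINGLE_ISSUER=
#   MODEL_ID=
#   COUNTRY_CODE=
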
def main():
"""
:param argv: See input args below
:return: Generates cluster file
"""
rf = Config.LOCAL_WORKING_DIR
# get the ground truth file for the key value extraction
ground_truth_df, models_df = get_ground_truth()
print(f'Downloaded GT and models {models_df}')
block_blob_service = BlockBlobService(
account_name=Config.STORAGE_ACCOUNT_NAME, account_key=Config.STORAGE_KEY)
issuer_name = ''
containers = block_blob_service.list_containers()
for container in containers:
keys = {}
        if Config.RUN_FOR_SINGLE_ISSUER:
# This is where you control what it predicts
temp_container_name = Config.RUN_FOR_SINGLE_ISSUER + Config.CONTAINER_SUFFIX + Config.TRAIN_TEST
if temp_container_name not in container.name:
continue
        if Config.CONTAINER_SUFFIX + Config.TRAIN_TEST \
                not in container.name or container.name[:1] != Config.COUNTRY_CODE:
            continue
issuer_name = container.name[:9]
print(f'Searching model file for issuer {issuer_name.strip()}')
# TODO add your unique identifier key here
model_df = models_df[models_df['Your unique key'] == int(issuer_name.strip())]
        if Config.MODEL_ID:
model_id = Config.MODEL_ID
else:
print(f'Searching for {issuer_name} {len(model_df)}')
model_id = model_df['modelId'].iloc[0]
print(f'Model for vendor {issuer_name} is {model_id}')
vendor_folder_path = f"{rf}/{container.name}"
if not os.path.exists(vendor_folder_path):
os.mkdir(vendor_folder_path)
# Download the files to predict locally
input_doc_files = download_input_files_from_blob_storage(
block_blob_service, container.name, vendor_folder_path, Config.DOC_EXT,
int(Config.SAMPLE_NUMBER))
        keys = process_folder_and_predict(
            keys,
            vendor_folder_path,
            ground_truth_df,
            model_id,
            issuer_name,
            input_doc_files,
            Config.KEY_FIELD_NAMES,
            Config.REGION,
            Config.SUBSCRIPTION_KEY
        )
# Let's clean up to save space
shutil.rmtree(vendor_folder_path)
        print('Predict finished')
        # Let's write the prediction file for this issuer here
        # TODO add versioning here
        with open(f'{Config.LOCAL_WORKING_DIR}/supervised_predict_{issuer_name}_.json',
                  'w') as json_file:
            json.dump(keys, json_file)
        print('Wrote lookup file', str(issuer_name))
if __name__ == "__main__":
main()