From 1b9b5bbd4dd5acfcb644f6b5fd33a02422320808 Mon Sep 17 00:00:00 2001 From: Soujanya8977 <62736952+Soujanya8977@users.noreply.github.com> Date: Mon, 18 Oct 2021 14:15:50 -0500 Subject: [PATCH 1/4] Fixed column_names issue. Fixed column_names issue. --- dxc/ai/clean_data/clean_data.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dxc/ai/clean_data/clean_data.py b/dxc/ai/clean_data/clean_data.py index c17051c..f6e1c54 100644 --- a/dxc/ai/clean_data/clean_data.py +++ b/dxc/ai/clean_data/clean_data.py @@ -1,5 +1,5 @@ import pandas as pd -#import janitor #data cleaning +import janitor #data cleaning from ftfy import fix_text #data cleaning import nltk #data cleaning nltk.download('punkt') #data cleaning @@ -34,10 +34,10 @@ def clean_dataframe(df, impute = False, text_fields = [], date_fields = [], nume clean_df = ( df #make the column names lower case and remove spaces - #.clean_names() + .clean_names() #remove empty columns - #.remove_empty() + .remove_empty() #remove empty rows and columns .dropna(how='all') @@ -69,7 +69,7 @@ def clean_dataframe(df, impute = False, text_fields = [], date_fields = [], nume field = '_'.join(field.split()).lower() clean_df[field] = clean_df[field].astype('category') - #clean_df=clean_df.clean_names() + clean_df=clean_df.clean_names() globals_file.clean_data_used = True From 3e5b15274c4c4d2ac36c7c0a9b0833d331b66f86 Mon Sep 17 00:00:00 2001 From: Soujanya8977 <62736952+Soujanya8977@users.noreply.github.com> Date: Mon, 18 Oct 2021 14:21:34 -0500 Subject: [PATCH 2/4] Fixed column_name issue. Fixed column_name issue. 
--- dxc/ai/pipeline/pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dxc/ai/pipeline/pipeline.py b/dxc/ai/pipeline/pipeline.py index c349ab3..4607d2e 100644 --- a/dxc/ai/pipeline/pipeline.py +++ b/dxc/ai/pipeline/pipeline.py @@ -27,7 +27,7 @@ def insert_collection(data_layer, collection_name, df): def write_raw_data(data_layer, raw_data, arrow_date_fields = []): ##make the column names lower case and remove spaces if globals_file.clean_data_used == True: - #raw_data = raw_data.clean_names() + raw_data = raw_data.clean_names() globals_file.wrt_raw_data_used = True globals_file.clean_data_used = False ##convert your raw data into writable data by converting Arrow dates to strings From b3ad0730563beb78944b0d951617ccdc6668a0d0 Mon Sep 17 00:00:00 2001 From: Soujanya8977 <62736952+Soujanya8977@users.noreply.github.com> Date: Mon, 18 Oct 2021 14:41:56 -0500 Subject: [PATCH 3/4] Updated pyjanitor version Updated pyjanitor version --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index a8b5e47..8947794 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,14 +8,14 @@ ftfy==6.0.3 interpret_community==0.19.3 missingno==0.5.0 arrow==1.1.1 -pyjanitor==0.21.0 +pyjanitor==0.21.2 pyaf==3.0 pandas_profiling==3.0.0 datacleaner==0.1.5 Algorithmia==1.10.0 GitPython==3.1.18 ipython>=7.16.1 -janitor==0.1.1 +#janitor==0.1.1 raiwidgets==0.9.4 scikit-learn flatten-json==0.1.13 From 3bb1755289122fa3d807022aac585f9ad81c9166 Mon Sep 17 00:00:00 2001 From: Soujanya8977 <62736952+Soujanya8977@users.noreply.github.com> Date: Mon, 18 Oct 2021 14:46:58 -0500 Subject: [PATCH 4/4] updated pyjanitor version updated pyjanitor version --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 8947794..d5a7c47 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,7 +8,7 @@ ftfy==6.0.3 interpret_community==0.19.3 
missingno==0.5.0 arrow==1.1.1 -pyjanitor==0.21.2 +pyjanitor==0.20.0 pyaf==3.0 pandas_profiling==3.0.0 datacleaner==0.1.5