-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrainingModel.py
114 lines (77 loc) · 4.64 KB
/
trainingModel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
"""
This is the Entry point for Training the Machine Learning Model.
Written By: Arpit Kumar
Version: 1.0
Revisions: None
"""
# Doing the necessary imports
from sklearn.model_selection import train_test_split
from data_ingestion import data_loader
from data_preprocessing import preprocessing
from data_preprocessing import clustering
from best_model_finder import tuner
from file_operations import file_methods
from application_logging import logger
# Training entry point; the shared logging object is created in trainModel.__init__
class trainModel:
    """Entry point that runs the whole model-training pipeline.

    The pipeline loads the training data, preprocesses it, assigns every row
    to a cluster and then, for each cluster, searches for the best model and
    saves it. Progress messages are written to
    ``Training_Logs/ModelTrainingLog.txt`` through the shared ``App_Logger``.
    """

    def __init__(self) -> None:
        """Create the shared logger and open the training log in append mode."""
        self.log_writer = logger.App_Logger()
        # The handle is closed by trainingModel() once the run ends,
        # whether it succeeded or failed.
        self.file_object = open("Training_Logs/ModelTrainingLog.txt", 'a+')

    def trainingModel(self) -> None:
        """Run the training pipeline once and save one model per cluster.

        Steps, in order: load the data, drop unused columns, replace invalid
        values, impute missing values when any are reported, split features
        from the ``VISIBILITY`` label, cluster the features, then train, pick
        and save the best model for every cluster.

        The log file opened in ``__init__`` is always closed before this
        method returns or raises.

        Raises:
            Exception: any error raised by a pipeline step is logged as
                'Unsuccessful End of Training' and then re-raised unchanged,
                so callers see the original exception type, message and
                traceback.
        """
        # Logging the start of Training
        self.log_writer.log(self.file_object, 'Start of Training')
        try:
            # Getting the data from the source
            data_getter = data_loader.Data_Getter(self.file_object, self.log_writer)
            data = data_getter.get_data()

            # --- data preprocessing ---
            preprocessor = preprocessing.Preprocessor(self.file_object, self.log_writer)

            # Remove unwanted columns (chosen during the EDA in the notebook).
            data = preprocessor.dropUnnecessaryColumns(
                data,
                ['DATE', 'Precip', 'WETBULBTEMPF', 'DewPointTempF', 'StationPressure'],
            )

            # Replace invalid values with nulls (the original author's note
            # says '?' entries become np.nan, as discussed in the EDA).
            data = preprocessor.replaceInvalidValuesWithNull(data)

            # Check whether missing values are present and impute them if so.
            # The list of affected columns is returned too but is not used here.
            is_null_present, cols_with_missing_values = preprocessor.is_null_present(data)
            if is_null_present:
                data = preprocessor.impute_missing_values(data)

            # Create separate features and labels.
            X, Y = preprocessor.separate_label_feature(data, label_column_name='VISIBILITY')

            # --- clustering ---
            kmeans = clustering.KMeansClustering(self.file_object, self.log_writer)
            # Use the elbow plot to find the optimum number of clusters.
            number_of_clusters = kmeans.elbow_plot(X)
            # Divide the data into clusters; the result is read back below
            # through its 'Cluster' column.
            X = kmeans.create_clusters(X, number_of_clusters)

            # Attach the target values so each cluster subset carries its labels.
            X['Labels'] = Y

            # --- per-cluster model search ---
            for i in X['Cluster'].unique():
                # Filter the data for one cluster.
                cluster_data = X[X['Cluster'] == i]

                # Prepare the feature and label columns.
                cluster_features = cluster_data.drop(['Labels', 'Cluster'], axis=1)
                cluster_label = cluster_data['Labels']

                # Split this cluster's data into training and test sets.
                x_train, x_test, y_train, y_test = train_test_split(
                    cluster_features, cluster_label, test_size=1 / 3, random_state=36
                )

                # NOTE(review): train and test are scaled by two separate
                # calls. If standardScalingData fits a fresh scaler on every
                # call, the test split is standardised with its own statistics
                # rather than the training statistics - confirm against
                # Preprocessor before relying on the test scores.
                x_train_scaled = preprocessor.standardScalingData(x_train)
                x_test_scaled = preprocessor.standardScalingData(x_test)

                # Get the best model for this cluster.
                model_finder = tuner.Model_Finder(self.file_object, self.log_writer)
                best_model_name, best_model = model_finder.get_best_model(
                    x_train_scaled, y_train, x_test_scaled, y_test
                )

                # Save the best model; the cluster id is appended to the
                # model name so every cluster gets its own saved model.
                file_op = file_methods.File_Operation(self.file_object, self.log_writer)
                file_op.save_model(best_model, best_model_name + str(i))

            # Logging the successful Training
            self.log_writer.log(self.file_object, 'Successful End of Training')
        except Exception:
            # Logging the unsuccessful Training
            self.log_writer.log(self.file_object, 'Unsuccessful End of Training')
            # Bare raise keeps the original exception (type, message and
            # traceback) instead of replacing it with an empty Exception.
            raise
        finally:
            # Single place where the log file is released on every path.
            self.file_object.close()