dataloader.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import glob
import multiprocessing
import time

import numpy as np
import pandas as pd

#***********************************CODE USAGE GUIDE***************************************
# Work for FSE 2018.
# Not run directly; it is invoked by cascading_clustering.py.
# dataloader.py loads the datasets using multiple processes.
# Description:
# It loads the log sequence matrix files and a KPI file into memory, removing
# duplicate events after loading. To speed up loading a large number of log
# sequence matrix files, we use the Python multiprocessing module.
#******************************************************************************************
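# The `para` dictionary consumed below is expected to carry at least the two
# keys this module actually reads ('seq_folder' and 'proc_num'); other keys
# may be set in run.py. A minimal sketch, with illustrative values only:
#   para = {'seq_folder': './data/seq/',  # folder holding the timeInter_*.csv files
#           'proc_num': 4}                # worker processes for the Pool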
def loading_all_data(para):
    """ Load all log sequence matrices, remove duplicates, and count the number of
    log sequences that contain each event (used for correlation weighting in section 3.2).

    Args:
    --------
    para: the dictionary of parameters, set in run.py

    Returns:
    --------
    allrawData: all loaded log sequence matrices, merged into one big matrix of shape (N, M),
        where N is the total number of log sequences and M is the number of events.
    rawIndex: index list used to mark which log sequences are clustered.
    eveOccuMat: per-file counts of the log sequences that contain each event, used for weighting.
    """
    t0 = time.time()
    # find all log sequence matrix files.
    path = para['seq_folder']
    fileList = glob.glob(path + 'timeInter_*.csv')
    fileNumList = []
    for file in fileList:
        fileNum = file.replace(path + 'timeInter_', '').replace('.csv', '')
        fileNumList.append(int(fileNum))
    print("there are %d log sequence files found" % (len(fileNumList)))
    newfileList = []
    for x in sorted(fileNumList):
        newfileList.append(path + 'timeInter_' + str(x) + '.csv')
    # load all the files using multiprocessing.
    print('start loading data')
    pool = multiprocessing.Pool(para['proc_num'])
    rawdataList = pool.map(load_single_file, newfileList)
    pool.close()
    pool.join()
    allrawData = np.vstack(rawdataList)
    # index used to mark which log sequences are already processed
    rawIndex = list(range(allrawData.shape[0]))
    # count, per file, the number of log sequences that contain each event;
    # rows are binarized in load_single_file, so a column sum is an occurrence count.
    eveOccuMat = []
    for inter_data in rawdataList:
        eveOccuMat.append(np.sum(inter_data, axis=0))
    eveOccuMat = np.array(eveOccuMat)
    print('Step 1. Data Loading, the raw input data size is %d, it takes %f' % (allrawData.shape[0], time.time() - t0))
    return allrawData, rawIndex, eveOccuMat

def load_kpi(kpipath):
    """ Load the KPI data.

    Args:
    --------
    kpipath: data path of the KPI file

    Returns:
    --------
    kpiList: list of KPIs, one KPI value per time interval.
    """
    df = pd.read_csv(kpipath, dtype=int, header=None)
    kpiList = df.to_numpy()  # as_matrix() was removed in pandas 1.0
    return kpiList

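# A plausible layout for the KPI file read by load_kpi above (assumed here:
# one integer KPI value per time interval, one per row, no header row):
#   0
#   1
#   0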
def load_single_file(filepath):
    """ Load one log sequence matrix from the file path, removing duplicate events.

    Args:
    --------
    filepath: file path of a log sequence matrix

    Returns:
    --------
    rawData: log sequence matrix (duplicates removed)
    """
    df = pd.read_csv(filepath, header=None)
    rawData = df.to_numpy()
    # clip repeated occurrences to 1 so each event counts at most once per sequence
    rawData[rawData > 1] = 1
    return rawData
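
# Minimal smoke-test sketch, assuming the 'seq_folder' and 'proc_num' keys used
# above; the folder path and process count here are illustrative only, and in
# the actual pipeline this module is driven by cascading_clustering.py.
if __name__ == '__main__':
    para = {'seq_folder': './data/seq/',  # assumed location of timeInter_*.csv files
            'proc_num': 4}                # assumed number of worker processes
    allrawData, rawIndex, eveOccuMat = loading_all_data(para)
    print('merged matrix shape:', allrawData.shape)
    print('per-file event occurrence matrix shape:', eveOccuMat.shape)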