-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
4e1380a
commit 3347266
Showing
12 changed files
with
1,255 additions
and
0 deletions.
There are no files selected for viewing
Binary file not shown.
Large diffs are not rendered by default.
Oops, something went wrong.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
import numpy as np | ||
import pandas as pd | ||
from imputation import Impute | ||
from missing_mat import MissMat | ||
import statsmodels.api as sm | ||
|
||
TEST = 700 | ||
filename = 'FRED-MD.csv' | ||
|
||
def get_regression_data(column_y, t_end=733, method=0, r=3): | ||
data = pd.read_csv(filename) | ||
y = data[column_y].to_numpy()[:t_end] | ||
x = data.drop(['Date', column_y], axis=1).to_numpy()[:t_end] | ||
w = np.ones(x.shape) | ||
w[np.isnan(x)] = 0 | ||
mdata = MissMat(x, w) | ||
mc = Impute(mdata) | ||
if method == 0: | ||
_ = mc.amputation(r) | ||
F = mc.F | ||
if method == 1: | ||
_ = mc.fit_via_TP(r) | ||
F = mc.F | ||
if method == 2: | ||
_ = mc.fit_via_Weight(r) | ||
F = mc.F | ||
if method == 3: | ||
_ = mc.fit_via_Nuclear(r) | ||
F = mc.F | ||
if method == 4: | ||
_ = mc.fit_via_PQ(r) | ||
F = mc.F | ||
return y, F | ||
|
||
|
||
def one_step_predict(column_y, step=0, lags=4, method=0): | ||
y, F = get_regression_data(column_y, t_end=TEST+step, method=method) | ||
data = pd.DataFrame(F, columns=['f1','f2','f3']) | ||
data['y'] = y | ||
cols = ['y', 'f1','f2','f3'] | ||
data = data[cols] | ||
for l in range(lags): | ||
data = pd.concat([data, data['y'].shift(l+1).rename('y_{}'.format(l+1))], axis=1) | ||
data = pd.concat([data, data['f1'].shift(l+1).rename('f1_{}'.format(l+1))], axis=1) | ||
data = pd.concat([data, data['f2'].shift(l+1).rename('f2_{}'.format(l+1))], axis=1) | ||
data = pd.concat([data, data['f3'].shift(l+1).rename('f3_{}'.format(l+1))], axis=1) | ||
data = data.iloc[lags:] | ||
model = sm.OLS(data['y'],data.iloc[:,4:]) | ||
results = model.fit() | ||
yhat = results.predict(data.iloc[-1:,:len(data.columns)-4].to_numpy()) | ||
return yhat[0] | ||
|
||
|
||
def out_sample_mse(column_y, method=0): | ||
yhats = [] | ||
for t in range(733-TEST): | ||
yhats.append(one_step_predict(column_y, step=t, method=method)) | ||
yhats = np.array(yhats) | ||
data = pd.read_csv(filename) | ||
y = data[column_y].to_numpy()[TEST:] | ||
rmse = np.mean((y - yhats)**2) | ||
return rmse, yhats | ||
|
||
""" | ||
cols = ['INDPRO', 'UNRATE', 'CPIAUCSL'] | ||
y, F = get_regression_data(cols[0], method=0) | ||
rmse = out_sample_mse(cols[2]) | ||
print(rmse) | ||
rmse = out_sample_mse(cols[2], method=1) | ||
print(rmse) | ||
rmse = out_sample_mse(cols[2], method=2) | ||
print(rmse) | ||
rmse = out_sample_mse(cols[2], method=3) | ||
print(rmse) | ||
""" | ||
|
||
import matplotlib.pyplot as plt | ||
|
||
cols = ['INDPRO', 'UNRATE', 'CPIAUCSL'] | ||
rmse, yhats = out_sample_mse(cols[0]) | ||
df = pd.read_csv(filename) | ||
y = df[cols[0]].to_numpy() | ||
yhat = np.copy(y) | ||
yhat[700:] = yhats | ||
plt.figure(figsize=(10,5)) | ||
plt.plot(y[630:], color='blue', label='Real IP Index') | ||
plt.plot(yhat[630:], color='black', ls='-', lw=2,label='1 step forecast of IP Index') | ||
plt.legend() | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
import numpy as np | ||
|
||
|
||
def get_data(N, T, r): | ||
F = np.random.normal(size=(N, r)) | ||
L = np.random.normal(size=(T, r)) | ||
e = np.random.normal(size=(N, T)) | ||
T = F @ L.T + e | ||
return T | ||
|
||
|
||
def get_missing_lr(N, T, r, N_o, T_o, p): | ||
W = np.ones((N,T)) | ||
Wb = np.random.choice([0,1], size=(N-N_o, T-T_o), p=[p,1-p]) | ||
W[N_o:, T_o:] = Wb | ||
return W |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,134 @@ | ||
import numpy as np | ||
import pandas as pd | ||
from missing_mat import MissMat | ||
from get_data import get_missing_lr, get_data | ||
from sklearn.decomposition import PCA | ||
from matrix_completion import svt_solve | ||
from tools import SGD | ||
|
||
|
||
class Impute(): | ||
|
||
def __init__(self, MissData: MissMat): | ||
self.MissData = MissData | ||
|
||
def fit_via_TW(self, r): | ||
# PCA on Tall block | ||
tall = self.MissData.get_tall() | ||
pca = PCA(n_components=r) | ||
pca.fit(tall) | ||
F_tall = pca.transform(tall) | ||
Lambda_tall = np.linalg.solve(F_tall.T @ F_tall, F_tall.T @ tall).T | ||
|
||
# PCA on Wide block | ||
wide = self.MissData.get_wide().T | ||
pca.fit(wide) | ||
Lambda_wide = pca.transform(wide) | ||
|
||
# regress Lambda_tall on submatrix of Lambda_wide | ||
N_o = Lambda_tall.shape[0] | ||
Lsub = Lambda_wide[:N_o] | ||
H = np.linalg.solve(Lsub.T @ Lsub, Lsub.T @ Lambda_tall) | ||
C = F_tall @ H @ Lambda_wide.T | ||
|
||
Xout = np.zeros((self.MissData.N, self.MissData.T)) | ||
Xout[self.MissData.W == 1] = self.MissData.X[self.MissData.W == 1] | ||
Xout[self.MissData.W == 0] = C[self.MissData.W == 0] | ||
self.Xout = Xout | ||
self.F = F_tall | ||
return Xout | ||
|
||
def fit_via_TP(self, r): | ||
# PCA on Tall block | ||
tall, rest, W_rest = self.MissData.get_tall_rest() | ||
pca = PCA(n_components=r) | ||
pca.fit(tall) | ||
F_tall = pca.transform(tall) | ||
|
||
N, T = self.MissData.N, self.MissData.T | ||
T_o = tall.shape[1] | ||
Xm = np.zeros((N, T-T_o)) | ||
Ls = np.zeros((T-T_o, r)) | ||
for i in range(T-T_o): | ||
x, w = rest[:,i], W_rest[:,i] | ||
f = F_tall[w==1] | ||
x = x[w==1] | ||
l = np.linalg.solve(f.T @ f, f.T @ x) | ||
Ls[i] = l | ||
Xm = F_tall @ Ls.T | ||
Xout = np.zeros((self.MissData.N, self.MissData.T)) | ||
Xout[self.MissData.W == 1] = self.MissData.X[self.MissData.W == 1] | ||
Xout[:,T_o:][self.MissData.W[:,T_o:] == 0] = Xm[self.MissData.W[:,T_o:] == 0] | ||
self.Xout = Xout | ||
self.F = F_tall | ||
return Xout | ||
|
||
def fit_via_Weight(self, r): | ||
N, T = self.MissData.N, self.MissData.T | ||
cov = pd.DataFrame(self.MissData.X).cov().to_numpy() | ||
cov[np.isnan(cov)] = np.mean(cov[~np.isnan(cov)]) | ||
eigval, eigvec = np.linalg.eig(cov) | ||
Lambda = np.sqrt(T)*eigvec[:,:r] | ||
F = np.zeros((N, r)) | ||
for i in range(N): | ||
x = np.copy(self.MissData.X[i]) | ||
x[self.MissData.W[i]==0] = 0 | ||
F[i] = np.linalg.solve(Lambda.T @ np.diag(self.MissData.W[i]) @ Lambda, | ||
Lambda.T @ x) | ||
C = F @ Lambda.T | ||
Xout = np.zeros((self.MissData.N, self.MissData.T)) | ||
Xout[self.MissData.W == 1] = self.MissData.X[self.MissData.W == 1] | ||
Xout[self.MissData.W == 0] = C[self.MissData.W == 0] | ||
self.Xout = Xout | ||
self.F = F | ||
return Xout | ||
|
||
def fit_via_Nuclear(self, r): | ||
C = svt_solve(self.MissData.X, self.MissData.W) | ||
Xout = np.zeros((self.MissData.N, self.MissData.T)) | ||
Xout[self.MissData.W == 1] = self.MissData.X[self.MissData.W == 1] | ||
Xout[self.MissData.W == 0] = C[self.MissData.W == 0] | ||
self.Xout = Xout | ||
pca = PCA(n_components=r) | ||
pca.fit(Xout) | ||
self.F = pca.transform(Xout) | ||
return Xout | ||
|
||
def fit_via_PQ(self, r): | ||
F, L = SGD(self.MissData.X_sparse, r, gamma=0.01, lamda=0.1, steps=200) | ||
C = F @ L | ||
Xout = np.zeros((self.MissData.N, self.MissData.T)) | ||
Xout[self.MissData.W == 1] = self.MissData.X[self.MissData.W == 1] | ||
Xout[self.MissData.W == 0] = C[self.MissData.W == 0] | ||
self.Xout = Xout | ||
self.F = F | ||
return Xout | ||
|
||
def amputation(self, r): | ||
row_sum = np.sum(self.MissData.W, axis=0) | ||
Xout = self.MissData.X[:, row_sum == self.MissData.N] | ||
self.Xout = Xout | ||
pca = PCA(n_components=r) | ||
pca.fit(Xout) | ||
self.F = pca.transform(Xout) | ||
return Xout | ||
|
||
|
||
if __name__ == "__main__": | ||
N, T, r = 200, 100, 3 | ||
X = get_data(N, T, r) | ||
W = get_missing_lr(N, T, r, 150, 80, 0.5) | ||
Xm = np.copy(X) | ||
Xm[W==0] = np.nan | ||
M = MissMat(Xm, W) | ||
imp = Impute(M) | ||
Xhat = imp.fit_via_TW(r) | ||
print(np.mean((Xhat[W==0] - X[W==0])**2)) | ||
Xhat = imp.fit_via_TP(r) | ||
print(np.mean((Xhat[W==0] - X[W==0])**2)) | ||
Xhat = imp.fit_via_Weight(r) | ||
print(np.mean((Xhat[W==0] - X[W==0])**2)) | ||
Xhat = imp.fit_via_Nuclear() | ||
print(np.mean((Xhat[W==0] - X[W==0])**2)) | ||
Xhat = imp.fit_via_PQ(r) | ||
print(np.mean((Xhat[W==0] - X[W==0])**2)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
import numpy as np | ||
from scipy.sparse import coo_matrix | ||
|
||
|
||
class MissMat(): | ||
|
||
def __init__(self, X, W): | ||
self.X = X | ||
self.W = W | ||
self.N, self.T = X.shape | ||
self.verify_data() | ||
self.X_sparse = self.get_sparse() | ||
|
||
|
||
def verify_data(self): | ||
if type(self.X) is not np.ndarray or type(self.W) is not np.ndarray: | ||
raise RuntimeError("Data is not numpy array, type: {}, {}." | ||
.format(type(self.X), type(self.W))) | ||
|
||
if self.X.shape != self.W.shape: | ||
raise RuntimeError("Shape not equal, {}, {}". | ||
format(self.X.shape, self.W.shape)) | ||
|
||
if set(self.W.reshape(-1,1).squeeze()) != {0,1}: | ||
raise RuntimeError("Value of W is not binary, {}.".format(set(self.W.reshape(-1,1).squeeze()))) | ||
|
||
|
||
def sort(self): | ||
# sort columns | ||
row_sum = np.sum(self.W, axis=0) | ||
self.idx_col = np.argsort(-row_sum) | ||
W_sort_col = self.W[:,self.idx_col] | ||
X_sort_col = self.X[:, self.idx_col] | ||
# sort rows | ||
col_sum = np.sum(W_sort_col, axis=1) | ||
self.idx_row = np.argsort(-col_sum) | ||
W_sorted = W_sort_col[self.idx_row, :] | ||
X_sorted = X_sort_col[self.idx_row, :] | ||
self.W = W_sorted | ||
self.X = X_sorted | ||
|
||
|
||
def un_sort(self, X, W): | ||
X_tmp = X[self.idx_row,:] | ||
W_tmp = W[self.idx_row,:] | ||
X_unsort = X_tmp[:,self.idx_col] | ||
W_unsort = W_tmp[:,self.idx_col] | ||
return X_unsort, W_unsort | ||
|
||
|
||
def get_sparse(self): | ||
M = np.copy(self.X) | ||
M[self.W==0] = 0 | ||
return coo_matrix(M) | ||
|
||
|
||
def demean(self): | ||
mu = np.mean(self.X[self.W==1]) | ||
self.X -= mu | ||
|
||
|
||
def get_tall(self): | ||
row_sum = np.sum(self.W, axis=0) | ||
return self.X[:,row_sum==self.N] | ||
|
||
|
||
def get_wide(self): | ||
col_sum = np.sum(self.W, axis=1) | ||
return self.X[col_sum==self.T, :] | ||
|
||
|
||
def get_tall_rest(self): | ||
row_sum = np.sum(self.W, axis=0) | ||
return self.X[:,row_sum==self.N], self.X[:,row_sum<self.N], self.W[:,row_sum<self.N] | ||
|
||
|
||
|
||
if __name__ == "__main__": | ||
X = np.arange(25).reshape(5,5) | ||
W = np.array([[0, 0, 0, 0, 0], | ||
[0, 1, 1, 1, 0], | ||
[1, 0, 1, 1, 0], | ||
[1, 0, 1, 0, 0], | ||
[1, 0, 1, 0, 0]]) | ||
M = MissMat(X, W) | ||
M.sort() | ||
print(X) | ||
print(W) | ||
print(M.X) | ||
print(M.W) | ||
print(M.get_tall()) | ||
print(M.get_wide()) | ||
print(M.X_sparse) | ||
print(M.un_sort(M.X, M.W)) |
Oops, something went wrong.