Skip to content

Commit

Permalink
first commit
Browse files Browse the repository at this point in the history
  • Loading branch information
freshtaste committed Jun 5, 2021
1 parent 4e1380a commit 3347266
Show file tree
Hide file tree
Showing 12 changed files with 1,255 additions and 0 deletions.
Binary file added src/.DS_Store
Binary file not shown.
734 changes: 734 additions & 0 deletions src/FRED-MD.csv

Large diffs are not rendered by default.

Binary file added src/__pycache__/get_data.cpython-38.pyc
Binary file not shown.
Binary file added src/__pycache__/imputation.cpython-38.pyc
Binary file not shown.
Binary file added src/__pycache__/missing_mat.cpython-38.pyc
Binary file not shown.
Binary file added src/__pycache__/tools.cpython-38.pyc
Binary file not shown.
89 changes: 89 additions & 0 deletions src/empirical.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import numpy as np
import pandas as pd
from imputation import Impute
from missing_mat import MissMat
import statsmodels.api as sm

TEST = 700
filename = 'FRED-MD.csv'

def get_regression_data(column_y, t_end=733, method=0, r=3):
data = pd.read_csv(filename)
y = data[column_y].to_numpy()[:t_end]
x = data.drop(['Date', column_y], axis=1).to_numpy()[:t_end]
w = np.ones(x.shape)
w[np.isnan(x)] = 0
mdata = MissMat(x, w)
mc = Impute(mdata)
if method == 0:
_ = mc.amputation(r)
F = mc.F
if method == 1:
_ = mc.fit_via_TP(r)
F = mc.F
if method == 2:
_ = mc.fit_via_Weight(r)
F = mc.F
if method == 3:
_ = mc.fit_via_Nuclear(r)
F = mc.F
if method == 4:
_ = mc.fit_via_PQ(r)
F = mc.F
return y, F


def one_step_predict(column_y, step=0, lags=4, method=0):
y, F = get_regression_data(column_y, t_end=TEST+step, method=method)
data = pd.DataFrame(F, columns=['f1','f2','f3'])
data['y'] = y
cols = ['y', 'f1','f2','f3']
data = data[cols]
for l in range(lags):
data = pd.concat([data, data['y'].shift(l+1).rename('y_{}'.format(l+1))], axis=1)
data = pd.concat([data, data['f1'].shift(l+1).rename('f1_{}'.format(l+1))], axis=1)
data = pd.concat([data, data['f2'].shift(l+1).rename('f2_{}'.format(l+1))], axis=1)
data = pd.concat([data, data['f3'].shift(l+1).rename('f3_{}'.format(l+1))], axis=1)
data = data.iloc[lags:]
model = sm.OLS(data['y'],data.iloc[:,4:])
results = model.fit()
yhat = results.predict(data.iloc[-1:,:len(data.columns)-4].to_numpy())
return yhat[0]


def out_sample_mse(column_y, method=0):
yhats = []
for t in range(733-TEST):
yhats.append(one_step_predict(column_y, step=t, method=method))
yhats = np.array(yhats)
data = pd.read_csv(filename)
y = data[column_y].to_numpy()[TEST:]
rmse = np.mean((y - yhats)**2)
return rmse, yhats

"""
cols = ['INDPRO', 'UNRATE', 'CPIAUCSL']
y, F = get_regression_data(cols[0], method=0)
rmse = out_sample_mse(cols[2])
print(rmse)
rmse = out_sample_mse(cols[2], method=1)
print(rmse)
rmse = out_sample_mse(cols[2], method=2)
print(rmse)
rmse = out_sample_mse(cols[2], method=3)
print(rmse)
"""

import matplotlib.pyplot as plt

cols = ['INDPRO', 'UNRATE', 'CPIAUCSL']
rmse, yhats = out_sample_mse(cols[0])
df = pd.read_csv(filename)
y = df[cols[0]].to_numpy()
yhat = np.copy(y)
yhat[700:] = yhats
plt.figure(figsize=(10,5))
plt.plot(y[630:], color='blue', label='Real IP Index')
plt.plot(yhat[630:], color='black', ls='-', lw=2,label='1 step forecast of IP Index')
plt.legend()

16 changes: 16 additions & 0 deletions src/get_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import numpy as np


def get_data(N, T, r):
F = np.random.normal(size=(N, r))
L = np.random.normal(size=(T, r))
e = np.random.normal(size=(N, T))
T = F @ L.T + e
return T


def get_missing_lr(N, T, r, N_o, T_o, p):
W = np.ones((N,T))
Wb = np.random.choice([0,1], size=(N-N_o, T-T_o), p=[p,1-p])
W[N_o:, T_o:] = Wb
return W
134 changes: 134 additions & 0 deletions src/imputation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
import numpy as np
import pandas as pd
from missing_mat import MissMat
from get_data import get_missing_lr, get_data
from sklearn.decomposition import PCA
from matrix_completion import svt_solve
from tools import SGD


class Impute():

def __init__(self, MissData: MissMat):
self.MissData = MissData

def fit_via_TW(self, r):
# PCA on Tall block
tall = self.MissData.get_tall()
pca = PCA(n_components=r)
pca.fit(tall)
F_tall = pca.transform(tall)
Lambda_tall = np.linalg.solve(F_tall.T @ F_tall, F_tall.T @ tall).T

# PCA on Wide block
wide = self.MissData.get_wide().T
pca.fit(wide)
Lambda_wide = pca.transform(wide)

# regress Lambda_tall on submatrix of Lambda_wide
N_o = Lambda_tall.shape[0]
Lsub = Lambda_wide[:N_o]
H = np.linalg.solve(Lsub.T @ Lsub, Lsub.T @ Lambda_tall)
C = F_tall @ H @ Lambda_wide.T

Xout = np.zeros((self.MissData.N, self.MissData.T))
Xout[self.MissData.W == 1] = self.MissData.X[self.MissData.W == 1]
Xout[self.MissData.W == 0] = C[self.MissData.W == 0]
self.Xout = Xout
self.F = F_tall
return Xout

def fit_via_TP(self, r):
# PCA on Tall block
tall, rest, W_rest = self.MissData.get_tall_rest()
pca = PCA(n_components=r)
pca.fit(tall)
F_tall = pca.transform(tall)

N, T = self.MissData.N, self.MissData.T
T_o = tall.shape[1]
Xm = np.zeros((N, T-T_o))
Ls = np.zeros((T-T_o, r))
for i in range(T-T_o):
x, w = rest[:,i], W_rest[:,i]
f = F_tall[w==1]
x = x[w==1]
l = np.linalg.solve(f.T @ f, f.T @ x)
Ls[i] = l
Xm = F_tall @ Ls.T
Xout = np.zeros((self.MissData.N, self.MissData.T))
Xout[self.MissData.W == 1] = self.MissData.X[self.MissData.W == 1]
Xout[:,T_o:][self.MissData.W[:,T_o:] == 0] = Xm[self.MissData.W[:,T_o:] == 0]
self.Xout = Xout
self.F = F_tall
return Xout

def fit_via_Weight(self, r):
N, T = self.MissData.N, self.MissData.T
cov = pd.DataFrame(self.MissData.X).cov().to_numpy()
cov[np.isnan(cov)] = np.mean(cov[~np.isnan(cov)])
eigval, eigvec = np.linalg.eig(cov)
Lambda = np.sqrt(T)*eigvec[:,:r]
F = np.zeros((N, r))
for i in range(N):
x = np.copy(self.MissData.X[i])
x[self.MissData.W[i]==0] = 0
F[i] = np.linalg.solve(Lambda.T @ np.diag(self.MissData.W[i]) @ Lambda,
Lambda.T @ x)
C = F @ Lambda.T
Xout = np.zeros((self.MissData.N, self.MissData.T))
Xout[self.MissData.W == 1] = self.MissData.X[self.MissData.W == 1]
Xout[self.MissData.W == 0] = C[self.MissData.W == 0]
self.Xout = Xout
self.F = F
return Xout

def fit_via_Nuclear(self, r):
C = svt_solve(self.MissData.X, self.MissData.W)
Xout = np.zeros((self.MissData.N, self.MissData.T))
Xout[self.MissData.W == 1] = self.MissData.X[self.MissData.W == 1]
Xout[self.MissData.W == 0] = C[self.MissData.W == 0]
self.Xout = Xout
pca = PCA(n_components=r)
pca.fit(Xout)
self.F = pca.transform(Xout)
return Xout

def fit_via_PQ(self, r):
F, L = SGD(self.MissData.X_sparse, r, gamma=0.01, lamda=0.1, steps=200)
C = F @ L
Xout = np.zeros((self.MissData.N, self.MissData.T))
Xout[self.MissData.W == 1] = self.MissData.X[self.MissData.W == 1]
Xout[self.MissData.W == 0] = C[self.MissData.W == 0]
self.Xout = Xout
self.F = F
return Xout

def amputation(self, r):
row_sum = np.sum(self.MissData.W, axis=0)
Xout = self.MissData.X[:, row_sum == self.MissData.N]
self.Xout = Xout
pca = PCA(n_components=r)
pca.fit(Xout)
self.F = pca.transform(Xout)
return Xout


if __name__ == "__main__":
N, T, r = 200, 100, 3
X = get_data(N, T, r)
W = get_missing_lr(N, T, r, 150, 80, 0.5)
Xm = np.copy(X)
Xm[W==0] = np.nan
M = MissMat(Xm, W)
imp = Impute(M)
Xhat = imp.fit_via_TW(r)
print(np.mean((Xhat[W==0] - X[W==0])**2))
Xhat = imp.fit_via_TP(r)
print(np.mean((Xhat[W==0] - X[W==0])**2))
Xhat = imp.fit_via_Weight(r)
print(np.mean((Xhat[W==0] - X[W==0])**2))
Xhat = imp.fit_via_Nuclear()
print(np.mean((Xhat[W==0] - X[W==0])**2))
Xhat = imp.fit_via_PQ(r)
print(np.mean((Xhat[W==0] - X[W==0])**2))
94 changes: 94 additions & 0 deletions src/missing_mat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
import numpy as np
from scipy.sparse import coo_matrix


class MissMat():

def __init__(self, X, W):
self.X = X
self.W = W
self.N, self.T = X.shape
self.verify_data()
self.X_sparse = self.get_sparse()


def verify_data(self):
if type(self.X) is not np.ndarray or type(self.W) is not np.ndarray:
raise RuntimeError("Data is not numpy array, type: {}, {}."
.format(type(self.X), type(self.W)))

if self.X.shape != self.W.shape:
raise RuntimeError("Shape not equal, {}, {}".
format(self.X.shape, self.W.shape))

if set(self.W.reshape(-1,1).squeeze()) != {0,1}:
raise RuntimeError("Value of W is not binary, {}.".format(set(self.W.reshape(-1,1).squeeze())))


def sort(self):
# sort columns
row_sum = np.sum(self.W, axis=0)
self.idx_col = np.argsort(-row_sum)
W_sort_col = self.W[:,self.idx_col]
X_sort_col = self.X[:, self.idx_col]
# sort rows
col_sum = np.sum(W_sort_col, axis=1)
self.idx_row = np.argsort(-col_sum)
W_sorted = W_sort_col[self.idx_row, :]
X_sorted = X_sort_col[self.idx_row, :]
self.W = W_sorted
self.X = X_sorted


def un_sort(self, X, W):
X_tmp = X[self.idx_row,:]
W_tmp = W[self.idx_row,:]
X_unsort = X_tmp[:,self.idx_col]
W_unsort = W_tmp[:,self.idx_col]
return X_unsort, W_unsort


def get_sparse(self):
M = np.copy(self.X)
M[self.W==0] = 0
return coo_matrix(M)


def demean(self):
mu = np.mean(self.X[self.W==1])
self.X -= mu


def get_tall(self):
row_sum = np.sum(self.W, axis=0)
return self.X[:,row_sum==self.N]


def get_wide(self):
col_sum = np.sum(self.W, axis=1)
return self.X[col_sum==self.T, :]


def get_tall_rest(self):
row_sum = np.sum(self.W, axis=0)
return self.X[:,row_sum==self.N], self.X[:,row_sum<self.N], self.W[:,row_sum<self.N]



if __name__ == "__main__":
X = np.arange(25).reshape(5,5)
W = np.array([[0, 0, 0, 0, 0],
[0, 1, 1, 1, 0],
[1, 0, 1, 1, 0],
[1, 0, 1, 0, 0],
[1, 0, 1, 0, 0]])
M = MissMat(X, W)
M.sort()
print(X)
print(W)
print(M.X)
print(M.W)
print(M.get_tall())
print(M.get_wide())
print(M.X_sparse)
print(M.un_sort(M.X, M.W))
Loading

0 comments on commit 3347266

Please sign in to comment.