diff --git a/examples/gpr_aep_examples.py b/examples/gpr_aep_examples.py
index 1229d7a..77a1f47 100644
--- a/examples/gpr_aep_examples.py
+++ b/examples/gpr_aep_examples.py
@@ -1,18 +1,20 @@
 print "importing stuff..."
 import numpy as np
 import pdb
+import matplotlib
+matplotlib.use('Agg')
 import matplotlib.pylab as plt
 from scipy import special
 
-# from .datautils import step, spiral
-# from .context import aep
+from .datautils import step, spiral
+from .context import aep
 
-import sys
-import os
-sys.path.insert(0, os.path.abspath(
-    os.path.join(os.path.dirname(__file__), '..')))
-import geepee.aep_models as aep
-from datautils import step, spiral
+# import sys
+# import os
+# sys.path.insert(0, os.path.abspath(
+#     os.path.join(os.path.dirname(__file__), '..')))
+# import geepee.aep_models as aep
+# from datautils import step, spiral
 
 
 def run_regression_1D():
@@ -48,6 +50,7 @@ def plot(m):
     plot(model)
     plt.show()
 
+
 def run_banana():
 
     def gridParams():
@@ -98,6 +101,86 @@ def plot(m):
     plt.show()
 
 
+def run_regression_1D_stoc():
+    np.random.seed(42)
+
+    print "create dataset ..."
+    N = 500
+    X = np.random.rand(N, 1)
+    Y = np.sin(12 * X) + 0.5 * np.cos(25 * X) + np.random.randn(N, 1) * 0.2
+    # plt.plot(X, Y, 'kx', mew=2)
+
+    def plot(m):
+        xx = np.linspace(-0.5, 1.5, 100)[:, None]
+        mean, var = m.predict_f(xx)
+        zu = m.sgp_layer.zu
+        mean_u, var_u = m.predict_f(zu)
+        plt.figure()
+        plt.plot(X, Y, 'kx', mew=2)
+        plt.plot(xx, mean, 'b', lw=2)
+        plt.fill_between(
+            xx[:, 0],
+            mean[:, 0] - 2 * np.sqrt(var[:, 0]),
+            mean[:, 0] + 2 * np.sqrt(var[:, 0]),
+            color='blue', alpha=0.2)
+        plt.errorbar(zu, mean_u, yerr=2 * np.sqrt(var_u), fmt='ro')
+        plt.xlim(-0.1, 1.1)
+
+    # inference
+    print "create model and optimize ..."
+    M = 20
+    model = aep.SGPR(X, Y, M, lik='Gaussian')
+    model.optimise(method='adam', alpha=0.2, maxiter=100000, mb_size=3*M, adam_lr=0.001)
+    plot(model)
+    plt.savefig('/tmp/reg_1D_stoc.pdf')
+    plt.show()
+
+
+def run_banana_stoc():
+
+    def gridParams():
+        mins = [-3.25, -2.85]
+        maxs = [3.65, 3.4]
+        nGrid = 50
+        xspaced = np.linspace(mins[0], maxs[0], nGrid)
+        yspaced = np.linspace(mins[1], maxs[1], nGrid)
+        xx, yy = np.meshgrid(xspaced, yspaced)
+        Xplot = np.vstack((xx.flatten(), yy.flatten())).T
+        return mins, maxs, xx, yy, Xplot
+
+    def plot(m):
+        col1 = '#0172B2'
+        col2 = '#CC6600'
+        mins, maxs, xx, yy, Xplot = gridParams()
+        mf, vf = m.predict_f(Xplot)
+        plt.figure()
+        plt.plot(
+            Xtrain[:, 0][Ytrain[:, 0] == 1],
+            Xtrain[:, 1][Ytrain[:, 0] == 1],
+            'o', color=col1, mew=0, alpha=0.5)
+        plt.plot(
+            Xtrain[:, 0][Ytrain[:, 0] == -1],
+            Xtrain[:, 1][Ytrain[:, 0] == -1],
+            'o', color=col2, mew=0, alpha=0.5)
+        zu = m.sgp_layer.zu
+        plt.plot(zu[:, 0], zu[:, 1], 'ro', mew=0, ms=4)
+        plt.contour(
+            xx, yy, mf.reshape(*xx.shape), [0],
+            colors='k', linewidths=1.8, zorder=100)
+
+    Xtrain = np.loadtxt(
+        './examples/data/banana_X_train.txt', delimiter=',')
+    Ytrain = np.loadtxt(
+        './examples/data/banana_Y_train.txt', delimiter=',').reshape(-1, 1)
+    Ytrain[np.where(Ytrain == 0)[0]] = -1
+    M = 30
+    model = aep.SGPR(Xtrain, Ytrain, M, lik='Probit')
+    model.optimise(method='adam', alpha=0.5, maxiter=100000, mb_size=3*M, adam_lr=0.005)
+    plot(model)
+    plt.savefig('/tmp/cla_probit_stoc.pdf')
+    plt.show()
+
+
 def run_step_1D():
     np.random.seed(42)
 
@@ -233,4 +316,7 @@ def run_boston():
     # run_banana()
     # run_step_1D()
     # run_spiral()
-    run_boston()
\ No newline at end of file
+    # run_boston()
+
+    run_regression_1D_stoc()
+    run_banana_stoc()
\ No newline at end of file
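The two stochastic examples above drive the new minibatch Adam path added to geepee/aep_models.py below. A minimal usage sketch under the same settings (the toy data and variable names here are illustrative, not part of the patch):

    import numpy as np
    import geepee.aep_models as aep

    np.random.seed(0)
    X = np.random.rand(200, 1)
    Y = np.sin(12 * X) + 0.1 * np.random.randn(200, 1)

    M = 20  # number of pseudo/inducing points
    model = aep.SGPR(X, Y, M, lik='Gaussian')
    # mb_size sets the minibatch size used by the Adam path;
    # the examples above use 3*M
    model.optimise(method='adam', alpha=0.2, maxiter=2000,
                   mb_size=3 * M, adam_lr=0.001)
    mean, var = model.predict_f(np.linspace(0, 1, 50)[:, None])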
diff --git a/geepee/aep_models.py b/geepee/aep_models.py
index e7632a1..3426286 100644
--- a/geepee/aep_models.py
+++ b/geepee/aep_models.py
@@ -9,6 +9,7 @@
 import pdb
 from scipy.cluster.vq import kmeans2
 import pprint
+from utils import profile
 
 pp = pprint.PrettyPrinter(indent=4)
 
@@ -85,7 +86,6 @@ def compute_phi_cavity(self):
             self.Suhat, self.muhat))
         return phi_cavity
 
-    # @profile
     def compute_phi(self, alpha=1.0):
         N = self.N
         scale_post = N * 1.0 / alpha - 1.0
@@ -228,7 +228,7 @@ def backprop_grads_lvm(self, m, v, dm, dv, psi1, psi2, mx, vx, alpha=1.0):
 
         return grad_hyper, grad_input
 
-    # @profile
+    @profile
     def backprop_grads_reg(self, m, v, dm, dv, kfu, x, alpha=1.0):
         N = self.N
         M = self.M
@@ -637,7 +637,7 @@ def get_hypers(self):
     def optimise(
             self, method='L-BFGS-B', tol=None, reinit_hypers=True,
-            callback=None, maxiter=1000, alpha=0.5, adam_lr=0.05, **kargs):
+            callback=None, maxiter=1000, alpha=0.5, mb_size=None, adam_lr=0.001, **kargs):
 
         self.updated = False
 
         if reinit_hypers:
@@ -648,15 +648,21 @@ def optimise(
         init_params_vec, params_args = flatten_dict(init_params_dict)
 
         N = self.N
-        idxs = np.arange(N)
-
         try:
             if method.lower() == 'adam':
+                if mb_size is None:
+                    mb_size = N
+                # TODO: resample the minibatch every iteration; for now a
+                # single batch is drawn here (with replacement, since this
+                # is faster) and reused for all Adam steps. An alternative:
+                # idxs = np.random.permutation(N)[:mb_size]
+                idxs = np.random.randint(N, size=mb_size)
                 results = adam(objective_wrapper, init_params_vec,
                                step_size=adam_lr,
                                maxiter=maxiter,
                                args=(params_args, self, idxs, alpha))
             else:
+                idxs = np.arange(N)
                 options = {'maxiter': maxiter, 'disp': True, 'gtol': 1e-8}
                 results = minimize(
                     fun=objective_wrapper,
@@ -667,7 +670,6 @@ def optimise(
                     tol=tol,
                     callback=callback,
                     options=options)
-
         except KeyboardInterrupt:
             print 'Caught KeyboardInterrupt ...'
             results = []
@@ -951,7 +953,7 @@ def __init__(self, x_train, y_train, no_pseudo, lik='Gaussian'):
         else:
             raise NotImplementedError('likelihood not implemented')
 
-    # @profile
+    @profile
     def objective_function(self, params, idxs, alpha=1.0):
         N = self.N
         xb = self.x_train[idxs, :]
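As the TODO above flags, the Adam branch draws `idxs` once, so the same minibatch is reused by every optimisation step. A sketch of per-call resampling, reusing the module's existing `objective_wrapper` signature (the wrapper function itself is hypothetical, not part of this patch):

    import numpy as np

    def resampling_wrapper(x, params_args, instance, mb_size, alpha):
        # draw a fresh minibatch (with replacement) on every objective call
        idxs = np.random.randint(instance.N, size=mb_size)
        return objective_wrapper(x, params_args, instance, idxs, alpha)

    # optimise() would then call:
    # results = adam(resampling_wrapper, init_params_vec, step_size=adam_lr,
    #                maxiter=maxiter, args=(params_args, self, mb_size, alpha))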
diff --git a/geepee/tools.py b/geepee/tools.py
index 160b596..754f95e 100644
--- a/geepee/tools.py
+++ b/geepee/tools.py
@@ -13,22 +13,6 @@ def inverse_sigmoid(x):
     return np.log(x / (1 - x))
 
 
-def adam(func, init_params, callback=None, maxiter=1000,
-         step_size=0.001, b1=0.9, b2=0.999, eps=1e-8, args=None):
-    """Adam as described in http://arxiv.org/pdf/1412.6980.pdf."""
-    x = init_params
-    m = np.zeros_like(x)
-    v = np.zeros_like(x)
-    for i in range(maxiter):
-        f, g = func(x, *args)
-        m = (1 - b1) * g + b1 * m  # First moment estimate.
-        v = (1 - b2) * (g**2) + b2 * v  # Second moment estimate.
-        mhat = m / (1 - b1**(i + 1))  # Bias correction.
-        vhat = v / (1 - b2**(i + 1))
-        x = x - step_size * mhat / (np.sqrt(vhat) + eps)
-    return x
-
-
 def make_batches(N_data, batch_size):
     return [slice(i, min(i + batch_size, N_data))
             for i in range(0, N_data, batch_size)]
diff --git a/geepee/utils.py b/geepee/utils.py
index 00b4781..0583c4d 100644
--- a/geepee/utils.py
+++ b/geepee/utils.py
@@ -1,7 +1,14 @@
 import numpy as np
 import scipy.linalg as spla
 
+import __builtin__
+try:
+    profile = __builtin__.profile
+except AttributeError:
+    # No line profiler, provide a pass-through version
+    def profile(func): return func
+
 
 def chol2inv(chol):
     return spla.cho_solve((chol, False), np.eye(chol.shape[0]))
 
@@ -61,4 +68,24 @@ def unflatten_dict(params, params_args):
     params_dict = {}
     for i, key in enumerate(sorted(keys)):
         params_dict[key] = np.reshape(vals[i], shapes[key])
-    return params_dict
\ No newline at end of file
+    return params_dict
+
+
+def adam(func, init_params, callback=None, maxiter=1000,
+         step_size=0.001, b1=0.9, b2=0.999, eps=1e-8, args=None):
+    """Adam as described in http://arxiv.org/pdf/1412.6980.pdf."""
+    x = init_params
+    m = np.zeros_like(x)
+    v = np.zeros_like(x)
+    for i in range(maxiter):
+        f, g = func(x, *args)
+        if i % 100 == 0:
+            print 'iter %d \t obj %.3f' % (i, f)
+        if callback:
+            callback(x, i, g)
+        m = (1 - b1) * g + b1 * m  # First moment estimate.
+        v = (1 - b2) * (g**2) + b2 * v  # Second moment estimate.
+        mhat = m / (1 - b1**(i + 1))  # Bias correction.
+        vhat = v / (1 - b2**(i + 1))
+        x = x - step_size * mhat / (np.sqrt(vhat) + eps)
+    return x
\ No newline at end of file
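A quick sanity check for the `adam` implementation relocated into geepee/utils.py above: the optimiser expects `func` to return the pair (objective, gradient), and should drive a simple quadratic to its minimum. The check itself is illustrative, not part of the patch:

    import numpy as np
    from geepee.utils import adam

    def quadratic(x):
        # f(x) = 0.5 * ||x - 3||^2, together with its gradient
        return 0.5 * np.sum((x - 3.0) ** 2), x - 3.0

    x_opt = adam(quadratic, np.zeros(5), maxiter=2000, step_size=0.05, args=())
    assert np.allclose(x_opt, 3.0, atol=1e-2)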
diff --git a/tests/test_grads_aep.py b/tests/test_grads_aep.py
index 9263f03..b079a50 100644
--- a/tests/test_grads_aep.py
+++ b/tests/test_grads_aep.py
@@ -4,6 +4,10 @@
 import pdb
 import pprint
 import os
+import time
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pylab as plt
 
 from .context import aep
 from .context import flatten_dict, unflatten_dict
@@ -613,6 +617,320 @@ def test_gpr_aep_probit():
                % (d, j, grad_all['eta2'][d][j], dR_id, (grad_all['eta2'][d][j]-dR_id)/dR_id))
 
 
+def test_gpr_aep_gaussian_stochastic():
+
+    # generate some datapoints for testing
+    N_train = 20
+    alpha = 0.5
+    M = 10
+    idxs = np.arange(N_train)[:M]
+    D = 2
+    Q = 3
+    y_train = np.random.randn(N_train, Q)
+    x_train = np.random.randn(N_train, D)
+    # params tied
+    model = aep.SGPR(x_train, y_train, M, lik='Gaussian')
+
+    # init hypers, inducing points and q(u) params
+    init_params = model.init_hypers(y_train)
+    params = init_params.copy()
+    logZ, grad_all = model.objective_function(params, idxs, alpha=alpha)
+    pp.pprint(logZ)
+    pp.pprint(params)
+    # pdb.set_trace()
+
+    eps = 1e-5
+    # check grad ls
+    Din_i = model.Din
+    for d in range(Din_i):
+        params1 = copy.deepcopy(params)
+        params1['ls'][d] = params1['ls'][d] + eps
+        logZ1, grad1 = model.objective_function(
+            params1, idxs, alpha=alpha)
+
+        params2 = copy.deepcopy(params)
+        params2['ls'][d] = params2['ls'][d] - eps
+        logZ2, grad2 = model.objective_function(
+            params2, idxs, alpha=alpha)
+
+        dls_id = (logZ1 - logZ2) / eps / 2
+        # print logZ1, logZ2
+        print ('ls d=%d, computed=%.5f, numerical=%.5f, diff=%.5f'
+               % (d, grad_all['ls'][d], dls_id, (grad_all['ls'][d]-dls_id)/dls_id))
+
+    # check grad sf
+    params1 = copy.deepcopy(params)
+    params1['sf'] = params1['sf'] + eps
+    logZ1, grad1 = model.objective_function(
+        params1, idxs, alpha=alpha)
+    params2 = copy.deepcopy(params)
+    params2['sf'] = params2['sf'] - eps
+    logZ2, grad2 = model.objective_function(
+        params2, idxs, alpha=alpha)
+
+    dsf_i = (logZ1 - logZ2) / eps / 2
+    print ('sf computed=%.5f, numerical=%.5f, diff=%.5f'
+           % (grad_all['sf'], dsf_i, (grad_all['sf'] - dsf_i) / dsf_i))
+
+    # check grad sn
+    params1 = copy.deepcopy(params)
+    params1['sn'] = params1['sn'] + eps
+    logZ1, grad1 = model.objective_function(
+        params1, idxs, alpha=alpha)
+    params2 = copy.deepcopy(params)
+    params2['sn'] = params2['sn'] - eps
+    logZ2, grad2 = model.objective_function(
+        params2, idxs, alpha=alpha)
+
+    dsn_i = (logZ1 - logZ2) / eps / 2
+    print ('sn computed=%.5f, numerical=%.5f, diff=%.5f'
+           % (grad_all['sn'], dsn_i, (grad_all['sn'] - dsn_i) / dsn_i))
+
+    # check grad zu
+    Din_i = model.Din
+    M_i = model.M
+    Dout_i = model.Dout
+    for m in range(M_i):
+        for k in range(Din_i):
+            params1 = copy.deepcopy(params)
+            eps1 = 0.0 * params1['zu']
+            eps1[m, k] = eps
+            params1['zu'] = params1['zu'] + eps1
+            logZ1, grad1 = model.objective_function(
+                params1, idxs, alpha=alpha)
+            params2 = copy.deepcopy(params)
+            params2['zu'] = params2['zu'] - eps1
+            logZ2, grad2 = model.objective_function(
+                params2, idxs, alpha=alpha)
+
+            dzu_id = (logZ1 - logZ2) / eps / 2
+            print ('zu m=%d, k=%d, computed=%.5f, numerical=%.5f, diff=%.5f'
+                   % (m, k, grad_all['zu'][m, k], dzu_id, (grad_all['zu'][m, k]-dzu_id)/dzu_id))
+
+    # check grad theta_1
+    for d in range(Dout_i):
+        for j in range(M_i * (M_i + 1) / 2):
+            params1 = copy.deepcopy(params)
+            params1['eta1_R'][d][j, ] = params1['eta1_R'][d][j, ] + eps
+            logZ1, grad1 = model.objective_function(
+                params1, idxs, alpha=alpha)
+            params2 = copy.deepcopy(params)
+            params2['eta1_R'][d][j, ] = params2['eta1_R'][d][j, ] - eps
+            logZ2, grad2 = model.objective_function(
+                params2, idxs, alpha=alpha)
+
+            dR_id = (logZ1 - logZ2) / eps / 2
+            print ('eta1_R d=%d, j=%d, computed=%.5f, numerical=%.5f, diff=%.5f'
+                   % (d, j, grad_all['eta1_R'][d][j], dR_id, (grad_all['eta1_R'][d][j]-dR_id)/dR_id))
+
+    # check grad theta_2
+    for d in range(Dout_i):
+        for j in range(M_i):
+            params1 = copy.deepcopy(params)
+            params1['eta2'][d][j, ] = params1['eta2'][d][j, ] + eps
+            logZ1, grad1 = model.objective_function(
+                params1, idxs, alpha=alpha)
+            params2 = copy.deepcopy(params)
+            params2['eta2'][d][j, ] = params2['eta2'][d][j, ] - eps
+            logZ2, grad2 = model.objective_function(
+                params2, idxs, alpha=alpha)
+
+            dR_id = (logZ1 - logZ2) / eps / 2
+            print ('eta2 d=%d, j=%d, computed=%.5f, numerical=%.5f, diff=%.5f'
+                   % (d, j, grad_all['eta2'][d][j], dR_id, (grad_all['eta2'][d][j]-dR_id)/dR_id))
+
+
+def test_gpr_aep_probit_stochastic():
+
+    # generate some datapoints for testing
+    N_train = 5
+    alpha = 0.5
+    M = 3
+    idxs = np.arange(N_train)[:M]
+    D = 2
+    Q = 3
+    x_train = np.random.randn(N_train, D)
+    y_train = 2*np.random.randint(0, 2, size=(N_train, Q)) - 1
+    # params tied
+    model = aep.SGPR(x_train, y_train, M, lik='Probit')
+
+    # init hypers, inducing points and q(u) params
+    init_params = model.init_hypers(y_train)
+    params = init_params.copy()
+    logZ, grad_all = model.objective_function(params, idxs, alpha=alpha)
+    pp.pprint(logZ)
+    pp.pprint(params)
+    # pdb.set_trace()
+
+    eps = 1e-5
+    # check grad ls
+    Din_i = model.Din
+    for d in range(Din_i):
+        params1 = copy.deepcopy(params)
+        params1['ls'][d] = params1['ls'][d] + eps
+        logZ1, grad1 = model.objective_function(
+            params1, idxs, alpha=alpha)
+
+        params2 = copy.deepcopy(params)
+        params2['ls'][d] = params2['ls'][d] - eps
+        logZ2, grad2 = model.objective_function(
+            params2, idxs, alpha=alpha)
+
+        dls_id = (logZ1 - logZ2) / eps / 2
+        # print logZ1, logZ2
+        print ('ls d=%d, computed=%.5f, numerical=%.5f, diff=%.5f'
+               % (d, grad_all['ls'][d], dls_id, (grad_all['ls'][d]-dls_id)/dls_id))
+
+    # check grad sf
+    params1 = copy.deepcopy(params)
+    params1['sf'] = params1['sf'] + eps
+    logZ1, grad1 = model.objective_function(
+        params1, idxs, alpha=alpha)
+    params2 = copy.deepcopy(params)
+    params2['sf'] = params2['sf'] - eps
+    logZ2, grad2 = model.objective_function(
+        params2, idxs, alpha=alpha)
+
+    dsf_i = (logZ1 - logZ2) / eps / 2
+    print ('sf computed=%.5f, numerical=%.5f, diff=%.5f'
+           % (grad_all['sf'], dsf_i, (grad_all['sf'] - dsf_i) / dsf_i))
+
+    # check grad zu
+    Din_i = model.Din
+    M_i = model.M
+    Dout_i = model.Dout
+    for m in range(M_i):
+        for k in range(Din_i):
+            params1 = copy.deepcopy(params)
+            eps1 = 0.0 * params1['zu']
+            eps1[m, k] = eps
+            params1['zu'] = params1['zu'] + eps1
+            logZ1, grad1 = model.objective_function(
+                params1, idxs, alpha=alpha)
+            params2 = copy.deepcopy(params)
+            params2['zu'] = params2['zu'] - eps1
+            logZ2, grad2 = model.objective_function(
+                params2, idxs, alpha=alpha)
+
+            dzu_id = (logZ1 - logZ2) / eps / 2
+            print ('zu m=%d, k=%d, computed=%.5f, numerical=%.5f, diff=%.5f'
+                   % (m, k, grad_all['zu'][m, k], dzu_id, (grad_all['zu'][m, k]-dzu_id)/dzu_id))
+
+    # check grad theta_1
+    for d in range(Dout_i):
+        for j in range(M_i * (M_i + 1) / 2):
+            params1 = copy.deepcopy(params)
+            params1['eta1_R'][d][j, ] = params1['eta1_R'][d][j, ] + eps
+            logZ1, grad1 = model.objective_function(
+                params1, idxs, alpha=alpha)
+            params2 = copy.deepcopy(params)
+            params2['eta1_R'][d][j, ] = params2['eta1_R'][d][j, ] - eps
+            logZ2, grad2 = model.objective_function(
+                params2, idxs, alpha=alpha)
+
+            dR_id = (logZ1 - logZ2) / eps / 2
+            print ('eta1_R d=%d, j=%d, computed=%.5f, numerical=%.5f, diff=%.5f'
+                   % (d, j, grad_all['eta1_R'][d][j], dR_id, (grad_all['eta1_R'][d][j]-dR_id)/dR_id))
+
+    # check grad theta_2
+    for d in range(Dout_i):
+        for j in range(M_i):
+            params1 = copy.deepcopy(params)
+            params1['eta2'][d][j, ] = params1['eta2'][d][j, ] + eps
+            logZ1, grad1 = model.objective_function(
+                params1, idxs, alpha=alpha)
+            params2 = copy.deepcopy(params)
+            params2['eta2'][d][j, ] = params2['eta2'][d][j, ] - eps
+            logZ2, grad2 = model.objective_function(
+                params2, idxs, alpha=alpha)
+
+            dR_id = (logZ1 - logZ2) / eps / 2
+            print ('eta2 d=%d, j=%d, computed=%.5f, numerical=%.5f, diff=%.5f'
+                   % (d, j, grad_all['eta2'][d][j], dR_id, (grad_all['eta2'][d][j]-dR_id)/dR_id))
+
+
+def plot_gpr_aep_gaussian_stochastic():
+
+    # generate some datapoints for testing
+    N_train = 2000
+    alpha = 0.5
+    M = 50
+    idxs = np.arange(N_train)
+    D = 2
+    Q = 3
+    y_train = np.random.randn(N_train, Q)
+    x_train = np.random.randn(N_train, D)
+    # params tied
+    model = aep.SGPR(x_train, y_train, M, lik='Gaussian')
+
+    # init hypers, inducing points and q(u) params
+    params = model.init_hypers(y_train)
+    logZ, grad_all = model.objective_function(params, idxs, alpha=alpha)
+    mbs = np.logspace(-2, 0, 10)
+    reps = 20
+    times = np.zeros(len(mbs))
+    objs = np.zeros((len(mbs), reps))
+    for i, mb in enumerate(mbs):
+        no_points = int(N_train * mb)
+        start_time = time.time()
+        for k in range(reps):
+            idxs_ik = np.random.permutation(N_train)[:no_points]
+            objs[i, k] = model.objective_function(params, idxs_ik, alpha=alpha)[0]
+        times[i] = time.time() - start_time
+
+    f, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
+    ax1.plot(mbs, times, 'x-')
+    ax1.set_xlabel("Minibatch proportion")
+    ax1.set_ylabel("Time taken")
+
+    ax2.plot(mbs, objs, 'kx')
+    ax2.axhline(logZ, color='b')
+    ax2.set_xlabel("Minibatch proportion")
+    ax2.set_ylabel("ELBO estimates")
+    plt.savefig('/tmp/gaussian_stochastic_aep_gpr.pdf')
+
+
+def plot_gpr_aep_probit_stochastic():
+
+    # generate some datapoints for testing
+    N_train = 2000
+    alpha = 0.5
+    M = 50
+    D = 2
+    Q = 3
+    x_train = np.random.randn(N_train, D)
+    y_train = 2*np.random.randint(0, 2, size=(N_train, Q)) - 1
+    # params tied
+    model = aep.SGPR(x_train, y_train, M, lik='Probit')
+
+    # init hypers, inducing points and q(u) params
+    params = model.init_hypers(y_train)
+    idxs = np.arange(N_train)
+    logZ, grad_all = model.objective_function(params, idxs, alpha=alpha)
+    mbs = np.logspace(-2, 0, 10)
+    reps = 20
+    times = np.zeros(len(mbs))
+    objs = np.zeros((len(mbs), reps))
+    for i, mb in enumerate(mbs):
+        no_points = int(N_train * mb)
+        start_time = time.time()
+        for k in range(reps):
+            idxs_ik = np.random.permutation(N_train)[:no_points]
+            objs[i, k] = model.objective_function(params, idxs_ik, alpha=alpha)[0]
+        times[i] = time.time() - start_time
+
+    f, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
+    ax1.plot(mbs, times, 'x-')
+    ax1.set_xlabel("Minibatch proportion")
+    ax1.set_ylabel("Time taken")
+
+    ax2.plot(mbs, objs, 'kx')
+    ax2.axhline(logZ, color='b')
+    ax2.set_xlabel("Minibatch proportion")
+    ax2.set_ylabel("ELBO estimates")
+    plt.savefig('/tmp/probit_stochastic_aep_gpr.pdf')
+
+
 def test_gpr_aep_gaussian_scipy():
     # generate some datapoints for testing
     N_train = 20
@@ -1284,16 +1602,21 @@ def kink(T, process_noise, obs_noise, xprev=None):
 
 
 if __name__ == '__main__':
-    test_gplvm_aep_gaussian()
-    test_gplvm_aep_probit()
-    test_gplvm_aep_gaussian_scipy()
-    test_gplvm_aep_probit_scipy()
+    # test_gplvm_aep_gaussian()
+    # test_gplvm_aep_probit()
+    # test_gplvm_aep_gaussian_scipy()
+    # test_gplvm_aep_probit_scipy()
 
     # test_gpr_aep_gaussian()
     # test_gpr_aep_probit()
     # test_gpr_aep_gaussian_scipy()
     # test_gpr_aep_probit_scipy()
 
+    # plot_gpr_aep_probit_stochastic()
+    # plot_gpr_aep_gaussian_stochastic()
+    test_gpr_aep_probit_stochastic()
+    test_gpr_aep_gaussian_stochastic()
+
     # test_dgpr_aep_gaussian()
    # test_dgpr_aep_probit()
     # test_dgpr_aep_gaussian_scipy()
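The new *_stochastic tests above follow the file's existing recipe: each analytic gradient from objective_function is compared against a central finite difference, d logZ / d theta ~= (logZ(theta + eps) - logZ(theta - eps)) / (2 * eps), evaluated at the same fixed idxs so the minibatch objective is deterministic during the comparison. A generic sketch of that pattern for an array-valued entry such as 'ls' or 'zu' (the helper is hypothetical, mirroring the loops above):

    import copy
    import numpy as np

    def check_grad_entry(model, params, key, index, idxs, alpha, eps=1e-5):
        # analytic gradient for this parameter entry
        _, grad_all = model.objective_function(params, idxs, alpha=alpha)
        # central finite difference, evaluated on the same minibatch
        p1 = copy.deepcopy(params)
        p1[key][index] += eps
        f1 = model.objective_function(p1, idxs, alpha=alpha)[0]
        p2 = copy.deepcopy(params)
        p2[key][index] -= eps
        f2 = model.objective_function(p2, idxs, alpha=alpha)[0]
        return grad_all[key][index], (f1 - f2) / (2 * eps)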