diff --git a/examples/gpr_aep_examples.py b/examples/gpr_aep_examples.py
index 1229d7a..77a1f47 100644
--- a/examples/gpr_aep_examples.py
+++ b/examples/gpr_aep_examples.py
@@ -1,18 +1,20 @@
 print "importing stuff..."
 import numpy as np
 import pdb
+import matplotlib
+matplotlib.use('Agg')
 import matplotlib.pylab as plt
 from scipy import special
 
-# from .datautils import step, spiral
-# from .context import aep
+from .datautils import step, spiral
+from .context import aep
 
-import sys
-import os
-sys.path.insert(0, os.path.abspath(
-    os.path.join(os.path.dirname(__file__), '..')))
-import geepee.aep_models as aep
-from datautils import step, spiral
+# import sys
+# import os
+# sys.path.insert(0, os.path.abspath(
+#     os.path.join(os.path.dirname(__file__), '..')))
+# import geepee.aep_models as aep
+# from datautils import step, spiral
 
 
 def run_regression_1D():
@@ -48,6 +50,7 @@ def plot(m):
     plot(model)
     plt.show()
 
+
 def run_banana():
 
     def gridParams():
@@ -98,6 +101,86 @@ def plot(m):
     plt.show()
 
 
+def run_regression_1D_stoc():
+    np.random.seed(42)
+
+    print "create dataset ..."
+    N = 500
+    X = np.random.rand(N, 1)
+    Y = np.sin(12 * X) + 0.5 * np.cos(25 * X) + np.random.randn(N, 1) * 0.2
+    # plt.plot(X, Y, 'kx', mew=2)
+
+    def plot(m):
+        xx = np.linspace(-0.5, 1.5, 100)[:, None]
+        mean, var = m.predict_f(xx)
+        zu = m.sgp_layer.zu
+        mean_u, var_u = m.predict_f(zu)
+        plt.figure()
+        plt.plot(X, Y, 'kx', mew=2)
+        plt.plot(xx, mean, 'b', lw=2)
+        plt.fill_between(
+            xx[:, 0],
+            mean[:, 0] - 2 * np.sqrt(var[:, 0]),
+            mean[:, 0] + 2 * np.sqrt(var[:, 0]),
+            color='blue', alpha=0.2)
+        plt.errorbar(zu, mean_u, yerr=2 * np.sqrt(var_u), fmt='ro')
+        plt.xlim(-0.1, 1.1)
+
+    # inference
+    print "create model and optimize ..."
+    M = 20
+    model = aep.SGPR(X, Y, M, lik='Gaussian')
+    model.optimise(method='adam', alpha=0.2, maxiter=100000, mb_size=3*M, adam_lr=0.001)
+    plot(model)
+    plt.savefig('/tmp/reg_1D_stoc.pdf')
+    plt.show()
+
+
+def run_banana_stoc():
+
+    def gridParams():
+        mins = [-3.25, -2.85]
+        maxs = [3.65, 3.4]
+        nGrid = 50
+        xspaced = np.linspace(mins[0], maxs[0], nGrid)
+        yspaced = np.linspace(mins[1], maxs[1], nGrid)
+        xx, yy = np.meshgrid(xspaced, yspaced)
+        Xplot = np.vstack((xx.flatten(), yy.flatten())).T
+        return mins, maxs, xx, yy, Xplot
+
+    def plot(m):
+        col1 = '#0172B2'
+        col2 = '#CC6600'
+        mins, maxs, xx, yy, Xplot = gridParams()
+        mf, vf = m.predict_f(Xplot)
+        plt.figure()
+        plt.plot(
+            Xtrain[:, 0][Ytrain[:, 0] == 1],
+            Xtrain[:, 1][Ytrain[:, 0] == 1],
+            'o', color=col1, mew=0, alpha=0.5)
+        plt.plot(
+            Xtrain[:, 0][Ytrain[:, 0] == -1],
+            Xtrain[:, 1][Ytrain[:, 0] == -1],
+            'o', color=col2, mew=0, alpha=0.5)
+        zu = m.sgp_layer.zu
+        plt.plot(zu[:, 0], zu[:, 1], 'ro', mew=0, ms=4)
+        plt.contour(
+            xx, yy, mf.reshape(*xx.shape), [0],
+            colors='k', linewidths=1.8, zorder=100)
+
+    Xtrain = np.loadtxt(
+        './examples/data/banana_X_train.txt', delimiter=',')
+    Ytrain = np.loadtxt(
+        './examples/data/banana_Y_train.txt', delimiter=',').reshape(-1, 1)
+    Ytrain[np.where(Ytrain == 0)[0]] = -1
+    M = 30
+    model = aep.SGPR(Xtrain, Ytrain, M, lik='Probit')
+    model.optimise(method='adam', alpha=0.5, maxiter=100000, mb_size=3*M, adam_lr=0.005)
+    plot(model)
+    plt.savefig('/tmp/cla_probit_stoc.pdf')
+    plt.show()
+
+
 def run_step_1D():
     np.random.seed(42)
 
@@ -233,4 +316,7 @@ def run_boston():
     # run_banana()
     # run_step_1D()
     # run_spiral()
-    run_boston()
\ No newline at end of file
+    # run_boston()
+
+    run_regression_1D_stoc()
+    run_banana_stoc()
\ No newline at end of file
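The two stochastic examples above drive the new minibatch Adam path added to geepee/aep_models.py below. A minimal usage sketch under the same settings (the toy data and variable names here are illustrative, not part of the patch):

    import numpy as np
    import geepee.aep_models as aep

    np.random.seed(0)
    X = np.random.rand(200, 1)
    Y = np.sin(12 * X) + 0.1 * np.random.randn(200, 1)

    M = 20  # number of pseudo/inducing points
    model = aep.SGPR(X, Y, M, lik='Gaussian')
    # mb_size sets the minibatch size used by the Adam path;
    # the examples above use 3*M
    model.optimise(method='adam', alpha=0.2, maxiter=2000,
                   mb_size=3 * M, adam_lr=0.001)
    mean, var = model.predict_f(np.linspace(0, 1, 50)[:, None])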
diff --git a/geepee/aep_models.py b/geepee/aep_models.py
index e7632a1..3426286 100644
--- a/geepee/aep_models.py
+++ b/geepee/aep_models.py
@@ -9,6 +9,7 @@
 import pdb
 from scipy.cluster.vq import kmeans2
 import pprint
+from utils import profile
 
 pp = pprint.PrettyPrinter(indent=4)
 
@@ -85,7 +86,6 @@ def compute_phi_cavity(self):
             self.Suhat, self.muhat))
         return phi_cavity
 
-    # @profile
     def compute_phi(self, alpha=1.0):
         N = self.N
         scale_post = N * 1.0 / alpha - 1.0
@@ -228,7 +228,7 @@ def backprop_grads_lvm(self, m, v, dm, dv, psi1, psi2, mx, vx, alpha=1.0):
 
         return grad_hyper, grad_input
 
-    # @profile
+    @profile
     def backprop_grads_reg(self, m, v, dm, dv, kfu, x, alpha=1.0):
         N = self.N
         M = self.M
@@ -637,7 +637,7 @@ def get_hypers(self):
     def optimise(
             self, method='L-BFGS-B', tol=None, reinit_hypers=True,
-            callback=None, maxiter=1000, alpha=0.5, adam_lr=0.05, **kargs):
+            callback=None, maxiter=1000, alpha=0.5, mb_size=None, adam_lr=0.001, **kargs):
 
         self.updated = False
 
         if reinit_hypers:
@@ -648,15 +648,21 @@ def optimise(
         init_params_vec, params_args = flatten_dict(init_params_dict)
 
         N = self.N
-        idxs = np.arange(N)
-
         try:
             if method.lower() == 'adam':
+                if mb_size is None:
+                    mb_size = N
+                # TODO: resample the minibatch every iteration; for now a
+                # single batch is drawn here (with replacement, since this
+                # is faster) and reused for all Adam steps. An alternative:
+                # idxs = np.random.permutation(N)[:mb_size]
+                idxs = np.random.randint(N, size=mb_size)
                 results = adam(objective_wrapper, init_params_vec,
                                step_size=adam_lr,
                                maxiter=maxiter,
                                args=(params_args, self, idxs, alpha))
             else:
+                idxs = np.arange(N)
                 options = {'maxiter': maxiter, 'disp': True, 'gtol': 1e-8}
                 results = minimize(
                     fun=objective_wrapper,
@@ -667,7 +670,6 @@ def optimise(
                     tol=tol,
                     callback=callback,
                     options=options)
-
         except KeyboardInterrupt:
             print 'Caught KeyboardInterrupt ...'
             results = []
@@ -951,7 +953,7 @@ def __init__(self, x_train, y_train, no_pseudo, lik='Gaussian'):
         else:
             raise NotImplementedError('likelihood not implemented')
 
-    # @profile
+    @profile
     def objective_function(self, params, idxs, alpha=1.0):
         N = self.N
         xb = self.x_train[idxs, :]
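As the TODO above flags, the Adam branch draws `idxs` once, so the same minibatch is reused by every optimisation step. A sketch of per-call resampling, reusing the module's existing `objective_wrapper` signature (the wrapper function itself is hypothetical, not part of this patch):

    import numpy as np

    def resampling_wrapper(x, params_args, instance, mb_size, alpha):
        # draw a fresh minibatch (with replacement) on every objective call
        idxs = np.random.randint(instance.N, size=mb_size)
        return objective_wrapper(x, params_args, instance, idxs, alpha)

    # optimise() would then call:
    # results = adam(resampling_wrapper, init_params_vec, step_size=adam_lr,
    #                maxiter=maxiter, args=(params_args, self, mb_size, alpha))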
diff --git a/geepee/tools.py b/geepee/tools.py
index 160b596..754f95e 100644
--- a/geepee/tools.py
+++ b/geepee/tools.py
@@ -13,22 +13,6 @@ def inverse_sigmoid(x):
     return np.log(x / (1 - x))
 
 
-def adam(func, init_params, callback=None, maxiter=1000,
-         step_size=0.001, b1=0.9, b2=0.999, eps=1e-8, args=None):
-    """Adam as described in http://arxiv.org/pdf/1412.6980.pdf."""
-    x = init_params
-    m = np.zeros_like(x)
-    v = np.zeros_like(x)
-    for i in range(maxiter):
-        f, g = func(x, *args)
-        m = (1 - b1) * g + b1 * m  # First moment estimate.
-        v = (1 - b2) * (g**2) + b2 * v  # Second moment estimate.
-        mhat = m / (1 - b1**(i + 1))  # Bias correction.
-        vhat = v / (1 - b2**(i + 1))
-        x = x - step_size * mhat / (np.sqrt(vhat) + eps)
-    return x
-
-
 def make_batches(N_data, batch_size):
     return [slice(i, min(i + batch_size, N_data))
             for i in range(0, N_data, batch_size)]
diff --git a/geepee/utils.py b/geepee/utils.py
index 00b4781..0583c4d 100644
--- a/geepee/utils.py
+++ b/geepee/utils.py
@@ -1,7 +1,14 @@
 import numpy as np
 import scipy.linalg as spla
 
+import __builtin__
+try:
+    profile = __builtin__.profile
+except AttributeError:
+    # No line profiler, provide a pass-through version
+    def profile(func): return func
+
 
 def chol2inv(chol):
     return spla.cho_solve((chol, False), np.eye(chol.shape[0]))
 
@@ -61,4 +68,24 @@ def unflatten_dict(params, params_args):
     params_dict = {}
     for i, key in enumerate(sorted(keys)):
         params_dict[key] = np.reshape(vals[i], shapes[key])
-    return params_dict
\ No newline at end of file
+    return params_dict
+
+
+def adam(func, init_params, callback=None, maxiter=1000,
+         step_size=0.001, b1=0.9, b2=0.999, eps=1e-8, args=None):
+    """Adam as described in http://arxiv.org/pdf/1412.6980.pdf."""
+    x = init_params
+    m = np.zeros_like(x)
+    v = np.zeros_like(x)
+    for i in range(maxiter):
+        f, g = func(x, *args)
+        if i % 100 == 0:
+            print 'iter %d \t obj %.3f' % (i, f)
+        if callback:
+            callback(x, i, g)
+        m = (1 - b1) * g + b1 * m  # First moment estimate.
+        v = (1 - b2) * (g**2) + b2 * v  # Second moment estimate.
+        mhat = m / (1 - b1**(i + 1))  # Bias correction.
+        vhat = v / (1 - b2**(i + 1))
+        x = x - step_size * mhat / (np.sqrt(vhat) + eps)
+    return x
\ No newline at end of file
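A quick sanity check for the `adam` implementation relocated into geepee/utils.py above: the optimiser expects `func` to return the pair (objective, gradient), and should drive a simple quadratic to its minimum. The check itself is illustrative, not part of the patch:

    import numpy as np
    from geepee.utils import adam

    def quadratic(x):
        # f(x) = 0.5 * ||x - 3||^2, together with its gradient
        return 0.5 * np.sum((x - 3.0) ** 2), x - 3.0

    x_opt = adam(quadratic, np.zeros(5), maxiter=2000, step_size=0.05, args=())
    assert np.allclose(x_opt, 3.0, atol=1e-2)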
diff --git a/tests/test_grads_aep.py b/tests/test_grads_aep.py
index 9263f03..b079a50 100644
--- a/tests/test_grads_aep.py
+++ b/tests/test_grads_aep.py
@@ -4,6 +4,10 @@
 import pdb
 import pprint
 import os
+import time
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pylab as plt
 
 from .context import aep
 from .context import flatten_dict, unflatten_dict
@@ -613,6 +617,320 @@ def test_gpr_aep_probit():
                % (d, j, grad_all['eta2'][d][j], dR_id, (grad_all['eta2'][d][j]-dR_id)/dR_id))
 
 
+def test_gpr_aep_gaussian_stochastic():
+
+    # generate some datapoints for testing
+    N_train = 20
+    alpha = 0.5
+    M = 10
+    idxs = np.arange(N_train)[:M]
+    D = 2
+    Q = 3
+    y_train = np.random.randn(N_train, Q)
+    x_train = np.random.randn(N_train, D)
+    # params tied
+    model = aep.SGPR(x_train, y_train, M, lik='Gaussian')
+
+    # init hypers, inducing points and q(u) params
+    init_params = model.init_hypers(y_train)
+    params = init_params.copy()
+    logZ, grad_all = model.objective_function(params, idxs, alpha=alpha)
+    pp.pprint(logZ)
+    pp.pprint(params)
+    # pdb.set_trace()
+
+    eps = 1e-5
+    # check grad ls
+    Din_i = model.Din
+    for d in range(Din_i):
+        params1 = copy.deepcopy(params)
+        params1['ls'][d] = params1['ls'][d] + eps
+        logZ1, grad1 = model.objective_function(
+            params1, idxs, alpha=alpha)
+
+        params2 = copy.deepcopy(params)
+        params2['ls'][d] = params2['ls'][d] - eps
+        logZ2, grad2 = model.objective_function(
+            params2, idxs, alpha=alpha)
+
+        dls_id = (logZ1 - logZ2) / eps / 2
+        # print logZ1, logZ2
+        print ('ls d=%d, computed=%.5f, numerical=%.5f, diff=%.5f'
+               % (d, grad_all['ls'][d], dls_id, (grad_all['ls'][d]-dls_id)/dls_id))
+
+    # check grad sf
+    params1 = copy.deepcopy(params)
+    params1['sf'] = params1['sf'] + eps
+    logZ1, grad1 = model.objective_function(
+        params1, idxs, alpha=alpha)
+    params2 = copy.deepcopy(params)
+    params2['sf'] = params2['sf'] - eps
+    logZ2, grad2 = model.objective_function(
+        params2, idxs, alpha=alpha)
+
+    dsf_i = (logZ1 - logZ2) / eps / 2
+    print ('sf computed=%.5f, numerical=%.5f, diff=%.5f'
+           % (grad_all['sf'], dsf_i, (grad_all['sf'] - dsf_i) / dsf_i))
+
+    # check grad sn
+    params1 = copy.deepcopy(params)
+    params1['sn'] = params1['sn'] + eps
+    logZ1, grad1 = model.objective_function(
+        params1, idxs, alpha=alpha)
+    params2 = copy.deepcopy(params)
+    params2['sn'] = params2['sn'] - eps
+    logZ2, grad2 = model.objective_function(
+        params2, idxs, alpha=alpha)
+
+    dsn_i = (logZ1 - logZ2) / eps / 2
+    print ('sn computed=%.5f, numerical=%.5f, diff=%.5f'
+           % (grad_all['sn'], dsn_i, (grad_all['sn'] - dsn_i) / dsn_i))
+
+    # check grad zu
+    Din_i = model.Din
+    M_i = model.M
+    Dout_i = model.Dout
+    for m in range(M_i):
+        for k in range(Din_i):
+            params1 = copy.deepcopy(params)
+            eps1 = 0.0 * params1['zu']
+            eps1[m, k] = eps
+            params1['zu'] = params1['zu'] + eps1
+            logZ1, grad1 = model.objective_function(
+                params1, idxs, alpha=alpha)
+            params2 = copy.deepcopy(params)
+            params2['zu'] = params2['zu'] - eps1
+            logZ2, grad2 = model.objective_function(
+                params2, idxs, alpha=alpha)
+
+            dzu_id = (logZ1 - logZ2) / eps / 2
+            print ('zu m=%d, k=%d, computed=%.5f, numerical=%.5f, diff=%.5f'
+                   % (m, k, grad_all['zu'][m, k], dzu_id, (grad_all['zu'][m, k]-dzu_id)/dzu_id))
+
+    # check grad theta_1
+    for d in range(Dout_i):
+        for j in range(M_i * (M_i + 1) / 2):
+            params1 = copy.deepcopy(params)
+            params1['eta1_R'][d][j, ] = params1['eta1_R'][d][j, ] + eps
+            logZ1, grad1 = model.objective_function(
+                params1, idxs, alpha=alpha)
+            params2 = copy.deepcopy(params)
+            params2['eta1_R'][d][j, ] = params2['eta1_R'][d][j, ] - eps
+            logZ2, grad2 = model.objective_function(
+                params2, idxs, alpha=alpha)
+
+            dR_id = (logZ1 - logZ2) / eps / 2
+            print ('eta1_R d=%d, j=%d, computed=%.5f, numerical=%.5f, diff=%.5f'
+                   % (d, j, grad_all['eta1_R'][d][j], dR_id, (grad_all['eta1_R'][d][j]-dR_id)/dR_id))
+
+    # check grad theta_2
+    for d in range(Dout_i):
+        for j in range(M_i):
+            params1 = copy.deepcopy(params)
+            params1['eta2'][d][j, ] = params1['eta2'][d][j, ] + eps
+            logZ1, grad1 = model.objective_function(
+                params1, idxs, alpha=alpha)
+            params2 = copy.deepcopy(params)
+            params2['eta2'][d][j, ] = params2['eta2'][d][j, ] - eps
+            logZ2, grad2 = model.objective_function(
+                params2, idxs, alpha=alpha)
+
+            dR_id = (logZ1 - logZ2) / eps / 2
+            print ('eta2 d=%d, j=%d, computed=%.5f, numerical=%.5f, diff=%.5f'
+                   % (d, j, grad_all['eta2'][d][j], dR_id, (grad_all['eta2'][d][j]-dR_id)/dR_id))
+
+
+def test_gpr_aep_probit_stochastic():
+
+    # generate some datapoints for testing
+    N_train = 5
+    alpha = 0.5
+    M = 3
+    idxs = np.arange(N_train)[:M]
+    D = 2
+    Q = 3
+    x_train = np.random.randn(N_train, D)
+    y_train = 2*np.random.randint(0, 2, size=(N_train, Q)) - 1
+    # params tied
+    model = aep.SGPR(x_train, y_train, M, lik='Probit')
+
+    # init hypers, inducing points and q(u) params
+    init_params = model.init_hypers(y_train)
+    params = init_params.copy()
+    logZ, grad_all = model.objective_function(params, idxs, alpha=alpha)
+    pp.pprint(logZ)
+    pp.pprint(params)
+    # pdb.set_trace()
+
+    eps = 1e-5
+    # check grad ls
+    Din_i = model.Din
+    for d in range(Din_i):
+        params1 = copy.deepcopy(params)
+        params1['ls'][d] = params1['ls'][d] + eps
+        logZ1, grad1 = model.objective_function(
+            params1, idxs, alpha=alpha)
+
+        params2 = copy.deepcopy(params)
+        params2['ls'][d] = params2['ls'][d] - eps
+        logZ2, grad2 = model.objective_function(
+            params2, idxs, alpha=alpha)
+
+        dls_id = (logZ1 - logZ2) / eps / 2
+        # print logZ1, logZ2
+        print ('ls d=%d, computed=%.5f, numerical=%.5f, diff=%.5f'
+               % (d, grad_all['ls'][d], dls_id, (grad_all['ls'][d]-dls_id)/dls_id))
+
+    # check grad sf
+    params1 = copy.deepcopy(params)
+    params1['sf'] = params1['sf'] + eps
+    logZ1, grad1 = model.objective_function(
+        params1, idxs, alpha=alpha)
+    params2 = copy.deepcopy(params)
+    params2['sf'] = params2['sf'] - eps
+    logZ2, grad2 = model.objective_function(
+        params2, idxs, alpha=alpha)
+
+    dsf_i = (logZ1 - logZ2) / eps / 2
+    print ('sf computed=%.5f, numerical=%.5f, diff=%.5f'
+           % (grad_all['sf'], dsf_i, (grad_all['sf'] - dsf_i) / dsf_i))
+
+    # check grad zu
+    Din_i = model.Din
+    M_i = model.M
+    Dout_i = model.Dout
+    for m in range(M_i):
+        for k in range(Din_i):
+            params1 = copy.deepcopy(params)
+            eps1 = 0.0 * params1['zu']
+            eps1[m, k] = eps
+            params1['zu'] = params1['zu'] + eps1
+            logZ1, grad1 = model.objective_function(
+                params1, idxs, alpha=alpha)
+            params2 = copy.deepcopy(params)
+            params2['zu'] = params2['zu'] - eps1
+            logZ2, grad2 = model.objective_function(
+                params2, idxs, alpha=alpha)
+
+            dzu_id = (logZ1 - logZ2) / eps / 2
+            print ('zu m=%d, k=%d, computed=%.5f, numerical=%.5f, diff=%.5f'
+                   % (m, k, grad_all['zu'][m, k], dzu_id, (grad_all['zu'][m, k]-dzu_id)/dzu_id))
+
+    # check grad theta_1
+    for d in range(Dout_i):
+        for j in range(M_i * (M_i + 1) / 2):
+            params1 = copy.deepcopy(params)
+            params1['eta1_R'][d][j, ] = params1['eta1_R'][d][j, ] + eps
+            logZ1, grad1 = model.objective_function(
+                params1, idxs, alpha=alpha)
+            params2 = copy.deepcopy(params)
+            params2['eta1_R'][d][j, ] = params2['eta1_R'][d][j, ] - eps
+            logZ2, grad2 = model.objective_function(
+                params2, idxs, alpha=alpha)
+
+            dR_id = (logZ1 - logZ2) / eps / 2
+            print ('eta1_R d=%d, j=%d, computed=%.5f, numerical=%.5f, diff=%.5f'
+                   % (d, j, grad_all['eta1_R'][d][j], dR_id, (grad_all['eta1_R'][d][j]-dR_id)/dR_id))
+
+    # check grad theta_2
+    for d in range(Dout_i):
+        for j in range(M_i):
+            params1 = copy.deepcopy(params)
+            params1['eta2'][d][j, ] = params1['eta2'][d][j, ] + eps
+            logZ1, grad1 = model.objective_function(
+                params1, idxs, alpha=alpha)
+            params2 = copy.deepcopy(params)
+            params2['eta2'][d][j, ] = params2['eta2'][d][j, ] - eps
+            logZ2, grad2 = model.objective_function(
+                params2, idxs, alpha=alpha)
+
+            dR_id = (logZ1 - logZ2) / eps / 2
+            print ('eta2 d=%d, j=%d, computed=%.5f, numerical=%.5f, diff=%.5f'
+                   % (d, j, grad_all['eta2'][d][j], dR_id, (grad_all['eta2'][d][j]-dR_id)/dR_id))
+
+
+def plot_gpr_aep_gaussian_stochastic():
+
+    # generate some datapoints for testing
+    N_train = 2000
+    alpha = 0.5
+    M = 50
+    idxs = np.arange(N_train)
+    D = 2
+    Q = 3
+    y_train = np.random.randn(N_train, Q)
+    x_train = np.random.randn(N_train, D)
+    # params tied
+    model = aep.SGPR(x_train, y_train, M, lik='Gaussian')
+
+    # init hypers, inducing points and q(u) params
+    params = model.init_hypers(y_train)
+    logZ, grad_all = model.objective_function(params, idxs, alpha=alpha)
+    mbs = np.logspace(-2, 0, 10)
+    reps = 20
+    times = np.zeros(len(mbs))
+    objs = np.zeros((len(mbs), reps))
+    for i, mb in enumerate(mbs):
+        no_points = int(N_train * mb)
+        start_time = time.time()
+        for k in range(reps):
+            idxs_ik = np.random.permutation(N_train)[:no_points]
+            objs[i, k] = model.objective_function(params, idxs_ik, alpha=alpha)[0]
+        times[i] = time.time() - start_time
+
+    f, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
+    ax1.plot(mbs, times, 'x-')
+    ax1.set_xlabel("Minibatch proportion")
+    ax1.set_ylabel("Time taken")
+
+    ax2.plot(mbs, objs, 'kx')
+    ax2.axhline(logZ, color='b')
+    ax2.set_xlabel("Minibatch proportion")
+    ax2.set_ylabel("ELBO estimates")
+    plt.savefig('/tmp/gaussian_stochastic_aep_gpr.pdf')
+
+
+def plot_gpr_aep_probit_stochastic():
+
+    # generate some datapoints for testing
+    N_train = 2000
+    alpha = 0.5
+    M = 50
+    D = 2
+    Q = 3
+    x_train = np.random.randn(N_train, D)
+    y_train = 2*np.random.randint(0, 2, size=(N_train, Q)) - 1
+    # params tied
+    model = aep.SGPR(x_train, y_train, M, lik='Probit')
+
+    # init hypers, inducing points and q(u) params
+    params = model.init_hypers(y_train)
+    idxs = np.arange(N_train)
+    logZ, grad_all = model.objective_function(params, idxs, alpha=alpha)
+    mbs = np.logspace(-2, 0, 10)
+    reps = 20
+    times = np.zeros(len(mbs))
+    objs = np.zeros((len(mbs), reps))
+    for i, mb in enumerate(mbs):
+        no_points = int(N_train * mb)
+        start_time = time.time()
+        for k in range(reps):
+            idxs_ik = np.random.permutation(N_train)[:no_points]
+            objs[i, k] = model.objective_function(params, idxs_ik, alpha=alpha)[0]
+        times[i] = time.time() - start_time
+
+    f, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
+    ax1.plot(mbs, times, 'x-')
+    ax1.set_xlabel("Minibatch proportion")
+    ax1.set_ylabel("Time taken")
+
+    ax2.plot(mbs, objs, 'kx')
+    ax2.axhline(logZ, color='b')
+    ax2.set_xlabel("Minibatch proportion")
+    ax2.set_ylabel("ELBO estimates")
+    plt.savefig('/tmp/probit_stochastic_aep_gpr.pdf')
+
+
 def test_gpr_aep_gaussian_scipy():
     # generate some datapoints for testing
     N_train = 20
@@ -1284,16 +1602,21 @@ def kink(T, process_noise, obs_noise, xprev=None):
 
 
 if __name__ == '__main__':
-    test_gplvm_aep_gaussian()
-    test_gplvm_aep_probit()
-    test_gplvm_aep_gaussian_scipy()
-    test_gplvm_aep_probit_scipy()
+    # test_gplvm_aep_gaussian()
+    # test_gplvm_aep_probit()
+    # test_gplvm_aep_gaussian_scipy()
+    # test_gplvm_aep_probit_scipy()
 
     # test_gpr_aep_gaussian()
     # test_gpr_aep_probit()
     # test_gpr_aep_gaussian_scipy()
     # test_gpr_aep_probit_scipy()
 
+    # plot_gpr_aep_probit_stochastic()
+    # plot_gpr_aep_gaussian_stochastic()
+    test_gpr_aep_probit_stochastic()
+    test_gpr_aep_gaussian_stochastic()
+
     # test_dgpr_aep_gaussian()
    # test_dgpr_aep_probit()
     # test_dgpr_aep_gaussian_scipy()
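The new *_stochastic tests above follow the file's existing recipe: each analytic gradient from objective_function is compared against a central finite difference, d logZ / d theta ~= (logZ(theta + eps) - logZ(theta - eps)) / (2 * eps), evaluated at the same fixed idxs so the minibatch objective is deterministic during the comparison. A generic sketch of that pattern for an array-valued entry such as 'ls' or 'zu' (the helper is hypothetical, mirroring the loops above):

    import copy
    import numpy as np

    def check_grad_entry(model, params, key, index, idxs, alpha, eps=1e-5):
        # analytic gradient for this parameter entry
        _, grad_all = model.objective_function(params, idxs, alpha=alpha)
        # central finite difference, evaluated on the same minibatch
        p1 = copy.deepcopy(params)
        p1[key][index] += eps
        f1 = model.objective_function(p1, idxs, alpha=alpha)[0]
        p2 = copy.deepcopy(params)
        p2[key][index] -= eps
        f2 = model.objective_function(p2, idxs, alpha=alpha)[0]
        return grad_all[key][index], (f1 - f2) / (2 * eps)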