\documentclass[letterpaper, preprint]{aastex}
\usepackage{amsmath, amsfonts, bbm, bm, calc}
\usepackage{graphicx}
\usepackage[normalem]{ulem}
%\usepackage[pdftex]{graphicx}
%\usepackage{epstopdf}
\newcounter{address}
\usepackage{color}
\newcommand{\latin}[1]{\emph{#1}}
\newcommand{\etal}{\latin{et\,al.}}
\newcommand{\ie}{\latin{i.\,e.}}
\newcommand{\eg}{\latin{e.\,g.}}
\newcommand{\unit}[1]{\mathrm{#1}}
\newcommand{\bth} {\boldsymbol \theta}
\newcommand{\md}{\mathrm{d}}
\definecolor{red}{rgb}{1,0,0}
\definecolor{blue}{rgb}{0,0,1}
\definecolor{darkgreen}{rgb}{0,0.5,0}
% Display a comment in red, for discussion in drafts
% To make the comments go away, uncomment the second version of the command
\newcommand{\qer}[1]{{\color{red}#1}}
%\renewcommand{\qer}[1]{} % Uncomment this to make the red comments go away.
\usepackage{hyperref}
\begin{document}
\title{
Diffusive Nested Ensemble Sampling
}
\begin{abstract}
This paper contains an exposition and some refinements of diffusive nested sampling as an approach to computing the Bayesian evidence for Bayesian model selection. We show that an affine invariant ensemble sampler is effective in some cases. We use the modified algorithm to study multi-companion fits to radial velocity data for stars.
\end{abstract}
\keywords{
methods: nested sampling
---
methods: markov chain monte carlo
---
methods: data analysis
---
bayesian decision theory
}
\section{Introduction}
When presented with competing models and data $\cal D$, Bayes' theorem tells us to compare models as follows:
\begin{equation}
P(\mathrm{Model}_j\mid{\cal D})=\frac{P({\cal D}\mid\mathrm{Model}_j)\,P(\mathrm{Model}_j)}{\sum_k{P({\cal D}\mid\mathrm{Model}_k)\,P(\mathrm{Model}_k)}}.
\label{eq:bayesian-decision-theory}
\end{equation}
More precisely, suppose model $j$ has parameters $\bth_j = (\theta_1,\ldots,\theta_{n_j})$, where $n_j$ is the dimension of model $j$. Let $\pi_j(\bth_j)$ be the prior, and let $L_j({\cal D}\mid\bth_j)$ be the likelihood of data $\cal D$ in model $j$ with parameters $\bth_j$. The probability of data $\cal D$ given model $j$ in Eqn.~(\ref{eq:bayesian-decision-theory}), which is also the evidence of model $j$, is
\begin{equation}
Z_j({\cal D}) = P({\cal D}\mid\mathrm{Model}_j) = \int L_j({\cal D}\mid \bth_j)\, \pi_j(\bth_j) \,\md\bth_j \; .
\label{eqn:Zj}
\end{equation}
Our goal is to estimate $Z_j$ using Markov chain Monte Carlo (MCMC) and diffusive nested sampling.
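Once the $Z_j$ are in hand, the model comparison itself is a short computation. As a minimal sketch (illustrative, not part of our sampler), the posterior model probabilities of Eqn.~(\ref{eq:bayesian-decision-theory}) can be computed from log-evidences using the standard log-sum-exp trick:
\begin{verbatim}
import numpy as np

def model_posteriors(log_Z, log_prior):
    """Posterior model probabilities from log-evidences log(Z_j) and
    log-priors log P(Model_j), using log-sum-exp for stability."""
    log_num = np.asarray(log_Z) + np.asarray(log_prior)
    log_num = log_num - log_num.max()  # subtract max before exponentiating
    p = np.exp(log_num)
    return p / p.sum()                 # normalize as in Bayes' theorem
\end{verbatim}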
We have in mind the application of estimating the number of companions orbiting a star from radial velocity data. Let $j$ index a model with $j$ companions, and let $s$ be the number of data sources. The parameters of model $j$ consist of two parameters (velocity offset and jitter) per data source and five orbital parameters per companion. This gives
\[
n_j=2s+5j
\]
in this case.
The \textit{likelihood} in Eqn.~(\ref{eq:bayesian-decision-theory}) is in fact the evidence $Z_{\mathrm{Model}\;j}$ for model $j$.
\begin{equation}
Z_{\mathrm{Model}\;j} \equiv P(\mathrm{Data}|\mathrm{Model}_j) \equiv \int P(\mathrm{Data}|\theta, \mathrm{Model}_j) \, P(\theta|\mathrm{Model}_j)\mathrm{d}\theta,
\end{equation}
where $P(\mathrm{Data}|\theta, \mathrm{Model}_j)$ is the likelihood of parameter $\theta$ under model $j$ and $P(\theta|\mathrm{Model}_j)$ is the prior of parameter $\theta$ under model $j$. To simplify notation, we write $L(\theta)$ for the likelihood and $\pi(\theta)$ for the prior. We also drop the index on $Z_{\mathrm{Model}\;j}$, because the discussion applies to all models. So the evidence can be written as
\begin{equation}
Z=\int\! L(\theta)\,\pi(\theta)\,\mathrm{d}\theta.
\end{equation}
The prior $\pi(\theta)$ is normalized in the parameter space, that is
\begin{equation}
\int\!\pi(\theta)\,\mathrm{d}\theta=1,
\end{equation}
while the likelihood $L(\theta)$ is not.
However, evaluating the evidence integral with MCMC has always been challenging, because $Z$ is the normalization constant of a probability density. Diffusive nested sampling has proven to be an efficient and accurate method for evaluating the evidence \citep{brewer11a}. We hope to take advantage of an affine invariant ensemble sampler \citep{goodman10a, hou12a, foreman-mackey13a} to make diffusive nested sampling even more efficient.
\section{Diffusive Nested Sampling}
In this section, we explain how diffusive nested sampling works and how to apply the affine invariant sampler to it. We first introduce some basic concepts and describe how diffusive nested sampling works in general. Then we discuss the algorithm in detail. Finally, we refine the evidence evaluation and analyze its uncertainty.
We change the variable in the evidence integral from $\theta$ to
\begin{equation}
M(L^*) \equiv \int_{L(\theta)>L^*}\!\pi(\theta)\,\mathrm{d}\theta,
\label{eq:prior-mass}
\end{equation}
which is, in other words, the cumulative prior mass of the region where the likelihood exceeds $L^*$ \citep{skilling06a}. $M$ is a monotonically decreasing function of $L^*$ and ranges from 0 to 1; the mapping between $M$ and $L^*$ is a bijection. An infinitesimal increment of $M$ is
\begin{equation}
\mathrm{d}M=\int_{L^*-\mathrm{d}L^*<L(\theta)<L^*}\!\pi(\theta)\,\mathrm{d}\theta,
\label{eq:dM}
\end{equation}
that is, the prior mass of the thin shell of $\theta$ satisfying $L^*-\mathrm{d}L^*<L(\theta)<L^*$, where signs have been ignored for clarity. Multiplying both sides by $L^*$ and integrating, we get
\begin{equation}
\int^1_0\! L^*\,\mathrm{d}M=\int\!L(\theta)\,\pi(\theta)\,\mathrm{d}\theta,
\label{eq:int-dM}
\end{equation}
so the evidence $Z$ can be expressed as
\begin{equation}
Z=\int^1_0\! L^*(M)\,\mathrm{d}M.
\label{eq:evidence-prior-mass}
\end{equation}
In most cases, it is impossible to know the function $L^*(M)$ analytically. By the definition of $M$, Eqn.~(\ref{eq:prior-mass}), if we generate $N$ samples from the prior $\pi(\theta)$, then $M(L^*)$ is approximately the fraction of samples whose likelihood exceeds $L^*$. Nested sampling takes advantage of this to find $L^*(M)$ statistically.
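As a minimal sketch of that Monte Carlo estimate, where \texttt{sample\_prior} and \texttt{likelihood} are hypothetical model-specific functions:
\begin{verbatim}
import numpy as np

def estimate_prior_mass(L_star, N, sample_prior, likelihood):
    """Estimate M(L*) as the fraction of N prior draws whose
    likelihood exceeds L*."""
    thetas = sample_prior(N)                      # N draws from pi(theta)
    L = np.array([likelihood(t) for t in thetas])
    return np.mean(L > L_star)
\end{verbatim}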
\subsection{Level and Constrained Prior}
In nested sampling, we first try to find several points, $\{(M_0, L_0^*),(M_1,L_1^*),\ldots\}$, on the $L^*(M)$ curve. We call these points `levels', and we call $L^*_j$ a level's likelihood threshold, or simply its threshold. Each level defines a constrained prior,
\begin{equation}
p_{j}(\theta) = \frac{\pi(\theta)}{M_j}\,\mathbbm{1}_{L(\theta)>L_j^*},
\label{eq:constrained-prior}
\end{equation}
where
\begin{eqnarray*}
\mathbbm{1}_{L(\theta)>L_j^*} = \left\{ \begin{array}{ll}
1 & \mbox{ if } L(\theta)>L_j^*,\\
0 & \mbox{ otherwise.} \end{array} \right.
\end{eqnarray*}
Normalized by $M_j$, $p_{j}(\theta)$ is a properly defined probability density function. We can also define a mixture of these constrained priors,
\begin{equation*}
p(\theta) = \sum_{j} w_j\,p_{j}(\theta),
\end{equation*}
where the weights $w_j$ assigned to the $p_j(\theta)$ sum to 1. The mixture of constrained priors will be discussed in more detail later.
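In code, the constrained prior is most conveniently handled in log form. The sketch below assumes hypothetical model-specific \texttt{log\_prior} and \texttt{likelihood} functions, with arrays \texttt{L\_star} and \texttt{M} holding the level thresholds and prior masses:
\begin{verbatim}
import numpy as np

def log_constrained_prior(theta, j, log_prior, likelihood, L_star, M):
    """log p_j(theta) = log pi(theta) - log M_j if L(theta) > L*_j,
    and -inf otherwise (the indicator function in the text)."""
    if likelihood(theta) > L_star[j]:
        return log_prior(theta) - np.log(M[j])
    return -np.inf
\end{verbatim}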
\subsection{Setting Level Thresholds}
\label{sec:constructing}
Our nested sampler first attempts to make the levels' prior masses $M_j = e^{-j}$ and estimates the corresponding likelihood thresholds $L^*_j$. The algorithm to achieve this is described in Section~(\ref{sec:algorithm}). Then we keep $L^*_j$ unchanged and look for a more accurate prior mass $\widehat{M_j}$ corresponding to $L^*_j$. We call this procedure prior mass refinement; it is described in Section~(\ref{sec:refining-level-masses}). The zeroth level has $M_0 =\widehat{M_0}= 1$ and $L^*_0 = 0$.
To estimate $L^*_1$, we generate $N$ samples from the prior density $\pi(\theta)$, $\theta_1$, $\ldots$, $\theta_N$. We choose $L^*_1$ so that the number of $k$ with $L(\theta_k) > L^*_1$ is $N/e$. This may be done with the {\em quickselect} algorithm, implemented as \texttt{std::nth\_element} in the C++ standard template library (STL). The actual prior mass corresponding to $L^*_1$ is subject to round-off error. Luckily, we are able to find $N$'s that make $M_1$ extremely close to $e^{-1}$.\footnote{For example, $N=1084483$ and $N/e$ is rounded to $398959$. $\log{(398959/1084483)}=-0.999999999999823$.} We treat $M_j$ and $e^{-j}$ as synonyms in this paper.
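A minimal sketch of the threshold selection, using \texttt{numpy}'s partial sort (equivalent in effect to \texttt{std::nth\_element}):
\begin{verbatim}
import numpy as np

def level_threshold(L):
    """Given N likelihoods of prior samples, return the threshold
    exceeded by roughly N/e of them (average O(N) selection)."""
    N = len(L)
    k = int(round(N / np.e))        # number of samples above the threshold
    # the (k+1)-th largest value; exactly k values lie strictly above it,
    # assuming the likelihoods are distinct
    return np.partition(L, N - k - 1)[N - k - 1]
\end{verbatim}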
To estimate the next level $\left(M_2, \,L^*_2\right)$, we need $N$ samples from the prior with likelihood larger than $L^*_1$. There are many ways to obtain them. One is to sample the constrained density $p_{1}(\theta)$ defined in Eqn.~(\ref{eq:constrained-prior}). Another is to sample a mixture of $p_{1}(\theta)$ and the prior $\pi(\theta)$. Sampling the mixture is the better method because the region covered by level 1 may be disconnected in parameter space, and sampling only the constrained prior $p_{1}(\theta)$ may leave us stuck in one or a few of the disconnected pieces. To balance efficiency against the need to escape such disconnected regions, we give the latest level more weight; for example, we can use $w_1/w_0=e$. We keep sampling until we have a chain of $N$ likelihoods all larger than $L^*_1$, rank these likelihoods in descending order, and take the $N/e$-th one as the threshold $L_2^*$ of level 2. $M_2=e^{-2}$ is the prior mass that level 2 covers.
Continuing in this way, suppose we now have levels $(M_0,\,L^*_0)$, $(M_1,\,L^*_1)$, $(M_2,\,L^*_2)$, $\ldots$. All these levels are in fact estimates, which can be viewed in two ways. One view is that the $L^*_j$'s are estimates of the true likelihood thresholds corresponding to the prior masses $M_j=e^{-j}$. The other is that the $M_j$'s are estimates of the true prior masses covered by the $L^*_j$'s. We adopt the second view and will refine the prior masses in Section~(\ref{sec:refining-level-masses}).
There is a simple stopping criterion that tells us how many levels are enough, assuming we have solved the optimization problem of finding $L_{\mathrm{max}}$. Suppose we already have $j$ new levels besides level 0. The evidence integral is
$$
Z= \int_{M_{j}}^1 L^*(M)\,\mathrm{d}M+\int_0^{M_j} L^*(M)\,\mathrm{d}M=Z_j+\int_0^{M_j} L^*(M)\,\mathrm{d}M.
$$
Because $L^*(M) \leq L_{\mathrm{max}}$ everywhere, the second term cannot be larger than $L_{\mathrm{max}}\,M_j$. We choose a stopping point $J$ so that $L_{\mathrm{max}}\,M_J \leq \epsilon Z_J$; we usually choose $\epsilon = 10^{-6}$. $Z_J$ can be roughly estimated from the levels already built. We do not simply throw away the integral from $0$ to $M_J$; we just do not build new levels in that interval.
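The stopping rule itself is a one-liner; here $\epsilon = 10^{-6}$ as in the text:
\begin{verbatim}
def enough_levels(L_max, M_J, Z_J, eps=1e-6):
    """Stop building levels once the unexplored mass can contribute
    at most a fraction eps of the evidence accumulated so far."""
    return L_max * M_J <= eps * Z_J
\end{verbatim}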
With $J$ levels in total, the mixture of constrained priors can be defined as
\begin{equation}
p(\theta) = \sum_{j=0}^J w_j\,p_{j}(\theta),
\label{eq:mixture-constrained-prior}
\end{equation}
where $p_{j}(\theta)$ is the constrained prior defined in Eqn. (\ref{eq:constrained-prior})
and $w_j$ are the weights of each level which sum up to 1,
\begin{equation}
\sum_{j=0}^J w_j = 1.
\label{eq:weight-sum-1}
\end{equation}
The choice of weights depends on the purpose. For example, while building a new level, we can use `exponential' weights
\[
w_j \propto \exp{\left(\frac{j-J}{\lambda}\right)},
\]
where $J$ is the latest level index and $\lambda$ is some constant \citep{brewer11a}. But when we refine the prior masses, we sample all the levels with equal weight.
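A sketch of the two weighting schemes; the value of $\lambda$ below is an arbitrary placeholder:
\begin{verbatim}
import numpy as np

def level_weights(J, lam=10.0, exponential=True):
    """Weights w_0..w_J: 'exponential' favors recent levels while
    building new ones; uniform weights are used during refinement."""
    j = np.arange(J + 1)
    w = np.exp((j - J) / lam) if exponential else np.ones(J + 1)
    return w / w.sum()              # enforce sum_j w_j = 1
\end{verbatim}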
Nested sampling can be better understood in the context of importance sampling. The goal is to evaluate the integral $Z$ defined in Eqn.~(\ref{eq:int-dM}). In performing nested sampling, we are in fact trying to find a probability density similar in shape to the integrand $L(\theta)\,\pi(\theta)$; the mixture of constrained priors $p(\theta)$ is that probability density. However, because both the $M_j$ and the $L^*_j$ are estimates, $p(\theta)$ is not a well-defined probability density function, since its normalization depends on both. This is why we will have to refine the levels.
\subsection{Nested Sampling by Stretch Move}
\label{sec:algorithm}
We define the joint probability density of $\theta$ and $j$ as
\begin{equation}
p(\theta,j) = w_j\,p_j(\theta),
\label{eq:joint}
\end{equation}
so $p(\theta)$ defined in Eqn.~(\ref{eq:mixture-constrained-prior}) can be seen as the marginal density of $p(\theta,j)$ summed over $j$. We can sample $p(\theta,j)$ by partial re-sampling. That is to say, we first sample $p(\theta|j)$ with $j$ fixed and then sample $p(j|\theta)$ with $\theta$ fixed. The two conditional probability densities can be expressed more explicitly as
\begin{equation}
p(\theta|j)=\frac{p(\theta,j)}{\int p(\theta,j)\,\mathrm{d}\theta}=p_j(\theta),
\end{equation}
and
\begin{equation}
p(j|\theta)=\frac{p(\theta,j)}{\sum_j p(\theta,j)}=C\,\frac{w_j}{M_j}\,\mathbbm{1}_{L(\theta)>L_j^*}=C\,e^j\,w_j\,\mathbbm{1}_{L(\theta)>L_j^*},
\label{eq:conditional-j}
\end{equation}
where $C$ is a normalization constant that depends on $\theta$ but not on $j$.
We use the stretch move to sample $p(\theta|j)$. The stretch move is affine invariant and is a very efficient ensemble sampler with low auto-correlation time and few tuning parameters \citep{goodman10a}. To apply the stretch move to nested sampling, we assign a level index to every walker in the ensemble. The likelihood threshold of the level assigned to a walker must be smaller than the likelihood of that walker, so a walker with level index $j$ can be seen as a sample from $p_j(\theta)$, that is, $p(\theta|j)$. The ensemble size has to be larger than both the dimension of $\theta$ and the total number of levels in order not to get stuck in a subspace. The stretch move can be described in pseudo-code as follows (a Python sketch appears after the list):
\begin{sffamily}
\begin{itemize}
\item to propose a new location for walker $X$, randomly choose a helping walker $Y$ in the ensemble different from $X$.
\item propose a new location with stretch move: $X_{new} = Y + z\, (X-Y)$, where $z$ is a random variable from some distribution \citep{goodman10a}.
\item accept the proposed $X_{new}$ with probability $\mathrm{min}\left(1,\,z^{d-1}\,\frac{p_{j}(X_{new})}{p_{j}(X_{old})}\right)$, where $d$ is the dimension of $\theta$.
\end{itemize}
\end{sffamily}
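A minimal Python sketch of one such update, assuming a function \texttt{log\_p} returning $\log p_j(\theta)$ (for example the \texttt{log\_constrained\_prior} sketched earlier); the stretch variable $z$ is drawn with density $g(z)\propto 1/\sqrt{z}$ on $[1/a,\,a]$ as in \citet{goodman10a}:
\begin{verbatim}
import numpy as np

def stretch_move(X, Y, log_p, a=2.0, rng=np.random):
    """One stretch-move update of walker X using helper Y."""
    d = len(X)
    z = (1.0 + (a - 1.0) * rng.rand()) ** 2 / a  # z ~ g(z) on [1/a, a]
    X_new = Y + z * (X - Y)
    log_ratio = (d - 1.0) * np.log(z) + log_p(X_new) - log_p(X)
    if np.log(rng.rand()) < log_ratio:           # accept w.p. min(1, ...)
        return X_new
    return X                                     # reject: stay put
\end{verbatim}
Note that \texttt{log\_p} returns $-\infty$ outside the level's likelihood constraint, so such proposals are always rejected.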
There are many ways to sample $p(j|\theta)$ defined in Eqn.~(\ref{eq:conditional-j}). Simple Metropolis--Hastings would suffice \citep{brewer11a}. If the $w_j$ are simple functions, constant or exponential for instance, we can also use direct sampling. For example, when the $w_j$ are the same for all levels, the sampling can be summarized as follows (a Python sketch appears after the list):
\begin{sffamily}
\begin{itemize}
\item find the largest possible level index $j_{max}$ for walker $X$.
\item the new index is $j_{new} = \lfloor j_{max}+\log{(U)} \rfloor$, where $U$ is a uniform random variable on $(0,1)$ and $\lfloor\cdot\rfloor$ denotes the floor. If $j_{new}<0$, set $j_{new} = 0$.
\end{itemize}
\end{sffamily}
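A sketch of this direct draw:
\begin{verbatim}
import numpy as np

def sample_level_index(j_max, rng=np.random):
    """Draw j with p(j|theta) proportional to e^j for 0 <= j <= j_max
    (equal weights w_j); floor(j_max + log U) has exactly this law,
    up to the small bias from clipping negatives to 0."""
    j_new = int(np.floor(j_max + np.log(rng.rand())))
    return max(j_new, 0)
\end{verbatim}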
This is actually slightly biased towards moving to smaller $j$'s, but the bias is almost negligible. However, whichever method we use, we have in fact assumed that, if a walker is in level $j$, the probability that the walker can be assigned to any higher level $k>j$ is proportional to $\exp(j-k)$. This assumption is not guaranteed to hold because samples from $p(\theta|j)$ may not be independent, which means the new samples may be too close to the original ones.
The two procedures described above can happen in either order. In our code, as in \citet{brewer11a}, half the time we sample the parameter space first and the level indices second, and the other half we do the reverse.
\subsection{Refining Level Masses}
\label{sec:refining-level-masses}
Assume we have constructed $J$ levels following the previous sections, $\{(M_1,L_1^*),(M_2,L_2^*), \ldots,(M_J,L_J^*)\}$. We sample $p(\theta,j)$, defined in Eqn.~(\ref{eq:joint}), to obtain a long enough chain of both the visited level indices and the likelihoods of the walkers during those visits. In practice, we keep the samples from visits to different levels in different chains; with $J$ levels besides level 0, we have $J+1$ chains. The length of each chain should be at least a few times larger than $N$, the length we use to build each level in Section~(\ref{sec:constructing}). For the $j$-th chain ($0\leq j<J$), we can define an indicator function
\begin{eqnarray}
\mathbbm{1}_j(\theta) = \left\{ \begin{array}{ll}
1 & \mbox{ if } L(\theta)>L_{j+1}^*,\\
0 & \mbox{ otherwise.} \end{array} \right.
\end{eqnarray}
In other words, $\mathbbm{1}_j$ indicates whether the walker is within level $j+1$ during a visit to level $j$. Let $n_j$ be the length of the $j$-th chain, and let $n_j^{j+1}$ be the number of times the likelihood exceeds level $j+1$'s threshold $L^*_{j+1}$ during those $n_j$ visits to level $j$. $n_{j}^{j+1}$ can be expressed as
\begin{equation}
n_j^{j+1}=\sum_{k=1}^{n_j}\mathbbm{1}_j(\theta_k),
\end{equation}
where $k$ is just a label for samples. The refined prior mass $\widehat{M_j}$ is then defined as \citep{brewer11a}
\begin{equation}
\widehat{M_j} = \widehat{M_{j-1}} \, \frac{n_{j-1}^j+C \, {M_j}/{M_{j-1}}}{n_{j-1}+C}= \widehat{M_{j-1}} \, \frac{n_{j-1}^j+C \, e^{-1}}{n_{j-1}+C},
\label{eq:refinement}
\end{equation}
where $C$ is a constant that reflects one's confidence in the accuracy of original levels. If $n_{j-1}$ and $n_{j-1}^j$ are large enough, $C$ will not be very important. As mentioned before, $\widehat{M_0}=M_0=1$ and we start with $\widehat{M_1}$, so $\widehat{M_{j-1}}$ will be known when we arrive at $\widehat{M_j}$.
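A sketch of the refinement, Eqn.~(\ref{eq:refinement}); the value of \texttt{C} below is an arbitrary placeholder:
\begin{verbatim}
import numpy as np

def refine_masses(n, n_up, C=100.0):
    """Refined masses M_hat_0..M_hat_J from the visit counts n[j] and
    the counts n_up[j] of visits to level j that also exceed L*_{j+1}."""
    J = len(n)
    M_hat = np.empty(J + 1)
    M_hat[0] = 1.0                               # level 0: all prior mass
    for j in range(1, J + 1):
        M_hat[j] = M_hat[j - 1] * (n_up[j - 1] + C / np.e) / (n[j - 1] + C)
    return M_hat
\end{verbatim}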
In fact, if both $n_{j}$ and $n_{j}^{j+1}$ are large enough and $C$ is negligible, $\widehat{M_{j+1}}/\widehat{M_{j}}$ can be expressed as
\begin{equation}
\frac{\widehat{M_{j+1}}}{\widehat{M_{j}}}\approx\frac{n_j^{j+1}}{n_j}=\frac{1}{n_j}\sum_{k=1}^{n_j}\mathbbm{1}_j(\theta_k).
\label{eq:mass-ratio}
\end{equation}
So the expectation of $\widehat{M_{j+1}}/\widehat{M_{j}}$ is approximately the same as the expectation of $\mathbbm{1}_j$, and $\widehat{M_j}$ is
\begin{equation}
\widehat{M_j}\approx\prod_{l=0}^{j-1}\left(\frac{1}{n_l}\sum_{k=1}^{n_l}\mathbbm{1}_l(\theta_k)\right).
\end{equation}
%Because all the $\mathbbm{1}_j$'s are independent, we have the expectation of $\widehat{M_j}$
%\begin{equation}
%\mathrm{E}(\widehat{M_j}) \approx \mathrm{E}(\mathbbm{1}_0\,\mathbbm{1}_1\,\mathbbm{1}_2\,\ldots\,\mathbbm{1}_{j-1}).
%\end{equation}
The variance of $\mathbbm{1}_j$ is
\begin{equation}
\mathrm{Var}(\mathbbm{1}_j)=\mathrm{E}(\mathbbm{1}_j)\left(1-\mathrm{E}(\mathbbm{1}_j)\right)\approx \frac{n_{j}^{j+1}}{n_{j}}\left(1-\frac{n_{j}^{j+1}}{n_{j}}\right).
\end{equation}
By the central limit theorem, taking the variance of both sides of Eqn.~(\ref{eq:mass-ratio}), we get
\begin{eqnarray}
\mathrm{Var}\left(\frac{\widehat{M_{j+1}}}{\widehat{M_{j}}}\right)\approx\mathrm{Var}\left(\frac{1}{n_j}\sum_{k=1}^{n_j}\mathbbm{1}_j(\theta_k)\right) = \frac{\mathrm{E}(\mathbbm{1}_j)\left(1-\mathrm{E}(\mathbbm{1}_j)\right)}{n_{j}/\tau} \approx \frac{n_{j}^{j+1}/n_{j}\left(1-n_{j}^{j+1}/n_{j}\right)}{n_{j}/\tau},
\label{eq:var-ratio}
\end{eqnarray}
where $\tau$ is the auto-correlation time of the chain $\mathbbm{1}_j(\theta_k)$, $k=1,\,2,\,\ldots,\,n_j$. We can then obtain the variance of $\widehat{M_j}$ from the relationship
\begin{equation}
\mathrm{Var}\left(\widehat{M_{j}}\right)=\mathrm{Var}\left(\frac{\widehat{M_{j}}}{\widehat{M_{j-1}}}\right)\,\mathrm{Var}\left(\widehat{M_{j-1}}\right)+\mathrm{Var}\left(\frac{\widehat{M_{j}}}{\widehat{M_{j-1}}}\right)\,\mathrm{E}\left(\widehat{M_{j-1}}\right)^2+\mathrm{E}\left(\frac{\widehat{M_{j}}}{\widehat{M_{j-1}}}\right)^2\,\mathrm{Var}\left(\widehat{M_{j-1}}\right),
\end{equation}
starting from $\mathrm{Var}\left(\widehat{M_0}\right)=0$. The uncertainty of the interval $\widehat{M_{j-1}}-\widehat{M_{j}}$ can be found similarly,
\begin{align}
\mathrm{Var}\left(\widehat{M_{j-1}}-\widehat{M_{j}}\right)=&\mathrm{Var}\left(\frac{\widehat{M_{j}}}{\widehat{M_{j-1}}}\right)\,\mathrm{Var}\left(\widehat{M_{j-1}}\right)+\mathrm{Var}\left(\frac{\widehat{M_{j}}}{\widehat{M_{j-1}}}\right)\,\mathrm{E}\left(\widehat{M_{j-1}}\right)^2 \nonumber\\
&+\left(1-\mathrm{E}\left(\frac{\widehat{M_{j}}}{\widehat{M_{j-1}}}\right)\right)^2\,\mathrm{Var}\left(\widehat{M_{j-1}}\right),
\label{eq:refine-interval-variance}
\end{align}
which follows from writing $\widehat{M_{j-1}}-\widehat{M_{j}}=\left(1-\widehat{M_{j}}/\widehat{M_{j-1}}\right)\widehat{M_{j-1}}$. The only difference from $\mathrm{Var}\left(\widehat{M_{j}}\right)$ is in the third term.
While sampling the mixture of constrained priors, Eqn.~(\ref{eq:mixture-constrained-prior}), it is possible that the weights $w_j$ are not sampled as desired. One cause is that the levels are not constructed accurately, so that the prior mass covered by $L^*_j$ is not exactly $e^{-1}$ times the prior mass covered by $L^*_{j-1}$. Another cause is that the samples are not independent, as discussed in Section~(\ref{sec:algorithm}). As a result, some of the $n_j$'s and $n_j^{j+1}$'s are too small to make a meaningful refinement. In such cases, we can use the numbers of visits $n_j$ to enforce that the weights $w_j$ be sampled as desired \citep{brewer11a}. Although such enforcement violates the Markov property, the violation only happens in sampling the indices and not in sampling the parameter space, so the estimate of $Z$ should not be affected.
\subsection{Computing Evidence}
We take the mean of likelihoods sandwiched between two levels,
\begin{equation}
\bar{L}_j= \frac{1}{l_j}\sum_{L_j^*\leq L(\theta)<L_{j+1}^*} L(\theta),
\end{equation}
where $l_j$ is the number of samples sandwiched between level $j$ and level $j+1$. For notational simplicity, we add one extra level whose likelihood threshold $L^*_{J+1}$ is the maximum likelihood $L_{\mathrm{max}}$ and whose prior mass $M_{J+1}$ is 0. The estimate of the evidence can then be expressed as
\begin{equation}
\widehat{Z}=\sum_{j=0}^{J} \bar{L}_j \, \left(\widehat{M_j} - \widehat{M_{j+1}}\right).
\end{equation}
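In code the estimate is a single weighted sum; here \texttt{L\_bar} has length $J+1$ and \texttt{M\_hat} length $J+2$, with $\widehat{M_{J+1}}=0$ appended:
\begin{verbatim}
import numpy as np

def estimate_evidence(L_bar, M_hat):
    """Z_hat = sum_j L_bar_j * (M_hat_j - M_hat_{j+1})."""
    m = np.asarray(M_hat)
    return np.sum(np.asarray(L_bar) * (m[:-1] - m[1:]))
\end{verbatim}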
The variance of the evidence $Z$ can be estimated via
\begin{equation}
\mathrm{Var}\left(\widehat{Z}\right) \approx \sum_{j=0}^{J} {\bar{L}_j}^{2} \, \mathrm{Var}\left(\widehat{M_j} - \widehat{M_{j+1}}\right) + \mathrm{Var}\left(\bar{L}_j\right)\, \left(\widehat{M_j} - \widehat{M_{j+1}}\right)^2,
\label{eq:var-Z}
\end{equation}
where $\mathrm{Var}\left(\widehat{M_j} - \widehat{M_{j+1}}\right)$ can be calculated using Eqn.~(\ref{eq:refine-interval-variance}) and
\begin{equation}
\mathrm{Var}\left(\bar{L}_j\right) \approx \frac{1}{{l_j}^2/\tau_j}\sum_{L_j^*\leq L(\theta)<L_{j+1}^*} \left(L(\theta)-\bar{L}_j\right)^2,
\end{equation}
where $\tau_j$ is the auto-correlation time of the chain of $L(\theta)$ values satisfying $L_j^*\leq L(\theta)<L_{j+1}^*$. In Eqn.~(\ref{eq:var-Z}), cross terms are ignored because $\widehat{M_j} - \widehat{M_{j+1}}$ and $\bar{L}_j$ are independent. Also ignored is the product of the variances of $\widehat{M_j} - \widehat{M_{j+1}}$ and $\bar{L}_j$, because its contribution is of smaller order.
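The corresponding variance estimate, with cross terms and the higher-order term dropped as in Eqn.~(\ref{eq:var-Z}):
\begin{verbatim}
import numpy as np

def evidence_variance(L_bar, var_L_bar, M_hat, var_dM):
    """Var(Z_hat) from per-level mean likelihoods L_bar, their variances
    var_L_bar, masses M_hat (with M_hat_{J+1} = 0 appended), and the
    interval variances var_dM = Var(M_hat_j - M_hat_{j+1})."""
    m = np.asarray(M_hat)
    dM = m[:-1] - m[1:]
    return np.sum(np.asarray(L_bar) ** 2 * np.asarray(var_dM)
                  + np.asarray(var_L_bar) * dM ** 2)
\end{verbatim}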
\begin{thebibliography}{}
\bibitem[Brewer \etal(2011)]{brewer11a}
Brewer,~B.~J., P\'{a}rtay,~L.~B., \& Cs\'{a}nyi,~G., 2011, Statistics and Computing, 21, 649
\bibitem[Foreman-Mackey \etal(2013)]{foreman-mackey13a}
Foreman-Mackey,~D., Hogg,~D.~W., Lang,~D., \& Goodman,~J., 2013, PASP, 125, 306 (arXiv:1202.3665)
\bibitem[Goodman \etal(2010)]{goodman10a}
Goodman,~J., \& Weare,~J., 2010, Comm.\ App.\ Math.\ and Comp.\ Sci., 5, 65
\bibitem[Hou \etal(2012)]{hou12a}
Hou,~F., Goodman,~J., Hogg,~D.~W., Weare,~J., \& Schwab,~C., 2012, \apj, 745, 198
\bibitem[Skilling (2006)]{skilling06a}
Skilling,~J., 2006, Bayesian Analysis, 1, 833
\end{thebibliography}
\end{document}