% lecture14.tex (forked from bcaffo/Caffo-Coursera)
\documentclass[aspectratio=169]{beamer}
\mode<presentation>
\usetheme{Hannover}
\useoutertheme{sidebar}
\usecolortheme{dolphin}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{enumerate}
% some math operators and bold math symbols
\newcommand{\Cov}{\mathrm{Cov}}
\newcommand{\Cor}{\mathrm{Cor}}
\newcommand{\Var}{\mathrm{Var}}
\newcommand{\brho}{\boldsymbol{\rho}}
\newcommand{\bSigma}{\boldsymbol{\Sigma}}
\newcommand{\btheta}{\boldsymbol{\theta}}
\newcommand{\bbeta}{\boldsymbol{\beta}}
\newcommand{\bmu}{\boldsymbol{\mu}}
\newcommand{\bW}{\mathbf{W}}
\newcommand{\one}{\mathbf{1}}
\newcommand{\bH}{\mathbf{H}}
\newcommand{\by}{\mathbf{y}}
\newcommand{\bolde}{\mathbf{e}}
\newcommand{\bx}{\mathbf{x}}
\newcommand{\cpp}[1]{\texttt{#1}}
\title{Mathematical Biostatistics Boot Camp: Lecture 14, Logs}
\author{Brian Caffo}
\date{\today}
\institute[Department of Biostatistics]{
Department of Biostatistics \\
Johns Hopkins Bloomberg School of Public Health\\
Johns Hopkins University
}
\begin{document}
\frame{\titlepage}
\section{Table of contents}
\frame{
\frametitle{Table of contents}
\tableofcontents
}
\section{Logs}
\begin{frame}\frametitle{Logs}
\begin{itemize}
\item Recall that $\log_B(x)$ is the number
$y$ so that $B^y = x$
\item Note that you can not take the log of a negative
number; $\log_B(1)$ is always 0 and $\log_B(0)$ is $-\infty$
\item When the base is $B = e$ we
write $\log_e$ as just $\log$ or $\ln$
\item Other useful bases are $10$ (orders of magnitude) or $2$
\item Recall that $\log(ab) = \log(a) + \log(b)$, $\log(a^b) = b\log(a)$,
$\log(a/b) = \log(a) - \log(b)$ ($\log$ turns multiplication into addition,
division into subtraction, powers into multiplication)
\end{itemize}
\end{frame}
\begin{frame}\frametitle{Some reasons for ``logging'' data}
\begin{itemize}
\item To correct for right skewness
\item When considering ratios
\item In settings where errors are feasibly multiplicative, such as
when dealing with concentrations or rates
\item To consider orders of magnitude (using log base 10); for example
when considering astronomical distances
\item Counts are often logged (though note the problem with zero counts)
\end{itemize}
\end{frame}
\section{The geometric mean}
\begin{frame}\frametitle{The geometric mean}
\begin{itemize}
\item The (sample) {\bf geometric mean} of a data set $X_1,\ldots,X_n$ is
$$
\left(\prod_{i=1}^n X_i \right)^{1/n}
$$
\item Note that (provided that the $X_i$ are positive) the log of the
geometric mean is
$$
\frac{1}{n}\sum_{i=1}^n \log(X_i)
$$
\item As the log of the geometric mean is an average, the LLN and CLT apply
(under what assumptions?)
\item The geometric mean is always less than or equal to the sample
(arithmetic) mean
\end{itemize}
\end{frame}
\begin{frame}\frametitle{The geometric mean}
\begin{itemize}
\item The geometric mean is often used when the $X_i$ are all multiplicative
\item Suppose that in a population of interest, the prevalence of a
disease rose $2\%$ one year, then fell $1\%$ the next, then rose
$2\%$, then rose $1\%$; since these factors act multiplicatively it
makes sense to consider the geometric mean
$$
\left(1.02 \times .99 \times 1.02 \times 1.01\right)^{1/4} \approx 1.01
$$
for a $1\%$ geometric mean increase in disease prevalence
\end{itemize}
\end{frame}
\begin{frame}
\begin{itemize}
\item Notice that multiplying the initial prevalence by $1.01^4$ is the
same as multiplying by the original four numbers in sequence
\item Hence $1.01$ is the constant factor by which you would need to multiply
the initial prevalence each year to achieve the same overall increase
in prevalence over a four year period
\item The arithmetic mean, in contrast, is the constant amount that
you would need to {\em add} each year to achieve the same
{\em total} increase ($1.02 + .99 + 1.02 + 1.01$)
\item In this case the product and hence the geometric mean make more
sense than the arithmetic mean
\end{itemize}
\end{frame}
\begin{frame}\frametitle{Nifty fact}
\begin{itemize}
\item The {\em question corner} (google) at the University of
Toronto's web site (where I got much of this) has a fun interpretation
of the geometric mean
\item If $a$ and $b$ are the lengths of the sides of a rectangle then
\begin{itemize}
\item The arithmetic mean $(a + b) / 2$ is the length of the sides of the square that
has the same perimeter
\item The geometric mean $(ab)^{1/2}$ is the length of the sides of
the square that has the same area
\end{itemize}
\item So if you're interested in perimeters (adding) use the
arithmetic mean; if you're interested in areas (multiplying) use the
geometric mean
\end{itemize}
\end{frame}
\begin{frame}\frametitle{Asymptotics}
\begin{itemize}
\item Note, by the LLN the log of the geometric mean converges to $\mu = E[\log(X)]$
\item Therefore the geometric mean converges to $\exp\{E[\log(X)]\} =
e^\mu$, which is {\em not} the population mean on the natural scale;
we call this the population geometric mean (but no one else seems
to)
\item To reiterate
$$
\exp\{E[\log(X)]\} \neq E[\exp\{\log(X)\}] = E[X]
$$
\item Note if the distribution of $\log(X)$ is symmetric
then
$$
.5 = P(\log X \leq \mu) = P(X \leq e^\mu)
$$
\item Therefore, for log-symmetric distributions the geometric mean is
estimating the median
\end{itemize}
\end{frame}
\section{GM and the CLT}
\begin{frame}\frametitle{Using the CLT}
\begin{itemize}
\item If you use the CLT to create a confidence interval for the log
measurements, your interval is estimating $\mu$, the expected
value of the log measurements
\item If you exponentiate the endpoints of the interval, you are
estimating $e^\mu$, the population geometric mean
\item Recall, $e^\mu$ is the population median when the distribution
of the logged data is symmetric
\item This is especially useful for paired data when their ratio, rather than
their difference, is of interest
\end{itemize}
\end{frame}
\begin{frame}\frametitle{Example}
Rosner, Fundamentals of Biostatistics page 298 gives a paired design comparing SBP for matched oral contraceptive users and controls.
\begin{itemize}
\item The geometric mean ratio is 1.04 (4\% increase in SBP for the OC users)
\item The T interval on the difference of the log scale measurements is [0.010, 0.067] log(mm Hg)
\item Exponentiating yields [1.010, 1.069] (a unitless ratio).
\end{itemize}
\end{frame}
\section{Comparisons}
\begin{frame}\frametitle{Comparisons}
\begin{itemize}
\item Consider when you have two independent groups, logging the
individual data points and creating a confidence interval for the
difference in the log means
\item Prove to yourself that exponentiating the endpoints of this
interval is then an interval for the {\em ratio} of the population
geometric means, $\frac{e^{\mu_1}}{e^{\mu_2}}$
\end{itemize}
\end{frame}
\section{The log-normal distribution}
\begin{frame}\frametitle{The log-normal distribution}
\begin{itemize}
\item A random variable is {\bf log-normally} distributed {\em if its log
is a normally distributed random variable}
\item ``I am log-normal'' means ``take logs of me and then I'll be normal''
\item Note log-normal random variables are not logs of normal random variables!!!!!! (You can't even take the log of a normal random variable)
\item Formally, $X$ is lognormal$(\mu,\sigma^2)$ if $\log(X) \sim \mbox{N}(\mu, \sigma^2)$
\item If $Y \sim \mbox{N}(\mu,\sigma^2)$ then $X = e^Y$ is log-normal
\end{itemize}
\end{frame}
\begin{frame}\frametitle{The log-normal distribution}
\begin{itemize}
\item The log-normal density is
$$
\frac{1}{\sigma\sqrt{2\pi}} \times \frac{\exp[-\{\log(x) - \mu\}^2/ (2\sigma^2)]}{x}
~~\mbox{for}~~ 0 < x < \infty
$$
\item Its mean is $e^{\mu + (\sigma^2 / 2)}$ and variance is $e^{2\mu + \sigma^2}(e^{ \sigma^2} - 1)$
\item Its median is $e^\mu$
\end{itemize}
\end{frame}
\begin{frame}\frametitle{The log-normal distribution}
\begin{itemize}
\item Notice that if we assume that $X_1,\ldots,X_n$ are
log-normal$(\mu,\sigma^2)$
then $Y_1 = \log X_1,\ldots, Y_n = \log X_n$ are normally distributed
with mean $\mu$ and variance $\sigma^2$
\item Creating a Gosset's $t$ confidence interval using the $Y_i$
yields a confidence interval for $\mu$, the log of the median of the
$X_i$
\item Exponentiate the endpoints of the interval to obtain a confidence interval
for $e^\mu$, the median on the original scale
\item Assuming log-normality, exponentiating $t$ confidence intervals
for the difference in two log means again estimates ratios of geometric
means
\end{itemize}
\end{frame}
\begin{frame}\frametitle{Example}
Gray matter volumes investigated
\begin{itemize}
\item Took GM volumes for the young and old groups, logged them
\item Did two independent group intervals, got
old [13.24, 13.27] log(cubic mm) and young [13.29, 13.31] log(cubic mm).
\item Exponentiating and converting to cc yields [564.4, 577.5] cc, [592.0, 606.9] cc.
\item Doing a two group T interval on the logged measurements yields [0.032, 0.066] log(cubic mm)
\item Exponentiating this interval yields [1.032, 1.068]
\end{itemize}
\end{frame}
\end{document}