% NOTE -- ONLY EDIT THE .Rnw FILE!!!  The .tex file is
% likely to be overwritten.

\documentclass[dvipsnames,pdflatex,beamer]{beamer}
%\documentclass[letterpaper,11pt,notitlepage]{article}\usepackage{beamerarticle}
\mode<article>{\usepackage[text={6.2in,9in},centering]{geometry}}
\mode<beamer>{\usetheme{Boadilla}\usecolortheme{seahorse}\usecolortheme{rose}}
\usepackage{SweaveSlides}
\date[Sept 24, 2010]{Merck, Sharp \& Dohme; Rahway, NJ\\Sept 24, 2010}
\title[NLMM]{Mixed models in R using the lme4 package\\Part 8: Nonlinear mixed models}
\SweaveOpts{engine=R,eps=FALSE,pdf=TRUE,width=10,height=6.5,strip.white=all}
\SweaveOpts{prefix=TRUE,prefix.string=figs/GLMM,include=TRUE}
\SweaveOpts{keep.source=TRUE}
\mode<beamer>{\setkeys{Gin}{width=\textwidth}}
\mode<article>{\setkeys{Gin}{width=0.8\textwidth}}
\newcommand{\bLt}{\ensuremath{\bm\Lambda_\theta}}
<<preliminaries,echo=FALSE,results=hide>>=
options(width = 70, show.signif.stars = FALSE)
data(Contraception, package = "mlmRev")
library(lattice)
library(Matrix)
library(MatrixModels)
library(Rcpp)
library(minqa)
library(lme4a)
lattice.options(default.theme = function() standard.theme())
@ 
\begin{document}

\mode<article>{\maketitle\tableofcontents}
\mode<presentation>{\frame{\titlepage}}
\mode<presentation>{\frame{\frametitle{Outline}\tableofcontents[pausesections,hideallsubsections]}}
\section{Nonlinear mixed models}

\begin{frame}
  \frametitle{Nonlinear mixed models}
  \begin{itemize}
  \item Population pharmacokinetic data are often modeled using
    \Emph{nonlinear mixed-effects models} (NLMMs).
  \item These are \Emph{nonlinear} because pharmacokinetic parameters
    - rate constants, clearance rates, etc. - occur nonlinearly in the
    model function.
  \item In statistical terms these are \Emph{mixed-effects models}
    because they involve both \Emph{fixed-effects parameters},
    applying to the entire population or well-defined subsets of the
    population, and \Emph{random effects} associated with particular
    experimental or observational units under study.
  \item Many algorithms for obtaining parameter estimates, usually
    ``something like'' the \Emph{maximum likelihood estimates} (MLEs),
    for such models have been proposed and implemented.
  \item Comparing different algorithms is not easy.  Even
    understanding the definition of the model and the proposed
    algorithm is not easy.
  \end{itemize}
\end{frame}


\begin{frame}[fragile]
  \frametitle{An example: Theophylline pharmacokinetics}
<<Theophplot,fig=TRUE,echo=FALSE,height=5>>=  
print(xyplot(conc ~ Time|Subject, Theoph, type = c("g","b"),
             xlab = "Time since drug administration (hr)",
             ylab = "Serum concentration (mg/l)"))
@ 
\begin{itemize}
\item These are serum concentration profiles for 12 volunteers after
  injestion of an oral dose of Theophylline, as described in Pinheiro
  and Bates (2000).
\end{itemize}

\end{frame}
\begin{frame}
  \frametitle{Modeling pharmacokinetic data with a nonlinear model}
  \begin{itemize}
  \item These are longitudinal repeated measures data.
  \item For such data the time pattern of an individual's response is
    determined by pharmacokinetic parameters (e.g. rate constants)
    that occur nonlinearly in the expression for the expected response.
  \item The form of the nonlinear model is determined by the
    pharmacokinetic theory, not derived from the data.
    \begin{displaymath}
      d\cdot k_e\cdot k_a\cdot C\frac{e^{-k_et}-e^{-k_at}}{k_a-k_e}
    \end{displaymath}
  \item These pharmacokinetic parameters vary over the population.  We
    wish to characterize typical values in the population and the
    extent of the variation.
  \item Thus, we associate random effects with the parameters, $k_a$,
    $k_e$ and $C$ in the nonlinear model.
  \end{itemize}
\end{frame}

\section{Statistical theory, applications and approximations}
\label{sec:Philosophy}

\begin{frame}
  \frametitle{Statistical theory and applications - why we need both}
  \begin{itemize}
  \item For 30 years, I have had the pleasure of being part of the
    U. of Wisconsin-Madison Statistics Dept.  This year we celebrate
    the 50th anniversary of the founding of our department by George
    Box (who turned 90 earlier this year).
  \item George's approach, emphasizing \textbf{both} the theory and
    the applications of statistics, has now become second-nature to me.
  \item We are familiar with the dangers of practicing theory without
    knowledge of applications.  As George famously said, ``All models
    are wrong; some models are useful.''  How can you expect to decide
    if a model is useful unless you use it?
  \item We should equally be wary of the application of statistical
    techniques for which we know the ``how'' but not the ``why''.
    Despite the impression we sometimes give in courses, applied
    statistics is not just a ``black box'' collection of formulas into
    which you pour your data, hoping to get back a p-value that is
    less than 5\%.  (In the past many people felt that ``applied
    statistics is the use of SAS'' but now we know better.)
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{The evolving role of approximation}
  \begin{itemize}
  \item When Don Watts and I wrote a book on nonlinear regression we
    included a quote from Bertrand Russell, ``Paradoxically, all exact
    science is dominated by the idea of approximation''.  In
    translating statistical theory to applied techniques (computing
    algorithms) we almost always use some approximations.
  \item Sometimes the theory is deceptively simple (maximum
    likelihood estimates are the values of the parameters that
    maximize the likelihood, given the data) but the devil is in the
    details (so exactly how do I maximize this likelihood?).
  \item Decades of work by many talented people have provided us with
    a rich assortment of computational approximations and other tricks
    to help us get to the desired answer - or at least close to the
    desired answer.
  \item It is important to realize that approximations, like all
    aspects of computing, have a very short shelf life.  Books on
    theory can be useful for decades; books on computing may be
    outmoded in a few years.
  \end{itemize}
\end{frame}
\begin{frame}
  \frametitle{Failure to revisit assumptions leads to absurdities}
  \begin{itemize}
  \item Forty years ago, when I took an intro engineering stats class,
    we used slide rules or pencil and paper for calculations.  Our
    text took this into account, providing short-cut computational
    formulas and ``rules of thumb'' for the use of approximations, plus
    dozens of pages of tables of probabilities and quantiles.
  \item Today's computing resources are unimaginably more
    sophisticated yet the table of contents of most introductory text
    hasn't changed.
  \item The curriculum still includes using tables to evaluate
    probabilities, calculating coefficient estimates of a simple
    linear regression by hand, creating histograms (by hand, probably)
    to assess a density, approximating a binomial by a Poisson or by a
    Gaussian for cases not available in the tables, etc.
  \item Then we make up PDF slides of this content and put the file on a web
    site for the students to download and follow on their laptops
    during the lecture.  Apparently using the computer to evaluate the
    probabilities or to fit a model would be cheating - you are
    supposed to do this by hand.
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{And what about nonlinear mixed-effects models?}
  \begin{itemize}
  \item Defining the statistical model is subtle and all methods
    proposed for determining parameter estimates use approximations.
  \item Often the many forms of approximations are presented as different
    ``types'' of estimates from which one can pick and choose. 
  \item In 2007-2008 a consortium of pharma companies, the NLMEc,
    discussed ``next generation'' simulation and estimation software
    for population PK/PD modeling.  They issued a set of user
    requirements for such software including, in section 4.4 on estimation
    \begin{quote}
      The system will support but not be limited to the following
      estimation methods: FO, FOI, FOCE, FOCEI, Laplacian, Lindstrom
      and Bates, MCMC, MCPEM, SAEM, Gaussian quadrature, and
      nonparametric methods.
    \end{quote}
  \item Note the emphasis on estimation methods (i.e. algorithms).
    All of these techniques are supposed to approximate the mle's but
    that is never mentioned.
  \end{itemize}
\end{frame}
\section[Model]{Model definition}

\begin{frame}
  \frametitle{Linear and nonlinear mixed-effects models}
  \begin{itemize}
  \item Both linear and nonlinear mixed-effects models, are based on
    the $n$-dimensional response random variable, $\bc Y$, whose
    value, $\bm y$, is observed, and the $q$-dimensional, unobserved
    random effects variable, $\bc B$.
  \item In the models we will consider $\bc B\sim\mathcal{N}\left(\bm
      0,\bm\Sigma_\theta\right)$.  The variance-covariance matrix
    $\bm\Sigma_\theta$ can be huge but it is completely determined by
    a small number of \Emph{variance-component parameters},
    $\bm\theta$.
  \item The conditional distribution of the response, $\bc Y$, is
    \begin{displaymath}
      \left(\bc Y|\bc B=\bm b\right)\sim
      \mathcal{N}\left(\bm\mu_{\bc Y|\bc B},\sigma^2\bm I_n\right)
    \end{displaymath}
  \item The conditional mean, $\bm\mu_{\bc Y|\bc B}$, depends on $\bm
    b$ and on the fixed-effects parameters, $\bm\beta$, through a
    \Emph{linear predictor} expression, $\bm Z\bm b+\bm X\bm\beta$.
  \item For a linear mixed model (LMM), $\bm\mu_{\bc Y|\bc B}$ is
    exactly the linear predictor.  For an NLMM the linear predictor
    determines the parameter values in the nonlinear model function
    which then determines the mean.
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Conditional mode and profiled Laplace approximation for NLMMs}
  \begin{itemize}
  \item As previously stated, determining the conditional mode 
    \begin{displaymath}
      \tilde{\bm u}_{\theta,\beta}=\arg\min_{\bm u}
      \left[\norm{\bm y-\bm\mu_{\bc Y|\bc U}}^2 +
      \norm{\bm u}^2\right]
    \end{displaymath}
    in an NLMM is a penalized nonlinear least squares (PNLS) problem.
  \item It is a nonlinear optimization problem but a comparatively
    simple one.  The penalty term \Emph{regularizes} the optimization.
  \item The Laplace approximation to the profiled deviance (profiled
    over $\sigma^2$) is, as before,
    \begin{displaymath}
    -2\tilde{\ell}(\bm\theta,\bm\beta|\bm y)=\log(|\bm L_\theta|^2)+
      n\left[1+\log\left(\frac{2\pi r^2(\bm\theta,\bm\beta)}{n}\right)\right]
    \end{displaymath}
    where $\bm L_\theta$ is the sparse Cholesky factor evaluated at
    the conditional mode.
  \item The motivation for this approximation is that it replaces the
    conditional distribution, $(\bc U|\bc Y=\bm y)$, for parameters
    $\bm\beta$, $\bm\theta$ and $\sigma$, by a multivariate Gaussian
    approximation, \Emph{evaluated at the mode}.
  \end{itemize}
\end{frame}
\begin{frame}
  \frametitle{Laplace approximation and adaptive Gauss-Hermite quadrature}
  \begin{itemize}
  \item The Laplace approximation
    \begin{displaymath}
    -2\tilde{\ell}(\bm\theta,\bm\beta|\bm y)=\log(|\bm L_\theta|^2)+
      n\left[1+\log\left(\frac{2\pi r^2(\bm\theta,\bm\beta)}{n}\right)\right]
    \end{displaymath}
    is a type of \Emph{smoothing objective} consisting of
    two terms: $n\left[1+\log\left(\frac{2\pi
          r^2(\bm\theta,\bm\beta)}{n}\right)\right]$, which measures
    fidelity to the data, and $\log(|\bm L_\theta|^2)$, which measures
    the complexity of the model.
  \item For models with a simple structure for the random effects (the
    matrices $\bm\Sigma_\theta$ and $\bm\Lambda_\theta$ are block
    diagonal consisting of a large number of small blocks) a further
    enhancement is to use \Emph{adaptive Gauss-Hermite quadrature},
    requiring values of the RSS at several points near $\tilde{\bm
      u}_{\theta,\beta}$
  \item Note that the modifier \Emph{adaptive}, meaning evaluating at
    the conditional mode, is important.  Gauss-Hermite quadrature
    without first determining the conditional mode is not a good idea.
  \end{itemize}
\end{frame}

\section[Comparing methods]{Comparing estimation methods}

\begin{frame}
  \frametitle{Consequences for comparisons of methods}
  \begin{itemize}
  \item We should distinguish between an algorithm, which is a sort of
    a black box, and a criterion, such as maximizing the likelihood
    (or, equivalently, minimizing the deviance.
  \item The criterion is based on the statistical model and exists
    outside of any particular implementation or computing hardware.
    It is part of the theory, which has a long shelf life.
  \item A particular approximation, algorithm and implementation has a
    short shelf life.
  \item I claim it does not make sense to regard the FO, FOI, $\dots$
    methods as producing well-defined types of ``estimates'' in the
    same sense that maximum likelihood estimates, or maximum \emph{a
      posteriori} estimates are defined.
  \item If you use a criterion to define an estimation method then
    implementations should be compared on the basis of that criterion,
    not on something like mean squared error.

  \end{itemize}
\end{frame}

\section[Fitting NLMMs]{Fitting NLMMs in lme4 and lme4a}
\label{sec:fitting}

\begin{frame}[fragile]
  \frametitle{Specifying the nonlinear model function}
  \begin{itemize}
  \item We must specify the nonlinear model function and the linear
    predictor expression in the model formula for an NLMM.  We do this
    with a \Emph{3-part formula} expression.
  \item At present nonlinear model function must return both the
    conditional mean and the gradient expression (derivative of the
    conditional mean with respect to the nonlinear model parameters).
    It is helpful to use the \code{deriv()} function to symbolically
    differentiate the model function.
  \item Some common models have been encapsulated as ``self-starting''
    nonlinear regression models.  For example, the first-order
    pharmacokinetic model used for the \code{Theoph} data is available
    as \code{SSfol}.  Run \code{example(SSfol)} to see what is meant.
  \end{itemize}
\end{frame}
\begin{frame}
  \frametitle{Specifying the mixed-effects formula}
  \begin{itemize}
  \item The mixed-effects formula for an \code{nlmer} model has a
    similar form to that for \code{lmer} or \code{glmer} but with new
    constraints.
  \item In an NLMM all of the fixed-effects parameters and all of the
    random effects are with respect to the nonlinear model parameters,
    which are \code{lKe}, \code{lKa} and \code{lCl} in this case.

  \item For the purpose of specifying the model, these names are
    defined as indicator variables.
  \item In the \code{lme4} package, the default fixed-effects
    expression is \code{0 + lKe + lKa + lCl}, indicating that the
    intercept term is suppressed and that there is a single fixed
    effect for each nonlinear model parameter.  In \code{lme4a} this
    must be specified explicitly (although that may change).

  \item Random effects must also be specified with respect to the
    nonlinear model parameters.  In the \code{lme4a} version terms
    look like \code{(O + lKe|Subject)}.
  \end{itemize}
\end{frame}
\begin{frame}
  \frametitle{Model building}
  \begin{itemize}
  \item Even more so that for GLMMs and NLMMs, it is important to
    start with simple specification of the random effects.  Expecting
    to estimate many variance-covariance parameters for random effects
    filtered through a nonlinear model function is unrealistic.
  \item Use \code{verbose=TRUE} in all but the simplest cases.
  \item In \code{lme4a} the verbose option shows two sets of
    iterations, one over $\bm\theta$ only and one over $\bm\theta$ and
    $\bm\beta$.
  \item You can suppress the second optimization, which often does
    little to lower the deviance, by setting \code{nAGQ=0}.
  \item We will begin with independent scalar random effects for each
    nonlinear model parameter.
  \end{itemize}
\end{frame}
\begin{frame}[fragile]
  \frametitle{Initial fit}
<<nm1>>=
Th.start <- c(lKe = -2.5, lKa = 0.5, lCl = -3)
nm1 <- nlmer(conc ~ SSfol(Dose, Time,lKe, lKa, lCl) ~
             0+lKe+lKa+lCl+(0+lKe|Subject)+(0+lKa|Subject)
             +(0+lCl|Subject), nAGQ=0, Theoph,
             start = Th.start, verbose=TRUE)
@   
\end{frame}
\begin{frame}[fragile]
  \frametitle{Results of initial fit}
<<nm1out,echo=FALSE>>=
print(nm1, corr=FALSE)
@   
\end{frame}
\begin{frame}[fragile]
  \frametitle{Reducing the model}
The variance of the random effect for \code{lK3} is essentially zero. Eliminate it.
<<nm2>>=
nm2 <- nlmer(conc ~ SSfol(Dose, Time,lKe, lKa, lCl) ~
             0+lKe+lKa+lCl+(0+lKa|Subject)
             +(0+lCl|Subject), Theoph, nAGQ=0,
             start = Th.start, verbose=TRUE)
@   
\end{frame}
\begin{frame}[fragile]
  \frametitle{Allowing within-subject correlation of random effects}
Now allow for possible correlation of these random effects
<<nm3>>=
nm3 <- nlmer(conc ~ SSfol(Dose, Time,lKe, lKa, lCl) ~
             0+lKe+lKa+lCl+(0+lKa+lCl|Subject),
             Theoph, start = Th.start, verbose=TRUE)
@   
\end{frame}
\begin{frame}[fragile]
  \frametitle{Model nm3}
<<nm3show,echo=FALSE>>=
print(nm3, corr=FALSE)
@   
\end{frame}
\begin{frame}[fragile]
  \frametitle{Anova comparison}
<<anovanm2nm3>>=
anova(nm2,nm3)
@   
\begin{itemize}
\item This comparison is not quite fair because \code{nm2} was fit
  with \code{nAGQ=0} and \code{nm3} allowed both phases of the
  optimization.
\item But we already conclude that the more complex model \code{nm3}
  is not a significantly better fit.
\end{itemize}
\end{frame}
\begin{frame}
  \frametitle{Summary}
  \begin{itemize}
  \item The theory of NLMMs follows fairly closely the theory of LMMs
  \item Model specification is more complex because of an additional
    level of parameters to specify.
  \item The progress of the iterations should be carefully monitored.
    Often variance component estimates that are very close to zero but
    not exactly zero are provided.
  \item There is a tendency to incorporate too much complexity in such
    models. As Einstein said, ``A scientific theory should be as
    simple as possible, but no simpler.''
  \end{itemize}
\end{frame}
\end{document}