{\usepackage[text={6.2in,9in},centering]{geometry}} \mode{\usetheme{Boadilla}\usecolortheme{seahorse}\usecolortheme{rose}} \usepackage{SweaveSlides} \title[Scales for D-optimal design]{Design for nonlinear mixed-effects\\Are variances a reasonable scale?} \mode{\setkeys{Gin}{width=\textwidth}} \mode

{\setkeys{Gin}{width=0.8\textwidth}} \newcommand{\bLt}{\ensuremath{\bm\Lambda_\theta}} \begin{document} \mode

{\maketitle\tableofcontents} \mode{\frame{\titlepage}} \mode{\frame{\frametitle{Outline}\tableofcontents[pausesections,hideallsubsections]}} \section[Overview]{Overview} \begin{frame} \frametitle{D-optimal experimental design for fixed-effects models} \begin{itemize} \item The purpose of D-optimal experimental design is to minimize the volume of confidence regions or likelihood contours or HPD regions on the parameters. \item For simple cases (e.g. linear models with no random effects) the choice of parameters does not affect the design. In some ways the only parameters that make sense are the coefficients of the linear predictor and these are all equivalent up to linear transformation. \item For a nonlinear model the choice of parameters is less obvious. Nonlinear transformations of parameters can produce dramatically better or worse linear approximations. In terms of likelihood contours or H.P.D. regions the ideal shape is elliptical (i.e. a locally quadratic deviance function) but the actual shape can be quite different. \end{itemize} \end{frame} \begin{frame} \frametitle{D-optimal design for mixed-effects models} \begin{itemize} \item For a linear mixed-effects model the choice of scale of the variance components affects the shape of deviance or posterior density contours. \item For a nonlinear mixed-effects model, both the scale of the variance components and the choice of model parameters affect the shape of such contours. \item These distorsions of shape are more dramatic when there are fewer observations per group (i.e. per \code{Subject} or whatever is the grouping factor). But that is exactly the situation we are trying to achieve. \end{itemize} \end{frame} \section{Profiling nonlinear regression models} \label{sec:nonlinear} \begin{frame}[fragile] \frametitle{Profiling nonlinear regression models} \begin{itemize} \item This is a very brief example of profiling nonlinear regression models with a change of parameters. \item Take data from a single subject in the \code{Theoph} data set in \code{R} \end{itemize} \includegraphics{figs/profiling-Theophdat} \end{frame} \begin{frame}[fragile] \frametitle{My initial naive fit} \begin{Schunk} \begin{Sinput} > Theo.1 <- droplevels(subset(Theoph, Subject==1)) > summary(fm1 <- nls(conc ~ SSfol(Dose, Time, lKe, lKa, lCl), Theo.1), corr=TRUE) \end{Sinput} \begin{Soutput} Formula: conc ~ SSfol(Dose, Time, lKe, lKa, lCl) Parameters: Estimate Std. Error t value Pr(>|t|) lKe -2.9196 0.1709 -17.085 1.40e-07 lKa 0.5752 0.1728 3.328 0.0104 lCl -3.9159 0.1273 -30.768 1.35e-09 Residual standard error: 0.732 on 8 degrees of freedom Correlation of Parameter Estimates: lKe lKa lKa -0.56 lCl 0.96 -0.43 Number of iterations to convergence: 8 Achieved convergence tolerance: 4.907e-06 \end{Soutput} \end{Schunk} \end{frame} \begin{frame}[fragile] \frametitle{Following a suggestion from France Mentr\'{e}} \begin{Schunk} \begin{Sinput} > oral1cptSdlkalVlCl <- PKmod("oral", "sd", list(ka ~ exp(lka), k ~ exp(lCl)/V, V ~ exp(lV))) > summary(gm1 <- nls(conc ~ oral1cptSdlkalVlCl(Dose, Time, lV, lka, lCl), Theo.1, start=c(lV=-1, lka=0.5, lCl=-4)), corr=TRUE) \end{Sinput} \begin{Soutput} Formula: conc ~ oral1cptSdlkalVlCl(Dose, Time, lV, lka, lCl) Parameters: Estimate Std. Error t value Pr(>|t|) lV -0.99624 0.06022 -16.543 1.80e-07 lka 0.57516 0.17282 3.328 0.0104 lCl -3.91586 0.12727 -30.768 1.35e-09 Residual standard error: 0.732 on 8 degrees of freedom Correlation of Parameter Estimates: lV lka lka 0.68 lCl -0.61 -0.43 Number of iterations to convergence: 9 Achieved convergence tolerance: 4.684e-06 \end{Soutput} \end{Schunk} \end{frame} \begin{frame}[fragile] \frametitle{Contours based on profiling the objective, original} \includegraphics{figs/profiling-fm1splom} \end{frame} \begin{frame}[fragile] \frametitle{Contours based on profiling the objective, revised formulation} \includegraphics{figs/profiling-gm1splom} \end{frame} \begin{frame} \frametitle{Estimates based on optimizing a criterion} \begin{itemize} \item Maximum-likelihood estimators are an example of estimators defined as the values that optimize a criterion -- maximizing the log-likelihood or, equivalently, minimizing the deviance (negative twice the log-likelihood). \item Deriving the distribution of such an estimator can be difficult (which is why we fall back on asymptotic properties) but, for a given data set and model, we can assess the sensitivity of the objective (e.g. the deviance) to the values of the parameters. \item We can do this systematically by evaluating one-dimensional ``profiles'' of the objective, through conditional optimization of the objective. \end{itemize} \end{frame} \begin{frame} \frametitle{Profiling the objective} \begin{itemize} \item Profiling is based on conditional optimization of the objective, fixing one or more parameters at particular values and optimizing over the rest. \item We will concentrate on one-dimensional profiles of the deviance for mixed-effects models but the technique can be used more generally. \item We write the deviance as $d(\bm\phi|\bm y)$ where $\bm\phi$ is the parameter vector of length $p$ and $\bm y$ is the vector of observed responses. Write the individual components of $\bm\phi$ as $\phi_k,k=1,\dots,p$ and the complement of $\phi_i$ as $\bm\phi_{-i}$. \item The profile deviance is $\tilde{d}_i(\phi_i)=\min_{\bm\phi_{-i}}d((\phi_i,\bm\phi_{-i})|\bm y)$. The values of the other parameters at the optimum form the \emph{profile traces} \item If estimates and standard errors are an adequate summary then the deviance should be a quadratic function of $\bm\phi$, i.e.{} $\tilde{d}_i(\phi_i)$ should be a quadratic centered at $\hat{\phi}_i$ and the profile traces should be straight. \end{itemize} \end{frame} \section{A Simple Example} \begin{frame}[fragile] \frametitle{A Simple Example: the Dyestuff data} The \code{Dyestuff} data in the \pkg{lme4} package for \proglang{R} are from the the classic book \Emph{Statistical Methods in Research and Production}, edited by O.L. Davies and first published in 1947. \mode

{Figure~\ref{fig:Dyestuffplot} shows these data} \begin{figure}[tb] \centering \includegraphics{figs/profiling-Dyestuffplot} \mode

{\caption{The \code{Dyestuff} data from the \pkg{lme4} package for \proglang{R}.}\label{fig:Dyestuffplot}} \end{figure} The line joins the mean yields of the six batches, which have been reordered by increasing mean yield. \end{frame} \begin{frame} \frametitle{The effect of the batches} \begin{itemize} \item The particular batches observed are just a selection of the possible batches and are entirely used up during the course of the experiment. \item It is not particularly important to estimate and compare yields from these batches. Instead we wish to estimate the variability in yields due to batch-to-batch variability. \item The \code{Batch} factor will be used in \Emph{random-effects} terms in models that we fit. \item In the ``subscript fest'' notation such a model is \begin{displaymath} y_{i,j}=\mu + b_i+\epsilon_{i.j},\quad i=1,\dots,6;\: j=1,\dots,5 \end{displaymath} with $\epsilon_{i,j}\sim\mathcal{N}(0,\sigma^2)$ and $b_i\sim\mathcal{N}(0,\sigma_1^2)$. \item We obtain the maximum-likelihood estimates for such a model using \code{lmer} with the optional argument, \code{REML=FALSE}. \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{Fitted model} \begin{Schunk} \begin{Sinput} > (fm1 <- lmer(Yield ~ 1 + (1|Batch), Dyestuff, REML=FALSE)) \end{Sinput} \begin{Soutput} Linear mixed model fit by maximum likelihood ['lmerMod'] Formula: Yield ~ 1 + (1 | Batch) Data: Dyestuff AIC BIC logLik deviance 333.3271 337.5307 -163.6635 327.3271 Random effects: Groups Name Variance Std.Dev. Batch (Intercept) 1388 37.26 Residual 2451 49.51 Number of obs: 30, groups: Batch, 6 Fixed effects: Estimate Std. Error t value (Intercept) 1527.50 17.69 86.33 \end{Soutput} \end{Schunk} \end{frame} \section{Profiling the fitted model} \begin{frame}[fragile] \frametitle{Profiling the fitted model} \begin{Schunk} \begin{Sinput} > head(pr1 <- profile(fm1)) \end{Sinput} \end{Schunk} \begin{Schunk} \begin{Soutput} .zeta .sig01 .sigma (Intercept) .par 1 -2.3243980 0.000000 61.96437 1527.5 .sig01 2 -2.2270119 6.315107 60.74307 1527.5 .sig01 3 -1.9527642 12.317030 57.71369 1527.5 .sig01 4 -1.5986116 17.365631 54.69985 1527.5 .sig01 5 -1.1872929 22.297821 52.32154 1527.5 .sig01 6 -0.7326184 27.624184 50.72564 1527.5 .sig01 \end{Soutput} \end{Schunk} $\dots$ \begin{Schunk} \begin{Soutput} .zeta .sig01 .sigma (Intercept) .par 55 1.950491 55.23929 49.5101 1568.280 (Intercept) 56 2.305893 63.77264 49.5101 1579.255 (Intercept) 57 2.653119 74.71123 49.5101 1592.257 (Intercept) 58 2.993207 88.72455 49.5101 1608.022 (Intercept) 59 3.327036 106.75187 49.5101 1627.538 (Intercept) 60 3.655342 130.10234 49.5101 1652.153 (Intercept) \end{Soutput} \end{Schunk} \end{frame} \begin{frame} \frametitle{Reconstructing the profiled deviance} In \code{pr1} the profiled deviance, $\tilde{d}_i(\phi_i)$ is expressed on the \emph{signed square root} scale \begin{displaymath} \zeta_i(\phi_i)=\sgn(\phi_i-\hat{\phi_i})\sqrt{\tilde{d}_i(\phi_i)-d(\widehat{\bm\phi}|\bm y)} \end{displaymath} In the original scale, $\tilde{d}_i(\phi_i)$, the profiles are \mode

{as shown in Figure~\ref{fig:pr1dev}} \begin{figure}[tb] \centering \includegraphics{figs/profiling-pr1dev} \mode

{\caption{Profiled deviance for each of the parameters in fitted model \code{fm1}}\label{fig:pr1dev}} \end{figure} \end{frame} \begin{frame} \frametitle{After applying the square root} On the scale of $\sqrt{\tilde{d}_i(\phi_i)-d(\widehat{\bm\phi}|\bm y)}$ the profiles are \mode

{as shown in Figure~\ref{fig:pr1abs}} \begin{figure}[tb] \centering \includegraphics{figs/profiling-pr1abs} \mode

{\caption{Profiled deviance for each of the parameters in fitted model \code{fm1}}\label{fig:pr1abs}} \end{figure} We have added intervals corresponding to 50\%, 80\%, 90\%, 95\% and 99\% confidence intervals derived from the profiled deviance. \end{frame} \begin{frame} \frametitle{And, finally, on the $\zeta$ scale} \mode

{the profiles are as in Figure~\ref{fig:pr1zeta}} \begin{figure}[tb] \centering \includegraphics{figs/profiling-pr1zeta} \mode

{\caption{Profiled deviance for each of the parameters in fitted model \code{fm1}}\label{fig:pr1zeta}} \end{figure} The intervals are created by ``inverting'' likelihood ratio tests on particular values of the parameter. \end{frame} \section{Representing profiles as densities} \begin{frame} \frametitle{Representing profiles as densities} \begin{itemize} \item A univariate profile $\zeta$ plot is read like a normal probability plot \begin{itemize} \item a sigmoidal (elongated ``S''-shaped) pattern like that for the \code{(Intercept)} parameter indicates overdispersion relative to the normal distribution. \item a bending pattern, usually flattening to the right of the estimate, indicates skewness of the estimator and warns us that the confidence intervals will be asymmetric \item a straight line indicates that the confidence intervals based on the quantiles of the standard normal distribution are suitable \end{itemize} \item If we map the $\zeta$ scale through the cdf, $\Phi$, for the standard normal, $\mathcal{N}(0,1)$, we can derive a cdf and a density for a distribution that would produce this profile. \end{itemize} \end{frame} \begin{frame}[fragile] \mode{\frametitle{Profiles for parameters in \code{fm1} as densities}} \mode

{The profiles for the parameters in model \code{fm1} are shown as densities in Fig.~\ref{fig:pr1density}} \begin{figure}[tb] \centering \includegraphics{figs/profiling-pr1density} \mode

{\caption{Profile densities for model \code{fm1}}\label{fig:pr1density}} \end{figure} \end{frame} \begin{frame}[fragile] \mode{\frametitle{Profile $\zeta$ plots on the scale of the variance components}} Usually the variability estimates in a mixed-effects model are quoted on the scale of the ``variance components'', $\sigma_1^2$ and $\sigma^2$, not the standard deviations as we have shown. On the variance scale the profiles are \mode

{as shown in Fig.~\ref{fig:varianceProf}} \begin{figure}[tb] \centering \includegraphics{figs/profiling-varianceprof} \mode

{\caption{Profile $\zeta$ plots for the variance components in model \code{fm1ML}}\label{fig:varianceProf}} \end{figure} \end{frame} \begin{frame}[fragile] \frametitle{Densities of variance components} \begin{figure}[tb] \centering \includegraphics{figs/profiling-pr1vardensity} \mode

{\caption{Profile densities of variance components in model \code{fm1}}\label{fig:pr1vardensity}} \end{figure} \end{frame} \section{Practical implications} \begin{frame} \frametitle{Some practical implications} \begin{itemize} \item We have been using maximum likelihood estimates. For linear mixed-effects models the REML estimates are often preferred because they are assumed to be less biased. (Many people assume they are unbiased but, except in certain special cases, they're not.) \item But bias is a property of the expected value of the estimator. So bias of a variance estimator relates to the mean of one of those badly skewed distributions. Why should we use the mean? \item Similarly, it is common in simulation studies to compare estimators or computational methods based on mean squared error. That's not a meaningful criterion for skewed distributions of estimators. \end{itemize} \end{frame} \begin{frame} \frametitle{A more reasonable scale} Distributions of the estimators are closer to being symmetric on the scale of $\log(\sigma)$ and $\log(\sigma_1)$ (or, equivalently, $\log(\sigma^2)$ and $\log(\sigma_1^2)$) except when 0 is in the range of reasonable values, \mode

{as shown in Fig.~\ref{fig:logsigProf}} \begin{figure}[tb] \centering \includegraphics{figs/profiling-logsigProf} \mode

{\caption{Profile $\zeta$ plots for the logarithms of the variance components in model \code{fm1}}\label{fig:logsigProf}} \end{figure} \end{frame} \begin{frame} \mode{\frametitle{Densities on the logarithm scale}} \mode

{The profile densities on the scale of the logarithms of the variance components are whown in Fig.~\ref{fig:logsigpr1density}} \begin{figure}[tb] \centering \includegraphics{figs/profiling-logsigpr1density} \mode

{\caption{Profile densities for model \code{fm1} using the logarithm scale for variance components}\label{fig:logsigpr1density}} \end{figure} \end{frame} \section{Profile pairs} \begin{frame}\frametitle{Profile pairs plots} \begin{itemize} \item The information from the profile can be used to produce pairwise projections of likelihood contours. These correspond to pairwise joint confidence regions. \item Such a plot (next slide) can be somewhat confusing at first glance. \item Concentrate initially on the panels above the diagonal where the axes are the parameters in the scale shown in the diagonal panels. The contours correspond to 50\%, 80\%, 90\%, 95\% and 99\% pairwise confidence regions. \item The two lines in each panel are ``profile traces'', which are the conditional estimate of one parameter given a value of the other. \item The actual interpolation of the contours is performed on the $\zeta$ scale which is shown in the panels below the diagonal. \end{itemize} \end{frame} \begin{frame}[fragile] \mode{\frametitle{Profile pairs for model}} \mode

{Figure~\ref{fig:pr1pairs} is produced by} \begin{Schunk} \begin{Sinput} > splom(pr1) \end{Sinput} \end{Schunk} \begin{figure}[tb] \centering \includegraphics{figs/profiling-pr1pairs} \mode

{\caption{Profile pairs for model \code{fm1}}\label{fig:pr1pairs}} \end{figure} \end{frame} \begin{frame}[fragile] \mode{\frametitle{Profile pairs for the variance components}} \mode

{The corresponding figure for the variance components is shown in Fig.~\ref{fig:pr1varpairs}} \begin{figure}[tb] \centering \includegraphics{figs/profiling-pr1varpairs} \mode

{\caption{Profile pairs for variance components for fmodel \code{fm1} }\label{fig:pr1varpairs}} \end{figure} \end{frame} \end{document}