\documentclass{chapman} %%% copy Sweave.sty definitions %%% keeps `sweave' from adding `\usepackage{Sweave}': DO NOT REMOVE %\usepackage{Sweave} \RequirePackage[T1]{fontenc} \RequirePackage{graphicx,ae,fancyvrb} \IfFileExists{upquote.sty}{\RequirePackage{upquote}}{} \usepackage{relsize} \DefineVerbatimEnvironment{Sinput}{Verbatim}{} \DefineVerbatimEnvironment{Soutput}{Verbatim}{fontfamily=courier, fontshape=it, fontsize=\relsize{-1}} \DefineVerbatimEnvironment{Scode}{Verbatim}{} \newenvironment{Schunk}{}{} %%% environment for raw output \newcommand{\SchunkRaw}{\renewenvironment{Schunk}{}{} \DefineVerbatimEnvironment{Soutput}{Verbatim}{fontfamily=courier, fontshape=it, fontsize=\small} \rawSinput } %%% environment for labeled output \newcommand{\nextcaption}{} \newcommand{\SchunkLabel}{ \renewenvironment{Schunk}{\begin{figure}[ht] }{\caption{\nextcaption} \end{figure} } \DefineVerbatimEnvironment{Sinput}{Verbatim}{frame = topline} \DefineVerbatimEnvironment{Soutput}{Verbatim}{frame = bottomline, samepage = true, fontfamily=courier, fontshape=it, fontsize=\relsize{-1}} } %%% S code with line numbers \DefineVerbatimEnvironment{Sinput} {Verbatim} { %% numbers=left } \newcommand{\numberSinput}{ \DefineVerbatimEnvironment{Sinput}{Verbatim}{numbers=left} } \newcommand{\rawSinput}{ \DefineVerbatimEnvironment{Sinput}{Verbatim}{} } %%% R / System symbols \newcommand{\R}{\textsf{R}} \newcommand{\rR}{{R}} \renewcommand{\S}{\textsf{S}} \newcommand{\SPLUS}{\textsf{S-PLUS}} \newcommand{\rSPLUS}{{S-PLUS}} \newcommand{\SPSS}{\textsf{SPSS}} \newcommand{\EXCEL}{\textsf{Excel}} \newcommand{\ACCESS}{\textsf{Access}} \newcommand{\SQL}{\textsf{SQL}} %%\newcommand{\Rpackage}[1]{\hbox{\rm\textit{#1}}} %%\newcommand{\Robject}[1]{\hbox{\rm\texttt{#1}}} %%\newcommand{\Rclass}[1]{\hbox{\rm\textit{#1}}} %%\newcommand{\Rcmd}[1]{\hbox{\rm\texttt{#1}}} \newcommand{\Rpackage}[1]{\index{#1 package@\textit{#1} package}\textit{#1}} \newcommand{\Robject}[1]{\texttt{#1}} \newcommand{\Rclass}[1]{\index{#1 class@\textit{#1} class}\textit{#1}} \newcommand{\Rcmd}[1]{\index{#1 function@\texttt{#1} function}\texttt{#1}} \newcommand{\Roperator}[1]{\texttt{#1}} \newcommand{\Rarg}[1]{\texttt{#1}} \newcommand{\Rlevel}[1]{\texttt{#1}} %%% other symbols \newcommand{\file}[1]{\hbox{\rm\texttt{#1}}} %%\newcommand{\stress}[1]{\index{#1}\textit{#1}} \newcommand{\stress}[1]{\textit{#1}} \newcommand{\booktitle}[1]{`#1'} %%' %%% Math symbols \newcommand{\E}{\mathsf{E}} \newcommand{\Var}{\mathsf{Var}} \newcommand{\Cov}{\mathsf{Cov}} \newcommand{\Cor}{\mathsf{Cor}} \newcommand{\x}{\mathbf{x}} \newcommand{\y}{\mathbf{y}} \renewcommand{\a}{\mathbf{a}} \newcommand{\W}{\mathbf{W}} \newcommand{\C}{\mathbf{C}} \renewcommand{\H}{\mathbf{H}} \newcommand{\X}{\mathbf{X}} \newcommand{\B}{\mathbf{B}} \newcommand{\V}{\mathbf{V}} \newcommand{\I}{\mathbf{I}} \newcommand{\D}{\mathbf{D}} \newcommand{\bS}{\mathbf{S}} \newcommand{\N}{\mathcal{N}} \renewcommand{\P}{\mathsf{P}} \usepackage{amstext} %%% links \usepackage{hyperref} \hypersetup{% pdftitle = {A Handbook of Statistical Analyses Using R}, pdfsubject = {Book}, pdfauthor = {Brian S. Everitt and Torsten Hothorn}, colorlinks = {true}, linkcolor = {blue}, citecolor = {blue}, urlcolor = {red}, hyperindex = {true}, linktocpage = {true}, } %%% captions & tables %% : conflics with figure definition in chapman.cls %%\usepackage[format=hang,margin=10pt,labelfont=bf]{caption} %% \usepackage{longtable} \usepackage{rotating} %%% R symbol in chapter 1 \usepackage{wrapfig} %%% Bibliography \usepackage[round,comma]{natbib} \renewcommand{\refname}{References \addcontentsline{toc}{chapter}{References}} \citeindexfalse %%% texi2dvi complains that \newblock is undefined, hm... \def\newblock{\hskip .11em plus .33em minus .07em} %%% Example sections \newcounter{exercise}[chapter] \setcounter{exercise}{0} \newcommand{\exercise}{\item{\stepcounter{exercise} Ex. \arabic{chapter}.\arabic{exercise} }} %% URLs \newcommand{\curl}[1]{\begin{center} \url{#1} \end{center}} %%% for manual corrections %\renewcommand{\baselinestretch}{2} %%% plot sizes \setkeys{Gin}{width=0.95\textwidth} %%% color \usepackage{color} %%% hyphenations \hyphenation{drop-out} %%% new bidirectional quotes need \usepackage[utf8]{inputenc} \begin{document} %% Title page \title{A Handbook of Statistical Analyses Using \R} \author{Brian S. Everitt and Torsten Hothorn} \maketitle %%\VignetteIndexEntry{Chapter Logistic Regression and Generalised Linear Models} \setcounter{chapter}{5} \SweaveOpts{prefix.string=figures/HSAUR,eps=FALSE,keep.source=TRUE} <>= rm(list = ls()) s <- search()[-1] s <- s[-match(c("package:base", "package:stats", "package:graphics", "package:grDevices", "package:utils", "package:datasets", "package:methods", "Autoloads"), s)] if (length(s) > 0) sapply(s, detach, character.only = TRUE) if (!file.exists("tables")) dir.create("tables") if (!file.exists("figures")) dir.create("figures") set.seed(290875) options(prompt = "R> ", continue = "+ ", width = 63, # digits = 4, SweaveHooks = list(leftpar = function() par(mai = par("mai") * c(1, 1.05, 1, 1)))) HSAURpkg <- require("HSAUR") if (!HSAURpkg) stop("cannot load package ", sQuote("HSAUR")) rm(HSAURpkg) a <- Sys.setlocale("LC_ALL", "C") book <- TRUE refs <- cbind(c("AItR", "SI", "CI", "ANOVA", "MLR", "GLM", "DE", "RP", "SA", "ALDI", "ALDII", "MA", "PCA", "MDS", "CA"), 1:15) ch <- function(x, book = TRUE) { ch <- refs[which(refs[,1] == x),] if (book) { return(paste("Chapter~\\\\ref{", ch[1], "}", sep = "")) } else { return(paste("Chapter~\\\\ref{", ch[2], "}", sep = "")) } } @ \pagestyle{headings} \chapter[Logistic Regression and Generalised Linear Models]{Logistic Regression and Generalised Linear Models: Blood Screening, Women's Role in %' Society, \\ and Colonic Polyps \label{GLM}} \section{Introduction} \section{Logistic Regression and Generalised Linear Models} \section{Analysis Using \R{}} \subsection{ESR and Plasma Proteins} \begin{figure} \begin{center} <>= data("plasma", package = "HSAUR") layout(matrix(1:2, ncol = 2)) cdplot(ESR ~ fibrinogen, data = plasma) cdplot(ESR ~ globulin, data = plasma) @ \caption{Conditional density plots of the erythrocyte sedimentation rate (ESR) given fibrinogen and globulin. \label{GLM:plasma1}} \end{center} \end{figure} We can now fit a logistic regression model to the data using the \Rcmd{glm} function. We start with a model that includes only a single explanatory variable, \Robject{fibrinogen}. The code to fit the model is <>= plasma_glm_1 <- glm(ESR ~ fibrinogen, data = plasma, family = binomial()) @ The formula implicitly defines a parameter for the global mean (the intercept term) as discussed in Chapters~\ref{ANOVA} and \ref{MLR}. The distribution of the response is defined by the \Robject{family} argument, a binomial distribution in our case. %% \index{family argument@\Rcmd{family} argument} %% \index{Binomial distribution} (The default link function when the binomial family is requested is the logistic function.) \renewcommand{\nextcaption}{\R{} output of the \Robject{summary} method for the logistic regression model fitted to the \Robject{plasma} data. \label{GLM-plasma-summary-1}} \SchunkLabel <>= summary(plasma_glm_1) @ \SchunkRaw From the results in Figure~\ref{GLM-plasma-summary-1} we see that the regression coefficient for fibrinogen is significant at the $5\%$ level. An increase of one unit in this variable increases the log-odds in favour of an ESR value greater than $20$ by an estimated $\Sexpr{round(coef(plasma_glm_1)["fibrinogen"], 2)}$ with 95\% confidence interval <>= ci <- confint(plasma_glm_1)["fibrinogen",] @ <>= confint(plasma_glm_1, parm = "fibrinogen") @ <>= print(ci) @ These values are more helpful if converted to the corresponding values for the odds themselves by exponentiating the estimate <>= exp(coef(plasma_glm_1)["fibrinogen"]) @ and the confidence interval <>= ci <- exp(confint(plasma_glm_1, parm = "fibrinogen")) @ <>= exp(confint(plasma_glm_1, parm = "fibrinogen")) @ <>= print(ci) @ The confidence interval is very wide because there are few observations overall and very few where the ESR value is greater than $20$. Nevertheless it seems likely that increased values of fibrinogen lead to a greater probability of an ESR value greater than $20$. We can now fit a logistic regression model that includes both explanatory variables using the code <>= plasma_glm_2 <- glm(ESR ~ fibrinogen + globulin, data = plasma, family = binomial()) @ and the output of the \Rcmd{summary} method is shown in Figure \ref{GLM-plasma-summary-2}. \renewcommand{\nextcaption}{\R{} output of the \Robject{summary} method for the logistic regression model fitted to the \Robject{plasma} data. \label{GLM-plasma-summary-2}} \SchunkLabel <>= summary(plasma_glm_2) @ \SchunkRaw <>= plasma_anova <- anova(plasma_glm_1, plasma_glm_2, test = "Chisq") @ The coefficient for gamma globulin is not significantly different from zero. Subtracting the residual deviance of the second model from the corresponding value for the first model we get a value of $\Sexpr{round(plasma_anova$Deviance[2], 2)}$. Tested using a $\chi^2$-distribution with a single degree of freedom this is not significant at the 5\% level and so we conclude that gamma globulin is not associated with ESR level. In \R{}, the task of comparing the two nested models can be performed using the \Rcmd{anova} function <>= anova(plasma_glm_1, plasma_glm_2, test = "Chisq") @ Nevertheless we shall use the predicted values from the second model and plot them against the values of \stress{both} explanatory variables using a \stress{bubble plot} to illustrate the use of the \Rcmd{symbols} function. \index{Bubble plot} The estimated conditional probability of a ESR value larger $20$ for all observations can be computed, following formula (\ref{GLM:logitexp}), by <>= prob <- predict(plasma_glm_2, type = "response") @ and now we can assign a larger circle to observations with larger probability as shown in Figure~\ref{GLM:bubble}. The plot clearly shows the increasing probability of an ESR value above $20$ (larger circles) as the values of fibrinogen, and to a lesser extent, gamma globulin, increase. \begin{figure} \begin{center} <>= plot(globulin ~ fibrinogen, data = plasma, xlim = c(2, 6), ylim = c(25, 55), pch = ".") symbols(plasma$fibrinogen, plasma$globulin, circles = prob, add = TRUE) @ \caption{Bubble plot of fitted values for a logistic regression model fitted to the ESR data. \label{GLM:bubble}} \end{center} \end{figure} \subsection{Women's Role in Society} %' Originally the data in Table~\ref{GLM-womensrole-tab} would have been in a completely equivalent form to the data in Table~\ref{GLM-plasma-tab} data, but here the individual observations have been grouped into counts of numbers of agreements and disagreements for the two explanatory variables, \Robject{sex} and \Robject{education}. To fit a logistic regression model to such grouped data using the \Rcmd{glm} function we need to specify the number of agreements and disagreements as a two-column matrix on the left hand side of the model formula. We first fit a model that includes the two explanatory variables using the code <>= data("womensrole", package = "HSAUR") fm1 <- cbind(agree, disagree) ~ sex + education womensrole_glm_1 <- glm(fm1, data = womensrole, family = binomial()) @ \renewcommand{\nextcaption}{\R{} output of the \Robject{summary} method for the logistic regression model fitted to the \Robject{womensrole} data. \label{GLM-womensrole-summary-1}} \SchunkLabel <>= summary(womensrole_glm_1) @ \SchunkRaw From the \Rcmd{summary} output in Figure~\ref{GLM-womensrole-summary-1} it appears that education has a highly significant part to play in predicting whether a respondent will agree with the statement read to them, but the respondent's %' sex is apparently unimportant. As years of education increase the probability of agreeing with the statement declines. We now are going to construct a plot comparing the observed proportions of agreeing with those fitted by our fitted model. Because we will reuse this plot for another fitted object later on, we define a function which plots years of education against some fitted probabilities, e.g., <>= role.fitted1 <- predict(womensrole_glm_1, type = "response") @ and labels each observation with the person's sex: %%' <>= myplot <- function(role.fitted) { f <- womensrole$sex == "Female" plot(womensrole$education, role.fitted, type = "n", ylab = "Probability of agreeing", xlab = "Education", ylim = c(0,1)) lines(womensrole$education[!f], role.fitted[!f], lty = 1) lines(womensrole$education[f], role.fitted[f], lty = 2) lgtxt <- c("Fitted (Males)", "Fitted (Females)") legend("topright", lgtxt, lty = 1:2, bty = "n") y <- womensrole$agree / (womensrole$agree + womensrole$disagree) text(womensrole$education, y, ifelse(f, "\\VE", "\\MA"), family = "HersheySerif", cex = 1.25) } @ \begin{figure} \begin{center} <>= myplot(role.fitted1) @ \caption{Fitted (from \Robject{womensrole\_glm\_1}) and observed probabilities of agreeing for the \Robject{womensrole} data. \label{GLM-role1plot}} \end{center} \end{figure} The two curves for males and females in Figure~\ref{GLM-role1plot} are almost the same reflecting the non-significant value of the regression coefficient for sex in \Robject{womensrole\_glm\_1}. But the observed values plotted on Figure~\ref{GLM-role1plot} suggest that there might be an interaction of education and sex, a possibility that can be investigated by applying a further logistic regression model using \index{Interaction} <>= fm2 <- cbind(agree,disagree) ~ sex * education womensrole_glm_2 <- glm(fm2, data = womensrole, family = binomial()) @ The \Robject{sex} and \Robject{education} interaction term is seen to be highly significant, as can be seen from the \Rcmd{summary} output in Figure~\ref{GLM-womensrole-summary-2}. \renewcommand{\nextcaption}{\R{} output of the \Robject{summary} method for the logistic regression model fitted to the \Robject{womensrole} data. \label{GLM-womensrole-summary-2}} \SchunkLabel <>= summary(womensrole_glm_2) @ \SchunkRaw \begin{figure} \begin{center} <>= role.fitted2 <- predict(womensrole_glm_2, type = "response") myplot(role.fitted2) @ \caption{Fitted (from \Robject{womensrole\_glm\_2}) and observed probabilities of agreeing for the \Robject{womensrole} data. \label{GLM-role2plot}} \end{center} \end{figure} We can obtain a plot of deviance residuals plotted against fitted values using the following code above Figure~\ref{GLM:devplot}. \begin{figure} \begin{center} <>= res <- residuals(womensrole_glm_2, type = "deviance") plot(predict(womensrole_glm_2), res, xlab="Fitted values", ylab = "Residuals", ylim = max(abs(res)) * c(-1,1)) abline(h = 0, lty = 2) @ \caption{Plot of deviance residuals from logistic regression model fitted to the \Robject{womensrole} data. \label{GLM:devplot}} \end{center} \end{figure} The residuals fall into a horizontal band between $-2$ and $2$. This pattern does not suggest a poor fit for any particular observation or subset of observations. \subsection{Colonic Polyps} The data on colonic polyps in Table~\ref{GLM-polyps-tab} involves \stress{count} data. We could try to model this using multiple regression but there are two problems. The first is that a response that is a count can only take positive values, and secondly such a variable is unlikely to have a normal distribution. Instead we will apply a GLM with a log link function, ensuring that fitted values are positive, and a Poisson error distribution, i.e., \index{Poisson error distribution} \index{Poisson regression} \begin{eqnarray*} \P(y) = \frac{e^{-\lambda}\lambda^y}{y!}. \end{eqnarray*} This type of GLM is often known as \stress{Poisson regression}. We can apply the model using <>= data("polyps", package = "HSAUR") polyps_glm_1 <- glm(number ~ treat + age, data = polyps, family = poisson()) @ (The default link function when the Poisson family is requested is the log function.) \renewcommand{\nextcaption}{\R{} output of the \Robject{summary} method for the Poisson regression model fitted to the \Robject{polyps} data. \label{GLM-polyps-summary-1}} \SchunkLabel <>= summary(polyps_glm_1) @ \SchunkRaw We can deal with overdispersion by using a procedure known as \stress{quasi-likelihood}, \index{Quasi-likelihood} which allows the estimation of model parameters without fully knowing the error distribution of the response variable. \cite{HSAUR:McCullaghNelder1989} give full details of the quasi-likelihood approach. In many respects it simply allows for the estimation of $\phi$ from the data rather than defining it to be unity for the binomial and Poisson distributions. We can apply quasi-likelihood estimation to the colonic polyps data using the following \R{} code <>= polyps_glm_2 <- glm(number ~ treat + age, data = polyps, family = quasipoisson()) summary(polyps_glm_2) @ The regression coefficients for both explanatory variables remain significant but their estimated standard errors are now much greater than the values given in Figure~\ref{GLM-polyps-summary-1}. A possible reason for overdispersion in these data is that polyps do not occur independently of one another, but instead may `cluster' together. %' \index{Overdispersion|)} \bibliographystyle{LaTeXBibTeX/refstyle} \bibliography{LaTeXBibTeX/HSAUR} \end{document}