\documentclass[nojss]{jss}
\usepackage{amsfonts,bm,amsmath,amssymb}

%%\usepackage{Sweave} %% already provided by jss.cls
%%%\VignetteIndexEntry{Applications of finite mixtures of regression models}
%%\VignetteDepends{flexmix}
%%\VignetteKeywords{R, finite mixture model, generalized linear model, latent class regression}
%%\VignettePackage{flexmix}

\title{Applications of finite mixtures of regression models}
<<echo=false, hide=true>>=
library("stats")
library("graphics")
library("flexmix")
@ 
\author{Bettina Gr{\"u}n\\
  Wirtschaftsuniversit{\"a}t Wien \And
  Friedrich Leisch\\
  Universit\"at f\"ur Bodenkultur Wien} 

\Plainauthor{Bettina Gr{\"u}n, Friedrich Leisch}

\Address{
  Bettina Gr\"un\\
  Institute for Statistics and Mathematics\\
  Wirtschaftsuniversit{\"a}t Wien\\
  Welthandelsplatz 1\\
  1020 Wien, Austria\\
  E-mail: \email{Bettina.Gruen@R-project.org}\\

  Friedrich Leisch\\
  Institut f\"ur Angewandte Statistik und EDV\\
  Universit\"at f\"ur Bodenkultur Wien\\
  Peter Jordan Stra\ss{}e 82\\
  1190 Wien, Austria\\
  E-mail: \email{Friedrich.Leisch@boku.ac.at}
}

\Abstract{
  Package \pkg{flexmix} provides functionality for fitting finite
  mixtures of regression models. The available model class includes
  generalized linear models with varying and fixed effects for the
  component specific models and multinomial logit models for the
  concomitant variable models. This model class includes random
  intercept models where the random part is modelled by a finite
  mixture instead of a-priori selecting a suitable distribution.  

  The application of the package is illustrated on various datasets
  which have been previously used in the literature to fit finite
  mixtures of Gaussian, binomial or Poisson regression models. The
  \proglang{R} commands are given to fit the proposed models and
  additional insights are gained by visualizing the data and the
  fitted models as well as by fitting slightly modified models.
}

\Keywords{\proglang{R}, finite mixture models, generalized linear models, concomitant variables}
\Plainkeywords{R, finite mixture models, generalized linear models, concomitant variables}
%%-------------------------------------------------------------------------
%%-------------------------------------------------------------------------
\begin{document}
\SweaveOpts{engine=R, echo=true, height=5, width=8, eps=FALSE, keep.source=TRUE}
\setkeys{Gin}{width=0.8\textwidth}
<<start, echo=false, results=hide>>=
options(width=70, prompt = "R> ", continue = "+  ", useFancyQuotes = FALSE)
suppressWarnings(RNGversion("3.5.0"))
set.seed(1802)
library("lattice")
ltheme <- canonical.theme("postscript", FALSE)
lattice.options(default.theme=ltheme)
@ 
%%-------------------------------------------------------------------------
%%-------------------------------------------------------------------------
\section{Introduction}                         
Package \pkg{flexmix} provides infrastructure for flexible fitting of
finite mixtures models. The design principles of the package allow
easy extensibility and rapid prototyping. In addition, the main focus
of the available functionality is on fitting finite mixtures of
regression models, as other packages in \proglang{R} exist which have
specialized functionality for model-based clustering, such as
e.g.~\pkg{mclust} \citep{flexmix:Fraley+Raftery:2002a} for finite
mixtures of Gaussian distributions.

\cite{flexmix:Leisch:2004a} gives a general introduction into the
package outlining the main implementational principles and
illustrating the use of the package. The paper is also contained as a
vignette in the package. An example for fitting mixtures of Gaussian
regression models is given in \cite{flexmix:Gruen+Leisch:2006}. This
paper focuses on examples of finite mixtures of binomial logit and
Poisson regression models.  Several datasets which have been
previously used in the literature to demonstrate the use of finite
mixtures of regression models have been selected to illustrate the
application of the package.

The model class covered are finite mixtures of generalized linear
model with focus on binomial logit and Poisson regressions. The
regression coefficients as well as the dispersion parameters of the
component specific models are assumed to vary for all components, vary
between groups of components, i.e.~to have a nesting, or to be fixed
over all components. In addition it is possible to specify concomitant
variable models in order to be able to characterize the components.
Random intercept models are a special case of finite mixtures with
varying and fixed effects as fixed effects are assumed for the
coefficients of all covariates and varying effects for the intercept.
These models are often used to capture overdispersion in the data
which can occur for example if important covariates are omitted in the
regression.  It is then assumed that the influence of these covariates
can be captured by allowing a random distribution for the intercept.

This illustration does not only show how the package \pkg{flexmix} can
be used for fitting finite mixtures of regression models but also
indicates the advantages of using an extension package of an
environment for statistical computing and graphics instead of a
stand-alone package as available visualization techniques can be used
for inspecting the data and the fitted models. In addition users
already familiar with \proglang{R} and its formula interface should find
the model specification and a lot of commands for exploring the fitted
model intuitive.
%%-------------------------------------------------------------------------
%%-------------------------------------------------------------------------
\section{Model specification}                         
Finite mixtures of Gaussian regressions with concomitant variable
models are given by:
\begin{align*}
  H(y\,|\,\bm{x}, \bm{w}, \bm{\Theta}) &= \sum_{s = 1}^S \pi_s(\bm{w},
  \bm{\alpha}) \textrm{N}(y\,|\, \mu_s(\bm{x}), \sigma^2_s),
\end{align*}
where $\textrm{N}(\cdot\,|\, \mu_s(\bm{x}), \sigma^2_s)$ is the
Gaussian distribution with mean $\mu_s(\bm{x}) = \bm{x}' \bm{\beta}^s$
and variance $\sigma^2_s$. $\Theta$ denotes the vector of all
parameters of the mixture distribution and the dependent variables are
$y$, the independent $\bm{x}$ and the concomitant $\bm{w}$. 

Finite mixtures of binomial regressions with concomitant variable
models are given by:
\begin{align*}
  H(y\,|\,T, \bm{x}, \bm{w}, \bm{\Theta}) &= \sum_{s = 1}^S \pi_s(\bm{w},
  \bm{\alpha}) \textrm{Bi}(y\,|\,T, \theta_s(\bm{x})),
\end{align*}
where $\textrm{Bi}(\cdot\,|\,T, \theta_s(\bm{x}))$ is the binomial
distribution with number of trials equal to $T$ and success
probability $\theta_s(\bm{x}) \in (0,1)$ given by
$\textrm{logit}(\theta_s(\bm{x})) = \bm{x}' \bm{\beta}^s$. 

Finite mixtures of Poisson regressions are given by:
\begin{align*}
  H(y \,|\, \bm{x}, \bm{w}, \bm{\Theta}) &= \sum_{s = 1}^S \pi_s(\bm{w},
  \bm{\alpha}) \textrm{Poi} (y \,|\, \lambda_s(\bm{x})),
\end{align*}
where $\textrm{Poi}(\cdot\,|\,\lambda_s(\bm{x}))$ denotes the Poisson
distribution and $\log(\lambda_s(\bm{x})) = \bm{x}'\bm{\beta}^s$.

For all these mixture distributions the coefficients are split into
three different groups depending on if fixed, nested or varying
effects are specified:
\begin{align*}
  \bm{\beta}^s &=
  (\bm{\beta}_1, \bm{\beta}^{c(s)}_{2}, \bm{\beta}^{s}_3)
\end{align*}
where the first group represents the fixed, the second the nested and
the third the varying effects. For the nested effects a partition
$\mathcal{C} = \{c_s \,|\, s = 1,\ldots S\}$ of the $S$ components is
determined where $c_s = \{s^* = 1,\ldots,S \,|\, c(s^*) = c(s)\}$. A
similar splitting is possible for the variance of mixtures of Gaussian
regression models.

The function for maximum likelihood (ML) estimation with the
Expectation-Maximization (EM) algorithm is \code{flexmix()} which is
described in detail in \cite{flexmix:Leisch:2004a}. It takes as
arguments a specification of the component specific model and of the
concomitant variable model.  The component specific model with
varying, nested and fixed effects can be specified with the M-step
driver \code{FLXMRglmfix()} which has arguments \code{formula} for the
varying, \code{nested} for the nested and \code{fixed} for the fixed
effects.  \code{formula} and \code{fixed} take an argument of class
\code{"formula"}, whereas \code{nested} expects an object of class
\code{"FLXnested"} or a named list specifying the nested structure
with a component \code{k} which is a vector of the number of
components in each group of the partition and a component
\code{formula} which is a vector of formulas for each group of the
partition.  In addition there is an argument \code{family} which has
to be one of \code{gaussian}, \code{binomial}, \code{poisson} or
\code{Gamma} and determines the component specific distribution
function as well as an \code{offset} argument. The argument
\code{varFix} can be used to determine the structure of the dispersion
parameters.

If only varying effects are specified the M-step driver
\code{FLXMRglm()} can be used which only has an argument \code{formula}
for the varying effects and also a \code{family} and an \code{offset}
argument. This driver has the advantage that in the M-step the
weighted ML estimation is made separately for each component which
signifies that smaller model matrices are used. If a mixture model
with a lot of components $S$ is fitted to a large data set with $N$
observations and the model matrix used in the M-step of
\code{FLXMRglm()} has $N$ rows and $K$ columns, the model matrix used in
the M-step of \code{FLXMRglmfix()} has $S N$ rows and up to $S K$
columns.

In general the concomitant variable model is assumed to be a
multinomial logit model, i.e.~:
\begin{align*}
  \pi_s(\bm{w},\bm{\alpha}) &=
  \frac{e^{\bm{w}'\bm{\alpha}_s}}{\sum_{u = 1}^S
    e^{\bm{w}'\bm{\alpha}_u}} \quad \forall s,
\end{align*}
with $\bm{\alpha} = (\bm{\alpha}'_s)_{s=1,\ldots,S}$ and
$\bm{\alpha}_1 \equiv \bm{0}$. This model can be fitted in
\pkg{flexmix} with \code{FLXPmultinom()} which takes as argument
\code{formula} the formula specification of the multinomial logit
part. For fitting the function \code{nnet()} is used from package
\pkg{MASS} \citep{flexmix:Venables+Ripley:2002} with the independent
variables specified by the formula argument and the dependent
variables are given by the a-posteriori probability estimates.
%%-------------------------------------------------------------------------
%%-------------------------------------------------------------------------
\section[Using package flexmix]{Using package \pkg{flexmix}}                         
In the following datasets from different areas such as medicine,
biology and economics are used. There are three subsections: for
finite mixtures of Gaussian regressions, for finite mixtures of
binomial regression models and for finite mixtures of Poisson
regression models.
%%-------------------------------------------------------------------------
\subsection{Finite mixtures of Gaussian regressions}
This artificial dataset with 200 observations is given in
\cite{flexmix:Gruen+Leisch:2006}.  The data is generated from a
mixture of Gaussian regression models with three components. There is
an intercept with varying effects, an independent variable $x1$, which
is a numeric variable, with fixed effects and another independent
variable $x2$, which is a categorical variable with two levels, with
nested effects. The prior probabilities depend on a concomitant
variable $w$, which is also a categorical variable with two levels.
Fixed effects are also assumed for the variance. The data is
illustrated in Figure~\ref{fig:artificialData} and the true underlying
model is given by:
\begin{align*}
  H(y\,|\,(x1, x2), w, \bm{\Theta}) &= \sum_{s = 1}^S \pi_s(w,
  \bm{\alpha}) \textrm{N}(y\,|\, \mu_s, \sigma^2),
\end{align*}
with $\bm{\beta}^s = (\beta^s_{\textrm{Intercept}},
\beta^{c(s)}_{\textrm{x1}}, \beta_{\textrm{x2}})$. The nesting
signifies that $c(1) = c(2)$ and $\beta^{c(3)}_{\textrm{x1}} = 0$.

The mixture model is fitted by first loading the package and the
dataset and then specifying the component specific model. In a first
step a component specific model with only varying effects is
specified. Then the fitting function \code{flexmix()} is called
repeatedly using \code{stepFlexmix()}. Finally, we order the
components such that they are in ascending order with respect to the
coefficients of the variable \code{x1}.
<<NregFix>>=
set.seed(2807)
library("flexmix")
data("NregFix", package = "flexmix")
Model <- FLXMRglm(~ x2 + x1)
fittedModel <- stepFlexmix(y ~ 1, model = Model, nrep = 3, k = 3,
  data = NregFix, concomitant = FLXPmultinom(~ w))             
fittedModel <- relabel(fittedModel, "model", "x1")
summary(refit(fittedModel))
@ 

The estimated coefficients indicate that the components differ for the
intercept, but that they are not significantly different for the
coefficients of $x2$. For $x1$ the coefficient of the first component
is not significantly different from zero and the confidence intervals
for the other two components overlap. Therefore we fit a modified
model, which is equivalent to the true underlying model. The
previously fitted model is used for initializing the EM algorithm:
<<diffModel>>=
Model2 <- FLXMRglmfix(fixed = ~ x2, nested = list(k = c(1, 2), 
  formula = c(~ 0, ~ x1)), varFix = TRUE)
fittedModel2 <- flexmix(y ~ 1, model = Model2, 
  cluster = posterior(fittedModel), data = NregFix, 
  concomitant = FLXPmultinom(~ w))             
BIC(fittedModel)
BIC(fittedModel2)
@ 

The BIC suggests that the restricted model should be preferred.

\begin{figure}[tb]
\centering
\setkeys{Gin}{width=0.95\textwidth}
<<artificial-example, fig=true, echo=false, results=hide>>=
plotNregFix <- NregFix
plotNregFix$w <- factor(NregFix$w, levels = 0:1, labels = paste("w =", 0:1))
plotNregFix$x2 <- factor(NregFix$x2, levels = 0:1,
  labels = paste("x2 =", 0:1))
plotNregFix$class <- factor(NregFix$class, levels = 1:3, labels = paste("Class", 1:3))
print(xyplot(y ~ x1 | x2*w, groups = class, data = plotNregFix, cex = 0.6,
  auto.key = list(space="right"), layout = c(2,2)))
@ 

\setkeys{Gin}{width=0.8\textwidth}
\caption{Sample with 200 observations from the artificial example.}
\label{fig:artificialData}
\end{figure}

<<refit>>=
summary(refit(fittedModel2))
@ 

The coefficients are ordered such that the fixed coefficients are
first, the nested varying coefficients second and the varying
coefficients last.

%%-------------------------------------------------------------------------
\subsection{Finite mixtures of binomial logit regressions}
%%-------------------------------------------------------------------------
\subsubsection{Beta blockers}
The dataset is analyzed in \cite{flexmix:Aitkin:1999,
  flexmix:Aitkin:1999a} using a finite mixture of binomial regression
models. Furthermore, it is described in
\cite{flexmix:McLachlan+Peel:2000} on page 165. The dataset is from a
22-center clinical trial of beta-blockers for reducing mortality after
myocardial infarction. A two-level model is assumed to represent the
data, where centers are at the upper level and patients at the lower
level. The data is illustrated in Figure~\ref{fig:beta} and the model
is given by:
\begin{align*}
  H(\textrm{Deaths} \,|\, \textrm{Total}, \textrm{Treatment},
  \textrm{Center}, \bm{\Theta}) &= \sum_{s = 1}^S \pi_s \textrm{Bi}(
  \textrm{Deaths} \,|\, \textrm{Total}, \theta_s).
\end{align*}

First, the center classification is ignored and a binomial logit
regression model with treatment as covariate is fitted using
\code{glm}, i.e.~$S=1$:
<<beta-glm>>=
data("betablocker", package = "flexmix")
betaGlm <- glm(cbind(Deaths, Total - Deaths) ~ Treatment, 
  family = "binomial", data = betablocker)
betaGlm
@ 

In the next step the center classification is included by allowing a
random effect for the intercept given the centers, i.e.~the
coefficients $\bm{\beta}^s$ are given by
$(\beta^s_{\textrm{Intercept|Center}}, \beta_{\textrm{Treatment}})$.
This signifies that the component membership is fixed for each center.
In order to determine the suitable number of components, the mixture
is fitted with different numbers of components and the BIC information
criterion is used to select an appropriate model. In this case a model
with three components is selected. The fitted values for the model
with three components are given in Figure~\ref{fig:beta}.
<<beta-fix>>=
betaMixFix <- stepFlexmix(cbind(Deaths, Total - Deaths) ~ 1 | Center,
  model = FLXMRglmfix(family = "binomial", fixed = ~ Treatment), 
  k = 2:4, nrep = 3, data = betablocker)
betaMixFix
@ 

\begin{figure}
  \centering
<<beta-fig, fig=true, echo=false, results=hide, height = 5>>=
library("grid")
betaMixFix_3 <- getModel(betaMixFix, "3")
betaMixFix_3 <- relabel(betaMixFix_3, "model", "Intercept")
betablocker$Center <- with(betablocker, factor(Center, levels = Center[order((Deaths/Total)[1:22])]))
clusters <- factor(clusters(betaMixFix_3), labels = paste("Cluster", 1:3))
print(dotplot(Deaths/Total ~ Center | clusters, groups  = Treatment, as.table = TRUE,
  data  = betablocker, xlab = "Center", layout = c(3, 1), scales = list(x = list(draw = FALSE)),
  key = simpleKey(levels(betablocker$Treatment), lines = TRUE, corner = c(1,0))))
betaMixFix.fitted <- fitted(betaMixFix_3)
for (i in 1:3) {
  seekViewport(trellis.vpname("panel", i, 1))
  grid.lines(unit(1:22, "native"), unit(betaMixFix.fitted[1:22, i], "native"), gp = gpar(lty = 1))
  grid.lines(unit(1:22, "native"), unit(betaMixFix.fitted[23:44, i], "native"), gp = gpar(lty = 2))
}
@
  
\caption{Relative number of deaths for the treatment and the control
  group for each center in the beta blocker dataset. The centers are
  sorted by the relative number of deaths in the control group. The
  lines indicate the fitted values for each component of the 3-component
  mixture model with random intercept and fixed effect for treatment.}
  \label{fig:beta}
\end{figure}

In addition the treatment effect can also be included in the random
part of the model. As then all coefficients for the covariates and the
intercept follow a mixture distribution the component specific model
can be specified using \code{FLXMRglm()}. The coefficients are
$\bm{\beta}^s=(\beta^s_{\textrm{Intercept|Center}},
\beta^s_{\textrm{Treatment|Center}})$, i.e.~it is assumed that the
heterogeneity is only between centers and therefore the aggregated
data for each center can be used.
<<beta-full>>=
betaMix <- stepFlexmix(cbind(Deaths, Total - Deaths) ~ Treatment | Center,
  model = FLXMRglm(family = "binomial"), k = 3, nrep = 3, 
  data = betablocker)
summary(betaMix)
@
 
The full model with a random effect for treatment has a higher BIC and
therefore the smaller would be preferred.

The default plot of the returned \code{flexmix} object is a rootogramm
of the a-posteriori probabilities where observations with a-posteriori
probabilities smaller than \code{eps} are omitted. With argument
\code{mark} the component is specified to have those observations
marked which are assigned to this component based on the maximum
a-posteriori probabilities. This indicates which components overlap.
<<default-plot, fig=true>>=
print(plot(betaMixFix_3, mark = 1, col = "grey", markcol = 1))
@ 

The default plot of the fitted model indicates that the components are
well separated. In addition component 1 has a slight overlap with
component 2 but none with component 3.

The fitted parameters of the component specific models can be accessed
with:
<<parameters>>=
parameters(betaMix)
@ 

The cluster assignments using the maximum a-posteriori probabilities
are obtained with:
<<cluster>>=
table(clusters(betaMix))
@ 

The estimated probabilities for each component for the treated
patients and those in the control group can be obtained with:
<<predict>>=
predict(betaMix, 
  newdata = data.frame(Treatment = c("Control", "Treated")))
@ 
or 
<<fitted>>=
fitted(betaMix)[c(1, 23), ]
@ 

A further analysis of the model is possible with function
\code{refit()} which returns the estimated coefficients together with
the standard deviations, z-values and corresponding p-values:
<<refit>>=
summary(refit(getModel(betaMixFix, "3")))
@ 

The printed coefficients are ordered to have the fixed effects before
the varying effects.
%%-----------------------------------------------------------------------
\subsubsection{Mehta et al. trial}
This dataset is similar to the beta blocker dataset and is also
analyzed in \cite{flexmix:Aitkin:1999a}. The dataset is visualized in
Figure~\ref{fig:mehta}. The observation for the control group in
center 15 is slightly conspicuous and might classify as an outlier.

The model is given by:
\begin{align*}
  H(\textrm{Response} \,|\, \textrm{Total}, \bm{\Theta}) &= \sum_{s = 1}^S
  \pi_s \textrm{Bi}( \textrm{Response} \,|\, \textrm{Total}, \theta_s),
\end{align*}
with $\bm{\beta}^s = (\beta^s_{\textrm{Intercept|Site}},
\beta_{\textrm{Drug}})$. This model is fitted with:
<<mehta-fix>>=
data("Mehta", package = "flexmix")
mehtaMix <- stepFlexmix(cbind(Response, Total - Response)~ 1 | Site, 
  model = FLXMRglmfix(family = "binomial", fixed = ~ Drug), 
  control = list(minprior = 0.04), nrep = 3, k = 3, data = Mehta)
summary(mehtaMix)
@ 

One component only contains the observations for center 15 and in
order to be able to fit a mixture with such a small component it is
necessary to modify the default argument for \code{minprior} which is
0.05. The fitted values for this model are given separately for each
component in Figure~\ref{fig:mehta}.

\begin{figure}
  \centering
<<mehta-fig, fig=true, echo=false, results=hide, height = 5>>=
Mehta$Site <- with(Mehta, factor(Site, levels = Site[order((Response/Total)[1:22])]))
clusters <- factor(clusters(mehtaMix), labels = paste("Cluster", 1:3)) 
print(dotplot(Response/Total ~ Site | clusters, groups  = Drug,  layout = c(3,1),
              data  = Mehta, xlab = "Site", scales = list(x = list(draw = FALSE)),
              key = simpleKey(levels(Mehta$Drug), lines = TRUE, corner = c(1,0))))
mehtaMix.fitted <- fitted(mehtaMix)
for (i in 1:3) {
  seekViewport(trellis.vpname("panel", i, 1))
  sapply(1:nlevels(Mehta$Drug), function(j) 
  grid.lines(unit(1:22, "native"), unit(mehtaMix.fitted[Mehta$Drug == levels(Mehta$Drug)[j], i], "native"), gp = gpar(lty = j)))
}
@ 
\caption{Relative number of responses for the treatment and the
  control group for each site in the Mehta et al.~trial dataset
  together with the fitted values. The sites are sorted by the
  relative number of responses in the control group.}
  \label{fig:mehta}
\end{figure}

If also a random effect for the coefficient of $\textrm{Drug}$ is
fitted, i.e.~$\bm{\beta}^s = (\beta^s_{\textrm{Intercept|Site}},
\beta^s_{\textrm{Drug|Site}})$, this is estimated by:

<<mehta-full>>=
mehtaMix <- stepFlexmix(cbind(Response, Total - Response) ~ Drug | Site, 
  model = FLXMRglm(family = "binomial"), k = 3, data = Mehta, nrep = 3,
  control = list(minprior = 0.04))
summary(mehtaMix)
@ 

The BIC is smaller for the larger model and this indicates that the
assumption of an equal drug effect for all centers is not confirmed by
the data.

Given Figure~\ref{fig:mehta} a two-component model with fixed
treatment is also fitted to the data where site 15 is omitted:
<<mehta-sub>>=
Mehta.sub <- subset(Mehta, Site != 15)
mehtaMix <- stepFlexmix(cbind(Response, Total - Response) ~ 1 | Site, 
  model = FLXMRglmfix(family = "binomial", fixed = ~ Drug),
  data = Mehta.sub, k = 2, nrep = 3)
summary(mehtaMix)
@ 

%%-----------------------------------------------------------------------  
\subsubsection{Tribolium}
A finite mixture of binomial regressions is fitted to the Tribolium
dataset given in \cite{flexmix:Wang+Puterman:1998}. The data was
collected to investigate whether the adult Tribolium species Castaneum
has developed an evolutionary advantage to recognize and avoid eggs of
its own species while foraging, as beetles of the genus Tribolium are
cannibalistic in the sense that adults eat the eggs of their own
species as well as those of closely related species.

The experiment isolated a number of adult beetles of the same species
and presented them with a vial of 150 eggs (50 of each type), the eggs
being thoroughly mixed to ensure uniformity throughout the vial. The
data gives the consumption data for adult Castaneum species.  It
reports the number of Castaneum, Confusum and Madens eggs,
respectively, that remain uneaten after two day exposure to the adult
beetles. Replicates 1, 2, and 3 correspond to different occasions on
which the experiment was conducted. The data is visualized in
Figure~\ref{fig:tribolium} and the model is given by:
\begin{align*}
  H(\textrm{Remaining} \,|\, \textrm{Total}, \bm{\Theta}) &= \sum_{s = 1}^S
  \pi_s(\textrm{Replicate}, \bm{\alpha}) \textrm{Bi}( \textrm{Remaining} \,|\, \textrm{Total}, \theta_s),
\end{align*}
with $\bm{\beta}^s = (\beta^s_{\textrm{Intercept}},
\bm{\beta}_{\textrm{Species}})$. This model is fitted with:
<<tribolium>>=
data("tribolium", package = "flexmix")
TribMix <- stepFlexmix(cbind(Remaining, Total - Remaining) ~ 1, 
  k = 2:3, model = FLXMRglmfix(fixed = ~ Species, family = "binomial"),
  concomitant = FLXPmultinom(~ Replicate), data = tribolium)
@ 

The model which is selected as the best in
\cite{flexmix:Wang+Puterman:1998} can be estimated with:
<<wang-model>>=
modelWang <- FLXMRglmfix(fixed = ~ I(Species == "Confusum"), 
  family = "binomial") 
concomitantWang <- FLXPmultinom(~ I(Replicate == 3))
TribMixWang <- stepFlexmix(cbind(Remaining, Total - Remaining) ~ 1,
  data = tribolium, model = modelWang, concomitant = concomitantWang, 
  k = 2) 
summary(refit(TribMixWang))
@ 

\begin{figure}
  \centering
<<tribolium, fig=true, echo=false, results=hide, height = 4>>=
clusters <- factor(clusters(TribMixWang), labels = paste("Cluster", 1:TribMixWang@k))
print(dotplot(Remaining/Total ~ factor(Replicate) | clusters, groups  = Species, 
              data  = tribolium[rep(1:9, each = 3) + c(0:2)*9,], xlab = "Replicate",
              auto.key = list(corner = c(1,0))))
@ 

\caption{Relative number of remaining beetles for the number of
  replicate. The different panels are according to the cluster
  assignemnts based on the a-posteriori probabilities of the model
  suggested in \cite{flexmix:Wang+Puterman:1998}.}
  \label{fig:tribolium}
\end{figure}

\cite{flexmix:Wang+Puterman:1998} also considered a model where they
omit one conspicuous observation. This model can be estimated with:
<<subset>>=
TribMixWangSub <- stepFlexmix(cbind(Remaining, Total - Remaining) ~ 1, 
  k = 2, data = tribolium[-7,], model = modelWang,
  concomitant = concomitantWang)
@ 

%%-----------------------------------------------------------------------  
\subsubsection{Trypanosome}
The data is used in \cite{flexmix:Follmann+Lambert:1989}. It is from
a dosage-response analysis where the proportion of organisms belonging
to different populations shall be assessed. It is assumed that
organisms belonging to different populations are indistinguishable
other than in terms of their reaction to the stimulus. The
experimental technique involved inspection under the microscope of a
representative aliquot of a suspension, all organisms appearing within
two fields of view being classified either alive or dead. Hence the
total numbers of organisms present at each dose and the number showing
the quantal response were both random variables. The data is
illustrated in Figure~\ref{fig:trypanosome}.

The model which is proposed in \cite{flexmix:Follmann+Lambert:1989}
is given by:
\begin{align*}
  H(\textrm{Dead} \,|\,\bm{\Theta}) &= \sum_{s = 1}^S \pi_s
  \textrm{Bi}( \textrm{Dead} \,|\, \theta_s),
\end{align*}
where $\textrm{Dead} \in \{0,1\}$ and with $\bm{\beta}^s =
(\beta^s_{\textrm{Intercept}}, \bm{\beta}_{\textrm{log(Dose)}})$. This
model is fitted with:
<<trypanosome>>=
data("trypanosome", package = "flexmix")
TrypMix <- stepFlexmix(cbind(Dead, 1-Dead) ~ 1, k = 2, nrep = 3, 
  data = trypanosome,  model = FLXMRglmfix(family = "binomial", 
  fixed = ~ log(Dose)))
summary(refit(TrypMix))
@ 

The fitted values are given in Figure~\ref{fig:trypanosome} together
with the fitted values of a generalized linear model in order to
facilitate comparison of the two models.

\begin{figure}
  \centering
<<trypanosome, fig=true, echo=false, results=hide>>=
tab <- with(trypanosome, table(Dead, Dose))
Tryp.dat <- data.frame(Dead = tab["1",], Alive = tab["0",], 
                       Dose = as.numeric(colnames(tab)))
plot(Dead/(Dead+Alive) ~ Dose, data = Tryp.dat)
Tryp.pred <- predict(glm(cbind(Dead, 1-Dead) ~ log(Dose), family = "binomial", data = trypanosome), newdata=Tryp.dat, type = "response")
TrypMix.pred <- predict(TrypMix, newdata = Tryp.dat, aggregate = TRUE)[[1]]
lines(Tryp.dat$Dose, Tryp.pred, lty = 2)
lines(Tryp.dat$Dose, TrypMix.pred, lty = 3)
legend(4.7, 1, c("GLM", "Mixture model"), lty=c(2, 3), xjust=0, yjust=1)
@ 
\caption{Relative number of deaths for each dose level together with
  the fitted values for the generalized linear model (``GLM'') and the
  random intercept model (``Mixture model'').}
  \label{fig:trypanosome}
\end{figure}

%%-------------------------------------------------------------------------
\subsection{Finite mixtures of Poisson regressions}
% %%-----------------------------------------------------------------------                         
\subsubsection{Fabric faults}
The dataset is analyzed using a finite mixture of Poisson regression
models in \cite{flexmix:Aitkin:1996}. Furthermore, it is described in
\cite{flexmix:McLachlan+Peel:2000} on page 155. It contains 32
observations on the number of faults in rolls of a textile fabric. A
random intercept model is used where a fixed effect is assumed for the
logarithm of length:
<<fabric-fix>>=
data("fabricfault", package = "flexmix")
fabricMix <- stepFlexmix(Faults ~ 1, model = FLXMRglmfix(family="poisson", 
  fixed = ~ log(Length)), data = fabricfault, k = 2, nrep = 3)
summary(fabricMix)
summary(refit(fabricMix))
Lnew <- seq(0, 1000, by = 50)
fabricMix.pred <- predict(fabricMix, newdata = data.frame(Length = Lnew))
@ 

The intercept of the first component is not significantly different
from zero for a signficance level of 0.05. We therefore also fit a
modified model where the intercept is a-priori set to zero for the
first component. This nested structure is given as part of the model
specification with argument \code{nested}. 
<<fabric-fix-nested>>=
fabricMix2 <- flexmix(Faults ~ 0, data = fabricfault, 
  cluster = posterior(fabricMix), 
  model = FLXMRglmfix(family = "poisson", fixed = ~ log(Length),
  nested = list(k=c(1,1), formula=list(~0,~1))))
summary(refit(fabricMix2))
fabricMix2.pred <- predict(fabricMix2, 
  newdata = data.frame(Length = Lnew))
@ 

The data and the fitted values for each of the components for both
models are given in Figure~\ref{fig:fabric}.

\begin{figure}
  \centering
<<fabric-fig, fig=true, echo=false, results=hide>>=
plot(Faults ~ Length, data = fabricfault)
sapply(fabricMix.pred, function(y) lines(Lnew, y, lty = 1))
sapply(fabricMix2.pred, function(y) lines(Lnew, y, lty = 2))
legend(190, 25, paste("Model", 1:2), lty=c(1, 2), xjust=0, yjust=1)
@ 
 
\caption{Observed values of the fabric faults dataset together with
  the fitted values for the components of each of the two fitted
  models.}
  \label{fig:fabric}
\end{figure}

%%-----------------------------------------------------------------------
\subsubsection{Patent}                        
The patent data given in \cite{flexmix:Wang+Cockburn+Puterman:1998}
consist of 70 observations on patent applications, R\&D spending and
sales in millions of dollar from pharmaceutical and biomedical
companies in 1976 taken from the National Bureau of Economic Research
R\&D Masterfile. The observations are displayed in
Figure~\ref{fig:patent}. The model which is chosen as the best
in \cite{flexmix:Wang+Cockburn+Puterman:1998} is given by:
\begin{align*}
  H(\textrm{Patents} \,|\, \textrm{lgRD}, \textrm{RDS}, \bm{\Theta}) &=
  \sum_{s = 1}^S \pi_s(\textrm{RDS}, \bm{\alpha}) \textrm{Poi} ( \textrm{Patents} \,|\, \lambda_s),
\end{align*}
and $\bm{\beta}^s = (\beta^s_{\textrm{Intercept}}, \beta^s_{\textrm{lgRD}})$.

The model is fitted with:
<<patent, echo=true>>=
data("patent", package = "flexmix")
ModelPat <- FLXMRglm(family = "poisson")
FittedPat <- stepFlexmix(Patents ~ lgRD, k = 3, nrep = 3, 
  model = ModelPat, data = patent, concomitant = FLXPmultinom(~ RDS))             
summary(FittedPat)
@
The fitted values for the component specific models and the
concomitant variable model are given in Figure~\ref{fig:patent}. The
plotting symbol of the observations corresponds to the induced
clustering given by \code{clusters(FittedPat)}.

This model is modified to have fixed effects for the logarithmized
R\&D spendings, i.e.~$\bm(\beta)^s = (\beta^s_{\textrm{Intercept}},
\beta_{\textrm{lgRD}})$. The already fitted model is used for
initialization, i.e.~the EM algorithm is started with an M-step given
the a-posteriori probabilities.
<<patent-fixed>>=
ModelFixed <- FLXMRglmfix(family = "poisson", fixed = ~ lgRD)
FittedPatFixed <- flexmix(Patents ~ 1, model = ModelFixed, 
  cluster = posterior(FittedPat), concomitant = FLXPmultinom(~ RDS), 
  data = patent)             
summary(FittedPatFixed)
@ 

The fitted values for the component specific models and the
concomitant variable model of this model are also given in
Figure~\ref{fig:patent}.

\begin{figure}
\centering
\setkeys{Gin}{width=0.95\textwidth}
<<Poisson, fig=true, echo=false, results=hide>>=
lgRDv <- seq(-3, 5, by = 0.05)
newdata <- data.frame(lgRD = lgRDv)
plotData <- function(fitted) {
  with(patent, data.frame(Patents = c(Patents, 
                            unlist(predict(fitted, newdata = newdata))),
                          lgRD = c(lgRD, rep(lgRDv, 3)),
                          class = c(clusters(fitted), rep(1:3, each = nrow(newdata))),
                          type = rep(c("data", "fit"), c(nrow(patent), nrow(newdata)*3))))
}
plotPatents <- cbind(plotData(FittedPat), which = "Wang et al.")
plotPatentsFixed <- cbind(plotData(FittedPatFixed), which = "Fixed effects")
plotP <- rbind(plotPatents, plotPatentsFixed)
rds <- seq(0, 3, by = 0.02)
x <- model.matrix(FittedPat@concomitant@formula, data = data.frame(RDS = rds))
plotConc <- function(fitted) {
  E <- exp(x%*%fitted@concomitant@coef)
  data.frame(Probability = as.vector(E/rowSums(E)),
                         class = rep(1:3, each = nrow(x)),
                         RDS = rep(rds, 3))
}
plotConc1 <- cbind(plotConc(FittedPat), which = "Wang et al.")
plotConc2 <- cbind(plotConc(FittedPatFixed), which = "Fixed effects")
plotC <- rbind(plotConc1, plotConc2)
print(xyplot(Patents ~ lgRD | which, data = plotP, groups=class, xlab = "log(R&D)",
             panel = "panel.superpose", type = plotP$type,
             panel.groups = function(x, y, type = "p", subscripts, ...)
             {
               ind <- plotP$type[subscripts] == "data"
               panel.xyplot(x[ind], y[ind], ...)
               panel.xyplot(x[!ind], y[!ind], type = "l", ...)
             },
             scales = list(alternating=FALSE), layout=c(1,2), as.table=TRUE),
      more=TRUE, position=c(0,0,0.6, 1))
print(xyplot(Probability ~ RDS | which, groups = class, data = plotC, type = "l",
             scales = list(alternating=FALSE), layout=c(1,2), as.table=TRUE),
      position=c(0.6, 0.01, 1, 0.99))
@ 

\caption{Patent data with the fitted values of the component specific
  models (left) and the concomitant variable model (right) for the
  model in \citeauthor{flexmix:Wang+Cockburn+Puterman:1998} and with
  fixed effects for $\log(\textrm{R\&D})$. The plotting symbol for
  each observation is determined by the component with the maximum
  a-posteriori probability.}
\label{fig:patent}
\end{figure}
\setkeys{Gin}{width=0.8\textwidth}

With respect to the BIC the full model is better than the model with
the fixed effects. However, fixed effects have the advantage that the
different components differ only in their baseline and the relation
between the components in return of investment for each additional
unit of R\&D spending is constant. Due to a-priori domain knowledge
this model might seem more plausible. The fitted values for the
constrained model are also given in Figure~\ref{fig:patent}.
%%-----------------------------------------------------------------------
\subsubsection{Seizure}  
The data is used in \cite{flexmix:Wang+Puterman+Cockburn:1996} and is
from a clinical trial where the effect of intravenous gamma-globulin
on suppression of epileptic seizures is studied. There are daily
observations for a period of 140 days on one patient, where the first
27 days are a baseline period without treatment, the remaining 113
days are the treatment period. The model proposed in
\cite{flexmix:Wang+Puterman+Cockburn:1996} is given by:
\begin{align*}
  H(\textrm{Seizures} \,|\, (\textrm{Treatment}, \textrm{log(Day)},
  \textrm{log(Hours)}), \bm{\Theta}) &= \sum_{s = 1}^S \pi_s
  \textrm{Poi} ( \textrm{Seizures} \,|\, \lambda_s),
\end{align*}
where $\bm(\beta)^s = (\beta^s_{\textrm{Intercept}},
\beta^s_{\textrm{Treatment}}, \beta^s_{\textrm{log(Day)}},
\beta^s_{\textrm{Treatment:log(Day)}})$ and $\textrm{log(Hours)}$ is
used as offset. This model is fitted with:
<<seizure>>=
data("seizure", package = "flexmix")
seizMix <- stepFlexmix(Seizures ~ Treatment * log(Day), data = seizure, 
  k = 2, nrep = 3, model = FLXMRglm(family = "poisson", 
  offset = log(seizure$Hours)))
summary(seizMix)
summary(refit(seizMix))
@ 

A different model with different contrasts to directly estimate the
coefficients for the jump when changing between base and treatment
period is given by:
<<seizure>>=
seizMix2 <- flexmix(Seizures ~ Treatment * log(Day/27), 
  data = seizure, cluster = posterior(seizMix),
  model = FLXMRglm(family = "poisson", offset = log(seizure$Hours)))
summary(seizMix2)
summary(refit(seizMix2))
@ 

A different model which allows no jump at the change between base and
treatment period is fitted with:
<<seizure>>=
seizMix3 <- flexmix(Seizures ~ log(Day/27)/Treatment, data = seizure, 
  cluster = posterior(seizMix), model = FLXMRglm(family = "poisson", 
  offset = log(seizure$Hours)))
summary(seizMix3)
summary(refit(seizMix3))
@ 

With respect to the BIC criterion the smaller model with no jump is
preferred. This is also the more intuitive model from a practitioner's
point of view, as it does not seem to be plausible that starting the
treatment already gives a significant improvement, but improvement
develops over time. The data points together with the fitted values
for each component of the two models are given in
Figure~\ref{fig:seizure}. It can clearly be seen that the fitted
values are nearly equal which also supports the smaller model.

\begin{figure}
  \centering
<<seizure, fig=true, echo=false, results=hide>>=
plot(Seizures/Hours~Day, pch = c(1,3)[as.integer(Treatment)], data=seizure)
abline(v=27.5, lty=2, col="grey")
legend(140, 9, c("Baseline", "Treatment"), pch=c(1, 3), xjust=1, yjust=1)
matplot(seizure$Day, fitted(seizMix)/seizure$Hours, type="l", add=TRUE, lty = 1, col = 1)
matplot(seizure$Day, fitted(seizMix3)/seizure$Hours, type="l", add=TRUE, lty = 3, col = 1)
legend(140, 7, paste("Model", c(1,3)), lty=c(1, 3), xjust=1, yjust=1)
@ 

\caption{Observed values for the seizure dataset together with the
  fitted values for the components of the two different models.}
  \label{fig:seizure}
\end{figure}

%%-----------------------------------------------------------------------
\subsubsection{Ames salmonella assay data}                        
The ames salomnella assay dataset was used in
\cite{flexmix:Wang+Puterman+Cockburn:1996}. They propose a model
given by:
\begin{align*}
  H(\textrm{y} \,|\, \textrm{x}, \bm{\Theta}) &= \sum_{s = 1}^S \pi_s
  \textrm{Poi} ( \textrm{y} \,|\, \lambda_s),
\end{align*}
where $\bm{\beta}^s = (\beta^s_{\textrm{Intercept}},
\beta_{\textrm{x}}, \beta_{\textrm{log(x+10)}})$. The model is fitted with:
<<salmonella>>=
data("salmonellaTA98", package = "flexmix")
salmonMix <- stepFlexmix(y ~ 1, data = salmonellaTA98, k = 2, nrep = 3,
  model = FLXMRglmfix(family = "poisson", fixed = ~ x + log(x + 10)))
@ 
 
\begin{figure}
  \centering
<<salmonella, fig=true, echo=false, results=hide>>=
salmonMix.pr <- predict(salmonMix, newdata=salmonellaTA98)
plot(y~x, data=salmonellaTA98, 
     pch=as.character(clusters(salmonMix)), 
     xlab="Dose of quinoline", ylab="Number of revertant colonies of salmonella",
     ylim=range(c(salmonellaTA98$y, unlist(salmonMix.pr))))
for (i in 1:2) lines(salmonellaTA98$x, salmonMix.pr[[i]], lty=i)
@ 

\caption{Means and classification for assay data according to the
  estimated posterior probabilities based on the fitted model.}
\label{fig:almes}
\end{figure}
%%-----------------------------------------------------------------------
\section{Conclusions and future work}                        
Package \pkg{flexmix} can be used to fit finite mixtures of
regressions to datasets used in the literature to illustrate these
models. The results can be reproduced and additional insights can be
gained using visualization methods available in \proglang{R}. The fitted
model is an object in \proglang{R} which can be explored using
\code{show()}, \code{summary()} or \code{plot()}, as suitable methods
have been implemented for objects of class \code{"flexmix"} which are
returned by \code{flexmix()}.

In the future it would be desirable to have more diagnostic tools
available to analyze the model fit and compare different models.  The
use of resampling methods would be convenient as they can be applied
to all kinds of mixtures models and would therefore suit well the
purpose of the package which is flexible modelling of various finite
mixture models. Furthermore, an additional visualization method for the
fitted coefficients of the mixture would facilitate the comparison of
the components.
%%-----------------------------------------------------------------------  
\section*{Computational details}
<<echo=false,results=hide>>=
SI <- sessionInfo()
pkgs <- paste(sapply(c(SI$otherPkgs, SI$loadedOnly), function(x) 
                     paste("\\\\pkg{", x$Package, "} ", 
                           x$Version, sep = "")), collapse = ", ")
@ 

All computations and graphics in this paper have been done using
\proglang{R} version \Sexpr{getRversion()} with the packages
\Sexpr{pkgs}. 
%%-----------------------------------------------------------------------  
\section*{Acknowledgments}

This research was supported by the the Austrian Science Foundation
(FWF) under grant P17382 and the Austrian Academy of Sciences
({\"O}AW) through a DOC-FFORTE scholarship for Bettina Gr{\"u}n.
%%-----------------------------------------------------------------------  
\bibliography{flexmix}
\end{document}