Everyone is welcome here --- except those who have borrowed books from me for and have not returned them yet!

Easy Anova with R

Posted on août 25, 2012 in stats

This document provides a few examples of Analyses of Variance for typical experimental designs. We recommend to use of the ez package to perform ANOVAs with R.

require(ez)
## Loading required package: ez
require(lattice)
## Loading required package: lattice
require(ggplot2)
## Loading required package: ggplot2

S<G> Design (one-way ANOVA between subjects)

We start by analyzing a design where the factor Subject is nested within the factor Group, that is, we want to compare independent groups.

ns <- c(20, 30, 20, 15)
means <- c(100, 110, 100, 130)
sd <- c(20, 20, 30, 25)

dat1 <- data.frame(
    subj=factor(1:sum(ns)),
    group=factor(rep(1:4, ns)),
    y=c(rnorm(ns[1], means[1], sd[1]),
        rnorm(ns[2], means[2], sd[2]),
        rnorm(ns[3], means[3], sd[3]),
        rnorm(ns[4], means[4], sd[4])))

Let us inspect the data:

str(dat1)
## 'data.frame':    85 obs. of  3 variables:
##  $ subj : Factor w/ 85 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ group: Factor w/ 4 levels "1","2","3","4": 1 1 1 1 1 1 1 1 1 1 ...
##  $ y    : num  75.9 108.9 148.1 80.3 63.4 ...
head(dat1)
##   subj group         y
## 1    1     1  75.85807
## 2    2     1 108.91287
## 3    3     1 148.09366
## 4    4     1  80.28648
## 5    5     1  63.41147
## 6    6     1 111.60601

Visual inspection

Different graphics can be used, the best choiice depends on the number of data points. For example, dotplots are good when there are not too many points:

require(lattice)
with(dat1, dotplot(y ~ group))

with(dat1, boxplot(y ~ group))

require(ggplot2)

ggplot(dat1, aes(group, y)) +
    geom_violin() + geom_dotplot(binaxis='y', stackdir='center', dotsize=.5)
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.

require(ez)

ezPlot(dat1, y, wid=subj, between=group, x=.(group))
## Warning: Data is unbalanced (unequal N per group). Make sure you specified
## a well-considered value for the type argument to ezANOVA().
## Warning in ezStats(data = data, dv = dv, wid = wid, within = within,
## within_full = within_full, : Unbalanced groups. Mean N will be used in
## computation of FLSD

Anova

ezANOVA(dat1, dv=y, wid=subj, between=group)
## Warning: Data is unbalanced (unequal N per group). Make sure you specified
## a well-considered value for the type argument to ezANOVA().
## $ANOVA
##   Effect DFn DFd        F          p p<.05       ges
## 1  group   3  81 3.409065 0.02139588     * 0.1121069
## 
## $`Levene's Test for Homogeneity of Variance`
##   DFn DFd      SSn      SSd         F         p p<.05
## 1   3  81 499.0771 20586.27 0.6545666 0.5823939

To conduct post-hoc tests comparing all pairs of means:

hsd <- TukeyHSD(aov(y ~ group, data=dat1))
print(hsd)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = y ~ group, data = dat1)
## 
## $group
##           diff        lwr      upr     p adj
## 2-1  0.6734241 -16.952321 18.29917 0.9996365
## 3-1  7.3623076 -11.945729 26.67034 0.7496736
## 4-1 22.1939720   1.338909 43.04903 0.0324969
## 3-2  6.6888835 -10.936861 24.31463 0.7523986
## 4-2 21.5205479   2.212512 40.82858 0.0227903
## 4-3 14.8316644  -6.023398 35.68673 0.2510137
plot(hsd)

If you worry about heteroscedasticity, you can check the diagnostic plots of lm:

par(mfcol=c(2, 2))
plot(lm(y ~ group, data=dat1))

If heteroscedasticity is severe, it will be visible on the plots of the “residuals vs. fitted” plot.

S*T Design (one-way Anova within subjects)

In this design, the factor ‘Subject’ is crossed with a factor ‘Treatment’, that is, there is one data point per subject and treatment.

ns <- 20
means <- c(100, 110, 100, 130)
nt <- length(means)
sd <- 20

dat2 <- data.frame(
    subj=gl(ns, nt),
    treat=gl(nt, 1, nt*ns))

dat2$y <- rep(means, ns) + rep(rnorm(ns, sd=20), each=nt) + rnorm(nt*ns, sd=20)

str(dat2)
## 'data.frame':    80 obs. of  3 variables:
##  $ subj : Factor w/ 20 levels "1","2","3","4",..: 1 1 1 1 2 2 2 2 3 3 ...
##  $ treat: Factor w/ 4 levels "1","2","3","4": 1 2 3 4 1 2 3 4 1 2 ...
##  $ y    : num  105.6 103.3 69.1 123 69.6 ...
head(dat2)
##   subj treat         y
## 1    1     1 105.59941
## 2    1     2 103.30880
## 3    1     3  69.06811
## 4    1     4 123.01079
## 5    2     1  69.58211
## 6    2     2  97.29196

We first plot the raw data, per subject:

require(lattice)
xyplot(y ~ treat | subj, data=dat2, type='b')

Or on a single plot (in the right panel, the subject effects are removed):

par(mfcol=c(1,2))
with(dat2, interaction.plot(treat, subj, y))
with(dat2, interaction.plot(treat, subj, y-tapply(y, subj, mean)[subj]))

And we perform the Anova with ez:

ezPlot(dat2, dv=y, wid=subj, within=treat, x=.(treat))

ezANOVA(dat2, dv=y, wid=subj, within=treat)
## $ANOVA
##   Effect DFn DFd        F            p p<.05       ges
## 2  treat   3  57 14.39573 4.280537e-07     * 0.1902848
## 
## $`Mauchly's Test for Sphericity`
##   Effect         W          p p<.05
## 2  treat 0.4735923 0.02137733     *
## 
## $`Sphericity Corrections`
##   Effect      GGe        p[GG] p[GG]<.05      HFe        p[HF] p[HF]<.05
## 2  treat 0.714377 1.257336e-05         * 0.808031 4.135472e-06         *

S<A*B> Design (Two-way between subjects Anova)

Generation of a dataset S<A3*B2>

subject = factor(paste('sub', 1:30, sep=''))
A = gl(3 ,10, 30, labels=c('a1','a2','a3'))
B = gl(2, 5, 30, labels=c('b1', 'b2'))
x = rnorm(30, mean=10) + 1 * (A=='a1' & B=='b2')
dat3 = data.frame(subject, A, B, x)
rm(subject, A, B, x)
attach(dat3)

Classical R approach

table(A, B)
##     B
## A    b1 b2
##   a1  5  5
##   a2  5  5
##   a3  5  5
tapply(x, list(A, B), mean)
##           b1       b2
## a1  9.614235 11.01789
## a2 10.133099 10.07080
## a3 10.000732 10.63608
interaction.plot(A,B,x)

summary(aov(x ~ A * B, data=dat3))
##             Df Sum Sq Mean Sq F value Pr(>F)
## A            2   0.31   0.155   0.068  0.934
## B            1   3.26   3.256   1.435  0.243
## A:B          2   2.69   1.344   0.592  0.561
## Residuals   24  54.47   2.270

Using ez

ezPlot(data=dat3,  dv=.(x), wid=.(subject), between=.(A,B), 
       x=.(B), split=.(A))

ezANOVA(data=dat3, dv=x, wid=subject, between=c('A','B'))
## $ANOVA
##   Effect DFn DFd         F         p p<.05         ges
## 1      A   2  24 0.0680766 0.9343686       0.005641048
## 2      B   1  24 1.4346721 0.2426994       0.056406155
## 3    A:B   2  24 0.5922504 0.5609637       0.047032926
## 
## $`Levene's Test for Homogeneity of Variance`
##   DFn DFd      SSn      SSd        F         p p<.05
## 1   5  24 2.677601 27.35433 0.469852 0.7948862
detach(dat3)

S*A*B Design (Two-way within-subject Anova)

Same dataset but with A & B within subject

dat4 <- dat3
subject = gl(5, 1, 30, labels=paste('sub', 1:5, sep=''))
dat4$subject = subject
table(dat4$subject, dat4$A, dat4$B)
## , ,  = b1
## 
##       
##        a1 a2 a3
##   sub1  1  1  1
##   sub2  1  1  1
##   sub3  1  1  1
##   sub4  1  1  1
##   sub5  1  1  1
## 
## , ,  = b2
## 
##       
##        a1 a2 a3
##   sub1  1  1  1
##   sub2  1  1  1
##   sub3  1  1  1
##   sub4  1  1  1
##   sub5  1  1  1
attach(dat4)
## The following object is masked _by_ .GlobalEnv:
## 
##     subject
interaction.plot(A:B, subject, x)

summary(aov(x ~ A*B + Error(subject/(A*B))))
## 
## Error: subject
##           Df Sum Sq Mean Sq F value Pr(>F)
## Residuals  4  5.321    1.33               
## 
## Error: subject:A
##           Df Sum Sq Mean Sq F value Pr(>F)
## A          2  0.309  0.1545   0.113  0.894
## Residuals  8 10.922  1.3653               
## 
## Error: subject:B
##           Df Sum Sq Mean Sq F value Pr(>F)
## B          1  3.256   3.256   1.755  0.256
## Residuals  4  7.421   1.855               
## 
## Error: subject:A:B
##           Df Sum Sq Mean Sq F value Pr(>F)
## A:B        2  2.688   1.344   0.349  0.716
## Residuals  8 30.807   3.851
summary(aov(x ~ A + Error(subject/A), data=dat4, subset= (B=='b1')))
## 
## Error: subject
##           Df Sum Sq Mean Sq F value Pr(>F)
## Residuals  4  8.045   2.011               
## 
## Error: subject:A
##           Df Sum Sq Mean Sq F value Pr(>F)
## A          2  0.727  0.3634   0.155  0.859
## Residuals  8 18.713  2.3391
summary(aov(x ~ A + Error(subject/A), data=dat4, subset= (B=='b2')))
## 
## Error: subject
##           Df Sum Sq Mean Sq F value Pr(>F)
## Residuals  4  4.698   1.174               
## 
## Error: subject:A
##           Df Sum Sq Mean Sq F value Pr(>F)
## A          2  2.271   1.135   0.395  0.686
## Residuals  8 23.016   2.877
for (a in levels(A))
  {
  print(paste("Effect of B for A =",a))
  print(summary(aov(x ~ B + Error(subject/(B), data=dat4, subset=(A==a)))))
}
## [1] "Effect of B for A = a1"
## 
## Error: subject
##           Df Sum Sq Mean Sq F value Pr(>F)
## Residuals  4  5.321    1.33               
## 
## Error: subject:B
##           Df Sum Sq Mean Sq F value Pr(>F)
## B          1  3.256   3.256   1.755  0.256
## Residuals  4  7.421   1.855               
## 
## Error: Within
##           Df Sum Sq Mean Sq F value Pr(>F)
## Residuals 20  44.73   2.236               
## [1] "Effect of B for A = a2"
## 
## Error: subject
##           Df Sum Sq Mean Sq F value Pr(>F)
## Residuals  4  5.321    1.33               
## 
## Error: subject:B
##           Df Sum Sq Mean Sq F value Pr(>F)
## B          1  3.256   3.256   1.755  0.256
## Residuals  4  7.421   1.855               
## 
## Error: Within
##           Df Sum Sq Mean Sq F value Pr(>F)
## Residuals 20  44.73   2.236               
## [1] "Effect of B for A = a3"
## 
## Error: subject
##           Df Sum Sq Mean Sq F value Pr(>F)
## Residuals  4  5.321    1.33               
## 
## Error: subject:B
##           Df Sum Sq Mean Sq F value Pr(>F)
## B          1  3.256   3.256   1.755  0.256
## Residuals  4  7.421   1.855               
## 
## Error: Within
##           Df Sum Sq Mean Sq F value Pr(>F)
## Residuals 20  44.73   2.236
detach(dat4)
ezPlot(data=dat4,  dv=.(x), wid=.(subject), between=.(A), within=.(B), 
       x=.(B), split=.(A))
## Warning: The column supplied as the wid variable contains non-unique values
## across levels of the supplied between-Ss variables. Automatically fixing
## this by generating unique wid labels.

ezANOVA(data=dat4,
        dv=x,
        wid=subject,
        within=.(A, B))
## $ANOVA
##   Effect DFn DFd         F         p p<.05         ges
## 2      A   2   8 0.1131701 0.8944030       0.005641048
## 3      B   1   4 1.7550178 0.2558619       0.056406155
## 4    A:B   2   8 0.3490647 0.7155772       0.047032926
## 
## $`Mauchly's Test for Sphericity`
##   Effect         W         p p<.05
## 2      A 0.9922891 0.9884559      
## 4    A:B 0.8754243 0.8190829      
## 
## $`Sphericity Corrections`
##   Effect       GGe     p[GG] p[GG]<.05      HFe     p[HF] p[HF]<.05
## 2      A 0.9923481 0.8931132           1.965828 0.8944030          
## 4    A:B 0.8892242 0.6934962           1.551223 0.7155772

S<B>*A Split-plot ANOVA (A within, B between)

dat5 <- dat3
subject = gl(10, 1, 30, labels=paste('sub', 1:10, sep=''))
dat5$subject = subject
table(dat5$subject, dat5$A, dat5$B)
## , ,  = b1
## 
##        
##         a1 a2 a3
##   sub1   1  1  1
##   sub2   1  1  1
##   sub3   1  1  1
##   sub4   1  1  1
##   sub5   1  1  1
##   sub6   0  0  0
##   sub7   0  0  0
##   sub8   0  0  0
##   sub9   0  0  0
##   sub10  0  0  0
## 
## , ,  = b2
## 
##        
##         a1 a2 a3
##   sub1   0  0  0
##   sub2   0  0  0
##   sub3   0  0  0
##   sub4   0  0  0
##   sub5   0  0  0
##   sub6   1  1  1
##   sub7   1  1  1
##   sub8   1  1  1
##   sub9   1  1  1
##   sub10  1  1  1
table(dat5$subject, dat5$B:dat5$A)
##        
##         b1:a1 b1:a2 b1:a3 b2:a1 b2:a2 b2:a3
##   sub1      1     1     1     0     0     0
##   sub2      1     1     1     0     0     0
##   sub3      1     1     1     0     0     0
##   sub4      1     1     1     0     0     0
##   sub5      1     1     1     0     0     0
##   sub6      0     0     0     1     1     1
##   sub7      0     0     0     1     1     1
##   sub8      0     0     0     1     1     1
##   sub9      0     0     0     1     1     1
##   sub10     0     0     0     1     1     1
summary(aov(x ~ A*B + Error(subject/A), data=dat5))
## 
## Error: subject
##           Df Sum Sq Mean Sq F value Pr(>F)
## B          1  3.256   3.256   2.044  0.191
## Residuals  8 12.742   1.593               
## 
## Error: subject:A
##           Df Sum Sq Mean Sq F value Pr(>F)
## A          2   0.31  0.1545   0.059  0.943
## A:B        2   2.69  1.3442   0.515  0.607
## Residuals 16  41.73  2.6080
ezANOVA(data=dat5, dv=x, wid=subject, within=.(A), between=.(B))
## $ANOVA
##   Effect DFn DFd          F         p p<.05         ges
## 2      B   1   8 2.04433366 0.1906423       0.056406155
## 3      A   2  16 0.05924289 0.9426836       0.005641048
## 4    B:A   2  16 0.51539916 0.6068478       0.047032926
## 
## $`Mauchly's Test for Sphericity`
##   Effect         W         p p<.05
## 3      A 0.9431864 0.8148755      
## 4    B:A 0.9431864 0.8148755      
## 
## $`Sphericity Corrections`
##   Effect       GGe     p[GG] p[GG]<.05      HFe     p[HF] p[HF]<.05
## 3      A 0.9462406 0.9353259           1.230641 0.9426836          
## 4    B:A 0.9462406 0.5975871           1.230641 0.6068478

S<G>*A*B Design (Split-plot Anova with two within variables)

One can have both between and within-subject factors. We consider here the case of a S20<G2>*A4*B2 design where S=subject is nested within a factor Group and crossed with the factors A and B which are also crossed with each other.

dat6 = data.frame(
  subj = gl(20, 8, 160),
  group = gl(2, 80, 160),
  a = gl(4, 1, 160),
  b = gl(2, 4, 160),
  y = rnorm(160, 100, 10))

str(dat6)
## 'data.frame':    160 obs. of  5 variables:
##  $ subj : Factor w/ 20 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 2 2 ...
##  $ group: Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ a    : Factor w/ 4 levels "1","2","3","4": 1 2 3 4 1 2 3 4 1 2 ...
##  $ b    : Factor w/ 2 levels "1","2": 1 1 1 1 2 2 2 2 1 1 ...
##  $ y    : num  85.3 90.7 86.2 120.3 65.7 ...
head(dat6)
##   subj group a b         y
## 1    1     1 1 1  85.31750
## 2    1     1 2 1  90.68432
## 3    1     1 3 1  86.18480
## 4    1     1 4 1 120.25579
## 5    1     1 1 2  65.70271
## 6    1     1 2 2  88.61727
with(dat6, table(subj, group))
##     group
## subj 1 2
##   1  8 0
##   2  8 0
##   3  8 0
##   4  8 0
##   5  8 0
##   6  8 0
##   7  8 0
##   8  8 0
##   9  8 0
##   10 8 0
##   11 0 8
##   12 0 8
##   13 0 8
##   14 0 8
##   15 0 8
##   16 0 8
##   17 0 8
##   18 0 8
##   19 0 8
##   20 0 8
with(dat6, table(subj, a:b))
##     
## subj 1:1 1:2 2:1 2:2 3:1 3:2 4:1 4:2
##   1    1   1   1   1   1   1   1   1
##   2    1   1   1   1   1   1   1   1
##   3    1   1   1   1   1   1   1   1
##   4    1   1   1   1   1   1   1   1
##   5    1   1   1   1   1   1   1   1
##   6    1   1   1   1   1   1   1   1
##   7    1   1   1   1   1   1   1   1
##   8    1   1   1   1   1   1   1   1
##   9    1   1   1   1   1   1   1   1
##   10   1   1   1   1   1   1   1   1
##   11   1   1   1   1   1   1   1   1
##   12   1   1   1   1   1   1   1   1
##   13   1   1   1   1   1   1   1   1
##   14   1   1   1   1   1   1   1   1
##   15   1   1   1   1   1   1   1   1
##   16   1   1   1   1   1   1   1   1
##   17   1   1   1   1   1   1   1   1
##   18   1   1   1   1   1   1   1   1
##   19   1   1   1   1   1   1   1   1
##   20   1   1   1   1   1   1   1   1
xyplot(y ~ a  | subj, group=b, data=dat6, type='b')

signif(with(dat6, tapply(y, list(group=group, a=a, b=b), mean )), 3)
## , , b = 1
## 
##      a
## group   1   2    3     4
##     1 103 101 98.0 106.0
##     2 102 103 96.5  95.4
## 
## , , b = 2
## 
##      a
## group  1     2     3     4
##     1 98  96.2 101.0 100.0
##     2 95 104.0  97.3  95.8
ezPlot(dat6, dv=y, wid=subj, within = .(a, b), between=group, x=.(a), split=b, col=group)
## Warning: Mixed within-and-between-Ss effect requested; FLSD is only
## appropriate for within-Ss comparisons (see warning in ?ezStats or ?ezPlot).