library(ggplot2) # for general plotting
library(car)     # for ANOVA (Type II used, better than Type I when there is an unbalanced design)

Information of data source

Mite protease activity was measured from mite samples taken from either their rearing host or 2 days on Moneymaker (TU).

Read in the data and view structure to identify any issues in data formatting

cathepsin.data <- read.csv("~/Lab Stuff/Adapted mites/Tomato/Mite Protease Activity on Different Hosts/Mite Protease Activity on Different Hosts TU and TU-A R data.csv", header = TRUE)

# Factor as a trial
cathepsin.data$Trial <- factor(cathepsin.data$Trial)

str(cathepsin.data)
## 'data.frame':    108 obs. of  4 variables:
##  $ Trial     : Factor w/ 2 levels "2","3": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Mite.Plant: Factor w/ 3 levels "TU-A MM","TU Bean",..: 2 2 2 2 2 3 3 3 3 3 ...
##  $ Protease  : Factor w/ 3 levels "Cathepsin B",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Activity  : num  8.7 9.55 9.03 8.93 8.52 ...
# subset proteases
cathepsin.data.CathepsinL <- subset(cathepsin.data, Protease== "Cathepsin L") 
cathepsin.data.CathepsinB <- subset(cathepsin.data, Protease =="Cathepsin B")
cathepsin.data.Legumain <- subset(cathepsin.data, Protease =="Legumain")

Formulate hypothesis

H0: Mite protease activity will not vary depending on what host they are feeding on.

HA: Mite protease activity will vary depending on what host they are feeding on.

Conduct data exploration

Outliers in the response variable (Activity) within explanatory variables (Trial, Mite.Plant)

ggplot(cathepsin.data.CathepsinL, aes(x = Trial, y = Activity)) + geom_boxplot() + theme_classic()

ggplot(cathepsin.data.CathepsinL, aes(x = Mite.Plant, y = Activity)) + geom_boxplot() + theme_classic()

ggplot(cathepsin.data.CathepsinB, aes(x = Trial, y = Activity)) + geom_boxplot() + theme_classic()

ggplot(cathepsin.data.CathepsinB, aes(x = Mite.Plant, y = Activity)) + geom_boxplot() + theme_classic()

ggplot(cathepsin.data.Legumain, aes(x = Trial, y = Activity)) + geom_boxplot() + theme_classic()

ggplot(cathepsin.data.Legumain, aes(x =Mite.Plant, y = Activity)) + geom_boxplot() + theme_classic()

Collinearity of the explanatory variables

Des not apply, all explanatory variables are categorical/factorial.

Spatial/temporal or other hierarchical aspects of sampling design (no information available)

No, I am treating Trial as a main effect to check for reproducibility (not a random effect/blocking factor).

Interactions (is the quality of the data good enough to include them?)

Interaction betweenTrial and Mite.Plant will be performed to test for reproducibility.

Zero inflation in Y

No

Are categorical covariates balanced?

summary(cathepsin.data.CathepsinL)
##  Trial    Mite.Plant        Protease     Activity     
##  2:15   TU-A MM:12   Cathepsin B: 0   Min.   : 6.044  
##  3:21   TU Bean:12   Cathepsin L:36   1st Qu.: 7.961  
##         TU MM  :12   Legumain   : 0   Median : 8.559  
##                                       Mean   : 8.433  
##                                       3rd Qu.: 8.998  
##                                       Max.   :10.247
summary(cathepsin.data.CathepsinB)
##  Trial    Mite.Plant        Protease     Activity     
##  2:15   TU-A MM:12   Cathepsin B:36   Min.   :0.3405  
##  3:21   TU Bean:12   Cathepsin L: 0   1st Qu.:0.9335  
##         TU MM  :12   Legumain   : 0   Median :1.4230  
##                                       Mean   :1.4579  
##                                       3rd Qu.:1.8874  
##                                       Max.   :2.7435
summary(cathepsin.data.Legumain)
##  Trial    Mite.Plant        Protease     Activity      
##  2:15   TU-A MM:12   Cathepsin B: 0   Min.   :0.09689  
##  3:21   TU Bean:12   Cathepsin L: 0   1st Qu.:0.11899  
##         TU MM  :12   Legumain   :36   Median :0.13127  
##                                       Mean   :0.13439  
##                                       3rd Qu.:0.13949  
##                                       Max.   :0.20054

Not for Trial. Yes for Mite.Plant.

Apply model

Cathepsin L:

# fit linear model and display model fit information and ANOVA table
m.CathepsinL <- lm(Activity ~ Mite.Plant + Trial + Mite.Plant:Trial, data = cathepsin.data.CathepsinL)
summary(m.CathepsinL)
## 
## Call:
## lm(formula = Activity ~ Mite.Plant + Trial + Mite.Plant:Trial, 
##     data = cathepsin.data.CathepsinL)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.13914 -0.45186 -0.09593  0.49554  1.18840 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                9.5100     0.2810  33.842  < 2e-16 ***
## Mite.PlantTU Bean         -0.5630     0.3974  -1.417 0.166881    
## Mite.PlantTU MM           -0.8304     0.3974  -2.090 0.045242 *  
## Trial3                    -1.3936     0.3679  -3.788 0.000682 ***
## Mite.PlantTU Bean:Trial3   1.1330     0.5203   2.177 0.037444 *  
## Mite.PlantTU MM:Trial3    -0.1029     0.5203  -0.198 0.844590    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6284 on 30 degrees of freedom
## Multiple R-squared:  0.6222, Adjusted R-squared:  0.5593 
## F-statistic: 9.883 on 5 and 30 DF,  p-value: 1.165e-05
Anova(m.CathepsinL)
## Anova Table (Type II tests)
## 
## Response: Activity
##                   Sum Sq Df F value    Pr(>F)    
## Mite.Plant        7.1169  2  9.0124 0.0008607 ***
## Trial             9.6506  1 24.4417  2.73e-05 ***
## Mite.Plant:Trial  2.7433  2  3.4739 0.0439514 *  
## Residuals        11.8452 30                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Calculate effect size and display
result.anova<-anova(m.CathepsinL)
ss<-result.anova$"Sum Sq"    ##ss = sum of squares
pes<-ss/(ss+ss[length(ss)])  ##pes = partial e squared
pes[length(pes)]<-""
result.anova$"Part E Sq"<-pes
result.anova
## Analysis of Variance Table
## 
## Response: Activity
##                  Df  Sum Sq Mean Sq F value   Pr(>F) Part E Sq
## Mite.Plant        2  7.1169  3.5585  9.0124 0.000861   0.37532
## Trial             1  9.6506  9.6506 24.4417 0.000027   0.44895
## Mite.Plant:Trial  2  2.7433  1.3717  3.4739 0.043951   0.18805
## Residuals        30 11.8452  0.3948
# perform post-hoc Tukey-Kramer test of contrasts
TukeyHSD(aov(Activity ~ Mite.Plant + Trial + Mite.Plant:Trial, data = cathepsin.data.CathepsinL))
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = Activity ~ Mite.Plant + Trial + Mite.Plant:Trial, data = cathepsin.data.CathepsinL)
## 
## $Mite.Plant
##                        diff        lwr        upr     p adj
## TU Bean-TU-A MM  0.09791667 -0.5344947  0.7303280 0.9230376
## TU MM-TU-A MM   -0.89041667 -1.5228280 -0.2580053 0.0044170
## TU MM-TU Bean   -0.98833333 -1.6207447 -0.3559220 0.0016104
## 
## $Trial
##        diff       lwr        upr    p adj
## 3-2 -1.0502 -1.484031 -0.6163694 2.73e-05
## 
## $`Mite.Plant:Trial`
##                             diff        lwr         upr     p adj
## TU Bean:2-TU-A MM:2 -0.563000000 -1.7717649  0.64576489 0.7169349
## TU MM:2-TU-A MM:2   -0.830400000 -2.0391649  0.37836489 0.3195083
## TU-A MM:3-TU-A MM:2 -1.393571429 -2.5126703 -0.27447260 0.0081261
## TU Bean:3-TU-A MM:2 -0.823571429 -1.9426703  0.29552740 0.2507122
## TU MM:3-TU-A MM:2   -2.326857143 -3.4459560 -1.20775831 0.0000079
## TU MM:2-TU Bean:2   -0.267400000 -1.4761649  0.94136489 0.9836383
## TU-A MM:3-TU Bean:2 -0.830571429 -1.9496703  0.28852740 0.2427053
## TU Bean:3-TU Bean:2 -0.260571429 -1.3796703  0.85852740 0.9794860
## TU MM:3-TU Bean:2   -1.763857143 -2.8829560 -0.64475831 0.0005478
## TU-A MM:3-TU MM:2   -0.563171429 -1.6822703  0.55592740 0.6480189
## TU Bean:3-TU MM:2    0.006828571 -1.1122703  1.12592740 1.0000000
## TU MM:3-TU MM:2     -1.496457143 -2.6155560 -0.37735831 0.0039168
## TU Bean:3-TU-A MM:3  0.570000000 -0.4515928  1.59159279 0.5441741
## TU MM:3-TU-A MM:3   -0.933285714 -1.9548785  0.08830708 0.0890029
## TU MM:3-TU Bean:3   -1.503285714 -2.5248785 -0.48169292 0.0013086
# plot interactions
interaction.plot(cathepsin.data.CathepsinL$Mite.Plant, cathepsin.data.CathepsinL$Trial, cathepsin.data.CathepsinL$Activity, type="l", leg.bty="o", leg.bg="grey95", lwd=2, ylab="Activity", xlab="Mite Strain on Host", main="Mite.Plant:Trial")

Cathepsin B:

# fit linear model and display model fit information and ANOVA table
m.CathepsinB <- lm(Activity ~ Mite.Plant + Trial + Mite.Plant:Trial, data = cathepsin.data.CathepsinB)
summary(m.CathepsinB)
## 
## Call:
## lm(formula = Activity ~ Mite.Plant + Trial + Mite.Plant:Trial, 
##     data = cathepsin.data.CathepsinB)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.75157 -0.12689  0.01665  0.12636  0.41593 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                1.4519     0.1141  12.728 1.26e-13 ***
## Mite.PlantTU Bean         -0.0399     0.1613  -0.247  0.80634    
## Mite.PlantTU MM           -0.5327     0.1613  -3.302  0.00249 ** 
## Trial3                     0.8757     0.1494   5.863 2.05e-06 ***
## Mite.PlantTU Bean:Trial3  -0.4150     0.2112  -1.965  0.05875 .  
## Mite.PlantTU MM:Trial3    -1.1997     0.2112  -5.680 3.43e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2551 on 30 degrees of freedom
## Multiple R-squared:  0.8709, Adjusted R-squared:  0.8494 
## F-statistic: 40.48 on 5 and 30 DF,  p-value: 1.869e-12
Anova(m.CathepsinB)
## Anova Table (Type II tests)
## 
## Response: Activity
##                   Sum Sq Df F value    Pr(>F)    
## Mite.Plant       10.0081  2  76.909 1.552e-12 ***
## Trial             0.9963  1  15.313 0.0004845 ***
## Mite.Plant:Trial  2.1652  2  16.639 1.374e-05 ***
## Residuals         1.9519 30                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Calculate effect size and display
result.anova<-anova(m.CathepsinB)
ss<-result.anova$"Sum Sq"  ##ss = sum of squares
pes<-ss/(ss+ss[length(ss)])  ##pes = partial e squared
pes[length(pes)]<-""
result.anova$"Part E Sq"<-pes
result.anova
## Analysis of Variance Table
## 
## Response: Activity
##                  Df  Sum Sq Mean Sq F value     Pr(>F) Part E Sq
## Mite.Plant        2 10.0081  5.0041  76.909 0.00000000   0.83679
## Trial             1  0.9963  0.9963  15.313 0.00048446   0.33794
## Mite.Plant:Trial  2  2.1652  1.0826  16.639 0.00001374   0.52590
## Residuals        30  1.9519  0.0651
# perform post-hoc Tukey-Kramer test of contrasts
TukeyHSD(aov(Activity ~ Mite.Plant + Trial + Mite.Plant:Trial, data = cathepsin.data.CathepsinB))
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = Activity ~ Mite.Plant + Trial + Mite.Plant:Trial, data = cathepsin.data.CathepsinB)
## 
## $Mite.Plant
##                    diff        lwr         upr     p adj
## TU Bean-TU-A MM -0.2820 -0.5387217 -0.02527826 0.0289655
## TU MM-TU-A MM   -1.2325 -1.4892217 -0.97577826 0.0000000
## TU MM-TU Bean   -0.9505 -1.2072217 -0.69377826 0.0000000
## 
## $Trial
##          diff       lwr       upr     p adj
## 3-2 0.3374429 0.1613332 0.5135525 0.0004845
## 
## $`Mite.Plant:Trial`
##                           diff          lwr          upr     p adj
## TU Bean:2-TU-A MM:2 -0.0399000 -0.530587308  0.450787308 0.9998596
## TU MM:2-TU-A MM:2   -0.5327000 -1.023387308 -0.042012692 0.0272255
## TU-A MM:3-TU-A MM:2  0.8756714  0.421383256  1.329959601 0.0000283
## TU Bean:3-TU-A MM:2  0.4207429 -0.033545315  0.875031030 0.0820407
## TU MM:3-TU-A MM:2   -0.8566857 -1.310973887 -0.402397542 0.0000402
## TU MM:2-TU Bean:2   -0.4928000 -0.983487308 -0.002112692 0.0485342
## TU-A MM:3-TU Bean:2  0.9155714  0.461283256  1.369859601 0.0000135
## TU Bean:3-TU Bean:2  0.4606429  0.006354685  0.914931030 0.0453779
## TU MM:3-TU Bean:2   -0.8167857 -1.271073887 -0.362497542 0.0000845
## TU-A MM:3-TU MM:2    1.4083714  0.954083256  1.862659601 0.0000000
## TU Bean:3-TU MM:2    0.9534429  0.499154685  1.407731030 0.0000067
## TU MM:3-TU MM:2     -0.3239857 -0.778273887  0.130302458 0.2813497
## TU Bean:3-TU-A MM:3 -0.4549286 -0.869635038 -0.040222105 0.0250544
## TU MM:3-TU-A MM:3   -1.7323571 -2.147063609 -1.317650677 0.0000000
## TU MM:3-TU Bean:3   -1.2774286 -1.692135038 -0.862722105 0.0000000
# plot interactions
interaction.plot(cathepsin.data.CathepsinB$Mite.Plant, cathepsin.data.CathepsinB$Trial, cathepsin.data.CathepsinB$Activity, type="l", leg.bty="o", leg.bg="grey95", lwd=2, ylab="Activity", xlab="Mite Strain on Host", main="Mite.Plant:Trial")

Legumain:

# fit linear model and display model fit information and ANOVA table
m.Legumain <- lm(Activity ~ Mite.Plant + Trial + Mite.Plant:Trial, data = cathepsin.data.Legumain)
summary(m.Legumain)
## 
## Call:
## lm(formula = Activity ~ Mite.Plant + Trial + Mite.Plant:Trial, 
##     data = cathepsin.data.Legumain)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.033770 -0.005303 -0.000331  0.006267  0.027955 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               0.102193   0.005795  17.635  < 2e-16 ***
## Mite.PlantTU Bean         0.015843   0.008195   1.933 0.062701 .  
## Mite.PlantTU MM           0.028020   0.008195   3.419 0.001830 ** 
## Trial3                    0.030460   0.007587   4.015 0.000367 ***
## Mite.PlantTU Bean:Trial3 -0.012916   0.010730  -1.204 0.238097    
## Mite.PlantTU MM:Trial3    0.011911   0.010730   1.110 0.275793    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01296 on 30 degrees of freedom
## Multiple R-squared:  0.7699, Adjusted R-squared:  0.7315 
## F-statistic: 20.07 on 5 and 30 DF,  p-value: 9.208e-09
Anova(m.Legumain)
## Anova Table (Type II tests)
## 
## Response: Activity
##                     Sum Sq Df F value    Pr(>F)    
## Mite.Plant       0.0080100  2 23.8526 6.311e-07 ***
## Trial            0.0079407  1 47.2923 1.239e-07 ***
## Mite.Plant:Trial 0.0008994  2  2.6783   0.08507 .  
## Residuals        0.0050372 30                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Calculate effect size and display
result.anova<-anova(m.Legumain)
ss<-result.anova$"Sum Sq"  ##ss = sum of squares
pes<-ss/(ss+ss[length(ss)])  ##pes = partial e squared
pes[length(pes)]<-""
result.anova$"Part E Sq"<-pes
result.anova
## Analysis of Variance Table
## 
## Response: Activity
##                  Df    Sum Sq   Mean Sq F value   Pr(>F) Part E Sq
## Mite.Plant        2 0.0080100 0.0040050 23.8526 0.000001   0.61393
## Trial             1 0.0079407 0.0079407 47.2923 0.000000   0.61186
## Mite.Plant:Trial  2 0.0008994 0.0004497  2.6783 0.085065   0.15150
## Residuals        30 0.0050372 0.0001679
# perform post-hoc Tukey-Kramer test of contrasts
TukeyHSD(aov(Activity ~ Mite.Plant + Trial + Mite.Plant:Trial, data = cathepsin.data.Legumain))
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = Activity ~ Mite.Plant + Trial + Mite.Plant:Trial, data = cathepsin.data.Legumain)
## 
## $Mite.Plant
##                        diff         lwr        upr     p adj
## TU Bean-TU-A MM 0.008308298 -0.00473304 0.02134964 0.2737263
## TU MM-TU-A MM   0.034967752  0.02192641 0.04800909 0.0000008
## TU MM-TU Bean   0.026659454  0.01361812 0.03970079 0.0000605
## 
## $Trial
##           diff        lwr        upr p adj
## 3-2 0.03012481 0.02117852 0.03907109 1e-07
## 
## $`Mite.Plant:Trial`
##                            diff          lwr        upr     p adj
## TU Bean:2-TU-A MM:2 0.015842881 -0.009083793 0.04076955 0.4027454
## TU MM:2-TU-A MM:2   0.028019640  0.003092966 0.05294631 0.0205110
## TU-A MM:3-TU-A MM:2 0.030459935  0.007382320 0.05353755 0.0045003
## TU Bean:3-TU-A MM:2 0.033386389  0.010308773 0.05646400 0.0016058
## TU MM:3-TU-A MM:2   0.070390624  0.047313009 0.09346824 0.0000000
## TU MM:2-TU Bean:2   0.012176759 -0.012749915 0.03710343 0.6755106
## TU-A MM:3-TU Bean:2 0.014617055 -0.008460561 0.03769467 0.4065175
## TU Bean:3-TU Bean:2 0.017543508 -0.005534108 0.04062112 0.2206483
## TU MM:3-TU Bean:2   0.054547744  0.031470128 0.07762536 0.0000008
## TU-A MM:3-TU MM:2   0.002440295 -0.020637320 0.02551791 0.9994918
## TU Bean:3-TU MM:2   0.005366749 -0.017710867 0.02844436 0.9795973
## TU MM:3-TU MM:2     0.042370984  0.019293369 0.06544860 0.0000613
## TU Bean:3-TU-A MM:3 0.002926454 -0.018140431 0.02399334 0.9981045
## TU MM:3-TU-A MM:3   0.039930689  0.018863805 0.06099757 0.0000371
## TU MM:3-TU Bean:3   0.037004236  0.015937351 0.05807112 0.0001200
# plot interactions
interaction.plot(cathepsin.data.Legumain$Mite.Plant, cathepsin.data.Legumain$Trial, cathepsin.data.Legumain$Activity, type="l", leg.bty="o", leg.bg="grey95", lwd=2, ylab="Activity", xlab="Mite Strain on Host", main="Mite.Plant:Trial")

Validate models

Cathepsin L

cathepsin.data.CathepsinL$m.CathepsinL.fit <- fitted(m.CathepsinL)    # fitted values
cathepsin.data.CathepsinL$m.CathepsinL.res <- rstandard(m.CathepsinL) # Pearson residuals

Residual distribution / Overdispersion

We assumed normal residuals. This is the least important regression assumption but its can be tested with a qq plot.

ggplot(cathepsin.data.CathepsinL, aes(sample = m.CathepsinL.res)) + geom_qq() +
  geom_abline(intercept = 0, slope = 1) + theme_classic()

Looks good.

Residuals vs fitted values

Testing for:

Linearity - there should be no curvilinear pattern in the residuals.

Equal variance - the vertical spread of the residuals should e constant across all fitted values.

ggplot(cathepsin.data.CathepsinL, aes(x = m.CathepsinL.fit, y = m.CathepsinL.res)) + 
  geom_point() + geom_hline(yintercept = 0) + geom_smooth() + theme_classic()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Linearity - pretty good - confidence interval includes 0 for all values. Residuals farly equally dispersed.

Equal variance - no obvious problems

Residuals vs explanatory variables

Should be centered around 0, if not then model requires another explanatory variable(s), to account for observed variation.

ggplot(cathepsin.data.CathepsinL, aes(x = Mite.Plant, y = m.CathepsinL.res)) + 
  geom_boxplot() + geom_hline(yintercept = 0) + theme_classic()