# This R script is for Core Knowledge Seminar on 03/30/2022 about Biostatistics-ANOVA by Taehoon Ha.

# Load R packages we need today
library(dplyr)
library(readxl)
library(multcomp)
library(ggplot2)
library(lme4)
library(lmerTest)

# For those who do not have packages above, please uncomment the following line and install packages.
# install.packages(c("dplyr", "readxl", "multcomp", "ggplot2", "lme4", "lmerTest"))

# Today's dataset1 - Diet (low fat vs. high fat) and Ovx (shamOVX vs. OVX)
# Place the dataset and this R script file in the same folder and click [Session]-[Set working directory]-[To source file location] on the menu.
dat.work <- read_xlsx("CoreKnowledge.xlsx", sheet = 1)

# Preprocessing Factorize the categorical variable as preprocessing
dat.work$GroupName <- factor(dat.work$GroupName, levels = c("LowFat", "LowFat.OVX", "HighFat", "HighFat.OVX"))


#########################################################
# [1] One-way ANOVA

# 1. One-way ANOVA using cell means model
cellmeans_model <- lm(MouseWt ~ GroupName -1, data = dat.work)
summary(cellmeans_model)


# 2. As anova p-value is significant, we should move on to post-hoc analysis to identify which group mean(s) are significant.
anova(cellmeans_model)


# 3. Post-hoc analysis
aov.cellmeans <- aov(MouseWt ~ GroupName -1, data = dat.work)


# 3-1. Example 1: Tukey's method - compare all possible pairs
summary(glht(aov.cellmeans, linfct=mcp(GroupName="Tukey")), test = adjusted("none"))

# 3-2. Example 2: Dunnett's method - compare control to each treatment group. 
# Here, LowFat group is the reference, so compare LowFat to each of other groups.
summary(glht(aov.cellmeans, linfct=mcp(GroupName="Dunnett")), test = adjusted("none"))

# 3-3. Example 3: Contrast Matrix K
# - 1st column: LowFat.shamOVX group
# - 2nd column: LowFat.OVX group
# - 3rd column: HighFat.shamOVX group
# - 4th column: HighFat.OVX group
K <- rbind("OVX effect in LF"     = c(-1, 1, 0, 0),
           "OVX effect in HF"     = c(0, 0, -1, 1),
           "HF effect in shamOVX" = c(-1, 0, 1, 0),
           "HF effect in OVX"     = c(0, -1, 0, 1),
           "OVX effect"           = c(-1, 1, -1, 1),
           "HF effect"            = c(-1, -1, 1, 1), 
           "OVX HF Interation"    = c(1, -1, -1, 1))
summary(glht(aov.cellmeans, linfct=mcp(GroupName=K)), test = adjusted(type="none"))


# 4. Visualizing the group differences using boxplot
boxplot(MouseWt ~ GroupName, data = dat.work,
        xlab = "Group", ylab = "Mouse Weight",
        main = "Average Mouse Weight by Diet and OVX")


# 5. To see if we can improve our model - using weighted linear regression could improve the model fitting: inverse variance weighting
wts <- 1/rep(tapply(dat.work$MouseWt, dat.work$GroupName, var), each = 10)
cellmeans_model <- lm(MouseWt ~ GroupName - 1, data = dat.work, weights = wts)


# 6. Model diagnostics
# 6-1. assumption check (1) - Normality
qqnorm(rstudent(cellmeans_model))
qqline(rstudent(cellmeans_model))

# 6-2. assumption check (2) - Homogeneity of variances
plot(1:nrow(dat.work), rstudent(cellmeans_model), pch = 3,
     xlab="Index", ylab="Studentized Residual")

# > Both normality and homogeneity of variances assumptions were met!

#########################################################

# [2] Two-way ANOVA

# 1. Preprocessing the dataset
dat.work$Diet <- factor(dat.work$Diet, levels = c("LowFat", "HighFat"))
dat.work$OVX <- factor(dat.work$OVX, levels = c("shamOVX", "OVX"))

# 2. Two-way ANOVA using effects model
effects_model <- lm(MouseWt ~ Diet * OVX, data = dat.work)
summary(effects_model)

# 3. According to the ANOVA result, both Diet, Ovx, and the interaction of Diet and Ovx are all significant.
anova(effects_model)

# 4. Post-hoc analysis
# - 1st column: LowFat group mean (reference group)
# - 2nd column: Difference in mean between LowFat and HighFat
# - 3rd column: Difference in mean between LowFat.OVX and LowFat
# - 4th column: (HF.OVX - HF) - (LF.OVX - LF)
# > As cell means model and effects model are different in desinging contrast matrix, we need to precisely know what it means.
K2 <- rbind("LF.OVX - LF"               = c(0, 0, 1, 0),
            "HF.OVX - HF"               = c(0, 0, 1, 1),
            "HF - LF"                   = c(0, 1, 0, 0),
            "HF.OVX - LF.OVX"           = c(0, 1, 0, 1),
            "LF.OVX + HF.OVX - LF - HF" = c(0, 0, 2, 1),
            "HF + HF.OVX - LF - LF.OVX" = c(0, 2, 0, 1), 
            "HF.OVX - HF - LF.OVX + LF" = c(0, 0, 0, 1))
summary(glht(effects_model, linfct=K2), test=adjusted(type="none"))


# 5. Visualizing the interaction using line graph
# > Depending on the diet type and OVX, the slopes are different with each other. This is because Diet is significantly associated with Ovx.
interaction.plot(dat.work$Diet, dat.work$OVX, dat.work$MouseWt, 
                 xlab="Diet Type", ylab="Average Mouse Weight",
                 legend=F, lty=2:1)
legend("topleft", legend=levels(dat.work$OVX), lty=2:1, bty="n")

#########################################################

# [3] Partial F-test
# - Goal: To compare a full (complex) model to a reduced (simpler) model.
# - Caution: A full (complex) model should include all the predictors that a reduced (simpler) model has. ["nested model"]
# - To compare between non-nested models, use AIC, BIC, or Vuong's test.

reduced_model <- lm(MouseWt ~ Diet, data = dat.work)
full_model <- lm(MouseWt ~ Diet + OVX + Diet * OVX, data = dat.work)
anova(reduced_model, full_model)
# > As p<0.05, there is enough evidence that the full (complex) model is better than the reduced (simpler) model.

#########################################################

# [4] Mixed-effects model
# - Limitation of repeated measures ANOVA: if there is one sinlge NA value in the dataset, that sample is not available.
# - Solution: imputation.
# - Alternatively, we can try mixed effects model instead.

# 1. Today's dataset2 - longitudinal blood pressure between drug types
blood_data <- read_xlsx("CoreKnowledge.xlsx", sheet = 2) %>%
    mutate(id = factor(id),
           trt = factor(trt))


# 2. First 12 patients' bp change over time
# - Current issue: blood pressure change pattern differs from patient to patient.
# - Can we better measure the treatment effect and minimize the patients' effect (variations)?
ggplot(blood_data %>% filter(id %in% 1:12), aes(x=time, y=bp)) +
    geom_point(size=0.5) +
    stat_smooth(method = "lm",se=F,size=0.5)+
    facet_wrap("id", labeller = label_both)+ 
    theme_bw() +
    ggtitle("First 12 Patients' BP") +
    theme(plot.title = element_text(hjust = 0.5, size = 15, face = "bold"))

# 3. Interaction plot between Drug type and Time: Drug is associated with Time.
blood_data$time <- factor(blood_data$time)
with(blood_data, interaction.plot(x.factor = time, trace.factor = trt, response = d,
                                  xlab = "Day", ylab = "Difference of BP", 
                                  main = "Interaction plot"))

# 4. Mixed effects modeling
blood_fit <- lmer(d ~ trt + time + trt * time + (1|id), data = blood_data)
summary(blood_fit)
# > Treatment effect was significant. However, control drug decreased more blood pressure.
# > Time effect was not significant.
# > Interaction between treatment and time was not significant. 
# > Overall, we can conclude that not enough evidence was shown that the new (treatment) drug significantly decreased the blood pressure than the control (current) drug.