knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  out.width = "100%", 
  fig.asp = 0.7,
  fig.width = 12,
  fig.align = "center"
)
library(RMASCA)

Introduction to RM-ASCA+

Creating a dummy data set

Let us start out by creating some dummy data for us to work with. In this example we have 600 participants measured four different times. The participants belonged to three different groups -- for example control group, chocolate diet and salad diet -- and we measured ten variables each time.

nPart <- 600
nGroups <- c("Controls", "Chocolate", "Salad")
nTime <- c(1, 2, 3, 4)
variables <- c("BMI", "Glucose", "VLDL", "LDL", "HDL", "ferritin", "CRP", "Happiness", "Anger", "Age")

The RMASCA() function expects a data frame in long format that contains at least one column for time (called time), one for group (called group) and one for variables (called variable).

df <- data.frame(
  partid = c(1:nPart),
  group = nGroups[sample(c(1:length(unique(nGroups))), nPart, replace = TRUE)]
)
df$time <- nTime[1]
df_temp_temp <- df
for(i in nTime[2:length(nTime)]){
  df_temp <- df_temp_temp
  df_temp$time <- i
  df <- rbind(df, df_temp)
}
for(i in 1:length(variables)){
  df[,3+i] <- rnorm(nrow(df), mean = 10, sd = 3)
}
colnames(df) <- c("partid", "group", "time", variables)

Random data aren't very interesting, so let us add some trends.

The age and HDL, however, increases for all groups, and the other variables are unaffected and vary at random.

df$Anger[df$group == "Chocolate" & df$time == 2] <- df$Anger[df$group == "Chocolate" & df$time == 1]/2
df$Anger[df$group == "Chocolate" & df$time == 3] <- df$Anger[df$group == "Chocolate" & df$time == 1]/1.5
df$Happiness[df$group == "Chocolate" & df$time == 2] <- df$Happiness[df$group == "Chocolate" & df$time == 1]*2
df$Happiness[df$group == "Chocolate" & df$time == 3] <- df$Happiness[df$group == "Chocolate" & df$time == 1]*1.5
df$BMI[df$group == "Chocolate"] <- df$BMI[df$group == "Chocolate"]*df$time[df$group == "Chocolate"]
df$Glucose[df$group == "Chocolate"] <- df$Glucose[df$group == "Chocolate"]*df$time[df$group == "Chocolate"]

df$Anger[df$group == "Salad" & df$time == 2] <- df$Anger[df$group == "Salad" & df$time == 1]*2
df$Anger[df$group == "Salad" & df$time == 3] <- df$Anger[df$group == "Salad" & df$time == 1]*1.5
df$Happiness[df$group == "Salad" & df$time == 2] <- df$Happiness[df$group == "Salad" & df$time == 1]/2
df$Happiness[df$group == "Salad" & df$time == 3] <- df$Happiness[df$group == "Salad" & df$time == 1]/1.5
df$BMI[df$group == "Salad"] <- df$BMI[df$group == "Salad"]/df$time[df$group == "Salad"]
df$Glucose[df$group == "Salad"] <- df$Glucose[df$group == "Salad"]/df$time[df$group == "Salad"]

df$Age <- df$Age*df$time
df$HDL <- df$HDL*df$time

And we need to define the control group and transform it to the long format. And since they are "real" people, we will give them individual baselines so that we need to use linear mixed models with participant as random intercepts.

df$group <- factor(df$group)
df$group <- relevel(df$group, ref = "Controls")
df <- reshape2::melt(df, id.vars = c("partid", "group", "time"))
for(i in unique(df$variables)){
  for(j in unique(df$partid)){
    df$value[df$variables == i & df$partid == j] <- df$value[df$variables == i & df$partid == j] + sample(0:5,1)
  }
}

Let us see how it looks

ggplot2::ggplot(data = df,
                ggplot2::aes(x = factor(time), y = value, color = group)) +
  ggplot2::geom_boxplot() +
  ggplot2::facet_wrap(~variable, scales = "free_y") +
  ggplot2::theme(legend.position = "bottom") +
  ggplot2::labs(x = "Time", y = "Value", color = "Group")

And we can have a look at some random participants to see typical trajectories

ggplot2::ggplot(data = subset(df, partid %in% sample(unique(partid), 5)),
     ggplot2::aes(x = factor(time), y = value, group = partid, color = group)) +
     ggplot2::geom_point() + ggplot2::geom_line() +
     ggplot2::facet_wrap(~variable, scales = "free_y") +
     ggplot2::theme(legend.position = "bottom") +
     ggplot2::labs(x = "Time", y = "Value", color = "Group")

Using RM-ASCA+

Now that we have a data set, let us try the RMASCA() function! First we need to decide what kind of model we want. For simplicity, let us start with a very simple model with terms for time, group and interaction.

model.formula <- value ~ time*group + (1|partid)
res.simple <- RMASCA(df = df, formula = model.formula)

To visualize the result, simply use plot(res.simple);

plot(res.simple)

Let us interpret the figure from top left:

However, that is only the first component. In this case, that variable alone explains most of the variation in the data (>99%) NB: Må sjekke om forklart varians har blitt riktig. Let us plot the second component too:

plot(res.simple, component = "PC2")

The interpretation is similar to the one above. We are most interested in the bottom panels: the chocolate group has first an increase at time 2, but then decrease again. To the right, we can see that happiness follows that trajectory, whereas anger has the inverted trajectory (first decreased anger, then return to baseline). The salad group has the opposite trajectories. Cool, huh?

Scree plots

So, how many components do we need? With PCA, the first component is always the one explaining most variance. We can look at how much the various components explain,

screeplot(res.simple)

And it seems that we should use PC1 for the time effect, but probably both PC1 and PC2 for the group effect. Which makes sense, since we added several distinct group trends to our data that needs different trajectories to be explained.

Customized plots

Often, we want some more control over the plots. Luckily, we can easily get the necessary data using getLoadings() and getScores(). For example

scores <- getScores(res.simple)
loadings <- getLoadings(res.simple)

gst_1 <- ggplot2::ggplot(scores$time, ggplot2::aes(x = time, y = PC1, group = NA)) + 
  ggplot2::geom_point() + 
  ggplot2::geom_line() + 
  ggplot2::theme_bw() +
  ggplot2::labs(x = "Time", y = paste0("Pr Comp. 1 (",round(100*scores$explained$time[1], 2),"%)"))

gst_2 <- ggplot2::ggplot(scores$time, ggplot2::aes(x = time, y = PC2, group = NA)) + 
  ggplot2::geom_point() + 
  ggplot2::geom_line() + 
  ggplot2::theme_bw() +
  ggplot2::labs(x = "Time", y = paste0("Pr Comp. 2 (",round(100*scores$explained$time[2], 2),"%)"))

gsg_1 <- ggplot2::ggplot(scores$group, ggplot2::aes(x = time, y = PC1, group = group, color = group)) + 
  ggplot2::geom_point() + 
  ggplot2::geom_line() + 
  ggplot2::theme_bw() +
  ggplot2::labs(x = "Time", y = paste0("Pr Comp. 1 (",round(100*scores$explained$group[1], 2),"%)"))

gsg_2 <- ggplot2::ggplot(scores$group, ggplot2::aes(x = time, y = PC2, group = group, color = group)) + 
  ggplot2::geom_point() + 
  ggplot2::geom_line() + 
  ggplot2::theme_bw() +
  ggplot2::labs(x = "Time", y = paste0("Pr Comp. 2 (",round(100*scores$explained$group[2], 2),"%)"))

glt_1 <- ggplot2::ggplot(loadings$time, ggplot2::aes(x = covars, y = PC1)) + 
  ggplot2::geom_point() + 
  ggplot2::labs(x = "Variable", y = paste0("Pr Comp. 1 (",round(100*loadings$explained$time[1], 2),"%)")) +
  ggplot2::theme_bw() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90, vjust = 0.4, hjust=1))

glt_2 <- ggplot2::ggplot(loadings$time, ggplot2::aes(x = covars, y = PC2)) + 
  ggplot2::geom_point() + 
  ggplot2::labs(x = "Variable", y = paste0("Pr Comp. 2 (",round(100*loadings$explained$time[2], 2),"%)")) +
  ggplot2::theme_bw() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90, vjust = 0.4, hjust=1))

glg_1 <- ggplot2::ggplot(loadings$group, ggplot2::aes(x = covars, y = PC1)) + 
  ggplot2::geom_point() + 
  ggplot2::labs(x = "Variable", y = paste0("Pr Comp. 1 (",round(100*loadings$explained$group[1], 2),"%)")) +
  ggplot2::theme_bw() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90, vjust = 0.4, hjust=1))

glg_2 <- ggplot2::ggplot(loadings$group, ggplot2::aes(x = covars, y = PC2)) + 
  ggplot2::geom_point() + 
  ggplot2::labs(x = "Variable", y = paste0("Pr. Comp. 2 (",round(100*loadings$explained$group[2], 2),"%)")) +
  ggplot2::theme_bw() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90, vjust = 0.4, hjust=1))

ggpubr::ggarrange(
  ggpubr::ggarrange(gst_1, glt_1, gsg_1, glg_1, nrow = 1, widths = c(2,3,2,3), common.legend = TRUE, legend = "none"),
  ggpubr::ggarrange(gst_2, glt_2, gsg_2, glg_2, nrow = 1, widths = c(2,3,2,3), common.legend = TRUE, legend = "bottom"),
  nrow = 2
)

Robustness testing

We often need to now how robust our results are. In this package, we use leave-one-out jack-knifing where we divide our participants into nValFold groups and leave out one of the groups. Note that the group proportions are kept relatively stable. We repeat this process nValRuns times and use the 2.5 and 97.5 percentiles as our uncertainty estimate. If you know that you are going to do this, you can set validate = TRUE when you run the RMASCA model the first time. This usually takes some time. Also note that we need to specify which column that contains participant id with the argument partid. We can also call validation after having initialized the RMASCA object:

res.simple$nValRuns <- 50 # the more, the better

res.simple <- validate(res.simple, participantColumn = "partid")

plot(res.simple)

plot(res.simple, component = "PC2")

Alternative models

Interaction only

If our experiment was an intervention, then we can assume that the groups were equal at baseline (time 1). Then we can skip the main group effect and only use time and interaction, like this:

model.formula <- value ~ time + time:group + (1|partid)
res.mod2 <- RMASCA(df = df, formula = model.formula)
plot(res.mod2)
plot(res.mod2, component = "PC2")

As you can see, it is quite similar to the previous model, but all the groups have the same value for time 1.

Interaction only with combined time and group effect

Sometimes it doesn't make sense to use a reference group -- for example if we didn't have any controls and the first group had a salt-free diet. We can then use a single PCA instead of separating time and group effects:

model.formula <- value ~ time + time:group + (1|partid)
res.mod3 <- RMASCA(df = df, formula = model.formula, separateTimeAndGroup = FALSE)
plot(res.mod3)
plot(res.mod3, component = "PC2")

Covariates

This time we want to adjust for age and not include it in the ASCA itself;

age <- subset(df, variable == "Age")$value
df <- subset(df, variable != "Age")
df$age <- age
model.formula <- value ~ time + time:group + age + (1|partid)
res.mod4 <- RMASCA(df = df, formula = model.formula)
plot(res.mod4)
plot(res.mod4, component = "PC2")

We can see that the linear increase in age is no longer part of the output.

Diagnostics

Normal distribution of residuals

RM-ASCA+ is built on Linear Mixed Models, and one should check that the residuals approach normal distribution. With RMASCA, the LMM objects can be found as

summary(res.simple$lmer.models[[1]])

We can also assess the residuals,

plot(density(residuals(res.simple$lmer.models[[1]])), main = unique(df$variable)[1])

qqnorm(residuals(res.simple$lmer.models[[1]]), main = unique(df$variable)[1]) 
qqline(residuals(res.simple$lmer.models[[1]]))

P values

As you saw above, we can use the summary function to view the p values from your LMMs, but a shortcut is

head(res.simple$LMM.coefficient)

Note that these p values are corrected for multiple testing, whereas those in the summary function are not.



andjar/ALASCA documentation built on March 2, 2024, 12:55 p.m.