knitr::opts_chunk$set( collapse = TRUE, comment = "#>", out.width = "100%", fig.asp = 0.7, fig.width = 12, fig.align = "center" )
library(RMASCA)
Let us start out by creating some dummy data for us to work with. In this example we have 600 participants measured four different times. The participants belonged to three different groups -- for example control group, chocolate diet and salad diet -- and we measured ten variables each time.
nPart <- 600 nGroups <- c("Controls", "Chocolate", "Salad") nTime <- c(1, 2, 3, 4) variables <- c("BMI", "Glucose", "VLDL", "LDL", "HDL", "ferritin", "CRP", "Happiness", "Anger", "Age")
The RMASCA()
function expects a data frame in long format that contains at least one column for time (called time
), one for group (called group
) and one for variables (called variable
).
df <- data.frame( partid = c(1:nPart), group = nGroups[sample(c(1:length(unique(nGroups))), nPart, replace = TRUE)] ) df$time <- nTime[1] df_temp_temp <- df for(i in nTime[2:length(nTime)]){ df_temp <- df_temp_temp df_temp$time <- i df <- rbind(df, df_temp) } for(i in 1:length(variables)){ df[,3+i] <- rnorm(nrow(df), mean = 10, sd = 3) } colnames(df) <- c("partid", "group", "time", variables)
Random data aren't very interesting, so let us add some trends.
The age and HDL, however, increases for all groups, and the other variables are unaffected and vary at random.
df$Anger[df$group == "Chocolate" & df$time == 2] <- df$Anger[df$group == "Chocolate" & df$time == 1]/2 df$Anger[df$group == "Chocolate" & df$time == 3] <- df$Anger[df$group == "Chocolate" & df$time == 1]/1.5 df$Happiness[df$group == "Chocolate" & df$time == 2] <- df$Happiness[df$group == "Chocolate" & df$time == 1]*2 df$Happiness[df$group == "Chocolate" & df$time == 3] <- df$Happiness[df$group == "Chocolate" & df$time == 1]*1.5 df$BMI[df$group == "Chocolate"] <- df$BMI[df$group == "Chocolate"]*df$time[df$group == "Chocolate"] df$Glucose[df$group == "Chocolate"] <- df$Glucose[df$group == "Chocolate"]*df$time[df$group == "Chocolate"] df$Anger[df$group == "Salad" & df$time == 2] <- df$Anger[df$group == "Salad" & df$time == 1]*2 df$Anger[df$group == "Salad" & df$time == 3] <- df$Anger[df$group == "Salad" & df$time == 1]*1.5 df$Happiness[df$group == "Salad" & df$time == 2] <- df$Happiness[df$group == "Salad" & df$time == 1]/2 df$Happiness[df$group == "Salad" & df$time == 3] <- df$Happiness[df$group == "Salad" & df$time == 1]/1.5 df$BMI[df$group == "Salad"] <- df$BMI[df$group == "Salad"]/df$time[df$group == "Salad"] df$Glucose[df$group == "Salad"] <- df$Glucose[df$group == "Salad"]/df$time[df$group == "Salad"] df$Age <- df$Age*df$time df$HDL <- df$HDL*df$time
And we need to define the control group and transform it to the long format. And since they are "real" people, we will give them individual baselines so that we need to use linear mixed models with participant as random intercepts.
df$group <- factor(df$group) df$group <- relevel(df$group, ref = "Controls") df <- reshape2::melt(df, id.vars = c("partid", "group", "time")) for(i in unique(df$variables)){ for(j in unique(df$partid)){ df$value[df$variables == i & df$partid == j] <- df$value[df$variables == i & df$partid == j] + sample(0:5,1) } }
Let us see how it looks
ggplot2::ggplot(data = df, ggplot2::aes(x = factor(time), y = value, color = group)) + ggplot2::geom_boxplot() + ggplot2::facet_wrap(~variable, scales = "free_y") + ggplot2::theme(legend.position = "bottom") + ggplot2::labs(x = "Time", y = "Value", color = "Group")
And we can have a look at some random participants to see typical trajectories
ggplot2::ggplot(data = subset(df, partid %in% sample(unique(partid), 5)), ggplot2::aes(x = factor(time), y = value, group = partid, color = group)) + ggplot2::geom_point() + ggplot2::geom_line() + ggplot2::facet_wrap(~variable, scales = "free_y") + ggplot2::theme(legend.position = "bottom") + ggplot2::labs(x = "Time", y = "Value", color = "Group")
Now that we have a data set, let us try the RMASCA()
function! First we need to decide what kind of model we want. For simplicity, let us start with a very simple model with terms for time, group and interaction.
model.formula <- value ~ time*group + (1|partid) res.simple <- RMASCA(df = df, formula = model.formula)
To visualize the result, simply use plot(res.simple)
;
plot(res.simple)
Let us interpret the figure from top left:
However, that is only the first component. In this case, that variable alone explains most of the variation in the data (>99%) NB: Må sjekke om forklart varians har blitt riktig. Let us plot the second component too:
plot(res.simple, component = "PC2")
The interpretation is similar to the one above. We are most interested in the bottom panels: the chocolate group has first an increase at time 2, but then decrease again. To the right, we can see that happiness follows that trajectory, whereas anger has the inverted trajectory (first decreased anger, then return to baseline). The salad group has the opposite trajectories. Cool, huh?
So, how many components do we need? With PCA, the first component is always the one explaining most variance. We can look at how much the various components explain,
screeplot(res.simple)
And it seems that we should use PC1 for the time effect, but probably both PC1 and PC2 for the group effect. Which makes sense, since we added several distinct group trends to our data that needs different trajectories to be explained.
Often, we want some more control over the plots. Luckily, we can easily get the necessary data using getLoadings()
and getScores()
. For example
scores <- getScores(res.simple) loadings <- getLoadings(res.simple) gst_1 <- ggplot2::ggplot(scores$time, ggplot2::aes(x = time, y = PC1, group = NA)) + ggplot2::geom_point() + ggplot2::geom_line() + ggplot2::theme_bw() + ggplot2::labs(x = "Time", y = paste0("Pr Comp. 1 (",round(100*scores$explained$time[1], 2),"%)")) gst_2 <- ggplot2::ggplot(scores$time, ggplot2::aes(x = time, y = PC2, group = NA)) + ggplot2::geom_point() + ggplot2::geom_line() + ggplot2::theme_bw() + ggplot2::labs(x = "Time", y = paste0("Pr Comp. 2 (",round(100*scores$explained$time[2], 2),"%)")) gsg_1 <- ggplot2::ggplot(scores$group, ggplot2::aes(x = time, y = PC1, group = group, color = group)) + ggplot2::geom_point() + ggplot2::geom_line() + ggplot2::theme_bw() + ggplot2::labs(x = "Time", y = paste0("Pr Comp. 1 (",round(100*scores$explained$group[1], 2),"%)")) gsg_2 <- ggplot2::ggplot(scores$group, ggplot2::aes(x = time, y = PC2, group = group, color = group)) + ggplot2::geom_point() + ggplot2::geom_line() + ggplot2::theme_bw() + ggplot2::labs(x = "Time", y = paste0("Pr Comp. 2 (",round(100*scores$explained$group[2], 2),"%)")) glt_1 <- ggplot2::ggplot(loadings$time, ggplot2::aes(x = covars, y = PC1)) + ggplot2::geom_point() + ggplot2::labs(x = "Variable", y = paste0("Pr Comp. 1 (",round(100*loadings$explained$time[1], 2),"%)")) + ggplot2::theme_bw() + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90, vjust = 0.4, hjust=1)) glt_2 <- ggplot2::ggplot(loadings$time, ggplot2::aes(x = covars, y = PC2)) + ggplot2::geom_point() + ggplot2::labs(x = "Variable", y = paste0("Pr Comp. 2 (",round(100*loadings$explained$time[2], 2),"%)")) + ggplot2::theme_bw() + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90, vjust = 0.4, hjust=1)) glg_1 <- ggplot2::ggplot(loadings$group, ggplot2::aes(x = covars, y = PC1)) + ggplot2::geom_point() + ggplot2::labs(x = "Variable", y = paste0("Pr Comp. 1 (",round(100*loadings$explained$group[1], 2),"%)")) + ggplot2::theme_bw() + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90, vjust = 0.4, hjust=1)) glg_2 <- ggplot2::ggplot(loadings$group, ggplot2::aes(x = covars, y = PC2)) + ggplot2::geom_point() + ggplot2::labs(x = "Variable", y = paste0("Pr. Comp. 2 (",round(100*loadings$explained$group[2], 2),"%)")) + ggplot2::theme_bw() + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90, vjust = 0.4, hjust=1)) ggpubr::ggarrange( ggpubr::ggarrange(gst_1, glt_1, gsg_1, glg_1, nrow = 1, widths = c(2,3,2,3), common.legend = TRUE, legend = "none"), ggpubr::ggarrange(gst_2, glt_2, gsg_2, glg_2, nrow = 1, widths = c(2,3,2,3), common.legend = TRUE, legend = "bottom"), nrow = 2 )
We often need to now how robust our results are. In this package, we use leave-one-out jack-knifing where we divide our participants into nValFold
groups and leave out one of the groups. Note that the group proportions are kept relatively stable. We repeat this process nValRuns
times and use the 2.5 and 97.5 percentiles as our uncertainty estimate. If you know that you are going to do this, you can set validate = TRUE
when you run the RMASCA model the first time. This usually takes some time. Also note that we need to specify which column that contains participant id with the argument partid
. We can also call validation after having initialized the RMASCA object:
res.simple$nValRuns <- 50 # the more, the better res.simple <- validate(res.simple, participantColumn = "partid") plot(res.simple) plot(res.simple, component = "PC2")
If our experiment was an intervention, then we can assume that the groups were equal at baseline (time 1). Then we can skip the main group effect and only use time and interaction, like this:
model.formula <- value ~ time + time:group + (1|partid) res.mod2 <- RMASCA(df = df, formula = model.formula) plot(res.mod2) plot(res.mod2, component = "PC2")
As you can see, it is quite similar to the previous model, but all the groups have the same value for time 1.
Sometimes it doesn't make sense to use a reference group -- for example if we didn't have any controls and the first group had a salt-free diet. We can then use a single PCA instead of separating time and group effects:
model.formula <- value ~ time + time:group + (1|partid) res.mod3 <- RMASCA(df = df, formula = model.formula, separateTimeAndGroup = FALSE) plot(res.mod3) plot(res.mod3, component = "PC2")
This time we want to adjust for age and not include it in the ASCA itself;
age <- subset(df, variable == "Age")$value df <- subset(df, variable != "Age") df$age <- age model.formula <- value ~ time + time:group + age + (1|partid) res.mod4 <- RMASCA(df = df, formula = model.formula) plot(res.mod4) plot(res.mod4, component = "PC2")
We can see that the linear increase in age is no longer part of the output.
RM-ASCA+ is built on Linear Mixed Models, and one should check that the residuals approach normal distribution. With RMASCA
, the LMM objects can be found as
summary(res.simple$lmer.models[[1]])
We can also assess the residuals,
plot(density(residuals(res.simple$lmer.models[[1]])), main = unique(df$variable)[1]) qqnorm(residuals(res.simple$lmer.models[[1]]), main = unique(df$variable)[1]) qqline(residuals(res.simple$lmer.models[[1]]))
As you saw above, we can use the summary
function to view the p values from your LMMs, but a shortcut is
head(res.simple$LMM.coefficient)
Note that these p values are corrected for multiple testing, whereas those in the summary
function are not.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.