knitr::opts_chunk$set(echo = TRUE)
library(metaboprep) library(tidyverse) suppressPackageStartupMessages(library(dendextend)) ########################### # Read in R object with Rdata ## file passed as a paramater ########################### ## necessary arguments ## 1) not QC'd metabolite ## 2) sample data ## 3) feature data ## 4) DF'd metabolite data ## 5) data/out directory path ## 6) project ## 7) platform load(params$Rdatafile) out_dir = params$out_dir
## RAW DATA raw_data$metabolite_data = tibble::as_tibble(raw_data$metabolite_data) raw_data$sample_data = tibble::as_tibble(raw_data$sample_data); colnames(raw_data$sample_data)[1] = "SamID" raw_data$feature_data = tibble::as_tibble(raw_data$feature_data); colnames(raw_data$feature_data)[1] = "FeatureID" raw_data$varexp = raw_data$varexp[,1] ## QC Data qc_data$metabolite_data = tibble::as_tibble(qc_data$metabolite_data) qc_data$sample_data = tibble::as_tibble(qc_data$sample_data); colnames(qc_data$sample_data)[1] = "SamID" qc_data$feature_data = tibble::as_tibble(qc_data$feature_data); colnames(qc_data$feature_data)[1] = "FeatureID" qc_data$varexp = qc_data$varexp[,1]
metaboprep report relates to:
r project
r platform
The metaboprep
R
package performs three operations:
This report provides descriptive information for raw and filtered metabolomics data for the project r project
.
The data filtering workflow is as follows:
f = paste0( .libPaths(), "/metaboprep/help/figures/metaboprep_workflow.png" ) knitr::include_graphics( f )
metaboprep
pipeline can be directed to David Hughes: d.a.hughes@bristol.ac.uk.metaboprep
is published in Journal to be determined and can be cited as:r project
data settout = data.frame("data.set" = c("number of samples","number of features"), "raw.data" = dim(raw_data$metabolite_data), "filtered.data" = dim(qc_data$metabolite_data)) ggpubr::ggtexttable(tout, rows = NULL, theme = ggpubr::ttheme("mBlue") )
Missingness is evaluated across samples and features using the original/raw data set.
namat = qc_data$metabolite_data namat[!is.na(namat)] = 1 namat[is.na(namat)] = 0 namat = as.matrix(namat) namat = reshape::melt(namat) colnames(namat) = c("individuals","metabolites","missingness") pcol = RColorBrewer::brewer.pal(n = 8, name = 'Dark2') ggplot(namat, aes(metabolites, individuals, fill = missingness)) + geom_tile() + scale_fill_gradient(low= "white", high=pcol[5]) + theme(legend.position = "none", axis.text.x = element_blank() )
\begin{center} \textbf{Figure Legend:} Missingness structure across the raw data table. White cells depict missing data. Individuals are in rows, metabolites are in columns. \end{center}
r_mis = missingness.sum(mydata = raw_data$metabolite_data) p = ggpubr::ggarrange( r_mis[[4]][[1]] , r_mis[[4]][[2]] , r_mis[[4]][[3]] , r_mis[[4]][[4]] , ncol = 2, nrow = 2, labels = c("a", "b", "c", "d") ) ggpubr::annotate_figure(p,top = paste0("-- Initial raw data set: Estimates of missingness for samples and features --\n") )
\begin{center} \textbf{Figure Legend:} Raw data - (a) Distribution of sample missingness with sample mean illustrated by the red vertical line. (b) Distribution of feature missingness sample mean illustrated by the red vertical line. (c) Table of sample and feature missingness percentiles. A tabled version of plot a and b. (d) Estimates of study samples sizes under various levels of feature missingness. \end{center}
temp = data.frame(exclusions = rownames(qcing_data$exclusion_data), count = qcing_data$exclusion_data[,1]) ggpubr::ggtexttable( temp, rows = NULL, theme = ggpubr::ttheme("mBlue"))
\begin{center} \textbf{Table Legend}: Six primary data filtering exclusion steps were made during the preparation of the data. (1) Samples with missingness >=80\% were excluded. (2) features with missingness >=80\% were excluded (xenobiotics are not included in this step). (3) sample exclusions based on the user defined threshold were excluded. (4) feature exclusions based on user defined threshold were excluded (xenobiotics are not included in this step). (5) samples with a total-peak-area or total-sum-abundance that is >= N standard deviations from the mean, where N was defined by the user, were excluded. (6) samples that are >= N standard deviations from the mean on principal component axis 1 and 2, where N was defined by the user, were excluded. \end{center}
A data reduction was carried out to identify a list of representative features for generating a sample principal component analysis. This step reduces the level of inter-correlation in the data to ensure that the principal components are not driven by groups of correlated features.
################# ## data reduction ## info ################# feature_count = length(qcing_data$feature_sumstats$k) features_included_in_data_reduction = length( na.omit(qcing_data$feature_sumstats$k) ) cluster_count = length( unique(qcing_data$feature_sumstats$k) ) number_of_rep_meatbolites = sum( qcing_data$feature_sumstats$independent_features_binary == 1 ) temp = data.frame( data.reduction = c("total metabolite count","metabolites included in data reduction","number of metabolite clusters","number of representative metabolites"), count = c(feature_count, features_included_in_data_reduction, cluster_count, number_of_rep_meatbolites ) ) ptable = ggpubr::ggtexttable( temp, rows = NULL, theme = ggpubr::ttheme("mBlue")) ################# ## PCA ################# ## define tibble pcs = as_tibble(qcing_data$pcs) ## define color scheme pcol = RColorBrewer::brewer.pal(9, "Set1") ## define accelerationfactor accelerationfactor = qcing_data$accelerationfactor if(accelerationfactor > 10){accelerationfactor = 10} if(accelerationfactor == 1){accelerationfactor = 2} ## define outliers Omat = outlier.matrix(pcs[, 1:accelerationfactor], nsd = PC_outlier_SD) outliers = apply(Omat, 1, sum) outliers[outliers>0]=1 pcs$outliers = as.factor( outliers ) ## cutoffs = apply(pcs[, 1:accelerationfactor], 2, function(x){ msd = c(mean(x, na.rm = TRUE), sd(x, na.rm = TRUE)) cutoff = msd[1] + (msd[2]*PC_outlier_SD) return(cutoff) } ) ## pcplot = pcs %>% ggplot( aes(x = PC1, y = PC2) ) + geom_point( aes(fill = outliers), size = 2.5, shape = 21 ) + scale_fill_manual(values = pcol[c(2,1)]) + geom_vline(xintercept = c(cutoffs[1], -cutoffs[1]), color = pcol[1] ) + geom_hline(yintercept = c(cutoffs[2], -cutoffs[2]), color = pcol[1] ) + theme(legend.position="bottom") + labs(title = paste0("Principal components 1-&-2 using ",number_of_rep_meatbolites ," representative metabolites"), subtitle = paste0(" - Outliers are those ", PC_outlier_SD , " SD from the mean of PCs 1 to", accelerationfactor )) ## gridExtra::grid.arrange( grobs = list(ptable, pcplot), widths = c(2, 3), ncol = 2)
\begin{center} \textbf{Figure Legend:} The data reduction table on the left presents the number of metabolites at each phase of the data reduction (Spearman's correlation distance tree cutting) analysis. On the right principal components 1 and 2 are plotted for all individuals, using the representitve features identified in the data reduction analysis. The red vertical and horizontal lines indicate the standard deviation (SD) cutoffs for identifying individual outliers, which are plotted in red. The standard deviations cuttoff were defined by the user. \end{center}
r nrow(qc_data$metabolite_data)
r ncol(qc_data$metabolite_data)
r nrow(raw_data$metabolite_data) - nrow(qc_data$metabolite_data)
samples were filtered out, given the user's criteria.r ncol(raw_data$metabolite_data) - ncol(qc_data$metabolite_data)
features were filtered out, given the user's criteria.pcol = RColorBrewer::brewer.pal(9, "Set1") ############## ## Missingness ############## qc_mis = missingness.sum(mydata = qc_data$metabolite_data) s_mis_plot = qc_mis[[4]][[1]] f_mis_plot = qc_mis[[4]][[2]] ############## ## TPA ############## s = tibble::tibble(TPA_completefeature = qc_data$sample_data$TPA_completefeature) tpa_plot = qc_data$sample_data %>% ggplot( aes( TPA_completefeature ) ) + geom_histogram( fill = pcol[2] , bins = 25) + geom_vline( xintercept = median(qc_data$sample_data$TPA_completefeature), color = pcol[1], size = 1) + labs(title = paste0("total abundance of samples\nat complete features only"), x="", y="") + theme_bw() + theme(plot.title = element_text(hjust = 0.5)) ############## ## Dendrogram ############## dend = qc_data$feature_tree %>% as.dendrogram ## create a vector of colors to color your tree labels w = which( qc_data$feature_data$independent_features_binary == 1) pv_ids = as.character( unlist( qc_data$feature_data[w,1] ) ) n = labels(dend) bcol = rep("black", length(n)) w = which(n %in% pv_ids ); bcol[w] = pcol[2] ## redefine elements of dendrogram dend = dend %>% set("labels_cex", 0.5) %>% set("labels_col", bcol) %>% set("branches_lwd", 0.5) %>% set("branches_k_color", value = bcol) dend <- as.ggdend(dend) ## plot the dendrogram tree_plot = dend %>% ggplot() + geom_hline(yintercept = tree_cut_height, color = "coral2") ############## ## Data Reduce ## Table ############## feature_count = length(qc_data$feature_data$k) features_included_in_data_reduction = length( na.omit(qc_data$feature_data$k) ) cluster_count = length( unique(qc_data$feature_data$k) ) number_of_rep_meatbolites = sum( qc_data$feature_data$independent_features_binary == 1 ) temp = data.frame( data.reduction = c("total metabolite count","metabolites included in data reduction","number of metabolite clusters","number of representative metabolites"), count = c(feature_count,features_included_in_data_reduction, cluster_count, number_of_rep_meatbolites ) ) ptable = ggpubr::ggtexttable( temp, rows = NULL, theme = ggpubr::ttheme("mGreen")) ############# ## scree plot ############# ve = tibble::tibble( data.frame(PC= 1:length(qc_data$varexp), VarExp = qc_data$varexp) ) screeplot = ve %>% ggplot(aes(x = PC, y = VarExp)) + geom_line(color = "grey") + geom_point(shape = 21, fill = pcol[2], size = 2) + labs(title = "Scree plot") + geom_vline(xintercept = qc_data$accelerationfactor, color = pcol[1]) + geom_vline(xintercept = unlist( qc_data$nparallel ), color = pcol[3]) ############## ## PC Plot ############## pcs = as_tibble(qc_data$sample_data[, c("PC1","PC2","PC3","PC4","PC5","PC6","PC7","PC8","PC9","PC10")]) pcs$cluster_k = as.factor( kmeans(pcs[, 1:2], 4)$cluster ) ## define color scheme pcol = RColorBrewer::brewer.pal(9, "Set1") ## pcplot = pcs %>% ggplot( aes(x = PC1, y = PC2) ) + geom_point( size = 2.5, shape = 21, aes(fill = cluster_k) ) + scale_fill_manual(values = pcol[1:4]) + labs( title = paste0("Principal components 1-&-2 using ", number_of_rep_meatbolites ," representative metabolites"), x = paste0("PC1 VarExp = ", signif(qc_data$varexp[1], d = 4)*100, "%" ), y = paste0("PC2 VarExp = ", signif(qc_data$varexp[2], d = 4)*100 , "%"), fill = paste0("kmeans\ncluster k")) ####################### ## Plot the data ####################### m = matrix(c(1,2,3,4,4,4,5,7,7, 6,7,7), nrow = 4, byrow = TRUE) gridExtra::grid.arrange( grobs = list(s_mis_plot, f_mis_plot, tpa_plot, tree_plot, ptable, screeplot, pcplot), layout_matrix = m, heights = c(1,1,1,1) )
\begin{center}
\textbf{Figure Legend:} Filtered data summary. Distributions for sample missingness, feature missingness, and total abundance of samples. Row two of the figure provides a Spearman's correlation distance clustering dendrogram highlighting the metabolites used as representative features in blue, the clustering tree cut height is denoted by the horizontal line. Row three provides a summary of the metabolite data reduction in the table, a Scree plot of the variance explained by each PC and a plot of principal component 1 and 2, as derived from the representative metabolites. The Scree plot also identifies the number of PCs estimated to be informative (vertical lines) by the Cattel's Scree Test acceleration factor (red, n = r qc_data$accelerationfactor
) and Parallel Analysis (green, n = r unlist( qc_data$nparallel )
). Individuals in the PC plot were clustered into 4 kmeans (k) clusters, using data from PC1 and PC2. The kmeans clustering and color coding is strictly there to help provide some visualization of the major axes of variation in the sample population(s).
\end{center}
plotcolors = pcol[pcs$cluster_k] pcapairs_bymoose( as.matrix(pcs[, 1:5]) , qc_data$varexp, pcol = plotcolors)
\begin{center} \textbf{Figure Legend:} A matrix plot of the top five principal components including demarcations of the 3rd (grey), 4th (orange), and 5th (red) standard deviations from the mean. Samples are color coded as in the summary PC plot above using a kmeans analysis of PC1 and PC2 with a k (number of clusters) set at 4. The choice of k = 4 was not robustly chosen it was a choice of simplicity to help aid visualize variation and sample mobility across the PCs. \end{center}
wstat = qc_data$feature_data$W_stat_rawdata ## how many stats were estimated count = length(wstat) nacount = sum(is.na(wstat)) remain_count = count - nacount # wstat = na.omit(wstat) normcount = sum(wstat >= 0.95, na.rm = TRUE)
W = wstat pcol = RColorBrewer::brewer.pal(9, "Set1") W_log10 = qc_data$feature_data$W_stat_log10data ## Plot par(mfrow = c(1,2), oma = c(2,1,1,1)) hist(W, col = pcol[2], main = paste("Distribution of W statistics on\nRaw Metabolite Abundances"), xlab = "W", ylab = "Frequency", cex.main = 0.75) abline(v = 0.95, col = pcol[1]) ## mtext( paste0(normcount, " of the metabolites exhibit distributions\nthat may declared normal, given a W-stat >= 0.95"), cex = 0.75, side = 1, outer = TRUE, line = 0, adj = 0.1, col = pcol[2]) ### LOG LogMakesDistributionWorse = c( sum( W_log10 < W , na.rm = TRUE), signif( sum( W_log10 < W , na.rm = TRUE)/length(!is.na(W) ), d = 3)*100) ## hist(W_log10, col = pcol[3], main = paste("Distribution of W statistics on\nlog10 Metabolite Abundances"), xlab = "W", ylab = "Frequency", cex.main = 0.75) mtext( paste0("In ", LogMakesDistributionWorse[1], " instances or ", LogMakesDistributionWorse[2], "% of the tested metabolites\nthe log10 data W-stat is < raw data W-stat."), cex = 0.75, side = 1, outer = TRUE, line = 0, adj = 0.9, col = pcol[3])
\begin{center} \textbf{Figure Legend:} Histogram plots of Shapiro W-statistics for raw (left) and log transformed (right) data distributions. A W-statistic value of 1 indicates the sample distribution is perfectly normal and value of 0 indicates it is perfectly uniform. Please note that log transformation of the data may not improve the normality of your data. \end{center}
\begin{center}
\textbf{Analysis details:} Of the r count
features in the data r nacount
features were excluded from this analysis because of no variation or too few observations (n < 40). Of the remaining r remain_count
metabolite features, a total of r normcount
may be considered normally distributed given a Shapiro W-statistic >= 0.95.
\end{center}
A pdf report is being written to r paste0( project, "_outlier_detection_pre_filtering.pdf")
that contains dotplot, histogram and distribution summary statistics for each metabolite in your data set, providing an opportunity to visually inspect all your metabolites feature data distributions.
Evaluation of the number of samples and features that are outliers across the data.
today = Sys.Date() today = gsub("-","_",today) outlier_filename <- paste0( out_dir, project, "_outlier_detection_pre_filtering.pdf") ## feauture outlier summary outlier_summary = outlier.summary(dtst = qc_data$metabolite_data, pdf_filename = outlier_filename, nsd = 5)
outlier_summary[[2]]
\begin{center} \textbf{Table Legend:} The table reports the number of point estimates for the minimum (0\%) median (50\%) and maximum (100\%) number of outlying features across samples and the number of outlying samples across features. \end{center}
There may be extreme outlying observations at individual metabolites|features that have not been accounted for. You may want to:
Feature missingness may be influenced by the metabolites (or features) biology or pathway classification, or your technologies methodology. The figure(s) below provides an illustrative evaluation of the proportion of feature missigness as a product of the variable(s) available in the raw data files.
possible_batch_variables = c("path","platform", "class", "batch") ## matching variables w = unique( unlist( sapply(possible_batch_variables, function(x){ grep(x, tolower( colnames(qc_data$feature_data) ) ) }) ) ) if(length(w)>0){ matched_variables = colnames(qc_data$feature_data)[w] ## remove batch variables, that are invariable in the sample w = which( apply( qc_data$feature_data[,matched_variables], 2, function(x){ length(table(x)) }) <= 1 ) if(length(w)>0){ matched_variables = matched_variables[-w] } ## insure group sizes are larger than 1 w = which( apply( qc_data$feature_data[,matched_variables], 2, function(x){ sum( table(x) > 1 ) / length(table(x)) }) < 0.25 ) if(length(w)>0){ matched_variables = matched_variables[-w] } ## insure the number of variables is not larger than 96 w = which( apply( qc_data$feature_data[,matched_variables], 2, function(x){ length(table(x)) }) > 96 ) if(length(w)>0){ matched_variables = matched_variables[-w] } ## define the class_variables class_variables = matched_variables if( "SUB_PATHWAY" %in% class_variables ){ w = which(class_variables %in% "SUB_PATHWAY") class_variables = class_variables[-w] } }
## MISSINGNESS ################################### ## Iterate over each batch variable ################################### if(length(class_variables)>0){ ClassMisPlots = lapply( class_variables , function(x){ out = variable.by.factor( dep = qc_data$feature_data$feature_missingness , indep = unlist( qc_data$feature_data[,x] ), dep_name = "feature missingness", indep_name = x, orderfactor = TRUE, violin = FALSE ) return(out) }) ## plot the output gridExtra::grid.arrange( grobs = ClassMisPlots, ncol = 1) } else { paste0(" -- No feature level batch variables identified or all were invariable -- ") }
\begin{center} \textbf{Figure Legend:} Box plot illustration(s) of the relationship that available batch and biological variables have with feature missingness. \end{center}
The figure provides an illustrative evaluation of the proportion of sample missigness as a product of sample batch variables provided by your supplier. This is the univariate influence of batch effects on sample missingness.
possible_batch_variables = c("box", "day", "prep", "date", "time", "type", "nmr", "buffer", "lmwm") ## matching variables # w = which( possible_batch_variables %in% colnames(qc_data$sample_data) ) w = unique( unlist( sapply(possible_batch_variables, function(x){ grep(x, tolower( colnames(qc_data$sample_data) ) ) }) ) ) if( length(w) > 0 ){ # matched_variables = possible_batch_variables[w] matched_variables = colnames(qc_data$sample_data)[w] ## batch variables, that are variable in the sample w = which( apply( qc_data$sample_data[,matched_variables], 2, function(x){ length( table(x) ) } ) > 1 ) matched_variables = matched_variables[w] ## insure group sizes are larger than 1 w = which( apply( qc_data$sample_data[,matched_variables], 2, function(x){ sum( table(x) > 1 ) / length(table(x)) }) >= 0.25 ) matched_variables = matched_variables[w] ## insure the number of variables is not larger than 96 w = which( apply( qc_data$sample_data[,matched_variables], 2, function(x){ length(table(x)) }) < 97 ) matched_variables = matched_variables[w] batch_variables = matched_variables } else { batch_variables = NA } ## In case all batch variables are not variable (typically because sample size is small and all done in 1 go.) if(length(batch_variables) == 0){ batch_variables = NA }
## MISSINGNESS ################################### ## Iterate over each batch variable ################################### if( !is.na(batch_variables[1]) ) { BatchMisPlots = lapply( batch_variables , function(x){ out = variable.by.factor( dep = qc_data$sample_data$sample_missingness , indep = unlist( qc_data$sample_data[,x] ), dep_name = "sample missingness", orderfactor = FALSE, indep_name = x, violin = TRUE) return(out) }) ## plot the output gridExtra::grid.arrange( grobs = BatchMisPlots, ncol = 1) } else { paste0(" -- No sample level batch variables were provided or all were invariable -- ") }
\begin{center} \textbf{Figure Legend:} Box plot illustration(s) of the relationship that available batch variables have with sample missingness. \end{center}
if( !is.na(batch_variables[1]) ) { ( multivariate.anova(dep = qc_data$sample_data$sample_missingness, indep_df = qc_data$sample_data[ ,batch_variables ] ) ) } else { paste0(" -- No sample level batch variables were provided or all were invariable -- ") }
\begin{center} \textbf{Table Legend:} TypeII ANOVA: the eta-squared (eta-sq) estimates are an estimation of the percent of variation explained by each independent variable, after accounting for all other variables, as derived from the sum of squares. This is a multivariate evaluation of batch variables on sample missingness. \end{center}
Total peak|abundance area (TPA) is simply the sum of the abundances measured across all features. TPA is one measure that can be used to identify unusual samples given their entire profile. However, the level of missingness in a sample may influence TPA. To account for this we:
Correlation between total peak area (at complete features) and missingness
a = cor.test( qc_data$sample_data$sample_missingness, qc_data$sample_data$TPA_completefeature) ### ( tpamis = qc_data$sample_data %>% ggplot( aes(x = TPA_completefeature, y = sample_missingness)) + geom_point( fill = "grey", alpha = 0.8, size = 1.5) + geom_smooth(method = "loess", color = "red", size = 2) + geom_smooth(method = "lm", color = "blue", size = 2) + labs(x = "TPA, complete features", y = "sample missingness", title = paste0( "TPA as influenced by missingness \nSpearmans's cor = ", round(a$estimate, d = 4), " p-value = ", formatC( a$p.value, format = "e", digits = 2) )) )
\begin{center} \textbf{Figure Legend:} Relationship between total peak area at complete features (x-axis) and sample missingness (y-axis). \end{center}
The figure below provides an illustrative evaluation of the total peak area as a product of sample batch variables provided by your supplier.
if( !is.na(batch_variables[1]) ) { BatchMisPlots = lapply( batch_variables , function(x){ out = variable.by.factor( dep = qc_data$sample_data$TPA_completefeature, indep = unlist( qc_data$sample_data[,x] ), dep_name = "total peak area", orderfactor = FALSE, indep_name = x, violin = TRUE) return(out) }) ## plot the output gridExtra::grid.arrange( grobs = BatchMisPlots, ncol = 1) } else { paste0(" -- No sample level batch variables were provided or all were invariable -- ") }
\begin{center} \textbf{Figure Legend:} Violin plot illustration(s) of the relationship between total peak area (TPA) and sample batch variables that are available in your data. \end{center}
if( !is.na(batch_variables[1]) ) { ( multivariate.anova(dep = qc_data$sample_data$TPA_completefeature, indep_df = qc_data$sample_data[,batch_variables ] ) ) }else { paste0(" -- No sample level batch variables were provided or all were invariable -- ") }
\begin{center} \textbf{Table Legend:} TypeII ANOVA: the eta-squared (eta-sq) estimates are an estimation on the percent of variation explained by each independent variable, after accounting for all other variables, as derived from the sum of squares. This is a multivariate evaluation of batch variables on total peak|abundance area at complete features. \end{center}
Exploration for case/control and continuous outcome data using the filtered data set
Analytical power analysis for both continuous and imbalanced presence/absence correlation analysis.
#################################### # Run power analysis and generate plot #################################### #( run.power.make.plot(mydata = metdata ) ) p1 = run.cont.power.make.plot( mydata = qc_data$metabolite_data ) p2 = run.pa.imbalanced.power.make.plot( mydata = qc_data$metabolite_data ) # gridExtra::grid.arrange(p1, p2, nrow = 1) ggpubr::ggarrange(p1, p2, labels = c("A", "B"), ncol = 2, nrow = 1)
\begin{center}
\textbf{Figure Legend:} Simulated effect sizes are illustrated by their color in each figure. Figure (A) provides estimates of power for continuous traits with the total sample size on the x-axis and the estimated power on the y-axis. Figure (B) provides estimates of power for presence/absence (or binary) traits in an imbalanced design. The estimated power is on the y-axix. The total sample size is set to r nrow(qc_data$metabolite_data)
and the x-axis depicts the number of individuals present (or absent) for the trait. The effects sizes illustrated here were chosen by running an initial set of simulations which identified effects sizes that would span a broad range of power estimates given the sample population's sample size.
\end{center}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.