Nothing
## ---- include=FALSE-----------------------------------------------------------
knitr::opts_chunk$set(
dpi=72,fig.width=5,fig.height=5.5
)
set.seed(57475)
## ---- eval=FALSE, include=TRUE------------------------------------------------
# # install BiocManager if not present
# if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#
# # install structToolbox and dependencies
# BiocManager::install("structToolbox")
## ---- message=FALSE, warning=FALSE--------------------------------------------
## install additional bioc packages for vignette if needed
#BiocManager::install(c('pmp', 'ropls', 'BiocFileCache'))
## install additional CRAN packages if needed
#install.packages(c('cowplot', 'openxlsx'))
suppressPackageStartupMessages({
# Bioconductor packages
library(structToolbox)
library(pmp)
library(ropls)
library(BiocFileCache)
# CRAN libraries
library(ggplot2)
library(gridExtra)
library(cowplot)
library(openxlsx)
})
# use the BiocFileCache
bfc <- BiocFileCache(ask = FALSE)
## -----------------------------------------------------------------------------
## Iris dataset (comment if using MTBLS79 benchmark data)
D = iris_DatasetExperiment()
D$sample_meta$class = D$sample_meta$Species
## MTBLS (comment if using Iris data)
# D = MTBLS79_DatasetExperiment(filtered=TRUE)
# M = pqn_norm(qc_label='QC',factor_name='sample_type') +
# knn_impute(neighbours=5) +
# glog_transform(qc_label='QC',factor_name='sample_type') +
# filter_smeta(mode='exclude',levels='QC',factor_name='sample_type')
# M = model_apply(M,D)
# D = predicted(M)
# show info
D
## -----------------------------------------------------------------------------
# show some data
head(D$data[,1:4])
## -----------------------------------------------------------------------------
P = PCA(number_components=15)
P$number_components=5
P$number_components
## -----------------------------------------------------------------------------
param_ids(P)
## -----------------------------------------------------------------------------
P
## -----------------------------------------------------------------------------
M = mean_centre() + PCA(number_components = 4)
## -----------------------------------------------------------------------------
M[2]$number_components
## -----------------------------------------------------------------------------
M = model_train(M,D)
## -----------------------------------------------------------------------------
M = model_predict(M,D)
## -----------------------------------------------------------------------------
M = model_apply(M,D)
## -----------------------------------------------------------------------------
output_ids(M[2])
M[2]$scores
## -----------------------------------------------------------------------------
chart_names(M[2])
## -----------------------------------------------------------------------------
C = pca_scores_plot(factor_name='class') # colour by class
chart_plot(C,M[2])
## -----------------------------------------------------------------------------
# add petal width to meta data of pca scores
M[2]$scores$sample_meta$example=D$data[,1]
# update plot
C$factor_name='example'
chart_plot(C,M[2])
## ----fig.width=10-------------------------------------------------------------
# scores plot
C1 = pca_scores_plot(factor_name='class') # colour by class
g1 = chart_plot(C1,M[2])
# scree plot
C2 = pca_scree_plot()
g2 = chart_plot(C2,M[2])
# arange in grid
grid.arrange(grobs=list(g1,g2),nrow=1)
## -----------------------------------------------------------------------------
is(PCA(),'stato')
## -----------------------------------------------------------------------------
# this is the stato id for PCA
stato_id(M[2])
# this is the stato name
stato_name(M[2])
# this is the stato definition
stato_definition(M[2])
## -----------------------------------------------------------------------------
stato_summary(M[2])
## -----------------------------------------------------------------------------
M = mean_centre() + PLSDA(number_components=2,factor_name='class')
M
## -----------------------------------------------------------------------------
# create object
XCV = kfold_xval(folds=5,factor_name='class')
# change the number of folds
XCV$folds=10
XCV$folds
## -----------------------------------------------------------------------------
models(XCV)=M
models(XCV)
## -----------------------------------------------------------------------------
# cross validation of a mean centred PLSDA model
XCV = kfold_xval(
folds=5,
method='venetian',
factor_name='class') *
(mean_centre() + PLSDA(factor_name='class'))
## -----------------------------------------------------------------------------
XCV = run(XCV,D,balanced_accuracy())
XCV$metric
## -----------------------------------------------------------------------------
chart_names(XCV)
## ----warning=FALSE,fig.width=8,fig.height=4-----------------------------------
C = kfoldxcv_grid(
factor_name='class',
level=levels(D$sample_meta$class)[2]) # first level
chart_plot(C,XCV)
## -----------------------------------------------------------------------------
# permute sample order 10 times and run cross-validation
P = permute_sample_order(number_of_permutations = 10) *
kfold_xval(folds=5,factor_name='class')*
(mean_centre() + PLSDA(factor_name='class',number_components=2))
P = run(P,D,balanced_accuracy())
P$metric
## ----warning=FALSE,message=FALSE----------------------------------------------
# the pmp SE object
SE = MTBLS79
# convert to DE
DE = as.DatasetExperiment(SE)
DE$name = 'MTBLS79'
DE$description = 'Converted from SE provided by the pmp package'
# add a column indicating the order the samples were measured in
DE$sample_meta$run_order = 1:nrow(DE)
# add a column indicating if the sample is biological or a QC
Type=as.character(DE$sample_meta$Class)
Type[Type != 'QC'] = 'Sample'
DE$sample_meta$Type = factor(Type)
# convert to factors
DE$sample_meta$Batch = factor(DE$sample_meta$Batch)
DE$sample_meta$Class = factor(DE$sample_meta$Class)
# print summary
DE
## ----message=FALSE,warning=FALSE----------------------------------------------
M = # batch correction
sb_corr(
order_col='run_order',
batch_col='Batch',
qc_col='Type',
qc_label='QC'
)
M = model_apply(M,DE)
## ----fig.wide = TRUE,warning=FALSE,fig.width=10-------------------------------
C = feature_profile(
run_order='run_order',
qc_label='QC',
qc_column='Type',
colour_by='Batch',
feature_to_plot='200.03196'
)
# plot and modify using ggplot2
chart_plot(C,DE)+ylab('Peak area')+ggtitle('Before')
chart_plot(C,predicted(M))+ylab('Peak area')+ggtitle('After')
## -----------------------------------------------------------------------------
M2 = filter_na_count(
threshold=3,
factor_name='Batch'
)
M2 = model_apply(M2,predicted(M))
# calculate number of features removed
nc = ncol(DE) - ncol(predicted(M2))
cat(paste0('Number of features removed: ', nc))
## -----------------------------------------------------------------------------
M3 = kw_rank_sum(
alpha=0.0001,
mtc='none',
factor_names='Batch',
predicted='significant'
) +
filter_by_name(
mode='exclude',
dimension = 'variable',
seq_in = 'names',
names='seq_fcn', # this is a placeholder and will be replaced by seq_fcn
seq_fcn=function(x){return(x[,1])}
)
M3 = model_apply(M3, predicted(M2))
nc = ncol(predicted(M2)) - ncol(predicted(M3))
cat(paste0('Number of features removed: ', nc))
## -----------------------------------------------------------------------------
M4 = wilcox_test(
alpha=1e-14,
factor_names='Type',
mtc='none',
predicted = 'significant'
) +
filter_by_name(
mode='exclude',
dimension='variable',
seq_in='names',
names='place_holder'
)
M4 = model_apply(M4, predicted(M3))
nc = ncol(predicted(M3)) - ncol(predicted(M4))
cat(paste0('Number of features removed: ', nc))
## -----------------------------------------------------------------------------
M5 = rsd_filter(
rsd_threshold=20,
factor_name='Type'
)
M5 = model_apply(M5,predicted(M4))
nc = ncol(predicted(M4)) - ncol(predicted(M5))
cat(paste0('Number of features removed: ', nc))
## -----------------------------------------------------------------------------
# peak matrix processing
M6 = pqn_norm(qc_label='QC',factor_name='Type') +
knn_impute(neighbours=5) +
glog_transform(qc_label='QC',factor_name='Type')
M6 = model_apply(M6,predicted(M5))
## -----------------------------------------------------------------------------
# PCA
M7 = mean_centre() + PCA(number_components = 2)
# apply model sequence to data
M7 = model_apply(M7,predicted(M6))
# plot pca scores
C = pca_scores_plot(factor_name=c('Sample_Rep','Class'),ellipse='none')
chart_plot(C,M7[2]) + coord_fixed() +guides(colour=FALSE)
## -----------------------------------------------------------------------------
# chart object
C = pca_scores_plot(factor_name=c('Batch'),ellipse='none')
# plot
chart_plot(C,M7[2]) + coord_fixed()
## -----------------------------------------------------------------------------
data('sacurine',package = 'ropls')
# the 'sacurine' list should now be available
# move the annotations to a new column and rename the features by index to avoid issues
# later when data.frames get transposed and names get checked/changed
sacurine$variableMetadata$annotation=rownames(sacurine$variableMetadata)
rownames(sacurine$variableMetadata)=1:nrow(sacurine$variableMetadata)
colnames(sacurine$dataMatrix)=1:ncol(sacurine$dataMatrix)
# create DatasetExperiment
DE = DatasetExperiment(data = data.frame(sacurine$dataMatrix),
sample_meta = sacurine$sampleMetadata,
variable_meta = sacurine$variableMetadata,
name = 'Sacurine data',
description = 'See ropls package documentation for details')
# print summary
DE
## ----fig.width=15,fig.wide = TRUE---------------------------------------------
# prepare model sequence
M = autoscale() + PCA(number_components = 5)
# apply model sequence to dataset
M = model_apply(M,DE)
# pca scores plots
g=list()
for (k in colnames(DE$sample_meta)) {
C = pca_scores_plot(factor_name = k)
g[[k]] = chart_plot(C,M[2])
}
# plot using cowplot
plot_grid(plotlist=g, nrow=1, align='vh', labels=c('A','B','C'))
## ----fig.height=10,fig.width=9,fig.wide=TRUE----------------------------------
C = pca_scree_plot()
g1 = chart_plot(C,M[2])
C = pca_loadings_plot()
g2 = chart_plot(C,M[2])
C = pca_dstat_plot(alpha=0.95)
g3 = chart_plot(C,M[2])
p1=plot_grid(plotlist = list(g1,g2),align='h',nrow=1,axis='b')
p2=plot_grid(plotlist = list(g3),nrow=1)
plot_grid(p1,p2,nrow=2)
## ---- fig.width=5, fig.height=5-----------------------------------------------
# prepare model sequence
M = autoscale() + PLSDA(factor_name='gender')
M = model_apply(M,DE)
C = plsda_scores_plot(factor_name = 'gender')
chart_plot(C,M[2])
## ----fig.width=10,fig.height=9,fig.wide=TRUE----------------------------------
# convert gender to numeric
DE$sample_meta$gender=as.numeric(DE$sample_meta$gender)
# models sequence
M = autoscale(mode='both') + PLSR(factor_name='gender',number_components=3)
M = model_apply(M,DE)
# some diagnostic charts
C = plsr_cook_dist()
g1 = chart_plot(C,M[2])
C = plsr_prediction_plot()
g2 = chart_plot(C,M[2])
C = plsr_qq_plot()
g3 = chart_plot(C,M[2])
C = plsr_residual_hist()
g4 = chart_plot(C,M[2])
plot_grid(plotlist = list(g1,g2,g3,g4), nrow=2,align='vh')
## ----results='asis'-----------------------------------------------------------
# model sequence
M = kfold_xval(folds=7, factor_name='gender') *
(autoscale(mode='both') + PLSR(factor_name='gender'))
M = run(M,DE,r_squared())
## ----results='asis',echo=FALSE------------------------------------------------
# training set performance
cat('Training set R2:\n')
cat(M$metric.train)
cat('\n\n')
# test set performance
cat('Test set Q2:\n')
cat(M$metric.test)
## ----fig.width=5,fig.height=5-------------------------------------------------
# reset gender to original factor
DE$sample_meta$gender=sacurine$sampleMetadata$gender
# model sequence
M = permutation_test(number_of_permutations = 10, factor_name='gender') *
kfold_xval(folds=7,factor_name='gender') *
(autoscale() + PLSDA(factor_name='gender',number_components = 3))
M = run(M,DE,balanced_accuracy())
C = permutation_test_plot(style='boxplot')
chart_plot(C,M)+ylab('1 - balanced accuracy')
## -----------------------------------------------------------------------------
url = 'https://github.com/CIMCB/MetabWorkflowTutorial/raw/master/GastricCancer_NMR.xlsx'
# read in file directly from github...
# X=read.xlsx(url)
# ...or use BiocFileCache
path = bfcrpath(bfc,url)
X = read.xlsx(path)
# sample meta data
SM=X[,1:4]
rownames(SM)=SM$SampleID
# convert to factors
SM$SampleType=factor(SM$SampleType)
SM$Class=factor(SM$Class)
# keep a numeric version of class for regression
SM$Class_num = as.numeric(SM$Class)
## data matrix
# remove meta data
X[,1:4]=NULL
rownames(X)=SM$SampleID
# feature meta data
VM=data.frame(idx=1:ncol(X))
rownames(VM)=colnames(X)
# prepare DatasetExperiment
DE = DatasetExperiment(
data=X,
sample_meta=SM,
variable_meta=VM,
description='1H-NMR urinary metabolomic profiling for diagnosis of gastric cancer',
name='Gastric cancer (NMR)')
DE
## -----------------------------------------------------------------------------
# prepare model sequence
M = rsd_filter(rsd_threshold=20,qc_label='QC',factor_name='Class') +
mv_feature_filter(threshold = 10,method='across',factor_name='Class')
# apply model
M = model_apply(M,DE)
# get the model output
filtered = predicted(M)
# summary of filtered data
filtered
## ----fig.width=10,fig.height=5.5----------------------------------------------
# prepare the model sequence
M = log_transform(base = 10) +
autoscale() +
knn_impute(neighbours = 3) +
PCA(number_components = 10)
# apply model sequence to data
M = model_apply(M,filtered)
# get the transformed, scaled and imputed matrix
TSI = predicted(M[3])
# scores plot
C = pca_scores_plot(factor_name = 'SampleType')
g1 = chart_plot(C,M[4])
# loadings plot
C = pca_loadings_plot()
g2 = chart_plot(C,M[4])
plot_grid(g1,g2,align='hv',nrow=1,axis='tblr')
## -----------------------------------------------------------------------------
# prepare model
TT = filter_smeta(mode='include',factor_name='Class',levels=c('GC','HE')) +
ttest(alpha=0.05,mtc='fdr',factor_names='Class')
# apply model
TT = model_apply(TT,filtered)
# keep the data filtered by group for later
filtered = predicted(TT[1])
# convert to data frame
out=as_data_frame(TT[2])
# show first few features
head(out)
## -----------------------------------------------------------------------------
# prepare model
M = stratified_split(p_train=0.75,factor_name='Class')
# apply to filtered data
M = model_apply(M,filtered)
# get data from object
train = M$training
train
cat('\n')
test = M$testing
test
## -----------------------------------------------------------------------------
# scale/transform training data
M = log_transform(base = 10) +
autoscale() +
knn_impute(neighbours = 3,by='samples')
# apply model
M = model_apply(M,train)
# get scaled/transformed training data
train_st = predicted(M)
# prepare model sequence
MS = grid_search_1d(
param_to_optimise = 'number_components',
search_values = as.numeric(c(1:6)),
model_index = 2,
factor_name = 'Class_num',
max_min = 'max') *
permute_sample_order(
number_of_permutations = 10) *
kfold_xval(
folds = 5,
factor_name = 'Class_num') *
(mean_centre(mode='sample_meta')+
PLSR(factor_name='Class_num'))
# run the validation
MS = struct::run(MS,train_st,r_squared())
#
C = gs_line()
chart_plot(C,MS)
## ----fig.wide=TRUE,warning=FALSE,fig.width=15,fig.height=5.5------------------
# prepare the discriminant model
P = PLSDA(number_components = 2, factor_name='Class')
# apply the model
P = model_apply(P,train_st)
# charts
C = plsda_predicted_plot(factor_name='Class',style='boxplot')
g1 = chart_plot(C,P)
C = plsda_predicted_plot(factor_name='Class',style='density')
g2 = chart_plot(C,P)+xlim(c(-2,2))
C = plsda_roc_plot(factor_name='Class')
g3 = chart_plot(C,P)
plot_grid(g1,g2,g3,align='vh',axis='tblr',nrow=1, labels=c('A','B','C'))
## -----------------------------------------------------------------------------
# AUC for comparison with Mendez et al
MET = calculate(AUC(),P$y$Class,P$yhat[,1])
MET
## -----------------------------------------------------------------------------
# model sequence
MS = permutation_test(number_of_permutations = 20,factor_name = 'Class_num') *
kfold_xval(folds = 5,factor_name = 'Class_num') *
(mean_centre(mode='sample_meta') + PLSR(factor_name='Class_num', number_components = 2))
# run iterator
MS = struct::run(MS,train_st,r_squared())
# chart
C = permutation_test_plot(style = 'density')
chart_plot(C,MS) + xlim(c(-1,1)) + xlab('R Squared')
## -----------------------------------------------------------------------------
# prepare the discriminant model
P = PLSDA(number_components = 2, factor_name='Class')
# apply the model
P = model_apply(P,train_st)
C = plsda_scores_plot(components=c(1,2),factor_name = 'Class')
chart_plot(C,P)
## ----fig.width=10,fig.height=5.5----------------------------------------------
# prepare chart
C = plsda_vip_plot(level='HE')
g1 = chart_plot(C,P)
C = plsda_regcoeff_plot(level='HE')
g2 = chart_plot(C,P)
plot_grid(g1,g2,align='hv',axis='tblr',nrow=2)
## ---- include=FALSE-----------------------------------------------------------
knitr::opts_chunk$set(echo = TRUE,fig.wide=TRUE,fig.width=5,fig.height=5.5)
set.seed(57475)
# read in file using BiocFileCache
url='https://github.com/CIMCB/MetabWorkflowTutorial/raw/master/GastricCancer_NMR.xlsx'
path = bfcrpath(bfc,url)
X = read.xlsx(path)
# sample meta data
SM=X[,1:4]
rownames(SM)=SM$SampleID
# convert to factors
SM$SampleType=factor(SM$SampleType)
SM$Class=factor(SM$Class)
# keep a numeric version of class for regression
SM$Class_num = as.numeric(SM$Class)
## data matrix
# remove meta data
X[,1:4]=NULL
rownames(X)=SM$SampleID
# feature meta data
VM=data.frame(idx=1:ncol(X))
rownames(VM)=colnames(X)
# prepare DatasetExperiment
DE = DatasetExperiment(
data=X,
sample_meta=SM,
variable_meta=VM,
description='1H-NMR urinary metabolomic profiling for diagnosis of gastric cancer',
name='Gastric cancer (NMR)')
M = rsd_filter(rsd_threshold=20,qc_label='QC',factor_name='Class') +
mv_feature_filter(threshold = 10,method='across',factor_name='Class') +
log_transform(base = 10) +
autoscale() +
knn_impute(neighbours = 3)
M = model_apply(M,DE)
DE = predicted(M)
## -----------------------------------------------------------------------------
# summary of DatasetExperiment object
DE
## -----------------------------------------------------------------------------
# model sequence and pls model (NB data already centred)
MS = filter_smeta(mode = 'include', levels = c('GC','HE'), factor_name = 'Class') +
PLSDA(factor_name = 'Class',number_components = 2)
# apply PLS model
MS = model_apply(MS,DE)
# plot the data
C = plsda_scores_plot(factor_name = 'Class')
chart_plot(C,MS[2])
# new DatasetExperiment object from the PLS scores
DE2 = DatasetExperiment(
data = MS[2]$scores,
sample_meta = predicted(MS[1])$sample_meta,
variable_meta = data.frame('LV'=c(1,2),row.names = colnames(MS[2]$scores)),
name = 'Illustrativate SVM dataset',
description = 'Generated by applying PLS to the processed Gastric cancer (NMR) dataset'
)
DE2
## -----------------------------------------------------------------------------
# SVM model
M = SVM(
factor_name = 'Class',
kernel = 'linear'
)
# apply model
M = model_apply(M,DE2)
# plot boundary
C = svm_plot_2d(factor_name = 'Class')
chart_plot(C,M, DE2)
## ----fig.width=10,fig.height=11-----------------------------------------------
# low cost
M$cost=0.01
M=model_apply(M,DE2)
C=svm_plot_2d(factor_name='Species')
g1=chart_plot(C,M,DE2)
# medium cost
M$cost=0.05
M=model_apply(M,DE2)
C=svm_plot_2d(factor_name='Species')
g2=chart_plot(C,M,DE2)
# high cost
M$cost=100
M=model_apply(M,DE2)
C=svm_plot_2d(factor_name='Species')
g3=chart_plot(C,M,DE2)
# plot
prow <- plot_grid(
g1 + theme(legend.position="none"),
g2 + theme(legend.position="none"),
g3 + theme(legend.position="none"),
align = 'vh',
labels = c("Low cost", "Medium cost", "High cost"),
hjust = -1,
nrow = 2
)
legend <- get_legend(
# create some space to the left of the legend
g1 + guides(color = guide_legend(nrow = 1)) +
theme(legend.position = "bottom")
)
plot_grid(prow, legend, ncol=1, rel_heights = c(1, .1))
## ---- fig.width=10,fig.height=11----------------------------------------------
# set a fixed cost for this comparison
M$cost=1
# linear kernel
M$kernel='linear'
M=model_apply(M,DE2)
C=svm_plot_2d(factor_name='Species')
g1=chart_plot(C,M,DE2)
# polynomial kernel
M$kernel='polynomial'
M$gamma=1
M$coef0=0
M=model_apply(M,DE2)
C=svm_plot_2d(factor_name='Species')
g2=chart_plot(C,M,DE2)
# rbf kernel
M$kernel='radial'
M$gamma=1
M=model_apply(M,DE2)
C=svm_plot_2d(factor_name='Species')
g3=chart_plot(C,M,DE2)
# sigmoid kernel
M$kernel='sigmoid'
M$gamma=1
M$coef0=0
M=model_apply(M,DE2)
C=svm_plot_2d(factor_name='Species')
g4=chart_plot(C,M,DE2)
# plot
prow <- plot_grid(
g1 + theme(legend.position="none"),
g2 + theme(legend.position="none"),
g3 + theme(legend.position="none"),
g4 + theme(legend.position="none"),
align = 'vh',
labels = c("Linear", "Polynomial", "Radial","Sigmoid"),
hjust = -1,
nrow = 2
)
legend <- get_legend(
# create some space to the left of the legend
g1 + guides(color = guide_legend(nrow = 1)) +
theme(legend.position = "bottom")
)
plot_grid(prow, legend, ncol = 1, rel_heights = c(1, .1))
## ----fig.width=10,fig.height=11-----------------------------------------------
# rbf kernel and cost
M$kernel = 'radial'
M$cost = 1
# low gamma
M$gamma=0.01
M=model_apply(M,DE2)
C=svm_plot_2d(factor_name='Species')
g1=chart_plot(C,M,DE2)
# medium gamma
M$gamma=0.1
M=model_apply(M,DE2)
C=svm_plot_2d(factor_name='Species')
g2=chart_plot(C,M,DE2)
# high gamma
M$gamma=1
M=model_apply(M,DE2)
C=svm_plot_2d(factor_name='Species')
g3=chart_plot(C,M,DE2)
# plot
prow <- plot_grid(
g1 + theme(legend.position="none"),
g2 + theme(legend.position="none"),
g3 + theme(legend.position="none"),
align = 'vh',
labels = c("Low gamma", "Medium gamma", "High gamma"),
hjust = -1,
nrow = 2
)
legend <- get_legend(
# create some space to the left of the legend
g1 + guides(color = guide_legend(nrow = 1)) +
theme(legend.position = "bottom")
)
plot_grid(prow, legend, ncol = 1, rel_heights = c(1, .1))
## ---- warning=FALSE, message=FALSE--------------------------------------------
# path to zip
zipfile = "https://raw.github.com/STATegraData/STATegraData/master/Script_STATegra_Proteomics.zip"
## retrieve from BiocFileCache
path = bfcrpath(bfc,zipfile)
temp = bfccache(bfc)
## ... or download to temp location
# path = tempfile()
# temp = tempdir()
# download.file(zipfile,path)
# unzip
unzip(path, files = "Proteomics_01_uniprot_canonical_normalized.txt", exdir=temp)
# read samples
all_data <- read.delim(file.path(temp,"Proteomics_01_uniprot_canonical_normalized.txt"), as.is = TRUE, header = TRUE, sep = "\t")
## -----------------------------------------------------------------------------
# extract data matrix
data = all_data[1:2527,51:86]
# shorten sample names
colnames(data) = lapply(colnames(data), function (x) substr(x, 27, nchar(x)))
# replace 0 with NA
data[data == 0] <- NA
# transpose
data=as.data.frame(t(data))
# prepare sample meta
SM = lapply(rownames(data),function(x) {
s=strsplit(x,'_')[[1]] # split at underscore
out=data.frame(
'treatment' = s[[1]],
'time' = substr(s[[2]],1,nchar(s[[2]])-1) ,
'batch' = substr(s[[3]],6,nchar(s[[3]])),
'condition' = substr(x,1,6) # interaction between treatment and time
)
return(out)
})
SM = do.call(rbind,SM)
rownames(SM)=rownames(data)
# convert to factors
SM$treatment=factor(SM$treatment)
SM$time=ordered(SM$time,c("0","2","6","12","18","24"))
SM$batch=ordered(SM$batch,c(1,3,4,5,6,7))
SM$condition=factor(SM$condition)
# variable meta data
VM = all_data[1:2527,c(1,6,7)]
rownames(VM)=colnames(data)
# prepare DatasetExperiment
DS = DatasetExperiment(
data = data,
sample_meta = SM,
variable_meta = VM,
name = 'STATegra Proteomics',
description = 'downloaded from: https://github.com/STATegraData/STATegraData/'
)
DS
## ----fig.width=10,fig.height=4.5----------------------------------------------
# find id of reporters
Ldha = which(DS$variable_meta$Gene.names=='Ldha')
Hk2 = which(DS$variable_meta$Gene.names=='Hk2')
# chart object
C = feature_boxplot(feature_to_plot=Ldha,factor_name='time',label_outliers=FALSE)
g1=chart_plot(C,DS)+ggtitle('Ldha')+ylab('expression')
C = feature_boxplot(feature_to_plot=Hk2,factor_name='time',label_outliers=FALSE)
g2=chart_plot(C,DS)+ggtitle('Hk2')+ylab('expression')
plot_grid(g1,g2,nrow=1,align='vh',axis='tblr')
## ----fig.width=10,fig.height=4.5----------------------------------------------
# prepare model sequence
M = log_transform(
base=2) +
mean_of_medians(
factor_name = 'condition')
# apply model sequence
M = model_apply(M,DS)
# get transformed data
DST = predicted(M)
## ----fig.width=10,fig.height=4.5----------------------------------------------
# chart object
C = feature_boxplot(feature_to_plot=Ldha,factor_name='time',label_outliers=FALSE)
g1=chart_plot(C,DST)+ggtitle('Ldha')+ylab('log2(expression)')
C = feature_boxplot(feature_to_plot=Hk2,factor_name='time',label_outliers=FALSE)
g2=chart_plot(C,DST)+ggtitle('Hk2')+ylab('log2(expression)')
plot_grid(g1,g2,nrow=1,align='vh',axis='tblr')
## -----------------------------------------------------------------------------
# build model sequence
M2 = filter_na_count(
threshold=2,
factor_name='condition',
predicted='na_count') + # override the default output
filter_by_name(
mode='exclude',
dimension='variable',
names='place_holder',
seq_in='names',
seq_fcn=function(x) { # convert NA count pre group to true/false
x=x>2 # more the two missing per group
x=rowSums(x)>10 # in more than 10 groups
return(x)
}
)
# apply to transformed data
M2 = model_apply(M2,DST)
# get the filtered data
DSTF = predicted(M2)
## -----------------------------------------------------------------------------
# create new imputation object
set_struct_obj(
class_name = 'STATegra_impute1',
struct_obj = 'model',
stato=FALSE,
params=c(factor_sd='character',factor_name='character'),
outputs=c(imputed='DatasetExperiment'),
prototype = list(
name = 'STATegra imputation 1',
description = 'If missing values are present for all one group then they are replaced with min/2 + "random value below discovery".',
predicted = 'imputed'
)
)
# create method_apply for imputation method 1
set_obj_method(
class_name='STATegra_impute1',
method_name='model_apply',
definition=function(M,D) {
# for each feature count NA within each level
na = apply(D$data,2,function(x){
tapply(x,D$sample_meta[[M$factor_name]],function(y){
sum(is.na(y))
})
})
# count number of samples in each group
count=summary(D$sample_meta[[M$factor_name]])
# standard deviation of features within levels of factor_sd
sd = apply(D$data,2,function(x) {tapply(x,D$sample_meta[[M$factor_sd]],sd,na.rm=TRUE)})
sd = median(sd,na.rm=TRUE)
# impute or not
check=na == matrix(count,nrow=2,ncol=ncol(D)) # all missing in one class
# impute matrix
mi = D$data
for (j in 1:nrow(mi)) {
# index of group for this sample
g = which(levels(D$sample_meta[[M$factor_name]])==D$sample_meta[[M$factor_name]][j])
iv=rnorm(ncol(D),min(D$data[j,],na.rm=TRUE)/2,sd)
mi[j,is.na(mi[j,]) & check[g,]] = iv[is.na(mi[j,]) & check[g,]]
}
D$data = mi
M$imputed=D
return(M)
}
)
## -----------------------------------------------------------------------------
# create new imputation object
set_struct_obj(
class_name = 'STATegra_impute2',
struct_obj = 'model',
stato=FALSE,
params=c(factor_name='character'),
outputs=c(imputed='DatasetExperiment'),
prototype = list(
name = 'STATegra imputation 2',
description = 'For those conditions with only 1 NA impute with the mean of the condition.',
predicted = 'imputed'
)
)
# create method_apply for imputation method 2
set_obj_method(
class_name='STATegra_impute2',
method_name='model_apply',
definition=function(M,D) {
# levels in condition
L = levels(D$sample_meta[[M$factor_name]])
# for each feature count NA within each level
na = apply(D$data,2,function(x){
tapply(x,D$sample_meta[[M$factor_name]],function(y){
sum(is.na(y))
})
})
# standard deviation of features within levels of factor_sd
sd = apply(D$data,2,function(x) {tapply(x,D$sample_meta[[M$factor_name]],sd,na.rm=TRUE)})
sd = median(sd,na.rm=TRUE)
# impute or not
check=na == 1 # only one missing for a condition
# index of samples for each condition
IDX = list()
for (k in L) {
IDX[[k]]=which(D$sample_meta[[M$factor_name]]==k)
}
## impute
# for each feature
for (k in 1:ncol(D)) {
# for each condition
for (j in 1:length(L)) {
# if passes test
if (check[j,k]) {
# mean of samples in group
m = mean(D$data[IDX[[j]],k],na.rm=TRUE)
# imputed value
im = rnorm(1,m,sd)
# replace NA with imputed
D$data[is.na(D$data[,k]) & D$sample_meta[[M$factor_name]]==L[j],k]=im
}
}
}
M$imputed=D
return(M)
}
)
## ----fig.width=10,fig.height=4.5----------------------------------------------
# model sequence
M3 = STATegra_impute1(factor_name='treatment',factor_sd='condition') +
STATegra_impute2(factor_name = 'condition') +
filter_na_count(threshold = 3, factor_name='condition')
# apply model
M3 = model_apply(M3,DSTF)
# get imputed data
DSTFI = predicted(M3)
DSTFI
## ----fig.height=10,fig.width=9,warning=FALSE,message=FALSE,echo=FALSE---------
# distributions plot
C = compare_dist(factor_name = 'treatment')
g=chart_plot(C,DS,DSTFI)
## ----fig.width=9,fig.height=5,warning=FALSE,message=FALSE---------------------
# model sequence
P = filter_smeta(mode='include',factor_name='treatment',levels='IKA') +
mean_centre() +
PCA(number_components = 2)
# apply model
P = model_apply(P,DSTFI)
# scores plots coloured by factors
g = list()
for (k in c('batch','time')) {
C = pca_scores_plot(factor_name=k,ellipse='none')
g[[k]]=chart_plot(C,P[3])
}
plot_grid(plotlist = g,nrow=1)
## ---- warning=FALSE, message=FALSE--------------------------------------------
# path to zip
zipfile = "https://raw.github.com/STATegraData/STATegraData/master/Script_STATegra_Metabolomics.zip"
## retrieve from BiocFileCache
path = bfcrpath(bfc,zipfile)
temp = bfccache(bfc)
## ... or download to temp location
# path = tempfile()
# temp = tempdir()
# download.file(zipfile,path)
# unzip
unzip(zipfile=path, files = "LC_MS_raw_data.xlsx", exdir=temp)
# read samples
data <- as.data.frame(read.xlsx(file.path(temp,"LC_MS_raw_data.xlsx"),sheet = 'Data'))
## -----------------------------------------------------------------------------
# extract sample meta data
SM = data[ ,1:8]
# add coloumn for sample type (QC, blank etc)
blanks=c(1,2,33,34,65,66)
QCs=c(3,4,11,18,25,32,35,36,43,50,57,64)
SM$sample_type='Sample'
SM$sample_type[blanks]='Blank'
SM$sample_type[QCs]='QC'
# put qc/blank labels in all factors for plotting later
SM$biol.batch[SM$sample_type!='Sample']=SM$sample_type[SM$sample_type!='Sample']
SM$time.point[SM$sample_type!='Sample']=SM$sample_type[SM$sample_type!='Sample']
SM$condition[SM$sample_type!='Sample']=SM$sample_type[SM$sample_type!='Sample']
# convert to factors
SM$biol.batch=ordered(SM$biol.batch,c('9','10','11','12','QC','Blank'))
SM$time.point=ordered(SM$time.point,c('0h','2h','6h','12h','18h','24h','QC','Blank'))
SM$condition=factor(SM$condition)
SM$sample_type=factor(SM$sample_type)
# variable meta data
VM = data.frame('annotation'=colnames(data)[9:ncol(data)])
# raw data
X = data[,9:ncol(data)]
# convert 0 to NA
X[X==0]=NA
# force to numeric; any non-numerics will become NA
X=data.frame(lapply(X,as.numeric),check.names = FALSE)
# make sure row/col names match
rownames(X)=data$label
rownames(SM)=data$label
rownames(VM)=colnames(X)
# create DatasetExperiment object
DE = DatasetExperiment(
data = X,
sample_meta = SM,
variable_meta = VM,
name = 'STATegra Metabolomics LCMS',
description = 'https://www.nature.com/articles/s41597-019-0202-7'
)
DE
## -----------------------------------------------------------------------------
# prepare model sequence
MS = filter_smeta(mode = 'include', levels='QC', factor_name = 'sample_type') +
knn_impute(neighbours=5) +
vec_norm() +
log_transform(base = 10)
# apply model sequence
MS = model_apply(MS, DE)
## -----------------------------------------------------------------------------
# pca model sequence
M = mean_centre() +
PCA(number_components = 3)
# apply model
M = model_apply(M,predicted(MS))
# PCA scores plot
C = pca_scores_plot(factor_name = 'sample_type',label_factor = 'order',points_to_label = 'all')
# plot
chart_plot(C,M[2])
## -----------------------------------------------------------------------------
# prepare mdoel sequence
MS = filter_smeta(
mode = 'include',
levels='QC',
factor_name = 'sample_type') +
filter_by_name(
mode = 'exclude',
dimension='sample',
names = c('1358BZU_0001QC_H1','1358BZU_0001QC_A1','1358BZU_0001QC_G1')) +
knn_impute(
neighbours=5) +
vec_norm() +
log_transform(
base = 10) +
mean_centre() +
PCA(
number_components = 3)
# apply model sequence
MS = model_apply(MS, DE)
# PCA scores plot
C = pca_scores_plot(factor_name = 'sample_type',label_factor = 'order',points_to_label = 'all')
# plot
chart_plot(C,MS[7])
## -----------------------------------------------------------------------------
# prepare model sequence
MS = filter_smeta(
mode = 'exclude',
levels='Blank',
factor_name = 'sample_type') +
filter_smeta(
mode = 'exclude',
levels='12',
factor_name = 'biol.batch') +
filter_by_name(
mode = 'exclude',
dimension='sample',
names = c('1358BZU_0001QC_H1',
'1358BZU_0001QC_A1',
'1358BZU_0001QC_G1')) +
knn_impute(
neighbours=5) +
vec_norm() +
log_transform(
base = 10) +
mean_centre() +
PCA(
number_components = 3)
# apply model sequence
MS = model_apply(MS, DE)
# PCA scores plots
C = pca_scores_plot(factor_name = 'sample_type')
# plot
chart_plot(C,MS[8])
## ----fig.height=10,fig.width=10-----------------------------------------------
MS = filter_smeta(
mode = 'exclude',
levels = '12',
factor_name = 'biol.batch') +
filter_by_name(
mode = 'exclude',
dimension='sample',
names = c('1358BZU_0001QC_H1',
'1358BZU_0001QC_A1',
'1358BZU_0001QC_G1')) +
blank_filter(
fold_change = 20,
qc_label = 'QC',
factor_name = 'sample_type') +
filter_smeta(
mode='exclude',
levels='Blank',
factor_name='sample_type') +
mv_feature_filter(
threshold = 80,
qc_label = 'QC',
factor_name = 'sample_type',
method = 'QC') +
mv_feature_filter(
threshold = 50,
factor_name = 'sample_type',
method='across') +
rsd_filter(
rsd_threshold=20,
qc_label='QC',
factor_name='sample_type') +
mv_sample_filter(
mv_threshold = 50) +
pqn_norm(
qc_label='QC',
factor_name='sample_type') +
knn_impute(
neighbours=5,
by='samples') +
glog_transform(
qc_label = 'QC',
factor_name = 'sample_type') +
mean_centre() +
PCA(
number_components = 10)
# apply model sequence
MS = model_apply(MS, DE)
# PCA plots using different factors
g=list()
for (k in c('order','biol.batch','time.point','condition')) {
C = pca_scores_plot(factor_name = k,ellipse='none')
# plot
g[[k]]=chart_plot(C,MS[length(MS)])
}
plot_grid(plotlist = g,align='vh',axis='tblr',nrow=2,labels=c('A','B','C','D'))
## ----warning=FALSE,message=FALSE,fig.height=11,fig.width=5--------------------
# get the glog scaled data
GL = predicted(MS[11])
# extract the Ikaros group and apply PCA
IK = filter_smeta(
mode='include',
factor_name='condition',
levels='Ikaros') +
mean_centre() +
PCA(number_components = 5)
# apply the model sequence to glog transformed data
IK = model_apply(IK,GL)
# plot the PCA scores
C = pca_scores_plot(factor_name='time.point',ellipse = 'sample')
g1=chart_plot(C,IK[3])
g2=g1 + scale_color_viridis_d() # add continuous scale colouring
plot_grid(g1,g2,nrow=2,align='vh',axis = 'tblr',labels=c('A','B'))
## -----------------------------------------------------------------------------
sessionInfo()
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.