In this example BIOS RNAseq freeze1 data is used.
library(BIOSRutils) data(rnaSeqData_freeze1_06032015BIOS)
The RNAseq counts are normalized using r Biocpkg("edgeR")
TMM-function.
Some experimentation showed that strong filtering improves the performance of the predictor!
library(edgeR) library(GenomicFeatures) d <- DGEList(counts=assays(rnaSeqData)$counts, remove.zeros=TRUE) keep <- rowSums(d$counts > 100) > 0.99*ncol(d) d <- d[keep,] dim(d) #[1] 13778 2116 library(genefilter) sds <- rowSds(d$counts) keep <- sds > quantile(sds, prob=0.50) sum(keep) d <- d[keep,] dim(d) #[1] 6889 2116 d <- calcNormFactors(d) data <- cpm(d, log=TRUE) ##TMM CPM data[1:5, 1:5]
Next cell percentages and covariates need to be extracted.
cellPercentages <- as.data.frame(colData(rnaSeqData)[, c("Neut_Perc", "Lymph_Perc", "Mono_Perc", "Eos_Perc", "Baso_Perc")]) colnames(cellPercentages) <- c("Neutrophils", "Lymphocytes", "Monocytes", "Eosinophils", "Basophils") cellPercentages <- apply(cellPercentages, 2, as.numeric) head(cellPercentages)
covariates <- as.data.frame(colData(rnaSeqData)) covariates <- covariates[, c("RNA_BloodSampling_Age", "Sex")] colnames(covariates) <- c("Age", "Gender") covariates <- apply(covariates, 2, as.numeric) head(covariates)
Make sure all data object have the same id.
rownames(covariates) <- rownames(cellPercentages) <- colnames(data) <- colData(rnaSeqData)$uuid head(covariates) head(cellPercentages) data[1:6, 1:6]
Find out which samples have complete cell percentages available.
nas <- apply(cellPercentages, 1, function(x) any(is.na(x))) table(nas)
And which are incomplete
complete <- which(!nas) head(cellPercentages[complete,])
split complete in train (2/3) and test (1/3) stratified splitting of the data might be better if sample size is small < 500? for example using split.sample from caTools
trainId <- sample(complete, 2*length(complete)/3) testId <- complete[!(complete %in% trainId)]
dataTrain <- data[,trainId] covarTrain <- covariates[trainId,] cellPerTrain <- cellPercentages[trainId,]
Now build the predictor!
library(wbccPredictor) pls.options(parallel=8) ##use 10 cores e.g. on the VM predictor <- train(dataTrain, covarTrain, cellPerTrain, ncomp=250, validation = "CV", model = formula(cellPercentages ~ covariates + data), keep.model=TRUE)
The usual r CRANpkg("pls")
functions can be used to inspect the
predictor-object. For example,
summary(predictor) cumsum(explvar(predictor))
and,
barplot(cumsum(explvar(predictor)), las=2, ylab="Cumulative Variance Explained") validationplot(predictor, val.type="R2", ncomp=1:50) ##select optimal number of components validationplot(predictor, val.type="RMSEP", ncomp=1:250) ##select optimal number of components validationplot(predictor, val.type="RMSEP", ncomp=1:50) ##select optimal number of components
Determine optimal number of pls-components e.g. 40 and save the coefficients like this:
RNAseqPredictor <- coef(predictor, ncomp = 40, intercept = TRUE)[,,1]
It is interesting to known which covariates have the highest prediction value and store these as well.
W <- predictor$loading.weights ord <- order(abs(W[, 40]), decreasing=TRUE) RNAseqTop <- gsub("data|covariates", "", rownames(W)[ord[1:1000]])
save(RNAseqTop, file="../data/RNAseqTop.RData") save(RNAseqPredictor, file="../data/RNAseqPredictor.RData")
corrplot <- function(measured, predicted, xlab="predicted", ylab="measured", ...) { for(k in 1:ncol(measured)) { type <- colnames(measured)[k] pc <- signif(cor(predicted[,k], measured[,k]), 3) ic <- signif(icc(predicted[,k], measured[,k]), 3) plot(predicted[,k], measured[,k], main=paste(type, " (Prs: ", pc, ", ICC: ", ic, ")", sep=""), xlab=xlab, ylab=ylab, bty="n", pty="s", ...) abline(lm(measured[,k]~0+predicted[,k]), col=2, lty=1, lwd=2) abline(0, 1, col="grey", lty=1, lwd=2) grid() } } baplot <- function (x, y, regline = FALSE, la = c("log", "lin", "both"), main = "", xlab = "Average", ylab = "Difference") { la <- match.arg(la) for (k in 1:ncol(x)) wbccPredictor:::.baplot(x[, k], y[, k], regline = regline, la = la, main = paste0(colnames(x)[k]), xlab = xlab, ylab = ylab) }
This is no surprise!
predictedRNATrain <- predictor$fitted.values[,,40] op <- par(mfcol=c(3,2)) corrplot(cellPerTrain, predictedRNATrain, xlab="predicted", ylab="measured") par(op)
Now let predict cell composition for the test data and plot.
dataTest <- data[,testId] covarTest <- covariates[testId,] cellPerTest <- cellPercentages[testId,] predictedRNATest <- prediction(predictor, dataTest, covarTest, ncomp=40, transformation=function(x) x) op <- par(mfcol=c(3,2)) corrplot(cellPerTest, predictedRNATest, xlab="measured (%)", ylab="predicted (%)") par(op) op <- par(mfcol=c(3,2)) baplot(cellPerTest, predictedRNATest, la="lin", xlab="Average: meas. - pred. (%)", ylab="Difference: meas. - pred. (%)", regline=FALSE) par(op)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.