Maarten van Iterson, Department of Molecular Epidemiology, Leiden University Medical Center, Leiden, The Netherlands 24 February 2016
!!! This is a project in development !!!
wbccPredictor can be used to build a predictor, using partial-least-squares, for WBCC based on gene expression or DNA methylation data. Or wbccPredictor can be used to predict WBCC using the build-in predictor for both gene expression and DNA methylation data.
Installation requires the package devtools,
library(devtools)
install_github("mvaniterson/wbccPredictor")
or using the biocLite
function downloaded from BioConductor.
source("http://bioconductor.org/biocLite.R")
biocLite("mvaniterson/wbccPredictor")
In this example BIOS RNAseq freeze1 data is used.
library(BIOSRutils)
data(rnaSeqData_freeze1_06032015BIOS)
The RNAseq counts are normalized using edgeR TMM-function.
Some experimentation showed that strong filtering improves the performance of the predictor!
library(edgeR)
library(GenomicFeatures)
d <- DGEList(counts = assays(rnaSeqData)$counts, remove.zeros = TRUE)
keep <- rowSums(d$counts > 100) > 0.99 * ncol(d)
d <- d[keep, ]
dim(d) #[1] 13778 2116
## [1] 13778 2116
library(genefilter)
sds <- rowSds(d$counts)
keep <- sds > quantile(sds, prob = 0.5)
sum(keep)
## [1] 6889
d <- d[keep, ]
dim(d) #[1] 6889 2116
## [1] 6889 2116
d <- calcNormFactors(d)
data <- cpm(d, log = TRUE) ##TMM CPM
data[1:5, 1:5]
## BD1NYRACXX-5-1 AD10W1ACXX-4-1 BD1NYRACXX-5-2
## ENSG00000000419 4.168507 4.339456 4.382197
## ENSG00000000938 10.069462 10.074595 9.959071
## ENSG00000001084 4.734471 4.288601 4.626306
## ENSG00000001461 5.629025 5.728971 5.536908
## ENSG00000001561 4.775422 3.304141 3.958505
## AD10W1ACXX-4-2 BD1NYRACXX-5-3
## ENSG00000000419 3.994125 4.134935
## ENSG00000000938 10.480343 10.460845
## ENSG00000001084 4.208172 4.204576
## ENSG00000001461 5.481339 5.190906
## ENSG00000001561 3.350550 3.214355
Next cell percentages and covariates need to be extracted.
cellPercentages <- as.data.frame(colData(rnaSeqData)[, c("Neut_Perc", "Lymph_Perc",
"Mono_Perc", "Eos_Perc", "Baso_Perc")])
colnames(cellPercentages) <- c("Neutrophils", "Lymphocytes", "Monocytes", "Eosinophils",
"Basophils")
cellPercentages <- apply(cellPercentages, 2, as.numeric)
head(cellPercentages)
## Neutrophils Lymphocytes Monocytes Eosinophils Basophils
## [1,] NA NA NA NA NA
## [2,] NA NA NA NA NA
## [3,] NA NA NA NA NA
## [4,] NA NA NA NA NA
## [5,] NA NA NA NA NA
## [6,] NA NA NA NA NA
covariates <- as.data.frame(colData(rnaSeqData))
covariates <- covariates[, c("RNA_BloodSampling_Age", "Sex")]
colnames(covariates) <- c("Age", "Gender")
covariates <- apply(covariates, 2, as.numeric)
head(covariates)
## Age Gender
## [1,] 77.9 0
## [2,] 70.5 1
## [3,] 66.3 0
## [4,] 76.5 0
## [5,] 71.9 0
## [6,] 68.6 1
Make sure all data object have the same id.
rownames(covariates) <- rownames(cellPercentages) <- colnames(data) <- colData(rnaSeqData)$uuid
head(covariates)
## Age Gender
## BIOS6DB3BAD1 77.9 0
## BIOSCFA14234 70.5 1
## BIOSCA449668 66.3 0
## BIOS415A8BFB 76.5 0
## BIOSD16ED999 71.9 0
## BIOSD2B838B2 68.6 1
head(cellPercentages)
## Neutrophils Lymphocytes Monocytes Eosinophils Basophils
## BIOS6DB3BAD1 NA NA NA NA NA
## BIOSCFA14234 NA NA NA NA NA
## BIOSCA449668 NA NA NA NA NA
## BIOS415A8BFB NA NA NA NA NA
## BIOSD16ED999 NA NA NA NA NA
## BIOSD2B838B2 NA NA NA NA NA
data[1:6, 1:6]
## BIOS6DB3BAD1 BIOSCFA14234 BIOSCA449668 BIOS415A8BFB
## ENSG00000000419 4.168507 4.339456 4.382197 3.994125
## ENSG00000000938 10.069462 10.074595 9.959071 10.480343
## ENSG00000001084 4.734471 4.288601 4.626306 4.208172
## ENSG00000001461 5.629025 5.728971 5.536908 5.481339
## ENSG00000001561 4.775422 3.304141 3.958505 3.350550
## ENSG00000001629 5.475888 4.807971 5.260365 4.826678
## BIOSD16ED999 BIOSD2B838B2
## ENSG00000000419 4.134935 3.954608
## ENSG00000000938 10.460845 10.450733
## ENSG00000001084 4.204576 4.223557
## ENSG00000001461 5.190906 5.125365
## ENSG00000001561 3.214355 2.613582
## ENSG00000001629 5.124162 4.607147
Find out which samples have complete cell percentages available.
nas <- apply(cellPercentages, 1, function(x) any(is.na(x)))
table(nas)
## nas
## FALSE TRUE
## 1276 840
And which are incomplete
complete <- which(!nas)
head(cellPercentages[complete, ])
## Neutrophils Lymphocytes Monocytes Eosinophils Basophils
## BIOSDC47446B 46.3 41.1 9.6 2.4 0.6
## BIOSDC466DD5 47.2 41.5 8.7 1.9 0.7
## BIOS60E9E875 46.1 36.0 10.0 7.2 0.7
## BIOS0F99014C 59.7 27.5 7.4 5.1 0.3
## BIOSCA63D390 56.5 32.9 8.5 1.6 0.5
## BIOS519649FD 39.7 44.9 13.0 1.3 1.1
split complete in train (2/3) and test (1/3) stratified splitting of the data might be better if sample size is small \< 500? for example using split.sample from caTools
trainId <- sample(complete, 2 * length(complete)/3)
testId <- complete[!(complete %in% trainId)]
dataTrain <- data[, trainId]
covarTrain <- covariates[trainId, ]
cellPerTrain <- cellPercentages[trainId, ]
Now build the predictor!
library(wbccPredictor)
pls.options(parallel = 8) ##use 10 cores e.g. on the VM
predictor <- train(dataTrain, covarTrain, cellPerTrain, ncomp = 250, validation = "CV",
model = formula(cellPercentages ~ covariates + data), keep.model = TRUE)
The usual pls functions can be used to inspect the predictor-object. For example,
summary(predictor)
cumsum(explvar(predictor))
and,
barplot(cumsum(explvar(predictor)), las = 2, ylab = "Cumulative Variance Explained")
validationplot(predictor, val.type = "R2", ncomp = 1:50) ##select optimal number of components
validationplot(predictor, val.type = "RMSEP", ncomp = 1:250) ##select optimal number of components
validationplot(predictor, val.type = "RMSEP", ncomp = 1:50) ##select optimal number of components
Determine optimal number of pls-components e.g. 40 and save the coefficients like this:
RNAseqPredictor <- coef(predictor, ncomp = 40, intercept = TRUE)[, , 1]
It is interesting to known which covariates have the highest prediction value and store these as well.
W <- predictor$loading.weights
ord <- order(abs(W[, 40]), decreasing = TRUE)
RNAseqTop <- gsub("data|covariates", "", rownames(W)[ord[1:1000]])
save(RNAseqTop, file = "../data/RNAseqTop.RData")
save(RNAseqPredictor, file = "../data/RNAseqPredictor.RData")
corrplot <- function(measured, predicted, xlab = "predicted", ylab = "measured",
...) {
for (k in 1:ncol(measured)) {
type <- colnames(measured)[k]
pc <- signif(cor(predicted[, k], measured[, k]), 3)
ic <- signif(icc(predicted[, k], measured[, k]), 3)
plot(predicted[, k], measured[, k], main = paste(type, " (Prs: ", pc,
", ICC: ", ic, ")", sep = ""), xlab = xlab, ylab = ylab, bty = "n",
pty = "s", ...)
abline(lm(measured[, k] ~ 0 + predicted[, k]), col = 2, lty = 1, lwd = 2)
abline(0, 1, col = "grey", lty = 1, lwd = 2)
grid()
}
}
baplot <- function(x, y, regline = FALSE, la = c("log", "lin", "both"), main = "",
xlab = "Average", ylab = "Difference") {
la <- match.arg(la)
for (k in 1:ncol(x)) wbccPredictor:::.baplot(x[, k], y[, k], regline = regline,
la = la, main = paste0(colnames(x)[k]), xlab = xlab, ylab = ylab)
}
This is no surprise!
predictedRNATrain <- predictor$fitted.values[, , 40]
op <- par(mfcol = c(3, 2))
corrplot(cellPerTrain, predictedRNATrain, xlab = "predicted", ylab = "measured")
par(op)
Now let predict cell composition for the test data and plot.
dataTest <- data[, testId]
covarTest <- covariates[testId, ]
cellPerTest <- cellPercentages[testId, ]
predictedRNATest <- prediction(predictor, dataTest, covarTest, ncomp = 40, transformation = function(x) x)
op <- par(mfcol = c(3, 2))
corrplot(cellPerTest, predictedRNATest, xlab = "measured (%)", ylab = "predicted (%)")
par(op)
op <- par(mfcol = c(3, 2))
baplot(cellPerTest, predictedRNATest, la = "lin", xlab = "Average: meas. - pred. (%)",
ylab = "Difference: meas. - pred. (%)", regline = FALSE)
par(op)
see example R script for 450K data
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.