In protViz/SRMService: Report Quantitative Mass Spectrometry Data

knitr::opts_chunk$set(echo = FALSE, warnings=FALSE, fig.width = 8, fig.height = 8, message = FALSE)

R Markdown

library(plyr)
library(dplyr)
library(ggplot2)
library(ggrepel)
library(ggplot2)

nvariables <-2

wprot <- protData$getWideNoMissing()
wprot <- scale(wprot)

tmp <- unique(subset(protData$data, select = c("Condition","Run")))
wprot <- merge(tmp, wprot, by.x="Run", by.y="row.names")
p2grp <- wprot

iprotV <- data.frame(p2grp[,3:ncol(p2grp)])
Condition = as.numeric(grepl( Condition2Compare, p2grp$Condition))
protnames <- as.character(colnames(iprotV))
Variables <- t(combn(as.character(colnames(iprotV)) , nvariables , FUN = NULL, simplify = TRUE ))
colnames(Variables) <- paste("v",1:ncol(Variables),sep="")


modelData <- alply(Variables, 1, function(x, iprotV2 = iprotV, Condition2 = Condition){
    tmp <- data.frame(Condition = as.factor(Condition2), select(iprotV2, x))
    tmp
  }, iprotV
)

resModels <- llply(modelData, function(x){
    x <- glm(Condition ~ ., data= x , family = "binomial")
    x
  } )

coefficientsExtrat <- function(x){c( coefficients = coefficients(x), deviance = deviance(x) )}

resCoefficients <- ldply(resModels,coefficientsExtrat)
colnames(resCoefficients) <- c("mID", "c.intercept", paste("c", 1:ncol(Variables), sep=""), "deviance")


makeAUC <- function(x, ConditionX ){
  x <- predict(x, type="response")
  pROC::auc(controls = x[ConditionX == 0 ] , cases = x[ConditionX == 1])
}

resAUC <- rep(NA, length(resModels))
for(i in 1:length(resModels)){
  resAUC[i] <- makeAUC(resModels[[i]], ConditionX = Condition)
}

resultsCoeff <-data.frame(var = Variables, resCoefficients, AUC= resAUC )
resultsCoeff <- arrange(resultsCoeff, desc(AUC))
nrmodels <-choose(length(unique(protData$data$Protein)), nvariables)

The total number of models is :

nrmodels

TOP ROC curves

Histogram of AUC for models using 3 variables.

Top 1

model <- resModels[[resultsCoeff$mID[1]]]
tmp <- melt(model$data)

x <- predict(resModels[[resultsCoeff$mID[1]]], type="response")
quantable::makeROCplot(x[Condition == 0 ] , x[Condition == 1])

p <- ggplot(tmp , aes(variable, value)) + geom_boxplot(aes(colour = Condition)) + geom_jitter(width = 0.2,aes(colour = Condition))
p

Top 2

x <- predict(resModels[[resultsCoeff$mID[2]]], type="response")
quantable::makeROCplot(x[Condition == 0 ] , x[Condition == 1])

Top 3

x <- predict(resModels[[resultsCoeff$mID[3]]], type="response")
quantable::makeROCplot(x[Condition == 0 ] , x[Condition == 1])

Distribution of AUC

hist(resultsCoeff$AUC, breaks=20, xlab="AUC", main="AUC distribution")
abline(v= quantile(resultsCoeff$AUC,0.9),col=2)

TOP 20 models and variables used.

knitr::kable(resultsCoeff[1:20, ], digits=2)

Which variables were used most frequently in top 10% of models

Range of AUC values for top models:

top10pcModels <- resultsCoeff[1:(nrow(resCoefficients)/10),]
range(top10pcModels$AUC)

plot(sort(table(as.character(unlist(top10pcModels[,1:nvariables]))),decreasing = T)[1:25],las=2,ylab="# of models")

coefCounts <- data.frame(sort(table(as.character(unlist(top10pcModels[,1:nvariables]))),decreasing = T))
colnames(coefCounts) <- c("proteinID", "coefCounts")

cofw <- data.frame(names = unlist(top10pcModels[,1:nvariables]) , coef = unlist(top10pcModels[,(nvariables+3):(nvariables+2+nvariables)]))
cofw$abscoef<- abs(cofw$coef)
coefMean <- aggregate(cofw$coef, list(proteinID=cofw$names), median)
colnames(coefMean)[2] = "meanCoef"

xd <- merge(coefMean, coefCounts, by="proteinID")

knitr::kable(arrange(xd, desc(abs(meanCoef))), caption = "Coefficients sorted by absolute size in top 10% of models.")
res$coefficients <- arrange(xd, desc(abs(meanCoef)))

coefThreshold <- 0.5
ggplot(xd, aes(x=meanCoef, y=coefCounts)) +
    geom_point(shape=1) + geom_text_repel(data=dplyr::filter(xd, xd$meanCoef > coefThreshold | xd$meanCoef  < -coefThreshold ), aes(label=proteinID))

xt <- xd[xd$meanCoef > coefThreshold| xd$meanCoef < -coefThreshold,]