Nothing
## get index of max (created for data.table)
#' get index of max in vecotr of numeric values
#' @param v vector
#' @export
getMaxIndex <- function(v){
if(all(is.na(v))) return(1)
return(which(max(v,na.rm=T) == v)[1])
}
## return first entry per column
#' @export
.getFirstEntry <- function(x){
return(x[1])
}
#' @export
.log2Exprs <- function(eset){
exprs(eset) <- log2(exprs(eset))
return(eset)
}
#' @export
.exp2Exprs <- function(eset){
exprs(eset) <- 2^(exprs(eset))
return(eset)
}
#' @export
.getControlCondition <-function(eset){
return(unique(as.character(pData(eset)$condition[pData(eset)$isControl]))[1])
}
#' Create ExpressionSet object
#' @param expressionMatrix matrix of expression signals per feature and sample
#' @param expDesign experimental design data.frame
#' @param featureAnnotations data.frame including e.g: Protein Description, Id score etc.
#' @return ExpressionSet object
#' @import Biobase
#' @export
#' @note No note
#' @details No details
#' @references NA
#' @seealso \code{\link[Biobase]{ExpressionSet}}
#' @examples print("No examples")
createExpressionDataset <- function(expressionMatrix=expressionMatrix,expDesign=expDesign,featureAnnotations=featureAnnotations){
### make sure that only one unique condition is specified as control
if(length(unique(as.character(expDesign$condition[expDesign$isControl])) ) > 1){
stop("ERROR: createExpressionDataset, Invalid experimental design")
}
### phenoData: stores expDesign
# display with pData(eset):
# condition isControl
# A_rep_1 A T
# A_rep_2 A T
# B_rep_1 B F
# B_rep_2 B F
# C_rep_1 C F
# C_rep_2 C F
#pData <- new("AnnotatedDataFrame", data=expDesign)
expDesign$condition <- as.factor(gsub(" ","", expDesign$condition ))
pData <- AnnotatedDataFrame(data=expDesign)
### featureData:add more data to each feature. E.g: Protein Description, Id score etc.
return(ExpressionSet(assayData = expressionMatrix
, phenoData = pData ### yeah this is weird, but gives error if rolling up already
# rolled up eset unless colindices are explicitly specified
#, featureData = new("AnnotatedDataFrame", data= featureAnnotations[,1:ncol(featureAnnotations)])
, featureData = AnnotatedDataFrame(data= featureAnnotations[,1:ncol(featureAnnotations)])
)
)
}
#' Perform statistical test (mderated t-test), comparing all case to control
#' @param eset ExpressionSet
#' @param adjust TRUE/FALSE adjust for multiple testing using Benjamini & Hochberg (1995) method
#' @param log T/F log-transform expression values
#' @param method c("all","pairwise")
#' @param adjustFilter matrix T/F do not adjust for multiple testing
#' @return ExpressionSet object
#' @export
#' @import limma Biobase
#' @importFrom stats model.matrix p.adjust
#' @note No note
#' @details No details
#' @references Empirical Bayes method, Smyth (2004), \url{http://www.ncbi.nlm.nih.gov/pubmed/16646809}
#' @seealso \code{\link[limma]{eBayes}}
#' @examples print("No examples")
getAllEBayes <- function(eset=eset, adjust=F, log=T, method="pairwise", adjustFilter=matrix(F,nrow=nrow(eset),ncol=length(levels(pData(eset)$condition))-1 ) ){
#### calculate moderated t-statistic, empirical Bayes method, Smyth (2004)
# Question: limma multiple groups comparison produces different pvalue comparing with two group comparsion
# https://support.bioconductor.org/p/44216/
# https://support.bioconductor.org/p/60556/
uniqueConditions <- unique(pData(eset)$condition)
nbUniqueConditions <- length(uniqueConditions)
controlCondition <- .getControlCondition(eset)
caseConditions <- setdiff(uniqueConditions ,controlCondition)
# if no condition has replicates reurn NA's
if(max(table(pData(eset)$condition)) == 1){
return(data.frame( matrix(nrow=nrow(eset), ncol = length(caseConditions), dimnames = list(rownames(eset),caseConditions))))
}
if(log)(exprs(eset) <- log2(exprs(eset)))
# #### NON-PAIR-WISE COMPARISONS
if("all" %in% method){
# add subject term to allow for paired t-statistic
if("subject" %in% names(pData(eset))){
# no intercept
design <- model.matrix(~0+condition + subject, data=pData(eset))
}else{
# no intercept
design <- model.matrix(~0+condition, data=pData(eset))
}
colnames(design) <- gsub("^condition","",colnames(design))
fit <- lmFit(eset,design)
### create contract matrix describing desired condition comparisons
# e.g. B-A, C-A
# contrastMatrix
# # B C
# # A -1 -1
# # B 1 0
# # C 0 1
contrastMatrix <- makeContrasts(contrasts=paste(caseConditions,"-", controlCondition),levels=design)
colnames(contrastMatrix) <- caseConditions
# fit contrasts coefficients
fitContrasts <- eBayes(contrasts.fit(fit,contrastMatrix))
pvalues <- data.frame(fitContrasts$p.value[,caseConditions])
names(pvalues) <- caseConditions
#print(head(round(fitContrasts$coefficients[,caseConditions],3) == round(compRatios[,caseConditions],3)))
#print(cbind(fitContrasts$coefficients[,caseConditions[1]],compRatios[,caseConditions[1]]))
}else{ #### PAIR-WISE COMPARISONS "pairwise" %in% method
## REASONING
# General case (linera model, anova ..)
# Adding additional groups alters all Std. Errors. However if equal variance (homoscedastic) the std.error and p-values should not increase.
# -> We are increasing the number of samples used to estimate the standard error of the condiiton coefficients
# -> If equal variance the power increases
#
# limma
# The eBayes() is also affected by adding more groups as more data is used to estimated the variance prior (derived from data across genes).
pvalues <- data.frame(row.names=featureNames(eset))
for(cC in caseConditions){
### at least one replicate of one condition requires
selCol <- unlist(pData(eset)$condition %in% c(controlCondition,cC))
if(sum(selCol) > 2 ){
esetPair <- eset[,selCol]
# add subject term to allow for paired t-statistic
if("subject" %in% names(pData(eset))){
fit <-eBayes(lmFit(esetPair, model.matrix(~factor(esetPair$condition) + subject, data=pData(esetPair)) ) )
}else{
fit <-eBayes(lmFit(esetPair, model.matrix(~factor(esetPair$condition), data=pData(esetPair)) ) )
}
p <- fit$p.value[,2]
}else{
p <- rep(NA,nrow(eset))
}
pvalues <- cbind(pvalues,p)
}
names(pvalues) <- caseConditions
}
if(adjust){ ### adjust for multiple testing using Benjamini & Hochberg (1995) method
for(i in 1:ncol(pvalues)){
pvaluesTmp <- pvalues[,i]
pvaluesTmp[adjustFilter[,i]] <- NA
pvalues[,i] <-p.adjust(pvaluesTmp,method="BH")
}
}
return(pvalues)
}
#' Calculate ratios, comparing all case to control
#' @param eset ExpressionSet
#' @param method median, mean, paired
#' @param log2 transform
#' @return ExpressionSet object
#' @export
#' @import Biobase
#' @note No note
#' @details No details
#' @references NA
#' @examples print("No examples")
getRatios <- function(eset, method="median", log2=T){
if(sum(c("median","mean","paired") %in% method) == 0 ){
stop("Unknown method ",method)
}
if(("paired" %in% method) & !"subject" %in% names(pData(eset))){
stop("Invalid method ",method)
}
controlCondition <- .getControlCondition(eset)
caseConditions <- setdiff(unique(pData(eset)$condition) ,controlCondition)
ratios <- data.frame(row.names=featureNames(eset))
eCtrl <- subset(exprs(eset),select=which(pData(eset)$condition == controlCondition))
for(cC in caseConditions){
eCase <- subset(exprs(eset),select=which(pData(eset)$condition == cC))
if("subject" %in% names(pData(eset))){
if(log2){
rTmp <- log2(eCase) - log2(eCtrl)
}else{
rTmp <- eCase / eCtrl
}
if("paired" %in% method){ ### return paired all paired ratios instead of mean/median per condition
ratios <- cbind(ratios,rTmp)
}else{
ratios <- cbind(ratios,apply(rTmp,1,median))
}
}else{ # non-paired design
if(log2){
ratios <- cbind(ratios,apply(log2(eCase),1,method, na.rm=T) - apply(log2(eCtrl),1,method, na.rm=T))
}else{
ratios <- cbind(ratios,apply(eCase,1,method, na.rm=T) / apply(eCtrl,1,method, na.rm=T))
}
}
}
if("paired" %in% method){
names(ratios) <- rownames(pData(eset))[!pData(eset)$isControl]
}else{
names(ratios) <- caseConditions
}
return(ratios)
}
#getRatios <- function(eset, method="median", log2=T){
#
# if(sum(c("median","mean") %in% method) == 0 ){
# stop("Unknown method ",method)
# }
#
# if(log2){
# exprs(eset) <- log2(exprs(eset))
# }
#
# controlCondition <- .getControlCondition(eset)
# caseConditions <- setdiff(unique(pData(eset)$condition) ,controlCondition)
#
# ratios <- data.frame(row.names=featureNames(eset))
#
# for(cC in caseConditions){
#
# mCase <- exprs(eset)[, pData(eset)$condition == cC]
# mControl <- exprs(eset)[, pData(eset)$condition == controlCondition]
#
# if(method == "median"){
# if(sum(pData(eset)$condition == cC) > 1){
# mCase <- unlist(apply(exprs(eset)[, pData(eset)$condition == cC],1,median, na.rm=T))
# }
# if(sum(pData(eset)$condition == controlCondition) > 1){
# mControl <- unlist(apply(exprs(eset)[,pData(eset)$condition == controlCondition],1,median, na.rm=T))
# }
# }else if(method == "mean"){
#
# if(sum(pData(eset)$condition == cC) > 1){
# mCase <- unlist(apply(exprs(eset)[,pData(eset)$condition == cC],1,mean, na.rm=T))
# }
# if(sum(pData(eset)$condition == controlCondition) > 1){
# mControl <- unlist(apply(exprs(eset)[,pData(eset)$condition == controlCondition],1,mean, na.rm=T))
# }
# }
# if(log2){
# ratios <- cbind(ratios,mCase - mControl)
# }else{
# ratios <- cbind(ratios,mCase / mControl)
# }
# }
#
# head(ratios)
#
# names(ratios) <- caseConditions
# return(ratios)
#}
#' Calculate Coefficiant of Variance per feature (Relative standard Deviation) per Condition
#' @param eset ExpressionSet
#' @return data.frame of CVs per condition
#' @import Biobase
#' @export
#' @note No note
#' @details CV = sd / mean
#' @references NA
#' @seealso \code{\link{getCV}}
#' @examples print("No examples")
getAllCV <- function(eset){
signal <- exprs(eset)
conditions <- unique(pData(eset)$condition)
expDesign <- pData(eset)
cv <- data.frame(row.names=featureNames(eset))
#if paired design
if("subject" %in% names(pData(eset))){
#return(data.frame((matrix(nrow = nrow(eset), ncol = length(allConditions), dimnames=list(rownames(eset), allConditions) ))))
controlCondition <- .getControlCondition(eset)
caseConditions <- setdiff(unique(pData(eset)$condition) ,controlCondition)
conditions <- setdiff(unique(pData(eset)$condition) ,controlCondition)
# discard control from expDesign
#expDesign <- subset(expDesign, condition %in% conditions) # not accepted by CRAN
expDesign <- expDesign[expDesign$condition %in% conditions,]
# add all NAs for control condition
cv <- cbind(cv,rep(NA,nrow(cv)))
names(cv) <- controlCondition
signal <- getRatios(eset,method="paired",log2=F)
}
for(cond in conditions){
cvTmp <- rep(NA,nrow(cv))
colMatch <- expDesign$condition == cond
### if replicates
if(sum(colMatch) > 1){
cvTmp <- getCV(signal[,colMatch])
}
cv <- cbind(cv,cvTmp)
names(cv)[ncol(cv)] <- cond
}
return(cv)
}
#' Get signal at zscore x (x standard deviations below mean)
#' @param intensities refrence run signals
#' @param promille baseline value set as specified promille
#' @return baseline value
#' @export
#' @importFrom stats quantile
#' @note No note
#' @references NA
#' @examples print("No examples")
getBaselineIntensity <- function(intensities , promille = 5){
###
#intensities <- sample(intensities[!is.na(intensities)],1000,replace=T)
if((promille < 1) | (promille > 1001)){
stop("Invalid percentile: ", promille)
}
intensities <- intensities[is.finite(intensities)]
suppressWarnings(return(quantile(intensities,probs = seq(0,1,0.001),na.rm=T)[promille+1]))
}
#' Calculate Coefficiant of Variance per feature (Relative standard Deviation)
#' @param data data.frame of replicate signals
#' @return vector of CVs
#' @export
#' @importFrom stats sd
#' @note No note
#' @details CV = sd / mean
#' @references NA
#' @examples print("No examples")
getCV <- function(data){
return(apply(data,1,sd,na.rm=T)/apply(data,1,mean,na.rm=T))
}
#' Get retentiontime base normalization factors
#' @param eset ExpressionSet
#' @param minFeaturesPerBin minumum number of features per bin. If nb. features are < minFeaturesPerBin -> include neighbouring bins.
#' @return data.frame normalization factors per retention time bin (minute)
#' @import limma Biobase
#' @export
#' @note No note
#' @details No details
#' @references In Silico Instrumental Response Correction Improves Precision of Label-free Proteomics and Accuracy of Proteomics-based Predictive Models, Lyutvinskiy et al. (2013), \url{http://www.ncbi.nlm.nih.gov/pubmed/23589346}
#' @examples print("No examples")
getRTNormFactors <- function(eset, minFeaturesPerBin=100){
### check if eset contain necessary retentionTime colummn
if(is.null(fData(eset)$retentionTime)){
stop("retentionTime not found")
}
### check if isNormAnchor and isFiltered columns are defiend, if -> get normalization factors from nonFiltered anchor proteins
# if(!is.null(fData(eset)$isNormAnchor) & !is.null(fData(eset)$isFiltered)){
# sel <- fData(eset)$isNormAnchor & !fData(eset)$isFiltered
# if(sum(sel) == 0){
# return(stop("Invalid Anchor Protein Selection"))
# }
# eset <- eset[sel,]
# }
# No big difference if using all features. Avoids down stream bug when applying norm factors
### make sure minFeaturesPerBin is not > tot. nb. features
minFeaturesPerBin <- min(c(minFeaturesPerBin,nrow(eset)))
# get all ratios to sample 1
# @TODO How to select reference run?
#ratios <- log2(exprs(eset)) - log2(exprs(eset)[,1])
ratios <- log2(exprs(eset)) - apply(log2(exprs(eset)),1,mean,na.rm=T)
### get median ratio per minute bin
roundedRT <- round(fData(eset)$retentionTime)
rtBin <- sort(unique(round(fData(eset)$retentionTime)))
rtBin <- rtBin[!is.na(rtBin)]
# normalization factors per retention time bin bin
rtNormFactors <- data.frame(row.names=rtBin)
# iterate over samples
for(i in 1:ncol(ratios)){
ratio <- ratios[,i]
rtNFac <- data.frame(row.names=rtBin)
# iterate over all retention time bins
for(rt in rtBin){
match <- roundedRT %in% rt
### while number of features are < minFeaturesPerBin -> include neighbouring bins
rtBins <- rt
while(sum(match) < minFeaturesPerBin ){
rtBins <- ((min(rtBins)-1):(max(rtBins)+1))
match <- roundedRT %in% rtBins
}
# median or sum?
rtNFac[as.character(rt),1] <- median(ratio[match],na.rm=T)
}
# store rt bin norm factors
rtNormFactors <- cbind(rtNormFactors,rtNFac)
}
names(rtNormFactors) <- rownames(pData(eset))
return(rtNormFactors)
}
#' Normalization data per retention time bin
#' @param eset ExpressionSet
#' @param rtNormFactors obtained using getRTNormFactors
#' @return data.frame normalization factors per retention time bin (minute)
#' @export
#' @import limma Biobase
#' @note No note
#' @details Normalize for variations in elelctrospray ionization current.
#' @seealso \code{\link{getRTNormFactors}}
#' @references In Silico Instrumental Response Correction Improves Precision of Label-free Proteomics and Accuracy of Proteomics-based Predictive Models, Lyutvinskiy et al. (2013), \url{http://www.ncbi.nlm.nih.gov/pubmed/23589346}
#' @examples print("No examples")
rtNormalize <- function(eset,rtNormFactors){
### check if eset contain necessary retentionTime colummn
if(is.null(fData(eset)$retentionTime)){
stop("retentionTime not found")
}
roundedRT <- round(fData(eset)$retentionTime)
# FIX
if(sum(!(as.character(roundedRT)[!is.na(roundedRT)] %in% row.names(rtNormFactors))) > 0){
stop("ERROR: Missing R.T. Norm Factors.")
}
for(i in 1:ncol(eset)){
# normalize
nF <- rtNormFactors[as.character(roundedRT),i]
nF[is.na(nF)] <- 0
exprs(eset)[,i] <- 2^(log2(exprs(eset)[,i]) - nF)
}
return(eset)
}
#' Get normalization factors. calculated as summed/median signal per run (column) over summed/median of first run.
#' @param eset ExpressionSet
#' @param method c("sum","median)
#' @return vector of normalization factors
#' @export
#' @note No note
#' @details No details
#' @keywords normalization
#' @references NA
#' @examples print("No examples")
getGlobalNormFactors <- function(eset, method="median"){
if("sum" %in% method){
method <- "sum"
}else{
method <- "median"
}
sel <- rep(T,nrow(eset))
### check if isNormAnchor and isFiltered columns are defined, if -> get normalization factors from nonFiltered anchor proteins
if(!is.null(fData(eset)$isNormAnchor) & !is.null(fData(eset)$isFiltered)){
### only use feature qunatified in all runs for normalization
isAllSignal <- as.vector(apply(is.finite(exprs(eset)),1,sum, na.rm=T) == ncol(eset))
#isCalMix <- fData(eset)$proteinName %in% names(CALIBMIXRATIOS)
#sel <- fData(eset)$isNormAnchor & !fData(eset)$isFiltered & isAllSignal & !isCalMix
sel <- fData(eset)$isNormAnchor & !fData(eset)$isFiltered & isAllSignal
if(sum(sel) == 0){
#stop("Error: getGlobalNormFactors -> Invalid Anchor Protein Selection ")
cat("WARNING: getGlobalNormFactors -> No proteins matching Anchor Protein Selection \n ")
return(rep(1,ncol(eset)))
}
}
rawDataIdx = apply(data.frame(exprs(eset)[sel,]),2, FUN=method, na.rm=T)
normFactors = as.numeric(rawDataIdx[1]) / as.numeric(rawDataIdx)
return(normFactors)
}
#' Normalize, Norm factors calculated as median signal per run (column) over median of first run.
#' @param eset ExpressionSet
#' @param globalNormFactors globalNormFactors
#' @return eset ExpressionSet
#' @export
#' @note No note
#' @details No details
#' @keywords normalization
#' @references NA
#' @seealso getGlobalNormFactors
#' @examples print("No examples")
globalNormalize <- function(eset,globalNormFactors){
normData <- data.frame(matrix(t(apply(exprs(eset), 1, function(.rawData)mapply(globalNormFactors, .rawData, FUN="*"))),ncol=ncol(exprs(eset)),dimnames=list(row.names(exprs(eset)),names(exprs(eset)))))
names(normData) <- sampleNames(eset)
return(createExpressionDataset(as.matrix(normData),pData(eset),fData(eset)))
}
#' Normalize
#' @param eset ExpressionSet
#' @param method c("global","rt","quantile")
#' @return eset ExpressionSet
#' @export
#' @import limma
#' @note No note
#' @details No details
#' @keywords normalization
#' @references NA
#' @seealso getGlobalNormFactors, getRTNormFactors
#' @examples print("No examples")
sqNormalize <- function(eset, method="global"){
esetNorm <- eset
if("global" %in% method){
globalNormFactors <- getGlobalNormFactors(esetNorm,method=method)
### add normalization factors to ExpressionSet
pData(esetNorm)$globalNormFactors <- globalNormFactors
esetNorm <- globalNormalize(esetNorm,globalNormFactors)
}
if("rt" %in% method){
rtNormFactors <- getRTNormFactors(esetNorm, minFeaturesPerBin=100)
esetNorm <- rtNormalize(eset,rtNormFactors)
}
if("quantile" %in% method){
# limma
exprs(eset) <- normalizeQuantiles(exprs(eset))
}
### @ experimental
# if("vsn" %in% method){
# library(vsn)
# esetNorm <- justvsn(eset)
# exprs(esetNorm) <- 2^exprs(esetNorm)
# }
# if(identical(esetNorm,eset)){
# warning("No normalization performed")
# print( apply(exprs(esetNorm),2,median,na.rm=T))
# }
return(esetNorm)
}
#' Summarize replicate signal per condition (min)
#' @param eset ExpressionSet
#' @param method median (default), mean, max, min, sd
#' @return data.frame of per condition signals
#' @export
#' @note No note
#' @details No details
#' @references NA
#' @examples print("No examples")
getSignalPerCondition <- function(eset,method="median"){
conditionNames <- unique(as.character(pData(eset)$condition))
perCondSignal <- data.frame(row.names=rownames(eset))
if(sum(c("median","mean","mean","max", "min","sd") %in% method) == 0 ){
stop("Unknown method ",method)
}
for(cond in conditionNames){
condMatch <- cond== pData(eset)$condition
perCondSignal <- cbind(perCondSignal,apply(subset(exprs(eset),select=which(condMatch)),1,method,na.rm=T))
}
names(perCondSignal) <- conditionNames
return(perCondSignal)
}
# }
#
#
#
# else if(method=="medianOld"){
# for(cond in conditionNames){
# condMatch <- cond== pData(eset)$condition
# if(sum(condMatch) > 1){ # @TODO replace by subset function
# perCondSignal <- cbind(perCondSignal,apply(exprs(eset)[,condMatch],1,median, na.rm=T))
# }else{
# perCondSignal <- cbind(perCondSignal,exprs(eset)[,condMatch])
# }
# }
# }
#
#
#
#
# else if(method=="mean"){
# for(cond in conditionNames){
# condMatch <- cond== pData(eset)$condition
# if(sum(condMatch) > 1){
# perCondSignal <- cbind(perCondSignal,apply(exprs(eset)[,condMatch],1,mean, na.rm=T))
# }else{
# perCondSignal <- cbind(perCondSignal,exprs(eset)[,condMatch])
# }
# }
# }
# else if(method=="max"){
# for(cond in conditionNames){
# condMatch <- cond== pData(eset)$condition
# if(sum(condMatch) > 1){
# perCondSignal <- cbind(perCondSignal,apply(exprs(eset)[,condMatch],1,max, na.rm=T))
# }else{
# perCondSignal <- cbind(perCondSignal,exprs(eset)[,condMatch])
# }
# }
# }else if(method=="min"){
# for(cond in conditionNames){
# condMatch <- cond== pData(eset)$condition
# if(sum(condMatch) > 1){
# perCondSignal <- cbind(perCondSignal,apply(exprs(eset)[,condMatch],1,min, na.rm=T))
# }else{
# perCondSignal <- cbind(perCondSignal,exprs(eset)[,condMatch])
# }
# }
# }else if(method=="sd"){
# for(cond in conditionNames){
# condMatch <- cond== pData(eset)$condition
# if(sum(condMatch) > 1){
# perCondSignal <- cbind(perCondSignal,apply(exprs(eset)[,condMatch],1,sd, na.rm=T))
# }else{
# perCondSignal <- cbind(perCondSignal,NA)
# }
# }
# }
# else{
# stop("Unknown method: method ")
# }
#' Calculate Mean of X most intense features
#' @param entryData data.frame listing feature intensities of one entry. Typically rows corresponds to Peptide entries of one protein
#' @param topX best X flyers
#' @return vector of topX intensities per column (sample)
#' @export
#' @note No note
#' @details No details
#' @references Absolute quantification of proteins by LCMSE: A virtue of parallel MS acquisition, Silva (2006), \url{http://www.ncbi.nlm.nih.gov/pubmed/16219938}, Critical assessment of proteome-wide label-free absolute abundance estimation strategies. Ahrne (2013), \url{http://www.ncbi.nlm.nih.gov/pubmed/23794183}
#' @examples print("No examples")
getTopX <- function(entryData,topX=3){
# if number of rows are fewer than topX
topX <- min(c(topX,nrow(entryData)))
### get row order based on decreasing row sum
if( !is.null(ncol(entryData)) && (ncol(entryData) > 1) ){
o <- order(apply(entryData,1,sum,na.rm=T),decreasing=T)[1:topX]
if(topX==1){
return(entryData[o,])
}
return(apply(entryData[o,],2,mean,na.rm=T))
}
# only one column
o <- order(entryData,decreasing=T)[1:topX]
return(mean(entryData[o],na.rm=T))
}
#' Calculate intensity-based absolute-protein-quantification (iBAQ) metric per protein
#' @param eset protein level ExpressionSet
#' @param proteinDB list protein sequneces
#' @param peptideLength peptide length interval (to get number of peptides used for normalization)
#' @param nbMiscleavages number of mis-cleavages allowed when digesting protein sequneces in silico (to get number of peptides used for normalization)
#' @param proteaseRegExp protease Reg Exp cleavage rule
#' @return ExpressionSet
#' @export
#' @note No note
#' @details No details
#' @references Global quantification of mammalian gene expression control, Schwanhausser (2011), \url{http://www.ncbi.nlm.nih.gov/pubmed/21593866},
#' Critical assessment of proteome-wide label-free absolute abundance estimation strategies. Ahrne (2013), \url{http://www.ncbi.nlm.nih.gov/pubmed/23794183}
#' @examples print("No examples")
getIBAQEset <- function(eset
, proteinDB=NA
, peptideLength = c(5,36)
, nbMiscleavages = 0
, proteaseRegExp=.getProteaseRegExp("trypsin")
){
# get number of detectable peptides per protein
nbPeptides <- vector(length=nrow(eset))
i <- 0
for(i in 1:nrow(eset)){
ac <- as.character(fData(eset)$proteinName[i])
### keep first listed entry sp|P04049|RAF1_HUMAN;sp|P15056|BRAF_HUMAN -> sp|P04049|RAF1_HUMAN
#ac <- gsub("\\;.*","",ac)
nbPep <- NA
if( !is.null(proteinDB[[ac]]) ){
nbPep <- getNbDetectablePeptides(getPeptides(proteinDB[[ac]],proteaseRegExp=proteaseRegExp,nbMiscleavages=nbMiscleavages),peptideLength=peptideLength)
}else{
warning("WARN: ",ac," NOT FOUND IN PROTEIN DATABASE")
#cat("WARN: ",ac," NOT FOUND IN PROTEIN DATABASE\n")
}
nbPeptides[i] <- nbPep
}
### create new "absquant" eset
esetIBAQ <- eset
# scale protein intensity by number of detectable peptides
exprs(esetIBAQ) <- exprs(eset) / nbPeptides
### store normalization factors
fData(esetIBAQ)$nbTrypticPeptides <- nbPeptides
return(esetIBAQ)
}
### require colnames "signal", "cpc"
#' Leave-One-Out Cross Validate Qunatification Model
#' @param df data.frame of two columns 1) "signal" - ms metric 2) "cpc" absolute quantity
#' @return data.frame of fold errors per (left-out) protein
#' @export
#' @note No note
#' @details No details
#' @keywords normalization
#' @references NA
#' @seealso NA
#' @examples print("No examples")
getLoocvFoldError <- function(df){
ok <- is.finite(df[,1]) & is.finite(df[,2])
df <- df[ok,]
foldError <- vector(length=nrow(df))
for (i in 1:nrow(df)) {
fit <- lm(cpc ~ signal, data=df[-i,] )
foldError[i] <- 10^predict(fit,newdata=df[i,]) / 10^df[i,]$cpc
}
### if foldErro < 1, take neg. of inverse
foldError[foldError < 1] <- -1/foldError[foldError < 1]
### return fold error per protein
foldError <- data.frame(foldError,row.names=rownames(df))
return(foldError)
}
#' Set value to NA if it deviatves with more than 1.5 * IQR from lower/upper quantile
#' @param x vector numeric
#' @param na.rm logical indicating whether missing values should be removed.
#' @param ... qunatile args
#' @export
#' @importFrom stats quantile IQR
#' @note No note
#' @details No details
#' @keywords normalization
#' @references NA
#' @seealso NA
#' @examples print("No examples")
removeOutliers <- function(x, na.rm = TRUE, ...){
qnt <- quantile(x, probs=c(.25, .75), na.rm = na.rm, ...)
H <- 1.5 * IQR(x, na.rm = na.rm)
y <- x
y[x < (qnt[1] - H)] <- NA
y[x > (qnt[2] + H)] <- NA
return(y)
}
#' Roll up feature intensites per unique colum combination
#' @param eset ExpressionSet
#' @param featureDataColumnName vector of column names e.g. peptide or proteinName
#' @param method "sum", "mean" or "top3"
#' @return ExpressionSet object
#' @export
#' @details featureDataColumnName = c("peptide","charge","ptm"), method= c("sum"), sums up intensities per peptie modification charge state
#' @import Biobase data.table
#' @note No note
#' @references No references
#' @examples print("No examples")
rollUp <- function(eset, method = "sum", featureDataColumnName = c("proteinName") ){
# HACK to please CRAN CHECK "rollUp: no visible binding for global variable "idScore""
idx <- idScore <- V1 <- allAccessions <- NULL
### apply filter
eset <- eset[!fData(eset)$isFiltered,]
selectedColumns <- names(fData(eset)) %in% featureDataColumnName
allIndexTags <- as.vector(unlist(apply(data.frame(fData(eset)[,selectedColumns]),1,function(t){
return(paste(as.vector(unlist(t)),collapse="_"))
})))
DT <- data.table(
idx = allIndexTags
,exprs(eset)
,fData(eset)
)
setkey(DT,idx)
if(method =="sum"){
rDT <- DT[, lapply(.SD, sum, na.rm=TRUE), by=idx, .SDcols=c(2:(ncol(eset)+1)) ]
rolledAssayData <- data.frame(rDT,row.names=1, check.names=F)
}else if(method =="median"){
rDT <- DT[, lapply(.SD, median, na.rm=TRUE), by=idx, .SDcols=c(2:(ncol(eset)+1)) ]
rolledAssayData <- data.frame(rDT,row.names=1, check.names=F)
}else if(method =="mean"){
rDT <- DT[, lapply(.SD, mean, na.rm=TRUE), by=idx, .SDcols=c(2:(ncol(eset)+1)) ]
rolledAssayData <- data.frame(rDT,row.names=1, check.names=F)
}else if(method =="top3"){
rDT <- DT[, lapply(.SD, getTopX, topX=3), by=idx, .SDcols=c(2:(ncol(eset)+1)) ]
rolledAssayData <- data.frame(rDT,row.names=1, check.names=F)
}else if(method =="top1"){
rDT <- DT[, lapply(.SD, getTopX, topX=1), by=idx, .SDcols=c(2:(ncol(eset)+1)) ]
rolledAssayData <- data.frame(rDT,row.names=1, check.names=F)
}
# idScore
if("idScore" %in% names(DT) ){
### get fData of highest scoring row per rollUP level
indices <- DT[, .I[getMaxIndex(idScore) ], by=idx]$V1
rolledFData <- data.frame(data.frame(DT)[indices,names(fData(eset))],row.names=rownames(rolledAssayData))
# replace idScore colum, by summed score
sumDT <- DT[, sum(idScore,na.rm=T), by=idx ]
rolledFData$idScore = sumDT[,V1]
}else{
# @TODO inefficient solution -> speed up
# allColumns but idx,intensity data and idScore
#selColOld <- which(!(names(DT) %in% c(colnames(exprs(eset)),"idx","idScore")))
selCol <- which((names(DT) %in% names(fData(eset))))
#rolledFDataOld <- data.frame(DT[, lapply(.SD, .getFirstEntry), by=idx, .SDcols=selCol],row.names=rownames(rolledAssayData))[,2:(length(selCol)+1)]
rolledFData <- data.frame(DT[, lapply(.SD, .getFirstEntry), by=idx, .SDcols=selCol],row.names=rownames(rolledAssayData))[,names(fData(eset))]
# print(names(rolledFData))
# print("")
# print(names(rolledFDataOld))
}
### concatenate allAccessions
if("allAccessions" %in% names(DT)){
rolledFData$allAccession <- DT[, list( allAccessionsTMP = paste(unique(unlist(strsplit(paste(allAccessions,collapse=";"),";"))),collapse=";") ), by = key(DT)]$allAccessionsTMP
}
rolledAssayData <- as.matrix(rolledAssayData)
rolledAssayData[rolledAssayData == 0 ] <- NA
#names(rolledFData) <- names(fData(eset))
if(!is.null(rolledFData$isNormAnchor)){
rolledFData$isNormAnchor <- T
}
# reset filter
if(!is.null(rolledFData$isFiltered)){
rolledFData$isFiltered <- F
}
### set peptides per protein
rolledFData$nbPeptides <- getNbPeptidesPerProtein(eset)[as.character(rolledFData$proteinName)]
### if
return(createExpressionDataset(expressionMatrix=rolledAssayData,expDesign=pData(eset),featureAnnotations=rolledFData))
}
#' Per Feature Normalization
#' @param eset ExpressionSet
#' @param normFactors matrix normalization factors (logged) (row names are proteins)
#' @return ExpressionSet object
#' @export
#' @details Example Usage: Normalize phospho peptide signals for Protein Changes
#' @note No note
#' @references No references
#' @examples print("No examples")
perFeatureNormalization <- function(eset,normFactors){
coveredPeptideSel <- fData(eset)$proteinName %in% rownames(normFactors)
if(sum(coveredPeptideSel) == 0){
warning("No shared entries between target data and normalisation data")
return(eset)
}
# make sure target data and norm data have same dimensions
if( !all(colnames(normFactors) %in% pData(eset)$condition) ){
stop("Invalid norm factors")
}
# normalise log ratios
exprs(eset)[coveredPeptideSel,] <- exprs(eset)[coveredPeptideSel, ] - normFactors[as.character(fData(eset)[coveredPeptideSel,]$proteinName),pData(eset)$condition]
return(eset)
}
#' Standardise data
#' @param d vector or data.frame or matrix
#' @return vector or data.frame or matrix
#' @export
#' @note No note
#' @details No details
#' @examples print("No examples")
standardise <- function(d){
d[!is.finite(as.matrix(d))] <- NA
if((class(d) == "data.frame") | (class(d) == "matrix")){
for(i in 1:ncol(d)){
d[,i] <- d[,i]- mean(d[,i],na.rm=T)
d[,i] <- d[,i]/ sd(d[,i],na.rm=T)
}
return(d)
}else{
return( (d - mean(d,na.rm=T)) / sd(d,na.rm=T) )
}
}
#> pData(eset)
#condition isControl subject
#A_rep_1 A FALSE 1
#A_rep_2 A FALSE 2
#B_rep_1 B FALSE 1
#B_rep_2 B FALSE 2
#C_rep_1 C TRUE 1
#C_rep_2 C TRUE 2
#' Create Paired Expdesign
#' @param eset ExpressionSet
#' @return ExpressionSet object
#' @import Biobase
#' @export
#' @note No note
#' @details Add subject colum to phenoData design data.frame
#' @references NA
#' @seealso \code{\link[Biobase]{ExpressionSet}}
#' @examples print("No examples")
createPairedExpDesign <-function(eset){
# make sure all conditions include the same number of runs
nbRunsPerCond <- unique(table(as.character(pData(eset)$condition)))
if(length(nbRunsPerCond) != 1){
print(pData(eset))
stop("ERROR: createPairedExpDesign failed to create paired expDesign")
}
# Paired designnot meaningful if no replicates
if(table(pData(eset)$condition)[1] == 1){
cat("WARNING: createPairedExpDesign Paired experimental design not meaningful when no replicates\n" )
return(eset)
}
pData(eset) <- cbind(pData(eset),subject=as.factor(rep(1:nbRunsPerCond,length(unique(pData(eset)$condition)))))
return(eset)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.