# TODO: Add comment
#
# Author: erikahrne
###############################################################################
### @TODO
parseCSV <- function(file=file,expDesign=expDesign){}
################################## LFQ ##################################
### get column indices of intensity data
#.getProgenesisCsvIntColIndices <- function(file){
#
# con <- file(file)
# open(con);
# line1 <- readLines(con, n = 1)
# close(con)
# intStartCol <- grep("Normalized abundance",unlist(strsplit(line1,",")))
# intEndCol <- grep("Raw abundance",unlist(strsplit(line1,",")))-1
# nbSamples <- intEndCol - intStartCol + 1
# return((intStartCol:intEndCol)+nbSamples)
#}
## get column indices of spectral count data
#' @export
.getProgenesisCsvExpressionColIndices <- function(file, method="auc"){
con <- file(file)
open(con);
line1 <- readLines(con, n = 1)
close(con)
if(grepl("Raw abundance",line1)){
### non-fractionated data
intStartCol <- grep("Normalized abundance",unlist(strsplit(line1,",")))
intEndCol <- grep("Raw abundance",unlist(strsplit(line1,",")))-1
nbSamples <- intEndCol - intStartCol + 1
}else{
### fractionated data
intStartCol <- grep("Normalized abundance",unlist(strsplit(line1,",")))
intEndCol <- grep("ectral counts",unlist(strsplit(line1,",")))-1
nbSamples <- intEndCol - intStartCol + 1
intStartCol <- intStartCol - nbSamples
intEndCol <- intEndCol - nbSamples
}
if(method =="auc"){
return(return((intStartCol:intEndCol)+nbSamples))
}else if(method =="spc"){
startCol <- grep("ectral counts",unlist(strsplit(line1,",")))
endCol <- startCol+nbSamples-1
return((startCol:endCol))
}else{
stop("Unknown quantification indices\n")
}
#@TODO fractionated data. Exports only contain Normalized Abundance data
}
# Get Experimental
#' Parse Experimental Design from Progenesis Csv Export
#' @param file path to progenesis csv file
#' @param expressionColIndices default .getProgenesisCsvExpressionColIndices(file)
#' @return data.frame describing experimental design
#' @export
#' @importFrom utils read.csv
#' @note No note
#' @details No details
#' @references NA
#' @examples print("No examples")
getExpDesignProgenesisCsv <- function(file, expressionColIndices = .getProgenesisCsvExpressionColIndices(file)){
header <- read.csv(file,nrows=2, check.names=F)[,expressionColIndices]
header <- data.frame(header) ## in case only one cond one sample
conditionLine <- as.character(unlist(header[1,]))
currentCond <- as.character(conditionLine[1])
condition <- c()
for(i in 1:length(conditionLine)){
if(nchar(conditionLine[i]) > 0 ){
currentCond <- conditionLine[i]
}
condition <- c(condition,currentCond)
}
## set control condition
isControl <- rep(F,length(condition))
isControl[condition[1] == condition] <- T
### check if sample names are unique, if not, -> add running number
# substitute - -> .
#sampleNames <- gsub("\\-",".",as.character(as.vector(unlist(header[2,]))))
sampleNames <- as.character(as.vector(unlist(header[2,])))
if(T %in% (table(sampleNames) > 1)) sampleNames <- paste(sampleNames,1:length(sampleNames),sep="_")
expDesign <- data.frame(
condition=condition
,isControl=isControl
,row.names=sampleNames
)
return(expDesign)
}
#' Parse Progenesis Protein Csv
#' @param file path to Progenesis Protein csv file
#' @param expDesign experimental design data.frame
#' @param method auc (area under curve) or spc (spectral count)
#' @return ExpressionSet object
#' @export
#' @importFrom utils read.csv
#' @note No note
#' @details No details
#' @references NA
#' @seealso \code{\link[Biobase]{ExpressionSet}}
#' @examples print("No examples")
parseProgenesisProteinCsv <- function(file=file,expDesign=expDesign, method="auc"){
res <- read.csv(file,skip=2,allowEscapes=T, check.names=F)
expMatrix <- as.matrix(res[,.getProgenesisCsvExpressionColIndices(file, method=method)])
# set 0 features to NA
expMatrix[expMatrix == 0] <- NA
# discard features where all intensities are NA (i.e. 0)
# allColNA <- as.vector(apply(expMatrix,1,function(r){ return(sum(!is.na(r)) == 0)}))
row.names(expMatrix) <- res$Accession
# [1] "Accession" "Peptide count"
# [3] "Peptides used for quantitation" "Confidence score"
# [5] "Anova (p)*" "Max fold change"
# [7] "Highest mean condition" "Lowest mean condition"
# [9] "Description" "A11-03216"
featureAnnotations <- data.frame(
proteinName=res$Accession
,geneName= res$Description %>% as.character %>% getGeneName()
,ac= res$Accession %>% as.character %>% getAccessionNumber()
,proteinDescription=res$Description
,idScore=res[,"Confidence score"]
# ,nbPeptides=res[,"Peptides used for quantitation"] ### old Progenesis
,nbPeptides=res[,which(names(res) == "Unique peptides" | names(res) == "Peptides used for quantitation")[1]] # Progensis QI
,isNormAnchor=rep(T,nrow(expMatrix))
,isFiltered=rep(F,nrow(expMatrix))
,row.names=res$Accession)
#featureAnnotations <- featureAnnotations[!allColNA,]
### strip off added .1 A11.03216.1 -> A11.03216
#colnames(expMatrix) <- gsub("\\.1$","",colnames(expMatrix))
### re-order and exclude channels
#expMatrix <- as.matrix(expMatrix[!allColNA ,rownames(expDesign)])
### re-order and exclude runs in agreement with experimental design
expMatrix <- subset(expMatrix,select=rownames(expDesign) )
### filter subset
subset <- !((rowSums(!is.na(expMatrix)) == 0) %>% as.vector)
expMatrix <- subset(expMatrix,subset=subset )
featureAnnotations <- subset(featureAnnotations,subset)
colnames(expMatrix) <- rownames(expDesign)
return(createExpressionDataset(expressionMatrix=expMatrix,expDesign=expDesign,featureAnnotations=featureAnnotations))
}
#' Parse Progenesis Feature Csv Export
#' @param file path to Progenesis Feature csv file
#' @param expDesign experimental design data.frame
#' @param method auc (area under curve) or spc (spectral count)
#' @return ExpressionSet object
#' @export
#' @import Biobase
#' @importFrom utils read.csv
#' @note No note
#' @details No details
#' @references NA
#' @seealso \code{\link[Biobase]{ExpressionSet}}
#' @examples print("No examples")
parseProgenesisFeatureCsv <- function(file=file,expDesign=getExpDesignProgenesisCsv(file), method="auc"){
### stop if not all samples labelled with a given condition are assigned as control
# Example:
# condition isControl
# A11.09066 Condition 1 TRUE
# A11.09067 Condition 1 FALSE
# A11.09068 Condition 1 FALSE
# A11.09070 Condition 2 FALSE
# A11.09071 Condition 2 FALSE
# A11.09072 Condition 2 FALSE
if(length(unique(expDesign$condition)) == length(unique(expDesign[!expDesign$isControl,]$condition))){
stop("Invalid Exp. Design")
}
# read csv file
res <- read.csv(file,skip=2,allowEscapes=T,check.names=F)
expMatrix <- as.matrix(res[,.getProgenesisCsvExpressionColIndices(file, method=method)])
# set 0 features to NA
expMatrix[expMatrix == 0] <- NA
# discard features where all intensities are NA (i.e. 0)
#allColNA <- as.vector(apply(expMatrix,1,function(r){ return(sum(!is.na(r)) == 0)}))
### Mass filed missing in old Progenesis Feature export
if(is.null(res$Mass)){res$Mass <- rep(NA,nrow(expMatrix))}
# added
ptm <- res[,"Variable modifications ([position] description)"]
# strip ; character, which appears sometimes (Proteome Discoverer data?) example: modifAnnot: [5] Phospho; (S)|[7] Phospho (S)
ptm <- gsub("\\;","",ptm)
nbPtmsPerPeptide <- unlist(lapply(ptm,function(t){
t <- as.character(t)
return(sum(unlist(gregexpr("\\|",t)[[1]]) > 0) + (nchar(t) > 0) )}))
# "#"
# [2] "m/z"
# [3] "Retention time (min)"
# [4] "Retention time window (min)"
# [5] "Mass"
# [6] "Charge"
# [7] "Max fold change"
# [8] "Highest mean condition"
# [9] "Lowest mean condition"
# [10] "Anova"
# [11] "Maximum CV"
# [60] "Notes"
# [61] "Score"
# [62] "Mass error (u)"
# [63] "Mass error (ppm)"
# [64] "Protein"
# [65] "Sequence"
# [66] "Variable modifications ([position] description)"
# [67] "Description"
featureAnnotations <- data.frame(
proteinName=res$Protein
,geneName= res$Description %>% as.character %>% getGeneName()
,ac= res$Protein %>% as.character %>% getAccessionNumber()
,proteinDescription=res$Description
,peptide=res$Sequence
,idScore=as.numeric(as.character(res$Score))
,mass=res$Mass
,pMassError=res[,"Mass error (ppm)"]
,mz=res[,"m/z"]
,retentionTime=res[,"Retention time (min)"]
,charge=as.numeric(res$Charge)
,ptm=ptm
,isNormAnchor=rep(T,nrow(expMatrix))
,isFiltered=rep(F,nrow(expMatrix))
# ,row.names=res$Accession
# added
,nbPtmsPerPeptide = nbPtmsPerPeptide
)
### re-order and exclude runs in agreement with experimental design
expMatrix <- subset(expMatrix,select=rownames(expDesign) )
### filter subset
isPep <- !is.na(featureAnnotations$idScore) # discard non peptide annotated rows
allColNA <- (rowSums(!is.na(expMatrix)) == 0) %>% as.vector
subset <- !allColNA & isPep
expMatrix <- subset(expMatrix,subset=subset )
featureAnnotations <- subset(featureAnnotations,subset)
#featureAnnotations <- data.frame(featureAnnotations)[!allColNA & isPep,]
### strip off added .1 A11.03216.1 -> A11.03216
#colnames(expMatrix) <- gsub("\\.1$","",colnames(expMatrix))
# ### re-order and exclude channels
# if(ncol(expMatrix) > 1){
# expMatrix <- as.matrix(expMatrix[!allColNA & isPep ,rownames(expDesign)])
# }else{ # to avoid crash when only one run
# expMatrix <- as.matrix(expMatrix[!allColNA & isPep,])
# }
colnames(expMatrix) <- rownames(expDesign)
return(createExpressionDataset(expressionMatrix=expMatrix,expDesign=expDesign,featureAnnotations=featureAnnotations))
}
#' Parse Progenesis Peptide Measurement Csv Export
#' @param file path to Progenesis Peptide Measurement csv file
#' @param expDesign experimental design data.frame
#' @param method auc (area under curve) or spc (spectral count)
#' @param expressionColIndices default .getProgenesisCsvExpressionColIndices()
#' @param specificPeptides T/F keep specific peptides only
#' @return ExpressionSet object
#' @export
#' @import Biobase
#' @importFrom utils read.csv
#' @note No note
#' @details No details
#' @references NA
#' @seealso \code{\link[Biobase]{ExpressionSet}}
#' @examples print("No examples")
parseProgenesisPeptideMeasurementCsv <- function(file,expDesign=expDesign, method="auc",
expressionColIndices = .getProgenesisCsvExpressionColIndices(file, method=method) , exclusivePeptides=F ){
# HACK to please CRAN CHECK "rollUp: no visible binding for global variable "Sequence""
Grouped <- Sequence <- Score <- resDTIndex <- proteinScore <- Accession <- NULL
### stop if not all samples labelled with a given condition are assigned as control
# Example:
# condition isControl
# A11.09066 Condition 1 TRUE
# A11.09067 Condition 1 FALSE
# A11.09068 Condition 1 FALSE
# A11.09070 Condition 2 FALSE
# A11.09071 Condition 2 FALSE
# A11.09072 Condition 2 FALSE
if(length(unique(expDesign$condition)) == length(unique(expDesign[!expDesign$isControl,]$condition))){
stop("Invalid Exp. Design")
}
# read csv file
res <- read.csv(file,skip=2,allowEscapes=T,check.names=F)
# [1] "#"
# [2] "Retention time (min)"
# [3] "Charge"
# [4] "m/z"
# [5] "Measured mass"
# [6] "Mass error (u)"
# [7] "Mass error (ppm)"
# [8] "Score"
# [9] "Sequence"
# [10] "Modifications"
# [11] "Accession"
# [12] "All accessions (for this sequence)"
# [13] "Grouped accessions (for this sequence)"
# [14] "Shared accessions (for this sequence)"
# [15] "Description"
# [16] "Use in quantitation"
# [17] "A15-02164"
# [18] "A15-02164"
# [19] "A15-02164"
# [20] "Mass"
################################## PROTEIN INFERENCE ##################################
# OCCAMS RAZOR IMPLEMENTATION (HIGHEST SCORING PROTEIN TAKES IT ALL)
# NOT COMPATIBLE WITH STANDARD iBAQ AND TOP3 ABSOLUT QUANTIFICATION! # EXAMPLE: Peptide Measurement Output
# Accession sp|O43707|ACTN4_HUMAN
# All accessions (for this sequence) sp|O43707|ACTN4_HUMAN;sp|P12814|ACTN1_HUMAN;sp|P35609|ACTN2_HUMAN;sp|Q08043|ACTN3_HUMAN
# Grouped accessions (for this sequence) sp|P35609|ACTN2_HUMAN;sp|Q08043|ACTN3_HUMAN
# Shared accessions (for this sequence) sp|P12814|ACTN1_HUMAN
# Here sp|P35609|ACTN2_HUMAN;sp|Q08043|ACTN3_HUMAN has no other peptides than those shared with (never appears without) sp|O43707|ACTN4_HUMAN
# Feature Score Sequence Accession All accessions (for this sequence) Grouped accessions (for this sequence) Shared accessions (for this sequence)
# 1770 62.91 AALSALESFLK sp|P78527|PRKDC_HUMAN sp|P78527|PRKDC_HUMAN
# A "Grouped accessions (for this sequence)" never appears in the "Accession" column as ProteinScore(GroupedProtein) <= ProteinScore(Accession)
# allGrouped <- as.vector(unlist(lapply(resDT$"Grouped accessions (for this sequence)",function(t){strsplit(as.character(t),";")})))
# sum(allGrouped %in% as.character(resDT$Accession))
# OCCAMS RAZOR IMPLEMENTATION (HIGHEST SCORING PROTEIN/PROTEIN GROUP TAKES IT ALL)
# NOT COMPATIBLE WITH STANDARD iBAQ AND TOP3 ABSOLUT QUANTIFICATION!
cat("INFO: ASSIGNING PEPTIDES TO PROTEINS APPLYING OCCAM'S RAZOR \n" )
#Check that file includes column "Grouped accessions (for this sequence)"
#if(!("Grouped accessions (for this sequence)" %in% names(res))) stop("Column \"Grouped accessions (for this sequence)\" missing in file ",file)
if(!("Grouped accessions (for this sequence)" %in% names(res))){
res$'Grouped accessions (for this sequence)' = ""
}
names(res)[1] <- "Feature"
names(res)[grepl("^Grouped",names(res))] <- "Grouped"
res$Score <- as.numeric(as.character(res$Score))
resDT <- data.table(res,key="Accession")
resDT$Grouped <- as.character(resDT$Grouped)
#resDT$"Grouped accessions (for this sequence)" <- as.character(resDT$"Grouped accessions (for this sequence)")
resDT$Accession <- as.character(resDT$Accession)
# 1) FIND PROTEIN GROUPS. A PROTEIN GROUP IS A SET OF PROTEINS THAT A) SHARE THE EXACT SAME PEPTIDES.
# B) All proteins of the group ONLY map to this set of peptides
# In the Progensis Export "Peptide Measurments", the column "Grouped accessions (for this sequence)"
# is defined according to Condition A) only.
# "leading" refers to column "Accession"
# Find all accessions that only appears as grouped
#isGroup <- nchar(resDT$"Grouped accessions (for this sequence)") > 0
isGroup <- nchar(resDT$Grouped) > 0
groupedAccessions <- unique(resDT$Accession[isGroup])
nonGroupedAccessions <- unique(resDT$Accession[!isGroup])
onlyGrouped <- groupedAccessions[ !(groupedAccessions %in% nonGroupedAccessions) ]
isOnlyGrouped <- resDT$Accession %in% onlyGrouped
# leadingGroupAcTable <- table(unique(data.frame(resDT$Accession[isOnlyGrouped],resDT$"Grouped accessions (for this sequence)"[isOnlyGrouped]))[,1])
leadingGroupAcTable <- table(unique(data.frame(resDT$Accession[isOnlyGrouped],resDT$Grouped[isOnlyGrouped]))[,1])
# leading AC only part of one group
leadingACTrueGroup <- names(leadingGroupAcTable[leadingGroupAcTable == 1])
# add column myProteinGroup
resDT <- cbind(resDT,myProteinGroup= resDT$Accession )
isTrueGroup <- resDT$Accession %in% leadingACTrueGroup
# resDT$myProteinGroup[isTrueGroup] <- paste(resDT[isTrueGroup,]$Accession,resDT[isTrueGroup,]$"Grouped accessions (for this sequence)",sep=";")
resDT$myProteinGroup[isTrueGroup] <- paste(resDT[isTrueGroup,]$Accession,resDT[isTrueGroup,]$Grouped,sep=";")
# 2) ASSIGN BEST SCORING PEPTIDE TO EACH FEATURE
# Note that a peptide can be listed multiple times per
# Feature, but assigned different ACs (i.e. the peptide is shared)
# A) Roll-up on feature level, keeping the best scoring peptide
# B) list all featureNB_peptides to be kept
# C) filter resDT keeping only these featureNB_peptides
setkey(resDT,"Feature")
# A) Roll-up on feature level, keeping the best scoring peptide
featureDTTmp <- resDT[, list( peptide = Sequence[order(Score,decreasing=T)[1]]), by = key(resDT)]
# B) list all featureNB_peptides to be kept
keptFeaturePeptides <- unique(paste(featureDTTmp$Feature,as.character(featureDTTmp$peptide),sep=""))
# C) filter resDT keeping only these featureNB_peptides
resDT <- resDT[paste(resDT$Feature,as.character(resDT$Sequence),sep="") %in% keptFeaturePeptides,]
# 3) Score all proteins
setkey(resDT,"Accession")
proteinDT <- resDT[, list(proteinScore = sum(Score,na.rm=T)), by = key(resDT)]
rownames(proteinDT) <- proteinDT$Accession
# 4) Add proteinScore to resDT
#resDT <- cbind(resDT,proteinScore=proteinDT[resDT$Accession,]$proteinScore, tmpAC = proteinDT[resDT$Accession,]$Accession) # check
resDT <- cbind(resDT,proteinScore=proteinDT[resDT$Accession,]$proteinScore)
# 5) Roll-up on feature level, keeping OCCAMS myProteinGroup per Feature
setkey(resDT,"Feature")
resDT <- cbind(resDT,resDTIndex=1:nrow(resDT))
featureDT <- resDT[resDT[, list( selectedResDTIndex = resDTIndex[order(proteinScore,decreasing=T)[1]]), by = key(resDT)]$selectedResDTIndex,]
# 6) Roll-up on peptide level concatenating all proteins matching a given peptide
# create peptide to protein dict
setkey(resDT,"Sequence")
#peptideDT <- resDT[, list(allAccessions = paste(unique(Accession),collapse=";")), by = key(resDT)] # could also include groups..
peptideDT <- resDT[, list(allAccessions = paste(unique(c(unlist(strsplit(Grouped,";")),Accession ) %>% na.omit ),collapse=";")), by = key(resDT)]
rownames(peptideDT) <- peptideDT$Sequence
# get allProteins of a given peptide
featureDT <- cbind(featureDT,peptideDT[as.character(featureDT$Sequence),])
################################## PROTEIN INFERENCE END ##################################
expMatrix <- as.matrix(featureDT[,expressionColIndices,with=FALSE])
# set 0 features to NA
expMatrix[expMatrix == 0] <- NA
### Mass field missing in old Progenesis Feature export
#if(is.null(featureDT$Mass)){featureDT$Mass <- rep(NA,nrow(expMatrix))}
# added
ptm <- featureDT$Modifications
# strip ; character, which appears sometimes (Proteome Discoverer data?) example: modifAnnot: [5] Phospho; (S)|[7] Phospho (S)
ptm <- gsub("\\;","",ptm)
nbPtmsPerPeptide <- unlist(lapply(as.character(ptm),function(t){
return(sum(unlist(gregexpr("\\|",t)[[1]]) > 0) + (nchar(t) > 0) )}))
suppressWarnings(score <- as.numeric(as.character(featureDT$Score)))
### non scored entries are assigned a score of zero
score[is.na(score)] <- 0
measuredMass = featureDT$"Measured mass" %>% as.character %>% as.numeric
massError = featureDT$"Mass error (ppm)" %>% as.character %>% as.numeric
mz = featureDT$"m/z" %>% as.character %>% as.numeric
rt = featureDT$"Retention time (min)" %>% as.character %>% as.numeric
if(is.null(measuredMass) | (length(measuredMass) == 0)){measuredMass <- rep(NA,nrow(expMatrix))}
if(is.null(massError) | (length(massError) == 0)){massError <- rep(NA,nrow(expMatrix))}
if(is.null(mz) | (length(mz) == 0)){mz <- rep(NA,nrow(expMatrix))}
if(is.null(rt) | (length(rt) == 0)){rt <- rep(NA,nrow(expMatrix))}
if(!is.null(featureDT$Charge)){charge = as.numeric(featureDT$Charge)}else{charge = rep(NA,nrow(expMatrix))}
featureAnnotations <- data.frame(
proteinName=featureDT$myProteinGroup
,ac=featureDT$myProteinGroup %>% as.character %>% getAccessionNumber()
,proteinDescription=featureDT$Description
,geneName=featureDT$Description %>% as.character %>% getGeneName()
,peptide=featureDT$Sequence
,idScore= score
,mass=measuredMass
,pMassError=massError
,mz=mz
,retentionTime=rt
,charge=charge
,ptm=ptm
,isNormAnchor=rep(T,nrow(expMatrix))
,isFiltered=rep(F,nrow(expMatrix))
,nbPtmsPerPeptide = nbPtmsPerPeptide
,allAccessions = featureDT$allAccessions
,nbProteinConflicts = unlist(lapply(featureDT$allAccessions,function(t){length(grep(";",strsplit(t,"")[[1]]))})) # e.g. sp|Q5T1J5|CHCH9_HUMAN;sp|Q9Y6H1|CHCH2_HUMAN
# ,row.names=res$Accession
# added
)
# discard non peptide annotated rows
isPep <- score > 0
isExclusive <- rep(T,nrow(featureAnnotations))
if(exclusivePeptides){ # keep unique peptides only
isExclusive <- featureAnnotations$nbProteinConflicts == 0
# cat("INFO: Discarded Non-unique peptide features", sum(!isExclusive)," / ", length(isExclusive),"\n" )
}
### re-order and exclude runs in agreement with experimental design
expMatrix <- subset(expMatrix,select=rownames(expDesign) )
### filter subset
allColNA <- (rowSums(!is.na(expMatrix)) == 0) %>% as.vector
subset <- !allColNA & isPep & isExclusive
expMatrix <- subset(expMatrix,subset=subset )
featureAnnotations <- subset(featureAnnotations,subset)
### strip off added .1 A11.03216.1 -> A11.03216
#colnames(expMatrix) <- gsub("\\.1$","",colnames(expMatrix))
### re-order and exclude channels
#expMatrix <- subset(expMatrix,subset=subset, select=rownames(expDesign) )
# if(ncol(expMatrix) > 1){
# expMatrix <- as.matrix(expMatrix[!allColNA & isPep,rownames(expDesign)])
# }else{ # to avoid crash when only one run
# expMatrix <- as.matrix(expMatrix[!allColNA & isPep,])
# }
colnames(expMatrix) <- rownames(expDesign)
return(createExpressionDataset(expressionMatrix=expMatrix,expDesign=expDesign,featureAnnotations=featureAnnotations))
}
#parseProgenesisPeptideMeasurementCsv2 <- function(file,expDesign=expDesign, method="auc" ){
#
# ### stop if not all samples labelled with a given condition are assigned as control
# # Example:
# # condition isControl
# # A11.09066 Condition 1 TRUE
# # A11.09067 Condition 1 FALSE
# # A11.09068 Condition 1 FALSE
# # A11.09070 Condition 2 FALSE
# # A11.09071 Condition 2 FALSE
# # A11.09072 Condition 2 FALSE
# if(length(unique(expDesign$condition)) == length(unique(expDesign[!expDesign$isControl,]$condition))){
# stop("Invalid Exp. Design")
# }
#
# # read csv file
# res <- read.csv(file,skip=2,allowEscapes=T,check.names=F)
# # [1] "#"
## [2] "Retention time (min)"
## [3] "Charge"
## [4] "m/z"
## [5] "Measured mass"
## [6] "Mass error (u)"
## [7] "Mass error (ppm)"
## [8] "Score"
## [9] "Sequence"
## [10] "Modifications"
## [11] "Accession"
## [12] "All accessions (for this sequence)"
## [13] "Grouped accessions (for this sequence)"
## [14] "Shared accessions (for this sequence)"
## [15] "Description"
## [16] "Use in quantitation"
## [17] "A15-02164"
## [18] "A15-02164"
## [19] "A15-02164"
## [20] "Mass"
#
# expMatrix <- as.matrix(res[,.getProgenesisCsvExpressionColIndices(file, method=method)])
#
# # set 0 features to NA
# expMatrix[expMatrix == 0] <- NA
# # discard features where all intensities are NA (i.e. 0)
# allColNA <- as.vector(apply(expMatrix,1,function(r){ return(sum(!is.na(r)) == 0)}))
#
# ### Mass filed missing in old Progenesis Feature export
# if(is.null(res$Mass)){res$Mass <- rep(NA,nrow(expMatrix))}
#
# # added
# ptm <- res[,"Modifications"]
# nbPtmsPerPeptide <- unlist(lapply(ptm,function(t){
# t <- as.character(t)
# return(sum(unlist(gregexpr("\\|",t)[[1]]) > 0) + (nchar(t) > 0) )}))
#
# if("All accessions (for this sequence)" %in% names(res)){
# protein <- res[,"All accessions (for this sequence)"]
# }else{
# cat("WARN: All accessions per Peptide were not exported \n")
# protein <- res[,"Accession"]
# }
#
# suppressWarnings(score <- as.numeric(as.character(res$Score)))
# ### non scored entries are assigned a score of zero
# score[is.na(score)] <- 0
#
# # sort protein accessions
## protein <- as.character(protein)
## protein <- as.vector(unlist(lapply(protein, function(prot){
##
## prot <- paste(sort(as.vector(unlist(strsplit(prot,";")))),collapse=";")
## return(prot)
##
## } )))
#
# featureAnnotations <- data.frame(
# proteinName=protein
# ,proteinDescription=res$Description
# ,peptide=res$Sequence
# ,idScore= score
# ,mass=res[,"Measured mass"]
# ,pMassError=res[,"Mass error (ppm)"]
# ,mz=res[,"m/z"]
# ,retentionTime=res[,"Retention time (min)"]
# ,charge=as.numeric(res$Charge)
# ,ptm=ptm
# ,isNormAnchor=rep(T,nrow(expMatrix))
# ,isFiltered=rep(F,nrow(expMatrix))
# # ,row.names=res$Accession
# # added
# ,nbPtmsPerPeptide = nbPtmsPerPeptide
# )
#
# ### strip off added .1 A11.03216.1 -> A11.03216
# #colnames(expMatrix) <- gsub("\\.1$","",colnames(expMatrix))
# ## @TODO what if "X001_Yewtivya" "001_Yewtivya"
# #grepl("X[0-9]",colnames(expMatrix))
#
# ### re-order and exclude channels
# #expMatrix <- expMatrix[,rownames(expDesign)]
#
# #return(createExpressionDataset(expressionMatrix=expMatrix[!allColNA,],expDesign=expDesign,featureAnnotations=featureAnnotations[!allColNA,]))
#
# # discard non peptide annotated rows
# isPep <- nchar(as.character(featureAnnotations$peptide)) > 0
# featureAnnotations <- data.frame(featureAnnotations)[!allColNA & isPep,]
#
# ### strip off added .1 A11.03216.1 -> A11.03216
# #colnames(expMatrix) <- gsub("\\.1$","",colnames(expMatrix))
#
# ### re-order and exclude channels
# if(ncol(expMatrix) > 1){
# expMatrix <- as.matrix(expMatrix[!allColNA & isPep,rownames(expDesign)])
# }else{ # to avoid crash when only one run
# expMatrix <- as.matrix(expMatrix[!allColNA & isPep,])
# }
#
# colnames(expMatrix) <- rownames(expDesign)
#
# return(createExpressionDataset(expressionMatrix=expMatrix,expDesign=expDesign,featureAnnotations=featureAnnotations))
#}
################################## LFQ END ##############################
################################## TMT ##################################
### get line number from which Scaffold file should be read
#' @export
.getSkipLineNb <- function(fileName){
conn<-file(fileName,open="r")
suppressWarnings(linn<-readLines(conn))
skip <- 1
while(regexpr("accession", ignore.case = T,as.character(linn[skip])) == -1 ){
#while(regexpr("Bio Sample",as.character(linn[skip])) == -1 ){
skip <- skip +1
}
close(conn)
return(skip-1)
}
### 6-plex or 10-plex
#' @export
.getNbPlex <- function(fileName){
### parse data
res <- read.csv(fileName,sep="\t",skip=.getSkipLineNb(fileName),nrows=1)
return(length(grep("^Normalized",names(res))))
}
### get file type
#' @export
.getFileType <- function(file){
if( sum(grepl("Modifications",names(read.csv(file,skip=2, nrows=1)))) > 0 ){
return("ProgenesisPeptide")
}else if( sum(grepl("Retention.time..min",names(read.csv(file,skip=2, nrows=1)))) > 0 ){
return("ProgenesisFeature")
}else if(sum( grepl("Peptide.count",names(read.csv(file,skip=2, nrows=1)) )) > 0 ){
return("ProgenesisProtein")
}else if((grepl("^Identification.Criteria",names(read.csv(file,skip=2, nrows=1))) > 0) &
(grepl("^Experiment",names(read.csv(file,skip=1, nrows=0))) > 0)){
return("ScaffoldTMT")
}else if(sum(grepl("^LFQ",names(read.csv(file,allowEscapes=T, check.names=F,sep="\t",nrows=1)))) > 0
& sum(grepl("Fasta headers",names(read.csv(file,allowEscapes=T, check.names=F,sep="\t",nrows=1)))) > 0
){
return("MaxQuantProteinGroup")
}else if(F){
return("GenericCSV")
}else{
return("Unknown")
}
}
#' Parse scaffold output .xls file (RAW export)
#' @param file path to Scaffold file
#' @param expDesign experimental design data.frame
#' @param keepFirstAcOnly TRUE/FALSE If multiple ACs in Accession.Numbers filed. Then keep the first one only
#' @param isPurityCorrect do purity correction
#' @return ExpressionSet object
#' @export
#' @import Biobase
#' @note No note
#' @details No details
#' @references NA
#' @seealso \code{\link[Biobase]{ExpressionSet}}
#' @examples print("No examples")
parseScaffoldRawFile <- function(file, expDesign=expDesign,keepFirstAcOnly=FALSE,isPurityCorrect=T){
### parse data
res <- read.csv(file,sep="\t",skip=.getSkipLineNb(file))
if(keepFirstAcOnly) res$Accession.Numbers <- gsub("\\,.*","",as.character(res$Accession.Numbers))
res$Accession.Numbers <- gsub("\\[","",as.character(res$Accession.Numbers))
res$Accession.Numbers <- gsub("\\]","",as.character(res$Accession.Numbers))
###filter out con and decoy
### HACK, to make sure GFP proteins are not discarded
res$Accession.Numbers <- gsub(".*P42212.*","sp|P42212|GFP",res$Accession.Numbers)
# CREATE EXPRESSION SET
nbPlex <- length(grep("^Normalized",names(res)))
expMatrix <- as.matrix(res[, 10:(10+(nbPlex-1)) ])
if(isPurityCorrect){
expMatrix <- purityCorrectTMT(expMatrix,getImpuritiesMatrix(nbPlex))
}
### re-order and exclude channels
# expMatrix <- expMatrix[,as.numeric(rownames(expDesign))]
# colnames(expMatrix) <- rownames(expDesign)
# set 0 features to NA
expMatrix[expMatrix == 0] <- NA
# discard features where all intensities are NA (i.e. 0)
allColNA <- as.vector(apply(expMatrix,1,function(r){ return(sum(!is.na(r)) == 0)}))
### re-order and exclude channels
expMatrix <- as.matrix(expMatrix[!allColNA ,as.numeric(rownames(expDesign))])
colnames(expMatrix) <- rownames(expDesign)
featureAnnotations <- data.frame(
proteinName=res$Accession.Numbers
,ac= res$Accession.Numbers %>% as.character %>% getAccessionNumber()
,geneName= res$Protein.Name %>% as.character %>% getGeneName()
,proteinDescription=res$Protein.Name
,peptide=res$Peptide.Sequence
#,idScore=res$Score
#,mass=res$Acquired.Plus.One.Mass-1
#,pMassError=res$Mass.error..ppm.
,mz=((as.numeric(as.character(res$Acquired.Plus.One.Mass)) -1)+res$Charge)/res$Charge
#,retentionTime=res$Retention.time..min.
,charge=as.numeric(res$Charge)
,spectrumName = res$Spectrum.Name
#,ptm=res$Variable.modifications...position..description.
,isNormAnchor=rep(T,nrow(res))
,isFiltered=rep(F,nrow(res))
)
featureAnnotations <- subset(featureAnnotations,subset=!allColNA)
### strip off added .1 A11.03216.1 -> A11.03216
#colnames(expMatrix) <- gsub("\\.1$","",colnames(expMatrix))
#print(head(expMatrix))
eset <- createExpressionDataset(expressionMatrix=expMatrix,expDesign=expDesign,featureAnnotations=featureAnnotations)
# #rownames(eset) <- fData(eset)$spectrumName # spectrum names are not always unique (?)
return(eset)
}
### maxQuant protein group parser
#' Parse MaxQuant Protein Group Txt
#' @param file path to MaxQuant Protein txt file
#' @param expDesign experimental design data.frame
#' @param method auc (area under curve) or spc (spectral count)
#' @return ExpressionSet object
#' @export
#' @note No note
#' @details No details
#' @references NA
#' @seealso \code{\link[Biobase]{ExpressionSet}}
#' @examples print("No examples")
parseMaxQuantProteinGroupTxt <- function(file=file,expDesign=expDesign, method="auc"){
res <- read.csv(file,allowEscapes=T, check.names=F,sep="\t")
### get
if(method == "auc"){
expMatrix <- as.matrix(res[,grepl("^LFQ",names(res))])
}else if(method == "spc"){
expMatrix <- as.matrix(res[,grepl("^MS\\/MS Count",names(res))])
}else{
stop("Unknown parsing method:", method )
}
# set 0 features to NA
expMatrix[expMatrix == 0] <- NA
# rename runs 1:X
colnames(expMatrix) <- 1:ncol(expMatrix)
# discard features where all intensities are NA (i.e. 0)
allColNA <- as.vector(apply(expMatrix,1,function(r){ return(sum(!is.na(r)) == 0)}))
# names(res)
# [1] "Protein IDs"
# [2] "Majority protein IDs"
# [3] "Peptide counts (all)"
# [4] "Peptide counts (razor+unique)"
# [5] "Peptide counts (unique)"
# [6] "Protein names"
# [7] "Gene names"
# [8] "Fasta headers"
# [9] "Number of proteins"
# [10] "Peptides"
# [11] "Razor + unique peptides"
# [12] "Unique peptides"
# [13] "Peptides 43"
# [14] "Peptides 44"
# [15] "Peptides 45"
# [33] "Razor + unique peptides 43"
# [34] "Razor + unique peptides 44"
# [35] "Razor + unique peptides 45"
# [53] "Unique peptides 43"
# [54] "Unique peptides 44"
# [55] "Unique peptides 45"
# [73] "Sequence coverage [%]"
# [74] "Unique + razor sequence coverage [%]"
# [75] "Unique sequence coverage [%]"
# [76] "Mol. weight [kDa]"
# [77] "Sequence length"
# [78] "Sequence lengths"
# [79] "Q-value"
# [80] "Sequence coverage 43 [%]"
# [81] "Sequence coverage 44 [%]"
# [82] "Sequence coverage 45 [%]"
# [100] "Intensity"
# [101] "Intensity 43"
# [102] "Intensity 44"
# [103] "Intensity 45"
# [121] "LFQ intensity 43"
# [122] "LFQ intensity 44"
# [123] "LFQ intensity 45"
# [141] "MS/MS Count 43"
# [142] "MS/MS Count 44"
# [143] "MS/MS Count 45"
# [161] "MS/MS Count"
# [162] "Only identified by site"
# [163] "Reverse"
# [164] "Potential contaminant"
# [165] "id"
# [166] "Peptide IDs"
# [167] "Peptide is razor"
# [168] "Mod. peptide IDs"
# [169] "Evidence IDs"
# [170] "MS/MS IDs"
# [171] "Best MS/MS"
# [172] "Oxidation (M) site IDs"
# [173] "Oxidation (M) site positions"
row.names(expMatrix) <- res[,"Protein IDs"]
if( !all(rownames(expDesign) %in% colnames(expMatrix)) ){
print(expDesign)
stop("Invalid ExpDesign")
}
featureAnnotations <- data.frame(
proteinName=res[,"Protein IDs"]
,ac= res[,"Protein IDs"] %>% as.character %>% getAccessionNumber()
,geneName= res[,"Fasta headers"] %>% as.character %>% getGeneName()
,proteinDescription=res[,"Fasta headers"]
,idScore=res[,"Q-value"]
# ,nbPeptides=res[,"Peptides used for quantitation"] ### old Progenesis
,isDecoy=res[,"Reverse"] == "+"
,nbPeptides=res[,"Peptides"] # Progensis QI
,isNormAnchor=rep(T,nrow(expMatrix))
,isFiltered=res[,"Reverse"] == "+"
,row.names=res[,"Protein IDs"])
featureAnnotations <- featureAnnotations[!allColNA,]
### re-order and exclude channels
expMatrix <- as.matrix(expMatrix[!allColNA ,rownames(expDesign)])
colnames(expMatrix) <- rownames(expDesign)
return(createExpressionDataset(expressionMatrix=expMatrix,expDesign=expDesign,featureAnnotations=featureAnnotations))
}
#' Parse scaffold PTM Spectrum Report
#' @param file path to Scaffold file
#' @return data.frame
#' @export
#' @note No note
#' @details No details
#' @references NA
#' @examples print("No examples")
parseScaffoldPTMReport <- function(file){
d <- read.csv(file,sep="\t")
# Note that peptides with multiple protein assignments are listed once per protein.
# Howerver, only one peptide per spectrum is listed
#d <- read.csv(file,sep="\t",skip=.getSkipLineNb(file))[,c(4,5,9,10,16,20)]
d <- read.csv(file,sep="\t")[,c(3,4,5,9,10,16,20)]
d <- unique(d)
rownames(d) <- d$Spectrum.Name
# d <- d[,1:5]
# names(d) <- c("ptm","ptmLocProb","idScore","ptmLocMascotConfidence","pMassError")
d <- d[,1:(ncol(d) - 1)]
names(d) <- c("ptmPeptide","ptm","ptmLocProb","idScore","ptmLocMascotConfidence","pMassError")
### add column nbPtmsPerPeptide
# a <- c("dfgdsg,dfsd,s","sdfsd,we","we","")
# unlist(lapply(strsplit(a,""),function(t){return(sum(grepl("\\,",t)) + (length(t)> 0))}))
#
d$ptm <- as.character(d$ptm )
#S8 Phospho, S30 Phospho, M35 Oxidation (number of ',' +1 )
d <- cbind(d,nbPtmsPerPeptide= unlist(lapply(strsplit(d$ptm,""),function(t){return(sum(grepl("\\,",t)) + (length(t)> 0))})) )
return( d )
}
#' Add scaffold ptm annotaitons to tmt experiment
#' @param eset ExpressionSet
#' @param file path to Scaffold file
#' @return ExpressionSet object
#' @export
#' @import Biobase
#' @note No note
#' @references No references
#' @examples print("No examples")
addScaffoldPTMFAnnotations <- function(eset,file){
scaffoldPTMFData <- parseScaffoldPTMReport(file)
spectrumNameIntersect <- intersect(fData(eset)$spectrumName,rownames(scaffoldPTMFData))
# conflicting peptide entries
misMatch <- fData(eset)[spectrumNameIntersect,]$peptide != toupper(scaffoldPTMFData[spectrumNameIntersect,]$ptmPeptide)
# discard mismatches from intersect and Scaffold PTM df
spectrumNameIntersect <- spectrumNameIntersect[!misMatch]
scaffoldPTMFData <- scaffoldPTMFData[spectrumNameIntersect,]
missingEntriesFrac <- 1- length(spectrumNameIntersect) / nrow(eset)
if(missingEntriesFrac > 0)cat("INFO: ", round((missingEntriesFrac * 100)), "% of TMT spectra not listed in Scaffold PTM export \n" )
fData(eset) <- data.frame(fData(eset), scaffoldPTMFData[rownames(fData(eset)),], row.names=rownames(fData(eset)))
#set NA scores 0
#fData(eset)$idScore[is.na(fData(eset)$idScore)] <- 0
return(eset)
}
################################## TMT END ##############################
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.