R/PhilosophertoMSstatsFormat.R
In MSstats: Protein Significance Analysis in DDA, SRM and DIA for Label-free or Label-based Proteomics Experiments

## Converter for MSFragger+Philosopher

## output from MSFragger+Philosopher : MSstats report

#' @export
PhilosophertoMSstatsFormat <- function(input,
                              annotation,
                              which.sequence = 'Peptide.Sequence',
                              useIsUniqueColumn = FALSE,
                              useUniquePeptide=TRUE,
                              summaryforMultipleRows=max,
                              fewMeasurements="remove",
                              removeOxidationMpeptides=FALSE,
                              removeProtein_with1Peptide=FALSE){

    ## check annotation
    required.annotation <- c('Condition', 'BioReplicate', 'Run')
    
    if (!all(required.annotation %in% colnames(annotation))) {
        
        missedAnnotation <- which(!(required.annotation %in% colnames(annotation)))
        
        stop(paste("**", toString(required.annotation[missedAnnotation]), 
                   "is not provided in Annotation. Please check the annotation file.",
                   "'Run' will be matched with 'Spectrum.File' "))
    }
    
    ## check annotation information
    ## get annotation
    annotinfo <- unique(annotation[, c("Run", "Condition", 'BioReplicate')])	

    ## Each Run should has unique information about condition and bioreplicate
    check.annot <- xtabs(~Run, annotinfo)
    if ( any(check.annot > 1) ) {
        stop('** Please check annotation. Each MS run can\'t have multiple conditions or BioReplicates.' )
    }
    
    ################################################
    ## 0.1. which intensity : Intensity column, no any other quantified information
    ################################################

    ################################################
    ## 0.2. which protein id : Protein.Accessions
    ################################################

    ################################################
    ## 0.3. which sequence : Peptide.Sequence or combination with Modified.Peptide.Sequence
    ################################################
    ## default : Peptide.Sequence
    which.seq <- NULL
    
    if (which.sequence == 'Peptide.Sequence') {
        which.seq <- 'Peptide.Sequence'
    } else if (which.sequence == 'Modified.Peptide.Sequence') {
        which.seq <- 'Modified.Peptide.Sequence'
        
        ## Modified.peptide Sequence column has empty for non-modified.peptide.sequence.
        ## Therefore, we need to fill in with peptide.sequence
        input$Modified.Peptide.Sequence <- as.character(input$Modified.Peptide.Sequence)
        input$Peptide.Sequence <- as.character(input$Peptide.Sequence)
        
        input[is.na(input$Modified.Peptide.Sequence), 'Modified.Peptide.Sequence'] <- input[is.na(input$Modified.Peptide.Sequence), 'Peptide.Sequence']
    } 
    
    if (is.null(which.sequence)) {
        stop('** Please select which columns should be used for peptide sequence, between twp options (Peptide.Sequence or Modified.Peptide.Sequence).')
    }
    
    if (!is.element(which.seq, colnames(input))) {
        stop('** Please select which columns should be used for peptide sequence, between twp options (Peptide.Sequence or Modified.Peptide.Sequence).')
    }
    
    ################################################
    ## 1. get subset of columns
    ################################################
  
    input <- input[, which(colnames(input) %in% c("Protein.Accessions", 
                                                which.seq, "Charge",
                                                "Spectrum.File", "Intensity", "Is.Unique"))]
    
    colnames(input)[colnames(input) == "Protein.Accessions"] <- 'ProteinName'
    
    colnames(input)[colnames(input) == which.seq] <- 'PeptideSequence'

    colnames(input)[colnames(input) == 'Spectrum.File'] <- 'Run'
    
    colnames(input)[colnames(input) == "Intensity"] <- 'Intensity'

    ################################################
    ## 2. remove the peptides including oxidation (M) sequence
    ################################################
    
    ## need to check from felipe
    if (removeOxidationMpeptides) {
        remove_m_sequence <- unique(input[grep("Oxidation", input$Modifications), "Modifications"])
    
        if (length(remove_m_sequence) > 0) {
            input <- input[-which(input$Modifications %in% remove_m_sequence), ]
        }
    
        message('Peptides including oxidation(M) in the Modifications are removed.')
    }
 
    ################################################
    ## 3. remove peptides which are used in more than one protein
    ## we assume to use unique peptide
    ################################################
    
    if (useIsUniqueColumn) {
        
        ## remove rows with Is.Unique is false
        input <- input[input$Is.Unique != 'false', ]
        
        message('** Rows with Is.Unique = false are removed.')
    }
    
    if (useUniquePeptide) {
        ## If useIsUniqueColumn = T and Is.Unique column is correct, we dont need this part.
        ## But, double check
        pepcount <- unique(input[, c("ProteinName", "PeptideSequence")]) 
        pepcount$PeptideSequence <- factor(pepcount$PeptideSequence)
        
        ## count how many proteins are assigned for each peptide
        structure <- aggregate(ProteinName ~ ., data=pepcount, length)
        remove_peptide <- structure[structure$ProteinName != 1, ]
        
        ## remove the peptides which are used in more than one protein
        if (length(remove_peptide$Proteins != 1) != 0) {
            input <- input[-which(input$Sequence %in% remove_peptide$Sequence), ]
            
            message('** Peptides, that are used in more than one proteins, are removed.')
        }
    }
    
    ##############################
    ## 4. remove featuares with all na or zero
    ## some rows have all zero values across all MS runs. They should be removed.
    ##############################
    
    input$fea <- paste(input$PeptideSequence,
                       input$Charge,
                       sep="_")
    
    inputtmp <- input[!is.na(input$Intensity) & input$Intensity > 1, ]
    
    count <- inputtmp %>% group_by(fea) %>% summarise(length=length(Intensity))
    
    ## get feature with all NA or zeros
    getfea <- count[count$length > 0, 'fea']
    
    if (nrow(getfea) > 0) {
        
        nfea.remove <- length(unique(input$fea)) - nrow(getfea)
        input <- input[which(input$fea %in% getfea$fea), ]
        
        message(paste0('** ', nfea.remove, ' features have all NAs or zero intensity values and are removed.'))
        
    }
    
    rm(inputtmp)
    input <- input[, -which(colnames(input) %in% c('fea'))]
    

    ##############################
    ## 5. remove multiple measurements per feature and run
    ##############################
    ## maximum or sum up abundances among intensities for identical features within one run
    input_sub <- dcast( ProteinName + PeptideSequence + Charge ~ Run, data=input, 
                        value.var='Intensity', 
                        fun.aggregate=summaryforMultipleRows, 
                        fill=NA_real_) 
    ## need to check whether Modification is needed or not.
    # input_sub <- dcast( ProteinName + PeptideSequence + Modifications + Charge ~ Run, data=input, 
    #                  value.var='Intensity', 
    #                  fun.aggregate=summaryforMultipleRows, 
    #                     fill=NA_real_) 
 
    ## reformat for long format
    #input_sub <- melt(input_sub, id=c('ProteinName', 'PeptideSequence', 'Modifications', 'Charge'))
    input_sub <- melt(input_sub, id=c('ProteinName', 'PeptideSequence', 'Charge'))
    
    colnames(input_sub)[which(colnames(input_sub) %in% c('variable','value'))] <- c("Run", "Intensity")
  
    message('** Multiple measurements in a feature and a run are summarized by summaryforMultipleRows.')
  
    input <- input_sub

    
    ##############################
    ##  6. remove features which has 1 or 2 measurements across runs
    ##############################
    if (fewMeasurements=="remove") {
        
        ## it is the same across experiments. # measurement per feature. 
        xtmp <- input[!is.na(input$Intensity) & input$Intensity > 0, ]
        xtmp$feature <- paste(xtmp$PeptideSequence, xtmp$Charge, sep="_")
        count_measure <- xtabs( ~feature, xtmp)
    
        remove_feature_name <- count_measure[count_measure < 3]
    
        input$feature <- paste(input$PeptideSequence, input$PrecursorCharge, sep="_")
    
        if (length(remove_feature_name) > 0) {
            input <- input[-which(input$feature %in% names(remove_feature_name)), ]
        }
    
        input <- input[, -which(colnames(input) %in% c('feature'))]
    }
  
    ##############################
    ##  8. remove proteins with only one peptide and charge per protein
    ##############################
	
	if (removeProtein_with1Peptide) {
	  
	    ## remove protein which has only one peptide
	    input$feature <- paste(input$PeptideSequence, 
	                           input$Charge, 
	                           sep="_")
	  
	    tmp <- unique(input[, c("ProteinName", 'feature')])
	    tmp$ProteinName <- factor(tmp$ProteinName)
	    count <- xtabs( ~ ProteinName, data=tmp)
        lengthtotalprotein <- length(count)
    
	    removepro <- names(count[count <= 1])
	  
	    if (length(removepro) > 0) {
	    
	        input <- input[-which(input$ProteinName %in% removepro), ]
	        message(paste0("** ", length(removepro), 
	                       ' proteins, which have only one feature in a protein, are removed among ', 
	                       lengthtotalprotein, ' proteins.'))
	    }
	  
	    input <- input[, -which(colnames(input) %in% c('feature'))]
	}
  
    input$ProteinName <- input$ProteinName
    
    ##############################
    ## 9. add annotation
    ##############################
    
    noruninfo <- setdiff(unique(input$Run), unique(annotation$Run))
    if (length(noruninfo) > 0) {
        stop(paste('** Annotation for Run :', 
                   paste(noruninfo, collapse = ', '), 
                   ' are needed. Please update them in annotation file.') )
    }
    
    input <- merge(input, annotation, by="Run", all=TRUE)
    
    ## add other required information
    # need to update
    #input$PeptideModifiedSequence <- paste(input$PeptideSequence, input$Modifications, sep="_")
    
    input.final <- data.frame("ProteinName" = input$ProteinName,
                              "PeptideSequence" = input$PeptideSequence,
                              "PrecursorCharge" = input$Charge,
                              "FragmentIon" = NA,
                              "ProductCharge" = NA,
                              "IsotopeLabelType" = "L",
                              "Condition" = input$Condition,
                              "BioReplicate" = input$BioReplicate,
                              "Run" = input$Run,
                              "Intensity" = input$Intensity)
    
    if (any(is.element(colnames(input), 'Fraction'))) {
        input.final <- data.frame(input.final,
                                  "Fraction" = input$Fraction)
    }
    
    input <- input.final
    rm(input.final)
  
	return(input)
}

Any scripts or data that you put into this service are public.

MSstats documentation built on Feb. 28, 2021, 2:01 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

MSstats
Protein Significance Analysis in DDA, SRM and DIA for Label-free or Label-based Proteomics Experiments

R/PhilosophertoMSstatsFormat.R
In MSstats: Protein Significance Analysis in DDA, SRM and DIA for Label-free or Label-based Proteomics Experiments

Defines functions PhilosophertoMSstatsFormat

Try the MSstats package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

MSstats Protein Significance Analysis in DDA, SRM and DIA for Label-free or Label-based Proteomics Experiments

R/PhilosophertoMSstatsFormat.R In MSstats: Protein Significance Analysis in DDA, SRM and DIA for Label-free or Label-based Proteomics Experiments

Defines functions PhilosophertoMSstatsFormat

Try the MSstats package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

MSstats
Protein Significance Analysis in DDA, SRM and DIA for Label-free or Label-based Proteomics Experiments

R/PhilosophertoMSstatsFormat.R
In MSstats: Protein Significance Analysis in DDA, SRM and DIA for Label-free or Label-based Proteomics Experiments