UHRIonStarApp: R Shiny-based web application UHR-IonStar

Documented in SpecRepoGen

#' Spectrum Report Generation
#'
#' Convert mzid files produced by IDpicker to tsv file, then generate a PSM list
#'
#' @param protein target protein file generated by IDpicker.
#' @param peptide target peptide file generated by IDpicker.
#' @param spectrum target spectrum file generated by IDpicker.
#' @param mziddir directory of mzid files
#' @param MSGFdir directory of MSGF+ java file
#' @param output_name set the name for new PSM list
#' @return A PSM list
#' @export

SpecRepoGen <- function(protein, peptide, spectrum, mziddir, MSGFdir, output_name){
  withProgress(message = 'Spectrum Report Generation', value = 0, {
    incProgress(0.2, detail = '.mzid file converting...')
    CurrentDir <- getwd()
    setwd(mziddir)
    filenames <- gsub('.raw.*', '.mzid', as.character(spectrum$Spectrum.Rank))
    filenames <- unique(gsub('\\/','', filenames))

    CMD <- paste("java -Xmx10000M -cp", paste(MSGFdir,"/MSGFPlus.jar",sep = ''), "edu.ucsd.msjava.ui.MzIDToTsv -i", filenames,
                 "-showQValue 1 -showDecoy 1 -unroll 1", sep = ' ')

    # perform '.mzid to .tsv' convertion
    lapply(CMD, shell)

    # perform spectrum report generation
    incProgress(0.8, detail = 'Integrate spectrum information...')
    list <- strsplit(as.matrix(spectrum$Spectrum.Rank),"/")
    require("plyr")
    df <- ldply(list)
    df <- df[,2:3]
    colnames(df) <- c("Sample.Name","Scan.Num")
    spectrum <- data.frame(spectrum,df)
    spectrumrepo_total <- data.frame(matrix(, ncol=25))
    colnames(spectrumrepo_total) <- c("SpecFile","SpecID","ScanNum","FragMethod","Precursor","IsotopeError","PrecursorError.ppm.","Charge","DeNovoScore",
                                      "MSGFScore","SpecEValue","EValue","QValue","PepQValue","Peptide.Sequence","Accession","Cluster","Count","Coverage",
                                      "Protein.Group","Distinct.Peptides","Distinct.Matches","Filtered.Spectra","Description","Accession.2")
    samplename <- unique(spectrum$Sample.Name)
    #Subsetting the sample list and use part of the files
    #Create a dataframe with individual protein names and group assignment
    protein_2 <- protein[1,]
    protein_2$Accession.2[1] <- "000000"
    for (j in 1:nrow(protein)) {
      if (grepl(",", as.character(protein$Accession[j]))==FALSE) {
        pro_row <- protein[j,]
        pro_row$Accession.2 <- protein[j,1]
        protein_2 <- rbind(protein_2, pro_row)
      } else if (grepl(",", as.character(protein$Accession[j]))==TRUE) {
        proteinname <- unlist(strsplit(as.character(protein$Accession[j]),","))
        pro_row <- protein[j,]
        pro_row <- do.call("rbind", replicate(length(proteinname),pro_row,simplify=FALSE))
        pro_row$Accession.2 <- matrix(proteinname, nrow=length(proteinname))
        protein_2 <- rbind(protein_2, pro_row)
      }
    }
    protein_2 <- protein_2[2:nrow(protein_2),]

    peptide$Sequence <- apply(as.matrix(peptide$Sequence),2, function(x) gsub("\\d|\\W","",x))

    #Combining information from both IDPicker and MS-GF+ to generate the spectrum report

    for (i in 1:length(samplename)) {
      filename <- paste(samplename[i],".tsv", sep="")
      filename <- gsub(".raw","",filename)
      MSGF <- read.table(filename,header=FALSE,sep="\t",fileEncoding="windows-1252")
      colnames(MSGF) <- c("SpecFile","SpecID","ScanNum","FragMethod","Precursor","IsotopeError","PrecursorError(ppm)","Charge","Peptide","Protein",
                          "DeNovoScore","MSGFScore","SpecEValue","EValue","QValue","PepQValue")
      matchlist <- match(as.matrix(spectrum$Sample.Name), as.character(samplename[i]), nomatch="NA")
      spectrum_2 <- data.frame(spectrum, matchlist)
      spectrum_used <- subset(spectrum_2, matchlist=="1")
      specnum <- spectrum_used$Scan.Num
      matchrow <- pmatch(MSGF$ScanNum,specnum,nomatch="NA",duplicates.ok=TRUE)
      matchlist <- unlist(strsplit(as.character(matchrow), " "))
      MSGF$Matchlist <- matchlist
      MSGF_2 <- na.omit(MSGF)
      MSGF_2$Peptide.Sequence <- apply(as.matrix(MSGF_2$Peptide),2,function(x) gsub("\\d|\\W","",x))
      MSGF_2$Seq <- row.names(MSGF_2)
      MSGF_2 <- MSGF_2[!duplicated(MSGF_2),]
      MSGF_2$Peptide.Sequence <- apply(MSGF_2$Peptide.Sequence, 2, function(x) substr(as.character(x),2,nchar(as.character(x))-1))
      MSGF_2$pepmatch <- match(MSGF_2$Peptide.Sequence, peptide$Sequence)
      MSGF_2 <- na.omit(MSGF_2)
      promatch <- pmatch(as.matrix(MSGF_2$Protein), as.matrix(protein_2$Accession.2), nomatch="NA", duplicates.ok=TRUE)
      protein_2$Number <- 1:nrow(protein_2)
      MSGF_2 <- data.frame(MSGF_2,promatch)
      colnames(MSGF_2)[21] <- "Number"
      spectrumrepo <- merge(MSGF_2, protein_2, by.y="Number")
      spectrumrepo <- spectrumrepo[,c(2:9,12:17,19,22:31)]
      spectrumrepo_total <- rbind(spectrumrepo_total,spectrumrepo)
    }
    #Delete the first row containing "NA"s and output the spectrum report
    spectrumrepo_total <- spectrumrepo_total[2:nrow(spectrumrepo_total),]
    write.csv(spectrumrepo_total, output_name, row.names=FALSE)
    setwd(CurrentDir)
  })
}