#' Spectrum Report Generation
#'
#' Convert mzid files produced by IDpicker to tsv file, then generate a PSM list
#'
#' @param protein target protein file generated by IDpicker.
#' @param peptide target peptide file generated by IDpicker.
#' @param spectrum target spectrum file generated by IDpicker.
#' @param mziddir directory of mzid files
#' @param MSGFdir directory of MSGF+ java file
#' @param output_name set the name for new PSM list
#' @return A PSM list
#' @export
SpecRepoGen <- function(protein, peptide, spectrum, mziddir, MSGFdir, output_name){
withProgress(message = 'Spectrum Report Generation', value = 0, {
incProgress(0.2, detail = '.mzid file converting...')
CurrentDir <- getwd()
setwd(mziddir)
filenames <- gsub('.raw.*', '.mzid', as.character(spectrum$Spectrum.Rank))
filenames <- unique(gsub('\\/','', filenames))
CMD <- paste("java -Xmx10000M -cp", paste(MSGFdir,"/MSGFPlus.jar",sep = ''), "edu.ucsd.msjava.ui.MzIDToTsv -i", filenames,
"-showQValue 1 -showDecoy 1 -unroll 1", sep = ' ')
# perform '.mzid to .tsv' convertion
lapply(CMD, shell)
# perform spectrum report generation
incProgress(0.8, detail = 'Integrate spectrum information...')
list <- strsplit(as.matrix(spectrum$Spectrum.Rank),"/")
require("plyr")
df <- ldply(list)
df <- df[,2:3]
colnames(df) <- c("Sample.Name","Scan.Num")
spectrum <- data.frame(spectrum,df)
spectrumrepo_total <- data.frame(matrix(, ncol=25))
colnames(spectrumrepo_total) <- c("SpecFile","SpecID","ScanNum","FragMethod","Precursor","IsotopeError","PrecursorError.ppm.","Charge","DeNovoScore",
"MSGFScore","SpecEValue","EValue","QValue","PepQValue","Peptide.Sequence","Accession","Cluster","Count","Coverage",
"Protein.Group","Distinct.Peptides","Distinct.Matches","Filtered.Spectra","Description","Accession.2")
samplename <- unique(spectrum$Sample.Name)
#Subsetting the sample list and use part of the files
#Create a dataframe with individual protein names and group assignment
protein_2 <- protein[1,]
protein_2$Accession.2[1] <- "000000"
for (j in 1:nrow(protein)) {
if (grepl(",", as.character(protein$Accession[j]))==FALSE) {
pro_row <- protein[j,]
pro_row$Accession.2 <- protein[j,1]
protein_2 <- rbind(protein_2, pro_row)
} else if (grepl(",", as.character(protein$Accession[j]))==TRUE) {
proteinname <- unlist(strsplit(as.character(protein$Accession[j]),","))
pro_row <- protein[j,]
pro_row <- do.call("rbind", replicate(length(proteinname),pro_row,simplify=FALSE))
pro_row$Accession.2 <- matrix(proteinname, nrow=length(proteinname))
protein_2 <- rbind(protein_2, pro_row)
}
}
protein_2 <- protein_2[2:nrow(protein_2),]
peptide$Sequence <- apply(as.matrix(peptide$Sequence),2, function(x) gsub("\\d|\\W","",x))
#Combining information from both IDPicker and MS-GF+ to generate the spectrum report
for (i in 1:length(samplename)) {
filename <- paste(samplename[i],".tsv", sep="")
filename <- gsub(".raw","",filename)
MSGF <- read.table(filename,header=FALSE,sep="\t",fileEncoding="windows-1252")
colnames(MSGF) <- c("SpecFile","SpecID","ScanNum","FragMethod","Precursor","IsotopeError","PrecursorError(ppm)","Charge","Peptide","Protein",
"DeNovoScore","MSGFScore","SpecEValue","EValue","QValue","PepQValue")
matchlist <- match(as.matrix(spectrum$Sample.Name), as.character(samplename[i]), nomatch="NA")
spectrum_2 <- data.frame(spectrum, matchlist)
spectrum_used <- subset(spectrum_2, matchlist=="1")
specnum <- spectrum_used$Scan.Num
matchrow <- pmatch(MSGF$ScanNum,specnum,nomatch="NA",duplicates.ok=TRUE)
matchlist <- unlist(strsplit(as.character(matchrow), " "))
MSGF$Matchlist <- matchlist
MSGF_2 <- na.omit(MSGF)
MSGF_2$Peptide.Sequence <- apply(as.matrix(MSGF_2$Peptide),2,function(x) gsub("\\d|\\W","",x))
MSGF_2$Seq <- row.names(MSGF_2)
MSGF_2 <- MSGF_2[!duplicated(MSGF_2),]
MSGF_2$Peptide.Sequence <- apply(MSGF_2$Peptide.Sequence, 2, function(x) substr(as.character(x),2,nchar(as.character(x))-1))
MSGF_2$pepmatch <- match(MSGF_2$Peptide.Sequence, peptide$Sequence)
MSGF_2 <- na.omit(MSGF_2)
promatch <- pmatch(as.matrix(MSGF_2$Protein), as.matrix(protein_2$Accession.2), nomatch="NA", duplicates.ok=TRUE)
protein_2$Number <- 1:nrow(protein_2)
MSGF_2 <- data.frame(MSGF_2,promatch)
colnames(MSGF_2)[21] <- "Number"
spectrumrepo <- merge(MSGF_2, protein_2, by.y="Number")
spectrumrepo <- spectrumrepo[,c(2:9,12:17,19,22:31)]
spectrumrepo_total <- rbind(spectrumrepo_total,spectrumrepo)
}
#Delete the first row containing "NA"s and output the spectrum report
spectrumrepo_total <- spectrumrepo_total[2:nrow(spectrumrepo_total),]
write.csv(spectrumrepo_total, output_name, row.names=FALSE)
setwd(CurrentDir)
})
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.