R/PSMtab2SAM.R
In proBAMr: Generating SAM file for PSMs in shotgun proteomics data

Documented in PSMtab2SAM

##' Generate SAM files from confident peptide-spectrum-matches (PSMs). 
##' 
##' @title Generate SAM files from PSMs.
##' @param passedPSM a data frame of PSMs passed FDR.
##' @param XScolumn specify the column which represents the matching score.
##' @param exon_anno a dataframe of exon annotations.
##' @param proteinseq a dataframe containing  protein ids and protein 
##' sequences.
##' @param procodingseq a data frame cotaining coding sequence for each 
##' protein.
##' @param ... additional arguments
##' @return a dataframe containing 
##' @author Xiaojing Wang
##' @examples
##' 
##' load(system.file("extdata/GENCODE", "exon_anno.RData", package="proBAMr"))
##' load(system.file("extdata/GENCODE", "proseq.RData", package="proBAMr"))
##' load(system.file("extdata/GENCODE", "procodingseq.RData", 
##'     package="proBAMr"))
##' options(stringsAsFactors=FALSE)
##' passedPSM <- read.table(system.file("extdata", "passedPSM.tab", 
##'     package="proBAMr"), sep='\t', header=TRUE)
##' SAM <- PSMtab2SAM(passedPSM, XScolumn='mvh', exon, proteinseq, 
##'     procodingseq)
##' write.table(SAM, file=paste(tempdir(), '/test.sam', sep=''), 
##'             sep='\t', quote=FALSE, row.names=FALSE, col.names=FALSE)
##' 

PSMtab2SAM <- function(passedPSM, XScolumn='mvh', exon_anno, proteinseq, 
    procodingseq, ...)
{
    options(stringsAsFactors=FALSE)
    #passedPSM
    PEP <- passedPSM[, 'peptide']
    Spectrumid <- apply(passedPSM, 1, function(x){
        paste(strsplit(x['spectrum'], '\\.')[[1]][1], 
        strsplit(strsplit(x['spectrumNativeID'], ' ')[[1]][1], '=')[[1]][2], 
        strsplit(strsplit(x['spectrumNativeID'], ' ')[[1]][2], '=')[[1]][2], 
        strsplit(strsplit(x['spectrumNativeID'], ' ')[[1]][3], '=')[[1]][2], 
        sep='.')
        })
    #PEP_SEQ <- formatPep(spectra[, 'Sequence'])

    SAM <- c()

    spectrumcount <- table(Spectrumid)

    for(i in 1:dim(passedPSM)[1]){
        #print(i)
        peptide <- PEP[i]
        QNAME <- Spectrumid[i]
        idx <- grep(peptide, proteinseq[, 'peptide'], fixed=TRUE)
        if(length(idx) == 0){
            RNAME <- '*' 
            MAPQ <- 255
            RNEXT <- '*'
            PNEXT <- 0
            TLEN <- 0
            QUAL <- '*'
            POS <- 0
            SEQ <- '*'
            CIGAR <- '*'            
            FLAG <- 0x4 
            annoted <- '?'
            XA <- paste('XA:Z:', annoted, sep='')
            res <- c(FLAG, RNAME, POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, 
                    as.character(SEQ), QUAL, XA)
            res <-  unique(data.frame(t(res))) 
        }else{
            pro <- proteinseq[idx, ]
            sta_pos <- unlist(lapply(pro[, 'peptide'], function(x) 
                        regexpr(peptide, x, fixed=TRUE)))
            pep_len <- nchar(peptide)
            end_pos <- sta_pos + pep_len - 1
            
            coding <- procodingseq[match(pro[, 'pro_name'], 
                                        procodingseq[, 'pro_name']), ]
            code_s <- (sta_pos-1) * 3 + 1
            code_e <- end_pos * 3
            codingseq <- substring(coding[, 'coding'], code_s, code_e)
            
            
            
            exonp <- lapply(pro[, 'tx_name'], function(x)
                        exon_anno[exon_anno[, 'tx_name']==x, ])
            
            exonp <- lapply(exonp, function(x){
                            if(length(unique(x[, 'tx_id'])) > 1){
                                x[grep(x[1, 'tx_id'], x[, 'tx_id'], 
                                fixed=TRUE), ]
                            }else x
                        })
                       
            if(passedPSM[i, 'hit_rank'] == 1) pri <- TRUE else pri <- FALSE
                
            res <- mapply(function(x, y, z, m) 
                    if(dim(z)[1] == 0){
                        .proteinUnannotated(x, y, z, m, primary=pri)
                    }else{
                        if((nchar(m) != 3*pep_len) | (y > max(z[, 'cds_end'], 
                            na.rm = TRUE))){
                        #if(toString(translate(DNAString(m))) != peptide){
                            .peptideUnannotated(x, y, z, m, primary=pri)
                        }else{
                            .MapCoding2genome(x, y, z, m, primary=pri)
                        }
                    }, 
                    code_s, code_e, exonp, codingseq) 
                    
            res <-  unique(data.frame(t(res)))
            
        }
        XL <- paste('XL:i:', as.numeric(spectrumcount[QNAME]), sep='')
        NH <- paste('NH:i:', dim(res)[1], sep='')
        XP <- paste('XP:Z:', peptide, sep='')
        #XF <- paste('XF:f:', round(passedPSM[i, XFcolumn], digits=4), sep='')
        XC <- paste('XC:i:', passedPSM[i, 'assumed_charge'], sep='')
        XS <- paste('XS:f:', round(as.numeric(passedPSM[i, XScolumn]), 
                        digits=4), sep='')
        #XA <- paste('XA:Z:', annoted, sep='')
        XN <- paste('XN:i:', passedPSM[i, 'num_missed_cleavages'], sep='')
        XT <- paste('XT:i:', passedPSM[i, 'NTT'], sep='')
        
        if(length(idx) > 0){
            pep_g <- 'N'
        }
        
        XG <- paste('XG:Z:', pep_g, sep='')
        
        XM <-  ifelse(is.na(passedPSM[i, 'modification']), paste('XM:Z:-'),
                    paste('XM:Z:', passedPSM[i, 'modification'], sep=''))
        
        res <- cbind(QNAME, res, NH, XL, XP, XC, XS, XM, XN, XT, XG)    
        SAM <- rbind(SAM, res)
    }

    SAM

}



.proteinUnannotated <-function(c_sta, c_end, exon_anno, cseq, primary=TRUE, 
...)
{ 
    RNAME <- '*' 
    MAPQ <- 255
    RNEXT <- '*'
    PNEXT <- 0
    TLEN <- 0
    QUAL <- '*'
    POS <- 0
    SEQ <- '*'
    CIGAR <- '*'
    annoted <- 2
    XA <- paste('XA:Z:', annoted, sep='')
    FLAG <- 0x4 
    
    tmp <- c(FLAG, RNAME, POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, 
                as.character(SEQ), QUAL, XA)
    tmp
}



.peptideUnannotated <- function(c_sta, c_end, exon_anno, cseq, primary=TRUE, 
...)
{    
    #RNAME <- as.character(exon_anno[1, 'chromosome_name']) 
    RNAME <- '*'
    MAPQ <- 255
    RNEXT <- '*'
    PNEXT <- 0
    TLEN <- 0
    QUAL <- '*'
    POS <- 0
    SEQ <- '*'
    CIGAR <- '*'
    annoted <- 1
    XA <- paste('XA:Z:', annoted, sep='')
    FLAG <- 0x4 
    #if(exon_anno[1, 'strand'] == '+'){        
    #    FLAG <- ifelse(primary==TRUE, 0x00, 0x00+0x100) 
    #}else{
    #    FLAG <- ifelse(primary==TRUE, 0x10, 0x10+0x100)   
    #}
    
    tmp <- c(FLAG, RNAME, POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, 
                as.character(SEQ), QUAL, XA)
    tmp
}




.MapCoding2genome <- function(c_sta, c_end, exon_anno, cseq, primary=TRUE, 
...)
{
    idxs <- intersect(which(exon_anno[, 'cds_start'] <= c_sta), 
                        which(exon_anno[, 'cds_end'] >= c_sta))
    idxe <- intersect(which(exon_anno[, 'cds_start'] <= c_end), 
                        which(exon_anno[, 'cds_end'] >= c_end))              
    len <- c_end - c_sta + 1 
    RNAME <- as.character(exon_anno[1, 'chromosome_name']) 
    MAPQ <- 255
    RNEXT <- '*'
    PNEXT <- 0
    TLEN <- 0
    QUAL <- '*'
    annoted <- 0
    XA <- paste('XA:Z:', annoted, sep='')
    
    if(exon_anno[1, 'strand'] == '+'){
            POS <- exon_anno[idxs, 'cds_chr_start'] + c_sta - 
                    exon_anno[idxs, 'cds_start']
            SEQ <- DNAString(cseq)
            FLAG <- ifelse(primary==TRUE, 0x00, 0x00+0x100) 
            
     }else{
            POS <- exon_anno[idxe, 'cds_chr_start'] +  
                    exon_anno[idxe, 'cds_end'] - c_end
            SEQ <- reverseComplement(DNAString(cseq))
            FLAG <- ifelse(primary==TRUE, 0x10, 0x10 + 0x100) 
     }
    
    if(idxe == idxs){
        CIGAR <- paste(len, 'M', sep='')        
    }else{
         if(exon_anno[1, 'strand'] == '+'){
            #insert <- exon_anno[idxe, 'cds_chr_start'] - exon_anno[idxs, 
            #                        'cds_chr_end']- 1
            part1 <- exon_anno[idxs, 'cds_end'] - c_sta + 1 
            part2 <- c_end - exon_anno[idxe, 'cds_start'] + 1  
            
            insert <- unlist(lapply(1:(idxe - idxs), function(x) 
                paste(exon_anno[idxs + x, 'cds_chr_start'] - 
                    exon_anno[idxs+x-1, 'cds_chr_end']- 1, 'N', sep='')))
            if(idxe-idxs >1){
                innerexon <- unlist(lapply(1:(idxe-idxs-1), function(x) 
                        paste(exon_anno[idxs + x, 'cds_chr_end'] - 
                        exon_anno[idxs + x, 'cds_chr_start'] + 1, 'M', 
                        sep=''))) 
            }else{ innerexon <- ''}
            
            #ifelse(idxe-idxs >1, unlist(lapply(1:(idxe-idxs-1), function(x) 
            #paste(exon_anno[idxs+x, 'cds_chr_end'] - 
            #             exon_anno[idxs+x, 'cds_chr_start']+1, 
            #             'M', sep=''))), '')
            midpattern <- rep(NA, length(insert) + length(innerexon))
            midpattern[seq(1, length(insert) + length(innerexon), 
                                                        by=2)] <- insert
            midpattern[seq(2, length(insert) + length(innerexon), 
                                                        by=2)] <- innerexon
            midpattern <- paste(midpattern, collapse='') 
            
        }else{
            #insert <- exon_anno[idxs, 'cds_chr_start'] - 
            #            exon_anno[idxe, 'cds_chr_end']- 1
            part1 <- c_end- exon_anno[idxe, 'cds_start'] + 1 
            part2 <- exon_anno[idxs, 'cds_end'] - c_sta + 1  
            
            insert <- unlist(lapply(1:(idxe-idxs), function(x) 
                paste(exon_anno[idxe - x, 'cds_chr_start'] - 
                        exon_anno[idxe-x + 1, 'cds_chr_end']- 1, 
                        'N', sep='')))
            if(idxe-idxs >1){
                innerexon <- unlist(lapply(1:(idxe-idxs-1), function(x) 
                        paste(exon_anno[idxe-x, 'cds_chr_end'] - 
                        exon_anno[idxe-x, 'cds_chr_start']+1, 'M', sep='')))
            }else{ innerexon <-''}
        
            midpattern <- rep(NA, length(insert)+length(innerexon))
            midpattern[seq(1, length(insert) + length(innerexon), 
                                                        by=2)] <- insert
            midpattern[seq(2, length(insert) + length(innerexon), 
                                                        by=2)] <- innerexon
            midpattern <- paste(midpattern, collapse='') 

        }

        CIGAR <- paste(part1, 'M', midpattern, part2, 'M', sep='')    
    }

    tmp <- c(FLAG, RNAME, POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, 
                as.character(SEQ), QUAL, XA)
    tmp
}

Any scripts or data that you put into this service are public.

proBAMr documentation built on Nov. 8, 2020, 8:05 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

proBAMr
Generating SAM file for PSMs in shotgun proteomics data

R/PSMtab2SAM.R
In proBAMr: Generating SAM file for PSMs in shotgun proteomics data

Defines functions .MapCoding2genome .peptideUnannotated .proteinUnannotated PSMtab2SAM

Documented in PSMtab2SAM

Try the proBAMr package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

proBAMr Generating SAM file for PSMs in shotgun proteomics data

R/PSMtab2SAM.R In proBAMr: Generating SAM file for PSMs in shotgun proteomics data

Defines functions .MapCoding2genome .peptideUnannotated .proteinUnannotated PSMtab2SAM

Documented in PSMtab2SAM

Try the proBAMr package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

proBAMr
Generating SAM file for PSMs in shotgun proteomics data

R/PSMtab2SAM.R
In proBAMr: Generating SAM file for PSMs in shotgun proteomics data