R/Outputaberrant.R

##' Short insertion/deletion may lead to aberrant proteins in cells. We provide a function to generate FASTA file containing this kind of proteins.
##'
##'
##' the function applys the INDEL into the coding sequence, then translates them into protein sequence, terminated by stop codon. Remove the sequences the same as normal ones or as part of normal ones.
##' @title generate FASTA file containing short INDEL
##' @param positiontab a data frame which is the output of function Positionincoding() for INDELs.
##' @param outfile output file name
##' @param coding a data frame cotaining coding sequence for each protein.
##' @param proteinseq a data frame cotaining amino acid sequence for each protein.
##' @param ids a dataframe containing gene/transcript/protein id mapping information.
##' @param RPKM if includes the RPKM value in the header of each sequence, default is NULL.
##' @param ... Additional arguments.
##' @return FASTA file containing aberrant proteins.
##' @author Xiaojing Wang
##' @importFrom stringr str_sub<-
##' @export
##' @examples
##' 
##' vcffile <- system.file("extdata/vcfs", "test1.vcf", package="customProDB")
##' vcf <- InputVcf(vcffile)
##' table(GenomicRanges::values(vcf[[1]])[['INDEL']])
##' index <- which(GenomicRanges::values(vcf[[1]])[['INDEL']] == TRUE)
##' indelvcf <- vcf[[1]][index]
##' 
##' load(system.file("extdata/refseq", "exon_anno.RData", package="customProDB"))
##' load(system.file("extdata/refseq", "dbsnpinCoding.RData", package="customProDB"))
##' load(system.file("extdata/refseq", "procodingseq.RData", package="customProDB"))
##' load(system.file("extdata/refseq", "proseq.RData", package="customProDB"))
##' load(system.file("extdata/refseq", "ids.RData", package="customProDB"))
##' postable_indel <- Positionincoding(indelvcf, exon)
##' txlist_indel <- unique(postable_indel$txid)
##' codingseq_indel <- procodingseq[procodingseq$tx_id %in% txlist_indel, ]
##' outfile <-  paste(tempdir(), '/test_indel.fasta', sep='')
##' Outputaberrant(postable_indel, coding=codingseq_indel, 
##' proteinseq=proteinseq, outfile=outfile, ids=ids)
##'                 
##' 
##' 
 

Outputaberrant <- function(positiontab, outfile, coding, proteinseq, ids, 
                    RPKM=NULL, ...)
{
        stopifnot(nrow(coding) > 0, nrow(proteinseq) > 0, nrow(ids) > 0)
        if (nrow(positiontab) < 1)
        {
          warning("empty positiontab; Outputaberrant has nothing to do")
          return()
        }
  
        old <- options(stringsAsFactors = FALSE)
        on.exit(options(old), add = TRUE)
  
        idx <- grep(',', positiontab$varbase, fixed=T)
        rsidIndex = which("rsid" %in% colnames(positiontab))
        cosmicIndex = which("cosid" %in% colnames(positiontab))
        if(length(idx) > 0) {
            muti <- positiontab[idx, ]
            muti_new <- c()
            for(i in 1:dim(muti)[1]){
                tmp <- cbind(muti[i, 1:8], 
                        unlist(strsplit(as.character(muti[i, 'varbase']), ',', fixed=TRUE)), 
                        muti[i, 10])
                if (length(rsidIndex) > 0) tmp <- cbind(tmp, muti[i, rsidIndex, with=FALSE])
                if (length(cosmicIndex) > 0) tmp <- cbind(tmp, muti[i, cosmicIndex, with=FALSE])
                muti_new <- rbind(muti_new, tmp)
            }
            colnames(muti_new) <- colnames(positiontab)
            positiontab <- positiontab[-idx, ]
            positiontab <- rbind(positiontab, muti_new)
        }

        
        coding$tx_id = as.integer(coding$tx_id)
        mtable <- merge(positiontab, coding, by.x='txid', by.y='tx_id', all.x=TRUE, stringsAsFactors = FALSE)
        mtable <- mtable[!is.na(mtable$coding),]
        #mtable <- cbind(positiontab,codingseq)

        mtab_plus <- as.data.frame(subset(mtable, strand == '+'))
        mtab_minus <- as.data.frame(subset(mtable, strand == '-'))

        str_sub(mtab_plus$coding, mtab_plus$pincoding, 
        nchar(mtab_plus$refbase) + mtab_plus$pincoding - 1) <- unlist(mtab_plus$varbase)

        str_sub(mtab_minus$coding, mtab_minus$pincoding, 
                mtab_minus$pincoding + nchar(mtab_minus$refbase) - 1) <- fastComplement(mtab_minus$varbase)

        total <- rbind(mtab_plus,mtab_minus)

        ## the translate() function cann't deal with base which is not 'ATGC', so replace 'N' with 'A' and lable it in the header
        lable <- unlist(lapply(total$coding, function(x){
                         if(grepl ('N',x,fixed=T)){
                            paste('with N in pos ', paste(as.character(
                            gregexpr('N', x, fixed=T)[[1]]), collapse=' '), 
                            sep='')
                         }else  ""}))
        total$coding <- gsub('N', 'A', total$coding, fixed=T)
        
        # suppress warnings about last codon being ignored if it isn't 3 bases long
        suppressWarnings(aa <- as.character(translate(DNAStringSet(total$coding))))

        total <- cbind(total, aa, lable)
        pep <- apply(total, 1, function(x) unlist(strsplit(x['aa'], '\\*'))[1])
        total <- cbind(total, pep)

        #########remove the entry identical to normal sequence
        plist <- unique(total$proname)
        proteinseq <- subset(proteinseq, pro_name %in% plist)
        mseq <- merge(total,proteinseq, by.x='proname', by.y='pro_name', 
                    all=FALSE, stringsAsFactors = FALSE)
        #index <- apply(mseq, 1, function(x)  grep(x['pep'],x['peptide'],fixed=T))
        #grep(mseq[4,'pep'],mseq[4,'peptide'])

        index <-c()
        for(i in 1:dim(mseq)[1]){
            index <- c(index, ifelse(grep(mseq[i, 'pep'], 
                            mseq[i, 'peptide'], fixed=T) == 1, i, '' ))
            #print(i)
        }
        if(length(index) > 0) mseq_r <- mseq[-index, ] else mseq_r <- mseq

        mseq_f <- merge(mseq_r, ids, by.x='pro_name', by.y='pro_name', 
                    all=FALSE, stringsAsFactors = FALSE)
        outformat <- paste('>', mseq_f$pro_name, "_", mseq_f$pincoding, 
                    ":", mseq_f$refbase, '>', mseq_f$varbase, " |", 
                    mseq_f$tx_name, "|", mseq_f$gene_name, "|", 
                    mseq_f$description, "|", mseq_f$lable, '\n', 
                    mseq_f$pep, sep='')
        
        if(!is.null(RPKM)){
            v <- unlist(lapply(mseq_f$pro_name, function(x) 
                ifelse(x %in% names(RPKM), paste(round(RPKM[x], 4)), paste('NA'))))
            mseq_f <- cbind(mseq_f, v)
            mseq_f <- mseq_f[order(as.numeric(mseq_f$v), decreasing=T),]
            outformat <- paste('>', mseq_f$pro_name, "_", 
                        mseq_f$pincoding, ":", mseq_f$refbase, '>', 
                        mseq_f$varbase, " |", mseq_f$v, "|", 
                        mseq_f$tx_name, "|", mseq_f$gene_name, "|", 
                        mseq_f$description, "|", mseq_f[, 'lable'], '\n', 
                        mseq_f[, 'pep'], sep='')
    
        }
                
        write(outformat, file=outfile)

        
        #######used as input for proBAMr 
        
        # add DNA complement column if it is missing
        if (is.null(mseq_f$dna_complement))
        {
          #message("Optimizing procodingseq for fast reverse complement access...")
        }
        
        indelproseq <- mseq_f
        indelproseq$pro_name <- paste0(mseq_f$pro_name, "_", mseq_f$pincoding, 
                                       ":", mseq_f$refbase, '>', mseq_f$varbase)
        
        indelproseq <- indelproseq[, c('pro_name', 'peptide', 'tx_name.x','gene_name', 'description')]
        colnames(indelproseq) <- c('pro_name', 'peptide', 'tx_name','gene_name', 'description')
        
        indelprocoding <- mseq_f
        indelprocoding$pro_name <- paste0(mseq_f$pro_name, "_", mseq_f$pincoding, 
                                          ":", mseq_f$refbase, '>', mseq_f$varbase)
        
        indelprocoding <- indelprocoding[, c('pro_name', 'tx_name.x', 'txid', 'coding',
                                             'gene_name', 'description')]
        colnames(indelprocoding) <- c('pro_name', 'tx_name', 'tx_id', 'coding',
                                      'gene_name', 'description')
        indelprocoding$dna_complement = fastComplement(indelprocoding$coding)
        
        list(indelprocoding=indelprocoding, indelproseq=indelproseq)
    }
chambm/customProDB documentation built on May 31, 2019, 12:08 p.m.