R/OutputVarprocodingseq.R

##' Output 'snvprocoding'
##'
##' This function uses the output of aaVariation() as input, introduces the nonsynonymous variation into the protein database.
##' @title Output the variant(SNVs) protein coding sequences
##' @param vartable A data frame which is the output of aaVariation().
##' @param procodingseq A dataframe containing protein ids and coding sequence for the protein.
##' @param ids A dataframe containing gene/transcript/protein id mapping information.
##' @param lablersid If includes the dbSNP rsid in the header of each sequence, default is FALSE. 
##'             Must provide dbSNP information in function Positionincoding() if put TRUE here.
##' @param show_progress If true, a progress bar will be shown.
##' @param ... Additional arguments
##' @return a data frame containing protein coding sequence proteins with single nucleotide variation.
##' @author Xiaojing Wang
##' @importFrom data.table data.table rbindlist setkey setDT set
##' @export
##' @examples
##' 
##' vcffile <- system.file("extdata/vcfs", "test1.vcf", package="customProDB")
##' vcf <- InputVcf(vcffile)
##' table(GenomicRanges::values(vcf[[1]])[['INDEL']])
##' index <- which(GenomicRanges::values(vcf[[1]])[['INDEL']] == FALSE)
##' SNVvcf <- vcf[[1]][index]
##' load(system.file("extdata/refseq", "exon_anno.RData", package="customProDB"))
##' load(system.file("extdata/refseq", "dbsnpinCoding.RData", package="customProDB"))
##' load(system.file("extdata/refseq", "procodingseq.RData", package="customProDB"))
##' load(system.file("extdata/refseq", "ids.RData", package="customProDB"))
##' load(system.file("extdata/refseq", "proseq.RData", package="customProDB"))
##' postable_snv <- Positionincoding(SNVvcf, exon, dbsnpinCoding)
##' txlist <- unique(postable_snv$txid)
##' codingseq <- procodingseq[procodingseq$tx_id %in% txlist, ]
##' mtab <- aaVariation (postable_snv, codingseq)
##' OutputVarprocodingseq(mtab, codingseq, ids, lablersid=TRUE)
##' 

OutputVarprocodingseq <- function(vartable, procodingseq, ids, lablersid=FALSE, show_progress=FALSE, ...)
    {
        stopifnot(!lablersid || "rsid" %in% colnames(vartable))
  
        old <- options(stringsAsFactors = FALSE)
        on.exit(options(old), add = TRUE)
        
        nonsy <- vartable[vartable$vartype == "non-synonymous", ]
        
        aavar2pro <- nonsy
        
        refbase <- mapply(function(base, strand) ifelse(strand=='+', base, fastComplement(base)), aavar2pro$refbase, aavar2pro$strand)
        varbase <- mapply(function(base, strand) ifelse(strand=='+', base, fastComplement(base)), aavar2pro$varbase, aavar2pro$strand)

        aavar2pro$refbase <- refbase
        aavar2pro$varbase <- varbase
        aavar2pro <- aavar2pro[aavar2pro$aaref != "*", ]
        #aavar2pro <- aavar2pro[aavar2pro$aavar!="*", ]
        aavar2pro <- unique(aavar2pro)

        setDT(aavar2pro, key=c("proname", "aapos"))
        
        plist <- unique(aavar2pro$proname)
        pepcoding <- procodingseq[procodingseq$pro_name %in% plist, ]
        if (lablersid)
          pep_vars_by_name = lapply(plist, function(x) aavar2pro[x, .(varbase, aaref, aavar=unlist(aavar), aapos, pincoding, rsid)])
        else
          pep_vars_by_name = lapply(plist, function(x) aavar2pro[x, .(varbase, aaref, aavar=unlist(aavar), aapos, pincoding)])
        names(pep_vars_by_name) = plist
        
        # add DNA complement column if it is missing
        if (is.null(procodingseq$dna_complement))
        {
          #message("Optimizing procodingseq for fast reverse complement access...")
          pepcoding$dna_complement = fastComplement(pepcoding$coding)
        }
        
        coding_index = which(colnames(pepcoding)=="coding")
        complement_index = which(colnames(pepcoding)=="dna_complement")
        
        #pep_var <- pepcoding
        var_names <- vector('character', length(plist))
        if (show_progress) { pb = txtProgressBar(0, nrow(pepcoding), style=3) }
        for(i in 1:nrow(pepcoding)){
            if (show_progress) { setTxtProgressBar(pb, i) }
            #print(i)
            #pvar <- aavar2pro[pep_var$pro_name[i, 'pro_name'])
            #pvar <- pvar[order(as.numeric(pvar$aapos)), ]
            pro_name = pepcoding$pro_name[[i]]
            varcoding = pepcoding$coding[[i]]
            varcomplement = pepcoding$dna_complement[[i]]
            
            pvar = pep_vars_by_name[[pro_name]]
            pincoding = pvar$pincoding
            aaref = pvar$aaref
            aapos = pvar$aapos
            aavar = pvar$aavar
            rsid = pvar$rsid
            varbase = pvar$varbase
            
            var_names_each = vector('character', nrow(pvar))
            
            # apply the SNPs to the reference coding sequence
            for(j in 1:nrow(pvar)){
                p = pincoding[[j]]
                var = substr(varbase[[j]], 1, 1)
                substr(varcoding, p, p) <- var
                substr(varcomplement, p, p) <- fastComplement(var)
                
                if(lablersid && !is.na(rsid[[j]])){
                  var_names_each[[j]] <- paste0(rsid[[j]], ":", aaref[[j]], aapos[[j]], substr(aavar[[j]], 1, 1))
                }else{
                  var_names_each[[j]] <- paste0(aaref[[j]], aapos[[j]], substr(aavar[[j]], 1, 1))
                }
            }
            # assign by reference
            set(pepcoding, i, coding_index, varcoding)
            set(pepcoding, i, complement_index, varcomplement)

            var_names[[i]] = paste0(var_names_each, collapse=",")

            split_on_stop <- strsplit(var_names[[i]], '*', fixed=TRUE)
            has_stop_gain <- length(split_on_stop[[1]]) > 1
            var_names[[i]] <- paste0(split_on_stop[[1]][1], ifelse(has_stop_gain, '*', ''))
        }
        if (show_progress) { close(pb) }
        
        pep_all = data.table(pro_name=pepcoding$pro_name,
                             tx_name=pepcoding$tx_name,
                             tx_id=pepcoding$tx_id,
                             coding=pepcoding$coding,
                             dna_complement=pepcoding$dna_complement,
                             var_name=var_names,
                             key="pro_name")

        setDT(ids, key="pro_name")
        snvprocoding = pep_all[ids, nomatch=0]
        snvprocoding$pro_name <- paste0(snvprocoding$pro_name, "_", snvprocoding$var_name)
        
        snvprocoding[, .(pro_name, tx_name, tx_id,
                         coding, dna_complement,
                         gene_name, description)]
        #write(outformat, file=outfile)
    }
chambm/customProDB documentation built on May 31, 2019, 12:08 p.m.