R/generatingCDSaaFile.R
In geno2proteo: Finding the DNA and Protein Sequences of Any Genomic or Proteomic Loci

Documented in generatingCDSaaFile

generatingCDSaaFile <- function(geneticCodeFile_line, gtfFile, DNAfastaFile, 
                                outputFolder='./', perlExec='perl') {

    scriptFolder= system.file("exec", package="geno2proteo")
    # the perl script used to get the DNA for the CDS from the fasta file.
    perlScript_getDNAseq=file.path(scriptFolder, 
        "getDNAseqsFromGenome_loci.perl")
    # the perl script for translating a DNA sequence into a AA sequence, 
    # using the genetic codes provided.
    perlScript_getAAseq=file.path(scriptFolder, "DNAseqTranslation_CDS.perl") 
    filenamePrefix = sub('.*/', '', gtfFile)
    # read the CDS information from the GTF file 
    kk21 = read.table(gtfFile, sep='\t', stringsAsFactors=FALSE)
    kk = c(1:22, 'X', 'Y', 'MT')
    kk = kk21[,1] %in% kk
    kk21 = kk21[kk,]
    kk = kk21[,3] =='CDS' # CDS are those coding regions
    kk22 = kk21[kk,]
    kkt = kk22[,9]
    # get all the transcript ids
    kk = sub('.*transcript_id ', '',kkt); kk=sub(';.*', '', kk)
    # for each transcipt, get all its CDS (indexes of them)
    kk9991 = tapply(1:length(kk), kk, sort ) 

    # get all the protein ids
    kkt1 = sub('.*protein_id ', '',kkt); kkt1=sub(';.*', '', kkt1)
    # get all the protein ids
    kkt2 = sub('.*gene_name ', '',kkt); kkt2=sub(';.*', '', kkt2)

    pIDgene = cbind(proteinId=kkt1, geneName=kkt2)
    rownames(pIDgene) = kk
    # get the genomic position information of each CDS, and frame for 
    # protein translation
    kk9992 = kk22[,c(1, 4,5, 7, 8)] 
    kk9993 = lapply(kk9991, function(a) kk9992[a,] )
    # order the CDS of each transcript by their start
    kk9994 = lapply(kk9993, function(a) a[order(a[,2]),] )

    kk99941 = lapply(kk9994, as.matrix)

    kk = names(kk99941)

    kk9995 = lapply(1:length(kk), function(n) {
        kkt=kk99941[[n]]; kkt1=1:nrow(kkt); 
        if(kkt[1,4]=='-') kkt1=rev(kkt1); 
        cbind(paste(kk[n], kkt1, sep='_'), kkt) 
    } )

    kk99951 = lapply(kk9995, t)
    kk = unlist(kk99951)
    kk = t(matrix(kk, nrow=6))

    # the 6th column is for the frame of translation
    kk9996 = kk[,c(2:5, 1, 6)] 

    kk = strsplit(kk9996[,5], split='_')
    kk = t(matrix(unlist(kk), nrow=2))
    # matrix containing the ordered CDS for each transcript
    kk9996 = cbind(kk9996[,1:4], kk, kk9996[,6]) 

    ## get a proper bed format
    #kk9996 = kk9996[,c(1:3, 5:6, 4, 7)] # now using the

    #kk9996[,2] = as.numeric(kk9996[,2])
    
    filenameAllCDS = paste(outputFolder, '/', filenamePrefix, 
                            '_CDS_transcript.txt', sep='');
    
    write.table(kk9996, file=filenameAllCDS, sep='\t', 
                            quote=FALSE, row.names=FALSE, col.names=FALSE)
    
    # then run the perl script to get the DNA sequence of the CDS regions
    
    filenameAllCDSdnaSeq = paste(outputFolder, '/', 
                    filenamePrefix, '_CDS_transcript_DNAseq.txt', sep='');
    DNAfastaFile00 = DNAfastaFile
    # uncompress the fasta file, not remove the compressed
    if(grepl('\\.gz$', DNAfastaFile)[1]) { 
        #R.utils::gunzip(DNAfastaFile, remove=FALSE)
        #DNAfastaFile = sub('\\.gz$','', DNAfastaFile)
        destName=paste(outputFolder, '/', 
                    sub('.gz$', '', sub('.*\\/', '', DNAfastaFile)), sep='')
        R.utils::decompressFile(filename=DNAfastaFile, destname=destName, 
                                ext='gz', FUN=gzfile, remove=FALSE)
        DNAfastaFile = destName;
        
    }
    # using the double quotation to address the problem associate with blank
    # spaces in the command.
    command = paste('"', perlExec, '" "', perlScript_getDNAseq, '" "', 
                    filenameAllCDS, '" "', DNAfastaFile, '" "', 
                    filenameAllCDSdnaSeq, '"', sep='')
    
    system(command)
    
    if(grepl('\\.gz$', DNAfastaFile00)[1]) { # if uncompressed the fasta file 
        if (file.exists(DNAfastaFile)) { 
            invisible(file.remove(DNAfastaFile)) 
        }
        DNAfastaFile= DNAfastaFile00
    }
    
    # then run the perl script to get the AA sequences from the DNA sequence 
    # of a transcipts from its CDS.
    
    filenameAllCDSdnaSeqAA = paste(outputFolder, '/', 
                                    filenamePrefix, '_AAseq.txt', sep='');
    
    command = paste('"', perlExec, '" "', perlScript_getAAseq, '" "', 
        filenameAllCDSdnaSeq, '" "', geneticCodeFile_line, '" "', 
        filenameAllCDSdnaSeqAA, '"', sep='')
    
    system(command)

    resultsNow = read.table(filenameAllCDSdnaSeqAA, stringsAsFactors=FALSE,
                        sep='\t')
    
    kk = pIDgene[resultsNow[,5],]
    rownames(kk) = NULL
    
    # append the protein ID and gene symbol to the table.
    resultsNow = cbind(resultsNow, kk) 
    
    write.table(resultsNow, file=filenameAllCDSdnaSeqAA, sep='\t', 
                    row.names=FALSE, col.names=FALSE, quote=FALSE)
    
    R.utils::gzip(filenameAllCDSdnaSeqAA, overwrite=TRUE) # compress the file
    
    if (file.exists(filenameAllCDSdnaSeq)) { 
        invisible(file.remove(filenameAllCDSdnaSeq)) 
    }

    if ( file.exists(filenameAllCDS) ) { 
        invisible(file.remove(filenameAllCDS)) 
    }
}

Any scripts or data that you put into this service are public.

geno2proteo documentation built on June 13, 2022, 5:08 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

geno2proteo
Finding the DNA and Protein Sequences of Any Genomic or Proteomic Loci

R/generatingCDSaaFile.R
In geno2proteo: Finding the DNA and Protein Sequences of Any Genomic or Proteomic Loci

Defines functions generatingCDSaaFile

Documented in generatingCDSaaFile

Try the geno2proteo package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

geno2proteo Finding the DNA and Protein Sequences of Any Genomic or Proteomic Loci

R/generatingCDSaaFile.R In geno2proteo: Finding the DNA and Protein Sequences of Any Genomic or Proteomic Loci

Defines functions generatingCDSaaFile

Documented in generatingCDSaaFile

Try the geno2proteo package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

geno2proteo
Finding the DNA and Protein Sequences of Any Genomic or Proteomic Loci

R/generatingCDSaaFile.R
In geno2proteo: Finding the DNA and Protein Sequences of Any Genomic or Proteomic Loci