R/OutputNovelJun.R
In customProDB: Generate customized protein database from NGS data, with a focus on RNA-Seq data, for proteomics search

Documented in OutputNovelJun

##' Three-frame translation of novel junctions. And remove those could be found in normal protein sequences.
##' This function requires a genome built by BSgenome package.
##' 
##' @title generate peptide FASTA file that contains novel junctions.
##' @param junction_type a data frame which is the output of function JunctionType()
##' @param genome a BSgenome object. (e.g. Hsapiens)
##' @param outfile output file name
##' @param proteinseq a data frame cotaining amino acid sequence for each protein.
##' @param ... Additional arguments.
##' @return FASTA file that contains novel junction peptides.
##' @author Xiaojing Wang
##' @examples
##' 
##' bedfile <- system.file("extdata/beds", "junctions1.bed", package="customProDB")
##' jun <-  Bed2Range(bedfile,skip=1,covfilter=5)
##' load(system.file("extdata/refseq", "splicemax.RData", package="customProDB"))
##' load(system.file("extdata/refseq", "ids.RData", package="customProDB"))
##' txdb <- loadDb(system.file("extdata/refseq", "txdb.sqlite", 
##'             package="customProDB"))
##' junction_type <- JunctionType(jun, splicemax, txdb, ids)
##' table(junction_type[, 'jun_type'])
##' chrom <- paste('chr',c(1:22,'X','Y','M'),sep='')
##' junction_type <- subset(junction_type, seqnames %in% chrom)
##' outf_junc <- paste(tempdir(), '/test_junc.fasta', sep='')
##' #outf_junc_coding <- paste(tempdir(), '/test_junc_coding.fasta', sep='')
##' load(system.file("extdata/refseq", "proseq.RData", package="customProDB"))
##' library('BSgenome.Hsapiens.UCSC.hg19')
##' OutputNovelJun <- OutputNovelJun(junction_type, Hsapiens, outf_junc, 
##'              proteinseq)
##' 


OutputNovelJun <- function(junction_type, genome, outfile, 
    #outfile_c, 
    proteinseq, ...)
    {
        options(stringsAsFactors=FALSE)
        #ids <- subset(ids,pro_name!='')
        
        #trans <- transcripts(txdb)
        #index <- which(values(trans)[['tx_name']] %in% ids[,'tx_name'])
        #pro_trans <- trans[index]
        novel_junc <- subset(junction_type, jun_type != 'known junction')
        if(!length(grep('chr', novel_junc[, 'seqnames'], fixed=T))>0) {
            novel_junc[, 'seqnames'] <- paste('chr', novel_junc[, 'seqnames'], sep='')
            idx <- which(novel_junc[, 'seqnames'] %in% seqnames(genome))
            novel_junc <- novel_junc[idx, ]
        }
        
        ###remove abnormal junctions
        idx_abn <- union(which(novel_junc[, 'start'] < 0),  
                    which(novel_junc[, 'end'] < 0))
        if(length(idx_abn > 0)) novel_junc <- novel_junc[-idx_abn, ]
        
        junRange1 <- GRanges(seqnames=novel_junc$seqnames, 
                ranges=IRanges(start=novel_junc$part1_sta, 
                end=novel_junc$part1_end), 
                strand=novel_junc$strand, 
                junction_id=novel_junc$id)
        
        junRange2 <- GRanges(seqnames=novel_junc$seqnames, 
                ranges=IRanges(start=novel_junc$part2_sta, 
                end=novel_junc$part2_end), 
                strand=novel_junc$strand, 
                junction_id=novel_junc$id)
        
		######prepare annotation for proBAMr
		jr1 <-  IRanges::as.data.frame(junRange1)
		jr2 <- IRanges::as.data.frame(junRange2)
		jr1_rank <- unlist(lapply(jr1[, 'strand'], function(x) ifelse(x=='+', 1, 2)))
		jr1 <- cbind(jr1, rank=jr1_rank)
		jr2_rank <- unlist(lapply(jr2[, 'strand'], function(x) ifelse(x=='+', 2, 1)))
		
		jr2 <- cbind(jr2, rank=jr2_rank)
		
		jrs <- rbind(jr1, jr2)
		colnames(jrs) <- c('chromosome_name', 'cds_chr_start', 'cds_chr_end', 'width', 'strand', 'tx_name', 'rank')
		ttt <- split(jrs, jrs$tx_name)

        jrs_list <-lapply(ttt, function(x){
        #len <- x[,'cds_e']-x[,'cds_s']+1
        #cum <- cumsum(len)
		x <- x[order(x$rank),]
        cum <- cumsum(x[, 'width'])
        rdis <- cbind(c(1, cum[1:length(cum)-1]+1), cum)
        colnames(rdis) <- c('cds_start', 'cds_end')
        tmp <- cbind(x, rdis)
        tmp
        })
		
		nov_jun_anno <- do.call(rbind, jrs_list)
		
		jun_anno <- merge(novel_junc[, 1:7], nov_jun_anno, by.x='id', by.y='tx_name')
		jun_anno <- jun_anno[, -c(2, 6)]
		colnames(jun_anno) <- c("tx_name", "start_position", "end_position", 'intron_len', 'cov',"chromosome_name", 
			"cds_chr_start", "cds_chr_end", "width", 'strand', "rank", "cds_start", 
			"cds_end")
		pro_name <- paste(paste(jun_anno[, 'tx_name'], '_', 
                    jun_anno[, 'chromosome_name'], ':', 
                    jun_anno[, 'start_position'], '-', jun_anno[, 'end_position'], 
                    sep=''), jun_anno[, 'cov'],sep='|')
		jun_anno <- cbind(jun_anno, 'pro_name'=pro_name)
		#jun_anno[, 'chromosome_name'] <- gsub('chr', '', jun_anno[, 'chromosome_name'] )
		save(jun_anno, file=paste(outfile, '_jun_anno.RData', sep=''))
        
        
		#match1_protx <- findOverlaps(junRange1,pro_trans)
        #match2_protx <- findOverlaps(junRange2,pro_trans)
        
        #juntransRange1 <- junRange1[unique(queryHits(match1_protx))]
        #juntransRange2 <- junRange2[unique(queryHits(match2_protx))]
        
        #junseq1 <- getSeq(genome,'chr1',start=1000,end=2000,as.character=TRUE)
        ###already did reverseComplement
        junseq1 <- getSeq(genome, junRange1)
        junseq2 <- getSeq(genome, junRange2)
        
        junseq_cat <- DNAStringSet(mapply(function(x, y, z) 
                ifelse(z == '+', paste(x, y, sep=''), paste(y, x, sep='')), 
                as.data.frame(junseq1)[, 1], 
                as.data.frame(junseq2)[, 1], as.character(strand(junRange1))))
        
        #index_plus <- which(strand(junRange1) == '+')
        #index_minus <- which(strand(junRange1) == '-')
        #seqs_plus <- junseq_cat[index_plus]
        #seqs_minus <- reverseComplement(junseq_cat[index_minus])
        #seqs <- c(seqs_plus, seqs_minus)
        
        #novel_junc_new <- rbind(novel_junc[index_plus, ], 
        #                                novel_junc[index_minus, ])
        novel_junc_new <- novel_junc
        seqs <- junseq_cat
        
        ##Remove sequences contains NNN
        Nindx <- grep('N', seqs)
        if(length(Nindx) > 0){
            seqs <- seqs[-Nindx]
            novel_junc_new <- novel_junc_new[-Nindx, ]
        }
        
        seqs_name <- paste(paste(novel_junc_new[, 'id'], '_', 
                    novel_junc_new[, 'seqnames'], ':', 
                    novel_junc_new[, 'start'], '-', novel_junc_new[, 'end'], 
                    sep=''), novel_junc_new[, 'cov'],sep='|')
                    
        junpepcoding <- data.frame('pro_name'=seqs_name, 
                                    'coding'=as.data.frame(seqs)[, 1])
        ######## coding seqs could be used as input for proBAMr
		save(junpepcoding, file=paste(outfile, '_coding.RData', sep=''))
        
        #######three frame translation
		peptides_r1 <- translate(seqs)
        peptides_r2 <- translate(subseq(seqs, start=2))
        peptides_r3 <- translate(subseq(seqs, start=3))
        
        junpos_rna_p1 <- ifelse(novel_junc_new[, 'strand'] == '+', 
                        as.numeric(novel_junc_new[, 'part1_len']), 
                        as.numeric(novel_junc_new[, 'part2_len']))
        junpos_rna_p2 <- ifelse(novel_junc_new[, 'strand'] == '+', 
                        as.numeric(novel_junc_new[, 'part1_len'])+1, 
                        as.numeric(novel_junc_new[, 'part2_len'])+1)
        
        junpos_r1_p1 <- ceiling(junpos_rna_p1/3)
        junpos_r1_p2 <- ceiling(junpos_rna_p2/3)
        
        junpos_r2_p1 <- ceiling((junpos_rna_p1-1)/3)
        junpos_r2_p2 <- ceiling((junpos_rna_p2-1)/3)
        
        junpos_r3_p1 <- ceiling((junpos_rna_p1-2)/3)
        junpos_r3_p2 <- ceiling((junpos_rna_p2-2)/3)
        
        name_r1 <- paste(paste(novel_junc_new[, 'id'], '_', novel_junc_new[, 'seqnames'], 
                    ':', novel_junc_new[, 'start'], '-', novel_junc_new[, 'end'], 
                    sep=''), novel_junc_new[, 'cov'], 'ORF1 ', 
                    paste('Junpos:', junpos_r1_p1, '-', junpos_r1_p2, sep=''), 
                    novel_junc_new[, 'strand'], novel_junc_new[, 'tx_name_part1'], 
                    novel_junc_new[, 'tx_name_part2'], 
                    novel_junc_new[, 'jun_type'], sep='|')
        name_r2 <- paste(paste(novel_junc_new[, 'id'],'_', novel_junc_new[, 'seqnames'], 
                    ':',novel_junc_new[, 'start'], '-', novel_junc_new[, 'end'], 
                    sep=''), novel_junc_new[, 'cov'], 'ORF2 ', 
                    paste('Junpos:', junpos_r2_p1, '-', junpos_r2_p2, sep=''), 
                    novel_junc_new[, 'strand'], novel_junc_new[, 'tx_name_part1'], 
                    novel_junc_new[, 'tx_name_part2'], 
                    novel_junc_new[, 'jun_type'], sep='|')
        name_r3 <- paste(paste(novel_junc_new[, 'id'],'_', novel_junc_new[, 'seqnames'], 
                    ':',novel_junc_new[, 'start'], '-', novel_junc_new[, 'end'], 
                    sep=''), novel_junc_new[, 'cov'], 'ORF3 ', 
                    paste('Junpos:', junpos_r3_p1, '-', junpos_r3_p2, sep=''), 
                    novel_junc_new[, 'strand'], novel_junc_new[, 'tx_name_part1'], 
                    novel_junc_new[, 'tx_name_part2'], 
                    novel_junc_new[, 'jun_type'], sep='|')
        
        all_pep<- rbind(cbind(name_r1, as.data.frame(peptides_r1)[, 1]), 
                    cbind(name_r2, as.data.frame(peptides_r2)[, 1]), 
                    cbind(name_r3, as.data.frame(peptides_r3)[, 1]))
        
        ### remove peptide contain stop codon
        #index_stop <- grep('*', all_pep[, 2], fixed=T)
        #if(length(index_stop) > 0){
        #    all_pep_rmstop <- all_pep[-index_stop, ]
        #}else all_pep_rmstop <- all_pep
        
		### remove any amino acids after first stop codon. after that,
		### if the peptide is less than 7 amino acid, remove it
		###	if the peptide is not cover the junction position, remove it
		all_pep_rmstop <- all_pep
		all_pep_rmstop[, 2] <- unlist(lapply(all_pep_rmstop[, 2], function(x) 
									strsplit(x, '*',fixed=TRUE)[[1]][1]))
		peplen <- lapply(all_pep_rmstop[, 2], nchar)
		index_keep <- intersect(which(peplen > c(junpos_r1_p1, junpos_r2_p1, junpos_r3_p1)),
						which(peplen > 7))
		all_pep_rmstop <- all_pep_rmstop[index_keep, ]
		
		### check if any peptides can be found in the normal database, remove those
		PEP <- all_pep_rmstop[, 2]
        normalProteins = proteinseq[, 'peptide']
		names(normalProteins) = proteinseq[, 'pro_name']
		idxList = AhoCorasickSearchList(PEP, list(normal=normalProteins), groupByKeyword=T)
		normal_pep <- names(idxList$normal)
		
		index_nor <- which(all_pep_rmstop[, 2] %in% normal_pep)
		
		###slow
		#index_nor <- lapply(all_pep_rmstop[, 2], function(x) 
        #                    grep(x, proteinseq[, 'peptide'], fixed=T))
        #index_nor <- which(unlist(lapply(index_nor, length)) > 0)
        
        if(length(index_nor) > 0){
            all_pep_new <- all_pep_rmstop[-index_nor, ] 
        }else all_pep_new <- all_pep_rmstop
        
		tmp <- paste('>', all_pep_new[, 1], '\n', all_pep_new[, 2], sep='')
        write(tmp, file=outfile)
        
		junpep <- data.frame( 'peptide'=all_pep_new[, 2], 'pro_name_v'=all_pep_new[, 1], 
                             'pro_name' = unlist(lapply(all_pep_new[, 1], function(x)
												strsplit(x, ' ')[[1]][1]
											))
							  )
		
		save(junpep, file=paste(outfile, '_junpep.RData', sep=''))
        
    }

Any scripts or data that you put into this service are public.

customProDB documentation built on Nov. 8, 2020, 8:06 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

customProDB
Generate customized protein database from NGS data, with a focus on RNA-Seq data, for proteomics search

R/OutputNovelJun.R
In customProDB: Generate customized protein database from NGS data, with a focus on RNA-Seq data, for proteomics search

Defines functions OutputNovelJun

Documented in OutputNovelJun

Try the customProDB package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

customProDB Generate customized protein database from NGS data, with a focus on RNA-Seq data, for proteomics search

R/OutputNovelJun.R In customProDB: Generate customized protein database from NGS data, with a focus on RNA-Seq data, for proteomics search

Defines functions OutputNovelJun

Documented in OutputNovelJun

Try the customProDB package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

customProDB
Generate customized protein database from NGS data, with a focus on RNA-Seq data, for proteomics search

R/OutputNovelJun.R
In customProDB: Generate customized protein database from NGS data, with a focus on RNA-Seq data, for proteomics search