R/PrepareAnnotationRefseq.R

Defines functions .UCSC_dbselect .gen_splicmatrix PrepareAnnotationRefseq

Documented in PrepareAnnotationRefseq

##' prepare the annotation for Refseq through UCSC table browser.
##'
##' @title prepare annotation for Refseq
##' @param genome specify the UCSC DB identifier (e.g. "hg19")
##' @param CDSfasta path to the fasta file of coding sequence.
##' @param pepfasta path to the fasta file of protein sequence, check 'introduction' for more detail.
##' @param annotation_path specify a folder to store all the annotations.
##' @param dbsnp specify a snp dataset to be used for the SNP annotation, default is NULL. (e.g. "snp148")
##' @param transcript_ids optionally, only retrieve transcript annotation data for the specified set of transcript ids. Default is NULL.
##' @param splice_matrix whether generate a known exon splice matrix from the annotation. this is not necessary if you don't want to analyse junction results, default is FALSE. 
##' @param ClinVar whether to download ClinVar data, default is FALSE.
##' @param ... additional arguments
##' @return several .RData file containing annotations needed for further analysis.
##' @author Xiaojing Wang
##' @examples
##' 
##' transcript_ids <- c("NM_001126112", "NM_033360", "NR_073499", "NM_004448",
##'         "NM_000179", "NR_029605", "NM_004333", "NM_001127511")
##' pepfasta <- system.file("extdata", "refseq_pro_seq.fasta", 
##'             package="customProDB")
##' CDSfasta <- system.file("extdata", "refseq_coding_seq.fasta", 
##'             package="customProDB")
##' annotation_path <- tempdir()
##' PrepareAnnotationRefseq(genome='hg19', CDSfasta, pepfasta, annotation_path, 
##'             dbsnp=NULL, transcript_ids=transcript_ids, 
##'             splice_matrix=FALSE, ClinVar=FALSE)
##' 



PrepareAnnotationRefseq <- function(genome='hg19', CDSfasta, pepfasta, 
                annotation_path, dbsnp=NULL, transcript_ids=NULL, 
                splice_matrix=FALSE, ClinVar=FALSE,
 ...) {
    options(stringsAsFactors=FALSE)
    session  <- browserSession()
    genome(session) <- genome
    tablename <- 'refGene'

    message("Build TranscriptDB object (txdb.sqlite) ... ", appendLF=TRUE)    
    txdb <- makeTxDbFromUCSC(genome=genome, tablename=tablename, 
                transcript_ids=transcript_ids)
    saveDb(txdb, file=paste(annotation_path, '/txdb.sqlite', sep=''))
    packageStartupMessage(" done")
    
    
	message("Prepare gene/transcript/protein id mapping information (ids.RData) ... ", 
            appendLF=FALSE)
    
    query_refGene <- ucscTableQuery(session, "refSeqComposite", table="refGene")
    refGene <- getTable(query_refGene)
	if(!is.null(transcript_ids)){
		refGene <- subset(refGene, name %in% transcript_ids)
		}
    #query <- ucscTableQuery(session, table="hgFixed.refLink", 
    #            names=refGene[, 'name2'])
    reflink <- .UCSC_dbselect("hgFixed", "refLink")
    ids <- subset(reflink, mrnaAcc %in% refGene[, 'name'], select = name:protAcc)
    colnames(ids) <- c('gene_name', 'description', 'tx_name', 'pro_name')
    save(ids, file=paste(annotation_path, '/ids.RData', sep=''))
    packageStartupMessage(" done")
    
    #tr_coding <- subset(ids,pro_name!="")
    #tr_noncoding <- subset(ids,pro_name == "")
    #txdb_coding <- makeTranscriptDbFromUCSC(genome=genome, tablename=tablename, 
    #                transcript_ids=tr_coding[, "tx_name"] )
    #saveDb(txdb_coding, 
    #       file=paste(annotation_path, '/txdb_coding.sqlite', sep=''))
    
    #txdb_noncoding <- makeTranscriptDbFromUCSC(genome=genome, 
    #        tablename=tablename, transcript_ids=tr_noncoding[, "tx_name"] )
    #saveDb(txdb_noncoding, 
    #        file=paste(annotation_path, '/txdb_noncoding.sqlite', sep=''))
    
    message("Prepare exon annotation information (exon_anno.RData) ... ", 
            appendLF=FALSE)
    
    transGrange <- transcripts(txdb)
    tr <- IRanges::as.data.frame(transGrange)
    cdsByTx <- cdsBy(txdb, "tx", use.names=F)
    exonByTx <- exonsBy(txdb, "tx", use.names=F)
    fiveutrByTx <- fiveUTRsByTranscript(txdb, use.names=F)
    threeutrByTx <- threeUTRsByTranscript(txdb, use.names=F)
    
    #####################################################
    # get error in most recent version of IRanges package
    # previous: rownames after unlist 1.1 1.2 1.3
    # now: .1 .2 .3 ... were removed, so there are some rows with same rownames
    ######################################################
    #cdss <-  IRanges::as.data.frame(IRanges::unlist(cdsByTx))     
    #exons <- IRanges::as.data.frame(IRanges::unlist(exonByTx))
    #fiveutrs <- IRanges::as.data.frame(IRanges::unlist(fiveutrByTx))
    #threeutrs <- IRanges::as.data.frame(IRanges::unlist(threeutrByTx))

    cdss <-  IRanges::as.data.frame(cdsByTx)
    exons <- IRanges::as.data.frame(exonByTx)
    fiveutrs <- IRanges::as.data.frame(fiveutrByTx)
    threeutrs <- IRanges::as.data.frame(threeutrByTx)
    
    #txid <- matrix(unlist(strsplit(rownames(exons), '\\.')), ncol = 2, byrow =T)[, 1]
    #txid <- gsub('=','\\.', txid)
    exon_p <- data.frame(txid=exons[, "group_name"], chr=exons[, "seqnames"], 
                exon_s=exons[, "start"], exon_e=exons[, "end"], 
                exon_rank=exons[, "exon_rank"])
    exon2tr <-  merge(exon_p, tr,by.y="tx_id", by.x="txid")
    exon2tr <- exon2tr[, -which(names(exon2tr) %in% c("seqnames", "width"))]

    #txid <- matrix(unlist(strsplit(rownames(cdss), '\\.')), ncol = 2, 
    #       byrow =T)[, 1]
    #txid <- gsub('=','\\.',txid)
    cds_p <- data.frame(txid=cdss[, "group_name"], cds_s=cdss[, "start"], 
                cds_e=cdss[, "end"], exon_rank=cdss[, "exon_rank"], 
                width=cdss[, "width"])
    ttt <- split(cds_p, cds_p$txid)
    
        cds_p_new_list <-lapply(ttt, function(x){
        #len <- x[,'cds_e']-x[,'cds_s']+1
        #cum <- cumsum(len)
        cum <- cumsum(x[, 'width'])
        rdis <- cbind(c(1, cum[1:length(cum)-1]+1), cum)
        colnames(rdis) <- c('cds_start', 'cds_end')
        tmp <- cbind(x, rdis)
        tmp
        })
    

    cds_p_new <- do.call(rbind, cds_p_new_list)
    cds_p_new <- cds_p_new[, -which(names(cds_p_new) %in% c("width"))]
    
    #for(i in 1:length(ttt)) {
    #    print(i)
    #    ttt1 <- ttt[[i]]
    #    len <- ttt1[,'cds_e']-ttt1[,'cds_s']+1
    #    cum <- cumsum(len)
    #    rdis <- cbind(c(1,cum[1:length(cum)-1]+1),cum)
    #    colnames(rdis) <- c('cds_start','cds_end')
    #    tmp <- cbind(ttt1,rdis)
    #    cds_p_new <- rbind(cds_p_new,tmp)
    #}

    cds2exon <- merge(exon2tr, cds_p_new, by.x=c("txid", "exon_rank"), 
                        by.y=c("txid", "exon_rank"), all.x = TRUE)
    #txid <- matrix(unlist(strsplit(rownames(fiveutrs), '\\.')), ncol = 2, 
    #               byrow =T)[, 1]
    #txid <- gsub('=','\\.', txid)
    if(dim(fiveutrs)[1] > 0){
        fiveutr_p <- data.frame(txid=fiveutrs[, "group_name"], 
                                fiveutr_s=fiveutrs[, "start"], 
                                fiveutr_e=fiveutrs[, "end"], 
                                exon_rank=fiveutrs[, "exon_rank"])
        fiveutr2exon <- merge(cds2exon, fiveutr_p, by.x=c("txid", "exon_rank"), 
                    by.y =c("txid", "exon_rank"), all.x = TRUE)
                 
        #txid <- matrix(unlist(strsplit(rownames(threeutrs),'\\.')), 
        #s    ncol = 2,byrow =T)[, 1]
        #txid <- gsub('=','\\.', txid)
        threeutr_p <- data.frame(txid=threeutrs[, "group_name"], 
                                threeutr_s=threeutrs[, "start"], 
                                threeutr_e=threeutrs[, "end"], 
                                exon_rank=threeutrs[, "exon_rank"])
        threeutr2exon <- merge(fiveutr2exon, threeutr_p, 
                    by.x=c("txid", "exon_rank"),
                    by.y=c("txid", "exon_rank"), all.x = TRUE)
        
        #exon <- merge(threeutr2exon,ids,by.x=c("tx_name"),by.y="tx_name",all.x = TRUE)
        exon <- threeutr2exon[order(threeutr2exon$txid, threeutr2exon$exon_rank), ]
        
        colnames(exon) <- c("tx_id","rank", "chromosome_name", 
            "exon_chrom_start", "exon_chrom_end", "start_position", 
            "end_position", "strand", "tx_name", "cds_chr_start", 
            "cds_chr_end", "cds_start", "cds_end", "5_utr_start", 
            "5_utr_end", "3_utr_start", "3_utr_end")
    }else{
        exon <- cds2exon
        colnames(exon) <- c("tx_id","rank", "chromosome_name", 
            "exon_chrom_start", "exon_chrom_end", "start_position", 
            "end_position", "strand", "tx_name", "cds_chr_start", 
            "cds_chr_end", "cds_start", "cds_end")
    }
    pro_name <- ids[match(exon[, 'tx_name'], ids[, 'tx_name']), 'pro_name']
    gene_name <- ids[match(exon[, 'tx_name'], ids[, 'tx_name']), 'gene_name']
    exon <- cbind(exon, pro_name, gene_name)
    
    save(exon, file=paste(annotation_path, '/exon_anno.RData', sep=''))
    packageStartupMessage(" done")
    
    message("Prepare protein sequence (proseq.RData) ... ", appendLF=FALSE)
    pro_seqs <- readAAStringSet(pepfasta, format= 'fasta')
    pro_name_v <- names(pro_seqs)
    pro_name <- unlist(lapply(pro_name_v, function(x) strsplit(x, '\\.')[[1]][1]))
    tx_name <- ids[match(pro_name, ids[, 'pro_name']), 'tx_name']
    proteinseq <- as.data.frame(pro_seqs)
    proteinseq <- cbind(proteinseq, pro_name_v, pro_name, tx_name)
    colnames(proteinseq) <- c("peptide", "pro_name_v", "pro_name", "tx_name")
    proteinseq <- subset(proteinseq, tx_name %in% refGene[, 'name'])
    save(proteinseq, file=paste(annotation_path, '/proseq.RData', sep=''))
    packageStartupMessage(" done")
    
    message("Prepare protein coding sequence (procodingseq.RData)... ", 
                appendLF=FALSE)
    cds_seqs <- readDNAStringSet(CDSfasta, format= 'fasta')
    tx_name_tmp <- unlist(lapply(names(cds_seqs), function(x) strsplit(x, ' ')[[1]][1]))
    tx_name <- unlist(lapply(tx_name_tmp, function(x) paste(strsplit(x, '_')[[1]][3:4], collapse='_')))
    tx_range_tmp <- unlist(lapply(names(cds_seqs), function(x) strsplit(x, ' ')[[1]][2]))
    tx_chr <- unlist(lapply(tx_range_tmp, function(x) substring(strsplit(x, ':')[[1]][1],7)))
    tx_cds_sta <- unlist(lapply(tx_range_tmp, function(x) strsplit(strsplit(x, ':')[[1]][2], '-')[[1]][[1]]))
    tx_cds_end <- unlist(lapply(tx_range_tmp, function(x) strsplit(strsplit(x, ':')[[1]][2], '-')[[1]][[2]]))
    
    tx_name_cds <- refGene[match(paste(tx_name, tx_chr, tx_cds_sta, tx_cds_end, sep=' '), 
                    paste(refGene[, 'name'], refGene[, 'chrom'], 
                    refGene[, 'cdsStart']+1, refGene[, 'cdsEnd'], sep=' ')),
                    c('name','chrom','txStart','txEnd')]
    
    tx_id <- tr[match(paste(tx_name_cds[, 'name'], tx_name_cds[, 'chrom'], 
            tx_name_cds[, 'txStart']+1, tx_name_cds[, 'txEnd'], sep=' '), 
            paste(tr[, 'tx_name'], tr[, 'seqnames'], tr[, 'start'], 
            tr[, 'end'], sep=' ')), 'tx_id']
    
    pro_name <- ids[match(tx_name,ids[, 'tx_name']), 'pro_name']
    procodingseq <- as.data.frame(cds_seqs)
    procodingseq <- cbind(procodingseq, names(cds_seqs), pro_name, tx_name, tx_id)
    colnames(procodingseq) <- c("coding", "tx_name_full", "pro_name", "tx_name", "tx_id")
    procodingseq <- subset(procodingseq, tx_name %in% refGene[, 'name'])
    save(procodingseq, file=paste(annotation_path, '/procodingseq.RData', sep=''))
    
    packageStartupMessage(" done")
    
    if (!is.null(dbsnp)) {
        message("Prepare dbSNP information (dbsnpinCoding.RData) ... ", 
                appendLF=FALSE)
        dbsnps <- trackNames(session)[grep('snp', trackNames(session), fixed=T)]
        dbsnp <- pmatch(dbsnp, dbsnps)
        if (is.na(dbsnp)) 
            stop("invalid dbsnp name for specified genome")
        if (dbsnp == -1) 
            stop("ambiguous dbsnp name")
        
		snpCodingTab <- .UCSC_dbselect(genome(session), 
				paste(dbsnps[dbsnp], 'CodingDbSnp', sep=''))
		#rawToChar(t3[1:20,'alleles'][[2]])   ###hex code to character

		#dbsnp_query <- ucscTableQuery(session, dbsnps[dbsnp],
        #            table=paste(dbsnps[dbsnp], 'CodingDbSnp', sep=''))
        #snpCodingTab <- getTable(dbsnp_query)
        snpCoding <- subset(snpCodingTab,transcript %in% refGene[, 'name'], 
                        select=c(chrom:name, alleleCount, alleles))
        snpCoding[, 'alleles'] <- unlist(lapply(snpCoding[, 'alleles'], 
					function(x) rawToChar(x)))
		snpCoding <- unique(snpCoding)
        #save(snpCoding,file=paste(annotation_path, '/snpcoding.RData', sep=''))
		dbsnpinCoding <- GRanges(seqnames=snpCoding[, 'chrom'], 
            ranges=IRanges(start=snpCoding[, 'chromStart'], 
            end=snpCoding[, 'chromEnd']), strand='*', 
            rsid=snpCoding[, 'name'], alleleCount=snpCoding[, 'alleleCount'], 
            alleles=snpCoding[, 'alleles'])    
        save(dbsnpinCoding, file=paste(annotation_path, '/dbsnpinCoding.RData', 
                sep=''))
        packageStartupMessage(" done")
    }
    
	if (ClinVar) {
        message("Prepare ClinVar information (ClinVar.RData) ... ", appendLF=FALSE)
        
        clinvar_query <- ucscTableQuery(session, 'clinvar', table='clinvarMain')
        clinvarTab <- getTable(clinvar_query)
		
		clinvar <- GRanges(seqnames=clinvarTab[, 'chrom'], 
        ranges=IRanges(start=clinvarTab[, 'chromStart'], end=clinvarTab[, 'chromEnd']), 
        strand = '*', rcvAcc=clinvarTab[,'rcvAcc'], clinSign=clinvarTab[,'clinSign'])    
        
        clinvar <- subsetByOverlaps(clinvar, transGrange)
        
        save(clinvar,file=paste(annotation_path, '/clinvar.RData', sep=''))
        packageStartupMessage(" done")        
    }
	
	
	if(0 > 1){
		####no longer available from UCSC table browser
		if (COSMIC) {
			#cosmic <- trackNames(session)[grep('cosmic',trackNames(session), fixed=T)]
			message("Prepare COSMIC information (cosmic.RData) ... ", appendLF=FALSE)
			
			#cosmic_query <- ucscTableQuery(session, 'cosmic', table='cosmic')
			#cosmicTab <- getTable(cosmic_query)
			cosmicTab <- .UCSC_dbselect(genome(session), 'cosmic')
			
			cosmic <- GRanges(seqnames=cosmicTab[, 'chrom'], 
			ranges=IRanges(start=cosmicTab[, 'chromStart'], end=cosmicTab[, 'chromEnd']), 
			strand = '*', cosid=cosmicTab[,'name'])    
			
			#cosmic <- keepSeqlevels(cosmic,transGrange)
			cosmic <- subsetByOverlaps(cosmic, transGrange)
			
			save(cosmic,file=paste(annotation_path, '/cosmic.RData', sep=''))
			packageStartupMessage(" done")        
		}
	}
	
    if(splice_matrix){
        message("Prepare exon splice information (splicemax.RData) ... ", 
                appendLF=FALSE)
        index <- which(elementNROWS(exonByTx)==1)
        exonByTx_mul <- exonByTx[-index]
        exons_mul <- IRanges::as.data.frame(exonByTx_mul)
        exonslist <- split(exons_mul, exons_mul$group_name)
        #system.time( exonByTx <- exonsBy(txdb,"tx", use.names=F))
        splicemax_list <- lapply(exonslist, .gen_splicmatrix)
        splicemax <- do.call(rbind, splicemax_list)
        save(splicemax, file=paste(annotation_path, '/splicemax.RData', sep=''))
        packageStartupMessage(" done")        
    }
                
}
    
.gen_splicmatrix <- function(x, 
     ...) {           
         mystrand=x[1,'strand']
         a=x[,'exon_rank']
         b=x[,'exon_id']
         n=length(a)
         if (mystrand=='+'){
            tmp=order(a)
            
        }else{
            tmp=order(a,decreasing=T)
            
        }
        mm=cbind(b[tmp[1:(n-1)]], b[tmp[2:n]])
        mm
    }

	
### See https://genome.ucsc.edu/goldenpath/help/mysql.html for how to connect
### to a MySQL server at UCSC. By default UCSC_dbselect() uses the server
### located on the US west coast.
.UCSC_dbselect <- function(dbname, from, columns=NULL, where=NULL,
                          server="genome-mysql.soe.ucsc.edu")
{
    columns <- if (is.null(columns)) "*" else paste0(columns, collapse=",")
    SQL <- sprintf("SELECT %s FROM %s", columns, from)
    if (!is.null(where)) {
        stopifnot(isSingleString(where))
        SQL <- paste(SQL, "WHERE", where)
    }
    dbconn <- dbConnect(RMariaDB::MariaDB(), dbname=dbname,
                                             username="genome",
                                             host=server,
                                             port=3306)
    on.exit(dbDisconnect(dbconn))
    dbGetQuery(dbconn, SQL)
}

Try the customProDB package in your browser

Any scripts or data that you put into this service are public.

customProDB documentation built on Nov. 8, 2020, 8:06 p.m.