#' prepare the annotation from ENSEMBL through biomaRt.
#'
#' this function automaticlly prepares all annotation infromation needed in the following analysis.
#' @title prepare annotation from ENSEMBL
#' @param mart which version of ENSEMBL dataset to use. see useMart from package biomaRt for more detail.
#' @param annotation_path specify a folder to store all the annotations
#' @param dbsnp specify a snp dataset you want to use for the SNP annotation, default is NULL.
#' @param transcript_ids optionally, only retrieve transcript annotation data for the specified set of transcript ids
#' @param splice_matrix whether generate a known exon splice matrix from the annotation; not necessary if you don't want to analyse junction results, default is FALSE.
#' @param COSMIC whether to download COSMIC data, default is FALSE.
#' @param local_cache_path if non-NULL, refers to a directory where previously downloaded resources
#' (like protein coding sequences and COSMIC data) are cached so that the function can be re-run without
#' needing to download identical data again
#' @param ensembl_to_UCSC_genome_map a named list of named lists used to look up the UCSC dbkey for a given biomart; only used for downloading dbSNPs;
#' if DEFAULT_ENSEMBL_UCSC_GENOME_MAP does not contain an up-to-date mapping, pass a new mapping like
#' \code{list("<species>_gene_ensembl" = list("<month>.archive.ensembl.org" = "<ucsc_dbkey>"))}
#' @param ... additional arguments, currently unused
#' @return several .RData file containing annotations needed for following analysis.
#' @author Xiaojing Wang
#' @importFrom rtracklayer browserSession ucscTableQuery tableNames getTable trackNames ucscSchema genome<-
#' @importFrom plyr ddply .
#' @import biomaRt
#' @export
#' @examples
#'
#' ensembl <- biomaRt::useMart("ENSEMBL_MART_ENSEMBL",
#' dataset="hsapiens_gene_ensembl",
#' host="sep2015.archive.ensembl.org")
#'
#' cache_path <- system.file("extdata", "cache", package="customProDB")
#' annotation_path <- tempdir()
#' transcript_ids <- c("ENST00000234420", "ENST00000269305", "ENST00000445888",
#' "ENST00000257430", "ENST00000508376", "ENST00000288602",
#' "ENST00000269571", "ENST00000256078", "ENST00000384871")
#'
#' PrepareAnnotationEnsembl(mart=ensembl, annotation_path=annotation_path,
#' splice_matrix=FALSE, dbsnp=NULL, transcript_ids=transcript_ids,
#' COSMIC=FALSE, local_cache_path=cache_path)
#'
#'\dontrun{
#' # full annotation tests
#'
#' test_datasets = c("hsapiens", "mmusculus", "cfamiliaris", "scerevisiae", "ggorilla")
#' test_releases = c("mar2017", "may2009", "may2012", "may2017")
#' for (d in 1:length(test_datasets))
#' for (r in 1:length(test_releases)) {
#' dataset = paste0(test_datasets[d], "_gene_ensembl")
#' host = paste0(test_releases[r], ".archive.ensembl.org")
#' mart = biomaRt::useMart("ENSEMBL_MART_ENSEMBL", dataset, host)
#' PrepareAnnotationEnsembl(mart, annotation_path, splice_matrix=FALSE, dbsnp=NULL, COSMIC=FALSE,
#' local_cache_path=file.path(annotation_path, "cache"))
#' }
#'
#'}
#'
PrepareAnnotationEnsembl <- function(mart, annotation_path, splice_matrix=FALSE,
dbsnp=NULL, transcript_ids=NULL, COSMIC=FALSE, local_cache_path=NULL,
ensembl_to_UCSC_genome_map = DEFAULT_ENSEMBL_UCSC_GENOME_MAP,
dbsnp_and_cosmic_only=FALSE, ...) {
old <- options(stringsAsFactors = FALSE)
on.exit(options(old), add = TRUE)
dataset <- mart@dataset
biomart <- mart@biomart
version <- sub("Ensembl (?:Genes )?(\\d+)", "\\1", listEnsembl(mart)[listEnsembl(mart)["biomart"]=="ensembl", 2])
host <- strsplit(strsplit(mart@host, ':')[[1]][2], '//')[[1]][2]
if (!dir.exists(annotation_path) && !dir.create(annotation_path, recursive=TRUE)) {
stop("error creating annotation_path: ", annotation_path)
}
if (!is.null(local_cache_path)) {
local_cache_path = qq("@{local_cache_path}/@{dataset}_@{version}")
}
if (!is.null(dbsnp) && nzchar(dbsnp)) {
genome = ensembl_to_UCSC_genome_map[[dataset]][[host]]
if (is.null(genome))
stop(qq("no UCSC dbkey in ensembl_to_UCSC_genome_map for genome @{dataset} and host @{host}"))
if (!is.null(local_cache_path))
dbsnp_cache_path = paste0(local_cache_path, "/", dbsnp)
else
dbsnp_cache_path = NULL
}
message("Prepare gene/transcript/protein id mapping information (ids.RData) ... ", appendLF=FALSE)
original_transcript_ids = transcript_ids
if(is.null(transcript_ids)){
transcript_ids <- read_or_update_local_cache(getBM(attributes=c("ensembl_transcript_id"), mart=mart)[,1],
local_cache_path, "transcript_ids")
}
external_gene_name = ifelse(version < 77, "external_gene_id", "external_gene_name")
attributes.id <- c("ensembl_gene_id", external_gene_name, "description")
idstab <- read_or_update_local_cache(getBM(attributes=attributes.id, mart=mart,
filters='ensembl_transcript_id', values=transcript_ids),
local_cache_path, "idstab")
stopifnot(nrow(idstab) > 0)
colnames(idstab) <- c("ensembl_gene_id", external_gene_name, "description")
idssum <- ddply(idstab, .(ensembl_gene_id), function(x) {
new.x <- x[1, ]
new.x$external_gene_name <- paste(x$external_gene_name, collapse=",")
new.x
})
attributes.tr <- c("ensembl_gene_id", "ensembl_transcript_id",
"ensembl_peptide_id")
tr <- read_or_update_local_cache(getBM(attributes=attributes.tr, mart=mart,
filters='ensembl_transcript_id', values=transcript_ids),
local_cache_path, "tr")
stopifnot(nrow(tr) > 0)
colnames(tr) <- c("ensembl_gene_id", "ensembl_transcript_id",
"ensembl_peptide_id")
ids <- merge(tr, idssum, by='ensembl_gene_id')
description <- paste(ids$external_gene_name, ids$description, sep='|')
ids <- cbind(ids[, 1:3], description)
colnames(ids) <- c('gene_name', 'tx_name', 'pro_name', 'description')
if (!dbsnp_and_cosmic_only) {
save(ids, file=paste(annotation_path, '/ids.RData', sep=''))
}
packageStartupMessage(" done")
message("Build TranscriptDB object (txdb.sqlite) ... ", appendLF=TRUE)
tr_coding <- subset(ids, pro_name != "")
tr_noncoding <- subset(ids, pro_name == "")
txdb<- read_or_update_local_cacheDb(makeTxDbFromBiomart(biomart=biomart, dataset=dataset, host=host,
transcript_ids=original_transcript_ids),
local_cache_path, "txdb")
#saveFeatures(txdb, file=paste(annotation_path,'/txdb.sqlite',sep=''))
saveDb(txdb, file=paste(annotation_path, '/txdb.sqlite', sep=''))
packageStartupMessage(" done")
#txdb_coding <- makeTranscriptDbFromBiomart_archive(biomart=biomart,
# dataset=dataset, host=host, path="/biomart/martservice", archive=FALSE,
# transcript_ids=tr_coding$tx_name)
#saveFeatures(txdb_coding, file=paste(annotation_path, '/txdb_coding.sqlite', sep=''))
#saveDb(txdb_coding, file=paste(annotation_path, '/txdb_coding.sqlite', sep=''))
#txdb_noncoding <- makeTranscriptDbFromBiomart_archive(biomart=biomart,
# dataset =dataset, host=host, path="/biomart/martservice",
# archive=FALSE, transcript_ids=tr_noncoding$tx_name)
#saveFeatures(txdb_noncoding, file=paste(annotation_path, '/txdb_noncoding.sqlite', sep=''))
#saveDb(txdb_noncoding, file=paste(annotation_path, '/txdb_noncoding.sqlite', sep=''))
transGrange <- transcripts(txdb)
transintxdb <- IRanges::as.data.frame(transGrange)[, c('tx_id', 'tx_name')]
message("Prepare exon annotation information (exon_anno.RData) ... ",
appendLF=FALSE)
attributes.exon <- c("ensembl_exon_id", "ensembl_peptide_id", 'ensembl_gene_id',
"chromosome_name", "start_position", "end_position", "exon_chrom_start",
"exon_chrom_end", "strand", "5_utr_start", "5_utr_end", "3_utr_start",
"3_utr_end", "cds_start", "cds_end", "rank", "ensembl_transcript_id")
exon <- read_or_update_local_cache(getBM(attributes=attributes.exon, mart=mart,
filters='ensembl_transcript_id', values=transcript_ids),
local_cache_path, "exon")
stopifnot(nrow(exon) > 0)
colnames(exon) <- attributes.exon
exon <- merge(exon, transintxdb, by.x="ensembl_transcript_id", by.y="tx_name")
colnames(exon) <- c("tx_name", "exon_name", "pro_name", "gene_name",
"chromosome_name", "start_position", "end_position", "exon_chrom_start",
"exon_chrom_end", "strand", "5_utr_start", "5_utr_end", "3_utr_start",
"3_utr_end", "cds_start", "cds_end", "rank", "tx_id")
cdsByTx <- cdsBy(txdb, "tx", use.names=FALSE)
cdss <- IRanges::as.data.frame(cdsByTx)
cds_chr_p <- data.frame(tx_id=cdss$group_name,
cds_chr_start=cdss$start,
cds_chr_end=cdss$end, rank=cdss$exon_rank)
cds_chr_p_coding <- subset(cds_chr_p, tx_id %in% exon[which(exon$pro_name != ''), 'tx_id'])
exon <- merge(exon, cds_chr_p_coding, by.y=c("tx_id", "rank"),
by.x=c("tx_id", "rank"), all.x=T)
###Ensembl use 1 & -1, change it to +/-
exon$strand <- unlist(lapply(exon$strand, function(x)
ifelse(x=='1', '+', '-')))
if (!dbsnp_and_cosmic_only) {
save(exon,file=paste(annotation_path, '/exon_anno.RData', sep=''))
}
packageStartupMessage(" done")
message("Prepare protein coding sequence (procodingseq.RData)... ", appendLF=FALSE)
attributes.codingseq <- c("coding", "ensembl_peptide_id",
"ensembl_transcript_id")
getCoding = function() {
if(length(tr_coding$pro_name)<1000){
getBM(attributes=attributes.codingseq, filters="ensembl_peptide_id",
values=tr_coding$pro_name, mart=mart)
}else{
index <- floor(length(tr_coding$pro_name)/1000)
coding <- c()
ptm <- proc.time()
for(i in 1:index) {
st <- (i-1)*1000+1
message(st)
ed <- i*1000
tmp <- getBM(attributes=attributes.codingseq, filters="ensembl_peptide_id",
values=tr_coding[st:ed, 'pro_name'], mart=mart)
coding <- rbind(coding, tmp)
message(round(object.size(coding) / (proc.time()-ptm)[[3]], 3), " B/s")
#print(i)
}
tmp <- getBM(attributes=attributes.codingseq, filters="ensembl_peptide_id",
values=tr_coding[ed+1:length(tr_coding$pro_name), 'pro_name'],
mart=mart)
coding <- rbind(coding, tmp)
}
}
coding <- read_or_update_local_cache(getCoding(), local_cache_path, "coding")
stopifnot(nrow(coding) > 0)
colnames(coding) <- attributes.codingseq
tx_id <- transintxdb[match(coding$ensembl_transcript_id,
transintxdb$tx_name), 'tx_id']
procodingseq <- cbind(coding, tx_id)
colnames(procodingseq) <- c("coding", "pro_name", "tx_name", "tx_id")
if (!dbsnp_and_cosmic_only) {
save(procodingseq,file=paste(annotation_path, '/procodingseq.RData', sep=''))
}
packageStartupMessage(" done")
message("Prepare protein sequence (proseq.RData) ... ", appendLF=FALSE)
attributes.proseq <- c("peptide", "ensembl_peptide_id", "ensembl_transcript_id")
getProteinseq = function() {
if(length(tr_coding$pro_name)<1000){
getBM(attributes=attributes.proseq, filters="ensembl_peptide_id",
values=tr_coding$pro_name, mart=mart)
}else{
index <- floor(length(tr_coding$pro_name)/1000)
proteinseq <- c()
for(i in 1:index) {
st <- (i-1)*1000+1
message(st)
ed <- i*1000
tmp <- getBM(attributes=attributes.proseq, filters="ensembl_peptide_id",
values= tr_coding[st:ed, 'pro_name'], mart=mart)
proteinseq <- rbind(proteinseq, tmp)
#print(i)
}
tmp <- getBM(attributes=attributes.proseq, filters="ensembl_peptide_id",
values=tr_coding[ed+1:length(tr_coding$pro_name), 'pro_name'],
mart=mart)
proteinseq <- rbind(proteinseq, tmp)
}
}
proteinseq <- read_or_update_local_cache(getProteinseq(), local_cache_path, "proteinseq")
stopifnot(nrow(proteinseq) > 0)
colnames(proteinseq) <- c("peptide", "pro_name", "tx_name")
if (!dbsnp_and_cosmic_only) {
save(proteinseq, file=paste(annotation_path, '/proseq.RData', sep=''))
}
packageStartupMessage(" done")
if (!is.null(dbsnp) && nzchar(dbsnp)) {
message("Prepare dbSNP information (dbsnpinCoding.RData) ... ", appendLF=FALSE)
getSnpTable = function(genome, transGrange) {
# can't easily query UCSC MySQL with Ensembl transcripts, so always download full coding dbSNP file
dbSnpFile = qq("@{dbsnp}CodingDbSnp.txt.gz")
dbSnpURL = qq("ftp://hgdownload.soe.ucsc.edu/goldenPath/@{genome}/database/@{dbSnpFile}")
download_error = function(e) {stop(qq("error downloading @{dbSnpURL}; either @{dbsnp} is not available for @{dataset} or UCSC's website is down"))}
if (!is.null(local_cache_path) && length(transcript_ids) > 1000)
dbSnpFile = file.path(local_cache_path, dbSnpFile)
if (!file.exists(dbSnpFile))
tryCatch({download.file(dbSnpURL, dbSnpFile, quiet=T, mode='wb')}, error=download_error, warning=download_error)
snpCodingTab = .temp_unzip(dbSnpFile, data.table::fread, showProgress=FALSE,
select=c(2:5, 8, 10), sep="\t",
col.names=c("chrom", "chromStart", "chromEnd", "name",
"alleleCount", "alleles"))
snpCodingTab$chrom <- gsub('chr', '', snpCodingTab$chrom)
snpCodingTab <- unique(snpCodingTab)
snpCoding <- GRanges(seqnames=snpCodingTab$chrom,
ranges=IRanges(start=snpCodingTab$chromStart,
end=snpCodingTab$chromEnd), strand='*',
rsid=snpCodingTab$name, alleleCount=snpCodingTab$alleleCount,
alleles=snpCodingTab$alleles)
return(subsetByOverlaps(snpCoding, transGrange))
}
dbsnpinCoding = read_or_update_local_cache(getSnpTable(genome, transGrange),
dbsnp_cache_path, "dbsnpinCoding")
# snpCodingTab$chrom <- gsub('chr', '', snpCodingTab$chrom)
# chrlist <- paste(c(seq(1:22),'X','Y'))
# snpCoding <- subset(snpCodingTab, chrom %in% chrlist ,select=c(chrom:name, alleleCount, alleles))
# snpCoding <- unique(snpCoding)
# #snpCoding$chrom <- gsub('chrM', 'MT', snpCoding$chrom)
# #
#
# #save(snpCoding,file=paste(annotation_path,'/snpcoding.RData',sep=''))
# snpCoding <- GRanges(seqnames=snpCoding$chrom,
# ranges=IRanges(start=snpCoding$chromStart,
# end=snpCoding$chromEnd), strand='*',
# rsid=snpCoding$name, alleleCount=snpCoding$alleleCount,
# alleles=snpCoding$alleles)
#
# #seqlevels(snpCoding)
#
# #if(TRUE%in% grep('chr',seqlevels(snpCoding)) > 0 ) {
# # rchar <- sub('chr','',seqlevels(snpCoding))
# # names(rchar) <- seqlevels(snpCoding)
# # snpCoding <- renameSeqlevels(snpCoding, rchar) }
# #if('M'%in%seqlevels(snpCoding)) snpCoding <- renameSeqlevels(snpCoding, c( M='MT'))
# #chrlist <- paste(c(seq(1:22),'X','Y'))
# transGrange_snp <- transGrange
# #transGrange_snp <- keepSeqlevels(transGrange_snp, snpCoding)
# #snpCoding <- keepSeqlevels(snpCoding, transGrange_snp)
#
# #snpCoding <- keepSeqlevels(snpCoding, transGrange)
#
# dbsnpinCoding <- subsetByOverlaps(snpCoding,transGrange_snp)
save(dbsnpinCoding,file=paste(annotation_path, '/dbsnpinCoding.RData', sep=''))
packageStartupMessage(" done")
}
if (COSMIC) {
message("Prepare COSMIC information (cosmic.RData) ... ", appendLF=FALSE)
getCOSMIC = function() {
# hsapiens_snp_som.default.snp.refsnp_id
# hsapiens_snp_som.default.snp.chr_name
# hsapiens_snp_som.default.snp.chrom_start
# hsapiens_snp_som.default.snp.chrom_end
#FILTERS=hsapiens_snp_som.default.filters.variation_source."COSMIC"
varmart = useMart("ENSEMBL_MART_SNP", host=host, dataset="hsapiens_snp_som")
cosmicAttributes = c("refsnp_id", "chr_name", "chrom_start")
if (length(unique(exon$gene_name)) < 500) {
cosmicTab = getBM(attributes=cosmicAttributes,
filters=c("variation_source", "ensembl_gene"),
values=list("COSMIC", unique(exon$gene_name)),
mart=varmart)
} else {
cosmicTab = getBM(attributes=cosmicAttributes,
filters="variation_source",
values="COSMIC",
mart=varmart)
}
chrlist <- paste(c(seq(1:22),'X','Y','MT'))
cosmicTab <- subset(cosmicTab, chr_name %in% chrlist)
cosmic <- GRanges(seqnames=cosmicTab$chr_name,
ranges=IRanges(start=cosmicTab$chrom_start,
end=cosmicTab$chrom_start), strand = '*',
cosid=cosmicTab$refsnp_id)
transGrange_cosmic <- transGrange
#transGrange_cosmic <- keepSeqlevels(transGrange_cosmic, cosmic)
#cosmic <- keepSeqlevels(cosmic, transGrange_cosmic)
cosmic <- subsetByOverlaps(cosmic, transGrange_cosmic)
cosmic
}
cosmic <- read_or_update_local_cache(getCOSMIC(), local_cache_path, "cosmic")
save(cosmic, file=paste(annotation_path, '/cosmic.RData', sep=''))
packageStartupMessage(" done")
}
if(!dbsnp_and_cosmic_only && splice_matrix){
message("Prepare exon splice information (splicemax.RData) ... ",
appendLF=FALSE)
exonByTx <- exonsBy(txdb, "tx", use.names=F)
index <- which(elementNROWS(exonByTx)==1)
exonByTx_mul <- exonByTx[-index]
exons_mul <- IRanges::as.data.frame(exonByTx_mul)
exonslist <- split(exons_mul, exons_mul$group)
splicemax_list <- lapply(exonslist, .gen_splicmatrix)
splicemax <- do.call(rbind, splicemax_list)
save(splicemax, file=paste(annotation_path, '/splicemax.RData', sep=''))
packageStartupMessage(" done")
}
#splice <- paste(splicemax[, 1], splicemax[, 2], sep='-')
}
###########################################################################################
#generate splicing matrix
###########################################################
.gen_splicmatrix <- function(x,
...) {
mystrand=x[1,'strand']
a=x[,'exon_rank']
b=x[,'exon_id']
n=length(a)
if (mystrand=='+'){
tmp=order(a)
}else{
tmp=order(a,decreasing=T)
}
mm=cbind(b[tmp[1:(n-1)]], b[tmp[2:n]])
mm
}
# convenient data structure for mapping Ensembl genome and archive hostname to a UCSC dbkey;
# only needed for dbSNP genomes (human and mouse currently); users can override this mapping in case a new
# Ensembl archive hostname or new genome assembly becomes available
DEFAULT_ENSEMBL_UCSC_GENOME_MAP = list("hsapiens_gene_ensembl" = list("may2009.archive.ensembl.org" = "hg18",
"may2012.archive.ensembl.org" = "hg19",
"dec2013.archive.ensembl.org" = "hg19",
"feb2014.archive.ensembl.org" = "hg19",
"aug2014.archive.ensembl.org" = "hg38",
"oct2014.archive.ensembl.org" = "hg38",
"dec2014.archive.ensembl.org" = "hg38",
"mar2015.archive.ensembl.org" = "hg38",
"may2015.archive.ensembl.org" = "hg38",
"jul2015.archive.ensembl.org" = "hg38",
"sep2015.archive.ensembl.org" = "hg38",
"dec2015.archive.ensembl.org" = "hg38",
"mar2016.archive.ensembl.org" = "hg38",
"jul2016.archive.ensembl.org" = "hg38",
"oct2016.archive.ensembl.org" = "hg38",
"dec2016.archive.ensembl.org" = "hg38",
"mar2017.archive.ensembl.org" = "hg38",
"may2017.archive.ensembl.org" = "hg38",
"www.ensembl.org" = "hg38"),
"mmusculus_gene_ensembl" = list("may2009.archive.ensembl.org" = "mm9",
"may2012.archive.ensembl.org" = "mm9",
"dec2013.archive.ensembl.org" = "mm10",
"feb2014.archive.ensembl.org" = "mm10",
"aug2014.archive.ensembl.org" = "mm10",
"oct2014.archive.ensembl.org" = "mm10",
"dec2014.archive.ensembl.org" = "mm10",
"mar2015.archive.ensembl.org" = "mm10",
"may2015.archive.ensembl.org" = "mm10",
"jul2015.archive.ensembl.org" = "mm10",
"sep2015.archive.ensembl.org" = "mm10",
"dec2015.archive.ensembl.org" = "mm10",
"mar2016.archive.ensembl.org" = "mm10",
"jul2016.archive.ensembl.org" = "mm10",
"oct2016.archive.ensembl.org" = "mm10",
"dec2016.archive.ensembl.org" = "mm10",
"mar2017.archive.ensembl.org" = "mm10",
"may2017.archive.ensembl.org" = "mm10",
"www.ensembl.org" = "mm10"))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.