#' @title Download database files from public genome database server
#'
#' @description This function should be excuted before running annotation functions.
#' By this function, user can download genome database file from UCSC and ENSEMBL database.
#' User can download 5 types of human-related database and 4 types of chicken-related database.
#' Function connects ucsc server to download repeats, CpG island and annotated gap database file.
#' Plus, ENSEMBL data fils are also downloaded to annotate about transcription start site and genomic site.
#'
#' @usage downloadDB(dbtype = c('Gene', 'Repeat', 'CpG', 'Gap'), organism = 'hg19',
#' dbPath = paste0(.libPaths()[1], '/VVIPS/extdata'))
#'
#' @param dbtype an array or vector. Avaliable values are 'Gene', 'Repeat', 'CpG' and 'Gap'.
#' \itemize{
#' \item{"Gene"}{Gene database includes transcription start sites.}
#' \item{"Repeat"}{Repeat database file made by Repeatmasker.}
#' \item{"CpG"}{CpG database includes the number of CpG site and GC ratio}
#' \item{"Gap"}{This data is about annotated gaps. By this data, Annotation regions about centromere,
#' telomere and heterochromatin is possible. That is provided by UCSC genome browser.}
#' }
#' @param organism a single character. This function serves 3 versions of organisms such as hg19, hg38 (Human)
#' and galGal6 (Chicken).
#' @param dbPath a string vector. Path for saving database files.
#'
#' @keywords genome database
#'
#' @export
downloadDB = function(dbtype = c('Gene', 'Repeat', 'CpG', 'Gap'), organism = 'hg19', dbPath = paste0(.libPaths()[1], '/VVIPS/extdata')){
### 00. Load packages
require(utils); require(stringr); require(biomaRt)
cat('---------- Download files from UCSC genome browser / ENSEMBL BioMart ----------\n')
cat(paste0('Start time : ', date(), '\n'))
if(length(which(c('Gene', 'Repeat', 'CpG', 'Gap') %in% dbtype)) == 0){
return(cat("[ERROR] You can download 3 types of data such as genes, repeats, CpGs and annotated gaps data. ( Input : ", paste(dbtype, collapse = ', '), ")\n",
'---------- Download process is halted. ----------\nFinish time : ', date(), '\n'))
} else {}
if(length(setdiff(dbtype, c('Gene', 'Repeat', 'CpG', 'Gap'))) > 0){
return(cat("[ERROR] You can download only 3 types of data such as genes, repeats, CpGs and annotated gaps data. ( Input : ", paste(dbtype, collapse = ', '), ")\n",
'---------- Download process is halted. ----------\nFinish time : ', date(), '\n'))
} else {}
if(length(which(organism %in% c('hg19', 'hg38', 'galGal6'))) == 0){
return(cat("[ERROR] You can download hg19, hg38 and galGal6 data only. ( Input : ", paste(organism, collapse = ','), ")\n",
'---------- Download process is halted. ----------\nFinish time : ', date(), '\n'))
} else {}
### 01. Find package path
cat(paste0('Data files path : ', dbPath, '\n(If you want to continue, enter yes(Y/y).)\n'))
check = readline("")
if(check == 'y'){} else if(check == 'yes'){} else if(check == 'Y'){} else {
return(cat('---------- Donwload process is halted. ----------\nFinish time : ', date(), '\n'))
}
if(str_ends(dbPath, pattern = '/')){
dbPath = word(dbPath, start = 1, end = nchar(dbPath), sep = '')
} else {}
### 02. Organism & dbtype
if(organism == 'hg19'){
cat(paste0('Organism : Human (GRCh37/hg19)\n'))
} else if(organism == 'hg38'){
cat(paste0('Organism : Human (GRCh38/hg38)\n'))
} else if(organism == 'galGal6') {
cat(paste0('Organism : Chicken (GRCg6a/galGal6)\n'))
} else {}
if(length(which(c('Repeat', 'CpG', 'Gap') %in% dbtype)) != 0){
cat(paste0('---------- Download file(s) from UCSC genome browser : ', paste(dbtype, collapse = ', '), ' ----------\n'))
db_list= c('rmsk.txt.gz', 'cpgIslandExtUnmasked.txt.gz', 'microsat.txt.gz', 'gap.txt.gz')
db_file = switch(paste(which(c('Repeat', 'CpG', 'Gap') %in% dbtype), collapse = ','),
'1' = db_list[c(1,3)],
'2' = db_list[2],
'3' = db_list[4],
'1,2' = db_list[c(1:3)],
'1,3' = db_list[c(1,3,4)],
'2,3' = db_list[c(2,4)],
'1,2,3' = db_list[c(1:4)])
### 03. Download files (Repeat, CpG, Gap)
ucsc_url = paste0("ftp://hgdownload.soe.ucsc.edu/goldenPath/", organism, '/database/', db_file)
for(x in 1:length(ucsc_url)){
download.file(url = ucsc_url[x], destfile = paste0(dbPath, '/', db_file[x]))
}
cat('Done.\n')
} else {}
if('Gene' %in% dbtype){
### 04. Download Ensembl data
cat("---------- Download data from ensembl BioMart ----------\n")
cat("Connect to ensembl BioMart ftp server.\n")
if(organism == 'hg19'){
ensembl = useMart(biomart="ENSEMBL_MART_ENSEMBL", host = "grch37.ensembl.org",
path = "/biomart/martservice", dataset = "hsapiens_gene_ensembl")
hgene_set = useDataset(dataset = "hsapiens_gene_ensembl", mart = ensembl)
} else if(organism == 'hg38'){
ensembl = useMart("ensembl")
hgene_set = useDataset(dataset = "hsapiens_gene_ensembl", mart = ensembl)
} else if(organism == 'galGal6'){
ensembl = useMart("ensembl")
hgene_set = useDataset(dataset = "ggallus_gene_ensembl", mart = ensembl)
} else {}
cat("Make an ensembl gene table.\n")
gene_anno_set = getBM(attributes = c('hgnc_symbol', 'ensembl_gene_id', 'ensembl_transcript_id',
'refseq_mrna', 'refseq_ncrna',
'chromosome_name', 'start_position', 'end_position', 'strand',
'transcript_start', 'transcript_end', 'transcription_start_site',
'percentage_gene_gc_content', 'description'), mart = hgene_set)
gene_set = subset(gene_anno_set, str_detect(gene_anno_set$chromosome_name, '_') == FALSE)
gene_set = subset(gene_set, str_detect(gene_set$chromosome_name, '[.]') == FALSE)
gene_set = subset(gene_set, gene_set$hgnc_symbol != '')
cat('Done.\n')
cat('Save a gene table to package directory.\n')
write.table(gene_set, file = paste0(dbPath, '/Ensembl_gene_', organism, '.txt'),
append = FALSE, quote = FALSE, sep = "\t", na = "NA", row.names = FALSE,
col.names = TRUE)
} else {}
cat('---------- Donwload process is finished. ----------\n')
cat(paste0('Finish time : ', date(), '\n'))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.