R/downloadDB.R

Defines functions downloadDB

Documented in downloadDB

#' @title Download database files from public genome database server
#' 
#' @description This function should be excuted before running annotation functions.
#'              By this function, user can download genome database file from UCSC and ENSEMBL database.
#'              User can download 5 types of human-related database and 4 types of chicken-related database.
#'              Function connects ucsc server to download repeats, CpG island and annotated gap database file.
#'              Plus, ENSEMBL data fils are also downloaded to annotate about transcription start site and genomic site.
#' 
#' @usage downloadDB(dbtype = c('Repeat', 'CpG', 'Gap'), organism = 'hg19', 
#'                   dbPath = paste0(.libPaths()[1], '/IRFinder/extdata'))
#' 
#' @param dbtype an array or vector. Avaliable values are 'Repeat', 'CpG' and 'Gap'.
#'               \strong{Repeat} : Repeat database file made by Repeatmasker.
#'               \strong{CpG} : It means CpG islands.
#'               \strong{Gap} : This data is about annotated gaps. By this data, Annotation regions about centromere, 
#'                               telomere and heterochromatin is possible. That is provided by UCSC genome browser.
#' @param organism a single character. This function serves 3 versions of organisms such as hg19, hg38 (Human)
#'                 and galGal6 (Chicken).
#' @param dbPath a string vector. Path for saving database files.
#'
#' @keywords genome database
#'
#' @export

downloadDB = function(dbtype = c('Repeat', 'CpG', 'Gap'), organism = 'hg19', dbPath = paste0(.libPaths()[1], '/IRFinder/extdata')){
  ### 00. Load packages
  library(utils); library(stringr); library(biomaRt)

  cat('---------- Download files from UCSC genome browser / ENSEMBL BioMart ----------\n')
  cat(paste0('Start time : ', date(), '\n'))

  if(length(which(c('Repeat', 'CpG', 'Gap') %in% dbtype)) == 0){
    return(cat("[ERROR] You can download 3 types of data such as repeats, CpG and annotated gap data. ( Input : ", paste(dbtype, collapse = ', '), ")\n",
                '---------- Donwload process is halted. ----------\nFinish time : ', date(), '\n'))
  } else {}
  
  if(length(setdiff(dbtype, c('Repeat', 'CpG', 'Gap'))) > 0){
    return(cat("[ERROR] You can download only 3 types of data such as repeats, CpG and annotated gap data. ( Input : ", paste(dbtype, collapse = ', '), ")\n",
               '---------- Donwload process is halted. ----------\nFinish time : ', date(), '\n'))
  } else {}

  if(length(which(organism %in% c('hg19', 'hg38', 'galGal6'))) == 0){
    return(cat("[ERROR] You can download hg19, hg38 and galGal6 data only. ( Input : ", paste(organism, collapse = ','), ")\n",
           '---------- Donwload process is halted. ----------\nFinish time : ', date(), '\n'))
  } else {}

  ### 01. Find package path
  cat(paste0('Data files path : ', dbPath, '\n(If you want to continue, enter yes(y).)\n'))
  check = readline("")
  if(check == 'y' || check == 'yes'){} else {
    return(cat('---------- Donwload process is halted. ----------\nFinish time : ', date(), '\n'))
  }
  
  ### 02. Organism & dbtype
  if(organism == 'hg19'){
    cat(paste0('Organism : Human (GRCh37/hg19)\n'))
  } else if(organism == 'hg38'){
    cat(paste0('Organism : Human (GRCh38/hg38)\n'))
  } else if(organism == 'galGal6') {
    cat(paste0('Organism : Chicken (GRCg6a/galGal6)\n'))
  } else {}

  cat(paste0('---------- Download file(s) from UCSC genome browser : ', paste(dbtype, collapse = ', '), '  ----------\n'))

  db_list= c('rmsk.txt.gz', 'cpgIslandExtUnmasked.txt.gz', 'microsat.txt.gz', 'gap.txt.gz')

  db_file = switch(paste(which(c('Repeat', 'CpG', 'Gap') %in% dbtype), collapse = ','),
                   '1' = db_list[c(1,3)],
                   '2' = db_list[2],
                   '3' = db_list[4],
                   '1,2' = db_list[c(1:3)],
                   '1,3' = db_list[c(1,3,4)],
                   '2,3' = db_list[c(2,4)],
                   '1,2,3' = db_list[c(1:4)])

  ### 03. Download files (CpG, Repeat)
  ucsc_url = paste0("ftp://hgdownload.soe.ucsc.edu/goldenPath/", organism, '/database/', db_file)

  for(x in 1:length(ucsc_url)){
    download.file(url = ucsc_url[x], destfile = paste0(dbPath, '/', db_file[x]))
  }
  cat('Done.\n')

  ### 04. Download Ensembl data
  cat("---------- Download data from ensembl BioMart ----------\n")
  cat("Connect to ensembl BioMart ftp server.\n")
  if(organism == 'hg19'){
    ensembl = useMart(biomart="ENSEMBL_MART_ENSEMBL", host = "grch37.ensembl.org",
                      path = "/biomart/martservice", dataset = "hsapiens_gene_ensembl")
    hgene_set = useDataset(dataset = "hsapiens_gene_ensembl", mart = ensembl)
  } else if(organism == 'hg38'){
    ensembl = useMart("ensembl")
    hgene_set = useDataset(dataset = "hsapiens_gene_ensembl", mart = ensembl)
  } else if(organism == 'galGal6'){
    ensembl = useMart("ensembl")
    hgene_set = useDataset(dataset = "ggallus_gene_ensembl", mart = ensembl)
  } else {}
    cat("Make an ensembl gene table.\n")
    gene_anno_set = getBM(attributes = c('hgnc_symbol', 'ensembl_gene_id', 'ensembl_transcript_id',
                                         'refseq_mrna', 'refseq_ncrna',
                                         'chromosome_name', 'start_position', 'end_position', 'strand',
                                         'transcript_start', 'transcript_end', 'transcription_start_site',
                                         'percentage_gene_gc_content', 'description'), mart = hgene_set)
    gene_set = subset(gene_anno_set, str_detect(gene_anno_set$chromosome_name, '_') == FALSE)
    gene_set = subset(gene_set, str_detect(gene_set$chromosome_name, '[.]') == FALSE)
    gene_set = subset(gene_set, gene_set$hgnc_symbol != '')
    cat('Done.\n')
    cat('Save a gene table to package directory.\n')
    write.table(gene_set, file = paste0(dbPath, '/Ensembl_gene_', organism, '.txt'),
                append = FALSE, quote = FALSE, sep = "\t", na = "NA", row.names = FALSE,
                col.names = TRUE)
    Sys.sleep(1)

  cat('---------- Donwload process is finished. ----------\n')
  cat(paste0('Finish time : ', date(), '\n'))
}
bioinfo16/IRFinder documentation built on Aug. 19, 2019, 10:37 a.m.