R/downloadDB.R

Defines functions downloadDB

Documented in downloadDB

#' @title Download database files from public genome database server
#' 
#' @description This function should be excuted before running annotation functions.
#'              By this function, user can download genome database file from UCSC and ENSEMBL database.
#'              User can download 5 types of human-related database and 4 types of chicken-related database.
#'              Function connects ucsc server to download repeats, CpG island and annotated gap database file.
#'              Plus, ENSEMBL data fils are also downloaded to annotate about transcription start site and genomic site.
#' 
#' @usage downloadDB(dbtype = c('Gene', 'Repeat', 'CpG', 'Gap'), organism = 'hg19', 
#'                   dbPath = paste0(.libPaths()[1], '/VVIPS/extdata'))
#' 
#' @param dbtype an array or vector. Avaliable values are 'Gene', 'Repeat', 'CpG' and 'Gap'.
#' \itemize{
#'  \item{"Gene"}{Gene database includes transcription start sites.}
#'  \item{"Repeat"}{Repeat database file made by Repeatmasker.}
#'  \item{"CpG"}{CpG database includes the number of CpG site and GC ratio}
#'  \item{"Gap"}{This data is about annotated gaps. By this data, Annotation regions about centromere, 
#'               telomere and heterochromatin is possible. That is provided by UCSC genome browser.} 
#' }
#' @param organism a single character. This function serves 3 versions of organisms such as hg19, hg38 (Human)
#'                 and galGal6 (Chicken).
#' @param dbPath a string vector. Path for saving database files.
#'
#' @keywords genome database
#'
#' @export

downloadDB = function(dbtype = c('Gene', 'Repeat', 'CpG', 'Gap'), organism = 'hg19', dbPath = paste0(.libPaths()[1], '/VVIPS/extdata')){
  ### 00. Load packages
  require(utils); require(stringr); require(biomaRt)
  
  cat('---------- Download files from UCSC genome browser / ENSEMBL BioMart ----------\n')
  cat(paste0('Start time : ', date(), '\n'))
  
  if(length(which(c('Gene', 'Repeat', 'CpG', 'Gap') %in% dbtype)) == 0){
    return(cat("[ERROR] You can download 3 types of data such as genes, repeats, CpGs and annotated gaps data. ( Input : ", paste(dbtype, collapse = ', '), ")\n",
               '---------- Download process is halted. ----------\nFinish time : ', date(), '\n'))
  } else {}
  
  if(length(setdiff(dbtype, c('Gene', 'Repeat', 'CpG', 'Gap'))) > 0){
    return(cat("[ERROR] You can download only 3 types of data such as genes, repeats, CpGs and annotated gaps data. ( Input : ", paste(dbtype, collapse = ', '), ")\n",
               '---------- Download process is halted. ----------\nFinish time : ', date(), '\n'))
  } else {}
  
  if(length(which(organism %in% c('hg19', 'hg38', 'galGal6'))) == 0){
    return(cat("[ERROR] You can download hg19, hg38 and galGal6 data only. ( Input : ", paste(organism, collapse = ','), ")\n",
               '---------- Download process is halted. ----------\nFinish time : ', date(), '\n'))
  } else {}
  
  ### 01. Find package path
  cat(paste0('Data files path : ', dbPath, '\n(If you want to continue, enter yes(Y/y).)\n'))
  check = readline("")
  if(check == 'y'){} else if(check == 'yes'){} else if(check == 'Y'){} else {
    return(cat('---------- Donwload process is halted. ----------\nFinish time : ', date(), '\n'))
  }
  
  if(str_ends(dbPath, pattern = '/')){
    dbPath = word(dbPath, start = 1, end = nchar(dbPath), sep = '')
  } else {}
  
  ### 02. Organism & dbtype
  if(organism == 'hg19'){
    cat(paste0('Organism : Human (GRCh37/hg19)\n'))
  } else if(organism == 'hg38'){
    cat(paste0('Organism : Human (GRCh38/hg38)\n'))
  } else if(organism == 'galGal6') {
    cat(paste0('Organism : Chicken (GRCg6a/galGal6)\n'))
  } else {}
  
  if(length(which(c('Repeat', 'CpG', 'Gap') %in% dbtype)) != 0){
    cat(paste0('---------- Download file(s) from UCSC genome browser : ', paste(dbtype, collapse = ', '), '  ----------\n'))
    
    db_list= c('rmsk.txt.gz', 'cpgIslandExtUnmasked.txt.gz', 'microsat.txt.gz', 'gap.txt.gz')
    
    db_file = switch(paste(which(c('Repeat', 'CpG', 'Gap') %in% dbtype), collapse = ','),
                     '1' = db_list[c(1,3)],
                     '2' = db_list[2],
                     '3' = db_list[4],
                     '1,2' = db_list[c(1:3)],
                     '1,3' = db_list[c(1,3,4)],
                     '2,3' = db_list[c(2,4)],
                     '1,2,3' = db_list[c(1:4)])
    
    ### 03. Download files (Repeat, CpG, Gap)
    ucsc_url = paste0("ftp://hgdownload.soe.ucsc.edu/goldenPath/", organism, '/database/', db_file)
    
    for(x in 1:length(ucsc_url)){
      download.file(url = ucsc_url[x], destfile = paste0(dbPath, '/', db_file[x]))
    }
    cat('Done.\n')
  } else {}
  
  
  if('Gene' %in% dbtype){
    ### 04. Download Ensembl data
    cat("---------- Download data from ensembl BioMart ----------\n")
    cat("Connect to ensembl BioMart ftp server.\n")
    if(organism == 'hg19'){
      ensembl = useMart(biomart="ENSEMBL_MART_ENSEMBL", host = "grch37.ensembl.org",
                        path = "/biomart/martservice", dataset = "hsapiens_gene_ensembl")
      hgene_set = useDataset(dataset = "hsapiens_gene_ensembl", mart = ensembl)
    } else if(organism == 'hg38'){
      ensembl = useMart("ensembl")
      hgene_set = useDataset(dataset = "hsapiens_gene_ensembl", mart = ensembl)
    } else if(organism == 'galGal6'){
      ensembl = useMart("ensembl")
      hgene_set = useDataset(dataset = "ggallus_gene_ensembl", mart = ensembl)
    } else {}
    cat("Make an ensembl gene table.\n")
    gene_anno_set = getBM(attributes = c('hgnc_symbol', 'ensembl_gene_id', 'ensembl_transcript_id',
                                         'refseq_mrna', 'refseq_ncrna',
                                         'chromosome_name', 'start_position', 'end_position', 'strand',
                                         'transcript_start', 'transcript_end', 'transcription_start_site',
                                         'percentage_gene_gc_content', 'description'), mart = hgene_set)
    gene_set = subset(gene_anno_set, str_detect(gene_anno_set$chromosome_name, '_') == FALSE)
    gene_set = subset(gene_set, str_detect(gene_set$chromosome_name, '[.]') == FALSE)
    gene_set = subset(gene_set, gene_set$hgnc_symbol != '')
    cat('Done.\n')
    cat('Save a gene table to package directory.\n')
    write.table(gene_set, file = paste0(dbPath, '/Ensembl_gene_', organism, '.txt'),
                append = FALSE, quote = FALSE, sep = "\t", na = "NA", row.names = FALSE,
                col.names = TRUE)
  } else {}
  
  cat('---------- Donwload process is finished. ----------\n')
  cat(paste0('Finish time : ', date(), '\n'))
}
bioinfo16/VVIPS documentation built on Nov. 4, 2019, 7:20 a.m.