R/buildReference.r

Defines functions build_viruSite_reference build_Ensembl_reference build_IgBlast_reference build_IMGT_reference

Documented in build_Ensembl_reference build_IgBlast_reference build_IMGT_reference build_viruSite_reference

#' @include utils.r download.r html.r
NULL

#' Build IMGT database reference
#'
#' Download reference sequences from the IMGT (the international ImMunoGeneTics information system, 
#' http://www.imgt.org) database and split the sequences by species.
#'
#' @param outdir character. Default \code{getwd()}
#' @param method character. Method to be used for downloading files, equal to \code{download.file}. Default 'libcurl'
#' @param verbose logical. Default TRUE
#'
#' @importFrom stats setNames
#' @importFrom Biostrings readBStringSet chartr writeXStringSet
#'
#' @return if success, return TRUE
#' @export
#'
#' @examples
#' \donttest{build_IMGT_reference('IMGT_ref')}
#'
build_IMGT_reference = function(outdir = NULL, method = NULL, verbose = TRUE) {

  # check parameter
  outdir = as.character(outdir %|||% getwd())
  method = as.character(method %|||% 'libcurl')
  if(verbose) cat('-->', timer(), '1. Build VDJ reference from IMGT website in:', outdir, '<--\n')
  dir.create(outdir, FALSE, TRUE)

  # catch
  species_web = 'TrustVDJ_species.html'
  species_fa  = 'TrustVDJ_imgt.fa'
  species_web_file = paste0(outdir, '/', species_web)
  species_fa_file  = paste0(outdir, '/', species_fa)
  URLs = paste0('http://www.imgt.org/download/', c('V-QUEST/IMGT_V-QUEST_reference_directory',
                  'GENE-DB/IMGTGENEDB-ReferenceSequences.fasta-nt-WithGaps-F+ORF+inframeP'))
  suppressWarnings(file.remove(c(species_web_file, species_fa_file)))
  Download(URLs, c(species_web, species_fa), outdir = outdir, method = method, verbose = verbose)

  # process html
  species = grep('/', sub('/$', '', unique(
    grep('\\w/$', htmlHref(species_web_file, 'body section table a'), value = TRUE) )), invert = TRUE, value = TRUE)

  # read fa
  fa      = Biostrings::readBStringSet(species_fa_file)
  fa_name = strsplit(names(fa), split = '\\|')

  # extract by species
  species_ok = unlist(lapply(sort(unique(species)), function(sp) {

    # only TCR/BCR
    fa_sp = fa[sapply(fa_name, function(nm) grepl(sp, gsub(' ', '_', nm[3])) & grepl('^IG|^TR', nm[2]) )]
    if(length(fa_sp)) {

      # process name
      if(verbose) cat('-->', timer(), 'process fa for:', sp, '<--\n')
      names(fa_sp) = pick(names(fa_sp), 2)
      
      # combine same name 
      fa_sp = do.call(c, lapply(unique(names(fa_sp)), function(nm) 
        stats::setNames(Biostrings::BStringSet(unlist(fa_sp[names(fa_sp) %in% nm])), nm) ))
        
      # case conversion
      fa_sp = Biostrings::chartr('atcgn', 'ATCGN', fa_sp)

      # save fa
      Biostrings::writeXStringSet(fa_sp, paste0(outdir, '/IMGT_', sp, '.fa'))
      if(verbose) cat('-->', timer(), 'saved fa in:', sp, '<--\n')

      # return
      return(sp)

    } else if(verbose) warning('--! ', timer(), ' no fa content in: ', sp, ' !--', call. = FALSE)

    # return
    NULL
  }))

  # save log
  writeLines(c(timer(), species_ok), paste0(outdir, '/species.available.txt'))
  if(verbose) cat('-->', timer(), 'build IMGT reference done <--\n')

  # return
  file.remove(c(species_web_file, species_fa_file))
  TRUE
}

#' Build NCBI-Igblast database reference
#'
#' Download Igblast reference files from NCBI.
#'
#' @param outdir character. Default \code{getwd()}
#' @param method character. Method to be used for downloading files, equal to download.file. Default 'curl'
#' @param verbose logical. Default TRUE
#'
#' @importFrom utils untar
#'
#' @return if success, return TRUE
#' @export
#'
#' @examples
#' \donttest{build_IgBlast_reference('IgBlast_ref')}
#' 
build_IgBlast_reference = function(outdir = NULL, method = NULL, verbose = TRUE) {
  
  # check parameter
  outdir = as.character(outdir %|||% getwd())
  method = as.character(method %|||% 'curl')
  if(verbose) cat('-->', timer(), '1. Build IgBlast reference from NCBI website in:', outdir, '<--\n')
  dir.create(outdir, FALSE, TRUE)
  
  # catch
  names = c('mouse_gl_VDJ.tar', 'ncbi_human_c_genes.tar', 'rhesus_monkey_VJ.tar')
  files = paste0(outdir, '/', names)
  URLs  = paste0('ftp://ftp.ncbi.nih.gov/blast/executables/igblast/release/database/', names)
  suppressWarnings(file.remove(files))
  Download(URLs, names, outdir = outdir, method = method, verbose = verbose)
  
  # untar 
  lapply(files, function(file) utils::untar(file, exdir = outdir))
  if(verbose) cat('-->', timer(), 'build IgBlast reference done <--\n') 
 
  # return
  file.remove(files)
  TRUE
}

#' Build Ensembl database reference
#'
#' Download latest reference files (\code{.fa} and \code{.gtf}) from Ensembl database 
#' and split the files by species.
#'
#' @param outdir character. Default \code{getwd()}
#' @param species character. Default \code{'ALL'}
#' @param method character. Method to be used for downloading html files, equal to \code{download.file}. Default 'libcurl'
#' @param ftp_method character. Method to be used for downloading ftp files (.fa and .gtf), equal to \code{download.file}. Default 'curl'
#' @param verbose logical. Default \code{TRUE}
#'
#' @return if success, return \code{TRUE}
#' @export
#'
#' @examples
#' \donttest{build_Ensembl_reference('Ensembl_ref', 'acanthochromis_polyacanthus')}
#' 
build_Ensembl_reference = function(outdir = NULL, species = NULL, method = NULL, ftp_method = NULL, verbose = TRUE) {
  
  # check parameter
  outdir     = as.character(outdir     %|||% getwd())
  species    = as.character(species    %|||% 'ALL')
  method     = as.character(method     %|||% 'libcurl')
  ftp_method = as.character(ftp_method %|||% 'curl')
  
  if(verbose) cat('-->', timer(), '1. Build Ensembl reference in:', outdir, '<--\n')
  dir.create(outdir, FALSE, TRUE)
  
  # catch ensembl
  web      = 'Ensembl.html'
  web_file = paste0(outdir, '/', web)
  suppressWarnings(file.remove(web_file))
  Download('http://ftp.ensembl.org/pub', web, outdir = outdir, method = method, verbose = FALSE)
  
  # process ensembl html
  release = sort(as.numeric(
    sub('/$', '', sub('release-', '', grep('^release.*/$', htmlHref(web_file), value = TRUE))) ), TRUE)[1]
  file.remove(web_file)
  if(is.na(release)) stop('!!! ', timer(), ' No Ensembl release number detected !!!')
  if(verbose) cat('-->', timer(), 'catch the latest Ensembl release:', release, '<--\n')
  
  # release URL
  URL     = paste0('http://ftp.ensembl.org/pub/release-', release, '/')
  URL_ftp = paste0('ftp://ftp.ensembl.org/pub/release-', release, '/')
  
  # catch species
  sp_web      = paste0('Ensembl_release', release, c('.fa.html', '.gtf.html'))
  sp_web_file = paste0(outdir, '/', sp_web)
  suppressWarnings(file.remove(sp_web_file))
  Download(paste0(URL, c('fasta', 'gtf')), sp_web, outdir = outdir, method = method, verbose = FALSE)
  
  # process species html
  Species = do.call(intersect, lapply(sp_web_file, function(file)
    sub('/$', '', grep('\\w/$', htmlHref(file), value = TRUE)) ))
  file.remove(sp_web_file)
  if('ALL' %in% species) {
    species = Species
    if(verbose) cat('-->', timer(), 'catch all species <--\n')
  } else {
    species = intersect(species, Species)
    if(verbose) cat('-->', timer(), 'catch species:', paste(species, collapse = ', '), '<--\n')
  }
  if(!length(species))
    stop('!!! ', timer(), ' No species detected !!!')

  # download for each species
  if(verbose) p = utils::txtProgressBar(style = 3)
  species_ok = unlist(lapply(seq(species), function(i) {
    
    # species
    sp = species[i]
    
    # catch fa and gtf html
    spi_web      = paste0('Ensembl_release', release, sp, c('.fa.html', '.gtf.html'))
    spi_web_file = paste0(outdir, '/', spi_web)
    suppressWarnings(file.remove(spi_web_file))
    Download(paste0(URL, c('fasta', 'gtf'), '/', sp, c('/dna', '')),
             spi_web, outdir = outdir, method = method, verbose = FALSE)
    
    # process html
    files = unlist(lapply(spi_web_file, function(file) 
      lapply(c('sm.toplevel.fa', paste0(release, '.gtf')), function(part) 
        grep(part, htmlHref(file), value = TRUE)[1] %|||% NULL) ))
    file.remove(spi_web_file)
    
    # download files
    if(length(files) > 1) {
      suppressWarnings(file.remove(files))
      Download(paste0(URL_ftp, c('fasta', 'gtf'), '/', sp, c('/dna', ''), '/', files),
               files, outdir = paste0(outdir, '/', sp), method = ftp_method, verbose = FALSE)
    } else {
      warning('--! ', timer(), ' no fasta | gtf in species: ', sp, ' !--', call. = FALSE)
      sp = NULL
    }
    
    # progress
    if(verbose) utils::setTxtProgressBar(p, i/length(species))
    sp
  }))
  if(verbose) close(p)
  
  # save log
  writeLines(c(timer(), species_ok), paste0(outdir, '/species.available.txt'))
  if(verbose) cat('-->', timer(), 'build Ensembl reference done <--\n') 
 
  # return
  TRUE
}

#' Build viruSite database reference
#'
#' Download reference sequences from the viruSite database and make genome and gene gtf.
#'
#' @param outdir character. Default \code{getwd()}
#' @param method character. Method to be used for downloading html files, equal to \code{download.file}. Default 'libcurl'
#' @param ftp_method character. Method to be used for downloading ftp files (.fa and .gtf), equal to \code{download.file}. Default 'curl'
#' @param verbose logical. Default \code{TRUE}
#'
#' @importFrom Biostrings readBStringSet width writeXStringSet
#' @importFrom utils write.table unzip
#'
#' @return if success, return \code{TRUE}
#' @export
#'
#' @examples
#' \donttest{build_viruSite_reference('viruSite')}
#' 
build_viruSite_reference = function(outdir = NULL, method = NULL, ftp_method = NULL, verbose = TRUE) {

  # check parameter
  outdir     = as.character(outdir     %|||% getwd())
  method     = as.character(method     %|||% 'libcurl')
  ftp_method = as.character(ftp_method %|||% 'curl')
  if(verbose) cat('-->', timer(), '1. Build viruses reference from viruSite in:', outdir, '<--\n')
  dir.create(outdir, FALSE, TRUE)

  # catch
  URL = 'http://www.virusite.org/'
  viruse_web = 'TrustVDJ_viruSite.html'
  viruse_web_file = paste0(outdir, '/', viruse_web)
  suppressWarnings(file.remove(viruse_web_file))
  Download(URL, viruse_web, outdir = outdir, method = method, verbose = verbose)

  # process html
  release = sort(as.numeric(
    sub('^Release ', '', grep('^Release', htmlText(viruse_web_file, 'body div b'), value = TRUE)) ), TRUE)[1]
  file.remove(viruse_web_file)
  if(is.na(release)) stop('!!! ', timer(), ' No viruSite release number detected !!!')
  if(verbose) cat('-->', timer(), 'catch the latest viruSite release:', release, '<--\n')

  # release download
  names = c('genes.fasta.zip', 'genomes.fasta.zip')
  URLs  = paste0(URL, 'archive/', release, '/', names)
  files = paste0(outdir, '/', names)
  suppressWarnings(file.remove(files))
  Download(URLs, names, outdir = outdir, method = ftp_method, verbose = verbose)
  lapply(files, function(file) utils::unzip(file, exdir = outdir))

  # Ref from Genome
  genome_file = paste0(outdir, '/genomes.fasta')
  if(verbose) cat('-->', timer(), 'process genome reference <--\n')
  genome = Biostrings::readBStringSet(genome_file)
  id = pick(names(genome), 2)
  genomeGTF = data.frame(
    ID     = id,
    source = 'viruSite',
    fea    = 'exon',
    start  = 1,
    end    = Biostrings::width(genome),
    score  = '.',
    strand = '+',
    frame  = '.',
    attr   = paste0('gene_id "', id, '";transcript_id "', id,
                    '";gene_name "', pick(names(genome), 4), '";')
  )
  utils::write.table(genomeGTF, paste0(outdir, '/genomes.only.gtf'),
                     row.names = FALSE, col.names = FALSE, sep = '\t', quote = FALSE)
  names(genome) = id
  Biostrings::writeXStringSet(genome, genome_file)
  rm(genome, genomeGTF)

  # Ref from gene
  gene_file = paste0(outdir, '/genes.fasta')
  if(verbose) cat('-->', timer(), 'process gene reference <--\n')
  gene   = Biostrings::readBStringSet(gene_file)
  parent = pick(names(gene), 2)
  id     = make.unique(paste0(parent, '.gene'))
  name   = paste0(pick(names(gene), 4), sub('.*\\.gene', '-gene', id))
  # gene gtf
  geneGTF = data.frame(
    ID     = id,
    source = 'viruSite',
    fea    = 'exon',
    start  = 1,
    end    = Biostrings::width(gene),
    score  = '.',
    strand = '+',
    frame  = '.',
    attr   = paste0('gene_id "', id, '";transcript_id "', id, '";gene_name "', name, '";')
  )
  utils::write.table(geneGTF, paste0(outdir, '/genes.only.gtf'),
                     row.names = FALSE, col.names = FALSE, sep = '\t', quote = FALSE)
  # gene-genome gtf (remove join, >, < and order)
  local = sub('complement\\(', '', sub('\\)', '', pick(names(gene), 3)))
  rm    = grep('join|order|<|>', local)
  local = strsplit(local[-rm], '\\.\\.')
  GTF   = data.frame(
    ID     = parent[-rm],
    source = 'viruSite',
    fea    = 'exon',
    start  = sapply(local, function(i) as.numeric(i[1])),
    end    = sapply(local, function(i) as.numeric(i[2])),
    score  = '.',
    strand = '+',
    frame  = '.',
    attr   = paste0('gene_id "', id[-rm], '";transcript_id "', id[-rm], '";gene_name "', name[-rm], '";')
  )
  utils::write.table(GTF, paste0(outdir, '/genes.genome.gtf'),
                     row.names = FALSE, col.names = FALSE, sep = '\t', quote = FALSE)
  names(gene) = id
  Biostrings::writeXStringSet(gene, gene_file)

  # save log
  writeLines(c(timer(), paste('latest release:', release)), paste0(outdir, '/release.latest.txt'))
  if(verbose) cat('-->', timer(), 'build viruSite reference done <--\n')

  # return
  file.remove(files)
  TRUE
}
HatsuneCode/TrustVDJ documentation built on Aug. 13, 2022, 9:36 p.m.