R/geneticDataTools.R

Defines functions getNcbiFeatures getNcbiChr2Acc getUcscQuery getUcscCytoBands getNcbiFeatureTable

getNcbiFeatureTable <- function(v = "GCF_000001405.30_GRCh38.p4") {
  ## v: chr of length 1, the NCBI version 
  if(!curl::has_internet()) stop("Must be connected to the internet.")
  d <- file.path("ftp:/", 
                 "ftp.ncbi.nih.gov", 
                 "genomes", 
                 "refseq", 
                 "vertebrate_mammalian", 
                 "Homo_sapiens", 
                 "all_assembly_versions",
                 v)
  f <- sprintf("%s_feature_table.txt.gz", v)
  tbl <- try(suppressWarnings(fread(file.path(d, f), showProgress = FALSE)), 
             silent = TRUE)
  if (is(tbl, "try-error")) {
    m <- "Could not download the feature file for the given version."
    c <- match.call()
    stop(simpleError(m, c))
  }
  setnames(tbl, "# feature", "feature")
  tbl[]
}

getUcscCytoBands <- function(db = "hg38", oneBased = TRUE) {
  if(!curl::has_internet()) stop("Must be connected to the internet.")
  ucsc <- dbConnect(MySQL(), 
                    username = "genome", 
                    dbname = db,
                    host = "genome-mysql.soe.ucsc.edu",
                    port = 3306,
                    password = "")
  dat <- suppressWarnings(dbGetQuery(ucsc, "SELECT * FROM cytoBand;"))
  dat <- as.data.table(dat)
  dbDisconnect(ucsc)
  if (oneBased) {
    dat[, chromStart := chromStart + 1]
  }
  dat[]
}

getUcscQuery <- function(qstr, db = "hg38") {
  if(!curl::has_internet()) stop("Must be connected to the internet.")
  ucsc <- dbConnect(MySQL(), 
                    username = "genome", 
                    dbname = db,
                    host = "genome-mysql.soe.ucsc.edu",
                    port = 3306,
                    password = "")
  dat <- suppressWarnings(dbGetQuery(ucsc, qstr))
  dat <- as.data.table(dat)
  dbDisconnect(ucsc)
  warning("UCSC uses 0-based start and 1-based end positions.", call. = FALSE)
  dat[]
}

getNcbiChr2Acc <- function(v = "GCF_000001405.26_GRCh38") {
  ## v: chr of length 1, the NCBI version 
  if(!curl::has_internet()) stop("Must be connected to the internet.")
  d <- file.path("ftp:/", 
                 "ftp.ncbi.nih.gov", 
                 "genomes", 
                 "refseq", 
                 "vertebrate_mammalian", 
                 "Homo_sapiens", 
                 "all_assembly_versions",
                 v, 
                 sprintf("%s_assembly_structure", v))
  f <- file.path("assembled_chromosomes", "chr2acc")
  m1 <- try(silent = TRUE, {
    suppressWarnings(fread(file.path(d, "Primary_Assembly", f), 
                           showProgress = FALSE))
  })
  if (is(m1, "try-error")) {
    m <- "Could not download the chr2acc file for the given version."
    c <- match.call()
    stop(simpleError(m, c))
  }
  m1[ , prefix := "g"]
  m2 <- try(silent = TRUE, {
    suppressWarnings(fread(file.path(d, "non-nuclear", f), 
                           showProgress = FALSE))
  })
  if (is(m2, "try-error")) {
    m <- "Could not download the chr2acc file for the given version."
    c <- match.call()
    stop(simpleError(m, c))
  }
  m2[ , prefix := "m"]
  out <- rbind(m1, m2)
  setnames(out, c("chr", "acc", "prefix"))
  out[]
}

getNcbiFeatures <- function(v = "GCF_000001405.26_GRCh38") {
  ## v: chr of length 1, the NCBI version 
  if(!curl::has_internet()) stop("Must be connected to the internet.")
  f <- file.path("ftp:/", 
                 "ftp.ncbi.nih.gov", 
                 "genomes", 
                 "refseq", 
                 "vertebrate_mammalian", 
                 "Homo_sapiens", 
                 "all_assembly_versions",
                 v,
                 sprintf("%s_genomic.gff.gz", v))
  cl <- c("seqid", "type", "start", "end", "strand")
  tg <- c("gene", "genome", "chromosome", "Dbxref")
  tmpFile <- tempfile()
  tmp <- try(download.file(f, destfile = tmpFile, quiet = TRUE), silent = TRUE)
  if (is(tmp, "try-error")) {
    m <- "Could not download the feature file for the given version."
    c <- match.call()
    stop(simpleError(m, c))
  }
  tbl <- rtracklayer::readGFF(tmpFile, columns = cl, tags = tg)
  file.remove(tmpFile)
  tbl <- as.data.table(tbl)
  map <- tbl[!is.na(genome), .(seqid, genome, chromosome)]
  map[genome == "mitochondrion", chromosome := "MT"]
  tbl <- merge(tbl[ , .(seqid, type, start, end, strand, gene)], map)
  tbl <- unique(tbl)
  tbl[]
}
daynefiler/dlfUtils documentation built on Oct. 22, 2021, 9:49 a.m.