R/get_genome_db.R
In utr.annotation: Annotate Variants in the Untranslated Regions

Documented in createEnsDbFromAH getEnsembl getOrgName getSeqTable getTranscriptIds getTranscriptIdsOneVariant getTrasncriptsRegions queryEnsemblInfo validateTranscripts

# Author: Yating Liu
# Description: Define functions to get genome information from database
# Date: 02/01/2019

#' Convert species name to AnnotationHub acceptable species name
#' @keywords internal
#' @param species human or mouse
#' @return AnnotationHub acceptable species name
getOrgName <- function(species) {
  if (species == "human") {
    return ("Homo sapiens")
  } else if (species == "mouse") {
    return ("Mus musculus")
  } else {
    stop("We only support human and mouse at this point!")
  }
}


#' Create a EnsDb database file from AnnotationHub
#' @keywords internal
#' @description \code{createEnsDbFromAH} create a sqlite db for specified species and version
#' @param species human or mouse
#' @param ensemblVersion Ensembl database version
#' @param dataDir path to store database information
#' @return save Db file into dataDir/species/ensemblVersion/
#' @import AnnotationHub
#' @importFrom ensembldb ensDbFromAH ensemblVersion organism
createEnsDbFromAH <- function(species, ensemblVersion, dataDir) {
  dataDir <- file.path(dataDir, species, ensemblVersion)
  # If DB file already exists, load the Db file
  if (length(list.files(dataDir, pattern = "sqlite$")) == 1) {
    return (list.files(dataDir, pattern = "sqlite$", full.names = T))
  }
  if (!dir.exists(dataDir)) {
    dir.create(dataDir, showWarnings = TRUE, recursive = TRUE, mode = "0777")
  }
  ## Load the AnnotationHub data.
  ah <- AnnotationHub()
  ## Query all available files for Ensembl release version for the species
  orgName <- getOrgName(species)
  gtffiles <- query(ah, c(orgName, paste0("release-", ensemblVersion), "GTF"))
  ## select full annotation by default
  ahId <- names(gtffiles)[str_detect(gtffiles$title, paste0(ensemblVersion, ".gtf$")) & tolower(gtffiles$species) == tolower(orgName)]
  ## Get the resource for the gtf file with the gene/transcript definitions.
  Gtf <- ah[ahId]
  ## Create a EnsDb database file from this.
  DbFile <- ensDbFromAH(Gtf, path = dataDir)
  return (DbFile)
}

#' Get protein coding transcript ids that overlap with each variant in the variant table
#' @keywords internal
#' @description \code{getTranscriptIds} Get transcript ids that overlap with each variant in the variant table
#' @param variantsTable variant table
#' @param species human or mouse
#' @param ensemblVersion Ensembl database version
#' @param dataDir path to store database information
#' @return a list of protein coding transcript ids that overlap with each variant, if more than one transcripts overlap with a variant, they are concatenated with ";"
getTranscriptIds <- function(variantsTable, species, ensemblVersion, dataDir) {
  i <- NULL
  # Create a EnsDb database file from AnnotationHub
  createEnsDbFromAH(species, ensemblVersion, dataDir)
  transcriptIds <- foreach(i=1:nrow(variantsTable), .combine = c) %dopar% getTranscriptIdsOneVariant(variantsTable[i,], species, ensemblVersion, dataDir)
  return (transcriptIds)
}

#' Search the db to find protein coding transcript ids that overlap with a variant
#' @keywords internal
#' @description \code{getTranscriptIdsOneVariant} Get transcript ids that overlap with one variant
#' @param variant one variant
#' @param species human or mouse
#' @param ensemblVersion Ensembl database version
#' @param dataDir path to store database information
#' @return protein coding transcript ids that overlap with the variant, if more than one transcripts overlap with the variant, they are concatenated with ";", if no transcript found, return NA.
#' @importFrom ensembldb EnsDb
#' @importFrom AnnotationFilter GRangesFilter TxBiotypeFilter
#' @importFrom GenomicRanges GRanges
#' @importFrom GenomicFeatures transcripts
#' @importFrom IRanges IRanges
getTranscriptIdsOneVariant <- function(variant, species, ensemblVersion, dataDir) {
  # Connect to an EnsDb object
  ensDb <- EnsDb(createEnsDbFromAH(species, ensemblVersion, dataDir))
  ## Define the filter
  grf <- GRangesFilter(GRanges(str_remove(variant[["Chr"]], "chr"),
                               ranges = IRanges(as.numeric(variant[["Pos"]]), as.numeric(variant[["Pos"]]) + max(nchar(variant[["Ref"]]), nchar(variant[["Alt"]])) - 1)),
                       type = "any")
  biotype <- TxBiotypeFilter("protein_coding", "==")

  ## Query transcripts that overlap with the variant and must be protein coding gene
  trans <- transcripts(ensDb, filter = c(grf, biotype))
  if (length(trans) > 0) {
    return (paste0(sort(trans$tx_id), collapse = ";"))
  } else {
    return (NA)
  }
}

#' Checking whether a transcript is valid
#' @keywords internal
#' @description \code{validateTranscripts} checks 1) whether a transcript has a peptide sequence, 2) whether the peptide sequence ends with "*", 3) whether the peptide starts with "M"
#' @param transcriptIds a list of Ensembl transcript ids
#' @param species human or mouse
#' @param ensemblVersion Ensembl database version
#' @return transcript_regions with additional flags: protein_coding, valid_startcodon, valid_stopcodon
#' @import data.table
validateTranscripts <- function(transcriptIds, species, ensemblVersion, dataDir = "data") {
  pep_seq <- getSeqTable(transcriptIds, 'peptide', species, ensemblVersion, dataDir)
  pep_seq[, `:=` (protein_coding = (seq != "Sequence unavailable"),
                  valid_startcodon = !startsWith(seq, "X"),
                  valid_stopcodon = endsWith(seq, "*"),
                  startcodon_M = startsWith(seq, "M"))]
  return (pep_seq[, seq := NULL])
}

#' query transcrips regions, utr sequences, and coding sequences from Ensembl database
#' @keywords internal
#' @description \code{queryEnsemblInfo} query transcrips regions, utr sequences, and coding sequences from Ensembl database and save rds into sepecified directory
#' @param transcriptIds Ensembl transcript ids
#' @param species human or mouse
#' @param ensemblVersion Ensembl database version
#' @param dataDir path to store database information
#' @param verbose Whether print diagnostic messages. The default is FALSE.
#' @return save all Db objects into dataDir
#' @import data.table
queryEnsemblInfo <- function(transcriptIds, species, ensemblVersion, dataDir, verbose = FALSE) {
  ensembl_transcript_id <- protein_coding <- valid_stopcodon <- valid_startcodon <-  NULL
  dataDir <- file.path(dataDir, species, ensemblVersion)
  if (!dir.exists(dataDir)) {
    dir.create(dataDir, showWarnings = TRUE, recursive = TRUE, mode = "0777")
  }
  # If no Transcript that overlap with any variants, return
  if (all(is.na(transcriptIds)) == TRUE) {
    if (verbose) cat("no transcript overlaps with variants, return.")
    return (NA)
  }
  # Get transcript info, 5'UTR and 3'UTR coordinates for each transcript.
  if (verbose) cat("Get all ensembl transcript id to extract sequences from DB...\n")
  transcript_regions <- getTrasncriptsRegions(transcriptIds, species, ensemblVersion, dataDir)
  transcriptIds <- unique(transcript_regions[, ensembl_transcript_id])
  # check whether a transcript is protein coding, have valid start codon, have valid stop codon
  transcript_validation <- validateTranscripts(transcriptIds, species, ensemblVersion, dataDir)
  transcript_validation <- transcript_validation[protein_coding == TRUE & valid_stopcodon == TRUE & valid_startcodon,]
  ## remove transcripts that are not protein coding and do not have stop codon
  transcript_regions <- left_join(transcript_validation[, c("ensembl_transcript_id", "startcodon_M")], transcript_regions)
  transcript_regions <- as.data.table(transcript_regions)
  transcriptIds <- unique(transcript_validation[, ensembl_transcript_id])
  # To check if any transcripts. If no transcripts in the variants, no further analysis is needed
  if (length(transcriptIds) == 0) {
    if (verbose) cat("no valid transcript overlaps with variants, return.")
  } else {
    if (verbose) cat("Saving transcript_regions to:", file.path(dataDir, "transcript_regions.rds"))
    saveRDS(transcript_regions, file.path(dataDir, "transcript_regions.rds"))
    # extract sequences from DB
    if (verbose) cat("extract sequences from DB...\n")
    utr5_seq <- getSeqTable(transcriptIds, '5utr', species, ensemblVersion, dataDir)
    if (verbose) cat("Saving utr5_seq to:", file.path(dataDir, "utr5_seq.rds"))
    saveRDS(utr5_seq, file.path(dataDir, "utr5_seq.rds"))
    utr3_seq <- getSeqTable(transcriptIds, '3utr', species, ensemblVersion, dataDir)
    if (verbose) cat("Saving utr3_seq to:", file.path(dataDir, "utr3_seq.rds"))
    saveRDS(utr3_seq, file.path(dataDir, "utr3_seq.rds"))
    coding_seq <- getSeqTable(transcriptIds, 'coding', species, ensemblVersion, dataDir)
    if (verbose) cat("Saving coding_seq to:", file.path(dataDir, "coding_seq.rds"))
    saveRDS(coding_seq, file.path(dataDir, "coding_seq.rds"))
  }

}

#' Get Ensembl database object for the specified species
#' @keywords internal
#' @param species human or mouse
#' @param ensemblVersion Ensembl version number
#' @param dataDir Path to store database information
#' @return Save BioMart object to dataDir
#' @importFrom biomaRt useEnsembl
getEnsembl <- function(species, ensemblVersion, dataDir) {
  if (!dir.exists(dataDir)) {
    dir.create(dataDir, showWarnings = TRUE, recursive = FALSE, mode = "0777")
  }
  if (species == "human") {
    if (!file.exists(file.path(dataDir, paste0("ensembl_human_", ensemblVersion, ".rds")))) {
      ensembl = useEnsembl(biomart="ensembl", dataset="hsapiens_gene_ensembl", version = ensemblVersion)
      saveRDS(ensembl, file.path(dataDir, paste0("ensembl_human_", ensemblVersion, ".rds")))
    } else {
      ensembl <- readRDS(file.path(dataDir, paste0("ensembl_human_", ensemblVersion, ".rds")))
    }
  } else if (species == "mouse") {
    if (!file.exists(file.path(dataDir, paste0("ensembl_mouse_", ensemblVersion, ".rds")))) {
      ensembl = useEnsembl(biomart="ensembl", dataset="mmusculus_gene_ensembl", version = ensemblVersion)
      saveRDS(ensembl, file.path(dataDir, paste0("ensembl_mouse_", ensemblVersion, ".rds")))
    } else {
      ensembl <- readRDS(file.path(dataDir, paste0("ensembl_mouse_", ensemblVersion, ".rds")))
    }
  } else {
    stop("Species ", species, " is not supported currently! Only human or mouse is supported.")
  }
  return (ensembl)
}

#' Get information on transcript regions, UTRs regions, coding regions, chromosome name, and strand from Ensembl database
#' @description \code{getTrasncriptsRegions} query "ensembl_gene_id", "ensembl_transcript_id", "transcript_start", "transcript_end", "5_utr_start", "5_utr_end", "3_utr_start", "3_utr_end", "genomic_coding_start", "genomic_coding_end", "gene_biotype", "strand" from Ensembl database
#' @keywords internal
#' @param transcriptIds Ensembl transcript ids
#' @param species human or mouse
#' @param ensemblVersion Ensembl database version
#' @param dataDir directory to store DB information
#' @return transcript structure table containing "ensembl_gene_id", "ensembl_transcript_id", "transcript_start", "transcript_end", "5_utr_start", "5_utr_end", "3_utr_start", "3_utr_end", "genomic_coding_start", "genomic_coding_end","gene_biotype", "strand"
#' @importFrom biomaRt getBM
getTrasncriptsRegions <- function(transcriptIds, species, ensemblVersion, dataDir) {
  ensembl <- getEnsembl(species, ensemblVersion, dataDir)
  tran_ids <- unique(unlist(str_split(transcriptIds, ";")))
  transcript_regions <- getBM(attributes = c("ensembl_gene_id", "ensembl_transcript_id", "transcript_start", "transcript_end", "5_utr_start", "5_utr_end", "3_utr_start", "3_utr_end", "genomic_coding_start", "genomic_coding_end", "gene_biotype", "strand"), filters = c("ensembl_transcript_id", "transcript_gencode_basic"), values = list(tran_ids, TRUE), mart = ensembl)
  transcript_regions <- as.data.table(transcript_regions)
  # change column names which start with a number, which is a not good name convension for data table
  setnames(transcript_regions, c("5_utr_start", "5_utr_end", "3_utr_start", "3_utr_end"), c("utr5_start", "utr5_end", "utr3_start", "utr3_end"))
  return (transcript_regions)
}

#' Get UTR sequences of a list of transcripts
#' @keywords internal
#' @description \code{getSeqTable} will query 5' UTR or 3' UTR sequences of a list of transcripts from Ensembl database
#' @param transcriptIds a list of Ensembl transcript ids
#' @param seqType Either '5utr' or '3utr'. 5utr' for 5' UTR sequence, '3utr' for 3' UTR sequences
#' @param species human or mouse
#' @param dataDir directory to store DB information
#' @return sequences table containing columns: ensembl_transcript_id and seq
#' @importFrom biomaRt getSequence
getSeqTable <- function(transcriptIds, seqType, species, ensemblVersion, dataDir, downstream = NULL, upstream = NULL) {
  ensembl <- getEnsembl(species, ensemblVersion, dataDir)

  if (!is.null(downstream) & !is.null(upstream)) {
    stop("Currently getSequence only allows the user to specify either an upstream of a downstream argument but not both.")
  }
  if (!is.null(downstream)) {
    seqTable <- getSequence(id=transcriptIds,
                            type='ensembl_transcript_id',
                            seqType = seqType,
                            downstream = downstream,
                            mart = ensembl)
  } else if (!is.null(upstream)) {
    seqTable <- getSequence(id=transcriptIds,
                            type='ensembl_transcript_id',
                            seqType = seqType,
                            upstream = upstream,
                            mart = ensembl)
  } else {
    seqTable <- getSequence(id=transcriptIds,
                            type='ensembl_transcript_id',
                            seqType = seqType,
                            mart = ensembl)
  }
  seqTable <- as.data.table(seqTable)
  setnames(seqTable, old=seqType, new='seq')

  return (seqTable)
}

Any scripts or data that you put into this service are public.

utr.annotation documentation built on Aug. 23, 2021, 9:06 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

utr.annotation
Annotate Variants in the Untranslated Regions

R/get_genome_db.R
In utr.annotation: Annotate Variants in the Untranslated Regions

Defines functions getSeqTable getTrasncriptsRegions getEnsembl queryEnsemblInfo validateTranscripts getTranscriptIdsOneVariant getTranscriptIds createEnsDbFromAH getOrgName

Documented in createEnsDbFromAH getEnsembl getOrgName getSeqTable getTranscriptIds getTranscriptIdsOneVariant getTrasncriptsRegions queryEnsemblInfo validateTranscripts

Try the utr.annotation package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

utr.annotation Annotate Variants in the Untranslated Regions

R/get_genome_db.R In utr.annotation: Annotate Variants in the Untranslated Regions

Defines functions getSeqTable getTrasncriptsRegions getEnsembl queryEnsemblInfo validateTranscripts getTranscriptIdsOneVariant getTranscriptIds createEnsDbFromAH getOrgName

Documented in createEnsDbFromAH getEnsembl getOrgName getSeqTable getTranscriptIds getTranscriptIdsOneVariant getTrasncriptsRegions queryEnsemblInfo validateTranscripts

Try the utr.annotation package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

utr.annotation
Annotate Variants in the Untranslated Regions

R/get_genome_db.R
In utr.annotation: Annotate Variants in the Untranslated Regions