R/archs4-features.R

#' Create meta information for the genes and transcripts in the ARCHS4 dataset.
#'
#' @description
#' This is a preprocessing function that is required to successfully build an
#' `Archs4Repository`. It is not really intended for use during analyses.
#'
#' This Function creates *all* of the feature-level CSV files for the features
#' enumerated in the `meta/genes` gene-level hdf5 file, and the
#' `meta/transcript` transcript identfiers in the transctipt-level hdf5 file for
#' the mouse and human files found in `datadir`.
#'
#' **In order for this to work** you have to download the approprate human and
#' mouse gtf files from ensembl and save them in `datadir`. Reference the
#' [archs4_local_data_dir_validate()] function.
#'
#' For the initial relesae of the ARCHS4 dataset, the
#' `Homo_sapiens.GRCh38.90.gtf.gz` and `Mus_musculus.GRCm38.90.gtf.gz` were
#' used.
#'
#' @details
#' This function will write the augmented transcript- and gene-level files in
#' the `datadir`, using the following pattern:
#' `<organism>_<feature_type>_augmented_info.csv.gz`
#'
#' Gene symbols are the only piece of information provided for the row-level
#' identifieres for the gene count matrices. Furthermore, the gene symbol used
#' in mouse are in all uppercase, which is not how genes are referred to there.
#' In order to augment the gene symbol information with gene-level identifiers
#' and other information, we parse relatively recent GTFs provided by GENCODE.
#'
#' The fruits of the labor generated by this function are used by the
#' [archs4_feature_info()] function.
#'
#' Note that this function will replace already existing "`augmented`" files
#' if the already exist in `datadir`.
#'
#' @export
#' @importFrom utils write.csv
#' @seealso [archs4_feature_info()]
#'
#' @param datadir The directory that has the mouse and human expression
#'   hdf5 files. There will be `SPECIES_FEATURETYPE_augmented_info.csv.gz` files
#'   saved in this directory whe this function completes.
create_augmented_feature_info <- function(datadir = getOption("archs4.datadir")) {
  if (!requireNamespace("GenomicRanges", quietly = TRUE)) {
    stop("GenomicRanges package is required to create augmented feature tables")
  }
  if (!requireNamespace("rtracklayer", quietly = TRUE)) {
    stop("rtracklayer package is required to create augmented feature tables")
  }

  # Ensure that the appropriate GTF files exist
  sources <- archs4_sources()
  source.keys <- paste0(sources, "_gtf")
  gtfs <- archs4_file_path(source.keys, stop_if_missing = FALSE,
                           datadir = datadir)
  if (any(is.na(gtfs))) {
    stop("Some GTF files are missing, please run ",
         "`archs4_local_data_dir_validate()` for further information")
  }

  for (s in sources) {
    message("Constructing ", s, " augmented annotations ...")
    gtf.fn <- gtfs[paste0(s, "_gtf")]
    tx.fn <- local({
      key <- paste0(s, "_transcript_info")
      fn <- suppressWarnings(
        archs4_file_path(key, stop_if_missing = FALSE, na_missing = FALSE,
                         datadir = datadir)
      )
      sub("\\.gz$", "", fn)
    })
    gn.fn <- local({
      key <- paste0(s, "_gene_info")
      fn <- suppressWarnings(
        archs4_file_path(key, stop_if_missing = FALSE, na_missing = FALSE,
                         datadir = datadir)
      )
      sub("\\.gz$", "", fn)
    })

    gr <- rtracklayer::import.gff(gtf.fn)

    a4.tinfo <- archs4_feature_info("transcript", s, augmented = FALSE)
    txinfo <- .augmented_transcript_info(gr, a4.tinfo)
    write.csv(txinfo, tx.fn, row.names = FALSE)
    system(paste("gzip", tx.fn))

    a4.ginfo <- archs4_feature_info("gene", s, augmented = FALSE)
    ginfo <- .augmented_gene_info(gr, a4.ginfo)
    write.csv(ginfo, gn.fn, row.names = FALSE)
    system(paste("gzip", gn.fn))
    message("")
  }

}

#' Helper function that creats augmented transcript information file.
#'
#' @noRd
#'
#' @param gr a GRanges object from .load_gtf, we assume this is enembl gtfs
#' @param a4.tinfo The "raw" information that the ARCHS4 data stores for its
#'   transcripts, ie. the output from
#' @return a decorated `a4.tinfo` tibble of augmented transcript information.
.augmented_transcript_info <- function(gr, a4.tinfo) {
  requireNamespace("GenomicRanges", quietly = TRUE)
  dfa <- GenomicRanges::as.data.frame(gr)

  tx.info.all <- filter(dfa, type == "transcript")
  tx.info <- select(tx.info.all, transcript_id, transcript_biotype,
                    gene_name, gene_id, gene_biotype, gene_source,
                    seqnames, start, end, strand)
  if (any(duplicated(tx.info$transcript_id))) {
    stop("Duplicated transcript ids found in gtf file")
  }

  # After the full_join, we will find which transcripts are exclusive to the
  # gtf and the archs4 transcript file (hopefully none in the later case)
  check.me <- tx.info %>%
    full_join(a4.tinfo, by = c("transcript_id" = "ensembl_id"))
  stopifnot(sum(duplicated(check.me$transcript_id)) == 0L)

  gtf.only <- filter(check.me, is.na(h5idx))
  message(
    nrow(gtf.only), " transcripts in gtf only ",
    sprintf("(%.2f%% of transcripts from gtf)",
            nrow(gtf.only) / nrow(tx.info)))

  a4.only <- filter(check.me, !is.na(h5idx) & is.na(gene_biotype))
  message(
    nrow(a4.only), " transcripts are only in arch4 tx file ",
    sprintf("(%.2f%% of transcripts in ARCHS4 hdf5 file)",
            nrow(a4.only) / nrow(a4.tinfo)))

  out <- a4.tinfo %>%
    left_join(tx.info, by = c("ensembl_id" = "transcript_id")) %>%
    rename(symbol = gene_name)
  stopifnot(
    nrow(out) == nrow(a4.tinfo),
    all(out$h5idx == a4.tinfo$h5idx))
  out
}

#' Helper function to create augmented gene-level metadata.
#'
#' @noRd
#'
#' @param x a GRanges object from .load_gtf, we assume this is enembl gtfs
#' @param a4.ginfo result from
#'   `archs4_feature_info("gene", ..., augmented = FALSE)`
#' @return a decorated `a4.ginfo` table.
.augmented_gene_info <- function(gr, a4.ginfo) {
  requireNamespace("GenomicRanges", quietly = TRUE)
  has.ens <- "ens_id" %in% colnames(a4.ginfo)
  # Not trusting the ensembl_id's for now, due to this:
  # https://github.com/MaayanLab/archs4/issues/3
  has.ens <- FALSE
  if (has.ens) {
    a4.ginfo$join <- a4.ginfo$ens_id
  } else {
    a4.ginfo$join <- a4.ginfo$a4name
  }
  dup.join <- a4.ginfo$join[duplicated(a4.ginfo$join)]

  a4.ginfo.all <- a4.ginfo
  a4.ginfo <- filter(a4.ginfo, !join %in% dup.join)

  # a4.ginfo <- mutate(a4.ginfo, join = tolower(a4name))
  stopifnot(sum(duplicated(a4.ginfo$join)) == 0L)

  gr.exons <- gr[gr$type == "exon"]
  grl <- GenomicRanges::split(gr.exons, gr.exons$gene_id)
  rgrl <- GenomicRanges::reduce(grl)
  gwidths <- sum(GenomicRanges::width(grl))
  gwidths <- tibble(gene_id = names(gwidths), length = unname(gwidths))

  dfa <- as.data.frame(gr)
  txstats <- dfa %>%
    filter(type == "transcript") %>%
    filter(!is.na(as.integer(seqnames)) | seqnames %in% c("X", "Y", "M")) %>%
    droplevels %>%
    group_by(gene_id) %>%
    summarize(ntx = n())

  # Grooming a symbol-based, gene-level metadata table
  g.info.all <- filter(dfa, type == "gene")
  g.info <- g.info.all %>%
    mutate(seqnames = as.character(seqnames)) %>%
    filter(!is.na(as.integer(seqnames)) | seqnames %in% c("X", "Y", "M")) %>%
    droplevels %>%
    select(gene_id, gene_name, gene_biotype, seqnames, start, end, strand)
  if (has.ens) {
    g.info$join <- g.info$gene_id
  } else {
    g.info$join <- g.info$gene_name
  }

  if (any(duplicated(g.info.all$gene_id))) {
    stop("Duplicated gene ids")
  }

  # There are likely duplicated "join" (symbol) rows in g.info, so deal with
  # that first -- this is a hack. Let's order the info table so that the
  # annotation entries (ie. gene_biotype) we care about appear first and pick
  # the first by join
  if (FALSE) {
    dup.symbols <- g.info$join[duplicated(g.info$join)]
    wtf <- g.info %>%
      filter(join %in% dup.symbols) %>%
      arrange(join, gene_id)
    View(wtf)
  }
  bt.order <- c('protein_coding', 'miRNA', 'lincRNA', 'rRNA', 'snoRNA',
                'scRNA', 'scaRNA', 'sRNA')
  bt.order <- c(bt.order, setdiff(g.info$gene_biotype, bt.order))
  g.info$gene_biotype <- factor(g.info$gene_biotype, bt.order)
  g.info.u <- g.info %>%
    arrange(join, gene_biotype) %>%
    filter(!duplicated(join))

  # After the full_join, we will find which genes are exclusive to the
  # gtf and the archs4 gene file (hopefully none in the later case)
  check.me <- full_join(g.info.u, a4.ginfo, by = "join")
  stopifnot(sum(duplicated(check.me$join)) == 0L)

  gtf.only <- filter(check.me, is.na(h5idx))
  message(
    nrow(gtf.only), " genes in gtf only ",
    sprintf("(%.2f%% of genes from gtf)",
            nrow(gtf.only) / nrow(g.info.u)))

  a4.only <- filter(check.me, !is.na(h5idx) & is.na(gene_biotype))
  message(
    nrow(a4.only), " genes are only in arch4 tx file ",
    sprintf("(%.2f%% of genes in ARCHS4 hdf5 file)",
            nrow(a4.only) / nrow(a4.ginfo)))

  out <- a4.ginfo %>%
    left_join(g.info.u, by = "join") %>%
    select(-join) %>%
    rename(symbol = gene_name)
  out <- left_join(out, gwidths, by = "gene_id")
  stopifnot(
    nrow(out) == nrow(a4.ginfo),
    all(out$h5idx == a4.ginfo$h5idx))
  out
}
denalitherapeutics/archs4 documentation built on May 17, 2019, 1:29 p.m.