#' Create meta information for the genes and transcripts in the ARCHS4 dataset.
#'
#' @description
#' This is a preprocessing function that is required to successfully build an
#' `Archs4Repository`. It is not really intended for use during analyses.
#'
#' This Function creates *all* of the feature-level CSV files for the features
#' enumerated in the `meta/genes` gene-level hdf5 file, and the
#' `meta/transcript` transcript identfiers in the transctipt-level hdf5 file for
#' the mouse and human files found in `datadir`.
#'
#' **In order for this to work** you have to download the approprate human and
#' mouse gtf files from ensembl and save them in `datadir`. Reference the
#' [archs4_local_data_dir_validate()] function.
#'
#' For the initial relesae of the ARCHS4 dataset, the
#' `Homo_sapiens.GRCh38.90.gtf.gz` and `Mus_musculus.GRCm38.90.gtf.gz` were
#' used.
#'
#' @details
#' This function will write the augmented transcript- and gene-level files in
#' the `datadir`, using the following pattern:
#' `<organism>_<feature_type>_augmented_info.csv.gz`
#'
#' Gene symbols are the only piece of information provided for the row-level
#' identifieres for the gene count matrices. Furthermore, the gene symbol used
#' in mouse are in all uppercase, which is not how genes are referred to there.
#' In order to augment the gene symbol information with gene-level identifiers
#' and other information, we parse relatively recent GTFs provided by GENCODE.
#'
#' The fruits of the labor generated by this function are used by the
#' [archs4_feature_info()] function.
#'
#' Note that this function will replace already existing "`augmented`" files
#' if the already exist in `datadir`.
#'
#' @export
#' @importFrom utils write.csv
#' @seealso [archs4_feature_info()]
#'
#' @param datadir The directory that has the mouse and human expression
#' hdf5 files. There will be `SPECIES_FEATURETYPE_augmented_info.csv.gz` files
#' saved in this directory whe this function completes.
create_augmented_feature_info <- function(datadir = getOption("archs4.datadir")) {
if (!requireNamespace("GenomicRanges", quietly = TRUE)) {
stop("GenomicRanges package is required to create augmented feature tables")
}
if (!requireNamespace("rtracklayer", quietly = TRUE)) {
stop("rtracklayer package is required to create augmented feature tables")
}
# Ensure that the appropriate GTF files exist
sources <- archs4_sources()
source.keys <- paste0(sources, "_gtf")
gtfs <- archs4_file_path(source.keys, stop_if_missing = FALSE,
datadir = datadir)
if (any(is.na(gtfs))) {
stop("Some GTF files are missing, please run ",
"`archs4_local_data_dir_validate()` for further information")
}
for (s in sources) {
message("Constructing ", s, " augmented annotations ...")
gtf.fn <- gtfs[paste0(s, "_gtf")]
tx.fn <- local({
key <- paste0(s, "_transcript_info")
fn <- suppressWarnings(
archs4_file_path(key, stop_if_missing = FALSE, na_missing = FALSE,
datadir = datadir)
)
sub("\\.gz$", "", fn)
})
gn.fn <- local({
key <- paste0(s, "_gene_info")
fn <- suppressWarnings(
archs4_file_path(key, stop_if_missing = FALSE, na_missing = FALSE,
datadir = datadir)
)
sub("\\.gz$", "", fn)
})
gr <- rtracklayer::import.gff(gtf.fn)
a4.tinfo <- archs4_feature_info("transcript", s, augmented = FALSE)
txinfo <- .augmented_transcript_info(gr, a4.tinfo)
write.csv(txinfo, tx.fn, row.names = FALSE)
system(paste("gzip", tx.fn))
a4.ginfo <- archs4_feature_info("gene", s, augmented = FALSE)
ginfo <- .augmented_gene_info(gr, a4.ginfo)
write.csv(ginfo, gn.fn, row.names = FALSE)
system(paste("gzip", gn.fn))
message("")
}
}
#' Helper function that creats augmented transcript information file.
#'
#' @noRd
#'
#' @param gr a GRanges object from .load_gtf, we assume this is enembl gtfs
#' @param a4.tinfo The "raw" information that the ARCHS4 data stores for its
#' transcripts, ie. the output from
#' @return a decorated `a4.tinfo` tibble of augmented transcript information.
.augmented_transcript_info <- function(gr, a4.tinfo) {
requireNamespace("GenomicRanges", quietly = TRUE)
dfa <- GenomicRanges::as.data.frame(gr)
tx.info.all <- filter(dfa, type == "transcript")
tx.info <- select(tx.info.all, transcript_id, transcript_biotype,
gene_name, gene_id, gene_biotype, gene_source,
seqnames, start, end, strand)
if (any(duplicated(tx.info$transcript_id))) {
stop("Duplicated transcript ids found in gtf file")
}
# After the full_join, we will find which transcripts are exclusive to the
# gtf and the archs4 transcript file (hopefully none in the later case)
check.me <- tx.info %>%
full_join(a4.tinfo, by = c("transcript_id" = "ensembl_id"))
stopifnot(sum(duplicated(check.me$transcript_id)) == 0L)
gtf.only <- filter(check.me, is.na(h5idx))
message(
nrow(gtf.only), " transcripts in gtf only ",
sprintf("(%.2f%% of transcripts from gtf)",
nrow(gtf.only) / nrow(tx.info)))
a4.only <- filter(check.me, !is.na(h5idx) & is.na(gene_biotype))
message(
nrow(a4.only), " transcripts are only in arch4 tx file ",
sprintf("(%.2f%% of transcripts in ARCHS4 hdf5 file)",
nrow(a4.only) / nrow(a4.tinfo)))
out <- a4.tinfo %>%
left_join(tx.info, by = c("ensembl_id" = "transcript_id")) %>%
rename(symbol = gene_name)
stopifnot(
nrow(out) == nrow(a4.tinfo),
all(out$h5idx == a4.tinfo$h5idx))
out
}
#' Helper function to create augmented gene-level metadata.
#'
#' @noRd
#'
#' @param x a GRanges object from .load_gtf, we assume this is enembl gtfs
#' @param a4.ginfo result from
#' `archs4_feature_info("gene", ..., augmented = FALSE)`
#' @return a decorated `a4.ginfo` table.
.augmented_gene_info <- function(gr, a4.ginfo) {
requireNamespace("GenomicRanges", quietly = TRUE)
has.ens <- "ens_id" %in% colnames(a4.ginfo)
# Not trusting the ensembl_id's for now, due to this:
# https://github.com/MaayanLab/archs4/issues/3
has.ens <- FALSE
if (has.ens) {
a4.ginfo$join <- a4.ginfo$ens_id
} else {
a4.ginfo$join <- a4.ginfo$a4name
}
dup.join <- a4.ginfo$join[duplicated(a4.ginfo$join)]
a4.ginfo.all <- a4.ginfo
a4.ginfo <- filter(a4.ginfo, !join %in% dup.join)
# a4.ginfo <- mutate(a4.ginfo, join = tolower(a4name))
stopifnot(sum(duplicated(a4.ginfo$join)) == 0L)
gr.exons <- gr[gr$type == "exon"]
grl <- GenomicRanges::split(gr.exons, gr.exons$gene_id)
rgrl <- GenomicRanges::reduce(grl)
gwidths <- sum(GenomicRanges::width(grl))
gwidths <- tibble(gene_id = names(gwidths), length = unname(gwidths))
dfa <- as.data.frame(gr)
txstats <- dfa %>%
filter(type == "transcript") %>%
filter(!is.na(as.integer(seqnames)) | seqnames %in% c("X", "Y", "M")) %>%
droplevels %>%
group_by(gene_id) %>%
summarize(ntx = n())
# Grooming a symbol-based, gene-level metadata table
g.info.all <- filter(dfa, type == "gene")
g.info <- g.info.all %>%
mutate(seqnames = as.character(seqnames)) %>%
filter(!is.na(as.integer(seqnames)) | seqnames %in% c("X", "Y", "M")) %>%
droplevels %>%
select(gene_id, gene_name, gene_biotype, seqnames, start, end, strand)
if (has.ens) {
g.info$join <- g.info$gene_id
} else {
g.info$join <- g.info$gene_name
}
if (any(duplicated(g.info.all$gene_id))) {
stop("Duplicated gene ids")
}
# There are likely duplicated "join" (symbol) rows in g.info, so deal with
# that first -- this is a hack. Let's order the info table so that the
# annotation entries (ie. gene_biotype) we care about appear first and pick
# the first by join
if (FALSE) {
dup.symbols <- g.info$join[duplicated(g.info$join)]
wtf <- g.info %>%
filter(join %in% dup.symbols) %>%
arrange(join, gene_id)
View(wtf)
}
bt.order <- c('protein_coding', 'miRNA', 'lincRNA', 'rRNA', 'snoRNA',
'scRNA', 'scaRNA', 'sRNA')
bt.order <- c(bt.order, setdiff(g.info$gene_biotype, bt.order))
g.info$gene_biotype <- factor(g.info$gene_biotype, bt.order)
g.info.u <- g.info %>%
arrange(join, gene_biotype) %>%
filter(!duplicated(join))
# After the full_join, we will find which genes are exclusive to the
# gtf and the archs4 gene file (hopefully none in the later case)
check.me <- full_join(g.info.u, a4.ginfo, by = "join")
stopifnot(sum(duplicated(check.me$join)) == 0L)
gtf.only <- filter(check.me, is.na(h5idx))
message(
nrow(gtf.only), " genes in gtf only ",
sprintf("(%.2f%% of genes from gtf)",
nrow(gtf.only) / nrow(g.info.u)))
a4.only <- filter(check.me, !is.na(h5idx) & is.na(gene_biotype))
message(
nrow(a4.only), " genes are only in arch4 tx file ",
sprintf("(%.2f%% of genes in ARCHS4 hdf5 file)",
nrow(a4.only) / nrow(a4.ginfo)))
out <- a4.ginfo %>%
left_join(g.info.u, by = "join") %>%
select(-join) %>%
rename(symbol = gene_name)
out <- left_join(out, gwidths, by = "gene_id")
stopifnot(
nrow(out) == nrow(a4.ginfo),
all(out$h5idx == a4.ginfo$h5idx))
out
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.