#' Prepare genesize control network
#'
#' \code{prepare_genesize_control_network} takes a gene list and finds
#' semi-randomly selected gene lists which are matched for gene length and
#' GC content.
#'
#' @param reps Number of gene lists to sample.
#' @param localHub If working offline, add argument localHub=TRUE to work
#' with a local, non-updated hub; It will only have resources available that
#' have previously been downloaded. If offline, Please also see BiocManager
#' vignette section on offline use to ensure proper functionality.
#' @inheritParams bootstrap_enrichment_test
#' @inheritParams orthogene::create_background
#'
#' @return A list containing three data frames:
#' \itemize{
#' \item \code{hits}: Array of HGNC symbols containing the hit genes.
#' May be slightly reduced if gene length / GC content could not be found
#' for all genes.
#' \item \code{list_network}: The control gene lists as a data frame of HGNC
#' symbols
#' }
#'
#' @keywords internal
#' @importFrom ewceData ensembl_transcript_lengths_GCcontent
#' @importFrom stats aggregate quantile
#' @importFrom orthogene create_background
prepare_genesize_control_network <- function(hits,
bg = NULL,
reps = 10000,
no_cores = 1,
sctSpecies = NULL,
genelistSpecies = NULL,
verbose = TRUE,
localHub = FALSE) {
target <- Gene.Symbol <- NULL
## Currently, can only convert to human genes
## since the gene metadata is only for human genes.
output_species <- "human"
#### Check species ####
species <- check_species(
genelistSpecies = genelistSpecies,
sctSpecies = sctSpecies,
verbose = verbose
)
genelistSpecies <- species$genelistSpecies
sctSpecies <- species$sctSpecies
#### Check background ####
#if statement added down to issue with orthogene:
#https://github.com/neurogenomics/orthogene/issues/22
if(is.null(bg) | !all(list(sctSpecies,genelistSpecies)==output_species)){
bg <- orthogene::create_background(
species1 = sctSpecies,
species2 = genelistSpecies,
output_species = output_species,
method = "gprofiler",
bg = bg,
verbose = verbose
)
}else{
bg <- unique(bg)
}
#### Prepare to query all_hgnc_wtEnsembl ####
combined_human_genes <- unique(c(hits, bg))
#### First get all Ensembl gene IDs from the human genes ####
## Old method (can only get human genes)
# all_hgnc_wtEnsembl <- ewceData::all_hgnc_wtEnsembl()
## New method (can get any species' genes)
# Must use gprofiler, as it's the only method with Ensembl IDs
all_hgnc_wtEnsembl <- orthogene::all_genes(
species = output_species,
method = "gprofiler",
ensure_filter_nas = FALSE,
verbose = verbose
) |>
dplyr::rename(
ensembl_gene_id = target,
gene_symbol = Gene.Symbol
)
messager(
formatC(length(unique(all_hgnc_wtEnsembl$ensembl_gene_id)),
big.mark = ","
),
output_species, "Ensembl IDs", "and",
formatC(length(unique(all_hgnc_wtEnsembl$gene_symbol)),
big.mark = ","
),
output_species, "Gene Symbols",
"imported.",
v = verbose
)
hum_ens <- all_hgnc_wtEnsembl[
all_hgnc_wtEnsembl$gene_symbol %in% combined_human_genes,
]
#### Check the gene lists are proper gene symbols ####
if (sum(hits %in% hum_ens$gene_symbol) == 0) {
err_msg <- paste0(
"ERROR: No hits recognised as human HGNC symbols.",
" Perhaps the gene list was wrongly provided as MGI",
" symbols? prepare_genesize_control_network only",
" accepts HGNC symbols."
)
stop(err_msg)
}
if (sum(bg %in% hum_ens$gene_symbol) == 0) {
err_msg2 <- paste0(
"ERROR: No bg genes recognised as human HGNC symbols.",
" Perhaps the gene list was wrongly provided as MGI",
" symbols? prepare_genesize_control_network only",
" accepts HGNC symbols."
)
stop(err_msg2)
}
### GET THE TRANSCRIPT LENGTHS AND GC CONTENT FROM ewceData ####
ensembl_transcript_lengths_GCcontent <-
ewceData::ensembl_transcript_lengths_GCcontent(localHub = localHub)
all_lengths <-
ensembl_transcript_lengths_GCcontent[
ensembl_transcript_lengths_GCcontent$ensembl_gene_id %in%
hum_ens$ensembl_gene_id,
]
all_lengths <- all_lengths[!is.na(all_lengths$transcript_length), ]
all_lens <- data.table::merge.data.table(
x = data.table::data.table(all_lengths),
y = data.table::data.table(hum_ens),
by = "ensembl_gene_id"
)
#### TAKE THE MEAN TRANSCRIPT LENGTH & GC-CONTENT FOR EACH GENE ####
transcript_lengths <- stats::aggregate(
x = all_lens$transcript_length,
by = list(all_lens$gene_symbol),
FUN = mean
)
percentage_gene_gc_content <- stats::aggregate(
x = all_lens$percentage_gene_gc_content,
by = list(all_lens$gene_symbol),
FUN = mean
)
data_byGene <- cbind(
transcript_lengths,
percentage_gene_gc_content[, 2]
)
colnames(data_byGene) <- c(
"HGNC.symbol", "transcript_lengths",
"percentage_gene_gc_content"
)
data_byGene <- data_byGene[data_byGene$HGNC.symbol != "", ]
#### Drop genes not in 1:1 ortholog-derived bg ###
# Previously used less flexible ewceData::mouse_to_human_homologs()
data_byGene2 <- data_byGene[
data_byGene$HGNC.symbol %in% combined_human_genes,
]
#### Assign each gene to gene length/GC content quadrant ####
data_byGene2 <- create_quadrants(data_byGene2 = data_byGene2)
## FOR EACH 'DISEASE LIST' GENERATE SET
## OF 10000 QUADRANT MATCHED GENE LISTS
## -Get new set of sctSpecies hits,
## containing only those within data_byGene2
hits_NEW <- data_byGene2[
data_byGene2$HGNC.symbol %in% hits,
"HGNC.symbol"
]
list_genes1d <- hits[hits %in% data_byGene$HGNC.symbol]
## Get matrix with 10000 randomly sampled genes from the same quadrant
## as the gene list.
list_network <- create_list_network(
data_byGene2 = data_byGene2,
hits_NEW = hits_NEW,
reps = reps,
no_cores = no_cores
)
messager("Controlled bootstrapping network generated.", v = verbose)
return(list(
hits = hits_NEW,
list_network = list_network
))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.