## These are the options I tend to favor
library(hpgltools)
library(hpgldata)
## tt <- devtools::load_all("~/hpgltools")
knitr::opts_knit$set(progress = TRUE,
                     verbose = TRUE,
                     width = 90,
                     echo = TRUE)
knitr::opts_chunk$set(error = TRUE,
                      fig.width = 8,
                      fig.height = 8,
                      dpi = 96)
old_options <- options(digits = 4,
                       stringsAsFactors = FALSE,
                       knitr.duplicate.label = "allow")
ggplot2::theme_set(ggplot2::theme_bw(base_size = 10))
set.seed(1)
rmd_file <- "c-03_fission_differential_expression.Rmd"

Example hpgltool usage with a real data set (fission)

This document aims to provide further examples in how to use the hpgltools in order to perform a set of differential expression analyses.

Setting up

Here are the commands I invoke to get ready to play with new data, including everything required to install hpgltools, the software it uses, and the fission data.

library(hpgltools)
tt <- sm(library(fission))
tt <- data(fission)

Annotation collection

Later on in this, I will do some ontology shenanigans. But I can grab some annotations from biomart now.

pombe_annotations <- load_biomart_annotations(
    host = "fungi.ensembl.org",
    trymart = "fungal_mart",
    trydataset = "spombe_eg_gene",
    gene_requests = c("pombase_transcript", "ensembl_gene_id", "ensembl_transcript_id",
                      "hgnc_symbol", "description", "gene_biotype"),
    species = "spombe", overwrite = TRUE)
pombe_mart <- pombe_annotations[["mart"]]
annotations <- pombe_annotations[["annotation"]]
rownames(annotations) <- make.names(gsub(pattern = "\\.\\d+$",
                                         replacement = "",
                                         x = rownames(annotations)), unique = TRUE)

Data import

All the work I do in Dr. El-Sayed's lab makes some pretty hard assumptions about how data is stored. As a result, to use the fission data set I will do a little bit of shenanigans to match it to the expected format. Now that I have played a little with fission, I think its format is quite nice and am likely to have my experiment class instead be a SummarizedExperiment.

## Extract the meta data from the fission dataset
meta <- as.data.frame(fission@colData)
## Make conditions and batches
meta[["condition"]] <- paste(meta$strain, meta$minute, sep = ".")
meta[["batch"]] <- meta[["replicate"]]
meta[["sample.id"]] <- rownames(meta)
## Grab the count data
fission_data <- fission@assays[["data"]][["counts"]]
## This will make an experiment superclass called 'expt' and it contains
## an ExpressionSet along with any arbitrary additional information one might want to include.
## Along the way it writes a Rdata file which is by default called 'expt.Rdata'
fission_expt <- create_expt(metadata = meta,
                            count_dataframe = fission_data,
                            gene_info = annotations)

Some simple differential expression analyses

My tools by default attempt all possible pairwise comparisons, this can take a long time.

fun_data <- subset_expt(fission_expt,
                        subset = "condition=='wt.120'|condition=='wt.30'")
fun_filt <- normalize_expt(fun_data, filter = "simple")
fun_norm <- sm(normalize_expt(fun_filt, batch = "limma", norm = "quant",
                              transform = "log2", convert = "cpm"))

Try using limma first

The following block will perform all pairwise comparisons of the fission dataset using limma. If one were to add the model_batch argument, the statistical model could include a known batch effect or estimates from SVA. If one wishes to use combat modified data, then one must use a normalize function to modify the counts with combat.

limma_comparison <- limma_pairwise(fun_filt)
names(limma_comparison[["all_tables"]])
summary(limma_comparison[["all_tables"]][["wt30_vs_wt120"]])
scatter_wt_mut <- extract_coefficient_scatter(limma_comparison, type = "limma",
                                              x = "wt30", y = "wt120")
scatter_wt_mut[["scatter"]]
scatter_wt_mut[["both_histogram"]][["plot"]] +
  ggplot2::scale_y_continuous(limits = c(0, 0.20))
ma_wt_mut <- extract_de_plots(limma_comparison, type = "limma")
ma_wt_mut[["ma"]][["plot"]]
ma_wt_mut[["volcano"]][["plot"]]

Then DESeq2

deseq2_pairwise() invokes DESeq2 in as an identical fashion as possible to how limma was run previously. Note, I commented out the extract_de_plots() call because I changed it so that, for the moment, it only works when using combine_de_tables(). This is something I intend to change soon.

deseq_comparison <- deseq2_pairwise(fun_filt)
summary(deseq_comparison[["all_tables"]][["wt30_vs_wt120"]])
scatter_wt_mut <- extract_coefficient_scatter(deseq_comparison, type = "deseq",
                                              x = "wt30", y = "wt120")
scatter_wt_mut[["scatter"]]
#plots_wt_mut <- extract_de_plots(deseq_comparison, type = "deseq")
#plots_wt_mut[["ma"]][["plot"]]
#plots_wt_mut[["volcano"]][["plot"]]

EdgeR

edger_pairwise(), as I suspect you already guessed, does the same process using EdgeR. This time I made explicit the model_batch parameter. It is also worth noting that I am feeding these programs the filtered expression data. This is not required, but will generally help the fidelity of the result. One may also filter the data at runtime via the 'filter' argument to *_pairwise().

edger_comparison <- edger_pairwise(fun_filt, model_batch = TRUE)
#plots_wt_mut <- extract_de_plots(edger_comparison, type = "edger")
scatter_wt_mut <- extract_coefficient_scatter(edger_comparison, type = "edger",
                                              x = "wt30", y = "wt120")
scatter_wt_mut[["scatter"]]
#plots_wt_mut[["ma"]][["plot"]]
#plots_wt_mut[["volcano"]][["plot"]]

EBSeq

EBSeq is a bit of an outlier method. It is a purely Bayesian method and makes some peculiar decisions with respect to how it handles the inputs. In addition, it is much slower for larger datasets and is therefore automatically disabled when there are more than a few contrasts in a dataset.

ebseq_comparison <- ebseq_pairwise(fun_filt)
head(ebseq_comparison$all_tables[[1]])

My basic comparison

basic_pairwise() is also an outlier method. It explicitly does not perform any statistical operations on the data beyond a log2(cpm()) subtraction of the data. Thus, if another method agrees with this, the various normalizations, modelling, etc of that method had no effect on the result.

basic_comparison <- basic_pairwise(fun_filt)
summary(basic_comparison$all_tables$wt30_vs_wt120)
scatter_wt_mut <- extract_coefficient_scatter(basic_comparison, type = "basic",
                                              x = "wt30", y = "wt120")
scatter_wt_mut[["scatter"]]
#plots_wt_mut <- extract_de_plots(basic_comparison, type = "basic")
#plots_wt_mut[["ma"]][["plot"]]
#plots_wt_mut[["volcano"]][["plot"]]

NoiSeq

This is a new entrant to the hpgltools. It has its own way of handling batches/surrogates. As a result, it ignores the model_batch parameter. Noiseq also prints a lot of random stuff to the user that (to my eyes) has no utility. I am therefore wrapping it in the sm() function to silence it.

noiseq_comparison <- sm(noiseq_pairwise(fun_filt))
summary(noiseq_comparison$all_tables$wt30_vs_wt120)

Dream

This is another recent addition. The authors of variancePartition showed in their last paper how one might include the variance estimates produced in order to supplement a statistical model. In their paper, they used limma to implement these ideas; I assume that the way voom models variance is most appropriate for this method and so in my implementation did not change that aspect of it. dream_pairwise also ignores the model_batch parameter.

Combine them all

all_comparisons <- all_pairwise(fun_data, model_batch = TRUE)
all_comparisons
all_combined <- combine_de_tables(all_comparisons, excel = "excel/wt30_vs_wt120.xlsx")
all_combined

save(file = "de_table.rda", list = "all_combined")

Shiny shenanigans

I am not sure how this will render in a rmarkdown.

## This is a neat interactive shiny widget which I forgot I wrote
slide_de_threshold(all_combined)

Back to regularly scheduled programming

## Here we see that edger and deseq agree the least:
all_comparisons[["comparison"]][["comp"]]

## And here we can look at the set of 'significant' genes according to various tools:
yeast_sig <- sm(extract_significant_genes(all_combined, excel = FALSE))
yeast_barplots <- sm(significant_barplots(combined = all_combined))
yeast_barplots[["limma"]]
yeast_barplots[["edger"]]
yeast_barplots[["deseq"]]

Setting up

Since I didn't acquire this data in a 'normal' way, I am going to post-generate a gff file which may be used by clusterprofiler, topgo, and gostats.

Therefore, I am going to make use of TxDb to make the requisite gff file.

limma_results <- limma_comparison[["all_tables"]]
## The set of comparisons performed
names(limma_results)
table <- limma_results[["wt30_vs_wt120"]]
dim(table)
gene_names <- rownames(table)

updown_genes <- get_sig_genes(table, p = 0.05, lfc = 0.4, p_column = "P.Value")
tt <- please_install("GenomicFeatures")
tt <- please_install("biomaRt")
available_marts <- biomaRt::listMarts(host = "fungi.ensembl.org")
available_marts
ensembl_mart <- biomaRt::useMart("fungi_mart", host = "fungi.ensembl.org")
available_datasets <- biomaRt::listDatasets(ensembl_mart)
pombe_hit <- grep(pattern = "pombe", x = available_datasets[["description"]])
pombe_name <- available_datasets[pombe_hit, "dataset"]
pombe_mart <- biomaRt::useDataset(pombe_name, mart = ensembl_mart)

pombe_goids <- biomaRt::getBM(attributes = c("pombase_transcript", "go_id"),
                              values = gene_names, mart = pombe_mart)
colnames(pombe_goids) <- c("ID", "GO")

Setting up with hpgltools

The above worked, it provided a table of ID and ontology. It was however a bit fraught. Here is another way.

## In theory, the above should work with a single function call:
pombe_goids_simple <- load_biomart_go(species = "spombe", overwrite = TRUE,
                                      dl_rows = c("pombase_transcript", "go_id"),
                                      host = "fungi.ensembl.org")
head(pombe_goids_simple[["go"]])
head(pombe_goids)

## This used to work, but does so no longer and I do not know why.
## pombe <- sm(GenomicFeatures::makeTxDbFromBiomart(biomart = "fungal_mart",
##                                                  dataset = "spombe_eg_gene",
##                                                  host = "fungi.ensembl.org"))

## I bet I can get all this information from ensembl now.
## This was found at the bottom of: https://www.biostars.org/p/232005/
link <- "ftp://ftp.ensemblgenomes.org/pub/release-34/fungi/gff3/schizosaccharomyces_pombe/Schizosaccharomyces_pombe.ASM294v2.34.gff3.gz"
pombe <- GenomicFeatures::makeTxDbFromGFF(link, format = "gff3", taxonomyId = 4896,
                                          organism = "Schizosaccharomyces pombe")

pombe_transcripts <- as.data.frame(GenomicFeatures::transcriptsBy(pombe))
lengths <- pombe_transcripts[, c("group_name","width")]
colnames(lengths) <- c("ID","width")
lengths[["ID"]] <- gsub(x = lengths[["ID"]], pattern = "^gene:", replacement = "")
## Something useful I didn't notice before:
## makeTranscriptDbFromGFF()  ## From GenomicFeatures, much like my own gff2df()
gff_from_txdb <- GenomicFeatures::asGFF(pombe)
## why is GeneID: getting prefixed to the IDs!?
gff_from_txdb$ID <- gsub(x = gff_from_txdb$ID, pattern = "GeneID:", replacement = "")
written_gff <- rtracklayer::export.gff3(gff_from_txdb, con = "pombe.gff")

GOSeq test

summary(updown_genes)
test_genes <- updown_genes[["down_genes"]]
##rownames(test_genes) <- paste0(rownames(test_genes), ".1")
##lengths[["ID"]] <- paste0(lengths[["ID"]], ".1")
goseq_result <- simple_goseq(sig_genes = test_genes, go_db = pombe_goids,
                                length_db = lengths)
head(goseq_result[["all_data"]])
goseq_result[["pvalue_plots"]][["mfp_plot_over"]]
goseq_result[["pvalue_plots"]][["bpp_plot_over"]]

test_genes <- updown_genes[["up_genes"]]
##rownames(test_genes) <- paste0(rownames(test_genes), ".1")
goseq_result <- simple_goseq(sig_genes = test_genes, go_db = pombe_goids,
                                length_db = lengths)
head(goseq_result[["all_data"]])
goseq_result[["pvalue_plots"]][["mfp_plot_over"]]
goseq_result[["pvalue_plots"]][["bpp_plot_over"]]

ClusterProfiler test

clusterProfiler really prefers an orgdb instance to use, which is probably smart, as they are pretty nice. Sadly, there is no pre-defined orgdb for pombe...

## holy crap makeOrgPackageFromNCBI is slow, no slower than some of mine, so who am I to complain.
if (! "org.Spombe.eg.db" %in% installed.packages()) {
  orgdb <- AnnotationForge::makeOrgPackageFromNCBI(
                                version = "0.1", author = "atb <abelew@gmail.com>",
                                maintainer = "atb <abelew@gmail.com>", tax_id = "4896",
                                genus = "Schizosaccharomyces", species = "pombe")
  ## This created the directory 'org.spombe.eg.db'
  devtools::install_local("org.Spombe.eg.db")
}
library(org.Spombe.eg.db)
## Don't forget to remove the terminal .1 from the gene names...
## If you do forget this, it will fail for no easily visible reason until you remember
## this and get really mad at yourself.
rownames(test_genes) <- gsub(pattern = ".1$", replacement = "", x = rownames(test_genes))
pombe_goids[["ID"]] <- gsub(pattern = ".1$", replacement = "", x = pombe_goids[["ID"]])
cp_result <- simple_clusterprofiler(sig_genes = test_genes, do_david = FALSE, do_gsea = FALSE,
                                    de_table = all_combined$data[[1]],
                                    orgdb = org.Spombe.eg.db, orgdb_to = "ALIAS")
cp_result[["pvalue_plots"]][["ego_all_mf"]]
## Yay bar plots!
## Get rid of those stupid terminal .1s.
#rownames(test_genes) <- gsub(pattern = ".1$", replacement = "", x = rownames(test_genes))
#pombe_goids[["ID"]] <- gsub(pattern = ".1$", replacement = "", x = pombe_goids[["ID"]])
tp_result <- simple_topgo(sig_genes = test_genes,
                          go_db = pombe_goids, pval_column = "limma_adjp")
tp_result
## Get rid of those stupid terminal .1s.
##rownames(test_genes) <- gsub(pattern = ".1$", replacement = "", x = rownames(test_genes))
##pombe_goids[["ID"]] <- gsub(pattern = ".1$", replacement = "", x = pombe_goids[["ID"]])
## universe_merge is the column in the final data frame when.
## gff_type is the field in the gff file providing the id, this may be redundant with
## universe merge, that is something to check on...
gst_result <- simple_gostats(sig_genes = test_genes, go_db = pombe_goids, universe_merge = "id",
                             gff_type = "gene",
                             gff = "pombe.gff", pval_column = "limma_adjp")
gst_result
pander::pander(sessionInfo())


elsayed-lab/hpgltools documentation built on May 9, 2024, 5:02 a.m.