In elsayed-lab/hpgltools: A pile of (hopefully) useful R functions

## These are the options I tend to favor
library("hpgltools")
## tt <- devtools::load_all("~/hpgltools")
knitr::opts_knit$set(progress = TRUE,
                     verbose = TRUE,
                     width = 90,
                     echo = TRUE)
knitr::opts_chunk$set(error = TRUE,
                      fig.width = 8,
                      fig.height = 8,
                      dpi = 96)
old_options <- options(digits = 4,
                       stringsAsFactors = FALSE,
                       knitr.duplicate.label = "allow")
ggplot2::theme_set(ggplot2::theme_bw(base_size = 10))
set.seed(1)
rmd_file <- "c-03_fission_differential_expression.Rmd"

Example hpgltool usage with a real data set (fission)

This document aims to provide further examples in how to use the hpgltools.

Note to self, the header has rmarkdown::pdf_document instead of html_document or html_vignette because it gets some bullcrap error 'margins too large'...

Setting up

Here are the commands I invoke to get ready to play with new data, including everything required to install hpgltools, the software it uses, and the fission data.

library(hpgltools)
tt <- sm(library(fission))
tt <- data(fission)

Annotation collection

Later on in this, I will do some ontology shenanigans. But I can grab some annotations from biomart now.

pombe_annotations <- load_biomart_annotations(
    host = "fungi.ensembl.org",
    trymart = "fungal_mart",
    trydataset = "spombe_eg_gene",
    gene_requests = c("pombase_transcript", "ensembl_gene_id", "ensembl_transcript_id",
                      "hgnc_symbol", "description", "gene_biotype"),
    species = "spombe", overwrite = TRUE)
pombe_mart <- pombe_annotations[["mart"]]
annotations <- pombe_annotations[["annotation"]]
rownames(annotations) <- make.names(gsub(pattern = "\\.\\d+$",
                                         replacement = "",
                                         x = rownames(annotations)), unique = TRUE)

Data import

All the work I do in Dr. El-Sayed's lab makes some pretty hard assumptions about how data is stored. As a result, to use the fission data set I will do a little bit of shenanigans to match it to the expected format. Now that I have played a little with fission, I think its format is quite nice and am likely to have my experiment class instead be a SummarizedExperiment.

## Extract the meta data from the fission dataset
meta <- as.data.frame(fission@colData)
## Make conditions and batches
meta[["condition"]] <- paste(meta[["strain"]], meta[["minute"]], sep = ".")
meta[["batch"]] <- meta[["replicate"]]
meta[["sample.id"]] <- rownames(meta)
## Grab the count data
fission_data <- fission@assays[["data"]][["counts"]]
## This will make an experiment superclass called 'expt' and it contains
## an ExpressionSet along with any arbitrary additional information one might want to include.
## Along the way it writes a Rdata file which is by default called 'expt.Rdata'
fission_expt <- create_expt(metadata = meta,
                            count_dataframe = fission_data,
                            gene_info = annotations)

Some simple differential expression analyses

Travis wisely imposes a limit on the amount of time for building vignettes. My tools by default will attempt all possible pairwise comparisons, which takes a long time. Therefore I am going to take a subset of the data and limit these comparisons to that.

fun_data <- subset_expt(fission_expt,
                        subset = "condition=='wt.120'|condition=='wt.30'")
fun_filt <- normalize_expt(fun_data, filter = "simple")
fun_norm <- sm(normalize_expt(fun_filt, batch = "limma", norm = "quant",
                              transform = "log2", convert = "cpm"))

Try using limma first

limma_comparison <- sm(limma_pairwise(fun_data))
names(limma_comparison[["all_tables"]])
summary(limma_comparison[["all_tables"]][["wt30_vs_wt120"]])
scatter_wt_mut <- extract_coefficient_scatter(limma_comparison, type = "limma",
                                              x = "wt30", y = "wt120")
scatter_wt_mut[["scatter"]]
scatter_wt_mut[["both_histogram"]][["plot"]] +
  ggplot2::scale_y_continuous(limits = c(0, 0.20))
ma_wt_mut <- extract_de_plots(limma_comparison, type = "limma")
ma_wt_mut[["ma"]][["plot"]]
ma_wt_mut[["volcano"]][["plot"]]

Then DESeq2

deseq_comparison <- sm(deseq2_pairwise(fun_data))
summary(deseq_comparison[["all_tables"]][["wt30_vs_wt120"]])
scatter_wt_mut <- extract_coefficient_scatter(deseq_comparison, type = "deseq",
                                              x = "wt30", y = "wt120", gvis_filename = NULL)
scatter_wt_mut[["scatter"]]
plots_wt_mut <- extract_de_plots(deseq_comparison, type = "deseq")
plots_wt_mut[["ma"]][["plot"]]
plots_wt_mut[["volcano"]][["plot"]]

EdgeR

edger_comparison <- sm(edger_pairwise(fun_data, model_batch = TRUE))
plots_wt_mut <- extract_de_plots(edger_comparison, type = "edger")
scatter_wt_mut <- extract_coefficient_scatter(edger_comparison, type = "edger",
                                              x = "wt30", y = "wt120", gvis_filename = NULL)
scatter_wt_mut[["scatter"]]
plots_wt_mut[["ma"]][["plot"]]
plots_wt_mut[["volcano"]][["plot"]]

EBSeq

```{r simple_edger2 ebseq_comparison <- sm(ebseq_pairwise(fun_data)) head(ebseq_comparison$all_tables[[1]])

## My stupid basic comparison

```r
basic_comparison <- sm(basic_pairwise(fun_data))
summary(basic_comparison$all_tables$wt30_vs_wt120)
scatter_wt_mut <- extract_coefficient_scatter(basic_comparison, type = "basic",
                                              x = "wt30", y = "wt120")
scatter_wt_mut[["scatter"]]
plots_wt_mut <- extract_de_plots(basic_comparison, type = "basic")
plots_wt_mut[["ma"]][["plot"]]
plots_wt_mut[["volcano"]][["plot"]]

Combine them all

all_comparisons <- sm(all_pairwise(fun_data, model_batch = TRUE, parallel = FALSE))
all_combined <- sm(combine_de_tables(all_comparisons, excel = FALSE))
head(all_combined$data[[1]], n = 3)
sig_genes <- sm(extract_significant_genes(all_combined, excel = FALSE))
head(sig_genes$limma$ups[[1]], n = 3)

## Here we see that edger and deseq agree the least:
all_comparisons[["comparison"]][["comp"]]

## And here we can look at the set of 'significant' genes according to various tools:
yeast_sig <- sm(extract_significant_genes(all_combined, excel = FALSE))
yeast_barplots <- sm(significant_barplots(combined = all_combined))
yeast_barplots[["limma"]]
yeast_barplots[["edger"]]
yeast_barplots[["deseq"]]

Setting up

Since I didn't acquire this data in a 'normal' way, I am going to post-generate a gff file which may be used by clusterprofiler, topgo, and gostats.

Therefore, I am going to make use of TxDb to make the requisite gff file.

limma_results <- limma_comparison[["all_tables"]]
## The set of comparisons performed
names(limma_results)
table <- limma_results[["wt30_vs_wt120"]]
dim(table)
gene_names <- rownames(table)

updown_genes <- get_sig_genes(table, p = 0.05, lfc = 0.4, p_column = "P.Value")
tt <- please_install("GenomicFeatures")
tt <- please_install("biomaRt")
available_marts <- biomaRt::listMarts(host = "fungi.ensembl.org")
available_marts
ensembl_mart <- biomaRt::useMart("fungi_mart", host = "fungi.ensembl.org")
available_datasets <- biomaRt::listDatasets(ensembl_mart)
pombe_hit <- grep(pattern = "pombe", x = available_datasets[["description"]])
pombe_name <- available_datasets[pombe_hit, "dataset"]
pombe_mart <- biomaRt::useDataset(pombe_name, mart = ensembl_mart)

pombe_goids <- biomaRt::getBM(attributes = c("pombase_transcript", "go_id"),
                              values = gene_names, mart = pombe_mart)
colnames(pombe_goids) <- c("ID", "GO")

Setting up with hpgltools

The above worked, it provided a table of ID and ontology. It was however a bit fraught. Here is another way.

## In theory, the above should work with a single function call:
pombe_goids_simple <- load_biomart_go(species = "spombe", overwrite = TRUE,
                                      dl_rows = c("pombase_transcript", "go_id"),
                                      host = "fungi.ensembl.org")
head(pombe_goids_simple[["go"]])
head(pombe_goids)

## This used to work, but does so no longer and I do not know why.
## pombe <- sm(GenomicFeatures::makeTxDbFromBiomart(biomart = "fungal_mart",
##                                                  dataset = "spombe_eg_gene",
##                                                  host = "fungi.ensembl.org"))

## I bet I can get all this information from ensembl now.
## This was found at the bottom of: https://www.biostars.org/p/232005/
link <- "ftp://ftp.ensemblgenomes.org/pub/release-34/fungi/gff3/schizosaccharomyces_pombe/Schizosaccharomyces_pombe.ASM294v2.34.gff3.gz"
pombe <- GenomicFeatures::makeTxDbFromGFF(link, format = "gff3", taxonomyId = 4896,
                                          organism = "Schizosaccharomyces pombe")

pombe_transcripts <- as.data.frame(GenomicFeatures::transcriptsBy(pombe))
lengths <- pombe_transcripts[, c("group_name","width")]
colnames(lengths) <- c("ID","width")
## Something useful I didn't notice before:
## makeTranscriptDbFromGFF()  ## From GenomicFeatures, much like my own gff2df()
gff_from_txdb <- GenomicFeatures::asGFF(pombe)
## why is GeneID: getting prefixed to the IDs!?
gff_from_txdb$ID <- gsub(x = gff_from_txdb$ID, pattern = "GeneID:", replacement = "")
written_gff <- rtracklayer::export.gff3(gff_from_txdb, con = "pombe.gff")

GOSeq test

Since I changed the annotation source of this data, I need to rethink my strategy for matching up the IDs of the genes and their go/lengths.

summary(updown_genes)
test_genes <- updown_genes[["down_genes"]]
lengths[["ID"]] <- gsub(x = lengths[["ID"]], pattern = "^gene:", replacement = "")
lengths[["ID"]] <- gsub(x = lengths[["ID"]], pattern = "\\.1$", replacement = "")
pombe_goids[["ID"]] <- gsub(x = pombe_goids[["ID"]], pattern = "\\.1$", replacement = "")
goseq_result <- simple_goseq(sig_genes = test_genes, go_db = pombe_goids,
                                length_db = lengths)
head(goseq_result[["all_data"]])
goseq_result[["pvalue_plots"]][["mfp_plot_over"]]
goseq_result[["pvalue_plots"]][["bpp_plot_over"]]

test_genes <- updown_genes[["up_genes"]]
goseq_result <- simple_goseq(sig_genes = test_genes, go_db = pombe_goids,
                             length_db = lengths)
head(goseq_result[["all_data"]])
goseq_result[["pvalue_plots"]][["mfp_plot_over"]]
goseq_result[["pvalue_plots"]][["bpp_plot_over"]]

ClusterProfiler test

clusterProfiler really prefers an orgdb instance to use, which is probably smart, as they are pretty nice. Sadly, there is no pre-defined orgdb for pombe...

## holy crap makeOrgPackageFromNCBI is slow, no slower than some of mine, so who am I to complain.
if (! "org.Spombe.eg.db" %in% installed.packages()) {
  orgdb <- AnnotationForge::makeOrgPackageFromNCBI(
                                version = "0.1", author = "atb <abelew@gmail.com>",
                                maintainer = "atb <abelew@gmail.com>", tax_id = "4896",
                                genus = "Schizosaccharomyces", species = "pombe")
  ## This created the directory 'org.spombe.eg.db'
  devtools::install_local("org.Spombe.eg.db")
}
library(org.Spombe.eg.db)

## Don't forget to remove the terminal .1 from the gene names...
## If you do forget this, it will fail for no easily visible reason until you remember
## this and get really mad at yourself.
##rownames(test_genes) <- gsub(pattern = ".1$", replacement = "", x = rownames(test_genes))
##pombe_goids[["ID"]] <- gsub(pattern = ".1$", replacement = "", x = pombe_goids[["ID"]])
cp_result <- simple_clusterprofiler(sig_genes = test_genes, do_david = FALSE, do_gsea = FALSE,
                                    de_table = all_combined[["data"]][[1]],
                                    orgdb = org.Spombe.eg.db, orgdb_to = "ALIAS")
cp_result[["pvalue_plots"]][["ego_all_mf"]]
## Yay bar plots!

## Get rid of those stupid terminal .1s.
##rownames(test_genes) <- gsub(pattern = ".1$", replacement = "", x = rownames(test_genes))
##pombe_goids[["ID"]] <- gsub(pattern = ".1$", replacement = "", x = pombe_goids[["ID"]])
tp_result <- simple_topgo(sig_genes = test_genes, go_db = pombe_goids, pval_column = "limma_adjp")

tp_result[["pvalue_plots"]][["mfp_plot_over"]]
tp_result[["pvalue_plots"]][["bpp_plot_over"]]

## Get rid of those stupid terminal .1s.
##rownames(test_genes) <- gsub(pattern = ".1$", replacement = "", x = rownames(test_genes))
pombe_goids[["ID"]] <- gsub(pattern = ".1$", replacement = "", x = pombe_goids[["ID"]])
## universe_merge is the column in the final data frame when.
## gff_type is the field in the gff file providing the id, this may be redundant with
## universe merge, that is something to check on...
gst_result <- sm(simple_gostats(sig_genes = test_genes, go_db = pombe_goids, universe_merge = "id",
                                gff_type = "gene",
                                gff = "pombe.gff", pval_column = "limma_adjp"))
gst_result[["pvalue_plots"]][["mfp_plot_over"]]
gst_result[["pvalue_plots"]][["bpp_plot_over"]]

pander::pander(sessionInfo())

elsayed-lab/hpgltools documentation built on May 9, 2024, 5:02 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

elsayed-lab/hpgltools
A pile of (hopefully) useful R functions

In elsayed-lab/hpgltools: A pile of (hopefully) useful R functions

Example hpgltool usage with a real data set (fission)

Setting up

Annotation collection

Data import

Some simple differential expression analyses

Try using limma first

Then DESeq2

EdgeR

EBSeq

Combine them all

Setting up

Setting up with hpgltools

GOSeq test

ClusterProfiler test

R Package Documentation

Browse R Packages

We want your feedback!

elsayed-lab/hpgltools A pile of (hopefully) useful R functions

In elsayed-lab/hpgltools: A pile of (hopefully) useful R functions

Example hpgltool usage with a real data set (fission)

Setting up

Annotation collection

Data import

Some simple differential expression analyses

Try using limma first

Then DESeq2

EdgeR

EBSeq

Combine them all

Setting up

Setting up with hpgltools

GOSeq test

ClusterProfiler test

R Package Documentation

Browse R Packages

We want your feedback!

elsayed-lab/hpgltools
A pile of (hopefully) useful R functions