knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.path = "bioconductor/"
)

This article collects notes on Bioconductor packages, made available here to faciliate their use and extensions.

pkgs <- c("AnnotationDbi", "Biostrings", "ComplexHeatmap", "DESeq2", "EnsDb.Hsapiens.v86",
          "FlowSorted.DLPFC.450k", "GeneNet", "GenomicFeatures", "IlluminaHumanMethylation450kmanifest",
          "OUTRIDER","RColorBrewer", "RMariaDB", "Rgraphviz", "S4Vectors", "SummarizedExperiment",
          "TxDb.Hsapiens.UCSC.hg38.knownGene", "bladderbatch", "clusterProfiler",
          "corpcor", "ensembldb", "fdrtool", "graph", "graphite", "heatmaply",
          "minfi", "org.Hs.eg.db", "plyr", "quantro", "recount3", "sva")
for (p in pkgs) if (length(grep(paste("^package:", p, "$", sep=""), search())) == 0) {
    if (!requireNamespace(p)) warning(paste0("This vignette needs package `", p, "'; please install"))
}
invisible(suppressMessages(lapply(pkgs, require, character.only = TRUE)))

liftover

See inst/turbomanin the source, https://github.com/jinghuazhao/pQTLtools/tree/master/inst/turboman, or turboman/ directory in the installed package.

Normalisation

ComBat

This is the documentation example, based on Bioconductor 3.14.

data(bladderdata, package="bladderbatch")
edat <- bladderEset[1:50]

pheno <- Biobase::pData(edat)
batch <- pheno$batch
table(batch)
quantro::matboxplot(edat,batch,cex.axis=0.6,notch=TRUE,pch=19,ylab="Expression")
quantro::matdensity(edat,batch,xlab=" ",ylab="density")
legend("topleft",legend=1:5,col=1:5,lty=1)

# 1. parametric adjustment
combat_edata1 <- sva::ComBat(dat=edat, batch=batch, par.prior=TRUE, prior.plots=TRUE)

# 2. non-parametric adjustment, mean-only version
combat_edata2 <- sva::ComBat(dat=edat, batch=batch, par.prior=FALSE, mean.only=TRUE)

# 3. reference-batch version, with covariates
mod <- model.matrix(~as.factor(cancer), data=pheno)
combat_edata3 <- sva::ComBat(dat=edat, batch=batch, mod=mod, par.prior=TRUE, ref.batch=3, prior.plots=TRUE)

quantro

This is also adapted from the package vignette but with FlowSorted.DLPFC.450k in place of FlowSorted.

data(FlowSorted.DLPFC.450k,package="FlowSorted.DLPFC.450k")
p <- getBeta(FlowSorted.DLPFC.450k,offset=100)
pd <- Biobase::pData(FlowSorted.DLPFC.450k)
quantro::matboxplot(p, groupFactor = pd$CellType, xaxt = "n", main = "Beta Values", pch=19)
quantro::matdensity(p, groupFactor = pd$CellType, xlab = " ", ylab = "density",
                    main = "Beta Values", brewer.n = 8, brewer.name = "Dark2")
legend('top', c("NeuN_neg", "NeuN_pos"), col = c(1, 2), lty = 1, lwd = 3)
qtest <- quantro::quantro(object = p, groupFactor = pd$CellType)
if (FALSE)
{
  doParallel::registerDoParallel(cores=10)
  qtestPerm <- quantro::quantro(p, groupFactor = pd$CellType, B = 1000)
  quantro::quantroPlot(qtestPerm)
}

Outlier detection in RNA-Seq

The following is adapted from package OUTRIDER,

ctsFile <- system.file('extdata', 'KremerNBaderSmall.tsv', package='OUTRIDER')
ctsTable <- read.table(ctsFile, check.names=FALSE)
ods <- OUTRIDER::OutriderDataSet(countData=ctsTable)
ods <- OUTRIDER::filterExpression(ods, minCounts=TRUE, filterGenes=TRUE)
ods <- OUTRIDER::OUTRIDER(ods)
res <- OUTRIDER::results(ods)
knitr::kable(res,caption="A check list of outliers")
OUTRIDER::plotQQ(ods, res["geneID"],global=TRUE)

Differential expression

ex <- DESeq2::makeExampleDESeqDataSet(m=4)
dds <- DESeq2::DESeq(ex)
res <- DESeq2::results(dds, contrast=c("condition","B","A"))
rld <- DESeq2::rlogTransformation(ex, blind=TRUE)
dat <- DESeq2::plotPCA(rld, intgroup=c("condition"),returnData=TRUE)
percentVar <- round(100 * attr(dat,"percentVar"))
ggplot2::ggplot(dat, ggplot2::aes(PC1, PC2, color=condition, shape=condition)) +
ggplot2::geom_point(size=3) +
ggplot2::xlab(paste0("PC1:",percentVar[1],"% variance")) +
ggplot2::ylab(paste0("PC2:",percentVar[2],"% variance"))
ex$condition <- relevel(ex$condition, ref="B")
dds2 <- DESeq2::DESeq(dds)
res <- DESeq2::results(dds2)
knitr::kable(head(as.data.frame(res)))

See the package in action from a snakemake workflow @koster21.

Gene co-expression and network analysis

A simple network is furnished with the GeneNet documentation example,

## A random network with 40 nodes 
# it contains 780=40*39/2 edges of which 5 percent (=39) are non-zero
true.pcor <- GeneNet::ggm.simulate.pcor(40)

# A data set with 40 observations
m.sim <- GeneNet::ggm.simulate.data(40, true.pcor)

# A simple estimate of partial correlations
estimated.pcor <- corpcor::cor2pcor( cor(m.sim) )

# A comparison of estimated and true values
sum((true.pcor-estimated.pcor)^2)

# A slightly better estimate ...
estimated.pcor.2 <- GeneNet::ggm.estimate.pcor(m.sim)
sum((true.pcor-estimated.pcor.2)^2)

## ecoli data 
data(ecoli, package="GeneNet")

# partial correlation matrix 
inferred.pcor <- GeneNet::ggm.estimate.pcor(ecoli)

# p-values, q-values and posterior probabilities for each potential edge 
test.results <- GeneNet::network.test.edges(inferred.pcor)

# best 20 edges (strongest correlation)
test.results[1:20,]

# network containing edges with prob > 0.9 (i.e. local fdr < 0.1)
net <- GeneNet::extract.network(test.results, cutoff.ggm=0.9)
net

# significant based on FDR cutoff Q=0.05?
num.significant.1 <- sum(test.results$qval <= 0.05)
test.results[1:num.significant.1,]

# significant based on "local fdr" cutoff (prob > 0.9)?
num.significant.2 <- sum(test.results$prob > 0.9)
test.results[test.results$prob > 0.9,]

# parameters of the mixture distribution used to compute p-values etc.
c <- fdrtool::fdrtool(corpcor::sm2vec(inferred.pcor), statistic="correlation")
c$param

## A random network with 20 nodes and 10 percent (=19) edges
true.pcor <- GeneNet::ggm.simulate.pcor(20, 0.1)

# convert to edge list
test.results <- GeneNet::ggm.list.edges(true.pcor)[1:19,]
nlab <- LETTERS[1:20]

# graphviz
# network.make.dot(filename="test.dot", test.results, nlab, main = "A graph")
# system("fdp -T svg -o test.svg test.dot")

# Rgraphviz
gr <- GeneNet::network.make.graph( test.results, nlab)
gr
num.nodes(gr)
edge.info(gr)
gr2 <- GeneNet::network.make.graph( test.results, nlab, drop.singles=TRUE)
gr2
GeneNet::num.nodes(gr2)
GeneNet::edge.info(gr2)

# plot network
plot(gr, "fdp")
plot(gr2, "fdp")

A side-by-side heatmaps

set.seed(123454321)
m <- matrix(runif(2500),50)
r <- cor(m)
g <- as.matrix(r>=0.7)+0
f1 <- ComplexHeatmap::Heatmap(r)
f2 <- ComplexHeatmap::Heatmap(g)
f <- f1+f2
ComplexHeatmap::draw(f)

df <- heatmaply::normalize(mtcars)
hm <- heatmaply::heatmaply(df,k_col=5,k_row=5,
                           colors = grDevices::colorRampPalette(RColorBrewer::brewer.pal(3, "RdBu"))(256))
htmlwidgets::saveWidget(hm,file="heatmaply.html")
htmltools::tags$iframe(src = "heatmaply.html", width = "100%", height = "550px")

so we have heatmaply.html and a module analysis with WGCNA,

pwr <- c(1:10, seq(from=12, to=30, by=2))
sft <- WGCNA::pickSoftThreshold(dat, powerVector=pwr, verbose=5)
ADJ <- abs(cor(dat, method="pearson", use="pairwise.complete.obs"))^6
dissADJ <- 1-ADJ
dissTOM <- WGCNA::TOMdist(ADJ)
TOM <- WGCNA::TOMsimilarityFromExpr(dat)
Tree <- hclust(as.dist(1-TOM), method="average")
for(j in pwr)
{
  pam_name <- paste0("pam",j)
  assign(pam_name, cluster::pam(as.dist(dissADJ),j))
  pamTOM_name <- paste0("pamTOM",j)
  assign(pamTOM_name,cluster::pam(as.dist(dissTOM),j))
  tc <- table(get(pam_name)$clustering,get(pamTOM_name)$clustering)
  print(tc)
  print(diag(tc))
}
colorStaticTOM <- as.character(WGCNA::cutreeStaticColor(Tree,cutHeight=.99,minSize=5))
colorDynamicTOM <- WGCNA::labels2colors(cutreeDynamic(Tree,method="tree",minClusterSize=5))
Colors <- data.frame(pamTOM6$clustering,colorStaticTOM,colorDynamicTOM)
WGCNA::plotDendroAndColors(Tree, Colors, dendroLabels=FALSE, hang=0.03, addGuide=TRUE, guideHang=0.05)
meg <- WGCNA::moduleEigengenes(dat, color=1:ncol(dat), softPower=6)

Meta-data

This section is based on package recount3.

hs <- recount3::available_projects()
dim(subset(hs,file_source=="gtex"))
recount3::annotation_options("human")
blood_rse <- recount3::create_rse(subset(hs,project=="BLOOD"))
S4Vectors::metadata(blood_rse)
SummarizedExperiment::rowRanges(blood_rse)
colnames(SummarizedExperiment::colData(blood_rse))[1:20]
recount3::expand_sra_attributes(blood_rse)

Pathway and enrichment analysis

reactome <- graphite::pathways("hsapiens", "reactome")
kegg <- graphite::pathways("hsapiens","kegg")
pharmgkb <- graphite::pathways("hsapiens","pharmgkb")
nodes(kegg[[21]])
kegg_t2g <- ldply(lapply(kegg, nodes), data.frame)
names(kegg_t2g) <- c("gs_name", "gene_symbol")
VEGF <- subset(kegg_t2g,gs_name=="VEGF signaling pathway")[[2]]
eKEGG <- clusterProfiler::enricher(gene=VEGF, TERM2GENE = kegg_t2g,
                                   universe=,
                                   pAdjustMethod = "BH",
                                   pvalueCutoff = 0.1, qvalueCutoff = 0.05,
                                   minGSSize = 10, maxGSSize = 500)

Peptide sequence

Here is an example for PROC_HUMAN, which is handled by the Biostrings package,

fasta_file_path <- 'https://rest.uniprot.org/uniprotkb/P04070.fasta'
fasta_sequences <- Biostrings::readAAStringSet(fasta_file_path, format = "fasta")
AA_sequence <- fasta_sequences[[1]]
cat("Sequence:", toString(AA_sequence), "\n")
iso_442688365 <- 'TDGEGALSEPSATVTIEELAAPPPPVLMHHGESSQVLHPGNK'
match_position <- regexpr(iso_442688365, AA_sequence)
match_position
mp <- matchPattern(iso_442688365,AA_sequence)
mp
protein <- "PROC"
cistrans <- read.csv(paste0("~/pQTLtools/tests","/",protein,".cis.vs.trans"))
load(paste0("~/pQTLtools/tests/",protein,".rda"))
pQTLtools::peptideAssociationPlot(protein,cistrans)

Transcript databases

An overview of annotation is available @carlson16.

options(width=200)

# columns(org.Hs.eg.db)
# keyref <- keys(org.Hs.eg.db, keytype="ENTREZID")
# symbol_uniprot <- select(org.Hs.eg.db,keys=keyref,columns = c("SYMBOL","UNIPROT"))
# subset(symbol_uniprot,SYMBOL=="MC4R")

x <- EnsDb.Hsapiens.v86
ensembldb::listColumns(x, "protein", skip.keys=TRUE)
ensembldb::listGenebiotypes(x)
ensembldb::listTxbiotypes(x)
ensembldb::listTables(x)
ensembldb::metadata(x)
ensembldb::organism(x)
ensembldb::returnFilterColumns(x)
ensembldb::seqinfo(x)
ensembldb::seqlevels(x)
ensembldb::updateEnsDb(x)

ensembldb::genes(x, columns=c("gene_name"),
             filter=list(SeqNameFilter("X"), AnnotationFilter::GeneBiotypeFilter("protein_coding")))
ensembldb ::transcripts(x, columns=ensembldb::listColumns(x, "tx"),
                        filter = AnnotationFilter::AnnotationFilterList(), order.type = "asc", return.type = "GRanges")

txdbEnsemblGRCh38 <- GenomicFeatures::makeTxDbFromEnsembl(organism="Homo sapiens", release=98)
txdb <- as.list(txdbEnsemblGRCh38)
lapply(txdb,head)

txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene

# liverExprs <- quantifyExpressionsFromBWs(txdb = txdb,BWfiles=,experimentalDesign=)

Bionconductor forum

Web: https://support.bioconductor.org/

Bioconductor/CRAN packages

Package | Description --------|------------ Bioconductor | AnnotationDbi | AnnotationDb objects and their progeny, methods etc. Biobase | Base functions for Bioconductor Biostrings | Efficient manipulation of biological strings clusterProfiler | Functional profiles for genes and gene clusters ComplexHeatmap | Make complex heatmaps DESSeq2 | Differential gene expression analysis based on the negative binomial distribution edgeR | Empirical analysis of digital gene expression EnsDb.Hsapiens.v86 | Exposes an annotation databases generated from Ensembl ensembldb | Retrieve annotation data from an Ensembl based package FlowSorted.DLPFC.450k | Illumina HumanMethylation data on sorted frontal cortex cell populations graphite | GRAPH Interaction from pathway topological environment IlluminaHumanMethylation450kmanifest | Annotation for Illumina's 450k methylation arrays INSPEcT | Quantification of the intronic and exonic gene features and the post-transcriptional regulation analysis org.Hs.eg.db | Conversion of Entrez ID -- gene symbols OUTRIDER | OUTlier in RNA-Seq fInDER Pi | Priority index, leveraging genetic evidence to prioritise drug targets at the gene and pathway level quantro | A test for when to use quantile normalisation recount3 | Interface to uniformly processed RNA-seq data Rgraphiz | Interfaces R with the AT&T graphviz library for plotting R graph objects from the graph package sva | Surrogate Variable Analysis TxDb.Hsapiens.UCSC.hg38.knownGene | Annotation of the human genome CRAN | doParallel | Foreach Parallel Adaptor for the 'parallel' Package GeneNet | Modeling and Inferring Gene Networks ggplot2 | Data Visualisations Using the grammar of graphics heatmaply | Interactive Cluster Heat Maps Using plotly and ggplot2 pheatmap | results visualisation plyr | Splitting, applying and combining data RColorBrewer | ColorBrewer Palettes WGCNA | Weighted correlation network analysis

References



jinghuazhao/pQTLtools documentation built on May 18, 2024, 12:14 p.m.