set.seed(0) knitr::opts_chunk$set( out.extra = 'style="display:block; margin: auto"', fig.align = "center", fig.path = "esse/", collapse = TRUE, comment = "#>", dev = "png")
es_pkgs <- c("Biobase", "arrayQualityMetrics", "dplyr", "knitr", "mclust", "pQTLtools", "rgl", "scatterplot3d") se_pkgs <- c("BiocGenerics","GenomicRanges","IRanges","MsCoreUtils","S4Vectors","SummarizedExperiment") pkgs <- c(es_pkgs,se_pkgs) for (p in pkgs) if (length(grep(paste("^package:", p, "$", sep=""), search())) == 0) { if (!requireNamespace(p)) warning(paste0("This vignette needs package `", p, "'; please install")) } invisible(suppressMessages(lapply(pkgs, require, character.only = TRUE)))
We start with Bioconductor/Biobase's ExpressionSet example and finish with a real application.
dataDirectory <- system.file("extdata", package="Biobase") exprsFile <- file.path(dataDirectory, "exprsData.txt") exprs <- as.matrix(read.table(exprsFile, header=TRUE, sep="\t", row.names=1, as.is=TRUE)) pDataFile <- file.path(dataDirectory, "pData.txt") pData <- read.table(pDataFile, row.names=1, header=TRUE, sep="\t") all(rownames(pData)==colnames(exprs)) metadata <- data.frame(labelDescription=c("Patient gender", "Case/control status", "Tumor progress on XYZ scale"), row.names=c("gender", "type", "score")) phenoData <- Biobase::AnnotatedDataFrame(data=pData, varMetadata=metadata) experimentData <- Biobase::MIAME(name="Pierre Fermat", lab="Francis Galton Lab", contact="pfermat@lab.not.exist", title="Smoking-Cancer Experiment", abstract="An example ExpressionSet", url="www.lab.not.exist", other=list(notes="Created from text files")) exampleSet <- pQTLtools::make_ExpressionSet(exprs,phenoData,experimentData=experimentData, annotation="hgu95av2") data(sample.ExpressionSet, package="Biobase") identical(exampleSet,sample.ExpressionSet)
The great benefit is to use the object directly as a data.frame.
lm.result <- Biobase::esApply(exampleSet,1,function(x) lm(score~gender+x)) beta.x <- unlist(lapply(lapply(lm.result,coef),"[",3)) beta.x[1] lm(score~gender+AFFX.MurIL2_at,data=exampleSet)
We wish to examine the distribution of each feature via histogram, scatter and boxplot.
One could resort to esApply()
for its simplicity as before
invisible(Biobase::esApply(exampleSet[1:2],1,function(x) {par(mfrow=c(3,1));boxplot(x);hist(x);plot(x)} ))
but it is nicer to add feature name in the title.
par(mfrow=c(1,3)) f <- Biobase::featureNames(exampleSet[1:2]) invisible(sapply(f,function(x) { d <- t(Biobase::exprs(exampleSet[x])) fn <- Biobase::featureNames(exampleSet[x]) hist(d,main="",xlab=fn); plot(d, ylab=fn); boxplot(d,ylab=fn) } ) )
where the expression set is indexed using feature name(s).
This illustrates one mechanism,
list_outliers <- function(es, method="upperquartile") arrayQualityMetrics::outliers(exprs(es),method=method) for (method in c("KS","sum","upperquartile")) { ZWK_outliers <- list_outliers(protein_ZWK,method=method) print(ZWK_outliers@statistic[ZWK_outliers@which]) }
We employ model-based clustering absed on principal compoents to see potential groupings in the data,
pca <- prcomp(na.omit(t(Biobase::exprs(exampleSet))), rank=10, scale=TRUE) pc1pc2pc3 <- with(pca,x)[,1:3] mc <- mclust::Mclust(pc1pc2pc3,G=3) with(mc, { cols <- c("blue","red", "purple") s3d <- scatterplot3d::scatterplot3d(with(pca,x[,c(2,1,3)]), color=cols[classification], pch=16, type="h", main="Plot of the PC1, PC2 and PC3") s3d.coords <- s3d$xyz.convert(with(pca,x[,c(2,1,3)])) text(s3d.coords$x, s3d.coords$y, cex = 1.2, col = cols[classification], labels = row.names(pc1pc2pc3), pos = 4) legend("right", legend=levels(as.factor(classification)), col=cols[classification], pch=16) rgl::open3d(width = 500, height = 500) rgl::plot3d(with(pca,x[,c(2,1,3)]),cex=1.2,col=cols[classification],size=5) rgl::text3d(with(pca,x[,c(2,1,3)]),cex=1.2,col=cols[classification],texts=row.names(pc1pc2pc3)) htmlwidgets::saveWidget(rgl::rglwidget(), file = "mcpca3d.html") })
An interactive version is also available,
htmltools::tags$iframe(src = "mcpca3d.html", width = "100%", height = "450px")
Suppose we wish use log2 for those greater than zero but set those negative values to be missing.
log2.na <- function(x) log2(ifelse(x>0, x, NA)) Biobase::exprs(exampleSet) <- log2.na(Biobase::exprs(exampleSet))
We generate a lod.max
~ U[0,1] variable to experiment
Biobase::fData(exampleSet) Biobase::fData(exampleSet)$lod.max <- apply(Biobase::exprs(exampleSet),1,quantile,runif(nrow(exampleSet))) lod <- pQTLtools::get.prop.below.LLOD(exampleSet) x <- dplyr::arrange(Biobase::fData(lod),desc(pc.belowLOD.new)) knitr::kable(head(lod)) plot(x[,2], main="Random quantile cutoff", ylab="<lod%")
The quantity has been shown to have a big impact on protein abundance and therefore pQTL detection as is shown with a real example.
rm(list=ls()) dir <- "~/rds/post_qc_data/interval/phenotype/olink_proteomics/post-qc/" eset <- readRDS(paste0(dir,"eset.inf1.flag.out.outlier.in.rds")) x <- pQTLtools::get.prop.below.LLOD(eset) annot <- Biobase::fData(x) annot$MissDataProp <- as.numeric(gsub("\\%$", "", annot$MissDataProp)) plot(annot$MissDataProp, annot$pc.belowLOD.new, xlab="% <LLOD in Original", ylab="% <LLOD in post QC dataset", pch=19) INF <- Sys.getenv("INF") np <- read.table(paste(INF, "work", "INF1.merge.nosig", sep="/"), header=FALSE, col.names = c("prot", "uniprot")) kable(np, caption="Proteins with no pQTL") annot$pQTL <- rep(NA, nrow(annot)) no.pQTL.ind <- which(annot$uniprot.id %in% np$uniprot) annot$pQTL[no.pQTL.ind] <- "red" annot$pQTL[-no.pQTL.ind] <- "blue" annot <- annot[order(annot$pc.belowLOD.new, decreasing = TRUE),] annot <- annot[-grep("^BDNF$", annot$ID),] saveRDS(annot,file=file.path("~","pQTLtools","tests","annot.RDS"))
annot <- readRDS(file.path(find.package("pQTLtools"),"tests","annot.RDS")) %>% dplyr::left_join(pQTLdata::inf1[c("prot","target.short","alt_name")],by=c("ID"="prot")) %>% dplyr::mutate(prot=if_else(is.na(alt_name),target.short,alt_name),order=1:n()) %>% dplyr::arrange(desc(order)) xtick <- seq(1, nrow(annot)) attach(annot) par(mar=c(10,5,1,1)) plot(100-pc.belowLOD.new,cex=2,pch=19,col=pQTL,xaxt="n",xlab="",ylab="",cex.axis=0.8) text(66,16,"IL-17C",offset=0,pos=2,cex=1.5,font=2,srt=0) arrows(67,16,71,16,lwd=2) axis(1, at=xtick, labels=prot, lwd.tick=0.5, lwd=0, las=2, hadj=1, cex.axis=0.8) mtext("% samples above LLOD",side=2,line=2.5,cex=1.2) mtext("Ordered protein",side=1,line=6.5,cex=1.2,font=1) legend(x=1,y=25,c("without pQTL","with pQTL"),box.lwd=0,cex=2,col=c("red","blue"),pch=19) detach(annot) options(width=120) knitr::kable(annot,caption="Summary statistics",row.names=FALSE)
Web: https://bioconductor.org/packages/release/workflows/html/maEndToEnd.html.
Examples can be found on PCA, heatmap, normalisation, linear models, and enrichment analysis from this Bioconductor package.
This is a more modern construct. Based on the documentation example, ranged summarized experiment (rse) and imputation are shown below.
set.seed(123) nrows <- 20 ncols <- 4 counts <- matrix(runif(nrows * ncols, 1, 1e4), nrows) missing_indices <- sample(length(counts), size = 5, replace = FALSE) counts[missing_indices] <- NA rowRanges <- GenomicRanges::GRanges(rep(c("chr1", "chr2"), c(1, 3) * nrows / 4), IRanges::IRanges(floor(runif(nrows, 1e5, 1e6)), width=ncols), strand=sample(c("+", "-"), nrows, TRUE), feature_id=sprintf("ID%03d", 1:nrows)) colData <- S4Vectors::DataFrame(Treatment=rep(c("ChIP", "Input"), ncols/2), row.names=LETTERS[1:ncols]) rse <- SummarizedExperiment(assays=S4Vectors::SimpleList(counts=counts), rowRanges=rowRanges, colData=colData) assay(rse) imputed <- MsCoreUtils::impute_knn(as.matrix(SummarizedExperiment::assay(rse)),2) imputed_counts <- MsCoreUtils::impute_RF(as.matrix(assay(rse)),2) imputed-imputed_counts SummarizedExperiment::assays(rse) <- S4Vectors::SimpleList(counts=imputed_counts) SummarizedExperiment::assay(rse) SummarizedExperiment::assays(rse) <- S4Vectors::endoapply(SummarizedExperiment::assays(rse), asinh) SummarizedExperiment::assay(rse) SummarizedExperiment::rowRanges(rse) SummarizedExperiment::rowData(rse) SummarizedExperiment::colData(rse) rse[, rse$Treatment == "ChIP"] # the same ranges but different samples: rse1 <- rse rse2 <- rse1[, 1:3] colnames(rse2) <- letters[1:ncol(rse2)] cmb1 <- cbind(rse1, rse2) dim(cmb1) dimnames(cmb1) # the same samples but different ranges: rse1 <- rse rse2 <- rse1[1:nrows, ] rownames(rse2) <- letters[1:nrow(rse2)] cmb2 <- rbind(rse1, rse2) dim(cmb2) dimnames(cmb2) se0 <- as(rse, "SummarizedExperiment") se0 as(se0, "RangedSummarizedExperiment") # rowRanges(a SummarizedExperiment object) --> RangedSummarizedExperiment object: se <- se0 rowRanges(se) <- rowRanges se stopifnot(identical(SummarizedExperiment::assays(se0), SummarizedExperiment::assays(rse))) stopifnot(identical(dim(se0), dim(rse))) stopifnot(identical(dimnames(se0), dimnames(rse))) stopifnot(identical(SummarizedExperiment::rowData(se0), SummarizedExperiment::rowData(rse))) stopifnot(identical(SummarizedExperiment::colData(se0), SummarizedExperiment::colData(rse)))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.