set.seed(0) knitr::opts_chunk$set( out.extra = 'style="display:block; margin: auto"', fig.align = "center", fig.height = 8, fig.path = "bioconductor/", fig.width = 8, collapse = TRUE, comment = "#>", dev = "CairoPNG") Sys.setlocale("LC_CTYPE", "en_US.UTF-8")
This article collects notes on Bioconductor packages, made available here to faciliate their use and extensions.
pkgs <- c("AnnotationDbi", "AnnotationFilter", "ComplexHeatmap", "DESeq2", "EnsDb.Hsapiens.v86", "FlowSorted.DLPFC.450k", "GeneNet", "GenomicFeatures", "IlluminaHumanMethylation450kmanifest", "OUTRIDER","RColorBrewer", "RMariaDB", "Rgraphviz", "S4Vectors", "SummarizedExperiment", "TxDb.Hsapiens.UCSC.hg38.knownGene", "bladderbatch", "clusterProfiler", "corpcor", "doParallel", "ensembldb", "fdrtool", "graph", "graphite", "heatmaply", "minfi", "org.Hs.eg.db", "plyr", "quantro", "recount3", "sva") es_pkgs <- c("Biobase", "arrayQualityMetrics", "dplyr", "knitr", "mclust", "pQTLtools", "rgl", "scatterplot3d") se_pkgs <- c("BiocGenerics", "GenomicRanges", "IRanges", "MsCoreUtils", "SummarizedExperiment", "impute") sp_pkgs <- c("Biostrings", "CAMERA", "MSnbase", "MSstats", "Spectra", "mzR", "protViz", "rawrr") pkgs <- c(pkgs, es_pkgs,se_pkgs,sp_pkgs) for (p in pkgs) if (length(grep(paste("^package:", p, "$", sep=""), search())) == 0) { if (!requireNamespace(p)) warning(paste0("This vignette needs package `", p, "'; please install")) } invisible(suppressMessages(lapply(pkgs, require, character.only = TRUE)))
See inst/turboman
in the source, https://github.com/jinghuazhao/pQTLtools/tree/master/inst/turboman, or turboman/
directory in the installed package.
We start with Bioconductor/Biobase's ExpressionSet example and finish with a real application.
dataDirectory <- system.file("extdata", package="Biobase") exprsFile <- file.path(dataDirectory, "exprsData.txt") exprs <- as.matrix(read.table(exprsFile, header=TRUE, sep="\t", row.names=1, as.is=TRUE)) pDataFile <- file.path(dataDirectory, "pData.txt") pData <- read.table(pDataFile, row.names=1, header=TRUE, sep="\t") all(rownames(pData)==colnames(exprs)) metadata <- data.frame(labelDescription=c("Patient gender", "Case/control status", "Tumor progress on XYZ scale"), row.names=c("gender", "type", "score")) phenoData <- Biobase::AnnotatedDataFrame(data=pData, varMetadata=metadata) experimentData <- Biobase::MIAME(name="Pierre Fermat", lab="Francis Galton Lab", contact="pfermat@lab.not.exist", title="Smoking-Cancer Experiment", abstract="An example ExpressionSet", url="www.lab.not.exist", other=list(notes="Created from text files")) exampleSet <- pQTLtools::make_ExpressionSet(exprs,phenoData,experimentData=experimentData, annotation="hgu95av2") data(sample.ExpressionSet, package="Biobase") identical(exampleSet,sample.ExpressionSet)
The great benefit is to use the object directly as a data.frame.
lm.result <- Biobase::esApply(exampleSet,1,function(x) lm(score~gender+x)) beta.x <- unlist(lapply(lapply(lm.result,coef),"[",3)) beta.x[1] lm(score~gender+AFFX.MurIL2_at,data=exampleSet)
We wish to examine the distribution of each feature via histogram, scatter and boxplot.
One could resort to esApply()
for its simplicity as before
invisible(Biobase::esApply(exampleSet[1:2],1,function(x) {par(mfrow=c(3,1));boxplot(x);hist(x);plot(x)} ))
but it is nicer to add feature name in the title.
par(mfrow=c(1,3)) f <- Biobase::featureNames(exampleSet[1:2]) invisible(sapply(f,function(x) { d <- t(Biobase::exprs(exampleSet[x])) fn <- Biobase::featureNames(exampleSet[x]) hist(d,main="",xlab=fn); plot(d, ylab=fn); boxplot(d,ylab=fn) } ) )
where the expression set is indexed using feature name(s).
This illustrates one mechanism,
list_outliers <- function(es, method="upperquartile") arrayQualityMetrics::outliers(exprs(es),method=method) for (method in c("KS","sum","upperquartile")) { ZWK_outliers <- list_outliers(protein_ZWK,method=method) print(ZWK_outliers@statistic[ZWK_outliers@which]) }
We employ model-based clustering absed on principal compoents to see potential groupings in the data,
pca <- prcomp(na.omit(t(Biobase::exprs(exampleSet))), rank=10, scale=TRUE) pc1pc2pc3 <- with(pca,x)[,1:3] mc <- mclust::Mclust(pc1pc2pc3,G=3) with(mc, { cols <- c("blue","red", "purple") s3d <- scatterplot3d::scatterplot3d(with(pca,x[,c(2,1,3)]), color=cols[classification], pch=16, type="h", main="Plot of the PC1, PC2 and PC3") s3d.coords <- s3d$xyz.convert(with(pca,x[,c(2,1,3)])) text(s3d.coords$x, s3d.coords$y, cex = 1.2, col = cols[classification], labels = row.names(pc1pc2pc3), pos = 4) legend("right", legend=levels(as.factor(classification)), col=cols[classification], pch=16) rgl::open3d(width = 500, height = 500) rgl::plot3d(with(pca,x[,c(2,1,3)]),cex=1.2,col=cols[classification],size=5) rgl::text3d(with(pca,x[,c(2,1,3)]),cex=1.2,col=cols[classification],texts=row.names(pc1pc2pc3)) htmlwidgets::saveWidget(rgl::rglwidget(), file = "mcpca3d.html") })
An interactive version is also available,
htmltools::tags$iframe(src = "mcpca3d.html", width = "100%", height = "650px")
Suppose we wish use log2 for those greater than zero but set those negative values to be missing.
log2.na <- function(x) log2(ifelse(x>0, x, NA)) Biobase::exprs(exampleSet) <- log2.na(Biobase::exprs(exampleSet))
We generate a lod.max
~ U[0,1] variable to experiment
Biobase::fData(exampleSet) Biobase::fData(exampleSet)$lod.max <- apply(Biobase::exprs(exampleSet),1,quantile,runif(nrow(exampleSet))) lod <- pQTLtools::get.prop.below.LLOD(exampleSet) x <- dplyr::arrange(Biobase::fData(lod),desc(pc.belowLOD.new)) knitr::kable(head(lod)) plot(x[,2], main="Random quantile cutoff", ylab="<lod%")
The quantity has been shown to have a big impact on protein abundance and therefore pQTL detection as is shown with a real example.
rm(list=ls()) dir <- "~/rds/post_qc_data/interval/phenotype/olink_proteomics/post-qc/" eset <- readRDS(paste0(dir,"eset.inf1.flag.out.outlier.in.rds")) x <- pQTLtools::get.prop.below.LLOD(eset) annot <- Biobase::fData(x) annot$MissDataProp <- as.numeric(gsub("\\%$", "", annot$MissDataProp)) plot(annot$MissDataProp, annot$pc.belowLOD.new, xlab="% <LLOD in Original", ylab="% <LLOD in post QC dataset", pch=19) INF <- Sys.getenv("INF") np <- read.table(paste(INF, "work", "INF1.merge.nosig", sep="/"), header=FALSE, col.names = c("prot", "uniprot")) kable(np, caption="Proteins with no pQTL") annot$pQTL <- rep(NA, nrow(annot)) no.pQTL.ind <- which(annot$uniprot.id %in% np$uniprot) annot$pQTL[no.pQTL.ind] <- "red" annot$pQTL[-no.pQTL.ind] <- "blue" annot <- annot[order(annot$pc.belowLOD.new, decreasing = TRUE),] annot <- annot[-grep("^BDNF$", annot$ID),] saveRDS(annot,file=file.path("~","pQTLtools","tests","annot.RDS"))
annot <- readRDS(file.path(find.package("pQTLtools"),"tests","annot.RDS")) %>% dplyr::left_join(pQTLdata::inf1[c("prot","target.short","alt_name")],by=c("ID"="prot")) %>% dplyr::mutate(prot=if_else(is.na(alt_name),target.short,alt_name),order=1:n()) %>% dplyr::arrange(desc(order)) xtick <- seq(1, nrow(annot)) attach(annot) par(mar=c(10,5,1,1)) plot(100-pc.belowLOD.new,cex=2,pch=19,col=pQTL,xaxt="n",xlab="",ylab="",cex.axis=0.8) text(66,16,"IL-17C",offset=0,pos=2,cex=1.5,font=2,srt=0) arrows(67,16,71,16,lwd=2) axis(1, at=xtick, labels=prot, lwd.tick=0.5, lwd=0, las=2, hadj=1, cex.axis=0.8) mtext("% samples above LLOD",side=2,line=2.5,cex=1.2) mtext("Ordered protein",side=1,line=6.5,cex=1.2,font=1) legend(x=1,y=25,c("without pQTL","with pQTL"),box.lwd=0,cex=2,col=c("red","blue"),pch=19) detach(annot) knitr::kable(annot,caption="Summary statistics",row.names=FALSE)
Web: https://bioconductor.org/packages/release/workflows/html/maEndToEnd.html.
Examples can be found on PCA, heatmap, normalisation, linear models, and enrichment analysis from this Bioconductor package.
This is a more modern construct. Based on the documentation example, ranged summarized experiment (rse) and imputation are shown below.
set.seed(123) nrows <- 20 ncols <- 4 counts <- matrix(runif(nrows * ncols, 1, 1e4), nrows) missing_indices <- sample(length(counts), size = 5, replace = FALSE) counts[missing_indices] <- NA rowRanges <- GenomicRanges::GRanges(rep(c("chr1", "chr2"), c(1, 3) * nrows / 4), IRanges::IRanges(floor(runif(nrows, 1e5, 1e6)), width=ncols), strand=sample(c("+", "-"), nrows, TRUE), feature_id=sprintf("ID%03d", 1:nrows)) colData <- S4Vectors::DataFrame(Treatment=rep(c("ChIP", "Input"), ncols/2), row.names=LETTERS[1:ncols]) rse <- SummarizedExperiment::SummarizedExperiment(assays=S4Vectors::SimpleList(counts=counts), rowRanges=rowRanges, colData=colData) SummarizedExperiment::assay(rse) imputed <- MsCoreUtils::impute_knn(as.matrix(SummarizedExperiment::assay(rse)),2) imputed_counts <- MsCoreUtils::impute_RF(as.matrix(SummarizedExperiment::assay(rse)),2) imputed-imputed_counts SummarizedExperiment::assays(rse) <- S4Vectors::SimpleList(counts=imputed_counts) SummarizedExperiment::assay(rse) SummarizedExperiment::assays(rse) <- S4Vectors::endoapply(SummarizedExperiment::assays(rse), asinh) SummarizedExperiment::assay(rse) SummarizedExperiment::rowRanges(rse) SummarizedExperiment::rowData(rse) SummarizedExperiment::colData(rse)
This is the documentation example, based on Bioconductor 3.14.
data(bladderdata, package="bladderbatch") edat <- bladderEset[1:50] pheno <- Biobase::pData(edat) batch <- pheno$batch table(batch) quantro::matboxplot(edat,batch,cex.axis=0.6,notch=TRUE,pch=19,ylab="Expression") quantro::matdensity(edat,batch,xlab=" ",ylab="density") legend("topleft",legend=1:5,col=1:5,lty=1) # 1. parametric adjustment combat_edata1 <- sva::ComBat(dat=edat, batch=batch, par.prior=TRUE, prior.plots=TRUE) # 2. non-parametric adjustment, mean-only version combat_edata2 <- sva::ComBat(dat=edat, batch=batch, par.prior=FALSE, mean.only=TRUE) # 3. reference-batch version, with covariates mod <- model.matrix(~as.factor(cancer), data=pheno) combat_edata3 <- sva::ComBat(dat=edat, batch=batch, mod=mod, par.prior=TRUE, ref.batch=3, prior.plots=TRUE)
This is also adapted from the package vignette but with FlowSorted.DLPFC.450k
in place of FlowSorted
.
data(FlowSorted.DLPFC.450k,package="FlowSorted.DLPFC.450k") p <- getBeta(FlowSorted.DLPFC.450k,offset=100) pd <- Biobase::pData(FlowSorted.DLPFC.450k) quantro::matboxplot(p, groupFactor = pd$CellType, xaxt = "n", main = "Beta Values", pch=19) quantro::matdensity(p, groupFactor = pd$CellType, xlab = " ", ylab = "density", main = "Beta Values", brewer.n = 8, brewer.name = "Dark2") legend('top', c("NeuN_neg", "NeuN_pos"), col = c(1, 2), lty = 1, lwd = 3) qtest <- quantro::quantro(object = p, groupFactor = pd$CellType) if (FALSE) { doParallel::registerDoParallel(cores=10) qtestPerm <- quantro::quantro(p, groupFactor = pd$CellType, B = 1000) quantro::quantroPlot(qtestPerm) }
The following is adapted from package OUTRIDER, noting its plotQQ() has issues
ctsFile <- system.file('extdata', 'KremerNBaderSmall.tsv', package='OUTRIDER') ctsTable <- read.table(ctsFile, check.names=FALSE) ods <- OUTRIDER::OutriderDataSet(countData=ctsTable) ods <- OUTRIDER::filterExpression(ods, minCounts=TRUE, filterGenes=TRUE) ods <- OUTRIDER::OUTRIDER(ods) res <- OUTRIDER::results(ods) knitr::kable(res,caption="A check list of outliers") if ("geneID" %in% colnames(res) & FALSE) OUTRIDER::plotQQ(ods, res$geneID, global=TRUE)
ex <- DESeq2::makeExampleDESeqDataSet(m=4) dds <- DESeq2::DESeq(ex) res <- DESeq2::results(dds, contrast=c("condition","B","A")) rld <- DESeq2::rlogTransformation(ex, blind=TRUE) dat <- DESeq2::plotPCA(rld, intgroup=c("condition"),returnData=TRUE) percentVar <- round(100 * attr(dat,"percentVar")) ggplot2::ggplot(dat, ggplot2::aes(PC1, PC2, color=condition, shape=condition)) + ggplot2::geom_point(size=3) + ggplot2::xlab(paste0("PC1:",percentVar[1],"% variance")) + ggplot2::ylab(paste0("PC2:",percentVar[2],"% variance")) ex$condition <- relevel(ex$condition, ref="B") dds2 <- DESeq2::DESeq(dds) res <- DESeq2::results(dds2) knitr::kable(head(as.data.frame(res)))
See the package in action from a snakemake workflow @koster21.
A simple network is furnished with the GeneNet
documentation example,
## A random network with 40 nodes # it contains 780=40*39/2 edges of which 5 percent (=39) are non-zero true.pcor <- GeneNet::ggm.simulate.pcor(40) # A data set with 40 observations m.sim <- GeneNet::ggm.simulate.data(40, true.pcor) # A simple estimate of partial correlations estimated.pcor <- corpcor::cor2pcor( cor(m.sim) ) # A comparison of estimated and true values sum((true.pcor-estimated.pcor)^2) # A slightly better estimate ... estimated.pcor.2 <- GeneNet::ggm.estimate.pcor(m.sim) sum((true.pcor-estimated.pcor.2)^2) ## ecoli data data(ecoli, package="GeneNet") # partial correlation matrix inferred.pcor <- GeneNet::ggm.estimate.pcor(ecoli) # p-values, q-values and posterior probabilities for each potential edge test.results <- GeneNet::network.test.edges(inferred.pcor) # best 20 edges (strongest correlation) test.results[1:20,] # network containing edges with prob > 0.9 (i.e. local fdr < 0.1) net <- GeneNet::extract.network(test.results, cutoff.ggm=0.9) net # significant based on FDR cutoff Q=0.05? num.significant.1 <- sum(test.results$qval <= 0.05) test.results[1:num.significant.1,] # significant based on "local fdr" cutoff (prob > 0.9)? num.significant.2 <- sum(test.results$prob > 0.9) test.results[test.results$prob > 0.9,] # parameters of the mixture distribution used to compute p-values etc. c <- fdrtool::fdrtool(corpcor::sm2vec(inferred.pcor), statistic="correlation") c$param ## A random network with 20 nodes and 10 percent (=19) edges true.pcor <- GeneNet::ggm.simulate.pcor(20, 0.1) # convert to edge list test.results <- GeneNet::ggm.list.edges(true.pcor)[1:19,] nlab <- LETTERS[1:20] # graphviz # network.make.dot(filename="test.dot", test.results, nlab, main = "A graph") # system("fdp -T svg -o test.svg test.dot") # Rgraphviz gr <- GeneNet::network.make.graph( test.results, nlab) gr num.nodes(gr) edge.info(gr) gr2 <- GeneNet::network.make.graph( test.results, nlab, drop.singles=TRUE) gr2 GeneNet::num.nodes(gr2) GeneNet::edge.info(gr2) # plot network plot(gr, "fdp") plot(gr2, "fdp")
A side-by-side heatmaps
set.seed(123454321) m <- matrix(runif(2500),50) r <- cor(m) g <- as.matrix(r>=0.7)+0 f1 <- ComplexHeatmap::Heatmap(r) f2 <- ComplexHeatmap::Heatmap(g) f <- f1+f2 ComplexHeatmap::draw(f) df <- heatmaply::normalize(mtcars) hm <- heatmaply::heatmaply(df,k_col=5,k_row=5, colors = grDevices::colorRampPalette(RColorBrewer::brewer.pal(3, "RdBu"))(256)) htmlwidgets::saveWidget(hm,file="heatmaply.html") htmltools::tags$iframe(src = "heatmaply.html", width = "100%", height = "550px")
so we have heatmaply.html and a module analysis with WGCNA,
pwr <- c(1:10, seq(from=12, to=30, by=2)) sft <- WGCNA::pickSoftThreshold(dat, powerVector=pwr, verbose=5) ADJ <- abs(cor(dat, method="pearson", use="pairwise.complete.obs"))^6 dissADJ <- 1-ADJ dissTOM <- WGCNA::TOMdist(ADJ) TOM <- WGCNA::TOMsimilarityFromExpr(dat) Tree <- hclust(as.dist(1-TOM), method="average") for(j in pwr) { pam_name <- paste0("pam",j) assign(pam_name, cluster::pam(as.dist(dissADJ),j)) pamTOM_name <- paste0("pamTOM",j) assign(pamTOM_name,cluster::pam(as.dist(dissTOM),j)) tc <- table(get(pam_name)$clustering,get(pamTOM_name)$clustering) print(tc) print(diag(tc)) } colorStaticTOM <- as.character(WGCNA::cutreeStaticColor(Tree,cutHeight=.99,minSize=5)) colorDynamicTOM <- WGCNA::labels2colors(cutreeDynamic(Tree,method="tree",minClusterSize=5)) Colors <- data.frame(pamTOM6$clustering,colorStaticTOM,colorDynamicTOM) WGCNA::plotDendroAndColors(Tree, Colors, dendroLabels=FALSE, hang=0.03, addGuide=TRUE, guideHang=0.05) meg <- WGCNA::moduleEigengenes(dat, color=1:ncol(dat), softPower=6)
This section is based on package recount3
.
hs <- recount3::available_projects() dim(subset(hs,file_source=="gtex")) recount3::annotation_options("human") blood_rse <- recount3::create_rse(subset(hs,project=="BLOOD")) S4Vectors::metadata(blood_rse) SummarizedExperiment::rowRanges(blood_rse) colnames(SummarizedExperiment::colData(blood_rse))[1:20] recount3::expand_sra_attributes(blood_rse)
reactome <- graphite::pathways("hsapiens", "reactome") kegg <- graphite::pathways("hsapiens","kegg") pharmgkb <- graphite::pathways("hsapiens","pharmgkb") nodes(kegg[[21]]) kegg_t2g <- ldply(lapply(kegg, nodes), data.frame) names(kegg_t2g) <- c("gs_name", "gene_symbol") VEGF <- subset(kegg_t2g,gs_name=="VEGF signaling pathway")[[2]] eKEGG <- clusterProfiler::enricher(gene=VEGF, TERM2GENE = kegg_t2g, universe=, pAdjustMethod = "BH", pvalueCutoff = 0.1, qvalueCutoff = 0.05, minGSSize = 10, maxGSSize = 500)
An overview of annotation is available @carlson16.
# columns(org.Hs.eg.db) # keyref <- keys(org.Hs.eg.db, keytype="ENTREZID") # symbol_uniprot <- select(org.Hs.eg.db,keys=keyref,columns = c("SYMBOL","UNIPROT")) # subset(symbol_uniprot,SYMBOL=="MC4R") x <- EnsDb.Hsapiens.v86 ensembldb::listColumns(x, "protein", skip.keys=TRUE) ensembldb::listGenebiotypes(x) ensembldb::listTxbiotypes(x) ensembldb::listTables(x) ensembldb::metadata(x) ensembldb::organism(x) ensembldb::returnFilterColumns(x) ensembldb::seqinfo(x) ensembldb::seqlevels(x) ensembldb::updateEnsDb(x) ensembldb::genes(x, columns=c("gene_name"), filter=list(SeqNameFilter("X"), AnnotationFilter::GeneBiotypeFilter("protein_coding"))) ensembldb ::transcripts(x, columns=ensembldb::listColumns(x, "tx"), filter = AnnotationFilter::AnnotationFilterList(), order.type = "asc", return.type = "GRanges") #txdbEnsemblGRCh38 <- GenomicFeatures::makeTxDbFromEnsembl(organism="Homo sapiens", release=98) #txdb <- as.list(txdbEnsemblGRCh38) #lapply(txdb,head)
This section collects notes on peptide/protein analysis, especially with respect to spectrum data.
Here is an example for PROC_HUMAN, which is handled by the Biostrings
package,
fasta_file_path <- 'https://rest.uniprot.org/uniprotkb/P04070.fasta' fasta_sequences <- Biostrings::readAAStringSet(fasta_file_path, format = "fasta") AA_sequence <- fasta_sequences[[1]] cat("Sequence:", toString(AA_sequence), "\n") iso_442688365 <- 'TDGEGALSEPSATVTIEELAAPPPPVLMHHGESSQVLHPGNK' match_position <- regexpr(iso_442688365, AA_sequence) match_position mp <- matchPattern(iso_442688365,AA_sequence) mp
load(system.file("tests","PROC.rda",package="pQTLtools")) pQTLtools::peptideAssociationPlot(protein,cistrans)
The .raw files can be handled by rawrr
package nevertheless it requires necessary files,
library(rawrr) if (isFALSE(rawrr::.checkDllInMonoPath())){ rawrr::installRawFileReaderDLLs() } if (isFALSE(file.exists(rawrr:::.rawrrAssembly()))){ rawrr::installRawrrExe() }
Based on a real project, the following is an example of listing/generating multiple .raw from .zip files
# ZWK .raw data spectra_ZWK <- "~/Caprion/pre_qc_data/spectral_library_ZWK" raw_files <- list.files(spectra_ZWK, pattern = "\\.raw$", full.names = TRUE) ## collectively suppressMessages(library(MsBackendRawFileReader)) ZWK <- Spectra::backendInitialize(MsBackendRawFileReader::MsBackendRawFileReader(), files = raw_files) class(ZWK) methods(class=class(ZWK)) Spectra(ZWK) spectraData(ZWK) ZWK ZWKvars <- ZWK |> Spectra::spectraVariables() ZWKdata <- ZWK |> Spectra::spectraData() dim(ZWKdata) # rows with >=1 non-NA value in the columns with prefix "precursor" precursor <- apply(ZWKdata[grep("precursor",ZWKvars)], 1, function(x) any(!is.na(x))) ZWKdata_filtered <- ZWKdata[precursor, ] save(ZWK,file="~/Caprion/analysis/work/ZWK.rda") # ZYQ/UDP library(utils) spectra <- "~/Caprion/pre_qc_data/spectra" zip_files <- dir(spectra, recursive = TRUE, full.names=TRUE) work_dir <- "~/Caprion/analysis/work" for (zip_file in zip_files) unzip(zip_file, exdir=work_dir) ZYQ_UDP <- Spectra::backendInitialize(MsBackendRawFileReader::MsBackendRawFileReader(), files = dir(work_dir,patt="raw",full.names=TRUE)) class(ZYQ_UDP) ZYQ_UDP ZYQ_UDP |> Spectra::spectraVariables() save(ZYQ_UDP,file="~/Caprion/analysis/work/ZYQ_UDP.rda")
Various facilities are shown below.
# various files d <- "/rds/project/rds-zuZwCZMsS0w/Caprion_proteomics/analysis/crux" f <- file.path(d,"szwk901104i19801xms1.mzML") x <- file.path(d,"szwk901104i19801xms1.mzXML") g <- file.path(d,"szwk901104i19801xms1.mgf") r <- file.path(d,"szwk901104i19801xms1.rda") z <- file.path(d,"szwk901104i19801xms1.mzML.gz") # mzML mz <- mzR::openMSfile(f) header_info <- mzR::header(mz) table(header_info$msLevel) peak_data <- mzR::peaks(mz) spec <- mzR::spectra(mz) class(spec) length(spec) lapply(spec,head,3) methods(class="mzRpwiz") mzR::close(mz) mz <- mzR::openMSfile(z, backend = "pwiz") mz nChrom(mz) head(tic(mz)) head(chromatogram(mz, 1L)) ## same as tic(x) str(chromatogram(mz)) head(peaks(mz, scan=4)) # MSnbase mzXML <- MSnbase::readMSData(x) mgf <- MSnbase::readMgfData(g) save(mzXML,mgf,file=r) MSnbase::extractSpectraData(mzXML) MSnbase::hasSpectra(z) MSnbase::hasChromatograms(z) MSnbase::plot2d(mzXML,z="peaks.count") MSnbase::plotDensity(mzXML,z="precursor.mz") MSnbase::extractSpectraData(mgf) methods(class="MSpectra") MSnbase::mz(mgf) MSnbase::intensity(mgf) MSnbase::rtime(mgf) MSnbase::precursorMz(mgf) MSnbase::precursorCharge(mgf) MSnbase::precScanNum(mgf) MSnbase::precursorIntensity(mgf) MSnbase::acquisitionNum(mgf) MSnbase::scanIndex(mgf) MSnbase::peaksCount(mgf) MSnbase::msLevel(mgf) MSnbase::tic(mgf) MSnbase::ionCount(mgf) MSnbase::collisionEnergy(mgf) MSnbase::fromFile(mgf) MSnbase::polarity(mgf) MSnbase::smoothed(mgf) MSnbase::centroided(mgf) MSnbase::isCentroided(mgf) MSnbase::writeMgfData(mgf, con = "spectra.mgf", COM = NULL, TITLE = NULL) MSnbase::removePeaks(mgf, t, msLevel., ...) MSnbase::filterMsLevel(mgf, msLevel=2) MSnbase::as.ExpressionSet(mgf) # This turned to be really slow! sp_list <- lapply(seq_along(mgf), function(i) { intensity_i <- MSnbase::intensity(mgf)[[i]] mz_i <- MSnbase::mz(mgf)[[i]] centroided_i <- MSnbase::centroided(mgf)[[i]] return(new("Spectrum1", intensity = intensity_i, mz = mz_i, centroided = centroided_i)) }) sp1 <- do.call(rbind, sp_list) # only the first one is more manageable sp1 <- new("Spectrum1",intensity=MSnbase::intensity(mgf)[[1]],mz=MSnbase::mz(mgf)[[1]],centroided=MSnbase::centroided(mgf)[[1]]) sp2 <- MSnbase::pickPeaks(sp1) MSnbase::intensity(sp2) plot(MSnbase::mz(sp1),MSnbase::intensity(sp1),type="h") ## Without m/z refinement points(MSnbase::mz(sp2), MSnbase::intensity(sp2), col = "darkgrey") ## Using k = 1, closest signals sp3 <- MSnbase::pickPeaks(sp1, refineMz = "kNeighbors", k = 1) points(MSnbase::mz(sp3), MSnbase::intensity(sp3), col = "green", type = "h") ## Using descendPeak requiring at least 50% or the centroid's intensity sp4 <- MSnbase::pickPeaks(sp1, refineMz = "descendPeak", signalPercentage = 50) points(MSnbase::mz(sp4), MSnbase::intensity(sp4), col = "red", type = "h") # CAMERA xs <- CAMERA::xcmsSet(f, method="centWave", ppm=30, peakwidth=c(5,10)) an <- CAMERA::xsAnnotate(xs) an <- CAMERA::groupFWHM(an) #For one group peaklist <- CAMERA::getpspectra(an, 1) #For two groups peaklist <- CAMERA::getpspectra(an, c(1,2)) # Spectra suppressMessages(library(Spectra)) sp <- Spectra::Spectra(z) head(sp) table(sp$msLevel) d <- Spectra::computeMzDeltas(sp[1:1000]) Spectra::plotMzDelta(d) # protViz protViz::fragmentIon("TFVLNFIK") esd <- MSnbase::extractSpectraData(mgf) op <- par(mfrow=c(2,1)) ms <- function(i) with(esd[i,],list(title=TITLE,rtinseconds=RTINSECONDS,pepmass=PEPMASS,charge=CHARGE, mZ=MSnbase::mz(mgf[[i]]),intensity=MSnbase::intensity(mgf[[i]]))) protViz::peakplot("TAFDEAIAELDTLNEESYK", ms(1)) protViz::peakplot("TAFDEAIAELDTLSEESYK", ms(2)) par(op) load("~/Caprion/pilot/ZWK.rda") peptides <- subset(mapping_ZWK,Protein=="PROC_HUMAN")[["Modified.Peptide.Sequence"]] |> unique() pim <- protViz::parentIonMass(peptides) fi <- protViz::fragmentIon(peptides) df <- as.data.frame(fi) op <- par(mfrow=c(3,1)) for (i in 1:length(peptides)){ plot(0, 0, xlab='m/Z', ylab='', xlim=range(c(fi[[i]]$b,fi[[i]]$y)), ylim=c(0,1), type='n', axes=FALSE, sub=paste(peptides[i], "/", pim[i], "Da")); box() axis(1, fi[[i]]$b, round(fi[[i]]$b,1), las=2) axis(1, fi[[i]]$y, round(fi[[i]]$y,1), las=2) pepSeq<-strsplit(peptides[i], "") axis(3,fi[[i]]$b, paste("b", row.names(fi[[i]]),sep=''),las=2) axis(3,fi[[i]]$y, paste("y", row.names(fi[[i]]),sep=''),las=2) text(fi[[i]]$b, rep(0.3, nchar(peptides[i])), pepSeq[[1]],pos=3,cex=4, lwd=4, col="#aaaaaaaa") abline(v=fi[[i]]$b, col='red') abline(v=fi[[i]]$y, col='blue',lwd=2) } par(op) # MSstats head(SRMRawData) QuantData <- MSstats::dataProcess(SRMRawData, use_log_file = FALSE) quant <- MSstats::dataProcess(SRMRawData, normalization = "equalizeMedians", summaryMethod = "TMP", censoredInt = "NA", MBimpute = TRUE, maxQuantileforCensored = 0.999, logTrans=2, use_log_file=FALSE, numberOfCores=5) names(quant)
MSstats @kohler23 takes output from other data-processing pipelines.
Web: https://support.bioconductor.org/
Package | Description
--------|------------
Bioconductor |
AnnotationDbi | AnnotationDb objects and their progeny, methods etc.
Biobase | Base functions for Bioconductor
Biostrings | Efficient manipulation of biological strings
CAMERA | Collection of annotation related methods
clusterProfiler | Functional profiles for genes and gene clusters
ComplexHeatmap | Make complex heatmaps
DESSeq2 | Differential gene expression analysis based on the negative binomial distribution
edgeR | Empirical analysis of digital gene expression
EnsDb.Hsapiens.v86 | Exposes an annotation databases generated from Ensembl
ensembldb | Retrieve annotation data from an Ensembl based package
FlowSorted.DLPFC.450k | Illumina HumanMethylation data on sorted frontal cortex cell populations
graphite | GRAPH Interaction from pathway topological environment
IlluminaHumanMethylation450kmanifest | Annotation for Illumina's 450k methylation arrays
INSPEcT | Quantification of the intronic and exonic gene features and the post-transcriptional regulation analysis
MSnbase | Base Functions and Classes for Mass Spectrometry and Proteomics
MSstats | Protein Significance Analysis in DDA, SRM and DIA Proteomics
mzR | parser for netCDF, mzXML and mzML and mzIdentML files
org.Hs.eg.db | Conversion of Entrez ID -- gene symbols
OUTRIDER | OUTlier in RNA-Seq fInDER
Pi | Priority index, leveraging genetic evidence to prioritise drug targets at the gene and pathway level
quantro | A test for when to use quantile normalisation
rawrr | Direct Access to Orbitrap Data and Beyond
recount3 | Interface to uniformly processed RNA-seq data
Rgraphiz | Interfaces R with the AT&T graphviz library for plotting R graph objects from the graph package
Spectra | Spectra Infrastructure for Mass Spectrometry
sva | Surrogate Variable Analysis
TxDb.Hsapiens.UCSC.hg38.knownGene | Annotation of the human genome
CRAN |
doParallel | Foreach Parallel Adaptor for the 'parallel' Package
GeneNet | Modeling and Inferring Gene Networks
ggplot2 | Data Visualisations Using the grammar of graphics
heatmaply | Interactive Cluster Heat Maps Using plotly
and ggplot2
pheatmap | results visualisation
plyr | Splitting, applying and combining data
protViz | Foreach Parallel Adaptor for the 'parallel' Package
RColorBrewer | ColorBrewer Palettes
WGCNA | Weighted correlation network analysis
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.