`r params$update_date`
# Global knitr chunk options for this vignette: `collapse = TRUE` and a
# "#>" prefix on printed output lines.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
# options(rmarkdown.html_vignette.check_title = FALSE)
The Cancer Genome Atlas (TCGA) datasets are an extensive collection of cancer datasets widely used in cancer research and clinical publications. The TCGA Research Network has cataloged molecular information for 33 human cancer types to increase our biological insight into cancers. One major aspect of TCGA cancer research uses the gene expression data gathered from gene expression experiments, which are conducted to better understand the biological mechanisms in cells and tissues. However, gene expression data are almost always compromised by unwanted variation, which may lead to inaccurate or wrong biological results and even retractions. Effective removal of unwanted variation, such as batch effects, is one of the main challenges in the analysis of gene expression data, particularly when the data come from large and complex experiments.
The goal of the tcgaCleaneR package is to reduce this challenge by helping users analyse and handle unwanted variation. The package is a tool that lets users account for unwanted variation in their research and publications.
# Attach the package that provides the functions used throughout this vignette.
library(tcgaCleaneR)
# Load the dataset named "brca.data" into the workspace.
data("brca.data")
# Print the loaded object (later code reads it with SummarizedExperiment accessors).
brca.data
The package contains a highly condensed version of the original dataset, with 100 unique genes and 1196 samples. The data consist of three individual assays that can be accessed using `SummarizedExperiment::assay()`. Sample metadata describing the samples can be accessed using `SummarizedExperiment::colData()`; it is a DataFrame that can store any number of descriptive columns for each sample row. Similarly, gene metadata describing the genes can be accessed using `SummarizedExperiment::rowData()`; it is a DataFrame that can store any number of descriptive columns for each gene row.
# Pull the individual pieces out of the SummarizedExperiment as data frames.
# Each assignment must be its own statement; run together on one line they
# do not parse.

# Gene-level annotation (rowData).
gene.annot <- as.data.frame(SummarizedExperiment::rowData(brca.data))
# Sample-level metadata (colData).
sample.info <- as.data.frame(SummarizedExperiment::colData(brca.data))
# Values of the "HTseq_counts" assay.
raw.count <- as.data.frame(
  SummarizedExperiment::assay(brca.data, "HTseq_counts")
)
# Filtering pipeline: each step takes the previous step's output.

# Filter genes by biotype.
df1 <- filterGenesByBiotypes(
  data = brca.data,
  gene.type = "protein.coding"
)

# Filter lowly expressed genes using a count threshold and a sample-size
# threshold.
df2 <- filterLowExprGenes(
  data = df1,
  gene_count = 20,
  sample_size = 200
)

# Filter samples using a purity cutoff.
df3 <- filterSamplesByPurity(
  data = df2,
  purity_cutoff = 0.496
)
# Plot library size for the purity-filtered data as a scatter plot.
plotLibSize(
  data = df3,
  plot_type = "Scatterplot"
)

# Filter samples using a library-size cutoff.
df4 <- filterSamplesByLibSize(
  data = df3,
  ls_cutoff = 17.5
)

# Plot the study outline of the fully filtered data.
plotStudyOutline(
  data = df4
)
# The calls below had been absorbed into the preceding "#" comments when the
# chunk was collapsed onto single lines, so they never ran. They are restored
# as separate statements.

# Is data input for PCA logical
is.logical(df4)

# Compute 7 principal components on the filtered data.
# NOTE(review): is.log = FALSE presumably tells computePCA the assays are not
# yet log-transformed -- confirm against the function's documentation.
df5 <- computePCA(data = df4, nPcs = 7, is.log = FALSE)

# Generated 7 PCs for first two genes
head(df5[["HTseq_counts"]]$sing.val$u, 2)
# Plotting dependencies attached before calling plotPC(). One statement per
# line: fused on a single line they are a parse error.
library(ggplot2)
library(cowplot)

# Density plots of the first three PCs, grouped by the "Time" variable.
plotPC(
  pca.data = df5,
  data = df4,
  group = "Time",
  plot_type = "DensityPlot",
  pcs.no = c(1, 2, 3)
)
# One statement per line: fused on a single line they are a parse error.
library(tidyverse)

# Relate the first 7 PCs to the "purity" variable, then print the result.
df6 <- plotPCsVar(
  pca.data = df5,
  data = df4,
  type = "purity",
  nPCs = 7
)
df6
# Spearman correlation analysis against library size, on a single core.
# The assignment and the preview are separate statements; fused on one line
# they are a parse error.
df7 <- computeCorr(
  data = df4,
  is.log = FALSE,
  type = "librarysize",
  cor.method = "spearman",
  n.cores = 1
)
# Preview the first rows of the result.
head(df7)
# ANOVA against the "Time" variable, on a single core.
# The assignment and the preview are separate statements; fused on one line
# they are a parse error.
df8 <- computeANOVA(
  data = df4,
  variable = "Time",
  is.log = FALSE,
  n.cores = 1
)
# Preview the first rows of the result.
head(df8)
# One statement per line: fused on a single line they are a parse error.
library(ggplot2)
library(cowplot)

# Check the chosen negative-control gene set, grouped by "Time".
# NOTE(review): "Microrray_HK" is kept exactly as written because it must
# match the name used in the data -- confirm the spelling against
# rowData(df4).
checkNegCtrlGenes(
  data = df4,
  ncg_set = c("Microrray_HK"),
  group = "Time",
  plot_type = "DensityPlot",
  nPcs = 10,
  npcs = 3,
  is.log = FALSE
)
# One statement per line: fused on a single line they are a parse error.
library(tidyverse)

# PRPS plot for the filtered data.
plotPRPS(data = df4)
# Build the inputs for createPRPS() from the filtered data. On a single line
# the first inline "#" comment swallowed everything after it, so
# `sample.info$ls` and `df9` were never created. Statements are restored one
# per line.
sample.info <- as.data.frame(SummarizedExperiment::colData(df4))
expr.data <- as.matrix(
  SummarizedExperiment::assay(df4, "HTseq_counts")
) # gene expression data
sample.info$ls <- colSums(expr.data) # adding library size variable

# Create the PRPS sets. `TRUE` is spelled out instead of `T`, which is an
# ordinary variable that can be reassigned.
df9 <- createPRPS(
  expr.data,
  sample.info,
  librarySize = "ls",
  batch = c("Year", "Plates"),
  biology = "Subtypes",
  purity = "Purity_singscore",
  include.ls = TRUE,
  include.purity = TRUE,
  minSamplesPerBatchPS = 3,
  # NOTE(review): spelling kept as in the original call; it has to match the
  # createPRPS() argument name -- confirm against the function signature.
  minSamplesForPuirtyPS = 3,
  minSamplesForPurityPerBiology = 12,
  minSamplesForLibrarySizePerBatch = 6,
  minSamplesForLibrarySizePS = 3
)
# On a single line this whole chunk was one "###" comment, so none of the
# objects below were defined. Statements are restored one per line.

### data input
library(SummarizedExperiment)

### PRPS values
prps.batch <- df9$ps.batch
# Keep only the part of each column name before the first "-".
colnames(prps.batch) <- unlist(lapply(
  colnames(prps.batch),
  function(x) strsplit(x, "-")[[1]][1]
))
prps.ls <- df9$ps.ls
prps.purity <- df9$ps.purity

# Append the PRPS columns to the raw counts, then log2(x + 1) transform and
# transpose the combined matrix.
raw.data <- as.matrix(SummarizedExperiment::assay(df4, "HTseq_counts"))
ruv.data <- cbind(raw.data, prps.batch, prps.ls, prps.purity)
ruv.data <- t(log2(ruv.data + 1))

### replicate matrix
# Built from the row names of `ruv.data`.
ruv.rep <- ruv::replicate.matrix(row.names(ruv.data))
gene.annot <- as.data.frame(SummarizedExperiment::rowData(df4))

### NCG sets
# Logical vector over the columns of `ruv.data`: TRUE where the column name
# is a gene symbol flagged "yes" in the RNAseq_HK annotation column.
ncg.set <- colnames(ruv.data) %in%
  gene.annot$Gene_symbol[gene.annot$RNAseq_HK == "yes"]
# On a single line the leading "#library(...)" comment swallowed the call
# below, so `df10` was never created. The commented-out library lines are
# kept as comments and the call is restored as its own statement.
#library(BiocParallel)
#library(BiocSingular)
df10 <- runRUV_III_PRPS(
  ruv.data = ruv.data,
  ruv.rep = ruv.rep,
  ncg.set = ncg.set,
  k = 1,
  return.info = TRUE
)
# On a single line the leading "#library(...)" comment swallowed the whole
# chunk, so `combined_data` was never built. Statements are restored one per
# line.
#library(SummarizedExperiment)
gene.annot <- as.data.frame(SummarizedExperiment::rowData(df4))
sample.info <- as.data.frame(SummarizedExperiment::colData(df4))

# Take the first ncol(raw.data) rows of the adjusted matrix and transpose.
# seq_len() is used instead of 1:n so a zero-column input gives an empty
# index rather than c(1, 0).
ruv.iii.adj <- t(df10$new.ruv.data[seq_len(ncol(raw.data)), ]) # transpose

# log2(x + 1) transform each of the three original assays.
raw.count <- SummarizedExperiment::assay(df4, "HTseq_counts")
raw.count <- log2(raw.count + 1)
fpkm <- SummarizedExperiment::assay(df4, "HTseq_FPKM")
fpkm <- log2(fpkm + 1)
fpkm.uq <- SummarizedExperiment::assay(df4, "HTseq_FPKM.UQ")
fpkm.uq <- log2(fpkm.uq + 1)
RUV_III <- ruv.iii.adj

# Bundle the three transformed assays and the RUV-III output into one object.
# The constructor is namespace-qualified so this chunk does not depend on
# SummarizedExperiment having been attached elsewhere.
combined_data <- SummarizedExperiment::SummarizedExperiment(
  assays = list(
    HTseq_counts = raw.count,
    HTseq_FPKM = fpkm,
    HTseq_FPKM.UQ = fpkm.uq,
    RUV_III = RUV_III
  ),
  colData = sample.info,
  rowData = gene.annot
)
# Compute 7 PCs on the combined data; is.log = TRUE is passed here.
df11 <- computePCA(
  data = combined_data,
  nPcs = 7,
  is.log = TRUE
)
# Density plots of PCs 1 to 3 for the combined data, grouped by "Time".
plotPC(
  pca.data = df11,
  data = combined_data,
  group = "Time",
  plot_type = "DensityPlot",
  pcs.no = 1:3
)
# Relate the first 7 PCs of the combined data to the "purity" variable.
# The assignment and the print are separate statements; fused on one line
# they are a parse error.
df12 <- plotPCsVar(
  pca.data = df11,
  data = combined_data,
  type = "purity",
  nPCs = 7
)
df12
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.