r params$update_date

# Vignette-wide chunk options: collapse source and output into one block and
# prefix printed output lines with "#>".
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")
# options(rmarkdown.html_vignette.check_title = FALSE)

Introduction

The Cancer Genome Atlas (TCGA) datasets are an extensive collection of cancer datasets widely used in cancer research and clinical publications. The TCGA Research Network has cataloged different types of molecular information for 33 human cancer types to increase our biological insight into cancer. One major aspect of TCGA cancer research uses the gene expression data gathered from gene expression experiments, which are conducted to better understand the biological mechanisms in cells and tissues. However, gene expression data are almost always compromised by unwanted variation, which may lead to inaccurate or wrong biological results and to retractions. Effective removal of unwanted variation, such as batch effects, is one of the main challenges in the analysis of gene expression data, particularly when the data come from large and complex experiments.

The goal of the tcgaCleaneR package is to minimize this challenge and help users to analyse and handle these unwanted variations. This package acts as a tool to let users account for the unwanted variations in their research and publications.

library(tcgaCleaneR)

Data

# Load the example dataset by name, then print it.
data(list = "brca.data")
brca.data

The package contains a highly condensed version of the original dataset with 100 unique genes and 1196 samples. The data consists of three individual assays that can be accessed using `SummarizedExperiment::assay()`. Sample metadata describing the samples can be accessed using `SummarizedExperiment::colData()`, and is a DataFrame that can store any number of descriptive columns for each sample row. Similarly, gene metadata describing the genes can be accessed using `SummarizedExperiment::rowData()`, and is a DataFrame that can store any number of descriptive columns for each gene row.

# Gene-level annotation as a plain data frame.
gene.annot <- as.data.frame(
  SummarizedExperiment::rowData(brca.data)
)
# Sample-level metadata as a plain data frame.
sample.info <- as.data.frame(
  SummarizedExperiment::colData(brca.data)
)
# The "HTseq_counts" assay as a plain data frame.
raw.count <- as.data.frame(
  SummarizedExperiment::assay(brca.data, "HTseq_counts")
)

Data Wrangling

Gene Filter

Filter the data by gene biotype.

# Filter the genes by biotype, selecting "protein.coding".
df1 <- filterGenesByBiotypes(
  data = brca.data,
  gene.type = "protein.coding"
)

Removing lowly expressed genes

Filter out genes with low counts across a set number of samples.

# Filter lowly expressed genes with a count threshold of 20 and a sample-size
# threshold of 200.
df2 <- filterLowExprGenes(
  data = df1,
  gene_count = 20,
  sample_size = 200
)

Purity Filter

Filter Samples based on Tumor Purity

# Filter samples on tumour purity using a cutoff of 0.496.
df3 <- filterSamplesByPurity(
  data = df2,
  purity_cutoff = 0.496
)

Library Size Filter

Determine Library Size

# Draw the library sizes of the purity-filtered data as a scatter plot.
plotLibSize(
  data = df3,
  plot_type = "Scatterplot"
)

Filter samples based on library size

# Filter samples on library size using a cutoff of 17.5.
df4 <- filterSamplesByLibSize(
  data = df3,
  ls_cutoff = 17.5
)

Data Analysis

Study Design Plot

# Plot the study design of the fully filtered data.
plotStudyOutline(
  data = df4
)

PCA

Generate PCA

# NOTE(review): is.logical() only reports whether `df4` itself is a logical
# vector; it presumably does not tell whether the data are log-transformed.
# Confirm what this check was meant to show.
is.logical(df4)
# PCA with 7 components; `is.log = FALSE` is passed for the input data.
df5 <- computePCA(
  data = df4,
  nPcs = 7,
  is.log = FALSE
)
# First two rows of the `u` component stored for the "HTseq_counts" assay.
head(df5[["HTseq_counts"]]$sing.val$u, n = 2)

Plot PCA

library(ggplot2)
library(cowplot)
# Density plots of PCs 1 to 3, grouped by the "Time" variable.
plotPC(
  pca.data = df5,
  data = df4,
  group = "Time",
  plot_type = "DensityPlot",
  pcs.no = c(1, 2, 3)
)

PCs correlation with unwanted variations

library(tidyverse)
# Relate the first 7 PCs to the "purity" variable, then display the result.
df6 <- plotPCsVar(
  pca.data = df5,
  data = df4,
  type = "purity",
  nPCs = 7
)
df6

Gene correlation

# Spearman correlation against "librarysize", computed on a single core.
df7 <- computeCorr(
  data = df4,
  is.log = FALSE,
  type = "librarysize",
  cor.method = "spearman",
  n.cores = 1
)
head(df7)

ANOVA Test

# ANOVA against the "Time" variable, computed on a single core.
df8 <- computeANOVA(
  data = df4,
  variable = "Time",
  is.log = FALSE,
  n.cores = 1
)
head(df8)

RUV - III

Check NCG Sets

library(ggplot2)
library(cowplot)
# Assess the "Microrray_HK" negative-control gene set with density plots
# grouped by "Time".
checkNegCtrlGenes(
  data = df4,
  ncg_set = "Microrray_HK",
  group = "Time",
  plot_type = "DensityPlot",
  nPcs = 10,
  npcs = 3,
  is.log = FALSE
)

PRPS Map

library(tidyverse)
# Draw the PRPS map for the filtered data.
plotPRPS(
  data = df4
)

PRPS Generation

# Sample metadata and the "HTseq_counts" expression matrix of the filtered data.
sample.info <- as.data.frame(SummarizedExperiment::colData(df4))
expr.data <- as.matrix(SummarizedExperiment::assay(df4, "HTseq_counts"))
# Library size per sample: the column sums of the expression matrix.
sample.info$ls <- colSums(expr.data)

# Generate the PRPS sets.
# TRUE is spelled out rather than `T`, because `T` is an ordinary variable that
# can be reassigned.
# NOTE(review): `minSamplesForPuirtyPS` looks misspelled, but it has to match
# the argument name declared by createPRPS() -- confirm before renaming it.
df9 <- createPRPS(
  expr.data,
  sample.info,
  librarySize = "ls",
  batch = c("Year", "Plates"),
  biology = "Subtypes",
  purity = "Purity_singscore",
  include.ls = TRUE,
  include.purity = TRUE,
  minSamplesPerBatchPS = 3,
  minSamplesForPuirtyPS = 3,
  minSamplesForPurityPerBiology = 12,
  minSamplesForLibrarySizePerBatch = 6,
  minSamplesForLibrarySizePS = 3
)

RUV-III

### data input
library(SummarizedExperiment)

### PRPS values
prps.batch <- df9$ps.batch
# Keep only the part of each column name that precedes the first "-".
# vapply() enforces exactly one character value per column name, unlike
# unlist(lapply(...)), whose result type depends on the input.
colnames(prps.batch) <- vapply(
  colnames(prps.batch),
  function(x) strsplit(x, "-", fixed = TRUE)[[1]][1],
  character(1),
  USE.NAMES = FALSE
)
prps.ls <- df9$ps.ls
prps.purity <- df9$ps.purity

raw.data <- as.matrix(SummarizedExperiment::assay(df4, "HTseq_counts"))
# Append the PRPS columns to the counts, apply log2(x + 1), then transpose so
# that the former columns become rows.
ruv.data <- cbind(raw.data, prps.batch, prps.ls, prps.purity)
ruv.data <- t(log2(ruv.data + 1))

### replicate matrix
# Built from the row names of the transposed matrix.
ruv.rep <- ruv::replicate.matrix(row.names(ruv.data))

gene.annot <- as.data.frame(SummarizedExperiment::rowData(df4))

### NCG sets
# Logical mask over the columns of `ruv.data`: TRUE where the column name is a
# gene symbol whose RNAseq_HK annotation equals "yes".
ncg.set <- colnames(ruv.data) %in%
  gene.annot$Gene_symbol[gene.annot$RNAseq_HK == "yes"]
#library(BiocParallel)
#library(BiocSingular)
df10 <- runRUV_III_PRPS(
  ruv.data = ruv.data,
  ruv.rep = ruv.rep,
  ncg.set = ncg.set,
  k = 1,
  return.info = TRUE
)

Combined Analysis

Combined data

#library(SummarizedExperiment)
gene.annot <- as.data.frame(SummarizedExperiment::rowData(df4))

sample.info <- as.data.frame(SummarizedExperiment::colData(df4))

# Take the first ncol(raw.data) rows of the adjusted data and transpose them.
# seq_len() stays empty when there are no columns (1:0 would yield c(1, 0)),
# and drop = FALSE keeps a matrix even when a single row is selected, so t()
# always returns the expected orientation.
ruv.iii.adj <- t(
  df10$new.ruv.data[seq_len(ncol(raw.data)), , drop = FALSE]
)

# Apply log2(x + 1) to each of the three original assays.
raw.count <- SummarizedExperiment::assay(df4, "HTseq_counts")
raw.count <- log2(raw.count + 1)
fpkm <- SummarizedExperiment::assay(df4, "HTseq_FPKM")
fpkm <- log2(fpkm + 1)
fpkm.uq <- SummarizedExperiment::assay(df4, "HTseq_FPKM.UQ")
fpkm.uq <- log2(fpkm.uq + 1)
RUV_III <- ruv.iii.adj

# The constructor is namespace-qualified so this chunk does not rely on
# SummarizedExperiment having been attached by an earlier chunk.
combined_data <- SummarizedExperiment::SummarizedExperiment(
  assays = list(
    HTseq_counts = raw.count,
    HTseq_FPKM = fpkm,
    HTseq_FPKM.UQ = fpkm.uq,
    RUV_III = RUV_III
  ),
  colData = sample.info,
  rowData = gene.annot
)

PCA on Combined Data

# PCA with 7 components on the combined data; `is.log = TRUE` is passed here.
df11 <- computePCA(
  data = combined_data,
  nPcs = 7,
  is.log = TRUE
)

Plot PCA for Combined Data

# Density plots of PCs 1 to 3 of the combined data, grouped by "Time".
plotPC(
  pca.data = df11,
  data = combined_data,
  group = "Time",
  plot_type = "DensityPlot",
  pcs.no = 1:3
)

PCs correlation with unwanted variations in Combined Data

# Relate the first 7 PCs of the combined data to the "purity" variable, then
# display the result.
df12 <- plotPCsVar(
  pca.data = df11,
  data = combined_data,
  type = "purity",
  nPCs = 7
)
df12


AbhishekSinha28/tcgaCleaneR documentation built on May 6, 2022, 6:46 a.m.