inst/doc/comparison_with_base_R.R

## ---- echo=FALSE, include=FALSE-----------------------------------------------
library(knitr)
knitr::opts_chunk$set(cache = TRUE, warning = FALSE,
                      message = FALSE, cache.lazy = FALSE)

library(dplyr)
library(tidyr)
library(tibble)
library(magrittr)
library(ggplot2)
library(ggrepel)
library(tidybulk)


my_theme = 	
	theme_bw() +
	theme(
		panel.border = element_blank(),
		axis.line = element_line(),
		panel.grid.major = element_line(size = 0.2),
		panel.grid.minor = element_line(size = 0.1),
		text = element_text(size=12),
		legend.position="bottom",
		aspect.ratio=1,
		strip.background = element_blank(),
		axis.title.x  = element_text(margin = margin(t = 10, r = 10, b = 10, l = 10)),
		axis.title.y  = element_text(margin = margin(t = 10, r = 10, b = 10, l = 10))
	)


## -----------------------------------------------------------------------------
tt = counts_mini %>% tidybulk(sample, transcript, count)

## ----aggregate, cache=TRUE, message=FALSE, warning=FALSE, results='hide', class.source='yellow'----
tt.aggr = tt %>% aggregate_duplicates()

## ----aggregate long, eval=FALSE-----------------------------------------------
#  temp = data.frame(
#  	symbol = dge_list$genes$symbol,
#  	dge_list$counts
#  )
#  dge_list.nr <- by(temp,	temp$symbol,
#  	function(df)
#  		if(length(df[1,1])>0)
#  			matrixStats:::colSums(as.matrix(df[,-1]))
#  )
#  dge_list.nr <- do.call("rbind", dge_list.nr)
#  colnames(dge_list.nr) <- colnames(dge_list)

## ----normalise, cache=TRUE----------------------------------------------------
tt.norm = tt.aggr %>% identify_abundant(factor_of_interest = condition) %>% scale_abundance()

## ----normalise long, eval=FALSE-----------------------------------------------
#  library(edgeR)
#  
#  dgList <- DGEList(count_m=x,group=group)
#  keep <- filterByExpr(dgList)
#  dgList <- dgList[keep,,keep.lib.sizes=FALSE]
#  [...]
#  dgList <- calcNormFactors(dgList, method="TMM")
#  norm_counts.table <- cpm(dgList)

## ----filter variable, cache=TRUE----------------------------------------------
tt.norm.variable = tt.norm %>% keep_variable()

## ----filter variable long, eval=FALSE-----------------------------------------
#  library(edgeR)
#  
#  x = norm_counts.table
#  
#  s <- rowMeans((x-rowMeans(x))^2)
#  o <- order(s,decreasing=TRUE)
#  x <- x[o[1L:top],,drop=FALSE]
#  
#  norm_counts.table = norm_counts.table[rownames(x)]
#  
#  norm_counts.table$cell_type = tidybulk::counts[
#  	match(
#  		tidybulk::counts$sample,
#  		rownames(norm_counts.table)
#  	),
#  	"Cell type"
#  ]

## ----mds, cache=TRUE----------------------------------------------------------
tt.norm.MDS =
  tt.norm %>%
  reduce_dimensions(method="MDS", .dims = 2)


## ---- eval = FALSE------------------------------------------------------------
#  library(limma)
#  
#  count_m_log = log(count_m + 1)
#  cmds = limma::plotMDS(ndim = .dims, plot = FALSE)
#  
#  cmds = cmds %$%	
#  	cmdscale.out %>%
#  	setNames(sprintf("Dim%s", 1:6))
#  
#  cmds$cell_type = tidybulk::counts[
#  	match(tidybulk::counts$sample, rownames(cmds)),
#  	"Cell type"
#  ]

## ----pca, cache=TRUE, message=FALSE, warning=FALSE, results='hide'------------
tt.norm.PCA =
  tt.norm %>%
  reduce_dimensions(method="PCA", .dims = 2)

## ----eval=FALSE---------------------------------------------------------------
#  count_m_log = log(count_m + 1)
#  pc = count_m_log %>% prcomp(scale = TRUE)
#  variance = pc$sdev^2
#  variance = (variance / sum(variance))[1:6]
#  pc$cell_type = counts[
#  	match(counts$sample, rownames(pc)),
#  	"Cell type"
#  ]

## ----tsne, cache=TRUE, message=FALSE, warning=FALSE, results='hide'-----------
tt.norm.tSNE =
	breast_tcga_mini %>%
	tidybulk(		sample, ens, count_scaled) %>%
	identify_abundant() %>%
	reduce_dimensions(
		method = "tSNE",
		perplexity=10,
		pca_scale =TRUE
	)

## ---- eval=FALSE--------------------------------------------------------------
#  count_m_log = log(count_m + 1)
#  
#  tsne = Rtsne::Rtsne(
#  	t(count_m_log),
#  	perplexity=10,
#  		pca_scale =TRUE
#  )$Y
#  tsne$cell_type = tidybulk::counts[
#  	match(tidybulk::counts$sample, rownames(tsne)),
#  	"Cell type"
#  ]

## ----rotate, cache=TRUE-------------------------------------------------------
tt.norm.MDS.rotated =
  tt.norm.MDS %>%
	rotate_dimensions(`Dim1`, `Dim2`, rotation_degrees = 45, action="get")

## ---- eval=FALSE--------------------------------------------------------------
#  rotation = function(m, d) {
#  	r = d * pi / 180
#  	((bind_rows(
#  		c(`1` = cos(r), `2` = -sin(r)),
#  		c(`1` = sin(r), `2` = cos(r))
#  	) %>% as_matrix) %*% m)
#  }
#  mds_r = pca %>% rotation(rotation_degrees)
#  mds_r$cell_type = counts[
#  	match(counts$sample, rownames(mds_r)),
#  	"Cell type"
#  ]

## ----de, cache=TRUE, message=FALSE, warning=FALSE, results='hide'-------------
tt.de =
	tt %>%
	test_differential_abundance( ~ condition, action="get")
tt.de

## ---- eval=FALSE--------------------------------------------------------------
#  library(edgeR)
#  
#  dgList <- DGEList(counts=counts_m,group=group)
#  keep <- filterByExpr(dgList)
#  dgList <- dgList[keep,,keep.lib.sizes=FALSE]
#  dgList <- calcNormFactors(dgList)
#  design <- model.matrix(~group)
#  dgList <- estimateDisp(dgList,design)
#  fit <- glmQLFit(dgList,design)
#  qlf <- glmQLFTest(fit,coef=2)
#  topTags(qlf, n=Inf)

## ----adjust, cache=TRUE, message=FALSE, warning=FALSE, results='hide'---------
tt.norm.adj =
	tt.norm %>% adjust_abundance(	~ condition + time)


## ---- eval=FALSE--------------------------------------------------------------
#  library(sva)
#  
#  count_m_log = log(count_m + 1)
#  
#  design =
#  		model.matrix(
#  			object = ~ condition + time,
#  			data = annotation
#  		)
#  
#  count_m_log.sva =
#  	ComBat(
#  			batch =	design[,2],
#  			mod = design,
#  			...
#  		)
#  
#  count_m_log.sva = ceiling(exp(count_m_log.sva) -1)
#  count_m_log.sva$cell_type = counts[
#  	match(counts$sample, rownames(count_m_log.sva)),
#  	"Cell type"
#  ]
#  

## ----cibersort, cache=TRUE----------------------------------------------------
tt.cibersort =
	tt %>%
	deconvolve_cellularity(action="get", cores=1)


## ---- eval=FALSE--------------------------------------------------------------
#  
#  source(‘CIBERSORT.R’)
#  count_m %>% write.table("mixture_file.txt")
#  results <- CIBERSORT(
#  	"sig_matrix_file.txt",
#  	"mixture_file.txt",
#  	perm=100, QN=TRUE
#  )
#  results$cell_type = tidybulk::counts[
#  	match(tidybulk::counts$sample, rownames(results)),
#  	"Cell type"
#  ]
#  

## ----cluster, cache=TRUE------------------------------------------------------
tt.norm.cluster = tt.norm.MDS %>%
  cluster_elements(method="kmeans",	centers = 2, action="get" )

## ---- eval=FALSE--------------------------------------------------------------
#  count_m_log = log(count_m + 1)
#  
#  k = kmeans(count_m_log, iter.max = 1000, ...)
#  cluster = k$cluster
#  
#  cluster$cell_type = tidybulk::counts[
#  	match(tidybulk::counts$sample, rownames(cluster)),
#  	c("Cell type", "Dim1", "Dim2")
#  ]
#  

## ----SNN, cache=TRUE, message=FALSE, warning=FALSE, results='hide'------------
tt.norm.SNN =
	tt.norm.tSNE %>%
	cluster_elements(method = "SNN")

## ---- eval=FALSE--------------------------------------------------------------
#  library(Seurat)
#  
#  snn = CreateSeuratObject(count_m)
#  snn = ScaleData(
#  	snn, display.progress = TRUE,
#  	num.cores=4, do.par = TRUE
#  )
#  snn = FindVariableFeatures(snn, selection.method = "vst")
#  snn = FindVariableFeatures(snn, selection.method = "vst")
#  snn = RunPCA(snn, npcs = 30)
#  snn = FindNeighbors(snn)
#  snn = FindClusters(snn, method = "igraph", ...)
#  snn = snn[["seurat_clusters"]]
#  
#  snn$cell_type = tidybulk::counts[
#  	match(tidybulk::counts$sample, rownames(snn)),
#  	c("Cell type", "Dim1", "Dim2")
#  ]
#  

## ----drop, cache=TRUE---------------------------------------------------------
tt.norm.non_redundant =
	tt.norm.MDS %>%
  remove_redundancy(	method = "correlation" )

## ---- eval=FALSE--------------------------------------------------------------
#  library(widyr)
#  
#  .data.correlated =
#  	pairwise_cor(
#  		counts,
#  		sample,
#  		transcript,
#  		rc,
#  		sort = TRUE,
#  		diag = FALSE,
#  		upper = FALSE
#  	) %>%
#  	filter(correlation > correlation_threshold) %>%
#  	distinct(item1) %>%
#  	rename(!!.element := item1)
#  
#  # Return non redundant data frame
#  counts %>% anti_join(.data.correlated) %>%
#  	spread(sample, rc, - transcript) %>%
#  	left_join(annotation)
#  
#  
#  

## ----heat, eval=FALSE---------------------------------------------------------
#  tt.norm.MDS %>%
#  
#    # filter lowly abundant
#    keep_abundant() %>%
#  
#    # extract 500 most variable genes
#    keep_variable( .abundance = count_scaled, top = 500) %>%
#  
#    # create heatmap
#  	as_tibble() %>%
#    heatmap(sample, transcript, count_scaled, transform = log1p) %>%
#  	add_tile(`Cell type`)

## ---- eval=FALSE--------------------------------------------------------------
#  # Example taken from airway dataset from BioC2020 workshop.
#  dgList <- SE2DGEList(airway)
#  group <- factor(dgList$samples$`Cell type`)
#  keep.exprs <- filterByExpr(dgList, group=group)
#  dgList <- dgList[keep.exprs,, keep.lib.sizes=FALSE]
#  dgList <- calcNormFactors(dgList)
#  logcounts <- cpm(dgList, log=TRUE)
#  var_genes <- apply(logcounts, 1, var)
#  select_var <- names(sort(var_genes, decreasing=TRUE))[1:500]
#  highly_variable_lcpm <- logcounts[select_var,]
#  colours <- c("#440154FF", "#21908CFF", "#fefada" )
#  col.group <- c("red","grey")[group]
#  gplots::heatmap.2(highly_variable_lcpm, col=colours, trace="none", ColSideColors=col.group, scale="row")

## ----density, eval=FALSE------------------------------------------------------
#  # Example taken from airway dataset from BioC2020 workshop.
#  airway %>%
#      tidybulk() %>%
#  	  identify_abundant() %>%
#      scale_abundance() %>%
#      pivot_longer(cols = starts_with("counts"), names_to = "source", values_to = "abundance") %>%
#      filter(!lowly_abundant) %>%
#      ggplot(aes(x=abundance + 1, color=sample)) +
#      geom_density() +
#      facet_wrap(~source) +
#      scale_x_log10()

## ---- eval=FALSE--------------------------------------------------------------
#  # Example taken from airway dataset from BioC2020 workshop.
#  dgList <- SE2DGEList(airway)
#  group <- factor(dgList$samples$dex)
#  keep.exprs <- filterByExpr(dgList, group=group)
#  dgList <- dgList[keep.exprs,, keep.lib.sizes=FALSE]
#  dgList <- calcNormFactors(dgList)
#  logcounts <- cpm(dgList, log=TRUE)
#  var_genes <- apply(logcounts, 1, var)
#  select_var <- names(sort(var_genes, decreasing=TRUE))[1:500]
#  highly_variable_lcpm <- logcounts[select_var,]
#  colours <- c("#440154FF", "#21908CFF", "#fefada" )
#  col.group <- c("red","grey")[group]
#  gplots::heatmap.2(highly_variable_lcpm, col=colours, trace="none", ColSideColors=col.group, scale="row")

## -----------------------------------------------------------------------------
sessionInfo()

Try the tidybulk package in your browser

Any scripts or data that you put into this service are public.

tidybulk documentation built on April 7, 2021, 6 p.m.