#!/usr/bin/env Rscript
logging::basicConfig(level = "INFO")
# oppsite to %in%
'%!in%' <- function(x,y) !('%in%'(x,y))
# Return the indices of the rows that average above the cut off
#
# Args:
# data: Data to measure the average row and evaluate
# against the cutoff. Row = Genes, Col = Cells.
# cutoff: Threshold to be above to be kept.
#
# Returns:
# Returns a vector of row indicies to keep (are above the cutoff).
#
above_cutoff <- function(data, cutoff) {
average_gene <- log2(rowMeans(((2^data) - 1), na.rm = TRUE) + 1)
logging::loginfo(":Averages (counts).")
# Find averages above a certain threshold
genes <- rownames(data)[which(average_gene > cutoff)]
if (length(genes) > 0) {
return(genes)
} else {
return(NULL)
}
}
# Center data and threshold (both negative and postive values)
#
# Args:
# center_data: Matrix to center. Row = Genes, Col = Cells.
# threshold: Values will be required to be with -/+1 *
# threshold after centering.
# Returns:
# Centered and thresholded matrix.
#
center_with_threshold <- function(center_data) {
# Center data (automatically ignores zeros)
center_data <- center_data - rowMeans(center_data, na.rm = TRUE)
# Cap values between threshold and -threshold and recenter
center_data[center_data > 3] <- 3
center_data[center_data < -3] <- -3
center_data <- center_data - rowMeans(center_data, na.rm = TRUE)
return(center_data)
}
# Smooth a matrix by column using a simple moving average.
# Tails of the averages use a window length that is truncated to
# available data.
#
# Args:
# data: Data matrix to smooth. Row = Genes, Col = Cells.
# window_size: Length of window to use for the moving average.
# Should be a positive, odd integer.
#
# Returns:
# Matrix with columns smoothed with a simple moving average.
#
smooth_window <- function(data, window_size) {
if (window_size < 2)
return(data)
if (window_size > nrow(data))
return(data)
tail_length <- (window_size - 1)/2
num_genes <- nrow(data)
data_sm <- apply(data, 2, smooth_window_helper, window_size = window_size)
# Fix ends
data_end <- apply(data, 2, smooth_ends_helper, obs_tails = tail_length)
for (row_end in 1:tail_length) {
end_bound <- (num_genes - row_end) + 1
data_sm[row_end, ] <- data_end[row_end, ]
data_sm[end_bound, ] <- data_end[end_bound, ]
}
# Set back row and column names
row.names(data_sm) <- row.names(data)
colnames(data_sm) <- colnames(data)
return(data_sm)
}
# Helper function for smoothing the ends of a moving average.
#
# Args:
# obs_data: Data to smooth
# obs_tails: Length of the tail to smooth.
#
# Returns:
# Data smoothed.
#
smooth_ends_helper <- function(obs_data, obs_tails) {
end_data <- rep(NA, length(obs_data))
obs_count <- length(obs_data)
for (tail_end in 1:obs_tails) {
bounds <- tail_end - 1
end_tail <- obs_count - bounds
end_data[tail_end] <- mean(obs_data[(tail_end - bounds):(tail_end + bounds)],
na.rm = TRUE)
end_data[end_tail] <- mean(obs_data[(end_tail - bounds):(end_tail + bounds)],
na.rm = TRUE)
}
return(end_data)
}
# Smooth vector of values over the given window length.
#
# Args:
# obs_data: Vector of data to smooth with a moving average.
# window_size: Length of the window for smoothing.
# Must be and odd, positive, integer.
#
# Returns:
# Vector of values smoothed with a moving average.
#
smooth_window_helper <- function(obs_data, window_size) {
return(stats::filter(obs_data, rep(1/window_size, window_size), sides = 2))
}
# Center data after smoothing. Center with in cells using median.
#
# Args:
# data_smoothed: Matrix to center.
# Row = Genes, Col = cells.
#
# Returns:
# Matrix that is median centered.
# Row = Genes, Col = cells.
#
center_smoothed <- function(data_smoothed) {
logging::loginfo(":Center_smoothed.")
# Center within columns
row_median <- apply(data_smoothed, 2, median)
return(t(apply(data_smoothed, 1, "-", row_median)))
}
# Remove the average of the genes of the reference observations from all
# observations' expression. Normalization by column.
#
# Args:
# average_data: Matrix containing the data to remove average
# from (this includes the reference observations).
# Row = Genes, Col = Cells.
# ref_observations: Indices of reference observations.
# Only these are used in the average.
# ref_groups: A list of vectors of indices refering to the
# different groups of the reference indices.
#
# Returns:
# Expression with the average gene expression in the reference
# observations removed.
#
average_over_ref <- function(average_data, ref_observations) {
# r = genes, c = cells Max and min mean gene expression within reference groups.
ref_average <- rowMeans(average_data[, ref_observations, drop = FALSE])
# Remove the average gene expression of the reference groups from the For each
# gene.
average_data <- sweep(average_data, 1, ref_average, "-")
return(average_data)
}
# Remove the tails of values of a specific chromosome.
# The smooth_matrix values are expected to be in genomic order.
# If the tail is too large and no contig will be left 1/3 of the
# contig is left.
#
# Args:
# smooth_matrix: Smoothed values in genomic order.
# Row = Genes, Col = Cells.
# chr: Indices of the chr in which the tails are to be removed.
# tail_length: Length of the tail to remove on both ends of the
# chr indices.
# Returns:
# Indices to remove.
#
remove_tails <- function(smooth_matrix, chr, tail_length) {
chr_length <- length(chr)
if ((tail_length < 3) || (chr_length < 3)) {
return(c())
}
if (chr_length < (tail_length * 2)) {
tail_length <- floor(chr_length/3)
}
remove_indices <- -1 * chr[1:tail_length]
remove_indices <- c(remove_indices, -1 * (chr[((chr_length + 1) - tail_length):chr_length]))
return(remove_indices)
}
# Order the data and subset the data to data in the genomic position file.
#
# Args:
# data: Data (expression) matrix where the row names should be in
# the row names of the genomic_position file.
# genomic_position: Data frame read in from the genomic position file
#
# Returns:
# A matrix of expression in the order of the
# genomic_position file. NULL is returned if the genes in both
# data parameters do not match.
#
order_reduce <- function(data, genomic_position) {
ret_results <- list(expr = NULL, order = NULL, chr_order = NULL)
if (is.null(data) || is.null(genomic_position)) {
return(ret_results)
}
# Drop pos_gen entries that are position 0
remove_by_position <- -1 * which(genomic_position[2] + genomic_position[3] ==
0)
if (length(remove_by_position)) {
genomic_position <- genomic_position[remove_by_position, , drop = FALSE]
}
# Reduce to genes in pos file
keep_genes <- row.names(data)[which(row.names(data) %in% row.names(genomic_position))]
# Keep genes found in position file
if (length(keep_genes)) {
ret_results$expr <- data[keep_genes, , drop = FALSE]
ret_results$order <- genomic_position[keep_genes, , drop = FALSE]
} else {
logging::loginfo(paste("inferCNV:order_reduce: The position file ", "and the expression file row (gene) names do not match."))
return(list(expr = NULL, order = NULL, chr_order = NULL))
}
# Set the chr to factor so the order can be arbitrarily set and sorted.
chr_levels <- unique(genomic_position[["chr"]])
ret_results$order[["chr"]] <- factor(ret_results$order[["chr"]], levels = chr_levels)
# Sort genomic position file and expression file to genomic position file Order
# genes by genomic region
order_names <- row.names(ret_results$order)[with(ret_results$order, order(chr,
start, stop))]
ret_results$expr <- ret_results$expr[order_names, , drop = FALSE]
# This is the contig order, will be used in visualization. Get the contig order
# in the same order as the genes.
ret_results$order <- ret_results$order[order_names, , drop = FALSE]
ret_results$chr_order <- ret_results$order[1]
# Remove any gene without position information Genes may be sorted correctly by
# not have position information Here they are removed.
logging::loginfo(paste0(":Order and subset genes, ", "new dimensions (genes,cells) = ",
paste(dim(data), collapse = ","), "."))
return(ret_results)
}
#' @title Calculate CNV scores given a matrix of RNASeq counts. Output a matrix of final values.
#'
#' @param data: expression matrix (genes X cells),
#' assumed to be log2(TPM+1) .
#' @param gene_order: ordering of the genes (data's rows)
#' according to their genomic location
#' To include all genes use 0.
#' @param cutoff: cut-off for the average expression of genes to be
#' used for CNV inference.
#' @param reference_obs: Column names of the subset of cells (data's columns)
#' that should be used as references.
#' If not given, the average of all cells will
#' be the reference.
#' @param window_size: length of the window for the moving average
#' (smoothing). Should be an odd integer.
#' @param out_path: the path to what to save the pdf as. The raw data is
#' also written to this path but with the extension .txt .
#' @param contig_tail: length of the tail removed from the ends of contigs.
#' @param noise_filter: the minimum difference a value can be from the
#' average reference in order for it not to be
#' removed as noise.
#' @param vis_bounds: Used as upper and lower bounds for values in the visualization.
#' Should be given in the form of '-1,1' (lower bound, upper bound).
#'
#' @return
#' Returns an 'infercnv' object including:
#' 1. CNV matrix before visualization.
#' 2. CNV matrix after denoise and outlier removal for visualization.
#' 3. Chromosome order.
#' 4. Names of cells in reference groups.
#'
calcCNV <- function(data,
gene_pos,
cutoff,
reference_obs,
window_size,
out_path,
contig_tail,
noise_filter,
vis_bounds) {
ret_list <- list()
# Data
data <- as.matrix(data)
logging::loginfo(paste(":Original matrix dimensions (genes,cells) =", paste(dim(data),
collapse = ",")))
# Order and reduce the expression to the genomic file.
colnames(gene_pos) <- c("chr", "start", "stop")
order_ret <- order_reduce(data = data, genomic_position = gene_pos)
data <- order_ret$expr
gene_order <- order_ret$order
if (is.null(data)) {
error_message <- paste("None of the genes in the expression data", "matched the genes in the reference genomic",
"position file. Analysis Stopped.")
stop(error_message)
}
# Reduce by cutoff
keep_genes <- above_cutoff(data, cutoff)
if (!is.null(keep_genes)) {
data <- data[keep_genes, , drop = FALSE]
gene_order <- gene_order[keep_genes, , drop = FALSE]
logging::loginfo(paste0(":Reduce by cutoff, ", "new dimensions (genes,cells) = ",
paste(dim(data), collapse = ","), "."))
logging::logdebug(":Keeping genes.")
} else {
logging::loginfo(":Reduce by cutoff.")
logging::logwarn(paste("::No genes left to keep.", "Stoping."))
stop(998)
}
# Reduce contig info
chr_order <- gene_order[1]
gene_order <- NULL
# Center data (automatically ignores zeros)
data <- center_with_threshold(data)
logging::loginfo(paste0(":Outlier removal, ", " Min=", round(min(data), 3), " Max=",
round(max(data), 3), "."))
# Smooth the data with gene windows
data_smoothed <- smooth_window(data, window_size)
data <- NULL
logging::loginfo(":Smoothed data.")
# Center cells/observations after smoothing. This helps reduce the effect of
# complexity.
data_smoothed <- center_smoothed(data_smoothed)
# Remove average reference
data_smoothed <- average_over_ref(average_data = data_smoothed, ref_observations = reference_obs)
logging::loginfo(paste0(":Remove average, ", " Min=", round(min(data_smoothed), 3),
" Max=", round(max(data_smoothed), 3), "."))
# Remove Ends
remove_indices <- c()
logging::loginfo(":Remove tails.")
for (chr in unlist(unique(chr_order))) {
logging::loginfo(paste0("::Remove tail contig ", chr, "."))
remove_chr <- remove_tails(data_smoothed, which(chr_order == chr), contig_tail)
remove_indices <- c(remove_indices, remove_chr)
}
if (length(remove_indices) > 0) {
chr_order <- chr_order[remove_indices, ]
data_smoothed <- data_smoothed[remove_indices, , drop = FALSE]
}
# Save objects
ret_list[["cnv_score_raw"]] <- as.matrix(data_smoothed)
ret_list[["chr_order"]] <- paste(as.vector(as.matrix(chr_order)))
ret_list[["reference_obs"]] <- reference_obs
ret_list[["parameter"]] <- list(gene_cutoff = cutoff, window_size = window_size,
contig_tail = contig_tail)
class(ret_list) <- "infercnv"
# Denoise and outliers removal for visualization
ret_list <- denoiseVis(data = ret_list,
noise_threshold = noise_filter,
vis_bounds = vis_bounds)
assay <- ifelse(is.null(noise_filter), "cnv_score_vis",
paste0("cnv_score_vis_filter", noise_filter))
finalMat <- ret_list[[assay]]
# Log output
logging::loginfo(paste0(":Final dimensions (genes,cells) = ", paste(dim(finalMat),
collapse = ","), " Min=", round(min(finalMat), 3), " Max=", round(max(finalMat),
3), "."))
return(ret_list)
}
#' Clustering and plotting cells based on cnv score matrix using ComplexHeatmap package.
#'
#' Args:
#' @param data: an 'infercnv' object as produced by inferCNV.
#' @param assay: assay which assigned to use for plotting.
#' @param ref_lab: label for reference cells.
#' @param obs_lab: label for observations.
#' @param dist_method: distance measure used in clustering cells, possible
#' values are 'correlation' for Pearson correlation
#' and all the distances supported by stats::dist.
#' @param clustering_method: clustering method used. Accepts the same values as stats::hclust.
#' @param cutree_k: an integer scalar or vector with the desired number of groups
#' @param colors: vector of colors used in heatmap.
#' @param border: whether draw border. The value can be logical or a string of color.
#' @param out_file: filename to save plot
#' @param out_path: output directory
#'
#' @return
#' Returns an 'infercnv' object including:
#' 1. Clustering result.
#' 2. Dendlist of 'cutree'.
#'
plotCNV <- function(data,
assay,
ref_lab,
obs_lab,
dist_method,
clustering_method,
cutree_k,
colors,
border,
plot_dend,
out_file,
out_path) {
# data processing -----------------
logging::loginfo(":Data processing")
ret_list <- data
ret_list[["cluster"]] <- list()
plot_data <- as.matrix(t(data[[assay]]))
hc <- hclust(dist(plot_data, method = dist_method), method = clustering_method)
ret_list[["cluster"]][["hclust"]] <- hc
dend <- as.dendrogram(hc)
if (is.numeric(cutree_k)) {
ret_list <- pruneCutree(ret_list, k = cutree_k, out_path = out_path, plot_dend=plot_dend)
dend <- ret_list[["cluster"]][["dend"]]
ret_list[["cluster"]][["dend"]] <- NULL
}
# plot ----------------------
logging::loginfo(":Plot cnv")
# set annotation column annotation: chromosome
chr_order <- data$chr_order
chr <- ifelse(chr_order %in% c(seq(1, 22, 2), "X"), "odd", "even")
col_anno <- HeatmapAnnotation(chr = chr,
col = list(chr = c(odd = "gray60", even = "gray80")),
show_legend = FALSE, show_annotation_name = TRUE)
## row annotation: origin
ref_obs <- as.character(data$reference_obs)
if (length(ref_obs) == nrow(plot_data)){
ref_obs <- NULL
}
origin <- rep(obs_lab, nrow(plot_data))
origin[which(rownames(plot_data) %in% ref_obs)] <- ref_lab
row_anno <- rowAnnotation(origin=origin, col = list(origin = c('obs'='#FFFF00',
'ref'='#696969')), annotation_legend_param = list(title_gp=gpar(fontsize=12, fontface='bold'),
labels_gp=gpar(fontsize=11), legend_height = unit(4, "cm"), grid_width=unit(0.5, "cm")),
show_legend=TRUE, show_annotation_name=FALSE)
# set legend parameters
vis_bounds <- data[["parameter"]][["vis_bounds"]]
limits <- c(min(vis_bounds), 0, max(vis_bounds))
lg_param <- list(title='cnv',
at=limits,
labels=limits,
title_gp=gpar(fontsize=12, fontface='bold'),
labels_gp=gpar(fontsize=11),
legend_height = unit(3, "cm"),
grid_width=unit(0.5, "cm"),
border='black')
# plot
png(file.path(out_path, out_file), width = 9.5, height = 6, units = "in", res = 300)
ht <- Heatmap(plot_data,
name = "cnv",
col = colors,
cluster_columns = FALSE,
cluster_rows = dend,
show_row_names = FALSE,
show_column_names = FALSE,
row_split = cutree_k,
heatmap_legend_param = lg_param,
top_annotation = col_anno,
left_annotation = row_anno,
row_dend_width = unit(0.15, "npc"),
row_gap = unit(0, "mm"),
border=TRUE)
draw(ht)
# add chromosome labels
chr_lab <- c()
x_axis <- c()
for (c in unique(chr_order)) {
perc <- length(which(chr_order == c))/length(chr_order)
chr_lab <- c(chr_lab, ifelse(perc >= 0.02, c, "."))
x_axis <- c(x_axis, median(which(chr_order == c))/length(chr_order))
}
decorate_annotation("chr", {
grid.text(chr_lab, x = unit(x_axis, "npc"), gp=gpar(fontsize=10.5))
})
dev.off()
return(ret_list)
}
# Subtract cells from specific branches
#
# Args:
# data: an 'infercnv' object as produced by inferCNV.
# subtrees: specify the numbers of subtrees to extract.
# lab_to_rm: a character scalar with the label of cells to exclude.
#
# Returns:
# An 'infercnv' object with target cells
#
extract_helper <- function(data,
subtrees = NULL,
lab_to_rm = "ref") {
# extract leaves
dendlist <- data[["cluster"]][["cutree"]]
dendl <- dendlist[as.numeric(subtrees)]
leaves <- unlist(sapply(dendl, labels))
cells <- grep(lab_to_rm, leaves, value = TRUE, invert = TRUE)
data[["target"]] <- cells
data[["parameter"]][["subtrees"]] <- subtrees
return(data)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.