# TDM (Training Distribution Matching)
#
# This file contains functions used to perform training distribution
# matching on the gene expression values in a file, as well as other
# functions that may be useful when using machine learning on such
# datasets. It is a gentle 'normalization' technique that equalizes
# values in the tails of the data to make the tails shorter and the
# overall distribution more similar to those in a reference file.
# It assumes that the target distribution is more skewed than the
# reference distribution, such as is usually the case between RNA-seq
# and microarray expression datasets.
#
# author: Jeffrey Ahearn Thompson
#' Inverse Log2 of a Single Value
#'
#' Takes the inverse log of a value.
#'
#' @param value -- A numeric value.
#'
#' @return the inverse log of value.
#'
#' @export
inv_log <- function(value) {
return(2^as.numeric(value))
} # end inv_log
#' Log2 plus 1 a Single Value
#'
#' Takes the log2 of 1 plus a value.
#'
#' @param value -- A numeric value.
#'
#' @return the log2(1 + value)
#'
#' @export
log2p1 <- function(value) {
return(log1p(as.numeric(value))/log(2))
}
#' Data.table MAX
#'
#' Finds the max of a data.table.
#'
#' @param datatable -- A data.table whose first column will be ignored
#' but whose other columns are all numeric.
#' @return the maximum numeric value in the data.table.
#' @export
max_dt <- function(datatable) {
return(max(as.numeric(as.matrix(datatable[,2:ncol(datatable),with=F]))))
} # end max_dt
#' Data.table MIN
#'
#' Finds the min of a data.table.
#'
#' @param datatable -- A data.table whose first column will be ignored
#' but whose other columns are all numeric.
#' @return the minimum numeric value in the data.table.
#' @export
min_dt <- function(datatable) {
return(min(as.numeric(as.matrix(datatable[,2:ncol(datatable),with=F]))))
} # end min_dt
#' IQR
#'
#' Finds the IQR of a data.table.
#'
#' @param datatable -- A data.table whose first column will be ignored
#' but whose other columns are all numeric.
#' @return IQR of the data.table.
#' @export
iqr_dt <- function(datatable) {
quartiles=(summary(as.numeric(data.matrix(datatable[,2:ncol(datatable), with=F])))[c(2,5)])
return(as.numeric(quartiles[2]) - as.numeric(quartiles[1]))
} # end iqr_dt
#' Quartiles
#'
#' Finds the 1st and 3rd quartiles of a data.table.
#'
#' @param datatable -- A data.table whose first column will be ignored
#' but whose other columns are all numeric.
#' @return 1st and 3rd quartiles of the data.table.
#' @export
quartiles_dt <- function(datatable) {
quartiles=(summary(as.numeric(array(as.matrix(datatable[,2:ncol(datatable), with=F]))))[c(2,5)])
return(quartiles)
}
#' Inverse Log Transform
#'
#' Given a data.table of values, each value is inverse log transformed.
#'
#' @param data -- A data.table of values. The first column should be
#' gene symbols and the rest should be expression values.
#' @param file -- A filename with expression values. This is an alternative
#' to specifying data.
#' @return the transformed data.table.
#'
#' @export
inv_log_transform = function(data = NULL, file = NULL) {
if(is.null(data)) {
if(is.null(file)) {
stop("You must specifcy either data or file.")
} else {
data <- fread(file, header=T, data.table=T)
}
}
# ensure gene expression values are numeric
data <- ensure_numeric_gex(data)
# Convert the numeric part of the data.table to a matrix.
# Assuming first column contains gene symbols.
datamat = data.matrix(data[,2:ncol(data),with=F])
# Inverse log transform the matrix.
inv_log = apply(datamat,1,inv_log)
# Convert the result back to a data.table and bind the symbols back on.
result = data.table(cbind(as.character(data[[1]]), t(inv_log)))
setnames(result, colnames(result), colnames(data))
return(ensure_numeric_gex(result))
} # end inv_log_transform
#' Log2 Transform 1 Plus
#'
#' The input is log transformed such that all values are non-negative by
#' adding 1 to each value before transformation.
#'
#' @param data -- A data.table of values. The first column should be
#' gene symbols and the rest should be expression values.
#' @param file -- A filename with expression values. This is an alternative
#' to specifying data.
#'
#' @return the transformed data.table.
#'
#' @export
log_transform_p1 = function(data = NULL, file = NULL) {
if(is.null(data)) {
if(is.null(file)) {
stop("You must specify either data or file.")
} else {
data <- fread(file, header=T, data.table=T)
}
}
# ensure gene expression values are numeric
data <- ensure_numeric_gex(data)
# Convert the numeric part of the data.table to a matrix.
# Assuming first column contains gene symbols.
datamat = data.matrix(data[,2:ncol(data),with=F])
# Log (+1) transform the matrix.
log_transform = apply(datamat,1,log2p1)
# Convert the result back to a data.table and bind the
# gene symbols back on.
result = data.table(cbind(as.character(data[[1]]), t(log_transform)))
setnames(result, colnames(result), colnames(data))
return(ensure_numeric_gex(result))
} # end log_transform_p1
#' Zero to One Transform a Vector
#'
#' A data vector is transformed so that all values are in range [0,1].
#'
#' @param data -- A vector of gene expression values.
#'
#' @return the transformed vector.
#'
#' @export
zero_to_one = function(data) {
# If the data are already all 0, then do nothing.
# If the data are all the same, then stop.
if(all(data==0)) {
return(as.vector(rep(0.0, length(data))))
}else if(min(data) == max(data)){
return(as.vector(rep(0.0, length(data))))
if(min(data) < 1){
return(data)
} else {
return(as.vector(rep(0.0, length(data))))
}
}
# Transform the values.
line = (data-min(data))/(max(data)-min(data))
return(as.vector(line))
}
#' Zero to One Transformation
#'
#' The input is standardized to the range [0,1] by row.
#'
#' @param data -- A data.table of values. The first column should be
#' gene symbols and the rest should be expression values.
#'
#' @return the transformed datatable.
#'
#' @export
zero_to_one_transform = function(datatable) {
# ensure gene expression values are numeric
datatable <- ensure_numeric_gex(datatable)
# Convert the data to a matrix.
datamat = data.matrix(datatable[,2:ncol(datatable),with=F])
# Transform to [0,1] range, row by row.
zo = apply(datamat,1,zero_to_one)
# Bind on the gene symbols.
result = data.table(data.frame(as.character(datatable[[1]]), t(zo)))
setnames(result, colnames(result), colnames(datatable))
return(ensure_numeric_gex(result))
}
#' TDM Transformation on a Single Value
#'
#' Applies TDM transformation to a single value.
#'
#' @param value -- A single value from a distribution that is being
#' TDM transformed.
#' @param iqr -- inter-quartile-range of distribution being transformed
#' @param old_min -- min of distribution being transformed
#' @param old_max -- max of distribution being transformed
#' @param old_first_q -- first quartile of distribution being transformed
#' @param old_third_q -- third quartile of distribution being transformed
#' @param ref_min -- min of reference distribution
#' @param ref_max -- max of reference distribution
#' @param ref_third_q -- third quartile of reference distribution
#' @param ref_first_q -- first quartile of reference distribution
#' @param log_reference -- logical indcating if reference distribution was logged
#' before values were extracted
#' @param log_target -- logical indicating if target distribution should be logged
#'
#' @return TDM of the value passed.
#' @export
tdm <- function(value, iqr, old_min, old_max, old_third_q, old_first_q, ref_min, ref_max, ref_third_q, ref_first_q, inv_reference=TRUE, log_target=TRUE, tdm_factor=1) {
# If the reference distribution was log transformed, then inverse log its
# values here. This will come out the same, because log is monotonically
# increasing.
if(inv_reference) {
ref_max = 2^ref_max
ref_min = 2^ref_min
ref_third_q = 2^ref_third_q
ref_first_q = 2^ref_first_q
}
# Find the number of iqrs from reference distribution that fit between
# it's third quartile and max.
topscale = (ref_max- ref_third_q) / ((ref_third_q - ref_first_q)*tdm_factor)
# Do the same for min and first quartile.
bottomscale = (ref_first_q - ref_min) / ((ref_third_q - ref_first_q)*tdm_factor)
# Set cutoff values that would make the target distribution have same
# relationship between quartiles and min and max.
upandout = old_third_q + topscale*iqr*tdm_factor
downandout = old_first_q - bottomscale*iqr*tdm_factor
# The bottom threshold should not be less than the minimum already was.
if(downandout < old_min) {
downandout = old_min
}
# Set any values that are greater or less than the thresholds to be equal
# to the thresholds.
if(value > upandout) {
value = upandout
} else if(value < downandout) {
value = downandout
}
# Now scale the value so that all values are within the same min and max
# as the reference distribution.
result = (value - downandout)/(upandout - downandout) * (ref_max - ref_min) + ref_min
# If the target distribution should be log transformed, then do it.
if(log_target) {
result = log2(result)
}
return(result)
} # end tdm
euc.dist = function(x1, x2) sqrt(sum((x1 - x2) ^ 2))
trim = function (x) gsub("^\\s+|\\s+$", "", x)
#' TDM Transformation
#'
#' The expression files should have the same sorts of gene symbols. The target file
#' will be filtered to include only symbols that are in the reference file and the
#' order will also be changed to match. If a gene is not present in the target file,
#' an entry of all 0s will be added for that gene.
#'
#' @param file -- The file of gene expression values to transform, presumably
#' containing RNA-seq values. It should have a header, but the header
#' should only contain column names, not a row name. All other rows should
#' have the same number of values and the first value should be a row name.
#' This is an inconveniance of working with data.table, but they are much faster.
#' Values should be tab-separated.
#'
#' @param ref_file -- the file to transform in relation to, presumably microarray
#' expression values. Values should be tab-separated.
#'
#' @param negative -- should the reference file be first inverse log transformed, then
#' log transformed again adding 1 to each value? Do this if the file has already
#' been log transformed and their are negative numbers in it.
#'
#' @param filter_p -- should the patients in the target file be filtered to include
#' only those that match patients (column ids) in the reference file? Only the first
#' 15 characters are checked for match.
#'
#' @param inv_reference -- should reference data be inverse log transformed? Do this
#' if the target is not already log transformed (which is the typical case). This just
#' ensures that the comparison is meaningful.
#'
#' @param log_target -- should the target file be log transformed after TDM? Usually
#' it should, since microarray data are typically log transformed and the whole
#' point of the transformation is to make the data more comparable.
#'
#' @param tdm_factor -- a factor of the IQR, to adjust the granularity of distribution
#' matching.
#'
#' @return void
#'
#' @export
tdm_transform <- function(target_data=NULL,
ref_data=NULL,
file=NULL,
ref_file=NULL,
negative=FALSE,
filter_p=FALSE,
inv_reference = TRUE,
log_target=TRUE,
tdm_factor=1){
load_it(c("data.table", "binr", "scales"))
# Preprocess the reference data.
if(is.null(ref_data)) {
# Read the first line of the reference file.
ref_head = readLines(ref_file, n=1)
# Split all values in the header of the reference file by tabs.
split_head = unlist(strsplit(ref_head, "\t"))
# Read in the reference expression values.
ref_values <- fread(ref_file, header=F, skip=1, data.table=T)
# If there is no row name for the header, then handle it.
if(ncol(ref_values) == length(split_head)) {
ref_head = split_head[2:length(split_head)]
} else {
ref_head = split_head
}
rm(split_head)
# Add a rowname to the header, just to make things consistent.
setnames(ref_values, colnames(ref_values), c("gene", ref_head))
rm(ref_head)
} else {
ref_values = ref_data
}
# Get the gene symbols from the reference data.
genes = data.frame(gene = ref_values[[1]], drop=F)
# Inverse log, then relog reference values if asked.
if(negative) {
ref_values = inv_log_transform(ref_values)
ref_values = log_transform_p1(ref_values)
ref_values$gene = genes$gene
}
# Key the table to gene symbol.
#setkey(ref_values, "gene")
# Preprocess the target data.
if(is.null(target_data)) {
# Get the header of the target file.
exp_head = readLines(file, n=1)
split_head = unlist(strsplit(exp_head, "\t"))
# Read in the target expression file.
expression_values <- fread(file, header=F, skip=1, data.table=T)
# Again, handle the case when there is no row name for the target
# file header.
if(ncol(expression_values) == length(split_head)) {
exp_head = split_head[2:length(split_head)]
} else {
exp_head = split_head
}
rm(split_head)
setnames(expression_values, colnames(expression_values), c("gene", exp_head))
rm(exp_head)
}else {
expression_values = target_data
}
# Key the table to gene symbol.
#setkey(expression_values, "gene")
# If the gene symbols come with a vertical bar and Entrez id,
# such as Illumina data often do, then strip that suffix.
expression_values$gene = gsub('^(.*)\\|.*$', '\\1', expression_values$gene)
expression_values$gene = trim(expression_values$gene)
# Figure out which genes are in the reference file, but not
# in the target file.
missing = ref_values$gene[!(ref_values$gene %in% expression_values$gene)]
# Order the genes in the target file to be the same as those in
# the reference file.
expression_values = expression_values[match(ref_values$gene, expression_values$gene),1:ncol(expression_values),with=FALSE]
if(nrow(expression_values) < 1) {
stop("No matching genes found between datasets.")
}
# Filter the genes in the target file to include only those in
# the reference file.
expression_values = expression_values[expression_values$gene %in% ref_values$gene,1:ncol(expression_values),with=FALSE]
# Add missing data entries of all 0's for the genes that were in the
# reference file but not in the target file.
missing_matrix = matrix(rep(0,
(ncol(expression_values)-1) * length(missing)),
nrow=length(missing))
if(nrow(missing_matrix) > 0) {
missing_dt <- data.table(cbind(gene = missing, data.frame(missing_matrix)))
setnames(missing_dt, colnames(missing_dt), colnames(expression_values))
expression_values <- rbindlist(list(expression_values, missing_dt))
}
if(filter_p) {
# Filter the patients in the target file to include only those in
# the reference file. Currently, this assumes TCGA style patient ids.
setnames(expression_values, colnames(expression_values), substr(colnames(expression_values),1,15))
setnames(ref_values, colnames(ref_values), substr(colnames(ref_values),1,15))
cols = intersect(colnames(expression_values), colnames(ref_values))
expression_values <- expression_values[,na.omit(cols),with=FALSE]
}
# Set variables that are needed for TDM.
old_min = as.numeric(min_dt(expression_values))
old_max = as.numeric(max_dt(expression_values))
old_quartiles <- quartiles_dt(expression_values)
old_third_q <- as.numeric(old_quartiles[2])
old_first_q <- as.numeric(old_quartiles[1])
iqr <- as.numeric(iqr_dt(expression_values))
ref_min <- as.numeric(min_dt(ref_values))
ref_max <- as.numeric(max_dt(ref_values))
ref_quartiles <- quartiles_dt(ref_values)
ref_third_q <- as.numeric(ref_quartiles[2])
ref_first_q <- as.numeric(ref_quartiles[1])
# Get the coloumn names of the target file.
cols <- colnames(expression_values[,2:ncol(expression_values), with=F])
# Convert all expression entries to numeric.
for(j in cols) set(expression_values, j=j, value=as.numeric(expression_values[[j]]))
# Perform TDM
datamat = data.matrix(expression_values[,2:ncol(expression_values),with=F])
tdmmat = apply(datamat,1,function(x) {sapply(x, function(y) tdm(y, iqr, old_min, old_max, old_third_q, old_first_q, ref_min, ref_max, ref_third_q, ref_first_q, inv_reference, log_target, tdm_factor))})
result = data.table(cbind(as.character(expression_values[[1]]), t(tdmmat)))
result[[1]] = as.factor(result[[1]])
setnames(result, colnames(result), colnames(expression_values))
expression_values = result
# Order the genes in the target file to be the same as those in
# the reference file.
expression_values <- expression_values[match(genes$gene, expression_values$gene),1:ncol(expression_values),with=FALSE]
# ensure gene expression values are numeric
expression_values <- ensure_numeric_gex(expression_values)
return(expression_values)
} # end tdm_transform
#' Numeric version of data.table
#'
#' Ensure gene expression values are numeric in a given data.table
#'
#' @param input_data: a data.table with gene in the first column and gene
#' expression in the remaining columns
#'
#' @return a data.table with numeric values for the gene expression columns
#'
#' @export
ensure_numeric_gex <- function(input_data) {
if ("data.table" %in% class(input_data)) {
# save gene names as character vector
gene_names <- as.character(input_data[[1]])
# force gene expression values to be numeric
gex_values <- t(apply(input_data[,-1], 1, as.numeric))
# create data frame of gene names and gene expression values
# set column names to be same as input data
return_df <- data.frame(gene_names, gex_values)
names(return_df) <- names(input_data)
# return as data.table
return(data.table(return_df))
} else {
stop("\nInput must be a data.table")
}
} # ensure_numeric_gex
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.