# Compute predictive distribution of inferred profiles using VB
.predictive_infer_profile <- function(model, x, region = 1){
# Predictive mean
H <- design_matrix(model$basis, x)$H
W_pred <- c(H %*% model$W[region, ])
if (methods::is(model, "infer_profiles_vb_binomial") ||
methods::is(model, "infer_profiles_vb_bernoulli")) {
# Predictive variance
W_sd_pred <- sqrt(1 + diag(H %*% model$W_Sigma[[region]] %*% t(H)))
W_pred <- pnorm(W_pred / W_sd_pred)
}else if (methods::is(model, "infer_profiles_vb_gaussian")) {
W_sd_pred <- sqrt(1 / model$gaussian_l +
diag(H %*% model$W_Sigma[[region]] %*% t(H)))
}else {stop("Not predictive distribution for this object") }
return(list(W_pred = W_pred, W_sd_pred = W_sd_pred))
}
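# Illustrative usage (a sketch, not run; `fit` is a hypothetical fitted VB
# model, e.g. an object of class "infer_profiles_vb_binomial" carrying the
# W, W_Sigma and basis elements used above):
#   xs <- seq(-1, 1, length.out = 50)
#   pred <- .predictive_infer_profile(fit, xs, region = 1)
#   pred$W_pred     # predictive mean at each location in xs
#   pred$W_sd_pred  # predictive standard deviation at each location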
# Compute predictive distribution of clustered profiles using VB
.predictive_cluster_profile <- function(model, x){
K <- NCOL(model$W) # Number of clusters
H <- design_matrix(model$basis, x)$H # Design matrix
W_pred = W_sd_pred <- matrix(0, ncol = K, nrow = NROW(H))
for (k in 1:K) {
# Predictive mean
W_pred[,k] <- c(H %*% model$W[,k])
if (methods::is(model, "cluster_profiles_vb_binomial") ||
methods::is(model, "cluster_profiles_vb_bernoulli")) {
# Predictive variance
W_sd_pred[,k] <- sqrt(1 + diag(H %*% model$W_Sigma[[k]] %*% t(H)))
W_pred[,k] <- pnorm(W_pred[,k] / W_sd_pred[,k])
}else if (methods::is(model, "cluster_profiles_vb_gaussian")) {
W_sd_pred[, k] <- sqrt(1 / model$gaussian_l +
diag(H %*% model$W_Sigma[[k]] %*% t(H)))
}else {stop("No predictive distribution for this class of object.") }
}
return(list(W_pred = W_pred, W_sd_pred = W_sd_pred))
}
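# Illustrative usage (a sketch, not run; `cl_fit` is a hypothetical fitted
# VB clustering model, e.g. of class "cluster_profiles_vb_bernoulli"):
#   xs <- seq(-1, 1, length.out = 50)
#   pred <- .predictive_cluster_profile(cl_fit, xs)
#   pred$W_pred  # NROW(H) x K matrix, one predictive mean curve per cluster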
# Compute the min-max scaling
#
# \code{.minmax_scaling} normalizes a given vector using the min-max
# scaling method. More formally:
# \deqn{scaled = \frac{data - x_{min}}{x_{max} - x_{min}} \times (f_{max} -
# f_{min}) + f_{min}}
#
# @param data Vector with numeric data to be scaled.
# @param xmin Optional minimum value, otherwise \code{min(data)} will be used.
# @param xmax Optional maximum value, otherwise \code{max(data)} will be used.
# @param fmin Optional minimum range value, default is -1.
# @param fmax Optional maximum range value, default is 1.
#
# @return The scaled data in the given range, default is between (-1, 1). If
# xmin = xmax the input vector \code{data} is returned.
#
.minmax_scaling <- function(data, xmin = NULL, xmax = NULL,
fmin = -1, fmax = 1){
if (is.null(xmin)) { xmin <- min(data) }
if (is.null(xmax)) { xmax <- max(data) }
if ( (xmin - xmax) == 0) { return(data) }
minmax <- (data - xmin) / (xmax - xmin)
minmax_scaled <- minmax * (fmax - fmin) + fmin
return(minmax_scaled)
}
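# Worked example: with the defaults fmin = -1, fmax = 1,
#   .minmax_scaling(c(0, 5, 10))
# returns -1, 0, 1, since (data - 0) / (10 - 0) maps the data to (0, 0.5, 1)
# and the final rescaling maps that interval onto (-1, 1).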
# Compute inverse of min-max scaling to bring relative locations to actual
# genomic coordinates.
.inv_minmax_scaling <- function(data, xmin, xmax, fmin = -1, fmax = 1) {
x <- ( (data - fmin) * (xmax - xmin) / (fmax - fmin ) ) + xmin
return(x)
}
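# Worked example: inverting the scaling above recovers genomic coordinates,
#   .inv_minmax_scaling(0, xmin = 0, xmax = 10)  # returns 5
# i.e. the relative location 0 maps back to the midpoint of (0, 10).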
# Centre CpG locations
#
# \code{.do_centre_loc} centres CpG locations relative to the central point
# of the feature of interest.
#
# @param region CpG locations
# @param centre Genomic region central point
# @param strand_direction Strand direction, either "+" or "-".
#
# @return Centred location data
#
.do_centre_loc <- function(region, centre, strand_direction){
assertthat::assert_that(is.character(strand_direction))
tmp <- region - centre
# If '-' strand, swap CpG locations
if (identical(strand_direction, "-")) { tmp <- (-tmp) }
return(tmp)
}
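# Worked example:
#   .do_centre_loc(region = c(100, 150, 200), centre = 150,
#                  strand_direction = "+")   # returns -50, 0, 50
#   .do_centre_loc(c(100, 150, 200), 150, "-")  # returns 50, 0, -50
# On the "-" strand the signs are flipped, so locations are measured in the
# same orientation on both strands.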
# Partition data in train and test set
#
# \code{.partition_data} partitions data randomly into train and test sets.
#
# @param x Input / Independent variables
# @param y Dependent variables
# @param train_ind Index of training data; if NULL a random one is generated.
# @param train_perc Percentage of training data when partitioning.
#
# @return A list containing the train, test data and index of training data.
#
.partition_data <- function(x, y, train_ind = NULL, train_perc = 0.7){
# Convert both x and y to matrices
x <- data.matrix(x); colnames(x) <- NULL
y <- data.matrix(y); colnames(y) <- NULL
if (is.null(train_ind)) {
pivot <- NROW(x) * train_perc
train_ind <- sample(NROW(x), round(pivot))
}
train <- data.frame(x = x[train_ind, , drop = FALSE],
y = y[train_ind, , drop = FALSE])
test <- data.frame(x = x[-train_ind, , drop = FALSE],
y = y[-train_ind, , drop = FALSE])
return(list(train = train, test = test, train_ind = train_ind))
}
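# Illustrative usage (a sketch; the data here is arbitrary):
#   set.seed(1)
#   out <- .partition_data(x = matrix(rnorm(20), ncol = 2), y = rnorm(10))
#   NROW(out$train)  # 7 rows (70% of 10), with x and y columns
#   out$train_ind    # indices that can be reused for another partition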
# @title Number of parallel cores
#
# @description Function for setting the number of parallel cores that will be
# used during EM.
# @param no_cores Number of cores given as input.
# @param is_parallel Logical, whether parallel computation is required.
# @param max_cores Maximum number of cores allowed.
#
.parallel_cores <- function(no_cores=NULL, is_parallel=FALSE, max_cores = NULL){
if (is_parallel) { # If parallel mode is ON
# If number of cores is not given
if (is.null(no_cores)) { no_cores <- parallel::detectCores() - 1
} else{if (no_cores >= parallel::detectCores()) {
no_cores <- parallel::detectCores() - 1 }
}
if (is.na(no_cores)) { no_cores <- 2 }
if (!is.null(max_cores)) {
if (no_cores > max_cores) { no_cores <- max_cores }
}
}
return(no_cores)
}
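# Illustrative usage (results depend on the machine's available cores):
#   .parallel_cores(is_parallel = FALSE)  # returns NULL, i.e. serial mode
#   .parallel_cores(no_cores = 64, is_parallel = TRUE, max_cores = 4)
#   # capped at min(parallel::detectCores() - 1, max_cores)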
# Calculate error metrics
#
# \code{.calculate_errors} calculates error metrics to assess the
# performance of a model, e.g. linear regression.
# @param x Actual values.
# @param y Predicted values.
# @param summary Logical, indicating if the error metrics should be printed.
#
# @return A list containing the following components:
# \itemize{
# \item \code{mae} mean absolute error.
# \item \code{mse} mean squared error.
# \item \code{rmse} root mean squared error.
# \item \code{mape} mean absolute percentage error.
# \item \code{rstd} relative standard deviation.
# \item \code{rsq} R-squared (coefficient of determination).
# \item \code{pcc} Pearson correlation coefficient.
# }
#
.calculate_errors <- function(x, y, summary = FALSE){
# TODO Compute actual errors using the right DoF!!
R <- list()
if (!is.numeric(x) || !is.numeric(y))
stop("Arguments 'x' and 'y' must be numeric vectors.")
if (length(x) != length(y))
stop("Vectors 'x' and ' y' must have the same length.")
error <- y - x
# mean absolute error
R$mae <- mean(abs(error))
mae_f <- formatC(R$mae, digits = 4, format = "f")
# mean squared error
R$mse <- mean(error ^ 2)
mse_f <- formatC(R$mse, digits = 4, format = "f")
# root mean squared error (std. dev.)
R$rmse <- sqrt(R$mse)
rmse_f <- formatC(R$rmse, digits = 4, format = "f")
# mean absolute percentage error
R$mape <- mean(abs(error / x))
# relative standard deviation
R$rstd <- R$rmse / mean(x)
# Compute r-squared
R$rsq <- 1 - (sum(error ^ 2) / sum( (x - mean(x)) ^ 2) )
rsq_f <- formatC(R$rsq, digits = 4, format = "f")
# Pearson Correlation Coefficient
R$pcc <- stats::cor(x, y)
pcc_f <- formatC(R$pcc, digits = 4, format = "f")
if (summary) {
cat("-- Error Terms ----\n")
cat(" MAE: ", mae_f, "\n")
cat(" MSE: ", mse_f, "\n")
cat(" RMSE: ", rmse_f, "\n")
cat(" R-sq: ", rsq_f, "\n")
cat(" PCC: ", pcc_f, "\n")
cat("-------------------\n\n")
}
if (summary) { invisible(R)
}else {return(R) }
}
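# Worked example:
#   err <- .calculate_errors(x = c(1, 2, 3), y = c(1.1, 1.9, 3.2))
#   err$mse   # mean((y - x)^2) = 0.02
#   err$rmse  # sqrt(0.02), approximately 0.1414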
# Internal function to make all the appropriate type checks.
.do_checks <- function(w = NULL, basis = NULL){
if (is.null(basis)) { basis <- create_rbf_object(M = 3) }
if (is.null(w)) { w <- rep(0.5, basis$M + 1) }
if (length(w) != (basis$M + 1) ) {
stop("Coefficient vector does not match basis functions!")
}
return(list(w = w, basis = basis))
}
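# Illustrative usage: with no arguments, sensible defaults are created,
#   ck <- .do_checks()
#   ck$basis$M    # 3 RBFs
#   length(ck$w)  # 4, i.e. M + 1 coefficients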
# Internal function to make all the appropriate type checks for EM.
.em_checks <- function(X, H, K=3, pi_k=NULL, model, basis=NULL, lambda=.5,
beta_dispersion=5, w=NULL, opt_method="CG",
init_opt_itnmax=100, is_parallel=FALSE, no_cores=NULL){
if (is.null(w)) { # If we haven't initialized coefficient matrix
out <- infer_profiles_mle(X = X, model = model, basis = basis, H = H,
lambda = lambda, w = w, beta_dispersion = beta_dispersion,
opt_method = opt_method, init_opt_itnmax = init_opt_itnmax,
is_parallel = is_parallel, no_cores = no_cores)
basis <- out$basis # Extract basis object
W <- out$W # Extract the optimized coefficients
cl <- stats::kmeans(W, K, nstart = 25) # Use K-means
C_n <- cl$cluster # Get the mixture components
w <- t(cl$centers) # Mean for each cluster
# Mixing proportions
if (is.null(pi_k)) { pi_k <- as.vector(table(C_n) / length(X)) }
}else{
# w has M + 1 entries, so the basis needs M = NROW(w) - 1 functions;
# otherwise the length check below would always fail.
if (is.null(basis)) { basis <- create_rbf_object(M = NROW(w) - 1) }
if (NROW(w) != (basis$M + 1) ) {
stop("Coefficients w should match number of basis functions!")
}
if (is.null(pi_k)) { pi_k <- rep(1 / K, K) }
}
return(list(w = w, basis = basis, pi_k = pi_k))
}
# Compute stable Log-Sum-Exp
#
# \code{.log_sum_exp} applies the log-sum-exp trick to avoid numerical
# underflow and ensure numerical stability when computing with small numbers.
#
# @param x A vector of observations
#
# @return The log-sum-exp value
#
# @references
# \url{https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/}
#
.log_sum_exp <- function(x) {
# Computes log(sum(exp(x)))
offset <- max(x)
s <- log(sum(exp(x - offset))) + offset
i <- which(!is.finite(s))
if (length(i) > 0) { s[i] <- offset }
return(s)
}
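# Worked example: a naive log(sum(exp(x))) underflows for very negative x,
#   log(sum(exp(c(-1000, -1001))))  # -Inf due to underflow
#   .log_sum_exp(c(-1000, -1001))   # approx -999.6867
# since log(exp(-1000) + exp(-1001)) = -1000 + log(1 + exp(-1)).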
## A general-purpose adder:
.add_func <- function(x) Reduce(f = "+", x = x)
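# Example: element-wise sum of a list of matrices,
#   .add_func(list(matrix(1, 2, 2), matrix(2, 2, 2)))  # 2x2 matrix of 3s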
# Extract FPKM from string
#
# \code{.extract_fpkm} extracts the FPKM value from a string.
#
# @param x a string containing FPKM information
#
# @return The FPKM numeric value
#
.extract_fpkm <- function(x){
# TODO test when no FPKM is available
fpkm <- gsub(".* FPKM ([^;]+);.*", "\\1", x)
return(as.numeric(fpkm))
}
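# Illustrative usage (hypothetical annotation string matching the regex
# above, i.e. an unquoted value terminated by a semicolon):
#   .extract_fpkm("gene_id ENSG00000001; FPKM 7.25; cov 3.1;")  # 7.25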
# Extract gene name from string
#
# \code{.extract_gene_name} extracts the gene name from a string.
#
# @param x a string containing gene name information
#
# @return The gene name as a string
#
.extract_gene_name <- function(x){
# TODO test when no gene name is available
gene_name <- gsub(".* gene_name ([^;]+);.*", "\\1", x)
return(gene_name)
}
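# Illustrative usage (hypothetical annotation string):
#   .extract_gene_name("gene_id ENSG00000001; gene_name BRCA1; FPKM 7.25;")
#   # returns "BRCA1"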
# Discard selected chromosomes
#
# \code{.discard_chr} discards selected chromosomes from the data.
#
# @param x The HTS data stored in a data.table object
# @param chr_discarded A vector with chromosome names to be discarded.
#
# @return The reduced HTS data.
#
.discard_chr <- function(x, chr_discarded = NULL){
assertthat::assert_that(methods::is(x, "data.table"))
if (!is.null(chr_discarded)) {
message("Removing selected chromosomes ...")
for (i in seq_along(chr_discarded)) {
x <- x[x$chr != chr_discarded[i]]
}
}
return(x)
}
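# Illustrative usage:
#   dt <- data.table::data.table(chr = c("chr1", "chr1", "chrM"),
#                                pos = c(100, 200, 50))
#   .discard_chr(dt, chr_discarded = "chrM")  # keeps only the chr1 rows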
# Discard BS-Seq noisy reads
#
# \code{.discard_bs_noise_reads} discards CpG sites with very low or very
# high coverage from BS-Seq experiments. Such reads can be considered
# experimental noise.
#
# @param bs_data A GRanges object containing the BS-Seq data.
# @param min_bs_cov The minimum number of reads mapping to each CpG site.
# @param max_bs_cov The maximum number of reads mapping to each CpG site.
#
# @return The reduced GRanges object without noisy observations
#
.discard_bs_noise_reads <- function(bs_data, min_bs_cov = 2, max_bs_cov = 1000){
message("Discarding noisy reads ...")
bs_data <- subset(bs_data, bs_data$total >= min_bs_cov)
bs_data <- subset(bs_data, bs_data$total <= max_bs_cov)
return(bs_data)
}
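# Illustrative usage (assumes a GRanges object with a `total` metadata
# column holding read coverage per CpG site):
#   gr <- GenomicRanges::GRanges("chr1", IRanges::IRanges(1:3, width = 1),
#                                total = c(1, 10, 2000))
#   .discard_bs_noise_reads(gr)  # keeps only the site with coverage 10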