#'
#' Utility/convenience functions
#'
#'
#' Make pairs out of two vectors of rownames (convenience function
#' to be used with empirical_controls.R)
#'
#' This function creates a pair matrix using specifically-provided row names.
#'
#' @param rn_min Rownames for group 1
#' @param rn_max Rownames for group 2
#' @param mat Expression matrix (NOTE: must have rownames)
#' @param n Length of rn_min and rn_max
#'
#' @return A matrix of dimension n*n with pairs generated for
#' every comparision in rn_min to rn_max.
make_pairs <- function(rn_min, rn_max, mat, n){
tmp <- matrix(NA, n*n, ncol(mat))
rn_tmp <- vector("character", n*n)
for(j in 1:length(rn_max)){
for(k in 1:length(rn_min)){
tmp[(n*(j-1) + k),] <- mat[rn_min[k],] < mat[rn_max[j],]
rn_tmp[(n*(j-1) + k)] <- paste0(rn_min[k], "<", rn_max[j])
}
}
tmp <- ifelse(tmp, 1, 0)
rownames(tmp) <- rn_tmp
tmp
}
#'
#' Build individual pairs based on rownames
#'
#' A convenience function to quickly build a small matrix of pairs. Used
#' when we want to predict on a validation set.
#'
#' @param pair A single pairname "A<B", where "A" and "B" are valid rownames.
#' @param mat A matrix with rownames that contain all "A<B" pairs.
#'
#' @return A vector of 0's and 1's corresponding to the pair "A<B" we want
#'
single_pairs <- function(pair, mat){
names <- unlist(strsplit(pair, "<"))
if(!(names[1] %in% rownames(mat)) | !(names[2] %in% rownames(mat))){
rep(NA, ncol(mat))
} else {
ifelse(mat[names[1],] - mat[names[2],] < 0, 1, 0)
}
}
#'
#' Empirical controls calculation function
#'
#' Generate candidate top-scoring pairs (TSPs) by finding "empirical
#' control" genes within quantiles of the data.
#'
#' @param exprs An p x m input matrix of gene expression values with p genes and m samples. This matrix MUST have rownames.
#' @param n Number of pairs to create within each quantile bin (default 40)
#' @param q Number of quantiles desired (default 4)
#'
#' @details We intend to use TSPs as the features in our gene signature. Examining all features
#' would require generating $p$ choose 2 comparisons, most of which would be uninformative. We prioritize
#' the creation of features where one gene is expressed at a constant level across classes and the other
#' gene is expressed high and low across classes. To unearth pairs that behave like this, we rank all
#' genes by their average expression and break them into quantile groups. We then rank all genes within
#' each group by their variance and compute comparisons between the top $n$ most variable and least variable genes.
#' The result of this function provides a matrix of size $(n*q)$ x $m$, where each row is a binary vector indicating
#' the value of I($g_1 > g_2$) for two selected.
#'
#' @return A (n*q) x m matrix with TSPs in rows and samples in columns
empirical_controls = function(exprs, n=40, q=4){
if(is.null(rownames(exprs))){
stop("Input data matrix needs rownames.")
}
gm <- cbind(rowMeans(exprs), apply(exprs,1,sd))
qt <- quantile(gm[,1], na.rm=T, probs=seq(0,1,length.out=q+1)) # Get cut points
all_pairs <- all_min <- all_max <- c()
for(i in 1:(length(qt)-1)){
# Get the ith quantile subset and operate within it
sub <- gm[which(gm[,1] >= qt[i] & gm[,1] < qt[i+1]),]
rn <- rownames(sub)
sd_ord <- order(sub[,2], sub[,1])
rn_min <- rn[head(sd_ord, n)]
rn_max <- rn[tail(sd_ord, n)]
all_pairs <- rbind(all_pairs, make_pairs(rn_min, rn_max, exprs, n))
}
all_pairs
}
#'
#' Calculate f-statistics quickly
#'
#' Using matrix algebbra, quickly calculate
#' f statistics for comparing nested models
#'
#' @param dat An input data set
#' @param mod The model matrix for the alternative model to be fit
#' @param mod0 The model matrix for the null model to be fit. If NULL, the null model
#' is the model with only an intercept.
#'
#' @return fstats A vector of f-statistics
#'
fstats <- function(dat,mod,mod0=NULL){
n <- dim(dat)[2]
m <- dim(dat)[1]
if(is.null(mod0)){
mod0 <- cbind(rep(1,n))
}
df1 <- dim(mod)[2]
df0 <- dim(mod0)[2]
p <- rep(0,m)
Id <- diag(n)
resid <- dat %*% (Id - mod %*% solve(t(mod) %*% mod) %*% t(mod))
rss1 <- rowSums(resid*resid)
rm(resid)
#gc()
resid0 <- dat %*% (Id - mod0 %*% solve(t(mod0) %*% mod0) %*% t(mod0))
rss0 <- rowSums(resid0*resid0)
rm(resid0)
#gc()
fstats <- ((rss0 - rss1)/(df1-df0))/(rss1/(n-df1))
rm(rss0,rss1)
#gc()
return(fstats)
}
#'
#' Regression-based feature selection
#'
#' Leverages the fact that the F-statistic for y~x and
#' x~y is the same to quickly choose pairs that add conditional
#' value for predicting an outcome. Also runs a separate pair-on-pair
#' regression to eliminate collinear pairs.
#'
#' @param pairmat An m x n matrix of pairwise features with rows=features, columns=samples
#' (generated by empirical control feature selection)
#' @param outcome A vector of outcomes of length n
#' @param covar An optional n x p matrix of additional covariates to adjust for
#' @param npair The number of paris we wish to select
#'
#' @details This function takes the output from empirical_controls.R and reduces it
#' to a set of pairs of size "npairs" that are predictive of the outcome of interest.
#' The paris are chosen conditionally so that the next pair chosen adds predictive
#' ability additional to the pairs that have already been chosen. At each step,
#' we consider a regression of
#'
#' @return pairs A vector with the index for each chosen pair
#'
reg_fs <- function(pairmat, outcome, covar=NULL, npair=5){
# Tolerance for testing residuals
tol <- 1e-10
# Covariates are always accounted for
if(!is.null(covar)){
if(nrow(covar) != ncol(pairmat)){
stop("covar and pairmat must have agreeing dimensions.")
}
outmat <- cbind(outcome, covar)
} else {
outmat <- cbind(outcome)
}
if(ncol(pairmat) != length(outcome)){
stop("pairmat and outcome must have agreeing dimensions.")
}
tpm <- pairmat # temp copy
pairs <- vector("integer", npair)
# Initial step is for best overall pair
mod0 <- cbind(rep(1, length(outcome)))
mod <- cbind(mod0, outmat)
idx <- 1:nrow(tpm)
for(i in 1:npair){
# Get F-statistic for every remaining pair
fs <- fstats(tpm, mod, mod0)
cur <- which.max(fs)
mod0 <- cbind(mod0, tpm[cur,])
mod <- cbind(mod, tpm[cur,])
pairs[i] <- idx[cur]
# Get residuals for regressing all remaining pairs on all chosen features
res <- apply(tpm, 1, function(y){sum((lm.fit(mod0, y)$residuals)^2)})
# Need to dump this and all identical rows
#all_cur <- which(apply(tpm, 1, function(x){all(x == tpm[cur,])}))
all_cur <- which(res < tol)
tpm <- tpm[-all_cur,]
idx <- idx[-all_cur]
}
pairs
}
#'
#' Build and cross-validate a TSP-based model
#'
#' This function takes training/test data and pairs generated via empirical control feature
#' selection and builds a decision tree model. It also cross-validates to get an out-of-sample
#' accuracy estimate
#'
#' @param train p x n training data matrix
#' @param train_outcome Outcome data of length n
#' @param train_covar n x q additional covariates for training data (optional)
#' @param pairs r x n matrix of TSP generated via empirical controls
#' @param test p x m test data matrix, where p columns and column names match up with train
#' @param test_covar m x s additional covariates for training data (necessary if train_covar specified; column names must match)
#' @param npair Number of pairs desired in the model
#' @param predtype Type of predictions to make - "class" if initial outcome is factor, "vector" if initial outcome is non-factor
#'
#' @details This is a wrapper for a series of model-building steps. The main output of
#' this function is the TSP decision tree model. We incorporate a second
#' feature selection step (after empirical controls, done separately) that chooses
#' from the candidate pairs. Pairs are chosen based on how much additional predictive
#' value they provide on top of pairs already selected (and non-pair covariates, if specified).
#' We also cross-validate this entire procedure five times to get an estimate of
#' out-of-sample accuracy of our model.
#'
#' @return A list contaiing the following attributes:
#' \item{tree}{The final decision tree built on training data}
#' \item{p_train}{Model predictions on training data}
#' \item{p_test}{Model predictions on test data}
#' \item{final_names}{Pair names in final model}
#' \item{pair_names}{Pair name aliases for pretty tree printing}
#' \item{acc}{Out-of-sample accuracy calculated via cross-validation}
#'
tsp_model_builder <- function(train, train_outcome, train_covar, pairs, test, test_covar, npair, predtype){
ncv <- 5 # no. cross validation folds
idxs <- split(sample(1:ncol(train)), rep(1:ncv, each=ncol(train)/ncv))
acc <- vector("numeric", ncv)
# we're going to get an out-of-sample accuracy measure via CV
for(i in 1:ncv){
idx <- idxs[[i]]
ktrain <- pairs[,-idx]
ktest <- pairs[,idx]
ktrain_outcome <- train_outcome[-idx]
ktest_outcome <- train_outcome[idx]
if(!is.null(train_covar)){
ktrain_covar <- train_covar[-idx,]
ktest_covar <- train_covar[idx,]
} else {
ktrain_covar <- ktest_covar <- train_covar # All null
}
# Now that we have a subset, need to again check which pairs do not flip
rmp <- which(rowMeans(ktrain) == 1 | rowMeans(ktrain) == 0)
if(length(rmp) > 0){
ktrain <- ktrain[-rmp,]
ktest <- ktest[-rmp,]
}
# Do regression feature selection on ktrain
cp <- reg_fs(ktrain, ktrain_outcome, ktrain_covar, npair)
tmp_train_data <- as.data.frame(cbind(ktrain_covar, t(ktrain[cp,])))
tree <- rpart(ktrain_outcome~., data = tmp_train_data)
tree <- prune(tree, cp=tree$cptable[which.min(tree$cptable[,"xerror"]),"CP"])
tmp_test_data <- as.data.frame(cbind(ktest_covar,t(ktest[cp,])))
preds <- predict(tree, newdata=tmp_test_data, type=predtype)
if(predtype == "class"){
acc[i] <- sum(preds == ktest_outcome)/length(ktest_outcome)
} else {
acc[i] <- mean((ktest_outcome - preds)^2)
}
}
# Now we build the overall model on the whole data
cp_final <- reg_fs(pairs, train_outcome, train_covar, npair)
pairtmp <- as.data.frame(cbind(train_covar, t(pairs[cp_final,])))
final_names <- c(colnames(train_covar), rownames(pairs[cp_final,]))
pair_names <- c(colnames(train_covar), paste0("p", 1:npair))
tree <- rpart(train_outcome~., data=pairtmp)
tree <- prune(tree, cp=tree$cptable[which.min(tree$cptable[,"xerror"]),"CP"])
colnames(pairtmp) <- pair_names
display_tree <- rpart(train_outcome~., data=pairtmp)
display_tree <- prune(display_tree, cp=tree$cptable[which.min(tree$cptable[,"xerror"]),"CP"])
p_train <- predict(tree, type=predtype)
test_dm <- as.data.frame(cbind(test_covar, sapply(final_names, single_pairs, test)))
#colnames(test_dm) <- pair_names
p_test <- predict(tree, newdata=test_dm, type=predtype)
list("tree"=tree, "display_tree"=display_tree, "p_train"=as.numeric(p_train), "p_test"=as.numeric(p_test), "final_names"=final_names, "pair_names"=pair_names, "acc"
=acc)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.