# Utility functions:
#' @title collect gabbage
#
#' @description
#' Collects garbage until the memory is clean
#' @export
collect.garbage = function(){
#if (exists("collect.garbage") ) rm(collect.garbage)
## The following function collects garbage until the memory is clean.
## Usage: 1. immediately call this function after you call a function or
## 2. rm()
while (gc()[2,4] != gc()[2,4]){}
}
#
# returns t(x).y
cprod <- function(x,y=NULL){
if(is.complex(x) | is.complex(y)){
if(is.null(y)){
return(crossprod(Conj(x),x))
} else {
return(crossprod(Conj(x),y))
}
} else {
return(crossprod(x,y))
}
}
### parse and manipulating mixed-model formulas
## deparse(.) returning \bold{one} string
## @param x,collapse character vector and how to combine them
## @note Protects against the possibility that results from deparse() will be
## split after 'width.cutoff' (by default 60, maximally 500)
safeDeparse <- function(x, collapse=" ") paste(deparse(x, 500L), collapse=collapse)
#
# Random Effects formula only
reOnly <- function(f) {
reformulate(paste0("(", vapply(findbars(f), safeDeparse, ""), ")"))
}
#
## @param x a random effect (i.e., data frame with rows equal to levels, columns equal to terms
## @param n vector of new levels
levelfun <- function(x,nl.n,allow.new.levels=FALSE) {
## 1. find and deal with new levels
if (!all(nl.n %in% rownames(x))) {
if (!allow.new.levels) stop("new levels detected in newdata")
## create an all-zero data frame corresponding to the new set of levels ...
newx <- as.data.frame(matrix(0, nrow=length(nl.n), ncol=ncol(x),
dimnames=list(nl.n, names(x))))
## then paste in the matching RE values from the original fit/set of levels
newx[rownames(x),] <- x
x <- newx
}
## 2. find and deal with missing old levels
## ... these should have been dropped when making the Z matrices
## etc. in mkReTrms, so we'd better drop them here to match ...
if (!all(r.inn <- rownames(x) %in% nl.n)) {
x <- x[r.inn,,drop=FALSE]
}
return(x)
}
######################################################################################
################# Train-Test split longitudinal data: train data lags test data by lag
################# there must be at least 2 visit times for each observation
#' @title Train-Test longitudinal data split
#
#' @description
#' Split longitudinal data into training and test set, where the training set lags the
#' test set by an amount "lag". e.g predict an outcome at lag=x hospital visits in advanced.
#' There must be at least 2 repeated measurements for each unit
#' @export
LongiLagSplit <- function(dat, id, rhs.vars, resp.vars, order.vars = "time", lag=1){
tab <- data.frame(table(dat[, id]))
idx <- tab$Var1[tab$Freq >= lag+3]
dat <- unique(na.omit(dat[dat$id%in%idx, ]))
dat$id <- factor(dat$id, levels = unique(dat$id))
lag.split <- function(x){
x <- unique(x)
x <- x[order(x[, order.vars], decreasing = FALSE), , drop=FALSE]
ix1 <- 1:(nrow(x)-lag)
ix2 <- ix1 + lag
dd <- cbind.data.frame(x[ix1, c(id, rhs.vars, resp.vars)], response = x[ix2, resp.vars])
dd[,resp.vars] <- factor(dd[, resp.vars])
train <- head(dd, nrow(dd)-1)
test <- tail(dd, 1) ### test using patients last observation
return(list(train = train, test = test))
}
X <- dlply(dat, .variables = id, .fun = lag.split)
X[sapply(X, is.null)] <- NULL
train <- do.call(rbind.data.frame, lapply(X, function(x) x$train))
test <- do.call(rbind.data.frame, lapply(X, function(x) x$test))
list(train = train, test = test, resp.vars = "response", rhs.vars = c(rhs.vars, resp.vars))
}
##### do not split into training and testing
LongiLagSplit.boot <- function(dat, id, rhs.vars, resp.vars, order.vars = "time", lag=1){
tab <- data.frame(table(dat[, id]))
idx <- tab$Var1[tab$Freq >= lag+3]
dat <- unique(na.omit(dat[dat$id%in%idx, ]))
dat$id <- factor(dat$id, levels = unique(dat$id))
lag.split <- function(x){
x <- unique(x)
x <- x[order(x[, order.vars], decreasing = FALSE), , drop=FALSE]
ix1 <- 1:(nrow(x)-lag)
ix2 <- ix1 + lag
dd <- cbind.data.frame(x[ix1, c(id, rhs.vars, resp.vars)], response = x[ix2, resp.vars])
dd[,resp.vars] <- factor(dd[, resp.vars])
dd
}
X <- ddply(dat, .variables = id, .fun = lag.split)
list(dat = X, resp.vars = "response", rhs.vars = c(rhs.vars, resp.vars))
}
#### Always predict the next or last visit, i.e use all repeated measure seen so far and predict the next possible one
### i.e for visit =1 (next visit) data = (x0, y1), (x1, y2), .....
### visit = 2 data (x1, y2), (x2, y3), .... so y0, and y1 must be available before this will work
LongiLagSplit.visit <- function(dat, id, rhs.vars, resp.vars, order.vars = "time", visit=1){
tab <- data.frame(table(dat[, id]))
idx <- tab$Var1[tab$Freq >= (visit+3)]
dat <- unique(na.omit(dat[dat$id%in%idx, ]))
dat$id <- factor(dat$id, levels = unique(dat$id))
lag.split <- function(x){
x <- unique(x)
x <- x[order(x[, order.vars], decreasing = FALSE), , drop=FALSE]
ix1 <- 1:(nrow(x)-1)
ix2 <- ix1 + 1
dd <- cbind.data.frame(x[ix1, c(id, rhs.vars, resp.vars)], response = x[ix2, resp.vars])
train <- head(dd, nrow(dd)-1)
test <- tail(dd, 1) ### test using patients last observation
return(list(train = train, test = test))
}
X <- dlply(dat, .variables = id, .fun = lag.split)
X[sapply(X, is.null)] <- NULL
train <- do.call(rbind.data.frame, lapply(X, function(x) x$train))
test <- do.call(rbind.data.frame, lapply(X, function(x) x$test))
list(train = train, test = test, resp.vars = "response", rhs.vars = c(rhs.vars, resp.vars))
}
LongiLagSplit.visit.boot <- function(dat, id, rhs.vars, resp.vars, order.vars = "time", visit=1, parallel=TRUE){
tab <- data.frame(table(dat[, id]))
idx <- tab$Var1[tab$Freq >= (visit+3)]
dat <- unique(na.omit(dat[dat$id%in%idx, ]))
dat$id <- factor(dat$id, levels = unique(dat$id))
lag.split <- function(x){
x <- unique(x)
x <- x[order(x[, order.vars], decreasing = FALSE), , drop=FALSE]
ix1 <- 1:(nrow(x)-1)
ix2 <- ix1 + 1
cbind.data.frame(x[ix1, c(id, rhs.vars, resp.vars)], response = x[ix2, resp.vars])
}
X <- ddply(dat, .variables = id, .fun = lag.split, .parallel = parallel)
list(dat = X, resp.vars = "response", rhs.vars = c(rhs.vars, resp.vars))
}
LongiLagSplit.current <- function(dat, id, rhs.vars, resp.vars, order.vars = "time"){
tab <- data.frame(table(dat[, id]))
idx <- tab$Var1[tab$Freq >= 3]
dat <- unique(na.omit(dat[dat$id%in%idx, ]))
dat$id <- factor(dat$id, levels = unique(dat$id))
dat$past.HbA1c <- 0
lag.split <- function(x){
x <- x[order(x[, order.vars], decreasing = FALSE), , drop=FALSE]
ix1 <- 1:(nrow(x)-1)
ix2 <- 2:nrow(x)
x1 <- x[1, ,drop=FALSE]
r1 <- x[ix1, resp.vars]
r2 <- x[ix2, resp.vars]
dd <- cbind.data.frame(visit = 2:nrow(x), x[ix2, c(id, rhs.vars)], past.HbA1c=r1, response=r2)
x1 <- cbind.data.frame(visit = 1, x1[, c(id, rhs.vars)], past.HbA1c=x1$past.HbA1c,
response= as.numeric(unlist(x1[, resp.vars])))
rbind.data.frame(x1, dd)
}
X <- dlply(dat, .variables = id, .fun = lag.split)
X[sapply(X, is.null)] <- NULL
X <- do.call(rbind.data.frame, X)
rhs.vars <- c("past.HbA1c", rhs.vars[rhs.vars!=resp.vars])
list(dd = X, resp.vars = "response", rhs.vars = rhs.vars)
}
#' @title Performance measures
#
#' @description
#' Compute accuracy, AUC, sensitivity, specificity, positive predictive values, etc.
#' @param pred predicted probabilities
#' @param obs true class
#' @param threshold class probability threshold (default to NULL)
#' @param prevalence class prevalence in the population (default to NULL)
#
#' @export
Performance.measures <- function(pred, obs, threshold=NULL, prevalence=NULL){
if(length(unique(obs)) == 2) {
obs <- as.numeric(factor(obs))-1
## get best cut-off
if(is.null(threshold))
threshold <- opt.thresh(pred, obs)
### get these performance measures
nme = c("PCC", "PCC.sd", "AUC", "AUC.sd", "sensitivity", "sensitivity.sd",
"specificity", "specificity.sd")
xx = cbind.data.frame(plotID = 1:length(pred), Observed = obs, Predicted = pred)
accuracy <- presence.absence.accuracy(xx, threshold = threshold, st.dev = TRUE)[, nme]
accuracy$G.mean <- sqrt(as.numeric(accuracy$sensitivity)*as.numeric(accuracy$specificity))
accuracy$BER <- 1 - 0.5*(as.numeric(accuracy$sensitivity) + as.numeric(accuracy$specificity))
pred.prev <- predicted.prevalence(DATA=xx,threshold=threshold)[, c("Obs.Prevalence", "Predicted")]
nme <- c("Pos Pred Value", "Neg Pred Value", "Balanced Accuracy")
if(is.null(prevalence)) prevalence <- as.numeric(pred.prev$Obs.Prevalence)
obs <- factor(ifelse(obs==1, "Yes", "No"), levels=c("Yes", "No"))
pred <- factor(ifelse(pred >= threshold, "Yes", "No"), levels = c("Yes","No"))
cmx <- confusionMatrix(data=pred, reference=obs,prevalence=prevalence)$byClass[nme]
#fmeas <- F.measure.single(pred= as.character(pred),labels= as.character(obs))
#accuracy$F.measure <- fmeas["F"]
#accuracy$Accuracy <- fmeas["A"]
#colnames(cmx) <- c("PPV", "NPV", "BACC")
accuracy$PPV <- cmx[1]; accuracy$NPV = cmx[2]; accuracy$BACC <- cmx[3]
accuracy$F.measure = 2 *(accuracy$PPV * accuracy$sensitivity) / (accuracy$PPV + accuracy$sensitivity);
accuracy$threshold = threshold
} else
accuracy = data.frame(t(regr.eval(obs, pred, stats= c("mae","mse","rmse"))))
return(accuracy)
}
sigmoid <- function(x) 1.0 / (1 + exp(-x))
# get left hand side of formula
lhs.form <- function(formula) {
tt <- terms(formula)
vars <- as.character(attr(tt, "variables"))[-1] ## [1] is the list call
response <- attr(tt, "response") # index of response var
vars[response]
}
# get right hand side of formula
rhs.form <- function(formula) {
tt <- terms(formula)
vars <- as.character(attr(tt, "variables"))[-1] ## [1] is the list call
response <- attr(tt, "response") # index of response var
vars[-response]
}
#' @title Optimal threshold
#
#' @description
#' Compute the the optimal classification threshold
#'
#' @param prob Predicted probabilities by a classifier
#' @param obs ground truth (correct) 0-1 labels vector
#' @param opt.method optima classifcation threshold method see package \code{PresenceAbsence}. Default
#' is the minRoc distance: i.e the threshold value at the minimum distance between the ROC curve and the
#' to left hand corner (0,1)
#' @return threhold
#' @export
opt.thresh <- function(pred, obs){
thresh = 0.5
if(length(unique(obs)) > 1){
obs <- as.numeric(as.factor(obs))-1
SIMDATA = cbind.data.frame(plotID = 1:length(obs), Observed = obs, Predicted = pred)
thresh <- optimal.thresholds(SIMDATA, threshold = 101, which.model = 1, opt.methods = 9)
thresh <- ifelse(length(thresh["Predicted"]) >= 1,as.numeric(thresh["Predicted"]), 0.5)
}
return(thresh)
}
myapplyLearner <- function (learner, X)
{
leftIx <- 1:nrow(X)
predY <- rep("", nrow(X))
predCond <- rep("", nrow(X))
for (i in 1:nrow(learner)) {
ixMatch <- eval(parse(text = paste("which(", learner[i, "condition"], ")")))
ixMatch <- intersect(leftIx, ixMatch)
if (length(ixMatch) > 0) {
predY[ixMatch] <- learner[i, "pred"]
predCond[ixMatch] <- learner[i, "condition"]
leftIx <- setdiff(leftIx, ixMatch)
}
if (length(leftIx) == 0) {
break
}
}
return(list(predY=predY,predCond=predCond))
}
SplitVector <- function (v, K = 3)
{
splitV <- quantile(v, probs = seq(0, 1, 1/K), na.rm = FALSE,
names = TRUE, type = 3)
splitV <- splitV[-c(1, length(splitV))]
numSplit <- length(splitV)
if (numSplit == 0)
return(v)
newV <- vector("character", length(v))
newV[which(v <= splitV[1])] = paste("L1", sep = "")
if (numSplit >= 2) {
for (jj in 2:numSplit) {
newV[which(v > splitV[jj - 1] & v <= splitV[jj])] = paste("L",
jj, sep = "")
}
}
newV[which(v > splitV[numSplit])] = paste("L", (numSplit +
1), sep = "")
return(list(newV=newV, splitV = splitV))
}
predictSplitVector <- function (v, splitV)
{
numSplit <- length(splitV)
if (numSplit == 0)
return(v)
newV <- vector("character", length(v))
newV[which(v <= splitV[1])] = paste("L1", sep = "")
if (numSplit >= 2) {
for (jj in 2:numSplit) {
newV[which(v > splitV[jj - 1] & v <= splitV[jj])] = paste("L",
jj, sep = "")
}
}
newV[which(v > splitV[numSplit])] = paste("L", (numSplit +
1), sep = "")
return(list(newV=newV))
}
mapLabels <- function (pred, splitV)
{
newV <- rep(0, length(pred))
labs <- pred
xx <- unique(labs)
ix <- 1:length(xx)
is <- which(xx == "L1")
newV[which(labs == "L1")] <- splitV[1]
ll <- xx[xx != "L1"]
names(splitV) <- ll
if (length(splitV) >= 2) {
for (jj in 1:(length(ll)-1)) {
newV[which(labs == ll[jj])] <- 0.5*(splitV[jj] + splitV[jj+1])
}
}
newV[which(labs == ll[length(ll)])] <- splitV[length(ll)] + 0.5
newV
}
sparseFactor <- function (dummy, thresh = 0.00016)
{
nme <- FALSE
lunique = length(unique(dummy))
if (lunique == 1) {
nme <- TRUE
}
if (lunique == 2) {
levs <- sort(unique(dummy))
m1 = mean(dummy == levs[1], na.rm = TRUE)
m0 = mean(dummy == levs[2], na.rm = TRUE)
if (m1 <= thresh | m0 <= thresh)
nme <- TRUE
}
return(nme)
}
sortImp <- function (object, top)
{
best <- "max"
featureRank <- switch(best, max = rank(-apply(object, 1, max, na.rm = TRUE)),
min = rank(apply(object, 1, min, na.rm = TRUE)),
maxabs = rank(-apply(abs(object), 1, max, na.rm = TRUE)))
tiedRanks <- as.numeric(names(table(featureRank)[table(featureRank) > 1]))
if (length(tiedRanks) > 0) {
for (i in seq(along = tiedRanks)) {
tmp <- featureRank[featureRank == tiedRanks[i]]
featureRank[featureRank == tiedRanks[i]] <- tmp +
runif(length(tmp), min = 0.001, max = 0.999)
}
}
featureOrder <- order(featureRank)
out <- object[featureOrder, , drop = FALSE]
out <- out[1:top, , drop = FALSE]
out
}
varImpPlot.RF <- function (x, sort = TRUE, n.var = min(30, nrow(x$importance)),
type = NULL, class = NULL, scale = TRUE, main = "", xlab = "", ...)
{
if (!inherits(x, "randomForest"))
stop("This function only works for objects of class `randomForest'")
imp <- importance(x, class = class, scale = scale, type = type,
...)
if (ncol(imp) > 2)
imp <- imp[, -(1:(ncol(imp) - 2))]
nmeas <- ncol(imp)
if (nmeas > 1) {
op <- par(mfrow = c(1, 2), mar = c(4, 5, 4, 1), mgp = c(2,
0.8, 0), oma = c(0, 0, 2, 0), no.readonly = TRUE)
on.exit(par(op))
}
for (i in 1:nmeas) {
ord <- if (sort)
rev(order(imp[, i], decreasing = TRUE)[1:n.var])
else 1:n.var
xmin <- if (colnames(imp)[i] %in% c("IncNodePurity", "MeanDecreaseGini"))
0
else min(imp[ord, i])
dotchart(imp[ord, i], xlab = xlab, ylab = "",
main = if (nmeas == 1)
main
else NULL, xlim = c(xmin, max(imp[, i])), color = "blue", ...)
}
if (nmeas > 1)
mtext(outer = TRUE, side = 3, text = main, cex = 1.2)
invisible(imp)
}
#
Multilevel.boot <- function(dat, groups, nboots = 10) {
cid <- unique(dat[, groups])
ncid <- length(cid)
recid <- sample(cid, size = ncid * nboots, replace = TRUE)
tst <- data.frame()
trn <- data.frame()
for(ii in 1:length(recid)){
idx <- which(dat[, groups] == recid[ii])
trn.ix = sample(idx, size = length(idx), replace = TRUE)
tst.ix = setdiff(idx, trn.ix)
if(length(tst.ix) <= 1) next
trn = rbind(trn, cbind(id = ii, trn.ix))
tst = rbind(tst, cbind(id = ii, tst.ix))
}
# rid <- lapply(seq_along(recid), function(i) {
# idx <- which(dat[, groups] == recid[i])
## trn.ix = sample(idx, size = length(idx), replace = TRUE)
# tst.ix = setdiff(idx, trn.ix)
# list(trn.ix = cbind(id = i, trn.ix),
## tst.ix = cbind(id = i, tst.ix))})
#
# trn <- as.data.frame(do.call(rbind, lapply(rid, function(x) x$trn.ix)))
# tst <- as.data.frame(do.call(rbind, lapply(rid, function(x) x$tst.ix)))
trn$Replicate <- cut(trn$id, breaks = c(1, ncid * 1:nboots), include.lowest = TRUE,
labels = FALSE)
tst$Replicate <- cut(tst$id, breaks = c(1, ncid * 1:nboots), include.lowest = TRUE,
labels = FALSE)
return(list(trn = trn, tst=tst))
}
#
multilevelboot <- function(dat, id = "id", inbag, outbag) {
### for each patient, select time indices that appeared in the inbag sample, sort these and keep aside the last
### index
res <- dlply(dat, .variables = id, .fun = function(xx){
p.ix <- xx[, "time.id"]
ix <- sort(inbag[inbag%in%p.ix])
last.ix <- ix[which(ix == tail(ix, 1))] ### select the last index and remove from the inbag, this then goes in to test set
inbag <- inbag[!inbag%in%last.ix]
## get all outbags that are greater than last.ix - 1
ix1 <- sort(inbag[inbag%in%p.ix])
last.ix1 <- unique(ix1[which(ix1 == tail(ix1, 1))])
out.ix <- outbag[which(outbag >= last.ix1)]
test.ix <- unique(last.ix, out.ix)
train.ix <- inbag
trn <- subset(xx, time.id%in%train.ix)
tst <- subset(xx, time.id%in%test.ix)
list(trn = trn, tst=tst)
})
trn <- do.call(rbind, lapply(res, function(x) x$trn))
tst <- do.call(rbind, lapply(res, function(x) x$tst))
rownames(trn) <- NULL
rownames(tst) <- NULL
list(trn = trn, tst=tst)
}
multilevelboot.list <- function(dat.list, inbag, outbag) {
### for each patient, select time indices that appeared in the inbag sample, sort these and keep aside the last
### index
res <- llply(dat.list, .fun = function(xx){
p.ix <- xx[, "time.id"]
ix <- sort(inbag[inbag%in%p.ix])
last.ix <- ix[which(ix == tail(ix, 1))] ### select the last index and remove from the inbag, this then goes in to test set
inbag <- inbag[!inbag%in%last.ix]
## get all outbags that are greater than last.ix - 1
ix1 <- sort(inbag[inbag%in%p.ix])
last.ix1 <- unique(ix1[which(ix1 == tail(ix1, 1))])
out.ix <- outbag[which(outbag >= last.ix1)]
test.ix <- unique(last.ix, out.ix)
train.ix <- inbag
trn <- subset(xx, time.id%in%train.ix)
tst <- subset(xx, time.id%in%test.ix)
list(trn = trn, tst=tst)
})
trn <- do.call(rbind, lapply(res, function(x) x$trn))
tst <- do.call(rbind, lapply(res, function(x) x$tst))
rownames(trn) <- NULL
rownames(tst) <- NULL
list(trn = trn, tst=tst)
}
createBootSamples <- function (y, times = 10, replace = TRUE, prob = NULL) {
trainIndex <- matrix(0, ncol = times, nrow = length(y))
out <- apply(trainIndex, 2, function(data) {
index <- seq(along = data)
out <- sort(sample(index, size = length(index), replace = replace, prob = prob))
# out <- sample(index, size = length(index), replace = replace, prob = prob)
out
})
out
}
multiClassSummary <- function (data, class.names=NULL, eps = 1e-15){
#Check data
# if (!all(levels(data[, "pred"]) == levels(data[, "obs"])))
# stop("levels of observed and predicted data do not match")
#Calculate custom one-vs-all stats for each class
prob_stats <- lapply(class.names, function(class){
#Grab one-vs-all data for the class
pred <- ifelse(data[, "pred"] == class, 1, 0)
obs <- ifelse(data[, "obs"] == class, 1, 0)
prob <- data[,class]
#Calculate one-vs-all AUC and logLoss and return
# cap_prob <- pmin(pmax(prob, .000001), .999999)
cap_prob <- pmin(pmax(prob, eps), 1-eps)
# prob_stats <- auc(obs, prob)
# names(prob_stats) <- 'ROC'
prob_stats <- c(auc(obs, prob), logLoss(obs, cap_prob))
names(prob_stats) <- c('ROC', 'logLoss')
return(prob_stats)
})
prob_stats <- do.call(rbind, prob_stats)
rownames(prob_stats) <- paste('Class:', class.names)
#Calculate confusion matrix-based statistics
CM <- confusionMatrix(data[, "pred"], data[, "obs"])
#Aggregate and average class-wise stats
#Todo: add weights
class_stats <- cbind(CM$byClass, prob_stats)
class_stats <- colMeans(class_stats, na.rm=TRUE)
#Aggregate overall stats
overall_stats <- c(CM$overall)
#Combine overall with class-wise stats and remove some stats we don't want
stats <- c(overall_stats, class_stats)
stats <- stats[! names(stats) %in% c('AccuracyNull',
'Prevalence', 'Detection Prevalence')]
#Clean names and return
names(stats) <- gsub('[[:blank:]]+', '_', names(stats))
return(stats)
}
#' @title normalize data
#
#' @description
#' Normalize matrix or data frame to the min-max range of the variables.
#
#' @param x matrix or data frame
#' @return nomalized matrix/data frame with min and max of each variable as attributes
#' @export
# normalize data to
normalize <- function(x) {
x <- as.matrix(x)
minAttr=apply(x, 2, min, na.rm=TRUE)
maxAttr=apply(x, 2, max, na.rm=TRUE)
x <- sweep(x, 2, minAttr, FUN="-")
x=sweep(x, 2, maxAttr-minAttr, "/")
attr(x, 'min') = minAttr
attr(x, 'max') = maxAttr
return (x)
}
normalizeMinusOneToOne <- function(x, a = -1, b = 1) {
A= min(x, na.rm=TRUE)
B <-max(x, na.rm = TRUE)
(((x - A)*(b-a))/(B-A) + a)
}
#' @title denomalize data
#
#' @description
#' tranform normalized data back to original scale
#
#' @param normalized the normalized data frame/matrix
#' @param min,max the min and max of each variable in the data. This can be otained by retrieving the attribute values of
#' the normalized data, i.e. output of \code{normalize}
#' @export
# denormalize back to original scale
# denormalize back to original scale
denormalize <- function (normalized, min, max) {
if(dim(normalized)[2]!= length(min)) stop("length of min or max must equal number of columns of data ")
nme <- colnames(normalized)
if( !all(nme%in%names(min)) ) stop("colnames of data do not match names of min or max")
sapply(nme, function(x) normalized[, x] * (max[x]-min[x]) + min[x] )
}
############################################################
######## 2. F-score ########################################
############################################################
# Function to compute the F-measure for a single class
# Input
# pred : vector of the predicted labels
# labels : vector of the true labels
# Note that 0 stands for negative and 1 for positive.
# In general the first level is negative and the second positive
# Output:
# a named vector with six elements:
# P: precision
# R : recall (sensitivity)
# S : specificity
# F : F measure
# A : 0/1 loss accuracy
# npos : number of positives
F.measure.single <- function(pred,labels) {
if (length(pred)!=length(labels))
stop("F.measure: lengths of true and predicted labels do not match.");
neg.ex <- which(labels <= 0);
np <- length(neg.ex);
pos.ex <- which(labels > 0);
npos <- length(pos.ex);
TP <- sum(pred[pos.ex] > 0);
FN <- sum(pred[pos.ex] <= 0);
TN <- sum(pred[neg.ex] <= 0);
FP <- sum(pred[neg.ex] > 0);
acc <- (TP+TN)/length(labels);
print(TP)
print(TN)
if ((TP+FP) == 0)
precision <- 0
else
precision <- TP/(TP+FP);
if ((TP+FN) == 0)
recall <- 0
else
recall <- TP/(TP+FN);
if ((TN+FP) == 0)
specificity <- 0
else
specificity <- TN/(TN+FP);
if ((precision+recall) == 0)
F <- 0
else
F = 2 *(precision*recall) / (precision+recall);
res <- c(precision,recall,specificity,F,acc, npos);
names(res) <- c("P", "R", "S", "F", "A", "Pos.");
return (res);
}
################################################################################
################################################################################
#################### predict method for mixed effect model based tree
################ create a new mixed effect model frame
## Make new random effect terms from specified object and new data,
## possibly omitting some random effect terms
## @param object fitted model object
## @param newdata (optional) data frame containing new data
## @param re.form,allow.new.levels formula specifying random effect terms to include (NULL=all, ~0)
#
## @note Hidden; _only_ used (twice) in this file
mkNewReTrms <- function(object, newdata, re.form = NULL, allow.new.levels=FALSE)
{
## construct (fixed) model frame in order to find out whether there are
## missing data/what to do about them
## need rfd to inherit appropriate na.action; need grouping
## variables as well as any covariates that are included
## in RE terms
if (is.null(newdata))
rfd <- model.frame(object)
else
rfd <- newdata
## note: mkReTrms automatically *drops* unused levels
ReTrms <- mkReTrms(findbars(re.form[[2]]), rfd)
## update Lambdat (ugh, better way to do this?)
ReTrms <- within(ReTrms,Lambdat@x <- unname(getME(object,"theta")[Lind]))
if (!allow.new.levels && any(vapply(ReTrms$flist, anyNA, NA)))
stop("NAs are not allowed in prediction data",
" for grouping variables unless allow.new.levels is TRUE")
ns.re <- names(re <- ranef(object))
nRnms <- names(Rcnms <- ReTrms$cnms)
if (!all(nRnms %in% ns.re))
stop("grouping factors specified in re.form that were not present in original model")
new_levels <- lapply(ReTrms$flist, function(x) levels(factor(x)))
## fill in/delete levels as appropriate
re_x <- Map(function(r,n) levelfun(r,n,allow.new.levels=allow.new.levels),
re[names(new_levels)], new_levels)
## pick out random effects values that correspond to
## random effects incorporated in re.form ...
## NB: Need integer indexing, as nRnms can be duplicated: (age|Subj) + (sex|Subj) :
re_new <- lapply(seq_along(nRnms), function(i) {
rname <- nRnms[i]
if (!all(Rcnms[[i]] %in% names(re[[rname]])))
stop("random effects specified in re.form that were not present in original model")
re_x[[rname]][,Rcnms[[i]]]
})
re_new <- unlist(lapply(re_new, t)) ## must TRANSPOSE RE matrices before unlisting
Zt <- ReTrms$Zt
list(Zt=Zt, b=re_new, Lambdat = ReTrms$Lambdat)
}
mkNewReTrmsV2 <- function(object, newdata, re.form=NULL, na.action=na.pass,
allow.new.levels=FALSE)
{
## construct (fixed) model frame in order to find out whether there are
## missing data/what to do about them
## need rfd to inherit appropriate na.action; need grouping
## variables as well as any covariates that are included
## in RE terms
if (is.null(newdata)) {
rfd <- mfnew <- model.frame(object)
} else {
mfnew <- model.frame(delete.response(terms(object,fixed.only=TRUE)),
newdata, na.action=na.action)
## make sure we pass na.action with new data
## it would be nice to do something more principled like
## rfd <- model.frame(~.,newdata,na.action=na.action)
## but this adds complexities (stored terms, formula, etc.)
## that mess things up later on ...
rfd <- na.action(newdata)
if (is.null(attr(rfd,"na.action")))
attr(rfd,"na.action") <- na.action
}
if (inherits(re.form, "formula")) {
## DROP values with NAs in fixed effects
if (length(fit.na.action <- attr(mfnew,"na.action")) > 0) {
newdata <- newdata[-fit.na.action,]
}
## note: mkReTrms automatically *drops* unused levels
ReTrms <- mkReTrms(findbars(re.form[[2]]), rfd)
## update Lambdat (ugh, better way to do this?)
ReTrms <- within(ReTrms,Lambdat@x <- unname(getME(object,"theta")[Lind]))
if (!allow.new.levels && any(vapply(ReTrms$flist, anyNA, NA)))
stop("NAs are not allowed in prediction data",
" for grouping variables unless allow.new.levels is TRUE")
ns.re <- names(re <- ranef(object))
nRnms <- names(Rcnms <- ReTrms$cnms)
if (!all(nRnms %in% ns.re))
stop("grouping factors specified in re.form that were not present in original model")
new_levels <- lapply(ReTrms$flist, function(x) levels(factor(x)))
## fill in/delete levels as appropriate
re_x <- Map(function(r,n) levelfun(r,n,allow.new.levels=allow.new.levels),
re[names(new_levels)], new_levels)
## pick out random effects values that correspond to
## random effects incorporated in re.form ...
## NB: Need integer indexing, as nRnms can be duplicated: (age|Subj) + (sex|Subj) :
re_new <- lapply(seq_along(nRnms), function(i) {
rname <- nRnms[i]
if (!all(Rcnms[[i]] %in% names(re[[rname]])))
stop("random effects specified in re.form that were not present in original model")
re_x[[rname]][,Rcnms[[i]]]
})
re_new <- unlist(lapply(re_new, t)) ## must TRANSPOSE RE matrices before unlisting
## FIXME? use vapply(re_new, t, FUN_VALUE=????)
}
Zt <- ReTrms$Zt
attr(Zt, "na.action") <- attr(re_new, "na.action") <- attr(mfnew, "na.action")
list(Zt=Zt, b=re_new, Lambdat = ReTrms$Lambdat)
}
myextractRules <- function (treeList, X, ntree = 100, maxdepth = 6, random = FALSE,
digits = NULL, verbose = FALSE)
{
if (is.numeric(digits))
digits <- as.integer(abs(digits))
levelX = list()
for (iX in 1:ncol(X)) levelX <- c(levelX, list(levels(X[,
iX])))
ntree = min(treeList$ntree, ntree)
allRulesList = list()
for (iTree in 1:ntree) {
if (random == TRUE) {
max_length = sample(1:maxdepth, 1, replace = FALSE)
}
else {
max_length = maxdepth
}
rule = list()
count = 0
rowIx = 1
tree <- treeList$list[[iTree]]
if (nrow(tree) <= 1)
next
ruleSet = vector("list", length(which(tree[, "status"] ==
-1)))
res = treeVisit(tree, rowIx = rowIx, count, ruleSet,
rule, levelX, length = 0, max_length = max_length,
digits = digits)
allRulesList = c(allRulesList, res$ruleSet)
}
allRulesList <- allRulesList[!unlist(lapply(allRulesList,
is.null))]
if(verbose) cat(paste(length(allRulesList), " rules (length<=", max_length,
") were extracted from the first ", ntree, " trees.",
"\n", sep = ""))
rulesExec <- ruleList2Exec(X, allRulesList)
return(rulesExec)
}
##################################################################
############# Tuning by CV concordance for GBM ###################
##################################################################
cv.gbm <- function(X, y, outcome=NULL, n.trees=seq(150,500,by=10), interaction.depth, n.minobsinnode = 5,
shrinkage=10^(seq(-3,-1,length=10)), bag.fraction = .5, distribution="bernoulli", foldid=NULL, nfolds=10, seed=220, verbose=FALSE){
n <- nrow(X)
if(is.null(foldid))
foldid <- get.foldid(n,10, seed=seed)
nfolds <- max(foldid)
ns <- length(shrinkage)
nnt <- length(n.trees)
concord.mat <- foreach(s=1:ns, .combine=rbind)%dopar%{
concord.s <- rep(0,nnt)
yhat <- matrix(0,n,nnt)
for(k in 1:nfolds){
X.k <- X[foldid==k,,drop=F]
y.k <- y[foldid==k]
X.mk <- X[foldid!=k,,drop=F]
y.mk <- y[foldid!=k]
if(distribution=="poisson"){
outcome.mk <- outcome[foldid!=k]
ans.sk <- gbm.fit(X.mk, outcome.mk, offset=y.mk, n.trees=n.trees[nnt],
interaction.depth=interaction.depth, n.minobsinnode=n.minobsinnode, shrinkage=shrinkage[s],
bag.fraction=bag.fraction, distribution=distribution, verbose=verbose)
}
else{
ans.sk <- gbm.fit(X.mk, y.mk, n.trees=n.trees[nnt], interaction.depth=interaction.depth,
n.minobsinnode=n.minobsinnode, shrinkage=shrinkage[s], bag.fraction=bag.fraction, distribution=distribution, verbose=verbose)
}
if(distribution=="multinomial"){
for(t in 1:nnt){
foo <- predict(ans.sk, X.k, n.trees=n.trees[t],type="response", verbose=verbose)
fuh <- foo[cbind(1:length(y.k),as.numeric(y.k),1)]
fuh[is.na(fuh)] <- 0
yhat[foldid==k,t] <- fuh
}
}
else{
for(t in 1:nnt)
yhat[foldid==k,t] <- predict(ans.sk, X.k, n.trees=n.trees[t])
}
}
for(t in 1:nnt){
if(distribution=="coxph")
concord.s[t] <- survConcordance(y~yhat[,t])$concord
else if(distribution=="poisson")
concord.s[t] <- survConcordance(Surv(y,outcome)~yhat[,t])$concord
else if(distribution=="multinomial")
concord.s[t] <- sum(log(yhat[,t]))
# concord.s[t] <- mean(y==yhat[,t])
else if(distribution=="gaussian")
concord.s[t] <- 1-sum((y-yhat[,t])^2)/sum((y-mean(y))^2)
else
concord.s[t] <- get.ROC(y, yhat[,t])$auc
}
concord.s
}
inds <- which(concord.mat==max(concord.mat), arr.ind=TRUE)
opt.shrink <- shrinkage[inds[1]]
opt.n.trees <- n.trees[inds[2]]
opt.concord <- concord.mat[inds[1], inds[2]]
return(list(shrinkage=opt.shrink, n.trees=opt.n.trees, concord=opt.concord))
}
get.foldid <- function(n, nfolds=10, seed=220){
replace.seed <- T
if(missing(seed))
replace.seed <- F
if(replace.seed){
## set seed to specified value
if(!any(ls(name='.GlobalEnv', all.names=T)=='.Random.seed')){
set.seed(1)
}
save.seed <- .Random.seed
set.seed(seed)
}
perm <- sample(1:n, n)
n.cv <- rep(floor(n/nfolds),nfolds)
rem <- n - n.cv[1]*nfolds
n.cv[index(1,rem)] <- n.cv[index(1,rem)]+1
foldid <- rep(0,n)
ind2 <- 0
for(i in 1:nfolds){
ind1 <- ind2+1
ind2 <- ind2+n.cv[i]
foldid[perm[ind1:ind2]] <- i
}
if(replace.seed){
## restore random seed to previous value
.Random.seed <<- save.seed
}
return(foldid)
}
index <- function(m,n){
if(m<=n) return(m:n)
else return(numeric(0))
}
which.equal <- function(x, y){
n <- length(x)
ans <- rep(0,n)
for(i in 1:n){
ans[i] <- any(approx.equal(y,x[i]))
}
return(as.logical(ans))
}
approx.equal <- function(x, y, tol=1E-9){
return(abs(x - y) < tol)
}
Train.Boot <- function(classifier, nboots = 2, XY.dat, resp.vars, rhs.vars, part.vars, reg.vars,
para, parallel=TRUE, n.cores = 2, seed=12345678){
set.seed(seed)
form.rules <- as.formula(paste0(paste0(resp.vars, " ~"), paste0(rhs.vars, collapse = "+")))
form.glm <- as.formula(paste0(paste0(resp.vars, " ~"), paste0(c(paste0(rhs.vars,collapse="+"), "+", "(",
paste0(c(rand.vars),collapse = "+"), "|", groups, ")"), collapse="")))
if(parallel) {
cl <- makeCluster(n.cores)
pfun <- get("parLapply")
} else {
pfun = get("lapply")
}
names(XY.dat)[names(XY.dat)==para$group] <- "old.id"
tmp <- Multilevel.boot(dat=XY.dat, groups="old.id", nboots = nboots)
trn <- cbind.data.frame(tmp$trn, XY.dat[tmp$trn$trn.ix, ])
tst <- cbind.data.frame(tmp$tst, XY.dat[tmp$tst$tst.ix, ])
#if(parallel) clusterExport(cl, varlist=c("trn", "tst"))
#ix.boot <- unique(trn$Replicate)
res <- pfun(X= unique(trn$Replicate), function(kk, ...){
dat.trn <- trn[trn$Replicate==kk, ]
dat.tst <- tst[tst$Replicate==kk, ]
Y.trn <- dat.trn[, resp.vars]
X.trn <- dat.trn[, unique(c(reg.vars, part.vars, para$group)), drop = FALSE]
Y.tst <- dat.tst[, resp.vars]
X.tst <- dat.tst[, unique(c(reg.vars, part.vars, para$group)), drop = FALSE]
Train.Models <- lapply(TrainModels(), function(x) x)
mod <- tryCatch(
{
lapply(classifier, function(x) {
if(any(x%in%c("MEglmTree", "MECTree")))
Train.Models[[x]](X.trn, Y.trn, X.tst, Y.tst, para, rand.vars, part.vars, reg.vars, rhs.vars, ...)
else
if(x=="GBMrules") form <- form.rules else form <- form.glm
Train.Models[[x]](trn=dat.trn, tst=dat.tst, form, para, rand.vars, ...)
})
}, error=function(e){
cat("Error in the Expression: ", paste(e$call, collapse= ", "),
": original error message = ", e$message, "\n")
list()
}) ## tryCatch
cat("Done boot :", kk, "\n")
names(mod) <- classifier
mod
}, cl = cl) ## pfun
stopCluster(cl)
res
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.