#' Reload a package.
#' @description Unload and reload a package and sets the namespace search order.
#' @param package Unquoted package name.
#' @param pos Namespace search position.
#' @examples
#' \code{reload(trqwe)}
#' @export
reload <- function(package, pos=2) {
pstring <- deparse(substitute(package))
unloadNamespace(pstring)
library(pstring, pos=pos, character.only=T)
}
#' Unload and reload trqwe.
#' @description Unload and reload trqwe.
#' Shortcut for \code{reload(trqwe)}
#' @export
reloadtrqwe <- function() {
reload(trqwe)
}
#' Install Bioconductor package.
#' @description Utility function for installing packages from bioconductor easily.
#' @param package unquoted package name.
#' @examples
#' \code{bioc(DESeq2)}
#' @export
bioc <- function(package) {
source("http://bioconductor.org/biocLite.R")
pstring <- deparse(substitute(package))
biocLite(pstring)
}
#' Install CRAN package.
#' @description Utility function for installing packages from CRAN easily.
#' @param package unquoted package name.
#' @examples
#' \code{install(Rcpp)}
#' @export
install <- function(package, repos="http://cran.us.r-project.org") {
pstring <- deparse(substitute(package))
install.packages(pstring, repos=repos)
}
#' Time profiling.
#' @description Reports time in seconds to the R prompt of the previous command.
#' This function adds a callback which saves the running of individual commands and reports the time in seconds on the next line.
#' @examples
#' > timePrompt()
#' 0.000s> x <- sample(1:10, size=1e8, replace=T)
#' 1.240s>
#' Note - this time is not accurate if child processes or multithreading is involved.
#' @export
timePrompt <- function() {
removeTaskCallback(".exec_time_prompt")
updatePrompt <- function(...) {
utime <- proc.time()
utime_prev <- get0(".prev_time", envir=globalenv(), ifnotfound=utime)
utime_diff <- sprintf("%.3f", sum(utime[c(1,2)] - utime_prev[c(1,2)]))
options(prompt=paste0(utime_diff,"s> "))
assign(".prev_time", utime, envir=globalenv())
return(TRUE)
}
ret <- addTaskCallback(updatePrompt, name=".exec_time_prompt")
}
#' Variable information.
#' @description Automatically stores basic information of variables in the previous command.
#' This function adds a callback which reports information on previous variables and stores this information in \code{.stats}.
#' @examples
#' statsCallback()
#' my_data <- VADeaths
#' .stats
#' > [1] "dim: 5 4, length: 20, class: matrix, typeof: double"
#' @export
statsCallback <- function() {
removeTaskCallback(".stats_callback")
statsUpdate <- function(...) {
lv <- get0(".Last.value", envir=globalenv(), ifnotfound=NULL)
dim <- paste0(dim(lv), collapse=" ")
length <- length(lv)
class <- class(lv)
typeof <- typeof(lv)
assign(".stats", paste0("dim: ",dim,", length: ", length, ", class: ", class, ", typeof: ",typeof), envir=globalenv())
return(TRUE)
}
ret <- addTaskCallback(statsUpdate, name=".stats_callback")
}
#' All duplicates.
#' @description Finds all duplicates in a vector including first instances of duplicates.
#' @param vec A vector.
#' @return A boolean vector of the same length as \code{vec}. TRUE if the element is duplicated, FALSE if not.
#' @examples
#' allDups(sample(1:100, size=100, replace=T)
#' @export
allDups <- function(vec) {
duplicated(vec) | rev(duplicated(rev(vec)))
}
#' Fast readLines.
#' @description Replacement for readLines, faster.
#' @param fname Filename to read.
#' @param newlinechar The new line character in the file.
#' @return A vector containing all the lines in the file.
#' @examples
#' library(microbenchmark)
#' writeLines(replicate(100000, sample(letters, size=100, replace=T)), con="/tmp/temp.txt")
#' microbenchmark(fastReadLines("/tmp/temp.txt"),
#' readLines("/tmp/temp.txt"), times=3)[,c("expr", "mean"), drop=F]
#'
#' Unit: milliseconds
#' expr min lq mean median uq
#' fastReadLines("/tmp/temp.txt") 335.3874 335.9188 336.5900 336.4502 337.1912
#' readLines("/tmp/temp.txt") 724.9136 725.4523 727.2444 725.9911 728.4098
#' @export
fastReadLines <- function(fname, newlinechar="\n", nchars=NULL) {
if(is.null(nchars)) {
nchars = file.info(fname)$size
}
buf = readChar(fname, nchars, useBytes=T)
strsplit(buf,newlinechar,fixed=T,useBytes=T)[[1]]
}
#' Upper-left corner of matrix.
#' @description Shortcut function for previewing a matrix or data.frame by displaying the upper-left corner. Similar to \code{head}.
#' @param x A wide matrix or data.frame.
#' @param n Number of lines to display. Default 10.
#' @param ncols Number of columns to display. Default 10.
#' @return \code{n} by \code{ncols} subset of the matrix taken from the upper-left corner.
#' @export
head2 <- function(x, n = 10, ncols = 10) {
head(x[,seq_len(ncols)])
}
#' Lower-left corner of matrix.
#' @description Shortcut function for previewing the bottom of a matrix or data.frame by displaying the lower-left corner. Similar to \code{tail}.
#' @param x A wide matrix or data.frame.
#' @param n Number of lines to display. Default 10.
#' @param ncols Number of columns to display. Default 10.
#' @return \code{n} by \code{ncols} subset of the matrix taken from the lower-left corner.
#' @export
tail2 <- function(x, n = 10, ncols = 10) {
tail(x[,seq_len(ncols)])
}
#' Parse TCGA barcode.
#' @description Taking in a full TCGA sample barcode, or any subset of the barcode, and return extracted values.
#' @param x TCGA barcode, or a vector of barcodes.
#' @param what Which information to return.
#' @return The specified information contained in the barcode of the same length as \code{x}.
#' @examples
#' TCGA_barcode(c("TCGA-02-0001-01C-01D-0182-01", "TCGA-02-0001-11C-01D-0182-01"), what="tissue")
#'
#' [1] "01" "11"
#' @export
TCGA_barcode <- function(x, what = "patient") {
reg <- "TCGA-(..)-(....)-?(..)?(.)?-?(..)?(.)?-?(....)?-?(..)?"
group <- switch(what,
TSS = "\\1",
participant = "\\2",
patient = "TCGA-\\1-\\2",
sample = "\\3",
tissue = "\\3",
vial = "\\4",
portion = "\\5",
analyte = "\\6",
plate = "\\7",
center = "\\8")
gsub(reg, group, x, perl=T)
}
#' Multiple grepl.
#' @description Takes in a list of regex patterns and returns true if any pattern matches.
#' @param pattern A vector of regex patterns.
#' @param x A vector of strings to search.
#' @return A vector of the same length as \code{x}, TRUE if any pattern matches.
#' @examples
#' x <- fastReadLines("http://textfiles.com/ufo/ufobooks.ufo", newlinechar="\r\n", 1e5)
#' head(x[mgrepl(c("ALIENS", "UFO"), x)])
#'
#' [1] " UFOLOGY BOOKS (REVISION 2.1 343 books)"
#' [2] " H. S. Stewart on the subject of UFO's. The list is alphabetic"
#' [3] " Tom Mickus's most excellent board UFONET I. (416-237-1204)"
#' [4] " Bill Adler * LETTERS TO THE AIR FORCE ON UFOS 1967"
#' [5] " Gordon W. Allen OVERLORDS OLYMPIANS AND THE UFO 1974"
#' [6] " Robert B. Beard FLYING SAUCERS, UFO'S AND EXTRA"
#' @export
mgrepl <- function(patterns, x, ...) {
Reduce("|",lapply(patterns, grepl, x, ...))
}
fileIgnoreCase <- function(x) {
dir <- dirname(x)
base <- basename(x)
files <- list.files(dir)
select <- grepl(base, files, ignore.case=T)
if(any(select)) {
paste0(dir,"/",files[select][1])
} else {
return(NA)
}
}
tableVector <- function(x, factors=NULL) {
if(!is.null(factors)) {
x <- factor(x, levels = factors)
}
table(x) -> tab
structure(as.vector(tab), .Names=names(tab))
}
#' Fast C++ tabulation.
#' @description Takes in a character, integer or factor vector and tabulates the number of times each element appears.
#' @param x A character, integer or factor vector. NAs are allowed.
#' @param sort TRUE if the result names should be sorted alphanumerically.
#' @return A integer vector of counts of each element.
#' @examples
#'
#' x <- factor(sample(1e5, 1e8, replace=T))
#' microbenchmark(table(x), tablec(x), times=3)
#'
#' Unit: milliseconds
#' expr min lq mean median uq max
#' table(x) 9777.6457 10479.0949 10717.3382 11180.544 11187.1844 11193.8246
#' tablec(x) 678.0364 685.9467 713.1181 693.857 730.6589 767.4608
#'
#'
#' x <- sample(letters, 1e8, replace=T)
#' microbenchmark(table(x), tablec(x), tablec(x,sort=T), times=3)
#'
#'Unit: seconds
#' expr min lq mean median uq max
#' table(x) 5.778514 5.829125 5.855560 5.879737 5.894083 5.908430
#' tablec(x) 1.589360 1.589381 1.589516 1.589402 1.589594 1.589786
#' tablec(x, sort = T) 1.589386 1.590520 1.591824 1.591655 1.593044 1.594432
#' @export
tablec <- function(x, sort=F) {
cl <- class(x)
if(cl == "character") {
ret <- tablec_string(x)
} else if(cl == "integer") {
ret <- tablec_int(x)
} else if(cl == "factor") {
ret <- tablec_factor(x)
} else {
stop("x must be a character, integer or factor vector")
}
if(sort) {
return(ret[sort(names(ret), na.last=F)])
} else {
return(ret)
}
}
#' Size of R objects.
#' @description Prints out the size of all R objects in the environment.
#' @param env The environment to search (default global environment).
#' @param units Units to print out for each variable.
#' @return A data.frame containing the size of each object.
#' @export
varSizes <- function(env=globalenv(), units="KB") {
vars <- ls(envir=env)
if(length(vars) == 0) return(NULL)
sizes <- sapply(vars, function(v) {
format(object.size(get(v, envir=env)), units=units)
})
data.frame(vars=vars, size=sizes, row.names=1:length(sizes), stringsAsFactors=F)
}
#' Calculate FPKM.
#' @description Calculates FPKM from a count matrix.
#' @param mat An integer matrix representing the counts of a RNA-Seq dataset.
#' @param gene_lengths A named vector with the length of each gene.
#' @param uq_norm If TRUE, will perform upper-quartile normalization.
#' @return A matrix containing FPKM with the same dimensions as mat.
#' @seealso
#' FPKM-UQ - A normalized read count in which gene expression values, in FPKM, are divided by the 75th percentile value.
#'
#' \url{https://gdc-docs.nci.nih.gov/Data/Bioinformatics_Pipelines/Expression_mRNA_Pipeline/}
#'
#' \url{http://vinaykmittal.blogspot.com/2013/10/fpkmrpkm-normalization-caveat-and-upper.html}
#' @export
fpkmFromCounts<- function(mat, gene_lengths, uq_norm=T) {
gene_lengths <- gene_lengths[rownames(mat)]
gene_lengths <- gene_lengths / 1000
lib_sizes <- colSums(mat) / 1e6
mat <- diag(1 / gene_lengths) %*% mat
mat <- subset(mat, apply(mat, 1, sum) != 0)
if(uq_norm == F) {
return(mat)
} else {
uqs <- apply(mat, 2, quantile, 0.75)
print(summary(uqs))
mat <- mat %*% diag(1 / uqs)
return(mat)
}
}
# plotROC <- function(pred, class, showplot=T, main="ROC Curve", ...) {
# require(pROC)
# roc <- roc(class, pred)
# auc <- auc(roc)
# conf_mat <- data.matrix(table(pred >= 0.5, class))
# acc <- (conf_mat[1,1] + conf_mat[2,2])/sum(conf_mat)
# specificity <- roc$specificities
# sensitivity <- roc$sensitivities
# if(showplot) {
# plot(1-specificity, sensitivity, xlim = c(0,1), ylim = c(0,1), sub=paste0("auc=",round(auc,4), "; accuracy=",round(acc,4)), type="l", lwd=2, main=main,...)
# abline(a=0,b=1, col="gray")
# }
# return(list(specificity, sensitivity))
# }
# needs work, kinda broken atm
# safemclapply <- function(arg_list, p_function, mc.cores=32, tasks_per_core_per_round=3, save_folder=".temp/") {
# require(parallel)
# require(digest)
# # require(dplyr)
# substr(
# digest(list(capture.output(str(arg_list)),
# p_function,
# mc.cores,
# tasks_per_core_per_round,
# save_folder)),
# 1,10) -> run_md5
# function_ls <- ls()
# print(paste("md5 hash: ", run_md5))
# tasks_per_round = mc.cores*tasks_per_core_per_round
# task_sequence = seq(1,ceiling(length(arg_list)/tasks_per_round))
# dir.create(save_folder, showWarnings = FALSE)
# # for(lsv in ls(envir=parent.frame(1))) {lockBinding(lsv,parent.frame(1))}
# # for(lsv in ls(envir=parent.frame(2))) {lockBinding(lsv,parent.frame(2))}
# for(i in task_sequence) {
# savefile = paste0(save_folder,run_md5,"_",i,".Rds")
# if(file.exists(savefile) && file.size(savefile) > 0) next
# batch_indices = seq((i-1)*tasks_per_round+1,min(i*tasks_per_round,length(arg_list)))
# print(paste("Processing ",batch_indices[1], " to ", tail(batch_indices,1)))
# mclapply(arg_list[batch_indices], p_function, mc.cores=mc.cores) -> temp
# saveRDS(temp, file=savefile)
# gc()
# }
# # for(lsv in ls(envir=parent.frame(1))) {unlockBinding(lsv,parent.frame(1))}
# # for(lsv in ls(envir=parent.frame(2))) {unlockBinding(lsv,parent.frame(2))}
# ret_list <- list()
# for(i in task_sequence) {
# savefile = paste0(save_folder,run_md5,"_",i,".Rds")
# readRDS(savefile) -> ret_list[[i]]
# }
# do.call(c, ret_list) -> ret
# return(ret)
# }
# splitForMC <- function(arg_list, folder) {
# if(is.null(names(arg_list))) {
# names(arg_list) <- 1:length(arg_list)
# }
# for(i in 1:length(arg_list)) {
# names(arg_list)[i] -> na
# saveRDS(arg_list[[i]], file=sprintf("%s/%s.Rds",folder,na))
# }
# }
#' Parallel split-matrix or dataframe apply loop
#' @description
#' Splits a matrix or data.frame into subsets based on a factor, and applies a function to each subset. Typical use case: sum exon count data to gene count data. This is similar to the dplyr idiom: \code{df \%>\% group_by(f) \%>\% do(...)}, but has several advantages. 1) mcsplitapply can be used on matrices (and is therefore much faster), 2) inherently parallelized, 3) can return results other than dataframes, 4) you can specify how the data are combined (default is rbind).
#' @param mat The matrix.
#' @param f A factor of length equal to nrow(mat). The levels of this factor will split the matrix into subsets.
#' @param func The function to apply to each subset.
#' @param .combine The function to combine the results with. Default is rbind. Use NA to return a list.
#' @param mc.cores The number of cores to use.
#' @return A list or a combined object depending on the .combine parameter.
#' @examples
#' library(pasilla)
#' library(DEXSeq)
#' library(trqwe)
#' data(pasillaDEXSeqDataSet)
#' exon_counts <- counts(dxd)
#' f <- rowData(dxd)$groupID
#' gene_counts <- mcsplitapply(exon_counts, f, colSums)
#' @export
mcsplitapply <- function(mat, f, func, mc.cores=4, .combine=rbind, ...) {
if(mc.cores > 1) require(parallel)
uf <- unique(f)
ret <- parallel::mclapply(uf, function(fi) {
func(mat[f == fi,,drop=F])
}, mc.cores=mc.cores)
names(ret) <- uf
if(!is.function(.combine)) {
return(ret)
} else {
ret <- do.call(.combine, ret)
return(ret)
}
}
#' Concatenate strings.
#' @description Concatenates two strings.
#' @param a First string.
#' @param b Second string.
#' @return The concatenated string.
#' @examples
#' 'Hello ' %Q% 'World'
#'
#' [1] "Hello World"
#' @export
`%Q%` <- function(a, b) paste0(a, b)
#' Append to a vector.
#' @description Appends the 2nd argument to the 1st.
#' @param x A vector.
#' @param value The element to append.
#' @examples
#' x <- 1:5
#' append(x) <- 6
#' print(x)
#'
#' [1] 1 2 3 4 5 6
#' @export
`append<-` <- function(x, value) {
c(x, value)
}
#' Prepend to a vector.
#' @description Prepends the 2nd argument to the 1st.
#' @param x A vector.
#' @param value The element to append.
#' @examples
#' x <- 1:5
#' prepend(x) <- 6
#' print(x)
#'
#' [1] 6 1 2 3 4 5
#' @export
`prepend<-` <- function(x, value) {
c(value, x)
}
#' Highest elements in a vector.
#' @description Finds the top elements in a vector very quickly. Equivalent ot \code{-sort(-x, partial=1:n)}
#' @param x A numeric vector.
#' @param n The number of top elements to return.
#' @param lowest If TRUE, returns the lowest elements instead of the highest.
#' @param value If TRUE, returns the values of the top elements. If FALSE, returns the indices.
#' @return A vector containing the indices or the values of the top elements.
#' @examples
#' naive_top <- function(x, n) {
#' -sort(-x, partial=1:n)
#' }
#' x <- runif(1e7)
#' microbenchmark(naive_top(x,100), topn(x,100,value=T), times=10)
#'
#' Unit: milliseconds
#' expr min lq mean median uq
#' naive_top(x, 100) 1070.0180 1071.5951 1075.964 1072.3520 1073.9989
#' topn(x, 100, value = T) 433.6682 433.8882 434.771 434.4986 435.6029
#' @seealso
#' \url{http://stackoverflow.com/questions/18450778/}
#' @export
topn <- function(x, n=100, value=F, lowest=F) {
if(lowest) x <- -x
nx <- length(x)
p <- nx-n
xp <- sort(x, partial=p)[p]
wh <- x > xp
if(value) {
return(x[wh])
} else {
return(which(wh))
}
}
#' Standard error.
#' @description Calculates the standard error of a sampling distribution.
#' @param x A vector.
#' @return The standard error of x.
#' @examples
#' x <- rnorm(1e3)
#' se(x)
#' [1] 0.03192027
#' @export
se <- function(x) {x<-na.omit(x);sd(x)/sqrt(length(x))}
# rescale <- function(x, min=0, max=1) {
# (x-min(x))/(max(x)-min(x)) * (max-min) + min
# }
colorInterpolate <- function(x, color1="white", color2="blue") {
colramp <- colorRamp(c(color1, color2))
cols <- colramp(rescale(x))
cols <- cols/255
ncol(cols) -> nc
apply(cols, 1, function(r) {
if(nc == 3) {
rgb(r[1], r[2], r[3])
} else {
rgb(r[1], r[2], r[3], r[4])
}
})
}
#devel branch : https://github.com/GuangchuangYu/clusterProfiler/blob/master/R/GMT.R
read.gmt <- function(gmtfile) {
require(GSEABase)
gmt <- getGmt(con=gmtfile)
ont2gene <- stack(geneIds(gmt))
ont2gene <- ont2gene[, c("ind", "values")]
colnames(ont2gene) <- c("ont", "gene")
return(ont2gene)
}
#specific function for pulling down pathway descriptions from GSEA
# pathwayDescriptionPullDown <- function(gmt_file) {
# require(curl)
# readLines(gmt_file) -> gmt
# strsplit(gmt, split ="\t") -> gmt
# t(sapply(gmt, function(p) {
# c(p[1], p[2])
# })) -> pathway_urls
# sapply(pathway_urls[,2], function(url) {
# tryCatch({
# con <- curl(url)
# lines <- readLines(con)
# desc <- lines[which(grepl("Brief", lines))+1]
# desc <- gsub("^.*<td>","",desc)
# desc <- gsub("</td>$","",desc)
# desc <- gsub("<a href.+?</a>","",desc)
# desc <- gsub(" "," ",desc)
# desc <- gsub("[\\. ]+$","",desc)
# print(desc)
# Sys.sleep(.5)
# return(desc)
# }, error=function(e) e)
# }) -> descs
# data.frame(short=pathway_urls[,1], url=pathway_urls[,2], desc=descs)
# }
#' Cosine distance.
#' @description Calculates the cosine distance of rows of a matrix.
#' @param x A matrix.
#' @return Cosine distance as a dist object.
#' @examples
#' rbind(c(1,1,1,0,0,0), c(0,0,0,1,1,1), c(1,1,1,0,0,1)) -> x
#' cosineDist(x)
#' 1 2
#' 2 1.0000000
#' 3 0.1339746 0.7113249
#' @seealso
#' \url{http://stackoverflow.com/questions/2535234/find-cosine-similarity-in-r}
#' @export
cosineDist <- function(x){
as.dist(1 - x%*%t(x)/(sqrt(rowSums(x^2) %*% t(rowSums(x^2)))))
}
intRangeString <- function(vec) {
if(length(vec) ==1) return(as.character(vec))
vec <- sort(vec)
res <- list()
res[[1]] <- vec[1]
for(i in 2:length(vec)) {
if(vec[i] == vec[i-1] + 1 || vec[i] == vec[i-1]) {
res[[length(res)]] <- c(res[[length(res)]],vec[i])
} else {
res[[length(res)+1]] <- vec[i]
}
}
sapply(res, function(r) {
if(min(r) == max(r)) {
return(as.character(max(r)))
} else {
return(paste0(min(r), "-", max(r)))
}
}) -> res
paste0(res, collapse=",")
}
# leadingEdgeGenes <- function(geneList, geneSet, exponent=1) {
# require(DOSE)
# df <- DOSE:::gseaScores(geneList=geneList, geneSet=geneSet, exponent=exponent,fortify=T)
# names(geneList)[which(df$position==1)]
# upreg <- max(df$runningScore) > abs(min(df$runningScore))
# if(upreg) {
# set <- 1:which.max(df$runningScore)
# } else {
# set <- which.min(df$runningScore):length(df$runningScore)
# }
# intersect(names(geneList)[set], geneSet)
# }
scaleVec <- function(x, ...) {
scale(x, ...)[,1]
}
# getAuc <- function(true_Y, probs) {
# probsSort = sort(probs, decreasing = TRUE, index.return = TRUE)
# val = unlist(probsSort$x)
# idx = unlist(probsSort$ix)
# roc_y = true_Y[idx];
# stack_x = cumsum(roc_y == 0)/sum(roc_y == 0)
# stack_y = cumsum(roc_y == 1)/sum(roc_y == 1)
# auc = sum((stack_x[2:length(roc_y)]-stack_x[1:length(roc_y)-1])*stack_y[2:length(roc_y)])
# return(list(stack_x=stack_x, stack_y=stack_y, auc=auc))
# }
# getAuc <- function(class, pred) {
# ord <- order(pred)
# pred <- pred[ord]
# class <- class[ord]
# as.data.frame(table(pred, class)) -> tab
# # tab$pred <- unique(pred)
# npos <- sum(subset(tab, class==1)$Freq)
# nneg <- sum(subset(tab, class==0)$Freq)
# tpr <- npos - cumsum(subset(tab, class==1)$Freq)
# tpr <- tpr/npos
# tpr <- c(0, rev(tpr))
# fpr <- cumsum(subset(tab, class==0)$Freq)
# fpr <- fpr/fpr[length(fpr)]
# fpr <- c(0, fpr)
# auc <- sum(sapply(1:(nrow(tab)/2), function(i) {
# (tpr[i] + tpr[i+1])/2 * (fpr[i+1]-fpr[i])
# }))
# return(list(auc=auc, tpr=tpr, fpr=fpr))
# }
#style converter for titles
# titleConvert <- function(title, style="camel", sep=" ", original_sep="_") {
# title <- unlist(strsplit(title, split=original_sep))
# title <- tolower(title)
# if(style == "camel") {
# title <- gsub("^(.)", "\\U\\1", title, perl=T)
# } else if(style == "upper") {
# title <- toupper(title)
# }
# paste0(title, collapse=sep)
# }
# do.call.args <- function(what, args, quote = FALSE, envir = parent.frame(), ...) {
# extra_args <- list(...)
# do.call(what, c(args, extra_args))
# }
#' @export
saveListFF <- function(list, return_ff_list=F, ...) {
require(ff)
if(is.null(names(list))) {
names(list) <- 1:length(list)
}
if(any(is.na(names(list)))) {
stop("list must have names")
}
ff_list <- sapply(list, as.ff, simplify=F)
with(ff_list, {
ffsave(list=ls(),...)
})
if(return_ff_list) return(ff_list)
}
#' @export
loadFFList <- function(file, ...) {
require(ff)
ff_env <- new.env()
ffload(file, envir=ff_env, ...)
return(as.list(ff_env))
}
#' Cleans leading and trailing whitespace.
#' @description Removes leading and trailing whitespace in a vector of strings.
#' @param x A character vector.
#' @return A vector with leading and trailing whitespace removed.
#' @examples
#' chop(c(" hello ", " 123 \t"))
#'
#' [1] "hello" "123"
#' @export
chop <- function(x) {
gsub("\\s+$","",gsub("^\\s+","",x))
}
multipagePlot <- function(plot_list, nrow=2, ncol=2,...) {
require(grid)
require(gridExtra)
while(length(plot_list) > 0) {
page_plots <- plot_list[seq(min(nrow*ncol, length(plot_list)))]
plot_list <- plot_list[-seq(nrow*ncol)]
grid.arrange(grobs=page_plots, ncol=ncol, nrow=nrow,...)
}
}
#mclapply is broken and has memory issues. Workaround: save return value to temp folder and read at the end of loop
#to do: still needs work
# null.mclapply <- function(X,FUN,mc.cores,...) {
# require(parallel)
# base <- paste0(tempdir(),"/",paste0(sample(c(letters,LETTERS,0:9), replace=T,size=12),collapse=""))
# message(paste0("Saving results to ",base))
# mclapply(seq_along(X),function(i) {
# saveRDS(FUN(X[i]),file=paste0(base,"_",i,".Rds"),compress=F)
# return(NULL)
# }, mc.cores=mc.cores,...)
# message(paste0("Reading results from ",base))
# mclapply(seq_along(X),function(i) {
# readRDS(paste0(base,"_",i,".Rds"))
# },mc.cores=mc.cores,...)
# }
# testAUC <- function(probs, class) {
# x <- probs
# y <- class
# x1 = x[y==1]; n1 = length(x1);
# x2 = x[y==0]; n2 = length(x2);
# r = rank(c(x1,x2))
# h <- ???
# r_aug <- c(0,r[1:n1],n2)
# w <- sapply(1:n1, function(ii) {
# (r[ii]+r[ii+2])/2 - 1
# })
# auc <- sum(w*h)/(n1*n2)
# }
# which operations actually benefit?
mcReduce <- function(FUN,x,mc.cores=4) {
require(parallel)
while(length(x) != 1) {
x_tail <- tail(x,length(x) %% 2)
x <- mclapply(1:floor(length(x)/2), function(i) {
FUN(x[[i*2-1]],x[[i*2]])
}, mc.cores=mc.cores)
if(length(x_tail) > 0) x <- c(x,x_tail)
# print(length(x))
}
return(x[[1]])
}
#' Multi-threaded saveRDS
#' @description Uses the pigz utility to improve saving large R objects. This is compatible with saveRDS and readRDS functions. Requires pigz (sudo apt-get install pigz on Ubuntu).
#' @param object An r object to save.
#' @param file The filename to save to.
#' @param mc.cores How many cores to use in pigz. The program does not seem to benefit after more than about 4 cores.
#' @examples
#' x <- sample(1e4, 1e7, replace=T)
#' y <- sample(1e4, 1e7, replace=T)
#' microbenchmark(mcsaveRDS(x, file="temp.Rds"), saveRDS(y, file="temp2.Rds")
#'
#' Unit: seconds
#' expr min lq mean median uq
#' mcsaveRDS(x, file = "temp.Rds") 1.908310 1.908310 1.908310 1.908310 1.908310
#' saveRDS(y, file = "temp2.Rds") 6.271499 6.271499 6.271499 6.271499 6.271499
#' @seealso
#' \url{http://stackoverflow.com/questions/28927750/}
#' @export
mcsaveRDS <- function(object,file,mc.cores=min(parallel::detectCores(),4)) {
con <- pipe(paste0("pigz -p",mc.cores," > ",file),"wb")
saveRDS(object, file = con)
close(con)
}
#' Multi-threaded readRDS
#' @description Uses the pigz utility to improve loading large R objects. This is compatible with saveRDS and readRDS functions. Requires pigz (sudo apt-get install pigz on Ubuntu).
#' @param file The filename of the rds object.
#' @param mc.cores How many cores to use in pigz. The program does not seem to benefit after more than about 4 cores.
#' @return The R object.
#' @examples
#' x <- sample(1e4, 1e7, replace=T)
#' saveRDS(x, file="temp.Rds")
#' xmc <- mcreadRDS("temp.Rds")
#' @seealso
#' \url{http://stackoverflow.com/questions/28927750/}
#' @export
mcreadRDS <- function(file,mc.cores=min(parallel::detectCores(),4)) {
con <- pipe(paste0("pigz -d -c -p",mc.cores," ",file))
object <- readRDS(file = con)
close(con)
return(object)
}
#' saveRZ
#' @description faster serialization + compression using zstd
#' @export
saveRZ <- function(object, file, compression=3, mc.cores=1) {
con <- pipe(sprintf("zstd --format=zstd -f -%s -T%s -o %s", compression, mc.cores, file), "wb")
saveRDS(object, file = con)
close(con)
# writeBin(fst::compress_fst(serialize(object, connection=NULL), compressor=compressor, compression=compression), con=file, useBytes=T)
}
#' readRz
#' @description faster serialization + compression using fst package
#' @export
readRZ <- function(file, mc.cores=1) {
# unserialize(fst::decompress_fst(readBin(file, "raw", n=file.info(file)$size)))
con <- pipe(sprintf("zstd --format=zstd -d -c -T%s %s", mc.cores, file), "rb")
object <- readRDS(file = con)
close(con)
return(object)
}
#' Multi-threaded readRDS
#' @description Uses the pigz utility to improve loading large R objects. This is compatible with saveRDS and readRDS functions. Requires pigz (sudo apt-get install pigz on Ubuntu).
#' @param file The filename of the rds object.
#' @param mc.cores How many cores to use in pigz. The program does not seem to benefit after more than about 4 cores.
#' @return The R object.
#' @examples
#' x <- sample(1e4, 1e7, replace=T)
#' saveRDS(x, file="temp.Rds")
#' xmc <- mcreadRDS("temp.Rds")
#' @seealso
#' \url{http://stackoverflow.com/questions/28927750/}
#' @export
mcreadRDS <- function(file,mc.cores=min(parallel::detectCores(),4)) {
con <- pipe(paste0("pigz -d -c -p",mc.cores," ",file))
object <- readRDS(file = con)
close(con)
return(object)
}
#' Nelson Aelen estimator
#' @description Nelson–Aalen estimator is an estimator of the cumulative hazard function in survival data. It can be used to compare the overall risks of two groups or used to estimate the number of deaths before a certain time.
#' @param time The time of each patient.
#' @param event The death of each patient: 1 for a patient death, 0 for censored.
#' @return A list containing the cumulative hazard function.
#' @examples
#' library(survival)
#' data(veteran)
#' with(nelson_aelen_surv(veteran$time, veteran$status), plot(ti, Hi, type="b"))
#' @seealso
#' #https://en.wikipedia.org/wiki/Nelson-Aalen_estimator
#' @export
nelson_aelen_surv <- function(time, event) {
ord <- order(time)
time <- time[ord]
event <- event[ord]
di <- table(time[event==1])
ti <- as.numeric(names(di))
di <- as.vector(di)
ni <- length(time) - cumsum(di) + di
list(Hi=cumsum(di/ni),ti=ti)
}
#' Wald-Wolfowitz Runs Tests for Randomness
#' @description This is the k-category asymptotic Z Test with continuity correction. Imagine rolling a die multiple times to obtain a sequence of rolls. This statistic tests whether there exists a "run" within the sequence where one particular number comes up more times in a row than expected randomly. If the test is significant, it can be concluded that the die rolls are not independent.
#' @param x A vector of items, coerced into a factor.
#' @return A p-value for the statistical test.
#' @examples
#' set.seed(1)
#' ww_test(sample(2, 100, replace=T))
#' ww_test(c(sample(6, 90, replace=T),rep(1,10)))
#' @seealso
#' Reference https://ncss-wpengine.netdna-ssl.com/wp-content/themes/ncss/pdf/Procedures/NCSS/Analysis_of_Runs.pdf
#' @export
ww_test <- function(x) {
x <- factor(x)
n <- length(x)
njs <- table(x)
njs <- structure(.Data=as.vector(njs), .Names=names(njs))
mu <- (n*(n+1) - sum(njs^2))/n
sdev_l <- n^2*(n-1)
sdev_u <- sum(njs^2*(sum(njs^2)+n*(n+1))) - 2*n*sum(njs^3)-n^3
sdev <- sqrt(sdev_u/sdev_l)
r <- sum(diff(as.numeric(x)) != 0) + 1
z <- (r - mu) / sdev
pvalue <- 2*pnorm(-abs(z))
return(pvalue)
}
#' Fast AUC
#' @description This function calculates the Area Under the Reciever-Operator Curve from the results of a classifcation model.
#' @param probs A numeric vector of probabilities or likelihoods for each data point.
#' @param class A numeric vector where 1 is positive and 0 is negative.
#' @return The AUC.
#' @examples
#' library(AppliedPredictiveModeling)
#' data(abalone)
#' library(glmnet)
#' class <- factor(as.numeric(abalone$Type == "M"))
#' data <- data.matrix(abalone[,-1])
#' fit <- cv.glmnet(data, class, family="binomial")
#' probs <- predict(fit, newx=data)[,1]
#' fastAUC(probs, class)
#' @seealso
#' Reference https://stat.ethz.ch/pipermail/r-help/2005-September/079872.html
#' @export
fastAUC <- function(probs, class, method="trqwe") {
#ROCR is now much faster
if(method=="ROCR") {
require(ROCR)
pred <- ROCR::prediction(probs, class)
perf <- performance(pred, "auc")
return(perf@y.values[[1]])
}
x <- probs
y <- class
x1 = x[y==1]; n1 = length(x1);
x2 = x[y==0]; n2 = length(x2);
r = rank(c(x1,x2))
auc = (sum(r[1:n1]) - n1*(n1+1)/2) / n1 / n2
return(auc)
}
#' Fast ROC
#' @description Calculates the points in a Reciever-Operator Curve from the results of a classifcation model.
#' @param probs A numeric vector of probabilities or likelihoods for each data point.
#' @param class A numeric vector where 1 is positive and 0 is negative.
#' @return a list containing the ROC curve.
#' @examples
#' library(AppliedPredictiveModeling)
#' data(abalone)
#' library(glmnet)
#' class <- factor(as.numeric(abalone$Type == "M"))
#' data <- data.matrix(abalone[,-1])
#' fit <- cv.glmnet(data, class, family="binomial")
#' probs <- predict(fit, newx=data)[,1]
#' with(fastROC(probs, class), plot(fpr, tpr, type="l"))
#' @export
fastROC <- function(probs, class) {
if(is.factor(class)) {class = as.numeric(class) - 1}
class_sorted <- class[order(probs, decreasing=T)]
TPR <- cumsum(class_sorted) / sum(class)
FPR <- cumsum(class_sorted == 0) / sum(class == 0)
return(list(tpr=TPR, fpr=FPR))
}
#' Precision-Recall curve
#' @description Calculates the points in a Precision-Recall from the results of a classifcation model.
#' @param probs A numeric vector of probabilities or likelihoods for each data point.
#' @param class A numeric vector where 1 is positive and 0 is negative.
#' @return a list containing the ROC curve.
#' @examples
#' library(AppliedPredictiveModeling)
#' data(abalone)
#' library(glmnet)
#' class <- factor(as.numeric(abalone$Type == "M"))
#' data <- data.matrix(abalone[,-1])
#' fit <- cv.glmnet(data, class, family="binomial")
#' probs <- predict(fit, newx=data)[,1]
#' with(fastPR(probs, class), plot(recall, precision, type="l"))
#' @export
fastPR <- function(probs, class) {
if(is.factor(class)) {class = as.numeric(class) - 1}
class_sorted <- class[order(probs, decreasing=T)]
recall <- cumsum(class_sorted) / sum(class)
precision <- cumsum(class_sorted) / 1:length(class_sorted)
return(list(recall=recall, precision=precision))
}
#' Matthew's correlation coefficient
#' @description Calculates Matthew's correlation coefficient. Requires the Rmpfr package.
#' @param probs A numeric vector where 1 is predicted positive and 0 is predicted negative.
#' @param class A numeric vector where 1 is positive and 0 is negative.
#' @return The MCC score.
#' @seealso
#' \url{https://en.wikipedia.org/wiki/Matthews_correlation_coefficient}
#' @export
mccscore <- function(probs, class) {
require(Rmpfr)
m <- function(x) mpfr(x, precBits=20)
tp <- m(sum((probs == 1) & (class == 1)))
tn <- m(sum((probs == 0) & (class == 0)))
fp <- m(sum((probs == 1) & (class == 0)))
fn <- m(sum((probs == 0) & (class == 1)))
as.numeric(((tp * tn) - (fp * fn)) / sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn)))
}
#' Posterior probability adjustment.
#' @description Adjusts the posterior probability of a classifier based on unbalanced datasets. In classification model where the negative data is randomly under-sampled and all the positive data is used, the adjustment factor (beta) is p(s=1|-) = p(+)/p(-). I.e., the probability that a negative datapoint is selected in the classifier. beta ~ N+/N-.
#' @param probs The original posterior probability.
#' @param beta The adjustment factor.
#' @param Nplus The number of positive examples in the real data. Only used if beta is NULL.
#' @param Nminus The number of negative examples in the real data. Only used if beta is NULL.
#' @return The adjusted posterior probability.
#' @seealso
#' Dal Pozzolo, Andrea, et al. "Calibrating probability with undersampling for unbalanced classification." Computational Intelligence, 2015 IEEE Symposium Series on. IEEE, 2015.
#' @export
posteriorBalance <- function(probs, beta=NULL, Nplus, Nminus) {
if(is.null(beta)) {
beta <- Nplus/Nminus
}
beta*probs / (beta*probs-probs+1)
}
#' F1 score.
#' @description Calculates F1 score from the results of a classification model.
#' @param probs A numeric vector where 1 is predicted positive and 0 is predicted negative.
#' @param class A numeric vector where 1 is positive and 0 is negative.
#' @return The F1 score.
#' @seealso
#' \url{https://en.wikipedia.org/wiki/F1_score}
#' @export
f1score <- function(probs, class) {
tp <- sum((probs == 1) & (class == 1))
fp <- sum((probs == 1) & (class == 0))
fn <- sum((probs == 0) & (class == 1))
precision <- tp / (tp + fp)
recall <- tp / (tp + fn)
2 * precision * recall / (precision + recall)
}
#' Concordance Index.
#' @description Calculates the concordance index from the results of a censored survival model. Very fast compared to other packages.
#' @param probs The prognostic score of each patient.
#' @param time The time of each patient.
#' @param event The death of each patient: 1 for a patient death, 0 for censored.
#' @return The concordance index.
#' @export
cindex <- function(probs, time, event)
{
wh <- which(event==1)
time_mat <- outer(time, time[wh], ">")
pred_mat <- outer(probs, probs[wh], "<")
pred_mat_eq <- outer(probs, probs[wh], "==")
total <- sum(time_mat)
concord <- sum(time_mat & pred_mat) + sum(time_mat & pred_mat_eq)*0.5
concord/total
}
#' Sigmoid Function.
#' @description Calculates the results of the sigmoid function.
#' @param probs x Input to the function.
#' @return Sigmoid(x)
#' @export
sigmoid <- function(x) {
return(1/(1+exp(-x)))
}
#' Logit Transformation.
#' @description Calculates the results of the Logit Transformation.
#' @param x Input to the function (e.g., probabilities from 0 to 1).
#' @return Logit(x)
#' @export
logit <- function(x) {
return(log(x/(1-x)))
}
#' Matrix Factor Design.
#' @description From a factor, returns a design matrix with a column for each level.
#' @param x A factor.
#' @param names The name of each instance in the resulting matrix (i.e., the rownames).
#' @return The design matrix.
#' @examples
#' matrixFactor(factor(letters))
#' @export
matrixFactor <- function(x, names=NULL) {
x <- factor(x)
xlevels <- levels(x)
x_mat <- matrix(nrow = length(x), ncol = length(xlevels), 0)
colnames(x_mat) <- xlevels
if(!is.null(names)) {
rownames(x_mat) <- names
}
col <- match(x, xlevels)
for(i in 1:length(x)) {
x_mat[i,col[i]] <- 1
}
return(x_mat)
}
#' Make percentage from number
#' @description Make percentage from number
#' @param x A number.
#' @param digits Number of decimal places, default 2.
#' @return A percentage.
#' @examples
#' make_percent(0.424, 2)
#' @export
make_percent <- function(x, digits=2) {
paste0(round(100*x,digits),"%")
}
drew_table_plot <- function(my_data_frame, title=NULL) {
library(gridExtra)
library(gtable)
library(grid)
library(ggplot2)
tt3 = ttheme_minimal( core=list(bg_params = list(fill = c("azure3","azure2", "azure1", "white"), col=NA),
fg_params=list(fontface=3, cex = .55)), colhead=list(fg_params=list(col="black", fontface=4L,
cex = .55)), rowhead=list(fg_params=list(col="white", fontface=3L, cex = .55)))
g1 = tableGrob(my_data_frame, theme = tt3)
title = textGrob(title, gp=gpar(fontsize=14))
padding = unit(5,"mm")
table = gtable_add_rows(g1, heights = grobHeight(title) + padding, pos = 0)
table = gtable_add_grob(table, title, 1, 1, 1, ncol(table))
grid.arrange(table, nrow=1, newpage = FALSE)
}
gg_center_title <- function() {
theme(plot.title = element_text(hjust = 0.5))
}
gg_rotate_xlabels <- function(angle=90, hjust=1, vjust=0.5, ...) {
theme(axis.text.x = element_text(angle = angle, hjust = hjust, vjust=vjust))
}
gg_legend_bottom <- function() {
theme(legend.position="bottom")
}
#https://stackoverflow.com/questions/12041042/how-to-plot-just-the-legends-in-ggplot2
gg_legend<-function(a.gplot){
tmp <- ggplot_gtable(ggplot_build(a.gplot))
leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
legend <- tmp$grobs[[leg]]
return(legend)
}
#' Fast binary KNN classifier
#' @description Fast binary KNN classifier
#' @param distmat A NxN pre-computed distance matrix
#' @param train_idx Train indicies
#' @param test_idx Test indicies
#' @param classes vector length N, 1 or 0
#' @param K Number of nearest neighbors parameter
#' @param mc.cores Number of threads to use
#' @return A prediction vector for the test set based on the class labels of the train set.
#' @export
trqwe_KNN <- function(distmat, train_idx, test_idx, classes, K, mc.cores=1) {
if(mc.cores==1) {
sapply(test_idx, function(ti) {
sum(classes[topn(distmat[ti,train_idx], n=K, lowest=T)])/K
})
} else {
unlist(mclapply(test_idx, function(ti) {
sum(classes[topn(distmat[ti,train_idx], n=K, lowest=T)])/K
}, mc.cores=mc.cores))
}
}
#' Set rownames of data.frame or matrix
#' @description Set rownames of data.frame or matrix and return it, for use with pipes
#' @param df data.frame or matrix
#' @param rownames rownames to add
#' @return df with rownames added
#' @export
set_rownames <- function(df, rownames) {
rownames(df) <- rownames
return(df)
}
#' Set colnames of data.frame or matrix
#' @description Set colnames of data.frame or matrix and return it, for use with pipes
#' @param df data.frame or matrix
#' @param colnames colnames to add
#' @return df with colnames added
#' @export
set_colnames <- function(df, colnames) {
colnames(df) <- colnames
return(df)
}
# https://stackoverflow.com/questions/7509910/how-can-i-create-a-custom-assignment-using-a-replacement-function
once <- structure(NA, class = "once")
"[<-.once" <- function(once, x, value) {
xname <- deparse(substitute(x))
pf <- parent.frame()
if (!exists(xname, pf)) assign(xname, value, pf)
once
}
# multiple assignment; the zeallot package does it better, so no longer using this approach
#' #' Multiple return assignment
#' #' @description Python style multiple return assignment.
#' #' @examples
#' #' mreturn[x,y,z] <- list("hello", c(1,2,3), sqrt(2))
#' #' print(x)
#' #' [1] "hello"
#' #' print(y)
#' #' [1] 1 2 3
#' #' print(z)
#' #' [1] 1.414214
#' #' @rdname mreturn
#' mreturn <- structure(NA, class = "mreturn")
#'
#' `[<-.mreturn` <- function(mreturn, ..., value) {
#' vars <- sapply(substitute(list(...)), deparse)[-1]
#' stopifnot(all(make.names(vars) == vars)) # invalid variable names
#' stopifnot(length(value) == length(vars)) # incorrect number of return values
#' for(i in 1:length(vars)) {
#' assign(vars[i], value=value[[i]], envir = parent.frame())
#' }
#' return(mreturn)
#' }
# This is the from the TOSTER package, without any extraneous output
TOSTtwo <- function (m1, m2, sd1, sd2, n1, n2, low_eqbound_d, high_eqbound_d,
alpha, var.equal)
{
if (missing(alpha)) {
alpha <- 0.05
}
if (missing(var.equal)) {
var.equal <- FALSE
}
if (var.equal == TRUE) {
sdpooled <- sqrt((((n1 - 1) * (sd1^2)) + (n2 - 1) * (sd2^2))/((n1 +
n2) - 2))
low_eqbound <- low_eqbound_d * sdpooled
high_eqbound <- high_eqbound_d * sdpooled
degree_f <- n1 + n2 - 2
t1 <- ((m1 - m2) - low_eqbound)/(sdpooled * sqrt(1/n1 +
1/n2))
p1 <- pt(t1, degree_f, lower.tail = FALSE)
t2 <- ((m1 - m2) - high_eqbound)/(sdpooled * sqrt(1/n1 +
1/n2))
p2 <- pt(t2, degree_f, lower.tail = TRUE)
t <- (m1 - m2)/(sdpooled * sqrt(1/n1 + 1/n2))
pttest <- 2 * pt(-abs(t), df = degree_f)
LL90 <- (m1 - m2) - qt(1 - alpha, n1 + n2 - 2) * (sdpooled *
sqrt(1/n1 + 1/n2))
UL90 <- (m1 - m2) + qt(1 - alpha, n1 + n2 - 2) * (sdpooled *
sqrt(1/n1 + 1/n2))
LL95 <- (m1 - m2) - qt(1 - (alpha/2), n1 + n2 - 2) *
(sdpooled * sqrt(1/n1 + 1/n2))
UL95 <- (m1 - m2) + qt(1 - (alpha/2), n1 + n2 - 2) *
(sdpooled * sqrt(1/n1 + 1/n2))
}
else {
sdpooled <- sqrt((sd1^2 + sd2^2)/2)
low_eqbound <- low_eqbound_d * sdpooled
high_eqbound <- high_eqbound_d * sdpooled
degree_f <- (sd1^2/n1 + sd2^2/n2)^2/(((sd1^2/n1)^2/(n1 -
1)) + ((sd2^2/n2)^2/(n2 - 1)))
t1 <- ((m1 - m2) - low_eqbound)/sqrt(sd1^2/n1 + sd2^2/n2)
p1 <- pt(t1, degree_f, lower.tail = FALSE)
t2 <- ((m1 - m2) - high_eqbound)/sqrt(sd1^2/n1 + sd2^2/n2)
p2 <- pt(t2, degree_f, lower.tail = TRUE)
t <- (m1 - m2)/sqrt(sd1^2/n1 + sd2^2/n2)
pttest <- 2 * pt(-abs(t), df = degree_f)
LL90 <- (m1 - m2) - qt(1 - alpha, degree_f) * sqrt(sd1^2/n1 +
sd2^2/n2)
UL90 <- (m1 - m2) + qt(1 - alpha, degree_f) * sqrt(sd1^2/n1 +
sd2^2/n2)
LL95 <- (m1 - m2) - qt(1 - (alpha/2), degree_f) * sqrt(sd1^2/n1 +
sd2^2/n2)
UL95 <- (m1 - m2) + qt(1 - (alpha/2), degree_f) * sqrt(sd1^2/n1 +
sd2^2/n2)
}
ptost <- max(p1, p2)
ttost <- ifelse(abs(t1) < abs(t2), t1, t2)
dif <- (m1 - m2)
testoutcome <- ifelse(pttest < alpha, "significant", "non-significant")
TOSToutcome <- ifelse(ptost < alpha, "significant", "non-significant")
if (var.equal == TRUE) {
# message(cat("Using alpha = ", alpha, " Student's t-test was ",
# testoutcome, ", t(", degree_f, ") = ", t, ", p = ",
# pttest, sep = ""))
# cat("\\n")
# message(cat("Using alpha = ", alpha, " the equivalence test based on Student's t-test was ",
# TOSToutcome, ", t(", degree_f, ") = ", ttost, ", p = ",
# ptost, sep = ""))
}
else {
# message(cat("Using alpha = ", alpha, " Welch's t-test was ",
# testoutcome, ", t(", degree_f, ") = ", t, ", p = ",
# pttest, sep = ""))
# cat("\\n")
# message(cat("Using alpha = ", alpha, " the equivalence test based on Welch's t-test was ",
# TOSToutcome, ", t(", degree_f, ") = ", ttost, ", p = ",
# ptost, sep = ""))
}
TOSTresults <- data.frame(t1, p1, t2, p2, degree_f)
colnames(TOSTresults) <- c("t-value 1", "p-value 1", "t-value 2",
"p-value 2", "df")
bound_d_results <- data.frame(low_eqbound_d, high_eqbound_d)
colnames(bound_d_results) <- c("low bound d", "high bound d")
bound_results <- data.frame(low_eqbound, high_eqbound)
colnames(bound_results) <- c("low bound raw", "high bound raw")
CIresults <- data.frame(LL90, UL90)
colnames(CIresults) <- c(paste("Lower Limit ", 100 * (1 -
alpha * 2), "% CI raw", sep = ""), paste("Upper Limit ",
100 * (1 - alpha * 2), "% CI raw", sep = ""))
# cat("TOST results:\\n")
#print(TOSTresults)
# cat("\\n")
# cat("Equivalence bounds (Cohen's d):\\n")
#print(bound_d_results)
# cat("\\n")
# cat("Equivalence bounds (raw scores):\\n")
#print(bound_results)
# cat("\\n")
# cat("TOST confidence interval:\\n")
#print(CIresults)
invisible(list(TOST_t1 = t1, TOST_p1 = p1, TOST_t2 = t2,
TOST_p2 = p2, TOST_df = degree_f, alpha = alpha, low_eqbound = low_eqbound,
high_eqbound = high_eqbound, low_eqbound_d = low_eqbound_d,
high_eqbound_d = high_eqbound_d, LL_CI_TOST = LL90, UL_CI_TOST = UL90))
}
#' Calculates power/sample size of a TOST test
#' @description Calculates the sample size required for a TOST test, given a sample size through exact statistics or simulation
#' @param mu_A mean of group A
#' @param mu_B mean of group B
#' @param sd_A SD of group A
#' @param sd_B SD of group B
#' @param delta maximum tolerated difference (i.e., abs(mu_A - mu_B))
#' @param kappa sample size of group A / sample size of group B (nA / nB)
#' @param alpha significance threshold (0.05 default)
#' @param power Desired power level (0.8 default)
#' @param method Whether to use the exact statistics or simulation. "Exact" - exact statistics, "equivalence" - simulate using equivalence package tost function, "TOSTER" - simulate using TOSTER functions
#' @param n_iterations Number of simulation iterations
#' @param paired If the TOST is paired (kappa should be 1)
#' @param paired_var Sample variance for paired TOST
#' @return For exact statistics, returns a list of two values: minimum sample size to achieve desired power and the power at that sample size. For simulation, returns a series of sample sizes and achieved power up to the desired power.
#' @examples
#' res_equivalence <- tost_power(mu_A=1, mu_B=1, sd_A=0.2, sd_B=0.2, delta=0.2, method="equivalence")
#' res_toster <- tost_power(mu_A=1, mu_B=1, sd_A=0.2, sd_B=0.2, delta=0.2, method="TOSTER")
#' res_exact <- tost_power(mu_A=1, mu_B=1, sd_A=0.2, sd_B=0.2, delta=0.2, method="exact")
#'
#' plot(res_equivalence$nB, res_equivalence$Power, xlim=c(1,20), ylim=c(0,1), type="s", main="TOST power analysis")
#' lines(res_toster$nB, res_toster$Power, col="green", type="s")
#' abline(h=0.8, col="blue", lty=2)
#' points(res_exact$nB, res_exact$Power, col="red", pch=18, cex=2)
#' legend("topleft", legend=c("equivalence", "TOSTER", "exact"), col=c("black", "green", "red"), pch=1)
#'
#' res_equivalence <- tost_power(mu_A=1, mu_B=1, sd_A=0.2, sd_B=0.2, delta=0.1, method="equivalence")
#' res_exact <- tost_power(mu_A=1, mu_B=1, sd_A=0.2, sd_B=0.2, delta=0.1, method="exact")
#' plot(res_equivalence$nB, res_equivalence$Power, xlim=c(1,80), ylim=c(0,1), type="s", main="TOST power analysis")
#' abline(h=0.8, col="blue", lty=2)
#' points(res_exact$nB, res_exact$Power, col="red", pch=18, cex=2)
#' legend("topleft", legend=c("equivalence", "exact"), col=c("black", "red"), pch=1)
#' @seealso
#' \url{http://powerandsamplesize.com/Calculators/Compare-2-Means/2-Sample-Equality}
#' Chow S, Shao J, Wang H. 2008. Sample Size Calculations in Clinical Research. 2nd Ed. Chapman & Hall/CRC Biostatistics Series. page 58.
#' @export
tost_power <- function(mu_A, mu_B, sd_A, sd_B, delta=0.3, kappa=1, alpha=0.05, power=0.8, method="equivalence", n_iterations=1000, paired=F, sample_var=0) {
require(equivalence)
beta <- 1-power
if(method == "exact") {
# Reference for exact calculation:
# http://powerandsamplesize.com/Calculators/Compare-2-Means/2-Sample-Equality
# Chow S, Shao J, Wang H. 2008. Sample Size Calculations in Clinical Research. 2nd Ed. Chapman & Hall/CRC Biostatistics Series. page 58.
# Adapted for different variance for group A and B
# (1+1/kappa)*(sd*(qnorm(1-alpha)+qnorm(1-beta/2))/(abs(mu_A-mu_B)-delta))^2
nB <- (sd_A^2+sd_B^2/kappa)*((qnorm(1-alpha)+qnorm(1-beta/2))/(abs(mu_A-mu_B)-delta))^2
nB_ceil <- ceiling(nB) # 108
# z <- (abs(mu_A-mu_B)-delta)/(sd*sqrt((1+1/kappa)/nB))
z <- (abs(mu_A-mu_B)-delta)/(sqrt((sd_A^2+sd_B^2/kappa)/nB))
Power <- 2*(pnorm(z-qnorm(1-alpha))+pnorm(-z-qnorm(1-alpha)))-1
return(list(nB=nB_ceil, Power=Power))
}
nB_i <- 3
power_empirical <- 0
power_curve <- c()
patience <- 3
while(power_empirical < power | patience > 0) {
pvals <- sapply(1:n_iterations, function(i) {
if(!paired) {
sA <- rnorm(nB_i*kappa, mean=mu_A, sd=sd_A)
sB <- rnorm(nB_i, mean=mu_B, sd=sd_B)
} else {
sample_effect <- rnorm(nB_i, sd=sqrt(sample_var))
sA <- rnorm(nB_i, mean=mu_A+sample_effect, sd=sd_A)
sB <- rnorm(nB_i, mean=mu_B+sample_effect, sd=sd_B)
}
if(method == "equivalence") {
res <- tost(sB, sA, epsilon=delta, paired=paired, conf.leve=1-alpha)
res$tost.p.value
} else {
stopifnot(!paired)
sd_A_emp <- sd(sA)
sd_B_emp <- sd(sB)
sd_pooled <- sqrt( ((nB_i*kappa)*sd_A_emp^2 + (nB_i)*sd_B_emp^2) / (nB_i*kappa + nB_i - 2) )
res <- TOSTtwo(m1=mean(sA), m2=mean(sB), sd1=sd_A_emp, sd2=sd_B_emp, n1=nB_i*kappa, n2=nB_i, low_eqbound=-(delta/sd_pooled), high_eqbound=(delta/sd_pooled), alpha = alpha)
res$TOST_p2
}
})
power_empirical <- sum(pvals < 0.05) / n_iterations
power_curve <- c(power_curve, power_empirical)
nB_i <- nB_i + 1
if(power_empirical > power) patience <- patience - 1
}
return(list(nB=1:length(power_curve)+2, Power=power_curve))
}
#' @export
aws_ls <- function(s3path, awspath="/Users/tching/Library/Python/2.7/bin//aws") {
cmd <- sprintf("%s s3 ls %s --recursive | awk '{$1=$2=$3=\"\"; print $0}' | sed 's/^[ \t]*//'", awspath, s3path)
system(cmd, intern=T)
}
#' Heterogeneous cluster helper
#' @description Using PSOCK cluster from future package
#' @export
halCluster <- function(varlist=ls(envir), .packages="auto", envir=parent.frame(), mc.cores.server=8, mc.cores.local=3,
user=Sys.info()["user"], server_ip="hal2",revtunnel=T, ...) {
# if(master_ip == "auto") {
# master_ip <- tryCatch({system("internal-ip --ipv4", intern=T)},
# error=function(e) cat("Run npm install --global internal-ip-cli\\n")
# )
# }
if(.packages[1] == "auto") {
.packages <- names(sessionInfo()$otherPkgs)
}
require(parallel)
require(future)
cl <- makeClusterPSOCK(c(rep("localhost", mc.cores.local),rep(server_ip,mc.cores.server)), user=user,
homogeneous=F, revtunnel=revtunnel,...)
clusterExport(cl, varlist=varlist, envir=envir)
if(!is.null(.packages)) {
clusterCall(cl, fun=function() {
for(pkg in .packages) {
library(pkg, character.only=T)
}
})
}
return(cl)
}
#' @export
log_offset_scale <- function(offset, base=10){
trans <- function(x) {log(x+offset, base=base)}
inv <- function(x) {base^x-offset}
trans_new("log_offset_scale", transform = trans, inverse = inv)
}
#' @export
pow <- function(x, ...) {
vars <- substitute(list(...))
vars <- as.character(vars)[-1]
stopifnot(length(x) == length(vars))
for(i in seq_len(length(x))) assign(vars[i], x[[i]], pos = 1)
}
#' @export
rc <- function(nucSeq) return(stringi::stri_reverse(chartr("acgtACGT", "tgcaTGCA", nucSeq)))
#' @export
sterling2 <- function(n,k) {
require(gmp)
sum((-1L)^(0:k) * chooseZ(k, 0:k) * as.bigz(k-0:k)^(as.bigz(n))) %/% factorialZ(k)
}
#' @export
sterling2_sum <- function(n) {
require(gmp)
sum <- as.bigz(0)
for(k in 1:n) {
sum = sum + sterling2(n,k)
}
return (sum)
}
#' @export
enroll <- function(...) {
x <- list(...)
nx <- names(x)
for(i in seq_along(x)) {
if(nx[i] != "") {
assign(nx[i], x[[i]], envir=parent.frame())
}
}
}
#' @export
install_as_name <- function(pkg, new_name, tempfolder=NULL) {
require(dplyr)
require(Rcpp)
previous_dir <- getwd()
if (is.null(tempfolder))
tempfolder <- tempfile()
dir.create(tempfolder, showWarnings = F)
setwd(tempfolder)
print(tempfolder)
sprintf("wget %s", pkg) %>% system
zfolder <- untar(basename(pkg), list = T, compressed = "gzip")
untar(basename(pkg), compressed = "gzip")
zfolder <- zfolder %>% gsub("/.*$", "", .) %>% unique
z <- list.files(zfolder, recursive = T, full.names = T)
z <- z[basename(z) == "DESCRIPTION"]
stopifnot(length(z) == 1)
stopifnot(length(zfolder) == 1)
# sprintf("sed -i -r 's/^Package:.+/Package: %s/' %s/DESCRIPTION",
# new_name, new_name) %>% system
# sprintf("sed -i -r 's/^useDynLib.+/useDynLib\\(%s, .registration = TRUE\\)/' %s/NAMESPACE",
# new_name, new_name) %>% system
x <- readLines(z)
x <- gsub("^Package.+", sprintf("Package: %s", new_name), x)
writeLines(x, con=z)
x <- readLines(paste0(zfolder, "/NAMESPACE"))
x <- gsub("^useDynLib.+", sprintf("useDynLib(%s, .registration = TRUE)", new_name), x)
writeLines(x, con=paste0(zfolder, "/NAMESPACE"))
sprintf("mv %s %s", zfolder, new_name) %>% system
Rcpp::compileAttributes(new_name)
file.remove(sprintf("%s/MD5", new_name))
"rm *.tar.gz" %>% system
sprintf("R CMD build %s", new_name) %>% system
"R CMD INSTALL *.tar.gz" %>% system
setwd(previous_dir)
}
# https://stackoverflow.com/questions/56353069/r-parallel-abort-all-mclapply-operations
mclapply_catch <- function(X, FUN, ..., mc.preschedule=TRUE,
mc.set.seed=TRUE, mc.silent=FALSE,
mc.cores=getOption("mc.cores", 2L),
mc.cleanup=TRUE, mc.allow.recursive=TRUE,
affinity.list=NULL){
require(parallel)
tmpFileName <- tempfile()
fn <- function(X){
if(file.exists(tmpFileName))
return(NA)
o <- try(do.call("FUN", c(X, list(...))), silent=TRUE)
if(class(o)=="try-error"){
file.create(tmpFileName)
}
o
}
ret <- mclapply(X=X, FUN=fn, mc.preschedule=mc.preschedule,
mc.set.seed=mc.set.seed, mc.silent=mc.silent,
mc.cores=mc.cores, mc.cleanup=mc.cleanup,
mc.allow.recursive=mc.allow.recursive,
affinity.list=affinity.list)
if(exists(tmpFileName))
file.remove(tmpFileName)
ret
}
#' @export
inspect <- function(...) {
.Internal(inspect(...))
}
#' @export
sysbash <- function(cmd, ...) {
bash_profile <- file.exists("~/.bash_profile")
if(bash_profile) {
cmd <- sprintf("source ~/.bash_profile; %s", cmd)
system2("/bin/bash", args = c("-c", shQuote(cmd)), ...)
} else {
cmd <- sprintf("source ~/.profile; %s", cmd)
system2("/bin/bash", args = c("-c", shQuote(cmd)), ...)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.