R/my_functions.r

#' Reload a package.
#' @description Unload and reload a package, and set its position in the namespace search path.  
#' @param package Unquoted package name.
#' @param pos Namespace search position.
#' @examples
#' \code{reload(trqwe)}
#' @export
reload <- function(package, pos=2) {
    pstring <- deparse(substitute(package))
    unloadNamespace(pstring)
    library(pstring, pos=pos, character.only=T)
}

#' Unload and reload trqwe.
#' @description Unload and reload trqwe.
#' Shortcut for \code{reload(trqwe)}
#' @export
reloadtrqwe <- function() {
    reload(trqwe)
}


#' Install Bioconductor package.
#' @description Utility function for easily installing packages from Bioconductor.
#' @param package unquoted package name.
#' @examples
#' \code{bioc(DESeq2)}
#' @export
bioc <- function(package) {
    source("http://bioconductor.org/biocLite.R")
    pstring <- deparse(substitute(package))
    biocLite(pstring)
}

#' Install CRAN package.
#' @description Utility function for easily installing packages from CRAN.
#' @param package unquoted package name.
#' @examples
#' \code{install(Rcpp)}
#' @export
install <- function(package, repos="http://cran.us.r-project.org") {
    pstring <- deparse(substitute(package))
    install.packages(pstring, repos=repos)
}

#' Time profiling.
#' @description Reports the execution time, in seconds, of the previous command at the R prompt.
#' This function adds a callback which records the running time of each command and reports it in seconds on the next line.  
#' @examples
#' > timePrompt()
#' 0.000s> x <- sample(1:10, size=1e8, replace=T)
#' 1.240s> 
#' Note - this time is not accurate if child processes or multithreading is involved.  
#' @export
timePrompt <- function() {
    removeTaskCallback(".exec_time_prompt")
    updatePrompt <- function(...) {
        utime <- proc.time()
        utime_prev <- get0(".prev_time", envir=globalenv(), ifnotfound=utime)
        utime_diff <- sprintf("%.3f", sum(utime[c(1,2)] - utime_prev[c(1,2)]))
        options(prompt=paste0(utime_diff,"s> "))
        assign(".prev_time", utime, envir=globalenv())
        return(TRUE)
        }
    ret <- addTaskCallback(updatePrompt, name=".exec_time_prompt")
}

#' Variable information.  
#' @description Automatically stores basic information about the value returned by the previous command.
#' This function adds a callback which inspects the previous result and stores this information in \code{.stats}.
#' @examples
#' statsCallback()
#' my_data <- VADeaths
#' .stats
#' > [1] "dim: 5 4, length: 20, class: matrix, typeof: double"
#' @export
statsCallback <- function() {
    removeTaskCallback(".stats_callback")
    statsUpdate <- function(...) {
        lv <- get0(".Last.value", envir=globalenv(), ifnotfound=NULL)
        dim <- paste0(dim(lv), collapse=" ")
        length <- length(lv)
        class <- class(lv)
        typeof <- typeof(lv)
        assign(".stats", paste0("dim: ",dim,", length: ", length, ", class: ", class, ", typeof: ",typeof), envir=globalenv())
        return(TRUE)
        }
    ret <- addTaskCallback(statsUpdate, name=".stats_callback")
}

#' All duplicates.
#' @description Finds all duplicates in a vector including first instances of duplicates.
#' @param vec A vector.
#' @return A boolean vector of the same length as \code{vec}.  TRUE if the element is duplicated, FALSE if not.  
#' @examples
#' allDups(sample(1:100, size=100, replace=T))
#' @export
allDups <- function(vec) {
	duplicated(vec) | rev(duplicated(rev(vec)))
}

#' Fast readLines.
#' @description Replacement for readLines, faster.  
#' @param fname Filename to read.
#' @param newlinechar The new line character in the file.  
#' @param nchars Number of characters to read.  Defaults to the file size.  
#' @return A vector containing all the lines in the file.
#' @examples
#' library(microbenchmark)
#' writeLines(replicate(100000, paste0(sample(letters, size=100, replace=T), collapse="")), con="/tmp/temp.txt")
#' microbenchmark(fastReadLines("/tmp/temp.txt"), 
#'                readLines("/tmp/temp.txt"), times=3)[,c("expr", "mean"), drop=F]
#' 
#' Unit: milliseconds
#'                           expr      min       lq     mean   median       uq
#' fastReadLines("/tmp/temp.txt") 335.3874 335.9188 336.5900 336.4502 337.1912
#'     readLines("/tmp/temp.txt") 724.9136 725.4523 727.2444 725.9911 728.4098
#' @export
fastReadLines <- function(fname, newlinechar="\n", nchars=NULL) {
    if(is.null(nchars)) {
        nchars = file.info(fname)$size
    }
    buf = readChar(fname, nchars, useBytes=T)
    strsplit(buf,newlinechar,fixed=T,useBytes=T)[[1]]
}

#' Upper-left corner of matrix.  
#' @description Shortcut function for previewing a matrix or data.frame by displaying the upper-left corner.  Similar to \code{head}. 
#' @param x A wide matrix or data.frame.  
#' @param n Number of lines to display.  Default 10. 
#' @param ncols Number of columns to display.  Default 10. 
#' @return \code{n} by \code{ncols} subset of the matrix taken from the upper-left corner.  
#' @export
head2 <- function(x, n = 10, ncols = 10) {
    head(x[, seq_len(ncols), drop=F], n)
}

#' Lower-left corner of matrix.  
#' @description Shortcut function for previewing the bottom of a matrix or data.frame by displaying the lower-left corner.  Similar to \code{tail}. 
#' @param x A wide matrix or data.frame.  
#' @param n Number of lines to display.  Default 10. 
#' @param ncols Number of columns to display.  Default 10. 
#' @return \code{n} by \code{ncols} subset of the matrix taken from the lower-left corner.  
#' @export
tail2 <- function(x, n = 10, ncols = 10) {
    tail(x[, seq_len(ncols), drop=F], n)
}

#' Parse TCGA barcode.
#' @description Takes in a full TCGA sample barcode, or any subset of the barcode, and returns the extracted values.  
#' @param x TCGA barcode, or a vector of barcodes.  
#' @param what Which information to return.  
#' @return The specified information contained in the barcode of the same length as \code{x}. 
#' @examples
#' TCGA_barcode(c("TCGA-02-0001-01C-01D-0182-01", "TCGA-02-0001-11C-01D-0182-01"), what="tissue")
#' 
#' [1] "01" "11"
#' @export
TCGA_barcode <- function(x, what = "patient") {
    reg <- "TCGA-(..)-(....)-?(..)?(.)?-?(..)?(.)?-?(....)?-?(..)?"
    group <- switch(what, 
        TSS = "\\1",
        participant = "\\2",
        patient = "TCGA-\\1-\\2",
        sample = "\\3",
        tissue = "\\3",
        vial = "\\4",
        portion = "\\5",
        analyte = "\\6",
        plate = "\\7",
        center = "\\8")
    gsub(reg, group, x, perl=T)
}

#' Multiple grepl.
#' @description Takes in a list of regex patterns and returns TRUE if any pattern matches. 
#' @param patterns A vector of regex patterns.  
#' @param x A vector of strings to search.  
#' @return A vector of the same length as \code{x}, TRUE if any pattern matches.  
#' @examples
#' x <- fastReadLines("http://textfiles.com/ufo/ufobooks.ufo", newlinechar="\r\n", 1e5)
#' head(x[mgrepl(c("ALIENS", "UFO"), x)])
#' 
#' [1] "                         UFOLOGY BOOKS (REVISION 2.1 343 books)"
#' [2] "     H. S. Stewart on the  subject of UFO's. The list is alphabetic"
#' [3] "     Tom Mickus's most excellent board UFONET I.  (416-237-1204)"
#' [4] "     Bill Adler               *   LETTERS TO THE AIR FORCE ON UFOS  1967"
#' [5] "     Gordon W. Allen              OVERLORDS OLYMPIANS AND THE UFO   1974"
#' [6] "     Robert B. Beard              FLYING SAUCERS, UFO'S AND EXTRA"
#' @export
mgrepl <- function(patterns, x, ...) {
    Reduce("|",lapply(patterns, grepl, x, ...))
}

fileIgnoreCase <- function(x) {
    dir <- dirname(x)
    base <- basename(x)
    files <- list.files(dir)
    select <- grepl(base, files, ignore.case=T)
    if(any(select)) {
        paste0(dir,"/",files[select][1])
    } else {
        return(NA)
    }
}

tableVector <- function(x, factors=NULL) {
    if(!is.null(factors)) {
        x <- factor(x, levels = factors)
    }
    tab <- table(x)
    structure(as.vector(tab), .Names=names(tab))
}

#' Fast C++ tabulation.
#' @description Takes in a character, integer or factor vector and tabulates the number of times each element appears.  
#' @param x A character, integer or factor vector.  NAs are allowed.  
#' @param sort TRUE if the result names should be sorted alphanumerically.  
#' @return An integer vector of counts of each element.  
#' @examples
#'
#' x <- factor(sample(1e5, 1e8, replace=T))
#' microbenchmark(table(x), tablec(x), times=3)
#' 
#' Unit: milliseconds
#'      expr       min         lq       mean    median         uq        max
#'  table(x) 9777.6457 10479.0949 10717.3382 11180.544 11187.1844 11193.8246
#' tablec(x)  678.0364   685.9467   713.1181   693.857   730.6589   767.4608
#'
#'
#' x <- sample(letters, 1e8, replace=T)
#' microbenchmark(table(x), tablec(x), tablec(x,sort=T), times=3)
#' 
#'Unit: seconds
#'                expr      min       lq     mean   median       uq      max
#'            table(x) 5.778514 5.829125 5.855560 5.879737 5.894083 5.908430
#'           tablec(x) 1.589360 1.589381 1.589516 1.589402 1.589594 1.589786
#' tablec(x, sort = T) 1.589386 1.590520 1.591824 1.591655 1.593044 1.594432
#' @export
tablec <- function(x, sort=F) {
    cl <- class(x)
    if(cl == "character") {
        ret <- tablec_string(x)
    } else if(cl == "integer") {
        ret <- tablec_int(x)
    } else if(cl == "factor") {
        ret <- tablec_factor(x)
    } else {
        stop("x must be a character, integer or factor vector")
    }
    if(sort) {
        return(ret[sort(names(ret), na.last=F)])
    } else {
        return(ret)
    }
}

#' Size of R objects. 
#' @description Prints out the size of all R objects in the environment. 
#' @param env The environment to search (default global environment). 
#' @param units Units to print out for each variable.  
#' @return A data.frame containing the size of each object.  
#' @export
varSizes <- function(env=globalenv(), units="KB") {
    vars <- ls(envir=env)
    if(length(vars) == 0) return(NULL)
    sizes <- sapply(vars, function(v) {
        format(object.size(get(v, envir=env)), units=units)
    })
    data.frame(vars=vars, size=sizes, row.names=1:length(sizes), stringsAsFactors=F)
}

#' Calculate FPKM.
#' @description Calculates FPKM from a count matrix.
#' @param mat An integer matrix representing the counts of a RNA-Seq dataset.  
#' @param gene_lengths A named vector with the length of each gene.  
#' @param uq_norm If TRUE, will perform upper-quartile normalization.  
#' @return A matrix of FPKM values.  Rows with zero counts across all samples are removed.  
#' @seealso
#' FPKM-UQ - A normalized read count in which gene expression values, in FPKM, are divided by the 75th percentile value.
#'
#' \url{https://gdc-docs.nci.nih.gov/Data/Bioinformatics_Pipelines/Expression_mRNA_Pipeline/}
#'
#' \url{http://vinaykmittal.blogspot.com/2013/10/fpkmrpkm-normalization-caveat-and-upper.html}
#' @export
fpkmFromCounts <- function(mat, gene_lengths, uq_norm=T) {
    gene_lengths <- gene_lengths[rownames(mat)]
    gene_lengths <- gene_lengths / 1000
    lib_sizes <- colSums(mat) / 1e6
    # reads per kilobase; sweep keeps dimnames, unlike diag() %*% mat
    mat <- sweep(mat, 1, gene_lengths, "/")
    mat <- mat[rowSums(mat) != 0, , drop=F]
    if(uq_norm == F) {
        # standard FPKM: normalize by library size in millions
        mat <- sweep(mat, 2, lib_sizes, "/")
        return(mat)
    } else {
        # FPKM-UQ: normalize by the per-sample upper quartile instead
        uqs <- apply(mat, 2, quantile, 0.75)
        print(summary(uqs))
        mat <- sweep(mat, 2, uqs, "/")
        return(mat)
    } 
}


# plotROC <- function(pred, class, showplot=T, main="ROC Curve", ...) {
    # require(pROC)
    # roc <- roc(class, pred)
    # auc <- auc(roc)
    
    # conf_mat <- data.matrix(table(pred >= 0.5, class))
    # acc <- (conf_mat[1,1] + conf_mat[2,2])/sum(conf_mat)
    
    # specificity <- roc$specificities
    # sensitivity <- roc$sensitivities
    # if(showplot) { 
        # plot(1-specificity, sensitivity, xlim = c(0,1), ylim = c(0,1), sub=paste0("auc=",round(auc,4), "; accuracy=",round(acc,4)), type="l", lwd=2, main=main,...)
        # abline(a=0,b=1, col="gray")
    # }
    # return(list(specificity, sensitivity))
# }

# needs work, kinda broken atm
# safemclapply <- function(arg_list, p_function, mc.cores=32, tasks_per_core_per_round=3, save_folder=".temp/") {
    # require(parallel)
    # require(digest)
    # # require(dplyr)
    
    # substr(
        # digest(list(capture.output(str(arg_list)),
            # p_function,
            # mc.cores,
            # tasks_per_core_per_round,
            # save_folder)),
    # 1,10) -> run_md5
    
    # function_ls <- ls()

    
    # print(paste("md5 hash: ", run_md5))
    # tasks_per_round = mc.cores*tasks_per_core_per_round
    # task_sequence = seq(1,ceiling(length(arg_list)/tasks_per_round))
    # dir.create(save_folder, showWarnings = FALSE)
    
    # # for(lsv in ls(envir=parent.frame(1))) {lockBinding(lsv,parent.frame(1))}
    # # for(lsv in ls(envir=parent.frame(2))) {lockBinding(lsv,parent.frame(2))}
    
    # for(i in task_sequence) {
        # savefile = paste0(save_folder,run_md5,"_",i,".Rds")
        # if(file.exists(savefile) && file.size(savefile) > 0) next
        # batch_indices = seq((i-1)*tasks_per_round+1,min(i*tasks_per_round,length(arg_list)))
        # print(paste("Processing ",batch_indices[1], " to ", tail(batch_indices,1)))
        # mclapply(arg_list[batch_indices], p_function, mc.cores=mc.cores) -> temp
        # saveRDS(temp, file=savefile)
        # gc()
    # }
    
    # # for(lsv in ls(envir=parent.frame(1))) {unlockBinding(lsv,parent.frame(1))}
    # # for(lsv in ls(envir=parent.frame(2))) {unlockBinding(lsv,parent.frame(2))}
    
    # ret_list <- list()
    # for(i in task_sequence) {
        # savefile = paste0(save_folder,run_md5,"_",i,".Rds")
        # readRDS(savefile) -> ret_list[[i]]
    # }
    # do.call(c, ret_list) -> ret
    
    
    # return(ret)
# }

# splitForMC <- function(arg_list, folder) {
    # if(is.null(names(arg_list))) {
        # names(arg_list) <- 1:length(arg_list)
    # }
    # for(i in 1:length(arg_list)) {
        # names(arg_list)[i] -> na
        # saveRDS(arg_list[[i]], file=sprintf("%s/%s.Rds",folder,na))
    # }
# }

#' Parallel split-matrix or dataframe apply loop
#' @description 
#' Splits a matrix or data.frame into subsets based on a factor, and applies a function to each subset.  Typical use case: sum exon count data to gene count data.  This is similar to the dplyr idiom: \code{df \%>\% group_by(f) \%>\% do(...)}, but has several advantages.  1) mcsplitapply can be used on matrices (and is therefore much faster), 2) inherently parallelized, 3) can return results other than dataframes, 4) you can specify how the data are combined (default is rbind).   
#' @param mat The matrix.
#' @param f A factor of length equal to nrow(mat).  The levels of this factor will split the matrix into subsets.  
#' @param func The function to apply to each subset.  
#' @param .combine The function to combine the results with.  Default is rbind.  Use NA to return a list. 
#' @param mc.cores The number of cores to use.  
#' @return A list or a combined object depending on the .combine parameter.  
#' @examples
#' library(pasilla)
#' library(DEXSeq)
#' library(trqwe)
#' data(pasillaDEXSeqDataSet)
#' exon_counts <- counts(dxd)
#' f <- rowData(dxd)$groupID
#' gene_counts <- mcsplitapply(exon_counts, f, colSums)
#' @export
mcsplitapply <- function(mat, f, func, mc.cores=4, .combine=rbind, ...) {
    if(mc.cores > 1) require(parallel)
    uf <- unique(f)
    ret <- parallel::mclapply(uf, function(fi) {
      func(mat[f == fi,,drop=F], ...)
      }, mc.cores=mc.cores)
    names(ret) <- uf
    if(!is.function(.combine)) {
        return(ret)
    } else {
        ret <- do.call(.combine, ret)
        return(ret)
    }
}

#' Concatenate strings.  
#' @description Concatenates two strings.  
#' @param a First string.  
#' @param b Second string.  
#' @return The concatenated string.
#' @examples
#' 'Hello ' %Q% 'World'
#' 
#' [1] "Hello World"
#' @export
`%Q%` <- function(a, b) paste0(a, b)


#' Append to a vector.  
#' @description Appends the 2nd argument to the 1st.  
#' @param x A vector. 
#' @param value The element to append.  
#' @examples
#' x <- 1:5
#' append(x) <- 6
#' print(x)
#' 
#' [1] 1 2 3 4 5 6
#' @export
`append<-` <- function(x, value) {
    c(x, value)
}

#' Prepend to a vector.  
#' @description Prepends the 2nd argument to the 1st.  
#' @param x A vector. 
#' @param value The element to prepend.  
#' @examples
#' x <- 1:5
#' prepend(x) <- 6
#' print(x)
#' 
#' [1] 6 1 2 3 4 5
#' @export
`prepend<-` <- function(x, value) {
    c(value, x)
}

#' Highest elements in a vector.
#' @description Finds the top elements in a vector very quickly.  Equivalent to \code{-sort(-x, partial=1:n)}
#' @param x A numeric vector.  
#' @param n The number of top elements to return.  
#' @param lowest If TRUE, returns the lowest elements instead of the highest. 
#' @param value If TRUE, returns the values of the top elements.  If FALSE, returns the indices.  
#' @return A vector containing the indices or the values of the top elements.  
#' @examples
#' naive_top <- function(x, n) {
#'    -sort(-x, partial=1:n)
#' }
#' x <- runif(1e7)
#' microbenchmark(naive_top(x,100), topn(x,100,value=T), times=10)
#'
#' Unit: milliseconds
#'                    expr       min        lq     mean    median        uq
#'       naive_top(x, 100) 1070.0180 1071.5951 1075.964 1072.3520 1073.9989
#' topn(x, 100, value = T)  433.6682  433.8882  434.771  434.4986  435.6029
#' @seealso
#' \url{http://stackoverflow.com/questions/18450778/}
#' @export
topn <- function(x, n=100, value=F, lowest=F) {
    if(lowest) x <- -x
    nx <- length(x)
    p <- nx-n
    xp <- sort(x, partial=p)[p]
    wh <- x > xp
    if(value) {
        # undo the negation so the lowest values are returned on their original scale
        if(lowest) return(-x[wh])
        return(x[wh])
    } else {
        return(which(wh))
    }
}

#' Standard error.  
#' @description Calculates the standard error of a sampling distribution.  
#' @param x A vector.  
#' @return The standard error of x.  
#' @examples
#' x <- rnorm(1e3)
#' se(x)
#' [1] 0.03192027
#' @export
se <- function(x) {x<-na.omit(x);sd(x)/sqrt(length(x))}

# rescale <- function(x, min=0, max=1) {
    # (x-min(x))/(max(x)-min(x)) * (max-min) + min
# }

colorInterpolate <- function(x, color1="white", color2="blue") {
    colramp <- colorRamp(c(color1, color2))
    # rescale x to [0, 1]; the rescale() helper this previously relied on is commented out above
    x01 <- (x - min(x)) / (max(x) - min(x))
    cols <- colramp(x01) / 255
    nc <- ncol(cols)
    apply(cols, 1, function(r) {
        if(nc == 3) {
            rgb(r[1], r[2], r[3])
        } else {
            rgb(r[1], r[2], r[3], r[4])
        }
    })
}

#devel branch : https://github.com/GuangchuangYu/clusterProfiler/blob/master/R/GMT.R
read.gmt <- function(gmtfile) {
    require(GSEABase)
    gmt <- getGmt(con=gmtfile)
    ont2gene <- stack(geneIds(gmt))
    ont2gene <- ont2gene[, c("ind", "values")]
    colnames(ont2gene) <- c("ont", "gene")
    return(ont2gene)
}


#specific function for pulling down pathway descriptions from GSEA
# pathwayDescriptionPullDown <- function(gmt_file) {
    # require(curl)
    # readLines(gmt_file) -> gmt
    # strsplit(gmt, split ="\t") -> gmt
    # t(sapply(gmt, function(p) {
        # c(p[1], p[2])
    # })) -> pathway_urls
    # sapply(pathway_urls[,2], function(url) {
        # tryCatch({
        # con <- curl(url)
        # lines <- readLines(con)
        # desc <- lines[which(grepl("Brief", lines))+1]
        # desc <- gsub("^.*<td>","",desc)
        # desc <- gsub("</td>$","",desc)
        # desc <- gsub("<a href.+?</a>","",desc)
        # desc <- gsub("  "," ",desc)
        # desc <- gsub("[\\. ]+$","",desc)
        # print(desc)
        # Sys.sleep(.5)
        # return(desc)
        # }, error=function(e) e)
    # }) -> descs
    # data.frame(short=pathway_urls[,1], url=pathway_urls[,2], desc=descs)
# }

#' Cosine distance. 
#' @description Calculates the cosine distance of rows of a matrix.  
#' @param x A matrix.  
#' @return Cosine distance as a dist object.  
#' @examples
#' rbind(c(1,1,1,0,0,0), c(0,0,0,1,1,1), c(1,1,1,0,0,1)) -> x
#' cosineDist(x)
#'           1         2
#' 2 1.0000000
#' 3 0.1339746 0.7113249
#' @seealso
#' \url{http://stackoverflow.com/questions/2535234/find-cosine-similarity-in-r}
#' @export
cosineDist <- function(x){
  as.dist(1 - x%*%t(x)/(sqrt(rowSums(x^2) %*% t(rowSums(x^2))))) 
}


intRangeString <- function(vec) {
    if(length(vec) ==1) return(as.character(vec))
    vec <- sort(vec)
    res <- list()
    res[[1]] <- vec[1]
    for(i in 2:length(vec)) {
        if(vec[i] == vec[i-1] + 1 || vec[i] == vec[i-1]) {
            res[[length(res)]] <- c(res[[length(res)]],vec[i])
        } else {
            res[[length(res)+1]] <- vec[i]
        }
    }
    res <- sapply(res, function(r) {
        if(min(r) == max(r)) {
            return(as.character(max(r)))
        } else {
            return(paste0(min(r), "-", max(r)))
        }
    })
    paste0(res, collapse=",")
}

# leadingEdgeGenes <- function(geneList, geneSet, exponent=1) {
    # require(DOSE)
    # df <- DOSE:::gseaScores(geneList=geneList, geneSet=geneSet, exponent=exponent,fortify=T)
    # names(geneList)[which(df$position==1)]
    # upreg <- max(df$runningScore) > abs(min(df$runningScore))
    # if(upreg) {
        # set <- 1:which.max(df$runningScore)
    # } else {
        # set <- which.min(df$runningScore):length(df$runningScore)
    # }
    # intersect(names(geneList)[set], geneSet)
# }

scaleVec <- function(x, ...) {
    scale(x, ...)[,1]
}

# getAuc <- function(true_Y, probs) {
    # probsSort = sort(probs, decreasing = TRUE, index.return = TRUE)
    # val = unlist(probsSort$x)
    # idx = unlist(probsSort$ix)  

    # roc_y = true_Y[idx];
    # stack_x = cumsum(roc_y == 0)/sum(roc_y == 0)
    # stack_y = cumsum(roc_y == 1)/sum(roc_y == 1)    

    # auc = sum((stack_x[2:length(roc_y)]-stack_x[1:length(roc_y)-1])*stack_y[2:length(roc_y)])
    # return(list(stack_x=stack_x, stack_y=stack_y, auc=auc))
# }

# getAuc <- function(class, pred) {
    # ord <- order(pred)
    # pred <- pred[ord]
    # class <- class[ord]
    # as.data.frame(table(pred, class)) -> tab
    # # tab$pred <- unique(pred)
    # npos <- sum(subset(tab, class==1)$Freq)
    # nneg <- sum(subset(tab, class==0)$Freq)
    # tpr <- npos - cumsum(subset(tab, class==1)$Freq)
    # tpr <- tpr/npos
    # tpr <- c(0, rev(tpr))
    # fpr <- cumsum(subset(tab, class==0)$Freq)
    # fpr <- fpr/fpr[length(fpr)]
    # fpr <- c(0, fpr)
    # auc <- sum(sapply(1:(nrow(tab)/2), function(i) {
        # (tpr[i] + tpr[i+1])/2 * (fpr[i+1]-fpr[i])
    # }))
    # return(list(auc=auc, tpr=tpr, fpr=fpr))
# }



#style converter for titles
# titleConvert <- function(title, style="camel", sep=" ", original_sep="_") {
    # title <- unlist(strsplit(title, split=original_sep))
    # title <- tolower(title)
    # if(style == "camel") {
        # title <- gsub("^(.)", "\\U\\1", title, perl=T)
    # } else if(style == "upper") {
        # title <- toupper(title)
    # }
    # paste0(title, collapse=sep)
# }

# do.call.args <- function(what, args, quote = FALSE, envir = parent.frame(), ...) {
    # extra_args <- list(...)
    # do.call(what, c(args, extra_args))
# }

#' @export
saveListFF <- function(list, return_ff_list=F, ...) {
    require(ff)
    if(is.null(names(list))) {
        names(list) <- 1:length(list)
    }
    if(any(is.na(names(list)))) {
        stop("list must have names")
    }
    ff_list <- sapply(list, as.ff, simplify=F)
    with(ff_list, {
        ffsave(list=ls(),...)
    })
    if(return_ff_list) return(ff_list)
}

#' @export
loadFFList <- function(file, ...) {
    require(ff)
    ff_env <- new.env()
    ffload(file, envir=ff_env, ...)
    return(as.list(ff_env))
}

#' Cleans leading and trailing whitespace.  
#' @description Removes leading and trailing whitespace in a vector of strings.  
#' @param x A character vector.  
#' @return A vector with leading and trailing whitespace removed.  
#' @examples
#' chop(c("    hello ", "  123  \t"))
#'
#' [1] "hello" "123"
#' @export
chop <- function(x) {
    gsub("\\s+$","",gsub("^\\s+","",x))
}

multipagePlot <- function(plot_list, nrow=2, ncol=2,...) {
  require(grid)
  require(gridExtra)
  while(length(plot_list) > 0) {
    page_plots <- plot_list[seq(min(nrow*ncol, length(plot_list)))]
    plot_list <- plot_list[-seq(nrow*ncol)]
    grid.arrange(grobs=page_plots, ncol=ncol, nrow=nrow,...)
  }
}



#mclapply is broken and has memory issues.  Workaround: save return value to temp folder and read at the end of loop
#to do: still needs work
# null.mclapply <- function(X,FUN,mc.cores,...) {
    # require(parallel)
    # base <- paste0(tempdir(),"/",paste0(sample(c(letters,LETTERS,0:9), replace=T,size=12),collapse=""))
    # message(paste0("Saving results to ",base))
    # mclapply(seq_along(X),function(i) {
        # saveRDS(FUN(X[i]),file=paste0(base,"_",i,".Rds"),compress=F)
        # return(NULL)
    # }, mc.cores=mc.cores,...)
    # message(paste0("Reading results from ",base))
    # mclapply(seq_along(X),function(i) {
        # readRDS(paste0(base,"_",i,".Rds"))
    # },mc.cores=mc.cores,...)  
# }





# testAUC <- function(probs, class) {
    # x <- probs
    # y <- class
    # x1 = x[y==1]; n1 = length(x1); 
    # x2 = x[y==0]; n2 = length(x2);
    # r = rank(c(x1,x2))
    # h <- ???
    # r_aug <- c(0,r[1:n1],n2)
    # w <- sapply(1:n1, function(ii) {
        # (r[ii]+r[ii+2])/2 - 1
    # })
    # auc <- sum(w*h)/(n1*n2)
# }


# which operations actually benefit?
mcReduce <- function(FUN,x,mc.cores=4) {
    require(parallel)
    while(length(x) != 1) {
        x_tail <- tail(x,length(x) %% 2)
        x <- mclapply(1:floor(length(x)/2), function(i) {
            FUN(x[[i*2-1]],x[[i*2]])
        }, mc.cores=mc.cores)
        if(length(x_tail) > 0) x <- c(x,x_tail)
        # print(length(x))
    }
    return(x[[1]])
}

#' Multi-threaded saveRDS
#' @description Uses the pigz utility to speed up saving large R objects.  This is compatible with the saveRDS and readRDS functions.  Requires pigz (sudo apt-get install pigz on Ubuntu).  
#' @param object An r object to save.  
#' @param file The filename to save to.
#' @param mc.cores How many cores to use in pigz.  The program does not seem to benefit after more than about 4 cores.  
#' @examples
#' x <- sample(1e4, 1e7, replace=T)
#' y <- sample(1e4, 1e7, replace=T)
#' microbenchmark(mcsaveRDS(x, file="temp.Rds"), saveRDS(y, file="temp2.Rds"), times=1)
#' 
#' Unit: seconds
#'                             expr      min       lq     mean   median       uq
#'  mcsaveRDS(x, file = "temp.Rds") 1.908310 1.908310 1.908310 1.908310 1.908310
#'   saveRDS(y, file = "temp2.Rds") 6.271499 6.271499 6.271499 6.271499 6.271499
#' @seealso
#' \url{http://stackoverflow.com/questions/28927750/}
#' @export
mcsaveRDS <- function(object,file,mc.cores=min(parallel::detectCores(),4)) {
  con <- pipe(paste0("pigz -p",mc.cores," > ",file),"wb")
  saveRDS(object, file = con)
  close(con)
}

#' Multi-threaded readRDS
#' @description Uses the pigz utility to speed up loading large R objects.  This is compatible with the saveRDS and readRDS functions.  Requires pigz (sudo apt-get install pigz on Ubuntu).  
#' @param file The filename of the rds object. 
#' @param mc.cores How many cores to use in pigz.  The program does not seem to benefit after more than about 4 cores.  
#' @return The R object.  
#' @examples
#' x <- sample(1e4, 1e7, replace=T)
#' saveRDS(x, file="temp.Rds")
#' xmc <- mcreadRDS("temp.Rds")
#' @seealso
#' \url{http://stackoverflow.com/questions/28927750/}
#' @export
mcreadRDS <- function(file,mc.cores=min(parallel::detectCores(),4)) {
  con <- pipe(paste0("pigz -d -c -p",mc.cores," ",file))
  object <- readRDS(file = con)
  close(con)
  return(object)
}



#' saveRZ
#' @description Faster serialization + compression using the zstd command-line utility.  Requires zstd on the PATH.
#' @param object An R object to save.
#' @param file The filename to save to.
#' @param compression zstd compression level.
#' @param mc.cores Number of zstd threads.
#' @export
saveRZ <- function(object, file, compression=3, mc.cores=1) {
  con <- pipe(sprintf("zstd --format=zstd -f -%s -T%s -o %s", compression, mc.cores, file), "wb")
  saveRDS(object, file = con)
  close(con)
  # writeBin(fst::compress_fst(serialize(object, connection=NULL), compressor=compressor, compression=compression), con=file, useBytes=T)
}

#' readRZ
#' @description Faster deserialization + decompression using the zstd command-line utility.  Requires zstd on the PATH.
#' @param file The filename to read.
#' @param mc.cores Number of zstd threads.
#' @export
readRZ <- function(file, mc.cores=1) {
  # unserialize(fst::decompress_fst(readBin(file, "raw", n=file.info(file)$size)))
  con <- pipe(sprintf("zstd --format=zstd -d -c -T%s %s", mc.cores, file), "rb")
  object <- readRDS(file = con)
  close(con)
  return(object)
}


#' Nelson-Aalen estimator
#' @description The Nelson-Aalen estimator is an estimator of the cumulative hazard function for censored survival data.  It can be used to compare the overall risks of two groups or to estimate the expected number of deaths before a certain time.  
#' @param time The time of each patient.  
#' @param event The death of each patient: 1 for a patient death, 0 for censored.  
#' @return A list containing the cumulative hazard function.  
#' @examples
#' library(survival)
#' data(veteran)
#' with(nelson_aelen_surv(veteran$time, veteran$status), plot(ti, Hi, type="b"))
#' @seealso
#' \url{https://en.wikipedia.org/wiki/Nelson-Aalen_estimator}
#' @export
nelson_aelen_surv <- function(time, event) {
    ord <- order(time)
    time <- time[ord]
    event <- event[ord]
    di <- table(time[event==1])
    ti <- as.numeric(names(di))
    di <- as.vector(di)
    # number at risk just before each death time; censored subjects remain
    # at risk until their censoring time
    ni <- sapply(ti, function(t) sum(time >= t))
    list(Hi=cumsum(di/ni), ti=ti)
}

#' Wald-Wolfowitz Runs Tests for Randomness
#' @description This is the k-category asymptotic Z Test with continuity correction.  Imagine rolling a die multiple times to obtain a sequence of rolls.  This statistic tests whether there exists a "run" within the sequence where one particular number comes up more times in a row than expected randomly.  If the test is significant, it can be concluded that the die rolls are not independent.  
#' @param x A vector of items, coerced into a factor.  
#' @return A p-value for the statistical test.  
#' @examples
#' set.seed(1)
#' ww_test(sample(2, 100, replace=T))
#' ww_test(c(sample(6, 90, replace=T),rep(1,10)))
#' @seealso
#' Reference https://ncss-wpengine.netdna-ssl.com/wp-content/themes/ncss/pdf/Procedures/NCSS/Analysis_of_Runs.pdf
#' @export
ww_test <- function(x) {
    x <- factor(x)
    n <- length(x)
    njs <- table(x)
    njs <- structure(.Data=as.vector(njs), .Names=names(njs))
    mu <- (n*(n+1) - sum(njs^2))/n
    sdev_l <- n^2*(n-1)
    sdev_u <- sum(njs^2*(sum(njs^2)+n*(n+1))) - 2*n*sum(njs^3)-n^3
    sdev <- sqrt(sdev_u/sdev_l)
    r <- sum(diff(as.numeric(x)) != 0) + 1
    z <- (r - mu) / sdev
    pvalue <- 2*pnorm(-abs(z))
    return(pvalue)
}

#' Fast AUC
#' @description This function calculates the Area Under the Receiver-Operating Characteristic curve from the results of a classification model.  
#' @param probs A numeric vector of probabilities or likelihoods for each data point.  
#' @param class A numeric vector where 1 is positive and 0 is negative.  
#' @return The AUC.  
#' @examples
#' library(AppliedPredictiveModeling)
#' data(abalone)
#' library(glmnet)
#' class <- factor(as.numeric(abalone$Type == "M"))
#' data <- data.matrix(abalone[,-1])
#' fit <- cv.glmnet(data, class, family="binomial")
#' probs <- predict(fit, newx=data)[,1]
#' fastAUC(probs, class)
#' @seealso
#' Reference https://stat.ethz.ch/pipermail/r-help/2005-September/079872.html
#' @export
fastAUC <- function(probs, class, method="trqwe") {
    #ROCR is now much faster
    if(method=="ROCR") {
      require(ROCR)
      pred <- ROCR::prediction(probs, class)
      perf <- performance(pred, "auc")
      return(perf@y.values[[1]])
    }
    x <- probs
    y <- class
    x1 = x[y==1]; n1 = length(x1); 
    x2 = x[y==0]; n2 = length(x2); 
    r = rank(c(x1,x2))  
    auc = (sum(r[1:n1]) - n1*(n1+1)/2) / n1 / n2
    return(auc)
}

#' Fast ROC
#' @description Calculates the points on a Receiver-Operating Characteristic curve from the results of a classification model.  
#' @param probs A numeric vector of probabilities or likelihoods for each data point.  
#' @param class A numeric vector where 1 is positive and 0 is negative.  
#' @return A list containing the ROC curve.  
#' @examples
#' library(AppliedPredictiveModeling)
#' data(abalone)
#' library(glmnet)
#' class <- factor(as.numeric(abalone$Type == "M"))
#' data <- data.matrix(abalone[,-1])
#' fit <- cv.glmnet(data, class, family="binomial")
#' probs <- predict(fit, newx=data)[,1]
#' with(fastROC(probs, class), plot(fpr, tpr, type="l"))
#' @export
fastROC <- function(probs, class) {
    if(is.factor(class)) {class = as.numeric(class) - 1}
    class_sorted <- class[order(probs, decreasing=T)]
    TPR <- cumsum(class_sorted) / sum(class)
    FPR <- cumsum(class_sorted == 0) / sum(class == 0)
    return(list(tpr=TPR, fpr=FPR))
}

#' Precision-Recall curve
#' @description Calculates the points on a Precision-Recall curve from the results of a classification model.  
#' @param probs A numeric vector of probabilities or likelihoods for each data point.  
#' @param class A numeric vector where 1 is positive and 0 is negative.  
#' @return A list containing the precision-recall curve.  
#' @examples
#' library(AppliedPredictiveModeling)
#' data(abalone)
#' library(glmnet)
#' class <- factor(as.numeric(abalone$Type == "M"))
#' data <- data.matrix(abalone[,-1])
#' fit <- cv.glmnet(data, class, family="binomial")
#' probs <- predict(fit, newx=data)[,1]
#' with(fastPR(probs, class), plot(recall, precision, type="l"))
#' @export
fastPR <- function(probs, class) {
    if(is.factor(class)) {class = as.numeric(class) - 1}
    class_sorted <- class[order(probs, decreasing=T)]
    recall <- cumsum(class_sorted) / sum(class)
    precision <- cumsum(class_sorted) / 1:length(class_sorted)
    return(list(recall=recall, precision=precision))
}

#' Matthews correlation coefficient
#' @description Calculates the Matthews correlation coefficient.  Requires the Rmpfr package.  
#' @param probs A numeric vector where 1 is predicted positive and 0 is predicted negative.  
#' @param class A numeric vector where 1 is positive and 0 is negative.  
#' @return The MCC score.  
#' @seealso
#' \url{https://en.wikipedia.org/wiki/Matthews_correlation_coefficient}
#' @export
mccscore <- function(probs, class) {
    require(Rmpfr)
    m <- function(x) mpfr(x, precBits=20)
    tp <- m(sum((probs == 1) & (class == 1)))
    tn <- m(sum((probs == 0) & (class == 0)))
    fp <- m(sum((probs == 1) & (class == 0)))
    fn <- m(sum((probs == 0) & (class == 1)))
    as.numeric(((tp * tn) - (fp * fn)) / sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn)))
}

#' Posterior probability adjustment.
#' @description Adjusts the posterior probability of a classifier trained on an under-sampled (artificially balanced) dataset.  In a classification model where the negative data is randomly under-sampled and all the positive data is used, the adjustment factor (beta) is p(s=1|-) = p(+)/p(-), i.e., the probability that a negative datapoint is selected into the training set; beta is approximately N+/N-.  
#' @param probs The original posterior probability.  
#' @param beta The adjustment factor.  
#' @param Nplus The number of positive examples in the real data.  Only used if beta is NULL.  
#' @param Nminus The number of negative examples in the real data.  Only used if beta is NULL.  
#' @return The adjusted posterior probability.  
#' @seealso
#' Dal Pozzolo, Andrea, et al. "Calibrating probability with undersampling for unbalanced classification." Computational Intelligence, 2015 IEEE Symposium Series on. IEEE, 2015.
#' @export
posteriorBalance <- function(probs, beta=NULL, Nplus, Nminus) {
  if(is.null(beta)) {
    beta <- Nplus/Nminus
  }
  beta*probs / (beta*probs-probs+1)
}

#' F1 score.  
#' @description Calculates F1 score from the results of a classification model.  
#' @param probs A numeric vector where 1 is predicted positive and 0 is predicted negative.  
#' @param class A numeric vector where 1 is positive and 0 is negative.  
#' @return The F1 score.  
#' @seealso
#' \url{https://en.wikipedia.org/wiki/F1_score}
#' @export
f1score <- function(probs, class) {
    tp <- sum((probs == 1) & (class == 1))
    fp <- sum((probs == 1) & (class == 0))
    fn <- sum((probs == 0) & (class == 1))
    precision <- tp / (tp + fp)
    recall <- tp / (tp + fn)
    2 * precision * recall / (precision + recall)
}

#' Concordance Index.  
#' @description Calculates the concordance index from the results of a censored survival model.  Very fast compared to other packages.  
#' @param probs The prognostic score of each patient.  
#' @param time The time of each patient.  
#' @param event The death of each patient: 1 for a patient death, 0 for censored. 
#' @return The concordance index.  
#' @export
cindex <- function(probs, time, event)
{
    wh <- which(event==1)
    time_mat <- outer(time, time[wh], ">")
    pred_mat <- outer(probs, probs[wh], "<")
    pred_mat_eq <- outer(probs, probs[wh], "==")
    total <- sum(time_mat)
    concord <- sum(time_mat & pred_mat) + sum(time_mat & pred_mat_eq)*0.5
    concord/total
}

#' Sigmoid Function. 
#' @description Calculates the results of the sigmoid function.  
#' @param x Input to the function.  
#' @return Sigmoid(x)
#' @export
sigmoid <- function(x) {
    return(1/(1+exp(-x)))
}

#' Logit Transformation. 
#' @description Calculates the results of the Logit Transformation.  
#' @param x Input to the function (e.g., probabilities from 0 to 1).  
#' @return Logit(x)
#' @export
logit <- function(x) {
    return(log(x/(1-x)))
}

#' Matrix Factor Design.  
#' @description From a factor, returns a design matrix with a column for each level.  
#' @param x A factor.  
#' @param names The name of each instance in the resulting matrix (i.e., the rownames).  
#' @return The design matrix.  
#' @examples
#' matrixFactor(factor(letters))
#' @export
matrixFactor <- function(x, names=NULL) {
    x <- factor(x)
    xlevels <- levels(x)
    x_mat <- matrix(nrow = length(x), ncol = length(xlevels), 0)
    colnames(x_mat) <- xlevels
    if(!is.null(names)) {
        rownames(x_mat) <- names
    }
    col <- match(x, xlevels)
    for(i in 1:length(x)) {
        x_mat[i,col[i]] <- 1
    }
    return(x_mat)
}

#' Make percentage from number
#' @description Make percentage from number
#' @param x A number.
#' @param digits Number of decimal places, default 2.  
#' @return A percentage.
#' @examples
#' make_percent(0.424, 2)
#' @export
make_percent <- function(x, digits=2) {
  paste0(round(100*x,digits),"%")
}


drew_table_plot <- function(my_data_frame, title=NULL) {
    library(gridExtra)
    library(gtable)
    library(grid)
    library(ggplot2)
    tt3 <- ttheme_minimal(
        core=list(bg_params=list(fill=c("azure3", "azure2", "azure1", "white"), col=NA),
                  fg_params=list(fontface=3, cex=.55)),
        colhead=list(fg_params=list(col="black", fontface=4L, cex=.55)),
        rowhead=list(fg_params=list(col="white", fontface=3L, cex=.55)))
    g1 <- tableGrob(my_data_frame, theme=tt3)
    title <- textGrob(title, gp=gpar(fontsize=14))
    padding <- unit(5, "mm")
    table <- gtable_add_rows(g1, heights=grobHeight(title) + padding, pos=0)
    table <- gtable_add_grob(table, title, 1, 1, 1, ncol(table))
    grid.arrange(table, nrow=1, newpage=FALSE)
}

gg_center_title <- function() {
    theme(plot.title = element_text(hjust = 0.5))
}

gg_rotate_xlabels <- function(angle=90, hjust=1, vjust=0.5, ...) {
    theme(axis.text.x = element_text(angle = angle, hjust = hjust, vjust=vjust))
}

gg_legend_bottom <- function() {
  theme(legend.position="bottom")
}

#https://stackoverflow.com/questions/12041042/how-to-plot-just-the-legends-in-ggplot2
gg_legend<-function(a.gplot){ 
  tmp <- ggplot_gtable(ggplot_build(a.gplot)) 
  leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box") 
  legend <- tmp$grobs[[leg]] 
  return(legend)
}


#' Fast binary KNN classifier
#' @description Fast binary KNN classifier
#' @param distmat A NxN pre-computed distance matrix
#' @param train_idx Train indices
#' @param test_idx Test indices
#' @param classes vector length N, 1 or 0
#' @param K Number of nearest neighbors parameter
#' @param mc.cores Number of threads to use
#' @return A prediction vector for the test set based on the class labels of the train set.
#' @export
trqwe_KNN <- function(distmat, train_idx, test_idx, classes, K, mc.cores=1) {
  if(mc.cores==1) {
    sapply(test_idx, function(ti) {
      # topn indexes into the train_idx subset, so subset classes accordingly
      sum(classes[train_idx][topn(distmat[ti,train_idx], n=K, lowest=T)])/K
    })
  } else {
    unlist(parallel::mclapply(test_idx, function(ti) {
      sum(classes[train_idx][topn(distmat[ti,train_idx], n=K, lowest=T)])/K
    }, mc.cores=mc.cores))
  }
}

#' Set rownames of data.frame or matrix
#' @description Set rownames of data.frame or matrix and return it, for use with pipes
#' @param df data.frame or matrix
#' @param rownames rownames to add
#' @return df with rownames added
#' @export
set_rownames <- function(df, rownames) {
  rownames(df) <- rownames
  return(df)
}
#' Set colnames of data.frame or matrix
#' @description Set colnames of data.frame or matrix and return it, for use with pipes
#' @param df data.frame or matrix
#' @param colnames colnames to add
#' @return df with colnames added
#' @export
set_colnames <- function(df, colnames) {
  colnames(df) <- colnames
  return(df)
}

# https://stackoverflow.com/questions/7509910/how-can-i-create-a-custom-assignment-using-a-replacement-function
once <- structure(NA, class = "once")
"[<-.once" <- function(once, x, value) {
  xname <- deparse(substitute(x))
  pf <- parent.frame()
  if (!exists(xname, pf)) assign(xname, value, pf)
  once
}

# multiple assignment; the zeallot package does it better, so no longer using this approach
#' #' Multiple return assignment
#' #' @description Python style multiple return assignment.  
#' #' @examples
#' #' mreturn[x,y,z] <- list("hello", c(1,2,3), sqrt(2))
#' #' print(x)
#' #' [1] "hello"
#' #' print(y)
#' #' [1] 1 2 3
#' #' print(z)
#' #' [1] 1.414214
#' #' @rdname mreturn
#' mreturn <- structure(NA, class = "mreturn")
#' 
#' `[<-.mreturn` <- function(mreturn, ..., value) {
#'   vars <- sapply(substitute(list(...)), deparse)[-1]
#'   stopifnot(all(make.names(vars) == vars)) # invalid variable names
#'   stopifnot(length(value) == length(vars)) # incorrect number of return values
#'   for(i in 1:length(vars)) {
#'     assign(vars[i], value=value[[i]], envir = parent.frame())
#'   }
#'   return(mreturn)
#' }



# This is the from the TOSTER package, without any extraneous output
TOSTtwo <- function (m1, m2, sd1, sd2, n1, n2, low_eqbound_d, high_eqbound_d, 
                     alpha, var.equal) 
{
  if (missing(alpha)) {
    alpha <- 0.05
  }
  if (missing(var.equal)) {
    var.equal <- FALSE
  }
  if (var.equal == TRUE) {
    sdpooled <- sqrt((((n1 - 1) * (sd1^2)) + (n2 - 1) * (sd2^2))/((n1 + 
                                                                     n2) - 2))
    low_eqbound <- low_eqbound_d * sdpooled
    high_eqbound <- high_eqbound_d * sdpooled
    degree_f <- n1 + n2 - 2
    t1 <- ((m1 - m2) - low_eqbound)/(sdpooled * sqrt(1/n1 + 
                                                       1/n2))
    p1 <- pt(t1, degree_f, lower.tail = FALSE)
    t2 <- ((m1 - m2) - high_eqbound)/(sdpooled * sqrt(1/n1 + 
                                                        1/n2))
    p2 <- pt(t2, degree_f, lower.tail = TRUE)
    t <- (m1 - m2)/(sdpooled * sqrt(1/n1 + 1/n2))
    pttest <- 2 * pt(-abs(t), df = degree_f)
    LL90 <- (m1 - m2) - qt(1 - alpha, n1 + n2 - 2) * (sdpooled * 
                                                        sqrt(1/n1 + 1/n2))
    UL90 <- (m1 - m2) + qt(1 - alpha, n1 + n2 - 2) * (sdpooled * 
                                                        sqrt(1/n1 + 1/n2))
    LL95 <- (m1 - m2) - qt(1 - (alpha/2), n1 + n2 - 2) * 
      (sdpooled * sqrt(1/n1 + 1/n2))
    UL95 <- (m1 - m2) + qt(1 - (alpha/2), n1 + n2 - 2) * 
      (sdpooled * sqrt(1/n1 + 1/n2))
  }
  else {
    sdpooled <- sqrt((sd1^2 + sd2^2)/2)
    low_eqbound <- low_eqbound_d * sdpooled
    high_eqbound <- high_eqbound_d * sdpooled
    degree_f <- (sd1^2/n1 + sd2^2/n2)^2/(((sd1^2/n1)^2/(n1 - 
                                                          1)) + ((sd2^2/n2)^2/(n2 - 1)))
    t1 <- ((m1 - m2) - low_eqbound)/sqrt(sd1^2/n1 + sd2^2/n2)
    p1 <- pt(t1, degree_f, lower.tail = FALSE)
    t2 <- ((m1 - m2) - high_eqbound)/sqrt(sd1^2/n1 + sd2^2/n2)
    p2 <- pt(t2, degree_f, lower.tail = TRUE)
    t <- (m1 - m2)/sqrt(sd1^2/n1 + sd2^2/n2)
    pttest <- 2 * pt(-abs(t), df = degree_f)
    LL90 <- (m1 - m2) - qt(1 - alpha, degree_f) * sqrt(sd1^2/n1 + 
                                                         sd2^2/n2)
    UL90 <- (m1 - m2) + qt(1 - alpha, degree_f) * sqrt(sd1^2/n1 + 
                                                         sd2^2/n2)
    LL95 <- (m1 - m2) - qt(1 - (alpha/2), degree_f) * sqrt(sd1^2/n1 + 
                                                             sd2^2/n2)
    UL95 <- (m1 - m2) + qt(1 - (alpha/2), degree_f) * sqrt(sd1^2/n1 + 
                                                             sd2^2/n2)
  }
  ptost <- max(p1, p2)
  ttost <- ifelse(abs(t1) < abs(t2), t1, t2)
  dif <- (m1 - m2)
  testoutcome <- ifelse(pttest < alpha, "significant", "non-significant")
  TOSToutcome <- ifelse(ptost < alpha, "significant", "non-significant")
  
  if (var.equal == TRUE) {
    # message(cat("Using alpha = ", alpha, " Student's t-test was ", 
    #             testoutcome, ", t(", degree_f, ") = ", t, ", p = ", 
    #             pttest, sep = ""))
    # cat("\\n")
    # message(cat("Using alpha = ", alpha, " the equivalence test based on Student's t-test was ", 
    #             TOSToutcome, ", t(", degree_f, ") = ", ttost, ", p = ", 
    #            ptost, sep = ""))
  }
  else {
    # message(cat("Using alpha = ", alpha, " Welch's t-test was ", 
    #            testoutcome, ", t(", degree_f, ") = ", t, ", p = ", 
    #            pttest, sep = ""))
    # cat("\\n")
    # message(cat("Using alpha = ", alpha, " the equivalence test based on Welch's t-test  was ", 
    #            TOSToutcome, ", t(", degree_f, ") = ", ttost, ", p = ", 
    #            ptost, sep = ""))
  }
  TOSTresults <- data.frame(t1, p1, t2, p2, degree_f)
  colnames(TOSTresults) <- c("t-value 1", "p-value 1", "t-value 2", 
                             "p-value 2", "df")
  bound_d_results <- data.frame(low_eqbound_d, high_eqbound_d)
  colnames(bound_d_results) <- c("low bound d", "high bound d")
  bound_results <- data.frame(low_eqbound, high_eqbound)
  colnames(bound_results) <- c("low bound raw", "high bound raw")
  CIresults <- data.frame(LL90, UL90)
  colnames(CIresults) <- c(paste("Lower Limit ", 100 * (1 - 
                                                          alpha * 2), "% CI raw", sep = ""), paste("Upper Limit ", 
                                                                                                   100 * (1 - alpha * 2), "% CI raw", sep = ""))
  # cat("TOST results:\\n")
  #print(TOSTresults)
  # cat("\\n")
  # cat("Equivalence bounds (Cohen's d):\\n")
  #print(bound_d_results)
  # cat("\\n")
  # cat("Equivalence bounds (raw scores):\\n")
  #print(bound_results)
  # cat("\\n")
  # cat("TOST confidence interval:\\n")
  #print(CIresults)
  invisible(list(TOST_t1 = t1, TOST_p1 = p1, TOST_t2 = t2, 
                 TOST_p2 = p2, TOST_df = degree_f, alpha = alpha, low_eqbound = low_eqbound, 
                 high_eqbound = high_eqbound, low_eqbound_d = low_eqbound_d, 
                 high_eqbound_d = high_eqbound_d, LL_CI_TOST = LL90, UL_CI_TOST = UL90))
}


#' Calculates power/sample size of a TOST test
#' @description Calculates the sample size required for a TOST equivalence test to achieve a desired power, using either exact statistics or simulation.
#' @param mu_A mean of group A
#' @param mu_B mean of group B
#' @param sd_A SD of group A
#' @param sd_B SD of group B
#' @param delta maximum tolerated difference (i.e., abs(mu_A - mu_B))
#' @param kappa sample size of group A / sample size of group B (nA / nB)
#' @param alpha significance threshold (0.05 default)
#' @param power Desired power level (0.8 default)
#' @param method Whether to use the exact statistics or simulation.  "Exact" - exact statistics, "equivalence" - simulate using equivalence package tost function, "TOSTER" - simulate using TOSTER functions
#' @param n_iterations Number of simulation iterations
#' @param paired If the TOST is paired (kappa should be 1)
#' @param sample_var Sample variance for paired TOST
#' @return For exact statistics, returns a list of two values: minimum sample size to achieve desired power and the power at that sample size.  For simulation, returns a series of sample sizes and achieved power up to the desired power.  
#' @examples
#' res_equivalence <- tost_power(mu_A=1, mu_B=1, sd_A=0.2, sd_B=0.2, delta=0.2, method="equivalence")
#' res_toster <- tost_power(mu_A=1, mu_B=1, sd_A=0.2, sd_B=0.2, delta=0.2, method="TOSTER")
#' res_exact <- tost_power(mu_A=1, mu_B=1, sd_A=0.2, sd_B=0.2, delta=0.2, method="exact")
#' 
#' plot(res_equivalence$nB, res_equivalence$Power, xlim=c(1,20), ylim=c(0,1), type="s", main="TOST power analysis")
#' lines(res_toster$nB, res_toster$Power, col="green", type="s")
#' abline(h=0.8, col="blue", lty=2)
#' points(res_exact$nB, res_exact$Power, col="red", pch=18, cex=2)
#' legend("topleft", legend=c("equivalence", "TOSTER", "exact"), col=c("black", "green", "red"), pch=1)
#' 
#' res_equivalence <- tost_power(mu_A=1, mu_B=1, sd_A=0.2, sd_B=0.2, delta=0.1, method="equivalence")
#' res_exact <- tost_power(mu_A=1, mu_B=1, sd_A=0.2, sd_B=0.2, delta=0.1, method="exact")
#' plot(res_equivalence$nB, res_equivalence$Power, xlim=c(1,80), ylim=c(0,1), type="s", main="TOST power analysis")
#' abline(h=0.8, col="blue", lty=2)
#' points(res_exact$nB, res_exact$Power, col="red", pch=18, cex=2)
#' legend("topleft", legend=c("equivalence", "exact"), col=c("black", "red"), pch=1)
#' @seealso
#' \url{http://powerandsamplesize.com/Calculators/Compare-2-Means/2-Sample-Equality}
#' Chow S, Shao J, Wang H. 2008. Sample Size Calculations in Clinical Research. 2nd Ed. Chapman & Hall/CRC Biostatistics Series. page 58.
#' @export
tost_power <- function(mu_A, mu_B, sd_A, sd_B, delta=0.3, kappa=1, alpha=0.05, power=0.8, method="equivalence", n_iterations=1000, paired=F, sample_var=0) {
  require(equivalence)
  
  beta <- 1-power
  
  if(method == "exact") {
    # Reference for exact calculation: 
    # http://powerandsamplesize.com/Calculators/Compare-2-Means/2-Sample-Equality
    # Chow S, Shao J, Wang H. 2008. Sample Size Calculations in Clinical Research. 2nd Ed. Chapman & Hall/CRC Biostatistics Series. page 58.
    # Adapted for different variance for group A and B
    # (1+1/kappa)*(sd*(qnorm(1-alpha)+qnorm(1-beta/2))/(abs(mu_A-mu_B)-delta))^2
    nB <- (sd_A^2+sd_B^2/kappa)*((qnorm(1-alpha)+qnorm(1-beta/2))/(abs(mu_A-mu_B)-delta))^2
    nB_ceil <- ceiling(nB) # 108
    # z <- (abs(mu_A-mu_B)-delta)/(sd*sqrt((1+1/kappa)/nB))
    z <- (abs(mu_A-mu_B)-delta)/(sqrt((sd_A^2+sd_B^2/kappa)/nB))
    Power <- 2*(pnorm(z-qnorm(1-alpha))+pnorm(-z-qnorm(1-alpha)))-1
    return(list(nB=nB_ceil, Power=Power))
  }
  
  nB_i <- 3
  power_empirical <- 0
  power_curve <- c()
  patience <- 3
  while(power_empirical < power | patience > 0) {
    pvals <- sapply(1:n_iterations, function(i) {
      if(!paired) {
        sA <- rnorm(nB_i*kappa, mean=mu_A, sd=sd_A)
        sB <- rnorm(nB_i, mean=mu_B, sd=sd_B)
      } else {
        sample_effect <- rnorm(nB_i, sd=sqrt(sample_var))
        sA <- rnorm(nB_i, mean=mu_A+sample_effect, sd=sd_A)
        sB <- rnorm(nB_i, mean=mu_B+sample_effect, sd=sd_B)
      }
       if(method == "equivalence") {
         res <- tost(sB, sA, epsilon=delta, paired=paired, conf.level=1-alpha)
         res$tost.p.value
       } else {
        stopifnot(!paired)
        sd_A_emp <- sd(sA)
        sd_B_emp <- sd(sB)
        sd_pooled <- sqrt( ((nB_i*kappa)*sd_A_emp^2 + (nB_i)*sd_B_emp^2) / (nB_i*kappa + nB_i - 2) )
        res <- TOSTtwo(m1=mean(sA), m2=mean(sB), sd1=sd_A_emp, sd2=sd_B_emp, n1=nB_i*kappa, n2=nB_i, low_eqbound_d=-(delta/sd_pooled), high_eqbound_d=(delta/sd_pooled), alpha = alpha)
        res$TOST_p2
       }
    })
    power_empirical <- sum(pvals < 0.05) / n_iterations
    power_curve <- c(power_curve, power_empirical)
    nB_i <- nB_i + 1
    if(power_empirical > power) patience <- patience - 1
  }
  return(list(nB=1:length(power_curve)+2, Power=power_curve))
}

#' @export
aws_ls <- function(s3path, awspath="/Users/tching/Library/Python/2.7/bin//aws") {
  cmd <- sprintf("%s s3 ls %s --recursive | awk '{$1=$2=$3=\"\"; print $0}' | sed 's/^[ \t]*//'", awspath, s3path)
  system(cmd, intern=T)
}


#' Heterogeneous cluster helper
#' @description Creates a PSOCK cluster spanning local and remote workers, using makeClusterPSOCK from the future package.  
#' @export
halCluster <- function(varlist=ls(envir), .packages="auto", envir=parent.frame(), mc.cores.server=8, mc.cores.local=3,
                      user=Sys.info()["user"], server_ip="hal2",revtunnel=T, ...) {
  # if(master_ip == "auto") {
  #   master_ip <- tryCatch({system("internal-ip --ipv4", intern=T)},
  #                         error=function(e) cat("Run npm install --global internal-ip-cli\\n")
  #   )
  # }
  if(.packages[1] == "auto") {
    .packages <- names(sessionInfo()$otherPkgs)
  }
  require(parallel)
  require(future)
  cl <- makeClusterPSOCK(c(rep("localhost", mc.cores.local),rep(server_ip,mc.cores.server)), user=user, 
                         homogeneous=F, revtunnel=revtunnel,...)
  clusterExport(cl, varlist=varlist, envir=envir)
  if(!is.null(.packages)) {
    clusterCall(cl, fun=function() {
      for(pkg in .packages) {
        library(pkg, character.only=T)
      }
    })
  }
  return(cl)
}


#' @export
log_offset_scale <- function(offset, base=10){
  trans <- function(x) {log(x+offset, base=base)}
  inv <- function(x) {base^x-offset}
  scales::trans_new("log_offset_scale", transform = trans, inverse = inv)
}

#' @export
pow <- function(x, ...) {
  vars <- substitute(list(...))
  vars <- as.character(vars)[-1]
  stopifnot(length(x) == length(vars))
  for(i in seq_len(length(x))) assign(vars[i], x[[i]], pos = 1)
}


#' @export
rc <- function(nucSeq) return(stringi::stri_reverse(chartr("acgtACGT", "tgcaTGCA", nucSeq)))




#' @export
sterling2 <- function(n,k) {
  require(gmp)
  sum((-1L)^(0:k) * chooseZ(k, 0:k) * as.bigz(k-0:k)^(as.bigz(n))) %/% factorialZ(k)
}

#' @export
sterling2_sum <- function(n) {
  require(gmp)
  sum <- as.bigz(0)
  for(k in 1:n) {
    sum = sum + sterling2(n,k)
  }
  return (sum)
}

#' @export
enroll <- function(...) {
  x <- list(...)
  nx <- names(x)
  if(is.null(nx)) return(invisible(NULL))
  for(i in seq_along(x)) {
    if(nx[i] != "") {
      assign(nx[i], x[[i]], envir=parent.frame())
    }
  }
}

#' @export
install_as_name <- function(pkg, new_name, tempfolder=NULL) {
  require(dplyr)
  require(Rcpp)
  previous_dir <- getwd()
  if (is.null(tempfolder)) 
    tempfolder <- tempfile()
  dir.create(tempfolder, showWarnings = F)
  setwd(tempfolder)
  print(tempfolder)
  sprintf("wget %s", pkg) %>% system
  zfolder <- untar(basename(pkg), list = T, compressed = "gzip")
  untar(basename(pkg), compressed = "gzip")
  zfolder <- zfolder %>% gsub("/.*$", "", .) %>% unique
  z <- list.files(zfolder, recursive = T, full.names = T)
  z <- z[basename(z) == "DESCRIPTION"]
  stopifnot(length(z) == 1)
  stopifnot(length(zfolder) == 1)
  
  # sprintf("sed -i -r 's/^Package:.+/Package: %s/' %s/DESCRIPTION", 
  #         new_name, new_name) %>% system
  # sprintf("sed -i -r 's/^useDynLib.+/useDynLib\\(%s, .registration = TRUE\\)/' %s/NAMESPACE", 
  #         new_name, new_name) %>% system
  x <- readLines(z)
  x <- gsub("^Package.+", sprintf("Package: %s", new_name), x)
  writeLines(x, con=z)
  
  x <- readLines(paste0(zfolder, "/NAMESPACE"))
  x <- gsub("^useDynLib.+", sprintf("useDynLib(%s, .registration = TRUE)", new_name), x)
  writeLines(x, con=paste0(zfolder, "/NAMESPACE"))
  
  sprintf("mv %s %s", zfolder, new_name) %>% system
  
  Rcpp::compileAttributes(new_name)
  
  file.remove(sprintf("%s/MD5", new_name))
  "rm *.tar.gz" %>% system
  sprintf("R CMD build %s", new_name) %>% system
  "R CMD INSTALL *.tar.gz" %>% system
  setwd(previous_dir)
  
}


# https://stackoverflow.com/questions/56353069/r-parallel-abort-all-mclapply-operations
mclapply_catch <- function(X, FUN, ..., mc.preschedule=TRUE,
                     mc.set.seed=TRUE, mc.silent=FALSE,
                     mc.cores=getOption("mc.cores", 2L), 
                     mc.cleanup=TRUE, mc.allow.recursive=TRUE,
                     affinity.list=NULL){
    require(parallel)
    tmpFileName <- tempfile()
    fn <- function(X){
        if(file.exists(tmpFileName))
            return(NA)
        o <- try(do.call("FUN", c(X, list(...))), silent=TRUE)
        if(inherits(o, "try-error")){
            file.create(tmpFileName)
        }
        o
    }
    ret <- mclapply(X=X, FUN=fn, mc.preschedule=mc.preschedule,
                    mc.set.seed=mc.set.seed, mc.silent=mc.silent,
                    mc.cores=mc.cores, mc.cleanup=mc.cleanup,
                    mc.allow.recursive=mc.allow.recursive,
                    affinity.list=affinity.list)
    if(file.exists(tmpFileName))
        file.remove(tmpFileName)
    ret
}

#' @export
inspect <- function(...) {
  .Internal(inspect(...))
}

#' @export
sysbash <- function(cmd, ...) {
  bash_profile <- file.exists("~/.bash_profile")
  if(bash_profile) {
    cmd <- sprintf("source ~/.bash_profile; %s", cmd)
    system2("/bin/bash", args = c("-c", shQuote(cmd)), ...)
  } else {
    cmd <- sprintf("source ~/.profile; %s", cmd)
    system2("/bin/bash", args = c("-c", shQuote(cmd)), ...)
  }
}