R/Methods.R

#' Simulate extended datasets for sample size estimation
#'
#' @param m number of samples per group to simulate
#' @param mu a matrix of mean abundances, one row per protein and one column
#' per phenotype group
#' @param sigma a matrix of standard deviations with the same layout as
#' \emph{mu} (used as the \code{sd} argument of \code{rnorm()})
#' @return \emph{X} A simulated abundance matrix with the required sample size
#' @return \emph{Y} Group information corresponding to \emph{X}
#' @keywords internal
.sampleSimulation <- function(m, mu, sigma) {

    # record number of proteins and groups
    nproteins <- nrow(mu)
    ngroup <- ncol(mu)
    # make sure the columns of sigma are ordered the same as those of mu
    sigma <- sigma[, colnames(mu)]
    ## Determine the size of each phenotype group
    samplesize <- rep(m, ngroup)

    ## Simulate the data matrix
    sim_matrix <- matrix(rep(0, nproteins * sum(samplesize)), ncol=sum(samplesize))
    for (i in seq_len(nproteins)) {
        index <- 1
        for (j in seq_len(ngroup)) {
            sim_matrix[i, index:(index+samplesize[j]-1)] <- rnorm(samplesize[j], mu[i, j], sigma[i, j])
            index <- index + samplesize[j]

        }
    }

    sim_matrix <- t(sim_matrix)
    colnames(sim_matrix) <- rownames(mu)
    #Simulate the phenotype information
    group <- rep(colnames(mu), times=samplesize)

    index <- sample(length(group), length(group))
    sim_matrix <- sim_matrix[index, ]
    group <- group[index]

    return(list(X=sim_matrix,
                Y=as.factor(group)))

}
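
# Illustrative usage of .sampleSimulation() (a minimal sketch on made-up
# values, not package data); kept inside if (FALSE) so that sourcing this
# file has no side effects.
if (FALSE) {
    set.seed(101)
    # 4 proteins x 2 phenotype groups: per-protein mean and SD for each group
    mu <- matrix(c(10, 12, 9, 11, 10.5, 12.5, 9.5, 11.5), nrow = 4,
                 dimnames = list(paste0("Protein", 1:4), c("Control", "Case")))
    sigma <- matrix(0.5, nrow = 4, ncol = 2, dimnames = dimnames(mu))
    sim <- .sampleSimulation(m = 5, mu = mu, sigma = sigma)
    dim(sim$X)   # 10 samples (5 per group) x 4 proteins
    table(sim$Y) # 5 Case, 5 Control
}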


#' For each protein, impute the missing values based on the observed values
#'
#' @param data A numeric vector of abundances for one protein, possibly containing missing values.
#' @return Imputed protein abundance data
#' @keywords internal
.randomImputation <- function(data){

    missing <- is.na(data) # flag the missing values
    n.missing <- sum(missing)
    data.obs <- data[!missing] # keep the observed values
    imputed <- data
    # impute missing values by randomly selecting observed values
    imputed[missing] <- sample(data.obs, n.missing, replace=TRUE)
    return (imputed)

}
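
# Illustrative usage of .randomImputation() on a toy abundance vector.
if (FALSE) {
    set.seed(101)
    abundance <- c(10.2, NA, 9.8, 10.5, NA, 10.1)
    .randomImputation(abundance) # NAs replaced by resampled observed values
}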


#' For each simulated dataset, calculate predictive accuracy on the validation set
#'
#' @param index Index of the simulated dataset in train_x_list/train_y_list
#' @param classifier A character string with the name of the classifier to be used
#' @param train_x_list A list of simulated training feature matrices
#' @param train_y_list A list of group labels matching train_x_list
#' @param valid_x Validation features
#' @param valid_y Validation group labels
#' @param top_K An integer, the number of top features kept for the final model
#' @param tunegrid A data.frame of tuning parameters, as built by .tuning_params()
#' @return A list with (1) predictive accuracy on the validation set and
#' (2) the important features
#' @keywords internal
.classification_performance <- function(index, classifier, train_x_list,
                                       train_y_list, valid_x, valid_y, top_K,
                                       tunegrid){

    # record the train x and y
    x <- as.data.frame(train_x_list[[index]])
    y <- as.factor(train_y_list[[index]])

    model <- .classification_model(x = x, y = y, classifier = classifier,
                                  tunegrid = tunegrid)

    pred.features <- .feature_importance(model, classifier, top_K)

    pred.model <- .classification_model(x = x[, pred.features], y = y,
                                       classifier = classifier, tunegrid = tunegrid)

    ## Predict validation data
    pred_y <- predict(pred.model, valid_x[, pred.features])
    ## Calculate predictive accuracy on validation data
    acc <- sum(diag(table(pred_y, valid_y))) / length(pred_y)
    ## record the predictive accuracy and train model
    run <- list(acc = acc, f_imp = pred.features)
    return(run)
}
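
# Illustrative end-to-end sketch of .classification_performance() on toy data
# built with .sampleSimulation() above; assumes the caret, randomForest and
# data.table packages are installed (all values are made up).
if (FALSE) {
    library(data.table)
    set.seed(101)
    mu <- matrix(rnorm(20, mean = 10), nrow = 10,
                 dimnames = list(paste0("Protein", 1:10), c("Control", "Case")))
    sigma <- matrix(0.3, nrow = 10, ncol = 2, dimnames = dimnames(mu))
    train <- .sampleSimulation(m = 10, mu = mu, sigma = sigma)
    valid <- .sampleSimulation(m = 10, mu = mu, sigma = sigma)
    run <- .classification_performance(
        index = 1, classifier = "rf",
        train_x_list = list(train$X), train_y_list = list(train$Y),
        valid_x = as.data.frame(valid$X), valid_y = valid$Y,
        top_K = 5, tunegrid = .tuning_params("rf"))
    run$acc   # predictive accuracy on the validation set
    run$f_imp # the top 5 proteins by variable importance
}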


#' @title Feature Importance
#' @description The method .feature_importance() computes variable importance
#' for the given model and selects the top_K features
#' @param model A model object fitted by one of the caret classification methods
#' @param classifier The name of the classifier used to train the model
#' @param top_K An integer specifying the number of features to select
#' @return A character vector with the names of the top_K features
#' @keywords internal
#' @import data.table 
.feature_importance <- function(model, classifier, top_K){
    f_imp <- caret::varImp(model, scale = T)
    i_ff <- as.data.table(f_imp$importance, keep.rownames = T)
    
    if(classifier %in% c("svmLinear", "naive_bayes")){
        i_ff[, Overall := rowMeans(i_ff[, -1], na.rm = T)]
    }
    setorder(i_ff, -Overall)
    sel_imp <- i_ff[seq_len(top_K),]$rn
    return(sel_imp)
}
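
# Illustrative usage of .feature_importance() on a quick caret fit to iris
# (assumes caret, randomForest and data.table are installed).
if (FALSE) {
    library(data.table)
    fit <- caret::train(x = iris[, 1:4], y = iris$Species, method = "rf",
                        trControl = caret::trainControl(method = "none"),
                        tuneGrid = data.frame(mtry = 2))
    .feature_importance(fit, classifier = "rf", top_K = 2)
}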


#' Fit a classification model
#'
#' @param x protein abundance data, one row per sample and one column per protein
#' @param y a factor of group labels, one per sample
#' @param classifier A character string naming the classifier
#' @param tunegrid A data.frame with tuning parameters
#' @param ... Additional arguments, e.g. MaxNwts and maxit for the nnet backend
#' @return A trained classification model
#' @keywords internal
.classification_model <- function(x, y, classifier, tunegrid, ...){
    dots <- list(...)
    MaxNwts <- ifelse(is.null(dots$MaxNwts), 80000, dots$MaxNwts)
    maxit <- ifelse(is.null(dots$maxit), 100, dots$maxit)
    
    if(classifier == "logreg"){
        method <- ifelse(length(unique(y))>2, "multinom", "glm")
    }
    
    if(classifier == "logreg" && method =="glm"){
        ## Train logistic regression on training data
        model <- caret::train(x=x, y=factor(make.names(y)), method = method,
                              trControl = caret::trainControl(method = "none", 
                                                              classProbs = TRUE),
                              maxit = maxit)
    } else {
        if(classifier == 'logreg'){
            classifier <- method
        }
        model <- caret::train(x=x, y=factor(make.names(y)),
                              method = classifier,
                              trControl = caret::trainControl(method = "none",
                                                              classProbs = TRUE),
                              tuneGrid = tunegrid, MaxNwts = MaxNwts,
                              maxit = maxit)
    }

    return(model)

}
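
# Illustrative usage of .classification_model() (assumes caret is installed).
# With two classes, classifier = "logreg" dispatches to method = "glm", so no
# tuning grid is needed.
if (FALSE) {
    two_class <- iris[iris$Species != "virginica", ]
    fit <- .classification_model(x = two_class[, 1:4],
                                 y = droplevels(two_class$Species),
                                 classifier = "logreg", tunegrid = NULL)
    fit$method # "glm"
}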

#' Build a one-row tuning grid for the supported classifiers
#' @keywords internal
.tuning_params <- function(classifier, mtry = 2, size = 5, decay = 0.1, C = 1,
                           laplace = 0, usekernel = F, adjust = 1, ...){
    req_classifier <- c('rf','nnet','svmLinear', 'naive_bayes')
    func <- as.list(sys.call())[[1]]
    dots <- list(...)
    for (name in names(dots) ) {
        assign(name, dots[[name]])
    }

    if(!classifier %in% req_classifier){
        stop("CALL_",func,"_Incorrect classifier, should be one of (",
             paste(req_classifier, collapse = ", "),")")
    }
  
    switch(classifier,
           "rf" = {tunegrid=data.frame(mtry=mtry)},
           "nnet"={tunegrid=data.frame(size=size, decay=decay)},
           "svmLinear"={tunegrid=data.frame(C=C)},
           "naive_bayes"={tunegrid=data.frame(laplace=laplace,
                                              usekernel=usekernel,
                                              adjust=adjust)})
    return(tunegrid)
}
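
# Illustrative usage of .tuning_params(); each supported classifier gets a
# one-row tuning grid suitable for caret::train(..., method = "none").
if (FALSE) {
    .tuning_params("rf", mtry = 3)                # data.frame(mtry = 3)
    .tuning_params("nnet", size = 4, decay = 0.2)
    .tuning_params("naive_bayes")
    .tuning_params("knn")                         # errors: unsupported classifier
}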


#' Create log file
#'
#' @return \emph{con} A file connection to write the logs into
#' @return \emph{file} Name of the file to which connection is open
#' @keywords internal
.logGeneration <- function(...){
    dots <- list(...)
    if(is.null(dots$file)){
        LOG_DIR <- file.path(getwd(),'logs')
        dir.create(LOG_DIR, showWarnings = F)
        FILE <- sprintf("MSstatsSSE_Log_%s.Rout", 
                        format(Sys.time(),"%Y%m%d%H%M%S"))
        assign("LOG_FILE", file.path(LOG_DIR, FILE))
        assign("FILE_CONN", file(LOG_FILE, open='a'))
        writeLines(capture.output(sessionInfo()), FILE_CONN)
        writeLines("\n\n ############## LOG ############# \n\n", FILE_CONN)
    } else{
        FILE_CONN <- file(dots$file, open = 'a')
        LOG_FILE <- dots$file
    }
    return(list(con = FILE_CONN, file = LOG_FILE))
}


#' @title Write Log
#' @param log A string log that has to be written to the log file
#' @param log_type A type of log to be written (purely for informational purposes)
#' @param conn A connection object to the logfile
#' @keywords internal
.log_write <- function(log, log_type, conn){
    sink(conn, type="message")
    message(Sys.time()," : ",log_type,"_",log,"...")
    sink(type="message")
    if(log_type == 'ERROR'){
        message(Sys.time()," : ",log_type,"_",log,"...")
    }    
}
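
# Illustrative logging lifecycle; .logGeneration() creates a 'logs' directory
# under getwd() as a side effect, hence the example is kept inert.
if (FALSE) {
    log <- .logGeneration() # opens ./logs/MSstatsSSE_Log_<timestamp>.Rout
    .log_write(log = "example message", log_type = "INFO", conn = log$con)
    close(log$con)
    readLines(log$file)
}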


#' @title Status updates
#' @description Logs all messages coming through to the log file and to the console;
#' also provides status updates to the shiny interface when the right options are set
#' @param detail Message that need to be logged/displayed
#' @keywords internal
.status <- function(detail, ...){
    dots <- list(...)
    dots$func <- ifelse(is.null(dots$func), as.character(as.list(
        sys.call(-1))[[1]]),
        as.character(dots$func))
    mess <- sprintf("CALL_%s_%s", dots$func, detail)
    if(!is.null(dots$log)){
        .log_write(log = mess, log_type = "INFO", conn = dots$log)
    }
    if(!is.null(dots$session) && !is.null(dots$value))
        shiny::setProgress(value = dots$value, message = "Progress:",
                           detail = detail, session = dots$session)
    message(Sys.time()," : ",mess,"...")
}


#' @title Catch Faults
#' @description A wrapper function that catches exceptions and logs them appropriately
#' @param conn A connection object generated by .logGeneration
#' @param session A session object for using with the shiny server
#' @keywords internal
.catch_faults <- function(..., conn, session=NULL){
    warn <- err <- NULL
    res <- withCallingHandlers(
        tryCatch(...,
                 error = function(e) {
                     assign("LOG_FILE", conn$file, envir = .GlobalEnv)
                     err <<- conditionMessage(e)
                     .log_write(log = err, log_type = "ERROR", conn = conn$con)
                     if(!is.null(session)){
                         shiny::showNotification(as.character(err), duration = 20, 
                                                 type = "error",
                                                 session = session, 
                                                 id = "error") 
                         shiny::validate(shiny::need(is.null(err), 
                                                     as.character(err)))
                     }
                     
                 }), warning = function(w) {
                     warn <<- append(warn, conditionMessage(w))
                     .log_write(log = warn, log_type = "WARNING", conn = conn$con)
                     invokeRestart("muffleWarning")
                 })
    assign("LOG_FILE", conn$file, envir = .GlobalEnv)
    if(isOpen(conn$con) && is.null(session)){
        close(conn$con)
    }
    return(res)
}
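
# Illustrative usage of .catch_faults(): warnings are muffled and logged,
# errors are logged and yield NULL (no shiny session here). Note that the
# connection is closed on exit, so it is reopened for the second call.
if (FALSE) {
    log <- .logGeneration()
    res <- .catch_faults({ warning("toy warning"); 42 }, conn = log)
    res # 42; the warning went to the log file
    err <- .catch_faults(stop("toy error"),
                         conn = .logGeneration(file = log$file))
    is.null(err) # TRUE
}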


#' Locate an existing log connection or create a new one
#' @keywords internal
.find_log <- function(...){
    dots <- list(...)
    session <- dots$session
    if(is.null(dots$log_conn)){
        conn = mget("LOG_FILE", envir = .GlobalEnv,
                    ifnotfound = NA)
        if(is.na(conn)){
            rm(conn)
            conn <- .logGeneration()
        } else{
            conn <- .logGeneration(file = conn$LOG_FILE)   
        }
    }else{
        conn <- dots$log_conn
    }
    return(conn)
}


#' Flag columns that contain NA, NaN or infinite values
#' @keywords internal
.check_missing_values <- function(x){
    return(apply(x, 2, function(col){
        any(is.na(col) | is.infinite(col) | is.nan(col))
    }))
}
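
# Illustrative usage of .check_missing_values(): one logical per column.
if (FALSE) {
    .check_missing_values(data.frame(a = c(1, NA), b = c("x", "y")))
    # a: TRUE, b: FALSE
}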


#' @title Data Checking
#' @description Checks the input dataset for consistency and emits either a
#' failure or a success message depending on whether the checks pass
#' @param data An abundance data.frame with proteins in rows and runs in columns
#' @param annotation An annotation data.frame for the abundance data
#' @param conn A connection object to the log file
#' @keywords internal
.data_checks <- function(data, annotation, conn){
    packageStartupMessage(Sys.time()," : Checking Data for consistency...", 
                          appendLF = F)
    func <- as.list(sys.call())[[1]]
    
    #Check for any duplicated column names in the data provided
    dups <- any(duplicated(colnames(data)) == T)
    if(dups){
        packageStartupMessage(" Failure")
        stop("CALL_",func,
             "__Please check the column names of 'data'. There are duplicated 'Run'.")
    }
    
    #Check if input data is consistent with the required format
    required.annotation <- c('Condition', 'BioReplicate', 'Run')
    inconsistent_cols <- !setequal(required.annotation, colnames(annotation))
    if(inconsistent_cols){
        packageStartupMessage(" Failure")
        nf <- required.annotation[!required.annotation %in% colnames(annotation)]
        stop("CALL_", func, "_", paste(nf, collapse = ", "),
             " is not provided in Annotation, please check annotation file")
    }
    anno_cols <- colnames(data)[!colnames(data) %in% 'Protein']

    ic <- setequal(annotation$BioReplicate, anno_cols) && 
        nrow(annotation) == length(anno_cols)
    if(!ic){
        packageStartupMessage(" Failure")
        stop("CALL_",func,"_",
             "Please check the annotation file. 'Bioreplicate' must match with",
             " the column names of 'data'.")
    }

    
    if(length(unique(annotation$Condition)) < 2){
        stop("Need at least two conditions to do simulations")
    }
    
    missing_values <- .check_missing_values(annotation)
    if(any(missing_values)){
        packageStartupMessage(" Failure")
        stop("CALL_",func,
             "_'NA|NAN|NULL' not permitted in 'Run', 'BioReplicate' or ",
             "'Condition' of annotation.")
    }
    
    data <- data[, annotation$Run]
    group <- as.factor(as.character(annotation$Condition))
    
    if (nrow(data) == 0) {
        stop("CALL_",func,
             "_Please check the column names of data and Run in annotation.")
    }
    
    temp <- unique(annotation[annotation$Run %in% colnames(data),
                              c("BioReplicate", "Condition")])
    temp$Condition <- factor(temp$Condition)
    temp$BioReplicate <- factor(temp$BioReplicate)
    if(any(table(temp$Condition) < 3)){
        stop("CALL_",func,
             "_Each condition must have at least three biological replicates.")
    }
    packageStartupMessage(" Success")
    
    return(list("data" = data, "group" = group))
}
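
# Illustrative usage of .data_checks() on a tiny toy dataset: two conditions,
# three biological replicates each, Run names matching the data columns.
if (FALSE) {
    annot <- data.frame(Condition = rep(c("Control", "Case"), each = 3),
                        BioReplicate = paste0("Rep", 1:6),
                        Run = paste0("Rep", 1:6))
    dat <- as.data.frame(matrix(rnorm(24, mean = 10), nrow = 4,
                                dimnames = list(paste0("Protein", 1:4),
                                                paste0("Rep", 1:6))))
    checked <- .data_checks(dat, annot, conn = NULL)
    str(checked$data)
    checked$group
}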


.log2_trans <- function(trans, data, conn, ...){
    func <- as.list(sys.call())[[1]]
    if(is.logical(trans)){
        if(trans){
            data <- log2(data)
        }
        .status(detail = "Negative values if any where replaced with NA",
                log = conn$con, func = func, ...)
        data[!is.na(data) & data < 0] <- NA
        return(data)
    }else{
        stop("CALL_", func,
             "_log2Trans should be logical. Please provide either TRUE or FALSE")
    }
}
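
# Illustrative usage of .log2_trans(): log2-transform, then replace any
# negative values with NA.
if (FALSE) {
    log <- .logGeneration()
    m <- matrix(c(4, 0.25, 2, 8), nrow = 2)
    .log2_trans(TRUE, m, conn = log) # 0.25 -> log2 gives -2 -> NA
    close(log$con)
}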


#' @title Plot PCA outputs
#' @description A utility wrapper for plotting PCA data as returned by `.do_prcomp()`
#' @param data A data frame containing the principal components to be plotted
#' @param exp_var A vector of numeric values with the explained variance per component
#' @keywords internal
.pca_plot <- function(data, exp_var, ...){
    dots <- list(...)
    title <- ifelse(is.null(dots$title),"Input dataset", dots$title)
                    
    p <- ggplot(data = data,
                aes(x = PC1, y = PC2, color = group)) +
        geom_point(size =  ifelse(is.null(dots$dot_size), 3, dots$dot_size)) + 
        stat_ellipse()+
        labs(title = title,
             x = sprintf("PC1 (%s%% explained var.)", exp_var[1]),
             y = sprintf("PC2 (%s%% explained var.)", exp_var[2])) +
        theme_MSstats(...)
    return(p)
}


#' @title Do Principal Component Analysis
#' @description A wrapper around the base `prcomp()` function that formats the
#' results into the required output format
#' @param sim_x A dataframe of the feature variables
#' @param sim_y A vector of group labels (the response variable)
#' @return A named list with the first two PCs and their explained variance
#' @keywords internal
.do_prcomp <- function(sim_x, sim_y){
    result.pca <- prcomp(sim_x, scale. = TRUE)
    summary.pca <- summary(result.pca)
    important.pc <- result.pca$x[, 1:2]
    pc.result <- data.frame(important.pc, group = sim_y)
    exp.var <- summary.pca$importance
    exp.var <- format(exp.var[2, 1:2] * 100, digits = 3)
    return(list("pc.result" = pc.result, "exp.var" = exp.var))
}
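
# Illustrative PCA pipeline: .do_prcomp() feeding .pca_plot(), with iris as a
# stand-in for simulated protein abundances (assumes ggplot2 is attached).
if (FALSE) {
    library(ggplot2)
    pca <- .do_prcomp(sim_x = iris[, 1:4], sim_y = iris$Species)
    .pca_plot(pca$pc.result, pca$exp.var, title = "Iris (toy data)")
}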


#' @title MSstats Theme
#' @description A utility function that standardizes all the ggplot themes
#' @param x.axis.size A numeric value for the size of the elements on the x-axis
#' @param y.axis.size A numeric value for the size of the elements on the y-axis
#' @param legend.size A numeric value for the size of the legend
theme_MSstats <- function(x.axis.size = 10, y.axis.size = 10, 
                          legend.size = 11, margin = 0.5, leg.dir="horizontal",
                          download = F,...){
    
    dots <- list(...)
    x.axis.size <- ifelse(is.null(dots$x.axis.size), x.axis.size,
                          dots$x.axis.size)
    
    y.axis.size <- ifelse(is.null(dots$y.axis.size), y.axis.size,
                          dots$y.axis.size)
    
    legend.size <- ifelse(is.null(dots$legend.size), legend.size,
                          dots$legend.size)
    leg.dir <- ifelse(is.null(dots$leg.dir), leg.dir,
                      dots$leg.dir)
    download <- ifelse(is.null(dots$download), download,
                       dots$download)
    leg.pos <- ifelse(is.null(dots$leg.pos), "top", dots$leg.pos)
    
    th <- ggplot2::theme(panel.background = element_rect(fill = "white", 
                                                         colour = "black"),
                         panel.grid.major = element_line(colour = "gray95"), 
                         panel.grid.minor = element_blank(), 
                         strip.background = element_rect(fill = "gray95"), 
                         strip.text.x = element_text(colour = c("#00B0F6"), 
                                                     size = 14),
                         axis.text.x = element_text(size = x.axis.size, 
                                                    colour = "black"), 
                         axis.text.y = element_text(size = y.axis.size, 
                                                    colour = "black"),
                         axis.ticks = element_line(colour = "black"), 
                         axis.title.x = element_text(size = x.axis.size + 5,
                                                     vjust = -0.4), 
                         axis.title.y = element_text(size = y.axis.size + 5,
                                                     vjust = 0.3),
                         title = element_text(size = x.axis.size + 4,
                                              vjust = 1.5),
                         legend.key = element_rect(fill = "white", 
                                                   colour = "white"),
                         legend.direction = leg.dir,
                         legend.position = leg.pos, 
                         legend.text = element_text(size = legend.size), 
                         legend.title = element_blank(),
                         plot.margin = unit(rep(margin,4), "cm"))
    
    if(!download)
        th <- th + ggplot2::theme(legend.position=c(1, 1.05),
                                  legend.justification="right")
    
    return(th)
}
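
# Illustrative usage of theme_MSstats() on a plain ggplot; download = TRUE
# keeps the legend at its standard position.
if (FALSE) {
    library(ggplot2)
    ggplot(iris, aes(Sepal.Length, Sepal.Width, color = Species)) +
        geom_point() +
        theme_MSstats(x.axis.size = 12, download = TRUE)
}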


#' @title Identify optimal sample size
#' @param df A dataframe containing the sample sizes and accuracies
#' @param cutoff A numeric value, the threshold on the accuracy gain per added
#' sample used to identify the optimal sample size
#' @param use_h2o A logical input to use the logic for h2o data
#' @return \emph{df} A data.frame
#' @return \emph{opt} An integer with the optimal sample size
#' @return \emph{y_lim} A vector of limits for a ggplot2 object
#' @keywords internal
#' @import data.table
.identify_optimal <- function(df, cutoff, use_h2o = F){    
    if(!is.logical(use_h2o)){
        stop("'use_h2o' needs to be logical")
    }
    
    df <- as.data.table(df)
    df[, mean_acc := mean(acc), sample]
    opt_df <- unique(df[, .(sample, mean_acc)])
    setorder(opt_df, -sample)
    if(use_h2o){
        setorder(opt_df, sample)
    }
    opt_df[, sample := as.numeric(as.character(sample))]
    opt_df[, lg := (mean_acc - shift(mean_acc))/(sample - shift(sample))]
    opt_df[, optimal := lg >= cutoff]
    if(nrow(opt_df[, .N, optimal][optimal == T]) != 0){
        opt_val <- opt_df[optimal == T][which.min(lg), sample]
    } else {
        opt_val <- opt_df[which.min(sample), sample]
    }
    
    y_lim <- c(df[,min(mean_acc, na.rm = T)]-0.1, 1)
    df[sample == opt_val, fill_col := "red"]
    
    return(list('df' = df, 'opt' = opt_val, 'y_lim' = y_lim))
}

#' Plot Accuracy 
#' @keywords internal
#' @import ggplot2
#' @importFrom scales pretty_breaks
.plot_acc <- function(df, y_lim, optimal_ss, ...){
    g <- ggplot(data = df, aes(x = as.factor(sample)))+
        geom_boxplot(aes(y = acc, group = sample, fill = fill_col), alpha = 0.5)+
        scale_fill_identity()+
        geom_point(aes(y = mean_acc))+
        geom_line(aes(y = mean_acc, group = 1), size = 0.75, color = "blue")+
        labs(x = "Simulated Sample Size Per Group", y = "Predictive Accuracy",
             subtitle = sprintf("Optimal sample size per group is : %s",
                                optimal_ss))+
        scale_y_continuous(breaks = scales::pretty_breaks(), limit = y_lim)+
        theme_MSstats(...)+
        theme(plot.subtitle = element_text(face = "italic", color = "red"))
    
    return(g)
}
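
# Illustrative sketch of the optimal-sample-size step on made-up accuracies:
# .identify_optimal() flags the optimal size, .plot_acc() visualizes it
# (assumes data.table and ggplot2 are attached).
if (FALSE) {
    library(data.table)
    library(ggplot2)
    set.seed(101)
    acc_df <- data.frame(sample = rep(c(3, 5, 7, 9), each = 5),
                         acc = c(rnorm(5, 0.70, 0.02), rnorm(5, 0.82, 0.02),
                                 rnorm(5, 0.90, 0.02), rnorm(5, 0.91, 0.02)))
    res <- .identify_optimal(acc_df, cutoff = 0.01)
    res$opt # optimal sample size per group under the 0.01 slope cutoff
    .plot_acc(res$df, y_lim = res$y_lim, optimal_ss = res$opt)
}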


#' Plot Variable Importance
#' @keywords internal
#' @import ggplot2
.plot_imp <- function(df, samp = NA, ylim, facet = F,...){
    if(!is.na(samp)){
        df <- subset(df, sample == samp)
    }
    g <- ggplot(data = df, aes(x = reorder(protein, frequency), 
                               y = frequency))+
        geom_col()+
        labs(x = "Protein", y = "Frequency")+
        scale_y_continuous(breaks = scales::pretty_breaks(),
                           limit = ylim)+
        theme_MSstats(...)+
        coord_flip()
    
    if(facet){
        titles <- sprintf("%s Samples/Group", unique(df$sample))
        names(titles) <- unique(df$sample)
        titles_lb <- as_labeller(titles)
        g <- g+
            facet_wrap(sample~., ncol = 2, scales = 'free', labeller = titles_lb)
    }else{
        g <- g+
            labs(title = sprintf("%s Samples/Group", samp))
    }
    return(g)
}
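
# Illustrative usage of .plot_imp() with made-up selection frequencies
# (assumes ggplot2 is attached).
if (FALSE) {
    library(ggplot2)
    imp_df <- data.frame(protein = rep(paste0("Protein", 1:5), 2),
                         frequency = c(9, 7, 4, 3, 1, 10, 8, 5, 2, 1),
                         sample = rep(c(3, 5), each = 5))
    .plot_imp(imp_df, samp = 5, ylim = c(0, 10))     # one sample size
    .plot_imp(imp_df, ylim = c(0, 10), facet = TRUE) # one panel per size
}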


#' @title Plot QC boxplots
#' @description Plots an interactive plotly graphic with boxplots for all the 
#' proteins and their respective conditions with abundance indicated in the log
#' scale on the y axis
#' @param data A formatted data.frame or matrix of abundances with proteins in
#' the row names and biological replicates in the columns
#' @param annot An annotation data.frame with BioReplicate and Condition columns
#' @return A plotly object
#' @keywords internal
#' @import data.table
#' @importFrom magrittr %>%
qc_boxplot <- function(data = NULL, annot = NULL){
    #create the interactive boxplot for all the different proteins found in the data
    if(!'data.table' %in% class(data)){
        data <- as.data.table(data, keep.rownames = T)
        setnames(data, 'rn', 'Protein')
    }
    
    data <- melt(data, id.vars = 'Protein', variable.name = "BioReplicate",
                 value.name = "Abundance")
    annot <- as.data.table(annot)
    data <- merge(data, annot, by = "BioReplicate") 
    data[, Abundance := as.numeric(format(Abundance, digits = 2))]
    
    box_plot <- plotly::plot_ly(data = data[!is.na(Abundance)],
                                y = ~Abundance, x = ~BioReplicate, color = ~Condition,
                                type = "box") %>%
        plotly::layout(xaxis = list(title = "Biological Replicate",
                                    showticklabels = TRUE,
                                    tickangle = -45 ), 
                       yaxis = list(title = "Protein abundance"),
                       legend = list(orientation = "h", #position and of the legend
                                     xanchor = "center",
                                     x = 0.5, y = 1.1)) %>%
        plotly::config(displayModeBar = F) #hide controls of the plotly chart
    
    box_plot
}
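
# Illustrative usage of qc_boxplot() on toy wide-format abundances (assumes
# plotly and data.table are attached; plotly provides the %>% pipe).
if (FALSE) {
    library(data.table)
    library(plotly)
    dat <- matrix(rnorm(12, mean = 10), nrow = 3,
                  dimnames = list(paste0("Protein", 1:3), paste0("Rep", 1:4)))
    annot <- data.frame(BioReplicate = paste0("Rep", 1:4),
                        Condition = rep(c("Control", "Case"), each = 2))
    qc_boxplot(dat, annot)
}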


#' Stack results into a long data.frame tagged with the sample size
#' @keywords internal
.format_df <- function(dat, sample, top_n = NA){
    df <- stack(dat)
    df$sample <- sample
    if(is.na(top_n)){
        return(df)
    }else{
        return(df[seq_len(top_n),])   
    }
}


#' @title Get summary table for the annotation data
#' @description Get the summary of the unique biological replicates and the
#' number of MS runs for the data that is provided
#' @param data A data.frame with the annotation data containing the
#' BioReplicate, Run and Condition information
#' @return A one-row matrix with the counts of biological replicates per condition
#' @keywords internal
.format_summary_table <- function(data = NULL){
    #create crosstable for the conditions vs bioreplicates
    data <- as.data.table(data)
    biorep <- unique(data[,.(BioReplicate, Condition)])
    biorep <- xtabs(~Condition, data = biorep)
    
    #create crosstable for the conditions vs runs if runs data exists
    if(any(c("run","Run") %in% names(data))){
        msruns <- unique(data[,.(Run, Condition)])
        msruns <- xtabs(~Condition, data = msruns)
    }else{
        msruns <- rep(0, length(names(biorep)))
        names(msruns) <- names(biorep) #make runs data 0 if not found
    }
    #format it correctly
    #summary <- rbind(biorep, msruns)
    summary <- matrix(biorep, nrow = 1)
    colnames(summary) <- names(biorep)
    summary <- summary[,which(colSums(summary, na.rm = T) > 0)]
    sum_table <- matrix(summary, nrow=1)
    #rownames(summary) <- c("# of Biological Replicates", "# of MS runs")
    dimnames(sum_table) <- list("# of Biological Replicates", names(summary))
    return(sum_table)
}
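
# Illustrative usage of .format_summary_table() with a toy annotation
# (assumes data.table is attached).
if (FALSE) {
    library(data.table)
    annot <- data.frame(Condition = rep(c("Control", "Case"), each = 3),
                        BioReplicate = paste0("Rep", 1:6),
                        Run = paste0("Run", 1:6))
    .format_summary_table(annot)
    # a 1 x 2 matrix of biological-replicate counts per condition
}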


#' @title Classification using caret
#' @description A wrapper function to classify multiple simulated datasets
#' @param n_samp A comma-separated character string of the simulated sample sizes
#' @param sim_data A list object, the output of the `simulate_grid` function
#' @param classifier A string specifying the classifier to use
#' @param k An integer value for the number of features to select
#' @param session A session object for the shiny app
#' @keywords internal
ss_classify_caret <- function(n_samp, sim_data, classifier, k = 10, ...){
    
    dots <- list(...)
    session <- dots$session
    conn <- .find_log(...)
    samp <- unlist(strsplit(n_samp,","))
    res <-  list()
    max_val <- length(sim_data)
    iter <- 0
    
    for(i in seq_along(samp)){
        if(!is.null(session)){
            #Provides progress bars to the shiny UI
            iter = iter + 1/max_val
        }
        .status(detail = sprintf("Classifying Sample %s of %s", i, length(samp)),
                log = conn$con, session = session, value = iter)
        res[[i]] <- designSampleSizeClassification(sim_data[[i]],
                                                   classifier = classifier, 
                                                   top_K = k, session = session,
                                                   ...)
    }
    return(res)
}


#' @title Simulate datasets to be tested out
#' @description A wrapper around the `simulateDataset` function from the
#' MSstatsSampleSize package that simulates datasets over a grid of sample sizes
#' @keywords internal
simulate_grid <- function(data = NULL, annot = NULL, num_simulation, exp_fc,
                          list_diff_proteins, samples_per_group, sim_valid,
                          rank_proteins, mean_ip, mean_equality, sd_ip,
                          sd_equality, valid_samples_per_grp, seed, est_var, conn, 
                          session = NULL){
    #check if seed value required
    .status(detail = "Setting Up Data Simulation Runs", value = 0.1,
            session = session, log = conn$con)
    if(seed != -1)
        set.seed(seed)
    
    if(exp_fc != "data"){
        .status(detail = "Extracting Fold Change Informations", value = 0.15, 
               session = session, log = conn$con)
        fc <- exp_fc$`Fold Change Value`
        names(fc) <- exp_fc$orig_group
    } else{
        diff_prots <- NULL
        fc <- exp_fc
    }
    .status(detail = "Extracting Number of Samples Information", value = 0.2,
            session = session, log = conn$con)
    .status(detail = sprintf("Number of samples per group: (%s)",
                             samples_per_group),
            session = session , log = conn$con)
    
    samp <- as.numeric(unlist(strsplit(samples_per_group, ",")))
    shiny::validate(shiny::need(all(!is.na(samp)),
                                sprintf("Samples Per Group need to be numeric values, Found : %s",
                                        samples_per_group)),
                    shiny::need(all(samp >= 1), "All samples Need to be >= 1"))
    
    
    if(rank_proteins == "Combined"){
        protein_quantile_cutoff <- list(Mean = mean_ip, SD = sd_ip)
        protein_select <- list(Mean = mean_equality, SD = sd_equality)
    }else if(rank_proteins == "Mean"){
        protein_quantile_cutoff <- list(Mean = mean_ip)
        protein_select <- list(Mean = mean_equality)
    }else if(rank_proteins == "SD"){
        protein_quantile_cutoff <- list(SD = sd_ip)
        protein_select <- list(SD = sd_equality)
    }
    
    if(sim_valid){
        .status(detail = "Validation Simulation requested", value = 0.2,
                session = session, log = conn$con)
    }
    
    .status(detail = "Starting Simulation", value = 0.3, session = session,
            log = conn$con)
    
    sim <- list()
    for(i in samp){
        .status(detail = sprintf("Running Simulation for sample %s of %s",
                                 which(i == samp), length(samp)),
               value = which(i==samp)/length(samp), 
               session = session, log = conn$con)
        
        sim[[paste(i)]] <- simulateDataset(data = data,
                                           annotation = annot,
                                           num_simulations = num_simulation,
                                           protein_select = protein_select,
                                           protein_quantile_cutoff = protein_quantile_cutoff, 
                                           expected_FC = fc,
                                           list_diff_proteins =  list_diff_proteins,
                                           samples_per_group = i,
                                           simulate_validation = as.logical(sim_valid),
                                           valid_samples_per_group = valid_samples_per_grp,
                                           session = session)
    }
    .status(detail = "Simulation Complete", value = 0.9, session = session,
            log = conn$con)
    return(sim)
}


.protein_select<- function(df, cutoff = list("Mean" = 0.2, "SD" = 0.2),
                          equality = list("Mean" = "high", "SD" = "high"),
                          conn){
    
    df <- .quantile_eval(df, vec_name = 'Mean', prob = cutoff[['Mean']],
                         eq = equality[['Mean']])
    df <- .quantile_eval(df, vec_name = 'SD', prob = cutoff[['SD']],
                         eq = equality[['SD']])
    
    .status(detail = sprintf("Number of proteins selected %s", nrow(df)),
            log = conn$con)
    .status(detail = sprintf("The proteins selected are (%s)",
                             paste(df$Protein, collapse = ", ")), log = conn$con)
    return(df[,'Protein'])
}
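
# Illustrative usage of .protein_select() with made-up per-protein summaries;
# the defaults keep proteins above the 0.2 quantile on both Mean and SD.
if (FALSE) {
    log <- .logGeneration()
    prot_df <- data.frame(Protein = paste0("Protein", 1:4),
                          Mean = c(9.5, 10.2, 11.1, 12.0),
                          SD = c(0.2, 0.5, 0.3, 0.4))
    .protein_select(prot_df, conn = log)
    close(log$con)
}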


#' @title Evaluate quantiles given a column name and probability
#' @param df A data.frame with column names for which quantiles are calculated
#' @param vec_name A character string with the name of the column
#' @param prob A numeric value for the probability
#' @param eq A character string, either "high" (keep values above the quantile)
#' or "low" (keep values at or below it)
#' @param rnd An integer value defining how the quantiles are rounded
#' @keywords internal
.quantile_eval <- function(df, vec_name, prob, eq, rnd = 4){
    eq_vals <- c(">","<=")
    names(eq_vals) <- c("high", "low")
    
    if(!is.null(prob)){
        cutof <- round(quantile(df[,vec_name], probs = prob, 
                                na.rm = T), rnd)
        expr <- parse(text = paste(vec_name, eq_vals[eq], cutof))
        n_df <- subset(df, eval(expr))
        if(nrow(n_df) == 0){
            func <- as.list(sys.call())[[1]]
            stop("CALL_", func, "_No Proteins selected; possibly no proteins ",
                 "exist for the provided quantile values, check the quantile ",
                 "cutoff values and equalities")
        }
        return(n_df)
    }else{
        .status(detail = sprintf("No quantile cutoff value for %s", vec_name))
        return(df)
    }
}
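
# Illustrative usage of .quantile_eval(): keep rows above the median Mean.
if (FALSE) {
    prot_df <- data.frame(Protein = paste0("Protein", 1:4),
                          Mean = c(9.5, 10.2, 11.1, 12.0),
                          SD = c(0.2, 0.5, 0.3, 0.4))
    .quantile_eval(prot_df, vec_name = "Mean", prob = 0.5, eq = "high")
    # rows where Mean > round(quantile(Mean, 0.5), 4)
}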