R/sbpiper_sim.r

Defines functions sbpiper_sim summarise_data gen_stats_table plot_comb_sims plot_sep_sims skewness kurtosis check_exp_dataset load_exp_dataset

Documented in check_exp_dataset gen_stats_table kurtosis load_exp_dataset plot_comb_sims plot_sep_sims sbpiper_sim skewness summarise_data

# Copyright (c) 2018 Piero Dalle Pezze
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.



######################
# EXPORTED FUNCTIONS #
######################



#' Main R function for SBpipe pipeline: simulate(). 
#'
#' @param model the model name
#' @param inputdir the input directory
#' @param outputdir the output directory
#' @param outputfile_stats the output file containing the statistics
#' @param outputfile_repeats the output file storing the model simulation repeats
#' @param exp_dataset the file containing the experimental data.
#' @param plot_exp_dataset TRUE if the experimental data should also be plotted
#' @param exp_dataset_alpha the alpha level for the data set
#' @param xaxis_label the label for the x axis (e.g. Time (min))
#' @param yaxis_label the label for the y axis (e.g. Level (a.u.))
#' @param column_to_read the name of the column to process
#' @examples
#' \donttest{
#' data(insulin_receptor_1)
#' data(insulin_receptor_2)
#' data(insulin_receptor_3)
#' data(insulin_receptor_exp_dataset)
#' dir.create(file.path("sim_datasets"))
#' dir.create(file.path("sim_datasets_sum"))
#' write.table(insulin_receptor_1, 
#'             file=file.path("sim_datasets", "insulin_receptor_1.csv"), 
#'             row.names=FALSE)
#' write.table(insulin_receptor_2, 
#'             file=file.path("sim_datasets", "insulin_receptor_2.csv"), 
#'             row.names=FALSE)
#' write.table(insulin_receptor_3, 
#'             file=file.path("sim_datasets", "insulin_receptor_3.csv"), 
#'             row.names=FALSE)
#' write.table(insulin_receptor_exp_dataset, 
#'             file="insulin_receptor_exp_dataset.csv", 
#'             row.names=FALSE)
#' sbpiper_sim(model="insulin_receptor", 
#'            inputdir="sim_datasets", 
#'            outputdir="sim_plots", 
#'            outputfile_stats="insulin_receptor_IR_beta_pY1146_stats.csv", 
#'            outputfile_repeats=file.path("sim_datasets_sum", 
#'                                         "insulin_receptor_IR_beta_pY1146.csv"), 
#'            exp_dataset="insulin_receptor_exp_dataset.csv", 
#'            plot_exp_dataset=TRUE, 
#'            exp_dataset_alpha=1.0, 
#'            xaxis_label=NULL, 
#'            yaxis_label=NULL, 
#'            column_to_read="IR_beta_pY1146")
#' }
#' @export
sbpiper_sim <- function(model, inputdir, outputdir, outputfile_stats, outputfile_repeats, 
                       exp_dataset, plot_exp_dataset, exp_dataset_alpha, xaxis_label, yaxis_label, 
                       column_to_read) {
  
  if(plot_exp_dataset == 'True' || plot_exp_dataset == 'TRUE' || plot_exp_dataset == 'true') {
    print('experimental dataset will also be plotted')
    plot_exp_dataset = TRUE
  } else {
    plot_exp_dataset = FALSE
  }
  
  print('summarising time course repeats in tables')
  summarise_data(inputdir, model, outputfile_repeats, column_to_read)
  
  print('generating table of statistics')
  gen_stats_table(outputfile_repeats, outputfile_stats, column_to_read)
  
  files <- list.files( path=inputdir, pattern=model )
  if(length(files) > 1) {
    print('plotting separate time courses')
    plot_sep_sims(dirname(outputfile_repeats), outputdir, model, exp_dataset, plot_exp_dataset,
                  exp_dataset_alpha, xaxis_label, yaxis_label, column_to_read)
  }
  print('plotting combined time courses')
  plot_comb_sims(dirname(outputfile_repeats), outputdir, model, exp_dataset, plot_exp_dataset,
                 exp_dataset_alpha, xaxis_label, yaxis_label, column_to_read)
}


#' Summarise the model simulation repeats in a single file.
#'
#' @param inputdir the input directory containing the time course files
#' @param model the model name
#' @param outputfile the file to store the simulated repeats
#' @param column_to_read the name of the column to process
#' @examples
#' data(insulin_receptor_1)
#' data(insulin_receptor_2)
#' data(insulin_receptor_3)
#' dir.create(file.path("sim_datasets"))
#' dir.create(file.path("sim_datasets_sum"))
#' write.table(insulin_receptor_1, 
#'             file=file.path("sim_datasets","insulin_receptor_1.csv"), 
#'             row.names=FALSE)
#' write.table(insulin_receptor_2, 
#'             file=file.path("sim_datasets","insulin_receptor_2.csv"), 
#'             row.names=FALSE)
#' write.table(insulin_receptor_3, 
#'             file=file.path("sim_datasets","insulin_receptor_3.csv"), 
#'             row.names=FALSE)
#' summarise_data(inputdir="sim_datasets", 
#'                model="insulin_receptor", 
#'                outputfile=file.path("sim_datasets_sum", 
#'                                     "insulin_receptor_IR_beta_pY1146.csv"), 
#'                column_to_read="IR_beta_pY1146")
#' @export
summarise_data <- function(inputdir, model, outputfile, column_to_read='X1') {
  
  # collect all files in the directory
  files <- list.files( path=inputdir, pattern=model )
  # print(files)
  
  # Read the simulated time course data sets
  timecourses <- data.table::fread(file.path(inputdir, files[1]), select=c('Time'))
  #print(timepoints)
  
  # the repeats for this readout
  collect.repeats <- function(mat) {
    for(i in 1:length(files)) {
      mat[,1+i] <- as.data.frame(data.table::fread(file.path(inputdir,files[i]), select=c(column_to_read)))[,1]
    }
    return(mat)
  }
  repeats <- matrix(data=NA, nrow=nrow(timecourses), ncol=1+length(files))
  repeats[, 1] <- timecourses$Time
  repeats <- as.data.frame(collect.repeats(repeats))
  # print(repeats)
  
  names(repeats) <- c("Time", paste('X', seq(1, length(files), 1), sep=""))
  write.table(repeats, file=outputfile, sep="\t", row.names=FALSE, quote=FALSE)
}


#' Generate a table of statistics for each model readout.
#'
#' @param inputfile the file to store the simulated repeats
#' @param outputfile the file to store the statistics
#' @param column_to_read the name of the column to process
#' @examples
#' data(insulin_receptor_1)
#' data(insulin_receptor_2)
#' data(insulin_receptor_3)
#' dir.create(file.path("sim_datasets"))
#' dir.create(file.path("sim_datasets_sum"))
#' write.table(insulin_receptor_1, 
#'             file=file.path("sim_datasets","insulin_receptor_1.csv"), 
#'             row.names=FALSE)
#' write.table(insulin_receptor_2, 
#'             file=file.path("sim_datasets","insulin_receptor_2.csv"), 
#'             row.names=FALSE)
#' write.table(insulin_receptor_3, 
#'             file=file.path("sim_datasets","insulin_receptor_3.csv"), 
#'             row.names=FALSE)
#' summarise_data(inputdir="sim_datasets", 
#'                model="insulin_receptor", 
#'                outputfile=file.path("sim_datasets_sum", 
#'                                     "insulin_receptor_IR_beta_pY1146.csv"), 
#'                column_to_read="IR_beta_pY1146")
#' gen_stats_table(inputfile=file.path("sim_datasets_sum", 
#'                                     "insulin_receptor_IR_beta_pY1146.csv"), 
#'                 outputfile="insulin_receptor_IR_beta_pY1146_stats.csv", 
#'                 column_to_read="IR_beta_pY1146")
#' @export
gen_stats_table <- function(inputfile, outputfile, column_to_read="X1") {
  
  # Read the summarised simulated time course data set
  repeats <- data.table::fread(inputfile)
  time.col <- repeats$Time
  # equivalent to repeats[, Time :=  NULL] .
  repeats <- as.matrix(data.table::set(repeats, j=c(1L), value=NULL))
  
  # the statistics
  compute.stats <- function(mat) {
    for(i in 1:nrow(mat)) {
      timepoint.values <- repeats[i,]
      mat[i,2] = mean(timepoint.values, na.rm = TRUE)
      mat[i,3] = sd(timepoint.values, na.rm = TRUE)
      mat[i,4] = var(timepoint.values, na.rm = TRUE)
      mat[i,5] = skewness(timepoint.values, na.rm = TRUE)
      mat[i,6] = kurtosis(timepoint.values, na.rm = TRUE)
      mat[i,7] = qnorm(0.975)*mat[i,2]/sqrt(ncol(mat)) # quantile normal distribution (lot of samples)
      mat[i,8] = mat[i,2] / mat[i,1]
      mat[i,9] = min(timepoint.values, na.rm = TRUE)
      mat[i,10] = quantile(timepoint.values, na.rm = TRUE)[2]  # Q1
      mat[i,11] = median(timepoint.values, na.rm = TRUE)  # Q2 or quantile(timepoint.values)[3]
      mat[i,12] = quantile(timepoint.values, na.rm = TRUE)[4]  # Q3
      mat[i,13] = max(timepoint.values, na.rm = TRUE)
    }
    return(mat) 
  }
  statistics <- matrix(data=NA, nrow=nrow(repeats), ncol=13)
  statistics[,1] <- time.col
  statistics <- as.data.frame(compute.stats(statistics))
  colnames(statistics) <- c ("Time", "Mean", "StdDev", "Variance", "Skewness", "Kurtosis", 
                             "n-dist_CI95", "CoeffVar", "Minimum", "1stQuantile", 
                             "Median", "3rdQuantile", "Maximum")  
  # print(statistics)
  write.table(statistics, outputfile, sep="\t", row.names = FALSE)
}


#' Plot the simulation time courses using a heatmap representation.
#'
#' @param inputdir the input directory containing the time course files
#' @param outputdir the output directory
#' @param model the model name
#' @param exp_dataset a full path file containing the experimental data.
#' @param plot_exp_dataset TRUE if the experimental data should also be plotted
#' @param exp_dataset_alpha the alpha level for the data set
#' @param xaxis_label the label for the x axis (e.g. Time (min))
#' @param yaxis_label the label for the y axis (e.g. Level (a.u.))
#' @param column_to_read the name of the column to process
#' @param yaxis.min the lower limit for the y axis
#' @param yaxis.max the upper limit for the y axis
#' @examples
#' data(insulin_receptor_IR_beta_pY1146)
#' data(insulin_receptor_exp_dataset)
#' dir.create(file.path("sim_datasets_sum"))
#' write.table(insulin_receptor_IR_beta_pY1146, 
#'             file=file.path("sim_datasets_sum","insulin_receptor_IR_beta_pY1146.csv"), 
#'             row.names=FALSE)
#' write.table(insulin_receptor_exp_dataset, 
#'             file="insulin_receptor_exp_dataset.csv", 
#'             row.names=FALSE)
#' plot_comb_sims(inputdir="sim_datasets_sum", 
#'                outputdir="sim_plots",
#'                model="insulin_receptor",
#'                exp_dataset="insulin_receptor_exp_dataset.csv",
#'                plot_exp_dataset=TRUE, 
#'                exp_dataset_alpha=1.0, 
#'                xaxis_label="Time [m]", 
#'                yaxis_label="Level [a.u.]", 
#'                column_to_read='IR_beta_pY1146', 
#'                yaxis.min=NULL, 
#'                yaxis.max=NULL)
#' @export
plot_comb_sims <- function(inputdir, outputdir, model, exp_dataset, plot_exp_dataset=FALSE,
                           exp_dataset_alpha=1.0, xaxis_label='', yaxis_label='', column_to_read='X1',
                           yaxis.min=NULL, yaxis.max=NULL) {
  
  theme_set(tc_theme(36)) #28
  
  # create the directory of output
  if (!file.exists(outputdir)){
    dir.create(outputdir)
  }
  
  filein <- file.path(inputdir, paste0(model, "_", column_to_read, ".csv"))
  fileout <- file.path(outputdir, gsub('.csv', '.pdf', basename(filein)))
  # print(filein)
  
  df_exp_dataset <- load_exp_dataset(exp_dataset, plot_exp_dataset)
  
  df <- data.table::fread( filein )
  # print(df)

  # mean
  fileoutM <- gsub('.pdf', '_mean.pdf', fileout)
  gM <- ggplot()
  gM <- plot_combined_tc(df, gM, column_to_read, xaxis_label, yaxis_label, 'mean', yaxis.min=yaxis.min, yaxis.max=yaxis.max)
  ggsave(fileoutM, dpi=300, width=8, height=6)#, bg = "transparent")
  
  # mean_sd
  fileoutMSD <- gsub('.pdf', '_mean_sd.pdf', fileout)
  gMSD <- ggplot()
  gMSD <- plot_combined_tc(df, gMSD, column_to_read, xaxis_label, yaxis_label, 'mean_sd', yaxis.min=yaxis.min, yaxis.max=yaxis.max)
  ggsave(fileoutMSD, dpi=300, width=8, height=6)#, bg = "transparent")
  
  # mean_sd_ci95
  fileoutMSDCI <- gsub('.pdf', '_mean_sd_ci95.pdf', fileout)
  gMSDCI <- ggplot()
  gMSDCI <- plot_combined_tc(df, gMSDCI, column_to_read, xaxis_label, yaxis_label, 'mean_sd_ci95', yaxis.min=yaxis.min, yaxis.max=yaxis.max)
  ggsave(fileoutMSDCI, dpi=300, width=8, height=6)#, bg = "transparent")
  
  if(column_to_read %in% colnames(df_exp_dataset)) {
    # mean
    # we make this plot again because we want the line in front.
    gM <- ggplot()
    gM <- plot_raw_dataset(df_exp_dataset, gM, column_to_read, max(df$Time), alpha=exp_dataset_alpha)
    gM <- plot_combined_tc(df, gM, column_to_read, xaxis_label, yaxis_label, 'mean', yaxis.min=yaxis.min, yaxis.max=yaxis.max)
    ggsave(gsub('.pdf', '_w_exp_data.pdf', fileoutM), dpi=300, width=8, height=6)#, bg = "transparent")
    
    # mean_sd
    gMSD <- ggplot()
    gMSD <- plot_raw_dataset(df_exp_dataset, gMSD, column_to_read, max(df$Time), alpha=exp_dataset_alpha)
    gMSD <- plot_combined_tc(df, gMSD, column_to_read, xaxis_label, yaxis_label, 'mean_sd', alpha=0.6, yaxis.min=yaxis.min, yaxis.max=yaxis.max)
    ggsave(gsub('.pdf', '_w_exp_data.pdf', fileoutMSD), dpi=300, width=8, height=6)#, bg = "transparent")
    
    # mean_sd_ci95
    gMSDCI <- ggplot()
    gMSDCI <- plot_raw_dataset(df_exp_dataset, gMSDCI, column_to_read, max(df$Time), alpha=exp_dataset_alpha)
    gMSDCI <- plot_combined_tc(df, gMSDCI, column_to_read, xaxis_label, yaxis_label, 'mean_sd_ci95', alpha=0.6, yaxis.min=yaxis.min, yaxis.max=yaxis.max)
    ggsave(gsub('.pdf', '_w_exp_data.pdf', fileoutMSDCI), dpi=300, width=8, height=6)#, bg = "transparent")
  }
  
}


#' Plot the simulations time course separately.
#'
#' @param inputdir the input directory containing the time course files
#' @param outputdir the output directory
#' @param model the model name
#' @param exp_dataset a full path file containing the experimental data.
#' @param plot_exp_dataset TRUE if the experimental data should also be plotted
#' @param exp_dataset_alpha the alpha level for the data set
#' @param xaxis_label the label for the x axis (e.g. Time (min))
#' @param yaxis_label the label for the y axis (e.g. Level (a.u.))
#' @param column_to_read the name of the column to process
#' @param yaxis.min the lower limit for the y axis
#' @param yaxis.max the upper limit for the y axis
#' @examples
#' data(insulin_receptor_IR_beta_pY1146)
#' data(insulin_receptor_exp_dataset)
#' dir.create(file.path("sim_datasets_sum"))
#' write.table(insulin_receptor_IR_beta_pY1146, 
#'             file=file.path("sim_datasets_sum","insulin_receptor_IR_beta_pY1146.csv"), 
#'             row.names=FALSE)
#' write.table(insulin_receptor_exp_dataset, 
#'             file="insulin_receptor_exp_dataset.csv", 
#'             row.names=FALSE)
#' plot_sep_sims(inputdir="sim_datasets_sum", 
#'               outputdir="sim_plots",
#'               model="insulin_receptor",
#'               exp_dataset="insulin_receptor_exp_dataset.csv",
#'               plot_exp_dataset=TRUE, 
#'               exp_dataset_alpha=1.0, 
#'               xaxis_label="Time [m]", 
#'               yaxis_label="Level [a.u.]", 
#'               column_to_read='IR_beta_pY1146', 
#'               yaxis.min=NULL, 
#'               yaxis.max=NULL)
#' @export
plot_sep_sims <- function(inputdir, outputdir, model, exp_dataset, plot_exp_dataset=FALSE,
                          exp_dataset_alpha=1.0, xaxis_label='', yaxis_label='', column_to_read='X1',
                          yaxis.min=NULL, yaxis.max=NULL) {
  
  theme_set(tc_theme(36)) #28
  
  # create the directory of output
  if (!file.exists(outputdir)){
    dir.create(outputdir)
  }
  # collect all files in the directory
  filein <- file.path(inputdir, paste0(model, "_", column_to_read, ".csv"))
  fileout <- file.path(outputdir, gsub('.csv', '.pdf', basename(filein)))  
  # print(filein)
  
  df_exp_dataset <- load_exp_dataset(exp_dataset, plot_exp_dataset)
  
  df <- data.table::fread( filein )
  # print(df)
  
  g <- plot_repeated_tc(df, ggplot(), column_to_read, xaxis_label, yaxis_label, yaxis.min=yaxis.min, yaxis.max=yaxis.max)
  ggsave(fileout, dpi=300,  width=8, height=6)#, bg = "transparent")
  
  if(column_to_read %in% colnames(df_exp_dataset)) {
    g <- plot_raw_dataset(df_exp_dataset, g, column_to_read, max(df$Time), alpha=exp_dataset_alpha, yaxis.min=yaxis.min, yaxis.max=yaxis.max)
    g <- plot_repeated_tc(df, g, column_to_read, xaxis_label, yaxis_label, alpha=0.2, yaxis.min=yaxis.min, yaxis.max=yaxis.max)
    ggsave(gsub('.pdf', '_w_exp_data.pdf', fileout), dpi=300, width=8, height=6)#, bg = "transparent")
  }
  
  g <- plot_heatmap_tc(df, ggplot(), TRUE, column_to_read, xaxis_label, 'repeats')
  ggsave(gsub('.pdf', '_heatmap_scaled.pdf', fileout), dpi=300,  width=8, height=6)#, bg = "transparent")
  
  g <- plot_heatmap_tc(df, ggplot(), FALSE, column_to_read, xaxis_label, 'repeats')
  ggsave(gsub('.pdf', '_heatmap.pdf', fileout), dpi=300,  width=8, height=6)#, bg = "transparent")
    
}



#####################
# UTILITY FUNCTIONS #
#####################


#' Calculate the skewness of a numeric vector
#'
#' @param x the numeric vector
#' @param na.rm TRUE if NA values should be discarded
#' @return the skewness
#' @examples 
#' skewness(x=c(1,2.4,5,NA), na.rm=TRUE)
#' @export
skewness <- function(x, na.rm=FALSE) {
  if (na.rm) x = x[!is.na(x)]
  return(sum((x-mean(x))^3/length(x))/sqrt(var(x))^3)
}


#' Calculate the kurtosis of a numeric vector
#'
#' @param x the numeric vector
#' @param na.rm TRUE if NA values should be discarded
#' @return the kurtosis
#' @examples 
#' kurtosis(x=c(1,2.4,5,NA), na.rm=TRUE)
#' @export
kurtosis <- function(x, na.rm=FALSE) {
  if (na.rm) x = x[!is.na(x)]
  return(sum((x-mean(x))^4/length(x))/sqrt(var(x))^4)
}


#' Check that the experimental data set exists.
#'
#' @param exp_dataset a full path file containing the experimental data.
#' @param plot_exp_dataset TRUE if the data set file should be plotted.
#' @return TRUE if the file exists.
check_exp_dataset <- function(exp_dataset, plot_exp_dataset=FALSE) {
    if (plot_exp_dataset) {
        # check that exp_dataset exists and that the file ends with .csv (it is not a dir!)
        if (file.exists(exp_dataset) && grepl('.csv$', exp_dataset)){
            return(TRUE)
        } else {
            print(paste("Warning: experimental data set file ", exp_dataset, " does not exist or not specified. Skip.", sep=""))
            return(FALSE)
        }
    }
    return(FALSE)
}


#' Load the experimental data set.
#'
#' @param exp_dataset a full path file containing the experimental data.
#' @param plot_exp_dataset TRUE if the experimental data should also be plotted
#' @return the loaded data set.
load_exp_dataset <- function(exp_dataset, plot_exp_dataset=FALSE) {
    df_exp_dataset <- data.frame()
    if(check_exp_dataset(exp_dataset, plot_exp_dataset)) {
        df_exp_dataset <- data.frame(data.table::fread(exp_dataset))
    }
    return (df_exp_dataset)
}

Try the sbpiper package in your browser

Any scripts or data that you put into this service are public.

sbpiper documentation built on May 2, 2019, 8:53 a.m.