R/PerformAnalysis.R

##############################################################################################################################################

# Function: PerformAnalysis.
# Arguments: Data model (or data stack), analysis model and simulation parameters.
# Description: This function carries out the statistical tests and computes the descriptive statistics specified in
# the analysis model using the specified data model (or a data stack generated by the user).
# The number of simulation runs (n.sims) and the seed in sim.parameters are used only if a data model is specified.
# If a data stack is specified, these values are obtained from the data stack.
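#
# Example usage (a minimal sketch; it assumes that data.model, analysis.model and sim.parameters
# have been created with the standard Mediana constructors, e.g., DataModel(), AnalysisModel()
# and SimParameters()):
#
#   sim.parameters = SimParameters(n.sims = 1000, proc.load = "full", seed = 42938001)
#   analysis.stack = PerformAnalysis(data.model, analysis.model, sim.parameters)
#
#   # Alternatively, analyze a previously generated data stack
#   # (n.sims and the seed are then taken from the data stack):
#   data.stack = CreateDataStack(data.model, 1000)
#   analysis.stack = PerformAnalysis(data.stack, analysis.model, sim.parameters)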
#' @import doRNG
#' @import doParallel
#' @import foreach
PerformAnalysis = function(data, analysis.model, sim.parameters) {

  # Check whether a data stack or a data model was specified; if a data model is specified, the CreateDataStructure function is called
  if (inherits(data, "DataStack")) {
    if (data$description == "data.stack") {
      call.CreateDataStructure  = FALSE
      data.stack = data
      n.sims = data.stack$sim.parameters$n.sims
      seed = data.stack$sim.parameters$seed
      data.structure = data.stack$data.structure
    } else {
      stop("The data object is not recognized.")
    }
  } else {
    call.CreateDataStructure  = TRUE
    data.model = data
    # Create a dummy data.stack
    data.stack = CreateDataStack(data.model, 1)
    data.structure = CreateDataStructure(data.model)
  }

  # Simulation parameters
  # Number of simulation runs
  if (is.null(sim.parameters$n.sims))
    stop("The number of simulation runs must be provided (n.sims)")

  if (call.CreateDataStructure == TRUE){
    n.sims = sim.parameters$n.sims
  } else {
    # The number of simulation runs has already been obtained from the data stack above
    warning("The number of simulation runs from sim.parameters was ignored as a data stack was defined.")
  }
  if (!is.numeric(n.sims))
    stop("Number of simulation runs must be an integer.")
  if (length(n.sims) > 1)
    stop("Number of simulation runs: Only one value must be specified.")
  if (n.sims%%1 != 0)
    stop("Number of simulation runs must be an integer.")
  if (n.sims <= 0)
    stop("Number of simulation runs must be positive.")

  # Seed
  if (is.null(sim.parameters$seed))
    stop("The seed must be provided (seed)")

  if (call.CreateDataStructure == TRUE){
    seed = sim.parameters$seed
  } else {
    # The seed has already been obtained from the data stack above
    warning("The seed from sim.parameters was ignored as a data stack was defined.")
  }

  if (!is.numeric(seed))
    stop("Seed must be an integer.")
  if (length(seed) > 1)
    stop("Seed: Only one value must be specified.")
  if (nchar(as.character(seed)) > 10)
    stop("Length of seed must be inferior to 10.")

  if (!is.null(sim.parameters$proc.load)){
    proc.load = sim.parameters$proc.load
    if (is.numeric(proc.load)){
      if (length(proc.load) > 1)
        stop("Number of cores: Only one value must be specified.")
      if (proc.load %%1 != 0)
        stop("Number of cores must be an integer.")
      if (proc.load <= 0)
        stop("Number of cores must be positive.")
      n.cores = proc.load
    }
    else if (is.character(proc.load)){
      n.cores=switch(proc.load,
                     low={1},
                     med={parallel::detectCores()/2},
                     high={parallel::detectCores()-1},
                     full={parallel::detectCores()},
                     {stop("Processor load not valid")})
    }
  } else n.cores = 1

  sim.parameters = list(n.sims = n.sims, seed = seed, proc.load = n.cores)

  # Perform error checks for the analysis model and create an internal analysis structure
  analysis.structure = CreateAnalysisStructure(analysis.model)

  # Check if the samples referenced in the analysis model are actually specified in the data model

  # List of sample IDs
  sample.id = unlist(data.structure$id)

  # Number of tests specified in the analysis model
  n.tests = length(analysis.structure$test)

  if (n.tests > 0) {

    # Test IDs
    test.id = rep(" ", n.tests)

    for (test.index in 1:n.tests) {

      test.id[test.index] = analysis.structure$test[[test.index]]$id

      # Number of samples in the current test
      n.test.samples = length(analysis.structure$test[[test.index]]$samples)

      # When processing samples specified for individual tests, it is important to remember that
      # a hierarchical structure can be used, i.e., samples can first be merged and then passed to a specific test

      for (i in 1:n.test.samples) {

        # Number of subsamples in the current sample
        n.subsamples = length(analysis.structure$test[[test.index]]$samples[[i]])
        if (n.subsamples == 1) {
          if (!(analysis.structure$test[[test.index]]$samples[[i]] %in% sample.id))
            stop(paste0("Analysis model: Sample '", analysis.structure$test[[test.index]]$samples[[i]], "' is not defined in the data model."))
        } else {
          # Multiple subsamples
          for (j in 1:n.subsamples) {
            if (!(analysis.structure$test[[test.index]]$samples[[i]][[j]] %in% sample.id))
              stop(paste0("Analysis model: Sample '", analysis.structure$test[[test.index]]$samples[[i]][[j]], "' is not defined in the data model."))
          }
        }
      }
    }
  }

  # Number of statistics specified in the analysis model
  n.statistics = length(analysis.structure$statistic)

  # Statistic IDs
  statistic.id = rep(" ", n.statistics)

  if (n.statistics > 0) {

    for (statistic.index in 1:n.statistics) {

      statistic.id[statistic.index] = analysis.structure$statistic[[statistic.index]]$id

      # Number of samples in the current statistic
      n.statistic.samples = length(analysis.structure$statistic[[statistic.index]]$samples)

      for (i in 1:n.statistic.samples) {
        # Number of subsamples in the current sample
        n.subsamples = length(analysis.structure$statistic[[statistic.index]]$samples[[i]])
        if (n.subsamples == 1) {
          if (!(analysis.structure$statistic[[statistic.index]]$samples[[i]] %in% sample.id))
            stop(paste0("Analysis model: Sample '", analysis.structure$statistic[[statistic.index]]$samples[[i]], "' is not defined in the data model."))
        } else {
          # Multiple subsamples
          for (j in 1:n.subsamples) {
            if (!(analysis.structure$statistic[[statistic.index]]$samples[[i]][[j]] %in% sample.id))
              stop(paste0("Analysis model: Sample '", analysis.structure$statistic[[statistic.index]]$samples[[i]][[j]], "' is not defined in the data model."))
          }
        }
      }
    }
  }

  # Information on the analysis scenario factors

  # Number of multiplicity adjustment sets
  if (!is.null(analysis.structure$mult.adjust)) {
    n.mult.adjust = length(analysis.structure$mult.adjust)
  } else {
    n.mult.adjust = 1
  }

  # Number of analysis points (total number of interim and final analyses)
  if (!is.null(analysis.structure$interim.analysis)) {
    n.analysis.points = length(analysis.structure$interim.analysis$interim.looks$fraction)
  } else {
    # No interim analyses
    n.analysis.points = 1
  }


  # Create the analysis stack (list of the analysis sets produced by the test and statistic functions,
  # each element in this list contains the results generated in a single simulation run)
  analysis.set = list()

  # Number of data scenarios
  n.data.scenarios = dim(data.stack$data.scenario.grid)[1]
  data.scenario.grid = data.stack$data.scenario.grid

  # Create a grid of the data and analysis scenario factors (outcome parameter, sample size,
  # design parameter, multiplicity adjustment)
  scenario.grid = matrix(0, n.data.scenarios * n.mult.adjust, 2)

  index = 1
  for (i in 1:n.data.scenarios) {
    for (j in 1:n.mult.adjust) {
      scenario.grid[index, 1] = i
      scenario.grid[index, 2] = j
      index = index + 1
    }
  }
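
  # For example, with 2 data scenarios and 2 multiplicity adjustment sets, the rows of scenario.grid are
  # (1, 1), (1, 2), (2, 1), (2, 2): column 1 indexes the data scenario and column 2 the multiplicity adjustment set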

  # Number of data and analysis scenarios
  n.scenarios = dim(scenario.grid)[1]

  # Number of analysis samples in each data scenario
  n.analysis.samples = length(data.stack$data.set[[1]]$data.scenario[[1]]$sample)

  # Simulation parameters
  # Use proc.load to create the parallel cluster
  cluster.mediana = parallel::makeCluster(getOption("cluster.mediana.cores", sim.parameters$proc.load))

  # Set the seed on the master process and initialise the RNG streams on the worker nodes to make the simulations reproducible
  set.seed(seed)
  parallel::clusterSetRNGStream(cluster.mediana, seed)

  # Export all objects in the global environment to each worker node
  parallel::clusterExport(cluster.mediana, ls(envir = .GlobalEnv))
  doParallel::registerDoParallel(cluster.mediana)
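
  # Note: the %dorng% operator (from the doRNG package) runs the foreach loop below on the registered
  # parallel backend with reproducible parallel random number streams derived from the seed set above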

  # Simulation index initialisation
  sim.index=0

  # Loop over simulation runs
  result.analysis.scenario = foreach::foreach(sim.index = 1:sim.parameters$n.sims, .packages = (.packages())) %dorng% {
    # Select the current data set within the data stack
    if (!call.CreateDataStructure){
      current.data.set = data.stack$data.set[[sim.index]]
    } else {
      current.data.stack = CreateDataStack(data.model, 1)
      current.data.set = current.data.stack$data.set[[1]]
    }

    # Matrix of results (p-values) produced by the tests
    test.results = matrix(0, n.tests, n.analysis.points)

    # Matrix of results produced by the statistics
    statistic.results = matrix(0, n.statistics, n.analysis.points)

    # Create the analysis scenario list (one element for each unique combination of the data scenario factors)
    result.data.scenario = list()

    # Loop over the data scenario factors (outcome parameter, sample size, and design parameter)
    for (scenario.index in 1:n.data.scenarios) {

      # Current data scenario
      current.data.scenario = current.data.set$data.scenario[[scenario.index]]

      # Current sample size set
      current.sample.size.set = data.stack$data.scenario.grid[scenario.index, "sample.size"]

      # Vector of sample sizes across the data samples in the current sample size set
      if (!any(is.na(data.stack$data.structure$sample.size.set))) current.sample.sizes = data.stack$data.structure$sample.size.set[current.sample.size.set, ]

      # Loop over the analysis points (interim and final analyses)
      for (analysis.point.index in 1:n.analysis.points) {

        # Create a data slice for the current interim look if interim analyses are specified in the analysis model
        if (!is.null(analysis.structure$interim.analysis)) {

          sample.list = analysis.structure$interim.analysis$interim.looks$sample
          parameter = analysis.structure$interim.analysis$interim.looks$parameter
          fraction = analysis.structure$interim.analysis$interim.looks$fraction[[analysis.point.index]]

          # Compute the total sample size in the sample list
          n.sample.list = length(sample.list)
          total.sample.size = 0
          # Number of samples
          n.samples = length(sample.id)

          for (k in 1:n.samples) {
            for (l in 1:n.sample.list) {
              if(sample.list[[l]] == sample.id[k]) total.sample.size = total.sample.size + current.sample.sizes[k]
            }
          }

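          # The data slice restricts the samples in sample.list to the first round(total.sample.size * fraction)
          # observations, i.e., the data assumed to be available at the current interim look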
          data.slice = CreateDataSlice(current.data.scenario, sample.list, parameter, round(total.sample.size * fraction))

        } else {

          # No interim analyses are specified in the analysis model -- simply use the current data scenario
          data.slice = current.data.scenario

        }

        # Loop over the tests specified in the analysis model to compute test results
        # if tests are specified in the analysis model
        if (n.tests > 0) {

          # Loop over the tests specified in the analysis model to compute test results (p-values)
          for (test.index in 1:n.tests) {

            # Current test
            current.test = analysis.structure$test[[test.index]]

            # Number of analysis samples specified in the current test
            n.samples = length(current.test$samples)

            # Extract the data frames for the analysis samples specified in the current test
            sample.list = list()

            for (sample.index in 1:n.samples) {

              # Number of subsamples within the current analysis sample
              n.subsamples = length(current.test$samples[[sample.index]])

              if (n.subsamples == 1) {

                # If there is only one subsample, no merging is required; simply select the corresponding analysis sample
                sample.flag.num = match(current.test$samples[[sample.index]],sample.id)
                sample.list[[sample.index]] = data.slice$sample[[sample.flag.num]]$data


              } else {

                # If there are two or more subsamples, these subsamples must be merged first to create the analysis sample
                # that is passed to the test function

                subsample.flag.num = match(current.test$samples[[sample.index]],sample.id)
                selected.subsamples = lapply(as.list(subsample.flag.num), function(x) data.slice$sample[[x]]$data)

                # Merge the subsamples
                sample.list[[sample.index]] = do.call(rbind, selected.subsamples)

              }

            }

            # Compute the test results (p-values) by calling the function for the current test with the test parameters
            test.results[test.index, analysis.point.index] = do.call(current.test$method,
                                                                     list(sample.list, list("PerformAnalysis",current.test$par)))

          } # End of the loop over the tests

        } # End of the if n.tests>0

        # Loop over the statistics specified in the analysis model to compute statistic results
        # if statistics are specified in the analysis model
        if (n.statistics > 0) {

          for (statistic.index in 1:n.statistics) {

            # Current statistic
            current.statistic = analysis.structure$statistic[[statistic.index]]

            # Number of analysis samples specified in the current statistic
            n.samples = length(current.statistic$samples)

            # Extract the data frames for the analysis samples specified in the current statistic
            sample.list = list()

            for (sample.index in 1:n.samples) {

              # Number of subsamples within the current analysis sample
              n.subsamples = length(current.statistic$samples[[sample.index]])

              if (n.subsamples == 1) {

                # If there is only one subsample, no merging is required; simply select the corresponding analysis sample
                sample.flag.num = match(current.statistic$samples[[sample.index]],sample.id)
                sample.list[[sample.index]] = data.slice$sample[[sample.flag.num]]$data


              } else {

                # If there are two or more subsamples, these subsamples must be merged first to create analysis samples
                # that are passed to the statistic function

                subsample.flag.num = match(current.statistic$samples[[sample.index]],sample.id)
                selected.subsamples = lapply(as.list(subsample.flag.num), function(x) data.slice$sample[[x]]$data)

                # Merge the subsamples
                sample.list[[sample.index]] = do.call(rbind, selected.subsamples)

              }

            }


            # Compute the statistic results by calling the function for the current statistic with the statistic parameters
            statistic.results[statistic.index, analysis.point.index] = do.call(current.statistic$method,
                                                                               list(sample.list, list("PerformAnalysis",current.statistic$par)))

          } # End of the loop over the statistics

        }    # End of the if n.statistics>0

      } # End of the loop over the analysis points

      # Assign test names
      if (n.tests > 0) {

        test.results = as.data.frame(test.results)
        rownames(test.results) = test.id

        if (n.analysis.points == 1) {
          colnames(test.results) = "Analysis"
        } else {
          names = rep("", n.analysis.points)
          for (j in 1:n.analysis.points) names[j] = paste0("Analysis ", j)
          colnames(test.results) = names
        }
      } else {

        # No tests are specified in the analysis model
        test.results = NA
      }


      # Assign statistic names
      if (n.statistics > 0) {

        statistic.results = as.data.frame(statistic.results)
        rownames(statistic.results) = statistic.id

        if (n.analysis.points == 1) {
          colnames(statistic.results) = "Analysis"
        } else {
          names = rep("", n.analysis.points)
          for (j in 1:n.analysis.points) names[j] = paste0("Analysis ", j)
          colnames(statistic.results) = names
        }
      } else {
        # No statistics are specified in the analysis model
        statistic.results = NA
      }


      result = list(tests = test.results,
                    statistic = statistic.results)

      result.data.scenario[[scenario.index]] = list(result = result)

    } # Loop over the data scenario factors

    # Loop over the data scenarios
    for (data.scenario.index in 1:n.data.scenarios) {

      # If at least one multiplicity adjustment has been specified, loop over the analysis scenario factors (multiplicity adjustment)
      if (!is.null(analysis.structure$mult.adjust)) {

        # Create the analysis scenario list (one element for each unique combination of the data and analysis scenario factors)
        result.data.scenario[[data.scenario.index]]$result$tests.adjust = list()

        # Loop over the multiplicity adjustment scenarios
        for (scenario.index in 1:n.mult.adjust) {

          # Matrix of results (p-values) produced by the tests
          test.results.adj = matrix(0, n.tests, n.analysis.points)

          # Get the current multiplicity adjustment procedure
          current.mult.adjust = analysis.structure$mult.adjust[[scenario.index]]

          # Get the unadjusted p-values for the current data scenario
          current.pvalues = result.data.scenario[[data.scenario.index]]$result$tests

          # Loop over the analysis points
          for (analysis.point.index in 1:n.analysis.points) {

            # Number of multiplicity adjustment procedures within the current multiplicity adjustment scenario
            n.mult.adjust.sc = length(current.mult.adjust)

            # Loop over the multiplicity adjustment procedures within the current multiplicity adjustment scenario
            for (mult.adjust.sc in 1:n.mult.adjust.sc) {
              # Apply the multiple testing procedure specified in the current multiplicity adjustment set
              # to the tests specified in this set

              # Extract the p-values for the tests specified in the current multiplicity adjustment set
              pvalues.flag.num = match(current.mult.adjust[[mult.adjust.sc]]$tests, test.id)
              selected.pvalues = current.pvalues[pvalues.flag.num, analysis.point.index]

              if (!is.na(current.mult.adjust[[mult.adjust.sc]]$proc)) {
                test.results.adj[pvalues.flag.num, analysis.point.index] = do.call(current.mult.adjust[[mult.adjust.sc]]$proc, list(selected.pvalues, list("Analysis", current.mult.adjust[[mult.adjust.sc]]$par)))
              } else {
                # If no multiplicity procedure is defined, there is no adjustment
                test.results.adj[pvalues.flag.num, analysis.point.index] = selected.pvalues
              }
            } # End of the loop over the multiplicity adjustment procedures
          } # End of the loop over the analysis points

          # Assign test names
          if (n.tests > 0) {
            test.results.adj = as.data.frame(test.results.adj)
            rownames(test.results.adj) = test.id
            if (n.analysis.points == 1) {
              colnames(test.results.adj) = "Analysis"
            } else {
              names = rep("", n.analysis.points)
              for (j in 1:n.analysis.points) names[j] = paste0("Analysis ", j)
              colnames(test.results.adj) = names
            }
          } else {
            # No tests are specified in the analysis model
            test.results.adj = NA
          }

          result.data.scenario[[data.scenario.index]]$result$tests.adjust$analysis.scenario[[scenario.index]] = test.results.adj

        } # End of the loop over the multiplicity adjustment scenarios

      } else {
        # If no multiplicity adjustment has been specified, carry the unadjusted test results forward
        result.data.scenario[[data.scenario.index]]$result$tests.adjust$analysis.scenario[[1]] = result.data.scenario[[data.scenario.index]]$result$tests
      } # End if analysis.structure

    } # End of the loop over the data scenarios

    result.analysis.scenario = result.data.scenario

    return(result.analysis.scenario)
  } # End of the loop over the simulations

  # Stop the cluster
  parallel::stopCluster(cluster.mediana)
  #closeAllConnections()

  # Define the analysis scenario grid (unique combinations of the data and analysis scenario factors)
  analysis.scenario.grid = as.data.frame(matrix(0, n.data.scenarios * n.mult.adjust, 4))
  d = data.stack$data.scenario.grid
  analysis.scenario.grid[, 1:3] = d[scenario.grid[,1], ]
  analysis.scenario.grid[, 4] = scenario.grid[,2]

  colnames(analysis.scenario.grid) = c("design.parameter", "outcome.parameter",
                                       "sample.size", "multiplicity.adjustment")

  # Create the analysis stack
  analysis.stack = list(description = "analysis.stack",
                        analysis.set = result.analysis.scenario,
                        analysis.scenario.grid = analysis.scenario.grid,
                        data.structure = data.structure,
                        analysis.structure = analysis.structure,
                        sim.parameters = sim.parameters)

  class(analysis.stack) = "AnalysisStack"
  return(analysis.stack)

}
# End of PerformAnalysis
