R/write_resp.R

Defines function write.response

#*******************************************************************************
#
# ------------------- LSD tools for sensitivity analysis ---------------------
#
#   Written by Marcelo C. Pereira, University of Campinas
#
#   Copyright Marcelo C. Pereira
#   Distributed under the GNU General Public License
#
#*******************************************************************************

# ==== Create DoE response file ====
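#
# write.response( ) reads the LSD result files (baseName_XX_YY.res[.gz]) for
# each of the nExp DoE points (configurations baseName_iniExp to
# baseName_<iniExp + nExp - 1>), computes the selected response statistic of
# outVar over the Monte Carlo runs of each point (mean & SD or median & MAD),
# optionally writes it to disk as a CSV file, and returns it as a data frame
# with one row per DoE point. When rm.temp = FALSE, partially processed data
# is cached in an .RData file and reused in later calls.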

write.response <- function( folder, baseName, outVar = "", iniExp = 1, nExp = 1,
                            iniDrop = 0, nKeep = -1, pool = TRUE, instance = 1,
                            posit = NULL, posit.match = c( "fixed", "glob", "regex" ),
                            na.rm = FALSE, conf = 0.95, saveVars = NULL,
                            addVars = NULL, eval.vars = NULL, eval.run = NULL,
                            eval.stat = c( "mean", "median" ), rm.temp = TRUE,
                            nnodes = 1, quietly = TRUE ) {

  # evaluate new variables (not in LSD files) names
  nVarNew <- length( addVars )           # number of new variables to add
  newNameVar <- LSDinterface::name.var.lsd( append( saveVars, addVars ) )
  nVar <- length( newNameVar )          # total number of variables

  if( nVar == 0 && nVarNew == 0 )
    stop( "No variable to be kept in the data set, at least one required" )

  if( length( outVar ) == 0 || outVar[ 1 ] == "" )  # no output variable selected?
    outVar <- newNameVar[ 1 ]           # use first var
  else {
    outVar <- LSDinterface::name.var.lsd( outVar )
    if( ! outVar %in% newNameVar )
      stop( paste0( "Invalid output variable selected (", outVar, ")" ) )
  }

  # check if files are in a subfolder
  myFiles <- LSDinterface::list.files.lsd( path = folder,
                                           conf.name = paste0( baseName, "_",
                                                               iniExp ) )
  if( length( myFiles ) == 0 || ! file.exists( myFiles[ 1 ] ) )
    stop( "No data files (baseName_XX_YY.res[.gz]) found on informed path" )

  folder <- dirname( myFiles[ 1 ] )

  # first check if extraction interrupted and continue with partial files
  tempFile <- paste0( folder, "/", baseName, "_", iniExp,
                      "_", iniExp + nExp - 1, ".RData" )
  if( ! rm.temp && file.exists( tempFile ) )
    tempDate <- file.mtime( tempFile )
  else
    tempDate <- 0
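
  # tempDate == 0 flags that there is no reusable temporary file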

  # test if data files exist and are newer
  dataDate <- 0
  for( k in 1 : nExp ) {
    myFiles <- LSDinterface::list.files.lsd( path = folder,
                                             conf.name = paste0( baseName, "_",
                                                                 iniExp + k - 1 ) )
    if( tempDate == 0 && length( myFiles ) < 2 )
      stop( "Not enough data files (baseName_XX_YY.res[.gz]) found" )

    # get newest date
    for( file in myFiles ) {
      date <- file.mtime( file )
      if( date > dataDate )
        dataDate <- date
    }
  }

  # Continue with temporary file if appropriate
  noTemp <- TRUE
  if( tempDate > dataDate ) {
    temp <- new.env( )
    load( tempFile, envir = temp )

    if( identical( temp$newNameVar, newNameVar ) ) {
      if( ! quietly )
        cat( "Previously processed data found, not reading data files...\n\n" )

      noTemp <- FALSE
      load( tempFile )
    }

    rm( temp )
  }

  # if no temporary data, start from zero
  if( noTemp ) {

    if( dataDate == 0 )
      stop( "No data files (baseName_XX_YY.res[.gz]) found" )

    for( k in 1 : nExp ) {

      if( ! quietly )
        cat( "\nSample #", iniExp + k - 1, "\n" )

      # ---- Read data files ----

      # Get file names
      myFiles <- LSDinterface::list.files.lsd( path = folder,
                                               conf.name = paste0( baseName, "_",
                                                                   iniExp + k - 1 ) )
      if( ! quietly )
        cat( "\nData files: ", myFiles, "\n\n" )

      # Determine the DoE sample size (repetitions on the same DoE point)
      nSize  <- length( myFiles )

      if( k == 1 )
        poolData <- array( list( ), dim = c( nExp, nVar, nSize ),
                           dimnames = list( NULL, newNameVar, NULL ) )
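
      # poolData is a 3-D array of lists indexed as [ DoE point, variable, MC run ],
      # each cell holding the vector of observations kept for that combination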

      # Get data set details from first file
      dimData <- LSDinterface::info.dimensions.lsd( myFiles[ 1 ] )
      nTsteps <- dimData$tSteps
      origNvar <- dimData$nVars

      if( ! quietly ) {
        cat( "Number of MC runs: ", nSize, "\n" )
        cat( "Number of periods: ", nTsteps, "\n" )
      }

      nTsteps <- nTsteps - iniDrop
      if( nKeep != -1 )
        nTsteps <- min( nKeep, nTsteps )

      if( ! quietly ) {
        cat( "Number of used periods: ", nTsteps, "\n" )
        cat( "Number of variables: ", origNvar, "\n\n" )
        cat( "Reading data from files...\n" )
      }

      nSampl <- 0

      if( pool ) {

        nInsts <- 1                                 # single instanced data

        for( j in 1 : nSize ) {

          # ------ Read data one file at a time to restrict memory usage ------

          dataSet <- LSDinterface::read.single.lsd( myFiles[ j ],
                                                    col.names = saveVars,
                                                    nrows = nKeep,
                                                    skip = iniDrop,
                                                    instance = instance,
                                                    posit = posit,
                                                    posit.match = posit.match )

          origVars <- colnames( dataSet )           # original variables

          if( length( origVars ) == 0 )
            stop( "variable, instance or position not found (saveVars/instance/posit)" )

          # ------ Add new variables to data set ------

          if( nVarNew != 0 ) {
            dataSet <- abind::abind( dataSet, array( as.numeric( NA ),
                                                     dim = c( nTsteps, nVarNew ) ),
                                     along = 2, use.first.dimnames = TRUE )
            colnames( dataSet ) <-
              LSDinterface::name.var.lsd( append( origVars, addVars ) )
          }

          # Call function to fill new variables with data or reevaluate old ones
          if( ! is.null( eval.vars ) )
            dataSet <- eval.vars( dataSet, newNameVar )

          # ---- Reorganize and save data ----

          for( i in 1 : nVar ) {
            if( ! newNameVar[ i ] %in% colnames( dataSet ) )
              stop( paste0( "Invalid variable to be saved (", newNameVar[ i ]
                            ,")" ) )

            x <- dataSet[ , newNameVar[ i ] ]
            if( na.rm )
              poolData[[ k, i, j ]] <- x[ ! is.na( x ) ]
            else
              poolData[[ k, i, j ]] <- x

            nSampl <- nSampl + length( poolData[[ k, i, j ]] )
          }
        }

      } else {

        # ------ Read data at once, may require a lot of memory ------

        dataSet <- LSDinterface::read.4d.lsd( myFiles, col.names = saveVars,
                                              nrows = nKeep, skip = iniDrop,
                                              nnodes = nnodes, posit = posit,
                                              posit.match = posit.match )

        origVars <- dimnames( dataSet )[[ 2 ]]  # original variables
        nInsts <- dim( dataSet )[ 3 ]           # total number of instances

        # ------ Add new variables to data set ------

        if( nVarNew != 0 )
          dataSet <- abind::abind( dataSet, array( as.numeric( NA ),
                                                   dim = c( nTsteps, nVarNew,
                                                            nInsts, nSize ) ),
                                   along = 2, use.first.dimnames = TRUE )

        dimnames( dataSet )[[ 2 ]] <-
          LSDinterface::name.var.lsd( append( origVars, addVars ) )

        # Call function to fill new variables with data or reevaluate old ones
        if( ! is.null( eval.vars ) )
          dataSet <- eval.vars( dataSet, newNameVar )

        # ---- Reorganize and save data ----

        for( j in 1 : nSize )
          for( i in 1 : nVar ){
            if( ! newNameVar[ i ] %in% dimnames( dataSet )[[ 2 ]] )
              stop( paste0( "Invalid variable to be saved (", newNameVar[ i ]
                            ,")" ) )

            x <- as.vector( dataSet[ , newNameVar[ i ], , j ] )
            if( na.rm )
              poolData[[ k, i, j ]] <- x[ ! is.na( x ) ]
            else
              poolData[[ k, i, j ]] <- x

            nSampl <- nSampl + length( poolData[[ k, i, j ]] )
          }
      }

      if( ! quietly ) {
        cat( "\nNumber of variables: ", nVar, "\n" )
        cat( "Number of samples: ", nSampl, "\n\n" )
      }

      # Clean temp variables
      rm( dataSet, x )
    }

    # Save data list to disk
    if( ! rm.temp )
      try( save( nExp, nVar, nSize, nInsts, newNameVar, poolData,
                 compress = TRUE, file = tempFile ), silent = TRUE )
  }

  # ---- Process data in each experiment (nSize points) ----

  varIdx <- match( outVar, newNameVar )

  # Process the DoE experiments
  respStat <- respDisp <- vector( mode = "numeric", length = nExp )
  tobs <- tdiscards <- 0

  for( k in 1 : nExp ) {

    # Call the user function to evaluate each experiment (see the commented
    # sketch after this function)
    if( ! is.null( eval.run ) ) {

      resp <- eval.run( poolData, run = k, varIdx = varIdx, conf = conf  )

    } else {                            # default processing - just calculate
      if( match.arg( eval.stat ) == "median" ) {
        med <- vector( mode = "numeric", length = nSize )
        obs <- 0
        for( j in 1 : nSize ) {
          data <- poolData[[ k, varIdx, j ]]
          med[ j ] <- stats::median( data, na.rm = TRUE )
          obs <- obs + length( data[ ! is.na( data ) ] )
        }

        statAcc <- stats::median( med, na.rm = TRUE )
        dispAcc <- stats::mad( med, na.rm = TRUE )

      } else {
        # the average of all runs and the MC SD
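        # this block computes statAcc = (1/nSize) * sum_j mean_j and
        # dispAcc = sqrt( (1/nSize) * sum_j mean_j^2 - statAcc^2 ),
        # the population SD of the per-run means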
        statAcc <- varAcc <- obs <- 0
        for( j in 1 : nSize ) {
          data <- poolData[[ k, varIdx, j ]]
          m <- mean( data, na.rm = TRUE )
          statAcc <- statAcc + m
          varAcc <- varAcc + m ^ 2
          obs <- obs + length( data[ ! is.na( data ) ] )
        }

        # avoid negative rounding errors
        if( is.finite( varAcc ) && is.finite( statAcc ) ) {
          statAcc <- statAcc / nSize
          if( varAcc / nSize < statAcc ^ 2 )
            dispAcc <- 0
          else
            dispAcc <- sqrt( varAcc / nSize - statAcc ^ 2 )
        } else {
          statAcc <- dispAcc <- NA
        }
      }

      resp <- list( statAcc, dispAcc, obs / nSize, 0 )
      rm( data )
    }

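    # resp: [[ 1 ]] point statistic, [[ 2 ]] dispersion measure,
    # [[ 3 ]] number of observations, [[ 4 ]] number of discards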
    respStat[ k ] <- resp[[ 1 ]]
    respDisp[ k ] <- resp[[ 2 ]]
    tobs <- tobs + resp[[ 3 ]]
    tdiscards <- tdiscards + resp[[ 4 ]]
  }

  # Write the table to disk as an Excel-compatible CSV file
  tresp <- as.data.frame( cbind( respStat, respDisp ) )

  if( match.arg( eval.stat ) == "median" )
    colnames( tresp ) <- c( "Median", "MAD" )
  else
    colnames( tresp ) <- c( "Mean", "SD" )

  if( ! rm.temp ) {
    respFile <- paste0( folder, "/", baseName, "_", iniExp, "_",
                        iniExp + nExp - 1, "_", match.arg( eval.stat ), "_",
                        outVar, ".csv" )

    tryCatch( suppressWarnings( utils::write.csv( tresp,
                                                  respFile,
                                                  row.names = FALSE ) ),
              error = function( e )
                stop( "Cannot write DoE response file to disk (read-only?)" ) )

    if( ! quietly )
      cat( "DoE response file saved:", respFile, "\n" )
  }

  if( ! quietly ) {
    cat( "Doe points =", k, "\n" )
    cat( "Total observations =", tobs, "\n" )
    cat( "Discarded observations =", tdiscards, "\n\n" )
  }

  rm( poolData, resp )
  if( rm.temp && file.exists( tempFile ) )
    unlink( tempFile )

  return( tresp )
}
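
# ==== Illustrative usage (not part of the package code) ====
#
# A minimal sketch of a direct call, assuming a 5-point DoE whose LSD result
# files (e.g., "sim1_1_1.res.gz", ..., "sim1_5_10.res.gz") are stored in the
# folder "doe"; folder, base name and variable names are hypothetical
# placeholders.
#
# resp <- write.response( folder = "doe", baseName = "sim1",
#                         outVar = "GDP", iniExp = 1, nExp = 5,
#                         iniDrop = 100, nKeep = 500,
#                         saveVars = c( "GDP" ),
#                         eval.stat = "mean", rm.temp = FALSE,
#                         quietly = FALSE )
# print( resp )    # one row per DoE point, columns "Mean" and "SD"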

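# ==== Illustrative user-defined hooks (assumed names, not package code) ====
#
# eval.vars receives the data read for one DoE point (a matrix of periods x
# variables when pool = TRUE, a 4-D array of periods x variables x instances
# x MC runs otherwise) plus the vector of variable names, and must return it
# with the added variables (addVars) filled in. eval.run receives the full
# poolData array and must return a 4-element list: statistic, dispersion,
# observations and discards for the requested DoE point. The sketch below,
# using the hypothetical name my.eval.run, returns the mean of the per-run
# means and the half-width of its confidence interval at level conf, to be
# passed to write.response( ) via the eval.run argument.
#
# my.eval.run <- function( data, run, varIdx, conf ) {
#   runMeans <- sapply( 1 : dim( data )[ 3 ],
#                       function( j ) mean( data[[ run, varIdx, j ]],
#                                           na.rm = TRUE ) )
#   ciHalf <- stats::qt( 1 - ( 1 - conf ) / 2, length( runMeans ) - 1 ) *
#     stats::sd( runMeans ) / sqrt( length( runMeans ) )
#   return( list( mean( runMeans ), ciHalf, length( runMeans ), 0 ) )
# }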