R/gather_simulation.r

Defines functions summarize.sims patterns2lines collect.params

Documented in summarize.sims

# gathers all simulations in certain folder and puts in big dataframe


# collects parameters from a single file and returns them as a dataframe
collect.params <- function(filename
                           ,line_from
                           ,line_to = NA
                           ,sep=";"
                           ,callback=NULL)
{
    # get the data frame with parameters
    raw.params <- read.table(filename
            ,header= F
            ,sep=sep
            ,skip=line_from - 1
            ,nrow=line_to-line_from
            ,stringsAsFactors=F)

    if (is.null(nrow(raw.params)) || nrow(raw.params) < 1) {
        print(paste("Cannot find any parameters between lines"
                   ,line_from
                   ,"and"
                   ,line_to
                   ,"in file"
                   ,filename
                   ,". Skipping."
                   ))

        return(NULL)
    }

    param.df = as.data.frame(t(raw.params$V2), stringsAsFactors=F)
    colnames(param.df) <- raw.params$V1

    if (!is.null(callback))
    {
        callback.result <- callback(raw.params)

        param.df <- cbind(param.df,callback.result)
    }


    return(param.df)
}


# find out where the parameter listing starts and ends
patterns2lines <- function(
    filename
    ,pattern_from
    ,pattern_to
    ) {

    f <- readLines(filename)

    line_from <- NA
    line_to <- NA

    # make a sequence of the lines...
    seqq <- seq(1,length(f),1)

    found_first <- F

    # go through each line in the data file and find first line
    # where data is printed (i.e., a line which starts with a digit)
    for (line_i in seqq)
    {
        #        print(paste("line numbert: ", line_i))
        #        print(paste0("line contents: '",f[[line_i]],"'"))
        #        #        print(paste("grep result endline: ",grep(pattern="^$",f[[line_i]])))
        #        print(paste("found_first true: ",found_first))
        #        print(paste("found_first pattern: ",pattern_from))
        #        print(paste("grep pattern: ",length(grep(pattern=pattern_from, x=f[[line_i]]))))

#        if (!is.na(pattern_to))
#        {
#            print(paste("grep pattern end: ",length(grep(pattern=pattern_to, x=f[[line_i]]))))
#        }

        # if you are not yet in the parameter listing
        # and you find the start of the parameter listing
        if (!found_first && length(grep(pattern=pattern_from, x=f[[line_i]])) > 0)
        {
            found_first <- T
            line_from <- line_i

            # if there is no end pattern to the parameter listing
            # just quit
            if (is.na(pattern_to))
            {
                line_to <- length(f)
                break
            }
        } else if (found_first &&
                   length(grep(pattern=pattern_to,f[[line_i]]) > 0))
        {
            line_to <- line_i
            break
        }
    }

    return(c(line_from, line_to))
} # end function find_out_param_line()


#' Summarizes a collection of simulation files, by producing a
#' \code{data.frame}
#' each row of which contains the parameters and the data from the last
#' generation of a single simulation file
#'
#' @param simulations_path the directory in which all the simulations are
#' collected
#' @param simulation_file_pattern a \href{https://cran.r-project.org/web/packages/stringr/vignettes/regular-expressions.html}{regular expression}
#'  that matches the simulation files
#' @param parameter_start_pattern a regular expression that matches the start
#' of the parameters
#' @param parameter_end_pattern a regular expression that matches the end
#' of the parameters. If parameters are at the end of the file anyway
#' one can just write \code{NA}
#' @param data_start_pattern a regular expression that matches the start
#' of the data output for each timestep or generation
#' @param data_end_pattern a regular expression that matches the end
#' of the data output for each timestep or generation
#' @param sep data separators, e.g., \code{","} or \code{"\t"}
#' @param recursive if \code{recursive = TRUE}, the search for simulation files
#' recurses into subdirectories. If \code{recursive = FALSE}, only the current
#' directory will be searched without descending into subdirectories
#' @param callback_data function to perform additional
#' processing of the data (variables) section.
#' The function needs to accept a single argument \code{x}.
#' For example, \code{callback_data=some_func(x)}. This argument will be assigned
#' a \code{data.frame} containing all the data
#' The callback function needs to return a named list
#' of values as in \code{list(a=3,b=30.0,xk=-393)} that will be appended to the final reuslt
#' @param callback_parameters function to perform additional processing of the parameters section.
#' The function needs to accept a single argument \code{x}. This argument will be
#' assigned a \code{data.frame} containing the names of the parameters in the first
#' columna and the corresponding parameter values in the second column.
#' The callback function needs to return a named list
#' of values as in \code{list(a=3,b=30.0,xk=-393)} that will be appended to the final result
#'
#'
#' @return A \code{data.frame}, each row of which contains the parameters and last line of
#' output of each simulation file and the corresponding file name in a column named
#' \code{file}
#'
#' @examples
#' # say the current directory "."
#' # contains the following files:
#' # ├── file_x.csv
#' # ├── parameters.txt
#' # ├── sim_cue_integration_23_06_2020_095246_1.csv
#' # ├── sim_cue_integration_23_06_2020_095246_10.csv
#' # ├── sim_cue_integration_23_06_2020_095246_10_dist.csv
#' # ├── sim_cue_integration_23_06_2020_095246_11.csv
#' # ├── sim_cue_integration_23_06_2020_095246_11_dist.csv
#' #
#' # we need to obtain the files
#' # ├── sim_cue_integration_23_06_2020_095246_1.csv
#' # ├── sim_cue_integration_23_06_2020_095246_10.csv
#' # ├── sim_cue_integration_23_06_2020_095246_11.csv
#' #
#' # hence we provide a simulation_file_pattern = "sim_.*\\d\\.csv"
#' # which is a regular expression mating sim_ followed by a series
#' # of any characters .* finalized by a digit \\d followed by a
#' # dot \\. and csv
#' #
#' data <- summarize.sims(simulations_path="."
#'                            ,simulation_file_pattern="sim_.*\\d\\.csv"
#'                            ,parameter_start_pattern="^sigmoid"
#'                            ,parameter_end_pattern="^sigmoid"
#'                            )
#' # collects the output from the files in the data.frame data
#' # sim_cue_integration_23_06_2020_095246_1.csv
#' # sim_cue_integration_23_06_2020_095246_10.csv
#' # sim_cue_integration_23_06_2020_095246_11.csv
#'
#' # data will look like
#' str(data)
#' # 'data.frame':	150 obs. of  4 variables:
#' # generation: num 50000 500000 500000 ...
#' # var1: num 1 2 3 4 ...
#' # x: num 0.33 0.35 0.35...
#' # file: sim_cue_integration_23_06_2020_095246_1.csv ...
#' @export
summarize.sims <- function(simulations_path="."
                           ,simulation_file_pattern="sim_.*"
                           ,parameter_start_pattern="^var"
                           ,parameter_end_pattern=NA
                           ,data_start_pattern="^generation"
                           ,data_end_pattern="^$"
                           ,sep=";"
                           ,recursive=T
                           ,callback_data=NULL
                           ,callback_parameters=NULL
                           )
{
    if (!dir.exists(simulations_path))
    {
        print(paste0("The directory provided to the argument simulations_path ",simulations_path," cannot be found."))
        return(NA)
    }

    # get a list of all the simulation files
    all.simulation.files <- list.files(
            path=simulations_path
            ,pattern=simulation_file_pattern
            ,full.names=T
            ,recursive=recursive
            )

    # place holder variable for a big
    # data frame with all simulations
    big.dataframe.all.sims <- NA

    if (length(all.simulation.files) < 1) {
        return(NA)
    }

    # go through all the simulation files
    # and process them
    for (i in 1:length(all.simulation.files))
    {
        # get the file name
        file_i <- all.simulation.files[[i]]

        # filename might be a factor so
        # let's change it to character
        file_i_chr <- as.character(file_i)

        # give message about progress
        print(paste0("processing file "
                     ,i
                     ," out of "
                     ,length(all.simulation.files)
                     ,": "
                     ,file_i
                     )
             )

        # find the parameters section
        param.lines <- patterns2lines(
               filename=file_i_chr
               ,pattern_from = parameter_start_pattern
               ,pattern_to = parameter_end_pattern)

       # if there are no parameters, skip this simulaton
        if (is.na(param.lines[[1]]))
        {
            print(paste("cannot find a match for the pattern "
                        ,"parameter_start_pattern='"
                        ,parameter_start_pattern
                        ,"' in the file ",file_i
                        ,". Skipping this file."
                        ,sep=""
                 ))

            next
        }

        # parameters have been found, process them
        parameters <- collect.params(
                       filename=file_i_chr
                       ,line_from = param.lines[[1]]
                       ,line_to = param.lines[[2]]
                       ,sep=sep
                       ,callback=callback_parameters)

        # again, if no parameters, skip this stuff
        if (is.null(parameters))
        {
            next
        }

        # find out where the data actually is
        data.lines <- patterns2lines(
               filename=file_i_chr
               ,pattern_from = data_start_pattern
               ,pattern_to = data_end_pattern)

        if (is.na(data.lines[[1]]))
        {
            data.lines[[1]] = 1
        }

        # determine the position of the last line
        pos.last.line <-
            ifelse(test = is.na(data.lines[[2]])
                   ,yes = -1  # no end to data, write -1, reflecting there is no end to rows
                   # end to data, subtract lines to skip
                   # and subtract 1 additional line (because
                   # header)
                   ,no = data.lines[[2]] - (data.lines[[1]]-1) - 2)

        # read the actual data
        the.data <- read.table(
            file=file_i_chr
            ,header=T
            ,skip=data.lines[[1]]-1 # R suddenly counts from 0
            ,blank.lines.skip = T
            ,strip.white=T
            ,nrows=pos.last.line
            ,sep=";")

        # get last line of the data
        last.line <- the.data[nrow(the.data),]

        # perform some post-processing using callbacks
        if (!is.null(callback_data))
        {
            callback.result <- callback_data(the.data)

            last.line <- cbind(last.line, callback.result)
        }

        # now tie parameters, last line of data and filename together
        total.data <- cbind(
            parameters
            ,last.line
            ,list(file=file_i))

        if (class(total.data) != class(data.frame())) {
            stop(paste("Data returned from file "
                       ,file_i_chr
                       ," is not in a proper data.frame format."
                       ,sep=""))
        }

        if (class(big.dataframe.all.sims) != class(data.frame())) {
            # if the data frame is not existing yet,
            # just add total data to big.dataframe.all.sims
            big.dataframe.all.sims <- total.data
        } else {
            # otherwise append to data.frame
            big.dataframe.all.sims <- rbind(
                big.dataframe.all.sims
                ,total.data)
        }

    } # end for

    return(big.dataframe.all.sims)
} # end summarize.sims()
bramkuijper/simulation.utils documentation built on Jan. 29, 2024, 11:23 a.m.