R/utilities-population.R

Defines functions extendPopulationFromXLS extendPopulationByUserDefinedParams readPopulationCharacteristicsFromXLS

Documented in extendPopulationByUserDefinedParams extendPopulationFromXLS readPopulationCharacteristicsFromXLS

#' Read an excel file containing information about population and create a
#' `PopulationCharacteristics` object
#'
#' @param XLSpath Path to the excel file
#' @param populationName Name of the population, as defined in the "PopulationName"
#' column
#' @param sheet Name or the index of the sheet in the excel file.
#' If `NULL`, the first sheet in the file is used.
#'
#' @return A `PopulationCharacteristics` object based on the information
#' in the excel file.
#' @import readxl
#' @export
readPopulationCharacteristicsFromXLS <- function(XLSpath, populationName, sheet = NULL) {
  columnNames <- c(
    "PopulationName", "species", "population", "numberOfIndividuals", "proportionOfFemales", "weightMin", "weightMax",
    "weightUnit", "heightMin", "heightMax", "heightUnit", "ageMin", "ageMax", "BMIMin", "BMIMax", "BMIUnit", "Protein", "Ontogeny"
  )

  validateIsString(c(XLSpath, populationName))
  validateIsString(sheet, nullAllowed = TRUE)

  if (is.null(sheet)) {
    sheet <- 1
  }
  data <- readExcel(path = XLSpath, sheet = sheet)

  if (!all(columnNames %in% names(data))) {
    stop(messages$errorWrongXLSStructure(filePath = XLSpath, expectedColNames = columnNames))
  }
  # Find the row with the given population name
  rowIdx <- which(data$PopulationName == populationName)
  if (length(rowIdx) == 0) {
    stop(messages$errorWrongPopulationName(populationName))
  }

  # Parse the information about population.
  # We only want to use arguments that have a value in the xls-file
  arguments <- list()
  # Starting to iterate by 2 as the first entry is "PopulationName" and
  # is not an argument
  for (i in 2:length(data[rowIdx, ])) {
    value <- data[[rowIdx, i]]
    # Skip column, if no value defined
    if (is.na(value)) {
      next
    }
    columnName <- names(data[rowIdx, ][i])
    # skip columns 'Ontogeny' and 'Protein' as they will be processed separately
    if (any(columnName == c("Ontogeny", "Protein"))) {
      next
    }
    arguments[[columnName]] <- value
  }

  # Create ontogenies for the proteins
  arguments[["moleculeOntogenies"]] <- .readOntongeniesFromXLS(data[rowIdx, ])

  # Using do.call to call the method with arguments in a list
  populationCharacterstics <- do.call(createPopulationCharacteristics, arguments)

  return(populationCharacterstics)
}

#' Possible gender entries as integer values
#'
#' @export
GenderInt <- enum(list(
  MALE = 1,
  FEMALE = 2,
  UNKNOWN = 3
))

#' Add user defined variability on parameters to a population.
#'
#' @param population Object of type `Population`
#' @param parameterPaths Vector of parameter path for which the variability is to be added.
#' @param meanValues Vector of mean values of the parameters. Must have the same
#'   length as `parameterPaths`. The type of mean (arithmetic, geometric)
#'   depends on the selected `distribution`. The values must be in the base
#'   units of the parameters.
#' @param sdValues Vector of standard deviation values of the parameters. Must
#'   have the same length as `parameterPaths`. The type of standard deviation
#'   depends on the selected `distribution`.
#' @param distributions Type of distribution from which the random values will
#'   be sampled. Must have the same length as `parameterPaths`.
#' A list of supported distributions is defined in `Distributions`. Default is `"Normal"`.
#' @export
extendPopulationByUserDefinedParams <- function(population, # nolint: object_length_linter.
                                                parameterPaths,
                                                meanValues,
                                                sdValues,
                                                distributions = Distributions$Normal) {
  validateIsOfType(population, "Population")
  validateIsString(parameterPaths)
  validateIsNumeric(c(meanValues, sdValues))
  distributions <- distributions %||% rep(Distributions$Normal, length(parameterPaths))
  validateIsSameLength(parameterPaths, meanValues, sdValues, distributions)


  # Iterate through all parameters and sample a parameter values vector
  for (i in seq_along(parameterPaths)) {
    path <- parameterPaths[[i]]
    mean <- meanValues[[i]]
    sd <- sdValues[[i]]

    # Sample values
    vals <- sampleRandomValue(
      distribution = distributions[[i]],
      mean = mean,
      sd = sd,
      n = population$count
    )

    population$setParameterValues(parameterOrPath = path, values = vals)
  }
}


#' Add user defined variability on parameters to a population from an excel file.
#'
#' @param population Object of type `Population`
#' @param XLSpath Path to the excel file that stores the information of
#'   parameters. The file must have the columns "Container Path",
#'   "Parameter Name", "Mean", "SD", "Units", and "Distribution". Mean and SD
#'   values must be in the base units of the parameters.
#' @param sheet Name or the index of the sheet in the excel file.
#' If `NULL`, the first sheet in the file is used.
#'
#' @details The method reads the information from the specified excel sheet(s)
#'   and calls `extendPopulationByUserDefinedParams`
#'
#' @import readxl
#' @export
extendPopulationFromXLS <- function(population, XLSpath, sheet = NULL) {
  validateIsOfType(population, "Population")
  validateIsString(XLSpath)
  validateIsString(sheet, nullAllowed = TRUE)
  if (is.null(sheet)) {
    sheet <- 1
  }

  columnNames <- c(
    "Container Path", "Parameter Name", "Mean",
    "SD", "Distribution"
  )

  columnTypes <- c("text", "text", "numeric", "numeric", "text")

  tryCatch(
    {
      data <- readExcel(path = XLSpath, sheet = sheet, col_types = columnTypes)
    },
    error = function(e) {
      cli::cli_abort(message = messages$errorWrongXLSStructure(filePath = XLSpath, expectedColNames = columnNames), call = rlang::caller_env(4))
    }
  )

  if (!all(columnNames %in% names(data))) {
    cli::cli_abort(message = messages$errorWrongXLSStructure(filePath = XLSpath, expectedColNames = columnNames))
  }

  if (nrow(data) == 0) {
    cli::cli_abort(message = c(
      "x" = "The specified excel sheet does not contain any rows with data.",
      "*" = "Please check the excel sheet name and content and try again."
    ))
  }

  complete_data <-
    data %>%
    dplyr::filter(!dplyr::if_any(dplyr::everything(), ~ is.na(.)))


  if (nrow(complete_data) < nrow(data)) {
    cli::cli_warn(message = c(
      "x" = "The specified excel sheet contains uncomplete row(s)",
      "i" = "Using only complete rows to define population parameters"
    ))
  }

  if (nrow(complete_data) == 0) {
    cli::cli_abort(message = c(
      "x" = "The specified excel sheet does not contain any complete row",
      "*" = "Please fill all the columns and try again."
    ))
  }
  extendPopulationByUserDefinedParams(
    population = population,
    parameterPaths = paste(complete_data$`Container Path`, complete_data$`Parameter Name`, sep = "|"),
    meanValues = complete_data$Mean,
    sdValues = complete_data$SD,
    distributions = complete_data$Distribution
  )
}

#' Supported distributions for sampling
#' @export
Distributions <- enum(list(
  "Normal",
  "LogNormal"
))

#' Sample a random value from a distribution
#'
#' @param distribution The type of the distribution the random variable is to be
#'   sampled from. See `Distributions` for the list of supported entries.
#' @param mean Mean value of the random variable
#' @param sd Standard deviation of the random variable
#' @param n Size of the sample
#'
#' @return Numerical vector of size n with randomly sampled values
#' @export
sampleRandomValue <- function(distribution, mean, sd, n) {
  if (!enumHasKey(distribution, Distributions)) {
    stop(messages$errorDistributionNotSupported(distribution))
  }

  if (distribution == Distributions$Normal) {
    return(stats::rnorm(n, mean, sd))
  }

  if (distribution == Distributions$LogNormal) {
    location <- log(mean^2 / sqrt(sd^2 + mean^2))
    shape <- sqrt(log(1 + (sd^2 / mean^2)))
    vals <- stats::rlnorm(n = n, meanlog = location, sdlog = shape)
    return(vals)
  }
  return(NULL)
}

#' Read ontogeny mapping from excel
#'
#' @param data Data from from excel file containing columns 'Protein' and
#' 'Ontogeny'
#'
#' @keywords internal
#' @return A list of `MoleculeOntogeny` objects
.readOntongeniesFromXLS <- function(data) {
  # Read columns 'Ontogeny' and 'Protein'
  ontogenies <- data$Ontogeny
  # calling 'as.character' as sometimes empty cells in Excel are not recognized as
  # chr NA, but some other NA, and strsplit fails.
  ontogenies <- as.character(ontogenies)
  # Proteins/ontogenies are separated by a ','
  ontogenies <- unlist(strsplit(x = ontogenies, split = ",", fixed = TRUE))
  # Remove whitespaces
  ontogenies <- trimws(ontogenies)
  proteins <- data$Protein
  proteins <- as.character(proteins)
  proteins <- unlist(strsplit(x = proteins, split = ",", fixed = TRUE))
  proteins <- trimws(proteins)

  # For each protein, an ontogeny must be specified
  validateIsSameLength(proteins, ontogenies)
  # Return 'NULL' if no ontogenies are specified. Not returning earlier to catch
  # a case where e.g. protein names are specified but not the ontogenies (lenghts
  # are not equal)
  if (anyNA(proteins)) {
    return(NULL)
  }
  moleculeOntogenies <- vector("list", length(proteins))
  for (i in seq_along(proteins)) {
    ontogeny <- ontogenies[[i]]
    validateEnumValue(value = ontogeny, enum = ospsuite::StandardOntogeny)
    moleculeOntogenies[[i]] <- ospsuite::MoleculeOntogeny$new(
      molecule = proteins[[i]],
      ontogeny = ospsuite::StandardOntogeny[[ontogeny]]
    )
  }

  return(moleculeOntogenies)
}
esqLABS/esqlabsR documentation built on April 17, 2025, 10:51 a.m.