R/process_measurements.R
In MorphoRegions: Analysis of Regionalization Patterns in Serially Homologous Structures

Documented in process_measurements

#' Process vertebra measurements
#'
#' `process_measurements()` initializes the analysis workflow by processing a dataset of vertebra measurements into an object usable by \pkg{MorphoRegions}. Such processing includes identifying the vertebra indices and the measurements and filling in missing values.
#'
#' @param data a data.frame containing a column of vertebra indices and measurements for each vertebra, or a list thereof for multiple specimens.
#' @param pos the name or index of the variable in `data` containing the vertebra indices. Default is to use the first column.
#' @param measurements the names or indices of the variables in `data` containing the relevant vertebra measurements. If unspecified, will use all variables other than that specified in `pos`.
#' @param fillNA `logical`; whether to fill in missing values using a simple linear imputation. Default is `TRUE`. See Details.
#'
#' @returns A `regions_data` object, which is a list of data.frames (one for each specimen) with attributes containing metadata.
#'
#' @details
#' Any rows with missing values for all measurements will be removed. When missing values in non-removed rows are present and `fillNA` is set to `TRUE`, `process_measurements()` fills them in if the sequence of missing values is no greater than 2 in length. For numeric variables, it uses a linear interpolation, and for categorical variables, it fills in the missing values with the surrounding non-missing values if they are identical and leaves them missing otherwise. Otherwise, missing values are left as they are.
#'
#' When a list of data frames is supplied to `data`, only the variables named in `measurements` that are common across datasets will be stored as measurement variables.
#'
#' @seealso [svdPCO()] for computing principal coordinate axes from processed vertebra data.
#'
#' @example man/examples/example-process_measurements.R

#' @export
process_measurements <- function(data, pos = 1L, measurements, fillNA = TRUE) {

  if (is.matrix(data) || is.data.frame(data)) {
    data <- list(as.data.frame(data))
  }
  else if (is.list(data) && all(vapply(data, function(i) is.data.frame(i) || is.matrix(i), logical(1L)))) {
    for (i in seq_along(data)) {
      data[[i]] <- as.data.frame(data[[i]])
    }
  }
  else {
    chk::err("`data` must be a matrix, dataframe, or list thereof")
  }

  chk::chk_scalar(pos)
  chk::chk_flag(fillNA)

  pos_names <- character(length(data))
  pos_var_list <- vector("list", length(data))

  for (i in seq_along(data)) {
    if (chk::vld_whole_number(pos) && chk::vld_gte(pos, 1) && chk::vld_lte(pos, ncol(data[[i]]))) {
      pos_names[i] <- names(data[[i]])[as.integer(pos)]
    }
    else if (chk::vld_string(pos) && chk::vld_subset(pos, names(data[[i]]))) {
      pos_names[i] <- pos
    }
    else {
      chk::err("`pos` must be a single value indicating the column in `data` containing the vertebra positions")
    }

    pos_var_list[[i]] <- data[[i]][[pos_names[i]]]
  }

  if (length(unique(pos_names)) > 1) {
    chk::err("the variable identified by `pos` must have the same name in all supplied datasets")
  }

  pos <- pos_names[1]

  for (i in seq_along(pos_var_list)) {
    if (!chk::vld_whole_numeric(pos_var_list[[i]])) {
      chk::err("`pos` must refer to a variable of whole numbers identifying vertebra positions")
    }
  }

  measurements_names_list <- vector("list", length(data))

  for (i in seq_along(data)) {
    if (missing(measurements)) {
      measurements_names_list[[i]] <- setdiff(names(data[[i]]), pos)
    }
    else {
      if (length(measurements) == 0) {
        measurements_names_list[[i]] <- character()
      }
      else if (chk::vld_whole_numeric(measurements) && chk::vld_subset(measurements, seq_len(ncol(data[[i]])))) {
        measurements_names_list[[i]] <- names(data[[i]])[as.integer(measurements)]
      }
      else if (chk::vld_character(measurements) && chk::vld_subset(measurements, names(data[[i]]))) {
        measurements_names_list[[i]] <- measurements
      }
      else {
        chk::err("if supplied, `measurements` must indicate the columns in `data` containing the measurement values")
      }

      if (pos %in% measurements_names_list[[i]]) {
        chk::err("`pos` and `measurements` cannot overlap")
      }
    }
  }

  # Get only measurements that are common across datasets
  measurements <- Reduce(union, measurements_names_list)

  for (i in seq_along(data)) {

    #Subset to specified variables
    data[[i]] <- data[[i]][names(data[[i]]) %in% c(pos, measurements)]

    #Reorder columns to be consistent
    if (i > 1)
      data[[i]] <- data[[i]][names(data[[1]])]

    pos_ind <- match(pos, names(data[[i]]))

    #Rows that are all NA
    all_NA_rows <- which(apply(data[[i]], 1, function(x) all(is.na(x[-pos_ind]))))

    if (length(all_NA_rows) > 0)
      data[[i]] <- data[[i]][-all_NA_rows,]

    if (fillNA) {
      #Fill in missing values
      data[[i]] <- .missingval(data[[i]], pos_ind)

      if (anyNA(data[[i]])) {
        chk::wrn(sprintf("missing values remain in %s because there were sequences of missing values greater than %s in length", if (length(data) == 1) "the dataset" else paste("dataset", i), 2))
      }
    }
  }

  attr(data, "pos_ind") <- match(pos, names(data[[1]]))
  attr(data, "eligible_vertebrae") <- sort(unique(unlist(lapply(data, `[[`, pos))))

  class(data) <- "regions_data"

  data
}

# Takes in a dataframe and fills in missing values
# `max_NA_seq_len` controls max number of consecutive missing values that can
# be imputed
.missingval <- function(data, pos_ind, max_NA_seq_len = 2) {

  if (!anyNA(data[-pos_ind])) return(data)

  chk::chk_count(max_NA_seq_len)

  # Add missing vertebrae
  missing_vertebrae <- setdiff(seq(min(data[[pos_ind]]),
                                   max(data[[pos_ind]])),
                               data[[pos_ind]])

  if (length(missing_vertebrae) > 0) {
    missing_rows <- as.data.frame(matrix(NA, nrow = length(missing_vertebrae),
                                         ncol = ncol(data)))
    names(missing_rows) <- names(data)
    missing_rows[[pos_ind]] <- missing_vertebrae

    data <- rbind(data, missing_rows)
  }

  # Order by vertebra
  o <- order(data[[pos_ind]])

  pos <- data[[pos_ind]]

  # Find strings of 1-2 NAs
  variables_with_missing <- which(vapply(data[!(pos %in% missing_vertebrae), -pos_ind],
                                         anyNA, logical(1L)))

  for (i in variables_with_missing) {
    # Extract variable ordered by vertebra
    dat <- data[o, -pos_ind][[i]]

    miss.par <- which(is.na(dat)) #find which ones are missing
    seqs <- split(miss.par, cumsum(c(1, diff(miss.par) != 1))) #split them into sequences

    l.seq <- which(lengths(seqs) <= max_NA_seq_len) #which strings are two or less
    if (length(l.seq) == 0) next #if no short strings skip
    seqs <- seqs[l.seq]

    for (fill in seqs) { #Fill each string
      before <- min(fill) - 1
      after <- max(fill) + 1
      if (before < 1) before <- after #if at the beginning, use the end points
      if (after > length(dat)) after <- before #if at the end, use beginning points

      if (!is.numeric(dat) && dat[before] != dat[after]) next

      if (is.numeric(dat)) {
        val <- seq(dat[before], dat[after], length.out = length(fill) + 2) #calculate missing as mean of adjacent
        dat[fill] <- val[-c(1, length(val))] #fill in the missing
      }
      else {
        val <- rep(dat[before], length(fill))
        dat[fill] <- val #fill in the missing
      }
    }

    data[o, -pos_ind][[i]] <- dat
  }

  # Remove extra fully missing rows
  subset(data, !pos %in% missing_vertebrae)
}

Any scripts or data that you put into this service are public.

MorphoRegions documentation built on Sept. 11, 2024, 8:28 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

MorphoRegions
Analysis of Regionalization Patterns in Serially Homologous Structures

R/process_measurements.R
In MorphoRegions: Analysis of Regionalization Patterns in Serially Homologous Structures

Defines functions .missingval process_measurements

Documented in process_measurements

Try the MorphoRegions package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

MorphoRegions Analysis of Regionalization Patterns in Serially Homologous Structures

R/process_measurements.R In MorphoRegions: Analysis of Regionalization Patterns in Serially Homologous Structures

Defines functions .missingval process_measurements

Documented in process_measurements

Try the MorphoRegions package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

MorphoRegions
Analysis of Regionalization Patterns in Serially Homologous Structures

R/process_measurements.R
In MorphoRegions: Analysis of Regionalization Patterns in Serially Homologous Structures