R/datplot_utility.R

Defines functions check.structure create.sub.objects get.step.sequence get.probability get.weights switch.dating generate.stepsize

Documented in check.structure create.sub.objects generate.stepsize get.probability get.step.sequence get.weights switch.dating

#' @title Determine stepsize (internal)
#'
#' @description Determines stepsize by selecting the absolute minimum value
#' between the upper and lower end of all dating ranges.
#'
#' @param DAT_mat a matrix as prepared by [datsteps()], resp. a matrix witch
#' columns names `datmin` and `datmax` containing numeric/integer value of the
#' dating ranges.
#' @inheritParams datsteps
#'
#' @return A single numeric value that can be used as minimal stepsize.
#'
#' @seealso [datsteps()]
#'
#' @keywords internal
generate.stepsize <- function(DAT_mat, verbose = FALSE) {
  if (!is.numeric(DAT_mat[, "datmin"]) & !is.numeric(DAT_mat[, "datmax"])) {
    stop("Non numeric values handed to generate.stepsize().")
  }

  timespans <- (abs(DAT_mat[, "datmin"] - DAT_mat[, "datmax"]))

  stepsize <- min(timespans)

  stepsize <- ifelse(stepsize == 0, 1, stepsize)

  if(verbose) {
    message(paste("Using stepsize = ", stepsize, " (auto).", sep = ""))
  }
  return(stepsize)
}

#' @title Switch values where dating is in wrong order (internal)
#'
#' @description Requires a data.frame with 2 numeric variables in the
#' 3rd and 4th column: minimum date (int/numeric) and
#' maximum date (int/numeric) as used in [datsteps()].
#'
#' @param DAT_df a data.frame with 4 variables in this order: ID, group,
#' minimum date (int/num), maximum date (int/num)
#'
#' @return The same data.frame with the dating values which were in wrong order
#' switched.
#'
#' @seealso [datsteps()]
#'
#' @keywords internal
switch.dating <- function(DAT_df) {
  dat_wrong_order <- which(DAT_df[, 3] > DAT_df[, 4])

  if (length(dat_wrong_order) > 0) {
    # Switch the Dating of Rows assumed to be in wrong order:
    DAT_df[dat_wrong_order, 3:4] <- DAT_df[dat_wrong_order, 4:3]
    # Notifying is important, because the data have been changed!
    warning(paste0("Warning: Dating seems to be in wrong order at ID ",
                   paste(DAT_df[dat_wrong_order, 1], collapse = ", "),
                   " (Index: ", paste(dat_wrong_order, collapse = ", "),
                   "). Dates have been switched, but be sure to check ",
                   "your original data for possible mistakes."))
  }

  return(DAT_df)
}


#' @title Calculate the weights for each dated object
#'
#' @description Calculates the weights from two vectors of minimum and maximum
#' dating for each object. Returns a dataframe with the weight in the first
#' column and FALSE in the second if two rows have the same value in both
#' min and max dating. See [publication](https://doi.org/10.1017/aap.2021.8)
#' for information about how this is calculated.
#'
#' @param DAT_min a numeric vector containing the minimum date of each object
#' @param DAT_max a numeric vector containing the maximum date of each object
#' @inheritParams datsteps
#'
#' @seealso [datsteps()], [get.probability()]
#'
#' @return a vector of 'weight'-values for the datsteps-data.frame, that is a
#' quantification of how well the object is dated (lesser value means object
#' is dated to larger timespans, i.e. with less confidence)
#'
#' @export get.weights
get.weights <- function(DAT_min, DAT_max, verbose = FALSE) {
  stopifnot(is.numeric(DAT_min))
  stopifnot(is.numeric(DAT_max))

  weights <- abs(DAT_min - DAT_max)

  if (any(weights == 0)) {
    if (verbose) {
      msg <- paste0("DAT_min and DAT_max at Index: ",
                    paste(which(weights == 1), collapse = ", "),
                    " have the same value! Is this correct? ",
                    "If unsure, check your data for possible errors.")
      message(msg)
    }
    # set weight to 1 to treat objects with same min and max
    # dating (dated to one year precisely) as very influential
    # will have the same weight as objects dated to two years,
    # (which may also equal a span of 1 year)
    weights[which(weights == 0)] <- 1
  }
  # weights have to be below 1
  weights <- 1 / weights

  return(weights)
}


#' @title Calculate the probability for each year and each dated object
#'
#' @description Calculates the probability of each object being dated into
#' each year / timeslot from two vectors of minimum and maximum
#' dating. Returns a vector of probabilities.
#'
#' @inheritParams get.weights
#'
#' @return a vector of probabilities for each object being dated to any
#' single year within the timespan (lesser value means object is dated to
#' larger timespans, i.e. with less confidence).
#'
#' @seealso [datsteps()], [get.weights()]
#'
#' @export get.probability
get.probability <- function(DAT_min, DAT_max) {
  stopifnot(is.numeric(DAT_min))
  stopifnot(is.numeric(DAT_max))

  # calculate the dating probability
  # (thanks to Christian Gugl for requesting this)
  prob <- abs(DAT_min - DAT_max)
  prob <- prob + 1
  prob <- 1 / prob

  return(prob)
}


#' @title Calculate the sequence of dating steps
#'
#' @description
#' Produces an appropriate sequence of years between the minimum
#' and maximum dating.
#'
#' If they cannot be properly divided by the stepsize set
#' beforehand, either three values are generated for objects that are dated to
#' a range of more then 60% of the stepsize (min, mean, max), or two values for
#' objects dated to a timespan of less or equal to 60% of the stepsize.
#' If they can be divided without residual, the normal sequence is returned.
#' If there is a residual, the stepsize is modified depending on how large the
#' residual is.
#'
#' @param datmin numeric value of the minimum dating of one object
#' @param datmax numeric value of the maximum dating of one object
#' @param stepsize the stepsize to be used
#'
#' @return sequence of steps to be created by [create.sub.objects()]
#'
#' @seealso [datsteps()], [create.sub.objects()]
#'
#' @export get.step.sequence
#'
#' @examples
#' min_year <- -494
#' max_year <- -334
#' sequence <- get.step.sequence(datmin = min_year, datmax = max_year, stepsize = 25)
#' sequence
#'
#' min_year <- 1
#' max_year <- 100
#' sequence <- get.step.sequence(datmin = min_year, datmax = max_year, stepsize = 25)
#' sequence
get.step.sequence <- function(datmin = 0, datmax = 100, stepsize = 25) {

  stopifnot(is.numeric(datmin))
  stopifnot(is.numeric(datmax))
  stopifnot(is.numeric(stepsize))

  # Get the difference of the two dating values
  timespan <- datmax - datmin

  # First: If the stepsize is larger than the timespan, two different
  # strategies can be employed
  if (timespan %/% stepsize == 0) {
    if (timespan > (stepsize * 0.6)) {
      # If the timespan exceeds 60% of the stepsize, three steps will be
      # created corresponding to minimum, mean and maximum dating
      sequence <- c(datmin, round(((datmin + datmax) / 2), digits = 0), datmax)
    } else if (timespan == 0) {
      # for objects dated to one year, only use one year!
      sequence <- datmin
    } else {
      # if the timespan is less than 60% of the stepsize, just two values
      # corresponding to minimum and maximum dating will be returned
      sequence <- c(datmin, datmax)
    }
  } else {
    # If the timespan can be devided at least once, first generate the sequence
    sequence <- seq(from = datmin, to = datmax, by = stepsize)
    # then check how many years the maximum dating would be off
    resid <- datmax - sequence[length(sequence)]
    if (resid >= (stepsize / 2)) {
      # if the residual is larger or equals half the stepsize, the stepsize is
      # temporarily modified to fit the as many values
      # as it would with the length of the sequence generated
      stepsize_mod <- (datmax - datmin) / (length(sequence) + 1)
      sequence <- seq(datmin, datmax, stepsize_mod)
      # then rounds all values except first and last, which need to stay as
      # minimum and maximum date
      sequence[-c(1, length(sequence))] <-
        round(sequence[-c(1, length(sequence))],
              digits = 0)
    } else if (resid != 0) {
      # if the residual is smaller but also not 0, the sequence values are moved
      # by an appropriate fraction
      move <- round(resid / (length(sequence) - 1), digits = 0)
      sequence[2:length(sequence)] <- sequence[2:length(sequence)] + move
      # and the end of the sequence is reset as the maximum dating
      sequence[length(sequence)] <- datmax
      # TODO: these two things do essentially the same? I need to fix the first
      # one to use the largest possible division, maybe
    } else {
      # this implies that there was no residual, so the original
      # sequence can be used
    }
  }
  # returns the sequence
  return(sequence)
}


#' @title Create sub-objects for each object in a dataframe (internal)
#'
#' @description Requires a list with named vectors as [datsteps()] will
#' hand to the function.
#'
#' @param DAT_list a list as prepared by [datsteps()]
#' @inheritParams datsteps
#'
#' @return an expanded list of the same structure to be further processed by
#' [datsteps()] each object duplicated according to the number of steps required
#'
#' @keywords internal

create.sub.objects <- function(DAT_list,
                               stepsize,
                               calc = "weight",
                               cumulative = FALSE) {

  diffs <- unlist(lapply(DAT_list, function(x) x["datmax"] - x["datmin"]))

  switch (calc,
          weight = diffs[diffs == 0] <- 1,
          probability = diffs <- diffs + 1
  )


  if (any(diffs < stepsize)) {
    warning(paste0("stepsize is larger than the range of the ",
                   "closest dated object at Index = ",
                   paste(which(diffs < stepsize), collapse = ", "), "). ",
                   "This is not recommended. ",
                   "For information see documentation of get.step.sequence()."))
  }

  DAT_list <- lapply(DAT_list, function(object) {
    sequence <- get.step.sequence(object["datmin"], object["datmax"],
                                  stepsize)
    new_object <- lapply(sequence, function(step) {
      new_object <- object
      new_object["step"] <- step
      return(new_object)
    })
    names(new_object) <- NULL
    new_object <- do.call(rbind, new_object)
    if (cumulative) {
      cumul_prob <- cumsum(new_object[, calc])
      new_object <- cbind(new_object, cumul_prob)
    }
    return(new_object)
  })



  result <- do.call(rbind, DAT_list)

  switch(calc,
         weight = attr <- "Calculated weight of each object according to doi.org/10.1017/aap.2021.8",
         probability = attr <- "year-wise probability of each object")

  attributes(result)$calc <- c(calc, attr)

  return(result)
}

#' @title Check if the structure is compatible with [datsteps()]  (internal)
#'
#' @description Checks if the object passed to [datsteps()] can be used for
#' processing.
#'
#' @param DAT_df An object to check
#' @inheritParams datsteps
#'
#' @return TRUE if object can be processed by [datsteps()], error / FALSE if not
#'
#' @keywords internal

check.structure <- function(DAT_df, verbose = FALSE) {
  dat_df_structure <- c(NA, NA, NA, NA, NA)
  names(dat_df_structure) <- c("is.df", "is.id", "is.var",
                               "is.minDAT", "is.maxDAT")
  # Todo
  dat_df_structure["is.df"] <- is.data.frame(DAT_df)
  dat_df_structure["is.id"] <- is.character(DAT_df[, 1, drop = TRUE])
  dat_df_structure["is.var"] <- is.factor(DAT_df[, 2, drop = TRUE])
  dat_df_structure["is.minDAT"] <- is.numeric(DAT_df[, 3, drop = TRUE])
  dat_df_structure["is.maxDAT"] <- is.numeric(DAT_df[, 4, drop = TRUE])


  if (dat_df_structure[1] == FALSE) {
    result <- FALSE
    stop("datsteps requires an object of class data.frame")
  } else {
    result <- TRUE
    }
  if (any(dat_df_structure[c("is.minDAT", "is.maxDAT")] == FALSE)) {
    result <- FALSE
    stop("The 3rd and 4th columns of your data.frame have to be numeric.")
  } else {
    result <- TRUE
    }
  if (any(dat_df_structure[2:3] == FALSE) & verbose) {
    message(paste0("It is recommended to use ",
                   "character vector for the 'ID'-column (1) ",
                   "and ",
                   "factor for the 'variable'-column (2)."))
  }
  return(result)
}

Try the datplot package in your browser

Any scripts or data that you put into this service are public.

datplot documentation built on Nov. 19, 2023, 1:09 a.m.