# distantia: Assessing Dissimilarity Between Multivariate Time Series

# Documented in workflowPsi

#' Computes the dissimilarity measure \emph{psi} on two or more sequences.
#'
#' @description If the sequences are not aligned (\code{paired.samples = FALSE}), the function executes these steps.
#' \itemize{
#' \item Computes the distance matrix with \code{\link{distanceMatrix}}.
#' \item Uses the distance matrix to compute the least cost matrix with \code{\link{leastCostMatrix}}.
#' \item Computes the least cost path with \code{\link{leastCostPath}} and, if \code{ignore.blocks = TRUE}, removes its blocks with \code{\link{leastCostPathNoBlocks}}.
#' \item Extracts the cost of the least cost path with \code{\link{leastCost}}.
#' \item Computes the autosum of the sequences with \code{\link{autoSum}}.
#' \item Computes the dissimilarity measure \emph{psi} with the function \code{\link{psi}}, and adds 1 to every psi value if \code{diagonal = TRUE}.
#' \item Delivers an output of type "dataframe" (default), "matrix" or "list", depending on the user input, through \code{\link{formatPsi}}.
#' }
#'
#' If the sequences are aligned (\code{paired.samples = TRUE}), these steps are executed:
#' \itemize{
#' \item Sums the distances between paired samples with \code{\link{distancePairedSamples}}.
#' \item Computes the autosum of the sequences with \code{\link{autoSum}}.
#' \item Computes the dissimilarity measure \emph{psi} with the function \code{\link{psi}}, and adds 1 to every psi value.
#' \item Delivers an output of type "dataframe" (default), "matrix" or "list", depending on the user input, through \code{\link{formatPsi}}.
#' }
#'
#' @usage workflowPsi(
#'   sequences = NULL,
#'   grouping.column = NULL,
#'   time.column = NULL,
#'   exclude.columns = NULL,
#'   method = "manhattan",
#'   diagonal = FALSE,
#'   format = "dataframe",
#'   paired.samples = FALSE,
#'   same.time = FALSE,
#'   ignore.blocks = FALSE,
#'   parallel.execution = TRUE
#'   )
#'
#' @param sequences dataframe with multiple sequences identified by a grouping column generated by \code{\link{prepareSequences}}.
#' @param grouping.column character string, name of the column in \code{sequences} to be used to identify separate sequences within the file.
#' @param time.column character string, name of the column with time/depth/rank data.
#' @param exclude.columns character string or character vector with column names in \code{sequences} to be excluded from the analysis.
#' @param method character string naming a distance metric. Valid entries are: "manhattan", "euclidean", "chi", and "hellinger". Invalid entries will throw an error.
#' @param diagonal boolean, if \code{TRUE}, diagonals are included in the computation of the least cost path. Defaults to \code{FALSE}, as the original algorithm did not include diagonals in the computation of the least cost path. If \code{paired.samples} is \code{TRUE}, then \code{diagonal} is irrelevant.
#' @param format string, type of output. Defaults to "dataframe"; "matrix" and "list" are the documented alternatives. If \code{NULL}, empty (\code{""} or zero-length), or "list", the list of psi values is returned as is. Any other value is forwarded to \code{\link{formatPsi}} through its \code{to} argument, so the accepted spellings are the ones that function supports.
#' @param paired.samples boolean, if \code{TRUE}, the sequences are assumed to be aligned, and distances are computed for paired-samples only (no distance matrix required). Default value is \code{FALSE}.
#' @param same.time boolean. If \code{TRUE}, samples in the sequences to compare will be tested to check if they have the same time/age/depth according to \code{time.column}. This argument is only useful when the user needs to compare two sequences taken at different sites but same time frames. Only used when \code{paired.samples = TRUE}.
#' @param ignore.blocks boolean. If \code{TRUE}, the function \code{\link{leastCostPathNoBlocks}} analyzes the least-cost path of the best solution, and removes blocks (straight-orthogonal sections of the least-cost path), which happen in highly dissimilar sections of the sequences, and inflate output psi values. Only used when \code{paired.samples = FALSE}.
#' @param parallel.execution boolean, if \code{TRUE} (default), execution is parallelized, and serialized if \code{FALSE}.
#'
#' @return A list, matrix, or dataframe, with sequence names and psi values.
#'
#' @author Blas Benito <blasbenito@gmail.com>
#'
#' @examples
#'
#' \donttest{
#' data("sequencesMIS")
#' #prepare sequences
#' MIS.sequences <- prepareSequences(
#'   sequences = sequencesMIS,
#'   grouping.column = "MIS",
#'   if.empty.cases = "zero",
#'   transformation = "hellinger"
#'   )
#'
#' #execute workflow to compute psi
#' MIS.psi <- workflowPsi(
#'   sequences = MIS.sequences[MIS.sequences$MIS %in% c("MIS-1", "MIS-2"), ],
#'   grouping.column = "MIS",
#'   time.column = NULL,
#'   exclude.columns = NULL,
#'   method = "manhattan",
#'   diagonal = FALSE,
#'   parallel.execution = FALSE
#'   )
#'
#' MIS.psi
#'
#' }
#'
#' @export
workflowPsi <- function(sequences = NULL,
                        grouping.column = NULL,
                        time.column = NULL,
                        exclude.columns = NULL,
                        method = "manhattan",
                        diagonal = FALSE,
                        format = "dataframe",
                        paired.samples = FALSE,
                        same.time = FALSE,
                        ignore.blocks = FALSE,
                        parallel.execution = TRUE){


  #SAMPLES ARE NOT PAIRED: ELASTIC METHOD
  if(paired.samples == FALSE){

    #computing distance matrix
    distance.matrix <- distanceMatrix(
      sequences = sequences,
      grouping.column = grouping.column,
      time.column = time.column,
      exclude.columns = exclude.columns,
      method = method,
      parallel.execution = parallel.execution
    )

    #computing least cost matrix
    least.cost.matrix <- leastCostMatrix(
      distance.matrix = distance.matrix,
      diagonal = diagonal,
      parallel.execution = parallel.execution
    )

    #computing least cost path
    least.cost.path <- leastCostPath(
      distance.matrix = distance.matrix,
      least.cost.matrix = least.cost.matrix,
      diagonal = diagonal,
      parallel.execution = parallel.execution
    )

    #BLOCKS ARE IGNORED: the least cost path is replaced by its version without blocks
    #(when ignore.blocks is FALSE the path is used as computed above)
    if(ignore.blocks == TRUE){

      least.cost.path <- leastCostPathNoBlocks(
        least.cost.path = least.cost.path,
        parallel.execution = parallel.execution
      )

    }

    #getting least cost from the least cost path (with or without blocks)
    least.cost <- leastCost(
      least.cost.path = least.cost.path,
      parallel.execution = parallel.execution
    )

    #autosum
    autosum.sequences <- autoSum(
      sequences = sequences,
      least.cost.path = least.cost.path,
      grouping.column = grouping.column,
      time.column = time.column,
      exclude.columns = exclude.columns,
      method = method,
      parallel.execution = parallel.execution
    )

    #computing psi
    psi.value <- psi(
      least.cost = least.cost,
      autosum = autosum.sequences,
      parallel.execution = parallel.execution
    )

    #shifting psi by 1 (only when diagonals were used in the least cost path)
    if(diagonal == TRUE){
      psi.value <- lapply(X = psi.value, FUN = function(x){x + 1})
    }

  } else if(paired.samples == TRUE){

    #SAMPLES ARE PAIRED: STEP-LOCK METHOD

    #computing least cost as the sum of distances between paired samples
    least.cost <- distancePairedSamples(
      sequences = sequences,
      grouping.column = grouping.column,
      time.column = time.column,
      exclude.columns = exclude.columns,
      same.time = same.time,
      method = method,
      sum.distances = TRUE,
      parallel.execution = parallel.execution
    )

    #autosum
    #there is no least cost path here, so the summed paired distances are
    #passed in the least.cost.path argument instead
    autosum.sequences <- autoSum(
      sequences = sequences,
      least.cost.path = least.cost,
      grouping.column = grouping.column,
      time.column = time.column,
      exclude.columns = exclude.columns,
      method = method,
      parallel.execution = parallel.execution
    )

    #computing psi
    psi.value <- psi(
      least.cost = least.cost,
      autosum = autosum.sequences,
      parallel.execution = parallel.execution
    )

    #shifting psi by 1
    psi.value <- lapply(X = psi.value, FUN = function(x){x + 1})

  } #end of paired.samples switch


  #formatting psi
  #NULL, zero-length, "" and "list" all mean "return psi.value unformatted".
  #length() is tested first because comparing NULL against a string yields
  #logical(0), which makes if() stop with "argument is of length zero".
  return.unformatted <- length(format) == 0 || isTRUE(format %in% c("", "list"))

  if(!return.unformatted){
    psi.value <- formatPsi(
      psi.values = psi.value,
      to = format
    )
  }

  psi.value
}