distantia: Assessing Dissimilarity Between Multivariate Time Series

Documented in workflowTransfer

#' Transfers an attribute (time, age, depth) from one sequence to another
#'
#' @description Transfers an attribute (generally time/age, but any others are possible) from one sequence (defined by the argument \code{transfer.from}) to another (defined by the argument \code{transfer.to}) lacking it. The transference of the attribute is based on the following assumption: similar samples have similar attributes. This assumption might not hold for noisy multivariate time-series. Attribute transference can be done in two different ways (defined by the \code{mode} argument):
#' \itemize{
#' \item \emph{Direct}: transfers the selected attribute between samples with the maximum similarity. This option will likely generate duplicated attribute values in the output.
#' \item \emph{Interpolate}: obtains new attribute values through weighted interpolation, being the weights derived from the distances between samples
#' }
#'
#' @usage workflowTransfer(
#'   sequences = NULL,
#'   grouping.column = NULL,
#'   time.column = NULL,
#'   exclude.columns = NULL,
#'   method = "manhattan",
#'   transfer.what = NULL,
#'   transfer.from = NULL,
#'   transfer.to = NULL,
#'   mode = "direct",
#'   plot = FALSE
#'   )
#'
#' @param sequences dataframe with multiple sequences identified by a grouping column generated by \code{\link{prepareSequences}}.
#' @param grouping.column character string, name of the column in \code{sequences} to be used to identify separates sequences within the file.
#' @param time.column character string, name of the column with time/depth/rank data.
#' @param exclude.columns character string or character vector with column names in \code{sequences} to be excluded from the analysis.
#' @param method character string naming a distance metric. Valid entries are: "manhattan", "euclidean", "chi", and "hellinger". Invalid entries will throw an error.
#' @param transfer.what character string, column of \code{sequences} with the attribute to be transferred. If empty or ill-defined, \code{time.column} is used instead if available.
#' @param transfer.from character string, group available in \code{grouping.column} identifying the sequence from which to take the attribute values.
#' @param transfer.to character string, group available in \code{grouping.column} identifying the sequence to which transfer the attribute values.
#' @param mode character string, one of: "direct" (default), "interpolate".
#' @param plot boolean, if \code{TRUE}, plots the distance matrix and the least-cost path.
#'
#' @return A dataframe with the sequence \code{transfer.to}, with a column named after \code{transfer.what} with the attribute values.
#'
#' @author Blas Benito <blasbenito@gmail.com>
#'
#' @examples
#'
#' \donttest{
#'
#' #loading sample dataset
#' data(pollenGP)
#' #subset pollenGP to make a shorter dataset
#' pollenGP <- pollenGP[1:50, ]
#'
#' #generating a subset of pollenGP
#' set.seed(10)
#' pollenX <- pollenGP[sort(sample(1:50, 40)), ]
#'
#' #we separate the age column
#' pollenX.age <- pollenX$age
#'
#' #and remove the age values from pollenX
#' pollenX$age <- NULL
#' pollenX$depth <- NULL
#'
#' #removing some samples from pollenGP
#' #so pollenX is not a perfect subset of pollenGP
#' pollenGP <- pollenGP[-sample(1:50, 10), ]
#'
#' #prepare sequences
#' GP.X <- prepareSequences(
#'   sequence.A = pollenGP,
#'   sequence.A.name = "GP",
#'   sequence.B = pollenX,
#'   sequence.B.name = "X",
#'   grouping.column = "id",
#'   time.column = "age",
#'   exclude.columns = "depth",
#'   transformation = "none"
#'   )
#'
#'#transferring age
#'X.new <- workflowTransfer(
#'  sequences = GP.X,
#'  grouping.column = "id",
#'  time.column = "age",
#'  method = "manhattan",
#'  transfer.what = "age",
#'  transfer.from = "GP",
#'  transfer.to = "X",
#'  mode = "interpolated"
#'  )
#'
#' }
#'
#' @export
workflowTransfer <- function(
  sequences = NULL,
  grouping.column = NULL,
  time.column = NULL,
  exclude.columns = NULL,
  method = "manhattan",
  transfer.what = NULL,
  transfer.from = NULL,
  transfer.to = NULL,
  mode = "direct",
  plot = FALSE
){

  #checking transfer.from
  if(!(transfer.from) %in% sequences[, grouping.column]){
    stop("Argument 'transfer.from' must be one of the groups defined by 'grouping.column'")
  }

  #checking transfer.to
  if(!(transfer.to) %in% sequences[, grouping.column]){
    stop("Argument 'transfer.to' must be one of the groups defined by 'grouping.column'")
  }

  #checking transfer.what and time.column
  if(is.null(transfer.what)){
    if(is.null(time.column)){
      stop("Arguments 'transfer.what' and 'time.column' cannot be NULL at the same time.")
    } else {
      if(time.column %in% colnames(sequences)){
        transfer.what <- time.column
      }
    }
  } else {
    if(!(transfer.what %in% colnames(sequences))){
      if(!is.null(time.column)){
        if(time.column %in% colnames(sequences)){
          transfer.what <- time.column
        }
      }
    }
  }

  #checking mode
  direct.strings <- c("direct", "Direct", "DIRECT", "d", "D", "dir", "Dir", "DIR")
  interpolate.strings <- c("interpolate", "Interpolate", "INTERPOLATE", "int", "Int", "INT", "i", "I", "interpolated", "Interpolated", "INTERPOLATED")

  if(!(mode %in% c(direct.strings, interpolate.strings))){
    warning("Argument 'mode' must be one of: 'direct', 'interpolate'. Setting it to 'direct'.")
    mode <- "direct"
  }

  #separating transfer.from and transfer.to
  from.df <- sequences[sequences[, grouping.column] == transfer.from, ]
  to.df <- sequences[sequences[, grouping.column] == transfer.to, ]


  #computation of distance matrix
  distance.matrix <- distanceMatrix(
    sequences = rbind(from.df, to.df),
    grouping.column = grouping.column,
    time.column = time.column,
    exclude.columns = exclude.columns,
    method = method,
    parallel.execution = FALSE
  )


  #MODE IS direct
  #########################
  if(mode %in% direct.strings){

    #computing least cost matrix
    least.cost.matrix <- leastCostMatrix(
      distance.matrix = distance.matrix,
      diagonal = TRUE,
      parallel.execution = FALSE
    )

    #least cost path
    least.cost.path <- leastCostPath(
      distance.matrix = distance.matrix,
      least.cost.matrix = least.cost.matrix,
      diagonal = TRUE,
      parallel.execution = FALSE
    )

    #plot
    if(plot == TRUE){
      plotMatrix(
        distance.matrix = distance.matrix,
        least.cost.path = least.cost.path,
        color.palette = viridis(100, alpha = 0.7)
      )
    }

    #least cost path to dataframe
    least.cost.path.df <- least.cost.path[[1]]

    #flip least cost path
    least.cost.path.df <- least.cost.path.df[nrow(least.cost.path.df):1, ]

    #iterating throug cases in to.df
    for(i in 1:nrow(to.df)){

      #subset
      least.cost.path.df.subset <- least.cost.path.df[least.cost.path.df[, transfer.to] == i, ]

      #select sample with minimum distance
      selected.sample <- least.cost.path.df.subset[which.min(least.cost.path.df.subset$distance), ]

      #transfer
      to.df[i, transfer.what] <- from.df[selected.sample[,transfer.from], transfer.what]

    }

    return(to.df)

  } # end of mode == "direct"


  #MODE is "interpolate"
  ###########################
  if(mode %in% interpolate.strings){

    #computing least cost matrix
    least.cost.matrix <- leastCostMatrix(
      distance.matrix = distance.matrix,
      diagonal = FALSE,
      parallel.execution = FALSE
    )

    #least cost path
    least.cost.path <- leastCostPath(
      distance.matrix = distance.matrix,
      least.cost.matrix = least.cost.matrix,
      diagonal = FALSE,
      parallel.execution = FALSE
    )

    #plot
    if(plot == TRUE){
      plotMatrix(
        distance.matrix = distance.matrix,
        least.cost.path = least.cost.path,
        color.palette = viridis(100, alpha = 0.7)
      )
    }

    #least cost path to dataframe
    least.cost.path.df <- least.cost.path[[1]]

    #flip least cost path
    least.cost.path.df <- least.cost.path.df[nrow(least.cost.path.df):1, ]

    #iterating throug cases in to.df
    ################################
    ################################
    for(k in 1:nrow(to.df)){

      #1. IT IS REPEATED AND THE DISTANCES TO ADJACENT SAMPLES ARE IN PATH
      ###############################################################
      if(sum(least.cost.path.df[, transfer.to] == k) > 1){

        #subset
        least.cost.path.df.subset <- least.cost.path.df[least.cost.path.df[, transfer.to] == k, ]

        #select sample with minimum distance
        selected.sample.i <- least.cost.path.df.subset[which.min(least.cost.path.df.subset$distance), ]

        #subset again to keep the ones +1 and -1 the selected sample
        least.cost.path.df.subset <- least.cost.path.df.subset[least.cost.path.df.subset[, transfer.from] %in% c(selected.sample.i[, transfer.from] - 1, selected.sample.i[, transfer.from] + 1), ]

        #remove this sample from the subset
        if(nrow(least.cost.path.df.subset) > 1){
          selected.sample.j <- least.cost.path.df.subset[which.min(least.cost.path.df.subset$distance), ]
        } else {
          selected.sample.j <- least.cost.path.df.subset
        }

        #getting indices of samples i and j
        i <- selected.sample.i[, transfer.from]
        j <- selected.sample.j[, transfer.from]

      }#end of 1.


      #2. IT IS NOT REPEATED, AND THE DISTANCES TO ADJACENT SAMPLES ARE IN THE DISTANCE MATRIX
      #############################################################
      if(sum(least.cost.path.df[, transfer.to] == k) == 1){

        #get selected sample i
        selected.sample.i <- least.cost.path.df[least.cost.path.df[, transfer.to] == k, ]
        i <- selected.sample.i[, transfer.from]

        #select sample j
        #matrix columns are transfer.to
        #matrix rows are transfer.from
        if(i == 1){
          j <- 2
        } else {
          if(i == nrow(distance.matrix[[1]]) | i + 1 > nrow(distance.matrix[[1]])){
            j <- i - 1
          } else {
            if(i - 1 < 1){
              j <- i + 1
            } else {
              j <- c(i+1, i-1)
              names(j) <- c("plus", "minus")
            }
          }
        }

        #if j has two elements, get the distances with k
        if(length(j) == 2){

          #computing distances for both js
          Dj <- distance.matrix[[1]][j, k]

          #getting the j with the minimum distance
          j <- j[which.min(Dj)]

          #removing names
          names(j) <- NULL
        }

      }# end of 2.

      #computing variables needed to interpolate the attribute
      #ages
      Ati <- from.df[i, transfer.what]
      Atj <- from.df[j, transfer.what]

      #distances
      DBkAi <- distance.matrix[[1]][i, k]
      DBkAj <- distance.matrix[[1]][j, k]

      #normalizing the distances to 1
      wi <- DBkAi / (DBkAi + DBkAj)
      wj <- DBkAj / (DBkAi + DBkAj)

      #computing  Btk
      Btk <- wi * Ati + wj * Atj

      #checking if the attribute is higher than the previous one
      #tries to increase j
      #if still doesn't work, adds a NA
      if(k > 1){

        #finding the previous non-NA age
        n <- 1
        while(is.na(to.df[k - n, transfer.what])){
          n <- n + 1
        }

        #if the interpolated age is lower than the previous one
        #set it to NA
        if(Btk < to.df[k - n, transfer.what]){
              Btk <- NA
              }
        }

      #adding it
      to.df[k, transfer.what] <- Btk

    }#end of k loop

    return(to.df)

  }#end of mode == "interpolate"

}