distantia: Assessing Dissimilarity Between Multivariate Time Series

Documented in workflowPsiHP

#' A refactored version of \code{\link{workflowPsi}} with a higher performance (hence the suffix HP).
#'
#'Ideal for large analyses with hundreds to thousands of sequences. Several options available in \code{\link{workflowPsi}} have been removed from this function in order to simplify the code as much as possible. Psi is computed with the options \code{diagonal = TRUE}, \code{ignore.blocks = TRUE}, and \code{method = "euclidean"}.
#'
#'Due to limitations of the function \code{\link[arrangements]{permutations}}, the maximum number of groups (according to \code{grouping.column}) is around 30000. Besides, a combinations table of this size takes, roughlyl, 7GB of memory.
#'
#' @usage workflowPsiHP(
#'   sequences = NULL,
#'   grouping.column = NULL,
#'   time.column = NULL,
#'   exclude.columns = NULL,
#'   parallel.execution = TRUE
#'   )
#'
#' @param sequences dataframe with multiple sequences identified by a grouping column generated by \code{\link{prepareSequences}}.
#' @param grouping.column character string, name of the column in \code{sequences} to be used to identify separates sequences within the file.
#' @param time.column character string, name of the column with time/depth/rank data.
#' @param exclude.columns character string or character vector with column names in \code{sequences} to be excluded from the analysis.
#' @param parallel.execution boolean, if \code{TRUE} (default), execution is parallelized, and serialized if \code{FALSE}.
#'
#' @return A dataframe with sequence names and psi values.
#'
#' @author Blas Benito <blasbenito@gmail.com>
#'
#' @examples
#'
#' \donttest{
#' data("sequencesMIS")
#' #prepare sequences
#' MIS.sequences <- prepareSequences(
#'   sequences = sequencesMIS[sequencesMIS$MIS %in% c("MIS-1", "MIS-2"), ],
#'   grouping.column = "MIS",
#'   if.empty.cases = "zero",
#'   transformation = "hellinger"
#'   )
#'
#'#execute workflow to compute psi
#'MIS.psi <- workflowPsiHP(
#'  sequences = MIS.sequences,
#'  grouping.column = "MIS",
#'  parallel.execution = FALSE
#'  )
#'
#'MIS.psi
#'
#'}
#'
#' @export
workflowPsiHP <- function(sequences = NULL,
                        grouping.column = NULL,
                        time.column = NULL,
                        exclude.columns = NULL,
                        parallel.execution = TRUE){

  #PREPARING sequences
  ####################
  #removing time column
  if(!is.null(time.column)){
    if(time.column %in% colnames(sequences)){
      sequences[, time.column] <- NULL
    }
  }

  #removing exclude columns
  if(!is.null(exclude.columns)){
    sequences <- sequences[,!(colnames(sequences) %in% exclude.columns)]
  }

  #to data.table
  sequences <- data.table::data.table(
    sequences,
    stringsAsFactors = FALSE
    )
  # data.table::setkey(sequences)

  #select numeric columns
  numeric.cols <- colnames(sequences)[which(
    as.vector(
      sequences[,lapply(.SD, class)]) %in% c("numeric", "integer")
    )]
  numeric.cols <- numeric.cols[numeric.cols != grouping.column]

  #generating combinations
  combinations <- arrangements::combinations(unique(sequences[, get(grouping.column)]), k = 2)

  #number of combinations
  n.iterations <- nrow(combinations)

  #parallel execution = TRUE
  if(parallel.execution == TRUE){
    `%dopar%` <- foreach::`%dopar%`
    n.cores <- parallel::detectCores() - 1
    if(n.iterations < n.cores){n.cores <- n.iterations}

    if(.Platform$OS.type == "windows"){
      my.cluster <- parallel::makeCluster(n.cores, type="PSOCK")
    } else {
      my.cluster <- parallel::makeCluster(n.cores, type="FORK")
    }

    doParallel::registerDoParallel(my.cluster)

    #exporting cluster variables
    parallel::clusterExport(cl=my.cluster,
                            varlist=c('combinations',
                                      'sequences',
                                      'distance',
                                      'numeric.cols',
                                      'grouping.column'),
                            envir=environment()
    )
  } else {
    #replaces dopar (parallel) by do (serial)
    `%dopar%` <- foreach::`%do%`
     on.exit(`%dopar%` <- foreach::`%dopar%`)
  }

  #parallelized loop
  ##################
  ##################
  psi.output <- foreach::foreach(i=1:n.iterations, .combine = "c", .inorder = TRUE) %dopar% {

    #getting combination
    combination <- combinations[i, ]

    #separating sequences
    sequence.A <- sequences[get(grouping.column) == combination[1], ..numeric.cols]
    sequence.B <- sequences[get(grouping.column) == combination[2], ..numeric.cols]


    #computing euclidean distance matrix
    ####################################
    distance.matrix <- fields::rdist(
       x1 = sequence.A,
       x2 = sequence.B
       )


    #computing least cost matrix
    ############################
    #dimensions
    least.cost.rows <- nrow(distance.matrix)
    least.cost.columns <- ncol(distance.matrix)

    #matrix to store least cost
    least.cost.matrix <- matrix(NA, nrow = least.cost.rows, ncol = least.cost.columns)
    least.cost.matrix[1,1] <- distance.matrix[1,1]

    #initiating first columns and rows
    least.cost.matrix[1, ] <- cumsum(distance.matrix[1, ])
    least.cost.matrix[, 1] <- cumsum(distance.matrix[, 1])

    #dynamic programming algorithm
    for (column in 1:(least.cost.columns - 1)){
      for (row in 1:(least.cost.rows - 1)){

        next.row <- row + 1
        next.column <- column + 1

        #find minimum neighbor
        least.cost.matrix[next.row, next.column]  <-  min(
          least.cost.matrix[row, next.column],
          least.cost.matrix[next.row, column],
          least.cost.matrix[row, column]
          ) + distance.matrix[next.row, next.column]

      }
    }


    #computing least cost path
    ###########################
    #dataframe to store the path
    path <- data.table::data.table(
      A = c(
        least.cost.rows,
        rep(NA, least.cost.rows + least.cost.columns)
        ),
      B = c(
        least.cost.columns,
        rep(NA,least.cost.rows + least.cost.columns)
        ),
      distance = c(
        distance.matrix[least.cost.rows, least.cost.columns],
        rep(NA,least.cost.rows + least.cost.columns)
        ),
      cumulative.distance = c(
        least.cost.matrix[least.cost.rows, least.cost.columns],
        rep(NA,least.cost.rows + least.cost.columns)
        )
      )

    #defining coordinates of the focal cell
    focal.row <- as.numeric(path[1, "A"])
    focal.column <- as.numeric(path[1, "B"])

    #going through the matrix
    path.row <- 1L
    repeat{

      #defining values o focal row
      focal.cumulative.cost <- least.cost.matrix[focal.row, focal.column]
      focal.cost <- distance.matrix[focal.row, focal.column]

      #SCANNING NEIGHBORS
      neighbors <- matrix(c(focal.row-1, focal.row-1, focal.row, focal.column, focal.column-1, focal.column-1), ncol = 2)

      #removing neighbors with coordinates lower than 1 (out of bounds)
      neighbors[neighbors < 1] <- NA
      neighbors <- stats::na.omit(neighbors)
      n.neighbors <- nrow(neighbors)
      if(n.neighbors == 0){break}

      #adding cols
      neighbors <- cbind(neighbors, rep(NA, n.neighbors), rep(NA, n.neighbors))

      #computing cost and cumulative cost values for the neighbors
      if(n.neighbors > 1){

        neighbors[,3] <- diag(distance.matrix[neighbors[,1], neighbors[,2]])
        neighbors[,4] <- diag(x = least.cost.matrix[neighbors[,1], neighbors[,2]])

      }else{

        neighbors[,3] <- distance.matrix[neighbors[,1], neighbors[,2]]
        neighbors[,4] <- least.cost.matrix[neighbors[,1], neighbors[,2]]

      }

      #getting the neighbor with a minimum distance
      neighbors <- neighbors[which.min(neighbors[,4]), ]

      #putting them together
      path.row <- path.row + 1L
      data.table::set(path, path.row, names(path), as.list(neighbors))

      #new focal cell
      focal.row <- neighbors[1]
      focal.column <- neighbors[2]

    }#end of repeat

    #removing matrices
    rm(distance.matrix, least.cost.matrix)

    #removing empty rows in path
    path <- stats::na.omit(path)

    #renaming path
    sequence.names <- combination
    colnames(path)[1] <- sequence.names[1]
    colnames(path)[2] <- sequence.names[2]


    #REMOVING BLOCKS leastCostPathNoBlocks
    #####################################
    #add keep column
    path[, "keep"] <- NA

    #keeping first and last rows
    path[c(1, nrow(path)) , "keep"] <- TRUE

    #vectors to introduce used indices
    used <- list()
    # used[[1]] <- vector()
    # used[[2]] <- vector()
    used[[1]] <- used[[2]]<- rep(NA, nrow(path))
    names(used) <- sequence.names

    #starting values for dynamic variables
    target.index <- 1
    target.sequence <- sequence.names[1]

    #switch if this index is not repeated in this sequence
    if(!(sum(path[,get(target.sequence)] == target.index)) > 1){
      target.sequence <- sequence.names[sequence.names != target.sequence]
    }

    #adding 1 to used
    used[[target.sequence]] <- c(used[[target.sequence]], as.numeric(path[target.index, get(target.sequence)]))

    #first row of path
    j <- 2

    #iterating through AB.index.unique
    ##################################
    while(j < (nrow(path) - 1)){

      #IS REPEATED?
      if(path[j, get(target.sequence)] %in% used[[target.sequence]]){

        #IS EQUAL TO THE NEXT ONE?
        if(path[j, get(target.sequence)] == path[j + 1, get(target.sequence)]){

          #YES
          j <- j + 1
          next

          #NO
        } else {

          #ADD IT
          path[j , "keep"] <- TRUE

          #SWITCH
          target.sequence <- sequence.names[sequence.names != target.sequence]

          #add index to used
          used[[target.sequence]] <- c(used[[target.sequence]], as.numeric(path[j, get(target.sequence)]))

          j <- j + 1
          next

        }
      } else {

        #ADD IT
        path[j , "keep"] <- TRUE

        #SWITCH
        target.sequence <- sequence.names[sequence.names != target.sequence]

        #add index to used
        used[[target.sequence]] <- c(used[[target.sequence]], unlist(path[j, get(target.sequence)]))

        j <- j + 1
        next

      }

    }#end of while

    #removing na
    path <- stats::na.omit(path)


    #GETTING LEAST COST leastCost
    #############################
    optimal.cost <- sum(path$distance)


    #AUTOSUM autoSum
    ############################
    #distances
    distances.A <- vector()
    distances.B <- vector()

    #subsetting sequences to remove samples in blocks
    sequence.A <- sequence.A[sort(unique(path[,get(sequence.names[1])])), ]
    sequence.B <- sequence.B[sort(unique(path[,get(sequence.names[2])])), ]

    #removing NA
    sequence.A <- stats::na.omit(sequence.A)
    sequence.B <- stats::na.omit(sequence.B)

    #updating number of rows
    nrow.sequence.A <- nrow(sequence.A)
    nrow.sequence.B <- nrow(sequence.B)

    #autosum A
    distances.A <- sequence.A[, distance := sqrt(
      rowSums(
        (.SD - data.table::shift(
          x = .SD,
          n = 1,
          fill = NA,
          type = "lag"
          )
         )^2)
      ), .SDcols = numeric.cols]$distance

    #autosum B
    distances.B <- sequence.B[, distance := sqrt(
      rowSums(
        (.SD - data.table::shift(
          x = .SD,
          n = 1,
          fill = NA,
          type = "lag")
         )^2)
      ), .SDcols = numeric.cols]$distance

    #autosum of distances
    sum.autosum <- sum(distances.A, distances.B, na.rm = TRUE)


    #PSI
    ###########################
    psi.value <- (((optimal.cost * 2) - sum.autosum) / sum.autosum) + 1

  }#end of parallelized loop

  #closing cluster
  if(parallel.execution == TRUE){
    parallel::stopCluster(cl = my.cluster)
  }

  #preparing output as dataframe
  combinations <- data.frame(combinations, stringsAsFactors = FALSE)
  combinations[, "psi"] <- psi.output
  colnames(combinations) <- c("A", "B", "psi")

  #factor to character
  combinations$A <- as.character(combinations$A)
  combinations$B <- as.character(combinations$B)


  return(combinations)

}