R/workflowSlotting.R

Defines functions workflowSlotting

Documented in workflowSlotting

#' Slots two sequences into a single composite sequence.
#'
#' @description Generates a composite sequence, constrained by sample order, from two sequences, by minimizing the dissimilarity between adjacent samples of each input sequence. The algorithm computes the distance matrix, least cost matrix, and least cost path of two sequences, and uses the least cost path file to find the slotting that better minimizes the dissimilarity between adjacent samples. The algorithm assumes that the samples are not aligned or paired.
#'
#'
#' @usage workflowSlotting(
#'   sequences = NULL,
#'   grouping.column = NULL,
#'   time.column = NULL,
#'   exclude.columns = NULL,
#'   method = "manhattan",
#'   plot = TRUE
#'   )
#'
#' @param sequences dataframe with two sequences identified by a grouping column generated by \code{\link{prepareSequences}}.
#' @param grouping.column character string, name of the column in \code{sequences} to be used to identify separates sequences within the file.
#' @param time.column character string, name of the column with time/depth/rank data.
#' @param exclude.columns character string or character vector with column names in \code{sequences} to be excluded from the analysis.
#' @param method character string naming a distance metric. Valid entries are: "manhattan", "euclidean", "chi", and "hellinger". Invalid entries will throw an error.
#' @param plot boolean, if \code{TRUE}, plots the distance matrix and the least-cost path.
#'
#' @return A dataframe with the same number of rows as \code{sequences}, ordered according to the best solution found by the least-cost algorithm.
#'
#' @author Blas Benito <blasbenito@gmail.com>
#'
#' @examples
#'
#' \donttest{
#' #loading the data
#' data(pollenGP)
#'
#' #getting first 20 samples
#' pollenGP <- pollenGP[1:20, ]
#'
#' #sampling indices
#' set.seed(10) #to get same result every time
#' sampling.indices <- sort(sample(1:20, 10))
#'
#' #subsetting the sequence
#' A <- pollenGP[sampling.indices, ]
#' B <- pollenGP[-sampling.indices, ]
#'
#' #preparing the sequences
#' AB <- prepareSequences(
#'   sequence.A = A,
#'   sequence.A.name = "A",
#'   sequence.B = B,
#'   sequence.B.name = "B",
#'   grouping.column = "id",
#'   exclude.columns = c("depth", "age"),
#'   transformation = "hellinger"
#'   )
#'
#' AB.combined <- workflowSlotting(
#'   sequences = AB,
#'   grouping.column = "id",
#'   time.column = "age",
#'   exclude.columns = "depth",
#'   method = "manhattan",
#'   plot = FALSE
#'   )
#'
#' AB.combined
#'
#'}
#'
#'@export
workflowSlotting <- function(sequences = NULL,
                             grouping.column = NULL,
                             time.column = NULL,
                             exclude.columns = NULL,
                             method = "manhattan",
                             plot = TRUE){

  #checking there are only two sequences
  if(is.null(grouping.column)){
    stop("The argument 'grouping.column' is empty.")
  } else {
    if(length(unique(sequences[, grouping.column])) != 2){
      stop("The argument 'sequences' must be a dataframe with two sequences identified by 'grouping.column'. The dataframe provided does not have two sequences.")
    }
  }

  #distance matrix
  distance.matrix <- distanceMatrix(
    sequences = sequences,
    grouping.column = grouping.column,
    time.column = time.column,
    exclude.columns = exclude.columns,
    method = method,
    parallel.execution = FALSE
  )

  #least cost matrix
  least.cost.matrix <- leastCostMatrix(
    distance.matrix = distance.matrix,
    diagonal = FALSE,
    parallel.execution = FALSE
  )

  #least cost path
  least.cost.path <- leastCostPath(
    distance.matrix = distance.matrix,
    least.cost.matrix = least.cost.matrix,
    diagonal = FALSE,
    parallel.execution = FALSE
  )

  #visualizing output
  if(plot == TRUE){
    plotMatrix(
      distance.matrix = distance.matrix,
      least.cost.path = least.cost.path,
      color.palette = "viridis"
    )
  }

  #sequence names
  sequence.names <- unique(sequences[, grouping.column])

  #extracting and flipping least.cost.path
  path <- least.cost.path[[1]]
  path <- path[nrow(path):1, ]
  rownames(path)<-1:nrow(path)

  #adding index column to the sequences
  for(k in 1:2){
    sequences[sequences[, grouping.column] == sequence.names[k], "index"] <- order(unique(path[, sequence.names[k]]))
  }
  sequences <- data.frame(index = sequences$index, sequences[, colnames(sequences) != "index"])

  #creating output dataframe
  sequences.combined <- sequences[0, ]

  #vectors to introduce used indices
  used <- list()
  used[[1]] <- vector()
  used[[2]] <- vector()
  names(used) <- sequence.names

  #starting values for dynamic variables
  target.index <- 1
  target.sequence <- sequence.names[1]
  #it works better if it starts with the sequence with the first index repeated
  if(!(sum(path[,target.sequence] == target.index) > 1)){
    target.sequence <- sequence.names[sequence.names != target.sequence]
  }

  #first row of path
  j <- 1

  #iterating through AB.index.unique
  ##################################
  while(j < nrow(path)){

    #1. IS NEW?
    ###########
    if(!(path[j, target.sequence] %in% used[[target.sequence]])){

      #1. YES: ADD
      #------------
      sequences.combined <- rbind(sequences.combined, sequences[which(sequences[ , grouping.column] == target.sequence & sequences$index == path[j, target.sequence]), ])

      #adds the index to the used indices
      used[[target.sequence]] <- c(used[[target.sequence]], path[j, target.sequence])

      #2. IS DIFFERENT TO THE NEXT ONE
      ################
      if(path[j, target.sequence] != path[j + 1, target.sequence]){

        #2. YES: NEXT
        #------------
        j <- j + 1
        next

      } else {

        #2. NOPE: SWITCH
        #---------------
        target.sequence <- sequence.names[sequence.names != target.sequence]
        next

      }


    } else {
      #1. NOPE: NEXT
      #-------------
      j <- j + 1
      next
    }

  }

  #adding the last one
  sequences.combined <- rbind(sequences.combined, sequences[which(sequences[ , grouping.column] == target.sequence & sequences$index == path[j, target.sequence]), ])

  #getting index column
  index.column <- sequences.combined$index
  sequences.combined$index <- NULL

  #getting the grouping column
  grouping.column.data <- sequences.combined[, grouping.column]
  sequences.combined[, grouping.column] <- NULL

  #adding it them the beginning
  sequences.combined <- data.frame(groups = grouping.column.data, index = index.column, sequences.combined)
  colnames(sequences.combined)[1] <- grouping.column
  colnames(sequences.combined)[2] <- "original.index"

  return(sequences.combined)

}
BlasBenito/distantia documentation built on Nov. 17, 2023, 11:06 p.m.