#' A refactored version of \code{\link{workflowPsi}} with a higher performance (hence the suffix HP).
#'
#'Ideal for large analyses with hundreds to thousands of sequences. Several options available in \code{\link{workflowPsi}} have been removed from this function in order to simplify the code as much as possible. Psi is computed with the options \code{diagonal = TRUE}, \code{ignore.blocks = TRUE}, and \code{method = "euclidean"}.
#'
#'Due to limitations of the function \code{\link[arrangements]{permutations}}, the maximum number of groups (according to \code{grouping.column}) is around 30000. Besides, a combinations table of this size takes, roughlyl, 7GB of memory.
#'
#' @usage workflowPsiHP(
#' sequences = NULL,
#' grouping.column = NULL,
#' time.column = NULL,
#' exclude.columns = NULL,
#' parallel.execution = TRUE
#' )
#'
#' @param sequences dataframe with multiple sequences identified by a grouping column generated by \code{\link{prepareSequences}}.
#' @param grouping.column character string, name of the column in \code{sequences} to be used to identify separates sequences within the file.
#' @param time.column character string, name of the column with time/depth/rank data.
#' @param exclude.columns character string or character vector with column names in \code{sequences} to be excluded from the analysis.
#' @param parallel.execution boolean, if \code{TRUE} (default), execution is parallelized, and serialized if \code{FALSE}.
#'
#' @return A dataframe with sequence names and psi values.
#'
#' @author Blas Benito <blasbenito@gmail.com>
#'
#' @examples
#'
#' \donttest{
#' data("sequencesMIS")
#' #prepare sequences
#' MIS.sequences <- prepareSequences(
#' sequences = sequencesMIS[sequencesMIS$MIS %in% c("MIS-1", "MIS-2"), ],
#' grouping.column = "MIS",
#' if.empty.cases = "zero",
#' transformation = "hellinger"
#' )
#'
#'#execute workflow to compute psi
#'MIS.psi <- workflowPsiHP(
#' sequences = MIS.sequences,
#' grouping.column = "MIS",
#' parallel.execution = FALSE
#' )
#'
#'MIS.psi
#'
#'}
#'
#' @export
workflowPsiHP <- function(sequences = NULL,
grouping.column = NULL,
time.column = NULL,
exclude.columns = NULL,
parallel.execution = TRUE){
#PREPARING sequences
####################
#removing time column
if(!is.null(time.column)){
if(time.column %in% colnames(sequences)){
sequences[, time.column] <- NULL
}
}
#removing exclude columns
if(!is.null(exclude.columns)){
sequences <- sequences[,!(colnames(sequences) %in% exclude.columns)]
}
#to data.table
sequences <- data.table::data.table(
sequences,
stringsAsFactors = FALSE
)
# data.table::setkey(sequences)
#select numeric columns
numeric.cols <- colnames(sequences)[which(
as.vector(
sequences[,lapply(.SD, class)]) %in% c("numeric", "integer")
)]
numeric.cols <- numeric.cols[numeric.cols != grouping.column]
#generating combinations
combinations <- arrangements::combinations(unique(sequences[, get(grouping.column)]), k = 2)
#number of combinations
n.iterations <- nrow(combinations)
#parallel execution = TRUE
if(parallel.execution == TRUE){
`%dopar%` <- foreach::`%dopar%`
n.cores <- parallel::detectCores() - 1
if(n.iterations < n.cores){n.cores <- n.iterations}
if(.Platform$OS.type == "windows"){
my.cluster <- parallel::makeCluster(n.cores, type="PSOCK")
} else {
my.cluster <- parallel::makeCluster(n.cores, type="FORK")
}
doParallel::registerDoParallel(my.cluster)
#exporting cluster variables
parallel::clusterExport(cl=my.cluster,
varlist=c('combinations',
'sequences',
'distance',
'numeric.cols',
'grouping.column'),
envir=environment()
)
} else {
#replaces dopar (parallel) by do (serial)
`%dopar%` <- foreach::`%do%`
on.exit(`%dopar%` <- foreach::`%dopar%`)
}
#parallelized loop
##################
##################
psi.output <- foreach::foreach(i=1:n.iterations, .combine = "c", .inorder = TRUE) %dopar% {
#getting combination
combination <- combinations[i, ]
#separating sequences
sequence.A <- sequences[get(grouping.column) == combination[1], ..numeric.cols]
sequence.B <- sequences[get(grouping.column) == combination[2], ..numeric.cols]
#computing euclidean distance matrix
####################################
distance.matrix <- fields::rdist(
x1 = sequence.A,
x2 = sequence.B
)
#computing least cost matrix
############################
#dimensions
least.cost.rows <- nrow(distance.matrix)
least.cost.columns <- ncol(distance.matrix)
#matrix to store least cost
least.cost.matrix <- matrix(NA, nrow = least.cost.rows, ncol = least.cost.columns)
least.cost.matrix[1,1] <- distance.matrix[1,1]
#initiating first columns and rows
least.cost.matrix[1, ] <- cumsum(distance.matrix[1, ])
least.cost.matrix[, 1] <- cumsum(distance.matrix[, 1])
#dynamic programming algorithm
for (column in 1:(least.cost.columns - 1)){
for (row in 1:(least.cost.rows - 1)){
next.row <- row + 1
next.column <- column + 1
#find minimum neighbor
least.cost.matrix[next.row, next.column] <- min(
least.cost.matrix[row, next.column],
least.cost.matrix[next.row, column],
least.cost.matrix[row, column]
) + distance.matrix[next.row, next.column]
}
}
#computing least cost path
###########################
#dataframe to store the path
path <- data.table::data.table(
A = c(
least.cost.rows,
rep(NA, least.cost.rows + least.cost.columns)
),
B = c(
least.cost.columns,
rep(NA,least.cost.rows + least.cost.columns)
),
distance = c(
distance.matrix[least.cost.rows, least.cost.columns],
rep(NA,least.cost.rows + least.cost.columns)
),
cumulative.distance = c(
least.cost.matrix[least.cost.rows, least.cost.columns],
rep(NA,least.cost.rows + least.cost.columns)
)
)
#defining coordinates of the focal cell
focal.row <- as.numeric(path[1, "A"])
focal.column <- as.numeric(path[1, "B"])
#going through the matrix
path.row <- 1L
repeat{
#defining values o focal row
focal.cumulative.cost <- least.cost.matrix[focal.row, focal.column]
focal.cost <- distance.matrix[focal.row, focal.column]
#SCANNING NEIGHBORS
neighbors <- matrix(c(focal.row-1, focal.row-1, focal.row, focal.column, focal.column-1, focal.column-1), ncol = 2)
#removing neighbors with coordinates lower than 1 (out of bounds)
neighbors[neighbors < 1] <- NA
neighbors <- stats::na.omit(neighbors)
n.neighbors <- nrow(neighbors)
if(n.neighbors == 0){break}
#adding cols
neighbors <- cbind(neighbors, rep(NA, n.neighbors), rep(NA, n.neighbors))
#computing cost and cumulative cost values for the neighbors
if(n.neighbors > 1){
neighbors[,3] <- diag(distance.matrix[neighbors[,1], neighbors[,2]])
neighbors[,4] <- diag(x = least.cost.matrix[neighbors[,1], neighbors[,2]])
}else{
neighbors[,3] <- distance.matrix[neighbors[,1], neighbors[,2]]
neighbors[,4] <- least.cost.matrix[neighbors[,1], neighbors[,2]]
}
#getting the neighbor with a minimum distance
neighbors <- neighbors[which.min(neighbors[,4]), ]
#putting them together
path.row <- path.row + 1L
data.table::set(path, path.row, names(path), as.list(neighbors))
#new focal cell
focal.row <- neighbors[1]
focal.column <- neighbors[2]
}#end of repeat
#removing matrices
rm(distance.matrix, least.cost.matrix)
#removing empty rows in path
path <- stats::na.omit(path)
#renaming path
sequence.names <- combination
colnames(path)[1] <- sequence.names[1]
colnames(path)[2] <- sequence.names[2]
#REMOVING BLOCKS leastCostPathNoBlocks
#####################################
#add keep column
path[, "keep"] <- NA
#keeping first and last rows
path[c(1, nrow(path)) , "keep"] <- TRUE
#vectors to introduce used indices
used <- list()
# used[[1]] <- vector()
# used[[2]] <- vector()
used[[1]] <- used[[2]]<- rep(NA, nrow(path))
names(used) <- sequence.names
#starting values for dynamic variables
target.index <- 1
target.sequence <- sequence.names[1]
#switch if this index is not repeated in this sequence
if(!(sum(path[,get(target.sequence)] == target.index)) > 1){
target.sequence <- sequence.names[sequence.names != target.sequence]
}
#adding 1 to used
used[[target.sequence]] <- c(used[[target.sequence]], as.numeric(path[target.index, get(target.sequence)]))
#first row of path
j <- 2
#iterating through AB.index.unique
##################################
while(j < (nrow(path) - 1)){
#IS REPEATED?
if(path[j, get(target.sequence)] %in% used[[target.sequence]]){
#IS EQUAL TO THE NEXT ONE?
if(path[j, get(target.sequence)] == path[j + 1, get(target.sequence)]){
#YES
j <- j + 1
next
#NO
} else {
#ADD IT
path[j , "keep"] <- TRUE
#SWITCH
target.sequence <- sequence.names[sequence.names != target.sequence]
#add index to used
used[[target.sequence]] <- c(used[[target.sequence]], as.numeric(path[j, get(target.sequence)]))
j <- j + 1
next
}
} else {
#ADD IT
path[j , "keep"] <- TRUE
#SWITCH
target.sequence <- sequence.names[sequence.names != target.sequence]
#add index to used
used[[target.sequence]] <- c(used[[target.sequence]], unlist(path[j, get(target.sequence)]))
j <- j + 1
next
}
}#end of while
#removing na
path <- stats::na.omit(path)
#GETTING LEAST COST leastCost
#############################
optimal.cost <- sum(path$distance)
#AUTOSUM autoSum
############################
#distances
distances.A <- vector()
distances.B <- vector()
#subsetting sequences to remove samples in blocks
sequence.A <- sequence.A[sort(unique(path[,get(sequence.names[1])])), ]
sequence.B <- sequence.B[sort(unique(path[,get(sequence.names[2])])), ]
#removing NA
sequence.A <- stats::na.omit(sequence.A)
sequence.B <- stats::na.omit(sequence.B)
#updating number of rows
nrow.sequence.A <- nrow(sequence.A)
nrow.sequence.B <- nrow(sequence.B)
#autosum A
distances.A <- sequence.A[, distance := sqrt(
rowSums(
(.SD - data.table::shift(
x = .SD,
n = 1,
fill = NA,
type = "lag"
)
)^2)
), .SDcols = numeric.cols]$distance
#autosum B
distances.B <- sequence.B[, distance := sqrt(
rowSums(
(.SD - data.table::shift(
x = .SD,
n = 1,
fill = NA,
type = "lag")
)^2)
), .SDcols = numeric.cols]$distance
#autosum of distances
sum.autosum <- sum(distances.A, distances.B, na.rm = TRUE)
#PSI
###########################
psi.value <- (((optimal.cost * 2) - sum.autosum) / sum.autosum) + 1
}#end of parallelized loop
#closing cluster
if(parallel.execution == TRUE){
parallel::stopCluster(cl = my.cluster)
}
#preparing output as dataframe
combinations <- data.frame(combinations, stringsAsFactors = FALSE)
combinations[, "psi"] <- psi.output
colnames(combinations) <- c("A", "B", "psi")
#factor to character
combinations$A <- as.character(combinations$A)
combinations$B <- as.character(combinations$B)
return(combinations)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.