#' Transfers an attribute (time, age, depth) from one sequence to another
#'
#' @description Transfers an attribute (generally time/age, but any others are possible) from one sequence (defined by the argument \code{transfer.from}) to another (defined by the argument \code{transfer.to}) lacking it. The transference of the attribute is based on the following assumption: similar samples have similar attributes. This assumption might not hold for noisy multivariate time-series. Attribute transference can be done in two different ways (defined by the \code{mode} argument):
#' \itemize{
#' \item \emph{Direct}: transfers the selected attribute between samples with the maximum similarity. This option will likely generate duplicated attribute values in the output.
#' \item \emph{Interpolate}: obtains new attribute values through weighted interpolation, being the weights derived from the distances between samples
#' }
#'
#' @usage workflowTransfer(
#' sequences = NULL,
#' grouping.column = NULL,
#' time.column = NULL,
#' exclude.columns = NULL,
#' method = "manhattan",
#' transfer.what = NULL,
#' transfer.from = NULL,
#' transfer.to = NULL,
#' mode = "direct",
#' plot = FALSE
#' )
#'
#' @param sequences dataframe with multiple sequences identified by a grouping column generated by \code{\link{prepareSequences}}.
#' @param grouping.column character string, name of the column in \code{sequences} to be used to identify separates sequences within the file.
#' @param time.column character string, name of the column with time/depth/rank data.
#' @param exclude.columns character string or character vector with column names in \code{sequences} to be excluded from the analysis.
#' @param method character string naming a distance metric. Valid entries are: "manhattan", "euclidean", "chi", and "hellinger". Invalid entries will throw an error.
#' @param transfer.what character string, column of \code{sequences} with the attribute to be transferred. If empty or ill-defined, \code{time.column} is used instead if available.
#' @param transfer.from character string, group available in \code{grouping.column} identifying the sequence from which to take the attribute values.
#' @param transfer.to character string, group available in \code{grouping.column} identifying the sequence to which transfer the attribute values.
#' @param mode character string, one of: "direct" (default), "interpolate".
#' @param plot boolean, if \code{TRUE}, plots the distance matrix and the least-cost path.
#'
#' @return A dataframe with the sequence \code{transfer.to}, with a column named after \code{transfer.what} with the attribute values.
#'
#' @author Blas Benito <blasbenito@gmail.com>
#'
#' @examples
#'
#' \donttest{
#'
#' #loading sample dataset
#' data(pollenGP)
#' #subset pollenGP to make a shorter dataset
#' pollenGP <- pollenGP[1:50, ]
#'
#' #generating a subset of pollenGP
#' set.seed(10)
#' pollenX <- pollenGP[sort(sample(1:50, 40)), ]
#'
#' #we separate the age column
#' pollenX.age <- pollenX$age
#'
#' #and remove the age values from pollenX
#' pollenX$age <- NULL
#' pollenX$depth <- NULL
#'
#' #removing some samples from pollenGP
#' #so pollenX is not a perfect subset of pollenGP
#' pollenGP <- pollenGP[-sample(1:50, 10), ]
#'
#' #prepare sequences
#' GP.X <- prepareSequences(
#' sequence.A = pollenGP,
#' sequence.A.name = "GP",
#' sequence.B = pollenX,
#' sequence.B.name = "X",
#' grouping.column = "id",
#' time.column = "age",
#' exclude.columns = "depth",
#' transformation = "none"
#' )
#'
#'#transferring age
#'X.new <- workflowTransfer(
#' sequences = GP.X,
#' grouping.column = "id",
#' time.column = "age",
#' method = "manhattan",
#' transfer.what = "age",
#' transfer.from = "GP",
#' transfer.to = "X",
#' mode = "interpolated"
#' )
#'
#' }
#'
#' @export
workflowTransfer <- function(
sequences = NULL,
grouping.column = NULL,
time.column = NULL,
exclude.columns = NULL,
method = "manhattan",
transfer.what = NULL,
transfer.from = NULL,
transfer.to = NULL,
mode = "direct",
plot = FALSE
){
#checking transfer.from
if(!(transfer.from) %in% sequences[, grouping.column]){
stop("Argument 'transfer.from' must be one of the groups defined by 'grouping.column'")
}
#checking transfer.to
if(!(transfer.to) %in% sequences[, grouping.column]){
stop("Argument 'transfer.to' must be one of the groups defined by 'grouping.column'")
}
#checking transfer.what and time.column
if(is.null(transfer.what)){
if(is.null(time.column)){
stop("Arguments 'transfer.what' and 'time.column' cannot be NULL at the same time.")
} else {
if(time.column %in% colnames(sequences)){
transfer.what <- time.column
}
}
} else {
if(!(transfer.what %in% colnames(sequences))){
if(!is.null(time.column)){
if(time.column %in% colnames(sequences)){
transfer.what <- time.column
}
}
}
}
#checking mode
direct.strings <- c("direct", "Direct", "DIRECT", "d", "D", "dir", "Dir", "DIR")
interpolate.strings <- c("interpolate", "Interpolate", "INTERPOLATE", "int", "Int", "INT", "i", "I", "interpolated", "Interpolated", "INTERPOLATED")
if(!(mode %in% c(direct.strings, interpolate.strings))){
warning("Argument 'mode' must be one of: 'direct', 'interpolate'. Setting it to 'direct'.")
mode <- "direct"
}
#separating transfer.from and transfer.to
from.df <- sequences[sequences[, grouping.column] == transfer.from, ]
to.df <- sequences[sequences[, grouping.column] == transfer.to, ]
#computation of distance matrix
distance.matrix <- distanceMatrix(
sequences = rbind(from.df, to.df),
grouping.column = grouping.column,
time.column = time.column,
exclude.columns = exclude.columns,
method = method,
parallel.execution = FALSE
)
#MODE IS direct
#########################
if(mode %in% direct.strings){
#computing least cost matrix
least.cost.matrix <- leastCostMatrix(
distance.matrix = distance.matrix,
diagonal = TRUE,
parallel.execution = FALSE
)
#least cost path
least.cost.path <- leastCostPath(
distance.matrix = distance.matrix,
least.cost.matrix = least.cost.matrix,
diagonal = TRUE,
parallel.execution = FALSE
)
#plot
if(plot == TRUE){
plotMatrix(
distance.matrix = distance.matrix,
least.cost.path = least.cost.path,
color.palette = viridis(100, alpha = 0.7)
)
}
#least cost path to dataframe
least.cost.path.df <- least.cost.path[[1]]
#flip least cost path
least.cost.path.df <- least.cost.path.df[nrow(least.cost.path.df):1, ]
#iterating throug cases in to.df
for(i in 1:nrow(to.df)){
#subset
least.cost.path.df.subset <- least.cost.path.df[least.cost.path.df[, transfer.to] == i, ]
#select sample with minimum distance
selected.sample <- least.cost.path.df.subset[which.min(least.cost.path.df.subset$distance), ]
#transfer
to.df[i, transfer.what] <- from.df[selected.sample[,transfer.from], transfer.what]
}
return(to.df)
} # end of mode == "direct"
#MODE is "interpolate"
###########################
if(mode %in% interpolate.strings){
#computing least cost matrix
least.cost.matrix <- leastCostMatrix(
distance.matrix = distance.matrix,
diagonal = FALSE,
parallel.execution = FALSE
)
#least cost path
least.cost.path <- leastCostPath(
distance.matrix = distance.matrix,
least.cost.matrix = least.cost.matrix,
diagonal = FALSE,
parallel.execution = FALSE
)
#plot
if(plot == TRUE){
plotMatrix(
distance.matrix = distance.matrix,
least.cost.path = least.cost.path,
color.palette = viridis(100, alpha = 0.7)
)
}
#least cost path to dataframe
least.cost.path.df <- least.cost.path[[1]]
#flip least cost path
least.cost.path.df <- least.cost.path.df[nrow(least.cost.path.df):1, ]
#iterating throug cases in to.df
################################
################################
for(k in 1:nrow(to.df)){
#1. IT IS REPEATED AND THE DISTANCES TO ADJACENT SAMPLES ARE IN PATH
###############################################################
if(sum(least.cost.path.df[, transfer.to] == k) > 1){
#subset
least.cost.path.df.subset <- least.cost.path.df[least.cost.path.df[, transfer.to] == k, ]
#select sample with minimum distance
selected.sample.i <- least.cost.path.df.subset[which.min(least.cost.path.df.subset$distance), ]
#subset again to keep the ones +1 and -1 the selected sample
least.cost.path.df.subset <- least.cost.path.df.subset[least.cost.path.df.subset[, transfer.from] %in% c(selected.sample.i[, transfer.from] - 1, selected.sample.i[, transfer.from] + 1), ]
#remove this sample from the subset
if(nrow(least.cost.path.df.subset) > 1){
selected.sample.j <- least.cost.path.df.subset[which.min(least.cost.path.df.subset$distance), ]
} else {
selected.sample.j <- least.cost.path.df.subset
}
#getting indices of samples i and j
i <- selected.sample.i[, transfer.from]
j <- selected.sample.j[, transfer.from]
}#end of 1.
#2. IT IS NOT REPEATED, AND THE DISTANCES TO ADJACENT SAMPLES ARE IN THE DISTANCE MATRIX
#############################################################
if(sum(least.cost.path.df[, transfer.to] == k) == 1){
#get selected sample i
selected.sample.i <- least.cost.path.df[least.cost.path.df[, transfer.to] == k, ]
i <- selected.sample.i[, transfer.from]
#select sample j
#matrix columns are transfer.to
#matrix rows are transfer.from
if(i == 1){
j <- 2
} else {
if(i == nrow(distance.matrix[[1]]) | i + 1 > nrow(distance.matrix[[1]])){
j <- i - 1
} else {
if(i - 1 < 1){
j <- i + 1
} else {
j <- c(i+1, i-1)
names(j) <- c("plus", "minus")
}
}
}
#if j has two elements, get the distances with k
if(length(j) == 2){
#computing distances for both js
Dj <- distance.matrix[[1]][j, k]
#getting the j with the minimum distance
j <- j[which.min(Dj)]
#removing names
names(j) <- NULL
}
}# end of 2.
#computing variables needed to interpolate the attribute
#ages
Ati <- from.df[i, transfer.what]
Atj <- from.df[j, transfer.what]
#distances
DBkAi <- distance.matrix[[1]][i, k]
DBkAj <- distance.matrix[[1]][j, k]
#normalizing the distances to 1
wi <- DBkAi / (DBkAi + DBkAj)
wj <- DBkAj / (DBkAi + DBkAj)
#computing Btk
Btk <- wi * Ati + wj * Atj
#checking if the attribute is higher than the previous one
#tries to increase j
#if still doesn't work, adds a NA
if(k > 1){
#finding the previous non-NA age
n <- 1
while(is.na(to.df[k - n, transfer.what])){
n <- n + 1
}
#if the interpolated age is lower than the previous one
#set it to NA
if(Btk < to.df[k - n, transfer.what]){
Btk <- NA
}
}
#adding it
to.df[k, transfer.what] <- Btk
}#end of k loop
return(to.df)
}#end of mode == "interpolate"
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.