#' Get data for calculating distance matrix from phylogenetic profiles
#' @export
#' @usage getDataClustering(data, profileType = "binary", var1AggBy = "max",
#' var2AggBy = "max")
#' @param data a data frame contains processed and filtered profiles (see
#' ?fullProcessedProfile and ?filterProfileData, ?fromInputToProfile)
#' @param profileType type of data used for calculating the distance matrix.
#' Either "binary" (consider only the presence/absence status of orthlogs),
#' "orthoID" (consider ortholog IDs as values for clustering),
#' "var1"/"var2" for taking values of the additional variables into account.
#' Default = "binary".
#' @param var1AggBy aggregate method for VAR1 (min, max, mean or median).
#' Default = "max".
#' @param var2AggBy aggregate method for VAR2 (min, max, mean or median).
#' Default = "max".
#' @return A wide dataframe contains values for calculating distance matrix.
#' @author Carla Mölbert (carla.moelbert@gmx.de), Vinh Tran
#' (tran@bio.uni-frankfurt.de)
#' @importFrom data.table dcast setDT
#' @seealso \code{\link{fromInputToProfile}}
#' @examples
#' data("finalProcessedProfile", package="PhyloProfile")
#' data <- finalProcessedProfile
#' profileType <- "binary"
#' var1AggregateBy <- "max"
#' var2AggregateBy <- "mean"
#' getDataClustering(data, profileType, var1AggregateBy, var2AggregateBy)
getDataClustering <- function(
data = NULL, profileType = "binary", var1AggBy = "max", var2AggBy = "max"
) {
if (is.null(data)) stop("Input data cannot be NULL!")
presSpec <- geneID <- supertaxon <- orthoID <- NULL
# Filter out rows where presSpec is 0
subDataHeat <- data %>% dplyr::filter(presSpec > 0)
# Predict orthoID format (default is "other")
subDataHeat$orthoID <- as.character(subDataHeat$orthoID)
idFormat <- ifelse(
length(subDataHeat$orthoID) > 0 &&
grepl(
subDataHeat$supertaxonID[1],
strsplit(subDataHeat$orthoID[1], '\\|', fixed = TRUE)[[1]][2]
), "bionf", "other"
)
# Process based on profileType
if (profileType == "binary") {
subDataHeat <- subDataHeat %>%
dplyr::select(geneID, supertaxon, presSpec) %>%
dplyr::mutate(presSpec = ifelse(presSpec > 0, 1, 0)) %>%
dplyr::distinct()
wideData <- data.table::dcast(
data.table::setDT(subDataHeat),
geneID ~ supertaxon, value.var = "presSpec"
)
} else if (profileType == "orthoID") {
subDataHeat <- subDataHeat %>%
dplyr::select(geneID, supertaxon, orthoID) %>%
dplyr::mutate(
orthoID = if (idFormat == "bionf") {
paste(
vapply(
strsplit(as.character(orthoID), "\\|"),
`[`, character(1), 2
),
vapply(
strsplit(as.character(orthoID), "\\|"),
`[`, character(1), 3
),
sep = "#"
)
} else {
paste(supertaxon, orthoID, sep = "#")
}
) %>% dplyr::distinct()
# Assign numeric factor to orthoID
subDataHeat <- subDataHeat %>%
dplyr::mutate(orthoID = as.numeric(factor(orthoID)))
wideData <- data.table::dcast(
data.table::setDT(subDataHeat),
geneID ~ supertaxon, value.var = "orthoID", fun.aggregate = sum
)
} else {
var <- profileType
subDataHeat <- subDataHeat %>%
dplyr::select(geneID, supertaxon, !!sym(var)) %>%
dplyr::distinct()
# Aggregate values by selected method
aggregateBy <- ifelse(var == "var1", var1AggBy, var2AggBy)
subDataHeat <- subDataHeat %>%
dplyr::group_by(geneID, supertaxon) %>%
dplyr::summarise(
!!sym(var) := match.fun(aggregateBy)(!!sym(var)),
.groups = 'drop'
)
wideData <- data.table::dcast(
data.table::setDT(subDataHeat),
geneID ~ supertaxon, value.var = var
)
}
# Convert to data frame and set geneID as row names
wideData <- as.data.frame(wideData)
dat <- wideData[, -1]
rownames(dat) <- wideData$geneID
# Replace NA with 0
dat[is.na(dat)] <- 0
return(dat)
}
#' Calculate the distance matrix
#' @export
#' @param profiles dataframe contains profile data for distance calculating
#' (see ?getDataClustering)
#' @param method distance calculation method ("euclidean", "maximum",
#' "manhattan", "canberra", "binary", "distanceCorrelation",
#' "mutualInformation" or "pearson" for binary data; "distanceCorrelation" or
#' "mutualInformation" for non-binary data). Default = "mutualInformation".
#' @return A calculated distance matrix for input phylogenetic profiles.
#' @importFrom bioDist mutualInfo
#' @importFrom bioDist cor.dist
#' @importFrom stats dist as.dist
#' @importFrom energy dcor
#' @importFrom Rfast Dist
#' @author Carla Mölbert (carla.moelbert@gmx.de), Vinh Tran
#' (tran@bio.uni-frankfurt.de)
#' @seealso \code{\link{getDataClustering}}
#' @examples
#' data("finalProcessedProfile", package="PhyloProfile")
#' data <- finalProcessedProfile
#' profileType <- "binary"
#' profiles <- getDataClustering(
#' data, profileType, var1AggregateBy, var2AggregateBy)
#' method <- "mutualInformation"
#' getDistanceMatrix(profiles, method)
getDistanceMatrix <- function(profiles = NULL, method = "mutualInformation") {
if (is.null(profiles)) stop("Profile data cannot be NULL!")
profiles <- profiles[, colSums(profiles != 0) > 0]
profiles <- profiles[rowSums(profiles != 0) > 0, ]
distMethods <- c("euclidean", "maximum", "manhattan", "canberra", "binary")
if (method %in% distMethods) {
distanceMatrix <- Rfast::Dist(profiles, method = method)
colnames(distanceMatrix)<-rownames(distanceMatrix)<-rownames(profiles)
} else if (method == "distanceCorrelation") {
n <- seq_len(nrow(profiles))
matrix <- matrix(0L, nrow = nrow(profiles), ncol = nrow(profiles))
for (i in seq_len(nrow(profiles))) { # rows
p_i <- unlist(profiles[i,])
for (j in seq_len(nrow(profiles))) { # columns
if (i == j) break
matrix[i, j] <- energy::dcor(p_i, unlist(profiles[j,]))
}
}
# Swich the value so that the profiles with a high correlation
# are clustered together
matrix <- 1 - matrix
matrix <- as.data.frame(matrix)
profileNames <- rownames(profiles)
colnames(matrix) <- profileNames[seq_len(length(profileNames)) - 1]
rownames(matrix) <- profileNames
distanceMatrix <- stats::as.dist(matrix)
} else if (method == "mutualInformation") {
distanceMatrix <- bioDist::mutualInfo(as.matrix(profiles))
distanceMatrix <- max(distanceMatrix, na.rm = TRUE) - distanceMatrix
} else if (method == "pearson") {
distanceMatrix <- bioDist::cor.dist(as.matrix(profiles))
}
return(as.matrix(distanceMatrix))
}
#' Create a hclust object from the distance matrix
#' @export
#' @param distanceMatrix calculated distance matrix as dist object
#' @param clusterMethod clustering method ("single", "complete",
#' "average" for UPGMA, "mcquitty" for WPGMA, "median" for WPGMC,
#' or "centroid" for UPGMC). Default = "complete".
#' @return An object class hclust generated based on input distance matrix and
#' a selected clustering method.
#' @author Vinh Tran tran@bio.uni-frankfurt.de
#' @importFrom fastcluster hclust
#' @seealso \code{\link{getDataClustering}},
#' \code{\link{getDistanceMatrix}}, \code{\link{hclust}}
#' @examples
#' data("finalProcessedProfile", package="PhyloProfile")
#' data <- finalProcessedProfile
#' profileType <- "binary"
#' profiles <- getDataClustering(
#' data, profileType, var1AggregateBy, var2AggregateBy)
#' distMethod <- "mutualInformation"
#' distanceMatrix <- getDistanceMatrix(profiles, distMethod)
#' clusterMethod <- "complete"
#' clusterDataDend(as.dist(distanceMatrix), clusterMethod)
clusterDataDend <- function(distanceMatrix = NULL, clusterMethod = "complete") {
if (is.null(distanceMatrix)) stop("Distance matrix cannot be NULL!")
if (!inherits(distanceMatrix,"dist")) stop("Input must be a 'dist' object!")
# Validate clustering method
validMethods <- c(
"complete", "ward.D", "ward.D2", "single", "average", "mcquitty",
"median", "centroid"
)
if (!clusterMethod %in% validMethods) {
stop(
paste(
"Invalid clustering method. Choose one of:",
paste(validMethods, collapse = ", ")
)
)
}
# Perform clustering using fastcluster
dendrogram <- fastcluster::hclust(distanceMatrix, method = clusterMethod)
return(dendrogram)
}
#' Plot dendrogram tree
#' @export
#' @param dd dendrogram object (see ?clusterDataDend)
#' @return A dendrogram plot for the genes in the input phylogenetic profiles.
#' @author Vinh Tran tran@bio.uni-frankfurt.de
#' @importFrom ape plot.phylo
#' @seealso \code{\link{clusterDataDend}}
#' @examples
#' data("finalProcessedProfile", package="PhyloProfile")
#' data <- finalProcessedProfile
#' profileType <- "binary"
#' profiles <- getDataClustering(
#' data, profileType, var1AggregateBy, var2AggregateBy)
#' distMethod <- "mutualInformation"
#' distanceMatrix <- getDistanceMatrix(profiles, distMethod)
#' clusterMethod <- "complete"
#' dd <- clusterDataDend(as.dist(distanceMatrix), clusterMethod)
#' getDendrogram(dd)
getDendrogram <- function(dd = NULL) {
if (is.null(dd)) stop("Input dendrogram cannot be NULL!")
p <- ape::plot.phylo(as.phylo(dd))
return(p)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.