#' Non Redundant set of samples using hierarchical search.
#'
#' Creates a non-redundant sub-set of samples. The function accept an
#' \emph{accnet} or \emph{mash} object and returns a \emph{nr_list} object.
#' The sub-set can be created using a certain \emph{distance} (mash o jaccard distance),
#' a specific \emph{number} of elements or a \emph{fraction} of the whole sample set.
#' The function performes an iterative seach so sometimes the exact number of returned elements
#' could be not the same that the specified in the input. This difference can be defined
#' with the \emph{threshold} parameter. This function perform a hierarchical search. For small
#' datasets (<2000) \emph{non_redundant()} could be faster. This function comsume less memory than
#' the original so it is better for large datasets.
#'
#' @param data An \emph{accnet} or \emph{mash} object.
#' @param number The number of non-redundant samples.
#' @param fraction The fraction of the whole set of non-redundant samples.
#' @param distance Minimun distance among samples.
#' @param tolerance Percentage of error between the input number and the final number of samples.
#' @param max_iter Maximun number of search iterations.
#' @param fast If fast is TRUE the clustering process uses "components" in other case use "fast_greddy".
#' @param partitions Number of partitions to hiaerarchical search.
#'
#' @return \emph{nr_list} object
#' @export
#'
#' @seealso \code{\link{extract_non_redundant}}
#'
#'
#' @import dplyr
#' @import tidyr
#' @import tibble
#' @import dtplyr
#' @import igraph
#' @importFrom data.table fread
#' @import parallelDist
#'
non_redundant_hier <- function(data, number, fraction, distance, tolerance = 0.05, partitions = 10,max_iter = 10000, fast =FALSE, snps)
{
if(is(data,"accnet"))
{
if(is.null(data$dist))
{
m.matrix <- data$matrix %>%
as_tibble() %>%
column_to_rownames("Source") %>%
as.matrix() %>%
parallelDist(., method = "binary") %>%
as.matrix()
}else{
m.matrix = data$dist %>% as.data.frame()
}
m.list <- m.matrix %>%
rownames_to_column("Source") %>%
spread(Target,Dist,-Source)
}else if (is(data,"mash"))
{
m.matrix <- data$matrix
m.list <- data$table
}else if (is(data,"matrix")){
if(!missing(snps))
{
m.matrix <- data
m.list <- data %>% as_tibble() %>% rownames_to_column("Source") %>% gather(Target,Dist, -Source)
gr.tmp <- m.list %>% filter(Dist <= snps) %>% graph_from_data_frame(.,directed = FALSE)
gr.tmp <- simplify(gr.tmp, remove.multiple = TRUE, remove.loops = TRUE) %>% as.undirected()
if(fast ==TRUE) {
cluster <- components(gr.tmp)
}else{
cluster <- cluster_louvain(gr.tmp)
}
Nc <- as.numeric(max(cluster$membership))
cent <- centralization.degree(gr.tmp)
results = data.frame(Source = as.character(vertex.attributes(gr.tmp)$name),
centrality = cent$res,
cluster = cluster$membership, stringsAsFactors = F)
class(results) <- "nr_list"
return(results)
}else{
m.matrix <- data
m.list = data %>%
as_tibble() %>%
rownames_to_column("Source") %>%
gather(Target,Dist, -Source)
}
}else {
print("Error: data must be accnet or mash object or snps matrix")
}
### With distance
if (!missing(distance)){
gr.tmp <- m.list %>% filter(Dist <= distance) %>% graph_from_data_frame(.,directed = FALSE)
gr.tmp <- simplify(gr.tmp, remove.multiple = TRUE, remove.loops = TRUE) %>% as.undirected()
if(fast ==TRUE) {
cluster <- components(gr.tmp)
}else{
cluster <- cluster_louvain(gr.tmp)
}
Nc <- as.numeric(max(cluster$membership))
if(is.infinite(Nc))
{
cluster <- components(gr.tmp)
Nc <- as.numeric(max(cluster$membership))
}
cent <- centralization.degree(gr.tmp)
results <- data.frame(Source = as.character(vertex.attributes(gr.tmp)$name),
centrality = cent$res,
cluster = cluster$membership, stringsAsFactors = F)
class(results) <- "nr_list"
return(results)
}
## With numbers or fraction
if(!missing(number))
{
min <- min(m.list$Dist)
max <- max(m.list$Dist)
Th <- mean(m.list$Dist)
}
else if (!missing(fraction))
{
number <- m.list %>% select(Source) %>% distinct() %>% count()
number <- number[1] * fraction
min <- min(m.list$Dist)
max <- max(m.list$Dist)
Th <- mean(m.list$Dist)
}else{
print("Error: number, fraction or distance must be provided")
}
steps <- round(seq(1,nrow(m.matrix), length.out = partitions))
#####
table.tmp <- data.frame()
for(i in 1:(partitions-1)) ### First Iteration
{
m.tmp <- m.matrix[steps[i]:steps[i+1],steps[i]:steps[i+1]]
cm <- graph_from_adjacency_matrix(m.tmp < Th, mode = "upper") %>% as.undirected()%>% components()
table.tmp <- cm$membership %>%
as.data.frame() %>%
rownames_to_column("Source") %>%
rename(cluster = 2) %>%
group_by(cluster) %>%
summarise(Source = first(Source)) %>% bind_rows(table.tmp)
}
gr.tmp <- m.list %>% as_tibble() %>%
semi_join(table.tmp %>%
select(Source), by ="Source") %>%
semi_join(table.tmp %>%
rename(Target = Source), by ="Target") %>%
filter(Dist <= Th) %>%
graph_from_data_frame()
cm.all <- gr.tmp %>% as.undirected() %>% components()
Nc <- as.numeric(max(cm.all$membership))
print(paste("First Iteration"," Nc=",Nc,sep = "",collapse = ""))
###### END first Iteration
count <-0
while (count <= max_iter)
{
print(Th)
if (abs(1-(Nc/number)) < tolerance){
gr.tmp <- m.list %>% filter(Dist <= Th) %>% graph_from_data_frame(.,directed = FALSE)
cluster <- simplify(gr.tmp, remove.multiple = TRUE, remove.loops = TRUE) %>%
as.undirected() %>%
cluster_louvain()
print(paste("Final number of clusters: ",max(cluster$membership)))
cent <- centralization.degree(gr.tmp)
results <- data.frame(Source = as.character(vertex.attributes(gr.tmp)$name),
centrality = cent$res,
cluster = cluster$membership, stringsAsFactors = F)
class(results) <- append(class(results),"nr_list")
return(results)
}else if (Nc > number){
min <- Th
Th <-(max + Th)/2
}else if (Nc < number){
max <- Th
Th <- (Th - min)/2
}
table.tmp <- data.frame()
for(i in 1:(partitions-1))
{
m.tmp <- m.matrix[steps[i]:steps[i+1],steps[i]:steps[i+1]]
cm <- graph_from_adjacency_matrix(m.tmp < Th, mode = "upper") %>%
as.undirected() %>%
components()
table.tmp <- cm$membership %>%
as.data.frame() %>%
rownames_to_column("Source") %>%
rename(cluster = 2) %>%
group_by(cluster) %>%
summarise(Source = first(Source)) %>%
bind_rows(table.tmp)
}
gr.tmp <- m.list %>%
semi_join(table.tmp %>%
select(Source), by = "Source") %>%
semi_join(table.tmp %>%
rename(Target = Source), by = "Target") %>%
filter(Dist <= Th) %>%
graph_from_data_frame()
if(fast == TRUE)
{
cm.all <- gr.tmp %>% as.undirected() %>% components()
}else{
cm.all <- gr.tmp %>% as.undirected() %>% cluster_louvain()
}
Nc <- as.numeric(max(cm.all$membership))
print(paste("Th =",Th," Nc=",Nc,sep = "",collapse = ""))
}
stop("Max iter reached")
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.