Report Canberra Distance"

# Assuming R Markdown package is installed

knitr::opts_chunk$set(echo = FALSE)

ccanberra <- function(dataset, formula, plot = TRUE, 
                      plot_title = "Canberra Distance Between Groups", 
                      min_group_size = 3) {

  if (!is.data.frame(dataset))
    stop("The input must be a data frame.")

  grouping_vars <- all.vars(formula)
  if (length(grouping_vars) == 0)
    stop("At least one grouping variable must be specified in the formula.")
  if (!all(grouping_vars %in% names(dataset)) && !("index" %in% grouping_vars))
    stop("Some grouping variables are not present in the dataset.")

  # If the user specified "~index", use the individual mode.
  if ("index" %in% grouping_vars) {
    if (!("index" %in% names(dataset))) {
      message("Formula '~index' was used. In 'index' mode, 'min_group_size' is always 1.")
      idx <- rownames(dataset)
      is_default_rownames <- is.null(idx) || all(idx == as.character(seq_len(nrow(dataset))))
      if (is_default_rownames) {
        message(" -> Using sequential numbers (1, 2, 3, ...) for the index.")
        dataset$index <- seq_len(nrow(dataset))
      } else {
        message(" -> Using dataset's rownames for the index.")
        dataset$index <- idx
      }
    }

    # In this case, grouping is performed for each row (i.e., each observation is its own group).
    # Predictors are all variables except for "index".
    predictors <- setdiff(names(dataset), "index")

    ## Perform imputation and one-hot encoding (if necessary)
    dataset_imputed <- na.omit(dataset)
    predictors_data <- model.matrix(~ . - 1, data = dataset_imputed[, predictors, drop = FALSE])
    predictors_data <- as.data.frame(predictors_data)
    variances <-  sapply(predictors_data, var)
    predictors_data <- predictors_data[, variances > 1e-9, drop = FALSE]
    predictors <- colnames(predictors_data)

    # Convert predictors to a matrix to speed up calculations and standardize
    predictors_mat <- as.matrix(predictors_data)
    predictor_means <- colMeans2(predictors_mat, na.rm = TRUE)
    predictor_sds <-  apply(predictors_mat, 2, sd)
    predictor_sds[predictor_sds < 1e-9] <- 1 
    X <- scale(predictors_mat, center = predictor_means, scale = predictor_sds)

    # Compute the Canberra Distance matrix based on standard Canberra distance
    D <- as.matrix(round(dist(X, method = "canberra"), digits = 2))

    return(list(index = list(distances = D)))
  }

  # Assume grouping is not "index"
  predictors <- setdiff(names(dataset), grouping_vars)

  if (length(predictors) != 0) {

    dataset_imputed <- na.omit(dataset)
    predictors_data <- model.matrix(~ . - 1, data = dataset_imputed[, predictors, drop = FALSE])
    predictors_data <- as.data.frame(predictors_data)
    variances <-  sapply(predictors_data, var)
    predictors_data <- predictors_data[, variances > 1e-9, drop = FALSE]
    predictors <- colnames(predictors_data)

    dataset_prepared <- cbind(dataset_imputed[grouping_vars], predictors_data)

    predictors_mat <- as.matrix(predictors_data)
    predictor_means <- colMeans2(predictors_mat, na.rm = TRUE)
    predictor_sds <-  apply(predictors_mat, 2, sd)
    predictor_sds[predictor_sds < 1e-9] <- 1 
    predictors_data_scaled <- scale(predictors_mat, center = predictor_means, scale = predictor_sds)

    dataset_prepared_scaled <- cbind(dataset_imputed[grouping_vars], as.data.frame(predictors_data_scaled))

    dt <- as.data.frame(dataset_prepared_scaled)
  } else if (length(predictors) == 0) {
    dataset_imputed <- na.omit(dataset)
    dataset_prepared <- (dataset_imputed)
    dt <- as.data.frame(dataset_prepared)
  }

  result <- list()
  plot_list <- list()

  for (grouping_var in grouping_vars) {
    groups <- split(dt, dt[[grouping_var]])
    group_sizes <-  sapply(groups, nrow)
    valid_groups <- groups[group_sizes >= min_group_size]

    if (length(valid_groups) < 2) {
      warning(paste("Not enough valid groups for grouping variable:", grouping_var, "- skipping."))
      next
    }

    group_names <- names(valid_groups)
    n <- length(valid_groups)

    if (length(predictors) != 0) {
      means <-  lapply(valid_groups, function(dt_group) {
        colMeans2(as.matrix(dt_group[, predictors]))
      })
    } else if (length(predictors) == 0) {
      means <-  lapply(valid_groups, function(dt_group) {
        colMeans2(as.matrix(dt_group))
      })
    }

    distances <- matrix(0, nrow = n, ncol = n)
    rownames(distances) <- colnames(distances) <- group_names
    # Calculate Canberra distance between each pair of groups.
    for (i in 1:n) {
      mean_i <- means[[i]]
      for (j in 1:n) {
        if (i != j) {
          mean_j <- means[[j]]
          mean_diff <- abs(mean_i - mean_j)
          mean_sum <- abs(mean_i) + abs(mean_j)
          distances[i,j] <- round(sum2(mean_diff / mean_sum), digits = 2)
          distances[j, i] <- distances[i, j]  # Ensure symmetry
        }
      }
    }
    result[[grouping_var]] <- list(distances = distances)
  }

  return(result)
}

pvaluesccanb <- function(dataset, formula, pvalue.method = "permutation", plot = TRUE, seed = NULL, min_group_size = 3) {

  if (!is.data.frame(dataset)) {
    stop("The input must be a data frame.")
  }

  grouping_vars <- all.vars(formula)
  if (length(grouping_vars) == 0) {
    stop("At least one grouping variable must be specified in the formula.")
  }

  if (!all(grouping_vars %in% names(dataset)) && !identical(grouping_vars, "index")) {
    stop("Some grouping variables are not present in the dataset.")
  }

  if ("index" %in% grouping_vars && pvalue.method == "bootstrap") {
    stop("'bootstrap' method does not have sense with 'index'")
  }

  if ("index" %in% grouping_vars && pvalue.method == "permutation") {
    stop("'permutation' method does not have sense with 'index'")
  }

  if ("index" %in% grouping_vars) {
    if (!("index" %in% names(dataset))) {
      message("Formula '~index' was used. In this mode, 'min_group_size' is always 1.")
      idx <- rownames(dataset)
      is_default_rownames <- is.null(idx) || all(idx == as.character(seq_len(nrow(dataset))))
      if (is_default_rownames) {
        message(" -> Using sequential numbers (1, 2, 3, ...) for the index.")
        dataset$index <- seq_len(nrow(dataset))
      } else {
        message(" -> Using dataset's rownames for the index.")
        dataset$index <- idx
      }
    }
  }

  # Create lists to save the results (p-value matrices) and plots
  result_list <- list()
  plot_list <- list()
  excluded_message <- list()

  # Iterate over each grouping variable
  for (idx in seq_along(grouping_vars)) {
    grouping_var <- grouping_vars[idx]

    # Calculate Canberra distances using ccanberra
    canberra_results <- ccanberra(dataset, as.formula(paste("~", grouping_var)),
                                        plot = FALSE, min_group_size = min_group_size)
    distances <- canberra_results[[grouping_var]]$distances

    # Number of groups
    n <- nrow(distances)

    # Initialize the p-values matrix
    p_values <- matrix(NA, nrow = n, ncol = n)
    rownames(p_values) <- colnames(p_values) <- rownames(distances)

    null_distances_list <- list() # Initialize a list to store distances

    if (!is.null(seed)) {
      set.seed(seed)
    }

    if (pvalue.method == "permutation") {
      message(paste("Running permutations for", grouping_var, "..."))

      for (k in 1: nrow(dataset)) { # Loop for permutations
        permuted_data <- dataset
        permuted_data[[grouping_var]] <- sample(permuted_data[[grouping_var]], replace = FALSE)

        permuted_results <- ccanberra(permuted_data,
                                         as.formula(paste("~", grouping_var)),
                                         plot = FALSE,
                                         min_group_size = min_group_size)

        if (!is.null(permuted_results) && !is.null(permuted_results[[grouping_var]])) {
          null_distances_list[[k]] <- permuted_results[[grouping_var]]$distances
        } else {
          null_distances_list[[k]] <- NULL # Store NULL if calculation fails
        }
      }
    } else if (pvalue.method == "bootstrap") {
      message(paste("Running bootstrap samples for", grouping_var, "..."))
      for (k in 1: nrow(dataset)) { # Loop for bootstraps
        bootstrap_sample <- sample(nrow(dataset), replace = TRUE)
        bootstrap_data <- dataset[bootstrap_sample, ] 

        bootstrap_results <- ccanberra(bootstrap_data,
                                          as.formula(paste("~", grouping_var)),
                                          plot = FALSE,
                                          min_group_size = min_group_size)

        if (!is.null(bootstrap_results) && !is.null(bootstrap_results[[grouping_var]])) {
          null_distances_list[[k]] <- bootstrap_results[[grouping_var]]$distances
        } else {
          null_distances_list[[k]] <- NULL
        }
      }
    }

    # Now calculate p-values using the collected null distances
    for (i in 1:n) {
      for (j in i:n) {
        if (i == j) next

        observed_distance <- distances[i, j]

        # Extract the vector of distances for the cell (i,j) from all saved matrices
        replicated_distances <- sapply(null_distances_list, function(dist_matrix) {
          if (!is.null(dist_matrix) && nrow(dist_matrix) >= i && ncol(dist_matrix) >= j && !is.na(dist_matrix[i, j])) {
            return(dist_matrix[i, j])
          } else {
            return(NA)
          }
        })

        valid_distances <- replicated_distances[!is.na(replicated_distances)]

        if (length(valid_distances) == 0) {
          p_values[i, j] <- NA
        } else {
          # Calculate the p-value as a proportion of replicated distances >= to that observed
          p_values[i, j] <- round(mean(valid_distances >= observed_distance, na.rm = TRUE), digits = 2)
        }

        p_values[j, i] <- p_values[i, j] # Make the matrix symmetric
      }
    }
    result_list[[grouping_var]] <- p_values

    excluded_groups <- canberra_results[[grouping_var]]$excluded_groups
    if (length(excluded_groups) > 0 && is.null(excluded_message[[grouping_var]])) {
      excluded_message[[grouping_var]] <- paste("Grouping variable", grouping_var,
                                                "- excluded groups due to insufficient size:",
                                                paste(excluded_groups, collapse = ", "))
      message(excluded_message[[grouping_var]])
    }
  }
  return(result_list)
}


plot_distances <- function(distances) {
  distances_melt <- reshape2::melt(distances, na.rm = TRUE)
      colnames(distances_melt) <- c("Var1", "Var2", "value")

      q <- ggplot2::ggplot(data = distances_melt, ggplot2::aes(x = Var1, y = Var2, fill = value)) +
        ggplot2::geom_tile() +
        ggplot2::scale_colour_brewer(palette = "Greens") +
        ggplot2::geom_tile(color = "white", lwd = 1.5, linetype = 1) +
        ggplot2::geom_text(aes(label = value), color = "white", size = 4) + 
        ggplot2::labs(title = paste("Distances heatmap"), x = "", y = "", fill = "p_value") +
        ggplot2::theme_minimal() +
        ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

    print(q)
}


plot_p_values <- function(p_values) {
      p_values_melt <- reshape2::melt(p_values, na.rm = TRUE)
      colnames(p_values_melt) <- c("Var1", "Var2", "value")

      p <- ggplot2::ggplot(data = p_values_melt, ggplot2::aes(x = Var1, y = Var2, fill = value)) +
        ggplot2::geom_tile() +
        ggplot2::scale_colour_brewer(palette = "Greens") +
        ggplot2::geom_tile(color = "white", lwd = 1.5, linetype = 1) +
        ggplot2::geom_text(aes(label = value), color = "white", size = 4) + 
        ggplot2::labs(title = paste("p-values heatmap"), x = "", y = "", fill = "p_value") +
        ggplot2::theme_minimal() +
        ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

    print(p)

}

distances <- params$distances  # Replace with your actual distances data
p_values <- params$p_values    # Replace with your actual p_values data

if (!is.null(distances) && !all(is.na(distances))) {
  print(distances, caption = "Canberra Distances")
} else {
  print("No distances available")
}

if (!is.null(p_values) && !all(is.na(p_values))) {
   print(p_values, caption = "p_values")
} else {
  print("No p_values available")
}

if (length(grouping_vars) == 1) {
plot_distances(distances)
}

if (length(grouping_vars) == 1) {
plot_p_values(p_values)
}


Try the cmahalanobis package in your browser

Any scripts or data that you put into this service are public.

cmahalanobis documentation built on Sept. 14, 2025, 5:09 p.m.