Report Hamming Distance"
In cmahalanobis: Calculate Distance Measures for DataFrames

# Assuming R Markdown package is installed

knitr::opts_chunk$set(echo = FALSE)

chamming <- function(dataset, formula, plot = TRUE, 
                     plot_title = "Hamming Distance Between Groups", 
                     min_group_size = 3) {

  if (!is.data.frame(dataset))
    stop("The input must be a data frame.")

  grouping_vars <- all.vars(formula)
  if (length(grouping_vars) == 0)
    stop("At least one grouping variable must be specified in the formula.")
  if (!all(grouping_vars %in% names(dataset)) && !("index" %in% grouping_vars))
    stop("Some grouping variables are not present in the dataset.")

  # If the user specified "~index", use the individual mode.
  if ("index" %in% grouping_vars) {
    if (!("index" %in% names(dataset))) {
      message("Formula '~index' was used. In 'index' mode, 'min_group_size' is always 1.")
      idx <- rownames(dataset)
      is_default_rownames <- is.null(idx) || all(idx == as.character(seq_len(nrow(dataset))))
      if (is_default_rownames) {
        message(" -> Using sequential numbers (1, 2, 3, ...) for the index.")
        dataset$index <- seq_len(nrow(dataset))
      } else {
        message(" -> Using dataset's rownames for the index.")
        dataset$index <- idx
      }
    }

    # In this case, each row is its own group.  
    # Predictors are all variables except "index".
    predictor_vars <- setdiff(names(dataset), "index")

    ## Perform imputation and one-hot encoding if necessary.
    dataset_imputed <- na.omit(dataset)
    predictors_data <- model.matrix(~ . - 1, data = dataset_imputed[, predictor_vars, drop = FALSE])
    predictors_data <- as.data.frame(predictors_data)
    variances <- sapply(predictors_data, var)
    predictors_data <- predictors_data[, variances > 1e-9, drop = FALSE]
    predictor_vars <- colnames(predictors_data)

    # Compute Hamming distance for two binary vectors as the number of positions that differ.
    n_rows <- nrow(dataset_imputed)
    D <- matrix(0, nrow = n_rows, ncol = n_rows)
    for (i in 1:(n_rows - 1)) {
      for (j in (i + 1):n_rows) {
        hamming_dist <- sum2(predictors_data[i, ] != predictors_data[j, ])
        D[i, j] <- hamming_dist
        D[j, i] <- hamming_dist
      }
    }
    rownames(D) <- colnames(D) <- as.character(dataset_imputed$index)
    D <- as.matrix(D)

    return(list(index = list(distances = D)))

  }

  # Otherwise, Assume grouping is not "index"
  predictors <- setdiff(names(dataset), grouping_vars)

  if (length(predictors) != 0) {

    dataset_imputed <- na.omit(dataset)
    predictors_data <- model.matrix(~ . - 1, data = dataset_imputed[, predictors, drop = FALSE])
    predictors_data <- as.data.frame(predictors_data)
    variances <-  sapply(predictors_data, var)
    predictors_data <- predictors_data[, variances > 1e-9, drop = FALSE]
    predictors <- colnames(predictors_data)

    dataset_prepared <- cbind(dataset_imputed[grouping_vars], predictors_data)

    predictors_mat <- as.matrix(predictors_data)
    predictor_means <- colMeans2(predictors_mat, na.rm = TRUE)
    predictor_sds <-  apply(predictors_mat, 2, sd)
    predictor_sds[predictor_sds < 1e-9] <- 1 
    predictors_data_scaled <- scale(predictors_mat, center = predictor_means, scale = predictor_sds)

    dataset_prepared_scaled <- cbind(dataset_imputed[grouping_vars], as.data.frame(predictors_data_scaled))

    dt <- as.data.frame(dataset_prepared_scaled)
  } else if (length(predictors) == 0) {
    dataset_imputed <- na.omit(dataset)
    dataset_prepared <- (dataset_imputed)
    dt <- as.data.frame(dataset_prepared)
  }

  result <- list()
  plot_list <- list()

  for (grouping_var in grouping_vars) {
    groups <- split(dt, dt[[grouping_var]])
    group_sizes <-  sapply(groups, nrow)
    valid_groups <- groups[group_sizes >= min_group_size]

    if (length(valid_groups) < 2) {
      warning(paste("Not enough valid groups for grouping variable:", grouping_var, "- skipping."))
      next
    }

    group_names <- names(valid_groups)
    n_groups <- length(valid_groups)

    # Compute pairwise Hamming distances between the group binary vectors.
    distances <- matrix(0, nrow = n_groups, ncol = n_groups)
    rownames(distances) <- colnames(distances) <- group_names

    for (i in 1:(n_groups - 1)) {
      for (j in (i + 1):n_groups) {
        a <- valid_groups[[j]]
        b <- valid_groups[[i]]
        if ((nrow(a)) < (nrow(b))) {
          b <- b[1:nrow(a), ]
          hamming_dist <- sum2(b != a)
        } else if ((nrow(a) > (nrow(b)))) {
          a <- a[1:nrow(b), ]
          hamming_dist <- sum2(b != a)
        } else if ((nrow(b)) == (nrow(a))) {
          hamming_dist <- sum2(b != a)
        }
        distances[i, j] <- hamming_dist 
        distances[j, i] <- distances[i, j]
      }
    }
    result[[grouping_var]] <- list(distances = distances)
  }

  return(result)
}

pvalueschamm <- function(dataset, formula, pvalue.method = "permutation", plot = TRUE, seed = NULL, min_group_size = 3) {

  if (!is.data.frame(dataset)) {
    stop("The input must be a data frame.")
  }

  grouping_vars <- all.vars(formula)
  if (length(grouping_vars) == 0) {
    stop("At least one grouping variable must be specified in the formula.")
  }

  if (!all(grouping_vars %in% names(dataset)) && !identical(grouping_vars, "index")) {
    stop("Some grouping variables are not present in the dataset.")
  }

  if ("index" %in% grouping_vars && pvalue.method == "bootstrap") {
    stop("'bootstrap' method does not have sense with 'index'")
  }

  if ("index" %in% grouping_vars && pvalue.method == "permutation") {
    stop("'permutation' method does not have sense with 'index'")
  }

  if ("index" %in% grouping_vars) {
    if (!("index" %in% names(dataset))) {
      message("Formula '~index' was used. In this mode, 'min_group_size' is always 1.")
      idx <- rownames(dataset)
      is_default_rownames <- is.null(idx) || all(idx == as.character(seq_len(nrow(dataset))))
      if (is_default_rownames) {
        message(" -> Using sequential numbers (1, 2, 3, ...) for the index.")
        dataset$index <- seq_len(nrow(dataset))
      } else {
        message(" -> Using dataset's rownames for the index.")
        dataset$index <- idx
      }
    }
  }

  # Create lists to save the results (p-value matrices) and plots
  result_list <- list()
  plot_list <- list()
  excluded_message <- list()

  # Iterate over each grouping variable
  for (idx in seq_along(grouping_vars)) {
    grouping_var <- grouping_vars[idx]

    # Calculate Hamming distances using chamming
    hamming_results <- chamming(dataset, as.formula(paste("~", grouping_var)),
                                        plot = FALSE, min_group_size = min_group_size)
    distances <- hamming_results[[grouping_var]]$distances

    # Number of groups
    n <- nrow(distances)

    # Initialize the p-values matrix
    p_values <- matrix(NA, nrow = n, ncol = n)
    rownames(p_values) <- colnames(p_values) <- rownames(distances)

    null_distances_list <- list() # Initialize a list to store distances

    if (!is.null(seed)) {
      set.seed(seed)
    }

    if (pvalue.method == "permutation") {
      message(paste("Running permutations for", grouping_var, "..."))

      for (k in 1: nrow(dataset)) { # Loop for permutations
        permuted_data <- dataset
        permuted_data[[grouping_var]] <- sample(permuted_data[[grouping_var]], replace = FALSE)

        permuted_results <- chamming(permuted_data,
                                         as.formula(paste("~", grouping_var)),
                                         plot = FALSE,
                                         min_group_size = min_group_size)

        if (!is.null(permuted_results) && !is.null(permuted_results[[grouping_var]])) {
          null_distances_list[[k]] <- permuted_results[[grouping_var]]$distances
        } else {
          null_distances_list[[k]] <- NULL # Store NULL if calculation fails
        }
      }
    } else if (pvalue.method == "bootstrap") {
      message(paste("Running bootstrap samples for", grouping_var, "..."))
      for (k in 1: nrow(dataset)) { # Loop for bootstraps
        bootstrap_sample <- sample(nrow(dataset), replace = TRUE)
        bootstrap_data <- dataset[bootstrap_sample, ] 

        bootstrap_results <- chamming(bootstrap_data,
                                          as.formula(paste("~", grouping_var)),
                                          plot = FALSE,
                                          min_group_size = min_group_size)

        if (!is.null(bootstrap_results) && !is.null(bootstrap_results[[grouping_var]])) {
          null_distances_list[[k]] <- bootstrap_results[[grouping_var]]$distances
        } else {
          null_distances_list[[k]] <- NULL
        }
      }
    }

    # Now calculate p-values using the collected null distances
    for (i in 1:n) {
      for (j in i:n) {
        if (i == j) next

        observed_distance <- distances[i, j]

        # Extract the vector of distances for the cell (i,j) from all saved matrices
        replicated_distances <- sapply(null_distances_list, function(dist_matrix) {
          if (!is.null(dist_matrix) && nrow(dist_matrix) >= i && ncol(dist_matrix) >= j && !is.na(dist_matrix[i, j])) {
            return(dist_matrix[i, j])
          } else {
            return(NA)
          }
        })

        valid_distances <- replicated_distances[!is.na(replicated_distances)]

        if (length(valid_distances) == 0) {
          p_values[i, j] <- NA
        } else {
          # Calculate the p-value as a proportion of replicated distances >= to that observed
          p_values[i, j] <- round(mean2(valid_distances >= observed_distance, na.rm = TRUE), digits = 2)
        }

        p_values[j, i] <- p_values[i, j] # Make the matrix symmetric
      }
    }
    result_list[[grouping_var]] <- p_values

    excluded_groups <- hamming_results[[grouping_var]]$excluded_groups
    if (length(excluded_groups) > 0 && is.null(excluded_message[[grouping_var]])) {
      excluded_message[[grouping_var]] <- paste("Grouping variable", grouping_var,
                                                "- excluded groups due to insufficient size:",
                                                paste(excluded_groups, collapse = ", "))
      message(excluded_message[[grouping_var]])
    }
  }
  return(result_list)
}

plot_distances <- function(distances) {
  distances_melt <- reshape2::melt(distances, na.rm = TRUE)
      colnames(distances_melt) <- c("Var1", "Var2", "value")

      q <- ggplot2::ggplot(data = distances_melt, ggplot2::aes(x = Var1, y = Var2, fill = value)) +
        ggplot2::geom_tile() +
        ggplot2::scale_colour_brewer(palette = "Greens") +
        ggplot2::geom_tile(color = "white", lwd = 1.5, linetype = 1) +
        ggplot2::geom_text(aes(label = value), color = "white", size = 4) + 
        ggplot2::labs(title = paste("Distances heatmap"), x = "", y = "", fill = "p_value") +
        ggplot2::theme_minimal() +
        ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

    print(q)
}

plot_p_values <- function(p_values) {
      p_values_melt <- reshape2::melt(p_values, na.rm = TRUE)
      colnames(p_values_melt) <- c("Var1", "Var2", "value")

      p <- ggplot2::ggplot(data = p_values_melt, ggplot2::aes(x = Var1, y = Var2, fill = value)) +
        ggplot2::geom_tile() +
        ggplot2::scale_colour_brewer(palette = "Greens") +
        ggplot2::geom_tile(color = "white", lwd = 1.5, linetype = 1) +
        ggplot2::geom_text(aes(label = value), color = "white", size = 4) + 
        ggplot2::labs(title = paste("p-values heatmap"), x = "", y = "", fill = "p_value") +
        ggplot2::theme_minimal() +
        ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

    print(p)

}

distances <- params$distances  # Replace with your actual distances data
p_values <- params$p_values    # Replace with your actual p_values data

if (!is.null(distances) && !all(is.na(distances))) {
  cat(" --- Distances --- \n")
  print(distances)
} else {
  print("No distances available")
}

if (!is.null(p_values) && !all(is.na(p_values))) {
   cat(" --- P-values --- \n")
   print(p_values)
} else {
  print("No p_values available")
}

if (length(grouping_vars) == 1) {
plot_distances(distances)
}

if (length(grouping_vars) == 1) {
plot_p_values(p_values)
}