R/high_confidence.R

Defines functions prepare_high_confidence_objects assign_high_confidence_to_transition_edges report_num_high_confidence_trans_edge discretize_conf_with_cutoff

#' Discritize confidence values given a cutoff
#'
#' @description Given a vector with values that describe confidence, binarize
#'   the vector a accoriding to a cutoff value.
#' @param confidence_vector Numeric vector.
#' @param threshold Number.
#'
#' @return Confidence vector. Binary vector.
#'
#' @noRd
discretize_conf_with_cutoff <- function(confidence_vector, threshold){
  # Check inputs ---------------------------------------------------------------
  check_is_number(threshold)
  if (!is.vector(confidence_vector)) {
    stop("Input must be a numeric vector")
  }
  check_is_number(confidence_vector[1])

  # Function -------------------------------------------------------------------
  confidence_vector[confidence_vector  < threshold] <- 0
  confidence_vector[confidence_vector >= threshold] <- 1

  # Check and return output ----------------------------------------------------
  check_if_binary_vector(confidence_vector)
  return(confidence_vector)
}

#' Report the number of high confidence transition edges
#'
#' @description Given a genotype for which you have: a list of vectors that
#'   stores if there is a genotype transition or not for each edge
#'   (genotype_transition), a list of vectors that stores if that edge is high
#'   confidence or not (high_conf_edges), and a character vector of the genotype
#'   names -- create an object that stores the number of high confidence
#'   transition edges per genotype.
#'
#' @param genotype_transition List of numeric vectors. Number of lists == number
#'   of genotypes. Length of vector == Nedge(tr).
#' @param high_conf_edges Binary vector. List of numeric vectors. Number of
#'   lists == number of genotypes. Length of vector == Nedge(tr).
#' @param geno_names Character vector. Length == ncol(genotype_matrix).
#'
#' @return num_high_conf_trans_edges. Numeric vector. Count of number of high
#'   confidence transitions per genotype. Vector is named with genotype names.
#' @noRd
report_num_high_confidence_trans_edge <- function(genotype_transition,
                                                  high_conf_edges,
                                                  geno_names){
  # Check input ----------------------------------------------------------------
  check_equal(length(genotype_transition), length(geno_names))
  check_equal(length(high_conf_edges), length(geno_names))
  if (!is.vector(genotype_transition[[1]]$transition)) {
    stop("Input must have a vector.")
  }
  if (!is.vector(high_conf_edges[[1]])) {
    stop("Input must have a vector.")
  }
  if (!is.character(geno_names[1])) {
    stop("Input must be a character vector.")
  }

  # Function -------------------------------------------------------------------
  num_high_conf_trans_edges <- rep(0, length(high_conf_edges))
  for (p in 1:length(high_conf_edges)) {
    num_high_conf_trans_edges[p] <-
      sum(genotype_transition[[p]]$transition * high_conf_edges[[p]])
  }
  names(num_high_conf_trans_edges) <- geno_names

  # Return output --------------------------------------------------------------
  return(num_high_conf_trans_edges)
}

#' Identify high confidence transition edges
#'
#' @description Identify all edges for which the edge is high confidence and a
#'   transition edge.
#'
#' @param tr Phylo.
#' @param all_confidence_by_edge List of vectors. Each vector is binary.
#'   Length(list) == number of genomes.
#' @param geno_trans_by_edge List of vectors. Each vector is binary.
#'   Length(list) == number of genomes.
#' @param geno Matrix. Binary.
#'
#' @return edge_confident_and_trans_edge. List of vector. Each vector is binary.
#'   Length(list) == number of genomes.
#' @noRd
assign_high_confidence_to_transition_edges <- function(tr,
                                                       all_confidence_by_edge,
                                                       geno_trans_by_edge,
                                                       geno){
  # Check input ----------------------------------------------------------------
  check_tree_is_valid(tr)
  check_for_root_and_bootstrap(tr)
  check_if_binary_matrix(geno)
  check_equal(length(geno_trans_by_edge[[1]]$transition),
              ape::Nedge(tr))
  check_for_root_and_bootstrap(tr)
  check_equal(length(all_confidence_by_edge[[1]]), ape::Nedge(tr))

  # Function -------------------------------------------------------------------
  edge_confident_and_trans_edge <- rep(list(NULL), ncol(geno))
  for (k in 1:ncol(geno)) {
    edge_confident_and_trans_edge[[k]] <-
      as.numeric( (all_confidence_by_edge[[k]] +
                    geno_trans_by_edge[[k]]$transition) == 2)
  }

  # Return output --------------------------------------------------------------
  return(edge_confident_and_trans_edge)
}

#' Prepare all high confidence objects
#'
#' @description Identify high confidence edges (considering: tree bootstrap
#'   values, phenotype reconstruction, tree edge lengths, and ancestral
#'   reconstruction of genotype).
#'
#' @param genotype_transition List of lists. Number of lists = number of
#'   genotypes. Each list is made of a $transition and $trans_dir list.
#'   Length(transition) == Length(trans_dir) == Nedge(tree)
#' @param tr Phylo.
#' @param pheno_tip_node_recon_conf List of confidence values. Binary.
#'   Length(list) == Ntip() + Nedge()
#' @param boot_threshold Numeric. Between 0 and 1.
#' @param geno Matrix. Binary. Nrow = Ntip(tree). Ncol = Number of genotypes.
#' @param geno_conf_edge List of lists. Binary. Number of lists = number of
#'   genotypes. Length(each individual list) == Nedge(Tree)
#' @param geno_recon_edge List of lists. Binary. Number of lists = number of
#'   genotypes. Length(each individual list) == Nedge(Tree)
#' @param snps_in_each_gene Either Null or named table where names are genotypes
#'   and the values are number of not-yet-grouped-genotypes that go into the
#'   grouped genotype.
#'
#' @return List of objects:
#'   \describe{
#'     \item{dropped_genotypes}{Character vector. Names of the genotypes not
#'     being kept.}
#'     \item{hi_confidence_transition_edge.}{List}
#'     \item{genotype.}{Matrix}
#'     \item{snps_per_gene.}{Either Null or named table where names are
#'     genotypes and the Values are number of not-yet-grouped-genotypes that go
#'     into the grouped genotype.}
#'     \item{genotype_transition}{Object with two lists: $trans_dir and
#'     $transition.}
#'     \item{geno_recon_edge.}{List of lists. Binary. Number of lists = number
#'     of genotypes. Length(each individual list) == Nedge(tree).}
#'     \item{high_conf_ordered_by_edges.}{List.}
#'     \item{num_high_conf_trans_edges.}{Numeric vector. Count of number of high
#'     confidence transitions per genotype. Vector is named with genotype
#'     names.}
#'     \item{tr_and_pheno_hi_conf.}{Vector. Binary. Length = Nedge(tree)}
#'   }
#' @noRd
prepare_high_confidence_objects <- function(genotype_transition,
                                            tr,
                                            pheno_tip_node_recon_conf,
                                            boot_threshold,
                                            geno,
                                            geno_conf_edge,
                                            geno_recon_edge,
                                            snps_in_each_gene){
  # Check input ----------------------------------------------------------------
  check_equal(length(genotype_transition), ncol(geno))
  check_equal(length(genotype_transition[[1]]$transition), ape::Nedge(tr))
  check_for_root_and_bootstrap(tr)
  check_equal(length(pheno_tip_node_recon_conf),
              c(ape::Ntip(tr) + ape::Nnode(tr)))
  check_num_between_0_and_1(boot_threshold)
  check_dimensions(geno,
                   exact_rows = ape::Ntip(tr),
                   min_rows = ape::Ntip(tr),
                   exact_cols = NULL,
                   min_cols = 1)
  check_equal(length(geno_conf_edge), ncol(geno))
  check_equal(length(geno_conf_edge[[1]]), ape::Nedge(tr))
  check_equal(length(geno_recon_edge), ncol(geno))
  check_equal(length(geno_recon_edge[[1]]), ape::Nedge(tr))
  check_if_binary_vector(geno_conf_edge[[1]])
  check_if_binary_vector(geno_recon_edge[[1]])
  if (!is.null(snps_in_each_gene)) {
    if (length(snps_in_each_gene) < 1) {
      stop("There should be more grouped genotypes")
    }
    check_is_number(snps_in_each_gene[1])
    check_is_string(names(snps_in_each_gene)[1])
  }

  # Function -------------------------------------------------------------------
  pheno_conf_ordered_by_edges <-
    reorder_tip_and_node_to_edge(pheno_tip_node_recon_conf, tr)
  tree_conf <- get_bootstrap_confidence(tr, boot_threshold)
  tree_conf_ordered_by_edges <- reorder_tip_and_node_to_edge(tree_conf, tr)
  short_edges <- identify_short_edges(tr)

  high_confidence_edges <-
    pheno_conf_ordered_by_edges + tree_conf_ordered_by_edges + short_edges == 3
  check_equal(length(high_confidence_edges), ape::Nedge(tr))
  all_high_confidence_edges <- rep(list(0), ncol(geno))

  # ADD IN GENO RECONSTRUCTION CONFIDENCE
  for (k in 1:ncol(geno)) {
    all_high_confidence_edges[[k]] <-
      as.numeric(geno_conf_edge[[k]] + high_confidence_edges == 2)
  }
  only_high_conf_geno_trans <-
    assign_high_confidence_to_transition_edges(tr,
                                               all_high_confidence_edges,
                                               genotype_transition,
                                               geno)
  for (i in 1:ncol(geno)) {
    genotype_transition[[i]]$transition <- only_high_conf_geno_trans[[i]]
    genotype_transition[[i]]$trans_dir <-
      only_high_conf_geno_trans[[i]] * genotype_transition[[i]]$trans_dir
  }
  names(only_high_conf_geno_trans) <- colnames(geno)
  geno_trans_by_edge <-
    report_num_high_confidence_trans_edge(genotype_transition,
                                          all_high_confidence_edges,
                                          colnames(geno))

  # KEEP ONLY GENOTYPES WITH AT LEAST TWO HIGH CONFIDENCE TRANSITION EDGES
  geno_to_keep <- keep_two_plus_hi_conf_tran_ed(genotype_transition,
                                                all_high_confidence_edges)
  genotype_transition <- genotype_transition[geno_to_keep]
  geno_recon_edge <- geno_recon_edge[geno_to_keep]
  high_conf_ordered_by_edges <- all_high_confidence_edges[geno_to_keep]
  dropped_genotypes <- get_dropped_genotypes(geno, geno_to_keep)
  geno <- geno[, geno_to_keep, drop = FALSE]
  snps_in_each_gene <-
    snps_in_each_gene[names(snps_in_each_gene) %in% colnames(geno)]

  # Check output ---------------------------------------------------------------
  if (length(genotype_transition) == 0) {
    stop("No genotypes to test because all genotypes failed quality control")
  }

  # Return output --------------------------------------------------------------
  results <-
    list("dropped_genotypes" = dropped_genotypes,
          "hi_confidence_transition_edge" = only_high_conf_geno_trans,
          "genotype" = geno,
          "snps_per_gene" = snps_in_each_gene,
          "genotype_transition" = genotype_transition,
          "geno_recon_edge" = geno_recon_edge,
          "high_conf_ordered_by_edges" = high_conf_ordered_by_edges,
          "num_high_conf_trans_edges" = geno_trans_by_edge,
          "tr_and_pheno_hi_conf" = high_confidence_edges)
  return(results)
}
katiesaund/hogwash documentation built on Jan. 18, 2022, 7:41 a.m.