R/utils-embedding-relco.R
In text2map: R Tools for Text Matrices, Embeddings, and Networks

## INTERNAL FUNCTION ## 

#' Evaluate anchor sets with relco -- called by test_anchors()
#'
#' Evaluates how well an anchor set defines a semantic centroid, semantic 
#' direction, or a compound concept using the technique defined by
#' Taylor et al. (2025)
#'
#' @details
#' If `all = TRUE`, all pairwise combinations of terms between each set
#' are evaluated. This increases computational complexity considerably.
#'
#' @references
#' Taylor, et al. (2025)
#' “”
#' \doi{https://osf.io/preprints/socarxiv/sc2ub_v3}
#'
#' @importFrom Matrix Matrix
#' @importFrom tibble rownames_to_column
#' @importFrom dplyr mutate_if
#' @importFrom tibble as_tibble
#' @importFrom stats t.test na.omit
#'
#' @param anchors Anchor terms
#' @param wv Matrix of word embedding vectors (a.k.a embedding model)
#'           with rows as terms.
#' @param non_anchors Terms that are not anchors (random, unrelated, 
#'                    or distinctive terms)
#' @param type Type of relation, `direction`, `centroid`, or `compound`.
#' @param dir_method Method to calculate direction.
#' @param conf Confidence interval
#' @param n_runs Number of runs
#' @param null Null hypothesis (default is 0).
#' @param alpha Significance level
#' @param seed Set sampling seed
#' @param order_non_anchors Default to FALSE, show the order of 
#'                          the non-anchor terms be fixed between each run.
#' @return list or data.frame
#' @noRd
test_anchors.relco <- function(anchors,
                  non_anchors, 
                  wv, 
                  type = c("direction", "centroid", "compound"), 
                  dir_method = c("paired", "pooled", "L2", "PCA"),
                  conf = 0.95,
                  n_runs = 100, 
                  null = 0, 
                  alpha =  0.5,
                  seed = NULL,
                  order_non_anchors = FALSE) {
  
  ## ----------------------------------------------------------------------------
  # Initialize 
  ## ----------------------------------------------------------------------------
  
  if(!is.null(seed)) set.seed(seed)
  
  ls_relco <- list()
  ls_relco[["run_scores"]]    <- data.frame(matrix(ncol = n_runs, nrow = 1))
  ls_relco[["run_intercept"]] <- data.frame(matrix(ncol = n_runs, nrow = 1))
  ls_relco[["raw_scores"]]    <- list()
  ls_relco[["term_contrib"]]  <- list()
  ls_relco[["anchor_terms"]]  <- list()
  ls_relco[["random_terms"]]  <- list()
  ls_relco[["term_conf"]]     <- list()
  ls_relco[["term_runs"]]     <- list()
  
  ## ----------------------------------------------------------------------------
  # Setup Terms
  ## ----------------------------------------------------------------------------

  # deal with data.frames
  if (any(class(anchors) %in% "data.frame")) { 
    
    if(type == "direction") {
      if(ncol(anchors)  > 2) stop("`anchors` data.frame must be ncol() == 2")
    }
    if(type %in% c("centroid", "compound")) {
      if(ncol(anchors) != 1) stop("`anchors` data.frame must be ncol() == 1")
    }

    # convert data.frame to list
    anchors <- as.list(anchors)
  }
  
  ##
  if (!any(class(anchors) %fin% c("character", "list"))) {
    stop("`anchors` must be either a character vector or a list of character vectors")
  }
  if (!inherits(non_anchors, "character")) {
    stop("`non_anchors` must be a character vector")
  }
  
  ##
  if (!identical(intersect(unlist(anchors), non_anchors), character(0))) {
    stop("`non_anchors` must not share any terms with `anchors`")
  }
  if (any(duplicated(unlist(anchors)))) {
    stop("`anchors` must not have duplicated terms")
  }
  if (any(duplicated(non_anchors))) {
    stop("`non_anchors` must not have duplicated terms")
  }
  
  ## ----------------------------------------------------------------------------
  if (type == "compound" | type == "centroid") {
    
    ## define constants
    mid_anchors <- length(anchors) * .5
    n_cols <- floor(mid_anchors) - 1 # get n cols from anchors (1 for focal term)
    
    if (length(non_anchors) < floor(mid_anchors)) {
      stop("`non_anchors` must have a length >= half the `anchors` length")
    }
    
    # random NOISE terms portion
    if (order_non_anchors) {
      .random_terms <- stats::na.omit(non_anchors[seq_len(floor(mid_anchors))])
    } else {
      .random_terms <- sample(non_anchors, replace = FALSE, size = floor(mid_anchors))
    }
    
    .anchor_list <- anchors
    # inititalize probability of being selected
    weights <- rep(1, length(.anchor_list))
    # subset WVs
    wv <- wv[rownames(wv) %in% c(.anchor_list, .random_terms), ]
  }
  
  ## ----------------------------------------------------------------------------
  if (type == "direction") {

    ## define constants
    mid_anchors <- lengths(anchors)[which.min(lengths(anchors))] * .5
    n_cols <- floor(mid_anchors) - 1 # get n cols from anchors (1 for focal term)
    
    # random NOISE terms portion
    if (order_non_anchors) {
      .random_terms <- na.omit(non_anchors[seq_len(floor(mid_anchors))])
    } else {
      .random_terms <- sample(non_anchors, replace = FALSE, size = floor(mid_anchors))
    }
    
    .anchor_list1 <- anchors[[1]]
    .anchor_list2 <- anchors[[2]]
    # inititalize probability of being selected
    weights1 <- rep(1, length(.anchor_list1))
    weights2 <- rep(1, length(.anchor_list2))
    # subset WVs
    wv <- wv[rownames(wv) %in% c(.anchor_list1, .anchor_list2, .random_terms), ]
  }
  
  ## ----------------------------------------------------------------------------
  # create initial simulated DTM
  # for centroids: dim(k+1, k+k) and for directions: dim(k+k+1, k+k+k)
  ## ----------------------------------------------------------------------------
  .anchors_dtm <- Matrix::Matrix(FALSE, nrow = (floor(mid_anchors) + 1), 
                                 ncol = (floor(mid_anchors) * 2), sparse = TRUE)
  .anchors_dtm[] <- cbind(upper.tri(.anchors_dtm[ , seq_len(floor(mid_anchors))], diag = TRUE),
                          lower.tri(.anchors_dtm[ , (floor(mid_anchors) + 1):ncol(.anchors_dtm)]))
  
  if (type == "direction") {
    .extra_dtm <- Matrix::Matrix(FALSE, 
                                 nrow = (floor(mid_anchors) + 1),
                                 ncol = (floor(mid_anchors) + 1), sparse = TRUE)
    .anchors_dtm <- rbind(cbind(.anchors_dtm, .extra_dtm[, -ncol(.extra_dtm)]), 
                          cbind(.extra_dtm, .anchors_dtm[, -ncol(.anchors_dtm)]))
    .anchors_dtm <- .anchors_dtm[-nrow(.anchors_dtm),]
  }
  
  rownames(.anchors_dtm) <- seq_len(nrow(.anchors_dtm))
  
  ## ----------------------------------------------------------------------------
  #
  ## ----------------------------------------------------------------------------
  
  # how many runs
  for (rdx in seq_len(n_runs)) {
    
    # re-order non_anchors
    temp_non_anchors <- sample(.random_terms, replace = FALSE)
    ls_relco[["random_terms"]] <- temp_non_anchors
    
    if (type == "compound" | type == "centroid") {
      
      focal <- sample(.anchor_list, size = 1, prob = weights)
      temp_cols <- sample(.anchor_list[.anchor_list != focal], size = n_cols, replace = FALSE)
      temp_anchors <- sample(.anchor_list[!.anchor_list %in% c(focal, temp_cols)])
      
      dimnames(.anchors_dtm)[[2]] <- c(focal, temp_cols, temp_non_anchors)
      # down weight focal term for next run
      weights[which(.anchor_list %in% focal)] <- weights[which(.anchor_list %in% focal)] * alpha
      ls_relco[["anchor_terms"]][[rdx]] <- temp_anchors
    }
    
    if (type == "direction") {
      # pole1
      focal1 <- sample(.anchor_list1, size = 1, prob = weights1)
      temp_cols1 <- sample(.anchor_list1[.anchor_list1 != focal1], size = n_cols, replace = FALSE)
      temp_anchors1 <- sample(.anchor_list1[!.anchor_list1 %in% c(focal1, temp_cols1)])
      # pole2
      focal2 <- sample(.anchor_list2, size = 1, prob = weights2)
      temp_cols2 <- sample(.anchor_list2[.anchor_list2 != focal2], size = n_cols, replace = FALSE)
      temp_anchors2 <- sample(.anchor_list2[!.anchor_list2 %in% c(focal2, temp_cols2)])
      
      dimnames(.anchors_dtm)[[2]] <- c(focal1, temp_cols1, temp_non_anchors, focal2, temp_cols2)
      # down weight focal term for next run
      weights1[which(.anchor_list1 %in% focal1)] <- weights1[which(.anchor_list1 %in% focal1)] * alpha
      weights2[which(.anchor_list2 %in% focal2)] <- weights2[which(.anchor_list2 %in% focal2)] * alpha
      #
      ls_relco[["anchor_terms"]][["pole1"]][[rdx]] <- temp_anchors1
      ls_relco[["anchor_terms"]][["pole2"]][[rdx]] <- temp_anchors2
    }
    
    ## ----------------------------------------------------------------------------
    # Run CMD on pseudo DTMs
    ## ----------------------------------------------------------------------------
    
    if (type == "compound") {
      .compound <- paste(temp_anchors, collapse = " ")
      run_cmd <- CMDist(dtm = .anchors_dtm, cw = .compound, wv = wv, scale = FALSE)
      ls_relco[["cmd_scores"]][[rdx]] <- run_cmd[,2]
    }
    
    if (type == "centroid") {
      .centroid <- get_centroid(anchors = temp_anchors, wv = wv)
      run_cmd <- CMDist(dtm = .anchors_dtm, cv = .centroid, wv = wv, scale = FALSE)
      ls_relco[["cmd_scores"]][[rdx]] <- run_cmd[,2]
    }
    
    if (type == "direction") {
      .direction <- get_direction(anchors = list(temp_anchors1, temp_anchors2),
                                            method = dir_method, wv = wv)
      run_cmd <- CMDist(dtm = .anchors_dtm, cv = .direction, wv = wv, scale = FALSE)
      ls_relco[["cmd_scores"]][[rdx]] <- run_cmd[,2]
    }
    
    ## ----------------------------------------------------------------------------
    # Get term-level contributions
    ## ----------------------------------------------------------------------------
    
    ls_relco[["raw_scores"]][[rdx]] <- run_cmd[, 2]
    run_cmd$inverse_rank <- rev(as.numeric(run_cmd$doc_id))/max(rev(as.numeric(run_cmd$doc_id)))
    mod_lm <- stats::lm(run_cmd[,2] ~ inverse_rank, data = run_cmd)[1][[1]]
    
    ls_relco[["run_scores"]][1,rdx] <- mod_lm[2]
    ls_relco[["run_intercept"]][1,rdx] <- mod_lm[1]
    
    zinv_rank <- scale(run_cmd$inverse_rank)
    temp_term <- scale(run_cmd[,2]) * (zinv_rank/sum(zinv_rank^2) * (sd(run_cmd[ ,2]) / sd(run_cmd[ ,3])))
    
    ## ----------------------------------------------------------------------------
    if (type == "compound" | type == "centroid") {
      ls_relco[["term_contrib"]][[rdx]] <- data.frame(diff = temp_term[1] - temp_term[2], 
                                                      term = focal)
    }
    
    if (type == "direction") {
      ls_relco[["term_contrib"]][[rdx]] <- data.frame(
        diff = c(temp_term[1] - temp_term[2], 
                 temp_term[floor(mid_anchors) + 1] - temp_term[floor(mid_anchors) + 2]),
        term = c(focal1, focal2)
      )
    }
  }
  
  ## ----------------------------------------------------------------------------
  ## ----------------------------------------------------------------------------
  
  df_ctrb <- do.call(rbind, ls_relco[["term_contrib"]])
  
  ## ----------------------------------------------------------------------------    
  if (type == "compound" | type == "centroid") {
    
    for (i_term in .anchor_list) {
      rowid <- df_ctrb$term == i_term
      ls_relco[["term_conf"]][[i_term]] <- .conf_int(df_ctrb[rowid, ]$diff, ci = conf)
      ls_relco[["term_runs"]][[i_term]] <- df_ctrb[rowid, ]$diff
    }
  }
  
  ## ----------------------------------------------------------------------------
  if (type == "direction") {
    
    for (i_term1 in .anchor_list1) {
      rowid <- df_ctrb$term == i_term1
      ls_relco[["term_conf"]]$pole1[[i_term1]] <- .conf_int(df_ctrb[rowid, ]$diff, ci = conf)
      ls_relco[["term_runs"]]$pole1[[i_term1]] <- df_ctrb[rowid, ]$diff  
    }
    
    for (i_term2 in .anchor_list2) {
      rowid <- df_ctrb$term == i_term2
      ls_relco[["term_conf"]]$pole2[[i_term2]] <- .conf_int(df_ctrb[rowid, ]$diff, ci = conf)
      ls_relco[["term_runs"]]$pole2[[i_term2]] <- df_ctrb[rowid, ]$diff
    }
  }
  
  ## ----------------------------------------------------------------------------
  # t-test
  mod_ttest <- stats::t.test(ls_relco[["run_scores"]], mu = null, alternative = "greater") # t-test
  mod_ttest$alt_hyp <- paste0("True global reliability coefficient > ", null)
  mod_ttest$df <- 1
  mod_ttest$null <- null
  
  ## ----------------------------------------------------------------------------
  # create word-level contribution data.frame
  if(type == "compound" | type == "centroid") {
    df_relco <- as.data.frame(do.call(rbind, as.list(ls_relco$term_conf) )) |> 
      tibble::rownames_to_column(var = "term") |>
      dplyr::mutate_if(is.numeric, ~round(., 7))  |>
      tibble::as_tibble()
    
    attr(df_relco, "top_terms") <- df_relco[order(-df_relco$mean), ]$term[seq_len(length(.anchor_list))]
  }
  
  if (type == "direction") {
    
    df_relco_pole1 <- as.data.frame(do.call(rbind, as.list(ls_relco$term_conf$pole1))) |> 
      tibble::rownames_to_column(var = "term") |>
      dplyr::mutate_if(is.numeric, ~round(., 7))  |>
      tibble::as_tibble() |> 
      dplyr::mutate(pole = "pole1")
    
    df_relco_pole2 <- as.data.frame(do.call(rbind, as.list(ls_relco$term_conf$pole2) )) |> 
      tibble::rownames_to_column(var = "term") |>
      dplyr::mutate_if(is.numeric, ~round(., 7))  |>
      tibble::as_tibble() |> 
      dplyr::mutate(pole = "pole2")
    
    df_relco <- rbind(df_relco_pole1, df_relco_pole2)
    
    top_terms <- list(pole1 = df_relco_pole1[order(-df_relco_pole1$mean), ]$term[seq_len(length(.anchor_list1))],
                      pole2 = df_relco_pole2[order( df_relco_pole2$mean), ]$term[seq_len(length(.anchor_list2))])
    
    attr(df_relco, "top_terms") <- top_terms    
  }
  
  # add attributes to data.frame
  attr(df_relco, "relation_type")    <- type
  attr(df_relco, "confidence_level") <- conf
  attr(df_relco, "null")             <- null
  attr(df_relco, "alpha")            <- alpha
  attr(df_relco, "seed")             <- seed
  attr(df_relco, "n_runs")           <- n_runs
  attr(df_relco, "total_score")      <- mean(as.numeric(ls_relco[["run_scores"]]))
  attr(df_relco, "total_intercept")  <- mean(as.numeric(ls_relco[["run_intercept"]]))
  attr(df_relco, "t_test")           <- mod_ttest
  attr(df_relco, "confidence_int")   <- .conf_int(as.numeric(ls_relco[["run_scores"]]), ci = conf)
  ##
  attr(df_relco, "term_runs")        <- ls_relco$term_runs
  attr(df_relco, "run_scores")       <- ls_relco$run_scores
  attr(df_relco, "raw_scores")       <- ls_relco$raw_scores
  attr(df_relco, "intercepts")       <- ls_relco$intercept
  attr(df_relco, "anchor_terms")     <- ls_relco$anchor_terms
  attr(df_relco, "non_anchor_terms") <- ls_relco$non_anchor_terms
  attr(df_relco, "cmd_scores")       <- ls_relco$cmd_scores
  ##
  class(df_relco) <- c("relco", "tbl", "data.frame")
  
  return(df_relco)
}


## ----------------------------------------------------------------------------
# Printing for relco
## ----------------------------------------------------------------------------

#' @importFrom cli style_italic
#' @importFrom pillar tbl_sum
#' 
#' @param x relco table 
#' 
#' @keywords internal
#' @export
#' @noRd
tbl_sum.relco <- function(x,...) {
  
  relation <- attr(x, "relation_type")
  
  if(relation == "direction") {
    return(
      c("Relation Type" = cli::style_italic(relation),
        "Global Reliability Coefficient" = cli::style_italic(round(attr(x, "total_score"), 4)), 
        "5 Highest Contributors (Pole 1)" = cli::style_italic(paste(attr(x, "top_terms")$pole1, collapse = ", ")),
        "5 Highest Contributors (Pole 2)" = cli::style_italic(paste(attr(x, "top_terms")$pole2, collapse = ", ")),
        "Confidence Interval (two-tailed)" = cli::style_italic(paste0(round(attr(x, "confidence_int")[3], 4), " to ", round(attr(x, "confidence_int")[1], 4), " at ", attr(x, "confidence_level")*100, "%")), 
        "t-test" = cli::style_italic(paste0("t = ", round(attr(x, "t_test")$statistic, 4), ", df = ", attr(x, "t_test")$df, ", p-value = ", round(attr(x, "t_test")$p.value, 4))), 
        "Alternative Hypothesis" = cli::style_italic(attr(x, "t_test")$alt_hyp), 
        "Term-Level Contributions" = "")
    )
    
  } else {
    return(
      c("Relation Type" = cli::style_italic(relation),
        "Global Reliability Coefficient" = cli::style_italic(round(attr(x, "total_score"), 4)),
        "5 Highest Contributors" = cli::style_italic(paste(attr(x, "top_terms"), collapse = ", ")),
        "Confidence Interval (two-tailed)" = cli::style_italic(paste0(round(attr(x, "confidence_int")[3], 4), " to ", round(attr(x, "confidence_int")[1], 4), " at ", attr(x, "confidence_level")*100, "%")), 
        "t-test" = cli::style_italic(paste0("t = ", round(attr(x, "t_test")$statistic, 4), ", df = ", attr(x, "t_test")$df, ", p-value = ", round(attr(x, "t_test")$p.value, 4))), 
        "Alternative Hypothesis" = cli::style_italic(attr(x, "t_test")$alt_hyp), 
        "Term-Level Contributions" = "")
    )
  }  
}

#' @importFrom pillar tbl_format_footer
#' @importFrom pillar style_subtle
#' @importFrom cli symbol
#' 
#' @param x relco table 
#' @param setup tibble setup
#' 
#' @keywords internal
#' @export
#' @noRd
tbl_format_footer.relco <- function(x, setup,...) {
  n_extra_rows <- setup$rows_total - 10L
  if(n_extra_rows > 0) {

  extra_footer1 <- pillar::style_subtle(paste0("# ", 
              cli::symbol$info, " ", n_extra_rows, " more terms"))
  extra_footer2 <- pillar::style_subtle(paste0("# ", 
              cli::symbol$info, " Use `print(n = ...)` to see more terms"))
  return(c(extra_footer1, extra_footer2))
    
  } else {

  return(c(""))
    
  }
}