R/surveycc.R
In SurveyCC: Canonical Correlation for Survey Data

Documented in surveycc

#' Canonical correlation analysis for complex survey data
#'
#' @description This command extends the functionality of [candisc::cancor] by
#'  calculating the test statistics, degrees of freedom and p-values necessary
#'  to estimate and interpret the statistical significance of the secondary canonical
#'  corr according to the methods Wilks' lambda, Pillai's trace, and Hotelling-Lawley
#'  trace (Caliński et al., 2006) and Roy's largest root (Johnstone, 2009). The units
#'  and variables graphs (Gittins, 1986) can also be drawn by `surveycc` further
#'  complementing the information listed by the existing `cancor`.
#'
#'  Moreover, `csdcanon` implements an algorithm (Cruz-Cano, Cohen, and Mead-Morse, 2024) that
#'  allows the inclusion of complex survey design elements, e.g. strata, cluster and
#'  replicate weights, in the estimation of the statistical significance of the canonical
#'  correlations. The core idea of the algorithm is to reduce the problem of finding
#'  the correlations among the canonical variates and their corresponding statistical
#'  significance to calculating an equivalent sequence of univariate linear regression.
#'  This switch allows the user to take advantage of the existing theoretical and
#'  computational resources that integrate the complex survey design elements into
#'  these regression models (Valliant and Dever, 2018). Hence, this algorithm can include
#'  the same complex design elements as in `survey`.
#'
#' @param design_object a survey design object generated from package `survey`,
#'  eg [survey::svydesign]
#' @param var.x the first set of variables; a vector of names
#' @param var.y the second set of variables; a vector of names
#' @param howmany positive integer; allows the user to choose the number of canonical correlations
#'  for which the statistical significance statistics are displayed. Default is to choose
#'  the minimum of `length(var.x)` and `length(var.y)`. Cannot exceed this value.
#' @param selection allows the user to choose whether the type of sample size is equal to
#'  the number of rows (`ROWS`) in the data set or the sum of the survey weights (`FREQ`).
#'
#' @return An object with S3 class "surveycc". A list, containing the canonical 
#'  correlation object, dimensions for plotting, as well as tables
#'  of the various tests of significance. This includes the test statistics,
#'  degrees of freedom, and p-values for:
#'  * Wilk's lambda
#'  * Pillai's trace
#'  * Hotelling-Lawley
#'  * Roy's greatest root
#'  * the Cruz-Cano algorithm using the survey design object
#'
#'  NOTE: For more information on the statistics presented, i.e. test statistic, df1,
#'  df2, Chi-Sq/F and p-val, please see the documentation in [candisc::cancor]
#'  for Wilk's Lambda, Pillai's Trace and Hotelling-Lawley Trace
#'  (although the present package uses a Chi-squared approximation to the F-distribution),
#'  and see the documentation in [survey::svyglm] for the Weighted/Complex Survey Design
#'  regression.
#'
#' @references  * Cruz-Cano, Cohen, and Mead-Morse. Canonical Correlation Analysis of
#'  Survey data: The SurveyCC R package. The R Journal under review; 2024.
#'  * Gentzke AS, Wang TW, Cornelius M, Park-Lee E, Ren C, Sawdey MD, Cullen KA,
#'  Loretan C, Jamal A, Homa DM. Tobacco Product Use and Associated Factors among
#'  Middle and High School Students - National Youth Tobacco Survey, United States, 2021.
#'  rveill Summ. 2022;71(5):1-29. doi: 10.15585/mmwr.ss7105a1. PubMed PMID: 35271557.
#'  * Gittins R.  Canonical Analysis: A Review with Applications in Ecology:
#'  Springer Berlin Heidelberg; 1986.
#'  * Caliński T., Krzyśko M. and WOłyński W. (2006) A Comparison of Some Tests
#'  for Determining the Number of Nonzero Canonical Correlations, Communications
#'  in Statistics -Simulation and Computation, 35:3, 727-749, DOI: 10.1080/03610 6290.
#'  * Hyland A, Ambrose BK, Conway KP, et al. Design and methods of the Population
#'  Assessment of Tobacco and Health (PATH) StudyTobacco Control 2017;26:371-378.
#'  * Johnstone IM. Approximate Null Distribution of the largest root in a
#'  Multivariate Analysis.  Ann Appl Stat. 2009;3(4):1616-1633. doi: 10.1214/08-AOAS220.
#'  PMID: 20526465; PMCID: PMC2880335.
#'  *  Valliant R. and Dever JA.  Survey Weights: A Step-by-Step Guide to Calculation:
#'  Stata Press; 2018. ISBN-13: 978-1-59718-260-7.
#'
#' @export
#'
#' @examples
#' # PATH example
#' design_object <-
#'  survey::svrepdesign(
#'  id = ~PERSONID,
#'  weights = ~R01_A_PWGT,
#'  repweights = "R01_A_PWGT[1-9]+",
#'  type = "Fay",
#'  rho = 0.3,
#'  data=reducedPATHdata,
#'  mse = TRUE
#'  )
#' var.x <- c("R01_AC1022", "R01_AE1022", "R01_AG1022CG")
#' var.y <- c("R01_AX0075", "R01_AX0076")
#' howmany <- 2
#' surveycc(design_object, var.x, var.y, howmany = howmany,
#'   selection = "ROWS")
#'
#' # NYTS example
#' design_object <-
#'   survey::svydesign(
#'   ids = ~psu2,
#'   nest = TRUE,
#'   strata = ~v_stratum2,
#'   weights = ~finwgt,
#'   data = reducedNYTS2021data
#' )
#' var.x <- c("qn9", "qn38", "qn40", "qn53", "qn54", "qn64", "qn69", "qn74",
#'            "qn76", "qn78", "qn80", "qn82", "qn85", "qn88", "qn89")
#' var.y <- c("qn128", "qn129", "qn130", "qn131", "qn132", "qn134")
#' howmany <- 3
#' surveycc(design_object = design_object, var.x = var.x,
#'   var.y = var.y, howmany = howmany, selection = "ROWS")
#'
surveycc <- function(
    design_object, var.x, var.y, howmany = NA,
    selection = c("FREQ", "ROWS") ) {
  # modify the design object to have complete cases on var.x and var.y
  design_object <- subset(
    design_object,
    stats::complete.cases(
      design_object$variables[, c(var.x, var.y)]
    )
  )
  
  # input checks etc
  selection = match.arg(selection)
  
  if (is.na(howmany)) {
    howmany <- min(length(var.x), length(var.y))
  }
  
  stopifnot(
    "`howmany` must be a positive whole number" = 
      (howmany%%1 == 0 && howmany > 0),
    "`howmany` must be at most the minimum of length var.x and length var.y" = 
      (howmany <= min(length(var.x), length(var.y)))
  )
  
  # Extract weights for the cancor object
  if (!is.null(design_object$pweights)) {
    myweights <- design_object$pweights
  } else myweights <- weights(design_object)

  # create a version of the svy-obj w/ just weights, for weighted regression
  design_objectW <- survey::svydesign(
    ids = ~1, data = design_object$variables, weights = myweights
  )
  # create the cancor object
  cc_object <- candisc::cancor(
    design_object$variables[, var.x],
    design_object$variables[, var.y],
    weights = myweights
  )

  # preparing the inputs for the calcpval function
  varcount1 <- length(cc_object$names$X)
  varcount2 <- length(cc_object$names$Y)
  if (varcount1 >= varcount2) {
    OgX <- cc_object$X
    OgY <- cc_object$Y
    myrawcoef_var1 <- cc_object$coef$X
    myrawcoef_var2 <- cc_object$coef$Y
  } else {
    OgX <- cc_object$Y
    OgY <- cc_object$X
    myrawcoef_var2 <- cc_object$coef$X
    myrawcoef_var1 <- cc_object$coef$Y
  }
  
  diag_W <- diag(cc_object$weights)
  n_cc <- cc_object$ndim
  weights <- cc_object$weights

  # calculating and formating the p-values
  output <- calcpval(
    cc_object, selection, OgX, OgY, diag_W,
    myrawcoef_var1, myrawcoef_var2, weights
  )
  Results <- output$Results
  OGStataUVW <- output$OGStataUVW


  # applying the new method
  w_reg_results <- weighted_reg(n_cc, OGStataUVW, design_objectW)
  svy_reg_results <- weighted_reg(n_cc, OGStataUVW, design_object)

  # adding it to results
  for (i in 1:n_cc) {
    Results[[i]] <- rbind(Results[[i]], w_reg_results[i, ])
    Results[[i]] <- rbind(Results[[i]], svy_reg_results[i, ])
    rownames(Results[[i]]) <- c(
      rownames(Results[[i]])[1:4],
      "Weighted Survey CC",
      "Complex Survey CC"
    )
  }

  # if (is.na(howmany)) {
  #   howmany <- n_cc
  # }

  names(Results) <- paste(
    "Stats.cancor.", 1:n_cc,
    sep = ""
  )

  # round digits for tables
  Results <- lapply(Results, round, digits = 5)

  out.results <- list(
    Stats.cancor = Results[1:howmany],
    cc_object = cc_object
  )
  
  class(out.results) <- c("surveycc", class(out.results))

  return(out.results)
}