R/constrained_genomic_indices.R

Defines functions cppg_lgsi crlgsi ppg_lgsi rlgsi .combined_index_metrics .genomic_index_metrics

Documented in cppg_lgsi crlgsi ppg_lgsi rlgsi

#' Constrained Genomic Selection Indices
#' @name constrained_genomic_indices
#'
#' @description
#' Implements constrained genomic selection index methods using GEBVs.
#' These methods allow breeders to impose restrictions on genetic gains
#' while maintaining genomic selection efficiency.
#'
#' Methods included:
#' - Restricted Linear Genomic Selection Index (RLGSI)
#' - Predetermined Proportional Gains Linear Genomic Selection Index (PPG-LGSI)
#' - Combined Restricted Linear Genomic Selection Index (CRLGSI)
#' - Combined Predetermined Proportional Gains Linear Genomic Selection Index (CPPG-LGSI)
#'
#' @references
#' CerĂ³n-Rojas, J. J., & Crossa, J. (2018). Linear Selection Indices in Modern Plant Breeding.
#' Springer International Publishing. Chapter 6.
#'
#' @keywords internal
#' @importFrom stats cov
#' @importFrom MASS ginv
NULL


#' Compute genomic index metrics using C++ primitives
#' @keywords internal
#' @noRd
.genomic_index_metrics <- function(b, Gamma, w = NULL, k_I = 2.063, L_G = 1, GAY = NULL, gmat = NULL) {
  b <- as.numeric(b)

  bGb <- cpp_quadratic_form_sym(b, Gamma)
  sigma_I <- if (bGb > 0) sqrt(bGb) else NA_real_

  R <- if (!is.na(sigma_I)) (k_I / L_G) * sigma_I else NA_real_

  E_vec <- if (!is.na(sigma_I) && sigma_I > 0) {
    (k_I / L_G) * as.vector(Gamma %*% b) / sigma_I
  } else {
    rep(NA_real_, nrow(Gamma))
  }

  GA <- NA_real_
  PRE <- NA_real_
  if (!is.null(w)) {
    bGw <- cpp_quadratic_form(b, Gamma, w)
    GA <- if (!is.na(sigma_I) && sigma_I > 0) (k_I / L_G) * bGw / sigma_I else NA_real_
    PRE_constant <- if (is.null(GAY)) 100 else 100 / GAY
    PRE <- if (!is.na(GA)) GA * PRE_constant else NA_real_
  }

  rHI <- if (!is.null(w)) {
    C <- if (!is.null(gmat)) gmat else Gamma
    wCw <- cpp_quadratic_form_sym(w, C)
    if (!is.na(bGb) && !is.na(wCw) && bGb > 0 && wCw > 0) {
      bGw_val <- cpp_quadratic_form(b, Gamma, w)
      pmin(pmax(bGw_val / (sqrt(bGb) * sqrt(wCw)), 0), 1)
    } else {
      NA_real_
    }
  } else {
    NA_real_
  }

  list(
    bGb = bGb,
    sigma_I = sigma_I,
    R = R,
    E_vec = as.vector(E_vec),
    GA = GA,
    PRE = PRE,
    rHI = rHI
  )
}

#' Compute combined index metrics (for CRLGSI/CPPG-LGSI)
#' @keywords internal
#' @noRd
.combined_index_metrics <- function(b, T_C, Psi_C, w = NULL, k_I = 2.063, L_I = 1, GAY = NULL) {
  b <- as.numeric(b)

  bTb <- cpp_quadratic_form_sym(b, T_C)
  sigma_I <- if (bTb > 0) sqrt(bTb) else NA_real_

  R <- if (!is.na(sigma_I)) (k_I / L_I) * sigma_I else NA_real_

  n_traits <- ncol(Psi_C) / 2
  E_vec <- if (!is.na(sigma_I) && sigma_I > 0) {
    Psi_b <- as.vector(crossprod(Psi_C, b))
    (k_I / L_I) * Psi_b[1:n_traits] / sigma_I
  } else {
    rep(NA_real_, n_traits)
  }

  GA <- NA_real_
  PRE <- NA_real_
  if (!is.null(w)) {
    n_traits <- length(w)
    a_C <- c(w, rep(0, n_traits))
    bPsiw <- as.numeric(crossprod(b, Psi_C) %*% a_C)
    GA <- if (!is.na(sigma_I) && sigma_I > 0) (k_I / L_I) * bPsiw / sigma_I else NA_real_
    PRE_constant <- if (is.null(GAY)) 100 else 100 / GAY
    PRE <- if (!is.na(GA)) GA * PRE_constant else NA_real_
  }

  rHI <- if (!is.null(w)) {
    n_traits <- length(w)
    G <- Psi_C[1:n_traits, 1:n_traits, drop = FALSE]
    wGw <- cpp_quadratic_form_sym(w, G)
    if (!is.na(bTb) && !is.na(wGw) && bTb > 0 && wGw > 0) {
      a_C <- c(w, rep(0, n_traits))
      bPsiw_val <- as.numeric(crossprod(b, Psi_C) %*% a_C)
      pmin(pmax(bPsiw_val / (sqrt(bTb) * sqrt(wGw)), 0), 1)
    } else {
      NA_real_
    }
  } else {
    NA_real_
  }

  list(
    bTb = bTb,
    sigma_I = sigma_I,
    R = R,
    E_vec = as.vector(E_vec),
    GA = GA,
    PRE = PRE,
    rHI = rHI
  )
}


#' Restricted Linear Genomic Selection Index (RLGSI)
#'
#' @description
#' Implements the Restricted Linear Genomic Selection Index where genetic gains
#' are constrained to zero for specific traits while maximizing gains for others.
#' Uses GEBVs only (no phenotypic data required).
#'
#' @param Gamma GEBV variance-covariance matrix (n_traits x n_traits).
#'   This represents the variance of GEBVs, typically computed from predicted breeding values.
#' @param wmat Economic weights matrix (n_traits x k), or vector
#' @param wcol Weight column to use if wmat has multiple columns (default: 1)
#' @param restricted_traits Vector of trait indices to restrict (default: NULL).
#'   Example: c(1, 3) restricts traits 1 and 3 to zero gain.
#' @param U Constraint matrix (n_traits x n_constraints). Each column defines a restriction.
#'   Alternative to restricted_traits for custom constraints. Ignored if restricted_traits is provided.
#' @param k_I Selection intensity (default: 2.063 for 10 percent selection)
#' @param L_G Standardization constant (default: 1). Can be set to sqrt(w'Gw) for standardization.
#' @param gmat Optional. True genetic variance-covariance matrix for exact accuracy calculation.
#'   If NULL, uses Gamma as approximation. Providing gmat ensures textbook-perfect accuracy metric.
#' @param GAY Optional. Genetic advance of comparative trait for PRE calculation
#'
#' @return List with:
#'   \itemize{
#'     \item \code{summary} - Data frame with coefficients, response metrics
#'     \item \code{b} - Vector of RLGSI coefficients (\eqn{\beta_{RG}})
#'     \item \code{E} - Named vector of expected genetic gains per trait
#'     \item \code{R} - Overall selection response
#'     \item \code{U} - Constraint matrix used
#'     \item \code{constrained_response} - Realized gains for constrained traits (should be ~0)
#'   }
#'
#' @details
#' \strong{Mathematical Formulation (Chapter 6, Section 6.1):}
#'
#' The RLGSI minimizes the mean squared difference between the index I = \eqn{\beta}'\eqn{\gamma} and
#' the breeding objective H = w'g under the restriction: U'\eqn{\Gamma}\eqn{\beta} = 0
#'
#' Solution involves solving the augmented system:
#' \deqn{\begin{bmatrix} \Gamma & \Gamma U \\ U'\Gamma & 0 \end{bmatrix}
#'       \begin{bmatrix} \beta \\ v \end{bmatrix} =
#'       \begin{bmatrix} \Gamma w \\ 0 \end{bmatrix}}
#'
#' Where:
#' - \eqn{\Gamma} (Gamma) = Var(GEBVs) - GEBV variance-covariance matrix
#' - U = Constraint matrix (each column is a restriction vector)
#' - w = Economic weights
#' - \eqn{\beta_{RG}} = RLGSI coefficient vector
#' - v = Lagrange multipliers
#'
#' Selection response: \eqn{R_{RG} = (k_I / L_G) * sqrt(beta_RG' * Gamma * beta_RG)}
#'
#' Expected gains: \eqn{E_{RG} = (k_I / L_G) * (Gamma * beta_RG) / sqrt(beta_RG' * Gamma * beta_RG)}
#'
#' @export
#' @examples
#' \dontrun{
#' # Simulate GEBV variance-covariance matrix
#' set.seed(123)
#' n_traits <- 5
#' Gamma <- matrix(rnorm(n_traits^2), n_traits, n_traits)
#' Gamma <- (Gamma + t(Gamma)) / 2 # Make symmetric
#' diag(Gamma) <- abs(diag(Gamma)) + 2 # Ensure positive definite
#'
#' # Economic weights
#' w <- c(10, 8, 6, 4, 2)
#'
#' # Restrict traits 2 and 4 to zero gain
#' result <- rlgsi(Gamma, w, restricted_traits = c(2, 4))
#' print(result$summary)
#' print(result$E) # Check that traits 2 and 4 have ~0 gain
#' }
rlgsi <- function(Gamma, wmat, wcol = 1,
                  restricted_traits = NULL, U = NULL,
                  k_I = 2.063, L_G = 1, gmat = NULL, GAY = NULL) {
  Gamma <- as.matrix(Gamma)
  wmat <- as.matrix(wmat)
  n_traits <- nrow(Gamma)

  if (ncol(Gamma) != n_traits) {
    stop("Gamma must be a square matrix")
  }

  if (nrow(wmat) != n_traits) {
    stop("wmat must have ", n_traits, " rows")
  }

  if (!is.null(restricted_traits)) {
    if (!is.numeric(restricted_traits) || any(restricted_traits < 1) || any(restricted_traits > n_traits)) {
      stop("restricted_traits must be valid trait indices (1 to ", n_traits, ")")
    }
    U <- diag(n_traits)[, restricted_traits, drop = FALSE]
  } else if (is.null(U)) {
    stop("Either 'restricted_traits' or 'U' must be provided")
  }

  U <- as.matrix(U)
  if (nrow(U) != n_traits) {
    stop("U must have ", n_traits, " rows")
  }

  w <- cpp_extract_vector(wmat, seq_len(n_traits), wcol - 1L)

  n_constraints <- ncol(U)


  GammaU <- Gamma %*% U
  UtGamma <- t(U) %*% Gamma
  zeros <- matrix(0, n_constraints, n_constraints)

  augmented_mat <- rbind(
    cbind(Gamma, GammaU),
    cbind(UtGamma, zeros)
  )

  Gammaw <- Gamma %*% w
  rhs <- c(Gammaw, rep(0, n_constraints))

  solution <- ginv(augmented_mat) %*% rhs

  b_RG <- solution[1:n_traits]

  if (any(is.na(b_RG)) || any(is.infinite(b_RG))) {
    stop("RLGSI coefficients contain NA or Inf. Check matrix conditioning.")
  }


  metrics <- .genomic_index_metrics(b_RG, Gamma, w, k_I, L_G,
    GAY = if (missing(GAY)) NULL else GAY,
    gmat = gmat
  )

  constrained_response <- as.vector(t(U) %*% Gamma %*% b_RG)


  b_vec <- as.numeric(b_RG)
  b_vec <- round(b_vec, 4)

  b_df <- as.data.frame(matrix(b_vec, nrow = 1))
  colnames(b_df) <- paste0("b.", seq_along(b_vec)) # seq_len(length(b_vec)))

  summary_df <- data.frame(
    b_df,
    R = round(metrics$R, 4),
    GA = round(metrics$GA, 4),
    PRE = round(metrics$PRE, 4),
    rHI = round(metrics$rHI, 4),
    stringsAsFactors = FALSE,
    check.names = FALSE
  )

  trait_names <- colnames(Gamma)
  if (!is.null(trait_names)) {
    names(metrics$E_vec) <- trait_names
  }

  rownames(summary_df) <- NULL


  result <- list(
    summary = summary_df,
    b = b_vec,
    E = metrics$E_vec,
    R = metrics$R,
    GA = metrics$GA,
    PRE = metrics$PRE,
    rHI = metrics$rHI,
    sigma_I = metrics$sigma_I,
    U = U,
    constrained_response = constrained_response,
    k_I = k_I,
    L_G = L_G
  )

  class(result) <- c("rlgsi", "genomic_index", "list")

  result
}


#' Predetermined Proportional Gains Linear Genomic Selection Index (PPG-LGSI)
#'
#' @description
#' Implements the PPG-LGSI where breeders specify desired proportional gains
#' between traits rather than restricting specific traits to zero.
#' This is genomic version of PPG-LPSI using GEBVs only.
#'
#' @param Gamma GEBV variance-covariance matrix (n_traits x n_traits)
#' @param d Vector of desired proportional gains (length n_traits or n_constraints).
#'   If length n_traits, constraints are applied to all traits.
#'   If length n_constraints, must provide U matrix.
#' @param wmat Optional. Economic weights for GA/PRE calculation
#' @param wcol Weight column to use if wmat has multiple columns (default: 1)
#' @param U Optional. Constraint matrix (n_traits x n_constraints).
#'   If NULL, assumes d applies to all traits (U = I).
#' @param k_I Selection intensity (default: 2.063)
#' @param L_G Standardization constant (default: 1)
#' @param gmat Optional. True genetic variance-covariance matrix for exact accuracy calculation.
#'   If NULL, uses Gamma as approximation.
#' @param GAY Optional. Genetic advance of comparative trait for PRE calculation
#'
#' @return List with:
#'   \itemize{
#'     \item \code{summary} - Data frame with coefficients and metrics
#'     \item \code{b} - Vector of PPG-LGSI coefficients (\eqn{\beta_{PG}})
#'     \item \code{E} - Named vector of expected genetic gains per trait
#'     \item \code{theta_G} - Proportionality constant
#'     \item \code{gain_ratios} - Ratios of achieved to desired gains
#'   }
#'
#' @details
#' \strong{Mathematical Formulation (Chapter 6, Section 6.2):}
#'
#' Alternative form: \eqn{beta_PG = beta_RG + theta_G * U * (U' * Gamma * U)^{-1} * d}
#'
#' Where:
#' - beta_RG = Restricted index coefficients (from RLGSI)
#' - theta_G = Proportionality constant
#' - d = Vector of desired proportional gains
#'
#' Proportionality constant:
#' \deqn{theta_G = (d' * (U' * Gamma * U)^{-1} * U' * Gamma * w) / (d' * (U' * Gamma * U)^{-1} * d)}
#'
#' @export
#' @examples
#' \dontrun{
#' # Simulate GEBV variance-covariance matrix
#' set.seed(123)
#' n_traits <- 5
#' Gamma <- matrix(rnorm(n_traits^2), n_traits, n_traits)
#' Gamma <- (Gamma + t(Gamma)) / 2
#' diag(Gamma) <- abs(diag(Gamma)) + 2
#'
#' # Desired proportional gains (e.g., 2:1:1:0:0 ratio)
#' d <- c(2, 1, 1, 0, 0)
#'
#' # Economic weights
#' w <- c(10, 8, 6, 4, 2)
#'
#' result <- ppg_lgsi(Gamma, d, wmat = w)
#' print(result$summary)
#' print(result$gain_ratios) # Should be approximately proportional to d
#' }
ppg_lgsi <- function(Gamma, d, wmat = NULL, wcol = 1, U = NULL,
                     k_I = 2.063, L_G = 1, gmat = NULL, GAY = NULL) {
  Gamma <- as.matrix(Gamma)
  d <- as.numeric(d)
  n_traits <- nrow(Gamma)

  if (ncol(Gamma) != n_traits) {
    stop("Gamma must be a square matrix")
  }

  if (is.null(U)) {
    if (length(d) != n_traits) {
      stop("d must have length ", n_traits, " when U is not provided")
    }
    U <- diag(n_traits)
  } else {
    U <- as.matrix(U)
    if (nrow(U) != n_traits) {
      stop("U must have ", n_traits, " rows")
    }
    if (length(d) != ncol(U)) {
      stop("d must have length ", ncol(U), " (number of constraints)")
    }
  }

  w <- NULL
  if (!is.null(wmat)) {
    wmat <- as.matrix(wmat)
    if (nrow(wmat) != n_traits) {
      stop("wmat must have ", n_traits, " rows")
    }
    w <- cpp_extract_vector(wmat, seq_len(n_traits), wcol - 1L)
  }


  if (is.null(w)) {
    w_temp <- rep(1, n_traits)
  } else {
    w_temp <- w
  }

  rlgsi_result <- rlgsi(Gamma, w_temp,
    wcol = 1, U = U,
    k_I = k_I, L_G = L_G, gmat = gmat, GAY = NULL
  )
  b_RG <- rlgsi_result$b


  UtGammaU <- t(U) %*% Gamma %*% U

  UtGammaU_inv <- solve(UtGammaU)

  if (!is.null(w)) {
    UtGammaw <- t(U) %*% Gamma %*% w
    theta_num <- as.numeric(t(d) %*% UtGammaU_inv %*% UtGammaw)
  } else {
    theta_num <- 0
  }

  theta_denom <- as.numeric(t(d) %*% UtGammaU_inv %*% d)

  theta_G <- if (abs(theta_denom) > 1e-10) theta_num / theta_denom else 0


  delta <- U %*% UtGammaU_inv %*% d
  b_PG <- b_RG + theta_G * as.vector(delta)


  metrics <- .genomic_index_metrics(b_PG, Gamma, w, k_I, L_G,
    GAY = if (missing(GAY)) NULL else GAY,
    gmat = gmat
  )

  constrained_gains <- as.vector(t(U) %*% metrics$E_vec)
  gain_ratios <- constrained_gains / d
  gain_ratios[abs(d) < 1e-10] <- NA_real_ # Avoid division by zero


  b_vec <- as.numeric(b_PG)
  b_vec <- round(b_vec, 4)

  b_df <- as.data.frame(matrix(b_vec, nrow = 1))
  colnames(b_df) <- paste0("b.", seq_along(b_vec)) # seq_len(length(b_vec)))

  summary_df <- data.frame(
    b_df,
    R = round(metrics$R, 4),
    GA = round(metrics$GA, 4),
    PRE = round(metrics$PRE, 4),
    rHI = round(metrics$rHI, 4),
    theta_G = round(theta_G, 4),
    stringsAsFactors = FALSE,
    check.names = FALSE
  )

  trait_names <- colnames(Gamma)
  if (!is.null(trait_names)) {
    names(metrics$E_vec) <- trait_names
  }

  rownames(summary_df) <- NULL


  result <- list(
    summary = summary_df,
    b = b_vec,
    E = metrics$E_vec,
    R = metrics$R,
    GA = metrics$GA,
    PRE = metrics$PRE,
    rHI = metrics$rHI,
    theta_G = theta_G,
    gain_ratios = gain_ratios,
    constrained_gains = constrained_gains,
    desired_gains = d,
    U = U,
    k_I = k_I,
    L_G = L_G
  )

  class(result) <- c("ppg_lgsi", "genomic_index", "list")

  result
}


#' Combined Restricted Linear Genomic Selection Index (CRLGSI)
#'
#' @description
#' Implements the CRLGSI which combines phenotypic and genomic information
#' with restrictions on genetic gains. This extends CLGSI to include constraints.
#'
#' @param T_C Combined variance-covariance matrix (2t x 2t) where t = n_traits.
#'   Structure: [P, P_yg; P_yg', P_g] where P = phenotypic var, P_g = GEBV var,
#'   P_yg = covariance between phenotypes and GEBVs.
#'   Can be computed automatically if phen_mat and gebv_mat are provided.
#' @param Psi_C Combined genetic covariance matrix (2t x t).
#'   Structure: [G; C_gebv_g] where G = genetic var, C_gebv_g = Cov(GEBV, g).
#'   Can be computed automatically if gmat and reliability are provided.
#' @param phen_mat Optional. Matrix of phenotypes (n_genotypes x n_traits)
#' @param gebv_mat Optional. Matrix of GEBVs (n_genotypes x n_traits)
#' @param pmat Optional. Phenotypic variance-covariance matrix
#' @param gmat Optional. Genotypic variance-covariance matrix
#' @param wmat Economic weights matrix (n_traits x k), or vector
#' @param wcol Weight column to use if wmat has multiple columns (default: 1)
#' @param restricted_traits Vector of trait indices to restrict (default: NULL)
#' @param U Constraint matrix (2t x n_constraints for combined traits).
#'   Alternative to restricted_traits. Ignored if restricted_traits is provided.
#' @param reliability Optional. Reliability of GEBVs (r^2)
#' @param k_I Selection intensity (default: 2.063)
#' @param L_I Standardization constant (default: 1)
#' @param GAY Optional. Genetic advance of comparative trait for PRE calculation
#'
#' @return List with:
#'   \itemize{
#'     \item \code{summary} - Data frame with coefficients and metrics
#'     \item \code{b} - Vector of CRLGSI coefficients (\eqn{\beta_{CR}})
#'     \item \code{b_y} - Coefficients for phenotypes
#'     \item \code{b_g} - Coefficients for GEBVs
#'     \item \code{E} - Expected genetic gains per trait
#'     \item \code{R} - Overall selection response
#'   }
#'
#' @details
#' \strong{Mathematical Formulation (Chapter 6, Section 6.3):}
#'
#' The CRLGSI combines phenotypic and genomic data with restrictions.
#'
#' Coefficient vector: \eqn{beta_CR = K_C * beta_C}
#'
#' Where K_C incorporates the restriction matrix.
#'
#' Selection response: \eqn{R_CR = (k_I / L_I) * sqrt(beta_CR' * T_C * beta_CR)}
#'
#' Expected gains: \eqn{E_CR = (k_I / L_I) * (Psi_C * beta_CR) / sqrt(beta_CR' * T_C * beta_CR)}
#'
#' @export
#' @examples
#' \dontrun{
#' # Simulate data
#' set.seed(123)
#' n_genotypes <- 100
#' n_traits <- 5
#'
#' phen_mat <- matrix(rnorm(n_genotypes * n_traits, 15, 3), n_genotypes, n_traits)
#' gebv_mat <- matrix(rnorm(n_genotypes * n_traits, 10, 2), n_genotypes, n_traits)
#'
#' gmat <- cov(phen_mat) * 0.6 # Genotypic component
#' pmat <- cov(phen_mat)
#'
#' w <- c(10, 8, 6, 4, 2)
#'
#' # Restrict traits 2 and 4
#' result <- crlgsi(
#'   phen_mat = phen_mat, gebv_mat = gebv_mat,
#'   pmat = pmat, gmat = gmat, wmat = w,
#'   restricted_traits = c(2, 4), reliability = 0.7
#' )
#' print(result$summary)
#' }
crlgsi <- function(T_C = NULL, Psi_C = NULL,
                   phen_mat = NULL, gebv_mat = NULL,
                   pmat = NULL, gmat = NULL,
                   wmat, wcol = 1,
                   restricted_traits = NULL, U = NULL,
                   reliability = NULL,
                   k_I = 2.063, L_I = 1, GAY = NULL) {
  has_direct_matrices <- !is.null(T_C) && !is.null(Psi_C)
  has_raw_data <- !is.null(phen_mat) && !is.null(gebv_mat)

  if (!has_direct_matrices && !has_raw_data) {
    stop("Must provide either (T_C, Psi_C) or (phen_mat, gebv_mat, pmat, gmat)")
  }

  if (has_raw_data) {
    phen_mat <- as.matrix(phen_mat)
    gebv_mat <- as.matrix(gebv_mat)
    n_traits <- ncol(phen_mat)

    if (is.null(pmat)) pmat <- cov(phen_mat)
    if (is.null(gmat)) stop("gmat required when using raw data")

    pmat <- as.matrix(pmat)
    gmat <- as.matrix(gmat)

    P_gebv <- cov(gebv_mat)
    P_yg <- cov(phen_mat, gebv_mat)

    T_C <- rbind(
      cbind(pmat, P_yg),
      cbind(t(P_yg), P_gebv)
    )

    if (is.null(reliability)) {
      reliability <- pmin(diag(P_gebv) / diag(gmat), 1)
    } else if (length(reliability) == 1) {
      reliability <- rep(reliability, n_traits)
    }

    Gamma_gebv_g <- sweep(gmat, 1, sqrt(reliability), "*")
    Psi_C <- rbind(
      cbind(gmat, Gamma_gebv_g),
      cbind(Gamma_gebv_g, Gamma_gebv_g)
    )
  } else {
    T_C <- as.matrix(T_C)
    Psi_C <- as.matrix(Psi_C)
    n_traits <- ncol(Psi_C) / 2 # Psi_C is 2t x 2t
  }

  if (nrow(T_C) != 2 * n_traits || ncol(T_C) != 2 * n_traits) {
    stop("T_C must be (2*n_traits x 2*n_traits)")
  }

  if (nrow(Psi_C) != 2 * n_traits || ncol(Psi_C) != 2 * n_traits) {
    stop("Psi_C must be (2*n_traits x 2*n_traits)")
  }

  wmat <- as.matrix(wmat)
  if (nrow(wmat) != n_traits) {
    stop("wmat must have ", n_traits, " rows")
  }
  w <- cpp_extract_vector(wmat, seq_len(n_traits), wcol - 1L)

  if (!is.null(restricted_traits)) {
    if (!is.numeric(restricted_traits) || any(restricted_traits < 1) || any(restricted_traits > n_traits)) {
      stop("restricted_traits must be valid trait indices (1 to ", n_traits, ")")
    }
    r <- length(restricted_traits)
    U <- matrix(0, nrow = 2 * n_traits, ncol = 2 * r)
    for (i in seq_along(restricted_traits)) {
      trait_idx <- restricted_traits[i]
      U[trait_idx, i] <- 1
      U[n_traits + trait_idx, r + i] <- 1
    }
  } else if (is.null(U)) {
    stop("Either 'restricted_traits' or 'U' must be provided")
  } else {
    U <- as.matrix(U)
  }


  U_TC <- U # Constraint matrix in combined variable space


  n_constraints <- ncol(U_TC)

  TC_UTC <- T_C %*% U_TC
  UTC_TC <- t(U_TC) %*% T_C
  zeros <- matrix(0, n_constraints, n_constraints)

  augmented_mat <- rbind(
    cbind(T_C, TC_UTC),
    cbind(UTC_TC, zeros)
  )

  a_C <- c(w, rep(0, n_traits))
  Psi_w <- Psi_C %*% a_C
  rhs <- c(Psi_w, rep(0, n_constraints))

  solution <- ginv(augmented_mat) %*% rhs

  b_CR <- solution[1:(2 * n_traits)]

  if (any(is.na(b_CR)) || any(is.infinite(b_CR))) {
    stop("CRLGSI coefficients contain NA or Inf. Check matrix conditioning.")
  }

  b_y <- b_CR[1:n_traits]
  b_g <- b_CR[(n_traits + 1):(2 * n_traits)]


  metrics <- .combined_index_metrics(b_CR, T_C, Psi_C, w, k_I, L_I,
    GAY = if (missing(GAY)) NULL else GAY
  )

  if (!is.null(restricted_traits)) {
    constrained_response <- metrics$E_vec[restricted_traits]
  } else {
    constrained_response <- rep(NA_real_, ncol(U) / 2)
  }


  b_y_vec <- round(as.numeric(b_y), 4)
  b_g_vec <- round(as.numeric(b_g), 4)

  b_y_df <- as.data.frame(matrix(b_y_vec, nrow = 1))
  colnames(b_y_df) <- paste0("b_y.", seq_along(b_y_vec)) # seq_len(length(b_y_vec)))

  b_g_df <- as.data.frame(matrix(b_g_vec, nrow = 1))
  colnames(b_g_df) <- paste0("b_g.", seq_along(b_g_vec)) # seq_len(length(b_g_vec)))

  summary_df <- data.frame(
    b_y_df,
    b_g_df,
    R = round(metrics$R, 4),
    GA = round(metrics$GA, 4),
    PRE = round(metrics$PRE, 4),
    rHI = round(metrics$rHI, 4),
    stringsAsFactors = FALSE,
    check.names = FALSE
  )

  rownames(summary_df) <- NULL


  result <- list(
    summary = summary_df,
    b = as.numeric(b_CR),
    b_y = b_y_vec,
    b_g = b_g_vec,
    E = metrics$E_vec,
    R = metrics$R,
    GA = metrics$GA,
    PRE = metrics$PRE,
    rHI = metrics$rHI,
    sigma_I = metrics$sigma_I,
    U = U,
    constrained_response = constrained_response,
    k_I = k_I,
    L_I = L_I
  )

  class(result) <- c("crlgsi", "genomic_index", "list")

  result
}


#' Combined Predetermined Proportional Gains Linear Genomic Selection Index (CPPG-LGSI)
#'
#' @description
#' Implements the CPPG-LGSI which combines phenotypic and genomic information
#' while achieving predetermined proportional gains between traits.
#' This is the most general constrained genomic index.
#'
#' @param T_C Combined variance-covariance matrix (2t x 2t)
#' @param Psi_C Combined genetic covariance matrix (2t x t)
#' @param d Vector of desired proportional gains (length n_traits or n_constraints)
#' @param phen_mat Optional. Matrix of phenotypes for automatic T_C computation
#' @param gebv_mat Optional. Matrix of GEBVs for automatic T_C computation
#' @param pmat Optional. Phenotypic variance-covariance matrix
#' @param gmat Optional. Genotypic variance-covariance matrix
#' @param wmat Optional. Economic weights for GA/PRE calculation
#' @param wcol Weight column to use if wmat has multiple columns (default: 1)
#' @param U Optional. Constraint matrix (n_traits x n_constraints)
#' @param reliability Optional. Reliability of GEBVs (r^2)
#' @param k_I Selection intensity (default: 2.063)
#' @param L_I Standardization constant (default: 1)
#' @param GAY Optional. Genetic advance of comparative trait for PRE calculation
#'
#' @return List with:
#'   \itemize{
#'     \item \code{summary} - Data frame with coefficients and metrics
#'     \item \code{b} - Vector of CPPG-LGSI coefficients (\eqn{\beta_{CP}})
#'     \item \code{b_y} - Coefficients for phenotypes
#'     \item \code{b_g} - Coefficients for GEBVs
#'     \item \code{E} - Expected genetic gains per trait
#'     \item \code{theta_CP} - Proportionality constant
#'     \item \code{gain_ratios} - Ratios of achieved to desired gains
#'   }
#'
#' @details
#' \strong{Mathematical Formulation (Chapter 6, Section 6.4):}
#'
#' Coefficient vector: \eqn{beta_CP = beta_CR + theta_CP * delta_CP}
#'
#' Where beta_CR is from CRLGSI and:
#'
#' \deqn{theta_CP = (beta_C' * Phi_C * (Phi_C' * T_C^{-1} * Phi_C)^{-1} * d_C) / (d_C' * (Phi_C' * T_C^{-1} * Phi_C)^{-1} * d_C)}
#'
#' Selection response: \eqn{R_CP = (k_I / L_I) * sqrt(beta_CP' * T_C * beta_CP)}
#'
#' @export
#' @examples
#' \dontrun{
#' # Simulate data
#' set.seed(123)
#' n_genotypes <- 100
#' n_traits <- 5
#'
#' phen_mat <- matrix(rnorm(n_genotypes * n_traits, 15, 3), n_genotypes, n_traits)
#' gebv_mat <- matrix(rnorm(n_genotypes * n_traits, 10, 2), n_genotypes, n_traits)
#'
#' gmat <- cov(phen_mat) * 0.6
#' pmat <- cov(phen_mat)
#'
#' # Desired proportional gains
#' d <- c(2, 1, 1, 0.5, 0)
#'
#' w <- c(10, 8, 6, 4, 2)
#'
#' result <- cppg_lgsi(
#'   phen_mat = phen_mat, gebv_mat = gebv_mat,
#'   pmat = pmat, gmat = gmat, d = d, wmat = w,
#'   reliability = 0.7
#' )
#' print(result$summary)
#' print(result$gain_ratios)
#' }
cppg_lgsi <- function(T_C = NULL, Psi_C = NULL, d,
                      phen_mat = NULL, gebv_mat = NULL,
                      pmat = NULL, gmat = NULL,
                      wmat = NULL, wcol = 1, U = NULL,
                      reliability = NULL,
                      k_I = 2.063, L_I = 1, GAY = NULL) {
  d <- as.numeric(d)

  has_direct_matrices <- !is.null(T_C) && !is.null(Psi_C)
  has_raw_data <- !is.null(phen_mat) && !is.null(gebv_mat)

  if (!has_direct_matrices && !has_raw_data) {
    stop("Must provide either (T_C, Psi_C) or (phen_mat, gebv_mat, pmat, gmat)")
  }

  if (has_raw_data) {
    phen_mat <- as.matrix(phen_mat)
    gebv_mat <- as.matrix(gebv_mat)
    n_traits <- ncol(phen_mat)

    if (is.null(pmat)) pmat <- cov(phen_mat)
    if (is.null(gmat)) stop("gmat required when using raw data")

    pmat <- as.matrix(pmat)
    gmat <- as.matrix(gmat)

    P_gebv <- cov(gebv_mat)
    P_yg <- cov(phen_mat, gebv_mat)

    T_C <- rbind(
      cbind(pmat, P_yg),
      cbind(t(P_yg), P_gebv)
    )

    if (is.null(reliability)) {
      reliability <- pmin(diag(P_gebv) / diag(gmat), 1)
    } else if (length(reliability) == 1) {
      reliability <- rep(reliability, n_traits)
    }

    C_gebv_g <- sweep(gmat, 1, sqrt(reliability), "*")
    Psi_C <- rbind(
      cbind(gmat, C_gebv_g),
      cbind(C_gebv_g, C_gebv_g)
    )
  } else {
    T_C <- as.matrix(T_C)
    Psi_C <- as.matrix(Psi_C)
    n_traits <- ncol(Psi_C) / 2 # Psi_C is 2t x 2t
  }

  if (is.null(U)) {
    if (length(d) != n_traits) {
      stop("d must have length ", n_traits, " when U is not provided")
    }
    U_base <- diag(n_traits)
    U <- rbind(
      cbind(U_base, matrix(0, n_traits, n_traits)),
      cbind(matrix(0, n_traits, n_traits), U_base)
    )
    d_C <- c(d, d)
  } else {
    U <- as.matrix(U)
    if (nrow(U) != 2 * n_traits) {
      stop("U must have ", 2 * n_traits, " rows for combined indices")
    }
    r <- ncol(U) / 2
    if (length(d) == r) {
      d_C <- c(d, d)
    } else if (length(d) == ncol(U)) {
      d_C <- d
    } else {
      stop("d must have length ", r, " or ", ncol(U), " when providing custom U")
    }
  }

  w <- NULL
  if (!is.null(wmat)) {
    wmat <- as.matrix(wmat)
    if (nrow(wmat) != n_traits) {
      stop("wmat must have ", n_traits, " rows")
    }
    w <- cpp_extract_vector(wmat, seq_len(n_traits), wcol - 1L)
  } else {
    w <- rep(1, n_traits) # Default equal weights
  }


  all_traits <- seq_len(n_traits)
  U_crlgsi <- matrix(0, nrow = 2 * n_traits, ncol = 2 * n_traits)
  for (i in seq_along(all_traits)) {
    trait_idx <- all_traits[i]
    U_crlgsi[trait_idx, i] <- 1
    U_crlgsi[n_traits + trait_idx, n_traits + i] <- 1
  }

  crlgsi_result <- crlgsi(
    T_C = T_C, Psi_C = Psi_C, wmat = w, wcol = 1,
    U = U_crlgsi, k_I = k_I, L_I = L_I, GAY = NULL
  )
  b_CR <- crlgsi_result$b


  Phi_C <- Psi_C %*% U

  T_C_inv <- ginv(T_C)

  Phi_TC_Phi <- t(Phi_C) %*% T_C_inv %*% Phi_C

  Phi_TC_Phi_inv <- ginv(Phi_TC_Phi)

  a_C <- c(w, rep(0, n_traits))
  beta_C <- T_C_inv %*% Psi_C %*% a_C

  theta_num <- as.numeric(t(beta_C) %*% Phi_C %*% Phi_TC_Phi_inv %*% d_C)

  theta_denom <- as.numeric(t(d_C) %*% Phi_TC_Phi_inv %*% d_C)

  theta_CP <- if (abs(theta_denom) > 1e-10) theta_num / theta_denom else 0


  delta_CP <- T_C_inv %*% Phi_C %*% Phi_TC_Phi_inv %*% d_C

  b_CP <- b_CR + theta_CP * as.vector(delta_CP)

  b_y <- b_CP[1:n_traits]
  b_g <- b_CP[(n_traits + 1):(2 * n_traits)]


  metrics <- .combined_index_metrics(b_CP, T_C, Psi_C, w, k_I, L_I,
    GAY = if (missing(GAY)) NULL else GAY
  )

  constrained_gains <- metrics$E_vec[1:n_traits]
  if (length(d) == n_traits) {
    gain_ratios <- constrained_gains / d
    gain_ratios[abs(d) < 1e-10] <- NA_real_
  } else {
    gain_ratios <- rep(NA_real_, n_traits)
  }


  b_y_vec <- round(as.numeric(b_y), 4)
  b_g_vec <- round(as.numeric(b_g), 4)

  b_y_df <- as.data.frame(matrix(b_y_vec, nrow = 1))
  colnames(b_y_df) <- paste0("b_y.", seq_along(b_y_vec)) # seq_len(length(b_y_vec)))

  b_g_df <- as.data.frame(matrix(b_g_vec, nrow = 1))
  colnames(b_g_df) <- paste0("b_g.", seq_along(b_g_vec)) # seq_len(length(b_g_vec)))

  summary_df <- data.frame(
    b_y_df,
    b_g_df,
    R = round(metrics$R, 4),
    GA = round(metrics$GA, 4),
    PRE = round(metrics$PRE, 4),
    rHI = round(metrics$rHI, 4),
    theta_CP = round(theta_CP, 4),
    stringsAsFactors = FALSE,
    check.names = FALSE
  )

  rownames(summary_df) <- NULL


  result <- list(
    summary = summary_df,
    b = as.numeric(b_CP),
    b_y = b_y_vec,
    b_g = b_g_vec,
    E = metrics$E_vec,
    R = metrics$R,
    GA = metrics$GA,
    PRE = metrics$PRE,
    rHI = metrics$rHI,
    theta_CP = theta_CP,
    gain_ratios = gain_ratios,
    constrained_gains = constrained_gains,
    desired_gains = d,
    U = U,
    k_I = k_I,
    L_I = L_I
  )

  class(result) <- c("cppg_lgsi", "genomic_index", "list")

  result
}

Try the selection.index package in your browser

Any scripts or data that you put into this service are public.

selection.index documentation built on March 9, 2026, 1:06 a.m.