mixedgcImp: Missing value imputation for mixed data through Gaussian copula

Documented in get_cond_dist impute_init imputeZ_mixedgc_ppca quad scale_corr sum_2d_scale sum_3d_scale to_numeric_matrix

observed <- function(x) x[!is.na(x)]

sum_list_len <- function(s) sum(purrr::map_int(s, length))


#' DataFrame to Matrix
#'
#' @description Safely turn a numerical data frame to a numerical matrix
#' @param X a data frame
#' @export
#' @keywords internal
to_numeric_matrix <- function(X){
  Xnew = as.numeric(as.matrix(X))
  dim(Xnew) = dim(X)
  Xnew
}

index_int_to_logi <- function(index, l){
  out = logical(l)
  out[index] = TRUE
  out
}

regularize_corr <- function(sigma, corr_min_eigen = 1e-3, verbose = FALSE, prefix = ''){
  o_eigen = eigen(sigma)
  if (min(o_eigen$values)<corr_min_eigen){
    values = o_eigen$values
    values[values<corr_min_eigen] = corr_min_eigen
    sigma = cov2cor(o_eigen$vectors %*% diag(values) %*% t(o_eigen$vectors))
    if (verbose) print(paste0(prefix, 'small eigenvalue in the copula correlation'))
  }
  sigma
}


factor_to_num <- function(x) as.numeric(levels(x))[x]


to_nearest_ord <- function(x, ords=NULL, xobs=NULL){
  if (is.null(ords)) ords = unique(xobs[!is.na(xobs)])
  ords[which.min(abs(x - ords))]
}

to_nearest_ord_vec <- function(x, xobs){
  ords = unique(xobs[!is.na(xobs)])
  n = length(x)
  xnew = numeric(n)
  for (i in seq_along(x)) xnew[i] = to_nearest_ord(x[i], ords)
  xnew
}


#' Return a complete Z matrix based on SVD initialization
#'
#' @description  The SVD initialization is based on a complete Z matrix with missing entries filled in by zero. Useful for getting initial estimates of copula parameters
#' @param Z_meanimp Initial complete Z with mean (i.e. zero) imputation
#' @param rank the number of latent factors
#' @param r_lower Lower boundary of truncated intervals for ordinal columns
#' @param r_upper Upper boundary of truncated intervals for ordinal columns
#' @return A complte Z matrix
#' @export
#' @keywords internal
impute_init = function(Z, rank, Lower, Upper, d_index){
  Z_imp = Z
  Z_imp[is.na(Z)] = 0
  obj = svd(Z_imp, nv=rank)
  Z_imp = obj$u[,1:rank] %*% diag(obj$d[1:rank]) %*% t(obj$v)

  p = ncol(Z)
  j_in_ord = 1
  for (j in 1:p){
    index_o = which(!is.na(Z[,j]))
    if (d_index[j]){
      index = Z_imp[index_o,j] < Lower[index_o,j_in_ord] | Z_imp[index_o,j] > Upper[index_o,j_in_ord]
      Z_imp[index_o[index],j] = Z[index_o[index],j]
      j_in_ord = j_in_ord + 1
    }else{
      Z_imp[index_o,j] = Z[index_o,j]
    }
  }

  Z_imp
}

#' Adjust estiamted correlation parameters
#'
#' @description  adjust estiamted correlation parameters, W and sigma, to satisfy unit diagonal constraints.
#' @param W The latent low rank subspace matrix (initial estimate)
#' @param sigma The noise variance (initial estimate)
#' @return A list containing
#' \describe{
#'   \item{\code{W}}{Adjusted latent low rank subspace matrix}
#'   \item{\code{sigma}}{Adjusted noise variance}
#' }
#' @export
#' @keywords internal
scale_corr = function(W, sigma){
  p = dim(W)[1]
  tr = apply(W, 1, function(x){sum(x^2)})
  sigma = mean(1/(tr+sigma)) * sigma
  for (j in 1:p) W[j,] = sqrt(1-sigma) * W[j,]/sqrt(tr[j])
  return(list(W = W, sigma = sigma))
}

#' Sum a 3d matrix along one axis
#'
#' @description  For a 3d matrix M with dimension n*k*k, sum over dimension (2,3) with scale c, at entries selected by index in dimension 1
#' @param M A 3d matrix with dimension n*k*k
#' @param v A vector of length n
#' @param index a subset of \{1,2,...,n\}
#' @return A k*k matrix
#' @keywords internal
sum_3d_scale = function(M, v, index=NULL){
  if (!is.null(index)){
    apply(M, c(2,3), function(x){sum(x[index] * v[index])})
  }else{
    apply(M, c(2,3), function(x){sum(x* v)})
  }
}



#' Sum a 2d matrix along one axis
#'
#' @description  For a 2d matrix M with dimension n*k, sum over dimension 2 with scale c, at entries selected by index in dimension 1
#' @param M A 2d matrix with dimension n*k
#' @param c A vector of length n
#' @param index a subset of \{1,2,...,n\}
#' @return A k*1 matrix
#' @keywords internal
sum_2d_scale = function(M, v, index=NULL){
  if (!is.null(index)){
    r = apply(M, 2, function(x){sum(x[index] * v[index])})
  }else{
    r= apply(M, 2, function(x){sum(x*v)})
  }
  matrix(r, ncol = 1)
}

#' Quadratic form
#'
#' @description  Compute the quadratic form x^tAx
#' @param x A p*1 vector
#' @param A A p*p matrix
#' @return A scalar
#' @export
#' @keywords internal
quad = function(A,x){
  as.numeric(t(x) %*% A %*% x)
}

quad_mul = function(A, X){
  n = nrow(X)
  vals = numeric(n)
  for (i in 1:n) vals[i] = quad(A, X[i,])
  vals
}

#' Impute missing entries for the Z matrix
#'
#' @description  Using the returned quantities from EM algorithm, impute missing entries for the Z matrix
#' @param Z The returned \code{Zobs} matrix from EM algorithm
#' @param W The returned estimate for \code{W} matrix
#' @return The imputed complete \code{Z} matrix
#' @export
#' @keywords internal
imputeZ_mixedgc_ppca = function(Z, W, sigma){
  n = dim(Z)[1]
  p = dim(Z)[2]
  obj = svd(W)
  U = obj$u
  d = obj$d
  rank = length(d)

  for (i in 1:n){
    index_m = is.na(Z[i,])
    index_o = !index_m
    Ui_obs = U[index_o,,drop=FALSE]
    zi_obs = matrix(Z[i, index_o], ncol = 1)
    UU_obs = t(Ui_obs) %*% Ui_obs
    si = solve(sigma * diag(d^{-2}) + UU_obs, t(Ui_obs) %*% zi_obs)
    Z[i,index_m] =  matrix(U[index_m,], ncol = rank) %*% matrix(si, ncol = 1)
  }
  Z
}


#' Conditional normal mean and cov
#' @description Compute the conditional normal mean and cov
#' @param mu normal mean
#' @param cov normal covariance
#' @param z data observation
#' @param index_o observed dimensions
#' @param index_m missing dimensions
#' @param drop Whether to drop the dimension of computed conditional mean
#' @return A list containing
#' \describe{
#'   \item{\code{mean}}{conditional mean}
#'   \item{\code{cov}}{conditional covariance }
#' }
#' @keywords internal
get_cond_dist <- function(z, mu, cov, index_o, index_m=NULL,drop=TRUE){
  test_logical = TRUE
  if (test_logical){
    if (!is.logical(index_o)) stop('use logical index')
    if (!any(index_o)) print('empty set to conditional on')
    if (is.null(index_m)) index_m = !index_o
    if (sum(index_o)+sum(index_m)>ncol(cov)) stop('inconsistent cov, index_o and index_m')
  }
  if (is.null(index_m)) stop('input index_m')
  if (is.null(dim(z))) z = matrix(z, 1)
  if (ncol(z) != length(mu)) stop('inconsistent z and mu')
  # dim |o| by n_sample, using the centered version
  if (any(index_o)){
    zo_c = t(z[,index_o,drop=FALSE]) - mu[index_o]
    if (any(is.na(zo_c))) stop('missing found in centered zo')
    cov_oo = cov[index_o, index_o, drop=FALSE]
    cov_om = cov[index_o, index_m, drop=FALSE]
    # |o| * |m|
    ans = solve(cov_oo, cov_om)
    # sample * |m|
    cond_mean = t(mu[index_m] + t(ans) %*% zo_c)
    cond_cov = cov[index_m,index_m, drop=FALSE] - t(cov_om) %*% ans
  }else{
    mu_m = mu[index_m]
    cond_mean = matrix(mu_m, nrow(z), length(mu_m), byrow=TRUE)
    cond_cov = cov[index_m,index_m, drop=FALSE]
  }

  if (drop) cond_mean = drop(cond_mean)
  list('mean'=cond_mean, 'cov'=cond_cov)
}

udellgroup/mixedgcImp documentation built on Jan. 25, 2023, 7:55 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

udellgroup/mixedgcImp
Missing value imputation for mixed data through Gaussian copula

R/helper_core.R
In udellgroup/mixedgcImp: Missing value imputation for mixed data through Gaussian copula

Defines functions get_cond_dist imputeZ_mixedgc_ppca quad_mul quad sum_2d_scale sum_3d_scale scale_corr impute_init to_nearest_ord_vec to_nearest_ord factor_to_num regularize_corr index_int_to_logi to_numeric_matrix sum_list_len observed

Documented in get_cond_dist impute_init imputeZ_mixedgc_ppca quad scale_corr sum_2d_scale sum_3d_scale to_numeric_matrix

R Package Documentation

Browse R Packages

We want your feedback!

udellgroup/mixedgcImp Missing value imputation for mixed data through Gaussian copula

R/helper_core.R In udellgroup/mixedgcImp: Missing value imputation for mixed data through Gaussian copula

Defines functions get_cond_dist imputeZ_mixedgc_ppca quad_mul quad sum_2d_scale sum_3d_scale scale_corr impute_init to_nearest_ord_vec to_nearest_ord factor_to_num regularize_corr index_int_to_logi to_numeric_matrix sum_list_len observed

Documented in get_cond_dist impute_init imputeZ_mixedgc_ppca quad scale_corr sum_2d_scale sum_3d_scale to_numeric_matrix

R Package Documentation

Browse R Packages

We want your feedback!

udellgroup/mixedgcImp
Missing value imputation for mixed data through Gaussian copula

R/helper_core.R
In udellgroup/mixedgcImp: Missing value imputation for mixed data through Gaussian copula