R/pmi_svd.R
In nlpembeds: Natural Language Processing Embeddings

Documented in build_spm_cooc_sym get_pmi get_svd spm_to_df

#' Compute pointwise mutual information (PMI)
#'
#' @param spm_cooc Co-occurrence sparse matrix, either a triangular sparse
#'                 matrix or a dataframe
#'
#' @return PMI symmetric matrix
#'
#' @examples
#' df_ehr = data.frame(Patient = c(1, 1, 2, 1, 2, 1, 1, 3, 4),
#'                     Month = c(1, 1, 1, 2, 2, 3, 3, 4, 4),
#'                     Parent_Code = c('C1', 'C2', 'C2', 'C1', 'C1', 'C1',
#'                                     'C2', 'C3', 'C4'),
#'                     Count = 1:9)
#' 
#' spm_cooc = build_df_cooc(df_ehr)
#' 
#' m_pmi = get_pmi(spm_cooc)
#'
#' @export
get_pmi = function(spm_cooc) {

  if (!inherits(spm_cooc, 'data.frame')) spm_cooc = spm_to_df(spm_cooc)

  code_freqs = subset(spm_cooc, V1 == V2) %$% setNames(value, V1)
  code_freqs = code_freqs[order(names(code_freqs))]

  row_sums = spm_cooc %$% split(value, V1) %>% sapply(sum)
  row_sums = row_sums + sapply(split(spm_cooc$value, spm_cooc$V2), sum)

  # diagonal is counted twice
  row_sums = row_sums - code_freqs
  total_sum = sum(spm_cooc$value) * 2 - sum(code_freqs)

  denoms = row_sums[spm_cooc$V1] * row_sums[spm_cooc$V2]

  spm_cooc$value = log(spm_cooc$value * total_sum / denoms)

  m_pmi = as.matrix(build_spm_cooc_sym(spm_cooc))
}

#' Compute random singular value decomposition (rSVD)
#'
#' Random SVD is an efficient approximation of truncated SVD, in which only the
#' first principal components are returned. It is computed with the rsvd
#' package, and the author suggests that the number of dimensions requested k
#' should be: k < n / 4, where n is the number of features, for it to be
#' efficient, and that otherwise one should rather use either SVD or truncated
#' SVD.
#' When computing SVD on PMI, we only want to use the singular values
#' corresponding to the positive eigen values. We do not know beforehand how
#' many we will have to filter out, so there is two parameters: 'embedding_dim'
#' for the requested output dimension, and 'svd_rank' for the actual SVD
#' computation, by default twice the requested dimension, and a warning may be
#' thrown if 'svd_rank' needs to be manually increased.
#' Computation may be expensive and manually optimizing the 'svd_rank'
#' parameter might save significant time.
#'
#' @param m_pmi Pointwise mutual information matrix.
#' @param embedding_dim Number of output embedding dimensions requested.
#' @param svd_rank Number of SVD dimensions to compute.
#'
#' @return SVD rectangular matrix
#' @examples
#' df_ehr = data.frame(Patient = c(1, 1, 2, 1, 2, 1, 1, 3, 4),
#'                     Month = c(1, 1, 1, 2, 2, 3, 3, 4, 4),
#'                     Parent_Code = c('C1', 'C2', 'C2', 'C1', 'C1', 'C1',
#'                                     'C2', 'C3', 'C4'),
#'                     Count = 1:9)
#'
#' spm_cooc = build_df_cooc(df_ehr)
#'
#' m_pmi = get_pmi(spm_cooc)
#' m_svd = get_svd(m_pmi, embedding_dim = 2)
#'
#' @export
get_svd = function(m_pmi, embedding_dim = 100, svd_rank = embedding_dim * 2) {

  row_names = rownames(m_pmi)
  m_pmi = rsvd::rsvd(m_pmi, svd_rank)

  idx = which(sign(m_pmi$u[1, ]) == sign(m_pmi$v[1, ]))

  if (embedding_dim > 0) {

    idx %<>% head(embedding_dim)

    if (length(idx) < embedding_dim) {
      warning('The output embedding has less dimensions than required. Try increasing the svd_rank parameter.')
    }
  }

  m_pmi = t(t(m_pmi$u[, idx, drop = FALSE]) * sqrt(m_pmi$d[idx]))
  #m_pmi = m_pmi / apply(m_pmi, 2, norm, '2')

  rownames(m_pmi) = row_names

  m_pmi
}


#' Build symmetric sparse matrix from data frame
#'
#' @param df_cooc Symmetric sparse matrix in data frame format
#'
#' @return Matrix::sparseMatrix object, symmetric sparse matrix
#'
#' @export
build_spm_cooc_sym = function(df_cooc) {

  uniq_codes = union(df_cooc[[1]], df_cooc[[2]])

  Matrix::sparseMatrix(match(df_cooc$V1, uniq_codes),
                       match(df_cooc$V2, uniq_codes), x = df_cooc$value,
                       symmetric = TRUE, dims = rep(length(uniq_codes), 2),
                       dimnames = list(uniq_codes, uniq_codes))
}


#' Write sparse matrix to dataframe
#'
#' @param spm Sparse matrix
#'
#' @return Data frame
#'
#' @export
spm_to_df = function(spm) {

  spm_names = rownames(spm)
  spm = setNames(as.data.frame(Matrix::summary(spm)), c('V1', 'V2', 'value'))
  
  data.frame(V1 = spm_names[spm$V1], V2 = spm_names[spm$V2],
             value = spm$value)
}