Nothing
#' Compute pointwise mutual information (PMI)
#'
#' @param spm_cooc Co-occurrence sparse matrix, either a triangular sparse
#' matrix or a dataframe
#'
#' @return PMI symmetric matrix
#'
#' @examples
#' df_ehr = data.frame(Patient = c(1, 1, 2, 1, 2, 1, 1, 3, 4),
#' Month = c(1, 1, 1, 2, 2, 3, 3, 4, 4),
#' Parent_Code = c('C1', 'C2', 'C2', 'C1', 'C1', 'C1',
#' 'C2', 'C3', 'C4'),
#' Count = 1:9)
#'
#' spm_cooc = build_df_cooc(df_ehr)
#'
#' m_pmi = get_pmi(spm_cooc)
#'
#' @export
get_pmi = function(spm_cooc) {
if (!inherits(spm_cooc, 'data.frame')) spm_cooc = spm_to_df(spm_cooc)
code_freqs = subset(spm_cooc, V1 == V2) %$% setNames(value, V1)
code_freqs = code_freqs[order(names(code_freqs))]
row_sums = spm_cooc %$% split(value, V1) %>% sapply(sum)
row_sums = row_sums + sapply(split(spm_cooc$value, spm_cooc$V2), sum)
# diagonal is counted twice
row_sums = row_sums - code_freqs
total_sum = sum(spm_cooc$value) * 2 - sum(code_freqs)
denoms = row_sums[spm_cooc$V1] * row_sums[spm_cooc$V2]
spm_cooc$value = log(spm_cooc$value * total_sum / denoms)
m_pmi = as.matrix(build_spm_cooc_sym(spm_cooc))
}
#' Compute random singular value decomposition (rSVD)
#'
#' Random SVD is an efficient approximation of truncated SVD, in which only the
#' first principal components are returned. It is computed with the rsvd
#' package, and the author suggests that the number of dimensions requested k
#' should be: k < n / 4, where n is the number of features, for it to be
#' efficient, and that otherwise one should rather use either SVD or truncated
#' SVD.
#' When computing SVD on PMI, we only want to use the singular values
#' corresponding to the positive eigen values. We do not know beforehand how
#' many we will have to filter out, so there is two parameters: 'embedding_dim'
#' for the requested output dimension, and 'svd_rank' for the actual SVD
#' computation, by default twice the requested dimension, and a warning may be
#' thrown if 'svd_rank' needs to be manually increased.
#' Computation may be expensive and manually optimizing the 'svd_rank'
#' parameter might save significant time.
#'
#' @param m_pmi Pointwise mutual information matrix.
#' @param embedding_dim Number of output embedding dimensions requested.
#' @param svd_rank Number of SVD dimensions to compute.
#'
#' @return SVD rectangular matrix
#' @examples
#' df_ehr = data.frame(Patient = c(1, 1, 2, 1, 2, 1, 1, 3, 4),
#' Month = c(1, 1, 1, 2, 2, 3, 3, 4, 4),
#' Parent_Code = c('C1', 'C2', 'C2', 'C1', 'C1', 'C1',
#' 'C2', 'C3', 'C4'),
#' Count = 1:9)
#'
#' spm_cooc = build_df_cooc(df_ehr)
#'
#' m_pmi = get_pmi(spm_cooc)
#' m_svd = get_svd(m_pmi, embedding_dim = 2)
#'
#' @export
get_svd = function(m_pmi, embedding_dim = 100, svd_rank = embedding_dim * 2) {
row_names = rownames(m_pmi)
m_pmi = rsvd::rsvd(m_pmi, svd_rank)
idx = which(sign(m_pmi$u[1, ]) == sign(m_pmi$v[1, ]))
if (embedding_dim > 0) {
idx %<>% head(embedding_dim)
if (length(idx) < embedding_dim) {
warning('The output embedding has less dimensions than required. Try increasing the svd_rank parameter.')
}
}
m_pmi = t(t(m_pmi$u[, idx, drop = FALSE]) * sqrt(m_pmi$d[idx]))
#m_pmi = m_pmi / apply(m_pmi, 2, norm, '2')
rownames(m_pmi) = row_names
m_pmi
}
#' Build symmetric sparse matrix from data frame
#'
#' @param df_cooc Symmetric sparse matrix in data frame format
#'
#' @return Matrix::sparseMatrix object, symmetric sparse matrix
#'
#' @export
build_spm_cooc_sym = function(df_cooc) {
uniq_codes = union(df_cooc[[1]], df_cooc[[2]])
Matrix::sparseMatrix(match(df_cooc$V1, uniq_codes),
match(df_cooc$V2, uniq_codes), x = df_cooc$value,
symmetric = TRUE, dims = rep(length(uniq_codes), 2),
dimnames = list(uniq_codes, uniq_codes))
}
#' Write sparse matrix to dataframe
#'
#' @param spm Sparse matrix
#'
#' @return Data frame
#'
#' @export
spm_to_df = function(spm) {
spm_names = rownames(spm)
spm = setNames(as.data.frame(Matrix::summary(spm)), c('V1', 'V2', 'value'))
data.frame(V1 = spm_names[spm$V1], V2 = spm_names[spm$V2],
value = spm$value)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.