#' @title Word Set Significane Analysis (WSSA) for a given word set in a corpus.
#'
#' @description Performs a word set significance analysis in a corpus for a given
#' topic of interest, specified by the \code{header_set}, with a \code{control_set}
#' forming the background.
#'
#' @param model A word2vec VectorSpace model ouput.
#' @param model_boot If not NULL, a list of resampled Vectospace model outputs generated by
#' applying word2vec on resampled corpus.Defaults to NULL, which ignores the
#' resampling.
#' @param word_set A set of words (a vector of words) whose significance is to be tested.
#' @param header_set a word or a vector of words defining a topic of interest.
#' @param control_set A vector of words forming the background. Defaults to NULL in which case
#' the function chooses a random set of words from the vocabulary.
#' @param num_controls The number of control words chosen for background if \code{control_set}
#' is NULL.
#' @param fgsea.control The control parameters for the GSEA model fitting using the fgsea() package.
#'
#' @return Returns an output of the significance analysis (p-values and expected score) for the
#' \code{word_set} of interest in the context of the \code{header_set} with the
#' \code{control_set} as background.
#' @importFrom fgsea fgsea
#' @importFrom wordVectors cosineSimilarity
#'
#' @export
wssa <- function(model,
model_boot = NULL,
word_set,
header_set,
control_set = NULL,
num_controls = 5000,
fgsea.control = list()){
fgsea.control.default <- list(minSize=5,
maxSize=500, nperm=10000,
nproc=0,
gseaParam=1,
BPPARAM=NULL)
fgsea.control <- modifyList(fgsea.control.default, fgsea.control)
if(is.null(word_set)){
stop("No word set provided to perform WSEA")
}
if(is.null(header_set)){
stop("header set not provided: word group comparisons not possible")
}
if(is.null(control_set)){
control_set <- sample(rownames(model), num_controls, replace = FALSE)
control_set <- setdiff(control_set, word_set)
}
if(is.null(model_boot)){
message("WSSA table prep w/o bootstrap")
word_cos <- as.data.frame(wordVectors::cosineSimilarity(model[[word_set, average = FALSE]], model[[header_set, average = FALSE]]))
control_cos <- as.data.frame(wordVectors::cosineSimilarity(model[[control_set, average = FALSE]], model[[header_set, average = FALSE]]))
pooled_cos <- rbind(word_cos, control_cos)
if(dim(pooled_cos)[1] >= 2){
pooled_cos_update <- rowMeans(pooled_cos)
}else{
pooled_cos_update <- pooled_cos
}
pooled_cos_vec <- as.numeric(unlist(pooled_cos_update))
names(pooled_cos_vec) <- rownames(pooled_cos)
}
if(!is.null(model_boot)){
message("WSSA table prep by bootstrap")
word_cos <- bootse_calc(word_set, header_set, model_boot = model_boot)
control_cos <- bootse_calc(control_set, header_set, model_boot = model_boot)
pooled_cos <- rbind(word_cos, control_cos)
pooled_cos_update <- pooled_cos[,1]
pooled_cos_vec <- as.numeric(unlist(pooled_cos_update))
names(pooled_cos_vec) <- rownames(pooled_cos)
}
word_set_list <- list()
word_set_list[[1]] <- word_set
message("Performing WSSA")
wssa <- do.call(fgsea::fgsea, append(list(pathways = word_set_list,
stats = pooled_cos_vec),
fgsea.control))
message("Finished successfully !")
return(wssa)
}
bootse_calc <- function(word_set,
header_set,
model_boot){
mat <- matrix(0, length(word_set), (length(model_boot)+1))
ss <- as.data.frame(cosineSimilarity(model[[word_set, average = FALSE]],
model[[header_set, average = FALSE]]))
if(dim(ss)[2] > 1){
ss1 <- rowMeans(ss)
}else{
ss1 <- ss
}
indices <- match(rownames(ss), word_set)
mat[indices, 1] <- as.numeric(unlist(ss1))
for(h in 1:length(model_boot)){
tt <- as.data.frame(cosineSimilarity(model_boot[[h]][[word_set, average = FALSE]], model_boot[[h]][[header_set, average = FALSE]]))
if(dim(tt)[2] > 1){
tt1 <- rowMeans(tt)
}else{
tt1 <- tt
}
temp_ids <- match(rownames(tt), word_set)
mat[temp_ids, (h+1)] <- as.numeric(unlist(tt1))
}
temp_x <- mat[,1]
temp_sd <- apply(mat, 1, function(x) return(sd(x[!is.na(x)])))
temp <- as.vector(temp_x/temp_sd)
temp <- matrix(temp, ncol=1)
rownames(temp) <- word_set
return(temp)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.