#' @title Word Set Enrichment Analysis (WSEA) across corpora
#'
#' @description Performs a word set enrichment across multiple corpora in relation to a
#' topic of interest, specified by the \code{header_set}.
#'
#' @param model_list A list of VectorSpace models for a number of corpora. The length of the
#' list is equal to the number of corpora.
#' @param word_sets A list of vector of words (word sets) whose enrichment is to be studied.
#' @param header_set a word or a vector of words defining a topic of interest.
#' @param model_names The names of the corpora that are compared. Defaults to numerical groups.
#' @param add_groups A list of vectors indicating group labels that the user wants to compare.
#' Using \code{add_groups}, the user can compare a set of corpora with another
#' set of corpora beyod the 1v1 and 1vall comparisons the default version does.
#' @param numwords The number of neighboring words to each word in the word sets pooled into the
#' word set enrichment analysis.
#' @param fgsea.control The control parameters for the GSEA model fitting using the fgsea() package.
#'
#' @return Produces 1v1, 1vall WSEA output for all copora (each output similar to the fgsea output)
#' together with group based WSEA output as specified by the \code{add_groups}.
#'
#' @importFrom wordVectors nearest_to
#' @importFrom wordVectors cosineSimilarity
#' @importFrom fgsea fgsea
#'
#' @export
wsea <- function(model_list,
word_sets,
header_set,
model_names = NULL,
add_groups = NULL,
numwords = 1000,
fgsea.control = list()){
fgsea.control.default <- list(minSize=5,
maxSize=500, nperm=10000,
nproc=0,
gseaParam=1,
BPPARAM=NULL)
fgsea.control <- modifyList(fgsea.control.default, fgsea.control)
if(is.null(model_names)){
if(!is.null(names(model_list))){
model_names <- names(model_list)
}else{
model_names <- 1:length(model_list)
}
}
if(!is.null(add_groups)){
if(!is.list(add_groups)){
stop("add_groups must be a list if not NULL")
}
}
if(is.null(word_sets)){
stop("No word sets provided to perform WSEA")
}
if(is.null(header_set)){
stop("header sets not provided: word group comparisons not possible")
}
if(!is.list(word_sets)){
stop("word sets must be a list")
}
pooled_words <- unique(c(unlist(word_sets), header_set))
listed_words <- as.character()
for(k in 1:length(model_list)){
for(m in 1:length(pooled_words)){
listed_words <- c(listed_words, names(wordVectors::nearest_to(model_list[[k]],model_list[[k]][[paste0(pooled_words[m])]], numwords)))
}
}
listed_words <- unique(listed_words)
word_sets_extended <- vector(mode="list", length=length(word_sets))
for(l in 1:length(word_sets)){
for(h in 1:length(header_set)){
word_sets_extended[[l]] <- c(word_sets_extended[[l]], paste0(word_sets[[l]], "-", header_set[h]))
}
}
names <- c()
for(h in 1:length(header_set)){
names <- c(names, paste0(listed_words, "-", header_set[h]))
}
message("Building word2vec similarity matrix for related words")
mat <- matrix(0, length(listed_words)*length(header_set), length(model_list))
for(k in 1:length(model_list)){
## find cosine similarity of words from listed words occurring in model k
temp1 <- as.data.frame(wordVectors::cosineSimilarity(model_list[[k]][[header_set, average = FALSE]], model_list[[k]][[listed_words, average = FALSE]]))
## which words out of listed words occurred in the model k (kth newspaper)
indices <- match(colnames(temp1), listed_words)
## flling the matrix with the cosine similarities
for(i in 1:dim(temp1)[1]){
slice <- mat[((i-1)*length(listed_words)+1): (i*length(listed_words)),k]
slice[indices] <- as.numeric(unlist(temp1[i,]))
mat[((i-1)*length(listed_words)+1): (i*length(listed_words)), k] <- slice
}
}
num_zeros <- apply(mat, 1, function(x) return(length(which(x == 0))))
bad_indices <- which(num_zeros >=2)
rownames(mat) <- names
### removing indices/words that do not occur in 2 or more newspapers #####
mat_filtered <- mat[-bad_indices, ]
names_filtered <- names[-bad_indices]
### replacing NAs (at most 1 per row) by the row average
for(k in 1:dim(mat_filtered)[1]){
index <- which(mat_filtered[k, ] ==0)
if(length(index)!=0){
mat_filtered[k, index] <- mean(mat_filtered[k, -index])
}
}
### performing quantile normalization
message("Performing quantile normalization on the models")
norm_mat_filtered <- matrix(0, dim(mat_filtered)[1], dim(mat_filtered)[2])
norm_mat_filtered = preprocessCore::normalize.quantiles(as.matrix(mat_filtered))
rownames(norm_mat_filtered) <- rownames(mat_filtered)
######### performing WSEA ##########################
message("Performing WSEA")
norm_mat_names <- rownames(norm_mat_filtered)
wsea <- list()
message("Fitting the 1v1 WSEA")
model_names <- c("The Nation", "Slate", "Root", "National Review")
for(l in 2:dim(norm_mat_filtered)[2]){
for(ll in 1:(l-1)){
score <- as.vector(norm_mat_filtered[,l] - norm_mat_filtered[,ll])
names(score) <- norm_mat_names
wsea[[paste0("1v1: ", model_names[l], "-", model_names[ll])]] <- do.call(fgsea::fgsea, append(list(pathways = word_sets_extended,
stats = score), fgsea.control))
rm(score)
}
}
message("Fitting the 1v-all WSEA")
for(k in 1:dim(mat_filtered)[2]){
temp_score_one_vs_all <- norm_mat_filtered[,k] - norm_mat_filtered[,-k]
temp_score_min <- apply(temp_score_one_vs_all, 1, min)
names(temp_score_min) <- norm_mat_names
score <- (temp_score_min - mean(temp_score_min))/sd(temp_score_min)
wsea[[paste0("1v-all: ", model_names[k])]] <- do.call(fgsea::fgsea, append(list(pathways = word_sets_extended,
stats = score), fgsea.control))
rm(score)
}
if(!is.null(add_groups)){
message("Fitting the added groups WSEA")
for(m in 1:length(add_groups)){
if(!is.vector(add_groups[[m]])){
message("element of add_groups must be a vector")
}else{
temp_score_group <- norm_mat_filtered[,add_groups[[m]]] - norm_mat_filtered[,-add_groups[[m]]]
temp_score_group_min <- apply(temp_score_group, 1, min)
names(temp_score_group_min) <- norm_mat_names
score <- (temp_score_group_min - mean(temp_score_group_min))/sd(temp_score_group_min)
wsea[[paste0("group: ", m)]] <- do.call(fgsea::fgsea, append(list(pathways = word_sets_extended,
stats = score), fgsea.control))
rm(score)
}
}
}
message("Finished successfully !!")
ww <- list("wsea" = wsea, "matrix" = norm_mat_filtered)
return(ww)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.