R/wssa.R

#' @title Word Set Significane Analysis (WSSA) for a given word set in a corpus.
#'
#' @description Performs a word set significance analysis in a corpus for a given
#'              topic of interest, specified by the \code{header_set}, with a \code{control_set}
#'              forming the background.
#'
#' @param model A word2vec VectorSpace model ouput.
#' @param model_boot If not NULL, a list of resampled Vectospace model outputs generated by
#'                   applying word2vec on resampled corpus.Defaults to NULL, which ignores the
#'                   resampling.
#' @param word_set A set of words (a vector of words) whose significance is to be tested.
#' @param header_set a word or a vector of words defining a topic of interest.
#' @param control_set A vector of words forming the background. Defaults to NULL in which case
#'                    the function chooses a random set of words from the vocabulary.
#' @param num_controls The number of control words chosen for background if \code{control_set}
#'                     is NULL.
#' @param fgsea.control The control parameters for the GSEA model fitting using the fgsea() package.
#'
#' @return Returns an output of the significance analysis (p-values and expected score) for the
#'         \code{word_set} of interest in the context of the \code{header_set} with the
#'         \code{control_set} as background.
#' @importFrom fgsea fgsea
#' @importFrom wordVectors cosineSimilarity
#'
#' @export

wssa <- function(model,
                 model_boot = NULL,
                 word_set,
                 header_set,
                 control_set = NULL,
                 num_controls = 5000,
                 fgsea.control = list()){

  fgsea.control.default <- list(minSize=5,
                                maxSize=500, nperm=10000,
                                nproc=0,
                                gseaParam=1,
                                BPPARAM=NULL)
  fgsea.control <- modifyList(fgsea.control.default, fgsea.control)

  if(is.null(word_set)){
    stop("No word set provided to perform WSEA")
  }

  if(is.null(header_set)){
    stop("header set not provided: word group comparisons not possible")
  }

  if(is.null(control_set)){
    control_set <- sample(rownames(model), num_controls, replace = FALSE)
    control_set <- setdiff(control_set, word_set)
  }

  if(is.null(model_boot)){
    message("WSSA table prep w/o bootstrap")
    word_cos <- as.data.frame(wordVectors::cosineSimilarity(model[[word_set, average = FALSE]], model[[header_set, average = FALSE]]))
    control_cos <- as.data.frame(wordVectors::cosineSimilarity(model[[control_set, average = FALSE]], model[[header_set, average = FALSE]]))

    pooled_cos <- rbind(word_cos, control_cos)
    if(dim(pooled_cos)[1] >= 2){
      pooled_cos_update <- rowMeans(pooled_cos)
    }else{
      pooled_cos_update <- pooled_cos
    }
    pooled_cos_vec <- as.numeric(unlist(pooled_cos_update))
    names(pooled_cos_vec) <- rownames(pooled_cos)
  }

  if(!is.null(model_boot)){
    message("WSSA table prep by bootstrap")
    word_cos <- bootse_calc(word_set, header_set, model_boot = model_boot)
    control_cos <-  bootse_calc(control_set, header_set, model_boot = model_boot)
    pooled_cos <- rbind(word_cos, control_cos)
    pooled_cos_update <- pooled_cos[,1]
    pooled_cos_vec <- as.numeric(unlist(pooled_cos_update))
    names(pooled_cos_vec) <- rownames(pooled_cos)
  }

  word_set_list <- list()
  word_set_list[[1]] <- word_set

  message("Performing WSSA")
  wssa <- do.call(fgsea::fgsea, append(list(pathways = word_set_list,
                                           stats = pooled_cos_vec),
                                      fgsea.control))
  message("Finished successfully !")
  return(wssa)
}



bootse_calc <- function(word_set,
                        header_set,
                        model_boot){
  mat <- matrix(0, length(word_set), (length(model_boot)+1))

  ss <- as.data.frame(cosineSimilarity(model[[word_set, average = FALSE]],
                                       model[[header_set, average = FALSE]]))

  if(dim(ss)[2] > 1){
    ss1 <- rowMeans(ss)
  }else{
    ss1 <- ss
  }

  indices <- match(rownames(ss), word_set)
  mat[indices, 1] <- as.numeric(unlist(ss1))



  for(h in 1:length(model_boot)){
    tt <- as.data.frame(cosineSimilarity(model_boot[[h]][[word_set, average = FALSE]], model_boot[[h]][[header_set, average = FALSE]]))

    if(dim(tt)[2] > 1){
      tt1 <- rowMeans(tt)
    }else{
      tt1 <- tt
    }

    temp_ids <- match(rownames(tt), word_set)
    mat[temp_ids, (h+1)] <- as.numeric(unlist(tt1))
  }

  temp_x <- mat[,1]
  temp_sd <- apply(mat, 1, function(x) return(sd(x[!is.na(x)])))

  temp <- as.vector(temp_x/temp_sd)
  temp <- matrix(temp, ncol=1)
  rownames(temp) <- word_set
  return(temp)
}
kkdey/WEAVER documentation built on May 8, 2019, 9:24 a.m.