R/mpmiExtraction.R

Defines functions mpmiExcraction, seed, Optics

Documented in mpmiExcraction

#' @name mpmiExcraction
#' @title Extract words based on the MPMI+BE algorithm
#' @description Extract words based on the MPMI+BE algorithm, which combines a
#'   PMI-based association score with branch entropy (BE).
#' @param df A one-column data frame of input strings.
#' @param ChineseCharFilter Drop non-Chinese characters. Defaults to TRUE.
#' @param lambda Weight balancing PMI against branch entropy in the final
#'   score; it directly affects the results. Defaults to 0.25.
#' @param optics Logical. If TRUE, the lambda weight is swept automatically
#'   over \code{steps} iterations; choose TRUE if you are unsure how to set
#'   lambda.
#' @param steps Number of iterations in the optimization process. Defaults to
#'   100.
#' @param threshold Percentile threshold (an integer between 1 and 100) applied
#'   to the final score; candidates scoring above that percentile are kept.
#' @param bayesianCutoff Threshold for the Bayesian probability used in the
#'   word-extension step.
#'
#' @return The extracted words: a character vector when \code{optics = FALSE},
#'   or a data frame of candidate words and their frequencies across the
#'   lambda sweep when \code{optics = TRUE}.
#' @export
#' @examples
#' \donttest{
#' mpmiExcraction(drugnames_test,
#'                ChineseCharFilter = T,
#'                lambda = 0.25,
#'                optics = F,
#'                threshold = 30,
#'                bayesianCutoff = 0.6)
#' }

mpmiExcraction <- function(df, ChineseCharFilter = T, lambda = 0.25, optics, steps = 100, threshold, bayesianCutoff){
  userDataImport(df, ChineseCharFilter = ChineseCharFilter)
  if (!requireNamespace("crayon", quietly = TRUE)) {
    stop("Package \"crayon\" needed for this function to work. Please install it.",
         call. = FALSE)
  }
  if (!requireNamespace("cli", quietly = TRUE)) {
    stop("Package \"cli\" needed for this function to work. Please install it.",
         call. = FALSE)
  }

  if (!requireNamespace("rwstats", quietly = TRUE)) {
    stop("Package \"rwstats\" needed for this function to work. Please install it.",
         call. = FALSE)
  }
  if (!requireNamespace("pbapply", quietly = TRUE)) {
    stop("Package \"pbapply\" needed for this function to work. Please install it.",
         call. = FALSE)
  }

  # userDataImport() is assumed to make the cleaned input available as
  # `userdf_t2h` (plus helpers such as `segWords`) in the global environment.

  # Process the imported data.
  cat(">>> Start Processing import file...", "\n")
  #assign("userOneChar", onecharFreq(userdf_t2h), envir = .GlobalEnv)
  #assign("userTwoChar", ncharFreq(userdf_t2h, 2, userOneChar), envir = .GlobalEnv)
  userOneChar <- onecharFreq(userdf_t2h)
  userTwoChar <- ncharFreq(userdf_t2h, 2, userOneChar)
  userThreeChar <- ncharFreq(userdf_t2h, 3)
  userFourChar <- ncharFreq(userdf_t2h, 4)
  userFiveChar <- ncharFreq(userdf_t2h, 5)
  cat("DONE ", crayon::green(cli::symbol$tick), "\n")

  # mining seed word
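  # seed() (defined below) keeps bigrams with positive PMI, splits them at the
  # median, and backfills the lower half against the reference corpus; the
  # result is the candidate table the MPMI+BE score is computed on.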
  twoCharFilter <- seed(userTwoChar, userOneChar)

  # process real world data.
  cat(">>> Merging and Filtering data...", "\n")

  threeCharFilter <- suppressWarnings(charFilter(userThreeChar, rwstats::threeChar))
  fourCharFilter <- suppressWarnings(charFilter(userFourChar, rwstats::fourChar))
  fiveCharFilter <- suppressWarnings(charFilter(userFiveChar, rwstats::fiveChar))
  #assign("threeCharFilter", suppressWarnings(charFilter(userThreeChar, rwstats::threeChar)), envir = .GlobalEnv)
  #assign("fourCharFilter", suppressWarnings(charFilter(userFourChar, rwstats::fourChar)), envir = .GlobalEnv)
  #assign("fiveCharFilter", suppressWarnings(charFilter(userFiveChar, rwstats::fiveChar)), envir = .GlobalEnv)
  cat("DONE ", crayon::green(cli::symbol$tick), "\n")


  #
  cat(">>> Branch Entropy calculating...", "\n")

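  # Branch entropy of a candidate string measures how unpredictable its
  # neighbouring characters are: H = -sum(p_i * log(p_i)) over the characters
  # observed adjacent to the string. branchEntropy() is assumed to be the
  # package helper that computes this for a single candidate.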
  #ptm <- proc.time()
  twoCharFilter$BE  <- unlist(pbapply::pblapply(twoCharFilter$character,
                                                   FUN =  function(x) branchEntropy(x)))

  #by(twoCharFilter,
  #   twoCharFilter$character,
  #   FUN = function(x) branchEntropy(as.character(x)))
  #proc.time() - ptm

  cat("DONE ", crayon::green(cli::symbol$tick), "\n")

  cat(">>> Normalization...", "\n")
  outliers <- graphics::boxplot(twoCharFilter$log, plot = FALSE)$out
  if (length(outliers) > 0) {
    twoCharFilter <- twoCharFilter[!(twoCharFilter$log %in% outliers), ]
  }
  twoCharFilter$normLog <- normalize(twoCharFilter$log)
  twoCharFilter$normBE <- normalize(twoCharFilter$BE)
  twoCharFilter$score <- with(twoCharFilter, (1 - lambda) * normLog - lambda * normBE)
  cat("DONE ", crayon::green(cli::symbol$tick), "\n")
  #assign("twoCharFilter", twoCharFilter, envir = .GlobalEnv)


}

seed <- function(userTwoChar, userOneChar) {

  if (!requireNamespace("stats", quietly = TRUE)) {
    stop("Package \"stats\" needed for this function to work. Please install it.",
         call. = FALSE)
  }
  if (!requireNamespace("cli", quietly = TRUE)) {
    stop("Package \"cli\" needed for this function to work. Please install it.",
         call. = FALSE)
  }

  cat(">>> Mining seeds...", "\n")
  userTwoChar <- userTwoChar[userTwoChar$test_log > 0,]
  userTwoChar <- userTwoChar[order(userTwoChar$test_log, decreasing = T),]
  userTwoCharPriority <- userTwoChar[userTwoChar$test_log >= as.double(stats::quantile(userTwoChar$test_log)[3]),]


  twoCharFilter <- suppressWarnings(charFilter(userTwoChar[userTwoChar$test_log < as.double(stats::quantile(userTwoChar$test_log)[3]),],
                                               two_char))
  twoCharFilter <- twoCharFilter[twoCharFilter$log >= as.double(stats::quantile(twoCharFilter$log)[2]),][,c("character")]
  userTwoCharPriority<- userTwoCharPriority[, c("character")]
  userTwoChar <- append(levels(droplevels(userTwoCharPriority)), twoCharFilter)
  tempchar <- ncharFreq(userdf_t2h, 2, userOneChar)
  userTwoChar <- tempchar[tempchar$character %in% userTwoChar,]
  userTwoChar$character <- as.character(userTwoChar$character)
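  # Pointwise mutual information of the bigram: log2 of the bigram probability
  # (pct) over the product of the single-character probabilities (A, B); the
  # column meanings are assumed from ncharFreq().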
  userTwoChar$test_log <- with(userTwoChar, log2(pct/(A*B)))
  twoCharFilter <- suppressWarnings(charFilter(userTwoChar, two_char))

  userTwoChar_drop <- userTwoChar[!(userTwoChar$character %in% twoCharFilter$character),]
  userTwoChar_drop <- userTwoChar_drop[,c("character","test_log")]
  colnames(userTwoChar_drop) <- c("character","log")

  twoCharFilter <- dplyr::bind_rows(userTwoChar_drop, twoCharFilter)
  cat("DONE ", crayon::green(cli::symbol$tick), "\n")
  return(twoCharFilter)
}

Optics <- function(steps, optics, threshold, lambda, bayesianCutoff,
                   twoCharFilter, threeCharFilter, fourCharFilter, fiveCharFilter) {
  if (!requireNamespace("progress", quietly = TRUE)) {
    stop("Package \"progress\" needed for this function to work. Please install it.",
         call. = FALSE)
  }
  if (!requireNamespace("stats", quietly = TRUE)) {
    stop("Package \"stats\" needed for this function to work. Please install it.",
         call. = FALSE)
  }
  if (!requireNamespace("cli", quietly = TRUE)) {
    stop("Package \"cli\" needed for this function to work. Please install it.",
         call. = FALSE)
  }
  if (!requireNamespace("crayon", quietly = TRUE)) {
    stop("Package \"crayon\" needed for this function to work. Please install it.",
         call. = FALSE)
  }
  if (optics == TRUE) {
    pb <- progress::progress_bar$new(
      format = "  Optimizating [:bar] :percent in :elapsed",
      total = steps, clear = FALSE, width= 60
    )
    cat(">>> Word mining...", "\n")
    num_char <- list()
    optlist <- list()
    n <- 100/steps
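    # Sweep the BE weight from 1/steps up to 1: at iteration i the score is
    # (1 - i/steps) * normLog + (i/steps) * normBE, since 0.01 * i * n = i/steps.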
    for (i in 1:steps) {
      twoCharFilter2 <- twoCharFilter
      twoCharFilter2$score <- with(twoCharFilter2,
                                   (1-0.01*i*n)*normLog+0.01*i*n*normBE)
      if (threshold >= 1 && threshold <= 100) {
        twoCharFilter2 <- twoCharFilter2[twoCharFilter2$score > stats::quantile(twoCharFilter2$score,
                                                                                probs = 1:100/100)[threshold],]
      } else {
        stop("threshold must be an integer between 1 and 100")
      }

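      # Extend the surviving bigrams one character at a time. nextWordMiner()
      # is assumed to return $cont (candidates worth extending further) and
      # $stop (candidates accepted as complete words), gated by bayesianCutoff.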
      cl <- nextWordMiner(twoCharFilter2,2,bayesianCutoff)
      cl$cont <- unique(cl$cont[cl$cont %in% threeCharFilter$character == TRUE])
      tri_char_final <- threeCharFilter[threeCharFilter$character %in% cl$cont,]

      # mining word length: 4
      cl2 <- nextWordMiner(tri_char_final,3,bayesianCutoff)
      cl2$cont <- unique(cl2$cont[cl2$cont %in% fourCharFilter$character == TRUE])
      qua_char_final <- fourCharFilter[fourCharFilter$character %in% cl2$cont,]

      # mining word length: 5
      cl3 <- nextWordMiner(qua_char_final,4,bayesianCutoff)
      cl3$cont <- unique(cl3$cont[cl3$cont %in% fiveCharFilter$character == TRUE])
      five_char_final <- fiveCharFilter[fiveCharFilter$character %in% cl3$cont,]


      # generate final list and save
      final_dn <- list()
      final_dn <- append(final_dn,cl$stop)
      final_dn <- append(final_dn,cl2$stop)
      final_dn <- append(final_dn,cl3$stop)

      #final_dn <- append(final_dn,cl$cont)
      #final_dn <- append(final_dn,cl2$cont)
      #final_dn <- append(final_dn,cl3$cont)
      final_dn <- unique(unlist(final_dn))
      optlist <- append(optlist, final_dn)
      num_char <- append(num_char, length(final_dn))
      pb$tick()
      Sys.sleep(1 / 100)
    }
    optlist <- as.data.frame(table(unlist(optlist)))
    #assign("optlist", as.data.frame(table(unlist(optlist))), envir = .GlobalEnv)
    num_char <- as.data.frame(unlist(num_char))
    cat("DONE ", crayon::green(cli::symbol$tick), "\n")
    return(optlist)
  } else {
    cat(">>> Word mining...", "\n")
    twoCharFilter2 <- twoCharFilter
    twoCharFilter2$score <- with(twoCharFilter2, (1-lambda)*normLog+lambda*normBE)
    twoCharFilter2 <- twoCharFilter2[twoCharFilter2$score > stats::quantile(twoCharFilter2$score, probs = 1:100/100)[threshold],]

    cl <- nextWordMiner(twoCharFilter2,2, bayesianCutoff)
    cl$cont <- unique(cl$cont[cl$cont %in% threeCharFilter$character == TRUE])
    tri_char_final <- threeCharFilter[threeCharFilter$character %in% cl$cont,]

    # mining word length: 4
    cl2 <- nextWordMiner(tri_char_final,3, bayesianCutoff)
    cl2$cont <- unique(cl2$cont[cl2$cont %in% fourCharFilter$character == TRUE])
    qua_char_final <- fourCharFilter[fourCharFilter$character %in% cl2$cont,]

    # mining word length: 5
    cl3 <- nextWordMiner(qua_char_final,4, bayesianCutoff)
    cl3$cont <- unique(cl3$cont[cl3$cont %in% fiveCharFilter$character == TRUE])
    five_char_final <- fiveCharFilter[fiveCharFilter$character %in% cl3$cont,]


    # generate final list and save
    final_dn <- list()
    final_dn <- append(final_dn,cl$stop)
    final_dn <- append(final_dn,cl2$stop)
    final_dn <- append(final_dn,cl3$stop)

    final_dn <- append(final_dn,cl$cont)
    final_dn <- append(final_dn,cl2$cont)
    final_dn <- append(final_dn,cl3$cont)

    final_dn <- unique(unlist(final_dn))
    #assign("final_dn", unique(unlist(final_dn)), envir = .GlobalEnv)
    cat("DONE ", crayon::green(cli::symbol$tick), "\n")
    return(final_dn)
  }
}