R/bm25.R
In superml: Build Machine Learning Models Like Using Python's Scikit-Learn Library in R

#' Best Matching(BM25) - Deprecated
#'
#' Computer BM25 distance between sentences/documents.
#'
#' @details
#' BM25 stands for Best Matching 25. It is widely using for ranking documents
#' and a preferred method than TF*IDF scores. It is used to find the similar
#' documents from a corpus, given a new document. It is popularly used in
#' information retrieval systems. This implementation uses multiple cores for
#' faster and parallel computation.
#' @keywords internal

bm25 <- R6::R6Class("bm25", public = list(

    #' @field corpus a list containing sentences
    corpus = NA,
    #' @field use_parallel enables parallel computation, defaults to FALSE
    use_parallel = FALSE,

    #' @details
    #' Create a new `bm25` object.
    #'
    #' @param corpus list, a list containing sentences
    #' @param use_parallel logical, enables parallel computation, defaults to FALSE. if TRUE uses n - 1 cores.
    #'
    #' @return A `bm25` object.
    #'
    #' example <- c('white audi 2.5 car','black shoes from office',
    #'              'new mobile iphone 7','audi tyres audi a3',
    #'              'nice audi bmw toyota corolla')
    #' obj <- bm25$new(example, use_parallel=FALSE)
    #' @keywords internal

    initialize = function(corpus, use_parallel){
        if (!(missing(corpus))) self$corpus <- private$transform(corpus)
        if (!(missing(use_parallel))) self$use_parallel <- use_parallel

        if (isTRUE(self$use_parallel)) {
            message('Computation will be done parallelly using all CPU cores.')
        } else {
            message('to activate parallel computation, set use_parallel=TRUE')
        }

    },


    #' @details
    #' Returns a list of the most similar sentence
    #'
    #' @param document character, for this value we find most similar sentences.
    #' @param topn integer, top n sentences to retrieve
    #' @return a vector of most similar documents
    #'
    #'
    #' example <- c('white audi 2.5 car','black shoes from office',
    #'              'new mobile iphone 7','audi tyres audi a3',
    #'              'nice audi bmw toyota corolla')
    #' get_bm <- bm25$new(example, use_parallel=FALSE)
    #' input_document <- c('white toyota corolla')
    #' get_bm$most_similar(document = input_document, topn = 2)
    #' @keywords internal

    most_similar = function(document, topn=1){
        # sort the list by values
        aa <- private$compute(document, self$corpus, self$use_parallel)
        ## return(aa) contains a list of documents & scores
        return(names(aa[order(unlist(aa), decreasing = T)][seq(topn)]))
    }
), private = list(

    calculate_idf = function(q, corpus){

        # check the token exists in how many documents
        q_in_ns <- sum(vapply(corpus,
                              function(x) q %in% x,
                              FUN.VALUE = logical(1)))

        # check token length
        corpus_len <- length(corpus)
        return(log((corpus_len - q_in_ns + 0.5) / (q_in_ns + 0.5)))

    },

    # tokenize the input
    transform = function(corpus){
        return(vapply(corpus,
                       function(x) strsplit(x, split = " "),
                       FUN.VALUE = list(1)))
    },


    bmscore = function(q, document_from_corpus, corpus){

        # constant values
        b <- 0.75
        k1 <- 1.25

        freq_q <- sum(document_from_corpus == q)
        doc_len <- length(document_from_corpus)
        mean_doc_len <- mean(vapply(corpus, length, FUN.VALUE = integer(1)))
        return(private$calculate_idf(q, corpus) *
                    ((freq_q * (k1 + 1)) /
                         ((freq_q + k1) *
                              (1 - b + b *
                                   (doc_len / mean_doc_len)))))

    },

    compute = function(document, corpus, use_parallel){

        if (isTRUE(self$use_parallel)) {
            # devtools uses 2 cores max. to check parallel processes
            # but here removed parameters to set cores.
            cores <- parallel::detectCores()
              # use single core

            message('using ', cores, ' cores for computation')
        } else {
            cores <- 1 # use single core
        }

        # document = your document should be a tokenized vector
        document <- unlist(strsplit(document, split = " "))
        temp_score <- parallel::mclapply(corpus,
                                         function(x) sum(vapply(document, function(y)
                                             private$bmscore(y, x, corpus), FUN.VALUE = numeric(1)))
                                         , mc.cores = cores)

        names(temp_score) <- names(corpus)
        return(temp_score)
    }



))

Any scripts or data that you put into this service are public.

superml documentation built on May 29, 2024, 1:09 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

superml
Build Machine Learning Models Like Using Python's Scikit-Learn Library in R

R/bm25.R
In superml: Build Machine Learning Models Like Using Python's Scikit-Learn Library in R

Try the superml package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

superml Build Machine Learning Models Like Using Python's Scikit-Learn Library in R

R/bm25.R In superml: Build Machine Learning Models Like Using Python's Scikit-Learn Library in R

Try the superml package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

superml
Build Machine Learning Models Like Using Python's Scikit-Learn Library in R

R/bm25.R
In superml: Build Machine Learning Models Like Using Python's Scikit-Learn Library in R