R/map_word_to_step.R
In Sysrecon: Systematical Metabolic Reconstruction

Documented in map_word_to_step

#' @title Map text vocabulary onto metabolic reconstruction steps
#' @description
#' Scores every row of `stepsMatrix` by the share of its marker words that
#' occur in the text vocabulary, and estimates how strongly the step is used
#' from the frequencies of the matching vocabulary words. Steps whose score is
#' below their own threshold are zeroed out.
#' @param wordsMatrix The word matrix generated by the function
#'   TermDocumentMatrix in the tm package. The columns `prevalent`, `longest`,
#'   `shortest` (three spellings of each word) and `freq` (its frequency) are
#'   read.
#' @param stepsMatrix Manually constructed metabolic process matrix. The
#'   columns `MarkerWords` (marker words separated by `;`), `ThresholdValue`
#'   and `Steps` are read by name; columns 1, 2 and 4 are dropped from the
#'   result and columns 6 onwards are set to 0 for steps below the threshold.
#'   `NA` entries are treated as 0.
#' @return Specific metabolic process matrix based on text content mapping: a
#'   data frame with one row per step (row names taken from `Steps`), a leading
#'   `degree` column, and the remaining columns of `stepsMatrix`.
#' @details
#' Each marker word is used as a case-insensitive regular expression and is
#' matched against all three spellings of every vocabulary word. The score of
#' a step is the fraction of its marker words with at least one match. The
#' degree of a step is the sum, over its marker words, of the smallest
#' frequency among the matching vocabulary words (0 when nothing matches).
#' @import stringr
#' @export
#' @examples
#' \donttest{matrixProcess <- map_word_to_step(wordsMatrix, stepsMatrix)}

map_word_to_step <- function(wordsMatrix, stepsMatrix) {

  stepsMatrix[is.na(stepsMatrix)] <- 0

  # Pool the three spellings of every vocabulary word. The pool is NOT
  # de-duplicated: position k of `allwords` must keep describing the same word
  # as position k of `freq`, otherwise the logical mask returned by grepl()
  # selects frequencies that belong to other words.
  allwords <- c(
    as.character(wordsMatrix$prevalent),
    as.character(wordsMatrix$longest),
    as.character(wordsMatrix$shortest)
  )
  freq <- rep(wordsMatrix$freq, 3)

  # Split a ';'-separated marker string into its fields.
  # strsplit() silently drops a trailing empty field; appending one separator
  # keeps every field, so the number of marker words (the score denominator)
  # is the same as with stringr::str_split(x, ';').
  split_markers <- function(x) {
    strsplit(paste0(x, ";"), ";", fixed = TRUE)[[1]]
  }

  # Smallest frequency among the matched words, or 0 when nothing matched
  # (avoids calling min() on an empty vector, which warns and returns Inf).
  min_matched_freq <- function(hit) {
    matched <- freq[hit]
    if (length(matched) == 0L) 0 else min(matched)
  }

  # One pass per step: which marker words occur in the vocabulary, and at
  # what minimum frequency.
  stats <- lapply(stepsMatrix$MarkerWords, function(x) {
    hits <- lapply(split_markers(x), function(word) {
      grepl(word, allwords, ignore.case = TRUE)
    })
    found <- vapply(hits, any, logical(1))
    c(
      score = sum(found) / length(found),
      degree = sum(vapply(hits, min_matched_freq, numeric(1)))
    )
  })
  score <- vapply(stats, `[[`, numeric(1), "score")
  degree <- vapply(stats, `[[`, numeric(1), "degree")

  # Steps whose score does not reach their threshold are switched off: their
  # value columns (6 onwards) and their degree become 0.
  below_threshold <- !(score >= stepsMatrix$ThresholdValue)
  stepsMatrix[below_threshold, -(1:5)] <- 0
  degree[below_threshold] <- 0

  # Drop columns 1, 2 and 4, move the step names into the row names, and put
  # the degree column first. drop = FALSE keeps a data frame even when a
  # single column is left.
  steps <- data.frame(stepsMatrix[, -c(1, 2, 4), drop = FALSE])
  rownames(steps) <- steps$Steps
  steps <- steps[, -1, drop = FALSE]

  steps$degree <- degree
  steps[, c(ncol(steps), seq_len(ncol(steps) - 1L)), drop = FALSE]
}