R/generate_term_dataset.R

Defines functions generate.term.dataset

Documented in generate.term.dataset

#' Generate frequencies of terms for each paper
#'
#' For every non-empty element of \code{cleaned_text} the text is joined into
#' a single document, turned into a term-document matrix (punctuation,
#' numbers and stopwords removed, lower-cased, no stemming) and the counts of
#' all terms matching any of \code{keywords} are returned.
#'
#' @param cleaned_text output of 'clean.text' - a list of cleaned text files
#' @param in_dir directory with input text files. Not used by the
#'   computation; kept in the signature so existing calls keep working.
#' @param keywords A set of keywords as characters (i.e. traits of interest)
#'   in a vector. The keywords are joined with \code{|} and matched as a
#'   regular expression against each term, so a term that merely contains a
#'   keyword is also counted.
#'
#' @return A list with one element per element of \code{cleaned_text}. Each
#'   element is a named numeric vector of keyword-matching term frequencies
#'   sorted in decreasing order (empty if no term matched), or \code{NULL}
#'   when the corresponding paper had no text.
#' @export
#'
#' @examples
#' \dontrun{
#' download.file("https://github.com/ajhelmstetter/papieRmache/raw/master/inst/extdata/test_pdfs.zip", destfile = "./test_pdfs.zip")
#' unzip("./test_pdfs.zip")
#' kw <- c("bisse", "musse")
#' ct <- clean.text(in_dir = "./test_pdfs/", all_keywords = kw)
#' generate.term.dataset(cleaned_text = ct, in_dir = "./test_pdfs/", keywords = kw)
#' }
#'
generate.term.dataset <- function(cleaned_text, in_dir, keywords) {

    # one slot per paper so the result always lines up with 'cleaned_text';
    # slots belonging to empty papers stay NULL
    sorted_words <- vector("list", length(cleaned_text))

    # the keyword regex does not depend on the paper, so build it once
    keyword_pattern <- paste(keywords, collapse = "|")

    # seq_along() yields an empty sequence for empty input (1:0 would not)
    for (k in seq_along(cleaned_text)) {
        if (length(cleaned_text[[k]]) == 0) {
            next
        }

        # collapse all text chunks of this paper into a single string
        combined_text <- stri_join_list(list(cleaned_text[[k]]))

        # change text to corpus format
        corpus_txt <- Corpus(VectorSource(combined_text))

        # strip punctuation (including Unicode punctuation)
        corp <- tm_map(corpus_txt, removePunctuation, ucp = TRUE)

        # make term matrix
        corp.tdm <- TermDocumentMatrix(
            corp,
            control = list(
                removePunctuation = TRUE,
                stopwords = TRUE,
                tolower = TRUE,
                stemming = FALSE,
                removeNumbers = TRUE
            )
        )

        # every term that occurs at least once, narrowed to keyword matches
        ft <- findFreqTerms(corp.tdm, lowfreq = 1, highfreq = Inf)
        ft <- ft[grepl(keyword_pattern, ft)]

        if (length(ft) == 0) {
            # no keyword found in this paper: empty named frequency vector
            sorted_words[[k]] <- stats::setNames(numeric(0), character(0))
            next
        }

        # change into matrix and sort by frequency
        ft.tdm <- as.matrix(corp.tdm[ft, ])
        sorted_words[[k]] <- sort(rowSums(ft.tdm), decreasing = TRUE)
    }

    sorted_words

}
ajhelmstetter/papieRmache documentation built on March 30, 2024, 9:22 p.m.