R/parseSentence.R

Defines functions parseSentence trueCommas cleanPunctuation

Documented in parseSentence

#' A function to make GDD NLP output human-readable
#'
#' This function will convert a sentence from the raw GeoDeepDive nlp output tables into a more human-readable format.
#'
#' @param Sentence a record (row) in the GDD NLP output; its names are used to look up the requested properties
#' @param Parameters a character vector naming the sentence properties (elements of \code{Sentence}) to parse
#'
#' @details This function will convert a sentence from the raw GeoDeepDive nlp output tables into a more human-readable format. It returns a matrix where each column represents an individual word in the sentence, and each row represents a specific property of the word - e.g., poses, dep_parents, dep_paths. Each requested property is split on commas, so all of them must contain the same number of elements.
#'
#' @return A character matrix with one row per requested property and one column per word (columns are named 1, 2, ...), or \code{NA} if the requested properties do not all split into the same number of elements.
#'
#' @author Andrew A. Zaffos & Erika T. Ito
#'
#' @examples
#'
#' Sentence <- c(
#'   words = "{Rocks,erode,slowly}",
#'   dep_paths = "{nsubj,root,advmod}",
#'   dep_parents = "{2,0,2}"
#' )
#' parseSentence(Sentence)
#'
#' @rdname parseSentence
#' @export
# Parse the NLP strings into a matrix format
parseSentence <- function(Sentence, Parameters = c("words", "dep_paths", "dep_parents")) {
  # gsub() may drop the names when it coerces its input, so restore them
  Sentence <- stats::setNames(cleanPunctuation(Sentence), names(Sentence))
  if ("words" %in% names(Sentence)) {
    Sentence["words"] <- trueCommas(Sentence["words"])
  }
  # Each property is a single string; strsplit() returns a length-one list,
  # so take its only element to get the vector of tokens
  WordsMatrix <- lapply(Sentence[Parameters], function(x) strsplit(x, ",")[[1]])
  # Every property must yield the same number of tokens. Compare all counts:
  # summing successive differences only compares the first with the last, and
  # rbind() would silently recycle the shorter rows.
  TokenCounts <- vapply(WordsMatrix, length, integer(1))
  if (length(unique(TokenCounts)) > 1L) {
    return(NA)
  }
  WordsMatrix <- do.call(rbind, WordsMatrix)
  # Undo the placeholders inserted by cleanPunctuation()
  WordsMatrix[which(WordsMatrix == "COMMASUB")] <- ","
  WordsMatrix[which(WordsMatrix == "SPACESUB")] <- ""
  # seq_len() rather than 1:ncol() so that zero columns never yields c(1, 0)
  colnames(WordsMatrix) <- seq_len(ncol(WordsMatrix))
  return(WordsMatrix)
}

# R confuses 2,000,381 in a PostgreSQL array as 2 000 381, this function will convert those cases to 2000381.
# Words: a string holding the comma-separated array; double-quoted substrings
#        are located in its first element only.
# Returns Words unchanged when it contains no double-quoted substring;
# otherwise a noquote-classed copy in which the commas inside every quoted
# substring have been removed (the surrounding quotes are kept).
trueCommas <- function(Words) {
  InsideQuotes <- regmatches(Words, gregexpr('"[^"]*"', Words))[[1]]
  if (length(InsideQuotes) < 1) {
    return(Words)
  }
  Replacements <- gsub(",", "", InsideQuotes, fixed = TRUE)
  for (i in seq_along(InsideQuotes)) {
    # fixed = TRUE: the quoted text must be matched literally. Treated as a
    # regular expression, tokens containing characters such as + ( [ would
    # either fail to match themselves or be rejected as an invalid pattern.
    Words <- gsub(InsideQuotes[i], Replacements[i], Words, fixed = TRUE)
  }
  return(noquote(Words))
}

# Remove or replace problematic punctuation
# Even though this is redundant with trueCommas it applies to more fields
# Sentence: the character data to clean; every element is processed.
# Returns the cleaned character vector.
cleanPunctuation <- function(Sentence) {
  # Ordered (pattern, replacement) pairs. Order matters: the array braces are
  # stripped before the -LCB-/-RCB- tokens are turned back into literal braces.
  Substitutions <- list(
    c("\"\"", "SPACESUB"),
    c("\",\"", "COMMASUB"),
    c("\\{|\\}", ""),
    c("-LRB-", "("),
    c("-RRB-", ")"),
    c("-LCB-", "{"),
    c("-RCB-", "}")
  )
  for (Pair in Substitutions) {
    Sentence <- gsub(Pair[1], Pair[2], Sentence)
  }
  return(Sentence)
}
aazaff/geocarrot documentation built on May 5, 2019, 9:44 p.m.