#' NNS Term Matrix
#'
#' Generates a term matrix for text classification use in \link{NNS.reg}.
#'
#' @param x mixed data.frame; character/numeric; A two column dataset should be used. Concatenate text from original sources to comply with format. Also note the possibility of factors in \code{"DV"}, so \code{"as.numeric(as.character(...))"} is used to avoid issues.
#' @param oos mixed data.frame; character/numeric; Out-of-sample text dataset to be classified.
#' @return Returns the text as independent variables \code{"IV"} and the classification as the dependent variable \code{"DV"}. Out-of-sample independent variables are returned with \code{"OOS"}.
#' @references Viole, F. and Nawrocki, D. (2013) "Nonlinear Nonparametric Statistics: Using Partial Moments" (ISBN: 1490523995)
#' @examples
#' \dontrun{
#' x <- data.frame(cbind(c("sunny", "rainy"), c(1, -1)))
#' NNS.term.matrix(x)
#'
#' ### Concatenate Text with space separator, cbind with "DV"
#' x <- data.frame(cbind(c("sunny", "rainy"), c("windy", "cloudy"), c(1, -1)))
#' x <- data.frame(cbind(paste(x[ , 1], x[ , 2], sep = " "), as.numeric(as.character(x[ , 3]))))
#' NNS.term.matrix(x)
#'
#' ### NYT Example
#' require(RTextTools)
#' data(NYTimes)
#'
#' ### Concatenate Columns 3 and 4 containing text, with column 5 as DV
#' NYT <- data.frame(cbind(paste(NYTimes[ , 3], NYTimes[ , 4], sep = " "),
#' as.numeric(as.character(NYTimes[ , 5]))))
#' NNS.term.matrix(NYT)
#' }
#' @export
NNS.term.matrix <- function(x, oos = NULL){
if(any(class(x)%in%c("tbl","data.table"))) x <- as.data.frame(x)
p <- length(oos)
x <- t(t(x))
n <- nrow(x)
#Remove commas, etc.
mgsub <- function(pattern, x, ...) {
result <- x
for (i in 1:length(pattern)) {
result <- gsub(pattern[i], "", result, ...)
}
result
}
#Use all lowercase to simplify instances
x[ , 1] <- tolower(mgsub(c(",", ";", ":", "'s", " . "), x[ , 1]))
unique.vocab <- unique(unlist(strsplit(as.character(x[ , 1]), " ", fixed = TRUE)))
#Sub with a longer .csv to be called to reduce IVs
prepositions <- c("a", "in", "of", "our", "the", "is", "for", "with", "we", "this", "it", "but", "was",
"at", "to", "on", "aboard", "aside", "by", "means", "spite", "about", "as", "concerning",
"instead", "above", "at", "considering", "into", "according", "atop", "despite", "view",
"across", "because", "during", "near", "like", "across", "after", "against", "ahead", "along",
"alongside", "amid", "among", "apart", "around", "out", "outside", "over", "owing", "past", "prior",
"before", "behind", "below", "beneath", "beside", "besides", "between", "beyond", "regarding",
"round", "since", "through", "throughout", "till", "down", "except", "from", "addition",
"back", "front", "place", "regard", "inside", "together", "toward", "under", "underneath", "until",
"nearby", "next", "off", "account", "onto", "top", "opposite", "out", "unto", "up", "within", "without", "what")
#Remove prepositions
preps <- unique.vocab %in% c(prepositions)
if(sum(preps) > 0) unique.vocab <- unique.vocab[!preps]
if(!is.null(oos)){
oos.preps <- oos %in% c(prepositions)
if(sum(oos.preps) > 0){
oos <- oos[!oos.preps]
}
}
NNS.TM <- t(sapply(1 : length(x[ , 1]), function(i) as.integer(tryCatch(as.numeric(unique.vocab%in%x[i,1]), error = function (e) 0))))
colnames(NNS.TM) <- c(unique.vocab)
if(!is.null(oos)){
OOS.TM <- t(sapply(1 : length(oos), function(i) as.numeric(unique.vocab%in%oos[i])))
colnames(OOS.TM) <- c(unique.vocab)
return(list("IV" = NNS.TM,
"DV" = as.integer(as.character(x[ , 2])),
"OOS" = OOS.TM))
} else {
return(list("IV" = NNS.TM,
"DV" = as.integer(as.character(x[ , 2]))))
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.