#' Parts of Speech Tagging
#'
#' \code{pos} - Apply part of speech tagger to transcript(s).
#'
#' @param text.var The text variable
#' @param parallel logical. If TRUE attempts to run the function on multiple
#' cores. Note that this may not mean a speed boost if you have one core or if
#' the data set is smaller as the cluster takes time to create.
#' @param na.omit logical. If TRUE missing values (\code{NA}) will be omitted.
#' @param digits Integer; number of decimal places to round when printing.
#' @param progress.bar logical. If TRUE attempts to provide a OS appropriate
#' progress bar. If parallel is TRUE this argument is ignored. Note that
#' setting this argument to TRUE may slow down the function.
#' @param percent logical. If TRUE output given as percent. If FALSE the
#' output is proportion.
#' @param zero.replace Value to replace 0 values with.
#' @param gc.rate An integer value. This is a necessary argument because of a
#' problem with the garbage collection in the openNLP function that
#' \code{\link[qdap]{pos}} wraps. Consider adjusting this argument upward if
#' the error \code{java.lang.OutOfMemoryError} occurs.
#' @return pos returns a list of 4:
#' \item{text}{The original text}
#' \item{POStagged}{The original words replaced with parts of speech in context.}
#' \item{POSprop}{Dataframe of the proportion of parts of speech by row.}
#' \item{POSfreq}{Dataframe of the frequency of parts of speech by row.}
#' \item{POSrnp}{Dataframe of the frequency and proportions of parts of speech
#' by row.}
#' \item{percent}{The value of percent used for plotting purposes.}
#' \item{zero.replace}{The value of zero.replace used for plotting purposes.}
#' @rdname pos
#' @seealso
#' \code{\link[openNLP]{tagPOS}}
#' @references \href{openNLP}{http:/opennlp.apache.org}
#' @keywords parts-of-speech
#' @export
#' @import openNLP parallel openNLPmodels.en Snowball
#' @examples
#' posdat <- pos(DATA$state)
#' ltruncdf(posdat, 7, 4)
#' ## str(posdat)
#' names(posdat)
#' posdat$text #original text
#' posdat$POStagged #words replaced with parts of speech
#' posdat$POSprop[, 1:8] #proportion of parts of speech by row
#' posdat$POSfreq #frequency of parts of speech by row
#'
#' out1 <- pos(DATA$state, parallel = TRUE) # not always useful
#' ltruncdf(out1, 7, 4)
#'
#' #use pos.tags to interpret part of speech tags used by pos & pos.by
#' pos.tags()[1:10, ]
#' pos.tags("matrix")[1:10, ]
#' pos.tags("dataframe")[1:10, ]
#' pos.tags("df")[1:10, ]
#' ltruncdf(pos.tags("all"), 3)
#'
#' posbydat <- with(DATA, pos.by(state, sex))
#' names(posbydat)
#' ltruncdf(posbydat, 7, 4)
#' truncdf(posbydat$pos.by.prop, 4)
#'
#' POSby <- with(DATA, pos.by(state, list(adult, sex)))
#' plot(POSby, values = TRUE, digits = 2)
#' #or more quickly - reuse the output from before
#' out2 <- with(DATA, pos.by(posbydat, list(adult, sex)))
pos <-
function(text.var, parallel = FALSE, na.omit = FALSE, digits = 1,
progress.bar = TRUE, percent = TRUE, zero.replace=0, gc.rate=10){
ntv <- length(text.var)
pos1 <- function(i) {
x <- tagPOS(strip(i))
return(x)
}
if (parallel){
cl <- makeCluster(mc <- getOption("cl.cores", detectCores()))
clusterExport(cl=cl, varlist=c("text.var", "ntv", "gc.rate",
"pos1"), envir = environment())
m <- parLapply(cl, seq_len(ntv), function(i) {
x <- pos1(text.var[i])
if (i%%gc.rate==0) gc()
return(x)
}
)
stopCluster(cl)
m <- unlist(m)
} else {
if (progress.bar != FALSE){
if (Sys.info()[['sysname']] == "Windows" & progress.bar != "text"){
pb <- winProgressBar(title = "progress bar", min = 0,
max = ntv, width = 300)
m <- lapply(seq_len(ntv), function(i) {
x <- pos1(text.var[i])
if (i%%gc.rate==0) gc()
setWinProgressBar(pb, i, title = paste(round(i/ntv*100, 0),
"% done"))
return(x)
}
)
close(pb)
m <- unlist(m)
} else {
pb <- txtProgressBar(min = 0, max = ntv, style = 3)
m <- lapply(seq_len(ntv), function(i) {
x <- pos1(text.var[i])
if (i%%gc.rate==0) gc()
setTxtProgressBar(pb, i)
return(x)
}
)
close(pb)
m <- unlist(m)
}
} else {
m <- lapply(seq_len(ntv), function(i) {
x <- pos1(text.var[i])
if (i%%gc.rate==0) gc()
return(x)
}
)
m <- unlist(m)
}
}
names(m) <- NULL
poser <- function(x) sub("^.*/([^ ]+).*$","\\1",
unlist(strsplit(x, " ")))
o <- lapply(m, poser)
lev <- sort(unique(unlist(o)))
G4 <- do.call(rbind,lapply(o,function(x,lev){
tabulate(factor(x,levels = lev, ordered = TRUE),
nbins = length(lev))},lev = lev))
colnames(G4) <-sort(lev)
m <- data.frame(POStagged = m)
m$POStags <- o
m$word.count <- word.count(text.var)
cons <- ifelse(percent, 100, 1)
G5 <- sapply(data.frame(G4, check.names = FALSE),
function(x) cons*(x/m$word.count))
colnames(G5) <- paste0("prop", colnames(G5))
G4 <- data.frame(wrd.cnt = m$word.count, G4, check.names = FALSE)
G5 <- data.frame(wrd.cnt = m$word.count, G5, check.names = FALSE)
if (any(is.na(G4$wrd.cnt))) {
nas <- which(is.na(G4$wrd.cnt))
G4[nas, 2:ncol(G4)] <- NA
m[nas, 1:ncol(m)] <- NA
}
rnp <- raw_pro_comb(G4[, -1], G5[, -1], digits = digits,
percent = percent, zero.replace = zero.replace, override = TRUE)
rnp <- data.frame(G4[, 1, drop = FALSE], rnp, check.names = FALSE)
POS <- list(text = text.var, POStagged = m, POSprop = G5, POSfreq = G4,
POSrnp = rnp, percent = percent, zero.replace = zero.replace)
if(na.omit) POS <- lapply(POS, na.omit)
class(POS) <- "pos"
return(POS)
}
#' Parts of Speech by Grouping Variable(s)
#'
#' \code{pos.by} - Apply part of speech tagger to transcript(s) by zero or more
#' grouping variable(s).
#'
#' @rdname pos
#' @param grouping.var The grouping variables. Default NULL generates one word
#' list for all text. Also takes a single grouping variable or a list of 1 or
#' more grouping variables.
#' @param \ldots Other argument supplied to \code{pos}.
#' @return pos.by returns a list of 6:
#' \item{text}{The original text}
#' \item{POStagged}{The original words replaced with parts of speech in context.}
#' \item{POSprop}{Dataframe of the proportion of parts of speech by row.}
#' \item{POSfreq}{Dataframe of the frequency of parts of speech by row.}
#' \item{POSrnp}{Dataframe of the frequency and proportions of parts of speech
#' by row.}
#' \item{pos.by.prop}{Dataframe of the proportion of parts of speech by grouping
#' variable.}
#' \item{pos.by.freq}{Dataframe of the frequency of parts of speech by grouping
#' variable.}
#' \item{pos.by.rnp}{Dataframe of the frequency and proportions of parts of
#' speech by grouping variable.}
#' \item{percent}{The value of percent used for plotting purposes.}
#' \item{zero.replace}{The value of zero.replace used for plotting purposes.}
#' @export
pos.by <-
function(text.var, grouping.var = NULL, digits = 1, percent = TRUE,
zero.replace = 0, ...){
if(is.null(grouping.var)) {
G <- "all"
} else {
if (is.list(grouping.var)) {
m <- unlist(as.character(substitute(grouping.var))[-1])
m <- sapply(strsplit(m, "$", fixed=TRUE), function(x) {
x[length(x)]
}
)
G <- paste(m, collapse="&")
} else {
G <- as.character(substitute(grouping.var))
G <- G[length(G)]
}
}
check <- FALSE
if (class(text.var) %in% c("pos", "pos.by", "formality")) {
pos.list <- text.var
text.var <- text.var[["POSfreq"]]
check <- TRUE
} else {
pos.list <- pos(text.var, digits = digits, percent = percent, ...)
text.var <- pos.list[["POSfreq"]]
}
if(is.null(grouping.var)){
if (check) {
grouping <- rep("all", nrow(text.var))
} else {
grouping <- rep("all", length(text.var))
}
} else {
if (is.list(grouping.var) & length(grouping.var)>1) {
grouping <- paste2(grouping.var)
} else {
grouping <- unlist(grouping.var)
}
}
DF1 <- data.frame(grouping, text.var, check.names = FALSE)
L1 <- split(DF1, DF1$grouping)
L2 <- lapply(L1, function(x) colSums(x[, -1], na.rm = TRUE))
DF2 <- data.frame(do.call("rbind", L2), check.names = FALSE)
DF2 <- data.frame(replace = rownames(DF2), DF2, check.names = FALSE)
rownames(DF2) <- 1:nrow(DF2)
colnames(DF2)[1] <- G
o <- unclass(pos.list)
o[["pos.by.freq"]] <- DF2
cons <- ifelse(percent, 100, 1)
propby <- lapply(1:nrow(DF2), function(i) {
cons*(DF2[i, -c(1:2)]/rowSums(DF2[, -c(1:2)])[i])
})
propby <- as.matrix(do.call(rbind, propby))
propby[is.nan(propby)] <- 0
o[["pos.by.prop"]] <- suppressWarnings(data.frame(DF2[, 1:2], propby,
check.names = FALSE))
rnp2 <- raw_pro_comb(o[["pos.by.freq"]][, -c(1:2)],
o[["pos.by.prop"]][, -c(1:2)], digits = digits,
percent = percent, zero.replace = zero.replace, override = TRUE)
o[["pos.by.rnp"]] <- data.frame(o[["pos.by.freq"]][, 1:2],
rnp2, check.names = FALSE)
class(o) <- "pos.by"
return(o)
}
#' Parts of Tags
#'
#' \code{pos.tags} - Useful for interpreting the parts of speech tags created by
#' pos and pos.by.
#'
#' @rdname pos
#' @param type An optional character string giving the output of the pos tags.
#' This must be one of the strings \code{"pretty"} (a left justified version of
#' the output optimized for viewing but not good for export), \code{"matrix"}
#' (a matrix version of the output), \code{"dataframe"}\\ \code{"df"} (a
#' dataframe version of the output), \code{"all"} (a list of all three of the
#' previous output types).
#' @export
pos.tags <-
function(type = "pretty"){
POStags.df <- structure(list(Tag = structure(c(1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 15L,
13L, 14L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L,
24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L,
35L, 36L), .Label = c("CC", "CD", "DT", "EX", "FW",
"IN", "JJ", "JJR", "JJS", "LS", "MD", "NN", "NNP",
"NNPS", "NNS", "PDT", "POS", "PRP", "PRP$", "RB",
"RBR", "RBS", "RP", "SYM", "TO", "UH", "VB", "VBD",
"VBG", "VBN", "VBP", "VBZ", "WDT", "WP", "WP$", "WRB"),
class = "factor"), Description = structure(c(8L, 7L,
9L, 10L, 11L, 23L, 1L, 2L, 3L, 13L, 14L, 16L, 15L,
25L, 24L, 22L, 19L, 18L, 20L, 4L, 5L, 6L, 17L, 26L,
27L, 12L, 29L, 33L, 30L, 32L, 31L, 28L, 35L, 36L,
21L, 34L), .Label = c("Adjective",
"Adjective, comparative", "Adjective, superlative",
"Adverb", "Adverb, comparative", "Adverb, superlative",
"Cardinal number", "Coordinating conjunction",
"Determiner", "Existential there", "Foreign word",
"Interjection", "List item marker", "Modal",
"Noun, plural", "Noun, singular or mass", "Particle",
"Personal pronoun", "Possessive ending",
"Possessive pronoun", "Possessive wh-pronoun",
"Predeterminer",
"Preposition or subordinating conjunction",
"Proper noun, plural", "Proper noun, singular",
"Symbol", "to", "Verb, 3rd person singular present",
"Verb, base form", "Verb, gerund or present participle",
"Verb, non-3rd person singular present",
"Verb, past participle", "Verb, past tense", "Wh-adverb",
"Wh-determiner", "Wh-pronoun"), class = "factor")),
.Names = c("Tag", "Description"), row.names = c(NA, -36L),
class = "data.frame", comment =
"http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html")
POStags.matrix <- as.matrix(POStags.df)
POStags <- left.just(POStags.df, 1:2)
x <- switch(type,
pretty = POStags,
matrix = POStags.matrix,
df = POStags.df,
dataframe = POStags.df,
all = {list(POStags.df=POStags.df,
POStags.matrix=POStags.matrix, POStags=POStags)},
stop("incorrect type specified")
)
return(x)
}
#' Prints a pos Object.
#'
#' Prints a pos object.
#'
#' @param x The pos object
#' @param digits Integer values specifying the number of digits to be
#' printed.
#' @param percent logical. If TRUE output given as percent. If FALSE the
#' output is proportion. If NULL uses the value from
#' \code{\link[qdap]{termco}}. Only used if \code{label} is TRUE.
#' @param zero.replace Value to replace 0 values with. If NULL uses the value
#' from \code{\link[qdap]{termco}}. Only used if \code{label} is TRUE.
#' @param \ldots ignored
#' @method print pos
#' @S3method print pos
print.pos <-
function(x, digits = 1, percent = NULL, zero.replace = NULL, ...) {
WD <- options()[["width"]]
options(width=3000)
if (!is.null(percent)) {
if (percent != x$percent) {
DF <- as.matrix(x$POSprop[, -c(1:2)])
if (percent) {
DF <- DF*100
} else {
DF <- DF/100
}
x$POSprop <- data.frame(x$POSprop[, 1:2], DF, check.names = FALSE)
}
} else {
percent <- x$percent
}
if (is.null(zero.replace)) {
zero.replace <- x$zero.replace
}
rnp <- raw_pro_comb(x$POSfreq[, -1, drop = FALSE],
x$POSprop[, -1, drop = FALSE], digits = digits, percent = percent,
zero.replace = zero.replace)
rnp <- data.frame(x$POSfreq[, 1, drop = FALSE], rnp, check.names = FALSE)
print(rnp)
options(width=WD)
}
#' Prints a pos.by Object.
#'
#' Prints a pos.by object.
#'
#' @param x The pos.by object
#' @param digits Integer values specifying the number of digits to be
#' printed.
#' @param percent logical. If TRUE output given as percent. If FALSE the
#' output is proportion. If NULL uses the value from
#' \code{\link[qdap]{termco}}. Only used if \code{label} is TRUE.
#' @param zero.replace Value to replace 0 values with. If NULL uses the value
#' from \code{\link[qdap]{termco}}. Only used if \code{label} is TRUE.
#' @param \ldots ignored
#' @method print pos.by
#' @S3method print pos.by
print.pos.by <-
function(x, digits = 1, percent = NULL, zero.replace = NULL, ...) {
WD <- options()[["width"]]
options(width=3000)
if (!is.null(percent)) {
if (percent != x$percent) {
DF <- as.matrix(x$Pos.by.prop[, -c(1:2)])
if (percent) {
DF <- DF*100
} else {
DF <- DF/100
}
x$pos.by.prop <- data.frame(x$pos.by.prop[, 1:2], DF,
check.names = FALSE)
}
} else {
percent <- x$percent
}
if (is.null(zero.replace)) {
zero.replace <- x$zero.replace
}
rnp <- raw_pro_comb(x$pos.by.freq[, -c(1:2), drop = FALSE],
x$pos.by.prop[, -c(1:2), drop = FALSE], digits = digits,
percent = percent, zero.replace = zero.replace)
rnp <- data.frame(x$pos.by.freq[, 1:2, drop = FALSE], rnp,
check.names = FALSE)
print(rnp)
options(width=WD)
}
#' Plots a pos.by Object
#'
#' Plots a pos.by object.
#'
#' @param x The pos.by object
#' @param label logical. If TRUE the cells of the heat map plot will be labeled
#' with count and proportional values.
#' @param lab.digits Integer values specifying the number of digits to be
#' printed if \code{label} is TRUE.
#' @param percent logical. If TRUE output given as percent. If FALSE the
#' output is proportion. If NULL uses the value from
#' \code{\link[qdap]{question_type}}. Only used if \code{label} is TRUE.
#' @param zero.replace Value to replace 0 values with. If NULL uses the value
#' from \code{\link[qdap]{question_type}}. Only used if \code{label} is TRUE.
#' @param \ldots Other arguments passed to qheat.
#' @method plot pos.by
#' @S3method plot pos.by
plot.pos.by <- function(x, label = FALSE, lab.digits = 1, percent = NULL,
zero.replace = NULL, ...) {
if (label) {
if (!is.null(percent)) {
if (percent != x$percent) {
DF <- as.matrix(x$pos.by.prop[, -c(1:2)])
if (percent) {
DF <- DF*100
} else {
DF <- DF/100
}
x$pos.by.prop <- data.frame(x$pos.by.prop[, 1:2], DF,
check.names = FALSE)
}
} else {
percent <- x$percent
}
if (is.null(zero.replace)) {
zero.replace <- x$zero.replace
}
rnp <- raw_pro_comb(x$pos.by.freq[, -c(1:2)], x$pos.by.prop[, -c(1:2)],
digits = lab.digits, percent = percent, , override = TRUE,
zero.replace = x$zero.replace)
rnp <- data.frame(x$pos.by.freq[, 1:2], rnp, check.names = FALSE)
qheat(x$pos.by.prop, values=TRUE, mat2 = rnp, ...)
} else {
qheat(x$pos.by.prop, ...)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.