pos | R Documentation |
pos
- Apply part of speech tagger to transcript(s).
pos_by
- Apply part of speech tagger to transcript(s) by zero or more
grouping variable(s).
pos_tags
- Useful for interpreting the parts of speech tags created by
pos and pos_by.
pos(
text.var,
parallel = FALSE,
cores = detectCores()/2,
progress.bar = TRUE,
na.omit = FALSE,
digits = 1,
percent = TRUE,
zero.replace = 0,
gc.rate = 10
)
pos_by(
text.var,
grouping.var = NULL,
digits = 1,
percent = TRUE,
zero.replace = 0,
...
)
pos_tags(type = "pretty")
text.var |
The text variable. |
parallel |
logical. If TRUE, attempts to run the function on multiple cores. |
cores |
The number of cores to use if parallel = TRUE. Default is half the number of available cores. |
progress.bar |
logical. If TRUE, attempts to display a progress bar. |
na.omit |
logical. If TRUE, missing values (NA) will be omitted. |
digits |
Integer; number of decimal places to round when printing. |
percent |
logical. If TRUE, output is given as a percent; if FALSE, the output is a proportion. |
zero.replace |
Value to replace 0 values with. |
gc.rate |
An integer value. This is a necessary argument because of a
problem with the garbage collection in the openNLP function that
pos wraps. Consider adjusting this argument upward if a Java out-of-memory error occurs. |
grouping.var |
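The grouping variables. Default NULL uses no grouping (all text is treated as one group). Also takes a single grouping variable or a list of 1 or more grouping variables. |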
type |
An optional character string giving the output of the pos tags.
This must be one of the strings "pretty", "matrix", "dataframe"/"df", or "all". |
... |
Other arguments supplied to pos. |
pos
- returns a list of 7:
text |
The original text |
POStagged |
The original words replaced with parts of speech in context. |
POSprop |
Dataframe of the proportion of parts of speech by row. |
POSfreq |
Dataframe of the frequency of parts of speech by row. |
POSrnp |
Dataframe of the frequency and proportions of parts of speech by row. |
percent |
The value of percent used for plotting purposes. |
zero.replace |
The value of zero.replace used for plotting purposes. |
pos_by
- returns a list of 10:
text |
The original text |
POStagged |
The original words replaced with parts of speech in context. |
POSprop |
Dataframe of the proportion of parts of speech by row. |
POSfreq |
Dataframe of the frequency of parts of speech by row. |
POSrnp |
Dataframe of the frequency and proportions of parts of speech by row. |
pos.by.prop |
Dataframe of the proportion of parts of speech by grouping variable. |
pos.by.freq |
Dataframe of the frequency of parts of speech by grouping variable. |
pos.by.rnp |
Dataframe of the frequency and proportions of parts of speech by grouping variable. |
percent |
The value of percent used for plotting purposes. |
zero.replace |
The value of zero.replace used for plotting purposes. |
Note that contractions are treated as two words; for example the word
count on "what's" is 2 for "what + is". This is not consistent
with the word_count
treatment of contractions but makes
sense in a part of speech framework where a phrase such as "She's cool" is
treated as a pronoun, verb and adjective respectively for "She + is + cool".
http://opennlp.apache.org
Maxent_POS_Tag_Annotator
,
colcomb2class
## Not run:
# Tag the parts of speech for each row of DATA$state.
posdat <- pos(DATA$state)
ltruncdf(posdat, 7, 4)
## str(posdat)
names(posdat)
posdat$text # original text
## Methods: accessors for the individual pieces of the pos() output
preprocessed(posdat) # words replaced with parts of speech
counts(posdat) # frequency of parts of speech by row
proportions(posdat) # proportion of parts of speech by row
## Methods Plotting: each accessor result, and the full object, can be plotted
plot(preprocessed(posdat))
plot(counts(posdat))
plot(proportions(posdat))
plot(posdat)
# Same tagging, requested on multiple cores.
out1 <- pos(DATA$state, parallel = TRUE) # not always useful
ltruncdf(out1, 7, 4)
# Use pos_tags to interpret the part of speech tags used by pos & pos_by;
# the argument selects the output type ("pretty" is the default).
pos_tags()[1:10, ]
pos_tags("matrix")[1:10, ]
pos_tags("dataframe")[1:10, ]
pos_tags("df")[1:10, ]
ltruncdf(pos_tags("all"), 3)
# Tag parts of speech for DATA$state, grouped by DATA$sex.
posbydat <- with(DATA, pos_by(state, sex))
names(posbydat)
## Methods: accessors for the pieces of the pos_by() output
scores(posbydat)
preprocessed(posbydat)
counts(posbydat)
proportions(posbydat)
## Methods Plotting
plot(preprocessed(posbydat))
plot(counts(posbydat))
plot(proportions(posbydat))
plot(posbydat)
ltruncdf(posbydat, 7, 4)
truncdf(posbydat$pos.by.prop, 4)
# Group by more than one variable by passing a list of grouping variables.
POSby <- with(DATA, pos_by(state, list(adult, sex)))
plot(POSby, values = TRUE, digits = 2)
# Or, more quickly, reuse the output from before: the existing pos_by result
# is passed in place of the raw text.
out2 <- with(DATA, pos_by(posbydat, list(adult, sex)))
## Definite/Indefinite Noun
## 2 approaches compared...
## The latter is more efficient but less accurate
## -----------------------##
## Part of speech tagging ##
## -----------------------##
# For every row of tagged text, find the first word carrying one of the tags
# in `pos` that comes after each occurrence of one of `words`.
#
# Arguments:
#   text.var  An object whose text.var[["POStagged"]][["POStagged"]] element
#             holds one string per row of whitespace-separated "word/TAG"
#             tokens.
#   words     Character vector of words to look for (exact match).
#   pos       Character vector of part of speech tags to accept after a match.
#
# Returns a list with one element per row:
#   * NA_character_ if none of `words` occurs in the row;
#   * otherwise a character vector of the distinct matched following words,
#     with NA standing in for an occurrence that has no qualifying word
#     after it.
pos_after <- function(text.var, words, pos) {
  tagged_rows <- strsplit(
    as.character(text.var[["POStagged"]][["POStagged"]]), "\\s+"
  )

  # Turn each row into a character vector of words named by their tags.
  tagged <- lapply(tagged_rows, function(tokens) {
    # Leading whitespace yields an empty first token; drop it.
    tokens <- tokens[nzchar(tokens)]
    # Split on the LAST slash only, so a word that itself contains "/"
    # (e.g. "and/or/CC") stays paired with its own tag instead of shifting
    # every later word/tag pair in the row.
    tags <- sub("^.*/", "", tokens)
    tags[!grepl("/", tokens, fixed = TRUE)] <- NA_character_
    setNames(sub("/[^/]*$", "", tokens), tags)
  })

  lapply(tagged, function(sent) {
    word_locs <- which(sent %in% words)
    if (length(word_locs) == 0L) {
      return(NA_character_)
    }
    # which() returns ascending positions, so the first element past `loc`
    # is the nearest following match.
    tag_locs <- which(names(sent) %in% pos)
    next_locs <- vapply(word_locs, function(loc) {
      following <- tag_locs[tag_locs > loc]
      # Explicit NA rather than min(integer(0)), which warns and returns Inf.
      if (length(following) == 0L) NA_integer_ else following[1L]
    }, integer(1))
    unname(sent[unique(next_locs)])
  })
}
# Part of speech approach: tabulate the nouns that follow the indefinite
# articles ("a"/"an") and the definite article ("the"), then plot them in
# side-by-side facets.
# ggplot2 is attached here because the plotting code below calls ggplot(),
# aes(), geom_point(), facet_grid() and ggtitle() unqualified, and the
# examples only attach ggplot2 further down.
library(ggplot2)

# One frequency table per article group; only nouns seen more than 3 times
# are kept.
# NOTE(review): rajPOS is presumably a pre-tagged pos() result shipped with
# qdap -- confirm.
out2 <- setNames(lapply(list(a = c("a", "an"), the = "the"), function(x) {
  o <- pos_after(rajPOS, x, c("NN", "NNS", "NNP", "NNPS"))
  m <- stats::setNames(
    data.frame(sort(table(unlist(o))), stringsAsFactors = FALSE),
    c("word", "freq")
  )
  m[m$freq > 3, ]
}), c("a", "the"))

# Full outer join of the two tables on the noun, one frequency column per
# article.
dat2 <- setNames(
  Reduce(function(x, y) {
    merge(x, y, by = "word", all = TRUE)
  }, out2),
  c("Word", "A", "THE")
)

# Long format: one row per (Word, Article) pair.
dat2 <- reshape2::melt(
  dat2, id = "Word", variable.name = "Article", value.name = "freq"
)
dat2 <- dat2[order(dat2$freq, dat2$Word), ]

# Order the factor levels by total frequency across both articles so the
# y axis is sorted.
ord2 <- aggregate(freq ~ Word, dat2, sum)
dat2$Word <- factor(dat2$Word, levels = ord2[order(ord2[[2]]), 1])
rownames(dat2) <- NULL

ggplot(dat2, aes(x = freq, y = Word)) +
  geom_point() +
  facet_grid(~Article) +
  ggtitle("Part Of Speech Parsing Approach")

# Open a new graphics device so the next plot does not replace this one.
dev.new()
## --------------------##
## Regular Expressions ##
## --------------------##
library(qdapRegex)
library(ggplot2)
library(reshape2)

# Regex approach: extract what follows "a"/"an" and "the" directly from the
# lower-cased dialogue, with no part of speech tagging involved.
# NOTE(review): "@after_a" and "@after_the" are presumably names of patterns
# in the qdapRegex regex dictionary -- confirm.
# stri_trans_tolower is called with `::`; `:::` is only needed for
# unexported objects.
out <- setNames(lapply(c("@after_a", "@after_the"), function(x) {
  o <- rm_default(
    stringi::stri_trans_tolower(raj$dialogue),
    pattern = x, extract = TRUE
  )
  m <- stats::setNames(
    data.frame(sort(table(unlist(o))), stringsAsFactors = FALSE),
    c("word", "freq")
  )
  # Keep only words seen more than 3 times.
  m[m$freq > 3, ]
}), c("a", "the"))

# Full outer join of the two tables on the word, one frequency column per
# article.
dat <- setNames(
  Reduce(function(x, y) {
    merge(x, y, by = "word", all = TRUE)
  }, out),
  c("Word", "A", "THE")
)

# Long format: one row per (Word, Article) pair.
dat <- reshape2::melt(
  dat, id = "Word", variable.name = "Article", value.name = "freq"
)
dat <- dat[order(dat$freq, dat$Word), ]

# Order the factor levels by total frequency across both articles so the
# y axis is sorted.
ord <- aggregate(freq ~ Word, dat, sum)
dat$Word <- factor(dat$Word, levels = ord[order(ord[[2]]), 1])
rownames(dat) <- NULL

ggplot(dat, aes(x = freq, y = Word)) +
  geom_point() +
  facet_grid(~Article) +
  ggtitle("Regex Approach")
## End(Not run)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.