Description Usage Arguments Value Note References See Also Examples
pos
- Apply part of speech tagger to transcript(s).
pos_by
- Apply part of speech tagger to transcript(s) by zero or more
grouping variable(s).
pos_tags
- Useful for interpreting the parts of speech tags created by
pos and pos_by.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 |
text.var |
The text variable. |
parallel |
logical. If |
cores |
The number of cores to use if |
progress.bar |
logical. If |
na.omit |
logical. If |
digits |
Integer; number of decimal places to round when printing. |
percent |
logical. If |
zero.replace |
Value to replace 0 values with. |
gc.rate |
An integer value. This is a necessary argument because of a
problem with the garbage collection in the openNLP function that
|
grouping.var |
The grouping variables. Default |
type |
An optional character string giving the output of the pos tags.
This must be one of the strings |
... |
Other arguments supplied to |
pos
- returns a list of 7:
text |
The original text |
POStagged |
The original words replaced with parts of speech in context. |
POSprop |
Dataframe of the proportion of parts of speech by row. |
POSfreq |
Dataframe of the frequency of parts of speech by row. |
POSrnp |
Dataframe of the frequency and proportions of parts of speech by row. |
percent |
The value of percent used for plotting purposes. |
zero.replace |
The value of zero.replace used for plotting purposes. |
pos_by
- returns a list of 10:
text |
The original text |
POStagged |
The original words replaced with parts of speech in context. |
POSprop |
Dataframe of the proportion of parts of speech by row. |
POSfreq |
Dataframe of the frequency of parts of speech by row. |
POSrnp |
Dataframe of the frequency and proportions of parts of speech by row. |
pos.by.prop |
Dataframe of the proportion of parts of speech by grouping variable. |
pos.by.freq |
Dataframe of the frequency of parts of speech by grouping variable. |
pos.by.rnp |
Dataframe of the frequency and proportions of parts of speech by grouping variable. |
percent |
The value of percent used for plotting purposes. |
zero.replace |
The value of zero.replace used for plotting purposes. |
Note that contractions are treated as two words; for example the word
count on "what's" is 2 for "what + is". This is not consistent
with the word_count
treatment of contractions but makes
sense in a part of speech framework where a phrase such as "She's cool" is
treated as a pronoun, verb and adjective respectively for "She + is + cool".
http://opennlp.apache.org
Maxent_POS_Tag_Annotator
,
colcomb2class
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 | ## Not run:
## Apply the part of speech tagger to the `state` column of DATA
posdat <- pos(DATA$state)
ltruncdf(posdat, 7, 4)
## str(posdat)
names(posdat)
posdat$text #original text
## Methods: accessor functions for the components of the pos() output
preprocessed(posdat) #words replaced with parts of speech
counts(posdat) #frequency of parts of speech by row
proportions(posdat) #proportion of parts of speech by row
## Methods Plotting: each accessor result, and the full object, can be plotted
plot(preprocessed(posdat))
plot(counts(posdat))
plot(proportions(posdat))
plot(posdat)
## Same tagging as above, but with the `parallel` argument switched on
out1 <- pos(DATA$state, parallel = TRUE) # not always useful
ltruncdf(out1, 7, 4)
#use pos_tags to interpret part of speech tags used by pos & pos_by
## The `type` argument selects the output format of the tag table
pos_tags()[1:10, ]
pos_tags("matrix")[1:10, ]
pos_tags("dataframe")[1:10, ]
pos_tags("df")[1:10, ]
ltruncdf(pos_tags("all"), 3)
## Tag `state` and summarise the parts of speech by the grouping variable `sex`
posbydat <- with(DATA, pos_by(state, sex))
names(posbydat)
## Methods
scores(posbydat)
preprocessed(posbydat)
counts(posbydat)
proportions(posbydat)
## Methods Plotting
plot(preprocessed(posbydat))
plot(counts(posbydat))
plot(proportions(posbydat))
plot(posbydat)
ltruncdf(posbydat, 7, 4)
truncdf(posbydat$pos.by.prop, 4)
## More than one grouping variable is supplied as a list
POSby <- with(DATA, pos_by(state, list(adult, sex)))
plot(POSby, values = TRUE, digits = 2)
#or more quickly - reuse the output from before
## (the earlier pos_by result is passed in place of the raw text variable)
out2 <- with(DATA, pos_by(posbydat, list(adult, sex)))
## Definite/Indefinite Noun
## 2 approaches compared...
## The latter is more efficient but less accurate
## ------------------------##
## Part of speech tagging  ##
## ------------------------##
# For every row of tagged text, find the first word carrying one of the tags
# in `pos` that occurs after each occurrence of a word in `words`.
#
# Arguments:
#   text.var  Object whose [["POStagged"]][["POStagged"]] element holds one
#             string per row made of whitespace-separated "word/TAG" tokens.
#   words     Character vector of target words (matched exactly via %in%).
#   pos       Character vector of part of speech tags to search for.
#
# Value:
#   A list with one character vector per row. Each vector holds the words
#   found at the unique matching positions (NA for a target word that has no
#   later match). A row containing none of the target words yields a single
#   NA_character_.
pos_after <- function(text.var, words, pos) {
  tagged <- as.character(text.var[["POStagged"]][["POStagged"]])
  tokens <- strsplit(tagged, "\\s+")

  # Turn each row into a character vector of words named by their tags.
  # NOTE(review): assumes every token contains exactly one "/" -- a word that
  # itself contains "/" would misalign the word/tag pairing; confirm upstream.
  named_words <- lapply(tokens, function(tok) {
    parts <- unlist(strsplit(tok, "/"))
    setNames(parts[c(TRUE, FALSE)], parts[c(FALSE, TRUE)])
  })

  lapply(named_words, function(x, thewords = words, thepos = pos) {
    locs <- which(x %in% thewords)
    if (length(locs) == 0L) {
      return(NA_character_)
    }
    nounlocs <- which(names(x) %in% thepos)
    next_locs <- vapply(locs, function(loc) {
      later <- nounlocs[nounlocs > loc]
      # No later match: return NA explicitly instead of calling min() on an
      # empty vector, which warns and yields Inf.
      if (length(later) == 0L) NA_integer_ else min(later)
    }, integer(1))
    # De-duplicate positions (not words) so one following word shared by
    # several target words is reported once.
    unname(x[unique(next_locs)])
  })
}
# For the indefinite ("a"/"an") and definite ("the") articles, tabulate the
# nouns that follow them in rajPOS and keep those seen more than 3 times.
out2 <- setNames(lapply(list(a=c("a", "an"), the="the"), function(x) {
    o <- pos_after(rajPOS, x, c("NN", "NNS", "NNP", "NNPS"))
    m <- stats::setNames(data.frame(sort(table(unlist(o))),
        stringsAsFactors = FALSE), c("word", "freq"))
    m[m$freq> 3, ]
}), c("a", "the"))
# Join the two frequency tables on the noun, one column per article.
dat2 <- setNames(Reduce(function(x, y) {
    merge(x, y, by = "word", all = TRUE)}, out2), c("Word", "A", "THE"))
# Reshape to long format: one row per (Word, Article) pair.
dat2 <- reshape2::melt(dat2, id="Word", variable.name="Article", value.name="freq")
dat2 <- dat2[order(dat2$freq, dat2$Word), ]
# Order the Word factor levels by total frequency across both articles.
ord2 <- aggregate(freq ~ Word, dat2, sum)
dat2$Word <- factor(dat2$Word, levels=ord2[order(ord2[[2]]), 1])
rownames(dat2) <- NULL
# ggplot2 functions are namespace-qualified because library(ggplot2) is not
# called until the regular-expression section further down.
ggplot2::ggplot(dat2, ggplot2::aes(x=freq, y=Word)) +
    ggplot2::geom_point() + ggplot2::facet_grid(~Article) +
    ggplot2::ggtitle("Part Of Speech Parsing Approach")
# Open a new graphics device so the next plot does not replace this one.
dev.new()
## --------------------##
## Regular Expressions ##
## --------------------##
library(qdapRegex)
library(ggplot2)
library(reshape2)
# For each pattern, extract the matches from the lower-cased raj dialogue,
# tabulate them and keep those seen more than 3 times.
# stri_trans_tolower is an exported stringi function, so `::` is sufficient.
out <- setNames(lapply(c("@after_a", "@after_the"), function(x) {
    o <- rm_default(stringi::stri_trans_tolower(raj$dialogue),
        pattern = x, extract=TRUE)
    m <- stats::setNames(data.frame(sort(table(unlist(o))),
        stringsAsFactors = FALSE), c("word", "freq"))
    m[m$freq> 3, ]
}), c("a", "the"))
# Join the two frequency tables on the word, one column per article.
dat <- setNames(Reduce(function(x, y) {
    merge(x, y, by = "word", all = TRUE)}, out), c("Word", "A", "THE"))
# Reshape to long format: one row per (Word, Article) pair.
dat <- reshape2::melt(dat, id="Word", variable.name="Article", value.name="freq")
dat <- dat[order(dat$freq, dat$Word), ]
# Order the Word factor levels by total frequency across both articles.
ord <- aggregate(freq ~ Word, dat, sum)
dat$Word <- factor(dat$Word, levels=ord[order(ord[[2]]), 1])
rownames(dat) <- NULL
ggplot(dat, aes(x=freq, y=Word)) +
    geom_point()+ facet_grid(~Article) +
    ggtitle("Regex Approach")
## End(Not run)
|
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.