Nothing
## ceeboo 2008
## <NOTE>
## Currently the approach to counting of word sequences is a bad
## workaround. As R has a character cache this could be implemented
## using pointer comparisons and prefix trees, but the latter would be
## inefficient compared to using hash tables. However, the R API does
## not provide the necessary interfaces to its hash table code.
## </NOTE>
textcnt <-
function(x, n = 3L, split = "[[:space:][:punct:][:digit:]]+",
tolower = TRUE, marker = "_", words = NULL, lower = 0L,
method = c("ngram", "string", "prefix", "suffix"),
recursive = FALSE, persistent = FALSE, useBytes = FALSE,
perl = TRUE, verbose = FALSE, decreasing = FALSE)
{
if (is.list(x)) {
if (recursive) {
if (persistent) {
for (z in x[-length(x)])
textcnt(z, n, split, tolower, marker, words, lower,
method, recursive, persistent, perl, useBytes,
verbose)
x <- x[length(x)]
persistent <- FALSE
} else
return(lapply(x, textcnt, n, split, tolower, marker, words,
lower, method, recursive, persistent, useBytes,
perl, verbose))
}
} else {
if (is.null(x)) return (x)
x <- list(x)
}
n <- as.integer(n)
if (n < 0L)
stop("'n' invalid value")
## shortcut
if (n < 1L)
return(NULL)
method <- match.arg(method)
if (!is.null(split))
x <-
if (!is.null(formals(strsplit)$useBytes))
lapply(lapply(x, strsplit, split, perl = perl, useBytes = useBytes), unlist)
else
lapply(lapply(x, strsplit, split, perl = perl), unlist)
if (!useBytes &&
tolower)
x <- lapply(x, tolower)
if (!is.null(words))
x <- .Call(tau_copyTruncate, x, words)
if (method == "ngram") {
if (length(grep(marker, unlist(x, use.names = FALSE),
useBytes = TRUE)))
stop("'marker' contained in 'x'")
## <NOTE>
## As the C code generates all n-grams up to the specified
## length we have to remove padding counts later. [2010/2]
## </NOTE>
if (marker == "\2") {
## pad
m <- paste(rep.int(marker, n - 1L), collapse = "")
x <- lapply(x, function(x) gsub("$(?<!^)", m, x, perl = TRUE,
useBytes = useBytes))
}
## add marker at both ends
x <- lapply(x, function(x) gsub("^(?!$)|$(?<!^)", marker, x,
perl = TRUE, useBytes = useBytes))
x <- .Call(tau_utf8CountNgram, x, n, lower, verbose, persistent,
useBytes)
if (length(x) && marker == "\2") {
## Determine suffixes
i <- grep("^\2{2,}$", names(x), useBytes = useBytes)
## Remove
x <- x[-i]
## Reduce to prefix count
x["\2"] <- x["\2"] / (n + 1L)
## Replace with default marker
names(x) <- gsub("\2",formals(textcnt)$marker, names(x),
useBytes = useBytes)
}
## <NOTE>
## Because of its design the C code can only adjust the prefix
## counts. Here we handle the suffix counts. Obviously, a suffix
## without the marker is a prefix to that suffix and thus the
## former contains the counts of the latter. Note that this
## includes the special case of a string that is both, a prefix
## and a suffix. [2009/9]
## </NOTE>
if (length(x) && marker == "\1") {
## Determine suffixes.
i <- grep("\1$", names(x), useBytes = useBytes)
## Match suffixes, including the marker.
m <- gsub("\1$(?<!^\1)", "", names(x)[i], perl = TRUE,
useBytes = useBytes)
m <- match(m, names(x), nomatch = 0L)
i <- i[m > 0]
m <- m[m > 0]
x[m] <- x[m] - x[i] ## reduce counts
x <- x[x > 0] ## remove
## Replace with the default marker.
names(x) <- gsub("^\1|\1$", formals(textcnt)$marker, names(x),
useBytes = useBytes)
}
} else {
## preprocess
if (method == "string" && n > 1L)
x <- lapply(x, function(x) {
x <- .Call(tau_removeBlank, x)
x <- unlist(lapply(.Call(tau_copyToNgram, x, n), paste,
collapse = " "))
if (is.null(x))
x <- ""
x
})
x <- .Call(tau_utf8CountString, x, n, lower,
match(method, c("string", "prefix", "suffix")) - 1L,
verbose, persistent, useBytes)
}
if (!is.null(x)) {
## Note ties are not reordered (see help sort) which
## means they are in prefix-tree order.
if (decreasing)
sort(x, decreasing = TRUE) -> x
attr(x, "useBytes") <- useBytes
class(x) <- "textcnt"
}
x
}
## util
format.textcnt <-
function(x, ...)
data.frame(
frq = c(x),
rank = rank(c(x)),
bytes = nchar(names(x), type = "bytes", allowNA = TRUE),
Encoding = Encoding(names(x)),
row.names = names(x),
stringsAsFactors = FALSE
)
###
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.