#' @include partition_class.R partitionBundle_class.R ngrams_method.R
NULL
setGeneric("compare", function(x, ...){standardGeneric("compare")})
#' compare features
#'
#' The features of two objects, usually a partition defining a corpus of
#' interest, and a partition defining a reference corpus are compared.
#' The most important purpose is term extraction.
#'
#' @param x a partition or partitionBundle object
#' @param y a partition object, it is assumed that the coi is a subcorpus of
#' ref
#' @param method the statistical test to apply (chisquare or log likelihood)
#' @param included TRUE if coi is part of ref, defaults to FALSE
#' @param verbose logical, defaults to TRUE
#' @param progress logical
#' @param mc logical, whether to use multicore
#' @param ... further parameters
#' @return The function returns a data frame with the following structure:
#' - absolute frequencies in the first row
#' - ...
#' @author Andreas Blaette
#' @aliases compare
#' @docType methods
#' @references Manning / Schuetze ...
#' @exportMethod compare
#' @examples
#' \dontrun{
#' use(polmineR.sampleCorpus)
#' kauder <- partition("PLPRBTTXT", text_name="Volker Kauder", pAttribute="word")
#' all <- partition("PLPRBTTXT", text_date=".*", regex=TRUE, pAttribute="word")
#' terms_kauder <- compare(kauder, all, included=TRUE)
#' top100 <- subset(terms_kauder, rank_chisquare <= 100)
#' }
#' @rdname compare-method
setMethod("compare", signature=c(x="partition"), function(
x, y,
included=FALSE,
method="chisquare",
verbose=FALSE
) {
if (verbose==TRUE) message ('Comparing x and y ...')
newObject <- new(
'comp',
encoding=x@encoding, included=included,
corpus=x@corpus,
sizeCoi=x@size, sizeRef=ifelse(included==FALSE, y@size, y@size-x@size),
stat=data.table()
)
newObject@call <- deparse(match.call())
# check whether count-lists are available for the pAttribute given
if (length(x@pAttribute) == 0 || length(y@pAttribute) == 0) stop("no count performed for x and y")
if (identical(x@pAttribute, y@pAttribute) == FALSE){
stop("mismatch of pAttribute of x and y")
} else {
newObject@pAttribute <- x@pAttribute
pAttribute <- x@pAttribute
}
if (verbose==TRUE) message("... combining frequency lists")
newObject@stat <- merge(x@stat, y@stat, by=pAttribute)
setnames(newObject@stat, c("count.x", "count.y"), c("count_coi", "count_ref"))
if (included == TRUE) newObject@stat[, "count_ref" := newObject@stat[["count_ref"]] - newObject@stat[["count_coi"]] ]
for (how in method){
if (verbose==TRUE) message("... statistical test: ", how)
newObject <- do.call(how, args=list(.Object=newObject))
}
newObject
})
#' @docType methods
#' @rdname compare-method
#' @examples
#' \dontrun{
#' use(polmineR.sampleCorpus)
#' byName <- partitionBundle("PLPRBTTXT", sAttribute="text_name")
#' byName <- enrich(byName, pAttribute="word")
#' all <- partition("PLPRBTTXT", text_date=".*", regex=TRUE, pAttribute="word")
#' result <- compare(byName, all, included=TRUE, progress=TRUE)
#' dtm <- as.DocumentTermMatrix(result, col="chisquare")
#' }
setMethod("compare", signature=c(x="partitionBundle"), function(
x, y,
included=FALSE, method="chisquare", verbose=TRUE, mc=getOption("polmineR.mc"), progress=FALSE
) {
.compare <- function(x, y, included, method, ...) compare(x=x, y=y, included=included, method=method)
retval <- new("compBundle")
retval@objects <- blapply(x@objects, f=.compare, y=y, included=included, method=method, verbose=verbose, mc=mc, progress=progress)
names(retval@objects) <- names(x@objects)
retval
})
#' @rdname compare-method
setMethod("compare", "cooccurrences", function(x, y, included = FALSE, method = "ll", mc = TRUE, verbose = TRUE){
newObject <- new(
'compCooccurrences',
encoding = x@encoding, included = included, corpus = x@corpus, sizeCoi = x@partitionSize,
sizeRef = ifelse(included == FALSE, y@partitionSize, y@partitionSize - x@partitionSize),
stat = data.table()
)
if (identical(x@pAttribute, y@pAttribute) == FALSE) {
warning("BEWARE: cooccurrences objects are not based on the same pAttribute!")
} else {
newObject@pAttribute <- x@pAttribute
}
if (verbose == TRUE) message("... preparing tabs for matching")
keys <- unlist(lapply(c("a", "b"), function(ab) paste(ab, x@pAttribute, sep="_")))
setkeyv(x@stat, keys)
setkeyv(y@stat, keys)
MATCH <- y@stat[x@stat]
# remove columns not needed
colsToDrop <- c(
"ll", "i.ll", "exp_window", "i.exp_window", "rank_ll", "i.rank_ll",
"size_window", "i.size_window", "count_a", "i.count_a", "count_b", "i.count_b",
"exp_partition", "i.exp_partition"
)
for (drop in colsToDrop) MATCH[, eval(drop) := NULL, with=TRUE]
setnames(MATCH, old=c("count_ab", "i.count_ab"), new=c("count_ref", "count_coi"))
if (included == TRUE) MATCH[, "count_ref" := MATCH[["count_ref"]] - MATCH[["count_coi"]] ]
newObject@stat <- MATCH
for (how in method){
if (verbose == TRUE) message("... statistical test: ", how)
newObject <- do.call(how, args = list(.Object = newObject))
}
newObject
})
#' @rdname compare-method
setMethod(
"compare", "ngrams",
function(x, y, included=FALSE, method="chisquare", verbose=TRUE, ...){
stopifnot(
identical(x@pAttribute, y@pAttribute) == TRUE,
x@n == y@n,
all(method %in% c("chisquare", "ll"))
)
tokenColnames <- unlist(lapply(c(1:x@n), function(i) paste("token_", i, "_", x@pAttribute, sep="")))
setkeyv(x@stat, cols=tokenColnames)
setkeyv(y@stat, cols=tokenColnames)
DT <- y@stat[x@stat]
setnames(DT, c("count", "i.count"), c("count_ref", "count_coi"))
setcolorder(DT, c(tokenColnames, "count_coi", "count_ref"))
if (included == TRUE) DT[, "count_ref" := DT[["count_ref"]] - DT[["count_coi"]] ]
newObject <- new(
"compareNgrams",
encoding=x@encoding, included=included, corpus=x@corpus, sizeCoi=x@size,
pAttribute=x@pAttribute, n=x@n,
sizeRef=ifelse(included == FALSE, y@size, y@size-x@size),
stat=DT
)
for (how in method){
if (verbose==TRUE) message("... statistical test: ", how)
newObject <- do.call(how, args=list(.Object=newObject))
}
newObject
})
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.