Nothing
##
## Support code for manipulating the stuff handed back from the C++ program
##
#' Make matrix of where phrases appear in corpus.
#'
#' Construct a $n X p$ matrix of appearances for selected phrases out of textreg object.
#' $n$ is the number of documents, $p$ is the number of phrases selected in the result object `rules.'
#'
#' @export
#' @param rules Either a textreg.result object or the rules list from such an object.
#' @param n (Optional) If giving a rules list, the number of documents in corpus.
phrase.matrix = function( rules, n ) {
if ( is.textreg.result( rules ) ) {
n = rules$notes$n
rules = rules$rules
}
mat = matrix( 0, nrow=n, ncol=length(rules) )
colnames(mat) = sapply( rules, function(x) { x$ngram } )
for ( i in 1:length(rules) ) {
rl = rules[[i]]$support + 1
#rl = subset( rl, rl < 0 )
mat[ rl, i ] = rules[[i]]$weight
}
mat[ , "*intercept*"] = 1
mat
}
#' Predict labeling with the selected phrases.
#'
#' Given raw text and a textreg model, predict the labeling by counting appearance of
#' relevant phrases in text and then multiplying these counts by the beta vector
#' associated with the textreg object. Just like linear regression.
#'
#' @export
#' @param object A textreg.result object
#' @param new.text If you want to predict for new text, pass it along.
#' @param return.matrix TRUE means hand back the phrase appearance pattern matrix.
#' @param ... Nothing can be passed extra.
#' @return Vector of predictions (numbers).
#'
#' @examples
#' res = textreg( c( "", "", "A", "A" ), c( -1, -1, 1, 1 ),
#' C=1, Lq=1, convergence.threshold=0.00000001, verbosity=0 )
#' predict( res )
#' predict( res, new.text=c("A B C A") )
predict.textreg.result = function( object, new.text= NULL, return.matrix=FALSE, ... ) {
stopifnot( is.textreg.result( object ) )
model = object
if ( !is.null( new.text ) ) {
keyphrase.mat = make.phrase.matrix( model$model$ngram, new.text )
keyphrase.mat[ , "*intercept*" ] = 1
} else {
keyphrase.mat = phrase.matrix( model )
}
model = model$model
kp = sweep( keyphrase.mat, 2, model$Z, FUN="/" )
rsp = as.numeric( kp %*% model$beta )
if ( return.matrix ) {
attr( rsp, "keyphrase.matrix" ) <- keyphrase.mat
}
rsp
}
#' Calculate total loss of model (Squared hinge loss).
#'
#' Calculate the loss for a model in predicting the -1/+1 labeling.
#' If new text and labeling given, then calc loss on the new text and labeling.
#' This can be useful for cross validation and train-test splits.
#'
#' @export
#' @param model.blob The model returned from \code{\link{textreg}}
#' @param new.text New text (string or tm Corpus) to predict labeling for
#' @param new.labeling Labeling to go with new text.
#' @param loss Type of loss to calc for.
#' @return Three numbers: total loss, loss from prediction, loss from penalty term
#' @examples
#' data( testCorpora )
#' res = textreg( c( "", "", "A", "A" ), c( -1, -1, 1, 1 ), C=1, Lq=1,
#' convergence.threshold=0.00000001, verbosity=0 )
#' calc.loss( res )
#' calc.loss( res, new.text=c("A B C A"), new.labeling=c(1) )
calc.loss = function( model.blob, new.text=NULL, new.labeling=NULL, loss=c( "square.hinge", "square", "hinge") ) {
loss = match.arg( loss )
model = model.blob$model
# k.mat = phrase.matrix( model.blob$rules, model.blob$notes$n )
pd = predict( model.blob, new.text )
if ( is.null( new.labeling ) ) {
if ( !is.null( new.text ) ) {
stop( "New text without new labeling" )
}
new.labeling = model.blob$labeling
}
if ( loss =="square.hinge" ) {
loss = sum( pmax( (1 - new.labeling*pd), 0 )^2 )
} else if ( loss == "square" ) {
loss = sum( (pd-new.labeling)^2 )
} else if ( loss == "hinge" ) {
loss = sum( pmax( (1 - pd* new.labeling), 0 ) )
}
pen = model.blob$notes$C * sum( abs( model$beta[ -1 ] ) )
c( tot.loss=loss+pen, loss=loss, penalty=pen )
}
#' Clean up output from textreg.
#'
#' Calculate some useful statistics (percents, etc) and return as dataframe.
#'
#' @export
#' @param model The model returned from \code{\link{textreg}}
#' @param short True if the output should be abbrviated for easy consumption.
#' @return Dataframe with statistics on the terms in the model
#' @family textreg.result
reformat.textreg.model = function( model, short=TRUE ) {
stopifnot( is.textreg.result( model ) )
npos = model$model$posCount[[1]]
nneg = model$model$negCount[[1]]
mod = model$model
mod$per = mod$posCount / mod$totalDocs
mod$perPos = mod$posCount / npos
mod$perNeg = mod$negCount / nneg
#mod$perWord = mod$posWordCount / mod$support
if ( !short ) {
mod[ c( "ngram", "beta", "Z", "support", "totalDocs", "posCount", "negCount", "per", "perPos", "perNeg" ) ]
#, "posWordCount", "negWordCount", "perWord" ) ]
} else {
mod = mod[ c("ngram", "support", "totalDocs", "posCount", "per", "perPos") ]
names(mod) = c("phrase", "num.phrase", "num.reports", "num.tag", "per.tag", "per.phrase" )
mod$per.tag = round( 100 * mod$per.tag )
mod$per.phrase = round( 100 * mod$per.phrase )
mod = mod[ order( mod$per.phrase, decreasing=TRUE ), ]
mod
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.