bathtub_vignette.R
In textreg: n-Gram Text Regression, aka Concise Comparative Summarization

## ----setup, include=FALSE, warning=FALSE--------------------------------------
knitr::opts_chunk$set(echo = TRUE)
knitr::opts_chunk$set(fig.align="center", fig.width=5, fig.height=5, size="scriptsize")
options(width = 80)


## ----install.package, echo=TRUE, eval=FALSE-----------------------------------
#  install.packages("textreg_0.1.tar.gz", repos = NULL, type="source")

## ----LoadPackage, echo=TRUE, message=FALSE, warning=FALSE---------------------
library( textreg )
library( tm )
data( bathtub )
bathtub

## ----GetLabeling, echo=TRUE---------------------------------------------------
mth.lab = meta(bathtub)$meth.chl
table( mth.lab )

## ----Getbanwords,echo=TRUE----------------------------------------------------
banwords = c( "methylene", "chloride")

## ----DoRegression, echo=TRUE, out.width="0.5\\textwidth"----------------------
rs = textreg(bathtub, mth.lab, C=4, gap=1, min.support = 1, 
            verbosity=0, convergence.threshold=0.00001, maxIter=100 )
rs

## ----See.results, echo=TRUE---------------------------------------------------
print( reformat.textreg.model( rs ), row.names=FALSE )

## ----plot_results, echo=TRUE--------------------------------------------------
plot( rs )

## ----Play.with.parameters, echo=TRUE------------------------------------------
rs5 = textreg( bathtub, mth.lab, banwords, C = 5, gap=1, min.support = 1, 
            verbosity=0, convergence.threshold=0.00001, maxIter=100 )
rsLq5 = textreg( bathtub, mth.lab, banwords, C = 3, Lq=5, gap=1, min.support = 1, 
               verbosity=0, convergence.threshold=0.00001, maxIter=100 )
rsMinSup10 = textreg( bathtub, mth.lab, banwords, C = 3, Lq=5, gap=1, min.support = 10,
                    verbosity=0, positive.only=TRUE, convergence.threshold=0.00001, maxIter=100 )
rsMinPat2 = textreg( bathtub, mth.lab, banwords, C = 3, Lq=5, gap=1, min.support = 1, 
                   min.pattern=2, verbosity=0, convergence.threshold=0.00001, maxIter=100 )

## ----show.different.models, results='asis', echo=TRUE-------------------------
library(xtable)
lst = list( rs5, rsLq5, rsMinSup10, rsMinPat2 )
names(lst) = c("C=5", "Lq=5","sup=10", "pat=2")
tbl = make.list.table( lst, topic="Misc Models" )
print( xtable( tbl, caption="Table from the make.list.table call" ), 
       latex.environments="tiny" )

## ----plot_different_models, echo=TRUE, fig.width=4----------------------------
list.table.chart( tbl )

## ----FindC, echo=TRUE---------------------------------------------------------
Cs = find.threshold.C( bathtub, mth.lab, banwords, R = 100, gap=1, min.support = 5, 
                       verbosity=0, convergence.threshold=0.00001 )

Cs[1]
summary( Cs[-1] )

C = quantile( Cs, 0.95 )
C

## ----dropDocs, echo=TRUE------------------------------------------------------
mth.lab.lit = mth.lab
mth.lab.lit[20:length(mth.lab)] = 0

rs.lit = textreg( bathtub, mth.lab.lit, banwords, C = 4, gap=1, min.support = 1, verbosity=0 )
rs.lit
rs.lit$labeling

## ----See.results.loc, echo=TRUE-----------------------------------------------
hits = phrase.matrix( rs )
dim( hits )
t( hits[ 1:10, ] )
hits.lit = phrase.matrix( rs.lit )
dim(hits.lit)

## ----See.results.loc2, echo=TRUE----------------------------------------------
apply( hits[ mth.lab == 1, ], 1, sum )
apply( hits[ mth.lab == 1, ], 2, sum )

## ----phrase.count.demo, echo=TRUE---------------------------------------------
tt2 = phrase.count( "tub * a", bathtub )
head( tt2 )
table( tt2, dnn="Counts for tub * a" )

## ----appearance.pat, echo=TRUE------------------------------------------------
tab = make.phrase.matrix( c( "bathtub", "tub * a" ), bathtub )
head( tab )
table( tab[,2] )

## ----make.count.table.demo, echo=TRUE-----------------------------------------
ct = make.count.table( c( "bathtub", "tub * a", "bath" ), mth.lab, bathtub )
ct

## ----grab.frag.demo, echo=TRUE------------------------------------------------
tmp = grab.fragments( "bathtub", bathtub, char.before=30, char.after=30, clean=TRUE )
tmp[1:3]

## ----sample.frag.demo, echo=TRUE----------------------------------------------
frags = sample.fragments( "tub * a", mth.lab, bathtub, 20, char.before=30, char.after=30 )
frags

## ----ClusterPhraes, echo=TRUE, out.width="0.5\\textwidth"---------------------
cluster.phrases( rs, num.groups=3 )

## ----Make_phrase_cor_chart, echo=TRUE, out.width="0.5\\textwidth"-------------
make.phrase.correlation.chart( rs, count=TRUE, num.groups=3 )

## ----CalcLoss, echo=TRUE------------------------------------------------------
calc.loss( rs )

## ----Prediction, echo=TRUE, out.width="0.5\\textwidth"------------------------
pds = predict( rs )
labs = rs$labeling
table( labs )
boxplot( pds ~ labs, ylim=c(-1,1) ) 
abline( h=c(-1,1), col="red" )

## ----Outofsample, echo=TRUE, out.width="0.5\\textwidth"-----------------------
    smp = sample( length(bathtub), length(bathtub)*0.5 )
	rs = textreg(  bathtub[smp], mth.lab[smp], C = 3, gap=1, min.support = 5, 
              verbosity=0, convergence.threshold=0.00001, maxIter=100 )
	rs
	train.pred = predict( rs )
	test.pred = predict( rs, bathtub[-smp] )

	train.loss = calc.loss( rs )
	train.loss
	test.loss = calc.loss( rs, bathtub[-smp], mth.lab[-smp] )
	test.loss

## ----Cross Validation, echo=TRUE----------------------------------------------
  tbl = find.CV.C( bathtub, mth.lab, c("methylene","chloride"), 4, 8, verbosity=0 )
  print( round( tbl, digits=3 ) )

## ----CrossValidationPlot, echo=TRUE, out.width="0.5\\textwidth"---------------
  rs = make.CV.chart( tbl )
  rs

## ----CleanAndStem, echo=TRUE--------------------------------------------------
data( dirtyBathtub )
strwrap( dirtyBathtub$text[[1]] )
bc = VCorpus( VectorSource( dirtyBathtub$text ) )

bc.clean = clean.text( bc )
strwrap( bc.clean[[1]] )
  
bc.stem = stem.corpus(bc.clean, verbose=FALSE)
strwrap( bc.stem[[1]] )

## ----CleanAndStem2, echo=TRUE-------------------------------------------------
  res.stm = textreg(  bc.stem, mth.lab, c("chlorid+", "methylen+"), C=4, verbosity=0 )
  res.stm

  sample.fragments( "that contain+", res.stm$labeling, bc.stem, 5, char.before=10 )
  sample.fragments( "that contain+", res.stm$labeling, bc.clean, 5, char.before=10 )