# demo/lsa_landauer.R

### -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  
### demo/lsa_landauer.r
### -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  

# -  -  -  -  -  -  -  -  -  -  -  -  -  -  -
# generate the files of the famous Landauer example

# Create a scratch directory that will hold one small text file per document.
ldir <- tempfile()
dir.create(ldir)

# Write the nine example documents (c1-c5 and m1-m4) into ldir; write() puts
# each term of a character vector on its own line. local() keeps the helper
# variables out of the calling environment.
local({
  landauer_docs <- list(
    c1 = c("human", "interface", "computer"),
    c2 = c("survey", "user", "computer", "system", "response", "time"),
    c3 = c("EPS", "user", "interface", "system"),
    c4 = c("system", "human", "system", "EPS"),
    c5 = c("user", "response", "time"),
    m1 = c("trees"),
    m2 = c("graph", "trees"),
    m3 = c("graph", "minors", "trees"),
    m4 = c("graph", "minors", "survey")
  )
  for (doc_name in names(landauer_docs)) {
    write(landauer_docs[[doc_name]], file = file.path(ldir, doc_name))
  }
})

# -  -  -  -  -  -  -  -  -  -  -  -  -  -  -
# generate doc term matrix from landauer files

# Build the document-term matrix from the files in ldir and display it.
dtm <- textmatrix(ldir, minWordLength = 1)
dtm

# -  -  -  -  -  -  -  -  -  -  -  -  -  -  -
# make a space, reconstruct original

# Create a space whose dimensionality is chosen by dimcalc_raw(), then turn
# it back into a textmatrix.
landauerOriginalSpace <- lsa(dtm, dims = dimcalc_raw())
X <- as.textmatrix(landauerOriginalSpace)

# Apart from rounding errors X should equal dtm, so the comparison is made
# after rounding X to two decimals.
all(round(X, 2) == dtm)

# -  -  -  -  -  -  -  -  -  -  -  -  -  -  -
# reduce dimensionality (Y shall be 
# the recalculated 'reduced' matrix)

# Build a space with two dimensions; Y is the textmatrix recalculated from
# that reduced space.
landauerSpace <- lsa(dtm, dims = 2)
Y <- as.textmatrix(landauerSpace)
round(Y, 2)

# -  -  -  -  -  -  -  -  -  -  -  -  -  -  -
# read the Landauer sample in once more, this time
# restricted to the vocabulary of the existing matrix

pdocs <- textmatrix(ldir, vocabulary = rownames(dtm))

# -  -  -  -  -  -  -  -  -  -  -  -  -  -  -
# fold the re-read documents into the space built from dtm

Y2 <- fold_in(pdocs, landauerSpace)
round(Y2, 2)

# Y and Y2 are expected to match (just as dtm and pdocs
# are expected to be equal); compare at two decimals.

all(round(Y, 2) == round(Y2, 2))

# calc pearson doc2doc correlation

# Column-wise correlations of the raw matrix and of the reduced matrix.
rawCor <- cor(dtm)
lsaCor <- cor(Y)

# In lsaCor the "computer" documents (names starting with "c") should be
# separated much more clearly from the "math" documents (names starting
# with "m") than in rawCor. In addition, the documents within each of the
# two groups should have become more similar to one another.

round(rawCor, 2)
round(lsaCor, 2)

# remove the temporary directory together with the files written into it
unlink(ldir, recursive = TRUE)

# Try the lsa package in your browser
#
# Any scripts or data that you put into this service are public.
#
# lsa documentation built on May 9, 2022, 9:10 a.m.