# wordspace-intro.R
# From the package "wordspace: Distributional Semantic Models in R"

## ----message=FALSE------------------------------------------------------------
library(wordspace)

## ----echo=FALSE---------------------------------------------------------------
# Render a reproducible random sample of 10 rows of DSM_VerbNounTriples_BNC.
set.seed(42)  # fixed seed so the same rows are drawn on every run
idx <- sample.int(nrow(DSM_VerbNounTriples_BNC), 10)
idx <- sort(idx)  # show the sampled rows in their original table order
knitr::kable(DSM_VerbNounTriples_BNC[idx, ])

## -----------------------------------------------------------------------------
# Keep only the rows whose `mode` column equals "written".
Triples <- subset(
  DSM_VerbNounTriples_BNC,
  mode == "written"
)

## -----------------------------------------------------------------------------
# Show the entries for the noun "dog" combined with the verb "walk".
subset(Triples, verb == "walk" & noun == "dog")

## -----------------------------------------------------------------------------
# Build the model: nouns are the targets, verbs the features, and column `f`
# supplies the scores (flagged as raw frequencies via raw.freq).
VObj <- dsm(
  target = Triples$noun,
  feature = Triples$verb,
  score = Triples$f,
  raw.freq = TRUE
)
dim(VObj)

## -----------------------------------------------------------------------------
subset(VObj$rows, rank(-f) <= 6) # 6 most frequent nouns

## -----------------------------------------------------------------------------
# Preview the beginning of the model.
head(VObj)

## -----------------------------------------------------------------------------
# Apply the same `nnzero >= 3` condition to rows (subset) and columns (select).
# NOTE(review): recursive=TRUE presumably re-applies the filter until both
# conditions hold simultaneously -- confirm against the subset.dsm docs.
VObj <- subset(
  VObj,
  subset = nnzero >= 3,
  select = nnzero >= 3,
  recursive = TRUE
)
dim(VObj)

## -----------------------------------------------------------------------------
# Re-score the model in place: "simple-ll" scores, "log" transformation, and
# normalization of the vectors with the "euclidean" method.
VObj <- dsm.score(
  VObj,
  score = "simple-ll",
  transform = "log",
  normalize = TRUE,
  method = "euclidean"
)

## -----------------------------------------------------------------------------
VObj  # print the scored model

## -----------------------------------------------------------------------------
# Project the scored model onto 300 latent dimensions with method "svd".
VObj300 <- dsm.projection(VObj, method = "svd", n = 300)
dim(VObj300)

## ---- fig.width=7, fig.height=3, echo=2---------------------------------------
# Plot the "R2" attribute of the projection, one vertical line per dimension.
# par() returns the previous settings; they are restored after plotting so the
# margin change does not leak into later plots or the caller's session.
op <- par(mar = c(4, 4, 1, 1))
plot(attr(VObj300, "R2"), type = "h", xlab = "latent dimension", ylab = "R2")
par(op)

## -----------------------------------------------------------------------------
# Distance between "book" and "paper" in the reduced space, method "cosine".
pair.distances("book", "paper", VObj300, method = "cosine")

## -----------------------------------------------------------------------------
# Same pair with convert=FALSE.
# NOTE(review): per the wordspace docs this presumably reports the similarity
# value itself instead of a converted distance -- confirm.
pair.distances(
  "book", "paper", VObj300,
  method = "cosine",
  convert = FALSE
)

## -----------------------------------------------------------------------------
nearest.neighbours(VObj300, "book", n = 14) # reduced space

## -----------------------------------------------------------------------------
nn <- nearest.neighbours(VObj, "book", n = 15)    # unreduced space
names(nn)

## ---- echo=c(1,3), fig.height=4-----------------------------------------------
# Request the neighbours of "book" as a distance matrix and plot it.
nn.mat <- nearest.neighbours(VObj300, "book", n = 15, dist.matrix = TRUE)
# par() returns the previous settings; they are restored after plotting so the
# margin change does not leak into later plots or the caller's session.
op <- par(mar = c(1, 1, 1, 1))
plot(nn.mat)
par(op)

## ----echo=FALSE---------------------------------------------------------------
# Show every 10th row of RG65, starting at row 5.
knitr::kable(RG65[seq(5, 65, 10), ])

## -----------------------------------------------------------------------------
# Evaluate the reduced model against RG65 (word format "HW", convert=FALSE).
eval.similarity.correlation(RG65, VObj300, convert = FALSE, format = "HW")

## ---- echo=2------------------------------------------------------------------
# Plot the detailed evaluation result. The previous par() settings are saved
# and restored so the margin change stays local to this plot.
op <- par(mar = c(4, 4, 2, 1))
plot(eval.similarity.correlation(RG65, VObj300, convert = FALSE, format = "HW", details = TRUE))
par(op)

## ---- echo=2------------------------------------------------------------------
# Same evaluation plot for DSM_Vectors, again restoring par() afterwards.
op <- par(mar = c(4, 4, 2, 1))
plot(eval.similarity.correlation(RG65, DSM_Vectors, convert = FALSE, details = TRUE))
par(op)

## -----------------------------------------------------------------------------
# Select the SemCorWSD rows for the noun "vessel" and tabulate their glosses.
Vessel <- subset(SemCorWSD, target == "vessel" & pos == "n")
table(Vessel$gloss)

## ---- echo=FALSE--------------------------------------------------------------
knitr::kable(Vessel[, c("sense", "sentence")], row.names = FALSE)

## -----------------------------------------------------------------------------
# Compute one context vector per row of Vessel from its `lemma` column,
# labelled with the row's `id`.
centroids <- context.vectors(DSM_Vectors, Vessel$lemma, row.names = Vessel$id)

## ---- echo=2:4----------------------------------------------------------------
# par() returns the previous settings; they are restored after plotting so the
# margin change does not leak into later plots or the caller's session.
op <- par(mar = c(2, 2, 2, 1))
library(cluster) # clustering algorithms of Kaufman & Rousseeuw (1990)
# Cluster into 2 groups with pam(), passing the distance matrix as a
# dissimilarity (diss=TRUE) and keeping it in the result (keep.diss=TRUE).
res <- pam(dist.matrix(centroids), 2, diss = TRUE, keep.diss = TRUE)
plot(res, col.p = factor(Vessel$sense), shade = TRUE, which = 1, main = "WSD for 'vessel'")
par(op)

## ---- echo=1, eval=2----------------------------------------------------------
# Cross-tabulate cluster assignments against the `sense` column.
table(res$clustering, Vessel$sense)
knitr::kable(table(res$clustering, Vessel$sense))

## -----------------------------------------------------------------------------
eval.clustering(Vessel, M = centroids, word.name = "id", class.name = "sense")

## -----------------------------------------------------------------------------
# Pull out the rows for the two words that are combined below.
mouse <- VObj300["mouse", ] # extract row vectors from matrix
computer <- VObj300["computer", ]

## -----------------------------------------------------------------------------
# Neighbours of the plain "mouse" entry, for comparison.
nearest.neighbours(VObj300, "mouse", n = 12)

## -----------------------------------------------------------------------------
# Neighbours of the element-wise sum of the two vectors.
nearest.neighbours(
  VObj300,
  mouse + computer,
  n = 12
)

## -----------------------------------------------------------------------------
# Neighbours of the element-wise product of the two vectors.
nearest.neighbours(
  VObj300,
  mouse * computer,
  n = 12
)