#' Create a semantic map of a keyword
#'
#' Build a scatterplot that represents the conceptual structure of a keyword.
#'
#' @param mat A word-context matrix.
#'
#' @param keyword A character string: the term whose semantic neighborhood will be mapped.
#'
#' @param method A character string naming the similarity measure to use: 'cosine', 'euclidean',
#' 'pearson', or 'covariance'. Default is 'cosine'.
#'
#' @param margin Numeric value: 1 or 2. If 1, calculations are performed over the rows (words); if 2,
#' over the columns (documents).
#'
#' @param numResults Numeric value. The number of words to be displayed in the
#' graph. Default is 40.
#'
#' @param numGrps Numeric value. The number of clusters into which the displayed words are divided.
#' Default is 5.
#'
#' @return A scatterplot, printed to the current graphics device, showing the conceptual structure
#' of the \code{numResults} most-similar words.
#'
#' @section What it does:
#' This function runs \code{\link{most_similar}} over a word-context matrix to find the
#' \code{numResults} most-similar terms, then clusters them and plots them on a two-dimensional map.
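#'
#' @examples
#' \dontrun{
#' # A minimal sketch, assuming `wcm` is a word-context matrix you have already
#' # built for your corpus (the object name is illustrative only):
#' similarity_map(wcm, "love")
#'
#' # A tighter map with fewer words and clusters:
#' similarity_map(wcm, "love", method = "pearson", numResults = 30, numGrps = 4)
#' }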
#' @export
similarity_map = function(mat,
keyword,
method = "cosine",
margin = 1,
numResults = 40,
numGrps = 5) {
if (!method %in% c("cosine", "euclidean", "pearson", "covariance")) {
stop("The 'method' argument must be one of: 'cosine', 'euclidean', 'pearson', or 'covariance'.")
}
# Get words most similar to keyword
results = most_similar(mat = mat, vec = keyword, method = method, margin = margin, fullResults = TRUE)
if (method == "euclidean") {
# Euclidean distances: smaller values mean greater similarity, so keep the lowest
results = sort(results)[1:numResults]
} else {
results = sort(results, decreasing = TRUE)[1:numResults]
}
words = names(results)
# Set correlation method
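# Note: cosine, pearson, and covariance yield similarities (higher = closer),
# while euclidean yields distances (lower = closer)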
if (method == "cosine") {
corr_method = function(x, y) { x %*% y/(sqrt(x %*% x) * sqrt(y %*% y)) }
}
if (method == "euclidean") {
corr_method = function(x, y) { sqrt(sum((x - y)^2)) }
}
if (method == "covariance") {
corr_method = cov
}
if (method == "pearson") {
corr_method = cor
}
# Now build a correlations matrix among the words
if (margin == 1) {
mat = mat[words, ]
correlations = matrix(0, length(words), length(words))
for (i in 1:nrow(correlations)) {
correlations[i, ] = apply(mat, 1, corr_method, mat[i, ])
}
rownames(correlations) = words
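# Cluster the similarity profiles hierarchically, then project them into two
# dimensions with classical multidimensional scaling for plotting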
hc = hclust(dist(correlations))
cluster <- cutree(hc, k=numGrps)
xy <- data.frame(cmdscale(dist(correlations)), factor(cluster), factor(words))
names(xy) <- c("x", "y", "clusts", "words")
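# Words left alone in a cluster are treated as outliers: they are dropped from
# the plot and reported in the x-axis label instead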
single_item_cluster = which(table(xy$clusts) == 1)
bad_hits = which(xy$clusts %in% single_item_cluster)
outliers = c()
if (length(bad_hits) > 0) {
outliers = xy$words[bad_hits]
xy = xy[-bad_hits,]
}
xdiff = (max(xy$x) - min(xy$x)) / 4
ydiff = (max(xy$y) - min(xy$y)) / 4
p = ggplot(xy, aes(x, y, label = words, group = clusts))
if (length(outliers) == 0) {
p = p + xlab("")
}
if (length(outliers) == 1) {
p = p + xlab(paste("Outlier excluded from graph:", paste(outliers, collapse = ", ")))
}
if (length(outliers) > 1) {
p = p + xlab(paste("Outliers excluded from graph:", paste(outliers, collapse = ", ")))
}
p = p + stat_density2d(aes(alpha = ..level.., fill = clusts), geom="polygon", bins = 30, size = 0) +
scale_alpha_continuous(range=c(0.01,0.033)) +
geom_text(colour = "black") +
theme_bw() +
xlim(min(xy$x) - xdiff, max(xy$x) + xdiff) +
ylim(min(xy$y) - ydiff, max(xy$y) + ydiff) +
ylab("") +
theme(axis.line = element_line(colour = "black"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
axis.ticks = element_blank(),
axis.line.x = element_blank(),
axis.line.y = element_blank(),
legend.position = 'none',
text = element_text(size = 2),
title = element_text(size=10, face='bold'))
print(p)
}
if (margin == 2) {
mat = mat[,words]
correlations = matrix(0, length(words), length(words))
for (i in 1:nrow(correlations)) {
correlations[, i] = apply(mat, 2, corr_method, mat[, i])
}
rownames(correlations) = words
hc = hclust(dist(correlations))
cluster <- cutree(hc, k=numGrps)
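# Over columns, points are documents: label each with its TCP id plus the first
# twenty characters of its title, looked up in the 'tcp' metadata table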
data(tcp)
row.names(tcp) = tcp$TCP
titles = tcp[words,"Title"]
titles = substring(titles, 1, 20)
titles = paste(words, titles, sep=", ")
xy <- data.frame(cmdscale(dist(correlations)), factor(cluster), titles)
names(xy) <- c("x", "y", "clusts", "words")
single_item_cluster = which(table(xy$clusts) == 1)
bad_hits = which(xy$clusts %in% single_item_cluster)
outliers = c()
if (length(bad_hits) > 0) {
outliers = xy$words[bad_hits]
xy = xy[-bad_hits,]
}
xdiff = (max(xy$x) - min(xy$x)) / 4
ydiff = (max(xy$y) - min(xy$y)) / 4
p = ggplot(xy, aes(x, y, label = words, group = clusts))
if (length(outliers) == 0) {
p = p + xlab("")
}
if (length(outliers) == 1) {
p = p + xlab(paste("Outlier excluded from graph:", paste(outliers, collapse = ", ")))
}
if (length(outliers) > 1) {
p = p + xlab(paste("Outliers excluded from graph:", paste(outliers, collapse = ", ")))
}
p = p + stat_density2d(aes(alpha = ..level.., fill = clusts), geom="polygon", bins = 30, size = 0) +
scale_alpha_continuous(range=c(0.01,0.033)) +
geom_text(colour = "black") +
theme_bw() +
xlim(min(xy$x) - xdiff, max(xy$x) + xdiff) +
ylim(min(xy$y) - ydiff, max(xy$y) + ydiff) +
ylab("") +
theme(axis.line = element_line(colour = "black"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
axis.ticks = element_blank(),
axis.line.x = element_blank(),
axis.line.y = element_blank(),
legend.position = 'none',
text = element_text(size = 2),
title = element_text(size=10, face='bold'))
print(p)
}
}