word_cor: Find Correlated Words

View source: R/word_cor.R

word_cor    R Documentation

Find Correlated Words

Description

Find associated words within grouping variable(s).

Usage

word_cor(
  text.var,
  grouping.var = qdapTools::id(text.var),
  word,
  r = 0.7,
  values = TRUE,
  method = "pearson",
  ...
)

Arguments

text.var

The text variable (or frequency matrix).

grouping.var

The grouping variables. Default uses each row as a group. Also takes a single grouping variable or a list of 1 or more grouping variables. Unlike other qdap functions, this cannot be NULL.

word

The word(s) vector to find associated words for.

r

The correlation level to find associated words for. If positive, this is the minimum value; if negative, this is the maximum value.

values

logical. If TRUE returns the named correlates (names are the words). If FALSE only the associated words are returned.

method

A character string indicating which correlation coefficient is to be computed ("pearson", "kendall", or "spearman").

...

Other arguments passed to wfm.

Value

Returns a vector of associated words, or a correlation matrix if r = NULL.

Note

Note that if a word has no variability in its usage across grouping variable(s), the standard deviation will be 0, and thus cor will likely return a warning, as in this example: cor(rep(3, 10), rnorm(10)).

References

The plotting method for the list output was inspired by Ben Marwick; see https://stackoverflow.com/a/19925445/1000343 for more.

See Also

word_proximity, findAssocs, word_associate, wfm, cor

Examples

## Not run: 
x <- factor(with(rajSPLIT, paste(act, pad(TOT(tot)), sep = "|")))
word_cor(rajSPLIT$dialogue, x, "romeo", .45)
word_cor(rajSPLIT$dialogue, x, "love", .5)  

## Negative correlation
word_cor(rajSPLIT$dialogue, x, "you", -.1)
with(rajSPLIT, word_cor(dialogue, list(person, act), "hate"))

words <- c("hate", "i", "love", "ghost")
with(rajSPLIT, word_cor(dialogue, x, words, r = .5))
with(rajSPLIT, word_cor(dialogue, x, words, r = .4))

## Set `r = NULL` to get matrix between words
with(rajSPLIT, word_cor(dialogue, x, words, r = NULL))

## Plotting 
library(tm)
data("crude")
oil_cor1 <- apply_as_df(crude, word_cor, word = "oil", r=.7)
plot(oil_cor1)

oil_cor2 <- apply_as_df(crude, word_cor, word = qcv(texas, oil, money), r=.7)
plot(oil_cor2)
plot(oil_cor2, ncol=2)

oil_cor3 <- apply_as_df(crude, word_cor, word = qcv(texas, oil, money), r=NULL)
plot(oil_cor3)

## Run on multiple times/person/nested
## Split and apply to data sets
## Suggested use of stemming
DATA3 <- split(DATA2, DATA2$person)

## Find correlations between words per turn of talk by person
## Throws multiple warnings because the data set is small
library(qdapTools)
lapply(DATA3, function(x) {
    word_cor(x[, "state"], qdapTools::id(x), qcv(computer, i, no, good), r = NULL)
})

## Find words correlated per turn of talk by person
## Throws multiple warnings because the data set is small
lapply(DATA3, function(x) {
    word_cor(x[, "state"], qdapTools::id(x), qcv(computer, i, no, good))
})


## A real example
dat <- pres_debates2012 
dat$TOT <- factor(with(dat, paste(time, pad(TOT(tot)), sep = "|")))
dat <- dat[dat$person %in% qcv(OBAMA, ROMNEY), ]
dat$person <- factor(dat$person)
dat.split <- with(dat, split(dat, list(person, time)))

wrds <- qcv(america, debt, dollar, people, tax, health)
lapply(dat.split, function(x) {
    word_cor(x[, "dialogue"], x[, "TOT"], wrds, r=NULL)
})

## Supply a matrix (make sure to use `t` on a `wfm` matrix)
worlis <- list(
    pronouns = c("you", "it", "it's", "we", "i'm", "i"),
    negative = qcv(no, dumb, distrust, not, stinks),
    literacy = qcv(computer, talking, telling)
)
y <- wfdf(DATA$state, qdapTools::id(DATA, prefix = TRUE))
z <- wfm_combine(y, worlis)

out <- word_cor(t(z), word = c(names(worlis), "else.words"), r = NULL)
out
plot(out)

## Additional plotting/viewing
require(tm)
data("crude")

out1 <- word_cor(t(as.wfm(crude)), word = "oil", r=.7)
vect2df(out1[[1]], "word", "cor")

plot(out1)
qheat(vect2df(out1[[1]], "word", "cor"), values=TRUE, high="red", 
    digits=2, order.by ="cor", plot=FALSE) + coord_flip()


out2 <- word_cor(t(as.wfm(crude)), word = c("oil", "country"), r=.7)
plot(out2)

## End(Not run)

qdap documentation built on May 31, 2023, 5:20 p.m.