#' Create a co-occurence network
#'
#' @param x either a document-term matrix (DTM) or a tokenlist.
#' @param measure the measure to calcualte adjacency. Currently supports cosine and conditional probability
#' @return A graph in the Igraph format in which edges represent the adjacency of terms
#' @export
coOccurenceNetwork <- function(x, measure='cooccurence', doc.col=getOption('doc.col','doc_id'), position.col=getOption('position.col','position'), word.col=getOption('word.col','word')){
if('data.frame' %in% class(x)) {
verifyTokenlistColumns(tokenlist, doc.col, position.col, word.col)
x = tokenlistToDTM(x, doc.col, position.col, word.col)
}
if('DocumentTermMatrix' %in% class(x)) x = dtmToSparseMatrix(x)
x = as(as(x, 'dgCMatrix'), 'dgTMatrix')
if(measure == 'cosine') {
mat = as(getCosine(x), 'dgCMatrix')
g = graph.adjacency(mat, mode='upper', diag=F, weighted=T)
}
if(measure == 'cooccurence') {
mat = getCoOccurence(x)
g = graph.adjacency(mat, mode='upper', diag=F, weighted=T)
}
if(measure == 'conprob') {
mat = getConditionalProbability(x)
g = graph.adjacency(mat, mode='directed', diag=F, weighted=T)
}
V(g)$freq = col_sums(x)
g = set.edge.attribute(g, measure, value=E(g)$weight)
class(g) = c('semnet',class(g))
g
}
dtmToSparseMatrix <- function(dtm){
sm = spMatrix(nrow(dtm), ncol(dtm), dtm$i, dtm$j, dtm$v)
rownames(sm) = rownames(dtm)
colnames(sm) = colnames(dtm)
sm
}
getCoOccurence <- function(m1, m2=m1){
m1@x[m1@x > 0] = 1
m2@x[m2@x > 0] = 1
mat = Matrix::crossprod(m1,m2)
mat[is.na(mat)] = 0
mat
}
getConditionalProbability <- function(m1, m2=m1){
m1@x[m1@x > 0] = 1
m2@x[m2@x > 0] = 1
mat = Matrix::crossprod(m1,m2)/Matrix::colSums(m1)
mat[is.na(mat)] = 0
mat
}
getCosine <- function(m1, m2=NULL){
norm = sqrt(Matrix::colSums(m1^2))
m1@x = m1@x / norm[m1@j+1]
if(!is.null(m2)){
norm = sqrt(Matrix::colSums(m2^2))
m2@x = m2@x / norm[m2@j+1]
cp = Matrix::crossprod(m1,m2)
} else cp = Matrix::crossprod(m1)
cp
}
calculateAdjacency <- function(position.mat, window.mat){
adj = Matrix::crossprod(position.mat, window.mat)
g = graph.adjacency(adj, mode='directed', weighted=T, diag=F)
V(g)$freq = col_sums(position.mat)
E(g)$cooccurence = E(g)$weight
E(g)$weight_pct = E(g)$weight / V(g)$freq[get.edgelist(g, names = F)[,1]]
class(g) = c('semnet',class(g))
g
}
aggCoOc <- function(x, position.mat, window.mat){
cooc = position.mat[,x] & window.mat
cooc = as(cooc, 'lgTMatrix')
cooc = data.frame(x=x, y=cooc@j+1, context=cooc@i+1, weight=cooc@x)
cooc = cooc[!cooc$x == cooc$y,]
ddply(cooc, .(x,y,context), summarize, weight=sum(weight))
}
calculateAdjacencyPerContext <- function(position.mat, window.mat) {
adj = ldply(1:ncol(position.mat), function(x) aggCoOc(x, position.mat, window.mat))
adj$context = rownames(position.mat)[adj$context]
adj$x = as.factor(colnames(position.mat)[adj$x])
adj$y = as.factor(colnames(position.mat)[adj$y])
adj
}
#' Merge a list of matrices
#'
#' Merge matrices, by standradizing column and row dimensions and using the Reduce function to merge them.
#'
#' @param matrices A list of matrices
#' @param reduce_func The function to pass to Reduce
#' @return A single matrix
#' @export
mergeMatrices <- function(matrices, reduce_func='+'){
rowdim = unique(llply(matrices, function(x) rownames(x)))[[1]]
coldim = unique(llply(matrices, function(x) colnames(x)))[[1]]
matrices = llply(matrices, function(x) setMatrixDims(x, rowdim, coldim))
mat = Reduce(reduce_func, matrices)
rownames(mat) = rowdim
colnames(mat) = coldim
mat
}
setMatrixDims <- function(mat, rowdim, coldim){
## set rows and columns of matrix to given rowdim and coldim vectors. (usefull for making different matrices identical so that they can be summed up)
mat = as(mat, 'dgTMatrix')
d = data.frame(i=rownames(mat)[mat@i+1], j=colnames(mat)[mat@j+1], v=mat@x)
d = d[d$i %in% rowdim & d$j %in% coldim,]
d$i = match(d$i, rowdim)
d$j = match(d$j, coldim)
spMatrix(length(rowdim), length(coldim), d$i, d$j, d$v)
}
#### using chi-square (experimental)
#' Compute the chi^2 statistic for a 2x2 crosstab containing the values
#' [[a, b], [c, d]]
chi2 <- function(a,b,c,d, yates_correction=rep(F, length(a)), autocorrect=F){
n = a+b+c+d
sums = cbind(c1 = a+c, c2 = b+d, r1 = a+b, r2 = c+d)
if(autocorrect){
## apply Cochrans criteria: no expected values below 1 and less than 20% of cells empty (which means none in a 2x2 design)
## if these are violated, use the yates_correction
## http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2041889/
e = cbind(sums[,'c1'] / n, sums[,'c2'] / n)
e = cbind(e * sums[,'r1'], e * sums[,'r2'])
c1 = rowSums(e < 1) > 0 # at least one expected value below 1
c2 = rowSums(sums < 5) > 0 # at least one cell below 5
yates_correction = ifelse(c1 | c2, T, F)
}
x = a*d - b*c
x = ifelse(yates_correction, abs(x) - n/2, x)
chi = n*x^2 / (sums[,'c1'] * sums[,'c2'] * sums[,'r1'] * sums[,'r2'])
ifelse(is.na(chi), 0, chi)
}
binom.coef.log <- function(n, k) {
bcl_func <- function(n, k) sum(log(((n-k+1):n) / ((k-k+1):k)))
mapply(bcl_func, n, k)
}
fishers.exact <- function(a,b,c,d){
n = a+b+c+d
log_p = binom.coef.log(a+b,a) + binom.coef.log(c+d,c) - binom.coef.log(n,a+c)
exp(log_p)
}
getContextOccurence <- function(m1, m2=m1){
m2@x[Matrix::which(m2@x > 0)] = 1
mat = Matrix::crossprod(m1,m2)
mat[is.na(mat)] = 0
mat
}
smooth_prob <- function(succes, n, vocabulary_size=NULL, smoothing_parameter=1){
if(is.null(vocabulary_size)) {
(word_occ + smoothing_parameter) / (n + smoothing_parameter)
} else{
(word_occ + smoothing_parameter) / (n + (vocabulary_size*smoothing_parameter))
}
}
edgeChi2 <- function(g, wordfreq.col='freq', wordcooc.col='cooccurence', vocabulary=NULL){
freq = get.vertex.attribute(g, wordfreq.col)
cooc = get.edge.attribute(g, wordcooc.col)
e = get.edgelist(g, names = F)
if(is.null(vocabulary)) vocabulary = structure(freq, names=V(g)$name)
y_if_x= cooc
y_ifnot_x = freq[e[,2]] - cooc
x_total_cooc = tapply(cooc, e[,1], sum)
n_if_x = x_total_cooc[e[,1]]
n_ifnot_x = sum(x_total_cooc) - n_if_x
E(g)$chi = chi2(y_if_x, y_ifnot_x,
n_if_x - y_if_x, n_ifnot_x - y_ifnot_x,
autocorrect = T)
E(g)$chi.p = 1-pchisq(E(g)$chi, 1)
g
}
#' Get wordassociations based on ratio and chi-2
#'
#' Calculate wordassociations (x -> y) as the ratio of the p
#'
#' @param dtm the main document-term matrix
#' @return A dataframe representing and edge.list
#' @export
chi2_wordassociations <- function(dtm, odds.ratio.thres=1, thres=0.05, measure='fisher', smoothing_parameter=1, use.wordcount=T, return.graph=T) {
d = coOccurenceDistributionTable(dtm, use.wordcount)
d$prob_y_if_x = (d$y_if_x / d$n_if_x)
d$prob_y_ifnot_x = (d$y_ifnot_x / d$n_ifnot_x)
d$prob_ratio = d$prob_y_if_x / d$prob_y_ifnot_x
d$odds_ratio = (d$prob_y_if_x / (1 + d$prob_y_if_x)) / (d$prob_y_ifnot_x / (1 + d$prob_y_ifnot_x))
d = d[d$odds_ratio > odds.ratio.thres,]
d$chi = chi2(d$y_if_x,
d$y_ifnot_x,
d$n_if_x-d$y_if_x,
d$n_ifnot_x-d$y_ifnot_x)
d$chi.p = 1-pchisq(d$chi, 1)
d$fisher.p = fishers.exact(d$y_if_x,
d$y_ifnot_x,
d$n_if_x-d$y_if_x,
d$n_ifnot_x-d$y_ifnot_x)
d = d[d$chi.p < chi.p.thres,]
d = d[d$fisher.p < fisher.p.thres,]
d = d[!d$x == d$y,]
d$x = colnames(dtm)[d$x]
d$y = colnames(dtm)[d$y]
d$n = d$y_if_x
if(return.graph){
g = graph.data.frame(d[,c('x','y')])
E(g)$odds_ratio = d$odds_ratio
E(g)$smooth_odds_ratio = d$odds_ratio
E(g)$weight = d$odds_ratio / (1 + d$odds_ratio)
V(g)$freq = col_sums(dtm)[match(V(g)$name, colnames(dtm))]
class(g) = c('semnet',class(g))
return(g)
} else return(d[,c('x','y', 'n', 'prob_ratio', 'odds_ratio','chi','chi.p', 'fisher.p')])
}
coOccurenceDistributionTable <- function(dtm, use.wordcount=T, smooth_param=1){
if('DocumentTermMatrix' %in% class(dtm)) dtm = dtmToSparseMatrix(dtm)
dtm = as(dtm, 'dgCMatrix')
conocc = as(getContextOccurence(dtm), 'dgTMatrix')
d = data.frame(x=conocc@j+1, y=conocc@i+1)
d$y_if_x= conocc@x # how often did y occur in documents in which x occured
d$y_ifnot_x = Matrix::colSums(dtm)[d$y] - d$y_if_x # how often did y occur in documents in which x did not occur
d$n_if_x = Matrix::colSums(conocc)[d$x] # how many words occured in documents where x occured
d$n_ifnot_x = sum(dtm) - d$n_if_x # how many words occured in documents where x did not occur
d$y_if_x = d$y_if_x + smooth_param
d$y_ifnot_x = d$y_ifnot_x + smooth_param
if(use.wordcount){
vocabulary_if_x = Matrix::colSums(conocc > 0)[d$x]
vocabulary = Matrix::colSums(dtm)
vocabulary_ifnot_x = sapply(1:ncol(dtm), function(x) sum(vocabulary > Matrix::colSums(dtm[dtm[,x] > 0, ,drop=F])))
vocabulary_ifnot_x = vocabulary_ifnot_x[d$x]
d$n_if_x = d$n_if_x + vocabulary_if_x*smooth_param
#d$n_ifnot_x = d$n_ifnot_x + ncol(dtm)*smooth_param
d$n_ifnot_x = d$n_ifnot_x + vocabulary_ifnot_x*smooth_param
} else {
d$n_if_x = d$n_if_x + smooth_param
d$n_ifnot_x = d$n_ifnot_x + smooth_param
}
d
}
#' Get wordassociations based on ratio and chi-2
#'
#' Calculate wordassociations (x -> y) as the ratio of the p
#'
#' @param dtm the main document-term matrix
#' @return A dataframe representing and edge.list
#' @export
chi2_wordassociations_alternative <- function(dtm, ratio.thres=1, p.thres=0.05, smoothing_parameter=1, return.graph=T) {
if('DocumentTermMatrix' %in% class(dtm)) dtm = dtmToSparseMatrix(dtm)
dtm = as(dtm, 'dgCMatrix')
conocc = as(getContextOccurence(dtm), 'dgTMatrix')
d = data.frame(x=conocc@j+1, y=conocc@i+1)
d$y_if_x= conocc@x # how often did y occur in documents in which x occured
d$y_ifnot_x = diag(conocc)[d$y] - d$y_if_x # how often did y occur in documents in which x did not occur
d$n_if_x = colSums(conocc)[d$x] # how many words occured in documents where x occured
d$n_ifnot_x = sum(dtm) - d$n_if_x # how many words occured in documents where x did not occur
d$chi = chi2(d$y_if_x,
d$y_ifnot_x,
d$n_if_x-d$y_if_x,
d$n_ifnot_x-d$y_ifnot_x)
d$p = 1-pchisq(d$chi, 1)
d = d[d$p < p.thres,]
d$odds_ratio = (d$y_if_x/d$n_if_x) / (d$y_ifnot_x/d$n_ifnot_x)
d = d[d$odds_ratio > ratio.thres,]
d$smooth_odds_ratio = smooth_rel(d$y_if_x, d$n_if_x, nrow(conocc), smoothing_parameter) / smooth_rel(d$y_ifnot_x, d$n_ifnot_x, nrow(conocc), smoothing_parameter)
d = d[!d$x == d$y,]
d$x = colnames(conocc)[d$x]
d$y = colnames(conocc)[d$y]
d$n = d$y_if_x
if(return.graph){
g = graph.data.frame(d[,c('x','y')])
E(g)$odds_ratio = d$odds_ratio
E(g)$smooth_odds_ratio = d$smooth_odds_ratio
E(g)$weight = d$smooth_odds_ratio / (1 + d$smooth_odds_ratio)
V(g)$freq = colSums(dtm)[match(V(g)$name, colnames(dtm))]
class(g) = c('semnet',class(g))
return(g)
} else return(d[,c('x','y', 'n', 'odds_ratio','smooth_odds_ratio','chi','p')])
}
edgeWeightStatistics <- function(g, edge.coocc.var='weight', vertex.freq.var='freq'){
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.