ok.bibliometrics.zipf <- function(dtm, alpha=1) {
words <- dtm %>% Matrix::colSums()
word_count <- data.frame(names(sort(words,T)),sort(words,T))
colnames(word_count) <- c("word", "count")
word_count <- word_count[word_count$count!=0,]
alpha <- 1
word_count <- word_count %>% mutate(word = factor(word, levels = word), rank = row_number(), zipfs_freq = ifelse(rank == 1, count, dplyr::first(count) / rank^alpha))
word_count$rank_log <- log(word_count$rank)
word_count$count_log <- log(word_count$count)
word_count$t_count_log <- log(word_count$zipfs_freq)
word_count$rel_freq <- word_count$count/sum(word_count$count)
word_count$rel_freq_cum <- cumsum(word_count$rel_freq)
word_count$t_freq <- word_count$zipfs_freq / sum(word_count$zipfs_freq)
word_count$t_freq_cum <- cumsum(word_count$t_freq)
word_count$constans <- word_count$count * word_count$rank
word_count$coeff.of.variation <- sd(word_count$constans) / mean(word_count$constans)
word_count$word.count <- nrow(word_count)
plotie <- ggplot(data=word_count) + geom_point(aes(x=rank_log,y=count_log, color='observed')) + geom_point(aes(x=rank_log,y=t_count_log, color='theoretical')) + labs(x = "log(rank)", y = "log(count)", title = NULL, color='color')
mean_deviation <- mean(abs(word_count$count-word_count$zipfs_freq))
population_residual_standard_deviation <- sqrt(sum((word_count$count-word_count$zipfs_freq)^2) / nrow(word_count))
mean_o <- mean(word_count$count)
c_v <- population_residual_standard_deviation / mean_o
return(list(plot=plotie, word.count=word_count, mean=mean_o, residual.deviation=population_residual_standard_deviation, coefficient.of.residual.variance=c_v))
}
ok.bibliometrics.zipf.compare <- function(zipf.1, zipf.2) {
zipf.1 <- zipf.1$word.count[match(zipf.2$word.count$word, zipf.1$word.count$word),]
#print(nrow(zipf.1))
zipf.2 <- zipf.2$word.count[as.character(zipf.2$word.count$word) %in% as.character(zipf.1$word),]
#print(nrow(zipf.2))
rownames(zipf.1) <- rownames(zipf.2) <- 1:nrow(zipf.1)
diffe <- zipf.1$rank-zipf.2$rank
dists <- data.frame(id=1:nrow(zipf.1),difference=diffe, word=zipf.1$word, zipf.1.rank = zipf.1$rank, zipf.2.rank = zipf.2$rank, zipf.1.freq=zipf.1$count, zipf.2.freq=zipf.2$count, diff.x.count = zipf.1$count*diffe)
#dists_ordered <- arrange(dists, difference)
dists_ordered <- dists[order(-dists$diff.x.count),]
top.filter <- dists_ordered$zipf.1.freq>1 & dists_ordered$difference > 100
top <- dists_ordered[top.filter,]
return(list(zipf.1=zipf.1, zipf.2=zipf.2, dists=dists, dists.ordered=dists_ordered, top=top))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.