R/zipf.R

ok.bibliometrics.zipf <- function(dtm, alpha=1) {
  words <- dtm %>% Matrix::colSums()
  word_count <- data.frame(names(sort(words,T)),sort(words,T))
  colnames(word_count) <- c("word", "count")
  word_count <- word_count[word_count$count!=0,]
  alpha <- 1
  word_count <- word_count %>% mutate(word = factor(word, levels = word), rank = row_number(), zipfs_freq = ifelse(rank == 1, count, dplyr::first(count) / rank^alpha))
  word_count$rank_log <- log(word_count$rank)
  word_count$count_log <- log(word_count$count)
  word_count$t_count_log <- log(word_count$zipfs_freq)
  word_count$rel_freq <- word_count$count/sum(word_count$count)
  word_count$rel_freq_cum <- cumsum(word_count$rel_freq)
  word_count$t_freq <- word_count$zipfs_freq / sum(word_count$zipfs_freq)
  word_count$t_freq_cum <- cumsum(word_count$t_freq)
  word_count$constans <- word_count$count * word_count$rank
  word_count$coeff.of.variation <- sd(word_count$constans) / mean(word_count$constans)
  word_count$word.count <- nrow(word_count)
  plotie <- ggplot(data=word_count) + geom_point(aes(x=rank_log,y=count_log, color='observed')) + geom_point(aes(x=rank_log,y=t_count_log, color='theoretical')) + labs(x = "log(rank)", y = "log(count)", title = NULL, color='color')
  mean_deviation <- mean(abs(word_count$count-word_count$zipfs_freq))
  population_residual_standard_deviation <- sqrt(sum((word_count$count-word_count$zipfs_freq)^2) / nrow(word_count))
  mean_o <- mean(word_count$count)
  c_v <- population_residual_standard_deviation / mean_o



  return(list(plot=plotie, word.count=word_count, mean=mean_o, residual.deviation=population_residual_standard_deviation, coefficient.of.residual.variance=c_v))
}

ok.bibliometrics.zipf.compare <- function(zipf.1, zipf.2) {
  zipf.1 <- zipf.1$word.count[match(zipf.2$word.count$word, zipf.1$word.count$word),]
  #print(nrow(zipf.1))
  zipf.2 <- zipf.2$word.count[as.character(zipf.2$word.count$word) %in% as.character(zipf.1$word),]
  #print(nrow(zipf.2))
  rownames(zipf.1) <- rownames(zipf.2) <- 1:nrow(zipf.1)
  diffe <- zipf.1$rank-zipf.2$rank
  dists <- data.frame(id=1:nrow(zipf.1),difference=diffe, word=zipf.1$word, zipf.1.rank = zipf.1$rank, zipf.2.rank = zipf.2$rank, zipf.1.freq=zipf.1$count, zipf.2.freq=zipf.2$count, diff.x.count = zipf.1$count*diffe)
  #dists_ordered <- arrange(dists, difference)
  dists_ordered <- dists[order(-dists$diff.x.count),]
  top.filter <- dists_ordered$zipf.1.freq>1 & dists_ordered$difference > 100
  top <- dists_ordered[top.filter,]
  return(list(zipf.1=zipf.1, zipf.2=zipf.2, dists=dists, dists.ordered=dists_ordered, top=top))
}
theogrost/bibliometricLaws documentation built on July 2, 2019, 8:56 p.m.