bibliometricLaws: Calculates Bibliometrics Laws

ok.bibliometrics.overview <- function(records) {
  overview <- list()

  abstracts_vector <- records$AB %>% if_else(!is.na(.),.,'') %>% tolower()
  abstracts_words_vector <- tokenize_ngrams(abstracts_vector, n=1L, stopwords = c('.',',',';','-',':')) %>% unlist() %>% unique()
  abstracts_alpha_words_vector <- abstracts_words_vector[gsub('[0-9\\.:,-]*','',abstracts_words_vector)!='']
  abstracts_words_vocabulary <- vocab_vectorizer(create_vocabulary(abstracts_alpha_words_vector))
  it <- itoken(abstracts_vector)
  abstracts_words_dtm <- create_dtm(it, abstracts_words_vocabulary)
  abstracts_words_dtm_cited_weighted <- abstracts_words_dtm * records$TC
  abstracts_words_dtm_binned <- abstracts_words_dtm[,!duplicated(Matrix::colSums(abstracts_words_dtm))]
  abstracts_words_dtm_cited_weighted_binned <- abstracts_words_dtm_cited_weighted[,!duplicated(Matrix::colSums(abstracts_words_dtm_cited_weighted))]
  overview$abstracts_words_dtm <- abstracts_words_dtm

  overview$zipf.count <- ok.bibliometrics.zipf(abstracts_words_dtm)
  overview$zipf.cited <- ok.bibliometrics.zipf(abstracts_words_dtm_cited_weighted)
  overview$zipf.cited.filt <- ok.bibliometrics.zipf(abstracts_words_dtm[records$TC>0,])

  overview$zipf.count.binned <- ok.bibliometrics.zipf(abstracts_words_dtm_binned)
  overview$zipf.cited.binned <- ok.bibliometrics.zipf(abstracts_words_dtm_cited_weighted_binned)
  overview$zipf.cited.filt.binned <- ok.bibliometrics.zipf(abstracts_words_dtm_binned[records$TC>0,])

  overview$lotka.count <- ok.bibliometrics.theoreticalLotka.all(records$AU, scale=list(c(0,31),c(0,0.9)))
  overview$lotka.cited <- ok.bibliometrics.theoreticalLotka.all(records$AU[records$TC>0], scale=list(c(0,31),c(0,0.9)))
  overview$lotka.cited.weight <- ok.bibliometrics.theoreticalLotka.all.weighted(records$AU, weights.vector = records$TC, scale=list(c(0,31),c(0,0.9)))

  overview$bradford.count <- ok.bibliometrics.bradford(records$SO)
  overview$bradford.cited <- ok.bibliometrics.bradford(rep(records$SO, records$TC))
  overview$bradford.cited.filt <- ok.bibliometrics.bradford(records$SO[records$TC>0])

  overview$price.count <- ok.bibliometrics.price(records$AU)
  overview$price.cited <- ok.bibliometrics.price(rep(records$AU, records$TC))
  overview$price.cited.filt <- ok.bibliometrics.price(records$AU[records$TC>0])
  overview$csv.df <- ok.bibliometrics.overview.toCSV.weighted(overview)

  return(overview)
}

ok.bibliometrics.overview.toCSV.weighted <- function(overview, digits.no=3) {
  csv_df_names <- c('Method - law','Method - modifications','Method - variables','Dataset - count','Dataset - citations - filtered','Dataset - citations - multiplied')
  rows <- 30
  csv_df <- ok.dataframe.make(colnames=csv_df_names, rows = rows)
  #print('lotka')
  csv_df[1,] <- c('Lotka\'s law', 'strict','chi-squared p-value',overview$lotka.count$first$results$plain$chi.test$p.value, overview$lotka.cited$first$results$plain$chi.test$p.value, overview$lotka.cited.weight$first$results$plain$chi.test$p.value)
  csv_df[2,] <- c('Lotka\'s law', 'strict','KS p-value',overview$lotka.count$first$results$plain$ks.test$p.value, overview$lotka.cited$first$results$plain$ks.test$p.value, overview$lotka.cited.weight$first$results$plain$ks.test$p.value)
  csv_df[3,] <- c('Lotka\'s law', 'strict','r-squared',overview$lotka.count$first$results$plain$r.squared, overview$lotka.cited$first$results$plain$r.squared, overview$lotka.cited.weight$first$results$plain$r.squared)
  csv_df[4,] <- c('Lotka\'s law', 'strict','n',overview$lotka.count$first$results$plain$n, overview$lotka.cited$first$results$plain$n, overview$lotka.cited.weight$first$results$plain$n)

  csv_df[5,] <- c('Lotka\'s law', 'extended','chi-squared p-value',overview$lotka.count$all$results$plain$chi.test$p.value, overview$lotka.cited$all$results$plain$chi.test$p.value, overview$lotka.cited.weight$all$results$plain$chi.test$p.value)
  csv_df[6,] <- c('Lotka\'s law', 'extended','KS p-value',overview$lotka.count$all$results$plain$ks.test$p.value, overview$lotka.cited$all$results$plain$ks.test$p.value, overview$lotka.cited.weight$all$results$plain$ks.test$p.value)
  csv_df[7,] <- c('Lotka\'s law', 'extended','r-squared',overview$lotka.count$all$results$plain$r.squared, overview$lotka.cited$all$results$plain$r.squared, overview$lotka.cited.weight$all$results$plain$r.squared)
  csv_df[8,] <- c('Lotka\'s law', 'extended','n',overview$lotka.count$all$results$plain$n, overview$lotka.cited$all$results$plain$n, overview$lotka.cited.weight$all$results$plain$n)

  csv_df[9,] <- c('Lotka\'s law', 'general','chi-squared p-value',overview$lotka.count$all$results$inverse.power$chi.test$p.value, overview$lotka.cited$all$results$inverse.power$chi.test$p.value, overview$lotka.cited.weight$all$results$inverse.power$chi.test$p.value)
  csv_df[10,] <- c('Lotka\'s law', 'general','KS p-value',overview$lotka.count$all$results$inverse.power$ks.test$p.value, overview$lotka.cited$all$results$inverse.power$ks.test$p.value, overview$lotka.cited.weight$all$results$inverse.power$ks.test$p.value)
  csv_df[11,] <- c('Lotka\'s law', 'general','r-squared',overview$lotka.count$all$results$inverse.power$r.squared, overview$lotka.cited$all$results$inverse.power$r.squared, overview$lotka.cited.weight$all$results$inverse.power$r.squared)
  csv_df[12,] <- c('Lotka\'s law', 'general','n',overview$lotka.count$all$results$inverse.power$n, overview$lotka.cited$all$results$inverse.power$n, overview$lotka.cited.weight$all$results$inverse.power$n)

  csv_df[13,] <- c('Lotka\'s law', 'Pao','chi-squared p-value',overview$lotka.count$all$results$pao$ks.best.chi, overview$lotka.cited$all$results$pao$ks.best.chi, overview$lotka.cited.weight$all$results$pao$ks.best.chi)
  csv_df[14,] <- c('Lotka\'s law', 'Pao','KS p-value',overview$lotka.count$all$results$pao$max.ks.ks$p.value, overview$lotka.cited$all$results$pao$max.ks.ks$p.value, overview$lotka.cited.weight$all$results$pao$max.ks.ks$p.value)
  csv_df[15,] <- c('Lotka\'s law', 'Pao','r-squared',overview$lotka.count$all$results$pao$ks.best.r, overview$lotka.cited$all$results$pao$ks.best.r, overview$lotka.cited.weight$all$results$pao$ks.best.r)
  csv_df[16,] <- c('Lotka\'s law', 'Pao','n',overview$lotka.count$all$results$pao$ks.best.n, overview$lotka.cited$all$results$pao$ks.best.n, overview$lotka.cited.weight$all$results$pao$ks.best.n)
  csv_df[17,] <- c('Lotka\'s law', 'Pao','bins',overview$lotka.count$all$results$pao$bins, overview$lotka.cited$all$results$pao$bins, overview$lotka.cited.weight$all$results$pao$bins)
  csv_df[18,] <- c('Lotka\'s law', 'Pao','combinations',overview$lotka.count$all$results$pao$combinations, overview$lotka.cited$all$results$pao$combinations, overview$lotka.cited.weight$all$results$pao$combinations)
  csv_df[19,] <- c('Lotka\'s law', 'Pao','cutoff level',overview$lotka.count$all$results$pao$cutoff, overview$lotka.cited$all$results$pao$cutoff, overview$lotka.cited.weight$all$results$pao$cutoff)
  csv_df[20,] <- c('Lotka\'s law', 'Pao','cutoff index',overview$lotka.count$all$results$pao$cutoff.number, overview$lotka.cited$all$results$pao$cutoff.number, overview$lotka.cited.weight$all$results$pao$cutoff.number)

  #print('price')
  csv_df[21,] <-c('Price\'s law','strict','top 10% productivity: co-authored',overview$price.count$co.authored.10perc, overview$price.cited.filt$co.authored.10perc, overview$price.cited$co.authored.10perc)
  csv_df[22,] <-c('Price\'s law','strict','top 10% productivity: all-authored',overview$price.count$all.authored.10perc, overview$price.cited.filt$all.authored.10perc, overview$price.cited$all.authored.10perc)
  csv_df[23,] <-c('Price\'s law','strict','ideal 10% rounded',overview$price.count$theoretical.cutoff.ind, overview$price.cited.filt$theoretical.cutoff.ind, overview$price.cited$theoretical.cutoff.ind)
  csv_df[24,] <-c('Price\'s law','strict','cutoff publications no',overview$price.count$cutoff_productivity, overview$price.cited.filt$cutoff_productivity, overview$price.cited$cutoff_productivity)
  csv_df[25,] <-c('Price\'s law','strict','cutoff authors no',overview$price.count$cutoff, overview$price.cited.filt$cutoff, overview$price.cited$cutoff)

  csv_df[26,] <-c('Price\'s law','strict','square root co-authored productivity',overview$price.count$second.elite.coauthored, overview$price.cited.filt$second.elite.coauthored, overview$price.cited$second.elite.coauthored)
  csv_df[27,] <-c('Price\'s law','strict','square root all-authored productivity',overview$price.count$second.elite.allauthored, overview$price.cited.filt$second.elite.allauthored, overview$price.cited$second.elite.allauthored)
  csv_df[28,] <-c('Price\'s law','strict','ideal square root rounded',overview$price.count$second.elite.idealSize, overview$price.cited.filt$second.elite.idealSize, overview$price.cited$second.elite.idealSize)
  csv_df[29,] <-c('Price\'s law','strict','top authors cutoff number',overview$price.count$second.elite.realSize, overview$price.cited.filt$second.elite.realSize, overview$price.cited$second.elite.realSize)
  csv_df[30,] <-c('Price\'s law','strict','top authors cutoff level',overview$price.count$second.elite.cutoffLevel, overview$price.cited.filt$second.elite.cutoffLevel, overview$price.cited$second.elite.cutoffLevel)

  #print('bradford')
  csv_df[31,] <- c('Bradford\'s law','inverse power to fit','blocks',paste0(overview$bradford.count$sources.container, collapse = '\n'),paste0(overview$bradford.cited.filt$sources.container, collapse = '\n'),paste0(overview$bradford.cited$sources.container, collapse = '\n'))
  csv_df[32,] <- c('Bradford\'s law','inverse power to fit','articles',paste0(overview$bradford.count$sources.no.container, collapse = '\n'),paste0(overview$bradford.cited.filt$sources.no.container, collapse = '\n'),paste0(overview$bradford.cited$sources.no.container, collapse = '\n'))
  csv_df[33,] <- c('Bradford\'s law','inverse power to fit','power',overview$bradford.count$fitted.power,overview$bradford.cited.filt$fitted.power,overview$bradford.cited$fitted.power,overview$bradford.links.filt$fitted.power,overview$bradford.links$fitted.power)
  csv_df[34,] <- c('Bradford\'s law','inverse power to fit','n',overview$bradford.count$fitted.n,overview$bradford.cited.filt$fitted.n,overview$bradford.cited$fitted.n,overview$bradford.links.filt$fitted.n,overview$bradford.links$fitted.n)

  #print('zipf')
  csv_df[35,] <- c('Zipf\'s law', 'strict', 'residual standard deviation', overview$zipf.count$residual.deviation, overview$zipf.cited.filt$residual.deviation, overview$zipf.cited$residual.deviation)
  csv_df[36,] <- c('Zipf\'s law', 'strict', 'coefficient of variation of residual standard deviation', overview$zipf.count$coefficient.of.residual.variance, overview$zipf.cited.filt$coefficient.of.residual.variance, overview$zipf.cited$coefficient.of.residual.variance)
  csv_df[37,] <- c('Zipf\'s law', 'strict', 'number of words', overview$zipf.count$word.count$word.count[1], overview$zipf.cited.filt$word.count$word.count[1], overview$zipf.cited$word.count$word.count[1])

  csv_df[38,] <- c('Zipf\'s law', 'binned', 'residual standard deviation', overview$zipf.count.binned$residual.deviation, overview$zipf.cited.filt.binned$residual.deviation, overview$zipf.cited.binned$residual.deviation)
  csv_df[39,] <- c('Zipf\'s law', 'binned', 'coefficient of variation of residual standard deviation', overview$zipf.count.binned$coefficient.of.residual.variance, overview$zipf.cited.filt.binned$coefficient.of.residual.variance, overview$zipf.cited.binned$coefficient.of.residual.variance)
  csv_df[40,] <- c('Zipf\'s law', 'binned', 'number of ranks', overview$zipf.count.binned$word.count$word.count[1], overview$zipf.cited.filt.binned$word.count$word.count[1], overview$zipf.cited.binned$word.count$word.count[1])

  #print('misc')
  csv_df[1:30,4] <- as.character(round(as.numeric(csv_df[1:30,4]), digits = digits.no))
  csv_df[1:30,5] <- as.character(round(as.numeric(csv_df[1:30,5]), digits = digits.no))
  csv_df[1:30,6] <- as.character(round(as.numeric(csv_df[1:30,6]), digits = digits.no))
  csv_df[33:40,4] <- as.character(round(as.numeric(csv_df[33:40,4]), digits = digits.no))
  csv_df[33:40,5] <- as.character(round(as.numeric(csv_df[33:40,5]), digits = digits.no))
  csv_df[33:40,6] <- as.character(round(as.numeric(csv_df[33:40,6]), digits = digits.no))


  return(csv_df)
}

ok.stats.distribution <- function(table) {
  return(table/sum(table))
}

ok.stats.distribution.cumulative <- function(table) {
  return(cumsum(ok.stats.distribution(table)))
}

ok.stats.rSquared <- function (x, y) { # https://stackoverflow.com/questions/40901445/function-to-calculate-r2-r-squared-in-r
  return(cor(x, y)^2)
}

ok.ggplot.distribution.compare <- function(x,y1,y2, title=NULL, xlab='values', ylab='distribution', legend='distributions', y1lab='distribution 1', y2lab='distribution 2',theme=theme_test()) {
  melted <- data.frame(x,y1,y2) %>% reshape2::melt(id=1) %>% set_colnames(c('x','c','y')) %>% mutate('c'=ok.text.mgsub(c('y1','y2'),c(y1lab,y2lab),.[,'c'])) %>% mutate('x'=as.numeric(x)) %>% mutate('y'=as.numeric(y))
  res <- ggplot(data = melted) + geom_line(aes(x=x, y=y, color=c), size=1) + geom_point(aes(x=x, y=y, color=c), size=3) + labs(x=xlab, y=ylab, color=legend, title=title) + theme
  return(res)
}

ok.text.mgsub <- function(patterns, replacements, x, perl=F, fixed=F, ignore.case=F, positions=NULL) {
  substituted <- x
  if(is.null(positions)) {
    for(i in 1:length(patterns)) {
      substituted <- gsub(patterns[i], replacements[i], substituted, perl = perl, fixed = fixed, ignore.case = ignore.case)
    }
  } else {
    for(i in 1:length(patterns)) {
      position <- positions[[i]]
      substituted[position] <- gsub(patterns[i], replacements[i], substituted[position], perl = perl, fixed = fixed, ignore.case = ignore.case)
    }
  }

  return(substituted)
}

ok.ggplot.extractLegend <-function(plot){ # https://stackoverflow.com/questions/13649473/add-a-common-legend-for-combined-ggplots
  tmp <- ggplot_gtable(ggplot_build(plot))
  leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
  legend <- tmp$grobs[[leg]]
  return(legend)
}

ok.dataframe.make <- function(colnames=c(), value = NA, rownames=NULL, rows = NULL) {
  rows <- if (is.null(rows)&!is.null(rownames)) length(rownames) else rows
  newDF <- data.frame(matrix(nrow = rows, ncol = length(colnames)))
  if(rows>0) {
    newDF[,] <- value
  }
  colnames(newDF) <- colnames
  if(!is.null(rownames)) {
    rownames(newDF) <- rownames
  }
  return(newDF)
}