R/lotka.R

ok.bibliometrics.theoreticalLotka.plain <- function(count.table) {
  authors.number <- sum(count.table)
  count.vector <- as.numeric(names(count.table))
  lowering <- T
  last_diff <- 9e20
  index <- 1
  res <- c()
  res <- ok.bibliometrics.theoreticalLotka.simulate(count.table, 2)$theoretical.count
  names(res) <- count.vector
  rsq <- ok.stats.rSquared(count.table, res)
  chi <- suppressWarnings(chisq.test(count.table, res))
  dis_o <- ok.stats.distribution(unname(as.numeric(count.table)))
  dis_t <- ok.stats.distribution(unname(res))
  cum_o <- ok.stats.distribution.cumulative(unname(as.numeric(count.table)))
  cum_t <- ok.stats.distribution.cumulative(unname(res))
  kst <- suppressWarnings(ks.test(cum_o, cum_t, exact = F))
  pl <- ok.ggplot.distribution.compare(x=count.vector, y1=dis_o, y2=dis_t, xlab = 'documents number', ylab = 'distribution', y1lab = 'observed', y2lab = 'theoretical')
  li_res <- list(n=2,c=index,table=res,r.squared=rsq,chi.test=chi,ks.test=kst, plot=pl)
  return(li_res)
}

ok.bibliometrics.theoreticalLotka.inversePower <- function(count.table) {
  authors.number <- sum(count.table)
  count.vector <- as.numeric(names(count.table))
  lowering <- T
  last_diff <- 9e20
  index <- 1
  res <- c()
  n_slope <- abs(lm(log(count.table)~log(count.vector))$coefficients[2])
  res <- ok.bibliometrics.theoreticalLotka.simulate(count.table, n_slope)$theoretical.count
  names(res) <- count.vector
  rsq <- ok.stats.rSquared(count.table, res)
  chi <- suppressWarnings(chisq.test(count.table, res))
  dis_o <- ok.stats.distribution(unname(as.numeric(count.table)))
  dis_t <- ok.stats.distribution(unname(res))
  cum_o <- ok.stats.distribution.cumulative(unname(as.numeric(count.table)))
  cum_t <- ok.stats.distribution.cumulative(unname(res))
  kst <- suppressWarnings(ks.test(cum_o, cum_t, exact = F))
  pl <- ok.ggplot.distribution.compare(x=count.vector, y1=dis_o, y2=dis_t, xlab = 'documents number', ylab = 'distribution', y1lab = 'observed', y2lab = 'theoretical')
  li_res <- list(n=n_slope,c=res[1],table=res,r.squared=rsq,chi.test=chi,ks.test=kst,plot=pl)
  return(li_res)
}

ok.bibliometrics.theoreticalLotka.getModel <- function(count.table, slope) {
  authors.number <- sum(count.table)
  count.vector <- as.numeric(names(count.table))
  lowering <- T
  last_diff <- 9e20
  index <- 1
  res <- c()
  n_slope <- lm(log(count.table)~0+1)
  res <- ok.bibliometrics.theoreticalLotka.simulate(count.table, n_slope)$theoretical.count
  return(res)
}

ok.bibliometrics.theoreticalLotka.pao <- function(count.table) {
  authors.number <- sum(count.table)
  count.vector <- as.numeric(names(count.table))
  cutoff <- sum(count.table) %>% sqrt() %>% round()
  cutoff_level <- count.table[length(count.table):1] %>% cumsum() %>% {which(.>cutoff)[1]} %>% names()
  cutoff_level_no <-  which(names(count.table)==cutoff_level)
  if(cutoff_level_no<length(count.table)) {
    datasets <- ggm::powerset(count.vector[cutoff_level_no:length(count.vector)]) %>% sapply(function(e){return(c(count.vector[1:(cutoff_level_no-1)],e))})
    datasets[[length(datasets)+1]] <- count.vector[1:(cutoff_level_no-1)]
  } else {
    datasets <- list(count.vector)
  }
  ns <- chisqs <- kss <- rsqs <- vector('numeric',length(datasets))
  res <- inds <-  list()
  for(i in 1:length(datasets)) {
    indices <- datasets[[i]]
    if(length(indices)>1) {
      chind <- as.character(indices)
      n_slope <- abs(lm(log(count.table[chind])~log(indices))$coefficients[2])
      ns[i] <- n_slope
      cu <- sapply(indices, function(i){return(9e20/i^n_slope)}) %>% round() %>% set_names(chind)
      rsqs[i] <- ok.stats.rSquared(count.table[chind], cu)
      kss[i] <- suppressWarnings(ks.test(ok.stats.distribution.cumulative(count.table[chind]), ok.stats.distribution.cumulative(cu), exact = F)$p.value)
      chisqs[i] <- suppressWarnings(chisq.test(count.table[chind], cu)$p.value)
      res[[i]] <- cu
      inds[[i]] <- chind
    }
  }

  ks_optim_index <- which(kss==max(kss))[1]
  ks_filter <- inds[[ks_optim_index]]
  dis_o <- ok.stats.distribution(unname(as.numeric(count.table[ks_filter])))
  dis_t <- ok.stats.distribution(unname(res[[ks_optim_index]]))
  cum_o <- ok.stats.distribution.cumulative(unname(as.numeric(count.table[ks_filter])))
  cum_t <- ok.stats.distribution.cumulative(unname(res[[ks_optim_index]]))
  kst <- suppressWarnings(ks.test(cum_o, cum_t, exact = F))
  pl <- ok.ggplot.distribution.compare(x=as.numeric(ks_filter), y1=dis_o, y2=dis_t, xlab = 'documents number', ylab = 'distribution', y1lab = 'observed', y2lab = 'theoretical')
  ks_best_chi <- chisqs[ks_optim_index]
  ks_best_rsqs <- rsqs[ks_optim_index]
  ks_best_n <- ns[ks_optim_index]

  li_res <- list(cutoff=cutoff_level, cutoff.number=cutoff_level_no, combinations=length(datasets), bins=length(count.table), n=ns,datasets=datasets,r.squared.datasets=rsqs,ks.test.datasets=kss,chi.test.datasets=chisqs,plot=pl,max.ks.index=ks_optim_index,max.ks.ks=kst, ks.best.chi=ks_best_chi, ks.best.n=ks_best_n, ks.best.r=ks_best_rsqs)
  return(li_res)
}

ok.bibliometrics.theoreticalLotka <- function(count.table) {
  plain <- ok.bibliometrics.theoreticalLotka.plain(count.table)
  inverse_power <- ok.bibliometrics.theoreticalLotka.inversePower(count.table)
  pao <- ok.bibliometrics.theoreticalLotka.pao(count.table)
  return(list(plain=plain,inverse.power=inverse_power,pao=pao))
}

ok.bibliometrics.theoreticalLotka.simulate <- function(authors.count, slope) {
  result <- list()
  result$distribution <- (100/(as.numeric(names(authors.count))^slope)) %>% {return(./sum(.))}
  result$theoretical.count <- (result$distribution * sum(authors.count)) %>% round()
  result$constant <- result$distribution[1]
  return(result)
}

ok.bibliometrics.theoreticalLotka.all <- function(authors.vector, sep=';', scale=NULL) {
  res <- list(first=list(), all=list())
  res$first$authors.vector <- strsplit(authors.vector, sep) %>% sapply(function(e){return(unlist(e)[1])})
  res$first$authors.vector <- res$first$authors.vector[trimws(res$first$authors.vector)!='NA'&!is.na(res$first$authors.vector)]
  res$first$authors.table <- table(res$first$authors.vector)
  res$first$table.of.table <- table(res$first$authors.table)
  res$first$distribution <- ok.stats.distribution(res$first$table.of.table)
  res$first$distribution.cumulative <- ok.stats.distribution.cumulative(res$first$table.of.table)

  res$all$authors.vector <- strsplit(authors.vector, sep) %>% unlist()
  res$all$authors.vector <- res$all$authors.vector[trimws(res$all$authors.vector)!='NA'&!is.na(res$all$authors.vector)]
  res$all$authors.table <- table(res$all$authors.vector)
  res$all$table.of.table <- table(res$all$authors.table)
  res$all$distribution <- ok.stats.distribution(res$all$table.of.table)
  res$all$distribution.cumulative <- ok.stats.distribution.cumulative(res$all$table.of.table)

  res$first$results <- ok.bibliometrics.theoreticalLotka(res$first$table.of.table)
  res$all$results <- ok.bibliometrics.theoreticalLotka(res$all$table.of.table)

  legend <- ok.ggplot.extractLegend(res$first$results$plain$plot+theme(legend.direction = 'horizontal')) #%>% ggplotify::as.ggplot() + theme_test()
  strict <- res$first$results$plain$plot + theme(legend.position = "none") + ggtitle('strict Lotka')
  extended <- res$all$results$plain$plot + theme(legend.position = "none") + ggtitle('extended Lotka')
  general_case <- res$all$results$inverse.power$plot + theme(legend.position = "none") + ggtitle('inverse power law')
  pao <- res$all$results$pao$plot +theme(legend.position = "none") + ggtitle('Pao-like procedure')

  if(!is.null(scale)) {
    strict <- strict + xlim(scale[[1]]) + ylim(scale[[2]])
    extended <- extended + xlim(scale[[1]]) + ylim(scale[[2]])
    general_case <- general_case + xlim(scale[[1]]) + ylim(scale[[2]])
    pao <- pao + xlim(scale[[1]]) + ylim(scale[[2]])
  }

  grand <- gridExtra::grid.arrange(strict,extended,general_case,pao, legend, ncol=2, heights=c(10,10,1))
  res$plot <- grand

  return(res)
}

ok.bibliometrics.theoreticalLotka.all.weighted <- function(authors.vector, sep=';', weights.vector, scale=NULL) {
  res <- list(first=list(), all=list())
  aus <- strsplit(authors.vector, sep)
  aus.unl <- unlist(aus)
  len <- aus %>% sapply(length)
  cit <- rep(weights.vector, len)
  res$authors.weights <- sapply(unique(aus.unl), function(e){return(sum(cit[aus.unl==e]))})
  res$first$authors.vector <- strsplit(authors.vector, sep) %>% sapply(function(e){return(unlist(e)[1])})
  res$first$authors.vector <- res$first$authors.vector[trimws(res$first$authors.vector)!='NA'&!is.na(res$first$authors.vector)]
  res$first$authors.table <- table(res$first$authors.vector)
  counts <- sort(unique(res$first$authors.table))
  res$first$table.of.table <- sapply(counts,function(e){sum(res$authors.weights[names(res$authors.weights) %in% names(res$first$authors.table)[res$first$authors.table==e]])}) %>% set_names(counts)

  res$first$distribution <- ok.stats.distribution(res$first$table.of.table)
  res$first$distribution.cumulative <- ok.stats.distribution.cumulative(res$first$table.of.table)

  res$all$authors.vector <- strsplit(authors.vector, sep) %>% unlist()
  res$all$authors.vector <- res$all$authors.vector[trimws(res$all$authors.vector)!='NA'&!is.na(res$all$authors.vector)]
  res$all$authors.table <- table(res$all$authors.vector)
  counts <- sort(unique(res$all$authors.table))
  res$all$table.of.table <-  sapply(counts,function(e){sum(res$authors.weights[names(res$authors.weights) %in% names(res$all$authors.table)[res$all$authors.table==e]])}) %>% set_names(counts)

  res$all$distribution <- ok.stats.distribution(res$all$table.of.table)
  res$all$distribution.cumulative <- ok.stats.distribution.cumulative(res$all$table.of.table)

  res$first$results <- ok.bibliometrics.theoreticalLotka(res$first$table.of.table)
  res$all$results <- ok.bibliometrics.theoreticalLotka(res$all$table.of.table)

  legend <- ok.ggplot.extractLegend(res$first$results$plain$plot+theme(legend.direction = 'horizontal')) #%>% ggplotify::as.ggplot() + theme_test()
  strict <- res$first$results$plain$plot + theme(legend.position = "none") + ggtitle('strict Lotka')
  extended <- res$all$results$plain$plot + theme(legend.position = "none") + ggtitle('extended Lotka')
  general_case <- res$all$results$inverse.power$plot + theme(legend.position = "none") + ggtitle('inverse power law')
  pao <- res$all$results$pao$plot +theme(legend.position = "none") + ggtitle('Pao-like procedure')

  if(!is.null(scale)) {
    strict <- strict + xlim(scale[[1]]) + ylim(scale[[2]])
    extended <- extended + xlim(scale[[1]]) + ylim(scale[[2]])
    general_case <- general_case + xlim(scale[[1]]) + ylim(scale[[2]])
    pao <- pao + xlim(scale[[1]]) + ylim(scale[[2]])
  }

  grand <- gridExtra::grid.arrange(strict,extended,general_case,pao, legend, ncol=2, heights=c(10,10,1))
  res$plot <- grand

  return(res)
}
theogrost/bibliometricLaws documentation built on July 2, 2019, 8:56 p.m.