ok.bibliometrics.theoreticalLotka.plain <- function(count.table) {
authors.number <- sum(count.table)
count.vector <- as.numeric(names(count.table))
lowering <- T
last_diff <- 9e20
index <- 1
res <- c()
res <- ok.bibliometrics.theoreticalLotka.simulate(count.table, 2)$theoretical.count
names(res) <- count.vector
rsq <- ok.stats.rSquared(count.table, res)
chi <- suppressWarnings(chisq.test(count.table, res))
dis_o <- ok.stats.distribution(unname(as.numeric(count.table)))
dis_t <- ok.stats.distribution(unname(res))
cum_o <- ok.stats.distribution.cumulative(unname(as.numeric(count.table)))
cum_t <- ok.stats.distribution.cumulative(unname(res))
kst <- suppressWarnings(ks.test(cum_o, cum_t, exact = F))
pl <- ok.ggplot.distribution.compare(x=count.vector, y1=dis_o, y2=dis_t, xlab = 'documents number', ylab = 'distribution', y1lab = 'observed', y2lab = 'theoretical')
li_res <- list(n=2,c=index,table=res,r.squared=rsq,chi.test=chi,ks.test=kst, plot=pl)
return(li_res)
}
ok.bibliometrics.theoreticalLotka.inversePower <- function(count.table) {
authors.number <- sum(count.table)
count.vector <- as.numeric(names(count.table))
lowering <- T
last_diff <- 9e20
index <- 1
res <- c()
n_slope <- abs(lm(log(count.table)~log(count.vector))$coefficients[2])
res <- ok.bibliometrics.theoreticalLotka.simulate(count.table, n_slope)$theoretical.count
names(res) <- count.vector
rsq <- ok.stats.rSquared(count.table, res)
chi <- suppressWarnings(chisq.test(count.table, res))
dis_o <- ok.stats.distribution(unname(as.numeric(count.table)))
dis_t <- ok.stats.distribution(unname(res))
cum_o <- ok.stats.distribution.cumulative(unname(as.numeric(count.table)))
cum_t <- ok.stats.distribution.cumulative(unname(res))
kst <- suppressWarnings(ks.test(cum_o, cum_t, exact = F))
pl <- ok.ggplot.distribution.compare(x=count.vector, y1=dis_o, y2=dis_t, xlab = 'documents number', ylab = 'distribution', y1lab = 'observed', y2lab = 'theoretical')
li_res <- list(n=n_slope,c=res[1],table=res,r.squared=rsq,chi.test=chi,ks.test=kst,plot=pl)
return(li_res)
}
ok.bibliometrics.theoreticalLotka.getModel <- function(count.table, slope) {
authors.number <- sum(count.table)
count.vector <- as.numeric(names(count.table))
lowering <- T
last_diff <- 9e20
index <- 1
res <- c()
n_slope <- lm(log(count.table)~0+1)
res <- ok.bibliometrics.theoreticalLotka.simulate(count.table, n_slope)$theoretical.count
return(res)
}
ok.bibliometrics.theoreticalLotka.pao <- function(count.table) {
authors.number <- sum(count.table)
count.vector <- as.numeric(names(count.table))
cutoff <- sum(count.table) %>% sqrt() %>% round()
cutoff_level <- count.table[length(count.table):1] %>% cumsum() %>% {which(.>cutoff)[1]} %>% names()
cutoff_level_no <- which(names(count.table)==cutoff_level)
if(cutoff_level_no<length(count.table)) {
datasets <- ggm::powerset(count.vector[cutoff_level_no:length(count.vector)]) %>% sapply(function(e){return(c(count.vector[1:(cutoff_level_no-1)],e))})
datasets[[length(datasets)+1]] <- count.vector[1:(cutoff_level_no-1)]
} else {
datasets <- list(count.vector)
}
ns <- chisqs <- kss <- rsqs <- vector('numeric',length(datasets))
res <- inds <- list()
for(i in 1:length(datasets)) {
indices <- datasets[[i]]
if(length(indices)>1) {
chind <- as.character(indices)
n_slope <- abs(lm(log(count.table[chind])~log(indices))$coefficients[2])
ns[i] <- n_slope
cu <- sapply(indices, function(i){return(9e20/i^n_slope)}) %>% round() %>% set_names(chind)
rsqs[i] <- ok.stats.rSquared(count.table[chind], cu)
kss[i] <- suppressWarnings(ks.test(ok.stats.distribution.cumulative(count.table[chind]), ok.stats.distribution.cumulative(cu), exact = F)$p.value)
chisqs[i] <- suppressWarnings(chisq.test(count.table[chind], cu)$p.value)
res[[i]] <- cu
inds[[i]] <- chind
}
}
ks_optim_index <- which(kss==max(kss))[1]
ks_filter <- inds[[ks_optim_index]]
dis_o <- ok.stats.distribution(unname(as.numeric(count.table[ks_filter])))
dis_t <- ok.stats.distribution(unname(res[[ks_optim_index]]))
cum_o <- ok.stats.distribution.cumulative(unname(as.numeric(count.table[ks_filter])))
cum_t <- ok.stats.distribution.cumulative(unname(res[[ks_optim_index]]))
kst <- suppressWarnings(ks.test(cum_o, cum_t, exact = F))
pl <- ok.ggplot.distribution.compare(x=as.numeric(ks_filter), y1=dis_o, y2=dis_t, xlab = 'documents number', ylab = 'distribution', y1lab = 'observed', y2lab = 'theoretical')
ks_best_chi <- chisqs[ks_optim_index]
ks_best_rsqs <- rsqs[ks_optim_index]
ks_best_n <- ns[ks_optim_index]
li_res <- list(cutoff=cutoff_level, cutoff.number=cutoff_level_no, combinations=length(datasets), bins=length(count.table), n=ns,datasets=datasets,r.squared.datasets=rsqs,ks.test.datasets=kss,chi.test.datasets=chisqs,plot=pl,max.ks.index=ks_optim_index,max.ks.ks=kst, ks.best.chi=ks_best_chi, ks.best.n=ks_best_n, ks.best.r=ks_best_rsqs)
return(li_res)
}
ok.bibliometrics.theoreticalLotka <- function(count.table) {
plain <- ok.bibliometrics.theoreticalLotka.plain(count.table)
inverse_power <- ok.bibliometrics.theoreticalLotka.inversePower(count.table)
pao <- ok.bibliometrics.theoreticalLotka.pao(count.table)
return(list(plain=plain,inverse.power=inverse_power,pao=pao))
}
ok.bibliometrics.theoreticalLotka.simulate <- function(authors.count, slope) {
result <- list()
result$distribution <- (100/(as.numeric(names(authors.count))^slope)) %>% {return(./sum(.))}
result$theoretical.count <- (result$distribution * sum(authors.count)) %>% round()
result$constant <- result$distribution[1]
return(result)
}
ok.bibliometrics.theoreticalLotka.all <- function(authors.vector, sep=';', scale=NULL) {
res <- list(first=list(), all=list())
res$first$authors.vector <- strsplit(authors.vector, sep) %>% sapply(function(e){return(unlist(e)[1])})
res$first$authors.vector <- res$first$authors.vector[trimws(res$first$authors.vector)!='NA'&!is.na(res$first$authors.vector)]
res$first$authors.table <- table(res$first$authors.vector)
res$first$table.of.table <- table(res$first$authors.table)
res$first$distribution <- ok.stats.distribution(res$first$table.of.table)
res$first$distribution.cumulative <- ok.stats.distribution.cumulative(res$first$table.of.table)
res$all$authors.vector <- strsplit(authors.vector, sep) %>% unlist()
res$all$authors.vector <- res$all$authors.vector[trimws(res$all$authors.vector)!='NA'&!is.na(res$all$authors.vector)]
res$all$authors.table <- table(res$all$authors.vector)
res$all$table.of.table <- table(res$all$authors.table)
res$all$distribution <- ok.stats.distribution(res$all$table.of.table)
res$all$distribution.cumulative <- ok.stats.distribution.cumulative(res$all$table.of.table)
res$first$results <- ok.bibliometrics.theoreticalLotka(res$first$table.of.table)
res$all$results <- ok.bibliometrics.theoreticalLotka(res$all$table.of.table)
legend <- ok.ggplot.extractLegend(res$first$results$plain$plot+theme(legend.direction = 'horizontal')) #%>% ggplotify::as.ggplot() + theme_test()
strict <- res$first$results$plain$plot + theme(legend.position = "none") + ggtitle('strict Lotka')
extended <- res$all$results$plain$plot + theme(legend.position = "none") + ggtitle('extended Lotka')
general_case <- res$all$results$inverse.power$plot + theme(legend.position = "none") + ggtitle('inverse power law')
pao <- res$all$results$pao$plot +theme(legend.position = "none") + ggtitle('Pao-like procedure')
if(!is.null(scale)) {
strict <- strict + xlim(scale[[1]]) + ylim(scale[[2]])
extended <- extended + xlim(scale[[1]]) + ylim(scale[[2]])
general_case <- general_case + xlim(scale[[1]]) + ylim(scale[[2]])
pao <- pao + xlim(scale[[1]]) + ylim(scale[[2]])
}
grand <- gridExtra::grid.arrange(strict,extended,general_case,pao, legend, ncol=2, heights=c(10,10,1))
res$plot <- grand
return(res)
}
ok.bibliometrics.theoreticalLotka.all.weighted <- function(authors.vector, sep=';', weights.vector, scale=NULL) {
res <- list(first=list(), all=list())
aus <- strsplit(authors.vector, sep)
aus.unl <- unlist(aus)
len <- aus %>% sapply(length)
cit <- rep(weights.vector, len)
res$authors.weights <- sapply(unique(aus.unl), function(e){return(sum(cit[aus.unl==e]))})
res$first$authors.vector <- strsplit(authors.vector, sep) %>% sapply(function(e){return(unlist(e)[1])})
res$first$authors.vector <- res$first$authors.vector[trimws(res$first$authors.vector)!='NA'&!is.na(res$first$authors.vector)]
res$first$authors.table <- table(res$first$authors.vector)
counts <- sort(unique(res$first$authors.table))
res$first$table.of.table <- sapply(counts,function(e){sum(res$authors.weights[names(res$authors.weights) %in% names(res$first$authors.table)[res$first$authors.table==e]])}) %>% set_names(counts)
res$first$distribution <- ok.stats.distribution(res$first$table.of.table)
res$first$distribution.cumulative <- ok.stats.distribution.cumulative(res$first$table.of.table)
res$all$authors.vector <- strsplit(authors.vector, sep) %>% unlist()
res$all$authors.vector <- res$all$authors.vector[trimws(res$all$authors.vector)!='NA'&!is.na(res$all$authors.vector)]
res$all$authors.table <- table(res$all$authors.vector)
counts <- sort(unique(res$all$authors.table))
res$all$table.of.table <- sapply(counts,function(e){sum(res$authors.weights[names(res$authors.weights) %in% names(res$all$authors.table)[res$all$authors.table==e]])}) %>% set_names(counts)
res$all$distribution <- ok.stats.distribution(res$all$table.of.table)
res$all$distribution.cumulative <- ok.stats.distribution.cumulative(res$all$table.of.table)
res$first$results <- ok.bibliometrics.theoreticalLotka(res$first$table.of.table)
res$all$results <- ok.bibliometrics.theoreticalLotka(res$all$table.of.table)
legend <- ok.ggplot.extractLegend(res$first$results$plain$plot+theme(legend.direction = 'horizontal')) #%>% ggplotify::as.ggplot() + theme_test()
strict <- res$first$results$plain$plot + theme(legend.position = "none") + ggtitle('strict Lotka')
extended <- res$all$results$plain$plot + theme(legend.position = "none") + ggtitle('extended Lotka')
general_case <- res$all$results$inverse.power$plot + theme(legend.position = "none") + ggtitle('inverse power law')
pao <- res$all$results$pao$plot +theme(legend.position = "none") + ggtitle('Pao-like procedure')
if(!is.null(scale)) {
strict <- strict + xlim(scale[[1]]) + ylim(scale[[2]])
extended <- extended + xlim(scale[[1]]) + ylim(scale[[2]])
general_case <- general_case + xlim(scale[[1]]) + ylim(scale[[2]])
pao <- pao + xlim(scale[[1]]) + ylim(scale[[2]])
}
grand <- gridExtra::grid.arrange(strict,extended,general_case,pao, legend, ncol=2, heights=c(10,10,1))
res$plot <- grand
return(res)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.