Authors

Auxiliary files

Top-`r ntop` uniquely identified authors and their productivity (title count).

# Chunk defaults: figures are written under figure_slides/ using the
# CairoPNG device.
knitr::opts_chunk$set(dev = "CairoPNG", fig.path = "figure_slides/")

# Plot returned by top_plot() for the "author" field, limited by ntop
# (presumably the ntop most frequent authors -- confirm against top_plot()).
p <- NULL
p <- top_plot(df, "author", ntop) +
  labs(title = "Top authors", y = "Documents")
print(p)

# Documents that carry a raw author name but have no value in the author
# field; only the raw name column is kept.
dff <- df %>%
  filter(is.na(author), !is.na(author_name)) %>%
  select(author_name)

# Same top_plot() summary as for accepted authors, applied to the raw names.
p <- top_plot(dff, "author_name", ntop) +
  ggtitle("Top discarded authors")
print(p)

Ambiguous authors

Authors with ambiguous living year information - can we spot here cases where these are clearly known identical or distinct authors? Should also add living year information from supporting sources later.

`r length(unique(subset(df, is.na(author_birth) | is.na(author_death))$author))` authors with missing life years (life year info can be augmented here).

`r length(readLines(paste0(this.folder, "/output.tables/author_life_ambiguous.csv")))-1` authors with ambiguous life years. Some of these might be synonymous and could be added to the author synonym list (the first term will be selected for the final data).

Life span of uniquely identified top authors

Ordered by productivity (number of documents).

# Life spans of the most productive authors: one horizontal segment per
# author, running from author_birth to author_death.

# Document counts per author, most frequent first. The selection is capped at
# the number of distinct authors so that indexing never pads the result with
# NA entries when fewer than ntop authors exist. The final rev() restores the
# ascending order of the selected counts.
counts_desc <- rev(sort(table(df$author)))
a <- rev(counts_desc[seq_len(min(ntop, length(counts_desc)))])

# One row of life-year data per selected author: match() keeps the first
# distinct (author, birth, death) combination found for each name.
dfa <- df[, c("author", "author_birth", "author_death")]
dfa <- filter(dfa, !is.na(author) & (author %in% names(a)))
dfa <- dfa[!duplicated(dfa), ]
dfa <- dfa[match(names(a), dfa$author), ]

# Order authors by birth year; the factor levels are fixed in that same order
# so the y axis follows it.
dfa <- arrange(dfa, author_birth)
dfa$author <- factor(dfa$author, levels = dfa$author)
# seq_len() yields an empty index for an empty frame (1:0 would be c(1, 0)).
dfa$index <- seq_len(nrow(dfa))

p <- ggplot(dfa) +
  geom_segment(
    aes(y = author, yend = author, x = author_birth, xend = author_death),
    size = 2
  ) +
  theme(axis.text.y = element_text(size = 9)) +
  xlab("Author life span (year)") +
  ylab("")
print(p)

Author age

`r sum(!is.na(df$author_age))` documents (`r round(100*mean(!is.na(df$author_age)))`%) have author age at the publication year. These have been calculated for documents where the publication year and author life years (birth and death) are available, and the document has been printed during the author's lifetime.

# Three views of author age at publication, drawn side by side:
# mean age per decade, mean age per year, and the overall age distribution.
theme_set(theme_bw(20))

# Mean author age per publication decade (documents with a known age only).
dfs <- df %>%
  filter(!is.na(author_age)) %>%
  group_by(publication_decade) %>%
  summarize(age = mean(author_age)) %>%
  arrange(publication_decade)
# No fill aesthetic is mapped here, so no fill scale is needed.
p1 <- ggplot(dfs, aes(x = publication_decade, y = age)) +
  geom_col(color = "black") +
  xlab("Publication Decade") +
  ylab("Mean age (years)") +
  ggtitle("Average author age")

# Mean author age per publication year, with the number of documents (n)
# behind each yearly mean. Missing ages are already removed by the filter,
# so the mean needs no further NA handling.
dfs <- df %>%
  filter(!is.na(author_age)) %>%
  group_by(publication_year) %>%
  summarize(age = mean(author_age), n = n()) %>%
  arrange(publication_year)
# The size mapping belongs to the points only: mapped at plot level it would
# also be inherited by geom_smooth(), which cannot use a per-point size.
p2 <- ggplot(dfs, aes(x = publication_year, y = age)) +
  geom_point(aes(size = n)) +
  geom_smooth() +
  xlab("Publication year") +
  ylab("Mean age (years)") +
  ggtitle("Author mean age")

# Distribution of author age over all documents, in 10-year bins.
dfs <- df %>% filter(!is.na(author_age))
p3 <- ggplot(dfs, aes(x = author_age)) +
  geom_histogram(binwidth = 10) +
  xlab("Age (years)") +
  ylab("Title count") +
  ggtitle("Author age on the publication year")

grid.arrange(p1, p2, p3, nrow = 1)

Author productivity

Title count versus paper consumption (all authors):

# Run compare_title_paper() on the author field once per plot mode
# ("text" first, then "point") and show the two resulting plots side by side,
# point version on the left.
title_vs_paper <- lapply(
  c(text = "text", point = "point"),
  function(mode) compare_title_paper(df, "author", plot.mode = mode)
)
res1 <- title_vs_paper$text
res2 <- title_vs_paper$point
grid.arrange(res2$plot, res1$plot, nrow = 1)
# kable(res1$table)
# Stacked per-decade title counts for the three authors returned by top().
theme_set(theme_bw(base_size = 20))

top.authors <- names(top(df, field = "author", n = 3))

# Number of documents per (author, decade) for those authors.
dfs <- df %>%
  filter(author %in% top.authors) %>%
  group_by(author, publication_decade) %>%
  tally() %>%
  arrange(publication_decade)

p <- ggplot(dfs, aes(x = publication_decade, y = n, fill = author)) +
  geom_col(position = "stack", color = "black") +
  scale_fill_grey() +
  guides(fill = guide_legend("Author")) +
  labs(
    title = "Title count timeline for the top authors",
    x = "Publication Decade",
    y = "Title Count"
  )
print(p)
# Yearly paper consumption and document counts for the ten most frequent
# authors in the preprocessed data.
df <- df.preprocessed

# Author names ordered by decreasing document count. head() returns at most
# ten names and never pads with NA; an NA entry in top.authors would make
# `author %in% top.authors` match documents whose author is missing.
author_counts <- rev(sort(table(df.preprocessed$author)))
top.authors <- head(names(author_counts), 10)

# Per (year, author): total of the paper column and number of documents.
# ungroup() drops the grouping that summarize() leaves on the result.
df2 <- df %>%
  filter(author %in% top.authors) %>%
  group_by(publication_year, author) %>%
  summarize(paper = sum(paper, na.rm = TRUE), n = n()) %>%
  ungroup()

# Paper consumption per author and year, log10 y axis.
p <- ggplot(
  df2,
  aes(x = publication_year, y = paper, group = author, color = author)
) +
  geom_point() +
  geom_line() +
  ggtitle("Paper consumption per author") +
  xlab("Year") +
  scale_y_log10() +
  ylab("Paper consumption")
print(p)

# Document count per author and year, log10 y axis.
p <- ggplot(
  df2,
  aes(x = publication_year, y = n, group = author, color = author)
) +
  geom_point() +
  geom_line() +
  ggtitle("Documents per author") +
  xlab("Year") +
  scale_y_log10() +
  ylab("Documents (n)")
print(p)


rOpenGov/bibliographica documentation built on April 10, 2022, 8:51 p.m.