# citationsr: A package to analyze citations
#
# Functions documented in this file: analyze_citations

#' Generates figures with summary of analysis of citation cases
#'
#' Reads a CSV file of citation cases, sets aside rows that cannot be analyzed
#' (no year found in the document name, duplicated citation case, or citation
#' case longer than 1000 characters), writes those rows to
#' \code{parsing-errors.csv} and then writes eight PDF figures
#' (\code{01-times-cited-within-document.pdf} to
#' \code{08-length-of-citation-case-characters.pdf}) into \code{output}.
#'
#' @param file 'citation_data.csv' that contains data on the citation cases.
#'   The columns \code{document} and \code{citation.case} are required.
#' @param article Name of cited article; e.g. Fearon (2003). Used in the
#'   figure titles; its tokens are also removed from the text before the word
#'   cloud and sentiment analysis.
#' @param output folder where figures generated by function will be stored.
#'   It is created if it does not exist yet.
#'
#' @return Called for its side effects (files written to \code{output});
#'   returns \code{NULL} invisibly.
#'
#' @examples
#' \dontrun{
#'  file <- "~/Google Drive/2016_Quality_of_citations/data/Fearon 2003_citation_cases.csv"
#'  article <- "Fearon and Laitin (2003)"
#'  output <- "fearon_2003"
#'  analyze_citations(file, article, output)
#' }
analyze_citations <- function(file, article, output){

  # Attach the packages whose functions are called unqualified below. A bare
  # require() only returns FALSE when a package is missing, so check the result
  # and fail right away with a clear message.
  needed <- c("ggplot2", "scales", "quanteda", "stringr", "qdapDictionaries",
    "readr")
  for (pkg in needed) {
    if (!require(pkg, character.only = TRUE)) {
      stop("Package '", pkg, "' is required by analyze_citations() but could ",
        "not be loaded.", call. = FALSE)
    }
  }

  # All files are written below `output`; make sure the folder exists.
  if (!dir.exists(output)) {
    dir.create(output, recursive = TRUE)
  }

  fig_title <- paste0("Citation cases: ", article)

  # ---- local helpers -------------------------------------------------------

  # Opens a PDF device, runs the plotting function `draw`, and always closes
  # the device again, even if `draw` signals an error.
  save_pdf <- function(path, height, width, draw) {
    pdf(path, height = height, width = width)
    tryCatch(draw(), finally = dev.off())
    message("File generated: ", path)
  }

  # Histogram of integer counts with one bar per integer value, centred on it.
  plot_integer_hist <- function(path, values, xlab, ylab) {
    rng <- range(values, na.rm = TRUE)
    # `:` binds tighter than `-`: breaks sit at min - 0.5, ..., max + 0.5
    breaks <- c(rng[1]:rng[2] - 0.5, rng[2] + 0.5)
    save_pdf(path, height = 4, width = 6, function() {
      par(mar = c(3, 3, 2, 1), mgp = c(2, .7, 0), tck = -.025)
      hist(values, xlab = xlab, main = fig_title, xaxt = "n",
        breaks = breaks, cex.main = 1, ylab = ylab)
      axis(1, seq(rng[1], rng[2], 1))
    })
  }

  # Line plot of the yearly mean of `values`, saved with ggsave().
  plot_yearly_mean <- function(path, values, year, ylab, labels = waiver()) {
    # aggregate() on a bare vector names the aggregated column `x`
    by_year <- aggregate(values, by = list(year = year), FUN = mean,
      na.rm = TRUE)
    fig <- ggplot(by_year, aes(x = as.numeric(year), y = x)) +
      geom_point() + geom_line() + theme_minimal() +
      theme(axis.title.x = element_blank()) +
      scale_y_continuous(ylab, labels = labels) +
      ggtitle(fig_title)
    ggsave(filename = path, plot = fig, height = 4, width = 6)
    message("File generated: ", path)
  }

  # precleaning file
  # text <- scan(file, what="character", sep="\n")
  # text <- gsub('\\\\"', "''", text)
  # text <- paste0(text, collapse="\n")
  # tmp <- tempfile()
  # writeLines(text, con=tmp)

  # ---- reading file and cleaning data --------------------------------------
  tf <- read_csv(file) # , fileEncoding = encoding
  missing_cols <- setdiff(c("document", "citation.case"), names(tf))
  if (length(missing_cols) > 0) {
    stop("Column(s) missing from '", file, "': ",
      paste(missing_cols, collapse = ", "), call. = FALSE)
  }

  # extracting year (first four-digit number starting with 1 or 2 that follows
  # a whitespace character in the document name)
  tf$year <- as.numeric(str_extract(tf$document, '\\s([1-2]{1}[0-9]{3})'))
  no_year <- is.na(tf$year)
  is_dup <- duplicated(tf$citation.case)
  too_long <- nchar(tf$citation.case) > 1000
  message("Warning: ", sum(no_year), " citation cases with missing year will be excluded from analysis.")
  message("Warning: ", sum(is_dup), " duplicated citation cases will be excluded from analysis.")
  message("Warning: ", sum(too_long, na.rm = TRUE), " citation cases longer than 1000 characters will be excluded from analysis.")
  todelete <- unique(c(which(no_year), which(is_dup), which(too_long)))

  # exporting the excluded rows
  write.csv(tf[todelete,], file=paste0(output, '/parsing-errors.csv'), row.names=FALSE)
  # Only drop rows when there is something to drop: tf[-integer(0), ] would
  # select zero rows and silently discard the whole data set.
  if (length(todelete) > 0) {
    tf <- tf[-todelete,]
  }
  if (nrow(tf) == 0) {
    stop("No citation cases left to analyze after cleaning; see '",
      paste0(output, '/parsing-errors.csv'), "'.", call. = FALSE)
  }
  tf <- transform(tf, doc_id=match(document, unique(document))) # generate document number
  message("A total of ", max(tf$doc_id), " documents and ", nrow(tf), " citation cases will be included in the analysis.")

  # ---- 01: histogram with times cited within document ----------------------
  plot_integer_hist(paste0(output, '/01-times-cited-within-document.pdf'),
    table(tf$document),
    xlab = "Citation cases (per document)",
    ylab = "Citing documents (frequency)")

  # ---- 02: histogram for co-citations --------------------------------------
  # every 19xx/20xx year found in the citation case counts as one reference
  tf$citation_counts <- stringr::str_count(tf$citation.case, "(19|20)[0-9]{2}")
  plot_integer_hist(paste0(output, '/02-co-citations-in-citation-case.pdf'),
    tf$citation_counts,
    xlab = "Number of references (per citation case)",
    ylab = "Citation cases (frequency)")

  # ---- 03: average number of references per citation over time -------------
  plot_yearly_mean(paste0(output, '/03-co-citations-over-time.pdf'),
    tf$citation_counts, tf$year,
    ylab = "Average number of references in citation case")

  # ---- 04: figure with positive signals ------------------------------------
  # NOTE(review): matching is case-sensitive, so e.g. a sentence-initial
  # "Following" is not counted -- confirm whether that is intended.
  signal.words <- paste0("follow|recommend|validate|suggest|accordance|advice|demonstrate",
    "|confirm|support|in line with|based")
  tf$signal_positive <- grepl(signal.words, tf$citation.case)
  plot_yearly_mean(paste0(output, '/04-citations-with-positive-signal.pdf'),
    tf$signal_positive, tf$year,
    ylab = "Proportion of citation cases with `positive' signal",
    labels = percent)

  # ---- text cleaning -------------------------------------------------------
  # tokens of the citing document names and of the cited article (mostly
  # author names) are removed from the citation text below
  authors <- tokens(char_tolower(c(tf$document, article)),
    remove_punct = TRUE, remove_numbers = TRUE)
  authors <- unique(unlist(authors))
  # tokenizing
  case_tokens <- tokens(char_tolower(tf$citation.case),
    remove_punct = TRUE, remove_numbers = TRUE)
  # removing stopwords, author names, and other frequent words
  case_tokens <- tokens_remove(case_tokens,
    pattern = c(stopwords("english"), "other", "others", "see", "also", "u", authors))
  # stemming?
  #case_tokens <- lapply(case_tokens, wordstem)
  # creating n-grams
  ngrams <- tokens_ngrams(case_tokens, n = 1) # lapply(tokens, ngrams, 1:3)
  # putting it all back together...
  ngrams <- unlist(lapply(ngrams, paste, collapse=" "))
  # constructing the DFM
  cit <- corpus(ngrams)
  docnames(cit) <- paste0(seq_len(nrow(tf)), '_', tf$document)
  # summary(cit)
  citmat <- dfm(cit)

  # ---- 05: word cloud ------------------------------------------------------
  save_pdf(paste0(output, '/05-citations-word-cloud.pdf'), height = 5, width = 5,
    function() {
      # paul - START
      # two stacked panels: a narrow one for the title, a tall one for the cloud
      layout(matrix(c(1, 2), nrow=2), heights=c(1, 4))
      par(mar=rep(0, 4))
      plot.new()
      text(x=0.5, y=0.5, fig_title)
      # paul - END
      # NOTE(review): rot.per/scale/max.words look like wordcloud::wordcloud()
      # arguments -- confirm they are accepted by the installed quanteda version.
      textplot_wordcloud(citmat, rot.per=0, scale=c(4, .4), max.words=80)
    })

  # ---- 06: sentiment analysis ----------------------------------------------
  dict <- qdapDictionaries::key.pol
  mydict <- dictionary(list(negative = dict$x[dict$y==-1],
                          positive = dict$x[dict$y==1]))
  myDfm <- dfm(cit, dictionary = mydict)
  # columns follow the order of the dictionary keys: 1 = negative, 2 = positive
  tf$neg <- as.numeric(myDfm[,1])
  tf$pos <- as.numeric(myDfm[,2])
  tf$score <- (tf$pos - tf$neg)
  plot_yearly_mean(paste0(output, '/06-sentiment-over-time.pdf'),
    tf$score, tf$year,
    ylab = "Average sentiment in citations")

  # ---- 07: PAUL: figure with length of citation cases (words) --------------
  # Very rough count of words: number of runs of non-word characters
  n_words <- lengths(stringr::str_extract_all(tf$citation.case, "\\W+"))
  save_pdf(paste0(output, '/07-length-of-citation-case-words.pdf'),
    height = 4, width = 6, function() {
      hist(n_words, xlab="Words (per citation case)", breaks = 20,  cex.main=1,
           ylab="Citation cases (frequency)",
           main = fig_title)
    })

  # ---- 08: PAUL: figure with length of citation cases (characters) ---------
  n_chars <- nchar(tf$citation.case)
  save_pdf(paste0(output, '/08-length-of-citation-case-characters.pdf'),
    height = 4, width = 6, function() {
      hist(n_chars, xlab="Characters per citation case", breaks = 20,  cex.main=1,
           ylab="Citation cases (frequency)",
           main = fig_title)
    })

  invisible(NULL)
}