#' Generate figures summarising the analysis of citation cases
#'
#' Reads a CSV file of citation cases, sets aside rows that cannot be analysed
#' (no year found in `document`, duplicated `citation.case`, or a
#' `citation.case` longer than 1000 characters) and writes them to
#' `parsing-errors.csv`, then writes eight PDF figures (`01-...pdf` to
#' `08-...pdf`) describing the remaining citation cases.
#'
#' @param file 'citation_data.csv' that contains data on the citation cases.
#'   The function reads the columns `document` and `citation.case`.
#' @param article Name of cited article; e.g. Fearon (2003). Used in the
#'   figure titles; its tokens are also removed from the word-cloud text.
#' @param output folder where figures generated by function will be stored.
#'   It is created if it does not exist yet.
#'
#' @return `NULL`, invisibly. The function is called for its side effects
#'   (files written to `output` and progress messages).
#'
#' @examples
#' \dontrun{
#' file <- "~/Google Drive/2016_Quality_of_citations/data/Fearon 2003_citation_cases.csv"
#' article <- "Fearon and Laitin (2003)"
#' output <- "fearon_2003"
#' analyze_citations(file, article, output)
#' }
analyze_citations <- function(file, article, output) {
  # library() stops immediately when a package is missing; require() would
  # only warn and let the function fail later with a less helpful error.
  # The attach order matches the original require() calls.
  pkgs <- c("ggplot2", "scales", "quanteda", "stringr", "qdapDictionaries",
            "readr")
  for (pkg in pkgs) {
    library(pkg, character.only = TRUE)
  }

  # make sure the output folder exists before anything is written to it
  if (!dir.exists(output)) {
    dir.create(output, recursive = TRUE)
  }
  out_path <- function(name) file.path(output, name)
  report <- function(path) message("File generated: ", path)
  fig_title <- paste0("Citation cases: ", article)

  # reading file and cleaning data
  tf <- read_csv(file)

  # extracting year: first 4-digit number starting with 1 or 2 that follows
  # a whitespace character in the document label
  tf$year <- as.numeric(str_extract(tf$document, "\\s([1-2]{1}[0-9]{3})"))

  # flag rows to exclude; NA-safe so the reported counts are never NA
  case_length <- nchar(tf$citation.case)
  missing_year <- is.na(tf$year)
  duplicate_case <- duplicated(tf$citation.case)
  too_long <- !is.na(case_length) & case_length > 1000
  message("Warning: ", sum(missing_year),
          " citation cases with missing year will be excluded from analysis.")
  message("Warning: ", sum(duplicate_case),
          " duplicated citation cases will be excluded from analysis.")
  message("Warning: ", sum(too_long),
          " citation cases longer than 1000 characters will be excluded from analysis.")
  todelete <- unique(c(which(missing_year), which(duplicate_case),
                       which(too_long)))

  # exporting the excluded rows
  write.csv(tf[todelete, ], file = out_path("parsing-errors.csv"),
            row.names = FALSE)
  # tf[-integer(0), ] would select *no* rows, so only subset when there is
  # actually something to delete
  if (length(todelete) > 0) {
    tf <- tf[-todelete, ]
  }
  if (nrow(tf) == 0) {
    stop("No citation cases left to analyze after excluding invalid rows.",
         call. = FALSE)
  }
  tf$doc_id <- match(tf$document, unique(tf$document)) # document number
  message("A total of ", max(tf$doc_id), " documents and ", nrow(tf),
          " citation cases will be included in the analysis.")

  # 01: histogram with times cited within document
  f1 <- out_path("01-times-cited-within-document.pdf")
  .citation_count_histogram(
    f1, as.vector(table(tf$document)), fig_title,
    xlab = "Citation cases (per document)",
    ylab = "Citing documents (frequency)"
  )
  report(f1)

  # 02: histogram for co-citations; every 19xx/20xx number in a citation case
  # is counted as one reference
  tf$citation_counts <- stringr::str_count(tf$citation.case, "(19|20)[0-9]{2}")
  f2 <- out_path("02-co-citations-in-citation-case.pdf")
  .citation_count_histogram(
    f2, tf$citation_counts, fig_title,
    xlab = "Number of references (per citation case)",
    ylab = "Citation cases (frequency)"
  )
  report(f2)

  # 03: average number of references per citation case over time
  f3 <- out_path("03-co-citations-over-time.pdf")
  .citation_yearly_mean_plot(
    f3, tf$citation_counts, tf$year, fig_title,
    ylab = "Average number of references in citation case"
  )
  report(f3)

  # 04: share of citation cases containing a `positive' signal word
  signal.words <- paste0("follow|recommend|validate|suggest|accordance|advice|demonstrate",
                         "|confirm|support|in line with|based")
  tf$signal_positive <- grepl(signal.words, tf$citation.case)
  f4 <- out_path("04-citations-with-positive-signal.pdf")
  .citation_yearly_mean_plot(
    f4, tf$signal_positive, tf$year, fig_title,
    ylab = "Proportion of citation cases with `positive' signal",
    labels = percent
  )
  report(f4)

  # text cleaning: tokens of the document labels and the article name are
  # treated as author names and removed from the citation text below
  authors <- tokens(char_tolower(c(tf$document, article)),
                    remove_punct = TRUE, remove_numbers = TRUE)
  authors <- unique(unlist(authors))
  # tokenizing
  case_tokens <- tokens(char_tolower(tf$citation.case),
                        remove_punct = TRUE, remove_numbers = TRUE)
  # removing stopwords, author names, and other frequent words
  case_tokens <- tokens_remove(
    case_tokens,
    pattern = c(stopwords("english"), "other", "others", "see", "also", "u",
                authors)
  )
  # creating n-grams (unigrams only)
  ngrams <- tokens_ngrams(case_tokens, n = 1)
  # putting it all back together: one cleaned string per citation case
  ngrams <- unlist(lapply(ngrams, paste, collapse = " "))
  # constructing the DFM
  cit <- corpus(ngrams)
  docnames(cit) <- paste0(seq_len(nrow(tf)), "_", tf$document)
  citmat <- dfm(cit)

  # 05: word cloud
  f5 <- out_path("05-citations-word-cloud.pdf")
  .citation_word_cloud(f5, citmat, fig_title)
  report(f5)

  # 06: sentiment analysis with the qdapDictionaries polarity lexicon
  dict <- qdapDictionaries::key.pol
  mydict <- dictionary(list(negative = dict$x[dict$y == -1],
                            positive = dict$x[dict$y == 1]))
  myDfm <- dfm(cit, dictionary = mydict)
  # select the counts by dictionary key rather than by column position
  tf$neg <- as.numeric(myDfm[, "negative"])
  tf$pos <- as.numeric(myDfm[, "positive"])
  tf$score <- (tf$pos - tf$neg)
  f6 <- out_path("06-sentiment-over-time.pdf")
  .citation_yearly_mean_plot(
    f6, tf$score, tf$year, fig_title,
    ylab = "Average sentiment in citations"
  )
  report(f6)

  # 07: length of citation cases in words. Very rough count of words: the
  # number of runs of non-word characters in each citation case.
  f7 <- out_path("07-length-of-citation-case-words.pdf")
  word_counts <- lengths(stringr::str_extract_all(tf$citation.case, "\\W+"))
  .citation_length_histogram(f7, word_counts, fig_title,
                             xlab = "Words (per citation case)")
  report(f7)

  # 08: length of citation cases in characters
  f8 <- out_path("08-length-of-citation-case-characters.pdf")
  .citation_length_histogram(f8, nchar(tf$citation.case), fig_title,
                             xlab = "Characters per citation case")
  report(f8)

  invisible(NULL)
}

# Private helpers for analyze_citations(). They are only called after
# analyze_citations() has attached ggplot2 and quanteda, so they use those
# packages' functions unqualified.

# Write a 6x4 PDF histogram of integer counts with one bar and one axis tick
# per integer value.
.citation_count_histogram <- function(path, counts, title, xlab, ylab) {
  count_range <- range(counts)
  ticks <- seq(count_range[1], count_range[2], 1)
  # bin edges at k - 0.5, so that each integer k sits in the middle of a bar
  breaks <- c(ticks - 0.5, count_range[2] + 0.5)
  pdf(path, height = 4, width = 6)
  on.exit(dev.off(), add = TRUE) # close the device even if plotting fails
  par(mar = c(3, 3, 2, 1), mgp = c(2, .7, 0), tck = -.025)
  hist(counts, xlab = xlab, main = title, xaxt = "n", breaks = breaks,
       cex.main = 1, ylab = ylab)
  axis(1, ticks)
  invisible(path)
}

# Write a 6x4 PDF line plot of the yearly mean of `values`; `labels`, when
# given, is passed on to scale_y_continuous().
.citation_yearly_mean_plot <- function(path, values, years, title, ylab,
                                       labels = NULL) {
  # aggregate() names the aggregated column "x"
  yearly <- aggregate(values, by = list(year = years), FUN = mean,
                      na.rm = TRUE)
  y_scale <- if (is.null(labels)) {
    scale_y_continuous(ylab)
  } else {
    scale_y_continuous(ylab, labels = labels)
  }
  fig <- ggplot(yearly, aes(x = as.numeric(year), y = x)) +
    geom_point() + geom_line() + theme_minimal() +
    theme(axis.title.x = element_blank()) +
    y_scale +
    ggtitle(title)
  ggsave(filename = path, plot = fig, height = 4, width = 6)
  invisible(path)
}

# Write a 5x5 PDF with the title in a narrow top panel and the word cloud of
# `citmat` below it.
.citation_word_cloud <- function(path, citmat, title) {
  pdf(path, height = 5, width = 5)
  on.exit(dev.off(), add = TRUE) # close the device even if plotting fails
  # two stacked panels: title (1/5 of the height) and word cloud (4/5)
  layout(matrix(c(1, 2), nrow = 2), heights = c(1, 4))
  par(mar = rep(0, 4))
  plot.new()
  text(x = 0.5, y = 0.5, title)
  # NOTE(review): rot.per/scale/max.words look like the wordcloud-package
  # argument names; confirm the installed quanteda version accepts them.
  textplot_wordcloud(citmat, rot.per = 0, scale = c(4, .4), max.words = 80)
  invisible(path)
}

# Write a 6x4 PDF histogram (20 suggested breaks) of citation-case lengths.
.citation_length_histogram <- function(path, values, title, xlab) {
  pdf(path, height = 4, width = 6)
  on.exit(dev.off(), add = TRUE) # close the device even if plotting fails
  hist(values, xlab = xlab, breaks = 20, cex.main = 1,
       ylab = "Citation cases (frequency)",
       main = title)
  invisible(path)
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.