# R/JSTOR_2bigramscor.R

#' Plot the change over time of the correlation between one bigram (or set of bigrams) and another bigram (or set of bigrams) in a JSTOR DfR dataset
#' 
#' @description Function to plot changes over time in the correlation between two bigrams, or between two sets of bigrams. For use with JSTOR's Data for Research datasets (http://dfr.jstor.org/).
#' @param unpack2 object returned by the function JSTOR_unpack2.
#' @param bigram1 One bigram or a vector of bigrams, each bigram surrounded by standard quote marks.
#' @param bigram2 One bigram or a vector of bigrams, each bigram surrounded by standard quote marks.
#' @param span span of the loess line (controls the degree of smoothing). Default is 0.4
#' @return Returns a ggplot object with publication year on the horizontal axis and Pearson's correlation on the vertical axis. Each point represents all the documents of a single year, point size is inversely proportional to p-value of the correlation.
#' @examples 
#' ## JSTOR_2bigramscor(unpack2, bigram1 = "hot water", bigram2 = "cold water")
#' ## JSTOR_2bigramscor(unpack2, c("hot water", "warm water", "tepid water"),  c("cold water", "ice water"))
#' @import slam data.table ggplot2 scales reshape2 plyr




JSTOR_2bigramscor <- function(unpack2, bigram1, bigram2, span = 0.4){
  # Plot, for each publication year, the Pearson correlation (across the
  # documents of that year) between the length-standardised frequencies of
  # two bigrams or two sets of bigrams.
  #
  # Args:
  #   unpack2: list with elements `bigrams` (a document-by-term count matrix
  #            whose column dimnames are called `Terms`) and `bibliodata`
  #            (which must have a `year` column, one entry per document).
  #   bigram1, bigram2: character vectors of one or more bigrams. When more
  #            than one bigram is given, their counts are summed per document.
  #   span:    span of the loess smoother drawn through the yearly values.
  #
  # Returns:
  #   A ggplot object (year on x, correlation on y, point size mapped to the
  #   negated p-value so that smaller p-values give larger points).

  y <- unpack2$bigrams
  bibliodata <- unpack2$bibliodata

  # Total count per document, used to standardise for document length
  leng <- row_sums(y)

  # Per-document count of the bigrams of interest. row_sums() collapses a
  # set of several bigrams into one count per document, so each side is
  # always a single vector regardless of how many bigrams were supplied.
  terms <- dimnames(y)$Terms
  bg1a <- row_sums(y[, terms %in% bigram1])
  bg2a <- row_sums(y[, terms %in% bigram2])

  # Convert the years once; going through as.character() keeps this correct
  # when the year column is stored as a factor.
  year <- as.numeric(as.character(bibliodata$year))
  lim_min <- min(year, na.rm = TRUE)
  lim_max <- max(year, na.rm = TRUE)

  # One row per document: standardised frequencies plus publication year
  c2bigrams_by_year <- data.table(cbg1 = as.numeric(bg1a / leng),
                                  cbg2 = as.numeric(bg2a / leng),
                                  year = year)

  # Correlation and p-value for the documents of a single year.
  # cor.test() stops with an error when fewer than three complete pairs are
  # available, so such years yield NA instead of aborting the whole plot.
  cor_for_year <- function(d) {
    usable <- complete.cases(d$cbg1, d$cbg2)
    if (sum(usable) < 3L) {
      return(data.frame(corr = NA_real_, pval = NA_real_))
    }
    ct <- cor.test(d$cbg1, d$cbg2)
    data.frame(corr = unname(ct$estimate), pval = ct$p.value)
  }
  corrp <- ddply(c2bigrams_by_year, .(year), cor_for_year)

  # Lower y limit: smallest finite correlation. NA tells ggplot2 to derive
  # the limit from the data when no finite value exists.
  finite_corr <- corrp$corr[is.finite(corrp$corr)]
  y_lower <- if (length(finite_corr) > 0) min(finite_corr) else NA_real_

  # Collapse each set of bigrams so the axis title is a single string
  y_label <- paste0("correlation between '",
                    paste(bigram1, collapse = ", "),
                    "' and '",
                    paste(bigram2, collapse = ", "),
                    "'")

  # visualise
  ggplot(corrp, aes(year, corr)) +
    geom_point(aes(size = -pval)) +
    geom_smooth(method = "loess",
                span = span,
                se = FALSE) +
    theme(axis.text.x = element_text(angle = 90,
                                     hjust = 1)) +
    geom_hline(yintercept = 0, colour = "red") +
    ylim(y_lower, 1.0) +
    ylab(y_label) +
    scale_x_continuous(limits = c(lim_min, lim_max),
                       breaks = seq(lim_min - 1, lim_max + 1, 2)) +
    # p-values are negated for the size aesthetic, so the breaks are
    # negative while the legend labels show the real p-values
    scale_size_continuous("p-values",
                          breaks = c(-0.75, -0.25,
                                     -0.05, -0.001),
                          labels = c(0.75, 0.25, 0.05, 0.001))
}
# benmarwick/JSTORr documentation built on May 12, 2019, 12:59 p.m.