# R/JSTOR_lda_docdists.R

#' Calculate Euclidean distances between documents
#'
#' @description Generates a dendrogram of document clusters, a network plot of
#'   document-document relationships, and a graphml file to open with Gephi.
#'   For use with JSTOR's Data for Research datasets (http://dfr.jstor.org/).
#' @param lda the object returned by the function JSTOR_lda. Its first element
#'   is used as the table of per-document topic proportions; columns named
#'   "ID" and "year" are excluded from the distance calculation.
#' @return The thresholded document-by-document distance matrix that is used
#'   as the adjacency matrix of the network plot (entries above the per-row
#'   cutoff are set to zero). As side effects, the dendrogram and the network
#'   are plotted and the file "docs.graphml" is written to the working
#'   directory so that it can be opened with Gephi.
#' @examples
#' ## JSTOR_lda_docdists(lda = lda150)
#' @import cluster igraph
JSTOR_lda_docdists <- function(lda) {

  # unpack output from JSTOR_lda
  topic_props <- lda[[1]]
  if (is.null(dim(topic_props)) || nrow(topic_props) < 2L) {
    stop(
      "`lda[[1]]` must be a table with at least two documents (rows).",
      call. = FALSE
    )
  }

  #### Euclidean distance matrix on topics

  # Keep only the topic columns. drop = FALSE keeps a one-topic table
  # two-dimensional so that daisy() still receives a table, not a vector.
  is_topic_col <- !(colnames(topic_props) %in% c("ID", "year"))
  full_dists <- as.matrix(
    daisy(
      topic_props[, is_topic_col, drop = FALSE],
      metric = "euclidean",
      stand = TRUE
    )
  )

  # Change row values to zero if greater than row minimum plus row standard
  # deviation. This is how Jockers subsets the distance matrix to keep only
  # closely related documents and avoid a dense spaghetti diagram that's
  # difficult to interpret (hat-tip: http://stackoverflow.com/a/16047196/1036500).
  # The diagonal of a distance matrix is zero, so each row minimum is 0 and
  # the cutoff is effectively the row's standard deviation.
  row_cutoff <- apply(full_dists, 1, min) + apply(full_dists, 1, sd)
  near_dists <- full_dists
  near_dists[sweep(full_dists, 1, row_cutoff) > 0] <- 0

  ## dendrogram (built from the full, unthresholded distances)
  # NOTE(review): dist() is applied to the rows of the distance matrix itself,
  # not as.dist() -- kept as in the original; confirm this is intended.
  plot(
    hclust(dist(full_dists)),
    xlab = "document clusters",
    sub = "",
    main = "",
    labels = topic_props[["ID"]],
    cex = 0.1
  )

  ## network plot
  # NOTE(review): the matrix is passed without `weighted = TRUE`; confirm that
  # an unweighted graph is what is wanted here.
  g <- as.undirected(graph.adjacency(near_dists))
  layout1 <- layout.fruchterman.reingold(g, niter = 500)
  plot(
    g,
    layout = layout1,
    edge.curved = TRUE,
    vertex.size = 1,
    vertex.color = "grey",
    edge.arrow.size = 0.1,
    vertex.label.dist = 0.5,
    vertex.label = NA
  )

  # Export for Gephi. This has to happen before the value is returned: the
  # original called return() first, so the file was never written.
  write.graph(g, file = "docs.graphml", format = "graphml")
  message(paste0("The docs.graphml file for Gephi can be found in ", getwd()))

  near_dists
}
# benmarwick/JSTORr documentation built on May 12, 2019, 12:59 p.m.