# Packages
library(sfi)
library(kmodR)
library(directlabels)
library(webshot)
library(ggplot2)
library(dplyr)
library(ggrepel)
library(plotly)
library(ggiraph)
library(scales)
library(tidyverse)
library(knitr)
library(Hmisc)
library(RColorBrewer)
library(extrafont)
library(kableExtra)
library(grid)

# webshot::install_phantomjs()

# Make the fonts known to extrafont available to the graphics devices
# (the figures below request the 'CMU Bright' family).
loadfonts()

## Global options
# Session-level R options, set in a single call.
options(
  max.print = "75",
  xtable.comment = FALSE,
  tinytex.verbose = TRUE
)

# Defaults applied to every knitr chunk in this document.
opts_chunk$set(
  echo = FALSE,
  cache = FALSE,
  prompt = FALSE,
  tidy = TRUE,
  comment = NA,
  message = FALSE,
  warning = FALSE,
  dpi = 300,
  # dev = "cairo_pdf",
  dev = c("png", "cairo_pdf"),
  fig.pos = "!h",
  fig.path = "figures/"
)

# Package-level knitr option: output width in characters.
opts_knit$set(width = 75)

This R Markdown document produces the figures for Chen and Ash.

\newpage

Chen and Ash 1

K-means clustering with outlier detection (ignore code output)

# Prepare the Figure 1 data: take a random half-sample, cluster the (x, y)
# points with kmod() and flag points that lie far from their cluster.

# no scientific notation
# options(scipen = '999')

# get data
data <- all_data$chen$f1

# court label = last '-'-separated piece of `circuit`
data$text_label <- vapply(
  strsplit(data$circuit, '-', fixed = TRUE),
  function(pieces) pieces[length(pieces)],
  character(1)
)

# recode label '0' as 'DC' and '12' as 'FC'
# NOTE(review): an earlier comment here also mentioned '11' and 'SC', which
# the code does not touch -- confirm the intended mapping.
data$text_label <- gsub('^0$', 'DC', data$text_label)
data$text_label <- gsub('12', 'FC', data$text_label)

# randomly sample half of the data set for clarity
# (integer division makes the rounding-down of the sample size explicit)
keep_index <- sample(seq_len(nrow(data)), nrow(data) %/% 2)
data <- data[keep_index, ]

# k-means with outlier detection on the two plotting coordinates
temp <- kmod(data[, c('x', 'y')],
             k = 14, l = 200, i_max = 300,
             conv_method = "delta_C", conv_error = 0,
             allow_empty_c = TRUE)

# attach kmod's per-point assignment columns; cbind() prefixes them with
# `clust_dist.` (the code below reads clust_dist.dist_sqr and clust_dist.c)
dat_c <- as.data.frame(cbind(data, clust_dist = temp$XC_dist_sqr_assign))

# flag points whose squared distance exceeds 50
dat_c$outlier <- ifelse(dat_c$clust_dist.dist_sqr > 50, 'Outlier', 'In Cluster')
dat_c$clust_dist.c <- as.factor(as.character(dat_c$clust_dist.c))

# SC, FC and DC are always shown as in cluster; %in% (unlike ==) never yields
# NA, so a missing label keeps its distance-based flag instead of becoming NA
dat_c$outlier[dat_c$text_label %in% c('SC', 'FC', 'DC')] <- 'In Cluster'
dat_c$outlier <- as.factor(dat_c$outlier)

\newpage

Figure 1 (random half-sample of the data set, n = 1089, using different point shapes)

  # version 11
  # Figure 1: scatter of the sampled points; point shape = cluster,
  # point colour = outlier flag, plus hand-placed court labels.

  # Hand-placed court labels, one row per label (coordinates in data units).
  court_labels <- data.frame(
    label = c('SC', 'FC', 'DC', '9', '5', '8', '6', '10', '4', '7', '2', '3'),
    x = c(-28, -32, -16, -23, -7, 16, 26, 23, 3, 13, 10, -6),
    y = c(42, 30, 30, 9, -35.5, -38, 8, -15, -13, 18, 40, 30),
    stringsAsFactors = FALSE
  )

  g1 <- ggplot(dat_c,
               aes(x = x,
                   y = y,
                   group = clust_dist.c,
                   shape = clust_dist.c,
                   color = outlier)) +
    geom_point(size = 1,
               alpha = 0.8) +
    scale_color_manual(name = '',
                       values = c('black', 'grey')) +
    # breaks are the 14 cluster ids as strings, in character sort order
    scale_shape_manual(name = 'Cluster group',
                       breaks = sort(as.character(1:14)),
                       values = c(14, 20, 8, 19, 12, 1, 2, 3, 5, 6, 7, 10, 15, 17)) +
    labs(x = '',
         y = '',
         title = paste0('Figure 1: Centered by Topic-Year', '\n', 'Averaged by Judge, Labeled by Court'),
         caption = paste0('Circuit, CC Judge Vector', '\n',
                          'Demeaned by Year and Big Topic')) +
    theme_sfi(lp = 'bottom',
              y_axis_title_style = 'bold',
              x_axis_title_style = 'bold',
              title_style = 'bold') +
    theme(axis.text = element_text(size = 10, hjust = 1),
          plot.title = element_text(size = 12)) +
    # One vectorised annotate() replaces twelve copy-pasted calls, so every
    # label shares the same font, size and transparency. The copies had
    # drifted: 'SC' used family "Computer modern" and '2' had no alpha.
    annotate(geom = 'text',
             label = court_labels$label,
             x = court_labels$x,
             y = court_labels$y,
             alpha = 0.8,
             family = 'CMU Bright',
             size = 6)

  g1

Figure 1 (random half-sample of the data set, n = 1089; k-means distance outliers, one repelled label per court)

# Figure 1 variant: plain k-means on (x, y); points far from their centroid
# are drawn as outliers and one randomly chosen point per court is labelled.

# get data
data <- all_data$chen$f1

# court label = last '-'-separated piece of `circuit`
data$text_label <- vapply(
  strsplit(data$circuit, '-', fixed = TRUE),
  function(pieces) pieces[length(pieces)],
  character(1)
)

# recode label '0' as 'DC' and '12' as 'FC'
# NOTE(review): an earlier comment here also mentioned '11' and 'SC', which
# the code does not touch -- confirm the intended mapping.
data$text_label <- gsub('^0$', 'DC', data$text_label)
data$text_label <- gsub('12', 'FC', data$text_label)

# randomly sample half of the data set for clarity
keep_index <- sample(seq_len(nrow(data)), nrow(data) %/% 2)
data <- data[keep_index, ]

# run kmeans clustering with 14 centers
cluster_data <- data[, c('x', 'y')]
dat_c <- kmeans(cluster_data, 14)
clusters <- dat_c$cluster

# centroid of the cluster each row was assigned to (one row per data row)
centers <- as.data.frame(dat_c$centers[dat_c$cluster, ])
rownames(centers) <- NULL
names(centers) <- c('centroid_x', 'centroid_y')

# Euclidean distance from each point to its own centroid
distances <- sqrt(rowSums((cluster_data - centers)^2))

# combine with old data
data$cluster <- clusters
data$distance <- as.numeric(distances)
data <- as.data.frame(cbind(data, centers))

# farthest points first; this also fixes the order in which courts are
# visited (and therefore stacked) below
data <- data[order(data$distance, decreasing = TRUE), ]

# if distance over 8, then outlier
data$outlier <- ifelse(data$distance > 8, 'Outlier', 'In cluster')

# For each court, sort its rows by distance and put the court label on one
# randomly chosen row; every other row keeps an empty label.
data$label <- ''
unique_text <- unique(data$text_label)
data_list <- lapply(unique_text, function(this_text) {
  sub_text <- data[data$text_label == this_text, ]
  sub_text <- sub_text[order(sub_text$distance, decreasing = FALSE), ]

  # randomly assign the court label to one row
  random_row <- sample.int(nrow(sub_text), 1)
  sub_text$label[random_row] <- this_text

  sub_text
})

data <- do.call('rbind', data_list)

# if label not blank, put in cluster (tested directly rather than through a
# regex listing the known labels)
has_label <- !is.na(data$label) & nzchar(data$label)
data$outlier <- ifelse(has_label, 'In cluster', data$outlier)

# version 11
g1 <- ggplot(data,
             aes(x = x,
                 y = y,
                 group = text_label,
                 color = outlier)) +
  geom_point(size = 1,
             alpha = 0.8) +
  geom_label_repel(aes(label = label), size = 5, segment.size = 0, box.padding = 0.5) +
  scale_color_manual(name = '',
                     values = c('black', 'grey')) +
  labs(x = '',
       y = '',
       title = paste0('Figure 1: Centered by Topic-Year', '\n', 'Averaged by Judge, Labeled by Court'),
       caption = paste0('Circuit, CC Judge Vector', '\n',
                        'Demeaned by Year and Big Topic')) +
  theme_sfi(lp = 'bottom',
            y_axis_title_style = 'bold',
            x_axis_title_style = 'bold',
            title_style = 'bold') +
  theme(axis.text = element_text(size = 10, hjust = 1),
        plot.title = element_text(size = 12))
g1

Figure 1 (random half-sample of the data set, n = 1089, no clustering)

# Figure 1 variant without clustering: every sampled point is drawn as its
# court label.

# get data
data <- all_data$chen$f1

# court label = last '-'-separated piece of `circuit`
data$text_label <- vapply(
  strsplit(data$circuit, '-', fixed = TRUE),
  function(pieces) pieces[length(pieces)],
  character(1)
)

# recode label '0' as 'DC' and '12' as 'FC'
# NOTE(review): an earlier comment here also mentioned '11' and 'SC', which
# the code does not touch -- confirm the intended mapping.
data$text_label <- gsub('^0$', 'DC', data$text_label)
data$text_label <- gsub('12', 'FC', data$text_label)

# randomly sample half of the data set for clarity
keep_index <- sample(seq_len(nrow(data)), nrow(data) %/% 2)
data <- data[keep_index, ]

# fix the label order; any label not listed here becomes NA ...
data$text_label <- factor(data$text_label,
                          levels = c('1', '2', '3', '4', '5', '6', '7', '8', '9', '10', 'DC', 'FC', 'SC'))

# ... and is then dropped, together with any other incomplete row
data <- data[complete.cases(data), ]

# version 11
g1 <- ggplot(data,
             aes(x = x,
                 y = y,
                 group = text_label)) +
  geom_text(aes(label = text_label),
            alpha = 0.8) +
  labs(x = '',
       y = '',
       title = paste0('Figure 1: Centered by Topic-Year', '\n', 'Averaged by Judge, Labeled by Court'),
       caption = paste0('Circuit, CC Judge Vector', '\n',
                        'Demeaned by Year and Big Topic')) +
  theme_sfi(lp = 'bottom',
            y_axis_title_style = 'bold',
            x_axis_title_style = 'bold',
            title_style = 'bold') +
  theme(axis.text = element_text(size = 10, hjust = 1),
        plot.title = element_text(size = 12))
g1

Chen and Ash 2

# Figure 2: points coloured by decade on a grey gradient, with one randomly
# chosen point labelled for each of the decades 1890, 1950 and 2010.

# get data
data <- all_data$chen$f2

# rename
names(data) <- c('x', 'y', 'circuit', 'decade', 'decades')

# remove s from decades so it can be numeric
data$decades <- as.numeric(gsub('s', '', data$decades, fixed = TRUE))

# create an empty label column
data$label <- ''

# decades that receive a text label in the plot
labelled_decades <- c(1890, 1950, 2010)

# Process the decades in order of first appearance; within a labelled
# decade, put the decade on one randomly chosen row.
unique_years <- unique(data$decades)
data_list <- lapply(unique_years, function(this_year) {
  sub_year <- data[data$decades == this_year, ]

  # %in% is FALSE for an NA decade, where `==` inside if() would error
  if (this_year %in% labelled_decades) {
    # randomly assign year label to one row
    random_row <- sample.int(nrow(sub_year), 1)
    sub_year$label[random_row] <- as.character(this_year)
  }

  sub_year
})

data <- do.call('rbind', data_list)

# version 11
g1 <- ggplot(data,
             aes(x = x,
                 y = y,
                 color = decades)) +
  ylim(c(-7, 20)) +
  geom_point(size = 2) +
  #  geom_dl(data = subset(data, decades ==  1890),
  #         aes(label = decades), method = 'last.qp', hjust  = 1) +
  # geom_dl(data = subset(data, decades ==  1950),
  #         aes(label = decades), method = 'smart.grid') +
  # geom_dl(data = subset(data, decades == 2010),
  #         aes(label = decades), method = 'last.qp') +
  geom_text_repel(aes(label = label), segment.size = 0) +
  scale_color_gradient(name = 'Year', low = "#2C2C2C", high = "#ABABAB") +
  labs(x = '',
       y = '',
       title = 'Figure 2.',
       subtitle = 'Centered by Court Topic, Averaged by Court-Year, Labeled by Decade',
       caption = 'Court Decade, SC & CC Court Decade Vector, Demeaned by Circuit and Big Topic') +
  theme_sfi(lp = 'bottom',
            lkw = TRUE,
            lkt = 'point',
            legend_width = 40,
            y_axis_title_style = 'bold',
            x_axis_title_style = 'bold',
            title_style = 'bold') +
  theme(axis.text = element_text(size = 10))

g1

Chen and Ash 3

# Figure 3: semi-transparent black scatter of the topic-year points with
# hand-placed topic names.

# no scientific notation
# options(scipen = '999')

# get data
data <- all_data$chen$f3
names(data) <- gsub('-', '.', names(data), fixed = TRUE)

# drop the 'Others' and '9' topics
data <- data[data$big.issue != 'Others', ]
data <- data[data$big.issue != '9', ]
data$big.issue <- as.factor(data$big.issue)

# Relabel the remaining topic levels. Labels are matched to levels by
# position, so this assumes exactly seven levels in the order 1..7 remain
# -- TODO confirm against the data.
JY_levels <- levels(data$big.issue)
JY_labels <- c('1 - Criminal Appeal', '2 - Civil Rights', '3 - First Amendment', '4 - Due Process',
               '5 - Privacy', '6 - Labor', '7 - Regulation')
JY_labels_n <- c('1', '2', '3', '4', '5', '6', '7')
data$big.issue.n <- data$big.issue
data$big.issue <- factor(data$big.issue, levels = JY_levels, labels = JY_labels)
data$big.issue.n <- factor(data$big.issue.n, levels = JY_levels, labels = JY_labels_n)

# # remove 9 and others
# data <- data %>% filter(`big-issue` != '9') %>%
#   filter(`big-issue` != 'Others')
#
# Y_labels <- c('1 - Criminal Appeal', '2 - Civil Rights', '3 - First Amendment','4 - Due Process',
#               '5 - Privacy', '6 - Labor','7 - Regulation','9 - Miscellaneous', 'Others')
# JY_labels_n <- c('1', '2', '3','4','5', '6','7','9', 'Others')

# Hand-placed topic names, one row per label (coordinates in data units).
topic_labels <- data.frame(
  label = c('Regulation', 'Criminal Appeal', '1st Amendment', 'Civil Rights',
            'Privacy', 'Labor', 'Due Process'),
  x = c(0, 0, -38, -40, 17, 38, 30),
  y = c(-50, 50, -19, 28, 31, 13, -2),
  stringsAsFactors = FALSE
)

# version 11
# No colour aesthetic is mapped (points are a fixed black), so no colour
# scale is added to the plot.
g1 <- ggplot(data,
             aes(x = x,
                 y = y)) +
  geom_point(size = 2,
             color = 'black',
             alpha = 0.4) +
  labs(x = '',
       y = '',
       title = 'Figure 3.',
       subtitle = 'Centered by Judge-Year, Averaged by Topic-Year, Labeled by Topic',
       caption = paste0('Big Topics and Year, SC and CC Topic Year Vector, Demeaned by Judge & Year')) +
  # one vectorised annotate() draws all seven topic names in the same font
  annotate(geom = 'text',
           family = 'CMU Bright',
           x = topic_labels$x,
           y = topic_labels$y,
           label = topic_labels$label) +
  theme_sfi(lp = 'none',
            y_axis_title_style = 'bold',
            x_axis_title_style = 'bold',
            title_style = 'bold') +
  theme(axis.text = element_text(size = 10, hjust = 1))
g1

Chen and Ash 4

# Figure 4: judges by party, distinguished by both point shape and colour.

# get data
data <- all_data$chen$f4

# recode party affiliation: keep only rows coded 0 or 1, then 0 -> 'R', 1 -> 'D'
data <- data[data$party %in% c(1, 0), ]
data$party <- ifelse(data$party == 0, 'R', 'D')

# version 11
g1 <- ggplot(data,
             aes(x = x,
                 y = y,
                 color = party,
                 shape = party)) +
  geom_point(size = 2,
             alpha = 0.7) +
  scale_shape_manual(name = 'Party affiliation',
                     values = c(17, 15),
                     breaks = c('D', 'R'),
                     labels = c('D = Democrat (black)', 'R = Republican (grey)')) +
  scale_color_manual(name = '',
                     values = c('black', 'darkgrey')) +
  labs(x = '',
       y = '',
       title = 'Figure 4.',
       subtitle = 'Centered by Court-Topic-Year, Averaged by Judge, Labeled by Political Party',
       caption = 'Party affiliation, SC & CC Judge Vector, Demeaned by Circuit, Big Topics, and year') +
  theme_sfi(lp = 'bottom',
            y_axis_title_style = 'bold',
            x_axis_title_style = 'bold',
            title_style = 'bold') +
  theme(axis.text = element_text(size = 10, hjust = 1)) +
  # hide the colour legend; the shape legend labels already name the colours.
  # "none" is used because passing FALSE to guides() is deprecated.
  guides(colour = 'none')

g1

Chen and Ash 4 (black and white dots with black outline)

# Figure 4 variant: black-outlined circles filled black or white by party.

# get data
data <- all_data$chen$f4

# recode party affiliation: keep only rows coded 0 or 1, then 0 -> 'R', 1 -> 'D'
data <- data[data$party %in% c(1, 0), ]
data$party <- ifelse(data$party == 0, 'R', 'D')

# version 11
g1 <- ggplot(data,
             aes(x = x,
                 y = y,
                 fill = party)) +
  # pch 21 is a fillable circle: fixed black outline, fill mapped to party
  geom_point(size = 2,
             alpha = 0.7,
             pch = 21,
             color = 'black') +
  scale_fill_manual(name = '',
                    values = c('black', 'white')) +
  labs(x = '',
       y = '',
       title = 'Figure 4.',
       subtitle = 'Centered by Court-Topic-Year, Averaged by Judge, Labeled by Political Party',
       caption = 'Party affiliation, SC & CC Judge Vector, Demeaned by Circuit, Big Topics, and year') +
  theme_sfi(lp = 'bottom',
            y_axis_title_style = 'bold',
            x_axis_title_style = 'bold',
            title_style = 'bold') +
  theme(axis.text = element_text(size = 10, hjust = 1)) +
  # No colour aesthetic is mapped in this plot, so this only matters if one
  # is added later. "none" is used because passing FALSE to guides() is
  # deprecated.
  guides(colour = 'none')

g1


databrew/sfi documentation built on May 29, 2019, 1:52 a.m.