R/compile_definfo.R

Defines functions odcr_compile_definfo oscn_compile_definfo

oscn_compile_definfo <- function(courts, casetypes, file_years) {
  
  courts <- courts[courts %in% oscn_counties]
  
  qoscn <- function(table, fields) {
    dbGetQuery(ojo_db, glue_sql(
      "SELECT {`fields`*}
                          FROM {`table`}
                          WHERE court IN ({courts*})
                          AND casetype IN ({casetypes*})
                          AND file_year IN ({file_years*})", 
      .con = ojo_db)) %>% 
      mutate_at(vars(contains("_date")), list(ymd))
  }
  
  connect_ojo()
  
  di <- qoscn("oscn_parties", c("court", "casenum", "party_id", "party_name", "party_type")) %>%
    filter(party_type %in% c("Defendant", "DEFENDANT")) %>%
    rename(def_id = party_id, defname = party_name) %>%
    left_join(qoscn("oscn_caseinfo", c("court", "casenum", "file_date"))) %>% 
    left_join(qoscn("oscn_party_address", c("court", "casenum", "def_id", "def_address", "def_zip", "rec_date"))) %>% 
    mutate(datedif = abs(rec_date - file_date),
           def_address = str_squish(def_address) %>% 
             str_remove_all("[^[[:alpha:]]|,| ]") %>% 
             str_remove(" U(S| )A$")) %>% 
    group_by(court, casenum, def_id) %>%
    filter(datedif == min(datedif)) %>% 
    group_by(court, casenum, def_id, defname) %>%
    summarize(def_address = first(def_address[which(!is.na(def_address) & def_address != "")]),
              def_zip = first(def_zip[which(!is.na(def_zip))])) %>% 
    left_join(qoscn("oscn_party_profile", c("court", "casenum", "def_id", "def_mob")) %>% 
                filter(!is.na(def_mob)) %>% 
                group_by(court, casenum, def_id) %>% 
                summarize(def_mob = first(def_mob))) %>% 
    mutate(firstname = str_extract(defname, "(?<=, ).*?(?= |$)"),
           middlename = str_extract(defname, "(?<= )[[:alpha:]]*?$"),
           b_year = str_sub(def_mob, 1, 4) %>% 
             as.numeric)
  
  print("Defendant data queried.")
  
  gender_tbl <- dbReadTable(ojo_db, "ojo_gender")
  
  dip <- di %>% 
    ungroup %>% 
    select(firstname, b_year) %>% 
    distinct() %>% 
    anti_join(gender_tbl %>% 
                mutate(b_year = ifelse(year_min == year_max, year_min, NA)) %>% 
                select(firstname = name, b_year))
  
  sex_df <- tibble()
  
  print("Getting gender data for first names . . .")
  
  total_time <- 0
  
  if (nrow(dip) > 0) {
    for (i in 1:nrow(dip)) {
      
      start_time <- Sys.time()
      
      if (is.na(dip[i, "b_year"])) {
        sex_df <- sex_df %>% 
          bind_rows(gender(dip[i, "firstname"] %>% as.character(),
                           years = c(1950, 2000),
                           method = "ssa") %>% 
                      mutate(gender = as.character(gender)))
      } else {
        sex_df <- sex_df %>% 
          bind_rows(gender(dip[i, "firstname"] %>% as.character(),
                           years = dip[i, "b_year"] %>% as.numeric,
                           method = "ssa") %>% 
                      mutate(gender = as.character(gender)))
      }
      
      total_time <- total_time + (Sys.time() - start_time)
      
      print(paste(i, "of", nrow(dip), "done. About", 
                  if_else(round((total_time/i)*(nrow(dip) - i), 1) < 60, 
                          paste(round((total_time/i)*(nrow(dip) - i), 1), "seconds remaining."),
                          paste(round((total_time/i)*(nrow(dip) - i)/60, 1), "minutes remaining.")))
      )
    }
  }
  
  gender_tbl <- gender_tbl %>% 
    bind_rows(sex_df %>% 
                mutate(newrow = TRUE))
  
  round2 <- di %>% 
    ungroup %>% 
    select(firstname, middlename, b_year) %>% 
    left_join(gender_tbl %>% 
                mutate(b_year = ifelse(year_min == year_max, year_min, NA)) %>% 
                select(firstname = name, b_year, proportion_male)) %>% 
    filter(proportion_male > .25 & proportion_male < .75) %>% 
    select(middlename, b_year) %>% 
    filter(nchar(middlename) > 1) %>% 
    distinct
  
  middle_sex <- tibble()
  
  total_time <- 0
  
  print("Searching for middle names")
  
  if (nrow(round2) > 0) {
    for (i in 1:nrow(round2)) {
      
      start_time <- Sys.time()
      
      if (is.na(round2[i, "b_year"])) {
        middle_sex <- middle_sex %>% 
          bind_rows(gender(round2[i, "middlename"] %>% as.character(),
                           years = c(1950, 2000),
                           method = "ssa") %>% 
                      mutate(gender = as.character(gender)))
      } else {
        middle_sex <- middle_sex %>% 
          bind_rows(gender(round2[i, "middlename"] %>% as.character(),
                           years = round2[i, "b_year"] %>% as.numeric,
                           method = "ssa") %>% 
                      mutate(gender = as.character(gender)))
      }
      total_time <- total_time + (Sys.time() - start_time)
      
      print(paste(i, "of", nrow(round2), "middle names done. About", 
                  if_else(round((total_time/i)*(nrow(round2) - i), 1) < 60, 
                          paste(round((total_time/i)*(nrow(round2) - i), 1), "seconds remaining."),
                          paste(round((total_time/i)*(nrow(round2) - i)/60, 1), "minutes remaining.")))
      )
    }
  }
  
  gender_tbl <- gender_tbl %>% 
    bind_rows(middle_sex %>% 
                mutate(newrow = TRUE))
  
  dbWriteTable(ojo_db, "ojo_gender", 
               gender_tbl %>% 
                 filter(newrow == TRUE) %>% 
                 select(-newrow), 
               row.names = FALSE, append = TRUE)
  
  di <- di %>% 
    left_join(gender_tbl %>%
                mutate(b_year = ifelse(year_min == year_max, year_min, NA)) %>% 
                select(firstname = name, b_year, proportion_male, gender) %>% 
                distinct)
  
  di <- di %>% 
    left_join(gender_tbl %>%
                mutate(b_year = ifelse(year_min == year_max, year_min, NA)) %>% 
                select(middlename = name, b_year, proportion_male_middle = proportion_male, gender_middle = gender) %>% 
                distinct)
  
  di_all <<- di %>% 
    mutate(sex_imputed = case_when(proportion_male < .25 | proportion_male > .75 ~ gender,
                                   proportion_male_middle < .25 | proportion_male_middle > .75 ~ gender_middle,
                                   TRUE ~ "unknown"),
           sex_impute_method = case_when(proportion_male < .25 | proportion_male > .75 ~ "first name",
                                         proportion_male_middle < .25 | proportion_male_middle > .75 ~ "middle name",
                                         TRUE ~ "none")) %>% 
    filter(!str_detect(defname, "OSBI|PROFILE")) %>% 
    mutate(casetype = str_sub(casenum, 1, 2),
           file_year = str_sub(casenum, 4, 7))
  
  dbxUpsert(ojo_db, "ojo_def_info", di_all, where_cols = c("court", "casenum", "defname"))
  
  disconnect_ojo()
}

odcr_compile_definfo <- function(courts, casetypes, file_years) {
  
  courts <- courts[courts %in% odcr_counties]
  
  qodcr <- function(table, fields) {
    dbGetQuery(ojo_db, glue_sql(
      "SELECT {`fields`*}
                          FROM {`table`}
                          WHERE court IN ({courts*})
                          AND casetype IN ({casetypes*})
                          AND file_year IN ({file_years*})", 
      .con = ojo_db)) %>% 
      mutate_at(vars(contains("_date")), list(ymd))
  }
  
  connect_ojo()
  
  di <- qodcr("odcr_party", c("court", "casenum", "party", "party_type")) %>%
    filter(party_type == "Defendant") %>% 
    select(-party_type) %>% 
    rename(defname = party) %>% 
    left_join(qodcr("odcr_party_profile", c("court", "casenum", "def_mob", "def_address", "def_zip")) %>% 
                mutate(def_address = str_remove(def_address, "\\s{5}.*|\\d{5}") %>% 
                         str_squish())) %>% 
    mutate(firstname = str_extract(defname, "(?<=, ).*?(?= |$)"),
           middlename = str_extract(defname, "(?<= )[[:alpha:]]*?$"),
           b_year = str_sub(def_mob, 1, 4) %>% 
             as.numeric)
  
  gender_tbl <- dbReadTable(ojo_db, "ojo_gender")
  
  dip <- di %>% 
    ungroup %>% 
    select(firstname, b_year) %>% 
    distinct() %>% 
    anti_join(gender_tbl %>% 
                mutate(b_year = ifelse(year_min == year_max, year_min, NA)) %>% 
                select(firstname = name, b_year))
  
  sex_df <- tibble()
  
  print("Getting gender data for first names . . .")
  
  total_time <- 0
  
  if (nrow(dip) > 0) {
    for (i in 1:nrow(dip)) {
      
      start_time <- Sys.time()
      
      if (is.na(dip[i, "b_year"])) {
        sex_df <- sex_df %>% 
          bind_rows(gender(dip[i, "firstname"] %>% as.character(),
                           years = c(1950, 2000),
                           method = "ssa") %>% 
                      mutate(gender = as.character(gender)))
      } else {
        sex_df <- sex_df %>% 
          bind_rows(gender(dip[i, "firstname"] %>% as.character(),
                           years = dip[i, "b_year"] %>% as.numeric,
                           method = "ssa") %>% 
                      mutate(gender = as.character(gender)))
      }
      
      total_time <- total_time + (Sys.time() - start_time)
      
      print(paste(i, "of", nrow(dip), "done. About", 
                  if_else(round((total_time/i)*(nrow(dip) - i), 1) < 60, 
                          paste(round((total_time/i)*(nrow(dip) - i), 1), "seconds remaining."),
                          paste(round((total_time/i)*(nrow(dip) - i)/60, 1), "minutes remaining.")))
      )
    }
  }
  
  gender_tbl <- gender_tbl %>% 
    bind_rows(sex_df %>% 
                mutate(newrow = TRUE,
                       gender = as.character(gender)))
  
  round2 <- di %>% 
    ungroup %>% 
    select(firstname, middlename, b_year) %>% 
    left_join(gender_tbl %>% 
                mutate(b_year = ifelse(year_min == year_max, year_min, NA)) %>% 
                select(firstname = name, b_year, proportion_male)) %>% 
    filter(proportion_male > .25 & proportion_male < .75) %>% 
    select(middlename, b_year) %>% 
    filter(nchar(middlename) > 1) %>% 
    distinct
  
  middle_sex <- tibble()
  
  total_time <- 0
  
  print("Searching for middle names")
  
  if (nrow(round2) > 0) {
    for (i in 1:nrow(round2)) {
      
      start_time <- Sys.time()
      
      if (is.na(round2[i, "b_year"])) {
        middle_sex <- middle_sex %>% 
          bind_rows(gender(round2[i, "middlename"] %>% as.character(),
                           years = c(1950, 2000),
                           method = "ssa") %>% 
                      mutate(gender = as.character(gender)))
      } else {
        middle_sex <- middle_sex %>% 
          bind_rows(gender(round2[i, "middlename"] %>% as.character(),
                           years = round2[i, "b_year"] %>% as.numeric,
                           method = "ssa") %>% 
                      mutate(gender = as.character(gender)))
      }
      total_time <- total_time + (Sys.time() - start_time)
      
      print(paste(i, "of", nrow(round2), "middle names done. About", 
                  if_else(round((total_time/i)*(nrow(round2) - i), 1) < 60, 
                          paste(round((total_time/i)*(nrow(round2) - i), 1), "seconds remaining."),
                          paste(round((total_time/i)*(nrow(round2) - i)/60, 1), "minutes remaining.")))
      )
    }
  }
  
  gender_tbl <- gender_tbl %>% 
    bind_rows(middle_sex %>% 
                mutate(newrow = TRUE,
                       gender = as.character(gender)))
  
  connect_ojo()
  
  dbWriteTable(ojo_db, "ojo_gender", 
               gender_tbl %>% 
                 filter(newrow == TRUE) %>% 
                 select(-newrow), 
               row.names = FALSE, append = TRUE)
  
  di <- di %>% 
    left_join(gender_tbl %>%
                mutate(b_year = ifelse(year_min == year_max, year_min, NA)) %>% 
                select(firstname = name, b_year, proportion_male, gender) %>% 
                distinct)
  
  di <- di %>% 
    left_join(gender_tbl %>%
                mutate(b_year = ifelse(year_min == year_max, year_min, NA)) %>% 
                select(middlename = name, b_year, proportion_male_middle = proportion_male, gender_middle = gender) %>% 
                distinct)
  
  di_all <<- di %>% 
    mutate(sex_imputed = case_when(proportion_male < .25 | proportion_male > .75 ~ gender,
                                   proportion_male_middle < .25 | proportion_male_middle > .75 ~ gender_middle,
                                   TRUE ~ "unknown"),
           sex_impute_method = case_when(proportion_male < .25 | proportion_male > .75 ~ "first name",
                                         proportion_male_middle < .25 | proportion_male_middle > .75 ~ "middle name",
                                         TRUE ~ "none")) %>% 
    filter(!str_detect(defname, "OSBI|PROFILE")) %>% 
    mutate(casetype = str_sub(casenum, 1, 2),
           file_year = str_sub(casenum, 4, 7))
  
  dbxUpsert(ojo_db, "ojo_def_info", di_all, where_cols = c("court", "casenum", "defname"))
  
  disconnect_ojo()
}
openjusticeok/ojo documentation built on Feb. 2, 2021, 5:47 a.m.