R/master_data.R

Defines functions: make_master, make_master_with_map_list, list_by_pid, read_litigation_data, load_item, save_and_make_map_files, read_map_files, split_litigation, read_grant_data, read_application_data, make_cancer_split_with_master

# applications, grants, cancer, and litigation


# Scratch build script. The whole block is guarded by `if (FALSE)`, so it is
# never executed when this file is sourced; presumably the statements are
# kept here to be run by hand, top to bottom.
if (FALSE) {
  # Step 1: load the four raw inputs.
  cancer <- read_cancer_data()
  appl_bib <- load_item("_raw_data/appl_bib.Rdata", "appl_bib")
  grant_bib <- load_item("_raw_data/grant_bib_2.Rdata", "grant_bib")
  lit_cancer <- read_litigation_data()

  # Step 2: eyeball the structure of each input.
  library(tibble)
  glimpse(cancer, 120)
  glimpse(appl_bib, 120)
  glimpse(grant_bib, 120)
  glimpse(lit_cancer, 120)

  # Step 3: split every input by pid and write the resulting maps to
  # `_raw_data/*_map.rds` (the cancer split alone is announced as ~35 mins).
  save_and_make_map_files(
    cancer = cancer,
    application = appl_bib,
    grant = grant_bib,
    litigation = lit_cancer
  )

  # Step 4: read the saved maps back in.
  map_list <- read_map_files()

  # Step 5: build the per-pid master table, then regroup it by appnum.
  master <- make_master_with_map_list(map_list)
  cancer_split_by_appnum <- cancer %>% select(pid_num, appnum) %>% list_by_pid("appnum")
  # NOTE(review): `load_all()` is presumably devtools::load_all(), used to
  # reload the package code before the last step -- confirm.
  load_all(); cancer_split <- make_cancer_split_with_master(master, cancer_split_by_appnum)

}


# Regroup the per-pid master table by application number.
#
# master: table with a `pid_num` column and the list-columns `cancer`,
#   `application`, `grant` and `litigation` (one nested value per row).
# cancer_split_by_appnum: list of tables, each with `pid_num` and `appnum`
#   columns (one element per appnum).
#
# For every element, the master rows whose `pid_num` occurs in that element
# are looked up and their nested values are row-bound. Returns whatever
# lapply_pb() returns for these records; each record is a list with
#   appnum      first appnum value of the element
#   cancer_app  cancer rows for master rows with a 7-character pid_num
#   cancer_pub  cancer rows for the remaining master rows
#   application / grant / litigation  non-NULL nested values
# Every entry except `appnum` is a length-one list holding either the bound
# table or NULL when nothing was selected.
make_cancer_split_with_master <- function(master, cancer_split_by_appnum) {

  # Row-bind the selected nested values; list(NULL) when nothing is selected.
  bind_selected <- function(values, keep) {
    if (!any(keep)) {
      return(list(NULL))
    }
    list(bind_rows(values[keep]))
  }

  # Select the entries of a list-column that actually hold a value.
  bind_present <- function(values) {
    bind_selected(values, !vapply(values, is.null, logical(1)))
  }

  cancer_split_by_appnum %>%
    lapply_pb(function(cancer_app) {
      pid_num_vals <- cancer_app$pid_num

      master_app <- master %>% filter(pid_num %in% pid_num_vals)

      # The mask must be derived from the filtered master rows themselves:
      # they need not have the same length or order as `pid_num_vals`, so a
      # mask built from `pid_num_vals` would be recycled or misaligned when
      # used to index `master_app$cancer`.
      is_app_pid <- nchar(master_app$pid_num) == 7

      list(
        appnum = cancer_app$appnum[1],
        cancer_app = bind_selected(master_app$cancer, is_app_pid),
        cancer_pub = bind_selected(master_app$cancer, !is_app_pid),
        application = bind_present(master_app[["application"]]),
        grant = bind_present(master_app[["grant"]]),
        litigation = bind_present(master_app[["litigation"]])
      )
    })
}



# Load the application bibliography: the `appl_bib` object stored in
# "_raw_data/appl_bib.Rdata", fetched through load_item().
read_application_data <- function() {
  rdata_path <- "_raw_data/appl_bib.Rdata"
  object_name <- "appl_bib"
  load_item(rdata_path, object_name)
}
# Load the grant bibliography: the `grant_bib` object stored in
# "_raw_data/grant_bib_2.Rdata", fetched through load_item().
read_grant_data <- function() {
  rdata_path <- "_raw_data/grant_bib_2.Rdata"
  object_name <- "grant_bib"
  load_item(rdata_path, object_name)
}

# Expand a table so that every pid stored in a list-column gets its own row,
# then split the expanded table by that pid.
#
# dt:   table whose column `name` holds, per row, a vector of pid values.
# name: name of that column (default "pid_nums").
#
# Returns list_by_pid() applied to the expanded table, keyed on the added
# `contains_pid_num` column. Rows whose pid entry is empty or NULL do not
# appear in the result.
#
# Implemented with base R only, so it neither attaches packages as a side
# effect nor depends on the deprecated data_frame().
split_litigation <- function(dt, name = "pid_nums") {

  pid_list <- dt[[name]]

  # Repeat each row index once per pid it carries; lengths() is 0 for empty
  # or NULL entries, which drops those rows.
  row_index <- rep(seq_along(pid_list), lengths(pid_list))

  pid_values <- unlist(pid_list, use.names = FALSE)
  if (is.null(pid_values)) {
    # unlist() yields NULL when there is nothing to flatten; keep a
    # zero-length column so the assignment below still creates it.
    pid_values <- character(0)
  }

  tall_dt <- dt[row_index, ]
  tall_dt$contains_pid_num <- pid_values

  list_by_pid(tall_dt, "contains_pid_num")
}

# Read the four saved pid maps back from `_raw_data/`.
#
# Emits one message per file before reading it. Returns a named list with
# the entries `cancer`, `application`, `grant` and `litigation`, in that
# order, each holding the object stored in the corresponding .rds file.
read_map_files <- function() {
  map_files <- c(
    cancer = "_raw_data/cancer_map.rds",
    application = "_raw_data/application_map.rds",
    grant = "_raw_data/grant_map.rds",
    litigation = "_raw_data/litigation_map.rds"
  )

  loaded <- lapply(names(map_files), function(map_name) {
    map_path <- map_files[[map_name]]
    message("Reading ", map_name, " file: ", map_path)
    readRDS(map_path)
  })

  stats::setNames(loaded, names(map_files))
}

# Split each supplied table by pid, save every split to `_raw_data/`, and
# return the splits.
#
# cancer:      table split on "pid_num"             (e.g. read_cancer_data())
# application: table split on "publication_number"  (e.g. read_application_data())
# grant:       table split on "publication_number"  (e.g. read_grant_data())
# litigation:  table split on "pid_nums" via split_litigation()
#                                                   (e.g. read_litigation_data())
#
# Arguments left as NULL are skipped. Progress is reported with message()
# and the elapsed split time is printed. Returns a named list containing one
# map per supplied table.
save_and_make_map_files <- function(
  cancer = NULL,
  application = NULL,
  grant = NULL,
  litigation = NULL
) {

  # Split one table, report how long the split took, persist it, return it.
  split_and_store <- function(data, key, rds_path, splitter = list_by_pid) {
    message("splitting...")
    started_at <- Sys.time()
    pid_map <- splitter(data, key)
    elapsed <- Sys.time() - started_at
    print(elapsed)
    message("saving...")
    saveRDS(pid_map, rds_path)
    pid_map
  }

  maps <- list()

  if (!is.null(cancer)) {
    message("Splitting cancer ~35 mins")
    maps$cancer <- split_and_store(
      cancer, "pid_num", "_raw_data/cancer_map.rds"
    )
  }

  if (!is.null(application)) {
    message("Splitting application ~6 mins")
    maps$application <- split_and_store(
      application, "publication_number", "_raw_data/application_map.rds"
    )
  }

  if (!is.null(grant)) {
    message("Splitting grant ~3 mins")
    maps$grant <- split_and_store(
      grant, "publication_number", "_raw_data/grant_map.rds"
    )
  }

  if (!is.null(litigation)) {
    message("Splitting litigation <1 min")
    maps$litigation <- split_and_store(
      litigation, "pid_nums", "_raw_data/litigation_map.rds",
      splitter = split_litigation
    )
  }

  message("Done!")
  maps
}

# Load one named object from an .Rdata file without touching the caller's
# workspace.
#
# file: path to a file written by save().
# name: name of the object to extract from that file.
#
# Prints "loading file: <file>" to stdout, loads the file into a private
# environment and returns the object called `name`. Stops with an error if
# the file does not contain such an object.
load_item <- function(file, name) {
  env <- new.env()
  cat("loading file: ", file, "\n", sep = "")
  load(file, envir = env)

  # Look only inside the freshly loaded environment. With the default
  # `inherits = TRUE`, a name missing from the file would silently resolve
  # to an unrelated object in an enclosing or base environment.
  if (!exists(name, envir = env, inherits = FALSE)) {
    stop("object '", name, "' not found in file: ", file, call. = FALSE)
  }
  get(name, envir = env, inherits = FALSE)
}

# Load the litigation records and stack them into one table, one row per
# record.
#
# file: path to an .Rdata file holding an object named `lit_cancer`, which
#   is iterated as a collection of per-case lists.
#
# For every record:
#   * related_patents$doc_num is cleaned with remove_pid_leading_zero();
#   * `pid_nums` is added as a length-one list holding those cleaned values;
#   * related_patents, defendant_atts, plaintiff_atts, defendant and
#     plaintiff are each wrapped in a length-one list.
# The records are then combined with bind_rows().
#
# Stops with an error naming the offending fields if, after wrapping, any
# field of a record does not have length 1 (this replaces a leftover
# print() + browser() debugging hook).
read_litigation_data <- function(file = "_raw_data/lit_cancer.Rdata") {

  lit_cancer <- load_item(file, "lit_cancer")

  nested_fields <- c(
    "related_patents",
    "defendant_atts",
    "plaintiff_atts",
    "defendant",
    "plaintiff"
  )

  lit_cancer %>%
    lapply(function(lit_item) {

      lit_item$related_patents$doc_num <-
        remove_pid_leading_zero(lit_item$related_patents$doc_num)

      # Must happen before related_patents itself is wrapped in a list.
      lit_item$pid_nums <- list(lit_item$related_patents$doc_num)

      # `[[` matches names exactly, so "defendant" can never pick up
      # "defendant_atts" the way `$` partial matching could.
      for (field in nested_fields) {
        lit_item[[field]] <- list(lit_item[[field]])
      }

      bad_fields <- which(lengths(lit_item) != 1)
      if (length(bad_fields) > 0) {
        labels <- names(bad_fields)
        if (is.null(labels)) {
          labels <- as.character(bad_fields)
        }
        stop(
          "litigation item has fields whose length is not 1: ",
          paste(labels, collapse = ", "),
          call. = FALSE
        )
      }

      lit_item
    }) %>%
    bind_rows()
}

# Split a table into a list of sub-tables, one per distinct value of a key
# column.
#
# obj: object accepted by split() that has an element/column named `key`.
# key: name of the column whose values define the groups.
#
# Returns the result of split(): a list named by the distinct key values.
list_by_pid <- function(obj, key = "publication_number") {
  group_values <- obj[[key]]
  split(obj, f = group_values)
}

# Assemble one master table keyed by pid_num from the four pid maps.
#
# map_list: list with the entries "cancer", "application", "grant" and
#   "litigation", each looked up by pid_num with `[[`.
#
# Only the pid_nums that name entries of the cancer map are processed. While
# running, a marker line is written to stdout whenever a pid has a grant,
# application or litigation entry. The resulting table is saved to
# "_raw_data/master_by_pid_data.rds" and returned invisibly.
make_master_with_map_list <- function(map_list) {

  cancer_lookup <- map_list[["cancer"]]
  application_lookup <- map_list[["application"]]
  grant_lookup <- map_list[["grant"]]
  litigation_lookup <- map_list[["litigation"]]

  pid_nums <- names(cancer_lookup)
  # Debugging subset, disabled:
  # pid_nums <- pid_nums[c(c(157000:158000), (length(pid_nums)-3000):length(pid_nums))]

  # Column-wise alternative, disabled:
  # data_frame(
  #   pid_num = pid_nums,
  #   cancer = cancer_map,
  #   application = application_map[pid_nums],
  #   grant = grant_map[pid_nums],
  #   litigation = litigation_map[pid_nums]
  # )

  # Build the one-row record for a single pid; every nested value is wrapped
  # in a length-one list so that it lands in a list-column.
  build_row <- function(pid_num) {
    cancer_val <- cancer_lookup[[pid_num]]
    grant_val <- grant_lookup[[pid_num]]
    application_val <- application_lookup[[pid_num]]
    litigation_val <- litigation_lookup[[pid_num]]

    if (!is.null(grant_val)) cat("grant!\n")
    if (!is.null(application_val)) cat("application!\n")
    if (!is.null(litigation_val)) cat("litigation!\n")

    list(
      pid_num = pid_num,
      cancer = list(cancer_val),
      grant = list(grant_val),
      application = list(application_val),
      litigation = list(litigation_val)
    )
  }

  master <- bind_rows(lapply_pb(pid_nums, build_row))

  saveRDS(master, "_raw_data/master_by_pid_data.rds")

  invisible(master)
}

# Build a master table by filtering the raw tables once per pid_num.
#
# cancer:       table with a `pid_num` column.
# grants:       table with a `publication_number` column.
# applications: table with a `publication_number` column.
# litigation:   table with a `pid_nums` list-column (one vector of pids per
#   row).
# n_max:        how many leading values of `cancer$pid_num` to process
#   (default 100, the limit that used to be hard-coded).
#
# Returns the bind_rows() of one record per processed pid_num, with the
# columns `pid_num`, `cancer`, `grants`, `applications` and `litigation`;
# the last four are list-columns holding the matching rows of each input.
#
# Fixes relative to the earlier version: the litigation pids are read from
# the `litigation` argument rather than from a global `lit_cancer`, and
# `litigation_dt` is actually computed (previously it was referenced but
# never defined).
make_master <- function(cancer, grants, applications, litigation, n_max = 100) {

  # Hoisted out of the loop: the per-row pid vectors never change.
  lit_pids <- litigation[["pid_nums"]]

  cancer$pid_num %>%
    head(n_max) %>%
    lapply_pb(function(pid_num_val) {
      cancer_dt <- cancer %>% filter(pid_num == pid_num_val)
      grants_dt <- grants %>% filter(publication_number == pid_num_val)
      applications_dt <- applications %>%
        filter(publication_number == pid_num_val)

      # A litigation row matches when its pid vector contains this pid.
      lit_rows <- vapply(
        lit_pids,
        function(pid_vals) pid_num_val %in% pid_vals,
        logical(1)
      )
      litigation_dt <- litigation[lit_rows, , drop = FALSE]

      list(
        pid_num = pid_num_val,
        cancer = list(cancer_dt),
        grants = list(grants_dt),
        applications = list(applications_dt),
        litigation = list(litigation_dt)
      )
    }) %>%
    bind_rows()
}
hafen/xdata-hackathon-uspto documentation built on July 20, 2017, 6:31 a.m.