# dev/reapr.R

library(tidyverse)
library(reapr)
library(hgr)
library(cruxjars)
library(crux)
library(lexrankr)

# Load the company list from CSV, drop the `X1` column, and rename the
# remaining columns positionally to nameCompany, idDUNS and url (so exactly
# three columns must remain after the drop).
# NOTE(review): `X1` is presumably the name readr gave to an unnamed index
# column in the file -- confirm against the CSV header.
# NOTE(review): the path is relative, so it only resolves from the directory
# that contains `Desktop/`.
df_companies <-
  read_csv("Desktop/deals/sheldon/affwerx/data/OpenTopicCos_182_183_191.csv") %>%
  select(-X1) %>%
  set_names(c("nameCompany", "idDUNS", "url"))

# Spot check: draw one DUNS id at random. The value is not assigned, so it is
# only visible when the script is run interactively / echoed.
sample(df_companies$idDUNS, 1)


# Collect the company URLs as a plain vector, leaving out missing entries.
urls <- df_companies$url[!is.na(df_companies$url)]


# Run every company URL through `crux_urls()`. The result is joined back onto
# the companies by `url` further down, so it has to carry a `url` column.
# NOTE(review): `return_message` presumably toggles progress output -- confirm
# against the crux documentation.
# `TRUE` is spelled out: `T` is an ordinary variable that can be reassigned.
df_cruxed <-
  crux_urls(urls = urls, return_message = TRUE)

# Attach the crux output to each company (matched on `url`), move the
# identifier and text columns to the front, and trim surrounding whitespace
# from every character column.
# `title`, `text` and `description` are not among the three company columns,
# so they are expected to come from `df_cruxed`.
# NOTE(review): this overwrites `df_companies`; re-running the block would join
# the crux columns a second time, so it is not idempotent.
df_companies <-
  left_join(df_companies, df_cruxed, by = "url") %>%
  select(idDUNS, nameCompany, url, title, text, description, everything()) %>%
  mutate_if(is.character, str_trim)




# crux -------------------------------------------------------------------

# Pick one company URL at random to exercise the crux helpers on.
# The column is `url` (lower case): that is the name assigned by `set_names()`,
# used as the join key, and selected by name earlier in this script. The
# previous `URL` spelling refers to a column this script never creates.
# NOTE(review): if `crux_urls()` is known to return a separate `URL` column,
# revisit this -- nothing visible here indicates that it does.
# NOTE(review): `try` shadows the name of `base::try()`; the name is kept
# because the statements that follow reference it.
try <-
  df_companies %>%
  filter(!is.na(url)) %>%
  pull(url) %>%
  sample(1)

# Call crux's `classify_url()` and `summarise_url()` on the sampled URL.
# The summary is converted with `as_tibble()`, which replaces the deprecated
# `as_data_frame()` alias. The trailing argument-less `rename()` was dropped:
# with no arguments it renames nothing and returns its input unchanged.
df_classified <- crux::classify_url(x = try)
df_summary <-
  crux::summarise_url(x = try) %>%
  as_tibble()
# abresler/govtrackR documentation built on July 11, 2020, 12:30 a.m.