# data-raw/compile_br.R

# install.packages("rvest")
# library(rvest)
library(tibble)
library(dplyr)
# first, just get a couple of examples from ON and QC (with accents)

# then scrape this website to get a complete list.
# http://www.ic.gc.ca/app/ccc/sld/cmpny.do?letter=A&lang=eng&profileId=1921&naics=441#A
# go through the list of companies, find header 'Location Address', then go to next thing, get address bit.

# or maybe focus on Aboriginal owned companies first.
# http://www.ic.gc.ca/app/ccc/srch/srch.do?lang=eng&profileId=&prtl=1&searchCriteriaBean.portal=1&V_SEARCH.documentJSP=%2FretrieveEstablishmentIdFromVerity.do&searchPage=cccSrch.do%253Flang%253Deng%2526prtl%253D1%2526tagid%253D&searchCriteriaBean.resultJsp=%2Fresults.do&V_SEARCH.scopeCategory=CCC.Root&V_SEARCH.depth=1&V_SEARCH.showStricts=false&searchCriteriaBean.conceptOperator=and&searchCriteriaBean.naicsCodeText=44%2C+45&searchCriteriaBean.isExportingOrInterested=exportingActively&searchCriteriaBean.specializedDirectory=248&searchCriteriaBean.hitsPerPage=10&searchCriteriaBean.sortSpec=title+asc&searchCriteriaBean.isSummaryOn=Y&sbmtBtn=Search

# federal companies, etc.


# Download and parse one page of search results from the ic.gc.ca company
# directory (presumably the Aboriginal-owned firms search mentioned above).
# `read_html()` is called through the rvest namespace because
# `library(rvest)` is commented out at the top of this script, so the bare
# function name would not be found.
# NOTE(review): the URL carries a V_TOKEN value that looks session-specific;
# confirm it still resolves before relying on `abo_firm`.
abo_firm <- rvest::read_html(
  "http://www.ic.gc.ca/app/ccc/srch/nvgt.do?V_SEARCH.command=navigate&V_TOKEN=1509898996415&V_SEARCH.docsStart=1&lang=eng&prtl=1&V_SEARCH.resultsJSP=/results.do&profileId="
)
# To extract the rating, we start with selectorgadget to figure out which css selector matches the data we want: strong span. (If you haven’t heard of selectorgadget, make sure to read vignette("selectorgadget") - it’s the easiest way to determine which selector extracts the data that you’re interested in.) We use html_node() to find the first node that matches that selector, extract its contents with html_text(), and convert it to numeric with as.numeric():
# abo_firm %>%
#   html_nodes("div") %>% html_text()



# Hand-built sample table of five firms located in Dawson Creek, BC.
# One name deliberately contains an accented character ("Réntals").
# Columns: id, name, address, city, province, postal_code.
br <- tibble(
  name = c(
    "A.-B. SECURITY",
    "Armada Security Canada",
    "Halfway River Mountainview Safety Limited",
    "RNN Sales & Réntals",
    "Tim Tom Construction & Concrete"
  ),
  address = c(
    "Unit 212, 833 103 Ave",
    "9605 14 St",
    "801 102 Ave, Golden Mile Shopping Centre",
    "P.O. Box 143, Main Stn",
    "1205 116th Ave, #499"
  ),
  city = rep_len("DAWSON CREEK", 5),
  # Province code rather than the city name repeated: the V1G postal codes
  # below belong to British Columbia.
  province = rep_len("BC", 5),
  postal_code = c("V1G2G2", "V1G3Y1", "V1G2B4", "V1G4E9", "V1G4P5")
) %>%
  # Adds `id` as a character column ("1".."5"); the joins further down
  # match on character ids.
  tibble::rownames_to_column(var = "id")
br

# then make a fake TCOD. With a couple of observations from one to the other.

# tcod:
# origin, destination, name, address
# or cross, then sample

# Build the fake `tcod` table: pairs of firms from `br` with the details of
# both sides attached, then a random sample of those pairs.

# Fix the RNG state so the sampled rows are the same on every run of this
# script (this also makes later random draws in the script repeatable).
set.seed(20171105)

# Every ordered pair of distinct firm ids. The ids come straight from `br`
# (already character), so no renaming or type conversion is needed.
tcod <- expand.grid(id.x = br$id, id.y = br$id, stringsAsFactors = FALSE) %>%
  tibble::as_tibble() %>%
  filter(id.x != id.y)

# Attach the firm details for each side of the pair; shared column names
# receive the default `.x` / `.y` suffixes (name.x, address.y, ...).
tcod <- tcod %>%
  left_join(br, by = c("id.x" = "id")) %>%
  left_join(br, by = c("id.y" = "id"))

# Draw 70% of the pairs with replacement, so a pair can appear more than once.
tcod <- tcod %>% sample_frac(size = 0.7, replace = TRUE)
tcod

# use gsub to randomly replace characters,
# maybe do it after the matches so I don't have to calculate them by hand.
# Print the origin-side addresses before any characters are altered.
tcod$address.x

# Try out the random-replacement idea: pick one lowercase letter at random
# and replace every occurrence of it in the origin addresses with another
# randomly chosen letter. The result is only printed, not stored in `tcod`.
# `fixed = TRUE` treats the single letter as a literal rather than a regex.
# (The interactive `?gsub` help lookup was dropped; it does not belong in a
# script.)
typo_from <- sample(letters, size = 1)
typo_to <- sample(letters, size = 1)
gsub(
  pattern = typo_from,
  replacement = typo_to,
  x = tcod$address.x,
  fixed = TRUE
)
# ok, cool. this data.
# tweed1e/matchtools documentation built on May 29, 2019, 10:51 a.m.