# install.packages("rvest")
# library(rvest)
library(tibble)
library(dplyr)
# first, just get a couple of examples from ON and QC (with accents)
# then scrape this website to get a complete list.
# http://www.ic.gc.ca/app/ccc/sld/cmpny.do?letter=A&lang=eng&profileId=1921&naics=441#A
# go through the list of companies, find header 'Location Address', then go to next thing, get address bit.
# or maybe focus on Aboriginal owned companies first.
# http://www.ic.gc.ca/app/ccc/srch/srch.do?lang=eng&profileId=&prtl=1&searchCriteriaBean.portal=1&V_SEARCH.documentJSP=%2FretrieveEstablishmentIdFromVerity.do&searchPage=cccSrch.do%253Flang%253Deng%2526prtl%253D1%2526tagid%253D&searchCriteriaBean.resultJsp=%2Fresults.do&V_SEARCH.scopeCategory=CCC.Root&V_SEARCH.depth=1&V_SEARCH.showStricts=false&searchCriteriaBean.conceptOperator=and&searchCriteriaBean.naicsCodeText=44%2C+45&searchCriteriaBean.isExportingOrInterested=exportingActively&searchCriteriaBean.specializedDirectory=248&searchCriteriaBean.hitsPerPage=10&searchCriteriaBean.sortSpec=title+asc&searchCriteriaBean.isSummaryOn=Y&sbmtBtn=Search
# federal companies, etc.
# Fetch one page of search results from the Canadian Company Capabilities
# directory and parse it into an HTML document.
# `read_html()` is namespace-qualified because `library(rvest)` is commented
# out at the top of this script; an unqualified call fails with
# "could not find function".
# NOTE(review): the V_TOKEN query parameter looks like a session token and may
# have expired -- confirm the URL still returns results before relying on it.
abo_firm_url <- paste0(
  "http://www.ic.gc.ca/app/ccc/srch/nvgt.do",
  "?V_SEARCH.command=navigate",
  "&V_TOKEN=1509898996415",
  "&V_SEARCH.docsStart=1",
  "&lang=eng",
  "&prtl=1",
  "&V_SEARCH.resultsJSP=/results.do",
  "&profileId="
)
abo_firm <- rvest::read_html(abo_firm_url)
# To extract the rating, we start with selectorgadget to figure out which css selector matches the data we want: strong span. (If you haven’t heard of selectorgadget, make sure to read vignette("selectorgadget") - it’s the easiest way to determine which selector extracts the data that you’re interested in.) We use html_node() to find the first node that matches that selector, extract its contents with html_text(), and convert it to numeric with as.numeric():
# abo_firm %>%
# html_nodes("div") %>% html_text()
# Hand-built sample of business-register (`br`) records: five firms located in
# Dawson Creek, with names that contain punctuation and an accented character
# ("Réntals").
br <- tibble(
  name = c(
    "A.-B. SECURITY",
    "Armada Security Canada",
    "Halfway River Mountainview Safety Limited",
    "RNN Sales & Réntals",
    "Tim Tom Construction & Concrete"
  ),
  address = c(
    "Unit 212, 833 103 Ave",
    "9605 14 St",
    "801 102 Ave, Golden Mile Shopping Centre",
    "P.O. Box 143, Main Stn",
    "1205 116th Ave, #499"
  ),
  # Length-one values are recycled by tibble() to the length of `name`.
  city = "DAWSON CREEK",
  # The V1G postal codes below belong to British Columbia; this column
  # previously repeated the city name.
  # NOTE(review): uses the two-letter code -- switch to the spelled-out name
  # if the data this is matched against spells provinces out.
  province = "BC",
  postal_code = c("V1G2G2", "V1G3Y1", "V1G2B4", "V1G4E9", "V1G4P5")
) %>%
  # Adds `id` as the first column, holding the row names as character
  # ("1" .. "5"); it is the join key used later in the script.
  tibble::rownames_to_column(var = "id")
br
# then make a fake TCOD. With a couple of observations from one to the other.
# tcod:
# origin, destination, name, address
# or cross, then sample
# Build the fake `tcod` table from `br`.
# Step 1: every ordered pair (id.x, id.y) of firm ids, excluding self-pairs.
# Ids are taken from `br$id` so they always have the same type as the join key
# (no hard-coded 1:5 and no as.character() round-trip). `as_tibble()` replaces
# the deprecated `as.tibble()`.
tcod <- expand.grid(
  id.x = br$id,
  id.y = br$id,
  stringsAsFactors = FALSE
) %>%
  tibble::as_tibble() %>%
  filter(id.x != id.y)

# Step 2: attach the firm details for each side of the pair. Because both joins
# bring in the same column names, dplyr suffixes them with `.x` and `.y`
# (name.x, address.x, ..., name.y, address.y, ...).
tcod <- tcod %>%
  left_join(br, by = c("id.x" = "id")) %>%
  left_join(br, by = c("id.y" = "id"))

# Step 3: draw 70% of the row count WITH replacement, so the same pair can
# appear more than once. No seed is set in this script, so the sample differs
# between runs.
tcod <- tcod %>%
  sample_frac(size = 0.7, replace = TRUE)
tcod
# use gsub to randomly replace characters,
# maybe do it after the matches so I don't have to calculate them by hand.
# Show the `.x`-side addresses that the substitution below operates on.
tcod$address.x
# Replace every occurrence of one randomly chosen lowercase letter with another
# randomly chosen lowercase letter (see help("gsub") for details; the
# interactive `?gsub` lookup was removed so the script runs unattended).
# `fixed = TRUE` makes the pattern an explicit literal match.
# The two letters are drawn independently, so they can coincide (a no-op), and
# the result is only displayed -- it is not written back into `tcod`.
gsub(
  pattern = sample(letters, size = 1),
  replacement = sample(letters, size = 1),
  x = tcod$address.x,
  fixed = TRUE
)
# ok, cool. this data.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.