R/census_table.R

## census search results...


census_table <- function(doc, trim_parish = TRUE){
   # 1. Name  (get first node after a/)
   name <- html_nodes(doc, xpath="//div[@class='name']/a/node()[1]") %>% html_text(trim=TRUE)
## last names have space after \t,  patronyms have NO space  ???
#   "Christian\t\t\t\t\t\t Lund" vs  "Christian\t\t\t\t\t\tOlsen"
    first_name <- gsub("([^\t]+).*", "\\1", name)
    name2 <- gsub("[^\t]+\t+(.*)", "\\1", name)
    if(any(grepl("\t", name2))) message("Warning: tabs in last name")
    patronymic <- ifelse( grepl("^ ", name2), "", name2)
    last_name <- ifelse( grepl("^ ", name2), trimws(name2), "")
    name <- gsub("\t+ ?", " ", name)

   #2. gender  (venus = female, mars = male)
   gender <- html_nodes(doc, xpath="//div[@class='gender']/span/i") %>% html_attr("class")
   gender <- ifelse(gender == "fa fa-mars", "male", "female")

   #3 Birth date
   b1 <- html_nodes(doc, xpath="//div[@class='born']") %>% html_text
   birth_date <-  as.integer(gsub("[^0-9]", "", b1))

   #4. Place of birth ??

   # 5 Pos./Status
   status <- html_nodes(doc, xpath="//div[@class='stilling-stand']") %>% html_text(trim=TRUE)
   status <- gsub("Pos./Status: ", "", status)

   # 6 Residence
   residence <- html_nodes(doc, xpath="//div[@class='place']") %>% html_text(trim=TRUE)
   residence <- gsub("Domicile: ", "", residence)
  ## Trim parish IF searching by parish??
   if(trim_parish)  residence <- gsub("[^:]+: ", "", residence)

   #7 Event date	  <div class="action">
   #8 Role          <div class="role">
   #9	Source        <div class="source">

   if(!all.equal( length(name), length(gender), length(birth_date), length(status), length(residence)   )){
       stop("Parsing error: Missing one or more nodes")
   }

   x <- tibble( name, residence, gender, birth_date, status, first_name, patronymic, last_name)
   x
}
cstubben/aRkivet documentation built on May 14, 2019, 12:25 p.m.