R/census_search.R

Defines functions census_search

Documented in census_search

#' Search census records at the Norway Digital Archives
#'
#' Formats the Search for Persons in the 1801 census at
#' \url{https://digitalarkivet.arkivverket.no/en-gb/ft/sok/1801}.
#' Change the year to 1865, 1875, 1900, or 1910 to search other censuses
#'
#' @param year Census year, 1801, 1865, 1875, 1900, or 1910 (other years are partial censuses)
#' @param municipal Municipal code, one or more codes like 1638 for Orkdal or c(1638, 1636) for Orkdal and Meldal
#' @param county County code, first two letters of municipal code for all parishes in county
#' @param region Region code, 1 to 5 for ...
#' @param first_name First name
#' @param last_name Last name or patronym
#' @param gender Gender, m or f
#' @param family_position Family position
#' @param marital_status Marital status
#' @param occupation Occupation or job
#' @param date_of_birth Date of birth in yyyy-mm-dd format, use 175* for anyone born in the 1750s
#' @param age estimated Age at census
#' @param notes Notes
#' @param residence  Residence or domicile
#' @param district Census district
#' @param house House number
#' @param all Default search gets first page with 50 records, set all=TRUE to download all results
#' @param \dots Other options like \code{trim_parish} passed to \code{\link{census_table}}
#' @return A data.frame with name, residence, gender, birth_date, status, first_name, patronymic,
#' last_name.  The last three columns are parsed from name in the html file (first is before a
#' string of tabs, patronym is right after the tabs or last is after tabs AND a space)
#' @note Municipal codes in code{\link{municipal}}.
#' @author Chris Stubben
#' @examples
#' \dontrun{
#'   # set all=TRUE to get all 68 Siverts in Orkdal, default is first page only
#'   census_search(1801, 1638, first="Siv*", all=TRUE)
#'   # search in Meldal and Orkdal.  Set trim_parish=FALSE to keep parish in residence
#'   census_search(1801, municipal=c(1636, 1638), first="Lars", last="Siv*", trim_parish=FALSE)
#' }
#' @export

census_search  <- function( year, municipal=NULL, county=NULL, region=NULL,
        first_name=NULL, last_name=NULL, gender=NULL, family_position=NULL,
        marital_status=NULL, occupation=NULL, date_of_birth=NULL, age=NULL,
        notes=NULL, residence=NULL,  district=NULL,  house=NULL, all=FALSE, ...){
   url1 <- paste0("https://digitalarkivet.no/en/census/search/", year)
   l1 <- list(`m[]` = municipal, `c[]` = county, `r[]` = region)
    x <- unlist(l1)
   if( any( !grepl("^[0-9]+$", x))) stop("Locations should be numeric codes")
   ## make new list if more than one location code (vectors not allowed in compose_query)
   if( any( sapply(l1, length)>1)){
      l1 <- as.list(x)
      names(l1) <- names(x)
   }




   # skip ny_husholdning=new_household,   (1 for 1st person or x for Lodger)
   l2 <- list(fornavn=first_name, etternavn=last_name, kjonn=gender, familiestilling=family_position,
         sivilstand=marital_status, yrke=occupation, fodselsaar=date_of_birth, alder=age,
         merknader=notes, bosted=residence, bydel=district, gaardsnummer=house)
   n <- sapply(l2, length)
   if(any(n > 1)) stop("Vectors are not allow in search fields")
   url2 <- httr:::compose_query( c( l1, l2) )
   url <- paste0( url1, "?", url2)
   message("Getting ", url)
   doc <- read_html(url)
   results <- html_nodes(doc, xpath="//span[@data-hits]") %>% html_text
   n <- as.numeric( gsub(".* ([0-9]+) hit.*", "\\1", results))
   message(results)
   if(n > 0){
      n1 <- ceiling(n/50)
      y <- vector("list", n1)
      y[[1]] <- census_table(doc, ...)
      if(all){
         if(n1 > 40) stop("Too many results to download.  Try narrowing search")
         if(n1 > 1 ){
            for( i in 2: n1){
               message(" Getting page ", i)
               url2 <- paste0(url, "&page=", i)
               doc2 <- read_html(url2)
               y[[i]] <- census_table(doc2, ...)
               Sys.sleep(sample(1:3))
            }
         }
      }else{
         if(n > 50) message("Only downloading first 50 records. Set all=TRUE to download all ", n, " results")
      }
      x <- bind_rows(y)
      x
   }
}
cstubben/norwayr documentation built on May 14, 2019, 12:25 p.m.