R/advanced_search.R

#' Advanced search in the Norway Digital Archives
#'
#' Searches the Advanced Person Search page at
#' \url{https://www.digitalarkivet.no/en/search/persons/advanced}
#'
#' @param municipals a vector of municipal codes or names (names are looked up in the \code{municipal} dataset)
#' @param county a vector of county codes, the first two digits of the municipal code
#' @param region a vector of region codes
#' @param event Event year, or a vector giving a range of years, e.g. \code{c(1750, 1800)}
#' @param birth Birth year, or a vector giving a range of years
#' @param period Date or year range passed to the \code{from} and \code{to} fields in the search url
#' @param firstname First name
#' @param lastname Last name
#' @param gender Gender, "m" or "f"
#' @param role Role like barn, far, mor, fadder, brudgom, brur
#' @param date_event Event date in yyyy-mm-dd format
#' @param date_birth Birth date in yyyy-mm-dd format
#' @param place_birth Place of birth
#' @param domicile Domicile, residence or farm name
#' @param position Family position, marital status or occupation
#' @param disable_name_variants if TRUE (the default), add the disable_name_variants flag to the search url
#' @param all Default search gets the first page with 50 records, set all=TRUE to download all results
#' @return A \code{tbl_df} with name, residence, year, type, role, event, birth, place, family position, source and url
#' @note Municipal codes are listed in \code{\link{municipal}}. Use wildcard searches with * since
#' most names and places have alternate spellings like Michel or Mikkel, Larsen or Larssen, etc.
#' @seealso \code{\link{get_records}} to loop through search results and get individual records
#' @author Chris Stubben
#' @examples
#' \dontrun{
#'   # search for Sivert Larsen Stubben
#'   advanced_search(1638, firstname="Siv*", lastname="Lars*", domicile="Stub*")
#'   # Siverts in 1799
#'   advanced_search(1638, firstname="Siv*", event=1799)
#'   advanced_search(1638, date_event="1776-07-04")
#'   # Mikkel or Michel Stubben (but not Mildri or Mimi)
#'   advanced_search(1638, firstname="Mi*", domicile="Stub*", gender="m")
#' }
#' @export


advanced_search <- function(municipals=NULL, county=NULL, region=NULL,
   event=NULL, birth=NULL, period=NULL,
   firstname=NULL, lastname=NULL,
   gender=NULL, role=NULL,
   date_event=NULL, date_birth=NULL, place_birth=NULL, domicile=NULL,
   position=NULL, disable_name_variants=TRUE,
   all=FALSE){

    url1 <- "https://www.digitalarkivet.no/en/search/persons/advanced"
    municode <- municipals
    ## lookup names in municipal
    if(!all( grepl("^[0-9]+$", municode))){
      #data(municipal)
      municode  <- municipal$code[ municipal$name %in% municode ]
   }
    l1 <- list(`m[]` = municode , `c[]` = county, `r[]` = region, `gender[]` = gender, `role[]` = role)
    # compose_query requires values with length 1
    if( any( sapply(l1, length)>1)) l1 <- as.list(setNames(unlist(l1),rep(names(l1), lengths(l1))))
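   # e.g. list(`m[]` = c("1638", "1640")) is flattened to two length-1 entries,
   # both named "m[]", giving repeated m[]=... parameters in the query string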
   ## use "1" for TRUE , NULL for anything else
   if(!is.null(disable_name_variants)) disable_name_variants <- switch(disable_name_variants==TRUE, "1")
   l2 <- list(firstname=firstname, lastname=lastname,
      event_year_from=event[1], event_year_to=event[length(event)], event_date=date_event,
      from=period[1], to=period[length(period)],
      birth_year_from=birth[1], birth_year_to=birth[length(birth)], birth_date=date_birth,
      birth_place=place_birth,
      domicile=domicile, position=position, disable_name_variants=disable_name_variants)
   n <- sapply(l2, length)
   if(any(n > 1)) stop("Vectors are not allowed in ", names(n[n > 1]))
   url2 <- httr:::compose_query(c(l1, l2))
   url <- paste0(url1, "?", url2)

   message("Getting ", url)
   doc <- XML::htmlParse( suppressWarnings( readLines(url) )   )
   results <- XML::xpathSApply(doc, "//p[@class='comment standalone']", XML::xmlValue)
    n <- as.numeric( gsub(".* ([0-9]+) hit.*", "\\1", results))
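   # e.g. a results line containing "163 hits" yields n = 163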
   message(results)
   x <- XML::readHTMLTable(doc , stringsAsFactors=FALSE , which=1)
   # lower-case first word for column name
   names(x) <- tolower(gsub("([^ ]+).*", "\\1", names(x) ))
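   # keep only the first word of each header, lower-cased (e.g. "Event date" -> "event")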
   x$url <- XML::xpathSApply(doc, "//td/a", XML::xmlGetAttr, "href")
   if(all){
      n <- ceiling(n/50)
      y <- vector("list", n)
      y[[1]] <- x
      for(i in seq_len(n)[-1]){  # remaining pages (page 1 already downloaded)
         message(" Getting page ", i)
         url2 <- paste0(url, "&page=", i)
         doc <- XML::htmlParse(suppressWarnings( readLines(url2) )  )
         x <- XML::readHTMLTable(doc , stringsAsFactors=FALSE , which=1)
         names(x) <- tolower(gsub("([^ ]+).*", "\\1", names(x) ))
         x$url <- XML::xpathSApply(doc, "//td/a", XML::xmlGetAttr, "href")
         y[[i]] <- x
         Sys.sleep(sample(1:3, 1))
      }
      x <- do.call("rbind", y)
   }else{
      if(n > 50) message("Only downloading first 50 records. Set all=TRUE to download all ", n, " results")
   }
   ## change dd.mm.yyyy to yyyy.mm.dd for sorting
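   ## e.g. "04.07.1776" becomes "1776.07.04"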
   x$event <- sapply(strsplit(x$event, "\\."), function(y) paste(rev(y), collapse="."))
   x <- x[order(x$event),]
   rownames(x) <- NULL
   ## add record type by parsing url string
   y <- gsub( ".*/en-gb/([^/]+/[^/]+).*", "\\1", x$url)
   utype <- c("ft/person", "gen/vis", "kb/df", "kb/dp", "kb/gr", "kb/in",
      "kb/kf", "kb/pa", "kb/uf", "kb/va", "kb/vi")
   rtype <- c("census", "tax", "stillbirth", "baptism", "death", "introduced", "confirmation",
           "absolutions", "migration", "vaccination", "marriage")
   n1 <- match( y, utype)
   if(any(is.na(n1))) message("Warning: missing record type")
   x$type <- rtype[n1]
   ## add year,  if missing, check source
   n1 <- x$event == ""
   if(sum(n1)>0)  x$event[n1] <- gsub(".*?([0-9]{4}).*", "\\1", x$source[n1])
   # strip the trailing ",00" seen in some vaccination record dates
   x$event <- gsub(",00$", "", x$event)
   x$year <- as.numeric(substr( x$event, 1,4))
   ## change date to birth
   names(x)[2] <- "birth"
   x <- x[, c(1, 5, 11, 10, 7,6, 2:4, 8:9)]
   dplyr::tbl_df(x)
}