# R/query_api_dk.R

#' Query the virk.dk `offentliggoerelser` search API
#'
#' Sends a range query on `offentliggoerelsesTidspunkt` to
#' `distribution.virk.dk`, pages through every result with the scroll
#' endpoint and flattens selected fields of each hit into a tibble. Progress
#' is reported with `message()` after every scroll page.
#'
#' @param pub_end_date Last day reports are wanted from. The value is pasted
#'   in front of `"T23:59:59.999"`, so it is presumably a `YYYY-MM-DD` string
#'   or a `Date`.
#' @param pub_start_date First date that reports are wanted from, in the same
#'   form as `pub_end_date`. Defaults to `pub_end_date` (a single day).
#' @return A tibble with one row per hit and the columns `cvr`, `startdate`,
#'   `enddate`, `publicationdate`, `xml_link`, `type`, `xmldoctype`,
#'   `case_number`, `reversal` and `id`. Fields that are missing from a hit
#'   are returned as `""`. A query without hits yields a zero-row tibble.
#'   An HTTP error status from either endpoint raises an error.
#' @export
query_api_dk <- function(pub_end_date, pub_start_date = NULL) {
  if (is.null(pub_start_date)) {
    pub_start_date <- pub_end_date
  }

  # NOTE(review): the offset is fixed at +01:00; confirm whether dates that
  # fall in daylight-saving time need a different offset.
  query <- list(
    "query" = list(
      "range" = list(
        "offentliggoerelsesTidspunkt" = list(
          "lte" = paste0(pub_end_date, "T23:59:59.999"),
          "gte" = paste0(pub_start_date, "T00:00:00.000"),
          "time_zone" = "+01:00"
        )
      )
    )
  )

  search_url <- "http://distribution.virk.dk/offentliggoerelser/_search?"
  scroll_url <- "http://distribution.virk.dk/_search/scroll?"

  # POST a request and return the parsed body. Failing loudly on an HTTP
  # error status keeps a broken scroll page from being mistaken for
  # "no more hits", which would silently truncate the result.
  fetch <- function(url, params, ...) {
    response <- httr::POST(url = url, query = params, ...)
    httr::stop_for_status(response)
    httr::content(response)
  }

  # First page ---------------------------------------------------------------
  page <- fetch(
    search_url,
    list("size" = 3000, "scroll" = "5m"),
    body = query,
    encode = "json"
  )
  hits <- page[["hits"]][["hits"]]
  n_new <- length(hits)

  # Scroll until a page comes back empty --------------------------------------
  while (n_new > 0) {
    page <- fetch(
      scroll_url,
      list("scroll" = "5m", "scroll_id" = page[["_scroll_id"]])
    )
    new_hits <- page[["hits"]][["hits"]]
    hits <- c(hits, new_hits)
    n_new <- length(new_hits)

    message("downloaded ", length(hits))
  }

  # Fallback operator: use `y` when `x` is NULL or otherwise zero-length.
  `%l0%` <- function(x, y) if (length(x) > 0) x else y

  # Preallocate one character column per extracted field ----------------------
  len <- length(hits)
  cvr <- character(len)
  startdate <- character(len)
  enddate <- character(len)
  publicationdate <- character(len)
  xml_link <- character(len)
  type <- character(len)
  doctype <- character(len)
  case_number <- character(len)
  reversal <- character(len)
  id <- character(len)

  # seq_along() makes the loop a no-op for an empty result; `1:length(hits)`
  # would iterate over c(1, 0) and fail on hits[[1]].
  for (i in seq_along(hits)) {
    src <- hits[[i]][["_source"]]
    period <- src[["regnskab"]][["regnskabsperiode"]]

    cvr[i] <- src[["cvrNummer"]] %l0% ""
    startdate[i] <- period[["startDato"]] %l0% ""
    enddate[i] <- period[["slutDato"]] %l0% ""
    publicationdate[i] <- src[["offentliggoerelsesTidspunkt"]] %l0% ""
    type[i] <- src[["offentliggoerelsestype"]] %l0% ""
    case_number[i] <- src[["sagsNummer"]] %l0% ""
    reversal[i] <- src[["omgoerelse"]] %l0% ""
    id[i] <- hits[[i]][["_id"]] %l0% ""

    # Keep the link and type of the XML document; when a hit carries several
    # XML documents the last one wins. identical() tolerates a document
    # without a MIME type, where `==` would make `if` fail.
    for (doc in src[["dokumenter"]]) {
      if (identical(doc[["dokumentMimeType"]], "application/xml")) {
        xml_link[i] <- doc[["dokumentUrl"]] %l0% ""
        doctype[i] <- doc[["dokumentType"]] %l0% ""
      }
    }
  }

  tibble::tibble(
    cvr = cvr,
    startdate = startdate,
    enddate = enddate,
    publicationdate = publicationdate,
    xml_link = xml_link,
    type = type,
    xmldoctype = doctype,
    case_number = case_number,
    reversal = reversal,
    id = id
  )
}
# soetang/xbrlr documentation built on May 26, 2019, 7:01 p.m.