R/Get_DB_MIUR.R

Defines functions Get_DB_MIUR

Documented in Get_DB_MIUR

#' Download the database of Italian public schools buildings
#'
#' @description  This function downloads the School Buildings Open Database provided by the Italian Ministry of Education, University and Research.
#'
#'
#' It is one of the main sources of information regarding the infrastructure system of public schools in Italy.
#' For a given year, all available data are downloaded (except for the structural units section, which has a different level of detail) and gathered into a unique dataframe.
#'
#' @param Year Numeric or character value. Reference school year (last available is 2023).
#' Available in the formats: \code{2023}, \code{"2022/2023"}, \code{202223}, \code{20222023}. \code{2022} by default (other databases are not currently available for 2023).
#' @param input_Registry Object of class \code{tbl_df}, \code{tbl} and \code{data.frame}.
#' The school registry corresponding to the year in scope, obtained as output of the  function \code{\link{Get_Registry}}.
#' If \code{NULL}, it will be downloaded automatically, but not saved in the global environment.
#' \code{NULL} by default.
#' @param certifications Logical. From year 2021/22 onwards, whether to include some safety certifications in the database.
#' Given the particular level of definition of this file, it requires extra computational time (other than the downloading time). \code{FALSE} by default.
#' @param input_AdmUnNames Object of class \code{tbl_df}, \code{tbl} and \code{data.frame}.
#' The ISTAT file including all the codes and all the names of the administrative units for the year in scope, obtained as output of the function \code{\link{Get_AdmUnNames}}.
#' Only  necessary for school years 2015/16, 2017/18 and 2018/19.
#' If \code{NULL} and required, it will be downloaded automatically but not saved in the global environment. \code{NULL} by default.
#' @param verbose Logical. If \code{TRUE}, the user keeps track of the main underlying operations. \code{TRUE} by default.
#' @param autoAbort Logical. Whether to automatically abort the operation and return NULL in case of missing internet connection or server response errors. \code{FALSE} by default.
#' @param show_col_types Logical. If \code{TRUE}, if the \code{verbose} argument is also \code{TRUE}, the columns of the raw dataset are shown during the download. \code{FALSE} by default.
#' @param t_out Numeric. !! EXPERIMENTAL !! session timeout for scraping and download, in seconds. 900 seconds by default.
#'
#' @source  <https://dati.istruzione.it/opendata/opendata/catalogo/elements1/?area=Edilizia+Scolastica>
#'
#' @details
#' This function downloads the raw data; missing observations are not edited; all variables are characters.
#' Since certifications are defined at the level of structural units of the single buildings, here
#' the fields read as the percentage of structural units in a building having a given certificate.
#' To edit the output of this function and convert the relevant variables to numeric or Boolean, please \code{\link{Util_DB_MIUR_num}}.
#' Schools different from primary, middle or high schools are classified as \code{"NR"}. In the example, the data for school year 2022/23 are retrieved.
#'
#'
#' @return An object of class \code{tbl_df}, \code{tbl} and \code{data.frame}.
#'
#' @examples
#'
#' \donttest{
#'   input_DB23_MIUR <- Get_DB_MIUR(2023, autoAbort = TRUE, t_out = 20)
#'
#'   input_DB23_MIUR[-c(1,4,6,9)]
#'
#' }
#'
#'
#' @export


Get_DB_MIUR <- function(Year = 2023, verbose = TRUE, input_Registry = NULL,
                        input_AdmUnNames = NULL, show_col_types = FALSE,
                        certifications = FALSE, autoAbort = FALSE, t_out = 900){

  start.zero <- Sys.time()

  if(!Check_connection(autoAbort)) return(NULL)

  # Link retrieving
  home.url <-"https://dati.istruzione.it/opendata/opendata/catalogo/elements1/?area=Edilizia%20Scolastica"
  homepage <- NULL
  attempt <- 0
  while(is.null(homepage) && attempt <= 10){
    homepage <- tryCatch({
      httr::content(httr::GET(home.url))
    }, error = function(e){
      message("Cannot read the html; ", 10 - attempt,
              " attempts left. If the problem persists, please contact the maintainer.\n")
      return(NULL)
    })
    attempt <- attempt + 1
  }
  if(is.null(homepage)) return(NULL)
  name_pattern <- "([0-9]+)\\.(csv)$"
  pattern <- year.patternB(Year)
  patternA <- year.patternA(Year)
  links <- homepage %>% rvest::html_nodes("a") %>% rvest::html_attr("href") %>% unique()
  links <- links[which(!is.na(links))]
  if (!any(str_detect.general(links, pattern))){
    message("No data available for this year. We apologise for the inconvenience")
    return(NULL)
  }

  files_to_download <- c()
  for (string in links[grep(".csv", links)] ) {
    num_numeric_digits <- sum(unlist(gregexpr("[0-9]", string) ) > 0)
    nchar_min <- min(nchar(pattern))
    nchar_minA <- nchar(patternA)
    nchar_max <- max(nchar(pattern))
    if (num_numeric_digits >= nchar_min && !grepl("EDIUNITASTRUTSTA", string)){
      first_nchar_min <- stringr::str_extract(string, paste0("[0-9]{", nchar_min, "}"))
      first_nchar_max <- stringr::str_extract(string, paste0("[0-9]{", nchar_max, "}"))
      if (!is.na(first_nchar_min) && !is.na(first_nchar_max) && any(pattern %in% c(first_nchar_min, first_nchar_max)) &&
          ! string %in% files_to_download) {
        files_to_download <- append(files_to_download, string)
      }
    } else if(stringr::str_extract(string, paste0("[0-9]{", nchar_minA, "}")) == patternA && grepl("EDIUNITASTRUTSTA", string)){
      if(certifications) files_to_download <- append(files_to_download, string)
    }
  }

  # Scraping
  base.url <- dirname(home.url)
  input_MIUR <- list()
  starttime <- Sys.time()
  for (link in files_to_download) {
    file.url <- file.path(base.url, link)
    status <- 0
    attempt <- 0
    while(status != 200){
      response <- tryCatch({
        httr::GET(file.url, httr::timeout(t_out))
      }, error = function(e) {
        message("Error occurred during scraping, attempt repeated ... \n")
        NULL
      })
      status <- response$status_code
      if(is.null(response)){
        status <- 0
      }
      if(status != 200){
        attempt <- attempt + 1
        message("Operation exited with status: ", status, "; operation repeated (",
                10 - attempt, " attempts left)")
      }
      if(attempt >= 10) {
        message("Maximum attempts reached. Abort. We apologise for the inconvenience")
        return(NULL)
      }
    }

    if (httr::http_type(response) %in% c("application/csv", "text/csv", "application/octet-stream")) {
      content <- rawToChar(response$content)
      if(nchar(content)==0){
        #This way the whole routine aborts even if one file is missing
        message("Empty file. Operation aborted.
      There seems to be something wrong with the website.
      Please contact the maintainer, maybe it could help. \n")
        return(NULL)
      } else{
        if(verbose){
          dat <- readr::read_csv(content, show_col_types = FALSE)
          cat("CSV file downloaded:", link, " ... ")
        } else {
          suppressMessages(dat <- readr::read_csv(content))
        }
      }
      if(grepl("UNITASTRUTSTA", link)){
        dat[, -c(1:4)] <- lapply(dat[, -c(1:4)], function(x){
          gsub(
            "SI", 1, gsub(
              "NO", 0, gsub(
                "Esiste", 1, ignore.case = TRUE, gsub(
                  "Non Esiste", 0, ignore.case = TRUE, gsub(
                    "IN PARTE", NA, ignore.case = TRUE, gsub(
                      "ND", NA, ignore.case = TRUE, gsub(
                        "Non Definito", NA, ignore.case = TRUE, gsub(
                          "Non Comunicato", NA, ignore.case = TRUE, gsub(
                            "Non Richiesto", NA, ignore.case = TRUE, gsub(
                              "^-$", NA, x))))))))))}) %>%
          as.data.frame()
        dat <- dat %>%
          Group_Count(groupcol = c("ANNOSCOLASTICO", "CODICESCUOLA", "CODICEEDIFICIO"),
                      startgroup = 5, count = FALSE, FUN = MeanOrMode)

        dat <- dat %>%
          dplyr::mutate(dplyr::across(names(dat)[unlist(lapply(dat, is.numeric))], as.character))
      }
      input_MIUR[[link]] <- dat
      input_MIUR[[link]] <- input_MIUR[[link]] %>% dplyr::select(-.data$ANNOSCOLASTICO)
      input_MIUR[[link]] <- input_MIUR[[link]][!duplicated(input_MIUR[[link]]),]
      #input_MIUR[[link]] <- input_MIUR[[link]] %>% tidyr::unite(ID, .data$CODICESCUOLA, .data$CODICEEDIFICIO)
     } else {
      if(verbose){
        message(paste("Wrong file type:", httr::http_type(response)) )
        message("Failed to download and process:", link)
      }
    }
    endtime <- Sys.time()
    if(verbose){
      cat(paste(round(difftime(endtime, starttime, units="secs"),
                      2),"seconds required to join it \n ") )
    }
    starttime <- Sys.time()
  }
  if(length(input_MIUR) == 0L) return(NULL)

  # Joining tables
  mapping_MIUR <- input_MIUR[[grep("ANAGRAFE", names(input_MIUR))]]
  DB_MIUR.R <- mapping_MIUR %>% dplyr::select(
    .data$CODICESCUOLA, .data$CODICEEDIFICIO, .data$CODICECOMUNE, .data$DESCRIZIONECOMUNE,
    .data$SIGLAPROVINCIA, .data$CAP)


  # This is for the municipality of Bladen/Plodn/Sappada which changed it province in 2018
  if(!any(pattern %in% year.patternB(2016))){
    DB_MIUR.R <- DB_MIUR.R %>% dplyr::mutate(dplyr::across(.data$SIGLAPROVINCIA, ~ dplyr::case_when(
      toupper(.data$DESCRIZIONECOMUNE) == "SAPPADA" ~ "UD",
      TRUE ~ .data$SIGLAPROVINCIA
    )))
  }

  for ( i in c(1:length(input_MIUR))) {
    if(length(grep("ANAGRAFE", names(input_MIUR)[i]))==0){
      if (nrow(input_MIUR[[i]]) != nrow(DB_MIUR.R) & verbose == TRUE){
        warning(paste("Expected", nrow(DB_MIUR.R), "rows but in",
                      names(input_MIUR)[i], "there are:", nrow(input_MIUR[[i]])))
      }
      DB_MIUR.R <- dplyr::left_join(DB_MIUR.R, input_MIUR[[i]], by = c("CODICESCUOLA", "CODICEEDIFICIO"))
    }
  }

  names(DB_MIUR.R) <- names(DB_MIUR.R) %>% stringr::str_remove_all(".y") %>% stringr::str_remove_all(".x")
  DB_MIUR.R <- DB_MIUR.R[,!duplicated(colnames(DB_MIUR.R))]

  # This is for the province of Naples which happens to have "NA" as abbreviation
  DB_MIUR.R$SIGLAPROVINCIA <- stringr::str_replace_na(DB_MIUR.R$SIGLAPROVINCIA, "NA")

  #DB_MIUR.R <- DB_MIUR.R %>%tidyr::separate(col = .data$ID, into=c("CODICESCUOLA", "CODICEEDIFICIO"), sep="_")

  tabrename <- tabrename.manual()
  for (j in (1:ncol(DB_MIUR.R))){
    if (names(DB_MIUR.R)[j] %in% tabrename$Input){
      names(DB_MIUR.R)[j] <- tabrename[which(tabrename$Input == names(DB_MIUR.R)[j]),4]
    }
  }

  #This is for old data where the municipality is identified through the cadastral code
  if (any(pattern %in% c(year.patternB(2016), year.patternB(2018), year.patternB(2019)))) {

    YearMinus1 <- as.numeric(substr(year.patternA(Year),1,4))
    if(is.null(input_AdmUnNames)) {
      cat("Mapping cadastral codes to municipality (LAU) codes:")
      AdmUnYear <- ifelse(any(pattern %in% c(year.patternB(2016), year.patternB(2018))), YearMinus1+1, YearMinus1)
      AdmUnDate <- ifelse(any(pattern %in% c(year.patternB(2016), year.patternB(2018))), "01-01", "09-01")
      input_AdmUnNames <- Get_AdmUnNames(Date = paste0(AdmUnYear, "-", AdmUnDate), autoAbort = autoAbort)
    }
    CodMun.R <- input_AdmUnNames %>% dplyr::select(.data$Cadastral_code, .data$Municipality_code)

    DB_MIUR.R <- DB_MIUR.R %>% dplyr::rename(Cadastral_code = .data$Municipality_code) %>%
      dplyr::left_join(CodMun.R, by = "Cadastral_code") %>%
      dplyr::relocate(.data$Municipality_code, .after = "Building_code") %>%
      dplyr::select(-.data$Cadastral_code) %>%
      fixMun.manual(Year)
  }

  ## !!!! TBD!!! CHECK MANUALLY

  attempt.registry <- 0
  left <- NULL
  while (is.null(input_Registry) && attempt.registry == 0) {
    input_Registry <- Get_Registry(Year = Year, autoAbort = autoAbort)
    left <- input_Registry[,c(1,6,5)] %>% dplyr::filter(.data$School_code %in% DB_MIUR.R$School_code)
    attempt.regitry <- 1
  }
  if(is.null(left)){
    message("Warning: problem in downloading school registries data, called from
    Get_DB_MIUR. Despite all the work done, at this stage we are forced to abort.
    Please, try downloading & providing
    schools registry separately with Get_Registry, then run Get_DB_MIUR again.
    We apologise for the inconvenience. \n")
    return(NULL)
  }



  DB_MIUR <- dplyr::left_join(left, DB_MIUR.R, by = "School_code") %>%
    School.order() %>%
    dplyr::mutate(Municipality_description = stringr::str_to_title(.data$Municipality_description))

  enditme <- Sys.time()
  if(verbose){
    cat(paste("Total running time needed to import school buildings data:",
              round(difftime(endtime, start.zero, units="secs"), 2), "seconds \n"  ))
  }

  return(DB_MIUR)
}

Try the SchoolDataIT package in your browser

Any scripts or data that you put into this service are public.

SchoolDataIT documentation built on Dec. 17, 2025, 5:08 p.m.