# BRschoolData: Import and label Brazilian school Census data
#
# Documented in: download_microdata, import_csv2rda, insert_labels

# 0 - function to download the school census microdata ------------------------

#' Download school census microdata
#'
#' @description Downloads the .zip file with the school census microdata of
#'   one year into the current working directory. The file is saved as
#'   \code{microdados_educacao_basica_<year>.zip}.
#'
#' @param year numeric, census year used to build both the download URL and
#'   the name of the local file. At the moment works with 2018 and 2019 data
#' @param method character, download method passed on to
#'   \code{download.file()}. Default is "wget". Other options: "internal",
#'   "libcurl", "curl"
#'
#' @return .zip file with all microdata, which must be unzipped. The value of
#'   the function itself is whatever \code{download.file()} returns.
#' @export
#'
#' @details This function works only with 2019 and 2018 data.
#' Other years can be downloaded in: http://inep.gov.br/microdados
#'
#' This package works fine with 2019 data, and at least partially with 2015
#' data. Other years were not tested till now.
#'
#' @examples
#' \donttest{
#' download_microdata()
#' download_microdata(year=2018)}
#'
#' # for a more complete view, see ?insert_labels
download_microdata <- function(year = 2019, method = "wget") {
  warning('You can download the microdata in: http://inep.gov.br/microdados')

  # The remote file and the local copy share the same file name.
  zip_name <- paste0("microdados_educacao_basica_", year, ".zip")
  zip_url <- paste0("http://download.inep.gov.br/microdados/", zip_name)

  # Every argument is named on purpose: passing `year` and ".zip" positionally
  # would bind them to `quiet` and `mode` instead of making them part of the
  # destination file name.
  # mode = "wb" requests a binary write so the archive is not corrupted on
  # platforms that distinguish text and binary modes.
  utils::download.file(
    url = zip_url,
    destfile = zip_name,
    method = method,
    mode = "wb"
  )
}


# 1 - function to import the data into R --------------------------------------

#' Import csv microdata to R
#'
#' @description Reads a census .csv file with \code{data.table::fread()}
#'   (fast enough for the big student- and teacher-level files), prints the
#'   first rows as a quick check and stores the table in an .rda file.
#'
#' @param file_path_origin character, path to downloaded .csv file
#' @param file_path_destiny character, path to .rda file to be created
#'
#' @return Used for its side effect: the imported table is saved in the .rda
#'   file under the object name \code{censo}.
#' @export
#'
#' @examples
#' \dontrun{
#' file.downloaded='~/YOUR_PATH/downloaded_data/ESCOLA.CSV'
#' file.imported='~/YOUR_PATH/temp_data.rda'
#'
#' import_csv2rda(file_path_origin=file.downloaded,
#'               file_path_destiny=file.imported)}
#'
#' # for a more complete view, see ?insert_labels
import_csv2rda <- function(file_path_origin, file_path_destiny) {
  # These reader settings (pipe-separated fields, comma as decimal mark,
  # Latin-1 text) are supposed to be valid for every school census file
  # published since 2013.
  censo <- data.table::fread(
    input = file_path_origin,
    sep = "|",
    dec = ",",
    encoding = "Latin-1"
  )

  # Show the first rows so the user can check the import.
  print("SEE THE DATA IMPORTED:")
  preview <- head(censo)
  print(preview)

  # The object must be stored under the name `censo`: insert_labels() loads
  # this file and looks the table up by that name.
  save(list = "censo", file = file_path_destiny)
}


# 2 - function to add the dictionary labels to the data frame -----------------
# what used to be the destination file is now used as the origin

#' Insert labels in factor variables (in portuguese).
#'
#' @description Recodes the variables of an imported census table with the
#'   recode dictionary stored in the package. The dictionary used to label is
#'   from 2019 data
#'
#' @param file_path_origin character, path to .rda file generated by function import_csv2rda()
#' @param file_path_destiny character, path to .rda file to be created, with labels in factors
#' @param data_level character, a single value that defines census data level
#'   according to data file imported
#'   ('Escola','Docente','Gestor','Turma','Matricula'). It must be supplied
#'   explicitly; the function stops if it is not a single string.
#' @param add_variables logical, if TRUE add some useful variables to the
#'   data.frame: \code{ciclo} and \code{serieEM}, both derived from
#'   \code{TP_ETAPA_ENSINO} when that variable is in the data and is treated
#'   as a factor by the dictionary
#'
#' @details Works fine with 2019 data, and at least partially with 2015 data. Other years not tested yet.
#'
#' Variables that are not in the dictionary are left untouched and a warning
#' is issued for each of them. In variables that the dictionary does not treat
#' as factors, the codes 999, 8887, 8888 and 88888 are replaced by \code{NA}.
#'
#' @return data.frame saved in .rda file (under the object name
#'   \code{censo}). The first rows of the labelled data are also printed.
#' @export
#'
#' @examples
#' # you must first download the .zip file. See ?download_microdata
#' # then you must unzip it and choose a data file (eg. ESCOLA.CSV)
#' # then you can run the code below, changing the first 3 lines as you wish
#' # note that 'data_level' must also be defined, in function insert_labels()
#'
#' \dontrun{
#' file.downloaded='~/YOUR_PATH/downloaded_data/ESCOLA.CSV'
#' file.imported='~/YOUR_PATH/temp_data.rda'
#' file.labelled='~/YOUR_PATH/censusData_ESCOLA.rda'
#'
#' import_csv2rda(file_path_origin=file.downloaded,
#'               file_path_destiny=file.imported)
#'
#' insert_labels (file_path_origin=file.imported,
#'               file_path_destiny=file.labelled,
#'               data_level='Escola',
#'               add_variables=TRUE)}
insert_labels <- function(file_path_origin, file_path_destiny,
                          data_level = c('Escola', 'Docente', 'Gestor', 'Turma', 'Matricula'),
                          add_variables = TRUE) {
  # The default only lists the accepted values. Left as is, the whole vector
  # would be pasted into several file names and load() would fail with an
  # obscure error, so a single explicit level is required.
  if (!is.character(data_level) || length(data_level) != 1L ||
      is.na(data_level)) {
    stop("'data_level' must be a single value among: ",
         "'Escola', 'Docente', 'Gestor', 'Turma', 'Matricula'.")
  }

  # Load the recode dictionary (`dd`) of this data level. It is loaded into a
  # scratch environment so that nothing stored in the file can overwrite the
  # arguments or local variables of this function.
  recode_file <- system.file(
    "recodes", paste0("dados-recode_", data_level, ".rda"),
    package = "BRschoolData"
  )
  if (!nzchar(recode_file)) {
    stop("No recode dictionary found for data_level '", data_level, "'.")
  }
  dictionary_env <- new.env()
  load(recode_file, envir = dictionary_env)
  dd <- as.data.frame(get("dd", envir = dictionary_env, inherits = FALSE))

  # Load the imported data (`censo`), again through a scratch environment.
  data_env <- new.env()
  load(file_path_origin, envir = data_env)
  if (!exists("censo", envir = data_env, inherits = FALSE)) {
    stop("File '", file_path_origin, "' does not contain an object named ",
         "'censo'. Create it with import_csv2rda().")
  }
  censo <- as.data.frame(get("censo", envir = data_env, inherits = FALSE))
  # Drop the extra reference so the originally loaded table can be freed.
  rm(data_env)

  # Codes replaced by NA in variables that are not factors.
  missing_codes <- c(999, 8887, 8888, 88888)

  # Recoding loop. The names are taken once, before the loop, so columns
  # appended inside it (ciclo, serieEM) are never visited.
  # seq_along() keeps the loop from running on a table without columns.
  variaveis <- names(censo)
  for (lop.var in seq_along(variaveis)) {
    var_name <- variaveis[lop.var]
    indice.dicionario <- which(dd$nome %in% var_name)

    if (length(indice.dicionario) == 0L) {
      warning(paste('INCOMPLETE LABELING: variable', var_name, 'is not in the dicionary.'))
      next
    }

    if (length(indice.dicionario) > 1L) stop('Tem duplicacao de variaveis')

    recode_spec <- dd$trad.fatores[indice.dicionario]

    if (is.na(recode_spec)) {
      # Not a factor: only turn the missing-value codes into NA.
      # which() drops the NA comparisons produced by values already missing.
      column <- censo[[lop.var]]
      for (code in missing_codes) {
        column[which(column == code)] <- NA
      }
      censo[[lop.var]] <- column
      next
    }

    ## recode the variable ----------------

    if (add_variables == TRUE) {
      # The synthetic variables are built from the numeric codes of
      # 'TP_ETAPA_ENSINO', so this must happen before that column is recoded.
      if (var_name == 'TP_ETAPA_ENSINO') {
        # Codes of TP_ETAPA_ENSINO that make up each value of 'ciclo'.
        # "EM" covers every kind of upper secondary class except EJA and
        # professional education.
        stage_groups <- list(
          EI = 1:3,
          EIeEFmix = 56,
          EF1 = c(4:7, 14:18),
          EF2 = c(8:11, 19:21, 41),
          EFmix = c(12:13, 22:24),
          EM = c(25:38),
          EJA = c(65, 67, 69:74),
          Prof = c(39, 40, 64, 68)
        )
        # Sized with nrow() so that a table without rows does not fail.
        censo$ciclo <- rep(NA_character_, nrow(censo))
        for (stage in names(stage_groups)) {
          rows <- which(censo$TP_ETAPA_ENSINO %in% stage_groups[[stage]])
          censo$ciclo[rows] <- stage
        }

        # Grade of regular upper secondary education only.
        grade_groups <- list(
          "1a serie" = c(25, 30, 35),
          "2a serie" = c(26, 31, 36),
          "3a serie" = c(27, 32, 37)
        )
        censo$serieEM <- rep(NA_character_, nrow(censo))
        for (grade in names(grade_groups)) {
          rows <- which(censo$TP_ETAPA_ENSINO %in% grade_groups[[grade]])
          censo$serieEM[rows] <- grade
        }
      }
    }

    # Apply the recode specification of the dictionary to the column.
    censo[[lop.var]] <- car::Recode(var = censo[[lop.var]], recodes = recode_spec)
  }
  save(censo, file = file_path_destiny)

  print('SEE THE DATA LABELED:')
  print(head(censo))
}