R/pof.R

Defines functions lodown_pof get_catalog_pof

get_catalog_pof <-
	function( data_name = "pof" , output_dir , ... ){

		pof_ftp <- "ftp://ftp.ibge.gov.br/Orcamentos_Familiares/"

		ftp_listing <- readLines( textConnection( RCurl::getURL( pof_ftp ) ) )

		ay <- rev( gsub( "(.*) (.*)" , "\\2" , ftp_listing ) )

		# hardcoded removal of microdata before 2003
		ay <- ay[ !( ay %in% c( "" , "Pesquisa_de_Orcamentos_Familiares_1987_1988" , "Pesquisa_de_Orcamentos_Familiares_1995_1996" , "Pesquisa_de_Orcamentos_Familiares_1997_1998" ) ) ]

		second_year <- gsub( "(.*)_([0-9]+)" , "\\2" , ay )

		catalog <-
			data.frame(
				full_urls = paste0( pof_ftp , ay , "/Microdados/", ifelse(second_year == 2018, "Dados_20200917.zip","Dados.zip" )) ,
				period = gsub( "Pesquisa_de_Orcamentos_Familiares_" , "" , ay ) ,
				documentation = paste0( pof_ftp , ay , "/Microdados/" , ifelse( second_year < 2009 , "Documentacao.zip" , ifelse(second_year == 2018, "Documentacao_20200917.zip","documentacao.zip" )) ) ,
				aliment_file = ifelse( second_year < 2009 , NA , paste0( pof_ftp , ay , "/Microdados/", ifelse(second_year == 2018, "Tradutores_20200917.zip","tradutores.zip" ) )) ,
				output_folder = paste0( output_dir , "/" , gsub( "Pesquisa_de_Orcamentos_Familiares_" , "" , ay ) ) ,
				stringsAsFactors = FALSE
			)

		catalog

	}


lodown_pof <-
	function( data_name = "nppes" , catalog , path_to_7za = if( .Platform$OS.type != 'windows' ) '7za' else normalizePath( "C:/Program Files/7-zip/7z.exe" ) , ... ){

		if( system( paste0( '"' , path_to_7za , '" -h' ) , show.output.on.console = FALSE ) != 0 ) stop( paste0( "you need to install 7-zip.  if you already have it, include a parameter like path_to_7za='" , path_to_7za , "'" ) )

		on.exit( print( catalog ) )

		if ( !requireNamespace( "readxl" , quietly = TRUE ) ) stop( "readxl needed for this function to work. to install it, type `install.packages( 'readxl' )`" , call. = FALSE )

		tf <- tempfile() ; tf2 <- tempfile()

		for ( i in seq_len( nrow( catalog ) ) ){

			cachaca( catalog[ i , "full_urls" ] , tf , mode = 'wb' , filesize_fun = 'unzip_verify' )

			unzipped_files <- unzip_warn_fail( tf , exdir = paste0( tempdir() , "/unzips" ) )

			cachaca( catalog[ i , "documentation" ] , tf , mode = 'wb' , filesize_fun = 'unzip_verify')

			doc_files <- unzip_warn_fail( tf , exdir = paste0( tempdir() , "/unzips" ) )

			files <- c( unzipped_files , doc_files )

			Encoding( files ) <- 'latin1'

			if( !is.na( catalog[ i , 'aliment_file' ] ) ){

				cachaca( catalog[ i , "aliment_file" ] , tf , mode = 'wb' , filesize_fun = 'unzip_verify')

				ali_files <- unzip_warn_fail( tf , exdir = paste0( tempdir() , "/unzips" ) )

				Encoding( ali_files ) <- 'latin1'

				# # # # # # # # # # # # # #
				# tables with food codes  #

				# figure out which is the alimentacao file
				cda <- ali_files[ grep( 'codigos_de_alimentacao' , tolower( ali_files ) ) ]

				# extract both tabs from the excel file
				componentes <- readxl::read_excel( cda , sheet = 1 , skip = 3 , col_types = rep( 'text' , 7 ) , col_names = c( 'codigo' , 'nivel.1' , 'desc.1' , 'nivel.2' , 'desc.2' , 'nivel.3' , 'desc.3' ) )
				componentes <- as.data.frame( componentes , stringsASFactors = FALSE )
				componentes[ , 1:2 ] <- apply( componentes[ , 1:2 ] , 2 , function(x) { gsub( "\\..*", "" , x ) } )
				estrutura <- readxl::read_excel( cda , sheet = 2 , skip = 3 , col_types = rep( 'text' , 6 ) , col_names = c( 'nivel.1' , 'desc.1' , 'nivel.2' , 'desc.2' , 'nivel.3' , 'desc.3' ) )
				estrutura <- as.data.frame( estrutura , stringsASFactors = FALSE )
				estrutura[ , 1 ] <- gsub( "\\..*", "" , estrutura[ , 1 ] )

				# componentes table has a footnote, so throw it out
				# by removing all records with a missing
				# or empty `nivel.1` field
				componentes <- componentes[ !is.na( componentes$nivel.1 ) , ]
				componentes <- componentes[ componentes$nivel.1 != "" , ]


				# save both of these data frames to the local disk
				saveRDS( componentes , file = paste0( catalog[ i , 'output_folder' ] , "/codigos de alimentacao componentes.rds" ) , compress = FALSE )

				saveRDS( estrutura , file = paste0( catalog[ i , 'output_folder' ] , "/codigos de alimentacao estrutura.rds" ) , compress = FALSE )

				# # # # # # # # # # # # # # # # #
				# table for post-stratification #

				# figure out which is the post-stratification table
				pos <- files[ grep( 'pos_estratos_totais' , tolower( files ) ) ]

				# extract the post-stratification table
				# from the excel file
				poststr <- readxl::read_excel( pos , sheet = 1 , col_names = c( "uf",	"control", "estrato", "fator_des" ,	"tot_pop", "estrato_unico",	"pos_estrato", "tot_unidade_c", "fator_pos" ) , col_types = rep( "numeric" , 9 ) , skip = 1 )
				poststr <- as.data.frame( poststr , stringsASFactors = FALSE )
				# imported!  cool?  cool.

				# convert all column names to lowercase
				names( poststr ) <- tolower( names( poststr ) )

				# save this data frame to the local disk
				saveRDS( poststr , file = paste0( catalog[ i , 'output_folder' ] , "/poststr.rds" ) , compress = FALSE )

				# remove all three of these tables from memory
				rm( componentes , estrutura , poststr )

			}





			# # # # # # # # # # # # # #
			# sas import organization #

			# before you worry about the data files,
			# get the sas import scripts under control.

			# extract the leitura file containing the sas importation instructions
			leitura <- files[ grep( 'leitura' , tolower( files ) ) ]

			# read the whole thing into memory
			leitura_con <- file( leitura , encoding = 'windows-1252' )

			z <- readLines( leitura_con )

			# remove all those goofy tab characters (which will screw up SAScii)
			z <- gsub( "\t" , " " , z )

			# remove lines containing the `if reg=__ then do;` pattern
			z <- z[ !grepl( 'if reg=.* then do;' , z ) ]

			# remove goofy @;
			z <- gsub( "@;" , "" , z )

			# remove goofy "/;";
			z <- gsub( "/;" , "/" , z )

			# remove lines containing solely `input`
			z <- z[ !( tolower( z ) == 'input' ) ]

			# remove the (SAScii-breaking) overlapping `controle` columns
			z <- z[ !grepl( "@3 controle 6." , z , fixed = TRUE ) ]

			# write the file back to your second temporary file
			writeLines( z , tf2 )

			# find each of your beginline parameters

			# find each line containing the string `INFILE` or `infile`
			all.beginlines <- grep( 'INFILE|infile' , z )

			# find line start positions
			start.pos <-
				unlist(
					lapply(
						gregexpr(
							"\\" ,
							z[ all.beginlines ] ,
							fixed = TRUE
						) ,
						max
					)
				) + 1

			# find line end positions
			end.pos <-
				unlist(
						gregexpr(
							".txt" ,
							z[ all.beginlines ]
						)
					) - 1

			# isolate the names of all data files to be imported..
			data.files.to.import <-
				# pull the 14th character until `.txt` in the `INFILE` lines of the sas import script
				substr(
					z[ all.beginlines ] ,
					start.pos ,
					end.pos
				)

			# now you've got an object containing the names of all data files that need to be imported.
			data.files.to.import

			# isolate the base filename before the period
			# for all downloaded files..
			all.file.basenames <-
				unlist(
					lapply(
						strsplit(
							basename( files ) ,
							'.' ,
							fixed = TRUE
						) ,
						'[[' ,
						1
					)
				)

			# for each data file name in `data.files.to.import`..
			for ( dfn in data.files.to.import ){

				# identify which .7z file contains the data
				if ( tolower( dfn ) == 't_rendimentos' ) {
					data.file <- files[ which( 't_rendimentos1' == tolower( all.file.basenames ) ) ]
				} else {
					data.file <- files[ which( tolower( dfn ) == tolower( all.file.basenames ) ) ]
				}


				# if `data.file` contains multiple files..
				if ( length( data.file ) > 1 ){

					# pick the zipped file..
					data.file <- data.file[ grep( '.zip' , tolower( data.file ) , fixed = TRUE ) ]

					# ..unzip it, and overwrite `data.file` with the new filepath
					data.file <- unzip_warn_fail( data.file , exdir = tempdir() )
				}


				# and now, if the data.file is just a text file..
				if ( grepl( "txt$" , tolower( data.file ) ) ){

					# then no unzipping is necessary
					curfile <- data.file

				# otherwise, the file must be unzipped with 7-zip
				} else {
				
					# build the string to send to DOS
					dos.command <- paste0( '"' , path_to_7za , '" x "' , normalizePath( data.file ) , '" -o"' , normalizePath( paste0( tempdir() , '/unzips' ) ) , '"' )

					# extract the file
					system( dos.command , show.output.on.console = FALSE )

					# find the name of the final ASCII data file to be imported
					curfile <- paste0( tempdir() , '/unzips/' , gsub( ".7z" , ".txt" , basename( data.file ) ) )

				}

				# figure out which beginline position to use
				cur.beginline <- which( tolower( dfn ) == tolower( data.files.to.import ) )

				curfile_con <- file( curfile , encoding = 'windows-1252' )

				# import the data file into R
				x <-
					read_SAScii(
						curfile_con ,
						tf2 ,
						beginline = all.beginlines[ cur.beginline ] ,
						skip_decimal_division = TRUE ,
						sas_encoding = "latin1"
					)

				# convert all column names to lowercase
				names( x ) <- tolower( names( x ) )

				catalog[ i , 'case_count' ] <- max( catalog[ i , 'case_count' ] , nrow( x ) , na.rm = TRUE )

				# save the current data.frame
				# to the appropriate year folder
				# within the current working directory
				saveRDS( x , file = paste0( catalog[ i , 'output_folder' ] , "/" , tolower( dfn ) , ".rds" ) , compress = FALSE )

				# delete the current file from the current working directory
				file.remove( curfile )

			}

			# revert the encoding for more effective deletion.
			Encoding( files ) <- Encoding( ali_files ) <- ''



			# delete the temporary files
			suppressWarnings( file.remove( tf , files , ali_files ) )

			cat( paste0( data_name , " catalog entry " , i , " of " , nrow( catalog ) , " stored in '" , catalog[ i , 'output_folder' ] , "'\r\n\n" ) )

		}

		on.exit()
		
		catalog

	}
ajdamico/lodown documentation built on Oct. 21, 2024, 4:53 p.m.