get_catalog_nhis <-
function( data_name = "nhis" , output_dir , ... ){
catalog <- NULL
for( this_name in c( "nhpi" , "NHIS" ) ){
base_ftp_dir <- paste0( "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/" , this_name , "/" )
# read the text of the microdata ftp into working memory
# download the contents of the ftp directory for all microdata
ftp.listing <- strsplit( RCurl::getURL( base_ftp_dir , ssl.verifypeer = FALSE ) , "<br>" )[[1]]
# extract the text from all lines containing a this_year of microdata
# figure out the names of those this_year directories
ay <- rev( gsub( '(.*)\\">(.*)<\\/A>$', "\\2" , ftp.listing ) )
# remove non-numeric strings
suppressWarnings( available_years <- ay[ as.numeric( ay ) %in% ay ] )
# now `available.years` should contain all of the available years on the nhis ftp site
for( this_year in available_years ){
# define path of this year
year_dir <- paste0( base_ftp_dir , this_year , "/" )
cat( paste0( "loading " , data_name , " catalog from " , year_dir , "\r\n\n" ) )
# just like above, read those lines into working memory
this_listing <- strsplit( RCurl::getURL( year_dir , ssl.verifypeer = FALSE ) , "<br>" )[[1]]
ftp_files <- tolower( gsub( '(.*)\\">(.*)<\\/A>$' , "\\2" , this_listing ) )
# identify all .exe files..
exe.filenames <- ftp_files[ grepl( ".exe" , ftp_files ) ]
# identify all .zip files..
zip.filenames <- ftp_files[ grepl( ".zip" , ftp_files ) ]
# identify overlap between .zip and .exe files
exe.filenames <- gsub( ".exe" , "" , exe.filenames )
zip.filenames <- gsub( ".zip" , "" , zip.filenames )
duplicate.filenames <- zip.filenames[ (zip.filenames %in% exe.filenames) ]
zip.filenames.with.exe.matches <- paste( duplicate.filenames , ".zip" , sep = "" )
# throw out .zip files that match a .exe file exactly
ftp_files <- ftp_files[ ! ( ftp_files %in% zip.filenames.with.exe.matches ) ]
# end of throwing out file.zip files that match file.exe files #
if( this_year == 2004 ){
ftp_files <- gsub( "familyfile" , "familyfile/familyxx.exe" , ftp_files )
ftp_files <- gsub( "household" , "household/househld.exe", ftp_files )
ftp_files <- gsub( "injurypoison" , "injurypoison/injpoiep.exe" , ftp_files )
ftp_files <- gsub( "person" , "person/personsx.exe" , ftp_files )
ftp_files <- gsub( "sampleadult" , "sampleadult/samadult.exe" , ftp_files )
ftp_files <- gsub( "samplechild" , "samplechild/samchild.exe" , ftp_files )
ftp_files <- ftp_files[ !( ftp_files %in% c( "" , "injuryverbatim" ) ) ]
}
# throw out folders (assumed to be files without a . in them)
# (any files in folders within the main this_year folder need to be downloaded separately)
ftp_files <- ftp_files[ grepl( "\\." , ftp_files ) ]
# skip txt files
ftp_files <- ftp_files[ !grepl( '\\.txt' , ftp_files ) ]
# skip the new sc_bwt files entirely
ftp_files <- ftp_files[ !grepl( 'sc_bwt' , ftp_files ) ]
# skip these 1963 files with irregular SAS importation scripts
if ( this_year == 1963 ) ftp_files <- ftp_files[ ! ( ftp_files %in% c( 'condition.exe' , 'family.exe' , 'hospital.exe' , 'health_exp.exe' ) ) ]
# skip these 1964 files with irregular SAS importation scripts
if ( this_year == 1964 ) ftp_files <- ftp_files[ ! ( ftp_files %in% c( 'family.exe' , 'hospital.exe' , 'xray.exe' , 'person.exe' ) ) ]
# skip these 1965 files with irregular SAS importation scripts
if ( this_year == 1965 ) ftp_files <- ftp_files[ ! ( ftp_files %in% c( "condition.exe" , "diabetes.exe" , "person.exe" , "presmed.exe" ) ) ]
# skip these 1966 files with irregular SAS importation scripts
if ( this_year == 1966 ) ftp_files <- ftp_files[ ! ( ftp_files %in% c( "condition.exe" , "person.exe" ) ) ]
# skip these 1969 files with irregular SAS importation scripts
if ( this_year == 1969 ) ftp_files <- ftp_files[ ! ( ftp_files %in% c( "aidsspec.exe" , "arthrtis.exe" ) ) ]
# skip these 1970 files with irregular SAS importation scripts
if ( this_year == 1970 ) ftp_files <- ftp_files[ ! ( ftp_files %in% c( "healthin.exe" , "medccost.exe" , "xrayxxxx.exe" ) ) ]
# the healthin file has WTBDD2W and WTBDD2WB (in the SAS input file) in the wrong order
if ( this_year %in% c( 1972 , 1974 ) ) ftp_files <- ftp_files[ ! ( ftp_files %in% 'healthin.exe' ) ]
# skip this 1973 file with irregular SAS importation scripts
if ( this_year == 1973 ) ftp_files <- ftp_files[ ! ( ftp_files %in% "prgnancy.exe" ) ]
# skip these 1977 files with irregular SAS importation scripts
if ( this_year == 1977 ) ftp_files <- ftp_files[ ! ( ftp_files %in% c( "aidsspec.exe" , "influenza.exe" ) ) ]
# skip these 1978 files with irregular SAS importation scripts
if ( this_year == 1978 ) ftp_files <- ftp_files[ ! ( ftp_files %in% c( "famedexp.exe" , "immunize.exe" ) ) ]
# skip 1979 personsx file
if ( this_year == 1979 ) ftp_files <- ftp_files[ ! ( ftp_files %in% c( "personsx.exe" , "smokingx.exe" ) ) ]
# skip 1988 mdevices file
if ( this_year == 1988 ) ftp_files <- ftp_files[ ! ( ftp_files %in% "mdevices.exe" ) ]
# skip 1994 and 1995 dfs files
if ( this_year %in% c( 1994 , 1995 ) ) ftp_files <- ftp_files[ ! ( ftp_files %in% c( "dfschild.exe" , "dfsadult.exe" ) ) ]
# skip the 1992 nursing home files
if ( this_year == 1992 ) ftp_files <- ftp_files[ ! ( ftp_files %in% c( "conditnh.exe" , "drvisinh.exe" , "hospitnh.exe" , "househnh.exe" , "personnh.exe" ) ) ]
# skip the 2007 alternative medicine and injury verbatim files
if ( this_year == 2007 ) ftp_files <- ftp_files[ ! ( ftp_files %in% c( "althealt.exe" , "injverbt.exe" ) ) ]
# skip the 1999 and 2000 injury verbatim file
if ( this_year %in% c( 1998:2000 , 2008 , 2009 ) ) ftp_files <- ftp_files[ ! ( ftp_files %in% "injverbt.exe" ) ]
# skip the 2010 sad_wgts.dat and sc_bwt10.dat files (although you may need them, depending what you're doing!)
if ( this_year == 2010 ) ftp_files <- ftp_files[ ! ( ftp_files %in% c( "sad_wgts.dat" , "sc_bwt10.dat" ) ) ]
# skip csv files
ftp_files <- ftp_files[ !grepl( "csv\\.zip$" , ftp_files , ignore.case = TRUE ) ]
catalog <-
rbind(
catalog ,
data.frame(
year = this_year ,
type = tolower( gsub( "\\.(.*)" , "" , basename( ftp_files ) ) ) ,
full_url = paste0( year_dir , ftp_files ) ,
nhpi = ( this_name == "nhpi" ) ,
stringsAsFactors = FALSE
)
)
}
}
catalog$output_filename <-
paste0( output_dir , "/" , catalog$year , "/" , ifelse( catalog$nhpi , "nhpi/" , "" ) , gsub( "\\.(.*)" , ".rds" , basename( catalog$full_url ) ) )
catalog$sas_script <-
paste0( gsub( "Datasets" , "Program_Code" , dirname( catalog$full_url ) ) , "/" , gsub( "\\.rds" , ".sas" , basename( catalog$output_filename ) ) )
catalog$sas_script <-
gsub( "nhpi/" , "NHPI/" ,
gsub( "[0-9][0-9]\\." , "." ,
gsub( "^https" , "ftp" ,
catalog$sas_script
) ) )
catalog$output_filename <- gsub( "nhpi/" , "nhpi_" , catalog$output_filename )
catalog$imputed_income <- grepl( 'inc[0-9][0-9]\\.' , catalog$full_url )
available_imputed_incomes <- grep( "imputed_income" , ay , value = TRUE , ignore.case = TRUE )
for( this_income in available_imputed_incomes ){
# define path of this imputed income file
income_dir <- paste0( base_ftp_dir , this_income , "/" )
cat( paste0( "loading " , data_name , " catalog from " , income_dir , "\r\n\n" ) )
this_listing <- strsplit( RCurl::getURL( income_dir , ssl.verifypeer = FALSE ) , "<br>" )[[1]]
ftp_files <- tolower( gsub( '(.*)\\">(.*)<\\/A>$' , "\\2" , this_listing ) )
# remove stata files and missing files
ftp_files <- ftp_files[ !( ftp_files %in% "" ) & !grepl( "\\.do$" , ftp_files ) ]
# base the catalog off of the sas scripts
inc_cat <-
data.frame(
year = gsub( "_imputed_income" , "" , this_income , ignore.case = TRUE ) ,
type = "ii" ,
sas_script = paste0( income_dir , grep( "\\.sas$" , ftp_files , value = TRUE , ignore.case = TRUE ) ) ,
imputed_income = TRUE ,
nhpi = FALSE ,
stringsAsFactors = FALSE
)
for( j in seq_len( nrow( inc_cat ) ) ){
inc_cat[ j , 'full_url' ] <-
paste0(
income_dir ,
ftp_files[ ftp_files %in% paste0( gsub( "\\.sas" , "" , basename( inc_cat[ j , 'sas_script' ] ) , ignore.case = TRUE ) , c( '.zip' , '.exe' ) ) ]
)
}
inc_cat$output_filename <-
paste0( output_dir , "/" , inc_cat$year , "/" , gsub( "\\.(.*)" , ".rds" , basename( inc_cat$full_url ) ) )
catalog <- rbind( catalog , inc_cat )
}
catalog <- catalog[ order( catalog[ , 'year' ] ) , ]
catalog
}
lodown_nhis <-
function( data_name = "nhis" , catalog , ... ){
on.exit( print( catalog ) )
tf <- tempfile()
for ( i in seq_len( nrow( catalog ) ) ){
# download the file
cachaca( catalog[ i , "full_url" ] , tf , mode = 'wb' )
unzipped_files <- unzip_warn_fail( tf , exdir = paste0( tempdir() , "/unzips" ) )
if( catalog[ i , 'imputed_income' ] ){
SAScii_start <- grep( "INPUT ALL VARIABLES|input the data from the ASCII file" , readLines( catalog[ i , 'sas_script' ] , encoding = 'latin1' ) ) + 1
# unzip the file into a temporary directory.
# the unzipped file should contain *five* ascii files
income_file_names <- sort( unzipped_files )
if( catalog[ i , 'year' ] >= 2019 ){
stopifnot( length( income_file_names ) == 1 )
ii <- read_SAScii( income_file_names , catalog[ i , 'sas_script' ] , beginline = SAScii_start , sas_encoding = 'latin1' )
names( ii ) <- tolower( names( ii ) )
ii$rectype <- NULL
if( any( names( ii ) %in% c( 'impnum_c' , 'impnum_a' ) ) ) names( ii )[ names( ii ) %in% c( 'impnum_c' , 'impnum_a' ) ] <- 'impnum'
for( j in 1:10 ) assign( paste0( "ii" , j ) , subset( ii , impnum == j ) )
# save all five imputed income data frames to a single .rds file #
saveRDS( mget( paste0( "ii" , 1:10 ) ) , file = catalog[ i , 'output_filename' ] , compress = FALSE )
catalog[ i , 'case_count' ] <- nrow( ii ) / 10
} else {
# loop through all five imputed income files
for ( j in 1:length( income_file_names ) ){
ii <- read_SAScii( income_file_names[ j ] , catalog[ i , 'sas_script' ] , beginline = SAScii_start , sas_encoding = 'latin1' )
names( ii ) <- tolower( names( ii ) )
ii$rectype <- NULL
assign( paste0( "ii" , j ) , ii )
}
# save all five imputed income data frames to a single .rds file #
saveRDS( mget( paste0( "ii" , 1:5 ) ) , file = catalog[ i , 'output_filename' ] , compress = FALSE )
catalog[ i , 'case_count' ] <- nrow( ii )
}
} else {
# if the zipped file includes a csv file, pick only the `.dat` file instead
if( length( unzipped_files ) > 1 ) unzipped_files <- unzipped_files[ grep( "\\.dat$" , tolower( unzipped_files ) ) ]
# ..and read that text file directly into an R data.frame
# using the sas importation script downloaded before this big fat loop
x <- read_SAScii( unzipped_files , catalog[ i , "sas_script" ] , sas_encoding = 'latin1' )
# convert all column names to lowercase
names( x ) <- tolower( names( x ) )
catalog[ i , 'case_count' ] <- nrow( x )
saveRDS( x , file = catalog[ i , 'output_filename' ] , compress = FALSE )
}
# delete the temporary files
file.remove( tf , unzipped_files )
cat( paste0( data_name , " catalog entry " , i , " of " , nrow( catalog ) , " stored at '" , catalog[ i , 'output_filename' ] , "'\r\n\n" ) )
}
on.exit()
catalog
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.