claims_db/phclaims/load_raw/tables/count_lines.apcd_all_tables.R

#### COUNT LINES IN APCD EXTRACTED TABLE CHUNKS
#
# Eli Kern, PHSKC (APDE)
#
# 2019-10
#
# Notes:
# 1) I compared countLines with using wc.exe in Rtools, with the latter being slightly faster
# 2) To be able to use wc.exe from Rtools, must add C:\Rtools\bin to the Path variable in environmental variables
#
# Run time: 8.5 hours, 156 table chunks, 10/14/19


#### Set up global parameter and call in libraries ####
options(max.print = 350, tibble.print_max = 50, warning.length = 8170, scipen = 999)
library(tidyverse)
library(openxlsx)
read_dir <- "//kcitsqlutpdbh51/ImportData/Data/APCD_data_import"
write_path <- "C:/Users/kerneli/King County/King County Cross-Sector Data - Documents/ETL/"

#Create list of tables
table_list <- list("dental_claim_export", "eligibility_export", "medical_claim_export", "member_month_detail_export", "pharmacy_claim_export", 
                   "provider_export", "provider_master_export", "provider_practice_roster_export")

#Loop over table chunks within each table folder and count lines
system.time(apcd_row_count <- lapply(table_list, function(x) {
  
  #Loop over table chunk list
  file_list <- as.list(list.files(path = file.path(read_dir, x), pattern = "*.csv", full.names = T))
  
  #Count lines and save as data frame
  inner_df <- lapply(file_list, function(x) {
    print(x)
    args <- paste0("-l ", x)
    result <- system2(command = "wc", args = args, stdout = TRUE)
    result_parse <- str_split(result, " ", simplify = T)
    df <- data.frame(table_chunk = result_parse[2], line_count = result_parse[1], stringsAsFactors = F)
  }) %>%
    bind_rows()  
}) %>%
  bind_rows())

#Export results
today <- Sys.Date()
filename <- paste0(write_path, "apcd_table_chunk_row_count_", today, ".xlsx")
write.xlsx(apcd_row_count, file = filename, sheet = "results")
PHSKC-APDE/Medicaid documentation built on March 25, 2020, 3:16 p.m.