#' @title Filter Read CSV
#' @description Iteratively reads in and filters a CSV file based on specified filters.
#' @param path A file path to a local csv file.
#' @param batch_size The size of each csv chunk to be read in. If not specified, then the data will be
#' broken into 10 chunks by default.
#' @param filters vector of filters to be applied. Each filter should start with the column name and
#' NOT the name of the file; filter_read_csv will take care of this automatically. For example, if one
#' of the filters was to be applied on the column "Age", the filter would be included in the vector
#' as "Age < 40", NOT as "df$AGE < 40".
#' @return A filtered tibble.
#' @importFrom readr read_csv cols
#' @export
filter_read_csv <- function(path, batch_size, cols, filters){
check_filters(filters) #Ensure at least one filter is present
file_nrow <- length(count.fields(path)) #Get the number of rows present in the data
size <- ifelse(is.null(batch_size), round(file_nrow / 10), batch_size)
batches <- seq(from = 0, to = file_nrow + batch_size, by = size) #Prepare the batches based on the batch size and the overall file size
dat = NULL #Initialize an empty dataframe
cols.names <- colnames(read_csv(path, skip = batches[1], n_max = 0, col_types = ifelse(is.null(cols), cols(), cols), progress = FALSE)) #Get column names from data
filter_length <- length(filters) #Count number of filters inputed
for(i in 1:length(batches)){ #Iterate through every batch
data_temp <- read_csv(path, skip = batches[i], #Read in a batch_size amount of data
n_max = batch_size, col_names = cols.names, col_types = ifelse(is.null(cols), cols(), cols), progress = FALSE)
for(j in 1:filter_length){ #Iterate through every filter
data_temp <- subset(data_temp, eval(parse(text = paste("data_temp$", filters[j], sep = "")))) #Subset the data based on the given filters
}
dat <- rbind(dat, data_temp) #rbind the filtered data to the filtered data from previous batches
rm(data_temp_filtered) #Remove the filtered batch
cat(i, " of ", length(batches), " batches cleaned and read.", sep = "")
#cat(round(batches[i] / file_nrow * 100, 2), "% \r") #Print what percentage of the rows of the initial CSV file have been read and filtered through
}
dat #Return the filtered data
}
check_filters <- function(input){
if(is.null(input)){ #Check that the input is not empty
stop("\n'filters' must not be null")
}
TRUE
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.