#' @title The freadss() function reads in a csv and perhaps subsets the rows (optionally sampled)
#'
#' @seealso \code{\link[data.table]{fread}}
#' @description The input file is read in-memory via \code{\link[data.table]{fread}}.
#' If rows are subset, there is a slow down. Hence, subsetting rows costs a one-time slow down
#' but affords ability to bean count the read-in memory footprint
#'
#' @details NOTE: if both ss and ind_choose are NULL, no subsetting is done. Entire csv is read in.
#'
#' @param input character for file path of csv passed to \code{\link[data.table]{fread}}
#' @param ss integer for desired row sample size. Default of ss is NULL, meaning no subsampling.
#' @param ind_choose optional integer vector of specific rows to read in (instead of sampling)
#'
#' @return a 'data.frame' with optionally subsetted rows (perhaps from sampling)
#' @export
#'
#' @examples
#'
#' set.seed(1); m = matrix(rnorm(10*100),ncol=100,nrow=100)
#'
#' csv = data.frame(m)
#' names(csv) = paste0('x',seq_along(csv))
#' names(csv)
#'
#' tf = tempfile()
#' write.csv(csv,tf,row.names=FALSE)
#' dir_test=tf
#'
#' # if ss=NULL and ind_choose=NULL
#' # no sub sampling, basically fread() but no flexible optional args.
#' # just demo, might as well use fread() directly
#'
#' identical(freadss(input=dir_test),fread(dir_test))
#'
#' # user wants to sample 5 random rows
#' freadss(input=dir_test,ss=5)
#'
#' # user picks 5 specific rows
#' ind_pick = c(1,7,23,69,100)
#'
#' df_subset_before = freadss(input=dir_test,ind_choose = ind_pick)
#' df_subset_after = freadss(input=dir_test)[ind_pick,]
#' identical(df_subset_before,df_subset_after)
#'
# disabled optional ... args passed to fread()
# eg drop/keep bugs out since issue of original header name lost during do.call(fread,list_ss)
freadss = function(input,ss=NULL,
ind_choose=NULL){
# ss = 100 # samp size
# ind_choose=ind_pick
# negative ss
# ss and ind_choose non null
if((!is.null(ss))&&(!is.null(ind_choose))){stop('you can not have BOTH non-null ss and ind_choose')}
require(data.table)
# other args passed to fread()
# dots = list(...)
if((is.null(ss)&&is.null(ind_choose))==TRUE){
# no subsampling of rows
return(fread(input))
}
# else, subsample rows
# must know nrow beforehand
# input = '~/projects/datzen/tests/proto/test.csv'
num_rows = data.table::fread(paste0('wc -l ',input))[[1]] - 1
name_header_orig = names(fread(input,nrows=0))
if(ss > num_rows){stop('You have chosen to sub sample MORE rows than the original dataset, this goes against the philosophy of this function')}
# row index random sampled
if(is.null(ind_choose)&&(!is.null(ss))){
if(ss <= 0){stop('ss must be NULL or greater than 0')}
# note: hardcode replace=FALSE
# edge case where ss > nrow(dat_raw) and replace = TRUE
# goes against motivating use case to sub sample before read in
ind_samp = sample(x=(1:num_rows),size=ss,replace=FALSE)
ind_spec = as.integer(ind_samp)
}
# row index user specified
if(!is.null(ind_choose)&&(is.null(ss))){
ind_spec = as.integer(ind_choose)
}
# probably slightly faster, but storage overhead
v = rep(FALSE,num_rows)
v[ind_spec] = TRUE
# sum(v)
# v <- (1:num_rows %in% ind_spec)
seq = rle(v)
idx = c(0, cumsum(seq$lengths))[which(seq$values)] + 1
df_indx = data.frame(start=idx, length=seq$length[which(seq$values)])
# str(df_indx)
result = do.call(rbind,apply(df_indx,1, function(x) return(fread(input,nrows=x[2],skip=x[1]))))
names(result) = name_header_orig
# revisit drop keep optional args
# names(result) = name_header_orig[!(name_header_orig %in% drop)]
return(result)
}
########################################
#
#
# result = do.call(rbind,
# apply(X=df_indx,MARGIN=1,
# FUN=function(xx){
#
# # str(df_indx)
# # xx = df_indx[1,]
#
#
# # internal do.call(fread,X[[i]])
# # will chop off first global row (true header)
# # and following rows will be auto renamed to use V1 etc
# # behavior of fread()
#
# args_cust = list(input,
# # header=FALSE,
# nrows=unlist(xx[2]),
# skip=unlist(xx[1])
# )
#
# # str(args_cust)
#
# # append 'dots' from topmost scope
# # args_all = append(args_cust,dots)
#
# args_all = args_cust
#
#
# return(do.call(fread,args_all))
#
# }))
#
#
# one line at time is signifig slower
#
#
# freadss = function(input,ss=10,replace=TRUE,ind_choose=NULL,...){
#
# require(data.table)
#
# # other args passed to fread()
# dots = list(...)
#
# # must know nrow beforehand
# # input = '~/projects/datzen/tests/proto/test.csv'
#
# num_rows = data.table::fread(paste0('wc -l ',input))[[1]] - 1
#
# if(is.null(ind_choose)){
# # use ind_samp
# if(num_rows < ss){
# ind_samp = sample(x=(1:nrow(dat_raw)),size=ss,replace=TRUE)
# warning('nrow() less than ss, so will force replace=TRUE')
#
# } else {
# ind_samp = sample(x=(1:num_rows),size=ss,replace=replace)
# }
# ind_spec = as.integer(ind_samp)
# } else {
# # ind_choose = c(1:50,53, 65,77,90,100:200,350:500, 5000:6000)
# ind_spec = as.integer(ind_choose)
# }
#
#
# # 1 line at a time version
#
# result = do.call(rbind,
# lapply(X=ind_spec,
# FUN=function(xx){
#
# # str(df_indx)
# # xx = df_indx[1,]
#
# # read 1 entry at a time without rle()
# # nrows=ind_spec
# # skip=1
#
# args_cust = list(input=input,
# nrows=1,
# skip=unlist(xx))
#
# # str(args_cust)
# # append 'dots' from topmost scope
#
# args_all = append(args_cust,dots)
# # do.call(fread,args_all)
#
# return(do.call(fread,args_all))
#
# }))
#
# return(result)
# }
#
# I don't want to be that random! Can you just specifically give me rows 69,23, and 7 ?
#
# ```{r message=FALSE}
# freadss(input=tf,ind_choose=c(69,23,7)) %>% str
# ```
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.