tidbits: Various small utilities for simplifying data preparation and analysis

Documented in allTheData autoread colinfo fullargs git_add git_autoconf git_checkout git_diff_filter git_ignore git_lsfiles git_move git_newbranch git_other git_push git_rename git_ssh git_status git_subupd grepor instrequire submulti systemRootDir t_autoread tblinfo v vec2m with_attrs

# small utils ------------------------------------------------------------------

#' @importFrom magrittr %>%
#' @export
#' @noRd
magrittr::`%>%`


#' This function takes a list of package names, loads them if they are
#' available, otherwise attempts to install each one and then again
#' attempts to load it.
instrequire <- function(pkgs # nodeps
                        ,quietly=TRUE
                        # the dependencies argument is ignored and is only here
                        # so that it doesn't end up in the '...'
                        ,dependencies=TRUE
                        ,repos=getOption('repos','https://cran.rstudio.com/')
                        ,...){
  pkgs_installed <- sapply(pkgs,require,character.only=TRUE);
  if(length(pkgs_needed <- names(pkgs_installed[!pkgs_installed]))>0){
    utils::install.packages(pkgs_needed,repos=repos,dependencies = TRUE,...);
    pkgs_final <- sapply(pkgs_needed,require,character.only=TRUE
                         ,quietly=quietly);
    if(!all(pkgs_final)){
      stop(c('the following required packages could not be installed:\n'
             ,paste0(names(pkgs_final[!pkgs_final]),collapse = ', ')));
    }
  };
}

clean_slate <- function(command="",removepatt='^\\.RData$|*.R\\.rdata$' # deps:git_subupd
                        ,all=TRUE,cleanglobal=TRUE
                        ,updatemodules=!file.exists('.developer')
                        ,envir=parent.frame()){
  if(!interactive()) warning('This function is intended to run in an '
                            ,'interactive session to restart that\n  '
                            ,'session on a clean slate. If you are calling it '
                            ,'non-interactively  (from a\n  script or '
                            ,'function), don\'t expect any code that you put '
                            ,'after it to work!');
  # remove cached files
  file.remove(list.files(pattern=removepatt,all.files=TRUE,recursive=TRUE,full.names = TRUE));
  # Update the git submodules
  if(updatemodules) git_subupd();
  # clear out calling environment
  rm(list=ls(all.names=all,envir = envir),envir = envir);
  # also global environment if specified
  if(cleanglobal) rm(list=ls(all.names=all,envir=.GlobalEnv),envir = .GlobalEnv);
  # if rstudioapi available, use it to restart the session
  if(requireNamespace('rstudioapi') && rstudioapi::isAvailable()){
    rstudioapi::restartSession(command)};
}

#' Append or replace attributes of any object in a pipeline-frindly way.
#'
#' @param xx        Object whose attributes to modify and then return the object
#' @param rep.attrs Named list of attributes to create or replace
#' @param app.attrs Named list of attributes to create or append to
#'
#' @return The object `xx` with modified attributes
#' @export
#'
#' @examples
#' # Change an object's attribute
#' with_attrs(iris,list(class='list'))
#'
#' # Create a new attribute for an object
#' foo <- with_attrs(LETTERS,list(comment='Hello world'))
#' comment(foo)
#' foo <- with_attrs(foo,rep='One more comment.')
#' comment(foo)
#'
with_attrs<-function(xx,rep.attrs=list(),app.attrs=list()){ # nodeps
  attrs <- attributes(xx); if(is.null(attrs)) attrs<-list();
  for(ii in names(rep.attrs)) attrs[[ii]] <- rep.attrs[[ii]];
  for(ii in names(app.attrs)) attrs[[ii]] <- c(attrs[[ii]],app.attrs[[ii]]);
  attributes(xx) <- attrs;
  xx;
}

#' takes input and returns it with a comment attribute
cm <- with_cm <- function(xx,comment=NULL,append=T # deps:with_attrs
                          ,transf=stringr::str_squish){
  if(!is.null(transf)) comment <- transf(comment);
  if(append) with_attrs(xx,app.attrs=list(comment=comment)) else {
    with_attrs(xx,rep.attrs=list(comment=comment));
  }
}


getCall.list <- getCall.data.frame <- getCall.gg <- function(x,...) {attr(x,'call')};

# why not update calls?
update.call <- function(object,...){
  dots <- list(...);
  for(ii in names(dots)) object[[ii]] <- dots[[ii]];
  object;
}

#' Stack a vector to form a matrix with repeating rows, with optional
#' column names and transformation
#'
#' @param  vv    A vector which will become the row of a matrix
#' @param  nr    Number of (identical) rows this matrix will contain
#' @param  trans An optional function that can take a matrix as its
#'              sole argument. Useful functions include `as.data.frame()`
#'              `as_tibble()` and `as.table()`
#' @return A matrix, unless the function specified in the `trans` argument
#'         causes the output to be something else.
#' @export
#'
#' @examples
#' vec2m(1:10,5);
#' vec2m(1:10,5,tr=data.frame);
#' vec2m(setNames(1:12,month.name),3);
vec2m <- function(vv,nr=1,trans=identity) {
  return(trans(matrix(as.matrix(c(vv)),nrow=nr,ncol=length(vv),byrow=T
                      ,dimnames=list(NULL,names(vv)))));
}

#' returns call with ALL arguments specified, both the defaults and those
#' explicitly provided by the user
fullargs <- function(syspar=sys.parent(),env=parent.frame(2L),expand.dots=TRUE){
  fn <- sys.function(syspar);
  frm <- formals(fn);
  cll <- match.call(fn,sys.call(syspar),expand.dots = expand.dots,envir = env);
  defaults <- setdiff(names(frm),c(names(cll),'...'));
  for(ii in defaults) cll[[ii]] <- frm[[ii]];
  return(cll);
}

#' Take a set of objects coercible to matrices and perform sprintf on them while
#' preserving their dimensions (obtained from the first argument of ...)

# figure out how the current OS represents the top of its file system
systemRootDir <- function(){
  dir <- dirname(normalizePath('.'));
  newdir <- dirname(dir);
  while(dir!=newdir){dir<-newdir; newdir <- dirname(newdir)}
  return(newdir);
}


# extract the error message of the argument
getTryMsg <- function(xx,ifNotErr=xx){ # revdeps: t_autoread
  if(methods::is(xx,'try-error')) return(attr(xx,'condition')$message);
  return(ifNotErr);}

# to be used inside a function to get a list of unevaluated calls
# from all the ... args
getParentDots <- function(xx,call=sys.call(-1),fun=sys.function(-1)){ # revdeps: colinfo,tblinfo
  out <- list();
  for(ii in setdiff(names(call),c(names(formals(fun)),''))){
    out[[ii]] <- call[[ii]]};
  out;
}

# Credit: http://conjugateprior.org/2015/06/identifying-the-os-from-r/
get_os <- function(){ # nodeps
  sysinf <- Sys.info();
  if (!is.null(sysinf)){
    os <- sysinf['sysname'];
    if (os == 'Darwin') os <- "osx";
  } else { ## mystery machine
    os <- .Platform$OS.type;
    if (grepl("^darwin", R.version$os)) os <- "osx";
    if (grepl("linux-gnu", R.version$os)) os <- "linux";
  }
  tolower(os);
};


systemwrapper <- function(cmd='',...,VERBOSE=getOption('sysverbose',T)
                          ,CHECKFILES=c('files')){ # nodeps
  args <- list(...); sysargs <- list();
  # separate out the args intended for system
  for(ii in intersect(names(args),names(formals(system)))){
    sysargs[[ii]] <- args[[ii]]; args[[ii]] <- NULL;};
  # check to make sure all arguments listed in checkfiles contain only files
  # that exist
  for(ii in intersect(CHECKFILES,names(args))){
    if(!all(.exist <- file.exists(args[[ii]]))){
      stop('The following files cannot be found:\n'
           ,paste(args[[ii]][!.exist],collapse=', '))}};
  for(xx in args) cmd <- paste(cmd,paste(xx,collapse=' '));
  if(VERBOSE) message('Executing the following command:\n',cmd);
  return(do.call(system,c(command=cmd,sysargs)));
}
# git ----
#' git checkout
git_checkout <- function(which=getOption('git.workingbranch','master'),...){
  systemwrapper('git checkout',which,...)};

gco <- git_checkout;

git_commit <- function(files='-a',comment
                       ,autopush=getOption('git.autopush',T),...){
  .changed<-git_status(VERBOSE=F,intern=T);
  filenames <- if(!missing(files)){
    paste0(paste(files,collapse=','),': ')} else 'multi: ';
  comment <- paste0('"',filenames,comment,'"');
  systemwrapper('git commit',files,'-m',comment,...);
  if(autopush) git_push();}

gci <- git_commit;

#' List the files in the repo having a particular status
#'
#' Quoting from git documentation (\code{git help diff}):
#' \emph{Select only files that are Added (A), Copied (C), Deleted (D),
#' Modified (M), Renamed (R), have their type (i.e. regular file, symlink,
#' submodule, ...) changed (T), are Unmerged (U), are Unknown (X), or have had
#' their pairing Broken (B). Any combination of the filter characters (including
#' none) can be used. When * (All-or-none) is added to the combination, all
#' paths are selected if there is any file that matches other criteria in the
#' comparison; if there is no file that matches other criteria, nothing is
#' selected.}
#'
#' @param xx String containing one or more of A,C,D,M,R,T,U,X,B, or *
git_diff_filter <- function(xx) {
  system(paste('git diff --name-only --diff-filter',xx),intern=T)};

#' Nicely formatted and concise status of current git repo.
git_status <- function(print=T
                       ,diff_filters=list(Added='A',Copied='C',Deleted='D'
                                          ,Modified='M',Renamed='R'
                                          ,ChangedType='T',Unmerged='U'
                                          ,Unknown='X',Broken='B')
                       ,...){
  branch <- system('git rev-parse --abbrev-ref HEAD',intern=T);
  tracking <- system('git rev-parse --abbrev-ref --symbolic-full-name @{u}'
                     ,intern=T);
  commits <- if(length(tracking)==0) character(0) else {
    system(paste('git log',paste0(tracking,'..',branch),'--oneline')
           ,intern=T)};
  diffs <- lapply(diff_filters,git_diff_filter);
  if(print){
    message('Branch: ',branch);
    if(length(commits)>0) {
      message('Ahead of ',tracking,' by ',length(commits),' commit'
              ,if(length(commits)>1) 's.' else '.')} else {
                if(!any(sapply(diffs,length)>0)){
                  message('All local changes have already been pushed')}};
    # TODO: check for un-pulled upstream changes
    for(ii in names(diffs)) if(length(diffs[[ii]])>0){
      message(ii,':'); cat(paste(' ',diffs[[ii]]),sep='\n');}
    }
  invisible(list(branch=branch,tracking=tracking,commits=commits
                 ,diffs=diffs));
  }
gst <- git_status;

#' List only the files currently being tracked by git
git_lsfiles <- function(...) {systemwrapper('git ls-files',...)};

#' Whatever other git functions that aren't explicitly implemented yet. Just put
#' any combination of git arguments as arguments to this function, leaving out
#' \code{git} itself.
git_other <- function(...){systemwrapper('git',...)};
git_ <- git_other;

#' Make the specified file start getting tracked by the current git repository.
git_add <- function(files,...){
  systemwrapper('git add',files=files,...)};
gadd <- git_add;

#' Rename a git file, so git knows you didn't delete it.
git_rename <- function(from,to,...){systemwrapper('git rename',from,to,...)};

#' Move a git file, so git knows you didn't delete it.
git_move <- function(from,to,...) {systemwrapper('git mv',from,to,...)};

#' Push committed changes to the origin (for example, but not necessarily,
#' github.com)
git_push <- function(...) {systemwrapper('git push',...)};
gp <- git_push;

#' Create a new branch \emph{and} check it out immediately. Optionally also
#' push.
git_newbranch <- function(branch,pushorigin=F,...){
  systemwrapper('git checkout -b',branch,...);
  if(pushorigin) systemwrapper('git push origin',branch);
}
gbr <- git_newbranch;

# TODO: detect conflicts in advance and ask what to do
git_merge <- function(which,fastfwd=getOption('git.fastfwd',F)
                      ,verbose=getOption('git.verbose',T),...){
  cmd <- paste('git merge',if(!fastfwd) '--no-ff' else '',...);
  if(verbose) message('Executing the following command:\n',cmd);
  system(cmd);}
gmr <- git_merge;

#' Delete and re-download git submodules if any exist.
#'
#' @param stopfile The name of a file which, if exists, will cause this function
#'                 to exit without doing anything. Will silently return errors
#'                 from shell but will not throw an error.
#'
#' @return If successful, \code{0}, otherwise an error code.
#' @export
#'
#' @examples
#'
#' \dontrun{ git_subupd() }
git_subupd <- function(stopfile='.developer'){if(!file.exists(stopfile)){
  unlink(systemwrapper("git submodule --quiet foreach 'echo $path'"
                       ,intern=TRUE,VERBOSE=FALSE)
         ,recursive = TRUE,force = TRUE);
  systemwrapper('git submodule update --init --recursive --remote')} else {
    message('Developer mode-- ignoring.'); return(0);
  }};

#' Automatically configure your global .gitconfig with your name and email
#' (if not yet thus configured) so that git will allow you to commit changes
git_autoconf <- function(upstream=getOption('git.upstream'),...){
  # should only be run in an interactive context
  if(!'upstream' %in% system('git remote',intern=T) && !is.null(upstream)){
    systemwrapper('git remote add upstream',upstream);
  }
  # Set username and email
  if(length(.username <- system('git config user.name',intern=T))==0){
    message("Please type in your name as you want it to appear in git logs:");
    .username <- paste0('"',readline(),'"');
    systemwrapper('git config --global user.name',.username)};
  if(length(.useremail <- system('git config user.email',intern=T))==0){
    message("Please type in your email as you want it to appear in git logs:");
    .useremail <- paste0('"',readline(),'"');
    systemwrapper('git config --global user.email',.useremail)};
}



#' Title: Add a pattern to a .gitignore file
#'
#' @param patterns A character vector of patterns to ignore. Required.
#'                 Always appended. If you need to un-ignore something
#'                 you will have to edit .gitignore manually.
#' @param ignorepath Path to .gitignore (you can have multiple ones)
#'                   current directory by default.
#' @param preamble What to put in the line/s before a set of ignore
#'                 patterns. Empty line by default, set to NULL if you
#'                 want to not skip a line.
#'
#' @return NULL
#' @export
#'
#' @examples git_ignore(c('*.csv','*.tsv'))
git_ignore <- function(patterns,ignorepath='.',preamble='') {
  write(c(preamble,patterns),file.path(ignorepath,'.gitignore'),append=T)};

#' Switch between ssh authentication and ssl authentication for a git repo.
#'
#' A use-case for this is some environments that by default initialize projects
#' as ssl/https (e.g. RStudio Cloud) but some users may prefer ssh
#' authentication. This easily converts between the two settings without having
#' to remember the whole git command. Will silently return errors from shell but
#' will not throw an error.
#'
#' @param tossh If `TRUE`, will attempt to convert the remote.origin.url from
#'              https to ssh. Default: `TRUE`
#' @param sshstr A string to use as the prefix for ssh connection. Optional,
#'               defaults to the values used by github.com
#' @param sslstr A string to use as the prefix for the ssl connection. Optional,
#'               defaults to the values used by github.com.
#'
#' @return Invisibly returns `0` or an error code.
#'
#' @export
#' @examples
#' \dontrun{
#' # Convert from https://github.com/... to git@github.com:...
#' git_ssh()
#'
#' # Convert from git@github.com:... to https://github.com/...
#' git_ssh(FALSE)
#'
#' }
git_ssh <- function(tossh=TRUE,sshstr='git@github.com:'
                    ,sslstr='https://github.com/'){
  currentorigin <- systemwrapper('git config remote.origin.url',intern=TRUE);
  message('Current origin: ',currentorigin);
  matchrepl <- if(tossh) c(sslstr,sshstr) else c(sshstr,sslstr);
  matchrepl[1]<-paste0('^',matchrepl[1]);
  neworigin <- gsub(matchrepl[1],matchrepl[2],currentorigin);
  message('Setting origin to: ',neworigin);
  systemwrapper('git config remote.origin.url',neworigin);
  systemwrapper('git remote -v');
}


# TODO: git nagger

# renaming and remapping  ----


#' Take a character vector and perform multiple search-replace
#' operations on it.
#' @param xx A \code{vector} of type \code{character} (required)
#' @param searchrep A \code{matrix} with two columns of type \code{character} (required). The left column is the pattern and the right, the replacement.
#' @param method One of 'partial','full', or 'exact'. Controls whether to replace only the matching regexp, replace the entire value that contains a matching regexp, or replace the entire value if it's an exact match.
submulti <- function(xx,searchrep
                     ,method=c('partial','full','exact'
                               ,'starts','ends','startsends')){
  # if no method is specified by the user, this makes it take the first value
  # if a method is only partially written out, this completes it, and if the
  # method doesn't match any of the valid possibilities this gives an informativ
  # error message
  method<-match.arg(method);
  # if passed a single vector of length 2 make it a matrix
  if(is.null(dim(searchrep))&&length(searchrep)==2) searchrep<-rbind(searchrep);
  # rr is a sequence of integers spanning the rows of the searchrep matrix
  rr <- 1:nrow(searchrep);
  # oo will hold the result that this function will return
  oo <- xx;
  switch(method
         ,partial = {for(ii in rr)
           oo <- gsub(searchrep[ii,1],searchrep[ii,2],oo)}
         ,full =    {for(ii in rr)
           oo[grepl(searchrep[ii,1],oo)]<-searchrep[ii,2]}
         ,exact = {for(ii in rr)
           oo[grepl(searchrep[ii,1],oo,fixed=T)]<-searchrep[ii,2]}
           #oo <- gsub(searchrep[ii,1],searchrep[ii,2],oo,fixed = T)}
         ,starts = {for(ii in rr)
           oo <- gsub(paste0('^',searchrep[ii,1]),searchrep[ii,2],oo)}
         ,ends = {for(ii in rr)
           oo <- gsub(paste0(searchrep[ii,1],'$'),searchrep[ii,2],oo)}
         ,startsends = {for(ii in rr)
           oo <- gsub(paste0('^',searchrep[ii,1],'$'),searchrep[ii,2],oo)}
  );
  oo;
}

#' Take a data.frame or character vector and a vector of grep targets and return
#' the values that match (for data.frame, column names that match). If no
#' patterns given just returns the names
#' @param xx A \code{data.frame} or character vector (required)
#' @param patterns A character vector of regexp targets to be OR-ed
grepor <- function(xx,patterns='.') {
  if(is.list(xx)) xx <-names(xx);
  grep(paste0(patterns,collapse='|'),xx,value=TRUE);
}



# table utilities -----------------------------------

#' Extends trailR package with integrated universal (almost) file reader
#'
#' @param file Any of the common delimited file formats
#'
#' @export
t_autoread <- function(file,...){ #deps: getTryMsg
  # make sure prerequisite function exists
  if(requireNamespace('trailR')){
    do.call(trailR::tread,c(list(file,readfun=autoread),list(...)))} else {
      stop("The 't_autoread()' only works if the trailR package is installed")
    }};


#' Autoguessing function for reading most common data formats
#'
#' Supported so far are: xls, xlsx, csv and most other delimited text formats,
#' SPSS, Stata, and SAS.
#'
#' @param file       The name of a file you want to read into R
#' @param na         Vector of strings that should get translated to `NA` upon
#'                   import. Optional, defaults to a reasonable set of values.
#' @param fixnames   A function that normalizes column names after importing the
#'                   data. If you want to leave them untouched, set this equal
#'                   to `identity()`. Optional, defaults to making them lower
#'                   case, R-legal, and unique.
#' @param file_args  This is to easily pass project-level or script-level
#'                   defaults in the form of an `alist()` to whichever lucky
#'                   function ends up winning the contest to read your file.
#'                   Only names that match the formal arguments of your function
#'                   will be used, the rest will be silently ignored. This way,
#'                   you can pass some `read_xlsx` specific arguments without
#'                   worrying that something else will intercept them and error
#'                   out.
#' @param ...        Additional named arguments passed to this function will
#'                   be added to those in `file_args` overriding any that have
#'                   matching names.
#'
#' @return A `tibble`
#' @importFrom readxl read_xls read_xlsx excel_sheets
autoread <- function(file,na=c('','.','(null)','NULL','NA')
                     # change this to identity to do nothing to names
                     ,fixnames=function(xx) {
                       stats::setNames(xx,tolower(make.names(names(xx),unique = TRUE)))}
                     ,file_args=list(),...){
  if(!RCurl::url.exists(file)){
    if(!file.exists(file)) stop(sprintf('File "%s" not found.'),file);
    if(dir.exists(file)) stop(sprintf('"%s" is not a file, it\'s a directory.')
                              ,file)};
  args <- list(...);
  # allow file_args to be overridden by ... args, while preserving
  # order of ...
  for(ii in intersect(names(args),names(file_args))) file_args[[ii]] <- NULL;
  xlformat <- readxl::format_from_signature(file);
  args <- c(file_args,args);
  reader <- if(!is.na(xlformat)) paste0('read_',xlformat) else 'auto';
  if(reader == 'auto' && nrow(enc<-readr::guess_encoding(file))>0){
    # if it's a zip file, this unzips it and replaces the original file arg
    # with the temporary unzipped version
    unzfile <- suppressWarnings(utils::unzip(file,exdir = tempfile("autoread")));
    if(length(unzfile)>1){ if(!'sheet' %in% names(args)){
      warning(
        "\nMultiple files found in ",file,":\n"
        ,paste(list.files(dirname(unzfile),recursive = T,all.files = T)
               ,collapse=',')
        ,"\nReading in the first file. If you want a different one"
        ,"\nplease specify a 'sheet' argument");
      unzfile <- unzfile[1]} else unzfile <- unzfile[args$sheet]};
    if(length(unzfile==1)){
      message('Reading unzipped file',basename(unzfile));
      file <- unzfile;}
    message('Trying to read as a text file with fread()')
    # try to read as a delimited file via fread
    txargs <- args[intersect(names(args)
                             ,names(formals(data.table::fread)))];
    txargs$na.strings <- na;
    out <- try(tibble::as_tibble(do.call(data.table::fread
                                         ,c(list(input=file),txargs)))
               ,silent = T);
    if(!methods::is(out,'try-error')) return(fixnames(out));
    message('fread() failed! Falling back on read_delim');
    txargs <- args[intersect(names(args),names(formals(readr::read_delim)))];
    txargs$na <- na;
    txargs$delim <- '\t';
    suppressMessages(out <- try({
      problems<-problems(oo<-do.call(readr::read_delim,c(list(file=file)
                                                         ,txargs)));
      oo},silent=T));
    if(!methods::is(out,'try-error') && ncol(out)>1) return(fixnames(out)) else out_tab <- out;
    txargs$delim <- ',';
    suppressMessages(out <- try({
      problems<-problems(oo<-do.call(readr::read_delim,c(list(file=file)
                                                         ,txargs)));
      oo},silent=T));
    if(!methods::is(out,'try-error')) return(fixnames(out));
    cat('\nGuessed encoding:\n');print(enc);
    stop(attr(out,'condition')$message);
  }
  # try various binary formats
  if(reader %in% c('read_xls','read_xlsx')){
    # check for Excel formats
    message('checking sheets in workbook');
    sheets <- readxl::excel_sheets(file);
    if(length(sheets)>1 && !'sheet' %in% names(args)){
      warning(
        "\nMultiple sheets found:\n",paste(sheets,collapse=', ')
        ,"\nReading in the first sheet. If you want a different one"
        ,"\nplease specify a 'sheet' argument")};
    xlargs <- args[intersect(names(args)
                             ,names(formals(eval(as.name(reader)))))];
    xlargs$na <- na;
    message('About to read Excel file');
    out <- do.call(reader,c(list(path=file),xlargs));
    message('Fixing column names on Excel file');
    out <- fixnames(out);
    return(out)};

  # SPSS, SAS, and Stata
  # one of these has some error message that bubbles through despite silent=T
  # so we sink before the for loop, unsink if one of the readers succeeds...
  sink(tempfile());
  for(ff in c(haven::read_sav,haven::read_por,haven::read_dta,haven::read_sas
              ,haven::read_xpt)){
      {
        if(!methods::is(try(out <- ff(file),silent=T),'try-error')){
          sink();
          return(fixnames(out))}}
  }
  # and unsink at the end if none of them succeed
  sink();

  message('\nUnknown file type?\n');
  stop(attr(out,'condition')$message);
  }

#' Sumarize a table column
colinfo <- function(col,custom_stats=alist(),...){
  nn <- length(col);
  nona <- stats::na.omit(col);
  isna <- is.na(col);
  coltab <- table(nona);
  out <- list(class=paste0(class(col),collapse=':')
              ,uniquevals=length(coltab)
              ,isnum=is.numeric(col)
              ,frc_int=if(is.numeric(nona)) mean(nona%%1==0) else 0
              ,n_nonmissing=nn-sum(isna)
              ,n_missing=sum(isna)
              ,frc_missing=mean(isna)
              ,n_nonrepeat=sum(coltab==1)
              ,frc_nonrepeat=sum(coltab==1)/length(nona)
              ,top3=paste0('"',names(sort(coltab,decreasing = T)[1:3]),'"'
                              ,collapse='; ')
  );
  for(ii in names(custom_stats)){
    out[[ii]] <- eval(custom_stats[[ii]],envir = out)};
  dots <- getParentDots();
  for(ii in names(dots)) out[[ii]] <- eval(dots[[ii]],envir=out);
  out;
  }

#' Create an automated data dictionary
#'
#' @param dat          An object that inherits from `data.frame`
#' @param custom_stats An `alist` of statistics to calculate on each
#'                     column of `dat` in addition to the defaults
#'                     in `info_cols` (below). Optional.
#' @param info_cols    Another `alist`, this one has default values but
#'                     can be overridden on an all-or-none basis.
#' @param ...          Eats any extra arguments, to keep them from
#'                     causing trouble.
#'
#' @return A data-frame having one row for each column in `dat`
#' @export
#'
#' @examples
#'
#' tblinfo(datasets::iris)
#'
tblinfo <- function(dat,custom_stats=alist()
                    # some handy column groupers
                    ,info_cols=alist(
                       c_empty=frc_missing==1,c_uninformative=n_nonmissing<2
                      ,c_ordinal=uniquevals<10&isnum
                      ,c_tm=uniquevals==1&n_missing>0
                      ,c_tf=uniquevals==2,c_numeric=isnum&!c_ordinal
                      ,c_factor=uniquevals<20&!isnum
                      ,c_complex=!(c_ordinal|c_tm|c_tf|c_numeric|c_factor)
                    ),...){
  out <- dplyr::bind_rows(sapply(dat,colinfo,custom_stats=custom_stats,simplify=F)
                   ,.id='column');
  for(ii in names(info_cols)) out[[ii]] <- eval(info_cols[[ii]],envir=out);
  dots <- getParentDots();
  for(ii in names(dots)) out[[ii]] <- eval(dots[[ii]],envir=out);
  class(out)<-c('dtdict',class(out));
  return(out);
}


#' Returns a list of column names from the data dictionary for which the column
#' named in the first argument is true. The first arg can be either a string or
#' a name. The second must be a data.frame
#'
#' @param var        Either a string or a name, of a column in `dictionary`
#' @param dat        An optional data.frame, to constrain which rows of the
#'                   'dictionary' object get used
#' @param retcol     Which column to return-- by default the same as used for 'matchcol'
#' @param dictionary A 'data.frame' that is used as a data dictionary. It must at
#'                   minimum contain a column of column-names for the dataset for
#'                   which it is a data dictionary ('matchcol') and one or more
#'                   columns each representing a _group_ of columns in the dataset,
#'                   such that a TRUE or T value means the column whose name is
#'                   the value of 'matchcol' is the name of a column in the data
#'                   that belongs to the group defined by the grouping column.
#'                   These grouping columns are what the argument 'var' is
#'                   supposed to refer to. We will use the convention that grouping
#'                   column names begin with 'c_' but this convention is not
#'                   (currently) enforced programmatically.
#'
#' @examples
#'
#' dct0 <- tblinfo(mtcars);
#'
#' v();
#'
#' # Numeric variables in mtcars that behave like discrete variables
#' v(c_ordinal);
#' # Numeric variables in mtcars
#' v(c_numeric);
#' # Variables in mtcars that only have two values, so could be encoded as
#' # boolean
#' v(c_tf);
#'
#' # Non-default data dictionary
#' dct1 <- tblinfo(state.x77)
#' v(c_ordinal,dict=dct1)
#' v(c_factor,dict=dct1)
#' v(c_tf,dict=dct1)
#' v(c_numeric,dict=dct1)
#'
#' @export
v <- function(var,dat
              ,retcol=getOption('tb.retcol','column')
              ,dictionary=get('dct0')
              ,asname=F) {
  # convenience function: if forgot what column names are available, call with
  # no arguments and they will be listed
  if(missing(var)) return(names(dictionary));
  # support both standard or non-standard evaluation
  var<-as.character(substitute(var));
  # TODO: Think about what to do when nothing matches... not necessarily an error
  #       condition, might just be something to warn about and move on.
  out <- unique(as.vector(na.omit(unlist(dictionary[dictionary[[var]],retcol]))));
  if(!is(try(cnames<-colnames(dat),silent = T),'try-error')&&length(cnames)>0) {
    out <- out[out%in%cnames];}
  if(asname) out <- lapply(out,as.name);
  #return(unname(out));
  return(out);
}


# string hacking ---------------------------------------------------------------

# Project Utilities ----
personalizeTemplate <- function(file,title='TITLE',author='AUTHOR'
                                ,deps=c('dictionary.R'),packages=c()
                                ,date=Sys.Date(),template='TEMPLATE.R'
                                ,path_to_global
                                ,paths=c('.','..','scripts')
                                ,notebook=F){
  # TODO: interactively prompt for non-default title and author if interactive
  # TODO: ask if want to be prompted for the other arguments if interactive
  if(length(deps)>0){
    .files <- sapply(deps,function(ii) !is.null(find_path(ii,paths)));
    if(!all(.files)) stop(
"Of the files you specified in the 'deps' argument the following are missing:\n"
      ,paste0(deps[!.files],collapse=', '))};
  # make sure the template exists
  whichtemplate <- find_path(template,paths);
  if(is.null(whichtemplate)) stop(sprintf('Cannot find file %s',template));
  # make sure global.R exists
  if(missing(path_to_global)) path_to_global<-find_relpath('global.R'
                                                           ,c('.','..','../..')
                                                           ,recursive = T
                                                           ,normalize = F);
  if(length(path_to_global)==0) stop("Cannot find file 'global.R'");
  out <- sprintf(readLines(whichtemplate)
                 ,title # The title that will appear in the header
                 ,author # Author, ditto
                 ,format(date,'%d/%m/%Y') # Date, ditto
                 # packages (optional)
                 ,paste('c(',paste0("'",packages,"'",collapse=','),')')
                 ,file # ...so the file knows it's own name!
                 # dependencies on previously run files
                 ,paste('c(',paste0("'",deps,"'",collapse=','),')')
                 # location of global.R
                 ,path_to_global[1]
  );
  write(out,file);
  if(notebook) knitr::spin(file,knit=F);
}

find_path <- function(file,paths=c('.','..')){
  # get the basename of the file
  filebase <- basename(file);
  # generate a search-paths for this file, starting with the path component
  # of 'file'
  filedirs <- if(filebase!=file) dirname(file) else c();
  filedirs <- normalizePath(unique(c(filedirs,paths)));
  # return the first full path in which the file is found to exist
  for(ii in file.path(filedirs,filebase)) if(file.exists(ii)) return(ii);
  return(c());
}

find_relpath <- function(file,paths=c('..','../..','.'),recursive=F
                         ,normalize=T){
  filebase <- basename(file);
  paths<-c(if(filebase!=file && file.exists(dirname(file))){
    dirname(file)} else c(),paths);
  for(ii in paths){
    .paths <- file.path(c(ii,list.dirs(ii,full.names = T,recursive=recursive))
                        ,filebase);
    if(any(.found<-file.exists(.paths))){
      return(if(normalize) normalizePath(.paths[.found]) else .paths[.found])};
  }
  # if returns empty vector means none found
  return(c());
}

load_deps <- function(deps,scriptdir=getwd(),cachedir=scriptdir
                      ,fallbackdir='scripts',envir=parent.frame()
                      ,loadfn=if(exists('tload')) tload else load ){
  if(length(deps)==0||identical(deps,'')){message('No dependencies.');return();}
  # what objects got loaded by this function
  loadedobj=c();
  for(ii in deps){
    # if a cached .rdata file for this dependency cannot be found...
    if(is.null(iicached<-find_path(paste0(ii,'.rdata')
                                   ,c(cachedir,scriptdir,fallbackdir)))){
      # run that script and create one
      if(!is.null(iiscript<-find_path(ii,c(scriptdir,fallbackdir)))){
        # TODO: modify all files to write their cached results to a user
        # specified path if one is provided
        message(sprintf('Trying to initialize cache using script %s'
                        ,iiscript));
        .junk <- system(sprintf('R --no-restore -e ".workdir<-\'%s\'; source(\'%s\',chdir=T)"'
                                ,cachedir,iiscript),intern = T);
        # again try to find a valid path to it
        iicached <- find_path(paste0(ii,'.rdata')
                              ,c(cachedir,scriptdir,fallbackdir));
        } else{
          # if cannot find script, error
          stop(sprintf('The script %s was not found',ii));
        }};
    # if there is still no cached .rdata found, error
    if(is.null(iicached)){
      stop(sprintf('The cached file for %s could not be found',iiscript));
      # otherwise, the cached .rdata now exists one way or another, load it
    } else {
      loadedobj <- union(loadedobj,loadfn(iicached,envir=envir));
      message(sprintf('Loaded data for %s from %s',ii,iicached));
      };
  }
  return(loadedobj);
}


#' Search for all sample datasets in all currently installed packages.
#'
#' A goal of this function is to be able to quickly filter through currently
#' available datasets and find ones that meet your needs so you're not using
#' the same old `mtcars` and `iris` for everything.
#'
#' @return A `data.frame` with columns `Package`: name of the package that
#'         provides that dataset, `LibPath`: path where that package is
#'         currently installed, `Item`: the name of the dataset, `Title`: a
#'         brief description of the dataset, `Class`: the class of the object
#'         listed in `Item` (if multiple classes, they are delimited by
#'         semicolons),`IsDataFrame`: whether or not the object listed in
#'         `Item` inherits from `data.frame`,`NumberNonNumeric`: number of
#'         columns that are not `numeric` (`character`, `factor`, `POSIXct`,
#'         etc.),`Rows`: number of rows in the `Item` if applicable,`Cols`:
#'         number of columns in the dataset in the `Item` if applicable.
#' @export
#'
#' @examples \dontrun{ allTheData() }
allTheData <- function(verbose=T){
  # get all datasets provided by all loaded packages
  dt = as.data.frame(utils::data(package = .packages(all.available = TRUE))$results
                     ,stringsAsFactors=F);
  # df = data.frame?, nnn = number not numeric, nr/nc = nrows, ncols
  dt[,c('Class','IsDataFrame','NumberNonNumeric','Rows','Cols')] <- NA;
  for(ii in unique(dt$Package)){
    for(jj in subset(dt,Package==ii)$Item) {
      path <- paste0(ii,'::',jj);
      rows <- dt$Item==jj & dt$Package == ii;
      oo <- try(eval(parse(text=path)),silent=T);
      if(verbose) message(path);
      if(!is(oo,'try-error')){
        dt[rows,'Class'] <- paste0(class(oo),collapse=';');
        if(dt[rows,'IsDataFrame'] <- is(oo,'data.frame')){
          dt[rows,'NumberNotNumeric'] <- ncol(oo) -
            sum(sapply(oo,is.numeric))};
        dt[rows,c('Rows','Cols')]<-c(c(nrow(oo),NA)[1]
                                     ,c(ncol(oo),NA)[1]);}
    }};
  return(dt);
};
bokov/tidbits documentation built on Jan. 26, 2024, 6:25 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
bokov/tidbits
Various small utilities for simplifying data preparation and analysis

R/functions.R
In bokov/tidbits: Various small utilities for simplifying data preparation and analysis

Documented in allTheData autoread colinfo fullargs git_add git_autoconf git_checkout git_diff_filter git_ignore git_lsfiles git_move git_newbranch git_other git_push git_rename git_ssh git_status git_subupd grepor instrequire submulti systemRootDir t_autoread tblinfo v vec2m with_attrs

R Package Documentation

Browse R Packages

We want your feedback!

bokov/tidbits Various small utilities for simplifying data preparation and analysis

R/functions.R In bokov/tidbits: Various small utilities for simplifying data preparation and analysis

Documented in allTheData autoread colinfo fullargs git_add git_autoconf git_checkout git_diff_filter git_ignore git_lsfiles git_move git_newbranch git_other git_push git_rename git_ssh git_status git_subupd grepor instrequire submulti systemRootDir t_autoread tblinfo v vec2m with_attrs

R Package Documentation

Browse R Packages

We want your feedback!

bokov/tidbits
Various small utilities for simplifying data preparation and analysis

R/functions.R
In bokov/tidbits: Various small utilities for simplifying data preparation and analysis