R/merge_utils.R

Defines functions complete.cases2 multimerge uniqueNotNA .contents contents is.character.contents is.numeric.contents is.logical.contents .unfactor unfactor recodeReverse recodeMatches recodeAs recodeDF recodeDFas colwise2 matchStrings matchByDistance checkVar checkDF doDFchecks dupsBetweenGroups dupsBetweenDFs numUnique lsapply

Documented in checkDF checkVar colwise2 complete.cases2 contents doDFchecks dupsBetweenDFs dupsBetweenGroups is.character.contents is.logical.contents is.numeric.contents lsapply matchByDistance matchStrings multimerge numUnique recodeAs recodeDF recodeDFas recodeMatches recodeReverse unfactor uniqueNotNA

.packageName <- "mergeutils"

##' @title Find complete cases/rows of data.frame with respect to a set of variables.
##' @description Find complete cases/rows of data.frame with respect to a set of variables.
##' @details The row numbers of 'df' for which there are no missing values in any of the columns indicated by '...' are
##' returned.
##' The arguments passed in '...' may be character strings, character vectors or formula objects. For formula objects
##' all variables on both left and right hand side of the formula are used. For character strings and vectors the variables
##' whose names are given by those strings are used.
##' Variable transformations may also be used in the arguments, however it doesn't seem to work for the 'diff'
##' transformation (though 'lag' works), and I can't see how to easily fix this.
##' @param ... A comma seperated list of arguments indicating which variables to consider. The arguments may be
##' character strings, character vectors or formula objects. 
##' @param df A data.frame containing the data to be checked for complete cases.
##' @return A logical vector indicating which rows of df have no missing values for the variables passed in '...'
##' Note: currently this doesn't work for formulas which specify instruments (i.e. containing a | symbol)
##' @author Ben Veal
##' @keywords utilities misc
##' @export
complete.cases2 <- function(...,df)
{
  vars <- ""
  for(arg in list(...))
    {
      if(class(arg)[1]=="formula")
        vars <- paste(vars,str_c(get.vars(arg),collapse=","),sep=",")
      else if(class(arg)[1]=="character")
        {
          vars <- paste(vars,str_c(arg,collapse=","),sep=",")
        }
      else stop("Function args should be strings or formulas and a data.frame for the df arg.")
    }
  vars <- substring(vars,2,nchar(vars))
  eval(parse(text=paste0("evalq(complete.cases(",vars,"),envir=as.data.frame(df))")))
}

## TODO - check it works, I think it still needs work
##' @title Merge multiple dataframe by common columns.
##' @description Merge multiple dataframe by common columns.
##' @details Given a list of dataframes, and a list of vectors each of which contains the same number column names/numbers,
##' merge the dataframes by matching on the corresponding columns.
##' The 'by' argument specifies, for each dataframe, which columns to use for matching - it should be a list of numeric or
##' character vectors. If there are not enough vectors in 'by' to cover all the dataframes in 'data' then the final vector
##' of 'by' will be recycled to cover missing specifications.
##'
##' The optional 'all' argument is a list which can be used to specify for each dataframe which rows to keep.
##' If the i'th element of 'all' is NULL (default) then all rows of the i'th dataframe will be included in the results.
##' Otherwise the i'th element of 'all' can be a vector of integers. In this case a row of the i'th dataframe will only
##' be included if it is matched in all of the dataframes corresponding to positive integers, or at least one of the
##' dataframes corresponding to negative integers in the vector. 
##' For example if all[[3]]==c(1,2,-4,-5) then a row of the 3rd dataframe will only be included in the results if it is
##' matched in both the 1st & 2nd dataframes or in the 4th or 5th dataframes.
##'
##' To avoid column name clashes, the corresponding dataframe indices will be appended to the names of any columns in the
##' results that would otherwise clash. Alternatively suffixes can be specified in the 'suffixes' argument, which should
##' be a list of strings.
##' @param data a list of dataframes to merge.
##' @param by a list of vectors each of which contains column names/numbers to be used for matching rows of the corresponding
##' dataframe in 'data'. Each of these vectors should be the same length. Final vector will be recycled to fill in missing
##' vectors. If this argument is left as NULL then all common column names will be used for matching.
##' @param all (optional) list of vectors/NULL values, each indicating which rows of the corresponding dataframe in 'data'
##' to keep. See details below.
##' @param suffixes (optional) list or vector of suffixes (one for each dataframe) to be appended to clashing column names
##' in order to indicate the original dataframe. 
##' @return A single dataframe containing the merged data.
##' @keywords manip
##' @author Ben Veal
##' @export
multimerge <- function(data,by=NULL,all=NULL,suffixes=NULL) {
    len <- length(data)
    ## make sure there are names for all elements of by dataframes, and convert numeric variable indices to variable names
    if(!is.null(by)) {
        for(i in 1:len) {
            if(i > length(by)) by[[i]] <- by[[i-1]]
            else if(is.numeric(by[[i]]))
                by[[i]] <- names(data[[i]])[by[[i]]]
        }
    }
    ## subset dataframes to appropriate rows according to values in "all" argument
    data2 <- list()
    for(i in 1:len) {
        if(length(all) < i || is.null(all[[i]]) || all[[i]] == c(i))
            keep <- rep(TRUE,nrow(data[[i]]))
        else {
            keep <- rep(FALSE,nrow(data[[i]]))
            posids <- all[[i]][which(all[[i]] > 0)]
            negids <- -(all[[i]][which(all[[i]] < 0)])
            if(length(negids)>0)
                keep <- keep | dupsBetweenDFs(data[c(i,negids)],by=by[c(i,negids)],matchall=FALSE)[[1]]
            if(length(posids)>0)
                keep <- keep | dupsBetweenDFs(data[c(i,posids)],by=by[c(i,posids)],matchall=TRUE)[[1]]
        }
        data2[[i]] <- data[[i]][keep,]
    }
    ## append suffixes to conflicting column names (can't rely on 'merge' function to do this properly)
    if(!is.null(by)) {
        cols <- mapply(function(a,b){setdiff(names(a),b)},data,by,SIMPLIFY=FALSE)
        for(i in 1:len) {
            dups <- intersect(cols[[i]],unlist(cols[-i]))
            indices <- names(data2[[i]]) %in% dups
            if(length(suffixes) >= i && !is.na(suffixes[[i]]))
                names(data2[[i]])[indices] <- paste0(names(data2[[i]])[indices],suffixes[[i]])
            else names(data2[[i]])[indices] <- paste0(names(data2[[i]])[indices],".",as.character(i))
        }
    }
    ## finally, merge the data
    accum <- data2[[1]]
    for(i in 2:len) {
        if(is.null(by[[1]]))
            accum <- merge(accum,data2[[i]],all=TRUE)
        else
            accum <- merge(accum,data2[[i]],by.x=by[[1]],by.y=by[[i]],all=TRUE)
    }
    return(accum)
}

##' @title Like 'unique' but omits NA's or NaN's, suppresses warnings, and returns result as character vector.
##' @description Like 'unique' but omits NA's or NaN's, suppresses warnings, and returns result as character vector.
##' @param x A vector.
##' @param warn Whether to issue warnings from 'unique' (default is FALSE).
##' @return The unique values of 'x' with NA & NaN values removed.
##' @author Ben Veal
##' @keywords utilities misc
##' @export 
uniqueNotNA <- function(x,warn=FALSE)
{
    if(warn)
        uvals <- unique(x)
    else
        uvals <- suppressWarnings(unique(x))
    
    as.character(uvals[!is.na(uvals)])
}

##' @title Internal function used by 'contents'
##' @description Internal function used by 'contents'
##' @param x 
##' @return A string
##' @author Ben Veal
.contents <- function(x)
{
    uvals <- uniqueNotNA(x)
    if(suppressWarnings(!anyNA(as.numeric(uvals))))
        "numeric"
    else if(setequal(uvals,c("FALSE","TRUE")))
        "logical"
    else
        "character"
}

##' @title Find contents of variables/factors - either "numeric", "logical" or "character"
##' @description Find contents of variables/factors - either "numeric", "logical" or "character"
##' @details If 'x' is a vector or matrix then a single string is returned, if 'x' is a list or
##' dataframe then a character vector is returned with each entry indicating the contents of the
##' corresponding element/column.
##' @param x 
##' @return A single string or a character vector
##' @seealso \code{\link{is.character.contents}}, \code{\link{is.numeric.contents}}, \code{\link{is.logical.contents}}
##' @author Ben Veal
##' @keywords utilities misc
##' @export
contents <- function(x)
{
    if(class(x)=="data.frame")
        apply(x,2,.contents)
    else if(class(x)=="list")
        sapply(x,.contents,simplify=TRUE)
    else if(class(x) %in% c("factor","numeric","logical","integer","double","character"))
        .contents(x)
    else
        "unknown"
}

##' @title Test if contents of vector or factor are character/string values.
##' @description Test if contents of vector or factor are character/string values.
##' @details Unlike \code{\link{is.character}} this works with factors.
##' @param x 
##' @return TRUE or FALSE
##' @seealso \code{\link{contents}}, \code{\link{is.numeric.contents}}, \code{\link{is.logical.contents}}
##' @author Ben Veal
##' @keywords utilities misc
##' @export 
is.character.contents <- function(x)
{
    contents(x)=="character"
}

##' @title Test if contents of vector or factor are numeric values.
##' @description Test if contents of vector or factor are numeric values.
##' @details Unlike \code{\link{is.character}} this works with factors, and will also return TRUE if the
##' contents are strings containing only numbers.
##' @param x 
##' @return TRUE or FALSE
##' @seealso \code{\link{contents}}, \code{\link{is.character.contents}}, \code{\link{is.logical.contents}}
##' @author Ben Veal
##' @keywords utilities misc
##' @export 
is.numeric.contents <- function(x)
{
    contents(x)=="numeric"
}

##' @title Test if contents of vector or factor are logical (TRUE/FALSE) values.
##' @description Test if contents of vector or factor are logical (TRUE/FALSE) values.
##' @details Unlike \code{\link{is.character}} this works with factors.
##' @param x 
##' @return TRUE or FALSE
##' @seealso \code{\link{contents}}, \code{\link{is.character.contents}}, \code{\link{is.numeric.contents}},
##' @author Ben Veal
##' @keywords utilities misc
##' @export 
is.logical.contents <- function(x)
{
    contents(x)=="logical"
}

##' @title Internal function used by 'unfactor' 
##' @description Internal function used by 'unfactor' 
##' @param x 
##' @return A vector
##' @author Ben Veal
.unfactor <- function(x)
{
    if(!class(x)=="factor")
        x
    else
        {
            c <- contents(x)
            switch(contents(x),
                   "character"=as.character(x),
                   "logical"=as.logical(x),
                   "numeric"=as.numeric(as.character(x)),
                   x)
        }
}

##' @title Convert factors into numeric/character/logical vectors.
##' @description Convert factors into numeric/character/logical vectors.
##' @details This function converts factor vectors into a numeric, character or logical vectors
##' depending on the contents. If 'x' is a list or dataframe then any constituent factors will
##' be converted.
##' @param x A factor vector, list or dataframe.
##' @return A vector, list or dataframe containing no factors.
##' @author Ben Veal
##' @keywords manip
##' @export 
unfactor <- function(x)
{
    if(class(x)=="data.frame")
        as.data.frame(lapply(x,.unfactor),stringsAsFactors=FALSE)
    else if(class(x)=="list")
        lapply(x,.unfactor)
    else if(class(x)=="factor")
        .unfactor(x)
    else
        x
}

##' @title Recode an ordered variable by reversing the codes.
##' @description Recode an ordered variable by reversing the codes.
##' @details Given a variable of integer codes (e.g. measured on a Likert scale), or an ordered factor, this function
##' will recode the variable with the codes reversed, and return the result.
##' @param var The variable containing the codes to be reversed.
##' @return The recoded variable with codes reversed.
##' @seealso \code{\link{recodeAs}}, \code{\link{recodeMatches}}, \code{\link{recodeVar}},
##' \code{\link{colwise2}} for recoding multiple dataframe columns simultaneously (in library(plyr)).
##' @author Ben Veal
##' @keywords manip
##' @export 
recodeReverse <- function(var)
{
  stopifnot(is.ordered(var)||is.numeric(var))
  vals <- sort(unique(var))
  revvals <- sort(unique(var),decreasing=TRUE)
  var2 <- var
  for(i in 1:length(vals))
      var2[var==vals[i]] <- revvals[i]
  return(var2)
}

##' @title Recode values of variable/factor matching patterns
##' @description Recode values of variable/factor matching patterns
##' @details This function is like 'recodeVar' except that it uses regular expressions to match
##' the source variables (using 'grepl'). For each unique value of 'x' the regexps in 'patterns' are
##' tried in turn until a match is found, then the corresponding element of 'targets' is used to recode
##' the value. If length(patterns) > length(targets) then the final target will be used for all
##' excess patterns. The 'default' and 'keep.na' parameters are the same as for 'recodeVar'.
##' To do string matching instead of regular expression matching use fixed=TRUE
##' @param x The character vector/factor to recode.
##' @param patterns List of regular expressions for matching.
##' @param targets Values to recode corresponding matches in 'patterns' into.
##' @param default Default target value for those values of x that don't match any pattern.
##' When default=NULL, non-matching values of x will be kept in the output.
##' @param keep.na If TRUE then NA's in x will be retained in the output.
##' @param ignore.case If TRUE (default) then matching is not case sensitive. 
##' @param ignore.punc If TRUE (default) then punctuation will be ignored when matching.
##' @param ... optional arguments to grepl (e.g. use fixed=TRUE for string instead of regexp matching)
##' @return A vector
##' @seealso \code{\link{recode}} for recoding numbers (in library(car)), \code{\link{recodeVar}}, \code{\link{recodeAs}},
##' \code{\link{colwise2}} for recoding multiple dataframe columns simultaneously (in library(plyr)).
##' @author Ben Veal
##' @keywords manip
##' @export
recodeMatches <- function(x,patterns,targets,default=NULL,keep.na=TRUE,ignore.case=TRUE,ignore.punc=TRUE,...)
{
    stopifnot(class(x) %in% c("character","factor"))
    src <- character()
    tgt <- character()
    dots <- list(...)
    if("fixed" %in% names(dots) && dots$fixed && ignore.punc)
        ignore.punc <- FALSE
    for(val in uniqueNotNA(x))
        {
            val2 <- ifelse(ignore.punc,gsub("[[:punct:]]","",val),val)
            matches <- sapply(patterns,function(pat){grepl(pat,val2,ignore.case=ignore.case,...)})
            if(any(matches))
                {
                    src <- append(src,val)
                    indx <- min(which(matches))
                    tgt <- append(tgt,ifelse(indx <= length(targets),targets[indx],last(targets)))
                }
        }
    if(length(src)>0)
        doBy::recodeVar(x,src,tgt,default=default,keep.na=keep.na)
    else
        x
}

##' @title Recode unique values of one variable to match unique values of another variable.
##' @description Recode unique values of one variable to match unique values of another variable.
##' @details This function is a wrapper around \code{\link{recodeVar}}. It can be used when you need to
##' recode a character variable/factor so that the values correspond with those of another variable
##' (e.g. when merging datasets with slightly different value labels).
##' The \code{\link{matchstrings}} function is used to guess the best mapping between the values of 'A' and the values of 'B'.
##' Use this function with care.
##' @param A The character vector/factor to recode.
##' @param B The character vector/factor whose unique values are to be copied.
##' @return The recoded version of 'A'
##' @seealso \code{\link{recode}} for recoding numbers (in library(car)), \code{\link{recodeVar}}, \code{\link{recodeMatch}},
##' \code{\link{colwise2}} for recoding multiple dataframe columns simultaneously (in library(plyr)).
##' @author Ben Veal
##' @keywords manip
##' @export
recodeAs <- function(A,B)
{
    stopifnot(class(A)=="factor"||class(A)=="character")
    stopifnot(class(B)=="factor"||class(B)=="character")
    Avals <- uniqueNotNA(A)
    Bvals <- uniqueNotNA(B)
    Bvals <- Bvals[matchStrings(Avals,Bvals)]
    for(i in 1:length(Avals))
        print(paste0("Recoding '",as.character(Avals[i]),"' to '",as.character(Bvals[i]),"'"))
    recodeVar(A,Avals,Bvals)
}

##' @title Recode collection of variables so that they all have the same unique values.
##' @description Recode collection of variables so that they all have the same unique values.
##' @details This is a wrapper around \code{\link{recodeMatches}} & \code{\link{recode}} (in library(car))
##' for recoding all strings and numbers variables in a dataframe.
##' @param df A dataframe to recode
##' @param sPatterns list of regexps matching strings to be recoded
##' @param sTargets list of targets corresponding to patterns in 'sPatterns'
##' @param nRecodes specification for recoding numbers (see \code{\link{recode}})
##' @return a dataframe (recoded)
##' @author Ben Veal
##' @keywords manip
##' @export
recodeDF <- function(df,sPatterns,sTargets=NA,nRecodes)
{
    fn1 <- colwise2(recodeMatches,is.character.contents,patterns=sPatterns,targets=sTargets)
    fn2 <- colwise2(car::recode,is.numeric.contents,recodes=nRecodes)
    fn2(fn1(df))
}

##' @title Recode variables in dataframe to match codes in another dataframe
##' @description Recode variables in dataframe to match codes in another dataframe
##' @details This is a wrapper around \code{\link{recodeAs}}.
##' Each variable of 'df1' listed in 'cols1' will be recoded to the corresponding variable of 'df2'
##' listed in 'cols2'. If length(cols2) < length(cols1) then the final variable in 'cols2' will be
##' used for all excess variables in 'cols1'. If 'cols2' is not supplied then it will take the same
##' value as 'cols1'.
##' @param df1 dataframe to be recoded
##' @param df2 dataframe to copy codes from
##' @param cols1 numeric/character vector indicating variables of first dataframe to be recoded 
##' @param cols2 numeric/character vector indicating variables of second dataframe to copy codes from
##' @return a dataframe (recoded version of 'df1')
##' @author Ben Veal
##' @keywords manip
##' @export 
recodeDFas <- function(df1,df2,cols1,cols2=cols1)
{
    x <- df1
    for(i in 1:length(cols1))
        {
            col1 <- cols1[i]
            col2 <- ifelse(i<=length(cols2),cols2[i],last(cols2))
            x[,col1] <- recodeAs(df1[,col1],df2[,col2])
        }
    return(x)
}

##' @title Like \code{\link{colwise}}, but the returned function will return unaffected columns alongside affected ones
##' @description Like \code{\link{colwise}}, but the returned function will return unaffected columns alongside affected ones
##' if possible.
##' @details This function works like \code{\link{colwise}} except that the function it returns also returns the columns
##' not specified by the '.cols' argument (as long as they are compatible). If the unaffected columns have a different
##' number of rows than the affected ones, then only the affected rows will be returned.
##' The columns will be returned in the same order as the original dataframe argument to the function.
##' @param .fun a function to apply to each column
##' @param .cols either a function that tests columns for inclusion, or a quoted object giving which columns to process
##' @param ... other arguments passed on to ‘.fun’
##' @return A function which accepts a dataframe as argument and applies '.fun' to each of the columns specified in '.cols'.
##' Further arguments to the function will be passed on to '.fun'.
##' @author Ben Veal
##' @keywords manip
##' @export
colwise2 <- function(.fun,.cols=true,...)
{
    namesfn <- plyr::colwise(names,.cols)
    applyfn <- plyr::colwise(.fun,.cols,...)
    function(df,...) {
            colnames <- names(namesfn(df))
            x <- applyfn(df,...)
            if(dim(x)[1]==dim(df)[1]) {
                    df[,colnames] <- x
                    return(df)
                }
            else {
                    print("New columns incompatible with old ones")
                    return(x)
                }
        }
}

##' @title Match strings in A with strings in B
##' @description Match strings in A with strings in B
##' @details This function tries to match the strings in 'A' to the closest ones in 'B'.
##' By default it tries to ensure that every element of 'B' is matched by at least one element of 'A'.
##' So if 'A' and 'B' have the same length it calculates an exact pairwise matching between 'A' & 'B'.
##' If length(A) < length(B) then some elements of 'B' will not be matched.
##' If length(A) > length(B) then some elements of 'B' will be matched more than once.
##' Alternatively, if the 'onto' parameter is set to FALSE then each element of 'A' will be matched with the
##' most similar element of 'B' regardless of whether or not that element is matched by another element of 'A'.
##' @param A A character vector
##' @param B A character vector
##' @param onto If TRUE (default) then ensure all strings in B are matched by at least one string in A if possible.
##' @return A vector whose i'th entry indicates the element of B that matches the i'th element of A
##' @author Ben Veal
##' @keywords utilities misc
##' @export
matchStrings <- function(A,B,onto=TRUE)
{
    distMat <- stringdist::stringdistmatrix(A,B,method="jw")
    matchByDistance(distMat)
}

##' @title Match elements of one set with closest elements of another set, according to their mutual distances.
##' @description Match elements of one set with closest elements of another set, according to their mutual distances.
##' @details This function tries to match elements of one set with the closest elements of another set according
##' to the distances between these elements as supplied by 'distMat'.
##' By default it tries to ensure that every element of the second set (corresponding to the columns of 'distMat')
##' is matched with at least one element of the first set (corresponding to the rows of 'distMat').
##' So if 'distMat' is a square matrix it calculates an exact pairwise matching between row elements & column elements.
##' If No. rows of 'distMat' < No. of columns of 'distMat' then some columns will not be matched. 
##' If No. rows of 'distMat' > No. of columns of 'distMat' then some columns will be matched more than once.
##' Alternatively, if the 'onto' parameter is set to FALSE then each row will be matched with the closest column
##' regardless of whether or not that column is matched by another row.
##' @param distMat A distance matrix - the (i,j) element indicates the distance between the i'th element of the
##' first set of things and the j'th element of the second set of things.
##' @param onto If TRUE (default) then ensure all columns are matched by at least one row if possible.
##' @return A vector whose i'th entry indicates the element in the 2nd set matching the i'th element in the 1st set.
##' @author Ben Veal
##' @keywords utilities misc
##' @export 
matchByDistance <- function(distMat,onto=TRUE)
{
    if(!onto) return(apply(distMat,1,which.min))
    ## For each row/column rank the distances and return matrix of (row rank + column rank).
    rankMat <- t(apply(distMat,1,rank))+apply(distMat,2,rank)
    ## Match each row with the column which has smallest (row rank + column rank).
    row2col <- apply(rankMat,1,which.min)
    ## Find which columns have been matched & which haven't.
    usedcols <- unique(row2col)
    unusedcols <- setdiff(1:dim(distMat)[2],usedcols)
    ## Match any columns which haven't been matched if possible
    if(length(unusedcols) > 0)
        {
            ## Find duplicate matches
            dups <- lapply(usedcols,function(x){which(row2col==x)})
            ## For any duplicate matches, keep the row which is closest to the column,
            ## and collect the remaining rows.
            remaining <- numeric()
            for(i in 1:length(dups))
                if(length(dups[[i]]) > 1)
                    {
                        closest <- which.min(distMat[dups[[i]],usedcols[i]])
                        remaining <- union(dups[[i]][-closest],remaining)
                    }
            ## Match the remaining rows with the unmatched columns
            if(length(remaining) > 0)
                {
                    if(length(remaining)==1)
                        row2col[remaining] <- unusedcols[which.min(distMat[remaining,unusedcols])]
                    else if(length(unusedcols)==1)
                        row2col[remaining] <- unusedcols
                    else
                        row2col[remaining] <- unusedcols[matchByDistance(distMat[remaining,unusedcols])]
                }
        }
    return(row2col)
}

##' @title Perform sanity checks on a single variable.
##' @description Perform sanity checks on a single variable.
##' @description This function can be used after performing some data munging to check for mistakes.
##' @details 
##' You can check the data type, class, mode, length, max, min, unique, or missing values, and checksum.
##' You can also supply your own function to check the variable.
##' For the 'min_uniq', 'max_uniq' and 'max_na' variables, you can supply either a whole number indicating
##' the number of cases, or a number between 0 & 1 representing a proportion of cases.
##' 
##' Note: if you need to repeatedly call the function on the same dataframe you can curry the data argument using
##' the CurryL function in the functional library, e.g: checkalldata <- CurryL(checkVar,data=alldata)
##' (it wont work with the non-lazy Curry function). To apply the function to all variables in a dataframe use the
##' \code{\link{apply}} or \code{\link{checkDF}} functions.
##' @param var the variable to check (or it's name as a string if the data arg is supplied)
##' @param data an optional dataframe containing the variable (otherwise 'var' is taken from the calling environment)
##' @param vartype (optional) type of the variable (compared with typeof(var))
##' @param varclass (optional) class of the variable (compared with class(var), matches if arg is in class(var))
##' @param varmode (optional) mode of the variable (compared with mode(var))
##' @param min_len (optional) minimum length of the variable (compared with length(var))
##' @param max_len (optional) maximum length of the variable (compared with length(var))
##' @param min (optional), minimum allowed value (compared with min(var))
##' @param max (optional), maximum allowed value (compared with max(var))
##' @param vals (optional), list of all unique non missing values (compared with uniqueNotNA(var))
##' @param valstype (optional) used in conjunction with 'vals'. If "all" (default) then 'vals' should contain all the same
##' items as uniqueNotNA(var), if "subset"/"superset" then 'vals' should be a subset/superset of uniqueNotNA(var)
##' @param charmatch (optional) regexp that should match each value (apart from missing values). Use only with character variables.
##' @param nocharmatch (optional) regexp that should not match any value (apart from missing values). Use only with character variables.
##' @param min_uniq (optional) minimum number of unique values (compare with length(unique(var)))
##' @param max_uniq (optional) maximum number of unique values (compare with length(unique(var)))
##' @param max_na (optional) maximum number of missing values (compare with sum(is.na(var)))
##' @param checksum (optional) a checksum of the variable as returned by digest(VAR,algo="crc32").
##' @param pred (optional) A function which takes a variable as input and returns TRUE/FALSE depending on whether the
##' variable is valid or not.
##' @param silent (optional) if TRUE then don't omit warning messages informing of error type (FALSE by default)
##' @param showbadvals (optional) if a positive integer N then print the first N non-matching values (only for tests on individual values. Default: N = 100).
##' @param stoponfail (optional) if TRUE then throw an error on the first check that fails (FALSE by default)
##' @return A list whose first element is TRUE if all checks passed, FALSE otherwise, and whose subsequent elements are vectors of indices of non-matching values for tests on individual values.
##' @seealso \code{\link{checkDF}}, \code{\link{CurryL}}, \code{\link{apply}}
##' @examples ## create a function for checking variables in "ChickWeight" dataframe
##' checkalldata <- functional::CurryL(checkVar,data=ChickWeight)
##' ## check one variable
##' checkalldata("weight",vartype="double")
##' @author Ben Veal
##' @keywords utilities misc
##' @export
checkVar <- function(var,data,vartype,varclass,varmode,min_len,max_len,min,max,vals,valstype="all",charmatch,nocharmatch,
                     min_uniq,max_uniq,max_na,checksum,pred,showbadvals=100,silent=FALSE,stoponfail=FALSE)
{
    if(is.expression(var))
        subvar <- substitute(var)
    else if(is.character(var) & length(var)==1)
        subvar <- var
    else
        subvar <- "unknown"
    if(is.symbol(subvar))
        varname <- deparse(subvar)
    else 
        varname <- subvar
    if(!missing(data))
        var <- data[[varname]]
    isnumeric <- mode(var) == "numeric"
    len <- length(var)
    stopifnot(len > 0)
    ok <- TRUE
    badidxs <- list()
    ## Useful functions and macros to save some typing
    is.positiveint <- function(x) (abs(x - round(x)) < .Machine$double.eps^0.5 & x > 0)
    min2 <- function(x,y) {if (x<y) return(x) else return(y)} # for some reason "min" doesnt work inside defmacro
    ## The following macro is used for reporting results of each test.
    ## str is a message, and idxs is the vector of indices where the test fails
    report <- gtools::defmacro(str,idxs=NULL,
                               expr={msg <- paste(str,"for",varname);
                                   if(!silent) {
                                       cat(msg)
                                       if(is.positiveint(showbadvals) & !is.null(idxs)) {
                                           cat(":\n")
                                           cat(var[idxs[1:min2(showbadvals,length(idxs))]],sep=" ")
                                           if(showbadvals<length(idxs)) cat(" ... ")
                                           cat("\n\nAt indices:\n")
                                           cat(idxs[1:min2(showbadvals,length(idxs))],sep=",")
                                           if(showbadvals<length(idxs)) cat(" ...")
                                           cat("\n")
                                       }
                                       cat("\n")
                                   }
                                   if(stoponfail) {
                                       stop("stopping")
                                   }
                                   ok <- FALSE})
    mintest <- gtools::defmacro(val,tot,min,str,expr={if(val < min | (min <= 1 & val/tot < min)) report(str)})
    maxtest <- gtools::defmacro(val,tot,max,str,expr={if(val/tot > max | (max >= 1 & val > max)) report(str)})
    ## perform the checks
    if(!missing(vartype))
        if(!(vartype==typeof(var)))
            report(paste0("Expected '",vartype,"' type but got '",typeof(var),"' type"))
    if(!isnumeric & (!missing(min) | !missing(max)))
        report(paste0("Expected numeric type, but got '",mode(var),"' type"))
    if(!missing(varclass))
        if(!(varclass %in% class(var)))
            report(paste0("Expected '",varclass,"' class but got '",class(var),"' class"))
    if(!missing(varmode))
        if(!(varmode==mode(var)))
            report(paste0("Expected '",varmode,"' mode but got '",mode(var),"' mode"))
    if(!missing(min_len))
        mintest(len,1,min_len,paste("Length is <",min_len))
    if(!missing(max_len))
        maxtest(len,1,max_len,paste("Length is >",max_len))
    if(isnumeric & !missing(min)) {
        idxs <- which(var < min)
        if(length(idxs) > 0) {
            badidxs <- c(badidxs,list(min=idxs))
            report(paste("Found values <",as.character(min)),idxs)
        }
    }
    if(isnumeric & !missing(max)) {
        idxs <- which(var > max)
        if(length(idxs) > 0) {
            badidxs <- c(badidxs,list(max=idxs))
            report(paste("Found values >",as.character(max)),idxs)
        }
    }
    if(!missing(vals)|!missing(min_uniq)|!missing(max_uniq)) {
        uvals <- uniqueNotNA(var)
        ulen <- length(uvals)
        if(!missing(vals)) {
            vals <- uniqueNotNA(vals)
            if((valstype=="all" & !setequal(vals,uvals))) {
                idxs <- which(!(uvals %in% vals))
                diffvals <- setdiff(vals,uvals)
                badidxs <- c(badidxs,list(vals=idxs))
                report("Invalid values",idxs)
                report("Missing values",diffvals)
            } else if((valstype=="subset" & !(all(vals %in% uvals)))) {
                diffvals <- setdiff(vals,uvals)
                report("Missing values",diffvals)
            } else if((valstype=="superset" & !(all(uvals %in% vals)))) {
                idxs <- which(!(uvals %in% vals))
                badidxs <- c(badidxs,list(vals=idxs))
                report("Invalid values",idxs)
            }
        }
        if(!missing(min_uniq)) {
            if(ulen < min_uniq) {
                report("Not enough unique values")
            }
        }
        if(!missing(max_uniq)) {
            if(ulen > max_uniq) {
                report("Too many unique values")
            }
        }
    }
    if(!missing(charmatch)) {
        if(class(var)!="character")
            report(paste0("Expected 'character' mode for use with charmatch arg but got '",mode(var),"' mode"))
        idxs <- which((!grepl(charmatch,var)) & (!(is.na(var))))
        if(length(idxs) > 0) {
            badidxs <- c(badidxs,list(charmatch=idxs))
            report("Invalid values",idxs)

        }
    }
    if(!missing(nocharmatch)) {
        if(class(var)!="character")
            report(paste0("Expected 'character' mode for use with charmatch arg but got '",mode(var),"' mode"))
        idxs <- which(grepl(nocharmatch,var) & (!(is.na(var))))
        if(length(idxs) > 0) {
            badidxs <- c(badidxs,list(nocharmatch=idxs))
            report("Invalid values",idxs)
        }
    }
    if(!missing(max_na))
        maxtest(sum(is.na(var)),len,max_na,"Too many missing values")
    if(!missing(checksum))
        if(digest(var,algo="crc32")!=checksum) report("Invalid checksum")
    if(!missing(pred)) {
        if(!pred(var))
            report(paste(deparse(substitute(pred)),"returns false"))
    }
    if(ok & !silent) print(paste("All checks passed for",varname,"variable"))
    return(c(ok,badidxs))
}

##' @title Perform sanity checks on a dataframe. 
##' @description Perform sanity checks on a dataframe. 
##' @description This function can be used after performing some data munging to check for mistakes.
##' @details
##' You can restrict the checks to a subset of the dataframe by supplying a logical expression in the 'subset'
##' argument. This expression will be evaluated in the context of the supplied dataframe (the 'data' argument),
##' so you don't need to qualify the variable names. If all arguments apart from 'data', 'subset', 'silent' and 'stoponfail'
##' are unset/NULL then the function will check if all rows satisfy the subset logical expression (unless this is unset).
##' The other arguments can used for checking the number of complete cases (i.e. rows with no missing values), unique cases,
##' missing values, checksum, and variable specific checks (see below).
##'
##' For arguments with names beginning with 'min_' or 'max_' you can supply either a whole number indicating
##' an amount of rows/columns, or a number between 0 & 1 indicating a proportion of rows/columns.
##' For 'min_rows' & 'max_rows' proportions are interpreted as proportions of the whole data (before subsetting),
##' whereas for other arguments proportions are interpreted as proportions of the subsetted data.
##'
##' To perform variable specific checks use the 'vars' argument to specify which variables to check. 'vars' can be
##' either a numeric vector of column numbers, or a character vector of regexps matching column names. The matching
##' columns will be individually checked by the \code{\link{checkVar}} function. To specify which checks to perform
##' supply a list of arguments for \code{\link{checkVar}} in the 'checks' argument. You do not need to include the data
##' or var arguments in this list. For example to check that all variables with names matching "country" or "name" have
##' type "character" and between 10 & 300 unique values you could do:
##' 
##' checkDF(data,vars=c("country","name"),checks=list(type="character",min_uniq=10,max_uniq=300))
##' 
##' To ensure that a regexp matches only a single variable put a ^ at the front and $ at the end (e.g. "^country$").
##' Note: the values of the 'silent' and 'stoponfail' args will be passed on the \code{\link{checkVar}} by default
##' but you can override these values by passing new values in the 'checks' arg.
##' 
##' By default a warning message will be issued when a check fails. This can be prevented by setting 'silent' to TRUE.
##' If the 'stoponfail' argument is set to TRUE then an error will be thrown on the first check that fails,
##' otherwise the return value of the function indicates whether all checks passed (TRUE) or not (FALSE).
##' @param data dataframe to be checked
##' @param subset (optional) logical expression indicating subset of 'data' to check (see \code{\link{subset}})
##' @param min_rows (optional) minimum number of rows (compare with dim(data[subset,])[1]). 
##' @param max_rows (optional) maximum number of rows (compare with dim(data[subset,])[1]).
##' @param min_cols (optional) minimum number of columns (compare with dim(data[subset,])[2]). 
##' @param max_cols (optional) maximum number of columns (compare with dim(data[subset,])[2]).
##' @param min_cc (optional) minimum number of complete cases (compare with sum(complete.cases(data[subset,])))
##' @param max_cc (optional) maximum number of complete cases (compare with sum(complete.cases(data[subset,])))
##' @param min_uniq (optional) minimum number of unique cases (compare with dim(unique(data[subset,]))[1]). Default value is 1.
##' @param max_uniq (optional) maximum number of unique cases (compare with dim(unique(data[subset,]))[1])
##' @param min_na_row (optional) minimum number of missing values in each row
##' @param max_na_row (optional) maximum number of missing values in each row
##' @param min_na_all (optional) minimum number of missing values overall
##' @param max_na_all (optional) maximum number of missing values overall
##' @param checksum (optional) a checksum of the variable as returned by digest(VAR,algo="crc32").
##' @param showbadrows (optional) if a positive integer N then print the first N non-matching rows (only for tests on rows. Default: N = 100).
##' @param silent (optional) if TRUE then don't omit warning messages informing of error type (FALSE by default)
##' @param stoponfail (optional) if TRUE then throw an error on the first check that fails (FALSE by default)
##' @param vars (optional) either a numeric or character vector, or a regexp matching names of variables to check
##' @param checks (optional) a list of a arguments to be passed to \code{\link{checkVar}}
##' @return A list whose first element is TRUE if all checks passed, FALSE otherwise, and whose subsequent elements are vectors of indices of non-matching rows for tests on rows.
##' @examples checkDF(ChickWeight,weight>Time)
##' checkDF(ChickWeight,min_uniq=10)
##' @seealso \code{\link{checkVar}}
##' @author Ben Veal
##' @keywords utilities misc
##' @export 
checkDF <- function(data,subset,min_rows,max_rows,min_cols,max_cols,min_cc,max_cc,min_uniq,max_uniq,
                    min_na_row,max_na_row,min_na_all,max_na_all,checksum,
                    showbadrows=100,silent=FALSE,stoponfail=FALSE,vars=NULL,checks=NULL)
{
    nrows1 <- dim(data)[1]
    stopifnot(nrows1>0)
    if(is.expression(data)) {
        framename <- deparse(substitute(data))
    } else {
        framename <- "unknown"
    }
    subsetstr <- deparse(substitute(subset))
    if(subsetstr!="") {
        data <- data[with(data,eval(parse(text=subsetstr))),]
        subsetmsg <- paste0("rows satisfying '",subsetstr,"'")
    }
    else subsetmsg <- "rows"
    nrows2 <- dim(data)[1]
    ncols2 <- dim(data)[2]
    ok <- TRUE
    badrows <- list()
    ## useful functions and macros to save some typing
    is.positiveint <- function(x) (abs(x - round(x)) < .Machine$double.eps^0.5 & x > 0)
    badvar <- function(x) {!as.logical(x[[1]])} # returns TRUE if x is a return value from checkVar that indicates the check failed
    report <- gtools::defmacro(str,expr={msg <- paste(str,"for",framename,"dataframe");
                                 if(stoponfail) stop(msg);
                                 if(!silent) print(msg);
                                 ok <- FALSE})
    mintest <- gtools::defmacro(val,tot,min,str,expr={if(val < min | (min <= 1 & val/tot < min))
                                                          report(paste("Not enough",str,": only got",val,"but needed at least",
                                                                       ifelse((min>1),min,min*tot)))})
    maxtest <- gtools::defmacro(val,tot,max,str,expr={if(val/tot > max | (max >= 1 & val > max))
                                                          report(paste("Too many",str,": got",val,"but needed at most",
                                                                       ifelse((max>=1),max,max*tot)))})
    ## do dataframe wide checks
    if(!missing(min_rows))
        mintest(nrows2,nrows1,min_rows,subsetmsg)
    # if only the 'subset' & 'data' args are supplied then just check that all rows satisfy the 'subset' expression
    else if(subsetstr!="" & missing(max_rows) & missing(min_cc) & missing(max_cc) & missing(min_uniq) & missing(max_uniq)
             & missing(max_na_row) & missing(max_na_all) & length(vars)==0)
        mintest(nrows2,nrows1,nrows1,subsetmsg)
    if(!missing(max_rows))
        maxtest(nrows2,nrows1,max_rows,subsetmsg)
    if(!missing(min_cols))
        mintest(ncols2,1,min_cols,"columns")
    if(!missing(max_cols))
        maxtest(ncols2,1,max_cols,"columns")
    if(!missing(min_cc))
        mintest(sum(complete.cases(data)),nrows2,min_cc,"complete cases")
    if(!missing(max_cc))
        maxtest(sum(complete.cases(data)),nrows2,max_cc,"complete cases")
    if(!missing(min_uniq))
        mintest(dim(unique(data))[1],nrows2,min_uniq,"unique cases")
    if(!missing(max_uniq))
        maxtest(dim(unique(data))[1],nrows2,max_uniq,"unique cases")
    if((!missing(max_na_row)) && is.numeric(max_na_row)) {
        if(max_na_row > 1)
            max1 <- max_na_row
        else
            max1 <- ncols2*max_na_row
        whichrows <- which(apply(data,1,function(x) {sum(is.na(x)) > max1}))
        if(length(whichrows) > 0) {
            badrows <- c(badrows,list(max_na_row=whichrows))
            report("Too many missing values in rows")
            if(is.positiveint(showbadrows)) {
                print(whichrows[1:min(showbadrows,length(whichrows))])
            } else {
                print(whichrows)
            }
        }
    }
    if((!missing(min_na_row)) && is.numeric(min_na_row)) {
        if(min_na_row > 1)
            min1 <- min_na_row
        else
            min1 <- ncols2*min_na_row
        whichrows <- which(apply(data,1,function(x) {sum(is.na(x)) < min1}))
        if(length(whichrows) > 0) {
            badrows <- c(badrows,list(min_na_row=whichrows))
            report("Too few missing values in rows")
            print(whichrows)
        }
    }
    if(!missing(max_na_all))
        maxtest(sum(is.na(data)),nrows2*ncols2,max_na_all,"missing values")
    if(!missing(min_na_all))
        mintest(sum(is.na(data)),nrows2*ncols2,min_na_all,"missing values")
    if(!missing(checksum))
        if(digest(data,algo="crc32")!=checksum) report("Invalid checksum")
    # do variable specific checks
    varnames <- names(data)
    if(length(vars) > 0 & length(checks) > 0) {
        if(!("silent" %in% names(checks)))
            checks <- c(checks,silent=silent)
        if(!("stoponfail" %in% names(checks)))
            checks <- c(checks,stoponfail=stoponfail)
        for(i in 1:length(vars)) {
            ## first get the appropriate column numbers 
            if(class(vars[i])=="character") cols <- grep(vars[i],varnames)
            else cols <- vars[i]
            ## now check each column
            for(j in cols) {
                retval <- with(data,do.call(checkVar,args=c(var=as.symbol(varnames[j]),checks)))
                if(badvar(retval)) ok <- FALSE
            }
        }
    }
    # finished all checks
    if(ok & !silent) print(paste("All checks passed for",framename,"dataframe"))
    return(c(ok,badrows))
}


##' @title Perform multiple sets of predefined tests on a dataframe
##' @description This function allows you to pass in lists of arguments defining tests on different variables
##' of a dataframe. Each list should contain "vars" and "checks" named arguments, along with other named arguments
##' to be passed to the \code{\link{checkDF}} function. You can also pass single named arguments for \code{\link{checkDF}}
##' which will be checked separately at the end.
##' @details A test specification takes the form of a list of named arguments to \code{\link{checkDF}}.
##' Each list containing "vars" and "checks" named arguments will invoke a separate call to the \code{\link{checkDF}} 
##' function, and the matching columns of df will be checked. Any other named arguments will be aggregated
##' together and used in a single call to \code{\link{checkDF}} after all the variable specific checks have finished.
##' If the same named argument (apart from "vars" and "checks") appears more than once (either in different lists, or separately)
##' it will only be used once at the end. This means you can create predefined test specifications and use them together,
##' without worrying about overlap.
##' This is useful when you need to check multiple dataframes which may share some data of the same type. In this case you can
##' predefine tests for the different types of variables, and perhaps also some general tests about the dimensions of the
##' dataframe, number of missing values etc. Then for each dataframe you can call this function with the appropriate test
##' specifications.
##' See the examples section for examples of how to define test specifications.
##' @param df A dataframe on which to perform the checks.
##' @param ... Lists of arguments, or individual arguments for \code{\link{checkDF}}
##' @return
##' @examples ## Test specifcation that checks that variables with names matching "postcode" should contain postcodes, and
##' ## have at most 30% missing values:
##' postcode.check <- list(vars="postcode",checks=list(vartype="character",
##' ## charmatch="[a-zA-Z]{1,2}[0-9][0-9A-Za-z]{0,1} ?[0-9]?[A-Za-z]{2}",max_na=0.3))
##' ## General test specification to check that the dataframe dimensions are reasonable,
##' ## and there arent too many missing values
##' sanity.check <- list(min_rows=100,max_rows=10000000,min_cols=2,max_cols=100,max_na_row=0.5)
##' ## Test specifications for longitude and latitude variables (bounds are for UK):
##' long.check <- list(vars="longitude",checks=list(vartype="double",max_na=0.1,min=-9,max=2))
##' lat.check <- list(vars="latitude",checks=list(vartype="double",max_na=0.1,min=49,max=100))
##' ## To check a dataframe using these specifications:
##' ## doDFchecks(df1,postcode.check,long.check,lat.check,sanity.check)
##' ## You can override some of the named arguments in sanity.check:
##' ## doDFchecks(df1,postcode.check,long.check,lat.check,sanity.check,min_rows=50000,max_rows=100000)
##' @author Ben Veal
##' @keywords utilities misc
##' @export 
doDFchecks <- function(df,...) {
    args <- list(...)
    checksnames <- paste(deparse(substitute(list(...))),collapse="")
    checksnames <- substr(checksnames,6,nchar(checksnames)-1)
    checksnames <- strsplit(checksnames,",")[[1]]
    otherargs <- list()
    dfname <- deparse(substitute(df))
    print(paste0(dfname,": "))
    ## these named arguments will always be checked together, but separately from other arguments which apply
    ## to whole dataframe
    varchknames <- c("vars","checks")
    ## loop over the arguments given to the function
    for(i in 1:length(args)) {
        arg <- args[[i]]
        name <- names(args)[i]
        ## get the name of the current check
        checkname <- checksnames[i]
        ## If the arg is a list, then extract elements with names in varchknames (if there are any) 
        ## and check now. Other elements will be added to otherargs and checked at the end.
        if(is.list(arg)) {
            if(all(varchknames %in% names(arg))) {
                ## inform user which check is being done
                checkname <- gsub("^\\s+|\\s+$","",checkname)
                print(paste0(checkname,"..."))
                do.call(checkDF,c(list(data=df),arg[varchknames]))
                }
            argnames <- Filter(function(x) !(x %in% varchknames), names(arg))
            ## loop over the named elements that are not in varchknames
            for(name in argnames) {
                ## if this element is already present in otherargs then overwrite its value
                ## otherwise add it to otherargs
                if(name %in% names(otherargs)) {
                    otherargs[name] <- arg[[name]]
                } else {
                    otherargs <- c(arg[name],otherargs)
                }
            }
            ## for non-list arguments, check their names, and either overwrite or add to otherargs depending if
            ## its already in there or not
        } else if(name %in% names(otherargs)) {
            otherargs[name] <- arg
        } else {
            otherargs <- c(arg,otherargs)
            names(otherargs)[1] <- name
        }
    }
    ## finally check all the otherargs (if any)
    if(length(otherargs) > 0) {
        do.call(checkDF,c(list(data=df),otherargs))
    }
}

##' Given a dataframe df and a grouping variable idcol, this function finds which rows are the same across
##' different groups, i.e. a row will be chosen if it is identical to another row on all variables apart from
##' the idcol variable which should be different. If `matchall' is TRUE then only rows which are the same
##' across all groups will be chosen, otherwise rows which are the same across at least 2 groups are reported.
##' 
##' This function uses code by Winston Chang pinched from here:
##'       http://www.cookbook-r.com/Manipulating_data/Comparing_data_frames/
##' @title Find rows of dataframe that are duplicated between groups (indicated by a grouping variable).
##' @description Find rows of dataframe that are duplicated between groups (indicated by a grouping variable).
##' @param df A dataframe
##' @param idcol The name/index of the grouping variable
##' @param matchall If TRUE then find rows that are duplicated across ALL groups, otherwise a row only need
##' be duplicated across 2 groups
##' @return A logical vector indicating which rows of df are duplicated between groups
##' @author Ben Veal
##' @keywords utilities misc
##' @export
dupsBetweenGroups <- function(df,idcol,matchall=FALSE) {
    ## If there is only 1 group then we can return now
    if(length(unique(df[,idcol]))==1) return(rep(FALSE,nrow(df)))
    ## Make sure idcol is the name of the grouping variable 
    if(is.numeric(idcol)) idcol <- names(df)[idcol]
    if(matchall) {
        idvals <- unique(df$idcol) 
        ## First find rows of first group which are duplicated across all groups
        ## these will be stored in group1dups at the end.
        grp1 <- idvals[1]
        ids <- df$idcol
        ## initialize group1dups to include all rows in group 1
        group1dups <- (ids==grp1)
        ## loop over i: compare group1dups with group i rows, and update group1dups
        for(i in 2:length(idvals)) {
            ## find duplicates between i'th group rows and group1dups rows
            rows <- (ids==idvals[i]) | group1dups
            dups <- .dupsBetweenSomeGroups(df[rows,],idcol)
            ## update group1dups to only include rows also duplicated in group i 
            group1dups[group1dups] <- dups[ids[rows]==grp1]
        }
        ## group1dups now shows group 1 rows that are duplicated across all groups.
        ## To find corresponding rows in other groups, we need to iterate over all groups again.
        alldups <- group1dups
        for(i in 2:length(idvals)) {
            ## find duplicates between i'th group rows and group1dups rows
            rows <- (ids==idvals[i]) | group1dups
            dups <- .dupsBetweenSomeGroups(df[rows,],idcol)
            ## update alldups to include rows common to group i and group1dups
            alldups[ids==idvals[i]] <- dups[ids[rows]==idvals[i]]
        }
        return(alldups)
    } else return(.dupsBetweenSomeGroups(df,idcol))
}

##' Function called by \code{\link{dupsBetweenGroups}} when the 'matchall' argument is FALSE
##'
##' This function was pinched from here: http://www.cookbook-r.com/Manipulating_data/Comparing_data_frames/
##' @title Internal function used by \code{\link{dupsBetweenGroups}} (which see)
##' @description Internal function used by \code{\link{dupsBetweenGroups}} (which see)
##' @param df A dataframe
##' @param idcol The name of the grouping variable
##' @return A logical vector indicating which rows of df are duplicated between groups
##' @author Winston Chang?
.dupsBetweenSomeGroups <- function (df, idcol) {
    ## Get the data columns to use for finding matches
    datacols <- setdiff(names(df), idcol)
    ## Sort the rows so that duplicates follow each other.
    ## The original ordering can be recreated using sortorder.
    sortorder <- do.call(order,df)
    df <- df[sortorder,]
    ## Find duplicates within each id group (idcol must match in this case).
    ## First copy is not marked.
    dupWithin <- duplicated(df)
    ## Vector for indicating which elements are duplicated between groups:
    dupBetween = rep(NA, nrow(df))
    ## Filter out within group duplicates, and find between groups duplicates (first one isn't marked). 
    dupBetween[!dupWithin] <- duplicated(df[!dupWithin,datacols])
    ## Again from the other end to ensure all duplicates are marked.
    dupBetween[!dupWithin] <- duplicated(df[!dupWithin,datacols], fromLast=TRUE) | dupBetween[!dupWithin]
    ## ============= Replace NA's with previous non-NA value ==============
    ## A consecutive sequence of within group duplicates will count as between groups duplicates if and only if
    ## they come directly after a between groups duplicate (i.e. they are a duplicate of a between groups duplicate)
    ## So we need to replace the NA's in dupBetween (i.e. the within group duplicates) with the previous non-NA value
    ## (TRUE if it is a between groups duplicate and FALSE if it isn't).
    ## This is why we sorted earlier - so that the within group duplicates of the same item would be sequential.

    ## Get indexes of non-NA's, i.e. rows that are not within group duplicates
    goodIdx <- !dupWithin
    ## Prepend a leading NA to the non-NA entries of dupBetween so that the initial index is > 0
    goodVals <- c(NA, dupBetween[goodIdx])
    ## Fill the indices of the output vector with the indices pulled from these offsets of goodVals.
    ## Add 1 to avoid indexing to zero (this works since we have prepended an extra NA at the start).
    fillIdx <- cumsum(goodIdx)+1
    ## The original vector, now with gaps filled
    dupBetween <- goodVals[fillIdx]

    ## The following example might make the algorithm clearer:
    ## df (letters 4 unique rows):    A  A  B  C  D  D  A  B  E  F
    ## idcol                     :   g1 g1 g2 g2 g2 g2 g3 g3 g3 g3
    ## goodIdx                   :    T  F  T  T  T  F  T  T  T  T
    ## cumsum(goodIdx)           :    1  1  2  3  4  4  5  6  7  8
    ## fillIdx=cumsum(goodIdx)+1 :    2  2  3  4  5  5  6  7  8  9  
    ## dupsBetween               :    T  NA T  F  F  NA T  T  F  F
    ## goodVals                  : NA T     T  F  F     T  T  F  F
    ##                                |\    |  |  |\    |  |  |  |
    ## goodVals[fillIdx]         :    T  T  T  F  F  F  T  T  F  F

    ## Undo the original sort
    dupBetween[sortorder] <- dupBetween
    ## Return the vector indicating between group duplicates
    return(dupBetween)
}

##' Find rows that are duplicated across dataframes
##'
##' This is just a wrapper around \code{\link{dupsBetweenGroups}} (which see),
##' but for separate dataframes instead of groups within a single dataframe.
##'
##' The 'by' argument can be used for specifying which columns to restrict each
##' dataframe in 'dfs' to. Each element of 'by' can be either NULL (default) or
##' a character/numeric vector. A NULL value means use all columns and a character
##' or numeric vector means use the columns indicated by that vector.
##'
##' @title Find rows that are duplicated across dataframes
##' @description Find rows that are duplicated across dataframes
##' @param dfs a list of dataframes to compare
##' @param by a list of vectors each of which indicates which columns to use for
##' the corresponding dataframe in 'dfs'. A NULL value means use all columns.
##' @param matchall If TRUE then find rows that are duplicated across ALL groups, 
##' otherwise a row only need be duplicated across 2 groups
##' @return a list of vectors indicating duplicated rows of dataframes
##' @author Ben Veal
##' @keywords utilities misc
##' @export 
dupsBetweenDFs <- function(dfs,by=NULL,matchall=FALSE) {
    dfs2 <- list()
    ## First restrict to the columns specified in 'by'
    for(i in 1:length(dfs)) {
        if(is.null(by[[i]]))
            dfs2 <- c(dfs2,list(dfs[[i]]))
        else {
            if(length(by[[i]])>1) 
                dfs2 <- c(dfs2,list(dfs[[i]][,by[[i]]]))
            else {
                x <- as.data.frame(dfs[[i]][,by[[i]]])
                names(x) <- by[[i]]
                dfs2 <- c(dfs2,list(x))
            }
        }
    }
    ## Check that we have the same number of columns for each dataframe
    ncols <- ncol(dfs2[[1]])
    if(!all(unlist(lapply(dfs2,function(df){ncol(df)==ncols}))))
        stop("differing number of columns in elements of dfs")
    ## label the dataframes
    for(i in 1:length(dfs2)) {
        ## hopefully there are no existing columns named idcolzyx
        dfs2[[i]]$idcolzyx <- i 
    }
    ## join the dataframes together
    alldata <- do.call(rbind,dfs2)
    ## find the duplicate rows
    dupsL <- dupsBetweenGroups(alldata,"idcolzyx",matchall=matchall)
    ## separate the indicator vectors in dupsL
    return(split(dupsL,alldata$idcolzyx))
}


##' @title Return the number of unique elements of x (excluding NA & NaN's)
##' @description Return the number of unique elements of x (excluding NA & NaN's)
##' @param x A vector.
##' @param warn Whether to issue warnings from 'unique' (default is FALSE).
##' @return The number of unique values of 'x' excluding NA & NaN values.
##' @author Ben Veal
##' @keywords utilities misc
##' @export 
numUnique <- function(x,warn=FALSE) {
    length(uniqueNotNA(x,warn))
}

##' @title Apply function to objects in environment matching pattern and filter
##' @description Apply a function to objects in an environment (default is globalenv() - the user workspace)
##' whose names match the regexp given as the second argument, and whose values return true when the filter
##' argument is applied.
##' @details This function is useful for browsing your workspace objects. It takes the same arguments as \code{\link{ls}}
##' but in a different order, and with a few other arguments: FUN, filter and inc.null
##' The objects are first filtered using the pattern and filter arguments, and then FUN is applied to them.
##' FUN should take a single object as its only argument. By default any objects which return NULL when FUN is applied to
##' them will not be included in the output (e.g. if FUN=names then vectors will be skipped from output since they have no names).
##' You can include these NULL objects in the output by setting inc.null=TRUE
##'
##' The objects are first filtered by name using the pattern argument (a regular expression matching the object names),
##' and then using the filter argument if supplied. The filter can be any expression involving "x" (a variable containing
##' the current object being tested) which returns TRUE or FALSE.
##' The function can be called using one of the following formats:
##' 
##' lsapply(FUN,pattern,filter[,named args])
##' lsapply(FUN,filter[,named args])
##' lsapply(pattern,filter[,named args])
##' lsapply(pattern[,named args])
##' lsapply(filter,pattern[,named args])
##' lsapply(filter[,named args])
##'
##' (the named args are optional)
##' 
##' @param pattern A regular expression matching the objects to apply the FUN to.
##' By default this is set to ".*", i.e. match all objects. If this argument is missing then the filter argument can take its place.
##' @param filter An expression for filtering the objects before apply FUN to them. Within the expression the object under
##' scrutiny can be referenced with "x", e.g: is.data.frame(x) && nrow(x) > 300
##' @param FUN A function to apply to each matched object (default=dim)
##' @param name An environment to search for objects (see \code{\link{ls}}). Default is globalenv() - the user workspace.
##' @param all.names If TRUE all object names are returned. If FALSE (default), names beginning with '.' are omitted (see \code{\link{ls}}).
##' @param sorted If TRUE (default) the object names are sorted alphabetically before passing to FUN (see \code{\link{ls}}).
##' @param inc.null Whether or not to include NULL values in the output. See details.
##' @return A named list of objects after applying FUN.
##' @examples 
##' lsapply()
##' lsapply(names)
##' lsapply("data")
##' lsapply(names,"data")
##' lsapply(nrow(x) > 100)
##' lsapply(names,nrow(x) > 100)
##' lsapply("data",nrow(x) > 100)
##' lsapply(names,"data",nrow(x) > 100)
##' @author Ben Veal
##' @keywords utilities misc
##' @export 
lsapply <- function(FUN=dim, pattern=".*", filter, name=globalenv(), all.names=FALSE, sorted=TRUE, inc.null=FALSE) {
    ## check types of 1st and 2nd args for non-standard use cases
    arg1 <- substitute(FUN)
    arg2 <- substitute(pattern)
    ## if 1st arg is a function there could still be further unnamed args
    arg1isFunction <- is.name(arg1) && is.function(eval(arg1))
    ## if 1st arg is a pattern there could still be further unnamed args
    arg1isPattern <-  is.character(arg1)
    arg2isPattern <- is.character(arg2)
    ## if 1st or 2nd arg is a filter then there should be no 3rd unnamed arg
    arg1isFilter <- is.call(arg1) && missing(filter)
    arg2isFilter <- is.call(arg2) && missing(filter)
    ## parse the arguments for non-standard use cases
    if(arg1isFilter) { ## 1st arg is a filter, e.g: lsapply(nrow(x) > 100)
        filter <- arg1
        FUN <- dim
    } else if(arg1isPattern) { ## 1st arg is a pattern, e.g: lsapply("data"...)
        if(arg2isFilter) { ## 2nd arg is a filter, e.g: lsapply("data", nrow(x) > 100...)
            filter <- arg2
        }
        pattern <- FUN
        FUN <- dim
    } else if(arg1isFunction) { ## 1st arg is a function, e.g: lsapply(dim,...)
        if(arg2isFilter) { ## 2nd arg is a filter, e.g: lsapply(dim, nrow(x) > 100)
            filter <- arg2
            pattern <- ".*"
        } else if(arg2isPattern) {
            filter <- substitute(filter)
        } else {
            stop(paste("Invalid 2nd arg:",deparse(arg2)))
        }
    } else {
        stop(paste("Invalid 1st arg:",deparse(arg1)))
    }
    ## get names of matching objects in the workspace
    names1 <- ls(name=name,pattern=pattern,all.names=all.names,sorted=sorted)
    ## filter the objects using the filter arg
    filterfun <- function(arg) {
        eval(filter,envir=list(x=get(arg)))
    }
    if(missing(filter)) {
        whichkeep <- rep(TRUE,length(names1))
    } else {
        whichkeep <- sapply(names1,filterfun)
    }
    whichkeep[which(sapply(whichkeep,function(y) {length(y)==0 || is.null(y)}))] <- FALSE
    whichkeep <- unlist(whichkeep)
    stopifnot(is.logical(whichkeep))
    ## apply FUN to the filtered objects
    vals <- lapply(names1[whichkeep], function(x) FUN(get(x)))
    ## name them
    names(vals) <- names1[whichkeep]
    ## return the results, removing NULL values if necessary
    if(inc.null) vals
    else vals[which(!sapply(vals,is.null))]
}
vapniks/mergeutils documentation built on May 3, 2019, 4:33 p.m.