# R/dupes_tag.R

# A duplicates tagger similar to the `duplicates tag` command in Stata.
# It takes a data frame plus one or more column names and returns a vector with
# one entry per row: the number of OTHER rows that share that row's exact
# combination of values (covariate pattern) in the named columns.

# Tag duplicate rows of a data frame.
#
# Arguments:
#   df   A data frame (tibbles and other data.frame subclasses are accepted).
#   ...  Names of the columns of `df` that define the covariate pattern. Each
#        name may be passed as its own argument, e.g. dupes_tag(df, "a", "b"),
#        or several names may be passed as one character vector.
#
# Returns:
#   A numeric vector with one element per row of `df`, in row order. Each
#   element is the number of OTHER rows that have identical values in all of
#   the named columns (0 means the row's pattern is unique). Rows with a
#   missing value in any of the named columns get NA and are not counted as
#   duplicates of anything.
#
# Errors:
#   Stops if `df` is not a data frame, if no column names are supplied, or if
#   any supplied name is not a column of `df`.
dupes_tag <- function(df, ...) {

  ###########
  # safeguards
  ###########

  if (!inherits(df, "data.frame")) {
    stop("df must be a data frame")
  }

  # Flatten ... so both dupes_tag(df, "a", "b") and dupes_tag(df, c("a", "b"))
  # end up as one character vector of column names.
  cols <- as.character(unlist(list(...), use.names = FALSE))
  if (length(cols) == 0L) {
    stop("One or more arguments required in ...")
  }

  # Every requested name must be a column of df. collapse = ", " builds the
  # list of offenders correctly for one name as well as for many.
  not_found <- setdiff(cols, colnames(df))
  if (length(not_found) > 0L) {
    stop(
      "Some arguments were not found in the columns of df:\n",
      "  ", paste(not_found, collapse = ", ")
    )
  }

  # Naming the same column twice does not change the pattern.
  cols <- unique(cols)

  # Nothing to count in an empty data frame.
  if (nrow(df) == 0L) {
    return(numeric(0))
  }

  ###########
  # count patterns
  ###########

  keys <- lapply(cols, function(nm) df[[nm]])

  # Recode each key column as integers (position of first occurrence), so the
  # per-row pattern key below is built from digits only and two different
  # patterns can never paste to the same string.
  codes <- lapply(keys, function(col) match(col, unique(col)))
  pattern <- do.call(paste, c(codes, sep = "."))

  # Number each distinct pattern, then count how many rows carry each number.
  group_id <- match(pattern, unique(pattern))
  group_size <- tabulate(group_id)

  # Subtract 1 so a row does not count itself; this also makes the result
  # double rather than integer.
  dupes <- group_size[group_id] - 1

  # Rows with a missing value in any key column are reported as NA.
  has_missing <- Reduce(`|`, lapply(keys, is.na))
  dupes[has_missing] <- NA

  dupes
}
# svenhalvorson/SvenSFPS documentation built on May 21, 2019, 11:42 a.m.