R/data_analysis.R

Defines functions Mark TestMark Pr PrCols DescDf Pause IsInstalled InstallIf InstallLoad InstallLoadMulti Nullify TestNullify LsObjects ReportMemoryUsage ReportObjectSize ReportObjectSize_true SourceHttps TestSourceHttps OpenDataFiles ReadDataDir WriteDataWithSep TrimTrailing Flatten_repField_v1 TestFlatten_repField_v1 Flatten_repField TestFlatten_repField Concat_stringColsDf TestConcat_stringColsDf Quantile TestQuantile SubsetCols TestSubsetCols Col TestCol HandleNaPropFcn DtSimpleAgg TestDtSimpleAgg Ci_forWLRatio TestCi_forWLRatio CltCi CltCiDf RelativeRiskCi RelativeRiskCi_approx TestRelativeRiskCi_approx Remap_lowFreqCategs TestRemap_lowFreqCategs FlagRow_ifLowFreqValue TestFlagRow_ifLowFreqValue CheckColFreqDt DtReplaceNa TestDtReplaceNa CategMode RowMode Example ContiMode DtRemapNa DtRemap_colValues TestDtRemap_colValues RoundDf RoundDt SignifDf SignifDt StarCiDf StarPvalueDf TidyCiDf RegMod_coefTableSumm RegModList_coefTableSumm xtable2 Entropy SplitStrCol TestSplitStrCol PartitionCi BootstrapCi TestPartitionCi ReplaceStringMulti CapWords TestCapWords StringCartesianProd TestStringCartesianProd SortDf TestSortDf SymRelErrFcn TestSymRelErrFcn CalcErrDfPair TestCalcErrDfPair FreqTables_simpleDiff MinInd MinIndDf Example Debug Check_andFix_dependencies DropEndingVowels TestDropEndingVowels DropEndingChars TestDropEndingChars AbbrString TestAbbrString AbbrStringVec AbbrStringCols TestAbbrStringCols CommonString TestCommonString SumCols_viaPattern TestSumCols_viaPattern SumCols_multiPatterns TestSumCols_multiPatterns print.data.frame DichomVar Add_dichomVar TestAdd_dichomVar Add_dichomVarMulti TestAdd_dichomVarMulti BirthYear_toAgeCateg ReportPropertyDf TestReportPropertyDf ReportNA TestReportNA ReplaceNA TestReplaceNA BalanceSampleSize

#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# author: Reza Hosseini

## helper functions for R data analysis
# proprietary information

#' @import data.table

Mark = function(x, text="") {
  str = paste0(
      "\n *** object class: ",
      paste(as.character(class(data.table())), collapse=", "))
  if (text != "") {
    str = paste0(str, "; message: ", text)
  }
  cat(str, "\n")
  print(x)
}

TestMark = function() {
  x = 2
  Mark(x, text="this is the value of x")
}

# print a vector horizontally
Pr = function(x) {
  for (i in 1:length(x)) {
    print(x[i])
  }
}

## print columns of data frame in readable format
PrCols = function(df) {
  Pr(colnames(df))
}

## some basic info about data.frame
DescDf = function(df) {

  print(paste0(
      "this is the dim: ", "row_num: ", dim(df)[1], "; col_num: ", dim(df)[2]))
  #print("sample data")
  #print(df[1, ])

  for (col in colnames(df)) {
    x = df[[col]]
    print(paste0(col, ":  ", class(x)))
    tab = table(x)
    k = min(length(tab), 8)
    freqDf = data.frame(tab)[1:k, ]
    print(freqDf)
  }
}

# wait for key to continue
Pause = function() {
    cat ("Press [enter] to continue")
    line = readline()
}

## checks if a library is installed
IsInstalled = function(pkg) {
  is.element(pkg, installed.packages()[ , 1])
}

## it installs a library if its not installed already
InstallIf = function(pkg, Func=install.packages, ...) {
  if (!IsInstalled(pkg)) {
    Func(pkg, ...)
  }
}

## it installs a library if not installed and it loads it regardless
InstallLoad = function(pkg) {
  InstallIf(pkg)
  library(pkg, character.only=TRUE)
}

## same as above but does it for set of libraries
InstallLoadMulti = function(pkgs) {
  for (pkg in pkgs) {
    InstallIf(
        pkg,
        #repos = c("http://rstudio.org/_packages", "http://cran.rstudio.com"),
        #dependencies=TRUE
    )

    eval(parse(text=paste0("library(package=", pkg, ")")))
  }
}

## purpose is to release memory by "removing" objects
# this function assigns NULL to the objects passed by their name in global env.
# then it collects the garbage twice (deliberately)
# setting to NULL is done since after "rm" and "gc"
# sometimes R does not release memory
Nullify = function(objectNames) {

  for (objectName in objectNames) {
    eval(parse(text=paste(objectName, "<<- NULL")))
    gc()
    gc()
  }
}

TestNullify = function() {

  x = rnorm(10^8)
  y = x + 1
  z = x + 2

  ## first nullify the objects
  Nullify(c("x", "y", "z"))

  ## then remove them if you like,
  # although they don't take space
  rm("x", "y", "z")

  ## Nullify seems to work inside a function as well
  x = rnorm(10^8)
  (function()Nullify("x"))()
  ## this will be true
  x == NULL

  ## Nullify seems to release memory after setting a local variable to NULL
  # as well
}

## memory management
LsObjects = function(
    pos=1,
    pattern,
    order.by,
    decreasing=FALSE,
    head=FALSE,
    n=5)  {
  napply = function(names, fn) sapply(names, function(x)
                     fn(get(x, pos = pos)))
  names = ls(pos = pos, pattern = pattern)
  obj.class = napply(names, function(x) as.character(class(x))[1])
  obj.mode = napply(names, mode)
  obj.type = ifelse(is.na(obj.class), obj.mode, obj.class)
  obj.prettysize = napply(
    names,
    function(x) {
      capture.output(format(utils::object.size(x), units="auto"))}
  )
  obj.size = napply(names, object.size)
  obj.dim = t(napply(names, function(x)
            as.numeric(dim(x))[1:2]))
  vec = is.na(obj.dim)[, 1] & (obj.type != "function")
  obj.dim[vec, 1] = napply(names, length)[vec]
  out = data.frame(obj.type, obj.size, obj.prettysize, obj.dim)
  names(out) = c("Type", "Size", "PrettySize", "Rows", "Columns")
  if (!missing(order.by))
    out = out[order(out[[order.by]], decreasing=decreasing), ]
  if (head)
    out = head(out, n)
  out
}

# shorthand
ReportMemoryUsage = function(..., n=10) {
  LsObjects(..., order.by="Size", decreasing=TRUE, head=TRUE, n=n)
}

ReportObjectSize = function(x, name="") {
  out = paste(
    "size of", name, "is", round(as.numeric(object.size(x) / 10^6), 3), "MB")
  return(out)
}

ReportObjectSize_true = function(x, name="") {
  size = length(serialize(modList, NULL)) / 10^6
    out = paste(
    "size of", name, "is", round(size, 3), "MB")
  return(out)
}


### git functions:
SourceHttps = function(url, ...) {
  # load package
  require(RCurl)

  # parse and evaluate each .R script
  sapply(
  c(url, ...),
  function(u) {
    eval(parse(text = getURL(u, followlocation = TRUE,
    cainfo = system.file("CurlSSL", "cacert.pem", package = "RCurl"))),
    envir = .GlobalEnv)
  }
  )
}

# Example
TestSourceHttps = function() {
  SourceHttps("https://raw.github.com/tonybreyal/Blog-Reference-Functions/master/R/bingSearchXScraper/bingSearchXScraper.R",
       "https://raw.github.com/tonybreyal/Blog-Reference-Functions/master/R/htmlToText/htmlToText.R")
}

## Reading and Writing Files
# open the files in a dir; if fileList is not specific, we try to open all
OpenDataFiles = function(
    path,
    fileList=NULL,
    ReadF=read.csv,
    colClasses=NA,
    patterns=NULL,
    Src=function(){},
    parallel=FALSE,
    parallel_outfile="") {

  if (is.null(fileList)) {
    fileList = list.files(path)
  }

  if (!is.null(patterns)) {
    fileListInd = do.call(
        intersect,
        lapply(
            X=patterns,
            FUN=function(p){grep(pattern=p, x=fileList)}))

    fileList = fileList[fileListInd]
  }

  k = length(fileList)
  if (k == 0) {
    warning("no files in path or no files followed the patterns specified.")
    return(NULL)
  }

  ## read one file
  Func = function(i) {
    Src()
    fn0 = fileList[[i]]
    print(fn0)
    fn = paste(path, fn0, sep="")
    fn = file(fn)
    df = ReadF(fn, colClasses=colClasses)
    return(df)
  }


  if (!parallel) {
    dfList = lapply(X=1:k, FUN=Func)
  } else {
    suppressMessages(library("parallel"))
    closeAllConnections()
    no_cores = detectCores() - 3
    no_cores = min(no_cores, k + 1)
    Mark(no_cores, "no_cores")
    # Initiate cluster
    cl = makeCluster(no_cores, outfile=parallel_outfile)
    clusterExport(
            cl=cl,
            list(
                "fileList", "path", "ReadF", "Src"),
            envir=environment())
    dfList =  parLapply(cl=cl, X=1:k, fun=Func)
    stopCluster(cl)
    closeAllConnections()
  }

  names(dfList) = fileList
  return(dfList)
}


# (old function, check) Read data directory
ReadDataDir = function(
    dir,
    Read=read.csv,
    ProcessDf=NULL,
    num=NULL) {

  fileNames = list.files(dir, pattern="*.csv")

  if (!is.null(num)) {
    k = min(num, length(fileNames))
    fileNames = fileNames[1:k]
  }

  fileNames2 = paste0(dir, '/', fileNames)

  G = Read

  if (!is.null(ProcessDf)) {
    G = function(x, ...) {
      ProcessDf(Read(x, ...))
    }
  }

  dfList = lapply(X=fileNames2, FUN=G)

  return(dfList)
}


## write a dataframe to a file with separators;
# can be used for qwiki for example.
WriteDataWithSep = function(
    fn, path=NULL, data, dataSep="|", headerSep="||") {

  options("scipen"=100, "digits"=4)
  fn = paste(path, fn, sep="")
  sink(fn)
  n = dim(data)[1]
  l = dim(data)[2]
  header = names(dat2)
  cat(headerSep)
  for (j in 1:l) {
    cat(header[j])
    cat(headerSep)
  }
  cat("\n")
  for (i in 1:n) {
    cat(dataSep)
    for (j in 1:l) {
      el = as.character(data[i, j])
      cat(el)
      cat(dataSep)
    }
    cat("\n")
  }
  sink()
}

## removes trailing spaces
TrimTrailing = function(x) {
  gsub("^\\s+|\\s+$", "", x)
}

## flatting a column with repeated field
# below we have another version of same function
# which is most likely faster for large data
Flatten_repField_v1 = function(df, listCol, sep=NULL) {

  if (!is.null(sep)) {
    s = strsplit(as.character(df[ , listCol]), split=sep)
  }

  cols = colnames(df)
  cols2 = cols[cols != listCol]
  outDf = data.frame(listCol=unlist(s))
  colnames(outDf) = listCol
  for (col in cols2) {
    outDf[ , col] = rep(df[ , col], sapply(s, length))
  }
  return(outDf)
}

TestFlatten_repField_v1 = function() {

  df = data.frame(list("var1"=c("a,b,c", "d,e,f"), "var2"=1:2, "var3"=3:4))
  print(df)
 Flatten_repField_v1(df=df, listCol="var1", sep=",")

  df = data.frame(list("var1"=I(list(1:3, 4:6)), "var2"=1:2, "var3"=3:4))
  print(df)
 Flatten_repField_v1(df=df, listCol="var1", sep=NULL)
}

## flatten a column (listCol) of df with multiple values
# the column elements could be lists
# or could be separated by sep e.g. comma
# this is faster than v1, but more complex
Flatten_repField = function(df, listCol, sep=NULL) {

  if (!is.null(sep)) {
    Func = function(x) {
      l = as.vector(strsplit(x, sep)[[1]])
      return(l)
    }

    df$newListCol = lapply(X=as.character(df[ , listCol]), FUN=Func)
  } else {
    df$newListCol = df[ , listCol]
    df = SubsetCols(df=df, dropCols=listCol)
  }

  cols = names(df)[names(df) != "newListCol"]
  dt = data.table(df)[ , unlist(newListCol), by=cols]
  df = data.frame(dt)
  df[ , listCol] = df[ , "V1"]
  df = df[ , ! names(df) %in% "V1"]
  return(df)
}

TestFlatten_repField = function() {

  df = data.frame(list("var1"=c("a;b;c", "d;e;f"), "var2"=1:2, "var3"=3:4))
  print(df)
  Flatten_repField(df=df, listCol="var1", sep=";")

  df = data.frame(list("var1"=I(list(1:3, 4:6)), "var2"=1:2, "var3"=3:4))
  print(df)
  Flatten_repField(df=df, listCol="var1", sep=NULL)
}

## creating a single string column using multiple columns (cols)
# and adding that to the data frame
Concat_stringColsDf = function(df, cols, colName=NULL, sepStr="-") {

  x = ""
  if (is.null(colName)){
    colName = paste(cols, collapse=sepStr)
  }

  for (i in 1:length(cols)) {
    col = cols[i]
    x = paste0(x, as.character(df[ , col]))
    if (i < length(cols)) {
      x = paste0(x, "-")
    }
  }
  df[ , colName] = x
  return(df)
}

TestConcat_stringColsDf = function() {

  df = data.frame(list("a"=1:3, "b"=c("rr",  "gg", "gg"), "c"=1:3))
  Concat_stringColsDf(df=df, cols=c("a", "b", "c"), colName=NULL, sepStr="-")
}


## left and right quantiles
# these quantiles have many desirable properties compared to
# regular quantiles:
# (1) the result has been observed
# (2) its symmetric
# (3) its truely equivariant
# See: Statistical models for agroclimate risk analysis (2010)
# Reza Hosseini, UBC.
# https://open.library.ubc.ca/cIRcle/collections/ubctheses/24/items/1.0070885
Quantile = function(x, probs, direction="left") {

  x = na.omit(x)
  x = sort(x)
  n = length(x)


  LeftQuantileSorted = function(x, p) {
    npi = (n*p) %/% 1

    if (n*p > npi) {
        lqv = x[npi + 1]} else if (n*p == npi & p != 0) {
        lqv = x[npi]} else if (p == 0) {
        lqv = -Inf}

    return(lqv)
  }

  RightQuantileSorted = function(x, p) {
    npi = (n*p) %/% 1

    if (n*p > npi) {
      rqv = x[npi + 1]} else if (n*p == npi & p != 1) {
        rqv = x[npi + 1]} else if (p == 1) {
      rqv = Inf}

    return(rqv)
  }


  G = LeftQuantileSorted
  if (direction != "left") {
    G = RightQuantileSorted
  }

  Func = function(p) {
    G(x=x, p=p)
  }

  qs = sapply(FUN=Func, X=probs)

  return(qs)
}

TestQuantile = function() {

  x = c(1, 2, 3)
  probs = 0:10 / 10.0

  Quantile(x, probs, direction="left")
  Quantile(x, probs, direction="right")


}


## subset columns of a data frame / data table in a clear way
SubsetCols = function(df, keepCols=NULL, dropCols=NULL) {

    cols = colnames(df)
    if (!is.null(keepCols)) {
      cols = cols[cols %in% keepCols]
    }

    if (!is.null(dropCols)) {
      cols = cols[!(cols %in% dropCols)]
    }

    if ("data.table" %in% class(df)) {
      return(df[ , cols, with=FALSE])
    }

    return(df[ , cols, drop=FALSE])
}


TestSubsetCols = function() {

  dt0 = data.table(x=1:10, y=1:10, z=1:10)
  colnames(dt0)

  SubsetCols(df=dt0, keepCols=NULL, dropCols="x")
  funcly::SubsetCols(df=dt0, keepCols=NULL, dropCols="x")

  cols = c("x", "y")
  dt0[ , cols, with=FALSE]

}


## get a column values from data.table without failure
# failure can happen via "get" if the passed parameter for column name
# already is the same as the column name in the data table
# see example in test function for issue with get
Col = function(dt, col) {
  dt[[col]]
}

TestCol = function(){

  df = data.frame("x"=1:10, "y"=1:10)
  dt = data.table(df)
  dt[ , "x"] # this will be a data frame still
  dt[ , x] # is a vector as we desire
  col = "x"
  dt[ , get(col)] # that does work
  x = "x" # this is where "get" gets confused
  dt[ , get(x)] # returns error: Error in get(x) : invalid first argument

  # on the other hand, all of the below work

  Col(dt, col)
  Col(dt, x)

}


## a function which handles NA and accepts a proportion of NAs up to 90%
HandleNaPropFcn = function(Fcn, p=0.9) {

  Func = function(x) {
    l = length(x)
    k = sum(!is.na(x))
    if ((k/l) > p) {
      x = na.omit(x)
      return(Fcn(x))
    }
    return(NA)
  }
  return(Func)
}


#dt1 = copy(dt0)

#cols = c(gbCols, valueCols)
#dt0 = dt0[ , cols, with=FALSE]
#colnames(dt0) = paste0("X", 1:length(cols))
#gbCols2 = colnames(dt0)[1:length(gbCols)]
#outDt = dt0[ , lapply(.SD, F), by=gbCols2]
#colnames(outDt) = cols

#outDt = dt1[ , lapply(.SD, F), by=as.list(dt1[ , gbCols, with=FALSE])]



## simple aggregation with data.table
# gbCols are group by columns
# valueCols are the value columns we want to aggregate
# cols is c(gbCols, valueCols) so only two out of three are needed
# AggFunc is the aggregate function
DtSimpleAgg = function(
    dt,
    gbCols=NULL,
    valueCols=NULL,
    cols=NULL,
    AggFunc=sum) {

  library(data.table)
  ## this aggregates multiple columns with the same function
  # this first subsets the data to the cols we need
  # then it aggregates with F

  # if we are not given the valueCols or all cols we need
  # we assume we need all cols in the dt
  if (is.null(cols) & (is.null(valueCols) | is.null(gbCols))) {
    cols = names(dt)
  }

  if (is.null(cols)) {
    cols = c(gbCols, valueCols)
  }

  if (is.null(gbCols)) {
    gbCols = setdiff(cols, valueCols)
  }

  dt = dt[ , cols, with=FALSE]

  outDt = dt[ , lapply(.SD, AggFunc), by=gbCols]
  return(outDt)
}


TestDtSimpleAgg = function() {

  x = rnorm(100000)
  y = rnorm(100000)
  dt = data.table(x=x, y=y, x1=round(x), y1=round(y)+1)

  DtSimpleAgg(
      dt=dt,
      gbCols=c("x1", "y1"),
      valueCols=c("x", "y"),
      AggFunc=mean)

  funcly::DtSimpleAgg(
      dt=dt,
      gbCols=c("x1", "y1"),
      valueCols=c("x", "y"),
      AggFunc=mean)

}

## calculating bootstrap conf intervals for win/loss ratio
# input is a binary vector
# the idea is to use bootstrap
# for any bootstrap sample which degenerates
# because we have all ones or all zeros, we add (0,1) to the vector
# we also do that to the original vector!
# if we don't confidence intervals for vectors
# such as (0,0) will be one point (or (1,1))
Ci_forWLRatio = function(x) {

  flag = 'None'
  if (sum(x) == 0) {
    x = c(x, 1, 0)
    flag = 'Zero'
  }

  if (sum(1-x) == 0) {
    x = c(x, 0, 1)
    flag = 'Inf'
  }

  Bootst = function(data, F, num=1000) {

    n = dim(data)[1]
    G = function(i) {
      samp = sample(1:n, n, replace=TRUE)
      data2 = data[samp, , drop=FALSE]
      F(data2)
    }

    ind = as.list(1:num)
    res = lapply(X=ind, FUN=G)
    res =unlist(res)
    return(res)
  }

  data = data.frame(x)

  WL = function(data) {
    y = data[ , 1]
    if (sum(y) == 0) {y = c(y, 1, 0)}
    if (sum(1-y) == 0) {y = c(y, 0, 1)}
    out = sum(y) / sum(1-y)
    return(out)
  }

  res = Bootst(data=data, F=WL, num=1000)
  qVec = quantile(res, c(0.025, 0.975))

  ## we adjust the extremes of the interval in the degenerate case
  if (flag == 'Inf') {
    qVec[2] = Inf
  }

  if (flag == 'Zero') {
    qVec[1] = 0
  }

  return(qVec)
}

TestCi_forWLRatio = function() {

  Ci_forWLRatio(c(rep(1, 7)))
  Ci_forWLRatio(c(rep(1, 6), 0, 0, 0))
  Ci_forWLRatio(c(0, 0, 0))
  Ci_forWLRatio(c(1, 0, 0))
  Ci_forWLRatio(c(1, 1, 1))
}

## calculates CLT confidence interval
CltCi = function(x, p=0.95) {
  muHat = mean(x)
  error = qnorm(1 - (1-p)/2) * sd(x)/sqrt(length(x))
  upper = muHat + error
  lower = muHat - error
  return(list("muHat"=muHat, "error"=error, "upper"=upper, "lower"=lower))
}

## calculates CIs for multiple columns in a df and returns a df
CltCiDf = function(df, cols, p=0.95) {

  F = function(col) {
    x = df[ , get(col)]
    return(CltCi(x, p=p))
  }

  res = lapply(cols, FUN=F)
  names(res) = cols
  outDf = data.frame(matrix(unlist(res), nrow=4, byrow=TRUE))
  outDf[ , "metric"] = cols
  names(outDf) = c("muHat", "error", "upper", "lower", "metric")
  outDf = outDf[ , c("metric", "muHat", "error", "upper", "lower")]

  return(outDf)
}

## calculates relative risk
RelativeRiskCi = function(a1, n1, a2, n2) {

  p1 = a1/n1
  p2 = a2/n2
  if (p2 == 0) {
    print("the probability in the denom is zero, infinite risk!")
    return()
  }
  risk = p1/p2
  logRisk = log(risk)
  se = sqrt(1/a1 + 1/a2 - 1/n1 - 1/n2)
  logRiskUpper = logRisk + 1.96*se
  logRiskLower = logRisk - 1.96*se
  riskUpper = exp(logRiskUpper)
  riskLower = exp(logRiskLower)
  return(list(
      "risk"=risk,
      "riskLower"=riskLower,
      "riskUpper"=riskUpper,
      "logRisk"=logRisk,
      "logRiskUpper"=logRiskUpper,
      "logRiskLower"=logRiskLower,
      "logScaleError"=1.96*se
      ))
}

## calculates an upper bound/conservative CI for
# relative risk when the sample sizes are missing
# but their relative size is know
# e.g. this is true for experiment mods
RelativeRiskCi_approx = function(a1, a2, n2_n1_ratio=1) {

  risk = a1/a2 * n2_n1_ratio
  logRisk = log(risk)
  se = sqrt(1/a1 + 1/a2)
  logRiskUpper = logRisk + 1.96*se
  logRiskLower = logRisk - 1.96*se
  riskUpper = exp(logRiskUpper)
  riskLower = exp(logRiskLower)
  return(list(
      "risk"=risk,
      "riskLower"=riskLower,
      "riskUpper"=riskUpper,
      "logRisk"=logRisk,
      "logRiskUpper"=logRiskUpper,
      "logRiskLower"=logRiskLower,
      "logScaleError"=1.96*se
      ))
}

TestRelativeRiskCi_approx = function() {

  Func = function(n1) {
    a1 = 30
    e1 = RelativeRiskCi(a1=a1, n1=n1, a2=2*a1, n2=3*n1)[["logScaleError"]]
    e2 = RelativeRiskCi_approx(a1=a1, a2=2*a1, n2_n1_ratio=3)[["logScaleError"]]
    return(c(e1, e2))
  }

  grid = (a1 + 1):200
  res = lapply(grid, FUN=Func)
  outDf = data.frame(matrix(unlist(res), nrow=length(grid), byrow=TRUE))


  plot(
      grid, outDf[ , 1], ylim=c(0, 2*outDf[1, 2]),
      col="blue", ylab="CI error in log risk scale", xlab="n1")
  abline(h=outDf[1, 2], col="red")

  abline(v=2*a1, col="grey")
  text(x=2*a1, y=outDf[1, 2]/2, labels="2*a1")
  text(x=grid[length(grid)]-5, y=outDf[1, 2], labels="approx")
}

# remap low freq labels to a new label in data
# this is useful to avoid model breakage
# this also remaps NAs to the newLabel
# labelsNumMax decides whats the max number of labels allowed
Remap_lowFreqCategs = function(
    dt,
    cols,
    newLabels="other",
    otherLabelsToReMap=NULL,
    freqThresh=5,
    labelsNumMax=NULL,
    remapNA=TRUE) {

  # first check if all cols passed on are present in the data.table
  ind = cols %in% colnames(dt)
  if (sum(ind) < length(ind)) {
    warning(paste(
      "Not all cols provided are in the data.table.\n",
      "These cols are missing:\n",
      paste(cols[!ind], collapse="; "),
      "\n",
      "We return dt unchanged.",
      collapse=" "))
    return(dt)
  }

  for (col in cols) {
    c = class(dt[[col]])
    if (c != "character") {
      warning(paste(
          "class of the column: ", col,
          " was not character and was set to character."))
      dt[ , col] = as.character(dt[[col]])
    }
  }

  if (!"data.table" %in% class(dt)) {
    warning("dt is not a data.table")
    return(list("dt"=dt, "Func"=NULL, "freqLabelsList"=NULL))
  }

  dt2 = copy(dt)

  k = length(cols)
  if (length(freqThresh) == 1) {
    freqThresh = rep(freqThresh, k)
  }

  if (length(newLabels) == 1) {
    newLabels = rep(newLabels, k)
  }

  if (!is.null(labelsNumMax) && length(labelsNumMax) == 1) {
    labelsNumMax = rep(labelsNumMax, k)
  }


  GetFreqLabels = function(i) {
    col = cols[i]
    freqDt = data.table(table(dt2[[col]]))
    colnames(freqDt) = c(col, "freq")
    freqDt = freqDt[order(freq, decreasing=TRUE)]
    freqLabels = freqDt[freq > freqThresh[i]][[col]]
    if (!is.null(labelsNumMax)) {
      maxNum = min(length(freqLabels), labelsNumMax[i])
      freqLabels = freqLabels[1:maxNum]
    }

    if (!is.null(otherLabelsToReMap)) {
      freqLabels = setdiff(freqLabels, otherLabelsToReMap)
    }
    return(freqLabels)
  }

  freqLabelsList = lapply(X=1:k, FUN=GetFreqLabels)
  names(freqLabelsList) = cols

  Func = function(dt) {
    for (i in 1:length(cols)) {
      col = cols[i]
      newLabel = newLabels[i]
      badLablesNum = sum(!dt[[col]] %in% freqLabelsList[[col]])
      if (badLablesNum > 0) {
        data.table::set(
            dt,
            i=which(!dt[[col]] %in% freqLabelsList[[col]]),
            j=col,
            value=newLabel)}
      dt = ReplaceNA(df=dt, cols=cols[i], replaceValue=newLabels[i])
    }
    return(dt)
  }

  return(list("dt"=Func(dt2), "Func"=Func, "freqLabelsList"=freqLabelsList))
}

TestRemap_lowFreqCategs = function() {

  dt = data.table(data.frame(
      "country"=c("", rep("US", 10), rep("IN", 3), rep("FR", 10), "IR", ""),
      "gender"=c(
          "", rep("MALE", 10), rep("FEMALE", 10), rep("OTHER", 3), "NONE", ""),
      "value"=rnorm(26)))

  res = Remap_lowFreqCategs(
      dt=dt, cols=c("country", "gender"), otherLabelsToReMap=c(""),
      freqThresh=5)

  print(res)

  dt2 = data.table(data.frame(
      "country"=c("", rep("NZ", 10), rep("IN", 10), rep("FR", 3), "IR", ""),
      "gender"=c(
          "", rep("MALE", 10), rep("FEMALE", 10), rep("OTHER", 3), "NONE", ""),
      "value"=rnorm(26)))
  res[["Func"]](dt2)
}


# if a label of a column is rare, we add one to the flag
FlagRow_ifLowFreqValue = function(dt, cols, freqThresh, flagCol="flag") {

  dt[ , flagCol] = 0

  for (col in cols) {
    if (!col %in% colnames(dt)) {
      warning(paste(col, " was not in the columns."))
      return(NULL)
    }
    freqDt = data.table(table(dt[[col]]))
    colnames(freqDt) = c(col, "freq")
    freqDt = freqDt[freq <= freqThresh, ]
    if (nrow(freqDt) > 0) {
      rareLabels = freqDt[[col]]
      dt[dt[[col]] %in% rareLabels, flagCol] = (
          dt[dt[[col]] %in% rareLabels, ][[flagCol]] + 1)
    }
  }

  return(dt)
}


TestFlagRow_ifLowFreqValue = function() {

  n = 20
  x = sample(
      c("horse", "cat", "cat", "dog", "dog", "cat"),
      n,
      replace=TRUE)
  y = sample(
      c("horse", "cat", "cat", "dog", "dog", "cat"),
      n,
      replace=TRUE)
  z = x

  df = data.frame("x"=x, "y"=y, "z"=z)
  dt = data.table(df)

  FlagRow_ifLowFreqValue(
      dt=dt,
      cols=c("x", "z"),
      freqThresh=2,
      flagCol="flag")

  FlagRow_ifLowFreqValue(
      dt=dt,
      cols=c("x", "y", "z"),
      freqThresh=2,
      flagCol="flag")

}

## quick check
CheckColFreqDt = function(dt, col) {

  freqDf = data.frame(table(as.character(dt[ , get(col)])))
  freqDf = freqDf[order(freqDf[ , "Freq"], decreasing=TRUE), ]
  rownames(freqDf) = NULL
  Mark(dim(freqDf), "dim(freqDf)")
  Mark(freqDf[1:min(50, nrow(freqDf)), ], "freqDf")
  return(freqDf)

}

## replaces all NAs in a data.table dt, for given cols
DtReplaceNa = function(dt, cols=NULL, replaceValue=0) {

  dt2 = copy(dt)
  if (is.null(cols)) {
    cols = names(dt2)
  }
  for (col in cols) {
    dt2[is.na(dt[[col]]), (col) := replaceValue]
  }
  return(dt2)
}

TestDtReplaceNa = function() {

  x = sample(
      c(NA, "horse", "cat", "cat", "dog", "dog", "cat"),
      n,
      replace=TRUE)
  y = sample(
      c(NA, "horse", "cat", "cat", "dog", "dog", "cat"),
      n,
      replace=TRUE)

  z = x

  df = data.frame("x"=x, "y"=y, "z"=z)
  dt = data.table(df)

  DtReplaceNa(dt, cols=colnames(dt), replaceValue="other")

}

## categorical mode
CategMode = function(x) {

  x = na.omit(x)
  if (length(x) == 0) {
    return(NULL)
  }
  ux = unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}


RowMode = function(mat) {
  apply(mat, 1, CategMode)
}

Example = function() {
  x = c('a', 'a', 'b', 'c')
  categMode(x)
  M = matrix(sample(c('a', 'b', 'c'), 25, replace=TRUE), 5, 5)
  RowMode(M)
}

## continuous mode
ContiMode = function(x) {

  d = density(x)
  d[["x"]][which.max(d[["y"]])]
}

## replacing NAs with mode or means
DtRemapNa = function(
    dt,
    cols=NULL,
    NumericReplace=function(x){mean(x, na.rm=TRUE)},
    FactorReplace=CategMode) {

  dt2 = copy(dt)

  if (is.null(cols)) {
    cols = names(dt2)
  }

  for (col in cols) {

    naProp = sum(is.na(dt2[ , get(col)])) / nrow(dt2)

    if (naProp > 0) {
      print(col)
      print(naProp)
      print(class(dt2[ , get(col)]))

      if (class(dt2[ , get(col)]) %in% c("numeric")) {
        set(dt2, i=which(is.na(dt2[ , get(col)])), j=col,
        value=mean(dt2[ , get(col)], na.rm=TRUE))
      }

      if (class(dt2[ , get(col)]) %in% c("integer")) {
        set(dt2, i=which(is.na(dt2[ , get(col)])), j=col,
        value=round(NumericReplace(dt2[ , get(col)])))
      }

      if (class(dt2[ , get(col)]) %in% c("factor", "character")) {
        set(dt2, i=which(is.na(dt2[ , get(col)])), j=col,
        value=CategMode(dt2[ , get(col)]))
      }
    }
  }

  return(dt2)
}

## remap values in a column (col) to other given values in newValues
DtRemap_colValues = function(dt, col, values, newValues, newCol=NULL) {

  if (length(values) != length(newValues)) {
    stop("Error: values and newValues do not have the same length")
  }

  if (is.null(newCol)) {
    newCol = col
  }

  F = function(x) {
    if (!(x %in% values)) {
      return(x)
    } else {
      i = which(values == x)[1]
      return(newValues[i])
    }
  }

  dt[ , newCol] = sapply(dt[ , get(col)], FUN=F)
  return(dt)
}

TestDtRemap_colValues = function() {

  dt = data.table(x=c("a", "b", "c", "a"), y=1:4)
  values = c("a", "b")
  newValues = c("A", "B")
  DtRemapColValues(dt, col="x", values=values, newValues=newValues)
}

## rounds the numeric columns in a df
RoundDf = function(df, num=1) {

  cols = names(df)[unlist(lapply(df, is.numeric))]
  if (is.null(cols)) {
    return(df)
  }
  df[ , cols] = round(df[ , cols], num)
  return(df)
}

## rounds the numeric columns in a dt
RoundDt = function(dt, num=1) {

  cols = names(dt)[which(sapply(dt, is.numeric))]
  if (is.null(cols)) {
    return(dt)
  }
  dt[ , cols] = round(SubsetCols(df=dt, keepCols=cols), num)
  return(dt)
}

## rounds the numeric columns using signif in a df
SignifDf = function(df, num=1) {

  cols = names(df)[unlist(lapply(df, is.numeric))]
  if (is.null(cols)) {
    return(df)
  }
  df[ , cols] = signif(df[ , cols], num)
  return(df)
}

## rounds the numeric columns using signif in a dt
SignifDt = function(dt, num=1) {

  cols = names(dt)[which(sapply(dt, is.numeric))]
  if (is.null(cols)) {
    return(dt)
  }
  dt[ , cols] = signif(SubsetCols(df=dt, keepCols=cols), num)
  return(dt)
}

## adding + or - to CIs to make it easy to find significant ones
StarCiDf = function(
    df, upperCol, lowerCol, upperThresh=0, lowerThresh=0,
    starCol="sig_stars") {

  df[ , starCol] = ""

  for (i in 1:length(upperThresh)) {
    ind = df[ , lowerCol] > upperThresh[i]
    if (length(ind) > 0) {
      df[ind, starCol] = paste0(df[ind, starCol], "+")
    }
  }

  for (i in 1:length(lowerThresh)) {
    ind = df[ , upperCol] < lowerThresh[i]
    if (length(ind) > 0) {
      df[ind, starCol] = paste0(df[ind, starCol], "-")
    }
  }

  return(df)
}

##
StarPvalueDf = function(
    df, pvalueCol="p-value", thresh=c(0.1, 0.05, 0.01, 0.001, 0.0001),
    starCol="pvalue_stars") {

  df[ , starCol] = ""

  ind = df[ , pvalueCol] < thresh[1]
    if (length(ind) > 0) {
      df[ind, starCol] = paste0(df[ind, starCol], ".")
  }

  for (i in 2:length(thresh)) {
    ind = df[ , pvalueCol] < thresh[i]
    if (length(ind) > 0) {
      df[ind, starCol] = paste0(df[ind, starCol], "*")
    }
  }

  return(df)
}

## this is the standard version of StarCiDf
TidyCiDf = function(
    df,  upperCol="ci_upper", lowerCol="ci_lower",
    upperThresh=c(1, 1.5, 2), lowerThresh=c(1, 0.75, 0.5), rounding=3) {

  df = StarCiDf(
      df=RoundDf(df, rounding), upperCol=upperCol, lowerCol=lowerCol,
      upperThresh=c(1, 1.5, 2), lowerThresh=c(1, 0.75, 0.5))

  return(df)
}

## Creates a table summary for the output of a regression model coefficients
# e.g. glm
RegMod_coefTableSumm = function(
    mod, label, dropVars="(Intercept)", keepVars=NULL, signif=2) {

  df = data.frame(summary(mod)[["coefficients"]])
  df = df[ , c("Estimate", "Std..Error", "Pr...t..")]
  colnames(df) = c("Estimate", "Sd", "p-value")
  df[ , "var"] = rownames(df)
  df = df[ , c("var", "Estimate", "Sd", "p-value")]
  df[ , "model_label"] = label
  if (!is.null(dropVars)) {
    df = df[!(df[ , "var"] %in% dropVars), ]
  }

  if (!is.null(keepVars)) {
    df = df[df[ , "var"] %in% keepVars, ]
  }

  df = SignifDf(df=df, num=signif)
  rownames(df) = NULL
  return(df)
}

## Creates a coef table summary for a list of models
RegModList_coefTableSumm = function(
    modList, labels=NULL, dropVars=NULL, keepVars=NULL, signif=2) {

  if (is.null(labels)) {
    labels = names(modList)
  }
  Func = function(i) {
    mod = modList[[i]]
    label = labels[i]
    out = RegMod_coefTableSumm(
        mod=mod,
        label=label,
        dropVars=dropVars,
        keepVars=keepVars,
        signif=signif)
    return(out)
  }

  outDf = do.call(what=rbind, args=lapply(X=1:length(modList), FUN=Func))
  return(outDf)
}

## xtable with vertical dividers
# we do not capitalize here as an exception
# since this is a minor tweak to existing function
xtable2 = function(x, caption="", label="label", ...) {

  MakeAlignString = function(x) {
    k = ncol(x)
    format_str = ifelse(sapply(x, is.numeric), "r", "l")
    out = paste0("|", paste0(c("r", format_str), collapse = "|"), "|")
    return(out)
  }

  return(xtable(x, caption=caption, label=label, ..., align=MakeAlignString(x)))
}

## entropy
Entropy = function(p) {
  if (min(p) < 0 || sum(p) <= 0) {
    pNorm = p[p > 0] / sum(p)
  }
  -sum(log2(pNorm)*pNorm)
}

## should work with both data frame and data table
SplitStrCol = function(df, col, sepStr) {

  dt = data.table(df)

  if (sepStr == "") {
    Func = function(x) {
      nchar(x)
    }
    sepNums = nchar(as.character(dt[ , get(col)]))
    colNum = sepNums[1]
  } else {

    F = function(x) {
      lengths(regmatches(x, gregexpr(sepStr, x)))
    }
    sepNums = unlist(lapply(FUN=Func, X=dt[ , get(col)]))
    # sepNums = dt[, F(get(col)), by = 1:nrow(dt)][ , V1]
    # second approach but it wasnt really faster
    colNum = sepNums[1] + 1
  }


  print(summary(sepNums))

  if (max(sepNums) != min(sepNums)) {
    warning("the strings do not have the same number of sepStr within.")
    return(NULL)
  }

  setDT(dt)[ , paste0(col, 1:colNum):=tstrsplit(
      get(col), sepStr, type.convert=TRUE, fixed=TRUE)]

  newCols = paste0(col, 1:colNum)
  return(list("dt"=copy(dt), "newCols"=newCols))
}

TestSplitStrCol = function() {

  df = data.frame(
    attr = c(1, 30 ,4 ,6 ),
    type = c('foo_and_bar_and_bar3', 'foo_and_bar_2_and_bar3')
  )

  sepStr = "_and_"
  col = "type"

  SplitStrCol(df=df, col=col, sepStr=sepStr)

  dt = data.table(df)
  SplitStrCol(df=dt, col=col, sepStr=sepStr)

  df = data.frame(
    attr = c(1, 30 ,4 ,6 ),
    type = c('aaa', 'abc')
  )

  SplitStrCol(df=data.table(df), col="type", sepStr="")
}

## this applies the jack-knife method
# F is an estimator which is a function of dt and returns a vector
# we want a CI for each of the components returned by F
# dt is a data.table
PartitionCi = function(
    dt, Estim, bucketCol=NULL, bucketNum=NULL, method="jk", conf=0.95) {

  if (is.null(bucketCol)) {
    bucketCol = "bucket"
    n = nrow(dt)
    bucketSize = floor(n / bucketNum)
    r = n - bucketSize*bucketNum
    bucketVec = c(rep(1:bucketNum, bucketSize))
    if (r > 0) {
      bucketVec = c(bucketVec, 1:r)
    }
    bucketVec = sample(bucketVec)
    dt[ , "bucket"] =  bucketVec
  }

  buckets = unique(dt[ , get(bucketCol)])

  Jk = function(b) {
    dt2 = dt[get(bucketCol) != b]
    dt2 = SubsetCols(dt2, dropCols=bucketCol)
    return(Estim(dt2))
  }

  Simple = function(b) {
    dt2 = dt[get(bucketCol) == b]
    dt2 = SubsetCols(dt2, dropCols=bucketCol)
    return(Estim(dt2))
  }

  if (method == "jk") {
    G = Jk
  } else {
    G = Simple
  }

  estimList = lapply(X=buckets, FUN=G)
  x0 = estimList[[1]]
  names = names(x0)

  estimDf = setNames(
      data.frame(matrix(ncol=length(x0), nrow=length(buckets))),
      names)

  for (i in 1:length(buckets)) {
    estimDf[i, ] = estimList[[i]]
  }

  CltCi = function(x) {
    x = na.omit(x)
    m = mean(x)
    s = sd(x)
    n = length(x)

    if (method == "jk") {
      estimSd = sqrt(n-1) * s
    } else {
      estimSd = s / sqrt(n)
    }

    zValue = 1 - (1-conf) / 2
    upper = m + qnorm(zValue) * estimSd
    lower = m - qnorm(zValue) * estimSd
    return(c(m, estimSd, lower, upper))
  }

  ciDf = t(apply(estimDf, 2, CltCi))
  ciDf = data.frame(ciDf)
  colnames(ciDf) = c("mean", "estim sd", "lower", "upper")
  ciDf[ , "length"] = ciDf[ , "upper"] - ciDf[ , "lower"]
  return(ciDf)
}

BootstrapCi = function(dt, Estim, bsNum=500, conf=0.95) {

  q1 = (1 - conf) / 2
  q2 = 1 - q1

  n = nrow(dt)
  Bs = function(b) {
    samp = sample(1:n, replace=TRUE)
    dt2 = dt[samp, ]
    return(Estim(dt2))
  }


  estimList = lapply(X=1:bsNum, FUN=Bs)
  x0 = estimList[[1]]
  names = names(x0)

  estimDf = setNames(
      data.frame(matrix(ncol=length(x0), nrow=bsNum)),
      names)

  for (i in 1:bsNum) {
    estimDf[i, ] = estimList[[i]]
  }

  BsCi = function(x) {
    x = na.omit(x)
    m = mean(x)
    estimSd = sd(x)
    upper = quantile(x, q2)
    lower = quantile(x, q1)
    return(c(m, estimSd, lower, upper))
  }

  ciDf = t(apply(estimDf, 2, BsCi))
  ciDf = data.frame(ciDf)
  colnames(ciDf) = c("mean", "estim sd", "lower", "upper")
  ciDf[ , "length"] = ciDf[ , "upper"] - ciDf[ , "lower"]
  return(ciDf)
}

TestPartitionCi = function() {

  n = 10^4
  x1 = rnorm(n, mean=3, sd=10)
  x2 = rnorm(n, mean=5, sd=2)
  x3 = x1 + x2
  df = data.frame(x1=x1, x2=x2, x3=x3)
  dt = data.table(df)
  bucketCol = NULL
  bucketNum = 20

  Estim = function(dt) {colMeans(dt)}
  PartitionCi(
      dt=dt, Estim=Estim, bucketCol=NULL, bucketNum=bucketNum,
      method="jk", conf=0.95)

  PartitionCi(
      dt=dt, Estim=Estim, bucketCol=NULL, bucketNum=bucketNum,
      method="simple", conf=0.95)

  BootstrapCi(dt=dt, Estim=Estim, bsNum=1000, conf=0.95)

  Estim = function(dt) {mean(dt[[1]])}
  PartitionCi(
      dt=dt, Estim=Estim, bucketCol=NULL, bucketNum=bucketNum,
      method="jk", conf=0.95)

  PartitionCi(
      dt=dt, Estim=Estim, bucketCol=NULL, bucketNum=bucketNum,
      method="simple", conf=0.95)

  BootstrapCi(dt=dt, Estim=Estim, bsNum=500, conf=0.95)
}

## substituting multiple values
ReplaceStringMulti = function(x, values, subs) {

  for (i in 1:length(values)) {
    x = gsub(values[i], subs[i], x)
  }

  return(x)
}

## capitalizes all words in a sentence
CapWords = function(x, splitStr=" ") {
  s = strsplit(x, splitStr)[[1]]

  paste(toupper(substring(s, 1, 1)), substring(s, 2),
      sep="", collapse=splitStr)
}

TestCapWords = function() {

  CapWords("be free.") == "Be Free."
}

# Cartesian product of string vectors
StringCartesianProd = function(..., prefix="", sep="_") {

  #paste0(prefix, levels(interaction(..., sep=sep)))
  paste2 = function(...) {
    paste(..., sep=sep)
  }

  df = expand.grid(...)
  do.call(what=paste2, args=df)
}

## test for the above function
TestStringCartesianProd = function() {

  values = c("active_days_num", "activity_num")
  products = c("assist", "search", "watchFeat", "photos", "multi")
  periods = c("pre", "post")
  valueCols = StringCartesianProd(values, products, periods, sep="_")
}

## sorts data frames and data.tables
# R syntax for sorting is ineffective and not so great inside functions
# this function provides a user friendly approach
# cols: columns to be used for sorting, in order of their importance
# ascend: specifies if the order is ascending (TRUE) or not (FALSE)
# default for ascend is (TRUE, ..., TRUE)
SortDf = function(
    df, cols=NULL, ascend=NULL, printCommand=FALSE) {

  if (is.null(cols)) {
    cols = names(df)
  }

  if (min(cols %in% names(df)) < 1) {
    warning("some of your columns are not in df.")
    return(df)
  }

  if (is.null(ascend)) {
    ascend = rep(TRUE, length(cols))
  }

  commandStr = "order("

  for (i in 1:length(cols)) {
    dir = ascend[i]
    col = cols[i]
    if (dir) {
      commandStr = paste0(commandStr, " ", col)
    } else {
      commandStr = paste0(commandStr, " ", "-", col)
    }

    if (i == length(cols)) {
      commandStr = paste0(commandStr, ")")
    } else {
      commandStr = paste0(commandStr, ",")
    }
  }

  commandStr = paste0("df = df[with(df, ", commandStr,") , ]")

  if (printCommand) {
    print(commandStr)
  }

  eval(parse(text=commandStr))

  return(df)
}

TestSortDf = function() {

  n = 20

  df = data.frame(
      "first_name"=sample(c("John", "Omar", "Mo"), size=n, replace=TRUE),
      "family_name"=sample(c("Taylor", "Khayyam", "Asb"), size=n, replace=TRUE),
      "grade"=sample(1:10, size=n, replace=TRUE))

  # sort with defaults, nice and easy
  SortDf(df=df)

  # choose columns and the direction of sorting
  SortDf(
      df=df,
      cols=c("first_name", "family_name", "grade"),
      ascend=c(TRUE, TRUE, FALSE),
      printCommand=TRUE)

  # try same with data table object
  SortDf(
      df=data.table(df),
      cols=c("first_name", "family_name", "grade"),
      ascend=c(TRUE, TRUE, FALSE),
      printCommand=FALSE)
}

## this returns a function which calculates a relative err
# using norms
# this error function is symmetric with respect to its inputs position
# p denoted the power in L-p norm (p > 0)
SymRelErrFcn = function(p) {

  if (p <= 0) {
    warning("p has to be positive")
    return(NULL)
  }

  Func = function(x, y) {
    z = abs(x - y)
    err = 2 * z^p / (abs(x)^p + abs(y)^p)
    return(err)
  }

  return(Func)
}

TestSymRelErrFcn = function() {
  x = 3
  y = 5

  SymRelErrFcn(2)(x, y)
  SymRelErrFcn(1)(x, y)
}

## calculates diff between valueCols between two data frames
CalcErrDfPair = function(
    df1, df2, valueCols, Err, ErrAvgF=mean, sort=TRUE,
    idCols=NULL, checkMatch=TRUE) {

  if (nrow(df1) != nrow(df2)) {
    warning("length of the data frames is not the same.")
    return(NULL)
  }

  if (is.null(idCols)) {
    ## the common cols except for valueCols used for sorting and matching
    idCols = setdiff(intersect(colnames(df1), colnames(df2)), valueCols)
  }

  if (sort) {
    df1 = SortDf(df=df1, cols=idCols)
    df2 = SortDf(df=df2, cols=idCols)
  }

  if (checkMatch) {
    if (!identical(df1[ , idCols], df2[ , idCols])) {
      warning(paste(
          "id columns:",
          paste(idCols, collapse=" "),
          "are not matching in values."))

      return(NULL)
    }
  }

  errVec = NULL

  for (valueCol in valueCols) {
    err = ErrAvgF(Err(df1[ , valueCol], df2[ , valueCol]))
    errVec = c(errVec, err)
  }

  names(errVec) = valueCols

  return(errVec)
}

TestCalcErrDfPair = function() {

  n = 100
  x1 = sample(1:n, size=n)
  x2 = sample(1:n, size=n)

  df1 = data.frame(
      "x1"=x1,
      "x2"=x2,
      "y1"=2*x1 + rnorm(n),
      "y2"=2*x2 + rnorm(n))

  df2 = data.frame(
      "x1"=x1,
      "x2"=x2,
      "y1"=2*x1 + rnorm(n),
      "y2"=1*x2 + rnorm(n))

  df3 = df2[sample(1:n, n), ]

  Err = SymRelErrFcn(2)

  CalcErrDfPair(
      df1=df1, df2=df2, valueCols=c("y1", "y2"),
      Err=Err, ErrAvgF=mean, sort=TRUE,
      idCols=NULL, checkMatch=TRUE)

  CalcErrDfPair(
      df1, df3, valueCol=c("y1", "y2"), Err=Err, ErrAvgF=mean, sort=TRUE,
      idCols=NULL, checkMatch=TRUE)
}

## This function compares two frequency tables
# it returns a row_wise err which is then averaged across rows
# also returns a global err which is standardized by the total freq
# both metrics are symmetric
# note that this is not a distbn distance by default: set distbn_dist=TRUE
# this will compare frequencies by default
FreqTables_simpleDiff = function(
    tab1, tab2, AvgF=mean, distbn_dist=FALSE) {

  df1 = data.frame(tab1)
  colnames(df1) = c("var", "freq1")

  df2 = data.frame(tab2)
  colnames(df2) = c("var", "freq2")

  ## if we want a distbn distance we cal probabilities
  if (distbn_dist) {
    df1[ , "freq"] = df1[ , "freq"] / sum(df1[ , "freq"])
    df2[ , "freq"] = df2[ , "freq"] / sum(df2[ , "freq"])
  }

  compareDf = merge(df1, df2, on=colnames(df1), all=TRUE)
  compareDf[is.na(compareDf)] = 0
  compareDf[ , "err"] = abs(compareDf[ , "freq1"] - compareDf[ , "freq2"])

  denom_elementwise = (0.5*compareDf[ , "freq1"] + 0.5*compareDf[ , "freq2"])
  avg_elementwise_err = AvgF(compareDf[ , "err"] / denom_elementwise)

  total_freq = sum(compareDf[ , "freq1"]) + sum(compareDf[ , "freq2"])
  global_err = sum(compareDf[ , "err"]) / total_freq

  return(list(
      "avg_elementwise_err"=avg_elementwise_err,
      "global_err"=global_err))
}

## which value is the min
MinInd = function(x) {
  which(x == min(x))
}

# which row has the min value for col
MinIndDf = function(df, col) {

  x = df[ , col]
  ind = which(x == min(x))
  return(df[ind, , drop=FALSE])
}

## debugging R code
Example = function() {

  Func = function() {
    on.exit(traceback(1))
    G = function() {
      x = 1 + "a"
    }
    G()
  }

  Func()

  #traceback()
}

## for debugging within R
Debug = function(Func)  {
    on.exit(traceback(1))
    Func()
    #traceback()
}

## check for a library dependencies
# also tries to find out if those libs are installed by checking library(lib)
# if not installed, it tries to install them
# it reports un-installed ones and the unavailable ones for install
# Install is either install.packages or a custom install function
Check_andFix_dependencies = function(lib, Install) {

  library("tools")
  libs = package_dependencies(lib)[[1]]
  uninstalledLibs = NULL
  unavailLibs = NULL

  Func = function(lib) {
    suppressMessages(library(lib, character.only=TRUE))
    return(NULL)
  }

  for (lib in libs) {

    x = tryCatch(
        Func(lib),
        error=function(e) {lib})
    uninstalledLibs = c(uninstalledLibs, x)

  }

  Func = function(lib) {
    suppressMessages(Install(lib))
    return(NULL)
  }

  for (lib in uninstalledLibs) {

    x = tryCatch(
        Func(lib),
        error=function(e) {lib})
    unavailLibs = c(unavailLibs, x)
  }

  return(list(
      unavailLibs=unavailLibs,
      uninstalledLibs=uninstalledLibs))
}

## drops (multiple) ending vowels from a string
DropEndingVowels = function(s, minLength=2) {

  cond = TRUE
  while (cond && nchar(s) > minLength) {

    if (tolower(substr(s, nchar(s), nchar(s))) %in% c("a", "o", "e", "u", "i")) {
      s = substr(s, 1, nchar(s)-1)
    } else {
      cond = FALSE
    }
  }

  return(s)
}

TestDropEndingVowels = function() {

  s = "abbggaae"
  DropEndingVowels(s)

  s = "abbggaaeuzzai"
  DropEndingVowels(s)
}

## drops ending of words of specified characters
DropEndingChars = function(s, chars, minLength=2) {

  cond = TRUE
  while (cond && nchar(s) > minLength) {
    if (tolower(substr(s, nchar(s), nchar(s))) %in% chars) {
      s = substr(s, 1, nchar(s)-1)
    } else {
      cond = FALSE
    }
  }

  return(s)
}

TestDropEndingChars = function() {

  s = "abbggaaeuzz"
  DropEndingChars(s, chars=c("a", "z", "u"))

}

## abbreviates a string.
# first we abbreviate each word in a string (phrase)
# then we concat them back and abbreviate the whole phrase
AbbrString = function(
    s,
    wordLength=6,
    replaceStrings=c("/", "&", " and ", "-", "_", ",", ";"),
    sep="-",
    totalLength=NULL,
    wordNumLimit=NULL,
    dropEndingVowels=FALSE) {

  for (char in replaceStrings) {
    s = gsub(char, " ", s)
  }

  sVec = strsplit(s, " ")[[1]]
  sVec = substr(sVec, 1, wordLength)
  sVec = sVec[!sVec %in% c("", " ", "  ", "    ")]

  if (dropEndingVowels) {
    sVec = sapply(FUN=DropEndingVowels, X=sVec)
  }

  sVec = unique(sVec)

  if (!is.null(wordNumLimit)) {
    sVec = sVec[1:min(wordNumLimit, length(sVec))]
  }

  s = paste(sVec, collapse=sep)

  if (!is.null(totalLength)) {
    s = substr(s, 1, totalLength)
  }

  if (dropEndingVowels) {
    s = DropEndingVowels(s)
  }

  s = DropEndingChars(
    s=s,
    chars=c("/", "&", " and ", "-", sep, " ", ",", ";"))

  return(s)
}

TestAbbrString = function() {

  s = "aa_sasa_aann & jjabbbbbaa --- aaahhh"

  AbbrString(
      s,
      wordLength=6,
      replaceStrings=c("/", "&", " and ", "-", "_", ",", ";"),
      sep="-",
      totalLength=NULL,
      wordNumLimit=4)

  s = "get_request_rate"

  AbbrString(
      s,
      wordLength=6,
      replaceStrings=c("/", "&", " and ", "-", "_", ",", ";"),
      sep="-",
      totalLength=NULL,
      wordNumLimit=4)
}

AbbrStringVec = function(
    strings,
    wordLength=6,
    replaceStrings=c("/", "&", " and ", "-", "_", ",", ";"),
    sep="-",
    totalLength=NULL,
    wordNumLimit=NULL,
    dropEndingVowels=FALSE) {

  Abbr = function(s) {

    abbrStr = AbbrString(
        s=s,
        wordLength=wordLength,
        replaceStrings=replaceStrings,
        sep=sep,
        totalLength=totalLength,
        wordNumLimit=wordNumLimit,
        dropEndingVowels=dropEndingVowels)

    return(abbrStr)
  }

  abbrValues = sapply(FUN=Abbr, X=strings)

  return(abbrValues)

}

## functions to abbreviate various string columns
# of a data frame
AbbrStringCols = function(
    df,
    cols,
    newCols=NULL,
    wordLength=6,
    replaceStrings=c("/", "&", " and ", "-"),
    sep="-",
    totalLength=NULL,
    wordNumLimit=NULL,
    dropEndingVowels=FALSE) {

  values = unique(as.vector(as.matrix(df[ , cols])))
  print(values)

  Abbr = function(s) {

    abbrStr = AbbrString(
        s=s,
        wordLength=wordLength,
        replaceStrings=replaceStrings,
        sep=sep,
        totalLength=totalLength,
        wordNumLimit=wordNumLimit,
        dropEndingVowels=dropEndingVowels)

    return(abbrStr)
  }

  abbrValues = sapply(FUN=Abbr, X=values)
  print(abbrValues)

  mapDf = data.frame("value"=values, "abbr_values"=abbrValues)
  rownames(mapDf) = NULL

  df[ , cols] = plyr:::mapvalues(
      as.vector(as.matrix(df[ , cols])),
      from=values,
      to=abbrValues)

  if (!is.null(newCols)) {

    df[ , newCols] = df[ , cols]

  }

  return(list("df"=df, "mapDf"=mapDf))
}

TestAbbrStringCols = function() {

  df = data.frame(
      "col1"=c("life is beautiful", "i like mountains", "ok", "cool"),
      "col2"=c("life is beautiful", "life sucks indeed", "a", "b"))

  #AbbrStringCols(df, cols=["col"])
  res = AbbrStringCols(
    df=df,
    cols=c("col1", "col2"), totalLength=10, wordNumLimit=NULL)

  res[["df"]]

  res[["mapDf"]]
}


## find common string among a few
CommonString = function(strings) {

  Intersect = function (x, y) {
      y = as.vector(y)
      y[match(as.vector(x), y, 0L)]
  }

  commonStr = paste(Reduce(Intersect, strsplit(strings, NULL)), collapse="")
  return(commonStr)
}


TestCommonString = function() {

  strings =  c("a123a", "abcd123", "123uu123")
  CommonString(strings)
}


## sums the columns of a data table which satisfy a certain property
SumCols_viaPattern = function(dt, pattern) {

  dt = data.table(dt)
  cols = colnames(dt)[grepl(pattern, colnames(dt))]
  if (length(cols) == 0) {
    warning("No columns satisfied the given pattern. Returns NULL.")
    return(NULL)
  }

  commonStr = CommonString(cols)
  text = paste0(
      "dt2 = dt[ , ",
      commonStr,
      ":=",
      "(",
      paste0(cols, collapse="+"), "), ]")

  dt2 = eval(parse(
      text=text))

  return(list("dt"=dt2, "newCol"=commonStr))
}

TestSumCols_viaPattern = function() {

  x = matrix(1:100, 10, 5)
  df = data.frame(x)
  colnames(df) = c(
      "abc_lor1_final",
      "abc_ltx1_final",
      "abc_lkg1_final",
      "abc_kjg1_final",
      "abc_opt1_final")

  dt = data.table(df)
  pattern = "^abc.*final$"

  res = SumCols_viaPattern(dt=dt, pattern=pattern)

  res[["dt"]]
  res[["newCol"]]
}


SumCols_multiPatterns = function(dt, patterns) {

  newCols = NULL

  for (pattern in patterns) {

    dt = data.table(dt)
    cols = colnames(dt)[grepl(pattern, colnames(dt))]

    if (length(cols) == 0) {
      warning(paste0(
          "No columns satisfied the pattern: ",
          pattern,
          " Function does nothing for this pattern."))
    } else {

      commonStr = CommonString(cols)
      text = paste0(
          "dt2 = dt[ , ",
          commonStr,
          ":=",
          "(",
          paste0(cols, collapse="+"), "), ]")

      dt = eval(parse(
          text=text))

      newCols = c(newCols, commonStr)
    }

  }

  return(list("dt"=dt, "newCols"=newCols))
}

TestSumCols_multiPatterns = function() {

  x = matrix(1:100, 10, 5)
  df = data.frame(x)
  colnames(df) = c(
      "abc_lor1_final",
      "abc_ltx1_final",
      "abc_lkg1_final",
      "efg_kjg1_final",
      "efg_opt1_final")

  dt = data.table(df)
  patterns = c("^abc.*final$", "^efg*")

  res = SumCols_multiPatterns(dt=dt, patterns=patterns)

  res[["dt"]]
  res[["newCols"]]

}

## rewrite the print data.frame function to be same as data.table
# this is to avoids R from attempting to print all of the data frame
print.data.frame = function(df) {
  data.table:::print.data.table(data.table(df))
}

DichomVar = function(x, num=6) {

  x = na.omit(x)
  step = 1 / num
  #qs = quantile(x, seq(step, 1-step, step))
  qs = Quantile(x, seq(step, 1-step, step))
  qs = unique(qs)
  qs = c(-Inf, qs, Inf)

  Dichom = function(z) {
    cut(z, qs)
  }

  return(list(
    "var"=Dichom(x),
    "Dichom"=Dichom,
    "qs"=qs))

}

Add_dichomVar = function(dt, col, num=6) {

  x = Col(dt, col)

  res = DichomVar(x, num=num)
  Dichom = res[["Dichom"]]
  qs = res[["qs"]]

  AddDichom = function(dt) {
    x = Col(dt, col)
    dt[ , paste0(col, "_categ")] = Dichom(x)
    return(dt)
  }

  return(list(
      "dt"=AddDichom(dt),
      "qs"=qs,
      "Dichom"=Dichom,
      "AddDichom"=AddDichom,
      "newCol"=paste0(col, "_categ")))

}

TestAdd_dichomVar = function() {

  n = 100
  dt0 = data.table(
      x=1:n,
      y=2*(1:n) + rnorm(n))

  dt = dt0[sample(.N, n/2)]
  newDt = dt0[sample(.N, n/2)]
  res = Add_dichomVar(dt=dt, col="x", num=6)

  Dichom = res[["Dichom"]]
  AddDichom = res[["AddDichom"]]
  dt = res[["dt"]]
  newCol = res[["newCol"]]

  AddDichom(newDt)
}

## add multiple dichom columns
Add_dichomVarMulti = function(dt, cols, num=num) {

  dt = data.table(dt)

  fcnList = list()
  addFcnList = list()
  newCols = NULL
  for (col in cols) {

    res = Add_dichomVar(dt=dt, col=col, num=num)
    Dichom = res[["Dichom"]]
    AddDichom = res[["AddDichom"]]
    dt = res[["dt"]]
    newCol = res[["newCol"]]

    fcnList[[col]] = Dichom
    addFcnList[[col]] = AddDichom
    newCols = c(newCols, newCol)

  }

  return(list(
      "dt"=dt,
      "newCols"=newCols,
      "fcnList"=fcnList,
      addFcnList="addFcnList"
      ))
}


TestAdd_dichomVarMulti = function() {
  n = 100
  dt0 = data.table(
      x=1:n,
      y=2*(1:n) + rnorm(n))

  dt = dt0[sample(.N, n/2)]
  newDt = dt0[sample(.N, n/2)]
  res = Add_dichomVarMulti(dt=dt, cols=c("x", "y"), num=6)

  dt = res[["dt"]]
  newCols = res[["newCols"]]
}


## birth year to age
BirthYear_toAgeCateg = function(x, currentYear=NULL) {

  if (is.null(currentYear)) {
    currentYear = as.integer(format(Sys.Date(), "%Y"))
  }

  if (is.na(x) | is.null(x) | x == "" | x == 0) {
    return("other")
  }

  x = as.numeric(x)
  age = currentYear - x

  if (age <= 17) {
    return("<18")
  }

  if (age <= 25) {
    return("18-25")
  }

  if (age <= 35) {
    return("26-35")
  }

  if (age <= 50) {
    return("36-50")
  }

  return(">51")
}


# checks what percentage of each column of df satisfy a given property
# also reports for which percent of rows all values of the row satisfy
# and for which rows any of the values satisfy
# property is input using a function: Func
ReportPropertyDf = function(
    df,
    cols=NULL,
    Func,
    propertyName="prop",
    removeNA=TRUE) {

  if (!is.null(cols)) {
    cols = colnames(df)
    df = SubsetCols(df, keepCols=cols)
  }

  n = nrow(df)
  res = 100 * apply(
      X=df,
      MARGIN=2,
      FUN=function(x)sum(Func(x), na.rm=removeNA)) / n

  outDf = data.frame(res)
  colnames(outDf) = paste0(propertyName, "_perc")
  outDf[ , "colname"] = names(res)
  outDf = outDf[ , c("colname", paste0(propertyName, "_perc"))]

  colNum = ncol(df)
  any_prop = sum(
      apply(
          X=df,
          MARGIN=1,
          FUN=function(x)sum(Func(x), na.rm=removeNA)) > 0)

  all_prop = sum(
      apply(
          X=df,
          MARGIN=1,
          FUN=function(x)sum(Func(x), na.rm=removeNA)) == colNum)

  outDf[nrow(outDf) + 1, ] = c(NA, NA)
  outDf[nrow(outDf), 1] = paste0("any_", propertyName, "_perc")
  outDf[nrow(outDf), 2] = 100 * any_prop / n
  outDf[nrow(outDf) + 1, ] = c(NA, NA)
  outDf[nrow(outDf), 1] = paste0("all_", propertyName, "_perc")
  outDf[nrow(outDf), 2] = 100 * all_prop / n

  return(outDf)

}


TestReportPropertyDf = function() {

  df = data.frame(
      x=c(rep(1, 10), NA),
      y=c(NA, 1:10),
      z=c(1:10, NA),
      u=c(NA, NA, 1:9))

  ReportPropertyDf(
      df=df,
      cols=c("x", "y", "z"),
      Func=is.na,
      propertyName="missing")

  ReportPropertyDf(
      df=df,
      cols=c("x", "y", "z"),
      Func=function(x) {x == 2},
      propertyName="is_two")

}

## reports NAs for a data frame
# for each column it reports the perc missing
# also it reports the per of rows with all missing or some missing
# TODO (Reza Hosseini): add correlation matrix for missing patterns
ReportNA = function(df, cols=NULL) {

  outDf = ReportPropertyDf(
      df=df,
      cols=cols,
      Func=is.na,
      propertyName="missing")

  return(outDf)
}

TestReportNA = function() {
  df = data.frame(
      x=c(1:10, NA),
      y=c(NA, 1:10),
      z=c(1:10, NA),
      u=c(NA, NA, 1:9))

  ReportNA(df, cols=c("x", "y"))
}


# works with data.table and data.frame
ReplaceNA = function(df, cols=NULL, replaceValue=0) {

  if (is.null(cols)) {
    cols = colnames(df)
  }

  if ("data.table" %in% class(df)) {
    for (col in cols) set(df, which(is.na(df[[col]])), col, replaceValue)
    return(df)
  }

  df[ , cols][is.na(df[ , cols])] = replaceValue
  return(df)
}

TestReplaceNA = function() {

  df = data.frame(
    x=c(1:10, NA),
    y=c(NA, 1:10),
    z=c(1:10, NA),
    u=c(NA, NA, 1:9))

  ReplaceNA(df)
  ReplaceNA(data.frame(df))
  ReplaceNA(df, cols="x")

}

## balancing the sample sizes (in terms of number of items)
# we assign the minimum available sample size to all slices
# this is done by defining a new column: which is isBalancedSample
# if you only like to do partial balancing on some slice Values
# specify those slice values in sliceCombinValues_toBalance
BalanceSampleSize = function(
    df,
    sliceCols,
    itemCols=NULL,
    sliceCombinValues_toBalance=NULL) {

  if (is.null(itemCols)) {
    itemCols = c("dummy_item")
    df[ , "dummy_item"] = 1:nrow(df)}

  df = Concat_stringColsDf(
    df=df,
    cols=itemCols,
    colName="item_combin",
    sepStr="-")

  df = Concat_stringColsDf(
    df=df,
    cols=sliceCols,
    colName="slice_combin",
    sepStr="-")

  itemColsStr = paste(itemCols, collapse="_")
  sliceColsStr = paste(sliceCols, collapse="_")

  df2 = unique(df[ , c("item_combin", "slice_combin")])
  df3 = df2[order(df2[ , "slice_combin"]), ]
  dt3 = data.table(df3)

  dfItemCount_perSlice = data.frame(dt3[ , .N, by="slice_combin"])
  names(dfItemCount_perSlice)[names(dfItemCount_perSlice) == "N"] = "item_combin_count"

  dfItemCount_perSlice$slice_item_index =  lapply(
      X=as.list(dfItemCount_perSlice[ , "item_combin_count"]),
      FUN=function(x){list(1:x)})

  if (is.null(sliceCombinValues_toBalance)) {
    minSs = min(dfItemCount_perSlice[ , "item_combin_count"])
    # if there is only once slice remaining, we assign False to all
    if (nrow(dfItemCount_perSlice) < 2){
      minSs = -Inf
    }
  }  else {
    df0 = dfItemCount_perSlice[
      dfItemCount_perSlice[ , "slice_combin"] %in% sliceCombinValues_toBalance, ]
    minSs =  min(df0[ , "item_combin_count"])
    # if there is only once slice remaining, we assign False to all
    if (nrow(df0) < 2){
      minSs = -Inf
    }
  }

  dfItemSliceIndex = Flatten_repField(
      df=dfItemCount_perSlice,
      listCol="slice_item_index")

  dfItemSliceIndex = SubsetCols(
      df=dfItemSliceIndex,
      dropCols="item_combin_count")

  colName = paste0(sliceColsStr, ".", itemColsStr, "_index")
  boolColName = paste0("balanced_", sliceColsStr, "__", itemColsStr)

  setnames(x=dfItemSliceIndex, old="slice_item_index", new=colName)

  if (is.null(sliceCombinValues_toBalance)) {
    dfItemSliceIndex[boolColName] = dfItemSliceIndex[colName] <= minSs
  } else {
    dfItemSliceIndex[ , boolColName] = (
        (dfItemSliceIndex[ , colName] <= minSs) |
        !dfItemSliceIndex[ , "slice_combin"] %in% sliceCombinValues_toBalance)
  }

  df3[ , colName] = dfItemSliceIndex[ , colName]
  df3[ , boolColName] = dfItemSliceIndex[ , boolColName]
  fullDf = merge(df, df3, all.x=TRUE, by=c("item_combin", "slice_combin"))

  df0 = fullDf[ , c(sliceCols, boolColName, "item_combin")]
  dt0 = data.table(df0)
  infoDf = dt0[ , .(item_combin_count=length(unique(item_combin))),
               by=c(sliceCols, boolColName)]

  fullDf = SubsetCols(
      df=fullDf,
      dropCols=c("item_combin", "slice_combin"))
  subDf = fullDf[fullDf[ , boolColName], ]

  subDf = SubsetCols(
      df=subDf,
      dropCols=c(colName, boolColName))

  return(list("fullDf"=fullDf, "subDf"=subDf, "infoDf"=infoDf))
}

TestBalanceSampleSize = function() {

  n = 100
  df = data.frame(
      "user_id"=1:n,
      "country"=sample(c("us", "jp"), n, replace=TRUE),
      "date"=sample(c("1/1", "1/2", "1/3"), n, replace=TRUE))

  Mark(df[1:2, ])

  res = BalanceSampleSize(
      df=df,
      sliceCols=c("country"),
      itemCols=c("user_id"),
      sliceCombinValues_toBalance=NULL)

  Mark(res["infoDf"])

  ## partial balancing
  res = BalanceSampleSize(
      df=df,
      sliceCols="country",
      itemCols=c("user_id", "date"),
      sliceCombinValues_toBalance=c("JP", "FR"))

  Mark(res["infoDf"])
}


# This will make sure that the sample size is the same
# for each (multi-dimensional) value of "wrt_cols" across
# slice_cols. For example for if wrt_cols = [country], slice_cols = [expt_id]
# for Japan we will have same number of
# units on base and test arms eg 2 and 2
# and for US we will have same number eg 3 and 3.
# TODO: Reza Hosseini resolve BUG: if RU has 3 items on base and no items on
# test. RU base will be kept at 3. Maybe RU has to be dropped.
BalanceSampleSize_wrtCols = function(
    df,
    sliceCols,
    wrtCols,
    itemCols=NULL,
    sliceCombinValues_toBalance=NULL) {

  df = Concat_stringColsDf(
    df=df,
    cols=wrtCols,
    colName='wrt_combin',
    sepStr='-')

  Func = function(group) {

    df0 = df[df["wrt_combin"] == group, ]
    res = BalanceSampleSize(
      df=df0,
      sliceCols=sliceCols,
      itemCols=itemCols,
      sliceCombinValues_toBalance=sliceCombinValues_toBalance)

    return(res[['subDf']])}

  groups = unique(df[ , "wrt_combin"])

  subDf = do.call(what=rbind, args=lapply(FUN=Func, X=groups))

  subDf = SubsetCols(subDf, keepCols=colnames(df))

  return(list("subDf"=subDf))
}

TestBalanceSampleSize_wrtCols = function() {

  n = 20
  df = data.frame(
      "treat"=c(rep(0, n), rep(1, n)),
      "subclass"=sample(1:5, 2*n, replace=TRUE))

  sliceCols = "treat"
  wrtCols = "subclass"

  res = BalanceSampleSize_wrtCols(
      df=df,
      sliceCols=sliceCols,
      wrtCols=wrtCols,
      itemCols=NULL,
      sliceCombinValues_toBalance=NULL)

  subDf = res[["subDf"]]

  dt = data.table(df)
  subDt = data.table(subDf)

  SortDf(dt[ , .N, c(sliceCols, wrtCols)], "subclass")
  SortDf(subDt[ , .N, c(sliceCols, wrtCols)], "subclass")

}


Compact_condDist = function(df, groupCols, valueCol, quantileNum=40) {
  # create a conditional distbn
  dt = data.table(df)
  AggFunc = function(x, num=quantileNum) {
    qs = quantile(x, seq(0, 1, 1/num), na.rm=TRUE)
    return(as.list(qs))
  }

  dt = dt[ , c(groupCols, valueCol), with=FALSE]
  distDt = dt[ , AggFunc(value), by=groupCols]

  return(distDt)

}

# test the content of this function
TestCompact_condDist = function() {

  n = 1000
  df = data.frame(
      x=sample(paste0("label", 1:100), n, replace=TRUE),
      gender=sample(c("male", "female"), n, replace=TRUE),
      value=rnorm(n))

  qDt = Compact_condDist(
      df=df,
      groupCols=c("x", "gender"),
      valueCol="value",
      quantileNum=40)

}
Reza1317/funcly documentation built on Feb. 5, 2020, 4:06 a.m.