#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# author: Reza Hosseini
## helper functions for R data analysis
#' @import data.table
Mark = function(x, text="") {
str = paste0(
"\n *** object class: ",
paste(as.character(class(data.table())), collapse=", "))
if (text != "") {
str = paste0(str, "; message: ", text)
}
cat(str, "\n")
print(x)
}
TestMark = function() {
x = 2
Mark(x, text="this is the value of x")
}
# print a vector vertically, one element per line
Pr = function(x) {
  for (i in seq_along(x)) {
    print(x[i])
  }
}
## print columns of data frame in readable format
PrCols = function(df) {
Pr(colnames(df))
}
## some basic info about data.frame
DescDf = function(df) {
  print(paste0(
      "dimensions: row_num: ", dim(df)[1], "; col_num: ", dim(df)[2]))
#print("sample data")
#print(df[1, ])
for (col in colnames(df)) {
x = df[[col]]
print(paste0(col, ": ", class(x)))
tab = table(x)
k = min(length(tab), 8)
freqDf = data.frame(tab)[1:k, ]
print(freqDf)
}
}
# wait for key to continue
Pause = function() {
cat ("Press [enter] to continue")
line = readline()
}
## checks if a library is installed
IsInstalled = function(pkg) {
is.element(pkg, installed.packages()[ , 1])
}
## installs a package if it's not already installed
InstallIf = function(pkg, Func=install.packages, ...) {
if (!IsInstalled(pkg)) {
Func(pkg, ...)
}
}
## it installs a library if not installed and it loads it regardless
InstallLoad = function(pkg) {
InstallIf(pkg)
library(pkg, character.only=TRUE)
}
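## a quick usage sketch for the install helpers; assumes CRAN access
# (packages are only installed when missing)
TestInstallLoad = function() {
  IsInstalled("data.table")
  InstallLoad("data.table")
  InstallLoadMulti(c("data.table", "tools"))
}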
## same as above but does it for a set of packages
InstallLoadMulti = function(pkgs) {
  for (pkg in pkgs) {
    InstallIf(
        pkg
        #, repos=c("http://rstudio.org/_packages", "http://cran.rstudio.com")
        #, dependencies=TRUE
    )
    library(pkg, character.only=TRUE)
  }
}
## purpose is to release memory by "removing" objects
# this function assigns NULL to the objects passed by their name in global env.
# then it collects the garbage twice (deliberately)
# setting to NULL is done since after "rm" and "gc"
# sometimes R does not release memory
Nullify = function(objectNames) {
for (objectName in objectNames) {
eval(parse(text=paste(objectName, "<<- NULL")))
gc()
gc()
}
}
TestNullify = function() {
x = rnorm(10^8)
y = x + 1
z = x + 2
## first nullify the objects
Nullify(c("x", "y", "z"))
## then remove them if you like,
# although they don't take space
rm("x", "y", "z")
## Nullify seems to work inside a function as well
x = rnorm(10^8)
(function()Nullify("x"))()
  ## this will be TRUE
  is.null(x)
## Nullify seems to release memory after setting a local variable to NULL
# as well
}
## memory management
LsObjects = function(
pos=1,
pattern,
order.by,
decreasing=FALSE,
head=FALSE,
n=5) {
napply = function(names, fn) sapply(names, function(x)
fn(get(x, pos = pos)))
names = ls(pos = pos, pattern = pattern)
obj.class = napply(names, function(x) as.character(class(x))[1])
obj.mode = napply(names, mode)
obj.type = ifelse(is.na(obj.class), obj.mode, obj.class)
obj.prettysize = napply(
names,
function(x) {
capture.output(format(utils::object.size(x), units="auto"))}
)
obj.size = napply(names, object.size)
obj.dim = t(napply(names, function(x)
as.numeric(dim(x))[1:2]))
vec = is.na(obj.dim)[, 1] & (obj.type != "function")
obj.dim[vec, 1] = napply(names, length)[vec]
out = data.frame(obj.type, obj.size, obj.prettysize, obj.dim)
names(out) = c("Type", "Size", "PrettySize", "Rows", "Columns")
if (!missing(order.by))
out = out[order(out[[order.by]], decreasing=decreasing), ]
if (head)
out = head(out, n)
out
}
# shorthand
ReportMemoryUsage = function(..., n=10) {
LsObjects(..., order.by="Size", decreasing=TRUE, head=TRUE, n=n)
}
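## a small usage sketch for the memory helpers; creates two objects in the
# global environment, lists the largest ones, then releases them
TestLsObjects = function() {
  bigVec <<- rnorm(10^6)
  smallVec <<- rnorm(10)
  print(LsObjects(order.by="Size", decreasing=TRUE))
  print(ReportMemoryUsage(n=5))
  Nullify(c("bigVec", "smallVec"))
}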
ReportObjectSize = function(x, name="") {
out = paste(
"size of", name, "is", round(as.numeric(object.size(x) / 10^6), 3), "MB")
return(out)
}
ReportObjectSize_true = function(x, name="") {
  size = length(serialize(x, NULL)) / 10^6
out = paste(
"size of", name, "is", round(size, 3), "MB")
return(out)
}
### git functions:
SourceHttps = function(url, ...) {
# load package
require(RCurl)
# parse and evaluate each .R script
sapply(
c(url, ...),
function(u) {
eval(parse(text = getURL(u, followlocation = TRUE,
cainfo = system.file("CurlSSL", "cacert.pem", package = "RCurl"))),
envir = .GlobalEnv)
}
)
}
# Example
TestSourceHttps = function() {
SourceHttps("https://raw.github.com/tonybreyal/Blog-Reference-Functions/master/R/bingSearchXScraper/bingSearchXScraper.R",
"https://raw.github.com/tonybreyal/Blog-Reference-Functions/master/R/htmlToText/htmlToText.R")
}
## Reading and Writing Files
# open the files in a dir; if fileList is not specified, we try to open all
OpenDataFiles = function(
path,
fileList=NULL,
ReadF=read.csv,
colClasses=NA,
patterns=NULL,
Src=function(){},
parallel=FALSE,
parallel_outfile="") {
if (is.null(fileList)) {
fileList = list.files(path)
}
if (!is.null(patterns)) {
fileListInd = do.call(
intersect,
lapply(
X=patterns,
FUN=function(p){grep(pattern=p, x=fileList)}))
fileList = fileList[fileListInd]
}
k = length(fileList)
if (k == 0) {
warning("no files in path or no files followed the patterns specified.")
return(NULL)
}
## read one file
Func = function(i) {
Src()
fn0 = fileList[[i]]
print(fn0)
fn = paste(path, fn0, sep="")
fn = file(fn)
df = ReadF(fn, colClasses=colClasses)
return(df)
}
if (!parallel) {
dfList = lapply(X=1:k, FUN=Func)
} else {
suppressMessages(library("parallel"))
closeAllConnections()
    no_cores = max(1, detectCores() - 3)
no_cores = min(no_cores, k + 1)
Mark(no_cores, "no_cores")
# Initiate cluster
cl = makeCluster(no_cores, outfile=parallel_outfile)
clusterExport(
cl=cl,
list(
"fileList", "path", "ReadF", "Src"),
envir=environment())
dfList = parLapply(cl=cl, X=1:k, fun=Func)
stopCluster(cl)
closeAllConnections()
}
names(dfList) = fileList
return(dfList)
}
# (old function, check) Read data directory
ReadDataDir = function(
dir,
Read=read.csv,
ProcessDf=NULL,
num=NULL) {
fileNames = list.files(dir, pattern="*.csv")
if (!is.null(num)) {
k = min(num, length(fileNames))
fileNames = fileNames[1:k]
}
fileNames2 = paste0(dir, '/', fileNames)
G = Read
if (!is.null(ProcessDf)) {
G = function(x, ...) {
ProcessDf(Read(x, ...))
}
}
dfList = lapply(X=fileNames2, FUN=G)
return(dfList)
}
## write a dataframe to a file with separators;
# can be used for qwiki for example.
WriteDataWithSep = function(
fn, path=NULL, data, dataSep="|", headerSep="||") {
options("scipen"=100, "digits"=4)
fn = paste(path, fn, sep="")
sink(fn)
n = dim(data)[1]
l = dim(data)[2]
  header = names(data)
cat(headerSep)
for (j in 1:l) {
cat(header[j])
cat(headerSep)
}
cat("\n")
for (i in 1:n) {
cat(dataSep)
for (j in 1:l) {
el = as.character(data[i, j])
cat(el)
cat(dataSep)
}
cat("\n")
}
sink()
}
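## a minimal usage sketch for WriteDataWithSep; writes a small table with
# wiki-style separators to a temp file and prints it back
TestWriteDataWithSep = function() {
  df = data.frame(x=1:3, y=c("a", "b", "c"))
  path = paste0(tempdir(), "/")
  WriteDataWithSep(fn="table.txt", path=path, data=df)
  cat(readLines(paste0(path, "table.txt")), sep="\n")
}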
## removes leading and trailing whitespace
TrimTrailing = function(x) {
gsub("^\\s+|\\s+$", "", x)
}
## flattening a column with a repeated field
# below we have another version of the same function
# which is most likely faster for large data
Flatten_repField_v1 = function(df, listCol, sep=NULL) {
  if (!is.null(sep)) {
    s = strsplit(as.character(df[ , listCol]), split=sep)
  } else {
    s = df[ , listCol]
  }
cols = colnames(df)
cols2 = cols[cols != listCol]
outDf = data.frame(listCol=unlist(s))
colnames(outDf) = listCol
for (col in cols2) {
outDf[ , col] = rep(df[ , col], sapply(s, length))
}
return(outDf)
}
TestFlatten_repField_v1 = function() {
df = data.frame(list("var1"=c("a,b,c", "d,e,f"), "var2"=1:2, "var3"=3:4))
print(df)
Flatten_repField_v1(df=df, listCol="var1", sep=",")
df = data.frame(list("var1"=I(list(1:3, 4:6)), "var2"=1:2, "var3"=3:4))
print(df)
Flatten_repField_v1(df=df, listCol="var1", sep=NULL)
}
## flatten a column (listCol) of df with multiple values
# the column elements could be lists
# or could be separated by sep e.g. comma
# this is faster than v1, but more complex
Flatten_repField = function(df, listCol, sep=NULL) {
if (!is.null(sep)) {
Func = function(x) {
l = as.vector(strsplit(x, sep)[[1]])
return(l)
}
df$newListCol = lapply(X=as.character(df[ , listCol]), FUN=Func)
} else {
df$newListCol = df[ , listCol]
df = SubsetCols(df=df, dropCols=listCol)
}
cols = names(df)[names(df) != "newListCol"]
dt = data.table(df)[ , unlist(newListCol), by=cols]
df = data.frame(dt)
df[ , listCol] = df[ , "V1"]
df = df[ , ! names(df) %in% "V1"]
return(df)
}
TestFlatten_repField = function() {
df = data.frame(list("var1"=c("a;b;c", "d;e;f"), "var2"=1:2, "var3"=3:4))
print(df)
Flatten_repField(df=df, listCol="var1", sep=";")
df = data.frame(list("var1"=I(list(1:3, 4:6)), "var2"=1:2, "var3"=3:4))
print(df)
Flatten_repField(df=df, listCol="var1", sep=NULL)
}
## creating a single string column using multiple columns (cols)
# and adding that to the data frame
Concat_stringColsDf = function(df, cols, colName=NULL, sepStr="-") {
x = ""
if (is.null(colName)){
colName = paste(cols, collapse=sepStr)
}
  for (i in 1:length(cols)) {
    col = cols[i]
    x = paste0(x, as.character(df[ , col]))
    if (i < length(cols)) {
      x = paste0(x, sepStr)
    }
  }
df[ , colName] = x
return(df)
}
TestConcat_stringColsDf = function() {
df = data.frame(list("a"=1:3, "b"=c("rr", "gg", "gg"), "c"=1:3))
Concat_stringColsDf(df=df, cols=c("a", "b", "c"), colName=NULL, sepStr="-")
}
## left and right quantiles
# these quantiles have many desirable properties compared to
# regular quantiles:
# (1) the result has been observed
# (2) it's symmetric
# (3) it's truly equivariant
# See: Statistical models for agroclimate risk analysis (2010)
# Reza Hosseini, UBC.
# https://open.library.ubc.ca/cIRcle/collections/ubctheses/24/items/1.0070885
Quantile = function(x, probs, direction="left") {
x = na.omit(x)
x = sort(x)
n = length(x)
  LeftQuantileSorted = function(x, p) {
    npi = (n*p) %/% 1
    if (n*p > npi) {
      lqv = x[npi + 1]
    } else if (n*p == npi && p != 0) {
      lqv = x[npi]
    } else if (p == 0) {
      lqv = -Inf
    }
    return(lqv)
  }
  RightQuantileSorted = function(x, p) {
    npi = (n*p) %/% 1
    if (n*p > npi) {
      rqv = x[npi + 1]
    } else if (n*p == npi && p != 1) {
      rqv = x[npi + 1]
    } else if (p == 1) {
      rqv = Inf
    }
    return(rqv)
  }
G = LeftQuantileSorted
if (direction != "left") {
G = RightQuantileSorted
}
Func = function(p) {
G(x=x, p=p)
}
qs = sapply(FUN=Func, X=probs)
return(qs)
}
TestQuantile = function() {
x = c(1, 2, 3)
probs = 0:10 / 10.0
Quantile(x, probs, direction="left")
Quantile(x, probs, direction="right")
}
## subset columns of a data frame / data table in a clear way
SubsetCols = function(df, keepCols=NULL, dropCols=NULL) {
cols = colnames(df)
if (!is.null(keepCols)) {
cols = cols[cols %in% keepCols]
}
if (!is.null(dropCols)) {
cols = cols[!(cols %in% dropCols)]
}
if ("data.table" %in% class(df)) {
return(df[ , cols, with=FALSE])
}
return(df[ , cols, drop=FALSE])
}
TestSubsetCols = function() {
dt0 = data.table(x=1:10, y=1:10, z=1:10)
colnames(dt0)
SubsetCols(df=dt0, keepCols=NULL, dropCols="x")
funcly::SubsetCols(df=dt0, keepCols=NULL, dropCols="x")
cols = c("x", "y")
dt0[ , cols, with=FALSE]
}
## get a column values from data.table without failure
# failure can happen via "get" if the passed parameter for column name
# already is the same as the column name in the data table
# see example in test function for issue with get
Col = function(dt, col) {
dt[[col]]
}
TestCol = function(){
df = data.frame("x"=1:10, "y"=1:10)
dt = data.table(df)
dt[ , "x"] # this will be a data frame still
dt[ , x] # is a vector as we desire
col = "x"
dt[ , get(col)] # that does work
x = "x" # this is where "get" gets confused
dt[ , get(x)] # returns error: Error in get(x) : invalid first argument
# on the other hand, all of the below work
Col(dt, col)
Col(dt, x)
}
## wraps a function Fcn to handle NAs: NAs are dropped and Fcn is applied
# only when at least a proportion p of the values is non-missing;
# otherwise NA is returned
HandleNaPropFcn = function(Fcn, p=0.9) {
Func = function(x) {
l = length(x)
k = sum(!is.na(x))
if ((k/l) > p) {
x = na.omit(x)
return(Fcn(x))
}
return(NA)
}
return(Func)
}
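## a quick usage sketch for HandleNaPropFcn: with the default p=0.9 the
# wrapped function returns NA when more than 10% of the values are missing
TestHandleNaPropFcn = function() {
  MeanNa = HandleNaPropFcn(mean)
  MeanNa(c(1, 2, NA, NA))  # NA: only 50% of the values are non-missing
  MeanNa(c(1:10, NA))  # about 91% non-missing, returns mean of 1:10
}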
## simple aggregation with data.table
# gbCols are group by columns
# valueCols are the value columns we want to aggregate
# cols is c(gbCols, valueCols) so only two out of three are needed
# AggFunc is the aggregate function
DtSimpleAgg = function(
dt,
gbCols=NULL,
valueCols=NULL,
cols=NULL,
AggFunc=sum) {
library(data.table)
## this aggregates multiple columns with the same function
# this first subsets the data to the cols we need
# then it aggregates with F
# if we are not given the valueCols or all cols we need
# we assume we need all cols in the dt
if (is.null(cols) & (is.null(valueCols) | is.null(gbCols))) {
cols = names(dt)
}
if (is.null(cols)) {
cols = c(gbCols, valueCols)
}
if (is.null(gbCols)) {
gbCols = setdiff(cols, valueCols)
}
dt = dt[ , cols, with=FALSE]
outDt = dt[ , lapply(.SD, AggFunc), by=gbCols]
return(outDt)
}
TestDtSimpleAgg = function() {
x = rnorm(100000)
y = rnorm(100000)
dt = data.table(x=x, y=y, x1=round(x), y1=round(y)+1)
DtSimpleAgg(
dt=dt,
gbCols=c("x1", "y1"),
valueCols=c("x", "y"),
AggFunc=mean)
funcly::DtSimpleAgg(
dt=dt,
gbCols=c("x1", "y1"),
valueCols=c("x", "y"),
AggFunc=mean)
}
## calculates bootstrap confidence intervals for the win/loss ratio
# input is a binary vector
# the idea is to use the bootstrap
# for any bootstrap sample which degenerates
# because we have all ones or all zeros, we add (0, 1) to the vector
# we also do that to the original vector!
# if we don't, confidence intervals for vectors
# such as (0, 0) will collapse to a single point (similarly for (1, 1))
Ci_forWLRatio = function(x) {
flag = 'None'
if (sum(x) == 0) {
x = c(x, 1, 0)
flag = 'Zero'
}
if (sum(1-x) == 0) {
x = c(x, 0, 1)
flag = 'Inf'
}
Bootst = function(data, F, num=1000) {
n = dim(data)[1]
G = function(i) {
samp = sample(1:n, n, replace=TRUE)
data2 = data[samp, , drop=FALSE]
F(data2)
}
ind = as.list(1:num)
res = lapply(X=ind, FUN=G)
    res = unlist(res)
return(res)
}
data = data.frame(x)
WL = function(data) {
y = data[ , 1]
if (sum(y) == 0) {y = c(y, 1, 0)}
if (sum(1-y) == 0) {y = c(y, 0, 1)}
out = sum(y) / sum(1-y)
return(out)
}
res = Bootst(data=data, F=WL, num=1000)
qVec = quantile(res, c(0.025, 0.975))
## we adjust the extremes of the interval in the degenerate case
if (flag == 'Inf') {
qVec[2] = Inf
}
if (flag == 'Zero') {
qVec[1] = 0
}
return(qVec)
}
TestCi_forWLRatio = function() {
Ci_forWLRatio(c(rep(1, 7)))
Ci_forWLRatio(c(rep(1, 6), 0, 0, 0))
Ci_forWLRatio(c(0, 0, 0))
Ci_forWLRatio(c(1, 0, 0))
Ci_forWLRatio(c(1, 1, 1))
}
## calculates CLT confidence interval
CltCi = function(x, p=0.95) {
muHat = mean(x)
error = qnorm(1 - (1-p)/2) * sd(x)/sqrt(length(x))
upper = muHat + error
lower = muHat - error
return(list("muHat"=muHat, "error"=error, "upper"=upper, "lower"=lower))
}
## calculates CIs for multiple columns in a df and returns a df
CltCiDf = function(df, cols, p=0.95) {
  Func = function(col) {
    x = df[[col]]
    return(CltCi(x, p=p))
  }
  res = lapply(cols, FUN=Func)
names(res) = cols
outDf = data.frame(matrix(unlist(res), nrow=4, byrow=TRUE))
outDf[ , "metric"] = cols
names(outDf) = c("muHat", "error", "upper", "lower", "metric")
outDf = outDf[ , c("metric", "muHat", "error", "upper", "lower")]
return(outDf)
}
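## a quick usage sketch for CltCi and CltCiDf on simulated data
TestCltCi = function() {
  x = rnorm(1000, mean=2)
  print(CltCi(x, p=0.95))
  df = data.frame("u"=rnorm(1000, mean=1), "v"=rnorm(1000, mean=-1))
  CltCiDf(df=df, cols=c("u", "v"), p=0.95)
}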
## calculates relative risk
RelativeRiskCi = function(a1, n1, a2, n2) {
p1 = a1/n1
p2 = a2/n2
if (p2 == 0) {
print("the probability in the denom is zero, infinite risk!")
return()
}
risk = p1/p2
logRisk = log(risk)
se = sqrt(1/a1 + 1/a2 - 1/n1 - 1/n2)
logRiskUpper = logRisk + 1.96*se
logRiskLower = logRisk - 1.96*se
riskUpper = exp(logRiskUpper)
riskLower = exp(logRiskLower)
return(list(
"risk"=risk,
"riskLower"=riskLower,
"riskUpper"=riskUpper,
"logRisk"=logRisk,
"logRiskUpper"=logRiskUpper,
"logRiskLower"=logRiskLower,
"logScaleError"=1.96*se
))
}
## calculates an upper bound/conservative CI for
# relative risk when the sample sizes are missing
# but their relative size is known
# e.g. this is true for experiment mods
RelativeRiskCi_approx = function(a1, a2, n2_n1_ratio=1) {
risk = a1/a2 * n2_n1_ratio
logRisk = log(risk)
se = sqrt(1/a1 + 1/a2)
logRiskUpper = logRisk + 1.96*se
logRiskLower = logRisk - 1.96*se
riskUpper = exp(logRiskUpper)
riskLower = exp(logRiskLower)
return(list(
"risk"=risk,
"riskLower"=riskLower,
"riskUpper"=riskUpper,
"logRisk"=logRisk,
"logRiskUpper"=logRiskUpper,
"logRiskLower"=logRiskLower,
"logScaleError"=1.96*se
))
}
TestRelativeRiskCi_approx = function() {
  a1 = 30
  Func = function(n1) {
    e1 = RelativeRiskCi(a1=a1, n1=n1, a2=2*a1, n2=3*n1)[["logScaleError"]]
    e2 = RelativeRiskCi_approx(a1=a1, a2=2*a1, n2_n1_ratio=3)[["logScaleError"]]
    return(c(e1, e2))
  }
  grid = (a1 + 1):200
res = lapply(grid, FUN=Func)
outDf = data.frame(matrix(unlist(res), nrow=length(grid), byrow=TRUE))
plot(
grid, outDf[ , 1], ylim=c(0, 2*outDf[1, 2]),
col="blue", ylab="CI error in log risk scale", xlab="n1")
abline(h=outDf[1, 2], col="red")
abline(v=2*a1, col="grey")
text(x=2*a1, y=outDf[1, 2]/2, labels="2*a1")
text(x=grid[length(grid)]-5, y=outDf[1, 2], labels="approx")
}
# remap low freq labels to a new label in data
# this is useful to avoid model breakage
# this also remaps NAs to the newLabel
# labelsNumMax decides what's the max number of labels allowed
Remap_lowFreqCategs = function(
dt,
cols,
newLabels="other",
otherLabelsToReMap=NULL,
freqThresh=5,
labelsNumMax=NULL,
remapNA=TRUE) {
# first check if all cols passed on are present in the data.table
ind = cols %in% colnames(dt)
if (sum(ind) < length(ind)) {
warning(paste(
"Not all cols provided are in the data.table.\n",
"These cols are missing:\n",
paste(cols[!ind], collapse="; "),
"\n",
"We return dt unchanged.",
collapse=" "))
return(dt)
}
for (col in cols) {
c = class(dt[[col]])
if (c != "character") {
warning(paste(
"class of the column: ", col,
" was not character and was set to character."))
dt[ , col] = as.character(dt[[col]])
}
}
if (!"data.table" %in% class(dt)) {
warning("dt is not a data.table")
return(list("dt"=dt, "Func"=NULL, "freqLabelsList"=NULL))
}
dt2 = copy(dt)
k = length(cols)
if (length(freqThresh) == 1) {
freqThresh = rep(freqThresh, k)
}
if (length(newLabels) == 1) {
newLabels = rep(newLabels, k)
}
if (!is.null(labelsNumMax) && length(labelsNumMax) == 1) {
labelsNumMax = rep(labelsNumMax, k)
}
GetFreqLabels = function(i) {
col = cols[i]
freqDt = data.table(table(dt2[[col]]))
colnames(freqDt) = c(col, "freq")
freqDt = freqDt[order(freq, decreasing=TRUE)]
freqLabels = freqDt[freq > freqThresh[i]][[col]]
if (!is.null(labelsNumMax)) {
maxNum = min(length(freqLabels), labelsNumMax[i])
freqLabels = freqLabels[1:maxNum]
}
if (!is.null(otherLabelsToReMap)) {
freqLabels = setdiff(freqLabels, otherLabelsToReMap)
}
return(freqLabels)
}
freqLabelsList = lapply(X=1:k, FUN=GetFreqLabels)
names(freqLabelsList) = cols
Func = function(dt) {
for (i in 1:length(cols)) {
col = cols[i]
newLabel = newLabels[i]
      badLabelsNum = sum(!dt[[col]] %in% freqLabelsList[[col]])
      if (badLabelsNum > 0) {
data.table::set(
dt,
i=which(!dt[[col]] %in% freqLabelsList[[col]]),
j=col,
value=newLabel)}
dt = ReplaceNA(df=dt, cols=cols[i], replaceValue=newLabels[i])
}
return(dt)
}
return(list("dt"=Func(dt2), "Func"=Func, "freqLabelsList"=freqLabelsList))
}
TestRemap_lowFreqCategs = function() {
dt = data.table(data.frame(
"country"=c("", rep("US", 10), rep("IN", 3), rep("FR", 10), "IR", ""),
"gender"=c(
"", rep("MALE", 10), rep("FEMALE", 10), rep("OTHER", 3), "NONE", ""),
"value"=rnorm(26)))
res = Remap_lowFreqCategs(
dt=dt, cols=c("country", "gender"), otherLabelsToReMap=c(""),
freqThresh=5)
print(res)
dt2 = data.table(data.frame(
"country"=c("", rep("NZ", 10), rep("IN", 10), rep("FR", 3), "IR", ""),
"gender"=c(
"", rep("MALE", 10), rep("FEMALE", 10), rep("OTHER", 3), "NONE", ""),
"value"=rnorm(26)))
res[["Func"]](dt2)
}
# if a label of a column is rare, we add one to the flag
FlagRow_ifLowFreqValue = function(dt, cols, freqThresh, flagCol="flag") {
dt[ , flagCol] = 0
for (col in cols) {
if (!col %in% colnames(dt)) {
warning(paste(col, " was not in the columns."))
return(NULL)
}
freqDt = data.table(table(dt[[col]]))
colnames(freqDt) = c(col, "freq")
freqDt = freqDt[freq <= freqThresh, ]
if (nrow(freqDt) > 0) {
rareLabels = freqDt[[col]]
dt[dt[[col]] %in% rareLabels, flagCol] = (
dt[dt[[col]] %in% rareLabels, ][[flagCol]] + 1)
}
}
return(dt)
}
TestFlagRow_ifLowFreqValue = function() {
n = 20
x = sample(
c("horse", "cat", "cat", "dog", "dog", "cat"),
n,
replace=TRUE)
y = sample(
c("horse", "cat", "cat", "dog", "dog", "cat"),
n,
replace=TRUE)
z = x
df = data.frame("x"=x, "y"=y, "z"=z)
dt = data.table(df)
FlagRow_ifLowFreqValue(
dt=dt,
cols=c("x", "z"),
freqThresh=2,
flagCol="flag")
FlagRow_ifLowFreqValue(
dt=dt,
cols=c("x", "y", "z"),
freqThresh=2,
flagCol="flag")
}
## quick check
CheckColFreqDt = function(dt, col) {
freqDf = data.frame(table(as.character(dt[ , get(col)])))
freqDf = freqDf[order(freqDf[ , "Freq"], decreasing=TRUE), ]
rownames(freqDf) = NULL
Mark(dim(freqDf), "dim(freqDf)")
Mark(freqDf[1:min(50, nrow(freqDf)), ], "freqDf")
return(freqDf)
}
## replaces all NAs in a data.table dt, for given cols
DtReplaceNa = function(dt, cols=NULL, replaceValue=0) {
dt2 = copy(dt)
if (is.null(cols)) {
cols = names(dt2)
}
for (col in cols) {
dt2[is.na(dt[[col]]), (col) := replaceValue]
}
return(dt2)
}
TestDtReplaceNa = function() {
  n = 20
  x = sample(
      c(NA, "horse", "cat", "cat", "dog", "dog", "cat"),
      n,
      replace=TRUE)
y = sample(
c(NA, "horse", "cat", "cat", "dog", "dog", "cat"),
n,
replace=TRUE)
z = x
df = data.frame("x"=x, "y"=y, "z"=z)
dt = data.table(df)
DtReplaceNa(dt, cols=colnames(dt), replaceValue="other")
}
## categorical mode
CategMode = function(x) {
x = na.omit(x)
if (length(x) == 0) {
return(NULL)
}
ux = unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
RowMode = function(mat) {
apply(mat, 1, CategMode)
}
TestCategMode = function() {
  x = c('a', 'a', 'b', 'c')
  CategMode(x)
M = matrix(sample(c('a', 'b', 'c'), 25, replace=TRUE), 5, 5)
RowMode(M)
}
## continuous mode
ContiMode = function(x) {
d = density(x)
d[["x"]][which.max(d[["y"]])]
}
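## a quick usage sketch for CategMode and ContiMode; the density peak of a
# standard normal sample should be near zero
TestContiMode = function() {
  CategMode(c("a", "a", "b", NA))
  ContiMode(rnorm(10^4))
}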
## replacing NAs with mode or means
DtRemapNa = function(
dt,
cols=NULL,
NumericReplace=function(x){mean(x, na.rm=TRUE)},
FactorReplace=CategMode) {
dt2 = copy(dt)
if (is.null(cols)) {
cols = names(dt2)
}
for (col in cols) {
naProp = sum(is.na(dt2[ , get(col)])) / nrow(dt2)
if (naProp > 0) {
print(col)
print(naProp)
print(class(dt2[ , get(col)]))
if (class(dt2[ , get(col)]) %in% c("numeric")) {
set(dt2, i=which(is.na(dt2[ , get(col)])), j=col,
value=mean(dt2[ , get(col)], na.rm=TRUE))
}
if (class(dt2[ , get(col)]) %in% c("integer")) {
set(dt2, i=which(is.na(dt2[ , get(col)])), j=col,
value=round(NumericReplace(dt2[ , get(col)])))
}
if (class(dt2[ , get(col)]) %in% c("factor", "character")) {
set(dt2, i=which(is.na(dt2[ , get(col)])), j=col,
value=CategMode(dt2[ , get(col)]))
}
}
}
return(dt2)
}
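## a minimal usage sketch for DtRemapNa with mixed column types: numeric NAs
# get the mean, character NAs get the categorical mode
TestDtRemapNa = function() {
  dt = data.table(
      x=c(1.5, NA, 2.5, 3.5),
      y=c(NA, "cat", "cat", "dog"))
  DtRemapNa(dt)
}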
## remap values in a column (col) to other given values in newValues
DtRemap_colValues = function(dt, col, values, newValues, newCol=NULL) {
if (length(values) != length(newValues)) {
stop("Error: values and newValues do not have the same length")
}
if (is.null(newCol)) {
newCol = col
}
F = function(x) {
if (!(x %in% values)) {
return(x)
} else {
i = which(values == x)[1]
return(newValues[i])
}
}
dt[ , newCol] = sapply(dt[ , get(col)], FUN=F)
return(dt)
}
TestDtRemap_colValues = function() {
dt = data.table(x=c("a", "b", "c", "a"), y=1:4)
values = c("a", "b")
newValues = c("A", "B")
DtRemapColValues(dt, col="x", values=values, newValues=newValues)
}
## rounds the numeric columns in a df
RoundDf = function(df, num=1) {
cols = names(df)[unlist(lapply(df, is.numeric))]
  if (length(cols) == 0) {
return(df)
}
df[ , cols] = round(df[ , cols], num)
return(df)
}
## rounds the numeric columns in a dt
RoundDt = function(dt, num=1) {
cols = names(dt)[which(sapply(dt, is.numeric))]
  if (length(cols) == 0) {
return(dt)
}
dt[ , cols] = round(SubsetCols(df=dt, keepCols=cols), num)
return(dt)
}
## rounds the numeric columns using signif in a df
SignifDf = function(df, num=1) {
cols = names(df)[unlist(lapply(df, is.numeric))]
  if (length(cols) == 0) {
return(df)
}
df[ , cols] = signif(df[ , cols], num)
return(df)
}
## rounds the numeric columns using signif in a dt
SignifDt = function(dt, num=1) {
cols = names(dt)[which(sapply(dt, is.numeric))]
  if (length(cols) == 0) {
return(dt)
}
dt[ , cols] = signif(SubsetCols(df=dt, keepCols=cols), num)
return(dt)
}
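## a quick usage sketch for the rounding helpers; non-numeric columns are
# left untouched
TestRoundDf = function() {
  df = data.frame(x=c(1.23456, 2.34567), y=c("a", "b"), z=c(10.5678, 20.1234))
  RoundDf(df, num=2)
  SignifDf(df, num=3)
}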
## adding + or - to CIs to make it easy to find significant ones
StarCiDf = function(
df, upperCol, lowerCol, upperThresh=0, lowerThresh=0,
starCol="sig_stars") {
df[ , starCol] = ""
for (i in 1:length(upperThresh)) {
    ind = which(df[ , lowerCol] > upperThresh[i])
if (length(ind) > 0) {
df[ind, starCol] = paste0(df[ind, starCol], "+")
}
}
for (i in 1:length(lowerThresh)) {
    ind = which(df[ , upperCol] < lowerThresh[i])
if (length(ind) > 0) {
df[ind, starCol] = paste0(df[ind, starCol], "-")
}
}
return(df)
}
##
StarPvalueDf = function(
df, pvalueCol="p-value", thresh=c(0.1, 0.05, 0.01, 0.001, 0.0001),
starCol="pvalue_stars") {
df[ , starCol] = ""
  ind = which(df[ , pvalueCol] < thresh[1])
if (length(ind) > 0) {
df[ind, starCol] = paste0(df[ind, starCol], ".")
}
for (i in 2:length(thresh)) {
    ind = which(df[ , pvalueCol] < thresh[i])
if (length(ind) > 0) {
df[ind, starCol] = paste0(df[ind, starCol], "*")
}
}
return(df)
}
## this is the standard version of StarCiDf
TidyCiDf = function(
df, upperCol="ci_upper", lowerCol="ci_lower",
upperThresh=c(1, 1.5, 2), lowerThresh=c(1, 0.75, 0.5), rounding=3) {
  df = StarCiDf(
      df=RoundDf(df, rounding), upperCol=upperCol, lowerCol=lowerCol,
      upperThresh=upperThresh, lowerThresh=lowerThresh)
return(df)
}
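## a small usage sketch for the starring helpers on a toy CI table
TestStarCiDf = function() {
  df = data.frame(
      "ci_lower"=c(0.4, 1.1, 0.9),
      "ci_upper"=c(0.9, 1.6, 1.2),
      "p-value"=c(0.20, 0.03, 0.50),
      check.names=FALSE)
  StarCiDf(df=df, upperCol="ci_upper", lowerCol="ci_lower",
           upperThresh=1, lowerThresh=1)
  StarPvalueDf(df=df, pvalueCol="p-value")
  TidyCiDf(df=df)
}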
## Creates a table summary for the output of a regression model coefficients
# e.g. glm
RegMod_coefTableSumm = function(
mod, label, dropVars="(Intercept)", keepVars=NULL, signif=2) {
df = data.frame(summary(mod)[["coefficients"]])
df = df[ , c("Estimate", "Std..Error", "Pr...t..")]
colnames(df) = c("Estimate", "Sd", "p-value")
df[ , "var"] = rownames(df)
df = df[ , c("var", "Estimate", "Sd", "p-value")]
df[ , "model_label"] = label
if (!is.null(dropVars)) {
df = df[!(df[ , "var"] %in% dropVars), ]
}
if (!is.null(keepVars)) {
df = df[df[ , "var"] %in% keepVars, ]
}
df = SignifDf(df=df, num=signif)
rownames(df) = NULL
return(df)
}
## Creates a coef table summary for a list of models
RegModList_coefTableSumm = function(
modList, labels=NULL, dropVars=NULL, keepVars=NULL, signif=2) {
if (is.null(labels)) {
labels = names(modList)
}
Func = function(i) {
mod = modList[[i]]
label = labels[i]
out = RegMod_coefTableSumm(
mod=mod,
label=label,
dropVars=dropVars,
keepVars=keepVars,
signif=signif)
return(out)
}
outDf = do.call(what=rbind, args=lapply(X=1:length(modList), FUN=Func))
return(outDf)
}
## xtable with vertical dividers
# we do not capitalize here as an exception
# since this is a minor tweak to existing function
xtable2 = function(x, caption="", label="label", ...) {
MakeAlignString = function(x) {
k = ncol(x)
format_str = ifelse(sapply(x, is.numeric), "r", "l")
out = paste0("|", paste0(c("r", format_str), collapse = "|"), "|")
return(out)
}
return(xtable(x, caption=caption, label=label, ..., align=MakeAlignString(x)))
}
## entropy (base 2); drops zero entries and normalizes p to sum to one
Entropy = function(p) {
  if (min(p) < 0 || sum(p) <= 0) {
    warning("p must be non-negative with a positive sum.")
    return(NA)
  }
  pNorm = p[p > 0] / sum(p)
  -sum(log2(pNorm)*pNorm)
}
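## a quick usage sketch for Entropy; a uniform distribution over four
# outcomes has entropy 2 bits, and unnormalized counts give the same answer
TestEntropy = function() {
  Entropy(c(0.25, 0.25, 0.25, 0.25))  # 2
  Entropy(c(5, 5, 5, 5))  # also 2 after normalization
  Entropy(c(1, 0, 0, 0))  # 0
}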
## should work with both data frame and data table
SplitStrCol = function(df, col, sepStr) {
  dt = data.table(df)
  if (sepStr == "") {
    sepNums = nchar(as.character(dt[ , get(col)]))
    colNum = sepNums[1]
  } else {
    Func = function(x) {
      lengths(regmatches(x, gregexpr(sepStr, x)))
    }
    sepNums = unlist(lapply(FUN=Func, X=dt[ , get(col)]))
    # sepNums = dt[, Func(get(col)), by = 1:nrow(dt)][ , V1]
    # second approach, but it wasn't really faster
    colNum = sepNums[1] + 1
  }
print(summary(sepNums))
if (max(sepNums) != min(sepNums)) {
warning("the strings do not have the same number of sepStr within.")
return(NULL)
}
setDT(dt)[ , paste0(col, 1:colNum):=tstrsplit(
get(col), sepStr, type.convert=TRUE, fixed=TRUE)]
newCols = paste0(col, 1:colNum)
return(list("dt"=copy(dt), "newCols"=newCols))
}
TestSplitStrCol = function() {
df = data.frame(
      attr = c(1, 30, 4, 6),
type = c('foo_and_bar_and_bar3', 'foo_and_bar_2_and_bar3')
)
sepStr = "_and_"
col = "type"
SplitStrCol(df=df, col=col, sepStr=sepStr)
dt = data.table(df)
SplitStrCol(df=dt, col=col, sepStr=sepStr)
df = data.frame(
      attr = c(1, 30, 4, 6),
type = c('aaa', 'abc')
)
SplitStrCol(df=data.table(df), col="type", sepStr="")
}
## this applies the jack-knife method
# Estim is an estimator which is a function of dt and returns a vector
# we want a CI for each of the components returned by Estim
# dt is a data.table
PartitionCi = function(
dt, Estim, bucketCol=NULL, bucketNum=NULL, method="jk", conf=0.95) {
if (is.null(bucketCol)) {
bucketCol = "bucket"
n = nrow(dt)
bucketSize = floor(n / bucketNum)
r = n - bucketSize*bucketNum
bucketVec = c(rep(1:bucketNum, bucketSize))
if (r > 0) {
bucketVec = c(bucketVec, 1:r)
}
bucketVec = sample(bucketVec)
dt[ , "bucket"] = bucketVec
}
buckets = unique(dt[ , get(bucketCol)])
Jk = function(b) {
dt2 = dt[get(bucketCol) != b]
dt2 = SubsetCols(dt2, dropCols=bucketCol)
return(Estim(dt2))
}
Simple = function(b) {
dt2 = dt[get(bucketCol) == b]
dt2 = SubsetCols(dt2, dropCols=bucketCol)
return(Estim(dt2))
}
if (method == "jk") {
G = Jk
} else {
G = Simple
}
estimList = lapply(X=buckets, FUN=G)
x0 = estimList[[1]]
names = names(x0)
estimDf = setNames(
data.frame(matrix(ncol=length(x0), nrow=length(buckets))),
names)
for (i in 1:length(buckets)) {
estimDf[i, ] = estimList[[i]]
}
CltCi = function(x) {
x = na.omit(x)
m = mean(x)
s = sd(x)
n = length(x)
if (method == "jk") {
estimSd = sqrt(n-1) * s
} else {
estimSd = s / sqrt(n)
}
zValue = 1 - (1-conf) / 2
upper = m + qnorm(zValue) * estimSd
lower = m - qnorm(zValue) * estimSd
return(c(m, estimSd, lower, upper))
}
ciDf = t(apply(estimDf, 2, CltCi))
ciDf = data.frame(ciDf)
colnames(ciDf) = c("mean", "estim sd", "lower", "upper")
ciDf[ , "length"] = ciDf[ , "upper"] - ciDf[ , "lower"]
return(ciDf)
}
BootstrapCi = function(dt, Estim, bsNum=500, conf=0.95) {
q1 = (1 - conf) / 2
q2 = 1 - q1
n = nrow(dt)
Bs = function(b) {
samp = sample(1:n, replace=TRUE)
dt2 = dt[samp, ]
return(Estim(dt2))
}
estimList = lapply(X=1:bsNum, FUN=Bs)
x0 = estimList[[1]]
names = names(x0)
estimDf = setNames(
data.frame(matrix(ncol=length(x0), nrow=bsNum)),
names)
for (i in 1:bsNum) {
estimDf[i, ] = estimList[[i]]
}
BsCi = function(x) {
x = na.omit(x)
m = mean(x)
estimSd = sd(x)
upper = quantile(x, q2)
lower = quantile(x, q1)
return(c(m, estimSd, lower, upper))
}
ciDf = t(apply(estimDf, 2, BsCi))
ciDf = data.frame(ciDf)
colnames(ciDf) = c("mean", "estim sd", "lower", "upper")
ciDf[ , "length"] = ciDf[ , "upper"] - ciDf[ , "lower"]
return(ciDf)
}
TestPartitionCi = function() {
n = 10^4
x1 = rnorm(n, mean=3, sd=10)
x2 = rnorm(n, mean=5, sd=2)
x3 = x1 + x2
df = data.frame(x1=x1, x2=x2, x3=x3)
dt = data.table(df)
bucketCol = NULL
bucketNum = 20
Estim = function(dt) {colMeans(dt)}
PartitionCi(
dt=dt, Estim=Estim, bucketCol=NULL, bucketNum=bucketNum,
method="jk", conf=0.95)
PartitionCi(
dt=dt, Estim=Estim, bucketCol=NULL, bucketNum=bucketNum,
method="simple", conf=0.95)
BootstrapCi(dt=dt, Estim=Estim, bsNum=1000, conf=0.95)
Estim = function(dt) {mean(dt[[1]])}
PartitionCi(
dt=dt, Estim=Estim, bucketCol=NULL, bucketNum=bucketNum,
method="jk", conf=0.95)
PartitionCi(
dt=dt, Estim=Estim, bucketCol=NULL, bucketNum=bucketNum,
method="simple", conf=0.95)
BootstrapCi(dt=dt, Estim=Estim, bsNum=500, conf=0.95)
}
## substituting multiple values
ReplaceStringMulti = function(x, values, subs) {
for (i in 1:length(values)) {
x = gsub(values[i], subs[i], x)
}
return(x)
}
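## a quick usage sketch for ReplaceStringMulti; note that values are treated
# as regular expressions by gsub
TestReplaceStringMulti = function() {
  ReplaceStringMulti(
      x="the cat sat on the mat",
      values=c("cat", "mat"),
      subs=c("dog", "rug"))
}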
## capitalizes all words in a sentence
CapWords = function(x, splitStr=" ") {
s = strsplit(x, splitStr)[[1]]
paste(toupper(substring(s, 1, 1)), substring(s, 2),
sep="", collapse=splitStr)
}
TestCapWords = function() {
CapWords("be free.") == "Be Free."
}
# Cartesian product of string vectors
StringCartesianProd = function(..., prefix="", sep="_") {
#paste0(prefix, levels(interaction(..., sep=sep)))
paste2 = function(...) {
paste(..., sep=sep)
}
df = expand.grid(...)
do.call(what=paste2, args=df)
}
## test for the above function
TestStringCartesianProd = function() {
values = c("active_days_num", "activity_num")
products = c("assist", "search", "watchFeat", "photos", "multi")
periods = c("pre", "post")
valueCols = StringCartesianProd(values, products, periods, sep="_")
}
## sorts data frames and data.tables
# base R syntax for sorting is unwieldy, especially inside functions
# this function provides a user-friendly approach
# cols: columns to be used for sorting, in order of their importance
# ascend: specifies if the order is ascending (TRUE) or not (FALSE)
# default for ascend is (TRUE, ..., TRUE)
SortDf = function(
df, cols=NULL, ascend=NULL, printCommand=FALSE) {
if (is.null(cols)) {
cols = names(df)
}
if (min(cols %in% names(df)) < 1) {
warning("some of your columns are not in df.")
return(df)
}
if (is.null(ascend)) {
ascend = rep(TRUE, length(cols))
}
commandStr = "order("
for (i in 1:length(cols)) {
dir = ascend[i]
col = cols[i]
if (dir) {
commandStr = paste0(commandStr, " ", col)
} else {
commandStr = paste0(commandStr, " ", "-", col)
}
if (i == length(cols)) {
commandStr = paste0(commandStr, ")")
} else {
commandStr = paste0(commandStr, ",")
}
}
commandStr = paste0("df = df[with(df, ", commandStr,") , ]")
if (printCommand) {
print(commandStr)
}
eval(parse(text=commandStr))
return(df)
}
TestSortDf = function() {
n = 20
df = data.frame(
"first_name"=sample(c("John", "Omar", "Mo"), size=n, replace=TRUE),
"family_name"=sample(c("Taylor", "Khayyam", "Asb"), size=n, replace=TRUE),
"grade"=sample(1:10, size=n, replace=TRUE))
# sort with defaults, nice and easy
SortDf(df=df)
# choose columns and the direction of sorting
SortDf(
df=df,
cols=c("first_name", "family_name", "grade"),
ascend=c(TRUE, TRUE, FALSE),
printCommand=TRUE)
# try same with data table object
SortDf(
df=data.table(df),
cols=c("first_name", "family_name", "grade"),
ascend=c(TRUE, TRUE, FALSE),
printCommand=FALSE)
}
## this returns a function which calculates a relative err
# using norms
# this error function is symmetric with respect to its inputs position
# p denotes the power in the L-p norm (p > 0)
SymRelErrFcn = function(p) {
if (p <= 0) {
warning("p has to be positive")
return(NULL)
}
Func = function(x, y) {
z = abs(x - y)
err = 2 * z^p / (abs(x)^p + abs(y)^p)
return(err)
}
return(Func)
}
TestSymRelErrFcn = function() {
x = 3
y = 5
SymRelErrFcn(2)(x, y)
SymRelErrFcn(1)(x, y)
}
## calculates diff between valueCols between two data frames
CalcErrDfPair = function(
df1, df2, valueCols, Err, ErrAvgF=mean, sort=TRUE,
idCols=NULL, checkMatch=TRUE) {
if (nrow(df1) != nrow(df2)) {
warning("length of the data frames is not the same.")
return(NULL)
}
if (is.null(idCols)) {
## the common cols except for valueCols used for sorting and matching
idCols = setdiff(intersect(colnames(df1), colnames(df2)), valueCols)
}
if (sort) {
df1 = SortDf(df=df1, cols=idCols)
df2 = SortDf(df=df2, cols=idCols)
}
if (checkMatch) {
if (!identical(df1[ , idCols], df2[ , idCols])) {
warning(paste(
"id columns:",
paste(idCols, collapse=" "),
"are not matching in values."))
return(NULL)
}
}
errVec = NULL
for (valueCol in valueCols) {
err = ErrAvgF(Err(df1[ , valueCol], df2[ , valueCol]))
errVec = c(errVec, err)
}
names(errVec) = valueCols
return(errVec)
}
TestCalcErrDfPair = function() {
n = 100
x1 = sample(1:n, size=n)
x2 = sample(1:n, size=n)
df1 = data.frame(
"x1"=x1,
"x2"=x2,
"y1"=2*x1 + rnorm(n),
"y2"=2*x2 + rnorm(n))
df2 = data.frame(
"x1"=x1,
"x2"=x2,
"y1"=2*x1 + rnorm(n),
"y2"=1*x2 + rnorm(n))
df3 = df2[sample(1:n, n), ]
Err = SymRelErrFcn(2)
CalcErrDfPair(
df1=df1, df2=df2, valueCols=c("y1", "y2"),
Err=Err, ErrAvgF=mean, sort=TRUE,
idCols=NULL, checkMatch=TRUE)
CalcErrDfPair(
df1, df3, valueCol=c("y1", "y2"), Err=Err, ErrAvgF=mean, sort=TRUE,
idCols=NULL, checkMatch=TRUE)
}
## This function compares two frequency tables
# it returns a row_wise err which is then averaged across rows
# also returns a global err which is standardized by the total freq
# both metrics are symmetric
# note that this is not a distbn distance by default: set distbn_dist=TRUE
# this will compare frequencies by default
FreqTables_simpleDiff = function(
tab1, tab2, AvgF=mean, distbn_dist=FALSE) {
df1 = data.frame(tab1)
colnames(df1) = c("var", "freq1")
df2 = data.frame(tab2)
colnames(df2) = c("var", "freq2")
  ## if we want a distribution distance we convert frequencies to probabilities
  if (distbn_dist) {
    df1[ , "freq1"] = df1[ , "freq1"] / sum(df1[ , "freq1"])
    df2[ , "freq2"] = df2[ , "freq2"] / sum(df2[ , "freq2"])
  }
  compareDf = merge(df1, df2, by="var", all=TRUE)
compareDf[is.na(compareDf)] = 0
compareDf[ , "err"] = abs(compareDf[ , "freq1"] - compareDf[ , "freq2"])
denom_elementwise = (0.5*compareDf[ , "freq1"] + 0.5*compareDf[ , "freq2"])
avg_elementwise_err = AvgF(compareDf[ , "err"] / denom_elementwise)
total_freq = sum(compareDf[ , "freq1"]) + sum(compareDf[ , "freq2"])
global_err = sum(compareDf[ , "err"]) / total_freq
return(list(
"avg_elementwise_err"=avg_elementwise_err,
"global_err"=global_err))
}
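## a minimal usage sketch for FreqTables_simpleDiff, comparing two frequency
# tables both as raw counts and as distributions
TestFreqTables_simpleDiff = function() {
  tab1 = table(c("a", "a", "b", "c"))
  tab2 = table(c("a", "b", "b", "b", "d"))
  FreqTables_simpleDiff(tab1, tab2)
  FreqTables_simpleDiff(tab1, tab2, distbn_dist=TRUE)
}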
## which value is the min
MinInd = function(x) {
which(x == min(x))
}
# which row has the min value for col
MinIndDf = function(df, col) {
x = df[ , col]
ind = which(x == min(x))
return(df[ind, , drop=FALSE])
}
## debugging R code
Example = function() {
Func = function() {
on.exit(traceback(1))
G = function() {
x = 1 + "a"
}
G()
}
Func()
#traceback()
}
## for debugging within R
Debug = function(Func) {
on.exit(traceback(1))
Func()
#traceback()
}
## check for a library dependencies
# also tries to find out if those libs are installed by checking library(lib)
# if not installed, it tries to install them
# it reports un-installed ones and the unavailable ones for install
# Install is either install.packages or a custom install function
Check_andFix_dependencies = function(lib, Install) {
library("tools")
libs = package_dependencies(lib)[[1]]
uninstalledLibs = NULL
unavailLibs = NULL
Func = function(lib) {
suppressMessages(library(lib, character.only=TRUE))
return(NULL)
}
for (lib in libs) {
x = tryCatch(
Func(lib),
error=function(e) {lib})
uninstalledLibs = c(uninstalledLibs, x)
}
Func = function(lib) {
suppressMessages(Install(lib))
return(NULL)
}
for (lib in uninstalledLibs) {
x = tryCatch(
Func(lib),
error=function(e) {lib})
unavailLibs = c(unavailLibs, x)
}
return(list(
unavailLibs=unavailLibs,
uninstalledLibs=uninstalledLibs))
}
## drops (multiple) ending vowels from a string
DropEndingVowels = function(s, minLength=2) {
cond = TRUE
while (cond && nchar(s) > minLength) {
if (tolower(substr(s, nchar(s), nchar(s))) %in% c("a", "o", "e", "u", "i")) {
s = substr(s, 1, nchar(s)-1)
} else {
cond = FALSE
}
}
return(s)
}
TestDropEndingVowels = function() {
s = "abbggaae"
DropEndingVowels(s)
s = "abbggaaeuzzai"
DropEndingVowels(s)
}
## drops ending of words of specified characters
DropEndingChars = function(s, chars, minLength=2) {
cond = TRUE
while (cond && nchar(s) > minLength) {
if (tolower(substr(s, nchar(s), nchar(s))) %in% chars) {
s = substr(s, 1, nchar(s)-1)
} else {
cond = FALSE
}
}
return(s)
}
TestDropEndingChars = function() {
s = "abbggaaeuzz"
DropEndingChars(s, chars=c("a", "z", "u"))
}
## abbreviates a string.
# first we abbreviate each word in a string (phrase)
# then we concat them back and abbreviate the whole phrase
AbbrString = function(
s,
wordLength=6,
replaceStrings=c("/", "&", " and ", "-", "_", ",", ";"),
sep="-",
totalLength=NULL,
wordNumLimit=NULL,
dropEndingVowels=FALSE) {
for (char in replaceStrings) {
s = gsub(char, " ", s)
}
sVec = strsplit(s, " ")[[1]]
sVec = substr(sVec, 1, wordLength)
sVec = sVec[!sVec %in% c("", " ", " ", " ")]
if (dropEndingVowels) {
sVec = sapply(FUN=DropEndingVowels, X=sVec)
}
sVec = unique(sVec)
if (!is.null(wordNumLimit)) {
sVec = sVec[1:min(wordNumLimit, length(sVec))]
}
s = paste(sVec, collapse=sep)
if (!is.null(totalLength)) {
s = substr(s, 1, totalLength)
}
if (dropEndingVowels) {
s = DropEndingVowels(s)
}
s = DropEndingChars(
s=s,
chars=c("/", "&", " and ", "-", sep, " ", ",", ";"))
return(s)
}
TestAbbrString = function() {
s = "aa_sasa_aann & jjabbbbbaa --- aaahhh"
AbbrString(
s,
wordLength=6,
replaceStrings=c("/", "&", " and ", "-", "_", ",", ";"),
sep="-",
totalLength=NULL,
wordNumLimit=4)
s = "get_request_rate"
AbbrString(
s,
wordLength=6,
replaceStrings=c("/", "&", " and ", "-", "_", ",", ";"),
sep="-",
totalLength=NULL,
wordNumLimit=4)
}
AbbrStringVec = function(
strings,
wordLength=6,
replaceStrings=c("/", "&", " and ", "-", "_", ",", ";"),
sep="-",
totalLength=NULL,
wordNumLimit=NULL,
dropEndingVowels=FALSE) {
Abbr = function(s) {
abbrStr = AbbrString(
s=s,
wordLength=wordLength,
replaceStrings=replaceStrings,
sep=sep,
totalLength=totalLength,
wordNumLimit=wordNumLimit,
dropEndingVowels=dropEndingVowels)
return(abbrStr)
}
abbrValues = sapply(FUN=Abbr, X=strings)
return(abbrValues)
}
## functions to abbreviate various string columns
# of a data frame
AbbrStringCols = function(
df,
cols,
newCols=NULL,
wordLength=6,
replaceStrings=c("/", "&", " and ", "-"),
sep="-",
totalLength=NULL,
wordNumLimit=NULL,
dropEndingVowels=FALSE) {
values = unique(as.vector(as.matrix(df[ , cols])))
print(values)
Abbr = function(s) {
abbrStr = AbbrString(
s=s,
wordLength=wordLength,
replaceStrings=replaceStrings,
sep=sep,
totalLength=totalLength,
wordNumLimit=wordNumLimit,
dropEndingVowels=dropEndingVowels)
return(abbrStr)
}
abbrValues = sapply(FUN=Abbr, X=values)
print(abbrValues)
mapDf = data.frame("value"=values, "abbr_values"=abbrValues)
rownames(mapDf) = NULL
  df[ , cols] = plyr::mapvalues(
as.vector(as.matrix(df[ , cols])),
from=values,
to=abbrValues)
if (!is.null(newCols)) {
df[ , newCols] = df[ , cols]
}
return(list("df"=df, "mapDf"=mapDf))
}
TestAbbrStringCols = function() {
df = data.frame(
"col1"=c("life is beautiful", "i like mountains", "ok", "cool"),
"col2"=c("life is beautiful", "life sucks indeed", "a", "b"))
#AbbrStringCols(df, cols=["col"])
res = AbbrStringCols(
df=df,
cols=c("col1", "col2"), totalLength=10, wordNumLimit=NULL)
res[["df"]]
res[["mapDf"]]
}
## finds the characters common to a set of strings (kept in order)
CommonString = function(strings) {
Intersect = function (x, y) {
y = as.vector(y)
y[match(as.vector(x), y, 0L)]
}
commonStr = paste(Reduce(Intersect, strsplit(strings, NULL)), collapse="")
return(commonStr)
}
TestCommonString = function() {
strings = c("a123a", "abcd123", "123uu123")
CommonString(strings)
}
## sums the columns of a data table which satisfy a certain property
SumCols_viaPattern = function(dt, pattern) {
dt = data.table(dt)
cols = colnames(dt)[grepl(pattern, colnames(dt))]
if (length(cols) == 0) {
warning("No columns satisfied the given pattern. Returns NULL.")
return(NULL)
}
commonStr = CommonString(cols)
text = paste0(
"dt2 = dt[ , ",
commonStr,
":=",
"(",
paste0(cols, collapse="+"), "), ]")
dt2 = eval(parse(
text=text))
return(list("dt"=dt2, "newCol"=commonStr))
}
TestSumCols_viaPattern = function() {
x = matrix(1:100, 10, 5)
df = data.frame(x)
colnames(df) = c(
"abc_lor1_final",
"abc_ltx1_final",
"abc_lkg1_final",
"abc_kjg1_final",
"abc_opt1_final")
dt = data.table(df)
pattern = "^abc.*final$"
res = SumCols_viaPattern(dt=dt, pattern=pattern)
res[["dt"]]
res[["newCol"]]
}
SumCols_multiPatterns = function(dt, patterns) {
newCols = NULL
for (pattern in patterns) {
dt = data.table(dt)
cols = colnames(dt)[grepl(pattern, colnames(dt))]
if (length(cols) == 0) {
warning(paste0(
"No columns satisfied the pattern: ",
pattern,
" Function does nothing for this pattern."))
} else {
commonStr = CommonString(cols)
text = paste0(
"dt2 = dt[ , ",
commonStr,
":=",
"(",
paste0(cols, collapse="+"), "), ]")
dt = eval(parse(
text=text))
newCols = c(newCols, commonStr)
}
}
return(list("dt"=dt, "newCols"=newCols))
}
TestSumCols_multiPatterns = function() {
x = matrix(1:100, 10, 5)
df = data.frame(x)
colnames(df) = c(
"abc_lor1_final",
"abc_ltx1_final",
"abc_lkg1_final",
"efg_kjg1_final",
"efg_opt1_final")
dt = data.table(df)
patterns = c("^abc.*final$", "^efg*")
res = SumCols_multiPatterns(dt=dt, patterns=patterns)
res[["dt"]]
res[["newCols"]]
}
## rewrite the print.data.frame method to behave like data.table's print
# this is to avoid R attempting to print an entire large data frame
print.data.frame = function(df) {
data.table:::print.data.table(data.table(df))
}
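## dichotomizes (bins) a numeric variable into quantile-based categories,
# using the left Quantile function defined above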
DichomVar = function(x, num=6) {
x = na.omit(x)
step = 1 / num
#qs = quantile(x, seq(step, 1-step, step))
qs = Quantile(x, seq(step, 1-step, step))
qs = unique(qs)
qs = c(-Inf, qs, Inf)
Dichom = function(z) {
cut(z, qs)
}
return(list(
"var"=Dichom(x),
"Dichom"=Dichom,
"qs"=qs))
}
Add_dichomVar = function(dt, col, num=6) {
x = Col(dt, col)
res = DichomVar(x, num=num)
Dichom = res[["Dichom"]]
qs = res[["qs"]]
AddDichom = function(dt) {
x = Col(dt, col)
dt[ , paste0(col, "_categ")] = Dichom(x)
return(dt)
}
return(list(
"dt"=AddDichom(dt),
"qs"=qs,
"Dichom"=Dichom,
"AddDichom"=AddDichom,
"newCol"=paste0(col, "_categ")))
}
TestAdd_dichomVar = function() {
n = 100
dt0 = data.table(
x=1:n,
y=2*(1:n) + rnorm(n))
dt = dt0[sample(.N, n/2)]
newDt = dt0[sample(.N, n/2)]
res = Add_dichomVar(dt=dt, col="x", num=6)
Dichom = res[["Dichom"]]
AddDichom = res[["AddDichom"]]
dt = res[["dt"]]
newCol = res[["newCol"]]
AddDichom(newDt)
}
## add multiple dichom columns
Add_dichomVarMulti = function(dt, cols, num=6) {
dt = data.table(dt)
fcnList = list()
addFcnList = list()
newCols = NULL
for (col in cols) {
res = Add_dichomVar(dt=dt, col=col, num=num)
Dichom = res[["Dichom"]]
AddDichom = res[["AddDichom"]]
dt = res[["dt"]]
newCol = res[["newCol"]]
fcnList[[col]] = Dichom
addFcnList[[col]] = AddDichom
newCols = c(newCols, newCol)
}
return(list(
"dt"=dt,
"newCols"=newCols,
"fcnList"=fcnList,
addFcnList="addFcnList"
))
}
TestAdd_dichomVarMulti = function() {
n = 100
dt0 = data.table(
x=1:n,
y=2*(1:n) + rnorm(n))
dt = dt0[sample(.N, n/2)]
newDt = dt0[sample(.N, n/2)]
res = Add_dichomVarMulti(dt=dt, cols=c("x", "y"), num=6)
dt = res[["dt"]]
newCols = res[["newCols"]]
}
## birth year to age
BirthYear_toAgeCateg = function(x, currentYear=NULL) {
if (is.null(currentYear)) {
currentYear = as.integer(format(Sys.Date(), "%Y"))
}
if (is.na(x) | is.null(x) | x == "" | x == 0) {
return("other")
}
x = as.numeric(x)
age = currentYear - x
if (age <= 17) {
return("<18")
}
if (age <= 25) {
return("18-25")
}
if (age <= 35) {
return("26-35")
}
if (age <= 50) {
return("36-50")
}
return(">51")
}
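## a quick usage sketch for BirthYear_toAgeCateg; currentYear is fixed so
# the output is deterministic
TestBirthYear_toAgeCateg = function() {
  sapply(
      X=c(2010, 1995, 1985, 1975, 1960),
      FUN=function(x) {BirthYear_toAgeCateg(x, currentYear=2020)})
}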
# checks what percentage of each column of df satisfy a given property
# also reports for which percent of rows all values of the row satisfy
# and for which rows any of the values satisfy
# property is input using a function: Func
ReportPropertyDf = function(
df,
cols=NULL,
Func,
propertyName="prop",
removeNA=TRUE) {
  if (is.null(cols)) {
    cols = colnames(df)
  }
  df = SubsetCols(df, keepCols=cols)
n = nrow(df)
res = 100 * apply(
X=df,
MARGIN=2,
FUN=function(x)sum(Func(x), na.rm=removeNA)) / n
outDf = data.frame(res)
colnames(outDf) = paste0(propertyName, "_perc")
outDf[ , "colname"] = names(res)
outDf = outDf[ , c("colname", paste0(propertyName, "_perc"))]
colNum = ncol(df)
any_prop = sum(
apply(
X=df,
MARGIN=1,
FUN=function(x)sum(Func(x), na.rm=removeNA)) > 0)
all_prop = sum(
apply(
X=df,
MARGIN=1,
FUN=function(x)sum(Func(x), na.rm=removeNA)) == colNum)
outDf[nrow(outDf) + 1, ] = c(NA, NA)
outDf[nrow(outDf), 1] = paste0("any_", propertyName, "_perc")
outDf[nrow(outDf), 2] = 100 * any_prop / n
outDf[nrow(outDf) + 1, ] = c(NA, NA)
outDf[nrow(outDf), 1] = paste0("all_", propertyName, "_perc")
outDf[nrow(outDf), 2] = 100 * all_prop / n
return(outDf)
}
TestReportPropertyDf = function() {
df = data.frame(
x=c(rep(1, 10), NA),
y=c(NA, 1:10),
z=c(1:10, NA),
u=c(NA, NA, 1:9))
ReportPropertyDf(
df=df,
cols=c("x", "y", "z"),
Func=is.na,
propertyName="missing")
ReportPropertyDf(
df=df,
cols=c("x", "y", "z"),
Func=function(x) {x == 2},
propertyName="is_two")
}
## reports NAs for a data frame
# for each column it reports the perc missing
# also it reports the per of rows with all missing or some missing
# TODO (Reza Hosseini): add correlation matrix for missing patterns
ReportNA = function(df, cols=NULL) {
outDf = ReportPropertyDf(
df=df,
cols=cols,
Func=is.na,
propertyName="missing")
return(outDf)
}
TestReportNA = function() {
df = data.frame(
x=c(1:10, NA),
y=c(NA, 1:10),
z=c(1:10, NA),
u=c(NA, NA, 1:9))
ReportNA(df, cols=c("x", "y"))
}
# works with data.table and data.frame
ReplaceNA = function(df, cols=NULL, replaceValue=0) {
if (is.null(cols)) {
cols = colnames(df)
}
if ("data.table" %in% class(df)) {
for (col in cols) set(df, which(is.na(df[[col]])), col, replaceValue)
return(df)
}
df[ , cols][is.na(df[ , cols])] = replaceValue
return(df)
}
TestReplaceNA = function() {
df = data.frame(
x=c(1:10, NA),
y=c(NA, 1:10),
z=c(1:10, NA),
u=c(NA, NA, 1:9))
ReplaceNA(df)
ReplaceNA(data.frame(df))
ReplaceNA(df, cols="x")
}
## balancing the sample sizes (in terms of number of items)
# we assign the minimum available sample size to all slices
# this is done by defining a new column: which is isBalancedSample
# if you would like to do partial balancing on only some slice values,
# specify those slice values in sliceCombinValues_toBalance
BalanceSampleSize = function(
df,
sliceCols,
itemCols=NULL,
sliceCombinValues_toBalance=NULL) {
if (is.null(itemCols)) {
itemCols = c("dummy_item")
df[ , "dummy_item"] = 1:nrow(df)}
df = Concat_stringColsDf(
df=df,
cols=itemCols,
colName="item_combin",
sepStr="-")
df = Concat_stringColsDf(
df=df,
cols=sliceCols,
colName="slice_combin",
sepStr="-")
itemColsStr = paste(itemCols, collapse="_")
sliceColsStr = paste(sliceCols, collapse="_")
df2 = unique(df[ , c("item_combin", "slice_combin")])
df3 = df2[order(df2[ , "slice_combin"]), ]
dt3 = data.table(df3)
dfItemCount_perSlice = data.frame(dt3[ , .N, by="slice_combin"])
names(dfItemCount_perSlice)[names(dfItemCount_perSlice) == "N"] = "item_combin_count"
dfItemCount_perSlice$slice_item_index = lapply(
X=as.list(dfItemCount_perSlice[ , "item_combin_count"]),
FUN=function(x){list(1:x)})
if (is.null(sliceCombinValues_toBalance)) {
minSs = min(dfItemCount_perSlice[ , "item_combin_count"])
    # if there is only one slice remaining, we assign FALSE to all
if (nrow(dfItemCount_perSlice) < 2){
minSs = -Inf
}
} else {
df0 = dfItemCount_perSlice[
dfItemCount_perSlice[ , "slice_combin"] %in% sliceCombinValues_toBalance, ]
minSs = min(df0[ , "item_combin_count"])
    # if there is only one slice remaining, we assign FALSE to all
if (nrow(df0) < 2){
minSs = -Inf
}
}
dfItemSliceIndex = Flatten_repField(
df=dfItemCount_perSlice,
listCol="slice_item_index")
dfItemSliceIndex = SubsetCols(
df=dfItemSliceIndex,
dropCols="item_combin_count")
colName = paste0(sliceColsStr, ".", itemColsStr, "_index")
boolColName = paste0("balanced_", sliceColsStr, "__", itemColsStr)
setnames(x=dfItemSliceIndex, old="slice_item_index", new=colName)
if (is.null(sliceCombinValues_toBalance)) {
dfItemSliceIndex[boolColName] = dfItemSliceIndex[colName] <= minSs
} else {
dfItemSliceIndex[ , boolColName] = (
(dfItemSliceIndex[ , colName] <= minSs) |
!dfItemSliceIndex[ , "slice_combin"] %in% sliceCombinValues_toBalance)
}
df3[ , colName] = dfItemSliceIndex[ , colName]
df3[ , boolColName] = dfItemSliceIndex[ , boolColName]
fullDf = merge(df, df3, all.x=TRUE, by=c("item_combin", "slice_combin"))
df0 = fullDf[ , c(sliceCols, boolColName, "item_combin")]
dt0 = data.table(df0)
infoDf = dt0[ , .(item_combin_count=length(unique(item_combin))),
by=c(sliceCols, boolColName)]
fullDf = SubsetCols(
df=fullDf,
dropCols=c("item_combin", "slice_combin"))
subDf = fullDf[fullDf[ , boolColName], ]
subDf = SubsetCols(
df=subDf,
dropCols=c(colName, boolColName))
return(list("fullDf"=fullDf, "subDf"=subDf, "infoDf"=infoDf))
}
TestBalanceSampleSize = function() {
n = 100
df = data.frame(
"user_id"=1:n,
"country"=sample(c("us", "jp"), n, replace=TRUE),
"date"=sample(c("1/1", "1/2", "1/3"), n, replace=TRUE))
Mark(df[1:2, ])
res = BalanceSampleSize(
df=df,
sliceCols=c("country"),
itemCols=c("user_id"),
sliceCombinValues_toBalance=NULL)
Mark(res["infoDf"])
## partial balancing
res = BalanceSampleSize(
df=df,
sliceCols="country",
itemCols=c("user_id", "date"),
sliceCombinValues_toBalance=c("JP", "FR"))
Mark(res["infoDf"])
}
# This will make sure that the sample size is the same
# for each (multi-dimensional) value of "wrt_cols" across
# slice_cols. For example, if wrt_cols = [country] and slice_cols = [expt_id],
# then for Japan we will have the same number of
# units on the base and test arms, e.g. 2 and 2,
# and for the US we will have the same number, e.g. 3 and 3.
# TODO: Reza Hosseini resolve BUG: if RU has 3 items on base and no items on
# test. RU base will be kept at 3. Maybe RU has to be dropped.
BalanceSampleSize_wrtCols = function(
df,
sliceCols,
wrtCols,
itemCols=NULL,
sliceCombinValues_toBalance=NULL) {
df = Concat_stringColsDf(
df=df,
cols=wrtCols,
colName='wrt_combin',
sepStr='-')
Func = function(group) {
df0 = df[df["wrt_combin"] == group, ]
res = BalanceSampleSize(
df=df0,
sliceCols=sliceCols,
itemCols=itemCols,
sliceCombinValues_toBalance=sliceCombinValues_toBalance)
return(res[['subDf']])}
groups = unique(df[ , "wrt_combin"])
subDf = do.call(what=rbind, args=lapply(FUN=Func, X=groups))
subDf = SubsetCols(subDf, keepCols=colnames(df))
return(list("subDf"=subDf))
}
TestBalanceSampleSize_wrtCols = function() {
n = 20
df = data.frame(
"treat"=c(rep(0, n), rep(1, n)),
"subclass"=sample(1:5, 2*n, replace=TRUE))
sliceCols = "treat"
wrtCols = "subclass"
res = BalanceSampleSize_wrtCols(
df=df,
sliceCols=sliceCols,
wrtCols=wrtCols,
itemCols=NULL,
sliceCombinValues_toBalance=NULL)
subDf = res[["subDf"]]
dt = data.table(df)
subDt = data.table(subDf)
SortDf(dt[ , .N, c(sliceCols, wrtCols)], "subclass")
SortDf(subDt[ , .N, c(sliceCols, wrtCols)], "subclass")
}
Compact_condDist = function(df, groupCols, valueCol, quantileNum=40) {
# create a conditional distbn
dt = data.table(df)
AggFunc = function(x, num=quantileNum) {
qs = quantile(x, seq(0, 1, 1/num), na.rm=TRUE)
return(as.list(qs))
}
dt = dt[ , c(groupCols, valueCol), with=FALSE]
  distDt = dt[ , AggFunc(get(valueCol)), by=groupCols]
return(distDt)
}
# test the content of this function
TestCompact_condDist = function() {
n = 1000
df = data.frame(
x=sample(paste0("label", 1:100), n, replace=TRUE),
gender=sample(c("male", "female"), n, replace=TRUE),
value=rnorm(n))
qDt = Compact_condDist(
df=df,
groupCols=c("x", "gender"),
valueCol="value",
quantileNum=40)
}