Nothing
#' WOE Transformation
#'
#' \code{woe_trans} is for transforming data to woe.
#' The \code{woe_trans_all} function is a simpler wrapper for \code{woe_trans}.
#' @param dat A data.frame with independent variables.
#' @param target The name of target variable. Default is NULL.
#' @param x_list A list of x variables.
#' @param x The name of an independent variable.
#' @param ex_cols Names of excluded variables. Regular expressions can also be used to match variable names. Default is NULL.
#' @param breaks_list A list contains breaks of variables. it is generated by code{\link{get_breaks_all}},code{\link{get_breaks}}
#' @param bins_table A table contians woe of each bin of variables, it is generated by code{\link{get_bins_table_all}},code{\link{get_bins_table}}
#' @param note Logical, outputs info. Default is TRUE.
#' @param parallel Logical, parallel computing. Default is FALSE.
#' @param woe_name Logical. Add "_woe" at the end of the variable name.
#' @param save_data Logical, save results in locally specified folder. Default is TRUE
#' @param file_name The name for periodically saved woe file. Default is "dat_woe".
#' @param dir_path The path for periodically saved woe file Default is "./data"
#' @param ... Additional parameters.
#' @return A list of breaks for each variables.
#' @seealso \code{\link{get_tree_breaks}}, \code{\link{cut_equal}}, \code{\link{select_best_class}}, \code{\link{select_best_breaks}}
#' @examples
#' sub = cv_split(UCICreditCard, k = 30)[[1]]
#' dat = UCICreditCard[sub,]
#' dat = re_name(dat, "default.payment.next.month", "target")
#' dat = data_cleansing(dat, target = "target", obs_id = "ID", occur_time = "apply_date",
#' miss_values = list("", -1))
#'
#' train_test = train_test_split(dat, split_type = "OOT", prop = 0.7,
#' occur_time = "apply_date")
#' dat_train = train_test$train
#' dat_test = train_test$test
#' #get breaks of all predictive variables
#' x_list = c("PAY_0", "LIMIT_BAL", "PAY_AMT5", "EDUCATION", "PAY_3", "PAY_2")
#' breaks_list = get_breaks_all(dat = dat_train, target = "target",
#' x_list = x_list, occur_time = "apply_date", ex_cols = "ID",
#' save_data = FALSE, note = FALSE)
#' #woe transform
#' train_woe = woe_trans_all(dat = dat_train,
#' target = "target",
#' breaks_list = breaks_list,
#' woe_name = FALSE)
#' test_woe = woe_trans_all(dat = dat_test,
#' target = "target",
#' breaks_list = breaks_list,
#' note = FALSE)
#'
#' @export
woe_trans_all = function(dat, x_list = NULL, ex_cols = NULL, bins_table = NULL,
target = NULL, breaks_list = NULL, note = FALSE,
save_data = FALSE, parallel = FALSE, woe_name = FALSE,
file_name = NULL, dir_path = tempdir(), ...) {
if (note)cat_line("-- Transforming variables to woe", col = love_color("deep_green"))
opt = options(scipen = 200, stringsAsFactors = FALSE) #
if (is.null(x_list)) {
if (!is.null(bins_table)) {
x_list = unique(bins_table[which(as.character(bins_table[, "Feature"]) != "Total"), "Feature"])
} else {
if(length(breaks_list)>0){
x_list = unique(breaks_list[,1])
}else{
x_list = get_names(dat = dat,
types = c('factor', 'character', 'numeric', 'integer', 'double'),
ex_cols = c(target, ex_cols), get_ex = FALSE)
}
}
}
ex_vars = get_names(dat = dat, types = c('factor', 'character', 'numeric', 'integer', 'double'),
ex_cols = x_list, get_ex = FALSE)
if(sum(is.na(dat))> 0){stop("Input data contains NAs, please process missing value first.") }
dat_woe = loop_function(func = woe_trans, x_list = x_list,
args = list(dat = dat, bins_table = bins_table,
target = target, breaks_list = breaks_list,
woe_name = woe_name),
bind = "cbind", parallel = parallel)
dat = cbind(dat[ex_vars], dat_woe)
if (save_data) {
dir_path = ifelse(!is.character(dir_path),
tempdir(), dir_path)
if (!dir.exists(dir_path)) dir.create(dir_path)
if (!is.character(file_name)) file_name = NULL
save_data(dat, file_name = ifelse(is.null(file_name), "dat.woe", paste(file_name, "dat.woe", sep = ".")), dir_path = dir_path, note = note)
}
return(dat)
options(opt) # reset
}
#' @rdname woe_trans_all
#' @export
woe_trans = function(dat, x, bins_table = NULL, target = NULL, breaks_list = NULL, woe_name = FALSE) {
# bins_table
if (is.null(bins_table)) {
if (!is.null(breaks_list)) {
bins_table = get_bins_table(dat = dat, x = x,
target = target, breaks_list = breaks_list,
note = FALSE)
} else {
stop("bins_table & breaks_list are both missing.\n")
}
}
bins_tbl = bins_table[which(as.character(bins_table[, "Feature"]) == names(dat[x])),
c("Feature", "cuts", "bins", "woe")]
if (woe_name) {
woe_names = paste(names(dat[x]), "woe", sep = "_")
} else {
woe_names = names(dat[x])
}
if (length(bins_tbl) > 0 && all(as.character(bins_tbl[, "Feature"]) != "Total")) {
bins = split_bins(dat = dat, x = x, breaks = bins_tbl[, c("cuts")], bins_no = TRUE)
for (i in 1:nrow(bins_tbl)) {
woe_ind = which(as.character(bins) == as.character(bins_tbl[i, "bins"]))
if(length(woe_ind) > 0){
dat[woe_ind, woe_names] = bins_tbl[i, "woe"]
}
}
dat[, woe_names] = as.numeric(dat[, woe_names])
}
return(dat[woe_names])
}
#' One-Hot Encoding
#'
#' \code{one_hot_encoding} is for converting the factor or character variables into multiple columns
#' @param dat A dat frame.
#' @param cat_vars The name or Column index list to be one_hot encoded.
#' @param merge_cat Logical. If TRUE, to merge categories greater than 8, default is TRUE.
#' @param ex_cols Variables to be excluded, use regular expression matching
#' @param na_act Logical,If true, the missing value is processed, if FALSE missing value is omitted .
#' @param note Logical.Outputs info.Default is TRUE.
#' @return A dat frame with the one hot encoding applied to all the variables with type as factor or character.
#' @seealso \code{\link{de_one_hot_encoding}}
#' @examples
#' dat1 = one_hot_encoding(dat = UCICreditCard,
#' cat_vars = c("SEX", "MARRIAGE"),
#' merge_cat = TRUE, na_act = TRUE)
#' dat2 = de_one_hot_encoding(dat_one_hot = dat1,
#' cat_vars = c("SEX","MARRIAGE"), na_act = FALSE)
#'
#' @importFrom cli cat_rule cat_line cat_bullet
#' @export
one_hot_encoding = function(dat, cat_vars = NULL, ex_cols = NULL,
merge_cat = TRUE, na_act = TRUE, note = FALSE) {
if (note)cat_line("-- One-hot encoding for charactor or factor", col = love_color("deep_green"))
dat = checking_data(dat)
if (is.null(cat_vars)) {
cat_vars = get_names(dat = dat, types = c("character", "factor"), ex_cols = ex_cols)
}
if (length(cat_vars) > 0) {
if (na_act) {
dat[, cat_vars] = process_nas(dat[cat_vars], note = FALSE)
}
if (merge_cat) {
dat[, cat_vars] = merge_category(dat[cat_vars], note = FALSE)
}
for (i in cat_vars) {
if (is.factor(dat[, i]) || is.character(dat[, i])) {
col_name = i
dat[, i] = sapply(dat[, i], function(x) gsub("[^\u4e00-\u9fa5,^a-zA-Z,^0-9]", "_", x))
cat_list = unique(dat[, i])
encode_cols = length(cat_list)
#Create individual column for every unique value in the variable
for (j in 1:encode_cols) {
one_hot_name = (paste(col_name, ".", cat_list[j], ".",sep = ""))
dat[, one_hot_name] = ifelse(dat[, i] == cat_list[j] & !is.na(dat[, i]), 1, 0)
}
}
}
dat = dat[, - which(names(dat) %in% cat_vars)]
}
return(dat)
}
#' Recovery One-Hot Encoding
#'
#' \code{de_one_hot_encoding} is for one-hot encoding recovery processing
#' @param dat_one_hot A dat frame with the one hot encoding variables
#' @param cat_vars variables to be recovery processed, default is null, if null, find these variables through regular expressions .
#' @param na_act Logical,If true, the missing value is assigned as "missing", if FALSE missing value is omitted, the default is TRUE.
#' @param note Logical.Outputs info.Default is TRUE.
#' @return A dat frame with the one hot encoding recorery character variables
#' @seealso \code{\link{one_hot_encoding}}
#' @examples
#' #one hot encoding
#' dat1 = one_hot_encoding(dat = UCICreditCard,
#' cat_vars = c("SEX", "MARRIAGE"),
#' merge_cat = TRUE, na_act = TRUE)
#' #de one hot encoding
#' dat2 = de_one_hot_encoding(dat_one_hot = dat1,
#' cat_vars = c("SEX","MARRIAGE"),
#' na_act = FALSE)
#' @importFrom cli cat_rule cat_line cat_bullet
#' @export
de_one_hot_encoding = function(dat_one_hot, cat_vars = NULL, na_act = TRUE,note = FALSE) {
if(note)cat_line("-- Recoverying one-hot encoding for charactor or factor.\n", col = love_color("deep_green"))
dat_one_hot = checking_data(dat_one_hot)
if (is.null(cat_vars)) {
char_names = one_hot_names = one_hot_names = c()
for (i in 1:length(dat_one_hot)) {
char_names[i] = sub(paste0("\\.$"), "", colnames(dat_one_hot)[i])
if (!is.null(char_names[i]) && !is.na(char_names[i]) &&
char_names[i] == colnames(dat_one_hot)[i]) {
char_names[i] = NA
}
one_hot_names[i] = try(strsplit(char_names[i], "[.]")[[1]][1], silent = TRUE)
}
cat_vars = unique(one_hot_names[!is.na(one_hot_names)])
}
one_hot_vars = unlist(sapply(cat_vars, function(x) grep(paste0(x, "\\.", "\\S{1,100}", "\\."),
paste(colnames(dat_one_hot)))))
de_cat_vars = intersect(cat_vars, unique(gsub("\\d{1}$", "", names(one_hot_vars))))
if (length(de_cat_vars) > 0) {
dat_one_hot[, de_cat_vars] = lapply(de_cat_vars, function(x) {
grx = cv_cols = names_1 = re_code = NULL
grx = paste0(x, "\\.", "\\S{1,100}", "\\.$")
cv_cols = grep(grx, paste(colnames(dat_one_hot)))
names_1 = colnames(dat_one_hot)[cv_cols]
if (na_act) {
re_code = rep("other", nrow(dat_one_hot))
} else {
re_code = rep(NA, nrow(dat_one_hot))
}
for (i in 1:(length(names_1))) {
re_code[which(dat_one_hot[cv_cols][i] == 1)] = strsplit(names_1[i], "[.]")[[1]][2]
}
return(re_code)
}
)
names(dat_one_hot[, de_cat_vars]) = de_cat_vars
dat_one_hot = data.frame(dat_one_hot, stringsAsFactors = FALSE)[, - one_hot_vars]
}
return(dat_one_hot)
}
#' Time Format Transfering
#'
#' \code{time_transfer} is for transfering time variables to time format.
#' @param dat A data frame
#' @param date_cols Names of time variable or regular expressions for finding time variables. Default is "DATE$|time$|date$|timestamp$|stamp$".
#' @param ex_cols Names of excluded variables. Regular expressions can also be used to match variable names. Default is NULL.
#' @param note Logical, outputs info. Default is TRUE.
#' @return A data.frame with transfermed time variables.
#' @examples
#' #transfer a variable.
#' dat = time_transfer(dat = lendingclub,date_cols = "issue_d")
#' class(dat[,"issue_d"])
#' #transfer a group of variables with similar name.
#' #transfer all time variables.
#' dat = time_transfer(dat = lendingclub[1:3],date_cols = "_d$")
#' class(dat[,"issue_d"])
#' @export
time_transfer = function (dat, date_cols = NULL, ex_cols = NULL, note = FALSE)
{
dat = checking_data(dat, note = FALSE)
if (note)
cat_line("-- Formating time variables", col = love_color("dark_green"))
x_list = get_x_list(x_list = NULL, dat_train = dat, dat_test = NULL,
ex_cols = ex_cols)
date_cols1 = NULL
if (!is.null(date_cols)) {
date_cols1 = names(dat[x_list])[colnames(dat[x_list]) %islike%
date_cols]
} else {
date_cols1 = names(dat[x_list])
}
df_date = dat[date_cols1]
df_date = df_date[!colAllnas(df_date)]
df_date = df_date[!sapply(df_date, is_date)]
if (dim(df_date)[2] != 0) {
df_date_cols = names(df_date)
t_sample = list()
t_len = list()
t_sam = NULL
tryCatch({
for (x in 1:ncol(df_date)) {
t_sam = vapply(as.character(sample(na.omit(df_date[[x]]),
10, replace = TRUE)), function(i) {
if (n_char(i) >= 6) {
n_char(i)
} else {
0
}
}, FUN.VALUE = numeric(1))
t_sam = unlist(t_sam[which(unlist(t_sam) > 4)])
if(length(t_sam) == 0){
t_sam = vapply(as.character(sample(na.omit(df_date[[x]]),
100, replace = TRUE)), function(i) {
if (n_char(i) >= 6) {
n_char(i)
} else {
0
}
}, FUN.VALUE = numeric(1))
t_sam = unlist(t_sam[which(unlist(t_sam) > 4)])
}
if(length(t_sam) == 0){
t_sam = vapply(as.character(sample(na.omit(df_date[[x]]),
1000, replace = TRUE)), function(i) {
if (n_char(i) >= 6) {
n_char(i)
} else {
0
}
}, FUN.VALUE = numeric(1))
t_sam = unlist(t_sam[which(unlist(t_sam) > 4)])
}
if(length(t_sam) == 0){
t_sam = vapply(as.character(na.omit(df_date[[x]])), function(i) {
if (n_char(i) >= 6) {
n_char(i)
} else {
0
}
}, FUN.VALUE = numeric(1))
t_sam = unlist(t_sam[which(unlist(t_sam) > 4)])
}
if (length(t_sam) > 0) {
t_sample[[x]] = min(t_sam, na.rm = TRUE)
}
else {
t_sample[[x]] = 0
}
t_len[[x]] = as.character(names(t_sam)[which(n_char(names(t_sam)) ==
t_sample[[x]])][1])
}
}, error = function(e) {
cat("ERROR :", conditionMessage(e), "\n")
}, warning = function(w) {
""
})
date_cols2 = which(t_sample != 0 & t_sample != Inf)
for (x in date_cols2) {
if (!is.numeric(df_date[[x]]) & t_sample[[x]] > 10 &
grepl(":", t_len[[x]]) & length(gregexpr("\\.",
t_len[[x]])[[1]]) == 1 & grepl("\\.", t_len[[x]])) {
df_date[[x]] = gsub("\\.0$", "",df_date[[x]])
t_len[[x]] = gsub(" |\\.0$", "",t_len[[x]])
}
if (t_sample[[x]] >= 5 & t_sample[[x]] <= 6 &
grepl(pattern = "^[2]{1}[0]{1}[0-3]{1}[0-9]{1}-[0-9]{1}$|^[2]{1}[0]{1}[0-3]{1}[0-9]{1}-[0-9]{1,2}$|^[1]{1}[9]{1}[0-9]{2}-[0-9]{1,2}$",
x = substr(t_len[[x]], 1, 10))) {
df_date[[x]] = paste(df_date[[x]], "-01")
df_date[[x]] = as.Date(as.character(df_date[[x]]),
"%Y-%m-%d")
}
if (t_sample[[x]] >= 5 & t_sample[[x]] <= 6 & grepl(pattern = "^[2]{1}[0]{1}[0-3]{1}[0-9]{1}/[0-9]{1}$|^[2]{1}[0]{1}[0-3]{1}[0-9]{1}/[0-9]{1,2}$|^[1]{1}[9]{1}[0-9]{2}/[0-9]{1,2}$",
x = substr(t_len[[x]], 1, 10))) {
df_date[[x]] = paste(df_date[[x]], "/01")
df_date[[x]] = as.Date(as.character(df_date[[x]]),
"%Y/%m/%d")
}
if (t_sample[[x]] >= 8 & t_sample[[x]] <= 10 & grepl(pattern = "^[2]{1}[0]{1}[0-3]{1}[0-9]{1}-[0-9]{1}-[0-9]{1,2}$|^[2]{1}[0]{1}[0-3]{1}[0-9]{1}-[0-9]{1,2}-[0-3]{1}[0-9]{1}$|^[1]{1}[9]{1}[0-9]{2}-[0-9]{1,2}-[0-3]{1}[0-9]{1}$",
x = substr(t_len[[x]], 1, 10))) {
df_date[[x]] = as.Date(as.character(df_date[[x]]),
"%Y-%m-%d")
}
if (t_sample[[x]] > 10 & grepl(pattern = "^[2]{1}[0]{1}[0-3]{1}[0-9]{1}-[0-9]{1}-[0-3]{1}[0-9]{1,2}$|^[2]{1}[0]{1}[0-3]{1}[0-9]{1}-[0-9]{1,2}-[0-3]{1}[0-9]{1}$|^[1]{1}[9]{1}[0-9]{2}-[0-9]{1,2}-[0-3]{1}[0-9]{1}$",
x = substr(t_len[[x]], 1, 10)) & grepl(pattern = "[0-1]{1}[0-9]{1}:[0-9]{2}$",
substr(t_len[[x]], 11, n_char(t_len[[x]])))) {
df_date[[x]] = paste0(df_date[[x]], ":00")
df_date[[x]] = as.POSIXct(as.character(df_date[[x]]),
format = "%Y-%m-%d %H:%M:%S")
}
if (t_sample[[x]] > 10 & grepl(pattern = "^[2]{1}[0]{1}[0-3]{1}[0-9]{1}-[0-9]{1}-[0-3]{1}[0-9]{1,2}$|^[2]{1}[0]{1}[0-3]{1}[0-9]{1}-[0-9]{1,2}-[0-3]{1}[0-9]{1}$|^[1]{1}[9]{1}[0-9]{2}-[0-9]{1,2}-[0-3]{1}[0-9]{1}$",
x = substr(t_len[[x]], 1, 10)) & grepl(pattern = "[0-1]{1}[0-9]{1}:[0-9]{2}:[0-9]{2}",
substr(t_len[[x]], 11, n_char(t_len[[x]])))) {
df_date[[x]] = as.POSIXct(as.character(df_date[[x]]),
format = "%Y-%m-%d %H:%M:%S")
}
if (t_sample[[x]] >= 8 & t_sample[[x]] <= 9 & grepl(pattern = "^[2]{1}[0]{1}[0-5]{1}[0-9]{1}[0-9]{1}[0-3]{1}[0-9]{1,2}$|^[2]{1}[0]{1}[0-5]{1}[0-9]{1}[0-9]{1,2}[0-3]{1}[0-9]{1}$|^[1]{1}[9]{1}[0-9]{2}[0-9]{1,2}[0-3]{1}[0-9]{1}$",
x = t_len[[x]])) {
df_date[[x]] = as.Date(as.character(df_date[[x]]),
"%Y%m%d")
}
if (t_sample[[x]] > 10 & grepl(pattern = "^[2]{1}[0]{1}[0-5]{1}[0-9]{1}[0-9]{1}[0-3]{1}[0-9]{1,2}$|^[2]{1}[0]{1}[0-5]{1}[0-9]{1}[0-9]{1,2}[0-3]{1}[0-9]{1}$|^[1]{1}[9]{1}[0-9]{2}[0-9]{1,2}[0-3]{1}[0-9]{1}$",
x = substr(t_len[[x]], 1, 8)) & grepl(pattern = "[0-9]{1,2}:[0-9]{2}:[0-9]{2}$",
substr(t_len[[x]], 9, n_char(t_len[[x]])))) {
df_date[[x]] = gsub(" ", "", df_date[[x]])
df_date[[x]] = paste(substr(df_date[[x]], 1,
8), substr(df_date[[x]], 9, n_char(df_date[[x]])))
df_date[[x]] = as.POSIXct(as.character(df_date[[x]]),
format = "%Y%m%d %H:%M:%S")
}
if (t_sample[[x]] > 10 & grepl(pattern = "^[2]{1}[0]{1}[0-5]{1}[0-9]{1}[0-9]{1}[0-3]{1}[0-9]{1,2}$|^[2]{1}[0]{1}[0-5]{1}[0-9]{1}[0-9]{1,2}[0-3]{1}[0-9]{1}$|^[1]{1}[9]{1}[0-9]{2}[0-9]{1,2}[0-3]{1}[0-9]{1}$",
x = substr(t_len[[x]], 1, 8)) & grepl(pattern = "[0-9]{1,2}:[0-9]{2}$",
substr(t_len[[x]], 9, n_char(t_len[[x]])))) {
df_date[[x]] = gsub(" ", "", df_date[[x]])
df_date[[x]] = paste(substr(df_date[[x]], 1,
8), substr(df_date[[x]], 9, n_char(df_date[[x]])),
":00")
df_date[[x]] = as.POSIXct(as.character(df_date[[x]]),
format = "%Y%m%d %H:%M:%S")
}
if (t_sample[[x]] >= 8 & t_sample[[x]] <= 10 & grepl(pattern = "^[2]{1}[0]{1}[0-3]{1}[0-9]{1}/[0-9]{1,2}/[0-9]{1,2}$|^[1]{1}[9]{1}[0-9]{2}/[0-9]{1,2}/[0-9]{1,2}$",
x = substr(t_len[[x]], 1, 10))) {
df_date[[x]] = as.Date(as.character(df_date[[x]]),
"%Y/%m/%d")
}
if (t_sample[[x]] > 10 & grepl(pattern = "^[2]{1}[0]{1}[0-3]{1}[0-9]{1}/[0-9]{1,2}/[0-9]{1,2}$|^[2]{1}[0]{1}[0-3]{1}[0-9]{1}/[0-9]{1}/[0-9]{1}$|^[2]{1}[0]{1}[0-3]{1}[0-9]{1}/[0-9]{1,2}/[0-3]{1}[0-9]{1}$|^[2]{1}[0]{1}[0-3]{1}[0-9]{1}/[0-9]{1}/[0-3]{1}[0-9]{1}$|^[1]{1}[9]{1}[0-9]{2}/[0-9]{1,2}/[0-3]{1}[0-9]{1}$",
x = substr(t_len[[x]], 1, 10)) & grepl(pattern = "[0-9]{1,2}:[0-9]{2}$",
substr(t_len[[x]], 11, n_char(t_len[[x]])))) {
df_date[[x]] = paste0(df_date[[x]], ":00")
df_date[[x]] = as.POSIXct(as.character(df_date[[x]]),
format = "%Y/%m/%d %H:%M:%S")
}
if (t_sample[[x]] > 10 & grepl(pattern = "^[2]{1}[0]{1}[0-3]{1}[0-9]{1}/[0-9]{1,2}/[0-9]{1,2}$|^[2]{1}[0]{1}[0-3]{1}[0-9]{1}/[0-9]{1}/[0-9]{1}$|^[2]{1}[0]{1}[0-3]{1}[0-9]{1}/[0-9]{1,2}/[0-3]{1}[0-9]{1}$|^[2]{1}[0]{1}[0-3]{1}[0-9]{1}/[0-9]{1}/[0-3]{1}[0-9]{1}$|^[1]{1}[9]{1}[0-9]{2}/[0-9]{1,2}/[0-3]{1}[0-9]{1}$",
x = substr(t_len[[x]], 1, 8)) & grepl(pattern = "[0-9]{1,2}:[0-9]{2}$",
substr(t_len[[x]], 9, n_char(t_len[[x]])))) {
df_date[[x]] = paste0(df_date[[x]], ":00")
df_date[[x]] = as.POSIXct(as.character(df_date[[x]]),
format = "%Y/%m/%d %H:%M:%S")
}
if (t_sample[[x]] > 10 & grepl(pattern = "^[2]{1}[0]{1}[0-3]{1}[0-9]{1}/[0-9]{1,2}/[0-9]{1,2}$|^[2]{1}[0]{1}[0-3]{1}[0-9]{1}/[0-9]{1}/[0-9]{1}$|^[2]{1}[0]{1}[0-3]{1}[0-9]{1}/[0-9]{1,2}/[0-3]{1}[0-9]{1}$|^[2]{1}[0]{1}[0-3]{1}[0-9]{1}/[0-9]{1}/[0-3]{1}[0-9]{1}$|^[1]{1}[9]{1}[0-9]{2}/[0-9]{1,2}/[0-3]{1}[0-9]{1}$",
x = substr(t_len[[x]], 1, 10)) & grepl(pattern = "[0-9]{1,2}:[0-9]{2}:[0-9]{2}$",
substr(t_len[[x]], 11, n_char(t_len[[x]])))) {
df_date[[x]] = as.POSIXct(as.character(df_date[[x]]),
format = "%Y/%m/%d %H:%M:%S")
}
if (t_sample[[x]] == 10 & grepl(pattern = "^[1]{1}[0-9]{9}",
x = t_len[[x]])) {
df_date[[x]] = as.POSIXct(as.numeric(df_date[[x]]),
origin = "1970-01-01 00:00:00")
}
if (t_sample[[x]] == 13 & grepl(pattern = "^[1]{1}[0-9]{12}",
x = t_len[[x]])) {
df_date[[x]] = as.POSIXct(as.numeric(df_date[[x]])/1000,
origin = "1970-01-01 00:00:00")
}
}
dat[df_date_cols] = df_date
rm(df_date)
}
else {
dat = dat
}
dat
}
#' Derivation of Behavioral Variables
#'
#' This function is used for derivating behavioral variables and is not intended to be used by end user.
#'
#' @param dat A data.frame contained only predict variables.
#' @param grx Regular expressions used to match variable names.
#' @param grx_x Regular expression used to match a group of variable names.
#' @param td Number of variables to derivate.
#' @param ID The name of ID of observations or key variable of data. Default is NULL.
#' @param ex_cols A list of excluded variables. Regular expressions can also be used to match variable names. Default is NULL.
#' @param x_list Names of independent variables.
#' @param der Variables to derivate
#' @param parallel Logical, parallel computing. Default is FALSE.
#' @param note Logical, outputs info. Default is TRUE.
#' @details The key to creating a good model is not the power of a specific modelling technique, but the breadth and depth of derived variables that represent a higher level of knowledge about the phenomena under examination.
#' @importFrom data.table setDT := rbindlist
#' @export
derived_ts_vars = function(dat, grx = NULL, td = NULL, ID = NULL, ex_cols = NULL, x_list = NULL,
der = c("cvs", "sums", "means", "maxs", "max_mins",
"time_intervals", "cnt_intervals", "total_pcts",
"cum_pcts", "partial_acfs"),
parallel = TRUE, note = TRUE) {
if (note) cat_line(paste("--", "Derived variables of", paste(der), " .\n"), col = love_color("dark_green"))
dat = checking_data(dat, note = FALSE)
if (parallel) {
parallel = start_parallel_computing(parallel)
stopCluster = TRUE
} else {
parallel = stopCluster = FALSE
}
if (is.null(ID)) {
dat$ID = as.character(rownames(dat))
ID = 'ID'
} else {
dat[, ID] = as.character(dat[, ID])
}
is_not_numeric = which(sapply(dat, function(x) is.element("integer64", class(x))))
dat[is_not_numeric] = sapply(dat[is_not_numeric], as.numeric)
on.exit(if (parallel & stopCluster) stop_parallel_computing(attr(parallel, "cluster")))
i. = j. = NULL
if (!parallel) {
df_cv_list = lapply(unlist(grx), function(grx_x) derived_ts(dat, grx_x = grx_x, td = td,
ID = ID, ex_cols = ex_cols, x_list = x_list, der = der))
if (length(df_cv_list) > 1) {
df_cv_list = multi_left_join(df_list = df_cv_list, by = ID)
} else {
df_cv_list = as.data.frame(df_cv_list)
}
} else {
df_cv_list = foreach(i. = unlist(grx),
.errorhandling = c('pass')) %dopar% {
try(do.call(derived_ts, args = list(dat = dat, grx_x = i., td = td, ID = ID, ex_cols = ex_cols, x_list = x_list, der = der)), silent = TRUE)
}
if (length(df_cv_list) > 1) {
df_cv_list = multi_left_join(df_list = df_cv_list, by = ID)
} else {
df_cv_list = as.data.frame(df_cv_list)
}
}
return(df_cv_list)
}
#' @rdname derived_ts_vars
#' @export
derived_ts = function(dat, grx_x = NULL, x_list = NULL, td = NULL, ID = NULL, ex_cols = NULL,
der = c("cvs", "sums", "means", "maxs", "max_mins",
"time_intervals", "cnt_intervals", "total_pcts",
"cum_pcts", "partial_acfs")) {
dat = checking_data(dat, note = FALSE)
if (is.null(ID)) {
dat$ID = rownames(dat)
ID = 'ID'
}
if (is.null(x_list)) {
num_x_list = get_names(dat = dat,
types = c('numeric', 'integer', 'double'),
ex_cols = c(ID, ex_cols), get_ex = FALSE)
} else {
num_x_list = x_list
}
if (!is.null(grx_x)) {
if (is.null(td)) {
cv_cols = num_x_list[grep(grx_x, paste(num_x_list))]
} else {
cv_cols = num_x_list[grep(grx_x, paste(num_x_list))[1:td]]
}
} else {
if (is.null(td) || td > length(num_x_list)) {
cv_cols = num_x_list
} else {
cv_cols = num_x_list[1:td]
}
}
cv_cols = cv_cols[!is.na(cv_cols)]
dat = dat[c(ID, cv_cols)]
if (length(cv_cols) > 0) {
dat = as.data.table(dat)
name_n = orignal_nam = sim_nam = str_num = c()
orignal_nam = names(dat[, cv_cols, with = FALSE])
str_num = as.numeric(str_match(str_r = orignal_nam, pattern = "\\d+"))
if (!any(is.na(str_num)) && !is.null(td) && length(str_num) == td) {
name_n = paste(min(str_num), max(str_num), sep = "to")
}
if (is.null(name_n)) {
name_n = "ts"
}
sim_nam = paste(unique(lapply(1:(length(orignal_nam) - 1),
function(x) sim_str(orignal_nam[x], orignal_nam[x + 1], sep = "_|[0-9]")))[[1]], collapse = "_")
if (any(der == "cvs")) {
dat = dat[, paste(sim_nam, name_n, "cvs", sep = "_") := ifelse(rowAllnas(dat[, cv_cols, with = FALSE]), NA,
rowCVs(dat[, cv_cols, with = FALSE], na.rm = TRUE))]
}
if (any(der == "sums")) {
dat = dat[, paste(sim_nam, name_n, "sums", sep = "_") := ifelse(rowAllnas(dat[, cv_cols, with = FALSE]), NA,
rowSums(dat[, cv_cols, with = FALSE], na.rm = TRUE))]
}
if (any(der == "means")) {
dat = dat[, paste(sim_nam, name_n, "means", sep = "_") := ifelse(rowAllnas(dat[, cv_cols, with = FALSE]), NA,
rowMeans(dat[, cv_cols, with = FALSE], na.rm = TRUE))]
}
if (any(der == "maxs")) {
dat = dat[, paste(sim_nam, name_n, "maxs", sep = "_") := ifelse(rowAllnas(dat[, cv_cols, with = FALSE]), NA,
rowMaxs(dat[, cv_cols, with = FALSE]))]
}
if (any(der == "max_mins")) {
dat = dat[, paste(sim_nam, name_n, "max_mins", sep = "_") := ifelse(rowAllnas(dat[, cv_cols, with = FALSE]), NA,
rowMaxMins(dat[, cv_cols, with = FALSE], na.rm = TRUE))]
}
if (any(der == "partial_acfs")) {
dat = dat[, paste(sim_nam, name_n, "partial_acfs", sep = "_") := derived_partial_acf(dat[, cv_cols, with = FALSE])]
}
if (any(der == "time_intervals")) {
dat = dat[, paste(orignal_nam, "time_intervals", sep = "_") := derived_interval(dat[, cv_cols, with = FALSE],
interval_type = "time_interval")]
}
if (any(der == "cnt_intervals")) {
dat = dat[, paste(orignal_nam, "cnt_intervals", sep = "_") := derived_interval(dat[, cv_cols, with = FALSE],
interval_type = "cnt_interval")]
}
if (any(der == "total_pcts")) {
dat = dat[, paste(orignal_nam, "total_pcts", sep = "_") := derived_pct(dat[, cv_cols, with = FALSE],
pct_type = "total_pct")]
}
if (any(der == "cum_pcts")) {
dat = dat[, paste(orignal_nam, "cum_pcts", sep = "_") := derived_pct(dat[, cv_cols, with = FALSE],
pct_type = "cum_pct")]
}
}
dat = quick_as_df(dat)
return(dat)
}
#' Process time series data
#'
#' This function is used for time series data processing.
#'
#' @param dat A data.frame contained only predict variables.
#' @param group The group of behavioral or status variables.
#' @param ID The name of ID of observations or key variable of data. Default is NULL.
#' @param time The name of variable which is time when behavior was happened.
#' @details The key to creating a good model is not the power of a specific modelling technique, but the breadth and depth of derived variables that represent a higher level of knowledge about the phenomena under examination.
#' @importFrom data.table setDT := dcast.data.table shift .SD
#' @examples
#' dat = data.frame(id = c(1,1,1,2,2,3,3,3,4,4,4,4,4,5,5,6,7,7,
#' 8,8,8,9,9,9,10,10,11,11,11,11,11,11),
#' terms = c('a','b','c','a','c','d','d','a',
#' 'b','c','a','c','d','a','c',
#' 'd','a','e','f','b','c','f','b',
#' 'c','h','h','i','c','d','g','k','k'),
#' time = c(8,3,1,9,6,1,4,9,1,3,4,8,2,7,1,
#' 3,4,1,8,7,2,5,7,8,8,2,1,5,7,2,7,3))
#'
#' time_series_proc(dat = dat, ID = 'id', group = 'terms',time = 'time')
#' @export
time_series_proc = function(dat, ID = NULL, group = NULL, time = NULL) {
time_interval = NULL
if (is.null(time)) stop("time variable is missing.\n")
dat$time = as.numeric(dat[[time]])
if (!is.null(ID)) {
dat$ID = as.character(dat[[ID]])
} else {
dat$ID = as.character(rownames(dat))
ID = "ID"
}
if (!is.null(group)) {
dat$group = as.character(dat[[group]])
} else {
dat$group = "1"
}
dat = as.data.table(dat)
dat = dat[,.SD[order(time)],by = 'ID']
dat[,time_interval := as.numeric(data.table::shift(time,
fill = max(time,na.rm = TRUE),
type = "lead") - time),by = 'ID']
dat = dcast.data.table(dat, ID ~ group,
fun.aggregate = list(cnt_x, sum_x, max_x, min_x, avg_x),
value.var = c('time_interval'))
colnames(dat)[1] = ID
quick_as_df(dat)
}
#' gather or aggregate data
#'
#' This function is used for gathering or aggregating data.
#'
#' @param dat A data.frame contained only predict variables.
#' @param x_list The names of variables to gather.
#' @param ID The name of ID of observations or key variable of data. Default is NULL.
#' @param FUN The function of gathering method.
#' @details The key to creating a good model is not the power of a specific modelling technique, but the breadth and depth of derived variables that represent a higher level of knowledge about the phenomena under examination.
#' @importFrom stats aggregate
#' @examples
#' dat = data.frame(id = c(1,1,1,2,2,3,3,3,4,4,4,4,4,5,5,6,7,7,
#' 8,8,8,9,9,9,10,10,11,11,11,11,11,11),
#' terms = c('a','b','c','a','c','d','d','a',
#' 'b','c','a','c','d','a','c',
#' 'd','a','e','f','b','c','f','b',
#' 'c','h','h','i','c','d','g','k','k'),
#' time = c(8,3,1,9,6,1,4,9,1,3,4,8,2,7,1,
#' 3,4,1,8,7,2,5,7,8,8,2,1,5,7,2,7,3))
#'
#' gather_data(dat = dat, x_list = "time", ID = 'id', FUN = sum_x)
#' @export
gather_data = function(dat, x_list = NULL, ID = NULL, FUN = sum_x) {
dat = checking_data(dat)
if (is.null(x_list)) {
x_list = get_names(dat, types = c("numeric","double","integer"),ex_cols = ID)
}
if (!is.null(ID)) {
dat$ID = as.character(dat[[ID]])
} else {
dat$ID = as.character(rownames(dat))
ID = "ID"
}
dat_sum = stats::aggregate(dat[,x_list],
by = list(ID = dat$ID), FUN = FUN)
names(dat_sum)[1] = ID
return(dat_sum)
}
#' Process group numeric variables
#'
#' This function is used for grouped numeric data processing.
#'
#' @param dat A data.frame contained only predict variables.
#' @param group The group of behavioral or status variables.
#' @param ID The name of ID of observations or key variable of data. Default is NULL.
#' @param num_var The name of numeric variable to process.
#' @importFrom data.table setDT := dcast.data.table shift
#' @examples
#' dat = data.frame(id = c(1,1,1,2,2,3,3,3,4,4,4,4,4,5,5,6,7,7,
#' 8,8,8,9,9,9,10,10,11,11,11,11,11,11),
#' terms = c('a','b','c','a','c','d','d','a',
#' 'b','c','a','c','d','a','c',
#' 'd','a','e','f','b','c','f','b',
#' 'c','h','h','i','c','d','g','k','k'),
#' time = c(8,3,1,9,6,1,4,9,1,3,4,8,2,7,1,
#' 3,4,1,8,7,2,5,7,8,8,2,1,5,7,2,7,3))
#'
#' time_series_proc(dat = dat, ID = 'id', group = 'terms',time = 'time')
#' @export
var_group_proc = function(dat, ID = NULL, group = NULL, num_var = NULL){
dat = quick_as_df(dat)
if(!is.null(ID)){
dat$ID = as.character(dat[[ID]])
}else{
dat$ID = as.character(rownames(dat))
ID = "ID"
}
if (!is.null(group)){
dat$group = as.character(dat[[group]])
}else{
dat$group = "1"
}
if (!is.null(num_var)){
dat[[num_var]] = as.numeric(dat[[num_var]])
} else{
dat$num_var = 1
num_var = "num_var"
}
dat = as.data.table(dat)
dat = dcast.data.table(dat, ID ~ group,
fun.aggregate = list(cnt_x, sum_x, max_x, min_x, avg_x), value.var = c(num_var))
colnames(dat)[1] = ID
quick_as_df(dat)
}
#' Processing of Time or Date Variables
#'
#' This function is not intended to be used by end user.
#'
#' @param df_tm A data.frame
#' @param x Time variable.
#' @param enddate End time.
#' @param units Units of diff_time, "secs", "mins", "hours", "days", "weeks" is available.
#' @export
#' @importFrom data.table hour setnames
time_vars_process = function(df_tm = df_tm, x, enddate = NULL,
units = c("secs", "mins", "hours", "days", "weeks")) {
if (is_date(df_tm[[x]]) & x != enddate & (!is.null(enddate) && is_date(df_tm[[enddate]])) &
length(units) > 0){
newname = c()
for(unit in units){
if(length(unit) > 0 && is.element("days",unit)) {
df_tm = within(df_tm, {
diff_days = floor(as.numeric(difftime(df_tm[[enddate]], df_tm[[x]],units = "days")))
})
newname[unit] = paste0(x, "_to_", enddate, "_diff_days")
df_tm = re_name(df_tm, oldname = c("diff_days"), newname = newname[unit])
}else{
if(length(unit) > 0 && is.element("secs",unit)) {
df_tm = within(df_tm, {
diff_secs = floor(as.numeric(difftime(df_tm[[enddate]], df_tm[[x]],units = "secs")))
})
newname[unit] = paste0(x, "_to_", enddate, "_diff_secs")
df_tm = re_name(df_tm, oldname = c("diff_secs"), newname = newname[unit])
}else{
if(length(unit) > 0 && is.element("mins",unit)) {
df_tm = within(df_tm, {
diff_mins = floor(as.numeric(difftime(df_tm[[enddate]], df_tm[[x]],units = "mins")))
})
newname[unit] = paste0(x, "_to_", enddate, "_diff_mins")
df_tm = re_name(df_tm, oldname = c("diff_mins"), newname = newname[unit])
}else{
if(length(unit) > 0 && is.element("hours",unit)) {
df_tm = within(df_tm, {
diff_hours = floor(as.numeric(difftime(df_tm[[enddate]], df_tm[[x]],units = "hours")))
})
newname[unit] = paste0(x, "_to_", enddate, "_diff_hours")
df_tm = re_name(df_tm, oldname = c("diff_hours"), newname = newname[unit])
}else{
if(length(unit) > 0 && is.element("weeks",unit)) {
df_tm = within(df_tm, {
diff_weeks = floor(as.numeric(difftime(df_tm[[enddate]], df_tm[[x]],units = "weeks")))
})
newname[unit] = paste0(x, "_to_", enddate, "_diff_weeks")
df_tm = re_name(df_tm, oldname = c("diff_weeks"), newname = newname[unit])
}
}
}
}
}
}
return(df_tm[newname])
}
}
#' time_variable
#'
#' This function is not intended to be used by end user.
#'
#' @param dat A data.frame.
#' @param date_cols Time variables.
#' @param enddate End time.
#' @param units Units of diff_time, "secs", "mins", "hours", "days", "weeks" is available.
#' @export
time_variable = function(dat, date_cols = NULL, enddate = NULL,
units = c("secs", "mins", "hours", "days", "weeks")) {
dat = checking_data(dat = dat)
date_cols1 = NULL
if (!is.null(date_cols)) {
date_cols1 = names(dat)[colnames(dat) %islike% c(enddate, date_cols)]
} else {
date_cols1 = names(dat)
}
df_date = dat[date_cols1]
df_date = time_transfer(dat = df_date, date_cols = c(enddate, date_cols))
df_date = df_date[!colAllnas(df_date)]
df_tm = df_date[sapply(df_date, is_date)]
time_vars_list = lapply(date_cols1, function(x) time_vars_process(df_tm = df_tm, x, enddate
= enddate,units = units))
index = 0;
j = 1
for (i in 1:length(time_vars_list)) {
if (is.null(time_vars_list[[i]])) {
index[j] = i
j = j + 1
}
}
tm_vars_tbl = as.data.frame(Reduce("cbind", time_vars_list[-index]))
dat = cbind(dat, tm_vars_tbl)
return(dat)
}
#' Processing of Address Variables
#'
#' This function is not intended to be used by end user.
#'
#' @param df_city A data.frame.
#' @param x Variables of city,
#' @param city_class Class or levels of cities.
#' @export
city_varieble_process = function(df_city, x, city_class) {
if (class(df_city)[1] != "data.frame") {
df_city = quick_as_df(df_city)
}
df_city = within(df_city, {
city_level = NA
city_level[df_city[[x]] %alike% city_class[1]] = 1
city_level[df_city[[x]] %alike% city_class[2]] = 2
city_level[df_city[[x]] %alike% city_class[3]] = 3
city_level[df_city[[x]] %alike% city_class[4]] = 4
city_level[df_city[[x]] %alike% city_class[5]] = 5
city_level[df_city[[x]] %alike% city_class[6]] = 6
city_level[is.null(df_city[[x]]) | df_city[[x]] == "NULL" | df_city[[x]] == "" | df_city[[x]] == "missing"|
df_city[[x]] == "Missing" | city_level == "null" | df_city[[x]] == "NA"] = -1
city_level[is.na(city_level)] = -1
})
city_level_name = paste(x, "city_level", sep = "_")
df_city = re_name(dat = df_city, oldname = "city_level", newname = city_level_name)
return(df_city[city_level_name])
}
#' city_varieble
#'
#' This function is used for city variables derivation.
#'
#' @param df A data.frame.
#' @param city_cols Variables of city,
#' @param city_pattern Regular expressions, used to match city variable names. Default is "city$".
#' @param city_class Class or levels of cities.
#' @param parallel Logical, parallel computing. Default is TRUE.
#' @importFrom dplyr group_by mutate summarize summarise n count %>% filter left_join
#' @importFrom parallel detectCores clusterExport clusterCall makeCluster stopCluster
#' @importFrom doParallel registerDoParallel
#' @importFrom foreach foreach %dopar% %do% registerDoSEQ
#' @export
city_varieble = function(df = df, city_cols = NULL,
city_pattern = NULL, city_class = city_class,
parallel = TRUE) {
if (class(df)[1] != "data.frame") {
df = quick_as_df(df)
}
if (is.null(city_cols)) {
city_index = grepl(city_pattern, paste(colnames(df)))
city_cols = names(df[city_index])
} else {
city_cols = names(df[city_cols])
}
df_city = df[city_cols]
if (parallel) {
parallel = start_parallel_computing(parallel)
stopCluster = TRUE
} else {
parallel = stopCluster = FALSE
}
on.exit(if (parallel & stopCluster) stop_parallel_computing(attr(parallel, "cluster")))
i. = NULL
df_city_list = list()
if (!parallel) {
df_city_list = lapply(city_cols, function(x) city_varieble_process(df_city = df_city, x = x, city_class = city_class))
df_city_tbl = Reduce("cbind", df_city_list) %>% as.data.frame()
} else {
df_city_list = foreach(i. = city_cols, .combine = "c") %dopar% {
try(do.call(city_varieble_process,
args = list(df_city = df_city, x = i., city_class = city_class)),
silent = TRUE)
}
df_city_tbl = quick_as_df(df_city_list)
}
df = cbind(df, df_city_tbl)
return(df)
}
#' Replace Value
#'
#' \code{replace_value} is for replacing values of some variables .
#' \code{replace_value_x} is for replacing values of a variable.
#'
#' @param dat A data.frame.
#' @param x Name of variable to replace value.
#' @param x_list Names of variables to replace value.
#' @param x_pattern Regular expressions, used to match variable names.
#' @param replace_dat A data.frame contains value to replace.
#' @param MARGIN A vector giving the subscripts which the function will be applied over. E.g., for a matrix 1 indicates rows, 2 indicates columns, c(1, 2) indicates rows and columns. Where X has named dimnames, it can be a character vector selecting dimension names.
#' @param RE_NAME Logical, rename the replaced variable.
#' @param VALUE Values to replace.
#' @param parallel Logical, parallel computing. Default is TRUE.
#' @importFrom dplyr group_by mutate summarize summarise n count %>% filter left_join
#' @importFrom parallel detectCores clusterExport clusterCall makeCluster stopCluster
#' @importFrom doParallel registerDoParallel
#' @importFrom foreach foreach %dopar% %do% registerDoSEQ
#' @export
replace_value = function(dat = dat, x_list = NULL,
x_pattern = NULL, replace_dat, MARGIN = 2,
VALUE = if (MARGIN == 2) colnames(replace_dat) else rownames(replace_dat),
RE_NAME = TRUE,
parallel = FALSE) {
dat = checking_data(dat)
if (is.null(x_list)) {
if (!is.null(x_pattern)) {
x_index = grepl(x_pattern, paste(colnames(dat)))
x_list = names(dat[x_index])
} else {
x_list = get_names(dat)
}
}
if (parallel) {
parallel = start_parallel_computing(parallel)
stopCluster = TRUE
} else {
parallel = stopCluster = FALSE
}
on.exit(if (parallel & stopCluster) stop_parallel_computing(attr(parallel, "cluster")))
i. = NULL
dat_list = list()
if (!parallel) {
dat_list = lapply(x_list, function(x) replace_value_x(dat = dat, x = x, replace_dat = replace_dat, MARGIN = MARGIN, VALUE = VALUE, RE_NAME = RE_NAME))
dat_tbl = Reduce("cbind", dat_list) %>% quick_as_df()
} else {
dat_list = foreach(i. = x_list, .combine = "c") %dopar% {
try(do.call(city_varieble_process,
args = list(dat = dat, x = i., replace_dat = replace_dat)),
silent = TRUE)
}
dat_tbl = quick_as_df(dat_list)
}
dat = cbind(dat, dat_tbl)
return(dat)
}
#' @rdname replace_value
#' @export
replace_value_x = function(dat, x, replace_dat, MARGIN = 2,
VALUE = if (MARGIN == 2) colnames(replace_dat) else rownames(replace_dat), RE_NAME = TRUE) {
dat = checking_data(dat)
replace_dat = checking_data(replace_dat)
dat = within(dat, {
x_level = NA
if (MARGIN == 2) {
for (i in 1:ncol(replace_dat)) {
x_level[dat[[x]] %alike% replace_dat[i]] = VALUE[i]
}
} else {
for (i in 1:nrow(replace_dat)) {
x_level[dat[[x]] %alike% replace_dat[i,]] = VALUE[i]
}
}
})
if (RE_NAME) {
new_name = paste(x, "replace", sep = "_")
dat = re_name(dat = dat, oldname = "x_level", newname = new_name)
}
return(dat[new_name])
}
#' add_variable_process
#'
#' This function is not intended to be used by end user.
#'
#' @param add A data.frame contained address variables.
#' @export
add_variable_process = function(add) {
# acquire a sets of addresses
add1 = as.data.frame(add)
sim1 = colname1 = list()
for (i in 1:ncol(add1)) {
if (i >= ncol(add1)) break
sim1[[i]] = apply(add1[, i:ncol(add1)], 2,
function(x) {
ifelse(add1[, i] %alike% x, 1, 0)
})
colname1[[i]] = lapply(names(add1)[i:(ncol(add1))], function(n) paste(names(add1)[i], n, sep = '_WITH_'))
}
sim1 = data.frame(t(unlist(sim1)), stringsAsFactors = FALSE)
names(sim1) = unlist(colname1)
# find the variables which are computing similarity with themselves
splitvar = strsplit(names(sim1), "_WITH_")
vars = c()
for (i in 1:(length(sim1))) {
if (splitvar[[i]][1] == splitvar[[i]][2]) {
vars[[i]] = names(sim1)[i]
} else {
vars[[i]] = NA
}
}
# get the final results
sim = sim1[is.na(vars)]
simm = as.vector(sim)
return(simm)
}
#' address_varieble
#'
#' This function is not intended to be used by end user.
#'
#' @param df A data.frame.
#' @param address_cols Variables of address,
#' @param address_pattern Regular expressions, used to match address variable names.
#' @param parallel Logical, parallel computing. Default is TRUE.
#' @importFrom dplyr group_by mutate summarize summarise n count %>% filter left_join
#' @importFrom parallel detectCores clusterExport clusterCall makeCluster stopCluster
#' @importFrom doParallel registerDoParallel
#' @importFrom foreach foreach %dopar% %do% registerDoSEQ
#' @export
address_varieble = function(df, address_cols = NULL, address_pattern = NULL, parallel = TRUE) {
if (class(df)[1] != "data.frame") {
df = as.data.frame(df)
}
if (is.null(address_cols)) {
address_cols = grepl(address_pattern, paste(colnames(df)))
address_vars = names(df)[address_cols]
} else {
address_vars = names(df[address_cols])
}
df_add = df[address_vars]
if (parallel) {
parallel = start_parallel_computing(parallel)
stopCluster = TRUE
} else {
parallel = stopCluster = FALSE
}
on.exit(if (parallel & stopCluster) stop_parallel_computing(attr(parallel, "cluster")))
i. = NULL
df_add_list = list()
if (!parallel) {
df_add_list = lapply(1:nrow(df_add), function(i.) add_variable_process(add = df_add[i.,]))
df_add_tbl = Reduce("cbind", df_add_list) %>% as.data.frame()
} else {
df_add_list = foreach(i. = 1:nrow(df_add), .combine = "c") %dopar% {
try(do.call(add_variable_process, args = list(add = df_add[i.,])), silent = TRUE)
}
df_add_tbl = as.data.frame(df_add_list)
}
return(df_add_tbl)
}
#' variable_process
#'
#' This function is not intended to be used by end user.
#'
#' @param add A data.frame
#' @importFrom data.table :=
#' @export
variable_process = function(add) {
td = new3 = new2 = grx_x = colname1 = NULL
# acquire a sets of addresses
cv_cols = grep(grx_x, paste(colnames(dat)))[1:td]
cv_cols = cv_cols[!is.na(cv_cols)]
#cv_folds
if (length(cv_cols) > 0) {
dat = dat[, new2 := ifelse(rowAllnas(dat[, cv_cols, with = FALSE]), NA,
rowSums(dat[, cv_cols, with = FALSE], na.rm = TRUE))]
dat = dat[, new3 := ifelse(rowAllnas(dat[, cv_cols, with = FALSE]), NA,
rowMeans(dat[, cv_cols, with = FALSE], na.rm = TRUE))]
}
sim1 = data.frame(t(unlist(sim1)), stringsAsFactors = FALSE)
names(sim1) = unlist(colname1)
# find the variables which are computing similarity with themselves
splitvar = strsplit(names(sim1), "_WITH_")
vars = c()
for (i in 1:(length(sim1))) {
if (splitvar[[i]][1] == splitvar[[i]][2]) {
vars[[i]] = names(sim1)[i]
} else {
vars[[i]] = NA
}
}
# get the final results
sim = sim1[is.na(vars)]
simm = as.vector(sim)
return(simm)
}
#' derived_interval
#'
#' This function is not intended to be used by end user.
#'
#' @param dat_s A data.frame contained only predict variables.
#' @param interval_type Available of c("cnt_interval", "time_interval")
#' @export
#' @importFrom data.table first
derived_interval = function(dat_s, interval_type = c("cnt_interval", "time_interval")) {
interval_list = apply(dat_s, 1, function(m) {
if (interval_type == "time_interval") {
cnt_ind = inter_ind = which(!is.na(m) | m != 0)
interval = rep(NA, length(m))
if (length(cnt_ind) > 1) {
interval[cnt_ind] = vapply(1:(length(inter_ind)), function(i) {
ifelse(i <= length(inter_ind), abs(inter_ind[i] - inter_ind[i + 1]), NA)
}, FUN.VALUE = numeric(1))
}
interval = c(abs(1 - data.table::first(inter_ind)), interval[-length(interval)])
} else {
cnt_ind = which(m >= 0)
inter_ind = unlist(m, use.names = FALSE)[c(cnt_ind)]
interval = rep(NA, length(m))
if (length(cnt_ind) > 1) {
interval[cnt_ind] = vapply(1:(length(inter_ind)), function(i) {
ifelse(i < length(inter_ind), abs(inter_ind[i] - inter_ind[i + 1]), inter_ind[i])
}, FUN.VALUE = numeric(1))
}
}
interval
})
interval_list = as.data.frame(t(interval_list))
interval_list
}
#' derived_pct
#'
#' This function is not intended to be used by end user.
#'
#' @param dat_s A data.frame contained only predict variables.
#' @param pct_type Available of "total_pct"
#' @export
derived_pct = function(dat_s, pct_type = "total_pct") {
dat_s[is.na(dat_s)] = 0
if (pct_type == "total_pct") {
pct_list = dat_s / rowSums(dat_s, na.rm = TRUE)
} else {
cnt_pct_list = dat_s / rowSums(dat_s, na.rm = TRUE)
pct_list = apply(cnt_pct_list, 1, function(x) cumsum(x))
pct_list = as.data.frame(t(pct_list))
}
pct_list
}
#' derived_partial_acf
#'
#' This function is not intended to be used by end user.
#'
#' @param dat_s A data.frame
#' @export
derived_partial_acf = function(dat_s) {
dat_s[is.na(dat_s)] = 0
p_acf = apply(dat_s, 1, function(x) ifelse(length(unique(x)) > 2, mean(abs(ar(ts(x), FALSE,
length(unique(x)) - 1, na.action = na.pass)$partialacf)), NA))
p_acf
}
#' sim_str
#'
#' This function is not intended to be used by end user.
#'
#' @param a A string
#' @param b A string
#' @param sep Seprater of strings. Default is "_|[.]|[A-Z]".
#' @export
sim_str = function(a, b, sep = "_|[.]|[A-Z]") {
intersect(strsplit(a, sep)[[1]], strsplit(b, sep)[[1]])
}
#' Recovery Percent Format
#'
#' \code{de_percent} is a small function for recoverying percent format..
#' @param x Character with percent formant.
#' @param digits Number of digits.Default: 2.
#' @return x without percent format.
#' @examples
#' de_percent("24%")
#' @export
de_percent = function(x, digits = 2) {
x = as.character(x)
round(as.numeric(gsub("%", "", x)) / 100, digits = digits)
}
#' Merge Category
#'
#' \code{merge_category} is for merging category of nominal variables which number of categories is more than m or percent of samples in any categories is less than p.
#' @param dat A data frame with x and target.
#' @param char_list The list of charecteristic variables that need to merge categories, Default is NULL. In case of NULL,merge categories for all variables of string type.
#' @param ex_cols A list of excluded variables. Default is NULL.
#' @param m The minimum number of categories.
#' @param note Logical, outputs info. Default is TRUE.
#' @return A data.frame with merged category variables.
#' @examples
#' #merge_catagory
#' dat = merge_category(lendingclub,ex_cols = "id$|_d$")
#' char_list = get_names(dat = dat,types = c('factor', 'character'),
#' ex_cols = "id$|_d$", get_ex = FALSE)
#' str(dat[,char_list])
#' @export
merge_category = function(dat, char_list = NULL, ex_cols = NULL, m = 10, note = TRUE) {
opt = options(scipen = 200, stringsAsFactors = FALSE, "warn" = -1)
if (note) cat_line(paste0("-- Merging categories..."), col = love_color("dark_green"))
dat = char_to_num(dat = dat, char_list = char_list, ex_cols = ex_cols, note = FALSE)
if (is.null(char_list)) {
char_list = get_names(dat = dat,
types = c('factor', 'character'),
ex_cols = ex_cols, get_ex = FALSE)
}
for (x in char_list) {
dt_x = table(as.character(dat[, x]), useNA = "no")
over_vars = NULL
if (length(dt_x) > m) {
over_vars = order(abs(dt_x), decreasing = TRUE)[m:length(dt_x)]
}
if (length(over_vars) > 0) {
max_class = over_vars
dat[which(dat[, x] %in% names(dt_x[max_class])), x] = "other"
}
}
return(dat)
options(opt) # reset
}
#' character to number
#'
#' \code{char_to_num} is for transfering character variables which are actually numerical numbers containing strings to numeric.
#' @param dat A data frame
#' @param ex_cols A list of excluded variables. Regular expressions can also be used to match variable names. Default is NULL.
#' @param char_list The list of charecteristic variables that need to merge categories, Default is NULL. In case of NULL, merge categories for all variables of string type.
#' @param note Logical, outputs info. Default is TRUE.
#' @param m The minimum number of categories.
#' @param p The max percent of categories.
#' @return A data.frame
#' @examples
#' dat_sub = lendingclub[c('dti_joint', 'emp_length')]
#' str(dat_sub)
#' #variables that are converted to numbers containing strings
#' dat_sub = char_to_num(dat_sub)
#' str(dat_sub)
#' @export
char_to_num = function (dat, char_list = NULL, m = 0, p = 0.5, note = FALSE,
ex_cols = NULL) {
opt = options(scipen = 200, warn = -1, stringsAsFactors = FALSE)
if (note)
cat_line(paste("-- Transfering character variables which are actually numerical to numeric"),
col = love_color("dark_green"))
if (is.null(char_list)) {
char_list = get_names(dat = dat, types = c("factor",
"character"), ex_cols = ex_cols, get_ex = FALSE)
}
for (x in char_list) {
dt_x = table(as.character(dat[, x]), useNA = "no")
if(any(grepl("\\,",names(dt_x)))){
x_name = gsub("\\,","",names(dt_x))
}else{
x_name = names(dt_x)
}
char_num = tryCatch({
as.numeric(x_name)
}, error = function(e) {
cat("ERROR :", conditionMessage(e), "\n")
}, warning = function(w) {
as.numeric(x_name)
})
char_num_ind = which(!is.na(char_num))
if (length(char_num_ind) > m && length(dt_x) > m &&
round(length(char_num_ind)/length(dt_x), 2) > p ) {
dat[, x] = as.numeric(gsub("\\,","",as.character(dat[, x])))
}
}
return(dat)
options(opt)
}
#' Logarithmic transformation
#'
#' \code{log_trans} is for logarithmic transformation
#' @param dat A data.frame.
#' @param target The name of target variable.
#' @param x_list A list of x variables.
#' @param cor_dif The correlation coefficient difference with the target of logarithm transformed variable and original variable.
#' @param ex_cols Names of excluded variables. Regular expressions can also be used to match variable names. Default is NULL.
#' @param note Logical, outputs info. Default is TRUE.
#' @return Log transformed data.frame.
#' @examples
#' dat = log_trans(dat = UCICreditCard, target = "default.payment.next.month",
#' x_list =NULL,cor_dif = 0.01,ex_cols = "ID", note = TRUE)
#' @importFrom cli cat_rule cat_line cat_bullet
#' @export
log_trans = function(dat, target, x_list = NULL,cor_dif = 0.01,ex_cols = NULL, note = TRUE){
log_x = log_vars(dat = dat, target = target, x_list = x_list, cor_dif = cor_dif,ex_cols = ex_cols )
if(!is.null(log_x)){
if(note)cat_line("-- Logarithmic transformation", col = love_color("deep_green"))
dat[,log_x] = lapply(dat[,log_x],function(x){
x = as.numeric(unlist(x))
x[which(!is.na(x) & x > 0)] = log(x[which(!is.na(x) & x > 0)])
x
}
)
dat = re_name(dat, oldname = log_x, newname = paste(log_x,"log",sep = "_"))
if(note){
cat_line("-- Following variables are log transformed:", col = love_color("dark_green"))
cat_bullet(paste(format(log_x),paste(log_x,"log",sep = "_"),sep = " -> "), col = "darkgrey")
}
}
return(dat)
}
#' @rdname log_trans
#' @export
log_vars = function(dat, x_list = NULL, target = NULL,cor_dif = 0.01,ex_cols = NULL){
log_x_list = list()
dat = checking_data(dat = dat,target = target)
if(is.null(x_list)){
x_list = get_names(dat = dat,types = c("numeric","double","integer"),ex_cols = c(ex_cols,target))
}
if(length(x_list) > 0){
log_x_list = lapply(dat[x_list],function(x){
flag = as.numeric(as.factor(dat[[target]]))
x = as.numeric(unlist(x))
if(length(which(!is.na(x) & x > 0))> 30 ){
log_x = log(x[!is.na(x) & x > 0])
cor_log_x = cor(log_x,flag[!is.na(x) & x > 0],method = "pearson",use = "complete.obs")
cor_x = cor(x[!is.na(x) & x > 0],flag[!is.na(x) & x > 0],method = "pearson",use = "complete.obs")
abs(cor_log_x) - abs(cor_x)
}else{
0
}
})
}
logvars = names(log_x_list[which(log_x_list >= cor_dif)])
return(logvars)
}
#' Ranking Percent Process
#'
#' \code{ranking_percent_proc} is for processing ranking percent variables.
#' \code{ranking_percent_dict} is for generating ranking percent dictionary.
#' @param dat A data.frame.
#' @param x_list A list of x variables.
#' @param x The name of an independent variable.
#' @param rank_dict The dictionary of rank_percent generated by \code{ranking_percent_dict} .
#' @param ex_cols Names of excluded variables. Regular expressions can also be used to match variable names. Default is NULL.
#' @param pct Percent of rank. Default is 0.01.
#' @param note Logical, outputs info. Default is TRUE.
#' @param parallel Logical, parallel computing. Default is FALSE.
#' @param save_data Logical, save results in locally specified folder. Default is FALSE
#' @param file_name The name for periodically saved rank_percent data file. Default is "dat_rank_percent".
#' @param dir_path The path for periodically saved rank_percent data file Default is "tempdir()"
#' @param ... Additional parameters.
#' @return Data.frame with new processed variables.
#' @examples
#' rank_dict = ranking_percent_dict(dat = UCICreditCard[1:1000,],
#' x_list = c("LIMIT_BAL","BILL_AMT2","PAY_AMT3"), ex_cols = NULL )
#' UCICreditCard_new = ranking_percent_proc(dat = UCICreditCard[1:1000,],
#' x_list = c("LIMIT_BAL", "BILL_AMT2", "PAY_AMT3"), rank_dict = rank_dict, parallel = FALSE)
#' @importFrom data.table is.data.table
#' @export
ranking_percent_proc = function(dat, ex_cols = NULL, x_list = NULL, rank_dict = NULL,pct = 0.01,
parallel = FALSE, note = FALSE, save_data = FALSE,file_name = NULL,dir_path= tempdir(), ... ){
if (note) cat_line("-- Processing ranking percent variables.\n", col = love_color("dark_green"))
x_list = get_x_list(x_list = x_list, dat_train = dat, dat_test = NULL, ex_cols = ex_cols)
if (length(x_list) > 0) {
num_x_list = get_names(dat = dat[x_list], types = c('numeric', 'integer', 'double'),
ex_cols = c(ex_cols), get_ex = FALSE)
} else {
stop("No variable in the x_list or ex_col excludes all variables.\n ")
}
if (is.null(rank_dict)) {
rank_dict = ranking_percent_dict(dat = dat, x_list = num_x_list, ex_cols = ex_cols,
save_data = save_data, file_name = file_name,dir_path = dir_path )
}
if (any(is.element(sapply(rank_dict, class), c("character", "factor")))) {
rank_dict[, which(is.element(sapply(rank_dict, class), c("character", "factor")))] =
as.numeric(as.character(rank_dict[, which(is.element(sapply(rank_dict, class), c("character", "factor")))]))
}
new_x = paste(num_x_list,"rank_pct",sep = "_")
if (length(num_x_list) > 0) {
dat[, new_x] = loop_function(func = ranking_percent_proc_x, x_list = num_x_list,
args = list(dat = dat, rank_dict = rank_dict,pct = pct),
bind = "cbind", as_list = FALSE, parallel = parallel)
}
if (save_data) {
dir_path = ifelse(!is.character(dir_path),
tempdir(), dir_path)
if (!dir.exists(dir_path)) dir.create(dir_path)
if (!is.character(file_name)) file_name = NULL
save_data(dat, file_name = ifelse(is.null(file_name), "dat_rank_percent",
paste(file_name, "dat_rank_percent", sep = ".")), dir_path = dir_path, note = note)
}
return(dat)
}
#' @rdname ranking_percent_proc
#' @export
ranking_percent_proc_x = function(dat, x,rank_dict = NULL,pct = 0.01) {
if ((!is.null(x) && is.character(x)) & !is.null(dat)) {
dat_x = abs(dat[, x][complete.cases(dat[, x])])
} else {
if (!is.null(dat) && is.vector(dat)) {
dat_x = abs(dat[complete.cases(dat)])
x = "rank_x"
} else {
if (!is.null(dat) && (is.data.frame(dat) | is.data.table(dat)) && length(dat) == 1 && unlist(dat) > 2) {
x = colnames(dat)[1]
dat = unlist(dat)
dat_x = abs(dat[complete.cases(dat)])
} else {
stop("dat is null & x is null")
}
}
}
if (is.null(rank_dict)) {
QL = quantile(dat_x, 0.01)
QU = quantile(dat_x, 0.99)
QU_QL = QU - QL
outliers = QU + 4 * QU_QL
dat_x[dat_x > QU + 4 * QU_QL] = outliers
rank_dict_x = NULL
rank_x = quantile(ecdf(unique(dat_x)), seq(0, 1, by = pct))
rank_percent = as.double(sub("%", "", names(rank_x))) / 100
rank_dict_x = data.frame(rank_percent, rank_x)
colnames(rank_dict_x)[2] = x
}
if (any(is.element(sapply(rank_dict, class), c("character", "factor")))) {
rank_dict[, which(is.element(sapply(rank_dict, class), c("character", "factor")))] =
as.numeric(as.character(rank_dict[, which(is.element(sapply(rank_dict, class), c("character", "factor")))]))
}
x_new = paste(x,"rank_pct",sep = "_")
dat[which(!is.na(dat[, x])), x_new] = vapply(dat_x, function(i) {
round(rank_dict[, "rank_percent"][which.min(i > rank_dict[, x])], 3)
}, FUN.VALUE = numeric(1))
return(dat[x_new])
}
#' @rdname ranking_percent_proc
#' @export
ranking_percent_dict = function(dat, x_list = NULL, ex_cols = NULL, pct = 0.01, parallel = FALSE,
save_data = FALSE, file_name = NULL, dir_path = tempdir(), ...) {
x_list = get_x_list(x_list = x_list, dat_train = dat, dat_test = NULL, ex_cols = ex_cols)
if (length(x_list) > 0) {
num_x_list = get_names(dat = dat[x_list], types = c('numeric', 'integer', 'double'),
ex_cols = c(ex_cols), get_ex = FALSE)
} else {
stop("No variable in the x_list or ex_col excludes all variables.\n ")
}
rank_dict = NULL
if (length(num_x_list) > 0) {
rank_dict = loop_function(func = ranking_percent_dict_x, x_list = num_x_list,
args = list(dat = dat, pct = pct),
bind = "cbind", as_list = TRUE, parallel = parallel)
rank_dict = multi_left_join(df_list = rank_dict, by = "rank_percent")
rank_dict[, "rank_percent"] = as.numeric(as.character(rank_dict[, "rank_percent"]))
}
if (save_data) {
dir_path = ifelse(!is.character(dir_path),
tempdir(), dir_path)
if (!dir.exists(dir_path)) dir.create(dir_path)
if (!is.character(file_name)) file_name = NULL
save_data(rank_dict, file_name = ifelse(is.null(file_name), "rank_percent_dict",
paste(file_name, "rank_percent_dict", sep = ".")), dir_path = dir_path, note = TRUE)
}
return(rank_dict)
}
#' @rdname ranking_percent_proc
#' @export
ranking_percent_dict_x = function(dat, x = NULL, pct = 0.01) {
if ((!is.null(x) && is.character(x)) & !is.null(dat)) {
dat_x = abs(dat[, x][complete.cases(dat[, x])])
} else {
if (!is.null(dat) && is.vector(dat)) {
dat_x = abs(dat[complete.cases(dat)])
x = "rank_x"
} else {
if (!is.null(dat) && (is.data.frame(dat) | is.data.table(dat)) && length(dat) == 1 && unlist(dat) > 2) {
x = colnames(dat)[1]
dat = unlist(dat)
dat_x = abs(dat[complete.cases(dat)])
} else {
stop("dat is null & x is null")
}
}
}
QL = quantile(dat_x, 0.01)
QU = quantile(dat_x, 0.99)
QU_QL = QU - QL
outliers = QU + 4* QU_QL
dat_x[dat_x > QU + 4 * QU_QL] = outliers
rank_dict_x = NULL
rank_x = quantile(ecdf(unique(dat_x)), seq(0, 1, by = pct))
rank_percent = as.double(sub("%", "", names(rank_x))) / 100
rank_dict_x = data.frame(rank_percent, rank_x)
colnames(rank_dict_x)[2] = x
return(rank_dict_x)
}
#' TF-IDF
#'
#' The \code{term_filter} is for filtering stop_words and low frequency words.
#' The \code{term_idf} is for computing idf(inverse documents frequency) of terms.
#' The \code{term_tfidf} is for computing tf-idf of documents.
#' @param term_df A data.frame with id and term.
#' @param low_freq Use rate of terms or use numbers of terms.
#' @param stop_words Stop words.
#' @param n_total Number of documents.
#' @param idf A data.frame with idf.
#' @return A data.frame
#' @examples
#' term_df = data.frame(id = c(1,1,1,2,2,3,3,3,4,4,4,4,4,5,5,6,7,7,
#' 8,8,8,9,9,9,10,10,11,11,11,11,11,11),
#' terms = c('a','b','c','a','c','d','d','a','b','c','a','c','d','a','c',
#' 'd','a','e','f','b','c','f','b','c','h','h','i','c','d','g','k','k'))
#' term_df = term_filter(term_df = term_df, low_freq = 1)
#' idf = term_idf(term_df)
#' tf_idf = term_tfidf(term_df,idf = idf)
#' @importFrom data.table setDT .N := dcast merge.data.table as.data.table
#' @export
term_tfidf = function(term_df, idf = NULL){
n_term = tf = id = term = NULL
term_df = as.data.table(term_df,key = 'id')
term_df[,n_term := 1]
term_df[,n_term := sum(n_term), by= list(id,term)]
term_df[,tf := n_term / sum(n_term), by= 'id']
n_total = length(unique(term_df$id))
if (is.null(idf)){
idf = term_idf(term_df = term_df,n_total = n_total)
}
term_df = data.table::merge.data.table(term_df,idf,by = 'term',all.x = TRUE)
term_df[,tfidf := tf * idf]
nw = tryCatch(n_total*as.integer(nrow(idf)),warning = function(w){NA})
if(!is.na(nw)&& nw < 1000000000 ){
term_df = as.data.table(term_df,key = 'id')
tfidf = data.table::dcast(term_df, id ~ term, sum,
value.var = "tfidf")
}else{
for(k in 2:100){
nr = tryCatch(n_total*as.integer(nrow(idf)/k),warning = function(w){NA})
if(!is.na(nr)&& nr < 1000000000)break
}
k = k*20
w_cv = cv_split(idf,k = k)
tfidf = list()
for(i in 1:k){
terms = idf[w_cv[[i]],'term']
idf_term = term_df[term %in% unlist(terms)]
tfidf[[i]] = idf_term[,c('id','term','tfidf')]
}
tfidf = lapply(tfidf,function(z)data.table::dcast(z,id ~ term, sum,
value.var = "tfidf"))
tfidf = Reduce(function(...)merge.data.table(...,by = 'id', all = TRUE),
tfidf)
}
return(quick_as_df(tfidf))
}
#' @rdname term_tfidf
#' @export
#'
term_idf = function(term_df,n_total = NULL){
id = N = NULL
colnames(term_df)[1] = 'id'
colnames(term_df)[2] = 'term'
if(is.null(n_total)){
n_total = length(unique(term_df$id))
}
term_df = as.data.table(term_df,key = 'id')
words_freq = term_df[,.N,'term']
words_freq[,idf := log(n_total / (N + 1))]
idf = words_freq[,c('term','idf')]
return(quick_as_df(idf))
}
#' @rdname term_tfidf
#' @export
term_filter = function(term_df, low_freq = 0.01, stop_words = NULL){
N = use_rate = term = id = NULL
colnames(term_df)[1] = 'id'
colnames(term_df)[2] = 'term'
term_df = as.data.table(term_df,key = 'id')
words_freq = term_df[,.N,'term']
if(is.numeric(low_freq) && low_freq < 1){
n_total = length(unique(term_df$id))
words_freq[,use_rate := N / n_total]
low_freq_words = words_freq[use_rate < low_freq,'term']
}else{
if(is.numeric(low_freq) && low_freq>= 1){
low_freq_words = words_freq[ N <= low_freq,'term']
}else{
low_freq_words = words_freq[ N <= 1,'term']
}
}
term_df = term_df[!term_df$term %in% c('',as.character(unlist(low_freq_words)),
as.character(unlist(stop_words)))]
return(quick_as_df(term_df))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.