R/data_transformation.R

Defines functions term_filter term_idf term_tfidf ranking_percent_dict_x ranking_percent_dict ranking_percent_proc_x ranking_percent_proc log_vars log_trans merge_category de_percent sim_str derived_partial_acf derived_pct derived_interval variable_process address_varieble add_variable_process replace_value_x replace_value city_varieble city_varieble_process time_variable time_vars_process var_group_proc gather_data time_series_proc derived_ts derived_ts_vars de_one_hot_encoding one_hot_encoding woe_trans woe_trans_all

Documented in address_varieble add_variable_process city_varieble city_varieble_process de_one_hot_encoding de_percent derived_interval derived_partial_acf derived_pct derived_ts derived_ts_vars gather_data log_trans log_vars merge_category one_hot_encoding ranking_percent_dict ranking_percent_dict_x ranking_percent_proc ranking_percent_proc_x replace_value replace_value_x sim_str term_filter term_idf term_tfidf time_series_proc time_variable time_vars_process var_group_proc variable_process woe_trans woe_trans_all

#' WOE Transformation
#'
#' \code{woe_trans} is for transforming data to woe.
#' The \code{woe_trans_all} function is a simpler wrapper for \code{woe_trans}.
#' @param dat A data.frame with independent variables.
#' @param target The name of target variable. Default is NULL.
#' @param x_list A list of x variables.
#' @param x  The name of an independent variable.
#' @param ex_cols Names of excluded variables. Regular expressions can also be used to match variable names. Default is NULL.
#' @param breaks_list  A list contains breaks of variables. it is generated by code{\link{get_breaks_all}},code{\link{get_breaks}}
#' @param bins_table A table contians woe of each bin of variables, it is generated by code{\link{get_bins_table_all}},code{\link{get_bins_table}}
#' @param note   Logical, outputs info. Default is TRUE.
#' @param parallel Logical, parallel computing. Default is FALSE.
#' @param woe_name Logical. Add "_woe" at the end of the variable name.
#' @param save_data Logical, save results in locally specified folder. Default is TRUE
#' @param file_name The name for periodically saved woe file. Default is "dat_woe".
#' @param dir_path The path for periodically saved woe file Default is "./data"
#' @param ...  Additional parameters.
#' @return A list of breaks for each variables.
#' @seealso \code{\link{get_tree_breaks}}, \code{\link{cut_equal}}, \code{\link{select_best_class}}, \code{\link{select_best_breaks}}
#' @examples
#' sub = cv_split(UCICreditCard, k = 30)[[1]]
#' dat = UCICreditCard[sub,]
#' dat = re_name(dat, "default.payment.next.month", "target")
#' dat = data_cleansing(dat, target = "target", obs_id = "ID", occur_time = "apply_date",
#' miss_values =  list("", -1))
#'
#' train_test = train_test_split(dat, split_type = "OOT", prop = 0.7,
#'                                 occur_time = "apply_date")
#' dat_train = train_test$train
#' dat_test = train_test$test
#' #get breaks of all predictive variables
#' x_list = c("PAY_0", "LIMIT_BAL", "PAY_AMT5", "EDUCATION", "PAY_3", "PAY_2")
#' breaks_list = get_breaks_all(dat = dat_train, target = "target",
#'                               x_list = x_list, occur_time = "apply_date", ex_cols = "ID",
#' save_data = FALSE, note  = FALSE)
#' #woe transform
#' train_woe = woe_trans_all(dat = dat_train,
#'                           target = "target",
#'                           breaks_list = breaks_list,
#'                           woe_name = FALSE)
#' test_woe = woe_trans_all(dat = dat_test,
#'                        target = "target",
#'                          breaks_list = breaks_list,
#'                          note = FALSE)
#'
#' @export

woe_trans_all = function(dat, x_list = NULL, ex_cols = NULL, bins_table = NULL,
          target = NULL, breaks_list = NULL, note = FALSE,
          save_data = FALSE, parallel = FALSE, woe_name = FALSE,
          file_name = NULL, dir_path = tempdir(), ...) {

   if (note)cat_line("-- Transforming variables to woe", col = love_color("deep_green"))
   opt = options(scipen = 200, stringsAsFactors = FALSE) #
   if (is.null(x_list)) {
     if (!is.null(bins_table)) {
       x_list = unique(bins_table[which(as.character(bins_table[, "Feature"]) != "Total"), "Feature"])
     } else {
       if(length(breaks_list)>0){
         x_list = unique(breaks_list[,1])
       }else{
         x_list = get_names(dat = dat,
                            types = c('factor', 'character', 'numeric', 'integer', 'double'),
                            ex_cols = c(target, ex_cols), get_ex = FALSE)
         
       }
     }
   }
   ex_vars = get_names(dat = dat, types = c('factor', 'character', 'numeric', 'integer', 'double'),
                       ex_cols = x_list, get_ex = FALSE)
   if(sum(is.na(dat))> 0){stop("Input data contains NAs, please process missing value first.")	}
   dat_woe = loop_function(func = woe_trans, x_list = x_list,
                           args = list(dat = dat, bins_table = bins_table,
                                       target = target, breaks_list = breaks_list,
                                       woe_name = woe_name),
                           bind = "cbind", parallel = parallel)
   dat = cbind(dat[ex_vars], dat_woe)
   if (save_data) {
     dir_path = ifelse(!is.character(dir_path),
                       tempdir(), dir_path)
     if (!dir.exists(dir_path)) dir.create(dir_path)
     if (!is.character(file_name)) file_name = NULL
     save_data(dat, file_name = ifelse(is.null(file_name), "dat.woe", paste(file_name, "dat.woe", sep = ".")), dir_path = dir_path, note = note)
   }

   return(dat)
   options(opt) # reset
 }

#' @rdname woe_trans_all
#' @export

woe_trans = function(dat, x, bins_table = NULL, target = NULL, breaks_list = NULL, woe_name = FALSE) {
   # bins_table
   if (is.null(bins_table)) {
     if (!is.null(breaks_list)) {
       bins_table = get_bins_table(dat = dat, x = x,
                                   target = target, breaks_list = breaks_list,
                                   note = FALSE)
     } else {
       stop("bins_table & breaks_list are both missing.\n")
     }
   }
   bins_tbl = bins_table[which(as.character(bins_table[, "Feature"]) == names(dat[x])),
                         c("Feature", "cuts", "bins", "woe")]
   if (woe_name) {
     woe_names = paste(names(dat[x]), "woe", sep = "_")
   } else {
     woe_names = names(dat[x])
   }
   if (length(bins_tbl) > 0 && all(as.character(bins_tbl[, "Feature"]) != "Total")) {
     bins = split_bins(dat = dat, x = x, breaks = bins_tbl[, c("cuts")], bins_no = TRUE)
     for (i in 1:nrow(bins_tbl)) {
       woe_ind =  which(as.character(bins) == as.character(bins_tbl[i, "bins"]))
       if(length(woe_ind) > 0){
         dat[woe_ind, woe_names] = bins_tbl[i, "woe"]
       }

     }
     dat[, woe_names] = as.numeric(dat[, woe_names])
   }
   return(dat[woe_names])
 }

#' One-Hot Encoding
#'
#' \code{one_hot_encoding} is for converting the factor or character variables into multiple columns
#' @param dat A dat frame.
#' @param cat_vars The name or Column index list to be one_hot encoded.
#' @param merge_cat Logical. If TRUE, to merge categories greater than 8, default is TRUE.
#' @param ex_cols  Variables to be  excluded, use regular expression matching
#' @param na_act Logical,If true, the missing value is processed, if FALSE missing value is omitted .
#' @param note Logical.Outputs info.Default is TRUE.
#' @return A dat frame with the one hot encoding applied to all the variables with type as factor or character.
#' @seealso \code{\link{de_one_hot_encoding}}
#' @examples
#' dat1 = one_hot_encoding(dat = UCICreditCard,
#' cat_vars = c("SEX", "MARRIAGE"),
#' merge_cat = TRUE, na_act = TRUE)
#' dat2 = de_one_hot_encoding(dat_one_hot = dat1,
#' cat_vars = c("SEX","MARRIAGE"), na_act = FALSE)
#'
#' @importFrom cli cat_rule cat_line cat_bullet
#' @export

one_hot_encoding = function(dat, cat_vars = NULL, ex_cols = NULL,
                            merge_cat = TRUE, na_act = TRUE, note = FALSE) {
  if (note)cat_line("-- One-hot encoding for charactor or factor", col = love_color("deep_green"))

  dat = checking_data(dat)
  if (is.null(cat_vars)) {
    cat_vars = get_names(dat = dat, types = c("character", "factor"), ex_cols = ex_cols)
  }
  if (length(cat_vars) > 0) {
    if (na_act) {
      dat[, cat_vars] = process_nas(dat[cat_vars], note = FALSE)
    }
    if (merge_cat) {
      dat[, cat_vars] = merge_category(dat[cat_vars], note = FALSE)
    }
    for (i in cat_vars) {
      if (is.factor(dat[, i]) || is.character(dat[, i])) {
        col_name = i
        dat[, i] = sapply(dat[, i], function(x) gsub("[^\u4e00-\u9fa5,^a-zA-Z,^0-9]", "_", x))
        cat_list = unique(dat[, i])
        encode_cols = length(cat_list)
        #Create individual column for every unique value in the variable
        for (j in 1:encode_cols) {
          one_hot_name = (paste(col_name, ".", cat_list[j], ".",sep = ""))
          dat[, one_hot_name] = ifelse(dat[, i] == cat_list[j] & !is.na(dat[, i]), 1, 0)
        }
      }
    }
    dat = dat[, - which(names(dat) %in% cat_vars)]
  }
  return(dat)
}



#' Recovery One-Hot Encoding
#'
#' \code{de_one_hot_encoding} is for one-hot encoding recovery processing
#' @param dat_one_hot A dat frame with the one hot encoding variables
#' @param cat_vars  variables to be recovery processed, default is null, if null, find these variables through regular expressions .
#' @param na_act Logical,If true, the missing value is  assigned as "missing", if FALSE missing value is omitted, the default is TRUE.
#' @param note Logical.Outputs info.Default is TRUE.
#' @return A dat frame with the one hot encoding recorery character variables
#' @seealso \code{\link{one_hot_encoding}}
#' @examples
#' #one hot encoding
#' dat1 = one_hot_encoding(dat = UCICreditCard,
#' cat_vars = c("SEX", "MARRIAGE"),
#' merge_cat = TRUE, na_act = TRUE)
#' #de one hot encoding
#' dat2 = de_one_hot_encoding(dat_one_hot = dat1,
#' cat_vars = c("SEX","MARRIAGE"),
#' na_act = FALSE)
#' @importFrom cli cat_rule cat_line cat_bullet
#' @export
de_one_hot_encoding = function(dat_one_hot, cat_vars = NULL, na_act = TRUE,note = FALSE) {

  if(note)cat_line("-- Recoverying one-hot encoding for charactor or factor.\n", col = love_color("deep_green"))
  dat_one_hot = checking_data(dat_one_hot)
  if (is.null(cat_vars)) {
    char_names = one_hot_names = one_hot_names = c()
    for (i in 1:length(dat_one_hot)) {
      char_names[i] = sub(paste0("\\.$"), "", colnames(dat_one_hot)[i])
      if (!is.null(char_names[i]) && !is.na(char_names[i]) &&
          char_names[i] == colnames(dat_one_hot)[i]) {
        char_names[i] = NA
      }
      one_hot_names[i] = try(strsplit(char_names[i], "[.]")[[1]][1], silent = TRUE)
    }
    cat_vars = unique(one_hot_names[!is.na(one_hot_names)])
  }

  one_hot_vars = unlist(sapply(cat_vars, function(x) grep(paste0(x, "\\.", "\\S{1,100}", "\\."),
                                                           paste(colnames(dat_one_hot)))))

  de_cat_vars = intersect(cat_vars, unique(gsub("\\d{1}$", "", names(one_hot_vars))))

  if (length(de_cat_vars) > 0) {
    dat_one_hot[, de_cat_vars] = lapply(de_cat_vars, function(x) {
      grx = cv_cols = names_1 = re_code = NULL
      grx = paste0(x, "\\.", "\\S{1,100}", "\\.$")
      cv_cols =  grep(grx, paste(colnames(dat_one_hot)))
      names_1 = colnames(dat_one_hot)[cv_cols]
      if (na_act) {
        re_code =  rep("other", nrow(dat_one_hot))
      } else {
        re_code  =  rep(NA, nrow(dat_one_hot))
      }
      for (i in 1:(length(names_1))) {
        re_code[which(dat_one_hot[cv_cols][i] == 1)] = strsplit(names_1[i], "[.]")[[1]][2]
      }
      return(re_code)
    }
    )
    names(dat_one_hot[, de_cat_vars]) =  de_cat_vars
    dat_one_hot = data.frame(dat_one_hot, stringsAsFactors = FALSE)[, - one_hot_vars]
  }
  return(dat_one_hot)
}



#' Time Format Transfering
#'
#' \code{time_transfer} is for transfering time variables to time format.
#' @param dat A data frame
#' @param date_cols  Names of time variable or regular expressions for finding time variables. Default is  "DATE$|time$|date$|timestamp$|stamp$".
#' @param ex_cols Names of excluded variables. Regular expressions can also be used to match variable names. Default is NULL.
#' @param note   Logical, outputs info. Default is TRUE.
#' @return  A data.frame with transfermed time variables.
#' @examples
#' #transfer a variable.
#' dat = time_transfer(dat = lendingclub,date_cols = "issue_d")
#' class(dat[,"issue_d"])
#' #transfer a group of variables with similar name.
#' #transfer all time variables.
#' dat = time_transfer(dat = lendingclub[1:3],date_cols = "_d$")
#' class(dat[,"issue_d"])
#' @export

time_transfer = function (dat, date_cols = NULL, ex_cols = NULL, note = FALSE)
{
  dat = checking_data(dat, note = FALSE)
  
  if (note)
    cat_line("-- Formating time variables", col = love_color("dark_green"))
  x_list = get_x_list(x_list = NULL, dat_train = dat, dat_test = NULL,
                      ex_cols = ex_cols)
  date_cols1 = NULL
  if (!is.null(date_cols)) {
    date_cols1 = names(dat[x_list])[colnames(dat[x_list]) %islike%
                                       date_cols]
  } else {
    date_cols1 = names(dat[x_list])
  }
  df_date = dat[date_cols1]
  df_date = df_date[!colAllnas(df_date)]
  df_date = df_date[!sapply(df_date, is_date)]
  if (dim(df_date)[2] != 0) {
    df_date_cols = names(df_date)
    t_sample = list()
    t_len = list()
    t_sam = NULL
    tryCatch({
      for (x in 1:ncol(df_date)) {
        t_sam = vapply(as.character(sample(na.omit(df_date[[x]]),
                                           10, replace = TRUE)), function(i) {
                                             if (n_char(i) >= 6) {
                                               n_char(i)
                                             } else {
                                               0
                                             }
                                           }, FUN.VALUE = numeric(1))
        t_sam = unlist(t_sam[which(unlist(t_sam) > 4)])
        if(length(t_sam) == 0){
          t_sam = vapply(as.character(sample(na.omit(df_date[[x]]),
                                             100, replace = TRUE)), function(i) {
                                               if (n_char(i) >= 6) {
                                                 n_char(i)
                                               } else {
                                                 0
                                               }
                                             }, FUN.VALUE = numeric(1))
          t_sam = unlist(t_sam[which(unlist(t_sam) > 4)])
        }
        if(length(t_sam) == 0){
          t_sam = vapply(as.character(sample(na.omit(df_date[[x]]),
                                             1000, replace = TRUE)), function(i) {
                                               if (n_char(i) >= 6) {
                                                 n_char(i)
                                               } else {
                                                 0
                                               }
                                             }, FUN.VALUE = numeric(1))
          t_sam = unlist(t_sam[which(unlist(t_sam) > 4)])
        }

        if(length(t_sam) == 0){
          t_sam = vapply(as.character(na.omit(df_date[[x]])), function(i) {
            if (n_char(i) >= 6) {
              n_char(i)
            } else {
              0
            }
          }, FUN.VALUE = numeric(1))
          t_sam = unlist(t_sam[which(unlist(t_sam) > 4)])
        }
        if (length(t_sam) > 0) {
          t_sample[[x]] = min(t_sam, na.rm = TRUE)
        }
        else {
          t_sample[[x]] = 0
        }
        t_len[[x]] = as.character(names(t_sam)[which(n_char(names(t_sam)) ==
                                                       t_sample[[x]])][1])
      }
    }, error = function(e) {
      cat("ERROR :", conditionMessage(e), "\n")
    }, warning = function(w) {
      ""
    })
    date_cols2 = which(t_sample != 0 & t_sample != Inf)
    for (x in date_cols2) {
      if (!is.numeric(df_date[[x]]) & t_sample[[x]] > 10 &
          grepl(":", t_len[[x]]) & length(gregexpr("\\.",
                                                   t_len[[x]])[[1]]) == 1 & grepl("\\.", t_len[[x]])) {
        df_date[[x]] = gsub("\\.0$", "",df_date[[x]])
        t_len[[x]] = gsub(" |\\.0$", "",t_len[[x]])
      }
      if (t_sample[[x]] >= 5 & t_sample[[x]] <= 6 & 
          grepl(pattern = "^[2]{1}[0]{1}[0-3]{1}[0-9]{1}-[0-9]{1}$|^[2]{1}[0]{1}[0-3]{1}[0-9]{1}-[0-9]{1,2}$|^[1]{1}[9]{1}[0-9]{2}-[0-9]{1,2}$",
                x = substr(t_len[[x]], 1, 10))) {
        df_date[[x]] = paste(df_date[[x]], "-01")
        df_date[[x]] = as.Date(as.character(df_date[[x]]),
                               "%Y-%m-%d")
      }
      if (t_sample[[x]] >= 5 & t_sample[[x]] <= 6 & grepl(pattern = "^[2]{1}[0]{1}[0-3]{1}[0-9]{1}/[0-9]{1}$|^[2]{1}[0]{1}[0-3]{1}[0-9]{1}/[0-9]{1,2}$|^[1]{1}[9]{1}[0-9]{2}/[0-9]{1,2}$",
                                                          x = substr(t_len[[x]], 1, 10))) {
        df_date[[x]] = paste(df_date[[x]], "/01")
        df_date[[x]] = as.Date(as.character(df_date[[x]]),
                               "%Y/%m/%d")
      }
      if (t_sample[[x]] >= 8 & t_sample[[x]] <= 10 & grepl(pattern = "^[2]{1}[0]{1}[0-3]{1}[0-9]{1}-[0-9]{1}-[0-9]{1,2}$|^[2]{1}[0]{1}[0-3]{1}[0-9]{1}-[0-9]{1,2}-[0-3]{1}[0-9]{1}$|^[1]{1}[9]{1}[0-9]{2}-[0-9]{1,2}-[0-3]{1}[0-9]{1}$",
                                                           x = substr(t_len[[x]], 1, 10))) {
        df_date[[x]] = as.Date(as.character(df_date[[x]]),
                               "%Y-%m-%d")
      }
      if (t_sample[[x]] > 10 & grepl(pattern = "^[2]{1}[0]{1}[0-3]{1}[0-9]{1}-[0-9]{1}-[0-3]{1}[0-9]{1,2}$|^[2]{1}[0]{1}[0-3]{1}[0-9]{1}-[0-9]{1,2}-[0-3]{1}[0-9]{1}$|^[1]{1}[9]{1}[0-9]{2}-[0-9]{1,2}-[0-3]{1}[0-9]{1}$",
                                     x = substr(t_len[[x]], 1, 10)) & grepl(pattern = "[0-1]{1}[0-9]{1}:[0-9]{2}$",
                                                                            substr(t_len[[x]], 11, n_char(t_len[[x]])))) {
        df_date[[x]] = paste0(df_date[[x]], ":00")
        df_date[[x]] = as.POSIXct(as.character(df_date[[x]]),
                                  format = "%Y-%m-%d %H:%M:%S")
      }
      if (t_sample[[x]] > 10 & grepl(pattern = "^[2]{1}[0]{1}[0-3]{1}[0-9]{1}-[0-9]{1}-[0-3]{1}[0-9]{1,2}$|^[2]{1}[0]{1}[0-3]{1}[0-9]{1}-[0-9]{1,2}-[0-3]{1}[0-9]{1}$|^[1]{1}[9]{1}[0-9]{2}-[0-9]{1,2}-[0-3]{1}[0-9]{1}$",
                                     x = substr(t_len[[x]], 1, 10)) & grepl(pattern = "[0-1]{1}[0-9]{1}:[0-9]{2}:[0-9]{2}",
                                                                            substr(t_len[[x]], 11, n_char(t_len[[x]])))) {
        df_date[[x]] = as.POSIXct(as.character(df_date[[x]]),
                                  format = "%Y-%m-%d %H:%M:%S")
      }
      if (t_sample[[x]] >= 8 & t_sample[[x]] <= 9 & grepl(pattern = "^[2]{1}[0]{1}[0-5]{1}[0-9]{1}[0-9]{1}[0-3]{1}[0-9]{1,2}$|^[2]{1}[0]{1}[0-5]{1}[0-9]{1}[0-9]{1,2}[0-3]{1}[0-9]{1}$|^[1]{1}[9]{1}[0-9]{2}[0-9]{1,2}[0-3]{1}[0-9]{1}$",
                                                          x = t_len[[x]])) {
        df_date[[x]] = as.Date(as.character(df_date[[x]]),
                               "%Y%m%d")
      }
      if (t_sample[[x]] > 10 & grepl(pattern = "^[2]{1}[0]{1}[0-5]{1}[0-9]{1}[0-9]{1}[0-3]{1}[0-9]{1,2}$|^[2]{1}[0]{1}[0-5]{1}[0-9]{1}[0-9]{1,2}[0-3]{1}[0-9]{1}$|^[1]{1}[9]{1}[0-9]{2}[0-9]{1,2}[0-3]{1}[0-9]{1}$",
                                     x = substr(t_len[[x]], 1, 8)) & grepl(pattern = "[0-9]{1,2}:[0-9]{2}:[0-9]{2}$",
                                                                           substr(t_len[[x]], 9, n_char(t_len[[x]])))) {
        df_date[[x]] = gsub(" ", "", df_date[[x]])
        df_date[[x]] = paste(substr(df_date[[x]], 1,
                                    8), substr(df_date[[x]], 9, n_char(df_date[[x]])))
        df_date[[x]] = as.POSIXct(as.character(df_date[[x]]),
                                  format = "%Y%m%d %H:%M:%S")
      }
      if (t_sample[[x]] > 10 & grepl(pattern = "^[2]{1}[0]{1}[0-5]{1}[0-9]{1}[0-9]{1}[0-3]{1}[0-9]{1,2}$|^[2]{1}[0]{1}[0-5]{1}[0-9]{1}[0-9]{1,2}[0-3]{1}[0-9]{1}$|^[1]{1}[9]{1}[0-9]{2}[0-9]{1,2}[0-3]{1}[0-9]{1}$",
                                     x = substr(t_len[[x]], 1, 8)) & grepl(pattern = "[0-9]{1,2}:[0-9]{2}$",
                                                                           substr(t_len[[x]], 9, n_char(t_len[[x]])))) {
        df_date[[x]] = gsub(" ", "", df_date[[x]])
        df_date[[x]] = paste(substr(df_date[[x]], 1,
                                    8), substr(df_date[[x]], 9, n_char(df_date[[x]])),
                             ":00")
        df_date[[x]] = as.POSIXct(as.character(df_date[[x]]),
                                  format = "%Y%m%d %H:%M:%S")
      }
      if (t_sample[[x]] >= 8 & t_sample[[x]] <= 10 & grepl(pattern = "^[2]{1}[0]{1}[0-3]{1}[0-9]{1}/[0-9]{1,2}/[0-9]{1,2}$|^[1]{1}[9]{1}[0-9]{2}/[0-9]{1,2}/[0-9]{1,2}$",
                                                           x = substr(t_len[[x]], 1, 10))) {
        df_date[[x]] = as.Date(as.character(df_date[[x]]),
                               "%Y/%m/%d")
      }
      if (t_sample[[x]] > 10 & grepl(pattern = "^[2]{1}[0]{1}[0-3]{1}[0-9]{1}/[0-9]{1,2}/[0-9]{1,2}$|^[2]{1}[0]{1}[0-3]{1}[0-9]{1}/[0-9]{1}/[0-9]{1}$|^[2]{1}[0]{1}[0-3]{1}[0-9]{1}/[0-9]{1,2}/[0-3]{1}[0-9]{1}$|^[2]{1}[0]{1}[0-3]{1}[0-9]{1}/[0-9]{1}/[0-3]{1}[0-9]{1}$|^[1]{1}[9]{1}[0-9]{2}/[0-9]{1,2}/[0-3]{1}[0-9]{1}$",
                                     x = substr(t_len[[x]], 1, 10)) & grepl(pattern = "[0-9]{1,2}:[0-9]{2}$",
                                                                            substr(t_len[[x]], 11, n_char(t_len[[x]])))) {
        df_date[[x]] = paste0(df_date[[x]], ":00")
        df_date[[x]] = as.POSIXct(as.character(df_date[[x]]),
                                  format = "%Y/%m/%d %H:%M:%S")
      }
      if (t_sample[[x]] > 10 & grepl(pattern = "^[2]{1}[0]{1}[0-3]{1}[0-9]{1}/[0-9]{1,2}/[0-9]{1,2}$|^[2]{1}[0]{1}[0-3]{1}[0-9]{1}/[0-9]{1}/[0-9]{1}$|^[2]{1}[0]{1}[0-3]{1}[0-9]{1}/[0-9]{1,2}/[0-3]{1}[0-9]{1}$|^[2]{1}[0]{1}[0-3]{1}[0-9]{1}/[0-9]{1}/[0-3]{1}[0-9]{1}$|^[1]{1}[9]{1}[0-9]{2}/[0-9]{1,2}/[0-3]{1}[0-9]{1}$",
                                     x = substr(t_len[[x]], 1, 8)) & grepl(pattern = "[0-9]{1,2}:[0-9]{2}$",
                                                                           substr(t_len[[x]], 9, n_char(t_len[[x]])))) {
        df_date[[x]] = paste0(df_date[[x]], ":00")
        df_date[[x]] = as.POSIXct(as.character(df_date[[x]]),
                                  format = "%Y/%m/%d %H:%M:%S")
      }
      if (t_sample[[x]] > 10 & grepl(pattern = "^[2]{1}[0]{1}[0-3]{1}[0-9]{1}/[0-9]{1,2}/[0-9]{1,2}$|^[2]{1}[0]{1}[0-3]{1}[0-9]{1}/[0-9]{1}/[0-9]{1}$|^[2]{1}[0]{1}[0-3]{1}[0-9]{1}/[0-9]{1,2}/[0-3]{1}[0-9]{1}$|^[2]{1}[0]{1}[0-3]{1}[0-9]{1}/[0-9]{1}/[0-3]{1}[0-9]{1}$|^[1]{1}[9]{1}[0-9]{2}/[0-9]{1,2}/[0-3]{1}[0-9]{1}$",
                                     x = substr(t_len[[x]], 1, 10)) & grepl(pattern = "[0-9]{1,2}:[0-9]{2}:[0-9]{2}$",
                                                                            substr(t_len[[x]], 11, n_char(t_len[[x]])))) {
        df_date[[x]] = as.POSIXct(as.character(df_date[[x]]),
                                  format = "%Y/%m/%d %H:%M:%S")
      }
      if (t_sample[[x]] == 10 & grepl(pattern = "^[1]{1}[0-9]{9}",
                                      x = t_len[[x]])) {
        df_date[[x]] = as.POSIXct(as.numeric(df_date[[x]]),
                                  origin = "1970-01-01 00:00:00")
      }
      if (t_sample[[x]] == 13 & grepl(pattern = "^[1]{1}[0-9]{12}",
                                      x = t_len[[x]])) {
        df_date[[x]] = as.POSIXct(as.numeric(df_date[[x]])/1000,
                                  origin = "1970-01-01 00:00:00")
      }
    }
    dat[df_date_cols] = df_date
    rm(df_date)
  }
  else {
    dat = dat
  }
  dat
}


#' Derivation of Behavioral Variables
#'
#' This function is used for derivating behavioral variables and is not intended to be used by end user.
#'
#' @param dat A data.frame contained only predict variables.
#' @param grx Regular expressions used to match variable names.
#' @param grx_x Regular expression used to match a group of variable names.
#' @param td Number of variables to derivate.
#' @param ID The name of ID of observations or key variable of data. Default is NULL.
#' @param ex_cols A list of excluded variables. Regular expressions can also be used to match variable names. Default is NULL.
#' @param x_list Names of independent variables.
#' @param der Variables to derivate
#' @param parallel Logical, parallel computing. Default is FALSE.
#' @param note Logical, outputs info. Default is TRUE.
#' @details The key to creating a good model is not the power of a specific modelling technique, but the breadth and depth of derived variables that represent a higher level of knowledge about the phenomena under examination.
#' @importFrom data.table setDT :=  rbindlist
#' @export


derived_ts_vars = function(dat, grx = NULL, td = NULL, ID = NULL, ex_cols = NULL, x_list = NULL,
							der = c("cvs", "sums", "means", "maxs", "max_mins",
									"time_intervals", "cnt_intervals", "total_pcts",
									"cum_pcts", "partial_acfs"),
							parallel = TRUE, note = TRUE) {
	if (note) cat_line(paste("--", "Derived variables of", paste(der), " .\n"), col = love_color("dark_green"))

	dat = checking_data(dat, note = FALSE)
	if (parallel) {
		parallel = start_parallel_computing(parallel)
		stopCluster = TRUE
	} else {
		parallel = stopCluster = FALSE
	}
	if (is.null(ID)) {
		dat$ID = as.character(rownames(dat))
		ID = 'ID'
	} else {
		dat[, ID] = as.character(dat[, ID])
	}
	is_not_numeric = which(sapply(dat, function(x) is.element("integer64", class(x))))
	dat[is_not_numeric] = sapply(dat[is_not_numeric], as.numeric)
	on.exit(if (parallel & stopCluster) stop_parallel_computing(attr(parallel, "cluster")))
	i. = j. = NULL
	if (!parallel) {
		df_cv_list = lapply(unlist(grx), function(grx_x) derived_ts(dat, grx_x = grx_x, td = td,
								   ID = ID, ex_cols = ex_cols, x_list = x_list, der = der))
		if (length(df_cv_list) > 1) {
			df_cv_list = multi_left_join(df_list = df_cv_list, by = ID)
		} else {
			df_cv_list = as.data.frame(df_cv_list)
		}
	} else {
		df_cv_list = foreach(i. = unlist(grx),
							  .errorhandling = c('pass')) %dopar% {
								try(do.call(derived_ts, args = list(dat = dat, grx_x = i., td = td, ID = ID, ex_cols = ex_cols, x_list = x_list, der = der)), silent = TRUE)
					  		}
		if (length(df_cv_list) > 1) {
			df_cv_list = multi_left_join(df_list = df_cv_list, by = ID)
		} else {
			df_cv_list = as.data.frame(df_cv_list)
		}
	}
	return(df_cv_list)
}

#' @rdname derived_ts_vars
#' @export

derived_ts = function(dat, grx_x = NULL, x_list = NULL, td = NULL, ID = NULL, ex_cols = NULL,
					   der = c("cvs", "sums", "means", "maxs", "max_mins",
							   "time_intervals", "cnt_intervals", "total_pcts",
							   "cum_pcts", "partial_acfs")) {
	dat = checking_data(dat, note = FALSE)
	if (is.null(ID)) {
		dat$ID = rownames(dat)
		ID = 'ID'
	}
	if (is.null(x_list)) {
		num_x_list = get_names(dat = dat,
						 types = c('numeric', 'integer', 'double'),
						 ex_cols = c(ID, ex_cols), get_ex = FALSE)
	} else {
		num_x_list = x_list
	}

	if (!is.null(grx_x)) {
		if (is.null(td)) {
			cv_cols = num_x_list[grep(grx_x, paste(num_x_list))]
		} else {
			cv_cols = num_x_list[grep(grx_x, paste(num_x_list))[1:td]]
		}
	} else {
		if (is.null(td) || td > length(num_x_list)) {
			cv_cols = num_x_list
		} else {
			cv_cols = num_x_list[1:td]

		}
	}
	cv_cols = cv_cols[!is.na(cv_cols)]
	dat = dat[c(ID, cv_cols)]
	if (length(cv_cols) > 0) {
		dat = as.data.table(dat)
		name_n = orignal_nam = sim_nam = str_num = c()
		orignal_nam = names(dat[, cv_cols, with = FALSE])
		str_num = as.numeric(str_match(str_r = orignal_nam, pattern = "\\d+"))
		if (!any(is.na(str_num)) && !is.null(td) && length(str_num) == td) {
			name_n = paste(min(str_num), max(str_num), sep = "to")
		}
		if (is.null(name_n)) {
			name_n = "ts"
		}
		sim_nam = paste(unique(lapply(1:(length(orignal_nam) - 1),
								  function(x) sim_str(orignal_nam[x], orignal_nam[x + 1], sep = "_|[0-9]")))[[1]], collapse = "_")
						  		if (any(der == "cvs")) {
								dat = dat[, paste(sim_nam, name_n, "cvs", sep = "_") := ifelse(rowAllnas(dat[, cv_cols, with = FALSE]), NA,
				  rowCVs(dat[, cv_cols, with = FALSE], na.rm = TRUE))]
					  		}
		if (any(der == "sums")) {
			dat = dat[, paste(sim_nam, name_n, "sums", sep = "_") := ifelse(rowAllnas(dat[, cv_cols, with = FALSE]), NA,
																	  rowSums(dat[, cv_cols, with = FALSE], na.rm = TRUE))]
		}
		if (any(der == "means")) {
			dat = dat[, paste(sim_nam, name_n, "means", sep = "_") := ifelse(rowAllnas(dat[, cv_cols, with = FALSE]), NA,
																	   rowMeans(dat[, cv_cols, with = FALSE], na.rm = TRUE))]
		}
		if (any(der == "maxs")) {
			dat = dat[, paste(sim_nam, name_n, "maxs", sep = "_") := ifelse(rowAllnas(dat[, cv_cols, with = FALSE]), NA,
																	  rowMaxs(dat[, cv_cols, with = FALSE]))]
		}

		if (any(der == "max_mins")) {
			dat = dat[, paste(sim_nam, name_n, "max_mins", sep = "_") := ifelse(rowAllnas(dat[, cv_cols, with = FALSE]), NA,
																		  rowMaxMins(dat[, cv_cols, with = FALSE], na.rm = TRUE))]
		}
		if (any(der == "partial_acfs")) {
			dat = dat[, paste(sim_nam, name_n, "partial_acfs", sep = "_") := derived_partial_acf(dat[, cv_cols, with = FALSE])]
		}
		if (any(der == "time_intervals")) {
			dat = dat[, paste(orignal_nam, "time_intervals", sep = "_") := derived_interval(dat[, cv_cols, with = FALSE],
																					  interval_type = "time_interval")]
		}
		if (any(der == "cnt_intervals")) {
			dat = dat[, paste(orignal_nam, "cnt_intervals", sep = "_") := derived_interval(dat[, cv_cols, with = FALSE],
																					 interval_type = "cnt_interval")]
		}
		if (any(der == "total_pcts")) {
			dat = dat[, paste(orignal_nam, "total_pcts", sep = "_") := derived_pct(dat[, cv_cols, with = FALSE],
																			 pct_type = "total_pct")]
		}
		if (any(der == "cum_pcts")) {
			dat = dat[, paste(orignal_nam, "cum_pcts", sep = "_") := derived_pct(dat[, cv_cols, with = FALSE],
																		   pct_type = "cum_pct")]
		}
	}
	dat = quick_as_df(dat)
	return(dat)
}



#' Process time series data
#'
#' This function is used for time series data processing.
#'
#' @param dat  A data.frame contained only predict variables.
#' @param group The group of behavioral or status variables.
#' @param ID  The name of ID of observations or key variable of data. Default is NULL.
#' @param time The name of variable which is time when behavior was happened.
#' @details  The key to creating a good model is not the power of a specific modelling technique, but the breadth and depth of derived variables that represent a higher level of knowledge about the phenomena under examination.
#' @importFrom data.table setDT := dcast.data.table shift .SD
#' @examples
#' dat = data.frame(id = c(1,1,1,2,2,3,3,3,4,4,4,4,4,5,5,6,7,7,
#'                             8,8,8,9,9,9,10,10,11,11,11,11,11,11),
#'                      terms = c('a','b','c','a','c','d','d','a',
#'                                'b','c','a','c','d','a','c',
#'                                   'd','a','e','f','b','c','f','b',
#'                                'c','h','h','i','c','d','g','k','k'),
#'                      time = c(8,3,1,9,6,1,4,9,1,3,4,8,2,7,1,
#'                               3,4,1,8,7,2,5,7,8,8,2,1,5,7,2,7,3))
#'
#' time_series_proc(dat = dat, ID = 'id', group = 'terms',time = 'time')
#' @export

time_series_proc = function(dat, ID = NULL, group = NULL, time = NULL) {
  time_interval = NULL
  if (is.null(time)) stop("time variable is missing.\n")
  dat$time = as.numeric(dat[[time]])
  if (!is.null(ID)) {
    dat$ID = as.character(dat[[ID]])
  } else {
    dat$ID = as.character(rownames(dat))
    ID = "ID"
  }
  if (!is.null(group)) {
    dat$group = as.character(dat[[group]])
  } else {
    dat$group = "1"
  }
  dat = as.data.table(dat)
  dat = dat[,.SD[order(time)],by = 'ID']
  dat[,time_interval := as.numeric(data.table::shift(time,
                                                     fill = max(time,na.rm = TRUE),
                                                     type = "lead") - time),by = 'ID']
  dat = dcast.data.table(dat, ID ~ group,
                          fun.aggregate = list(cnt_x, sum_x, max_x, min_x, avg_x),
                          value.var = c('time_interval'))
  colnames(dat)[1] = ID
  quick_as_df(dat)
}


#' gather or aggregate data
#'
#' This function is used for gathering or aggregating data.
#'
#' @param dat  A data.frame contained only predict variables.
#' @param x_list The names of variables to gather.
#' @param ID  The name of ID of observations or key variable of data. Default is NULL.
#' @param FUN The function of gathering method.
#' @details  The key to creating a good model is not the power of a specific modelling technique, but the breadth and depth of derived variables that represent a higher level of knowledge about the phenomena under examination.
#' @importFrom stats aggregate
#' @examples
#' dat = data.frame(id = c(1,1,1,2,2,3,3,3,4,4,4,4,4,5,5,6,7,7,
#'                             8,8,8,9,9,9,10,10,11,11,11,11,11,11),
#'                      terms = c('a','b','c','a','c','d','d','a',
#'                                'b','c','a','c','d','a','c',
#'                                   'd','a','e','f','b','c','f','b',
#'                                'c','h','h','i','c','d','g','k','k'),
#'                      time = c(8,3,1,9,6,1,4,9,1,3,4,8,2,7,1,
#'                               3,4,1,8,7,2,5,7,8,8,2,1,5,7,2,7,3))
#'
#' gather_data(dat = dat, x_list = "time", ID = 'id', FUN = sum_x)
#' @export

gather_data = function(dat, x_list = NULL, ID = NULL, FUN = sum_x) {
	dat = checking_data(dat)
	if (is.null(x_list)) {
		x_list = get_names(dat, types = c("numeric","double","integer"),ex_cols = ID)
	}
	if (!is.null(ID)) {
		dat$ID = as.character(dat[[ID]])
	} else {
		dat$ID = as.character(rownames(dat))
		ID = "ID"
	}
	dat_sum = stats::aggregate(dat[,x_list],
		by = list(ID = dat$ID), FUN = FUN)
	names(dat_sum)[1] = ID
	return(dat_sum)
}



#' Process group numeric variables
#'
#' This function is used for grouped numeric data processing.
#'
#' @param dat  A data.frame contained only predict variables.
#' @param group The group of behavioral or status variables.
#' @param ID  The name of ID of observations or key variable of data. Default is NULL.
#' @param num_var The name of numeric variable to process.
#' @importFrom data.table setDT := dcast.data.table shift
#' @examples
#' dat = data.frame(id = c(1,1,1,2,2,3,3,3,4,4,4,4,4,5,5,6,7,7,
#'                             8,8,8,9,9,9,10,10,11,11,11,11,11,11),
#'                      terms = c('a','b','c','a','c','d','d','a',
#'                                'b','c','a','c','d','a','c',
#'                                   'd','a','e','f','b','c','f','b',
#'                                'c','h','h','i','c','d','g','k','k'),
#'                      time = c(8,3,1,9,6,1,4,9,1,3,4,8,2,7,1,
#'                               3,4,1,8,7,2,5,7,8,8,2,1,5,7,2,7,3))
#'
#' time_series_proc(dat = dat, ID = 'id', group = 'terms',time = 'time')
#' @export
var_group_proc = function(dat, ID = NULL, group = NULL, num_var = NULL){
   dat = quick_as_df(dat)
   if(!is.null(ID)){
     dat$ID = as.character(dat[[ID]])
   }else{
     dat$ID = as.character(rownames(dat))
     ID = "ID"
   }
   if (!is.null(group)){
     dat$group = as.character(dat[[group]])
   }else{
     dat$group = "1"
   }
   if (!is.null(num_var)){
     dat[[num_var]] = as.numeric(dat[[num_var]])
   } else{
     dat$num_var = 1
     num_var = "num_var"
   }
   dat = as.data.table(dat)
   dat = dcast.data.table(dat, ID ~ group,
                           fun.aggregate = list(cnt_x, sum_x, max_x, min_x, avg_x), value.var = c(num_var))
   colnames(dat)[1] = ID
   quick_as_df(dat)
}


#' Processing of Time or Date Variables
#'
#' This function is not intended to be used by end user.
#'
#' @param  df_tm  A data.frame
#' @param  x  Time variable.
#' @param  enddate  End time.
#' @param units  Units of diff_time, "secs", "mins", "hours", "days", "weeks" is available.
#' @export
#' @importFrom data.table  hour setnames

time_vars_process = function(df_tm = df_tm, x, enddate = NULL,
							 units = c("secs", "mins", "hours", "days", "weeks")) {
	if (is_date(df_tm[[x]]) & x != enddate & (!is.null(enddate) && is_date(df_tm[[enddate]])) &
		length(units) > 0){
		newname = c()
		for(unit in units){
			   if(length(unit) > 0 && is.element("days",unit)) {
				   df_tm = within(df_tm, {
					   diff_days = floor(as.numeric(difftime(df_tm[[enddate]], df_tm[[x]],units = "days")))
				   })
				   newname[unit] = paste0(x, "_to_", enddate, "_diff_days")
				   df_tm = re_name(df_tm, oldname =  c("diff_days"),  newname = newname[unit])
			   }else{
				      if(length(unit) > 0 && is.element("secs",unit)) {
						  df_tm = within(df_tm, {
							  diff_secs = floor(as.numeric(difftime(df_tm[[enddate]], df_tm[[x]],units = "secs")))
						  })
						  newname[unit] = paste0(x, "_to_", enddate, "_diff_secs")
						  df_tm = re_name(df_tm, oldname =  c("diff_secs"),  newname = newname[unit])
					  }else{
						     if(length(unit) > 0 && is.element("mins",unit)) {
								 df_tm = within(df_tm, {
									 diff_mins = floor(as.numeric(difftime(df_tm[[enddate]], df_tm[[x]],units = "mins")))
								 })
								 newname[unit] = paste0(x, "_to_", enddate, "_diff_mins")
								 df_tm = re_name(df_tm, oldname =  c("diff_mins"),  newname = newname[unit])
							 }else{
								    if(length(unit) > 0 && is.element("hours",unit)) {
										df_tm = within(df_tm, {
											diff_hours = floor(as.numeric(difftime(df_tm[[enddate]], df_tm[[x]],units = "hours")))
										})
										newname[unit] = paste0(x, "_to_", enddate, "_diff_hours")
										df_tm = re_name(df_tm, oldname =  c("diff_hours"),  newname = newname[unit])
									}else{
										   if(length(unit) > 0 && is.element("weeks",unit)) {
											   df_tm = within(df_tm, {
												   diff_weeks = floor(as.numeric(difftime(df_tm[[enddate]], df_tm[[x]],units = "weeks")))
											   })
											   newname[unit] = paste0(x, "_to_", enddate, "_diff_weeks")
											   df_tm = re_name(df_tm, oldname =  c("diff_weeks"),  newname = newname[unit])
										}
								 }
						  }
				   }
			}
		}
		return(df_tm[newname])
	}
}


#' time_variable
#'
#' This function is not intended to be used by end user.
#'
#' @param dat A data.frame.
#' @param date_cols Time variables.
#' @param enddate End time.
#' @param units Units of diff_time, "secs", "mins", "hours", "days", "weeks" is available.
#' @export

time_variable = function(dat, date_cols = NULL, enddate = NULL, 
						 units = c("secs", "mins", "hours", "days", "weeks")) {
	dat = checking_data(dat = dat)
	date_cols1 = NULL
	if (!is.null(date_cols)) {
		date_cols1 = names(dat)[colnames(dat) %islike% c(enddate, date_cols)]
	} else {
		date_cols1 = names(dat)
	}
	df_date = dat[date_cols1]
	df_date = time_transfer(dat = df_date, date_cols = c(enddate, date_cols))
	df_date = df_date[!colAllnas(df_date)]
	df_tm = df_date[sapply(df_date, is_date)]

	time_vars_list = lapply(date_cols1, function(x) time_vars_process(df_tm = df_tm, x, enddate 
		= enddate,units = units))
	index = 0;
	j = 1
	for (i in 1:length(time_vars_list)) {
		if (is.null(time_vars_list[[i]])) {
			index[j] = i
			j = j + 1
		}
	}
	tm_vars_tbl = as.data.frame(Reduce("cbind", time_vars_list[-index]))
	dat = cbind(dat, tm_vars_tbl)
	return(dat)
}





#' Processing of Address Variables
#'
#' This function is not intended to be used by end user.
#'
#' @param df_city A data.frame.
#' @param x Variables of city,
#' @param city_class  Class or levels of cities.
#' @export


city_varieble_process = function(df_city, x, city_class) {
	if (class(df_city)[1] != "data.frame") {
		df_city = quick_as_df(df_city)
	}
	df_city = within(df_city, {
		city_level = NA
		city_level[df_city[[x]] %alike% city_class[1]] = 1
		city_level[df_city[[x]] %alike% city_class[2]] = 2
		city_level[df_city[[x]] %alike% city_class[3]] = 3
		city_level[df_city[[x]] %alike% city_class[4]] = 4
		city_level[df_city[[x]] %alike% city_class[5]] = 5
		city_level[df_city[[x]] %alike% city_class[6]] = 6
		city_level[is.null(df_city[[x]]) | df_city[[x]] == "NULL" | df_city[[x]] == "" | df_city[[x]] == "missing"|
			df_city[[x]] == "Missing" | city_level == "null" | df_city[[x]] == "NA"] = -1
		city_level[is.na(city_level)] = -1
	})
	city_level_name = paste(x, "city_level", sep = "_")
	df_city = re_name(dat = df_city, oldname = "city_level", newname = city_level_name)
	return(df_city[city_level_name])
}



#' city_varieble
#'
#' This function is used for city variables derivation.
#'
#' @param df  A data.frame.
#' @param city_cols Variables of city,
#' @param city_pattern  Regular expressions, used to match city variable names. Default is "city$".
#' @param city_class  Class or levels of cities.
#' @param parallel Logical, parallel computing. Default is TRUE.
#' @importFrom dplyr group_by mutate summarize  summarise n  count %>% filter left_join
#' @importFrom parallel detectCores  clusterExport clusterCall makeCluster stopCluster
#' @importFrom doParallel registerDoParallel
#' @importFrom foreach foreach %dopar% %do%  registerDoSEQ
#' @export

city_varieble = function(df = df, city_cols = NULL,
						  city_pattern = NULL, city_class = city_class,
						  parallel = TRUE) {
	if (class(df)[1] != "data.frame") {
		df  = quick_as_df(df)
	}
	if (is.null(city_cols)) {
		city_index = grepl(city_pattern, paste(colnames(df)))
		city_cols = names(df[city_index])
	} else {
		city_cols = names(df[city_cols])
	}
	df_city = df[city_cols]
	if (parallel) {
		parallel = start_parallel_computing(parallel)
		stopCluster = TRUE
	} else {
		parallel = stopCluster = FALSE
	}
	on.exit(if (parallel & stopCluster) stop_parallel_computing(attr(parallel, "cluster")))
	i. = NULL
	df_city_list = list()
	if (!parallel) {
		df_city_list = lapply(city_cols, function(x) city_varieble_process(df_city = df_city, x = x, city_class = city_class))
		df_city_tbl = Reduce("cbind", df_city_list) %>% as.data.frame()
	} else {
		df_city_list = foreach(i. = city_cols, .combine = "c") %dopar% {
			try(do.call(city_varieble_process,
						args = list(df_city = df_city, x = i., city_class = city_class)),
				silent = TRUE)
		}
		df_city_tbl = quick_as_df(df_city_list)
	}
	df = cbind(df, df_city_tbl)
	return(df)
}



#' Replace Value
#'
#' \code{replace_value} is for replacing values of some variables .
#' \code{replace_value_x} is for replacing values of a variable.
#'
#' @param dat  A data.frame.
#' @param x  Name of variable to replace value.
#' @param x_list Names of variables to replace value.
#' @param x_pattern  Regular expressions, used to match variable names.
#' @param replace_dat A data.frame contains value to replace.
#' @param MARGIN A vector giving the subscripts which the function will be applied over. E.g., for a matrix 1 indicates rows, 2 indicates columns, c(1, 2) indicates rows and columns. Where X has named dimnames, it can be a character vector selecting dimension names.
#' @param RE_NAME Logical, rename the replaced variable.
#' @param VALUE Values to replace.
#' @param parallel Logical, parallel computing. Default is TRUE.
#' @importFrom dplyr group_by mutate summarize  summarise n  count %>% filter left_join
#' @importFrom parallel detectCores  clusterExport clusterCall makeCluster stopCluster
#' @importFrom doParallel registerDoParallel
#' @importFrom foreach foreach %dopar% %do%  registerDoSEQ
#' @export



replace_value = function(dat = dat, x_list = NULL,
						  x_pattern = NULL, replace_dat, MARGIN = 2,
						  VALUE = if (MARGIN == 2) colnames(replace_dat) else rownames(replace_dat),
						  RE_NAME = TRUE,
						  parallel = FALSE) {
	dat = checking_data(dat)
	if (is.null(x_list)) {
		if (!is.null(x_pattern)) {
			x_index = grepl(x_pattern, paste(colnames(dat)))
			x_list = names(dat[x_index])
		} else {
			x_list = get_names(dat)
		}

	}
	if (parallel) {
		parallel = start_parallel_computing(parallel)
		stopCluster = TRUE
	} else {
		parallel = stopCluster = FALSE
	}
	on.exit(if (parallel & stopCluster) stop_parallel_computing(attr(parallel, "cluster")))
	i. = NULL
	dat_list = list()
	if (!parallel) {
		dat_list = lapply(x_list, function(x) replace_value_x(dat = dat, x = x, replace_dat = replace_dat, MARGIN = MARGIN, VALUE = VALUE, RE_NAME = RE_NAME))
		dat_tbl = Reduce("cbind", dat_list) %>% quick_as_df()
	} else {
		dat_list = foreach(i. = x_list, .combine = "c") %dopar% {
			try(do.call(city_varieble_process,
						args = list(dat = dat, x = i., replace_dat = replace_dat)),
				silent = TRUE)
		}
		dat_tbl = quick_as_df(dat_list)
	}
	dat = cbind(dat, dat_tbl)
	return(dat)
}
#' @rdname replace_value
#' @export

replace_value_x = function(dat, x, replace_dat, MARGIN = 2, 
				VALUE = if (MARGIN == 2) colnames(replace_dat) else rownames(replace_dat), RE_NAME = TRUE) {
	dat = checking_data(dat)
	replace_dat = checking_data(replace_dat)
	dat = within(dat, {
		x_level = NA
		if (MARGIN == 2) {
			for (i in 1:ncol(replace_dat)) {
				x_level[dat[[x]] %alike% replace_dat[i]] = VALUE[i]
			}
		} else {
			for (i in 1:nrow(replace_dat)) {
				x_level[dat[[x]] %alike% replace_dat[i,]] = VALUE[i]
			}
		}

	})
	if (RE_NAME) {
		new_name = paste(x, "replace", sep = "_")
		dat = re_name(dat = dat, oldname = "x_level", newname = new_name)
	}
	return(dat[new_name])
}



#' add_variable_process
#'
#' This function is not intended to be used by end user.
#'
#' @param  add  A data.frame contained address variables.
#' @export
add_variable_process = function(add) {
    # acquire a sets of addresses
    add1 = as.data.frame(add)
    sim1 = colname1 = list()
    for (i in 1:ncol(add1)) {
        if (i >= ncol(add1)) break
        sim1[[i]] = apply(add1[, i:ncol(add1)], 2,
                       function(x) {
                           ifelse(add1[, i] %alike% x, 1, 0)
                       })
        colname1[[i]] = lapply(names(add1)[i:(ncol(add1))], function(n) paste(names(add1)[i], n, sep = '_WITH_'))
    }
    sim1 = data.frame(t(unlist(sim1)), stringsAsFactors = FALSE)
    names(sim1) = unlist(colname1)
    # find the variables which are computing similarity with themselves
    splitvar = strsplit(names(sim1), "_WITH_")
    vars = c()
    for (i in 1:(length(sim1))) {
        if (splitvar[[i]][1] == splitvar[[i]][2]) {
            vars[[i]] = names(sim1)[i]
        } else {
            vars[[i]] = NA
        }
    }
    # get the final results
    sim = sim1[is.na(vars)]
    simm = as.vector(sim)
    return(simm)
}


#' address_varieble
#'
#' This function is not intended to be used by end user.
#'
#' @param df  A data.frame.
#' @param address_cols Variables of address,
#' @param address_pattern  Regular expressions, used to match address variable names.
#' @param parallel Logical, parallel computing. Default is TRUE.
#' @importFrom dplyr group_by mutate summarize  summarise n  count %>% filter left_join
#' @importFrom parallel detectCores  clusterExport clusterCall makeCluster stopCluster
#' @importFrom doParallel registerDoParallel
#' @importFrom foreach foreach %dopar% %do%  registerDoSEQ
#' @export
address_varieble = function(df, address_cols = NULL, address_pattern = NULL, parallel = TRUE) {
    if (class(df)[1] != "data.frame") {
        df = as.data.frame(df)
    }
    if (is.null(address_cols)) {
        address_cols = grepl(address_pattern, paste(colnames(df)))
        address_vars = names(df)[address_cols]
    } else {
        address_vars = names(df[address_cols])
    }
    df_add = df[address_vars]
    if (parallel) {
        parallel = start_parallel_computing(parallel)
        stopCluster = TRUE
    } else {
        parallel = stopCluster = FALSE
    }
    on.exit(if (parallel & stopCluster) stop_parallel_computing(attr(parallel, "cluster")))
    i. = NULL
    df_add_list = list()
    if (!parallel) {
        df_add_list = lapply(1:nrow(df_add), function(i.) add_variable_process(add = df_add[i.,]))
        df_add_tbl = Reduce("cbind", df_add_list) %>% as.data.frame()
    } else {
        df_add_list = foreach(i. = 1:nrow(df_add), .combine = "c") %dopar% {
            try(do.call(add_variable_process, args = list(add = df_add[i.,])), silent = TRUE)
        }
        df_add_tbl = as.data.frame(df_add_list)
    }
    return(df_add_tbl)
}



#' variable_process
#'
#' This function is not intended to be used by end user.
#'
#' @param  add  A data.frame
#' @importFrom data.table :=
#' @export
variable_process = function(add) {
    td = new3 = new2 = grx_x = colname1 = NULL

    # acquire a sets of addresses
    cv_cols = grep(grx_x, paste(colnames(dat)))[1:td]
    cv_cols = cv_cols[!is.na(cv_cols)]
    #cv_folds
    if (length(cv_cols) > 0) {
        dat = dat[, new2 := ifelse(rowAllnas(dat[, cv_cols, with = FALSE]), NA,
        rowSums(dat[, cv_cols, with = FALSE], na.rm = TRUE))]
        dat = dat[, new3 := ifelse(rowAllnas(dat[, cv_cols, with = FALSE]), NA,
        rowMeans(dat[, cv_cols, with = FALSE], na.rm = TRUE))]
    }
    sim1 = data.frame(t(unlist(sim1)), stringsAsFactors = FALSE)
    names(sim1) = unlist(colname1)
    # find the variables which are computing similarity with themselves
    splitvar = strsplit(names(sim1), "_WITH_")
    vars = c()
    for (i in 1:(length(sim1))) {
        if (splitvar[[i]][1] == splitvar[[i]][2]) {
            vars[[i]] = names(sim1)[i]
        } else {
            vars[[i]] = NA
        }
    }
    # get the final results
    sim = sim1[is.na(vars)]
    simm = as.vector(sim)
    return(simm)
}


#' derived_interval
#'
#' This function is not intended to be used by end user.
#'
#' @param  dat_s  A data.frame contained only predict variables.
#' @param interval_type  Available of c("cnt_interval", "time_interval")
#' @export
#' @importFrom data.table first
derived_interval = function(dat_s, interval_type = c("cnt_interval", "time_interval")) {

	interval_list = apply(dat_s, 1, function(m) {
		if (interval_type == "time_interval") {
			cnt_ind = inter_ind = which(!is.na(m) | m != 0)
			interval = rep(NA, length(m))
			if (length(cnt_ind) > 1) {


				interval[cnt_ind] = vapply(1:(length(inter_ind)), function(i) {

					ifelse(i <= length(inter_ind), abs(inter_ind[i] - inter_ind[i + 1]), NA)

				}, FUN.VALUE = numeric(1))
			}
			interval = c(abs(1 - data.table::first(inter_ind)), interval[-length(interval)])
		} else {
			cnt_ind = which(m >= 0)
			inter_ind = unlist(m, use.names = FALSE)[c(cnt_ind)]
			interval = rep(NA, length(m))
			if (length(cnt_ind) > 1) {


				interval[cnt_ind] = vapply(1:(length(inter_ind)), function(i) {

					ifelse(i < length(inter_ind), abs(inter_ind[i] - inter_ind[i + 1]), inter_ind[i])

				}, FUN.VALUE = numeric(1))
			}
		}

		interval
	})
	interval_list = as.data.frame(t(interval_list))
	interval_list
}


#' derived_pct
#'
#' This function is not intended to be used by end user.
#'
#' @param  dat_s  A data.frame contained only predict variables.
#' @param pct_type  Available of "total_pct"
#' @export
derived_pct = function(dat_s, pct_type = "total_pct") {
    dat_s[is.na(dat_s)] = 0
    if (pct_type == "total_pct") {
        pct_list = dat_s / rowSums(dat_s, na.rm = TRUE)
    } else {
        cnt_pct_list = dat_s / rowSums(dat_s, na.rm = TRUE)
        pct_list = apply(cnt_pct_list, 1, function(x) cumsum(x))
        pct_list = as.data.frame(t(pct_list))
    }

    pct_list
}


#' derived_partial_acf
#'
#' This function is not intended to be used by end user.
#'
#' @param  dat_s  A data.frame
#' @export

derived_partial_acf = function(dat_s) {
    dat_s[is.na(dat_s)] = 0
    p_acf = apply(dat_s, 1, function(x) ifelse(length(unique(x)) > 2, mean(abs(ar(ts(x), FALSE,
    length(unique(x)) - 1, na.action = na.pass)$partialacf)), NA))
    p_acf
}

#' sim_str
#'
#' This function is not intended to be used by end user.
#'
#' @param a A string
#' @param b  A string
#' @param sep Seprater of strings. Default is "_|[.]|[A-Z]".
#' @export
sim_str = function(a, b, sep = "_|[.]|[A-Z]") {
    intersect(strsplit(a, sep)[[1]], strsplit(b, sep)[[1]])
}


#' Recovery Percent Format
#'
#' \code{de_percent} is  a small function for recoverying percent format..
#' @param x  Character with percent formant.
#' @param digits  Number of digits.Default: 2.
#' @return  x without percent format.
#' @examples
#' de_percent("24%")
#' @export

de_percent = function(x, digits = 2) {
    x = as.character(x)
    round(as.numeric(gsub("%", "", x)) / 100, digits = digits)
}


#' Merge Category
#'
#' \code{merge_category} is  for merging   category of nominal variables which number of categories is more than m or percent of samples in any categories is less than p.
#' @param dat A data frame with x and target.
#' @param char_list The list of charecteristic variables that need to merge categories, Default is NULL. In case of NULL,merge categories for all variables of string type.
#' @param ex_cols A list of excluded variables. Default is NULL.
#' @param m The minimum number of categories.
#' @param note Logical, outputs info. Default is TRUE.
#' @return  A data.frame with merged category variables.
#' @examples
#' #merge_catagory
#' dat =  merge_category(lendingclub,ex_cols = "id$|_d$")
#' char_list = get_names(dat = dat,types = c('factor', 'character'),
#' ex_cols = "id$|_d$", get_ex = FALSE)
#' str(dat[,char_list])
#' @export


merge_category = function(dat, char_list = NULL, ex_cols = NULL,  m = 10, note = TRUE) {
	opt = options(scipen = 200, stringsAsFactors = FALSE, "warn" = -1)
	if (note) cat_line(paste0("-- Merging categories..."), col = love_color("dark_green"))
	dat = char_to_num(dat = dat, char_list = char_list, ex_cols = ex_cols, note = FALSE)
	if (is.null(char_list)) {
		char_list = get_names(dat = dat,
						  types = c('factor', 'character'),
						  ex_cols = ex_cols, get_ex = FALSE)
	}
	for (x in char_list) {
		dt_x = table(as.character(dat[, x]), useNA = "no")
		over_vars = NULL
		if (length(dt_x) > m) {
			over_vars = order(abs(dt_x), decreasing = TRUE)[m:length(dt_x)]
		} 
		if (length(over_vars) > 0) {
			max_class = over_vars
			dat[which(dat[, x] %in% names(dt_x[max_class])), x] = "other"
		}
	}
	return(dat)
	options(opt) # reset
}

#' character to number
#'
#' \code{char_to_num} is  for transfering character variables which are actually numerical numbers containing strings  to numeric.
#' @param dat A data frame
#' @param ex_cols A list of excluded variables. Regular expressions can also be used to match variable names. Default is NULL.
#' @param char_list The list of charecteristic variables that need to merge categories, Default is NULL. In case of NULL, merge categories for all variables of string type.
#' @param note Logical, outputs info. Default is TRUE.
#' @param m The minimum number of categories.
#' @param p The max percent of categories.
#' @return  A data.frame
#' @examples
#' dat_sub = lendingclub[c('dti_joint',	'emp_length')]
#' str(dat_sub)
#' #variables that are converted to numbers containing strings
#' dat_sub = char_to_num(dat_sub)
#' str(dat_sub)
#' @export

char_to_num = function (dat, char_list = NULL, m = 0, p = 0.5, note = FALSE,
                        ex_cols = NULL) {
  opt = options(scipen = 200, warn = -1, stringsAsFactors = FALSE)
  if (note)
    cat_line(paste("-- Transfering character variables which are actually numerical to numeric"),
             col = love_color("dark_green"))
  if (is.null(char_list)) {
    char_list = get_names(dat = dat, types = c("factor",
                                               "character"), ex_cols = ex_cols, get_ex = FALSE)
  }
  
  for (x in char_list) {
    dt_x = table(as.character(dat[, x]), useNA = "no")
    if(any(grepl("\\,",names(dt_x)))){
      x_name = gsub("\\,","",names(dt_x))
    }else{
      x_name = names(dt_x)
    }
    char_num = tryCatch({
      as.numeric(x_name)
    }, error = function(e) {
      cat("ERROR :", conditionMessage(e), "\n")
    }, warning = function(w) {
      as.numeric(x_name)
    })
    char_num_ind = which(!is.na(char_num))
    if (length(char_num_ind) > m && length(dt_x) > m &&
        round(length(char_num_ind)/length(dt_x),  2) > p ) {
      dat[, x] = as.numeric(gsub("\\,","",as.character(dat[, x])))
    }
  }
  return(dat)
  options(opt)
}




#' Logarithmic transformation
#'
#' \code{log_trans} is for logarithmic transformation
#' @param dat A data.frame.
#' @param target The name of target variable.
#' @param x_list A list of x variables.
#' @param cor_dif The correlation coefficient difference with the target of logarithm transformed variable and original variable.
#' @param ex_cols Names of excluded variables. Regular expressions can also be used to match variable names. Default is NULL.
#' @param note Logical, outputs info. Default is TRUE.
#' @return Log transformed data.frame.
#' @examples
#' dat = log_trans(dat = UCICreditCard, target = "default.payment.next.month",
#' x_list =NULL,cor_dif = 0.01,ex_cols = "ID", note = TRUE)
#' @importFrom cli cat_rule cat_line cat_bullet
#' @export


log_trans = function(dat, target, x_list = NULL,cor_dif = 0.01,ex_cols = NULL, note = TRUE){
  log_x = log_vars(dat = dat, target = target, x_list = x_list, cor_dif = cor_dif,ex_cols = ex_cols )
  if(!is.null(log_x)){
    if(note)cat_line("-- Logarithmic transformation", col = love_color("deep_green"))
    dat[,log_x] = lapply(dat[,log_x],function(x){
	x = as.numeric(unlist(x))
	x[which(!is.na(x) & x > 0)] = log(x[which(!is.na(x) & x > 0)])
	x
	 }
	)
    dat = re_name(dat, oldname = log_x, newname =  paste(log_x,"log",sep = "_"))
    if(note){
      cat_line("-- Following variables are log transformed:", col = love_color("dark_green"))
      cat_bullet(paste(format(log_x),paste(log_x,"log",sep = "_"),sep  = " -> "), col = "darkgrey")
    }
  }
  return(dat)
}

#' @rdname log_trans
#' @export

log_vars = function(dat, x_list = NULL, target = NULL,cor_dif = 0.01,ex_cols = NULL){
  log_x_list = list()
  dat = checking_data(dat = dat,target = target)
  if(is.null(x_list)){
    x_list = get_names(dat = dat,types = c("numeric","double","integer"),ex_cols = c(ex_cols,target))
  }
  if(length(x_list) > 0){
    log_x_list = lapply(dat[x_list],function(x){
      flag = as.numeric(as.factor(dat[[target]]))
      x = as.numeric(unlist(x))
      if(length(which(!is.na(x) & x > 0))> 30 ){
        log_x = log(x[!is.na(x) & x > 0])
        cor_log_x = cor(log_x,flag[!is.na(x) & x > 0],method = "pearson",use = "complete.obs")
        cor_x = cor(x[!is.na(x) & x > 0],flag[!is.na(x) & x > 0],method = "pearson",use = "complete.obs")
        abs(cor_log_x) - abs(cor_x)
      }else{
        0
      }
    })
  }
  logvars = names(log_x_list[which(log_x_list >= cor_dif)])
  return(logvars)
}


#' Ranking Percent Process
#'
#' \code{ranking_percent_proc} is for processing ranking percent variables.
#' \code{ranking_percent_dict} is for generating ranking percent dictionary.
#' @param dat A data.frame.
#' @param x_list A list of x variables.
#' @param x The name of an independent variable.
#' @param rank_dict The dictionary of rank_percent generated by \code{ranking_percent_dict} .
#' @param ex_cols Names of excluded variables. Regular expressions can also be used to match variable names. Default is NULL.
#' @param pct Percent of rank. Default is 0.01.
#' @param note Logical, outputs info. Default is TRUE.
#' @param parallel Logical, parallel computing. Default is FALSE.
#' @param save_data Logical, save results in locally specified folder. Default is FALSE
#' @param file_name The name for periodically saved rank_percent data file. Default is "dat_rank_percent".
#' @param dir_path The path for periodically saved rank_percent data file Default is "tempdir()"
#' @param ...  Additional parameters.
#' @return Data.frame with new processed variables.
#' @examples
#' rank_dict = ranking_percent_dict(dat = UCICreditCard[1:1000,],
#' x_list = c("LIMIT_BAL","BILL_AMT2","PAY_AMT3"), ex_cols = NULL )
#' UCICreditCard_new = ranking_percent_proc(dat = UCICreditCard[1:1000,],
#' x_list = c("LIMIT_BAL", "BILL_AMT2", "PAY_AMT3"), rank_dict = rank_dict, parallel = FALSE)
#' @importFrom data.table is.data.table
#' @export

ranking_percent_proc = function(dat, ex_cols = NULL, x_list = NULL, rank_dict = NULL,pct = 0.01,
                             parallel = FALSE, note = FALSE, save_data = FALSE,file_name = NULL,dir_path= tempdir(), ... ){
    if (note) cat_line("-- Processing ranking percent variables.\n", col = love_color("dark_green"))
    x_list = get_x_list(x_list = x_list, dat_train = dat, dat_test = NULL, ex_cols = ex_cols)
    if (length(x_list) > 0) {
        num_x_list = get_names(dat = dat[x_list], types = c('numeric', 'integer', 'double'),
                           ex_cols = c(ex_cols), get_ex = FALSE)
    } else {
        stop("No variable in the x_list or ex_col excludes all variables.\n ")
    }
    if (is.null(rank_dict)) {
        rank_dict = ranking_percent_dict(dat = dat, x_list = num_x_list, ex_cols = ex_cols,
		save_data = save_data, file_name = file_name,dir_path = dir_path )
    }
    if (any(is.element(sapply(rank_dict, class), c("character", "factor")))) {
        rank_dict[, which(is.element(sapply(rank_dict, class), c("character", "factor")))] =
		as.numeric(as.character(rank_dict[, which(is.element(sapply(rank_dict, class), c("character", "factor")))]))
    }
    new_x = paste(num_x_list,"rank_pct",sep = "_")
    if (length(num_x_list) > 0) {
        dat[, new_x] = loop_function(func = ranking_percent_proc_x, x_list = num_x_list,
                                          args = list(dat = dat, rank_dict = rank_dict,pct = pct),
                                          bind = "cbind", as_list = FALSE, parallel = parallel)
    }
	if (save_data) {
        dir_path = ifelse(!is.character(dir_path),
                      tempdir(), dir_path)
        if (!dir.exists(dir_path)) dir.create(dir_path)
        if (!is.character(file_name)) file_name = NULL
        save_data(dat, file_name = ifelse(is.null(file_name), "dat_rank_percent",
		paste(file_name, "dat_rank_percent", sep = ".")), dir_path = dir_path, note = note)
    }
    return(dat)
}



#' @rdname ranking_percent_proc
#' @export

ranking_percent_proc_x = function(dat, x,rank_dict = NULL,pct = 0.01) {
    if ((!is.null(x) && is.character(x)) & !is.null(dat)) {
        dat_x = abs(dat[, x][complete.cases(dat[, x])])
    } else {
        if (!is.null(dat) && is.vector(dat)) {
            dat_x = abs(dat[complete.cases(dat)])
            x = "rank_x"
        } else {
            if (!is.null(dat) && (is.data.frame(dat) | is.data.table(dat)) && length(dat) == 1 && unlist(dat) > 2) {
                x = colnames(dat)[1]
                dat = unlist(dat)
                dat_x = abs(dat[complete.cases(dat)])
            } else {
                stop("dat is null & x is null")
            }
        }
    }
    if (is.null(rank_dict)) {
        QL = quantile(dat_x, 0.01)
        QU = quantile(dat_x, 0.99)
        QU_QL = QU - QL
        outliers = QU + 4 * QU_QL
        dat_x[dat_x > QU + 4 * QU_QL] = outliers
        rank_dict_x = NULL
        rank_x = quantile(ecdf(unique(dat_x)), seq(0, 1, by = pct))
        rank_percent = as.double(sub("%", "", names(rank_x))) / 100
        rank_dict_x = data.frame(rank_percent, rank_x)
        colnames(rank_dict_x)[2] = x
    }
    if (any(is.element(sapply(rank_dict, class), c("character", "factor")))) {
        rank_dict[, which(is.element(sapply(rank_dict, class), c("character", "factor")))] =
		as.numeric(as.character(rank_dict[, which(is.element(sapply(rank_dict, class), c("character", "factor")))]))
    }

    x_new = paste(x,"rank_pct",sep = "_")
    dat[which(!is.na(dat[, x])), x_new] = vapply(dat_x, function(i) {
	round(rank_dict[, "rank_percent"][which.min(i > rank_dict[, x])], 3)
	}, FUN.VALUE = numeric(1))
    return(dat[x_new])
}


#' @rdname ranking_percent_proc
#' @export
ranking_percent_dict = function(dat, x_list = NULL, ex_cols = NULL, pct = 0.01, parallel = FALSE,
			   save_data = FALSE, file_name = NULL, dir_path = tempdir(), ...) {
	x_list = get_x_list(x_list = x_list, dat_train = dat, dat_test = NULL, ex_cols = ex_cols)
	if (length(x_list) > 0) {
		num_x_list = get_names(dat = dat[x_list], types = c('numeric', 'integer', 'double'),
						   ex_cols = c(ex_cols), get_ex = FALSE)
	} else {
		stop("No variable in the x_list or ex_col excludes all variables.\n ")
	}
	rank_dict = NULL
	if (length(num_x_list) > 0) {
		rank_dict = loop_function(func = ranking_percent_dict_x, x_list = num_x_list,
										  args = list(dat = dat, pct = pct),
										  bind = "cbind", as_list = TRUE, parallel = parallel)
		rank_dict = multi_left_join(df_list = rank_dict, by = "rank_percent")
		rank_dict[, "rank_percent"] = as.numeric(as.character(rank_dict[, "rank_percent"]))
	}
	if (save_data) {
		dir_path = ifelse(!is.character(dir_path),
					  tempdir(), dir_path)
		if (!dir.exists(dir_path)) dir.create(dir_path)
		if (!is.character(file_name)) file_name = NULL
		save_data(rank_dict, file_name = ifelse(is.null(file_name), "rank_percent_dict",
		paste(file_name, "rank_percent_dict", sep = ".")), dir_path = dir_path, note = TRUE)
	}
	return(rank_dict)
}



#' @rdname ranking_percent_proc
#' @export

ranking_percent_dict_x = function(dat, x = NULL, pct = 0.01) {
	if ((!is.null(x) && is.character(x)) & !is.null(dat)) {
		dat_x = abs(dat[, x][complete.cases(dat[, x])])
	} else {
		if (!is.null(dat) && is.vector(dat)) {
			dat_x = abs(dat[complete.cases(dat)])
			x = "rank_x"
		} else {
			if (!is.null(dat) && (is.data.frame(dat) | is.data.table(dat)) && length(dat) == 1 && unlist(dat) > 2) {
				x = colnames(dat)[1]
				dat = unlist(dat)
				dat_x = abs(dat[complete.cases(dat)])
			} else {
				stop("dat is null & x is null")
			}
		}
	}
	QL = quantile(dat_x, 0.01)
	QU = quantile(dat_x, 0.99)
	QU_QL = QU - QL
	outliers = QU + 4* QU_QL
	dat_x[dat_x > QU + 4 * QU_QL] = outliers
	rank_dict_x = NULL
	rank_x = quantile(ecdf(unique(dat_x)), seq(0, 1, by = pct))
	rank_percent = as.double(sub("%", "", names(rank_x))) / 100
	rank_dict_x = data.frame(rank_percent, rank_x)
	colnames(rank_dict_x)[2] = x
	return(rank_dict_x)
}


#' TF-IDF
#'
#' The \code{term_filter} is for filtering stop_words and low frequency words.
#' The \code{term_idf} is for computing idf(inverse documents frequency) of terms.
#' The \code{term_tfidf} is for computing tf-idf of documents.
#' @param term_df A data.frame with id and term.
#' @param low_freq Use rate of terms or use numbers of terms.
#' @param stop_words Stop words.
#' @param n_total Number of documents.
#' @param idf A data.frame with idf.
#' @return A data.frame
#' @examples
#' term_df = data.frame(id = c(1,1,1,2,2,3,3,3,4,4,4,4,4,5,5,6,7,7,
#'                             8,8,8,9,9,9,10,10,11,11,11,11,11,11),
#' terms = c('a','b','c','a','c','d','d','a','b','c','a','c','d','a','c',
#'           'd','a','e','f','b','c','f','b','c','h','h','i','c','d','g','k','k'))
#' term_df = term_filter(term_df = term_df, low_freq = 1)
#' idf = term_idf(term_df)
#' tf_idf = term_tfidf(term_df,idf = idf)
#' @importFrom data.table setDT .N := dcast merge.data.table as.data.table
#' @export

term_tfidf = function(term_df, idf = NULL){
  n_term = tf = id = term = NULL
  term_df = as.data.table(term_df,key = 'id')
  term_df[,n_term := 1]
  term_df[,n_term := sum(n_term), by= list(id,term)]
  term_df[,tf := n_term / sum(n_term), by= 'id']
  n_total = length(unique(term_df$id))
  if (is.null(idf)){
    idf = term_idf(term_df = term_df,n_total = n_total)
  }
  term_df = data.table::merge.data.table(term_df,idf,by = 'term',all.x = TRUE)
  term_df[,tfidf := tf * idf]
  nw = tryCatch(n_total*as.integer(nrow(idf)),warning = function(w){NA})
  if(!is.na(nw)&& nw < 1000000000 ){
  term_df = as.data.table(term_df,key = 'id')
    tfidf  = data.table::dcast(term_df, id ~ term, sum,
                               value.var = "tfidf")
  }else{
    for(k in 2:100){
      nr = tryCatch(n_total*as.integer(nrow(idf)/k),warning = function(w){NA})
      if(!is.na(nr)&& nr < 1000000000)break
    }
    k = k*20
    w_cv = cv_split(idf,k = k)
    tfidf = list()
    for(i in 1:k){
      terms = idf[w_cv[[i]],'term']
      idf_term = term_df[term %in% unlist(terms)]
      tfidf[[i]] = idf_term[,c('id','term','tfidf')]
    }

    tfidf =  lapply(tfidf,function(z)data.table::dcast(z,id ~ term, sum,
                                                       value.var = "tfidf"))
    tfidf = Reduce(function(...)merge.data.table(...,by = 'id', all = TRUE),
                   tfidf)
  }
  return(quick_as_df(tfidf))
}

#' @rdname term_tfidf
#' @export
#'
term_idf = function(term_df,n_total = NULL){
  id = N = NULL
  colnames(term_df)[1] = 'id'
  colnames(term_df)[2] = 'term'
  if(is.null(n_total)){
    n_total = length(unique(term_df$id))
  }
  term_df = as.data.table(term_df,key = 'id')
  words_freq = term_df[,.N,'term']
  words_freq[,idf := log(n_total / (N + 1))]
  idf = words_freq[,c('term','idf')]
  return(quick_as_df(idf))
}

#' @rdname term_tfidf
#' @export

term_filter = function(term_df, low_freq = 0.01, stop_words = NULL){
  N = use_rate = term = id = NULL
  colnames(term_df)[1] = 'id'
  colnames(term_df)[2] = 'term'
  term_df = as.data.table(term_df,key = 'id')
  words_freq = term_df[,.N,'term']
  if(is.numeric(low_freq) && low_freq < 1){
    n_total = length(unique(term_df$id))
    words_freq[,use_rate := N / n_total]
    low_freq_words = words_freq[use_rate < low_freq,'term']
  }else{
    if(is.numeric(low_freq) && low_freq>= 1){
      low_freq_words = words_freq[ N <= low_freq,'term']
    }else{
      low_freq_words = words_freq[ N <= 1,'term']
    }
  }

  term_df = term_df[!term_df$term %in% c('',as.character(unlist(low_freq_words)),
                                         as.character(unlist(stop_words)))]
  return(quick_as_df(term_df))
}

Try the creditmodel package in your browser

Any scripts or data that you put into this service are public.

creditmodel documentation built on Jan. 25, 2021, 5:08 p.m.