R/ncpen_util.R
In ncpen: Unified Algorithm for Non-convex Penalized Estimation for Generalized Linear Models

Documented in excluded interact.data make.ncpen.data power.data same.base to.indicators to.ncpen.x.mat

#' @title
#' Check whether column names are derivation of a same base.
#'
#' @description
#' This is internal use only function.
#'
#' @param base.cols vector of base column names.
#' @param a first column to be compared.
#' @param b second column to be compared.
#'
#' @return
#' TRUE if same base, FALSE otherwise.
#'
same.base = function(base.cols, a, b) {
     if(a==b) return(TRUE);
     if(is.null(base.cols)) return (FALSE);

     for(base in base.cols) {
          index = regexpr(base, c(a, b));
          if(index[1] > 0 & index[1] == index[2]) {
               return (TRUE);
          }
     }

     return (FALSE);
}
# same.base("aa", "aa", "aa20")
# same.base(NULL, "aa", "aa")

#' @title
#' Check whether a pair should be excluded from interactions.
#'
#' @description
#' This is internal use only function.
#'
#' @param excluded.pair a pair.
#' @param a first column to be compared.
#' @param b second column to be compared.
#'
#' @return
#' TRUE if excluded, FALSE otherwise.
#'
excluded = function(excluded.pair, a, b) {
     if(is.null(excluded.pair)) return (F);

     for(pair in excluded.pair) {
          str1 = paste(pair[1], pair[2]);
          if(str1 == paste(a, b) | str1 == paste(b, a)) {
               return (T);
          }
     }

     return (F);
}

# el = list(c("a", "b"), c("d", "c"));
#
# excluded(el, "b", "a")
# excluded(el, "b", "a1")
# excluded(el, "b", "d")
# excluded(el, "d", "c")

# base.cols = c("aa", "bb", "cc", "dd", "ee");
# a = "aa.33"
# b = "bb.44"
#
# c = "cc.44"
# d = "ccdde";
# e = "aabcde"
# same.base(base.cols, e, e)


#' @title
#' Construct Indicator Matrix
#'
#' @description
#' \code{to.indicators} converts a categorical variable into a \code{\link{data.frame}}
#' with indicator (0 or 1) variables for each category.
#'
#' @param vec a categorical vector.
#' @param exclude.base \code{FALSE} means to include all the categories. \code{TRUE}
#' means to exclude one category as a base case.
#'  If \code{base} is not specified, a random category will be removed.
#' @param base a base category removed from the indicator matrix. This option works
#'  only when the \code{type} variable is set to \code{"exclude.base"}.
#' @param prefix a prefix to be used for column names of the output matrix.
#' Default is "cat_" if \code{prefix} is \code{NULL}.
#' For example, if a category vector has values of c("aa", "bb", "cc"),
#' column names of the output matrix will be "cat_aa", "cat_bb" and "cat_cc".
#' If \code{vec} is a \code{\link{data.frame}} and \code{prefix} is \code{NULL},
#' then the \code{vec}'s column name followed by "_" will be used as a prefix.
#'
#' @return
#' This returns an object of \code{\link{matrix}} which contains indicators.
#'
#'
#' @examples
#' a1 = 4:10;
#' b1 = c("aa", "bb", "cc");
#'
#' to.indicators(a1, base = 10);
#' to.indicators(b1, base = "bb", prefix = "T_");
#' to.indicators(as.data.frame(b1), base = "bb");
#'
#'
#'
#' @export
to.indicators = function(vec, exclude.base = TRUE, base = NULL, prefix = NULL) {
     # Test block
     # vec = as.data.frame(b1);
     # exclude.base = T;
     # prefix = "cat_";
     # base = NULL;
     ############################

     if(class(vec) == "data.frame") {
          if(is.null(prefix)) {
               prefix = paste(colnames(vec), "_", sep = "");
          }
     } else {
          if(is.null(prefix)) {
               prefix = "cat_";
          }
          vec = as.data.frame(as.factor(vec));
     }
     colnames(vec) = prefix;

     if(exclude.base == TRUE) {
          if(is.null(base)) {
               ret = model.matrix(~ . + 0, data = vec)[, -1];
               # return (model.matrix(~ . + 0, data = vec)[,-1]);
          } else {
               ret = model.matrix(~ . + 0, data = vec)[,];
               rm.col = which(colnames(ret)==paste(prefix, base, sep = ""));
               if(length(rm.col) >0) {
                    ret = ret[, -rm.col];
               }

               # return (ret);
          }
     } else {
          ret = model.matrix(~ . + 0, data = vec)[,];
          # return (model.matrix(~ . + 0, data = vec)[,]);
     }

     # if vec has only two categories, model.matrix return a vector without column names.
     # so convert it to matrixk and give column name
     if(is.vector(ret)) {
          ret = as.data.frame(ret);
          colnames(ret) = prefix;
     }

     return (ret);
}

# a1 = 4:10;
# b1 = c("aa", "bb", "cc");
#
# to.indicators(a1, base = 10);
# to.indicators(b1, base = "bb", prefix = "T_");
# to.indicators(as.data.frame(b1), base = "bb");


#' @title
#' Power Data
#'
#' @description
#' \code{power.data} power data and return a \code{\link{data.frame}} with column names with tail.
#'
#' @param data a \code{\link{data.frame}} or \code{\link{matrix}} object.
#' @param power power.
#' @param tail tail text for column names for powered data. For example, if a column "sales" is powered by 4 (=\code{power})
#' and \code{tail} is "_pow", then the output column name becomes "sales_pow4".
#'
#' @return
#' This returns an object of \code{\link{matrix}}.
#'
#'
#' @examples
#' df = data.frame(a = 1:3, b= 4:6);
#' power.data(df, 2, ".pow");
#'
#'
#' @export
power.data = function(data, power, tail = "_pow") {
     pow.data = data^power;
     colnames(pow.data) = paste(colnames(pow.data), tail, power, sep = "");

     return (pow.data);
}

# df = data.frame(a = 1:3, b= 4:6);
# power.data(df, 2, ".pow");

#temp = power.data(subset(raw.prepay.data, select = c("FICO", "INT_RT")), 2);
#head(temp);

#' @title
#' Construct Interaction Matrix
#'
#' @description
#' \code{interact.data} interacts all the data in a \code{\link{data.frame}} or \code{\link{matrix}}.
#'
#' @param data a \code{\link{data.frame}} or \code{\link{matrix}} to interact.
#' @param base.cols indicates columns from one category.
#' Interactions among variables from a same base.col will be avoided. For example, if three indicator columns,
#' "ChannelR", "ChannelC" and "ChannelB", are created from a categorical column "Channel", then the interaction among them
#' can be excluded by assigning \code{base.cols=c("Channel")}. Multiple \code{base.cols} are possible.
#' @param exclude.pair the pairs will be excluded from interactions. This should be a \code{\link{list}} object of pairs.
#' For example, \code{list(c("a1", "a2"), c("d1", "d2"))}.
#'
#' @return
#' This returns an object of \code{\link{matrix}} which contains interactions.
#'
#'
#' @examples
#' df = data.frame(1:3, 4:6, 7:9, 10:12, 13:15);
#' colnames(df) = c("aa", "bb", "cc", "dd", "aa2");
#' df
#'
#' interact.data(df);
#' interact.data(df, base.cols = "aa");
#' interact.data(df, base.cols = "aa", exclude.pair = list(c("bb", "cc")));
#'
#'
#'
#' @export
interact.data = function(data, base.cols = NULL, exclude.pair = NULL) {
     data.fields = colnames(data);
     #    data = prepay.data;
     # if(is.null(base.cols)) {
     #      base.cols = data.fields;
     # }
     #base.cols = c("a", "b", "c");
     #data.fields = c("a", "b", "c", "d", "a_2", "b_2", "c_2");
     n.fields = length(data.fields);
     # construct formula string
     interact.str = "~";

     for(i in 1:(n.fields-1)) {
          for(j in (i+1):n.fields) {
               lfield = data.fields[i];
               rfield = data.fields[j];
               if(!same.base(base.cols, lfield, rfield)
                  & !excluded(exclude.pair, lfield, rfield)) {
                    interact.str = paste(interact.str, paste(lfield, ":", rfield, sep = ""), "+", sep = " ");

               }
          }
     }

     interact.str = substr(interact.str, 1, nchar(interact.str)-2); # truncate the last "+".
     # print(interact.str);
     return (model.matrix(formula(interact.str), data = as.data.frame(data))[,-1]);
}

#' @title
#' Convert a \code{\link{data.frame}} to a \code{ncpen} usable \code{\link{matrix}}.
#'
#' @description
#' This automates the processes of \code{\link{to.indicators}} and \code{\link{interact.data}}.
#' First, it converts categorical variables to a series of indicators.
#' All other numerical and logical variables are preserved.
#' Then, if \code{interact.all == TRUE}, all the variables are interacted.
#'
#' @param df a \code{\link{data.frame}} which includes numerical, logical and categorical columns.
#' @param base a base category removed from the indicator variables. This \code{base} will work as the
#' base case for all the categorical variables.
#' @param interact.all indicates whether to interact all the columns (\code{TRUE}) or not (\code{FALSE}).
#' @param base.cols indicates columns derived from a same column. For example, if \code{age_sq} is \code{age^2},
#' then \code{"age"} is a base column. Categorical columns will be automatically considered as base columns.
#' @param exclude.pair the pairs will be excluded from interactions. This should be a \code{\link{list}} object of pairs.
#' For example, \code{list(c("a1", "a2"), c("d1", "d2"))}.
#'
#' @return
#' This returns an object of \code{\link{matrix}}.
#'
#'
#' @examples
#' df = data.frame(num = c(1, 2, 3, 4, 5),
#'                 ctr = c("K", "O", "R", "R", "K"),
#'                 logi = c(TRUE, TRUE, FALSE, FALSE, TRUE),
#'                 age = c(10, 20, 30, 40, 50),
#'                 age_sq = c(10, 20, 30, 40, 50)^2,
#'                 loc = c("b", "a", "c", "a", "b"),
#'                 FTHB = c(1,0,1,0,1),
#'                 PRM  = c(0,1,0,1,0),
#'                 PMI  = c(1,1,0,0,0));
#'
#' to.ncpen.x.mat(df, interact.all = TRUE,
#'    base.cols = c("age"),
#'    exclude.pair = list(c("FTHB", "PRM")));
#'
#'
#'
#' @export
to.ncpen.x.mat = function(df, base = NULL, interact.all = FALSE, base.cols = NULL, exclude.pair  = NULL) {
     # prepay.data = read.csv(file = "https://raw.githubusercontent.com/zeemkr/data/master/mtg_term_2011_2012.csv");
     # head(prepay.data);
     # df = prepay.data[, 3:ncol(prepay.data)];
     # head(df);
     # df$logic = T;
     # interract.all = T;
     # base.cols = NULL; # c("CHANNEL", "loan_age");
     # exclude.pair = list(c("FTHB", "Purchase"), c("FTHB", "Primary"));

     raw.colnames = colnames(df);
     buff = NULL;
     for(i in 1:ncol(df)) {
          # i = 16;
          if(is.numeric(df[,i]) | is.logical(df[,i])) {
               buff = cbind(buff, df[,i]);
               colnames(buff)[ncol(buff)] = raw.colnames[i];
          } else if(is.factor(df[,i]) | is.character(df[,i]) ) {
               idcs = to.indicators(df[, i], exclude.base = TRUE, base = base, prefix = paste(raw.colnames[i], "_", sep = ""));
               if(is.vector(idcs)) {

               }
               buff = cbind(buff, idcs);
               base.cols = c(base.cols, raw.colnames[i]);
          }
     }

     # remove duplicated base cols.
     base.cols = base.cols[!duplicated(base.cols)];

     if(interact.all == TRUE) {
          buff = cbind(buff, interact.data(buff, base.cols, exclude.pair));
     }

     return (as.matrix(buff));
}


#' @title
#' Create ncpen Data Structure Using a Formula
#'
#'
#' @description
#' This function creates ncpen y \code{vector} and x \code{matrix} from data using formula.
#'
#' @param formula (formula) regression formula. Intercept will not be created.
#' @param data (numeric matrix or data.frame) contains both y and X.
#'
#'
#' @return List of y vector and x matrix.
#'   \item{y.vec}{y \code{vector}}
#'   \item{x.mat}{x \code{matrix}}
#'
#' @author Dongshin Kim, Sunghoon Kwon, Sangin Lee
#'
#'
#' @examples
#' data = data.frame(y = 1:5, x1 = 6:10, x2 = 11:15);
#' formula = log(y) ~ log(x1) + x2;
#' make.ncpen.data(formula, data);
#'
#' @export
make.ncpen.data = function(formula, data) {
     # compse y.vec and x.mat ---------------------------------
     # formula = log(y) ~ log(x1) + x2;
     # formula = y ~ .;
     # data = data.frame(y = 1:5, x1 = 6:10, x2 = 11:15);

     data = data[, all.vars(formula)];
     complete.ids = complete.cases(data);
     if(sum(!complete.ids) >0 ) {
          data = data[complete.ids, ];
          warning(paste(sum(!complete.ids), " observations deleted due to missingness", sep = ""));
     }

     if(!is.data.frame(data)) {
          data = data.frame(data);
     }

     formula.str = as.character(formula);
     y.var = as.formula(paste("~", formula.str[2]));
     y.vec = model.matrix(y.var, data);
     y.vec = y.vec[, 2]; # exclude intercept

     x.var = as.formula(paste(formula.str[2], "~", formula.str[3]));
     x.mat = model.matrix(x.var, data);
     x.mat = x.mat[, 2:ncol(x.mat)]; # exclude intercept
     #---------------------------------------------------------

     return(list(y.vec = y.vec, x.mat = x.mat));
}

# df = data.frame(1:3, 4:6, 7:9, 10:12, 13:15);
# colnames(df) = c("aa", "bb", "cc", "dd", "aa2");
# df
#
# interact.data(df);
# interact.data(df, base.cols = "aa");
# interact.data(df, base.cols = "aa", exclude.pair = list(c("bb", "cc")));
#

# Not made avaialble yet -----------------------------------------------
# step.data = function(vec, by = 0.50) {
#      # "st.min_(min + by) is reference
#      #vec = ncpen.data$int_spread;
#      #by = 0.5
#
#      field.name = names(vec)[1];
#      #field.name = "aa"
#      min.val = min(vec);
#      max.val = max(vec);
#
#      l.bound = min.val + by;
#      u.bound = l.bound + by;
#
#      step.var.str = sprintf("%s.%.2f_%.2f", field.name, l.bound, u.bound);
#      step.var.str= gsub("-", "n", step.var.str); # - sign to n
#
#      ret = as.data.frame(vec);
#      ret[, step.var.str] = 1*(l.bound <= ret[,1] & ret[,1] < u.bound);
#      #head(ret);
#      while(T) {
#           l.bound = u.bound;
#           if(l.bound > max.val) {
#                break;
#           }
#           u.bound = l.bound+by;
#
#           step.var.str = sprintf("%s.%.2f_%.2f", field.name, l.bound, u.bound);
#           step.var.str= gsub("-", "n", step.var.str); # - sign to n
#           ret[, step.var.str] = 1*(l.bound <= ret[,1] & ret[,1] < u.bound);
#      }
#
#      return (ret[,-1]);
# }
# # step.data(subset(raw.prepay.data, select = c("int_spread")), by = 0.5);
#-----------------------------------------------------------------------

Any scripts or data that you put into this service are public.

ncpen documentation built on May 1, 2019, 9:21 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

ncpen
Unified Algorithm for Non-convex Penalized Estimation for Generalized Linear Models

R/ncpen_util.R
In ncpen: Unified Algorithm for Non-convex Penalized Estimation for Generalized Linear Models

Defines functions same.base excluded to.indicators power.data interact.data to.ncpen.x.mat make.ncpen.data

Documented in excluded interact.data make.ncpen.data power.data same.base to.indicators to.ncpen.x.mat

Try the ncpen package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

ncpen Unified Algorithm for Non-convex Penalized Estimation for Generalized Linear Models

R/ncpen_util.R In ncpen: Unified Algorithm for Non-convex Penalized Estimation for Generalized Linear Models

Defines functions same.base excluded to.indicators power.data interact.data to.ncpen.x.mat make.ncpen.data

Documented in excluded interact.data make.ncpen.data power.data same.base to.indicators to.ncpen.x.mat

Try the ncpen package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

ncpen
Unified Algorithm for Non-convex Penalized Estimation for Generalized Linear Models

R/ncpen_util.R
In ncpen: Unified Algorithm for Non-convex Penalized Estimation for Generalized Linear Models