R/utility_funcs.r

#' Get a logical index of the elements of x that match any of the given labels
#' @param x vector of labels
#' @param labels reference labels to compare against
#' @return logical vector, TRUE where the corresponding element of x is in labels
#' e.g. x <- c("t", "b", "t", "v", "b"); labels <- c("t", "b")
#' get_subset_idx(x, labels) # TRUE TRUE TRUE FALSE TRUE
#' @export
get_subset_idx <- function(x, labels) {
  !is.na(match(x, labels))
}


#' Add n_centroids RBF features to X
#'
#' @param X covariate matrix
#' @param s median pairwise distance of points in X
#' @param n_centroids number of RBF features to add
#' @param Xi matrix of reference points (centroids) to calculate the RBFs with respect to
#' @return augmented covariate matrix with columns rbf1, ..., rbf<n_centroids> appended
#' @export
add_rbf_features <- function(X, s, n_centroids, Xi=NULL) {
    set.seed(1234)  # fixed seed so repeated calls select the same centroids
    if (is.null(Xi)) {
        rbf_centroids <- get_rbf_centroids(X, n_centroids)
        Xi <- rbf_centroids$xi
    }
    for (i in 1:n_centroids) {
        label <- sprintf("rbf%i", i)
        X[, label] <- rbf_feature(X, s, xi=Xi[i,])
    }
    return(X)
}
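
# Illustrative usage sketch (names X_demo, s_demo, X_aug are hypothetical):
# estimate the bandwidth s with avg_median_pairwise_distance (defined below),
# then append three RBF columns to a toy data frame.
# X_demo <- data.frame(a = rnorm(100), b = rnorm(100))
# s_demo <- avg_median_pairwise_distance(X_demo)
# X_aug <- add_rbf_features(X_demo, s_demo, n_centroids = 3)
# colnames(X_aug)  # "a" "b" "rbf1" "rbf2" "rbf3"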

#' Get reference points for RBF centroids
#'
#' @param X covariate matrix
#' @param n_centroids number of RBF centroids
#' @param idx [Optional] location of RBF centroid reference points
#' @return list of Xi (centroid points) and idx (location of them)
#' @export
get_rbf_centroids <- function(X, n_centroids, idx=NULL) {
    n <- nrow(X)
    d <- ncol(X)
    if (is.null(idx)) {
        idx <- sample(1:n, n_centroids)
    }
    Xi <- matrix(NA, nrow=n_centroids, ncol=d)
    for (i in 1:n_centroids) {
        Xi[i, ] <- unlist(X[idx[i],])
    }
    return(list("xi"=Xi, "idx"=idx))
}

#' Compute a single RBF feature with respect to one reference centroid,
#' given either by its row index in X (idx) or explicitly as a point (xi)
#'
#' @param X covariate matrix
#' @param s median pairwise distance of points in X
#' @param idx [Optional] row index of the reference centroid
#' @param xi [Optional] reference centroid point
#' @return vector exp(-d(x, xi)/(2s)) per row x of X, where d is the squared Euclidean distance
#' @export
rbf_feature <- function(X, s, idx=NULL, xi=NULL) {
    n <- nrow(X)
    if (is.null(idx)) {
        set.seed(1234)
        idx <- sample(1:n, 1)
    }
    if (is.null(xi)) {
        xi <- X[idx, ]
    }
    Xi <- matrix(xi, nrow=n, ncol=length(xi), byrow=TRUE)
    Xi <- apply(Xi, 2, unlist)
    rbf <- exp(-pairwise_distance(X, Xi)/(2 * s))
    return(rbf)
}

#' Calculate the average of the median pairwise distances between rows of X
#' and two cyclic permutations of X
#'
#' @param X covariate matrix
#' @return s median pairwise distance
#' @export
avg_median_pairwise_distance <- function(X) {
    n <- nrow(X)
    X_perm <- X
    s <- rep(0,2)
    for (i in 1:2) {
        X_perm <- permute_matrix(X_perm)
        s[i] <- median(pairwise_distance(X, X_perm))
    }
    mean(s)
}

#' Calculate the squared Euclidean distance between corresponding rows of X0 and X1
#'
#' @param X0 covariate matrix
#' @param X1 covariate matrix of the same dimensions
#' @return vector of row-wise squared distances
#' @export
pairwise_distance <- function(X0, X1) {
    rowSums((X0 - X1)^2, na.rm=TRUE)
}
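
# Quick illustrative check: the "distance" here is the squared Euclidean
# distance between corresponding rows, not a full pairwise distance matrix.
# A <- matrix(c(0, 0, 1, 1), nrow = 2, byrow = TRUE)
# B <- matrix(c(3, 4, 1, 1), nrow = 2, byrow = TRUE)
# pairwise_distance(A, B)  # 25 0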

#' Cyclic permutation of rows of X by r rows
#'
#' @param X covariate matrix
#' @param r number of rows to rotate by (default 1): the last r rows move to the top
#' @export
permute_matrix <- function(X, r=1) {
    n <- nrow(X)
    X_perm <- rbind(X[(n-r+1):n,], X[1:(n-r),])
    return(X_perm)
}
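
# Illustrative sketch: the cyclic rotation moves the last r rows to the top.
# M <- matrix(1:6, nrow = 3, byrow = TRUE)
# permute_matrix(M)         # rows reordered to 3, 1, 2
# permute_matrix(M, r = 2)  # rows reordered to 2, 3, 1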

#' Polynomial transform
#'
#' Augment the columns of X with element-wise powers up to degree b
#'
#' @param X matrix of covariates
#' @param b degree of polynomial
#' @return augmented matrix of covariates
#' @export
poly_transform <- function(X, b=2){
    if (b == 1) return(X)
    orig_cols <- colnames(X)
    for (i in 2:b) {
        Xb <- apply(X[, orig_cols], 2, function(col) col^i)
        colnames(Xb) <- paste0(colnames(Xb), "^", i)
        X <- cbind(X, Xb)
    }

    #truncate particularly large values that might overflow
    X[abs(X) > 1e6] <- 1e6
    return(X)
}
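
# Illustrative sketch (Xp is hypothetical): a degree-3 expansion of a
# two-column matrix appends squared and cubed copies of each original column.
# Xp <- matrix(1:4, nrow = 2, dimnames = list(NULL, c("u", "v")))
# poly_transform(Xp, b = 3)  # columns: u, v, u^2, v^2, u^3, v^3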

#' Scale features
#'
#' Scales the features of a matrix with reference to another matrix,
#' useful because you can normalise X_train and apply the same transformation to X_test.
#' Use with caution if either dataset contains extreme outliers.
#'
#' @param X matrix of covariates.
#' @param ref matrix of covariates from which to calculate mu and sd.
#' @param na.rm a logical to indicate if NAs should be stripped in mean and standard deviation computations.
#' @param add.intercept a logical to indicate if column of 1s should be added to the output (Intercept)
#' @return augmented matrix of covariates, standardized and an intercept column
#' @export
scale_dat <- function(X, ref, na.rm=FALSE, add.intercept=TRUE){
  if(ncol(X) != ncol(ref)) stop('Two inputs must have the same number of columns')

  #calculate column means and sds of ref (NAs stripped if na.rm=TRUE)
  mu <- colMeans(ref, na.rm=na.rm)
  sd <- apply(ref, 2, function(x) sd(x, na.rm=na.rm))

  #standardise each column of X using the statistics of ref
  for(i in 1:ncol(ref)){
    X[,i] <- (X[,i] - mu[i]) / sd[i]
  }

  #also add column of 1s called intercept
  if (add.intercept) {
    Intercept <- rep(1, nrow(X))
    X <- cbind(Intercept, X)
  }

  return(X)
}
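
# Illustrative sketch (X_train/X_test are hypothetical): standardise the
# training matrix against itself, then apply the same mu and sd to the test
# matrix so both live on the same scale.
# X_train <- matrix(rnorm(20, mean = 5), ncol = 2,
#                   dimnames = list(NULL, c("f1", "f2")))
# X_test <- matrix(rnorm(10, mean = 5), ncol = 2,
#                  dimnames = list(NULL, c("f1", "f2")))
# X_train_s <- scale_dat(X_train, X_train)
# X_test_s <- scale_dat(X_test, X_train)  # reuses the training statistics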

#' Find the names of columns that are constant (e.g. all 1, -999, NA etc.)
#' @param X matrix of covariates
#' @return vector of constant column names, excluding any Intercept column
#' @export
get_const_features <- function(X) {
    col_sd <- apply(X, 2, sd)
    cols <- col_sd == 0 | is.na(col_sd)
    return(setdiff(colnames(X)[cols], "Intercept"))
}
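
# Illustrative sketch: a zero-variance column is flagged while the
# Intercept column is deliberately excluded from the result.
# Xc <- cbind(Intercept = 1, x1 = rnorm(5), junk = rep(-999, 5))
# get_const_features(Xc)  # "junk"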

#' Partition data into (random) folds for cross-validation.
#'
#' @param n number of rows
#' @param k number of folds
#' @param random flag to choose whether to randomly select
#' @return ind vector of integers in 1:k denoting the out-of-sample (OOS) fold
#' each row belongs to, i.e. rows with ind == i are OOS for fold i
#' @export
partition_data <- function(n, k, random=FALSE){
    #minimum size of the group
    size <- floor(n/k)

    #number of remaining points after even division
    remainder <- n %% k

    #repeat 1:k size times, then append 1:remainder (makes a vector of length n)
    ind <- c(rep(1:k, size), seq_len(remainder))

    #if requested, shuffle the index
    if (random) {
        ind <- sample(ind)
    }
    return(ind)
}
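
# Illustrative sketch: assign 10 rows to 3 folds; fold sizes differ by at
# most one, and random = TRUE shuffles the assignment.
# partition_data(10, 3)                        # 1 2 3 1 2 3 1 2 3 1
# table(partition_data(10, 3, random = TRUE))  # counts per fold: 4 3 3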

#' Helper to get the index of the model built on folds {l != k} for jet group j in {1,2,3}.
#' Models are indexed by jet group and then fold, i.e. with K=2 folds the first two
#' indices are (j=1, k=1), (j=1, k=2), followed by (j=2, k=1), etc.
#' @param j this jet group
#' @param k this fold
#' @param K number of folds
#' @export
get_model_idx <- function(j, k, K) {
    K*(j-1) + k
}

#' Performs the inverse procedure to get_model_idx, s.t. if idx <- get_model_idx(j, k, K)
#' then inv_model_idx(idx, K) returns c(j, k)
#' @param idx model index
#' @param K number of folds
#' @return numeric pair of j and k
#' @export
inv_model_idx <- function(idx, K) {
    # indexing loops internally over k (fold) and externally over j (jet group)
    j <-  ceiling(idx/K)
    k <- idx - K*(j-1)
    return(c(j, k))
}
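
# Illustrative round trip: with K = 2 folds, model indices run over folds
# within each jet group, and inv_model_idx recovers the (j, k) pair.
# get_model_idx(j = 2, k = 1, K = 2)  # 3
# inv_model_idx(3, K = 2)             # 2 1, i.e. j = 2, k = 1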

#' Get the positions of the columns in header that aren't listed in features_to_rm for jet group j
#' @param header vector of column names, e.g. names(X)
#' @param features_to_rm list, indexed by jet group, of feature names to remove
#' @param j jet group
#' @return integer indices of the columns to retain
#' @export
get_valid_cols <- function(header, features_to_rm, j) {
    match(setdiff(header, features_to_rm[[j]]), header)
}
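
# Illustrative sketch (header and features_to_rm are hypothetical): drop
# the features listed for jet group j = 1 and return the surviving columns.
# header <- c("x1", "x2", "x3")
# features_to_rm <- list(c("x2"), character(0), c("x1", "x3"))
# get_valid_cols(header, features_to_rm, j = 1)  # 1 3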

#' Calculate logistic function
#' @param x numeric scalar or vector
#' @return 1/(1 + exp(-x))
#' @export
logisticf <- function(x) {
    1/(1 + exp(-x))
}

#' Calculate logit function
#' @param p numeric in (0,1); values of exactly 0 or 1 give -Inf/Inf
#' @return log(p/(1-p))
#' @export
logit <- function(p) {
    log(p/(1-p))
}

#' Thresholding function
#'
#' @param p vector of probabilities
#' @param thresh threshold over which we assign output of 1
#' @export
decide <- function(p, thresh = 0.5) {
    label <- as.numeric(p > thresh)
    return(label)
}
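
# Illustrative sketch: map linear scores to probabilities with logisticf,
# then to hard 0/1 labels with decide.
# scores <- c(-2, 0, 2)
# p <- logisticf(scores)   # 0.119 0.500 0.881
# decide(p)                # 0 0 1
# decide(p, thresh = 0.4)  # 0 1 1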

#' #' Calculate AMS metric
#' #'
#' #' \eqn{s = \sum_{i \in TP} w_i}
#' #' \eqn{b = \sum_{i \in FP} w_i};
#' #' i.e. s is the sum of the weights of successful signal classifications (TP)
#' #' and b is the sum of the weights of incorrect signal classifications (FP)
#' #' @param s count of true positives
#' #' @param b count of false positives
#' ams_metric <- function(s, b) {
#'     br <- 10  # constant regularisation term added to b
#'     sqrt(2 * ((s + b + br) * log(1 + s/(b + br)) - s))
#' }
#'
#' #' Count the number of true positives
#' #' need to make sure dims of y, y_hat and w are the same
#' #' @param y response vector
#' #' @param y_hat predicted response vector
#' #' @param w weights
#' count_s <- function(y, y_hat, w) {
#'     stopifnot(length(y) == length(y_hat))
#'     stopifnot(length(y) == length(w))
#'     s <- sum(w[y == 1 & y_hat == 1])
#'     return(s)
#' }
#'
#' #' Count the number of false positives
#' #' need to make sure dims of y, y_hat and w are the same
#' #' @param y response vector
#' #' @param y_hat predicted response vector
#' #' @param w weights
#' count_b <- function(y, y_hat, w) {
#'     stopifnot(length(y) == length(y_hat))
#'     stopifnot(length(y) == length(w))
#'     b <- sum(w[y != 1 & y_hat == 1])
#'     return(b)
#' }
#'
#' #' Calculate AMS metric
#' #'
#' #' @param y response vector
#' #' @param y_hat predicted response vector
#' #' @param w weights of samples in y
#' #' @param sum_w total sum of weights for re-normalisation.
#' #' If sum_w is not provided the calculation assumes weights have already been rescaled.
#' #'
#' #' @return ams
#' #' @export
#' calculate_ams_partition <- function(y, y_hat, w, sum_w=NULL) {
#'   if (!is.null(sum_w)) {
#'     w <- w * (sum_w/sum(w))
#'   }
#'   y_hat <- as.numeric(y_hat)
#'   s <- count_s(y, y_hat, w)
#'   b <- count_b(y, y_hat, w)
#'   ams <- ams_metric(s, b)
#'   return(ams)
#' }