glmSTIR: Nearest-neighbor Projected-Distance Regression

Documented in nearestNeighbors2 npdrDistances2 npdrLearner npdrLearnerCV

# toc
# These methods require flexclust dist2:
# npdrLearnerCV -- cv tune hyperparameters with knn classifer
# npdrLearner --  knn classifier

# ==============================================================#
#' \code{npdrLearnerCV}
#'
#' Tune a hyperparmeter that maximizes the cross-validation accuracy of a k
#' -nearest-neighbors classifier. You can tune k, but keep in mind that the 
#' resulting k might be underestimated because the training sample size is
#' smaller than the original sample size. When other hyperparameters are 
#' optimized, k is fixed to the npdr theoretical value that adapts to the 
#' training size (todo: make more flexible with knn alpha). You can tune the 
#' number of ICA or PCA components as the components are used as the space for 
#' calculating nearest neighbors. todo: create function interface that allows 
#' user to create their own sapply_hyper_fn.
#' @param x (m+1) x p dataframe of m instances, 1 class column and p attributes
#' @param label column label for class \code{"class"}
#' @param tune_grid vector of hyperparameter values to test for best 
#' classification accuracy
#' @param dist_metric for distance matrix between instances
#' (default: \code{"manhattan"}, others include \code{"euclidean"},
#' and for GWAS \code{"allele-sharing-manhattan"}).
#' @param tune_type type of hyperparmater to optimize. default: \code{"knn"}, 
#' others include \code{"ica"} (number of ica components for ica space 
#' transformation, and \code{"pca"} (number of components for PCA transformation.
#' @param num_folds number of cross-validation folds for tuning
#' @return list containing best hyperparameter (best_param), its highest 
#' accuracy (best_acc), and a table of fold and parameter accuracies (cv_table)
#' @examples
#' library(flexclust) # need for npdrLearner knn classifier
#' library(fastICA)   # need if tuning ica tansformation
#' cv.out <- npdrLearnerCV(x=dats, label="class", 
#'               tune_grid = seq(20,90,5),   # tuning knn
#'               dist_metric = "manhattan",
#'               tune_type = "knn",
#'               num_folds=5, verbose=T)
#' cv.out$best_param
#' plot(cv.out$cv_table$hyp,cv.out$cv_table$means,
#'         xlab="hyperparameter", ylab="accuracy", 
#'         main="CV hyperparameter tuning", type="l")
#' text(cv.out$best_param,cv.out$best_acc,paste("max.loc =",cv.out$best_param))
#' Or you can tune number of knns 
#' cv.out <- npdrLearnerCV(x=dats, label="class", 
#'                      tune_grid = seq(20,90,5),   # tuning knn
#'                        dist_metric = "manhattan",
#'                        tune_type = "knn",
#'                        num_folds=5, verbose=T)
#' @export 
npdrLearnerCV <- function(x, label="class", 
                          tune_grid = seq(10,90,10), #knn
                          dist_metric = "manhattan",
                          tune_type = "knn",
                          num_folds = 5, verbose=F) 
{
  # Cross-Validtation Hyperparameter optimization 
  # by knn classification accuracy
  class_idx <- which(colnames(x)==label)
  folds <- caret::createFolds(x[,class_idx], k = num_folds) 
  ### switch the thing we want to tune (tune_type)
  sapply_hyper_fn = switch(tune_type,
                           knn = function(hyper.param,te.idx) {
                             # tune_type = "knn"
                             # ex. tune_grid = seq(10,90,10)
                             # values should be less than num samples-1
                             full_dat <- x[,-class_idx]
                             full_dat$class <- as.factor(x[,class_idx]) 
                             test_results <- npdrLearner(train.outcome="class", 
                                                      train.data=full_dat[-te.idx,], 
                                                      test.outcome="class", 
                                                      test.data=full_dat[te.idx,],
                                                      nbd.method = "relieff", 
                                                      nbd.metric = dist_metric, 
                                                      msurf.sd.frac = 0.5, 
                                                      knn=hyper.param) 
                             return(test_results$accuracy) },
                           ica = function(hyper.param,te.idx) { 
                             #   tune_type = "ica"
                             # ex.tune_grid = seq(10,80,10) # num of ICs
                             # values should be less than number of variables
                             # compute ICs for all samples for given n.comp
                             ICs <- fastICA(x[,-class_idx], 
                                            n.comp=hyper.param, fun="exp",
                                            method = "C", verbose=F,
                                            maxit=100000,tol=0.00000001)
                             IC.src <- data.frame(ICs$S)
                   IC.src$class <- as.factor(x[,class_idx])  # add class back
                             m.samp <- nrow(IC.src)  # full sample size
                             # use training sample size for theoretical knn
                             # b/c knn's calculated in training
                             m.minority <- min(table(IC.src$class))
                             # use training sample size for theoretical knn
                             # b/c knn's calculated in training
           m.train <- m.minority-m.minority/num_folds # num train samp, 2 fold
                             k.train <- npdr::knnSURF(m.train - 1, 0.5)
                             test_results <- npdrLearner(
                                              train.outcome="class", 
                                              train.data=IC.src[-te.idx,], 
                                              test.outcome="class", 
                                              test.data=IC.src[te.idx,],
                                              nbd.method = "relieff", 
                                              nbd.metric = dist_metric, 
                                              msurf.sd.frac = 0.5, 
                                              knn=k.train) 
                             return(test_results$accuracy) },
                           pca = function(hyper.param,te.idx) { 
                             #   tune_type = "pca"
                             # ex.tune_grid = seq(5,50,5) # num of PCs
                             # values should be less than number of variables
                             # compute ICs for all samples for given n.comp
                             PCs<-prcomp(x[,-class_idx])
                             topPCs <- data.frame(PCs$x[,1:hyper.param]) 
                  topPCs$class <- as.factor(x[,class_idx])  # add class back
                             m.samp <- nrow(topPCs)  # full sample size
                             # use training sample size for theoretical knn
                             # b/c knn's calculated in training
                             m.train <- m.samp-m.samp/num_folds # num train samp
                             k.train <- npdr::knnSURF(m.train - 1, 0.5)
                             test_results <- npdrLearner(train.outcome="class", 
                                                         train.data=topPCs[-te.idx,], 
                                                         test.outcome="class", 
                                                         test.data=topPCs[te.idx,],
                                                         nbd.method = "relieff", 
                                                         nbd.metric = dist_metric, 
                                                         msurf.sd.frac = 0.5, 
                                                         knn=k.train) 
                             return(test_results$accuracy) }
  ) # end switch
  #### Begin cross validation
  ## Iterate over cv folds
  cv.results <- list()
  for (fold.id in seq(1,num_folds)){
    te.idx <- folds[[fold.id]]
    if (verbose){cat("fold", fold.id, "of",num_folds,"\n")}
    # add tune-alpha and tune-pca
    if(verbose){cat("\t inner loop over hyperparameters...\n")}
    # iterate over hyperparameter
    scores <- sapply(tune_grid,        # hyp loop var
                     function(hyp){sapply_hyper_fn(hyp,te.idx=te.idx)}
    )  # end sapply hyp loop over hyperparameters
    cv.results[[fold.id]] <- scores  # scores vector
  } # end for folds loop
  cv.results <- data.frame(cv.results)  # turn list to df
  cv.results$means <- rowMeans(as.matrix(cv.results))
  cv.results$hyp <- tune_grid
  colnames(cv.results) <- c(names(folds),"means","hyp")
  #### Select best performance
  best.idx <- which.max(cv.results$means)  # accuracy
  
  return(list(
    best_param = tune_grid[best.idx],
    best_acc = cv.results$means[best.idx],
    cv_table = cv.results
  ))
} # end npdrLearnerCV

# =========================================================================#
#' npdrLearner
#'
#' Uses npdr neighborhoods to learn a nearest neighbor classification or
#' regression model (latter not implemented but easy).
#' Finds the nearest neighbors of test instances to a training dataset.
#' Uses majority class of training neighbors for test prediction.
#' Allows adaptive Relief neighborhoods or specify k.
#' Regression would simply use the average value of the neighbor phenotypes.
#' Uses functions npdrDistances2 and nearestNeighbors2.
#'
#' @param train.outcome character name or length-m numeric outcome vector for
#' train data
#' @param train.data m x p matrix of m instances and p attributes of train data.
#' May also include outcome vector but then outcome should be a name.
#' Include attr names as colnames.
#' @param test.outcome character name or length-m numeric outcome vector of test data
#' @param test.data m x p matrix of m instances and p attributes for test data.
#' May also include outcome vector but then outcome should be a name.
#' Include attr names as colnames.
#' @param nbd.method neighborhood method: `multisurf` or `surf` (no k) or
#' `relieff` (specify k). Used by nearestNeighbors2().
#' @param nbd.metric used in npdrDistances2 for distance matrix between
#' instances, default: `manhattan` (numeric). Used by nearestNeighbors2().
#' @param knn number of constant nearest hits/misses for `relieff` (fixed-k).
#' Used by nearestNeighbors2().
#' The default knn=0 means use the expected SURF theoretical `k` with
#' `msurf.sd.frac` (0.5 by default)
#' @param msurf.sd.frac multiplier of the standard deviation from the mean
#' distances; subtracted from mean for SURF or multiSURF.
#' The multiSURF default is `msurf.sd.frac=0.5`: mean - sd/2.
#' Used by nearestNeighbors2().
#' @param dopar.nn whether or not neighborhood is computed in parallel,
#' default as FALSE.
#' @return  list: neighborhoods for each test instance, prediction for each
#' test instance, accuracy on test set
#'
#' @examples
#' test.results <- npdrLearner(
#'   train.outcome = "class", train.data = case.control.3sets$train,
#'   test.outcome = "class", test.data = case.control.3sets$validation,
#'   nbd.method = "relieff", nbd.metric = "manhattan",
#'   dopar.nn = FALSE, knn = 0
#' )
#' test.results$accuracy
#' @export
npdrLearner <- function(train.outcome, train.data, test.outcome, test.data,
                        nbd.method = "relieff", nbd.metric = "manhattan",
                        msurf.sd.frac = 0.5, dopar.nn = FALSE, knn = 0) {
  if (length(train.outcome) == 1) {
    # e.g., outcome="class" or outcome=101 (pheno col index) and dataset is data.frame including outcome variable
  train.pheno.original <- as.factor(train.data[, train.outcome, drop = TRUE]) 
    org_train_levels <- levels(train.pheno.original) 
    if (any(is.na(suppressWarnings(as.numeric(org_train_levels))))){ 
                                                             #///num levels
      # if class levels are strings that do not map to integers
      # change them to 1, 2, ...
      train.pheno <- train.pheno.original
      for (i in seq(1,length(org_train_levels),1)){ # make levels 1, 2, ...
        levels(train.pheno)[levels(train.pheno)==org_train_levels[i]] = i
      }
      train.data <- train.data %>% select(-train.outcome) 
    } else{ # if levels already numeric-looking strings
        train.pheno <- train.pheno.original %>%
        as.character() %>%
        as.numeric() # get phenotype
      train.data <- train.data %>% select(-train.outcome) 
      # outcome = "qtrait" or 101
    }                                                   #///num levels
  } else { # user specifies a separate phenotype vector
    train.pheno <- train.outcome  # still needs to be numeric levels
    # add   #///num levels
    # assumes user provides a separate outcome data vector
    # train.data <- train.data 
    # assumes dataset only contains attributes/predictors
  }
  if (length(test.outcome) == 1) {
    # e.g., outcome="class" or outcome=101 (pheno col index) and dataset is data.frame including outcome variable
    test.pheno.original <- as.factor(test.data[, test.outcome, drop = TRUE]) 
    org_test_levels <- levels(test.pheno.original) 
    #original_levels <- levels(test.pheno.original)  
    if (any(is.na(suppressWarnings(as.numeric(org_test_levels))))){       
                                                            #///num levels
      # if class levels are strings that do not map to integers
      # change them to 1, 2, ...
      test.pheno <- test.pheno.original
      for (i in seq(1,length(org_test_levels),1)){ # make levels 1, 2, ...
        levels(test.pheno)[levels(test.pheno)==org_test_levels[i]] = i
      }
      test.data <- test.data %>% select(-test.outcome) 
    } else{ # if already numeric-looking strings
      test.pheno <- test.pheno.original %>%
        as.character() %>%
        as.numeric() # get phenotype
      test.data <- test.data %>% select(-test.outcome) 
      # outcome = "qtrait" or 101
    }                                                   #///num levels
  } else { # user specifies a separate phenotype vector
    test.pheno <- test.outcome 
    # assume users provides a separate outcome data vector
    # test.data <- test.data # assumes dataset only contains predictors
  }
  # get ids of nearest training instances for each test instance
  # number of elements in test.neighbors list equal to test.data sample size
  # each element has nearest neighbors from train.data
  test.neighbors <- nearestNeighbors2(train.data, test.data,
    nbd.method = nbd.method,
    nbd.metric = nbd.metric,
    sd.vec = NULL, sd.frac = msurf.sd.frac,
    k = knn, dopar.nn = dopar.nn
  )
  # predict test instances based on the majority class of nearest training neighbors
  test.predict <- sapply(
    1:length(test.neighbors),
    function(i) {
      table(train.pheno[test.neighbors[[i]]]) %>%
        which.max() %>%
        names() %>%
        as.numeric()  
    }
  )
  test.acc <- sum(test.predict == test.pheno) / length(test.pheno)
  # map back to original class levels
  for (i in seq(1,length(org_test_levels),1)){
    # dplyr use original class levels
    test.predict <- replace(test.predict,test.predict %in% i,
                            org_test_levels[i])
  }
  test.actual <- as.character(test.pheno.original)
  conf.mat <- as.matrix(table(test.predict,test.actual))
  names(dimnames(conf.mat)) <- c("Test Predicted", "Test Actual")
  list(train_nbs_of_test = test.neighbors, 
       actual = test.actual, confusion = conf.mat,
       prediction = test.predict, accuracy = test.acc)
}


# =========================================================================#
#' npdrDistances2
#'
#' Create m1 x m2 distance matrix between two datasets,
#' (m1 instances and p attributes in dataset1 and
#' m2 instances and p attributes in dataset2).
#' Datasets should not include phenotype column.
#' Uses function dist2 from flexclust.
#' Used by nearestNeighbors2().
#'
#' @param attr.mat1 m1 x p matrix of m instances and p attributes
#' @param attr.mat2 m2 x p matrix of m instances and p attributes
#' @param metric for distance matrix between instances
#' (default: \code{"manhattan"}, others include \code{"euclidean"},
#' and for GWAS \code{"allele-sharing-manhattan"}).
#' @return matrix of m1 instances x m2 instances pairwise distances.
#' @examples
#' train_dat <- case.control.3sets$train
#' valid_dat <- case.control.3sets$validation
#'
#' if (require("flexclust")) {
#'   dist.mat <- npdrDistances2(
#'     train_dat[, names(train_dat) != "class"],
#'     valid_dat[, names(valid_dat) != "class"],
#'     metric = "manhattan"
#'   )
#' }
#' @export
npdrDistances2 <- function(attr.mat1, attr.mat2, metric = "manhattan") {
  check_installed("flexclust", reason = "for matrix distance computation `dist2()`")

  # first mat is rows and second is columns
  npdr.dist.fn <- flexclust::dist2
  # Compute distance matrix between all samples (rows) between test and training data
  # default is numeric manhattan ("manhattan"), max-min scaling is only needed for relief
  if (metric == "allele-sharing-manhattan") {
    # allele-sharing-manhattan, AM for SNPs
    distance.mat <- npdr.dist.fn(attr.mat1 / 2, attr.mat2 / 2, method = "manhattan")
  } else if (metric == "euclidean") {
    distance.mat <- npdr.dist.fn(attr.mat1, attr.mat2, method = "euclidean")
  } else {
    distance.mat <- npdr.dist.fn(attr.mat1, attr.mat2, method = "manhattan")
  }
  as.matrix(distance.mat)
}

# =========================================================================#
#' nearestNeighbors2
#'
#' Find nearest neighbors of each instance in attr.mat2 (test) to instances in
#' attr.mat1 (train) using relief neighborhood methods.
#' Used by npdrLearner, nearest neighbor classifier.
#' Input data should not include phenotype column.
#'
#' @param attr.mat1 m1 x p matrix of m instances and p attributes (training data)
#' @param attr.mat2 m2 x p matrix of m instances and p attributes (test data)
#' @param nbd.metric used in npdrDistances2 for distance matrix between
#' instances, default: `manhattan` (numeric)
#' @param nbd.method neighborhood method: `multisurf` or `surf` (no k) or
#' `relieff` (specify k)
#' @param sd.vec vector of standard deviations
#' @param sd.frac multiplier of the standard deviation from the mean distances,
#' subtracted from mean distance to create for SURF or multiSURF radius.
#' The multiSURF default "dead-band radius" is sd.frac=0.5: mean - sd/2
#' @param k number of constant nearest hits/misses for `relieff` (fixed k).
#' The default k=0 means use the expected SURF theoretical k with sd.frac
#' (0.5 by default) for relieff nbd.
#' @param dopar.nn whether or not neighborhood is computed in parallel, default as F
#' @return list of Ri's (data2 test instances) NN's in data1 (train instances)
#'
#' @examples
#' train_dat <- case.control.3sets$train
#' valid_dat <- case.control.3sets$validation
#' test.neighbors <- nearestNeighbors2(
#'   train_dat[, names(train_dat) != "class"],
#'   valid_dat[, names(valid_dat) != "class"], # no phenotype column
#'   nbd.method = "relieff",
#'   nbd.metric = "manhattan",
#'   sd.vec = NULL, sd.frac = 0.5,
#'   k = 0, # uses multisurf k estimate
#'   dopar.nn = FALSE
#' )
#' @export
nearestNeighbors2 <- function(attr.mat1, attr.mat2,
                              nbd.method = "multisurf",
                              nbd.metric = "manhattan",
                              sd.vec = NULL, sd.frac = 0.5, dopar.nn = FALSE,
                              k = 0) {
  if (dopar.nn) {
    check_installed("foreach", reason = "for fast parallel computing with `foreach()` and `%dopar%`")
    check_installed("doParallel", reason = "for `registerDoParallel()`")
    check_installed("parallel", reason = "for `makeCluster()`, `detectCores()`, and `stopCluster()`")
    `%dopar%` <- foreach::`%dopar%`
  }

  # create a matrix with num.samp rows and two columns
  # first column is sample Ri, second is Ri's nearest neighbors
  num.samp1 <- nrow(attr.mat1) # training set, rows of dist mat
  num.samp2 <- nrow(attr.mat2) # testing set, cols of dist mat

  dist.mat <- npdrDistances2(
    as.matrix(attr.mat1),
    as.matrix(attr.mat2),
    metric = nbd.metric
  )
  dist.df <- dist.mat %>% 
    as.data.frame() %>% 
    `colnames<-`(seq.int(num.samp2)) %>% 
    `rownames<-`(seq.int(num.samp1))

  if (nbd.method == "relieff") {
    if (k == 0) { # if no k specified or value 0
      # replace k with the theoretical expected value for SURF (close to multiSURF)
      erf <- function(x) 2 * pnorm(x * sqrt(2)) - 1
      # theoretical surf k (sd.frac=.5) for regression problems (does not depend on a hit/miss group)
      k <- floor((num.samp1 - 1) * (1 - erf(sd.frac / sqrt(2))) / 2) # uses sd.frac
    }

    if (dopar.nn) {
      avai.cors <- parallel::detectCores() - 2
      cl <- parallel::makeCluster(avai.cors)
      doParallel::registerDoParallel(cl)
      Ri.nearestNeighbors.list <- vector("list", num.samp2)
      Ri.nearestNeighbors.list <-
        foreach::foreach(Ri.int = seq.int(num.samp2), .packages = c("dplyr")) %dopar% {

          # foreach return, makes Ri.nearestNeighbors.list
          return(get_Ri_nearest(dist.df, as.character(Ri.int), nbd.method, k = k))
        }
      parallel::stopCluster(cl)
    } else { # relieff, no parallel
      Ri.nearestNeighbors.list <- vector("list", num.samp2)
      # look down each column of dist.df for neighbors of Ri
      for (Ri in colnames(dist.df)) { # for each instance/column Ri
        Ri.int <- as.integer(Ri)
        Ri.nearest.idx <- get_Ri_nearest(dist.df, as.character(Ri.int), nbd.method, k = k)

        if (!is.null(Ri.nearest.idx)) { # if neighborhood not empty
          Ri.nearestNeighbors.list[[Ri.int]] <- Ri.nearest.idx
        }
      } # end for
    }
    return(Ri.nearestNeighbors.list)
  }
  
  if (nbd.method == "surf") {
    radius.surf <- mean(dist.mat) # const r = mean(all distances)
    sd.const <- sd(dist.mat)
    # bam: orignal surf does not subtract sd-frac but should for fair multisurf comparison
    Ri.radius <- rep(radius.surf - sd.frac * sd.const, num.samp2)
    names(Ri.radius) <- as.character(1:num.samp2)
  }
  
  if (nbd.method == "multisurf") {
    sd.vec <- sd.vec %||% sapply(1:num.samp2, function(i) sd(dist.mat[, i]))
    Ri.radius <- colMeans(dist.mat) - sd.frac * sd.vec # use adaptive radius
    names(Ri.radius) <- as.character(1:num.samp2)
  }
  
  if (dopar.nn) {
    avai.cors <- parallel::detectCores() - 2
    cl <- parallel::makeCluster(avai.cors)
    doParallel::registerDoParallel(cl)
    Ri.nearestNeighbors.list <- vector("list", num.samp2)
    Ri.nearestNeighbors.list <-
      foreach::foreach(
        Ri.int = seq.int(num.samp2), .packages = c("dplyr")
      ) %dopar% {
        return(get_Ri_nearest(dist.df, as.character(Ri.int), nbd.method, Ri.radius = Ri.radius))
      }
    parallel::stopCluster(cl)
  } else {
    # put each Ri's nbd in a list then rbind them at the end with bind_rows()
    Ri.nearestNeighbors.list <- vector("list", num.samp2) # initialize list

    for (Ri in colnames(dist.df)) { # for each sample Ri
      Ri.int <- as.integer(Ri)
      Ri.nearest.idx <- get_Ri_nearest(dist.df, Ri, nbd.method, Ri.radius = Ri.radius)

      if (!is.null(Ri.nearest.idx)) { # similar to relieff
        Ri.nearestNeighbors.list[[Ri.int]] <- Ri.nearest.idx
      }
    }
  }

  # list of Ri's (data2 test instances) NN's in data1 (train instances)
  Ri.nearestNeighbors.list
}