R/data_eng_obj.R

Defines functions data_bin_2 data_bin_3 data_bin_4

# can be quickly applied to data.table

# bin data vectors into 4 levels
data_bin_4 <- function(x) {
  p = quantile(x)
  if (any(table(p) > 1)) {
    # then we have duplicate levels
    p <- unique(p)
  }
  as.numeric(cut(x, breaks=p, include.lowest = T))
}


# bin data vectors into 3 levels
data_bin_3 <- function(x) {
  p = quantile(x, probs = c(0, 0.33333, 0.66666, 1))
  if (any(table(p) > 1)) {
    # then we have duplicate levels
    p <- unique(p)
  }
  as.numeric(cut(x, breaks=p, include.lowest = T))
}


# bin data vectors into 2 levels
data_bin_2 <- function(x) {
  p = quantile(x, probs = c(0, 0.5, 1))
  if (any(table(p) > 1)) {
    # then we have duplicate levels
    p <- unique(p)
  }
  as.numeric(cut(x, breaks=p, include.lowest = T))
}


#' deng_obj  data engineering object (deng)
#' 
#' An object that transforms data into more robust forms.
#' This includes pairs, signature-pairs, quartiles, and ranks
#'
#' @examples
#' # New object
#' ann <- Recm$new("Ann")
#' 
#' @export
Data_eng <- R6Class("Data_eng",
                public = list(
                  
                  #' @field data_mode what type of engineering to do
                  data_mode = NULL,
                  
                  #' @field signatures if there are signatures to be used
                  signatures = NULL,
                  
                  #' @field pair_list list of column names to create pairs
                  pair_list = NULL,
                  
                  #' @description Create a new `Recm` object.
                  #' @param name The object is named.
                  #' @return A new `recm` object.
                  initialize = function(data_mode  = NULL,
                                        signatures = NULL,
                                        pair_list  = NULL) {
                    self$data_mode <- data_mode
                    self$signatures <- signatures
                    self$pair_list <- pair_list
                    
                    if (!all(data_mode %in% c('pairs','allpairs','sigpairs','namedpairs','quartiles','tertiles','binary','ranks','original'))) {
                      print("ERROR:  please choose a valid collection of data modes: ")
                      print('pairs,namedpairs,allpairs,sigpairs,quartiles,tertiles,binary,ranks,original')
                      stop(paste0('data_mode, ', self$data_mode  ,' wrong value'))
                    }
                    
                  },
                  
                  
                  # data engineering
                  #' @description Data engineering, replaces the object's data.table.
                  data_eng = function(data=NULL) {
                    
                    rankdat <- NULL
                    pairdat <- NULL
                    newdat <- data.table()
                    allgenes <- c(unlist(self$pair_list), unlist(self$signatures))
                    datcols <- colnames(data)
                    
                    if ('original' %in% self$data_mode) {
                      newdat <- cbind(newdat, data)
                    }
                    
                    if (length(allgenes) == 0) {
                      stop('No provided feature names in pairs or sigpairs.')
                    }
                    
                    if ('binary' %in% self$data_mode) {
                      bindat <- as.data.table(t(apply(data,1,data_bin_2)))
                      colnames(bindat) <- datcols
                      bindat <- bindat[, ..allgenes]
                      colnames(bindat) <- sapply(colnames(bindat), function(a) paste0(a,'_binary',collapse = ''))
                      newdat <- cbind(newdat, bindat)
                    } 
                    
                    if ('tertiles' %in% self$data_mode) {
                      tertdat <- as.data.table(t(apply(data,1,data_bin_3)))
                      colnames(tertdat) <- datcols
                      tertdat <- tertdat[, ..allgenes]
                      colnames(tertdat) <- sapply(colnames(tertdat), function(a) paste0(a,'_tertiles',collapse = ''))
                      newdat <- cbind(newdat, tertdat)
                    } 
                    
                    if ('quartiles' %in% self$data_mode) {
                      quartdat <- as.data.table(t(apply(data,1,data_bin_4)))
                      colnames(quartdat) <- datcols
                      quartdat <- quartdat[, ..allgenes]
                      colnames(quartdat) <- sapply(colnames(quartdat), function(a) paste0(a,'_quartile',collapse = ''))
                      newdat <- cbind(newdat, quartdat)
                    } 

                    if ('ranks' %in% self$data_mode) {
                      rankdat <- as.data.table(t(apply(data,1,rank)))
                      colnames(rankdat) <- datcols
                      rankdat <- rankdat[, ..allgenes]
                      colnames(rankdat) <- sapply(colnames(rankdata), function(a) paste0(a,'_ranked',collapse = ''))
                      newdat <- cbind(newdat, rankdat)
                    }
                    
                    if ('namedpairs' %in% self$data_mode) {
                      # if mode includes 'pairs' then we need to make var-pairs
                      if (is.null(self$pair_list)) {
                        stop('Error: pairlist not found.')
                      }
                      pair_list_format <- gsub(' ', '_', self$pair_list)
                      if (!all(pair_list_format %in% colnames(data))) {
                        print("ERROR: pair_list must be column names in data.")
                        print(pair_list_format)
                        stop(paste0('pair_list contains invalid value'))
                      }
                      if (length(self$pair_list) %% 2 != 0) {
                        print("Pair list must have an even length.")
                        stop('pair_list has invalid format or length.')
                      }
                      newcol_names <- c()
                      newcol_dat <- list()
                      cols <- pair_list_format 
                      for (ci in seq.int(from=1,to=length(self$pair_list),by=2)) {
                          cj <- ci+1
                          res0 <- as.numeric(data[,.SD,.SDcols=cols[ci]] > data[,.SD,.SDcols=cols[cj]])
                          this_new_col <- paste0(cols[ci],'_X_', cols[cj])
                          newcol_names <- c(newcol_names, this_new_col)
                          newcol_dat[[this_new_col]] <- res0
                      }
                      pairdat <- data.table(data.frame(newcol_dat))
                      newdat <- cbind(newdat, pairdat)
                    }

                    if ('allpairs' %in% self$data_mode) {
                      # if mode includes 'pairs' then we need to make var-pairs
                      if (!is.null(self$pair_list)) {
                        pair_list_format <- gsub(' ', '_', self$pair_list)
                        if (!all(pair_list_format %in% colnames(data))) {
                          print("ERROR: pair_list must be column names in data.")
                          print(pair_list_format)
                          stop(paste0('pair_list contains invalid value'))
                        }
                        subdat <- data[, pair_list_format, with=FALSE]  ## subset to only what's in the pair_list
                      } else {
                        subdat <- data
                      }
                      newcol_names <- c()
                      newcol_dat <- list()
                      cols <- colnames(subdat)
                      for (ci in 1:(length(cols)-1)) {
                        for (cj in (ci+1):length(cols)) {
                          res0 <- as.numeric(subdat[,.SD,.SDcols=ci] > subdat[,.SD,.SDcols=cj])
                          this_new_col <- paste0(cols[ci],'_X_', cols[cj])
                          newcol_names <- c(newcol_names, this_new_col)
                          newcol_dat[[this_new_col]] <- res0
                        }
                      }
                      allpairdat <- data.table(data.frame(newcol_dat))
                      newdat <- cbind(newdat, allpairdat)
                    }
                    
                    if ('sigpairs' %in% self$data_mode) {
                      if (is.null(self$signatures)) {
                        stop("Signatures is null in data eng obj. Please include signatures.")
                      }
                      # list of the new columns added to the data
                      newcol_names <- c()
                      # the new data frame we're building
                      newcol_dat <- list()
                      # names of the signatures
                      sig_names <- names(self$signatures)
                      # for each pair of sig_names
                      for (ci in 1:(length(sig_names)-1)) {
                        for (cj in (ci+1):length(sig_names)) {
                          # fix the names and get the features
                          sn1 <- sig_names[ci]
                          sn2 <- sig_names[cj]
                          s1 <- gsub(' ', '_', self$signatures[[sn1]])
                          s2 <- gsub(' ', '_', self$signatures[[sn2]])
                          sig_pair_temp <- list()
                          
                          # each sig is a group to get AUC
                          sig_group = c( rep.int('a', length(s1)), rep.int('b', length(s2)))
                          
                          # for each sample si
                          # sig_pair_temp <- sapply(1:nrow(data), function(si) {
                          #   val_group <- c( as.numeric(data[si,.SD,.SD=s1]), as.numeric(data[si,.SD,.SD=s2]) )
                          #   roc_obj <- ROCit::rocit(val_group, sig_group)
                          #   roc_obj$AUC
                          # }) 
                          
                          ### New sig fun ###
                          compute_roc <- function(sig_group, idx, jdx, a) {
                            val_group <- c( as.numeric(a[idx]), as.numeric(a[jdx]) )
                            roc_obj <- ROCit::rocit(val_group, sig_group)
                            return(roc_obj$AUC)
                          }
                          idx <- which(colnames(data) %in% s1)
                          jdx <- which(colnames(data) %in% s2)
                          sig_pair_temp <- apply(data, 1, FUN = function(a) compute_roc(sig_group, idx, jdx, a))
                          ###    ###    ###
                          
                          this_new_col <- paste0(sig_names[ci],'_X_', sig_names[cj])
                          newcol_names <- c(newcol_names, this_new_col)
                          newcol_dat[[this_new_col]] <- as.numeric(sig_pair_temp)
                          print(paste0("sigpair new column: ", this_new_col ))
                        }
                      }
                      sigpairs_dat <- data.table(data.frame(newcol_dat))
                      newdat <- cbind(newdat, sigpairs_dat)
                    }
              
                    return(newdat)
                  } # end data_eng
                ) # end public
)
                  
Gibbsdavidl/robencla documentation built on Dec. 25, 2024, 12:44 a.m.