R/helpers.R
In oscar: Optimal Subset Cardinality Regression (OSCAR) Models Using the L0-Pseudonorm

Documented in oscar.binarize oscar.bs oscar.bs.k oscar.cost.after oscar.cv oscar.pareto oscar.sparsify

####
#
# Relevant helper functions
#
####

#' @title Cross-validation for oscar-fitted model objects over k-range
#'
#' @description Create a cross-validation matrix with the chosen goodness metric with n-folds. Based on the goodness metric, one ought to pick optimal cardinality (parameter 'k').
#'
#' @param fit oscar-model object
#' @param fold Number of cross-validation folds, Default: 10
#' @param seed Random seed for reproducibility with NULL indicating that it is not set, Default: NULL
#' @param strata Should stratified cross-validation be used; separate values indicate balanced strata. Default: Unit vector, which will treat all observations equally.
#' @param verb Level of verbosity with higher integer giving more information, Default: 0
#' @param ... Additional parameters passed to oscar-function
#'
#' @return A matrix with goodness of fit over folds and k-values
#'
#' @details A k-fold cross-validation is run by mimicking the parameters contained in the original oscar S4-object. This requires the original data at slots @x and @y.
#*
#' @examples 
#' if(interactive()){
#'   data(ex)
#'   fit <- oscar(x=ex_X, y=ex_Y, k=ex_K, w=ex_c, family='cox')
#'   fit_cv <- oscar.cv(fit, fold=10, seed=123)
#'   fit_cv
#' }
#' @rdname oscar.cv
#' @export 
#' @importFrom survival coxph Surv
oscar.cv <- function(
	# oscar-object
	fit,
	# k-fold
	fold = 10,
	# RNG seed (integer) that can be set for exact reproducibility
	seed = NULL,
	# Should some strata be balanced over the bins? By default no, should be a vector equal to the number of rows in x
        strata = rep(1, times=nrow(fit@x)),
	# Level of verbosity mainly for debugging
	verb = 0,
	# Additional parameters passed to oscar-function
	...
){
	# Internal cv-sample allocation function, modified from the ePCR-package
	cv <- function(
		# Original x data frame
		x,
		# Number of CV-folds
		fold = fold,
		# Should some strata be balanced over the bins? By default no, should be a vector equal to the number of rows in x
		strata = rep(1, times=nrow(x)),
		# Should data be randomized when creating cv-folds in addition to allocating to bins
		shuffle = TRUE,
		# Seed number for reproducibility
		# If NULL then seed is not set
		seed = seed
	){
		# Seed number
		if(!is.null(seed)) set.seed(seed)
	
		# Allocate folds
		uniqs <- unique(strata)
		folds <- rep(1:fold, times=ceiling(nrow(x)/fold))[1:nrow(x)]
		ifelse(shuffle,
			ord <- unlist(lapply(uniqs, FUN=function(z) sample(which(strata==z)))),
			ord <- unlist(lapply(uniqs, FUN=function(z) which(strata==z))))
		
		whichfold <- vector(length=nrow(x))
		whichfold[ord] <- folds
	
		# Construct test and train sets based on the folds
		test <- lapply(1:fold, FUN=function(z) which(z==whichfold))
		train <- lapply(1:fold, FUN=function(z) do.call("c", test[-z]))
		
		# Return cv sample indices per each fold for train and test sets respectively
		list(train = train, test = test)
	}
	# Generate cv-folds
	cvsets <- cv(fit@x, fold = fold, seed = seed, strata = strata)
	cvsets
	# Verbose
	if(verb>=2) print(str(cvsets))
	
	# K-steps
	ks <- 1:nrow(fit@k)
	# Loop over the CV folds, make a list of predictions and the real values
	cvs <- lapply(1:fold, FUN=function(z){
		if(verb>=1) cat(paste0("\nCV fold ", z, " of ", fold, "\n"))
		
		# Remove redundant columns but give out a warning
		x <- fit@x[cvsets$train[[z]],]
		k <- fit@k
		w <- fit@w
		if(any(apply(x, MARGIN=2, FUN=\(q){ all(q==unique(q)[1]) }))){
			omits <- which(apply(x, MARGIN=2, FUN=\(q){ all(q==unique(q)[1]) }))
			x <- x[,-omits]
			k <- k[-omits,-omits]
			w <- w[-omits]
			warning(paste("CV matrix 'x' contains redundant data columns in fold", z, " and these are removed in the cross-validation"))
		}
		# Constructing appropriate model object
		if(fit@family == "cox"){
			# Cox model is 2-column in y-response
			y <- survival::Surv(fit@y[cvsets$train[[z]],])
		}else if(fit@family %in% c("mse", "gaussian", "logistic")){
			# All other models have a y-vector
			y <- c(fit@y)[cvsets$train[[z]]]
		}else{
			stop(paste("Incorrect family-parameter fit@family:", fit@family))
		}
		# Exhaustively use same set of fitting parameters in CV fits as in original fit
		fittmp <- oscar::oscar(
			x = x, 
			y = y, 
			family = fit@family, 
			k = k, 
			w = w, 
			kmax = fit@kmax,
			verb = verb, 
			start = fit@start, 
			#rho = fit@rho,
			solver = fit@solver,
			in_selection = fit@in_selection, 
			percentage = fit@percentage, 
			...
		)
		
		if(verb>=1) print("CV fit, predicting...")
		# Perform predictions over all k-value fits
		# Model specificity in predictions (?)
		pred <- lapply(1:fit@kmax, FUN=function(ki){
			x <- fit@x[cvsets$test[[z]],,drop=FALSE]
			# MSE/Gaussian
			if(fit@family %in% c("mse", "gaussian")){
				oscar::predict(fit, type = "response", k = ki, newdata = x)
			# Logistic	
			}else if(fit@family %in% c("logistic")){
				oscar::predict(fit, type = "response", k = ki, newdata = x)
			# Cox
			}else if(fit@family %in% c("cox")){
				oscar::predict(fit, type = "response", k = ki, newdata = x)
			}
		})
		# True values; vectorization if not Cox ph model
		if(fit@family=="cox"){
			true <- fit@y[cvsets$test[[z]],]
		}else if(fit@family %in% c("mse", "gaussian", "logistic")){
			true <- c(fit@y)[cvsets$test[[z]]]
		}else{
			stop(paste("Incorrect family-parameter fit@family:", fit@family))
		}
		
		if(verb>=1) print(paste("Finished fold", z))
		
		# Return predictions vs. real y-values
		list(
			# Predicted values
			pred = pred, 
			# True values; vectorization if not Cox ph model
			true = true
		)
	})
	
	if(verb>=2){
		print("cvs prior to goodness measure")
		print(class(cvs))
		print(cvs)
	}
	
	# Construct goodness as a function of k
	# Loop over cv folds
	# $pred slot holds k-iterations
	# $true slot holds the real answer
	cvs <- lapply(cvs, FUN=function(z){
		if(verb>=2){
			print("true:")
			print(z$true)
			print("z$pred:")
			print(z$pred)
		}
		lapply(z$pred, FUN=function(q){
			# MSE/Gaussian, mean squared error
			if(fit@family %in% c("mse", "gaussian")){
				mean((q-z$true)^2)
			# Logistic; ROC-AUC by default
			}else if(fit@family %in% c("logistic") & fit@metric == "auc"){
				#pROC::auc(response = z$true, predictor = c(q))
				# Less 'cat' output
				invisible(as.numeric(pROC::auc(pROC::roc(response=z$true, predictor=c(q), levels=c(0,1), direction="<"))))
				#sum(as.integer(q>0.5)==z$true)/length(q)
			# Logistic; correct classification rate if metric desired is accuracy
			}else if(fit@family %in% c("logistic") & fit@metric == "accuracy"){
				sum(as.integer(q>0.5)==z$true)/length(q)
			# Cox proportional hazards model; concordance-index
			}else if(fit@family %in% c("cox")){
				# NOTE: Assuming first colmn is the survival time, second is the observed event status
				survival::coxph(survival::Surv(time = z$true[,1], event = z$true[,2]) ~ q)$concordance["concordance"]
			}else{
				stop(paste("Invalid 'family'", fit@family, "or 'metric':", fit@metric))
			}
		})
	})
	
	# Rows: cv-folds, cols: k-values
	cvs <- do.call("rbind", lapply(cvs, FUN=function(z) do.call("c", z)))
	rownames(cvs) <- paste("cv_", 1:nrow(cvs), sep="")
	# Return cv results
	cvs
}

#' @title Bootstrapping for oscar-fitted model objects
#'
#' @description This model bootstraps the fitting of a given oscar object (re-fits the model for data that is equal in size but sampled with replacement). The output objects give insight into robustness of the oscar-coefficient path, as well as relative importance of model objects.
#'
#' @param fit oscar-model object
#' @param bootstrap Number of bootstrapped datasets, Default: 100
#' @param seed Random seed for reproducibility with NULL indicating that it is not set, Default: NULL
#' @param verb Level of verbosity with higher integer giving more information, Default: 0
#' @param ... Additional parameters passed to oscar-function
#'
#' @return 3-dimensional array with dimensions corresponding to k-steps, beta coefficients, and bootstrap runs
#'
#' @details The function provides a fail-safe try-catch in an event of non-convergence of the model fitting procedure. This may occur for example if a bootstrapped data matrix has a column consist of a single value only over all observations.
#'
#' @examples 
#' if(interactive()){
#'   data(ex)
#'   fit <- oscar(x=ex_X, y=ex_Y, k=ex_K, w=ex_c, family='cox')
#'   fit_bs <- oscar.cv(fit, bootstrap = 20, seed = 123)
#'   fit_bs
#' }
#' @rdname oscar.bs
#' @export 
oscar.bs <- function(
	# oscar-object
	fit,
	# How many bootstrapped datasets to generate
	bootstrap = 100,
	# RNG seed (integer) that can be set for exact reproducibility
	seed = NULL,
	# Level of verbosity (<1 supresses everything in R; parameter also passed to Fortran subroutine)
	verb = 0,
	# Additional parameters passed to oscar-function
	...
){
	# Seed number for RNG reproducibility
	if(!is.null(seed)) set.seed(seed)
	# Generate and fit bootstrapped datasets
	bperks <- lapply(1:bootstrap, FUN=function(i){
		if(verb>=1) print(paste("Bootstrap iteration", i))
		# Sampling with replacement from rows
		samps <- sample(1:nrow(fit@x), replace=TRUE)
		xtemp <- fit@x[samps,]
		ytemp <- fit@y[samps,]
		# Failsafe to not produce any rendundant columns
		while(any(apply(xtemp, MARGIN=2, FUN=\(q){ all(q==unique(q)[1]) }))){
			warning("Boostrapping produces a redundant variable, re-sampling...")
			# Sampling with replacement from rows
			samps <- sample(1:nrow(fit@x), replace=TRUE)
			xtemp <- fit@x[samps,]
			ytemp <- fit@y[samps,]
		}
		# Wrap expression inside try for catching errors
		try({
			# Exhaustively use same set of fitting parameters in CV fits as in original fit
			ftemp <- oscar::oscar(
				x = xtemp, 
				y = ytemp, 
				family = fit@family, 
				k = fit@k, 
				w = fit@w, 
				kmax = fit@kmax,
				verb = verb, 
				start = fit@start, 
				#rho = fit@rho,
				solver = fit@solver,
				in_selection = fit@in_selection, 
				percentage = fit@percentage, 
				...
			)
		})
		# Return successfully fitted model
		if(!inherits(ftemp,"try-error")){
			ftemp@bperk # Return bootstrapped beta per ks
		}else{
			NA # Model fitting issues, return NA-bperk
		}
	})
	# Extract bootstrapped beta coefficient values and create a 3-dim array of regular size
	dat <- array(as.numeric(unlist(lapply(bperks, FUN=c))), 
		dim=c(fit@kmax, ncol(bperks[[1]]), bootstrap), 
		dimnames=list(paste0("k_",1:fit@kmax), colnames(bperks[[1]]), paste0("bs_",1:bootstrap))
	)	
	# Return 3-dim array with first dim as k, second as beta coef, third as bootstrap runs
	dat
}
	
#' @title Reformatting bootstrap output for cardinality k rows
#'
#' @description The function reformats bootstrapped runs to a single long data.frame, where all bootstrapped runs are covered along with the choices for the variables at each cardinality 'k'.
#'
#' @param bs Bootstrapped list from oscar.bs
#'
#' @return Reformatted data.frame
#'
#' @examples 
#' if(interactive()){
#'   data(ex)
#'   fit <- oscar(x=ex_X, y=ex_Y, k=ex_K, w=ex_c, family='cox')
#'   fit_bs <- oscar.bs(fit, bootstrap = 20, seed = 123)
#'   ll <- oscar.bs.k(fit_bs)
#'   head(ll)
#'   tail(ll)
#' }
#' @rdname oscar.bs.k
#' @export
oscar.bs.k <- function(
	bs	# Bootstrapped list from oscar.bs
){
	# Omit entries with try-errors
	if(any(unlist(lapply(bs, FUN=function(x) { inherits(x, "try-error") })))){
		bs <- bs[-which(unlist(lapply(bs, FUN=function(x) { inherits(x, "try-error") })))]
		warning(paste("try-errors detected in some bootstrap runs; failed bootstrap count:", sum(unlist(lapply(bs, FUN=function(x) { inherits(x, "try-error") })))))
	}
	
	# Choices of variables as a function of k
	bs <- apply(bs, MARGIN=3, FUN=function(z){
		apply(z, MARGIN=1, FUN=function(q){
			colnames(z)[which(!q==0)]
		})
	})
	
	# Concatenate all together into a data.frame
	bs <- as.data.frame(
		do.call("rbind", 
			lapply(1:length(bs), FUN=function(z){
				do.call("rbind", lapply(1:length(bs[[z]]), FUN=function(q){
					cbind(
						kth = rep(paste("k_", q, sep=""), times=length(bs[[z]][[q]])), # k:th var
						bsn = z, # z:th bootstrap run
						var = bs[[z]][[q]] # The variables in order
					)
				}))
			})
		)
	)
	
	bs
}

#' @title Create a sparse matrix representation of betas as a function of k
#'
#' @description Variable estimates (rows) as a function of cardinality (k, columns). Since a model can drop out variables in favor of two better ones as k increases, this sparse representation helps visualize which variables are included at what cardinality.
#'
#' @param fit oscar-model object
#' @param kmax Create matrix until kmax-value; by default same as for fit object, but for high dimensional tasks one may wish to reduce this
#'
#' @return A sparse matrix of variables (rows) as a function of cardinality k (columns), where elements are the beta estimates.
#'
#' @details Uses sparseMatrix-class from Matrix-package
#'
#' @examples 
#' if(interactive()){
#'   data(ex)
#'   fit <- oscar(x=ex_X, y=ex_Y, k=ex_K, w=ex_c, family='cox')
#'   oscar.sparsify(fit, kmax=5)
#' }
#'
#' @rdname oscar.sparsify
#' @importFrom Matrix sparseMatrix
#' @export 
oscar.sparsify <- function(
	fit,
	kmax = fit@kmax
){
	if(!inherits(fit, "oscar")){
		stop("'fit' should be a fit oscar object")
	}
	
	# Order in which variables are first observed as non-zero
	#varorder <- unique(unlist(fit@kperk))
	# For models fit kits, a more sophisticated approach is required
	varorder <- unique(unlist(apply(fit@bperk, MARGIN=1, FUN=function(z) which(!z==0))))

	# Ordered beta matrix, order variables and transpose
	bkorder <- t(fit@bperk[,varorder])
	# Extract pairwise indices for non-zero elements
	nonzeroes <- which(!bkorder==0, arr.ind=TRUE)
	
	smat <- Matrix::sparseMatrix(
		i = nonzeroes[,1], # row indices of non-zero elements
		j = nonzeroes[,2], # col indices of non-zero elements
		x = c(bkorder[!bkorder==0]) # beta estimates at {i,j} running indices
	)
	dimnames(smat) <- dimnames(bkorder)	
	
	smat[,1:kmax]
}

#' @title Binary logical indicator matrix representation of an oscar object's coefficients (zero vs. non-zero, i.e. feature inclusion)
#'
#' @description Create a sparse matrix with binary indicator 1 indicating that a coefficient was non-zero, and value 0 (or . in sparse matrix) indicating that a coefficient was zero (i.e. feature not included)
#'
#' @param fit Fit oscar-model object
#' @param kmax Create matrix until kmax-value; by default same as for fit object, but for high dimensional tasks one may wish to reduce this
#'
#' @return A binary logical indicator matrix of variables (rows) as a function of cardinality k (columns), where elements are binary indicators for 1 as non-zero and 0 as zero.
#'
#' @details The matrix consists of TRUE/FALSE values, and is very similar to the oscar.sparsify, where the function provides estimate values in a sparse matrix format.
#'
#' @examples 
#' if(interactive()){
#'   data(ex)
#'   fit <- oscar(x=ex_X, y=ex_Y, k=ex_K, w=ex_c, family='cox')
#'   oscar.binarize(fit, kmax=5)
#' }
#'
#' @rdname oscar.binarize
#' @export 
oscar.binarize <- function(
	fit, # oscar model object
	kmax = fit@kmax # limit to kmax
){
	if(!inherits(fit, "oscar")){
		stop("'fit' should be a fit oscar object")
	}
	
	# Full sparse matrix representation
	binmat <- apply(oscar::oscar.sparsify(fit), MARGIN=2, FUN=function(z) { ifelse(z==0, FALSE, TRUE) })
	
	binmat[,1:kmax]
}

#' @title Retrieve a set of pareto-optimal points for an oscar-model based on model goodness-of-fit or cross-validation
#'
#' @description This function retrieves the set of pareto optimal points for an oscar model fit in n-proportional time as cardinality axis is readily sorted. It is advisable to optimize model generalization (via cross-validation) rather than mere goodness-of-fit.
#'
#' @param fit Fit oscar S4-object
#' @param cv A cross-validation matrix as produced by oscar.cv; if CV is not provided, then goodness-of-fit from fit object itself is used rather than cross-validation generalization metric
#' @param xval The x-axis to construct pareto front based on; by default 'cost' vector for features/kits, can also be 'cardinality'/'k'
#' @param weak If weak pareto-optimality is allowed; by default FALSE.
#' @param summarize Function that summarizes over cross-validation folds; by default, this is the mean over the k-folds.
#' 
#' @return A data.frame containing points and indices at which pareto optimal points exist
#'
#' @examples 
#' if(interactive()){
#'   data(ex)
#'   fit <- oscar(x=ex_X, y=ex_Y, k=ex_K, w=ex_c, family='cox')
#'   fit_cv <- oscar.cv(fit, fold=10)
#'   oscar.pareto(fit, cv=fit_cv)
#' }
#'
#' @rdname oscar.pareto
#' @export
oscar.pareto <- function(
	fit,
	cv,
	xval = "cost",
	weak = FALSE,
	summarize = mean
){
	# 
	if(xval == "cost"){
		xs <- unlist(lapply(1:fit@kmax, FUN=function(k) { oscar::cost(fit, k) }))
		ord <- order(xs)
	}else if(xval %in% c("cardinality", "k")){
		xs <- 1:fit@kmax
		ord <- 1:fit@kmax
	}else{
		stop(paste("Invalid xval value:",xval))
	}
	# x-axis is always the cardinality k
	# y-axis is either the goodness of fit or cross-validation performance. Latter is preferred.
	# No CV provided
	if(missing(cv)){
		ys <- fit@goodness
	# CV provided
	}else{
		# Apply summarization function over columns (the cardinality values), as rows are the folds in CV
		ys <- apply(cv, MARGIN=2, FUN=summarize)
	}
	# For certain metrics, test in the other direction
	if(fit@metric %in% c("mse")){
		ys <- -ys
	}
	# Loop over the x-values in ascending order; first one is always a pareto point, so we'll store it
	# Assuming x-axis is cardinality, we want lower cardinality models i.e. conservative in terms of their complexity
	# xs are actually cardinalities starting from 1, so we can just use integer indices
	df <- data.frame(y = ys, x = xs, ord = ord)
	df <- df[ord,]	
	ymax <- df$y[1]
	paretos <- 1
	for(i in 1:nrow(df)){
		# Always testing for higher values, as the metrics in other direction were already flipped
		if(df$y[i] > ymax){
			paretos <- c(paretos, i)
			ymax <- df$y[i]
		}
	}
	# Result for {x,y,pareto yes/no}
	res <- data.frame(k = df$x, y = df$y, paretofront = FALSE, ord = ord)
	# Replace y with the actual model metric
	names(res)[1:2] <- c(xval, fit@metric)
	res[paretos,"paretofront"] <- TRUE
	# If weak pareto-optimal points should be removed as per default
	if(!weak){
		res <- do.call("rbind", by(res, INDICES=res[,1], FUN=function(x) { x[which(!x[,2]==max(x[,2])),3]<-FALSE; x }))
		rownames(res) <- NULL
	}
	# Return to original point ordering
	res <- res[order(res$ord),]
	res
}

#' @title Return total cost of model fits if the cost is not included in the oscar object
#'
#' @description If at least one measurement from a kit is included in the model, the kit cost is added.
#'
#' @param object Fit oscar S4-object 
#'
#' @return A vector for numeric values of total kit costs at different cardinalities.
#'
#' @examples 
#' if(interactive()){
#'   data(ex)
#'   fit <- oscar(x=ex_X, y=ex_Y, k=ex_K, w=ex_c, family='cox')
#'   oscar.cost.after(fit)
#' }
#'
#' @rdname oscar.cost.after
#' @export
oscar.cost.after <- function(object){

	# The values are readily stored in oscar S4-objects
	kit.matrix <- object@k
	cost.vector <- object@w

  # Assume that the features are in the same order in x-matrix of oscar object and kit matrix
  # Assume that the kits are in the same order in kit.matrix and cost.vector
  
  #Sanity checks
  if(nrow(kit.matrix)!=length(cost.vector)){
    stop("Number of kits in the kit matrix is different from the length of cost vector.")
  }
  if(ncol(kit.matrix)!=ncol(object@x)){
    stop("Number of predictors in the kit.matrix differs from the number of predictors in x-matrix of oscar object.")
  }
  
  costs <- c()
  for(i in 1:object@kmax){ # Go through each cardinality
    cost.tmp <-0
    nzero <- which(object@bperk[i,]!=0)

    for(j in 1:ncol(kit.matrix)){ #Go through kits
      if(any(kit.matrix[j,nzero]!=0)){
        cost.tmp <- cost.tmp+cost.vector[j] # Add kit price if any feature is incl.
      }
    }
    costs<-c(costs,as.numeric(cost.tmp))
  }
  return(costs)
}
Any scripts or data that you put into this service are public.
oscar documentation built on Oct. 2, 2023, 5:08 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
oscar
Optimal Subset Cardinality Regression (OSCAR) Models Using the L0-Pseudonorm

R/helpers.R
In oscar: Optimal Subset Cardinality Regression (OSCAR) Models Using the L0-Pseudonorm

Defines functions oscar.cost.after oscar.pareto oscar.binarize oscar.sparsify oscar.bs.k oscar.bs oscar.cv

Documented in oscar.binarize oscar.bs oscar.bs.k oscar.cost.after oscar.cv oscar.pareto oscar.sparsify

Try the oscar package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

oscar Optimal Subset Cardinality Regression (OSCAR) Models Using the L0-Pseudonorm

R/helpers.R In oscar: Optimal Subset Cardinality Regression (OSCAR) Models Using the L0-Pseudonorm

Defines functions oscar.cost.after oscar.pareto oscar.binarize oscar.sparsify oscar.bs.k oscar.bs oscar.cv

Documented in oscar.binarize oscar.bs oscar.bs.k oscar.cost.after oscar.cv oscar.pareto oscar.sparsify

Try the oscar package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

oscar
Optimal Subset Cardinality Regression (OSCAR) Models Using the L0-Pseudonorm

R/helpers.R
In oscar: Optimal Subset Cardinality Regression (OSCAR) Models Using the L0-Pseudonorm