R/glmvsd.R

Defines functions glmvsd

Documented in glmvsd

glmvsd <- function(x, y, n_train = ceiling(n/2), no_rep = 100, 
					n_train_bound = n_train-2, n_bound = n-2,
					model_check, psi = 1, 
					family = c("gaussian", "binomial"), method = c("union", "customize"),
                   	candidate_models, weight_type = c("BIC", "AIC", "ARM"), prior = TRUE,
					reduce_bias = FALSE) {
    # check data and parameter
    family <- match.arg(family)
    method <- match.arg(method)
    weight_type <- match.arg(weight_type)
    y <- drop(y)
    y <- as.numeric(y)
    x <- as.matrix(x)
    p <- NCOL(x)
    n <- length(y)
    if (family == "binomial") {
      if (!all(y %in% c(0, 1))) 
        stop("There can only be 0 or 1 in y when using binomial family")
    }
    if (n != NROW(x)) 
        stop("x and y have different number of observations")
    if (n_train >= n) 
        stop("Training size must be less than the number of observations")
    if (missing(model_check)) 
        stop("User must provide a base model.")
	if(is.vector(model_check)) model_check <- matrix(model_check,nrow=1)
    # use union option to compute candidate models
    if (method == "union") {
      if (family == "gaussian") 
        candidate_models <- gaussianfit(x, y)
      if (family == "binomial") 
        candidate_models <- binomialfit(x, y)
    }
    if (method == "customize") {
      if (missing(candidate_models)) 
        stop("Users must supply a candidate model.")
      if (!is.matrix(candidate_models)) 
        stop("Supplied model must be a matrix.")
      if (NCOL(candidate_models) != NCOL(x)) 
        stop("Number of variables in candidate model and x does not match.")
      if (!all(as.numeric(candidate_models) %in% c(0, 1))) 
        stop("There can only be 0 or 1 in candidate_models")
    }
    # clean the candidate models mk
    candidate_models <- unique(candidate_models)
    rownames(candidate_models) <- NULL
    candidate_models <- candidate_models[order(rowSums(candidate_models)), ]
    if (weight_type == "ARM") {
      candidate_models <- candidate_models[rowSums(candidate_models) < n_train_bound, ]
    }else{
	  candidate_models <- candidate_models[rowSums(candidate_models) < n_bound, ]
    }
    # compute weights
    if (family == "gaussian") {
      if (weight_type == "ARM") {
        fit <- lsARM(x = x, y = y, candidate_models = candidate_models, 
                     n_train = n_train, no_rep = no_rep, psi = psi, prior = prior)
      }
      if (weight_type == "AIC" | weight_type == "BIC") {
        fit <- lsIC(x = x, y = y, candidate_models = candidate_models, 
                    psi = psi, prior = prior, type = weight_type)
      }
    }
    if (family == "binomial") {
      if (weight_type == "ARM") {
        fit <- logitARM(x = x, y = y, candidate_models = candidate_models, 
                        n_train = n_train, no_rep = no_rep, psi = psi, 
						prior = prior, reduce_bias = reduce_bias)
      }
      if (weight_type == "AIC" | weight_type == "BIC") {
        fit <- logitIC(x = x, y = y, candidate_models = candidate_models, 
						psi = psi, prior = prior, type = weight_type, 
						reduce_bias = reduce_bias)
      }
    }
    weight <- fit$weight
	  # compute VSD etc
	  # initialization
	  no_checkmod <- NROW(model_check)
	  VSD <- rep(NA, no_checkmod)
	  VSD_minus <- rep(NA, no_checkmod)
	  VSD_plus <- rep(NA, no_checkmod)
	  Precision <- matrix(NA, no_checkmod, length(weight))
	  Recall <- matrix(NA, no_checkmod, length(weight))
	  Fmeasure <- rep(NA, no_checkmod)
	  Gmeasure <- rep(NA, no_checkmod)
	  sd.F <- rep(NA, no_checkmod)
	  sd.G <- rep(NA, no_checkmod)
	  # size of m0
	  model_check_size <- rowSums(model_check)
	  if(any(model_check_size==0)) warning("The Model under check is a null model. F- and G-measure results will be NA")
	  # size of mk
	  candidate_models_size <- rowSums(candidate_models)
	  # start the loop
	  for (mindex in seq(no_checkmod)) {
	    # compare m0 and mk
	    TMP_matrix <- sweep(candidate_models, MARGIN = 2, model_check[mindex, ], "-")
	    diff <- rowSums(abs(TMP_matrix))
	    diff_plus <- rowSums(TMP_matrix == 1)
	    diff_minus <- rowSums(TMP_matrix == -1)
	    # compute VSD and VSD plus and minus
	    VSD[mindex] <- sum(weight*diff)  # glmvsd value
	    VSD_plus[mindex] <- sum(weight*diff_plus)  # false negative 
	    VSD_minus[mindex] <- sum(weight*diff_minus)  # false positive
	    # compute F measure and G measure using precision and recall
	    Precision[mindex,] <- (model_check_size[mindex]-diff_minus)/model_check_size[mindex]
	    Recall[mindex,] <- (model_check_size[mindex]-diff_minus)/candidate_models_size
	    Fmeasure_tmp <- 2*(model_check_size[mindex]-diff_minus)/(model_check_size[mindex]+candidate_models_size)
	    Gmeasure_tmp <- (model_check_size[mindex]-diff_minus)/sqrt(model_check_size[mindex]*candidate_models_size)
	    Fmeasure[mindex] <- sum(weight*Fmeasure_tmp, na.rm = TRUE)
	    Gmeasure[mindex] <- sum(weight*Gmeasure_tmp, na.rm = TRUE)
		 sd.F[mindex] <- sqrt(sum(weight*(Fmeasure_tmp-Fmeasure[mindex])^2, na.rm = TRUE))
		 sd.G[mindex] <- sqrt(sum(weight*(Gmeasure_tmp-Gmeasure[mindex])^2, na.rm = TRUE))
	  }
    # output 
    object <- list(candidate_models_cleaned = candidate_models, VSD = VSD, VSD_minus = VSD_minus, VSD_plus = VSD_plus,
					Precision = Precision, Recall = Recall,
	    			Fmeasure = Fmeasure, Gmeasure = Gmeasure,
					sd.F = sd.F, sd.G = sd.G,
              weight = weight)
    class(object) <- "glmvsd"
    object
}

Try the glmvsd package in your browser

Any scripts or data that you put into this service are public.

glmvsd documentation built on Aug. 14, 2022, 9:05 a.m.