R/mplot-package.R

Defines functions process.fn txt.fn safeDeparse mextract

Documented in process.fn txt.fn

#' Graphical model stability and model selection procedures
#'
#' @name mplot-package
#' @docType package
#' @title Graphical model stability and model selection procedures
#' @keywords package
#' @references Tarr G, Mueller S and Welsh AH (2018). mplot: An R Package for 
#'   Graphical Model Stability and Variable Selection Procedures. 
#'   Journal of Statistical Software, 83(9), pp. 1-28. doi: 10.18637/jss.v083.i09
NULL

#' Body fat data set
#'
#' A data frame with 128 observations on 15 variables.
#'
#' @name bodyfat
#' @format A data frame with 128 observations on 15 variables.
#' \describe{
#' \item{Id}{Identifier}
#' \item{Bodyfat}{Bodyfat percentage}
#' \item{Age}{Age (years)}
#' \item{Weight}{Weight (kg)}
#' \item{Height}{Height (inches)}
#' \item{Neck}{Neck circumference (cm)}
#' \item{Chest}{Chest circumference (cm)}
#' \item{Abdo}{Abdomen circumference (cm) "at the umbilicus
#'            and level with the iliac crest"}
#' \item{Hip}{Hip circumference (cm)}
#' \item{Thigh}{Thigh circumference (cm)}
#' \item{Knee}{Knee circumference (cm)}
#' \item{Ankle}{Ankle circumference (cm)}
#' \item{Bic}{Extended biceps circumference (cm)}
#' \item{Fore}{Forearm circumference (cm)}
#' \item{Wrist}{Wrist circumference (cm) "distal to the
#'             styloid processes"}
#' }
#' @details A subset of the 252 observations available in the \code{mfp} package.
#'   The selected observations avoid known high leverage points and
#'   outliers.  The unused points from the data set could be used to validate
#'   selected models.
#' @docType data
#' @keywords datasets
#' @usage data(bodyfat)
#' @references Johnson W (1996, Vol 4). Fitting percentage of
#'   body fat to simple body measurements. Journal of Statistics
#'   Education. Bodyfat data retrieved from
#'   http://www.amstat.org/publications/jse/v4n1/datasets.johnson.html
#'   An expanded version is included in the \code{mfp} R package.
#' @examples
#' data(bodyfat)
#' full.mod = lm(Bodyfat~.,data=subset(bodyfat,select=-Id))
NULL

#' Rock-wallabies data set
#'
#' On Chalkers Top in the Warrumbungles (NSW, Australia) 200 evenly distributed
#' one metre squared plots were surveyed. Plots were placed at a density
#' of 7-13 per hectare. The presence or absence of fresh
#' (<1 month old) scats of rock-wallabies was recorded for each plot
#' along with location and a selection of predictor variables.
#'
#' @name wallabies
#' @format A data frame with 200 observations on 9 variables.
#' \describe{
#' \item{rw}{Presence of rock-wallaby scat}
#' \item{edible}{Percentage cover of edible vegetation}
#' \item{inedible}{Percentage cover of inedible vegetation}
#' \item{canopy}{Percentage canopy cover}
#' \item{distance}{Distance from diurnal refuge}
#' \item{shelter}{Whether or not a plot occurred within a shelter point (large
#'                rock or boulder pile)}
#' \item{lat}{Latitude of the plot location}
#' \item{long}{Longitude of the plot location}
#' }
#' @details Macropods defaecate randomly as they forage and scat 
#'   (faecal pellet) surveys are a reliable method for detecting the
#'   presence of rock-wallabies and other macropods. 
#'   Scats are used as an indication of spatial foraging patterns 
#'   of rock-wallabies and sympatric macropods. Scats deposited while
#'   foraging were not confused with scats deposited while
#'   resting because the daytime refuge areas of rock-wallabies
#'   were known in detail for each colony and no samples were
#'   taken from those areas. Each of the 200 sites were 
#'   examined separately to
#'   account for the different levels of predation risk and the
#'   abundance of rock-wallabies.
#' @docType data
#' @keywords datasets
#' @usage data(wallabies)
#' @references 
#'    Tuft KD, Crowther MS, Connell K, Mueller S and McArthur C (2011), 
#'    Predation risk and competitive interactions affect foraging of 
#'    an endangered refuge-dependent herbivore. Animal Conservation, 
#'    14: 447-457. doi: 10.1111/j.1469-1795.2011.00446.x
#' @examples
#' data(wallabies)
#' wdat = data.frame(subset(wallabies,select=-c(lat,long)), 
#'   EaD = wallabies$edible*wallabies$distance,
#'   EaS = wallabies$edible*wallabies$shelter,
#'   DaS = wallabies$distance*wallabies$shelter)
#' M1 = glm(rw~., family = binomial(link = "logit"), data = wdat)
NULL


#' Blood and other measurements in diabetics
#'
#' The diabetes data frame has 442 rows and 11 columns.
#' These are the data used in Efron et al. (2004).
#'
#' @name diabetes
#' @format A data frame with 442 observations on 11 variables.
#' \describe{
#' \item{age}{Age}
#' \item{sex}{Gender}
#' \item{bmi}{Body mass index}
#' \item{map}{Mean arterial pressure (average blood pressure)}
#' \item{tc}{Total cholesterol (mg/dL)? Desirable range: below 200 mg/dL}
#' \item{ldl}{Low-density lipoprotein ("bad" cholesterol)? 
#'            Desirable range: below 130 mg/dL }
#' \item{hdl}{High-density lipoprotein ("good" cholesterol)? 
#'            Desirable range: above 40 mg/dL}
#' \item{tch}{Blood serum measurement}
#' \item{ltg}{Blood serum measurement}
#' \item{glu}{Blood serum measurement (glucose?)}
#' \item{y}{A quantitative measure of disease progression 
#'          one year after baseline}
#' }
#' @details Data sourced from http://web.stanford.edu/~hastie/Papers/LARS
#' @docType data
#' @keywords datasets
#' @usage data(diabetes)
#' @references Efron, B., Hastie, T., Johnstone, I., Tibshirani, R., (2004).
#'   Least angle regression. The Annals of Statistics 32(2) 407-499.
#'   DOI: 10.1214/009053604000000067
#' @examples
#' data(diabetes)
#' full.mod = lm(y~.,data=diabetes)
NULL




#' Artificial example
#'
#' An artificial data set which causes stepwise regression
#' procedures to select a non-parsimonious model.
#' The true model is a simple linear regression of
#' y against x8.
#'
#' @name artificialeg
#' @format A data frame with 50 observations on 10 variables.
#' @details Inspired by the pathoeg data set in the MPV pacakge.
#' @docType data
#' @keywords datasets
#' @usage data(artificialeg)
#' @examples
#' data(artificialeg)
#' full.mod = lm(y~.,data=artificialeg)
#' step(full.mod)
#' # generating model
#' n=50
#' set.seed(8) # a seed of 2 also works
#' x1 = rnorm(n,0.22,2)
#' x7 = 0.5*x1 + rnorm(n,0,sd=2)
#' x6 = -0.75*x1 + rnorm(n,0,3)
#' x3 = -0.5-0.5*x6 + rnorm(n,0,2)
#' x9 = rnorm(n,0.6,3.5)
#' x4 = 0.5*x9 + rnorm(n,0,sd=3)
#' x2 = -0.5 + 0.5*x9 + rnorm(n,0,sd=2)
#' x5 = -0.5*x2+0.5*x3+0.5*x6-0.5*x9+rnorm(n,0,1.5)
#' x8 = x1 + x2 -2*x3 - 0.3*x4 + x5 - 1.6*x6 - 1*x7 + x9 +rnorm(n,0,0.5)
#' y = 0.6*x8 + rnorm(n,0,2)
#' artificialeg = round(data.frame(x1,x2,x3,x4,x5,x6,x7,x8,x9,y),1)
NULL




#' Forced Expiratory Volume
#'
#' This data set consists of 654 observations on youths aged 3 to 19 from 
#' East Boston recorded duing the middle to late 1970's. 
#' Forced expiratory volume (FEV), a measure of lung capacity, is the 
#' variable of interest. Age and height are two continuous predictors. 
#' Sex and smoke are two categorical predictors.
#'
#' @name fev
#' @format A data frame with 654 observations on 5 variables.
#' \describe{
#' \item{age}{Age (years)}
#' \item{fev}{Forced expiratory volume (liters).  Roughly the amount 
#'            of air an individual can exhale in the first second of 
#'            a forceful breath.}
#' \item{height}{Height (inches).}
#' \item{sex}{Female is 0. Male is 1.}
#' \item{smoke}{A binary variable indicating whether or not the 
#'              youth smokes. Nonsmoker is 0. Smoker is 1.}
#' }
#' @details Copies of this data set can also be found in the 
#'  \code{coneproj} and \code{tmle} packages.
#' @references 
#'  Tager, I. B., Weiss, S. T., Rosner, B., and Speizer, F. E. (1979). 
#'  Effect of parental cigarette smoking on pulmonary function in children. 
#'  \emph{American Journal of Epidemiology}, \bold{110}, 15-26.
#'  
#'  Rosner, B. (1999).
#'  \emph{Fundamentals of Biostatistics}, 5th Ed., Pacific Grove, CA: Duxbury.
#'   
#'   Kahn, M.J. (2005). An Exhalent Problem for Teaching Statistics.
#'   \emph{Journal of Statistics Education},  \bold{13}(2). 
#'    http://www.amstat.org/publications/jse/v13n2/datasets.kahn.html
#' @docType data
#' @keywords datasets
#' @usage data(fev)
#' @examples
#' data(fev)
#' full.mod = lm(fev~.,data=fev)
#' step(full.mod)
NULL




#' Extract model elements
#'
#' This function extracts things like the formula,
#' data matrix, etc. from a lm or glm object
#'
#' @param model a fitted 'full' model, the result of a call
#'   to lm or glm (and in the future lme or lmer).
#' @param screen logical, whether or not to perform an initial
#'   screen for outliers.  Highly experimental, use at own risk.
#'   Default = FALSE.
#' @param redundant logical, whether or not to add a redundant
#'   variable.  Default = TRUE.
#' @noRd
mextract = function(model, screen = FALSE, redundant = TRUE){
  # what's the name of the dependent variable?
  yname = deparse(stats::formula(model)[[2]])
  # Set up the data frames for use
  data = stats::model.frame(model)
  X = stats::model.matrix(model)
  n = nrow(X)
  # full model plus redundant variable
  exp.vars = names(model$coefficients)[names(model$coefficients) != "(Intercept)"]
  
  if (redundant) {
    REDUNDANT.VARIABLE = stats::runif(n, min = 0, max = 1)
    X = cbind(X,REDUNDANT.VARIABLE)
    data = cbind(data,REDUNDANT.VARIABLE)
    exp.vars = c(exp.vars,"REDUNDANT.VARIABLE")
  }
  if (colnames(X)[1] == "(Intercept)") {
    # overwrite intercept with y-variable
    X[,1] = stats::model.frame(model)[,yname]
  } else {
    X = cbind(stats::model.frame(model)[,yname],X)
  }
  colnames(X)[1] = yname
  X = data.frame(X)
  fixed = stats::as.formula(
    paste(
      paste(yname, "~"),
      paste(colnames(X)[-1], collapse = "+"), 
      collapse=" ")
  )
  Xy = X[c(2:ncol(X),1)]
  
  k = length(exp.vars) + 1 # +1 for intercept
  if (screen) {
    if (!requireNamespace("mvoutlier", quietly = TRUE)) {
      stop("mvoutlier package needed when screen=TRUE. Please install it.",
           call. = FALSE)
    }
    x.mad = apply(Xy, 2, stats::mad)
    Xy.sub = Xy[,which(x.mad != 0)]
    Xy = Xy[mvoutlier::pcout(Xy.sub)$wfinal01 == 1,]
    n = dim(Xy)[1]
    if (k >= n) {
      warning("Screening deleted too many observations.")
      return()
    }
  }
  wts = model$weights
  if (is.element("glm",class(model))) {
    wts = model$prior.weights
    Xy[,yname] = model$y
  }
  if (is.null(wts)) {
    wts = rep(1,n)
  }
  
  return(list(yname = yname, fixed = fixed,
              wts = wts, X = Xy, k = k,
              n = n, exp.vars = exp.vars,
              data = data, family = stats::family(model)))
  
  # MIXED MODELS NOT IMPLEMENTED IN FIRST RELEASE
  #   if(class(model)=="lmerMod"){ # lme4 package
  #     X = stats::model.matrix(model)
  #     data = data.frame(stats::model.frame(model)) # or slot(model, "frame")
  #     if(isGLMM(model)){
  #       family = stats::family(model) #
  #       # pass to glm ??
  #       # i.e. wild bootstrap appropriate for glmm's too?
  #       # though should really call glmer for the full model
  #       # so it may be the case that class(model) won't be lmerMod
  #       # if the family argument was passed to it as lmer()
  #       # calls glmer() if there is a family argument present
  #     }
  #     yname = deparse(stats::formula(model)[[2]])
  #     X = data.frame(data[,yname],X)
  #     colnames(X)[1] = yname
  #     fixed.formula = paste(yname,"~",
  #                           paste(names(fixef(model))[-1],collapse="+"))
  #     model = lm(fixed.formula,data = X)
  #   } else if(class(model)=="lme"){ # nlme package
  #     data = model$data
  #     X = stats::model.matrix(model,data = data)
  #     # nlme package assumes gaussian errors
  #     yname = deparse(stats::formula(model)[[2]])
  #     X = data.frame(data[,yname],X)
  #     colnames(X)[1] = yname
  #     fixed.formula = paste(yname,"~",
  #                           paste(names(stats::coef(model))[-1],collapse="+"))
  #     model = lm(fixed.formula, data = X)
  #   }
}

#' Safe deparse
#'
#' Supports long formula construction
#'
#' @param expr expression to be safely deparsed
#'
#' @noRd
safeDeparse <- function(expr){
  ret <- paste(deparse(expr), collapse = "")
  #rm whitespace
  gsub("[[:space:]][[:space:]]+", " ", ret)
}

#' Print text for fence methods
#'
#' This function provides the text for the case when trace=TRUE
#' when using lmfence and glmfence functions.
#'
#' @param score realised value
#' @param UB upper bound
#' @param obj fitted model object
#' @keywords internal
txt.fn = function(score,UB,obj){
  cat("\n")
  cat(paste("hatQm:", round(score,2),"; Upper bound:", round(UB,2)),"\n")
  cat(paste("hatQm <= UB:",score<=UB,"\n"))
  cat(deparse(stats::formula(obj)))
  cat("\n")
}

#' Process results within af function
#'
#' This function is used by the af function to process
#' the results when iterating over different boundary values
#'
#' @param fence.mod set of fence models
#' @param fence.rank set of fence model ranks
#' @export
#' @keywords Internal
process.fn = function(fence.mod,fence.rank){
  del2 = function(x) x[-c(1:2)]
  splitAt <- function(x, pos) unname(split(x, cumsum(seq_along(x) %in% pos)))
  
  # best only (true fence)
  fence.mod.bo = fence.mod[fence.rank==1]
  #  temp.bo = sort(table(sapply(fence.mod.bo,deparse,
  #                              width.cutoff=500)), decreasing=TRUE)
  temp.bo = sort(table(unlist(lapply(lapply(fence.mod.bo,as.character),del2))),
                 decreasing=TRUE)
  pstarj.bo = as.numeric(temp.bo[1]/length(fence.mod.bo))
  pstarnamej.bo = names(temp.bo)[1]
  
  # all that pass the fence
  #temp.all = sort(table(sapply(fence.mod,deparse,
  #                             width.cutoff=500)),decreasing=TRUE)
  temp.all = sort(table(unlist(lapply(lapply(fence.mod,as.character),del2))),
                  decreasing=TRUE)
  #old version
  #pstarj.all = as.numeric(temp.all[1]/length(fence.mod))
  #pstarnamej.all = names(temp.all)[1]
  # new version
  unlist.fence.rank = unlist(fence.rank)
  fence.rank.split = splitAt(unlist.fence.rank,which(unlist.fence.rank==1))
  custom.p = 1/unlist(lapply(fence.rank.split,length))
  custom.names = unlist(lapply(lapply(fence.mod.bo,as.character),del2))
  agg = stats::aggregate(custom.p,by=list(custom.names),sum)
  agg = agg[order(agg$x,decreasing = TRUE),]
  pstarj.all = agg[1,2]/sum(unlist.fence.rank==1)
  pstarnamej.all = agg[1,1]
  
  return(c(pstarj.bo,pstarnamej.bo,pstarj.all,pstarnamej.all))
}

#' @importFrom doRNG "%dorng%"
NULL

globalVariables(".")
#  # hack for no visible binding for global variable '.'

Try the mplot package in your browser

Any scripts or data that you put into this service are public.

mplot documentation built on July 10, 2021, 5:07 p.m.