R/strikeFX.R

#' Visualize PITCHf/x strikezones
#' 
#' A suite of bivariate plots with "px" on the horizontal axis and "pz" on the vertical axis.
#'
#' @param data PITCHf/x data to be visualized.
#' @param geom plotting geometry. Current choices are: "point", "hex", "bin", "tile" and "raster"
#' @param contour logical. Should contour lines be included?
#' @param point.size Size of points (when geom="point")
#' @param point.alpha plotting transparency parameter (when geom="point").
#' @param color variable used to define coloring scheme.
#' @param fill variable used to define subplot scheme (when geom="subplot2d").
#' @param layer list of other ggplot2 (layered) modifications.
#' @param model Either a \link{gamObject} or a call to fit a model via \link{gam} or \link{bam}.
#' Note that the horizontal and vertical location of the pitch MUST be included as covariates named "px" and "pz", respectively.
#' Relevant factor variables must also be included as covariates in order to produce facetted or differenced plot(s).
#' If this option is used, the geometry must be either "hex", "tile" or "bin". If a non-valid geometry is used, the geometry will be forced to "tile".
#' @param model.save logical. Save the fitted \code{model}? If TRUE, the relevant model object is \link{assign}ed to the global environment
#' @param density1 List of length one. The name should correspond to a variable in \code{data}. The value should correspond to an (observed) value of that variable.
#' @param density2 Similar to \code{density1}. If \code{density1 != density2}, the relevant estimates are automatically differenced.
#' @param limitz limits for horizontal and vertical axes. 
#' @param adjust logical. Should vertical locations be adjusted according to batter height?
#' @param draw_zones logical. Should strikezones be included?
#' @param parent is the function being called from a higher-level function? (experimental)
#' @param ... extra options passed onto geom commands
#' @return Returns an object of the class used by package ggplot2 to represent plots.
#' @export
#' @import mgcv
#' @import ggplot2
#' @import hexbin
#' @importFrom MASS kde2d
#' @examples
#' data(pitches)
#' 
#' strikeFX(pitches)
#' \dontrun{
#' strikeFX(pitches, layer=facet_grid(.~stand))
#'  #silly example on how to modify default settings and add layers
#'  strikeFX(pitches, color="", layer=facet_grid(s~stand))+
#'  geom_point(aes(x=px, y=pz, shape=pitch_types))+ #you could add color here
#'  geom_text(aes(x=px+0.5, y=pz, label=b)) 
#'  
#'  p <- strikeFX(pitches, geom="tile", layer=facet_grid(.~stand))
#'  p+theme(aspect.ratio=1)
#' 
#' strikeFX(pitches, geom="hex", density1=list(des="Called Strike"), density2=list(des="Ball"), 
#'          draw_zones=FALSE, contour=TRUE, layer=facet_grid(.~stand))
#'          
#' noswing <- subset(pitches, des %in% c("Ball", "Called Strike"))
#' noswing$strike <- as.numeric(noswing$des %in% "Called Strike")
#' library(mgcv)
#' m1 <- bam(strike ~ s(px, pz, by=factor(stand)) + 
#'                factor(stand), data=noswing, family = binomial(link='logit'))
#' # geom will automatically be set to 'raster'
#' strikeFX(noswing, model=m1, layer=facet_grid(.~stand))
#' 
#' m2 <- bam(strike ~ s(px, pz, by=factor(stand)) + s(px, pz, by=factor(inning_side)) + 
#'            factor(stand) + factor(inning_side), data=noswing, family = binomial(link='logit'))
#' strikeFX(noswing, model=m2, density1=list(inning_side="top"), 
#'          density2=list(inning_side="bottom"), layer=facet_grid(.~stand))
#' }
#' 

strikeFX <- function(data, geom = "point", contour=FALSE, point.size=3, point.alpha=1/3, color = "pitch_types", fill = "des", layer = list(), model, model.save=TRUE, density1=list(), density2=list(), limitz=c(-2, 2, 0.5, 4.5), adjust=FALSE, draw_zones=TRUE, parent=FALSE, ...){ 
  px=pz_adj=..density..=top=bottom=right=left=x=y=z=NULL #ugly hack to comply with R CMD check
  if (any(!geom %in% c("point", "bin", "hex", "raster", "tile", "subplot2d"))) warning("Current functionality is designed to support the following geometries: 'point', 'bin', 'hex', 'tile', 'subplot2d'.")
  if ("pitch_type" %in% names(data)) { #Add descriptions as pitch_types
    data$pitch_type <- factor(data$pitch_type)
    types <- data.frame(pitch_type=c("SI", "FF", "IN", "SL", "CU", "CH", "FT", "FC", "PO", "KN", "FS", "FA", NA, "FO"),
                   pitch_types=c("Sinker", "Fastball (four-seam)", "Intentional Walk", "Slider", "Curveball", "Changeup", 
                                 "Fastball (two-seam)", "Fastball (cutter)", "Pitchout", "Knuckleball", "Fastball (split-finger)",
                                 "Fastball", "Unknown", "Forkball"))
    data <- plyr::join(data, types, by = "pitch_type", type="inner")
  } 
  if (!"b_height" %in% names(data)) {
    warning("pitchRx assumes the height of each batter is recorded as 'b_height'. Since there is no such column, we will assume each batter has a height of 6'2''")
    data$b_height <- "6-2"
  } else {
    no.height <- is.na(data$b_height)
    if (any(no.height)) {
      #warning("Some batter heights are missing (NA).")
      data$b_height[no.height] <- "6-2"
    }
  }
  
  if (!color %in% names(data)) {
    #warning(paste(color, "is the variable that defines coloring but it isn't in the dataset!"))
    color <- ""
  }
  locations <- c("px", "pz")
  FX <- data[complete.cases(data[,locations]),] #get rid of records missing the necessary parameters
  for (i in locations) FX[,i] <- as.numeric(FX[,i])
  if (parent) { #ugly workaround for shiny implementation
    layers <- NULL
    for (i in layer)
      layers <- list(layers, eval(i)) 
  } else {
    layers <- layer
  }
  facets <- getFacets(layer=layers)
  boundaries <- getStrikezones(FX, facets, strikeFX = TRUE) 
  if (adjust) {
    FX$pz_adj <- boundaries[[1]] #adjusted vertical locations
  } else {
    FX$pz_adj <- FX$pz # "adjusted" vert locations
  }
  FX <- plyr::join(FX, boundaries[[2]], by="stand", type="inner")
  for (i in locations) FX[,i] <- as.numeric(FX[,i])
  #Recycled plot formats
  labelz <- labs(x = "Horizontal Pitch Location", y = "Height from Ground")
  legendz <- theme(legend.position = c(0.25,0.05), legend.direction = "horizontal")
  xrange <- xlim(limitz[1:2])
  yrange <- ylim(limitz[3:4])
  if (draw_zones) {
    white_zone <- geom_rect(mapping=aes(ymax = top, ymin = bottom, xmax = right, xmin = left), alpha=0, fill="pink", colour="white")
    black_zone <- geom_rect(mapping=aes(ymax = top, ymin = bottom, xmax = right, xmin = left), alpha=0, fill="pink", colour="grey20")
  } else {
    white_zone <- NULL
    black_zone <- NULL
  }
  #As of version 0.7, we assume the model is begin fit via gam/bam
  #Most of this is modeled after ?mgcv::vis.gam (view is 'px' and 'pz', non-facets are conditioned upon)
  #Does it make sense to look at differenced grids given a pre-fitted model????
  if (!missing(model)) {
    #figure out whether model argument is a fitted model or if it has yet to be fitted
    classy <- tryCatch(class(model), error = function(e) NULL)
    isfit <- !is.null(classy)
    if (!isfit) { #fit the model
      form <- as.list(substitute(model))
      fit <- fitModel(FX, form, model.save)
    } else {
      fit <- model
    }
    #create a 2D grid (eventually passed to the ?predict.gam method)
    x.grid <- seq(limitz[1], limitz[2], 0.05) #add option for grid granularity?
    y.grid <- seq(limitz[3], limitz[4], 0.05) 
    grid <- expand.grid(px=x.grid, pz=y.grid)
    #extract covariates included in gam/bam (see ?gamObject)
    vars <- names(fit$var.summary)
    #Summaries for factors have a single value. Other types have 3 values and we want to condition on the 2nd value (see var.summary of ?gamObject)
    var_summary <- lapply(fit$var.summary, function(x) { if (length(x) == 1) x else x[2] })
    factors <- vars[sapply(fit$var.summary, length) == 1]
    ind <- vars %in% c("px", "pz")
    extras <- vars[!ind] #we need to either condition on or facet by these variables
    if (!all(c("px", "pz") %in% vars)) warning("The horizontal location and vertical locations of each pitch should included as covariates in your model as 'px' and 'pz'.")

    #check if the facetting scheme makes sense for the model. if not, give warning and throw out facet call
    if (!is.null(facets)){ #facets exist
      if (length(factors) == 0){ #facets not possible!
        warning("Facet layer will be ignored since no factor variables were put into the model.")
        facets <- NULL
        id <- sapply(lapply(layer, class), function(x) which("facet" %in% x)) 
        layer[id > 0] <- NULL #get rid of the facet layer
      } else { #factors variables do exist
        ind <- facets %in% factors 
        idx <- which(!ind) #which facet variables are not included?
        if (length(idx) > 0) {
          warning(paste0("Facet layer will be ignored since the following variables included in the model as factors: ", facets[idx]))
          facets <- NULL
          id <- sapply(lapply(layer, class), function(x) which("facet" %in% x)) 
          layer[id > 0] <- NULL #get rid of the facet layer
        }
      }
    }
    #'fixed' variables can have multiple 'conditioned' values (px, pz are also here since they are part of the 'view')
    fixed <- c(facets, c("px", "pz"))
    if (length(density1) > 0) fixed <- c(fixed, names(density1))
    if (length(density2) > 0) fixed <- c(fixed, names(density2))
    #'given' variables have one 'conditioned' value (should be the mode for factors and closest obs. to the median for numerics)
    givens <- var_summary[!names(var_summary) %in% fixed]
    for (i in seq_along(givens)) {
      message(paste("Conditioning on:", names(givens[i]), " == ", givens[[i]]))
      grid[names(givens)[i]] <- givens[[i]]
    }
    #now make predictions for each unique value of the facetting variables
    if (!is.null(facets)) {
      sheet <- NULL
      vals <- unique(FX[facets])
      val.names <- names(FX[facets])
      for (i in seq_len(dim(vals)[1])) {
        greed <- suppressWarnings(cbind(grid, vals[i,]))
        names(greed) <- c(names(grid), val.names)
        if (identical(density1, density2)) { #probabilities are not differenced
          greed$z <- predict(fit, greed, type="response") #add an option to look at se!?! see ?predict.gam
        } else { #probabilities are differenced
          greed1 <- greed
          greed1[names(density1)] <- density1[[1]]
          z1 <- predict(fit, greed1, type="response")
          greed2 <- greed
          greed2[names(density2)] <- density2[[1]]
          z2 <- predict(fit, greed2, type="response")
          greed$z <- z1 - z2
        }
        sheet <- rbind(sheet, greed)
      }
    } else { #no facets
      if (identical(density1, density2)) { #probabilities are not differenced
        grid$z <- predict(fit, grid, type="response") #add an option to look at se!?! see ?predict.gam
      } else { #probabilities are differenced
        grid1 <- grid
        grid1[names(density1)] <- density1[[1]]
        z1 <- predict(fit, grid1, type="response")
        grid2 <- grid
        grid2[names(density2)] <- density2[[1]]
        z2 <- predict(fit, grid2, type="response")
        grid$z <- z1 - z2
      }
      grid$z <- predict(fit, grid, type="response")
      sheet <- grid
    }
    #for (i in factors) sheet[[i]] <- factor(sheet[[i]])
    p <- plotDensity(sheet, boundaries, contour, geom, ...)
    p <- p + white_zone
    if (any(sheet$z < 0)) col_scale <- scale_fill_gradient2(midpoint=0) else col_scale <- NULL
    return(p+labelz+xrange+yrange+layers+col_scale)
  }
  if (geom %in% "subplot2d") { #special handling for subplotting
    if (!requireNamespace('ggsubplot')) {
      message("The 'subplot2d' geom requires library(ggsubplot)!")
      return(NULL)
    }
    return(ggplot(data=FX)+labelz+xrange+yrange+
             ggsubplot::geom_subplot2d(aes(x=px, y=pz_adj, 
                    subplot = geom_bar(aes_string(x=fill, fill = fill))), ...)+
             black_zone+layers)
  }
  if (geom %in% c("bin", "hex", "tile", "raster")) { #special handling for (2D) density geometries
    if (identical(density1, density2)) { #densities are not differenced
      FX1 <- subsetFX(FX, density1)
      t <- ggplot(data=FX1, aes(x=px, y=pz_adj))+labelz+xrange+yrange
      if (geom %in% "bin") t <- t + geom_bin2d(...) 
      if (geom %in% "hex") t <- t + geom_hex(...)
      if (geom %in% c("tile", "raster")) 
        t <- t + stat_density2d(geom = geom, aes(fill = ..density..), contour = FALSE, ...)
      #Contours and strikezones are drawn last
      if (contour) t <- t + geom_density2d()
      t <- t + white_zone
      return(t+layers)
    } else { #densities are differenced (note that diffDensity handles the subsetting)
      if (!is.null(facets)) {
        stuff <- dlply(FX, facets, function(x) { diffDensity(x, density1, density2, limz=limitz) } )
        densities <- ldply(stuff)
      } else densities <- diffDensity(FX, density1, density2, limz=limitz)
    }
    t2 <- plotDensity(densities, boundaries, contour, geom, ...)
    t2 <- t2 + black_zone
    return(t2+scale_fill_gradient2(midpoint=0)+labelz+xrange+yrange+layers)
  }
  if (geom %in% "point") {
    p <- ggplot(data=FX, aes(ymax=top, ymin=bottom, xmax=right, xmin=left)) + legendz + labelz + xrange + yrange + scale_size(guide = "none") + scale_alpha(guide="none")
    p <- p + black_zone
    if (color == "") {
      point_mapping <- aes_string(x = "px", y="pz_adj")
    } else {
      point_mapping <- aes_string(x = "px", y="pz_adj", colour = color)
    }
    if (contour) p <- p + geom_density2d(point_mapping, colour="black")
    p <- p + geom_point(mapping=point_mapping, size=point.size, alpha=point.alpha, ...)
    return(p + layers)
  }
}

#fit a model and return the predicted values for a (possibly differenced) 2D grid
fitModel <- function(dat, expr, model.save=TRUE) {
  fun <- as.character(expr[[1]])
  expr[[1]] <- NULL
  expr$data <- dat #overwrite data argument
  fit <- do.call(fun, expr)
  if (model.save){
    suffix <- gsub(":| ", "-", format(Sys.time(), "%m %d %X")) #month-day-hour-minute-second
    filename <- paste0("model-", suffix, ".rds")
    saveRDS(fit, file=filename)
    message(paste0("Saved model fit as: ", filename))
    Sys.sleep(1) #ensure we get unique filenames
  }
  fit
  #meanFun(fit, grid)
}

# #use fitted model to find mean response over a 2D grid
# meanFun <- function(fit, grid, se=FALSE) {
#   pred.grid <- predict(fit, grid, type="response", se)
#   data.frame(x=grid[,1], y=grid[,2], z=pred.grid)
# }

#Plot a 2D density with pre-computed heights
plotDensity <- function(dens, bounds, contour, geom, ...){
  px=pz=pz_adj=..density..=top=bottom=right=left=x=y=z=NULL #ugly hack to comply with R CMD check
  common <- intersect(names(dens), names(bounds[[2]]))
  if (length(common) == 0) { #Artifically create a variable (stand) if none exists (required for joining and thus drawing strikezones)
    nhalf <- dim(dens)[1]/2
    if (nhalf == floor(nhalf)) {
      dens$stand <- c(rep("R", nhalf), rep("L", nhalf))
    } else {
      dens$stand <- c(rep("R", nhalf), rep("L", nhalf), "L")
    }
  }
  dens.df <- suppressMessages(plyr::join(dens, bounds[[2]], type="inner")) #defaults to join "by" all common variables
  # One should use geom_raster when tiles are same size
  p <- ggplot(data=dens.df, aes(x=px, y=pz, fill=z)) + geom_raster(...)
  # Would anyone really want to plot a *function* of the probabilities???
#   if (geom %in% "hex") {
#     p <- p + stat_summary_hex(aes(x=px, y=pz, z=z), ...)
#   } else {
#     p <- p + stat_summary2d(aes(x=px, y=pz, z=z), ...) 
#   }
  if (contour) p <- p + stat_contour(aes(x=px, y=pz, z=z)) #passing binwidth here throws error
  return(p)
}


# Special subset handling for density estimates
# 
# @param data PITCHf/x data
# @param density either density1 or density2 passed on from \link{strikeFX}

subsetFX <- function(data, density) {
  if (length(density) == 1) {
    index <- data[,names(density)] %in% density
    subset(data, index)
  } else {
    if (length(density) > 1) warning("The length of each density parameter should be 0 or 1.")
    data
  }
}

# Differenced 2D Kernel Density Estimates
#
# Computes differenced 2D Kernel Density Estimates using MASS::kde2d
#
# Computes two densities on the same support and subtracts them according to the \code{density} expression.
# 
# 
# @param data PITCHf/x data
# @param density1 either density1 or density2 passed on from \link{strikeFX}.
# @param density2 either density1 or density2 passed on from \link{strikeFX}.
# @param limz MASS::kde2d paramaters passed from \link{strikeFX}.
# @return Returns a data frame with differenced density estimates as column z.

diffDensity <- function(data, density1, density2, limz){
  data1 <- subsetFX(data, density1)
  data2 <- subsetFX(data, density2)
  est1 <- kde2d(data1[,"px"], data1[,"pz_adj"], n=100, lims=limz)
  grid <- expand.grid(px = est1$x, pz = est1$y)
  est2 <- kde2d(data2[,"px"], data2[,"pz_adj"], n=100, lims=limz)
  grid["z"] <- as.vector(est1$z) - as.vector(est2$z)
  return(grid)
}

Try the pitchRx package in your browser

Any scripts or data that you put into this service are public.

pitchRx documentation built on May 2, 2019, 5:56 a.m.