R/pai.R

Defines functions pai_summary pai

Documented in pai pai_summary

#' Predictive Accuracy Index
#'
#' Given a set of predictions and observed counts, returns the PAI (predictive accuracy index),
#' PEI (predictive efficiency index), and the RRI (recovery rate index)
#'
#' @param dat data frame with the predictions, observed counts, and area sizes (can be a vector of ones)
#' @param count character specifying the column name for the observed counts (e.g. the out of sample crime counts)
#' @param pred character specifying the column name for the predicted counts (e.g. predictions based on a model)
#' @param area character specifying the column name for the area sizes (could also be street segment distances, see Drawve & Wooditch, 2019)
#' @param other vector of strings for any other column name you want to keep (e.g. an ID variable), defaults to empty `c()`
#'
#' @details
#' Given predictions over an entire sample, this returns a dataframe with the sorted best PAI (sorted by density of predicted counts per area). 
#' PAI is defined as:
#' 
#' \deqn{PAI = \frac{c_t/C}{a_t/A}}{PAI = (ct/C)/(at/A)}
#' 
#' Where the numerator is the percent of crimes in cumulative t areas, and the denominator is the percent of the area encompassed.
#' PEI is the observed PAI divided by the best possible PAI if you were a perfect oracle, so is scaled between 0 and 1. 
#' RRI is `predicted/observed`, so if you have very bad predictions can return Inf or undefined!
#' See Wheeler & Steenbeek (2019) for the definitions of the different metrics. 
#' User note, PEI may behave funny with different sized areas.
#'
#' @returns
#' A dataframe with the columns:
#'  - `Order`, The order of the resulting rankings
#'  - `Count`, the counts for the original crimes you specified
#'  - `Pred`, the original predictions 
#'  - `Area`, the area for the units of analysis
#'  - `Cum*`, the cumulative totals for Count/Pred/Area
#'  - `PCum*`, the proportion cumulative totals, e.g. `CumCount/sum(Count)`
#'  - `PAI`, the PAI stat
#'  - `PEI`, the PEI stat
#'  - `RRI`, the RRI stat (probably should analyze/graph the `log(RRI)`)!
#'
#' Plus any additional variables specified by `other` at the end of the dataframe.
#' @export
#' @examples
#' 
#' # Making some very simple fake data
#' crime_dat <- data.frame(id=1:6,
#'                         obs=c(6,7,3,2,1,0),
#'                         pred=c(8,4,4,2,1,0))
#' crime_dat$const <- 1
#' p1 <- pai(crime_dat,'obs','pred','const')
#' print(p1)
#' 
#' # Combining multiple predictions, making
#' # A nice table
#' crime_dat$rand <- sample(crime_dat$obs,nrow(crime_dat),FALSE)
#' p2 <- pai(crime_dat,'obs','rand','const')
#' pai_summary(list(p1,p2),c(1,3,5),c('one','two'))
#' 
#' @references
#' Drawve, G., & Wooditch, A. (2019). A research note on the methodological and theoretical considerations for assessing crime forecasting accuracy with the predictive accuracy index. *Journal of Criminal Justice*, 64, 101625.
#' 
#' Wheeler, A. P., & Steenbeek, W. (2021). Mapping the risk terrain for crime using machine learning. *Journal of Quantitative Criminology*, 37(2), 445-480.
#' @seealso 
#' [pai_summary()] for a summary table of metrics for multiple pai tables given fixed N thresholds
pai <- function(dat,count,pred,area,other=c()){
    LimData <- dat[,c(count,pred,area,other)]
    p <- LimData[,pred]/LimData[,area] #sorts by the highest density
    cr <- LimData[,count]
    LimData <- LimData[order(p,-1*cr, decreasing = TRUE),] #sorting asc for cr gives min potential PAI
    LimData$Order <- 1:nrow(LimData)
    LimData$Count <- LimData[,count]
    LimData$CumCount <- cumsum(LimData[,count])
    tot_area <- sum(LimData[,area])
    tot_crime <- sum(LimData[,count])
    LimData$Area <- LimData[,area]
    LimData$CumArea <- cumsum(LimData$Area)
    LimData$PCumArea <- LimData$CumArea/tot_area
    LimData$PCumCount <- (LimData$CumCount/tot_crime)
    LimData$PAI <- LimData$PCumCount / LimData$PCumArea
    # Calculating PEI, note may not be apples to apples with diffe
    cr_dens <- LimData$Count/LimData$Area
    pei_dat <- LimData[order(cr_dens, decreasing = TRUE),]
    cum_perfect <- cumsum(pei_dat$Count)
    pcum_perfect <- cum_perfect/tot_crime
    pcum_area <- cumsum(pei_dat$Area)/tot_area
    perf_pai <- pcum_perfect/pcum_area
    LimData$PEI <- LimData$PAI/perf_pai
    # Calculating RRI
    LimData$Pred <- LimData[,pred]
    LimData$CumPred <- cumsum(LimData[,pred])
    LimData$RRI <- LimData$CumPred/LimData$CumCount #?Handle 0's?
    # Variables I want
    keep_vars <- c('Order','Count','CumCount','PCumCount','Pred','CumPred',
                   'Area','CumArea','PCumArea','PAI','PEI','RRI',other)
    return( LimData[,keep_vars] )
}

#' Summary Table for Multiple PAI Stats
#'
#' Takes a list of multiple PAI summary tables (for different predictions) and returns summaries at fixed area thresholds
#'
#' @param pai_list, list of data frames that have the PAI stats from the `pai` function
#' @param thresh, vector of area numbers to select, e.g. 10 would select the top 10 areas, c(10,100) would select the top 10 and the top 100 areas
#' @param labs vector of characters that specifies the labels for each PAI dataframe, should be the same length as `pai_list`
#' @param wide boolean, if TRUE (default), returns data frame in wide format. Else returns summaries in long format
#'
#' @details
#' Given predictions over an entire sample, this returns a dataframe with the sorted best PAI (sorted by density of predicted counts per area). 
#' PAI is defined as:
#' 
#' \deqn{PAI = \frac{c_t/C}{a_t/A}}{PAI = (ct/C)/(at/A)}
#' 
#' Where the numerator is the percent of crimes in cumulative t areas, and the denominator is the percent of the area encompassed.
#' PEI is the observed PAI divided by the best possible PAI if you were a perfect oracle, so is scaled between 0 and 1. 
#' RRI is `predicted/observed`, so if you have very bad predictions can return Inf or undefined!
#' See Wheeler & Steenbeek (2019) for the definitions of the different metrics. 
#' User note, PEI may behave funny with different sized areas.
#'
#' @returns
#' A dataframe with the PAI/PEI/RRI, and cumulative crime/predicted counts, for each original table
#'
#' @export
#' @examples
#' 
#' # Making some very simple fake data
#' crime_dat <- data.frame(id=1:6,
#'                         obs=c(6,7,3,2,1,0),
#'                         pred=c(8,4,4,2,1,0))
#' crime_dat$const <- 1
#' p1 <- pai(crime_dat,'obs','pred','const')
#' print(p1)
#' 
#' # Combining multiple predictions, making
#' # A nice table
#' crime_dat$rand <- sample(crime_dat$obs,nrow(crime_dat),FALSE)
#' p2 <- pai(crime_dat,'obs','rand','const')
#' pai_summary(list(p1,p2),c(1,3,5),c('one','two'))
#' 
#' @references
#' Drawve, G., & Wooditch, A. (2019). A research note on the methodological and theoretical considerations for assessing crime forecasting accuracy with the predictive accuracy index. *Journal of Criminal Justice*, 64, 101625.
#' 
#' Wheeler, A. P., & Steenbeek, W. (2021). Mapping the risk terrain for crime using machine learning. *Journal of Quantitative Criminology*, 37(2), 445-480.
#' @seealso 
#' [pai()] for a summary table of metrics for multiple pai tables given fixed N thresholds
pai_summary <- function(pai_list,thresh,labs,wide=TRUE){
    keep_vars <- c('lab','Order','CumCount','CumPred','PAI','PEI','RRI')
    names(pai_list) <- labs
    combo <- do.call(rbind,pai_list)
    combo$lab <- do.call(rbind,strsplit(row.names(combo),"[.]"))[,1]
    combo_sub <- combo[combo$Order %in% thresh,keep_vars]
    # If wide return in wide format
    if (wide){
        cs <- stats::reshape(combo_sub,idvar='Order',timevar = 'lab',direction='wide')
    }
    else { cs <- combo_sub } #Else return in long format
    row.names(cs) <- 1:nrow(cs)
    return(cs)
}

# ToDo, thesh could be PCumArea instead or Order

Try the ptools package in your browser

Any scripts or data that you put into this service are public.

ptools documentation built on Feb. 16, 2023, 10:40 p.m.