Nothing
#'Create Cutoffs Dataframe
#'
#'\code{createCutoffsDF} is an internal function, which creates a dataframe with
#'identical cutoff values for all ZIP codes (if \code{type = "unadj"}), or
#'quantile cutoffs in a ZIP code (if \code{type = "perc"} or \code{type =
#'"perc.resolve.ties"}). This function is called extensively by the
#'\code{findCutoffs} function.
#'
#'\code{createCutoffsDF} takes in a matrix of restaurants' scores and a vector
#'corresponding to restaurants' ZIP codes, and outputs a data frame of cutoff
#'scores to be used in grade classification. The returned ZIP code cutoff data
#'frame has one row for each unique ZIP code and has \code{(length(gamma)+1)}
#'columns, corresponding to one column for the ZIP code name, and
#'\code{(length(gamma))} cutoff scores separating the \code{(length(gamma)+1)}
#'grading categories. Across each ZIP code's row, cutoff scores increase and we
#'assume, as in the King County (WA) case, that greater risk is associated with
#'larger inspection scores. (If scores are decreasing in risk, users should
#'transform inspection scores before utilizing functions in the
#'\code{QuantileGradeR} package with a simple function such as \code{f(score) =
#'- score}.)
#'
#'The way in which cutoff scores are calculated for each ZIP code depends on the
#'value of the \code{type} variable. The \code{type} variable can take one of
#'three values (see later).
#'
#'@section Modes: \code{type = "unadj"} creates a ZIP code cutoff data frame
#' with the same cutoff scores (meaningful values in a jurisdiction's
#' inspection system that are contained in the vector \code{gamma}) for all ZIP
#' codes. This ZIP code data frame can then be used to carry out "unadjusted"
#' grading, in which a restaurant's most recent routine inspection score is
#' compared to these cutoffs.
#'@section Modes: \code{type = "perc"} takes in a vector of quantiles,
#' \code{gamma}, and returns a data frame of the scores in each ZIP code
#' corresponding to these quantiles (using the "Nearest Rank" definition of
#' quantile).
#'@section Modes: \code{type = "perc.resolve.ties"} takes in a vector of
#' quantiles, \code{gamma}, and instead of returning (for B/C cutoffs, for
#' example) the scores in each ZIP code that result in \emph{at least}
#' (\code{gamma[2]} x 100)\% of restaurants in the ZIP code scoring less than
#' or equal to these cutoffs, \code{type = "perc.resolve.ties"} takes into
#' account the fact that ties exist in ZIP codes. Returned scores for A/B
#' cutoffs are those that result in the \emph{closest} percentage of
#' restaurants in the ZIP code scoring less than or equal to the A/B cutoff to
#' the desired percentage, (\code{gamma[1]} x 100)\%. Similarly, B/C cutoffs
#' are the scores in the ZIP code that result in the \emph{closest} percentage
#' of restaurants in the ZIP code scoring less than or equal to the B/C cutoff
#' and more than the A/B cutoff to the desired percentage, (\code{(gamma[2] -
#' gamma[1])} x 100)\%.
#'
#'
#'
#'
#'@param X Numeric matrix of size \code{n} x \code{p}, where \code{n} is the
#' number is restaurants to be graded and \code{p} is the number of inspections
#' to be used in grade assignment. Entry \code{X[i,j]} represents the
#' inspection score for the \code{i}th restaurant in the \code{j}th most recent
#' inspection.
#'@param z Character vector of length \code{n} representing ZIP codes (or other
#' subunits within a jurisdiction). \code{z[i]} is the ZIP code corresponding
#' to the restaurant with inspection scores in row \code{i} of \code{X}.
#'@param gamma Numeric vector representing absolute grade cutoffs or quantiles,
#' depending on \code{type} variable value. Entries in gamma should be
#' increasing, with \code{gamma[1] <= gamma[2]} etc (this is related to the
#' "Warning" section and larger scores being associated with higher risk). If
#' \code{type = "perc"} or \code{type = "perc.resolve.ties"}, gamma values
#' represent quantiles and should take on values between 0 and 1.
#'@param type Character string that is one of \code{"unadj"},
#' \code{"perc"}, or \code{"perc.resolve.ties"}, and that indicates the grading
#' algorithm to be implemented.
#'@keywords internal
#'@export
#'
createCutoffsDF <- function(X, z, gamma, type) {
## Preliminary Checks
#Check X, z, gamma are all of the correct class types; if not, convert.
X <- matrix(as.numeric(X),nrow = NROW(X))
z <- as.character(z)
gamma <- as.numeric(gamma)
#Check that length of z and number of rows of X match. If not, throw an error.
if (NROW(X) != length(z))
stop("number of rows in X and length of z do not match!")
#Check that type takes on one of the desired values
if (!type %in% c("unadj","perc","perc.resolve.ties"))
stop("type incorrectly specified!")
#Check that gamma vector is sorted in increasing order
if(is.unsorted(gamma)){
stop("gamma vector should be sorted in increasing order!")
}
#Check that entries in gamma vector, if type == "perc" or "perc.resolve.ties", are values between 0 and 1
if ((type == "perc"|type =="perc.resolve.ties") &
FALSE %in% c(gamma >= 0, gamma <= 1)){
stop(
"gamma values represent quantiles and so should be values between 0 and 1!"
)
}
## Initialize zip.cutoffs.df data frame - the object to be returned with a
## row for each unique ZIP code and the number of columns equal to the
## number of grades (one column for ZIPs, the remainder of columns for the
## (no.grades-1) ZIP code cutoff points)
no.grades <- length(gamma) + 1
no.unique.zips <- length(unique(z))
zip.cutoffs.df <-
data.frame(matrix(NA, nrow = no.unique.zips, ncol = no.grades))
## Names for columns in zip.cutoffs.df
cutoff.names <- paste("Gamma", LETTERS[1:(no.grades - 1)], sep = ".")
cutoff.names <- append(cutoff.names, "ZIP", after = 0)
names(zip.cutoffs.df) <- cutoff.names
mean.scores <- rowMeans(X, na.rm = T)
## If type=="unadj", the grading algorithm to be applied uses the same grade
## cutoffs for all ZIP codes
if (type == "unadj") {
## First column is column of unique ZIP codes
zip.cutoffs.df[,"ZIP"] <- unique(z)
for (i in 1:length(gamma)) {
zip.cutoffs.df[,c(i + 1)] <- rep(gamma[i], no.unique.zips)
}
return(zip.cutoffs.df)
}
## If type=="perc", the grading algorithm uses quantile grade cutoffs
## for each ZIP code and gamma is a vector of quantiles
if (type == "perc") {
mean.scores.z <- data.frame(mean.scores, z)
zip.cutoffs.df <-
stats::aggregate(
data = mean.scores.z, mean.scores ~ z, FUN = function(mean.scores.vec)
stats::quantile(
mean.scores.vec, probs = gamma, type = 1, na.rm = T
)
)
zip.cutoffs.df <- do.call(data.frame, zip.cutoffs.df)
names(zip.cutoffs.df) <- cutoff.names
return(zip.cutoffs.df)
}
## If type == "perc.resolve.ties"
if (type == "perc.resolve.ties") {
mean.scores.z <- data.frame(mean.scores, z)
# Distribution of scores in each ZIP code
distribution.z <- as.data.frame(table(mean.scores.z))
# Convert ZIP code column in distribution.z to character type
distribution.z$z <- as.character(distribution.z$z)
# Remove scores which no restaurants in ZIP code obtain
distribution.z <- distribution.z[-which(distribution.z$Freq == 0),]
# Number of restaurants in each ZIP code
no.rests.z <- as.data.frame(table(mean.scores.z$z))
names(no.rests.z) <- c("z", "no.rests")
distribution.z <- merge(distribution.z, no.rests.z, by = "z")
# Proportion of restaurants with a given score
distribution.z$prop.score <- distribution.z$Freq/distribution.z$no.rests
# Update cutoffs one at a time - finding A/B cutoff first and then finding other cutoffs
for(i in 1:length(gamma)){
# A/B Cutoff
if(i == 1){
# Desired percentage of restaurants with A grades:
desired.perc <- gamma[i]
# Initialize zip.cutoffs.df frame
zip.cutoffs.df <- as.data.frame(unique(distribution.z$z))
names(zip.cutoffs.df) <- "ZIP"
} else{ # B/C cutoffs and beyond
# Desired percentage of restaurants with "B" grade (in case of i == 2 and B/C cutoff calculation) is difference between current gamma value,
# which is a quantile, and gamma[i-1]
desired.perc <- gamma[i] - gamma[i - 1]
}
# Current cutoff name
current.cutoff <- cutoff.names[i + 1]
# Cumulative sum of proportions
cumulative.sums <- stats::aggregate(data = distribution.z, prop.score ~ z, FUN = cumsum)
distribution.z$cum.sum <- as.numeric(unlist(cumulative.sums$prop.score))
# Absolute difference between cumulative sum of proportions and desired percentage
distribution.z$diff <- abs(distribution.z$cum.sum - desired.perc)
# Find minimum absolute difference between cumulative sum of proportions
# and desired percentage for each restaurant
min.diffs <- stats::aggregate(data = distribution.z, diff ~ z, FUN = min)
min.diffs$row.number <- row.names(min.diffs)
# Cutoff.locs contains the row numbers in distribution.z which correspond to the
# minimum differences for each ZIP code
cutoff.locs <- as.data.frame(which(outer(distribution.z$z,
min.diffs$z, "==") & outer(distribution.z$diff,
min.diffs$diff, "=="), arr.ind = TRUE))
cutoff.locs <- merge(cutoff.locs, min.diffs[,-which(colnames(min.diffs) == "diff")],
by.x = "col", by.y = "row.number", all.x = TRUE)
# Are there multiple scores for a ZIP code minimizing difference?
# If so, choose larger score for cutoff (so that more restaurants receive higher grade in ZIP code)
cutoff.locs <- stats::aggregate(data = cutoff.locs, row~z, FUN = max)
cutoff.locs$ZIP <- cutoff.locs$z
# Identify cutoff values and append to zip.cutoffs.df
cutoffs <- distribution.z[cutoff.locs$row, c("z", "mean.scores")]
zip.cutoffs.df <- merge(zip.cutoffs.df, cutoffs, by.x = "ZIP", by.y = "z", all.x = TRUE)
names(zip.cutoffs.df)[ncol(zip.cutoffs.df)] <- current.cutoff
# Identify the starting row numbers for each ZIP code in distribution.z
start.zips <- data.frame(ZIP = unique(distribution.z$z), start.loc = match(unique(distribution.z$z), distribution.z$z))
# Prepare distribution.z for next for-loop iteration - delete those scores in each ZIP code
# which now have grades associated with them
rows.for.deletion <- merge(start.zips,cutoff.locs[,c("row", "ZIP")], by = "ZIP", all = TRUE)
to.delete <- unlist(lapply(1:nrow(rows.for.deletion), FUN = function(x) rows.for.deletion$start.loc[x]:rows.for.deletion$row[x]))
distribution.z <- distribution.z[-to.delete,]
}
# zip.cutoffs.df to required format
zip.cutoffs.df$ZIP <- as.character(zip.cutoffs.df$ZIP)
zip.cutoffs.df[,2:ncol(zip.cutoffs.df)] <- sapply(zip.cutoffs.df[,2:ncol(zip.cutoffs.df)], FUN = function(x)as.numeric(as.character(x)))
# Replace any NA values with the value of the last non-NA cutoff in row
na.locations <- which(is.na(zip.cutoffs.df), arr.ind = TRUE)
if(length(na.locations) > 0){
na.locations <- as.data.frame(na.locations)
last.non.na <- stats::aggregate(data = na.locations, col ~ row, FUN = min)
last.non.na$col <- last.non.na$col - 1
names(last.non.na) <- c("row", "last.nonna")
na.locations <- merge(na.locations, last.non.na, by = "row")
for(i in 1:nrow(na.locations)){
zip.cutoffs.df[na.locations[i,"row"], na.locations[i,"col"]] <- zip.cutoffs.df[na.locations[i,"row"], na.locations[i,"last.nonna"]]
}
}
# Convert ZIP column to factor type to be consistent with other cases
zip.cutoffs.df$ZIP <- as.factor(zip.cutoffs.df$ZIP)
return(zip.cutoffs.df)
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.