knitr::opts_chunk$set( collapse = TRUE, comment = "#>" )
knitr::opts_chunk$set(echo = TRUE) library(dplyr) library(magrittr) library(purrr) library(stringr) library(tidyr) library(readr) library(njoaguof)
Load the Use of Force raw data.
data("use_of_force_raw") uof_raw <- use_of_force_raw
We will also need county and municipality data from the US Census, gathered using the tidycensus
package:
data("census_counties") data("census_municipalities")
Some regular expressions we will use repeatedly
trailing_comma_regex <- "(?<=.),?$" sep_comma_no_space <- ",(?! )" sep_comma_space_no_paren <- r"( ?,(?![^(]*\)) )" sep_comma_optional_space <- r"( ?, ?)"
Before we start parsing, we will verify some things.
First, here are the columns we are expecting.
raw_names <- c( "FormID" ,"County", "AgencyName" ,"OfficerName", "UserID" ,"IncidentID", "ReportNumber" ,"IncidentCaseNumber", "IncidentDate" ,"OtherOfficerInvolved", "OfficerInUniform" ,"IncidentMunicipality", "IndoorOrOutdoor" ,"IncidentWeather", "VideoFootage" ,"VideoType", "IncidentLighting", "LocationType" ,"IncidentType", "ContactOrigin" ,"PlannedContact", "OfficerAge" ,"OfficerRaceEthnicity", "OfficerRank" ,"OfficerGender", "OfficerInjuryType" ,"OfficerInjuriesInjured", "OfficerMedicalTreatment" ,"OfficerHospitalTreatment", "TotalSubInjuredInIncident" ,"SubjectInjuredInIncident", "SubjectInjuredPriorToIncident" ,"PerceivedConditionOfSubject", "SubjectActions" ,"SubjectResistance", "SubjectMedicalTreatment" ,"SubjectInjuryType", "SubjectArrested" ,"ReasonSubjectNotArrested", "SubjectType" ,"SubjectAge", "SubjectRaceEthnicity" ,"SubjectGender", "ForceType", "IncidentYear", "KEEPDROP" ,"staticContent976" ) stopifnot(all.equal(names(uof_raw), raw_names))
We will use this function to check the levels of multi-valued fields.
check_list_levels <- function(tbl, column, separating_regex, levels, other_acceptable_values = character(0)) { message("Checking for unmatched values in ", rlang::as_label(rlang::enquo(column))) unmatched_values <- tbl %>% dplyr::pull({{column}}) %>% stringr::str_split(separating_regex) %>% unlist() %>% unique() %>% setdiff(levels) %>% setdiff(other_acceptable_values) if (length(unmatched_values) != 0) { message(paste(unmatched_values, collapse = " | ")) stop() } }
And here are all the levels.
subject_type_levels <- c("Person", "Animal", "Other", "Unknown Subject(s)") gender_levels <- c("Male", "Female", #"Gender Non-Conforming/X", "Non-Binary/X", "Other") incident_type_levels <- c( "Aggressive/Injured Animal", "Assault", "Assisting another officer", "Burglary", "Distribution of CDS", "Disturbance (drinking, fighting, disorderly)", "Domestic", "Eluding", "Medical Emergency", "MV Accident/Aid", "MV/Traffic Stop", "Pedestrian Stop", "Possession of CDS", "Potential Mental Health Incident", "Report of Gunfire", "Robbery", "Subject with a gun", "Subject with other weapon", "Suspicious person", "Terroristic Threats", "Theft/Shoplifting", "Trespassing", "Wanted Person", "Welfare Check", "Other" ) race_levels <- c( "American Indian", "Asian", "Asian/Pacific Islander", "Black", "Hispanic", "Native Hawaiian or other Pacific Islander", "Pacific Islander", "Two or more races", "White", "Other" ) weather_levels <- c("Clear", "Cloudy", "Fog", "Rain", "Snow/Sleet/Ice") video_type_levels <- c( "Body Worn", "CED Camera", "Cell Phone", "Commercial Building", "Motor Vehicle", "Residential/Home", "Station House", "Other", "Unknown" ) lighting_levels <- c("Artificial", "Darkness", "Dawn/Dusk", "Daylight") location_type_levels <- c( "Alcohol Establishment (bar, club, casino)", "Business", "Court House", "Hospital", "Jail/Prison", "Police Station", "Residence", "Restaurant", "School", "Street", "Other" ) contact_origin_levels <- c( "Citizen Initiated", "Dispatched", "Officer Dispatched", "Officer Initiated", "Pre-Planned Contact" ) planned_contact_levels <- c( "Arrest", "Judicial Order Service (TRO, FRO, etc.)", "No Knock Warrant", "Prisoner Transfer", "Processing", "Search Warrant Execution", "Other" ) officer_injury_type_levels <- c( "Abrasion/Laceration/Puncture", "Chest pains/shortness of breath", "Complaint of pain", "Concussion", "Contusion/bruise", "Fracture/dislocation", "Gunshot wound", "Other", "Unknown" ) officer_medical_treatment_levels <- c( "EMS on scene", "Hospital", "Officer Administered First Aid", "Refused", "Urgent Care" ) perceived_condition_levels <- c( "No unusual condition noted", "Other unusual condition noted", "Potential Mental Health Incident", "Under influence of alcohol/drugs/both" ) subject_action_levels <- c( "Attack with Blunt object", "Attack with Bodily fluids", "Attack with Edge Weapon", "Attack with Hands fists legs", "Attack with Motor Vehicle", "Attack with Other Weapon", "Attempt to commit crime", "Attempt to destroy evidence", "Attempt to escape from Custody", "Attempt to self-harm", "Biting", "Failure to Disperse", "Fired Gun", "Kick", "Other Attack", "Other Threat", "Prevent harm to another", "Push or shove", "Resisted arrest/police officer control", "Spitting", "Strike with open hand fist or elbow", "Threat to Kick", "Threat to Push or shove", "Threat to Strike with open hand fist or elbow", "Threat with Blunt object", "Threat with Bodily Fluids", "Threat with Edge Weapon", "Threat with Gun", "Threat with Hands fists legs", "Threat with Motor vehicle", "Threat with Other Weapon", "Verbal/Fighting stance Threat" ) subject_resistance_levels <- c( "Active Assailant", "Active Resistor", "Aggressive resistance (attempt to attack or harm)", "Attempt to flee", "Dead-weight tactics(going limp)", "Dead-weight tactics (going limp)", "Non-response (consciously ignoring)", "Passive Resistor", "Resistive tension (stiffening tighening muscles)", "Threatening Assailant", "Verbal", "Other" ) subject_medical_treatment_levels <- c( "EMS on scene", "Hospital", "Mental Health Facility", "Officer Administered First Aid", "Refused", "Urgent Care", "Unknown", "Not Provided" ) subject_injury_levels <- c( "Abrasion/Laceration/Puncture", "Chest pains/shortness of breath", "Complaint of pain", "Concussion", "Contusion/bruise", "Fracture/dislocation", "Gunshot wound", "No Injury", "Other", "Unknown", "Not Provided" ) force_type_levels <- c( "Canine bit (apprehension)", "Canine bit (spontaneous)", "CED Spark Display", "Chokehold, Carotid artery restraint", "Compliance hold with impact weapon- not a strike", "CS Gas", "Discharged Chemical at", "Discharged Firearm at", "High Volume OC Spray", "Intent to strike with a motor vehicle", "Kneeling on Chest, Back", "Pointing Firearm", "Struck", "Used arm bar on", "Used arms", "Used arms/hands", "Used CED on", "Used fists/punch", "Used head", "Used legs/kicks", "Used Less-lethal device on", "Used pressure points on", "Used take down on", "Other" ) reason_not_arrested_levels <- c( "Already in Custody", "Deceased", "Insufficient Probable Cause- includes continuing investigation", "Medical/Mental Health Incident", "No Probable Cause- Crime Unfounded", "No Probable Cause- Subject Not Involved", "Subject Fled", "Other" ) county_levels <- census_counties %>% pull(county)
Now we will check that the multi-value fields contain only these levels.
uof_raw_trimmed <- uof_raw %>% dplyr::mutate(across( where(is.character), ~ stringr::str_replace(., trailing_comma_regex, "") )) check_list_levels( uof_raw_trimmed, SubjectType, #sep_comma_no_space, sep_comma_space_no_paren, subject_type_levels, c("", "Not Provided") ) check_list_levels( uof_raw_trimmed, SubjectGender, #sep_comma_no_space, sep_comma_space_no_paren, gender_levels, c("", "Not Provided") ) check_list_levels( uof_raw_trimmed, SubjectRaceEthnicity, #sep_comma_no_space, sep_comma_space_no_paren, race_levels, c("Am. Indian", "Black or African American", "", "Not Provided") ) check_list_levels( uof_raw_trimmed, IncidentWeather, sep_comma_space_no_paren, weather_levels, c("N/A", "", "Not Provided") ) check_list_levels( uof_raw_trimmed, VideoType, sep_comma_space_no_paren, video_type_levels, c("[]", "") ) check_list_levels( uof_raw_trimmed, IncidentLighting, sep_comma_space_no_paren, lighting_levels ) check_list_levels( uof_raw_trimmed, LocationType, sep_comma_space_no_paren, location_type_levels, "" ) check_list_levels( uof_raw_trimmed, IncidentType, sep_comma_space_no_paren, incident_type_levels, c("", "Not Provided") ) check_list_levels( uof_raw_trimmed, ContactOrigin, sep_comma_space_no_paren, contact_origin_levels, c("", "Not Provided") ) check_list_levels( uof_raw_trimmed, PlannedContact, sep_comma_space_no_paren, planned_contact_levels, c("", "Other ") ) check_list_levels( uof_raw_trimmed, OfficerInjuryType, sep_comma_space_no_paren, officer_injury_type_levels, c("No Injury", "Not Provided", "Not injured") ) check_list_levels( uof_raw_trimmed, OfficerMedicalTreatment, sep_comma_space_no_paren, officer_medical_treatment_levels, c("Not Provided", "") ) check_list_levels( uof_raw_trimmed, PerceivedConditionOfSubject, #sep_comma_no_space, sep_comma_space_no_paren, perceived_condition_levels, c("Not Provided", "") ) # For subject_actions, internal commas are used # inconsistently. We'll normalize this field by # internal commas before splitting uof_raw_trimmed <- uof_raw_trimmed |> mutate(SubjectActions=str_replace_all(SubjectActions, "Hands, fists, legs", "Hands fists legs")) |> mutate(SubjectActions=str_replace_all(SubjectActions, "Hands,fists,legs", "Hands fists legs")) |> mutate(SubjectActions=str_replace_all(SubjectActions, "hand, fist, or elbow", "hand fist or elbow")) check_list_levels( uof_raw_trimmed, SubjectActions, sep_comma_optional_space, subject_action_levels, c("Not Provided", "") ) # For subject_resistance, internal commas are used # inconsistently. We'll normalize this field by # internal commas before splitting uof_raw_trimmed <- uof_raw_trimmed |> mutate(SubjectResistance=str_replace_all(SubjectResistance, "stiffening, tighening", "stiffening tighening")) check_list_levels( uof_raw_trimmed, SubjectResistance, sep_comma_optional_space, subject_resistance_levels, c("", "Not Provided") ) check_list_levels( uof_raw_trimmed, SubjectMedicalTreatment, sep_comma_optional_space, subject_medical_treatment_levels, "" ) check_list_levels( uof_raw_trimmed, SubjectInjuryType, sep_comma_optional_space, subject_injury_levels, "" ) # For ForceType, we have to use an ad-hoc separator expression, # because of internal commas in two levels. sep_comma_space_force_special <- r"(, (?!Back|Carotid))" check_list_levels( uof_raw_trimmed, ForceType, sep_comma_space_force_special, force_type_levels, c("Not Provided", "") ) # There are two values here that we want to normalize -- # The values "NoProbableCause-SubjectNotInvolved and # Insufficient Probable Cause-includes continuing investigation. We # will do that later, when we build the # incident_subject_reason_not_arrested table. check_list_levels( uof_raw_trimmed, ReasonSubjectNotArrested, sep_comma_optional_space, reason_not_arrested_levels, c("Not Provided", "", "NoProbableCause-SubjectNotInvolved", "Insufficient Probable Cause-includes continuing investigation") )
Finally, we check the county field. There are two non-county values which we
are expecting (and will map to NA
): "Other"
and "NJSP"
.
stopifnot(all( setdiff(uof_raw %>% dplyr::pull(County) %>% paste0(" County"), county_levels) %in% c("Other County", "NJSP County") ) )
subject
tablesubject <- uof_raw %>% select(FormID, SubjectArrested, SubjectType, SubjectAge, SubjectRaceEthnicity, SubjectGender, SubjectInjuredInIncident, SubjectInjuredPriorToIncident) %>% mutate(across( where(is.character), ~ str_replace(., trailing_comma_regex, "") )) %>% filter(if_any(-FormID, ~ . != "")) max_subjects <- subject$SubjectArrested %>% map_int(str_count, ",") %>% max() + 1 subject <- subject %>% separate(SubjectArrested, paste0("arrested__", 1:max_subjects), ",", fill="right") %>% separate(SubjectType, paste0("type__", 1:max_subjects), ",", fill="right") %>% separate(SubjectAge, paste0("age__", 1:max_subjects), ",", fill="right") %>% separate(SubjectRaceEthnicity, paste0("race__", 1:max_subjects), ",", fill="right") %>% separate(SubjectGender, paste0("gender__", 1:max_subjects), ",", fill="right") %>% separate(SubjectInjuredInIncident, paste0("injured__", 1:max_subjects), ",", fill="right") %>% separate(SubjectInjuredPriorToIncident, paste0("injured_prior__", 1:max_subjects), ",", fill="right") %>% pivot_longer(cols = -FormID, names_to="column", values_to="value", values_drop_na=TRUE) %>% separate(column, c("column", "index"), "__") %>% pivot_wider(names_from="column", values_from="value") %>% rename(form_id=FormID)
Now we will clean up this table, column by column.
index
and arrested
index
is the index into the subject list. It should be an integer. arrested
is a boolean.
subject <- subject %>% mutate(index=as.integer(index), arrested=as.logical(arrested) )
type
and gender
We will convert these to factors.
subject <- subject %>% mutate(type=factor(type, levels=subject_type_levels), gender=factor(gender, levels=gender_levels))
age
This field is not currently strictly numeric -- it also contains values "Unknown"
and "Juvenile"
. We will add a boolean column, juvenile
and convert age
to an integer. The value of "Unknown"
will map to NA
in both columns.
as_integer_or_na <- function(x) suppressWarnings(as.integer(x)) subject <- subject %>% mutate(juvenile=case_when( age=="Juvenile" ~ TRUE, !is.na(as_integer_or_na(age)) ~ FALSE )) %>% mutate(age=as_integer_or_na(age))
race
We need to normalize some names before converting to a factor.
subject <- subject %>% mutate( race = case_when( race == "Black or African American" ~ "Black", race == "Am. Indian" ~ "American Indian", TRUE ~ race ), race=factor(race, levels=race_levels) )
injured
We expect only "Yes", "No", "Unknown" and "". We will convert this to a boolean (or NA
)
subject <- subject %>% mutate(injured = str_trim(injured), injured_prior = str_trim(injured_prior)) stopifnot( 0 == subject %>% filter(!(injured %in% c("Yes", "No", "Unknown", "", NA))) %>% nrow() , 0 == subject %>% filter(!(injured_prior %in% c("Yes", "No", "Unknown", "", NA))) %>% nrow() ) subject <- subject %>% mutate( injured = case_when( injured == "Yes" ~ TRUE, injured == "No" ~ FALSE, TRUE ~ NA ), injured_prior = case_when( injured_prior == "Yes" ~ TRUE, injured_prior == "No" ~ FALSE, TRUE ~ NA ) )
Finally, we reorder the columns.
subject <- subject %>% select(form_id, index, arrested, type, age, juvenile, race, gender, injured, injured_prior)
subject
These tables are built from columns that contain value-lists with the following properties:
For each of these columns, we will create a new table with one row per value, using this function:
### table should have two columns: FormID and list_col make_set_membership_table <- function(table, levels, separating_regex = ",") { table <- table %>% mutate(list_col = str_replace(list_col, trailing_comma_regex, "")) %>% filter(list_col != "") max_values <- table$list_col %>% map_int(str_count, separating_regex) %>% max() + 1 table %>% separate(list_col, paste0("list_col__", 1:max_values), separating_regex, fill="right" ) %>% pivot_longer(cols=-FormID, names_to="column", values_to="value", values_drop_na = TRUE) %>% mutate(value=factor(str_trim(value), levels=levels)) %>% filter(!is.na(value)) %>% rename(form_id=FormID) %>% select(form_id, value) }
incident_weather
incident_weather <- uof_raw %>% select(FormID, list_col=IncidentWeather) %>% make_set_membership_table(weather_levels, sep_comma_space_no_paren) %>% rename(weather=value)
incident_video_type
incident_video_type <- uof_raw %>% select(FormID, list_col=VideoType) %>% make_set_membership_table(video_type_levels, sep_comma_space_no_paren) %>% rename(video_type=value)
incident_lighting
incident_lighting <- uof_raw %>% select(FormID, list_col=IncidentLighting) %>% make_set_membership_table(lighting_levels, sep_comma_space_no_paren) %>% rename(lighting=value)
incident_location_type
incident_location_type <- uof_raw %>% select(FormID, list_col=LocationType) %>% make_set_membership_table(location_type_levels, sep_comma_space_no_paren) %>% rename(location_type=value)
incident_type
incident_type <- uof_raw %>% select(FormID, list_col=IncidentType) %>% make_set_membership_table(incident_type_levels, sep_comma_space_no_paren) %>% rename(type=value)
incident_contact_origin
incident_contact_origin <- uof_raw %>% select(FormID, list_col=ContactOrigin) %>% make_set_membership_table(contact_origin_levels, sep_comma_space_no_paren) %>% rename(contact_origin=value)
incident_planned_contact
incident_planned_contact <- uof_raw %>% select(FormID, list_col=PlannedContact) %>% make_set_membership_table(planned_contact_levels, sep_comma_space_no_paren) %>% rename(planned_contact=value)
incident_officer_injury_type
This is pulled from the field OfficerInjuryType
, which is of mixed type. The field may be equal to "No Injury"
"Not Injured"
, or to "Not Provided"
. Otherwise, it is value-list column, using the levels below. When it is a value-list column, we will use it to populate the incident_officer_injury_type
table. The other values will be reflected in the incident
table.
incident_officer_injury_type <- uof_raw %>% select(FormID, list_col=OfficerInjuryType) %>% make_set_membership_table(officer_injury_type_levels, sep_comma_space_no_paren) %>% rename(officer_injury_type=value)
incident_officer_medical_treatment
incident_officer_medical_treatment <- uof_raw %>% select(FormID, list_col=OfficerMedicalTreatment) %>% make_set_membership_table(officer_medical_treatment_levels, sep_comma_space_no_paren) %>% rename(officer_medical_treatment=value)
These tables are built from columns that contain value-lists that are "messy" in the following sense. Each column contains value-lists with repeated values -- because the values apply to individual subjects -- but there is no reliable way to assign individual list elements to individual subjects. For example, FormID
20221
has two subjects, but field SubjectActions
has three values:
uof_raw %>% filter(FormID==20221) %>% mutate(SubjectActions=str_replace(SubjectActions, ",?$", "")) %>% select(FormID, SubjectType, SubjectActions) %>% separate(SubjectActions, into=paste0("SubjectActions_", 1:3), ",") %>% glimpse()
The value "Resisted arrest/police officer control"
appears twice because it applies to both subjects. But "Prevent harm to another"
appears only once, and it not clear if it applies to the first subject or to the second one.
We will create a separate table for each of these columns, assigning all of the values -- including the repeated values -- to the incident. There will also be an index column, so that the list position can be reconstructed if necessary.
make_messy_relationship_table <- function(table, levels, separating_regex) { table <- table %>% mutate(list_col = str_replace(list_col, trailing_comma_regex, "")) %>% filter(list_col != "") max_values <- table$list_col %>% map_int(str_count, separating_regex) %>% max() + 1 table %>% separate(list_col, paste0("value__", 1:max_values), separating_regex, fill="right" ) %>% pivot_longer(cols = -FormID, names_to="column", values_to="value", values_drop_na=TRUE) %>% separate(column, c("column", "index"), "__") %>% mutate(index=as.integer(index)) %>% pivot_wider(names_from="column", values_from="value") %>% mutate(value=factor(str_trim(value), levels=levels)) %>% rename(form_id=FormID) %>% filter(!is.na(value)) }
incident_subject_perceived_condition
incident_subject_perceived_condition <- uof_raw %>% select(FormID, list_col=PerceivedConditionOfSubject) %>% make_messy_relationship_table(perceived_condition_levels, sep_comma_space_no_paren) %>% rename(perceived_condition=value)
incident_subject_action
incident_subject_action <- uof_raw_trimmed %>% select(FormID, list_col=SubjectActions) %>% make_messy_relationship_table(subject_action_levels, sep_comma_optional_space) %>% rename(subject_action=value)
incident_subject_resistance
incident_subject_resistance <- uof_raw_trimmed %>% select(FormID, list_col=SubjectResistance) %>% make_messy_relationship_table(subject_resistance_levels, sep_comma_optional_space) %>% rename(subject_resistance=value)
incident_subject_medical_treatment
incident_subject_medical_treatment <- uof_raw %>% select(FormID, list_col=SubjectMedicalTreatment) %>% make_messy_relationship_table(subject_medical_treatment_levels, sep_comma_optional_space) %>% rename(subject_medical_treatment=value)
incident_subject_injury
incident_subject_injury <- uof_raw %>% select(FormID, list_col=SubjectInjuryType) %>% make_messy_relationship_table(subject_injury_levels, sep_comma_optional_space) %>% rename(subject_injury=value)
incident_subject_force_type
incident_subject_force_type <- uof_raw %>% select(FormID, list_col=ForceType) %>% make_messy_relationship_table(force_type_levels, sep_comma_space_no_paren) %>% rename(force_type=value)
incident_subject_reason_not_arrested
Two values in this field needs to be normalized: We will replace
"NoProbableCause-SubjectNotInvolved"
with "No Probable Cause- Subject Not Involved"
and
incident_subject_reason_not_arrested <- uof_raw %>% mutate(ReasonSubjectNotArrested = str_replace_all( ReasonSubjectNotArrested, "NoProbableCause-SubjectNotInvolved", "No Probable Cause- Subject Not Involved" )) %>% mutate(ReasonSubjectNotArrested = str_replace_all( ReasonSubjectNotArrested, "Insufficient Probable Cause-includes continuing investigation", "Insufficient Probable Cause- includes continuing investigation" )) %>% select(FormID, list_col="ReasonSubjectNotArrested") %>% make_messy_relationship_table(reason_not_arrested_levels, sep_comma_optional_space) %>% rename(reason_not_arrested=value)
incident
tableFirst, we select the columns we will need from the raw data.
incident <- uof_raw %>% select( form_id = FormID, agency_county = County, agency_name = AgencyName, officer_name = OfficerName, officer_name_id = UserID, report_number = ReportNumber, incident_case_number = IncidentCaseNumber, incident_date_1 = IncidentDate, incident_municipality = IncidentMunicipality, other_officer_involved = OtherOfficerInvolved, officer_in_uniform = OfficerInUniform, indoor_or_outdoor = IndoorOrOutdoor, video_footage = VideoFootage, officer_age = OfficerAge, officer_race = OfficerRaceEthnicity, officer_rank = OfficerRank, officer_gender = OfficerGender, officer_injured = OfficerInjuriesInjured, subject_injured_count = TotalSubInjuredInIncident )
Before we clean up these fields, let us review the fields we are omitting.
IncidentID
This is a synthetic field composed of other fields, so we do not need it.
stopifnot( 0 == uof_raw %>% filter( IncidentID != str_to_upper(glue::glue("{County}-{AgencyName}-{IncidentCaseNumber}")) ) %>% filter( IncidentID != str_to_upper(glue::glue("{County}-{AgencyName}- {IncidentCaseNumber}")) ) %>% filter( IncidentID != str_to_upper(glue::glue("{County}-{AgencyName}-\t{IncidentCaseNumber}")) ) %>% filter( IncidentID != str_to_upper(glue::glue("{County}-{AgencyName}- {IncidentCaseNumber}")) ) %>% nrow() )
incident_weather
, video_type
, incident_lighting
, location_type
, incident_type
, contact_origin
, planned_contact
, OfficerMedicalTreatment
, PerceivedConditionOfSubject
,SubjectActions
,SubjectResistance
, SubjectMedicalTreatment
, SubjectInjuryType
, ForceType
, and ReasonSubjectNotArrested
These fields are handled in other tables.
OfficerInjuryType
We did not capture the "Not injured"
or empty-string value in the incident_officer_injury_type
table, but that value is largely redundant with OfficerInjuriesInjured
which is "True"
precisely when OfficerInjuryType
is "Not Injured"
or the empty string, except in 8 rows, which are evidently miscoded.
stopifnot( 8 == uof_raw %>% select(OfficerInjuryType, OfficerInjuriesInjured) %>% mutate(NoInjury = str_detect(OfficerInjuryType, "Not injured") | OfficerInjuryType == "") %>% filter(NoInjury == (OfficerInjuriesInjured == "True")) %>% nrow() )
SubjectArrested
, SubjectType
, SubjectAge
, SubjectRaceEthnicity
, SubjectsGender
, SubjectInjuredInIncident
, SubjectInjuredPriorToIncident
These fields are reflected in the subject
table.
IncidentYear
IncidentYear
is redundant with IncidentDate
.
stopifnot( all( uof_raw$IncidentYear == lubridate::year(uof_raw$IncidentDate) ) )
incident
fieldsagency_county
Switch to using the census county names. In particular, the non-counties "NJSP"
and "Other"
will be mapped to NA
.
incident <- incident %>% mutate(agency_county=paste0(agency_county, " County")) %>% mutate(agency_county = factor(agency_county, county_levels))
agency_name
There are over 400 distinct values in this field, at least 19 of which appear exactly once, so we will not use a factor here.
In order to reconcile the agency names in this dataset with the agency names in the NJ OAG Law Enforcement Officer Diversity dataset, we will make some modifications.
incident <- incident %>% mutate( agency_name = case_when( agency_name == "Burlington County Sheriffs Office" ~ "Burlington Co Sheriffs Office", agency_name == "Mercer Co Prosecutor Off" ~ "Mercer Co Prosecutors Office", agency_name == "Hudson Co ProsecutorOff-S/Force" ~ "Hudson Co Prosecutors Office", agency_name == "Somerset Co Sheriffs Dept" ~ "Somerset Co Sheriffs Office", agency_name == "Middle Twsp PD" ~ "Middle Twp PD", agency_name == "State Police" ~ "New Jersey State Police", agency_name == "Rutgers Univ Police" ~ "Rutgers University PD", agency_name == "Division Of Fish And Wildlife" ~ "NJ Division Of Fish And Wildlife", agency_name == "NJ State Human Services Police" ~ "NJ Department Of Human Services", agency_name == "Division of Criminal Justice" ~ "NJ Division of Criminal Justice", agency_name == "Park Police" ~ "New Jersey State Park Police", agency_name == "Toms River Township" ~ "Toms River Twp PD", agency_name == "Hardyston Twp Police Dept" ~ "Hardyston Twp PD", agency_name == "Linden Police Department" ~ "Linden PD", TRUE ~ agency_name ) )
officer_name
and officer_name_id
The officer_name
associated to a single officer_name_id
is not always consistent in spelling and capitalization. For example:
incident %>% group_by(officer_name_id) %>% summarise(distinct_names=n_distinct(officer_name)) %>% filter(distinct_names>1) %>% left_join(select(incident, officer_name, officer_name_id), by="officer_name_id") %>% unique()
Therefore, we will choose the most common variant for each name id and apply that to all rows. We will save variant names in a new table, officer_name_variants
.
incident <- incident %>% mutate(officer_name = na_if(officer_name, "")) standard_names <- incident %>% count(officer_name, officer_name_id) %>% group_by(officer_name_id) %>% slice(which.max(n)) %>% ungroup() %>% select(officer_name_id, officer_name) officer_name_variants <- incident %>% select(officer_name_id, officer_name) %>% unique() incident <- incident %>% select(-officer_name) %>% left_join(standard_names, by="officer_name_id") rm(standard_names)
report_number
, incident_case_number
, and incident_date_1
We leave these fields as they are.
incident_municipality
This field is of
the form <municipality>, <county-name>
. We will break this field up into incident_municipality
and incident_municipality_county
.
incident <- incident %>% separate( incident_municipality, c("incident_municipality", "incident_municipality_county"), sep = ",", fill = "right" ) %>% mutate(incident_municipality_county = str_trim(incident_municipality_county))
Next, we will build a lookup table, matching the unique municipality/county values in the UOF dataset to the census municipality names. This is done in three steps.
unique_municipalities <- incident %>% select(incident_municipality_county, incident_municipality) %>% unique()
First step: Exact matches:
lookup_1 <- unique_municipalities %>% inner_join(census_municipalities, by=c("incident_municipality"="municipality_and_type", "incident_municipality_county"="county")) %>% mutate(census_municipality=incident_municipality) %>% select(incident_municipality, incident_municipality_county, census_municipality)
Second step: Matches with the base name (excluding the municipality "type", such as "borough" or "township"):
lookup_2 <- unique_municipalities %>% anti_join(lookup_1, by=c("incident_municipality", "incident_municipality_county")) %>% inner_join(census_municipalities, by=c("incident_municipality"="municipality", "incident_municipality_county"="county")) %>% mutate(census_municipality=municipality_and_type) %>% select(incident_municipality, incident_municipality_county, census_municipality)
Third step: The remaining municipalities require ad-hoc logic.
lookup_3 <- unique_municipalities %>% anti_join( lookup_1, by = c("incident_municipality", "incident_municipality_county") ) %>% anti_join( lookup_2, by = c("incident_municipality", "incident_municipality_county") ) %>% mutate( census_municipality = case_when( # Name changes: Dover, Ocean County --> Toms River # Washington, Mercer County --> Robbinsville # West Paterson, Essex County --> Woodland Park # South Belmar, Monmouth County --> Spring Lake incident_municipality_county == "Ocean County" & incident_municipality == "Dover" ~ "Toms River township", incident_municipality_county == "Mercer County" & incident_municipality == "Washington" ~ "Robbinsville township", incident_municipality_county == "Passaic County" & incident_municipality == "West Paterson" ~ "Woodland Park borough", incident_municipality_county == "Monmouth County" & incident_municipality == "South Belmar" ~ "Spring Lake borough", # "Princeton township" and "Princeton borough" merged to form just-plain # "Princeton" -- the only municipality in NJ without a type! incident_municipality_county == "Mercer County" & incident_municipality == "Princeton township" ~ "Princeton", # Cleanup miscellaneous idiosyncrasies. incident_municipality_county == "Essex County" & incident_municipality == "Village of South Orange" ~ "South Orange Village township", incident_municipality_county == "Essex County" & incident_municipality == "Caldwell Borough" ~ "Caldwell borough", incident_municipality_county == "Monmouth County" & incident_municipality == "Spring Lake Boro" ~ "Spring Lake borough", incident_municipality_county == "Somerset County" & incident_municipality == "Peapack & Gladstone" ~ "Peapack and Gladstone borough" ) )
Now build the full lookup table:
lookup <- rbind(lookup_1, lookup_2, lookup_3)
Check that we have no duplicates, and that the only municipality we are mapping
to NA
is "Other"
:
stopifnot(0 == lookup %>% count(incident_municipality, incident_municipality_county) %>% filter(n>1) %>% nrow() ) stopifnot("Other" == lookup %>% filter(is.na(census_municipality)) %>% pull(incident_municipality) )
Now, finally, we can use the lookup table to tidy the incident_municipality field
:
incident <- incident %>% left_join(lookup, by = c("incident_municipality", "incident_municipality_county")) %>% mutate(incident_municipality=census_municipality) %>% select(-census_municipality)
Let's also use the county levels for incident_municipality_county
:
incident <- incident %>% mutate(incident_municipality_county = factor(incident_municipality_county, county_levels))
other_officer_involved
and officer_in_uniform
These fields are booleans.
incident <- incident %>% mutate( other_officer_involved = as.logical(other_officer_involved), officer_in_uniform = as.logical(officer_in_uniform) )
indoor_or_outdoor
This field could indicate one or the other, or both, though the order is not consistent:
incident %>% count(indoor_or_outdoor)
We will break this into a pair of booleans.
incident <- incident %>% mutate( indoors = str_detect(indoor_or_outdoor, "Indoors"), outdoors = str_detect(indoor_or_outdoor, "Outdoors") ) %>% select(-indoor_or_outdoor)
video_footage
.We will correct the spelling of "Unknown"
before using a factor.
video_footage_levels <- c("Yes", "No", "Unknown") incident <- incident %>% mutate(video_footage=str_replace(video_footage, "Unknow", "Unknown")) %>% mutate(video_footage=factor(video_footage, levels=video_footage_levels))
officer_age
This column requires some cleanup. We also remove implausible ages.
incident <- incident |> mutate(officer_age = str_remove(officer_age, regex("\\s*years\\s*old", ignore_case=TRUE))) |> mutate(officer_age = str_remove_all(officer_age, "/|`")) |> mutate(officer_age = str_replace(officer_age, regex("twenty-nine", ignore_case=TRUE), "29")) |> mutate(officer_age = str_replace(officer_age, regex("twenty four", ignore_case=TRUE), "24")) # After this cleanup, we expect only three exceptions exceptions <- c("NA", "3.5", "NEWARK POLICE VEST") stopifnot( exceptions == incident |> filter(!str_detect(officer_age, "^\\d+$")) |> pull(officer_age) |> unique() ) # Convert to integer and remove implausible ages incident <- incident |> mutate(officer_age = parse_integer(officer_age, na=exceptions)) |> mutate(officer_age = ifelse(between(officer_age, 18, 67), officer_age, NA))
officer_race
and officer_gender
We perform the same transformations as we did on the subject
race
and gender
columns.
incident <- incident %>% mutate( officer_race = case_when( officer_race == "Black or African American" ~ "Black", officer_race == "Am. Indian" ~ "American Indian", TRUE ~ officer_race ), officer_race = factor(officer_race, levels = race_levels) ) incident <- incident %>% mutate(officer_gender=factor(officer_gender, levels=gender_levels))
officer_rank
This field appears to be free text, and there are a lot of variations. We leave the column unchanged.
officer_injured
We will convert this to a boolean.
incident <- incident %>% mutate(officer_injured = (officer_injured=="1"))
subject_injured_count
We leave this field unchanged.
subject_count
columnincident <- incident %>% left_join(count(subject, form_id, name = "subject_count"), by = "form_id")
incident <- incident %>% relocate( form_id, report_number, incident_case_number, incident_date_1, agency_county, agency_name, incident_municipality, incident_municipality_county, officer_name_id, officer_name, officer_age, officer_race, officer_rank, officer_gender, officer_injured, video_footage, indoors, outdoors, subject_count, subject_injured_count )
incident
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.