source(here::here('R/utils/arrests_utils_objects.R'))
source(here::here('R/utils/arrests_utils.R'))
source(here::here('R/utils/global_utils.R'))
source(here::here('R/make_sps/make_arrest_sps.R'))
source(here::here("R/make_sps/make_arrest_post_2020_sps.R"))
# ---- Driver script --------------------------------------------------------
# Runs the arrest-data pipeline: lists the raw FBI files, processes the 2021
# master file separately, combines the per-year yearly files, and zips the
# cleaned output.
#
# NOTE(review): get_post_2020_arrests(), clean_arrest_files() and
# combine_arrest_yearly() are defined further down in this file. When this
# file is run top-to-bottom in a fresh session they will not exist yet at the
# point they are called here - confirm the intended way of running it.

# Raw files live here; list.files() below is relative to this directory.
setwd("E:/ucr_data_storage/raw_data/asr_from_fbi")
# Keep every file whose name contains DAT, dat, TXT or txt.
files = list.files(pattern = "DAT|dat|TXT|txt")
files <- files[files != "2021_ASR12MON_NATIONAL_MASTER_FILE.txt"] # 2021 data does not follow setup file
# Show the remaining files (auto-prints when run interactively).
files
# The 2021 master file is excluded from `files` above and handled on its own.
get_post_2020_arrests("2021_ASR12MON_NATIONAL_MASTER_FILE.txt")
# Cleaning of the older files is currently disabled; when enabled it would
# process `files` from the 35th element onward.
#clean_arrest_files(files[35:length(files)])
# Stack the per-year yearly files into one multi-year file per category.
combine_arrest_yearly()
# Zip the cleaned output. save_as_zip() is defined elsewhere; presumably it
# zips the files in the working directory whose names match `pattern` - verify.
# Note that the monthly archive names end in 1974_2020 while the yearly one
# ends in 1974_2021.
setwd("E:/ucr_data_storage/clean_data/arrests")
save_as_zip("ucr_arrests_monthly_other_crimes_1974_2020_",
pattern = "monthly_other_crimes")
save_as_zip("ucr_arrests_monthly_alcohol_or_property_1974_2020_",
pattern = "monthly_alcohol")
save_as_zip("ucr_arrests_monthly_drug_1974_2020_",
pattern = "monthly_drug")
save_as_zip("ucr_arrests_monthly_index_1974_2020_",
pattern = "monthly_index")
save_as_zip("ucr_arrests_monthly_all_crimes_race_sex_1974_2020_",
pattern = "monthly_all")
save_as_zip("ucr_arrests_yearly_data_1974_2021_",
pattern = "yearly.*_1974_2021")
# Cleans post-2020 FBI arrest master files and writes one yearly file per
# category that already has a 2020 yearly file on disk.
#
# For each raw file this:
#   1. reads the agency-level records (rows with offense == 0) and the
#      offense-level records using the two post-2020 setup files,
#   2. reshapes the offense-level records so each agency is a single row with
#      one column per offense/arrestee-group combination,
#   3. merges the agency records with the crosswalk and the reshaped data,
#   4. for every "yearly ... 2020.rds" category found in the clean-data
#      folder, keeps only the columns that the 2020 file also has and saves
#      the result as <category><year>.rds and <category><year>.dta.
#
# Args:
#   files: Character vector of raw file names, read from
#          E:/ucr_data_storage/raw_data/asr_from_fbi.
#
# Returns:
#   NULL (invisibly); called for the files it writes.
#
# Side effects:
#   Changes the working directory and does not restore it. Sources
#   R/crosswalk.R on every iteration.
get_post_2020_arrests <- function(files) {
  raw_dir   <- "E:/ucr_data_storage/raw_data/asr_from_fbi"
  clean_dir <- "E:/ucr_data_storage/clean_data/arrests"

  # Categories are the names of the existing 2020 yearly files with the
  # trailing "2020.rds" (any four characters + ".rds") stripped off.
  setwd(clean_dir)
  file_categories <- list.files(pattern = "yearly.*2020.rds")
  file_categories <- gsub(".....rds$", "", file_categories)

  # Column names that are shortened before being matched against the 2020
  # files (old name = new name). Applied in this order with gsub(), so the
  # old names are treated as regular expressions, not fixed strings.
  column_renames <- c(
    "poss_drug_total_tot_female_adult"  = "poss_drug_total_tot_female_adul",
    "poss_heroin_coke_tot_female_adult" = "poss_heroin_coke_tot_female_adu",
    "poss_other_drug_tot_female_adult"  = "poss_other_drug_tot_female_adul",
    "poss_synth_narc_tot_female_adult"  = "poss_synth_narc_tot_female_adul",
    "sale_drug_total_tot_female_adult"  = "sale_drug_total_tot_female_adul",
    "sale_heroin_coke_tot_female_adult" = "sale_heroin_coke_tot_female_adu",
    "sale_other_drug_tot_female_adult"  = "sale_other_drug_tot_female_adul",
    "sale_synth_narc_tot_female_adult"  = "sale_synth_narc_tot_female_adul",
    "poss_heroin_coke_female_under_10"  = "poss_heroin_coke_female_under10",
    "sale_heroin_coke_female_under_10"  = "sale_heroin_coke_female_under10",
    "prostitution_pro_tot_female_adult" = "prostitution_pro_tot_female_adul",
    "prostitution_ass_tot_female_adult" = "prostitution_ass_tot_female_adul",
    "prostitution_pur_tot_female_adult" = "prostitution_pur_tot_female_adul"
  )

  for (file in files) {
    setwd(raw_dir)

    # Agency-level records: keep the offense == 0 rows, drop filler columns,
    # and keep one row per ORI.
    agency_header <- read_ascii_setup(
      file,
      here::here("setup_files/arrests_agency_header_post_2020.sps")
    ) %>%
      filter(offense %in% 0) %>%
      dplyr::select(-offense,
                    -not_used1,
                    -not_used2,
                    -not_used3,
                    -not_used4,
                    -agency_count,
                    -identifier_code,
                    -sequence_number,
                    -county) %>%
      distinct(ori, .keep_all = TRUE) %>%
      mutate(year      = fix_years(year),
             state_abb = make_state_abb(state))

    # Offense-level records: everything that is not a header record, one row
    # per ORI/offense pair.
    detail_header <- read_ascii_setup(
      file,
      here::here("setup_files/arrests_detail_header_post_2020.sps")
    ) %>%
      filter(!offense_code %in% "header record") %>%
      dplyr::select(-identifier_code,
                    -state,
                    -population_group,
                    -country_division,
                    -msa,
                    -not_used,
                    -year,
                    -card_1_indicator,
                    -card_2_indicator,
                    -card_3_indicator,
                    -adjustment) %>%
      distinct(ori, offense_code, .keep_all = TRUE)

    # Add a zero-filled row for every ORI/offense combination that is absent.
    detail_header <- fastDummies::dummy_rows(detail_header,
                                             select_columns = c("ori",
                                                                "offense_code"),
                                             dummy_value = 0)

    # Make columns numeric. Everything after the first two columns is
    # converted (assumes columns 1-2 are ori and offense_code - TODO confirm).
    # `[[` is used so this works for tibbles as well as data.frames, and the
    # index is built with seq_len() so it is empty rather than counting
    # backwards when there are fewer than three columns.
    value_col_index <- setdiff(seq_len(ncol(detail_header)), 1:2)
    for (i in value_col_index) {
      detail_header[[i]] <- as.numeric(detail_header[[i]])
    }

    detail_header <- make_simple_columns(detail_header)

    # Spread to one row per ORI: every non-identifier column becomes
    # "<offense_code>_<column>".
    arrestee_info <- names(detail_header)
    arrestee_info <- arrestee_info[!arrestee_info %in% c("ori", "offense_code")]
    detail_header <-
      detail_header %>%
      pivot_wider(names_from  = offense_code,
                  values_from = dplyr::all_of(arrestee_info),
                  names_glue  = "{offense_code}_{.value}")
    detail_header <- data.frame(detail_header)

    source(here::here('R/crosswalk.R'))
    crosswalk <- read_merge_crosswalks()

    # Both joins use every column name the two tables share (no `by` given).
    agency_header <-
      agency_header %>%
      filter(ori %in% detail_header$ori) %>%
      left_join(crosswalk) %>%
      left_join(detail_header)

    for (old_name in names(column_renames)) {
      names(agency_header) <- gsub(old_name,
                                   column_renames[[old_name]],
                                   names(agency_header))
    }

    setwd(clean_dir)
    year_label <- unique(agency_header$year)
    for (file_category in file_categories) {
      # Use the 2020 file of this category as the template for which columns
      # belong in the category.
      temp2020 <- readRDS(paste0(file_category, "2020.rds"))
      temp_new <- agency_header[, names(agency_header) %in% names(temp2020)]
      saveRDS(temp_new, file = paste0(file_category, year_label, ".rds"))
      write_dta(temp_new, path = paste0(file_category, year_label, ".dta"))
    }
  }
}
# Cleans raw arrest files one at a time and saves monthly and yearly output.
#
# For every file the agency, monthly and detail records are read with the
# setup-file era matching the file name, yearly totals are built from the
# monthly and detail records, and both the monthly and the yearly versions are
# handed to long_to_wide_and_save().
#
# Args:
#   files: Character vector of raw file names, read from
#          E:/ucr_data_storage/raw_data/asr_from_fbi.
#
# Returns:
#   NULL (invisibly); called for its side effects. The working directory is
#   changed and not restored.
clean_arrest_files <- function(files) {
  print(files)

  for (file in files) {
    # File names containing 74-79 use the older setup files.
    sps_years <- if (grepl("7[4-9]", file)) "1974_1979" else "1980_present"

    setwd("E:/ucr_data_storage/raw_data/asr_from_fbi")
    agency_header  <- get_agency_header(file, sps_years)
    monthly_header <- get_monthly_header(file, sps_years)
    detail_header  <- get_detail_header(file, sps_years)

    number_of_months_reported <- get_number_months_reported(detail_header)
    yearly_monthly_header     <- make_arrests_yearly(monthly_header, "monthly")
    yearly_detail_header      <- make_arrests_yearly(detail_header, "offenses")

    # Some agency header rows carry the wrong year and look like bad data
    # (e.g. state = 0, malformed ORI), so only rows with the most frequent
    # year are kept.
    year_counts   <- sort(table(agency_header$year), decreasing = TRUE)
    most_common   <- as.numeric(names(year_counts[1]))
    in_main_year  <- agency_header$year %in% most_common
    agency_header <- agency_header[in_main_year, ]

    source(here::here('R/crosswalk.R'))
    crosswalk <- read_merge_crosswalks()

    # Monthly output
    long_to_wide_and_save(detail_header,
                          monthly_header,
                          agency_header,
                          number_of_months_reported,
                          type = "monthly",
                          crosswalk)

    # Yearly output
    long_to_wide_and_save(yearly_detail_header,
                          yearly_monthly_header,
                          agency_header,
                          number_of_months_reported,
                          type = "yearly",
                          crosswalk)

    message(file)
    gc()
  }
}
# Collapses monthly rows into yearly totals.
#
# The "month" column and every column whose name contains "date" are left out
# of the summing. The date columns are instead taken from the first row seen
# for each ORI and joined back onto the totals.
#
# Args:
#   data: Data frame with an `ori` column (and `offense_code` when
#         type is "offenses").
#   type: "offenses" sums within ori + offense_code; any other value sums
#         within ori only.
#
# Returns:
#   The per-ORI date columns left-joined (by ori) to the summed columns.
make_arrests_yearly <- function(data, type) {
  date_cols <- grep("date", names(data), value = TRUE)
  drop_cols <- c("month", date_cols)
  keep_cols <- c("ori", date_cols)

  grouping <- if (type == "offenses") c("ori", "offense_code") else "ori"

  # One row per ORI holding the columns that are not summed.
  per_ori <- dplyr::select(data, one_of(keep_cols))
  per_ori <- dplyr::distinct(per_ori, ori, .keep_all = TRUE)

  # Sum every remaining column within each group, ignoring NAs.
  totals <- dplyr::select(data, -one_of(drop_cols))
  totals <- dplyr::group_by_at(totals, vars(one_of(grouping)))
  totals <- dplyr::summarize_all(totals, sum, na.rm = TRUE)

  dplyr::left_join(per_ori, totals, by = "ori")
}
# Reshapes offense-level arrest data to one row per agency (and month) and
# saves one file per crime group / breakdown combination.
#
# For each entry of `combined_crimes` and each row of `matches_types` (both
# defined outside this function) the matching offense rows and columns are
# selected, reshaped with make_long_to_wide(), joined to the monthly header,
# padded with zero rows for agencies (or agency-months) that are absent, and
# then joined to the agency header, the months-reported counts and the
# crosswalk. The result is saved as .rds; monthly data is also saved as .dta.
#
# Args:
#   detail_header:             Offense-level data (ori, offense_code, and
#                              month when type is "monthly").
#   monthly_header:            Data joined on ori (and month when monthly).
#   agency_header:             Agency-level data, joined on ori.
#   number_of_months_reported: Data with ori, number_of_months_reported and
#                              num_months_<offense> columns.
#   type:                      "monthly" or "yearly".
#   crosswalk:                 Crosswalk data, joined on ori.
#
# Returns:
#   NULL (invisibly); called for the files it writes into
#   E:/ucr_data_storage/clean_data/arrests. The working directory is changed
#   and not restored.
long_to_wide_and_save <- function(detail_header,
                                  monthly_header,
                                  agency_header,
                                  number_of_months_reported,
                                  type,
                                  crosswalk) {
  setwd("E:/ucr_data_storage/clean_data/arrests")

  cross_names <- names(crosswalk)
  cross_names <- cross_names[!cross_names %in% c("ori", "ori9")]

  time_vars       <- "year"
  identifier_cols <- c("ori", "offense_code")
  first_join_by   <- "ori"
  if (type == "monthly") {
    time_vars       <- c("year", "month")
    identifier_cols <- c("ori", "offense_code", "month")
    first_join_by   <- c("ori", "month")
  }

  # seq_along()/seq_len() so that an empty list or table means zero
  # iterations instead of iterating over c(1, 0).
  for (i in seq_along(combined_crimes)) {
    crime_group <- names(combined_crimes)[i]
    crime_codes <- combined_crimes[[i]]

    for (n in seq_len(nrow(matches_types))) {
      breakdown <- matches_types$name[n]

      # all_crimes is only produced for the race_sex breakdown: the age
      # breakdown is skipped for it to keep memory use manageable, and
      # race_sex is skipped for every other crime group.
      if (crime_group == "all_crimes" && breakdown %in% "age") {
        next
      }
      if (crime_group != "all_crimes" && breakdown %in% "race_sex") {
        next
      }

      file_name <- paste0("ucr_arrests_",
                          type, "_",
                          crime_group, "_",
                          breakdown, "_")

      # Subset to desired offense-rows and columns. all_of() marks
      # identifier_cols as an external character vector for tidyselect.
      wide_data <-
        detail_header %>%
        dplyr::filter(offense_code %in% crime_codes) %>%
        dplyr::select(dplyr::all_of(identifier_cols),
                      dplyr::matches(matches_types$match[n]))

      # A very small number of rows share the same ORI, offense_code and
      # month (apparently data-entry mistakes by the agency); keep the first.
      if (type == "monthly") {
        wide_data <-
          wide_data %>%
          dplyr::distinct(ori, offense_code, month, .keep_all = TRUE)
      }

      # Months-reported counts for just the offenses in this crime group.
      number_of_months_reported_temp <-
        number_of_months_reported %>%
        dplyr::select(ori,
                      number_of_months_reported,
                      one_of(paste0("num_months_", crime_codes)))
      monthly_cols <- paste0("num_months_", crime_codes)
      monthly_cols <- names(number_of_months_reported_temp)[
        names(number_of_months_reported_temp) %in% monthly_cols
      ]

      wide_data <-
        wide_data %>%
        make_long_to_wide(type = type) %>%
        # Combine everything together
        dplyr::full_join(monthly_header, by = first_join_by)

      # Build rows for every agency (x 12 months when monthly) that is in the
      # agency header but not yet in wide_data, so they can be zero-filled.
      if (type == "monthly") {
        all_agencies_time <-
          data.frame(ori = rep(agency_header$ori, times = 12)) %>%
          arrange(ori)
        all_agencies_time$month <- rep(tolower(month.name),
                                       nrow(agency_header))
        already_present <- paste(all_agencies_time$ori,
                                 all_agencies_time$month) %in%
          paste(wide_data$ori, wide_data$month)
        all_agencies_time <- all_agencies_time[!already_present, ]
      } else if (type == "yearly") {
        all_agencies_time <- data.frame(ori = agency_header$ori)
        # drop = FALSE keeps the one-column data frame from collapsing to a
        # vector.
        all_agencies_time <- all_agencies_time[
          !all_agencies_time$ori %in% wide_data$ori, , drop = FALSE
        ]
      }

      wide_data <- bind_rows(wide_data, all_agencies_time)
      wide_data[is.na(wide_data)] <- 0

      wide_data <-
        wide_data %>%
        dplyr::full_join(agency_header, by = "ori") %>%
        dplyr::left_join(number_of_months_reported_temp, by = "ori") %>%
        dplyr::left_join(crosswalk, by = "ori") %>%
        dplyr::mutate_if(is.character, tolower) %>%
        dplyr::mutate(ori       = toupper(ori),
                      ori9      = toupper(ori9),
                      state_abb = toupper(state_abb)) %>%
        dplyr::select(ori,
                      ori9,
                      population,
                      agency_name,
                      one_of(time_vars),
                      state,
                      state_abb,
                      population_group,
                      country_division,
                      dplyr::all_of(cross_names),
                      matches("date"),
                      covered_by,
                      number_of_months_reported,
                      dplyr::all_of(monthly_cols),
                      everything()) %>%
        dplyr::select(-agency_type,
                      -crosswalk_agency_name,
                      -census_name) %>%
        filter(!is.na(year)) %>%
        mutate_at(c("number_of_months_reported", monthly_cols), na_to_0)

      year_label <- unique(wide_data$year)
      saveRDS(wide_data, file = paste0(file_name, year_label, ".rds"))
      # Only the monthly data gets a .dta file here.
      if (type == "monthly") {
        write_dta(wide_data, path = paste0(file_name, year_label, ".dta"))
      }
      gc()
    }
  }
}
# Stacks the per-year yearly arrest files into one multi-year file per
# category.
#
# Categories are found by listing the "yearly ... rds" files in the clean-data
# folder and stripping the trailing year and extension. For each category all
# annual .rds files are read, row-bound (missing columns filled), sorted by
# ORI and descending year, and written as
# <category><first year>_<last year>.rds and .dta.
#
# Returns:
#   NULL (invisibly); called for the files it writes. The working directory
#   is changed and not restored.
combine_arrest_yearly <- function() {
  setwd("E:/ucr_data_storage/clean_data/arrests")

  # Strip the last five characters before "rds" (the year and the dot), drop
  # the all-variables category, and drop anything still containing a digit
  # (i.e. files that are already multi-year aggregates).
  category_stems <- gsub(".....rds$", "", list.files(pattern = "yearly.*rds"))
  category_stems <- category_stems[
    category_stems != "ucr_arrests_yearly_all_variables_"
  ]
  category_stems <- unique(category_stems[!grepl("[0-9]", category_stems)])

  for (file_category in category_stems) {
    # Drop dta or already aggregated files
    yearly_paths <- list.files(pattern = file_category)
    yearly_paths <- yearly_paths[
      !grepl("dta$|[0-9]{4}_[0-9]{4}", yearly_paths)
    ]

    yearly_list <- lapply(yearly_paths, function(path) {
      one_year <- readRDS(path)
      message(path)
      one_year
    })

    # 1974 and 1975 doesn't have possession crimes or total drug sale crimes,
    # hence fill = TRUE when binding.
    combined <- data.table::rbindlist(yearly_list,
                                      use.names = TRUE,
                                      fill = TRUE) %>%
      arrange(ori,
              desc(year)) %>%
      dplyr::select(ori,
                    ori9,
                    population,
                    agency_name,
                    year,
                    state,
                    state_abb,
                    population_group,
                    country_division,
                    fips_state_code,
                    fips_county_code,
                    fips_state_county_code,
                    fips_place_code,
                    longitude,
                    latitude,
                    covered_by,
                    everything())

    # Columns absent in some years (1974/1975 drug possession, and categories
    # missing from post-2020 years) come through as NA; replace with zero.
    combined[is.na(combined)] <- 0

    output_stem <- paste0(file_category,
                          min(combined$year), "_", max(combined$year))
    saveRDS(combined, file = paste0(output_stem, ".rds"))
    write_dta(combined, path = paste0(output_stem, ".dta"))

    message(file_category)
    message("\n\n\n")
    rm(combined)
    gc()
  }
}
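# NOTE: the two lines below are leftover text from the web page this file was
# copied from; they are not part of the script.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.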