data-raw/process_raw_data.R

library(dplyr)
library(igraph)
library(purrr)
library(magrittr)
library(tidyr)
library(parallel)
library(sf)
library(spdep)
library(geosphere)
library(rmapshaper)

#------------------------------- Load raw data
# The cross section dataset contains information for only those municipalities
# with land distribution data
cross_section_raw <- haven::read_dta("data-raw/Cross_Section_final_RAW_1076.dta")

# The panel dataset contains information for municipalities that we cannot
# use in our models because we lack land distribution data for them
panel_raw <- haven::read_dta("data-raw/panel-raw.dta")

survey_raw <- haven::read_dta("data-raw/survey-raw.dta")

municipalities_and_regions_raw <- readr::read_csv("data-raw/DANE_municipality_codes_and_regions.csv")

#-------------------------------------------------------------------------------
# Shapefile for Colombian municipalities
#    source:   <https://data.humdata.org/dataset/cod-ab-col>
#    filename: "Col Administrative Divisions Shapefiles.zip"
#-------------------------------------------------------------------------------

# Read in shapefile.
CO_shape <- st_read('data-raw/CO-shapefiles/col_admbnda_adm2_mgn_20200416.shp')


#------------------- Merge department/province names
municipalities_and_regions <- municipalities_and_regions_raw %>%
  rename(department = number_dept, municipality = Muni_code) %>%
  mutate(municipality = as.numeric(municipality))

cross_section <- left_join(cross_section_raw, municipalities_and_regions)
panel <- left_join(panel_raw, municipalities_and_regions)

rm(municipalities_and_regions_raw, cross_section_raw, panel_raw,
   municipalities_and_regions)

#------------------------------- Remove two islands: 88001 and 88564
CO_islands <- c('88001', '88564')

survey_raw %>%
  filter(municipality_origin %in% CO_islands) #None in the survey

cross_section %<>% # Assignment pipe!
  filter(!(municipality %in% CO_islands))

panel <- panel %>%
  filter(!(municipality %in% CO_islands))

CO_shape <- CO_shape %>%
  mutate(municipality = as.numeric(stringr::str_remove(ADM2_PCODE, 'CO'))) %>%
  filter(!(municipality %in% CO_islands))

rm(CO_islands)

#-------------------------------- Construct List of Neighbors
# construct list of neighbors: neighbors gives *indices* corresponding to
# rows of CO_shape while neighbor_codes gives *municipality codes*, i.e. the
# correponding values of the column municipality from CO_shape
neighbors <- poly2nb(CO_shape)
neighbor_codes <- lapply(neighbors, function(x) CO_shape$municipality[x])
names(neighbor_codes) <- CO_shape$municipality

#-------------------------------------- Select and rename panel variables
panel %<>% # Assignment pipe!
  filter(year %in% 1996:2008) %>% # Only use panel data from 1996-2008
  select(municipality,
         year,
         V_cum = ac_vcivilparas_UR, # Cumulative paramilitary violence
         V_flow = vcivilparas_UR, # Flow of paramilitary violence
         placeboV_cum = ac_vcivilnotparas_UR, # Cumulative non-paramilitary violence
         placeboV_flow = vcivilnotparas_UR, # Flow of non-paramilitary violence
         D_AS = desplazados_paras_AS, # Four measures of displacement
         D_RUV = desplazados_total_RUV,
         D_CEDE = desplazados_CEDE,
         D_JYP = desplazados_JYP,
         pop1993 = Total_Poblacion1993)

#--------------------------- Compare municipalities in neighbors list vs panel
panel_munis <- panel %>%
  pull(municipality) %>%
  unique()

munis <- as.numeric(stringr::str_remove(CO_shape$ADM2_PCODE, 'CO'))

# All municipalities in panel are also in adjacency matrix
identical(sum(panel_munis %in% munis), length(panel_munis))

#---------------------- Construct average violence in neighboring municipalities

get_V_flow_neighbors <- function(muni_code) {
  V_flow_neighbors <- panel %>%
    filter(municipality %in% neighbor_codes[[paste(muni_code)]]) %>%
    select(municipality, year, V_flow) %>%
    pivot_wider(names_from = municipality, values_from = V_flow) %>%
    select(-year) %>%
    apply(1, mean)

  out <- panel %>%
    filter(municipality == muni_code) %>%
    select(municipality, year)

  # A very small number of municipalities have neighbors, but not neighbors for
  # which we observe violence data, in which case V_flow_neighbors equals numeric(0)
  if(length(V_flow_neighbors) > 0){
    out$V_flow_neighbors <- V_flow_neighbors
  } else {
    out$V_flow_neighbors <- NA_real_
  }
  return(out)
}

V_flow_neighbors <- mclapply(panel_munis, get_V_flow_neighbors, mc.cores = 8)
V_flow_neighbors <- do.call(rbind, V_flow_neighbors)


panel %<>% # Assignment pipe!
  left_join(V_flow_neighbors)

rm(V_flow_neighbors, get_V_flow_neighbors, munis, panel_munis)

#-------------------------------- Clean land distribution characteristics

#------------------------------------------------------------------------------
# The column landless_families in from Cross_section.dta was constructed by
# Camilo's original RA. There are some discrepancies in this column. Our
# procedure for constructing a measure of the number of landless families in a
# municipality from the underlying census data is as follows. We assume, absent
#evidence to the contrary, that there is at most one landowner per family.
#------------------------------------------------------------------------------
# 1. Calculate n_landowners by summing counts in each column "landown_*"
# 2. Set n_families = pmax(num_families, n_landowners)
# 3. Set n_landless = n_families - n_landowners
#------------------------------------------------------------------------------

cross_section %<>% # Assignment pipe!
  rowwise(municipality) %>%
  mutate(n_landowners = sum(c_across(starts_with('landown_')))) %>%
  ungroup() %>%
  rename(n_families_census = num_families) %>%
  mutate(n_families = pmax(n_families_census, n_landowners),
         n_landless = n_families - n_landowners,
         omega_n = n_landless / n_families) %>%
  select(-landless_families)


#---------------------------------------------------------------------------------------
# We observe a discretized land distribution: we know the total amount of land and the
# total number of landholders within each of the following "bins" measured in hectares
#---------------------------------------------------------------------------------------
# 0
# (0,1)
# [1,3)
# [3,5)
# [5,10)
# [10,15)
# [15, 20)
# [20, 50)
# [50,100)
# [100, 200)
# [200, 500)
# [500, 1000)
# >=2000
#---------------------------------------------------------------------------------------
landowner_bins <- cross_section %>% # Number of families in each landholding "bin"
  select(n_landless, starts_with('landown_')) %>%
  relocate(n_landless) %>% # Make this the first column
  rename(`0` = n_landless,
         `(0,1)` = landown_less_1,
         `[1,3)` = landown_1_3,
         `[3,5)` = landown_3_5,
         `[5,10)` = landown_5_10,
         `[10,15)` = landown_10_15,
         `[15,20)` = landown_15_20,
         `[20,50)` = landown_20_50,
         `[50,100)` = landown_50_100,
         `[100,200)` = landown_100_200,
         `[200,500)` = landown_200_500,
         `[500,1000)` = landown_500_1000,
         `[1000,2000)` = landown_1000_2000 ,
         `>=2000` = landown_2000_plus)

landtotal_bins <- cross_section %>% # Total land in each landholding "bin"
  mutate(tot_land_0 = 0) %>% # landless families are those with exactly 0 land
  select(starts_with('tot_land_')) %>%
  relocate(tot_land_0) %>% # Make this the first column
  rename(`0` = tot_land_0,
         `(0,1)` = tot_land_1,
         `[1,3)` = tot_land_2,
         `[3,5)` = tot_land_3,
         `[5,10)` = tot_land_4,
         `[10,15)` = tot_land_5,
         `[15,20)` = tot_land_6,
         `[20,50)` = tot_land_7,
         `[50,100)` = tot_land_8,
         `[100,200)` = tot_land_9,
         `[200,500)` = tot_land_10,
         `[500,1000)` = tot_land_11,
         `[1000,2000)` = tot_land_12,
         `>=2000` = tot_land_13)

# Construct a list of land distributions: one for each municipality
make_land_dist <- function(i) {
  total_land <- landtotal_bins[i,]
  n_families <- landowner_bins[i,]
  out <- data.frame(t(rbind(total_land = landtotal_bins[i,], n_families = landowner_bins[i,])))
  out$mean_land <- ifelse(out$n_families > 0, out$total_land / out$n_families, 0)
  out$frac_families <- out$n_families / sum(out$n_families)
  return(out)
}
land_distributions <- lapply(seq_len(nrow(cross_section)), make_land_dist)
names(land_distributions) <- cross_section$municipality
rm(make_land_dist, landowner_bins, landtotal_bins)

# Function that computes summary statistics of the land distribution for use in
# our LINEAR IV models. The non-linear IV models rely on the entire land distribution
# stored in land_distributions
get_land_statistics <- function(x) {
  z <- x[-1,] # summary stats for *landholders* so remove landless
  z$frac_families <- z$frac_families / sum(z$frac_families)
  c('omega' = x$frac_families[1],
    'H1' = sum(asinh(z$mean_land) * z$frac_families),
    'H2' = sum(asinh(z$mean_land)^2 * z$frac_families),
    n_families = sum(x$n_families))
}

land_statistics <- lapply(land_distributions, get_land_statistics)
land_statistics <- as.data.frame(do.call(rbind, land_statistics))
land_statistics$municipality <- as.numeric(rownames(land_statistics))
rownames(land_statistics) <- NULL
rm(get_land_statistics)

cross_section %<>% # Assignment pipe!
  left_join(y = land_statistics) %>%
  mutate(omegaC = 1 - omega) %>% # It's convenient to have (1 - omega) as a separate variable
  select(-omega_n) # This is identical to omega: share of landless -- don't need both

rm(land_statistics)

# Cumulative violence in the final year of the panel, i.e. total violence, can
# be viewed as a cross-sectional characteristic. Merge this with the other
# cross_section data

merge_me <- panel %>%
  filter(year == max(year)) %>%
  select(municipality, V_cum)

cross_section %<>%
  inner_join(merge_me) %>%
  rename(V_total = V_cum) # Better to call it V_total: total violence from 1996:2008

rm(merge_me)

#------------------------ Flag municipalities w/ land data and >=100 landholders

# The municipalities in cross_section are those for which we have land dist data
identical(sum(!is.na(cross_section$H1)), nrow(cross_section))

cross_section %<>%
  mutate(at_least_100_landowners = n_landowners >= 100)

munis_100_plus <- cross_section %>%
  filter(at_least_100_landowners) %>%
  pull(municipality)

panel %<>%
  mutate(has_land_data = municipality %in% cross_section$municipality,
         at_least_100_landowners = municipality %in% munis_100_plus)

rm(munis_100_plus)


#-------------------------------------Clean Displacement Data

#-------------------------------------------------------------------------------
# Some displacement measures are zero for all municipalities during a given year.
# These are not "true" zeros, but rather missing observations: some agencies did
# not choose to report in a given year:
#-------------------------------------------------------------------------------
# AS      did not record in: 1996, 2012
# CEDE    did not record in: 1996, 2010, 2011, 2012
#-------------------------------------------------------------------------------
# Replace these zeros with NA.
#-------------------------------------------------------------------------------
# Technically not necessary to include 2010:2012 since we've already subsetted
# to remove those years.
panel[panel$year %in% c(1996, 2012),]$D_AS <- NA
panel[panel$year %in% c(1996, 2010:2012),]$D_CEDE <- NA


#-------------------------------------------------------------------------------
# For all measures except JYP, there are some municipalities in which a single
# measure reports zero total displacement over the sample period, while all other
# measures report positive displacement. These "zeros" are really NAs.
#-------------------------------------------------------------------------------
disagreement <- panel %>%
  select(municipality, year, starts_with('D_')) %>%
  group_by(municipality) %>%
  summarise(across(.cols = starts_with('D_'),
                   .fns = sum, na.rm = TRUE)) %>%
  rowwise() %>%
  filter(sum(c_across(starts_with('D_')) == 0) == 1) %>%
  ungroup()


AS_replace <- disagreement %>%
  filter(D_AS == 0) %>%
  pull(municipality)
panel[panel$municipality %in% AS_replace,]$D_AS <- NA

RUV_replace <- disagreement %>%
  filter(D_RUV == 0) %>%
  pull(municipality)
panel[panel$municipality %in% RUV_replace,]$D_RUV <- NA

CEDE_replace <- disagreement %>%
  filter(D_CEDE == 0) %>%
  pull(municipality)
panel[panel$municipality %in% CEDE_replace,]$D_CEDE <- NA

rm(disagreement, AS_replace, RUV_replace, CEDE_replace)

#--------------------------------------- Create lagged violence flow
panel <- panel %>%
  group_by(municipality) %>%
  mutate(lag_V_flow = lag(V_flow, order_by = year)) %>%
  ungroup()

#--------------------------- Re-scale displacement measures

# What share of total RUV and JYP displacement was reported in 1996?
# Average these two shares
share_1996 <- panel %>%
  select(year, D_RUV, D_JYP) %>%
  group_by(year) %>%
  summarize(RUV_sum = sum(D_RUV, na.rm = TRUE),
            JYP_sum = sum(D_JYP, na.rm = TRUE)) %>%
  mutate(RUV_share = RUV_sum / sum(RUV_sum),
         JYP_share = JYP_sum / sum(JYP_sum)) %>%
  filter(year == 1996) %>%
  select(RUV_share, JYP_share) %>%
  rowMeans()

panel %<>%
  mutate(across(c(D_RUV, D_JYP), # Normalize total displacement to 6 million
         list(norm = ~ 6e6 * . / sum(., na.rm = TRUE)))) %>%
  # Because AS and CEDE did not report in 1996 we don't normalize them to 6 million.
  # Instead we normalize them to a *slightly smaller value* constructed using
  # share_1996
  mutate(across(c(D_AS, D_CEDE),
                list(norm = ~ (1 - share_1996) * 6e6 * . / sum(., na.rm = TRUE)))) %>%
  rowwise() %>%
  mutate(D_med = median(c_across(ends_with('_norm')), na.rm = FALSE),
         D_avg = mean(c_across(ends_with('_norm')), na.rm = FALSE),
         D_med_any = median(c_across(ends_with('_norm')), na.rm = TRUE),
         D_avg_any = mean(c_across(ends_with('_norm')), na.rm = TRUE)) %>%
  ungroup() %>%
  mutate(D_avg_no_AS = (D_RUV + D_CEDE + D_JYP) / 3,
         D_avg_no_RUV = (D_AS + D_CEDE + D_JYP) / 3,
         D_avg_no_CEDE = (D_AS + D_RUV + D_JYP) / 3,
         D_avg_no_JYP = (D_AS + D_RUV + D_CEDE) / 3) %>%
  # After taking medians of the normalized displacement variables (sum to 6 million) the result
  # will no longer sum to 6 million. Renormalize so it does
  mutate(across(c(starts_with('D_med'), starts_with('D_avg')), ~ 6e6 * . / sum(., na.rm = TRUE)))

rm(share_1996)


# Covariates ------------------------------------------------------------------

cross_section %<>%

  rename(mines = mine_titles_90, oil = oil_prod_98) %>%

  mutate(mines = if_else(mines > 0, 1, 0),
         drug_routes = if_else(drug_routes > 1, 1, 0),
         oil = if_else(oil > 0, 1, 0),
         bureaucracy = local_bureaucracy_95 + state_bureaucracy_95,
         elec_comp = 0.5 * (ratio_votes_dif_alc + ratio_votes_dif_asa),
         offices = rowSums(across(c(notaries_95,
                                    banks_95,
                                    savings_95,
                                    health_centers_95,
                                    health_posts_95,
                                    schools_95,
                                    libraries_95,
                                    fire_95,
                                    jails_95,
                                    culture_95,
                                    instruments_95,
                                    tax_95,
                                    police_95,
                                    police_posts_95,
                                    courts_95,
                                    telecom_95,
                                    post_95,
                                    agrarian_95,
                                    hospitals_95)))) %>%

  select(-starts_with('tot_land'), -starts_with('landown_'), -gini,
         -convexity_L, -starts_with('owners_'), -ends_with('_95'),
         -starts_with('ratio_votes'), -n_families_census,
         -contains('_signal'), -contains('educ_'), -dum_2001_land)


# Spatial stuff ----------------------------------------------------------------

# Read in ARCGIS-generated list of municipalities intersecting road network
roads <- readr::read_csv('data-raw/real_roads.csv') %>%
  mutate(municipality = stringr::str_remove(ADM2_PCODE, 'CO'),
         municipality = as.numeric(municipality))


# Read and clean forest & elevation data --------------------------------------

forests <- readr::read_csv("data-raw/Forest_Master_50.csv") %>%
  mutate(share_forested = extent_2000_ha / area_ha,
         is_forested = share_forested > 0.5) %>%
  select(ADM2_PCODE, is_forested) %>%
  mutate(municipality = stringr::str_remove(ADM2_PCODE, 'CO'),
         municipality = as.numeric(municipality)) %>%
  select(-ADM2_PCODE)

# Municipality 70523 (Palmito, Sucre) appears twice in forests, once with
# is_forested TRUE and again with is_forested FALSE. There are no forests here!
forests %<>%
  filter(!((municipality == 70523) & (is_forested)))

# Elevation *difference*
elevation <- readr::read_csv("data-raw/elevationsFinal.csv") %>%
  select(-`...1`) %>% # first column is a row number starting from zero
  mutate(municipality = stringr::str_remove(ADM2_PCODE, 'CO'),
         municipality = as.numeric(municipality)) %>%
  select(-ADM2_PCODE) %>%
  arrange(municipality)


# Create dataframe of geographic information ----------------------------------
geography <- cross_section %>%
  select(municipality, ruggedness, slope, elevation) %>%
  left_join(elevation) %>%
  left_join(forests) %>%
  mutate(has_road = if_else(municipality %in% roads$municipality, 1, 0),
         is_forested = 1 * is_forested)

rm(elevation, forests, roads)

centroid_coords <- CO_shape %>%
  st_centroid() %>% # compute lat and lon of centroids
  st_coordinates() # extract lat and lon of centroids as *matrix*

get_neighbor_distances <- function(i) {
  if(all(neighbors[[i]] == 0)) {
    return(NA) # No neighbors? No distances!
  } else {
    own_coords <- centroid_coords[i, ]
    neighbor_coords <- centroid_coords[neighbors[[i]], ]
    distGeo(own_coords, neighbor_coords) / 1000 # convert meters to km
  }
}
neighbor_distances <- lapply(1:length(neighbors), get_neighbor_distances)

get_pairwise_distances <- function(i) {
# Only return neighbors whose index is greater than i: eliminate "repeat" pairs
  i_neighbors <- neighbors[[i]]
  i_neighbor_distances <- neighbor_distances[[i]]
  keep <- i < i_neighbors
  if(any(keep)) {
    data.frame(i, j = i_neighbors[keep], dist = i_neighbor_distances[keep])
  } else {
    # Return NULL if no neighbors (entry in neighbors is zero) or all repeats
    NULL
  }
}


pairwise_distances <- lapply(1:length(neighbors), get_pairwise_distances)
pairwise_distances <- do.call(rbind, pairwise_distances)
pairwise_distances <- tibble(pairwise_distances)

rm(get_neighbor_distances, get_pairwise_distances, centroid_coords)


# Impute missing values --------------------------------------------------------

# There are 41 missing values for the variables elevation_difference and
# is_forested from geography. We impute these with an average of the same
# variables for *neighboring* municipalities. If a variable is missing for all
# neighboring municipalities, we fill in the overall mean of the variable for
# geography.


# Create list of neighboring municipalities with municipality *codes* rather
# than row numbers from CO_shape
muni_codes <- as.numeric(stringr::str_remove(CO_shape$ADM2_PCODE, 'CO'))
neighbors_codes <- lapply(neighbors, function(x) muni_codes[x])
names(neighbors_codes) <- muni_codes

# geography only has data for 1049 municipalities while CO_shape has data for
# 1122. Add the missing municipalities to geography
geography %<>%
  full_join(tibble(municipality = muni_codes))


impute_missing_var <- function(var_name) {
  muni_missing_var <- geography[is.na(geography[, var_name]), 'municipality', drop = TRUE]
  municipality_indices <- which(muni_codes %in% muni_missing_var)
  neighbor_list <- neighbors_codes[municipality_indices]
  f <- function(x) {
    neighbor_geography <- pull(geography[geography$municipality %in% x, var_name])
    mean(neighbor_geography, na.rm = TRUE)
  }
  out <- lapply(neighbor_list, f)
  out <- do.call(rbind, out)
  out <- data.frame(row.names(out), out[,1])
  names(out) <- c('municipality', paste0('impute_', var_name))
  row.names(out) <- NULL
  out$municipality <- as.numeric(out$municipality)
  # If no neighbors, then there's an NaN in the second column. Replace with the
  # overall average
  out[is.na(out[, 2]), 2] <- mean(geography[, var_name, drop = TRUE], na.rm = TRUE)
  return(tibble(out))
}

# Impute missing values for all variables *except* municipality (first column)
# and store the result in a list
imputed_geography <- lapply(names(geography)[-1], impute_missing_var)

imputed_geography %<>%
  reduce(full_join, by = 'municipality')


# Last step of the imputation, following the logic of the comment, but
# for all the imputed variables, not just impute_forest
geography %<>%
  full_join(imputed_geography) %>%
  mutate(ruggedness = if_else(!is.na(ruggedness), ruggedness, impute_ruggedness)) %>%
  mutate(slope = if_else(!is.na(slope), slope, impute_slope)) %>%
  mutate(elevation = if_else(!is.na(elevation), elevation, impute_elevation)) %>%
  mutate(elevation_difference = if_else(!is.na(elevation_difference),
                                       elevation_difference,
                                       impute_elevation_difference)) %>%
  mutate(is_forested =  if_else(!is.na(is_forested), is_forested, impute_is_forested)) %>%
  mutate(has_road = if_else(!is.na(has_road), has_road, impute_has_road)) %>%
  select(-starts_with('impute_'))


rm(impute_missing_var, imputed_geography, muni_codes, neighbors_codes)

cross_section <- cross_section %>%
  full_join(geography)

# Construct dataset of all unique *pairs* of neighboring municipalities --------

# Create geography_i by renaming the columns of geography to have an '_i'
# at the end. Do the same for geography_j. Then merge with pairwise distances.
geography_i <- geography %>%
  rename_with(~ stringr::str_c(., '_i'))

geography_j <- geography %>%
  rename_with(~ stringr::str_c(., '_j'))

pairwise_distances %<>% # Assignment pipe!
  mutate(municipality_i = stringr::str_remove(CO_shape$ADM2_PCODE[i], 'CO'),
         municipality_j = stringr::str_remove(CO_shape$ADM2_PCODE[j], 'CO'),
         municipality_i = as.numeric(municipality_i),
         municipality_j = as.numeric(municipality_j)) %>%
  select(-i, -j) %>%
  left_join(geography_i) %>%
  left_join(geography_j)

rm(geography_i, geography_j, geography)

link_stats <- pairwise_distances %>%
  mutate(forest = is_forested_i + is_forested_j,
         elevation = (elevation_difference_i + elevation_difference_j) / 2,
         ruggedness = (ruggedness_i + ruggedness_j) / 2,
         slope = (slope_i + slope_j) / 2,
         elevation_diff = abs(elevation_i - elevation_j)) %>%
  select(municipality_i, municipality_j, dist, forest, elevation, slope,
         ruggedness, elevation_diff)

pca <- prcomp(~ forest + elevation + ruggedness + slope + elevation_diff,
              data = link_stats, scale = TRUE, na.action = na.omit)


# Extract first PC and "rotate" so that higher values indicate less
# favorable terrain (higher "friction"). Note that this is mean zero
print(pca)

link_stats$PC1_geography <- -1 * pca$x[,1]

rm(pca, pairwise_distances)

# "Effective distance" is defined as:
#    dist * friction
# The friction is constructed from PC1 as follows:
#     z = (PC1 - min(PC1)) / (max(PC1) - min(PC1))
#     friction = 1 + (max_friction - 1) * z
#
# This gives a friction of 1 at the minimum of PC1 and a fraction of max_friction
# at the maximum of PC1. We use four values of max_friction: 1, 2, 2.5, and 3.
# Notice that max_friction = 1 means that PC1 plays no role: this is distance
# "as the crow flies" from one municipality centroid to the next.

link_stats %<>%
  mutate(z = (PC1_geography - min(PC1_geography)) / diff(range(PC1_geography)))


# Dijkstra's algorithm ---------------------------------------------------------

# First construct a graph from our dataframe of link_statistics
munigraph <- graph_from_data_frame(link_stats, directed = FALSE)

# The "Epicenter" of paramilitary violence: Valencia, Cordoba
epicenter <- '23855'
# Unweighted graph, no weights -> return "number of hops" between nodes
hops <- distances(munigraph, v = epicenter) # to argument defaults to V(graph)


get_dist_friction <- function(max_friction) {
  dist <- edge_attr(munigraph, 'dist')
  z <- edge_attr(munigraph, 'z')
  friction <- 1 + (max_friction - 1) * z
  my_weights <- dist * friction
  distances(munigraph, v = epicenter, weights = my_weights)
}

max_friction <- c(1, 2, 2.5, 3)
d_geography <- lapply(max_friction, get_dist_friction)
d_geography <- do.call(rbind, d_geography)
d_geography <- as.data.frame(t(d_geography))
names(d_geography) <- paste0('d_geography', max_friction)
d_geography$municipality <- as.numeric(rownames(d_geography))

dist_to_epicenter <- d_geography
dist_to_epicenter$d_hops <- drop(hops)
dist_to_epicenter <- as_tibble(dist_to_epicenter)
dist_to_epicenter <- dist_to_epicenter %>%
  relocate(municipality) %>%
  rename(d_crow = d_geography1)

# Calculate percentiles of distance measures (except hops)
dist_to_epicenter <- dist_to_epicenter %>%
  mutate(across(c('d_crow', starts_with('d_geography')), ~ (rank(.x) / length(.x)),
                .names = 'p_{.col}'))

rm(get_dist_friction, epicenter, max_friction, d_geography, hops,
   link_stats, neighbor_distances, munigraph)

cross_section <- cross_section %>%
  full_join(dist_to_epicenter)

rm(dist_to_epicenter)

# Simplify CO_shape polygons and remove islands --------------------------------

CO_shape <- ms_simplify(CO_shape) # defaults to retaining 5% of points

#-------------------------------------------------------------------------------
# Create abandoned land dataset.

# Note: There was an intermediate hand-cleaning step that was
# omitted between merging raw digitized with AttributeTableFinal.csv and input
# here to remove duplicates; the abandoned_land_handcleaned.csv file is the
# result of that cleaned merge.
#-------------------------------------------------------------------------------


usethis::use_data(CO_shape, overwrite = TRUE)
usethis::use_data(cross_section, overwrite = TRUE)
usethis::use_data(land_distributions, overwrite = TRUE)
usethis::use_data(neighbors, overwrite = TRUE)
usethis::use_data(neighbor_codes, overwrite = TRUE)
usethis::use_data(panel, overwrite = TRUE)

# clean up
rm(list = ls())
fditraglia/forcedMigration documentation built on July 16, 2022, 12:33 p.m.