Process the brain cancer data

This file shows how to reshape the New Mexico brain cancer data from the R package rsatscan [@rsatscan] to a format suitable for the scanstatistics package.

We begin by loading the needed packages and the data, which comes as these three data frames:

library(rsatscan)
library(data.table)

NM_pop <- as.data.table(rsatscan::NMpop)
NM_cas <- as.data.table(rsatscan::NMcas)

To get more familiar with the counties of New Mexico, let's plot them on a map:

library(ggplot2)
library(magrittr)
library(dplyr)

# Grab polygon data for plotting
NM_map <- map_data("county", "new mexico")

# Calculate geographical centroids from a polygon
# See: https://en.wikipedia.org/wiki/Centroid#Centroid_of_polygon
polygon_centroid <- function(x0, y0) {
  x1 <- c(x0[-1], x0[1])
  y1 <- c(y0[-1], y0[1])
  A <- sum(x0 * y1 - x1 * y0) / 2
  Cx <- sum((x0 + x1) * (x0 * y1 - x1 * y0)) / (6 * A)
  Cy <- sum((y0 + y1) * (x0 * y1 - x1 * y0)) / (6 * A)
  c(long = Cx, lat = Cy)
}
pgc <- function(df) polygon_centroid(as.numeric(df$long), as.numeric(df$lat))

centroids <- data.frame(subregion = unique(NM_map$subregion), long = NA, lat = NA)

# Calculate geographic centroids for each county
for (i in 1:nrow(centroids)) {
  centroids[i, c("long", "lat")] <- NM_map %>%
    filter(subregion == centroids$subregion[i]) %>%
    pgc
}

# Plot map with labels at centroids
ggplot() + 
  geom_polygon(data = NM_map,
               mapping = aes(x = long, y = lat, group = group),
               color = "white", fill = "grey") +
  geom_text(data = centroids, 
            mapping = aes(x = long, y = lat, label = subregion)) +
  ggtitle("Counties of New Mexico") +
  theme_bw()

Before proceeding, we will make some changes to the data to ensure that the counties are named the same across all tables. The main functions of the scanstatistic package require locations to be encoded as integers. Thus we will make the county names into a factor variable, which are integer vectors with additional attributes. All levels of the factor should also be present in the data.

library(stringr)
# library(plyr, warn.conflicts = FALSE)

# This function corrects inconsistencies in the county names
format_counties <- function(x) {
  x %>%
    tolower %>%
    str_replace_all(pattern = " ", replacement = "") %>%
    str_replace_all(pattern = "ñ", replacement = "n") %>%
    str_replace_all(pattern = "guadelupe", replacement = "guadalupe")
}

# Make the county factor variable
all_counties <- NM_map$subregion %>%
  unique %>%
  format_counties %>%
  as.factor

# Character vector without Cibola county
counties <- levels(all_counties)[-which(levels(all_counties) == "cibola")]

# This function corrects inconsistencies in x and makes it into a factor
factorize_counties <- function(x, fac = counties) {
  x %>%
    format_counties %>%
    factor(levels = c(fac, "cibola"))
}

# Fix inconsistencies and factorize counties in the data
centroids %<>% 
  mutate(county = format_counties(subregion),
         county = factorize_counties(county),
         center_long = long, center_lat = lat,
         long = NULL, lat = NULL,
         subregion = NULL)
NM_cas %<>% mutate(county = factorize_counties(county)) %>% as.data.table
NM_pop %<>% mutate(county = factorize_counties(county))
NM_map %<>% mutate(county = factorize_counties(subregion)) %>% as.data.table
NM_pop %<>% mutate(year = year + 1900) %>% as.data.table

Scan statistics aim to find localized anomalies in the data, meaning that the locations and timepoints involved in the anomaly are close together in some way. In this example, we will measure the spatial similarity using the spherical distance between the county seats. In other circumstances, we might use the geographic centroids (calculated above), or another distance measure. So let's grab the coordinates of the county seats from Wikipedia and plot them. Kudos to Cory Nissen for this blogpost, which shows how to fetch data from a table on Wikipedia.

library(rvest)

# Function to clean table entries
clean_entry <- function(x) {
  x %>%
    gsub("\\d+♠", "",  .) %>% 
    gsub("^[^\\(]+\\(", "", .) %>%
    gsub("\\,", "", .) %>%
    str_split(pattern = "[:blank:]") %>%
    lapply(function(x) x[1]) %>%
    unlist %>%
    as.numeric
    # gsub("\\ [:alpha:]+", "", .)
    # gsub("\ km2\\)", "", .) %>%
    # gsub("\ km²\\)", "", .)
}

# Grab the table from Wikipedia (as at 2017-08-01)
seat_url <- "https://en.wikipedia.org/wiki/List_of_counties_in_New_Mexico"
NM_geo <- seat_url %>%
  read_html %>%
  rvest::html_nodes(xpath = '//*[@id="mw-content-text"]/div/table[3]') %>%
  html_table %>%
  .[[1]] %>%
  select(County, `County seat[3]`, `Population[6]`, `Area[3][7]`) %>%
  rename(county = County, 
         seat = `County seat[3]`, 
         population = `Population[6]`, 
         `area(km2)` = `Area[3][7]`) %>%
  mutate_at(.vars = c("population", "area(km2)"), .funs = clean_entry) %>%
  mutate(county = gsub(" County", "", county)) %>%
  mutate(county = factorize_counties(county, counties)) %>%
  as.data.table


# Get the coordinates for the New Mexico county seats using ggmap::geocode
get_NM_longlat <- function(seats) {
  suppressMessages({
    res <- t(sapply(seats, function(x) t(ggmap::geocode(paste0(x, ", New Mexico")))))
  })
  data.frame(seat_long = res[, 1], seat_lat = res[, 2])
}

NM_geo <- cbind(NM_geo, get_NM_longlat(NM_geo$seat))
NM_geo <- merge(NM_geo, centroids, by = "county")

ggplot() + 
  geom_polygon(data = NM_map,
               mapping = aes(x = long, y = lat, group = group),
               color = "white", fill = "grey") +
  geom_point(data = NM_geo, 
             mapping = aes(x = seat_long, y = seat_lat)) +
  ggtitle("County seats of New Mexico") +
  theme_bw()

For the scan statistic functions in this package to work, locations and timepoints with zero counts must be filled in as such, and not left out. Thus, we will add the needed zeros to the case data. We will only consider the total number of cancer cases in each county and year, so we also take the oppotunity to aggregate the case counts over each age group and sex:

# Aggregate population and cases over sex and age group
NM_cas <- NM_cas[, .(count = sum(cases)), by = .(year, county)]
NM_pop <- NM_pop[, .(population = sum(population)), by = .(year, county)]

# Create the complete, sorted table of counties, years, age groups, and sex
combo_YC <- as.data.table(expand.grid(year = unique(NM_cas$year), 
                                      county = counties))

# Set keys for merge
setkeyv(NM_cas, c("year", "county"))
setkeyv(NM_pop, c("year", "county"))
setkeyv(combo_YC, c("year", "county"))

# Merge data
NM_cas <- merge(NM_cas, combo_YC, all = TRUE)
NM_pop <- merge(NM_pop, combo_YC, all = TRUE)

# Fill in zero case counts
NM_cas[is.na(count), count := 0]

To demonstrate the scan statistic methods of this package, we will try to detect brain cancer clusters in the years 1986--1989, using data from the previous years to predict how many cancer cases we can expect to see if there is no anomalous increase in incidence. In this particular example, we want to use the available population size data as offsets to our predicted means, in order to properly compare the occurence of cancer in the different counties. Since the population counts are only available for three of the years, we will interpolate the population for the remaining years by fitting a quadratic function of time to the population data for each county after aggregating the number of cancer cases over age group and gender[^1]. Though this leads to unrealistically smooth population changes, it is adequate for the purpose of demonstrating the functions in the scanstatistics package.

[^1]: One could also fit the quadratic function for each county, age group and sex, and then aggregate over the latter two categories in the hope that this would cancel out the errors made in each interpolation. However, this seems to lead to some unreasonable year-to-year population changes for some of the counties.

NM_pop[, population := as.numeric(population)]

# Interpolate county populations
for (s in counties) {
  pop_mod <- lm(population ~ 1 + I(scale(year)) + I(scale(year)^2), 
                data = NM_pop[county == s, ])
  pop_pred <- predict(pop_mod, NM_pop[is.na(population) & county == s, ], 
                      type = "response")
  NM_pop[is.na(population) & county == s, population := pop_pred]
}

Finally, we join the population and case tables and plot how the number of cases evolve in each county over the years:

NM_popcas <- merge(NM_pop, NM_cas)
setkeyv(NM_popcas, c("county", "year"))

ggplot(NM_popcas) + geom_line(aes(x = year, y = count, color = county)) +
  theme(legend.position = "none")

The data tables NM_popcas, NM_geo and NM_map are then saved as those available in the scanstatistics package, after converting them to data frames:

NM_popcas %<>% as.data.frame
NM_geo %<>% as.data.frame
NM_map %<>% as.data.frame


BenjaK/scanstatistics documentation built on May 5, 2019, 2:41 p.m.