In litteR: Litter Analysis

logger$info("Entering descriptive statistics section")

Descriptive statistics

Basic statistics

MIN_YEARS <- 5L
MIN_SURVEYS <- 10L

The number of years and the number of surveys for each location_code should not be too small, otherwise the calculations in this report will be less reliable. In addition, the surveys should ideally also be evenly spread in time.

Note that litteR does not enforce a minimum number of years or surveys. That is the responsibility of the user. As a guideline, we advise a minimum of r MIN_YEARS years and r MIN_SURVEYS surveys, evenly distributed in time.

The table below gives the number of surveys and the number of years for each location_code.

if (REGIONAL_ANALYSIS) {
    d <- d_ltr %>%
        filter(type_name == "TC") %>%
        transmute(
            region_code,
            location_code,
            date,
            year = date %>%
                format("%Y"))  %>%
        group_by(region_code, location_code) %>%
        summarise(
            `number of years` = length(unique(year)),
            `number of surveys` = n(),
            .groups = "drop") 
    d %>%
        kable(align = "llrr")
} else {
    d <- d_ltr %>%
        filter(type_name == "TC") %>%
        transmute(
            location_code,
            date,
            year = date %>%
                format("%Y"))  %>%
        group_by(location_code) %>%
        summarise(
            `number of years` = length(unique(year)),
            `number of surveys` = n(),
            .groups = "drop") 
    d %>%
        kable(align = "lrr")
}

if (any(d %>% chuck("number of years") < MIN_YEARS)) {
    location_code <- d %>%
        filter(`number of years` < MIN_YEARS) %>%
        chuck("location_code")
    logger$warn(
        str_glue("The number of years may be insufficient (<{MIN_YEARS} years) for: "),
        location_code %>% sQuote %>% enumerate)
}    
if (any(d %>% chuck("number of surveys") < MIN_SURVEYS)) {
    location_code <- d %>%
        filter(`number of surveys` < MIN_SURVEYS) %>%
        chuck("location_code")
    logger$warn(
        str_glue("The number of surveys may be insufficient (<{MIN_SURVEYS} surveys) for: "),
        location_code %>% sQuote %>% enumerate)
}    
if (any(d %>% chuck("number of surveys") < 3L)) {
    location_code <- d %>%
        filter(`number of surveys` < 3L) %>%
        chuck("location_code")
    logger$warn(
        "The number of surveys is insufficient (<3 surveys) to estimate the regional trend for: ",
        location_code %>% sQuote %>% enumerate)
}

For each location code and group/type name, the following statistics have been estimated for the period r str_c(DATE_FROM, DATE_TO, sep = " to "):

mean count (mean):, i.e., the arithmetic mean of the counts for each litter type;
median count (median), i.e., the median of the counts for each litter type;
relative count (%TC): the contribution of each litter type to the total count of litter types (%);
coefficient of variation (cv): the ratio of the standard deviation to the mean of the counts for each litter type (expressed as a fraction);
ratio of mad and median (rmad, expressed as a fraction);
number of surveys (n);
Theil-Sen slope (slope): a robust non-parametric estimator of slope (litter counts / year);
p-value: the p-value associated with the one-tailed Mann-Kendall test to test the null hypothesis of
- no monotonically increasing trend in case the Theil-Sen slope is greater than zero;
- no monotonically decreasing trend in case the Theil-Sen slope is smaller than zero;

These statistics will be estimated for litter types with the greatest counts making up r PERCENTAGE_TOTAL_COUNT% of the total count for each location and for all groups specified in r sQuote(path_file(FILE_TYPES)).

These statistics have been stored in file r sQuote(path_file(FILE_STATS)).

logger$info("Creating table with litter statistics")
d_stats <- d_ltr %>%
    group_by(location_code, type_name, topx) %>%
    summarise(
        from = format(min(date)),
        to = format(max(date)),
        rel_count = mean(rel_count),
        mean = mean(count),
        median = median(count),
        cv = cv(count),
        rmad = rmad(count),
        n = n(),
        ts = list(theil_sen(date, count)),
        b0 = ts %>%
          first %>%
          intercept,
        b1 = ts %>%
          first %>%
          slope,
        slope = b1 * 365.25,
        p_value = if_else(
            are_na(b1),
            NA_real_,
            if_else(
                b1 > 0,
                p_value(mann_kendall(count, type = "increasing")),
                p_value(mann_kendall(count, type = "decreasing"))
            )
        ),
        .groups = "drop") %>%
    select(-ts)

# store stats in a CSV-file
d <- d_stats %>%
    filter(topx | are_na(topx)) %>% # only top x% or groups
    select(location_code, from, to, type_name,
           rel_count, mean, median, cv, rmad, n, slope, p_value) %>%
    arrange(location_code, desc(rel_count)) %>%
    rename(`%TC` = rel_count)

d %>%
    rename(`type/group_name` = type_name) %>%
    mutate(across(where(is_double) & !p_value, ~ formatC(.x, format = "fg", digits = 4))) %>%
    mutate(p_value = formatC(p_value, format = "f", digits = 4)) %>%
    mutate(across(where(is_character), ~ str_remove(.x, "^ +"))) %>%
    write_csv(FILE_STATS)

logger$info("Table with litter statistics created")

The statistics for the litter groups are given in the table below. These group statistics are based on all litter types and not only on those types with the highest counts.

d %>%
    filter(type_name == str_to_upper(type_name)) %>% # group codes
    rename(group_code = type_name) %>%
    mutate(across(where(is_double) & !p_value, ~ formatC(.x, format = "fg", digits = 4))) %>%
    mutate(p_value = formatC(p_value, format = "f", digits = 4)) %>%
    rename(`p-value` = p_value) %>%
    kable(align = "llllrrrrrrrr")

The figures below show for each location code the median count for each group.

d_stats %>%
    select(location_code, type_name, median) %>%
    filter(type_name == str_to_upper(type_name)) %>% # group codes
    arrange(location_code) %>%
    split(.$location_code) %>%
    walk(function(x) {
        x$type_name <- x %>%
            chuck("type_name") %>%
            factor(
                levels = d_type %>%
                    chuck("group_code") %>%
                    levels %>%
                    rev,
                ordered = TRUE)
      g <- ggplot(data = x) +
        geom_col(mapping = aes(x = type_name, y = median), fill = "blue", alpha = 0.5) +
        scale_x_discrete(name = "") +
        scale_y_continuous(name = "median count") +
        coord_flip() +
        ggtitle(x$location_code[1])
      print(g)
    })

Top 10

The table below gives for each location the top 10 of litter types, i.e., the 10 litter types with the highest median litter counts.

logger$info("Creating table with top 10 of litter types for each location")

d <- d_stats %>%
    select(location_code, type_name, median) %>%
    filter(type_name != str_to_upper(type_name)) %>% # remove group codes
    group_split(location_code) %>%
    map_df(function(x) {
        x %>%
            arrange(desc(median)) %>%
            slice(1:10) %>%
            mutate(rank = 1:n())}) %>%
    select(location_code, rank, type_name, `median count` = median) 

d %>%
    kable(align = "lllr", digits = 1)

The figure(s) below show(s) for each location the top 10 of litter types.

logger$info("Creating figures with top 10 of litter types for each location")

d %>%
    group_split(location_code) %>%
    walk(function(x) {
        x$type_name <- factor(
            x = x$type_name,
            levels = rev(x$type_name),
            ordered = TRUE)
        g <- ggplot(data = x) +
            geom_col(
                mapping = aes(x = type_name, y = `median count`),
                fill = "blue", alpha = 0.5) +
            scale_x_discrete(name = "") +
            coord_flip() +
            ggtitle(x$location_code[1])
        print(g)
    })