logger$info("Entering descriptive statistics section")



Descriptive statistics

Basic statistics

MIN_YEARS <- 5L
MIN_SURVEYS <- 10L

The number of years and the number of surveys for each location_code should not be too small, otherwise the calculations in this report will be less reliable. In addition, the surveys should ideally also be evenly spread in time.

Note that litteR does not enforce a minimum number of years or surveys. That is the responsibility of the user. As a guideline, we advise a minimum of r MIN_YEARS years and r MIN_SURVEYS surveys, evenly distributed in time.

The table below gives the number of surveys and the number of years for each location_code.



if (REGIONAL_ANALYSIS) {
    d <- d_ltr %>%
        filter(type_name == "TC") %>%
        transmute(
            region_code,
            location_code,
            date,
            year = date %>%
                format("%Y"))  %>%
        group_by(region_code, location_code) %>%
        summarise(
            `number of years` = length(unique(year)),
            `number of surveys` = n(),
            .groups = "drop") 
    d %>%
        kable(align = "llrr")
} else {
    d <- d_ltr %>%
        filter(type_name == "TC") %>%
        transmute(
            location_code,
            date,
            year = date %>%
                format("%Y"))  %>%
        group_by(location_code) %>%
        summarise(
            `number of years` = length(unique(year)),
            `number of surveys` = n(),
            .groups = "drop") 
    d %>%
        kable(align = "lrr")
}


if (any(d %>% chuck("number of years") < MIN_YEARS)) {
    location_code <- d %>%
        filter(`number of years` < MIN_YEARS) %>%
        chuck("location_code")
    logger$warn(
        str_glue("The number of years may be insufficient (<{MIN_YEARS} years) for: "),
        location_code %>% sQuote %>% enumerate)
}    
if (any(d %>% chuck("number of surveys") < MIN_SURVEYS)) {
    location_code <- d %>%
        filter(`number of surveys` < MIN_SURVEYS) %>%
        chuck("location_code")
    logger$warn(
        str_glue("The number of surveys may be insufficient (<{MIN_SURVEYS} surveys) for: "),
        location_code %>% sQuote %>% enumerate)
}    
if (any(d %>% chuck("number of surveys") < 3L)) {
    location_code <- d %>%
        filter(`number of surveys` < 3L) %>%
        chuck("location_code")
    logger$warn(
        "The number of surveys is insufficient (<3 surveys) to estimate the regional trend for: ",
        location_code %>% sQuote %>% enumerate)
}    



For each location code and group/type name, the following statistics have been estimated for the period r str_c(DATE_FROM, DATE_TO, sep = " to "):

These statistics will be estimated for litter types with the greatest counts making up r PERCENTAGE_TOTAL_COUNT% of the total count for each location and for all groups specified in r sQuote(path_file(FILE_TYPES)).

These statistics have been stored in file r sQuote(path_file(FILE_STATS)).

logger$info("Creating table with litter statistics")
d_stats <- d_ltr %>%
    group_by(location_code, type_name, topx) %>%
    summarise(
        from = format(min(date)),
        to = format(max(date)),
        rel_count = mean(rel_count),
        mean = mean(count),
        median = median(count),
        cv = cv(count),
        rmad = rmad(count),
        n = n(),
        ts = list(theil_sen(date, count)),
        b0 = ts %>%
          first %>%
          intercept,
        b1 = ts %>%
          first %>%
          slope,
        slope = b1 * 365.25,
        p_value = if_else(
            are_na(b1),
            NA_real_,
            if_else(
                b1 > 0,
                p_value(mann_kendall(count, type = "increasing")),
                p_value(mann_kendall(count, type = "decreasing"))
            )
        ),
        .groups = "drop") %>%
    select(-ts)

# store stats in a CSV-file
d <- d_stats %>%
    filter(topx | are_na(topx)) %>% # only top x% or groups
    select(location_code, from, to, type_name,
           rel_count, mean, median, cv, rmad, n, slope, p_value) %>%
    arrange(location_code, desc(rel_count)) %>%
    rename(`%TC` = rel_count)

d %>%
    rename(`type/group_name` = type_name) %>%
    mutate(across(where(is_double) & !p_value, ~ formatC(.x, format = "fg", digits = 4))) %>%
    mutate(p_value = formatC(p_value, format = "f", digits = 4)) %>%
    mutate(across(where(is_character), ~ str_remove(.x, "^ +"))) %>%
    write_csv(FILE_STATS)

logger$info("Table with litter statistics created")

The statistics for the litter groups are given in the table below. These group statistics are based on all litter types and not only on those types with the highest counts.



d %>%
    filter(type_name == str_to_upper(type_name)) %>% # group codes
    rename(group_code = type_name) %>%
    mutate(across(where(is_double) & !p_value, ~ formatC(.x, format = "fg", digits = 4))) %>%
    mutate(p_value = formatC(p_value, format = "f", digits = 4)) %>%
    rename(`p-value` = p_value) %>%
    kable(align = "llllrrrrrrrr")



The figures below show for each location code the median count for each group.

d_stats %>%
    select(location_code, type_name, median) %>%
    filter(type_name == str_to_upper(type_name)) %>% # group codes
    arrange(location_code) %>%
    split(.$location_code) %>%
    walk(function(x) {
        x$type_name <- x %>%
            chuck("type_name") %>%
            factor(
                levels = d_type %>%
                    chuck("group_code") %>%
                    levels %>%
                    rev,
                ordered = TRUE)
      g <- ggplot(data = x) +
        geom_col(mapping = aes(x = type_name, y = median), fill = "blue", alpha = 0.5) +
        scale_x_discrete(name = "") +
        scale_y_continuous(name = "median count") +
        coord_flip() +
        ggtitle(x$location_code[1])
      print(g)
    })



Top 10

The table below gives for each location the top 10 of litter types, i.e., the 10 litter types with the highest median litter counts.

logger$info("Creating table with top 10 of litter types for each location")

d <- d_stats %>%
    select(location_code, type_name, median) %>%
    filter(type_name != str_to_upper(type_name)) %>% # remove group codes
    group_split(location_code) %>%
    map_df(function(x) {
        x %>%
            arrange(desc(median)) %>%
            slice(1:10) %>%
            mutate(rank = 1:n())}) %>%
    select(location_code, rank, type_name, `median count` = median) 

d %>%
    kable(align = "lllr", digits = 1)



The figure(s) below show(s) for each location the top 10 of litter types.

logger$info("Creating figures with top 10 of litter types for each location")

d %>%
    group_split(location_code) %>%
    walk(function(x) {
        x$type_name <- factor(
            x = x$type_name,
            levels = rev(x$type_name),
            ordered = TRUE)
        g <- ggplot(data = x) +
            geom_col(
                mapping = aes(x = type_name, y = `median count`),
                fill = "blue", alpha = 0.5) +
            scale_x_discrete(name = "") +
            coord_flip() +
            ggtitle(x$location_code[1])
        print(g)
    })


Try the litteR package in your browser

Any scripts or data that you put into this service are public.

litteR documentation built on Aug. 27, 2022, 1:05 a.m.