demo/inttest-bootstrap.R

# This script demonstrates bootstrap sampling an area
#
# ok and ok_group are there so we can run this code and verify it is correct,
# and not needed every-day use.
#
# NB: if you want to run this, you must be using mfdb-workspace, also be warned
# any data stored in your database will be destroyed.
library(unittest)
library(mfdb)
source('tests/utils/helpers.R')
source('tests/utils/inttest-helpers.R')

# Empty database
if (exists("mdb")) mfdb_disconnect(mdb)
mfdb(gsub("inttest", "inttest-bootstrap", Sys.getenv('INTTEST_SCHEMA', 'inttest')), db_params = db_params, destroy_schema = TRUE)
mdb <- mfdb(gsub("inttest", "inttest-bootstrap", Sys.getenv('INTTEST_SCHEMA', 'inttest')), db_params = db_params, save_temp_tables = FALSE)

ok_group("Bootstrap samples of area sizes and lengths", {
    # Set-up areas/divisions
    mfdb_import_area(mdb, data.frame(
        name = c('45G01', '45G02', '45G03'),
        division = c('divA', 'divA', 'divB'),
        size = c(10,200,400)))

    # Import data for each areacell
    mfdb_import_survey(mdb,
        data_source = 'cell1',
        data.frame(
            year = c('1998'),
            month = c(1:12),
            areacell = c('45G01'),
            species = c('COD'),
            length = c(110,150,130,110,135,146, 165,162,136,135,134,122)))
    mfdb_import_survey(mdb,
        data_source = 'cell2',
        data.frame(
            year = c('1998'),
            month = c(1:12),
            areacell = c('45G02'),
            species = c('COD'),
            length = c(210,250,230,210,235,246, 265,262,236,235,234,222)))
    mfdb_import_survey(mdb,
        data_source = 'cell3',
        data.frame(
            year = c('1998'),
            month = c(1:12),
            areacell = c('45G03'),
            species = c('COD'),
            length = c(310,350,330,310,335,346, 365,362,336,335,334,322)))

    # We want 1 area, that's a combination of A and B
    area_group <- mfdb_group(all = c("divA", "divB"))
    # We want to take 5 samples "all" with replacement
    # We fix the seed to 35, so we get the same samples every time this script is run
    area_bootstrap_group <- mfdb_bootstrap_group(5, area_group, seed = 35)
    agg <- mfdb_area_size(mdb, params = list(area = area_bootstrap_group))

    # Instead of the normal 1, we get back 5 data frames
    ok(cmp(length(agg), 5), "Got 5 distinct groupings")

    # In the first case, all = divB & divA
    ok(cmp(agg[[1]], structure(
        data.frame(
            area = 'all',
            size = 400 + (10 + 200),  # divB + divA
            stringsAsFactors = FALSE
        ),
        area = list(all = c("divB", "divA")),
        generator = "mfdb_area_size"
    )), "divB, divA")

    # In the second, all = divB & divB
    ok(cmp(agg[[2]], structure(
        data.frame(
            area = 'all',
            size = 400 + 400,  # divB + divB
            stringsAsFactors = FALSE
        ),
        area = list(all = c("divB", "divB")),
        generator = "mfdb_area_size"
    )), "divB, divB")

    ok(cmp(agg[[3]], structure(
        data.frame(
            area = 'all',
            size = (10 + 200) + (10 + 200),
            stringsAsFactors = FALSE
        ),
        area = list(all = c("divA", "divA")),
        generator = "mfdb_area_size"
    )), "divA, divA")

    ok(cmp(agg[[4]], structure(
        data.frame(
            area = 'all',
            size = (10 + 200) + 400,
            stringsAsFactors = FALSE
        ),
        area = list(all = c("divA", "divB")),
        generator = "mfdb_area_size"
    )), "divA, divB")

    ok(cmp(agg[[5]], structure(
        data.frame(
            area = 'all',
            size = 400 + 400,
            stringsAsFactors = FALSE
        ),
        area = list(all = c("divB", "divB")),
        generator = "mfdb_area_size"
    )), "divB, divB")

    # Can do the same with counts
    length_group <- mfdb_step_interval('len', 50)
    agg <- mfdb_sample_count(mdb, c('length'), params = list(
        area = area_bootstrap_group,
        length = length_group,
        null = NULL))
    ok(cmp(length(agg), 5), "Got 5 distinct groupings")
    ok(cmp(agg[["0.0.1.0"]], structure(
        data.frame(
            year = 'all',
            step = 'all',
            area = 'all',
            length = c('len100', 'len150', 'len200', 'len250', 'len300', 'len350'),
            number = c(9, 3, 9, 3, 9, 3),
            stringsAsFactors = FALSE),
        year = list(all = as.integer(c(1998, 1998))),
        step = list(all = as.integer(c(1, 12))),
        area = list(all = c("divB", "divA")),
        length = list(
            len0   = structure(call("seq",   0,  49), min =   0, max =  50),
            len50  = structure(call("seq",  50,  99), min =  50, max = 100),
            len100 = structure(call("seq", 100, 149), min = 100, max = 150),
            len150 = structure(call("seq", 150, 199), min = 150, max = 200),
            len200 = structure(call("seq", 200, 249), min = 200, max = 250),
            len250 = structure(call("seq", 250, 299), min = 250, max = 300),
            len300 = structure(call("seq", 300, 349), min = 300, max = 350),
            len350 = structure(call("seq", 350, 399), min = 350, max = 400)),
        generator = "mfdb_sample_count")), "divB, divA")
    ok(cmp(agg[["0.0.2.0"]], structure(
        data.frame(
            year = 'all',
            step = 'all',
            area = 'all',
            length = c('len300', 'len350'),
            number = c(18, 6),
            stringsAsFactors = FALSE),
        year = list(all = as.integer(c(1998, 1998))),
        step = list(all = as.integer(c(1, 12))),
        area = list(all = c("divB", "divB")),
        length = list(
            len0   = structure(call("seq",   0,  49), min =   0, max =  50),
            len50  = structure(call("seq",  50,  99), min =  50, max = 100),
            len100 = structure(call("seq", 100, 149), min = 100, max = 150),
            len150 = structure(call("seq", 150, 199), min = 150, max = 200),
            len200 = structure(call("seq", 200, 249), min = 200, max = 250),
            len250 = structure(call("seq", 250, 299), min = 250, max = 300),
            len300 = structure(call("seq", 300, 349), min = 300, max = 350),
            len350 = structure(call("seq", 350, 399), min = 350, max = 400)),
        generator = "mfdb_sample_count")), "divB, divB")
    ok(cmp(agg[["0.0.3.0"]], structure(
        data.frame(
            year = 'all',
            step = 'all',
            area = 'all',
            length = c('len100', 'len150', 'len200', 'len250'),
            number = c(18, 6, 18, 6),
            stringsAsFactors = FALSE),
        year = list(all = as.integer(c(1998, 1998))),
        step = list(all = as.integer(c(1, 12))),
        area = list(all = c("divA", "divA")),
        length = list(
            len0   = structure(call("seq",   0,  49), min =   0, max =  50),
            len50  = structure(call("seq",  50,  99), min =  50, max = 100),
            len100 = structure(call("seq", 100, 149), min = 100, max = 150),
            len150 = structure(call("seq", 150, 199), min = 150, max = 200),
            len200 = structure(call("seq", 200, 249), min = 200, max = 250),
            len250 = structure(call("seq", 250, 299), min = 250, max = 300)),
        generator = "mfdb_sample_count")), "divA, divA")
    ok(cmp(agg[["0.0.4.0"]], structure(
        data.frame(
            year = 'all',
            step = 'all',
            area = 'all',
            length = c('len100', 'len150', 'len200', 'len250', 'len300', 'len350'),
            number = c(9, 3, 9, 3, 9, 3),
            stringsAsFactors = FALSE),
        year = list(all = as.integer(c(1998, 1998))),
        step = list(all = as.integer(c(1, 12))),
        area = list(all = c("divA", "divB")),
        length = list(
            len0   = structure(call("seq",   0,  49), min =   0, max =  50),
            len50  = structure(call("seq",  50,  99), min =  50, max = 100),
            len100 = structure(call("seq", 100, 149), min = 100, max = 150),
            len150 = structure(call("seq", 150, 199), min = 150, max = 200),
            len200 = structure(call("seq", 200, 249), min = 200, max = 250),
            len250 = structure(call("seq", 250, 299), min = 250, max = 300),
            len300 = structure(call("seq", 300, 349), min = 300, max = 350),
            len350 = structure(call("seq", 350, 399), min = 350, max = 400)),
        generator = "mfdb_sample_count")), "divA, divB")
    ok(cmp(agg[["0.0.5.0"]], structure(
        data.frame(
            year = 'all',
            step = 'all',
            area = 'all',
            length = c('len300', 'len350'),
            number = c(18, 6),
            stringsAsFactors = FALSE),
        year = list(all = as.integer(c(1998, 1998))),
        step = list(all = as.integer(c(1, 12))),
        area = list(all = c("divB", "divB")),
        length = list(
            len0   = structure(call("seq",   0,  49), min =   0, max =  50),
            len50  = structure(call("seq",  50,  99), min =  50, max = 100),
            len100 = structure(call("seq", 100, 149), min = 100, max = 150),
            len150 = structure(call("seq", 150, 199), min = 150, max = 200),
            len200 = structure(call("seq", 200, 249), min = 200, max = 250),
            len250 = structure(call("seq", 250, 299), min = 250, max = 300),
            len300 = structure(call("seq", 300, 349), min = 300, max = 350),
            len350 = structure(call("seq", 350, 399), min = 350, max = 400)),
        generator = "mfdb_sample_count")), "divB, divB")

    agg <- mfdb_sample_rawdata(mdb, c('length'), params = list(
        area = area_bootstrap_group,
        length = length_group,
        null = NULL))
    ok(cmp(attr(agg[['0.0.1.0']], 'area'), list(all = c("divB", "divA"))), "area agg_summary for bs 1")
    ok(cmp(attr(agg[['0.0.2.0']], 'area'), list(all = c("divB", "divB"))), "area agg_summary for bs 2")
    ok(cmp(attr(agg[['0.0.3.0']], 'area'), list(all = c("divA", "divA"))), "area agg_summary for bs 3")
    ok(cmp(attr(agg[['0.0.4.0']], 'area'), list(all = c("divA", "divB"))), "area agg_summary for bs 4")
    ok(cmp(attr(agg[['0.0.5.0']], 'area'), list(all = c("divB", "divB"))), "area agg_summary for bs 5")
    ok(cmp(sum(agg[['0.0.1.0']]$length == 'len300'),  9), "len300/divB appears once in bs 1")
    ok(cmp(sum(agg[['0.0.2.0']]$length == 'len300'), 18), "len300/divB appears twice in bs 2")
    ok(cmp(sum(agg[['0.0.3.0']]$length == 'len300'),  0), "len300/divB does not appear in bs 3")
    ok(cmp(sum(agg[['0.0.4.0']]$length == 'len300'),  9), "len300/divB appears once in bs 4")
    ok(cmp(sum(agg[['0.0.5.0']]$length == 'len300'), 18), "len300/divB appears twice in bs 5")

    ok(cmp(
        sort(agg[['0.0.1.0']][c(agg[['0.0.1.0']]$length == 'len300'), c('raw_length')]),
        sort(c(310, 330, 310, 335, 346, 336, 335, 334, 322))
    ), "Contents of len300/divB match")
})

ok_group("Bootstrapped empty results", {
    # Take 5 samples of an empty region
    area_group <- mfdb_group(all = c("aardvark"))
    area_bootstrap_group <- mfdb_bootstrap_group(5, area_group, seed = 35)
    agg <- mfdb_area_size(mdb, params = list(area = area_bootstrap_group))
    ok(cmp(names(agg), c(
        "0", "1", "2", "3", "4",
        NULL)), "Still got 5 data.frames")

    area_group <- mfdb_group(all = c("aardvark"))
    area_bootstrap_group <- mfdb_bootstrap_group(5, area_group, seed = 35)
    agg <- mfdb_sample_meanlength(mdb, c(), params = list(area = area_bootstrap_group))
    ok(cmp(names(agg), c(
        "0.0.0", "0.0.1", "0.0.2", "0.0.3", "0.0.4",
        NULL)), "Still got 5 data.frames")
})

mfdb_disconnect(mdb)

Try the mfdb package in your browser

Any scripts or data that you put into this service are public.

mfdb documentation built on June 21, 2022, 5:07 p.m.