Output Functions

context("test-write_bed")

# write_bed is complicated, deserves its own section

# some simple tests don't depend on BEDMatrix being around...

# simulate X (to share across tests)
# create a simple matrix with random valid data
# have n != m so we can check dimensions within write_plink
n <- 5
m <- 10
miss <- 0.1 # add missingness for good measure
fo <- tempfile('delete-me-random-test') # output name without extensions!
fo_bed <- add_ext(fo, 'bed') # output with extension
# create ancestral allele frequencies
p_anc <- runif(m)
# create genotypes
X <- rbinom(n*m, 2, p_anc)
# add missing values
X[sample(n*m, n*m*miss)] <- NA
# turn into matrix
X <- matrix(X, nrow = m, ncol = n)
# genotypes with names (trivial in these cases, to match autogenerated values)
X_named <- X
rownames(X_named) <- 1:m
colnames(X_named) <- 1:n

# simulate phenotype (to share across tests)
pheno <- rnorm(n)

test_that("write_bed and read_bed work", {
    # test that there are errors when crucial data is missing
    expect_error(write_bed()) # all is missing
    expect_error(write_bed('file')) # X is missing
    expect_error(write_bed(X = matrix(NA, 1, 1))) # file is missing
    # die if X is not a matrix!
    expect_error(write_bed('file', 'not-matrix'))
    
    # this should work
    expect_silent(
        write_bed(fo, X, verbose = FALSE)
    )

    # read tests
    # parse data back, verify agreement!
    expect_silent(
        X2 <- read_bed(fo, m_loci = m, n_ind = n, verbose = FALSE)
    )
    expect_equal(X, X2)
    
    # test that we can read same file when extension is not BED
    fo_not_bed <- add_ext(fo, 'not') # output with wrong extension
    file.copy( fo_bed, fo_not_bed )
    expect_silent(
        X3 <- read_bed(fo_not_bed, m_loci = m, n_ind = n, verbose = FALSE)
    )
    expect_equal(X, X3)
    # cleanup
    rm( X3 )
    invisible( file.remove( fo_not_bed ) )
    if ( file.exists( fo_not_bed ) )
        stop( 'Could not delete: ', fo_not_bed )
    
    # errors for missing params
    expect_error( read_bed() ) # missing all
    expect_error( read_bed(m_loci = m, n_ind = n) ) # missing file
    expect_error( read_bed(fo, n_ind = n) ) # missing m_loci
    expect_error( read_bed(fo, m_loci = m) ) # missing n_ind
    # error tests for bad dimensions
    expect_error( read_bed(fo, m_loci = n,   n_ind = m) ) # reversed dimensions: padding checks and padding mismatches catch these!
    expect_error( read_bed(fo, m_loci = m+1, n_ind = n) )
    expect_error( read_bed(fo, m_loci = m-1, n_ind = n) )
    expect_error( read_bed(fo, m_loci = m,   n_ind = n-1) )
    expect_error( read_bed(fo, m_loci = m,   n_ind = n+4) ) # have to be off by a whole byte to notice some of these errors
    # file path that doesn't exist
    expect_error( read_bed( 'doesnt-exist', m_loci = m, n_ind = n ) )
    # very long non-existent file path
    expect_error( read_bed( '/made/up/long/path/doesnt-exist_doesnt-exist_doesnt-exist_doesnt-exist_doesnt-exist', m_loci = m, n_ind = n ) )
    
    # delete output when done
    invisible(file.remove(fo_bed))
    # make sure deletion worked
    if ( file.exists( fo_bed ) )
        stop('Could not delete: ', fo_bed)
    
    # now add invalid values to X, make sure it dies!
    X2 <- X
    # add a negative value in a random location
    X2[sample(n*m, 1)] <- -1
    # actually die!
    expect_error( write_bed(fo, X2) )
    # make sure output doesn't exist anymore (code should automatically clean it up)
    expect_false( file.exists(fo_bed) )

    # another test
    X2 <- X
    # add a 3 in a random location
    X2[sample(n*m, 1)] <- 3
    # actually die!
    expect_error( write_bed(fo, X2) )
    # make sure output doesn't exist anymore (code should automatically clean it up)
    expect_false( file.exists(fo_bed) )

    # NOTE: if X contains values that truncate to the correct range (say, 1.5, which becomes 1 upon truncation), then that's what Rcpp does internally and no errors are raised!
})

test_that("write_bed with `append = TRUE` works", {

    # here's the awkward part, where we write it back in parts
    # write every two lines
    for ( i in 1 : ( nrow( X ) / 2 ) ) {
        # try writing it back elsewhere
        expect_silent(
            write_bed(
                fo,
                X[ (2*i-1):(2*i), ],
                append = TRUE,
                verbose = FALSE
            )
        )
    }
    
    # read tests
    # parse data back, verify agreement!
    expect_silent(
        X2 <- read_bed(fo, m_loci = m, n_ind = n, verbose = FALSE)
    )
    expect_equal(X, X2)
    # delete output when done
    invisible(file.remove(fo_bed))

    # repeat writing one line at the time
    for ( i in 1 : nrow( X ) ) {
        # try writing it back elsewhere
        expect_silent(
            write_bed(
                fo,
                X[ i, , drop = FALSE ],
                append = TRUE,
                verbose = FALSE
            )
        )
    }
    
    # read tests
    # parse data back, verify agreement!
    expect_silent(
        X2 <- read_bed(fo, m_loci = m, n_ind = n, verbose = FALSE)
    )
    expect_equal(X, X2)
    # delete output when done
    invisible(file.remove(fo_bed))
})

# let's include BEDMatrix in tests, if it's available
test_BEDMatrix <- FALSE

# this test requires BEDMatrix to read file and print it back out
if (suppressMessages(suppressWarnings(require(BEDMatrix)))) {

    # wrapper to get BEDMatrix to read a matrix in the format we expect it to be
    # NOTE: since file handles don't appear to close properly, don't use this on files that have to be deleted as part of the test!
    read_bed_hack <- function(file) {
        # run this way since BEDMatrix is rather verbose
        # when there's no paired FAM/BIM, here provide dimensions and we're good (NULL requires FAM/BIM)
        X <- suppressMessages(suppressWarnings(BEDMatrix(file)))
        # this turns it into a regular matrix just like the one we expect
        X <- as.matrix(X)
        # transpose as needed
        X <- t(X)
        # remove dimnames for later comparison (adds unnecessary complexity)
        dimnames(X) <- NULL
        # finally done!
        return(X)
    }

    # switch this to know we can test with BEDMatrix
    test_BEDMatrix <- TRUE
}

# generic testing function
testOneInput <- function(nameIn, m_loci, n_ind) {
    # load using my code
    expect_silent(
        X <- read_bed(nameIn, m_loci = m_loci, n_ind = n_ind, verbose = FALSE) # hack use dimensions from the X read by BEDMatrix
    )

    if (test_BEDMatrix) {
        # load dummy file
        X2 <- read_bed_hack(nameIn)
        # compare now
        expect_equal(X, X2)
    }
    
    # ensure expected failures do fail
    # mess with dimensions on purpose
    expect_error( read_bed(nameIn, m_loci = n_ind,   n_ind = m_loci) ) # reverse dimensions, get caught because of padding checks (non-commutative unless both are factors of 4)
    expect_error( read_bed(nameIn, m_loci = m_loci+1, n_ind = n_ind) )
    expect_error( read_bed(nameIn, m_loci = m_loci-1, n_ind = n_ind) )
    # NOTE: this used to cause an error when we required that paddings be zero, but now, given some real files (in the wild) with non-zero pads, and seeing that other software just ignores the pads, decided to ignore them too (now these don't trigger errors)
    #expect_error( read_bed(nameIn, m_loci = m_loci,   n_ind = n_ind-1) )
    # sadly many +1 individual cases don't cause error because they just look like zeroes (in all loci) if there is enough padding.
    # do expect an error if we're off by a whole byte (4 individuals)
    expect_error( read_bed(nameIn, m_loci = m_loci,   n_ind = n_ind+4) )
    
    # write second version (BED only)
    nameOut <- tempfile( paste0(nameIn, '_rewrite') )
    expect_silent(
        write_bed(nameOut, X, verbose = FALSE)
    )
    
    # compare outputs, they should be identical!
    # this is less than ideal, but at least it's a pure R solution (not depending on linux 'cmp' or 'diff')
    f1 <- add_ext(nameIn, 'bed')
    f2 <- add_ext(nameOut, 'bed')
    # load all data brute force
    data1 <- readLines(f1, warn = FALSE)
    data2 <- readLines(f2, warn = FALSE)
    # compare now
    expect_equal(data1, data2)

    # remove temporary output file
    file.remove(f2)
    # make sure deletion worked
    if ( file.exists(f2) )
        stop('Could not delete: ', f2)
}

test_that("write_bed and read_bed agree with each other and BEDMatrix", {
    # repeat on several files
    testOneInput('dummy-33-101-0.1', 101, 33)
    testOneInput('dummy-4-10-0.1', 10, 4)
    testOneInput('dummy-5-10-0.1', 10, 5)
    testOneInput('dummy-6-10-0.1', 10, 6)
    testOneInput('dummy-7-10-0.1', 10, 7)
})

test_that("write_plink works", {
    # test that there are errors when crucial data is missing
    expect_error(write_plink()) # all is missing
    expect_error(write_plink('file')) # tibble is missing
    expect_error(write_plink(X = matrix(NA, 1, 1))) # file is missing

    # for tests, change fo (file out) here
    fo <- tempfile('test-write-plink-1')

    # this autocompletes bim and fam!
    expect_silent(
        write_plink(fo, X, verbose = FALSE)
    )
    # make sure we can read outputs!
    # read with my new function
    expect_silent(
        data <- read_plink(fo, verbose = FALSE)
    )
    # compare again (must compare named version for success)
    expect_equal(X_named, data$X)
    # compare row and column names internally
    expect_equal( rownames(data$X), data$bim$id )
    expect_equal( colnames(data$X), data$fam$id )
    # delete all three outputs when done
    # this also tests that all three files existed!
    expect_silent( delete_files_plink(fo) )

    # change again
    fo <- tempfile('test-write-plink-2')

    # this autocompletes bim and fam except for pheno
    expect_silent(
        write_plink(fo, X, pheno = pheno, verbose = FALSE)
    )
    # in this case parse fam and make sure we recover pheno!
    expect_silent(
        fam <- read_fam(fo, verbose = FALSE)
    )
    # compare!
    expect_equal(fam$pheno, pheno)
    # read with my new function
    expect_silent(
        data <- read_plink(fo, verbose = FALSE)
    )
    # compare again (must compare named version for success)
    expect_equal(X_named, data$X)
    # compare row and column names internally
    expect_equal( rownames(data$X), data$bim$id )
    expect_equal( colnames(data$X), data$fam$id )
    # delete all three outputs when done
    # this also tests that all three files existed!
    expect_silent( delete_files_plink(fo) )

    # test with `write_phen = TRUE`
    expect_silent(
        write_plink(fo, X, pheno = pheno, verbose = FALSE, write_phen = TRUE)
    )
    # in this case parse phen and make sure we recover pheno!
    expect_silent(
        phen <- read_phen(fo, verbose = FALSE)
    )
    # compare!
    expect_equal(phen$pheno, pheno)
    # (do not re-test genotype parts)
    # delete all outputs when done
    # this also tests that all files existed!
    expect_silent( delete_files_plink(fo) )
    expect_silent( delete_files_phen(fo) )

    # change again
    fo <- tempfile('test-write-plink-3')

    # create a case in which fam is also provided, make sure we get warning
    fam <- make_fam(n = n)
    expect_warning( write_plink(fo, X, fam = fam, pheno = pheno) )
    # parse fam and make sure pheno was missing
    expect_silent(
        fam <- read_fam(fo, verbose = FALSE)
    )
    expect_equal(fam$pheno, rep.int(0, n))
    # delete all three outputs when done
    # this also tests that all three files existed!
    expect_silent( delete_files_plink(fo) )
})

test_that("write_plink with `append = TRUE` works", {
    # for this test, we need an original BIM table to compare to
    bim <- make_bim( n = m )
    # need to change some modes for the purpose of the test only
    bim$chr <- as.character( bim$chr )
    bim$id <- as.character( bim$id )
    bim$ref <- as.character( bim$ref )
    bim$alt <- as.character( bim$alt )
    
    # here's the awkward part, where we write it back in parts
    # write every two lines
    for ( i in 1 : ( nrow( X ) / 2 ) ) {
        indexes <- (2*i-1):(2*i)
        # try writing it back elsewhere
        expect_silent(
            write_plink(
                fo,
                X = X[ indexes, ],
                bim = bim[ indexes, ],
                append = TRUE,
                verbose = FALSE
            )
        )
    }
    
    # read tests
    # parse data back, verify agreement!
    expect_silent(
        obj2 <- read_plink(fo, verbose = FALSE)
    )
    expect_equal( X_named, obj2$X ) # need named version for this test
    expect_equal( bim, obj2$bim[] ) # need [] to change stupid readr class, for testing only
    # delete all three outputs when done
    # this also tests that all three files existed!
    expect_silent( delete_files_plink(fo) )
    
    # repeat writing one line at the time
    for ( i in 1 : nrow( X ) ) {
        # try writing it back elsewhere
        expect_silent(
            write_plink(
                fo,
                X = X[ i, , drop = FALSE ],
                bim = bim[ i, ],
                append = TRUE,
                verbose = FALSE
            )
        )
    }
    
    # read tests
    # parse data back, verify agreement!
    expect_silent(
        obj2 <- read_plink(fo, verbose = FALSE)
    )
    expect_equal( X_named, obj2$X ) # need named version for this test
    expect_equal( bim, obj2$bim[] ) # need [] to change stupid readr class, for testing only
    # delete all three outputs when done
    # this also tests that all three files existed!
    expect_silent( delete_files_plink(fo) )
})

# this is a case reported by richelbilderbeek
test_that( "read_plink works with file without zero pads", {
    name <- 'HumanOrigins249_tiny'
    expect_silent(
        data <- read_plink( name, verbose = FALSE )
    )
    if (test_BEDMatrix) {
        # load using BEDMatrix
        X2 <- read_bed_hack( name )
        # for comparison, clean up genio version (read_plink returns row/col names)
        X <- data$X
        dimnames(X) <- NULL
        # compare now
        expect_equal( X, X2 )
    }
})

test_that( "geno_to_char works", {
    # use data where plink1 told us what the answer was
    name <- 'dummy-4-10-0.1'
    expect_silent(
        data <- read_plink( name, verbose = FALSE )
    )
    X <- data$X
    bim <- data$bim
    m <- nrow( X )
    n <- ncol( X )
    
    # need to primitively parse corresponding ped output
    # (sorry this is tedious)
    name_ped <- paste0( name, '.ped' )
    Y_exp <- readr::read_table( name_ped, col_names = FALSE )
    # remove FAM columns (first 6)
    # also transpose
    Y_exp <- t( Y_exp[ , -(1:6) ] )
    # and collapse the way we have it
    for ( i in 1 : m ) {
        # indexes on Y matrix
        # first row
        j <- 2 * i - 1
        # get two rows at the time
        # store back in final row
        Y_exp[ i, ] <- paste0( Y_exp[ j, ], '/', Y_exp[ j + 1, ] )
    }
    # shrink matrix now
    Y_exp <- Y_exp[ 1:m , ]
    # cases are coded as 0/0 are NA, replace now:
    Y_exp[ Y_exp == '0/0' ] <- NA
    # copy correct names (missing in this data)
    dimnames( Y_exp ) <- dimnames( X )
    # for simplicity, flip heterozygotes so they're all A/B instead of some being B/A
    Y_exp[ Y_exp == 'B/A' ] <- 'A/B'

    # START TESTING!

    # errors due to missing required arguments
    expect_error( geno_to_char( X = X ) )
    expect_error( geno_to_char( bim = bim ) )
    # disagreeing dimensions
    expect_error( geno_to_char( X, bim[ -1, ] ) )
    # slashes in allele names
    bim_bad <- bim
    i <- sample( m, 1 )
    bim_bad$ref[ i ] <- 'a/b'
    expect_error( geno_to_char( X, bim_bad ) )
    bim_bad <- bim
    i <- sample( m, 1 )
    bim_bad$alt[ i ] <- 'a/b'
    expect_error( geno_to_char( X, bim_bad ) )
    # same ref/alt at a locus
    bim_bad <- bim
    i <- sample( m, 1 )
    bim_bad$alt[ i ] <- bim_bad$ref[ i ]
    expect_error( geno_to_char( X, bim_bad ) )
    # invalid values in X
    X_bad <- X
    i <- sample( m*n, 1 )
    X_bad[ i ] <- -1
    expect_error( geno_to_char( X_bad, bim ) )
    X_bad <- X
    i <- sample( m*n, 1 )
    X_bad[ i ] <- 0.5
    expect_error( geno_to_char( X_bad, bim ) )
    X_bad <- X
    i <- sample( m*n, 1 )
    X_bad[ i ] <- 3
    expect_error( geno_to_char( X_bad, bim ) )
    
    # now a succesful run
    expect_silent(
        Y_obs <- geno_to_char(X, bim)
    )
    # allele order doesn't matter, so normalize it here as we did with the expected matrix
    Y_obs[ Y_obs == 'B/A' ] <- 'A/B'
    # make sure we recovered the desired data!
    expect_equal( Y_obs, Y_exp )
})

# this is a case reported by richelbilderbeek
# sadly, on my machine the "ruthless crash" was treated as a normal error (same as it is after the fix), but might as well include that here too
test_that( 'write_bed does not crash ruthlessly when output directory does not exist', {
    expect_error(
        write_bed( 'dir-does-not-exist/test', X, verbose = FALSE )
    )
})