Output Functions

context("test-write_bed")

# write_bed is complicated, deserves its own section

# some simple tests don't depend on BEDMatrix being around...

# simulate X (to share across tests)
# create a simple matrix with random valid data
# have n != m so we can check dimensions within write_plink
n <- 5
m <- 10
miss <- 0.1 # add missingness for good measure
fo <- tempfile('delete-me-random-test') # output name without extensions!
fo_bed <- add_ext(fo, 'bed') # output with extension
# create ancestral allele frequencies
p_anc <- runif(m)
# create genotypes
X <- rbinom(n*m, 2, p_anc)
# add missing values
X[sample(n*m, n*m*miss)] <- NA
# turn into matrix
X <- matrix(X, nrow = m, ncol = n)
# genotypes with names (trivial in these cases, to match autogenerated values)
X_named <- X
rownames(X_named) <- 1:m
colnames(X_named) <- 1:n

# simulate phenotype (to share across tests)
pheno <- rnorm(n)

test_that("write_bed and read_bed work", {
    # test that there are errors when crucial data is missing
    expect_error(write_bed()) # all is missing
    expect_error(write_bed('file')) # X is missing
    expect_error(write_bed(X = matrix(NA, 1, 1))) # file is missing
    # die if X is not a matrix!
    expect_error(write_bed('file', 'not-matrix'))
    
    # this should work
    expect_silent(
        write_bed(fo, X, verbose = FALSE)
    )

    # read tests
    # parse data back, verify agreement!
    expect_silent(
        X2 <- read_bed(fo, m_loci = m, n_ind = n, verbose = FALSE)
    )
    expect_equal(X, X2)
    
    # test that we can read same file when extension is not BED
    fo_not_bed <- add_ext(fo, 'not') # output with wrong extension
    file.copy( fo_bed, fo_not_bed )
    expect_silent(
        X3 <- read_bed(fo_not_bed, m_loci = m, n_ind = n, verbose = FALSE)
    )
    expect_equal(X, X3)
    # cleanup
    rm( X3 )
    invisible( file.remove( fo_not_bed ) )
    if ( file.exists( fo_not_bed ) )
        stop( 'Could not delete: ', fo_not_bed )
    
    # errors for missing params
    expect_error( read_bed() ) # missing all
    expect_error( read_bed(m_loci = m, n_ind = n) ) # missing file
    expect_error( read_bed(fo, n_ind = n) ) # missing m_loci
    expect_error( read_bed(fo, m_loci = m) ) # missing n_ind
    # error tests for bad dimensions
    expect_error( read_bed(fo, m_loci = n,   n_ind = m) ) # reversed dimensions: padding checks and padding mismatches catch these!
    expect_error( read_bed(fo, m_loci = m+1, n_ind = n) )
    expect_error( read_bed(fo, m_loci = m-1, n_ind = n) )
    expect_error( read_bed(fo, m_loci = m,   n_ind = n-1) )
    expect_error( read_bed(fo, m_loci = m,   n_ind = n+4) ) # have to be off by a whole byte to notice some of these errors
    # file path that doesn't exist
    expect_error( read_bed( 'doesnt-exist', m_loci = m, n_ind = n ) )
    # very long non-existent file path
    expect_error( read_bed( '/made/up/long/path/doesnt-exist_doesnt-exist_doesnt-exist_doesnt-exist_doesnt-exist', m_loci = m, n_ind = n ) )
    
    # delete output when done
    invisible(file.remove(fo_bed))
    # make sure deletion worked
    if ( file.exists( fo_bed ) )
        stop('Could not delete: ', fo_bed)
    
    # now add invalid values to X, make sure it dies!
    X2 <- X
    # add a negative value in a random location
    X2[sample(n*m, 1)] <- -1
    # actually die!
    expect_error( write_bed(fo, X2) )
    # make sure output doesn't exist anymore (code should automatically clean it up)
    expect_false( file.exists(fo_bed) )

    # another test
    X2 <- X
    # add a 3 in a random location
    X2[sample(n*m, 1)] <- 3
    # actually die!
    expect_error( write_bed(fo, X2) )
    # make sure output doesn't exist anymore (code should automatically clean it up)
    expect_false( file.exists(fo_bed) )

    # NOTE: if X contains values that truncate to the correct range (say, 1.5, which becomes 1 upon truncation), then that's what Rcpp does internally and no errors are raised!
})

test_that("write_bed with `append = TRUE` works", {

    # here's the awkward part, where we write it back in parts
    # write every two lines
    for ( i in 1 : ( nrow( X ) / 2 ) ) {
        # try writing it back elsewhere
        expect_silent(
            write_bed(
                fo,
                X[ (2*i-1):(2*i), ],
                append = TRUE,
                verbose = FALSE
            )
        )
    }
    
    # read tests
    # parse data back, verify agreement!
    expect_silent(
        X2 <- read_bed(fo, m_loci = m, n_ind = n, verbose = FALSE)
    )
    expect_equal(X, X2)
    # delete output when done
    invisible(file.remove(fo_bed))

    # repeat writing one line at the time
    for ( i in 1 : nrow( X ) ) {
        # try writing it back elsewhere
        expect_silent(
            write_bed(
                fo,
                X[ i, , drop = FALSE ],
                append = TRUE,
                verbose = FALSE
            )
        )
    }
    
    # read tests
    # parse data back, verify agreement!
    expect_silent(
        X2 <- read_bed(fo, m_loci = m, n_ind = n, verbose = FALSE)
    )
    expect_equal(X, X2)
    # delete output when done
    invisible(file.remove(fo_bed))
})

# let's include BEDMatrix in tests, if it's available
test_BEDMatrix <- FALSE

# this test requires BEDMatrix to read file and print it back out
if (suppressMessages(suppressWarnings(require(BEDMatrix)))) {

    # wrapper to get BEDMatrix to read a matrix in the format we expect it to be
    # NOTE: since file handles don't appear to close properly, don't use this on files that have to be deleted as part of the test!
    read_bed_hack <- function(file) {
        # run this way since BEDMatrix is rather verbose
        # when there's no paired FAM/BIM, here provide dimensions and we're good (NULL requires FAM/BIM)
        X <- suppressMessages(suppressWarnings(BEDMatrix(file)))
        # this turns it into a regular matrix just like the one we expect
        X <- as.matrix(X)
        # transpose as needed
        X <- t(X)
        # remove dimnames for later comparison (adds unnecessary complexity)
        dimnames(X) <- NULL
        # finally done!
        return(X)
    }

    # switch this to know we can test with BEDMatrix
    test_BEDMatrix <- TRUE
}

# generic testing function
testOneInput <- function(nameIn, m_loci, n_ind) {
    # load using my code
    expect_silent(
        X <- read_bed(nameIn, m_loci = m_loci, n_ind = n_ind, verbose = FALSE) # hack use dimensions from the X read by BEDMatrix
    )

    if (test_BEDMatrix) {
        # load dummy file
        X2 <- read_bed_hack(nameIn)
        # compare now
        expect_equal(X, X2)
    }
    
    # ensure expected failures do fail
    # mess with dimensions on purpose
    expect_error( read_bed(nameIn, m_loci = n_ind,   n_ind = m_loci) ) # reverse dimensions, get caught because of padding checks (non-commutative unless both are factors of 4)
    expect_error( read_bed(nameIn, m_loci = m_loci+1, n_ind = n_ind) )
    expect_error( read_bed(nameIn, m_loci = m_loci-1, n_ind = n_ind) )
    # NOTE: this used to cause an error when we required that paddings be zero, but now, given some real files (in the wild) with non-zero pads, and seeing that other software just ignores the pads, decided to ignore them too (now these don't trigger errors)
    #expect_error( read_bed(nameIn, m_loci = m_loci,   n_ind = n_ind-1) )
    # sadly many +1 individual cases don't cause error because they just look like zeroes (in all loci) if there is enough padding.
    # do expect an error if we're off by a whole byte (4 individuals)
    expect_error( read_bed(nameIn, m_loci = m_loci,   n_ind = n_ind+4) )
    
    # write second version (BED only)
    nameOut <- tempfile( paste0(nameIn, '_rewrite') )
    expect_silent(
        write_bed(nameOut, X, verbose = FALSE)
    )
    
    # compare outputs, they should be identical!
    # this is less than ideal, but at least it's a pure R solution (not depending on linux 'cmp' or 'diff')
    f1 <- add_ext(nameIn, 'bed')
    f2 <- add_ext(nameOut, 'bed')
    # load all data brute force
    data1 <- readLines(f1, warn = FALSE)
    data2 <- readLines(f2, warn = FALSE)
    # compare now
    expect_equal(data1, data2)

    # remove temporary output file
    file.remove(f2)
    # make sure deletion worked
    if ( file.exists(f2) )
        stop('Could not delete: ', f2)
}

test_that("write_bed and read_bed agree with each other and BEDMatrix", {
    # repeat on several files
    testOneInput('dummy-33-101-0.1', 101, 33)
    testOneInput('dummy-33-2-0.1', 2, 33)
    testOneInput('dummy-4-10-0.1', 10, 4)
    testOneInput('dummy-5-10-0.1', 10, 5)
    testOneInput('dummy-6-10-0.1', 10, 6)
    testOneInput('dummy-7-10-0.1', 10, 7)
})

test_that("write_plink works", {
    # test that there are errors when crucial data is missing
    expect_error(write_plink()) # all is missing
    expect_error(write_plink('file')) # tibble is missing
    expect_error(write_plink(X = matrix(NA, 1, 1))) # file is missing

    # for tests, change fo (file out) here
    fo <- tempfile('test-write-plink-1')

    # this autocompletes bim and fam!
    expect_silent(
        write_plink(fo, X, verbose = FALSE)
    )
    # make sure we can read outputs!
    # read with my new function
    expect_silent(
        data <- read_plink(fo, verbose = FALSE)
    )
    # compare again (must compare named version for success)
    expect_equal(X_named, data$X)
    # compare row and column names internally
    expect_equal( rownames(data$X), data$bim$id )
    expect_equal( colnames(data$X), data$fam$id )
    # delete all three outputs when done
    # this also tests that all three files existed!
    expect_silent( delete_files_plink(fo) )

    # change again
    fo <- tempfile('test-write-plink-2')

    # this autocompletes bim and fam except for pheno
    expect_silent(
        write_plink(fo, X, pheno = pheno, verbose = FALSE)
    )
    # in this case parse fam and make sure we recover pheno!
    expect_silent(
        fam <- read_fam(fo, verbose = FALSE)
    )
    # compare!
    expect_equal(fam$pheno, pheno)
    # read with my new function
    expect_silent(
        data <- read_plink(fo, verbose = FALSE)
    )
    # compare again (must compare named version for success)
    expect_equal(X_named, data$X)
    # compare row and column names internally
    expect_equal( rownames(data$X), data$bim$id )
    expect_equal( colnames(data$X), data$fam$id )
    # delete all three outputs when done
    # this also tests that all three files existed!
    expect_silent( delete_files_plink(fo) )

    # test with `write_phen = TRUE`
    expect_silent(
        write_plink(fo, X, pheno = pheno, verbose = FALSE, write_phen = TRUE)
    )
    # in this case parse phen and make sure we recover pheno!
    expect_silent(
        phen <- read_phen(fo, verbose = FALSE)
    )
    # compare!
    expect_equal(phen$pheno, pheno)
    # (do not re-test genotype parts)
    # delete all outputs when done
    # this also tests that all files existed!
    expect_silent( delete_files_plink(fo) )
    expect_silent( delete_files_phen(fo) )

    # change again
    fo <- tempfile('test-write-plink-3')

    # create a case in which fam is also provided, make sure we get warning
    fam <- make_fam(n = n)
    expect_warning( write_plink(fo, X, fam = fam, pheno = pheno) )
    # parse fam and make sure pheno was missing
    expect_silent(
        fam <- read_fam(fo, verbose = FALSE)
    )
    expect_equal(fam$pheno, rep.int(0, n))
    # delete all three outputs when done
    # this also tests that all three files existed!
    expect_silent( delete_files_plink(fo) )
})

test_that("write_plink with `append = TRUE` works", {
    # for this test, we need an original BIM table to compare to
    bim <- make_bim( n = m )
    # need to change some modes for the purpose of the test only
    bim$chr <- as.character( bim$chr )
    bim$id <- as.character( bim$id )
    bim$ref <- as.character( bim$ref )
    bim$alt <- as.character( bim$alt )
    
    # here's the awkward part, where we write it back in parts
    # write every two lines
    for ( i in 1 : ( nrow( X ) / 2 ) ) {
        indexes <- (2*i-1):(2*i)
        # try writing it back elsewhere
        expect_silent(
            write_plink(
                fo,
                X = X[ indexes, ],
                bim = bim[ indexes, ],
                append = TRUE,
                verbose = FALSE
            )
        )
    }
    
    # read tests
    # parse data back, verify agreement!
    expect_silent(
        obj2 <- read_plink(fo, verbose = FALSE)
    )
    expect_equal( X_named, obj2$X ) # need named version for this test
    expect_equal( bim, obj2$bim[] ) # need [] to change stupid readr class, for testing only
    # delete all three outputs when done
    # this also tests that all three files existed!
    expect_silent( delete_files_plink(fo) )
    
    # repeat writing one line at the time
    for ( i in 1 : nrow( X ) ) {
        # try writing it back elsewhere
        expect_silent(
            write_plink(
                fo,
                X = X[ i, , drop = FALSE ],
                bim = bim[ i, ],
                append = TRUE,
                verbose = FALSE
            )
        )
    }
    
    # read tests
    # parse data back, verify agreement!
    expect_silent(
        obj2 <- read_plink(fo, verbose = FALSE)
    )
    expect_equal( X_named, obj2$X ) # need named version for this test
    expect_equal( bim, obj2$bim[] ) # need [] to change stupid readr class, for testing only
    # delete all three outputs when done
    # this also tests that all three files existed!
    expect_silent( delete_files_plink(fo) )
})

# this is a case reported by richelbilderbeek
test_that( "read_plink works with file without zero pads", {
    name <- 'HumanOrigins249_tiny'
    expect_silent(
        data <- read_plink( name, verbose = FALSE )
    )
    if (test_BEDMatrix) {
        # load using BEDMatrix
        X2 <- read_bed_hack( name )
        # for comparison, clean up genio version (read_plink returns row/col names)
        X <- data$X
        dimnames(X) <- NULL
        # compare now
        expect_equal( X, X2 )
    }
})

test_that( "geno_to_char works", {
    # use data where plink1 told us what the answer was
    name <- 'dummy-4-10-0.1'
    expect_silent(
        data <- read_plink( name, verbose = FALSE )
    )
    X <- data$X
    bim <- data$bim
    m <- nrow( X )
    n <- ncol( X )
    
    # need to primitively parse corresponding ped output
    # (sorry this is tedious)
    name_ped <- paste0( name, '.ped' )
    Y_exp <- readr::read_table( name_ped, col_names = FALSE )
    # remove FAM columns (first 6)
    # also transpose
    Y_exp <- t( Y_exp[ , -(1:6) ] )
    # and collapse the way we have it
    for ( i in 1 : m ) {
        # indexes on Y matrix
        # first row
        j <- 2 * i - 1
        # get two rows at the time
        # store back in final row
        Y_exp[ i, ] <- paste0( Y_exp[ j, ], '/', Y_exp[ j + 1, ] )
    }
    # shrink matrix now
    Y_exp <- Y_exp[ 1:m , ]
    # cases are coded as 0/0 are NA, replace now:
    Y_exp[ Y_exp == '0/0' ] <- NA
    # copy correct names (missing in this data)
    dimnames( Y_exp ) <- dimnames( X )
    # for simplicity, flip heterozygotes so they're all A/B instead of some being B/A
    Y_exp[ Y_exp == 'B/A' ] <- 'A/B'

    # START TESTING!

    # errors due to missing required arguments
    expect_error( geno_to_char( X = X ) )
    expect_error( geno_to_char( bim = bim ) )
    # disagreeing dimensions
    expect_error( geno_to_char( X, bim[ -1, ] ) )
    # slashes in allele names
    bim_bad <- bim
    i <- sample( m, 1 )
    bim_bad$ref[ i ] <- 'a/b'
    expect_error( geno_to_char( X, bim_bad ) )
    bim_bad <- bim
    i <- sample( m, 1 )
    bim_bad$alt[ i ] <- 'a/b'
    expect_error( geno_to_char( X, bim_bad ) )
    # same ref/alt at a locus
    bim_bad <- bim
    i <- sample( m, 1 )
    bim_bad$alt[ i ] <- bim_bad$ref[ i ]
    expect_error( geno_to_char( X, bim_bad ) )
    # invalid values in X
    X_bad <- X
    i <- sample( m*n, 1 )
    X_bad[ i ] <- -1
    expect_error( geno_to_char( X_bad, bim ) )
    X_bad <- X
    i <- sample( m*n, 1 )
    X_bad[ i ] <- 0.5
    expect_error( geno_to_char( X_bad, bim ) )
    X_bad <- X
    i <- sample( m*n, 1 )
    X_bad[ i ] <- 3
    expect_error( geno_to_char( X_bad, bim ) )
    
    # now a succesful run
    expect_silent(
        Y_obs <- geno_to_char(X, bim)
    )
    # allele order doesn't matter, so normalize it here as we did with the expected matrix
    Y_obs[ Y_obs == 'B/A' ] <- 'A/B'
    # make sure we recovered the desired data!
    expect_equal( Y_obs, Y_exp )
})

# this is a case reported by richelbilderbeek
# sadly, on my machine the "ruthless crash" was treated as a normal error (same as it is after the fix), but might as well include that here too
test_that( 'write_bed does not crash ruthlessly when output directory does not exist', {
    expect_error(
        write_bed( 'dir-does-not-exist/test', X, verbose = FALSE )
    )
})

test_that( "sim_and_write_plink works", {
    # start by loading an existing file with known values
    # this first test won't have p_anc
    out <- read_plink( 'dummy-33-101-0.1', verbose = FALSE )
    X <- out$X
    bim <- out$bim
    fam <- out$fam
    m_loci <- nrow( bim )
    # this ought to omit it from the return value?
    p_anc <- NULL

    # a global variable updated as we go
    m_last <- 0
    
    # define a trivial but complete genotype simulator function
    sim_chunk <- function( m_chunk ) {
        # get subset of interest
        indexes <- m_last + ( 1 : m_chunk )
        # update global value (use <<-) for next round
        m_last <<- m_last + m_chunk
        # return all of these elements in a named list!
        obj <- list(
            X = X[ indexes, , drop = FALSE ],
            bim = bim[ indexes, ]
        )
        if ( !is.null( p_anc ) )
            obj$p_anc <- p_anc[ indexes ]
        # add p_anc if not NULL
        return( obj )
    }

    # write to the temp output
    expect_silent(
        sim_and_write_plink( sim_chunk, m_loci, fam, fo )
    )

    # read it back, make sure it matches
    out2 <- read_plink( fo, verbose = FALSE )
    expect_equal( out2, out )

    # cleanup
    expect_silent( delete_files_plink(fo) )

    # repeat with a small chunk size, to make sure we write in several parts
    m_last <- 0 # always reset!
    expect_silent(
        sim_and_write_plink( sim_chunk, m_loci, fam, fo, n_data_cut = 197 )
    )
    out2 <- read_plink( fo, verbose = FALSE )
    expect_equal( out2, out )
    expect_silent( delete_files_plink(fo) )

    # test singleton edge case
    m_last <- 0 # always reset!
    expect_silent(
        sim_and_write_plink( sim_chunk, m_loci, fam, fo, n_data_cut = 1 )
    )
    out2 <- read_plink( fo, verbose = FALSE )
    expect_equal( out2, out )
    expect_silent( delete_files_plink(fo) )

    # version with mid-size chunk and p_anc
    fp <- tempfile('delete-me-random-test-p-anc.txt.gz') # this one has an extension
    p_anc <- runif( m_loci )
    m_last <- 0 # always reset!
    expect_silent(
        sim_and_write_plink( sim_chunk, m_loci, fam, fo, fp, n_data_cut = 197 )
    )
    out2 <- read_plink( fo, verbose = FALSE )
    expect_equal( out2, out )
    # load and check p-anc too
    p_anc2 <- as.numeric( readr::read_lines( fp ) )
    expect_equal( p_anc2, p_anc )
    expect_silent( delete_files_plink(fo) )
    file.remove( fp )
})

test_that( 'symlink works', {
    # a local file that exists
    file <- 'dummy-33-101-0.1.fam'
    # a file that doesn't exist already, setup to be in a temporary path
    # NOTE: out of necessity, this is a path not in current directory, so that's already testing that complicated case!
    link <- fo
    # this ought to succeed!
    expect_silent(
        symlink( file, link, verbose = FALSE )
    )

    # confirm that file contents are the same?
    fam1 <- read_fam( file, verbose = FALSE )
    fam2 <- read_fam( link, verbose = FALSE )
    expect_equal( fam2, fam1 )
    
    # delete when done!
    expect_true( file.remove( link ) )

    # repeat when input file has absolute path
    file <- R.utils::getAbsolutePath( file )
    expect_silent(
        symlink( file, link, verbose = FALSE )
    )
    fam1 <- read_fam( file, verbose = FALSE )
    fam2 <- read_fam( link, verbose = FALSE )
    expect_equal( fam2, fam1 )
    expect_true( file.remove( link ) )

    # so only case not directly tested is two files in same local path, but that's sort of the easiest case and tested in my own runs, so I'm not worried that would somehow fail!
})

test_that( "het_reencode_bed works", {
    # apply reencoding to this existing file
    fi <- 'dummy-33-101-0.1'
    expect_silent(
        het_reencode_bed( fi, fo, verbose = FALSE )
    )

    # to validate, read both and apply the explicit mapping in R, which is simpler code
    data <- read_plink( fi, verbose = FALSE )
    X <- data$X
    # this is desired transformation
    X[ X == 2 ] <- 0
    X[ X == 1 ] <- 2
    data2 <- read_plink( fo, verbose = FALSE )
    # confirm transformation matched
    expect_equal( data2$X, X )
    # these should be symlinks, so they should be identical!
    expect_equal( data2$bim, data$bim )
    expect_equal( data2$fam, data$fam )
    
    # all bed/bim/fam files were created (the other two are symlinks), delete all now
    expect_silent( delete_files_plink( fo ) )
})
OchoaLab/genio documentation built on Feb. 22, 2025, 4:13 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
OchoaLab/genio
Genetics Input/Output Functions

tests/testthat/test-write_bed.R
In OchoaLab/genio: Genetics Input/Output Functions

R Package Documentation

Browse R Packages

We want your feedback!

OchoaLab/genio Genetics Input/Output Functions

tests/testthat/test-write_bed.R In OchoaLab/genio: Genetics Input/Output Functions

R Package Documentation

Browse R Packages

We want your feedback!

OchoaLab/genio
Genetics Input/Output Functions

tests/testthat/test-write_bed.R
In OchoaLab/genio: Genetics Input/Output Functions