context("generators")
test_that("Checking the generator for the Fasta files", {
# Skip the test unless the tensorflow R package is installed and the
# tensorflow Python module is reachable through reticulate.
testthat::skip_if_not_installed("tensorflow")
testthat::skip_if_not(reticulate::py_module_available("tensorflow"))

testpath <- file.path("fasta_2")
vocabulary <- c("a", "c", "g", "t")
batch_size <- 5
maxlen <- 3
gen <- generator_fasta_lm(
  path_corpus = testpath,
  batch_size = batch_size,
  maxlen = maxlen,
  vocabulary = vocabulary
)
arrays <- gen()

# A batch is a list of two arrays: the first has dimensions
# (batch_size, maxlen, vocabulary size), the second
# (batch_size, vocabulary size).
x_dim <- dim(arrays[[1]])
y_dim <- dim(arrays[[2]])
expect_equivalent(x_dim[1], batch_size)
expect_equivalent(x_dim[2], maxlen)
expect_equivalent(x_dim[3], length(vocabulary))
expect_equivalent(y_dim[1], batch_size)
expect_equivalent(y_dim[2], length(vocabulary))
expect_equivalent(length(arrays), 2)

# a.fasta file starts with aaccggtt
first_x <- rbind(
  c(1, 0, 0, 0), # a
  c(1, 0, 0, 0), # a
  c(0, 1, 0, 0)  # c
)
for (pos in seq_len(nrow(first_x))) {
  expect_equivalent(arrays[[1]][1, pos, ], first_x[pos, ])
}
expect_equivalent(arrays[[2]][1, ], c(0, 1, 0, 0)) # c

# Second batch: sample 2 is expected to be "aaa" with target "t".
arrays_2 <- gen()
for (pos in seq_len(3)) {
  expect_equivalent(arrays_2[[1]][2, pos, ], c(1, 0, 0, 0)) # a
}
expect_equivalent(arrays_2[[2]][2, ], c(0, 0, 0, 1)) # t
# test transition to second fasta file
gen <- generator_fasta_lm(
  path_corpus = testpath,
  batch_size = batch_size,
  maxlen = maxlen,
  vocabulary = vocabulary
)
# Draw five batches from the fresh generator; only the fifth is inspected.
for (batch_idx in seq_len(5)) {
  arrays <- gen()
}
# samples start at beginning of b.fasta: the first and the last sample of
# the batch are both expected to be "aaa" with target "a"
nuc_a <- c(1, 0, 0, 0)
for (s in c(1, 5)) {
  for (pos in seq_len(3)) {
    expect_equivalent(arrays[[1]][s, pos, ], nuc_a)
  }
  expect_equivalent(arrays[[2]][s, ], nuc_a)
}
# Complete one pass over the corpus: after nine batches from a fresh
# generator the samples are expected to start at a.fasta again.
# NOTE(review): the previous comment called one pass "100 samples"; nine
# batches of five samples do not add up to that figure - confirm.
gen <- generator_fasta_lm(
  path_corpus = testpath,
  batch_size = batch_size,
  maxlen = maxlen,
  vocabulary = vocabulary
)
for (batch_idx in seq_len(9)) {
  arrays <- gen()
}
# start from a.fasta again
wrapped_x <- rbind(
  c(1, 0, 0, 0), # a
  c(1, 0, 0, 0), # a
  c(0, 1, 0, 0)  # c
)
for (pos in seq_len(nrow(wrapped_x))) {
  expect_equivalent(arrays[[1]][1, pos, ], wrapped_x[pos, ])
}
expect_equivalent(arrays[[2]][1, ], c(0, 1, 0, 0)) # c
###################
# test for different step size (step = 2, vocabulary left at its default)
gen <- generator_fasta_lm(
  path_corpus = testpath,
  batch_size = 4,
  maxlen = 3,
  step = 2
)
arrays <- gen()
# Only samples 3 and 4 of the batch are inspected.
sample_ids <- c(3, 4)
expected_x <- list(
  rbind(c(0, 0, 1, 0), c(0, 0, 1, 0), c(0, 0, 0, 1)),
  rbind(c(1, 0, 0, 0), c(1, 0, 0, 0), c(1, 0, 0, 0))
)
expected_y <- rbind(
  c(0, 0, 0, 1),
  c(1, 0, 0, 0)
)
for (k in seq_along(sample_ids)) {
  s <- sample_ids[k]
  for (pos in seq_len(nrow(expected_x[[k]]))) {
    expect_equivalent(arrays[[1]][s, pos, ], expected_x[[k]][pos, ])
  }
  expect_equivalent(arrays[[2]][s, ], expected_y[k, ])
}
####
# tests with chars outside vocabulary: the vocabulary passed here does not
# contain "a", so every "a" is expected to be encoded as an all-zero vector
gen <- generator_fasta_lm(
  path_corpus = testpath,
  batch_size = 5,
  maxlen = 3,
  step = 2,
  vocabulary = c("c", "g", "t")
)
arrays <- gen()
oov_x <- rbind(
  c(0, 0, 0), # a
  c(0, 0, 0), # a
  c(1, 0, 0)  # c
)
for (pos in seq_len(nrow(oov_x))) {
  expect_equivalent(arrays[[1]][1, pos, ], oov_x[pos, ])
}
expect_equivalent(arrays[[2]][1, ], c(1, 0, 0)) # c
####
# test padding: with maxlen = 10 the first sample is expected to start with
# three all-zero positions followed by the one-hot encoded nucleotides
gen <- generator_fasta_lm(
  path_corpus = testpath,
  batch_size = 1,
  maxlen = 10,
  step = 4,
  vocabulary = c("a", "c", "g", "t")
)
arrays <- gen()
padded_x <- rbind(
  c(0, 0, 0, 0),
  c(0, 0, 0, 0),
  c(0, 0, 0, 0),
  c(1, 0, 0, 0),
  c(1, 0, 0, 0),
  c(0, 1, 0, 0),
  c(0, 1, 0, 0),
  c(0, 0, 1, 0),
  c(0, 0, 1, 0),
  c(0, 0, 0, 1)
)
for (pos in seq_len(nrow(padded_x))) {
  expect_equivalent(arrays[[1]][1, pos, ], padded_x[pos, ])
}
expect_equivalent(arrays[[2]][1, ], c(0, 0, 0, 1))
# no padding
testpath <- file.path("fasta_3")
gen <- generator_fasta_lm(
  path_corpus = testpath,
  batch_size = 2,
  maxlen = 12,
  step = 1,
  vocabulary = c("a", "c", "g", "t"),
  padding = FALSE
)
arrays <- gen()
# The first five positions of both samples are expected to be identical;
# position 3 is all zeros.
leading_x <- rbind(
  c(1, 0, 0, 0),
  c(1, 0, 0, 0),
  c(0, 0, 0, 0),
  c(0, 1, 0, 0),
  c(0, 1, 0, 0)
)
for (s in seq_len(2)) {
  for (pos in seq_len(nrow(leading_x))) {
    expect_equivalent(arrays[[1]][s, pos, ], leading_x[pos, ])
  }
}
####
# Argument validation and return types of generator_fasta_lm.
testpath <- file.path("fasta_2")
# Calling without arguments or with an empty path must raise an error.
expect_error(generator_fasta_lm())
expect_error(generator_fasta_lm(""))
# The constructor returns a function ...
expect_is(generator_fasta_lm(testpath, batch_size = batch_size, maxlen = maxlen), "function")
# ... and calling a generator yields list(array, matrix). `gen` is whatever
# generator was assigned most recently above; every gen() call below requests
# a new batch, so the number and order of these calls matters.
expect_is(gen(), "list")
expect_is(gen()[[1]], "array")
expect_is(gen()[[2]], "matrix")
# Constructing a generator must not emit output, messages or warnings.
expect_silent(generator_fasta_lm(testpath, batch_size = batch_size, maxlen = maxlen))
# Both batch elements are stored as doubles.
expect_type(gen()[[1]], "double")
expect_type(gen()[[2]], "double")
############# Test label generator (header) #############
testpath <- file.path("fasta_2")
gen <- generator_fasta_label_header_csv(
  path_corpus = testpath,
  batch_size = 5,
  maxlen = 3,
  step = 2,
  vocabulary = c("a", "c", "g", "t"),
  reverse_complement = FALSE,
  vocabulary_label = c("w", "x", "y")
)
arrays <- gen()
# Inspect the first and the last sample of the batch.
sample_ids <- c(1, 5)
expected_x <- list(
  rbind(c(1, 0, 0, 0), c(1, 0, 0, 0), c(0, 1, 0, 0)), # A A C
  rbind(c(1, 0, 0, 0), c(1, 0, 0, 0), c(0, 0, 0, 1))  # A A T
)
expected_y <- rbind(
  c(1, 0, 0), # W
  c(0, 1, 0)  # X
)
for (k in seq_along(sample_ids)) {
  s <- sample_ids[k]
  for (pos in seq_len(nrow(expected_x[[k]]))) {
    expect_equivalent(arrays[[1]][s, pos, ], expected_x[[k]][pos, ])
  }
  expect_equivalent(arrays[[2]][s, ], expected_y[k, ])
}
# Same corpus with maxlen = 8: check the labels of two consecutive batches
# and the input of the last sample of the second batch.
gen <- generator_fasta_label_header_csv(
  path_corpus = testpath,
  batch_size = 5,
  maxlen = 8,
  step = 2,
  vocabulary = c("a", "c", "g", "t"),
  reverse_complement = FALSE,
  vocabulary_label = c("w", "x", "y")
)
arrays <- gen()
first_labels <- rbind(
  c(1, 0, 0),
  c(0, 1, 0),
  c(0, 0, 1),
  c(0, 1, 0),
  c(0, 1, 0)
)
for (s in seq_len(nrow(first_labels))) {
  expect_equivalent(arrays[[2]][s, ], first_labels[s, ])
}

arrays <- gen()
# Sample 5 of the second batch: four "g" followed by four "t".
fifth_x <- rbind(
  c(0, 0, 1, 0),
  c(0, 0, 1, 0),
  c(0, 0, 1, 0),
  c(0, 0, 1, 0),
  c(0, 0, 0, 1),
  c(0, 0, 0, 1),
  c(0, 0, 0, 1),
  c(0, 0, 0, 1)
)
for (pos in seq_len(nrow(fifth_x))) {
  expect_equivalent(arrays[[1]][5, pos, ], fifth_x[pos, ])
}
second_labels <- rbind(
  c(0, 0, 1),
  c(0, 0, 1),
  c(1, 0, 0),
  c(0, 1, 0),
  c(0, 0, 1)
)
for (s in seq_len(nrow(second_labels))) {
  expect_equivalent(arrays[[2]][s, ], second_labels[s, ])
}
gen <- generator_fasta_label_header_csv(
  path_corpus = testpath,
  batch_size = 8,
  maxlen = 7,
  step = 2,
  vocabulary = c("a", "c", "g", "t"),
  reverse_complement = FALSE,
  vocabulary_label = c("w", "x", "y")
)
arrays <- gen()
# go through a/b.fasta once, discard samples with target z: the last sample
# of the batch is expected to begin with "aac" and carry label w
last_x <- rbind(
  c(1, 0, 0, 0), # A
  c(1, 0, 0, 0), # A
  c(0, 1, 0, 0)  # C
)
for (pos in seq_len(nrow(last_x))) {
  expect_equivalent(arrays[[1]][8, pos, ], last_x[pos, ])
}
expect_equivalent(arrays[[2]][8, ], c(1, 0, 0)) # W
############# Test label generator (folder) #############
directories <- c("label_folder/x", "label_folder/y", "label_folder/z")
val <- FALSE
gen_list <- generator_initialize(
  directories = directories,
  val = val,
  format = "fasta",
  batch_size = 6,
  maxlen = 2,
  vocabulary = c("a", "c", "g", "t"),
  step = 2
)
gen <- generator_fasta_label_folder_wrapper(
  val = val,
  path = directories,
  gen_list = gen_list
)
arrays <- gen()
# One matrix per sample (rows = positions); labels come in pairs, two
# samples per directory.
expected_x <- list(
  rbind(c(1, 0, 0, 0), c(0, 1, 0, 0)),
  rbind(c(1, 0, 0, 0), c(0, 1, 0, 0)),
  rbind(c(0, 0, 1, 0), c(0, 1, 0, 0)),
  rbind(c(0, 0, 1, 0), c(0, 1, 0, 0)),
  rbind(c(0, 0, 0, 1), c(0, 0, 0, 1)),
  rbind(c(0, 0, 0, 1), c(0, 0, 0, 1))
)
expected_y <- rbind(
  c(1, 0, 0),
  c(1, 0, 0),
  c(0, 1, 0),
  c(0, 1, 0),
  c(0, 0, 1),
  c(0, 0, 1)
)
for (s in seq_along(expected_x)) {
  for (pos in seq_len(nrow(expected_x[[s]]))) {
    expect_equivalent(arrays[[1]][s, pos, ], expected_x[[s]][pos, ])
  }
}
for (s in seq_len(nrow(expected_y))) {
  expect_equivalent(arrays[[2]][s, ], expected_y[s, ])
}
# test skipping file: advance the generator by two more batches and inspect
# the latter one
for (batch_idx in seq_len(2)) {
  arrays <- gen()
}
expected_x <- list(
  rbind(c(1, 0, 0, 0), c(0, 1, 0, 0)),
  rbind(c(1, 0, 0, 0), c(0, 0, 1, 0)),
  rbind(c(0, 1, 0, 0), c(0, 0, 1, 0)),
  rbind(c(0, 1, 0, 0), c(0, 0, 1, 0)),
  rbind(c(1, 0, 0, 0), c(1, 0, 0, 0)),
  rbind(c(1, 0, 0, 0), c(1, 0, 0, 0))
)
expected_y <- rbind(
  c(1, 0, 0),
  c(1, 0, 0),
  c(0, 1, 0),
  c(0, 1, 0),
  c(0, 0, 1),
  c(0, 0, 1)
)
for (s in seq_along(expected_x)) {
  for (pos in seq_len(nrow(expected_x[[s]]))) {
    expect_equivalent(arrays[[1]][s, pos, ], expected_x[[s]][pos, ])
  }
}
for (s in seq_len(nrow(expected_y))) {
  expect_equivalent(arrays[[2]][s, ], expected_y[s, ])
}
# The batch that follows directly afterwards.
arrays <- gen()
expected_x <- list(
  rbind(c(1, 0, 0, 0), c(0, 0, 1, 0)),
  rbind(c(1, 0, 0, 0), c(0, 0, 1, 0)),
  rbind(c(0, 0, 1, 0), c(0, 1, 0, 0)),
  rbind(c(0, 0, 1, 0), c(0, 1, 0, 0)),
  rbind(c(1, 0, 0, 0), c(1, 0, 0, 0)),
  rbind(c(1, 0, 0, 0), c(1, 0, 0, 0))
)
expected_y <- rbind(
  c(1, 0, 0),
  c(1, 0, 0),
  c(0, 1, 0),
  c(0, 1, 0),
  c(0, 0, 1),
  c(0, 0, 1)
)
for (s in seq_along(expected_x)) {
  for (pos in seq_len(nrow(expected_x[[s]]))) {
    expect_equivalent(arrays[[1]][s, pos, ], expected_x[[s]][pos, ])
  }
}
for (s in seq_len(nrow(expected_y))) {
  expect_equivalent(arrays[[2]][s, ], expected_y[s, ])
}
####### Test discard ambiguous nucleotides ###########
# Language-model generator on fasta_3 with ambiguous_nuc = "discard".
# The settings below are reused by the label-header check of this section.
testpath <- file.path("fasta_3")
vocabulary <- c("a", "c", "g", "t")
batch_size <- 6
maxlen <- 3
step <- 2
gen <- generator_fasta_lm(
  path_corpus = testpath,
  batch_size = batch_size,
  maxlen = maxlen,
  vocabulary = vocabulary,
  ambiguous_nuc = "discard",
  step = step
)
arrays <- gen()
# One matrix per sample (rows = positions) plus the matching target row.
expected_x <- list(
  rbind(c(0, 0, 0, 1), c(0, 0, 1, 0), c(1, 0, 0, 0)),
  rbind(c(1, 0, 0, 0), c(1, 0, 0, 0), c(1, 0, 0, 0)),
  rbind(c(0, 0, 0, 1), c(0, 0, 0, 1), c(0, 0, 0, 1)),
  rbind(c(0, 0, 1, 0), c(0, 0, 1, 0), c(0, 0, 1, 0)),
  rbind(c(0, 0, 1, 0), c(0, 0, 1, 0), c(0, 0, 0, 1)),
  rbind(c(1, 0, 0, 0), c(0, 1, 0, 0), c(0, 0, 1, 0))
)
expected_y <- rbind(
  c(1, 0, 0, 0),
  c(1, 0, 0, 0),
  c(0, 0, 0, 1),
  c(0, 0, 1, 0),
  c(0, 0, 0, 1),
  c(0, 0, 0, 1)
)
for (s in seq_along(expected_x)) {
  for (pos in seq_len(nrow(expected_x[[s]]))) {
    expect_equivalent(arrays[[1]][s, pos, ], expected_x[[s]][pos, ])
  }
  expect_equivalent(arrays[[2]][s, ], expected_y[s, ])
}
# label header
gen <- generator_fasta_label_header_csv(
  path_corpus = testpath,
  batch_size = batch_size,
  maxlen = maxlen,
  vocabulary = vocabulary,
  ambiguous_nuc = "discard",
  step = step,
  reverse_complement = FALSE,
  vocabulary_label = c("X", "Y")
)
arrays <- gen()
expected_x <- list(
  rbind(c(0, 0, 0, 1), c(0, 0, 1, 0), c(1, 0, 0, 0)),
  rbind(c(1, 0, 0, 0), c(1, 0, 0, 0), c(1, 0, 0, 0)),
  rbind(c(0, 0, 0, 1), c(0, 0, 0, 1), c(0, 0, 0, 1)),
  rbind(c(0, 0, 1, 0), c(0, 0, 1, 0), c(0, 0, 1, 0)),
  rbind(c(0, 0, 1, 0), c(0, 0, 1, 0), c(0, 0, 0, 1)),
  rbind(c(0, 0, 0, 1), c(0, 0, 0, 1), c(0, 0, 0, 1))
)
# The first three samples carry label X, the last three label Y.
expected_y <- rbind(
  c(1, 0),
  c(1, 0),
  c(1, 0),
  c(0, 1),
  c(0, 1),
  c(0, 1)
)
for (s in seq_along(expected_x)) {
  for (pos in seq_len(nrow(expected_x[[s]]))) {
    expect_equivalent(arrays[[1]][s, pos, ], expected_x[[s]][pos, ])
  }
  expect_equivalent(arrays[[2]][s, ], expected_y[s, ])
}
# label folder: one class per directory, ambiguous nucleotides discarded
directories <- c("fasta_2", "fasta_3")
gen <- get_generator(
  val = FALSE,
  train_type = "label_folder",
  path = directories,
  format = "fasta",
  batch_size = 6,
  maxlen = 3,
  ambiguous_nuc = "discard",
  vocabulary = c("a", "c", "g", "t"),
  reverse_complement = FALSE,
  step = 2
)
arrays <- gen()
# Samples 1-3 are expected to carry the first label, samples 4-6 the second.
expected_x <- list(
  rbind(c(1, 0, 0, 0), c(1, 0, 0, 0), c(0, 1, 0, 0)),
  rbind(c(0, 1, 0, 0), c(0, 1, 0, 0), c(0, 0, 1, 0)),
  rbind(c(0, 0, 1, 0), c(0, 0, 1, 0), c(0, 0, 0, 1)),
  rbind(c(0, 0, 0, 1), c(0, 0, 1, 0), c(1, 0, 0, 0)),
  rbind(c(1, 0, 0, 0), c(1, 0, 0, 0), c(1, 0, 0, 0)),
  rbind(c(0, 0, 0, 1), c(0, 0, 0, 1), c(0, 0, 0, 1))
)
expected_y <- rbind(
  c(1, 0),
  c(1, 0),
  c(1, 0),
  c(0, 1),
  c(0, 1),
  c(0, 1)
)
for (s in seq_along(expected_x)) {
  for (pos in seq_len(nrow(expected_x[[s]]))) {
    expect_equivalent(arrays[[1]][s, pos, ], expected_x[[s]][pos, ])
  }
  expect_equivalent(arrays[[2]][s, ], expected_y[s, ])
}
####### Test ambiguous nucleotides as 1/length(vocabulary) ###########
# Language-model generator on fasta_3 with ambiguous_nuc = "equal".
# The settings below are reused by the label-header check of this section.
testpath <- file.path("fasta_3")
vocabulary <- c("a", "c", "g", "t")
batch_size <- 4
maxlen <- 3
step <- 2
gen <- generator_fasta_lm(
  path_corpus = testpath,
  batch_size = batch_size,
  maxlen = maxlen,
  vocabulary = vocabulary,
  ambiguous_nuc = "equal",
  step = step
)
arrays <- gen()
# An ambiguous position is expected to be spread evenly over the vocabulary.
quarter <- rep(1 / 4, 4)
expected_x <- list(
  rbind(c(1, 0, 0, 0), c(1, 0, 0, 0), quarter),
  rbind(quarter, c(0, 1, 0, 0), c(0, 1, 0, 0)),
  rbind(c(0, 1, 0, 0), quarter, c(0, 0, 1, 0)),
  rbind(c(0, 0, 1, 0), c(0, 0, 1, 0), quarter)
)
expected_y <- rbind(
  c(0, 1, 0, 0),
  quarter,
  c(0, 0, 1, 0),
  c(0, 0, 0, 1)
)
for (s in seq_along(expected_x)) {
  for (pos in seq_len(nrow(expected_x[[s]]))) {
    expect_equivalent(arrays[[1]][s, pos, ], expected_x[[s]][pos, ])
  }
  expect_equivalent(arrays[[2]][s, ], expected_y[s, ])
}
# label header
gen <- generator_fasta_label_header_csv(
  path_corpus = testpath,
  batch_size = batch_size,
  maxlen = maxlen,
  vocabulary = vocabulary,
  ambiguous_nuc = "equal",
  step = step,
  reverse_complement = FALSE,
  vocabulary_label = c("X", "Y")
)
arrays <- gen()
quarter <- rep(1 / 4, 4)
expected_x <- list(
  rbind(c(1, 0, 0, 0), c(1, 0, 0, 0), quarter),
  rbind(quarter, c(0, 1, 0, 0), c(0, 1, 0, 0)),
  rbind(c(0, 1, 0, 0), quarter, c(0, 0, 1, 0)),
  rbind(c(0, 0, 1, 0), c(0, 0, 1, 0), quarter)
)
# All four samples are expected to carry label X.
for (s in seq_along(expected_x)) {
  for (pos in seq_len(nrow(expected_x[[s]]))) {
    expect_equivalent(arrays[[1]][s, pos, ], expected_x[[s]][pos, ])
  }
  expect_equivalent(arrays[[2]][s, ], c(1, 0))
}
# label folder: one class per directory, ambiguous nucleotides encoded as
# equal weights
directories <- c("fasta_2", "fasta_3")
gen <- get_generator(
  train_type = "label_folder",
  val = FALSE,
  path = directories,
  format = "fasta",
  batch_size = 4,
  maxlen = 3,
  vocabulary = c("a", "c", "g", "t"),
  reverse_complement = FALSE,
  ambiguous_nuc = "equal",
  step = 2
)
arrays <- gen()
quarter <- rep(1 / 4, 4)
# Samples 1-2 are expected to carry the first label, samples 3-4 the second.
expected_x <- list(
  rbind(c(1, 0, 0, 0), c(1, 0, 0, 0), c(0, 1, 0, 0)),
  rbind(c(0, 1, 0, 0), c(0, 1, 0, 0), c(0, 0, 1, 0)),
  rbind(c(1, 0, 0, 0), c(1, 0, 0, 0), quarter),
  rbind(quarter, c(0, 1, 0, 0), c(0, 1, 0, 0))
)
expected_y <- rbind(
  c(1, 0),
  c(1, 0),
  c(0, 1),
  c(0, 1)
)
for (s in seq_along(expected_x)) {
  for (pos in seq_len(nrow(expected_x[[s]]))) {
    expect_equivalent(arrays[[1]][s, pos, ], expected_x[[s]][pos, ])
  }
  expect_equivalent(arrays[[2]][s, ], expected_y[s, ])
}
####### Test ambiguous nucleotides as "empirical" ###########
# LM
testpath <- file.path("fasta_3")
vocabulary <- c("a", "c", "g", "t")
batch_size <- 4
maxlen <- 3
step <- 2
gen <- generator_fasta_lm(
  path_corpus = testpath,
  batch_size = batch_size,
  maxlen = maxlen,
  vocabulary = vocabulary,
  ambiguous_nuc = "empirical",
  step = step
)
arrays <- gen()
# Distribution expected at ambiguous positions of this batch.
nuc_dist <- 1 / 18 * c(8, 2, 3, 5)
expected_x <- list(
  rbind(c(1, 0, 0, 0), c(1, 0, 0, 0), nuc_dist),
  rbind(nuc_dist, c(0, 1, 0, 0), c(0, 1, 0, 0)),
  rbind(c(0, 1, 0, 0), nuc_dist, c(0, 0, 1, 0)),
  rbind(c(0, 0, 1, 0), c(0, 0, 1, 0), nuc_dist)
)
expected_y <- rbind(
  c(0, 1, 0, 0),
  nuc_dist,
  c(0, 0, 1, 0),
  c(0, 0, 0, 1)
)
for (s in seq_along(expected_x)) {
  for (pos in seq_len(nrow(expected_x[[s]]))) {
    expect_equivalent(arrays[[1]][s, pos, ], expected_x[[s]][pos, ])
  }
  expect_equivalent(arrays[[2]][s, ], expected_y[s, ])
}
# LM second file: a large step makes a single batch span more than one
# file, so two different distributions are expected at ambiguous positions.
testpath <- file.path("fasta_3")
vocabulary <- c("a", "c", "g", "t")
batch_size <- 4
maxlen <- 3
step <- 20
gen <- generator_fasta_lm(
  path_corpus = testpath,
  batch_size = batch_size,
  maxlen = maxlen,
  vocabulary = vocabulary,
  ambiguous_nuc = "empirical",
  step = step
)
arrays <- gen()
# nuc_dist_1 is expected in samples 1-2, nuc_dist_2 in samples 3-4.
nuc_dist_1 <- 1 / 18 * c(8, 2, 3, 5)
nuc_dist_2 <- 1 / 17 * c(3, 2, 6, 6)
expect_equivalent(arrays[[1]][1, 1, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][1, 2, ], c(1, 0, 0, 0))
# Compare against this section's own nuc_dist_1 rather than a value left
# over from the preceding section.
expect_equivalent(arrays[[1]][1, 3, ], nuc_dist_1)
expect_equivalent(arrays[[2]][1, ], c(0, 1, 0, 0))
expect_equivalent(arrays[[1]][2, 1, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][2, 2, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][2, 3, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[2]][2, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][3, 1, ], nuc_dist_2)
expect_equivalent(arrays[[1]][3, 2, ], c(0, 0, 1, 0))
expect_equivalent(arrays[[1]][3, 3, ], c(0, 0, 1, 0))
expect_equivalent(arrays[[2]][3, ], c(0, 0, 1, 0))
expect_equivalent(arrays[[1]][4, 1, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][4, 2, ], c(0, 1, 0, 0))
expect_equivalent(arrays[[1]][4, 3, ], c(0, 0, 1, 0))
expect_equivalent(arrays[[2]][4, ], c(0, 0, 0, 1))
# label header
testpath <- file.path("fasta_3")
vocabulary <- c("a", "c", "g", "t")
batch_size <- 4
maxlen <- 3
step <- 2
gen <- generator_fasta_label_header_csv(
  path_corpus = testpath,
  batch_size = batch_size,
  maxlen = maxlen,
  vocabulary = vocabulary,
  ambiguous_nuc = "empirical",
  step = step,
  reverse_complement = FALSE,
  vocabulary_label = c("X", "Y")
)
arrays <- gen()
nuc_dist <- 1 / 18 * c(8, 2, 3, 5)
expected_x <- list(
  rbind(c(1, 0, 0, 0), c(1, 0, 0, 0), nuc_dist),
  rbind(nuc_dist, c(0, 1, 0, 0), c(0, 1, 0, 0)),
  rbind(c(0, 1, 0, 0), nuc_dist, c(0, 0, 1, 0)),
  rbind(c(0, 0, 1, 0), c(0, 0, 1, 0), nuc_dist)
)
# All four samples are expected to carry label X.
for (s in seq_along(expected_x)) {
  for (pos in seq_len(nrow(expected_x[[s]]))) {
    expect_equivalent(arrays[[1]][s, pos, ], expected_x[[s]][pos, ])
  }
  expect_equivalent(arrays[[2]][s, ], c(1, 0))
}
# label folder: one class per directory, ambiguous nucleotides encoded with
# the "empirical" option
directories <- c("fasta_2", "fasta_3")
gen <- get_generator(
  path = directories,
  val = FALSE,
  train_type = "label_folder",
  format = "fasta",
  batch_size = 4,
  maxlen = 3,
  vocabulary = c("a", "c", "g", "t"),
  reverse_complement = FALSE,
  ambiguous_nuc = "empirical",
  step = 2
)
arrays <- gen()
nuc_dist <- 1 / 18 * c(8, 2, 3, 5)
# Samples 1-2 are expected to carry the first label, samples 3-4 the second.
expected_x <- list(
  rbind(c(1, 0, 0, 0), c(1, 0, 0, 0), c(0, 1, 0, 0)),
  rbind(c(0, 1, 0, 0), c(0, 1, 0, 0), c(0, 0, 1, 0)),
  rbind(c(1, 0, 0, 0), c(1, 0, 0, 0), nuc_dist),
  rbind(nuc_dist, c(0, 1, 0, 0), c(0, 1, 0, 0))
)
expected_y <- rbind(
  c(1, 0),
  c(1, 0),
  c(0, 1),
  c(0, 1)
)
for (s in seq_along(expected_x)) {
  for (pos in seq_len(nrow(expected_x[[s]]))) {
    expect_equivalent(arrays[[1]][s, pos, ], expected_x[[s]][pos, ])
  }
  expect_equivalent(arrays[[2]][s, ], expected_y[s, ])
}
############# padding/amb nucleotide LM ############
gen <- generator_fasta_lm(
  path_corpus = "fasta_3",
  batch_size = 3,
  maxlen = 15,
  step = 1,
  ambiguous_nuc = "equal"
)
arrays <- gen()
equal_vector <- rep(0.25, 4)
# Shorthand for the encodings that appear in the expected samples.
pad <- c(0, 0, 0, 0)
nuc_a <- c(1, 0, 0, 0)
nuc_c <- c(0, 1, 0, 0)
nuc_g <- c(0, 0, 1, 0)
nuc_t <- c(0, 0, 0, 1)
# Samples 1 and 3 are checked at all 15 positions, followed by their target.
sample_ids <- c(1, 3)
expected_x <- list(
  rbind(
    pad, pad, pad, nuc_a, nuc_a,
    equal_vector, nuc_c, nuc_c, equal_vector, nuc_g,
    nuc_g, equal_vector, nuc_t, nuc_g, nuc_a
  ),
  rbind(
    pad, pad, pad, pad, pad,
    pad, equal_vector, nuc_g, nuc_g, nuc_g,
    nuc_g, nuc_t, nuc_t, nuc_t, equal_vector
  )
)
expected_y <- rbind(nuc_a, nuc_t)
for (k in seq_along(sample_ids)) {
  s <- sample_ids[k]
  for (pos in seq_len(nrow(expected_x[[k]]))) {
    expect_equivalent(arrays[[1]][s, pos, ], expected_x[[k]][pos, ])
  }
  expect_equivalent(arrays[[2]][s, ], expected_y[k, ])
}
############# padding/amb nucleotide, label_header ############
gen <- generator_fasta_label_header_csv(
  path_corpus = "fasta_3",
  batch_size = 3,
  maxlen = 15,
  step = 1,
  vocabulary_label = c("X", "Y"),
  reverse_complement = FALSE,
  ambiguous_nuc = "empirical"
)
# Distributions expected at ambiguous positions of sample 1 and sample 3.
nuc_dist_1 <- 1 / 18 * c(8, 2, 3, 5)
nuc_dist_2 <- 1 / 17 * c(3, 2, 6, 6)
arrays <- gen()
# Shorthand for the encodings that appear in the expected samples.
pad <- c(0, 0, 0, 0)
nuc_a <- c(1, 0, 0, 0)
nuc_c <- c(0, 1, 0, 0)
nuc_g <- c(0, 0, 1, 0)
nuc_t <- c(0, 0, 0, 1)
# Samples 1 and 3 are checked at every one of the 15 positions. Position 15
# of sample 1 used to go unchecked because position 14 was asserted twice.
# Its expected value "a" is the nucleotide the language-model padding test
# on the same corpus expects as the target that follows this window.
sample_ids <- c(1, 3)
expected_x <- list(
  rbind(
    pad, pad, nuc_a, nuc_a, nuc_dist_1,
    nuc_c, nuc_c, nuc_dist_1, nuc_g, nuc_g,
    nuc_dist_1, nuc_t, nuc_g, nuc_a, nuc_a
  ),
  rbind(
    pad, pad, pad, pad, pad,
    nuc_dist_2, nuc_g, nuc_g, nuc_g, nuc_g,
    nuc_t, nuc_t, nuc_t, nuc_dist_2, nuc_t
  )
)
expected_y <- rbind(
  c(1, 0), # X
  c(0, 1)  # Y
)
for (k in seq_along(sample_ids)) {
  s <- sample_ids[k]
  for (pos in seq_len(nrow(expected_x[[k]]))) {
    expect_equivalent(arrays[[1]][s, pos, ], expected_x[[k]][pos, ])
  }
  expect_equivalent(arrays[[2]][s, ], expected_y[k, ])
}
############# padding/amb nucleotide, label_folder ############
directories <- c("fasta_2", "fasta_3")
gen <- get_generator(
  path = directories,
  val = FALSE,
  train_type = "label_folder",
  padding = TRUE,
  format = "fasta",
  batch_size = 6,
  maxlen = 15,
  ambiguous_nuc = "equal",
  vocabulary = c("a", "c", "g", "t"),
  reverse_complement = FALSE,
  step = 1
)
equal_vector <- rep(0.25, 4)
arrays <- gen()
# Shorthand for the encodings that appear in the expected samples.
pad <- c(0, 0, 0, 0)
nuc_a <- c(1, 0, 0, 0)
nuc_c <- c(0, 1, 0, 0)
nuc_g <- c(0, 0, 1, 0)
nuc_t <- c(0, 0, 0, 1)
# Samples 1 and 6 are checked at every one of the 15 positions. Position 15
# of sample 1 used to go unchecked because position 14 was asserted twice.
# Its expected value "t" completes the "aaccggtt" start of a.fasta that the
# other tests on fasta_2 rely on.
sample_ids <- c(1, 6)
expected_x <- list(
  rbind(
    pad, pad, pad, pad, pad,
    pad, pad, nuc_a, nuc_a, nuc_c,
    nuc_c, nuc_g, nuc_g, nuc_t, nuc_t
  ),
  rbind(
    pad, pad, pad, pad, pad,
    equal_vector, nuc_g, nuc_g, nuc_g, nuc_g,
    nuc_t, nuc_t, nuc_t, equal_vector, nuc_t
  )
)
# Sample 1 is expected to carry the first label, sample 6 the second.
expected_y <- rbind(
  c(1, 0),
  c(0, 1)
)
for (k in seq_along(sample_ids)) {
  s <- sample_ids[k]
  for (pos in seq_len(nrow(expected_x[[k]]))) {
    expect_equivalent(arrays[[1]][s, pos, ], expected_x[[k]][pos, ])
  }
  expect_equivalent(arrays[[2]][s, ], expected_y[k, ])
}
###### more than 2 files in one batch ######
# LM
gen <- generator_fasta_lm(
  path_corpus = "fasta_3",
  batch_size = 8,
  maxlen = 12,
  max_iter = 10000,
  step = 50,
  ambiguous_nuc = "empirical"
)
nuc_dist_1 <- 1 / 18 * c(8, 2, 3, 5)
nuc_dist_2 <- 1 / 17 * c(3, 2, 6, 6)
arrays <- gen()
# Shorthand for the encodings that appear in the expected samples.
pad <- c(0, 0, 0, 0)
nuc_a <- c(1, 0, 0, 0)
nuc_c <- c(0, 1, 0, 0)
nuc_g <- c(0, 0, 1, 0)
# Leading positions of each of the eight samples; the number of positions
# checked differs per sample. Samples 6-8 are expected to repeat 1-3.
expected_x <- list(
  rbind(nuc_a, nuc_a, nuc_dist_1, nuc_c),
  rbind(pad, pad, pad, nuc_a, nuc_a, nuc_a),
  rbind(pad, pad, pad, nuc_dist_2, nuc_g, nuc_g),
  rbind(pad, pad, pad, pad, pad, nuc_a, nuc_c, nuc_g),
  rbind(
    pad, pad, nuc_dist_2, nuc_dist_2, nuc_dist_2,
    nuc_dist_2, nuc_dist_2, nuc_a, nuc_dist_2
  ),
  rbind(nuc_a, nuc_a, nuc_dist_1, nuc_c),
  rbind(pad, pad, pad, nuc_a, nuc_a, nuc_a),
  rbind(pad, pad, pad, nuc_dist_2, nuc_g, nuc_g)
)
for (s in seq_along(expected_x)) {
  for (pos in seq_len(nrow(expected_x[[s]]))) {
    expect_equivalent(arrays[[1]][s, pos, ], expected_x[[s]][pos, ])
  }
}
arrays <- gen()
expect_equivalent(arrays[[1]][1, 1, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][1, 2, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][1, 3, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][1, 4, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][1, 5, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][1, 6, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][1, 7, ], c(0, 1, 0, 0))
expect_equivalent(arrays[[1]][1, 8, ], c(0, 0, 1, 0))
expect_equivalent(arrays[[1]][2, 1, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][2, 2, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][2, 3, ], nuc_dist_2)
expect_equivalent(arrays[[1]][2, 4, ], nuc_dist_2)
expect_equivalent(arrays[[1]][2, 5, ], nuc_dist_2)
expect_equivalent(arrays[[1]][2, 6, ], nuc_dist_2)
expect_equivalent(arrays[[1]][2, 7, ], nuc_dist_2)
expect_equivalent(arrays[[1]][2, 8, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][2, 9, ], nuc_dist_2)
expect_equivalent(arrays[[1]][3, 1, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][3, 2, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][3, 3, ], nuc_dist_1)
expect_equivalent(arrays[[1]][3, 4, ], c(0, 1, 0, 0))
expect_equivalent(arrays[[1]][4, 1, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][4, 2, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][4, 3, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][4, 4, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][4, 5, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][4, 6, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][5, 1, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][5, 2, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][5, 3, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][5, 4, ], nuc_dist_2)
expect_equivalent(arrays[[1]][5, 5, ], c(0, 0, 1, 0))
expect_equivalent(arrays[[1]][5, 6, ], c(0, 0, 1, 0))
expect_equivalent(arrays[[1]][6, 1, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][6, 2, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][6, 3, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][6, 4, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][6, 5, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][6, 6, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][6, 7, ], c(0, 1, 0, 0))
expect_equivalent(arrays[[1]][6, 8, ], c(0, 0, 1, 0))
expect_equivalent(arrays[[1]][7, 1, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][7, 2, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][7, 3, ], nuc_dist_2)
expect_equivalent(arrays[[1]][7, 4, ], nuc_dist_2)
expect_equivalent(arrays[[1]][7, 5, ], nuc_dist_2)
expect_equivalent(arrays[[1]][7, 6, ], nuc_dist_2)
expect_equivalent(arrays[[1]][7, 7, ], nuc_dist_2)
expect_equivalent(arrays[[1]][7, 8, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][7, 9, ], nuc_dist_2)
expect_equivalent(arrays[[1]][8, 1, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][8, 2, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][8, 3, ], nuc_dist_1)
expect_equivalent(arrays[[1]][8, 4, ], c(0, 1, 0, 0))
# label header
# Same fixture directory as the LM case, read through the header/csv label
# generator. Two batches of 8 samples are checked against five expected
# layouts that repeat in a fixed order across both batches.
gen <- generator_fasta_label_header_csv(
  path_corpus = "fasta_3",
  batch_size = 8,
  maxlen = 12,
  max_iter = 10000,
  step = 50,
  ambiguous_nuc = "empirical",
  reverse_complement = FALSE,
  vocabulary_label = c("X", "Y")
)
# Rows expected where the generator emits a distribution rather than a one-hot
# vector (presumably per-file nucleotide frequencies -- confirm against fixture).
nuc_dist_1 <- 1/18*c(8, 2, 3, 5)
nuc_dist_2 <- 1/17*c(3, 2, 6, 6)
blank <- c(0, 0, 0, 0)
hot_1 <- c(1, 0, 0, 0)
hot_2 <- c(0, 1, 0, 0)
hot_3 <- c(0, 0, 1, 0)
# Entry k of a layout is the expected row at position k of the sample.
expected_layouts <- list(
  list(hot_1, hot_1, nuc_dist_1, hot_2),
  list(blank, blank, hot_1, hot_1, hot_1),
  list(blank, blank, nuc_dist_2, hot_3, hot_3),
  list(blank, blank, blank, blank, hot_1, hot_2, hot_3),
  list(blank, nuc_dist_2, nuc_dist_2, nuc_dist_2, nuc_dist_2, nuc_dist_2,
       hot_1, nuc_dist_2)
)
# The counter is shared by both batches so the second batch continues the
# layout cycle where the first one stopped.
samples_checked <- 0
for (batch_no in seq_len(2)) {
  arrays <- gen()
  for (sample_idx in seq_len(8)) {
    layout <- expected_layouts[[(samples_checked %% length(expected_layouts)) + 1]]
    for (pos in seq_along(layout)) {
      expect_equivalent(arrays[[1]][sample_idx, pos, ], layout[[pos]])
    }
    samples_checked <- samples_checked + 1
  }
}
# label folder
# Two directories are passed as separate classes to a label_folder generator
# with batch size 20. Three samples are spot-checked in each of two consecutive
# batches: all 12 positions of the input plus the label row.
directories <- c("fasta_2", "fasta_3")
gen <- get_generator(
  path = directories,
  train_type = "label_folder",
  batch_size = 20,
  maxlen = 12,
  val = FALSE,
  padding = TRUE,
  ambiguous_nuc = "empirical",
  vocabulary = c("a", "c", "g", "t"),
  reverse_complement = FALSE,
  step = 1
)
# Row expected where the generator emits a distribution rather than a one-hot
# vector (presumably nucleotide frequencies of a file -- confirm against fixture).
nuc_dist_1 <- 1/18*c(8, 2, 3, 5)
# Not referenced by the assertions of this section; kept so the variable stays
# defined for any code that follows.
nuc_dist_2 <- 1/17*c(3, 2, 6, 6)
blank <- c(0, 0, 0, 0)
nt_a <- c(1, 0, 0, 0)
nt_c <- c(0, 1, 0, 0)
nt_g <- c(0, 0, 1, 0)
nt_t <- c(0, 0, 0, 1)
# Expected sample carrying label c(1, 0): four all-zero rows, then aaccggtt.
sample_label_1 <- list(
  rows = list(blank, blank, blank, blank,
              nt_a, nt_a, nt_c, nt_c, nt_g, nt_g, nt_t, nt_t),
  label = c(1, 0)
)
# Expected sample carrying label c(0, 1): three positions hold nuc_dist_1.
sample_label_2 <- list(
  rows = list(nt_a, nuc_dist_1, nt_c, nt_c, nuc_dist_1, nt_g,
              nt_g, nuc_dist_1, nt_t, nt_g, nt_a, nt_a),
  label = c(0, 1)
)
# One entry per batch: the sample indices to inspect. In both batches the first
# index is expected to match sample_label_1 and the other two sample_label_2.
inspected_samples <- list(c(9, 12, 18), c(7, 14, 20))
for (sample_ids in inspected_samples) {
  arrays <- gen()
  for (k in seq_along(sample_ids)) {
    expected <- if (k == 1) sample_label_1 else sample_label_2
    s <- sample_ids[k]
    for (pos in seq_along(expected$rows)) {
      expect_equivalent(arrays[[1]][s, pos, ], expected$rows[[pos]])
    }
    expect_equivalent(arrays[[2]][s, ], expected$label)
  }
}
# test quality scores LM
# Language-model generator on fastq input with use_quality_score = TRUE:
# input and target rows are compared against quality vectors built with
# create_quality_vector().
gen <- generator_fasta_lm(
  path_corpus = "fastq",
  format = "fastq",
  batch_size = 10,
  maxlen = 3,
  max_iter = 10000,
  vocabulary = c("a", "c", "g", "t"),
  verbose = FALSE,
  shuffle_file_order = FALSE,
  step = 2,
  seed = 1234,
  shuffle_input = FALSE,
  file_limit = NULL,
  path_file_log = NULL,
  reverse_complement = FALSE,
  output_format = "target_right",
  ambiguous_nuc = "zeros",
  use_quality_score = TRUE,
  proportion_per_seq = NULL,
  padding = FALSE
)
# Expected rows for vocabulary positions 1 to 4, each built from its own
# quality character. `c` and `t` are plain variables here; calls such as
# c(...) still resolve to the base function.
a <- create_quality_vector(prob = quality_to_probability("J"), pos = 1, voc_length = 4)
c <- create_quality_vector(prob = quality_to_probability("C"), pos = 2, voc_length = 4)
g <- create_quality_vector(prob = quality_to_probability("G"), pos = 3, voc_length = 4)
t <- create_quality_vector(prob = quality_to_probability("?"), pos = 4, voc_length = 4)
arrays <- gen()
# One entry per checked sample (samples 1 to 8), three positions each.
expected_inputs <- list(
  list(a, a, c),
  list(c, c, g),
  list(a, c, g),
  list(g, t, a),
  list(c, g, t),
  list(t, c, g),
  list(a, t, a),
  list(a, a, c)
)
for (s in seq_along(expected_inputs)) {
  for (pos in seq_along(expected_inputs[[s]])) {
    expect_equivalent(arrays[[1]][s, pos, ], expected_inputs[[s]][[pos]])
  }
}
# Expected target row for samples 1 to 8.
expected_targets <- list(c, g, t, c, c, t, t, c)
for (s in seq_along(expected_targets)) {
  expect_equivalent(arrays[[2]][s, ], expected_targets[[s]])
}
# test quality scores label
# Label-folder generator on fastq input with use_quality_score = TRUE: all ten
# samples of the first batch are compared row by row against quality vectors,
# and the label row is checked for the first and last sample.
gen <- generator_fasta_label_folder(
  path_corpus = "fastq",
  format = "fastq",
  batch_size = 10,
  maxlen = 3,
  max_iter = 10000,
  vocabulary = c("a", "c", "g", "t"),
  verbose = FALSE,
  shuffle_file_order = FALSE,
  step = 2,
  seed = 1234,
  shuffle_input = FALSE,
  file_limit = NULL,
  path_file_log = NULL,
  reverse_complement = FALSE,
  ambiguous_nuc = "zeros",
  use_quality_score = TRUE,
  proportion_per_seq = NULL,
  num_targets = 2,
  ones_column = 1,
  padding = FALSE
)
# Expected rows for vocabulary positions 1 to 4. `c` and `t` are plain
# variables here; calls such as c(1, 0) below still resolve to base::c because
# R skips non-function objects when looking up a function name.
a <- create_quality_vector(prob = quality_to_probability("J"), pos = 1, voc_length = 4)
c <- create_quality_vector(prob = quality_to_probability("C"), pos = 2, voc_length = 4)
g <- create_quality_vector(prob = quality_to_probability("G"), pos = 3, voc_length = 4)
t <- create_quality_vector(prob = quality_to_probability("?"), pos = 4, voc_length = 4)
arrays <- gen()
# One entry per sample (1 to 10), three positions each.
expected_inputs <- list(
  list(a, a, c),
  list(c, c, g),
  list(a, c, g),
  list(g, t, a),
  list(a, c, g),
  list(c, g, t),
  list(t, c, g),
  list(a, t, a),
  list(a, t, a),
  list(a, a, c)
)
for (s in seq_along(expected_inputs)) {
  for (pos in seq_along(expected_inputs[[s]])) {
    expect_equivalent(arrays[[1]][s, pos, ], expected_inputs[[s]][[pos]])
  }
}
# Label row of the first and the last sample.
for (s in c(1, 10)) {
  expect_equivalent(arrays[[2]][s, ], c(1,0))
}
## test read data with quality
# With read_data = TRUE the first output element is indexed as two arrays
# ([[1]][[1]] and [[1]][[2]]). Each is compared per sample against a 6-row
# matrix of quality vectors.
gen <- generator_fasta_label_folder(
  path_corpus = "read_data",
  format = "fastq",
  batch_size = 5,
  maxlen = 12,
  max_iter = 10000,
  vocabulary = c("a", "c", "g", "t"),
  verbose = FALSE,
  shuffle_file_order = FALSE,
  step = 2,
  seed = 1234,
  shuffle_input = FALSE,
  file_limit = NULL,
  path_file_log = NULL,
  read_data = TRUE,
  reverse_complement = FALSE,
  ambiguous_nuc = "zeros",
  use_quality_score = TRUE,
  proportion_per_seq = NULL,
  num_targets = 2,
  ones_column = 1,
  padding = FALSE
)
# Expected rows for vocabulary positions 1 to 4 (`c` and `t` are variables
# here, not the base functions).
a <- create_quality_vector(prob = quality_to_probability("J"), pos = 1, voc_length = 4)
c <- create_quality_vector(prob = quality_to_probability("C"), pos = 2, voc_length = 4)
g <- create_quality_vector(prob = quality_to_probability("G"), pos = 3, voc_length = 4)
t <- create_quality_vector(prob = quality_to_probability("?"), pos = 4, voc_length = 4)
# The four expected (first array, second array) pairs; samples cycle through
# them in this order.
expected_pairs <- list(
  list(rbind(a,a,a,c,c,c), rbind(c,c,c,g,g,g)),
  list(rbind(a,c,a,c,a,c), rbind(c,g,c,g,c,g)),
  list(rbind(g,g,g,t,t,t), rbind(t,t,t,g,g,g)),
  list(rbind(g,t,g,t,g,t), rbind(t,g,t,g,t,g))
)
# Samples inspected per batch: all five in the first batch, the first three in
# the second. `samples_seen` carries over so the cycle continues across batches.
samples_seen <- 0
for (n_inspected in c(5, 3)) {
  arrays <- gen()
  for (s in seq_len(n_inspected)) {
    pair <- expected_pairs[[((samples_seen + s - 1) %% length(expected_pairs)) + 1]]
    expect_equivalent(arrays[[1]][[1]][s, , ], pair[[1]])
    expect_equivalent(arrays[[1]][[2]][s, , ], pair[[2]])
  }
  # advance by the full batch size (5), not by the number of inspected samples
  samples_seen <- samples_seen + 5
}
# additional input LM
# With added_label_path set, the first output element is indexed as two
# parts: [[1]][[1]] is checked row-wise against 4-element vectors
# (presumably the values read from "label.csv" -- confirm against the fixture)
# and [[1]][[2]] is checked at two positions of sample 10.
gen <- generator_fasta_lm(
  path_corpus = "fasta_3",
  format = "fasta",
  batch_size = 10,
  maxlen = 5,
  vocabulary = c("a", "c", "g", "t"),
  shuffle_file_order = FALSE,
  step = 4,
  shuffle_input = FALSE,
  reverse_complement = FALSE,
  output_format = "target_right",
  ambiguous_nuc = "zeros",
  added_label_path = "label.csv",
  add_input_as_seq = FALSE,
  padding = FALSE
)
arrays <- gen()
added_third <- c(0, 0, 1, 0)
added_first <- c(1, 0, 0, 0)
# Expected added-input row for samples 1 to 10: four of one kind, five of the
# other, then the first kind again.
expected_added <- c(
  rep(list(added_third), 4),
  rep(list(added_first), 5),
  list(added_third)
)
for (s in seq_along(expected_added)) {
  expect_equivalent(arrays[[1]][[1]][s,], expected_added[[s]])
}
# Sequence input of the last sample at positions 1 and 3.
expect_equivalent(arrays[[1]][[2]][10, 1, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][[2]][10, 3, ], c(0, 0, 0, 0))
# additional input label_folder
# Three class directories with an added label input. The generators are
# initialised once; the wrapper is then built twice from the same gen_list and
# one batch is drawn from each wrapper.
dir <- c("label_folder/x", "label_folder/y", "label_folder/z")
gen_list <- generator_initialize(
  directories = dir,
  format = "fasta",
  batch_size = 15,
  maxlen = 4,
  step = 2,
  val = FALSE,
  padding = FALSE,
  added_label_path = "label.csv",
  add_input_as_seq = FALSE
)
# Distinct added-input rows that occur in the expectations, addressed by key.
added_rows <- list(
  x0 = c(1, 0, 0, 0), x1 = c(1, 0, 0, 1),
  y0 = c(0, 1, 0, 0), y1 = c(0, 1, 0, 1),
  z0 = c(0, 0, 1, 0), z1 = c(0, 0, 1, 1)
)
gen <- generator_fasta_label_folder_wrapper(val = FALSE, path = dir, gen_list = gen_list)
arrays <- gen()
# Expected added-input row for samples 1 to 15 of the first batch.
first_batch_keys <- c("x0", "x0", "x0", "x0", "x1",
                      "y0", "y0", "y1", "y1", "y0",
                      "z0", "z0", "z1", "z1", "z0")
for (s in seq_along(first_batch_keys)) {
  expect_equivalent(arrays[[1]][[1]][s, ], added_rows[[first_batch_keys[s]]])
}
# Spot checks on the sequence input: (sample, position, expected row).
sequence_checks <- list(
  list(5, 1, c(1, 0, 0, 0)),
  list(5, 2, c(0, 0, 1, 0)),
  list(10, 1, c(0, 0, 1, 0)),
  list(10, 2, c(0, 1, 0, 0)),
  list(15, 1, c(0, 0, 0, 1)),
  list(15, 2, c(0, 0, 0, 1))
)
for (chk in sequence_checks) {
  expect_equivalent(arrays[[1]][[2]][chk[[1]], chk[[2]], ], chk[[3]])
}
# A fresh wrapper around the same gen_list; the expectations below differ from
# the first batch (presumably the underlying generators keep their position --
# confirm in generator_initialize).
gen <- generator_fasta_label_folder_wrapper(val = FALSE, path = dir, gen_list = gen_list)
arrays <- gen()
second_batch_keys <- c("x1", "x1", "x0", "x0", "x0",
                       "y0", "y1", "y1", "y0", "y0",
                       "z0", "z1", "z1", "z0", "z0")
for (s in seq_along(second_batch_keys)) {
  expect_equivalent(arrays[[1]][[1]][s, ], added_rows[[second_batch_keys[s]]])
}
## read data with quality and 2 classes
# Two label folders of fastq read data with quality scores. Each sample of the
# inspected batch is compared against 6-row matrices made of one repeated
# quality vector.
gen <- get_generator(
  path = c("read_data_2/label_a", "read_data_2/label_b"),
  train_type = "label_folder",
  format = "fastq",
  batch_size = 4,
  maxlen = 12,
  vocabulary = c("a", "c", "g", "t"),
  verbose = FALSE,
  shuffle_file_order = FALSE,
  step = 1,
  seed = 1234,
  shuffle_input = FALSE,
  file_limit = NULL,
  path_file_log = NULL,
  reverse_complement = FALSE,
  val = FALSE,
  ambiguous_nuc = "zero",
  proportion_per_seq = NULL,
  read_data = TRUE,
  use_quality_score = TRUE,
  padding = FALSE,
  added_label_path = NULL,
  skip_amb_nuc = NULL
)
# NOTE(review): this first batch is drawn and never inspected; only the batch
# drawn further below is asserted on. Confirm that skipping it is intentional.
arrays <- gen()
# Expected rows for vocabulary positions 1 to 4 (`c` and `t` are variables
# here, not the base functions).
a <- create_quality_vector(prob = quality_to_probability("J"), pos = 1, voc_length = 4)
c <- create_quality_vector(prob = quality_to_probability("C"), pos = 2, voc_length = 4)
g <- create_quality_vector(prob = quality_to_probability("G"), pos = 3, voc_length = 4)
t <- create_quality_vector(prob = quality_to_probability("?"), pos = 4, voc_length = 4)
arrays <- gen()
# Expected (first array, second array) pair for samples 1 to 4.
pair_ac <- list(rbind(a,a,a,a,a,a), rbind(c,c,c,c,c,c))
pair_gt <- list(rbind(g,g,g,g,g,g), rbind(t,t,t,t,t,t))
expected_pairs <- list(pair_ac, pair_ac, pair_gt, pair_gt)
for (s in seq_along(expected_pairs)) {
  expect_equivalent(arrays[[1]][[1]][s, , ], expected_pairs[[s]][[1]])
  expect_equivalent(arrays[[1]][[2]][s, , ], expected_pairs[[s]][[2]])
}
### get output tensor from csv file + concat
# Targets are read from a csv file (target_from_csv) and the vocabulary has a
# fifth entry "Z", which is also used as concat_seq = "ZZ". Two batches are
# drawn; each is checked at selected input positions and on all five target rows.
testpath <- file.path("fasta_2")
label_from_csv <- "output_label.csv"
gen <- generator_fasta_label_header_csv(
  path_corpus = testpath,
  batch_size = 5,
  maxlen = 10,
  step = 10,
  vocabulary = c("a", "c", "g", "t", "Z"),
  reverse_complement = FALSE,
  vocabulary_label = c("w", "x", "y"),
  format = "fasta",
  max_iter = 10000,
  verbose = FALSE,
  shuffle_file_order = FALSE,
  seed = 1234,
  shuffle_input = FALSE,
  file_limit = NULL,
  path_file_log = NULL,
  ambiguous_nuc = "zero",
  proportion_per_seq = NULL,
  read_data = FALSE,
  use_quality_score = FALSE,
  padding = TRUE,
  skip_amb_nuc = NULL,
  max_samples = NULL,
  concat_seq = "ZZ",
  added_label_path = NULL,
  add_input_as_seq = NULL,
  target_from_csv = label_from_csv
)
# Row k of the 5x5 identity matrix is the one-hot vector with a 1 in column k.
one_hot <- diag(5)
# Per batch: input checks as rows of (sample, position, hot column) and the
# expected target row for samples 1 to 5. In both batches positions 9 and 10
# of sample 1 are expected to be hot in column 5, the column of "Z".
batch_expectations <- list(
  list(
    inputs = rbind(c(1, 8, 4), c(1, 9, 5), c(1, 10, 5),
                   c(4, 1, 1), c(4, 2, 1), c(4, 3, 1), c(4, 4, 1)),
    targets = list(1:4, 1:4, 1:4, 11:14, 11:14)
  ),
  list(
    inputs = rbind(c(1, 8, 1), c(1, 9, 5), c(1, 10, 5),
                   c(2, 1, 1), c(2, 2, 1), c(2, 3, 2)),
    targets = list(11:14, 1:4, 1:4, 1:4, 11:14)
  )
)
for (expected in batch_expectations) {
  arrays <- gen()
  for (r in seq_len(nrow(expected$inputs))) {
    chk <- expected$inputs[r, ]
    expect_equivalent(arrays[[1]][chk[1], chk[2], ], one_hot[chk[3], ])
  }
  for (s in seq_along(expected$targets)) {
    expect_equivalent(arrays[[2]][s, ], expected$targets[[s]])
  }
}
## 2 added input files LM
# Two added input files, the first with add_input_as_seq = FALSE and the
# second with TRUE. [[1]][[1]] is checked against 4-element vectors and
# [[1]][[2]] against 4x4 matrices, for all ten samples of two batches.
gen <- generator_fasta_lm(
  path_corpus = "fasta_3",
  format = "fasta",
  batch_size = 10,
  maxlen = 5,
  vocabulary = c("a", "c", "g", "t"),
  shuffle_file_order = FALSE,
  step = 4,
  shuffle_input = FALSE,
  reverse_complement = FALSE,
  output_format = "target_right",
  ambiguous_nuc = "zeros",
  added_label_path = c("label.csv", "add_seq.csv"),
  add_input_as_seq = c(FALSE, TRUE),
  padding = FALSE
)
v1 <- c(0, 0, 1, 0)
v2 <- c(1, 0, 0, 0)
# m1 is the 4x4 identity matrix; m2 has two all-zero rows.
m1 <- diag(4)
m2 <- rbind(
  c(0, 0, 0, 1),
  c(0, 0, 0, 0),
  c(0, 1, 0, 0),
  c(0, 0, 0, 0)
)
expected_vectors <- list(v1, v2)
expected_matrices <- list(m1, m2)
# For each batch, which variant (1 -> v1/m1, 2 -> v2/m2) samples 1 to 10 take.
batch_variants <- list(
  c(1, 1, 1, 1, 2, 2, 2, 2, 2, 1),
  c(1, 1, 1, 2, 2, 2, 2, 2, 1, 1)
)
for (variants in batch_variants) {
  arrays <- gen()
  for (s in seq_along(variants)) {
    expect_equivalent(arrays[[1]][[1]][s, ], expected_vectors[[variants[s]]])
  }
  for (s in seq_along(variants)) {
    expect_equivalent(arrays[[1]][[2]][s, , ], expected_matrices[[variants[s]]])
  }
}
## 2 added input files, label_folder
# Three class directories with two added input files (the first as a plain
# vector, the second with add_input_as_seq = TRUE). For two batches, every
# sample's [[1]][[1]] row and [[1]][[2]] matrix is compared with the expected
# variant.
dir <- c("label_folder/x", "label_folder/y", "label_folder/z")
gen <- get_generator(
  path = dir,
  train_type = "label_folder",
  format = "fasta",
  batch_size = 15,
  maxlen = 4,
  step = 2,
  val = FALSE,
  padding = FALSE,
  added_label_path = c("label.csv", "add_seq.csv"),
  add_input_as_seq = c(FALSE, TRUE)
)
# Expected vectors: two variants per class directory.
x1 <- c(1, 0, 0, 0)
x2 <- c(1, 0, 0, 1)
y1 <- c(0, 1, 0, 0)
y2 <- c(0, 1, 0, 1)
z1 <- c(0, 0, 1, 0)
z2 <- c(0, 0, 1, 1)
# Expected 4x4 matrices: the *1 variants repeat a single row, the *2 variants
# alternate between two rows.
mx1 <- rbind(c(1, 0, 0, 0), c(1, 0, 0, 0), c(1, 0, 0, 0), c(1, 0, 0, 0))
mx2 <- rbind(c(1, 0, 0, 0), c(0, 1, 0, 0), c(1, 0, 0, 0), c(0, 1, 0, 0))
my1 <- rbind(c(0, 1, 0, 0), c(0, 1, 0, 0), c(0, 1, 0, 0), c(0, 1, 0, 0))
my2 <- rbind(c(0, 1, 0, 0), c(0, 0, 1, 0), c(0, 1, 0, 0), c(0, 0, 1, 0))
mz1 <- rbind(c(0, 0, 1, 0), c(0, 0, 1, 0), c(0, 0, 1, 0), c(0, 0, 1, 0))
mz2 <- rbind(c(0, 0, 1, 0), c(0, 0, 0, 1), c(0, 0, 1, 0), c(0, 0, 0, 1))
# Lookup tables sharing the same keys, so one key sequence per batch describes
# both the expected vectors and the expected matrices.
expected_vectors <- list(x1 = x1, x2 = x2, y1 = y1, y2 = y2, z1 = z1, z2 = z2)
expected_matrices <- list(x1 = mx1, x2 = mx2, y1 = my1, y2 = my2, z1 = mz1, z2 = mz2)
batch_keys <- list(
  c("x1", "x1", "x1", "x1", "x2",
    "y1", "y1", "y2", "y2", "y1",
    "z1", "z1", "z2", "z2", "z1"),
  c("x2", "x2", "x1", "x1", "x1",
    "y1", "y2", "y2", "y1", "y1",
    "z1", "z2", "z2", "z1", "z1")
)
for (keys in batch_keys) {
  arrays <- gen()
  for (s in seq_along(keys)) {
    expect_equivalent(arrays[[1]][[1]][s, ], expected_vectors[[keys[s]]])
  }
  for (s in seq_along(keys)) {
    expect_equivalent(arrays[[1]][[2]][s, , ], expected_matrices[[keys[s]]])
  }
}
# 3 targets, target right
gen <- generator_fasta_lm(
  path_corpus = "fasta_3",
  output_format = "target_right",
  target_len = 3,
  maxlen = 4,
  step = 5,
  batch_size = 5,
  padding = FALSE
)
arrays <- gen()
# input of the first sample in the batch
expect_equivalent(
  arrays[[1]][1, , ],
  rbind(c(1, 0, 0, 0),
        c(1, 0, 0, 0),
        c(0, 0, 0, 0),
        c(0, 1, 0, 0))
)
# input of the fifth sample in the batch
expect_equivalent(
  arrays[[1]][5, , ],
  rbind(c(1, 0, 0, 0),
        c(0, 1, 0, 0),
        c(0, 0, 1, 0),
        c(0, 0, 0, 1))
)
# one expected matrix (one row per sample) for each of the 3 target positions
m1 <- rbind(c(0, 1, 0, 0),
            c(0, 0, 0, 1),
            c(0, 0, 0, 0),
            c(0, 0, 1, 0),
            c(1, 0, 0, 0))
m2 <- rbind(c(0, 0, 0, 0),
            c(0, 0, 1, 0),
            c(0, 0, 0, 1),
            c(0, 0, 0, 1),
            c(0, 1, 0, 0))
m3 <- rbind(c(0, 0, 1, 0),
            c(1, 0, 0, 0),
            c(0, 0, 0, 1),
            c(0, 0, 0, 1),
            c(0, 0, 1, 0))
want_targets <- list(m1, m2, m3)
for (target_pos in seq_along(want_targets)) {
  expect_equivalent(arrays[[2]][, target_pos, ], want_targets[[target_pos]])
}
# 3 targets, target middle cnn
gen <- generator_fasta_lm(
  path_corpus = "fasta_3",
  output_format = "target_middle_cnn",
  target_len = 3,
  maxlen = 4,
  step = 5,
  batch_size = 5,
  padding = FALSE
)
arrays <- gen()
# input of the first sample in the batch
expect_equivalent(
  arrays[[1]][1, , ],
  rbind(c(1, 0, 0, 0),
        c(1, 0, 0, 0),
        c(0, 0, 0, 0),
        c(0, 0, 1, 0))
)
# input of the fifth sample in the batch
expect_equivalent(
  arrays[[1]][5, , ],
  rbind(c(1, 0, 0, 0),
        c(0, 1, 0, 0),
        c(0, 1, 0, 0),
        c(0, 0, 1, 0))
)
# one expected matrix (one row per sample) for each of the 3 target positions
m1 <- rbind(c(0, 0, 0, 0),
            c(0, 0, 1, 0),
            c(1, 0, 0, 0),
            c(0, 0, 1, 0),
            c(0, 0, 1, 0))
m2 <- rbind(c(0, 1, 0, 0),
            c(0, 0, 0, 0),
            c(1, 0, 0, 0),
            c(0, 0, 1, 0),
            c(0, 0, 0, 1))
m3 <- rbind(c(0, 1, 0, 0),
            c(0, 0, 0, 1),
            c(0, 0, 0, 0),
            c(0, 0, 1, 0),
            c(1, 0, 0, 0))
want_targets <- list(m1, m2, m3)
for (target_pos in seq_along(want_targets)) {
  expect_equivalent(arrays[[2]][, target_pos, ], want_targets[[target_pos]])
}
# 3 targets, target middle lstm
gen <- generator_fasta_lm(
  path_corpus = "fasta_3",
  output_format = "target_middle_lstm",
  target_len = 3,
  maxlen = 4,
  step = 5,
  batch_size = 5,
  padding = FALSE
)
arrays <- gen()
# arrays[[1]] holds two input arrays here; each checked slice is 2 x 4
expect_equivalent(
  arrays[[1]][[1]][1, , ],
  rbind(c(1, 0, 0, 0),
        c(1, 0, 0, 0))
)
expect_equivalent(
  arrays[[1]][[2]][1, , ],
  rbind(c(0, 0, 1, 0),
        c(0, 0, 0, 0))
)
expect_equivalent(
  arrays[[1]][[1]][5, , ],
  rbind(c(1, 0, 0, 0),
        c(0, 1, 0, 0))
)
expect_equivalent(
  arrays[[1]][[2]][5, , ],
  rbind(c(0, 0, 1, 0),
        c(0, 1, 0, 0))
)
# one expected matrix (one row per sample) for each of the 3 target positions
m1 <- rbind(c(0, 0, 0, 0),
            c(0, 0, 1, 0),
            c(1, 0, 0, 0),
            c(0, 0, 1, 0),
            c(0, 0, 1, 0))
m2 <- rbind(c(0, 1, 0, 0),
            c(0, 0, 0, 0),
            c(1, 0, 0, 0),
            c(0, 0, 1, 0),
            c(0, 0, 0, 1))
m3 <- rbind(c(0, 1, 0, 0),
            c(0, 0, 0, 1),
            c(0, 0, 0, 0),
            c(0, 0, 1, 0),
            c(1, 0, 0, 0))
want_targets <- list(m1, m2, m3)
for (target_pos in seq_along(want_targets)) {
  expect_equivalent(arrays[[2]][, target_pos, ], want_targets[[target_pos]])
}
# coverage + set learning for label_folder
# With use_coverage = 1 the expected matrices carry the raw values
# 1, 2, 7, 11 and 17 instead of 1 (presumably the per-sequence coverage stored
# with the fixture files -- confirm against coverage_data/).
directories <- c("coverage_data/x", "coverage_data/y")
batch_size <- 6
samples_per_target <- 3
maxlen <- 7
reshape_mode <- "time_dist"
set_learning <- list(reshape_mode = reshape_mode,
                     maxlen = maxlen,
                     samples_per_target = samples_per_target)
gen <- get_generator(path = directories,
                     train_type = "label_folder",
                     val = FALSE,
                     padding = TRUE,
                     format = "fasta",
                     batch_size = batch_size,
                     maxlen = maxlen,
                     vocabulary = c("a", "c", "g", "t"),
                     step = 4,
                     use_coverage = 1,
                     set_learning = set_learning)
# Expected (maxlen x 4) slices; each one is defined once and reused below.
cov_1 <- matrix(
  c(0, 0, 0, 0,
    0, 0, 1, 0,
    0, 0, 1, 0,
    0, 0, 1, 0,
    0, 0, 1, 0,
    0, 0, 0, 1,
    0, 0, 0, 1),
  byrow = TRUE, ncol = 4
)
cov_2 <- matrix(
  c(2, 0, 0, 0,
    0, 2, 0, 0,
    0, 0, 2, 0,
    0, 0, 0, 2,
    2, 0, 0, 0,
    2, 0, 0, 0,
    0, 2, 0, 0),
  byrow = TRUE, ncol = 4
)
cov_7 <- matrix(
  c(7, 0, 0, 0,
    7, 0, 0, 0,
    0, 0, 0, 0,
    0, 7, 0, 0,
    0, 7, 0, 0,
    0, 0, 0, 0,
    0, 0, 7, 0),
  byrow = TRUE, ncol = 4
)
cov_11 <- matrix(
  c(11, 0, 0, 0,
    11, 0, 0, 0,
    11, 0, 0, 0,
    11, 0, 0, 0,
    0, 0, 0, 0,
    0, 0, 0, 11,
    0, 0, 0, 11),
  byrow = TRUE, ncol = 4
)
cov_17 <- matrix(
  c(0, 0, 0, 0,
    17, 0, 0, 0,
    0, 0, 0, 0,
    0, 0, 0, 0,
    0, 0, 0, 0,
    0, 0, 0, 0,
    0, 0, 0, 0),
  byrow = TRUE, ncol = 4
)
# first three batch entries carry the first label, last three the second
folder_labels <- matrix(
  c(1, 0,
    1, 0,
    1, 0,
    0, 1,
    0, 1,
    0, 1),
  byrow = TRUE, ncol = 2
)
# first batch
arrays <- gen()
expect_equivalent(arrays[[1]][1, 1, , ], cov_7)
expect_equivalent(arrays[[1]][1, 3, , ], cov_11)
expect_equivalent(arrays[[1]][2, 1, , ], cov_1)
expect_equivalent(arrays[[1]][3, 1, , ], cov_17)
expect_equivalent(arrays[[1]][3, 2, , ], cov_7)
expect_equivalent(arrays[[1]][4, 1, , ], cov_2)
expect_equivalent(arrays[[1]][5, 3, , ], cov_17)
expect_equivalent(arrays[[1]][6, 3, , ], cov_1)
expect_equivalent(arrays[[2]], folder_labels)
# second batch
arrays <- gen()
expect_equivalent(arrays[[1]][1, 1, , ], cov_11)
expect_equivalent(arrays[[1]][4, 3, , ], cov_17)
expect_equivalent(arrays[[2]], folder_labels)
# coverage + set learning for label_folder + normalizing input tensor
# Same fixtures as for use_coverage = 1, but every expected matrix is divided
# by use_coverage before the comparison.
directories <- c("coverage_data/x", "coverage_data/y")
batch_size <- 6
samples_per_target <- 3
maxlen <- 7
use_coverage <- 17
reshape_mode <- "time_dist"
set_learning <- list(reshape_mode = reshape_mode,
                     maxlen = maxlen,
                     samples_per_target = samples_per_target)
gen <- get_generator(path = directories,
                     train_type = "label_folder",
                     val = FALSE,
                     padding = TRUE,
                     format = "fasta",
                     batch_size = batch_size,
                     maxlen = maxlen,
                     vocabulary = c("a", "c", "g", "t"),
                     step = 4,
                     use_coverage = use_coverage,
                     set_learning = set_learning)
# Unnormalized (maxlen x 4) slices; each one is defined once and reused below.
raw_1 <- matrix(
  c(0, 0, 0, 0,
    0, 0, 1, 0,
    0, 0, 1, 0,
    0, 0, 1, 0,
    0, 0, 1, 0,
    0, 0, 0, 1,
    0, 0, 0, 1),
  byrow = TRUE, ncol = 4
)
raw_2 <- matrix(
  c(2, 0, 0, 0,
    0, 2, 0, 0,
    0, 0, 2, 0,
    0, 0, 0, 2,
    2, 0, 0, 0,
    2, 0, 0, 0,
    0, 2, 0, 0),
  byrow = TRUE, ncol = 4
)
raw_7 <- matrix(
  c(7, 0, 0, 0,
    7, 0, 0, 0,
    0, 0, 0, 0,
    0, 7, 0, 0,
    0, 7, 0, 0,
    0, 0, 0, 0,
    0, 0, 7, 0),
  byrow = TRUE, ncol = 4
)
raw_11 <- matrix(
  c(11, 0, 0, 0,
    11, 0, 0, 0,
    11, 0, 0, 0,
    11, 0, 0, 0,
    0, 0, 0, 0,
    0, 0, 0, 11,
    0, 0, 0, 11),
  byrow = TRUE, ncol = 4
)
raw_17 <- matrix(
  c(0, 0, 0, 0,
    17, 0, 0, 0,
    0, 0, 0, 0,
    0, 0, 0, 0,
    0, 0, 0, 0,
    0, 0, 0, 0,
    0, 0, 0, 0),
  byrow = TRUE, ncol = 4
)
# first three batch entries carry the first label, last three the second
folder_labels <- matrix(
  c(1, 0,
    1, 0,
    1, 0,
    0, 1,
    0, 1,
    0, 1),
  byrow = TRUE, ncol = 2
)
# first batch
arrays <- gen()
expect_equivalent(arrays[[1]][1, 1, , ], raw_7 / use_coverage)
expect_equivalent(arrays[[1]][1, 3, , ], raw_11 / use_coverage)
expect_equivalent(arrays[[1]][2, 1, , ], raw_1 / use_coverage)
expect_equivalent(arrays[[1]][3, 1, , ], raw_17 / use_coverage)
expect_equivalent(arrays[[1]][3, 2, , ], raw_7 / use_coverage)
expect_equivalent(arrays[[1]][4, 1, , ], raw_2 / use_coverage)
expect_equivalent(arrays[[1]][5, 3, , ], raw_17 / use_coverage)
expect_equivalent(arrays[[1]][6, 3, , ], raw_1 / use_coverage)
expect_equivalent(arrays[[2]], folder_labels)
# second batch
arrays <- gen()
expect_equivalent(arrays[[1]][1, 1, , ], raw_11 / use_coverage)
expect_equivalent(arrays[[1]][4, 3, , ], raw_17 / use_coverage)
expect_equivalent(arrays[[2]], folder_labels)
# rds label generator
# 40 batches of size 1: the first input entry and the label index of every
# sample are collected; each value 1..20 has to show up exactly twice.
gen <- generator_rds(rds_folder = "rds", batch_size = 1)
single_batches <- lapply(seq_len(40), function(iter) gen())
l_x <- lapply(single_batches, function(batch) batch[[1]][1, 1, 1])
l_y <- lapply(single_batches, function(batch) which.max(batch[[2]]))
expect_equivalent(sort(unlist(l_x)), rep(1:20, each = 2))
expect_equivalent(sort(unlist(l_y)), rep(1:20, each = 2))
# 4 batches of size 10: same 40 samples in total, same expectation.
gen <- generator_rds(rds_folder = "rds", batch_size = 10)
large_batches <- lapply(seq_len(4), function(iter) gen())
l_x <- lapply(large_batches, function(batch) batch[[1]][, 1, 1])
l_y <- lapply(large_batches, function(batch) apply(batch[[2]], 1, which.max))
expect_equivalent(sort(unlist(l_x)), rep(1:20, each = 2))
expect_equivalent(sort(unlist(l_y)), rep(1:20, each = 2))
# rds lm generator
# Samples are drawn from the "rds_lm" folder. For every batch z, z[[1]] is the
# input array and z[[2]][[j]] is read for each target position j = 1..target_len.
target_len <- 3
batch_size <- 1
gen <- generator_rds(rds_folder = "rds_lm", batch_size = batch_size, target_len = target_len)
# Three passes of five single-sample batches each.
for (one_iter in 1:3) {
# start values 1, 101, 201, 301, 401; each must be seen once per pass
first_input <- 1 + (100*(0:4))
for (i in 1:5) {
z <- gen()
# input length is 7 - target_len (presumably the stored sequences have length 7 -- confirm against fixture)
expect_equivalent(dim(z[[1]]), c(batch_size, 7 - target_len, 4))
l_x <- z[[1]][1,1,1]
# tick off the start value of this sample
first_input <- setdiff(first_input, l_x)
l_y <- NULL
for (j in 1:target_len) {
l_y[[j]] <- z[[2]][[j]][1,1]
}
# targets are expected to be the first input value + 4, + 5, + 6
expect_equivalent(l_y, l_x + 3 + (1:target_len))
}
# all five start values were produced during this pass
expect_equivalent(length(first_input), 0)
}
# Same check with one batch of five samples per pass.
batch_size <- 5
gen <- generator_rds(rds_folder = "rds_lm", batch_size = batch_size, target_len = target_len)
for (one_iter in 1:3) {
first_input <- 1 + (100*(0:4))
z <- gen()
expect_equivalent(dim(z[[1]]), c(batch_size, 7 - target_len, 4))
# first input value of every sample in the batch
l_x <- z[[1]][ , 1, 1]
first_input <- setdiff(first_input, l_x)
l_y <- NULL
for (j in 1:target_len) {
l_y[[j]] <- z[[2]][[j]][,1]
}
# sorted because the order of the samples inside the batch is not asserted
expect_equivalent(sort(l_y[[1]]), 5 + (100*(0:4)))
expect_equivalent(sort(l_y[[2]]), 6 + (100*(0:4)))
expect_equivalent(sort(l_y[[3]]), 7 + (100*(0:4)))
expect_equivalent(length(first_input), 0)
}
# n-gram rds
n_gram <- 3
gen <- generator_rds(
  rds_folder = "n_gram_rds",
  n_gram = n_gram,
  n_gram_stride = n_gram,
  target_len = 6,
  batch_size = 1
)
arrays <- gen()
y <- arrays[[2]]
# index of the hot entry per row, for the first and the second n-gram target
y_1_n_gram <- apply(y[[1]], 1, which.max)
y_2_n_gram <- apply(y[[2]], 1, which.max)
# An n-gram is read as a base-4 number; the expected index is that number + 1.
place_values <- 4^((n_gram - 1):0)
int_seq <- c(1, 2, 0) # cga
expect_equivalent(y_1_n_gram[1], 1 + sum(place_values * int_seq))
int_seq <- c(0, 0, 1) # aac
expect_equivalent(y_2_n_gram[1], 1 + sum(place_values * int_seq))
# set learning concat with coverage encoding
# In "concat" mode the samples_per_target sequences of one batch entry are
# compared as a single (concat_maxlen x voc_len) matrix.
directories <- c("coverage_data/x", "coverage_data/y")
batch_size <- 8
samples_per_target <- 3
voc_len <- 4
maxlen <- 6
use_coverage <- 17
reshape_mode <- "concat"
set_learning <- list(reshape_mode = reshape_mode,
                     maxlen = maxlen,
                     buffer_len = NULL,
                     samples_per_target = samples_per_target)
# buffer_len is NULL above, so no separator rows are counted between samples
buffer_size <- 0
concat_maxlen <- (maxlen * samples_per_target) + (buffer_size * (samples_per_target - 1))
gen <- get_generator(path = directories,
                     train_type = "label_folder",
                     val = FALSE,
                     padding = TRUE,
                     format = "fasta",
                     batch_size = batch_size,
                     maxlen = maxlen,
                     vocabulary = c("a", "c", "g", "t"),
                     step = maxlen,
                     use_coverage = use_coverage,
                     set_learning = set_learning)
m <- matrix(
  c(0, 0, 0, 0,
    0, 0, 1/17, 0,
    0, 0, 1/17, 0,
    0, 0, 1/17, 0,
    0, 0, 1/17, 0,
    0, 0, 0, 1/17,
    13/17, 0, 0, 0,
    0, 13/17, 0, 0,
    0, 0, 13/17, 0,
    0, 0, 0, 13/17,
    13/17, 0, 0, 0,
    0, 13/17, 0, 0,
    0, 0, 0, 0,
    0, 0, 0, 0,
    0, 0, 0, 0,
    0, 0, 0, 0,
    0, 0, 0, 0,
    1, 0, 0, 0),
  byrow = TRUE, ncol = 4
)
m2 <- matrix(
  c(0, 0, 0, 0,
    0, 0, 0, 0,
    0, 0, 0, 0,
    0, 0, 0, 0,
    0, 0, 0, 0,
    1, 0, 0, 0,
    2/17, 0, 0, 0,
    0, 2/17, 0, 0,
    0, 0, 2/17, 0,
    0, 0, 0, 2/17,
    2/17, 0, 0, 0,
    2/17, 0, 0, 0,
    0, 0, 3/17, 0,
    0, 0, 3/17, 0,
    0, 0, 3/17, 0,
    0, 0, 3/17, 0,
    0, 0, 0, 3/17,
    0, 0, 0, 3/17),
  byrow = TRUE, ncol = 4
)
# first half of the batch carries the first label, second half the second
y <- matrix(c(1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1), ncol = 2, byrow = TRUE)
# first batch
arrays <- gen()
# the expected concatenated length was computed above but never asserted before
expect_equivalent(dim(arrays[[1]]), c(batch_size, concat_maxlen, voc_len))
expect_true(all(arrays[[1]][1, , ] == arrays[[1]][3, , ]))
expect_true(all(arrays[[1]][2, , ] == arrays[[1]][4, , ]))
expect_equivalent(arrays[[2]], y)
expect_equivalent(arrays[[1]][4, , ], m)
expect_equivalent(arrays[[1]][8, , ], m2)
# second batch
arrays <- gen()
expect_true(all(arrays[[1]][1, , ] == arrays[[1]][3, , ]))
expect_true(all(arrays[[1]][2, , ] == arrays[[1]][4, , ]))
expect_equivalent(arrays[[2]], y)
expect_equivalent(arrays[[1]][4, , ], m)
# third batch
arrays <- gen()
expect_true(all(arrays[[1]][1, , ] == arrays[[1]][3, , ]))
expect_true(all(arrays[[1]][2, , ] == arrays[[1]][4, , ]))
expect_equivalent(arrays[[2]], y)
expect_equivalent(arrays[[1]][4, , ], m)
expect_equivalent(arrays[[1]][5, , ], m2)
# rds generator with multi inputs/outputs
# Nine synthetic samples: sample i is filled with i in x1/y1 and with i + 10
# in x2/y2, so the pairing of inputs and outputs can be verified per row.
x1 <- array(0, dim = c(9, 5, 4))
x2 <- array(0, dim = c(9, 5, 3))
y1 <- array(0, dim = c(9, 2))
y2 <- array(0, dim = c(9, 6))
for (i in seq_len(dim(x1)[1])) {
  x1[i, , ] <- i
  y1[i, ] <- i
  x2[i, , ] <- i + 10
  y2[i, ] <- i + 10
}
# split the samples over two rds files (5 + 4 samples)
index_1 <- 1:5
index_2 <- 6:9
x_list_1 <- list(x1[index_1, , ], x2[index_1, , ])
x_list_2 <- list(x1[index_2, , ], x2[index_2, , ])
y_list_1 <- list(y1[index_1, ], y2[index_1, ])
y_list_2 <- list(y1[index_2, ], y2[index_2, ])
z1 <- list(x = x_list_1, y = y_list_1)
z2 <- list(x = x_list_2, y = y_list_2)
temp_dir <- tempfile()
dir.create(temp_dir)
saveRDS(z1, file.path(temp_dir, "file_1.rds"))
saveRDS(z2, file.path(temp_dir, "file_2.rds"))
gen <- generator_rds(rds_folder = temp_dir,
                     batch_size = 10, path_file_log = NULL,
                     max_samples = NULL,
                     proportion_per_seq = NULL,
                     target_len = NULL,
                     seed = 1,
                     reverse_complement = FALSE,
                     sample_by_file_size = FALSE,
                     n_gram = NULL, n_gram_stride = 1,
                     reverse_complement_encoding = FALSE,
                     add_noise = NULL)
for (k in 1:5) {
  z <- gen()
  # batch-local names, so the fixture arrays above are not overwritten
  in_1 <- as.array(z[[1]][[1]])
  in_2 <- as.array(z[[1]][[2]])
  out_1 <- as.array(z[[2]][[1]])
  out_2 <- as.array(z[[2]][[2]])
  for (i in seq_len(dim(in_1)[1])) {
    # every row must still carry one sample id across both inputs and outputs
    expect_equivalent(min(in_1[i, , ]), max(out_1[i, ]))
    expect_equivalent(min(in_1[i, , ]) + 10, max(in_2[i, , ]))
    expect_equivalent(max(in_2[i, , ]), min(out_2[i, ]))
    expect_equivalent(max(out_1[i, ]) + 10, min(out_2[i, ]))
  }
}
# remove the temporary rds files; this generator is not called again
unlink(temp_dir, recursive = TRUE)
# integer encoding label header #
# With return_int = TRUE the input is a (batch x maxlen) matrix of integer
# tokens; the comments below read 1 as A, 2 as C and 4 as T. Labels stay
# one-hot over vocabulary_label = c("w", "x", "y").
testpath <- file.path("fasta_2")
gen <- generator_fasta_label_header_csv(path_corpus = testpath, batch_size = 5, maxlen = 3, step = 2, vocabulary = c("a", "c", "g", "t"),
reverse_complement = FALSE, vocabulary_label = c("w", "x", "y"), return_int = TRUE)
arrays <- gen()
expect_equivalent(arrays[[1]][1, 1], 1) # A
expect_equivalent(arrays[[1]][1, 2], 1) # A
expect_equivalent(arrays[[1]][1, 3], 2) # C
expect_equivalent(arrays[[2]][1, ], c(1, 0, 0)) # W
expect_equivalent(arrays[[1]][5, 1], 1) # A
expect_equivalent(arrays[[1]][5, 2], 1) # A
expect_equivalent(arrays[[1]][5, 3], 4) # T
expect_equivalent(arrays[[2]][5, ], c(0, 1, 0)) # X
# longer samples (maxlen = 8): check the label order of two consecutive batches
gen <- generator_fasta_label_header_csv(path_corpus = testpath, batch_size = 5, maxlen = 8, step = 2, vocabulary = c("a", "c", "g", "t"),
reverse_complement = FALSE, vocabulary_label = c("w", "x", "y"), return_int = TRUE)
arrays <- gen()
expect_equivalent(arrays[[2]][1, ], c(1, 0, 0))
expect_equivalent(arrays[[2]][2, ], c(0, 1, 0))
expect_equivalent(arrays[[2]][3, ], c(0, 0, 1))
expect_equivalent(arrays[[2]][4, ], c(0, 1, 0))
expect_equivalent(arrays[[2]][5, ], c(0, 0, 1))
arrays <- gen()
# fifth sample of the second batch: tokens 3 3 3 3 4 4 4 4
expect_equivalent(arrays[[1]][5, 1], 3)
expect_equivalent(arrays[[1]][5, 2], 3)
expect_equivalent(arrays[[1]][5, 3], 3)
expect_equivalent(arrays[[1]][5, 4], 3)
expect_equivalent(arrays[[1]][5, 5], 4)
expect_equivalent(arrays[[1]][5, 6], 4)
expect_equivalent(arrays[[1]][5, 7], 4)
expect_equivalent(arrays[[1]][5, 8], 4)
expect_equivalent(arrays[[2]][1, ], c(0, 0, 1))
expect_equivalent(arrays[[2]][2, ], c(0, 0, 1))
expect_equivalent(arrays[[2]][3, ], c(1, 0, 0))
expect_equivalent(arrays[[2]][4, ], c(0, 1, 0))
expect_equivalent(arrays[[2]][5, ], c(0, 0, 1))
gen <- generator_fasta_label_header_csv(path_corpus = testpath, batch_size = 8, maxlen = 7, step = 2, vocabulary = c("a", "c", "g", "t"),
reverse_complement = FALSE, vocabulary_label = c("w", "x", "y"), return_int = TRUE)
arrays <- gen()
# go through a/b.fasta once discard samples with target z
# (the eighth sample is expected to start with A A C and carry label w again)
expect_equivalent(arrays[[1]][8, 1], 1) # A
expect_equivalent(arrays[[1]][8, 2], 1) # A
expect_equivalent(arrays[[1]][8, 3], 2) # C
expect_equivalent(arrays[[2]][8, ], c(1, 0, 0)) # W
# label folder with integer encoding
directories <- file.path("label_folder", c("x", "y", "z"))
gen <- get_generator(
  path = directories,
  train_type = "label_folder",
  format = "fasta",
  vocabulary = c("a", "c", "g", "t"),
  return_int = TRUE,
  padding = TRUE,
  val = FALSE,
  batch_size = 6,
  maxlen = 2,
  step = 2
)
# two samples per folder, so the label rows are the same for every batch
folder_one_hot <- rbind(c(1, 0, 0),
                        c(1, 0, 0),
                        c(0, 1, 0),
                        c(0, 1, 0),
                        c(0, 0, 1),
                        c(0, 0, 1))
# Compare a batch entry by entry with the expected integer tokens, then row by
# row with the expected one-hot labels.
check_int_batch <- function(batch, want_tokens, want_labels) {
  for (sample_id in seq_len(nrow(want_tokens))) {
    for (pos in seq_len(ncol(want_tokens))) {
      expect_equivalent(batch[[1]][sample_id, pos], want_tokens[sample_id, pos])
    }
  }
  for (sample_id in seq_len(nrow(want_labels))) {
    expect_equivalent(batch[[2]][sample_id, ], want_labels[sample_id, ])
  }
}
arrays <- gen()
check_int_batch(
  arrays,
  rbind(c(1, 2),
        c(1, 2),
        c(3, 2),
        c(3, 2),
        c(4, 4),
        c(4, 4)),
  folder_one_hot
)
# test skipping file
# one batch is drawn and dropped, the following one is inspected
invisible(gen())
arrays <- gen()
check_int_batch(
  arrays,
  rbind(c(1, 2),
        c(1, 3),
        c(2, 3),
        c(2, 3),
        c(1, 1),
        c(1, 1)),
  folder_one_hot
)
# n-gram integer encoding, label folder #
directories <- file.path("label_folder", c("x", "y", "z"))
gen <- get_generator(
  path = directories,
  train_type = "label_folder",
  vocabulary = c("a", "c", "g", "t"),
  return_int = TRUE,
  n_gram = 3,
  n_gram_stride = 2,
  padding = TRUE,
  batch_size = 6,
  maxlen = 12,
  step = 2
)
arrays <- gen()
x <- arrays[[1]]
y <- arrays[[2]]
# 6 samples with 5 n-gram tokens each
expect_equivalent(dim(x), c(6, 5))
first_sample <- x[1, ]
expect_equivalent(first_sample[1], 0) # padding
expect_equivalent(first_sample[2], 5) # ACA
fifth_sample <- x[5, ]
expect_equivalent(unique(fifth_sample[1:4]), 0) # padding
expect_equivalent(fifth_sample[5], 64) # TTT = 4^3
# n-gram one-hot encoding, label folder #
directories <- file.path("label_folder", c("x", "y", "z"))
gen <- get_generator(
  path = directories,
  train_type = "label_folder",
  vocabulary = c("a", "c", "g", "t"),
  return_int = FALSE,
  n_gram = 3,
  n_gram_stride = 2,
  padding = TRUE,
  batch_size = 6,
  maxlen = 12,
  step = 2
)
arrays <- gen()
x <- arrays[[1]]
y <- arrays[[2]]
# 6 samples, 5 n-gram positions, 64 = 4^3 one-hot slots
expect_equivalent(dim(x), c(6, 5, 64))
first_sample <- x[1, , ]
expect_equivalent(unique(first_sample[1, ]), 0) # padding
expect_equivalent(which.max(first_sample[2, ]), 5) # ACA
fifth_sample <- x[5, , ]
expect_equivalent(unique(as.vector(fifth_sample[1:4, ])), 0) # padding
expect_equivalent(which.max(fifth_sample[5, ]), 64) # TTT = 4^3
##### masked lm #####
# Masked language model generator with include_sw = TRUE: the returned list is
# unpacked as x (integer-encoded input), y (not inspected here) and sw (sample weights).
testpath <- file.path("a.fastq")
masked_lm <- list(mask_rate = 0.25, random_rate = 0.25, identity_rate = 0.25, include_sw = TRUE)
gen <- get_generator(path = testpath,
train_type = "masked_lm",
masked_lm = masked_lm,
batch_size = 1,
maxlen = 200,
format = "fastq",
padding = TRUE,
return_int = TRUE)
z <- gen()
x <- z[[1]]
y <- z[[2]]
sw <- z[[3]]
# the first 12 positions are expected to be padding (presumably the read is shorter than maxlen = 200 -- confirm against a.fastq)
expect_equivalent(x[1,1:12], rep(0, 12)) # padding
expect_equivalent(sw[1,1:12], rep(0, 12)) # no sample weights in padding region
sw_pos <- which(sw[1,] == 1)
# tokens 2, 3, 4 are treated as randomly replaced positions (assumes the unmodified read encodes to 1 only -- TODO confirm)
random_pos <- which(x[1,] %in% c(2,3,4))
# token 5 is treated as the mask token
masked_pos <- which(x[1,] == 5)
# masked and random positions must have sw 1
expect_contains(sw_pos, random_pos)
expect_contains(sw_pos, masked_pos)
###
# same idea on a fasta file, without input shuffling and with 3 samples per batch
testpath <- file.path("fasta_2/b.fasta")
masked_lm <- list(mask_rate = 0.25, random_rate = 0.25, identity_rate = 0.25, include_sw = TRUE)
gen <- get_generator(path = testpath,
train_type = "masked_lm",
shuffle_input = FALSE,
masked_lm = masked_lm,
batch_size = 3,
maxlen = 10,
padding = TRUE,
return_int = TRUE)
z <- gen()
x <- z[[1]]
y <- z[[2]]
sw <- z[[3]]
expect_equivalent(sum(x[,1:2]), 0) # padding
expect_equivalent(sum(sw[,1:2]), 0) # no sample weights in padding region
# per sample: every position holding the mask token must have sample weight 1
for (i in 1:3) {
sw_pos <- which(sw[i,] == 1)
masked_pos <- which(x[i,] == 5)
expect_contains(sw_pos, masked_pos) # masked positions must have sw 1
}
#### test reshape ####
# Building a generator with a reshape_xy list that only has an `x` entry
# (a one-argument function) is expected to raise an error.
directories <- c("fasta_2", "fasta_3")
fx <- function(x) {
  x
}
reshape_xy <- list(x = fx)
expect_error(
  gen <- get_generator(
    path = directories,
    train_type = "label_folder",
    reshape_xy = reshape_xy,
    maxlen = 3,
    batch_size = 4
  )
)
# reshape_xy with both entries: fx shifts the input by 1, fy returns the
# unshifted input, so arrays[[2]] is compared with the plain one-hot input.
directories <- c("fasta_2", "fasta_3")
fx <- function(x = NULL, y = NULL) {
  x + 1
}
fy <- function(x = NULL, y = NULL) {
  x
}
reshape_xy <- list(x = fx, y = fy)
gen <- get_generator(path = directories,
                     reshape_xy = reshape_xy,
                     val = FALSE,
                     train_type = "label_folder",
                     format = "fasta",
                     batch_size = 4,
                     maxlen = 3,
                     vocabulary = c("a", "c", "g", "t"),
                     reverse_complement = FALSE,
                     ambiguous_nuc = "zero",
                     step = 2)
arrays <- gen()
# first sample
expect_equivalent(arrays[[1]][1, 1, ], c(1, 0, 0, 0) + 1)
expect_equivalent(arrays[[1]][1, 2, ], c(1, 0, 0, 0) + 1)
expect_equivalent(arrays[[1]][1, 3, ], c(0, 1, 0, 0) + 1)
expect_equivalent(arrays[[2]][1, 1, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[2]][1, 2, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[2]][1, 3, ], c(0, 1, 0, 0))
# fourth sample; its first position is expected to be an all-zero vector
expect_equivalent(arrays[[1]][4, 1, ], rep(0, 4) + 1)
expect_equivalent(arrays[[1]][4, 2, ], c(0, 1, 0, 0) + 1)
expect_equivalent(arrays[[1]][4, 3, ], c(0, 1, 0, 0) + 1)
expect_equivalent(arrays[[2]][4, 1, ], rep(0, 4))
expect_equivalent(arrays[[2]][4, 2, ], c(0, 1, 0, 0))
expect_equivalent(arrays[[2]][4, 3, ], c(0, 1, 0, 0))
# reshape_xy with swapped roles: the new x is the original target + 3 and the
# new y is the original input + 2.
testpath <- "fasta_2"
label_from_csv <- "output_label.csv"
fx <- function(x = NULL, y = NULL) {
  y + 3
}
fy <- function(x = NULL, y = NULL) {
  x + 2
}
reshape_xy <- list(x = fx, y = fy)
gen <- generator_fasta_label_header_csv(
  path_corpus = testpath,
  target_from_csv = label_from_csv,
  reshape_xy = reshape_xy,
  vocabulary = c("a", "c", "g", "t", "Z"),
  vocabulary_label = c("w", "x", "y"),
  concat_seq = "ZZ",
  batch_size = 5,
  maxlen = 10,
  step = 10,
  padding = TRUE,
  reverse_complement = FALSE,
  shuffle_file_order = FALSE,
  shuffle_input = FALSE,
  seed = 1234
)
# one-hot vector over the 5-letter vocabulary with a 1 at position `hot`
one_hot5 <- function(hot) {
  replace(numeric(5), hot, 1)
}
# first batch
arrays <- gen()
expect_equivalent(arrays[[2]][1, 8, ], one_hot5(4) + 2)
expect_equivalent(arrays[[2]][1, 9, ], one_hot5(5) + 2)
expect_equivalent(arrays[[2]][1, 10, ], one_hot5(5) + 2)
expect_equivalent(arrays[[2]][4, 3, ], one_hot5(1) + 2)
expect_equivalent(arrays[[2]][4, 4, ], one_hot5(1) + 2)
for (sample_id in 1:3) {
  expect_equivalent(arrays[[1]][sample_id, ], (1:4) + 3)
}
for (sample_id in 4:5) {
  expect_equivalent(arrays[[1]][sample_id, ], (11:14) + 3)
}
# second batch
arrays <- gen()
expect_equivalent(arrays[[2]][1, 8, ], one_hot5(1) + 2)
expect_equivalent(arrays[[2]][2, 3, ], one_hot5(2) + 2)
expect_equivalent(arrays[[1]][1, ], (11:14) + 3)
expect_equivalent(arrays[[1]][5, ], (11:14) + 3)
# set learning
# "time_dist" mode: arrays[[1]] is indexed as
# [batch entry, set member, position, vocabulary].
directories <- c("fasta_2", "fasta_3")
maxlen <- 3
samples_per_target <- 3
reshape_mode <- "time_dist"
set_learning <- list(reshape_mode = reshape_mode,
                     maxlen = maxlen,
                     samples_per_target = samples_per_target)
gen <- get_generator(val = FALSE,
                     set_learning = set_learning,
                     train_type = "label_folder",
                     path = directories,
                     format = "fasta",
                     batch_size = 2,
                     maxlen = maxlen,
                     ambiguous_nuc = "discard",
                     vocabulary = c("a", "c", "g", "t"),
                     step = 2)
arrays <- gen()
# expected (maxlen x 4) one-hot matrix for every set member of both batch entries
want_sets <- list(
  list(
    rbind(c(1, 0, 0, 0), c(1, 0, 0, 0), c(0, 1, 0, 0)),
    rbind(c(0, 1, 0, 0), c(0, 1, 0, 0), c(0, 0, 1, 0)),
    rbind(c(0, 0, 1, 0), c(0, 0, 1, 0), c(0, 0, 0, 1))
  ),
  list(
    rbind(c(0, 0, 0, 1), c(0, 0, 1, 0), c(1, 0, 0, 0)),
    rbind(c(1, 0, 0, 0), c(1, 0, 0, 0), c(1, 0, 0, 0)),
    rbind(c(0, 0, 0, 1), c(0, 0, 0, 1), c(0, 0, 0, 1))
  )
)
for (entry in seq_along(want_sets)) {
  for (member in seq_along(want_sets[[entry]])) {
    expect_equivalent(arrays[[1]][entry, member, , ], want_sets[[entry]][[member]])
  }
}
# one label per batch entry: first folder, then second folder
expect_equivalent(arrays[[2]][1, ], c(1, 0))
expect_equivalent(arrays[[2]][2, ], c(0, 1))
})
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.