train_model_cpc | R Documentation |
Train a neural network inspired by CPC (Contrastive Predictive Coding, Oord et al.) on genomic data.
train_model_cpc(
train_type = "CPC",
encoder = NULL,
context = NULL,
path,
path_val = NULL,
path_checkpoint = NULL,
path_tensorboard = NULL,
train_val_ratio = 0.2,
run_name,
batch_size = 32,
epochs = 100,
steps_per_epoch = 2000,
shuffle_file_order = FALSE,
initial_epoch = 1,
seed = 1234,
path_file_log = TRUE,
train_val_split_csv = NULL,
file_limit = NULL,
proportion_per_seq = NULL,
max_samples = NULL,
maxlen = NULL,
patchlen = NULL,
nopatches = NULL,
step = NULL,
file_filter = NULL,
stride = 0.4,
pretrained_model = NULL,
learningrate = 0.001,
learningrate_schedule = NULL,
k = 5,
stepsmin = 2,
stepsmax = 3,
emb_scale = 0.1
)
train_type |
Either |
encoder |
A keras encoder for the cpc function. |
context |
A keras context model for the cpc function. |
path |
Path to training data. If |
path_val |
Path to validation data. See |
path_checkpoint |
Path to checkpoints folder or |
path_tensorboard |
Path to tensorboard directory or |
train_val_ratio |
For the generator, defines the fraction of batches that will be used for validation (relative to the size of the training data), i.e. one validation iteration
processes |
run_name |
Name of the run. Name will be used to identify output from callbacks. |
batch_size |
Number of samples used for one network update. |
epochs |
Number of iterations. |
steps_per_epoch |
Number of training batches per epoch. |
shuffle_file_order |
Boolean, whether to go through files sequentially or shuffle beforehand. |
initial_epoch |
Epoch at which to start training. Note that network
will run for ( |
seed |
Sets seed for reproducible results. |
path_file_log |
Write the names of the files to a csv file if a path is specified.
train_val_split_csv |
A csv file specifying train/validation split. csv file should contain one column named |
file_limit |
Integer or |
proportion_per_seq |
Numerical value between 0 and 1. Proportion of sequence to take samples from (use random subsequence). |
max_samples |
Maximum number of samples to use from one file. If not |
maxlen |
Length of predictor sequence. |
patchlen |
The length of a patch when splitting the input sequence. |
nopatches |
The number of patches when splitting the input sequence. |
step |
Frequency of sampling steps. |
file_filter |
Vector of file names to use from path_corpus. |
stride |
The overlap between two patches when splitting the input sequence. |
pretrained_model |
A pretrained keras model, for which training will be continued.
learningrate |
A Tensor, floating point value. If a schedule is defined, this value gives the initial learning rate. Defaults to 0.001.
learningrate_schedule |
A schedule for a non-constant learning rate over the training. Either "cosine_annealing", "step_decay", or "exp_decay". |
k |
Value of k for sparse top k categorical accuracy. Defaults to 5. |
stepsmin |
In CPC, a patch is predicted given another patch. stepsmin defines how many patches between these two should be ignored during prediction. |
stepsmax |
The maximum distance between the predicted patch and the given patch. |
emb_scale |
Scales the impact of a patch's context.
A list of training metrics.
# Set up dummy input data: two training folders and two validation folders.
# Each folder is created on disk and then filled by deepG::create_dummy_data()
# with 3 files over the vocabulary a/c/g/t.
path_train_1 <- tempfile()
path_train_2 <- tempfile()
path_val_1 <- tempfile()
path_val_2 <- tempfile()

dummy_dirs <- c(path_train_1, path_train_2, path_val_1, path_val_2)
for (dummy_dir in dummy_dirs) {
  # tempfile() only returns a name, so the directory has to be created first.
  dir.create(dummy_dir)
  deepG::create_dummy_data(
    file_path = dummy_dir,
    num_files = 3,
    seq_length = 10,
    num_seq = 5,
    vocabulary = c("a", "c", "g", "t")
  )
}
# create model
# Encoder for the CPC example.
#
# Builds a keras model that cuts the input of shape (maxlen, 4) into
# overlapping patches and encodes every patch with a convolution /
# max-pooling / LSTM / dense stack, yielding one 925-dimensional vector per
# patch.
#
# Arguments:
#   maxlen    Length of the input sequence.
#   patchlen  Length of one patch.
#   nopatches Number of patches; if NULL it is derived via nopatchescalc()
#             from patchlen, maxlen and a stride of 0.4 * patchlen.
#   eval      Not used inside this function.
#
# Returns the keras model mapping the input layer to the per-patch encodings.
encoder <- function(maxlen = NULL,
                    patchlen = NULL,
                    nopatches = NULL,
                    eval = FALSE) {
  if (is.null(nopatches)) {
    nopatches <- nopatchescalc(patchlen, maxlen, patchlen * 0.4)
  }

  inp <- keras::layer_input(shape = c(maxlen, 4))
  # Step between the starts of two consecutive patches (40% of a patch).
  stridelen <- as.integer(0.4 * patchlen)

  # --- split the sequence into patches ---------------------------------
  # Add a trailing channel axis so the sequence can be handled like an image.
  as_image <- keras::layer_reshape(
    inp,
    list(maxlen, 4L, 1L),
    name = "prep_reshape1",
    dtype = "float32"
  )
  raw_patches <- tensorflow::tf$image$extract_patches(
    as_image,
    sizes = list(1L, patchlen, 4L, 1L),
    strides = list(1L, stridelen, 4L, 1L),
    rates = list(1L, 1L, 1L, 1L),
    padding = "VALID",
    name = "prep_patches"
  )
  per_sample_patches <- keras::layer_reshape(
    raw_patches,
    list(nopatches, patchlen, 4L),
    name = "prep_reshape2"
  )
  # Fold the patch axis into the leading axis so each patch is encoded
  # independently by the layers below.
  createpatches <- tensorflow::tf$reshape(
    per_sample_patches,
    list(-1L, patchlen, 4L),
    name = "prep_reshape3"
  )

  # --- encode every patch ----------------------------------------------
  conv_out <- keras::layer_conv_1d(
    createpatches,
    input_shape = c(maxlen, 4L),
    filters = 320L,
    kernel_size = 26L,
    activation = "relu"
  )
  pooled <- keras::layer_max_pooling_1d(conv_out, pool_size = 13L, strides = 13L)
  pooled <- keras::layer_dropout(pooled, 0.2)
  recurrent <- keras::layer_lstm(pooled, units = 320, return_sequences = TRUE)
  recurrent <- keras::layer_dropout(recurrent, 0.5)
  flat <- keras::layer_flatten(recurrent)
  danQ <- keras::layer_dense(flat, 925, activation = "relu")

  # --- regroup the encodings: one row of 925 features per patch --------
  patchesback <- tensorflow::tf$reshape(
    danQ,
    list(-1L, tensorflow::tf$cast(nopatches, tensorflow::tf$int16), 925L)
  )

  keras::keras_model(inp, patchesback)
}
# Context network for the CPC example.
#
# Applies a single LSTM layer (256 units, relu activation) named
# "context_LSTM_1" to the encoder output and returns the full output
# sequence (return_sequences = TRUE), i.e. one output per input step.
#
# Arguments:
#   latents  Tensor produced by the encoder.
#
# Returns the output tensor of the LSTM layer.
context <- function(latents) {
  keras::layer_lstm(
    latents,
    return_sequences = TRUE,
    units = 256, # WAS: 2048,
    name = "context_LSTM_1",
    activation = "relu"
  )
}
# Run a short CPC training on the dummy data created above.
# Checkpoints are written to the session's temporary directory.
temp_dir <- tempdir()
hist <- train_model_cpc(
  train_type = "CPC",
  # CPC building blocks: the encoder and context functions defined above
  encoder = encoder,
  context = context,
  # training and validation input folders
  path = c(path_train_1, path_train_2),
  path_val = c(path_val_1, path_val_2),
  # run identification and checkpoint location
  run_name = "TEST",
  path_checkpoint = temp_dir,
  # training length
  batch_size = 8,
  epochs = 3,
  steps_per_epoch = 6,
  # patch layout passed on to the encoder
  patchlen = 100,
  nopatches = 8
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.