The package relies on keras to perform much of the heavy lifting. Start by making sure the keras package is installed and can be loaded.
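
If keras has not been installed yet, something along these lines typically works (a sketch; install_keras() sets up the underlying Python/TensorFlow backend and may need extra configuration on some systems):

install.packages("keras") # only needed once
keras::install_keras()    # installs the Python-side TensorFlow/Keras dependencies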

rm(list=ls(all=TRUE))
knitr::opts_chunk$set(echo = TRUE)
library(keras)
library(learnMotifs)
library(ggplot2)

Set the model options. Note that we do not cache results or set a random seed, so results will differ from run to run.

opt <- list()
opt$max_len <- 200 # sequence length
opt$log_dir <- 'log' # logging directory
opt$embedding_pos <- 'GGGGGGGG' # sub-sequence inserted into Y=1 sequences
opt$embedding_neg <- 'CCCCCCCC' # sub-sequence inserted into Y=0 sequences
opt$batch_size <- 64 # batch size for training 
opt$epochs <- 20 # training epochs
opt$n_filters <- 4 # number of convolutional filters
opt$filter_len <- 12 # convolutional filter length
opt$lambda_pos <- 3e-3 # regularization penalty for position-specific weights in filters
opt$lambda_filter <- 1e-8 # group regularization penalty treating filter as groups
opt$lambda_l1 <- 3e-3 # regularization penalty for all weights in filters
opt$lambda_offset <- 0 # regularization penalty for the offset used in the convolutional filters
opt$lambda_beta <- 3e-3 # regularization penalty for the betas (effect sizes)
opt$learning_rate <- 0.02 # learning rate for the Adam optimizer
opt$decay_rate <- opt$learning_rate / opt$epochs / 2 # learning-rate decay
opt$plot_Activations <- TRUE # boxplot of activation differences by class per filter
opt$plot_filterMotifs <- TRUE # plot sequence logos (IGMs) directly
opt$plot_empiricalMotifs <- FALSE # plot sequence logos from maximally-activated sub-sequences
opt$output_plots_every <- 5 # plotting frequency (every X epochs)
opt$downsample <- 1 # downsample training cases (provide a proportion)
opt$shuffle <- .1 # proportion of cases to shuffle (i.e. the fraction of cases carrying the wrong label)
opt$cache_Old <- TRUE # cache old results
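
If reproducible results are desired, one option (a sketch, not used here) is to fix the random seeds before any data manipulation or model building; note that keras::use_session_with_seed() disables GPU usage and CPU parallelism in order to make runs deterministic.

set.seed(1234) # R-level randomness (class assignment, embedding locations, train/validation split)
keras::use_session_with_seed(1234) # TensorFlow-level randomness (weight initialization, batch shuffling)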

Load the random nucleotide sequence data. Here we'll simply use sequences with no embedded motifs and insert the embedding_pos and embedding_neg patterns into the positive and negative cases, respectively.

negative.cases <- EmptyBackground
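
EmptyBackground ships with learnMotifs; the code below assumes it is a data frame with one row per sequence and a sequence column, so a quick look can confirm the format.

nrow(negative.cases) # number of background sequences
head(negative.cases$sequence,2) # first couple of raw sequences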

Randomly split the background into positive and negative cases, assign the labels, and embed the patterns. The two sets are then joined and the rows shuffled to increase difficulty.

idx <- as.logical(rbinom(nrow(negative.cases),size=1,.5)) # randomly assign roughly half of the background to the positive class
positive.cases <- negative.cases[idx,]
negative.cases <- negative.cases[!idx,]
negative.cases$y <- 0
positive.cases$y <- 1
# embed the positive pattern at a random location in a proportion (1 - opt$shuffle) of the positive cases
for (irow in 1:nrow(positive.cases)) {
  if (runif(1)<(1-opt$shuffle)) {
    location <- sample(1:(opt$max_len-nchar(opt$embedding_pos)),1)
    substr(positive.cases[irow,'sequence'],start=location,stop=(location+nchar(opt$embedding_pos)-1)) <- opt$embedding_pos
    positive.cases[irow,'embeddings'] <- opt$embedding_pos
  }
}
# likewise embed the negative pattern in a proportion (1 - opt$shuffle) of the negative cases
for (irow in 1:nrow(negative.cases)) {
  if (runif(1)<(1-opt$shuffle)) {
    location <- sample(1:(opt$max_len-nchar(opt$embedding_neg)),1)
    substr(negative.cases[irow,'sequence'],start=location,stop=(location+nchar(opt$embedding_neg)-1)) <- opt$embedding_neg
    negative.cases[irow,'embeddings'] <- opt$embedding_neg
  }
}
# join the two sets and randomize the row order
all.cases <- rbind(positive.cases,negative.cases)
all.cases <- all.cases[sample(1:nrow(all.cases),size=nrow(all.cases),replace=FALSE),]
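
An optional sanity check on the class balance and on which patterns were embedded (how the non-embedded cases appear depends on how the embeddings column is stored in EmptyBackground):

table(all.cases$y) # class balance after the random split
table(all.cases$y,all.cases$embeddings,useNA='ifany') # embedded pattern per class; the shuffled cases carry no embedding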

Optionally downsample the cases to the proportion given in opt$downsample.

if (opt$downsample < 1) {
  all.cases <- all.cases[sample(1:nrow(all.cases),size=opt$downsample*nrow(all.cases)),]
}

Split the data into training (90%) and validation (10%) sets.

idx <- sample(1:nrow(all.cases),round(.9*nrow(all.cases)))
training_data <- all.cases[idx,c('sequence')]
training_labels <- all.cases[idx,'y']
validation_data <- all.cases[-idx,c('sequence')]
validation_labels <- all.cases[-idx,'y']
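
It is worth confirming that both classes appear in each split:

table(training_labels)
table(validation_labels)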

Set up the logging directory and save the validation set.

setup_log_dir(opt)
write.table(cbind(validation_data,validation_labels),file=file.path(opt$log_dir,'testset.csv'),sep=',',row.names=F,col.names=F)

One-hot encode the sequences for the model.

training_array <- one_hot(training_data,opt$filter_len)
validation_array <- one_hot(validation_data,opt$filter_len)
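
The resulting arrays should match the input shape the model expects below, i.e. 4 rows (one per nucleotide) by opt$max_len + 2*opt$filter_len - 2 columns, assuming one_hot pads each sequence by filter_len - 1 positions on each side:

dim(training_array) # expected: n_train x 4 x (opt$max_len + 2*opt$filter_len - 2) x 1
dim(validation_array)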

Build the model following typical keras syntax.

deNovo_sequence <- layer_input(shape=c(4,opt$max_len + 2*opt$filter_len-2,1),name='deNovo_input') # one-hot input, padded by filter_len-1 positions on each side
deNovo_model <- deNovo_sequence %>%
  layer_deNovo(filter=opt$n_filters,
               filter_len=opt$filter_len,
               lambda_pos=opt$lambda_pos,
               lambda_filter=opt$lambda_filter,
               lambda_l1=opt$lambda_l1,
               lambda_offset=opt$lambda_offset,
               input_shape = c(4, opt$max_len+2*opt$filter_len-2, 1),
               activation='sigmoid') %>%
  layer_max_pooling_2d(pool_size = c(1,(opt$max_len+opt$filter_len-1)),name='deNovo_pool') %>% # global max pooling over all sequence positions
  layer_flatten(name='deNovo_flatten') %>%
  layer_dense(units=1,activation='sigmoid',kernel_regularizer = regularizer_l1(l=opt$lambda_beta)) # logistic regression on the maximal filter activations, with an L1 penalty on the betas
model <- keras_model(
      inputs = deNovo_sequence,
      outputs = deNovo_model)
model %>% compile(
  loss = "binary_crossentropy",
  optimizer = optimizer_adam(lr=opt$learning_rate,decay=opt$decay_rate),
  metrics = c("accuracy")
)
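
Inspecting the compiled architecture is a quick check that the layer shapes and parameter counts are as intended:

summary(model)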

Learn the motifs. Define a callback to output plots during training and to ensure the weights represent IGMs.

deNovo_callback <- deNovo_IGM_Motifs$new(model,opt$output_plots_every,opt$log_dir,
                                     plot_activations=opt$plot_Activations, 
                                     plot_filters=opt$plot_filterMotifs,
                                     plot_crosscorrel_motifs=opt$plot_empiricalMotifs,
                                     deNovo_data=validation_array, 
                                     test_labels=validation_labels, test_seqs=validation_data,
                                     num_deNovo=opt$n_filters, filter_len=opt$filter_len)
sequence_fit <- model %>% fit(
  x=training_array, training_labels,
  batch_size = opt$batch_size,
  epochs = opt$epochs,
  validation_data = list(validation_array, validation_labels),
  shuffle = TRUE,
  callbacks = list(deNovo_callback)
)
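
After training, the held-out performance (the loss and accuracy defined in compile) can be computed directly:

model %>% evaluate(validation_array,validation_labels,batch_size=opt$batch_size,verbose=0)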

Save the training plot, then save the fitted model, its weights, and the model configuration.

p <- plot(sequence_fit,method='ggplot2',smooth=TRUE)
ggsave(paste(opt$log_dir,"Training_loss_.pdf",sep="/"),plot=p,width=15,height=7.5,units='in')
model$save(file.path(opt$log_dir,'deNovo_model.h5'))
model$save_weights(file.path(opt$log_dir,'deNovo_model_weights.h5'))
write.table(model$to_yaml(),file.path(opt$log_dir,'deNovo_model.yaml'))
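
To reuse the fitted model later, one option (a sketch) is to rebuild the architecture exactly as above and then load the saved weights; this avoids the custom_objects handling that loading the full .h5 file with the custom deNovo layer would typically require.

# rebuild 'model' with the same layer_deNovo architecture as above, then:
model %>% load_model_weights_hdf5(file.path(opt$log_dir,'deNovo_model_weights.h5'))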

