The package relies on keras to perform much of the heavy lifting. Start by making sure the keras package is installed and can be loaded.
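
If keras has not been installed yet, something along these lines typically works (a sketch; install_keras() sets up the underlying Python/TensorFlow backend and may need extra configuration on some systems):

install.packages("keras") # only needed once
keras::install_keras()    # installs the Python-side TensorFlow/Keras dependencies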

rm(list=ls(all=TRUE))
knitr::opts_chunk$set(echo = TRUE)
library(keras)
library(learnMotifs)
library(ggplot2)

Set the model options. Note that we do not cache results or set a random seed, so results will differ from run to run.

opt <- list()
opt$max_len <- 200 # sequence length
opt$log_dir <- 'log' # logging directory
opt$embedding_pos <- 'GGGGGGGG' # sub-sequence inserted into Y=1 sequences
opt$embedding_neg <- 'CCCCCCCC' # sub-sequence inserted into Y=0 sequences
opt$batch_size <- 64 # batch size for training 
opt$epochs <- 20 # training epochs
opt$n_filters <- 4 # number of convolutional filters
opt$filter_len <- 12 # convolutional filter length
opt$lambda_pos <- 3e-3 # regularization penalty for position-specific weights in filters
opt$lambda_filter <- 1e-8 # group regularization penalty treating filter as groups
opt$lambda_l1 <- 3e-3 # regularization penalty for all weights in filters
opt$lambda_offset <- 0 # regularization penalty for the offset used in the convolutional filters
opt$lambda_beta <- 3e-3 # regularization penalty for the betas (effect sizes)
opt$learning_rate <- 0.02 # learning rate for the Adam optimizer
opt$decay_rate <- opt$learning_rate / opt$epochs / 2 # learning-rate decay
opt$plot_Activations <- TRUE # boxplot of activation differences by class per filter
opt$plot_filterMotifs <- TRUE # plot sequence logos (IGMs) directly
opt$plot_empiricalMotifs <- FALSE # plot sequence logos from maximally-activated sub-sequences
opt$output_plots_every <- 5 # plotting frequency (every X epochs)
opt$downsample <- 1 # downsample training cases (provide a proportion)
opt$shuffle <- .1 # proportion of cases to shuffle (i.e. the fraction of cases carrying the wrong label)
opt$cache_Old <- TRUE # cache old results
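
If reproducible results are desired, one option (a sketch, not used here) is to fix the random seeds before any data manipulation or model building; note that keras::use_session_with_seed() disables GPU usage and CPU parallelism in order to make runs deterministic.

set.seed(1234) # R-level randomness (class assignment, embedding locations, train/validation split)
keras::use_session_with_seed(1234) # TensorFlow-level randomness (weight initialization, batch shuffling)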

Load the random nucleotide sequence data. Here we'll simply use sequences with no embedded motifs and insert the embedding_pos and embedding_neg patterns into the positive and negative cases, respectively.

negative.cases <- EmptyBackground
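
EmptyBackground ships with learnMotifs; the code below assumes it is a data frame with one row per sequence and a sequence column, so a quick look can confirm the format.

nrow(negative.cases) # number of background sequences
head(negative.cases$sequence,2) # first couple of raw sequences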

Randomly split the background into positive and negative cases, assign the labels, and embed the patterns. The two sets are then joined and the rows shuffled to increase difficulty.

idx <- as.logical(rbinom(nrow(negative.cases),size=1,.5)) # randomly assign roughly half of the background to the positive class
positive.cases <- negative.cases[idx,]
negative.cases <- negative.cases[!idx,]
negative.cases$y <- 0
positive.cases$y <- 1
# embed the positive pattern at a random location in a proportion (1 - opt$shuffle) of the positive cases
for (irow in 1:nrow(positive.cases)) {
  if (runif(1)<(1-opt$shuffle)) {
    location <- sample(1:(opt$max_len-nchar(opt$embedding_pos)),1)
    substr(positive.cases[irow,'sequence'],start=location,stop=(location+nchar(opt$embedding_pos)-1)) <- opt$embedding_pos
    positive.cases[irow,'embeddings'] <- opt$embedding_pos
  }
}
# likewise embed the negative pattern in a proportion (1 - opt$shuffle) of the negative cases
for (irow in 1:nrow(negative.cases)) {
  if (runif(1)<(1-opt$shuffle)) {
    location <- sample(1:(opt$max_len-nchar(opt$embedding_neg)),1)
    substr(negative.cases[irow,'sequence'],start=location,stop=(location+nchar(opt$embedding_neg)-1)) <- opt$embedding_neg
    negative.cases[irow,'embeddings'] <- opt$embedding_neg
  }
}
# join the two sets and randomize the row order
all.cases <- rbind(positive.cases,negative.cases)
all.cases <- all.cases[sample(1:nrow(all.cases),size=nrow(all.cases),replace=FALSE),]
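
An optional sanity check on the class balance and on which patterns were embedded (how the non-embedded cases appear depends on how the embeddings column is stored in EmptyBackground):

table(all.cases$y) # class balance after the random split
table(all.cases$y,all.cases$embeddings,useNA='ifany') # embedded pattern per class; the shuffled cases carry no embedding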

Optionally downsample the cases to the proportion given in opt$downsample.

if (opt$downsample < 1) {
  all.cases <- all.cases[sample(1:nrow(all.cases),size=opt$downsample*nrow(all.cases)),]
}

Split the data into training (90%) and validation (10%) sets.

idx <- sample(1:nrow(all.cases),round(.9*nrow(all.cases)))
training_data <- all.cases[idx,c('sequence')]
training_labels <- all.cases[idx,'y']
validation_data <- all.cases[-idx,c('sequence')]
validation_labels <- all.cases[-idx,'y']
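
It is worth confirming that both classes appear in each split:

table(training_labels)
table(validation_labels)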

Set up the logging directory and save the validation set.

setup_log_dir(opt)
write.table(cbind(validation_data,validation_labels),file=file.path(opt$log_dir,'testset.csv'),sep=',',row.names=F,col.names=F)

One-hot encode the sequences for the model.

training_array <- one_hot(training_data,opt$filter_len)
validation_array <- one_hot(validation_data,opt$filter_len)
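
The resulting arrays should match the input shape the model expects below, i.e. 4 rows (one per nucleotide) by opt$max_len + 2*opt$filter_len - 2 columns, assuming one_hot pads each sequence by filter_len - 1 positions on each side:

dim(training_array) # expected: n_train x 4 x (opt$max_len + 2*opt$filter_len - 2) x 1
dim(validation_array)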

Build the model following typical keras syntax.

deNovo_sequence <- layer_input(shape=c(4,opt$max_len + 2*opt$filter_len-2,1),name='deNovo_input') # one-hot input, padded by filter_len-1 positions on each side
deNovo_model <- deNovo_sequence %>%
  layer_deNovo(filter=opt$n_filters,
               filter_len=opt$filter_len,
               lambda_pos=opt$lambda_pos,
               lambda_filter=opt$lambda_filter,
               lambda_l1=opt$lambda_l1,
               lambda_offset=opt$lambda_offset,
               input_shape = c(4, opt$max_len+2*opt$filter_len-2, 1),
               activation='sigmoid') %>%
  layer_max_pooling_2d(pool_size = c(1,(opt$max_len+opt$filter_len-1)),name='deNovo_pool') %>% # global max pooling over all sequence positions
  layer_flatten(name='deNovo_flatten') %>%
  layer_dense(units=1,activation='sigmoid',kernel_regularizer = regularizer_l1(l=opt$lambda_beta)) # logistic regression on the maximal filter activations, with an L1 penalty on the betas
model <- keras_model(
      inputs = deNovo_sequence,
      outputs = deNovo_model)
model %>% compile(
  loss = "binary_crossentropy",
  optimizer = optimizer_adam(lr=opt$learning_rate,decay=opt$decay_rate),
  metrics = c("accuracy")
)
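
Inspecting the compiled architecture is a quick check that the layer shapes and parameter counts are as intended:

summary(model)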

Learn the motifs. Define a callback to output plots during training and to ensure the weights represent IGMs.

deNovo_callback <- deNovo_IGM_Motifs$new(model,opt$output_plots_every,opt$log_dir,
                                     plot_activations=opt$plot_Activations, 
                                     plot_filters=opt$plot_filterMotifs,
                                     plot_crosscorrel_motifs=opt$plot_empiricalMotifs,
                                     deNovo_data=validation_array, 
                                     test_labels=validation_labels, test_seqs=validation_data,
                                     num_deNovo=opt$n_filters, filter_len=opt$filter_len)
sequence_fit <- model %>% fit(
  x=training_array, training_labels,
  batch_size = opt$batch_size,
  epochs = opt$epochs,
  validation_data = list(validation_array, validation_labels),
  shuffle = TRUE,
  callbacks = list(deNovo_callback)
)
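
After training, the held-out performance (the loss and accuracy defined in compile) can be computed directly:

model %>% evaluate(validation_array,validation_labels,batch_size=opt$batch_size,verbose=0)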

Save the training plot, then save the fitted model, its weights, and the model configuration.

p <- plot(sequence_fit,method='ggplot2',smooth=TRUE)
ggsave(paste(opt$log_dir,"Training_loss_.pdf",sep="/"),plot=p,width=15,height=7.5,units='in')
model$save(file.path(opt$log_dir,'deNovo_model.h5'))
model$save_weights(file.path(opt$log_dir,'deNovo_model_weights.h5'))
write.table(model$to_yaml(),file.path(opt$log_dir,'deNovo_model.yaml'))
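
To reuse the fitted model later, one option (a sketch) is to rebuild the architecture exactly as above and then load the saved weights; this avoids the custom_objects handling that loading the full .h5 file with the custom deNovo layer would typically require.

# rebuild 'model' with the same layer_deNovo architecture as above, then:
model %>% load_model_weights_hdf5(file.path(opt$log_dir,'deNovo_model_weights.h5'))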

