The package relies upon keras to perform much of the heavy lifting. Start by making sure the keras package is installed and may be loaded.
rm(list=ls(all=TRUE))
knitr::opts_chunk$set(echo = TRUE)
library(keras)
library(learnMotifs)
library(ggplot2)
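If keras is not yet installed, a minimal setup sketch is shown below; the exact route depends on your Python environment, and install_keras() only needs to be run once to provision a default TensorFlow backend.

# Minimal setup sketch, assuming keras is not yet installed
if (!requireNamespace("keras", quietly = TRUE)) {
  install.packages("keras")
}
# keras::install_keras()  # run once to provision a default Python/TensorFlow backend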
Set model options. Note that we do not cache results or set a random seed, so results will differ from run to run.
opt <- list()
opt$max_len <- 200                 # sequence length
opt$log_dir <- 'log'               # logging directory
opt$embedding_pos <- 'GGGGGGGG'    # sub-sequence inserted into Y=1 sequences
opt$embedding_neg <- 'CCCCCCCC'    # sub-sequence inserted into Y=0 sequences
opt$batch_size <- 64               # batch size for training
opt$epochs <- 20                   # training epochs
opt$n_filters <- 4                 # number of convolutional filters
opt$filter_len <- 12               # convolutional filter length
opt$lambda_pos <- 3e-3             # regularization penalty for position-specific weights in filters
opt$lambda_filter <- 1e-8          # group regularization penalty treating each filter as a group
opt$lambda_l1 <- 3e-3              # regularization penalty for all weights in filters
opt$lambda_offset <- 0             # regularization penalty for the offset used in the convolutional filters
opt$lambda_beta <- 3e-3            # regularization penalty for the betas (effect sizes)
opt$learning_rate <- 0.02          # stochastic gradient descent learning rate
opt$decay_rate <- opt$learning_rate / opt$epochs / 2  # decay for learning rate
opt$plot_Activations <- TRUE       # boxplot of activation differences by class per filter
opt$plot_filterMotifs <- TRUE      # plot sequence logos (IGMs) directly
opt$plot_empiricalMotifs <- FALSE  # plot sequence logos from maximally-activated sub-sequences
opt$output_plots_every <- 5        # plotting frequency (every X epochs)
opt$downsample <- 1                # downsample training cases (provide a proportion)
opt$shuffle <- .1                  # proportion of cases to shuffle (i.e. how many cases contain the wrong label)
opt$cache_Old <- TRUE              # cache old results
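With these defaults the learning-rate decay works out to 0.02 / 20 / 2 = 5e-04. A quick look at the full option list can catch typos before proceeding:

# Review the options (decay_rate should be 5e-04 with the defaults above)
str(opt)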
Load a file of random nucleotide sequences. Here we'll simply use sequences with no embedded motifs and insert the embedding_pos and embedding_neg patterns into the positive and negative cases, respectively.
negative.cases <- EmptyBackground  # background sequences with no embedded motifs
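Before modifying anything, it can help to glance at the background data. This quick look assumes EmptyBackground is a data frame with a 'sequence' column, as the embedding code below expects.

# Quick look at the background data (assumes a 'sequence' column, as used below)
nrow(negative.cases)
substr(as.character(negative.cases$sequence[1]), 1, 50)  # first 50 bases of the first sequence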
Split the background into positive and negative cases and embed the patterns; a proportion opt$shuffle of cases is left without an embedding, so they effectively carry the wrong label and increase the difficulty of the task. Then join the two sets and randomly shuffle the rows.
idx <- as.logical(rbinom(nrow(negative.cases), size=1, .5))
positive.cases <- negative.cases[idx,]
negative.cases <- negative.cases[!idx,]
negative.cases$y <- 0
positive.cases$y <- 1

for (irow in 1:nrow(positive.cases)) {
  if (runif(1) < (1-opt$shuffle)) {
    location <- sample(1:(opt$max_len-nchar(opt$embedding_pos)), 1)
    substr(positive.cases[irow,'sequence'],
           start=location,
           stop=(location+nchar(opt$embedding_pos)-1)) <- opt$embedding_pos
    positive.cases[irow,'embeddings'] <- opt$embedding_pos
  }
}

for (irow in 1:nrow(negative.cases)) {
  if (runif(1) < (1-opt$shuffle)) {
    location <- sample(1:(opt$max_len-nchar(opt$embedding_neg)), 1)
    substr(negative.cases[irow,'sequence'],
           start=location,
           stop=(location+nchar(opt$embedding_neg)-1)) <- opt$embedding_neg
    negative.cases[irow,'embeddings'] <- opt$embedding_neg
  }
}

all.cases <- rbind(positive.cases, negative.cases)
all.cases <- all.cases[sample(1:nrow(all.cases), size=nrow(all.cases), replace=FALSE),]
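As an optional sanity check (not part of the original walkthrough), the fraction of each class that actually carries its embedded pattern should be roughly 1 - opt$shuffle:

# Fraction of each class containing its embedded pattern (roughly 1 - opt$shuffle)
mean(grepl(opt$embedding_pos, positive.cases$sequence, fixed = TRUE))
mean(grepl(opt$embedding_neg, negative.cases$sequence, fixed = TRUE))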
Optionally downsample.
if (opt$downsample < 1) {
  all.cases <- all.cases[sample(1:nrow(all.cases), size=opt$downsample*nrow(all.cases)),]
}
Split into training and validation.
idx <- sample(1:nrow(all.cases), round(.9*nrow(all.cases)))
training_data <- all.cases[idx, c('sequence')]
training_labels <- all.cases[idx, 'y']
validation_data <- all.cases[-idx, c('sequence')]
validation_labels <- all.cases[-idx, 'y']
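A quick, optional check of the class balance in each split:

# Optional: class balance in the training and validation splits
table(training_labels)
table(validation_labels)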
Set up logging directory and save validation set.
setup_log_dir(opt)
write.table(cbind(validation_data, validation_labels),
            file=file.path(opt$log_dir, 'testset.csv'),
            sep=',', row.names=F, col.names=F)
One-hot encode for the model.
training_array <- one_hot(training_data, opt$filter_len)
validation_array <- one_hot(validation_data, opt$filter_len)
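The model below expects each encoded sequence to have shape 4 x (max_len + 2*filter_len - 2) x 1, which suggests one_hot pads each sequence by filter_len - 1 positions on each side (an inference from the input_shape used below, not something stated here). A quick dimension check:

# Expected shape per sequence (inferred from input_shape below):
# 4 x (opt$max_len + 2*opt$filter_len - 2) x 1
dim(training_array)
dim(validation_array)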
Build the model following typical keras syntax.
deNovo_sequence <- layer_input(shape=c(4, opt$max_len + 2*opt$filter_len - 2, 1), name='deNovo_input')

deNovo_model <- deNovo_sequence %>%
  layer_deNovo(filter=opt$n_filters,
               filter_len=opt$filter_len,
               lambda_pos=opt$lambda_pos,
               lambda_filter=opt$lambda_filter,
               lambda_l1=opt$lambda_l1,
               lambda_offset=opt$lambda_offset,
               input_shape = c(4, opt$max_len + 2*opt$filter_len - 2, 1),
               activation='sigmoid') %>%
  layer_max_pooling_2d(pool_size = c(1, (opt$max_len + opt$filter_len - 1)), name='deNovo_pool') %>%
  layer_flatten(name='deNovo_flatten') %>%
  layer_dense(units=1, activation='sigmoid',
              kernel_regularizer = regularizer_l1(l=opt$lambda_beta))

model <- keras_model(inputs = deNovo_sequence, outputs = deNovo_model)

model %>% compile(
  loss = "binary_crossentropy",
  optimizer = optimizer_adam(lr=opt$learning_rate, decay=opt$decay_rate),
  metrics = c("accuracy")
)
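Printing the model summary is a convenient way to confirm layer shapes and parameter counts before training:

# Inspect the architecture and parameter counts
summary(model)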
Learn the motifs. Define a callback that outputs plots during training and ensures the weights represent IGMs.
deNovo_callback <- deNovo_IGM_Motifs$new(model, opt$output_plots_every, opt$log_dir,
                                         plot_activations=opt$plot_Activations,
                                         plot_filters=opt$plot_filterMotifs,
                                         plot_crosscorrel_motifs=opt$plot_empiricalMotifs,
                                         deNovo_data=validation_array,
                                         test_labels=validation_labels,
                                         test_seqs=validation_data,
                                         num_deNovo=opt$n_filters,
                                         filter_len=opt$filter_len)

sequence_fit <- model %>% fit(
  x=training_array,
  training_labels,
  batch_size = opt$batch_size,
  epochs = opt$epochs,
  validation_data = list(validation_array, validation_labels),
  shuffle = TRUE,
  callbacks = list(deNovo_callback)
)
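Once training finishes, held-out performance can be checked with the standard keras evaluate() and predict() interface; this is a sketch, not part of the original walkthrough.

# Held-out loss and accuracy on the validation set
model %>% evaluate(validation_array, validation_labels, verbose = 0)

# Predicted probabilities for the first few validation sequences
head(predict(model, validation_array))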
Save the training plot, the model, and its weights.
p <- plot(sequence_fit, method='ggplot2', smooth=TRUE)
ggsave(paste(opt$log_dir, "Training_loss_.pdf", sep="/"), plot=p, width=15, height=7.5, units='in')
model$save(file.path(opt$log_dir, 'deNovo_model.h5'))
model$save_weights(file.path(opt$log_dir, 'deNovo_model_weights.h5'))
write.table(model$to_yaml(), file.path(opt$log_dir, 'deNovo_model.yaml'))
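Because the saved .h5 model contains the custom layer_deNovo layer, reloading the full model may require supplying that layer as a custom object. A simpler route, sketched here, is to rebuild the architecture with the code above and restore only the weights:

# Sketch: rebuild the model with the architecture code above, then restore its weights
model %>% load_model_weights_hdf5(file.path(opt$log_dir, 'deNovo_model_weights.h5'))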