The package relies on keras to perform much of the heavy lifting. Start by making sure the keras
package is installed and can be loaded.
rm(list = ls(all = TRUE))
knitr::opts_chunk$set(echo = TRUE)
library(keras)
library(learnMotifs)
library(tidyverse)
Set model options. Note that we do not cache results or set a random seed, so results will differ between runs.
opt <- list()
opt$max_len <- 200                     # sequence length
opt$log_dir <- 'log_JASPAR'            # logging directory
opt$batch_size <- 64                   # batch size for training
opt$epochs <- 10                       # training epochs
opt$lambda_beta <- .05                 # L1 regularization penalty on the betas (effect sizes)
opt$extranneous_jaspar_motifs <- 'ALL' # number of extra JASPAR-annotated motifs (beyond MYC, IRF, CTCF) to initialize as convolutional filters
opt$learning_rate <- 0.05              # stochastic gradient descent learning rate
opt$decay_rate <- opt$learning_rate / opt$epochs / 2 # learning rate decay
opt$plot_Activations <- TRUE           # boxplot of activation differences by class per filter
opt$plot_filterMotifs <- TRUE          # plot sequence logos (IGMs) directly
opt$plot_empiricalMotifs <- FALSE      # plot sequence logos from maximally-activated sub-sequences
opt$JASPAR_pfms <- file.path(opt$log_dir, 'JASPAR_known_motifs.txt') # file path for downloading and saving the annotated motifs
opt$output_plots_every <- 5            # plotting frequency (every X epochs)
opt$downsample <- 1                    # proportion of training cases to retain
opt$shuffle <- .1                      # proportion of cases to mislabel (i.e. how many cases receive the wrong label)
opt$cache_Old <- FALSE                 # cache old results
Load the simulated nucleotide sequence files. The negative cases (Y=0) are simply random background nucleotide sequences, while the positive cases (Y=1) contain up to three embeddings of three distinct motifs (MYC, CTCF, IRF). Cases may contain multiple motifs and were simulated with the simdna package. In this vignette we will initialize (and fix) the convolutional filters using the annotated JASPAR motifs as information gain matrices (IGMs).
negative.cases <- EmptyBackground
positive.cases <- MYC_CTCF_IRF
Combine the two sets and randomly flip a proportion (opt$shuffle) of the labels to increase difficulty.
positive.cases$y <- rbinom(nrow(positive.cases), 1, prob = (1 - opt$shuffle))
negative.cases$y <- rbinom(nrow(negative.cases), 1, prob = opt$shuffle)
all.cases <- rbind(positive.cases, negative.cases)
all.cases <- all.cases[sample(1:nrow(all.cases), size = nrow(all.cases), replace = FALSE), ]
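As a quick sanity check (not part of the original workflow), the realized mislabeling rates should each sit near opt$shuffle:
# Realized label-noise rates; both should be close to opt$shuffle (0.1).
mean(positive.cases$y == 0) # positive cases that lost their label
mean(negative.cases$y == 1) # negative cases that gained a label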
Optionally downsample.
if (opt$downsample < 1) {
  all.cases <- all.cases[sample(1:nrow(all.cases), size = opt$downsample * nrow(all.cases)), ]
}
Split into training and validation.
idx <- sample(1:nrow(all.cases), round(.9 * nrow(all.cases)))
training_data <- all.cases[idx, c('sequence')]
training_labels <- all.cases[idx, 'y']
validation_data <- all.cases[-idx, c('sequence')]
validation_labels <- all.cases[-idx, 'y']
Set up logging directory.
setup_log_dir(opt)
Import previously annotated motifs. We'll use JASPAR motifs and format them for use as convolutional filters by converting them to IGMs; one may opt to use PWMs instead.
inserted.motifs <- c('MYC', 'CTCF', 'IRF')
download.file('http://jaspar.genereg.net/download/CORE/JASPAR2018_CORE_vertebrates_non-redundant_pfms_jaspar.txt', opt$JASPAR_pfms)
jaspar.raw <- my_readJASPARMatrix(opt$JASPAR_pfms, type = 'all')
jaspar.names <- names(jaspar.raw)
if (opt$extranneous_jaspar_motifs == 'ALL') {
  jaspar.pos <- 1:length(jaspar.raw)
} else {
  jaspar.pos <- sapply(inserted.motifs, function(j) { which(jaspar.names == j) })
  jaspar.pos <- c(jaspar.pos,
                  sample((1:length(jaspar.raw))[!(1:length(jaspar.raw) %in% jaspar.pos)],
                         opt$extranneous_jaspar_motifs, replace = FALSE))
}
jaspar.pos <- jaspar.pos[!is.na(jaspar.pos)]
jaspar.names <- jaspar.names[jaspar.pos]
jaspar.maxlen <- find_max_length(jaspar.raw, jaspar.pos)
jaspar.motifs <- pfm_to_igm(jaspar.raw, jaspar.pos, jaspar.maxlen)
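For intuition, here is a minimal, self-contained sketch of one common PFM-to-IGM conversion assuming a uniform 0.25 background; pfm_to_igm_sketch and toy_pfm are hypothetical names, and this is not necessarily identical to the internals of the package's pfm_to_igm.
# Illustrative only: convert a count-based PFM to an information gain matrix
# under a uniform 0.25 background; negative gains are clamped to zero.
pfm_to_igm_sketch <- function(counts, pseudo = 1e-6) {
  p <- sweep(counts + pseudo, 2, colSums(counts + pseudo), '/') # per-position probabilities
  pmax(p * log2(p / 0.25), 0)
}
toy_pfm <- matrix(c(8, 0, 0, 0,  # A
                    0, 8, 0, 4,  # C
                    0, 0, 8, 4,  # G
                    0, 0, 0, 0), # T
                  nrow = 4, byrow = TRUE,
                  dimnames = list(c('A', 'C', 'G', 'T'), NULL))
round(pfm_to_igm_sketch(toy_pfm), 2) # fully-conserved positions reach 2 bits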
One-hot encode for the model.
training_array_jaspar <- one_hot(training_data, jaspar.maxlen)
validation_array_jaspar <- one_hot(validation_data, jaspar.maxlen)
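Conceptually, one-hot encoding maps each sequence to a 4 x L binary matrix with one row per nucleotide. The sketch below (one_hot_sketch is a hypothetical helper) illustrates the idea only; the package's one_hot additionally pads and reshapes the result to match the convolution input width used below.
# Toy illustration of one-hot encoding, independent of the package internals.
one_hot_sketch <- function(s) {
  bases <- c('A', 'C', 'G', 'T')
  m <- sapply(strsplit(s, '')[[1]], function(ch) as.integer(bases == ch))
  rownames(m) <- bases
  m # 4 x nchar(s) binary matrix, one column per position
}
one_hot_sketch('ACGTT')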
Calculate the expected activations to use as offsets. Because the model applies a sigmoid activation, we compute the average activation of each motif (i.e. convolutional filter) on a small batch of data and use these values as offsets, ensuring the post-sigmoid activations are centered around 0.5, the steepest-sloped point of the sigmoid curve.
jaspar.offsets <- calculate_offsets(jaspar.motifs, jaspar.pos, jaspar.maxlen, training_array_jaspar, batchsize=256)
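To see why the offsets matter, here is a toy illustration with hypothetical activation values (plogis is base R's sigmoid):
# Raw filter activations tend to sit far from zero, so an un-offset sigmoid
# saturates; subtracting the filter's mean activation recenters the
# post-sigmoid values around 0.5, where gradients are largest.
a <- c(3.8, 4.1, 4.4) # hypothetical raw activations for one filter
plogis(a)             # saturated near 1
plogis(a - mean(a))   # centered near 0.5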
Build the JASPAR model to estimate the motif effects. We expect the largest effect sizes for the three inserted motifs.
jaspar_sequence <- layer_input(shape = c(4, opt$max_len + 2*jaspar.maxlen - 2, 1), name = 'jaspar_input')
jaspar_output <- jaspar_sequence %>%
  layer_fixedMotif(filter = length(jaspar.pos),
                   motif_maxlen = jaspar.maxlen,
                   fixed.motifs = jaspar.motifs,
                   fixed.offsets = jaspar.offsets,
                   input_shape = c(4, opt$max_len + 2*jaspar.maxlen - 2, 1),
                   name = 'jaspar_conv') %>%
  layer_max_pooling_2d(pool_size = c(1, (opt$max_len + jaspar.maxlen - 1)), name = 'jaspar_pool') %>%
  layer_flatten(name = 'jaspar_flatten') %>%
  layer_dense(units = 1, activation = 'sigmoid',
              kernel_constraint = nonnegative_constraint,
              kernel_regularizer = regularizer_l1(l = opt$lambda_beta))

model <- keras_model(inputs = c(jaspar_sequence), outputs = c(jaspar_output))

model %>% compile(
  loss = "binary_crossentropy",
  optimizer = optimizer_sgd(lr = opt$learning_rate, decay = opt$decay_rate),
  metrics = c("accuracy")
)

model_fit <- model %>% fit(
  x = training_array_jaspar,
  y = training_labels,
  batch_size = opt$batch_size,
  epochs = opt$epochs,
  validation_data = list(validation_array_jaspar, validation_labels),
  shuffle = TRUE
)
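Before extracting effect sizes, it can help to inspect the fitted weights; the list follows layer order, which is why the code further below reads the convolutional filters from element [[1]] and the betas from element [[3]]. An optional held-out evaluation is also shown.
# Inspect the weight list; it follows layer order, so the convolutional
# filters are element [[1]] and the dense-layer betas element [[3]].
str(model$get_weights(), max.level = 1)

# Optional: held-out performance after training.
model %>% evaluate(validation_array_jaspar, validation_labels)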
Save training plot.
p <- plot(model_fit, method = 'ggplot2', smooth = TRUE)
ggsave(paste(opt$log_dir, "Training_loss_Jaspar_Preliminary.pdf", sep = "/"),
       plot = p, width = 15, height = 7.5, units = 'in')
Create a tibble with the motif name and the estimated effect size.
betas <- tibble(Effect = c(model$get_weights()[[3]]), Filter = jaspar.names)
Many of the motifs have an estimated effect size of exactly zero, a consequence of the sparsity the L1 penalty imposes on the betas. These motifs are irrelevant and may be removed from the model without impacting performance.
print(paste(sum(betas$Effect == 0), 'of the', nrow(betas),
            'motifs may be discarded at the defined threshold and regularization penalty.', sep = ' '))
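To see which motifs those are, the zero-effect filters can be listed directly (an illustrative addition using the already-loaded tidyverse):
# Names of the motifs whose betas were shrunk exactly to zero by the L1 penalty.
betas %>% filter(Effect == 0) %>% pull(Filter) %>% head(10)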
Let's look at which three motifs have the largest effect sizes. Ideally these should include the inserted motifs (MYC, CTCF, IRF). Note that the inserted motif named IRF corresponds to the JASPAR motif IRF2; the difference in naming is simply due to the source, as there is no motif named IRF in the JASPAR annotations.
betas %>% arrange(desc(Effect)) %>% head(3)
Output the sequence logo plots for any motifs with an absolute estimated effect size greater than 0.05. Note the high similarity between the retained motifs.
relevant_filters <- model$get_weights()[[1]][, , , which(abs(betas$Effect) > .05), drop = FALSE]
filter_list <- format_filter_motifs(relevant_filters)
names(filter_list) <- pull(betas[which(abs(betas$Effect) > .05), 'Filter'])
plot_motifs(filter_list, ylow = 0, yhigh = 2, method = 'custom',
            fl = file.path(opt$log_dir, "relevant_filters.pdf"))
Extract the per-filter activations on the validation data for inspection, and plot the mean activation difference between classes for visualization.
betas$Information <- apply(jaspar.motifs, 4, sum)
input_tensor <- model$input
activations_fn <- k_function(inputs = list(input_tensor),
                             outputs = list(model$get_layer(name = 'jaspar_flatten')$output))
activations <- activations_fn(list(validation_array_jaspar))[[1]]
colnames(activations) <- jaspar.names
activations <- as_tibble(activations)
activations$Y <- validation_labels
activations <- activations %>%
  gather(Filter, Activation, -Y) %>%
  left_join(betas, by = 'Filter')
betas <- calcActivationDifference(activations)
plot_a1 <- plot_activation_difference(betas)
ggsave(paste(opt$log_dir, "Betas_Jaspar_Preliminary.pdf", sep = "/"),
       plot = plot_a1, width = 15, height = 7.5, units = 'in')