In davetgerrard/GenomicLayers: Simple, sequence-based simulation of epi-genomes.

Introduction

script to run Rap1 simulation store output every ten iterations

library(GenomicLayers)

Binding factors ===================

RAP1 (RAP1P?) to bind to regexp

# patternLength set to minimum length the pattern can match (CA)
bf.RAP1.1 <- createBindingFactor.DNA_regexp("bf.RAP1.1", forRegExp="(C{1,3})A",revRegExp= "T(G{1,3})",
                                            mod.layers = "Sir3p_potential", mod.marks=1, stateWidth=20)

# a second version to see if the reg-exp is too lax.
bf.RAP1.2 <- createBindingFactor.DNA_regexp("bf.RAP1.2", forRegExp="(C{2,3})A",revRegExp= "T(G{2,3})",
                                            mod.layers = "Sir3p_potential", mod.marks=1, stateWidth=20)

bf.RAP1.3 <- createBindingFactor.DNA_consensus("bf.RAP1.3", patternString="GGTGT", 
                                               mod.layers = "Sir3p_potential", mod.marks=1, stateWidth=5)

# this version to extend the state to 11 so that two hits with 6bp would be contiguous
bf.RAP1.3C <- createBindingFactor.DNA_consensus("bf.RAP1.3C", patternString="GGTGT", 
                                            mod.layers = "Sir3p_potential", mod.marks=1, stateWidth=5)

bf.RAP1.4 <- createBindingFactor.DNA_regexp("bf.RAP1.4", forRegExp="GGTGT(.{0,3})GGTGT",
                                            revRegExp="ACACC(.{0,3})ACACC", patternLength = 10,
                                            mod.layers = "Sir3p_potential", mod.marks=1, stateWidth=20, )

bf.RAP1.4B <- createBindingFactor.DNA_regexp("bf.RAP1.4B", forRegExp="GGTGT(.{0,3})GGTGT",
                                            revRegExp="ACACC(.{0,3})ACACC",
                                            mod.layers = "Sir3p_potential", mod.marks=1, stateWidth=20, )

bf.RAP1.5 <- createBindingFactor.DNA_consensus("bf.RAP1.5", patternString="GGTGTGTGGGTGT", 
                                               mod.layers = "Sir3p_potential", mod.marks=1, stateWidth=20, )

nucleosomeWidth <- 147

Sir3p binds to telomeres and is also spread cannot bind acetylated regions or H3K4me

# bf.Sir3p_M1 <- createBindingFactor.layer_region("bf.Sir3p_M1", type="layer_region", 
#                                              patternLength = 5, patternString = "N",
#                                              profile.layers = c("Sir3p_potential"),
#                                              stateWidth=10, profile.marks = c(1), 
#                                              mod.layers = "bound.Sir3p", mod.marks = 1)


# 
bf.Sir3p_S <- createBindingFactor.layer_region("bf.Sir3p_S", type="layer_region", 
                                             patternLength = 5, patternString = "N",
                                             profile.layers = c("Sir3p_potential"),
                                             stateWidth=5, profile.marks = c(1), 
                                             mod.layers = "bound.Sir3p", mod.marks = 1)

need also something to spread Sir3p along the chromosome. want a binding factor that creates mods either up or downstream of binding site. Parameter 'offset.method' to accept function as argument

upDownFuncRnorm <- function(n, offset.mean, offset.sd)  {
  y <- round(rnorm(n, mean=offset.mean, sd=offset.sd))
  z <- sample(c(1, -1), length(y), replace=T)  # random vector of 1,-1  to negate half the values

  return(round(y*z))
}



# PROBLEM cannot spread into region that has H3K4me or H4K16ac
bf.Sir3p.spread <-  createBindingFactor.layer_region("bf.Sir3p.spread", type="layer_region", 
                                                     patternLength = 5, 
                                                     profile.layers = c("bound.Sir3p"),
                                                     stateWidth=5, profile.marks = c(1), 
                                                     mod.layers = "bound.Sir3p", mod.marks = 1, offset = 100,
                                                     offset.method=upDownFuncRnorm, 
                                                     offset.params=list(offset.mean=147, offset.sd=30))

Create the layerSet object to include the genome and all the layers we may need

Layers needed: -

RAP1 binding/recruitment - can act as recruiter for other repressive modifiers. bound.Sir3p

library(BSgenome.Scerevisiae.UCSC.sacCer3) 

genome <- BSgenome.Scerevisiae.UCSC.sacCer3

sequences_to_keep <- "chrIII"   # telomere has many C{1,3}A repeats at both ends of the chromosome.
# chrIII also contains the mating type loci HMR and HML
genomeSub <- keepBSgenomeSequences(genome, sequences_to_keep)
genomeSub    # this should now still be a useable BSgenome object but with only one chromosome.  


# set up a LayerSet on the genome :  a list with link to genome and GRanges objects to store position information
# two additional layers "H3K4me","H4K16ac" are specified here but not used in these sims.
layerSubGenome <- createLayerSet.BSgenome(genome=genomeSub, 
                                          layer.names=c("Sir3p_potential",  "bound.Sir3p",  "H3K4me","H4K16ac"),
                                          n.layers=4)

You may want to test the binding of your factors against the genome.

(hits <- matchBindingFactor.BSgenome(layerSet = layerSubGenome,bindingFactor =  bf.RAP1.1))

(hits <- matchBindingFactor.BSgenome(layerSet = layerSubGenome,bindingFactor =  bf.RAP1.2))

(hits <- matchBindingFactor.BSgenome(layerSet = layerSubGenome,bindingFactor =  bf.RAP1.3))
(hits <- matchBindingFactor.BSgenome(layerSet = layerSubGenome,bindingFactor =  bf.RAP1.3C))
(hits <- matchBindingFactor.BSgenome(layerSet = layerSubGenome,bindingFactor =  bf.RAP1.4))
(hits <- matchBindingFactor.BSgenome(layerSet = layerSubGenome,bindingFactor =  bf.RAP1.4B))
(hits <- matchBindingFactor.BSgenome(layerSet = layerSubGenome,bindingFactor =  bf.RAP1.5))

# and a check of the sequence at these positions
getSeq(genomeSub, hits)

gregexpr("GGTGT(.{0,3})GGTGT", getSeq(genomeSub))
gregexpr("ACACC(.{0,3})ACACC", getSeq(genomeSub))   # rev-comp of above
gregexpr("ACACC(.{0,4})ACACC", getSeq(genomeSub))  ## max distance to be telomere specific

gregexpr("GGTGT(.{1,3})GGTGT", getSeq(genomeSub))

vmatchPattern(pattern = "GGTGT", genomeSub)

MODEL 1

# combine all bfs into a list to pass to runLayerBinding. Order DOES matter.
bfSet.1 <- list(bf.RAP1.3=bf.RAP1.3, 
                bf.Sir3p_S=bf.Sir3p_S,
                bf.Sir3p.spread=bf.Sir3p.spread                

)   # binding factors must be in a list and named. Easiest to use each BFs name.

# specify abundances of bfs.
abs.1 <- structure(c(50,20,1000), names=c(names(bfSet.1)))

iters <- 200
saveFreq <- 10
outFile <- "path_to_outFile"


# take a copy of the subGenome object 
layerSubGenome.1 <- layerSubGenome
# cache the sequence specific parts of profiles
layerSubGenome.1 <- generateHitsCache(layerList=layerSubGenome.1, factorSet=bfSet.1[c('bf.RAP1.3' )], cache.layers="LAYER.0") 

for(i in 1:iters) {
  print(i)
  layerSubGenome.1 <- runLayerBinding.BSgenome(layerList=layerSubGenome.1, 
                                               factorSet=bfSet.1, 
                                               bf.abundances = abs.1, 
                                               verbose=TRUE, collect.stats=TRUE) 

  if((i %% saveFreq) == 0) {   # export current model state.
    tempFileOut <- paste0(outFile, ".iter.", i, ".RData")
    print(paste("Saving to ", tempFileOut))
    #save(layerSubGenome.1, file=tempFileOut)  #  use this to save model state periodically.
     plotLayers(layerSubGenome.1, layerNames=c("Sir3p_potential","bound.Sir3p"), chrom="chrIII", xlim=c(1,seqlengths(genomeSub)["chrIII"]))
  }



}

The output of model 1 should be widespread binding across the whole chromosome.

MODEL 2

spreading of rap1.4, addition only, no marks removal

# combine all bfs into a list to pass to runLayerBinding. Order DOES matter.
bfSet.2 <- list(bf.RAP1.4=bf.RAP1.4, 
                bf.Sir3p_S=bf.Sir3p_S,
                bf.Sir3p.spread=bf.Sir3p.spread                

)   # binding factors must be in a list and named. Easiest to use each BFs name.

# specify abundances of bfs.
abs.2 <- structure(c(50,20,1000), names=c(names(bfSet.2)))

Now repeat using model 2

iters <- 200
saveFreq <- 10
outFile <- "path_to_outFile"


# take a copy of the subGenome object 
layerSubGenome.2 <- layerSubGenome
# cache the sequence specific parts of profiles
layerSubGenome.2 <- generateHitsCache(layerList=layerSubGenome.2, factorSet=bfSet.2[c('bf.RAP1.4' )], cache.layers="LAYER.0") 

for(i in 1:iters) {
  print(i)
  layerSubGenome.2 <- runLayerBinding.BSgenome(layerList=layerSubGenome.2, 
                                               factorSet=bfSet.2, 
                                               bf.abundances = abs.2, 
                                               verbose=TRUE, collect.stats=TRUE) 

  if((i %% saveFreq) == 0) {   # export current model state.
    tempFileOut <- paste0(outFile, ".iter.", i, ".RData")
    print(paste("Saving to ", tempFileOut))
    #save(layerSubGenome.2, file=tempFileOut)  #  use this to save model state periodically.
     plotLayers(layerSubGenome.2, layerNames=c("Sir3p_potential","bound.Sir3p"), chrom="chrIII", xlim=c(1,seqlengths(genomeSub)["chrIII"]))
  }



}

Can examine the final state of the layers

layerSubGenome.2$layerSet$bound.Sir3p

layerSubGenome.2$history

Now set up and run a whole genome model. We will use the nuclear chromosomes only and remove the mitochondrial chromosome/genome first.

sequences_to_keep <- setdiff(seqnames(genome), "chrM")  

genomeNuc <- keepBSgenomeSequences(genome, sequences_to_keep)
genomeNuc    # this should now still be a useable BSgenome object but with no mitochondrial chromosome.
# set up a LayerSet on the genome :  a list with link to genome and GRanges objects to store position information
# two additional layers "H3K4me","H4K16ac" are specified here but not used in these sims.
layerGenomeNuc <- createLayerSet.BSgenome(genome=genomeNuc, 
                                          layer.names=c("Sir3p_potential",  "bound.Sir3p",  "H3K4me","H4K16ac"),
                                          n.layers=4)

Need to increase the abundances of the binding factors roughly in proportion. Helps to look at the seqlengths of chromosomes.

seqlengths(genomeNuc)
seqlengths(genomeNuc)["chrIII"] / sum(seqlengths(genomeNuc))   # approx 1/40th of the genome
abs.2
abs.wg <- abs.2 * 40   # increase abundances 40-fold

Now run the whole (nuclear) genome simulation.

iters <- 200
saveFreq <- 10
outFile <- "path_to_outFile"


# take a copy of the subGenome object 
layerGenomeNuc.2 <- layerGenomeNuc
# cache the sequence specific parts of profiles
layerGenomeNuc.2 <- generateHitsCache(layerList=layerGenomeNuc.2, factorSet=bfSet.2[c('bf.RAP1.4' )], cache.layers="LAYER.0") 

for(i in 1:iters) {
  print(i)
  layerGenomeNuc.2 <- runLayerBinding.BSgenome(layerList=layerGenomeNuc.2, 
                                               factorSet=bfSet.2, 
                                               bf.abundances = abs.wg, 
                                               verbose=TRUE, collect.stats=TRUE)   # using abs.wg

  if((i %% saveFreq) == 0) {   # export current model state.
    tempFileOut <- paste0(outFile, ".iter.", i, ".RData")
    print(paste("Saving to ", tempFileOut))
    #save(layerGenomeNuc.2, file=tempFileOut)  #  use this to save model state periodically.
     plotLayers(layerGenomeNuc.2, layerNames=c("Sir3p_potential","bound.Sir3p"), chrom="chrIII", xlim=c(1,seqlengths(genomeNuc)["chrIII"]))
  }



}

Might be worth comparing how the chrIII abundances scale up to the whole genome

layerSubGenome.2$layerSet$bound.Sir3p
sum(width(layerSubGenome.2$layerSet$bound.Sir3p))
#layerSubGenome.2$history   

# now compare with whole genome sim
layerGenomeNuc.2$layerSet$bound.Sir3p
sum(width(layerGenomeNuc.2$layerSet$bound.Sir3p))
#layerGenomeNuc.2$history
# calculate the coverage by chromosome using by()
by(layerGenomeNuc.2$layerSet$bound.Sir3p, INDICES=seqnames(layerGenomeNuc.2$layerSet$bound.Sir3p) , FUN=function(x) sum(width(x)))

COMMANDS TO RUN FROM THE COMMAND LINE IN A SCRIPT - NOT RUN HERE

# Load the getopt library
library(getopt)

# Define command-line options
spec = matrix(c(
  'verbose', 'v', 2, "integer",
  'help'   , 'h', 0, "logical",
  'out.file' , 'o', 2, "character", 
  'iterations', 'i', 2, "integer",
  'saveFreq',  's', 2, "integer"
), byrow=TRUE, ncol=4)

# Parse command-line options
opt <- getopt(spec)

# Set default values for options if they are not provided
if ( is.null(opt$verbose)) { opt$verbose <- FALSE }

# Output file
outFile <- opt$out.file
if (is.null(opt$out.file)) { outFile <- "outputfile" }

iters <- opt$iterations
if (is.null(opt$iterations)) { iters <- 100 }

saveFreq <- opt$saveFreq
if (is.null(opt$saveFreq)) { saveFreq <- iters }

# Print the values for verification
print(paste("Out file stem:", opt$out.file))