jDMR: A heuristic DMR caller for large-scale WGBS data

Documented in makeReg

makeReg<-function(ref.genome, contexts, chr, min.C, N.boot, N.sim.C, fp.rate, set.tol, out.dir, out.name, makeRegnull)
{
  
  # Defining function for cleaning objects in the global environment from within the function
  CleanEnvir <- function(pattern)
  {
    rm(list = ls(envir=globalenv())[
      grep(pattern, ls(envir=globalenv()))], envir = globalenv())
  }
  
  #ref.genome<-out
  #context="CHH"
  #chr=1
  #min.C=5
  #N.boot=10^5
  #fp.rate=0.01
  #set.tol<-0.001
  
  # This function constructs the regions based on a sequence input
  conReg<-function(seq.in, min.C, win, chr)
  {
    
    start<-NULL
    stop<-NULL
    index<-rep(0, length(seq.in))
    
    pb <- txtProgressBar(min = 1, max = length(seq.in), char = "=", style = 3, file = "")
    for (i in seq_len(c(length(seq.in)- c(min.C-1))))
    {
      if (as.numeric(seq.in[i+ c(min.C-1)])-as.numeric(seq.in[i]) <= win){index[i:c(i + c(min.C-1))]<-rep(1, min.C)}
      
      Sys.sleep(1/length(seq.in)); setTxtProgressBar(pb, i)
      
    }
    
    close(pb)
    
    runs<-rle(index > 0)
    myruns = which(runs$values == TRUE & runs$lengths >= min.C)
    runs.lengths.cumsum = cumsum(runs$lengths)
    ends = runs.lengths.cumsum[myruns]
    newindex = ifelse(myruns>1, myruns-1, 0)
    starts = runs.lengths.cumsum[newindex] + 1
    if (0 %in% newindex) starts = c(1,starts)
    
    
    start.pos<-as.numeric(unlist(seq.in[starts]))
    end.pos<-as.numeric(unlist(seq.in[ends]))
    cluster.length<-end.pos - start.pos
    
    cluster<-cbind(chr, start.pos, end.pos, cluster.length)
    colnames(cluster)<-c("chr","start", "end", "cluster.length")
    cluster<-as.data.frame(cluster, stringsAsFactors = FALSE)
    cluster$chr<-as.character(cluster$chr)
    cluster$start<-as.integer(cluster$start)
    cluster$end<-as.integer(cluster$end)
    cluster$cluster.length<-as.integer(cluster$cluster.length)
    cluster$region<-paste("reg", 1:nrow(cluster), sep="")
    
    return(cluster)
  }
  
  
  
  
  cat("- Reading in the data", "\n")
  cat("-------------------------", "\n")
  cat("                         ", "\n")
  
  # Subselect genome based on chr.id
  #$ref.genome <- ref.genome %>% filter(ref.genome$chr==chr)
  
  # Making the position vectors
  CHH.pos.plus<-ref.genome[which(ref.genome$strand  == "+" &  ref.genome$context == "CHH"),]$pos
  CHG.pos.plus<-ref.genome[which(ref.genome$strand  == "+" &  ref.genome$context == "CHG"),]$pos
  CG.pos.plus<-ref.genome[which(ref.genome$strand  == "+" &  ref.genome$context == "CG"),]$pos
  CHH.pos.minus<-ref.genome[which(ref.genome$strand  == "-" &  ref.genome$context == "CHH"),]$pos
  CHG.pos.minus<-ref.genome[which(ref.genome$strand  == "-" &  ref.genome$context == "CHG"),]$pos
  CG.pos.minus<-ref.genome[which(ref.genome$strand  == "-" &  ref.genome$context == "CG"),]$pos
  
  # Determining the number of CHH, CHG and CG sites (+) strand
  N.CHH.obs<-length(CHH.pos.plus)
  N.CHG.obs<-length(CHG.pos.plus)
  N.CG.obs<-length(CG.pos.plus)
  
  
  if (N.sim.C == "all")
  {
    chr.length<-max(ref.genome$pos)
    N.CHH<-N.CHH.obs
    N.CHG<-N.CHG.obs
    N.CG<-N.CG.obs
    set.seed(123)
    rp.CHH<-sample(1:chr.length, size=N.CHH, replace = FALSE)
    rp.CHG<-sample(1:chr.length, size=N.CHG, replace = FALSE)
    rp.CG<-sample(1:chr.length, size=N.CG, replace = FALSE)
  }else{
    if (is.numeric(N.sim.C) == FALSE){stop("N.sim.C needs to be numeric")}else{
      chr.length<-max(ref.genome$pos)
      scale.factor<-N.sim.C/c(N.CHH.obs + N.CHG.obs + N.CG.obs) 
      chr.length<-round(chr.length*scale.factor,0)
      N.CHH<-round(N.CHH.obs*scale.factor,0)
      N.CHG<-round(N.CHG.obs*scale.factor,0)
      N.CG<-round(N.CG.obs*scale.factor,0)
      set.seed(123)
      rp.CHH<-sample(1:chr.length, size=N.CHH, replace = FALSE)
      rp.CHG<-sample(1:chr.length, size=N.CHG, replace = FALSE)
      rp.CG<-sample(1:chr.length, size=N.CG, replace = FALSE)
    }
  }
  
  
  # Remove this large data. It's no longer needed
  CleanEnvir(pattern = "ref.genome")
  tryCatch(
    withTimeout ({
      Sys.sleep(2)
      
      cat("- Simulating cytosines", "\n")
      cat("-------------------------", "\n")
      
      rp.CHH<-sort(rp.CHH, method="quick")
      rp.CHG<-sort(rp.CHG, method="quick")
      rp.CG<-sort(rp.CG, method="quick")
      
      CHH.lost<-0
      CHG.lost<-0
      CG.lost<-0
      
      tolerance = set.tol + 0.00001
      
      while (tolerance > set.tol)
      {
        ### CHH
        # Draw random positions (rp) of CHH sites
        rp.CHH<-c(rp.CHH, sample(1:chr.length, size= CHH.lost, replace =FALSE))
        if  (length(rp.CHH) > N.CHH){rp.CHH<-sample(rp.CHH, size = N.CHH, replace=F)}
        rp.CHH<-sort(rp.CHH)
        
        ### CHG 
        # Draw random positions (rp) of CHG sites
        # Rules 1 to 3
        rp.CHG<-c(rp.CHG, sample(1:chr.length, size= CHG.lost, replace =FALSE))
        if  (length(rp.CHG) > N.CHG){rp.CHG<-sample(rp.CHG, size = N.CHG, replace=F)}
        rp.CHG<-sort(rp.CHG)
        rp.CHG<-rp.CHG[which(diff(rp.CHG, lag=1) >= 3)]
        
        # Rule 4
        rp.CHH<-rp.CHH[which(!is.element(rp.CHH, rp.CHG))]
        
        # Rule 5
        # no action
        
        # Rule 6
        # no action
        
        # Rule 7
        rp.CHH<-rp.CHH[which(!is.element(rp.CHH, c(rp.CHG+1)))]
        
        # Rule 8
        rp.CHH<-rp.CHH[which(!is.element(rp.CHH, c(rp.CHG+2)))]
        
        
        ### CG 
        # Draw random positions (rp) of CG sites
        # Rules 1 to 3
        rp.CG<-c(rp.CG, sample(1:chr.length, size= CG.lost, replace =FALSE))
        if  (length(rp.CG) > N.CG){rp.CG<-sample(rp.CG, size = N.CG, replace=F)}
        rp.CG<-sort(rp.CG)
        rp.CG<-rp.CG[which(diff(rp.CG, lag=1) >= 2)]
        
        # Rule 4
        rp.CHH<-rp.CHH[which(!is.element(rp.CHH, rp.CG))]
        
        # Rule 5
        rp.CHH.temp<-rp.CHH+1
        rp.CHH.temp<-rp.CHH.temp[which(is.element(rp.CHH.temp, rp.CG))] 
        rp.CHH<-rp.CHH[which(!is.element(c(rp.CHH+1), rp.CG))]
        rp.CHG<-c(rp.CHG, c(rp.CHH.temp-1))
        rp.CHG<-unique(rp.CHG)
        if (length(rp.CHG) > N.CHG){rp.CHG<-sample(rp.CHG, size = N.CHG, replace =F)}
        rp.CHG<-sort(rp.CHG)
        rp.CHG<-rp.CHG[which(diff(rp.CHG, lag=1) >= 3)]
        
        # Rule 6
        # no action
        
        # Rule 7
        rp.CHH<-rp.CHH[which(!is.element(rp.CHH, c(rp.CG+1)))]
        
        # Rule 8
        rp.CHG<-rp.CHG[which(!is.element(rp.CHG, rp.CG))]
        
        # Rule 9
        # no action
        
        # Rule 10
        rp.CHG.temp<-rp.CHG+2
        rp.CHG.temp<-rp.CHG.temp[which(is.element(rp.CHG.temp, rp.CG))] 
        rp.CHG<-rp.CHG[which(!is.element(c(rp.CHG+2), rp.CG))]
        rp.CG<-c(rp.CG, c(rp.CHG.temp-2))
        rp.CG<-unique(rp.CG)
        if (length(rp.CG) > N.CG){rp.CG<-sample(rp.CG, size = N.CG, replace =F)}
        rp.CG<-sort(rp.CG)
        rp.CG<-rp.CG[which(diff(rp.CG, lag=1) >= 2)]
        
        # Rule 11
        rp.CHG<-rp.CHG[which(!is.element(rp.CHG, c(rp.CG+1)))]
        
        # Counting the lost sites
        CHH.lost<-N.CHH - length(rp.CHH)
        CHG.lost<-N.CHG - length(rp.CHG)
        CG.lost<-N.CG - length(rp.CG)
        N.lost<-CHH.lost + CHG.lost + CG.lost
        
        tolerance<-N.lost/(N.CG + N.CHG + N.CHH)
        
        cat(tolerance, "\n")
        
      }
      cat("[Done]\n")},
      timeout=10),
    TimeoutException=function(ex) {cat("[skipped due to timeout]\n")})
  
  
  cat("                         ", "\n")
  cat("- Converged", "\n")
  cat("-------------------------", "\n")
  
  sim.out<-data.frame(c(N.CG.obs, N.CHG.obs, N.CHH.obs), c(length(rp.CG), length(rp.CHG), length(rp.CHH)))
  sim.out<-rbind(sim.out, colSums(sim.out))
  rownames(sim.out)<-c("CG", "CHG", "CHH", "Total")
  sim.out$percent<-sim.out[,2]/sim.out[,1]*100
  colnames(sim.out)<-c("N.observed", "N.simulated", "Percent")
  
  print(sim.out)
  
  cat("                         ", "\n")
  
  cat("- Calling regions:", "\n")
  cat("-------------------------", "\n")
  cat("                         ", "\n")
  
  
  for (icontext in 1:length(contexts))
  {
    context.temp<-contexts[icontext]
    
    
    if (context.temp == "C")
    {
      sim.geno.out<-c(rp.CG, rp.CHG, rp.CHH)
      sim.geno.out<-sort(sim.geno.out, method="quick")
      
      obs.geno.plus<-sort(c(CG.pos.plus, CHG.pos.plus, CHH.pos.plus))
      obs.geno.minus<-sort(c(CG.pos.minus, CHG.pos.minus, CHH.pos.minus))
      
      set.seed(42)
      index1<-sample(1:c(length(sim.geno.out)-min.C), size=N.boot, replace=TRUE)
      index2<-index1 + c(min.C-1)
      dist<-sim.geno.out[index2] -sim.geno.out[index1]
      null.dist<-as.numeric(quantile(dist, probs=fp.rate))
      
      cat("Building from: C (+) strand", "\n")
      reg.obs.plus<-conReg(seq.in=obs.geno.plus, min.C=min.C, win=null.dist, chr=chr)
      cat("Building from: C (-) strand", "\n")
      reg.obs.minus<-conReg(seq.in=obs.geno.minus, min.C=min.C, win=null.dist, chr=chr)
      df <- data.frame(id=c(rep("+", nrow(reg.obs.plus)), rep("-", nrow(reg.obs.minus))), 
                       start=c(reg.obs.plus[,2], reg.obs.minus[,2]), end=c(reg.obs.plus[,3], reg.obs.minus[,3]))
      gr <- GRanges(seqnames = rep(1,nrow(df)),IRanges(start = df$start, end = df$end))
      reg.obs<-as.data.frame(reduce(gr))[,1:3]
      reg.obs[,1]<-rep(chr, nrow(reg.obs))
      reg.obs$cluster.length <- reg.obs$end - reg.obs$start
      reg.obs$region<-paste("reg", 1:nrow(reg.obs), sep="")
      colnames(reg.obs)[1]<-"chr"
      
      if ("C" %in% contexts[which(makeRegnull == TRUE)])
      {
        cat("Building: C null regions", "\n")
        reg.sim<-conReg(seq.in=sim.geno.out, min.C=min.C, win=null.dist, chr=chr)
      }
      if (!is.element("C", contexts[which(makeRegnull == TRUE)]))
      {
        cat("C NULL omitted", "\n")
      }
    }
    
    
    if (context.temp == "CG")
    {
      sim.geno.out<-rp.CG
      
      obs.geno.plus<-sort(CG.pos.plus)
      obs.geno.minus<-sort(CG.pos.minus)
      
      set.seed(42)
      index1<-sample(1:c(length(sim.geno.out)-min.C), size=N.boot, replace=TRUE)
      index2<-index1 + c(min.C-1)
      dist<-sim.geno.out[index2] - sim.geno.out[index1]
      null.dist<-as.numeric(quantile(dist, probs=fp.rate))
      
      cat("Building from: from CG (+) strand", "\n")
      reg.obs<-conReg(seq.in=obs.geno.plus, min.C=min.C, win=null.dist, chr=chr)
      reg.obs$start<-reg.obs$start - 1
      reg.obs$end<-reg.obs$end + 1
      reg.obs$cluster.length<-reg.obs$end - reg.obs$start
      
      if ("CG" %in% contexts[which(makeRegnull == TRUE)])
      {
        cat("Building: CG null regions", "\n")
        cat("", "\n")
        reg.sim<-conReg(seq.in=sim.geno.out, min.C=min.C, win=null.dist, chr=chr)
        reg.sim$start<-reg.sim$start - 1
        reg.sim$end<-reg.sim$end + 1
        reg.sim$cluster.length<-reg.sim$end - reg.sim$start
      }
      if (!is.element("CG", contexts[which(makeRegnull == TRUE)]))
      {
        cat("CG NULL omitted", "\n")
      }
    }
    
    if (context.temp == "CHG")
    {
      sim.geno.out<-rp.CHG
      obs.geno.plus<-sort(CHG.pos.plus)
      obs.geno.minus<-sort(CHG.pos.minus)
      
      set.seed(42)
      index1<-sample(1:c(length(sim.geno.out)-min.C), size=N.boot, replace=TRUE)
      index2<-index1 + c(min.C-1)
      dist<-sim.geno.out[index2] - sim.geno.out[index1]
      null.dist<-as.numeric(quantile(dist, probs=fp.rate))
      
      cat("Building from: CHG (+) strand", "\n")
      reg.obs<-conReg(seq.in=obs.geno.plus, min.C=min.C, win=null.dist, chr=chr)
      reg.obs$start<-reg.obs$start - 2
      reg.obs$end<-reg.obs$end + 2
      reg.obs$cluster.length<-reg.obs$end - reg.obs$start
      
      if ("CHG" %in% contexts[which(makeRegnull == TRUE)])
      {
        cat("Building: CHG null regions", "\n")
        reg.sim<-conReg(seq.in=sim.geno.out, min.C=min.C, win=null.dist, chr=chr)
        reg.sim$start<-reg.sim$start - 2
        reg.sim$end<-reg.sim$end + 2
        reg.sim$cluster.length<-reg.sim$end - reg.sim$start
      }
      if (!is.element("CHG", contexts[which(makeRegnull == TRUE)]))
      {
        cat("CHG NULL omitted", "\n")
      }
    }
    
    if (context.temp == "CHH")
    {
      sim.geno.out<-rp.CHH
      obs.geno.plus<-sort(CHH.pos.plus)
      obs.geno.minus<-sort(CHH.pos.minus)
      
      set.seed(42)
      index1<-sample(1:c(length(sim.geno.out)-min.C), size=N.boot, replace=TRUE)
      index2<-index1 + c(min.C-1)
      dist<-sim.geno.out[index2] - sim.geno.out[index1]
      null.dist<-as.numeric(quantile(dist, probs=fp.rate))
      
      cat("Building from: CHH (+) strand", "\n")
      reg.obs.plus<-conReg(seq.in=obs.geno.plus, min.C=min.C, win=null.dist, chr=chr)
      cat("Building from: CHH (-) strand", "\n")
      reg.obs.minus<-conReg(seq.in=obs.geno.minus, min.C=min.C, win=null.dist, chr=chr)
      df <- data.frame(id=c(rep("+", nrow(reg.obs.plus)), rep("-", nrow(reg.obs.minus))), 
                       start=c(reg.obs.plus[,2], reg.obs.minus[,2]), end=c(reg.obs.plus[,3], reg.obs.minus[,3]))
      gr <- GRanges(seqnames = rep(1,nrow(df)),IRanges(start = df$start, end = df$end))
      reg.obs<-as.data.frame(reduce(gr))[,1:3]
      reg.obs[,1]<-rep(chr, nrow(reg.obs))
      reg.obs$cluster.length <- reg.obs$end - reg.obs$start
      reg.obs$region<-paste("reg", 1:nrow(reg.obs), sep="")
      colnames(reg.obs)[1]<-"chr"
      
      
      if ("CHH" %in% contexts[which(makeRegnull == TRUE)])
      {
        cat("CHH null regions", "\n")
        reg.sim<-conReg(seq.in=sim.geno.out, min.C=min.C, win=null.dist, chr=chr)
      }
      if (!is.element("C", contexts[which(makeRegnull == TRUE)]))
      {
        cat("CHH NULL omitted", "\n")
      }
    }
    
    if (context.temp %in% contexts[which(makeRegnull == TRUE)])
    {
      output<-list(reg.obs, reg.sim, chr, context.temp)
      names(output)<-c("reg.obs", "reg.sim", "chr", "context")
      dput(output, paste(out.dir, out.name, "_regions_","chr", chr, "_", context.temp, ".Rdata", sep=""))
    }
    if (!is.element(context.temp, contexts[which(makeRegnull == TRUE)]))
    {
      reg.sim<-NA
      output<-list(reg.obs, reg.sim, chr, context.temp)
      names(output)<-c("reg.obs", "reg.sim", "chr", "context")
      dput(output, paste(out.dir, out.name, "_regions_","chr", chr, "_", context.temp, ".Rdata", sep=""))
    }
    
  } # End of context loop
  
  # Removing specific objects from global environemnt
  CleanEnvir(pattern = "reg.obs.plus")
  CleanEnvir(pattern = "reg.obs.minus")
  CleanEnvir(pattern = "obs.geno.plus")
  CleanEnvir(pattern = "obs.geno.minus")
  
  
  
  
}