R/Main.R

setMethod("simulateSV",
          signature(),

          function(output, genome, chrs, dels, ins, invs, dups, trans, size, sizeDels, sizeIns, sizeInvs, sizeDups, regionsDels, regionsIns, regionsInvs, regionsDups, regionsTrans, maxDups, percCopiedIns, percBalancedTrans, bpFlankSize, percSNPs, indelProb, maxIndelSize, repeatBias, weightsMechanisms, weightsRepeats, repeatMaskerFile, bpSeqSize, random, seed, verbose){

              
  options(stringsAsFactors=FALSE, scipen=10)
  
  if(!missing(seed)){ 
    set.seed(seed)
  }

  ## default for random is TRUE
  ## is only one value given, set it for all SVs where regions were specified (only place, where a FALSE makes sense)
  if(length(random) == 1){
    r = rep(TRUE, 5)
    r[!c(missing(regionsDels), missing(regionsIns), missing(regionsInvs), missing(regionsDups), missing(regionsTrans))] = random
    random = r
  }
  
  .validateInput(output, genome, chrs, dels, ins, invs, dups, trans, sizeDels, sizeIns, sizeInvs, sizeDups, regionsDels, regionsIns, regionsInvs, regionsDups, regionsTrans, maxDups, percCopiedIns, percBalancedTrans, percSNPs, indelProb, weightsMechanisms, weightsRepeats, bpSeqSize, random, seed)

  ## see if genome is given
  ## if not, read the hg19 by default
  if(missing(genome)){
    if(verbose==TRUE) message("Loading human genome (hg19)")
    if(missing(chrs)){
      genome = .getHG19(NA)
    }else{
      genome = .getHG19(chrs)
    }
    gaps = genome[[2]]
    genome = genome[[1]]
    chrs = names(genome)
    genomeOrganism = "hg19"
  ## if genome is given, there are two possibilities:
  ## 1. genome is a filename pointing to a FASTA-file
  ## 2. genome is a named DNAStringSet
  }else{
    if(class(genome) == "character"){
      if(verbose==TRUE) message("Loading genome from FASTA file: ", genome)
      genome = readDNAStringSet(filepath=genome)  ## read genome from FASTA-file
    }
    if(missing(chrs)){
      chrs = names(genome)
    }else{
		genome = genome[match(chrs, names(genome))]
	}
    ## compute gaps within the given genome (N-sequences)
    gaps = GRanges()
    for(chr in chrs){
      Ns = whichAsIRanges(as.integer(genome[[chr]]) == as.integer(DNAString("N")))
      if(length(Ns) > 0){
        gaps = c(gaps, GRanges(Ns, seqnames=chr))
      }
    }
    genomeOrganism = "unknown"
  }
  gaps = as.data.frame(gaps)[, 1:3]
  genomeCoords = GRanges(IRanges(start=1, end=width(genome)), seqnames=names(genome))

  ## for the default case, i.e. simulation on the hg19:
  ## set the weights for the sv formation mechanisms and their occurence within certain repeat regions
  ## default values for SV formation mechanisms were derived from:
  ## deletions, insertions/duplications: Mills et al., Mapping copy number variation by population-scale genome sequencing
  ## inversions: Pang et al., Mechanisms of Formation of Structural Variation in a Fully Sequenced Human Genome
  ## translocations: Ou et al., Observation and prediction of recurrent human translocations mediated by NAHR between nonhomologous chromosomes & Chen et al., Mapping translocation breakpoints by next-generation sequencing
  
  ## default values for repeat regions enriched for SV formation mechanisms were derived from:
  ## Lam et al., Nucleotide-resolution analysis of structural variants using BreakSeq and a breakpoint library (Supplemental Table 5)

  ## in this order: NAHR, NHR, TEI, VNTR, other
  mechanisms = c("NAHR","NHR","TEI","VNTR","Other")
  bpTypes = c("L1","L2","Alu","MIR","SD","TR","Random")

  ## for simulation on hg19 set the weights for the repeat elements (if not given by the user)
  ## for simulation on any other organism (user specified genome), set the weights to zero except for random simulation (i.e. turn this feature off)
  if(missing(weightsMechanisms)){
	if(genomeOrganism == "hg19" & repeatBias){
      data("weightsMechanisms", package="RSVSim", envir=environment())
    }else{
      weightsMechanisms = data.frame(
        dels = c(0,0,0,0,1),
        ins = c(0,0,0,0,1),
        invs = c(0,0,0,0,1),
        dups = c(0,0,0,0,1),
        trans = c(0,0,0,0,1)
        )
      rownames(weightsMechanisms) = mechanisms
    }
  }  
  if(missing(weightsRepeats)){
    if(genomeOrganism == "hg19" & repeatBias){
      data("weightsRepeats", package="RSVSim", envir=environment())
    }else{
      weightsRepeats = data.frame(
        NAHR = c(0,0,0,0,0,0,0),
        NHR = c(0,0,0,0,0,0,0),
        TEI = c(0,0,0,0,0,0,0),
        VNTR = c(0,0,0,0,0,0,0),
        Other = c(0,0,0,0,0,0,1)
        )
      rownames(weightsRepeats) = bpTypes
    }
  }

  ## several options to get the repeatmasker regions:
  ## 1. bp shall be placed randomly (no bpWeights)
  ## then, only use all the genomic coordinates without special weighting for repeats
  if(genomeOrganism != "hg19" | !repeatBias | all(random) == FALSE){
    if(genomeOrganism == "hg19"){
      if(verbose==TRUE) message("Bias for hg19 repeat regions is turned OFF")
    }else{
      if(verbose==TRUE) message("Breakpoints will be distributed uniformly across the genome")
    }
    bpRegions = vector(mode="list", length=1)
    names(bpRegions) = "Random"
  }else{
    if(verbose==TRUE) message("Bias for hg19 repeat regions is turned ON")
    ## 2. repeatMasker regions were loaded and saved to disk once befoe in the data directory of the package
    ## then just load the data
    if(file.exists(file.path(path.package("RSVSim"), "data", "repeats_hg19.RData"))){
      if(verbose==TRUE) message("Loading hg19 repeat regions")
      data("repeats_hg19", package="RSVSim", envir=environment())  ## loads object named "repeats"
    }else{
      ## 3. filename of repeatmasker file is given (file downloaded from http://www.repeatmasker.org/genomes/hg19/RepeatMasker-rm330-db20120124/hg19.fa.out.gz)
      ## then, read the file, extract LINES,SINES and read segmental duplications from UCSC
      if(!missing(repeatMaskerFile)){
        if(verbose==TRUE) message("Loading hg19 repeat regions for the first time from given RepeatMasker output file")
        repeats = .readRepeatMaskerOutput(repeatMaskerFile)
        if(file.exists(file.path(path.package("RSVSim"), "data", "repeats_hg19.RData"))){
         if(verbose==TRUE)  message("Repeat regions were saved to ", file.path(path.package("RSVSim"), "data", "repeats_hg19.RData"), " for faster access next time")
        }else{
          warning("Saving of repeat regions to ", file.path(path.package("RSVSim"), "data", "repeats_hg19.RData"), " failed")
        }
      }
      ## 4. no repeatMasker file is given
      ## then, read the repeatMasker track and the segmental duplications from UCSC via rTracklayer package
      else{
        if(verbose==TRUE) message("Loading hg19 repeat regions for the first time from the UCSC Browser's RepeatMasker track (this may take up to 45 minutes)")
        repeats = .loadFromUCSC_RepeatMasks(save=TRUE, verbose=verbose)  
        if(file.exists(file.path(path.package("RSVSim"), "data", "repeats_hg19.RData"))){
         if(verbose==TRUE)  message("Repeat regions were saved to ", file.path(path.package("RSVSim"), "data", "repeats_hg19.RData"), " for faster access next time")
        }else{
          warning("Saving of repeat regions to ", file.path(path.package("RSVSim"), "data", "repeats_hg19.RData"), " failed")
        }
      }
    }
    repeats = lapply(repeats, function(r) return(r[seqnames(r) %in% chrs]))
    repeats = lapply(repeats, function(r) return(reduce(r, min.gapwidth=50)))  # merge repeats which are only 100bp away from each other
    bpRegions = vector(mode="list", length=length(repeats)+1)
    names(bpRegions) = bpTypes
    bpRegions[1:length(repeats)] = repeats
    rm(repeats)
  }

  ## put SVs anywhere in the genome if no regions were specified
  ## if regions were given, take care, that the regions lie within the available chromosomes

  if(missing(regionsDels)){
    regionsDels = genomeCoords
  }else{
    regionsDels = regionsDels[seqnames(regionsDels) %in% intersect(chrs, as.character(seqnames((regionsDels))))]
    if(length(regionsDels) == 0){
      stop("No regions on given chromosomes")
    }
    ## for non-random distribution, set number of deletions to number of given regions
    if(random[1] == FALSE){
      dels = length(regionsDels)
    }
  }
  if(missing(regionsIns)){
    regionsIns = genomeCoords
  }else{
    if(random[2] == TRUE){
      regionsIns = regionsIns[seqnames(regionsIns) %in% intersect(chrs, as.character(seqnames((regionsIns))))]
    }else{
      regionsIns = regionsIns[seqnames(regionsIns) %in% intersect(chrs, as.character(seqnames((regionsIns)))) & regionsIns$chrB %in% chrs]
      ins = length(regionsIns)
    }
    if(length(regionsIns) == 0){
      stop("No regions on given chromosomes")
    }
  }
  if(missing(regionsInvs)){
    regionsInvs = genomeCoords
  }else{
    regionsInvs = regionsInvs[seqnames(regionsInvs) %in% intersect(chrs, as.character(seqnames((regionsInvs))))]
    if(length(regionsInvs) == 0){
      stop("No regions on given chromosomes")
    }
    ## for non-random distribution, set number of inversions to number of given regions
    if(random[3] == FALSE){
      invs = length(regionsInvs)
    }
  }
  if(missing(regionsDups)){
    regionsDups = genomeCoords
    dupTimes = sample(1:(maxDups-1), dups, replace=TRUE) + 1
  }else{
    regionsDups = regionsDups[seqnames(regionsDups) %in% intersect(chrs, as.character(seqnames((regionsDups))))]
    if(length(regionsDups) == 0){
      stop("No regions on given chromosomes")
    }
    ## for non-random distribution, set number of duplications to number of given regions
    dupTimes = sample(1:(maxDups-1), length(regionsDups), replace=TRUE) + 1
    if(random[4] == FALSE){
      dups = length(regionsDups)
      if(ncol(mcols(regionsDups)) > 0){
		  if("times" %in% colnames(mcols(regionsDups))){
			dupTimes = mcols(regionsDups)$times
		  }
	  }
    }
  }
  if(missing(regionsTrans)){
    regionsTrans = genomeCoords
  }else{
    if(random[5] == TRUE){
      regionsTrans = regionsTrans[seqnames(regionsTrans) %in% intersect(chrs, as.character(seqnames((regionsTrans))))]
    }else{
      regionsTrans = regionsTrans[seqnames(regionsTrans) %in% intersect(chrs, as.character(seqnames((regionsTrans)))) & regionsTrans$chrB %in% chrs]
      trans = length(regionsTrans)
    }
    if(length(regionsTrans) == 0){
      stop("No regions on given chromosomes")
    }
  }

  ## restrict SV regions to given chromosomes

  ## inversion and deletion sizes
  if((length(sizeDels) > 1 & length(sizeDels) != dels) | (length(sizeIns) > 1 & length(sizeIns) != ins) | (length(sizeInvs) > 1 & length(sizeInvs) != invs) | (length(sizeDups) > 1 & length(sizeDups) != dups)){
    warning("Length of vectors with SV sizes vary from the number of SVs")
  }
  if(!missing(size)){
    sizeDels = sizeIns = sizeInvs = sizeDups = size
  }
  
  sizeDels = rep(sizeDels, dels)[1:dels]
  sizeIns = rep(sizeIns, ins)[1:ins]
  sizeInvs = rep(sizeInvs, invs)[1:invs]
  sizeDups = rep(sizeDups, dups)[1:dups]
  
  ## bpSeqSize will be used two times, in upstream and downstream direction; so divide it hear by two
  bpSeqSize = round(bpSeqSize / 2)

  ## Simulation ############################################################

  ## 1. simulate regions for translocations:
  type = "translocation"
  translocations = posTrans_1 = posTrans_2 = NULL
  if(random[5] == TRUE){
    ## 1.1 for chromosome terminals
    if(trans > 0){
      if(verbose==TRUE) message("Calculating coordinates: ", trans, " translocations")
    
      subtrahend = gaps[, c("seqnames","start","end")]
      subtrahend = GRanges(IRanges(subtrahend$start, subtrahend$end), seqnames=subtrahend$seqnames)
      regionsTrans  = .subtractIntervals(regionsTrans, subtrahend)
      bpRegions[["Random"]] = regionsTrans
      t = .simTranslocationPositions(trans, bpRegions, weightsMechanisms[, "trans", drop=FALSE], weightsRepeats, genome, percBalancedTrans, c(), verbose)
      translocations = t[[1]]
      posTrans_1 = t[[2]]
      posTrans_2 = t[[3]]
    }   
  }else{
    if(verbose==TRUE) message("Using given coordinates: Translocations")    
    trans = length(regionsTrans)
    posTrans_1 = as.data.frame(regionsTrans)
    posTrans_1$names = rownames(posTrans_1)
    posTrans_2 = as.data.frame(regionsTrans)[, c("chrB","startB","endB")]
    colnames(posTrans_2) = c("seqnames","start","end")
    
    ## Add balanced information (default:TRUE) if not given
    if(!("balanced" %in% colnames(posTrans_1)) | !("balanced" %in% colnames(posTrans_2))){
      posTrans_1$balanced = posTrans_2$balanced = TRUE
    }
    ## Determine wich translocations need to be inverted (segments on different ends of chromosomes)
    idx = which((posTrans_1$start == 1 & posTrans_2$start != 1) | (posTrans_1$start != 1 & posTrans_2$start == 1))
    posTrans_1$inverted = posTrans_2$inverted = FALSE
    posTrans_1$inverted[idx] = posTrans_2$inverted[idx] = TRUE

    size1 = posTrans_1$end-posTrans_1$start+1
    size2 = posTrans_2$end-posTrans_2$start+1
    translocations = cbind(posTrans_1$names, posTrans_1[, c("seqnames","start","end")], size1, posTrans_2[, c("seqnames","start","end")], size2, posTrans_2$balanced)
    colnames(translocations) = c("Name", "ChrA", "StartA", "EndA", "SizeA", "ChrB", "StartB", "EndB", "SizeB","Balanced")
  }

  ## 2. simulate insertions
  type = "insertion"
  insertions = posIns_1 = posIns_2 = NULL
  if(random[2] == TRUE){
    if(ins > 0){
      if(verbose==TRUE) message("Calculating coordinates: ", ins, " insertions")  
      
      subtrahend = rbind(gaps[, c("seqnames","start","end")], posTrans_1[, c("seqnames","start","end")], posTrans_2[, c("seqnames","start","end")])
      subtrahend = GRanges(IRanges(subtrahend$start, subtrahend$end), seqnames=subtrahend$seqnames)
      regionsIns  = .subtractIntervals(regionsIns, subtrahend)
      bpRegions[["Random"]] = regionsIns
      i = .simInsertionPositions(ins, bpRegions, weightsMechanisms[, "ins", drop=FALSE], weightsRepeats, genome, sizeIns, percCopiedIns, verbose)
      insertions = i[[1]]
      posIns_1 = i[[2]]
      posIns_2 = i[[3]]
    }
  }else{
    if(verbose==TRUE) message("Using given coordinates: Insertions")    
    ins = length(regionsIns)
    posIns_1 = as.data.frame(regionsIns)
    size = posIns_1$end-posIns_1$start+1
    posIns_1$names = rownames(posIns_1)
    posIns_2 = as.data.frame(regionsIns)[, c("chrB","startB")]
    posIns_2$endB = posIns_2$startB + size - 1
    colnames(posIns_2) = c("seqnames","start","end")
    ## Add duplicate information (default:FALSE) if not given
    if(!("copied" %in% colnames(posIns_1))){
      posIns_1$copied = posIns_2$copied = FALSE
    }
 
    insertions = cbind(posIns_1$names, posIns_1[rownames(posIns_1), c("seqnames","start","end")], posIns_2[rownames(posIns_1), c("seqnames","start","end")], size, posIns_1$copied)
    colnames(insertions) = c("Name", "ChrA", "StartA", "EndA", "ChrB", "StartB", "EndB", "Size", "Copied")
  }

  ## 3. simulate deletions
  type = "deletion"
  deletions = posDel = NULL
  if(random[1] == TRUE){
    if(dels > 0){
      if(verbose==TRUE) message("Calculating coordinates: ", dels, " deletions")
      
      subtrahend = rbind(gaps[, c("seqnames","start","end")], posIns_1[, c("seqnames","start","end")], posIns_2[, c("seqnames","start","end")], posTrans_1[, c("seqnames","start","end")], posTrans_2[, c("seqnames","start","end")])
      subtrahend = GRanges(IRanges(subtrahend$start, subtrahend$end), seqnames=subtrahend$seqnames)
      regionsDels = .subtractIntervals(regionsDels, subtrahend)
      bpRegions[["Random"]] = regionsDels
      d = .simPositions(dels, bpRegions, weightsMechanisms[, "dels", drop=FALSE], weightsRepeats, sizeDels, "deletion", verbose)
      deletions = d[[1]]
      posDel = d[[2]]
    }
  }else{
    if(verbose==TRUE) message("Using given coordinates: Deletions")
    dels = length(regionsDels)
    posDel = as.data.frame(regionsDels)
    posDel$names = rownames(posDel)
    size=posDel$end-posDel$start+1
    deletions = cbind(posDel$names, posDel[, c("seqnames","start","end")], size)
    colnames(deletions) = c("Name", "Chr", "Start","End","Size")
  }
  
  ## 4. simulate inversions
  type = "inversion"
  inversions = posInv = NULL
  if(random[3] == TRUE){
    if(invs > 0){
      if(verbose==TRUE) message("Calculating coordinates: ", invs, " inversions")
      
      subtrahend = rbind(gaps[, c("seqnames","start","end")], posDel[, c("seqnames","start","end")], posIns_1[, c("seqnames","start","end")], posIns_2[, c("seqnames","start","end")], posTrans_1[, c("seqnames","start","end")], posTrans_2[, c("seqnames","start","end")])
      subtrahend = GRanges(IRanges(subtrahend$start, subtrahend$end), seqnames=subtrahend$seqnames)
      regionsInvs  = .subtractIntervals(regionsInvs, subtrahend)
      bpRegions[["Random"]] = regionsInvs
      i = .simPositions(invs, bpRegions, weightsMechanisms[, "invs", drop=FALSE], weightsRepeats, sizeInvs, type, verbose)
      inversions = i[[1]]
      posInv = i[[2]]
    }
  }else{
    if(verbose==TRUE) message("Using given coordinates: Inversions")
    invs = length(regionsInvs)
    posInv = as.data.frame(regionsInvs)
    posInv$names = rownames(posInv)
    size=posInv$end-posInv$start+1
    inversions = cbind(posInv$names, posInv[, c("seqnames","start","end")], size)
    colnames(inversions) = c("Name","Chr", "Start","End", "Size")
  }
  
  ## 5. simulate tandem duplications
  type = "tandemDuplication"
  tandemDups = posDup = NULL
  if(random[4] == TRUE){
    if(dups > 0){
      if(verbose==TRUE) message("Calculating coordinates: ", dups, " tandem duplications")
      
      subtrahend = rbind(gaps[, c("seqnames","start","end")], posDel[, c("seqnames","start","end")], posIns_1[, c("seqnames","start","end")], posIns_2[, c("seqnames","start","end")], posInv[, c("seqnames","start","end")], posTrans_1[, c("seqnames","start","end")], posTrans_2[, c("seqnames","start","end")])
      subtrahend = GRanges(IRanges(subtrahend$start, subtrahend$end), seqnames=subtrahend$seqnames)
      regionsDups  = .subtractIntervals(regionsDups, subtrahend)
      bpRegions[["Random"]] = regionsDups
      td = .simPositions(dups, bpRegions, weightsMechanisms[, "dups", drop=FALSE], weightsRepeats, sizeDups, type, verbose)
      tandemDups = td[[1]]
      posDup = td[[2]]
    }
  }else{
    if(verbose==TRUE) message("Using given coordinates: Tandem Duplications")
    dups = length(regionsDups)
    posDup = as.data.frame(regionsDups)
    posDup$names = rownames(posDup)
    size=posDup$end-posDup$start+1
    tandemDups = cbind(posDup$names, posDup[, c("seqnames","start","end")], size)
    colnames(tandemDups) = c("Name","Chr", "Start","End", "Size")
  }
  
  ## Execution: implement regions into genome sequence ############################################################
    
  ## 1. inversions
  if(invs > 0){
    if(verbose==TRUE) message("Rearranging genome: Inversions")
    if(verbose==TRUE) pb = txtProgressBar(min = 0, max = invs, style = 3)
    for(i in 1:invs){

      chr = as.character(posInv$seqnames[i])
      rearrangement = .execInversion(genome[[chr]], chr, posDel, posIns_1, posIns_2, posInv, posDup, posTrans_1, posTrans_2, bpSeqSize, i, bpFlankSize, percSNPs, indelProb, maxIndelSize)
      genome[[chr]] = rearrangement[[1]]
      posDel = rearrangement[[2]]
      posIns_1 = rearrangement[[3]]
      posIns_2 = rearrangement[[4]]
      posInv = rearrangement[[5]]
      posDup = rearrangement[[6]]
      posTrans_1 = rearrangement[[7]]
      posTrans_2 = rearrangement[[8]]
      
      if(verbose==TRUE) setTxtProgressBar(pb, i)
      
    }
    if(verbose==TRUE) close(pb)
  }

  
  ## 2. translocations
  if(trans > 0){
    if(verbose==TRUE) message("Rearranging genome: Translocations")
    if(verbose==TRUE) pb = txtProgressBar(min = 0, max = trans, style = 3)
    for(i in 1:trans){
      chr1 = as.character(posTrans_1$seqnames[i])
      chr2 = as.character(posTrans_2$seqnames[i])
      rearrangement = .execTranslocation(genome[[chr1]], genome[[chr2]], chr1, chr2, posDel, posIns_1, posIns_2, posInv, posDup, posTrans_1, posTrans_2, i, bpSeqSize, bpFlankSize, percSNPs, indelProb, maxIndelSize)
      genome[[chr1]] = rearrangement[[1]]
      genome[[chr2]] = rearrangement[[2]]
      posDel = rearrangement[[3]]
      posIns_1 = rearrangement[[4]]
      posIns_2 = rearrangement[[5]]
      posInv = rearrangement[[6]]
      posDup = rearrangement[[7]]
      posTrans_1 = rearrangement[[8]]
      posTrans_2 = rearrangement[[9]]
     
      if(verbose==TRUE) setTxtProgressBar(pb, i)
      
    }
    if(verbose==TRUE) close(pb)
  }

  
  ## 3. insertions
  if(ins > 0){
    if(verbose==TRUE) message("Rearranging genome: Insertions")
    if(verbose==TRUE) pb = txtProgressBar(min = 0, max = ins, style = 3)
    for(i in 1:ins){
      chr1 = as.character(posIns_1$seqnames[i])
      chr2 = as.character(posIns_2$seqnames[i])      
      rearrangement = .execInsertion(genome[[chr1]], genome[[chr2]], chr1, chr2, posDel, posIns_1, posIns_2, posInv, posDup, posTrans_1, posTrans_2, i, bpSeqSize, bpFlankSize, percSNPs, indelProb, maxIndelSize)
      genome[[chr1]] = rearrangement[[1]]
      genome[[chr2]] = rearrangement[[2]]
      posDel = rearrangement[[3]]
      posIns_1 = rearrangement[[4]]
      posIns_2 = rearrangement[[5]]
      posInv = rearrangement[[6]]
      posDup = rearrangement[[7]]
      posTrans_1 = rearrangement[[8]]
      posTrans_2 = rearrangement[[9]]
     
      if(verbose==TRUE) setTxtProgressBar(pb, i)
      
    }
    if(verbose==TRUE) close(pb)
  }
  
  ## 4. tandem duplications
  if(dups > 0){
    if(verbose==TRUE) message("Rearranging genome: Tandem duplications")
    ## add a column for the number of duplications
    tandemDups = cbind(tandemDups[, c("Name", "Chr", "Start","End", "Size")], 0, "")
    names(tandemDups) = c("Name", "Chr", "Start","End", "Size", "Duplications", "BpSeq")
    if(verbose==TRUE) pb = txtProgressBar(min = 0, max = dups, style = 3)
    for(i in 1:dups){
#      times = sample(2:maxDups,1)  ## how many times the sequence is duplicated
 	  times = dupTimes[i]
      chr = as.character(posDup$seqnames[i])
      rearrangement = .execTandemDuplication(genome[[chr]], chr, posDel, posIns_1, posIns_2, posInv, posDup, posTrans_1, posTrans_2, bpSeqSize, times, i, bpFlankSize, percSNPs, indelProb, maxIndelSize)
      genome[[chr]] = rearrangement[[1]]
      posDel = rearrangement[[2]]
      posIns_1 = rearrangement[[3]]
      posIns_2 = rearrangement[[4]]
      posInv = rearrangement[[5]]
      posDup = rearrangement[[6]]
      posTrans_1 = rearrangement[[7]]
      posTrans_2 = rearrangement[[8]]      
      tandemDups$BpSeq[i] = rearrangement[[9]]
      tandemDups$Duplications[i] = times
     
      if(verbose==TRUE) setTxtProgressBar(pb, i)
      
    }
    if(verbose==TRUE) close(pb)
  }

  ## 5. deletions
  if(dels > 0){
    if(verbose==TRUE) message("Rearranging genome: Deletions")
    if(verbose==TRUE) pb = txtProgressBar(min = 0, max = dels, style = 3)
    for(i in 1:dels){
      chr = as.character(posDel$seqnames[i])
      rearrangement = .execDeletion(genome[[chr]], chr, posDel, posIns_1, posIns_2, posInv, posDup, posTrans_1, posTrans_2, bpSeqSize, i, bpFlankSize, percSNPs, indelProb, maxIndelSize)
      genome[[chr]] = rearrangement[[1]]
      posDel = rearrangement[[2]]
      posIns_1 = rearrangement[[3]]
      posIns_2 = rearrangement[[4]]
      posInv = rearrangement[[5]]
      posDup = rearrangement[[6]]
      posTrans_1 = rearrangement[[7]]
      posTrans_2 = rearrangement[[8]]
     
      if(verbose==TRUE) setTxtProgressBar(pb, i)
      
    }
    if(verbose==TRUE) close(pb)
  }

  ## Retrieve breakpoint sequences for translocations, deletions and inversions (for duplications, it works during execution)
  if(invs > 0){
    inversions$BpSeq_5prime = inversions$BpSeq_3prime = ""
    for(i in 1:invs){
      bpSeq = .getBpSeq(genome, posInv, bpSeqSize, i)
      inversions$BpSeq_5prime[i] = bpSeq[1]
      inversions$BpSeq_3prime[i] = bpSeq[2]
    }
  }

  if(dels > 0){
    deletions$BpSeq = ""
    for(i in 1:dels){
      pos = posDel
      pos$end = pos$start
      bpSeq = .getBpSeq(genome, pos, bpSeqSize, i)
      deletions$BpSeq[i] = bpSeq[1]
    }
  }
  
  if(trans > 0){
    translocations$BpSeqB = translocations$BpSeqA = ""
    for(i in 1:trans){
      if(posTrans_1$balanced[i] == TRUE){
        bpSeq = .getBpSeq(genome, posTrans_1, bpSeqSize, i)
        translocations$BpSeqA[i] = bpSeq[which(bpSeq != "")][1]
      }
      bpSeq = .getBpSeq(genome, posTrans_2, bpSeqSize, i)
      translocations$BpSeqB[i] = bpSeq[which(bpSeq != "")][1]
    }
  }
  if(ins > 0){
    insertions$BpSeqB_3prime = insertions$BpSeqB_5prime = insertions$BpSeqA = ""
    for(i in 1:ins){
      if(posIns_1$copied[i] == FALSE){
        bpSeq = .getBpSeq(genome, posIns_1, bpSeqSize, i)
        insertions$BpSeqA[i] = bpSeq[1]
      }
      bpSeq = .getBpSeq(genome, posIns_2, bpSeqSize, i)
      insertions$BpSeqB_5prime[i] = bpSeq[1]
      insertions$BpSeqB_3prime[i] = bpSeq[2]
    }
  }

  ## Output ############################################################
  if(!is.na(output)){
    if(output == "."){
      if(verbose==TRUE) message("Writing output to current directory")
    }else{
      if(verbose==TRUE) message("Writing output to ", output)
    }
    
    if(dels > 0){
      write.table(deletions, file.path(output, "deletions.csv"), col.names=TRUE, row.names=FALSE, sep="\t", quote=FALSE)
    }
    if(ins > 0){
      write.table(insertions, file.path(output, "insertions.csv"), col.names=TRUE, row.names=FALSE, sep="\t", quote=FALSE)
    }
    if(invs > 0){
      write.table(inversions, file.path(output, "inversions.csv"), col.names=TRUE, row.names=FALSE, sep="\t", quote=FALSE)
    }
    if(dups > 0){
      write.table(tandemDups, file.path(output, "tandemDuplications.csv"), col.names=TRUE, row.names=FALSE, sep="\t", quote=FALSE)
    }
    if(trans > 0){
      write.table(translocations, file.path(output, "translocations.csv"), col.names=TRUE, row.names=FALSE, sep="\t", quote=FALSE)
    }
    
    writeXStringSet(genome, file.path(output, "genome_rearranged.fasta"), append=FALSE, format="fasta")
  }
  
  idx = c(dels>0, ins>0, invs>0, dups>0, trans>0)

  ##  names(svs) = c("deletions", "insertions", "inversions","tandemDuplications","translocations")[idx]
  metadata(genome) = list(deletions=deletions, insertions=insertions, inversions=inversions, tandemDuplications=tandemDups, translocations=translocations)[idx]
  
  return(genome)
  
})


.validateInput <- function(output, genome, chrs, dels, ins, invs, dups, trans, sizeDels, sizeIns, sizeInvs, sizeDups, regionsDels, regionsIns, regionsInvs, regionsDups, regionsTrans, maxDups, percCopiedIns, percBalancedTrans, percSNPs, indelProb, weightsMechanisms, weightsRepeats, bpSeqSize, random, seed){

  ## output
  if(!is.na(output)){
    if(!file.exists(output)){
      stop("Output directory does not exist")
    }
  }
  ## genome and chrs
  if(!missing(genome)){
    if(class(genome) == "DNAStringSet"){
      if(is.null(names(genome))){
        stop("Please enter chromosome names in your genome DNAStringSet")
      }
      if(!missing(chrs)){
        if(!all(chrs %in% names(genome))){
          stop("Invalid argument: Specified chromosomes and chromosome names in the genome do not match")
        }
      }
    }
    if(class(genome) == "BSgenome"){
      stop("Please extract the desired sequences from the BSgenome package and combine them into a named DNAStringSet")
    }
  }

  ## Number and size of SVs
  if(any(c(dels, ins, invs, dups, trans, sizeDels, sizeIns, sizeInvs, sizeDups) < 0)){
    stop("Invalid argument: Number of SVs and their size cannot be smaller than zero. Makes sense, doesn't it?")
  }
  
  ## percBalancedTrans, percCopiedIns, percSNPs, indelProb
  if(percBalancedTrans < 0 | percBalancedTrans > 1 | percCopiedIns < 0 | percCopiedIns > 1 | percSNPs < 0 | percSNPs > 1 | indelProb < 0 | indelProb > 1){
    stop("Invalid argument: Percentages have to be given as value between 0 and 1.")
  }

  ## random and regions
  if(length(random) != 1 & length(random) != 5){
    stop("Invalid argument: Give either one value for argument random for all SVs or five values for each SV in this order: deletions, insertions, inversions, tandem duplications, translocations")
  }
  if(random[1] == FALSE & missing(regionsDels)){
    stop("Missing argument: Please specifiy the regions for deletions")
  }
  if(random[2] == FALSE & missing(regionsIns)){
    stop("Missing argument: Please specifiy the regions for insertions")
  }
  if(random[3] == FALSE & missing(regionsInvs)){
    stop("Missing argument: Please specifiy the regions for inversions")
  }
  if(random[4] == FALSE & missing(regionsDups)){
    stop("Missing argument: Please specifiy the regions for tandem duplications")
  }
  if(random[5] == FALSE & missing(regionsTrans)){
    stop("Missing argument: Please specifiy the regions for translocations")
  }
  
  ## regions
  if(!missing(regionsIns) & random[2] == FALSE){
    if(!all(c("chrB","startB") %in% colnames(mcols(regionsIns)))){
      stop("Invalid argument: regionsIns is missing columns chrB and startB")
    }
  }
  if(!missing(regionsTrans) & random[5] == FALSE){
    if(!all(c("chrB","startB","endB") %in% colnames(mcols(regionsTrans)))){
      stop("Invalid argument: regionsTrans is missing columns chrB, startB and endB")
    }
  }

  ## repeat weights
  mechanisms = c("NAHR","NHR","TEI","VNTR","Other")
  bpTypes = c("L1","L2","Alu","MIR","SD","TR","Random")
  svTypes = c("dels", "invs", "ins", "dups", "trans")
  if(!missing(weightsMechanisms)){
    if(is.data.frame(weightsMechanisms)){
      if(!all(rownames(weightsMechanisms) %in% mechanisms) | !all(colnames(weightsMechanisms) %in% svTypes)){
        stop("Invalid argument: Please make sure that the row names of parameter weightsMechanisms equal \"NAHR\",\"NHR\",\"TEI\",\"VNTR\" and the column names equal \"Other\"\"dels\", \"invs\", \"ins\", \"dups\", \"trans\".")
      }
    }
  }
  if(!missing(weightsRepeats)){
    if(is.data.frame(weightsRepeats)){
        if(!all(rownames(weightsRepeats) %in% bpTypes) | !all(colnames(weightsRepeats) %in% mechanisms)){
          stop("Invalid argument: Please make sure that the row names of parameter weightsRepeats equal \"L1\",\"L2\",\"Alu\",\"MIR\",\"SD\",\"TR\",\"Random\" and the column names equal \"NAHR\",\"NHR\",\"TEI\",\"VNTR\",\"Other\".")
        }
    }    
  }

}


## Deprecated: Transposons (maybe useful for future release)
#
#    if(trans[3] > 0){
#      ## 1.3 for intra-chromosomal translocations
#    
#      message("Calculating coordinates: ", trans[3], " intra-chromosomal translocations")  
#      
#      intrachrom=TRUE
#      subtrahend = RangedData(rbind(gaps[, c("space","start","end")], posTrans_1[, c("space","start","end")], posTrans_2[, c("space","start","end")]))
#      regionsTrans  = .subtractIntervals(regionsTrans, subtrahend)
#      t = .simTranslocationPositions(trans[3], regionsTrans, genome, percInvertedTrans, percBalancedTrans, sizeTrans2, "intrachrom")
#      translocations = rbind(translocations, t[[1]])
#      posTrans_1 = rbind(posTrans_1, t[[2]])
#      posTrans_2 = rbind(posTrans_2, t[[3]])
#    }


## Maybe add gene annotation in future release (if it makes sense)
#    require(biomaRt)
#    ensembl=useMart("ensembl")
#    dataset="hsapiens_gene_ensembl"
#    ensembl=useDataset(dataset, mart=ensembl)
#    bmAttributes = c(
#      "hgnc_symbol",
#      "chromosome_name",  
#      "start_position",
#      "end_position"
#      )
#    bmFilter=c("chromosomal_region")
#    bmValues = as.list(paste(substr(d$Chr, 4, nchar(d$Chr)), d$Start, d$End, sep=":"))
#    genes = getBM(attributes=bmAttributes, filter=bmFilter, values=bmValues, mart=ensembl)
#    d$Genes = ""
#    for(i in 1:nrow(d)){
#      gene_overlap = subset(genes, (chromosome_name == substr(d$Chr[i], 4, nchar(d$Chr))) & (IRanges(start_position, end_position) %in% IRanges(d$Start[i], d$End[i])) & hgnc_symbol != "")
#      d$Genes[i] = paste(unique(gene_overlap$hgnc_symbol), collapse=",")
#    }
Budheimer/RSVSim documentation built on May 30, 2019, 1:26 p.m.