data-raw/insilico_sv.R

# This script will generate 1,000,000 random SVs from the mappable genome

library(BSgenome.Hsapiens.UCSC.hg19)

# Reading in the mappability track from UCSC and subsetting it to only include uniquely mappable loci
map.gr <- rtracklayer::import(file.path("/dcl01/scharpf1/data/dbruhm/delfi_followup/split-reads/pcawg_analysis/data/raw_data",
                                        "wgEncodeCrgMapabilityAlign100mer.bigWig"),
                              format = "BigWig")
map.gr <- map.gr[map.gr$score == 1]

# Computing the length of the mappable regions in each chromosome and sampling chromosomes
# for SV breakpoints with probability proportional to each chromosome's contribution to the mappable genome
chrs <- paste0("chr", c(seq(1, 22, 1), "X", "Y"))
chr_lens <- seqlengths(BSgenome.Hsapiens.UCSC.hg19)[1:24]
mappable_chr_lens <- sapply(chrs, function(t) sum(width(map.gr[seqnames(map.gr) == t])))

bp1_chr <- sample(names(mappable_chr_lens), size = 1000000, prob = mappable_chr_lens/sum(mappable_chr_lens), replace = TRUE)
bp2_chr <- sample(names(mappable_chr_lens), size = 1000000, prob = mappable_chr_lens/sum(mappable_chr_lens), replace = TRUE)

bp1_chr_ct <- table(bp1_chr)
bp2_chr_ct <- table(bp2_chr)

bp1_pos <- NULL
for (i in 1:length(bp1_chr_ct)) {
  sample_space <- 1:chr_lens[names(chr_lens) == names(bp1_chr_ct)[i]]
  sample_size <- as.numeric(bp1_chr_ct[i])
  tmp.bp1_pos <- sample(sample_space, size = sample_size * 10, replace = TRUE)
  # Only keeping sampled breakpoints that overlap a mappable region of the genome
  tmp.bp1_gr <- GRanges(names(bp1_chr_ct)[i], ranges = tmp.bp1_pos)
  tmp.bp1.mappable_gr <- subsetByOverlaps(tmp.bp1_gr, map.gr)
  stopifnot(length(tmp.bp1.mappable_gr) >= sample_size)

  bp1_pos <- c(bp1_pos, start(tmp.bp1.mappable_gr)[1:sample_size])
}

bp2_pos <- NULL
for (i in 1:length(bp2_chr_ct)) {
  sample_space <- 1:chr_lens[names(chr_lens) == names(bp2_chr_ct)[i]]
  sample_size <- as.numeric(bp2_chr_ct[i])
  tmp.bp2_pos <- sample(sample_space, size = sample_size * 10, replace = TRUE)
  # Only keeping sampled breakpoints that overlap a mappable region of the genome
  tmp.bp2_gr <- GRanges(names(bp2_chr_ct)[i], ranges = tmp.bp2_pos)
  tmp.bp2.mappable_gr <- subsetByOverlaps(tmp.bp2_gr, map.gr)
  stopifnot(length(tmp.bp2.mappable_gr) >= sample_size)

  bp2_pos <- c(bp2_pos, start(tmp.bp2.mappable_gr)[1:sample_size])
}

# Shuffling the order of bp1 and bp2
bp1_order <- sample(1:length(bp1_chr), size = length(bp1_chr), replace = FALSE)
bp2_order <- sample(1:length(bp2_chr), size = length(bp2_chr), replace = FALSE)

# Creating vectors for chromosome, position, and strand
bp1.chr = rep(names(bp1_chr_ct), bp1_chr_ct)[bp1_order]
bp1.pos = bp1_pos[bp1_order]
bp1.strand = sample(c("+", "-"), size = 1000000, replace = TRUE)

bp2.chr = rep(names(bp2_chr_ct), bp2_chr_ct)[bp2_order]
bp2.pos = bp2_pos[bp2_order]
bp2.strand = sample(c("+", "-"), size = 1000000, replace = TRUE)

# For intrachromosomal events, moving the leftmost position in the genome to the left of the table
# (matches the table provided by the PCAWG project)
intra_hits <- which(bp1.chr == bp2.chr)
switch_ind <- intersect(intra_hits, which(bp1.pos > bp2.pos))

# Compiling the chrommosome and breakpoint for each SV into a data.frame.
# Also generating a random strand for each breakpoint
out.df <- data.frame(bp1.chr = c(bp2.chr[switch_ind], bp1.chr[-switch_ind]),
                     bp1.pos = c(bp2.pos[switch_ind], bp1.pos[-switch_ind]),
                     bp1.strand = c(bp2.strand[switch_ind], bp1.strand[-switch_ind]),
                     bp2.chr = c(bp1.chr[switch_ind], bp2.chr[-switch_ind]),
                     bp2.pos = c(bp1.pos[switch_ind], bp2.pos[-switch_ind]),
                     bp2.strand = c(bp1.strand[switch_ind], bp2.strand[-switch_ind]))

# Adding an SV type (DEL, DUP, INV, TRA) )based on the strand orientation (+-, -+, ++, --) and
# whether or not the SV is intrachrommosomal or interchromosomal.
strand_orien <- paste0(out.df$bp1.strand, out.df$bp2.strand)
same_chr <- which(out.df$bp1.chr == out.df$bp2.chr)
out.df$type <- NA
out.df$type[which(strand_orien == "+-")] <- "DEL"
out.df$type[which(strand_orien == "-+")] <- "DUP"
out.df$type[which(strand_orien %in% c("++", "--"))] <- "INV"
out.df$type[-same_chr] <- "TRA"

# Shuffling out.df so that not all the intrachromosomal events are at the top of the list
out.df <- out.df[sample(1:nrow(out.df), size = nrow(out.df), replace = FALSE),]

write.table(out.df, file = "/dcl01/scharpf1/data/dbruhm/delfi_followup/split-reads/pcawg_analysis/data/processed_data/in_silico_sv.txt",
            row.names = FALSE, col.names = TRUE, sep = "\t", quote = FALSE)

# Creating an rda object as well
bp1.gr <- GRanges(seqnames = out.df$bp1.chr, ranges = out.df$bp1.pos, strand = out.df$bp1.strand)
bp2.gr <- GRanges(seqnames = out.df$bp2.chr, ranges = out.df$bp2.pos, strand = out.df$bp2.strand)
insilico_sv <- bp1.gr
insilico_sv$linked.to <- bp2.gr
save(insilico_sv, file = "/dcl01/scharpf1/data/dbruhm/delfi_followup/split-reads/pcawg_analysis/data/processed_data/insilico_sv.rda")
cancer-genomics/plasmasv documentation built on May 15, 2020, 11:35 a.m.