#' Simulate random LOH, gains and HD to identify enriched events
#'
#' @param annotated_segments_file Full path to the aggregated subclones file, which has been annotated with the CNA type
#' @param refsegs_dir Full path to the directory with the refsegs files
#' @param gain_output_dir Full path to the directory where the simulations of the gains will be saved
#' @param loh_output_dir Full path to the directory where the simulations of the losses of heterozygosity will be saved
#' @param hd_output_dir Full path to the directory where the simulations of the homozygous deletions will be saved
#' @param tumour_type String that represents the type of tumour we work with
#' @param genome_build String that represents the genome build - could be hg19 or hg38
#' @param run Number of simulation
#' @return files with the random placing of the LOH/gain/HD events
identify_enriched_regions_loh <- function(annotated_segments_file, refsegs_dir, loh_output_dir, tumour_type, run){
# LOH ####
# load the annotated segments data
allsegsa <- read.table(annotated_segments_file, header = T, stringsAsFactors = F, sep = "\t")
all.data <- allsegsa[allsegsa$frac1_A>0&allsegsa$frac1_A<=1&allsegsa$tumour_ploidy>0, ]
loh.data = all.data[(all.data$CNA=="sLOH"|all.data$CNA=="cLOH"),] # pick the segments with LOH
loh.segment.counts = read.table(paste0(refsegs_dir,tumour_type,"_LOH_refsegs.txt"),header=T,stringsAsFactors=F) # pick the info about how many samples have loss in specific intervals
colnames(loh.segment.counts) <- c("chr", "startpos", "endpos", "count")
chr.names = c(1:22,"X")
chr.data = list(0) # store the LOH.segment.counts data
chr.seg.counts = array(0,23) # number of segments per chromosome
for(chr in 1:23){
chr.data[[chr]] = loh.segment.counts[loh.segment.counts$chr==chr.names[chr],]
chr.seg.counts[chr] = nrow(chr.data[[chr]])
}
cum.chr.seg.counts = c(0,cumsum(chr.seg.counts)) # sort out some genome and chr info
chr.lengths <- c(249250621,243199373,198022430,191154276,180915260,171115067,159138663,146364022,141213431,135534747,135006516,133851895,115169878,107349540,102531392,90354753,81195210,78077248,59128983,63025520,48129895,51304566,155270560)
#chr.lengths<-c(248956422,242193529,198295559,190214555,181538259,170805979,159345973,145138636,138394717,133797422,135086622,133275309,114364328,107043718,101991189,90338345,83257441,80373285,58617616,64444167,46709983,50818468,156040895)
genome.length = sum(as.numeric(chr.lengths))
half.genome.length = round(genome.length/2)
chr.boundaries = c(0,cumsum(as.numeric(chr.lengths)))
# create a vector to store the expected number of times an event occurring by chance
sim.counts = vector(mode = "numeric",length = nrow(loh.segment.counts))
loh.data$loh.length = loh.data$endpos - loh.data$startpos + 1 # lengths of the loh segments
sim.loh = NULL # we will store the simulated LOH events here
for(sample in unique(loh.data$Tumour_Name)){
sample.sim.loh = NULL
sample.data = loh.data[loh.data$Tumour_Name==sample,] # pick the loh/loss data for this sample
for(i in 1:nrow(sample.data)){
#April 2014 - allow deletion to cross over chromosome boundary, otherwise telomeres are never hit
genome.start.pos = sample(half.genome.length,1)
if(sample(2,1)==2){
genome.start.pos = genome.start.pos + half.genome.length
}
genome.end.pos = genome.start.pos+sample.data$loh.length[i]-1
chr = sum(genome.start.pos>chr.boundaries)
chr.end = sum(genome.end.pos>chr.boundaries)
if (chr == chr.end){
chr.start.pos = genome.start.pos - chr.boundaries[chr]
chr.end.pos = genome.end.pos - chr.boundaries[chr]
sample.sim.loh = rbind(sample.sim.loh,c(chr,chr.start.pos,chr.end.pos))
} else {
#if deletion overlaps chromosome boundary, add 2 separate deletions, one on each chromosome
chr.start.pos = genome.start.pos - chr.boundaries[chr]
chr.end.pos = chr.lengths[chr]
sample.sim.loh = rbind(sample.sim.loh,c(chr,chr.start.pos,chr.end.pos))
if ((chr.end-chr)>1) {
for (j in (chr+1):(chr.end-1)) {
chr.start.pos=1
chr.end.pos=chr.lengths[j]
sample.sim.loh = rbind(sample.sim.loh,c(j,chr.start.pos,chr.end.pos))
}
}
chr.start.pos = 1
chr.end.pos = genome.end.pos - chr.boundaries[chr.end]
if (chr.end==24){chr.end=1} # loop from chr 23 to chr 1
sample.sim.loh = rbind(sample.sim.loh,c(chr.end,chr.start.pos,chr.end.pos))
}
}
sim.loh = rbind(sim.loh,sample.sim.loh)
}
sim.hits = vector(mode="numeric",length = nrow(loh.segment.counts))
for(c in 1:23){
if (dim(loh.segment.counts[loh.segment.counts$chr==chr.names[c],])[1]) {
chr = chr.names[c]
chr.sim.data = data.frame(startpos = sim.loh[sim.loh[,1]==c,2],endpos = sim.loh[sim.loh[,1]==c,3])
all.breakpoints = c(1,sort(unique(unlist(chr.sim.data))),chr.lengths[c])
no.breakpoints = length(all.breakpoints)-1
chr.segment.data = cbind(chr,all.breakpoints[1:no.breakpoints],all.breakpoints[-1],0)
chr.segment.data[,4] = sapply(1:nrow(chr.segment.data),function(d,s,i){sum(d$startpos >= as.numeric(s[i,2]) & d$startpos < as.numeric(s[i,3])| d$endpos > as.numeric(s[i,2]) & d$endpos <= as.numeric(s[i,3]) | d$startpos < as.numeric(s[i,2]) & d$endpos > as.numeric(s[i,3]))},d=chr.sim.data,s=chr.segment.data)
chr.segment.data[1:(no.breakpoints-1),3] = as.numeric(chr.segment.data[1:(no.breakpoints-1),3])-1
colnames(chr.segment.data) = c("chr","startpos","endpos","count")
chr.segment.data = data.frame(chr=chr.segment.data[,1],startpos=as.numeric(chr.segment.data[,2]),endpos=as.numeric(chr.segment.data[,3]),count=as.numeric(chr.segment.data[,4]))
#segment.data = rbind(segment.data,chr.segment.data)
no.segments = chr.seg.counts[c]
for(s in 1:no.segments){
sim.hits[cum.chr.seg.counts[c]+s] = sum(sapply(1:nrow(chr.segment.data),function(c2,d,l,i){max(min(c2$endpos[s],d$endpos[i])-max(c2$startpos[s],d$startpos[i])+1,0)*d$count[i]/l},d=chr.segment.data,c2=chr.data[[c]],l=chr.data[[c]]$endpos[s] - chr.data[[c]]$startpos[s]+1))
if(sim.hits[cum.chr.seg.counts[c]+s]>=chr.data[[c]]$count[s]){
sim.counts[cum.chr.seg.counts[c]+s] = sim.counts[cum.chr.seg.counts[c]+s] + 1
}
}
}
}
write.table(sim.counts,paste(loh_output_dir,"sim_counts_",tumour_type,"_LOH_",run,".txt",sep=""),sep="\t",row.names=F,quote=F)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.