R/fitcopynumber.R

Defines functions callChrXsubclones make_posthoc_plots collapse_bafsegmented_to_segments load.rho.psi.file plot.gw.subclonal.cn mask_high_cn_segments merge_segments determine_copynumber callSubclones fit.copy.number

Documented in callChrXsubclones callSubclones fit.copy.number make_posthoc_plots mask_high_cn_segments

#' Fit copy number
#'
#' Function that will fit a clonal copy number profile to segmented data. It first
#' matches the raw LogR with the segmented BAF to create segmented LogR. Then ASCAT
#' is run to obtain a clonal copy number profile. Beyond logRsegmented it produces
#' the rho_and_psi file and the cellularity_ploidy file.
#' @param samplename Samplename used to name the segmented logr output file
#' @param outputfile.prefix Prefix used for all output file names, except logRsegmented
#' @param inputfile.baf.segmented Filename that points to the BAF segmented data
#' @param inputfile.baf Filename that points to the raw BAF data
#' @param inputfile.logr Filename that points to the raw LogR data
#' @param dist_choice The distance metric that is used internally to rank clonal copy number solutions
#' @param ascat_dist_choice The distance metric used to obtain an initial cellularity and ploidy estimate
#' @param min.ploidy The minimum ploidy to consider (Default 1.6)
#' @param max.ploidy The maximum ploidy to consider (Default 4.8)
#' @param min.rho The minimum cellularity to consider (Default 0.1)
#' @param max.rho The maximum cellularity to consider (Default 1.0)
#' @param min.goodness The minimum goodness of fit for a solution to have to be considered (Default 63)
#' @param uninformative_BAF_threshold The threshold beyond which BAF becomes uninformative (Default 0.51)
#' @param gamma_param Technology parameter, compaction of Log R profiles. Expected decrease in case of deletion in diploid sample, 100 "\%" aberrant cells; 1 in ideal case, 0.55 of Illumina 109K arrays (Default 1)
#' @param use_preset_rho_psi Boolean whether to use user specified rho and psi values (Default F)
#' @param preset_rho A user specified rho to fit a copy number profile to (Default NA)
#' @param preset_psi A user specified psi to fit a copy number profile to (Default NA)
#' @param read_depth Legacy parameter that is no longer used (Default 30)
#' @param analysis A String representing the type of analysis to be run, this determines whether the distance figure is produced (Default paired)
#' @author dw9, sd11
#' @export
fit.copy.number = function(samplename, outputfile.prefix, inputfile.baf.segmented, inputfile.baf, inputfile.logr, dist_choice, ascat_dist_choice, min.ploidy=1.6, max.ploidy=4.8, min.rho=0.1,  max.rho=1.0, min.goodness=63, uninformative_BAF_threshold=0.51, gamma_param=1, use_preset_rho_psi=F, preset_rho=NA, preset_psi=NA, read_depth=30, analysis="paired") {

  assert.file.exists(inputfile.baf.segmented)
  assert.file.exists(inputfile.baf)
  assert.file.exists(inputfile.logr)

  # Check for enough options supplied for rho and psi
  if ((max.ploidy - min.ploidy) < 0.05) {
    stop(paste("Supplied ploidy range must be larger than 0.05: ", min.ploidy, "-", max.ploidy, sep=""))
  }
  if ((max.rho - min.rho) < 0.01) {
    stop(paste("Supplied rho range must be larger than 0.01: ", min.rho, "-", max.rho, sep=""))
  }

  # Read in the required data
  segmented.BAF.data = as.data.frame(read_bafsegmented(inputfile.baf.segmented))
  raw.BAF.data = as.data.frame(read_baf(inputfile.baf))
  raw.logR.data = as.data.frame(read_logr(inputfile.logr))

  # Assign rownames as those are required by various clonal_ascat.R functions
  # If there are duplicates (possible with old versions of BB) then remove those
  identifiers = paste(segmented.BAF.data[,1], segmented.BAF.data[,2], sep="_")
  dups = which(duplicated(identifiers))
  if (length(dups) > 0) {
      segmented.BAF.data = segmented.BAF.data[-dups,]
      identifiers = identifiers[-dups]
  }
  rownames(segmented.BAF.data) = identifiers

  # Drop NAs
  raw.BAF.data = raw.BAF.data[!is.na(raw.BAF.data[,3]),]
  raw.logR.data = raw.logR.data[!is.na(raw.logR.data[,3]),]

  ## Chromosome names are sometimes 'chr1', etc.
  #if(length(grep("chr",raw.BAF.data[1,1]))>0){
  #  raw.BAF.data[,1] = gsub("chr","",raw.BAF.data[,1])
  #}
  #if(length(grep("chr",raw.logR.data[1,1]))>0){
  #  raw.logR.data[,1] = gsub("chr","",raw.logR.data[,1])
  #}

  BAF.data = list()
  logR.data = list()
  segmented.logR.data = list()
  matched.segmented.BAF.data = list()
  gsubchr = function(chr) gsub("chr","",as.character(chr))

  chr.names = gsubchr(unique(segmented.BAF.data[,1]))

  segmented.BAF.data$Chromosome = gsubchr(segmented.BAF.data$Chromosome)
  raw.BAF.data$Chromosome = gsubchr(raw.BAF.data$Chromosome)
  raw.logR.data$Chromosome =gsubchr(raw.logR.data$Chromosome)

  baf_segmented_split = split(segmented.BAF.data, f=segmented.BAF.data$Chromosome)
  baf_split = split(raw.BAF.data, f=raw.BAF.data$Chromosome)
  logr_split = split(raw.logR.data, f=raw.logR.data$Chromosome)

  # For each chromosome
  for(chr in chr.names){
    chr.BAF.data = baf_split[[chr]]

    # Skip the rest if there is no data for this chromosome
    if(nrow(chr.BAF.data)==0){ next }
    # Match segments with chromosome position
    chr.segmented.BAF.data = baf_segmented_split[[chr]]
    indices = match(chr.segmented.BAF.data[,2],chr.BAF.data$Position )

    if (sum(is.na(indices))==length(indices) | length(indices)==0) {
	    next
    }

    # Drop NAs here too
    chr.segmented.BAF.data = chr.segmented.BAF.data[!is.na(indices),]

    # Append the segmented data
    matched.segmented.BAF.data[[chr]] = chr.segmented.BAF.data
    BAF.data[[chr]] = chr.BAF.data[indices[!is.na(indices)],]

    # Append raw LogR
    chr.logR.data = logr_split[[chr]]
    indices = match(chr.segmented.BAF.data[,2],chr.logR.data$Position)
    logR.data[[chr]] = chr.logR.data[indices[!is.na(indices)],]
    chr.segmented.logR.data = chr.logR.data[indices[!is.na(indices)],]

    # Append segmented LogR
    segs = rle(chr.segmented.BAF.data[,5])$lengths
    cum.segs = c(0,cumsum(segs))
    for(s in 1:length(segs)){
      chr.segmented.logR.data[(cum.segs[s]+1):cum.segs[s+1],3] = mean(chr.segmented.logR.data[(cum.segs[s]+1):cum.segs[s+1],3], na.rm=T)
    }
    segmented.logR.data[[chr]] = chr.segmented.logR.data
  }

  # Sync the dataframes
  selection = c()
  for (chrom in chr.names) {
    matched.segmented.BAF.data.chr = matched.segmented.BAF.data[[chrom]] #matched.segmented.BAF.data[matched.segmented.BAF.data[,1]==chrom,]
    logR.data.chr = logR.data[[chrom]] #logR.data[logR.data[,1]==chrom,]

    selection = matched.segmented.BAF.data.chr[,2] %in% logR.data.chr[,2]
    matched.segmented.BAF.data[[chrom]] = matched.segmented.BAF.data.chr[selection,]
    segmented.logR.data[[chrom]] = segmented.logR.data[[chrom]][selection,]
  }

  # Combine the split data frames into a single for the subsequent steps
  matched.segmented.BAF.data = do.call(rbind, matched.segmented.BAF.data)
  segmented.logR.data = do.call(rbind, segmented.logR.data)
  BAF.data = do.call(rbind, BAF.data)
  logR.data = do.call(rbind, logR.data)
  names(matched.segmented.BAF.data)[5] = samplename

  # write out the segmented logR data
  row.names(segmented.logR.data) = row.names(matched.segmented.BAF.data)
  row.names(logR.data) = row.names(matched.segmented.BAF.data)
  write.table(segmented.logR.data,paste(samplename,".logRsegmented.txt",sep=""),sep="\t",quote=F,col.names=F,row.names=F)

  # Prepare the data for going into the runASCAT functions
  segBAF = 1-matched.segmented.BAF.data[,5]
  segLogR = segmented.logR.data[,3]
  logR = logR.data[,3]
  names(segBAF) = rownames(matched.segmented.BAF.data)
  names(segLogR) = rownames(matched.segmented.BAF.data)
  names(logR) = rownames(matched.segmented.BAF.data)

  chr.segs = NULL
  for(ch in 1:length(chr.names)){
    chr.segs[[ch]] = which(logR.data[,1]==chr.names[ch])
  }

  if(use_preset_rho_psi){
    ascat_optimum_pair = list(rho=preset_rho, psi = preset_psi, ploidy = preset_psi)
  }else{
    distance.outfile=paste(outputfile.prefix, "distance.png", sep="", collapse="") # kjd 20-2-2014
    copynumberprofile.outfile=paste(outputfile.prefix, "copynumberprofile.png", sep="", collapse="") # kjd 20-2-2014
    nonroundedprofile.outfile=paste(outputfile.prefix, "nonroundedprofile.png", sep="", collapse="") # kjd 20-2-2014
    cnaStatusFile = paste(outputfile.prefix, "copynumber_solution_status.txt", sep="", collapse="")

    ascat_optimum_pair = runASCAT(logR, 1-BAF.data[,3], segLogR, segBAF, chr.segs, ascat_dist_choice,distance.outfile, copynumberprofile.outfile, nonroundedprofile.outfile, cnaStatusFile=cnaStatusFile, gamma=gamma_param, allow100percent=T, reliabilityFile=NA, min.ploidy=min.ploidy, max.ploidy=max.ploidy, min.rho=min.rho, max.rho=max.rho, min.goodness, chr.names=chr.names, analysis=analysis) # kjd 4-2-2014
  }

  distance.outfile=paste(outputfile.prefix,"second_distance.png",sep="",collapse="") # kjd 20-2-2014
  copynumberprofile.outfile=paste(outputfile.prefix,"second_copynumberprofile.png",sep="",collapse="") # kjd 20-2-2014
  nonroundedprofile.outfile=paste(outputfile.prefix,"second_nonroundedprofile.png",sep="",collapse="") # kjd 20-2-2014

  # All is set up, now run ASCAT to obtain a clonal copynumber profile
  out = run_clonal_ASCAT( logR, 1-BAF.data[,3], segLogR, segBAF, chr.segs, matched.segmented.BAF.data, ascat_optimum_pair, dist_choice, distance.outfile, copynumberprofile.outfile, nonroundedprofile.outfile, gamma_param=gamma_param, read_depth, uninformative_BAF_threshold, allow100percent=T, reliabilityFile=NA,  psi_min_initial=min.ploidy, psi_max_initial=max.ploidy, rho_min_initial=min.rho, rho_max_initial=max.rho, chr.names=chr.names) # kjd 21-2-2014

  ascat_optimum_pair_fraction_of_genome = out$output_optimum_pair_without_ref
  ascat_optimum_pair_ref_seg = out$output_optimum_pair
  is.ref.better = out$is.ref.better

  # Save rho, psi and ploidy for future reference
  rho_psi_output = data.frame(rho = c(ascat_optimum_pair$rho,ascat_optimum_pair_fraction_of_genome$rho,ascat_optimum_pair_ref_seg$rho),psi = c(ascat_optimum_pair$psi,ascat_optimum_pair_fraction_of_genome$psi,ascat_optimum_pair_ref_seg$psi), ploidy = c(ascat_optimum_pair$ploidy,ascat_optimum_pair_fraction_of_genome$ploidy,ascat_optimum_pair_ref_seg$ploidy), distance = c(NA,out$distance_without_ref,out$distance), is.best = c(NA,!is.ref.better,is.ref.better),row.names=c("ASCAT","FRAC_GENOME","REF_SEG"))
  write.table(rho_psi_output,paste(outputfile.prefix,"rho_and_psi.txt",sep=""),quote=F,sep="\t")
}

#' Fit subclonal copy number
#'
#' This function fits a subclonal copy number profile where a clonal profile is unlikely.
#' It goes over each segment of a clonal copy number profile and does a simple t-test. If the
#' test is significant it is unlikely that the data can be explained by a single copy number
#' state. We therefore fit a second state, i.e. there are two cellular populations with each
#' a different state: Subclonal copy number.
#' @param sample.name Name of the sample, used in figures
#' @param baf.segmented.file String that points to a file with segmented BAF output
#' @param logr.file String that points to the raw LogR file to be used in the subclonal copy number figures
#' @param rho.psi.file String pointing to the rho_and_psi file generated by \code{fit.copy.number}
#' @param output.file Filename of the file where the final copy number fit will be written to
#' @param output.figures.prefix Prefix of the filenames for the chromosome specific copy number figures
#' @param output.gw.figures.prefix Prefix of the filenames for the genome wide copy number figures
#' @param chr_names Vector of allowed chromosome names
#' @param masking_output_file Filename of where the masking details need to be written. Masking is performed to remove very high copy number state segments
#' @param max_allowed_state The maximum CN state allowed (Default 100)
#' @param prior_breakpoints_file A two column file with prior breakpoints, possibly from structural variants. This file must contain two columns: chromosome and position. These are used when making the figures
#' @param gamma Technology specific scaling parameter for LogR (Default 1)
#' @param segmentation.gamma Legacy parameter that is no longer used (Default NA)
#' @param siglevel Threshold under which a p-value becomes significant. When it is significant a second copy number state will be fitted (Default 0.05)
#' @param maxdist Slack in BAF space to allow a segment to be off it's optimum before becoming significant. A segment becomes significant very quickly when a breakpoint is missed, this parameter alleviates the effect (Default 0.01)
#' @param noperms The number of permutations to be run when bootstrapping the confidence intervals on the copy number state of each segment (Default 1000)
#' @param seed Seed to set when performing bootstrapping (Default: Current time)
#' @param calc_seg_baf_option Various options to recalculate the BAF of a segment. Options are: 1 - median, 2 - mean, 3 - ifelse median==0|1, mean, median. (Default: 3)
#' @author dw9, sd11
#' @export

callSubclones = function(sample.name, baf.segmented.file, logr.file, rho.psi.file, output.file, output.figures.prefix, output.gw.figures.prefix, chr_names, masking_output_file, max_allowed_state=250, prior_breakpoints_file=NULL, gamma=1, segmentation.gamma=NA, siglevel=0.05, maxdist=0.01, noperms=1000, seed=as.integer(Sys.time()), calc_seg_baf_option=3) {
  
  set.seed(seed)

  # Load rho/psi/goodness of fit
  res = load.rho.psi.file(rho.psi.file)
  rho = res$rho
  psit = res$psit
  psi = rho*psit + 2 * (1-rho) # psi of all cells
  goodness = res$goodness

  # Load the BAF segmented data
  BAFvals = as.data.frame(read_bafsegmented(baf.segmented.file))
  if (colnames(BAFvals)[1] == "X") {
	  # If there were rownames, then delete this column. Should not be an issue with new BB runs
	  BAFvals = BAFvals[,-1]
  }

  BAF = BAFvals[,3]
  BAFphased = BAFvals[,4]
  BAFseg = BAFvals[,5]

  # Save SNP positions separately
  SNPpos = BAFvals[,c(1,2)]

  # Load the raw LogR data
  LogRvals = as.data.frame(read_logr(logr.file))
  if (colnames(LogRvals)[1] == "X") {
	  # If there were rownames, then delete this column. Should not be an issue with new BB runs
	  LogRvals = LogRvals[,-1]
  }

  # Chromosome names are sometimes 'chr1', etc.
  #if(length(grep("chr",LogRvals[1,1]))>0){
#	      LogRvals[,1] = gsub("chr","",LogRvals[,1])
  #}

  ctrans = c(1:length(chr_names))
  names(ctrans) = chr_names
  ctrans.logR = c(1:length(chr_names))
  names(ctrans.logR) = chr_names

  # = as.vector(ctrans.logR[as.vector(LogRvals[,1])]*1000000000+LogRvals[,2])
  BAFpos = as.vector(ctrans[as.vector(BAFvals[,1])]*1000000000+BAFvals[,2])

  ################################################################################################
  # Determine copy number for each segment
  ################################################################################################
  res = determine_copynumber(BAFvals, LogRvals, rho, psi, gamma, ctrans, ctrans.logR, maxdist, siglevel, noperms)
  subcloneres = res$subcloneres
  write.table(subcloneres, gsub(".txt", "_1.txt", output.file), quote=F, col.names=T, row.names=F, sep="\t")

  # Scan the segments for cases that should be merged
  res = merge_segments(subcloneres, BAFvals, LogRvals, rho, psi, gamma, calc_seg_baf_option)
  BAFvals = res$bafsegmented

  res = determine_copynumber(BAFvals, LogRvals, rho, psi, gamma, ctrans, ctrans.logR, maxdist, siglevel, noperms)
  subcloneres = res$subcloneres
  BAFpvals = res$BAFpvals

  # Scan for very high copy number segments and set those to NA - This is in part an artifact of small segments
  res = mask_high_cn_segments(subcloneres, BAFvals, max_allowed_state)
  subcloneres = res$subclones
  # No longer writing out the BAFsegmented data after masking
  #BAFvals = res$bafsegmented
  #write.table(BAFvals, file=baf.segmented.file, sep="\t", row.names=F, col.names=T, quote=F)
  # Write the masking details to file
  masking_details = data.frame(samplename=sample.name, masked_count=res$masked_count, masked_size=res$masked_size, max_allowed_state=max_allowed_state)
  write.table(masking_details, file=masking_output_file, quote=F, col.names=T, row.names=F, sep="\t")
  
  # Write the final copy number profile 
  # NAP: generating two output files: first reporting solution A and the second reporting alternative solutions (B to F)
  write.table(subcloneres[,1:17], output.file, quote=F, col.names=T, row.names=F, sep="\t")
  write.table(subcloneres[,c(1:7,18:67)], gsub(".txt","_alternatives.txt",output.file), quote=F, col.names=T, row.names=F, sep="\t")

  ################################################################################################
  # Make a plot per chromosome
  ################################################################################################
  # Collapse the BAFsegmented into breakpoints to be used in plotting
  segment_breakpoints = collapse_bafsegmented_to_segments(BAFvals)
  if (!is.null(prior_breakpoints_file) & !ifelse(is.null(prior_breakpoints_file), TRUE, prior_breakpoints_file=="NA") & !ifelse(is.null(prior_breakpoints_file), TRUE, is.na(prior_breakpoints_file))) {
    svs = read.table(prior_breakpoints_file, header=T, stringsAsFactors=F)
  }

  # Create a plot per chromosome that shows the segments with their CN state in text
  for (chr in chr_names) {
    pos = SNPpos[SNPpos[,1]==chr, 2]
    #if no points to plot, skip
    if (length(pos)==0) { next }
    
    if (!is.null(prior_breakpoints_file) & !ifelse(is.null(prior_breakpoints_file), TRUE, prior_breakpoints_file=="NA") & !ifelse(is.null(prior_breakpoints_file), TRUE, is.na(prior_breakpoints_file))) {
      svs_pos = svs[svs$chromosome==chr,]$position / 1000000
    } else {
      svs_pos = NULL
    }

    breakpoints_pos = segment_breakpoints[segment_breakpoints$chromosome==chr,]
    breakpoints_pos = sort(unique(c(breakpoints_pos$start, breakpoints_pos$end) / 1000000))

    png(filename = paste(output.figures.prefix, chr,".png",sep=""), width = 2000, height = 2000, res = 200)
    create.subclonal.cn.plot(chrom=chr,
                             chrom.position=pos/1000000,
                             LogRposke=LogRvals[LogRvals[,1]==chr,2],
                             LogRchr=LogRvals[LogRvals[,1]==chr,3],
                             BAFchr=BAF[SNPpos[,1]==chr],
                             BAFsegchr=BAFseg[SNPpos[,1]==chr],
                             BAFpvalschr=BAFpvals[SNPpos[,1]==chr],
                             subcloneres=subcloneres,
                             breakpoints_pos=breakpoints_pos,
                             svs_pos=svs_pos,
                             siglevel=siglevel,
                             x.min=min(pos)/1000000,
                             x.max=max(pos)/1000000,
                             title=paste(sample.name,", chromosome ", chr, sep=""),
                             xlab="Position (Mb)",
                             ylab.logr="LogR",
                             ylab.baf="BAF (phased)")
    dev.off()
  }

  # Cast columns back to numeric
  subclones = as.data.frame(subcloneres)
  subclones[,2:ncol(subclones)] = sapply(2:ncol(subclones), function(x) { as.numeric(as.character(subclones[,x])) })

  # Recalculate the ploidy based on the actual fit
  seg_length = floor((subclones$endpos-subclones$startpos)/1000)
  is_subclonal_maj = abs(subclones$nMaj1_A - subclones$nMaj2_A) > 0
  is_subclonal_min = abs(subclones$nMin1_A - subclones$nMin2_A) > 0
  is_subclonal_maj[is.na(is_subclonal_maj)] = F
  is_subclonal_min[is.na(is_subclonal_min)] = F
  segment_states_min = subclones$nMin1_A * ifelse(is_subclonal_min, subclones$frac1_A, 1)  + ifelse(is_subclonal_min, subclones$nMin2_A, 0) * ifelse(is_subclonal_min, subclones$frac2_A, 0)
  segment_states_maj = subclones$nMaj1_A * ifelse(is_subclonal_maj, subclones$frac1_A, 1)  + ifelse(is_subclonal_maj, subclones$nMaj2_A, 0) * ifelse(is_subclonal_maj, subclones$frac2_A, 0)
  ploidy = sum((segment_states_min+segment_states_maj) * seg_length, na.rm=T) / sum(seg_length, na.rm=T)

  # Plot genome wide figures
  plot.gw.subclonal.cn(subclones=subclones, BAFvals=BAFvals, rho=rho, ploidy=ploidy, goodness=goodness, output.gw.figures.prefix=output.gw.figures.prefix, chr.names=chr_names, tumourname=sample.name)

  # Create user friendly cellularity and ploidy output file
  cellularity_ploidy_output = data.frame(purity = c(rho), ploidy = c(ploidy), psi = c(psit))
  cellularity_file = gsub("_subclones.txt", "_purity_ploidy.txt", output.file) # NAP: updated the name of the output file, consistent with new title
  write.table(cellularity_ploidy_output, cellularity_file, quote=F, sep="\t", row.names=F)
}

#' Given all the determined values make a copy number call for each segment
#'
#' @param BAFvals BAFsegmented data.frame with 5 columns
#' @param LogRvals Raw logR values in data.frame with 3 columns
#' @param rho Optimal rho value, the choosen cellularity
#' @param psi Optimal psi value, the choosen ploidy
#' @param gamma Platform gamma parameter
#' @param ctrans Named vector of chromosome names
#' @param ctrans.logR Named vector of chromosome names
#' @param maxdist Max distance a segment is tolerated to be not considered for subclonal copy number
#' @param siglevel Level at which a segment can become significantly different from the nearest clonal state
#' @param noperms Number of bootstrap permutations
#' @return A data.frame with copy number determined for each segment
#' @author dw9
#' @noRd
determine_copynumber = function(BAFvals, LogRvals, rho, psi, gamma, ctrans, ctrans.logR, maxdist, siglevel, noperms) {
  BAFphased = BAFvals[,4]
  BAFseg = BAFvals[,5]
  BAFpos = as.vector(ctrans[as.vector(BAFvals[,1])]*1000000000+BAFvals[,2])
  LogRpos = as.vector(ctrans.logR[as.vector(LogRvals[,1])]*1000000000+LogRvals[,2])

  #DCW 240314
  switchpoints = c(0,which(BAFseg[-1] != BAFseg[-(length(BAFseg))] | BAFvals[-1,1] != BAFvals[-nrow(BAFvals),1]),length(BAFseg))
  BAFlevels = BAFseg[switchpoints[-1]]

  pval = NULL
  BAFpvals = vector(length=length(BAFseg))
  subcloneres = NULL

  for (i in 1:length(BAFlevels)) {
    # subcloneres = rbind(subcloneres, fit_segment(BAFpos, LogRpos, BAFlevels, BAFphased, LogRvals, switchpoints, rho, psi, gamma, i))
    l = BAFlevels[i]

    # Make sure that BAF>=0.5, otherwise nMajor and nMinor may be the wrong way around
    l = max(l,1-l)

    BAFke = BAFphased[(switchpoints[i]+1):switchpoints[i+1]]

    #startpos = min(BAFpos[names(BAFke)])
    #endpos = max(BAFpos[names(BAFke)])
    startpos = min(BAFpos[(switchpoints[i]+1):switchpoints[i+1]])
    endpos = max(BAFpos[(switchpoints[i]+1):switchpoints[i+1]])
    #chrom = names(ctrans[floor(startpos/1000000000)])
    # Assuming all SNPs in this segment are on the same chromosome
    chrom = BAFvals[(switchpoints[i]+1):switchpoints[i+1],]$Chromosome[1]
    LogR = mean(LogRvals[LogRpos>=startpos&LogRpos<=endpos & !is.infinite(LogRvals[,3]),3],na.rm=T)

    # if we don't have a value for LogR, fill in 0
    if (is.na(LogR)) {
      LogR = 0
    }
    nMajor = (rho-1+l*psi*2^(LogR/gamma))/rho
    nMinor = (rho-1+(1-l)*psi*2^(LogR/gamma))/rho

    # Increase nMajor and nMinor together, to avoid impossible combinations (with negative subclonal fractions)
    if (nMinor<0) {
      if (l==1) {
        # Avoid calling infinite copy number
        nMajor = 1000
      } else {
        nMajor = nMajor + l * (0.01 - nMinor) / (1-l)
      }
      nMinor = 0.01
    }

    # Note that these are sorted in the order of ascending BAF:
    nMaj = c(floor(nMajor), ceiling(nMajor), floor(nMajor), ceiling(nMajor))
    nMin = c(ceiling(nMinor), ceiling(nMinor), floor(nMinor), floor(nMinor))
    x = floor(nMinor)
    y = floor(nMajor)

    # Total copy number, to determine priority options
    ntot = nMajor + nMinor

    levels = (1-rho+rho*nMaj)/(2-2*rho+rho*(nMaj+nMin))
    # Problem if rho=1 and nMaj=0 and nMin=0
    levels[nMaj==0 & nMin==0] = 0.5

    #DCW - just test corners on the nearest edge to determine clonality
    # If the segment is called as subclonal, this is the edge that will be used to determine the subclonal proportions that are reported first
    all.edges = orderEdges(levels, l, ntot,x,y)
    nMaj.test = all.edges[1,c(1,3)]
    nMin.test = all.edges[1,c(2,4)]
    test.levels = (1-rho+rho*nMaj.test)/(2-2*rho+rho*(nMaj.test+nMin.test))
    whichclosestlevel.test = which.min(abs(test.levels-l))

    # Test whether a segment should be subclonal
    if (is.na(sd(BAFke)) || sd(BAFke)==0) {
      pval[i] = 0 # problem caused by segments with constant BAF (usually 1 or 2)
    } else {
      pval[i] = t.test(BAFke, alternative="two.sided", mu=test.levels[whichclosestlevel.test])$p.value
    }
    if (abs(l-test.levels[whichclosestlevel.test])<maxdist) {
      pval[i] = 1
    }

    #DCW 240314
    BAFpvals[(switchpoints[i]+1):switchpoints[i+1]] = pval[i]

    # If the difference is significant, call subclonal level
    if (pval[i] <= siglevel) {

      all.edges = orderEdges(levels, l, ntot,x,y)
      # Switch order, so that negative copy numbers are at the end
      na.indices = which(is.na(rowSums(all.edges)))
      if (length(na.indices)>0) {
        all.edges = rbind(all.edges[-na.indices,], all.edges[na.indices,])
      }
      nMaj1 = all.edges[,1]
      nMin1 = all.edges[,2]
      nMaj2 = all.edges[,3]
      nMin2 = all.edges[,4]

      tau = (1 - rho + rho * nMaj2 - 2 * l * (1 - rho) - l * rho * (nMin2 + nMaj2)) / (l * rho * (nMin1 + nMaj1) - l * rho * (nMin2 + nMaj2) - rho * nMaj1 + rho * nMaj2)
      sdl = sd(BAFke,na.rm=T)/sqrt(sum(!is.na(BAFke)))
      sdtau = abs((1 - rho + rho * nMaj2 - 2 * (l+sdl) * (1 - rho) - (l+sdl) * rho * (nMin2 + nMaj2)) / ((l+sdl) * rho * (nMin1 + nMaj1) - (l+sdl) * rho * (nMin2 + nMaj2) - rho * nMaj1 + rho * nMaj2) - tau) / 2 +
        abs((1 - rho + rho * nMaj2 - 2 * (l-sdl) * (1 - rho) - (l-sdl) * rho * (nMin2 + nMaj2)) / ((l-sdl) * rho * (nMin1 + nMaj1) - (l-sdl) * rho * (nMin2 + nMaj2) - rho * nMaj1 + rho * nMaj2) - tau) / 2

      # Bootstrapping to obtain 95% confidence intervals
      sdtaubootstrap = vector(length=length(tau), mode="numeric")
      tau25 = vector(length=length(tau), mode="numeric")
      tau975 = vector(length=length(tau), mode="numeric")

      for (option in 1:length(tau)) {
        nMaj1o = nMaj1[option]
        nMin1o = nMin1[option]
        nMaj2o = nMaj2[option]
        nMin2o = nMin2[option]

        permFraction = vector(length=noperms,mode="numeric")
        for (j in 1:noperms) {
          permBAFs=sample(BAFke,length(BAFke),replace=T)
          permMeanBAF=mean(permBAFs)
          permFraction[j] = (1 - rho + rho * nMaj2o - 2 * permMeanBAF * (1 - rho) - permMeanBAF * rho * (nMin2o + nMaj2o)) / (permMeanBAF * rho * (nMin1o + nMaj1o) - permMeanBAF * rho * (nMin2o + nMaj2o) - rho * nMaj1o + rho * nMaj2o)
        }
        orderedFractions = sort(permFraction)
        sdtaubootstrap[option] = sd(permFraction)
        tau25[option] = orderedFractions[25]
        tau975[option] = orderedFractions[975]
      }

      subcloneres = rbind(subcloneres, c(chrom,startpos-floor(startpos/1000000000)*1000000000,
                                         endpos-floor(endpos/1000000000)*1000000000,l,pval[i],LogR,ntot,
                                         nMaj1[1],nMin1[1],tau[1],nMaj2[1],nMin2[1],1-tau[1],sdtau[1],sdtaubootstrap[1],tau25[1],tau975[1],
                                         nMaj1[2],nMin1[2],tau[2],nMaj2[2],nMin2[2],1-tau[2],sdtau[2],sdtaubootstrap[2],tau25[2],tau975[2],
                                         nMaj1[3],nMin1[3],tau[3],nMaj2[3],nMin2[3],1-tau[3],sdtau[3],sdtaubootstrap[3],tau25[3],tau975[3],
                                         nMaj1[4],nMin1[4],tau[4],nMaj2[4],nMin2[4],1-tau[4],sdtau[4],sdtaubootstrap[4],tau25[4],tau975[4],
                                         nMaj1[5],nMin1[5],tau[5],nMaj2[5],nMin2[5],1-tau[5],sdtau[5],sdtaubootstrap[5],tau25[5],tau975[5],
                                         nMaj1[6],nMin1[6],tau[6],nMaj2[6],nMin2[6],1-tau[6],sdtau[6],sdtaubootstrap[6],tau25[6],tau975[6]))
    }else {
      #if called as clonal, use the best corner from the nearest edge
      subcloneres = rbind(subcloneres, c(chrom,startpos-floor(startpos/1000000000)*1000000000,
                                         endpos-floor(endpos/1000000000)*1000000000,l,pval[i],LogR,ntot,
                                         nMaj.test[whichclosestlevel.test],nMin.test[whichclosestlevel.test],1,rep(NA,57)))

    }
  }
  colnames(subcloneres) = c("chr","startpos","endpos","BAF","pval","LogR","ntot",
                            "nMaj1_A","nMin1_A","frac1_A","nMaj2_A","nMin2_A","frac2_A","SDfrac_A","SDfrac_A_BS","frac1_A_0.025","frac1_A_0.975",
                            "nMaj1_B","nMin1_B","frac1_B","nMaj2_B","nMin2_B","frac2_B","SDfrac_B","SDfrac_B_BS","frac1_B_0.025","frac1_B_0.975",
                            "nMaj1_C","nMin1_C","frac1_C","nMaj2_C","nMin2_C","frac2_C","SDfrac_C","SDfrac_C_BS","frac1_C_0.025","frac1_C_0.975",
                            "nMaj1_D","nMin1_D","frac1_D","nMaj2_D","nMin2_D","frac2_D","SDfrac_D","SDfrac_D_BS","frac1_D_0.025","frac1_D_0.975",
                            "nMaj1_E","nMin1_E","frac1_E","nMaj2_E","nMin2_E","frac2_E","SDfrac_E","SDfrac_E_BS","frac1_E_0.025","frac1_E_0.975",
                            "nMaj1_F","nMin1_F","frac1_F","nMaj2_F","nMin2_F","frac2_F","SDfrac_F","SDfrac_F_BS","frac1_F_0.025","frac1_F_0.975")
  subcloneres = as.data.frame(subcloneres)
  for (i in 2:ncol(subcloneres)) {
    subcloneres[,i] = as.numeric(as.character(subcloneres[,i]))
  }
  return(list(subcloneres=subcloneres, BAFpvals=BAFpvals))
}


#' Merge copy number segments
#'
#' Merges segments if there is not enough evidence for them to be separate. Two adjacent segments are merged
#' when they are either fit with the same clonal copy number state or when their BAF is not significantly different
#' and their logR puts them in the same square.
#' @param subclones A completely fit copy number profile in Battenberg output format
#' @param bafsegmented A BAFsegmented data.frame with the 5 columns that corresponds to the subclones file
#' @param logR The raw logR data
#' @param rho The rho estimate that the profile was fit with
#' @param psi the psi estimate that the profile was fit with
#' @param platform_gamma The gamma parameter for this platform
#' @param calc_seg_baf_option Various options to recalculate the BAF of a segment. Options are: 1 - median, 2 - mean, 3 - ifelse median== 0|1, mean, median. (Default: 3)
#' @return A list with two fields: bafsegmented and subclones. The subclones field contains a data.frame in
#' Battenberg output format with the merged segments. The bafsegmented field contains the BAFsegmented data
#' corresponding to the provided subclones data.frame.
#' @author sd11
#' @noRd
merge_segments = function(subclones, bafsegmented, logR, rho, psi, platform_gamma, calc_seg_baf_option=3) {
  subclones_cleaned = data.frame()
  merged = T
  counter = 1
  previous_merged = F

  print("Merging segments")
  while(merged) {
    print(paste0("Iter: ",counter))
    merged = F
    for (i in 2:nrow(subclones)) {
      if ((i %% 50)==0) { print(paste0(i, " / ", nrow(subclones))) }

      # If the second segment was not merged with the first we need to save segment 1. Here we throw away the previously saved copy of 2 and save 1 and 2
      if (i==3 & !previous_merged) {
        subclones_cleaned = subclones[1,,drop=F]
      }

      # Don't do any double merging. This needs to happen at the next iteration, as we've just merged i with i-1 we don't add i again
      if (previous_merged) {
        previous_merged = F
        #subclones_cleaned = rbind(subclones_cleaned, subclones[i-1,])
        next
      }

      # if segments are on different chromosomes, keep the separate segments
      if (subclones$chr[i-1]!=subclones$chr[i]) {
        subclones_cleaned = rbind(subclones_cleaned, subclones[i-1,])
        next
      }

      # if distance between segment is large, keep the separate segments
      if (subclones$chr[i-1]==subclones$chr[i] & diff(c(subclones$endpos[i-1], subclones$startpos[i]))>3000000) {
        subclones_cleaned = rbind(subclones_cleaned, subclones[i-1,])
        next
      }

      # if the segments have the exact same copy number state, merge the segments
      nmin_equals = subclones$nMin1_A[i-1]==subclones$nMin1_A[i]
      nmaj_equals = subclones$nMaj1_A[i-1]==subclones$nMaj1_A[i]
      not_subclonal = (subclones$frac1_A[i-1]==1) & (subclones$frac1_A[i]==1)

      # It would be nice to store the baf and logr values for this segment temporarily, but the memory allocation time is too great, therefore the selection happens a couple of times inline below
      if (not_subclonal & nmin_equals & nmaj_equals) {
        # MERGE
        new_entry = data.frame(subclones[i-1,])
        new_entry$endpos = subclones[i,]$endpos

        #
        # Note: When adding options, also add to segment.baf.phased
        #
        if (calc_seg_baf_option==1) {
          new_entry$BAF = median(c(bafsegmented$BAFphased[bafsegmented$Chromosome==subclones$chr[i-1] & bafsegmented$Position>=subclones$startpos[i-1] & bafsegmented$Position<=subclones$endpos[i-1]],
                                 bafsegmented$BAFphased[bafsegmented$Chromosome==subclones$chr[i] & bafsegmented$Position>=subclones$startpos[i] & bafsegmented$Position<=subclones$endpos[i]]), na.rm=T)

        } else if (calc_seg_baf_option==2) {
          new_entry$BAF = mean(c(bafsegmented$BAFphased[bafsegmented$Chromosome==subclones$chr[i-1] & bafsegmented$Position>=subclones$startpos[i-1] & bafsegmented$Position<=subclones$endpos[i-1]], 
                                   bafsegmented$BAFphased[bafsegmented$Chromosome==subclones$chr[i] & bafsegmented$Position>=subclones$startpos[i] & bafsegmented$Position<=subclones$endpos[i]]), na.rm=T)
        } else if (calc_seg_baf_option==3) {
          mean_baf = mean(c(bafsegmented$BAFphased[bafsegmented$Chromosome==subclones$chr[i-1] & bafsegmented$Position>=subclones$startpos[i-1] & bafsegmented$Position<=subclones$endpos[i-1]], 
                            bafsegmented$BAFphased[bafsegmented$Chromosome==subclones$chr[i] & bafsegmented$Position>=subclones$startpos[i] & bafsegmented$Position<=subclones$endpos[i]]), na.rm=T)
          median_baf = median(c(bafsegmented$BAFphased[bafsegmented$Chromosome==subclones$chr[i-1] & bafsegmented$Position>=subclones$startpos[i-1] & bafsegmented$Position<=subclones$endpos[i-1]], 
                                bafsegmented$BAFphased[bafsegmented$Chromosome==subclones$chr[i] & bafsegmented$Position>=subclones$startpos[i] & bafsegmented$Position<=subclones$endpos[i]]), na.rm=T)
          # We'll prefer the median BAF as a segment summary
          # but change to the mean when the median is extreme
          # as at 0 or 1 the BAF is uninformative for the fitting
          if (median_baf!=0 & mean_baf!=1) {
            new_entry$BAF = median_baf
          } else {
            new_entry$BAF = mean_baf
          }
        } else {
          warning("Supplied calc_seg_baf_option to callSubclones not valid, using mean BAF by default")
          # The mean is taken here as that has originally been the default. This behaviour is consistent with the segmentation functions
          new_entry$BAF = mean(c(bafsegmented$BAFphased[bafsegmented$Chromosome==subclones$chr[i-1] & bafsegmented$Position>=subclones$startpos[i-1] & bafsegmented$Position<=subclones$endpos[i-1]], 
                                 bafsegmented$BAFphased[bafsegmented$Chromosome==subclones$chr[i] & bafsegmented$Position>=subclones$startpos[i] & bafsegmented$Position<=subclones$endpos[i]]), na.rm=T)
        }

        new_entry$LogR = mean(c(logR[logR$Chromosome==subclones$chr[i-1] & logR$Position>=subclones$startpos[i-1] & logR$Position<=subclones$endpos[i-1], 3],
                                logR[logR$Chromosome==subclones$chr[i] & logR$Position>=subclones$startpos[i] & logR$Position<=subclones$endpos[i], 3]), na.rm=T)
        subclones_cleaned = rbind(subclones_cleaned, new_entry)

        # Set BAFseg to reflect the current segment
        bafsegmented$BAFseg[bafsegmented$Chromosome==subclones$chr[i-1] & bafsegmented$Position>=subclones$startpos[i-1] & bafsegmented$Position<=subclones$endpos[i]] = new_entry$BAF
        # TODO The logRseg is currently not updated

        previous_merged = T
        merged = T
        next
      }

      # if the logr puts the segments in the same square and BAF is not significantly different, merge them

      # Helper functions to calculate the precise nmaj and nmin
      calc_nmin = function(rho, psi, baf, logr, platform_gamma) {
        return((rho-1-(baf-1)*2^(logr/platform_gamma)*((1-rho)*2+rho*psi))/rho)
      }

      calc_nmaj = function(rho, psi, baf, logr, platform_gamma) {
        return((rho-1+baf*2^(logr/platform_gamma)*((1-rho)*2+rho*psi))/rho)
      }

      nmin_curr = round(calc_nmin(rho, psi, subclones$BAF[i], subclones$LogR[i], platform_gamma))
      nmaj_curr = round(calc_nmaj(rho, psi, subclones$BAF[i], subclones$LogR[i], platform_gamma))
      nmin_prev = round(calc_nmin(rho, psi, subclones$BAF[i-1], subclones$LogR[i-1], platform_gamma))
      nmaj_prev = round(calc_nmaj(rho, psi, subclones$BAF[i-1], subclones$LogR[i-1], platform_gamma))

      # Perform t-test on the BAFphased
      if (sum(!is.na(bafsegmented$BAFphased[bafsegmented$Chromosome==subclones$chr[i-1] & bafsegmented$Position>=subclones$startpos[i-1] & bafsegmented$Position<=subclones$endpos[i-1]])) > 10 &
          sum(!is.na(bafsegmented$BAFphased[bafsegmented$Chromosome==subclones$chr[i] & bafsegmented$Position>=subclones$startpos[i] & bafsegmented$Position<=subclones$endpos[i]])) > 10 &
          sum(!is.na(logR[logR$Chromosome==subclones$chr[i-1] & logR$Position>=subclones$startpos[i-1] & logR$Position<=subclones$endpos[i-1], 3])) > 10 &
          sum(!is.na(logR[logR$Chromosome==subclones$chr[i] & logR$Position>=subclones$startpos[i] & logR$Position<=subclones$endpos[i], 3])) > 10) {
        baf_significant = t.test(bafsegmented$BAFphased[bafsegmented$Chromosome==subclones$chr[i-1] & bafsegmented$Position>=subclones$startpos[i-1] & bafsegmented$Position<=subclones$endpos[i-1]],
                                 bafsegmented$BAFphased[bafsegmented$Chromosome==subclones$chr[i] & bafsegmented$Position>=subclones$startpos[i] & bafsegmented$Position<=subclones$endpos[i]])$p.value < 0.05
        logr_significant = t.test(logR[logR$Chromosome==subclones$chr[i-1] & logR$Position>=subclones$startpos[i-1] & logR$Position<=subclones$endpos[i-1], 3],
                                  logR[logR$Chromosome==subclones$chr[i] & logR$Position>=subclones$startpos[i] & logR$Position<=subclones$endpos[i], 3])$p.value < 0.05
      } else {
        # If not enough SNPs to reliably do a t-test we keep the separate segments, so set this to TRUE
        baf_significant = T
        logr_significant = T
      }

      # If both BAF and logR are not significantly different and at least one allele is of the same state, then merge the subclonal copy number
      if (!(baf_significant & logr_significant) & (nmin_curr==nmin_prev | nmaj_curr==nmaj_prev)) {
        # MERGE
        new_entry = data.frame(subclones[i-1,])
        new_entry$endpos = subclones$endpos[i]
        if (calc_seg_baf_option==1) {
          new_entry$BAF = median(c(bafsegmented$BAFphased[bafsegmented$Chromosome==subclones$chr[i-1] & bafsegmented$Position>=subclones$startpos[i-1] & bafsegmented$Position<=subclones$endpos[i-1]],
                                 bafsegmented$BAFphased[bafsegmented$Chromosome==subclones$chr[i] & bafsegmented$Position>=subclones$startpos[i] & bafsegmented$Position<=subclones$endpos[i]]), na.rm=T)
        } else if (calc_seg_baf_option==2 | ! calc_seg_baf_option %in% c(1,2)) {
          new_entry$BAF = mean(c(bafsegmented$BAFphased[bafsegmented$Chromosome==subclones$chr[i-1] & bafsegmented$Position>=subclones$startpos[i-1] & bafsegmented$Position<=subclones$endpos[i-1]],
                                 bafsegmented$BAFphased[bafsegmented$Chromosome==subclones$chr[i] & bafsegmented$Position>=subclones$startpos[i] & bafsegmented$Position<=subclones$endpos[i]]), na.rm=T)
        }
        new_entry$LogR = mean(c(logR[logR$Chromosome==subclones$chr[i-1] & logR$Position>=subclones$startpos[i-1] & logR$Position<=subclones$endpos[i-1], 3],
                                logR[logR$Chromosome==subclones$chr[i] & logR$Position>=subclones$startpos[i] & logR$Position<=subclones$endpos[i], 3]), na.rm=T)
        subclones_cleaned = rbind(subclones_cleaned, new_entry)

        # Set BAFseg to reflect the current segment
        bafsegmented$BAFseg[bafsegmented$Chromosome==subclones$chr[i-1] & bafsegmented$Position>=subclones$startpos[i-1] & bafsegmented$Position<=subclones$endpos[i]] = new_entry$BAF
        # TODO Update the logRseg as well

        previous_merged = T
        merged = T
        next
      }

      # No other criteria met, therefore keep the segment
      subclones_cleaned = rbind(subclones_cleaned, subclones[i-1,])

      if (i==nrow(subclones)) {
        # Last segment not merged, so save that too
        subclones_cleaned = rbind(subclones_cleaned, subclones[i,])
      }
    }
    subclones = subclones_cleaned
    subclones_cleaned = data.frame()
    counter = counter+1
  }
  return(list(bafsegmented=bafsegmented, subclones=subclones))
}

#' Mask segments that have a too high CN state
#' @param subclones Subclones output data
#' @param bafsegmented BAFsegmented data
#' @param max_allowed_state The maximum state allowed before overruling takes place
#' @return A list with the masked subclones, bafsegmented and the number of segments masked and their total genome size
#' @author sd11
mask_high_cn_segments = function(subclones, bafsegmented, max_allowed_state) {
  count = 0
  masked_size = 0
  for (i in 1:nrow(subclones)) {
    if (subclones$nMaj1_A[i] > max_allowed_state | subclones$nMin1_A[i] > max_allowed_state) {
      # Mask this segment
      subclones[i, "nMaj1_A"] = NA
      subclones[i, "nMin1_A"] = NA
      subclones[i, "nMaj2_A"] = NA
      subclones[i, "nMin2_A"] = NA
      # Mask the BAFsegmented
      bafsegmented[subclones$chr[i] == bafsegmented$Chromosome & subclones$startpos[i] < bafsegmented$Position & subclones$endpos[i] >= bafsegmented$Position,c("BAFseg")] = NA
      count = count+1
      masked_size = masked_size + (subclones$endpos[i]-subclones$startpos[i])
    }
  }
  return(list(subclones=subclones, bafsegmented=bafsegmented, masked_count=count, masked_size=masked_size))
}


#' Plot the copy number genome wide in two different ways. This creates the Battenberg average
#' profile where subclonal copy number is represented as a mixture of two different states and
#' the Battenberg subclones profile where subclonal copy number is plotted as two different
#' separate states. The thickness of the line represents the fraction of tumour cells carying
#' the particular state.
#' @noRd
plot.gw.subclonal.cn = function(subclones, BAFvals, rho, ploidy, goodness, output.gw.figures.prefix, chr.names, tumourname) {
  # Map start and end of each segment into the BAF values. The plot uses the index of this BAF table as x-axis
  pos_min = array(NA, nrow(subclones))
  pos_max = array(NA, nrow(subclones))
  for (i in 1:nrow(subclones)) {
    segm_chr = subclones$chr[i] == BAFvals$Chromosome & subclones$startpos[i] < BAFvals$Position & subclones$endpos[i] >= BAFvals$Position
    pos_min[i] = min(which(segm_chr))
    pos_max[i] = max(which(segm_chr))
  }

  # For those segments that are subclonal, Obtain the second state.
  is_subclonal = which(subclones$frac1_A < 1)
  subcl_min = array(NA, length(is_subclonal))
  subcl_max = array(NA, length(is_subclonal))
  for (i in 1:length(is_subclonal)) {
    segment_index = is_subclonal[i]
    segm_chr = subclones$chr[segment_index] == BAFvals$Chromosome & subclones$startpos[segment_index] < BAFvals$Position & subclones$endpos[segment_index] >= BAFvals$Position
    subcl_min[i] = min(which(segm_chr))
    subcl_max[i] = max(which(segm_chr))
  }

  # Determine whether it's the major or the minor allele that is represented by two states
  is_subclonal_maj = abs(subclones$nMaj1_A - subclones$nMaj2_A) > 0
  is_subclonal_min = abs(subclones$nMin1_A - subclones$nMin2_A) > 0
  is_subclonal_maj[is.na(is_subclonal_maj)] = F
  is_subclonal_min[is.na(is_subclonal_min)] = F

  # BB represents subclonal CN as a mixture of two CN states. Calculate this mixture for both minor allele and total CN.
  #segment_states_min = subclones$nMin1_A * ifelse(is_subclonal_min, subclones$frac1_A, 1)  + ifelse(is_subclonal_min, subclones$nMin2_A, 0) * ifelse(is_subclonal_min, subclones$frac2_A, 0)
  #segment_states_tot = (subclones$nMaj1_A+subclones$nMin1_A) * ifelse(is_subclonal_maj, subclones$frac1_A, 1) + ifelse(is_subclonal_maj, subclones$nMaj2_A+subclones$nMin2_A, 0) * ifelse(is_subclonal_maj, subclones$frac2_A, 0)

  segment_states_min = subclones$nMin1_A * ifelse(is_subclonal_min, subclones$frac1_A, 1)  + ifelse(is_subclonal_min, subclones$nMin2_A, 0) * ifelse(is_subclonal_min, subclones$frac2_A, 0)
  segment_states_maj = subclones$nMaj1_A * ifelse(is_subclonal_maj, subclones$frac1_A, 1)  + ifelse(is_subclonal_maj, subclones$nMaj2_A, 0) * ifelse(is_subclonal_maj, subclones$frac2_A, 0)
  segment_states_tot = segment_states_maj + segment_states_min

  # Determine which SNPs are on which chromosome, to be used as a proxy for chromosome size in the plots
  chr.segs = lapply(1:length(chr.names), function(ch) { which(BAFvals$Chromosome==chr.names[ch]) })

  # Plot subclonal copy number as mixtures of two states
  png(filename = paste(output.gw.figures.prefix, "_average.png", sep=""), width = 2000, height = 500, res = 200)
  create.bb.plot.average(bafsegmented=BAFvals,
                            ploidy=ploidy,
                            rho=rho,
                            goodnessOfFit=goodness,
                            pos_min=pos_min,
                            pos_max=pos_max,
                            segment_states_min=segment_states_min,
                            segment_states_tot=segment_states_tot,
                            chr.segs=chr.segs,
                            chr.names=chr.names,
			    tumourname=tumourname)
  dev.off()

  # Plot subclonal copy number as two separate states
  png(filename = paste(output.gw.figures.prefix, "_subclones.png", sep=""), width = 2000, height = 500, res = 200)
  create.bb.plot.subclones(bafsegmented=BAFvals,
                             subclones=subclones,
                             ploidy=ploidy,
                             rho=rho,
                             goodnessOfFit=goodness,
                             pos_min=pos_min,
                             pos_max=pos_max,
                             subcl_min=subcl_min,
                             subcl_max=subcl_max,
                             is_subclonal=is_subclonal,
                  			     is_subclonal_maj=is_subclonal_maj,
                  			     is_subclonal_min=is_subclonal_min,
                  			     chr.segs=chr.segs,
                  			     chr.names=chr.names,
			     tumourname=tumourname)
  dev.off()
}

#' Load the rho and psi estimates from a file.
#' @noRd
load.rho.psi.file = function(rho.psi.file) {
  rho_psi_info = read.table(rho.psi.file, header=T, sep="\t", stringsAsFactors=F)
  # Always use best solution from grid search - reference segment sometimes gives strange results
  rho = rho_psi_info$rho[rownames(rho_psi_info)=="FRAC_GENOME"] # rho = tumour percentage (called tp in previous versions)
  psit = rho_psi_info$psi[rownames(rho_psi_info)=="FRAC_GENOME"] # psi of tumour cells
  goodness = rho_psi_info$distance[rownames(rho_psi_info)=="FRAC_GENOME"] # goodness of fit
  return(list(rho=rho, psit=psit, goodness=goodness))
}

#' Collapse a BAFsegmented file into segment start and end points
#'
#' This function looks through the BAFsegmented for stretches of equal
#' BAFseg and records the start and end coordinates in a data.frame
#' @param bafsegmented The BAFsegmented output from segmentation
#' @return A data.frame with columns chromosome, start and end
#' @author sd11
#' @noRd
collapse_bafsegmented_to_segments = function(bafsegmented) {
  segments_collapsed = data.frame()
  for (chrom in unique(bafsegmented$Chromosome)) {
    bafsegmented_chrom = bafsegmented[bafsegmented$Chromosome==chrom,]
    segments = rle(bafsegmented_chrom$BAFseg)
    startpoint = 1
    for (i in 1:length(segments$lengths)) {
      endpoint = startpoint+segments$lengths[i]-1
      segments_collapsed = rbind(segments_collapsed,
                                 data.frame(chromosome=chrom, start=bafsegmented_chrom$Position[startpoint], end=bafsegmented_chrom$Position[endpoint]))
      startpoint = endpoint+1
    }
  }
  return(segments_collapsed)
}

#' Function to make additional figures
#'
#' @param samplename Name of the sample for the plot title
#' @param logr_file File containing all logR data
#' @param subclones_file File with the copy number fit
#' @param rho_psi_file File with rho and psi parameters
#' @param bafsegmented_file File containing the BAFsegmented data
#' @param logrsegmented_file File with the logRsegmented data
#' @param allelecounts_file Optional file with raw allele counts (Default: NULL)
#' @author sd11
#' @export
make_posthoc_plots = function(samplename, logr_file, subclones_file, rho_psi_file, bafsegmented_file, logrsegmented_file, allelecounts_file=NULL) {
  # Make some post-hoc plots
  logr = Battenberg::read_table_generic(logr_file)
  subclones = Battenberg::read_table_generic(subclones_file)
  rho_psi = read.table(rho_psi_file, header=T, stringsAsFactors=F)
  purity = rho_psi["FRAC_GENOME", "rho"]
  totalcn_chrom_plot(samplename, subclones, logr, paste0(samplename, "_totalcn_chrom_plot.png"), purity)

  bafsegmented = as.data.frame(Battenberg::read_table_generic(bafsegmented_file))
  logrsegmented = as.data.frame(Battenberg::read_table_generic(logrsegmented_file, header=F))
  colnames(logrsegmented) = c("Chromosome", "Position", "logRseg")
  outputfile = paste0(samplename, "_alleleratio.png")
  allele_ratio_plot(samplename=samplename, logr=logr, bafsegmented=bafsegmented, logrsegmented=logrsegmented, outputfile=outputfile, max.plot.cn=8)

  if (!is.null(allelecounts_file)) {
    allelecounts = as.data.frame(Battenberg::read_table_generic(allelecounts_file))
    outputfile = paste0(samplename, "_coverage.png")
    coverage_plot(samplename, allelecounts, outputfile)
  }
}


#' Fit ChrX subclonal copy number (male only)
#'
#' Function to call ChrX copy number based on LogR (suitable for male samples). Copy number
#' cannot be called for the non-PAR region of ChrX due to the hemizygosity of all 1000G SNPs.
#' This function enables calling subclonal copy number for the non-PAR region by segmenting LogR.
#' A number of correction steps are undertaken to account for the noisy nature of LogR. This function
#' requires the following libraries: copynumber, data.table and ggplot2. It reads in three files generated
#' by previous steps of Battenberg, namely TUMOURNAME_mutantLogR_gcCorrected.tab, TUMOURNAME_purity_ploidy.txt
#' and TUMOURNAME_subclones.txt.
#' @param TUMOURNAME The tumour name used for Battenberg (i.e. the tumour BAM file name without the .bam extension)
#' @param X_GAMMA The PCF gamma value for segmentation of 1000G SNP LogR values (Default 1000)
#' @param X_KMIN The min number of SNPs to support a segment in PCF of LogR values (Default 100)
#' @param AR Should the segment carrying the androgen receptor (AR) locus to be visually distinguished in average plot? (Default TRUE)
#' @param GENOMEBUILD The genome build used in running Battenberg (hg19 or hg38)
#' @author Naser Ansari-Pour (BDI, Oxford)
#' @export
callChrXsubclones = function(TUMOURNAME,X_GAMMA=1000,X_KMIN=100,GENOMEBUILD,AR=TRUE){
print(TUMOURNAME)
PCFinput=data.frame(read_table_generic(paste0(TUMOURNAME,"_mutantLogR_gcCorrected.tab")),stringsAsFactors=F)
PCFinput=PCFinput[which(PCFinput$Chromosome=="X" & PCFinput$Position>2.6e6 & PCFinput$Position<156e6),] # get nonPAR
colnames(PCFinput)[3]=TUMOURNAME
print(paste("Number of chrX nonPAR SNPs =",nrow(PCFinput)))
PCF=copynumber::pcf(PCFinput,gamma=X_GAMMA,kmin=X_KMIN)
write.table(PCF,paste0(TUMOURNAME,"_PCF_gamma_",X_GAMMA,"_chrX.txt"),col.names=T,row.names=F,quote=F,sep="\t")
print("PCF segmentation done")

if (GENOMEBUILD=="hg19"){
x_centromere=c(58632012,61632012) # hg19
ar=data.frame(startpos=66763874,endpos=66950461)
} else {
  x_centromere=c(58605580,62412542) #hg38
  ar=data.frame(startpos=67544021,endpos=67730619)
}

# INPUT for copy number inference
SAMPLEsegs=data.frame(PCF,stringsAsFactors=F)
pupl=read.table(paste0(TUMOURNAME,"_purity_ploidy.txt"),header=T,stringsAsFactors=F)
SAMPLEpurity=pupl$purity
#SAMPLEploidy=round(pupl$ploidy/2)*2
SAMPLEn=pupl$ploidy
print(paste(SAMPLEpurity,SAMPLEn))

# Estimating LogR deviation in diploid and gained regions (AUTOSOMAL)
BB=read.table(paste0(TUMOURNAME,"_subclones.txt"),header=T,stringsAsFactors = F)

BBdip=BB[which(BB$nMaj1_A==1 & BB$nMin1_A==1 & BB$frac1_A==1),]
# correction for LogR values
BBcorr=-mean(BBdip$LogR) #diploid only
if (nrow(BBdip)<=1){
print("likely WGD sample")
BBdip=BB[which(BB$nMaj1_A==2 & BB$nMin1_A==2 & BB$frac1_A==1),]
cnloh=BB[which(BB$nMaj1_A==2 & BB$nMin1_A==0 & BB$frac1_A==1),]
if (nrow(cnloh)>0){
BBcorr=-mean(cnloh$LogR)
} else if (nrow(cnloh)==0){
print("CRUDE estimation of BBcorr based on assumption of 2 copies vs ploidy")
BBcorr=-log2(2/SAMPLEn)
}
}
BBg1=BB[which(BB$nMaj1_A==2 & BB$nMin1_A==1 & BB$frac1_A==1),]
BBg2=BB[which(BB$nMaj1_A==3 & BB$nMin1_A==1 & BB$frac1_A==1),]
BBg3=BB[which(BB$nMaj1_A==4 & BB$nMin1_A==1 & BB$frac1_A==1),]
BBg4=BB[which(BB$nMaj1_A==3 & BB$nMin1_A==2 & BB$frac1_A==1),] # likely observed in WGD samples

# get max gain N:
BBcomb=rbind(BBdip,BBg1,BBg2,BBg3,BBg4)
maxNMaj=max(BBcomb$nMaj1_A)

# SD for LogR values - diploid and gain regions
BBsd=c(sd(BBdip$LogR),sd(BBg1$LogR),sd(BBg2$LogR),sd(BBg3$LogR))
#BBsd_mean=mean(BBsd,na.rm=T)
BBsd_max=max(BBsd, na.rm=T)
BBsd_max=max(BBsd_max,0.05) # accept a minimum of 5% sd in LogR variation

# BB LOH - estimating sd for LOH/loss events
BBloh=BB[which(BB$nMaj1_A==1 & BB$nMin1_A==0 & BB$frac1_A==1),]
if (nrow(BBloh)<=1){ #sd would be NA
print("likely WGD sample or no clonal LOH event")
BBloh=BB[which(BB$nMin1_A==0 & BB$frac1_A==1),] # all LOH events with varying nMaj1_A including 2:0 events
}

# expected ChrX logR values
explogrgainX=function(x){log2((SAMPLEpurity*x+(1-SAMPLEpurity)*1)/1)}
explogrGain=sapply(2:10000,explogrgainX) # up to 10000 copies!

explogrLoss=max(log2(0+(1-SAMPLEpurity)*1),log2(0.01)) # if purity ~ 1, then purity of 0.99 is assumed for a realistic explogR estimate

# assign CN
SEG=data.frame()
for (j in 1:nrow(SAMPLEsegs)){
  seg=SAMPLEsegs[j,]
  seg$type=ifelse(seg$mean<0,"loss","gain")
  
  # is segment different from zero?
  seg$mean=seg$mean+BBcorr
  
  if (seg$type=="gain"){
    seg$CNA=ifelse(seg$mean>(0+1.96*BBsd_max),"yes","no")
  } else {
    seg$CNA=ifelse(seg$mean<(0-1.96*BBsd_max),"yes","no")
  }
  # copy number
  if (seg$CNA=="yes"){
    if (seg$type=="gain"){
      rank=which(sort(c(explogrGain,seg$mean))==seg$mean) # rank of observed logR mean for segment among the expected logR values
      seg$CN=rank+1
      # clonality test
      if (rank==1){
        seg$clonal=ifelse(round(explogrGain[rank]-seg$mean,digits=2)<=round((BBsd_max/explogrGain[rank]),digits=2),"yes","no") # CV
        
      } else if (rank>=5){ # STOPS calling 'subclonal' events when copy number is >=5
        if (abs(seg$mean-explogrGain[rank-1])<abs(seg$mean-explogrGain[rank])){
          
          seg$clonal="yes"
          seg$CN=seg$CN-1
        } else {
          seg$clonal="yes"
        }
      } else {
        if (abs(seg$mean-explogrGain[rank-1])<abs(seg$mean-explogrGain[rank])){
          
          seg$clonal=ifelse(round(seg$mean-explogrGain[rank-1],digits = 2)<=round((BBsd_max/explogrGain[rank-1]),digits=2),"yes","no")
          if (seg$clonal=="yes"){
            seg$CN=seg$CN-1
          }
        }
        else {
          seg$clonal=ifelse(round(explogrGain[rank]-seg$mean,digits=2)<round((BBsd_max/explogrGain[rank]),digits = 2),"yes","no")
        }
      }
    } else if (seg$type=="loss"){
      seg$CN=0
      if (nrow(BBloh)>0){
        seg$clonal=ifelse(round(abs(explogrLoss-seg$mean),digits=2)<round(abs(sd(BBloh$LogR)/explogrLoss),digits=2),"yes","no")
      } else if (nrow(BBloh)==0){
        seg$clonal=ifelse(round(abs(explogrLoss-seg$mean),digits=2)<round(abs(BBsd_max/explogrLoss),digits=2),"yes","no")
      }
    }
  }
  else {
    seg$CN=1
    seg$clonal=NA
    print(paste("no CNA for segment",j))
  }
  if (seg$arm=="p" & seg$end.pos>x_centromere[1]-1e6 & seg$CNA=="yes" & seg$end.pos<seg$start.pos+1e6){
    print("segment is p-arm centromere noise")
    print(seg)
  } else if (seg$arm=="q" & seg$end.pos<x_centromere[2]+1e6 & seg$CNA=="yes" & seg$end.pos<seg$start.pos+1e6){
    print("segment is q-arm centromere noise")
    print(seg)
  } else {
    SEG=rbind(SEG,seg)
  }
}

# CALCULATE CCF 
CCF=data.frame()
for (j in 1:nrow(SEG)){
    seg=SEG[j,]
  if (seg$CNA=="yes"){
  if (seg$type=="gain"){
    if (seg$clonal=="no"){
    seg$CCF=(2^seg$mean-(SAMPLEpurity*(seg$CN-1)+(1-SAMPLEpurity)*1))/SAMPLEpurity 
    } else {
    seg$CCF=1
  }
  } else if (seg$type=="loss"){
    if (seg$clonal=="no"){
    seg$CCF=(1-2^(seg$mean))/SAMPLEpurity # for Loss (assuming one chrX in all cells prior to Loss)
      if (seg$CCF>=0.95){
      seg$CCF=1
      seg$clonal="yes"
    }
    } else {
    seg$CCF=1
    }
  }
  } else {
    seg$CCF=1
  }
  CCF=rbind(CCF,seg)
}

# GENERATE FINAL OUTPUT
SUBCLONES=data.frame()
  for (j in 1:nrow(CCF)){
    subclones=CCF[j,]
    if (subclones$CNA=="no"){
      subclones=data.frame(subclones,nMaj1=1,nMin1=0,frac1=1,nMaj2=0,nMin2=0,frac2=0)
    } else {
      if (subclones$type=="gain" & subclones$clonal=="yes"){
        subclones=data.frame(subclones,nMaj1=subclones$CN,nMin1=0,frac1=1,nMaj2=0,nMin2=0,frac2=0)  
      }
      else if (subclones$type=="gain" & subclones$clonal=="no"){
        if(subclones$CCF>0.5){ # switch nMaj/nMin so that the first nMaj/nMin represent the MAJOR CLONE
        subclones=data.frame(subclones,nMaj1=subclones$CN,nMin1=0,frac1=subclones$CCF,nMaj2=subclones$CN-1,nMin2=0,frac2=1-subclones$CCF)
        } else {
          subclones=data.frame(subclones,nMaj1=subclones$CN-1,nMin1=0,frac1=1-subclones$CCF,nMaj2=subclones$CN,nMin2=0,frac2=subclones$CCF)  
        }
      }
      else if (subclones$type=="loss" & subclones$clonal=="yes"){
        subclones=data.frame(subclones,nMaj1=subclones$CN,nMin1=0,frac1=1,nMaj2=0,nMin2=0,frac2=0) # very unlikely scenario; no sequencing reads should be present!
      }
      else if (subclones$type=="loss" & subclones$clonal=="no"){
        if(subclones$CCF>0.5){ # switch nMaj/nMin so that the first nMaj/nMin represent the MAJOR CLONE
        subclones=data.frame(subclones,nMaj1=subclones$CN,nMin1=0,frac1=subclones$CCF,nMaj2=1,nMin2=0,frac2=1-subclones$CCF)
        } else {
          subclones=data.frame(subclones,nMaj1=1,nMin1=0,frac1=1-subclones$CCF,nMaj2=subclones$CN,nMin2=0,frac2=subclones$CCF)
      }
    }
    }
    print(j)
    SUBCLONES=rbind(SUBCLONES,subclones)
}

SUBCLONES$average=(SUBCLONES$nMaj1+SUBCLONES$nMin1)*SUBCLONES$frac1+(SUBCLONES$nMaj2+SUBCLONES$nMin2)*SUBCLONES$frac2

SUBCLONESout=data.frame(SUBCLONES[,c("chrom","arm")],startpos=SUBCLONES$start.pos,endpos=SUBCLONES$end.pos,nSNPs=SUBCLONES$n.probes,
                      LogR=SUBCLONES$mean,SUBCLONES[,c("type","CNA","CN","clonal","nMaj1","nMin1","frac1","nMaj2","nMin2","frac2")],
                      subclonalCN=SUBCLONES$average,stringsAsFactors = F)
SUBCLONESout$type[SUBCLONESout$type=="gain"]="+ve"
SUBCLONESout$type[SUBCLONESout$type=="loss"]="-ve"

# merge adjacent segments with same copy number  
  SUBCLONESout$rank=1:nrow(SUBCLONESout)
  SUBCLONESout=SUBCLONESout[order(SUBCLONESout$subclonalCN),]
  
  SPLIT=split(SUBCLONESout$rank, cumsum(c(1, diff(SUBCLONESout$rank) != 1))) # find consecutive segments with same subclonalCN
  outputDF=data.frame()
  for (j in 1:length(SPLIT)){
    if (length(SPLIT[[j]])>1){
      print(length(SPLIT[[j]]))
      SUBsplit=SUBCLONESout[which(!is.na(match(SUBCLONESout$rank,SPLIT[[j]]))),]
      if (length(unique(SUBsplit$arm))==1){
      if (sd(SUBsplit$subclonalCN)<=0.01){
        mergedseg=SUBsplit[1,]
        mergedseg$endpos=SUBsplit[length(SPLIT[[j]]),"endpos"]
        mergedseg$nSNPs=sum(SUBsplit$nSNPs)
        mergedseg$LogR=weighted.mean(SUBsplit$LogR,SUBsplit$nSNPs)
        outputDF=rbind(outputDF,mergedseg)
      } else {
        outputDF=rbind(outputDF,SUBsplit)
        print("adjacent not same subclonalCN in SPLIT")
        }
      } else if (length(SPLIT[[j]])==2){
      outputDF=rbind(outputDF,SUBsplit)
      } else{
        # if (length(SUBsplit$arm=="p"))
     pseg=SUBsplit[SUBsplit$arm=="p",]
     if (nrow(pseg)>1){
       if (sd(pseg$subclonalCN)<=0.01){
         mergedseg=pseg[1,]
         mergedseg$endpos=pseg[nrow(pseg),"endpos"]
         mergedseg$nSNPs=sum(pseg$nSNPs)
         mergedseg$LogR=weighted.mean(pseg$LogR,pseg$nSNPs)
         outputDF=rbind(outputDF,mergedseg)
       } else {
         outputDF=rbind(outputDF,pseg)
         print("adjacent not same subclonalCN in pseg")
         }
     } else {outputDF=rbind(outputDF,pseg)}
     qseg=SUBsplit[SUBsplit$arm=="q",]
     if (nrow(qseg)>1){
       if (sd(qseg$subclonalCN)<=0.01){
         mergedseg=qseg[1,]
         mergedseg$endpos=qseg[nrow(qseg),"endpos"]
         mergedseg$nSNPs=sum(qseg$nSNPs)
         mergedseg$LogR=weighted.mean(qseg$LogR,qseg$nSNPs)
         outputDF=rbind(outputDF,mergedseg)
       } else {
         outputDF=rbind(outputDF,qseg)
         print("adjacent not same subclonalCN in qseg")
         }
     } else {outputDF=rbind(outputDF,qseg)}
    }
    }  else {
      SUBsplit=SUBCLONESout[which(SUBCLONESout$rank==SPLIT[[j]]),]
      outputDF=rbind(outputDF,SUBsplit)
    }
  }
  outputDF=outputDF[order(outputDF$startpos),]
  outputDF$rank=NULL

print(paste("Number of rows merged =",nrow(SUBCLONESout)-nrow(outputDF)))
write.table(outputDF,paste0(TUMOURNAME,"_chrX_subclones.txt"),col.names = T,row.names = F,quote = F,sep="\t")

# PLOT
outputDF$diff=outputDF$endpos-outputDF$startpos
PGAclonal=sum(outputDF[which(outputDF$clonal=="yes"),]$diff)/sum(outputDF[which(!is.na(outputDF$clonal)),]$diff)


plot_BB=ggplot()+geom_hline(yintercept = 0:ceiling(max(outputDF$subclonalCN)),linetype="longdash",col="grey",size=0.2)+
  geom_rect(data=outputDF,aes(xmin=startpos,xmax=endpos,ymin=subclonalCN-0.02,ymax=subclonalCN+0.02))+
  geom_vline(xintercept = x_centromere,linetype="longdash",col="green")+
  #geom_hline(yintercept = nonpar,linetype="dotted",col="blue")+
  ylim(-0.2,ceiling(max(outputDF$subclonalCN))+0.2)+labs(x="ChrX coordinate (bp)",y="Average Ploidy")+
  theme(plot.title = element_text(hjust = 0.5,size=12),panel.background = element_blank())+
  ggtitle(paste0(TUMOURNAME," , PLOIDY: ",round(SAMPLEn,digits = 3)," , PURITY: ",round(SAMPLEpurity*100,digits = 0),
                 "%, PGAclonal: ",round(PGAclonal*100,digits = 1),"%"))

# ANDROGEN RECEPTOR LOCUS
if (AR){
  data.table::setDT(ar)
  data.table::setkey(ar,"startpos","endpos")
  data.table::setDT(outputDF)
  data.table::setkey(outputDF,"startpos","endpos")
  segAR=data.table::foverlaps(ar,outputDF,type="any",nomatch = 0)
  segAR$subclonalCN=(segAR$nMaj1+segAR$nMin1)*segAR$frac1+(segAR$nMaj2+segAR$nMin2)*segAR$frac2
  plot_BB=plot_BB+geom_rect(data=segAR,aes(xmin=startpos,xmax=endpos,ymin=subclonalCN-0.02,ymax=subclonalCN+0.02),fill="red")
}

pdf(paste0(TUMOURNAME,"_chrX_average_ploidy.pdf"))
print(plot_BB)
dev.off()

}
Wedge-Oxford/battenberg documentation built on Aug. 4, 2023, 6:27 p.m.