R/QualityControl_CNAqc.R

Defines functions qc_CNAqc larger.all.later.numbers get.worst.filter estimate.dpclust.purity switch.ploidy.get.ploidy switch.ploidy.get.purity estimate.new.ploidy log.message pastedot pasteu

# complete QC of Battenberg output
# written by Anna Frangou and Alex J. Cornish
#
# usage:
# Rscript QC.R [RUN] [DIR_BATTENBERG] [SUBDIR_CALLSUBCLONES] [SUBDIR_POSTPROCESSING] [SUBDIR_DPCLUST] [SUBDIR_VAFPEAKS] [FILENAME_SAMPLELIST] [FILENAME_CONFIG] [FILENAME_CHRSIZES] [FILENAME_QC]
#
#### subroutines ####
pasteu <- function(...) paste(..., sep="_")

pastedot <- function(...) paste(..., sep=".")

log.message <- function(x, level=0) message(paste0("[", Sys.time(), "] ", paste(rep(" ", times=level * 4), collapse=""), "- ", x))

# estimate.new.ploidy <- function(rho.old, psi.old, rho.new) (rho.old * (psi.old - 2) + 2 * rho.new) / rho.new # from eq. S5, Van Loo et al. (PNAS, 2010)

estimate.new.ploidy <- function(rho.old, psi.old, rho.new, dip.or.tetra) {
  if (dip.or.tetra == "tetra") {ploidytype = "tetra"} else {ploidytype = "dip"}
  if (ploidytype=="dip") {psi.new = ((rho.old * psi.old) + 2*(rho.new - rho.old)) / rho.new}
  if (ploidytype=="tetra") {psi.new = ((rho.old * psi.old) + 4*(rho.new - rho.old)) / rho.new}
  return(psi.new)
}

switch.ploidy.get.purity <- function(rho.old,psi.old,dip.or.tetra) {
  if (dip.or.tetra == "tetra") {ploidytype = "tetra"} else {ploidytype = "dip"}
  if (ploidytype == "tetra") {rho.new = 2*rho.old / (rho.old+1)}
  if (ploidytype == "dip")  {rho.new = rho.old / (2-rho.old)}
  return(rho.new)
}

switch.ploidy.get.ploidy <- function(rho.old,psi.old,dip.or.tetra) {
  if (dip.or.tetra == "tetra") {ploidytype = "tetra"} else {ploidytype = "dip"}
  if (ploidytype == "tetra") {psi.new = psi.old/2}
  if (ploidytype == "dip")  {psi.new = psi.old*2}
  return(psi.new)
}

estimate.dpclust.purity <- function(clusters, rho.old) {
  # identify clonal cluster as cluster with highest CCF (noisy clusters should have already been removed)
  # reestimate purity using position of this clonal cluster
  # if this new purity is >1, exclude cluster and retry
  # if purity >1 after all clusters considered, return old purity
  if (rho.old > 1) stop("old rho should be <1")
  consider <- rep(T, nrow(clusters))
  rho.new <- Inf
  while (rho.new > 1) {
    if (sum(consider)) {
      # select cluster with greatest CCF to compute new purity
      clonalcluster <- which(clusters$location == max(clusters$location[consider]))[1]
      rho.new <- rho.old * clusters$location[clonalcluster] # compute new purity
      consider[clonalcluster] <- F # ensure this cluster is not considered in any later rounds
    } else {
      # if rho.new is >1 and there are no more clusters to consider, return old purity
      warning("failed to compute new purity")
      rho.new <- rho.old
    }
  }
  rho.new
}

get.worst.filter <- function(filter.list) {
  # identify the worst filter result from a list of filter results (FAIL worse than FLAG worse than PASS)
  worst.filter <- "PASS"
  if ("FLAG" %in% filter.list) worst.filter <- "FLAG"
  if ("FAIL" %in% filter.list) worst.filter <- "FAIL"
  worst.filter
}

larger.all.later.numbers <- function(x) sapply(1:length(x), function(i) ifelse(i == length(x), T, x[i] > max(x[(i+1):length(x)])))

qc_CNAqc <- function(
  run,
  dir.battenberg,
  subdir.callsubclones,
  subdir.postprocessing,
  subdir.dpclust,
  subdir.vafpeaks,
  filename.samplelist,
  filename.config,
  filename.chr.sizes,
  filename.qc
) {
  # complete QC

  # required libraries
  library(stringr)

  source(filename.config) # source config file
  time.pretty <- str_replace_all(str_replace_all(Sys.time(), " ", "_"), ":", "-")
  run.name <- paste0("run", run) # name of this run

  log.message("reading in data")
  chr.sizes.raw <- read.delim(filename.chr.sizes, sep="\t", header=F, stringsAsFactors=F)
  chr.sizes <- structure(chr.sizes.raw[[2]], names=chr.sizes.raw[[1]])
  samplelist <- read.delim(filename.samplelist, sep="\t", header=T, stringsAsFactors=F)
  print(rownames(samplelist))
  rownames(samplelist) <- ids <- paste0("tumo", samplelist$tumour_sample_platekey, "_norm", samplelist$germline_sample_platekey)
  print(ids)

  # directories and filenames
  filenames.battenberg.segs <- sapply(ids, function(id) file.path(
    dir.battenberg,"intermediateFiles",samplelist[id, "participant_id"],id,
    subdir.postprocessing,
    pasteu(samplelist[id, "tumour_sample_platekey"], "subclones.txt")))
  filenames.battenberg.purity <- sapply(ids, function(id) file.path(
    dir.battenberg,"intermediateFiles",samplelist[id, "participant_id"],id,
    subdir.callsubclones,
    pasteu(samplelist[id, "tumour_sample_platekey"], "cellularity_ploidy.txt")))
  filenames.battenberg.plot <- sapply(ids, function(id) file.path(
    dir.battenberg,"intermediateFiles",samplelist[id, "participant_id"],id,
    subdir.callsubclones,
    pasteu(samplelist[id, "tumour_sample_platekey"], "BattenbergProfile_average.pdf")))
  filenames.dpclust.optima <- sapply(ids, function(id) file.path(
    dir.battenberg,"intermediateFiles",samplelist[id, "participant_id"],id,
    subdir.dpclust,
    pasteu(samplelist[id, "tumour_sample_platekey"], "optimaInfo.txt")))
  filenames.peaks <- sapply(ids, function(id) file.path(
    dir.battenberg,"intermediateFiles",samplelist[id, "participant_id"],id,
    subdir.vafpeaks,
    pastedot(samplelist[id, "participant_id"], id, "peak_data.RData")))

  log.message("reading in sample-level data")
  # gets IDs for samples that are being QC'd
  ids <- ids[file.exists(filenames.battenberg.purity) & file.exists(filenames.battenberg.segs) & file.exists(filenames.dpclust.optima)]
  # reformat subclones file
  segs <- sapply(ids, function(id) read.delim(filenames.battenberg.segs[[id]], sep="\t", header=T, stringsAsFactors=F)[,1:13], simplify=F)
  # add 'chr'
  for (id in ids) segs[[id]]$chr <- paste0("chr", segs[[id]]$chr)
  # get battenberg purity from cellularity_ploidy file
  battenberg.purity <- do.call("rbind", sapply(ids, function(id) read.delim(filenames.battenberg.purity[id], sep="\t", header=T, stringsAsFactors=F), simplify=F))
  # get dpclust optimaInfo file
  dpclust <- sapply(ids, function(id) read.delim(filenames.dpclust.optima[id], sep="\t", header=T, stringsAsFactors=F), simplify=F)
  # calculate number of mutations for this sample that constitutes the threshold for superclonal clusters or 0.5 clusters during qc
  fivepcclusters <- sapply(dpclust,function(x) round(sum(x$no.of.mutations) * thres.propmuts.superclonal.or.tetra))

  # remove clusters smaller than thres.propmuts (% of total mutations)
  log.message("removing small clusters from DPClust optima")
  dpclust <- sapply(dpclust, function(x) x[x$no.of.mutations > (sum(x$no.of.mutations) * thres.propmuts), ], simplify=F)

  # read in or setup QC file - this contains info for all runs, for all samples on the sample list
  if (run > 1) {
    log.message("reading existing QC file")
    qc <- read.delim(filename.qc, sep="\t", header=T, stringsAsFactors=F)
    rownames(qc) <- qc$tumour_normal_id
  } else {
    log.message("creating QC file")
    qc <- samplelist[ids, c("participant_id", "sex", "tumour_sample_platekey", "germline_sample_platekey", "somatic_small_variants_qced_vcf")]
    qc$tumour_normal_id <- ids
  }

  # if not the first run, filter down to only deal with samples that failed previous runs
  if (sum(grepl(paste0(run.name, "_"), colnames(qc)))) warning("columns already found for run ", run)
  if (run > 1) ids <- ids[which(qc[ids, pasteu(paste0("run", run - 1), "filter_overallfilter")] == "FAIL" | 
                               qc[ids, pasteu(paste0("run", run - 1), "filter_overallfilter")] == "FLAG")] # identify samples not passing previous run
  if (length(ids) == 0) stop("no tumour-normal pairs to QC")

  # add purity and ploidy estimates from BB (cellularity file) and dpclust (optimaInfo) - dpclust ploidy reestimation comes later because we need dip/tetra classification first 
  log.message("add purity and ploidy estimates")
  qc[ids, pasteu(run.name, "battenberg_purity")] <- battenberg.purity[ids, ifelse("cellularity" %in% colnames(battenberg.purity),'cellularity','purity')] 
  qc[ids, pasteu(run.name, "battenberg_ploidy")] <- battenberg.purity[ids, "psi"]
  qc[ids, pasteu(run.name, "dpclust_purity")] <- sapply(ids, function(id) estimate.dpclust.purity(dpclust[[id]], qc[id, pasteu(run.name, "battenberg_purity")]))
  qc[ids, pasteu(run.name, "dpclust_ploidy")] <- NA # dpclust ploidy reestimation comes later because we need dip/tetra classification first - holding column in same place

  qc[ids, pasteu(run.name, "fivepc_cluster_size")] <- fivepcclusters[ids]
  for (str in c("vafpeaks_purity", "vafpeaks_ploidy", "vafpeaks_score", "reestimated_purity", "reestimated_ploidy")) qc[[pasteu(run.name, str)]] <- NA

  # compute statistics required for QC for each sample
  log.message("computing QC statistics")
  for (id in ids) {
    log.message(paste0("sample ", which(ids == id), "/", length(ids)), level=1)

    # add segment sizes, extract autosome segments and get segment copy number in major and minor subclones
    segs[[id]]$size <- segs[[id]]$endpos - segs[[id]]$startpos + 1
    segs.auto <- segs[[id]][segs[[id]]$chr %in% autosomes, ]
    # remove segments with missing major/minor copy number (should only be telomeres and maybe centromeres)
    segs.auto <- segs.auto[!is.na(segs.auto$nMaj1_A) & !is.na(segs.auto$nMin1_A), ]
    # put copy number of largest subclone first in the segs table
    flip.clones <- rep(F, nrow(segs.auto))
    consider.flip <- !is.na(segs.auto$frac2_A)
    flip.clones[consider.flip] <- segs.auto$frac1_A[consider.flip] < 0.5
    segs.auto[flip.clones, c("nMaj1_A", "nMin1_A", "frac1_A", "nMaj2_A", "nMin2_A", "frac2_A")] <- segs.auto[flip.clones, c("nMaj2_A", "nMin2_A", "frac2_A", "nMaj1_A", "nMin1_A", "frac1_A")]
    # total copy number for each subclone
    segs.auto$ctClone1 <- segs.auto$nMaj1_A + segs.auto$nMin1_A
    segs.auto$ctClone2 <- segs.auto$nMaj2_A + segs.auto$nMin2_A

    # identify failing chromosomes - ie those with less than 50% of the chromosome with a CNV call
    chrs.fail <- list()
    chrs.fail$chrmissing <- sort(setdiff(autosomes, segs.auto$chr))
    chrs.fail$chrsizeincorrect <- chrs[(sapply(autosomes, function(chr) sum(segs.auto$size[segs.auto$chr == chr])) / chr.sizes[autosomes]) < thres.chrsizeincorrect.tol]

    # add information on which chromosomes fail and number of failures
    for (condition in names(chrs.fail)) {
      qc[id, pasteu(run.name, "which", condition)] <- paste(chrs.fail[[condition]], collapse=";")
      qc[id, pasteu(run.name, "n", condition)] <- length(chrs.fail[[condition]])
    }

    # identify segments satisfying various conditions used to apply filters
    segs.conditions <- list()

    segs.conditions$all <- segs.auto # all segments, needs to be the first condition

    segs.conditions$copynumber11 <- segs.auto[
      (segs.auto$nMaj1_A == 1 & segs.auto$nMin1_A == 1) &
        segs.auto$frac1_A == 1,
      ] # copy number is 1:1 and there is no evidence of sub-clonal change

    segs.conditions$copynumber10 <- segs.auto[
      ((segs.auto$nMaj1_A == 1 & segs.auto$nMin1_A == 0) | (segs.auto$nMaj1_A == 0 & segs.auto$nMin1_A == 1)) &
        segs.auto$frac1_A == 1,
      ] # copy number is 1:0 and there is no evidence of sub-clonal change

    #minorCNround = round(segs.auto$nMin1_A*segs.auto$frac1_A+segs.auto$nMin2_A*segs.auto$frac2_A)
    segs.auto2 = segs.auto
    segs.auto2[is.na(segs.auto2)] = 0
    segs.conditions$minorCNzero <- segs.auto2[
      (which(round((segs.auto2$nMin1_A*segs.auto2$frac1_A)+(segs.auto2$nMin2_A*segs.auto2$frac2_A))==0)),
      ] # minor CN of 0 after rounding copy number, allows us to define a sample as diploid or tetraploid

    segs.conditions$copynumber21 <- segs.auto[
      ((segs.auto$nMaj1_A == 2 & segs.auto$nMin1_A == 1) | (segs.auto$nMaj1_A == 1 & segs.auto$nMin1_A == 2)) &
        segs.auto$frac1_A == 1,
      ] # copy number is 2:1 and there is no evidence of sub-clonal change

    segs.conditions$copynumber22 <- segs.auto[
      (segs.auto$nMaj1_A == 2 & segs.auto$nMin1_A == 2) &
        segs.auto$frac1_A == 1,
      ] # copy number is 2:2 and there is no evidence of sub-clonal change

    segs.conditions$copynumber32 <- segs.auto[
      ((segs.auto$nMaj1_A == 3 & segs.auto$nMin1_A == 2) | (segs.auto$nMaj1_A == 2 & segs.auto$nMin1_A == 3)) &
        segs.auto$frac1_A == 1,
      ] # copy number is 3:2 and there is no evidence of sub-clonal change

                                               
    segs.auto.tmp <- segs.auto # tmp copy of segs.auto 
    segs.auto.tmp$ctClone1 <- segs.auto.tmp$nMaj1_A + segs.auto.tmp$nMin1_A
    segs.auto.tmp$ctClone2 <- segs.auto.tmp$nMaj2_A + segs.auto.tmp$nMin2_A                                           
    
    # remove all subclonal homdels < 20% from segs.tmp.auto                                           
    toremove <- which((segs.auto.tmp$nMaj1_A == 0 & segs.auto.tmp$frac1_A <= 0.2) | ((!(is.na(segs.auto.tmp$nMaj2_A) | is.na(segs.auto.tmp$frac2_A))) & segs.auto.tmp$nMaj2_A == 0 & segs.auto.tmp$frac2_A <= 0.2))
    if (length(toremove) > 0) { segs.auto.tmp <- segs.auto.tmp[-toremove,] }
                                               
    segs.conditions$homodelall <- segs.auto.tmp[
      segs.auto.tmp$ctClone1 == 0 |
        sapply(as.double(segs.auto.tmp$ctClone2), identical, 0.0),
      ] # homozygous deletion of either major or (if evidence of sub-clonality > 20%) minor clone
                                              
    segs.conditions$homodelallclonal <- segs.auto.tmp[
      which(segs.auto.tmp$nMaj1_A == 0 & segs.auto.tmp$nMin1_A==0 & segs.auto.tmp$frac1_A == 1),
     ] # clonal homdels     

    segs.conditions$homodelallsubclonal <- segs.auto.tmp[
      which((segs.auto.tmp$nMaj1_A == 0 & segs.auto.tmp$nMin1_A==0 & segs.auto.tmp$frac1_A != 1) | 
      (segs.auto.tmp$nMaj2_A == 0 & segs.auto.tmp$nMin2_A==0)),
     ] # subclonal homdels (only those > 20%)    
                                               
    segs.conditions$homodellargest <- if (nrow(segs.conditions$homodelall)) {
      segs.conditions$homodelall[which(segs.conditions$homodelall$size == max(segs.conditions$homodelall$size))[1], ]
    } else {
      segs.conditions$homodelall
    } # if we have homdels, find max size 
                                               
    segs.conditions$cnodd <- segs.auto[
      (((segs.auto$nMaj1_A == 2 & segs.auto$nMin1_A == 1) | (segs.auto$nMaj1_A == 1 & segs.auto$nMin1_A == 2)) |
         ((segs.auto$nMaj1_A == 1 & segs.auto$nMin1_A == 0) | (segs.auto$nMaj1_A == 0 & segs.auto$nMin1_A == 1)) |
         ((segs.auto$nMaj1_A == 3 & segs.auto$nMin1_A == 2) | (segs.auto$nMaj1_A == 2 & segs.auto$nMin1_A == 3)) | 
         ((segs.auto$nMaj1_A == 3 & segs.auto$nMin1_A == 0) | (segs.auto$nMaj1_A == 0 & segs.auto$nMin1_A == 3)) | 
         ((segs.auto$nMaj1_A == 5 & segs.auto$nMin1_A == 0) | (segs.auto$nMaj1_A == 0 & segs.auto$nMin1_A == 5)) |
         ((segs.auto$nMaj1_A == 4 & segs.auto$nMin1_A == 1) | (segs.auto$nMaj1_A == 1 & segs.auto$nMin1_A == 4)) |
         ((segs.auto$nMaj1_A == 3 & segs.auto$nMin1_A == 1) | (segs.auto$nMaj1_A == 1 & segs.auto$nMin1_A == 3)) |
         ((segs.auto$nMaj1_A == 5 & segs.auto$nMin1_A == 1) | (segs.auto$nMaj1_A == 1 & segs.auto$nMin1_A == 5)) |
         ((segs.auto$nMaj1_A == 5 & segs.auto$nMin1_A == 3) | (segs.auto$nMaj1_A == 3 & segs.auto$nMin1_A == 5))) &
        segs.auto$frac1_A == 1,
      ] # copy number is 1:0, 2:1, 3:0, 5:0, 4:1, 3:2, 3:1, 5:1, or 5:3 and there is no evidence of sub-clonal change

    segs.conditions$aroundpoint5.narrow <- segs.auto[
      segs.auto$frac1_A <= thres.50pcpeak.upper & segs.auto$frac1_A >= thres.50pcpeak.lower,
      ] # copy number fraction around 0.5, narrow boundary

    segs.conditions$aroundpoint5.wide <- segs.auto[
      segs.auto$frac1_A <= thres.50pcpeak.upper.wide & segs.auto$frac1_A >= thres.50pcpeak.lower.wide,
      ] # copy number fraction around 0.5, wide boundary

    # add information on the number of segments, length of segments and fraction of considered genome segments represent
    for (condition in names(segs.conditions)) {
      qc[id, pasteu(run.name, "nsegs", condition)] <- nrow(segs.conditions[[condition]])
      qc[id, pasteu(run.name, "lsegs", condition)] <- sum(segs.conditions[[condition]]$size)
      qc[id, pasteu(run.name, "fgenome", condition)] <- qc[id, pasteu(run.name, "lsegs", condition)] / sum(segs.conditions$all$size)
    }

    # add tumour ploidy, ie psi_t (different to average ploidy)
    qc[id, pasteu(run.name, "psi_t")] <- sum(segs.auto$size * (segs.auto$nMaj1_A + segs.auto$nMin1_A))/sum(segs.auto$size)
    # add fraction of LOH *-2 +2.9 (from PCAWG equation)
    qc[id, pasteu(run.name, "minorCNzerotimesminus2plus2.9")] <- (qc[id,pasteu(run.name,"fgenome_minorCNzero")]*-2)+2.9

    # define sample as 'narrow' if there is one cluster within 0.95-1.05 and 0.45-0.55 (or as per config file)
    # or 'wide' if there are two clusters in either of those boundaries
    # classify sample as narrow or wide
    qc[id, pasteu(run.name, "narrow.or.wide")] <- ifelse(((sum(dpclust[[id]]$location >= thres.clonalpeak.lower.wide &
                                                                 dpclust[[id]]$location <= thres.clonalpeak.upper.wide)>1) |
                                                            (sum(dpclust[[id]]$location >= thres.50pcpeak.lower.wide &
                                                                   dpclust[[id]]$location <= thres.50pcpeak.upper.wide)>1)),"wide","narrow")


    # classify sample as currently diploid or tetraploid
    # if qc$psi_t > qc$minorCNzerotimesminus2plus2.9, then tetraploid, otherwise diploid
    qc[id, pasteu(run.name, "dip.or.tetra")] <- ifelse(qc[id,pasteu(run.name,"psi_t")] > qc[id,pasteu(run.name,"minorCNzerotimesminus2plus2.9")],
                                                       "tetra",
                                                       "dip")
                                               
    # classify dpclust ploidy now we have dip/tetra classification
    qc[id, pasteu(run.name, "dpclust_ploidy")] <- estimate.new.ploidy(qc[id, pasteu(run.name, "battenberg_purity")], 
                                                                                              qc[id, pasteu(run.name, "battenberg_ploidy")], 
                                                                                              qc[id, pasteu(run.name, "dpclust_purity")],
                                                                                             qc[id, pasteu(run.name, "dip.or.tetra")])                                           

    # peak closest to clonal
    qc[id, pasteu(run.name, "peak.closest.to.clonal")] <- min(abs(1-dpclust[[id]]$location))

    # add DPClust peak subclonal, superclonal, 50% location information,
    # this is dependent on whether the sample is defined as narrow or wide
    # only consider clusters comprising a certain proportion of mutations
    qc[id, pasteu(run.name, "nsubclonalpeaks")] <- ifelse(qc[id,pasteu(run.name,"narrow.or.wide")]=="narrow",
                                                          sum(dpclust[[id]]$location < thres.clonalpeak.lower),
                                                          sum(dpclust[[id]]$location < thres.clonalpeak.lower.wide))
    qc[id, pasteu(run.name, "nclonalpeaks")] <- ifelse(qc[id,pasteu(run.name,"narrow.or.wide")]=="narrow",
                                                       sum(dpclust[[id]]$location >= thres.clonalpeak.lower &
                                                             dpclust[[id]]$location <= thres.clonalpeak.upper),
                                                       sum(dpclust[[id]]$location >= thres.clonalpeak.lower.wide &
                                                             dpclust[[id]]$location <= thres.clonalpeak.upper.wide))
    qc[id, pasteu(run.name, "nsuperclonalpeaks")] <- ifelse(qc[id,pasteu(run.name,"narrow.or.wide")]=="narrow",
                                                            sum(dpclust[[id]]$location > thres.clonalpeak.upper &
                                                                  dpclust[[id]]$no.of.mutations > qc[id, pasteu(run.name, "fivepc_cluster_size")]),
                                                            sum(dpclust[[id]]$location > thres.clonalpeak.upper.wide &
                                                                  dpclust[[id]]$no.of.mutations > qc[id, pasteu(run.name, "fivepc_cluster_size")]))
    qc[id, pasteu(run.name, "n50pcpeaks")] <- ifelse(qc[id,pasteu(run.name,"narrow.or.wide")]=="narrow",
                                                     sum(dpclust[[id]]$location > thres.50pcpeak.lower &
                                                           dpclust[[id]]$location < thres.50pcpeak.upper &
                                                           dpclust[[id]]$no.of.mutations > qc[id, pasteu(run.name, "fivepc_cluster_size")]),
                                                     sum(dpclust[[id]]$location > thres.50pcpeak.lower.wide  &
                                                           dpclust[[id]]$location < thres.50pcpeak.upper.wide &
                                                           dpclust[[id]]$no.of.mutations > qc[id, pasteu(run.name, "fivepc_cluster_size")]))

    # add number of mutations in each of the subclonal, clonal, and superclonal categories
    qc[id, pasteu(run.name, "nmutssubclonal")] <-	ifelse(qc[id,pasteu(run.name,"narrow.or.wide")]=="narrow",
                                                         sum(dpclust[[id]]$no.of.mutations[dpclust[[id]]$location < thres.clonalpeak.lower]),
                                                         sum(dpclust[[id]]$no.of.mutations[dpclust[[id]]$location < thres.clonalpeak.lower.wide]))


    qc[id, pasteu(run.name, "nmutsclonal")] <- ifelse(qc[id,pasteu(run.name,"narrow.or.wide")]=="narrow",
                                                      sum(dpclust[[id]]$no.of.mutations[dpclust[[id]]$location >= thres.clonalpeak.lower &
                                                                                          dpclust[[id]]$location <= thres.clonalpeak.upper]),
                                                      sum(dpclust[[id]]$no.of.mutations[dpclust[[id]]$location >= thres.clonalpeak.lower.wide &
                                                                                          dpclust[[id]]$location <= thres.clonalpeak.upper.wide]))

    qc[id, pasteu(run.name, "nmutssuperclonal")] <- ifelse(qc[id,pasteu(run.name,"narrow.or.wide")]=="narrow",
                                                           sum(dpclust[[id]]$no.of.mutations[dpclust[[id]]$location > thres.clonalpeak.upper &
                                                                                               dpclust[[id]]$no.of.mutations > qc[id, pasteu(run.name, "fivepc_cluster_size")]]),
                                                           sum(dpclust[[id]]$no.of.mutations[dpclust[[id]]$location > thres.clonalpeak.upper.wide &
                                                                                               dpclust[[id]]$no.of.mutations > qc[id, pasteu(run.name, "fivepc_cluster_size")]]))

                                               
    # uses VAF peak data calculated in previous step (peaks)
    load(filenames.peaks[id])                                      
    #expected.locs <- peaks$summary$expected.locs
    expected.locs <- peaks$peaks_analysis$matches
    qc[id, pasteu(run.name, "vafpeaks")] <-peaks$peaks_analysis$QC                                          
    if (is.null(nrow(expected.locs))) {
      # if there are no matched peaks then 2:2 peak matching does not occur
      if (peaks$peaks_analysis$QC=="FLAG") {
        qc[id, pasteu(run.name, "vafpeaks_tetraploid")] <- TRUE  
      } else {
        qc[id, pasteu(run.name, "vafpeaks_tetraploid")] <- FALSE
      }
    } else {
      # select peaks to consider and determine whether they are within the threshold and purity distance off
      #expected.locs <- expected.locs[order(abs(expected.locs$peak.diff)), ]
      expected.locs <- expected.locs[order(abs(expected.locs$offset)), ]

      # find mutations in 2:2 regions with multiplicity 1 and multiplicity 2
      mult1 <- which(expected.locs$karyotype=="2:2" & expected.locs$mutation_multiplicity==1)
      #which(expected.locs$state == "2:2" & expected.locs$multiplicity == 1)
      mult2 <- which(expected.locs$karyotype=="2:2" & expected.locs$mutation_multiplicity==2)
      # if these mutations exist, the position of this peak must be within a certain distance of expected peak
      # in order to qualify as these 2:2 peaks existing - this is evidence of proper tetraploidy as there are
      # mutations at multiplicity 1 and 2 in 2:2 regions
      mult1.match <- ifelse(length(mult1)>0, abs(expected.locs$offset[mult1]) < thres.incorrecttetraploid.peak.diff, FALSE)
      mult2.match <- ifelse(length(mult2)>0, abs(expected.locs$offset[mult2]) < thres.incorrecttetraploid.peak.diff, FALSE)
      qc[id, pasteu(run.name, "vafpeaks_tetraploid")] <- mult1.match & mult2.match

      # add other information from vafpeaks
      #qc[id, pasteu(run.name, "vafpeaks_purity")] <- peaks$summary$purity.new
      qc[id, pasteu(run.name, "vafpeaks_purity")] <- qc[id, pasteu(run.name, "battenberg_purity")]-peaks$peaks_analysis$score
      # if this purity goes below 5%, fix it at 5%, if it goes above 95%, fix it at 95%
      if (qc[id, pasteu(run.name, "vafpeaks_purity")]<0.05) {qc[id, pasteu(run.name, "vafpeaks_purity")]=0.05}
      if (qc[id, pasteu(run.name, "vafpeaks_purity")]>0.95) {qc[id, pasteu(run.name, "vafpeaks_purity")]=0.95}
      qc[id, pasteu(run.name, "vafpeaks_ploidy")] <- estimate.new.ploidy(qc[id, pasteu(run.name, "battenberg_purity")],
                                                                         qc[id, pasteu(run.name, "battenberg_ploidy")],
                                                                         qc[id, pasteu(run.name, "vafpeaks_purity")],
                                                                        qc[id, pasteu(run.name, "dip.or.tetra")])
      #qc[id, pasteu(run.name, "vafpeaks_score")] <- peaks$summary$eta
      qc[id, pasteu(run.name, "vafpeaks_score")] <-peaks$peaks_analysis$score
      # add in peaks$peaks_analysis$QC to qc table
      #qc[id, pasteu(run.name, "vafpeaks")] <-peaks$peaks_analysis$QC
    }


    # give a call for whether the ploidy is judged to be correct by our criteria
    # if diploid, else if tetraploid
    #qc[id, pasteu(run.name, "ploidy_type_accepted")] <- if (qc[id,pasteu(run.name,"battenberg_ploidy")]<=2.6) {
    qc[id, pasteu(run.name, "ploidy_type_accepted")] <- if (qc[id,pasteu(run.name,"dip.or.tetra")]=="dip") {
      if (qc[id,pasteu(run.name,"narrow.or.wide")]=="narrow") {
        ifelse(qc[id,pasteu(run.name,"fgenome_aroundpoint5.narrow")] <= thres.incorrectdiploid.aroundpoint5 |
                 qc[id,pasteu(run.name,"n50pcpeaks")]==0,"PASS","FAIL")
      } else if (qc[id,pasteu(run.name,"narrow.or.wide")]=="wide") {
        ifelse(qc[id,pasteu(run.name,"fgenome_aroundpoint5.wide")] <= thres.incorrectdiploid.aroundpoint5 |
                 qc[id,pasteu(run.name,"n50pcpeaks")]==0,"PASS","FAIL")
      }
    #} else if (qc[id,pasteu(run.name,"battenberg_ploidy")] > 2.6) {
    } else if (qc[id,pasteu(run.name,"dip.or.tetra")] == "tetra") {
      ifelse(qc[id,pasteu(run.name,"fgenome_cnodd")] >= thres.incorrecttetraploid.cnodd, "PASS","FAIL") 
             #& qc[id, pasteu(run.name, "vafpeaks_tetraploid")] == TRUE,"PASS","FAIL")
    }

    # calculate the new ploidy and purity for IF the ploidy is deemed incorrect - not necessarily used
    qc[id, pasteu(run.name, "newploidy_purity")] <- switch.ploidy.get.purity(qc[id, pasteu(run.name,"battenberg_purity")],
                                                                             qc[id, pasteu(run.name,"battenberg_ploidy")],
                                                                            qc[id, pasteu(run.name,"dip.or.tetra")])


    qc[id, pasteu(run.name, "newploidy_ploidy")] <- switch.ploidy.get.ploidy(qc[id, pasteu(run.name,"battenberg_purity")],
                                                                             qc[id, pasteu(run.name,"battenberg_ploidy")],
                                                                            qc[id, pasteu(run.name,"dip.or.tetra")])

    # add pass or fail for basic filters
    #qc[id, pasteu(run.name, "basic_filters_passed")] <- ifelse(qc[id, pasteu(run.name, "n_chrmissing")] != 0 |
    #                                                             qc[id, pasteu(run.name, "n_chrsizeincorrect")] != 0 |
    #                                                             qc[id, pasteu(run.name, "lsegs_homodellargest")] > thres.homodel.homodellargest |
    #                                                             qc[id, pasteu(run.name, "nclonalpeaks")] == 0 |
    #                                                             qc[id, pasteu(run.name, "nsuperclonalpeaks")] != 0 |
    #                                                             #is.na(qc[id, pasteu(run.name, "vafpeaks_score")]) | # flagging this as unassessable instead of failing it through insufficient mutations.
    #                                                             abs(qc[id, pasteu(run.name, "vafpeaks_score")]) > thres.purity.diff,"FAIL","PASS")

    # select reestimated purity/ploidy (these are only used if sample fails with filters)
    # if sample's ploidy is deemed wrong, make these the new parameters
    # if not, use vafpeaks params, unless it doesn't exist, in which case use DPClust params
    # if it's the last run (4), use DPClust params, unless the sample has switched ploidy twice, in which case use VAFPeaks                                        
                                               
    # if ploidy type fails, run with new ploidy params
    if (qc[id, pasteu(run.name, "ploidy_type_accepted")]=="FAIL" | qc[id, pasteu(run.name, "ploidy_type_accepted")]=="FLAG") {
      qc[id, pasteu(run.name, "reestimated_purity")] <- qc[id, pasteu(run.name, "newploidy_purity")]
      qc[id, pasteu(run.name, "reestimated_ploidy")] <- qc[id, pasteu(run.name, "newploidy_ploidy")]
    } else if (qc[id, pasteu(run.name, "ploidy_type_accepted")]=="PASS" &
        # (qc[id, pasteu(run.name, "vafpeaks_score")] > thres.purity.diff &
        !is.na(qc[id, pasteu(run.name, "vafpeaks_score")])) { # if ploidy type passes, and we have vafpeaks params, run with those (if we don't have vafpeaks here, use dpclust)
      qc[id, pasteu(run.name, "reestimated_purity")] <- qc[id, pasteu(run.name, "vafpeaks_purity")]
      qc[id, pasteu(run.name, "reestimated_ploidy")] <- qc[id, pasteu(run.name, "vafpeaks_ploidy")]
    } else if (qc[id, pasteu(run.name, "ploidy_type_accepted")]=="PASS" &
               is.na(qc[id, pasteu(run.name, "vafpeaks_score")])) {  #if ploidy type passes, and we have no vafpeaks params
      qc[id, pasteu(run.name, "reestimated_purity")] <- qc[id, pasteu(run.name, "dpclust_purity")]
      qc[id, pasteu(run.name, "reestimated_ploidy")] <- qc[id, pasteu(run.name, "dpclust_ploidy")]
    } else if (qc[id, pasteu(run.name, "ploidy_type_accepted")]=="PASS" & 
               qc[id, pasteu(run.name, "nsuperclonalpeaks")]>0) { # ploidy passes and we have a superclonal cluster, use DPClust parameters
      qc[id, pasteu(run.name, "reestimated_purity")] <- qc[id, pasteu(run.name, "dpclust_purity")]
      qc[id, pasteu(run.name, "reestimated_ploidy")] <- qc[id, pasteu(run.name, "dpclust_ploidy")]
    } else if (qc[id, pasteu(run.name, "ploidy_type_accepted")]=="PASS" &
              qc[id, pasteu(run.name, "peak.closest.to.clonal")] <= 0.95 & 
              qc[id, pasteu(run.name, "peak.closest.to.clonal")] >= 0.9 & 
             qc[id, pasteu(run.name, "nsuperclonalpeaks")]==0) { # ploidy passes, no superclone, highest cluster is 0.9-0.95, use DPC params
      qc[id, pasteu(run.name, "reestimated_purity")] <- qc[id, pasteu(run.name, "dpclust_purity")]
      qc[id, pasteu(run.name, "reestimated_ploidy")] <- qc[id, pasteu(run.name, "dpclust_ploidy")]
    } else if (qc[id, pasteu(run.name, "ploidy_type_accepted")]=="PASS" & 
               run.name=="run3") { # if ploidy passes, and we're on the last run, overwrite vafpeaks params with DPClust parameters
      qc[id, pasteu(run.name, "reestimated_purity")] <- qc[id, pasteu(run.name, "dpclust_purity")]
      qc[id, pasteu(run.name, "reestimated_ploidy")] <- qc[id, pasteu(run.name, "dpclust_ploidy")]
    } else if (qc[id, pasteu(run.name, "ploidy_type_accepted")]=="PASS" & 
               qc[id, pasteu(run.name, "lsegs_homodelall")] >= 100000000) { # if homdels are too large, use dpclust parameters not vafpeaks parameters
      qc[id, pasteu(run.name, "reestimated_purity")] <- qc[id, pasteu(run.name, "dpclust_purity")]
      qc[id, pasteu(run.name, "reestimated_ploidy")] <- qc[id, pasteu(run.name, "dpclust_ploidy")]
    }
                                               
    # if we have too much of the genome with homdels and the sample is diploid, switch to tetraploid
    if (qc[id,pasteu(run.name,"dip.or.tetra")] == "dip" & 
        qc[id, pasteu(run.name, "lsegs_homodellargest")] > thres.homodel.homodellargest) { 
         qc[id, pasteu(run.name, "reestimated_purity")] <- qc[id, pasteu(run.name, "newploidy_purity")]
         qc[id, pasteu(run.name, "reestimated_ploidy")] <- qc[id, pasteu(run.name, "newploidy_ploidy")]
    }
                                               
    # check if the new purity we've chosen differs from the current ones by more than 5% (and it's passed ploidy), if not, pick others
    # if none of them do, we don't rerun
    if (abs(qc[id, pasteu(run.name, "reestimated_purity")] - qc[id, pasteu(run.name, "battenberg_purity")]) < 0.05 &
        qc[id, pasteu(run.name, "ploidy_type_accepted")]=="PASS") {

        vafpurity = qc[id, pasteu(run.name, "vafpeaks_purity")] == qc[id, pasteu(run.name, "reestimated_purity")]
        dpcpurity = qc[id, pasteu(run.name, "dpclust_purity")] == qc[id, pasteu(run.name, "reestimated_purity")]

        # if currently using vafpeaks purity for next run
        if (!is.na(vafpurity) && vafpurity==T) {
                # check if dpclust purity diffes from current purity estimate by more than 5%, if so, use it, if not, mark sample as having no new possible params
                if (!is.na(dpcpurity) & (abs(qc[id, pasteu(run.name, "dpclust_purity")] - qc[id, pasteu(run.name, "battenberg_purity")]) >= 0.05)) {
                        qc[id, pasteu(run.name, "reestimated_purity")] <- qc[id, pasteu(run.name, "dpclust_purity")]
                        qc[id, pasteu(run.name, "reestimated_ploidy")] <- qc[id, pasteu(run.name, "dpclust_ploidy")]
                } else {
                        qc[id, pasteu(run.name, "reestimated_purity")] <- NA
                        qc[id, pasteu(run.name, "reestimated_ploidy")] <- NA
                }
        } else if (!is.na(dpcpurity) && dpcpurity==T) { # else if currently using dpcpurity
                # check if vafpeaks purity differs from current estimate by more than 5%, if so, use it
                 if (!is.na(vafpurity) & (abs(qc[id, pasteu(run.name, "vafpeaks_purity")] - qc[id, pasteu(run.name, "battenberg_purity")]) >= 0.05)) {
                        qc[id, pasteu(run.name, "reestimated_purity")] <- qc[id, pasteu(run.name, "vafpeaks_purity")]
                        qc[id, pasteu(run.name, "reestimated_ploidy")] <- qc[id, pasteu(run.name, "vafpeaks_ploidy")]
                } else {
                        qc[id, pasteu(run.name, "reestimated_purity")] <- NA
                        qc[id, pasteu(run.name, "reestimated_ploidy")] <- NA
                }
        }
    }                                     

  }

  log.message("applying filters")
  filters <- list()
  chrsizerun1 <- list()                                             
  # all chrs must be listed at least once in the subclones file
  filters$chrmissing <- ifelse(qc[ids, pasteu(run.name, "n_chrmissing")] != 0, "FAIL", "PASS")
  # chr size must pass the required threshold listed in the qc config file
  filters$chrsizewrong <- ifelse(qc[ids, pasteu(run.name, "n_chrsizeincorrect")] != 0, "FLAG", "PASS")
  # chr size must pass the required threshold listed in the qc config file in first run (when rerunning in certain cases later)
  # this is in a diff list so as not to interfere with filters list                                             
  chrsizerun1$chrsizewrongrun1 <- ifelse(qc[ids, "run1_n_chrsizeincorrect"] != 0, "FLAG", "PASS")                                             
  ## no single homdel can be larger than the threshold listed in the qc config file
  filters$homodeletions <- ifelse(qc[ids, pasteu(run.name, "lsegs_homodellargest")] > thres.homodel.homodellargest, "FAIL", "PASS")
  # if total length of homdels is > smaller threshold in qc config file (suggested 10MB), flag the sample
  # and if total length of homdels is > larger threshold in qc config file (suggested 100MB), fail the sample    
  #filters$homodeletions <- ifelse(qc[ids, pasteu(run.name, "lsegs_homodelall")] <= thres.homodel.homodelall.flag, "PASS",
  #                               ifelse(qc[ids, pasteu(run.name, "lsegs_homodelall")] > thres.homodel.homodelall.fail,"FAIL","FLAG"))
  # if total fraction of subclonal homdels is <20%, pass the sample on homdels
  #filters$homodeletions <- if (qc[ids, pasteu(run.name, "fgenome_homodelallsubclonal")] <= thres.homodel.homodelallsubclonal.fail) {
  #  filters$homodeletions <- "PASS"                                            
  #}                                            
  #filters$homodeletions <-ifelse(qc[ids, pasteu(run.name, "lsegs_homodelall")] > thres.homodel.homodelall.fail, "FAIL", "PASS")                                              
  # if total length of homdels is > smaller threshold in qc config file (suggested 10MB), flag the sample
  #filters$homodeletions <- ifelse(qc[ids, pasteu(run.name, "lsegs_homodelall")] > thres.homodel.homodelall.flag, "FLAG", "PASS")                                            
  # there must exist a clonal peak between the boundaries in the qc config file, dependent on narrow or wide status of sample
  filters$noclonalpeak <- ifelse(qc[ids, pasteu(run.name, "nclonalpeaks")] == 0, "FAIL", "PASS")
  # there must be no superclonal peaks that are of a size larger than the threshold in the qc config file
  filters$superclonalpeaks <- ifelse(qc[ids, pasteu(run.name, "nsuperclonalpeaks")] != 0, "FAIL", "PASS")

  # vafpeaks filter must have passed or be FLAG in order for the sample to pass (ie if not enough mutations to assess, we only use other metrics to assess the call)
  # (flag is only given if vafpeaks doesn't have enough mutations so can't pass or fail it)
  #filters$vafpeaks <- ifelse(qc[ids, pasteu(run.name, "vafpeaks")] == "FAIL", "FAIL", "PASS")  
  filters$vafpeaks <- qc[ids, pasteu(run.name, "vafpeaks")] # retain the FAIL, FLAG, and PASS options                                            

  # the ploidy call must be deemed correct, so if diploid, must have a ploidy classified as diploid through PCAWG eqn, and
  # can have no more than qc.config threshold of the genome between 0.5 boundary (if narrow sample (same boundary), or
  # can have a cluster of mutations between narrow 0.5 boundary if narrow sample (or wide boundary if wide sample)
  # but can't have both
  # and if tetraploid, must have a ploidy classified as tetraploid through PCAWG eqn (psi_t>LOH*-2+2.9)=tetra, else dip), and
  # must have at least 10% of the genome at 1+0, 2+1, or 3+2 states, and
  # must have peaks of mutations in 2:2 regions with multiplicity 1 and 2
  filters$ploidytype <- ifelse(qc[ids, pasteu(run.name,"ploidy_type_accepted")] == "PASS", "PASS", "FLAG")
  # rest of the filters must be deemed ok
  #filters$basicfilterspassed <- ifelse(filters$chrmissing=="PASS" &
  #                                       filters$chrsizewrong=="PASS" &
  #                                       filters$homodeletions=="PASS" &
  #                                       filters$noclonalpeak=="PASS" &
  #                                       filters$superclonalpeaks=="PASS" &
  #                                       (filters$vafpeaks=="PASS" | filters$vafpeaks=="FLAG"),"PASS","FAIL")                                          
                                  
  # classify the sample as having passed, failed, or is flagged
   # classify the sample as having passed, failed, or is flagged
  filters <- data.frame(filters, stringsAsFactors=F)
                                               
  # if any fail or flag exists at all, set that sample to rerun
  filters$overallfilter <- apply(filters, 1, get.worst.filter)

  # but now overwrite those samples in these special cases that we don't want to rerun
  # at this point they are marked with a flag or fail overall if any criteria has a flag or a fail 
  # vafpeaks flag/fail but everything else is ok, we don't rerun 
  filters$vafonly = (filters$vafpeaks == "FLAG" | filters$vafpeaks == "FAIL") &
        filters$chrmissing == "PASS" &
        (filters$chrsizewrong == "PASS" | filters$chrsizewrong == "FLAG") &
        filters$homodeletions == "PASS" &
        filters$ploidytype == "PASS" &
        filters$noclonalpeak == "PASS" &
        filters$superclonalpeaks == "PASS"
  # chr length flagged but everything else is ok, if run==1, we rerun in the normal way    
  # if chr length flagged and everything else is ok and run!=1, we check to see if the first run had a flag for chr size, 
  # if so, this is long runs of homozygosity, so we don't rerun                                              
  filters$chrsizeonly = (filters$vafpeaks == "FLAG" | filters$vafpeaks == "PASS") &                                    
        filters$chrmissing == "PASS" &
        filters$chrsizewrong == "FLAG" &
        run!=1 &
        chrsizerun1$chrsizewrongrun1 == "FLAG" &                                       
        filters$homodeletions == "PASS" &
        filters$ploidytype == "PASS" &
        filters$noclonalpeak == "PASS" &
        filters$superclonalpeaks == "PASS"
  # homdel flagged but everything else is ok, we don't rerun                                             
  filters$homdelflagonly = (filters$vafpeaks == "FLAG" | filters$vafpeaks == "PASS") &
        filters$chrmissing == "PASS" &
        (filters$chrsizewrong == "PASS" | filters$chrsizewrong == "FLAG") &
        filters$homodeletions == "FLAG" &
        filters$ploidytype == "PASS" &
        filters$noclonalpeak == "PASS" &
        filters$superclonalpeaks == "PASS"                                                  
                                               
  # set these exceptions as PASS                                             
  filters$overallfilter[which(filters$vafonly == T | 
                              filters$chrsizeonly == T |
                              filters$homdelflagonly == T)] = "PASS"                                     
                                               
  filters.qc <- filters
  dimnames(filters.qc) <- list(ids, pasteu(run.name, "filter", colnames(filters.qc)))
  qc <- data.frame(qc, filters.qc[rownames(qc), ])

  log.message("writing table of QC results")
  write.table(qc, file=filename.qc, quote=F, sep="\t", row.names=F, col.names=T)

}
afrangou/CleanCNA documentation built on Dec. 28, 2021, 8:21 p.m.