DPclust3p_Customised: Dirichlet Process Clustering Pre-Processing Package

ALLELECOUNTER = "alleleCounter"
LINKAGEPULL = "Linkage_pull.pl"

#' Concatenate split files
#' 
#' Convenience function to concatenate a series of files specified in a file of file names.
#' This function assumes all files have the same layout.
#' @param fofn A file of file names to be concatenated
#' @param inputdir Full path to where the input files are stored
#' @param outfile Full path to where the output should be written
#' @param haveHeader Boolean that specifies whether the input files have a header
#' @author sd11
#' @export
concat_files = function(fofn, inputdir, outfile, haveHeader) {
  
  inputdir = paste(inputdir,"/", sep="")
  list_of_files = read.table(fofn, stringsAsFactors=F, header=F)[,1]
  
  output = data.frame()
  for (infile in list_of_files) {
    infile = paste(inputdir, infile, sep="")
    # Check if file is there and it contains data
    if (file.exists(infile) & file.info(infile)$size != 0) {
      dat = read.delim(infile, header=haveHeader, quote=NULL, stringsAsFactors=F)
      output = rbind(output, dat)
    }
  }
  
  write.table(output, file=outfile, col.names=haveHeader, row.names=F, sep="\t", quote=F)
}

#' Split a file per chromosome
#'
#' Convenience function to split an input file per chromosome. All it requires is that
#' the infile has as first column chromosome specification. The output files will be named
#' outdir/prefixCHROMNUMBERpostfix
#' @param infile The file to be split
#' @param prefix Prefix of the output file
#' @param postfix Postfix of the output file
#' @param outdir Directory where the output files are to be written
#' @param chrom_file A simple list of chromosomes to be considered
#' @author sd11
#' @export
split_by_chrom = function(infile, prefix, postfix, outdir, chrom_file) {
  outdir = paste(outdir, "/", sep="")
  
  # Check if there are lines in the file, otherwise it will crash this script
  if (file.info(infile)$size == 0) {
    print("No lines in loci file")
    q(save="no")
  }
  
  loci = read.delim(infile, stringsAsFactors=F, header=F)
  
  chroms = read.delim(chrom_file, stringsAsFactors=F, header=F)
  
  for (i in 1:nrow(chroms)) {
    selection = loci[loci[,1]==chroms[i,1],]
    chrom_id = chroms[i,2]
    write.table(selection, file=paste(outdir, prefix, chrom_id, postfix, sep=""), quote=F, row.names=F, col.names=F, sep="\t")
  }
}

############################################
# VCF 2 LOCI
############################################
#' Parse genome index
#' 
#' Convenience function that parses a reference genome index as generated
#' by samtools index
#' @param fai_file The index
#' @return A data frame with columns "chromosome", "length", "offset", "fasta_line_length", "line_blen"
#' @author sd11
#' @export
parseFai = function(fai_file) {
  fai = read.table(fai_file, header=F, stringsAsFactors=F)
  colnames(fai) = c("chromosome", "length", "offset", "fasta_line_length", "line_blen")
  return(fai)
}

#' Parse chromosomes to ignore file
#' 
#' Convenience function that parses an ignore file. This file
#' is expected to have a single column with just chromosome names
#' @param ignore_file The file specifying to be ignored chromosomes
#' @return A data frame with a single column named "chromosome"
#' @author sd11
#' @export
parseIgnore = function(ignore_file) {
  ign = read.table(ignore_file, header=F, stringsAsFactors=F)
  colnames(ign) = c("chromosome")
  return(ign)
}

#' Transform vcf to loci file
#' 
#' Function that dumps the loci of snvs from a series of vcf files into a single loci file
#' @param vcf_files A vector of vcf files to be considered
#' @param fai_file Reference genome index
#' @param ign_file A file with chromosomes to be excluded from consideration
#' @param outfile Where to store the output
#' @author sd11
#' @export
vcf2loci = function(vcf_files, fai_file, ign_file, outfile) {
  fai = parseFai(fai_file)
  ign = parseIgnore(ign_file)
  allowed_chroms = which(!(fai$chromosome %in% ign$chromosome))
  
  # Run through each supplied vcf file, collect the loci from each
  combined.loci = data.frame()
  for (vcf_file in vcf_files) {
    vcf.cols = ncol(read.delim(vcf_file, comment.char="#", header=F, stringsAsFactor=F, nrows=1))
    vcf.cols.default = 10 # vcf file standard contains 10 columns
    vcf.colClasses = c(NA, NA, "NULL", NA, NA, rep("NULL", 5+(vcf.cols-vcf.cols.default)))
    vcf.loci = read.delim(vcf_file, comment.char="#", header=F, stringsAsFactor=F, colClasses=vcf.colClasses)
    colnames(vcf.loci) = c("chromosome", "pos", "ref","alt")
    vcf.loci.sel = subset(vcf.loci, chromosome %in% fai$chromosome[allowed_chroms])
    combined.loci = rbind(combined.loci, vcf.loci.sel)
  }
  # Remove duplicate entries
  combined.loci = unique(combined.loci)
  write.table(combined.loci, col.names=F, quote=F, row.names=F, file=outfile, sep="\t")
}

############################################
# Mutation signatures
############################################

#' Obtain tri-nucleotide context. It reads in the supplied loci file and querries
#' the provided reference genome fasta for the context. Finally it writes out the
#' loci with annotated context.
#' @param loci_file A 4 column loci file with chrom, position, reference allele and alt allele
#' @param outfile Where to write the output
#' @param ref_genome Full path to a reference genome Fasta file
#' @author sd11
#' @export
getTrinucleotideContext = function(loci_file, outfile, ref_genome) {
  loci = read.table(loci_file, sep='\t', header=F, stringsAsFactors=F)
  loci.g = GenomicRanges::GRanges(seqnames=loci[,1], ranges=IRanges::IRanges(loci[,2], loci[,2]))
  r = Rsamtools::scanFa(file=ref_genome, resize(GenomicRanges::granges(loci.g), 3, fix="center"))
  write.table(cbind(loci, as.data.frame(r)), file=outfile, row.names=F, col.names=F, quote=F, sep="\t")
}

# #' Checks each tri-nucleotide context whether its CAG or CTG
# #' @param vector_of_trinucleotides A vector containing the base to check and its immediate neighbors
# #' @author sd11
# isDeaminase = function(vector_of_trinucleotides) {
#   return(grepl("(CAG)|(CTG)|(GAC)|(GTC)", vector_of_trinucleotides))
# }

#' Filter a with tri-nucleotide context annotated list of loci for a particular signature
#' supplied as a regex like so: (CAG)|(CTG)|(GAC)|(GTC).
#' @param signature_anno_loci_file Filepath to a with tri-nucleotide context annotated loci
#' @param signature_regex A regex that captures the signature to keep.
#' @param outfile Filepath to where to store the output.
#' @param trinucleotide_column Integer representing the column within the input file that contains the context
#' @param alt_alleles An optional vector with reference alleles to allow.
#' @param alt_allele_column The column in the annotated loci file that contains the reference base.
#' @author sd11
#' @export
filterForSignature = function(signature_anno_loci_file, signature_regex, outfile, trinucleotide_column=5, alt_alleles=NA, alt_allele_column=4) {
  signature_anno_loci = read.table(signature_anno_loci_file, sep='\t', header=F, stringsAsFactors=F)
  signature_anno_loci_filt = signature_anno_loci[grepl(signature_regex, signature_anno_loci[,trinucleotide_column]), ]

  if (!is.na(alt_alleles)) {
    regex_split = gsub("(", "", signature_regex, fixed=T)
    regex_split = gsub(")", "", regex_split, fixed=T)
    regex_split = unlist(strsplit(regex_split, "|", fixed=T))
    selection = rep(F, nrow(signature_anno_loci_filt))
    
    # Go through all signatures and their specific reference context
    for (i in 1:length(regex_split)) {
      alt_i = alt_alleles[i]
      sign_i = regex_split[i]
      # Select only those mutations that have this particular context and reference
      selection[signature_anno_loci_filt[,trinucleotide_column]==sign_i & signature_anno_loci_filt[,alt_allele_column]==alt_i] = T
    }
    signature_anno_loci_filt = signature_anno_loci_filt[selection,]
  }
  write.table(signature_anno_loci_filt, file=outfile, row.names=F, col.names=F, quote=F, sep="\t")
}

#' Convenience function that filters a with tri-nucleotide context annotated list of loci for 
#' cytosine deaminase signature, or C>T at CpG.
#' @param signature_anno_loci_file Filepath to a with tri-nucleotide context annotated loci
#' @param outfile Filepath to where to store the output.
#' @param ref_genome Full path to an indexed reference genome fasta file
#' @param trinucleotide_column Integer representing the column within the input file that contains the context
#' @param alt_allele_column The column in the annotated loci file that contains the reference base.
#' @author sd11
#' @export
filterForDeaminase = function(loci_file, outfile, ref_genome, trinucleotide_column=5, alt_allele_column=4) {
  signature_anno_loci_file = gsub(".txt", "_signature_anno.txt", loci_file)
  getTrinucleotideContext(loci_file, signature_anno_loci_file, ref_genome)
  filterForSignature(signature_anno_loci_file=signature_anno_loci_file, 
                     signature_regex="(CCG)|(GCC)|(CGG)|(GGC)", 
                     outfile=outfile, 
                     trinucleotide_column=trinucleotide_column, 
                     alt_alleles=c("T", "T", "A", "A"), 
                     alt_allele_column=alt_allele_column)
}


############################################
# Allele counting
############################################
#' Run alleleCount
#' 
#' Count the alleles for specified locations in the loci file. Expects alleleCount binary in $PATH
#' @param locifile A file with at least chromsome and position columns of the locations to be counted
#' @param bam A bam file
#' @param outfile Where to store the output
#' @param min_baq The minimum base quality required for a read to be counted
#' @param min_maq The minimum mapping quality required for a read to be counted
#' @author sd11
#' @export
alleleCount = function(locifile, bam, outfile, min_baq=20, min_maq=35) {
  cmd = paste(ALLELECOUNTER,
              "-b", bam,
              "-o", outfile,
              "-l", locifile,
              "-m", min_baq,
              "-q", min_maq, sep=" ")
  system(cmd, wait=T)
}

#' Dump allele counts from vcf - Sanger ICGC pancancer pipeline
#'
#' Dump allele counts stored in the sample columns of the VCF file. Output will go into a file
#' supplied as tumour_outfile and optionally normal_outfile. It will be a fully formatted
#' allele counts file as returned by alleleCounter.
#' @param vcf_infile The vcf file to read in
#' @param tumour_outfile File to save the tumour counts to
#' @param normal_outfile Optional parameter specifying where the normal output should go
#' @param refence_genome Optional parameter specifying the reference genome build used
#' @param samplename Optional parameter specifying the samplename to be used for matching the right column in the VCF
#' @author sd11
#' @export
dumpCounts.Sanger = function(vcf_infile, tumour_outfile, normal_outfile=NA, refence_genome="hg19", samplename=NA) {
  dumpCountsFromVcf(vcf_infile, tumour_outfile, centre="sanger", normal_outfile=normal_outfile, refence_genome=refence_genome, samplename=samplename)
}

#' Dump allele counts from vcf - DKFZ ICGC pancancer pipeline
#'
#' Dump allele counts stored in the sample columns of the VCF file. Output will go into a file
#' supplied as tumour_outfile and optionally normal_outfile. It will be a fully formatted
#' allele counts file as returned by alleleCounter.
#' @param vcf_infile The vcf file to read in
#' @param tumour_outfile File to save the tumour counts to
#' @param normal_outfile Optional parameter specifying where the normal output should go
#' @param refence_genome Optional parameter specifying the reference genome build used
#' @param samplename Optional parameter specifying the samplename to be used for matching the right column in the VCF
#' @author sd11
#' @export
dumpCounts.DKFZ = function(vcf_infile, tumour_outfile, normal_outfile=NA, refence_genome="hg19", samplename=NA) {
  dumpCountsFromVcf(vcf_infile, tumour_outfile, centre="dkfz", normal_outfile=normal_outfile, refence_genome=refence_genome, samplename=samplename)
}

#' Dump allele counts from vcf - Broad ICGC pancancer pipeline
#'
#' Dump allele counts stored in the sample columns of the VCF file. Output will go into a file
#' supplied as tumour_outfile and optionally normal_outfile. It will be a fully formatted
#' allele counts file as returned by alleleCounter.
#' @param vcf_infile The vcf file to read in
#' @param tumour_outfile File to save the tumour counts to
#' @param normal_outfile Optional parameter specifying where the normal output should go
#' @param refence_genome Optional parameter specifying the reference genome build used
#' @param samplename Optional parameter specifying the samplename to be used for matching the right column in the VCF
#' @author sd11
#' @export
dumpCounts.Broad = function(vcf_infile, tumour_outfile, normal_outfile=NA, refence_genome="hg19", samplename=NA) {
  dumpCountsFromVcf(vcf_infile, tumour_outfile, centre="broad", normal_outfile=normal_outfile, refence_genome=refence_genome, samplename=samplename)
}

#' Dump allele counts from vcf - MuSE ICGC pancancer pipeline
#'
#' Dump allele counts stored in the sample columns of the VCF file. Output will go into a file
#' supplied as tumour_outfile and optionally normal_outfile. It will be a fully formatted
#' allele counts file as returned by alleleCounter.
#' @param vcf_infile The vcf file to read in
#' @param tumour_outfile File to save the tumour counts to
#' @param normal_outfile Optional parameter specifying where the normal output should go
#' @param refence_genome Optional parameter specifying the reference genome build used
#' @param samplename Optional parameter specifying the samplename to be used for matching the right column in the VCF
#' @author sd11
#' @export
dumpCounts.Muse = function(vcf_infile, tumour_outfile, normal_outfile=NA, refence_genome="hg19", samplename=NA) {
  dumpCountsFromVcf(vcf_infile, tumour_outfile, centre="muse", normal_outfile=normal_outfile, refence_genome=refence_genome, samplename=samplename)
}

#' Dump allele counts from vcf - ICGC pancancer consensus pipeline
#'
#' Dump allele counts stored in the info column of the VCF file. Output will go into a file
#' supplied as tumour_outfile. It will be a fully formatted allele counts file as returned 
#' by alleleCounter. There are no counts for the matched normal. 
#' @param vcf_infile The vcf file to read in
#' @param tumour_outfile File to save the tumour counts to
#' @param normal_outfile Optional parameter specifying where the normal output should go
#' @param refence_genome Optional parameter specifying the reference genome build used
#' @param samplename Optional parameter specifying the samplename to be used for matching the right column in the VCF
#' @author sd11
#' @export
dumpCounts.ICGCconsensus = function(vcf_infile, tumour_outfile, normal_outfile=NA, refence_genome="hg19", samplename=NA) {
  dumpCountsFromVcf(vcf_infile, tumour_outfile, centre="icgc_consensus", normal_outfile=normal_outfile, refence_genome=refence_genome, samplename=samplename)
}

#' Dump allele counts from vcf - Mutect
#'
#' Dump allele counts stored in the info column of the VCF file. Output will go into a file
#' supplied as tumour_outfile. It will be a fully formatted allele counts file as returned 
#' by alleleCounter.
#' @param vcf_infile The vcf file to read in
#' @param tumour_outfile File to save the tumour counts to
#' @param samplename Parameter specifying the samplename to be used for matching the right column in the VCF
#' @param normal_outfile Optional parameter specifying where the normal output should go
#' @param refence_genome Optional parameter specifying the reference genome build used
#' @author sd11
#' @export
dumpCounts.mutect = function(vcf_infile, tumour_outfile, samplename, normal_outfile=NA, refence_genome="hg19") {
  dumpCountsFromVcf(vcf_infile, tumour_outfile, centre="mutect", normal_outfile=normal_outfile, refence_genome=refence_genome, samplename=samplename)
}

#' Dump allele counts from VCF
#' 
#' This function implements all the steps required for dumping counts from VCF
#' as supplied by the ICGC pancancer pipelines. 
#' @noRd
dumpCountsFromVcf = function(vcf_infile, tumour_outfile, centre, normal_outfile=NA, refence_genome="hg19", samplename=NA) {
  # Helper function for writing the output  
  write.output = function(output, output_file) {
    write.table(output, file=output_file, col.names=T, quote=F, row.names=F, sep="\t")
  }
  
  # Read in the vcf and dump the tumour counts in the right format
  v = VariantAnnotation::readVcf(vcf_infile, refence_genome)
  tumour = getCountsTumour(v, centre=centre, samplename=samplename)
  tumour = formatOutput(tumour, v)
  write.output(tumour, tumour_outfile)
  
  # Optionally dump the normal counts in the right format
  if (!is.na(normal_outfile)) {
    normal = getCountsNormal(v, centre=centre, samplename=samplename)
    normal = formatOutput(normal, v)
    write.output(normal, normal_outfile)
  }
}

#' Format a 4 column counts table into the alleleCounter format. This function assumes A, C, G, T format.
#' @noRd
formatOutput = function(counts_table, v) {
  output = data.frame(as.character(seqnames(v)), start(ranges(v)), counts_table, rowSums(counts_table))
  colnames(output) = c("#CHR","POS","Count_A","Count_C","Count_G","Count_T","Good_depth")
  return(output)
}

#' Dump allele counts from vcf for normal
#' 
#' Returns an allele counts table for the normal sample
#' @param v The vcf file
#' @param centre The sequencing centre of which pipeline the vcf file originates
#' @return An array with 4 columns: Counts for A, C, G, T
#' @author sd11
#' @noRd
getCountsNormal = function(v, centre="sanger", samplename=NA) {
  if (centre=="sanger") {
    return(getAlleleCounts.Sanger(v, 1))
  } else if(centre=="dkfz") {
    sample_col = which(colnames(VariantAnnotation::geno(v)$DP4) == "CONTROL")
    return(getAlleleCounts.DKFZ(v, sample_col))
  } else if (centre=="muse") {
    # Assuming the tumour name is provided
    sample_col = which(colnames(VariantAnnotation::geno(v)$AD) != samplename)
    return (getAlleleCounts.MuSE(v, sample_col))
  } else if (centre=="broad") {
    print("The Broad ICGC pipeline does not report allele counts for the matched normal")
    q(save="no", status=1)
  } else if (centre=="icgc_consensus") {
    print("The ICGC consensus pipeline does not report allele counts for the matched normal")
    q(save="no", status=1)
  } else if (centre=="mutect") {
    sample_col = which(colnames(VariantAnnotation::geno(v)$AD) == samplename)
    return(getAlleleCounts.mutect(v, sample_col))
  } else {
    print(paste("Supplied centre not supported:", centre))
    q(save="no", status=1)
  }
}

#' Dump allele counts from vcf for tumour
#' 
#' Returns an allele counts table for the tumour sample
#' @param v The vcf file
#' @param centre The sequencing centre of which pipeline the vcf file originates
#' @return An array with 4 columns: Counts for A, C, G, T
#' @author sd11
#' @noRd
getCountsTumour = function(v, centre="sanger", samplename=NA) {
  if (centre=="sanger") {
    return(getAlleleCounts.Sanger(v, 2))
  } else if (centre=="dkfz") {
    sample_col = which(colnames(VariantAnnotation::geno(v)$DP4) == "TUMOR")
    return(getAlleleCounts.DKFZ(v, sample_col))
  } else if (centre=="muse") {
    sample_col = which(colnames(VariantAnnotation::geno(v)$AD) == samplename)
    return (getAlleleCounts.MuSE(v, sample_col))
  } else if (centre=="broad") {
    return(getAlleleCounts.Broad(v, 1))
  } else if (centre=="icgc_consensus") {
    return(getAlleleCounts.ICGC_consensus(v))
  } else if (centre=="mutect") {
    sample_col = which(colnames(VariantAnnotation::geno(v)$AD) == samplename)
    return(getAlleleCounts.mutect(v, sample_col))
  } else {
    print(paste("Supplied centre not supported:", centre))
    q(save="no", status=1)
  }
}

#' Dump allele counts from Sanger pipeline vcf
#' 
#' Helper function that dumps the allele counts from a Sanger pipeline VCF file
#' @param v The vcf file
#' @param sample_col The column in which the counts are. If it's the first sample mentioned in the vcf this would be sample_col 1
#' @return An array with 4 columns: Counts for A, C, G, T
#' @author sd11
#' @noRd
getAlleleCounts.Sanger = function(v, sample_col) {
  return(cbind(VariantAnnotation::geno(v)$FAZ[,sample_col]+VariantAnnotation::geno(v)$RAZ[,sample_col], 
               VariantAnnotation::geno(v)$FCZ[,sample_col]+VariantAnnotation::geno(v)$RCZ[,sample_col], 
               VariantAnnotation::geno(v)$FGZ[,sample_col]+VariantAnnotation::geno(v)$RGZ[,sample_col], 
               VariantAnnotation::geno(v)$FTZ[,sample_col]+VariantAnnotation::geno(v)$RTZ[,sample_col]))
}

#' Dump allele counts from the DKFZ pipeline
#' 
#' Helper function that takes a sample column and fetches allele counts. As the DKFZ pipeline does not
#' provide counts for each base, but just the alt and reference, we will provide just those and the
#' other bases with 0s.
#' 
#' Note: If there are multiple ALT alleles this function will only take the first mentioned!
#' @param v The vcf file
#' @param sample_col The column in which the counts are. If it's the first sample mentioned in the vcf this would be sample_col 1
#' @return An array with 4 columns: Counts for A, C, G, T
#' @author sd11
#' @noRd
getAlleleCounts.DKFZ = function(v, sample_col) {
  # Fetch counts for both forward and reverse ref/alt
  counts = VariantAnnotation::geno(v)$DP4[,sample_col,]
  counts.ref = counts[,1] + counts[,2] # ref forward/reverse
  counts.alt = counts[,3] + counts[,4] # alt forward/reverse
  allele.ref = as.character(VariantAnnotation::ref(v))
  allele.alt = unlist(lapply(VariantAnnotation::alt(v), function(x) { as.character(x[[1]]) }))
  
  output = construct_allelecounter_table(count.ref, count.alt, allele.ref, allele.alt)
  return(output)
}

#' Dump allele counts from the MuSE pipeline
#' 
#' Helper function that takes a sample column and fetches allele counts. As the MuSE pipeline does not
#' provide counts for each base, but just the alt and reference, we will provide just those and the
#' other bases with 0s.
#' 
#' Note: If there are multiple ALT alleles this function will only take the first mentioned! 
#' Note2: This function assumes there are only two columns with read counts. The format allows for more
#' @param v The vcf file
#' @param sample_col The column in which the counts are. If it's the first sample mentioned in the vcf this would be sample_col 1
#' @return An array with 4 columns: Counts for A, C, G, T
#' @author sd11
#' @noRd
getAlleleCounts.MuSE = function(v, sample_col) {
  if (length(colnames(VariantAnnotation::geno(v)$AD)) > 2) {
    print("In getAlleleCounts.MuSE: Assuming 2 columns with read counts, but found more. This is not supported")
    q(save="no", status=1)
  }

  # An SNV can be represented by more than 1 alt alleles, here we pick the alt allele with the highest read count
  num.snvs = nrow(VariantAnnotation::geno(v)$AD)
  counts = array(NA, c(num.snvs, 2))
  allele.alt = array(NA, num.snvs)
  for (i in 1:num.snvs) {
	snv.counts = unlist(VariantAnnotation::geno(v)$AD[i,sample_col])
  	counts[i,1] = snv.counts[1] # The reference is the first base for which read counts are mentioned
	select_base = which.max(snv.counts[2:length(snv.counts)])
	allele.alt[i] = as.character(VariantAnnotation::alt(v)[[i]][select_base])
	select_base = select_base+1 # The reference is the first base for which read counts are mentioned
	counts[i,2] = snv.counts[select_base] 
  }

  allele.ref = as.character(VariantAnnotation::ref(v))
  
  output = construct_allelecounter_table(counts[i,1], counts[i,2], allele.ref, allele.alt)
  return(output)
}

#' Dump allele counts from the Broad pipeline
#' 
#' Helper function that takes a sample column and fetches allele counts. As the Broad pipeline does not
#' provide counts for each base, but just the alt and reference, we will provide just those and the
#' other bases with 0s.
#' 
#' Note: If there are multiple ALT alleles this function will only take the first mentioned! 
#' @param v The vcf file
#' @param sample_col The column in which the counts are. If it's the first sample mentioned in the vcf this would be sample_col 1
#' @return An array with 4 columns: Counts for A, C, G, T
#' @author sd11
#' @noRd
getAlleleCounts.Broad = function(v, sample_col) {
  count.ref = as.numeric(unlist(VariantAnnotation::geno(v)$ref_count))
  count.alt = as.numeric(unlist(VariantAnnotation::geno(v)$alt_count))
  allele.ref = as.character(VariantAnnotation::ref(v))
  allele.alt = unlist(lapply(VariantAnnotation::alt(v), function(x) { as.character(x[[1]]) }))
  
  output = construct_allelecounter_table(count.ref, count.alt, allele.ref, allele.alt)
  return(output)
}

#' Dump allele counts in ICGC consensus SNV format
#' 
#' This function fetches allele counts from the info field in the VCF file.
#' 
#' @param v The vcf file
#' @return An array with 4 columns: Counts for A, C, G, T
#' @author sd11
#' @noRd
#' Note: If there are multiple ALT alleles this function will only take the first mentioned! 
getAlleleCounts.ICGC_consensus = function(v) {
  count.alt = info(v)$t_alt_count
  count.ref = info(v)$t_ref_count
  allele.ref = as.character(VariantAnnotation::ref(v))
  allele.alt = unlist(lapply(VariantAnnotation::alt(v), function(x) { as.character(x[[1]]) }))
  
  output = construct_allelecounter_table(count.ref, count.alt, allele.ref, allele.alt)
  return(output)
}

#' Dump allele counts in Mutect format
#' 
#' This function fetches allele counts from the info field in the VCF file.
#' 
#' @param v The vcf file
#' @param sample_col The column in which the counts are. If it's the first sample mentioned in the vcf this would be sample_col 1
#' @return An array with 4 columns: Counts for A, C, G, T
#' @author sd11
#' @noRd
#' Note: If there are multiple ALT alleles this function will only take the first mentioned! 
getAlleleCounts.mutect = function(v, sample_col) {
  counts = do.call(rbind, geno(v)$AD[,sample_col])
  count.ref = counts[,1]
  count.alt = counts[,2]
  allele.ref = as.character(VariantAnnotation::ref(v))
  allele.alt = unlist(lapply(VariantAnnotation::alt(v), function(x) { as.character(x[[1]]) }))
  output = construct_allelecounter_table(count.ref, count.alt, allele.ref, allele.alt)
  return(output)
}

#' Function that constructs a table in the format of the allele counter
#' @param count.ref Number of reads supporting the reference allele
#' @param count.alt Number of reads supporting the variant allele
#' @param allele.ref The reference allele
#' @param allele.alt The variant allele
#' @return A data.frame consisting of four columns: Reads reporting A, C, G and T
#' @author sd11
#' @noRd
construct_allelecounter_table = function(count.ref, count.alt, allele.ref, allele.alt) {
  output = array(0, c(length(allele.ref), 4))
  nucleotides = c("A", "C", "G", "T")
  # Propagate the alt allele counts
  nucleo.index = match(allele.alt, nucleotides)
  for (i in 1:nrow(output)) {
    output[i,nucleo.index[i]] = count.alt[i]
  }
  
  # Propagate the reference allele counts
  nucleo.index = match(allele.ref, nucleotides)
  for (i in 1:nrow(output)) {
    output[i,nucleo.index[i]] = count.ref[i]
  }
  return(output)
}

############################################
# MUT 2 MUT phasing
############################################
#' Run linkage pull on SNVs vs SNVs
#' @noRd
run_linkage_pull_mut = function(output, loci_file, bam_file, bai_file) {
  #
  # Runs Linkage_pull 
  #
  nearbyMuts_file = paste(loci_file, "_nearbyMuts.txt", sep="")
  write.table(output, nearbyMuts_file, sep="\t", quote=F, col.names=F, row.names=F)
  
  zoomed_phase_file = paste(loci_file, "_zoomed_phase_info.txt", sep="")
  cmd=paste(LINKAGEPULL," ",nearbyMuts_file," ",bam_file," ",bai_file," ",zoomed_phase_file," Mut",sep="")
  print(paste("command=",cmd,sep=""))
  system(cmd,wait=T)
  
  count.data = read.delim(zoomed_phase_file,sep="\t",header=T, stringsAsFactors=F)
  
  return(count.data)
}

#' #' Phase mutation to mutation
#' #' 
#' #' Run mutation to mutation phasing. This function requires the Linkage_pull.pl script in $PATH.
#' #' @param loci_file A list of loci
#' #' @param phased_file File to save the output
#' #' @param bam_file Full path to the bam file
#' #' @param bai_file Full path to the bai file
#' #' @param max_distance The max distance of a mutation and SNP can be apart to be considered for phasing
#' #' @author sd11, dw9
#' #' @export
#' mut_mut_phasing = function(loci_file, phased_file, bam_file, bai_file, max_distance) {
#'   # Check if there are lines in the file, otherwise it will crash this script
#'   if (file.info(loci_file)$size == 0) {
#'     print("No lines in loci file")
#'     q(save="no")
#'   }
#'   
#'   # TODO: this must be removed
#'   chr.names = c(1:22,"X","Y")
#'   
#'   muts <- read.delim(loci_file, header=F, row.names=NULL, stringsAsFactors=F)
#'   names(muts) = c("CHR","POSITION","WT","MT")
#'   muts = muts[order(match(muts$CHR,chr.names),muts$POSITION),]
#'   
#'   # # TODO: Does this work with X and Y ??
#'   # if (!is.null(chrom)) {
#'   #   muts = muts[muts$CHR==chrom,]
#'   # }
#'   
#'   # Pairwise comparison of all muts, only take those that are close to eachother
#'   output <- data.frame(Chr = vector(mode="character",length=0), Pos1 = vector(mode="numeric",length=0), Ref1 = vector(mode="character",length=0), Var1 = vector(mode="character",length=0), Pos2 = vector(mode="numeric",length=0), Ref2 = vector(mode="character",length=0), Var2 = vector(mode="character",length=0))
#'   for(chr in chr.names){
#'     chr.muts = muts[muts$CHR==chr,]
#'     no.muts = nrow(chr.muts)
#'     for(i in 1:(no.muts-1)){
#'       dist = chr.muts$POSITION[(i+1):no.muts] - chr.muts$POSITION[i]
#'       inds = which(dist <= max_distance) # 700
#'       if(length(inds)>0){
#'         output <- rbind(output,data.frame(Chr = chr, Pos1 = chr.muts$POSITION[i], Ref1 = chr.muts$WT[i], Var1 = chr.muts$MT[i], Pos2 = chr.muts$POSITION[i+inds], Ref2 = chr.muts$WT[i+inds], Var2 = chr.muts$MT[i+inds]))
#'       }
#'     }
#'   }
#'   
#'   if (nrow(output) > 0) {
#'     # Run linkage pull on the chromosome locations mentioned in the data.frame output
#'     count.data = run_linkage_pull_mut(output, loci_file, bam_file, bai_file)
#'     
#'     # Categorise pairs of mutations
#'     count.data$phasing = NA
#'     for(h in 1:nrow(count.data)){
#'       counts = count.data[h,8:11]
#'       print(counts) 
#'       if(counts[2]>0 & counts[3]+counts[4] == 0){
#'         count.data$phasing[h]="phased"
#'         #}else if(counts[2]==0 & counts[3]+counts[4] > 0){
#'         #require BOTH WT-mut and mut-WT pairs
#'       }else if(counts[2]==0 & counts[3]>0 & counts[4]>0){  
#'         count.data$phasing[h]="anti-phased"
#'       }else if(counts[2]>0 && counts[3]>0 && counts[4]==0){
#'         count.data$phasing[h]="clone-subclone"
#'       }else if(counts[2]>0 && counts[3]==0 && counts[4]>0){
#'         count.data$phasing[h]="subclone-clone"
#'       }  
#'     }
#'     
#'     write.table(count.data[!is.na(count.data$phasing),],phased_file,sep="\t",quote=F,row.names=F)
#'   }
#' }

############################################
# MUT 2 CN phasing
############################################
#' Run linkage pull on SNVs vs SNPs
#' @noRd
run_linkage_pull_snp = function(loci_file, bam_file, bai_file, chr, pos1, ref1, var1, pos2, ref2, var2, af, af_phased) {
  #
  # Runs the Linkage_pull.pl script with the given columns as its input. 
  # Returns a dataframe with the given columns, plus the linkage information that was pulled from the BAM file
  #
  
  output = data.frame(Chr=chr, Pos1=pos1, Ref1=ref1, Var1=var1, Pos2=pos2, Ref2=ref2, Var2=var2, AF=af, AFphased=af_phased)
  
  linkedFile = paste(loci_file, "_muts_linkedSNPs.txt",sep="")
  write.table(output[,1:7], linkedFile, sep="\t", quote=F, col.names=F, row.names=F)
  
  outfile = paste(loci_file, "_zoomed_mutcn_phase.txt", sep="")
  cmd=paste(LINKAGEPULL," ",linkedFile," ",bam_file," ",bai_file," ",outfile," SNP",sep="")
  print(paste("command=",cmd,sep=""))
  system(cmd,wait=T)
  
  input <- read.delim(outfile, header=T, sep="\t", stringsAsFactors=F)
  linked.muts <- cbind(output, input[,8:11])
  
  return(linked.muts)
}

#' #' Phase mutation to SNP/copy number
#' #' 
#' #' Run mutation to copy number phasing. This function requires the Linkage_pull.pl script in $PATH.
#' #' Note: This function should either be run separately per chromosome and then combined with \code{\link{concat_files}}
#' #' or on all chromsomes in one go, but then the _allHaplotypeInfo.txt Battenberg files need to be concatenated first.
#' #' @param loci_file A list of loci
#' #' @param phased_file The .BAFsegmented.txt output file from Battenberg
#' #' @param hap_file Path to the _allHaplotypeInfo.txt Battenberg output file to be used
#' #' @param bam_file Full path to the bam file
#' #' @param bai_file Full path to the bai file
#' #' @param outfile File to save the output
#' #' @param max_distance The max distance of a mutation and SNP can be apart to be considered for phasing
#' #' @author sd11, dw9
#' #' @export
#' mut_cn_phasing = function(loci_file, phased_file, hap_file, bam_file, bai_file, outfile, max_distance) {
#'   
#'   if (file.info(loci_file)$size == 0) {
#'     linked.muts = data.frame(matrix(rep(NA, 13), nrow=1))
#'     colnames(linked.muts) = c("Chr","Pos1","Ref1","Var1","Pos2","Ref2","Var2","AF","AFphased","Num_linked_to_A","Num_linked_to_C","Num_linked_to_G","Num_linked_to_T")
#'     linked.muts = na.omit(linked.muts)
#'   } else if(nrow(read.delim(loci_file, header=F, stringsAsFactors=F, fill=T))==0) {
#'     linked.muts = data.frame(matrix(rep(NA, 13), nrow=1))
#'     colnames(linked.muts) = c("Chr","Pos1","Ref1","Var1","Pos2","Ref2","Var2","AF","AFphased","Num_linked_to_A","Num_linked_to_C","Num_linked_to_G","Num_linked_to_T")
#'     linked.muts = na.omit(linked.muts)
#'   } else {
#'     # TODO: is this filtering required when just supplying loci files from a single chromosome?
#'     chr.muts = read.delim(loci_file, header=F, stringsAsFactors=F, fill=T)
#'     names(chr.muts) = c("CHR","POSITION","REF_BASE","MUT_BASE")
#'     
#'     # Match phased SNPs and their haplotypes together
#'     phased = read.delim(phased_file, header=T, stringsAsFactors=F, quote="\"")
#'     # Compatible with both BB setups that have row.names and those that don't
#'     if (ncol(phased) == 6) {
#'       colnames(phased) = c("SNP", "Chr","Pos", "AF", "AFphased", "AFsegmented")
#'     } else {
#'       colnames(phased) = c("Chr","Pos", "AF", "AFphased", "AFsegmented")
#'     }
#'     
#'     # TODO: check that chromosomes are using the same names between loci and phased files
#'     phased = phased[phased$Chr %in% chr.muts$CHR,]
#'     
#'     hap.info = read.delim(hap_file, sep=" ", header=F, row.names=NULL, stringsAsFactors=F)
#'     # Compatible with both BB setups that have row.names and those that don't
#'     if (ncol(hap.info) == 7) {
#'       colnames(hap.info) = c("SNP","dbSNP","pos","ref","mut","ref_count","mut_count")
#'     } else {
#'       colnames(hap.info) = c("dbSNP","pos","ref","mut","ref_count","mut_count")
#'     }
#'     # get haplotypes that match phased heterozygous SNPs
#'     hap.info = hap.info[match(phased$Pos,hap.info$pos),]
#'     
#'     # Synchronise dfs in case some SNPs are not in hap.info
#'     selection = !is.na(hap.info$pos)
#'     hap.info = hap.info[selection,]
#'     phased = phased[selection,]
#'     
#'     #220212
#'     #phased$AF[hap.info$ref_count==1] = 1-phased$AF[hap.info$ref_count==1]
#'     phased$Ref = hap.info$ref
#'     phased$Var = hap.info$mut
#'     
#'     # Annotate the chr.muts df with the min abs distance to a phased SNP
#'     chr.muts$dist = sapply(1:dim(chr.muts)[1], function(i,p,m) min(abs(p$Pos - m$POSITION[i])), p=phased, m=chr.muts)
#'     chr.muts$snp.index = sapply(1:dim(chr.muts)[1], function(i,p,m) which.min(abs(p$Pos - m$POSITION[i])), p=phased, m=chr.muts)
#'     # Use only those to a SNP
#'     muts <- chr.muts[chr.muts$dist < max_distance,] #700
#'     snps <- phased[muts$snp.index,]
#'     
#'     linked.muts = run_linkage_pull_snp(loci_file, bam_file, bai_file, muts$CHR, muts$POSITION, muts$REF_BASE, muts$MUT_BASE, snps$Pos, snps$Ref, snps$Var, snps$AF, snps$AFphased)
#'     
#'     # Categorise where the mutation is with respect to the CN event
#'     ACGT = 10:13
#'     names(ACGT) <- c("A", "C", "G", "T")
#'     linked.muts$Parental <- rep(NA, dim(linked.muts)[1])
#'     if (nrow(linked.muts) > 0) {
#'       for (i in 1:nrow(linked.muts)) {
#'         
#'         # Fetch allele frequency
#'         af = linked.muts$AF[i]
#'         # Get number of reads covering the ref mutation allele
#'         ref_count = hap.info[hap.info$pos==linked.muts$Pos2[i],]$ref_count
#'         # Get number of reads covering the alt mutation allele
#'         alt_count = hap.info[hap.info$pos==linked.muts$Pos2[i],]$mut_count
#'         # Number of reads covering SNP allele A, that also cover mutation alt
#'         linked_to_A = linked.muts[i,ACGT[linked.muts$Ref2[i]]]
#'         # Number of reads covering SNP allele B, that also cover mutation alt
#'         linked_to_B = linked.muts[i,ACGT[linked.muts$Var2[i]]]
#'         
#'         if (af < 0.5 & alt_count==1 & linked_to_A > 0 & linked_to_B == 0) {
#'           linked.muts$Parental[i] = "MUT_ON_DELETED"
#'         } else if (af < 0.5 & alt_count==1 & linked_to_A == 0 & linked_to_B > 0) {
#'           linked.muts$Parental[i] = "MUT_ON_RETAINED"
#'         } else if (af > 0.5 & alt_count==1 & linked_to_A > 0 & linked_to_B == 0) {
#'           linked.muts$Parental[i] = "MUT_ON_RETAINED"
#'         } else if (af > 0.5 & alt_count==1 & linked_to_A == 0 & linked_to_B > 0) {
#'           linked.muts$Parental[i] = "MUT_ON_DELETED"
#'         } else if (af > 0.5 & ref_count==1 & linked_to_A > 0 & linked_to_B == 0) {
#'           linked.muts$Parental[i] = "MUT_ON_DELETED"
#'         } else if (af > 0.5 & ref_count==1 & linked_to_A == 0 & linked_to_B > 0) {
#'           linked.muts$Parental[i] = "MUT_ON_RETAINED"
#'         } else if (af < 0.5 & ref_count==1 & linked_to_A > 0 & linked_to_B == 0) {
#'           linked.muts$Parental[i] = "MUT_ON_RETAINED"
#'         } else if (af < 0.5 & ref_count==1 & linked_to_A == 0 & linked_to_B > 0) {
#'           linked.muts$Parental[i] = "MUT_ON_DELETED"
#'         }
#'       }
#'     }
#'   }
#'   
#'   write.table(linked.muts,outfile, sep="\t", quote=F, row.names=F)
#' }

############################################
# Copy number convert scripts
############################################

#' Transform ASCAT output into Battenberg
#' 
#' This function takes the ascat segments, acf and ploidy files to create
#' a minimum Battenberg subclones and rho_and_psi file for use in pre-processing.
#' These files only contain the essentials required by this package.
#' @param outfile.prefix String prefix for the output files. Internally _subclones.txt and _rho_and_psi.txt will be added.
#' @param segments.file String pointing to a segments.txt ASCAT output file
#' @param acf.file String pointing to a acf.txt ASCAT output file
#' @param ploidy.file String pointing to a ploidy.txt ASCAT output file
#' @author sd11
#' @export
ascatToBattenberg = function(outfile.prefix, segments.file, acf.file, ploidy.file) {
  # Construct a minimum Battenberg file with the copy number segments
  d = read.table(segments.file, header=T, stringsAsFactors=F)
  subclones = d[,2:6]
  colnames(subclones)[4] = "nMaj1_A"
  colnames(subclones)[5] = "nMin1_A"
  subclones$frac1_A = 1
  subclones$nMaj2_A = NA
  subclones$nMin2_A = NA
  subclones$frac2_A = NA
  subclones$SDfrac_A = NA
  write.table(subclones, paste(outfile.prefix, "_subclones.txt", sep=""), quote=F, sep="\t")
  
  # Now construct a minimum rho/psi file
  cellularity = read.table(acf.file, header=T)[1,1]
  ploidy = read.table(ploidy.file, header=T)[1,1]
  
  purity_ploidy = array(NA, c(3,5))
  colnames(purity_ploidy) = c("rho", "psi", "ploidy", "distance", "is.best")
  rownames(purity_ploidy) = c("ASCAT", "FRAC_GENOME", "REF_SEG")
  purity_ploidy["FRAC_GENOME", "rho"] = cellularity
  purity_ploidy["FRAC_GENOME", "psi"] = ploidy
  write.table(purity_ploidy, paste(outfile.prefix, "_rho_and_psi.txt", sep=""), quote=F, sep="\t")
}

#' Transform ASCAT NGS output into Battenberg
#' 
#' This function takes the ascat copynumber.caveman.csv and samplestatistics files to create
#' a minimum Battenberg subclones and rho_and_psi file for use in pre-processing.
#' These files only contain the essentials required by this package.
#' @param outfile.prefix String prefix for the output files. Internally _subclones.txt and _rho_and_psi.txt will be added.
#' @param copynumber.caveman.file String pointing to an ASCAT NGS copynumber.caveman.csv file
#' @param samplestatistics.file String point to an ASCAT NGS samplestatistics.csv file
#' @author sd11
#' @export
ascatNgsToBattenberg = function(outfile.prefix, copynumber.caveman.file, samplestatistics.file) {
  # Construct a minimum Battenberg file with the copy number segments
  d = read.table(copynumber.caveman.file, sep=",", header=F, stringsAsFactors=F)
  colnames(d) = c("count", "chr", "startpos", "endpos", "normal_total", "normal_minor", "tumour_total", "tumour_minor")
  subclones = d[,2:4]
  subclones$nMaj1_A = d$tumour_total-d$tumour_minor
  subclones$nMin1_A = d$tumour_minor
  subclones$frac1_A = 1
  subclones$nMaj2_A = NA
  subclones$nMin2_A = NA
  subclones$frac2_A = NA
  write.table(subclones, paste(outfile.prefix, "_subclones.txt", sep=""), quote=F, sep="\t")
  
  # Now construct a minimum rho/psi file
  samplestats = read.table(samplestatistics.file, header=F, stringsAsFactors=F)
  
  purity_ploidy = array(NA, c(3,5))
  colnames(purity_ploidy) = c("rho", "psi", "ploidy", "distance", "is.best")
  rownames(purity_ploidy) = c("ASCAT", "FRAC_GENOME", "REF_SEG")
  purity_ploidy["FRAC_GENOME", "rho"] = samplestats[samplestats$V1=="rho",2]
  purity_ploidy["FRAC_GENOME", "psi"] = samplestats[samplestats$V1=="Ploidy",2]
  write.table(purity_ploidy, paste(outfile.prefix, "_rho_and_psi.txt", sep=""), quote=F, sep="\t")
}


############################################
# Combine all the steps into a DP input file
############################################

#' Check if the Y chromosome is present and the donor is male. If this statement is TRUE we need to 
#' there is no estimate of the Y chromosome, which BB currently provides as two copies of X
#' @param subclone.data A read-in Battenberg subclones.txt file as a data.frame
#' @return The input data.frame with a Y chromosome entry added and possible X chromosome adjustment
#' @author sd11
#' @noRd
addYchromToBattenberg = function(subclone.data) {
  # Take the largest segment on X chromosome
  xlargest = which.max(subclone.data[subclone.data$chr=="X", ]$endpos - subclone.data[subclone.data$chr=="X", ]$startpos)
  xlargest = subclone.data[subclone.data$chr=="X", ][xlargest,]
  
  # Get the total availability of X
  xnmaj = xlargest$nMaj1_A * xlargest$frac1_A + ifelse(is.na(xlargest$frac2_A), 0, xlargest$nMaj2_A * xlargest$frac2_A)
  xnmin = xlargest$nMin1_A * xlargest$frac1_A + ifelse(is.na(xlargest$frac2_A), 0, xlargest$nMin2_A * xlargest$frac2_A)
  
  # If there are no two copies we assume that Y was lost and make no changes
  if (xnmaj + xnmin >=2 & xnmin >= 1) {
    # We have at least 1 copy of either allele. One must therefore be the Y chromosome
    
    # Remove a copy of the X chromosome
    ynMaj_1 = subclone.data[subclone.data$chr=="X", c("nMin1_A")]
    yfrac_1 = subclone.data[subclone.data$chr=="X", c("frac1_A")]
    subclone.data[subclone.data$chr=="X", c("nMin1_A")] = 0
    
    # if there was subclonal copy number we need to remove/adapt that too
    if (!is.na(xlargest$frac2_A)) {
      ynMaj_2 = subclone.data[subclone.data$chr=="X", c("nMin2_A")]
      yfrac_2 = subclone.data[subclone.data$chr=="X", c("frac2_A")]
      subclone.data[subclone.data$chr=="X", c("nMin2_A")] = 0
    } else {
      ynMaj_2 = NA
      yfrac_2 = NA
    }
  } else {
    # No 2 copies, we assume that Y has been lost
    ynMaj_1 = 0
    yfrac_1 = 1
    ynMaj_2 = NA
    yfrac_2 = NA
  }
  
  # Add Y
  subclone.data = rbind(subclone.data, 
                        data.frame(chr="Y", startpos=0, endpos=59373566, BAF=NA, pval=NA, LogR=NA, ntot=NA, 
                                   nMaj1_A=ynMaj_1, nMin1_A=0, frac1_A=yfrac_1, nMaj2_A=ynMaj_2, nMin2_A=NA, frac2_A=yfrac_2, SDfrac_A=NA, SDfrac_A_BS=NA, frac1_A_0.025=NA, frac1_A_0.975=NA, 
                                   nMaj1_B=NA, nMin1_B=NA, frac1_B=NA, nMaj2_B=NA, nMin2_B=NA, frac2_B=NA, SDfrac_B=NA, SDfrac_B_BS=NA, frac1_B_0.025=NA, frac1_B_0.975=NA, 
                                   nMaj1_C=NA, nMin1_C=NA, frac1_C=NA, nMaj2_C=NA, nMin2_C=NA, frac2_C=NA, SDfrac_C=NA, SDfrac_C_BS=NA, frac1_C_0.025=NA, frac1_C_0.975=NA, 
                                   nMaj1_D=NA, nMin1_D=NA, frac1_D=NA, nMaj2_D=NA, nMin2_D=NA, frac2_D=NA, SDfrac_D=NA, SDfrac_D_BS=NA, frac1_D_0.025=NA, frac1_D_0.975=NA, 
                                   nMaj1_E=NA, nMin1_E=NA, frac1_E=NA, nMaj2_E=NA, nMin2_E=NA, frac2_E=NA, SDfrac_E=NA, SDfrac_E_BS=NA, frac1_E_0.025=NA, frac1_E_0.975=NA, 
                                   nMaj1_F=NA, nMin1_F=NA, frac1_F=NA, nMaj2_F=NA, nMin2_F=NA, frac2_F=NA, SDfrac_F=NA, SDfrac_F_BS=NA, frac1_F_0.025=NA, frac1_F_0.975=NA)
  )
  return(subclone.data)
}

#' Main function that creates the DP input file. A higher level function should be called by users
#' @noRd
GetDirichletProcessInfo<-function(outputfile, cellularity, info, subclone.file, is.male = F, out.dir = NULL, SNP.phase.file = NULL, mut.phase.file = NULL, adjust_male_y_chrom=F){
  
  subclone.data = read.table(subclone.file,sep="\t",header=T,stringsAsFactors=F)
  # Add in the Y chrom if donor is male and Battenberg hasn't supplied it (BB returns X/Y ad multiple copies of X for men)
  if (is.male & (! "Y" %in% subclone.data$chr) & adjust_male_y_chrom) {
    subclone.data = addYchromToBattenberg(subclone.data)
  }
  subclone.data.gr = GenomicRanges::GRanges(subclone.data$chr, IRanges::IRanges(subclone.data$startpos, subclone.data$endpos), rep('*', nrow(subclone.data)))
  elementMetadata(subclone.data.gr) = subclone.data[,3:ncol(subclone.data)]
  subclone.data.gr = sortSeqlevels(subclone.data.gr)
  
  info_anno = as.data.frame(cbind(array(NA, c(length(info), 7)))) 
  colnames(info_anno) = c('subclonal.CN','nMaj1','nMin1','frac1','nMaj2','nMin2','frac2')
  inds = findOverlaps(info, subclone.data.gr)  
  info_anno[queryHits(inds),2:7] = subclone.data[subjectHits(inds),][,c("nMaj1_A", "nMin1_A", "frac1_A", "nMaj2_A", "nMin2_A", "frac2_A")]
  
  CN1 = (info_anno[queryHits(inds),]$nMaj1 + info_anno[queryHits(inds),]$nMin1) * info_anno[queryHits(inds),]$frac1
  # If frac is not one for allele 1 (i.e. not only CN data for allele 1), add the CN contribution of allele 2 as well
  CN2 = (info_anno[queryHits(inds),]$nMaj2 + info_anno[queryHits(inds),]$nMin2) * info_anno[queryHits(inds),]$frac2 * ifelse(info_anno[queryHits(inds),]$frac1 != 1, 1, 0)
  CN2[is.na(CN2)] = 0
  info_anno[queryHits(inds),]$subclonal.CN = CN1 + CN2
  elementMetadata(info) = cbind(as.data.frame(elementMetadata(info)), info_anno)
  
  info$phase="unphased"
  if (!is.null(SNP.phase.file) & SNP.phase.file!="NA") {
    phasing = read.table(SNP.phase.file, header=T, stringsAsFactors=F) #header=T, skip=1, 
    phasing.gr = GenomicRanges::GRanges(phasing$Chr, IRanges::IRanges(phasing$Pos1, phasing$Pos1))
    phasing.gr$phasing = phasing$Parental
    inds = findOverlaps(info, phasing.gr)  
    info$phase[queryHits(inds)] = phasing.gr$phasing[subjectHits(inds)]
    
    info$phase[is.na(info$phase)]="unphased"
  }
  
  if(is.male & "chr" %in% names(info)){
    normal.CN = rep(2,nrow(info))
    normal.CN[info$chr=="X"| info$chr=="Y"] = 1
    info$mutation.copy.number = mutationBurdenToMutationCopyNumber(info$mut.count/ (info$mut.count + info$WT.count) , info$subclonal.CN, cellularity, normal.CN)
  }else{
    info$mutation.copy.number = mutationBurdenToMutationCopyNumber(info$mut.count/ (info$mut.count + info$WT.count) , info$subclonal.CN, cellularity)
  }
  
  # convert MCN to subclonal fraction - tricky for amplified mutations
  info$subclonal.fraction = info$mutation.copy.number
  expected.burden.for.MCN = mutationCopyNumberToMutationBurden(rep(1,length(info)),info$subclonal.CN,cellularity)
  non.zero.indices = which(info$mut.count>0 & !is.na(expected.burden.for.MCN))
  #test for mutations in more than 1 copy
  p.vals = sapply(1:length(non.zero.indices),function(v,e,i){
    prop.test(v$mut.count[i],v$mut.count[i] + v$WT.count[i],e[i],alternative="greater")$p.value 
  },v=info[non.zero.indices,], e=expected.burden.for.MCN[non.zero.indices])
  amplified.muts = non.zero.indices[p.vals<=0.05]
  
  info$no.chrs.bearing.mut = 1	
  
  #copy numbers of subclones can only differ by 1 or 0 (as assumed when calling subclones)
  if(length(amplified.muts)>0){		
    for(a in 1:length(amplified.muts)){
      max.CN2=0
      #use phasing info - if on 'deleted' (lower CN chromosome), use the minor copy number
      if(info$phase[amplified.muts[a]]=="MUT_ON_DELETED"){
        print("mut on minor chromosome")
        max.CN1 = info$nMin1[amplified.muts[a]]
        frac1 = info$frac1[amplified.muts[a]]
        frac2=0
        if(!is.na(info$nMin2[amplified.muts[a]])){
          #swap subclones, so that the one with the higher CN is first
          if(info$nMin2[amplified.muts[a]]>max.CN1){
            max.CN2 = max.CN1
            max.CN1 = info$nMin2[amplified.muts[a]]
            frac2 = frac1
            frac1 = info$frac2[amplified.muts[a]]
          }else{
            max.CN2 = info$nMin2[amplified.muts[a]]
            frac2 = info$frac2[amplified.muts[a]]
          }
        }					
      }else{
        max.CN1 = info$nMaj1[amplified.muts[a]]
        frac1 = info$frac1[amplified.muts[a]]
        frac2=0
        if(!is.na(info$nMaj2[amplified.muts[a]])){
          #swap subclones, so that the one with the higher CN is first
          if(info$nMaj2[amplified.muts[a]]>max.CN1){
            max.CN2 = max.CN1
            max.CN1 = info$nMaj2[amplified.muts[a]]
            frac2 = frac1
            frac1 = info$frac2[amplified.muts[a]]						
          }else{
            max.CN2 = info$nMaj2[amplified.muts[a]]
            frac2 = info$frac2[amplified.muts[a]]
          }
        }	
      }
      best.err = info$mutation.copy.number[amplified.muts[a]] - 1
      best.CN=1
      for(j in 1:max.CN1){
        for(k in (j-1):min(j,max.CN2)){
          potential.CN = j * frac1 + k * frac2
          err = abs(info$mutation.copy.number[amplified.muts[a]]/potential.CN-1)
          if(err<best.err){
            info$no.chrs.bearing.mut[amplified.muts[a]] = potential.CN
            best.err=err
            best.CN = potential.CN
          }
        }
      }
      info$subclonal.fraction[amplified.muts[a]] = info$mutation.copy.number[amplified.muts[a]] / best.CN
    }
  }
  
  ##########################################################################
  #test for subclonal mutations
  
  #test whether mut burden is less than expected value for MCN = 1
  p.vals1 = sapply(1:length(non.zero.indices),function(v,e,i){
    prop.test(v$mut.count[i],v$mut.count[i] + v$WT.count[i], e[i], alternative="less")$p.value
  },v=info[non.zero.indices,], e=expected.burden.for.MCN[non.zero.indices])
  #test whether mut burden is above error rate (assumed to be 1 in 200)
  p.vals2 = sapply(1:length(non.zero.indices),function(v,i){
    prop.test(v$mut.count[i],v$mut.count[i] + v$WT.count[i],0.005,alternative="greater")$p.value
  },v=info[non.zero.indices,])
  
  subclonal.muts = non.zero.indices[p.vals1<=0.05 & p.vals2<=0.05]
  
  # use subclonal CN that minimises the difference in subclonal fraction from 1
  if(length(subclonal.muts)>0){
    for(a in 1:length(subclonal.muts)){
      #if there are no subclonal CNVs, don't adjust subclonal fraction
      if(is.na(info$frac2[subclonal.muts[a]])){next}
      #assume subclonal muts are on one chromosome copy, therefore mutation copy number must be subclonal fraction of the higher CN subclone (i.e. lost in lower CN subclone) or 1 (i.e. present in both subclones)
      if(info$nMaj1[subclonal.muts[a]]+info$nMin1[subclonal.muts[a]] > info$nMaj2[subclonal.muts[a]]+info$nMin2[subclonal.muts[a]]){	
        possible.subclonal.fractions = c(info$frac1[subclonal.muts[a]],1)
      }else{
        possible.subclonal.fractions = c(info$frac2[subclonal.muts[a]],1)
      }
      best.CN = possible.subclonal.fractions[which.min(abs(info$mutation.copy.number[subclonal.muts[a]]/possible.subclonal.fractions - 1))]
      #extra test 200313 - check whether subclonal CN results in clonal mutation, otherwise subclonal CN doesn't explain subclonal MCN
      if(best.CN != 1 & prop.test(info$mut.count[subclonal.muts[a]],info$mut.count[subclonal.muts[a]]+info$WT.count[subclonal.muts[a]],expected.burden.for.MCN[subclonal.muts[a]] * best.CN)$p.value > 0.05){
        info$subclonal.fraction[subclonal.muts[a]] = info$mutation.copy.number[subclonal.muts[a]] / best.CN
        info$no.chrs.bearing.mut[subclonal.muts[a]] = best.CN
      }
    }
  }	
  
  possible.zero.muts = intersect((1:length(info))[-non.zero.indices],which(!is.na(info$nMin1)))
  possible.zero.muts = c(possible.zero.muts,non.zero.indices[p.vals2>0.05])
  if(length(possible.zero.muts)>0){
    del.indices = which(info$nMin1[possible.zero.muts]==0 & !info$phase[possible.zero.muts]=="MUT_ON_RETAINED")
    info$subclonal.fraction[possible.zero.muts[del.indices]] = NA
    info$no.chrs.bearing.mut[possible.zero.muts[del.indices]] = 0
  }
  
  # convert GenomicRanges object to df
  df = data.frame(chr=as.data.frame(seqnames(info)),
                  start=start(info)-1,
                  end=end(info))
  df = cbind(df, as.data.frame(elementMetadata(info)))
  colnames(df)[1] = "chr"
  df = df[with(df, order(chr)),]
  print(head(df))
  write.table(df, outputfile, sep="\t", row.names=F, quote=F)
}

#' Convenience function to load the cellularity from a rho_and_psi file
#' @noRd
GetCellularity <- function(rho_and_psi_file) {
  d = read.table(rho_and_psi_file, header=T, stringsAsFactors=F)
  return(d['FRAC_GENOME','rho'])
}

#' Convenience function to fetch WTCount and mutCount
#'@noRd
GetWTandMutCount <- function(loci_file, allele_frequencies_file) {
  subs.data = read.table(loci_file, sep='\t', header=F, stringsAsFactors=F)
  subs.data = subs.data[order(subs.data[,1], subs.data[,2]),]
  
  # Replace dinucleotides and longer with just the first base. Here we assume the depth of the second base is the same and the number of dinucleotides is so low that removing the second base is negligable
  subs.data[,3] = apply(as.data.frame(subs.data[,3]), 1, function(x) { substring(x, 1,1) })
  subs.data[,4] = apply(as.data.frame(subs.data[,4]), 1, function(x) { substring(x, 1,1) })
  
  subs.data.gr = GenomicRanges::GRanges(subs.data[,1], IRanges::IRanges(subs.data[,2], subs.data[,2]), rep('*', nrow(subs.data)))
  elementMetadata(subs.data.gr) = subs.data[,c(3,4)]
  
  alleleFrequencies = read.delim(allele_frequencies_file, sep='\t', header=T, quote=NULL, stringsAsFactors=F)
  alleleFrequencies = alleleFrequencies[order(alleleFrequencies[,1],alleleFrequencies[,2]),]
  print(head(alleleFrequencies))
  alleleFrequencies.gr = GenomicRanges::GRanges(alleleFrequencies[,1], IRanges::IRanges(alleleFrequencies[,2], alleleFrequencies[,2]), rep('*', nrow(alleleFrequencies)))
  elementMetadata(alleleFrequencies.gr) = alleleFrequencies[,3:7]
  
  # Subset the allele frequencies by the loci we would like to include
  overlap = findOverlaps(subs.data.gr, alleleFrequencies.gr)
  alleleFrequencies = alleleFrequencies[subjectHits(overlap),]
  
  nucleotides = c("A","C","G","T")
  ref.indices = match(subs.data[,3],nucleotides)
  alt.indices = match(subs.data[,4],nucleotides)
  WT.count = as.numeric(unlist(sapply(1:nrow(alleleFrequencies),function(v,a,i){v[i,a[i]+2]},v=alleleFrequencies,a=ref.indices)))
  mut.count = as.numeric(unlist(sapply(1:nrow(alleleFrequencies),function(v,a,i){v[i,a[i]+2]},v=alleleFrequencies,a=alt.indices)))
  
  combined = data.frame(chr=subs.data[,1],pos=subs.data[,2],WTCount=WT.count, mutCount=mut.count)
  colnames(combined) = c("chr","pos","WT.count","mut.count")
  
  combined.gr = GenomicRanges::GRanges(seqnames(subs.data.gr), ranges(subs.data.gr), rep('*', nrow(subs.data)))
  elementMetadata(combined.gr) = data.frame(WT.count=WT.count, mut.count=mut.count)
  
  combined.gr = sortSeqlevels(combined.gr)
  return(combined.gr)
}

##############################################
# GetDirichletProcessInfo
##############################################
#' Create the DPClust input file
#' 
#' Function that takes allele counts and a copy number profile to estimate mutation copy number,
#' cancer cell fraction and multiplicity for each point mutation.
#' @param loci_file Simple four column file with chromosome, position, reference allele and alternative allele
#' @param allele_frequencies_file Output file from alleleCounter on the specified loci
#' @param cellularity_file Full path to a Battenberg rho_and_psi output file
#' @param subclone_file Full path to a Battenberg subclones.txt output file
#' @param gender Specify male or female
#' @param SNP.phase.file Output file from mut_mut_phasing, supply NA (as char) when not available
#' @param mut.phase.file Output file from mut_cn_phasing, supply NA (as char) when not available
#' @param output_file Name of the output file
#' @author sd11
#' @export
runGetDirichletProcessInfo = function(loci_file, allele_frequencies_file, cellularity_file, subclone_file, gender, SNP.phase.file, mut.phase.file, output_file) {
  if(gender == 'male' | gender == 'Male') {
    isMale = T
  } else if(gender == 'female' | gender == 'Female') {
    isMale = F
  } else {
    stop("Unknown gender supplied, exit.")
  }
  info_counts = GetWTandMutCount(loci_file, allele_frequencies_file)
  cellularity = GetCellularity(cellularity_file)
  GetDirichletProcessInfo(output_file, cellularity, info_counts, subclone_file, is.male=isMale, SNP.phase.file=SNP.phase.file, mut.phase.file=mut.phase.file)
}

##############################################
# dpIn to VCF
##############################################
#' DPClust input file to vcf
#' 
#' Transform a dirichlet input file into a VCF with the same info. It filters out mutations in areas that are not contained in the supplied genome index (fai file) or are contained in the ignore file (ign file)
#' It takes the DP input file created by runGetDirichletProcessInfo and combines the columns with the vcf file supplied. Finally it gzips and indexes the file
#' @param vcf_infile Filename of the VCF file to use as a base
#' @param dpIn_file Filename of a DP input file
#' @param vcf_outfile Filename of the output file
#' @param fai_file Path to a reference genome index containing chromosome names
#' @param ign_file Path to a file containing contigs to ignore
#' @param genome Specify the reference genome for reading in the VCF
#' @author sd11
#' @export
dpIn2vcf = function(vcf_infile, dpIn_file, vcf_outfile, fai_file, ign_file, genome="hg19") {
  vcf = VariantAnnotation::readVcf(vcf_infile, genome=genome)
  
  # Remove muts on chroms not to look at
  fai = parseFai(fai_file)
  ign = parseIgnore(ign_file)
  allowed_chroms = which(!(fai$chromosome %in% ign$chromosome))
  vcf = vcf[as.vector(seqnames(vcf)) %in% allowed_chroms,]
  
  # Read in the to be annotated data
  dat = read.table(dpIn_file, header=T, stringsAsFactors=F)
  
  # Annotate the columns into the VCF object  
  vcf = addVcfInfoCol(vcf, dat$WT.count, 1, "Integer", "Number of reads carrying the wild type allele", "WC")
  vcf = addVcfInfoCol(vcf, dat$mut.count, 1, "Integer", "Number of reads carrying the mutant allele", "MC")
  vcf = addVcfInfoCol(vcf, dat$subclonal.CN, 1, "Float", "Total subclonal copynumber", "TSC")
  vcf = addVcfInfoCol(vcf, dat$nMaj1, 1, "Float", "Copynumber of major allele 1", "NMA1")
  vcf = addVcfInfoCol(vcf, dat$nMin1, 1, "Float", "Copynumber of minor allele 1", "NMI1")
  vcf = addVcfInfoCol(vcf, dat$frac1, 1, "Float", "Fraction of tumour cells containing copy number state 1", "FR1")
  vcf = addVcfInfoCol(vcf, dat$nMaj2, 1, "Float", "Copynumber of major allele 2", "NMA2")
  vcf = addVcfInfoCol(vcf, dat$nMin2, 1, "Float", "Copynumber of minor allele 2", "NMI2")
  vcf = addVcfInfoCol(vcf, dat$frac2, 1, "Float", "Fraction of tumour cells containing copy number state 2", "FR2")
  vcf = addVcfInfoCol(vcf, dat$phase, 1, "String", "Phase relation mutation and copynumber", "PHS")
  vcf = addVcfInfoCol(vcf, dat$mutation.copy.number, 1, "Float", "Mutation copy number", "MCN")
  vcf = addVcfInfoCol(vcf, dat$subclonal.fraction, 1, "Float", "Fraction of tumour cells carying this mutation", "CCF")
  vcf = addVcfInfoCol(vcf, dat$no.chrs.bearing.mut, 1, "Float", "Number of chromosomes bearing the mutation", "NCBM")
  
  # Write the output, gzip and index it
  VariantAnnotation::writeVcf(vcf, file=vcf_outfile, index=T)
}

#' Convenience function that annotates a column into the supplied VCF object
#' @noRd
addVcfInfoCol = function(vcf, data, number, type, description, abbreviation) {
  i = header(vcf)@header$INFO
  exptData(vcf)$header@header$INFO <- rbind(i, S4Vectors::DataFrame(Number=number, Type=type, Description=description, row.names=abbreviation))
  info(vcf)[,abbreviation] <- as(data, "CharacterList")
  return(vcf)
}
shaghayeghsoudi/DPclust3p_Customised documentation built on March 18, 2022, 5:28 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
shaghayeghsoudi/DPclust3p_Customised
Dirichlet Process Clustering Pre-Processing Package

inst/todo/preprocessing_alternative_phasing.R
In shaghayeghsoudi/DPclust3p_Customised: Dirichlet Process Clustering Pre-Processing Package

R Package Documentation

Browse R Packages

We want your feedback!

shaghayeghsoudi/DPclust3p_Customised Dirichlet Process Clustering Pre-Processing Package

inst/todo/preprocessing_alternative_phasing.R In shaghayeghsoudi/DPclust3p_Customised: Dirichlet Process Clustering Pre-Processing Package

R Package Documentation

Browse R Packages

We want your feedback!

shaghayeghsoudi/DPclust3p_Customised
Dirichlet Process Clustering Pre-Processing Package

inst/todo/preprocessing_alternative_phasing.R
In shaghayeghsoudi/DPclust3p_Customised: Dirichlet Process Clustering Pre-Processing Package