DPclust3p_Customised: Dirichlet Process Clustering Pre-Processing Package

Documented in dumpCounts.maf

################################################
# Allele counting functions - SNV, indel and SV
################################################
#' Run alleleCount
### modified by Shaghayegh Soudi to work on MAF file #####
#' Dump allele counts from maf file
#'
#' Dump allele counts stored in the sample columns of the maf file. Output will go into a file
#' supplied as tumour_outfile and optionally normal_outfile. It will be a fully formatted
#' allele counts file as returned by alleleCounter.
#'
#' @param maf_file The maf file to read in
#' @param tumour_outfile File to save the tumour counts to
#' @param normal_outfile Optional parameter specifying where the normal output should go
#' @param dummy_alt_allele 
#' @param dummy_ref_allele 
#' @param samplename Optional parameter specifying the samplename to be used for matching the right column in the maf
#'
#' @author sd11
#' @export




dumpCounts.maf = function(maf_file, tumour_outfile,  normal_outfile=NA, samplename=NA, dummy_alt_allele=NA, dummy_ref_allele=NA) {
   
   for (maf_file in maf_file) { 
   maf = read.delim(maf_file, header = TRUE, sep = "\t")
   dummy_ref_allele = "A"
   dummy_alt_allele = "C"

   ### SNVs ###
   maf_snv<-maf[maf$Variant_Type == "SNP",]
   counts_maf_snv = maf_snv[,c("t_ref_count","t_alt_count")]
   rownames(counts_maf_snv)<-paste(maf_snv$Chromosome,maf_snv$Start_Position, sep = ":")
   allele.ref.snv = as.character(maf_snv[,"Reference_Allele"])
   allele.alt.snv = as.character(maf_snv[,"Tumor_Seq_Allele2"])
   count.ref.snv = counts_maf_snv[,1]
   count.alt.snv = counts_maf_snv[,2]
   

   ### else: insetions, delitions, DNPs, TBPs 
   maf_else<-maf[maf$Variant_Type != "SNP",]
   counts_maf_else = maf_else[,c("t_ref_count","t_alt_count")]
   rownames(counts_maf_else)<-paste(maf_else$Chromosome,maf_else$Start_Position, sep = ":")
   count.ref.else = counts_maf_else[,1]
   count.alt.else = counts_maf_else[,2]
   allele.ref.else=rep(dummy_ref_allele, length(count.ref.else))
   allele.alt.else=rep(dummy_alt_allele, length(count.alt.else)) 
   
  

     ## output table for SNVs
     output.snv = array(0, c(length(allele.ref.snv), 4))
     nucleotides = c("A", "C", "G", "T")
     # Propagate the alt allele counts
     nucleo.index.snv = match(allele.alt.snv, nucleotides)
  
     for (i in 1:nrow(output.snv)) {
     output.snv[i,nucleo.index.snv[i]] = count.alt.snv[i]
     } ## for i loop
  
     # Propagate the reference allele counts
     nucleo.index.snv = match(allele.ref.snv, nucleotides)
     
     for (i in 1:nrow(output.snv)) {
     output.snv[i,nucleo.index.snv[i]] = count.ref.snv[i]

    }  ## second for i loop

  if (nrow(maf_snv)==0) {
    output = data.frame(matrix(ncol = 7, nrow = 0))
   } else {

    output.snv<-cbind.data.frame(maf_snv$Chromosome,maf_snv$Start_Position,output.snv, rowSums(output.snv))
    output.snv[,1]<-gsub("chr","",output.snv[,1]) 

    colnames(output.snv) <- c("#CHR","POS","Count_A","Count_C","Count_G","Count_T","Good_depth")

   }  # else                    



## output table for else (insertions, delitions, DNPs, TNPs)
     output.else = array(0, c(length(allele.ref.else), 4))
     nucleotides = c("A", "C", "G", "T")
     # Propagate the alt allele counts
     nucleo.index.else = match(allele.alt.else, nucleotides)
  
     for (i in 1:nrow(output.else)) {
     output.else[i,nucleo.index.else[i]] = count.alt.else[i]
     } ## for i loop
  
     # Propagate the reference allele counts
     nucleo.index.else = match(allele.ref.else, nucleotides)
     
     for (i in 1:nrow(output.else)) {
     output.else[i,nucleo.index.else[i]] = count.ref.else[i]

    }  ## second for i loop

    if (nrow(maf_else)==0) {
    output.else = data.frame(matrix(ncol = 7, nrow = 0))
    } else {

    output.else<-cbind.data.frame(maf_else$Chromosome,maf_else$Start_Position,output.else, rowSums(output.else))
    output.else[,1]<-gsub("chr","",output.else[,1]) 

    colnames(output.else) <- c("#CHR","POS","Count_A","Count_C","Count_G","Count_T","Good_depth")

    }  # else                    

    output_snv_else<-rbind(output.snv,output.else)
    output<-output_snv_else[order(output_snv_else[,1], output_snv_else[,2]), ]
    write.table(output, col.names=T, quote=F, row.names=F, file=tumour_outfile, sep="\t")   ### adjusted by Shaghayegh
   }
}


###############################################
###############################################


#alleleCount = function(locifile, bam, outfile, min_baq=20, min_maq=35) {
#  cmd = paste(ALLELECOUNTER,
#              "-b", bam,
#              "-o", outfile,
#              "-l", locifile,
#              "-m", min_baq,
#              "-q", min_maq, sep=" ")
#  system(cmd, wait=T)
#}

#' Obtain alt allele for all variants as a vector
#' 
#' Dumps the alt allele in a vector. It takes only the first element if multiple alt alleles are listed
#' @param vcf A VCF object
#' @return A vector with the alt allele
#' @author sd11
#' @export
#getAltAllele = function(vcf) {
  #clist = CharacterList(VariantAnnotation::alt(vcf))
  #mult = elementNROWS(clist) > 1L
  #mult = elementLengths(clist) > 1L 
  #if (any(mult)) {
    #warning(paste("Took first alt allele only for these variants: ", paste(seqnames(vcf), start(vcf))))
  #}
  
#  clist[mult] = lapply(clist[mult], paste0, collapse=",")
#  return(unlist(clist))
#}

#' Dump allele counts from vcf - Sanger ICGC pancancer pipeline
#'
#' Dump allele counts stored in the sample columns of the VCF file. Output will go into a file
#' supplied as tumour_outfile and optionally normal_outfile. It will be a fully formatted
#' allele counts file as returned by alleleCounter.
#' @param vcf_infile The vcf file to read in
#' @param tumour_outfile File to save the tumour counts to
#' @param normal_outfile Optional parameter specifying where the normal output should go
#' @param refence_genome Optional parameter specifying the reference genome build used
#' @param samplename Optional parameter specifying the samplename to be used for matching the right column in the VCF
#' @author sd11
#' @export
#dumpCounts.Sanger = function(vcf_infile, tumour_outfile, normal_outfile=NA, refence_genome="hg19", samplename=NA) {
#  dumpCountsFromVcf(vcf_infile, tumour_outfile, centre="sanger", normal_outfile=normal_outfile, refence_genome=refence_genome, samplename=samplename)
#}

#' Dump allele counts from vcf - DKFZ ICGC pancancer pipeline
#'
#' Dump allele counts stored in the sample columns of the VCF file. Output will go into a file
#' supplied as tumour_outfile and optionally normal_outfile. It will be a fully formatted
#' allele counts file as returned by alleleCounter.
#' @param vcf_infile The vcf file to read in
#' @param tumour_outfile File to save the tumour counts to
#' @param normal_outfile Optional parameter specifying where the normal output should go
#' @param refence_genome Optional parameter specifying the reference genome build used
#' @param samplename Optional parameter specifying the samplename to be used for matching the right column in the VCF
#' @author sd11
#' @export
#dumpCounts.DKFZ = function(vcf_infile, tumour_outfile, normal_outfile=NA, refence_genome="hg19", samplename=NA) {
#  dumpCountsFromVcf(vcf_infile, tumour_outfile, centre="dkfz", normal_outfile=normal_outfile, refence_genome=refence_genome, samplename=samplename)
#}

#' Dump allele counts from vcf - Broad ICGC pancancer pipeline
#'
#' Dump allele counts stored in the sample columns of the VCF file. Output will go into a file
#' supplied as tumour_outfile and optionally normal_outfile. It will be a fully formatted
#' allele counts file as returned by alleleCounter.
#' @param vcf_infile The vcf file to read in
#' @param tumour_outfile File to save the tumour counts to
#' @param normal_outfile Optional parameter specifying where the normal output should go
#' @param refence_genome Optional parameter specifying the reference genome build used
#' @param samplename Optional parameter specifying the samplename to be used for matching the right column in the VCF
#' @author sd11
#' @export
#dumpCounts.Broad = function(vcf_infile, tumour_outfile, normal_outfile=NA, refence_genome="hg19", samplename=NA) {
#  dumpCountsFromVcf(vcf_infile, tumour_outfile, centre="broad", normal_outfile=normal_outfile, refence_genome=refence_genome, samplename=samplename)
#}

#' Dump allele counts from vcf - MuSE ICGC pancancer pipeline
#'
#' Dump allele counts stored in the sample columns of the VCF file. Output will go into a file
#' supplied as tumour_outfile and optionally normal_outfile. It will be a fully formatted
#' allele counts file as returned by alleleCounter.
#' @param vcf_infile The vcf file to read in
#' @param tumour_outfile File to save the tumour counts to
#' @param normal_outfile Optional parameter specifying where the normal output should go
#' @param refence_genome Optional parameter specifying the reference genome build used
#' @param samplename Optional parameter specifying the samplename to be used for matching the right column in the VCF
#' @author sd11
#' @export
#dumpCounts.Muse = function(vcf_infile, tumour_outfile, normal_outfile=NA, refence_genome="hg19", samplename=NA) {
#  dumpCountsFromVcf(vcf_infile, tumour_outfile, centre="muse", normal_outfile=normal_outfile, refence_genome=refence_genome, samplename=samplename)
#}

#' Dump allele counts from vcf - ICGC pancancer consensus SNV pipeline
#'
#' Dump allele counts stored in the info column of the VCF file. Output will go into a file
#' supplied as tumour_outfile. It will be a fully formatted allele counts file as returned 
#' by alleleCounter. There are no counts for the matched normal. 
#' @param vcf_infile The vcf file to read in
#' @param tumour_outfile File to save the tumour counts to
#' @param normal_outfile Optional parameter specifying where the normal output should go
#' @param refence_genome Optional parameter specifying the reference genome build used
#' @param samplename Optional parameter specifying the samplename to be used for matching the right column in the VCF
#' @author sd11
#' @export
#dumpCounts.ICGCconsensusSNV = function(vcf_infile, tumour_outfile, normal_outfile=NA, refence_genome="hg19", samplename=NA) {
#  dumpCountsFromVcf(vcf_infile, tumour_outfile, centre="icgc_consensus_snv", normal_outfile=normal_outfile, refence_genome=refence_genome, samplename=samplename)
#}

#' Dump allele counts from vcf - ICGC pancancer consensus indel pipeline
#'
#' Dump allele counts stored in the info column of the VCF file. Output will go into a file
#' supplied as tumour_outfile. It will be a fully formatted allele counts file as returned 
#' by alleleCounter. There are no counts for the matched normal. 
#' @param vcf_infile The vcf file to read in
#' @param tumour_outfile File to save the tumour counts to
#' @param refence_genome Optional parameter specifying the reference genome build used
#' @param dummy_alt_allele Specify allele to be used to encode the alt counts (the indel can be multiple alleles)
#' @param dummy_ref_allele Specify allele to be used to encode the ref counts (the indel can be multiple alleles)
#' @author sd11
#' @export
#dumpCounts.ICGCconsensusIndel = function(vcf_infile, tumour_outfile, refence_genome="hg19", dummy_alt_allele=NA, dummy_ref_allele=NA) {
#  dumpCountsFromVcf(vcf_infile, tumour_outfile, centre="icgc_consensus_indel", normal_outfile=NA, refence_genome=refence_genome, samplename=NA, dummy_alt_allele=dummy_alt_allele, dummy_ref_allele=dummy_ref_allele)
#}

#' Dump allele counts from vcf - Strelka indel pipeline
#'
#' Dump allele counts stored in the info column of the VCF file. Output will go into a file
#' supplied as tumour_outfile. It will be a fully formatted allele counts file as returned
#' by alleleCounter. There are no counts for the matched normal.
#' @param vcf_infile The vcf file to read in
#' @param tumour_outfile File to save the tumour counts to
#' @param refence_genome Optional parameter specifying the reference genome build used
#' @param dummy_alt_allele Specify allele to be used to encode the alt counts (the indel can be multiple alleles)
#' @param dummy_ref_allele Specify allele to be used to encode the ref counts (the indel can be multiple alleles)
#' @author sd11
#' @export
#dumpCounts.StrelkaIndel = function(vcf_infile, tumour_outfile, refence_genome="hg19", dummy_alt_allele=NA, dummy_ref_allele=NA) {
#	dumpCountsFromVcf(vcf_infile, tumour_outfile, centre="strelka_indel", normal_outfile=NA, refence_genome=refence_genome, samplename=NA, dummy_alt_allele=dummy_alt_allele, dummy_ref_allele=dummy_ref_allele)
#}

#' Dump allele counts from vcf - CGP Pindel indel pipeline
#'
#' Dump allele counts stored in the info column of the VCF file. Output will go into a file
#' supplied as tumour_outfile. It will be a fully formatted allele counts file as returned
#' by alleleCounter. There are no counts for the matched normal.
#' @param vcf_infile The vcf file to read in
#' @param tumour_outfile File to save the tumour counts to
#' @param refence_genome Optional parameter specifying the reference genome build used
#' @param dummy_alt_allele Specify allele to be used to encode the alt counts (the indel can be multiple alleles)
#' @param dummy_ref_allele Specify allele to be used to encode the ref counts (the indel can be multiple alleles)
#' @author sd11
#' @export
#dumpCounts.cgpPindel = function(vcf_infile, tumour_outfile, refence_genome="hg19", dummy_alt_allele=NA, dummy_ref_allele=NA) {
#  dumpCountsFromVcf(vcf_infile, tumour_outfile, centre="cgppindel_indel", normal_outfile=NA, refence_genome=refence_genome, samplename=samplename, dummy_alt_allele=dummy_alt_allele, dummy_ref_allele=dummy_ref_allele)
#}

#' Dump allele counts from vcf - Mutect
#'
#' Dump allele counts stored in the info column of the VCF file. Output will go into a file
#' supplied as tumour_outfile. It will be a fully formatted allele counts file as returned 
#' by alleleCounter.
#' @param vcf_infile The vcf file to read in
#' @param tumour_outfile File to save the tumour counts to
#' @param samplename Parameter specifying the samplename to be used for matching the right column in the VCF
#' @param normal_outfile Optional parameter specifying where the normal output should go
#' @param refence_genome Optional parameter specifying the reference genome build used
#' @author sd11
#' @export
#dumpCounts.mutect = function(vcf_infile, tumour_outfile, samplename, normal_outfile=NA, refence_genome="hg19") {
#  dumpCountsFromVcf(vcf_infile, tumour_outfile, centre="mutect", normal_outfile=normal_outfile, refence_genome=refence_genome, samplename=samplename)
#}

#' Dump allele counts from vcf - Scalpel
#'
#' Dump allele counts stored in the info column of the VCF file. Output will go into a file
#' supplied as tumour_outfile. It will be a fully formatted allele counts file as returned 
#' by alleleCounter.
#' @param vcf_infile The vcf file to read in
#' @param tumour_outfile File to save the tumour counts to
#' @param samplename Parameter specifying the samplename to be used for matching the right column in the VCF
#' @param normal_outfile Optional parameter specifying where the normal output should go
#' @param refence_genome Optional parameter specifying the reference genome build used
#' @author sd11
#' @export
#dumpCounts.scalpel = function(vcf_infile, tumour_outfile, samplename, normal_outfile=NA, refence_genome="hg19", dummy_alt_allele=NA, dummy_ref_allele=NA) {
#  dumpCountsFromVcf(vcf_infile, tumour_outfile, centre="scalpel_indel", normal_outfile=normal_outfile, refence_genome=refence_genome, samplename=samplename, dummy_alt_allele=dummy_alt_allele, dummy_ref_allele=dummy_ref_allele)
#}

#' Dump allele counts from VCF
#' 
#' This function implements all the steps required for dumping counts from VCF
#' as supplied by the ICGC pancancer pipelines. 
#' @noRd
#dumpCountsFromVcf = function(vcf_infile, tumour_outfile, centre, normal_outfile=NA, refence_genome="hg19", samplename=NA, dummy_alt_allele=NA, dummy_ref_allele=NA) {
  # Helper function for writing the output  
  #write.output = function(output, output_file) {
    #write.table(output, file=output_file, col.names=T, quote=F, row.names=F, sep="\t")
#  }
  
  # Read in the vcf and dump the tumour counts in the right format
  #v = VariantAnnotation::readVcf(vcf_infile, refence_genome)
  #if (nrow(v) > 0) {
    #tumour = getCountsTumour(v, centre=centre, samplename=samplename, dummy_alt_allele=dummy_alt_allele, dummy_ref_allele=dummy_ref_allele)
  #} else {
    #tumour = NA
  #}
  #tumour = formatOutput(tumour, v)
  #write.output(tumour, tumour_outfile)
  
#   Optionally dump the normal counts in the right format
#  if (!is.na(normal_outfile)) {
#    if (nrow(v) > 0) {
#      normal = getCountsNormal(v, centre=centre, samplename=samplename)
#    } else {
#      normal = NA
#    }
#    normal = formatOutput(normal, v)
#    write.output(normal, normal_outfile)
#  }
#}

#' Format a 4 column counts table into the alleleCounter format. This function assumes A, C, G, T format.
#' @noRd
#formatOutput = function(counts_table, v) {
  # Check if the vcf object is empty, if so, generate dummy data
  #if (nrow(v)==0) {
    #output = data.frame(matrix(ncol = 7, nrow = 0))
  #} else {
    #output = data.frame(as.character(seqnames(v)), start(ranges(v)), counts_table, rowSums(counts_table))
  #}
  #colnames(output) = c("#CHR","POS","Count_A","Count_C","Count_G","Count_T","Good_depth")
  #return(output)
#}

#' Dump allele counts from vcf for normal
#' 
#' Returns an allele counts table for the normal sample
#' @param v The vcf file
#' @param centre The sequencing centre of which pipeline the vcf file originates
#' @return An array with 4 columns: Counts for A, C, G, T
#' @author sd11
#' @noRd
#getCountsNormal = function(v, centre="sanger", samplename=NA) {
#  if (centre=="sanger") {
#    return(getAlleleCounts.Sanger(v, 1))
#  } else if(centre=="dkfz") {
#    sample_col = which(colnames(VariantAnnotation::geno(v)$DP4) == "CONTROL")
#    return(getAlleleCounts.DKFZ(v, sample_col))
#  } else if (centre=="muse") {
#    # Assuming the tumour name is provided
#    sample_col = which(colnames(VariantAnnotation::geno(v)$AD) != samplename)
#    return (getAlleleCounts.MuSE(v, sample_col))
#  } else if (centre=="broad") {
#    print("The Broad ICGC pipeline does not report allele counts for the matched normal")
#    q(save="no", status=1)
#  } else if (centre=="icgc_consensus_snv") {
#    print("The ICGC consensus pipeline does not report allele counts for the matched normal")
#    q(save="no", status=1)
#  } else if (centre=="icgc_consensus_indel") {
#    print("The ICGC consensus indel pipeline does not report allele counts for the matched normal")
#    q(save="no", status=1)
#  } else if (centre=="mutect") {
#    sample_col = which(colnames(VariantAnnotation::geno(v)$AD) == samplename)
#    return(getAlleleCounts.mutect(v, sample_col))
#  } else if (centre=="scalpel_indel") {
#    if (is.na(dummy_alt_allele) | is.na(dummy_ref_allele)) { stop("When dumping allele counts from the Scalpel indels dummy_alt_allele and dummy_ref_allele must be supplied") }
#    if (!any(colnames(geno(v)[[1]])=="NORMAL")) { stop("Expect Scalpel normal VCF column to contain header NORMAL") }
#    sample_col = which(colnames(geno(v)[[1]])=="NORMAL")
#    return(getAlleleCounts.Scalpel_indel(v, sample_col=sample_col, dummy_alt_allele=dummy_alt_allele, dummy_ref_allele=dummy_ref_allele))
#  } else {
#    print(paste("Supplied centre not supported:", centre))
#    q(save="no", status=1)
#  }
#}

#' Dump allele counts from vcf for tumour
#' 
#' Returns an allele counts table for the tumour sample
#' @param v The vcf file
#' @param centre The sequencing centre of which pipeline the vcf file originates
#' @param samplename Samplename to be used when matching sample columns (Default: NA)
#' @param dummy_alt_allele Supply if the alt allele is to be overwritten. The counts of the alt will be put in the column corresponding to this allele. (Default: NA)
#' @param dummy_ref_allele Supply if the ref allele is to be overwritten. The counts of the ref will be put in the column corresponding to this allele. (Default: NA)
#' @return An array with 4 columns: Counts for A, C, G, T
#' @author sd11
#' @noRd
#getCountsTumour = function(v, centre="sanger", samplename=NA, dummy_alt_allele=NA, dummy_ref_allele=NA) {
#  if (centre=="sanger") {
#    return(getAlleleCounts.Sanger(v, 2))
#  } else if (centre=="dkfz") {
#    sample_col = which(colnames(VariantAnnotation::geno(v)$DP4) == "TUMOR")
#    return(getAlleleCounts.DKFZ(v, sample_col))
#  } else if (centre=="muse") {
#    if (is.na(samplename)) { stop("When dumping allele counts from the Muse samplename must be supplied") }
#    sample_col = which(colnames(VariantAnnotation::geno(v)$AD) == samplename)
#    return (getAlleleCounts.MuSE(v, sample_col))
#  } else if (centre=="broad") {
#    if (is.na(samplename)) { stop("When dumping allele counts from the Broad ICGC pipeline samplename must be supplied") }
#    return(getAlleleCounts.Broad(v, 1))
#  } else if (centre=="icgc_consensus_snv") {
#    return(getAlleleCounts.ICGC_consensus_snv(v))
#  } else if (centre=="icgc_consensus_indel") {
#    if (is.na(dummy_alt_allele) | is.na(dummy_ref_allele)) { stop("When dumping allele counts from the ICGC consensus indels dummy_alt_allele and dummy_ref_allele must be supplied") }
#    return(getAlleleCounts.ICGC_consensus_indel(v, dummy_alt_allele=dummy_alt_allele, dummy_ref_allele=dummy_ref_allele))
#  } else if (centre=="mutect") {
#    if (is.na(samplename)) { stop("When dumping allele counts from the Mutect samplename must be supplied") }
#    sample_col = which(colnames(VariantAnnotation::geno(v)$AD) == samplename)
#    return(getAlleleCounts.mutect(v, sample_col))
#  } else if (centre=="strelka_indel") {
#    if (is.na(dummy_alt_allele) | is.na(dummy_ref_allele)) { stop("When dumping allele counts from the Strelka indels dummy_alt_allele and dummy_ref_allele must be supplied") }
#    return(getAlleleCounts.Strelka_indel(v, dummy_alt_allele=dummy_alt_allele, dummy_ref_allele=dummy_ref_allele))
#  } else if (centre=="scalpel_indel") {
#    if (is.na(dummy_alt_allele) | is.na(dummy_ref_allele)) { stop("When dumping allele counts from the Scalpel indels dummy_alt_allele and dummy_ref_allele must be supplied") }
#    if (!any(colnames(geno(v)[[1]])=="TUMOUR")) { stop("Expect Scalpel tumour VCF column to contain header TUMOUR") }
#    sample_col = which(colnames(geno(v)[[1]])=="TUMOUR")
#    return(getAlleleCounts.Scalpel_indel(v, sample_col=sample_col, dummy_alt_allele=dummy_alt_allele, dummy_ref_allele=dummy_ref_allele))
#  } else if (centre=="cgppindel_indel") {
#    sample_col = which(colnames(geno(v)$WTR)==samplename)
#    return(getAlleleCounts.cgpPindel_indel(v, sample_col=sample_col, dummy_alt_allele=dummy_alt_allele, dummy_ref_allele=dummy_ref_allele))
#  } else {
#    print(paste("Supplied centre not supported:", centre))
#    q(save="no", status=1)
#  }
#}

#' Dump allele counts from Sanger pipeline vcf
#' 
#' Helper function that dumps the allele counts from a Sanger pipeline VCF file
#' @param v The vcf file
#' @param sample_col The column in which the counts are. If it's the first sample mentioned in the vcf this would be sample_col 1
#' @return An array with 4 columns: Counts for A, C, G, T
#' @author sd11
#' @noRd
#getAlleleCounts.Sanger = function(v, sample_col) {
#  return(cbind(VariantAnnotation::geno(v)$FAZ[,sample_col]+VariantAnnotation::geno(v)$RAZ[,sample_col], 
#               VariantAnnotation::geno(v)$FCZ[,sample_col]+VariantAnnotation::geno(v)$RCZ[,sample_col], 
#               VariantAnnotation::geno(v)$FGZ[,sample_col]+VariantAnnotation::geno(v)$RGZ[,sample_col], 
#               VariantAnnotation::geno(v)$FTZ[,sample_col]+VariantAnnotation::geno(v)$RTZ[,sample_col]))
#}

#' Dump allele counts from the DKFZ pipeline
#' 
#' Helper function that takes a sample column and fetches allele counts. As the DKFZ pipeline does not
#' provide counts for each base, but just the alt and reference, we will provide just those and the
#' other bases with 0s.
#' 
#' Note: If there are multiple ALT alleles this function will only take the first mentioned!
#' @param v The vcf file
#' @param sample_col The column in which the counts are. If it's the first sample mentioned in the vcf this would be sample_col 1
#' @return An array with 4 columns: Counts for A, C, G, T
#' @author sd11
#' @noRd
#getAlleleCounts.DKFZ = function(v, sample_col) {
#  # Fetch counts for both forward and reverse ref/alt
#  counts = VariantAnnotation::geno(v)$DP4[,sample_col,]
#  counts.ref = counts[,1] + counts[,2] # ref forward/reverse
#  counts.alt = counts[,3] + counts[,4] # alt forward/reverse
#  allele.ref = as.character(VariantAnnotation::ref(v))
#  # allele.alt = unlist(lapply(VariantAnnotation::alt(v), function(x) { as.character(x[[1]]) }))
#  allele.alt = getAltAllele(v)
#  
#  output = construct_allelecounter_table(count.ref, count.alt, allele.ref, allele.alt)
#  return(output)
#}

#' Dump allele counts from the MuSE pipeline
#' 
#' Helper function that takes a sample column and fetches allele counts. As the MuSE pipeline does not
#' provide counts for each base, but just the alt and reference, we will provide just those and the
#' other bases with 0s.
#' 
#' Note: If there are multiple ALT alleles this function will only take the first mentioned! 
#' Note2: This function assumes there are only two columns with read counts. The format allows for more
#' @param v The vcf file
#' @param sample_col The column in which the counts are. If it's the first sample mentioned in the vcf this would be sample_col 1
#' @return An array with 4 columns: Counts for A, C, G, T
#' @author sd11
#' @noRd
#getAlleleCounts.MuSE = function(v, sample_col) {
#  if (length(colnames(VariantAnnotation::geno(v)$AD)) > 2) {
#    print("In getAlleleCounts.MuSE: Assuming 2 columns with read counts, but found more. This is not supported")
#    q(save="no", status=1)
#  }
#  
  # An SNV can be represented by more than 1 alt alleles, here we pick the alt allele with the highest read count
#  num.snvs = nrow(VariantAnnotation::geno(v)$AD)
#  counts = array(NA, c(num.snvs, 2))
#  allele.alt = array(NA, num.snvs)
#  for (i in 1:num.snvs) {
#    snv.counts = unlist(VariantAnnotation::geno(v)$AD[i,sample_col])
#    counts[i,1] = snv.counts[1] # The reference is the first base for which read counts are mentioned
#    select_base = which.max(snv.counts[2:length(snv.counts)])
#    allele.alt[i] = as.character(VariantAnnotation::alt(v)[[i]][select_base])
#    select_base = select_base+1 # The reference is the first base for which read counts are mentioned
#    counts[i,2] = snv.counts[select_base] 
#  }
  
#  allele.ref = as.character(VariantAnnotation::ref(v))
#  
#  output = construct_allelecounter_table(counts[i,1], counts[i,2], allele.ref, allele.alt)
#  return(output)
#}

#' Dump allele counts from the Broad pipeline
#' 
#' Helper function that takes a sample column and fetches allele counts. As the Broad pipeline does not
#' provide counts for each base, but just the alt and reference, we will provide just those and the
#' other bases with 0s.
#' 
#' Note: If there are multiple ALT alleles this function will only take the first mentioned! 
#' @param v The vcf file
#' @param sample_col The column in which the counts are. If it's the first sample mentioned in the vcf this would be sample_col 1
#' @return An array with 4 columns: Counts for A, C, G, T
#' @author sd11
#' @noRd
#getAlleleCounts.Broad = function(v, sample_col) {
#  count.ref = as.numeric(unlist(VariantAnnotation::geno(v)$ref_count))
#  count.alt = as.numeric(unlist(VariantAnnotation::geno(v)$alt_count))
#  allele.ref = as.character(VariantAnnotation::ref(v))
#  # allele.alt = unlist(lapply(VariantAnnotation::alt(v), function(x) { as.character(x[[1]]) }))
#  allele.alt = getAltAllele(v)
  
#  output = construct_allelecounter_table(count.ref, count.alt, allele.ref, allele.alt)
#  return(output)
#}

#' Dump allele counts in ICGC consensus SNV format
#' 
#' This function fetches allele counts from the info field in the VCF file.
#' Note: If there are multiple ALT alleles this function will only take the first mentioned! 
#' @param v The vcf file
#' @return An array with 4 columns: Counts for A, C, G, T
#' @author sd11
#' @noRd
#getAlleleCounts.ICGC_consensus_snv = function(v) {
#  count.alt = info(v)$t_alt_count
#  count.ref = info(v)$t_ref_count
#  allele.ref = as.character(VariantAnnotation::ref(v))
#  # allele.alt = unlist(lapply(VariantAnnotation::alt(v), function(x) { as.character(x[[1]]) }))
#  allele.alt = getAltAllele(v)
#  
#  output = construct_allelecounter_table(count.ref, count.alt, allele.ref, allele.alt)
#  return(output)
#}

#' Dump allele counts in ICGC consensus indel format
#' 
#' This function fetches allele counts from the info field in the VCF file.
#' Note: If there are multiple ALT alleles this function will only take the first mentioned! 
#' @param v The vcf file
#' @param dummy_alt_allele Dummy base to use to encode the alt allele, the counts will be saved in the corresponding column
#' @param dummy_ref_allele Dummy base to use to encode the ref allele, the counts will be saved in the corresponding column
#' @return An array with 4 columns: Counts for A, C, G, T
#' @author sd11
#' @noRd
#getAlleleCounts.ICGC_consensus_indel = function(v, dummy_alt_allele="A", dummy_ref_allele="C") {
#  c = construct_allelecounter_table(count.ref=info(v)$t_ref_count, 
#                                    count.alt=info(v)$t_alt_count, 
#                                    allele.ref=rep(dummy_ref_allele, nrow(v)),
#                                    allele.alt=rep(dummy_alt_allele, nrow(v)))
#  return(c)
#}

#' Dump allele counts in Strelka indel format
#'
#' This function fetches allele counts from the info field in the VCF file.
#'
#' @param v The vcf file
#' @param dummy_alt_allele Dummy base to use to encode the alt allele, the counts will be saved in the corresponding column
#' @param dummy_ref_allele Dummy base to use to encode the ref allele, the counts will be saved in the corresponding column
#' @return An array with 4 columns: Counts for A, C, G, T
#' @author sd11
#' @noRd
#getAlleleCounts.Strelka_indel = function(v, dummy_alt_allele="A", dummy_ref_allele="C") {
#  c = construct_allelecounter_table(count.ref=geno(v)$TAR[,2,1],
#				    count.alt=geno(v)$TIR[,2,1],
#				    allele.ref=rep(dummy_ref_allele, nrow(v)),
#				    allele.alt=rep(dummy_alt_allele, nrow(v)))
#  return(c)
#}

#' Dump allele counts in cgpPindel indel format
#'
#' This function fetches allele counts from the info field in the VCF file. Note that this requires running vafCorrect after cgpPindel.
#'
#' @param v The vcf file
#' @param sample_col The column in which the counts are. If it's the first sample mentioned in the vcf this would be sample_col 1
#' @param dummy_alt_allele Dummy base to use to encode the alt allele, the counts will be saved in the corresponding column
#' @param dummy_ref_allele Dummy base to use to encode the ref allele, the counts will be saved in the corresponding column
#' @return An array with 4 columns: Counts for A, C, G, T
#' @author sd11
#' @noRd
#getAlleleCounts.cgpPindel_indel = function(v, sample_col, dummy_alt_allele="A", dummy_ref_allele="C") {
#  c = construct_allelecounter_table(count.ref=geno(v)$WT[,sample_col],
#                                    count.alt=geno(v)$MTR[,sample_col],
#                                    allele.ref=rep(dummy_ref_allele, nrow(v)),
#                                    allele.alt=rep(dummy_alt_allele, nrow(v)))
#  return(c)
#}

#' Dump allele counts in Scalpel indel format
#'
#' This function fetches allele counts from the info field in the VCF file.
#'
#' @param v The vcf file
#' @param sample_col The column in which the counts are. If it's the first sample mentioned in the vcf this would be sample_col 1
#' @param dummy_alt_allele Dummy base to use to encode the alt allele, the counts will be saved in the corresponding column
#' @param dummy_ref_allele Dummy base to use to encode the ref allele, the counts will be saved in the corresponding column
#' @return An array with 4 columns: Counts for A, C, G, T
#' @author sd11
#' @noRd
#getAlleleCounts.Scalpel_indel = function(v, sample_col, dummy_alt_allele="A", dummy_ref_allele="C") {
#  counts = do.call(rbind, geno(v)$AD[,sample_col])
#  c = dpclust3p:::construct_allelecounter_table(count.ref=counts[,1],
#                                                count.alt=counts[,2],
#                                                allele.ref=rep(dummy_ref_allele, nrow(v)),
#                                                allele.alt=rep(dummy_alt_allele, nrow(v)))
#  return(c)
#}

#' Dump allele counts in Mutect format
#' 
#' This function fetches allele counts from the info field in the VCF file.
#' Note: If there are multiple ALT alleles this function will only take the first mentioned! 
#' @param v The vcf file
#' @param sample_col The column in which the counts are. If it's the first sample mentioned in the vcf this would be sample_col 1
#' @return An array with 4 columns: Counts for A, C, G, T
#' @author sd11
#' @noRd
#getAlleleCounts.mutect = function(v, sample_col) {
#  counts = do.call(rbind, geno(v)$AD[,sample_col])
#  count.ref = counts[,1]
#  count.alt = counts[,2]
#  allele.ref = as.character(VariantAnnotation::ref(v))
#  # allele.alt = unlist(lapply(VariantAnnotation::alt(v), function(x) { as.character(x[[1]]) }))
#  allele.alt = getAltAllele(v)
#  output = construct_allelecounter_table(count.ref, count.alt, allele.ref, allele.alt)
#  return(output)
#}

#' Function that constructs a table in the format of the allele counter
#' @param count.ref Number of reads supporting the reference allele
#' @param count.alt Number of reads supporting the variant allele
#' @param allele.ref The reference allele
#' @param allele.alt The variant allele
#' @return A data.frame consisting of four columns: Reads reporting A, C, G and T
#' @author sd11
#' @noRd
#construct_allelecounter_table = function(count.ref, count.alt, allele.ref, allele.alt) {
#  output = array(0, c(length(allele.ref), 4))
#  nucleotides = c("A", "C", "G", "T")
#  # Propagate the alt allele counts
#  nucleo.index = match(allele.alt, nucleotides)
#  for (i in 1:nrow(output)) {
#    output[i,nucleo.index[i]] = count.alt[i]
 # }
  
  # Propagate the reference allele counts
#  nucleo.index = match(allele.ref, nucleotides)
#  for (i in 1:nrow(output)) {
#    output[i,nucleo.index[i]] = count.ref[i]
#  }
 # return(output)
#}
shaghayeghsoudi/DPclust3p_Customised documentation built on March 18, 2022, 5:28 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
shaghayeghsoudi/DPclust3p_Customised
Dirichlet Process Clustering Pre-Processing Package

R/allelecount.R
In shaghayeghsoudi/DPclust3p_Customised: Dirichlet Process Clustering Pre-Processing Package

Defines functions dumpCounts.maf

Documented in dumpCounts.maf

R Package Documentation

Browse R Packages

We want your feedback!

shaghayeghsoudi/DPclust3p_Customised Dirichlet Process Clustering Pre-Processing Package

R/allelecount.R In shaghayeghsoudi/DPclust3p_Customised: Dirichlet Process Clustering Pre-Processing Package

Defines functions dumpCounts.maf

Documented in dumpCounts.maf

R Package Documentation

Browse R Packages

We want your feedback!

shaghayeghsoudi/DPclust3p_Customised
Dirichlet Process Clustering Pre-Processing Package

R/allelecount.R
In shaghayeghsoudi/DPclust3p_Customised: Dirichlet Process Clustering Pre-Processing Package