R/util.R

Defines functions process_maf merge_maf plot_evolutionary_distance plot_cn_calls plot_allelic_imbalance plot_celullarity generate_CLONET_sample_info generate_CLONET_config format_SNP_data get_pileup_summary estimate_contamination_parallel estimate_contamination vcf_filtering vcf_format format_PM_data format_segment_data format_ASEQ_pileup filter_VEP vcf_sort vcf_annotate vcf_stats_merge vcf_concatenate fix_mnps vcf_filter_variants split_vcf bgzip CNNScoreVariants FilterVariantTranches generate_sets vcf_sets learn_orientation get_sv_type vcf_communality vcf_intersect_bed compress_and_index_vcf annotate_sv_type check_and_unzip tab_indx

Documented in annotate_sv_type bgzip check_and_unzip CNNScoreVariants compress_and_index_vcf estimate_contamination estimate_contamination_parallel FilterVariantTranches filter_VEP fix_mnps format_ASEQ_pileup format_PM_data format_segment_data format_SNP_data generate_CLONET_config generate_CLONET_sample_info generate_sets get_pileup_summary get_sv_type learn_orientation merge_maf plot_allelic_imbalance plot_celullarity plot_cn_calls plot_evolutionary_distance process_maf split_vcf tab_indx vcf_annotate vcf_communality vcf_concatenate vcf_filtering vcf_filter_variants vcf_format vcf_intersect_bed vcf_sets vcf_sort vcf_stats_merge

#' Index tab separated genomic regions
#'
#' This function takes a .vcf.bgz tab separated genomic region file and generates an index for it
#'
#' @param bin_path [REQUIRED] Path to tabix binary. Default tools/htslib/tabix.
#' @param file [REQUIRED] Path to VCF file to index.
#' @param verbose [OPTIONAL] Enables progress messages. Default False.
#' @export


tab_indx=function(bin_path="tools/htslib/tabix",file="",verbose=FALSE){
  # Build the command once so the printed and executed commands always agree
  # (the previous verbose print was missing the space before the file path).
  # -f: overwrite an existing index; -p vcf: treat input as VCF.
  cmd=paste0(bin_path," -fp vcf ",file)
  if (verbose){
    print(cmd)
  }
  system(cmd)
}

#' Check and unzip gz compressed file
#'
#' This function takes a file checks if its compressed and uncompresses if true.
#'
#' @param file  [REQUIRED] Path to file
#' @param preserve  [OPTIONAL] Preserve original file. Default TRUE
#' @param verbose [OPTIONAL] Enables progress messages. Default False.
#' @return Path to the uncompressed file (the input path if it was not compressed).
#' @export

check_and_unzip=function(file="",preserve=TRUE,verbose=FALSE){
  # file(1) reports e.g. "gzip compressed data" for compressed inputs.
  # any() guards against multi-line output from file(1).
  if(any(grepl("compressed",system(paste0("file ",file),intern=TRUE)))){
    file_name=tools::file_path_sans_ext(file,compression=TRUE)
    if(preserve){
        # BUGFIX: -c writes the decompressed stream to stdout so the original
        # compressed file is kept; plain "gunzip file > name" decompresses in
        # place and leaves the redirection target empty.
        system(paste0("gunzip -c ",file ,">",file_name))
    }else{
      system(paste0("gunzip ",file))
    }
    file=file_name
    return(file)
  }else{
    return(file)
  }
}


#' Get SV type from svaba generated VCF
#'
#' As given by sfrenk comment on https://github.com/walaj/svaba/issues/4 annotates SV type
#' in svaba vcf file. Writes <sample>.svaba.sv.annotated.vcf into output_dir with an
#' extra SVANOT tag appended to each record's INFO field.
#'
#' @param vcf svaba generated vcf to annotate. Get vcf
#' @param output_dir Path to directory to output results
#' @export

annotate_sv_type <- function(vcf="",output_dir=""){
  # "/" separator only when an output directory was supplied.
  sep="/"
  if(output_dir==""){
    sep=""
  }

  out_file_dir=paste0(output_dir,sep)
  if (!dir.exists(out_file_dir) & !out_file_dir==""){
      dir.create(out_file_dir)
  }
  # Find mate pair

  # Recover the column names from the "#CHROM ..." header line of the VCF.
  cols <- system(paste0('grep -v "##" ', vcf,' | grep "#" | sed s/#//'),intern=TRUE)
  cols <- strsplit(cols,"\t")[[1]]
  # Classify each breakend via its mate (get_sv_type, defined later in this
  # file) and append the result as a SVANOT INFO tag. If the VCF has no
  # records read.table errors, so fall back to an empty table with the same
  # columns.
  svaba_uniq=tryCatch(
    {svaba_uniq = read.table(vcf, col.names = cols, stringsAsFactors = FALSE);
     svaba_uniq$INFO = paste0(svaba_uniq$INFO,";SVANOT=",sapply(svaba_uniq$ID, FUN=get_sv_type,dat=svaba_uniq));
     svaba_uniq}
        ,error=function(e){
      svaba_uniq=data.frame(matrix(ncol = length(cols), nrow = 0));
      colnames(svaba_uniq)=cols;
      return(svaba_uniq)})
  # Re-emit the original "##" meta header, declare the new SVANOT INFO field,
  # then append the "#CHROM..." column header and the annotated records.
  fil=paste0(out_file_dir,paste0(ULPwgs::get_sample_name(vcf),".svaba.sv.annotated.vcf"))
  cat(system(paste0('grep "##" ', vcf ),intern=TRUE),file=fil,sep="\n")
  cat('##INFO=<ID=SVANOT,Number=1,Type=String,Description=\"Structural variant annotation\">',file=fil,sep="\n",append=TRUE)
  cat(paste0("#",paste0(cols,collapse="\t")),file=fil,sep="\n",append=TRUE)
  write.table(x=svaba_uniq,file=fil,append=TRUE,quote=FALSE,col.names=FALSE,sep="\t",row.names=FALSE)
}



#' Compress and index vcf file
#'
#' This function takes an uncompressed vcf file and uses bgzip and tabix to
#' compress and index the file while preserving the original file
#'
#' @param bin_path [OPTIONAL] Path to bgzip binary. Default tools/htslib/bgzip.
#' @param bin_path2 [OPTIONAL] Path to tabix binary. Default tools/htslib/tabix.
#' @param vcf [REQUIRED] Path to VCF to compress
#' @param output_dir [OPTIONAL] Path to output directory
#' @export

compress_and_index_vcf=function(bin_path="tools/htslib/bgzip",bin_path2="tools/htslib/tabix",vcf="",output_dir=""){
  sep="/"
  if(output_dir==""){
    sep=""
  }
  out_file_dir=paste0(output_dir,sep)

  # bgzip consumes the plain file, so stash a copy first and restore it after
  # compressing/indexing; the original uncompressed vcf is thereby preserved.
  system(paste("cp",paste0(vcf), paste0(vcf,".tmp")))
  bgzip(bin_path=bin_path,file=vcf)
  tab_indx(bin_path=bin_path2,file=paste0(vcf,".gz"))
  system(paste("mv",paste0(vcf,".tmp"), vcf))
  if (!out_file_dir==""){
    if (!dir.exists(out_file_dir) ){
        dir.create(out_file_dir,recursive=TRUE)
    }
    # Shell glob moves both <vcf>.gz and its .tbi index in one go.
    system(paste("mv",paste0(vcf,".gz*"), out_file_dir))
  }
}


#' Intersect vcf with bed file
#'
#' Generate a set of variants covered by the regions found in/out the bed file.
#' Produces <name>.IN_BED.vcf and <name>.OUT_BED.vcf in output_dir.
#'
#' @param vcf [REQUIRED] Path to VCF file
#' @param bed [REQUIRED] Path to BED file
#' @param output_name [OPTIONAL] Name for output_files. IF not given it will use input vcf name
#' @param output_dir [OPTIONAL] Path to output directory. if not given outputs to current directory.
#' @export

vcf_intersect_bed <- function(vcf="",bed="",output_name="",output_dir=""){
  sep="/"
  if(output_dir==""){
    sep=""
  }

  out_file_dir=paste0(output_dir,sep)
  if (!dir.exists(out_file_dir) & !out_file_dir==""){
      dir.create(out_file_dir)
  }
  if(output_name!=""){
    sample_name=output_name
  }else{
    sample_name=ULPwgs::get_sample_name(vcf)
  }
  # Recover column names from the "#CHROM ..." header line of the VCF.
  cols <- system(paste0('grep -v "##" ', vcf,' | grep "#" | sed s/#//'),intern=TRUE)
  cols <- strsplit(cols,"\t")[[1]]
  variants = read.table(vcf, col.names = cols, stringsAsFactors = FALSE)
  variants$CHROM=as.character(variants$CHROM)
  regions= read.table(bed, stringsAsFactors = FALSE)
  # Normalize BED chromosome names (strip "chr" prefix) to match the VCF.
  regions$V1=as.character(sub("chr","",regions$V1))
  # NOTE(review): this matches variant POS against the BED *end* column (V3)
  # exactly, not as an interval overlap — confirm this is intended.
  variants_in=dplyr::semi_join(variants,regions,by=c("CHROM"="V1","POS"="V3"))
  variants_out=dplyr::anti_join(variants,regions,by=c("CHROM"="V1","POS"="V3"))
  # Emit two VCFs that share the original "##" header, each with an extra
  # provenance line, followed by the column header and the matching records.
  file_out=paste0(out_file_dir,sample_name,".IN_BED.vcf")
  file_out2=paste0(out_file_dir,sample_name,".OUT_BED.vcf")
  cat(system(paste0('grep "##" ', vcf ),intern=TRUE),file=file_out,sep="\n")
  cat(system(paste0('grep "##" ', vcf ),intern=TRUE),file=file_out2,sep="\n")
  cat(paste0('##Filter In-section between vcf=',vcf," and bed=", bed),file=file_out,sep="\n",append=TRUE)
  cat(paste0('##Filter Out-section between vcf=',vcf," and bed=", bed),file=file_out2,sep="\n",append=TRUE)
  cat(paste0("#",paste0(cols,collapse="\t")),file=file_out,sep="\n",append=TRUE)
  cat(paste0("#",paste0(cols,collapse="\t")),file=file_out2,sep="\n",append=TRUE)
  write.table(x=variants_in,file=file_out,append=TRUE,quote=FALSE,col.names=FALSE,sep="\t",row.names=FALSE)
  write.table(x=variants_out,file=file_out2,append=TRUE,quote=FALSE,col.names=FALSE,sep="\t",row.names=FALSE)
}


#' VCF communality
#'
#' Estimate % of common SNPs between two VCF files and returns the communality between them
#'
#' @param bin_path [OPTIONAL] Path to bcftools binary. Default tools/bcftools/bcftools.
#' @param vcf [REQUIRED] Path to VCF file
#' @param vcf2 [REQUIRED] Path to VCF file
#' @param output_name [OPTIONAL] Name for output_files. IF not given it will use input vcf name
#' @param verbose [OPTIONAL] Extra verbose. Default FALSE
#' @param output_dir [OPTIONAL] Path to output directory. if not given outputs to current directory.
#' @return A one-row data.frame with the vcf path, communality ratio and raw counts.
#' @export

vcf_communality =function(bin_path="tools/bcftools/bcftools",vcf="",vcf2="",output_name="",output_dir="",verbose=FALSE){

  # "/" separator only when an output directory was supplied.
  out_file_dir=if(output_dir=="") "" else paste0(output_dir,"/")
  if (!dir.exists(out_file_dir) & !out_file_dir==""){
      dir.create(out_file_dir)
  }
  # Everything is written under <output_dir>/<name1>_AND_<name2>/SETS.
  name_a=ULPwgs::get_sample_name(vcf)
  name_b=ULPwgs::get_sample_name(vcf2)
  pair_dir=paste0(out_file_dir,paste0(name_a,"_AND_",name_b))
  # Build the per-set site lists for the two VCFs (no FILTER restriction).
  generate_sets(bin_path=bin_path,vcf=c(vcf,vcf2),filter="",output_dir=pair_dir,verbose=verbose,set_names=c(name_a,name_b))
  # SET_1 holds sites private to a single file, SET_2 the shared sites.
  count_sites=function(set_id){
    as.numeric(system(paste0("cat ",pair_dir,"/SETS/SET_",set_id,"/sites.txt | wc -l"),intern=TRUE))
  }
  not_common=count_sites(1)
  common=count_sites(2)
  communality=common/(common+not_common)
  return(data.frame(vcf=vcf,communality=communality,common=common,not_common))
  }










#' Get SV type from svaba generated VCF
#'
#' As given by sfrenk comment on https://github.com/walaj/svaba/issues/4 annotates SV based on its breakpoints
#'
#' @param x ID of the breakend record to classify (mate IDs end in ":1"/":2").
#' @param dat Table with VCF data; must contain ID and ALT columns.
#' @return One of "INV", "DEL", "DUP/INS" or "UNKNOWN".
#' @export

get_sv_type <- function(x,dat){
  # Breakend IDs come in mate pairs <root>:1 / <root>:2.
  root <- gsub(":[12]", "", x)
  alt1 <- dat$ALT[dat$ID == paste0(root, ":1")]
  alt2 <- dat$ALT[dat$ID == paste0(root, ":2")]
  # BUGFIX: if either mate is absent the orientation cannot be determined;
  # the previous implementation errored on a zero-length if() condition here.
  if (length(alt1) == 0 || length(alt2) == 0){
    return("UNKNOWN")
  }
  alt1 <- alt1[1]
  alt2 <- alt2[1]
  # Determine sv type based on breakpoint orientation encoded in the ALT
  # bracket notation (VCF spec breakend records).
  if ((grepl("\\[", alt1) && grepl("\\[", alt2)) || (grepl("\\]", alt1) && grepl("\\]", alt2))){
      sv_type <- "INV"
  } else if (grepl("[A-Z]\\[", alt1) && grepl("^\\]", alt2)){
      sv_type <- "DEL"
  } else if (grepl("^\\]", alt1) && grepl("[A-Z]\\[", alt2)){
      sv_type <- "DUP/INS"
  } else{
      sv_type <- "UNKNOWN"
  }
  return(sv_type)
}


#' Learn Read Orientation Model (Mutect2)
#'
#' This function takes a f1r2 read information and generates an orientation model
#' to be used for further filtering steps.
#'
#' @param bin_path [REQUIRED] Path to GATK binary. Default tools/gatk/gatk.
#' @param f1r2 [OPTIONAL] Path to f1r2 file. Only if f1r2_dir is not given.
#' @param f1r2_dir [OPTIONAL] Path to f1r2 file dir. Only if f1r2 is not given.
#' @param output_name [OPTIONAL] Name for the output model. Derived from the f1r2 input if not given.
#' @param output_dir [OPTIONAL] Path to output directory.
#' @param verbose [Optional] Enables progress messages. Default False.
#' @export

learn_orientation=function(bin_path="tools/gatk/gatk",f1r2="",f1r2_dir="",output_name="",output_dir="",verbose=FALSE){

  # Build the repeated " -I <file>" argument string from either the explicit
  # file(s) or every *.f1r2.tar.gz found in f1r2_dir.
  if (f1r2!=""){
    f1r2=paste0(" -I ",paste0(f1r2,collapse=" -I "))
  }else{
    files=list.files(f1r2_dir,full.names=TRUE)
    files=files[grepl(".f1r2.tar.gz$",files)]
    f1r2=paste0(" -I ",paste0(files,collapse=" -I "))
  }

  # NOTE(review): f1r2 already holds the munged " -I ..." string at this
  # point, so the fallback sample name is derived from that string rather
  # than the original path — verify this yields the intended name.
  if (output_name==""){
    sample_name=ULPwgs::get_sample_name(f1r2[1])
  }else{
    sample_name=output_name
  }

  sep="/"

  if(output_dir==""){
    sep=""
  }
  out_file_dir=paste0(output_dir,sep,sample_name,"_ORIENTATION_MODEL")

  if (!dir.exists(out_file_dir)){
      dir.create(out_file_dir)
  }

  if (verbose){

    print(paste0(bin_path," LearnReadOrientationModel ",f1r2, " -O ", paste0(out_file_dir,"/",sample_name,".read-orientation-model.tar.gz")))
  }
  system(paste0(bin_path," LearnReadOrientationModel ",f1r2, " -O ", paste0(out_file_dir,"/",sample_name,".read-orientation-model.tar.gz")))
}


#' Generate a set between a group of vcfs usign bcftools
#'
#' This function takes a path to either a vcf file or to a directory of vcf files
#' and outputs an inset/outset between these vcf files.
#'
#' @param bin_path [REQUIRED] Path to bcftools binary. Default tools/bcftools/bcftools
#' @param set_formula [REQUIRED] Formula to filter variants. Default =+2.
#' @param vcf [OPTIONAL] Path to a single vcf file or a vector of vcf files. Only if vcf_dir is not given.
#' @param vcf_dir [OPTIONAL] Path to a directory with vcf files to generate a set for. Only if vcf is not given.
#' @param filter [OPTIONAL] Filter variants by. Default PASS
#' @param output_dir [OPTIONAL] Path to output directory. Default current directory
#' @param verbose [Optional] Enables progress messages. Default False.
#' @export

vcf_sets=function(bin_path="tools/bcftools/bcftools",vcf="",vcf_dir="",set_formula="=+2",filter="PASS",output_dir="",verbose=FALSE){

  if(output_dir==""){
    output_dir="SET"
  }

  if (filter!=""){
    filter=paste0(" -f ",filter)
  }

  if (!dir.exists(output_dir)){
      dir.create(output_dir,recursive=TRUE)
  }

  if (vcf_dir!=""){
    # BUGFIX: this branch previously called list.files(vcfs) and filtered on
    # `files`, both undefined here; it also lacked the leading space, fusing
    # the output directory with the first vcf path in the command below.
    vcfs=list.files(vcf_dir,full.names=TRUE)
    vcfs=vcfs[grepl(".vcf.gz$",vcfs)]
    vcfs=paste0(" ",paste0(vcfs,collapse=" "))
  }else{
    vcfs=paste0(" ",vcf,collapse=" ")
  }

  # -n selects set membership, e.g. "=2" (in exactly two files) or "~10"
  # (explicit 0/1 membership mask).
  private=paste0(" -n",set_formula)

  cmd=paste0(bin_path," isec -p ", output_dir,vcfs,filter,private)
  if (verbose){

    print(cmd)

  }
  system(cmd)
}



#' Generate a 1:n sets between a group of vcfs
#'
#' This function takes a path to either a vcf file or to a directory of vcf files
#' and outputs an inset/outset between these vcf files.
#'
#' @param bin_path [REQUIRED] Path to bcftools binary. Default tools/bcftools/bcftools
#' @param vcf [OPTIONAL] Path to a single vcf file or a vector of vcf files. Only if vcf_dir is not given.
#' @param vcf_dir [OPTIONAL] Path to a directory with vcf files to generate a set for. Only if vcf is not given.
#' @param filter [OPTIONAL] Filter variants by. Default PASS
#' @param output_dir [OPTIONAL] Path to output directory. Default current directory
#' @param set_names [OPTIONAL] Names to use to identify each set.
#' @param plot [OPTIONAL] Generate a VennDiagram plot with set intersections
#' @param threads [OPTIONAL] Number of threads to use. Default 3
#' @param verbose [Optional] Enables progress messages. Default False.
#' @export

generate_sets=function(bin_path="tools/bcftools/bcftools",vcf="",vcf_dir="",filter="PASS",output_dir="",set_names="",verbose=FALSE,threads=3,plot=TRUE){

  sep="/"
  if(output_dir==""){
    sep=""
  }

  out_file_dir=paste0(output_dir,sep,"SETS")

  if (!dir.exists(out_file_dir)){
      dir.create(out_file_dir,recursive=TRUE)
  }

  if (vcf_dir!=""){
    # BUGFIX: this branch previously called list.files(vcfs) and filtered on
    # `files`, both undefined at this point.
    vcfs=list.files(vcf_dir,full.names=TRUE)
    vcfs=vcfs[grepl(".vcf.gz$",vcfs)]
    n_vcfs=length(vcfs)
    vcfs=paste0(vcfs,collapse=" ")
  }else{
    n_vcfs=length(vcf)
    vcfs=paste0(" ",vcf,collapse=" ")
  }

  # One vcf_sets run per intersection cardinality x: x==1 and x==n_vcfs use
  # "=x" directly; intermediate cardinalities enumerate every 0/1 membership
  # mask with exactly x ones ("~mask").
  # BUGFIX: honour the documented `threads` argument (previously ignored).
  parallel::mclapply(X=1:n_vcfs,mc.cores=threads,FUN=function(x){
    if (x>1 & x<n_vcfs){
        n <- n_vcfs
        m <- expand.grid(rep(list(0:1),n))
        m <- m[rowSums(m)==x ,]
        lapply(X=1:nrow(m),FUN=function(y){
          vcf_sets(bin_path=bin_path,vcf=vcf,vcf_dir=vcf_dir,set_formula=paste0("~",paste0(m[y,],collapse="")),filter=filter,output_dir=paste0(out_file_dir,"/SET_",x,"/SET_",paste0(m[y,],collapse="")),verbose=verbose)
        })
    }else{
      vcf_sets(bin_path=bin_path,vcf=vcf,vcf_dir=vcf_dir,set_formula=paste0("=",x),filter=filter,output_dir=paste0(out_file_dir,"/SET_",x),verbose=verbose)
    }
  })
  # Collect every sites.txt produced by isec into one table; unreadable
  # (typically empty) files are skipped.
  files=list.files(out_file_dir,recursive=TRUE,full.names=TRUE)
  files=files[grepl("sites.txt",files)]
  tables=lapply(files,FUN=function(x){
    tryCatch(read.table(x,colClasses="character"),error=function(x) NULL)})
  tables=tables[!sapply(tables,is.null)]
  tables=dplyr::bind_rows(tables)
  tables=tables %>% dplyr::mutate(Variant=paste(V1,V2,V3,V4,sep="_"))
  colnames(tables)=c("chr","pos","ref","alt","inter","ID")
  # Per input vcf, collect the IDs of every site whose membership mask has a
  # 1 at that vcf's position.
  sets=lapply(X=1:n_vcfs,FUN=function(z){
    search=rep(".",n_vcfs);
    search[z]="1";
    return(tables[grepl(paste0(search,collapse=""),tables$inter),]$ID);
  })

  # BUGFIX: set_names may be a vector; compare all elements, not just the first.
  if(all(set_names=="")){
    names(sets)=paste0("Set_",1:n_vcfs)
  }else{
    names(sets)=set_names
  }
  if (plot){
    p1=ggvenn::ggvenn(sets)
    p1=p1+ggplot2::labs(title="Variant sets",subtitle=paste0("N=",nrow(tables)))
    # Save the venn explicitly instead of relying on the "last plot" default.
    ggplot2::ggsave(paste0(out_file_dir,"/variantSets_VennDiagram.png"),plot=p1,height=10,width=10)
  }
  write.table(tables,file=paste0(out_file_dir,"/variantData.txt"),quote=FALSE,row.names=FALSE,col.names=TRUE)
}

#' Filter Variant Tranches (Mutect2)
#'
#' This function takes an annotated VCF with CNN_D1 or CNN_D2 scores and filter the
#' variants based on set threshold.
#'
#' @param bin_path [REQUIRED] Path to GATK binary. Default tools/gatk/gatk
#' @param vcf [REQUIRED] Path to annotated VCF with CNN_D1/CNN_D2 scores
#' @param resources [OPTIONAL] Path to resources for variant filtering
#' @param output_name [OPTIONAL] Name of the sample to output
#' @param info_key [OPTIONAL] Annotation column to select. Default CNN_D1
#' @param snp_tranche [OPTIONAL] SNP tranche filter value. Default 99.95
#' @param indel_tranche [OPTIONAL] Indel tranche filter value. Default 99.4
#' @param keep_previous_filters [OPTIONAL] Keep previous filters in VCF. Default False
#' @param output_dir [OPTIONAL] Path to output dir
#' @param verbose [Optional] Enables progress messages. Default False
#' @export

FilterVariantTranches=function(bin_path="tools/gatk/gatk",vcf="",resources="",output_name="",info_key="CNN_1D",snp_tranche=99.95,indel_tranche=99.4,output_dir="",keep_previous_filters=FALSE,verbose=FALSE){

  # By default wipe any FILTER values set by earlier tools.
  prev_filters=" --invalidate-previous-filters "
  if (keep_previous_filters){
    prev_filters=" "
  }

  if (output_name==""){
    sample_name=ULPwgs::get_sample_name(vcf[1])
  }else{
    sample_name=output_name
  }

  # One " --resource <path>" per supplied resource file.
  if(resources!=""){
    resources=paste0(" --resource ",paste0(resources,collapse=" --resource "))
  }

  sep="/"

  if(output_dir==""){
    sep=""
  }
  # NOTE(review): "TRENCHES" looks like a typo for "TRANCHES", but the
  # directory name may be relied upon by downstream steps — confirm before
  # renaming.
  out_file_dir=paste0(output_dir,sep,sample_name,"_FILTERED_TRENCHES")

  if (!dir.exists(out_file_dir)){
      dir.create(out_file_dir)
  }


  if (verbose){
    print(paste0(bin_path," FilterVariantTranches -V ",vcf," --info-key ",info_key, " -O ", paste0(out_file_dir,"/",sample_name,".FILTERED.vcf.gz")," --snp-tranche ",snp_tranche," --indel-tranche ",indel_tranche,resources,prev_filters))
  }
  system(paste0(bin_path," FilterVariantTranches -V ",vcf, " --info-key ",info_key, " -O ", paste0(out_file_dir,"/",sample_name,".FILTERED.vcf.gz")," --snp-tranche ",snp_tranche," --indel-tranche ",indel_tranche,resources,prev_filters))
}



#' Annotate a VCF with scores from a Convolutional Neural Network (CNN) {Mutect2}
#'
#' This function takes a vcf and then annotates it with CNN scores.
#' If a BAM files is supplied a 2D model will be applied, otherwise a 1D model will be applied.
#'
#' @param bin_path [REQUIRED] Path to GATK binary. Default tools/gatk/gatk
#' @param vcf [REQUIRED] Path to VCF file to annotate.
#' @param ref_genome [REQUIRED] Path to reference genome.
#' @param bam [OPTIONAL] Path to BAM file.
#' @param output_dir [OPTIONAL] Path to output directory.
#' @param output_name [OPTIONAL] Default output name. If not supplied it will use the vcf file name.
#' @param verbose [OPTIONAL] Enables progress messages. Default False.
#' @export


CNNScoreVariants=function(bin_path="tools/gatk/gatk",vcf="",ref_genome="",bam="",output_dir="",output_name="",verbose=FALSE){

  # "/" separator only when an output directory was supplied.
  sep=if(output_dir=="") "" else "/"

  sample_name=if(output_name!="") output_name else ULPwgs::get_sample_name(vcf)

  out_file_dir=paste0(output_dir,sep,sample_name,"_CNNscored")
  if (!dir.exists(out_file_dir)){
      dir.create(out_file_dir)
  }

  # 2D model (read tensors) when a BAM is provided, otherwise the 1D model.
  if (bam!=""){
    bam_arg=paste0(" -I ",bam)
    tensor_arg=" -tensor-type read_tensor "
    out_file=paste0(out_file_dir,"/",sample_name,".CNNscored.2D.vcf")
  }else{
    bam_arg=""
    tensor_arg=""
    out_file=paste0(out_file_dir,"/",sample_name,".CNNscored.1D.vcf")
  }

  # CNNScoreVariants runs inside the "gatk" conda environment.
  cmd=paste0("source activate gatk; ",bin_path," CNNScoreVariants -V ",vcf," -R ",ref_genome, " -O ", out_file,bam_arg,tensor_arg)
  if(verbose){
    print(cmd)
  }
  system(cmd)

}


#' Bgzips a VCF file
#'
#' This function takes a VCF file and Bgzips it
#'
#' @param bin_path [Required] Path to bgzip binary. Default tools/htslib/bgzip.
#' @param file [Required] Path to VCF file to bgzip.
#' @param verbose [Optional] Enables progress messages. Default False.
#' @export



bgzip=function(bin_path="tools/htslib/bgzip",file="",verbose=FALSE){
  # -f overwrites an existing .gz output. Build once so the printed and
  # executed commands match.
  cmd=paste0(bin_path," -f ",file)
  if (verbose){
    print(cmd)
  }
  system(cmd)
}

#' Splits multisample VCF file into multiple VCFs
#'
#' This function takes a multisample VCF and splits into multiple VCFS for each sample
#'
#' @param bin_path [Required] Path to bcftools binary. Default "tools/bcftools/bcftools"
#' @param vcf [Required] Path to VCF file to split.
#' @param verbose [Optional] Enables progress messages. Default False.
#' @param output_dir [Optional] Path to the output directory.
#' @export

split_vcf=function(bin_path="tools/bcftools/bcftools",vcf="",verbose=FALSE,output_dir=""){
    sep="/"

    if(output_dir==""){
      sep=""
    }

    sample_name=ULPwgs::get_sample_name(vcf)

    out_file=paste0(output_dir,sep,sample_name,"_SPLIT")
    if (!dir.exists(out_file)){
        dir.create(out_file,recursive=TRUE)
    }

  # List the samples contained in the multisample VCF.
  name=system(paste(bin_path," query -l ",vcf), intern = TRUE, ignore.stderr = TRUE)
  # BUGFIX: the previous verbose branch re-ran the index command (duplicate
  # work); now each per-sample command is printed when verbose and executed
  # exactly once. -c1 keeps only records with at least one alt allele carried
  # by the selected sample.
  sapply(name,function(x){
    view_cmd=paste(bin_path, "view -c1 -Oz -s",x, "-o", paste0(out_file,"/",x,".vcf.gz"),vcf)
    # BUGFIX: bcftools index takes a single file argument; the source vcf was
    # previously appended as a spurious extra argument.
    index_cmd=paste(bin_path, "index", paste0(out_file,"/",x,".vcf.gz"))
    if (verbose){
      print(view_cmd)
      print(index_cmd)
    }
    system(view_cmd)
    system(index_cmd)
  })

}

#' VCF filtering using bcftools
#'
#' This function filters VCF calls using bcftools
#'
#' @param bin_path Path to bcftools binary. Default tools/bcftools/bcftools.
#' @param bin_path2 Path to bgzip binary. Default tools/htslib/bgzip.
#' @param bin_path3 Path to tabix binary. Default tools/htslib/tabix.
#' @param unfil_vcf Path to unfiltered vcf file.
#' @param qual Quality filter. Default 30.
#' @param mq Mapping quality filter. Default 40.
#' @param ref [Optional] Filter variants with ID.
#' @param state [Optional] Variant state to select. Options: het/homo
#' @param type [Optional] Variant type to include. Options: snp/indel. Default includes both
#' @param filter [Optional] Filter to include Options: PASS/.
#' @param output_dir Path to the output directory.
#' @param verbose Enables progress messages. Default False.
#' @export




vcf_filter_variants=function(unfil_vcf="",bin_path="tools/bcftools/bcftools",bin_path2="tools/htslib/bgzip",bin_path3="tools/htslib/tabix",qual=30,mq=40,state="",ref="",type="",filter="",verbose=FALSE,output_dir=""){

  sep="/"
  if(output_dir==""){
    sep=""
  }
  sample_name=ULPwgs::get_sample_name(unfil_vcf)
  out_file_dir=paste0(output_dir,sep,sample_name,"_FILTERED")
  if (!dir.exists(out_file_dir)){
      dir.create(out_file_dir,recursive=TRUE)
  }

  out_file=paste0(out_file_dir,"/",sample_name,".FILTERED.vcf")

  # The bcftools -i expression is assembled in two phases. Phase 1: exactly
  # one non-empty criterion must open the single-quoted expression ("'%...");
  # this cascade picks the highest-priority non-empty one as the opener
  # (qual > mq > filter > type > ref > state).
  if (qual!=""){
    qual=paste0("\'%QUAL>",qual)
  }else{
    if (mq!=""){
      mq=paste0("\'%MQ>",mq)
    }else{
        if (filter!=""){
          filter=paste0("\'%FILTER=\"",filter,"\" ")
        }else{
          if (type!=""){
            type=paste0("\'%TYPE=\"",type,"\" ")
          }else{
            if (ref!=""){
              ref=paste0("\'%ID!=\"",ref,"\" ")
            }else{
              if (state!=""){
                state=paste0("\'GT[0]=\"",state,"\" ")
              }
            }
          }
        }
      }
    }


    # Phase 2: every remaining non-empty criterion — detected by the absence
    # of the opener marker ("%" / "GT") — is appended as an " & ..." clause.
    if (state!=""&!grepl("GT",state)){
      state=paste0(" & GT[0]=\"",state,"\" ")
    }

    if (ref!=""&!grepl("%",ref)){
      ref=paste0(" & ID!=\"",ref,"\" ")
    }

    if (type!=""&!grepl("%",type)){
      type=paste0(" & TYPE=\"",type,"\" ")
    }
    if (filter!=""&!grepl("%",filter)){
      filter=paste0(" & FILTER=\"",filter,"\" ")
    }
    if (mq!=""&!grepl("%",mq)){
      mq=paste0(" & MQ>",mq)
    }

  # The trailing "\'" closes the expression opened in phase 1.
  if(verbose){
    print(paste(bin_path,"view  -i ",qual,filter,state,type,ref,mq,"\'",unfil_vcf,">",out_file))
  }
  system(paste(bin_path,"view  -i ",qual,filter,state,type,ref,mq,"\'",unfil_vcf,">",out_file))
  # Keep the plain vcf alongside a bgzipped+indexed copy: bgzip consumes the
  # original, so it is stashed and restored around the compression step.
  system(paste("cp", out_file, paste0(out_file,".tmp")))
  bgzip(bin_path=bin_path2,file=out_file)
  tab_indx(bin_path=bin_path3,file=paste0(out_file,".gz"))
  system(paste("cp", paste0(out_file,".tmp"), out_file))
  system(paste("rm -rf", paste0(out_file,".tmp")))
}

#' VCF fix MNPs
#'
#' Exclude MNPs from vcf file
#'
#' @param bin_path Path to bcftools binary. Default tools/bcftools/bcftools.
#' @param bin_path2 Path to bgzip binary. Default tools/htslib/bgzip.
#' @param bin_path3 Path to tabix binary. Default tools/htslib/tabix.
#' @param vcf Path to unfiltered vcf file.
#' @param output_dir Path to the output directory.
#' @param verbose Enables progress messages. Default False.
#' @export


fix_mnps=function(vcf="",bin_path="tools/bcftools/bcftools",bin_path2="tools/htslib/bgzip",bin_path3="tools/htslib/tabix",verbose=FALSE,output_dir=""){

  # "/" separator only when an output directory was supplied.
  out_file_dir=if(output_dir=="") "" else paste0(output_dir,"/")
  sample_name=ULPwgs::get_sample_name(vcf)
  if (!dir.exists(out_file_dir) & !out_file_dir==""){
      dir.create(out_file_dir,recursive=TRUE)
  }

  out_file=paste0(out_file_dir,sample_name,".vcf")

  # Drop multi-nucleotide polymorphism records; all other records pass through.
  cmd=paste(bin_path,"view --exclude-types mnps ",vcf,">",out_file)
  if(verbose){
    print(cmd)
  }
  system(cmd)
  # Keep the plain vcf plus a bgzipped+indexed copy (bgzip consumes the
  # original, so stash and restore it around compression).
  system(paste("cp", out_file, paste0(out_file,".tmp")))
  bgzip(bin_path=bin_path2,file=out_file)
  tab_indx(bin_path=bin_path3,file=paste0(out_file,".gz"))
  system(paste("cp", paste0(out_file,".tmp"), out_file))
  system(paste("rm -rf", paste0(out_file,".tmp")))
}



#' VCF file concatenation
#'
#' This function concatenates VCF files found in a directory.
#'
#' @param bin_path Path to bcftools binary. Default tools/bcftools/bcftools.
#' @param vcf Path to vcf files to concatenate. Only if vcf_dir not given
#' @param vcf_dir Path to directory with vcf files to concatenate. Only if vcf not given
#' @param output_dir Path to the output directory.
#' @param verbose Enables progress messages. Default False.
#' @export



vcf_concatenate=function(bin_path="tools/bcftools/bcftools",vcf="",vcf_dir="",verbose=FALSE,output_dir=""){
  # "/" separator only when an output directory was supplied.
  sep=if(output_dir=="") "" else "/"
  # Inputs come either from every .vcf in vcf_dir or from the explicit paths.
  if (vcf_dir!=""){
    files=list.files(vcf_dir,full.names=TRUE)
    files=files[grepl(".vcf$",files)]
    sample_name=ULPwgs::get_sample_name(list.files(vcf_dir)[1])
  }else{
    files=vcf
    sample_name=ULPwgs::get_sample_name(vcf[1])
  }

  out_file_dir=paste0(output_dir,sep,sample_name,"_CONCATENATED")
  if (!dir.exists(out_file_dir)){
      dir.create(out_file_dir)
  }

  out_file=paste0(out_file_dir,"/",sample_name,".CONCATENATED.vcf")

  cmd=paste(bin_path,"concat -o",out_file, paste(files, collapse=' ' ))
  if(verbose){
    print(cmd)
  }
  system(cmd)
}



#' VCF stats file concatenation
#'
#' This function merges VCF stat files found in a directory.
#'
#' @param bin_path Path to gatk binary. Default tools/gatk/gatk.
#' @param vcf_stats_dir Path to directory with vcf stats files to merge.
#' @param output_dir Path to the output directory.
#' @param verbose Enables progress messages. Default False.
#' @export



vcf_stats_merge=function(bin_path="tools/gatk/gatk",vcf_stats_dir="",verbose=FALSE,output_dir=""){
  # "/" separator only when an output directory was supplied.
  sep=if(output_dir=="") "" else "/"
  # Merge every *.vcf.stats found in the directory; the sample name is taken
  # from the first stats file.
  stat_files=list.files(vcf_stats_dir,full.names=TRUE)
  stat_files=stat_files[grepl(".vcf.stats$",stat_files)]
  sample_name=ULPwgs::get_sample_name(stat_files[1])
  out_file_dir=paste0(output_dir,sep,sample_name,"_MERGED_VCF_STATS")
  if (!dir.exists(out_file_dir)){
      dir.create(out_file_dir)
  }

  out_file=paste0(out_file_dir,"/",sample_name,".MERGED.vcf.stats")

  # Each input contributes its own " -stats <file>" argument.
  cmd=paste(bin_path,"MergeMutectStats -O",out_file," -stats ", paste(stat_files, collapse=' -stats ' ))
  if(verbose){
    print(cmd)
  }
  system(cmd)
}

#' VCF annotation using bcftools
#'
#' This function annotates VCF file using bcftools
#'
#' @param bin_path Path to bcftools binary. Default tools/bcftools/bcftools.
#' @param bin_path2 Path to bgzip binary. Default tools/htslib/bgzip.
#' @param bin_path3 Path to tabix binary. Default tools/htslib/tabix.
#' @param vcf Path to vcf file.
#' @param db Path to vcf file with SNPs database.
#' @param output_dir Path to the output directory.
#' @param verbose Enables progress messages. Default False.
#' @param threads Additional in/out compression threads. Default 1.
#' @export



vcf_annotate=function(bin_path="tools/bcftools/bcftools",bin_path2="tools/htslib/bgzip",bin_path3="tools/htslib/tabix",vcf="",db="",verbose=FALSE,output_dir="",threads=1){
  # "/" separator only when an output directory was supplied.
  sep=if(output_dir=="") "" else "/"
  sample_name=ULPwgs::get_sample_name(vcf)
  out_file_dir=paste0(output_dir,sep,sample_name,"_ANNOTATED")
  if (!dir.exists(out_file_dir)){
      dir.create(out_file_dir)
  }

  out_file=paste0(out_file_dir,"/",sample_name,".ANNOTATED.vcf")

  # Transfer the ID column (e.g. dbSNP rsids) from db onto matching records.
  cmd=paste(bin_path,"annotate -a ",db,"-c ID --threads",threads,vcf,">",out_file)
  if(verbose){
    print(cmd)
  }
  system(cmd)
  # Keep the plain vcf plus a bgzipped+indexed copy (bgzip consumes the
  # original, so stash and restore it around compression).
  system(paste("cp", out_file, paste0(out_file,".tmp")))
  bgzip(bin_path=bin_path2,file=out_file)
  tab_indx(bin_path=bin_path3,file=paste0(out_file,".gz"))
  system(paste("cp", paste0(out_file,".tmp"), out_file))
  system(paste("rm -rf", paste0(out_file,".tmp")))
}


#' Sort VCF file
#'
#' This function takes a VCF file and sorts it genomic order
#'
#' @param bin_path [Required] Path to bcftools binary. Default tools/bcftools/bcftools.
#' @param vcf [Required] Path to VCF file to sort.
#' @param output_dir [Optional] Path to the output directory. Default working directory.
#' @param ram [Required] RAM in GB to allocate sorting. Default 4Gb.
#' @param verbose [Optional] Enables progress messages. Default False.
#' @param tmp_dir [Optional] Path to tmp file directory.
#' @export



vcf_sort=function(bin_path="tools/bcftools/bcftools",vcf="",verbose=FALSE,output_dir="",ram=4,tmp_dir=""){
  # "/" separator only when an output directory was supplied.
  sep=if(output_dir=="") "" else "/"

  sample_name=ULPwgs::get_sample_name(vcf)
  file_ext=ULPwgs::get_file_extension(vcf)
  out_file_dir=paste0(output_dir,sep,sample_name,"_SORTED")
  if (!dir.exists(out_file_dir)){
      dir.create(out_file_dir)
  }

  out_file=paste0(out_file_dir,"/",sample_name)
  # Optional scratch directory for bcftools sort temporaries.
  tmp=if(tmp_dir=="") "" else paste0("-T ",tmp_dir)

  cmd=paste0(bin_path," sort ",vcf," -m ",ram, "G ",tmp," -o ",out_file,".SORTED.",file_ext)
  if (verbose){
    print(cmd)
  }
  system(cmd)


}

#' Filter VEP results
#'
#' This function filters VCFs generated by VEP
#'
#' @param bin_path Path to vep binary. Default tools/ensembl-vep/filter_vep.
#' @param bin_path2 Path to bgzip binary. Default tools/htslib/bgzip.
#' @param bin_path3 Path to tabix binary. Default tools/htslib/tabix.
#' @param unf_vcf Path to vcf file.
#' @param filter Column tags to filter by. Examples "(MAX_AF < 0.01 or not MAX_AF) and FILTER = PASS and SYMBOL in /home/regmova/PCFselect/Panel_V2/PANEL_GENE.txt". For more information check VEP filter info.
#' @param output_dir Path to the output directory.
#' @param verbose Enables progress messages. Default False.
#' @export

filter_VEP=function(bin_path="tools/ensembl-vep/filter_vep",bin_path2="tools/htslib/bgzip",bin_path3="tools/htslib/tabix",unf_vcf="",filter="",verbose=FALSE,output_dir=""){

  sep="/"
  if(output_dir==""){
    sep=""
  }
  sample_name=ULPwgs::get_sample_name(unf_vcf)
  out_file_dir=paste0(output_dir,sep,sample_name,"_FILTERED_VEP")
  if (!dir.exists(out_file_dir)){
      dir.create(out_file_dir,recursive=TRUE)
  }

  out_file=paste0(out_file_dir,"/",sample_name,".FILTERED.VEP.vcf")

  # BUGFIX: filter expressions contain spaces and quotes (see @param filter
  # example), so they must reach filter_vep as one shell argument — shQuote
  # wraps the expression; previously it was splatted unquoted into the shell.
  cmd=paste(bin_path,"-i ",unf_vcf,"-f ",shQuote(filter),">",out_file)
  if(verbose){
    print(cmd)
  }
  system(cmd)
  # Keep the plain vcf plus a bgzipped+indexed copy (bgzip consumes the
  # original, so stash and restore it around compression).
  system(paste("cp", out_file, paste0(out_file,".tmp")))
  bgzip(bin_path=bin_path2,file=out_file)
  tab_indx(bin_path=bin_path3,file=paste0(out_file,".gz"))
  system(paste("cp", paste0(out_file,".tmp"), out_file))
  system(paste("rm -rf", paste0(out_file,".tmp")))
}


#' ASEQ pileup formatting for downstream analysis
#'
#' This function formats ASEQ pileup file for downstream analysis.
#' It keeps single-nucleotide ALT alleles only, looks up the read count
#' supporting each ALT base from the per-base columns (A/C/G/T) and derives
#' the allele fraction (af = ALT reads / RD) and coverage (cov = RD).
#'
#' @param file Path to file to format.
#' @param output_dir Path to the output directory.
#' @param verbose Enables progress messages. Default False.
#' @import data.table
#' @export

format_ASEQ_pileup=function(file="",verbose=FALSE,output_dir=""){
  # Separator is dropped when no output_dir is given (relative output paths).
  sep="/"
  if(output_dir==""){
    sep=""
  }
  sample_name=ULPwgs::get_sample_name(file)
  out_file_dir=paste0(output_dir,sep,sample_name,"_FORMATTED_ASEQ_PILEUP")
  if (!dir.exists(out_file_dir)){
      dir.create(out_file_dir)
  }

  # Output file name <sample>.snps matches the PaPI_Suffix expected by the
  # CLONET config generated elsewhere in this file.
  out_file=paste0(out_file_dir,"/",sample_name,".snps")
  dat=read.table(file,header=TRUE)
  dat=dat %>% dplyr::select(chr,pos,dbsnp,ref,alt,A,C,G,T,RD)
  # Drop multi-base ALT alleles (indels/MNVs); keep SNVs only.
  dat=dat %>% dplyr::filter(!nchar(as.character(alt))>1)
  dat=as.data.table(dat)
  # Per ALT base, fetch the matching per-base count column (A/C/G/T).
  # Grouping by alt makes get() resolve exactly one column name per group.
  dat=dat[,Value := get(as.character(alt)), by = alt]
  # af = ALT-supporting reads / total depth; cov = total read depth (RD).
  dat=dat %>% dplyr::mutate(af=Value/RD,cov=RD) %>% dplyr::select(chr, pos,dbsnp,ref,alt,A,C,G,T,af,cov) %>% dplyr::rename (rsid="dbsnp")
  write.table(dat,file=out_file,quote=FALSE,row.names=FALSE,sep="\t")
}


#' Format segmentation data for downstream analysis using CLONETv2
#'
#' This function takes the segmentation data of multiple samples produced by any segment caller (mainly CNVkit) and generates
#' a single BED file with all the segment information (chr/start/end/log2) with an additional column for sample ID.
#' The default columns to select from original segmentation files are 1,2,3,5, which correspond to chr/start/end/log2 in CNVkit.
#' Use argument cols_to_keep to select other columns if needed or if order is different from different segment callers.
#'
#' @param seg_file [REQUIRED] Path/s to segmentation file/s. dir_segment and seg_file are mutually excluding.
#' @param dir_segment [REQUIRED] Path to directory with segmentation files.
#' @param pattern [FALSE] Pattern to use if directory for segmentation files is given.
#' @param cols_to_keep [DEFAULT==c("chr","start","end","log2","sample"))] Columns to keep from original segmentation bed files
#' @param output_dir Path to the output directory.
#' @param output_name Name of the file to output. Default SegmentData.
#' @param verbose Enables progress messages. Default False.
#' @export



format_segment_data=function(seg_file="",dir_segment="",pattern="",cols_to_keep=c("chr","start","end","log2","sample"),output_dir="",output_name="",verbose=FALSE){

  # seg_file may be a vector of paths, so keep the conditions scalar with
  # any()/all() and short-circuit &&.
  if(dir_segment!="" && any(seg_file!="")){
    stop("Only seg_file or dir_segment can be provided, not both.")
  }
  # Fail early with a clear message instead of a cryptic read.table("") error.
  if(dir_segment=="" && all(seg_file=="")){
    stop("Either seg_file or dir_segment has to be provided.")
  }

  if (dir_segment!=""){
    files=list.files(path=dir_segment,pattern=pattern,full.names=TRUE)
  }else{
    files=seg_file
  }

  # Read every segmentation file, tag its rows with the sample name derived
  # from the file name, then stack everything into one data frame.
  data=lapply(files,FUN=function(x){
    dat=read.table(file=x,header=TRUE)
    dat$sample=ULPwgs::get_sample_name(x)
    dat
  })
  data=dplyr::bind_rows(data)

  # Rename by position: columns 1,2,3,5,6 are assumed to be
  # chr/start/end/log2/cn (CNVkit layout); the last column is the sample tag
  # appended above. NOTE(review): other segment callers must match this
  # column order -- confirm before reuse.
  names(data)[c(1,2,3,5,6,ncol(data))]=c("chr","start","end","log2","cn","sample")
  # Shift chrX log2 by +1; assumes a single-X genome -- TODO confirm this is
  # intended for all cohorts.
  data$log2=ifelse(data$chr=="X",data$log2+1,data$log2)
  data=data[,c(cols_to_keep)]

  sep=if(output_dir=="") "" else "/"

  if(output_name==""){
    output_name="SegmentData"
  }
  out_file_dir=paste0(output_dir,sep,"FORMATTED_SEGMENTS")
  if (!dir.exists(out_file_dir)){
      dir.create(out_file_dir)
  }

  out_file=paste0(out_file_dir,"/",output_name,".bed")
  write.table(data,file=out_file,quote=FALSE,row.names=FALSE,sep="\t")
}


#' Format vcf for downstream point mutatation analysis.
#'
#'
#' This function takes a VCF file and outputs a tab delimited file with data for downstream analysis in CLONETv2 point mutations analysis.
#' Two files are produced: <output_name>.tsv with per-SNV ref/alt read counts
#' and <output_name>.tsv.ALL with the full per-consequence VEP annotation.
#'
#' @param bin_path [REQUIRED] Path to ASEQ binary. Default tools/bcftools/bcftools
#' @param vcf_dir [REQUIRED] Path to vcf file directory.
#' @param pattern [OPTIONAL] Pattern to filter vcfs by.
#' @param vcf [OPTIONAL] Path to vcf file. Only if vcf directory not provided.
#' @param output_dir [OPTIONAL]  Path to the output directory.
#' @param worst [OPTIONAL]  Only keep the variants with the worst outcome.
#' @param output_name [OPTIONAL]  Name of output file.
#' @param verbose [OPTIONAL]  Enables progress messages. Default False.
#' @export


format_PM_data=function(bin_path="tools/bcftools/bcftools",vcf_dir="",pattern="",vcf="",worst=FALSE,output_dir="",output_name="",verbose=FALSE){

  if(vcf!="" & vcf_dir!=""){

    stop("Only vcf or vcf_dir arguments can be provided, not both.")
  }

  if (vcf_dir!=""){
    files=list.files(path=vcf_dir,pattern=pattern,full.names=TRUE,recursive=TRUE)
    files=files[grepl("vcf$",files)]
  }else {
    files=vcf
  }

  if(output_name==""){
    output_name="PMreadsCount"
  }
  wrst=""
  if(worst){
    wrst= " -s worst "
  }

  sep="/"
  if(output_dir==""){
    sep=""
  }

  out_file_dir=paste0(output_dir,sep,"FORMATTED_PM_ANALYSIS")
  out_file=paste0(out_file_dir,"/",output_name,".tsv")

  if (!dir.exists(out_file_dir)){
      dir.create(out_file_dir)
  }

  # Resolve the split-vep plugin path once: the plugins/ directory sits next
  # to the bcftools binary. The previous replacement "plugins\\1" was an
  # invalid backreference (the pattern has no capture group).
  plugin=system(paste0("echo ",sub("bcftools$","plugins",bin_path),"/split-vep.so"),intern=TRUE)

  # Build every shell command exactly once so the verbose output always
  # matches what is executed (previously the printed CSQ command lacked the
  # %REF/%ALT fields present in the executed one).
  hdr_cmd=paste("printf","\'Gene.id\tchr\tstart\tend\tsample\tref.count\talt.count\n\'",">",out_file)
  # Per-VCF command: extract SNV symbol/position/AD via split-vep, reshape
  # with awk into Gene.id/chr/start/end/sample/ref.count/alt.count rows.
  snp_cmd=function(x){paste0(bin_path," +",plugin," -f \'%SYMBOL{0}\\t%CHROM\\t%POS\\t%FILTER\\t[\\t%AD]\\n\' -i \'TYPE=\"snp\" & AD[0:1]>0\' ",wrst,x," | awk -F \'[\t,]\' \'{ print  $1,$2,$3,$3, \"",ULPwgs::get_sample_name(x),"\", $5, $6 }\' OFS=\'\\t\' >> ",out_file)}
  # Header for the full annotation table; column 55 is AA_AF (the executed
  # version previously had a truncated "AA_A").
  hdr_all_cmd=paste("printf","\'Gene.id\tchr\tpos\tref\talt\tFILTER\tAD\tDP\tAllele\tConsequence\tIMPACT\tSYMBOL\tGene\tFeature_type\tFeature\tBIOTYPE\tEXON\tINTRON\tHGVSc\tHGVSp\tcDNA_position\tCDS_position\tProtein_position\tAmino_acids\tCodons\tExisting_variation\tDISTANCE\tSTRAND\tFLAGS\tVARIANT_CLASS\tSYMBOL_SOURCE\tHGNC_ID\tCANONICAL\tMANE\tTSL\tAPPRIS\tCCDS\tENSP\tSWISSPROT\tTREMBL\tUNIPARC\tUNIPROT_ISOFORM\tGENE_PHENO\tSIFT\tPolyPhen\tDOMAINS\tmiRNA\tHGVS_OFFSET\tAF\tAFR_AF\tAMR_AF\tEAS_AF\tEUR_AF\tSAS_AF\tAA_AF\tEA_AF\tgnomAD_AF\tgnomAD_AFR_AF\tgnomAD_AMR_AF\tgnomAD_ASJ_AF\tgnomAD_EAS_AF\tgnomAD_FIN_AF\tgnomAD_NFE_AF\tgnomAD_OTH_AF\tgnomAD_SAS_AF\tMAX_AF\tMAX_AF_POPS\tCLIN_SIG\tSOMATIC\tPHENO\tPUBMED\tMOTIF_NAME\tMOTIF_POS\tHIGH_INF_POS\tMOTIF_SCORE_CHANGE\tTRANSCRIPTION_FACTORS\tsample\n\'",">",paste0(out_file,".ALL"))
  # Per-VCF command: dump all consequences (-d) tab-separated, appending the
  # sample name to every row with sed.
  all_cmd=function(x){paste0(bin_path," +",plugin," -f \'%SYMBOL{0}\\t%CHROM\\t%POS\\t%REF\\t%ALT\\t%FILTER\\t[\\t%AD]\\t[\\t%DP]\\t%CSQ\\n\' -i \' AD[0:1]>0\' ",wrst," -d -A tab ",x," |sed \'s/$/\\t",ULPwgs::get_sample_name(x),"/\' >>",paste0(out_file,".ALL"))}

  if(verbose){
    print(hdr_cmd)
    lapply(files,FUN=function(x){print(snp_cmd(x))})
    print(hdr_all_cmd)
    lapply(files,FUN=function(x){print(all_cmd(x))})
  }
  system(hdr_cmd)
  lapply(files,FUN=function(x){system(snp_cmd(x))})
  system(hdr_all_cmd)
  lapply(files,FUN=function(x){system(all_cmd(x))})
}



#' VCF formating using bcftools
#'
#' This function formats VCF file using bcftools. The extracted fields are
#' written as plain text and a bgzip-compressed, tabix-indexed copy is kept
#' alongside.
#'
#' @param bin_path Path to gatk binary. Default tools/gatk/gatk.
#' @param bin_path2 Path to bgzip binary. Default tools/htslib/bgzip.
#' @param bin_path3 Path to tabix binary. Default tools/htslib/tabix.
#' @param vcf Path to vcf file.
#' @param expr Expresion by which to format VCF. Default %CHROM\\t%POS\\t%ID\\t%REF\\t%ALT\\t%QUAL\\t%FILTER\\t%INFO\\n
#' @param output_dir Path to the output directory.
#' @param verbose Enables progress messages. Default False.
#' @export

vcf_format=function(vcf="",bin_path="tools/bcftools/bcftools",bin_path2="tools/htslib/bgzip",bin_path3="tools/htslib/tabix",expr="'%CHROM\\t%POS\\t%ID\\t%REF\\t%ALT\\t%QUAL\\t%FILTER\\t%INFO\\n'",verbose=FALSE,output_dir=""){
    sep=if(output_dir=="") "" else "/"

    sample_name=ULPwgs::get_sample_name(vcf)
    out_file_dir=paste0(output_dir,sep,sample_name,"_FORMATED")
    if (!dir.exists(out_file_dir)){
      dir.create(out_file_dir)
    }
    out_file=paste0(out_file_dir,"/",sample_name,".FORMATED.vcf")

    # Build the bcftools query command once; print only in verbose mode.
    cmd=paste(bin_path,"query -f ",expr, vcf,">",out_file)
    if(verbose){
      print(cmd)
    }
    system(cmd)

    # bgzip replaces the plain file, so stash a copy, index the .gz, and
    # restore the uncompressed output afterwards.
    tmp_copy=paste0(out_file,".tmp")
    system(paste("cp", out_file, tmp_copy))
    bgzip(bin_path=bin_path2,file=out_file)
    tab_indx(bin_path=bin_path3,file=paste0(out_file,".gz"))
    system(paste("cp", tmp_copy, out_file))
    system(paste("rm -rf", tmp_copy))
  }







#' VCF filtering using GATK
#'
#' This function filters VCF calls using GATK statistics (FilterMutectCalls).
#' The filtered VCF is kept as plain text with a bgzip-compressed,
#' tabix-indexed copy alongside.
#'
#' @param bin_path [REQUIRED] Path to gatk binary. Default tools/gatk/gatk.
#' @param bin_path2 [REQUIRED] Path to bgzip binary. Default tools/htslib/bgzip.
#' @param bin_path3 [REQUIRED] Path to tabix binary. Default tools/htslib/tabix.
#' @param ref_genome [REQUIRED] Path to reference genome fasta file.
#' @param unfil_vcf [REQUIRED] Path to unfiltered vcf file.
#' @param unfil_vcf_stats [REQUIRED] Path to unfiltered vcf file stats.
#' @param contamination [OPTIONAL] Path/s to contamination table/s. Also requires segmentation.
#' @param segmentation [OPTIONAL] Path/s to segments table/s. Also requires contamination.
#' @param orientation_model [OPTIONAL] Path to orientation model generated using learn_orientation.
#' @param output_dir [OPTIONAL] Path to the output directory.
#' @param verbose [OPTIONAL] Enables progress messages. Default False.
#' @export


vcf_filtering=function(bin_path="tools/gatk/gatk",bin_path2="tools/htslib/bgzip",bin_path3="tools/htslib/tabix",unfil_vcf="",ref_genome="",unfil_vcf_stats="",contamination="",segmentation="",orientation_model="",verbose=FALSE,output_dir=""){

    sep=if(output_dir=="") "" else "/"

    sample_name=ULPwgs::get_sample_name(unfil_vcf)
    out_file_dir=paste0(output_dir,sep,sample_name,"_FILTERED")
    if (!dir.exists(out_file_dir)){
        dir.create(out_file_dir)
    }

    out_file=paste0(out_file_dir,"/",sample_name,".FILTERED.vcf")

    # contamination/segmentation may be vectors of tables; each entry gets
    # its own flag. They are only meaningful together: previously a lone
    # value was pasted into the command as a bare token, silently producing
    # a malformed gatk invocation.
    has_cont=any(contamination!="")
    has_seg=any(segmentation!="")
    if (has_cont && has_seg){
      contamination=paste0(" --contamination-table ", paste0(contamination,collapse=" --contamination-table "))
      segmentation=paste0(" --tumor-segmentation ",paste0(segmentation,collapse=" --tumor-segmentation "))
    } else {
      if (has_cont || has_seg){
        warning("contamination and segmentation have to be provided together; ignoring the one supplied.",call.=FALSE)
      }
      contamination=""
      segmentation=""
    }

    if (orientation_model!=""){
      orientation_model=paste0(" --ob-priors ",orientation_model)
    }

    # Build the FilterMutectCalls command once so the printed and executed
    # commands are always identical.
    cmd=paste(bin_path,"FilterMutectCalls -O",out_file," -R ",ref_genome," -V ",unfil_vcf," -stats ",unfil_vcf_stats, contamination,segmentation,orientation_model)
    if(verbose){
      print(cmd)
    }
    system(cmd)

    # Keep both the plain and the bgzip+tabix indexed versions of the output.
    system(paste("cp", out_file, paste0(out_file,".tmp")))
    bgzip(bin_path=bin_path2,file=out_file)
    tab_indx(bin_path=bin_path3,file=paste0(out_file,".gz"))
    system(paste("cp", paste0(out_file,".tmp"), out_file))
    system(paste("rm -rf", paste0(out_file,".tmp")))
  }

#' VCF contamination estimation using GATK
#'
#' This function estimates sample contamination for further variant filtering
#' (GATK CalculateContamination). It writes <sample>.contamination.table and
#' <sample>.segments.table into output_dir.
#'
#' @param bin_path [REQUIRED] Path to gatk binary. Default tools/gatk/gatk.
#' @param pileup_table_tumor [REQUIRED] Path to pileup_table for tumor sample generated using pileup_summary
#' @param pileup_table_normal [OPTIONAL] Path to pileup_table for matched normal sample generated using pileup_summary
#' @param output_dir [OPTIONAL]  Path to the output directory.
#' @param verbose [OPTIONAL]  Enables progress messages. Default False.
#' @export


estimate_contamination=function(pileup_table_tumor="",bin_path="tools/gatk/gatk",pileup_table_normal="",verbose=FALSE,output_dir=""){

      # Empty output_dir means "write to the working directory". The original
      # computed sep but then hard-coded "/", so with output_dir=="" the
      # outputs pointed at the filesystem root ("/<sample>...").
      sep=if(output_dir=="") "" else "/"
      sample_name=ULPwgs::get_sample_name(pileup_table_tumor)
      if (output_dir!="" && !dir.exists(output_dir)){
          dir.create(output_dir)
      }

      out_file=paste0(output_dir,sep,sample_name,".contamination.table")
      out_file2=paste0(output_dir,sep,sample_name,".segments.table")
      # Optional matched-normal pileup is passed via the -matched flag.
      if (pileup_table_normal!=""){
        pileup_table_normal=paste0("-matched ",pileup_table_normal)
      }

      # Build the command once so print and system always agree.
      cmd=paste(bin_path,"CalculateContamination -O ",out_file," -tumor-segmentation ",out_file2," -I ",pileup_table_tumor,pileup_table_normal)
      if(verbose){
        print(cmd)
      }
      system(cmd)
  }

#' VCF contamination estimation using GATK
#'
#' This function estimates sample contamination for further variant filtering.
#' Stage 1 generates pileup summaries for every patient BAM in parallel;
#' stage 2 runs CalculateContamination for each tumor pileup against the
#' matched normal pileup.
#'
#' @param bin_path [REQUIRED] Path to gatk binary. Default tools/gatk/gatk.
#' @param bam_dir [REQUIRED] Path to directory with BAM files
#' @param germ_pattern [REQUIRED] Pattern to match normal samples.
#' @param patient_id [REQUIRED] Pattern to match patient specific samples.
#' @param db [REQUIRED] Path to vcf with known variants.
#' @param interval [REQUIRED] Path to vcf with intervals to analyze.
#' @param output_dir [OPTIONAL]  Path to the output directory.
#' @param verbose [OPTIONAL]  Enables progress messages. Default False.
#' @param threads [OPTIONAL]  Number of threads to use. Default 3.
#' @export


estimate_contamination_parallel=function(bin_path="tools/gatk/gatk",bam_dir="",germ_pattern="GL",patient_id="",db="",interval="",verbose=FALSE,output_dir="",threads=3){

      sep=if(output_dir=="") "" else "/"
      sample_name=ULPwgs::get_sample_name(patient_id)
      out_file_dir=paste0(output_dir,sep,sample_name,"_CONTAMINATION")
      if (!dir.exists(out_file_dir)){
          dir.create(out_file_dir)
      }

      # All of the patient's BAMs (tumor and normal) get a pileup summary.
      files=list.files(bam_dir,recursive=TRUE,full.names=TRUE,pattern=patient_id)
      files=files[grepl("bam$",files)]

      # Stage 1: pileup summaries in parallel. The original created a second
      # cluster below without stopping this one (its plain on.exit() call was
      # overwritten by the second), leaking worker processes; stop each
      # cluster explicitly and keep a guarded on.exit as a safety net.
      cl=parallel::makeCluster(threads)
      on.exit(try(parallel::stopCluster(cl),silent=TRUE),add=TRUE)
      pbapply::pblapply(X=files,FUN=get_pileup_summary,bin_path=bin_path,db=db,interval=interval,verbose=verbose,output_dir=out_file_dir,cl=cl)
      parallel::stopCluster(cl)

      # Stage 2: split the generated pileups into tumor and matched normal.
      files=list.files(out_file_dir,recursive=TRUE,full.names=TRUE,pattern=patient_id)
      files=files[grepl(".pileup.table$",files)]
      tumor_pileups=files[!grepl(germ_pattern,files)]
      normal_pileup=files[grepl(germ_pattern,files)]

      cl=parallel::makeCluster(threads)
      on.exit(try(parallel::stopCluster(cl),silent=TRUE),add=TRUE)
      pbapply::pblapply(X=tumor_pileups,FUN=estimate_contamination,bin_path=bin_path,pileup_table_normal=normal_pileup,verbose=verbose,output_dir=out_file_dir,cl=cl)
      parallel::stopCluster(cl)
}




#' BAM pileup summary for known sites using GATK
#'
#' This function generates a pileup summary (GATK GetPileupSummaries) that is
#' further used in estimating contamination. Output is written to
#' <output_dir>/<sample>.pileup.table.
#'
#' @param bin_path [REQUIRED] Path to gatk binary. Default tools/gatk/gatk.
#' @param bam [REQUIRED] Path to BAM file.
#' @param db [REQUIRED] Path to vcf with known variants.
#' @param interval [REQUIRED] Path to vcf with intervals to analyze.
#' @param output_dir [OPTIONAL]  Path to the output directory.
#' @param verbose [OPTIONAL]  Enables progress messages. Default False.
#' @export

get_pileup_summary=function(bam="",bin_path="tools/gatk/gatk",db="",interval="",verbose=FALSE,output_dir=""){
      # Empty output_dir means the working directory. The original computed
      # sep but then pasted a hard-coded "/", so with output_dir=="" the
      # output path pointed at the filesystem root.
      sep=if(output_dir=="") "" else "/"
      sample_name=ULPwgs::get_sample_name(bam)
      if (output_dir!="" && !dir.exists(output_dir)){
          dir.create(output_dir)
      }

      out_file=paste0(output_dir,sep,sample_name,".pileup.table")

      # Build the command once so print and system always agree.
      cmd=paste(bin_path,"GetPileupSummaries  -O",out_file," -V ",db," -L ",interval, " -I ",bam)
      if(verbose){
        print(cmd)
      }
      system(cmd)
  }





#' This function filter and formats heterozygous SNP data for downstream analysis using clonet
#'
#' This function takes a path of a directory of unfiltered VCFs files generated using Platypus and
#' the path of the directory of BAMs from which those VCFs were generated.
#' VCFs are filtered to heterozygous PASS SNPs, piled up with ASEQ against the
#' matching BAMs, formatted with format_ASEQ_pileup, and the resulting .snps
#' files are sorted into RESULTS/TumorPileup and RESULTS/NormalPileup.
#'
#' @param bin_path Path to bcftools binary. Default tools/bcftools/bcftools.
#' @param bin_path2 Path to bgzip binary. Default tools/htslib/bgzip.
#' @param bin_path3 Path to tabix binary. Default tools/htslib/tabix.
#' @param bin_path4 Path to ASEQ binary. Default tools/ASEQ/binaries/linux64/ASEQ
#' @param unfil_vcf Path to unfiltered VCF file.
#' @param unfil_vcf_dir Path to unfiltered VCF file directory.
#' @param qual Quality filter. Default 30.
#' @param mq Mapping quality filter. Default 40.
#' @param min_cov Minimum coverage to filter. Default 20.
#' @param bam_dir Path to BAM file directory.
#' @param patient_id Patient identifier.
#' @param germ_pattern Germline identifier.
#' @param output_dir Path to the output directory.
#' @param verbose Enables progress messages. Default FALSE.
#' @param remove_tmp Remove temporary files. Default TRUE
#' @param jobs Number of jobs to do in parallel. Default 1
#' @param threads Number of threads to use per job. Default 3
#' @export

format_SNP_data=function(bin_path="tools/bcftools/bcftools",bin_path2="tools/htslib/bgzip",bin_path3="tools/htslib/tabix",bin_path4="tools/ASEQ/binaries/linux64/ASEQ",unfil_vcf="",unfil_vcf_dir="",bam_dir="",germ_pattern="GL",qual=30,mq=40,min_cov=20,patient_id="",verbose=FALSE,output_dir="",jobs=1,threads=3,remove_tmp=TRUE){
    sep="/"
    if(output_dir==""){
      sep=""
    }
    out_file_dir=paste0(output_dir,sep,patient_id,"_PROCESSED_SNPs")
    if (!dir.exists(out_file_dir)){
        dir.create(out_file_dir)
    }
    if (unfil_vcf_dir!=""){
      # Directory mode: filter every patient VCF, then pile up each filtered
      # VCF against the BAM with the same sample name.
      files0=list.files(unfil_vcf_dir,recursive=TRUE,full.names=TRUE,pattern=patient_id)
      files0=files0[grepl("vcf.gz$",files0)]

      # NOTE(review): mclapply over as.data.frame(files0) iterates the
      # data.frame's single column, so FUN receives the whole vector in one
      # call rather than one file per worker -- confirm intended parallelism.
      parallel::mclapply(X=as.data.frame(files0),FUN=vcf_filter_variants,bin_path=bin_path,bin_path2=bin_path2,bin_path3=bin_path3,qual=qual,mq=mq,state="het",type="snp",filter="PASS",verbose=verbose,output_dir=out_file_dir,mc.cores=jobs*threads)
      # Collect the filtered plain-text VCFs produced above.
      files2=list.files(out_file_dir,recursive=TRUE,full.names=TRUE,pattern="FILTERED")
      files2=files2[grepl("vcf$",files2)]
      files2=as.data.frame(files2)
      names(files2)="VCF_path"
      files2$Sample=apply(files2,1,FUN=ULPwgs::get_sample_name)

      # Match each filtered VCF with its BAM via the shared sample name.
      files3=list.files(bam_dir,recursive=TRUE,full.names=TRUE,pattern=patient_id)
      files3=files3[grepl("bam$",files3)]
      files3=as.data.frame(files3)
      names(files3)="BAM_path"
      files3$Sample=apply(files3,1,FUN=ULPwgs::get_sample_name)
      files=dplyr::left_join(files2,files3,by="Sample")
      # ASEQ pileup per VCF/BAM pair (call_ASEQ is defined elsewhere in the
      # package); jobs processes x threads per process.
      parallel::mclapply(X=1:nrow(files),FUN=function(x){call_ASEQ(vcf=as.character(files[x,1]),bin_path=bin_path4,bam=as.character(files[x,3]),mrq=mq,mbq=qual,mdc=min_cov,output_dir=out_file_dir,threads=threads,verbose=verbose)},mc.cores=jobs)

    }else{
      # Single-VCF mode: pile up the one provided VCF against every BAM.
      files3=list.files(bam_dir,recursive=TRUE,full.names=TRUE,pattern=patient_id)
      files3=files3[grepl("bam$",files3)]
      parallel::mclapply(X=1:length(files3), FUN=function(x){call_ASEQ(vcf=unfil_vcf,bam=as.character(files3[x]),mrq=mq,bin_path=bin_path4,mbq=qual,mdc=min_cov,output_dir=out_file_dir,threads=threads,verbose=verbose)},mc.cores=jobs)
    }

    # Format every ASEQ pileup (PILEUP.ASEQ) into CLONET-ready .snps files.
    files3=list.files(out_file_dir,recursive=TRUE,full.names=TRUE,pattern="PILEUP.ASEQ")
    parallel::mclapply(X=1:length(files3),FUN=function(x){format_ASEQ_pileup(file=files3[x],verbose=verbose,output_dir=out_file_dir)},mc.cores=jobs*threads)
    # Split formatted pileups into tumor vs germline by the germ_pattern tag.
    files=list.files(out_file_dir,recursive=TRUE,full.names=TRUE,pattern=".snp")
    tumor_snps=files[!grepl(germ_pattern,files)]
    germ_snps=files[grepl(germ_pattern,files)]
    out_file_dir2=paste0(out_file_dir,"/RESULTS")
    out_file_dir2_ger=paste0(out_file_dir2,"/NormalPileup")
    out_file_dir2_tumor=paste0(out_file_dir2,"/TumorPileup")
    if (!dir.exists(out_file_dir2_ger)){
        dir.create(out_file_dir2_ger,recursive=TRUE)
    }
    if (!dir.exists(out_file_dir2_tumor)){
        dir.create(out_file_dir2_tumor,recursive=TRUE)
    }

    system(paste("cp -rf -t ",out_file_dir2_tumor,paste(tumor_snps,collapse=" ")))
    system(paste("cp -rf -t ",out_file_dir2_ger,paste(germ_snps,collapse=" ")))
    # Optionally drop everything except the RESULTS directory.
    if (remove_tmp){
      system(paste0("ls -d ", paste0(out_file_dir,"/*"), " |egrep -v RESULTS | xargs rm -rf"))
    }
  }


#' This function generates a config files for CLONET pipeline
#'
#' This function takes multiple parameters and generates a config file for the CLONET tool
## Stages to perform:
#   1. analyse single sample and produe RData
#   2. create beta table aggragating single samples analysis
#   3. compute ploidy shift
#   4. compute global admixture
#   5. compute clonality table
#   6. compute allele specific copy number table
#'
#' @param patient_id Patient ID. Default Patient
#' @param clonet_dir Path to CLONET dir
#' @param snp_dir Path to formated SNP data dir.
#' @param segment_data Path to file with formated segment data.
#' @param sample_info Path to file with sample info data.
#' @param min_snp_cov Minimum tumor coverage for informative SNPs. Default 10.
#' @param min_nsnps Minimum number of SNPs per segment. Default 10.
#' @param min_seg_cov Minimum segment coverage. Default 20.
#' @param equal_betaThr Minimum value of beta above which the two alleles are present in the same number. Default 0.9
#' @param max_homo_dels Homozygous deletions threshold. Default 0.01
#' @param del_log_thr Parameters of a valid deletion used to compute Adm.global. Default c(-1,-0.25)
#' @param alpha_par  Percentage of used deletions to compute Adm.global varibility interval. Default 0.9
#' @param clonal_thr Clonality value threshold. Default 0.85
#' @param beta_thr Beta value threshold. Default 0.85
#' @param stages Analysis stages to run trough.Default c(1,2,3,4,5,6)
#' @param comp_ref_map_bias Compute reference mapping bias and to adjust beta estimation. Default FALSE
#' @param beta_decimals Number of beta value decimals to report. Default 3
#' @param ale_imb_thr Maximum distance from integer (cnA,cnB) copy number values (allelic imbalance threshold). Default 0.5
#' @param beta_method Method for beta estimation. Default STM. Options STM/GB
#' @param adm_method Method for admixture estimation. Default 2D. Options 1D/2D
#' @param jobs Number of Samples to analyze in parallel. Default 1
#' @param threads Number of threads per job to use. Default 3
#' @param output_dir Path to the output directory.
#' @param verbose Enables progress messages. Default FALSE.
#' @export

generate_CLONET_config=function(patient_id="PATIENT",clonet_dir="tools/CLONET",snp_dir="",segment_data="",
    sample_info="",min_snp_cov=10,min_nsnps=10,min_seg_cov=20,equal_betaThr=0.9,max_homo_dels=0.01,
    del_log_thr=c(-1,-0.25),alpha_par=0.9,clonal_thr=0.85,beta_thr=0.85,
    stages=c(1,2,3,4,5,6),comp_ref_map_bias=FALSE,beta_decimals=3,ale_imb_thr=0.5,
    beta_method="STM",adm_method="2D",jobs=1,threads=3,output_dir=""){

    # Path separator is dropped when no output_dir is given (relative output).
    sep="/"
    if(output_dir==""){
      sep=""
    }
    out_file_dir=paste0(output_dir,sep,patient_id,"_CLONET_CONFIG")

    if (!dir.exists(out_file_dir)){
        dir.create(out_file_dir,recursive=TRUE)
    }

    # The entire CLONET config is assembled as one R-source string; the
    # "#"-style lines inside it are comments in the GENERATED file, not here.
    # NOTE(review): output_DIR points at output_dir/OUTPUT, not at
    # out_file_dir -- confirm this is the intended location.
    config_data=paste("
    ## Output directory
    output_DIR <-", paste0("'",output_dir,"/OUTPUT/'"),"

    ## Path to CLONET functions
    path_to_CLONET_functions <-", paste0("'",clonet_dir,"/'"),"

    # information about sample names
    sampleInfoFile <-",paste0("'",sample_info,"'"),"

    # list of segment
    segmentListFile <-", paste0("'",segment_data,"'"),"

    # folder with informative SNPs
    pileup_dir <- ",paste0("'",snp_dir,"/'"),"

    # Suffix of the informative SNPs pileup
    PaPI_Suffix <- '.snps'

    ## Path to error table
    errorTable_file =",paste0("'",clonet_dir,'/Examples/BasicData/errorTable.csv',"'"),"

    ## Method to compete beta
    # 'GB' : classic model defined in http://www.genomebiology.com/2014/15/8/439 (only for backward compatibility)
    # 'STM' : new method based on error estimation on matched normal published in http://stm.sciencemag.org/content/6/254/254ra125.short (RECOMMENDED)
    betaCompute.method <-",paste0("'",beta_method,"'"),"

    ## Method to compete Adm.global
    # '1D' : classic model that use only deletions (only for backward compatibility)
    # '2D' : model based on cnA vs cnB transformation (RECOMMENDED)
    adm.global.method <- ",paste0("'",adm_method,"'"),"

    # minimum tumor coverage to consider an informative SNP as valid
    minCoverage <-",min_snp_cov,"

    # number of samples to process in parallel
    NsamplesToProcessInParallel <-", jobs,"

    # number of cores assigned to the analysis of each process
    perSampleCores <-", threads,"

    # min number of informative SNPs for a genomic segment
    min_nsnps <-",min_nsnps,"

    # min mean coverage of a genomic segment
    min_cov <-",min_seg_cov,"

    # minimum value of beta above which the two alleles are present in the same number (cn equal to 1+1 or 2+2 or 3+3 ...) to account ref map bias
    equalCN.betaThr <-",equal_betaThr,"

    # Homozygous deletions threshold (change only if you know what you are doing)
    maxHomoDels <-",max_homo_dels,"

    ## Parameters of a valid deletion used to compute Adm.global
    # log2 thresholds
    deletionsLog2Levels <-",deparse(del_log_thr),"

    ## Percentage of used deletions to compute Adm.global varibility interval (change only if you know what you are doing)
    alphaPar <-",alpha_par,"

    ## Threshold on clonality value (change only if you know what you are doing)
    clonalityThreshold <-",clonal_thr,"

    ## Threshold on beta value (change only if you know what you are doing)
    betaThreshold <-",beta_thr,"

    ## Stages to perform
    # 1. analyse single sample and produe RData
    # 2. create beta table aggragating single samples analysis
    # 3. compute ploidy shift
    # 4. compute global admixture
    # 5. compute clonality table
    # 6. compute allele specific copy number table
    stages <-",deparse(stages),"

    ## CLONET can try to compute reference mapping bias and to adjust beta estimation
    computeRefMapBias <-",comp_ref_map_bias,"

    ## number of significant digits for the beta table (NA for reporting all digits)
    n.digits <-",beta_decimals,"

    ## maximum distance from (cnA, cnB) integer copy number value
    ## Dafault 0.5 corresponds to round cnA and cnB
    AllelicImbalanceTh <-",ale_imb_thr)
    # Write the assembled config as <patient_id>.Config.R for CLONET to source.
    cat(config_data,file=paste0(out_file_dir,"/",patient_id,".Config.R"))
  }

#' This function generates a sample_info file for CLONET pipeline
#'
#' This function takes the path to the directory with processed SNPS generated with format_SNP_data function
#' and generates a tab separated file with samples info ready to use for CLONET pipeline.
#'
#' @param patient_id Patient ID. Default Patient
#' @param snp_dir Path to formated SNP data dir.
#' @param output_dir Path to output directory.
#' @export

generate_CLONET_sample_info=function(snp_dir="",patient_id="",output_dir=""){
    sep=if(output_dir=="") "" else "/"

    out_file_dir=paste0(output_dir,sep,patient_id,"_CLONET_SAMPLE_INFO")
    if (!dir.exists(out_file_dir)){
        dir.create(out_file_dir,recursive=TRUE)
    }

    out_file=paste0(patient_id,".sample.info.txt")

    # Pileups produced by format_SNP_data live under TumorPileup/ and
    # NormalPileup/; derive the sample names from each file name. When a
    # single normal serves several tumors, data.frame() recycles it.
    pileups=list.files(snp_dir,full.names=TRUE,recursive=TRUE)
    tumor_names=unlist(lapply(pileups[grepl("TumorPileup",pileups)],FUN=ULPwgs::get_sample_name))
    normal_names=unlist(lapply(pileups[grepl("NormalPileup",pileups)],FUN=ULPwgs::get_sample_name))

    sample_info=data.frame(Tumor.Array.Name="",Tumor.Bam.Name=tumor_names,Normal.Array.Name="",Normal.Bam.Name=normal_names)
    write.table(sample_info,file=paste0(out_file_dir,"/",out_file),quote=FALSE,row.names=FALSE,col.names=TRUE,sep="\t")
}

#' This function generates a plot of ploidy and celularity levels from CLONET data
#'
#' This function takes the path to the directory with CLONET output, as well as a tab separated file with
#' sample info and generates a plot of celularity and ploidy levels in the samples.
#' Plasma samples are plotted over time; tissue samples (if any) are plotted
#' by anatomical site ordered by estimated cellularity.
#'
#' @param clonet_dir Path to clonet output directory
#' @param sample_data Path to file with sample info
#' @param output_dir Path to output directory.
#' @export
#' @import patchwork
#' @import tidyverse
#' @import ggplot2

plot_celullarity=function(clonet_dir="",sample_data="",output_dir=""){

    sep="/"
    if(output_dir==""){
      sep=""
    }

    if (!dir.exists(output_dir)){
        dir.create(output_dir,recursive=TRUE)
    }

    # CLONET outputs: global admixture (adm = non-tumor fraction, so
    # cellularity is 1 - adm) and ploidy per sample.
    admixture=read.table(paste0(clonet_dir,"/globalAdmTable.txt"),header=TRUE,stringsAsFactors=FALSE)
    ploidy=read.table(paste0(clonet_dir,"/ploidyTable.txt"),header=TRUE,stringsAsFactors=FALSE)
    sample_info=read.table(sample_data,header=TRUE,stringsAsFactors=FALSE)
    full_data=dplyr::left_join(admixture,ploidy,by="sample")
    # Fuzzy join: keep rows where the corrected sample name from sample_info
    # occurs as a substring of the CLONET sample id.
    full_data=fuzzyjoin::fuzzy_inner_join(full_data,sample_info, by = c("sample" = "Sample_name_corrected"), match_fun = stringr::str_detect)

    # Plasma samples: x-axis is the (day-month-year) timepoint parsed to Date.
    plasma=full_data %>% dplyr::filter(Origin=="Plasma") %>% dplyr::mutate(Timepoint_ID=as.Date(lubridate::dmy(Timepoint_ID)))

    # Ploidy over time, with a dotted reference line at the diploid level (2).
    p=ggplot(plasma,aes(x=Timepoint_ID,y=ploidy))+geom_hline(aes(yintercept=2),linetype="dotted",alpha=0.5)+geom_bar(stat="identity",col="black",fill="red",alpha=0.5)+geom_point()+
    geom_line(aes(group=1),col="red")+theme_classic()+theme(axis.text.x = element_text(angle = 90),legend.position="bottom")+labs(x="Samples",y="Ploidy")

    # Cellularity (1 - adm) over time with its adm.min/adm.max ribbon.
    p2=ggplot(plasma,aes(x=Timepoint_ID,y=1-adm))+geom_hline(aes(yintercept=0.5),linetype="dotted",alpha=0.5)+geom_bar(stat="identity",col="black",fill="blue",alpha=0.5)+geom_point()+
    geom_ribbon(data=plasma[!is.na(plasma$adm),],aes(ymin=1-adm.min, ymax=1-adm.max,group=1), fill="blue", alpha=0.2)+geom_line(data=plasma[!is.na(plasma$adm),],aes(group=1),col="blue")+theme_classic()+theme(axis.text.x = element_blank(),axis.title.x = element_blank(),axis.ticks.x = element_blank(),legend.position="bottom")+labs(x="Samples",y="Celularity")+ylim(0,1)

    # Stack cellularity over ploidy (patchwork) and save the plasma figure.
    d=(p2/p)+plot_annotation(title = paste0(unique(full_data$Patient_ID)," Plasma"))
    ggsave(paste0(output_dir,sep,unique(full_data$Patient_ID),"_Celularity_Plasma.png"),d)

    # Tissue samples get the same pair of panels, ordered by cellularity.
    tissue=full_data %>% dplyr::filter(Origin!="Plasma")

    if(dim(tissue)[1]>0){
      # Deduplicate repeated anatomical sites (site, site_1, ...) and order
      # the x-axis by cellularity (NA cellularity sorts first as 0).
      tissue$Anatomy=make.unique(tissue$Anatomy,sep="_")
      tissue$order=1-tissue$adm
      tissue$order=ifelse(is.na(tissue$order),0,tissue$order)
      p=ggplot(tissue,aes(x=reorder(Anatomy,order),y=ploidy))+geom_hline(aes(yintercept=2),linetype="dotted",alpha=0.5)+geom_bar(stat="identity",col="black",fill="red",alpha=0.5)+geom_point()+
      geom_line(aes(group=1),col="red")+theme_classic()+theme(axis.text.x = element_text(angle = 90),legend.position="bottom")+labs(x="Samples",y="Ploidy")

      p2=ggplot(tissue,aes(x=reorder(Anatomy,order),y=1-adm))+geom_hline(aes(yintercept=0.5),linetype="dotted",alpha=0.5)+geom_bar(stat="identity",col="black",fill="blue",alpha=0.5)+geom_point()+
      geom_ribbon(data=tissue[!is.na(tissue$adm),],aes(ymin=1-adm.min, ymax=1-adm.max,group=1), fill="blue", alpha=0.2)+geom_line(data=tissue[!is.na(tissue$adm),],aes(group=1),col="blue")+theme_classic()+theme(axis.text.x = element_blank(),axis.title.x = element_blank(),axis.ticks.x =
      element_blank(),legend.position="bottom")+labs(x="Samples",y="Celularity")+ylim(0,1)
      d=(p2/p)+plot_annotation(title = paste0(unique(full_data$Patient_ID)," Tissue"))
      ggsave(paste0(output_dir,sep,unique(full_data$Patient_ID),"_Celularity_Tissue.png"),d)
    }
}


#' This function generates a plot of allelic imbalance from CLONET output
#'
#' This function takes the path to the directory with CLONET output, as well as tab separated files with
#' sample and gene info, and generates per-sample allelic imbalance plots plus a per-gene heatmap of corrected log2 values
#'
#' @param clonet_dir Path to clonet output directory
#' @param sample_data Path to file with sample info
#' @param gene_data Path to file with gene info
#' @param output_dir Path to output directory.
#' @param jobs Number of jobs to run.
#' @param threads Number of threads.
#' @export
#' @import patchwork
#' @import tidyverse
#' @import ggplot2

plot_allelic_imbalance=function(clonet_dir="",sample_data="",output_dir="",gene_data="",jobs=1,threads=3){
    # Build per-sample allelic imbalance scatter plots and a per-gene heatmap
    # of purity/ploidy-corrected log2 ratios from CLONET segment output.
    # Only prepend a path separator when an output directory was supplied.
    sep="/"
    if(output_dir==""){
      sep=""
    }

    # Guard on output_dir!="" (consistent with the sibling plotting helpers) so
    # dir.create("") is never attempted when writing to the working directory.
    if (!dir.exists(output_dir) & output_dir!=""){
        dir.create(output_dir,recursive=TRUE)
    }

    out_file_dir=paste0(output_dir,sep,"ALLELIC_IMBALANCE")

    if (!dir.exists(out_file_dir)){
        dir.create(out_file_dir,recursive=TRUE)
    }

    # CLONET per-segment tables plus sample metadata and the gene annotation
    # used to intersect segments against genes of interest.
    ale_imb_table=read.table(paste0(clonet_dir,"/allelicImbalanceTable.txt"),header=TRUE,stringsAsFactors=FALSE)
    clonality_table=read.table(paste0(clonet_dir,"/clonalityTable.txt"),header=TRUE,stringsAsFactors=FALSE)
    sample_info=read.table(sample_data,header=TRUE,stringsAsFactors=FALSE)
    gen_info=read.table(gene_data,header=TRUE)
    # Joins by all columns the two CLONET tables share (sample + coordinates).
    ale_imb_table=dplyr::left_join(ale_imb_table,clonality_table)

    # Genes fully contained inside a single segment.
    ale_imb_table_complete=fuzzyjoin::fuzzy_inner_join(ale_imb_table, gen_info,
                    by=c("chr"="chr","start"="start","end"="end"),
                   match_fun=list(`==`, `<=`, `>=`)) %>% dplyr::mutate(Overlap="Complete")

    # Genes overlapping more than one segment of the same sample (partial).
    ale_imb_table_partially=fuzzyjoin::fuzzy_inner_join(ale_imb_table, gen_info,
                    by=c("chr"="chr","start"="end","end"="start"),
                    match_fun=list(`==`, `<=`, `>=`)) %>% dplyr::group_by(pcf_gene_symbol,sample)%>% dplyr::filter(dplyr::n()>1) %>%
                    dplyr::ungroup() %>% dplyr::mutate(Overlap="Partial")

    ale_imb_table=rbind(ale_imb_table_complete,ale_imb_table_partially)

    # Classify each segment from its integer allele-specific copy numbers
    # (cnA.int = major allele, cnB.int = minor allele).
    ale_imb_table=ale_imb_table %>% dplyr::mutate(Type=ifelse(cnA.int==1 & cnB.int==1,"WT",
      ifelse(cnA.int==1 & cnB.int==0,"Hem.Del",
      ifelse(cnA.int==2 & cnB.int==1,"Gain",
      ifelse(cnA.int==2 & cnB.int==2,"Amp",
      ifelse(cnA.int==2 & cnB.int==0,"CN‐LOH",
      ifelse(cnA.int==0 & cnB.int==0,"Hom.Del",
      ifelse(cnA.int==3 & cnB.int==0,"Gain‐LOH",
      ifelse(cnA.int>=4 & cnB.int==0,"Amp-LOH",
      ifelse(cnA.int>=3 & cnB.int==1,"Unbalanced Gain",
      ifelse((is.na(cnA.int) & is.na(cnB.int)),"NA","Unbalanced Amp")))))))))))
    # Matching plot colour per copy number class, consumed by
    # scale_color_identity() in the plots below.
    ale_imb_table=ale_imb_table %>% dplyr::mutate(col=ifelse(cnA.int==1 & cnB.int==1,"grey",
      ifelse(cnA.int==1 & cnB.int==0,"lightblue",
      ifelse(cnA.int==2 & cnB.int==1,"red",
      ifelse(cnA.int==2 & cnB.int==2,"firebrick",
      ifelse(cnA.int==2 & cnB.int==0,"yellow",
      ifelse(cnA.int==0 & cnB.int==0,"blue",
      ifelse(cnA.int==3 & cnB.int==0,"orange",
      ifelse(cnA.int>=4 & cnB.int==0,"darkorange",
      ifelse(cnA.int>=3 & cnB.int==1,"brown4",
      ifelse((is.na(cnA.int) & is.na(cnB.int)),"black","deeppink4")))))))))))

    # Attach sample metadata; segment sample names embed the corrected name.
    full_data=fuzzyjoin::fuzzy_inner_join(ale_imb_table,sample_info, by = c("sample" = "Sample_name_corrected"), match_fun = stringr::str_detect)
    full_data$Allelic_Imbalance=forcats::fct_rev(as.factor(ifelse(full_data$AllelicImbalance<=0.2,"E[AI] ≤ 0.2     ","E[AI] > 0.2     ")))
    # Gene label: [C:...] for control genes, [T:...] for target genes,
    # annotated with the chromosome arm taken from the cytoband.
    full_data$Symbol=ifelse(grepl("CONTROL",full_data$pcf_gene_class),paste0(full_data$pcf_gene_symbol,"[C:","'",full_data$chr.y,substring(full_data$band,1,1),"'","]"),paste0(full_data$pcf_gene_symbol,"[T:","'",full_data$chr.y,substring(full_data$band,1,1),"'","]"))
    # Plasma samples are identified by timepoint date, tissue ones by anatomy.
    full_data$ID=ifelse(full_data$Origin=="Plasma",as.character(lubridate::dmy(full_data$Timepoint_ID)),full_data$Anatomy)
    full_data$AI=ifelse(full_data$chr.x=="X",round(2**full_data$log2.corr),ifelse(full_data$AllelicImbalance<=0.2|is.na(full_data$AllelicImbalance),"-",paste0(full_data$cnA.int,"/",full_data$cnB.int)))
    write.table(file=paste0(out_file_dir,"/",unique(full_data$Patient_ID),".CLONET.txt"),x=full_data,quote=FALSE,row.names=FALSE,col.names=TRUE,sep="\t")

    # Shared axis limits so every per-sample panel is directly comparable.
    min.log2=min(full_data$log2,na.rm=TRUE)
    max.log2=max(full_data$log2,na.rm=TRUE)
    min.beta=0
    max.beta=1
    max.cnA=max(full_data$cnA.int,na.rm=TRUE)

  # One composite figure per sample, parallelised over origins (outer, jobs
  # per inner lapply) and sample IDs (inner).
  parallel::mclapply(unique(full_data$Origin),FUN=function(y){
    sub_origin=full_data %>% dplyr::filter(Origin==y)
    parallel::mclapply(unique(sub_origin$ID),FUN=function(x){
      sub_ID=sub_origin %>% dplyr::filter(ID==x)
      # log2 histogram/density (top) stacked over beta-vs-log2 scatter.
      p01=ggplot(sub_ID,aes(x=log2))+geom_histogram(aes(y=..density..),binwidth=0.1,alpha=0.9,col="black")+geom_density(aes(y=..density..))+scale_fill_identity()+theme_classic()+xlim(min.log2-0.1,max.log2+0.1)
      pa1=ggplot(sub_ID)+geom_point(aes(y=beta,x=log2,col=col))+theme_classic()+geom_vline(aes(xintercept=0),linetype="dashed")+geom_vline(aes(xintercept=1),linetype="dashed")+geom_vline(aes(xintercept=-1),linetype="dashed")+geom_vline(aes(xintercept=0.6),linetype="dashed")+scale_color_identity()+xlim(min.log2-0.1,max.log2+0.1)+ylim(min.beta-0.1,max.beta+0.1)
      p1a=p01/pa1+plot_layout(height=c(2,8))
      # Annotated scatter labelling genes with E[AI] above the 0.2 threshold.
      p3=ggplot(sub_ID)+geom_vline(aes(xintercept=0),linetype="dashed")+geom_vline(aes(xintercept=1),linetype="dashed")+geom_vline(aes(xintercept=-1),linetype="dashed")+geom_vline(aes(xintercept=0.6),linetype="dashed")+ggrepel::geom_label_repel(data=sub_ID %>% dplyr::filter(AllelicImbalance>0.2,!is.na(pcf_gene_class)),aes(y=beta,x=log2,col=col,label=Symbol),force=20,max.overlaps=1000,min.segment.length = 0,parse=TRUE)+geom_point(data=sub_ID %>% dplyr::filter(!is.na(pcf_gene_symbol),!is.na(Allelic_Imbalance)),aes(shape=Allelic_Imbalance,y=beta,x=log2,col=col))+theme_classic()+scale_color_identity()+xlim(min.log2-0.1,max.log2+0.1)+ylim(min.beta-0.1,max.beta+0.1)+theme(legend.position=c(0.88,0.075),legend.background = element_rect(
                                        size=0.2, linetype="solid",colour="black"),
              legend.key.width=unit(1.1,"cm"))
      # Dummy plot built only to harvest a colour legend shared by all panels.
      dummy=ggplot(sub_origin[!is.na(sub_origin$cnA),])+geom_hline(aes(yintercept=0),linetype="dashed")+geom_hline(aes(yintercept=1),linetype="dashed")+geom_hline(aes(yintercept=2),linetype="dashed")+geom_hline(aes(yintercept=3),linetype="dashed") +geom_vline(aes(xintercept=0),linetype="dashed")+geom_vline(aes(xintercept=1),linetype="dashed")+geom_vline(aes(xintercept=2),linetype="dashed")+geom_vline(aes(xintercept=3),linetype="dashed")+ geom_point(aes(x=cnA,y=cnB,col=col))+theme_classic()+geom_abline(intercept = 0, slope = 1)+xlim(0,3) +ylim(0,3)+ scale_color_identity(name = "Copy Number",
                                breaks = sub_origin$col,
                                labels = sub_origin$Type,
                                guide = "legend")
      leg=cowplot::get_legend(dummy)
      p1=ggplot(sub_ID %>% dplyr::filter(!is.na(cnA)),aes(x=cnA,y=cnB,col=col))+ geom_point()+scale_color_identity()+geom_abline(intercept = 0, slope = 1)+geom_hline(yintercept=c(1:(max.cnA)),linetype="dashed")+geom_vline(xintercept=c(1:(max.cnA)),linetype="dashed")+theme_classic()+xlim(0,max.cnA) +ylim(0,max.cnA)

      leg=ggplotify::as.ggplot(leg)
      p1a=p01/pa1
      p1b=leg/p1
      pa=(p1a|p1b)
      pa=(pa+p3)+plot_layout(width=c(3,3,6))+plot_annotation(title = x)
      ggsave(paste0(out_file_dir,"/",unique(sub_ID$Patient_ID),".",x,".CLONET_per_gene_",y,".png"),pa,width=20,height=10)
      write.table(file=paste0(out_file_dir,"/",unique(sub_ID$Patient_ID),".",x,".Allelic_Imbalance_CLONET_",y,".txt"),x=sub_ID,quote=FALSE,row.names=FALSE,col.names=TRUE,sep="\t")
    },mc.cores=jobs)
  },mc.cores=threads)

    # Summaries for the per-gene heatmap: purity/ploidy per sample, mean
    # corrected log2 per gene, and the stacked AI annotation strings.
    tc_and_ploidy_per_sample=full_data %>% dplyr::distinct(ID,adm,adm.max,adm.min,ploidy)
    log2_corr_per_gene=full_data %>% dplyr::group_by(Symbol,ID) %>% dplyr::summarise(meanLog2corr=mean(log2.corr))
    AI_per_gene=full_data %>% dplyr::group_by(Symbol,ID) %>% dplyr::summarise(AI=paste0(AI,collapse=":"))
    log2_corr_per_gene_wider=log2_corr_per_gene %>% tidyr::pivot_wider(id_cols="ID",names_from="Symbol",values_from="meanLog2corr")
    AI_per_gene_wider=AI_per_gene %>% tidyr::pivot_wider(id_cols="ID",names_from="Symbol",values_from="AI")
    write.table(file=paste0(out_file_dir,"/",unique(full_data$Patient_ID),".Allelic_Imbalance_CLONET_log2_corr_matrix.txt"),x=log2_corr_per_gene_wider,quote=FALSE,row.names=FALSE,col.names=TRUE,sep="\t")
    write.table(file=paste0(out_file_dir,"/",unique(full_data$Patient_ID),".Allelic_Imbalance_CLONET_AI_matrix.txt"),x=AI_per_gene_wider,quote=FALSE,row.names=FALSE,col.names=TRUE,sep="\t")
    c_names=as.character(unique(log2_corr_per_gene$Symbol))
    log2_corr_per_gene_wider=as.data.frame(log2_corr_per_gene_wider)
    AI_per_gene_wider=as.data.frame(AI_per_gene_wider)
    # Keep only samples with at least one non-NA gene-level value.
    r_names=log2_corr_per_gene_wider[rowSums(is.na(log2_corr_per_gene_wider))<(ncol(log2_corr_per_gene_wider)-1),1]
    log2_corr_mtx=log2_corr_per_gene_wider[rowSums(is.na(log2_corr_per_gene_wider))<(ncol(log2_corr_per_gene_wider)-1),-1]
    rownames(AI_per_gene_wider)=AI_per_gene_wider[,1]
    AI_per_gene_wider=AI_per_gene_wider[r_names,c_names]
    # Newlines instead of ":" so stacked AI values fit inside heatmap cells.
    AI_per_gene_wider=data.frame(lapply(AI_per_gene_wider, function(x) {gsub(":", "\n", x)}))
    print(AI_per_gene_wider)
    tc_and_ploidy_per_sample=tc_and_ploidy_per_sample %>% tidyr::drop_na()
    # Coerce to data.frame first: tibbles do not keep row names, which the
    # character indexing by r_names below depends on.
    tc_and_ploidy_per_sample=as.data.frame(tc_and_ploidy_per_sample)
    rownames(tc_and_ploidy_per_sample)=tc_and_ploidy_per_sample$ID
    tc_and_ploidy_per_sample=tc_and_ploidy_per_sample[r_names,]
    col_fun = circlize::colorRamp2(c(0, 1), c("white", "red"))
    col_fun2 = circlize::colorRamp2(c(0,max(tc_and_ploidy_per_sample$ploidy)), c("blue", "brown"))
    rows_ha = ComplexHeatmap::rowAnnotation(ploidy = ComplexHeatmap::anno_barplot(tc_and_ploidy_per_sample$ploidy,baseline=2,col=col_fun2), tmf =ComplexHeatmap::anno_simple(1-tc_and_ploidy_per_sample$adm,col = col_fun))
    png(paste0(out_file_dir,"/",unique(full_data$Patient_ID),".log2_corrected_CLONET.png"),width=20,height=8,res=1200,units="in",type="cairo-png")
    # Rows split into Plasma (date IDs, no letters) vs Tissue (anatomy names).
    # [A-Za-z] replaces the original [aA-zZ], whose A-z range also matched
    # punctuation.  NOTE(review): assumes plasma rows precede tissue rows in
    # r_names -- confirm the ordering upstream.
    ComplexHeatmap::draw(ComplexHeatmap::Heatmap(as.matrix(log2_corr_mtx),na_col="black",column_labels=c_names,
    row_labels=r_names,column_names_gp=grid::gpar(fontsize=7),cluster_rows=FALSE,cluster_columns=TRUE,
    row_split=c(rep("Plasma",sum(!grepl("[A-Za-z]",r_names))),rep("Tissue",sum(grepl("[A-Za-z]",r_names)))),name="log2.cor",right_annotation = rows_ha, cell_fun = function(j, i, x, y, width, height, fill) {
        grid::grid.text(sprintf("%s",AI_per_gene_wider[i, j]), x, y, gp = grid::gpar(fontsize = 6))
      },width=grid::unit(15,"in")))
    dev.off()
}

#' This function generates a plot of somatic copy number call counts from CLONET data
#'
#' This function takes a tab separated file with copy number calls per segment, as well as a tab separated file with
#' sample info, and generates bar plots of gain/loss/neutral call counts per sample
#'
#' @param cn_call_data Path to segment data with cn calls
#' @param sample_data Path to file with sample info
#' @param output_dir Path to output directory.
#' @export
#' @import patchwork
#' @import tidyverse
#' @import ggplot2

plot_cn_calls <- function(cn_call_data = "", sample_data = "", output_dir = "") {

    # Only insert a "/" when an output directory was supplied.
    path_sep <- if (output_dir == "") "" else "/"

    if (output_dir != "" && !dir.exists(output_dir)) {
        dir.create(output_dir, recursive = TRUE)
    }

    cn_calls <- read.table(cn_call_data, header = TRUE, stringsAsFactors = FALSE)
    samples <- read.table(sample_data, header = TRUE, stringsAsFactors = FALSE)

    # Match each CN call to its sample metadata: the call's sample name
    # contains the corrected sample name as a substring.
    joined <- fuzzyjoin::fuzzy_inner_join(
        cn_calls, samples,
        by = c("sample" = "Sample_name_corrected"),
        match_fun = stringr::str_detect
    )

    # Chromosome X calls are shifted up by one before classification.
    joined$cn <- ifelse(joined$chr == "X", joined$cn + 1, joined$cn)
    # CN: direction of the call; CNs: altered vs neutral.
    joined <- joined %>%
        dplyr::mutate(CN = ifelse(cn > 2, "GAIN", ifelse(cn < 2, "LOSS", "NEUTRAL"))) %>%
        dplyr::mutate(CNs = ifelse(CN == "GAIN" | CN == "LOSS", "CNA", "NEUTRAL"))
    # Plasma samples are labelled by timepoint date, tissue ones by anatomy.
    joined$ID <- ifelse(joined$Origin == "Plasma",
        as.character(lubridate::dmy(joined$Timepoint_ID)),
        joined$Anatomy)

    # Per-timepoint CNA counts for plasma samples.
    plasma_data <- joined %>% dplyr::filter(Origin == "Plasma")
    plasma_counts <- plasma_data %>%
        dplyr::group_by(sample, CN, ID, CNs) %>%
        dplyr::summarise(Count = dplyr::n())
    plasma_plot <- ggplot(plasma_counts) +
        geom_bar(stat = "identity", aes(x = CNs, y = Count, fill = CN), col = "black") +
        facet_grid(. ~ ID) +
        theme_classic() +
        theme(axis.text.x = element_text(angle = 90)) +
        plot_annotation(title = paste0(unique(plasma_data$Patient_ID), " Plasma"),
            subtitle = "Somatic copy number uncorrected") +
        theme(strip.text.x = element_text(size = 6))
    ggsave(paste0(output_dir, path_sep, unique(plasma_data$Patient_ID), "_SCNA_count_Plasma.png"),
        plasma_plot, height = 10, width = 20)


    # Tissue samples, ordered by their total number of altered segments.
    tissue_data <- joined %>% dplyr::filter(Origin != "Plasma")

    if (nrow(tissue_data) > 0) {
        tissue_counts <- tissue_data %>%
            dplyr::group_by(sample, CN, ID, CNs) %>%
            dplyr::summarise(Count = dplyr::n()) %>%
            dplyr::group_by(ID) %>%
            dplyr::mutate(TotalCN = sum(Count[CN != "NEUTRAL"]))
        tissue_plot <- ggplot(tissue_counts) +
            geom_bar(stat = "identity", aes(x = CNs, y = Count, fill = CN), col = "black") +
            facet_grid(~reorder(ID, TotalCN)) +
            theme_classic() +
            theme(axis.text.x = element_text(angle = 90)) +
            plot_annotation(title = paste0(unique(tissue_data$Patient_ID), " Tissue"),
                subtitle = "Somatic copy number uncorrected") +
            theme(strip.text.x = element_text(size = 6))
        ggsave(paste0(output_dir, path_sep, unique(tissue_data$Patient_ID), "_SCNA_count_Tissue.png"),
            tissue_plot, height = 10, width = 20)
    }
}


#' This function generates a plot of the relationships between samples based on Hamming distance,
#' computed as the number of genomic bins whose copy number state differs between samples

#' This function takes a tab separated file with copy number calls, a file with sample info and a set of
#' reference genome bins, and generates neighbour-joining trees relating the samples
#'
#' @param cn_call_data Path to segment data with cn calls
#' @param sample_data Path to file with sample info
#' @param ref_bins  Path to reference genome binned into 100kb/500kb bin
#' @param output_dir Path to output directory.
#' @param threads Number of threads to use.
#' @export
#' @import patchwork
#' @import tidyverse
#' @import ggplot2

plot_evolutionary_distance=function(cn_call_data="",sample_data="",ref_bins="",output_dir="",threads=3){

    # Only prepend a path separator when an output directory was supplied.
    sep="/"
    if(output_dir==""){
      sep=""
    }

    if (!dir.exists(output_dir)& output_dir!=""){
        dir.create(output_dir,recursive=TRUE)
    }
    # Copy number calls per segment, sample metadata and the reference genome
    # bins (V1=chr, V2=start, V3=end, V4=bin id) used as the common grid.
    cn_info=read.table(cn_call_data,header=TRUE,stringsAsFactors=FALSE)
    sample_info=read.table(sample_data,header=TRUE,stringsAsFactors=FALSE)
    segments=read.table(ref_bins,stringsAsFactors=FALSE)
    full_data=fuzzyjoin::fuzzy_inner_join(cn_info,sample_info, by = c("sample" = "Sample_name_corrected"), match_fun = stringr::str_detect)
    # Chromosome X calls are shifted up by one before classification.
    full_data$cn=ifelse(full_data$chr=="X",  full_data$cn+1,  full_data$cn)
    full_data=full_data %>% dplyr::mutate(CN=ifelse(cn>2,"GAIN",ifelse(cn<2,"LOSS","NEUTRAL")))
    full_data=full_data %>% dplyr::mutate(CNs=ifelse(CN=="GAIN"|CN=="LOSS","CNA","NEUTRAL"))
    # Plasma samples are identified by timepoint date, tissue ones by anatomy.
    full_data$ID=ifelse(full_data$Origin=="Plasma",as.character(lubridate::dmy(full_data$Timepoint_ID)),full_data$Anatomy)
    # Default every bin to diploid, then project each sample's calls onto the
    # common bin grid so all samples share one comparable CN vector.
    segments$cn=2
    segments$region="-"
    solution=parallel::mclapply(unique(full_data$ID),FUN=function(x){segments_tmp=segments;CN_sub=full_data %>% dplyr::filter(ID==x);
    for (y in seq_len(nrow(CN_sub))){segments_tmp[segments_tmp$V1==CN_sub[y,]$chr & segments_tmp$V2>=CN_sub[y,]$start & segments_tmp$V3<=CN_sub[y,]$end,"cn"]=CN_sub[y,]$cn;
    segments_tmp$sample=x;segments_tmp[segments_tmp$V1==CN_sub[y,]$chr & segments_tmp$V2<=CN_sub[y,]$end & segments_tmp$V3>=CN_sub[y,]$start,"region"]=paste0(CN_sub[y,]$chr,":",CN_sub[y,]$start,"-",CN_sub[y,]$end)};
    return(segments_tmp)},mc.cores=threads)
    solution=solution %>% dplyr::bind_rows()
    # Ternary encoding per bin: -1 loss, 0 neutral, 1 gain.
    solution$change=ifelse(solution$cn==2,0,ifelse(solution$cn>2,1,-1))
    solution_wider=tidyr::pivot_wider(solution,id_cols="V4",names_from="sample",values_from="change")
    solution_matrix=solution_wider[,-1]
    write.table(file=paste0(output_dir,sep,unique(full_data$Patient_ID),".CN.Matrix.txt"),x=solution_wider,quote=FALSE,row.names=FALSE,col.names=TRUE)

    # Neighbour-joining tree over all samples (Euclidean distance on the
    # ternary CN vectors).
    dist_matrix_all=dist(t(solution_matrix))
    NJ_data_all=phangorn::NJ(dist_matrix_all)
    NJ_tree_all=ape::ladderize(NJ_data_all)

    png(filename=paste0(output_dir,sep,unique(full_data$Patient_ID),".NJ_all.png"),units="in",width=12,height=10,res=1000)
      p=ape::plot.phylo(NJ_tree_all)
      p
    dev.off()


    # Plasma-only tree: plasma IDs are dates and contain no letters.
    # [A-Za-z] replaces the original [aA-zZ], whose A-z range also matched
    # punctuation such as "_".
    dist_matrix_plasma=dist(t(solution_matrix[,!grepl("[A-Za-z]",colnames(solution_matrix))]))
    NJ_data_plasma=phangorn::NJ(dist_matrix_plasma)
    NJ_tree_plasma=ape::ladderize(NJ_data_plasma)

    png(filename=paste0(output_dir,sep,unique(full_data$Patient_ID),".NJ_all_plasma.png"),units="in",width=12,height=10,res=1000)
      p=ape::plot.phylo(NJ_tree_plasma)
      p
    dev.off()

    # Tissue-only tree: anatomy IDs contain letters.
    dist_matrix_tissue=dist(t(solution_matrix[,grepl("[A-Za-z]",colnames(solution_matrix))]))
    NJ_data_tissue=phangorn::NJ(dist_matrix_tissue)
    NJ_tree_tissue=ape::ladderize(NJ_data_tissue)

    png(filename=paste0(output_dir,sep,unique(full_data$Patient_ID),".NJ_all_tissue.png"),units="in",width=12,height=10,res=1000)
      p=ape::plot.phylo(NJ_tree_tissue)
      p
    dev.off()

    # Transition points: bins where a sample's CN state differs from the
    # previous bin.  Use [[ ]] so each column is extracted as a vector
    # (dplyr::lag() does not accept the one-column tibble tp[,x] returns).
    tp=solution_wider[,-1]
    tp_pos=dplyr::bind_cols(parallel::mclapply(seq_len(ncol(tp)),FUN=function(x){
      tp[[x]]!=dplyr::lag(tp[[x]])
    },mc.cores=threads))
    # Coerce to data.frame: tibbles do not support row names.
    tp_pos=as.data.frame(tp_pos)
    rownames(tp_pos)=solution_wider[[1]]
    colnames(tp_pos)=colnames(solution_wider[,-1])
    # First bin has no predecessor (lag yields NA); treat it as no transition.
    tp_pos[1,]=FALSE
    tp_pos=tp_pos*1
    # Keep only transition points shared by more than one sample.
    # (Original referenced an undefined `dummy_tp` object here.)
    tp_all=tp_pos[rowSums(tp_pos)>1,]

    # (Original wrote to an undefined `out_file_dir`; use the requested output
    # directory instead.)
    png(paste0(output_dir,sep,unique(full_data$Patient_ID),".transition_points.png"),width=12,height=8,res=1200,units="in",type="cairo-png")
    ComplexHeatmap::draw(ComplexHeatmap::Heatmap(as.matrix(tp_all),row_names_gp=grid::gpar(fontsize=5),clustering_distance_rows=function(m) dist(m,method="manhattan"),clustering_distance_columns=function(m) dist(m,method="manhattan")))
    dev.off()
}



#' This function merges a group of files in MAF format

#' This function takes the path to the directory with MAF files and
#' concatenates them into a single file
#'
#' @param maf_dir Path to maf file dir
#' @param recursive Reads all files in the directory recursively. Default FALSE
#' @param pattern  Pattern to identify MAF files with. Default "maf"
#' @param generate_file Generate an output file. Default TRUE
#' @param output_name Output file name without extension. Default "merged"
#' @param output_dir Path to output directory.
#' @param threads Number of threads to use.
#' @export

merge_maf=function(maf_dir="",recursive=FALSE,pattern=".maf",threads=3,generate_file=TRUE,output_name="merged",output_dir=""){
  # full.names=TRUE returns paths relative to maf_dir so read.table works
  # regardless of the current working directory (the original returned bare
  # file names and failed unless the working directory was maf_dir itself).
  files=list.files(maf_dir,pattern=pattern,recursive=recursive,full.names=TRUE)
  # Read every MAF as character so heterogeneous column types bind cleanly.
  dat=parallel::mclapply(files,FUN=function(x){
    read.table(x,sep="\t",quote="\\",header=TRUE,colClasses="character")
  },mc.cores=threads)
  dat=dplyr::bind_rows(dat)
  if (generate_file){
    # Only prepend a path separator when an output directory was supplied.
    sep="/"
    if(output_dir==""){
      sep=""
    }

    if (!dir.exists(output_dir)& output_dir!=""){
        dir.create(output_dir,recursive=TRUE)
    }
    out_file=paste0(output_dir,sep,output_name,".maf")
    write.table(dat,file=out_file,sep="\t",quote=FALSE,col.names=TRUE,row.names=FALSE)
  }
  # Merged table is returned as well as (optionally) written to disk.
  return(dat)
}

#' This function processes a MAF file

#' This function takes the path to a MAF file and processes the variant information
#' contained within it
#'
#' @param maf Path to MAF file
#' @param sample_data Path to file with sample information
#' @param gen_data Path to file with genes to take into account. Optional; ignored when empty.
#' @export

process_maf=function(maf="",sample_data="",gen_data=""){
  # Flag variants whose consequence matches a protein-changing class
  # (missense/nonsense, frameshift, splice or start alterations) and whose
  # VEP IMPACT is not LOW.
  maf_data=read.table(maf,header=TRUE,stringsAsFactors=FALSE)
  maf_data$IMPORTANCE=ifelse(grepl("sense|Frame|Splice|Start",maf_data$Variant_Classification)&maf_data$IMPACT!="LOW","HIGH","LOW")
  # Gene annotation is optional; when given, mark variants hitting the
  # genes of interest and join their annotation columns.
  if (gen_data!=""){
    gen=read.table(gen_data,header=TRUE,stringsAsFactors=FALSE)
    maf_data$RELEVANCE=ifelse(maf_data$Hugo_Symbol %in% gen$hgnc_gene_symbol,"RELEVANT","NOT-RELEVANT")
    maf_data=dplyr::left_join(maf_data,gen,by=c("Hugo_Symbol"="hgnc_gene_symbol"))
  }
  sample_info=read.table(sample_data,header=TRUE,stringsAsFactors=FALSE)
  full_data=fuzzyjoin::fuzzy_inner_join(maf_data,sample_info, by = c("Tumor_Sample_Barcode" = "Sample_name_corrected"), match_fun = stringr::str_detect)
  # Plasma samples are identified by timepoint date, tissue ones by anatomy.
  full_data$ID=ifelse(full_data$Origin=="Plasma",as.character(lubridate::dmy(full_data$Timepoint_ID)),full_data$Anatomy)
  # Namespace select explicitly (the original bare `select` failed unless
  # dplyr was attached) and use any_of() so the columns that only exist when
  # gen_data was supplied (RELEVANCE, pcf_gene_class) may be absent.
  print(summary(full_data %>% dplyr::select(dplyr::any_of(c("IMPORTANCE","RELEVANCE","pcf_gene_class")))))
  # Return the annotated table so callers can reuse it.
  invisible(full_data)
}
TearsWillFall/variantCall documentation built on Oct. 15, 2021, 8:51 p.m.