#'
#' This function functions calls Mutect2 for variant calling.
#' If a vector of tumour samples are provided these will be processed in multi-sample mode.
#' To run in tumour-normal mode suppply a single tumour and normal sample.
#' If no normal is supplied this will run in tumour only.
#' TO DO// Implement mitochondrial mode feature
#'
#' For more information read:
#' https://cnvkit.readthedocs.io/en/stable/pipeline.html
#'
#'
#' @param sif_cnvkit [REQUIRED] Path to cnvkit sif file.
#' @param tumour [REQUIRED] Path to tumour BAM file.
#' @param normal [OPTIONAL] Path to normal BAM file.
#' @param ref_genome [REQUIRED] Path to reference genome fasta file.
#' @param rdata [OPTIONAL] Import R data information with list of BAM.
#' @param selected [OPTIONAL] Select BAM from list.
#' @param pon [OPTIONAL] Path to Panel of Normals. Default none
#' @param access [OPTIONAL] Path to reference genome accessibility. Default none
#' @param output_name [OPTIONAL] Name for the output. If not given the name of the first tumour sample of the samples will be used.
#' @param pon [OPTIONAL] Path to panel of normal.
#' @param output_dir [OPTIONAL] Path to the output directory.
#' @param seg_method [OPTIONAL] Method to use to generate segmentation calls.Default cbs. Options ["cbs","flasso","haar","none","hmm","hmm-tumor","hmm-germline"]
#' @param seq_method [OPTIONAL] Sequenced methods used. Default hybrid. Options ["hybrid","amplicon","wgs"]
#' @param trend_scatter [OPTIONAL] Show trendline. Default TRUE
#' @param range_scatter [OPTIONAL] Range to show in chr:start-end format. Default none
#' @param range_list_scatter [OPTIONAL] Path to BED file with range list. Default none
#' @param genes_scatter [OPTIONAL] List of genes to focus on. Default none.
#' @param margin_width_scatter [OPTIONAL] Default gene margin width. Default 1000000
#' @param plot_by_bin_scatter [OPTIONAL] Assume all bins are the same size. Default FALSE
#' @param trend_scatter [OPTIONAL] Show trendline. Default TRUE.
#' @param antitarget_symbol_scatter [OPTIONAL] Show antitarget probes with this symbol. Default "@"
#' @param segment_colour_scatter [OPTIONAL] Show antitarget probes with this symbol. Default "@"
#' @param y_max_scatter [OPTIONAL] Y max range. Default NULL.
#' @param y_min_scatter [OPTIONAL] Y min range. Default NULL.
#' @param cn_thr_diagram [OPTIONAL] Copy number threshold. Default 0.5
#' @param min_probes_diagram [OPTIONAL] Minimum number of probes to show a gene. Default 3
#' @param male_reference_diagram [OPTIONAL] Assume inputs were normalized to a male reference. Default FALSE
#' @param gender_diagram [OPTIONAL] Assume sample gender. Default male
#' @param shift_diagram [OPTIONAL] Adjust the X and Y chromosomes according to sample sex. Default TRUE
#' @param normal_ichor [OPTIONAL] Path to normal samples. Default c(0.5,0.6,0.7,0.8,0.9)
#' @param ploidy_ichor [OPTIONAL] Initial tumour ploidy; can be more than one value if additional ploidy initializations are desired. Default: 2
#' @param lambda_ichor [OPTIONAL] Initial Student's t precision; must contain 4 values (e.g. c(1500,1500,1500,1500)); if not provided then will automatically use based on variance of data
#' @param scStates_ichor [OPTIONAL] Subclonal states to consider. Default NULL
#' @param output_name_ichor [OPTIONAL] Name for the output. If not given the name of the first tumour sample of the samples will be used.
#' @param lambdaScaleHyperParam_ichor [OPTIONAL] Hyperparameter (scale) for Gamma prior on Student's-t precision. Default 3
#' @param maxCN_ichor [OPTIONAL] Total clonal states. Default 7.
#' @param estimateNormal_ichor [OPTIONAL] Estimate normal. Default TRUE.
#' @param estimateScPrevalence_ichor [OPTIONAL] Estimate subclonal prevalence. Default TRUE.
#' @param estimatePloidy_ichor [OPTIONAL] Estimate tumour ploidy. Default TRUE.
#' @param maxFracGenomeSubclone_ichor [OPTIONAL] Exclude solutions with subclonal genome fraction greater than this value. Default 0.5
#' @param maxFracCNASubclone_ichor [OPTIONAL] Exclude solutions with fraction of subclonal events greater than this value. Default 0.7
#' @param minSegmentBins_ichor [OPTIONAL] Minimum number of bins for largest segment threshold required to estimate tumor fraction; if below this threshold, then will be assigned zero tumor fraction
#' @param altFracThreshold_ichor [OPTIONAL] Minimum proportion of bins altered required to estimate tumor fraction; if below this threshold, then will be assigned zero tumor fraction. Default: [0.05]
#' @param includeHOMD_ichor [OPTIONAL] If FALSE, then exclude HOMD state. Useful when using large bins (e.g. 1Mb). Default FALSE.
#' @param txnE_ichor [OPTIONAL] Self-transition probability. Increase to decrease number of segments. Default: [0.9999999]
#' @param txnStrength_ichor [OPTIONAL] Transition pseudo-counts. Exponent should be the same as the number of decimal places of --txnE. Default: [1e+07]
#' @param plotFileType_ichor [OPTIONAL] File format for output plots. Default pdf
#' @param plotYLim_ichor [OPTIONAL] ylim to use for chromosome plots. Default: [c(-2,2)]
#' @param threads [OPTIONAL] Number of threads to split the work. Default 4
#' @param ram [OPTIONAL] RAM memory to asing to each thread. Default 4
#' @param verbose [OPTIONAL] Enables progress messages. Default False.
#' @param mode [REQUIRED] Where to parallelize. Default local. Options ["local","batch"]
#' @param executor_id Task EXECUTOR ID. Default "recalCovariates"
#' @param task_name Task name. Default "recalCovariates"
#' @param time [OPTIONAL] If batch mode. Max run time per job. Default "48:0:0"
#' @param update_time [OPTIONAL] If batch mode. Job update time in seconds. Default 60.
#' @param wait [OPTIONAL] If batch mode wait for batch to finish. Default FALSE
#' @param hold [OPTIONAL] HOld job until job is finished. Job ID.
#' @export
process_cnvkit=function(
target=build_default_reference_list()$HG19$panel$PCF_V3$binned$target,
antitarget=build_default_reference_list()$HG19$panel$PCF_V3$binned$antitarget,
ref_genome=build_default_reference_list()$HG19$reference$genome,
access=build_default_reference_list()$HG19$reference$access_5k,
sif_cnvkit=build_default_sif_list()$sif_cnvkit,
pon=build_default_reference_list()$HG19$panel$PCF_V3$variant$pon_cn_male,
bam=NULL,
patient_id=NULL,
seg_method="cbs",
seq_method="hybrid",
diagram=TRUE,
scatter=TRUE,
read_count=FALSE,
min_mapq=0,
gc=TRUE,
edge=TRUE,
rmask=TRUE,
smooth=FALSE,
drop_low_coverage=TRUE,
drop_outliers=10,
range_scatter=NULL,
range_list_scatter=NULL,
genes_scatter=NULL,
margin_width_scatter=1000000,
plot_by_bin_scatter=FALSE,
trend_scatter=FALSE,
antitarget_symbol_scatter="@",
segment_colour_scatter="red",
y_max_scatter=NULL,
y_min_scatter=NULL,
cn_thr_diagram=0.5,
min_probes_diagram=3,
male_reference_diagram=FALSE,
gender_diagram="male",
shift_diagram=TRUE,
...
){
run_main=function(
.env
){
.this.env=environment()
append_env(to=.this.env,from=.env)
out_file_dir=set_dir(
out_file_dir,
name=paste0(patient_id,"/cnvkit_reports/",input_id)
)
set_main(.env=.this.env)
.main$steps[[fn_id]]<-.this.env
.main.step=.main$steps[[fn_id]]
.main.step$steps <-append(
.main.step$steps,
call_coverage_cnvkit(
sif_cnvkit=sif_cnvkit,
ref_genome=ref_genome,
bam=input,
bed=target,
type="target",
output_dir=out_file_dir,
tmp_dir=tmp_dir,
env_dir=env_dir,
batch_dir=batch_dir,
err_msg=err_msg,
read_count=read_count,
min_mapq=min_mapq,
verbose=verbose,
threads=threads,
ram=ram,
executor_id=task_id
)
)
.this.step=.main.step$steps$call_coverage_cnvkit.target
.main.step$out_files$cnn=.this.step$out_files
if(!is.null(antitarget)){
.main.step$steps <-append(
.main.step$steps,
call_coverage_cnvkit(
sif_cnvkit=sif_cnvkit,
ref_genome=ref_genome,
bam=input,
bed=antitarget,
type="antitarget",
output_dir=out_file_dir,
tmp_dir=tmp_dir,
env_dir=env_dir,
batch_dir=batch_dir,
err_msg=err_msg,
read_count=read_count,
min_mapq=min_mapq,
verbose=verbose,
threads=threads,
ram=ram,
executor_id=task_id
)
)
.this.step=.main.step$steps$call_coverage_cnvkit.antitarget
.main.step$out_files$cnn=append(.main.step$out_files$cnn,.this.step$out_files)
}
.main.step$steps <-append(
.main.step$steps,
fix_cnvkit(
sif_cnvkit=sif_cnvkit,
pon=pon,
target=.main.step$out_files$cnn$target,
antitarget=.main.step$out_files$cnn$antitarget,
output_name=input_id,
output_dir=out_file_dir,
tmp_dir=tmp_dir,
env_dir=env_dir,
batch_dir=batch_dir,
verbose=verbose,
err_msg=err_msg,
gc=gc,
edge=edge,
rmask=rmask,
threads=threads,
ram=ram,
executor_id=task_id
)
)
.this.step=.main.step$steps$fix_cnvkit
.main.step$out_files=append(.main.step$out_files,.this.step$out_files)
.main.step$steps <-append(
.main.step$steps,
segment_cnvkit(
sif_cnvkit=sif_cnvkit,
cnr=.main.step$out_files$cnr,
seg_method=seg_method,
output_name=input_id,
output_dir=out_file_dir,
tmp_dir=tmp_dir,
env_dir=env_dir,
batch_dir=batch_dir,
err_msg=err_msg,
smooth=smooth,
drop_low_coverage=drop_low_coverage,
drop_outliers=drop_outliers,
verbose=verbose,
threads=threads,
ram=ram,
executor_id=task_id
)
)
.this.step=.main.step$steps$segment_cnvkit
.main.step$out_files=append(.main.step$out_files,.this.step$out_files)
if(scatter){
.main.step$steps <-append(
.main.step$steps,
scatter_cnvkit(
sif_cnvkit=sif_cnvkit,
cnr=.main.step$out_files$cnr,
cns=.main.step$out_files$cns,
output_name=input_id,
output_dir=out_file_dir,
tmp_dir=tmp_dir,
env_dir=env_dir,
batch_dir=batch_dir,
err_msg=err_msg,
range=range_scatter,
range_list=range_list_scatter,
genes=genes_scatter,
margin_width=margin_width_scatter,
plot_by_bin=plot_by_bin_scatter,
trend=trend_scatter,
antitarget_symbol=antitarget_symbol_scatter,
segment_colour=segment_colour_scatter,
title=input_id,
y_max=y_max_scatter,
y_min=y_max_scatter,
verbose=verbose,
threads=threads,ram=ram,
executor_id=task_id
)
)
.this.step=.main.step$steps$scatter_cnvkit
.main.step$out_files=append(.main.step$out_files,.this.step$out_files)
}
if(diagram){
.main.step$steps <-append(
.main.step$steps,
diagram_cnvkit(
sif_cnvkit=sif_cnvkit,
cnr=.main.step$out_files$cnr,
cns=.main.step$out_files$cns,
output_name=input_id,
output_dir=out_file_dir,
tmp_dir=tmp_dir,
env_dir=env_dir,
batch_dir=batch_dir,
err_msg=err_msg,
cn_thr=cn_thr_diagram,
min_probes=min_probes_diagram,
male_reference=male_reference_diagram,
gender=gender_diagram,
shift=shift_diagram,
title=input_id,
verbose=verbose,
threads=threads,
ram=ram,
executor_id=task_id
)
)
.this.step=.main.step$steps$diagram_cnvkit
.main.step$out_files=append(.main.step$out_files,.this.step$out_files)
}
.env$.main<-.main
}
.base.env=environment()
list2env(list(...),envir=.base.env)
set_env_vars(
.env= .base.env,
vars="bam"
)
launch(.env=.base.env)
}
#' Wrapper around access command in CNVkit
#'
#' Create a BED file with non-mappable regions to exclude from the genome.
#' Additional regions can be supplied in a BED format using the exclude_regions argument
#'
#' For more information read:
#' https://cnvkit.readthedocs.io/en/stable/pipeline.html
#'
#' @param sif_cnvkit [REQUIRED] Path to cnvkit sif file.
#' @param ref_genome [REQUIRED] Path to reference genome fasta file.
#' @param exclude_regions [REQUIRED] Additional regions to exclude in BED format. Default none.
#' @param output_name [OPTIONAL] Name for the output. If not given the name of the first tumour sample of the samples will be used.
#' @param output_dir [OPTIONAL] Path to the output directory.
#' @param threads [OPTIONAL] Number of threads to split the work. Default 4
#' @param ram [OPTIONAL] RAM memory to asing to each thread. Default 4
#' @param verbose [OPTIONAL] Enables progress messages. Default False.
#' @param mode [REQUIRED] Where to parallelize. Default local. Options ["local","batch"]
#' @param executor_id Task EXECUTOR ID. Default "recalCovariates"
#' @param task_name Task name. Default "recalCovariates"
#' @param time [OPTIONAL] If batch mode. Max run time per job. Default "48:0:0"
#' @param update_time [OPTIONAL] If batch mode. Job update time in seconds. Default 60.
#' @param wait [OPTIONAL] If batch mode wait for batch to finish. Default FALSE
#' @param hold [OPTIONAL] HOld job until job is finished. Job ID.
#' @export
access_cnvkit=function(
sif_cnvkit=build_default_sif_list()$sif_cnvkit,
ref_genome=build_default_reference_list()$HG19$reference$genome,
exclude_regions=NULL,
gap_size=5000,
...
){
run_main=function(
.env
){
.this.env=environment()
append_env(to=.this.env,from=.env)
set_main(.env=.this.env)
.main$out_files$access=paste0(out_file_dir,"/",input_id,".",gap_size,".access.bed")
if(exclude_regions!=""){
exclude_regions=paste0(" -x ",exclude_regions)
}
.main$exec_code=paste("singularity exec -H ",paste0(getwd(),":/home "),sif_cnvkit,
" cnvkit.py access -o ",.main$out_files$access,
exclude_regions," -s ",gap_size, ref_genome
)
run_job(.env=.this.env)
.env$.main<-.main
}
.base.env=environment()
list2env(list(...),envir=.base.env)
set_env_vars(
.env= .base.env,
vars="ref_genome"
)
launch(.env=.base.env)
}
#' Wrapper around target function from CNVkit
#'
#' This function wraps around target function for CNVkit
#' This function generates an target BED file from an input target file.
#' Additional parameters can be used to exclude regions and modify the average bin size
#'
#'
#' For more information read:
#' https://cnvkit.readthedocs.io/en/stable/pipeline.html
#'
#'
#' @param sif_cnvkit [REQUIRED] Path to cnvkit sif file.
#' @param ref_genome [REQUIRED] Path to reference genome.
#' @param bin_size_target [OPTIONAL] Size of bins for targets. Default 75
#' @param bin_size_antitarget [OPTIONAL] Size of bins for antitargets. Default 500000
#' @param min_bin_size_antitarget [OPTIONAL] Size of bins for antitargets. Default NULL
#' @param access [OPTIONAL] Path to reference genome accessibility. Default none
#' @param output_name [OPTIONAL] Name for the output. If not given the name of the first tumour sample of the samples will be used.
#' @param output_dir [OPTIONAL] Path to the output directory.
#' @param threads [OPTIONAL] Number of threads to split the work. Default 4
#' @param ram [OPTIONAL] RAM memory to asing to each thread. Default 4
#' @param verbose [OPTIONAL] Enables progress messages. Default False.
#' @param mode [REQUIRED] Where to parallelize. Default local. Options ["local","batch"]
#' @param executor_id Task EXECUTOR ID. Default "recalCovariates"
#' @param task_name Task name. Default "recalCovariates"
#' @param time [OPTIONAL] If batch mode. Max run time per job. Default "48:0:0"
#' @param update_time [OPTIONAL] If batch mode. Job update time in seconds. Default 60.
#' @param wait [OPTIONAL] If batch mode wait for batch to finish. Default FALSE
#' @param hold [OPTIONAL] HOld job until job is finished. Job ID.
#' @export
create_pon_cnvkit=function(
sif_cnvkit=build_default_sif_list()$sif_cnvkit,
ref_genome=build_default_reference_list()$HG19$reference$genome,
access=build_default_reference_list()$HG19$reference$access_5k,
normal=NULL,
target=NULL,
bin_size_target=75,
bin_size_antitarget=500000,
min_bin_size_antitarget=NULL,
...
){
run_main=function(
.env
){
.this.env=environment()
append_env(to=.this.env,from=.env)
set_main(.env=.this.env)
.main$out_files$pon=paste0(out_file_dir,"/",
input_id,
".target_",bin_size_target,
ifelse(!is.null(bin_size_antitarget),
".antitarget_",""),bin_size_antitarget,
".pon.cnn"
)
if(!is.null(target)){
target=paste0(" -t ",target)
}
if(!is.null(access)){
access=paste0(" -g ",access)
}
if(!is.null(bin_size_antitarget)){
bin_size_antitarget=paste0(
" --antitarget-min-size ",
bin_size_antitarget
)
}
if(!is.null(min_bin_size_antitarget)){
min_bin_size_antitarget=paste0(
" --antitarget-min-size ",
min_bin_size_antitarget
)
}
.main$exec_code=paste("singularity exec -H ",
paste0(getwd(),":/home "),sif_cnvkit,
" cnvkit.py batch -p ",threads,
ifelse(is.null(bin_size_antitarget)," -m wgs ",""),
" -n ",gsub(";"," ",input),
" --output-reference ",.main$out_files$pon,
" -f ", ref_genome,access,target,
" --target-avg-size ",bin_size_target,
bin_size_antitarget,
min_bin_size_antitarget
)
run_job(.env=.this.env)
.env$.main<-.main
}
.base.env=environment()
list2env(list(...),envir=.base.env)
set_env_vars(
.env= .base.env,
vars="normal"
)
launch(.env=.base.env)
}
#' Wrapper around reference function from CNVkit
#'
#' This function wraps around target function for CNVkit
#' This function generates an target BED file from an input target file.
#' Additional parameters can be used to exclude regions and modify the average bin size
#'
#'
#' For more information read:
#' https://cnvkit.readthedocs.io/en/stable/pipeline.html
#'
#' @param sif_cnvkit [REQUIRED] Path to cnvkit sif file.
#' @param ref_genoma [REQUIRED] Path to reference genoms.
#' @param cnn [REQUIRED] Path to normal coverage profiles. Default none
#' @param gender [OPTIONAL] Sample gender. Default male.
#' @param target [OPTIONAL] Path to BED file with target regions. Default none.
#' @param antitarget [OPTIONAL] Path to BED file with target regions. Default none.
#' @param cluster [OPTIONAL] Calculate and store summary stats for clustered subsets of the normal samples with similar coverage profiles. Default FALSE
#' @param min_cluster_size [OPTIONAL] Minimum size to keep in reference profiles.
#' @param gc [OPTIONAL] Disble GC bias correction. Default FALSE.
#' @param edge [OPTIONAL] Disble edge correction. Default FALSE.
#' @param rmask [OPTIONAL] Disble repeat mask correction. Default FALSE.
#' @param male_reference [OPTIONAL] Adjust X chromosome to log2 0. Default FALSE.
#' @param output_name [OPTIONAL] Name for the output. If not given the name of the first tumour sample of the samples will be used.
#' @param output_dir [OPTIONAL] Path to the output directory.
#' @param threads [OPTIONAL] Number of threads to split the work. Default 4
#' @param ram [OPTIONAL] RAM memory to asing to each thread. Default 4
#' @param verbose [OPTIONAL] Enables progress messages. Default False.
#' @param mode [REQUIRED] Where to parallelize. Default local. Options ["local","batch"]
#' @param executor_id Task EXECUTOR ID. Default "recalCovariates"
#' @param task_name Task name. Default "recalCovariates"
#' @param time [OPTIONAL] If batch mode. Max run time per job. Default "48:0:0"
#' @param update_time [OPTIONAL] If batch mode. Job update time in seconds. Default 60.
#' @param wait [OPTIONAL] If batch mode wait for batch to finish. Default FALSE
#' @param hold [OPTIONAL] HOld job until job is finished. Job ID.
#' @export
reference_cnvkit=function(
sif_cnvkit=build_default_sif_list()$sif_cnvkit,
ref_genome=build_default_reference_list()$HG19$reference$genome,
cnn=NULL,
gender="male",
gc=TRUE,
edge=TRUE,
rmask=TRUE,
verbose=FALSE,
cluster=FALSE,
min_cluster_size=10,
male_reference=FALSE,
threads=1,ram=1,
...
){
run_main=function(
.env
){
if(!is.null(gender)){
gender=paste0(" -x ",gender)
}
add=""
if(!gc){
add=paste(add," --no-gc ")
}
if(!edge){
add=paste(add," --no-edge ")
}
if(!rmask){
add=paste(add," --no-rmask ")
}
if(male_reference){
add=paste(add," -y ")
}
if(cluster){
add=paste(add," -c ")
min_cluster_size=paste0(" --min-cluster-size ",min_cluster_size)
}else{
min_cluster_size=""
}
.main$out_file$reference=paste0(out_file_dir,"/",input_id,".cnn")
.main$exec_code=paste("singularity exec -H ",paste0(getwd(),":/home "),sif_cnvkit,
" cnvkit.py reference ", gender, min_cluster_size,
" -f ",normalizePath(ref_genome)," -o ",
.main$out_file$pon, add, gsub(";"," ",cnn)
)
run_job(.env=.this.env)
.env$.main<-.main
}
.base.env=environment()
list2env(list(...),envir=.base.env)
set_env_vars(
.env= .base.env,
vars="cnn"
)
launch(.env=.base.env)
}
#' Wrapper around target function from CNVkit
#'
#' This function wraps around target function for CNVkit
#' This function generates an target BED file from an input target file.
#' Additional parameters can be used to exclude regions and modify the average bin size
#'
#'
#' For more information read:
#' https://cnvkit.readthedocs.io/en/stable/pipeline.html
#'
#' @param sif_cnvkit [REQUIRED] Path to cnvkit sif file.
#' @param bed [REQUIRED] Path to input BED file with target regions. Default none
#' @param annotation [OPTIONAL] BED file with region annotation information. Default none
#' @param output_name [OPTIONAL] Name for the output. If not given the name of the first tumour sample of the samples will be used.
#' @param output_dir [OPTIONAL] Path to the output directory.
#' @param short_names [OPTIONAL] Use short annotation names. Default FALSE
#' @param bin_size [OPTIONAL] Average bin size. Default 100.
#' @param output_dir [OPTIONAL] Path to the output directory.
#' @param threads [OPTIONAL] Number of threads to split the work. Default 4
#' @param ram [OPTIONAL] RAM memory to asing to each thread. Default 4
#' @param verbose [OPTIONAL] Enables progress messages. Default False.
#' @param mode [REQUIRED] Where to parallelize. Default local. Options ["local","batch"]
#' @param executor_id Task EXECUTOR ID. Default "recalCovariates"
#' @param task_name Task name. Default "recalCovariates"
#' @param time [OPTIONAL] If batch mode. Max run time per job. Default "48:0:0"
#' @param update_time [OPTIONAL] If batch mode. Job update time in seconds. Default 60.
#' @param wait [OPTIONAL] If batch mode wait for batch to finish. Default FALSE
#' @param hold [OPTIONAL] HOld job until job is finished. Job ID.
#' @export
create_target_cnvkit=function(
sif_cnvkit=build_default_sif_list()$sif_cnvkit,
bed=NULL,
annotation=NULL,
split=FALSE,
short_names=FALSE,
bin_size=100,
...
){
run_main=function(
.env
){
.this.env=environment()
append_env(to=.this.env,from=.env)
set_main(.env=.this.env)
if(!is.null(annotation)){
annotation=paste0(" --annotate ",annotation)
}
add=""
if(short_names){
add=paste(add," --short-names ")
}
.main$out_files$tg_bed=paste0(
out_file_dir,"/",input_id,".binned.targets.bed"
)
.main$exec_code=paste(
"singularity exec -H ",paste0(getwd(),":/home "),sif_cnvkit,
" cnvkit.py target --split -a ",bin_size," -o ",
.main$out_files$tg_bed, add, input
)
run_job(.env=.this.env)
.env$.main<-.main
}
.base.env=environment()
list2env(list(...),envir=.base.env)
set_env_vars(
.env= .base.env,
vars="bed"
)
launch(.env=.base.env)
}
#' Wrapper around antitarget function from CNVkit
#'
#' This function wraps around antitarget function for CNVkit
#' This function generates an antitarget BED file from an input target file.
#' Additional parameters can be used to exclude regions and modify the average bin size
#'
#'
#' For more information read:
#' https://cnvkit.readthedocs.io/en/stable/pipeline.html
#'
#' @param sif_cnvkit [REQUIRED] Path to cnvkit sif file.
#' @param bed [REQUIRED] Path to input BED file with target regions. Default none
#' @param access [OPTIONAL] Path to non-accessible regions to exclude. Default none
#' @param output_name [OPTIONAL] Name for the output. If not given the name of the first tumour sample of the samples will be used.
#' @param output_dir [OPTIONAL] Path to the output directory.
#' @param bin_size [OPTIONAL] Average bin size. Default 100.
#' @param min_bin_size [OPTIONAL] Minimum average bin size. Bins smaller that this will be dropped. Default NULL.
#' @param output_dir [OPTIONAL] Path to the output directory.
#' @param threads [OPTIONAL] Number of threads to split the work. Default 4
#' @param ram [OPTIONAL] RAM memory to asing to each thread. Default 4
#' @param verbose [OPTIONAL] Enables progress messages. Default False.
#' @param mode [REQUIRED] Where to parallelize. Default local. Options ["local","batch"]
#' @param executor_id Task EXECUTOR ID. Default "recalCovariates"
#' @param task_name Task name. Default "recalCovariates"
#' @param time [OPTIONAL] If batch mode. Max run time per job. Default "48:0:0"
#' @param update_time [OPTIONAL] If batch mode. Job update time in seconds. Default 60.
#' @param wait [OPTIONAL] If batch mode wait for batch to finish. Default FALSE
#' @param hold [OPTIONAL] HOld job until job is finished. Job ID.
#' @export
create_antitarget_cnvkit=function(
sif_cnvkit=build_default_sif_list()$sif_cnvkit,
access=build_default_reference_list()$HG19$reference$access_5k,
bed=NULL,
bin_size=500000,
min_bin_size=NULL,
...
){
run_main=function(
.env
){
.this.env=environment()
append_env(to=.this.env,from=.env)
set_main(.env=.this.env)
if(!is.null(min_bin_size)){
add=paste0(" -m ",min_bin_size)
}
.main$out_files$atg_bed=paste0(
out_file_dir,"/",input_id,".binned.antitargets.bed"
)
if(!is.null(access)){
access=paste0(" -g ",access)
}
.main$exec_code=paste(
"singularity exec -H ",paste0(getwd(),":/home "),sif_cnvkit,
" cnvkit.py antitarget -a ",
bin_size,access,
" -o ",.main$out_files$atg_bed, add, normalizePath(input)
)
run_job(.env=.this.env)
.env$.main<-.main
}
.base.env=environment()
list2env(list(...),envir=.base.env)
set_env_vars(
.env= .base.env,
vars="bed"
)
launch(.env=.base.env)
}
#' Create binned target and anitarget beds from target BED file
#'
#' This function wraps around target and antitarget functiosn for CNVkit
#' This function generates an target BED file from an input target file.
#' Additional parameters can be used to exclude regions and modify the average bin size
#'
#'
#' For more information read:
#' https://cnvkit.readthedocs.io/en/stable/pipeline.html
#'
#' @param sif_cnvkit [REQUIRED] Path to cnvkit sif file.
#' @param access [OPTIONAL] Path to non-accessible regions to exclude. Default none
#' @param bed [REQUIRED] Path to input BED file with target regions. Default none
#' @param annotation [OPTIONAL] BED file with region annotation information. Default none
#' @param output_name [OPTIONAL] Name for the output. If not given the name of the first tumour sample of the samples will be used.
#' @param output_dir [OPTIONAL] Path to the output directory.
#' @param short_names [OPTIONAL] Use short annotation names. Default FALSE
#' @param bin_size_target [OPTIONAL] Average bin size for target. Default 120.
#' @param bin_size_antitarget [OPTIONAL] Average bin size for target. Default 500000.
#' @param min_bin_size_antitarget [OPTIONAL] Average bin size for target. Default null.
#' @param split [OPTIONAL] Average bin size for target. Default null.
#' @param output_dir [OPTIONAL] Split annotation names.
#' @param output_dir [OPTIONAL] Split annotation names.
#' @param threads [OPTIONAL] Number of threads to split the work. Default 4
#' @param ram [OPTIONAL] RAM memory to asing to each thread. Default 4
#' @param verbose [OPTIONAL] Enables progress messages. Default False.
#' @param mode [REQUIRED] Where to parallelize. Default local. Options ["local","batch"]
#' @param executor_id Task EXECUTOR ID. Default "recalCovariates"
#' @param task_name Task name. Default "recalCovariates"
#' @param time [OPTIONAL] If batch mode. Max run time per job. Default "48:0:0"
#' @param update_time [OPTIONAL] If batch mode. Job update time in seconds. Default 60.
#' @param wait [OPTIONAL] If batch mode wait for batch to finish. Default FALSE
#' @param hold [OPTIONAL] HOld job until job is finished. Job ID.
#' @export
bin_targets_cnvkit=function(
sif_cnvkit=build_default_sif_list()$sif_cnvkit,
access=build_default_reference_list()$HG19$reference$access_5k,
bed=NULL,
annotation=NULL,
bin_size_antitarget=100000,
bin_size_target=100,
min_bin_size_antitarget=NULL,
split=FALSE,
short_names=FALSE,
...
){
run_main=function(
.env
){
.this.env=environment()
append_env(to=.this.env,from=.env)
set_main(.env=.this.env)
.main$steps[[fn_id]]<-.this.env
.main.step=.main$steps[[fn_id]]
.main.step$steps=append(
.main.step$steps,
create_target_cnvkit(
sif_cnvkit=sif_cnvkit,
bed=bed,
annotation=annotation,
output_name=output_name,
output_dir=out_file_dir,
tmp_dir=tmp_dir,
batch_dir=batch_dir,
env_dir=env_dir,
split=split,
short_names=short_names,
bin_size=bin_size_target,
verbose=verbose,
threads=threads,ram=ram,
executor_id=task_id
)
)
.this.step=.main.step$steps$create_target_cnvkit
.main.step$out_files<-.this.step$out_files
.main.step$steps=append(
.main.step$steps,
create_antitarget_cnvkit(
sif_cnvkit=sif_cnvkit,
access=access,
bed=bed,
output_name=output_name,
output_dir=out_file_dir,
tmp_dir=tmp_dir,
batch_dir=batch_dir,
env_dir=env_dir,
bin_size=bin_size_antitarget,
min_bin_size=min_bin_size_antitarget,
verbose=verbose,
threads=threads,ram=ram,
executor_id=task_id
)
)
.this.step=.main.step$steps$create_antitarget_cnvkit
.main.step$out_files<-append(
.main.step$out_files,
.this.step$out_files
)
.env$.main<-.main
}
.base.env=environment()
list2env(list(...),envir=.base.env)
set_env_vars(
.env= .base.env,
vars="bed"
)
launch(.env=.base.env)
}
#' Wrapper around autobin function from CNVkit
#'
#' This function wraps around target function for CNVkit
#' This function generates an target BED file from an input target file.
#' Additional parameters can be used to exclude regions and modify the average bin size
#'
#'
#' For more information read:
#' https://cnvkit.readthedocs.io/en/stable/pipeline.html
#'
#' @param sif_cnvkit [REQUIRED] Path to cnvkit sif file.
#' @param ref_genoma [REQUIRED] Path to reference genoms.
#' @param bed [REQUIRED] Path to input BED file with target regions. Default none
#' @param bams [REQUIRED] Path to BAM files. Default none
#' @param annotation [OPTIONAL] BED file with region annotation information. Default none
#' @param output_name [OPTIONAL] Name for the output. If not given the name of the first tumour sample of the samples will be used.
#' @param output_dir [OPTIONAL] Path to the output directory.
#' @param short_names [OPTIONAL] Use short annotation names. Default FALSE
#' @param seq_method [OPTIONAL] Sequenced methods used. Default hybrid. Options ["hybrid","amplicon","wgs"]
#' @param seq_type [OPTIONAL] Use short annotation names. Default FALSE
#' @param min_bin_size_target [OPTIONAL] Mininimum target bin size. Default 20.
#' @param max_bin_size_target [OPTIONAL] Mininimum target bin size. Default 20000.
#' @param max_bin_size_antitarget [OPTIONAL] Mininimum target bin size. Default 500000.
#' @param min_bin_size_antitarget [OPTIONAL] Mininimum target bin size. Default 500.
#' @param bp_per_bin [OPTIONAL] Bases per bin. Default 100000.
#' @param output_dir [OPTIONAL] Path to the output directory.
#' @param threads [OPTIONAL] Number of threads to split the work. Default 4
#' @param ram [OPTIONAL] RAM memory to asing to each thread. Default 4
#' @param verbose [OPTIONAL] Enables progress messages. Default False.
#' @param mode [REQUIRED] Where to parallelize. Default local. Options ["local","batch"]
#' @param executor_id Task EXECUTOR ID. Default "recalCovariates"
#' @param task_name Task name. Default "recalCovariates"
#' @param time [OPTIONAL] If batch mode. Max run time per job. Default "48:0:0"
#' @param update_time [OPTIONAL] If batch mode. Job update time in seconds. Default 60.
#' @param wait [OPTIONAL] If batch mode wait for batch to finish. Default FALSE
#' @param hold [OPTIONAL] HOld job until job is finished. Job ID.
#' @export
autobin_cnvkit=function(
sif_cnvkit=build_default_sif_list()$sif_cnvkit,
ref_genome=build_default_reference_list()$HG19$reference$genome,
access=build_default_reference_list()$HG19$reference$access_5k,
bam=NULL,
bed=NULL,
annotation=NULL,
seq_method="hybrid",
split=FALSE,
short_names=FALSE,
bp_per_bin=100000,
min_bin_size_target=20,
max_bin_size_target=20000,
min_bin_size_antitarget=500,
max_bin_size_antitarget=500000,
...
){
run_main=function(
.env
){
.this.env=environment()
append_env(to=.this.env,from=.env)
set_main(.env=.this.env)
if(!is.null(annotation)){
annotation=paste0(" --annotate ",annotation)
}
if(!is.null(access)){
access=paste0(" -g ",access)
}
add=""
if(short_names){
add=paste(add," --short-names ")
}
.main$exec_code=paste("singularity exec -H ",
paste0(getwd(),":/home "),sif_cnvkit,
" cnvkit.py autobin -t ",bed," -f ",ref_genome,
" -b ",bp_per_bin, " -m ",seq_method,access,
" --target-max-size ",max_bin_size_target,
" --target-min-size ",min_bin_size_target,
" --antitarget-min-size ",min_bin_size_antitarget,
" --antitarget-max-size ",max_bin_size_antitarget,
add,annotation,
gsub(";"," ",input)
)
run_job(.env=.this.env)
.env$.main<-.main
}
.base.env=environment()
list2env(list(...),envir=.base.env)
set_env_vars(
.env= .base.env,
vars="bam"
)
launch(.env=.base.env)
}
#' Wrapper around autobin function from CNVkit
#'
#' This function wraps around target function for CNVkit
#' This function generates an target BED file from an input target file.
#' Additional parameters can be used to exclude regions and modify the average bin size
#'
#'
#' For more information read:
#' https://cnvkit.readthedocs.io/en/stable/pipeline.html
#'
#' @param sif_cnvkit [REQUIRED] Path to cnvkit sif file.
#' @param ref_genoma [REQUIRED] Path to reference genoms.
#' @param bed [REQUIRED] Path to input BED file with target regions. Default none
#' @param bam [REQUIRED] Path to BAM files. Default none
#' @param read_count [OPTIONAL] Alternative method for coverage. Default FALSE
#' @param min_mapq [OPTIONAL] Minimum mapping quality to count a read for coverage. Default 0.
#' @param output_name [OPTIONAL] Name for the output. If not given the name of the first tumour sample of the samples will be used.
#' @param output_dir [OPTIONAL] Path to the output directory.
#' @param output_dir [OPTIONAL] Path to the output directory.
#' @param threads [OPTIONAL] Number of threads to split the work. Default 4
#' @param ram [OPTIONAL] RAM memory to asing to each thread. Default 4
#' @param verbose [OPTIONAL] Enables progress messages. Default False.
#' @param mode [REQUIRED] Where to parallelize. Default local. Options ["local","batch"]
#' @param executor_id Task EXECUTOR ID. Default "recalCovariates"
#' @param task_name Task name. Default "recalCovariates"
#' @param time [OPTIONAL] If batch mode. Max run time per job. Default "48:0:0"
#' @param update_time [OPTIONAL] If batch mode. Job update time in seconds. Default 60.
#' @param wait [OPTIONAL] If batch mode wait for batch to finish. Default FALSE
#' @param hold [OPTIONAL] HOld job until job is finished. Job ID.
#' @export
coverage_cnvkit=function(
sif_cnvkit=build_default_sif_list()$sif_cnvkit,
ref_genome=build_default_reference_list()$HG19$reference$genome,
bam=NULL,
bed=NULL,
read_count=FALSE,
min_mapq=0,
...
){
run_main=function(
.env
){
.this.env=environment()
append_env(to=.this.env,from=.env)
set_main(.env=.this.env)
.main$out_files$cnn=paste0(out_file_dir,"/",input_id,".cnn")
add=""
if(read_count){
add=" -c "
}
.main$exec_code=paste(
"singularity exec -H ",paste0(getwd(),":/home "),sif_cnvkit,
" cnvkit.py coverage -p ",threads, "-q ",min_mapq,
" -f ", normalizePath(ref_genome)," -o ",.main$out_files$cnn,
add,normalizePath(input), bed
)
run_job(.env=.this.env)
.env$.main <- .main
}
.base.env=environment()
list2env(list(...),envir=.base.env)
set_env_vars(
.env= .base.env,
vars="bam"
)
launch(.env=.base.env)
}
#' Wrapper around autobin function from CNVkit
#'
#' This function wraps around target function for CNVkit
#' This function generates an target BED file from an input target file.
#' Additional parameters can be used to exclude regions and modify the average bin size
#'
#'
#' For more information read:
#' https://cnvkit.readthedocs.io/en/stable/pipeline.html
#'
#' @param sif_cnvkit [REQUIRED] Path to cnvkit sif file.
#' @param ref_genoma [REQUIRED] Path to reference genoms.
#' @param bed [REQUIRED] Path to input BED file with target regions. Default none
#' @param bam [REQUIRED] Path to BAM files. Default none
#' @param read_count [OPTIONAL] Alternative method for coverage. Default FALSE
#' @param min_mapq [OPTIONAL] Minimum mapping quality to count a read for coverage. Default 0.
#' @param output_name [OPTIONAL] Name for the output. If not given the name of the first tumour sample of the samples will be used.
#' @param output_dir [OPTIONAL] Path to the output directory.
#' @param output_dir [OPTIONAL] Path to the output directory.
#' @param threads [OPTIONAL] Number of threads to split the work. Default 4
#' @param ram [OPTIONAL] RAM memory to asing to each thread. Default 4
#' @param verbose [OPTIONAL] Enables progress messages. Default False.
#' @param mode [REQUIRED] Where to parallelize. Default local. Options ["local","batch"]
#' @param executor_id Task EXECUTOR ID. Default "recalCovariates"
#' @param task_name Task name. Default "recalCovariates"
#' @param time [OPTIONAL] If batch mode. Max run time per job. Default "48:0:0"
#' @param update_time [OPTIONAL] If batch mode. Job update time in seconds. Default 60.
#' @param wait [OPTIONAL] If batch mode wait for batch to finish. Default FALSE
#' @param hold [OPTIONAL] HOld job until job is finished. Job ID.
#' @export
call_coverage_cnvkit=function(
sif_cnvkit=build_default_sif_list()$sif_cnvkit,
ref_genome=build_default_reference_list()$HG19$reference$genome,
bam=NULL,
type="target",
bed=NULL,
read_count=FALSE,
min_mapq=0,
...
){
run_main=function(
.env
){
.this.env=environment()
append_env(to=.this.env,from=.env)
set_main(.env=.this.env)
output_name=paste0(input_id,".",type,"coverage")
fn_id=paste0(fn_id,".",type)
.main$steps[[fn_id]]<-.this.env
.main.step<-.main$steps[[fn_id]]
if(type!="target" && type != "antitarget"){
stop("Valid values for type are : target / antitarget")
}
.main.step$steps<-append(
.main.step$steps,
coverage_cnvkit(
sif_cnvkit=sif_cnvkit,
ref_genome=ref_genome,
bam=bam,
bed=bed,
output_name=output_name,
output_dir=out_file_dir,
tmp_dir=tmp_dir,
batch_dir=batch_dir,
env_dir=env_dir,
err_msg=err_msg,
read_count=read_count,
min_mapq=min_mapq,
verbose=verbose,
threads=threads,
ram=ram,
executor_id=task_id
)
)
.this.step=.main.step$steps$coverage_cnvkit
.main.step$out_files[[type]]=.this.step$out_files$cnn
.env$.main<-.main
}
.base.env=environment()
list2env(list(...),envir=.base.env)
set_env_vars(
.env= .base.env,
vars="bam"
)
launch(.env=.base.env)
}
#' Wrapper around reference function from CNVkit
#'
#' This function wraps around target function for CNVkit
#' This function generates an target BED file from an input target file.
#' Additional parameters can be used to exclude regions and modify the average bin size
#'
#'
#' For more information read:
#' https://cnvkit.readthedocs.io/en/stable/pipeline.html
#'
#' @param sif_cnvkit [REQUIRED] Path to cnvkit sif file.oms.
#' @param seg_method [OPTIONAL] Method to use for segmentation. Default cbs. Options ["cbs","flasso","haar","none","hmm","hmm-tumor","hmm-germline"]
#' @param smooth [OPTIONAL] Smooth before CBS. Default TRUE
#' @param drop_low_coverage [OPTIONAL] Drop low coverage bins before segmentation. Default TRUE
#' @param drop_outliers [OPTIONAL] Drop outlier bins before segmentation. Default TRUE
#' @param output_name [OPTIONAL] Name for the output. If not given the name of the first tumour sample of the samples will be used.
#' @param output_dir [OPTIONAL] Path to the output directory.
#' @param threads [OPTIONAL] Number of threads to split the work. Default 4
#' @param ram [OPTIONAL] RAM memory to asing to each thread. Default 4
#' @param verbose [OPTIONAL] Enables progress messages. Default False.
#' @param mode [REQUIRED] Where to parallelize. Default local. Options ["local","batch"]
#' @param executor_id Task EXECUTOR ID. Default "recalCovariates"
#' @param task_name Task name. Default "recalCovariates"
#' @param time [OPTIONAL] If batch mode. Max run time per job. Default "48:0:0"
#' @param update_time [OPTIONAL] If batch mode. Job update time in seconds. Default 60.
#' @param wait [OPTIONAL] If batch mode wait for batch to finish. Default FALSE
#' @param hold [OPTIONAL] HOld job until job is finished. Job ID.
#' @export
segment_cnvkit=function(
sif_cnvkit=build_default_sif_list()$sif_cnvkit,
cnr=NULL,
seg_method="cbs",
smooth=FALSE,
drop_low_coverage=FALSE,
drop_outliers=10,
...
){
run_main=function(
.env
){
.this.env=environment()
append_env(to=.this.env,from=.env)
set_main(.env=.this.env)
add=""
if(smooth){
add=paste(add," --smooth-cbs ")
}
if(drop_low_coverage){
add=paste(add," --drop-low-coverage ")
}
if(drop_outliers){
drop_outliers=paste0(" --drop-outliers ",drop_outliers)
}
.main$out_files$cns=paste0(out_file_dir,"/",input_id,".cns")
.main$exec_code=paste(
"singularity exec -H",paste0(getwd(),":/home "),sif_cnvkit,
" cnvkit.py segment -p ",
threads,drop_outliers," -o ",
.main$out_files$cns,
add,normalizePath(input)
)
run_job(.env=.this.env)
.env$.main<-.main
}
.base.env=environment()
list2env(list(...),envir=.base.env)
set_env_vars(
.env= .base.env,
vars="cnr"
)
launch(.env=.base.env)
}
#' Wrapper around reference scatter from CNVkit
#'
#' This function wraps around target function for CNVkit
#' This function generates an target BED file from an input target file.
#' Additional parameters can be used to exclude regions and modify the average bin size
#'
#'
#' For more information read:
#' https://cnvkit.readthedocs.io/en/stable/pipeline.html
#'
#' @param sif_cnvkit [REQUIRED] Path to cnvkit sif file.oms.
#' @param trend [OPTIONAL] Smooth before CBS. Default TRUE
#' @param range [OPTIONAL] Range to show in chr:start-end format. Default none
#' @param range_list [OPTIONAL] Path to BED file with range list. Default none
#' @param genes [OPTIONAL] List of genes to focus on. Default none.
#' @param margin_width [OPTIONAL] Default gene margin width. Default 1000000
#' @param plot_by_bin [OPTIONAL] Assume all bins are the same size. Default FALSE
#' @param trend [OPTIONAL] Show trendline. Default TRUE.
#' @param antitarget_symbol [OPTIONAL] Show antitarget probes with this symbol. Default "@"
#' @param segment_colour [OPTIONAL] Show antitarget probes with this symbol. Default "@"
#' @param title [OPTIONAL] Sample title. Default NULL.
#' @param y_max [OPTIONAL] Y max range. Default NULL.
#' @param y_min[OPTIONAL] Y min range. Default NULL.
#' @param output_name [OPTIONAL] Name for the output. If not given the name of the first tumour sample of the samples will be used.
#' @param output_dir [OPTIONAL] Path to the output directory.
#' @param threads [OPTIONAL] Number of threads to split the work. Default 4
#' @param ram [OPTIONAL] RAM memory to asing to each thread. Default 4
#' @param verbose [OPTIONAL] Enables progress messages. Default False.
#' @param mode [REQUIRED] Where to parallelize. Default local. Options ["local","batch"]
#' @param executor_id Task EXECUTOR ID. Default "recalCovariates"
#' @param task_name Task name. Default "recalCovariates"
#' @param time [OPTIONAL] If batch mode. Max run time per job. Default "48:0:0"
#' @param update_time [OPTIONAL] If batch mode. Job update time in seconds. Default 60.
#' @param wait [OPTIONAL] If batch mode wait for batch to finish. Default FALSE
#' @param hold [OPTIONAL] HOld job until job is finished. Job ID.
#' @export
scatter_cnvkit=function(
sif_cnvkit=build_default_sif_list()$sif_cnvkit,
chrs=build_default_chr_list()$canonical,
cnr=NULL,
cns=NULL,
range=NULL,
range_list=NULL,
genes=NULL,
margin_width=1000000,
plot_by_bin=FALSE,
trend=TRUE,
antitarget_symbol="@",
segment_colour="red",
title=NULL,
y_max=NULL,
y_min=NULL,
...
){
run_main=function(
.env
){
.this.env=environment()
append_env(to=.this.env,from=.env)
set_main(.env=.this.env)
if(!is.null(range)){
range=paste0(" -c ",range)
}
if(!is.null(segment_colour)){
segment_colour=paste0(" --segment-color ",segment_colour)
}
if(!is.null(genes)){
genes=paste0(" --gene ",genes)
}
if(!is.null(range_list)){
range_list=paste0(" -l ",range_list)
}
add=""
if(!is.null(title)){
title=paste0(" --title ",title)
}
if(!is.null(y_max)){
y_max=paste0(" --y-max ",y_max)
}
if(!is.null(y_min)){
y_min=paste0(" --y-min ",y_min)
}
if(trend){
add=paste(add," --trend ")
}
if(plot_by_bin){
add=paste(add," --by-bin ")
}
if(!is.null(margin_width)){
margin_width=paste0(" --width ",margin_width)
}
.main$out_files$scatter=paste0(out_file_dir,"/",input_id,".scatter.pdf")
cnr_tmp=paste0(cnr,".tmp")
cns_tmp=""
cns_code=""
.main$exec_code=paste0("cat ",cnr," | head -n 1 > ",
cnr_tmp," && cat ",cnr," | grep \"",
paste0(paste0("^",chrs),collapse="\\|"),"\" >> ",cnr_tmp
)
if(!is.null(cns)){
cns_tmp=paste0(cns,".tmp")
cns_code=paste(" -s ",cns_tmp)
.main$exec_code=paste0(
.main$exec_code," && cat ",cns," | head -n 1 > ",cns_tmp,
" && ",paste0("cat ",cns," | grep \" ",
paste0(paste0("^",unlist(chrs),collapse="\\|")," \" >> ",cns_tmp)
)
)
}
.main$exec_code=paste(
.main$exec_code,";singularity exec -H ",paste0(getwd(),":/home "),
sif_cnvkit," cnvkit.py scatter -o ",.main$out_files$scatter,
cns_code,add,
title,
segment_colour,
y_max,
y_min,
range,
margin_width,
range_list,
cnr_tmp,
"&& rm ",
cnr_tmp,cns_tmp
)
run_job(.env=.this.env)
.env$.main <- .main
}
.base.env=environment()
list2env(list(...),envir=.base.env)
set_env_vars(
.env=.base.env,
vars="cnr"
)
launch(.env=.base.env)
}
#' Wrapper around reference function from CNVkit
#'
#' This function wraps around target function for CNVkit
#' This function generates an target BED file from an input target file.
#' Additional parameters can be used to exclude regions and modify the average bin size
#'
#'
#' For more information read:
#' https://cnvkit.readthedocs.io/en/stable/pipeline.html
#'
#' @param sif_cnvkit [REQUIRED] Path to cnvkit sif file.oms.
#' @param target [REQUIRED] Path to CNN file for target regions. Default none.
#' @param antitarget [REQUIRED] Path to BED file with target regions. Default none.
#' @param gc [OPTIONAL] Disble GC bias correction. Default FALSE.
#' @param edge [OPTIONAL] Disble edge correction. Default FALSE.
#' @param rmask [OPTIONAL] Disble repeat mask correction. Default FALSE.
#' @param output_name [OPTIONAL] Name for the output. If not given the name of the first tumour sample of the samples will be used.
#' @param output_dir [OPTIONAL] Path to the output directory.
#' @param threads [OPTIONAL] Number of threads to split the work. Default 4
#' @param ram [OPTIONAL] RAM memory to asing to each thread. Default 4
#' @param verbose [OPTIONAL] Enables progress messages. Default False.
#' @param mode [REQUIRED] Where to parallelize. Default local. Options ["local","batch"]
#' @param executor_id Task EXECUTOR ID. Default "recalCovariates"
#' @param task_name Task name. Default "recalCovariates"
#' @param time [OPTIONAL] If batch mode. Max run time per job. Default "48:0:0"
#' @param update_time [OPTIONAL] If batch mode. Job update time in seconds. Default 60.
#' @param wait [OPTIONAL] If batch mode wait for batch to finish. Default FALSE
#' @param hold [OPTIONAL] HOld job until job is finished. Job ID.
#' @export
fix_cnvkit=function(
sif_cnvkit=build_default_sif_list()$sif_cnvkit,
pon=build_default_reference_list()$HG19$panel$PCF_V3$variant$pon_cn_male,
target=NULL,
antitarget=NULL,
gc=TRUE,
edge=TRUE,
rmask=TRUE,
...
){
run_main=function(
.env
){
.this.env=environment()
append_env(to=.this.env,from=.env)
set_main(.env=.this.env)
.main$out_files$cnr=paste0(out_file_dir,"/",input_id,".cnr")
add=""
if(!gc){
add=paste(add," --no-gc ")
}
if(!edge){
add=paste(add," --no-edge ")
}
if(!rmask){
add=paste(add," --no-rmask ")
}
if(is.null(antitarget)){
antitarget=paste0(tmp_dir,"/antitargetcoverage.cnn")
system(paste("cp ", system.file("data", "antitargetcoverage.cnn", package=ns), tmp_dir))
}
.main$exec_code=paste(
"singularity exec -H ",paste0(getwd(),":/home "),sif_cnvkit,
" cnvkit.py fix -o ",.main$out_files$cnr,
add,
normalizePath(input),
normalizePath(antitarget),
normalizePath(pon)
)
run_job(.env=.this.env)
.env$.main<-.main
}
.base.env=environment()
list2env(list(...),envir=.base.env)
set_env_vars(
.env=.base.env,
vars="target"
)
launch(.env=.base.env)
}
#' Wrapper around reference scatter from CNVkit
#'
#' This function wraps around target function for CNVkit
#' This function generates an target BED file from an input target file.
#' Additional parameters can be used to exclude regions and modify the average bin size
#'
#'
#' For more information read:
#' https://cnvkit.readthedocs.io/en/stable/pipeline.html
#'
#' @param sif_cnvkit [REQUIRED] Path to cnvkit sif file.oms.
#' @param cnr [REQUIRED] Path to CNR as produced by fix_cnvkit command.
#' @param cns [OPTIONAL] Path to CNS as produced by segment_cnvkit command.
#' @param cn_thr [OPTIONAL] Copy number threshold. Default 0.5
#' @param min_probes [OPTIONAL] Minimum number of probes to show a gene. Default 3
#' @param male_reference [OPTIONAL] Assume inputs were normalized to a male reference. Default FALSE
#' @param gender [OPTIONAL] Assume sample gender. Default male
#' @param shift [OPTIONAL] Adjust the X and Y chromosomes according to sample sex. Default TRUE
#' @param title [OPTIONAL] Sample ID. Default NULL.
#' @param output_name [OPTIONAL] Name for the output. If not given the name of the first tumour sample of the samples will be used.
#' @param output_dir [OPTIONAL] Path to the output directory.
#' @param threads [OPTIONAL] Number of threads to split the work. Default 4
#' @param ram [OPTIONAL] RAM memory to asing to each thread. Default 4
#' @param verbose [OPTIONAL] Enables progress messages. Default False.
#' @param mode [REQUIRED] Where to parallelize. Default local. Options ["local","batch"]
#' @param executor_id Task EXECUTOR ID. Default "recalCovariates"
#' @param task_name Task name. Default "recalCovariates"
#' @param time [OPTIONAL] If batch mode. Max run time per job. Default "48:0:0"
#' @param update_time [OPTIONAL] If batch mode. Job update time in seconds. Default 60.
#' @param wait [OPTIONAL] If batch mode wait for batch to finish. Default FALSE
#' @param hold [OPTIONAL] HOld job until job is finished. Job ID.
#' @export
diagram_cnvkit=function(
chrs=build_default_chr_list()$canonical,
sif_cnvkit=build_default_sif_list()$sif_cnvkit,
cnr=NULL,
cns=NULL,
cn_thr=0.5,
min_probes=3,
male_reference=FALSE,
gender="male",
shift=TRUE,
title=NULL,
...
){
run_main=function(
.env
){
.this.env=environment()
append_env(to=.this.env,from=.env)
set_main(.env=.this.env)
.main$out_files$diagram=paste0(out_file_dir,"/",input_id,".diagram.pdf")
if(!is.null(gender)){
gender=paste0(" -x ",gender)
}
if(!is.null(cn_thr)){
cn_thr=paste0(" -t ",cn_thr)
}
if(!is.null(min_probes)){
min_probes=paste0(" -m ",min_probes)
}
if(!is.null(title)){
title=paste0(" --title ",title)
}
add=""
if(!shift){
add=paste(add," --no-shift-xy ")
}
if(male_reference){
add=paste(add," -y ")
}
tmp_cns=paste0(cns,".tmp")
tmp_cnr=paste0(cnr,".tmp")
cnr_tmp=paste0(cnr,".tmp")
cns_tmp=""
cns_code=""
.main$exec_code=paste0("cat ",cnr," | head -n 1 > ",cnr_tmp," && cat ",cnr," | grep \"",
paste0(paste0("^",chrs),collapse="\\|"),"\" >> ",cnr_tmp)
if(!is.null(cns)){
cns_tmp=paste0(cns,".tmp")
cns_code=paste(" -s ",cns_tmp)
.main$exec_code=paste0(.main$exec_code," && cat ",
cns," | head -n 1 > ",cns_tmp," && ",paste0("cat ",cns," | grep \" ",
paste0(paste0("^",unlist(chrs),collapse="\\|")," \" >> ",cns_tmp)))
}
.main$exec_code=paste(
.main$exec_code,"; singularity exec -H ",paste0(getwd(),":/home "),sif_cnvkit,
" cnvkit.py diagram -o ",.main$out_files$diagram,
cns_code,gender,
add,title,
min_probes,
cn_thr,
cnr_tmp,
"&& rm ",
cnr_tmp,
cns_tmp
)
run_job(.env=.this.env)
.env$.main <- .main
}
.base.env=environment()
list2env(list(...),envir=.base.env)
set_env_vars(
.env= .base.env,
vars="cnr"
)
launch(.env=.base.env)
}
#' Plot CNR across bins
#'
#'
#'
#' @param cnr [REQUIRED] Path to CNR as produced by fix_cnvkit command.
#' @param cns [OPTIONAL] Path to CNS as produced by segment_cnvkit command.
#' @param genes [OPTIONAL] Genes to focus on. Default NULL
#' @param highlights [OPTIONAL] Highlights genes. Default NULL.
#' @param weights [OPTIONAL] Filter bins by weight
#' @param trend [OPTIONAL] Draw trend line across bins
#' @param save [OPTIONAL] Save image
#' @export
plot_cnvkit=function(
cnr=NULL,cns=NULL,chromosomes=NULL,
genes=NULL,highlights=NULL,
weights=NULL,threads=5,
show_lines=TRUE,
trend=TRUE,
save=TRUE,
limits=5,
show="panel",
output_dir="img",
output_name="img.png",
plot_width=1200,
plot_height=600
){
out_file_dir=set_dir(dir=output_dir)
cnr_dat=dplyr::bind_rows(mclapply_os(X=cnr,FUN=function(x){dat=read.csv(x,sep="\t");dat$id=ULPwgs::get_file_name(x);dat},mc.cores=threads))
cnr_dat$bin_type=ifelse(cnr_dat$gene=="Antitarget","Antitarget","Target")
cnr_dat$origin=ifelse(grepl("GENE",cnr_dat$gene),
"GENE",ifelse(grepl("SNP",cnr_dat$gene),"SNP","OTHER"))
cnr_dat$gene_type=ifelse(grepl("PANEL",cnr_dat$gene),"TARGET",
ifelse(grepl("CONTROL",cnr_dat$gene),"CONTROL",NA
))
cnr_dat$gene=sub(";rs.*","",gsub("MAF=[0-9].[0-9]{1,5};?","",cnr_dat$gene))
cnr_dat$gene=sub(
"^;","",
gsub(";?[CP][OA][NN][TE].*_[GS][EN][A-Z]{1,2};?","",cnr_dat$gene)
)
if(!is.null(cns)){
cns_dat=dplyr::bind_rows(mclapply_os(X=cns,FUN=function(x){dat=read.csv(x,sep="\t");dat$id=ULPwgs::get_file_name(x);dat},mc.cores=threads))
}
if(!is.null(weights)){
cnr_dat=cnr_dat %>% filter(weight>=weights)
}
cnr_dat$bin_id=paste0(cnr_dat$chromosome,"_",(cnr_dat$start+cnr_dat$end)/2)
cnr_dat$bin_id=as.numeric(forcats::fct_inorder(cnr_dat$bin_id))
if(!is.null(chromosomes)){
cnr_dat=cnr_dat[cnr_dat$chromosome %in% chromosomes,]
}
cnr_dat=cnr_dat[stringr::str_order(cnr_dat$chromosome, numeric = TRUE),]
cnr_dat$chromosome=forcats::fct_inorder(cnr_dat$chromosome)
if(!is.null(genes)){
cnr_dat=cnr_dat %>%
dplyr::filter(grepl(paste0(genes,collapse="|"),gene))
}
if(show=="panel"){
plt=ggplot(cnr_dat)+
geom_hline(aes(yintercept=0),linetype="dashed",size=0.1,alpha=0.25)
lapply(1:limits,FUN=function(x){
plt<<-plt+geom_hline(aes(yintercept=x),linetype="longdash",size=0.1,alpha=0.25)
plt<<-plt+geom_hline(aes(yintercept=-x),linetype="longdash",size=0.1,alpha=0.25)
})
plt=plt+facet_grid(id~chromosome)
if(!is.null(highlights)){
lapply(highlights,FUN=function(x){
tmp=cnr_dat %>% dplyr::filter(grepl(paste0("^",x,";?$"),gene)|grepl(paste0(";",x,"$"),gene))
tmp_outer=cnr_dat %>%
dplyr::filter(!grepl(paste0("^",x,";?$"),gene)|!grepl(paste0(";",x,"$"),gene),
chromosome==unique(tmp$chromosome))
tmp_outer=tmp_outer %>% group_by(id,chromosome) %>% summarise(medianOUTER=median(log2))
tmp=tmp %>% group_by(id,chromosome) %>%
summarise(
medianLog=median(log2),medianGENE=median(log2[origin=="GENE"]),
medianFLANK=median(log2[origin=="SNP"]),
min=min(bin_id),max=max(bin_id),minGENE=min(bin_id[]))
tmp=dplyr::left_join(tmp,tmp_outer)
plt<<-plt+geom_rect(data=tmp,aes(xmin=min,xmax=max,
ymin=-1+medianLog,ymax=1+medianLog),
col="black",fill="lightgray",size=0.1,alpha=0.5)
plt<<-plt+geom_text(data=tmp,aes(x=(min+max)/2,
y=1.75+medianLog,label=paste0(x,
";\nGENE LOG2: ",round(medianGENE,2),
";\nFLANK LOG2: ",round(medianFLANK,2),
";\nALL LOG2: ",round(medianLog,2),
";\nOUTER LOG2: ",round(medianOUTER,2)
)),
col="black",size=1)
}
)
}
plt=plt+geom_point(data=cnr_dat,aes(x=bin_id,y=log2,
col=ifelse(bin_type=="Antitarget","green","red")),size=0.1)+
scale_colour_identity()+facet_grid(id~chromosome,scale="free_x")+
theme_classic() +
theme(axis.title.x=element_blank(),
panel.spacing = unit(0, "lines"),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
strip.text.y = element_text(size=3)
)+
ylab("Log2")+scale_y_continuous(limits=c(-limits,limits))
if(trend){
plt=plt+geom_smooth(aes(x=bin_id,y=log2),col="#7b0b9c",se=FALSE)
}
if(save){
ggsave(units="px",limitsize = FALSE,filename=paste0( out_file_dir,"/",s_id,".",chrom,".",output_name),plot=plt,
heigh=length(unique(cnr_dat$id))*plot_height,
width=plot_width*length(unique(cnr_dat$chromosome)))
}else{
return(plt)
}
}else if (show=="single"){
mclapply_os(unique(cnr_dat$id),FUN=function(s_id){
library(tidyverse)
library(ggrepel)
lapply(unique(cnr_dat$chromosome),FUN=function(chr){
out_file_dir=set_dir(out_file_dir,name=chr)
cnr_dat=cnr_dat%>% filter(chromosome==chr,id==s_id)
plt=ggplot(cnr_dat)+
geom_hline(aes(yintercept=0),linetype="dashed",size=0.1,alpha=0.25)
lapply(1:limits,FUN=function(x){
plt<<-plt+geom_hline(aes(yintercept=x),linetype="longdash",size=0.1,alpha=0.25)
plt<<-plt+geom_hline(aes(yintercept=-x),linetype="longdash",size=0.1,alpha=0.25)
})
plt=plt+facet_grid(id~chromosome)
plt=plt+geom_point(data=cnr_dat,aes(x=bin_id,y=log2,
col=ifelse(bin_type=="Antitarget","green","red")),size=0.1)+
scale_colour_identity()+facet_grid(id~chromosome,scale="free_x")+
theme_classic() +
theme(axis.title.x=element_blank(),
panel.spacing = unit(0, "lines"),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
strip.text.y = element_text(size=3)
)+
ylab("Log2")+scale_y_continuous(limits=c(-limits,limits))
if(trend){
plt=plt+geom_smooth(aes(x=bin_id,y=log2),col="#7b0b9c",se=FALSE)
}
if(!is.null(highlights)){
lapply(highlights,FUN=function(x){
tmp=cnr_dat %>% dplyr::filter(grepl(paste0("^",x,";?$"),gene)|grepl(paste0(";",x,"$"),gene))
tmp_outer=cnr_dat %>%
dplyr::filter(!grepl(paste0("^",x,";?$"),gene)|!grepl(paste0(";",x,"$"),gene),
chromosome==unique(tmp$chromosome))
tmp_outer=tmp_outer %>% group_by(id,chromosome) %>% summarise(medianOUTER=median(log2))
tmp=tmp %>% group_by(id,chromosome) %>%
summarise(
medianLog=median(log2),medianGENE=median(log2[origin=="GENE"]),
medianFLANK=median(log2[origin=="SNP"]),
min=min(bin_id),max=max(bin_id),minGENE=min(bin_id[]))
tmp=dplyr::left_join(tmp,tmp_outer)
plt<<-plt+geom_rect(data=tmp,aes(xmin=min,xmax=max,
ymin=-0.75+medianLog,ymax=0.75+medianLog),
col="black",fill="lightgray",size=0.1,alpha=0.5)
plt<<-plt+geom_text(data=tmp,aes(x=(min+max)/2,
y=ifelse(medianLog>0,medianLog-1.75,medianLog+1.75),label=paste0(x,
";\nGENE LOG2: ",round(medianGENE,2),
";\nFLANK LOG2: ",round(medianFLANK,2),
";\nALL LOG2: ",round(medianLog,2),
";\nOUTER LOG2: ",round(medianOUTER,2)
)),
col="black",size=0.9)
}
)
}
if(save){
ggsave(units="px",limitsize = FALSE,filename=paste0( out_file_dir,"/",s_id,".",chr,".",output_name),plot=plt,
heigh=length(unique(cnr_dat$id))*plot_height,
width=plot_width*length(unique(cnr_dat$chromosome)))
}else{
return(plt)
}
})
},mc.cores=threads)
}
}
tfbs="TFBS/mnt/ssd/gtrd1903-temp/BioUML_20210519111712054.tmp/669696E3583AE070C6B3DACC767116B1/only_top1000/hg19/AR.1000.tfbs"
cnr="~/Scratch/CNV_WGS/pl/bin_1k/cnvkit_reports/PEA172_2019_05_02_PL/PEA172_2019_05_02_PL.cnr"
cns="~/Scratch/CNV_WGS/pl/bin_1k/cnvkit_reports/PEA172_2019_05_02_PL/PEA172_2019_05_02_PL.cns"
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.