#library(data.table)
#library(tidyr)
#library(stringr)
#library(dplyr)
#' @export
compile_reads <- function(
master.ref, results.dir, project.ID, pooled.bam.dir = "/juno/work/access/production/resources/msk-access/current/novaseq_curated_duplex_bams_dmp/current/",
fasta.path = "/juno/work/access/production/resources/reference/current/Homo_sapiens_assembly19.fasta",
genotyper.path = "/work/access/production/resources/tools/GetBaseCountsMultiSample/current/GetBaseCountsMultiSample",
dmp.dir = "/juno/work/access/production/resources/cbioportal/current/msk_solid_heme", mirror.bam.dir = "/juno/res/dmpcollab/dmpshare/share/irb12_245",
dmp.key.path = "/juno/res/dmpcollab/dmprequest/12-245/key.txt") {
# # test input section -----------------------------------------------------------
# master.ref = fread('/juno/work/bergerm1/bergerlab/zhengy1/access_data_analysis/data/example_master_file.csv')
# results.dir = paste0('/juno/work/bergerm1/MSK-ACCESS/ACCESS-Projects/test_access/access_data_analysis/output_',format(Sys.time(),'%m%d%y'))
# pooled.bam.dir = '/ifs/work/bergerm1/ACCESS-Projects/novaseq_curated_duplex_v2/'
# fasta.path = '/work/access/production/resources/reference/current/Homo_sapiens_assembly19.fasta'
# genotyper.path = '/ifs/work/bergerm1/Innovation/software/maysun/GetBaseCountsMultiSample/GetBaseCountsMultiSample'
# dmp.dir = '/ifs/work/bergerm1/zhengy1/dmp/mskimpact/'
# mirror.bam.dir = '/ifs/dmpshare/share/irb12_245/'
# dmp.key.path = '/ifs/dmprequest/12-245/key.txt'
# setting up directory ----------------------------------------------------
dir.create(results.dir)
# make tmp directory in output directory
dir.create(paste0(results.dir, "/tmp"))
# checking virtualenv -----------------------------------------------------
geno.bash <- system("which genotype_variants", intern = T)
if (length(geno.bash) == 0) {
# print(pyclone.path)
stop("needs to run \nsource /home/accessbot/miniconda3/bin/activate && conda activate genotype-variants-0.3.0")
}
# data from DMP -----------------------------------------------------------
DMP.key <- as.data.table(read.csv(dmp.key.path, header = FALSE, sep = ","))
if (any(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-IH.|-T..-IM.|-T..-XS", "", DMP.key[grepl("IH|IM|XS", V1)]$V1))) {
stop(paste0(
"These DMP IDs are not found in DMP key file: ",
paste0(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id[which(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in%
gsub("-T..-IH.|-T..-IM.|-T..-XS", "", DMP.key[grepl("IH|IM|XS", V1)]$V1))], collapse = " ,")
))
}
DMP.maf <- fread(paste0(dmp.dir, "/data_mutations_extended.txt")) %>%
filter(Mutation_Status != "GERMLINE") %>%
data.table()
DMP.RET.maf <- DMP.maf[grepl(paste0(unique(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id), collapse = "|"), Tumor_Sample_Barcode), ]
# Pooled normal samples ---------------------------------------------------
pooled.bams <- list.files(pooled.bam.dir, pattern = ".bam", full.names = T)
# For each patient --------------------------------------------------------
x <- unique(master.ref$cmo_patient_id)[1]
# x = unique(master.ref$cmo_sample_id_plasma)[16]
# x = 'C-YW82CY'
print("Compiling reads per patient")
all.fillout.id <- lapply(unique(master.ref$cmo_patient_id), function(x) {
print(x)
dir.create(paste0(results.dir, "/", x))
dmp_id <- unique(master.ref[cmo_patient_id == x]$dmp_patient_id)
# sample sheet with colummns -- TSB, sample type, bam path, treatm --------
# need to get DMP tumor, DMP normal, plasma, plasma normal (if there is any), pooled normal
# DMP sample sheet
if (is.na(dmp_id) | dmp_id == '') {
dmp.sample.sheet <- NULL
} else {
all.dmp.ids.IM <- DMP.key[grepl(paste0(dmp_id, "-(T|N)..-IM."), V1)]$V1
all.dmp.ids.IH <- DMP.key[grepl(paste0(dmp_id, "-(T|N)..-IH."), V1)]$V1
all.dmp.ids.XS <- DMP.key[grepl(paste0(dmp_id, "-(T|N)..-XS."), V1)]$V1
all.dmp.ids <- c(all.dmp.ids.IM,all.dmp.ids.IH,all.dmp.ids.XS)
all.dmp.bam.ids.IM <- DMP.key[grepl(paste0(dmp_id, "-(T|N)..-IM."), V1)]$V2
all.dmp.bam.ids.IH <- DMP.key[grepl(paste0(dmp_id, "-(T|N)..-IH."), V1)]$V2
all.dmp.bam.ids.XS <- DMP.key[grepl(paste0(dmp_id, "-(T|N)..-XS."), V1)]$V2
all.dmp.bam.ids <- c(all.dmp.bam.ids.IM,all.dmp.bam.ids.IH,all.dmp.bam.ids.XS)
bam.sub.dir <- unlist(lapply(strsplit(substr(all.dmp.bam.ids, 1, 2), ""), function(x) {
paste0(x, collapse = "/")
}))
dmp.sample.sheet <- data.frame(
Sample_Barcode = all.dmp.ids,
standard_bam = paste0(mirror.bam.dir, "/", bam.sub.dir, "/", all.dmp.bam.ids, ".bam")
) %>%
mutate(cmo_patient_id = x, Sample_Type = ifelse(grepl("-T", Sample_Barcode), "DMP_Tumor", "DMP_Normal"), dmp_patient_id = dmp_id)
}
# total sample sheet
sample.sheet <- master.ref[
cmo_patient_id == x,
# plasma bams -- duplex and simplex bam
.(
Sample_Barcode = as.character(cmo_sample_id_plasma), duplex_bam = bam_path_plasma_duplex,
simplex_bam = bam_path_plasma_simplex, cmo_patient_id, Sample_Type = "duplex", dmp_patient_id
)
] %>%
merge(rbind(
unique(master.ref[
cmo_patient_id == x&paired=='Paired',
# buffy coat + DMP bams -- standard bam only
.(
Sample_Barcode = as.character(cmo_sample_id_normal), standard_bam = bam_path_normal,
cmo_patient_id, Sample_Type = "unfilterednormal", dmp_patient_id
)
]),
dmp.sample.sheet
), all = T)
# catch '' or NA for empty cells for some cmo_sample_id_normal
sample.sheet <- sample.sheet[!is.na(Sample_Barcode) | Sample_Barcode != ""]
write.table(sample.sheet, paste0(results.dir, "/", x, "/", x, "_sample_sheet.tsv"), sep = "\t", quote = F, row.names = F)
# piece together all unique calls -----------------------------------------
# get duplex calls
duplex.calls <- do.call(rbind, lapply(master.ref[cmo_patient_id == x]$maf_path, function(x) {
# fread(x) %>% filter(as.numeric(D_t_alt_count_fragment) > 0) %>% data.table()
selectcolumns <- c("Hugo_Symbol","Entrez_Gene_Id","Center","NCBI_Build","Chromosome","Start_Position","End_Position","Strand","Variant_Classification","Variant_Type","Reference_Allele","Tumor_Seq_Allele1","Tumor_Seq_Allele2","dbSNP_RS","dbSNP_Val_Status","Tumor_Sample_Barcode","caller_Norm_Sample_Barcode","Match_Norm_Seq_Allele1","Match_Norm_Seq_Allele2","Tumor_Validation_Allele1","Tumor_Validation_Allele2","Match_Norm_Validation_Allele1","Match_Norm_Validation_Allele2","Verification_Status","Validation_Status","Mutation_Status","Sequencing_Phase","Sequence_Source","Validation_Method","Score","BAM_File","Sequencer","Tumor_Sample_UUID","Matched_Norm_Sample_UUID","HGVSc","HGVSp","HGVSp_Short","Transcript_ID","Exon_Number","caller_t_depth","caller_t_ref_count","caller_t_alt_count","caller_n_depth","caller_n_ref_count","caller_n_alt_count","all_effects","Allele","Gene","Feature","Feature_type","Consequence","cDNA_position","CDS_position","Protein_position","Amino_acids","Codons","Existing_variation","ALLELE_NUM","DISTANCE","STRAND_VEP","SYMBOL","SYMBOL_SOURCE","HGNC_ID","BIOTYPE","CANONICAL","CCDS","ENSP","SWISSPROT","TREMBL","UNIPARC","RefSeq","SIFT","PolyPhen","EXON","INTRON","DOMAINS","AF","AFR_AF","AMR_AF","ASN_AF","EAS_AF","EUR_AF","SAS_AF","AA_AF","EA_AF","CLIN_SIG","SOMATIC","PUBMED","MOTIF_NAME","MOTIF_POS","HIGH_INF_POS","MOTIF_SCORE_CHANGE","IMPACT","PICK","VARIANT_CLASS","TSL","HGVS_OFFSET","PHENO","MINIMISED","ExAC_AF","ExAC_AF_AFR","ExAC_AF_AMR","ExAC_AF_EAS","ExAC_AF_FIN","ExAC_AF_NFE","ExAC_AF_OTH","ExAC_AF_SAS","GENE_PHENO","FILTER","flanking_bps","variant_id","variant_qual","ExAC_AF_Adj","ExAC_AC_AN_Adj","ExAC_AC_AN","ExAC_AC_AN_AFR","ExAC_AC_AN_AMR","ExAC_AC_AN_EAS","ExAC_AC_AN_FIN","ExAC_AC_AN_NFE","ExAC_AC_AN_OTH","ExAC_AC_AN_SAS","ExAC_FILTER","gnomAD_AF","gnomAD_AFR_AF","gnomAD_AMR_AF","gnomAD_ASJ_AF","gnomAD_EAS_AF","gnomAD_FIN_AF","gnomAD_NFE_AF","gnomAD_OTH_AF","gnomAD_SAS_AF","CallMethod","VCF_POS","VCF_REF","VCF_ALT","hotspot_whitelist","Status","D_t_alt_count_fragment","D_t_ref_count_fragment","D_t_vaf_fragment","SD_t_alt_count_fragment","SD_t_ref_count_fragment","SD_t_vaf_fragment","Matched_Norm_Sample_Barcode","Matched_Norm_Bamfile","n_alt_count_fragment","n_ref_count_fragment","n_vaf_fragment")
if("Status" %in% names(fread(x))){
fread(x) %>% select(one_of(selectcolumns)) %>% subset((Status == "") | (is.na(Status)))
} else {
fread(x) %>% select(one_of(selectcolumns))
}
# fread(x)
# %>%
# filter(as.numeric(t_alt_count) > 0) %>%
# data.table()
}))
# get impact calls
impact.calls <- DMP.RET.maf[Tumor_Sample_Barcode %in% sample.sheet$Sample_Barcode]
write.table(impact.calls[, .(Hugo_Symbol, Chromosome, Start_Position, End_Position, Variant_Classification, HGVSp_Short, Reference_Allele, Tumor_Seq_Allele2)],
paste0(results.dir, "/", x, "/", x, "_impact_calls.maf"),
sep = "\t", quote = F, row.names = F
)
# combining plasma and impact calls
all.calls <- rbind(
duplex.calls[, intersect(colnames(duplex.calls), colnames(DMP.RET.maf)), with = F],
impact.calls[, intersect(colnames(duplex.calls), colnames(DMP.RET.maf)), with = F]
)
# getting rid of duplicate calls and take the first occurence of all events
all.calls <- all.calls[which(!duplicated(all.calls[, .(Hugo_Symbol, Chromosome, Start_Position, End_Position, Variant_Classification, HGVSp_Short, Reference_Allele, Tumor_Seq_Allele2)])), ] %>%
mutate(t_ref_count=0, t_alt_count=0, n_ref_count=0, n_alt_count=0, Matched_Norm_Sample_Barcode=NA ) %>%
filter(Variant_Classification != "Silent" & !grepl("RP11-", Hugo_Symbol) & !grepl("Intron", Variant_Classification))
write.table(all.calls, paste0(results.dir, "/", x, "/", x, "_all_unique_calls.maf"), sep = "\t", quote = F, row.names = F)
# tagging hotspots
system(paste0(
'bsub -R "rusage[mem=4]" -cwd ', results.dir, "/", x, "/ -oo hotspot.o -eo hotspot.e -W 00:59 ",
" -P ", project.ID, " -J ", x, "_tag_hotspot ",
" python /work/access/production/workflows/access_workflows/v1/pipeline_2.0.0/ACCESS-Pipeline/cwl_tools/hotspots/tag_hotspots.py ",
" -m ", results.dir, "/", x, "/", x, "_all_unique_calls.maf",
" -itxt /work/access/production/resources/msk-access/current/regions_of_interest/current/hotspot-list-union-v1-v2_with_TERT.txt ",
" -o ", results.dir, "/", x, "/", x, "_all_unique_calls_hotspots.maf",
" -outdir ", results.dir, "/", x, "/", x
))
# genotype all bams in this patient directory -----------------------------
# genotyping plasma samples -- plasma duplex&simplex, plasma normal, pooled plasma normal
write.table(sample.sheet[, .(
sample_id = Sample_Barcode, maf = paste0(results.dir, "/", x, "/", x, "_all_unique_calls.maf"),
standard_bam, duplex_bam, simplex_bam
)],
paste0(results.dir, "/", x, "/", x, "_genotype_metadata.tsv"),
sep = "\t", quote = F, row.names = F
)
job.ids <- system(paste0(
"bsub -cwd ", results.dir, "/", x, ' -W 12:00 -R "rusage[mem=8]" -oo genotyping.o -eo genotyping.e ',
" -P ", project.ID, " -J ", x, "_genotype_variants ",
" genotype_variants small_variants multiple-samples -i ", results.dir, "/", x, "/", x, "_genotype_metadata.tsv",
" -r ", fasta.path, " -g ", genotyper.path, " -v DEBUG "
), intern = T)
job.ids <- as.numeric(gsub("Job <|> is.*.$", "", job.ids))
})
# Get base count multi sample in pooled normal ----------------------------
# all all unique calls in entire cohort
print("Compiling reads in pooled samples")
dir.create(paste0(results.dir, "/pooled"))
all.all.unique.mafs <- do.call(rbind, lapply(unique(master.ref$cmo_patient_id), function(x) {
fread(list.files(paste0(results.dir, "/", x), pattern = "unique_calls.maf$", full.names = T))
}))
all.all.unique.mafs <- all.all.unique.mafs[!duplicated(all.all.unique.mafs[, .(Hugo_Symbol, Chromosome, Start_Position, End_Position, Variant_Classification, HGVSp_Short, Reference_Allele, Tumor_Seq_Allele2)]),]
write.table(all.all.unique.mafs, paste0(results.dir, "/pooled/all_all_unique.maf"), sep = "\t", quote = F, row.names = F)
write.table(data.frame(
sample_id = gsub("^.*./|.bam", "", pooled.bams), maf = paste0(results.dir, "/pooled/all_all_unique.maf"),
standard_bam = pooled.bams, duplex_bam = "", simplex_bam = ""
),
paste0(results.dir, "/pooled/pooled_metadata.tsv"),
sep = "\t", quote = F, row.names = F
)
pooled.sample.job.id <- system(paste0(
"bsub -cwd ", results.dir, '/pooled -W 12:00 -R "rusage[mem=8]" -oo genotyping.o -eo genotyping.e ',
" -w ", ' \"', paste0(paste0("done(", unlist(all.fillout.id), ")"), collapse = "&&"), '\" ',
" -P ", project.ID, " -J pooled_genotype_variants ",
" genotype_variants small_variants multiple-samples -i ", results.dir, "/pooled/pooled_metadata.tsv",
" -r ", fasta.path, " -g ", genotyper.path, " -v DEBUG "
), intern = T)
pooled.sample.job.id <- as.numeric(gsub("Job <|> is.*.$", "", pooled.sample.job.id))
while (
!any(grepl("Done successfully", system(paste0("bjobs -l ", pooled.sample.job.id), intern = T)))
) {
Sys.sleep(120)
}
print("Compile reads done!")
}
# Executable -----------------------------------------------------------------------------------------------------------
# Minimal columns for input mafs
#
# Hugo_Symbol,Chromosome,Start_Position,End_Position,Tumor_Sample_Barcode,Variant_Classification,HGVSp_Short,Reference_Allele,Tumor_Seq_Allele2,D_t_alt_count_fragment
suppressPackageStartupMessages({
library(data.table)
library(tidyr)
library(stringr)
library(dplyr)
library(argparse)
})
if (!interactive()) {
parser <- ArgumentParser()
parser$add_argument("-m", "--masterref", type = "character", help = "File path to master reference file")
parser$add_argument("-o", "--resultsdir", type = "character", help = "Output directory")
parser$add_argument("-pid", "--projectid",
type = "character", default = "",
help = "Project ID for submitted jobs involved in this run"
)
parser$add_argument("-pb", "--pooledbamdir",
type = "character", default = "/juno/work/access/production/resources/msk-access/current/novaseq_curated_duplex_bams_dmp/current/",
help = "Directory for all pooled bams [default]"
)
parser$add_argument("-fa", "--fastapath",
type = "character", default = "/juno/work/access/production/resources/reference/current/Homo_sapiens_assembly19.fasta",
help = "Reference fasta path [default]"
)
parser$add_argument("-gt", "--genotyperpath",
type = "character", default = "/work/access/production/resources/tools/GetBaseCountsMultiSample/current/GetBaseCountsMultiSample",
help = "Genotyper executable path [default]"
)
parser$add_argument("-dmp", "--dmpdir",
type = "character", default = "/juno/work/access/production/resources/cbioportal/current/msk_solid_heme",
help = "Directory of clinical DMP repository [default]"
)
parser$add_argument("-mb", "--mirrorbamdir",
type = "character", default = "/juno/res/dmpcollab/dmpshare/share/irb12_245",
help = "Mirror BAM file directory [default]"
)
parser$add_argument("-dmpk", "--dmpkeypath",
type = "character", default = "/juno/res/dmpcollab/dmprequest/12-245/key.txt",
help = "DMP mirror BAM key file [default]"
)
args <- parser$parse_args()
master.ref <- args$masterref
results.dir <- args$resultsdir
project.ID <- args$projectid
pooled.bam.dir <- args$pooledbamdir
fasta.path <- args$fastapath
genotyper.path <- args$genotyperpath
dmp.dir <- args$dmpdir
mirror.bam.dir <- args$mirrorbamdir
dmp.key.path <- args$dmpkeypath
if (project.ID == "") {
project.ID <- paste0(sample(c(0:9), size = 10, replace = T), collapse = "")
}
print(paste0("Input parameters for run ", project.ID))
print(master.ref)
print(results.dir)
print(pooled.bam.dir)
print(fasta.path)
print(genotyper.path)
print(dmp.dir)
print(mirror.bam.dir)
print(dmp.key.path)
suppressWarnings(compile_reads(fread(master.ref), results.dir, project.ID, pooled.bam.dir, fasta.path, genotyper.path, dmp.dir, mirror.bam.dir, dmp.key.path))
print("compile reads function finished")
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.