##' The control function of the pipeline of the artificial genome
##' mapping.
##'
##' This function needs the fastq infiles as a named list and
##' additonal parameters set in the par_list by the function
##' \code{\link{set_par_list}}.
##' @title Control function for the artificial mapping
##' @param in_file list of *.fastq.gz file(s) by
##' \code{\link{get_sub_files}}
##' @param out_file file path of the outfile
##' @param par_list parameter list gnerated by
##' \code{\link{set_par_list}}
##' @return NULL
##' @author Jochen Kruppa
##' @export
##' @examples
##'
##' ## This function needs dendencies and file handling, see \url{https://github.com/jkruppa/virDisco} for a comprehensive tutorial.
artificial_genome_mapping <- function(in_file,
out_file,
par_list
)
{
talk("[DATA HANDLING] Write to ", dirname(out_file))
## some checks for file handling later on
require(tools)
require(png)
require(openxlsx)
proc_start_tm <- proc.time()
## generate tmp dir if not exists
if(!file.exists(par_list["tmp_dir"])) {
talk("[DATA HANDLING] Temp dir ", par_list["tmp_dir"], " not found. Will be created...")
dir.create(par_list["tmp_dir"])
} else {
talk("[DATA HANDLING] Temp dir ", par_list["tmp_dir"], " found. Will be removed...")
if(par_list["tmp_dir"] == file.path("/home/temp/"))
stop("Temp dir is '/home/temp/'! Stop, this not a good idea...")
unlink(par_list["tmp_dir"], recursive = TRUE)
talk("[DATA HANDLING] Temp dir ", par_list["tmp_dir"], " will be created...")
dir.create(par_list["tmp_dir"])
}
## check if mapper and index a coherent
if(!str_detect(par_list["genome_index"], par_list["mapper"])){
stop("par_list['genome_index'] does not fit to the selected mapper in par_list['mapper']!")
}
## check some flags
if(nchar(file_ext(out_file)) != 0) {
stop("out_file should have no extension like '.fq'")
}
if(length(unlist(in_file)) == 2) {
par_list["paired"] <- TRUE
}
if(nchar(par_list["amino_index"]) == 0) {
par_list["run_pep_mapping"] <- FALSE
}
if(file.exists(str_c(out_file, "_map_pep_list.RDS"))) {
par_list["run_pep_mapping"] <- FALSE
}
if(file.exists(str_c(out_file, "_map_dna_list.RDS"))) {
par_list["run_dna_mapping"] <- FALSE
}
if(file.exists(str_c(out_file, "_consensus.RDS"))) {
par_list["consensus"] <- FALSE
}
## if a list of genebank Ids is given, the plot need the consensus
if(nchar(par_list["plot_id"]) != 0){
par_list["consensus"] <- TRUE
}
if(any(class(par_list@species_info) == "empty") | any(class(par_list@prot_info) == "empty")) {
par_list["plot"] <- FALSE
} else {
if(tbl_sqlite_check(par_list)) talk("[DATA HANDLING] Gene and species information is correct")
}
## begin with trimming
## fastq quality control and trimming
## fastq quality control and trimming
if(!par_list["qc"] & par_list["decoy"]){
stop("If you want to use the decoy approach set qc = TRUE")
}
## start QC by trimming and read removing
if(par_list["qc"]){
trim_out_file <- fastq_quality_control(inFile = in_file,
illumninaclip = file.path(dirname(program_list["trimmomatic"]),
"adapters", "TruSeq3-PE.fa"),
par_list)
par_list["map_dna_in"] <- trim_out_file
par_list["map_pep_in"] <- trim_out_file
} else {
par_list["map_dna_in"] <- in_file
par_list["map_pep_in"] <- in_file
}
## start generating decoy reads
if(par_list["decoy"]){
talk("[DECOY] Generating decoy reads")
ref_seq <- readDNAStringSet(par_list["ref_seq_file"])
decoy_seq <- ref_seq[sample(str_which(names(ref_seq), "decoy"), 1)]
read_info <- read.table(file.path(par_list["tmp_dir"],
str_c(basename(out_file), "_trim.log")), header = TRUE)
## start generating the decoy reads depending on the mean read length
generate_decoy_reads(decoy_seq,
read_length = read_info$mean_read_length,
read_num = par_list["num_decoy_reads"])
## combine the decoy reads with the true reads
decoy_read_files <- dir0(par_list["tmp_dir"], "decoy")
decoy_R1_fq <- file.path(par_list["tmp_dir"], str_c(names(in_file), "_decoy_R1.fq.gz"))
decoy_R2_fq <- file.path(par_list["tmp_dir"], str_c(names(in_file), "_decoy_R2.fq.gz"))
runCMD(paste("cat", par_list["map_dna_in"][[1]]["R1"], str_subset(decoy_read_files, "_R1_"),
">", decoy_R1_fq))
runCMD(paste("cat", par_list["map_dna_in"][[1]]["R2"], str_subset(decoy_read_files, "_R2_"),
">", decoy_R2_fq))
## set the decoy in files as the new in files for mapping
par_list["map_dna_in"][[1]]["R1"] <- decoy_R1_fq
par_list["map_dna_in"][[1]]["R2"] <- decoy_R2_fq
}
## start DNA mapping
if(par_list["run_dna_mapping"]){
talk("[DNA MAPPING] Start DNA mapping")
map_dna_list <- map_dna_ref(infile = par_list["map_dna_in"],
outfile = str_c(out_file, ".sam"),
par_list)
talk("[DNA MAPPING] Save DNA mapping results...")
saveRDS(map_dna_list, str_c(out_file, "_map_dna_list.RDS"))
} else {
talk("[DNA MAPPING] Start DNA mapping -> Stopped due to prior results")
map_dna_list <- readRDS(str_c(out_file, "_map_dna_list.RDS"))
}
## get the mapping stats
if(par_list["decoy"]){
par_list["map_dna_stats"] <- as.list(map_dna_list$stats)
}
## run PEP mapping
if(par_list["run_pep_mapping"]){
talk("[AMINO MAPPING] Start AMINO mapping")
map_pep_list <- map_pep_ref(infile = par_list["map_pep_in"],
outfile = str_c(out_file, ".blastx"),
par_list)
talk("[AMINO MAPPING] Save AMINO mapping results...")
saveRDS(map_pep_list, str_c(out_file, "_map_pep_list.RDS"))
} else {
talk("[AMINO MAPPING] Start AMINO mapping -> Stopped due to prior results")
map_pep_list <- readRDS(str_c(out_file, "_map_pep_list.RDS"))
}
## build results xlsx
talk("[RESULTS] Save hits to xlsx")
ord_df <- ord_findings(map_dna_list, map_pep_list)
## remove some strange references by black_list
ord_df <- ord_df[!ord_df$genebank_id %in% par_list["black_list"], ]
## we remove all viral strains with a low coverage
good_coverage <- coverage_filter(ord_df$genebank_id,
map_dna_list,
par_list,
out_file) %>%
filter(coverage >= par_list["min_coverage"]) %>%
select(genebank_id)
## filter for the coverage
ord_df <- ord_df[ord_df$genebank_id %in% good_coverage$genebank_id,]
## get the description
genbank_id_desc_df <- collect(select(filter(par_list["species_info"],
genebank_id %in% ord_df$genebank_id),
c(genebank_id, Description)))
genbank_id_desc_map <- hashmap(genbank_id_desc_df$genebank_id,
genbank_id_desc_df$Description)
xlsx_df <- tibble(genebank_id = ord_df$genebank_id,
dna_reads = ord_df$count_read,
dna_rank = ord_df$rank_dna,
amino_reads = as.numeric(ord_df$count_prot),
amino_rank = ord_df$rank_pep,
description = genbank_id_desc_map[[genebank_id]])
openxlsx::write.xlsx(xlsx_df, file.path(str_c(out_file, ".xlsx")), row.names = FALSE)
talk("[RESULTS] Generate pylogenetic tree")
tax_ids <- collect(select(filter(par_list["species_info"],
genebank_id %in% ord_df$genebank_id), tax_id))
if(par_list["tax"] && nrow(tax_ids) != 0){
ete2_plot(tax_ids, out_file = out_file,
pdf_file = str_c(out_file, "_tree.pdf"),
clean = FALSE)
} else {
runCMD(paste("touch", str_c(out_file, "_no_tax_ids_found_tree.pdf")))
}
## clean everything
unlink(dir0(dirname(out_file), "tax")) ## clean tax files
unlink(dir0(dirname(out_file), "bam")) ## clean DNA map files
unlink(dir0(dirname(out_file), "blastx")) ## clean PEP map files
## start plotting the results
if(nrow(ord_df) >= par_list["num_plot"]) {
ord_df <- ord_df[1:par_list["num_plot"],]
}
## if no plot_ids are given run on findings
if(nchar(par_list["plot_id"])[1] == 0) {
par_list["plot_id"] <- ord_df$genebank_id
}
if(par_list["consensus"]){
talk("[CONSENSUS] Generate consensus of reads to reference")
consensus_list <- get_consensus_df(hits = par_list["plot_id"],
out_file,
par_list)
## write consensus_list information to file
writeXLS(consensus_list[2:6], file.path(str_c(out_file, "_consensus_statistics.xlsx")))
## save consensus_list
saveRDS(consensus_list, str_c(out_file, "_consensus.RDS"))
} else {
talk("[CONSENSUS] Generate consensus of reads to reference -> Stopped due to prior results OR not wanted")
if(file.exists(str_c(out_file, "_consensus.RDS"))) {
consensus_list <- readRDS(str_c(out_file, "_consensus.RDS"))
par_list["consensus"] <- TRUE
} else {
consensus_df <- ""
}
}
## build the sample info file to get more information on the sample
sample_info <- build_sample_info(sample_in = par_list["map_dna_in"],
out_file, par_list, proc_start_tm)
## plot the thing
if(par_list["plot"]){
talk("[PLOT] Start generating the mapping pdf")
mapping_dna_plot(genebank_id = par_list["plot_id"],
consensus_df = consensus_list$consensus_df,
dna_alignment_df = map_dna_list$alignment_df,
pep_alignment_df = map_pep_list$alignment_df,
aa_info_df = par_list["prot_info"],
viral_info_df = par_list["species_info"],
sample_info = sample_info,
pdf_file = ifelse(length(par_list["pdf_file"]) == 0,
str_c(out_file, ".pdf"), par_list["pdf_file"]),
par_list)
} else {
talk("[PLOT] Start generating the mapping pdf -> Stopped, not wanted")
}
## clean the temp folder
unlink(dir0(dirname(out_file), "\\.png")) ## clean png_files
gc()
if(par_list["clean"]) unlink(dir0(par_list["tmp_dir"]))
talk("[FIN] Finished\n")
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.