# Copyright © 2014-2016 The YAPSA package contributors
# This file is part of the YAPSA package. The YAPSA package is licenced under
# GPL-3
#' @importFrom GetoptLong qq
#' @importFrom GetoptLong qq.options
#'
check_perl = function(module = NULL, inc = NULL, perl_bin = "perl") {
op = qq.options("code.pattern")
qq.options("code.pattern" = "@\\{CODE\\}")
on.exit(qq.options("code.pattern" = op))
if(is.null(module)) {
cmd = qq("\"@{perl_bin}\" -v")
} else if(!is.null(module) && is.null(inc)) {
cmd = qq("\"@{perl_bin}\" -M@{module} -e \"use @{module}\"")
} else if(!is.null(module) && !is.null(inc)) {
cmd = qq("\"@{perl_bin}\" \"-I@{inc}\" -M@{module} -e \"use @{module}\"")
}
OS = Sys.info()["sysname"]
if(OS == "Windows") {
res = system(cmd, ignore.stdout = TRUE, ignore.stderr = TRUE,
show.output.on.console = FALSE)
} else {
res = system(cmd, ignore.stdout = TRUE, ignore.stderr = TRUE)
}
return(ifelse(res, FALSE, TRUE))
}
#' @importFrom GetoptLong qq
#' @importFrom GetoptLong qq.options
#'
check_bedtools = function(module = NULL,
inc = NULL,
bedtools_bin = "bedtools") {
op = qq.options("code.pattern")
qq.options("code.pattern" = "@\\{CODE\\}")
on.exit(qq.options("code.pattern" = op))
if(is.null(module)) {
cmd = qq("\"@{bedtools_bin}\" --version")
}
OS = Sys.info()["sysname"]
if(OS == "Windows") {
res = system(cmd, ignore.stdout = TRUE, ignore.stderr = TRUE,
show.output.on.console = FALSE)
} else {
res = system(cmd, ignore.stdout = TRUE, ignore.stderr = TRUE)
}
return(ifelse(res, FALSE, TRUE))
}
## example:
## in_fasta <- "/ibios/co02/reference/Reference_1KG/hs37d5.fa"
## in_word_length <- 3
## project_folder <- file.path("/icgc/dkfzlsdf/analysis/hipo/hipo_028",
## "somaticSignatures/nucleotide_distrib")
## out_csv <- "kmer_frequencies_in_ref.csv"
##
run_kmer_frequency_pl <- function(in_fasta,
in_word_length,
project_folder,
out_csv) {
if(!check_perl()) {
cat("YAPSA:::run_kmer_frequency_pl::Error:unable to find path to perl")
return(1)
}
package_path <- system.file(package='YAPSA')
perl_command <- paste0("perl")
script_command <- file.path(package_path,"foreign","kmer_frequencies.pl")
script_options_command <- paste0("-r ",in_fasta, " -w ",in_word_length)
output_command <- paste0("> ",file.path(project_folder,out_csv))
this_command <- paste(perl_command,script_command,script_options_command,
output_command,collapse=" ")
system(this_command,intern=TRUE)
return(0)
}
## example:
## in_ref_genome <- "/ibios/co02/reference/Reference_1KG/hs37d5.fa"
## in_target_capture_bed <- file.path("/icgc/ngs_share/assemblies",
## "hg19_GRCh37_1000genomes/targetRegions",
## "Agilent5withUTRs_chr.bed.gz")
## project_folder <- file.path("/icgc/dkfzlsdf/analysis/hipo/hipo_028",
## "somaticSignatures/nucleotide_distrib")
## out_target_capture_fasta <- "hs37d5_Agilent5withUTR_targetCapture.fa"
##
run_bedtools_getfasta <- function(in_ref_genome,in_target_capture_bed,
project_folder,out_target_capture_fasta) {
if(!check_bedtools()) {
cat("YAPSA:::run_bedtools_getfasta::Error:unable to find path to bedtools")
return(1)
}
bedtools_command <- "bedtools getfasta"
bedtools_options_command <-
paste0("-fi ",in_ref_genome, " -bed ",in_target_capture_bed,
" -fo ",file.path(project_folder,out_target_capture_fasta))
this_command <- paste(bedtools_command,bedtools_options_command,collapse=" ")
system(this_command)
return(0)
}
#' Provide normalized correction factors for kmer content
#'
#' This function is analogous to
#' \code{\link[SomaticSignatures]{normalizeMotifs}}. If an
#' analysis of mutational signatures is performed on e.g. Whole Exome
#' Sequencing (WES) data, the signatures and exposures have to be adapted to
#' the potentially different kmer (trinucleotide) content of the target
#' capture. The present function takes as arguments paths to the used reference
#' genome and target capture file. It the extracts the sequence of the target
#' capture by calling \code{bedtools getfasta} on the system command prompt.
#' \code{run_kmer_frequency_normalization} then calls a custom made perl script
#' \code{kmer_frequencies.pl} also included in this package to count the
#' occurences of the tripletts in both the whole reference genome and the
#' created target capture sequence. These counts are used for normalization as
#' in \code{\link[SomaticSignatures]{normalizeMotifs}}. Note that
#' \code{\link[SomaticSignatures]{kmerFrequency}} provides a solution to
#' approximate kmer frequencies by random sampling. As opposed to that
#' approach, the function described here deterministically counts all
#' occurences of the kmers in the respective genome.
#'
#' @param in_ref_genome_fasta
#' Path to the reference genome fasta file used.
#' @param in_target_capture_bed
#' Path to a bed file containing the information on the used target capture.
#' May also be a compressed bed.
#' @param in_word_length
#' Integer number defining the length of the features or motifs, e.g. 3 for
#' tripletts or 5 for pentamers
#' @param project_folder
#' Path where the created files, especially the fasta file with the sequence
#' of the target capture and the count matrices, can be stored.
#' @param in_verbose
#' Verbose if \code{in_verbose=1}
#'
#' @return
#' A numeric vector with correction factors
#'
#' @examples
#' NULL
#'
#' @seealso \code{\link[SomaticSignatures]{normalizeMotifs}}
#'
#' @export
#'
run_kmer_frequency_normalization <- function(in_ref_genome_fasta,
in_target_capture_bed,
in_word_length,
project_folder,
in_verbose=1) {
target_capture_fasta <- "hs37d5_Agilent5withUTR_targetCapture.fa"
target_capture_kmer_counts_file <- "kmer_frequencies_in_targetCapture.csv"
reference_genome_kmer_counts_file <- "kmer_frequencies_in_ref.csv"
if(!file.exists(file.path(project_folder,
reference_genome_kmer_counts_file))){
if(in_verbose==1) {
cat("\nYAPS:::run_kmer_frequency_normalization::",
"reference_genome_kmer_counts_file doesn't exist yet. ",
"Create first.\n")
}
run_kmer_frequency_pl(in_ref_genome_fasta,in_word_length,project_folder,
reference_genome_kmer_counts_file)
} else {
if(in_verbose==1) {
cat("\nYAPS:::run_kmer_frequency_normalization::",
"reference_genome_kmer_counts_file already exists.\n");}
}
if(!file.exists(file.path(project_folder,target_capture_kmer_counts_file))){
if(in_verbose==1) {
cat("\nYAPS:::run_kmer_frequency_normalization::",
"target_capture_kmer_counts_file doesn't exist yet. ",
"Create first.\n")
}
if(!file.exists(file.path(project_folder,target_capture_fasta))) {
if(in_verbose==1) {
cat("\nYAPS:::run_kmer_frequency_normalization::",
"target_capture_fasta doesn't exist yet. Create first.\n")
}
run_bedtools_getfasta(in_ref_genome_fasta,in_target_capture_bed,
project_folder,target_capture_fasta)
} else {
if(in_verbose==1) {
cat("\nYAPS:::run_kmer_frequency_normalization::target_capture_fasta",
" already exists.\n")
}
}
run_kmer_frequency_pl(file.path(project_folder,target_capture_fasta),
in_word_length,project_folder,
target_capture_kmer_counts_file)
} else {
if(in_verbose==1) {
cat("\nYAPS:::run_kmer_frequency_normalization::",
"target_capture_kmer_counts_file already exists.\n")
}
}
if(in_verbose==1) {
cat("\nYAPS:::run_kmer_frequency_normalization::Now read counts into data",
" frames...\n");}
reference_genome_kmer_frequency_df <-
read.csv(file.path(project_folder,reference_genome_kmer_counts_file),
header=FALSE,sep="\t")
names(reference_genome_kmer_frequency_df) <- c("triplet","count")
reference_genome_total_count <-
sum(as.numeric(reference_genome_kmer_frequency_df$count))
reference_genome_kmer_frequency_df$rel_counts <-
as.numeric(reference_genome_kmer_frequency_df$count) /
reference_genome_total_count
target_capture_kmer_frequency_df <-
read.csv(file.path(project_folder, target_capture_kmer_counts_file),
header=FALSE, sep="\t")
names(target_capture_kmer_frequency_df) <- c("triplet","count")
target_capture_total_count <-
sum(as.numeric(target_capture_kmer_frequency_df$count))
target_capture_kmer_frequency_df$rel_counts <-
as.numeric(target_capture_kmer_frequency_df$count) /
target_capture_total_count
norms <- reference_genome_kmer_frequency_df$rel_counts /
target_capture_kmer_frequency_df$rel_counts
names(norms) <- reference_genome_kmer_frequency_df$triplet
return(norms)
}
#' Provide comprehensive correction factors for kmer content
#'
#' This function is analogous to
#' \code{\link[SomaticSignatures]{normalizeMotifs}}. If an analysis of
#' mutational signatures is performed on e.g. Whole Exome Sequencing (WES)
#' data, the signatures and exposures have to be adapted to the potentially
#' different kmer (trinucleotide) content of the target capture. The present
#' function takes as arguments paths to the used reference genome and target
#' capture file. It the extracts the sequence of the target capture by calling
#' \code{bedtools getfasta} on the system command prompt.
#' \code{run_kmer_frequency_normalization} then calls a custom made perl
#' script \code{kmer_frequencies.pl} also included in this package to count the
#' occurences of the tripletts in both the whole reference genome and the
#' created target capture sequence. These counts are used for normalization as
#' in \code{\link[SomaticSignatures]{normalizeMotifs}}. Note that
#' \code{\link[SomaticSignatures]{kmerFrequency}} provides a solution to
#' approximate kmer frequencies by random sampling. As opposed to that
#' approach, the function described here deterministically counts all
#' occurences of the kmers in the respective genome.
#'
#' @param in_ref_genome_fasta
#' Path to the reference genome fasta file used.
#' @param in_target_capture_bed
#' Path to a bed file containing the information on the used target capture.
#' May also be a compressed bed.
#' @param in_word_length
#' Integer number defining the length of the features or motifs, e.g. 3 for
#' tripletts or 5 for pentamers
#' @param project_folder
#' Path where the created files, especially the fasta file with the sequence
#' of the target capture and the count matrices, can be stored.
#' @param target_capture_fasta
#' Name of the fasta file of the target capture to be created if not yet
#' existent.
#' @param in_verbose
#' Verbose if \code{in_verbose=1}
#'
#' @return
#' A list with 2 entries:
#' \itemize{
#' \item \code{rel_cor}:
#' The correction factors after normalization as in
#' \code{\link{run_kmer_frequency_normalization}}
#' \item \code{abs_cor}:
#' The correction factors without normalization.
#' }
#'
#' @examples
#' NULL
#'
#' @seealso \code{\link[SomaticSignatures]{normalizeMotifs}}
#'
#' @export
#'
run_kmer_frequency_correction <-
function(in_ref_genome_fasta, in_target_capture_bed, in_word_length,
project_folder, target_capture_fasta="targetCapture.fa",
in_verbose=1) {
target_capture_kmer_counts_file <- "kmer_frequencies_in_targetCapture.csv"
reference_genome_kmer_counts_file <- "kmer_frequencies_in_ref.csv"
dir.create(project_folder,recursive=TRUE)
if(!file.exists(file.path(project_folder,
reference_genome_kmer_counts_file))){
if(in_verbose==1) {
cat("\nYAPS:::run_kmer_frequency_normalization::",
"reference_genome_kmer_counts_file doesn't exist yet. ",
"Create first.\n")
}
run_kmer_frequency_pl(in_ref_genome_fasta,in_word_length,project_folder,
reference_genome_kmer_counts_file)
} else {
if(in_verbose==1) {
cat("\nYAPS:::run_kmer_frequency_normalization::",
"reference_genome_kmer_counts_file already exists.\n")
}
}
if(!file.exists(file.path(project_folder,target_capture_kmer_counts_file))){
if(in_verbose==1) {
cat("\nYAPS:::run_kmer_frequency_normalization::",
"target_capture_kmer_counts_file doesn't exist yet. ",
"Create first.\n")
}
if(!file.exists(file.path(project_folder,target_capture_fasta))) {
if(in_verbose==1) {
cat("\nYAPS:::run_kmer_frequency_normalization::",
"target_capture_fasta doesn't exist yet. Create first.\n")
}
run_bedtools_getfasta(in_ref_genome_fasta,in_target_capture_bed,
project_folder,target_capture_fasta)
} else {
if(in_verbose==1) {
cat("\nYAPS:::run_kmer_frequency_normalization::",
"target_capture_fasta already exists.\n")
}
}
run_kmer_frequency_pl(file.path(project_folder,target_capture_fasta),
in_word_length,project_folder,
target_capture_kmer_counts_file)
} else {
if(in_verbose==1) {
cat("\nYAPS:::run_kmer_frequency_normalization::",
"target_capture_kmer_counts_file already exists.\n")
}
}
if(in_verbose==1) {
cat("\nYAPS:::run_kmer_frequency_normalization::Now read counts into ",
"data frames...\n")
}
reference_genome_kmer_frequency_df <-
read.csv(file.path(project_folder,reference_genome_kmer_counts_file),
header=FALSE,sep="\t")
names(reference_genome_kmer_frequency_df) <- c("triplet","count")
reference_genome_total_count <-
sum(as.numeric(reference_genome_kmer_frequency_df$count))
reference_genome_kmer_frequency_df$rel_counts <-
as.numeric(reference_genome_kmer_frequency_df$count) /
reference_genome_total_count
target_capture_kmer_frequency_df <-
read.csv(file.path(project_folder,target_capture_kmer_counts_file),
header=FALSE,sep="\t")
names(target_capture_kmer_frequency_df) <- c("triplet","count")
target_capture_total_count <-
sum(as.numeric(target_capture_kmer_frequency_df$count))
target_capture_kmer_frequency_df$rel_counts <-
as.numeric(target_capture_kmer_frequency_df$count) /
target_capture_total_count
rel_norms <-
reference_genome_kmer_frequency_df$rel_counts /
target_capture_kmer_frequency_df$rel_counts
names(rel_norms) <- reference_genome_kmer_frequency_df$triplet
abs_norms <- reference_genome_kmer_frequency_df$count /
target_capture_kmer_frequency_df$count
names(abs_norms) <- reference_genome_kmer_frequency_df$triplet
#out_df <- data.frame(rel_cor=rel_norms,abs_cor=abs_norms)
#rownames(out_df) <- reference_genome_kmer_frequency_df$triplet
#return(out_df)
out_list <- list(rel_cor=rel_norms,abs_cor=abs_norms)
return(out_list)
}
#' Wrapper function to annotate addition information
#'
#' Wrapper function to the perl script annotate_vcf.pl which annotates
#' data of a track stored in file_B (may be different formats) to called
#' variants stored in a vcf-like file_A.
#'
#' @param in_data_file
#' Path to the input vcf-like file to be annotated
#' @param in_anno_track_file
#' Path to the input file containing the annotation track
#' @param in_new_column_name
#' String indicating the name of the column to be created for annotation.
#' @param out_file
#' Path where the created files can be stored.
#' @param in_data_file_type
#' \code{custom} for vcf-like
#' @param in_anno_track_file_type
#' Type of the file \code{in_anno_track_file} containing the annotation
#' track.
#' @param in_data_CHROM.field
#' String indicating which column of \code{in_data_file} contains the
#' chromosome information.
#' @param in_data_POS.field
#' String indicating which column of \code{in_data_file} contains the
#' position information.
#' @param in_data_END.field
#' String indicating which column of \code{in_data_file} contains the
#' end information if regions are considered.
#'
#' @return Return zero if no problems occur.
#'
#' @examples
#' NULL
#'
#' @export
#'
run_annotate_vcf_pl <- function(in_data_file,
in_anno_track_file,
in_new_column_name,
out_file,
in_data_file_type="custom",
in_anno_track_file_type="bed",
in_data_CHROM.field="CHROM",
in_data_POS.field="POS",
in_data_END.field="POS") {
if(!check_perl()) {
cat("YAPSA:::run_annotate_vcf_pl::Error:unable to find path to perl")
return(1)
}
package_path <- system.file(package='YAPSA')
perl_command <- paste0("perl")
script_command <- file.path(package_path,"foreign","annotate_vcf.pl")
script_options_command <- paste0("-a ",in_data_file,
" --aFileType=",in_data_file_type,
" --aChromColumn ",in_data_CHROM.field,
" --aPosColumn ",in_data_POS.field,
" --aEndColumn ",in_data_END.field,
" -b ",in_anno_track_file,
" --bFileType=",in_anno_track_file_type,
" --columnName ",in_new_column_name)
output_command <- paste0("> ",file.path(out_file))
this_command <- paste(perl_command,script_command,script_options_command,
output_command,collapse=" ")
system(this_command,intern=TRUE)
return(0)
}
#' Build a gene list for a given pathway name
#'
#' @param in_string
#' Name or description of the pathway
#' @param in_organism
#' Name of the taxon to be searched in
#'
#' @return A character vector of gene names
#'
#' @examples
#' NULL
#' \dontrun{
#' species <- "hsa"
#' gene_lists_meta_df <- data.frame(
#' name=c("BER","NHEJ","MMR"),
#' explanation=c("base excision repair",
#' "non homologous end joining",
#' "mismatch repair"))
#' number_of_pathways <- dim(gene_lists_meta_df)[1]
#' gene_lists_list <- list()
#' for (i in seq_len(number_of_pathways)) {
#' temp_list <-
#' build_gene_list_for_pathway(gene_lists_meta_df$explanation[i],
#' species)
#' gene_lists_list <- c(gene_lists_list,list(temp_list))
#' }
#' gene_lists_list
#' }
#'
#' @seealso \code{\link[KEGGREST]{keggLink}}
#' @seealso \code{\link[KEGGREST]{keggFind}}
#' @seealso \code{\link{extract_names_from_gene_list}}
#'
#' @importFrom KEGGREST keggFind
#' @importFrom KEGGREST keggLink
#' @export
#'
build_gene_list_for_pathway <- function(in_string,in_organism) {
my_pathway <- keggFind("pathway",in_string)
my_name <- gsub("map",in_organism,names(my_pathway))
KEGG_gene_list <- keggLink(in_organism,my_name)
out_gene_list <- c()
for (i in seq_len(length(KEGG_gene_list))) {
out_gene_list[i] <- extract_names_from_gene_list(KEGG_gene_list,i)
}
return(out_gene_list)
}
#' Return gene names from gene lists
#'
#' @param in_KEGG_gene_list
#' Gene list to extract names from
#' @param l
#' Index of the gene to be extracted
#'
#' @return The gene name.
#'
#' @examples
#' NULL
#'
#' @seealso \code{\link[KEGGREST]{keggGet}}
#' @seealso \code{\link{build_gene_list_for_pathway}}
#'
#' @importFrom KEGGREST keggGet
#' @export
#'
extract_names_from_gene_list <- function(in_KEGG_gene_list,l) {
temp_gene <- keggGet(in_KEGG_gene_list[l])[[1]]
temp_gene_name <- strsplit(temp_gene$NAME,", ")[[1]][1]
return(temp_gene_name)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.