#' identify_VCF_file
#'
#' Identifies a cancer cell lines contained in a vcf file based
#' on the pattern (start & length) of all contained mutations/ variations.
#'
#' \code{identify_vcf_file} parses the vcf file and predicts
#' the identity of the sample
#'
#' @param write_xls Create identification results additionally
#' as xls file for easier reading
#' @param vcf_file Input vcf file. Only one sample column allowed.
#' @param output_file Path of the output file. If blank,
#' autogenerated as name of input file plus '_uniquorn_ident.tab' suffix.
#' @param ref_gen Reference genome version. All training sets are
#' associated with a reference genome version. Default: GRCH37
#' @param mutational_weight_inclusion_threshold Include only mutations
#' with a weight of at least x. Range: 0.0 to 1.0. 1= unique to CL.
#' ~0 = found in many CL samples.
#' @param minimum_matching_mutations The minimum amount of mutations that
#' has to match between query and training sample for a positive prediction
#' @param top_hits_per_library Limit the number of significant similarities
#' per library to n (default 3) many hits. Is particularrly used in contexts
#' when heterogeneous query and reference CCLs are being compared.
#' @param manual_identifier Manually enter a vector of CL
#' name(s) whose bed files should be created, independently from
#' them passing the detection threshold
#' @param output_bed_file If BED files for IGV visualization should be
#' created for the Cancer Cell lines that pass the threshold
#' @param verbose Print additional information
#' @param p_value Required p-value for identification.
#' Note that if you set the confidence score, the confidence score
#' overrides the p-value
#' @param confidence_score Cutoff for positive prediction between 0 and 100.
#' Calculated by transforming the p-value by -1 * log(p-value)
#' Note that if you set the confidence score, the confidence score
#' overrides the p-value
#' @param n_threads Number of threads to be used
#' @param write_results Write identification results to file
#' @import WriteXLS
#' @usage
#' identify_vcf_file(
#' vcf_file,
#' output_file,
#' ref_gen,
#' minimum_matching_mutations,
#' mutational_weight_inclusion_threshold,
#' write_xls,
#' output_bed_file,
#' top_hits_per_library,
#' manual_identifier,
#' verbose,
#' p_value,
#' confidence_score,
#' n_threads,
#' write_results
#' )
#' @examples
#' HT29_vcf_file = system.file("extdata/HT29.vcf", package = "Uniquorn");
#'
#' identification = identify_vcf_file(
#' vcf_file = HT29_vcf_file,
#' verbose = FALSE,
#' write_results = FALSE
#' )
#' @return R table with a statistic of the identification result
#' @export
identify_vcf_file = function(
vcf_file,
output_file = "",
ref_gen = "GRCH37",
minimum_matching_mutations = 0,
mutational_weight_inclusion_threshold = 0.5,
write_xls = FALSE,
output_bed_file = FALSE,
top_hits_per_library = 3,
manual_identifier = "",
verbose = TRUE,
p_value = .05,
confidence_score = NA,
n_threads = 1,
write_results = FALSE
){
if ( ! is.na(confidence_score) ){
message(
"Confidence score has been set to ",
as.character(confidence_score),
", overriding the p_value"
)
confidence_score[confidence_score < 0 ] = 0
confidence_score[confidence_score > 100 ] = 100
p_value = exp(-1 * confidence_score)
}
g_query = parse_vcf_file(
vcf_file,
ref_gen = ref_gen,
library_name = ""
)
library_names = read_library_names(ref_gen = ref_gen)
match_t <<- data.frame(
CCL = as.character(),
Matches = as.character(),
Library = as.character()
)
message(
"Limiting out to top ",
as.character(top_hits_per_library),
" hits per library."
)
for( library_name in library_names ){
options(warn = -1)
hit_list = match_query_ccl_to_database(
g_query,
ref_gen = ref_gen,
library_name = library_name,
mutational_weight_inclusion_threshold =
mutational_weight_inclusion_threshold
)
options(warn = 0)
#assign("match_t", rbind(match_t,hit_list),envir = parent.frame())
match_t = rbind(match_t, hit_list)
message(
library_name, ": ",
as.character(hit_list$CCL[1]),
", matching variants: ",
as.character(hit_list$Matches[1])
)
}
# statistics
match_t = add_p_q_values_statistics(
g_query,
match_t,
p_value,
ref_gen = ref_gen,
minimum_matching_mutations = minimum_matching_mutations,
top_hits_per_library
)
match_t = add_penalty_statistics(
match_t,
minimum_matching_mutations
)
match_t$Identification_sig = match_t$P_value_sig & match_t$Above_Penalty
match_t = match_t[order(as.double(match_t$P_values),decreasing = FALSE),]
match_t$Matches = as.integer(match_t$Matches)
match_t$All_variants = as.integer(match_t$All_variants)
### io stuff
if(output_file == ""){
output_file = str_replace(
vcf_file,pattern = "(\\.vcf)|(\\.VCF)", ".ident.tsv" )
}
if (verbose){
message("Candidate(s): ", paste0(unique(
as.character( match_t$CCL )[ match_t$Identification_sig ]))
)
message("Storing information in table: ", output_file)
}
if (write_results){
utils::write.table(
match_t,
output_file,
sep ="\t",
row.names = FALSE,
quote = FALSE
)
}
if (output_bed_file & ( sum( as.logical(match_t$Q_value_sig) ) > 0 ))
create_bed_file(
match_t,
vcf_fingerprint,
output_file,
manual_identifier
)
if ( !verbose )
match_t = match_t[ ,
!( colnames( match_t ) %in% c(
"P_values",
"Q_values",
"Q_value_sig"
)
)
]
if ( write_xls )
WriteXLS::WriteXLS(
x = match_t,
path.expand(
output_file_xls
),
row.names = FALSE
)
return( match_t )
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.