R/Identify_VCF_file.R

Defines functions identify_vcf_file

Documented in identify_vcf_file

#' identify_VCF_file
#' 
#' Identifies a cancer cell lines contained in a vcf file based 
#' on the pattern (start & length) of all contained mutations/ variations.
#' 
#' \code{identify_vcf_file} parses the vcf file and predicts 
#' the identity of the sample
#' 
#' @param write_xls Create identification results additionally 
#' as xls file for easier reading
#' @param vcf_file Input vcf file. Only one sample column allowed.
#' @param output_file Path of the output file. If blank, 
#' autogenerated as name of input file plus '_uniquorn_ident.tab' suffix.
#' @param ref_gen Reference genome version. All training sets are 
#' associated with a reference genome version. Default: GRCH37
#' @param mutational_weight_inclusion_threshold Include only mutations 
#' with a weight of at least x. Range: 0.0 to 1.0. 1= unique to CL. 
#' ~0 = found in many CL samples. 
#' @param minimum_matching_mutations The minimum amount of mutations that 
#' has to match between query and training sample for a positive prediction
#' @param top_hits_per_library Limit the number of significant similarities
#' per library to n (default 3) many hits. Is particularrly used in contexts
#' when heterogeneous query and reference CCLs are being compared.
#' @param manual_identifier Manually enter a vector of CL 
#' name(s) whose bed files should be created, independently from 
#' them passing the detection threshold
#' @param output_bed_file If BED files for IGV visualization should be 
#' created for the Cancer Cell lines that pass the threshold
#' @param verbose Print additional information
#' @param p_value Required p-value for identification.
#' Note that if you set the confidence score, the confidence score
#' overrides the p-value
#' @param confidence_score Cutoff for positive prediction between 0 and 100.
#' Calculated by transforming the p-value by -1 * log(p-value)
#' Note that if you set the confidence score, the confidence score
#' overrides the p-value
#' @param n_threads Number of threads to be used
#' @param write_results Write identification results to file
#' @import WriteXLS
#' @usage 
#' identify_vcf_file( 
#'     vcf_file,
#'     output_file,
#'     ref_gen,
#'     minimum_matching_mutations,
#'     mutational_weight_inclusion_threshold,
#'     write_xls,
#'     output_bed_file,
#'     top_hits_per_library,
#'     manual_identifier,
#'     verbose,
#'     p_value,
#'     confidence_score,
#'     n_threads,
#'     write_results
#' )
#' @examples 
#' HT29_vcf_file = system.file("extdata/HT29.vcf", package = "Uniquorn");
#' 
#' identification = identify_vcf_file(
#'     vcf_file = HT29_vcf_file, 
#'     verbose = FALSE,
#'     write_results = FALSE
#' )
#' @return R table with a statistic of the identification result
#' @export
identify_vcf_file = function(
    vcf_file,
    output_file = "",
    ref_gen = "GRCH37",
    minimum_matching_mutations = 0,
    mutational_weight_inclusion_threshold = 0.5,
    write_xls = FALSE,
    output_bed_file = FALSE,
    top_hits_per_library = 3,
    manual_identifier = "",
    verbose = TRUE,
    p_value = .05,
    confidence_score = NA,
    n_threads = 1,
    write_results = FALSE
){
    
    if ( ! is.na(confidence_score) ){
        message(
            "Confidence score has been set to ",
            as.character(confidence_score),
            ", overriding the p_value"
        )
        confidence_score[confidence_score < 0 ] = 0
        confidence_score[confidence_score > 100 ] = 100
        p_value = exp(-1 * confidence_score)
    }
    
    g_query = parse_vcf_file(
        vcf_file,
        ref_gen = ref_gen,
        library_name = ""
    )
    
    library_names = read_library_names(ref_gen = ref_gen)
    match_t <<- data.frame(
        CCL = as.character(),
        Matches = as.character(),
        Library = as.character()
    )
    
    message(
        "Limiting out to top ",
        as.character(top_hits_per_library),
        " hits per library."
    )
    
    for( library_name in library_names ){
        
        options(warn = -1)
        hit_list = match_query_ccl_to_database(
            g_query,
            ref_gen = ref_gen,
            library_name = library_name,
            mutational_weight_inclusion_threshold =
                mutational_weight_inclusion_threshold
        )
        options(warn = 0)
        #assign("match_t", rbind(match_t,hit_list),envir = parent.frame())
        match_t = rbind(match_t, hit_list)
        
        message(
            library_name, ": ",
            as.character(hit_list$CCL[1]),
            ", matching variants: ",
            as.character(hit_list$Matches[1])
        )
    }
    
    # statistics
    
    match_t = add_p_q_values_statistics(
        g_query,
        match_t,
        p_value,
        ref_gen = ref_gen,
        minimum_matching_mutations = minimum_matching_mutations,
        top_hits_per_library
    )
    match_t = add_penalty_statistics(
      match_t,
      minimum_matching_mutations
    )
    match_t$Identification_sig = match_t$P_value_sig & match_t$Above_Penalty
    match_t = match_t[order(as.double(match_t$P_values),decreasing = FALSE),]
    
    match_t$Matches = as.integer(match_t$Matches)
    match_t$All_variants = as.integer(match_t$All_variants)
    
    ### io stuff
    
    if(output_file == ""){
        output_file = str_replace(
            vcf_file,pattern = "(\\.vcf)|(\\.VCF)", ".ident.tsv" )
    }
    
    if (verbose){
        
        message("Candidate(s): ", paste0(unique( 
            as.character( match_t$CCL )[ match_t$Identification_sig  ]))
        )
        message("Storing information in table: ", output_file)
    }
       
    if (write_results){ 
        utils::write.table( 
            match_t,
            output_file,
            sep ="\t",
            row.names = FALSE,
            quote = FALSE
        )
    }
    
    if (output_bed_file & ( sum( as.logical(match_t$Q_value_sig) ) > 0 ))
        create_bed_file( 
            match_t, 
            vcf_fingerprint, 
            output_file, 
            manual_identifier
        )
    
    if ( !verbose )
        match_t = match_t[ ,
            !( colnames( match_t ) %in% c(
                "P_values",
                "Q_values",
                "Q_value_sig"
                )
            )
        ]

    
    if ( write_xls )

        WriteXLS::WriteXLS( 
            x = match_t,
            path.expand(
                output_file_xls
            ),
            row.names = FALSE
        )
    
    return( match_t )
}
RaikOtto/Younikorn documentation built on Oct. 18, 2022, 1:21 a.m.