R/Statistics_identification.R

Defines functions add_penalty_statistics add_p_q_values_statistics

Documented in add_penalty_statistics add_p_q_values_statistics

#' add_p_q_values_statistics
#' 
#' A hypergeometric distribution-assumption allows to calculate the 
#' p-values for a significant or non-significant overlap in this function
#' 
#' \code{add_p_q_values_statistics} Calculates the p-values
#' 
#' @param g_query IRanges object that contains the query variants
#' @param match_t A table that contains the nubmber of matching variants
#' @param p_value Threshold for the significance p-value
#' @param ref_gen Reference genome version
#' @param minimum_matching_mutations Manual lower amount of matching
#' mutations require for a significant match between a query and a 
#' reference
#' @param top_hits_per_library limits significant similarities to the first 
#' n hits
#' @import stringr
#' @importFrom stats pbinom
#' @return R table with a statistic 
add_p_q_values_statistics = function(
    g_query,
    match_t,
    p_value,
    ref_gen,
    minimum_matching_mutations,
    top_hits_per_library
){
    
    library_names = read_library_names(ref_gen = ref_gen)
    p_values = rep(1.0, nrow(match_t))
    balls_in_query = length(g_query$Member_CCLs)
    sig_vec <<- c()
    
    for (library_name in library_names){
        
        index_library = which( match_t$Library == library_name  )
        white_balls_possible = as.integer(as.character(
            match_t$All_variants))[index_library]
        
        if( length(white_balls_possible) == 0){
            stop("There is only one CCL in the custom set.",  
                "A confidence score calculation is only ", 
                "possible with more than one sample!")
        }
        
        # matching features
        white_balls_found = as.integer(
            as.character(match_t$Matches))[index_library]
        white_balls_found_least_one = sum(white_balls_found > 0)
        
        # not matching features
        black_balls = sum(white_balls_possible) - white_balls_possible
        
        # estimation of 'noise'
        background_cls_traces = sum(
            white_balls_found >= mean(white_balls_found)
        )
        
        # here, we apply a heuristic on the likelihood to match
        likelihood_found = white_balls_possible / sum(white_balls_possible)

        q = white_balls_found - 1
        q[q < 0]   = 0
        
        # here we calculate the p-value for the amount of matches
        p_values_panel = as.double(stats::pbinom(
            q = q,
            size = sum(white_balls_found),
            p = likelihood_found,
            lower.tail = FALSE 
        ))
        
        p_values_panel[white_balls_found == white_balls_possible] = 0.0
        p_values_panel[p_values_panel >= 1.0] = 1.0
        p_values_panel[white_balls_found == 0] = 1.0
        p_values_panel[white_balls_found < minimum_matching_mutations] = 1.0
        
        p_values[index_library] = p_values_panel
        n_hits_vec = order( as.double( p_values[index_library] ) )
        n_hits_vec[as.integer(n_hits_vec) >
            as.integer(top_hits_per_library)] = FALSE
        n_hits_vec[as.integer(n_hits_vec) > 0 ] = TRUE
        n_hits_vec = as.logical(n_hits_vec)
        
        sig_vec <<- c(sig_vec, n_hits_vec)
    }
    
    match_t$P_values = p_values
    match_t$P_value_sig = match_t$P_values <= p_value
    match_t$P_value_sig = match_t$P_value_sig & sig_vec

    return(match_t)
}

#' add_penalty_statistics
#' 
#' Add penalty statistics to results
#' 
#' @param match_t object that contains the matching variants
#' @param minimum_matching_mutations a numerical giving the minimum amount of
#'  mutations that has to match between query and training sample for a 
#'  positive prediction
#' @importFrom stats integrate
#' @importFrom stats pbeta
#' @return The updated statistics
add_penalty_statistics = function(match_t, minimum_matching_mutations){

    matching_variants = as.integer(match_t$Matches)
    minimum_matching_mutations = min(matching_variants)
    matching_variants = matching_variants[ matching_variants > 0]
    
    if ( length(matching_variants) == 0 ){
        
        # initalize penality values -> defensive programming
        penalty = 0
        penalty_mutations = 0
        
    } else{ 
        
        # these are the two parameter relevant for generalization
        mean_match = mean( matching_variants )
        max_match  = max( matching_variants )
        
        # here, we utilize the conjugate prior beta distribution
        penalty = integrate(
            f = pbeta,
            0,
            1,
            max_match,
            max_match/ mean_match,
            stop.on.error = FALSE
            )$value
        
        # this is the precise amount of penality applied    
        penalty_mutations = ceiling(
            mean_match + (max_match * penalty) / (1 - penalty)
        )
    }
    
    if ( ( minimum_matching_mutations == 0 ) & ( penalty != 0.0) ){
        
        message(
            "Correcting the background due to traces of random, ", 
            "scale-freeness amounts of matches, requiring at least ", 
            as.character(penalty_mutations), " variants to match."
        )
    }
    match_t$Above_Penalty = 
        as.integer(as.character(match_t$Matches)) > penalty_mutations
    
    return(match_t)
}
RaikOtto/Uniquorn documentation built on Oct. 18, 2022, 1:21 a.m.