R/get_kld.R

get_KLD = function(x){
  # Normalise read count within each sample AND position, such that
  # sum(p_count_pos) = 1
  # sum(q_input_mean_pos) = 1
  x = x %>%
    group_by(sample, mut_pos) %>%
    mutate(p_count_1_pos = count.1 / sum(count.1),
           q_input_1_pos = input.1 / sum(input.1),
           q_input_2_pos = input.2 / sum(input.2),
           q_input_3_pos = input.3 / sum(input.3)) %>%
    ungroup %>%
    rowwise %>%
    mutate(q_input_pos_mean=mean(c(q_input_1_pos,q_input_2_pos,q_input_3_pos)))%>%
    ungroup

  # Check the per sample per position normalisation i.e. that:
  # sum(p_count_pos) = 1
  # sum(q_input_mean_pos) = 1
  # Is indeed true
  check_sum = x %>%
    group_by(sample,mut_pos) %>%
    summarise(q_sum = sum(q_input_pos_mean), p_sum = sum(p_count_1_pos)) %>%
    ungroup %>%
    select(q_sum,p_sum) %>%
    as.matrix %>%
    as.vector %>%
    as.character %>% # Due to rounding
    unique
  if( check_sum != "1" ){
    stop("count/input normalisation error!")
  }

  # Calculate the Kullback-Leibler Divergence
  # 1. Per position, calculate the contribution to KLD per residue
  x = x %>%
    mutate(KLD_res = p_count_1_pos * log2( p_count_1_pos / q_input_pos_mean ))
  # 2. Per position calculate the KLD
  tmp = x %>% group_by(sample, mut_pos) %>%
    summarise(KLD_pos = sum(KLD_res)) %>% ungroup
  x = x %>% full_join(tmp, by = c('sample','mut_pos'))
  rm(tmp)
  # 3. Calculate the height of the letters for the KLD logo
  x = x %>%
    mutate(KLD_height = p_count_1_pos * KLD_pos * sign(KLD_res))

  # Done, return!
  return(x)
}
leonjessen/Barracoda2viz documentation built on May 28, 2019, 12:59 p.m.