R/consolidate_krause_etal_dna_data.R

#' Combines all the disparate pieces of information about the DNA dataset into 
#' a single dataframe
#'
#' @param decoded_barcodes_ff A dataframe of decoded barcodes from flipflop 
#' basecalled data
#' @param tailfindr_estimates_ff A dataframe of tailfindr estimates from 
#' flipflop basecalled data
#' @param transcript_start_info_ff A dataframe of transcript start information 
#' obtained from flipflop basecalled data
#' @param moves_in_tail_ff A dataframe of moves in the tail region obtained 
#' from flipflop basecalled data
#' @param tailfindr_estimates_st A dataframe of tailfindr estimates from 
#' data basecalled using the standard model
#' @param transcript_start_info_st A dataframe of transcript start information 
#' obtained from data basecalled with the standard model
#' @param moves_in_tail_st A dataframe of moves in the tail region obtained 
#' from data basecalled with the standard model
#' 
#' @importFrom magrittr %>% 
#' @importFrom magrittr %<>% 
#' @return A dataframe
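#' @examples
#' \dontrun{
#' # A minimal usage sketch. The .rds file names below are hypothetical
#' # placeholders: the inputs are assumed to be the intermediate dataframes
#' # produced by the upstream barcode-decoding, tailfindr, transcript-start,
#' # and tail-moves steps of this analysis.
#' dna_df <- consolidate_krause_etal_dna_data(
#'   decoded_barcodes_ff      = readRDS('decoded_barcodes_ff.rds'),
#'   tailfindr_estimates_ff   = readRDS('tailfindr_estimates_ff.rds'),
#'   transcript_start_info_ff = readRDS('transcript_start_info_ff.rds'),
#'   moves_in_tail_ff         = readRDS('moves_in_tail_ff.rds'),
#'   tailfindr_estimates_st   = readRDS('tailfindr_estimates_st.rds'),
#'   transcript_start_info_st = readRDS('transcript_start_info_st.rds'),
#'   moves_in_tail_st         = readRDS('moves_in_tail_st.rds')
#' )
#' }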
#' @export
#'
consolidate_krause_etal_dna_data <- function(decoded_barcodes_ff,
                                             tailfindr_estimates_ff,
                                             transcript_start_info_ff,
                                             moves_in_tail_ff,
                                             tailfindr_estimates_st,
                                             transcript_start_info_st,
                                             moves_in_tail_st) {
  decoded_barcodes_ff %<>% 
    # keep only valid reads -- those classified as polyA or polyT,
    # i.e. reads in which the GFP sequence is present
    dplyr::filter(read_type == 'polyA' | read_type == 'polyT') %>% 
    # remove reads that are shorter than 700 bp or longer than 900 bp;
    # in such reads there is a high chance that the barcode is either not 
    # present or that the read is fused with another read -- making 
    # the barcode -- if detected -- unreliable
    dplyr::filter(read_length >= 700 & read_length <= 900) %>% 
    # remove reads in which two or more barcodes have an equally
    # high alignment score. We don't need such ambiguous reads in our
    # analysis
    dplyr::filter(!barcode_tie) %>% 
    # remove reads in which the barcode didn't pass the threshold
    dplyr::filter(barcode_passed_threshold) %>%  
    # remove slip oligo (barcode == 999)
    dplyr::filter(barcode != 999) %>% 
    # keep only the columns that are needed downstream
    dplyr::select(read_id, read_type, barcode, replicate, file_path)
    
  tailfindr_estimates_ff %<>% 
    # filter out reads for which tailfindr's tail length
    # estimates are not available
    dplyr::filter(!is.na(tail_length)) %>% 
    # take only valid reads
    dplyr::filter(tail_is_valid) %>% 
    # remove irrelevant columns
    dplyr::select(-read_type, -tail_is_valid, -file_path, -replicate) %>% 
    # rename variables:
    dplyr::rename(tail_start_ff = tail_start,
                  tail_end_ff = tail_end,
                  samples_per_nt_ff = samples_per_nt,
                  tail_length_ff = tail_length)
  
  tailfindr_estimates_st %<>% 
    # filter out reads for which tailfindr's tail length
    # estimates are not available
    dplyr::filter(!is.na(tail_length)) %>% 
    # take only valid reads
    dplyr::filter(tail_is_valid) %>% 
    # remove irrelevant columns
    dplyr::select(-read_type, -tail_is_valid, -file_path, -replicate) %>% 
    # rename variables:
    dplyr::rename(tail_start_st = tail_start,
                  tail_end_st = tail_end,
                  samples_per_nt_st = samples_per_nt,
                  tail_length_st = tail_length)
    
  transcript_start_info_st %<>% 
    dplyr::select(-file_path) %>% 
    dplyr::rename(transcript_alignment_start_st = transcript_alignment_start)
  
  transcript_start_info_ff %<>% 
    dplyr::select(-file_path) %>% 
    dplyr::rename(transcript_alignment_start_ff = transcript_alignment_start)
  
  # join all seven datasets on read_id
  df <- dplyr::left_join(tailfindr_estimates_ff, tailfindr_estimates_st, by = "read_id") %>% 
    dplyr::left_join(decoded_barcodes_ff, by = "read_id") %>% 
    dplyr::left_join(transcript_start_info_st, by = "read_id") %>% 
    dplyr::left_join(transcript_start_info_ff, by = "read_id") %>% 
    dplyr::left_join(moves_in_tail_st, by = "read_id") %>% 
    dplyr::left_join(moves_in_tail_ff, by = "read_id") 
  
  # remove rows with missing tail length predictions
  df <- df[!is.na(df$tail_length_ff), ]
  df <- df[!is.na(df$tail_length_st), ]
  # remove rows with missing barcode predictions
  df <- df[!is.na(df$barcode), ]
  df <- dplyr::distinct(df)
  
  # convert categorical variables to factor columns
  df %<>% dplyr::mutate(read_type = as.factor(read_type)) %>% 
    dplyr::mutate(barcode = as.factor(barcode)) %>% 
    dplyr::mutate(replicate = as.factor(replicate)) 
  
  # add a sequencing kit column based on the replicate
  df %<>% dplyr::mutate(kit = ifelse(replicate == 1, 'SQK-LSK108', 'SQK-LSK109')) 
  
  return(df)
}