R/read_hdx.R

Defines functions read_hdx

Documented in read_hdx

#' Read HDX-MS data file
#' 
#' @description Imports data from a HDX-MS file and validates its content.
#' 
#' @importFrom tools file_ext
#' @importFrom readxl read_excel
#' @importFrom readr read_csv read_tsv parse_logical parse_integer parse_double parse_character 
#' cols col_character
#' @importFrom data.table fread
#' @importFrom dplyr %>%
#' 
#' @param filename a file supplied by the user. Formats allowed: .csv, .xlsx and .xls.
#' 
#' @details First version accepts files produced by DynamX 3.0 and 2.0 in `cluster data` format. 
#' The function checks if all necessary columns are provided in correct format. The file must 
#' include at least two repetitions of the measurement for the uncertainty to be calculated.
#' 
#' @return \code{dat} - a \code{\link{data.frame}} with validated content.
#' 
#' @seealso \code{\link{calculate_kinetics}} \code{\link{calculate_state_deuteration}} \code{\link{plot_coverage}} \code{\link{plot_position_frequency}}
#' \code{\link{prepare_dataset}} \code{\link{quality_control}} \code{\link{reconstruct_sequence}}
#' 
#' @examples
#' # read example data
#' head(read_hdx(system.file(package = "HaDeX", 
#'                      "HaDeX/data/KD_180110_CD160_HVEM.csv")))
#' 
#' @export read_hdx

read_hdx <- function(filename){
  
  dat <- switch(file_ext(filename),
                "csv" = fread(filename),
                # "csv" = read_csv(filename, col_names = TRUE, col_types = cols(Modification = col_character(), 
                #                                                               Fragment = col_character())),
                # "tsv" = read_tsv(filename, col_names = TRUE, col_types = cols(Modification = col_character(), 
                #                                                               Fragment = col_character())),
                "xlsx" = read_excel(filename),
                "xls" = read_excel(filename))
  
  #check for dynamx2 file
  colnames_v_2 <- c("Protein", "Start", "End", "Sequence", 
                    "Modification", "Max Exchangers", 
                    "MHP", "State", "Exposure", "File", 
                    "z", "RT", "Inten", "Center")
  
  if(all(colnames_v_2 %in% colnames(dat))){
    dat <- upgrade_2_to_3(dat)
  }
  
  colnames_v_3 <- c("Protein", "Start", "End", "Sequence", 
                  "Modification", "Fragment", "MaxUptake", 
                  "MHP", "State", "Exposure", "File", "z", 
                  "RT", "Inten", "Center") 
  
  colnames_presence <- colnames_v_3 %in% colnames(dat)
  
  if(!all(colnames_presence)) {
    err_message <- paste0(ifelse(sum(!colnames_presence) > 0, 
                                 "A supplied file does not have required columns: ", 
                                 "A supplied file does not have the required column "),
                          paste0(colnames_v_3[!colnames_presence], collapse = ", "), ".")
    stop(err_message)
  }
  
  no_replicates <- dat %>%
    group_by(Protein, Start, End, Sequence, Modification, State, Exposure) %>%
    summarise(n_rep = length(unique(File))) %>%
    ungroup(.) %>%
    summarize(avg_rep = mean(n_rep))
    
  if (!(no_replicates[[1]] > 2)) {
    err_message <- "There is no sufficient number of replicates."
  } 
  
  dat[["Exposure"]] <- round(dat[["Exposure"]], 3)
  
  dat <- mutate(dat, State = paste0(State, ifelse(!is.na(Modification), paste0(" - ", Modification), ""))) 
  
  dat
  
}

upgrade_2_to_3 <- function(dat){
  
  colnames(dat)[6] <- "MaxUptake"
  dat[["Fragment"]] <- NA
  
  dat
  
}

Try the HaDeX package in your browser

Any scripts or data that you put into this service are public.

HaDeX documentation built on Aug. 12, 2021, 5:20 p.m.