# R/read_MAVEN.R

##' Read raw MAVEN output
##'
##' Reads a peak table and a sample key from CSV files, reshapes the peak
##' table to long format (one row per peak per sample) and joins the sample
##' key onto it.
##'
##' @param data.fn Filename (including path) of the data file
##' @param key.fn Filename (including path) of the key file
##' @param id.cols Quosures (as created by `rlang::quos()`) or a character vector naming the "id.columns" of the data file. Every other column is treated as a sample column. Names that are absent from the data file are ignored.
##' @param cols Currently unused
##' @param numeric.sample.names CURRENTLY NOT IMPLEMENTED. If sample names begin with numbers, this must be set TRUE. If SOME but not ALL sample names are numeric, this will fail
##' @param sample.name Quosure naming the column of the sample key that holds the sample names. The melted data uses the same column name.
##' @param rep.name Quosure naming the "replicate" variable in the sample key, if it exists # IMPLEMENT REP FINDER
##' @return A list with elements `raw_data` (the long-format data joined with the sample key) and `exp.var` (the value returned by `exp_var_finder()` for the sample key)
##' @export
read_MAVEN <- function(data.fn,
                       key.fn,
                       id.cols = rlang::quos(label, metaGroupId, groupId, goodPeakCount, medMz, medRt, maxQuality, note, compound, compoundId, expectedRtDiff, ppmDiff, parent),
                       cols = rlang::quos(mpg, cyl),
                       numeric.sample.names = TRUE,
                       sample.name = rlang::quo(sample),
                       rep.name = rlang::quo(replicate)) {

  # Resolve the tidy-eval arguments to plain column-name strings once, so the
  # rest of the function can use ordinary `[[`-style indexing.
  sample_col <- rlang::quo_name(sample.name)
  rep_col <- rlang::quo_name(rep.name)
  id_names <- if (is.character(id.cols)) {
    id.cols
  } else {
    vapply(id.cols, rlang::quo_name, character(1), USE.NAMES = FALSE)
  }

  # Read the raw data
  d <- readr::read_csv(data.fn, na = "") # In the future this should ensure that numeric columns are numeric

  # Read the sample key
  key <- readr::read_csv(key.fn)
  if (!(sample_col %in% names(key))) {
    stop("The sample key has no column named '", sample_col, "'.",
         call. = FALSE)
  }

  # Get the 'experimental variables' (all variables in key other than sample name)
  exp.vars <- exp_var_finder(key)

  # Sample names in the melted data come from column headers and are therefore
  # character; coerce the key to match so that the join below cannot fail on a
  # type mismatch when all sample names look numeric.
  key[[sample_col]] <- as.character(key[[sample_col]])

  # Strip out blank lines from the sample key
  key <- key[!is.na(key[[sample_col]]) & key[[sample_col]] != "X", , drop = FALSE]

  # Melt the data frame: gather every column that is NOT an id column
  sample_cols <- setdiff(names(d), id_names)
  if (length(sample_cols) == 0) {
    stop("No sample columns found: every column of the data file is listed in id.cols.",
         call. = FALSE)
  }
  dm <- tidyr::gather(d, key = !!sample_col, value = "ion.count",
                      !!!rlang::syms(sample_cols))

  # Remove the rows where the measurement is NA
  dm <- dm[!is.na(dm[["ion.count"]]), , drop = FALSE]

  # Check for mismatch between key names and sample names, in both directions
  data_samples <- unique(dm[[sample_col]])
  key_samples <- unique(key[[sample_col]])
  missing_from_key <- setdiff(data_samples, key_samples)
  missing_from_data <- setdiff(key_samples, data_samples)
  if (length(missing_from_key) > 0 || length(missing_from_data) > 0) {
    warning("The samples in the raw dataset and in the sample key do not match.\n",
            "The following are missing from the sample key:\n",
            paste(missing_from_key, collapse = " "),
            "\nThe following are missing from the dataset:\n",
            paste(missing_from_data, collapse = " "),
            call. = FALSE)
  }

  # Merge key with data
  d_merge <- dplyr::full_join(dm, key, by = sample_col)

  # Set replicate value to factor
  # NOTE(review): assumes exp_var_finder() returns a character vector of
  # column names - confirm against its definition.
  if (rep_col %in% exp.vars && rep_col %in% names(d_merge)) {
    d_merge[[rep_col]] <- as.factor(d_merge[[rep_col]])
  }

  list(raw_data = d_merge, exp.var = exp.vars)
}
# adsteen/metafluxr documentation built on May 20, 2019, 1:27 p.m.