R/processing.R

Defines functions readprofile trim_profile fitpoly processFile processFiles readData

Documented in processFile processFiles readData

# Read a profile file
readprofile = function(filename) {
  # Figure out ncol
  row1 = scan(filename, nlines = 1)
  # Read profile as matrix
  prof = matrix(scan(filename), ncol = length(row1), byrow = TRUE)
  # We only use one channel (and up to 4 may be present)
  if(grepl("ch0", filename)) {
    #prof = plyr::alply(prof, 2, function(x) x)
    prof = purrr::map(1:ncol(prof), ~ prof[,.x])
  } else {
    prof = purrr::map(seq(1,length(row1), by = 4), ~ prof[,.x])
    #prof = plyr::alply(prof[,seq(1,length(row1), by = 4)], 2, function(x) x)
  }
  # Remove he entries with 0 that are actually outside of the profile
  #prof = plyr::llply(prof, function(x) {x = x[-(1:5)]; x = x[x > 0]})
  prof = purrr::map(prof, trim_profile)
}

# Trim a profile so that we only keep non-zero values and remove the initial bit
trim_profile = function(x) {
  x = x[-(1:5)]
  x = x[x > 0]
}

# Fit a 8th degree polynomial to the profile of each particle in the file
# Extract from the polynomial a series of features that are used for classification
fitpoly = function(y) {
  L = length(y)
  if(L < 5) return(c(P = max(y), Px = 0, C = 0))
  P = max(y) - min(y)
  x = seq(0.5, L - 0.5, by = 1)
  ry = y - min(y)
  model = lm(ry~x + I(x^2) + I(x^3) + I(x^4) + I(x^5) + I(x^6) + I(x^7) +
               I(x^8))
  EllipArea = pi*P*L/2/2
  coefs = coef(model)
  Area = sum(coefs[1] + coefs[2]*x + coefs[3]*x^2+ coefs[4]*x^3 + coefs[5]*x^4 +
               coefs[6]*x^5 + coefs[7]*x^6+ coefs[8]*x^7+ coefs[9]*x^8)
  out = list(P = P, Px = (which.max(y) + 0.5)/L, C = Area/EllipArea)
}

#' Process a file with particle profiles and save a new file with features for
#' classification
#'
#' @param filename Name of the file with raw data from the flow cytometer
#'
#' @return The function does not return anything but rather an fst file (from the \code{fst} package)
#' is saved with all features extracted from the profile of each particle.
#' @export
processFile = function(filename) {
  cat("Processing file ", filename, "\n")
  prof = readprofile(filename)
  coefs = furrr::future_map_dfr(prof, fitpoly)
  fst::write_fst(coefs, sub(".txt",".fst",filename), compress = 100)
}


#' Process files with particle profiles in a given folder
#'
#' @param dir Directory where the files with particles profiles is located
#'
#' @return The function does not return anything but rather an fst file (from the \code{fst} package)
#' is saved for every file with profile data, in the same directory as where the profile data is
#' located.
#'
#' @export
processFiles = function(dir = getwd()) {
  curdir = getwd()
  setwd(dir)
  on.exit(setwd(curdir))
  future::plan(future::multiprocess)
  # Figure out which are the files in the directory that end in "prf.txt"
  all_files = list.files()
  # Choose those that contain the pattern "_prf.txt"
  files = grep("_prf.txt", all_files, fixed = TRUE, value = TRUE)
  # For each file, read the profile, fit poly and save the matrix in a fst file
  purrr::map(files, processFile)
  setwd(curdir)
  return(invisible())
}


#' Read the data associated to a sorted sample
#'
#' @param mainfile The path to the file with time of fly and other summary data
#' @param profile The path to the fst files that contains the features extracted from the profiles
#'
#' @return Returns a data.frame with all the features calculated from the data. Each row corresponds to a particle
#'
#' @export
readData = function(mainfile, profile) {

  # Read the files with features extracted from the profile
  profile_data = fst::read_fst(profile) %>%
                    tibble::as_tibble(.)

  # Types of columns for the raw main file from the BioSorter
  types_of_cols = readr::cols(
    .default = readr::col_double(),
    Plate = readr::col_character(),
    `Source well` = readr::col_character(),
    Clog = readr::col_character(),
    `In Regions` = readr::col_character(),
    `PC Extinction` = readr::col_character(),
    `PC Green` = readr::col_character(),
    `PC Yellow` = readr::col_character(),
    `PC Red` = readr::col_character())

  # Read the raw main file from the BioSorter
  main_data = readr::read_delim(mainfile, delim = "\t", col_types = types_of_cols, n_max = nrow(profile_data)) %>%
                    tibble::as_tibble(.) %>%
                    dplyr::filter(!is.na(Id), !is.na(Time)) %>%
                    dplyr::select(-X27)
  all_data = dplyr::bind_cols(main_data[1:nrow(profile_data),], profile_data)
  dplyr::select(all_data, TOF, Extinction, Green, Yellow, Red, P, Px, C)
}



#' Read and process seed sample (with optional complementary waste sample)
#'
#' @param main_file Name of the file with measurements of extinction and time of flight.
#' @param profile_file Name of the file with features extracted from particle profiles, as generated by function [processFiles()].
#' @param datadir Directory where the files are located (by default, the current working directory).
#' @param main_waste_file (Optional) Name of the file with measurements of extinction and time of flight for
#' non-seed particles.
#' @param profile_waste_file (Optional) Name of the file with profile features for non-seed particles.
#' @param calibration A vector with intercept and slope of the linear calibration between time of flight and
#' particle size.
#' @param clean (Optional) Whether to use clustering to remove dust particles from the dataset (default: `FALSE`).
#' @param cleanguess (Optional) Initial guess for value for `Size` to separate seeds from dust particles
#'
#' @details The column `Class` indicates whether the particle is a seed (`S`) or a non-seed/waste material (`W`).
#'
#' @return A tibble with all the features that can be used for training or testing classification algorithms.
#' @export
getTrainingData = function(main_file, profile_file, datadir = "",
                   main_waste_file = NULL, profile_waste_file = NULL,
                   calibration = c(38.325, 0.185), clean = FALSE, cleanguess = 200) {

  # Retrieve data from the main_file and profile_file
  data = readData(file.path(datadir, main_file),
                  file.path(datadir, profile_file)) %>%
          dplyr::mutate(Class = "S")

  # Optionally clean the sample
  if(clean) {
    data = extractfeatures(data, calibration)
    data = cleanTrainingSample(data, guess = cleanguess)
  }


  # If a waste file is added, append it to the data
  if(!is.null(main_waste_file)) {
    waste = readData(file.path(datadir, main_waste_file),
                     file.path(datadir, profile_waste_file)) %>%
            dplyr::mutate(Class = "W")
    if(clean) waste = extractfeatures(waste, calibration)
    data = dplyr::bind_rows(data, waste)
  }

  # Perform transformations (feature engineering) if not clean
  if(!clean) data = extractfeatures(data, calibration)

  # Select features to be used for classification
   dplyr::select(data, Extinction, rGreen, rYellow, P, Px, C, Size, Class)
}

# Calculate the features for classification from the output
extractfeatures = function(data, calibration) {
  dplyr::mutate(data,
                Size =  calibration[2]*TOF + calibration[1],
                FluorTotal = (Green + Red + Yellow)/3,
                rGreen = ifelse(FluorTotal > 0, Green/FluorTotal, 0),
                rRed = ifelse(FluorTotal > 0, Red/FluorTotal, 0),
                rYellow = ifelse(FluorTotal > 0, Yellow/FluorTotal, 0))
}


#' Use k-means clustering to separate waste from seeds using a size threshold as initial guess
#'
#' @param data Tibble with the data for classification, as returned by the function [getData()].
#' @param guess Initial guess for the thresold that separates seeds from dust particles in micrometers (default: 200).
#'
#' @return The same as `data` but some of the particles are now reclassified as non-seed waste material.
#' @export
cleanTrainingSample = function(data, guess = 200) {
  seedData = dplyr::filter(data, Class == "S") %>%
              dplyr::select(-Class)
  wasteData = dplyr::filter(data, Class == "W")
  naive_dust = dplyr::filter(seedData, Size < guess)
  naive_seed = dplyr::filter(seedData, Size > guess)
  centers_dust = colMeans(naive_dust)
  centers_seed = colMeans(naive_seed)
  clusters = kmeans(seedData, centers = rbind(centers_dust, centers_seed), iter.max = 1e3)
  seedData = dplyr::mutate(seedData, Class = ifelse(clusters$cluster == 2, "S", "W"))
  dplyr::bind_rows(seedData, wasteData)
}



#'Create classification task from a dataset
#'
#' @param data Tibble with the data for classification, as returned by the function [getData()] or [cleanTrainingSample()].
#' @param id Name by which the data can be identified.

#'
#' @return A binary classification task (class [mlr::Task]).
#' @export
createTrainingTask = function(data, id = "cleanseeds") {
  task = mlr::makeClassifTask(id = id, data = data, target = "Class", positive = "S")

  return(task)
}




#' Read and process seed sample to be classified
#'
#' @param main_file Name of the file with measurements of extinction and time of flight.
#' @param profile_file Name of the file with features extracted from particle profiles, as generated by function [processFiles()].
#' @param datadir Directory where the files are located (by default, the current working directory).
#' @param calibration A vector with intercept and slope of the linear calibration between time of flight and
#' particle size.
#'
#' @return A tibble with all the features that can be used for identifying the type of particle by classification algorithms.
#' @export
getPredictionData = function(main_file, profile_file, datadir = "", calibration = c(38.325, 0.185)) {

  # Retrieve data from the main_file and profile_file
  data = readData(file.path(datadir, main_file), file.path(datadir, profile_file))

  # Perform transformations (feature engineering)
  data = extractfeatures(data, calibration)

  # Select features required by
  dplyr::select(data, Extinction, rGreen, rYellow, P, Px, C, Size)
}
AleMorales/SeedSorter documentation built on Feb. 12, 2020, 4:13 a.m.