R/do.align.R

Defines functions do.align

Documented in do.align

#' do.align - Function to align multiple batches of a dataset.
#'
#' This function allows you to align multiple batches of a dataset, using either a "Quantile" approach, or using "CytoNorm"
#'
#' @usage do.align(ref.dat, target.dat, batch.col, align.cols, method, goal, nQ, Qmin, Qmax, write.ref.fcs, write.target.fcs)
#'
#' @param ref.dat NO DEFAULT. If using method = "CytoNorm", this must be a FlowSOM object created by Spectre::do.prep.fsom. If using "Quantiles", then this must be
#' @param target.dat NO DEFAULT. A data.table of data you wish to align
#' @param batch.col NO DEFAULT. Character, column that denotes batches
#' @param align.cols NO DEFAULT. Character, a vector of columns that you wish to align.
#' @param method DEFAULT = "CytoNorm". Character, can be "CytoNorm" or "Quantile".
#' @param goal DEFAULT = "Mean". For method = "CytoNorm". Character, the goal to align to. Can either be "mean", or can be the name of one of the batches (e.g. "Batch1").
#' @param nQ DEFAULT = 101. For method = "CytoNorm". Numeric, the number of quanitles to use.
#' @param Qmin DEFAULT = 0.01. For method = "Quantiles". Numeric, the minimum valus for the lower threshold.
#' @param Qmax DEFAULT = 0.99. For method = "Quantiles". Numeric, the maximum value for the upper threshold.
#' @param write.ref.fcs DEFAULT = TRUE. Logical, do you want a quick copy of the reference sample FCS files.
#' @param write.target.fcs DEFAULT = TRUE. Logical, do you want a quick copy of the ALIGNED target sample FCS files.
#' @param mem.ctrl DEFAULT = TRUE. Runs gc() (garbage collection) after a number of steps to free up memory that hasn't been released quickly enough.
#'
#' @return Returns a data.table with selected columns (align.cols) replaced with aligned data.
#'
#' @author Thomas M Ashhurst, \email{thomas.ashhurst@@sydney.edu.au}
#'
#' @references Ashhurst, T. M., et al. (2019). \url{https://www.ncbi.nlm.nih.gov/pubmed/31077106}
#'
#' @examples
#' aligned.dat <- do.align()
#'
#' @export

do.align <- function(ref.dat,
                     target.dat,

                     ## Columns
                     #sample.col, # used to divide 'ref' and 'target' data into 'samples'
                     # @param sample.col NO DEFAULT. Character, column that denotes samples

                     batch.col, # Column denoting batches
                     align.cols, # Channels to align

                     ## Method
                     method = "CytoNorm", # Can also be "Quantile"

                     ## CytoNorm settings
                     goal = "mean",
                     nQ = 101,

                     ## Quantile settings
                     Qmin = 0.01,
                     Qmax = 0.99,

                     ## Writing result files
                     write.ref.fcs = TRUE,
                     write.target.fcs = TRUE,
                     mem.ctrl = TRUE
){

  ###########################################################################################
  ### Package check
  ###########################################################################################

  ### Check that necessary packages are installed
  if(!is.element('Spectre', installed.packages()[,1])) stop('Spectre is required but not installed')
  if(!is.element('CytoNorm', installed.packages()[,1])) stop('CytoNorm is required but not installed')

  ### Require packages
  require(Spectre)
  Spectre::package.load()
  require(data.table)
  require(CytoNorm)

  ###########################################################################################
  ### Test data
  ###########################################################################################

  # ref.dat = ref.fsom
  # target.dat = target.dat
  #
  # ## Columns
  # sample.col <- batch.col
  # batch.col = batch.col
  # align.cols = align.cols
  #
  # ## CytoNorm settings
  # method = "CytoNorm"
  # goal = "mean"
  # nQ = 101
  #
  # ## Writing result files
  # write.ref.fcs = TRUE
  # write.target.fcs = TRUE

  ###########################################################################################
  ### If using quantile method
  ###########################################################################################

  if(method == "Quantile"){

    stop("do.align -- The Quantile method is not active yet, please use the CytoNorm method")

    # ## Normalise data (prior to concatenation)
    # ## --> norm by total dataframe
    # DataList.norm <- DataList
    # reset.quant <- function(x){ #Source: https://stackoverflow.com/questions/13339685/how-to-replace-outliers-with-the-5th-and-95th-percentile-values-in-r
    #   quantiles <- quantile( x, c(.0, .995 ) )
    #   x[ x < quantiles[1] ] <- quantiles[1]
    #   x[ x > quantiles[2] ] <- quantiles[2]
    #   x
    # } #Sets anything less/greater than 0th/99.5th percentile to be equal to 0th/99.5th percentile
    # norm.fun <- function(x) {99*(x - min(x, na.rm=TRUE))/(max(x,na.rm=TRUE) -min(x, na.rm=TRUE))+1} #normalises data so min is 1, max is 100
    # #norm.quant.fun <- function(x) {(x - quantile(x, 0.05, na.rm=TRUE))/(quantile(x, 0.95, na.rm=TRUE) -quantile(x, 0.05, na.rm=TRUE))} #Normalises based on 5th/95th percentile, rather than min/max
    # is.nan.data.frame <- function(x) do.call(cbind, lapply(x, is.nan)) #"NaN" means "not a number" (happens because all values are the same, so normalising gives back "0/0": NaN)
    # for (i in AllSampleNos) {
    #   #DataList.norm[[i]][,1:(length(DataList.norm[[i]])-4)] <- as.data.frame(lapply(DataList.norm[[i]][,1:(length(DataList.norm[[i]])-4)], reset.quant))
    #   DataList.norm[[i]][,1:(length(DataList.norm[[i]])-4)] <- as.data.frame(lapply(DataList.norm[[i]][,1:(length(DataList.norm[[i]])-4)], norm.fun)) #Ignores last 4 columns which include names
    #   # DataList.norm[[i]][,1:(length(DataList.norm[[i]])-4)] <- as.data.frame(lapply(DataList.norm[[i]][,1:(length(DataList.norm[[i]])-4)], norm.quant.fun)) #Ignores last 4 columns which include names
    #   DataList.norm[[i]][is.nan.data.frame(DataList.norm[[i]])] <- 0 #Replaces NaN with 0
    # }

  }

  ###########################################################################################
  ### Setup for CytoNorm
  ###########################################################################################

  if(method == "CytoNorm"){

    message("do.align -- Step 1/5. Setup started")

    ### Initial stuff

    sample.col <- batch.col
    starting.dir <- getwd()

    labels <- align.cols
    channels <- align.cols
    channel.nums <- as.numeric(as.factor(channels))

    non.channels <- names(target.dat[, -channel.nums, with=FALSE])

    ### Merge outputs into Spectre format
    A <- ref.dat$fsom$FlowSOM$data
    B <- ref.dat$fsom$FlowSOM$map$mapping[,1]
    C <- ref.dat$fsom$metaclustering[ref.dat$fsom$FlowSOM$map$mapping[,1]]

    #nrow(A)
    #length(B)
    #length(C)

    if(nrow(A) != length(B)){
      stop("Spectre: there was a problem with your 'ref.dat' file")
    }

    if(nrow(A) != length(C)){
      stop("Spectre: there was a problem with your 'ref.dat' file")
    }

    if(length(B) != length(C)){
      stop("Spectre: there was a problem with your 'ref.dat' file")
    }

    ### Merge

    ref.dt <- as.data.table(A)
    ref.dt <- cbind(A, FlowSOM_cluster = B, FlowSOM_metacluster = C)
    ref.dt <- as.data.table(ref.dt)
    #ref.dt
    #str(ref.dt)

    test <- ref.dt[,channels, with = FALSE]
    test2 <- target.dat[,c(channels), with = FALSE]

    if(!exists("test")){
      warning("It seems the column names you have specified are not all present in the 'reference' dataset. Please check your entries and try again.")
    }

    if(!exists("test2")){
      warning("It seems the column names you have specified are not all present in the 'target' dataset. Please check your entries and try again.")
    }

    rm(test)
    rm(test2)

    if(mem.ctrl == TRUE){
      gc()
    }

    #### Cell barcodes for target.dat
    target.dat[["temp-pre-alignment-barcode"]] <- c(1:nrow(target.dat))

    ### Setup fsom
    fsom <- ref.dat$fsom

    ### Reference samples
    fsom$files
    fsom$filenums
    fsom$batches

    if(is.unsorted(fsom$filenums) == TRUE){
      stop("Error -- your files (check $fsom$filenums and $fsom$files) are not in order, and may cause a batch mismatch")
    }

    if(all(unique(ref.dt[["File"]]) == unique(fsom$filenums)) == FALSE){
      stop("Error -- your reference files appear to have been ordered incorrectly in the prep.fsom() function, and may not match their corresponding batches")
    }

    if(length(fsom$files) != length(unique(fsom$batches))){
      stop("Error -- you have a different number of 'files' compared to 'batches'. Please check your prep.fsom() arguments and try again.")
    }

    # ref.list <- unique(ref.dt[["File"]]) ##### Causes a mis-match based on ordering

    ref.list <- fsom$files # ordering based on the fsom.prep file -- disk file order, which is required here
    ref.list

    # ref.labels <- vector()
    #
    # for(i in ref.list){
    #   # i <- 1
    #   ref.labels[i] <- fsom$batches[[i]]
    # }
    #
    # ref.labels <- as.numeric(factor(ref.labels))

    ref.labels <- fsom$batches # ordering based on the fsom.prep file -- disk file order, which is required here


    fsom$files
    ref.list

    fsom$batches
    ref.labels

    ### Target samples

    target.list <- unique(target.dat[[sample.col]])
    target.list <- as.character(target.list)

    # Check if target list already contains "___"
    # if('___' %in% target.list){
    #   stop('Error -- it seems your file names contain three underscores in at least one name, which is not compatible with this function (as we use three underscores to add file order information to the file names. Please rename the files and re-run the function.')
    # }

    ### ### ### ### ###
    target.list <- target.list[order(target.list)] ######################## Order it so that it matches the file order when they are made into FCS files
    target.list

    starting.target.list <- target.list

    nr.before <- nrow(target.dat)

    for(i in c(1:length(target.list))){
      # i <- 1

      a <- target.list[[i]]

      if(nchar(i) == 1){
        o <- paste0(0, 0, 0)
      }

      if(nchar(i) == 2){
        o <- paste0(0, 0)
      }

      if(nchar(i) == 3){
        o <- paste0(0)
      }

      if(nchar(i) == 4){
        o <- paste0(i)
      }

      if(nchar(i) == 5){
        stop("Error -- the current function configuration is not designed for managing over 9999 batches. How did you even do that? Get in touch at thomas.ashhurst@sydneycytometry.org.au for help.")
      }

      temp <- target.list[[i]]
      temp <- paste0("TARGETFILE", o, i, "___", temp)
      target.list[[i]] <- temp
    }

    d <- data.table("Start" = starting.target.list, "ORDEREDFILE" = target.list)

    target.dat <- do.embed.columns(target.dat, sample.col, add.dat = d, add.by = "Start", rmv.ext = FALSE)

    # target.dat[target.dat[[sample.col]] == a, "ORDEREDFILE"] <- temp
    #  target.dat[target.dat[[sample.col]] == temp,sample.col, with = FALSE]

    start.sample.col <- sample.col
    sample.col <- "ORDEREDFILE"

    nr.after <- nrow(target.dat)

    if(nr.before != nr.after){
      stop("Error -- something has gone wrong in renaming the file/batches in the target dataset")
    }

    target.labels <- vector()

    for(i in c(1:length(target.list))){
      # i <- 1
      nme <- target.list[i]
      temp <- target.dat[target.dat[[sample.col]] == nme,]
      tmp <- temp[[batch.col]][1]
      target.labels[i] <- as.character(tmp)
    }

    target.labels

    if(all(gsub(".*___", "", target.list) == target.labels) == FALSE){
      stop("Error -- your list of target batches has mixed batch labels.")
    }

    ###########################################################################################
    ### TRAIN THE MODEL -- Create quantile conversion model (using ref samples)
    ###########################################################################################

    message("do.align -- Step 2/5. Training started")

    ### Some preferences

    transformList = NULL # in the normal fsom, the default is NULL, but not here -- might be a bug report for Sofie
    outputDir = "./tmp"

    normMethod.train = QuantileNorm.train
    normParams = list(nQ = nQ, goal = goal) ### Can we do more?? Default = 101 Can also be a batch #
    clean = TRUE
    plot = FALSE
    verbose = TRUE
    #labels <- as.character(dat.training$Batch)

    clusterRes <- list()

    ### Convert 'reference' samples into 'temp' FCS files

    setwd(starting.dir)
    dir.create("tmp-train")
    setwd("tmp-train")

    ref.list
    ref.labels
    ref.fsom$fsom$filenums

    ordr.check <- list()

    for(i in ref.fsom$fsom$filenums){

      a <- ref.list[[i]]
      ordr.check[[i]] <- a

      if(nchar(i) == 1){
        o <- paste0(0, 0, 0, i)
      }

      if(nchar(i) == 2){
        o <- paste0(0, 0, i)
      }

      if(nchar(i) == 3){
        o <- paste0(0, i)
      }

      if(nchar(i) == 4){
        o <- paste0(i)
      }

      if(nchar(i) == 5){
        stop("Error -- the current function configuration is not designed for managing over 9999 batches. How did you even do that? Get in touch at thomas.ashhurst@sydneycytometry.org.au for help.")
      }

      temp <- ref.dt[ref.dt[["File"]] == i,]

      write.files(temp,
                  file.prefix = paste0("File_", o, "_Batch_", a),
                  write.csv = FALSE,
                  write.fcs = TRUE)

      # write.files(dat = ref.dt[ref.dt[["File"]] == i,],
      #             file.prefix = i,
      #             write.csv = FALSE,
      #             write.fcs = TRUE)
    }

    train.files <- list.files(getwd(), ".fcs")

    train.files
    ref.list

    if(length(train.files) != length(ref.list)){
      stop("Error -- somehow the number of files generated is larger than your batches.")
    }


    ### Split files by clusters

    dirCreated = FALSE
    if (!dir.exists(outputDir)) {
      dirCreated = dir.create(outputDir)
    }

    # Split files by clusters

    for(file in train.files){
      if(verbose) message("Splitting ",file)

      ff <- flowCore::read.FCS(file)

      if (!is.null(transformList)) {
        ff <- flowCore::transform(ff,transformList)
      }

      # Map the file to the FlowSOM clustering
      fsom_file <- FlowSOM::NewData(fsom$FlowSOM, ff)

      # Get the metacluster label for every cell
      cellClusterIDs <- fsom$metaclustering[fsom_file$map$mapping[,1]]

      for (cluster in unique(fsom$metaclustering)) {
        if (sum(FlowSOM::GetMetaclusters(fsom_file,
                                         fsom$metaclustering) == cluster) > 0) {
          suppressWarnings(
            flowCore::write.FCS(
              ff[cellClusterIDs == cluster,],
              file=file.path(outputDir,
                             paste0(gsub("[:/]","_",file),
                                    "_fsom",cluster,".fcs"))))
        }
      }
    }


    ### Learn quantiles for each cluster
    clusterRes <- list()

    for (cluster in unique(fsom$metaclustering)) {
      if(verbose) message("Processing cluster ",cluster)
      if (plot) {
        grDevices::pdf(file.path(outputDir,
                                 paste0("CytoNorm_norm_Cluster",
                                        cluster, ".pdf")),
                       height = 3*(2*length(train.files)+2),
                       width = 3*(length(channels)+1))
      }

      normParams_tmp <- c(normParams,
                          list(files = file.path(outputDir,
                                                 paste0(gsub("[:/]", "_", train.files),
                                                        "_fsom", cluster, ".fcs")),
                               labels = as.character(ref.labels),
                               channels = channels,
                               transformList = NULL,
                               verbose = verbose,
                               plot = plot))

      normParams_tmp <- normParams_tmp[unique(names(normParams_tmp))]

      if(is.list(normParams[["goal"]])){
        normParams_tmp[["goal"]] <- normParams[["goal"]][[cluster]]
      }

      clusterRes[[cluster]] <- do.call(normMethod.train,
                                       normParams_tmp)

      if (plot) { grDevices::dev.off() }
    }


    ### Cleanup

    if(clean){
      for(cluster in unique(fsom$metaclustering)){
        tmp_files <- file.path(outputDir,
                               paste0(gsub("[:/]", "_", train.files),
                                      "_fsom", cluster, ".fcs"))

        file.remove(tmp_files[file.exists(tmp_files)])
      }
      if(dirCreated & !plot){
        unlink(outputDir, recursive=TRUE)
      }
    }

    ### Establish model

    model <- named.list(fsom, clusterRes) ###### mismatch
    #model

    if(mem.ctrl == TRUE){
      gc()
    }

    ###########################################################################################
    ### APPLY THE MODEL -- Align datasets (apply to all samples)
    ###########################################################################################

    message("do.align -- Step 3/5. Alignment started")

    ### Write 'training' list samples to FCS files

    getwd()

    setwd(starting.dir)
    dir.create("tmp-target")
    setwd("tmp-target")

    for(i in target.list){
      write.files(dat = target.dat[target.dat[[sample.col]] == i,],
                  file.prefix = i,
                  write.csv = FALSE,
                  write.fcs = TRUE)
    }

    target.files <- list.files(getwd(), ".fcs")
    # trg.test <- gsub(".fcs", "", target.files)

    target.files <- target.files[order(target.files)]
    # trg.test <- trg.test[order(trg.test)]

    # if(all(target.list == trg.test) == FALSE){
    #   stop("Error -- the target files (batches) have become dis-ordered when writing to/reading from disk.")
    # }

    if(all(gsub(".*___", "", target.list) == target.labels) == FALSE){
      stop("Error -- the target files (batches) and corresponding batches don't match.")
    }


    # QUICKTEST <- list()
    #
    # for(file in target.files) { # Loop to read files into the list
    #   tempdata <- exprs(flowCore::read.FCS(file, transformation = FALSE))
    #   tempdata <- tempdata[1:nrow(tempdata),1:ncol(tempdata)]
    #   tempdata <- as.data.table(tempdata)
    #   file <- gsub(".fcs", "", file)
    #   QUICKTEST[[file]] <- tempdata
    #   rm(tempdata)
    # }
    #
    # QUICKTEST[[1]]
    # QUICKTEST[[2]]
    # QUICKTEST[[3]]
    # QUICKTEST[[4]]
    # QUICKTEST[[5]]
    # QUICKTEST[[6]]
    # QUICKTEST[[7]]
    # QUICKTEST[[8]]
    # QUICKTEST[[9]]
    # QUICKTEST[[10]]


    ## Align data
    # CytoNorm.normalize(model = model,
    #                    files = target.files,
    #                    labels = target.labels,
    #                    transformList = NULL,
    #                    transformList.reverse = NULL,
    #                    normMethod.normalize = QuantileNorm.normalize,
    #                    outputDir = "Target_Normalized",
    #                    prefix = "TaretNorm_",
    #                    clean = TRUE,
    #                    verbose = TRUE)

    ### Run model

    model = model
    files = target.files
    labels = target.labels
    transformList = NULL
    transformList.reverse = NULL
    normMethod.normalize = QuantileNorm.normalize
    outputDir = "Target_Normalized"
    prefix = "TargetNorm_"
    clean = TRUE
    verbose = TRUE

    if(is.null(model$fsom) |
       is.null(model$clusterRes)){
      stop("The 'model' paramter should be the result of using the
               trainQuantiles function.")
    }

    if(length(labels) != length(files)){
      stop("Input parameters 'labels' and 'files' should have the same length")
    }

    # Create output directory
    if(!dir.exists(outputDir)){
      dir.create(outputDir)
    }

    fsom <- model$fsom
    clusterRes <- model$clusterRes

    ### Split files by clusters

    cellClusterIDs <- list()
    meta <- list()
    cluster_files <- list()


    for(file in files){
      # file <- "01_Air_01.fcs"
      if(verbose) message("Splitting ",file)
      ff <- flowCore::read.FCS(file)

      if(!is.null(transformList)){
        ff <- flowCore::transform(ff, transformList)
        # meta[[file]] <- list()
        # meta[[file]][["description_original"]] <- ff@description
        # meta[[file]][["parameters_original"]] <- ff@parameters
      }

      fsom_file <- FlowSOM::NewData(fsom$FlowSOM,ff)
      cellClusterIDs[[file]] <- fsom$metaclustering[fsom_file$map$mapping[,1]]

      for(cluster in unique(fsom$metaclustering)){
        if (sum(FlowSOM::GetMetaclusters(fsom_file,
                                         fsom$metaclustering) == cluster) > 0) {
          f <- file.path(outputDir,
                         paste0(gsub("[:/]","_",file),
                                "_fsom", cluster, ".fcs"))
          suppressWarnings(
            flowCore::write.FCS(ff[cellClusterIDs[[file]] == cluster],
                                file = f)
          )
        }
      }
      rm(ff)
    }

    ### Apply normalization on each cluster
    for(cluster in unique(fsom$metaclustering)){
      if(verbose) message("Processing cluster ",cluster)
      files_tmp <- file.path(outputDir,
                             paste0(gsub("[:/]",
                                         "_",
                                         files),
                                    "_fsom",
                                    cluster,
                                    ".fcs"))

      ### ### ### ### ###

      labels_tmp <- labels[file.exists(files_tmp)]
      files_tmp <- files_tmp[file.exists(files_tmp)] #### different order
      normMethod.normalize(model = clusterRes[[cluster]],
                           files = files_tmp,
                           labels = labels_tmp,
                           outputDir = file.path(outputDir),
                           prefix = "Norm_",
                           transformList = NULL,
                           transformList.reverse = NULL,
                           removeOriginal = TRUE,
                           verbose = verbose)
    }

    ###

    strt <- getwd()
    setwd("Target_Normalized/")

    #list.files(getwd(), ".fcs")
    #all.clust <- read.files(getwd(), file.type = ".fcs", do.embed.file.names = FALSE)

    all.clust <- list()
    fl.nms <- list.files(path=getwd(), pattern = ".fcs")
    fl.nms

    fl.nrws <- list()
    fl.ncls <- list()

    for(file in fl.nms) { # Loop to read files into the list
      tempdata <- exprs(flowCore::read.FCS(file, transformation = FALSE))

      if(nrow(tempdata) == 1){
        tempdata <- tempdata[1:nrow(tempdata),1:ncol(tempdata)]
        tempdata <- as.data.table(t(tempdata))
        tempdata
      }

      if(nrow(tempdata) > 1){
        tempdata <- tempdata[1:nrow(tempdata),1:ncol(tempdata)]
        tempdata <- as.data.table(tempdata)
      }

      file <- gsub(".fcs", "", file)
      all.clust[[file]] <- tempdata
      fl.ncls[[file]] <- ncol(tempdata)
      fl.nrws[[file]] <- nrow(tempdata)
      rm(tempdata)
    }

    if(length(unique(unlist(fl.ncls))) > 1){
      stop("Error -- when combining the 'file-cluster' FCS files back into 'file' datasets, differenting number of columns were found in one 'file-cluster' FCS file.")
    }

    # all.clust[[18]]

    names(all.clust)
    names(all.clust[[1]])

    for(i in c(1:length(all.clust))){
      a <- names(all.clust)[[i]]
      all.clust[[i]]$NORM_TARGET_FILENAME <- a
      all.clust[[i]]$NORM_TARGET_FILENUM <- i
    }

    names(all.clust)
    names(all.clust[[1]])

    setwd(strt)
    getwd()


    ### Combine clusters into one final fcs file
    for(file in files){
      if(verbose) message("Rebuilding ",file)

      ff <- flowCore::read.FCS(file)

      for(cluster in unique(fsom$metaclustering)){
        file_name <- file.path(outputDir,
                               paste0("Norm_",gsub("[:/]","_",file),
                                      "_fsom",cluster,".fcs"))
        if (file.exists(file_name)) {
          ff_subset <- flowCore::read.FCS(file_name)
          flowCore::exprs(ff)[cellClusterIDs[[file]] == cluster,] <- flowCore::exprs(ff_subset)
        }
      }

      if(!is.null(transformList.reverse)){
        ff <- flowCore::transform(ff, transformList.reverse)
        # ff@description <- meta[[file]][["description_original"]]
        # ff@parameters <- meta[[file]][["parameters_original"]]
      }


      # Adapt to real min and max because this gets strange values otherwise
      ff@parameters@data[,"minRange"] <- apply(ff@exprs, 2, min)
      ff@parameters@data[,"maxRange"] <- apply(ff@exprs, 2, max)
      ff@parameters@data[,"range"] <- ff@parameters@data[,"maxRange"] -
        ff@parameters@data[,"minRange"]

      if(clean){
        file.remove(file.path(outputDir,
                              paste0("Norm_",gsub("[:/]","_",file),
                                     "_fsom",unique(fsom$metaclustering),".fcs")))
      }

      suppressWarnings(flowCore::write.FCS(ff,
                                           file=file.path(outputDir,
                                                          paste0(prefix,gsub(".*/","",file)))))
    }

    if(mem.ctrl == TRUE){
      gc()
    }

    ###########################################################################################
    ### Read FCS files back in and create merged DT
    ###########################################################################################

    message("do.align -- Step 4/5. Re-import aligned files and creating 'modified' aligned data.table")

    ###

    setwd(starting.dir)
    setwd("tmp-target/")
    setwd("Target_Normalized")

    # all.clust.strt <- all.clust
    # all.clust <- all.clust.strt
    names(all.clust[[1]])

    sve <- all.clust

    for(a in names(all.clust)){
      # a <- names(all.clust)[[1]]
      b <- a
      colnames(all.clust[[a]])[colnames(all.clust[[a]]) %in% c("NORM_TARGET_FILENAME")] <- c("RAWFILENAME")
      a <- gsub('\\_fsom.*', '', a, perl=TRUE)
      all.clust[[b]][["NORM_TARGET_FILENAME"]] <- a
    }

    #all.clust
    #names(all.clust)
    #names(all.clust[[1]])
    #names(all.clust[[70]])

    if(mem.ctrl == TRUE){
      gc()
    }

    nme.list <- list()

    for(i in c(1:length(all.clust))){
      temp <- all.clust[[i]]
      nme.list[[i]] <- ncol(temp)
      rm(temp)
    }

    if(length(unique(unlist(nme.list))) < 1){
      stop("Error -- differing number of column names in each file")
    }

    aligned.dat <- data.table::rbindlist(all.clust, fill = TRUE)

    if(mem.ctrl == TRUE){
      gc()
    }

    aligned.dat[, channels, with=FALSE]

    ### Replace target.dat with aligned.dat -- just the channels aligned
    mod.dat <- target.dat

    aligned.dat <- aligned.dat[order(aligned.dat[['temp-pre-alignment-barcode']]),]

    if(is.unsorted(aligned.dat[['temp-pre-alignment-barcode']])){
      stop("Error -- pre-alignment cellular barcodes not in correct order")
    }


    if(mem.ctrl == TRUE){
      gc()
    }

    for(a in channels){
      # mod.dat[[a]] <- aligned.dat[[a]]
      # names(mod.dat)[names(mod.dat) == a] <- paste0(a, "_align")

      ## Test function for 'adding' data
      i <- paste0(a, "_aligned")
      mod.dat[[i]] <- aligned.dat[[a]]
    }

    if(nrow(mod.dat) != nrow(aligned.dat)){
      stop("Error -- differing rows in 'mod.dat' and 'aligned.dat'")
    }

    mod.dat[["temp-post-alignment-barcode"]] <- aligned.dat[['temp-pre-alignment-barcode']]

    if(mem.ctrl == TRUE){
      gc()
    }

    names(mod.dat)

    ###########################################################################################
    ### Return mod.dat and cleanup
    ###########################################################################################

    message("do.align -- Step 5/5. Cleanup, writing FCS files, and returning aligned data")

    ## WD and Directory stuff

    setwd(starting.dir)
    list.dirs(getwd(), full.names = FALSE, recursive = FALSE)

    unlink("tmp-train", recursive = TRUE)
    unlink("tmp-target", recursive = TRUE)

    ### Replacing batch names with originals

    mod.target.list <- target.list
    target.list <- starting.target.list


    mod.sample.col <- sample.col
    sample.col <- start.sample.col


    target.dat$ORDEREDFILE <- NULL
    mod.dat$ORDEREDFILE <- NULL

    # final.target.list <- gsub(".*___", "", starting.target.list)
    #
    # if(all(starting.target.list == final.target.list) == FALSE){
    #   stop("Error -- somehow the list of target files (batches) has become mixed up")
    # }
    #
    # if(all(unique(target.dat$ORDEREDFILE) == unique(mod.dat$ORDEREDFILE)) == FALSE){
    #   stop("Error")
    # }
    #
    # #
    #
    # for(i in c(1:length(target.list))){
    #   a <- target.list[[i]]
    #   b <- starting.target.list[[i]]
    #
    #   target.dat$ORDEREDFILE <- NULL
    #   mod.dat$ORDEREDFILE <- NULL
    # }
    #
    # target.list <- final.target.list


    ### Write reference FCS files

    if(write.ref.fcs == TRUE){
      setwd(starting.dir)
      dir.create("CytoNorm_ref_output")
      setwd("CytoNorm_ref_output")

      ref.fsom$fsom$filenums

      for(i in ref.fsom$fsom$filenums){
        a <- ref.fsom$fsom$files[[i]]
        write.files(dat = ref.dt[ref.dt[["File"]] == i,],
                    file.prefix = paste0("Raw_RefFile_", i, "_", a),
                    write.csv = FALSE,
                    write.fcs = TRUE)
      }
      setwd(starting.dir)
    }

    if(mem.ctrl == TRUE){
      gc()
    }

    ### Write target FCS files

    if(write.target.fcs == TRUE){
      setwd(starting.dir)
      dir.create("CytoNorm_target_output")
      setwd("CytoNorm_target_output")

      for(i in target.list){
        write.files(dat = target.dat[target.dat[[sample.col]] == i,],
                    file.prefix = paste0("Raw_", i),
                    write.csv = FALSE,
                    write.fcs = TRUE)
      }

      for(i in unique(mod.dat[[sample.col]])){
        write.files(dat = mod.dat[mod.dat[[sample.col]] == i,],
                    file.prefix = paste0("Aligned_", i),
                    write.csv = FALSE,
                    write.fcs = TRUE)
      }
      setwd(starting.dir)
    }

    if(mem.ctrl == TRUE){
      gc()
    }

    ### Return

    return(mod.dat)
    message("do.align complete")

  }
}
tomashhurst/Spectre documentation built on Dec. 23, 2021, 11:55 a.m.