R/functions_batch.R

Defines functions define_batches extract_scan_date

Documented in define_batches extract_scan_date

#' Extract scan date
#'
#' Extracts date of scanning from original array files (e.g. CEL for Affymetrix
#' and idat for Illumina arrays).
#'
#' Currently only implemented for Affymetrix arrays.
#'
#' @param se
#' [\code{\link[SummarizedExperiment]{SummarizedExperiment-class}} object]
#' study data set
#' @param col.sample.id [character(1)] Column in colData() with
#' sample accession information (default: "geo_accession")
#' @param array.type [character(1)] type of array ("affy" (default), "agilent"
#' or "illumina")
#' @param temp.dir [character(1)] directory where temporary files should be
#' stored
#' @param tryFormats [character(n)] Possible format strings to try when
#' converting date as character to an object of class
#' \code{\link[base]{Date}}
#'
#' @return \code{\link[SummarizedExperiment]{SummarizedExperiment-class}}
#' object with scan date added as column scan.date in colData.
#'
#' @export

extract_scan_date <- function(
  se,
  col.sample.id = "geo_accession",
  array.type = "affy",
  temp.dir = tempdir(),
  tryFormats = c("%Y-%m-%d",
                 "%Y/%m/%d",
                 "%m/%d/%Y",
                 "%m/%d/%y",
                 "%m-%d-%Y",
                 "%d-%b-%Y")) {

  if (!(col.sample.id %in%
        colnames(SummarizedExperiment::colData(se)))) {
    stop(paste("column", col.sample.id, "not available in colData!"))
  }
  accession.samples = as.character(
    SummarizedExperiment::colData(se)[, col.sample.id])

  if (!(array.type %in% c("affy", "agilent", "illumina"))) {
    stop("array.type needs to be 'affy', 'agilent' or 'illumina'")
  }

  scan.date.all = rep(NA, length(accession.samples))
  if (array.type == "affy") {

    for (i in seq_len(length(accession.samples))) {

      ## extract scan date from header of CEL file
      cel.file = download_cel_file(
        sample = accession.samples[i],
        temp.dir = temp.dir)

      scan.info = affyio::read.celfile.header(
        file = cel.file,
        info = "full")
      scan.date = ifelse(length(scan.info$ScanDate) == 0,
                         NA, scan.info$ScanDate)
      scan.date.all[i] = scan.date
    }
  } else {
    stop(paste(array.type, "not implemented yet!"))
  }

  ## convert to Date
  scan.date.formats = lapply(tryFormats, function(x) {
    as.Date(scan.date.all,
            tryFormats = x,
            optional = TRUE)
  })
  no.na = vapply(X = scan.date.formats,
                 FUN = function(x) {sum(is.na(x))},
                 FUN.VALUE = integer(1))
  ind.min = which.min(no.na)
  if (no.na[ind.min] > sum(is.na(scan.date.all))) {
    warning("some dates could not be correctly converted!")
  }

  se$scan.date = scan.date.formats[[ind.min]]
  return(se)
}

#' Define batches based on scan date
#'
#' Groups samples into batches based on scan date. Samples run within a short
#' time interval can be defined to belong to the same batch.
#'
#' @param se
#' [\code{\link[SummarizedExperiment]{SummarizedExperiment-class}} object]
#' study data set
#' @param col.scan.date [character(1)] Column in colData() with scan
#' dates (default: scan.date as generated by the function
#' \code{\link{extract_scan_date}}
#' @param diff.ignore [numeric(1)] Time difference (in days) defined as
#' negligible (default: 1)
#'
#' @return \code{\link[SummarizedExperiment]{SummarizedExperiment-class}}
#' object with batch information added as column batch in colData()
#'
#' @export

define_batches <- function(
  se,
  col.scan.date = "scan.date",
  diff.ignore = 1) {

  if (!(col.scan.date %in%
        colnames(SummarizedExperiment::colData(se)))) {
    stop(paste("column", col.scan.date, "not available in column data!"))
  }

  if ("batch" %in%
      colnames(SummarizedExperiment::colData(se))) {
    stop(paste("column batch already available in column data!"))
  }

  scan.date = SummarizedExperiment::colData(se)[, col.scan.date]
  if (!methods::is(scan.date, "Date")) {
    stop(paste("column", col.scan.date, "needs to be of type Date!"))
  }
  names(scan.date) = colnames(se)

  scan.date = sort(scan.date)
  diff = c(0, as.numeric(diff(scan.date)))
  diff[diff == diff.ignore] = 0
  batch.num = as.numeric(as.factor(cumsum(diff)))
  ## add leading zeros to enable numeric sorting of strings
  batch = formatC(
    batch.num,
    width = max(nchar(batch.num)),
    format = "d",
    flag = "0")
  names(batch) = names(scan.date)
  se$batch = batch[colnames(se)]
  return(se)
}
szymczak-lab/harmonizeGeneExprData documentation built on Dec. 1, 2022, 9:07 p.m.