R/readEPIC.R

Defines functions generateManifest bfp readEPIC methylumIDATepic NChannelSetToMethyLumiSet2 mergeProbeDesigns2 designIItoMandU2 designItoMandU2 getControlProbes2 DataToNChannelSet2 extractAssayDataFromList2 IDATsToMatrices2 IDATtoMatrix2 getMethylationBeadMappers2 columnMatrix

Documented in bfp columnMatrix DataToNChannelSet2 designIItoMandU2 designItoMandU2 extractAssayDataFromList2 generateManifest getControlProbes2 getMethylationBeadMappers2 IDATsToMatrices2 IDATtoMatrix2 mergeProbeDesigns2 methylumIDATepic NChannelSetToMethyLumiSet2 NChannelSetToMethyLumiSet2 readEPIC

#### readEPIC

 # Author: Tyler Gorrie-Stone
 # LS add in support for epicv2 01-09-2023
 # Last Modified: 19-01-2016
 # Creation Date: 20-11-2015

 # Based heavily on 'methylumIDAT' and modified to read Epic arrays.
 # Functionality for 450k and 27k arrays remains (relatively) unchanged.
 # Although certain stops have been removed/moved to a different
 # place due to the implausibility of determining sample based on name
 # As differing between the three chips requires reading in .idats.

###
 # Usage:
 # readEPIC : Instead of inputting the barcodes, it is possible to input 
 # path to a directory (containing idats or folders containing idats).
 # This constructs a vector of the barcodes which is passed to 
 # methylumIDAT.
 #
 # methylumIDATepic: Usage remains the same as methylumIDAT. Although 
 # might be slightly changed due to the introduction of recursive=T 
 # argument in the list.files()
###
 # Leaving 27k and 450k array functionality in function for the purpose 
 # of convenience, however may be removed in the future when such arrays 
 # are not used anymore.
###

# {{{ # Utility function for dealing with single samples (still not 100% perfect...)
columnMatrix <- function(x, row.names = NULL) {
    if (is.null(dim(x)[2]))
        dim(x) = c(length(x), 1)
    if (!is.null(row.names))
        rownames(x) = row.names
    return(x)
}  
# }}}

# {{{ getMethylationBeadMappers2 
## require()s the appropriate package for annotating a chip & sets up mappings
## Potentially may temporarily require more indepth ordering file for 450k
## arrays until the 450k is completely replaced by epic chips.
getMethylationBeadMappers2 <- function(chipType = c("450k", "27k", "Epic", "Epicv2", ".manifest"),
    genome = c("hg19", "hg18", "hg38")) {
    #### genomic annotation package
    genome <- match.arg(genome)  ## default to FDb.InfiniumMethylation.hg19
    pkg <- paste0("FDb.InfiniumMethylation.", genome)
    require(pkg, character.only = TRUE)  
    ####  I think we can now do away with this

    chipType <- sub("^IlluminaHumanMethylation", "", chipType)
    #### this appears to be dispensable cruft.  chipType is a string
    if (class(chipType) %in% c("NChannelSet", "MethyLumiSet", "MethyLumiM")) {
        chipType <- sub("^IlluminaHumanMethylation", "", annotation(chipType))
        chipType <- sub(".db$", "", annotation(chipType))
    }
    chipType <- match.arg(chipType)  # backwards compatibility purposes
    #### have added .manifest chiptype to work with readPepo 

    ## addressA=U, addressB=M
    getProbes <- switch(chipType, `27k` = function(color = NULL, ...) {
        data(hm27.ordering)
        what <- c("Probe_ID", "M", "U")
        r <- split(hm27.ordering[, what], hm27.ordering$col)
        if (is.null(color)) return(r) else return(r[[substr(color, 1, 1)]])
    }, `450k` = function(design = NULL, color = NULL, ...) {
        data(hm450.ordering)
        what <- c("Probe_ID", "M", "U")
        r <- split(hm450.ordering[, c(what, "col")], hm450.ordering$DESIGN)
        r$I <- split(r$I[, what], r$I$col)
        r$II <- r$II[, what]
        if (is.null(design)) {
            return(r)
        } else if (design == "I" && !is.null(color)) {
            return(r[[design]][[substr(color, 1, 1)]])
        } else {
            return(r[[design]])
        }
    }, Epic = { function(design = NULL, color = NULL, ...) {
        # data(epic.ordering)
        epic.ordering <<- generateManifest("EPIC")
        what <- c("Probe_ID", "M", "U")
        r <- split(epic.ordering[, c(what, "col")], epic.ordering$DESIGN)
        r$I <- split(r$I[, what], r$I$col)
        r$II <- r$II[, what]
        if (is.null(design)) {
            r <- return(r)
        } else if (design == "I" && !is.null(color)) {
            r <- return(r[[design]][[substr(color, 1, 1)]])
        } else {
            r <- return(r[[design]])
        }}
    }, Epicv2 = function(design = NULL, color = NULL, ...) {
        # data(epic.ordering)
        epicV2.ordering <<- generateManifest("EPICv2")
        what <- c("Probe_ID", "M", "U")
        r <- split(epicV2.ordering[, c(what, "col")], epicV2.ordering$DESIGN)
        r$I <- split(r$I[, what], r$I$col)
        r$II <- r$II[, what]
        if (is.null(design)) {
            r <- return(r)
        } else if (design == "I" && !is.null(color)) {
            r <- return(r[[design]][[substr(color, 1, 1)]])
        } else {
            r <- return(r[[design]])
        }
    } , .manifest = function(design = NULL, color = NULL, ...) {
        # data(epic.ordering)
        # epicV2.ordering <<- generateManifest("EPICv2")
        what <- c("Probe_ID", "M", "U")
        #r <- split(epicV2.ordering[, c(what, "col")], epicV2.ordering$DESIGN)
        r <- list()
        #r$I <- split(r$I[, what], r$I$col)
        I <- data.frame(.manifest@data$TypeI)
        nI <- colnames(I)
        nI[1:3] <- c("Probe_ID", "U", "M")
        colnames(I) <- nI
        r$I <- split(I[,what], I$Color)
        names(r$I)<-  c('G','R')

        r$II <- data.frame(.manifest@data$TypeII) 
        colnames(r$II)[c(1,2)] <- nI[c(1,2)]

        if (is.null(design)) {
            r <- return(r)
        } else if (design == "I" && !is.null(color)) {
            r <- return(r[[design]][[substr(color, 1, 1)]])
        } else {
            r <- return(r[[design]])
        }
    }
    )

    getControls <- switch(chipType, `27k` = function() {
        data(hm27.controls)
        return(hm27.controls)
    }, `450k` = function() {
        data(hm450.controls)
        return(hm450.controls)
    }, Epic = function() {
        data(epic.controls)
        return(epic.controls)
    }, Epicv2 = function() {
        data(epicV2.controls)
        return(epicV2.controls)
    }, .manifest = function(){
       kont <- data.frame(.manifest@data$TypeControl)
       colnames(kont) <- c("Address", "Type", "Color_Channel", "Name") 
       kont
    }
    )
    ## we may be able to eliminate ordering <<no, this is the fData 
    getOrdering <- switch(chipType, `27k` = function() {
        ord <- c("Probe_ID", "DESIGN", "COLOR_CHANNEL")
        return(hm27.ordering[, ord])
    }, `450k` = function() {
        ord <- c("Probe_ID", "DESIGN", "COLOR_CHANNEL")
        return(hm450.ordering[, ord])
    }, Epic = function() {
        ord <- c("Probe_ID", "DESIGN", "COLOR_CHANNEL")
        return(epic.ordering[, ord])
    }, Epicv2 = function() {
        ord <- c("Probe_ID", "DESIGN", "COLOR_CHANNEL")
        return(epicV2.ordering[, ord])
    }, .manifest = function(){ 
        ord <- c("Probe_ID", "DESIGN", "COLOR_CHANNEL")
        man1 <- data.frame( 
           Probe_ID = .manifest@data$TypeI$Name, 
           DESIGN   = 'I',
           COLOR_CHANNEL = .manifest@data$TypeI$Color
        )
        man2 <- data.frame( 
           Probe_ID = .manifest@data$TypeII$Name, 
           DESIGN   = 'II',
           COLOR_CHANNEL = ''
        )
        rbind(man1,man2)
    }

    )

    mapper <- list(probes = getProbes, controls = getControls, ordering = getOrdering)
    return(mapper)
}  # }}}

#{{{ IDATtoMatrix2: process a single IDAT (just the mean intensities) {{{
IDATtoMatrix2 <- function(x, fileExts = list(Cy3 = "Grn", Cy5 = "Red"), idatPath = ".") {
    chs = names(fileExts)
    names(chs) = fileExts
    processed = lapply(fileExts, function(ch) {
        ext = paste(ch, "idat", sep = ".")
        message(sprintf("Reading in: %s", paste(x, ext, sep = "_")))
        dat = readIDAT(file.path(idatPath, paste(x, ext, sep = "_")))
        Quants = data.matrix(dat$Quants)
        colnames(Quants) = paste(chs[ch], colnames(Quants), sep = ".")
        return(list(Quants = Quants, RunInfo = dat$RunInfo, ChipType = dat$ChipType))
    })
    probe.data = do.call(cbind, lapply(processed, function(x) x[["Quants"]]))
    probe.data <- probe.data[, c("Cy3.Mean", "Cy5.Mean", "Cy3.NBeads", "Cy5.NBeads")]
    # Nbeads for beadcount switch TGS, at this point add something that takes
    # the beadcount with the lower number of beads.
    attr(probe.data, "RunInfo") = processed[[1]][["RunInfo"]]
    attr(probe.data, "ChipType") = processed[[1]][["ChipType"]]
    return(probe.data)
}  # }}}

# {{{ IDATsToMatrices2
IDATsToMatrices2 <- function(barcodes, fileExts = list(Cy3 = "Grn", Cy5 = "Red"),
    parallel = F, idatPath = ".") {
    names(barcodes) = as.character(barcodes)
    if (parallel) {
        mats = .mclapply(barcodes, IDATtoMatrix2, fileExts = fileExts, idatPath = idatPath)
    } else {
        mats = lapply(barcodes, IDATtoMatrix2, fileExts = fileExts, idatPath = idatPath)
    }
    names(mats) = as.character(barcodes)
    return(mats)
}  # }}}

#{{{ extractAssayDataFromList2
## might consider doing the following in C++ via Rcpp to avoid copying data 
extractAssayDataFromList2 <- function(assay, mats, fnames) {
    d <- do.call("cbind", lapply(mats, function(x) x[fnames, assay]))  # Select Common Probes
    if (!is.matrix(d))
        d <- data.matrix(d)
    colnames(d) <- names(mats)
    rownames(d) <- fnames
    return(d)
}  # }}}

#{{{  DataToNChannelSet2
## a faster rewrite of DFsToNChannelSet() so that I can decommission it...
## Stop-check to ensure different platforms are read simulatenously.  
## note EPIC and EPICv2 have the same chip type and there are several 
## different EPIC manifests

DataToNChannelSet2 <- function(
   mats, 
   chans = c(Cy3 = "GRN", Cy5 = "RED"), 
   parallel = F,
   protocol.data = F, 
   IDAT = TRUE, 
   force = F
   ) {
      epic = hm27 = hm450 = 0
      qw <- unlist(lapply(mats, function(x) attr(x, "ChipType")))
      epic = sum(grepl("BeadChip 8x5", qw))
      message(paste(epic, "HumanMethylationEpic / Epicv2 samples found"))
      hm450 = sum(grepl("BeadChip 12x8", qw))
      message(paste(hm450, "HumanMethylation450 samples found"))
      hm27 = sum(grepl("BeadChip 12x1", qw))
      message(paste(hm27, "HumanMethylation27 samples found"))
      if (
         hm27 > 0 && hm450 > 0 | 
         hm27 > 0 && epic  > 0 | 
         hm450 > 0 && epic > 0
         ) {
           stop(
           "Cannot process multiple platforms simultaneously; 
            please run separately."
            )
      }

      stopifnot(is(mats, "list"))
      assayNames = paste0(names(chans), c(".Mean"))
      assayNames = c(assayNames, paste0(names(chans), c(".NBeads")))
      names(assayNames) = assayNames
      assaylengths <- sapply(mats, nrow)
      names(assaylengths) <- names(mats)
# Minfi Method for handling early version epic IDATS. from read.meth.R by
# Kaspar Hansen.
      if (length(unique(assaylengths)) > 1) {
        commonAddresses <- as.character(
           Reduce(
              "intersect", 
              lapply(mats, function(x) rownames(x))
           )
        )
        if (!force) {
           if (!all(assaylengths == length(commonAddresses))) {
                print(as.matrix(assaylengths))
                stop(
                   "Cannot combine IDATs of differing lengths, 
                    possibly try force = T")
            }
        }
        fnames <- commonAddresses
        } else {
        fnames <- rownames(mats[[1]])
        }

        extract <- function(assay){
           extractAssayDataFromList2(assay, mats, fnames)
        }
        assays = lapply(assayNames, extract)
        nb <- pmin(assays[["Cy3.NBeads"]], assays[["Cy5.NBeads"]])
        obj = new(
          "NChannelSet", 
          assayData = assayDataNew(
             R = assays[["Cy5.Mean"]], 
             G = assays[["Cy3.Mean"]],
             N = nb)
        )
        featureNames(obj) = fnames
        if (IDAT){
            message("Attempting to extract protocolData() from list...")
            ChipType = attr(mats[[1]], "ChipType")
            RunInfo = lapply(mats, function(d) attr(d, "RunInfo"))
            if (protocol.data){
               scanDates <- data.frame(
                  DecodeDate = rep(NA, length(mats)), 
                  ScanDate = rep(NA, length(mats))
               )
               rownames(scanDates) <- names(mats)
                  for (i in seq_along(mats)) {
                     cat(
                     "decoding protocolData for", 
                     names(mats)[i], 
                     "...\n"
                     )
                     if (nrow(RunInfo[[i]]) >= 2) {
                        scanDates$DecodeDate[i] <- RunInfo[[i]][1, 1]
                        scanDates$ScanDate[i]   <- RunInfo[[i]][2, 1]
                     }
                 }
                 protocoldata = new("AnnotatedDataFrame",
                    data = scanDates,
                    varMetadata = data.frame(
                        labelDescription = colnames(scanDates),
                        row.names = colnames(scanDates)
                     )
                 )
                protocolData(obj) = protocoldata
          }

          message("Determining chip type from IDAT protocolData...")
          if (ChipType == "BeadChip 12x1") {
             annotation(obj) = "IlluminaHumanMethylation27k"
          } else if (ChipType == "BeadChip 12x8") {
              annotation(obj) = "IlluminaHumanMethylation450k"
          } else if (ChipType == "BeadChip 8x5") {
              annotation(obj) = "IlluminaHumanMethylationEpic"
          }

          if (ChipType == "BeadChip 8x5" && dim(obj)[1] > 1.1e+06) {
             annotation(obj) = "IlluminaHumanMethylationEpicv2"
          }
        }  
    return(obj)
}  # }}}

# {{{ getControlProbes2
getControlProbes2 <- function(NChannelSet) {
    fD <- getMethylationBeadMappers2(annotation(NChannelSet))$controls()
    ctls <- match(fD[["Address"]], featureNames(NChannelSet))

    ## FIXME: make this happen in the annotations, to avoid redundancy in
    ## names!
    rownames(fD) <- ctlnames <- make.names(fD[, "Name"], unique = T)
    fD <- data.frame(fD)
    fvD <- data.frame(labelDescription = c("Address of this control bead", "Purpose of this control bead",
        "Color channel for this bead", "Reporter group ID for this bead"))
    fDat <- new("AnnotatedDataFrame", data = fD, varMetadata = fvD)
    methylated <- assayDataElement(NChannelSet, "G")[ctls, , drop = FALSE]  # Cy3
    unmethylated <- assayDataElement(NChannelSet, "R")[ctls, , drop = FALSE]  # Cy5
    beadcount <- assayDataElement(NChannelSet, "N")[ctls, , drop = FALSE]  ## Testing TGS
    rownames(beadcount) <- rownames(methylated) <- rownames(unmethylated) <- ctlnames

    aDat <- assayDataNew(methylated = methylated, unmethylated = unmethylated, beadcount = beadcount)  #TGS
    new("MethyLumiQC", assayData = aDat, featureData = fDat, annotation = annotation(NChannelSet))
}  # }}}

## {{{ designItoMandU2 
## 27k design, both probes same channel; ~100,000 of the 450k probes as well
## epic and epic II too

designItoMandU2 <- function(NChannelSet, parallel = F, n = F, n.sd = F, oob = T) {
   mapper <- getMethylationBeadMappers2(annotation(NChannelSet))
   probes <- mapper$probes(design = "I")  # as list(G=..., R=...)
   channels <- c("G", "R")
   names(channels) <- channels

   getIntCh <- function(NChannelSet, ch, al) {
      
      newprobes <- lapply(probes, function(y) {
         lapply(y, function(x) {
            x[probes[[ch]][[al]] %in% rownames(assayDataElement(NChannelSet, ch))]
            })
        })
        a = assayDataElement(NChannelSet, ch)[as.character(newprobes[[ch]][[al]]),
            , drop = FALSE]
        rownames(a) = as.character(newprobes[[ch]][["Probe_ID"]])
        return(a)
    }  

    getOOBCh <- function(NChannelSet, ch, al) {
     
        ch.oob <- ifelse(ch == "R", "G", "R")
        newprobes <- lapply(probes, function(y) {
            lapply(y, function(x) {
                x[probes[[ch]][[al]] %in% rownames(assayDataElement(NChannelSet,
                  ch))]
            })
        })
        a = assayDataElement(NChannelSet, ch.oob)[as.character(newprobes[[ch]][[al]]),
            , drop = FALSE]
        rownames(a) = as.character(newprobes[[ch]][["Probe_ID"]])
        return(a)
    } 

    getNbeadCh <- function(NChannelSet, ch, al) {
         #tgs
        newprobes <- lapply(probes, function(y) {
            lapply(y, function(x) {
                x[probes[[ch]][[al]] %in% rownames(assayDataElement(NChannelSet,
                  ch))]
            })
        })
        n = assayDataElement(NChannelSet, "N")[as.character(newprobes[[ch]][[al]]),
            , drop = FALSE]
        rownames(n) = as.character(newprobes[[ch]][["Probe_ID"]])
        return(n)
    } 

    getAllele <- function(NChannelSet, al, parallel = F, n = n, n.sd = T, oob = T) {
     
        fluor = lapply(channels, function(ch) getIntCh(NChannelSet, ch, al))
        fluor.oob = lapply(channels, function(ch) getOOBCh(NChannelSet, ch, al))
        bc = lapply(channels, function(ch) getNbeadCh(NChannelSet, ch, al))  #tgs
        res = list()
        res[["I"]] = fluor
        if (n)
            res[["BC"]] = bc
        if (oob)
            res[["OOB"]] = fluor.oob
        lapply(res, function(r) {
            names(r) = channels
            return(r)
        })
    } 

    signal <- lapply(c(M = "M", U = "U"), function(al) {
        getAllele(NChannelSet, al, parallel = F, n = n, n.sd = n.sd, oob = oob)
    })

    retval = list(methylated = rbind(signal$M$I$R, signal$M$I$G), unmethylated = rbind(signal$U$I$R,
        signal$U$I$G))

    if (n) {
        # tgs
        retval[["m.beadcount"]] = rbind(signal$M$BC$R, signal$M$BC$G)
        retval[["u.beadcount"]] = rbind(signal$U$BC$R, signal$U$BC$G)
        # NBeads takes the lowest bead count for each type I probe.
        retval[["NBeads"]] = pmin(retval[["m.beadcount"]], retval[["u.beadcount"]])
    }
    if (oob) {
        retval[["methylated.OOB"]] = rbind(signal$M$OOB$R, signal$M$OOB$G)
        retval[["unmethylated.OOB"]] = rbind(signal$U$OOB$R, signal$U$OOB$G)
    }
    return(retval)
}  # }}}

## {{{ designIItoMandU2 
## 450k/GoldenGate design (green=methylated, red=unmethylated, single address)
# loads the annotation DB so we can run SQL queries
designIItoMandU2 <- function(NChannelSet, parallel = F, n = F, n.sd = F, oob = T) {
    mapper <- getMethylationBeadMappers2(annotation(NChannelSet))
    probes2 <- mapper$probes(design = "II")
    probes2$M <- probes2$U  ## horrid kludge   
    #readPepo assigns $U to match

    getIntCh <- function(NChannelSet, ch = NULL, al) {
        ch <- ifelse(al == "M", "G", "R")
        newprobes <- lapply(probes2, function(x) {
            x[probes2[[al]] %in% rownames(assayDataElement(NChannelSet, ch))]
        })
        a <- assayDataElement(NChannelSet, ch)[as.character(newprobes[[al]]), , drop = F]
        rownames(a) <- as.character(newprobes[["Probe_ID"]])
        return(a)
    }

    getNbeadCh <- function(NChannelSet, ch = NULL, al) {
        #  tgs
        ch <- ifelse(al == "M", "G", "R")
        newprobes <- lapply(probes2, function(x) {
            x[probes2[[al]] %in% rownames(assayDataElement(NChannelSet, ch))]
        })
        n <- assayDataElement(NChannelSet, "N")[as.character(newprobes[[al]]), ,
            drop = F]
        rownames(n) <- as.character(newprobes[["Probe_ID"]])
        return(n)
    } 

    getAllele <- function(NChannelSet, al, n = F, n.sd = F, oob = F) {
        ch <- ifelse(al == "M", "G", "R")
        res <- list()
        res[["I"]] <- getIntCh(NChannelSet, ch, al)
        if (n)
            res[["BC"]] <- getNbeadCh(NChannelSet, ch, al)
        if (oob) {
            res[["OOB"]] <- res[["I"]]
            is.na(res[["OOB"]]) <- TRUE
        }
        return(res)
    }  # 

    ## M == Grn/Cy3 and U == Red/Cy5, same address
    alleles = c(M = "M", U = "U")
    signal = lapply(alleles, function(a) getAllele(NChannelSet, a, n, n.sd, oob))

    retval = list(methylated = signal$M$I, unmethylated = signal$U$I)
    if (n) {
        # tgs
        retval[["m.beadcount"]] = signal$M$BC
        retval[["u.beadcount"]] = signal$U$BC
        retval[["NBeads"]] = signal$M$BC  #TGSbc
    }

    if (oob) {
        retval[["methylated.OOB"]] = signal$M$OOB
        retval[["unmethylated.OOB"]] = signal$U$OOB
    }
    return(retval)
}  # }}}

#{{{ mergeProbeDesigns2
## TGS: possible to do this better?
mergeProbeDesigns2 <- function(
   NChannelSet, 
   parallel = F, 
   n = F, n.sd = F, 
   oob = T, too=TRUE
) {

   if(length(annotation(NChannelSet))==0) annotation(NChannelSet) <- '.manifest'
   if( !too | annotation(NChannelSet)== "IlluminaHumanMethylation27k") {

      res <- designItoMandU2(
          NChannelSet, parallel = parallel, n = n, n.sd = n.sd, oob = oob
      )

   } else {  # all subsequent human arrays

      design1 <- designItoMandU2(
         NChannelSet, parallel = parallel, n = n, n.sd = n.sd, oob = oob
      )
      design2 <- designIItoMandU2(
         NChannelSet, parallel = parallel, n = n, n.sd = n.sd, oob = oob
      )
      res <- list()
      for (i in names(design1)) {
         res[[i]] <- rbind(design1[[i]], design2[[i]])
         rownames(res[[i]]) <- c(rownames(design1[[i]]), rownames(design2[[i]]))
      }

    }
    return(res)  # reorder on the way out...
}  # }}}

# {{{ NChannelSetToMethyLumiSet2 

#' For internal use, is read using minfi-like machinery and then
#'   preprocessed into the more flexible and convenient methylumi 
#'   object used by wateRmelon/bigmelon
#'
#' @param NChannelSet an NChannelSet (raw red and green values not yet 
#'    mapped to Illumina IDs/CpG names
#' @param parallel  no effect, included for future parallelisation
#' @param pval detection pval threshold for filtering.  Inactivated.
#' @param n keep nbeads data (min of m & u)
#' @param n.sd process SD of U and M (not currently implemented) 
#' @param oob  keep out-of-band signals
#' @param to does chip have type I and II probes?
#' 
#' @return  A methylumi object with betas, U and M, optionally 
#'   additional data

NChannelSetToMethyLumiSet2 <- function(
   NChannelSet, 
   parallel = F, 
   pval = 0.05, 
   n = F,
   n.sd = F, 
   oob = T, 
   to=TRUE
){
   history.submitted = as.character(Sys.time())
   results = mergeProbeDesigns2(
      NChannelSet, 
      parallel = parallel, 
      n = n, 
      n.sd = n.sd,
      oob = oob, 
      too=to
   )
#The next part is somewhat messy, will think of an better way to do this
# ----especially since these are non-options
   if (oob && n) {
      aDat <- with(
         results, 
         assayDataNew(
            methylated = methylated, 
            unmethylated = unmethylated,
            methylated.N = m.beadcount, 
            unmethylated.N = u.beadcount, 
            methylated.OOB = methylated.OOB,
            unmethylated.OOB = unmethylated.OOB, 
            betas = methylated/(methylated + unmethylated), 
            # should this not be +100 in denom?
            pvals = methylated/(methylated + unmethylated), 
            # is there really not a less shit alternative?
            NBeads = NBeads)
        )
    } else if (oob) {
        aDat <- with(
           results, 
           assayDataNew(
              methylated = methylated, 
              unmethylated = unmethylated,
              methylated.OOB = methylated.OOB, 
              unmethylated.OOB = unmethylated.OOB,
              betas = methylated/(methylated + unmethylated), 
              pvals = methylated/(methylated + unmethylated)
           )
        )
    } else if (n) {
        aDat <- with(
           results, 
           assayDataNew(
              methylated = methylated, 
              unmethylated = unmethylated,
              methylated.N = m.beadcount, 
              unmethylated.N = u.beadcount, 
              betas = methylated/(methylated + unmethylated), 
              pvals = methylated/(methylated + unmethylated), 
              NBeads = NBeads
           )
        )
    } else {
        aDat <- with(
           results, 
           assayDataNew(
              methylated = methylated, 
              unmethylated = unmethylated,
              betas = methylated/(methylated + unmethylated), 
              pvals = methylated/(methylated + unmethylated)
           )
        )
    }
    rm(results)
    gc()

    ## now return the MethyLumiSet (which can be directly coerced to
    ## MethyLumiM)
    x.lumi = new("MethyLumiSet", assayData = aDat)
    x.lumi@QC <- getControlProbes2(NChannelSet)
    x.lumi@protocolData <- protocolData(NChannelSet)
    x.lumi@annotation <- annotation(NChannelSet)
    x.lumi@QC@annotation <- annotation(NChannelSet)
    pdat <- data.frame(barcode = sampleNames(NChannelSet))
    rownames(pdat) <- sampleNames(NChannelSet)
    pData(x.lumi) <- pdat
    varLabels(x.lumi) <- c("barcode")
    varMetadata(x.lumi)[, 1] <- c("Illumina BeadChip barcode")
    mapper <- getMethylationBeadMappers2(annotation(NChannelSet))
    fdat <- mapper$ordering()
    rownames(fdat) <- fdat$Probe_ID
    x.fnames <- rownames(betas(x.lumi))
    fdat <- fdat[x.fnames, ]

    ## Regression tests: fail noisily if there is an ordering issue
    stopifnot(identical(rownames(methylated(x.lumi)), rownames(unmethylated(x.lumi))))
    stopifnot(identical(rownames(betas(x.lumi)), rownames(unmethylated(x.lumi))))
    stopifnot(identical(rownames(fdat), rownames(betas(x.lumi))))
    # The only way to feasibly clean up this section would be to outsource the
    # possible metadatas to another script.
    fData(x.lumi) <- fdat
    possibleLabels <- c("Probe_ID", "DESIGN", "COLOR_CHANNEL", "PROBE_TYPE", "SNP10",
        "SYMBOL", "CHR36", "CPG36", "CPGS")
    fvarLabels(x.lumi) <- possibleLabels[1:ncol(fdat)]
    possibleMetadata <- c("Illumina probe ID from manifest", "Infinium design type (I or II)",
        "Color channel (for type I probes)", "Probe locus type (CpG, CpH, or SNP)",
        "SNP (dbSNP build 128) within 10bp of target?", "Gene symbol (if probe is annotated to a gene)",
        "Chromosome mapping for probe in hg18 assembly", "Coordinates of interrogated cytosine in hg18",
        "Number of CpG dinucleotides in probe sequence")
    # something happens to readPepo betas after this point
    fvarMetadata(x.lumi)[, 1] <- possibleMetadata[1:ncol(fdat)]
    #pval.detect(x.lumi) <- pval  # culprit!
    history.finished <- as.character(Sys.time())  
    history.command <- deparse(match.call()) # can't be it
    x.lumi@history <- rbind(
       x.lumi@history, 
       data.frame(
          submitted = history.submitted,
          finished = history.finished, 
          command = history.command)
       ) # cbi
    # if(normalize) x.lumi = normalizeMethyLumiSet(x.lumi)
    return(x.lumi)
}  # }}}

# {{{ methylumIDATepic
methylumIDATepic <- function(barcodes = NULL, pdat = NULL, parallel = F, n = F, n.sd = F,
    oob = T, idatPath = getwd(), force = F, tw=TRUE, ...) {
    if (is(barcodes, "data.frame"))
        pdat = barcodes
    if ((is.null(barcodes)) & (is.null(pdat) | (!("barcode" %in% names(pdat)))))
        {
            stop("\"barcodes\" or \"pdat\" (with pdat$barcode defined) must be supplied.")
        }
    if (!is.null(pdat) && "barcode" %in% tolower(names(pdat))) {
        names(pdat)[which(tolower(names(pdat)) == "barcode")] = "barcode"
        barcodes = pdat$barcode
        if (any(grepl("idat", ignore.case = TRUE, barcodes))) {
            message("Warning: filtering out raw filenames")
            barcodes = gsub("_(Red|Grn)", "", barcodes, ignore.case = TRUE)
            barcodes = gsub(".idat", "", barcodes, ignore.case = TRUE)
        }
        if (any(duplicated(barcodes)))
            {
                message("Warning: filtering out duplicates")
                pdat = pdat[-which(duplicated(barcodes)), ]
                barcodes = pdat$barcode
            } 
    } else {
        if (any(grepl("idat", ignore.case = TRUE, barcodes))) {
            message("Warning: filtering out raw filenames")
            barcodes = unique(gsub("_(Red|Grn)", "", barcodes, ignore.case = TRUE))
            barcodes = unique(gsub(".idat", "", barcodes, ignore.case = TRUE))
        }
        if (any(duplicated(barcodes))) {
            message("Warning: filtering out duplicate barcodes")
            barcodes = barcodes[which(!duplicated(barcodes))]
        }
    } 
    files.present = rep(TRUE, length(barcodes)) 
    idats = sapply(barcodes, function(b) paste(b, c("_Red", "_Grn"), ".idat", sep = ""))
    for (i in colnames(idats)) for (j in idats[, i]) if (!j %in% list.files(idatPath,
        recursive = TRUE)) {
        message(paste("Error: file", j, "is missing for sample", i))
        files.present = FALSE
    }
    stopifnot(all(files.present)) 

    mats <- IDATsToMatrices2(barcodes, parallel = parallel, idatPath = idatPath)
    dats <- DataToNChannelSet2(mats, IDAT = T, parallel = parallel, force = force)
    mlumi <- NChannelSetToMethyLumiSet2(dats, parallel = parallel, oob = oob, n = n, to=tw)

    if (is.null(pdat)) {
        pdat = data.frame(barcode = as.character(barcodes))
        rownames(pdat) = pdat$barcode
        pData(mlumi) = pdat 
    } else {
        pData(mlumi) = pdat
    }
    if (!is.null(mlumi@QC))
        {
            sampleNames(mlumi@QC) = sampleNames(mlumi)
        } 

    # finally
    colnames(mlumi) <- as.character(colnames(mlumi))  # Fixing _that_ one bug.
    return(mlumi[sort(featureNames(mlumi)), ])
}  # }}}

#{{{ readEPIC
readEPIC <- function(idatPath, barcodes = NULL, pdat = NULL, parallel = F, n = T,
    oob = F, force = F, two=TRUE, ...) {
    # path: file path to folder containing idat files, if working directory
    # is folder containing idats, use path <- './' the gsub will catch all
    # types of arrays as it is technically impossible to distinguish chiptype
    # through barcode.
    if (is.null(barcodes)) {
        barcodes <- idatPath
        methylumIDATepic(bfp(barcodes), pdat = pdat, parallel = parallel, n = n,
            n.sd = n.sd, oob = oob, idatPath = idatPath, force = force, tw=two, ...)
    } else {
        methylumIDATepic(barcodes, pdat = pdat, parallel = parallel, n = n, n.sd = n.sd,
            oob = oob, idatPath = idatPath, force = force, tw=two, ...)
    }

}  # }}}

bfp <- function(path) { #{{{
    # Barcodes from path, incase one wishes to use MethylumIDATepic manually.
    # bar <- dir(path, recursive=T)[grepl('R0[12345678]C0[12]_(Red|Grn).idat',
    # dir(path, recursive=T))]
    bar <- dir(path, recursive = TRUE)[grepl("_(Red|Grn).idat", dir(path, recursive = TRUE))]
    bar <- gsub("_(Red|Grn).idat", "", bar, ignore.case = TRUE)
    # Automatically sift out duplicated filenames!
    bar <- bar[!duplicated(basename(bar))]
    return(bar)
}#}}}

generateManifest <- function(anno = c("450k", "EPIC", "EPICv2", ".manifest")) { #{{{
    anno <- match.arg(anno)
    # Possible to add more manifests here!
    anno <- switch(anno, 
        `450k` = "IlluminaHumanMethylation450kanno.ilmn12.hg19",
        EPIC   = "IlluminaHumanMethylationEPICanno.ilm10b4.hg19", 
        EPICv2 = "IlluminaHumanMethylationEPICv2anno.20a1.hg38",
        .manifest = "alreadyCooked"
    )
    if (anno == "alreadyCooked") {man <- .manifest} else
    {man <- getAnnotationObject(anno)}
    x <- getAnnotation(man)[, c("Name", "AddressB", "AddressA", "Type", "Color")]
    # Name = Name AddressB = M AddressA = U Type = DESIGN Color = COLOR_CHANNEL
    # Generate manifest.
    snpI  <- getProbeInfo(man, type = "SnpI" )[, c(1, 2, 3, 4)]
    snpII <- getProbeInfo(man, type = "SnpII")[, c(1, 2)]
    snpI  <- cbind(snpI[, c("Name", "AddressB", "AddressA")], rep("I", nrow(snpI)),
        snpI[, "Color"])
    snpII <- cbind(snpII[, "Name"], rep("", nrow(snpII)), snpII[, "AddressA"], rep("II",
        nrow(snpII)), rep("", nrow(snpII)))
    colnames(snpI) <- colnames(snpII) <- colnames(x)
    x1 <- rbind(data.frame(x, stringsAsFactors = F), data.frame(snpI, stringsAsFactors = F),
        data.frame(snpII, stringsAsFactors = F))
    x1$col <- x1$Color
    x1$Color[x1$Color == ""   ] <- "Both"
    x1$col  [x1$Color == "Red"] <- "R"
    x1$col  [x1$Color == "Grn"] <- "G"
    is.na(x1$col) <- x1$Color == "Both"
    colnames(x1) <- c("Probe_ID", "M", "U", "DESIGN", "COLOR_CHANNEL", "col")
    x1$COLOR_CHANNEL <- factor(x1$COLOR_CHANNEL)
    x1$col <- factor(x1$col)
    return(x1)
} #}}}

# memory hole {{{
## for HM27k, all probes are of "design I": single-channel, two-address pairings
## for HM450k, probes are either "design I" or "design II" as noted in manifest!
## for epic, probes are either "design I" or "design II" as noted in manifest
## IDATs from GoldenGate methylation arrays are not supported at this time.
#cy3 <- function(object) { #
#  if(is.element('Color_Channel', fvarLabels(object)) &&
#     !is.element('COLOR_CHANNEL', fvarLabels(object))) {
#    fvarLabels(object)<-gsub('Color_Channel','COLOR_CHANNEL',fvarLabels(object))
#  }
#  if(!is.element('COLOR_CHANNEL', fvarLabels(object))) {
#    annotChip <- paste(annotation(object),'db',sep='.')
#    object <- addColorChannelInfo(object, annotChip)
#  }
#  return(which(fData(object)$COLOR_CHANNEL=='Grn'))
#} #


#cy5 <- function(object) { #
#  if(is.element('Color_Channel', fvarLabels(object)) &&
#     !is.element('COLOR_CHANNEL', fvarLabels(object))) {
#    fvarLabels(object)<-gsub('Color_Channel','COLOR_CHANNEL',fvarLabels(object))
#  }
#  if(!is.element('COLOR_CHANNEL', fvarLabels(object))) {
#    annotChip <- paste(annotation(object),'db',sep='.')
#    object <- addColorChannelInfo(object, annotChip)
#  }
#  return(which(fData(object)$COLOR_CHANNEL=='Red'))
#} # }}}

#.onLoad = function(libname, pkgname){ 
#  anchorenv <- new.env(parent = getNamespace("wateRmelon"))
#  assign("anchorenv", anchorenv, envir = getNamespace("wateRmelon"))
#  assign("types", character(0), envir=anchorenv)
#  assign("index", list(), envir=anchorenv)
#  assign("history", list(), envir=anchorenv)
#}  method of creading a hidey hole for packge variables see 
#https://hydroecology.net/implementing-session-cache-r-packages/
schalkwyk/wateRmelon documentation built on June 30, 2024, 7:37 a.m.