# R/read.R

# SPDX-License-Identifier: AGPL-3.0-or-later
# Copyright (C) 2021  Kevin Lu
# Parts modified from EpigenCentral

#' Read Illumina 450k methylation array IDATs specified in a sample sheet
#'
#' A sample sheet is a CSV with at least the following columns:
#' Sample_Name, Sample_Group, Sentrix_ID, Sentrix_Position
#' Further columns may annotate with other metadata about the samples for
#' other types of analysis not performed by this package.
#'
#' Sample_Name should be unique. Sample_Group should be either "control" or the
#' name of a case type, which will be used to differentiate the classes when
#' finding differentially methylated CpGs.
#'
#' The IDATs are looked up in the same directory as the sample sheet.
#' paste(Sentrix_ID, Sentrix_Position, sep = "_") should result in a prefix of
#' the filenames of the corresponding red and green channel IDATs; it is passed
#' to minfi as the Basename of each sample.
#'
#' An error is raised before any IDAT is read if the sample sheet lacks the
#' Sentrix_ID or Sentrix_Position column, or if it lists no samples.
#'
#' An example for the GSE55491 dataset of Silver-Russell syndrome patients is
#' in extdata/
#'
#' @param csv_sample_sheet_file Path to a CSV sample sheet
#'
#' @return normalized minfi GenomicRatioSet
#'
#' @examples
#' \dontrun{
#' grset <- read_idat("extdata/GSE55491/samplesheet.rss-GSE55491.csv")
#' }
#' @references
#' Prickett AR, Ishida M, Böhm S, Frost JM et al. Genome-wide methylation analysis
#' in Silver-Russell syndrome patients. Hum Genet 2015 Mar;134(3):317-332. PMID: 25563730
#'
#' @export
read_idat <- function (csv_sample_sheet_file) {
  # dirname() below needs a real path, so reject anything else up front.
  if (!is.character(csv_sample_sheet_file) ||
      length(csv_sample_sheet_file) != 1L ||
      is.na(csv_sample_sheet_file)) {
    stop("`csv_sample_sheet_file` must be a single file path", call. = FALSE)
  }

  targets <- utils::read.csv(csv_sample_sheet_file, strip.white = TRUE)

  # Without this check a missing column is NULL, paste() silently collapses to
  # "<dir>/" and the failure only shows up as an obscure error inside minfi.
  missing_cols <- setdiff(c("Sentrix_ID", "Sentrix_Position"), names(targets))
  if (length(missing_cols) > 0L) {
    stop(
      "Sample sheet is missing required column(s): ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }
  if (nrow(targets) == 0L) {
    stop("Sample sheet does not list any samples", call. = FALSE)
  }

  # `[[` avoids the partial name matching that `$` performs on data frames.
  sentrix_id <- targets[["Sentrix_ID"]]
  if (is.numeric(sentrix_id)) {
    # read.csv parses all-digit IDs as numbers; plain coercion can render
    # round values in scientific notation (e.g. "9e+09") and break the path.
    sentrix_id <- format(sentrix_id, scientific = FALSE, trim = TRUE)
  }

  targets$Basename <- file.path(
    dirname(csv_sample_sheet_file),
    paste(sentrix_id, targets[["Sentrix_Position"]], sep = "_")
  )

  # Read raw intensities, normalize, map probes to genomic coordinates and
  # convert to a ratio (beta value) representation.
  rgset <- minfi::read.metharray.exp(targets = targets)
  mset <- minfi::preprocessIllumina(rgset)
  gmset <- minfi::mapToGenome(mset)
  minfi::ratioConvert(gmset)
}

#' Parse tab-delimited methylation matrices often found in GEO datasets.
#'
#' This is meant to parse the uncompressed Series Matrix files when IDATs
#' are not available. A CSV sample sheet of the same format required by read_idat
#' should still be provided to annotate Sample_Group for analysis, otherwise
#' you will only be able to generate PCA plots.
#'
#' The first column of the matrix is taken as the probe ID and every other
#' column as one sample. Lines starting with "!" are skipped and only rows
#' whose probe ID starts with "cg" are kept.
#'
#' When a sample sheet is given it must have a Sample_Name column and one row
#' per sample column of the matrix. If its Sample_Name values are exactly the
#' matrix column names in a different order, the sheet is reordered to match
#' the matrix; otherwise rows are matched to columns by position.
#'
#' @param tsv_beta_matrix_file Path to the GEO text file
#' @param csv_sample_sheet_file Path to the sample sheet for annotations. Optional
#'
#' @examples
#' \dontrun{
#' grset <- read_geo_tsv("extdata/GSE55491/GSE55491_series_matrix.txt")
#' }
#' @references
#' Prickett AR, Ishida M, Böhm S, Frost JM et al. Genome-wide methylation analysis
#' in Silver-Russell syndrome patients. Hum Genet 2015 Mar;134(3):317-332. PMID: 25563730
#'
#' @return parsed minfi GenomicRatioSet
#' @export
read_geo_tsv <- function (tsv_beta_matrix_file, csv_sample_sheet_file = NULL) {
  betas <- utils::read.delim(
    tsv_beta_matrix_file,
    strip.white = TRUE,
    comment.char = "!"
  )
  if (ncol(betas) < 2L) {
    stop(
      "`tsv_beta_matrix_file` must contain a probe ID column and ",
      "at least one sample column",
      call. = FALSE
    )
  }

  # Transform to structure expected by minfi.
  # drop = FALSE keeps a matrix (and its sample name) even when there is only
  # a single sample column or a single matching probe.
  methyls <- data.matrix(betas[, -1, drop = FALSE])
  rownames(methyls) <- betas[[1]]
  methyls <- methyls[grepl("^cg", rownames(methyls)), , drop = FALSE]

  # Validate the sample sheet before the comparatively expensive minfi calls
  pheno <- NULL
  if (!is.null(csv_sample_sheet_file)) {
    pheno <- utils::read.csv(csv_sample_sheet_file, strip.white = TRUE)
    if (!"Sample_Name" %in% names(pheno)) {
      stop("Sample sheet is missing required column: Sample_Name", call. = FALSE)
    }
    if (nrow(pheno) != ncol(methyls)) {
      stop(
        "Sample sheet lists ", nrow(pheno), " sample(s) but the matrix has ",
        ncol(methyls), " sample column(s)",
        call. = FALSE
      )
    }
  }

  # From https://support.bioconductor.org/p/73941/
  rset <- minfi::RatioSet(Beta = methyls)
  BiocGenerics::annotation(rset) <- c(
    array = "IlluminaHumanMethylation450k",
    annotation = "ilmn12.hg19"
  )
  grset <- minfi::mapToGenome(rset)

  # Infer sample sheet as needed
  if (is.null(pheno)) {
    pheno <- data.frame(
      Sample_Name = colnames(grset),
      Sentrix_ID = 0,
      Sentrix_Position = 0
    )
  } else {
    # Rows are applied to columns by position below. When the sheet names the
    # very same samples in another order, align it first so that no sample is
    # relabelled with another sample's name and group.
    matrix_names <- colnames(grset)
    sheet_names <- as.character(pheno[["Sample_Name"]])
    if (anyDuplicated(sheet_names) == 0L &&
        setequal(sheet_names, matrix_names) &&
        !identical(sheet_names, matrix_names)) {
      pheno <- pheno[match(matrix_names, sheet_names), , drop = FALSE]
      rownames(pheno) <- NULL
    }
  }

  # Annotate GenomicRatioSet with sample sheet
  sample_names <- as.character(pheno[["Sample_Name"]])
  Biobase::pData(grset) <- methods::as(pheno, "DataFrame")
  colnames(grset) <- sample_names
  rownames(Biobase::pData(grset)) <- sample_names

  grset
}

# [END]
# kevinlul/EpigeneLite documentation built on Dec. 21, 2021, 6:35 a.m.