In omicsr/dmapaq: DNA Methylation Arrays Processing And Quality-Control

options(stringsAsFactors = FALSE)


### Environment ====================================================================================
output_directory <- params[["output_directory"]]
dir.create(output_directory, showWarnings = FALSE, recursive = TRUE, mode = "0777")


### Load packages ==================================================================================
suppressPackageStartupMessages({
  library("ragg")
  library("stats")
  library("utils")
  library("data.table")
  library("dmapaq")
  library("ENmix")
  library("flashpcaR")
  library("ggdendro")
  library("ggplot2")
  library("ggrepel")
  library("ggtext")
  library("gt")
  library("knitr")
  library("minfi")
  library("parallel")
  library("patchwork")
  library("readr")
  library("RefFreeEWAS")
  library("scales")
  library("sessioninfo")
  library("sva")
  # library("rmarkdown")
  # library("bookdown")
  # library("IlluminaHumanMethylation450kmanifest")
  # library("IlluminaHumanMethylationEPICmanifest") # renv::install("achilleasNP/IlluminaHumanMethylationEPICmanifest")
  # library("IlluminaHumanMethylationEPICanno.ilm10b5.hg38") # renv::install("achilleasNP/IlluminaHumanMethylationEPICanno.ilm10b5.hg38")
  # library("preprocessCore") # options(configure.args = "--disable-threading"); renv::install("bmbolstad/preprocessCore")
  library("FlowSorted.CordBloodCombined.450k")
  library("FlowSorted.Blood.EPIC")
})


### knitr settings =================================================================================
knitr::opts_chunk$set(
  results = "asis",
  include = TRUE,
  echo = FALSE,
  warning = FALSE,
  message = FALSE,
  dpi = 120,
  tidy = FALSE,
  crop = TRUE,
  autodep = TRUE,
  fig.align = "center",
  dev = "ragg_png"
)


### Define theme ===================================================================================
ggplot2::theme_set(
  ggplot2::theme_minimal(base_size = 11) +
    ggplot2::theme(
      plot.title = ggtext::element_markdown(),
      plot.title.position = "plot",
      plot.subtitle = ggtext::element_markdown(face = "italic", size = ggplot2::rel(0.8)),
      plot.caption = ggtext::element_markdown(face = "italic", size = ggplot2::rel(0.5)),
      plot.caption.position = "plot",
      axis.title.x = ggtext::element_markdown(),
      axis.title.y = ggtext::element_markdown(),
      axis.text.x = ggtext::element_markdown(),
      axis.text.x.top = ggtext::element_markdown(),
      axis.text.y = ggtext::element_markdown()
    )
)


### Functions ======================================================================================
`:=` <- data.table::`:=`
`%>%` <- gt::`%>%`

percent0 <- scales::percent_format(accuracy = 1, suffix = " %")
percent1 <- scales::percent_format(accuracy = 0.1, suffix = " %")
percent2 <- scales::percent_format(accuracy = 0.01, suffix = " %")
comma <- scales::comma_format(accuracy = 1)
scientific <- function(x, ...) gsub("(.*)e([-+]*)0*(.*)", "\\1 &times; 10<sup>\\2\\3</sup>", scales::scientific(x, ...))


### Sex check ======================================================================================
do_check_sex <- !is.null(params[["sex_colname"]])


### Cell heterogeneity =============================================================================
if (nchar(system.file(package = "FlowSorted.Blood.EPIC")) == 0) {
  do_cell_composition <- FALSE
  message('Package "FlowSorted.Blood.EPIC" is not available.')
} else {
  do_cell_composition <- !is.null(params[["cell_tissue"]])
}

Methods and Parameters

Array: r params[["array"]]
Annotation package from bioconductor: r params[["annotation"]]

Call Rate
- filter_callrate: r params[["filter_callrate"]]
- The threshold for the detection pvalues: r params[["detection_pvalues"]]
- The call rate threshold for samples: r params[["callrate_samples"]]
  => should samples with less than the specified call rate (r params[["callrate_samples"]]) for detection p-values below $\alpha=r params[["detection_pvalues"]]$ be removed?
- The call rate threshold for probes: r params[["callrate_probes"]]
  => should probes with less than the specified call rate (r params[["callrate_probes"]]) for detection p-values below $\alpha=r params[["detection_pvalues"]]$ be removed?
Pre-Processing
- The method to estimate background normal distribution parameters: r params[["norm_background"]]
  => method from ENmix
- The dye bias correction: r params[["norm_dye"]]
  => method from ENmix
- The quantile normalisation: r params[["norm_quantile"]]
  => method from ENmix
Probes Filtering
- filter_snps: r params[["filter_snps"]]
  => should probes in which the probed CpG falls near a SNP be removed? (Zhou et al., 2016)
- filter_non_cpg: r params[["filter_non_cpg"]]
  => should non-cg probes be removed?
- filter_xy: r params[["filter_xy"]]
  => should probes from X and Y chromosomes be removed?
- filter_multihit: r params[["filter_multihit"]] (Nordlund et al., 2013)
  => should probes which align to multiple locations be removed?
- filter_beads: r params[["filter_beads"]]
  => should probes with less than three beadcount in at least r scales::percent(params[["bead_cutoff"]]) of the samples be removed?
Sex Check
- The threshold value to discrimate sex: r params[["sex_threshold"]]
  => flag samples with sex discrepancy.
Ethnicity Check
- Name of the ethnicity population: r if (is.null(params[["population"]])) "Not defined" else params[["population"]]
Cell Composition
- The cell tissue: r params[["cell_tissue"]]
  => using a reference panel or RefFreeCellMix method from RefFreeEWAS
Final Processing
- Probe design type bias correction: r "rcp"
  => method from ENmix
- Batch effect normalisation: r "ComBat"
  => method from sva

Quality Control

sample_sheet <- data.table::fread(params[["csv_file"]], skip = "Sample_Name")
sample_sheet[ 
  j = Sample_ID := {
    x <- as.character(1:.N)
    data.table::fifelse(x == "1", as.character(Sample_Name), paste(Sample_Name, x, sep = "_"))
  }, 
  by = "Sample_Name"
]

if (any(grepl("^[0-9]", sample_sheet[["Sample_ID"]]))) {
  sample_sheet[
    j = Sample_ID := paste0("ID_", Sample_ID)
  ]
}

data.table::setcolorder(sample_sheet, neworder = "Sample_ID")

if (do_check_sex) {
  sample_sheet[
    j = qc_observed_sex := c(
      "1" = 1, "2" = 2, "M" = 1, "F" = 2, "0" = 2
    )[as.character(get(params[["sex_colname"]]))]
  ]
  pca_vars <- intersect(colnames(sample_sheet), unique(c(params[["pca_vars"]], "qc_observed_sex")))
} else {
  pca_vars <- intersect(colnames(sample_sheet), params[["pca_vars"]])
}

data.table::fwrite(x = sample_sheet, file = file.path(tempdir(), "sample_sheet.csv"))

data_idats <- read_idats(
  directory = params[["data_directory"]],
  csv_file = file.path(tempdir(), "sample_sheet.csv"),
  meth_value_type = "B",
  filter_beads = params[["filter_beads"]],
  bead_cutoff = params[["bead_cutoff"]],
  filter_non_cpg = params[["filter_non_cpg"]],
  filter_snps = params[["filter_snps"]],
  population = params[["population"]],
  filter_multihit = params[["filter_multihit"]],
  filter_xy = params[["filter_xy"]],
  detection_pvalues = params[["detection_pvalues"]],
  filter_callrate = params[["filter_callrate"]],
  callrate_samples = params[["callrate_samples"]],
  callrate_probes = params[["callrate_probes"]],
  norm_background = params[["norm_background"]],
  norm_dye = params[["norm_dye"]],
  norm_quantile = params[["norm_quantile"]],
  array_name = params[["array"]],
  annotation_version = params[["annotation"]],
  n_cores = min(c(nrow(sample_sheet), 25, parallel::detectCores()))
)
data_mset <- data_idats[["mset"]]
phenotypes <- as.data.table(data_mset@metadata[["phenotypes"]])[
  j = c("Sample_ID", "Sample_Plate", "Sentrix_ID") := lapply(.SD, as.character),
  .SDcols = c("Sample_ID", "Sample_Plate", "Sentrix_ID")
]
data_rgset <- data_idats[["rgset"]]
readr::write_rds(x = data_idats, file = file.path(output_directory, paste0(params[["array"]], "_idats.rds")))

cat(paste("+", data_idats[["log"]]), sep = "\n")

phenotypes[
  j = sex_fct := factor(qc_observed_sex, levels = c(1, 2, 0), labels = c("Male", "Female", "Unspecified"))
]

phenotypes[j = .N, by = sex_fct][j = list(sex_fct, N)] %>% 
  gt::gt(auto_align = TRUE) %>%
  gt::tab_header(
    title = "Samples Available", 
    subtitle = gt::md("*EPIC Array*")
  ) %>%
  gt::fmt_number(columns = "N", decimals = 0) %>% 
  gt::grand_summary_rows(
    columns = "N",
    fns = list(Total = ~ sum(.)),
    formatter = gt::fmt_number,
    decimals = 0
  ) %>% 
  gt::cols_label(sex_fct = "Sex") %>% 
  gt::opt_row_striping() %>% 
  gt::opt_all_caps() %>% 
  print()

callrate_thresholds <- sort(unique(c(params[["callrate_samples"]], 0.90, 0.95, 0.97, 0.98, 0.99, 1)), decreasing = TRUE)
data.frame(
  X1 = percent2(callrate_thresholds),
  X2 = rowSums(sapply(phenotypes[["call_rate"]], "<", callrate_thresholds)),
  X3 = percent2(rowSums(sapply(phenotypes[["call_rate"]], "<", callrate_thresholds)) / nrow(phenotypes))
) %>% 
  gt::gt(auto_align = TRUE) %>% 
  gt::tab_header("Number Of Samples To Exclude Based On Call Rate Thresholds") %>% 
  gt::tab_style(
    style = gt::cell_fill(color = "dodgerblue", alpha = 0.5), 
    locations = gt::cells_body(
      columns = gt::everything(), 
      rows = X1 == percent2(params[["callrate_samples"]])
    )
  ) %>% 
  gt::cols_label(
    X1 = "Call Rate Threshold",
    X2 = "Samples to Exclude (N)",
    X3 = "Samples to Exclude (%)"
  ) %>% 
  gt::opt_row_striping() %>% 
  gt::opt_all_caps() %>% 
  print()

ggplot2::ggplot(
  data = phenotypes[order(call_rate), list(Sample_ID, call_rate)][
    j = c("x", "labs") := list(1:.N, ifelse(call_rate < params[["callrate_samples"]], Sample_ID, NA))
  ]
) +
  ggplot2::aes(x = seq_along(call_rate), y = call_rate) +
  ggplot2::geom_point(
    colour = scales::viridis_pal(begin = 0.5, end = 0.5)(1), 
    shape = 1, 
    na.rm = TRUE
  ) +
  ggrepel::geom_label_repel(
    data = ~ .x[j = labs := if (sum(!is.na(labs)) > params[["max_labels"]]) NA else labs],
    mapping = ggplot2::aes(label = labs),
    min.segment.length = ggplot2::unit(0, "lines"),
    force = 10, 
    fill = "white",
    colour  = "firebrick2", 
    segment.colour = "firebrick2", 
    size = 3, 
    na.rm = TRUE
  ) +
  ggplot2::geom_hline(
    mapping = ggplot2::aes(yintercept = params[["callrate_samples"]]),
    colour = "firebrick2", 
    linetype = 2
  ) +
  ggplot2::scale_x_continuous(labels = comma, trans = "log10") +
  ggplot2::scale_y_continuous(
    breaks = function(x) unique(c(scales::breaks_extended()(x), params[["callrate_samples"]])),
    labels = function(x) ifelse(x == params[["callrate_samples"]], paste0("<b style='color:firebrick2;'>", percent1(x), "</b>"), percent1(x))
  ) +
  ggplot2::labs(x = "Number of Samples", y = "Call Rate", title = "Sample Call Rate")

Cell composition

if (
  grepl("^blood$", params[["cell_tissue"]], ignore.case = TRUE) &
    nchar(system.file(package = paste0("FlowSorted.Blood.", params[["array"]]))) == 0
) {
  do_cell_composition <- FALSE
  message(paste0('Package "', paste0("FlowSorted.Blood.", params[["array"]]), '" is not available.'))
}

if (
  grepl("^cordblood$", params[["cell_tissue"]], ignore.case = TRUE) &
    nchar(system.file(package = "FlowSorted.CordBloodCombined.450k")) == 0
) {
  do_cell_composition <- FALSE
  message('Package "FlowSorted.CordBloodCombined.450k" is not available.')
}

if (
  (
    !grepl("^blood$", params[["cell_tissue"]], ignore.case = TRUE) &
      !grepl("^cordblood$", params[["cell_tissue"]], ignore.case = TRUE)
  ) &
    nchar(system.file(package = "RefFreeEWAS")) == 0
) {
  do_cell_composition <- FALSE
  message('Package "RefFreeEWAS" is not available.')
}

if (!do_cell_composition) {
  cat("No cell tissue was provided or no available reference set (packages).\n")
} else {
  cat("IDOL optimised CpGs are used when available in the relevant reference set or method.\n")
}

cell_comp <- switch(
  EXPR = tolower(params[["cell_tissue"]]),
  "blood" = {
    idol_cpgs <- switch(params[["array"]],
      "450k" = FlowSorted.Blood.EPIC::IDOLOptimizedCpGs450klegacy,
      "EPIC" = FlowSorted.Blood.EPIC::IDOLOptimizedCpGs
    )
    out <- FlowSorted.Blood.EPIC::estimateCellCounts2(
      rgSet = data_rgset,
      compositeCellType = "Blood",
      processMethod = "preprocessNoob",
      probeSelect = "IDOL",
      cellTypes = c("CD8T", "CD4T", "NK", "Bcell", "Mono", "Neu"),
      referencePlatform = paste0("IlluminaHumanMethylation", params[["array"]]),
      IDOLOptimizedCpGs = idol_cpgs,
      returnAll = FALSE,
      meanPlot = FALSE,
      verbose = FALSE
    )$counts
    colnames(out) <- paste0("CellT_", colnames(out))
    out
  },
  "cordbloodlegacy" = {
    out <- FlowSorted.Blood.EPIC::estimateCellCounts2(
      rgSet = data_rgset,
      compositeCellType = "CordBlood",
      processMethod = "preprocessNoob",
      probeSelect = "auto",
      cellTypes = c("Bcell", "CD4T", "CD8T", "Gran", "Mono", "NK", "nRBC"),
      referencePlatform = "IlluminaHumanMethylation450k",
      IDOLOptimizedCpGs = NULL,
      returnAll = FALSE,
      meanPlot = FALSE,
      verbose = FALSE
    )$counts
    colnames(out) <- paste0("CellT_", colnames(out))
    out
  },
  "cordblood" = {
    if (nchar(system.file(package = "FlowSorted.CordBloodCombined.450k")) > 0) {
      idol_cpgs <- get(utils::data("IDOLOptimizedCpGsCordBlood", package = "FlowSorted.CordBloodCombined.450k"))
    } else {
      idol_cpgs <- NULL
    }
    # FlowSorted.CordBloodCombined.450k <- FlowSorted.CordBloodCombined.450k::FlowSorted.CordBloodCombined.450k()
    out <- FlowSorted.Blood.EPIC::estimateCellCounts2(
      rgSet = data_rgset,
      compositeCellType = "CordBloodCombined",
      processMethod = "preprocessNoob",
      probeSelect = if (is.null(idol_cpgs)) "auto" else "IDOL",
      cellTypes = c("Bcell", "CD4T", "CD8T", "Gran", "Mono", "NK", "nRBC"),
      referencePlatform = paste0("IlluminaHumanMethylation", params[["array"]]),
      # referenceset = "FlowSorted.CordBloodCombined.450k",
      IDOLOptimizedCpGs = idol_cpgs,
      returnAll = FALSE,
      meanPlot = FALSE,
      verbose = FALSE
    )$counts
    colnames(out) <- paste0("CellT_", colnames(out))
    out
  },
  {
    estimate_k_cluster <- function(Rmat, max_k = 25, n_cores = 1) {
      svdRmat <- RefFreeEWAS::svdSafe(Rmat)
      tmp <- do.call("rbind", mclapply(
        X = 0:max_k,
        mc.cores = n_cores,
        mc.preschedule = FALSE,
        mc_Rmat = Rmat,
        mc_svdRmat = svdRmat,
        FUN = function(Ktest, mc_Rmat, mc_svdRmat) {
          N1 <- dim(mc_Rmat)[1]
          N2 <- dim(mc_Rmat)[2]
          if (Ktest == 0) {
            tmpRminLU <- mc_Rmat
          } else {
            tmpRminLU <- mc_Rmat - mc_svdRmat$u[, 1:Ktest] %*% 
              (mc_svdRmat$d[1:Ktest] * t(mc_svdRmat$v[, 1:Ktest]))
          }
          tmpSigSq <- rowSums(tmpRminLU * tmpRminLU) / N2

          c(
            K = Ktest,
            AIC = 2 * (N1 + Ktest * (N1 + N2)) +
              N1 * N2 +
              N2 * sum(log(tmpSigSq)),
            BIC = log(N2) * (N1 + Ktest * (N1 + N2)) +
              N1 * N2 +
              N2 * sum(log(tmpSigSq))
          )
      }))

      list(
        icTable = tmp,
        best = tmp[c(AIC = which.min(tmp[, "AIC"]), BIC = which.min(tmp[, "BIC"])), "K"],
        custom_best = tmp[c(
          AIC = which.max(abs(diff(tmp[, "AIC"])[-1])) + 1,
          BIC = which.max(abs(diff(tmp[, "BIC"])[-1])) + 1
        ), "K"]
      )
    }

    beta_matrix <- stats::na.exclude(minfi::getBeta(data_mset))
    max_k <- min(ncol(beta_matrix), 25)
    k_estimated <- min(estimate_k_cluster(
      Rmat = beta_matrix,
      max_k = max_k,
      n_cores = min(params[["n_cores"]], max_k)
    )$best)
    mu0 <- RefFreeEWAS::RefFreeCellMixInitialize(
      Y = beta_matrix,
      K = k_estimated,
      Y.Distance = NULL,
      Y.Cluster = NULL,
      largeOK = TRUE,
      dist.method = "euclidean"
    )

    RefFreeCellMixObj <- RefFreeEWAS::RefFreeCellMix(
      Y = beta_matrix,
      mu0 = mu0,
      K = NULL,
      iters = 10,
      Yfinal = NULL,
      verbose = FALSE
    )

    out <- RefFreeCellMixObj[["Omega"]]
    colnames(out) <- paste0("CellT_", 1:ncol(out))
    out
  }
)

phenotypes <- merge(
  x = phenotypes,
  y = data.table::as.data.table(cell_comp, keep.rownames = "Sample_ID"),
  by = "Sample_ID"
)

cell_cols <- grep("^CellT_", names(phenotypes), value = TRUE)
dd_row <- stats::as.dendrogram(
  stats::hclust(
    d = stats::dist(phenotypes[j = ..cell_cols], method = "euclidean"), 
    method = "ward.D2"
  )
)
dd_col <- stats::as.dendrogram(
  stats::hclust(
    d = stats::dist(data.table::transpose(phenotypes[j = ..cell_cols]), method = "euclidean"), 
    method = "ward.D2"
  )
)
p_heatmap <- list(
  ggplot2::ggplot(
    data = data.table::melt(
      phenotypes[j = .SD, .SDcols = c("Sample_ID", cell_cols)], 
      measure.vars = grep("^CellT_", names(phenotypes), value = TRUE)
    )[ 
      j = c("Sample_ID", "variable") := 
        list(
          factor(Sample_ID, levels = phenotypes[stats::order.dendrogram(dd_row), Sample_ID]),
          factor(variable, levels = cell_cols[stats::order.dendrogram(dd_col)])
        )
    ]
  ) +
    ggplot2::aes(
      x = variable, 
      y = Sample_ID, 
      fill = scales::rescale(value, to = c(0, 1))
    ) +
    ggplot2::geom_tile() +
    ggplot2::scale_x_discrete(
      expand = c(0, 0), 
      labels = function(x) gsub("CellT_", "", x), 
      position = "top"
    ) +
    ggplot2::scale_y_discrete(position = "right", expand = c(0, 0)) +
    ggplot2::scale_fill_viridis_c(
      limits = c(0, 1), 
      labels = percent0, 
      guide = ggplot2::guide_colourbar(
        title = "Cell Composition", 
        title.position = "top", 
        title.hjust = 0.5,
        direction = "horizontal", 
        barwidth = ggplot2::unit(8, units = "lines"),
        raster = TRUE
      )
    ) +
    ggplot2::theme_minimal() +
    ggplot2::theme(
      axis.title = ggplot2::element_blank(),
      axis.text.y.right = if (nrow(phenotypes) > params[["max_labels"]]) element_blank() else element_text(),
      axis.ticks = ggplot2::element_line(colour = "black"),
      axis.ticks.length = ggplot2::unit(x = 0.1, units = "line")
    ) +
    ggplot2::labs(x = "Cell Type", y = "Sample"),

  ggplot2::ggplot() +
    ggplot2::geom_segment(
      data = ggdendro::segment(ggdendro::dendro_data(dd_col, type = "rectangle")),
      mapping = ggplot2::aes(x = x, y = y, xend = xend, yend = yend),
      size = 0.5
    ) +
    ggplot2::theme_void() +
    ggplot2::scale_x_continuous(expand = ggplot2::expansion(add = c(0.5, 0.5))) +
    ggplot2::scale_y_continuous(expand = ggplot2::expansion(c(0, 0.1))),

  ggplot2::ggplot() +
    ggplot2::geom_segment(
      data = ggdendro::segment(ggdendro::dendro_data(dd_row, type = "rectangle")),
      mapping = ggplot2::aes(x = y, y = x, xend = yend, yend = xend),
      size = 0.5
    ) +
    ggplot2::theme_void() +
    ggplot2::scale_x_continuous(expand = ggplot2::expansion(mult = c(0, 0.1))) +
    ggplot2::scale_y_continuous(expand = ggplot2::expansion(add = c(0.5, 0.5))),

  patchwork::guide_area()
)

patchwork::wrap_plots(p_heatmap, design = "BD\nAC", guides = "collect", widths = c(2/3, 1/3), heights = c(1/3, 2/3))

Sex check

if (!do_check_sex) {
  cat("No phenotypes for sex was provided.\n")
}

if (is.null(params[["sex_threshold"]])) {
  sex_predicted <- minfi::getSex(minfi::mapToGenome(data_rgset), cutoff = -2)
  sex_density <- stats::density(sex_predicted$yMed - sex_predicted$xMed, n = 100000)
  min_diff_xy <- which(diff(sign(diff(sex_density$y))) == 2)
  min_diff_xy <- min_diff_xy[which.min(sex_density$y[min_diff_xy])]
  sex_threshold <- round(x = sex_density$x[min_diff_xy], digits = 3)
} else {
  sex_threshold <- params[["sex_threshold"]]
}
sex_predicted <- data.table::as.data.table(
  minfi::getSex(minfi::mapToGenome(data_rgset), cutoff = sex_threshold)
)[j = Sample_ID := minfi::sampleNames(data_rgset)]
data.table::setnames(
  x = sex_predicted,
  old = c("xMed", "yMed", "predictedSex"), 
  new = paste0("qc_", c("xmedian", "ymedian", "predicted_sex"))
)

phenotypes <- merge(
  x = phenotypes,
  y = sex_predicted[
    j = c("Sample_ID", "qc_predicted_sex") := list(
      as.character(Sample_ID),
      c("1" = 1, "2" = 2, "M" = 1, "F" = 2, "0" = 2)[qc_predicted_sex]
    )
  ], 
  by = "Sample_ID"
)[j = qc_sex_discrepancy := is.na(qc_observed_sex) | qc_observed_sex != qc_predicted_sex]

phenotypes[
  j = c("qc_predicted_sex", "qc_observed_sex") := 
    lapply(.SD, factor, levels = c(1, 2, 0), labels = c("Male", "Female", "Unspecified")), 
  .SDcols = c("qc_predicted_sex", "qc_observed_sex")
]

ggplot2::ggplot(data = phenotypes) + 
  ggplot2::aes(x = qc_ymedian - qc_xmedian) +
  ggplot2::geom_density(na.rm = TRUE) +
  ggplot2::geom_vline(xintercept = sex_threshold, linetype = 2, colour = "firebrick2", na.rm = TRUE) +
  ggplot2::scale_x_continuous(
    expand = ggplot2::expansion(mult = c(0, 0)),
    sec.axis = dup_axis(
      name = NULL,
      breaks = function(x) unique(c(scales::breaks_extended()(x), sex_threshold)),
      labels = function(x) ifelse(x == sex_threshold, paste0("<b style='color:firebrick2;'>", x, "</b>"), "")
    )
  ) +
  ggplot2::scale_y_continuous(expand = ggplot2::expansion(mult = c(0, 0.05))) +
  ggplot2::labs(
    x = paste0(
      "Y Chromosome Median Total Intensity (log<sub>2</sub>)<br>", 
      "- X Chromosome Median Total Intensity (log<sub>2</sub>)"
    ),
    y = "Density",
    title = "Sex Threshold Detection"
  )

axis_limits <- range(phenotypes[j = c("qc_xmedian", "qc_ymedian")], na.rm = TRUE)

ggplot2::ggplot(data = phenotypes) +
  ggplot2::aes(
    x = qc_xmedian,
    y = qc_ymedian,
    shape = factor(qc_observed_sex),
    colour = factor(qc_observed_sex)
  ) +
  ggplot2::geom_polygon(
    data = data.frame(
      qc_xmedian = c(c(0, 0, 20), c(0, 20, 20)),
      qc_ymedian = c(c(0, 20, 20), c(0, 0, 20)) + sex_threshold,
      qc_predicted_sex = factor(rep(c(1, 2), each = 3), levels = c(1, 2), labels = c("Male", "Female"))
    ),
    mapping = ggplot2::aes(x = qc_xmedian, y = qc_ymedian, fill = qc_predicted_sex),
    alpha = 0.1,
    inherit.aes = FALSE
  ) +
  ggplot2::geom_abline(
    data = data.frame(Threshold = paste("=", sex_threshold), Seuil = sex_threshold),
    mapping = ggplot2::aes(intercept = Seuil, slope = 1, linetype = Threshold),
    colour = "firebrick2",
    na.rm = TRUE
  ) +
  ggplot2::geom_point(
    data = ~ .x[(!qc_sex_discrepancy)],
    size = 2,
    na.rm = TRUE
  ) +
  ggplot2::stat_ellipse(data = ~ .x[(!qc_sex_discrepancy)], na.rm = TRUE, show.legend = FALSE) +
  ggplot2::geom_point(
    data = ~ .x[(qc_sex_discrepancy)],
    colour = "firebrick2",
    size = 4,
    show.legend = FALSE,
    na.rm = TRUE
  ) +
  ggrepel::geom_label_repel(
    data = ~ .x[(qc_sex_discrepancy)],
    mapping = ggplot2::aes(x = qc_xmedian, y = qc_ymedian, label = Sample_ID),
    segment.colour = "black",
    colour = "black",
    min.segment.length = ggplot2::unit(0, "lines"),
    size = 2,
    inherit.aes = FALSE,
    show.legend = FALSE,
    na.rm = TRUE
  ) +
  ggplot2::scale_colour_viridis_d(begin = 0.2, end = 0.8, drop = FALSE) +
  ggplot2::scale_fill_viridis_d(begin = 0.2, end = 0.8, drop = FALSE) +
  ggplot2::scale_shape_manual(values = c(22, 21), drop = FALSE) +
  ggplot2::scale_linetype_manual(values = 2) +
  ggplot2::labs(
    x = paste("X Chromosome<br><i>Median Total Intensity (log<sub>2</sub>)</i>"),
    y = paste("Y Chromosome<br><i>Median Total Intensity (log<sub>2</sub>)</i>"),
    colour = "Predicted",
    fill = "Predicted",
    shape = "Observed",
    linetype = "Sex Threshold",
    title = "Sex Check Using Methylation Intensity On X/Y Chromosomes"
  ) +
  ggplot2::guides(
    fill = ggplot2::guide_legend(order = 1, override.aes = list(alpha = 1)),
    shape = ggplot2::guide_legend(order = 2, override.aes = list(size = 4)),
    linetype = ggplot2::guide_legend(order = 3),
    colour = "none"
  ) +
  ggplot2::theme(
    legend.key.height = ggplot2::unit(1.2, "lines"),
    legend.key.width = ggplot2::unit(1.2, "lines")
  ) +
  ggplot2::coord_cartesian(xlim = axis_limits, ylim = axis_limits)

data.table::dcast(
  data = phenotypes[j = .N, by = c("qc_predicted_sex", "qc_observed_sex")][
    j = c("qc_predicted_sex", "qc_observed_sex") := list(
      paste("Predicted:", qc_predicted_sex),
      paste("Observed:", qc_observed_sex)
    )
  ], 
  formula = qc_predicted_sex ~ qc_observed_sex,
  value.var = "N", 
  fill = 0
) %>%
  gt::gt(rowname_col = "qc_predicted_sex", auto_align = TRUE) %>%
  gt::opt_row_striping() %>% 
  gt::opt_all_caps() %>% 
  print()

if (any(phenotypes[["qc_sex_discrepancy"]])) {
  phenotypes[(qc_sex_discrepancy), list(Sample_Name, Sample_ID, qc_observed_sex, qc_predicted_sex)] %>% 
    gt::gt(auto_align = TRUE) %>%
    gt::cols_label(
      Sample_Name = "Name",
      Sample_ID = "ID", 
      qc_observed_sex = "Observed Sex", 
      qc_predicted_sex = "Predicted Sex"
    ) %>% 
    gt::opt_row_striping() %>% 
    gt::opt_all_caps() %>% 
    print()
}

norm_beta <- ENmix::rcp(mdat = data_mset)
if (length(unique(minfi::pData(data_mset)[["Sentrix_ID"]])) > 1) {
  norm_beta <- sva::ComBat(
    dat = norm_beta,
    batch = factor(minfi::pData(data_mset)[["Sentrix_ID"]])
  )[rownames(data_mset), ]
}
colnames(norm_beta) <- minfi::pData(data_mset)[["Sample_ID"]]
if (min(norm_beta, na.rm = TRUE) <= 0) {
  norm_beta[norm_beta <= 0] <- min(norm_beta[norm_beta > 0])
}
if (max(norm_beta, na.rm = TRUE) >= 1) {
  norm_beta[norm_beta >= 1] <- max(norm_beta[norm_beta < 1])
}
data_mset@metadata[grep("_values", names(data_mset@metadata))] <- NULL
data_mset@metadata[["norm_beta_values"]] <- norm_beta
data_mset@metadata[["phenotypes"]] <- phenotypes

readr::write_rds(
  x = data_mset,
  file = file.path(output_directory, paste0(params[["array"]], "_QC_mset.rds"))
)
data.table::fwrite(
  x = data.table::as.data.table(norm_beta, keep.rownames = "cpg_id"),
  file = file.path(output_directory, paste0(params[["array"]], "_QC_betavalues.csv.gz"))
)
data.table::fwrite(
  x = data.table::as.data.table(phenotypes),
  file = file.path(output_directory, paste0(params[["array"]], "_QC_phenotypes.csv"))
)

cat("\n## Principal Component Analysis\n\n")
list_beta <- c("raw_beta" = "Raw &beta;-values", "norm_beta" = "ComBat normalised &beta;-values")
data_batch <- list(
    "raw_beta" = `colnames<-`(minfi::getBeta(data_rgset), minfi::pData(data_rgset)[, "Sample_ID"]),
    "norm_beta" = data_mset@metadata[["norm_beta_values"]]
)

for (ibeta in seq_along(list_beta)) {
    cat("\n###", list_beta[ibeta], "\n\n")

  pca_methylation <- data_batch[[names(list_beta)[ibeta]]]
  pca_methylation <- pca_methylation[rowSums(is.na(pca_methylation)) == 0, ]
  pca_phenotypes <- phenotypes[Sample_ID %in% colnames(pca_methylation)]

  n_comp <- min(10, ncol(pca_methylation))
  fig_n_comp <- min(3, ncol(pca_methylation))

  keep_technical <- names(which(sapply(pca_phenotypes[ 
    j = lapply(.SD, function(x) {
      (data.table::uniqueN(x) > 1 & data.table::uniqueN(x) < length(x)) | is.numeric(x)
    }), 
    .SDcols = pca_vars
  ], isTRUE)))

  variables_excluded <- setdiff(pca_vars, keep_technical)
  if (length(variables_excluded) != 0) {
    cat(
      "The following variables have been excluded (null variances or confounding with samples):\n",
      paste("+", variables_excluded),
      "\n",
      sep = "\n"
    )
  }

  pca_res <- flashpcaR::flashpca(X = t(pca_methylation), stand = "sd", ndim = n_comp)

  pca_dfxy <- data.table::as.data.table(pca_res[["vectors"]], keep.rownames = "Sample_ID")
  data.table::setnames(
    x = pca_dfxy, 
    old = setdiff(names(pca_dfxy), "Sample_ID"), 
    new = sprintf("PC%02d", as.numeric(gsub("V", "", setdiff(names(pca_dfxy), "Sample_ID"))))
  )
  pca_dfxy <- merge(x = pca_dfxy, y = pca_phenotypes, by = "Sample_ID")

  p_inertia <- ggplot2::ggplot(
    data = data.table::data.table(
      y = pca_res[["pve"]],
      x = sprintf("PC%02d", seq_along(pca_res[["pve"]]))
    )[x %in% sprintf("PC%02d", 1:fig_n_comp)]
  ) +
    ggplot2::aes(
      x = paste0(x, "<br><i style='font-size:5pt;'>(", percent2(y), ")</i>"), 
      y = y
    ) +
    ggplot2::geom_col(
      width = 1, 
      colour = "white", 
      fill = scales::viridis_pal(begin = 0.5, end = 0.5)(1), 
      na.rm = TRUE
    ) +
    ggplot2::scale_y_continuous(
      labels = percent1, 
      expand = ggplot2::expansion(mult = c(0, 0.05))
    ) +
    ggplot2::labs(
      x = "Principal Components",
      y = "Variance Explained"
    )

  if (length(keep_technical) > 0) {
    cat("\n#### Association Tests\n\n")
    asso_dt <- data.table::melt(
      data = pca_dfxy,
      measure.vars = grep("^PC[0-9]+$", names(pca_dfxy), value = TRUE),
      variable.name = "pc",
      value.name = "values"
    )[pc %in% sprintf("PC%02d", 1:n_comp)][
      j = {
        m <- stats::model.matrix(
          object = stats::as.formula(
            object = paste0("values ~ ", paste(keep_technical, collapse = " + "))
          ),
          data = .SD
        )

        if (qr(m)$rank == ncol(m)) {
          out <- data.table::as.data.table(
            stats::anova(
              stats::lm(
                formula = stats::as.formula(
                  object = paste0("values ~ ", paste(keep_technical, collapse = " + "))
                ),
                data = .SD
              )
            ),
            keep.rownames = "term"
          )[term != "Residuals"]
        } else {
          out <- data.table::rbindlist(
            lapply(X = keep_technical, .data = .SD, FUN = function(.x, .data) {
              data.table::as.data.table(
                stats::anova(
                  stats::lm(
                    formula = stats::as.formula(paste0("values ~ ", .x)),
                    data = .SD
                  )
                ),
                keep.rownames = "term"
              )[term != "Residuals"]
            })
          )
        }
        out[j = full_rank := qr(m)$rank == ncol(m)]
      },
      by = "pc"
    ]

    p_association <- ggplot2::ggplot(data = asso_dt) +
      ggplot2::aes(
        x = factor(.data[["pc"]]), 
        y = factor(
          x = .data[["term"]], 
          levels = setorderv(
            x = data.table::dcast(
              data = asso_dt[j = list(pc, term, `Pr(>F)` = data.table::fifelse(`Pr(>F)` <= 0.1, `Pr(>F)`, NA_real_))], 
              formula = term ~ pc, 
              value.var = "Pr(>F)"
            ), 
            cols = levels(asso_dt[["pc"]])[1:n_comp], 
            order = -1
          )[["term"]]
        ),
        fill = .data[["Pr(>F)"]]
      ) +
      ggplot2::geom_tile(colour = "white", na.rm = TRUE) +
      ggtext::geom_richtext(
        mapping = ggplot2::aes(
          label = gsub(
            pattern = "(.*)e([-+]*)0*(.*)",
            replacement = "\\1<br>&times;<br>10<sup>\\2\\3</sup>",
            x = format(.data[["Pr(>F)"]], digits = 2, nsmall = 2, scientific = TRUE)
          )
        ),
        colour = "white",
        fill = NA,
        label.colour = NA,
        size = 2.5,
        na.rm = TRUE
      ) +
      ggplot2::scale_fill_viridis_c(na.value = "grey85", end = 0.75, limits = c(0, 0.1)) +
      ggplot2::theme(panel.grid = ggplot2::element_blank()) +
      ggplot2::scale_x_discrete(
        expand = c(0, 0),
        labels = function(x) {
          paste0(
            x, "<br><i style='font-size:5pt;'>(",
            format(
              x = pca_res[["pve"]][as.numeric(gsub("PC", "", x))] * 100,
              digits = 2,
              nsmall = 2
            ),
            " %)</i>"
          )
        }
      ) +
      ggplot2::scale_y_discrete(expand = c(0, 0), labels = toupper) +
      ggplot2::labs(
        x = "Principal Components",
        y = "Variables",
        title = "Association Tests Between Variables And Principal Components",
        caption = ifelse(
          test = all(asso_dt[["full_rank"]]),
          yes = "Variables are tested against principal components using ANOVA.",
          no = paste(
            "Variables are independently tested against principal components using ANOVA",
            "(*i.e.*, model matrix is not full rank)."
          )
        ),
        fill = "P-Value"
      )

    print(p_association)
    cat("\n")

    cat("\n#### Factorial Planes\n\n")
    for (ivar in keep_technical) {
      cat("\n##### `", ivar, "`\n\n", sep = "")
      p <- patchwork::wrap_plots(
        c(
          apply(
            X = utils::combn(sprintf("PC%02d", 1:fig_n_comp), 2),
            MARGIN = 2,
            FUN = function(x) {
              ggplot2::ggplot(data = pca_dfxy[j = .SD, .SDcols = c(ivar, x)]) +
                ggplot2::aes(x = .data[[x[1]]], y = .data[[x[2]]], colour = .data[[ivar]]) +
                ggplot2::geom_hline(yintercept = 0, linetype = 2, na.rm = TRUE) +
                ggplot2::geom_vline(xintercept = 0, linetype = 2, na.rm = TRUE) +
                ggplot2::geom_point(na.rm = TRUE) +
                {
                  if (is.numeric(pca_dfxy[[ivar]])) {
                    ggplot2::scale_colour_viridis_c(
                      name = NULL,
                      begin = 0,
                      end = 0.75
                    )
                  } else {
                    list(
                      ggplot2::stat_ellipse(type = "norm", na.rm = TRUE, show.legend = FALSE),
                      ggplot2::scale_colour_viridis_d(
                        name = NULL,
                        begin = if (pca_dfxy[j = data.table::uniqueN(.SD), .SDcols = ivar] == 2) 0.25 else 0,
                        end = 0.75, 
                        guide = ggplot2::guide_legend(override.aes = list(size = 4))
                      ),
                      if (length(unique(pca_dfxy[[ivar]])) > 10) {
                        ggplot2::theme(legend.position = "none")
                      } else {
                        NULL
                      }
                    )
                  }
                }
            }
          ), 
          list(p_inertia)
        ), 
        guides = "collect"
      ) + 
        patchwork::plot_annotation(
          title = paste0("Structure Detection For: '<i>", ivar, "</i>'"),
          tag_levels = "A", 
          theme = ggplot2::theme(plot.title = ggtext::element_markdown())
        )
      print(p)
      cat("\n")
    }
  }

  cat("\n#### Outliers Detection\n\n")
  pca_dfxy[
    j = dist_centre := rowSums(sapply(.SD, function(x) as.vector(scale(x))^2)), 
    .SDcols = sprintf("PC%02d", 1:fig_n_comp)
  ][
    j = is_outlier := factor(
      x = dist_centre > (
        stats::quantile(dist_centre, 0.75, na.rm = TRUE) + 
          params[["pca_threshold"]] * stats::IQR(dist_centre, na.rm = TRUE)
      ),
      levels = c(FALSE, TRUE), 
      labels = c("No", "Yes")
    )
  ]
  p <- patchwork::wrap_plots(
    c(
      apply(
        X = utils::combn(sprintf("PC%02d", 1:fig_n_comp), 2),
        MARGIN = 2,
        FUN = function(x) {
          ggplot2::ggplot(data = pca_dfxy[j = .SD, .SDcols = c("is_outlier", x)]) +
            ggplot2::aes(
              x = .data[[x[1]]], 
              y = .data[[x[2]]], 
              colour = is_outlier, 
              shape = is_outlier
            ) +
            ggplot2::geom_hline(yintercept = 0, linetype = 2, na.rm = TRUE) +
            ggplot2::geom_vline(xintercept = 0, linetype = 2, na.rm = TRUE) +
            ggplot2::geom_point(na.rm = TRUE) +
            ggplot2::stat_ellipse(type = "norm", na.rm = TRUE, show.legend = FALSE) +
            ggplot2::scale_colour_viridis_d(
              name = "Outlier",
              begin = 0.25, 
              end = 0.75,
              guide = ggplot2::guide_legend(override.aes = list(size = 4))
            ) +
            ggplot2::scale_shape_manual(name = "Outlier", values = c(1, 4))
        }
      ), 
      list(p_inertia)
    ), 
    guides = "collect"
  ) + 
    patchwork::plot_annotation(
      title = "Outliers Detection In Factorial Planes",
      caption = paste0(
        "Outliers defined for a Euclidean distance from cohort centroid",
        " (based on the principal components up to ", fig_n_comp, ")<br>",
        "higher than ", params[["pca_threshold"]], " times the interquartile",
        " range above the 75<sup>th</sup> percentile."
      ),
      tag_levels = "A", 
      theme = ggplot2::theme(
        plot.subtitle = ggtext::element_markdown(size = ggplot2::rel(0.8), face = "italic")
      )
    )

  print(p)
  cat("\n")
  gt::gt(
    data = pca_dfxy[
      is_outlier == "Yes", 
      .SD, 
      .SDcols = c("Sample_ID", pca_vars, sprintf("PC%02d", 1:fig_n_comp))
    ], 
    auto_align = TRUE
  ) %>%
    gt::tab_header(title = "Samples Identified As Possible Outliers") %>%
    gt::fmt_number(columns = sprintf("PC%02d", 1:fig_n_comp), decimals = 2) %>%
    gt::opt_row_striping() %>%
    gt::opt_all_caps() %>%
    print()

}

R session information

options("width" = 110)
sessioninfo::session_info()

omicsr/dmapaq documentation built on Oct. 13, 2021, 1:08 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

omicsr/dmapaq
DNA Methylation Arrays Processing And Quality-Control

In omicsr/dmapaq: DNA Methylation Arrays Processing And Quality-Control

Methods and Parameters

Quality Control

Cell composition

Sex check

R session information

R Package Documentation

Browse R Packages

We want your feedback!

omicsr/dmapaq DNA Methylation Arrays Processing And Quality-Control

In omicsr/dmapaq: DNA Methylation Arrays Processing And Quality-Control

Methods and Parameters

Quality Control

Cell composition

Sex check

R session information

R Package Documentation

Browse R Packages

We want your feedback!

omicsr/dmapaq
DNA Methylation Arrays Processing And Quality-Control