R/script_v12-3_package.R
In QCEWAS: Fast and Easy Quality Control of EWAS Results Files

Documented in EWAS_plots EWAS_QC EWAS_series P_correlation P_lambda translate_header

# QCEWAS v1.2-3
# Created by Peter van der Most, April 2015
# Updated December 2016, August 2018, February 2019, January-February 2023
# Based on code for the QCGWAS package


# Remember to adjust version number here and in output log, and activate onAttach functie
# don't export the zf_testLogical function


  .onAttach <- function(libname, pkgname) {
    packageStartupMessage("")
    packageStartupMessage("QCEWAS library, version 1.2-3")
    packageStartupMessage(paste0("A Quick Start Guide can be found in ", system.file("doc", package = "QCEWAS")))
    packageStartupMessage("")
  }


# zf_testLogical is an internal function to test if arguments are logical
zf_testLogical <- function(param, notNA = TRUE){
  OK <- is.logical(param) & is.vector(param) & length(param) == 1L
  if(OK & notNA) { OK <- !is.na(param) }
  return(OK)    
}


# Function P_correlation is no longer the same check_p as in QCGWAS. The save name has been changed, the log entries & HQ subset removed and the colnames have been updated
P_correlation <- function(dataset,
                    plot_correlation = TRUE, plot_if_threshold = FALSE, threshold_r = 0.99,
                    high_quality_plots = FALSE,
                    save_name = "dataset", header_translations, ...) {
  
  if(!missing(header_translations)) {
    header_test <- translate_header(header = colnames(dataset), standard = c("P_VAL", "BETA", "SE"), alternative = header_translations)
    if(any(duplicated(header_test$header_h))) stop("cannot translate header - duplicate column names")
    if(header_test$missing_N > 0L) stop(paste("Cannot identify data column(s):", paste(header_test$missing_h, collapse = ", ")))
    colnames(dataset) <- header_test$header_h
  }
  
  goodOnes <- !(is.na(dataset$P_VAL) | is.na(dataset$BETA) | is.na(dataset$SE))
  if(sum(goodOnes) > 10L) {
    p_obs <- -log10(dataset$P_VAL[goodOnes])
    p_exp <- -log10(pchisq((dataset$BETA[goodOnes]/dataset$SE[goodOnes])^2, 1, lower.tail=FALSE))
    if(any(p_obs > 300)) {
      p_obs[p_obs > 300] <- 300
      warning("Extreme p-values in dataset - temporarly set to 1e-300 for correlation calculation and plot")
    }
    if(any(p_exp > 300)) {
      p_exp[p_exp > 300] <- 300
      warning("Expected p-values capped at 1e-300")
    }
    p_cor <- cor(p_obs, p_exp, use = "everything")
    if(is.na(p_cor)){
      warning("Unable to calculate p-value correlation")
      return(NA) # uses return here so that the if loops below are skipped      
    }
    if(p_cor < threshold_r) {
      print(paste0("Warning: reported p-values correlate poorly to expected values(r = ", round(p_cor, digits = 3), ")"), quote = FALSE)
    }
    if(plot_correlation & (p_cor < threshold_r | !plot_if_threshold )) {
      p_max <- max(p_obs, p_exp)
      if(high_quality_plots){
        tiff(filename = paste0(save_name, ".graph_p.tiff"), units = "in", width = 7, height = 7, res = 350,
             compression = "lzw")
      } else {
        png(paste0(save_name, ".graph_p.png"),
            width = 720, height = 720, res = 144)
      }
      plot(x = 0, y = 0, col = "white",
           xlim = c(0, p_max), ylim = c(0, p_max),
           main="P-value correlation", xlab = "Expected -log10(p)", ylab = "Observed -log10(p)",
           sub = save_name, cex.sub = 1.3, ...)
      lines(x = c(0, 1.1 * p_max), y = c(0, 1.1 * p_max))
      points(p_exp, p_obs, col = "black", pch = 20, cex = 0.8)
      text(0.0 * p_max, 0.95 * p_max, paste("r =", round(p_cor, digits = 3)), pos = 4, cex=1.0, col = ifelse(p_cor < threshold_r, "red", "black") )
      dev.off()
    }
  } else {
    print("p-test aborted: less than 10 entries with the data required to calculate corelation", quote = FALSE)
    p_cor <- NA
  }
  return(p_cor)
}


# Function EWAS_plots is an edited version of QC_plots from QCGWAS.
# EWAS_plots was edited to prevent a bug from occuring, and to suppress messages
# as well as suppresing an underline in the QQ plot
# also removed a HQ filter call
# lateron, we removed all code relating to the filters

P_lambda <- function(p){
  p <- p[!is.na(p)]
  if(length(p) < 2L) stop("'p' does not contain sufficient non-missing values to calculate lambda") 
  return(median(qchisq(p, df=1, lower.tail=FALSE)) / qchisq(0.5, 1))
}

EWAS_plots <- function(dataset, plot_QQ = TRUE, plot_Man = TRUE,
                     plot_cutoff_p = 0.05,
                     plot_QQ_bands = FALSE,
                     high_quality_plots = FALSE,
                     save_name = "dataset",
                     header_translations) {
  
  if(is.vector(dataset)) {
    dataset <- data.frame(P_VAL = dataset)
    if(plot_Man){
      plot_Man <- FALSE
      warning("Cannot create a Manhattan plot without chromosome/position values")
    }
  }
  colnames(dataset) <- toupper(colnames(dataset))
  header_std <- c("P_VAL", "CHR", "MAPINFO")[c(TRUE, plot_Man, plot_Man)]
  
  if(missing(header_translations)) {
    header_missing <- header_std[!header_std %in% colnames(dataset)]
    if(length(header_missing) > 0L) { stop(paste0("Missing columns: ", paste0(header_missing, collapse = ", "))) }
  } else {
    header_test <- translate_header(header = colnames(dataset), standard = header_std, alternative = header_translations)
    if(any(duplicated(header_test$header_h))) stop(paste0("Cannot translate header - duplicate columns: ", paste0(header_test$header_h[duplicated(header_test$header_h)], collapse = ", ") ))
    if(header_test$missing_N > 0L) stop(paste("Cannot identify data column(s):", paste(header_test$missing_h, collapse = ", ")))
    colnames(dataset) <- header_test$header_h
  }
    
   
  # Stage 1: creating filters & calculating lambda
 
  # Calculating lambda
  # First the script creates a short-list by throwing out NA's (!) to calculate the lambda's,
  #	and a p-cutoff filter is created (!). The script checks whether there are still more than
  #	10 datapoints available at both exclamation marks. If yes, the script uses the shortlist
  #	to generate the unfiltered expected and observed QQ plots; and the long list plus the 
  #	addQQplot function to generate the filtered plots.
  #	WARNING: note that the content of QQ_obs_short changes: it starts out as the p-values
  #	minus NA (*), then -log10 values (#), then -log10 values minus p > 0.05 (***). Lambda needs
  #	to be calculated at the (*). QQ_exp is calculated at (#), but (*) works too; the contents
  #	of the vector used to calculate QQ_exp do not matter, only its length. QQ_exp changes
  #	as well: a copy without > 0.05 is made at (***: QQ_exp_short). The copy is used for
  #	the graphs, but QQ_exp is necessary for the probability clouds. If so, it's shortened
  #	to 1000 points to prevent the polygon functions from being overloaded.
  
  QQ_obs_p	 <- dataset$P_VAL
  QQ_obs_short <- subset(QQ_obs_p, !is.na(QQ_obs_p))
  QQ_obs_N	 <- length(QQ_obs_short)	# N of non-missing p's
  
  if(QQ_obs_N > 10L) {
    lambda <- P_lambda(QQ_obs_short)
    
    if(any(QQ_obs_short < 1e-300)) {
      warning("Extreme p-values in dataset - temporarly set to 1e-300 for QQ & Manhattan plots")
      QQ_obs_short <- ifelse(QQ_obs_short < 1e-300, 1e-300, QQ_obs_short)
      dataset$P_VAL <- ifelse(dataset$P_VAL < 1e-300 & !is.na(dataset$P_VAL), 1e-300, dataset$P_VAL)
    }
    
    QQ_obs_short<- sort(-log10(QQ_obs_short))
    QQ_incl	<- QQ_obs_short >= -log10(plot_cutoff_p)
  } else {
    lambda	<- NA
    QQ_incl	<- 0
  }
  
  # Stage 2: creating QQ plots
  if(plot_QQ & sum(QQ_incl) > 10L ) {
    QQ_exp	<- sort(-log10(ppoints(QQ_obs_N)))
    QQ_exp_short<- QQ_exp[QQ_incl]
    QQ_obs_short<- QQ_obs_short[QQ_incl]
    QQ_exp_min	<- QQ_exp_short[1]
    QQ_exp_max	<- QQ_exp_short[length(QQ_exp_short)]
    QQ_obs_min	<- QQ_obs_short[1]
    QQ_obs_max	<- QQ_obs_short[length(QQ_obs_short)]
    
    if(plot_QQ_bands) {
      temp <- (1:QQ_obs_N)
      i1000 <- c(1, (1:1000) * floor(QQ_obs_N / 1000), QQ_obs_N)
      QQ_band_upper <- sort(-log10(qbeta( 1 - 0.05 / 2, temp, QQ_obs_N - temp + 1 ) ) )[i1000]
      QQ_band_lower <- sort(-log10(qbeta(		 0.05 / 2, temp, QQ_obs_N - temp + 1 ) ) )[i1000]
      QQ_exp <- QQ_exp[i1000]
    }
    
    
    if(high_quality_plots){
      tiff(filename = paste0(save_name, ".graph_QQ.tiff"), units = "in", width = 7, height = 7, res = 350,
           compression = "lzw")
    } else {
      png(paste0(save_name, ".graph_QQ.png"),
          width = 720, height = 720, res = 144)
    }
    plot(c(QQ_exp_min, QQ_exp_max), c(QQ_obs_min, QQ_obs_max), xlim = c(0, QQ_exp_max), ylim = c(0, QQ_obs_max),
         main = "QQ plot", xlab = "Expected -log10(p-value)", ylab = "Observed -log10(p-value)",
         pch = 1, col = "black", sub = save_name, cex.sub = 1.3)
    if(plot_QQ_bands) {
      polygon( c(QQ_exp, rev(QQ_exp)), c( QQ_band_upper, rev(QQ_exp)), col="grey", border = NA )
      polygon( c(QQ_exp, rev(QQ_exp)), c( QQ_band_lower, rev(QQ_exp)), col="grey", border = NA )
    }
    points(QQ_exp_short, QQ_obs_short, pch = 1, col = "black")
    abline(0,1)
    text(0, 0.98 * QQ_obs_max, substitute(paste(lambda, " = ", x), list(x=round(lambda, digits = 3))),
         pos = 4, col = ifelse(lambda > 2 | lambda < 0.8, "red", "black"))
    #abline(h = -log10(0.05/QQ_obs_N),  #  -log10(5e-8), # Bonferroni threshold
    #       lty = 3, col="red")
    dev.off()
  } else { if(plot_QQ) warning("Insufficient significant p-values to create QQ plot") }
  
  # Stage 3: creating Manhattan plot
  if(plot_Man & sum(QQ_incl) > 10L ) {
    
    man_set <- dataset[!is.na(dataset$MAPINFO) & !is.na(dataset$CHR) & !is.na(dataset$P_VAL) & dataset$P_VAL <= plot_cutoff_p, c("CHR", "MAPINFO", "P_VAL")]
    
    if(!is.numeric(man_set$CHR)) {
      if(is.factor(man_set$CHR)) { man_set$CHR <- as.character(man_set$CHR) }
      if(is.character(man_set$CHR)){
        man_set$CHR <- toupper(man_set$CHR)
        man_set$CHR[man_set$CHR == "X"] <- 23L
        man_set$CHR[man_set$CHR == "Y"] <- 24L
        man_set$CHR[man_set$CHR == "XY"] <- 25L
        man_set$CHR[man_set$CHR %in% c("M", "MT")] <- 26L
      }
      man_set$CHR <- as.integer(man_set$CHR)
      if(any(is.na(man_set$CHR))) {
        warning(paste0(sum(is.na(man_set$CHR)), " entries with untranslatable chromosome values"))
        man_set <- man_set[!is.na(man_set$CHR), ]
      }
    }
    
    if(any(!man_set$CHR %in% 1:26)) {
      warning(paste0(sum(!man_set$CHR %in% 1:26), " entries with chromosome values outside of the normal range"))
      man_set <- man_set[man_set$CHR %in% 1:26, ]				
    }
    manhattanN <- nrow(man_set)
    
    
    if(manhattanN > 9L) { 		# testing if there are sufficient p-values < 0.05
      chr_size <- data.frame(chromosome = 1:27, #	chr2				chr3			chr4				chr5				chr6				chr7				chr8				chr9			chr10				chr11			 chr12			chr13				chr14				chr15				chr16			chr17			chr18				chr19			chr20				chr21			chr22			X23					Y24		XY25,M26, 27 = end of M
                             size = c(247249719, 242951149, 199501827, 191273063, 180857866,	170899992,	158821424,	146274826,	140273252,	135374737,	134452384,	132349534,	114142980,	106368585,	100338915,	 88827254,	 78774742,	 76117153,	 63811651,	 62435964,	 46944323,	 49691432,	154913754,	 57772954,	0,	0,	0 ),
                             start= c(	 500000, 248249719, 491700868, 691702695, 883475758, 1064833624, 1236233616, 1395555040, 1542329866, 1683103118, 1818977855, 1953930239, 2086779773, 2201422753, 2308291338, 2409130253, 2498457507, 2577732249, 2654349402, 2718661053, 2781597017, 2829041340, 2879232772, 3034646526, NA, NA, NA))
      
      new_pos <- integer(length = manhattanN)
      for (mi in 1:24 ) new_pos <- ifelse(man_set$CHR==chr_size$chromosome[mi], chr_size[mi, 3] + man_set$MAPINFO, new_pos)
      
      use_Y	<- any(man_set$CHR == 24L)
      use_XY <- any(man_set$CHR == 25L)
      use_M	<- any(man_set$CHR == 26L)
      man_label <- c(1:22, "X")
      at_label	<- chr_size$start[2:24] - 250000 - ( chr_size$start[2:24] - chr_size$start[1:23] ) / 2
      
      if(use_Y) {
        man_label <- c(man_label, "Y")
        at_label <- c(at_label, chr_size$start[25] - 250000 - ( chr_size$start[25] - chr_size$start[24] ) / 2)
        chr_size$start[25] <- chr_size$start[24] + chr_size$size[24] + 500000
      } else { chr_size$start[25] <- chr_size$start[24] }
      
      if(use_XY) {
        man_label <- c(man_label, "XY")
        at_label <- c(at_label, chr_size$start[26] - 250000 - ( chr_size$start[26] - chr_size$start[25] ) / 2)
        chr_size$size[25] <- max(man_set$MAPINFO[man_set$CHR == 25])
        chr_size$start[26] <- chr_size$start[25] + chr_size$size[25] + 500000
      } else { chr_size$start[26] <- chr_size$start[25] }
      
      if(use_M ) {
        man_label <- c(man_label, "M")
        at_label <- c(at_label, chr_size$start[27] - 250000 - ( chr_size$start[27] - chr_size$start[26] ) / 2)
        chr_size$size[26] <- max(man_set$MAPINFO[man_set$CHR == 26])
        chr_size$start[27] <- chr_size$start[26] + chr_size$size[26] + 500000
      } else { chr_size$start[27] <- chr_size$start[26] }
      
      manMax <- ceiling(-log10(min(man_set$P_VAL)))
      if(manMax < 10L) manMax <- 10L
      
      if(high_quality_plots){
        tiff(filename = paste0(save_name, ".graph_M.tiff"), units = "in", width = 14, height = 7, res = 350,
             compression = "lzw")
      } else {
        png(paste0(save_name, ".graph_M.png"),
            width = 960, height = 480)
      }
      par(mfrow = c(1,1), mgp = c(2.5, 0.9, 0))
      palette(c("red", "green3", "blue", "cyan"))
      plot(new_pos, -log10(man_set$P_VAL), pch = 20, xaxs = "i", xaxt = "n", ylim = c(1, manMax),
           xlim = c(0, chr_size$start[27]), col = man_set$CHR, cex.lab = 1.8, cex.axis = 1.4,
           xlab = "Chromosome", ylab = "Observed -log10(p-value)", main = "Manhattan plot", sub = save_name, cex.sub = 1.3, col.sub = "red")
      abline(v = chr_size[2:26, 3] - 250000, lty = 2)
      abline(h = -log10(0.05/QQ_obs_N),  #  -log10(5e-8), # Bonferroni threshold
             lty = 3, col="red")
      axis(1, at = at_label, labels = man_label, cex.axis = 1.5)
      dev.off()
    } else warning("Insufficient SNPs remain to create Manhattan plot")
  } else {
    if(plot_Man) warning("Insufficient SNPs remain to create Manhattan plot")
    #manhattanN	<- NA
  }
  return(invisible(lambda))
}

# Function translate_header has the standard argument altered & the identify_column function made internal
translate_header <- function(header, standard = c("PROBEID","BETA","SE","P_VAL"), alternative){
  if(any(duplicated(alternative[ ,2]))) stop("duplicated elements in alternative, column 2")
  capitalized <- toupper(header)
  unknowns  <- !logical(length = length(header))
  missings	<- logical(length = length(standard))
  identify_column <- function(std_name, alt_names, header) { return(which(header %in% c(alt_names[ which(alt_names[ ,1]==std_name), 2], std_name))) }
  for(forI in 1:length(standard)) {
    column_no <- identify_column(std_name = standard[forI], alt_names = alternative, header = capitalized)
    if(length(column_no) == 0L) {
      missings[forI] <- TRUE
    } else {
      header[column_no] <- standard[forI]
      unknowns[column_no] <- FALSE
    }
  }	
  return(list(header_N = length(header), header_h = header,
              missing_N = sum(missings), missing_h = if(sum(missings) == 0L) NULL else standard[missings],
              unknown_N = sum(unknowns), unknown_h = if(sum(unknowns) == 0L) NULL else header[unknowns]	) )
}

EWAS_QC <- function(data, # datatable with ewas results, or filename of the same
                    map, # datatable with chr & pos of CpGs in idata or filename of the same
                    outputname, # characterstring to identify dataset in output
                    header_translations, # translation table for file headers
                    threshold_outliers = c(NA, NA),
                    markers_to_exclude,
                    exclude_outliers = FALSE,
                    exclude_X = FALSE,
                    exclude_Y = FALSE,
                    save_final_dataset = TRUE,
                    gzip_final_dataset = TRUE,
                    header_final_dataset = "standard",
                    high_quality_plots = FALSE,
                    return_beta = FALSE,
                    N_return_beta = 500000L,
                    ...){
  ### Stage 0: checking input
  # 0: Prepation: creating functions, checking params & output template
  zv_startime <- date()
  
  use_map <- !missing(map)
  if(use_map) { use_map <- !is.null(map) }
  zf_emergencyExit <- function(value, message){ # removed the warning to avoid duplicates (and it's less than informative) since the name of zf_EmergencyExit is returned
    # The function uses "file" from value to determine filename
    # it returns 'value' so that it can be passed to the return that aborts EWAS_QC
    #warning(paste0("Error in file ", value$file, " : ", message))
    print(paste0("Error in file ", value$file, " : ", message), quote = FALSE)
    return(value)
  }

  outcome_QC <- integer(9L)
  names(outcome_QC) <- c("input", "unusable", "removed_at_request", "chrX", "chrY", "below_lower_limit", "above_upper_limit", "removed", "final")
  outcome_QC <- list(data_input = "",
                     file = "",
                     QC_success = FALSE,
                     lambda = -1,
                     p_cor = -1,
                     N = outcome_QC,
                     SE_median = -1,
                     mean_methylation = NULL,
                     effect_size = NULL)
  
  
  # 0: Checking exclude & threshold outliers
  if(!zf_testLogical(exclude_outliers)) stop("'exclude_outliers' is not a single logical value")
  if(!(is.vector(threshold_outliers) & length(threshold_outliers) == 2L)) {
    stop("'threshold_outliers' is not a numeric vector of length 2") }
  
  use_outliers <- !all(is.na(threshold_outliers))
  if(use_outliers){
    if(!is.numeric(threshold_outliers)) stop("'threshold_outliers' is not numeric")
    if(threshold_outliers[1] > threshold_outliers[2] & !is.na(threshold_outliers[1]) & !is.na(threshold_outliers[2])) {
      threshold_outliers <- threshold_outliers[2:1]
    }
  } else {
    if(exclude_outliers) stop("'exclude_outliers' is set to TRUE, but no thresholds have been specified")
  }
  
  # 0: Checking exclude_X & exclude_Y
  if(!zf_testLogical(exclude_X)) stop("'exclude_X' is not a single logical value")
  if(!zf_testLogical(exclude_Y)) stop("'exclude_Y' is not a single logical value")
  if((exclude_X | exclude_Y) & !use_map) stop("No map has been specified to exclude X or Y chromosome markers")

  # 0: checking markers to exclude
  use_MTE <- if(missing(markers_to_exclude)) FALSE else !is.null(markers_to_exclude)
  if(use_MTE){
    if(is.character(markers_to_exclude) & length(markers_to_exclude) == 1L) {
      if(file.exists(markers_to_exclude)){ markers_to_exclude <- read.table(markers_to_exclude, header = F, stringsAsFactors = FALSE)
      } else { stop("Cannot find a file with the name specified in 'markers_to_exclude'")}
    }
    if(is.data.frame(markers_to_exclude)){ markers_to_exclude <- markers_to_exclude[,1]
    } else {
      if(!is.vector(markers_to_exclude)) stop("'markers_to_exclude' is not a vector or filename of a file containing a vector")
    }
    markers_to_exclude <- markers_to_exclude[!is.na(markers_to_exclude)]
    if(length(markers_to_exclude) == 0L) stop("'markers_to_exclude' contains no non-missing values")
  }
  
  # 0: Checking save_final_dataset, return_beta & high_quality_plots
  if(!zf_testLogical(save_final_dataset)) stop("'save_final_dataset' is not a single logical value")
  if(!zf_testLogical(return_beta)) stop("'return_beta' is not a single logical value")
  if(return_beta) stopifnot(is.integer(N_return_beta), is.vector(N_return_beta), length(N_return_beta) == 1L, N_return_beta > 50L)
  if(!zf_testLogical(high_quality_plots)) stop("'high_quality_plots' is not a single logical value")
  
  # 0: checking outputname & gzip_final_dataset
  if(!(is.character(outputname) & length(outputname) == 1L)) stop("'outputname' is not a valid filename")
  if(nchar(outputname) == 0L) stop("'outputname' is not a valid filename")
  outcome_QC$file <- outputname # will be changed if save & gzip are TRUE
  
  if(save_final_dataset){
    if(!zf_testLogical(gzip_final_dataset)) stop("'gzip_final_dataset' is not a single logical value")
    outcome_QC$file <- if(gzip_final_dataset) paste0(outputname, ".txt.gz") else paste0(outputname, ".txt")
    if(file.exists(outcome_QC$file)) stop(paste0("Cannot save output file: ", outcome_QC$file, " already exists"))
    
    if(length(header_final_dataset) == 1L) {
      stopifnot(is.character(header_final_dataset), nchar(header_final_dataset) > 2L)
      if(header_final_dataset %in% c("original", "standard")) {
        use_outheader <- header_final_dataset
      } else {
        if(header_final_dataset %in% c("GWAMA", "PLINK", "META")) {
          use_outheader <- header_final_dataset # temporary storage for command below
          header_final_dataset <- data.frame(
            GWAMA = c("MARKER", "CHR", "POSITION", "BETA", "SE", "P"),
            PLINK = c("SNP",		"CHR", "BP",			 "BETA", "SE", "P"),
            META =  c("rsid",   "chr", "pos",    	 "beta", "se", "P_value"),
            QC =    c("PROBEID","CHR", "POSITION", "BETA", "SE", "P_VAL"),
            stringsAsFactors = FALSE)
          header_final_dataset <- header_final_dataset[ ,c(use_outheader, "QC")]
        } else {
          if(!file.exists(header_final_dataset)) stop("'header_final_dataset' is not a standard name nor a filename in the working directory")
          header_final_dataset <- read.table(header_final_dataset, header = F, stringsAsFactors = FALSE)
        }
        use_outheader <- "table"
      }
    } else {
      if(!is.matrix(header_final_dataset) & !is.data.frame(header_final_dataset)) stop("'header_final_dataset' is not a table, filename or standard name")
      use_outheader <- "table"
    }
    
    if(use_outheader == "table"){
      if(is.matrix(header_final_dataset) | is.data.frame(header_final_dataset)) {
        if(ncol(header_final_dataset) != 2L) { stop("'header_final_dataset' does not have 2 columns") }
        if(any(is.na(header_final_dataset))) { stop("'header_final_dataset' contains missing values") }
        if(!is.character(header_final_dataset[,1])) { stop("'header_final_dataset' contains non-character values") }
        if(!is.character(header_final_dataset[,2])) { stop("'header_final_dataset' contains non-character values") }
        if(any(duplicated(header_final_dataset[ ,1]),duplicated(header_final_dataset[ ,2]))) { stop("'header_final_dataset' contains duplicated names") }
      } else {
        stop("'header_final_dataset' is not a table, filename or standard name")
      }
    }
  } else { gzip_final_dataset <- FALSE } # this is necessary for log
  
  # 0: checking header translation table
  use_translation <- !missing(header_translations)
  if(use_translation) { use_translation <- !is.null(header_translations) }
  if(use_translation) {
    if(length(header_translations) == 1L) {
      stopifnot(is.character(header_translations), nchar(header_translations) > 2L, file.exists(header_translations))
      header_translations <- read.table(header_translations, stringsAsFactors = FALSE)
    }
    if(!is.data.frame(header_translations)) stop("'header_translations' is not dataframe or a filename leading to such")
    if(ncol(header_translations) != 2L) stop("'header_translations' does not have two columns")
    if(any(duplicated(header_translations[ ,2]))) stop("'header_translations' contains duplicated elements in column 2")
  }
  
  
  #### Stage 1: loading data & map & checking column names
  
  print("", quote = FALSE)
  print(paste0(" *** QC'ing ", outputname), quote = FALSE)
  print("", quote = FALSE)
  flush.console()
  
  # 1: map
  if(use_map){
    if(is.character(map) & length(map) == 1L) {
      if(!file.exists(map)) stop(paste0("Cannot find file: ", map))
      map <- read.table(map, header = T, stringsAsFactors = FALSE, comment.char = "")
    }
    if(!is.data.frame(map)) stop("'map' is not a dataframe or a filename leading to such")
    
    colnames(map) <- toupper(colnames(map))
    
    if(!all(c("TARGETID", "CHR", "MAPINFO") %in% colnames(map))) {
      stop(paste0("map does not have columns:  ",
                  paste0(c("TARGETID", "CHR", "MAPINFO")[!c("TARGETID", "CHR", "MAPINFO") %in% colnames(map)],
                         collapse = ", ")))
    }
    # We cannot allow duplicate or missing probeIDs to be present in map,
    # because otherwise temp_map won't be of equal length to data.
    if(any(is.na(map$TARGETID))) stop("'map' contains missing TargetIDs")
    if(any(duplicated(map$TARGETID))) stop("'map' contains duplicate TargetIDs")
  }
  
  # 1: data
  if(is.character(data) & length(data) == 1L) {
    if(!file.exists(data)) stop(paste0("Cannot find file: ", data))
    outcome_QC$data_input <- data
    data <- try(read.table(data, header = T, stringsAsFactors = FALSE, ...))
    if(is(data, "try-error")) return(zf_emergencyExit(value = outcome_QC,
                                                           message = paste0("cannot load file ", outcome_QC$data_input, "; check warnings() for details")))
  } else { outcome_QC$data_input <- "user-supplied table" }
  if(!is.data.frame(data)) stop("'data' is not a dataframe or a filename leading to such")
  # The above error won't occur for EWAS_series, as that loads all data via read.table
  # hence we use stop isntead of the return function
  
  
  header_orig <- colnames(data)
  header_std <- c("PROBEID", "BETA", "SE", "P_VAL")
  if(use_translation){
    header_info <- translate_header(header = toupper(header_orig),
                                    standard = c(header_std, "CHR", "POSITION"),
                                    alternative = header_translations)
    if(any(duplicated(header_info$header_h))) {
      return(zf_emergencyExit(value = outcome_QC,
                              message = paste0("Duplicate columns: ", paste0(header_info$header_h[duplicated(header_info$header_h)], collapse = ", ") )))
    }
    colnames(data) <- header_info$header_h
  } else { colnames(data) <- toupper(header_orig) }
  
  if(!all(header_std %in% colnames(data))) {
    return(zf_emergencyExit(value = outcome_QC,
                            message = paste0("Columns: ",
                                             paste0(header_std[!header_std %in% colnames(data)], collapse = ", "),
                                             " are missing from data. Data does contain (but could not identify) column(s) ", paste0(colnames(data)[!colnames(data) %in% header_std], collapse = ", "), "."
                                             )))
  }
  
  
  #### Stage 2: data integrity
  outcome_QC$N["input"] <- nrow(data)
  if(outcome_QC$N["input"] == 0L) return(zf_emergencyExit(value = outcome_QC,
                                                          message = "'data' contains no entries"))
  vec_unusable <- logical(outcome_QC$N["input"])
  
  # 2: probeID - checking for duplicates
  vec_NA_ID <- is.na(data$PROBEID)
  N_NA_ID <- sum(vec_NA_ID)
  if(N_NA_ID > 0L) { vec_unusable <- vec_unusable | vec_NA_ID }
  
  vec_dupli <- duplicated(data$PROBEID)
  N_dupli <- sum(vec_dupli)
  if(N_dupli > 0L) {
    temp_duplis <- data$PROBEID[vec_dupli]
    N_dupli <- sum(data$PROBEID %in% temp_duplis)
    rm(temp_duplis)
    warning(paste0(N_dupli, " entries with duplicated IDs in file: ", outputname, "."))
  }

  # 2: Beta
  vec_NA_beta <- is.na(data$BETA)
  N_NA_beta <- sum(vec_NA_beta)
  if(N_NA_beta == outcome_QC$N["input"]) return(zf_emergencyExit(value = outcome_QC,
                                                                 message = "'data' contains no BETA values"))
  if(!is.numeric(data$BETA)) return(zf_emergencyExit(value = outcome_QC,
                                                     message = "BETA column inside 'data' does not contain numeric values"))
  if(N_NA_beta > 0L) { vec_unusable <- vec_unusable | vec_NA_beta }
  
  # 2: SE - checking for negative values
  vec_temp <- is.na(data$SE)
  N_NA_SE <- sum(vec_temp)
  if(N_NA_SE == outcome_QC$N["input"]) return(zf_emergencyExit(value = outcome_QC,
                                                               message = "'data' contains no SE values"))
  if(!is.numeric(data$SE)) return(zf_emergencyExit(value = outcome_QC,
                                                   message = "SE column inside 'data' does not contain numeric values"))
  if(N_NA_SE > 0L) { vec_unusable <- vec_unusable | vec_temp }
  
  vec_temp <- data$SE <= 0 & !vec_temp  # vec_temp represent missing values up until this point
  N_bad_SE <- sum(vec_temp)
  if(N_bad_SE > 0L) {
    warning(paste0(N_bad_SE, " entries with a negative or zero standard-error in file: ", outputname, ". These values have been set to NA."))
    data$SE[vec_temp] <- NA
    vec_unusable <- vec_unusable | vec_temp
  }
  
  # 2: Pval - checking for missing, bad en p = 0
  vec_temp <- is.na(data$P_VAL)
  N_NA_p <- sum(vec_temp)
  if(N_NA_p == outcome_QC$N["input"]) return(zf_emergencyExit(value = outcome_QC,
                                                              message = "'data' contains no p values"))
  if(!is.numeric(data$P_VAL)) return(zf_emergencyExit(value = outcome_QC,
                                                      message = "P_VAL column inside 'data' does not contain numeric values"))
  if(N_NA_p > 0L) { vec_unusable <- vec_unusable | vec_temp }
  
  vec_temp <- (data$P_VAL < 0 | data$P_VAL > 1) & !vec_temp  # vec_temp represent missing values up until this point
  N_bad_p <- sum(vec_temp)
  if(N_bad_p > 0L) {
    warning(paste0(N_bad_p, " entries with p-values outside or the range 0 - 1 in file: ", outputname, ". These values have been set to NA."))
    data$P_VAL[vec_temp] <- NA
    vec_unusable <- vec_unusable | vec_temp
  }

  N_low_p <- sum(data$P_VAL < 1e-300 & !is.na(data$P_VAL))
  if(N_low_p > 0L){
    N_zero_p <- sum(data$P_VAL == 0 & !is.na(data$P_VAL))
    if(N_zero_p > 0L) warning(paste0(N_zero_p, " entries with p = 0 in file: ", outputname, ". This indicates a rounding error."))
    warning(paste0(N_low_p, " entries with p < 1e-300 in file: ", outputname, ". These values will be temporarily set to 1e-300 for graph creation."))
  } else { N_zero_p <- 0L }
  
  # 2: final check
  rm(vec_temp)
  outcome_QC$N["unusable"] <- sum(vec_unusable)
  if(outcome_QC$N["unusable"] == outcome_QC$N["input"]) return(zf_emergencyExit(value = outcome_QC,
                                                                  message = "All entries in 'data' contain missing or invalid values"))
  #print(paste0(if(outcome_QC$N["unusable"] == 0L) "No" else outcome_QC$N["unusable"], " entries with missing values"), quote = FALSE)
  print(paste0(outcome_QC$N["unusable"], " entries with missing values"), quote = FALSE)

  
  #### Stage 3: data cleaning
  # generate matching map for use in Manhattan plot

  if(use_map) {
    N_map <- sum(data$PROBEID %in% map$TARGETID)
    if(N_map == 0L){
      use_map <- FALSE
      exclude_X <- FALSE
      exclude_Y <- FALSE
      warning(paste0("Cannot matchs probeIDs in file ", outputname, " with map. Check map or file probeIDs for problems."))
    } else {
      if(N_map < (outcome_QC$N["input"]-N_NA_ID) ) { warning(paste0((outcome_QC$N["input"] - N_NA_ID) - N_map, " entries in file ", outputname, " do not appear in the map.")) }
      
      temp_map <- data.frame(data[, c("PROBEID", "P_VAL")], # only takes PROBEID and P_VAL to avoid issues with existing chr/position columns
                             order = 1:outcome_QC$N["input"],
                             stringsAsFactors = FALSE)
      temp_map <- merge(temp_map, map[, c("TARGETID", "CHR", "MAPINFO")], by.x = "PROBEID", by.y = "TARGETID",
                        all.x = TRUE, all.y = FALSE, sort = FALSE)
      temp_map <- temp_map[order(temp_map$order), -3]
      if(!all(temp_map$PROBEID == data$PROBEID)) stop("DEBUG ERROR - temp_map does not match data-probeIDs")
      temp_map$CHR <- ifelse(is.na(temp_map$CHR), -9, temp_map$CHR) # so we don't have to check for missing values in the following logic test
    }
    rm(map)
  } else { N_map <- NA }

  # 3: removing markers_to_exclude
  if(use_MTE){
    vec_remove <- data$PROBEID %in% markers_to_exclude
    outcome_QC$N["removed_at_request"] <- sum(vec_remove)
  } else {
    vec_remove <- logical(outcome_QC$N["input"])
    outcome_QC$N["removed_at_request"] <- NA
  }
    
  # 3: counting X & Y chr
  if(use_map){ # the use_map test is so that use_map can be disabled above
    # we di bit count probes that have already been excluded above
    vec_temp <- ( temp_map$CHR == "X" |  temp_map$CHR == 23 ) & !vec_remove
    outcome_QC$N["chrX"] <- sum(vec_temp)
    if(outcome_QC$N["chrX"] > 0L & exclude_X) vec_remove <- vec_remove | vec_temp
    
    vec_temp <- ( temp_map$CHR == "Y" |  temp_map$CHR == 24 ) & !vec_remove
    outcome_QC$N["chrY"] <- sum(vec_temp)
    if(outcome_QC$N["chrY"] > 0L & exclude_Y) vec_remove <- vec_remove | vec_temp
    rm(vec_temp)
  } else {
    outcome_QC$N["chrX"] <- NA
    outcome_QC$N["chrY"] <- NA
  }
  
  # 3: counting extremes
  if(is.na(threshold_outliers[1])){
    outcome_QC$N["below_lower_limit"] <- 0L
  } else {
    vec_temp <- data$BETA < threshold_outliers[1] & !vec_NA_beta & !vec_remove # this excludes any previously excluded markers
    outcome_QC$N["below_lower_limit"] <- sum(vec_temp)
    if(outcome_QC$N["below_lower_limit"] > 0L & exclude_outliers) vec_remove <- vec_remove| vec_temp
    rm(vec_temp)
  }
  
  if(is.na(threshold_outliers[2])){
    outcome_QC$N["above_upper_limit"] <- 0L
  } else {
    vec_temp <- data$BETA > threshold_outliers[2] & !vec_NA_beta & !vec_remove # this excludes  any previously excluded markers
    outcome_QC$N["above_upper_limit"] <- sum(vec_temp)
    if(outcome_QC$N["above_upper_limit"] > 0L & exclude_outliers) vec_remove <- vec_remove| vec_temp
    rm(vec_temp)
  }

  # 3: removing extremes & X & Y
  outcome_QC$N["removed"] <- sum(vec_remove)
  
  if(use_MTE) print(paste0(outcome_QC$N["removed_at_request"], " probes specified by user excluded"), quote = FALSE)
  if(exclude_X) print(paste0(outcome_QC$N["chrX"], " X-chromosome probes excluded"), quote = FALSE)
  if(exclude_Y) print(paste0(outcome_QC$N["chrY"], " Y-chromosome probes excluded"), quote = FALSE)
  if(use_outliers){
    print(paste0(outcome_QC$N["below_lower_limit"] + outcome_QC$N["above_upper_limit"], " outliers ", if(exclude_outliers) "excluded" else "found"), quote = FALSE)
  }
  if(sum(use_MTE, exclude_outliers, exclude_X, exclude_Y) > 1L) print(paste0(outcome_QC$N["removed"], " total probes excluded"), quote = FALSE)
  flush.console()
  
  if(outcome_QC$N["removed"] > 0L){
    if(outcome_QC$N["removed"] == outcome_QC$N["input"]) return(zf_emergencyExit(value = outcome_QC,
                                                                                 message = "All entries have been removed from 'data'"))
    
    data <- data[!vec_remove, ]
    if(use_map) { temp_map <- temp_map[!vec_remove, ] }
    vec_unusable <- vec_unusable[!vec_remove]
    vec_NA_ID <- vec_NA_ID[!vec_remove]
    vec_dupli <- vec_dupli[!vec_remove]
    vec_NA_beta <- vec_NA_beta[!vec_remove]
    
    if(all(vec_unusable)) return(zf_emergencyExit(value = outcome_QC,
                                                  message = "All entries in 'data' contain missing or invalid values"))
  }
  rm(vec_remove)
  outcome_QC$N["final"] <- nrow(data)
  
  
  #### Phase 4: generating plots
  
  # 4: histograms
  if(high_quality_plots){
    tiff(filename = paste0(outputname, ".histo.tif"), units = "in", width = 9, height = 4.5, res = 350,
         compression = "lzw")
  } else {
    png(paste0(outputname, ".histo.png"),  width = 960, height = 480, res = 108)
  }
  par(mfrow = c(1, 2))
  
  hist(data$BETA,
       breaks = 30, freq = FALSE, col = "blue", plot = TRUE,
       main = "Beta", xlab = "Beta", cex.main = 1.5)
  
  hist(data$SE,
       breaks = 30, freq = FALSE, col = "red", plot = TRUE,
       main = "Standard error", xlab = "Standard error", cex.main = 1.5,
       sub = outputname, cex.sub = 1.3)
  
  dev.off()
  
  # 4: P correlation plot
  outcome_QC$p_cor <- P_correlation(dataset = data,
                              plot_correlation = TRUE, plot_if_threshold = FALSE, threshold_r = 0.99,
                              high_quality_plots = high_quality_plots, save_name = outputname)
  
  # 4: Manhattan & QQ plots plot - note that we run this even if there is no map in order to calculate lamda
  outcome_QC$lambda <- EWAS_plots(dataset = if(use_map) temp_map else data$P_VAL,
                                  plot_QQ = TRUE, plot_Man = use_map, plot_cutoff_p = 0.05,
                                  plot_QQ_bands = FALSE, high_quality_plots = high_quality_plots,
                                  save_name = outputname)
  if(use_map) rm(temp_map)
  
  
  # 4: Volcano plot - Effect/Beta op X-as, y-as = -log10 p-waarde
  temp_p <- if(N_low_p > 0L) ifelse(data$P_VAL < 1e-300, 1e-300, data$P_VAL) else data$P_VAL
  temp_bonf <- 0.05/sum(!is.na(temp_p))
  if(high_quality_plots){
    tiff(filename = paste0(outputname, ".graph_volcano.tif"), units = "in", width = 7, height = 7, res = 350,
         compression = "lzw")
  } else {
    png(paste0(outputname, ".graph_volcano.png"),
        width = 720, height = 720, res = 144)
  }
  plot(data$BETA, -log10(temp_p),
       main = "Volcano plot", sub = outputname, cex.sub = 1.3,
       xlab = "Effect Size", ylab = "-log10(p value)")
  if(min(temp_p, na.rm = T) < temp_bonf){
    abline(h = -log10(temp_bonf),  #  -log10(5e-8), # Bonferroni threshold
           lty = 3, col="red")
  }
  dev.off()
  rm(temp_p)
  
  # 4: Effect size against methylation beta (beta vs. beta-value) - ook even in de ijskast
  

  
  #### 5 : generate output
  # 5: update outcome_QC - this needs to occur before creation of the log file
  
  outcome_QC$SE_median <- median(data$SE, na.rm = TRUE)
  #outcome_QC$mean_methylation <- NULL # not implemented yet
  if(return_beta) {
    #N_return_beta <- 50000L
    N_final_beta <- sum(!vec_NA_beta)
    if(N_final_beta < N_return_beta) {
      outcome_QC$effect_size <- c(data$BETA[!vec_NA_beta], rep(NA, N_return_beta - N_final_beta))
    } else {
      #outcome_QC$effect_size <- sort(data$BETA[!vec_NA_beta])[c(1:N_return_beta)*(N_final_beta %/% N_return_beta)]
      outcome_QC$effect_size <- sort(sample(x = data$BETA[!vec_NA_beta], size = N_return_beta, replace = FALSE))
    }
  }
  outcome_QC$QC_success <- TRUE
  
  
  # 5: create log file - this needs to occur before the file headers are changed
  
  print("", quote = FALSE)
  print("", quote = FALSE)
  write.table(c(
    "****************************************",
    "\t\tEWAS_QC LOG FILE",
    "****************************************",
    ""), paste0(outputname, ".log"), append = FALSE,
    quote = FALSE, row.names = FALSE, col.names = FALSE)
  
  zc_log <- file(description = paste0(outputname, ".log"), open = "a")
  
  write.table(data.frame(
    v1 = c("File", "QC Start Time", "QC End Time", "EWAS_QC Version"),
    v2 = "",
    v3 = c(outcome_QC$file, zv_startime, date(), "1.2-3"),
    stringsAsFactors = FALSE),
    zc_log, quote = FALSE,
    sep = "\t", row.names = FALSE, col.names = FALSE)
  
  write.table(c("",
                "",
                "****************************************",
                "\t1. Number of entries in file",
                "****************************************",
                ""), zc_log, quote = FALSE,
              row.names = FALSE, col.names = FALSE)
  
  write.table(t(c("", "", "N", "", "", "", "N")),
              zc_log, quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE)
  write.table(data.frame(
    v1 = c("Start", "ID Missing", "ID Duplicated", "Beta Missing", "SE Missing", "SE Invalid", "P Missing", "P Invalid", "Total missing & invalid"),
    v2 = "",
    v3 = c(outcome_QC$N["input"], N_NA_ID, N_dupli, N_NA_beta, N_NA_SE, N_bad_SE, N_NA_p, N_bad_p, outcome_QC$N["unusable"]),
    v4 = "",
    v5 = c("Start", "Removed at request", "Not found in map", "> X", "> Y", "Outliers High", "Outliers Low", "Total Removed", "Final"),
    v6 = "",
    v7 = c(outcome_QC$N["input"], outcome_QC$N["removed_at_request"], outcome_QC$N["input"] - N_map, outcome_QC$N["chrX"], outcome_QC$N["chrY"], outcome_QC$N["below_lower_limit"], outcome_QC$N["above_upper_limit"], outcome_QC$N["removed"], outcome_QC$N["final"]),
    stringsAsFactors = FALSE),
    zc_log, quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE)
  
  write.table(c("",
                "",
                "****************************************",
                "\t2. Summary statistics",
                "****************************************",
                ""), zc_log, quote = FALSE,
              row.names = FALSE, col.names = FALSE)
  
  write.table(data.frame(
    v1 = c("p-value correlation", "Lambda", "SE median"),
    v2 = "",
    v3 = c(outcome_QC$p_cor, outcome_QC$lambda, outcome_QC$SE_median),
    stringsAsFactors = FALSE),
    zc_log, quote = FALSE,
    sep = "\t", row.names = FALSE, col.names = FALSE)
  
  zf_saveStats <- function(stat, statname){ # we have to add name, otherwise c() will convert all integers to numeric
    Nall <- length(stat)
    NNA <- sum(is.na(stat))
    PNA <- 100*NNA/Nall
    if(PNA == 100) return(c(statname, Nall - NNA, NNA, round(PNA, digits = 1), rep(NA, 6)))
    #return(c(statname, Nall - NNA, NNA, round(PNA, digits = 1),
    #         round(c(mean(stat, na.rm = TRUE), quantile(stat, probs = c(0, 0.25, 0.5, 0.75, 1), na.rm = TRUE, names = FALSE))[c(2,3,4,1,5,6)], digits = 2)))
    return(c(statname, Nall - NNA, NNA, round(PNA, digits = 1),
             c(mean(stat, na.rm = TRUE), quantile(stat, probs = c(0, 0.25, 0.5, 0.75, 1), na.rm = TRUE, names = FALSE))[c(2,3,4,1,5,6)]))
  }
  outcome_stats <- data.frame(stat = c("Beta", "StdErr", "P"), Nfinal = integer(3), NNA = integer(3), PNA = integer(3),
                              min = numeric(3), Q25 = numeric(3), median = numeric(3), mean = numeric(3), Q75 = numeric(3), max = numeric(3),
                              stringsAsFactors = FALSE)
  outcome_stats[1, ] <- zf_saveStats(stat = data$BETA , statname = "Beta")
  outcome_stats[2, ] <- zf_saveStats(stat = data$SE   , statname = "StdErr")
  outcome_stats[3, ] <- zf_saveStats(stat = data$P_VAL, statname = "P")
  
  write.table("", zc_log, quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE)
  write.table(t(c("" ,"N final", "N missing", "% missing", "min", "25%", "median", "mean", "75%", "max")),
              zc_log, quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE)
  write.table(outcome_stats,
              zc_log, quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE)
  
  close(zc_log)

  # Saving final dataset

  if(save_final_dataset) {
    if(use_outheader == "original") colnames(data) <- header_orig
    if(use_outheader == "table") {
      temp_header <- translate_header(header = colnames(data), standard = header_final_dataset[ ,1], alternative = header_final_dataset)
      colnames(data) <- temp_header$header_h
    }
    write.table(data,
                if(gzip_final_dataset) gzfile(outcome_QC$file) else outcome_QC$file,
                row.names = FALSE, quote = FALSE)
  }

  rm(data, vec_unusable, vec_NA_ID, vec_dupli, vec_NA_beta)
  gc(FALSE, FALSE)
  return(outcome_QC)
}

EWAS_series <- function(EWAS_files, # datatable with ewas results, or filename of the same
                        output_files,
                        map, # datatable with chr & pos of CpGs in idata or filename of the same
                        N, # sample sizes for precision plot
                        header_translations, # translation table for file headers
                        save_final_dataset = TRUE,
                        gzip_final_dataset = TRUE,
                        high_quality_plots = FALSE,
                        N_plot_beta = 500000L,
                        ...){
  # all other arguments will be checked by the first EWAS_QC, so no need to check 'em here
  zf_loadAndReturn <- function(filename, filedesc, ...){
    
    if(is.data.frame(filename) | is.matrix(filename)) return(filename)
    
    if(!is.character(filename) | length(filename) != 1L) stop(paste0("argument '", filedesc, "' is neither a dataset nor a filename"))
    
    if(!file.exists(filename)) stop(paste0("Cannot find file '", filename, "' in the working directory"))
    
    return(read.table(file = filename, stringsAsFactors = FALSE, ...))
  }
  
  use_map <- !missing(map)
  if(use_map) { use_map <- !is.null(map) }
  use_translation <- !missing(header_translations)
  if(use_translation) { use_translation <- !is.null(header_translations) }
  use_N <- !missing(N)
  if(use_N) { use_N <- !is.null(N) }
  
  # Checking N_plot_beta
  if(is.numeric(N_plot_beta)) {
    if(as.integer(N_plot_beta) == N_plot_beta){
      N_plot_beta <- as.integer(N_plot_beta)
    } else stop("'N_plot_beta' is not an integer")
  }
  stopifnot(is.integer(N_plot_beta), is.vector(N_plot_beta), length(N_plot_beta) == 1L, N_plot_beta > 50L)
  
  # checking input & output filenames, as well as save commands
  stopifnot(is.vector(EWAS_files), is.character(EWAS_files))
  if(any(nchar(EWAS_files) == 0L)) stop("'EWAS_files' contains invalid filenames")
  if(any(duplicated(EWAS_files))) stop("duplicate filenames in 'EWAS_files'")
  if(any(!file.exists(EWAS_files))) stop(paste0("files: ", paste0(EWAS_files[!file.exists(EWAS_files)], collapse = ", "), " do not exist"))
  N_file <- length(EWAS_files)
  
  
  if(missing(output_files)) { output_files <- paste0("QC_", EWAS_files)
  } else {
    stopifnot(is.vector(output_files), is.character(output_files))
    if(N_file != length(output_files)) stop("vectors 'EWAS_files' (input) and 'output_files' are of unequal length")
  }
  # removing .gz
  output_files <- ifelse(substr(output_files, nchar(output_files) - 2, nchar(output_files)) == ".gz",
                         substr(output_files, 1, nchar(output_files) - 3),
                         output_files)
  if(any(nchar(output_files) == 0L)) stop("'output_files' contains valid filename")
  if(any(duplicated(output_files))) stop("duplicate filenames in 'output_files'")
  
  if(!zf_testLogical(save_final_dataset)) stop("'save_final_dataset' is not a single logical value")
  if(save_final_dataset){
    if(!zf_testLogical(gzip_final_dataset)) stop("'gzip_final_dataset' is not a single logical value")
    #if(!zf_testLogical(save_standard_header)) stop("'save_standard_header' is not a single logical value")
    if(gzip_final_dataset){
      if(any(file.exists(paste0(output_files, ".gz")))) stop("cannot save output: one of the specified 'output_files' already exists")
    } else {
      if(any(file.exists(output_files))) stop("cannot save output: one of the specified 'output_files' already exists")
    }
  }
  if(!zf_testLogical(high_quality_plots)) stop("'high_quality_plots' is not a single logical value")
  
  # checking N & preparing precision plot
  dat_prec <- data.frame(file = EWAS_files, no = 1:N_file,
                         use = logical(N_file), SE = numeric(N_file),
                         stringsAsFactors = FALSE)
  if(use_N){
    if(is.character(N) & length(N) == 1L) { N <- zf_loadAndReturn(filename = N, filedesc = "N", header = TRUE) }
    if(!is.data.frame(N)) stop("'N' is not dataframe or a filename leading to such")
    if(any(!colnames(N) %in% c("file", "N"))) stop("'N' does not contain the specified columns 'file' and/or 'N'")
    if(any(!EWAS_files %in% N$file)) stop("Cannot find matching entries in 'N' for all files in 'EWAS_files'")
    
    dat_prec <- merge(dat_prec, N[ , c("file", "N")], by = "file", all.x = TRUE, all.y = FALSE, sort = FALSE)
    rm(N)
    dat_prec <- dat_prec[order(dat_prec$no), ]
    
    if(!is.numeric(dat_prec$N)) stop("column N in table 'N' does not contain numeric values")
    if(any(dat_prec$N < 1, na.rm = TRUE)) stop("column N in table 'N' does not contain valid numeric values")
    if(sum(!is.na(dat_prec$N)) < 2) {
      warning("insufficient non-missing N values to generate precision plot")
      use_N <- FALSE
    }
  }
  
  # Checking header translation table
  if(use_translation) {
    header_translations <- zf_loadAndReturn(filename = header_translations, filedesc = "header_translations")
    #if(!is.data.frame(header_translations)) stop("'header_translations' is not dataframe or a filename leading to such")
    if(ncol(header_translations) != 2L) stop("'header_translations' does not have two columns")
    if(any(duplicated(header_translations[ ,2]))) stop("'header_translations' contains duplicated elements in column 2")
  } else { header_translations <- NULL }
  
  # Checking map
  if(use_map){
    map <- zf_loadAndReturn(filename = map, filedesc = "map", header = TRUE)
    #if(!is.data.frame(map)) stop("'map' is not a dataframe or a filename leading to such")
    
    colnames(map) <- toupper(colnames(map))
    if(!all(c("TARGETID", "CHR", "MAPINFO") %in% colnames(map))) {
      stop(paste0("map does not have columns:  ",
                  paste0(c("TARGETID", "CHR", "MAPINFO")[!c("TARGETID", "CHR", "MAPINFO") %in% colnames(map)],
                         collapse = ", ")))
    }
    # We cannot allow duplicate or missing TargetIDs to be present in map,
    # because otherwise temp_map won't be of equal length to data.
    if(any(is.na(map$TARGETID))) stop("'map' contains missing TargetIDs")
    if(any(duplicated(map$TARGETID))) stop("'map' contains duplicate TargetIDs")
  } else { map <- NULL }
  
  
  #### Starting the actual QC
  
  #N_beta <- 500000L
  dat_beta <- matrix(data = numeric(N_plot_beta * N_file), nrow = N_plot_beta, ncol = N_file)
  for(ci in 1:N_file){
    #print(paste0(" *** QC'ing file ", ci, " of ", N_file), quote = FALSE)
    #flush.console() # unnecessary as EWAS_QC will already flush
    temp_out <- EWAS_QC(data = EWAS_files[ci],
                        map = map,
                        outputname = output_files[ci],
                        header_translations = header_translations,
                        save_final_dataset = save_final_dataset, gzip_final_dataset = gzip_final_dataset,
                        high_quality_plots = high_quality_plots,
                        return_beta = TRUE, N_return_beta = N_plot_beta, ...)
    if(temp_out$QC_success){
      dat_prec$use[ci] <- TRUE
      dat_prec$SE[ci] <- temp_out$SE_median
      # mean methylation is not used, for the moment
      dat_beta[, ci] <- temp_out$effect_size
    }
    rm(temp_out)
  }
  print("", quote = FALSE)
  
  # output:
  N_pass <- sum(dat_prec$use)
  if(N_pass > 1L){
    #Precision plot
    dat_temp <- dat_prec[dat_prec$use, ]
    
    if(use_N){
      temp_prec <- 1/dat_temp$SE
      temp_N <- sqrt(dat_temp$N)
      temp_ticks <- pretty(temp_prec) # using pretty() will hopefully ensure neat values on both y-axes
      
      if(high_quality_plots){
        tiff(filename = "EWAS_QC_graph_precision.tif", units = "in", width = 7, height = 7, res = 350,
             compression = "lzw")
      } else {
        png("EWAS_QC_graph_precision.png", width = 960, height= 960, res = 144)
      }
      par(mar = c(5, 4, 4, 4.5) + 0.1) # add +2 to right margin, so to leave space for label
      plot(x = temp_N, y = temp_prec, yaxt = "n", # suppresses the y-axis
           main = "Precision by Sample Size", xlab = "sqrt(sample size)", ylab = "Precision ( = 1 / median SE)")
      axis(side = 2, at = temp_ticks, labels = TRUE)
      axis(side = 4, at = temp_ticks, labels = 1/temp_ticks)
      mtext(text = "median SE", side = 4, line = 3) # adds label to right margin
      text(temp_N, temp_prec, labels = dat_temp$no, pos = 4)
      dev.off()
      rm(temp_N, temp_prec, temp_ticks)
    }
    
    #effect size boxplot
    
    dat_beta <- dat_beta[ , dat_prec$use]
    if(use_N){
      dat_beta <- dat_beta[ , order(dat_temp$N, na.last = FALSE)]
      dat_temp <- dat_temp[order(dat_temp$N, na.last = FALSE), ]
    }
    
    dat_quant <- matrix(data = 0.0, ncol = 3, nrow = N_pass,
                        dimnames = list(NULL, c("quantile25", "median", "quantile75")))
    for(ti in 1:ncol(dat_beta)) dat_quant[ti, ] <- quantile(dat_beta[, ti], names = FALSE, na.rm = TRUE)[2:4]
    
    if(high_quality_plots){
      tiff(filename = "EWAS_QC_graph_effectsize.tif", units = "in", width = 6 + 1 * N_pass, height = 7, res = 350,
           compression = "lzw")
    } else {
      png("EWAS_QC_graph_effectsize.png", width = 800 + 160 * N_pass, height = 960, res = 144)
    }
    boxplot(dat_beta, names = if(use_N) paste0(dat_temp$no, ", N=", dat_temp$N) else dat_temp$no,
            las = 0, main = "Effect-size distribution")
    abline(h = c(median(dat_quant[ ,1]), median(dat_quant[ ,2]), median(dat_quant[ ,3])), lty = 3)
    dev.off()
    rm(dat_temp)
  } else { print("Insufficient files passed the QC to generate graphs", quote = FALSE)}
  
  rm(dat_beta, map)
  write.table(dat_prec, "EWAS_QC_legend.txt", quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE)
  gc(FALSE, FALSE)
  return(invisible(dat_prec))
}
Any scripts or data that you put into this service are public.
QCEWAS documentation built on Feb. 16, 2023, 10:30 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
QCEWAS
Fast and Easy Quality Control of EWAS Results Files

R/script_v12-3_package.R
In QCEWAS: Fast and Easy Quality Control of EWAS Results Files

Defines functions EWAS_series EWAS_QC translate_header EWAS_plots P_lambda P_correlation zf_testLogical .onAttach

Documented in EWAS_plots EWAS_QC EWAS_series P_correlation P_lambda translate_header

Try the QCEWAS package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

QCEWAS Fast and Easy Quality Control of EWAS Results Files

R/script_v12-3_package.R In QCEWAS: Fast and Easy Quality Control of EWAS Results Files

Defines functions EWAS_series EWAS_QC translate_header EWAS_plots P_lambda P_correlation zf_testLogical .onAttach

Documented in EWAS_plots EWAS_QC EWAS_series P_correlation P_lambda translate_header

Try the QCEWAS package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

QCEWAS
Fast and Easy Quality Control of EWAS Results Files

R/script_v12-3_package.R
In QCEWAS: Fast and Easy Quality Control of EWAS Results Files