Nothing
QC_GWAS <-
function(filename, filename_output = paste0("QC_", filename),
dir_data = getwd(), dir_output = paste(dir_data, "QCGWASed", sep="/"), dir_references = dir_data,
header_translations,
column_separators = c("\t", " ", "", ",", ";"),
nrows = -1, nrows_test = 1000, header = TRUE, comment.char = "",
na.strings = c("NA", "nan", "NaN", "."),
imputed_T = c("1", "TRUE", "T"), imputed_F = c("0", "FALSE", "F"), imputed_NA = c(NA, "-"),
save_final_dataset = TRUE, gzip_final_dataset = TRUE, order_columns = FALSE,
spreadsheet_friendly_log = FALSE,
out_header = "standard", out_quote = FALSE, out_sep = "\t",
out_eol = "\n", out_na = "NA", out_dec = ".", out_qmethod = "escape",
out_rownames = FALSE, out_colnames = TRUE,
return_HQ_effectsizes = FALSE,
remove_X = FALSE, remove_Y = FALSE, remove_XY = remove_Y, remove_M = FALSE,
calculate_missing_p = FALSE,
make_plots = TRUE, only_plot_if_threshold = TRUE,
threshold_allele_freq_correlation = 0.95, threshold_p_correlation = 0.99,
plot_intensity = FALSE,
plot_histograms = make_plots, plot_QQ = make_plots, plot_QQ_bands = TRUE,
plot_Manhattan = make_plots, plot_cutoff_p = 0.05,
allele_ref_std, allele_name_std,
allele_ref_alt, allele_name_alt,
update_alt = FALSE, update_savename, update_as_rdata = FALSE, backup_alt = FALSE,
remove_mismatches = TRUE,
remove_mismatches_std = remove_mismatches, remove_mismatches_alt = remove_mismatches,
threshold_diffEAF = 0.15, remove_diffEAF = FALSE,
remove_diffEAF_std = remove_diffEAF, remove_diffEAF_alt = remove_diffEAF,
check_ambiguous_alleles = FALSE,
use_threshold = 0.1,
useFRQ_threshold = use_threshold, useHWE_threshold = use_threshold, useCal_threshold = use_threshold, useImp_threshold = use_threshold, useMan_threshold = use_threshold,
HQfilter_FRQ = 0.01, HQfilter_HWE = 10^-6, HQfilter_cal = 0.95, HQfilter_imp = 0.3,
QQfilter_FRQ = c(NA, 0.01, 0.05), QQfilter_HWE = c(NA, 10^-6, 10^-4),
QQfilter_cal = c(NA, 0.95, 0.99), QQfilter_imp = c(NA, 0.3, 0.5, 0.8),
NAfilter = TRUE, NAfilter_FRQ = NAfilter,
NAfilter_HWE = NAfilter, NAfilter_cal = NAfilter, NAfilter_imp = NAfilter,
ignore_impstatus = FALSE,
minimal_impQ_value = -0.5, maximal_impQ_value = 1.5,
logI = 1L, logN = 1L, ...) {
### PHASE 0: checking the input parameters for errors
# Filename_output is checked later, to avoid calling it before the extensions
# are removed from "filename". If filename_output is invalid, "QC_[filename]"
# will be used instead.
start_time <- date()
stopifnot(is.character(filename), length(filename) == 1L, !is.na(filename),
is.character(dir_data), length(dir_data) == 1L,
is.character(dir_output), length(dir_output) == 1L,
is.character(dir_references), length(dir_references) == 1L)
if(nchar(filename) < 3L | nchar(dir_data) == 0L | nchar(dir_output) == 0L | nchar(dir_references) == 0L) {
stop("File / directory names cannot be empty character-strings") }
if(!file.exists(dir_data) | !file.exists(dir_references)) stop("Cannot find specified directory(s)")
if(!file.exists(paste(dir_data, filename, sep = "/"))) stop("Cannot find the data file in input directory")
if(length(header_translations) == 1L) {
stopifnot(is.character(header_translations), nchar(header_translations) > 2L, file.exists(paste(dir_references, header_translations, sep = "/")))
settings_header_input <- header_translations
header_translations <- read.table(paste(dir_references, header_translations, sep = "/"), stringsAsFactors = FALSE)
} else { settings_header_input <- "table" }
if(!is.data.frame(header_translations) & !is.matrix(header_translations)) stop("'header_translations' is not a table, matrix or dataframe")
if(ncol(header_translations) != 2L) stop("'header_translations' does not have two columns")
if(any(duplicated(header_translations[ ,2]))) stop("'header_translations' contains duplicated elements in column 2")
stopifnot(is.logical(header), length(header) == 1L)
if(is.na(header)) stop("'header' cannot be NA")
stopifnot(is.numeric(nrows), length(nrows) == 1L)
if(is.na(nrows) | nrows == 0) stop("'nrows' cannot be missing or zero")
stopifnot(is.numeric(nrows_test), length(nrows_test) == 1L)
if(is.na(nrows_test) | nrows_test == 0) stop("'nrows_test' cannot be missing or zero")
stopifnot(is.character(comment.char), length(comment.char) == 1L)
if(nchar(comment.char) != 1L & nchar(comment.char) != 0L) stop("'comment.char' must be a single character or empty string")
stopifnot(is.vector(na.strings), is.character(na.strings), is.vector(imputed_T), is.vector(imputed_F), is.vector(imputed_NA),
is.vector(column_separators), is.character(column_separators), is.logical(ignore_impstatus), length(ignore_impstatus) == 1L, !is.na(ignore_impstatus))
if(any(duplicated(c(imputed_NA, imputed_T, imputed_F)))) stop("duplicate strings in the 'imputed' arguments")
stopifnot(is.logical(save_final_dataset), length(save_final_dataset) == 1L, is.logical(order_columns), length(order_columns) == 1L,
is.logical(spreadsheet_friendly_log), length(spreadsheet_friendly_log) == 1L)
if(is.na(save_final_dataset)) stop("'save_final_dataset' cannot be NA")
if(is.na(order_columns)) stop("'order_columns' cannot be NA")
if(is.na(spreadsheet_friendly_log)) stop("'spreadsheet_friendly_log' cannot be NA")
if(save_final_dataset) {
stopifnot(is.logical(out_quote), length(out_quote) == 1L,
is.character(out_sep), length(out_sep) == 1L,
is.character(out_eol), length(out_eol) == 1L,
is.character(out_na), length(out_na) == 1L,
is.character(out_dec), length(out_dec) == 1L,
is.character(out_qmethod), length(out_qmethod) == 1L,
is.logical(gzip_final_dataset), length(gzip_final_dataset) == 1L)
if(is.na(out_quote)) stop("'out_quote' cannot be NA")
if(is.na(gzip_final_dataset)) stop("'gzip_final_dataset' cannot be NA")
if(nchar(out_sep) == 0L) stop("'out_sep' cannot be an empty character-string")
if(nchar(out_eol) == 0L) stop("'out_eol' cannot be an empty character-string")
if(nchar(out_dec) != 1L) stop("'out_dec' must be of length 1")
if(!out_qmethod %in% c("escape", "double", "e", "d")) stop("'out_qmethod' must be either 'escape' or 'double'")
if(length(out_header) == 1L) {
stopifnot(is.character(out_header), nchar(out_header) > 2L)
settings_header_output <- out_header
if(out_header %in% c("original", "standard", "GWAMA", "PLINK", "META", "GenABEL", "old")) {
if(settings_header_output != "standard" & settings_header_output != "original") {
if(settings_header_output == "GenABEL"){
out_header <- data.frame(
GenABEL = c("name", "chromosome", "position", "strand", "allele1", "allele2", "effallelefreq", "n", "beta", "sebeta", "p", "pexhwe", "call"),
QC = c("MARKER", "CHR", "POSITION", "STRAND", "EFFECT_ALL", "OTHER_ALL", "EFF_ALL_FREQ", "N_TOTAL", "EFFECT", "STDERR", "PVALUE", "HWE_PVAL", "CALLRATE"),
stringsAsFactors = FALSE)
} else {
out_header <- data.frame(
GWAMA = c("MARKER", "CHR", "POSITION", "EA", "NEA", "STRAND", "BETA", "SE", "P", "EAF", "N", "IMPUTED", "IMP_QUALITY"),
PLINK = c("SNP", "CHR", "BP", "A1", "A2", "STRAND", "BETA", "SE", "P", "EFF_ALL_FREQ", "N", "IMPUTED", "IMP_QUALITY"),
META = c( "rsid", "chr", "pos", "allele_B", "allele_A", "strand", "beta", "se", "P_value", "EFF_ALL_FREQ", "N", "imputed", "info"),
old =c("MARKER", "CHR", "POSITION", "ALLELE1", "ALLELE2", "STRAND", "EFFECT", "STDERR", "PVALUE", "FREQLABEL", "N_TOTAL", "IMPUTED", "IMP_QUALITY"),
QC = c("MARKER", "CHR", "POSITION", "EFFECT_ALL", "OTHER_ALL", "STRAND", "EFFECT", "STDERR", "PVALUE", "EFF_ALL_FREQ", "N_TOTAL", "IMPUTED", "IMP_QUALITY"),
stringsAsFactors = FALSE)
out_header <- out_header[ ,c(settings_header_output, "QC")]
}
}
} else {
if(!file.exists(paste(dir_references, out_header, sep = "/"))) stop("'out_header' is not a standard name nor a filename in dir_references")
out_header <- read.table(paste(dir_references, out_header, sep = "/"), stringsAsFactors = FALSE)
}
} else {
if(!is.matrix(out_header) & !is.data.frame(out_header)) stop("'out_header' is not a table, filename or standard name")
settings_header_output <- "table"
}
if(is.matrix(out_header) | is.data.frame(out_header)) {
if(ncol(out_header) != 2L) { stop("'out_header' does not have 2 columns") }
if(any(is.na(out_header))) { stop("'out_header' contains missing values") }
if(any(duplicated(out_header[ ,1]),duplicated(out_header[ ,2]))) { stop("'out_header' contains duplicated names") }
}
} else { settings_header_output <- NA }
stopifnot(is.logical(remove_X), length(remove_X) == 1L,
is.logical(remove_Y), length(remove_Y) == 1L,
is.logical(remove_XY), length(remove_XY) == 1L,
is.logical(remove_M), length(remove_M) == 1L)
if(is.na(remove_X)) { stop( "'remove_X' cannot be NA") }
if(is.na(remove_Y)) { stop( "'remove_Y' cannot be NA") }
if(is.na(remove_XY)){ stop("'remove_XY' cannot be NA") }
if(is.na(remove_M)) { stop( "'remove_M' cannot be NA") }
stopifnot(is.numeric(useFRQ_threshold), length(useFRQ_threshold) == 1L,
is.numeric(useHWE_threshold), length(useHWE_threshold) == 1L,
is.numeric(useCal_threshold), length(useCal_threshold) == 1L,
is.numeric(useImp_threshold), length(useImp_threshold) == 1L,
is.numeric(useMan_threshold), length(useMan_threshold) == 1L)
if(any(is.na(c(useFRQ_threshold, useHWE_threshold, useCal_threshold, useImp_threshold, useMan_threshold))) | useFRQ_threshold <= 0 | useHWE_threshold <= 0 | useCal_threshold <= 0 | useImp_threshold <= 0 | useMan_threshold <= 0) {
stop("'use_threshold' values cannot be negative, missing or zero") }
stopifnot(is.logical(plot_histograms), length(plot_histograms) == 1L,
is.numeric(threshold_p_correlation), length(threshold_p_correlation) == 1L,
is.logical(plot_QQ), length(plot_QQ) == 1L,
is.logical(plot_QQ_bands), length(plot_QQ_bands) == 1L,
is.logical(plot_Manhattan), length(plot_Manhattan) == 1L,
is.numeric(plot_cutoff_p), length(plot_cutoff_p) == 1L)
if(is.na(plot_histograms)) { stop("'plot_histograms' cannot be NA") }
if(is.na(plot_QQ)) { stop("'plot_QQ' cannot be NA") }
if(is.na(plot_QQ_bands)) { stop("'plot_QQ_bands' cannot be NA") }
if(is.na(plot_Manhattan)) { stop("'plot_Manhattan' cannot be NA") }
stopifnot(is.logical(NAfilter_FRQ), length(NAfilter_FRQ) == 1L,
is.logical(NAfilter_HWE), length(NAfilter_HWE) == 1L,
is.logical(NAfilter_cal), length(NAfilter_cal) == 1L,
is.logical(NAfilter_imp), length(NAfilter_imp) == 1L)
if(is.na(NAfilter_FRQ) | is.na(NAfilter_HWE) | is.na(NAfilter_cal) | is.na(NAfilter_imp)) {
stop("'NAfilter' values cannot be NA") }
if(!is.null(HQfilter_FRQ)) {
if(length(HQfilter_FRQ) > 1L) stop("'HQfilter_FRQ' isn't a single value")
if(!is.numeric(HQfilter_FRQ) & !is.na(HQfilter_FRQ)) stop("'HQfilter_FRQ' isn't a numerical or NA value") }
if(!is.null(HQfilter_HWE)) {
if(length(HQfilter_HWE) > 1L) stop("'HQfilter_HWE' isn't a single value")
if(!is.numeric(HQfilter_HWE) & !is.na(HQfilter_HWE)) stop("'HQfilter_HWE' isn't a numerical or NA value") }
if(!is.null(HQfilter_cal)) {
if(length(HQfilter_cal) > 1L) stop("'HQfilter_cal' isn't a single value")
if(!is.numeric(HQfilter_cal) & !is.na(HQfilter_cal)) stop("'HQfilter_cal' isn't a numerical or NA value") }
if(!is.null(HQfilter_imp)) {
if(length(HQfilter_imp) > 1L) stop("'HQfilter_imp' isn't a single value")
if(!is.numeric(HQfilter_imp) & !is.na(HQfilter_imp)) stop("'HQfilter_imp' isn't a numerical or NA value") }
if(!is.numeric(minimal_impQ_value) | length(minimal_impQ_value) != 1L) stop("'minimal_impQ_value' values isn't a single numerical value")
if(!is.numeric(maximal_impQ_value) | length(maximal_impQ_value) != 1L) stop("'maximal_impQ_value' values isn't a single numerical value")
if(!is.null(QQfilter_FRQ)) {
if(!is.vector( QQfilter_FRQ)) stop("'QQfilter_FRQ' is not a numeric vector")
if(!is.numeric(QQfilter_FRQ) & !all(is.na(QQfilter_FRQ))) stop("'QQfilter_FRQ' is not a numeric vector") }
if(!is.null(QQfilter_HWE)) {
if(!is.vector( QQfilter_HWE)) stop("'QQfilter_HWE' is not a numeric vector")
if(!is.numeric(QQfilter_HWE) & !all(is.na(QQfilter_HWE))) stop("'QQfilter_HWE' is not a numeric vector") }
if(!is.null(QQfilter_cal)) {
if(!is.vector( QQfilter_cal)) stop("'QQfilter_cal' is not a numeric vector")
if(!is.numeric(QQfilter_cal) & !all(is.na(QQfilter_cal))) stop("'QQfilter_cal' is not a numeric vector") }
if(!is.null(QQfilter_imp)) {
if(!is.vector( QQfilter_imp)) stop("'QQfilter_imp' is not a numeric vector")
if(!is.numeric(QQfilter_imp) & !all(is.na(QQfilter_imp))) stop("'QQfilter_imp' is not a numeric vector") }
stopifnot(is.logical(calculate_missing_p), length(calculate_missing_p) == 1L,
is.numeric(logI), length(logI) == 1L, is.numeric(logN), length(logN) == 1L)
if(is.na(calculate_missing_p)) { stop("'calculate_missing_p' cannot be NA") }
use_allele_std <- if(missing(allele_ref_std)) FALSE else !is.null(allele_ref_std)
use_allele_alt <- if(missing(allele_ref_alt)) FALSE else !is.null(allele_ref_alt)
remove_extension <- function(filename) {
ci <- nchar(filename) - 1L
ct <- "x"
while(ct != "." & ci != 1L){
ct <- substr(filename, ci, ci)
ci <- ci - 1L
}
return( if(ci==1L) filename else substr(filename, 1L, ci) )
}
if(!is.logical(update_alt) | length(update_alt) != 1L) { stop("'update_alt' is not a single logical value") }
if(is.na(update_alt)) { stop("'update_alt' cannot be NA") }
# arguments passed on to match_alleles are now tested regardless whether
# the function is called, since they are used in the settings-tables of
# the log file.
stopifnot(is.logical(check_ambiguous_alleles), length(check_ambiguous_alleles) == 1L,
is.numeric(threshold_allele_freq_correlation), length(threshold_allele_freq_correlation) == 1L,
is.logical(remove_mismatches_std), length(remove_mismatches_std) == 1L,
is.logical(remove_mismatches_alt), length(remove_mismatches_alt) == 1L,
is.logical(remove_diffEAF_std), length(remove_diffEAF_std) == 1L,
is.logical(remove_diffEAF_alt), length(remove_diffEAF_alt) == 1L,
is.numeric(threshold_diffEAF), length(threshold_diffEAF) == 1L,
is.logical(plot_intensity), length(plot_intensity) == 1L)
if(is.na(check_ambiguous_alleles)) stop("'check_ambiguous_alleles' cannot be NA")
if(is.na(remove_mismatches_std)) stop("'remove_mismatches_std' cannot be NA")
if(is.na(remove_mismatches_alt)) stop("'remove_mismatches_alt' cannot be NA")
if(is.na(remove_diffEAF_std)) stop("'remove_diffEAF_std' cannot be NA")
if(is.na(remove_diffEAF_alt)) stop("'remove_diffEAF_alt' cannot be NA")
if(is.na(plot_intensity)) stop("'plot_intensity' cannot be NA")
if(use_allele_std) {
if(is.character(allele_ref_std)) {
stopifnot(length(allele_ref_std) == 1L, nchar(allele_ref_std) > 2L, file.exists(paste(dir_references, allele_ref_std, sep = "/")))
print(paste("Loading standard allele reference from file:", allele_ref_std), quote = FALSE)
flush.console()
settings_allele_std <- allele_ref_std
if(toupper(substr(allele_ref_std, nchar(allele_ref_std) - 5L, nchar(allele_ref_std))) == ".RDATA") {
if(missing(allele_name_std)) allele_name_std <- substr(allele_ref_std, 1L, nchar(allele_ref_std) - 6L)
refs_name <- allele_ref_std
refs_check<- load(paste(dir_references, allele_ref_std, sep = "/"))
if(!"allele_ref_std" %in% refs_check) stop(paste0("no object with name 'allele_ref_std' inside ", refs_name,
". The standard allele reference must have the object-name 'alele_ref_std', otherwise it cannot load."))
if(length(refs_check) > 1L) print(paste(" - - WARNING:", refs_name, "contains more than one object. This may affect the running of the QC."), quote = FALSE)
} else {
if(missing(allele_name_std)) allele_name_std <- remove_extension(allele_ref_std)
allele_ref_std <- read.table(paste(dir_references, allele_ref_std, sep = "/"), header = TRUE, stringsAsFactors = FALSE)
}
} else {
settings_allele_std <- "table"
if(missing(allele_name_std)) allele_name_std <- "Std. reference"
}
if(!is.data.frame(allele_ref_std) & !is.matrix(allele_ref_std)) stop("'allele_ref_std' isn't a table, matrix or dataframe")
if(!all(c("SNP", "MAJOR", "MINOR", "MAF") %in% colnames(allele_ref_std))) {
stop("Cannot find 'SNP', 'MAJOR', 'MINOR' or 'MAF' columns in 'allele_ref_std' table") }
stopifnot(is.character(allele_name_std), length(allele_name_std) == 1L, !is.na(allele_name_std), nchar(allele_name_std) > 0L)
} else {
settings_allele_std <- NA
allele_name_std <- "Std. reference"
remove_mismatches_std <- FALSE
remove_diffEAF_std <- FALSE
}
if(use_allele_alt) {
if(is.character(allele_ref_alt)) {
stopifnot(length(allele_ref_alt) == 1L, nchar(allele_ref_alt) > 2L, file.exists(paste(dir_references, allele_ref_alt, sep = "/")))
print(paste("Loading alternative allele reference from file:", allele_ref_alt), quote = FALSE)
flush.console()
settings_allele_alt <- allele_ref_alt
if(toupper(substr(allele_ref_alt, nchar(allele_ref_alt) - 5L, nchar(allele_ref_alt))) == ".RDATA") {
if(missing(update_savename)) update_savename <- substr(allele_ref_alt, 1L, nchar(allele_ref_alt) - 6L)
if(missing(allele_name_alt)) allele_name_alt <- substr(allele_ref_alt, 1L, nchar(allele_ref_alt) - 6L)
refa_name <- allele_ref_alt
refa_check<- load(paste(dir_references, allele_ref_alt, sep = "/"))
if(!"allele_ref_alt" %in% refa_check) stop(paste0("no object with name 'allele_ref_alt' inside ", refa_name,
". The alternative allele reference must have the object-name 'alele_ref_alt', otherwise it cannot load."))
if(length(refa_check) > 1L) print(paste(" - - WARNING:", refa_name, "contains more than one object. This may affect the running of the QC."), quote = FALSE)
} else {
if(missing(update_savename)) update_savename <- remove_extension(allele_ref_alt)
if(missing(allele_name_alt)) allele_name_alt <- remove_extension(allele_ref_alt)
allele_ref_alt <- read.table(paste(dir_references, allele_ref_alt, sep = "/"),
sep = "\t", comment.char = "", header = TRUE,
colClasses = c("character","integer","integer", "character","character","numeric","character","character"))
}
} else {
settings_allele_alt <- "table"
if(missing(allele_name_alt)) allele_name_alt <- "alternative reference"
if(update_alt & missing(update_savename)) stop("No filename specified for saving updated allele-reference")
}
if(!is.data.frame(allele_ref_alt) & !is.matrix(allele_ref_alt)) stop("'allele_ref_alt' isn't a table, matrix or dataframe")
if(!all(colnames(allele_ref_alt) == c("SNP", "CHR", "POS", "MINOR", "MAJOR", "MAF", "SOURCE", "DATE_ADDED"))) stop("columns of 'allele_ref_alt' table do not match the standard")
if(!is.character(allele_name_alt) | length(allele_name_alt) != 1L) stop("'allele_name_alt' is not a character-string")
if(nchar(allele_name_alt) == 0L) stop("'allele_name_alt' cannot be an empty character-string")
} else {
if(update_alt & missing(update_savename)) stop("Cannot update alternative reference: update_savename specified")
settings_allele_alt <- NA
allele_name_alt <- "alternative reference"
remove_mismatches_alt <- FALSE
remove_diffEAF_alt <- FALSE
}
if(update_alt) {
stopifnot(is.character(update_savename), length(update_savename) == 1L, !is.na(update_savename), nchar(update_savename) > 0L,
is.logical(update_as_rdata), length(update_as_rdata) == 1L, !is.na(update_as_rdata),
is.logical(backup_alt), length(backup_alt) == 1L, !is.na(backup_alt))
}
if(!file.exists(dir_output)) {
if(!dir.create(path = dir_output, showWarnings = TRUE, recursive = TRUE)) stop("Unable to create output directory")
if(!file.exists(dir_output)) stop("Output directory was created but misnamed. Check the dir_output argument for unconventional characters or trailing spaces.")
}
### PHASE 1: preparing the data
# Step 1a: loading dataset. Two control variables are created here.
# EmergencyExit is a failsafe: when the script encounters a problem
# that cannot be fixed or ignored (corrupt data or when a QC step
# removes all SNPs in the dataset), EmergencyExit is set to TRUE. The
# script tests EmergencyExit in several places. If TRUE, the rest of
# the QC is skipped. allele_ref_changed keeps track of whether the
# alternative allele reference is changed.
emergency_return <- function(name_in, name_out, LI, LN) {
print("", quote = FALSE)
print(paste("QC check aborted for", name_in,"( file", LI, "out of", LN, ")"), quote = FALSE)
return(list(QC_successful = FALSE, filename_input = name_in, filename_output = name_out, all_ref_changed = FALSE, effectsize_return = FALSE))
}
EmergencyExit <- FALSE
allele_ref_changed <- FALSE
filename_input <- filename
print("", quote = FALSE)
print("", quote = FALSE)
print(paste("Starting analysis of", filename_input,"( file", logI, "out of", logN, ")"), quote = FALSE)
print( "Step 1: loading dataset", quote = FALSE)
flush.console()
loadI <- load_test(filename_input, dir_data, column_separators = column_separators, test_nrows = nrows_test,
header = header, na.strings = na.strings, comment.char = comment.char, stringsAsFactors = FALSE, ...)
if(loadI$success) {
if(loadI$type == "zip" | loadI$type == ".gz") {
if(loadI$type == "zip") {
filename <- substr(filename, 1L, nchar(filename) - 4L)
dataI <- read.table(unz(paste(dir_data, filename_input, sep = "/"), filename), sep = loadI$sep,
header = header, nrows = nrows, na.strings = na.strings, comment.char = comment.char, stringsAsFactors = FALSE, ...)
close(unz(paste(dir_data, filename_input, sep = "/"), filename))
} else {
filename <- substr(filename, 1L, nchar(filename) - 3L)
dataI <- read.table(gzfile(paste(dir_data, filename_input, sep = "/")), sep = loadI$sep,
header = header, nrows = nrows, na.strings = na.strings, comment.char = comment.char, stringsAsFactors = FALSE, ...)
close(gzfile(paste(dir_data, filename_input, sep = "/")))
}
} else {
dataI <- read.table(paste(dir_data, filename_input, sep = "/"), sep = loadI$sep,
header = header, nrows = nrows, na.strings = na.strings, comment.char = comment.char, stringsAsFactors = FALSE, ...)
}
} else { EmergencyExit <- TRUE }
# Filename_output is checked here to avoid calling it before the extension has been removed from filename_input
filename <- remove_extension(filename)
filename_error <- TRUE
if(missing(filename_output)) {
filename <- paste("QC", filename, sep = "_")
filename_error <- FALSE
} else {
if(length(filename_output) == 1L) {
if(is.na(filename_output)) {
filename <- paste("QC", filename, sep = "_")
filename_error <- FALSE
} else {
if(is.character(filename_output)) { if(nchar(filename_output) > 0L) {
filename <- filename_output
filename_error <- FALSE
} } } } }
if(filename_error) { filename <- paste("QC", filename, sep = "_") }
filename_dir <- paste(dir_output, filename, sep = "/")
write.table(t(c("Step", "Check", "Type", "Affected SNPs", "%", "Action", "Notes")), paste0(filename_dir, "_log.txt"), append=FALSE, col.names=FALSE, row.names=FALSE, quote=FALSE, sep="\t")
if(filename_error) {
save_log(1L, "loading dataset", "output filename", 0L, 1L, "-", "Cannot use specified output filename - reverted to default", filename_dir)
print(paste0(" - - warning: unable to use specified output filename - reverted to default: ", filename, ".txt"), quote = FALSE)
}
if(EmergencyExit) { # 1st phase-1 EmergencyExit: failure to load file
print("CRITICAL ERROR: unable to load dataset ", quote=FALSE)
print(paste(" - error type:", loadI$error), quote=FALSE)
save_log(1L, "loading data", loadI$error, 0L, 1L, "QC aborted", "Cannot read file format", filename_dir)
return(emergency_return(name_in = filename_input, name_out = filename, LI = logI, LN = logN))
}
# Step 1b: checking and translating columnnames
SNPn_input <- nrow(dataI)
header_orig <- colnames(dataI)
header_info <- translate_header(header = header_orig, alternative = header_translations)
# 2st phase-1 EmergencyExit: duplicate or missing crucial headers
if(any(duplicated(header_info$header_h))) {
print("CRITICAL ERROR: dataset contains duplicate data columns",quote=FALSE)
print(paste(" - duplicate columns:", paste(header_info$header_h[duplicated(header_info$header_h)], collapse = ", ") ), quote = FALSE)
save_log(1L, "loading file", "duplicate columns", SNPn_input, SNPn_input, "QC aborted", paste("Dataset contains duplicate columns for", paste(header_info$header_h[duplicated(header_info$header_h)], collapse = ", ") ), fileL = filename_dir)
return(emergency_return(name_in = filename_input, name_out = filename, LI = logI, LN = logN))
}
if(header_info$missing_N > 0L) {
if(any(c("MARKER", "EFFECT_ALL", "OTHER_ALL", "EFFECT", "STDERR") %in% header_info$missing_h)) {
print(paste("CRITICAL ERROR: column(s)", paste(header_info$missing_h, collapse = ", "), "not found in dataset"), quote = FALSE)
save_log(1L, "loading file", "missing column(s)", SNPn_input, SNPn_input, "QC aborted", paste("Column(s)", paste(header_info$missing_h, collapse = ", "), "not found in dataset."), filename_dir)
if(header_info$unknown_N > 0L) { save_log(1L, "loading file", "unidentified column", SNPn_input, SNPn_input, "-", paste("Column(s)", paste(header_info$unknown_h, collapse = ", "), "were not found in the translation table."), filename_dir) }
return(emergency_return(name_in = filename_input, name_out = filename, LI = logI, LN = logN))
}
print(paste(" - - warning: column(s)", paste(header_info$missing_h, collapse = ", "), "not found"), quote = FALSE)
save_log(1L, "loading file", "missing column", SNPn_input, SNPn_input, "dummy column created", paste("Column(s)", paste(header_info$missing_h, collapse = ", "), "not found in dataset."), filename_dir)
if(header_info$unknown_N > 0L) { save_log(1L, "loading file", "unidentified column", SNPn_input, SNPn_input, "-", paste("Column(s)", paste(header_info$unknown_h, collapse = ", "), "were not found in the translation table."), filename_dir) }
dataI <- cbind(dataI, matrix(data = NA, nrow = SNPn_input, ncol = header_info$missing_N,
dimnames = list(NULL, header_info$missing_h)))
colnames(dataI)[1:header_info$header_N] <- header_info$header_h
} else { colnames(dataI) <- header_info$header_h }
if(order_columns) { dataI <- dataI[ , c("MARKER","CHR","POSITION","EFFECT_ALL","OTHER_ALL","STRAND","EFFECT","STDERR","PVALUE","EFF_ALL_FREQ","HWE_PVAL","CALLRATE","N_TOTAL","IMPUTED", "USED_FOR_IMP", "IMP_QUALITY", header_info$unknown_h) ] }
### PHASE 2: checking data-integrity
# Checking for invalid data entries and selecting SNPs for exclusion
# Step 2a is removing the monomorphic SNPs (defined as SNPs with a
# missing or invalid non-effect allele, allele freq = 1 or 0, or identical
# alleles). This step is caried out first so that the monomorphic
# SNPs won't generate unnecesarry logs in phase 2c-d. Y & M-
# chromosome SNPs are removed here as well.
# Step 2b is making a list of poorly-imputed SNPs (imputation quality > 0.3). Missing
# crucial variables for these SNPs won't be counted in the log
# file (but they will be shown in the stat file). This step also
# compiles lists of which SNPs are imputed and genotyped
# Step 2c is checking the remaining columns for missing and invalid
# entries.
# Step 2d is the removal of SNPs from the QC. SNPs are removed if they
# are unusuable (missing or invalid crucial variables). Crucial
# variables are: marker name, effect allele, effect size and standard
# error (bad non-effect alleles have already been removed in phase 2a).
# Duplicate SNPs are also removed here. Invalid non-crucial variables
# are set to missing.
# The QC (in phases 2a, b and c) carries out three tests per variable:
# 1) Does the column of this variable contain the correct (i.e.
# numeric or character) datatype? If not, the dataset is presumed
# corrupt and EmergencyExit is set to true.
# 2) Which entries are missing?
# 3) Which entries are invalid? (i.e. impossible values, like
# a p-value of 1.1 or chromosome 36)
# The details of are different between variables, but this is the
# broad outline. This can make for a somewhat confusing nomenclature:
# -iNA-variables refer to data that is missing in the *original*
# dataset; while inv(alid) indicates entries with impossible
# values. inv-entries will later be set to NA (for effect allele, a
# bad_ variable collects both invalid & missing entries).
# -_poor variables count the number of NA entries that have poor
# imputation quality and need not be reported as missing.
# PHASE 2a: removing monomorphic SNPs
# & Y & M chromosome SNPs
print("", quote = FALSE)
print("Step 2: testing values and excluding SNPs", quote = FALSE)
print(" - Checking alleles", quote = FALSE)
flush.console()
iNA_al1_list <- is.na(dataI$EFFECT_ALL)
iNA_al1_L0 <- sum(iNA_al1_list)
iNA_al2_list<-is.na(dataI$OTHER_ALL) | dataI$OTHER_ALL == "0" | dataI$OTHER_ALL == "-" | dataI$OTHER_ALL == "99"
iNA_al2_N <- sum(iNA_al2_list)
if(iNA_al1_L0 == SNPn_input | iNA_al2_N == SNPn_input) {
if(iNA_al1_L0 == SNPn_input) {
print("CRITICAL ERROR: effect-allele column is empty",quote=FALSE)
save_log(2L, "allele data", "no effect allele", iNA_al1_L0, SNPn_input, "QC aborted", "Effect-allele column is empty.", filename_dir)
}
if(iNA_al2_N == SNPn_input) {
print("CRITICAL ERROR: non-effect-allele column is empty",quote=FALSE)
save_log(2L, "allele data", "no non-effect allele", iNA_al2_N, SNPn_input, "QC aborted", "Non-effect-allele column is empty.", filename_dir)
}
EmergencyExit <- TRUE
} else {
if(!is.character(dataI$EFFECT_ALL)) {
print("CRITICAL ERROR: effect-allele column contains non-character entries",quote=FALSE)
save_log(2L, "allele data", "effect allele", SNPn_input - iNA_al1_L0, SNPn_input, "QC aborted", paste("Effect allele is", mode(dataI$EFFECT_ALL)), filename_dir)
EmergencyExit <- TRUE
}
if(!is.character(dataI$OTHER_ALL)) {
print("CRITICAL ERROR: non-effect-allele column contains non-character entries",quote=FALSE)
save_log(2L, "allele data", "non-effect allele", SNPn_input - iNA_al2_N, SNPn_input, "QC aborted", paste("Non-effect allele is", mode(dataI$OTHER_ALL)), filename_dir)
EmergencyExit <- TRUE
}
}
# 1st phase-2 EmergencyExit: invalid or empty EFFECT_ALL/2 columns
if(EmergencyExit) { return(emergency_return(name_in = filename_input, name_out = filename, LI = logI, LN = logN)) }
# Removing spaces from alleles
if(iNA_al1_L0 > 0L) {
if(any(nchar(dataI$EFFECT_ALL) > 1L & !iNA_al1_list)) {
dataI$EFFECT_ALL <- gsub("(^ +)|( +$)", "", dataI$EFFECT_ALL) }
} else {
if(any(nchar(dataI$EFFECT_ALL) > 1L)) {
dataI$EFFECT_ALL <- gsub("(^ +)|( +$)", "", dataI$EFFECT_ALL) } }
if(iNA_al2_N > 0L) {
if(any(nchar(dataI$OTHER_ALL) > 1L & !iNA_al2_list)) {
dataI$OTHER_ALL <- gsub("(^ +)|( +$)", "", dataI$OTHER_ALL) }
} else {
if(any(nchar(dataI$OTHER_ALL) > 1L)) {
dataI$OTHER_ALL <- gsub("(^ +)|( +$)", "", dataI$OTHER_ALL) } }
lwr_al1 <- which(dataI$EFFECT_ALL %in% c("a", "t", "c", "g"))
lwr_al2 <- which(dataI$OTHER_ALL %in% c("a", "t", "c", "g"))
if(length(lwr_al1) > 0L) { dataI$EFFECT_ALL[lwr_al1] <- toupper(dataI$EFFECT_ALL[lwr_al1]) }
if(length(lwr_al2) > 0L) { dataI$OTHER_ALL[lwr_al2] <- toupper(dataI$OTHER_ALL[lwr_al2]) }
inv_al2_list<- !dataI$OTHER_ALL %in% c("A", "T", "C", "G") & !iNA_al2_list
inv_al2_N <- sum(inv_al2_list)
low_FRQ_list<- (dataI$EFF_ALL_FREQ == 0 | dataI$EFF_ALL_FREQ == 1) & !(is.na(dataI$EFF_ALL_FREQ) | iNA_al2_list | inv_al2_list)
low_FRQ_N <- sum(low_FRQ_list)
same_al <- which(dataI$EFFECT_ALL == dataI$OTHER_ALL & !(iNA_al2_list | inv_al2_list | low_FRQ_list))
same_al_N <- length(same_al)
remove_L0 <- unique(c(which(iNA_al2_list | inv_al2_list | low_FRQ_list), same_al))
monomorp_N <- length(remove_L0)
if(is.character(dataI$CHR)) { dataI$CHR <- gsub("(^ +)|( +$)", "", dataI$CHR) }
dataI$CHR[dataI$CHR %in% c("X", "x")] <- 23L
dataI$CHR[dataI$CHR %in% c("Y", "y")] <- 24L
dataI$CHR[dataI$CHR %in% c("XY","xy", "Xy", "xY")]<- 25L
dataI$CHR[dataI$CHR %in% c("M", "m", "MT", "mt", "Mt", "mT")] <- 26L
na_rm_ch <- any(is.na(dataI$CHR))
if(remove_X) {
chr_X_N <- sum(dataI$CHR == 23, na.rm = na_rm_ch)
if(chr_X_N > 0L) {
save_log(2L, "allele data", "Chromosome X SNPs", chr_X_N, SNPn_input, "SNPs removed", "-", filename_dir)
print(paste(" - - Chromosome X SNPs removed:", chr_X_N), quote = FALSE)
remove_L0 <- unique(c(remove_L0, which(dataI$CHR == 23)))
} } else { chr_X_N <- NA }
if(remove_Y) {
chr_Y_N <- sum(dataI$CHR == 24, na.rm = na_rm_ch)
if(chr_Y_N > 0L) {
save_log(2L, "allele data", "Chromosome Y SNPs", chr_Y_N, SNPn_input, "SNPs removed", "-", filename_dir)
print(paste(" - - Chromosome Y SNPs removed:", chr_Y_N), quote = FALSE)
remove_L0 <- unique(c(remove_L0, which(dataI$CHR == 24)))
} } else { chr_Y_N <- NA }
if(remove_XY) {
chr_XY_N <- sum(dataI$CHR == 25, na.rm = na_rm_ch)
if(chr_XY_N > 0L) {
save_log(2L, "allele data", "Chromosome XY SNPs", chr_XY_N, SNPn_input, "SNPs removed", "-", filename_dir)
print(paste(" - - Chromosome XY SNPs removed:", chr_XY_N), quote = FALSE)
remove_L0 <- unique(c(remove_L0, which(dataI$CHR == 25)))
} } else { chr_XY_N <- NA }
if(remove_M) {
chr_M_N <- sum(dataI$CHR == 26, na.rm = na_rm_ch)
if(chr_M_N > 0L) {
save_log(2L, "allele data", "Chromosome M SNPs", chr_M_N, SNPn_input, "SNPs removed", "-", filename_dir)
print(paste(" - - Chromosome M SNPs removed:", chr_M_N), quote = FALSE)
remove_L0 <- unique(c(remove_L0, which(dataI$CHR == 26)))
} } else { chr_M_N <- NA }
remove_L0_N <- length(remove_L0)
# 2st step-2 EmergencyExit: no valid SNPs
if(remove_L0_N > 0L) {
print(paste(" - - monomorphic SNPs removed:", monomorp_N), quote = FALSE)
if(iNA_al2_N > 0L) {
save_log(2L, "allele data", "non-effect allele", iNA_al2_N, SNPn_input, "SNPs removed", "Missing non-effect allele entries", filename_dir)
if(remove_L0_N == SNPn_input) { print(paste(" - - warning:", iNA_al2_N, "missing non-effect allele values"), quote = FALSE) }
}
if(inv_al2_N > 0L) {
save_log(2L, "allele data", "non-effect allele", inv_al2_N, SNPn_input, "SNPs removed", "Invalid non-effect allele entries", filename_dir)
print(paste(" - - warning:", inv_al2_N, "invalid non-effect allele values"), quote = FALSE)
if(inv_al2_N > 30L) { write.table(dataI[inv_al2_list, ][1:30, ] , paste(filename_dir, "SNPs_invalid_OTHER_ALL.txt", sep = "_"), col.names=TRUE, row.names=FALSE, quote=FALSE, sep="\t")
} else { write.table(dataI[inv_al2_list, ] , paste(filename_dir, "SNPs_invalid_OTHER_ALL.txt", sep = "_"), col.names=TRUE, row.names=FALSE, quote=FALSE, sep="\t") }
}
if(low_FRQ_N > 0L) {
save_log(2L, "allele data", "allele frequency = 0", low_FRQ_N, SNPn_input, "SNPs removed", "Allele frequency of 0 or 1", filename_dir)
if(remove_L0_N == SNPn_input) { print(paste(" - - warning:", low_FRQ_N, "SNPs with allele-frequency of 0 or 1"), quote = FALSE) }
}
if(same_al_N > 0L) {
save_log(2L, "allele data", "identical alleles", same_al_N, SNPn_input, "SNPs removed", "Identical alleles, but allele frequency suggests non-monomorphic SNPs", filename_dir)
print(paste(" - - warning:", same_al_N, "SNPs with identical alleles"), quote = FALSE)
}
if(remove_L0_N == SNPn_input) {
print("CRITICAL ERROR: no usable SNPs in remaining dataset",quote=FALSE)
save_log(2L, "allele data", "no usable SNPs", remove_L0_N, SNPn_input, "QC aborted", "No usable SNPs.", filename_dir)
return(emergency_return(name_in = filename_input, name_out = filename, LI = logI, LN = logN))
} else {
save_log(2L, "allele data", "total removed", remove_L0_N, SNPn_input, "SNPs removed", "-", filename_dir)
dataI <- dataI[-remove_L0, ]
}
}
rm(iNA_al1_list, iNA_al2_list, lwr_al1, lwr_al2, inv_al2_list, low_FRQ_list, same_al, remove_L0)
# gc(verbose = FALSE) # memory cleaning
#### Step 2b: checking imputation quality
# inv and iNA are shadow-tables for dataI that log
# which entries are invalid / missing. Note that
# missing imputation-STATUS is tracked by inv, not iNA!
# remove_L1_list will be filled in phase 2d. In 2b-c it
# serves as template for the creation of other vectors.
print(" - Checking data integrity", quote = FALSE)
flush.console()
SNPn_preQC <- nrow(dataI)
reason_excl <- character(length = SNPn_preQC)
column_improb <- character(length = SNPn_preQC)
remove_L1_list <- logical(length = SNPn_preQC)
inv <- data.frame(
chr = remove_L1_list, pos = remove_L1_list,
strand = remove_L1_list, p = remove_L1_list,
FRQ = remove_L1_list, HWE = remove_L1_list,
cal = remove_L1_list, N = remove_L1_list,
impQ = remove_L1_list, impstatus = remove_L1_list )
iNA <- data.frame(
chr = is.na(dataI$CHR), pos = is.na(dataI$POSITION),
strand = is.na(dataI$STRAND), p = is.na(dataI$PVALUE),
FRQ = is.na(dataI$EFF_ALL_FREQ), HWE = is.na(dataI$HWE_PVAL),
cal = is.na(dataI$CALLRATE), N = is.na(dataI$N_TOTAL),
impQ = is.na(dataI$IMP_QUALITY))
#Testing dataI$IMPUTED - unlike other parametes, the imputation status
# column can contain many data-types; hence it's "translated" here.
# The translated values will replace the original data at the
# end of phase 2. Imputation status is also unusual in that missing
# values are stored in the "invalid" table (because without
# this parameter call rate, HWE-p & imputation quality are useless), although
# the inv_impstatus_N still counts only true invalids.
if(is.character(dataI$IMPUTED)) {
dataI$IMPUTED <- gsub("(^ +)|( +$)", "", dataI$IMPUTED) }
iNA_impstatus_N <- if(length(imputed_NA) == 0L) 0L else sum(dataI$IMPUTED %in% imputed_NA)
if(iNA_impstatus_N == SNPn_preQC) {
save_log(2L, "data integrity", "no imputation status", iNA_impstatus_N, SNPn_preQC, "-", "Imputation-status column contains no values!", filename_dir)
print(" - - warning: no imputation-status data", quote=FALSE)
inv$impstatus <- TRUE
column_improb <- paste0(column_improb, "imputation status; ")
inv_impstatus_N <- 0L
} else {
new_impstatus <- convert_impstatus(dataI$IMPUTED, T_strings = imputed_T, F_strings = imputed_F, NA_strings = imputed_NA,
use_log = TRUE, allSNPs = SNPn_preQC, fileL = filename_dir)
inv$impstatus <- is.na(new_impstatus)
column_improb[inv$impstatus] <- paste0(column_improb[inv$impstatus], "imputation status; ")
inv_impstatus_N <- sum(inv$impstatus) - iNA_impstatus_N
if(iNA_impstatus_N > 0L) {
save_log(2L, "data integrity", "imputation status", iNA_impstatus_N, SNPn_preQC, "-", "Missing imputation status values", filename_dir)
print(paste(" - - warning:", iNA_impstatus_N, "missing imputation-status values"), quote = FALSE)
}
if(inv_impstatus_N > 0L) {
if(is.character(dataI$IMPUTED)) {
print("CRITICAL ERROR: imputation-status column contains unidentified character strings",quote=FALSE)
save_log(2L, "data integrity", "imputation status", inv_impstatus_N, SNPn_preQC, "QC aborted", "Unidentified character strings in imputation status column.", filename_dir)
EmergencyExit <- TRUE
} else {
print(paste(" - - warning:", inv_impstatus_N, "invalid imputation-status values"), quote = FALSE)
save_log(2L, "data integrity", "imputation status", inv_impstatus_N, SNPn_preQC, "set to NA", "Invalid imputation status", filename_dir)
} } }
if(inv_impstatus_N + iNA_impstatus_N == SNPn_preQC) {
geno_list <- remove_L1_list
imp_list <- remove_L1_list
SNPn_preQC_geno <- 0L
SNPn_preQC_imp <- 0L
} else {
geno_list <- new_impstatus == 0 & !inv$impstatus
imp_list <- new_impstatus == 1 & !inv$impstatus
SNPn_preQC_geno <- sum(geno_list)
SNPn_preQC_imp <- sum( imp_list)
}
#Testing dataI$IMP_QUALITY
iNA_impQ_N <- sum(iNA$impQ)
if(!is.numeric(dataI$IMP_QUALITY) & iNA_impQ_N != SNPn_preQC) {
EmergencyExit <- TRUE
print("CRITICAL ERROR: imputation-quality column contains non-numeric entries",quote=FALSE)
save_log(2L, "data integrity", "imputation quality", SNPn_preQC - iNA_impQ_N, SNPn_preQC, "QC aborted", paste0("Imputation quality is ", mode(dataI$IMP_QUALITY)), filename_dir)
low_impQ_list <- remove_L1_list
} else {
if(any(dataI$IMP_QUALITY == -1, na.rm = iNA_impQ_N > 0L)) {
iNA$impQ[which(dataI$IMP_QUALITY == -1)] <- TRUE
iNA_impQ_N <- sum(iNA$impQ)
dataI$IMP_QUALITY[which(dataI$IMP_QUALITY == -1)] <- NA
}
iNA_impQ_g <- sum(iNA$impQ & geno_list)
iNA_impQ_i <- sum(iNA$impQ & imp_list)
inv$impQ <- !( (dataI$IMP_QUALITY >= minimal_impQ_value & dataI$IMP_QUALITY <= maximal_impQ_value) | iNA$impQ )
inv_impQ_N <- sum(inv$impQ)
if(inv_impQ_N > 0L) {
column_improb[inv$impQ] <- paste0(column_improb[inv$impQ],"imputation quality; ")
inv_impQ_g <- sum(inv$impQ & geno_list)
inv_impQ_i <- sum(inv$impQ & imp_list)
save_log(2, "data integrity", "imputation quality", inv_impQ_N, SNPn_preQC, "set to NA", "Invalid imputation quality", filename_dir)
print(paste(" - - warning:", inv_impQ_N, "invalid imputation-quality values"), quote = FALSE)
} else {
inv_impQ_g <- 0L
inv_impQ_i <- 0L
}
if(ignore_impstatus) { low_impQ_list <- dataI$IMP_QUALITY < 0.3 & !iNA$impQ & !inv$impQ
} else { low_impQ_list <- dataI$IMP_QUALITY < 0.3 & !iNA$impQ & !inv$impQ & imp_list }
}
#### Step 2c: checking other variables
# Testing dataI$MARKER for NA & duplicate entries.
iNA_marker <- which(is.na(dataI$MARKER))
reason_excl[iNA_marker] <- paste(reason_excl[iNA_marker],"Missing marker ID;")
iNA_mar_N <- length(iNA_marker)
if(!is.character(dataI$MARKER) & iNA_mar_N != SNPn_preQC) {
EmergencyExit <- TRUE
print("CRITICAL ERROR: marker-name column contains non-character entries",quote=FALSE)
save_log(2L, "data integrity", "SNP names", SNPn_preQC, SNPn_preQC, "QC aborted", paste0("Marker name is ", mode(dataI$MARKER)), filename_dir)
} else {
dataI$MARKER <- gsub("(^ +)|( +$)", "", dataI$MARKER)
dupli_list <- duplicated(dataI$MARKER)
if(iNA_mar_N > 0L) {
save_log(2L, "data integrity", "marker ID", iNA_mar_N, SNPn_preQC, "Markers removed", "Missing marker ID", filename_dir)
print(paste(" - - warning:", iNA_mar_N, "missing marker names"), quote = FALSE)
dupli_list[iNA_marker] <- FALSE # removes "duplicate" NA's from dupli list
}
if(any(dupli_list)) {
dupli <- which(dataI$MARKER %in% unique(dataI$MARKER[dupli_list]) ) # Lists all SNPs with multiple entries
reason_excl[dupli] <- paste(reason_excl[dupli],"Duplicate marker ID;")
SNPn_preQC_dupli <- length(dupli)
save_log(2L, "data integrity", "duplicate marker IDs", SNPn_preQC_dupli, SNPn_preQC, "Markers removed", "Duplicate marker IDs", filename_dir)
print(paste(" - - warning:", SNPn_preQC_dupli, "duplicate marker names"), quote = FALSE)
write.table(dataI[dupli, ], paste(filename_dir, "SNPs_duplicates.txt", sep = "_"), col.names=TRUE, row.names=FALSE, quote=FALSE, sep="\t")
} else {
dupli <- NULL
SNPn_preQC_dupli <- 0L
}
}
# Testing dataI$CHR
iNA_chr_N <- sum(iNA$chr)
if(iNA_chr_N != SNPn_preQC) {
inv$chr <- !(dataI$CHR %in% 1:26 | iNA$chr)
column_improb[inv$chr] <- paste0(column_improb[inv$chr],"chromosome; ")
inv_chr_N <- sum(inv$chr)
if(inv_chr_N > 0L) {
save_log(2L, "data integrity", "chromosome number", inv_chr_N, SNPn_preQC, "set to NA", "Invalid chromosome number", filename_dir)
print(paste(" - - warning:", inv_chr_N, "invalid chromosome values"), quote = FALSE)
}
if(iNA_chr_N + inv_chr_N < SNPn_preQC) {
if(any(!(c(1:22) %in% dataI$CHR))) {
nochr <- paste(which(!(c(1:22) %in% dataI$CHR)), collapse=", ")
print(paste(" - - no SNPs present for chromosome(s)", nochr), quote = FALSE)
save_log(2L, "data integrity", "chromosome number", SNPn_preQC, SNPn_preQC, "-", paste("No SNPs present for chromosome(s)", nochr), filename_dir)
} }
} else { inv_chr_N <- 0L }
#Testing dataI$POSITION
iNA_pos_N <- sum(iNA$pos)
if(!is.numeric(dataI$POSITION) & iNA_pos_N != SNPn_preQC) {
EmergencyExit <- TRUE
print("CRITICAL ERROR: position column contains non-integer entries",quote=FALSE)
save_log(2L, "data integrity", "position", SNPn_preQC, SNPn_preQC, "QC aborted", paste0("Position is ", mode(dataI$POSITION)), filename_dir)
} else {
inv$pos <- dataI$POSITION <= 0 & !iNA$pos
column_improb[inv$pos] <- paste0(column_improb[inv$pos],"position; ")
inv_pos_N <- sum(inv$pos)
if(inv_pos_N > 0L) {
save_log(2L, "data integrity", "position", inv_pos_N, SNPn_preQC, "set to NA", "Invalid positions", filename_dir)
print(paste(" - - warning:", inv_pos_N, "invalid position values"), quote = FALSE)
} }
#Testing dataI$EFFECT_ALL
# Because it is possible, if unlikely, that all non-missing entries were removed
# in phase 2a, the data type is checked again, here.
iNA_al1_N <- sum(is.na(dataI$EFFECT_ALL))
if(!is.character(dataI$EFFECT_ALL) & iNA_al1_N != SNPn_preQC) {
EmergencyExit <- TRUE
print("CRITICAL ERROR: effect-allele column contains non-character entries",quote=FALSE)
save_log(2L, "data integrity", "effect allele", SNPn_preQC, SNPn_preQC, "QC aborted", paste0("Effect allele is ", mode(dataI$EFFECT_ALL)), filename_dir)
} else {
bad_al1 <- which(!dataI[ ,"EFFECT_ALL"] %in% c("A", "T", "C", "G"))
reason_excl[bad_al1] <- paste(reason_excl[bad_al1],"Invalid effect allele;")
inv_al1_N <- length(bad_al1) - iNA_al1_N
if(iNA_al1_N > 0L) {
save_log(2L, "data integrity", "effect allele", iNA_al1_N, SNPn_preQC, "Markers removed", "Missing effect allele", filename_dir)
print(paste(" - - warning:", iNA_al1_N, "missing effect alleles"), quote = FALSE)
}
if(inv_al1_N > 0L) {
save_log(2L, "data integrity", "effect allele", inv_al1_N, SNPn_preQC, "Markers removed", "Invalid effect allele", filename_dir)
print(paste(" - - warning:", inv_al1_N, "invalid effect alleles"), quote = FALSE)
}
if(!"A" %in% dataI$EFFECT_ALL) save_log(2L, "data integrity", "effect allele", SNPn_preQC, SNPn_preQC, "-", "No A alleles found in effect allele column", filename_dir)
if(!"C" %in% dataI$EFFECT_ALL) save_log(2L, "data integrity", "effect allele", SNPn_preQC, SNPn_preQC, "-", "No C alleles found in effect allele column", filename_dir)
if(!"T" %in% dataI$EFFECT_ALL) save_log(2L, "data integrity", "effect allele", SNPn_preQC, SNPn_preQC, "-", "No T alleles found in effect allele column", filename_dir)
if(!"G" %in% dataI$EFFECT_ALL) save_log(2L, "data integrity", "effect allele", SNPn_preQC, SNPn_preQC, "-", "No G alleles found in effect allele column", filename_dir)
if(!"A" %in% dataI$OTHER_ALL ) save_log(2L, "data integrity", "other allele", SNPn_preQC, SNPn_preQC, "-", "No A alleles found in non-effect allele column", filename_dir)
if(!"C" %in% dataI$OTHER_ALL ) save_log(2L, "data integrity", "other allele", SNPn_preQC, SNPn_preQC, "-", "No C alleles found in non-effect allele column", filename_dir)
if(!"T" %in% dataI$OTHER_ALL ) save_log(2L, "data integrity", "other allele", SNPn_preQC, SNPn_preQC, "-", "No T alleles found in non-effect allele column", filename_dir)
if(!"G" %in% dataI$OTHER_ALL ) save_log(2L, "data integrity", "other allele", SNPn_preQC, SNPn_preQC, "-", "No G alleles found in non-effect allele column", filename_dir)
}
# Testing strand_info
strand_pre <- list(plus = 0L, minus = 0L, missing = sum(iNA$strand), invalid = 0L)
if(is.character(dataI$STRAND) & strand_pre$missing != SNPn_preQC) {
dataI$STRAND <- gsub("(^ +)|( +$)", "", dataI$STRAND)
if(strand_pre$missing > 0L) {
strand_min <- dataI$STRAND == "-" & !iNA$strand
inv$strand <- !(dataI$STRAND == "+" | strand_min | iNA$strand )
column_improb[inv$strand] <- paste0(column_improb[inv$strand],"strand; ")
} else {
strand_min <- dataI$STRAND == "-"
inv$strand <- !(dataI$STRAND == "+" | strand_min)
column_improb[inv$strand] <- paste0(column_improb[inv$strand],"strand; ")
}
strand_pre$minus <- sum(strand_min)
strand_pre$invalid <- sum(inv$strand)
strand_pre$plus <- SNPn_preQC - (strand_pre$minus + strand_pre$missing + strand_pre$invalid)
if(strand_pre$missing > 0L) { save_log(2L, "data integrity", "strand", strand_pre$missing, SNPn_preQC, "-", "Missing strand information", filename_dir) }
if(strand_pre$invalid > 0L) {
save_log(2L, "data integrity", "strand", strand_pre$invalid, SNPn_preQC, "set to NA", "Invalid strand information", filename_dir)
print(paste(" - - warning:", strand_pre$invalid, "invalid strand values"), quote = FALSE)
}
} else {
if(strand_pre$missing == SNPn_preQC) {
save_log(2L, "data integrity", "strand", strand_pre$missing, SNPn_preQC, "-", "No strand information in dataset", filename_dir)
print(" - - warning: no strand information in dataset", quote = FALSE)
strand_min <- remove_L1_list
} else {
strand_pre$invalid <- SNPn_preQC - strand_pre$missing
print("CRITICAL ERROR: strand column contains non-character entries", quote=FALSE)
save_log(2L, "data integrity", "strand", strand_pre$invalid, SNPn_preQC, "QC aborted", paste0("Strand information is ", mode(dataI$STRAND)), filename_dir)
EmergencyExit <- TRUE
} }
#Testing dataI$EFFECT
iNA_eff <- which(is.na(dataI$EFFECT))
reason_excl[iNA_eff] <- paste(reason_excl[iNA_eff],"Missing effect size;")
iNA_eff_N <- length(iNA_eff)
if(!is.numeric(dataI$EFFECT) & iNA_eff_N != SNPn_preQC) {
EmergencyExit <- TRUE
print("CRITICAL ERROR: effect size column contains non-numeric entries",quote=FALSE)
save_log(2L, "data integrity", "effect size", SNPn_preQC, SNPn_preQC, "QC aborted", paste0("Effect size is ", mode(dataI$EFFECT)), filename_dir)
} else {
min_eff <- which(dataI$EFFECT == -1 & (dataI$PVALUE == -1 | dataI$STDERR == -1))
if(length(min_eff) > 0L) {
dataI$EFFECT[min_eff] <- NA
iNA_eff <- c(iNA_eff, min_eff)
iNA_eff_N <- iNA_eff_N + length(min_eff)
}
if(iNA_eff_N > 0L) {
iNA_eff_poor <- sum(low_impQ_list[iNA_eff])
if(iNA_eff_N - iNA_eff_poor > 0L) {
save_log(2L, "data integrity", "effect size", iNA_eff_N - iNA_eff_poor, SNPn_preQC, "Markers removed", "Missing effect size (poorly imputed markers not included)", filename_dir)
print(paste(" - - warning:", iNA_eff_N - iNA_eff_poor, "missing effect size values"), quote = FALSE)
}
} else { iNA_eff_poor <- 0L }
}
#Testing dataI$STDERR
iNA_se <- which(is.na(dataI$STDERR))
reason_excl[iNA_se] <- paste(reason_excl[iNA_se],"Missing standard error;")
iNA_se_N <- length(iNA_se)
if(!is.numeric(dataI$STDERR) & iNA_se_N != SNPn_preQC) {
EmergencyExit <- TRUE
print("CRITICAL ERROR: SE column contains non-numeric entries",quote=FALSE)
save_log(2L, "data integrity", "standard error", SNPn_preQC, SNPn_preQC, "QC aborted", paste0("Standard error is ", mode(dataI$STDERR)), filename_dir)
} else {
if(any(dataI$STDERR == -1, na.rm = iNA_se_N > 0L)) {
iNA_se <- c(iNA_se, which(dataI$STDERR == -1))
iNA_se_N <- length(iNA_se)
dataI$STDERR[which(dataI$STDERR == -1)] <- NA
}
inv_se <- which(dataI$STDERR <= 0)
reason_excl[inv_se] <- paste(reason_excl[inv_se],"Invalid standard error;")
inv_se_N <- length(inv_se)
if(inv_se_N > 0L) {
if(any(dataI$STDERR[inv_se] == 0)) {
if(any(dataI$STDERR[inv_se] != 0)) {
save_log(2L, "data integrity", "standard error", sum(dataI$STDERR[inv_se] != 0), SNPn_preQC, "Markers removed", "Invalid standard error (not counting SE = 0)", filename_dir)
}
save_log(2L, "data integrity", "standard error", sum(dataI$STDERR[inv_se] == 0), SNPn_preQC, "Markers removed", "Invalid standard error (SE = 0)", filename_dir)
} else {
save_log(2L, "data integrity", "standard error", inv_se_N, SNPn_preQC, "Markers removed", "Invalid standard error", filename_dir)
}
print(paste(" - - warning:", inv_se_N, "invalid standard error values"), quote = FALSE)
}
if(iNA_se_N > 0L) {
iNA_se_poor <- sum(low_impQ_list[iNA_se])
if(iNA_se_N - iNA_se_poor > 0L) {
save_log(2L, "data integrity", "standard error", iNA_se_N - iNA_se_poor, SNPn_preQC, "Markers removed", "Missing standard error (poorly imputed markers not included)", filename_dir)
print(paste(" - - warning:", iNA_se_N - iNA_se_poor, "missing standard error values"), quote = FALSE)
}
} else { iNA_se_poor<- 0L }
}
#Testing dataI$PVALUE
iNA_p_N <- sum(iNA$p)
if(!is.numeric(dataI$PVALUE) & iNA_p_N != SNPn_preQC) {
EmergencyExit <- TRUE
print("CRITICAL ERROR: p-value column contains non-numeric entries",quote=FALSE)
save_log(2L, "data integrity", "p-value", SNPn_preQC, SNPn_preQC, "QC aborted", paste0("P-value is ", mode(dataI$PVALUE)), filename_dir)
} else {
if(any(dataI$PVALUE == -1, na.rm = iNA_p_N > 0L)) {
iNA$p[which(dataI$PVALUE == -1)] <- TRUE
iNA_p_N <- sum(iNA$p)
dataI$PVALUE[which(dataI$PVALUE == -1)] <- NA
}
inv$p <- !( (dataI$PVALUE > 0 & dataI$PVALUE <= 1) | iNA$p )
column_improb[inv$p] <- paste0(column_improb[inv$p],"p-value; ")
inv_p_N <- sum(inv$p)
if(inv_p_N > 0L) {
save_log(2L, "data integrity", "p-value", inv_p_N, SNPn_preQC, "set to NA", "Invalid p-value", filename_dir)
print(paste(" - - warning:", inv_p_N, "invalid p-values"), quote = FALSE)
} }
#Testing dataI$EFF_ALL_FREQ
iNA_FRQ_N <- sum(iNA$FRQ)
if(!is.numeric(dataI$EFF_ALL_FREQ) & iNA_FRQ_N != SNPn_preQC) {
EmergencyExit <- TRUE
print("CRITICAL ERROR: allele-frequency column contains non-numeric entries",quote=FALSE)
save_log(2L, "data integrity", "allele frequency", SNPn_preQC, SNPn_preQC, "QC aborted", paste0("Allele frequency is ", mode(dataI$EFF_ALL_FREQ)), filename_dir)
} else {
if(any(dataI$EFF_ALL_FREQ == -1, na.rm = iNA_FRQ_N > 0L)) {
iNA$FRQ[which(dataI$EFF_ALL_FREQ == -1)] <- TRUE
iNA_FRQ_N <- sum(iNA$FRQ)
dataI$EFF_ALL_FREQ[which(dataI$EFF_ALL_FREQ == -1)] <- NA
}
inv$FRQ <- !( (dataI$EFF_ALL_FREQ >= 0 & dataI$EFF_ALL_FREQ <= 1) | iNA$FRQ )
column_improb[inv$FRQ] <- paste0(column_improb[inv$FRQ],"allele frequency; ")
inv_FRQ_N <- sum(inv$FRQ)
if(inv_FRQ_N > 0L) {
save_log(2L, "data integrity", "allele frequency", inv_FRQ_N, SNPn_preQC, "set to NA", "Invalid allele frequencies", filename_dir)
print(paste(" - - warning:", inv_FRQ_N, "invalid allele frequencies"), quote = FALSE)
} }
#Testing dataI$HWE_PVAL
iNA_HWE_N <- sum(iNA$HWE)
if(!is.numeric(dataI$HWE_PVAL) & iNA_HWE_N != SNPn_preQC) {
EmergencyExit <- TRUE
print("CRITICAL ERROR: HWE p-value column contains non-numeric entries",quote=FALSE)
save_log(2L, "data integrity", "HWE p-value", SNPn_preQC, SNPn_preQC, "QC aborted", paste0("HWE p-value is ", mode(dataI$HWE_PVAL)), filename_dir)
} else {
if(any(dataI$HWE_PVAL == -1, na.rm = iNA_HWE_N > 0L)) {
iNA$HWE[which(dataI$HWE_PVAL == -1)] <- TRUE
iNA_HWE_N <- sum(iNA$HWE)
dataI$HWE_PVAL[which(dataI$HWE_PVAL == -1)] <- NA
}
inv$HWE <- !( (dataI$HWE_PVAL > 0 & dataI$HWE_PVAL <= 1) | iNA$HWE )
column_improb[inv$HWE] <- paste0(column_improb[inv$HWE],"HWE p-value; ")
inv_HWE_N <- sum(inv$HWE)
iNA_HWE_g <- sum(iNA$HWE & geno_list)
iNA_HWE_i <- sum(iNA$HWE & imp_list)
if(inv_HWE_N > 0L) {
save_log(2L, "data integrity", "HWE p-value", inv_HWE_N, SNPn_preQC, "set to NA", "Invalid HWE p-value", filename_dir)
print(paste(" - - warning:", inv_HWE_N, "invalid HWE p-values"), quote = FALSE)
inv_HWE_g <- sum(inv$HWE & geno_list)
inv_HWE_i <- sum(inv$HWE & imp_list)
} else {
inv_HWE_g <- 0L
inv_HWE_i <- 0L
} }
#Testing dataI$CALLRATE
iNA_cal_N <- sum(iNA$cal)
if(!is.numeric(dataI$CALLRATE) & iNA_cal_N != SNPn_preQC) {
EmergencyExit <- TRUE
print("CRITICAL ERROR: call-rate column contains non-numeric entries",quote=FALSE)
save_log(2L, "data integrity", "call rate", SNPn_preQC, SNPn_preQC, "QC aborted", paste0("Call rate is ", mode(dataI$CALLRATE)), filename_dir)
} else {
if(any(dataI$CALLRATE == -1, na.rm = iNA_cal_N > 0L)) {
iNA$cal[which(dataI$CALLRATE == -1)] <- TRUE
iNA_cal_N <- sum(iNA$cal)
dataI$CALLRATE[which(dataI$CALLRATE == -1)] <- NA
}
inv$cal <- !( (dataI$CALLRATE >= 0 & dataI$CALLRATE <= 1) | iNA$cal )
column_improb[inv$cal] <- paste0(column_improb[inv$cal],"call rate; ")
inv_cal_N <- sum(inv$cal)
iNA_cal_g <- sum(iNA$cal & geno_list)
iNA_cal_i <- sum(iNA$cal & imp_list)
if(inv_cal_N > 0L) {
save_log(2L, "data integrity", "call rate", inv_cal_N, SNPn_preQC, "set to NA", "Invalid call rate", filename_dir)
print(paste(" - - warning:", inv_cal_N, "invalid callrate values"), quote = FALSE)
inv_cal_g <- sum(inv$cal & geno_list)
inv_cal_i <- sum(inv$cal & imp_list)
} else {
inv_cal_g <- 0L
inv_cal_i <- 0L
} }
#Testing dataI$N_TOTAL
iNA_N_N <- sum(iNA$N)
if(!is.numeric(dataI$N_TOTAL) & iNA_N_N != SNPn_preQC) {
EmergencyExit <- TRUE
print("CRITICAL ERROR: QC aborted because sample-size column contains non-numeric entries",quote=FALSE)
save_log(2L, "data integrity", "sample size", SNPn_preQC, SNPn_preQC, "QC aborted", paste0("Sample size is ", mode(dataI$N_TOTAL)), filename_dir)
} else {
iNA_N_g <- sum( iNA$N & geno_list)
iNA_N_i <- sum( iNA$N & imp_list)
inv$N <- dataI$N_TOTAL <= 0 & !iNA$N
column_improb[inv$N] <- paste0(column_improb[inv$N],"sample size; ")
inv_N_N <- sum(inv$N)
if(inv_N_N > 0L) {
save_log(2L, "data integrity", "sample size", inv_N_N, SNPn_preQC, "set to NA", "Invalid sample size", filename_dir)
print(paste(" - - warning:", inv_N_N, "invalid sample size values"), quote = FALSE)
inv_N_g <- sum(inv$N & geno_list)
inv_N_i <- sum(inv$N & imp_list)
} else {
inv_N_g <- 0L
inv_N_i <- 0L
} }
# 3rd phase-2 EmergencyExit: invalid data-types
if(EmergencyExit) return(emergency_return(name_in = filename_input, name_out = filename, LI = logI, LN = logN))
#### PHASE 2d: removal of unusable SNPs
# Here the scripts checks if no column contained unexpected data
# (via EmergencyExit), and makes lists of entries selected for
# removal. These entries are then saved separately, together
# with a list (inv_entries) of SNPs with improbable values for
# for non-crucial data. The improbable values are then set to NA,
# and the remove_L1 removed from the dataset.(In the
# unlikely event that all SNPs are removed, the script will
# activate EmergencyExit instead.)
flush.console()
SNPn_preQC_inv <- sum(rowSums(inv) > 0L)
# 4rd phase-2 EmergencyExit: no usuable SNPs
remove_L1 <- sort(unique(c(iNA_marker, dupli, bad_al1, iNA_eff, iNA_se, inv_se)))
remove_L1_N <- length(remove_L1)
if(remove_L1_N > 0L ) {
if(remove_L1_N == SNPn_preQC) {
print("CRITICAL ERROR: no usable markers remaining in dataset",quote=FALSE)
save_log(2L, "data integrity", "unusable dataset", remove_L1_N, SNPn_preQC, "QC check aborted", "No valid markers.", filename_dir)
return(emergency_return(name_in = filename_input, name_out = filename, LI = logI, LN = logN))
} else {
write.table(data.frame(dataI[remove_L1,],REASON=reason_excl[remove_L1]), paste(filename_dir, "SNPs_removed.txt", sep = "_"), col.names=TRUE, row.names=FALSE, quote=FALSE, sep="\t")
if(SNPn_preQC_inv > 0L) {
remove_L1_list[remove_L1] <- TRUE
if(any(rowSums(inv) > 0L & !remove_L1_list)) {
write.table(data.frame(dataI[rowSums(inv) > 0L & !remove_L1_list, ], COLUMN = column_improb[rowSums(inv) > 0L & !remove_L1_list], stringsAsFactors = FALSE)[1:min(1000, sum(rowSums(inv) > 0L & !remove_L1_list)), ],
paste(filename_dir, "SNPs_improbable_values.txt", sep = "_"), append = FALSE, col.names=TRUE, row.names=FALSE, quote=FALSE, sep="\t")
}
if(inv_chr_N > 0L) dataI$CHR[inv$chr] <- NA
if(inv_pos_N > 0L) dataI$POSITION[inv$pos] <- NA
if(strand_pre$invalid > 0L) dataI$STRAND[inv$strand] <- NA
if(inv_p_N > 0L) dataI$PVALUE[inv$p] <- NA
if(inv_FRQ_N > 0L) dataI$EFF_ALL_FREQ[inv$FRQ] <- NA
if(inv_HWE_N > 0L) dataI$HWE_PVAL[inv$HWE] <- NA
if(inv_cal_N > 0L) dataI$CALLRATE[inv$cal] <- NA
if(inv_N_N > 0L) dataI$N_TOTAL[inv$N] <- NA
if(inv_impQ_N > 0L) dataI$IMP_QUALITY[inv$impQ] <- NA
}
dataI$IMPUTED <- if(iNA_impstatus_N == SNPn_preQC) NA else new_impstatus
dataI <- dataI[-remove_L1, ]
inv <- inv[-remove_L1, ]
iNA <- iNA[-remove_L1, ]
geno_list <- geno_list[-remove_L1]
imp_list <- imp_list[-remove_L1]
strand_min <- strand_min[-remove_L1]
SNPn_ref <- nrow(dataI)
strand_ref <- list(
plus = 0L,
minus = if(strand_pre$minus == 0L) 0L else sum(strand_min),
missing = if(strand_pre$missing == 0L) 0L else sum(iNA$strand),
invalid = if(strand_pre$invalid == 0L) 0L else sum(inv$strand) )
strand_ref$plus <- SNPn_ref - (strand_ref$missing + strand_ref$invalid + strand_ref$minus)
save_log(2L, "data integrity", "total removed", remove_L1_N, SNPn_preQC, "Markers removed", "Total number of markers removed", filename_dir)
print(paste(" - - unusable markers removed:", remove_L1_N), quote = FALSE)
}
} else {
if(SNPn_preQC_inv > 0L) {
write.table(data.frame(dataI[rowSums(inv) > 0L, ], COLUMN = column_improb[rowSums(inv) > 0L], stringsAsFactors = FALSE)[1:min(1000,SNPn_preQC_inv), ],
paste(filename_dir, "SNPs_improbable_values.txt", sep = "_"), append = FALSE, col.names=TRUE, row.names=FALSE, quote=FALSE, sep="\t")
if(inv_chr_N > 0L) dataI$CHR[inv$chr] <- NA
if(inv_pos_N > 0L) dataI$POSITION[inv$pos] <- NA
if(strand_pre$invalid > 0L) dataI$STRAND[inv$strand] <- NA
if(inv_p_N > 0L) dataI$PVALUE[inv$p] <- NA
if(inv_FRQ_N > 0L) dataI$EFF_ALL_FREQ[inv$FRQ] <- NA
if(inv_HWE_N > 0L) dataI$HWE_PVAL[inv$HWE] <- NA
if(inv_cal_N > 0L) dataI$CALLRATE[inv$cal] <- NA
if(inv_N_N > 0L) dataI$N_TOTAL[inv$N] <- NA
if(inv_impQ_N > 0L) dataI$IMP_QUALITY[inv$impQ] <- NA
}
dataI$IMPUTED <- if(iNA_impstatus_N == SNPn_preQC) NA else new_impstatus
SNPn_ref <- SNPn_preQC
strand_ref <- strand_pre
save_log(2L, "data integrity", "total removed", 0L, SNPn_preQC, "-", "No markers removed", filename_dir)
print(paste(" - - no markers removed from", filename_input), quote = FALSE)
}
if(iNA_impstatus_N != SNPn_preQC) rm(new_impstatus)
rm(remove_L1, remove_L1_list, low_impQ_list,
iNA_marker, dupli_list, dupli, bad_al1,
iNA_eff, min_eff, iNA_se, inv_se, column_improb, reason_excl)
gc(verbose = FALSE) # memory cleaning
### PHASE 3: checking alleles & frequency against a base (allele_ref_std) and
# a additional reference (allele_ref_alt) table. Base table will remain unaltered, while
# the additional table will be updated with unknown SNPs.
print("", quote = FALSE)
print("Step 3: checking alleles", quote = FALSE)
# creating a high-quality list for the allele-frequency plots
if(use_allele_std | use_allele_alt) {
FRQ_FRQ_threshold <- if(useFRQ_threshold <= 1) SNPn_ref * useFRQ_threshold else useFRQ_threshold
FRQ_HWE_threshold <- if(useHWE_threshold <= 1) SNPn_ref * useHWE_threshold else useHWE_threshold
FRQ_cal_threshold <- if(useCal_threshold <= 1) SNPn_ref * useCal_threshold else useCal_threshold
FRQ_imp_threshold <- if(useImp_threshold <= 1) SNPn_ref * useImp_threshold else useImp_threshold
if(FRQ_FRQ_threshold < 2) { FRQ_FRQ_threshold <- 2
save_log(3L, "thresholds", "allele frequency", SNPn_ref, SNPn_ref, "set to 2", "User-specified threshold was < 2", filename_dir) }
if(FRQ_HWE_threshold < 2) { FRQ_HWE_threshold <- 2
save_log(3L, "thresholds", "HWE p-value", SNPn_ref, SNPn_ref, "set to 2", "User-specified threshold was < 2", filename_dir) }
if(FRQ_cal_threshold < 2) { FRQ_cal_threshold <- 2
save_log(3L, "thresholds", "Call rate", SNPn_ref, SNPn_ref, "set to 2", "User-specified threshold was < 2", filename_dir) }
if(FRQ_imp_threshold < 2) { FRQ_imp_threshold <- 2
save_log(3L, "thresholds", "Imputation quality", SNPn_ref, SNPn_ref, "set to 2", "User-specified threshold was < 2", filename_dir) }
if(ignore_impstatus) {
FRQ_HQ_list <- HQ_filter(data = dataI, ignore_impstatus = TRUE,
FRQ_val = if(SNPn_ref - sum(iNA$FRQ | inv$FRQ ) >= FRQ_FRQ_threshold) HQfilter_FRQ else NULL,
HWE_val = if(SNPn_ref - sum(iNA$HWE | inv$HWE ) >= FRQ_HWE_threshold) HQfilter_HWE else NULL,
cal_val = if(SNPn_ref - sum(iNA$cal | inv$cal ) >= FRQ_cal_threshold) HQfilter_cal else NULL,
imp_val = if(SNPn_ref - sum(iNA$impQ| inv$impQ) >= FRQ_imp_threshold) HQfilter_imp else NULL,
FRQ_NA = NAfilter_FRQ, HWE_NA = NAfilter_HWE, cal_NA = NAfilter_cal, imp_NA = NAfilter_imp)
} else {
FRQ_HQ_list <- HQ_filter(data = dataI, ignore_impstatus = FALSE,
FRQ_val = if(SNPn_ref - sum(iNA$FRQ | inv$FRQ) >= FRQ_FRQ_threshold) HQfilter_FRQ else NULL,
HWE_val = if(SNPn_preQC_geno == 0L) NULL else { if(sum(geno_list & !(iNA$HWE | inv$HWE) ) >= FRQ_HWE_threshold) HQfilter_HWE else NULL },
cal_val = if(SNPn_preQC_geno == 0L) NULL else { if(sum(geno_list & !(iNA$cal | inv$cal) ) >= FRQ_cal_threshold) HQfilter_cal else NULL },
imp_val = if(SNPn_preQC_imp == 0L) NULL else { if(sum( imp_list & !(iNA$impQ|inv$impQ) ) >= FRQ_imp_threshold) HQfilter_imp else NULL },
FRQ_NA = NAfilter_FRQ, HWE_NA = NAfilter_HWE, cal_NA = NAfilter_cal, imp_NA = NAfilter_imp)
}
}
# Step 3a: using the base reference
if(use_allele_std) {
ref_std <- dataI$MARKER %in% allele_ref_std$SNP
SNPn_ref_std <- sum(ref_std)
if(SNPn_ref_std > 0L) {
print(paste(" - matching alleles to", allele_name_std), quote = FALSE)
flush.console()
allele_out_std <- match_alleles(dataset = dataI[ref_std,
c("MARKER", "EFFECT_ALL", "OTHER_ALL", "STRAND", "EFFECT", "EFF_ALL_FREQ")[c(TRUE, TRUE, TRUE, strand_ref$minus > 0L, TRUE, TRUE)]],
dataname = filename,
ref_set = allele_ref_std, ref_name = allele_name_std,
unmatched_data = FALSE, HQ_subset = FRQ_HQ_list[ref_std],
check_strand = strand_ref$minus > 0L,
save_mismatches = TRUE, delete_mismatches = remove_mismatches_std,
delete_diffEAF = remove_diffEAF_std, threshold_diffEAF = threshold_diffEAF,
check_FRQ = TRUE, check_ambiguous = check_ambiguous_alleles,
plot_FRQ = make_plots, plot_intensity = plot_intensity,
plot_if_threshold = only_plot_if_threshold, threshold_r = threshold_allele_freq_correlation,
return_SNPs = TRUE,
save_name = filename, save_dir = dir_output, use_log = TRUE, log_SNPall = SNPn_ref )
dataI$EFFECT_ALL[ref_std] <- allele_out_std$EFFECT_ALL
dataI$OTHER_ALL[ref_std] <- allele_out_std$OTHER_ALL
if(strand_ref$minus > 0L & allele_out_std$n_negative_strand > 0L) {
dataI$STRAND[ref_std] <- allele_out_std$STRAND }
dataI$EFFECT[ref_std] <- allele_out_std$EFFECT
dataI$EFF_ALL_FREQ[ref_std] <- allele_out_std$EFF_ALL_FREQ
# Memory-cleaning: data columns are removed from allele_out
allele_out_std <- allele_out_std[1:16]
gc(verbose = FALSE)
} else {
print(paste(" - matching alleles to", allele_name_std, "(SKIPPED)"), quote = FALSE)
allele_out_std <- list(FRQ_cor = NA, FRQ_cor_ambiguous = NA, FRQ_cor_nonambi = NA,
n_SNPs = 0L, n_negative_strand = NA, n_negative_switch = NA, n_negative_mismatch = NA,
n_strandswitch = NA, n_mismatch = NA, n_flipped = NA, n_ambiguous = NA, n_suspect = NA, n_diffEAF = NA)
save_log(3L, paste("allele match -", allele_name_std), "no data", SNPn_ref, SNPn_ref, "phase 3a skipped", paste("No matching markers found in", allele_name_std), filename_dir)
}
} else {
SNPn_ref_std <- 0L
allele_out_std <- list(FRQ_cor = NA, FRQ_cor_ambiguous = NA, FRQ_cor_nonambi = NA,
n_SNPs = NA, n_negative_strand = NA, n_negative_switch = NA, n_negative_mismatch = NA,
n_strandswitch = NA, n_mismatch = NA, n_flipped = NA, n_ambiguous = NA, n_suspect = NA, n_diffEAF = NA)
}
# Step 3b: using alternative reference
if(use_allele_alt) {
ref_alt <- if(use_allele_std) !ref_std & dataI$MARKER %in% allele_ref_alt$SNP else dataI$MARKER %in% allele_ref_alt$SNP
SNPn_ref_alt<- sum(ref_alt)
if(SNPn_ref_alt > 0L) {
print(paste(" - matching alleles to", allele_name_alt), quote = FALSE)
flush.console()
allele_out_alt <- match_alleles(dataset = dataI[ref_alt, c("MARKER", "EFFECT_ALL", "OTHER_ALL", "STRAND", "EFFECT", "EFF_ALL_FREQ")[c(TRUE, TRUE, TRUE, strand_ref$minus > 0L, TRUE, TRUE)]],
dataname = filename,
ref_set = allele_ref_alt, ref_name = allele_name_alt,
unmatched_data = FALSE, HQ_subset = FRQ_HQ_list[ref_alt],
check_strand = strand_ref$minus > 0L,
save_mismatches = TRUE, delete_mismatches = remove_mismatches_alt,
delete_diffEAF = remove_diffEAF_alt, threshold_diffEAF = threshold_diffEAF,
check_FRQ = TRUE, check_ambiguous = check_ambiguous_alleles,
plot_FRQ = make_plots, plot_intensity = plot_intensity,
plot_if_threshold = only_plot_if_threshold, threshold_r = threshold_allele_freq_correlation,
return_SNPs = TRUE,
save_name = filename, save_dir = dir_output, use_log = TRUE, log_SNPall = SNPn_ref )
dataI$EFFECT_ALL[ref_alt] <- allele_out_alt$EFFECT_ALL
dataI$OTHER_ALL[ref_alt] <- allele_out_alt$OTHER_ALL
if(strand_ref$minus > 0L & allele_out_alt$n_negative_strand > 0L) { dataI$STRAND[ref_alt] <- allele_out_alt$STRAND }
dataI$EFFECT[ref_alt] <- allele_out_alt$EFFECT
dataI$EFF_ALL_FREQ[ref_alt]<- allele_out_alt$EFF_ALL_FREQ
# Memory-cleaning: data columns are removed from allele_out
allele_out_alt <- allele_out_alt[1:16]
gc(verbose = FALSE)
} else {
print(paste(" - matching alleles to", allele_name_alt, "(SKIPPED)"), quote = FALSE)
allele_out_alt <- list(FRQ_cor = NA, FRQ_cor_ambiguous = NA, FRQ_cor_nonambi = NA,
n_SNPs = 0L, n_negative_strand = NA, n_negative_switch = NA, n_negative_mismatch = NA,
n_strandswitch = NA, n_mismatch = NA, n_flipped = NA, n_ambiguous = NA, n_suspect = NA, n_diffEAF = NA)
}
} else {
SNPn_ref_alt <- 0L
allele_out_alt <- list(FRQ_cor = NA, FRQ_cor_ambiguous = NA, FRQ_cor_nonambi = NA,
n_SNPs = NA, n_negative_strand = NA, n_negative_switch = NA, n_negative_mismatch = NA,
n_strandswitch = NA, n_mismatch = NA, n_flipped = NA, n_ambiguous = NA, n_suspect = NA, n_diffEAF = NA)
}
# Step 3c: adding unknown SNPs to alternative reference
SNPn_ref_new <- SNPn_ref - SNPn_ref_std - SNPn_ref_alt
if(SNPn_ref_new > 0L) {
if(update_alt) {
if(use_allele_alt) {
save_log(3L, paste("allele match -", allele_name_alt), "Markers not found", SNPn_ref_new, SNPn_ref, "added to alternative reference", paste("Markers not found in", allele_name_alt), filename_dir)
print(paste(" - adding new SNPs to", allele_name_alt), quote = FALSE)
} else {
save_log(3L, paste("allele match -", allele_name_alt), "Markers not found", SNPn_ref_new, SNPn_ref, "created alternative reference", paste("Markers not found: creating", allele_name_alt), filename_dir)
print(paste(" - creating", allele_name_alt, "file"), quote = FALSE)
}
} else if (use_allele_alt) {
print(" - checking remaining alleles", quote = FALSE)
}
flush.console()
if(SNPn_ref_std > 0L | SNPn_ref_alt > 0L) {
if(SNPn_ref_std > 0L & SNPn_ref_alt > 0L) {
ref_new <- !(ref_std | ref_alt)
} else {
if(SNPn_ref_std > 0L) { ref_new <- !ref_std
} else { ref_new <- !ref_alt }
}
} else { ref_new <- !logical(length = SNPn_ref) }
allele_out_new <- list(n_SNPs = SNPn_ref_new,
n_negative_strand = if(strand_ref$minus == 0L) { 0L } else { sum(strand_min[ref_new]) },
n_flipped = sum(dataI$EFF_ALL_FREQ[ref_new] > 0.5, na.rm = iNA_FRQ_N+inv_FRQ_N>0L),
n_ambiguous = sum(( dataI$EFFECT_ALL[ref_new] == "A" & dataI$OTHER_ALL[ref_new] == "T" ) | ( dataI$EFFECT_ALL[ref_new] == "T" & dataI$OTHER_ALL[ref_new] == "A" ) | ( dataI$EFFECT_ALL[ref_new] == "C" & dataI$OTHER_ALL[ref_new] == "G" ) | ( dataI$EFFECT_ALL[ref_new] == "G" & dataI$OTHER_ALL[ref_new] == "C" ) ) )
if(allele_out_new$n_negative_strand > 0L) {
dataI[ref_new & strand_min, c("EFFECT_ALL", "OTHER_ALL", "STRAND")] <-
switch_strand(dataI[ref_new & strand_min, c("EFFECT_ALL", "OTHER_ALL", "STRAND")], strand_col = TRUE)
}
if(allele_out_new$n_flipped > 0L) {
ref_new_flip <- ref_new & dataI$EFF_ALL_FREQ > 0.5 & !iNA$FRQ & !inv$FRQ
ref_new_al2 <- dataI[ref_new_flip, "OTHER_ALL"]
dataI[ref_new_flip, "OTHER_ALL"] <- dataI[ref_new_flip, "EFFECT_ALL"]
dataI[ref_new_flip, "EFFECT_ALL"] <- ref_new_al2
dataI[ref_new_flip, "EFFECT"] <- -dataI[ref_new_flip, "EFFECT"]
dataI[ref_new_flip, "EFF_ALL_FREQ"] <- 1 - dataI[ref_new_flip, "EFF_ALL_FREQ"]
}
if(update_alt) {
if(update_as_rdata) {
update_savename <- paste0(update_savename, ".RData")
if(backup_alt & file.exists(paste(dir_references, update_savename, sep = "/"))) {
save_log(3L, "allele match ", "back-up", 0L, 1L, "saved back-up", paste0("Back-up of previous alternative reference file saved as: ", dir_references, "/", remove_extension(update_savename), "_", Sys.Date(), ".RData"), filename_dir)
print(" - - creating back-up of previous alternative-reference file", quote = FALSE)
flush.console()
file.rename(paste(dir_references, update_savename, sep = "/"), paste0(dir_references, "/", remove_extension(update_savename), "_", Sys.Date(), ".RData"))
}
print(" - - saving alternative-reference file", quote = FALSE)
flush.console()
if(use_allele_alt) {
allele_ref_alt <- rbind(allele_ref_alt, data.frame(SNP = dataI$MARKER[ref_new], CHR = dataI$CHR[ref_new], POS = as.integer(dataI$POSITION[ref_new]),
MINOR = as.factor(dataI$EFFECT_ALL[ref_new]), MAJOR = as.factor(dataI$OTHER_ALL[ref_new]), MAF = dataI$EFF_ALL_FREQ[ref_new],
SOURCE = as.factor(filename_input), DATE_ADDED = as.factor(date()), stringsAsFactors = FALSE) )
} else {
allele_ref_alt <- data.frame(SNP = dataI$MARKER[ref_new], CHR = dataI$CHR[ref_new], POS = as.integer(dataI$POSITION[ref_new]),
MINOR = as.factor(dataI$EFFECT_ALL[ref_new]), MAJOR = as.factor(dataI$OTHER_ALL[ref_new]), MAF = dataI$EFF_ALL_FREQ[ref_new],
SOURCE = as.factor(filename_input), DATE_ADDED = as.factor(date()), stringsAsFactors = FALSE)
}
save(allele_ref_alt, file = paste(dir_references, update_savename, sep = "/"))
rm(allele_ref_alt)
# In order save the correctobject name in the .RData file, it is
# necesarry to edit allele_ref_alt itself. Since allelel_ref_alt
# is often a pointer to an object outside of QC_GWAS, the object
# is duplicated in the memory (one unaltered instance outside
# QC_GWAS and one updated instance inside). This may use considerable
# amounts of memory; hence allele_ref_alt is deleted once it is no longer
# necessary.
} else {
update_savename <- paste0(update_savename, ".txt")
if(backup_alt & file.exists(paste(dir_references, update_savename, sep = "/"))) {
save_log(3L, "allele match ", "back-up", 1L, 1L, "created back-up", paste0("copy of previous alternative reference file saved as: ", dir_references, "/", remove_extension(update_savename), "_", Sys.Date(), ".RData"), filename_dir)
print(" - - creating back-up of previous alternative-reference file", quote = FALSE)
flush.console()
file.copy(paste(dir_references, update_savename, sep = "/"), paste0(dir_references, "/", remove_extension(update_savename), "_", Sys.Date(), ".txt"))
}
print(" - - saving alternative-reference file", quote = FALSE)
flush.console()
if(use_allele_alt) {
write.table(rbind(allele_ref_alt, data.frame(SNP = dataI$MARKER[ref_new], CHR = dataI$CHR[ref_new], POS = dataI$POSITION[ref_new],
MINOR = dataI$EFFECT_ALL[ref_new], MAJOR = dataI$OTHER_ALL[ref_new], MAF = dataI$EFF_ALL_FREQ[ref_new],
SOURCE = filename_input, DATE_ADDED = date(), stringsAsFactors = FALSE) ),
paste(dir_references, update_savename, sep ="/"), col.names=TRUE, row.names=FALSE, quote=FALSE, sep="\t")
} else {
write.table(data.frame(SNP = dataI$MARKER[ref_new], CHR = dataI$CHR[ref_new], POS = dataI$POSITION[ref_new],
MINOR = dataI$EFFECT_ALL[ref_new], MAJOR = dataI$OTHER_ALL[ref_new], MAF = dataI$EFF_ALL_FREQ[ref_new],
SOURCE = filename_input, DATE_ADDED = date(), stringsAsFactors = FALSE),
paste(dir_references, update_savename, sep ="/"), col.names=TRUE, row.names=FALSE, quote=FALSE, sep="\t")
}
}
allele_ref_changed <- TRUE
}
} else { allele_out_new <- list(n_SNPs = 0L, n_negative_strand = NA, n_flipped = NA, n_ambiguous = NA ) }
#Step 3d: removal of mismatching and different allele frequency entries from 3a & 3b
if(use_allele_std | use_allele_alt) {
N_mismatch <- sum(allele_out_std$n_mismatch, allele_out_alt$n_mismatch, na.rm = !(use_allele_std & use_allele_alt))
N_diffEAF <- sum(allele_out_std$n_diffEAF, allele_out_alt$n_diffEAF, na.rm = !(use_allele_std & use_allele_alt))
N_ambiguous <- sum(allele_out_std$n_ambiguous, allele_out_alt$n_ambiguous, allele_out_new$n_ambiguous, na.rm = TRUE)
N_suspect <- sum(allele_out_std$n_suspect, allele_out_alt$n_suspect, na.rm = TRUE)
if(N_suspect > 0L) save_log(3L, "allele match", "suspicious strand", N_suspect, SNPn_ref, "-", "Possible strand mismatch: ambiguous markers with minor allele frequency > 35%", filename_dir)
use_L2 <- FALSE
if(SNPn_ref_std > 0L & allele_out_std$n_mismatch > 0L) {
print(paste0(" - - mismatches with ", allele_name_std, ": ", allele_out_std$n_mismatch), quote = FALSE)
if(remove_mismatches_std) use_L2 <- TRUE
}
if(SNPn_ref_alt > 0L & allele_out_alt$n_mismatch > 0L) {
print(paste0(" - - mismatches with ", allele_name_alt, ": ", allele_out_alt$n_mismatch), quote = FALSE)
if(remove_mismatches_alt) use_L2 <- TRUE
}
if(SNPn_ref_std > 0L & allele_out_std$n_diffEAF > 0L) {
print(paste0(" - - differing allele frequencies from ", allele_name_std, ": ", allele_out_std$n_diffEAF), quote = FALSE)
if(remove_diffEAF_std) use_L2 <- TRUE
}
if(SNPn_ref_alt > 0L & allele_out_alt$n_diffEAF > 0L) {
print(paste0(" - - differing allele frequencies from ", allele_name_alt, ": ", allele_out_alt$n_diffEAF), quote = FALSE)
if(remove_diffEAF_alt) use_L2 <- TRUE
}
flush.console()
if(use_L2) {
remove_L2_list <- is.na(dataI$EFFECT_ALL)
remove_L2_N <- sum(remove_L2_list)
if(remove_L2_N == SNPn_ref) {
# Step 3 EmergencyExit: all SNPs removed
print("CRITICAL ERROR: no SNPs remaining in dataset", quote = FALSE)
save_log(3L, "allele match", "no matching SNPs", SNPn_ref, SNPn_ref, "QC check aborted", "All markers excluded due to mismatching alleles or allele-frequencies", filename_dir)
return(emergency_return(name_in = filename_input, name_out = filename, LI = logI, LN = logN))
}
dataI <- dataI[!remove_L2_list, ]
inv <- inv[!remove_L2_list, ]
iNA <- iNA[!remove_L2_list, ]
geno_list <- geno_list[!remove_L2_list]
imp_list <- imp_list[!remove_L2_list]
strand_min <- strand_min[!remove_L2_list]
rm(remove_L2_list)
} else { remove_L2_N <- 0L }
rm(FRQ_HQ_list)
if(use_allele_std) rm(ref_std)
if(use_allele_alt) rm(ref_alt)
} else {
N_mismatch <- NA
N_diffEAF <- NA
N_ambiguous <- allele_out_new$n_ambiguous
N_suspect <- NA
remove_L2_N <- 0L
}
if(SNPn_ref_new > 0L) rm(ref_new)
gc(verbose = FALSE) # memory cleaning
#### Step 4: QC of the final dataset
# Step 4a
# Determining whether sufficient data is available for the following QC tests
print("", quote = FALSE)
print("Step 4: generating QC statistics & graphs", quote = FALSE)
flush.console()
SNPn_postQC <- nrow(dataI)
SNPn_postQC_inv <- sum(rowSums(inv) > 0L)
SNPn_postQC_geno<- sum(geno_list)
SNPn_postQC_imp <- sum( imp_list)
if(remove_L2_N > 0L) {
strand_post <- list(
plus = 0L,
minus = if(strand_ref$minus == 0L) 0L else sum(strand_min),
missing = if(strand_ref$missing == 0L) 0L else sum(iNA$strand),
invalid = if(strand_ref$invalid == 0L) 0L else sum(inv$strand) )
strand_post$plus <- SNPn_postQC - (strand_post$missing + strand_post$invalid + strand_post$minus)
} else { strand_post <- strand_ref }
if(useMan_threshold <= 1) { useMan_threshold <- SNPn_postQC * useMan_threshold }
if(useMan_threshold < 10) { useMan_threshold <- 10
save_log(4L, "thresholds", "Manhattan", SNPn_postQC, SNPn_postQC, "set to 10", "User-specified threshold was < 10", filename_dir) }
# Threshold is set to 10, so it corresponds to the QC_plots internal threshold
if(useFRQ_threshold <= 1) { useFRQ_threshold <- SNPn_postQC * useFRQ_threshold }
if(useFRQ_threshold < 2) { useFRQ_threshold <- 2
save_log(4L, "thresholds", "allele frequency", SNPn_postQC, SNPn_postQC, "set to 2", "User-specified threshold was < 2", filename_dir) }
if(useHWE_threshold <= 1) { useHWE_threshold <- SNPn_postQC * useHWE_threshold }
if(useHWE_threshold < 2) { useHWE_threshold <- 2
save_log(4L, "thresholds", "HWE p-value", SNPn_postQC, SNPn_postQC, "set to 2", "User-specified threshold was < 2", filename_dir) }
if(useCal_threshold <= 1) { useCal_threshold <- SNPn_postQC * useCal_threshold }
if(useCal_threshold < 2) { useCal_threshold <- 2
save_log(4L, "thresholds", "Call rate", SNPn_postQC, SNPn_postQC, "set to 2", "User-specified threshold was < 2", filename_dir) }
if(useImp_threshold <= 1) { useImp_threshold <- SNPn_postQC * useImp_threshold }
if(useImp_threshold < 2) { useImp_threshold <- 2
save_log(4L, "thresholds", "Imputation quality", SNPn_postQC, SNPn_postQC, "set to 2", "User-specified threshold was < 2", filename_dir) }
useMan <- SNPn_postQC - sum(iNA$chr | inv$chr | iNA$pos | inv$pos) >= useMan_threshold
useFRQ <- SNPn_postQC - sum(iNA$FRQ | inv$FRQ) >= useFRQ_threshold
if(ignore_impstatus) {
useHWE <- SNPn_postQC - sum(iNA$HWE | inv$HWE ) >= useHWE_threshold
useCal <- SNPn_postQC - sum(iNA$cal | inv$cal ) >= useCal_threshold
useImp <- SNPn_postQC - sum(iNA$impQ| inv$impQ) >= useImp_threshold
} else {
if(SNPn_postQC_geno == 0L) {
useHWE <- FALSE
useCal <- FALSE
} else {
useHWE <- sum(geno_list & !(iNA$HWE | inv$HWE) ) >= useHWE_threshold
useCal <- sum(geno_list & !(iNA$cal | inv$cal) ) >= useCal_threshold
}
if(SNPn_postQC_imp == 0L) {
useImp <- FALSE
} else {
useImp <- sum( imp_list & !(iNA$impQ|inv$impQ) ) >= useImp_threshold
} }
if(!useFRQ) {
QQfilter_FRQ <- NULL
HQfilter_FRQ <- NULL }
if(!useHWE) {
QQfilter_HWE <- NULL
HQfilter_HWE <- NULL }
if(!useCal) {
QQfilter_cal <- NULL
HQfilter_cal <- NULL }
if(!useImp) {
QQfilter_imp <- NULL
HQfilter_imp <- NULL }
HQ_list <- HQ_filter(data = dataI, ignore_impstatus = ignore_impstatus,
FRQ_val = HQfilter_FRQ, HWE_val = HQfilter_HWE,
cal_val = HQfilter_cal, imp_val = HQfilter_imp,
FRQ_NA = NAfilter_FRQ, HWE_NA = NAfilter_HWE, cal_NA = NAfilter_cal, imp_NA = NAfilter_imp)
SNPn_postQC_HQ <- sum(HQ_list)
useHQ <- SNPn_postQC_HQ > 9L
if(!useHQ) {
save_log(4L, "QC statistics", "insufficient high-quality markers", SNPn_postQC, SNPn_postQC, "-", "Less than 10 high-quality SNPs: cannot apply standard filter", filename_dir)
print(" - - warning: insufficient high-quality SNPs to apply HQ filter", quote = FALSE)
}
# Step 4b: histograms
if(plot_histograms) {
print(" - creating histograms", quote = FALSE)
flush.console()
png(paste(filename_dir, "graph_histogram.png", sep = "_"), width = 1440, height = 960, res = 108)
par(mfrow = c(2, 3 ))
hist(dataI$EFFECT,
breaks = 30, freq = FALSE, col = "blue", plot = TRUE,
main = "Effect size", xlab = "Effect size", cex.main = 1.5)
hist(dataI$STDERR,
breaks = 30, freq = FALSE, col = "yellow", plot = TRUE,
main = "Standard error", xlab = "Standard error", cex.main = 1.5)
if(useFRQ) {
hist(dataI$EFF_ALL_FREQ, breaks = 20, freq = FALSE, col = "green3", xlim = c(0,1), plot = TRUE,
main = "Allele frequency", xlab = "Allele frequency", cex.main = 1.5)
} else {
plot(0.5, 0.5, xlim = c(0,1), ylim = c(0,1), col = "white", cex.main = 1.5,
main = "Allele frequency", xlab = "Allele frequency", ylab = "Density")
text(0.5, 0.5, "Insufficient data", cex = 1.5)
}
if(useHWE) {
if(ignore_impstatus) {
hist(dataI$HWE_PVAL, breaks = 20, freq = FALSE, col = "yellow", xlim = c(0,1), plot = TRUE,
main = "HWE p-value", xlab = "HWE p-value", cex.main = 1.5)
mtext("Genotyped & imputed SNPs", cex = 0.8, line = 0.5, col = "red")
} else {
hist(dataI$HWE_PVAL[geno_list], breaks = 20, freq = FALSE, col = "yellow", xlim = c(0,1), plot = TRUE,
main = "HWE-p distribution", xlab = "HWE p-value", cex.main = 1.5)
mtext("Genotyped SNPs only", cex = 0.8, line = 0.5, col = "blue") }
} else {
plot(0.5, 0.5 , xlim = c(0,1), ylim = c(0,1), col = "white", cex.main = 1.5,
main = "HWE-p distribution", xlab = "HWE p-value", ylab = "Density")
text(0.5, 0.5, "Insufficient data", cex = 1.5) }
if(useCal) {
if(ignore_impstatus) {
hist(dataI$CALLRATE, freq = FALSE, col = "red", xlim = c(0.75,1), plot = TRUE,
breaks = c(0, 0.8, 0.85, 0.9, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99, 1),
main = "Call rate", xlab = "Call rate", cex.main = 1.5)
mtext("Genotyped & imputed SNPs", cex = 0.8, line = 0.5, col = "red")
if(min(dataI$CALLRATE, na.rm = iNA_cal_N + inv_cal_N > 0) < 0.75) {
title(sub = paste("Note:", sum(dataI$CALLRATE < 0.75, na.rm = iNA_cal_N + inv_cal_N > 0), "outliers excluded from graph"), col.sub = "blue", cex.sub = 1.3) }
} else {
hist(dataI$CALLRATE[geno_list], freq = FALSE, col = "red", xlim = c(0.75,1), plot = TRUE,
breaks = c(0, 0.8, 0.85, 0.9, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99, 1),
main = "Call rate", xlab = "Call rate", cex.main = 1.5)
mtext("Genotyped SNPs only", cex = 0.8, line = 0.5, col = "blue")
if(min(dataI$CALLRATE[geno_list], na.rm = iNA_cal_g + inv_cal_g > 0) < 0.75) {
title(sub = paste("Note:", sum(dataI$CALLRATE[geno_list] < 0.75, na.rm = iNA_cal_g + inv_cal_g > 0), "outliers excluded from graph"), col.sub = "blue", cex.sub = 1.3) } }
} else {
plot(0.5, 0.5, xlim = c(0,1), ylim = c(0,1), col = "white", cex.main = 1.5,
main = "Call rate", xlab = "Call rate", ylab = "Density")
text(0.5, 0.5, "Insufficient data", cex = 1.5) }
if(useImp) {
if(ignore_impstatus) {
hist(dataI$IMP_QUALITY, freq = FALSE, col = "blue", xlim = c(floor(min(0, dataI$IMP_QUALITY, na.rm = iNA_impQ_N+inv_impQ_N>0L) * 10)/10, ceiling(max(dataI$IMP_QUALITY, na.rm = iNA_impQ_N+inv_impQ_N>0L) * 10)/10), plot = TRUE,
main = "Imputation Quality", xlab = "Imputation quality", cex.main = 1.5)
mtext("Genotyped & imputed SNPs", cex = 0.8, line = 0.5, col = "red")
} else {
hist(dataI$IMP_QUALITY[imp_list], freq = FALSE, col = "blue", xlim = c(floor(min(0, dataI$IMP_QUALITY[imp_list], na.rm = iNA_impQ_i+inv_impQ_i>0L) * 10)/10, ceiling(max(dataI$IMP_QUALITY[imp_list], na.rm = iNA_impQ_i+inv_impQ_i>0L) * 10)/10), plot = TRUE,
main = "Imputation Quality", xlab = "Imputation quality", cex.main = 1.5)
mtext("Imputed SNPs only", cex = 0.8, line = 0.5, col = "blue") }
} else {
plot(0.5, 0.5, xlim = c(0,1), ylim = c(0,1), col = "white", cex.main = 1.5,
main = "Imputation Quality", xlab = "Imputation quality", ylab = "Density")
text(0.5, 0.5, "Insufficient data", cex = 1.5) }
title(sub = filename, cex.sub = 1.3)
dev.off()
gc(verbose = FALSE)
}
outcome_ES <- list(HQ_SNPs = SNPn_postQC_HQ, p = NA, return_ES = return_HQ_effectsizes,
HQ_effectsizes = if(return_HQ_effectsizes) {
if(SNPn_postQC_HQ < 1000L) { c(dataI$EFFECT[HQ_list], rep(NA, 1000L - SNPn_postQC_HQ))
} else { sort((dataI$EFFECT[HQ_list])[c(1:1000)*(SNPn_postQC_HQ %/% 1000)]) } } else { NULL } )
# Step 4c: checking p-values
print(" - checking p-values", quote = FALSE)
outcome_P <- check_P(dataI, threshold_r = threshold_p_correlation, plot_correlation = make_plots, plot_if_threshold = only_plot_if_threshold,
save_name = filename, save_dir = dir_output, dataN = SNPn_postQC, use_log = TRUE, HQ_subset = HQ_list)
if(calculate_missing_p) {
calc_p_newN <- sum(iNA$p | inv$p)
if(calc_p_newN > 0L) {
print(" - calculating missing/invalid p-values", quote = FALSE)
dataI$PVALUE <- ifelse(iNA$p | inv$p, pchisq((dataI$EFFECT/dataI$STDERR)^2, 1, lower.tail=FALSE), dataI$PVALUE)
save_log(4L, "p-value correction", "missing/invalid p-values", calc_p_newN, SNPn_postQC, "p-values recalculated", "Missing/invalid p-values replaced with recalculated p-values.", filename_dir)
low_p_newN <- sum(dataI$PVALUE < 1e-300 & (iNA$p | inv$p) )
if(low_p_newN > 0L) {
dataI$PVALUE[dataI$PVALUE < 1e-300 & (iNA$p | inv$p) ] <- 1e-300
save_log(4L, "p-value correction", "extreme p-values", low_p_newN, SNPn_postQC, "p-value set to 1e-300", "Recalculation resulted in extreme p-value: it has been set to 1e-300", filename_dir)
}
} else { low_p_newN <- 0L }
} else {
calc_p_newN <- NA
low_p_newN <- 0L
}
# Step 4d: creating QQ & Manhattan plots
plot_output <- QC_plots(dataset = dataI, plot_QQ = plot_QQ, plot_Man = useMan & plot_Manhattan,
FRQfilter_values = QQfilter_FRQ, FRQfilter_NA = NAfilter_FRQ,
HWEfilter_values = QQfilter_HWE, HWEfilter_NA = NAfilter_HWE,
calfilter_values = QQfilter_cal, calfilter_NA = NAfilter_cal,
impfilter_values = QQfilter_imp, impfilter_NA = NAfilter_imp, impfilter_min = minimal_impQ_value,
manfilter_FRQ = HQfilter_FRQ, manfilter_HWE = HQfilter_HWE, manfilter_cal = HQfilter_cal, manfilter_imp = HQfilter_imp,
ignore_impstatus = ignore_impstatus,
plot_QQ_bands = plot_QQ_bands, plot_cutoff_p = plot_cutoff_p, plot_names = FALSE,
save_name = filename, save_dir = dir_output, use_log = TRUE)
gc(verbose = FALSE) # memory cleaning
if(plot_output$lambda[1] > 1.1 & !is.na(plot_output$lambda[1])) {
save_log(phaseL = 5L, checkL = "QC statistics", typeL = "high lambda", SNPL = sum(!is.na(dataI$PVALUE)), allSNPs = SNPn_postQC, actionL = "-", noteL = paste("Lambda =", plot_output$lambda[1]), fileL = filename_dir)
print(paste0(" - - warning: high lambda value (", plot_output$lambda[1], ")"), quote = FALSE) }
if(plot_Manhattan & !useMan) {
save_log(phaseL = 5L, checkL = "Manhattan plot", typeL = "insuf. data", SNPL = sum(iNA$chr | inv$chr | iNA$pos | inv$pos), allSNPs = SNPn_postQC, actionL = "-", noteL = "Insufficient known chromosome/positions to create Manhattan plot", fileL = filename_dir)
print(" - - warning: insufficient chromosome/position values to generate Manhattan plot", quote = FALSE)
}
#### STEP 5: ANALYSIS complete: saving statistics data & post-QC results
print("", quote = FALSE)
print("Step 5: saving statistics in log file", quote = FALSE)
flush.console()
# Calculating the remaining QC statistics
useN <- if(SNPn_postQC > iNA_N_N + inv_N_N) TRUE else !all(iNA$N | inv$N)
stat_N_max <- if(useN) max(dataI$N_TOTAL, na.rm = iNA_N_N+inv_N_N>0L ) else NA
stat_N_HQ <- if(useN & useHQ) {
if(any(!((iNA$N | inv$N)[HQ_list]))) { max(dataI$N_TOTAL[HQ_list], na.rm = iNA_N_N+inv_N_N>0 )
} else { NA } } else { NA }
if(useFRQ & !is.na(stat_N_max)) {
stat_Visscher <- median( 1 / (2 * dataI$EFF_ALL_FREQ * (1-dataI$EFF_ALL_FREQ) * (dataI$STDERR)^2 ), na.rm = iNA_FRQ_N+inv_FRQ_N>0L) / stat_N_HQ
if(!is.na(stat_N_HQ) & sum(!iNA$FRQ & !inv$FRQ & HQ_list) > 9L) {
stat_Vissc_HQ <- median( 1 / (2 * dataI$EFF_ALL_FREQ[HQ_list] * (1-dataI$EFF_ALL_FREQ[HQ_list]) * (dataI$STDERR[HQ_list])^2 ), na.rm = iNA_FRQ_N+inv_FRQ_N>0L) / stat_N_max
} else {
stat_Vissc_HQ <- NA
if(is.na(stat_N_HQ)) save_log(4L, "Visschers stat.", "Insufficient data", SNPn_postQC, SNPn_postQC, "-", "Insufficient sample sizes to calculate Visschers statistic: too few high-quality markers", filename_dir)
if(sum(!iNA$FRQ & !inv$FRQ & HQ_list) <= 9L) save_log(4L, "Visschers stat.", "Insufficient data", SNPn_postQC, SNPn_postQC, "-", "Insufficient allele frequencies to calculate Visschers statistic: too few high-quality markers", filename_dir)
}
} else {
if(!useFRQ) save_log(4L, "Visschers stat.", "Insufficient data", SNPn_postQC, SNPn_postQC, "-", "Insufficient allele frequencies to calculate Visschers statistic", filename_dir)
if(is.na(stat_N_max)) save_log(4L, "Visschers stat", "Insufficient data", SNPn_postQC, SNPn_postQC, "-", "Insufficient sample sizes to calculate Visschers statistic", filename_dir)
stat_Visscher <- NA
stat_Vissc_HQ <- NA
}
stat_SE <- median(dataI$STDERR)
stat_SE_HQ <- if(useHQ) median(dataI$STDERR[HQ_list]) else NA
stat_skewness <- calc_skewness(input = dataI$EFFECT)
stat_skewness_HQ <- if(useHQ) calc_skewness(input = dataI$EFFECT[HQ_list]) else NA
stat_kurtosis <- calc_kurtosis(input = dataI$EFFECT)
stat_kurtosis_HQ <- if(useHQ) calc_kurtosis(input = dataI$EFFECT[HQ_list]) else NA
# Reformatting the old log entries log file
log_old <- read.table(paste0(filename_dir, "_log.txt"), header = FALSE,
sep = "\t", comment.char = "", stringsAsFactors = FALSE)
write.table(c(
"*********************************",
"",
"\t\tQC LOG FILE",
"",
"*********************************",
""), paste0(filename_dir, "_log.txt"), append = FALSE,
quote = FALSE, row.names = FALSE, col.names = FALSE)
SFL <- spreadsheet_friendly_log
logCon <- file(description = paste0(filename_dir, "_log.txt"),
open = "a")
if(SFL) {
write.table(data.frame(
v1 = c("Input File", "output File(s)", "QC Start Time", "QC End time", "Script version"),
v2 = "", v3 = c(filename_input, filename, start_time, date(), "1.0-9"),
stringsAsFactors = FALSE),
logCon, quote = FALSE,
sep = "\t", row.names = FALSE, col.names = FALSE)
} else {
write.table(format(
data.frame(
v1 = c("Input File", "output File(s)", "QC Start Time", "QC End time", "Script version"),
v2 = "\t: ", v3 = c(filename_input, filename, start_time, date(), "1.0-9"),
stringsAsFactors = FALSE), justify = "left"),
logCon, quote = FALSE,
sep = "", row.names = FALSE, col.names = FALSE)
}
write.table(c("", "",
"****************************************",
"\t1. Log entries generated during QC",
"****************************************",
""), logCon, quote = FALSE,
row.names = FALSE, col.names = FALSE)
write.table(if(SFL) log_old else format(log_old, justify = "left"),
logCon, quote = FALSE,
sep = if(SFL) "\t" else " ", row.names = FALSE, col.names = FALSE)
rm(log_old)
write.table(c("", "",
"*******************************************",
"\t2. Summary statistics of removed SNPs",
"*******************************************",
""), logCon, quote = FALSE,
row.names = FALSE, col.names = FALSE)
del_table <- data.frame(
# Statistics of remove 1: monomorphic, Y- & M-chr SNPs
name1 = c("SNPs in uploaded data", "Removed", "> Monomorphic", ">> missing non-effect allele", ">> invalid non-effect allele", ">> allele frequency = 0", ">> identical alleles*", if(remove_X) "> X chromosome" else ">", if(remove_Y) "> Y chromosome" else ">", if(remove_XY) "> XY chromosome" else ">", if(remove_M) "> M chromosome" else ">", "", "", ""),
N1 = c(SNPn_input, remove_L0_N, monomorp_N, iNA_al2_N, inv_al2_N, low_FRQ_N, same_al_N, chr_X_N, chr_Y_N, chr_XY_N, chr_M_N, NA, NA, NA),
perc1 = numeric(length = 14),
em4 = character(length = 14),
# Statistics of remove 2: missing & invalid crucial variables
name2 = c("SNPs in initial QC", "Removed", "> Marker name", ">> missing", ">> duplicate", "> Effect allele", ">> missing", ">> invalid", "> Effect size (missing)", ">> low imputation quality", "> Standard error", ">> missing", ">>> low imputation quality", ">> invalid"),
N2 = c(SNPn_preQC, remove_L1_N, iNA_mar_N + SNPn_preQC_dupli, iNA_mar_N, SNPn_preQC_dupli, iNA_al1_N + inv_al1_N, iNA_al1_N, inv_al1_N, iNA_eff_N, iNA_eff_poor, iNA_se_N + inv_se_N, iNA_se_N, iNA_se_poor, inv_se_N),
perc2 = numeric(length = 14),
em8 = character(length = 14),
# Statistics of remove 3: allele mismatches & FINAL statistics
name3 = c(character(length = 10), "Final dataset", "SNPs", "> genotyped", "> imputed"),
N3 = c(rep(NA, 11), SNPn_postQC, SNPn_postQC_geno, SNPn_postQC_imp),
perc3 = numeric(length = 14),
stringsAsFactors = FALSE)
del_table$perc1 <- round(100 * del_table$N1 / SNPn_input, digits = 2)
del_table$perc2 <- round(100 * del_table$N2 / SNPn_preQC, digits = 2)
del_table$perc3 <- round(100 * del_table$N3 / SNPn_postQC, digits = 2)
if(remove_mismatches_std | remove_mismatches_alt | remove_diffEAF_std | remove_diffEAF_alt) {
del_table$name3[1:2] <- c("SNPs in allele check", "Removed")
del_table$N3[1] <- SNPn_ref
del_table$N3[2] <- remove_L2_N
if(remove_mismatches_std & use_allele_std){
del_table$name3[3] <- paste("> mismatch to", allele_name_std)
del_table$N3[3] <- allele_out_std$n_mismatch
} else { del_table$name3[3] <- ">" }
if(remove_mismatches_alt & use_allele_alt){
del_table$name3[4] <- paste("> mismatch to", allele_name_alt)
del_table$N3[4] <- allele_out_alt$n_mismatch
} else { del_table$name3[4] <- ">" }
if(remove_diffEAF_std & use_allele_std){
del_table$name3[5] <- paste("> deviating FRQ from", allele_name_std)
del_table$N3[5] <- allele_out_std$n_diffEAF
} else { del_table$name3[5] <- ">" }
if(remove_diffEAF_alt & use_allele_alt){
del_table$name3[6] <- paste("> deviating FRQ from", allele_name_alt)
del_table$N3[6] <- allele_out_alt$n_diffEAF
} else { del_table$name3[6] <- ">" }
del_table$perc3[1:6] <- round(100 * del_table$N3[1:6] / SNPn_ref, digits = 2)
}
# adding the header line
del_table <- rbind(character(length = 11), del_table)
del_table[1, ] <- if(remove_mismatches_std | remove_mismatches_alt | remove_diffEAF_std | remove_diffEAF_alt) {
c("", "N", "%", "", "", "N", "%", "", "", "N", "%")
} else { c("", "N", "%", "", "", "N", "%", "", "", "", "") }
del_table[12, c("N3", "perc3")] <- c("N", "%")
# Changing NA's to empty cells
del_table$N1 <- ifelse(is.na(del_table$N1), "", del_table$N1)
del_table$perc1 <- ifelse(is.na(del_table$perc1), "", del_table$perc1)
del_table$N3 <- ifelse(is.na(del_table$N3), "", del_table$N3)
del_table$perc3 <- ifelse(is.na(del_table$perc3), "", del_table$perc3)
write.table(if(SFL) del_table else format(del_table, justify = "left"),
logCon, quote = FALSE,
sep = if(SFL) "\t" else " ", row.names = FALSE, col.names = FALSE)
rm(del_table)
write.table(c("* Not including SNPs with allele frequency = 0",
" NB: monomorphic & Y- & M-chromosome SNPs are removed before the analysis starts.",
" The pre-QC values in the tables below refer to the dataset after these SNPs have been",
" removed, but before further exclusions have taken place.",
"", "",
"********************************************************************",
"\t3. Summary statistics before and after quality check procedure",
"********************************************************************",
""),
logCon, quote = FALSE, row.names = FALSE, col.names = FALSE)
stat_table <- data.frame(variable = character(18),
NA_pre = character(18), inv_pre = character(18), unus_pre = character(18),
NA_post = character(18), inv_post = character(18), unus_post = character(18),
Qmin = character(18), Q25 = character(18), Qmean = character(18), Qmedian = character(18), Q75 = character(18), Qmax = character(18),
stringsAsFactors = FALSE)
stat_table[1, 2] <- "Pre-QC"
stat_table[1, 5] <- "Post-QC"
stat_table[2, ] <- c("Statistics", "missing", "invalid", "%unusable", "missing", "invalid", "%unusable", "min", "25%", "mean", "median", "75%", "max")
stat_save <- function(stat_name, stat_col, old_N, old_NA, old_inv,
new_N = length(stat_col), new_NA = sum(is.na(stat_col)) - new_inv, new_inv,
no_quantiles = FALSE, round_min = FALSE) {
if(no_quantiles | new_NA + new_inv == new_N) {
return(c(stat_name, old_NA, old_inv, round(100*(old_NA + old_inv)/old_N, digits = 2), new_NA, new_inv, round(100*(new_NA + new_inv)/new_N, digits = 2), character(6) ))
} else {
quant <- quantile(stat_col, na.rm = new_NA + new_inv > 0L, names = FALSE)
return(c(stat_name, old_NA, old_inv, round(100*(old_NA + old_inv)/old_N, digits = 2), new_NA, new_inv, round(100*(new_NA + new_inv)/new_N, digits = 2), if(round_min) round(quant[1], digits = 2) else signif(quant[1], digits = 2), round(c(mean(stat_col, na.rm = new_NA + new_inv > 0L), quant)[c(3,1,4,5,6)], digits = 4)))
}
}
stat_select_save <- function(stat_name, select_name,
stat_col, NA_col = is.na(stat_col) & !inv_col, inv_col, select_col,
old_N, old_NA, old_inv,
new_N = sum(select_col), new_NA = sum(select_col & NA_col), new_inv = sum(select_col & inv_col),
round_min = FALSE) {
if(old_N == 0L) {
return(c(paste(stat_name, "-", select_name), "-", "-", "-", "-", "-", "-", character(6)))
} else {
if(new_N == 0L) {
return(c(paste(stat_name, "-", select_name), old_NA, old_inv, round(100*(old_NA + old_inv)/old_N, digits = 2), "-", "-", "-", character(6)))
} else {
new_inv <- sum(select_col & inv_col)
new_NA <- sum(select_col & NA_col)
if(new_NA + new_inv == new_N) {
return(c(paste(stat_name, "-", select_name), old_NA, old_inv, round(100*(old_NA + old_inv)/old_N, digits = 2), new_NA, new_inv, round(100*(new_NA + new_inv)/new_N, digits = 2), character(6)))
} else {
quant <- quantile(stat_col[select_col], na.rm = new_NA + new_inv > 0L, names = FALSE)
return(c(paste(stat_name, "-", select_name), old_NA, old_inv, round(100*(old_NA + old_inv)/old_N, digits = 2), new_NA, new_inv, round(100*(new_NA + new_inv)/new_N, digits = 2), if(round_min) round(quant[1], digits = 2) else signif(quant[1], digits = 2), round(c(mean(stat_col[select_col], na.rm = new_NA + new_inv > 0L), quant)[c(3,1,4,5,6)], digits = 4)))
} } }
}
stat_table[ 3, ] <- stat_save(stat_name = "effect size", stat_col = dataI$EFFECT, old_N = SNPn_preQC, old_NA = iNA_eff_N, old_inv = 0, new_N = SNPn_postQC, new_NA = 0, new_inv = 0)
stat_table[ 4, ] <- stat_save(stat_name = "SE", stat_col = dataI$STDERR, old_N = SNPn_preQC, old_NA = iNA_se_N, old_inv = inv_se_N, new_N = SNPn_postQC, new_NA = 0, new_inv = 0)
stat_table[ 5, ] <- stat_save(stat_name = "p-value", stat_col = dataI$PVALUE, old_N = SNPn_preQC, old_NA = iNA_p_N, old_inv = inv_p_N, new_N = SNPn_postQC, new_NA = sum(iNA$p), new_inv = sum(inv$p))
stat_table[ 6, ] <- stat_save(stat_name = "allele frequency", stat_col = dataI$EFF_ALL_FREQ, old_N = SNPn_preQC, old_NA = iNA_FRQ_N, old_inv = inv_FRQ_N, new_N = SNPn_postQC, new_NA = sum(iNA$FRQ), new_inv = sum(inv$FRQ))
stat_table[ 7, ] <- stat_save(stat_name = "HWE p-value", stat_col = dataI$HWE_PVAL, old_N = SNPn_preQC, old_NA = iNA_HWE_N, old_inv = inv_HWE_N, new_N = SNPn_postQC, new_NA = sum(iNA$HWE), new_inv = sum(inv$HWE))
stat_table[ 8, ] <- stat_select_save(stat_name = "HWE p-value", select_name = "genotyped", stat_col = dataI$HWE_PVAL, NA_col = iNA$HWE, inv_col = inv$HWE,
select_col = geno_list, old_N = SNPn_preQC_geno, old_NA = iNA_HWE_g, old_inv = inv_HWE_g, new_N = SNPn_postQC_geno)
stat_table[ 9, ] <- stat_select_save(stat_name = "HWE p-value", select_name = "imputed", stat_col = dataI$HWE_PVAL, NA_col = iNA$HWE, inv_col = inv$HWE,
select_col = imp_list, old_N = SNPn_preQC_imp, old_NA = iNA_HWE_i, old_inv = inv_HWE_i, new_N = SNPn_postQC_imp)
stat_table[10, ] <- stat_save(stat_name = "Call rate", stat_col = dataI$CALLRATE, old_N = SNPn_preQC, old_NA = iNA_cal_N, old_inv = inv_cal_N, new_N = SNPn_postQC, new_NA = sum(iNA$cal), new_inv = sum(inv$cal))
stat_table[11, ] <- stat_select_save(stat_name = "Call rate", select_name = "genotyped", stat_col = dataI$CALLRATE, NA_col = iNA$cal, inv_col = inv$cal,
select_col = geno_list, old_N = SNPn_preQC_geno, old_NA = iNA_cal_g, old_inv = inv_cal_g, new_N = SNPn_postQC_geno)
stat_table[12, ] <- stat_select_save(stat_name = "Call rate", select_name = "imputed", stat_col = dataI$CALLRATE, NA_col = iNA$cal, inv_col = inv$cal,
select_col = imp_list, old_N = SNPn_preQC_imp, old_NA = iNA_cal_i, old_inv = inv_cal_i, new_N = SNPn_postQC_imp)
stat_table[13, ] <- stat_save(stat_name = "Sample size", stat_col = dataI$N_TOTAL, old_N = SNPn_preQC, old_NA = iNA_N_N, old_inv = inv_N_N, new_N = SNPn_postQC, new_NA = sum(iNA$N), new_inv = sum(inv$N), round_min = TRUE)
stat_table[14, ] <- stat_select_save(stat_name = "Sample size", select_name = "genotyped", stat_col = dataI$N_TOTAL, NA_col = iNA$N, inv_col = inv$N,
select_col = geno_list, old_N = SNPn_preQC_geno, old_NA = iNA_N_g, old_inv = inv_N_g, new_N = SNPn_postQC_geno, round_min = TRUE)
stat_table[15, ] <- stat_select_save(stat_name = "Sample size", select_name = "imputed", stat_col = dataI$N_TOTAL, NA_col = iNA$N, inv_col = inv$N,
select_col = imp_list, old_N = SNPn_preQC_imp, old_NA = iNA_N_i, old_inv = inv_N_i, new_N = SNPn_postQC_imp, round_min = TRUE)
stat_table[16, ] <- stat_save(stat_name = "Imputation quality", stat_col = dataI$IMP_QUALITY, old_N = SNPn_preQC, old_NA = iNA_impQ_N, old_inv = inv_impQ_N, new_N = SNPn_postQC, new_NA = sum(iNA$impQ), new_inv = sum(inv$impQ))
stat_table[17, ] <- stat_select_save(stat_name = "Imp. quality", select_name = "genotyped", stat_col = dataI$IMP_QUALITY, NA_col = iNA$impQ, inv_col = inv$impQ,
select_col = geno_list, old_N = SNPn_preQC_geno, old_NA = iNA_impQ_g, old_inv = inv_impQ_g, new_N = SNPn_postQC_geno)
stat_table[18, ] <- stat_select_save(stat_name = "Imp. quality", select_name = "imputed", stat_col = dataI$IMP_QUALITY, NA_col = iNA$impQ, inv_col = inv$impQ,
select_col = imp_list, old_N = SNPn_preQC_imp, old_NA = iNA_impQ_i, old_inv = inv_impQ_i, new_N = SNPn_postQC_imp)
write.table(if(SFL) stat_table else format(stat_table, justify = "left"),
logCon, quote = FALSE,
sep = if(SFL) "\t" else " ", row.names = FALSE, col.names = FALSE)
rm(stat_table)
write.table(c("", "",
"************************",
"\t4. Allele matching",
"************************",
""),
logCon, quote = FALSE, row.names = FALSE, col.names = FALSE)
all_table0 <- data.frame(
names = c("SNPs", "> negative strand", "> strand switch (SS)", ">> double SS",
">> MISMATCH", ">>> mismatch & double SS",
"> flipped", "> ambiguous", ">> MAF between 35 & 65%", "> deviating allele frequency"),
N = integer(length = 10), perc = numeric(length = 10), stringsAsFactors = FALSE)
if(use_allele_std & SNPn_ref_std > 0L) {
all_table1 <- all_table0
all_table1$N <- c(allele_out_std$n_SNPs, if(strand_ref$minus == 0L) 0L else allele_out_std$n_negative_strand, allele_out_std$n_strandswitch, if(strand_ref$minus == 0L) 0L else allele_out_std$n_negative_switch,
allele_out_std$n_mismatch, if(strand_ref$minus == 0L) 0L else allele_out_std$n_negative_mismatch,
allele_out_std$n_flipped, allele_out_std$n_ambiguous, allele_out_std$n_suspect, allele_out_std$n_diffEAF)
all_table1$perc <- round(100 * all_table1$N/SNPn_ref, digits = 2)
all_table0$N <- all_table1$N
all_table1$N <- ifelse(is.na(all_table1$N ), "-", as.character(all_table1$N ))
all_table1$perc <- ifelse(is.na(all_table1$perc), "-", as.character(all_table1$perc))
} else { all_table1 <- cbind(all_table0$names, matrix(data = "-", nrow = 10, ncol = 2)) }
if(use_allele_alt & SNPn_ref_alt > 0L) {
all_table2 <- all_table0
all_table2$N <- c(allele_out_alt$n_SNPs, if(strand_ref$minus == 0L) 0L else allele_out_alt$n_negative_strand, allele_out_alt$n_strandswitch, if(strand_ref$minus == 0L) 0L else allele_out_alt$n_negative_switch,
allele_out_alt$n_mismatch, if(strand_ref$minus == 0L) 0L else allele_out_alt$n_negative_mismatch,
allele_out_alt$n_flipped, allele_out_alt$n_ambiguous, allele_out_alt$n_suspect, allele_out_alt$n_diffEAF)
all_table2$perc <- round(100 * all_table2$N/SNPn_ref, digits = 2)
all_table0$N <- all_table0$N + all_table2$N
all_table2$N <- ifelse(is.na(all_table2$N ), "-", as.character(all_table2$N ))
all_table2$perc <- ifelse(is.na(all_table2$perc), "-", as.character(all_table2$perc))
} else { all_table2 <- cbind(all_table0$names, matrix(data = "-", nrow = 10, ncol = 2)) }
if(SNPn_ref_new > 0L) {
all_table3 <- all_table0
all_table3$N <- c(SNPn_ref_new, if(strand_ref$minus == 0L) 0L else allele_out_new$n_negative_strand, NA, NA,
NA, NA, allele_out_new$n_flipped, allele_out_new$n_ambiguous, NA, NA)
all_table3$perc <- round(100 * all_table3$N/SNPn_ref, digits = 2)
all_table0$N[c(1,2,7,8)] <- all_table0$N[c(1,2,7,8)] + all_table3$N[c(1,2,7,8)]
all_table3$N <- ifelse(is.na(all_table3$N ), "-", as.character(all_table3$N ))
all_table3$perc <- ifelse(is.na(all_table3$perc), "-", as.character(all_table3$perc))
} else { all_table3 <- cbind(all_table0$names, c("0", rep("-", 9)), c("0", rep("-", 9))) }
all_table0$perc <- round(100 * all_table0$N/SNPn_ref, digits = 2)
all_table0$N <- ifelse(is.na(all_table0$N ), "-", as.character(all_table0$N ))
all_table0$perc <- ifelse(is.na(all_table0$perc), "-", as.character(all_table0$perc))
# Combining tables & adding header line
write.table(if(SFL) {
rbind(c("Combined", "N", "%", "", allele_name_std, "N", "%", "", allele_name_alt, "N", "%", "", if(update_alt) paste("Updated", allele_name_alt) else "Other SNPs", "N", "%"),
cbind(all_table0, character(length = 10), all_table1, character(length = 10), all_table2, character(length = 10), all_table3, stringsAsFactors = F))
} else {
format(rbind(c("Combined", "N", "%", "", allele_name_std, "N", "%", "", allele_name_alt, "N", "%", "", if(update_alt) paste("Updated", allele_name_alt) else "Other SNPs", "N", "%"),
cbind(all_table0, character(length = 10), all_table1, character(length = 10), all_table2, character(length = 10), all_table3, stringsAsFactors = F)), justify = "left") },
logCon, quote = FALSE, sep = if(SFL) "\t" else " ", row.names = FALSE, col.names = FALSE)
rm(all_table0, all_table1, all_table2, all_table3)
write.table(c("", "",
"**********************",
"\t5. QC statistics",
"**********************",
""),
logCon, quote = FALSE, row.names = FALSE, col.names = FALSE)
stat_table <- data.frame(allname = c(allele_name_std, "> ambiguous SNPs", "> non-ambiguous SNPs", allele_name_alt, "> ambiguous SNPs", "> non-ambiguous SNPs"),
allcor = "-",
em1 = character(length = 6),
name1 = c("r", "Lambda", "> genotyped", "> imputed", "", ""),
stat1 = c(round(outcome_P, digits = 3), round(plot_output$lambda, digits = 3), "", ""),
em2 = character(length = 6),
name2 = c("SE median", "Skewness", "Kurtosis", "Visscher's stat.", "* high-quality SNPs", " only"),
stat2all = c(signif(stat_SE, digits = 3), round(c(stat_skewness, stat_kurtosis, stat_Visscher), digits = 3), "", ""),
stat2_HQ = c(signif(stat_SE_HQ, digits = 3), round(c(stat_skewness_HQ, stat_kurtosis_HQ, stat_Vissc_HQ), digits = 3), "", ""),
em3 = character(length = 6),
Nname = c("Negative strand SNPs", "High-quality SNPs", "Corrected p-values", "Extreme p-values", "", ""),
NN = c(strand_post$minus, SNPn_postQC_HQ, calc_p_newN, low_p_newN, NA, NA),
Nperc = numeric(length = 6), stringsAsFactors = FALSE)
stat_table$Nperc <- round(100*stat_table$NN/SNPn_postQC, digits = 2)
stat_table[5:6, c("NN", "Nperc")] <- ""
if(!calculate_missing_p) { stat_table[3:4, c("NN", "Nperc")] <- "-" }
if(use_allele_std | use_allele_alt) {
stat_table$allcor <- as.character(round(c(allele_out_std$FRQ_cor, allele_out_std$FRQ_cor_ambiguous, allele_out_std$FRQ_cor_nonambi, allele_out_alt$FRQ_cor, allele_out_alt$FRQ_cor_ambiguous, allele_out_alt$FRQ_cor_nonambi), digits = 3))
if(!check_ambiguous_alleles) { stat_table$allcor[c(2,3,5,6)] <- "-" }
if(!use_allele_alt) { stat_table$allcor[c(4,5,6)] <- "-" }
}
write.table(if(SFL) {
rbind(c("Allele-frequency correlation", "", "", "P-value corr.", "", "", "QC stats", "all SNPs", "*HQ SNPs", "", "Other", "N", "%"), stat_table)
} else {
format(rbind(c("Allele-frequency correlation", "", "", "P-value corr.", "", "", "QC stats", "all SNPs", "*HQ SNPs", "", "Other", "N", "%"),
stat_table), justify = "left") },
logCon, quote = FALSE, sep = if(SFL) "\t" else " ", row.names = FALSE, col.names = FALSE)
rm(stat_table)
if(ignore_impstatus) {
write.table(c("", "",
"************************",
"\t6. QQ plot filters",
"************************",
"",
"NOTE - ignore_impstatus was TRUE: HWE-p, callrate and imputation-Q filters (HQ & QQ) were applied to all SNPs",
"",
""),
logCon, quote = FALSE, row.names = FALSE, col.names = FALSE)
} else {
write.table(c("", "",
"************************",
"\t6. QQ plot filters",
"************************",
"",
"NOTE - Ignore_impstatus was FALSE: HWE-p and callrate filters (HQ & QQ) were applied to genotyped SNPs only;",
" imputation-Q filter to imputed SNPs only.",
""),
logCon, quote = FALSE, row.names = FALSE, col.names = FALSE) }
FRQ_table <- data.frame(names = character(length = 5), N = character(5), perc = character(5), em = character(length = 5), stringsAsFactors = FALSE)
if(is.null(plot_output$FRQfilter_N)) { FRQ_table$names[1] <- "no filter applied"
} else {
FRQ_table$names[1:length(plot_output$FRQfilter_N)] <- plot_output$FRQfilter_names
FRQ_table$N[1:length(plot_output$FRQfilter_N)] <- as.character(plot_output$FRQfilter_N)
FRQ_table$perc[1:length(plot_output$FRQfilter_N)] <- as.character(round(100*plot_output$FRQfilter_N/SNPn_postQC, digits = 2))
}
HWE_table <- data.frame(names = character(length = 5), N = character(5), perc = character(5), em = character(length = 5), stringsAsFactors = FALSE)
if(is.null(plot_output$HWEfilter_N)) { HWE_table$names[1] <- "no filter applied"
} else {
HWE_table$names[1:length(plot_output$HWEfilter_N)] <- plot_output$HWEfilter_names
HWE_table$N[1:length(plot_output$HWEfilter_N)] <- as.character(plot_output$HWEfilter_N)
HWE_table$perc[1:length(plot_output$HWEfilter_N)] <- as.character(round(100*plot_output$HWEfilter_N/SNPn_postQC, digits = 2))
}
cal_table <- data.frame(names = character(length = 5), N = character(5), perc = character(5), em = character(length = 5), stringsAsFactors = FALSE)
if(is.null(plot_output$calfilter_N)) { cal_table$names[1] <- "no filter applied"
} else {
cal_table$names[1:length(plot_output$calfilter_N)] <- plot_output$calfilter_names
cal_table$N[1:length(plot_output$calfilter_N)] <- as.character(plot_output$calfilter_N)
cal_table$perc[1:length(plot_output$calfilter_N)] <- as.character(round(100*plot_output$calfilter_N/SNPn_postQC, digits = 2))
}
imp_table <- data.frame(names = character(length = 5), N = character(5), perc = character(5), stringsAsFactors = FALSE)
if(is.null(plot_output$impfilter_N)) { imp_table$names[1] <- "no filter applied"
} else {
imp_table$names[1:length(plot_output$impfilter_N)] <- plot_output$impfilter_names
imp_table$N[1:length(plot_output$impfilter_N)] <- as.character(plot_output$impfilter_N)
imp_table$perc[1:length(plot_output$impfilter_N)] <- as.character(round(100*plot_output$impfilter_N/SNPn_postQC, digits = 2))
}
write.table(if(SFL) {
rbind(c("Allele frequency", "N", "%", "", "HWE p-value", "N", "%", "", "Call rate", "N", "%", "", "Imp. quality", "N", "%"),
cbind(FRQ_table, HWE_table, cal_table, imp_table))
} else {
format(rbind(c("Allele frequency", "N", "%", "", "HWE p-value", "N", "%", "", "Call rate", "N", "%", "", "Imp. quality", "N", "%"),
cbind(FRQ_table, HWE_table, cal_table, imp_table)), justify = "left") },
logCon, quote = FALSE, sep = if(SFL) "\t" else " ", row.names = FALSE, col.names = FALSE)
rm(FRQ_table, HWE_table, cal_table, imp_table)
write.table(c("", "",
"*********************************",
"\t7. Chromosome & Allele data",
"*********************************",
""),
logCon, quote = FALSE, row.names = FALSE, col.names = FALSE)
### Tables counting chromosomes & alleles
chr_table <- data.frame(Chr = c("NA", "Invalid", 1:22, "23 ( X)",
"24 ( Y)", "25 (XY)", "26 ( M)"),
N1 = integer(length = 28),
perc1= numeric(length = 28),
empty= character(length = 28),
all = character(length = 28),
N2 = integer(length = 28),
perc2= numeric(length = 28),
stringsAsFactors = FALSE)
chr_table$N1[1] <- sum(iNA$chr)
chr_table$N1[2] <- sum(inv$chr)
na_rm_chr <- chr_table$N1[1] + chr_table$N1[2] > 0L
for(ci in 1:22) { chr_table$N1[ci + 2L] <- sum(dataI$CHR == ci, na.rm = na_rm_ch) }
chr_table$N1[25] <- if(remove_X ) NA else sum(dataI$CHR == 23L, na.rm = na_rm_ch)
chr_table$N1[26] <- if(remove_Y ) NA else sum(dataI$CHR == 24L, na.rm = na_rm_ch)
chr_table$N1[27] <- if(remove_XY) NA else sum(dataI$CHR == 25L, na.rm = na_rm_ch)
chr_table$N1[28] <- if(remove_M ) NA else sum(dataI$CHR == 26L, na.rm = na_rm_ch)
chr_table$perc1 <- round(100*chr_table$N1/SNPn_postQC, digits = 2)
if(remove_X) {
chr_table$N1[25] <- "removed"
chr_table$perc1[25] <- paste0("(", chr_X_N, " SNPs)") }
if(remove_Y) {
chr_table$N1[26] <- "removed"
chr_table$perc1[26] <- paste0("(", chr_Y_N, " SNPs)") }
if(remove_XY) {
chr_table$N1[27] <- "removed"
chr_table$perc1[27] <- paste0("(",chr_XY_N, " SNPs)") }
if(remove_M) {
chr_table$N1[28] <- "removed"
chr_table$perc1[28] <- paste0("(", chr_M_N, " SNPs)") }
chr_table$all[1:10] <- c("A", "T", "C", "G", "", "Other allele", "A", "T", "C", "G")
chr_table$N2[1] <- sum(dataI$EFFECT_ALL == "A")
chr_table$N2[2] <- sum(dataI$EFFECT_ALL == "T")
chr_table$N2[3] <- sum(dataI$EFFECT_ALL == "C")
chr_table$N2[4] <- sum(dataI$EFFECT_ALL == "G")
chr_table$N2[7] <- sum(dataI$OTHER_ALL == "A")
chr_table$N2[8] <- sum(dataI$OTHER_ALL == "T")
chr_table$N2[9] <- sum(dataI$OTHER_ALL == "C")
chr_table$N2[10]<- sum(dataI$OTHER_ALL == "G")
chr_table$perc2[1:10] <- round(100*chr_table$N2[1:10]/SNPn_postQC, digits = 2)
chr_table[c(5, 6, 11:28), c(6,7)] <- ""
write.table(if(SFL) {
rbind(c("Chromosome", "N", "%", "", "Effect allele", "N", "%"), chr_table)
} else {
format(rbind(c("Chromosome", "N", "%", "", "Effect allele", "N", "%"), chr_table), justify = "left") },
logCon, quote = FALSE, sep = if(SFL) "\t" else " ", row.names = FALSE, col.names = FALSE)
rm(chr_table)
write.table(c("", "",
"*********************************",
"\t8. Missing & invalid values",
"*********************************",
""),
logCon, quote = FALSE, row.names = FALSE, col.names = FALSE)
error_table1 <- data.frame(names = c("Duplicate SNPs", "Imputation status", "Strand", "> missing", "> invalid", "Chromosome", "> missing", "> invalid", "Position", "> missing", "> invalid", "Invalid SNPs"),
N = c(SNPn_preQC_dupli, SNPn_preQC - (SNPn_preQC_geno+ SNPn_preQC_imp), strand_pre$missing + strand_pre$invalid, strand_pre$missing, strand_pre$invalid, iNA_chr_N + inv_chr_N, iNA_chr_N, inv_chr_N, iNA_pos_N + inv_pos_N, iNA_pos_N, inv_pos_N, SNPn_preQC_inv),
perc = numeric(length = 12), stringsAsFactors = FALSE)
error_table1$perc <- round(100*error_table1$N / SNPn_preQC, digits = 2)
error_table2 <- error_table1
error_table2[1,1] <- "Extreme p-values"
error_table2$N <- c(low_p_newN, SNPn_postQC - (SNPn_postQC_geno+ SNPn_postQC_imp), strand_post$missing + strand_post$invalid, strand_post$missing, strand_post$invalid, sum(iNA$chr | inv$chr), sum(iNA$chr), sum(inv$chr), sum(iNA$pos | inv$pos), sum(iNA$pos), sum(inv$pos), SNPn_postQC_inv)
error_table2$perc <- round(100*error_table2$N / SNPn_postQC, digits = 2)
write.table(if(SFL) {
rbind(c("Pre-QC", "N", "%", "", "Post-QC", "N", "%"),
cbind(error_table1, character(length = 12), error_table2))
} else {
format(rbind(c("Pre-QC", "N", "%", "", "Post-QC", "N", "%"),
cbind(error_table1, character(length = 12), error_table2)),
justify = "left") },
logCon, quote = FALSE, sep = if(SFL) "\t" else " ", row.names = FALSE, col.names = FALSE)
write.table(" * Note: SNPs whose imputation status is missing (i.e. not invalid) are also counted as invalid.",
logCon, col.names=FALSE, row.names=FALSE, quote=FALSE)
rm(error_table1, error_table2)
write.table(c("", "",
"***************************",
"\t9. Settings of the QC",
"***************************",
""),
logCon, quote = FALSE, row.names = FALSE, col.names = FALSE)
settings_table <- data.frame(
col1 = c("input file", "output file", "dir data", "dir output", "dir refs", ""),
col2 = c(filename_input, filename, dir_data, dir_output, dir_references, ""),
stringsAsFactors = FALSE)
write.table(if(SFL) settings_table else format(settings_table, justify = "left"),
logCon, col.names=FALSE, row.names=FALSE, quote=FALSE, sep=if(SFL) "\t" else " ")
rm(settings_table)
# Table already includes an additional row to create whiteline
filter_table <- data.frame(
chr_names = c(" X chrom.", " Y chrom.", "XY chrom.", " M chrom."),
chr_val = c(remove_X, remove_Y, remove_XY, remove_M),
em1 = " ",
fil_names = c("Allele FRQ", "HWE p-value", "Callrate", "Imp. Quality"),
threshold = c(useFRQ_threshold, useHWE_threshold, useCal_threshold, useImp_threshold),
fNA = logical(length = 4),
HQ = c(NA, NA, NA, NA),
QQ = c("-", "-", "-", "-"),
stringsAsFactors = FALSE)
if(useFRQ) {
if(!is.null(c(HQfilter_FRQ, QQfilter_FRQ)))
filter_table[1,6] <- NAfilter_FRQ
if(!is.null(HQfilter_FRQ)) filter_table[1,7] <- HQfilter_FRQ
if(!is.null(QQfilter_FRQ)) filter_table[1,8] <- paste(QQfilter_FRQ, collapse = ", ")
} else { filter_table[1,8] <- "Insufficient data to meet threshold" }
if(useHWE) {
if(!is.null(c(HQfilter_HWE, QQfilter_HWE)))
filter_table[2,6] <- NAfilter_HWE
if(!is.null(HQfilter_HWE)) filter_table[2,7] <- HQfilter_HWE
if(!is.null(QQfilter_HWE)) filter_table[2,8] <- paste(QQfilter_HWE, collapse = ", ")
} else { filter_table[2,8] <- "Insufficient data to meet threshold" }
if(useCal) {
if(!is.null(c(HQfilter_cal, QQfilter_cal)))
filter_table[3,6] <- NAfilter_cal
if(!is.null(HQfilter_cal)) filter_table[3,7] <- HQfilter_cal
if(!is.null(QQfilter_cal)) filter_table[3,8] <- paste(QQfilter_cal, collapse = ", ")
} else { filter_table[3,8] <- "Insufficient data to meet threshold" }
if(useImp) {
if(!is.null(c(HQfilter_imp, QQfilter_imp)))
filter_table[4,6] <- NAfilter_imp
if(!is.null(HQfilter_imp)) filter_table[4,7] <- HQfilter_imp
if(!is.null(QQfilter_imp)) filter_table[4,8] <- paste(QQfilter_imp, collapse = ", ")
} else { filter_table[4,8] <- "Insufficient data to meet threshold" }
write.table(if(SFL) {
rbind(c("> filters", "", "", "", "threshold*", "NA", "HQ", "QQ"), filter_table)
} else {
format(rbind(c("> filters", "", "", "", "threshold*", "NA", "HQ", "QQ"),
filter_table), justify = "left") },
logCon, quote = FALSE, sep = if(SFL) "\t" else " ", row.names = FALSE, col.names = FALSE)
write.table(c("* the value used in phase 4 (opt. after multiplication with the remaining number of SNPs)", ""),
logCon, quote = FALSE, row.names = FALSE, col.names = FALSE)
rm(filter_table)
settings_table <- rbind(c("> other settings", "", "", "", "", "", "> plots", "", "", "", "", "", "> allele references", "", "", "", ""),
data.frame(
c1 = c("remove_mismatches_std", "remove_mismatches_alt", "remove_diffEAF_std", "remove_diffEAF_alt", "threshold_diffEAF", "check_ambiguous_alleles", "return_HQ_effectsizes", "calculate_missing_p"),
c2 = c(remove_mismatches_std, remove_mismatches_alt, remove_diffEAF_std, remove_diffEAF_alt, as.character(threshold_diffEAF), check_ambiguous_alleles, return_HQ_effectsizes, calculate_missing_p),
c3 = " ",
c4 = c("ignore impS", "min imp", "max imp", "imputed T", "imputed F", "imputed NA","",""),
c5 = c(ignore_impstatus, as.character(minimal_impQ_value), as.character(maximal_impQ_value), paste(imputed_T, collapse = ", "), paste(imputed_F, collapse = ", "), paste(imputed_NA, collapse = ", "),"",""),
c6 = " ",
c7 = c("make plots", "only plot if threshold", "histograms", "QQ plot", "Manhattan", "intensity","",""),
c8 = c(make_plots, only_plot_if_threshold, plot_histograms, plot_QQ, plot_Manhattan, plot_intensity,"",""),
c9 = " ",
c10= c("threshold FRQ test" , "threshold p-test", "QQ bands", "plot p cutoff", "*Manhattan threshold", "","",""),
c11= c(threshold_allele_freq_correlation, threshold_p_correlation, as.character(plot_QQ_bands), plot_cutoff_p, useMan_threshold, "","",""),
c12= " ",
c13= c("update alt", "update savename", "save as rdata", "backup alt", "", "","",""),
c14= if(update_alt) { c(TRUE, update_savename, update_as_rdata, backup_alt, "", "","","")
} else { c(FALSE, "-", "-", "-", "", "", "", "" ) },
c15= " ",
c16= c("allele ref std input", "allele ref std name", "allele ref alt input", "allele ref alt name", "", "","",""),
c17= c(settings_allele_std, allele_name_std, settings_allele_alt, allele_name_alt, "", "","",""),
stringsAsFactors = FALSE))
write.table(if(SFL) settings_table else format(settings_table, justify = "left"),
logCon, quote = FALSE, sep = if(SFL) "\t" else " ", row.names = FALSE, col.names = FALSE)
write.table(c("", "> data import & export"),
logCon, quote = FALSE, row.names = FALSE, col.names = FALSE)
settings_table <- data.frame(
col1 = c("nrows", "test_nrows", "header_translations", "comment.char", "column_separators", "selected separator", "na.strings", "header", "", "", ""),
col3 = c(nrows, nrows_test, settings_header_input, encodeString(paste0("'", comment.char, "'")), encodeString(paste0("'", column_separators, "'", collapse = ", ")), encodeString(paste0("'", loadI$sep, "'")), paste(na.strings, collapse = ", "), header, "", "", ""),
col4 = " ",
col5 = c("save_final_dataset", "order_columns", "out_header", "out_quote", "out_sep", "out_eol", "out_na", "out_colnames", "out_rownames", "out_dec", "out_qmethod"),
col7 = c(save_final_dataset, order_columns, settings_header_output, out_quote, encodeString(paste0("'", out_sep, "'")), encodeString(paste0("'", out_eol, "'")), out_na, paste(out_colnames, collapse = ", "), paste(out_rownames, collapse = ", "), out_dec, out_qmethod),
stringsAsFactors = FALSE)
write.table(if(SFL) settings_table else format(settings_table, justify = "left"),
logCon, quote = FALSE, sep = if(SFL) "\t" else " ", row.names = FALSE, col.names = FALSE)
close(logCon)
rm(settings_table)
# creating output file
effect_quantile <- quantile(dataI$EFFECT, names = FALSE)
useStrand <- strand_ref$plus + strand_ref$minus > 0L
empty_colls <- c("CHR","POSITION","STRAND","PVALUE",
"EFF_ALL_FREQ","HWE_PVAL","CALLRATE","N_TOTAL","IMPUTED",
"USED_FOR_IMP", "IMP_QUALITY")[c(
if(iNA_chr_N + inv_chr_N == SNPn_preQC) TRUE else all(iNA$chr | inv$chr),
if(iNA_pos_N + inv_pos_N == SNPn_preQC) TRUE else all(iNA$pos | inv$pos),
strand_post$plus + strand_post$minus == 0L,
if(calculate_missing_p) FALSE else { if(iNA_p_N + inv_p_N == SNPn_preQC) TRUE else all(iNA$p | inv$p) },
if(useFRQ) FALSE else all(iNA$FRQ | inv$FRQ),
if(iNA_HWE_N + inv_HWE_N == SNPn_preQC) TRUE else all(iNA$HWE | inv$HWE),
if(iNA_cal_N + inv_cal_N == SNPn_preQC) TRUE else all(iNA$cal | inv$cal),
!useN,
SNPn_postQC_geno + SNPn_postQC_imp == 0L,
all(is.na(dataI$USED_FOR_IMP)),
if(iNA_impQ_N + inv_impQ_N == SNPn_preQC) TRUE else all(iNA$impQ | inv$impQ)
) ]
QC_results <- list(QC_successful = TRUE,
filename_input = filename_input,
filename_output = paste0(filename, ".txt"),
sample_size = stat_N_max,
sample_size_HQ = stat_N_HQ,
lambda = plot_output$lambda[1],
lambda_geno = plot_output$lambda[2],
lambda_imp = plot_output$lambda[3],
SNP_N_input = SNPn_input,
SNP_N_input_monomorphic = monomorp_N,
SNP_N_input_monomorphic_identic_alleles = same_al_N,
SNP_N_input_chr = if(remove_X | remove_Y | remove_XY | remove_M) sum(chr_X_N, chr_Y_N, chr_XY_N, chr_M_N, na.rm = TRUE) else NA,
SNP_N_preQC = SNPn_preQC,
SNP_N_preQC_unusable = remove_L1_N,
SNP_N_preQC_invalid = SNPn_preQC_inv,
SNP_N_preQC_min = if(strand_pre$plus + strand_pre$minus == 0L) { NA } else { strand_pre$minus },
SNP_N_midQC = SNPn_ref,
SNP_N_midQC_min = if(useStrand) { strand_ref$minus } else { NA },
SNP_N_midQC_min_std = if(use_allele_std & useStrand) { if(strand_ref$minus == 0L) { 0L } else { allele_out_std$n_negative_strand } } else { NA },
SNP_N_midQC_min_alt = if(use_allele_alt & useStrand) { if(strand_ref$minus == 0L) { 0L } else { allele_out_alt$n_negative_strand } } else { NA },
SNP_N_midQC_min_new = if( useStrand) { if(strand_ref$minus == 0L) { 0L } else { allele_out_new$n_negative_strand } } else { NA },
SNP_N_midQC_strandswitch_std = if(use_allele_std) { allele_out_std$n_strandswitch } else { NA },
SNP_N_midQC_strandswitch_std_min = if(use_allele_std & useStrand) { if(strand_ref$minus == 0L) { 0L } else { allele_out_std$n_negative_switch } } else { NA },
SNP_N_midQC_strandswitch_alt = if(use_allele_alt) { allele_out_alt$n_strandswitch } else { NA },
SNP_N_midQC_strandswitch_alt_min = if(use_allele_alt & useStrand) { if(strand_ref$minus == 0L) { 0L } else { allele_out_alt$n_negative_switch } } else { NA },
SNP_N_midQC_mismatch = N_mismatch,
SNP_N_midQC_mismatch_std = if(use_allele_std) { allele_out_std$n_mismatch } else { NA },
SNP_N_midQC_mismatch_std_min = if(use_allele_std & useStrand) { if(strand_ref$minus == 0L) { 0L } else { allele_out_std$n_negative_mismatch } } else { NA },
SNP_N_midQC_mismatch_alt = if(use_allele_alt) { allele_out_alt$n_mismatch } else { NA },
SNP_N_midQC_mismatch_alt_min = if(use_allele_alt & useStrand) { if(strand_ref$minus == 0L) { 0L } else { allele_out_alt$n_negative_mismatch } } else { NA },
SNP_N_midQC_flip_std = if(use_allele_std) { allele_out_std$n_flipped } else { NA },
SNP_N_midQC_flip_alt = if(use_allele_alt) { allele_out_alt$n_flipped } else { NA },
SNP_N_midQC_flip_new = allele_out_new$n_flipped,
SNP_N_midQC_ambiguous = N_ambiguous,
SNP_N_midQC_ambiguous_std = if(use_allele_std) { allele_out_std$n_ambiguous } else { NA },
SNP_N_midQC_ambiguous_alt = if(use_allele_alt) { allele_out_alt$n_ambiguous } else { NA },
SNP_N_midQC_ambiguous_new = allele_out_new$n_ambiguous,
SNP_N_midQC_suspect = N_suspect,
SNP_N_midQC_suspect_std = if(use_allele_std) { allele_out_std$n_suspect } else { NA },
SNP_N_midQC_suspect_alt = if(use_allele_alt) { allele_out_alt$n_suspect } else { NA },
SNP_N_midQC_diffEAF = N_diffEAF,
SNP_N_midQC_diffEAF_std = if(use_allele_std) { allele_out_std$n_diffEAF } else { NA },
SNP_N_midQC_diffEAF_alt = if(use_allele_alt) { allele_out_alt$n_diffEAF } else { NA },
SNP_N_postQC = SNPn_postQC,
SNP_N_postQC_geno = SNPn_postQC_geno,
SNP_N_postQC_imp = SNPn_postQC_imp,
SNP_N_postQC_invalid = SNPn_postQC_inv,
SNP_N_postQC_min = if(strand_post$plus + strand_post$minus == 0L) { NA } else { strand_post$minus },
SNP_N_postQC_HQ = SNPn_postQC_HQ,
fixed_HWE = if(ignore_impstatus) {
if(all(iNA$HWE | inv$HWE)) { "insuf. data"
} else { max(dataI$HWE_PVAL[!iNA$HWE & !inv$HWE]) == min(dataI$HWE_PVAL[!iNA$HWE & !inv$HWE]) }
} else {
if(all(iNA$HWE | inv$HWE | !geno_list)) { "insuf. data"
} else { max(dataI$HWE_PVAL[geno_list & !iNA$HWE & !inv$HWE]) == min(dataI$HWE_PVAL[geno_list & !iNA$HWE & !inv$HWE]) }
},
fixed_callrate = if(ignore_impstatus) {
if(all(iNA$cal | inv$cal)) { "insuf. data"
} else { max(dataI$CALLRATE[!iNA$cal & !inv$cal]) == min(dataI$CALLRATE[!iNA$cal & !inv$cal]) }
} else {
if(all(iNA$cal | inv$cal | !geno_list)) { "insuf. data"
} else { max(dataI$CALLRATE[geno_list & !iNA$cal & !inv$cal]) == min(dataI$CALLRATE[geno_list & !iNA$cal & !inv$cal]) }
},
fixed_sampleN = if(useN) { stat_N_max == min(dataI$N_TOTAL, na.rm = iNA_N_N+inv_N_N>0L) } else "no data",
fixed_impQ = if(ignore_impstatus) {
if(all(iNA$impQ | inv$impQ)) { "insuf. data"
} else { max(dataI$IMP_QUALITY[!iNA$impQ & !inv$impQ]) == min(dataI$IMP_QUALITY[!iNA$impQ & !inv$impQ]) }
} else {
if(all(iNA$impQ | inv$impQ | !imp_list)) { "insuf. data"
} else { max(dataI$IMP_QUALITY[imp_list & !iNA$impQ & !inv$impQ]) == min(dataI$IMP_QUALITY[imp_list & !iNA$impQ & !inv$impQ]) }
},
effect_25 = effect_quantile[2],
effect_mean = mean(dataI$EFFECT),
effect_median = effect_quantile[3],
effect_75 = effect_quantile[4],
SE_median = stat_SE,
SE_median_HQ= stat_SE_HQ,
skewness = stat_skewness,
skewness_HQ = stat_skewness_HQ,
kurtosis = stat_kurtosis,
kurtosis_HQ = stat_kurtosis_HQ,
all_ref_std_name = allele_name_std,
all_ref_alt_name = allele_name_alt,
all_MAF_std_r = if(use_allele_std) allele_out_std$FRQ_cor else NA,
all_MAF_alt_r = if(use_allele_alt) allele_out_alt$FRQ_cor else NA,
all_ambiguous_MAF_std_r = if(use_allele_std) allele_out_std$FRQ_cor_ambiguous else NA,
all_ambiguous_MAF_alt_r = if(use_allele_alt) allele_out_alt$FRQ_cor_ambiguous else NA,
all_non_ambig_MAF_std_r = if(use_allele_std) allele_out_std$FRQ_cor_nonambi else NA,
all_non_ambig_MAF_alt_r = if(use_allele_alt) allele_out_alt$FRQ_cor_nonambi else NA,
all_ref_changed = allele_ref_changed,
effectsize_return = outcome_ES$return_ES,
effectsizes_HQ = if(outcome_ES$return_ES) outcome_ES$HQ_effectsizes else NULL,
pvalue_r = outcome_P,
visschers_stat = stat_Visscher,
visschers_stat_HQ = stat_Vissc_HQ,
columns_std_missing = if(header_info$missing_N == 0L) 0L else paste(header_info$missing_h, collapse = ", "),
columns_std_empty = if(length(empty_colls) == 0L) 0L else paste(empty_colls, collapse = ", "),
columns_unidentified = if(header_info$unknown_N == 0L) 0L else paste(header_info$unknown_h, collapse = ", "),
outcome_useFRQ = useFRQ,
outcome_useHWE = useHWE,
outcome_useCal = useCal,
outcome_useImp = useImp,
outcome_useMan = useMan,
settings_ignore_impstatus = ignore_impstatus,
settings_filter_NA_FRQ = NAfilter_FRQ,
settings_filter_NA_HWE = NAfilter_HWE,
settings_filter_NA_cal = NAfilter_cal,
settings_filter_NA_imp = NAfilter_imp,
settings_filter_HQ_FRQ = HQfilter_FRQ,
settings_filter_HQ_HWE = HQfilter_HWE,
settings_filter_HQ_cal = HQfilter_cal,
settings_filter_HQ_imp = HQfilter_imp,
settings_filter_QQ_FRQ = QQfilter_FRQ,
settings_filter_QQ_HWE = QQfilter_HWE,
settings_filter_QQ_cal = QQfilter_cal,
settings_filter_QQ_imp = QQfilter_imp
)
if(save_final_dataset) {
print(" - saving cleaned dataset", quote = FALSE)
flush.console()
if(settings_header_output != "standard") {
if(settings_header_output == "original") {
if(order_columns) {
header_new <- colnames(dataI)
order_orig <- integer(length = header_info$header_N)
for(hI in 1:header_info$header_N) { order_orig[hI] <- which(header_new == header_info$header_h[hI]) }
colnames(dataI)[order_orig] <- header_orig
} else {
colnames(dataI)[1:length(header_orig)] <- header_orig
}
} else {
header_out <- translate_header(header = colnames(dataI), standard = out_header[ ,1], alternative = out_header)
if(header_out$missing_N > 0L) save_log(5, "saving file", "missing column", SNPn_postQC, SNPn_postQC, "-", paste("Unable to translate column(s)", paste(header_out$missing_h, collapse = ", ")), filename_dir)
colnames(dataI) <- header_out$header_h
if(settings_header_output == "GenABEL") {
if(!"build" %in% header_out$unknown_h) dataI$build <- as.factor("unknown")
dataI$effallele <- as.factor(dataI$allele1)
if(!"pgc" %in% header_out$unknown_h) {
dataI$pgc <- if(is.na(plot_output$lambda[1])) NA else {
pchisq( (dataI$beta/(dataI$sebeta * sqrt(plot_output$lambda[1]) ) )^2,
1, lower.tail=FALSE) }
}
if(!"lambda.estimate" %in% header_out$unknown_h) dataI$lambda.estimate <- plot_output$lambda[1]
if(!"lambda.se" %in% header_out$unknown_h) dataI$lambda.se <- NA
dataI <- dataI[ , c("name", "chromosome", "position", "strand",
"allele1", "allele2", "build", "effallele",
"effallelefreq", "n", "beta", "sebeta", "p",
"pgc", "lambda.estimate", "lambda.se",
"pexhwe", "call")]
}
}
}
write.table(dataI,
if(gzip_final_dataset) gzfile(paste0(filename_dir, ".txt.gz")) else paste0(filename_dir, ".txt"),
quote = out_quote, sep = out_sep, eol = out_eol, na = out_na, dec = out_dec,
row.names = out_rownames, col.names = out_colnames, qmethod = out_qmethod)
}
print("", quote = FALSE)
print(paste("QC check completed for", filename_input,"( file", logI, "out of", logN,")"), quote = FALSE)
return(QC_results)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.