Nothing
#### added columns to dataset
# found , whether found in any reference files
# palindromic
# match , no nedd to flip or switch
# flip
# switch
# wrong , wrong variant based on alleles and matching with references
# highDiffEAF
# HQ
# PVALUE.calculated
process_each_file <- function(study){
print_and_log(sprintf('============== [ File %s from %s ] ==============',
.QC$file.counter,
length(.QC$qc.study.list)),
'info',
cat = TRUE)
## processing crucial columns and creating report variables
config <- .QC$config
# study variable contains all variables and paths
.QC$thisStudy <- study
.QC$thisStudy$starttime <- Sys.time()
.QC$thisStudy$number <- .QC$file.counter
.QC$thisStudy$effect_type_string <- .QC$config$input_parameters$effect_type_string
.QC$file.counter <- .QC$file.counter + 1 ## counter that displays how many files are processed
#print_and_log(mem_used(),'info',cat= FALSE)
### uplolading and processing file
## ==============================================
input.data <- uploadInputFile()
#print_and_log(mem_used(),'info',cat= FALSE)
# return null if file could not be read
#====================================
if(is.null(input.data)){
print_and_log('File removed from QC analysis due to error in loading!','warning')
addEmptyStudy_studyInstance(.QC$thisStudy)
return(NULL)
}
invisible(gc())
# remove duplicate lines of data
# study$dup_lines_count is set
#TODO ignored because it is very time consuming
#input.data <- removeDuplicatedLines(input.data)
# IMPORTANT: variants may be removed in 2 steps
# 1- missing crucial variables
# 2- monomorphic, duplicate, chromosomes
## ==============================================
input.data <- processInputFile(input.data)
#print_and_log(mem_used(),'info',cat= FALSE)
# return null if error encountered
#====================================
if(is.null(input.data)){
print_and_log('File removed from QC analysis due to error in processing!','warning')
addEmptyStudy_studyInstance(.QC$thisStudy)
return(NULL)
}
# variable_statistics_pre_matching(input.data) moved to process_matched_data()
invisible(gc())
### matching with reference dataset
## ==============================================
print_and_log('Comparing input file with reference file (this might take long) ...','info')
input.data<-tryCatch(compareInputfileWithReferenceData(input.data),
error = function(err) {
print_and_log(paste('Error in comparing process:' , err$message), 'warning')
return(NULL)
}
)
#====================================
if(is.null(input.data)){
print_and_log('File removed from QC analysis due to error in comparing process!','warning')
addEmptyStudy_studyInstance(.QC$thisStudy)
return(NULL)
}
invisible(gc())
#print_and_log(mem_used(),'info',cat= FALSE)
## allele matching
## ==============================================
print_and_log('Allele matching vs Allele Freq reference dataset ...','info')
input.data <- tryCatch(input.data[,c("match_result","palindromic") :=
variant_match(EFFECT_ALL,OTHER_ALL,ALT,REF,VT),
by = list(EFFECT_ALL,OTHER_ALL,ALT,REF,VT)],
error = function(err) {
print_and_log(paste('Error in allele matching:' , err$message), 'warning')
return(NULL)
}
)
#====================================
if(is.null(input.data)){
print_and_log('File removed from QC analysis due to error in allele matching!','warning')
addEmptyStudy_studyInstance(.QC$thisStudy)
return(NULL)
}
invisible(gc())
#print_and_log(mem_used(),'info',cat= FALSE)
## flip and switch and analyzing
# IMPORTANT: variants may be removed in 1 step
# 3- mismatched variants
## ==============================================
print_and_log('Data processing ...','info')
input.data <- tryCatch(process_matched_data(input.data),
error = function(err) {
print_and_log(paste('Error in variant processing:' , err$message), 'warning')
return(NULL)
}
)
#====================================
if(is.null(input.data)){
print_and_log('File removed from QC analysis due to error in variant processing!','warning')
addEmptyStudy_studyInstance(.QC$thisStudy)
return(NULL)
}
invisible(gc())
## Calculations
## ==============================================
## calculate Pvalue from stderr and effect
input.data <- calculate_PVALUE(input.data) #pValueFunctions.R
## calculate correlation between the 2 Pvalues
calculate_pvalue_correlation(input.data)
# calcualte AF correlation between study and std ref
calculate_af_correlation_std_ref(input.data) #calculationFunctions.R
# calcualte AF correlation between study and alt ref
if(!is.na(config$supplementaryFiles$allele_ref_alt))
calculate_af_correlation_alt_ref(input.data) #calculationFunctions.R
# selcet high quality variants based on 4 filters
input.data <- applyHQfilter(input.data) #calculationFunctions.R
# generate distribution statistcis for HQ variants
# summary of important columns
if(.QC$thisStudy$HQ.count == .QC$thisStudy$rowcount.step3)
{
.QC$thisStudy$tables$variable.summary.HQ <- .QC$thisStudy$tables$variable.summary
}
else if(.QC$thisStudy$HQ.count > 0)
{
.QC$thisStudy$tables$variable.summary.HQ <- variable_statistics_post_matching_HQ(input.data)
}
##TODO outliers are defined as skewness > 0.1 or < -0.1, or kurtosis > 10
calculateSkewness(input.data)
calculateSkewness_HQ(input.data)
calculateKurtosis(input.data)
calculateKurtosis_HQ(input.data)
calculateVischerStats(input.data)
calculateVischerStats_HQ(input.data)
calculateLambda(input.data)
### ===================== variables for multifile comparison ==============
## --------- effect plot for box plot -------------
eff.col <- as.data.table(input.data[HQ==TRUE]$EFFECT)
##------ variables for precision plot ---------
## ============================================
.QC$thisStudy$STDERR.mean.HQ <- mean(input.data[HQ == TRUE]$STDERR,na.rm = TRUE)
df <- data.frame(
x = 1,
y0 = min(eff.col$V1),
y25 = stats::quantile(eff.col$V1, 0.25),
y50 = stats::median(eff.col$V1),
y75 = stats::quantile(eff.col$V1, 0.75),
y100 = max(eff.col$V1)
)
# upper whisker = largest observation less than or equal to upper hinge + 1.5 * IQR
y_upper = df$y75 + 1.5 * stats::IQR(eff.col$V1)
#lower whisker = smallest observation greater than or equal to lower hinge - 1.5 * IQR
y_lower = df$y25 - 1.5 * stats::IQR(eff.col$V1)
#TODO maybe show values inside plot
.QC$thisStudy$effect.plot.df <- df
.QC$thisStudy$effect.plot.df_y_upper <- y_upper
.QC$thisStudy$effect.plot.df_y_lower <- y_lower
# generate the plot usin generateEffectSizePlot() function instead of saving the plot
# file.number = .QC$thisStudy$number
#
# if("N_CASES" %in% .QC$thisStudy$renamed.File.Columns)
# {
# file.N.max = .QC$thisStudy$MAX_N_CASES
# print_and_log("N_CASES will be used for MAX_N value.")
# }
# else
# file.N.max = .QC$thisStudy$MAX_N_TOTAL
#
#
#
# .QC$thisStudy$effect.plot = ggplot(df, aes(x)) +
# geom_boxplot(
# aes(ymin = y0, lower = y25, middle = y50, upper = y75, ymax = y100),
# stat = "identity") +
# geom_errorbar(aes(ymin=y_lower, ymax=y_upper), width=0.6,
# position=position_dodge(.9)) +
# labs(x= file.number ,y="effect size",subtitle = sprintf('N = %s', file.N.max)) +
# theme_classic(base_size = 8)+
# coord_cartesian(ylim = c(-0.6,0.6)) +
# geom_hline(yintercept = 0.1,linetype = 2,color='red') +
# geom_hline(yintercept = -0.1,linetype = 2,color='red') +
# theme(axis.text.x=element_blank())
### ===================== END of multi study variables =====================
## updating alternate refrence file
## ==============================================
#cat('\n--- [evaluating alternate reference ...] ---',fill=TRUE)
if(!is.na(config$supplementaryFiles$allele_ref_alt))
update_alternate_reference(input.data)
## data cleaning 1
## ==============
rm(eff.col) # was used for eff.plot
# added while checking with reference sets
input.data[, CHR.y := NULL]
input.data[, POS := NULL]
input.data[, ALT := NULL]
input.data[, REF := NULL]
# input.data[, AF := NULL] required for alele freq plot and correlation calculation
input.data[, DATE_ADDED := NULL]
# input.data[, SOURCE := NULL] required for alele freq plot and correlation calculation
# added while matching with refrence sets
input.data[, match := NULL]
input.data[, flip := NULL]
input.data[, switch := NULL]
input.data[, wrong := NULL]
invisible(gc())
#print_and_log(mem_used(),'info',cat= FALSE)
## 6- plots
## ==============================================
print_and_log('Plotting ...','info')
drawPlots(input.data)
invisible(gc())
## ==============================================
print_and_log('Saving data set ...','info')
## missing pvalues set from calculated pvalues
input.data <-fill_missing_pvalues_from_calculated_pvalues(input.data)
saveDataSet_final(input.data) #'saveFilesFunctions.R'
invisible(gc())
#print_and_log(mem_used(),'info',cat= FALSE)
## data cleaning 2
## ==============
# these items contain long list of missing and invalid variant indexes
# keeps huge memory
# not required after creating the report
.QC$thisStudy$column.NA.list <- lapply(.QC$thisStudy$column.NA.list, length)
.QC$thisStudy$column.INVALID.list <- lapply(.QC$thisStudy$column.INVALID.list, length)
.QC$thisStudy$endtime <- Sys.time()
# save txt report file
report_to_txt_file(.QC$thisStudy)
## Compare Beta(Effect) values with refrence set and draw plots
# =============================================================
if(nrow(.QC$reference.data.effect) > 0 ){
print_and_log('Comparing input file with Effect-Size reference file ...','info')
input.data<-tryCatch(compareInputfileWithBetaReferenceFile(input.data),
error = function(err) {
print_and_log(paste('Error in comparing with Effect-Size reference:' , err$message), 'warning')
return(data.table())
}
)
# only variants that could be matched are returned.
# gc not done in previous function
invisible(gc())
## allele matching for effect size plot
## ==============================================
if(nrow(input.data) > 0){
# THIS STEP IS NOT REQUIRED, BECAUSE DATA IS ALREADY MATCHED WITH REFERENCE DATASET
# print_and_log('Allele matching vs Effect-Size reference dataset ...','info')
# input.data <- tryCatch(allele.match.effectPlot(input.data),
# error = function(err) {
# print_and_log(paste('Error in allele matching:' , err$message), 'warning')
# return(NULL)
# }
# )
if(config$plot_specs$make_plots & !is.null(input.data))
tryCatch(plot_DataEFFECT_vs_RefEFFECT(input.data,
.QC$thisStudy$effPlotPath,
.QC$thisStudy$plot.title),
error = function(err) {
print_and_log(paste('Error plotting effect-size correlation plot:' , err$message), 'warning')
return(NULL)
}
)
}else
{
print_and_log('No variants were found in Effect-size reference dataset!','warning')
print_and_log('Effect-size comparison plot is skipped!','warning')
.QC$thisStudy$effect.rho_4 <- 'NA (no variants were found)'
}
# write(x = '\n\n[Comparing Input File with Effect-size reference dataset (HQ variants)]',
# file = .QC$thisStudy$txt.report.path ,
# sep = '\n',
# append = TRUE)
writeTXTreport('\n\nComparing Input File with Effect-size reference dataset (HQ variants)')
writeTXTreport(kable(.QC$thisStudy$tables$betaCor.tbl, align = "l",format = "rst"))
writeTXTreport('\n* Data is presented as r(N). Variants were filtered on reference data P-values. ')
writeTXTreport('** Data is presented as r(N). Variants were filtered on input result file P-values. ')
# write(x = paste('r (P-value < 0.001) =' , .QC$thisStudy$effect.rho_3),
# file = .QC$thisStudy$txt.report.path ,
# sep = '\n',
# append = TRUE)
#
# write(x = paste('r (P-value < 0.0001) =' , .QC$thisStudy$effect.rho_4),
# file = .QC$thisStudy$txt.report.path ,
# sep = '\n',
# append = TRUE)
#
# write(x = paste('r (P-value < 0.00001) =' , .QC$thisStudy$effect.rho_5),
# file = .QC$thisStudy$txt.report.path ,
# sep = '\n',
# append = TRUE)
#
# write(x = paste('r (P-value < 0.000001) =' , .QC$thisStudy$effect.rho_6),
# file = .QC$thisStudy$txt.report.path ,
# sep = '\n',
# append = TRUE)
}
## save significant variants ( p-value < 1e-8 & HQ & rsID) ==> CHR-POS-MARKER-PVALUE
#save_significant_variants(input.data)
# save rds file
studyClass <- create_Study(.QC$thisStudy)
# .QC$StudyList <- append( .QC$StudyList , studyClass)
.QC$StudyList@studyList <- append(.QC$StudyList@studyList , studyClass)
.QC$StudyList@studyCount <- length(.QC$StudyList@studyList)
save_rds_file(.QC$thisStudy)
rm(input.data)
invisible(gc())
## ended
##==============
print_and_log('\n','info')
## add info to text report
writeTXTreport("\n==============================================")
writeTXTreport(sprintf('Generated by %s package - v.%s',.QC$package.name, .QC$script.version))
writeTXTreport(as.character(Sys.time()))
return(.QC$thisStudy)
}
create_file_specific_config <- function(file.name){
### these varaibles are used for each file that is QC'ed ###
# study$starttime
# study$endtime
#
#
# study$AFcor.std_ref
# study$AFcor.non.palindromic.std_ref
# study$AFcor.palindromic.std_ref
# study$AFcor.alt_ref
# study$AFcor.non.palindromic.alt_ref
# study$AFcor.palindromic.alt_ref
# study$duplicate.count
# study$HQ.count
# study$input.data.rowcount
# study$kurtosis
# study$kurtosis.HQ
# study$lambda
# study$lambda.gen
# study$lambda.imp
# study$LQ.count
# study$missing.crucial.rowcount
# study$multiAlleleVariants.rowcount
# study$missing.alleles.rowcount
# study$monomorphic.count
# study$neg.strand.count
# study$PVcor
# study$PVcor.palindromic
# study$skewness
# study$skewness.HQ
# study$Visschers.stat
# study$Visschers.stat.HQ
# Study$MAX_N_TOTAL
# study$hasINDEL
# study$hID.added
# check if there are characters in chromosome column.
# they are converted to number and should be deconverted before saving final dataset
# study$character.chromosome
# if study has none base character as alleles => so they sholod not be regarded as mismatches
# study$hanNoneBaseAlleles
# study$column.INVALID.list$CALLRATE
# study$column.INVALID.list$CHR
# study$column.INVALID.list$EFF_ALL_FREQ
# study$column.INVALID.list$EFFECT
# study$column.INVALID.list$EFFECT_ALL
# study$column.INVALID.list$HWE_PVAL
# study$column.INVALID.list$IMP_QUALITY
# study$column.INVALID.list$IMPUTED
# study$column.INVALID.list$minusone.CALLRATE
# study$column.INVALID.list$minusone.EFF_ALL_FREQ
# study$column.INVALID.list$minusone.HWE_PVAL
# study$column.INVALID.list$minusone.PVALUE
# study$column.INVALID.list$N_TOTAL
# study$column.INVALID.list$one.EFF_ALL_FREQ
# study$column.INVALID.list$OTHER_ALL
# study$column.INVALID.list$POSITION
# study$column.INVALID.list$PVALUE
# study$column.INVALID.list$STDERR
# study$column.INVALID.list$zero.EFF_ALL_FREQ
# study$column.INVALID.list$zero.STDERR
#
#
#
#
# study$column.NA.list$CALLRATE
# study$column.NA.list$CHR
# study$column.NA.list$EFF_ALL_FREQ
# study$column.NA.list$EFFECT
# study$column.NA.list$EFFECT_ALL
# study$column.NA.list$HWE_PVAL
# study$column.NA.list$IMP_QUALITY
# study$column.NA.list$IMPUTED
# study$column.NA.list$MARKER
# study$column.NA.list$N_TOTAL
# study$column.NA.list$OTHER_ALL
# study$column.NA.list$POSITION
# study$column.NA.list$PVALUE
# study$column.NA.list$STDERR
# study$column.NA.list$STRAND
#
#
#
# study$tables$CHR.tbl
# study$tables$VT.tbl
# study$tables$EFFECT_ALL.tbl
# study$tables$imputed.tbl
# study$tables$OTHER_ALL.tbl
# study$tables$match.ref.table whcih ref is used for matching
# study$tables$VT.ref.table how many variants of each type and multi allelic or not
## multi file comparison
# study$STDERR.mean.HQ
# study$N.max
# study$effect.plot
# title if the plots . it is either the name of input file or set by user
# it is automatrically set as file name if there are multiple files
# study$plot.title
study <- c()
config <- .QC$config
file.name <- as.character(file.name[[1]])
study$file.path <- file.name
study$file.extension <- file_ext(file.name)
study$zipped.File <- FALSE
if(study$file.extension %in% c('gz','zip','bz2'))
study$zipped.File <- TRUE
##-- getting filename without extensions
study$file.name <- tools::file_path_sans_ext(basename(file.name))
# remove the extension before gz
#if(study$file.extension == 'gz' && any(endsWith(x = study$file.name,suffix = c('.txt','.csv')))) ## replaced to be compatible with older R
if(study$file.extension %in% c('gz','bz2') && any( grepl("(txt|csv)$", x = study$file.name)))
{
study$file.name <- sub(x= study$file.name , pattern = '\\.(\\w{3})$', replacement = '')
}
###------
print_and_log(sprintf("Checking Study file : '%s'",file.name),
'info')
# get_file_line_count if file is not zipped
# if(config$test.run) # do not count the lines if it is a test run
# study$file.line.count <- 'NA (test run)'
# else if(study$zipped.File) # do not count the lines if it is zipped
# study$file.line.count <- 'NA (zipped file)'
# else
# study$file.line.count <- get_file_line_count(file.name)
# get_file_line_count if file is not zipped
if(study$file.extension != 'zip') # do not count the lines if it is a test run
{
fileInspection <- get_file_line_count_RUtils(file.name)
study$file.line.count <- fileInspection[1]
study$file.endsWithNewLine <- fileInspection[2]
}
###### ==== file names ==== ##########
# this prefix is put infront of all files. it does not have any extension
# /home/user/ + QC + _ + studyfile
files.prefix <- sprintf('%s/%s_%s',
config$paths$dir_output,
config$paths$filename_output_tag,
study$file.name)
## add plot paths
study$manPlotPath<-paste0(files.prefix, '_graph_M', .QC$img.extension)
study$histPlotPath<-paste0(files.prefix, '_graph_histogram' , .QC$img.extension)
study$stdMafPlotPath<-paste0(files.prefix, '_graph_EAF_SR' , .QC$img.extension)
study$stdMafSmPlotPath<-paste0(files.prefix, '_graph_EAF_SR_sm' , .QC$img.extension)
study$altMafPlotPath<-paste0(files.prefix, '_graph_EAF_AR' , .QC$img.extension)
study$pvalCorPlotPath<-paste0(files.prefix, '_graph_p_correlation' , .QC$img.extension)
study$pvalCorSmPlotPath<-paste0(files.prefix, '_graph_p_correlation_sm' , .QC$img.extension)
study$QQPlotPath<-paste0(files.prefix, '_graph_QQ' , .QC$img.extension)
study$effPlotPath<-paste0(files.prefix, '_beta' , .QC$img.extension)
study$plot.title <- ifelse(length(config$paths$input_files) > 1 | config$plot_specs$plot_title == 'none',
study$file.name,
config$plot_specs$plot_title)
## add dataset paths
#variant with missing crucial values
study$SNPs_invalid.path<-paste(files.prefix, 'vars_invalid_allele.txt' , sep='_')
#variant with invalid allele values
study$SNPs_removed.path<-paste(files.prefix, 'vars_removed.txt' , sep='_')
#variant with invalid both allele values
study$SNPs_invalid_both.path<-paste(files.prefix, 'vars_invalid_alleles.txt' , sep='_')
#variant with missing non-crucial values
study$SNPs_improbable_values.path<-paste(files.prefix, 'vars_improbable_values.txt' , sep='_')
# duplicate variants
study$SNPs_duplicates.path<-paste(files.prefix, 'vars_duplicates.txt' , sep='_')
study$SNPs_duplicates_postMatch.path<-paste(files.prefix, 'vars_duplicates_post_match.txt' , sep='_')
# mismatched bi-allelic variants
study$SNPs_mismatches.path<-paste(files.prefix, 'vars_mismatches_BA.txt' , sep='_')
# monomorphic variants
study$SNPs_monomorphic.path<-paste(files.prefix, 'vars_monomorphic.txt' , sep='_')
# multi_allelic variants
# vairants that are found in reference but cannot be matched because of many variants on the same position
# mismatched multi-allelic variants
study$SNPs_multi_allelic.path<-paste(files.prefix, 'vars_mismatches_MA.txt' , sep='_')
# duplicated match variants
study$SNPs_ambiguous.path<-paste(files.prefix, 'vars_ambiguous.txt' , sep='_')
# significant variants
study$SNPs_significant.path<-paste(files.prefix, 'vars_significant.txt' , sep='_')
# cleaned output file
study$output.path<-paste(files.prefix, '.txt' , sep='')
# effect-size reference dataset path
study$effect_size_ref.output.path<-paste(files.prefix, 'effect-size_ref.rds' , sep='_')
# html report file
study$html.report.path<-paste(files.prefix, 'report.html' , sep='_')
# txt report file
study$txt.report.path<-paste(files.prefix, 'report.txt' , sep='_')
# rdata object file
study$rds.study.rds.path<-paste(files.prefix, 'object.rds' , sep='_')
## ====== column variables ======== ##
study$original.File.Columns <- character(0)
study$original.File.Columns.sorted <- character(0)
study$renamed.File.Columns.sorted <- character(0)
study$wanted.columns.index <- character(0)
study$renamed.File.Columns.classes <- character(0)
study$missing.Columns <- 'none'
study$missing.PVALUE.column <- FALSE
study$hasINDEL <- FALSE
study$hID.added <- FALSE
study$hanNoneBaseAlleles <- FALSE
study$HQ.count <- 0
study$LQ.count <- 0
study$input.data.rowcount <- 0 # originla file row number - before any processing
study$rowcount.step1 <- 0 # after processing crucial columns
study$rowcount.step2 <- 0 # aftere removing duplicates, monomorphics and chromosal variants
study$rowcount.step3 <- 0 # aftere
study$missing.crucial.rowcount<- 0
study$missing.alleles.rowcount <- 0
study$neg.strand.count<- 0
study$monomorphic.count <- 0
study$duplicate.count<- 0
study$found.rows <- 0
study$mismatched.rows<- 0
study$ambiguos.rows<- 0
study$not.found.rows <- 0
study$switched.rows <- 0
study$flipped.rows <- 0
study$palindromic.rows <- 0
study$non.palindromic.rows<- 0
study$skewness <- 'NA'
study$skewness.HQ <- 'NA'
study$Visschers.stat <- 'NA'
study$Visschers.stat.HQ <- 'NA'
study$MAX_N_TOTAL <- 'NA'
study$tables$imputed.tbl <- as.data.table(matrix(NA,2,2))
study$tables$multi_allele_count_preProcess <- data.table()
study$tables$multi_allele_count_postProcess <- data.table()
study$tables$variable.summary.HQ <- data.table()
colnames(study$tables$imputed.tbl) <- c('IMPUTED','N')
study$tables$imputed.tbl$IMPUTED[1] <- 'Imputed'
study$tables$imputed.tbl$IMPUTED[2] <- 'Genotyped'
study$lambda <- 0
study$lambda.gen <- 'not available'
study$lambda.imp <- 'not available'
study$PVcor <- 'not available (missing P-value column)'
study$PVcor.palindromic <- 'not available (missing P-value column)'
study$rownum.PVcor <- 0
study$fixed.hwep <- 'NA'
study$fixed.callrate <- 'NA'
study$fixed.n_total <- 'NA'
study$fixed.impq <- 'NA'
study$AFcor.std_ref <- 'NA'
study$AFcor.non.palindromic.std_ref <- 'NA'
study$AFcor.palindromic.std_ref <- 'NA'
study$AFcor.std_ref.indel <- 'NA'
study$AFcor.std_ref.CHR <- 'NA'
study$AFcor.alt_ref <- 'NA'
study$AFcor.non.palindromic.alt_ref <- 'NA'
study$AFcor.palindromic.alt_ref <- 'NA'
study$AFcor.alt_ref.indel <- 'NA'
study$character.chromosome <- FALSE
## multi file comparison
study$STDERR.mean.HQ <- 0
study$effect.rho_4 <- 'NA (not calculated)'
# study$column.INVALID.list$CALLRATE <- numeric(length = 0L)
# study$column.INVALID.list$CHR <- numeric(length = 0L)
# study$column.INVALID.list$EFF_ALL_FREQ <- numeric(length = 0L)
# study$column.INVALID.list$EFFECT <- numeric(length = 0L)
# study$column.INVALID.list$EFFECT_ALL <- numeric(length = 0L)
# study$column.INVALID.list$HWE_PVAL <- numeric(length = 0L)
# study$column.INVALID.list$IMP_QUALITY <- numeric(length = 0L)
# study$column.INVALID.list$IMPUTED <- numeric(length = 0L)
# study$column.INVALID.list$N_TOTAL <- numeric(length = 0L)
# study$column.INVALID.list$OTHER_ALL <- numeric(length = 0L)
# study$column.INVALID.list$POSITION <- numeric(length = 0L)
# study$column.INVALID.list$PVALUE <- numeric(length = 0L)
# study$column.INVALID.list$STDERR <- numeric(length = 0L)
study$column.INVALID.list$minusone.CALLRATE <- numeric(length = 0L)
study$column.INVALID.list$zero.EFF_ALL_FREQ <- numeric(length = 0L)
study$column.INVALID.list$one.EFF_ALL_FREQ <- numeric(length = 0L)
study$column.INVALID.list$minusone.EFF_ALL_FREQ <- numeric(length = 0L)
study$column.INVALID.list$minusone.HWE_PVAL <- numeric(length = 0L)
study$column.INVALID.list$minusone.PVALUE <- numeric(length = 0L)
study$column.INVALID.list$zero.STDERR <- numeric(length = 0L)
study$column.INVALID.list$CALLRATE = numeric(length = 0L)
study$column.INVALID.list$CHR = numeric(length = 0L)
study$column.INVALID.list$EFF_ALL_FREQ = numeric(length = 0L)
study$column.INVALID.list$EFFECT = numeric(length = 0L)
study$column.INVALID.list$EFFECT_ALL = numeric(length = 0L)
study$column.INVALID.list$HWE_PVAL = numeric(length = 0L)
study$column.INVALID.list$IMP_QUALITY = numeric(length = 0L)
study$column.INVALID.list$IMPUTED = numeric(length = 0L)
study$column.INVALID.list$MARKER = numeric(length = 0L)
study$column.INVALID.list$N_TOTAL = numeric(length = 0L)
study$column.INVALID.list$OTHER_ALL = numeric(length = 0L)
study$column.INVALID.list$BOTH_ALL = numeric(length = 0L)
study$column.INVALID.list$POSITION = numeric(length = 0L)
study$column.INVALID.list$PVALUE = numeric(length = 0L)
study$column.INVALID.list$STDERR = numeric(length = 0L)
study$column.INVALID.list$STRAND = numeric(length = 0L)
study$column.NA.list$CALLRATE = numeric(length = 0L)
study$column.NA.list$CHR = numeric(length = 0L)
study$column.NA.list$EFF_ALL_FREQ = numeric(length = 0L)
study$column.NA.list$EFFECT = numeric(length = 0L)
study$column.NA.list$EFFECT_ALL = numeric(length = 0L)
study$column.NA.list$HWE_PVAL = numeric(length = 0L)
study$column.NA.list$IMP_QUALITY = numeric(length = 0L)
study$column.NA.list$IMPUTED = numeric(length = 0L)
study$column.NA.list$MARKER = numeric(length = 0L)
study$column.NA.list$N_TOTAL = numeric(length = 0L)
study$column.NA.list$OTHER_ALL = numeric(length = 0L)
study$column.NA.list$POSITION = numeric(length = 0L)
study$column.NA.list$PVALUE = numeric(length = 0L)
study$column.NA.list$STDERR = numeric(length = 0L)
study$column.NA.list$STRAND = numeric(length = 0L)
# study$column.NA.list$CALLRATE <- numeric(length = 0L)
# study$column.NA.list$CHR <- numeric(length = 0L)
# study$column.NA.list$EFF_ALL_FREQ <- numeric(length = 0L)
# study$column.NA.list$EFFECT <- numeric(length = 0L)
# study$column.NA.list$EFFECT_ALL <- numeric(length = 0L)
# study$column.NA.list$HWE_PVAL <- numeric(length = 0L)
# study$column.NA.list$IMP_QUALITY <- numeric(length = 0L)
# study$column.NA.list$IMPUTED <- numeric(length = 0L)
# study$column.NA.list$MARKER <- numeric(length = 0L)
# study$column.NA.list$N_TOTAL <- numeric(length = 0L)
# study$column.NA.list$OTHER_ALL <- numeric(length = 0L)
# study$column.NA.list$POSITION <- numeric(length = 0L)
# study$column.NA.list$PVALUE <- numeric(length = 0L)
# study$column.NA.list$STDERR <- numeric(length = 0L)
# study$column.NA.list$STRAND <- numeric(length = 0L)
# specific chromosome count in file
study$x.chr.count.removed <- 0
study$y.chr.count.removed <- 0
study$xy.chr.count.removed <- 0
study$m.chr.count.removed <- 0
study$dup_lines_count <- 0
# get variable summary statistics from table
# ===================
# study$tables$variable.summary['Min.','EFFECT']
# study$tables$variable.summary['1st Qu.','EFFECT']
# study$tables$variable.summary['Median','EFFECT']
# study$tables$variable.summary['Mean','EFFECT']
# study$tables$variable.summary['3rd Qu.','EFFECT']
# study$tables$variable.summary['Max.','EFFECT']
study$effect.plot <- textGrob("Empty Effect Plot", gp = gpar(fontsize=12,col='red', fontface='bold'))
# study$effect.plot <- ggdraw() +
# draw_label('Empty Effect Plot', x = .5, y = .5,
# vjust = .5, hjust = .5, size = 10, fontface = 'bold',colour = 'darkred')
# check header of the input file
study <- tryCatch(checkRequiredColumnNames(file.name,study),
error = function(err)
{
print_and_log(paste('Could not read file header:',err$message),'warning')
return(NULL)
}
)
# print_and_log("\nColumn check is done!",'info')
# cat('----------------------------------------------------',fill = TRUE)
if(!is.null(study))
return(study)
}
get_file_line_count <- function(file.path)
{
## get line count of input file
# this values is used to compare count of file lines with count of vatriants in input datset
# if they do not match , it means some lines are not read
# skip if input file is zipped (because line number is binary line count)
if(.QC$wc.exists){
line <- system(sprintf("wc -l %s", file.path), intern = TRUE)
lineCount.regex.value <- regexec('([0-9]*)([A-z ]*)',line)[[1]]
start.char <- lineCount.regex.value[2]
end.char <- start.char + attributes(lineCount.regex.value)$match.length[2] - 1
# removed to exclude stringr package
# file.line.count <- format(as.numeric(str_match(line, "([0-9]*)([A-z ]*)")[,2]),
# big.mark="," ,
# scientific = FALSE)
file.line.count <- format(as.numeric(substr(line, start.char, end.char)),
big.mark="," ,
scientific = FALSE)
}
else
{
file.line.count <- 'NA (command not accessible)'
}
return(file.line.count)
}
get_file_line_count_RUtils <- function(file.path)
{
## get line count of input file
# this values is used to compare count of file lines with count of vatriants in input datset
# if they do not match , it means some lines are not read
# skip if input file is zipped (because line number is binary line count)
file.line.count <- tryCatch({
line <- R.utils::countLines(file.path)
file_ends_withNewline <- as.character(attr(line,"lastLineHasNewline"))
line <- format(line,
big.mark="," ,
scientific = FALSE)
return(list(line,file_ends_withNewline))
},
error = function(err) return("NA (command not accessible)","NA")
)
return(file.line.count)
}
get_study_name <- function(study) {
return(study$file.name)
}
get_skewness_kurtosis <- function(study) {
vec <- data.table('kurtosis' = study$kurtosis.HQ,
'skewness' = study$skewness.HQ,
'order' = study$number)
return(vec)
}
get_precision_plot_values <- function(study) {
vec <- data.table('SE.mean.HQ' = 1 / study$STDERR.mean.HQ,
'sqrt.n' = sqrt(study$MAX_N_TOTAL),
'order' = study$number)
return(vec)
}
verify_files_with_user <- function(qc.study.list, user.verification)
{
study.table <- sapply(qc.study.list,function(study)
return(c('File Name' = basename(study$file.path),
'Lines (including header)'= study$file.line.count,
'missing items (first 100 lines)'= study$file.header.na,
'Missing Columns'= paste(study$missing.Columns,collapse = ' | ')
)
)
)
if(is.null(ncol(study.table)))
print_and_log('Error in filenames. check if filename contains space or dash or another uknown character.','fatal')
study.table <- cbind(seq(1:ncol(study.table)),t(study.table))
colnames(study.table)[1] <- '#'
print_and_log(kable(study.table,format = "rst"),
'info',
cat= FALSE)
if(user.verification)
{
# warn user that existing files will be overwritten if output folder containts files that match the input file pattern
if(.QC$config$new_items$non.empty.output.folder)
warning('existing files in the output folder will be overwritten!!!')
if(menu(c("Yes", "No"),
title=sprintf('Do you want to continue (1 = \'Yes\' | 2 = \'No\' )?'))
== 2)
runStopCommand('QC ended by user!')
}
}
variable_statistics_pre_matching <- function(input.data)
{
# count alleles for report
.QC$thisStudy$tables$EFFECT_ALL.tbl <- input.data[VT == 1,.(.N),keyby = EFFECT_ALL] ## count alleles
.QC$thisStudy$tables$OTHER_ALL.tbl <- input.data[VT == 1,.(.N),keyby = OTHER_ALL] ## count alleles
}
variable_statistics_post_matching <- function(input.data)
{
# pallindromics
.QC$thisStudy$palindromic.rows <- input.data[palindromic ==TRUE,.N]
.QC$thisStudy$non.palindromic.rows <- nrow(input.data) - .QC$thisStudy$palindromic.rows
if(is.element('CHR' , names(input.data))){
.QC$thisStudy$tables$CHR.tbl <- input.data[,.(.N),keyby = CHR] ## count variants per chromosome
### looking for missing chromosomes
chr_range = range(as.numeric(.QC$thisStudy$tables$CHR.tbl$CHR),na.rm = T)
chr_range = if(chr_range[2] <= 23)
seq(chr_range[1]:23)
else
seq(chr_range[1]:chr_range[2])
.QC$thisStudy$missing_chromosomes <- which(chr_range %notin% .QC$thisStudy$tables$CHR.tbl$CHR)
###################################
.QC$thisStudy$tables$CHR.tbl$CHR <- as.character(.QC$thisStudy$tables$CHR.tbl$CHR)
.QC$thisStudy$tables$CHR.tbl[is.na(CHR), CHR:= 'missing']
.QC$thisStudy$tables$CHR.tbl$N <- thousand_sep(.QC$thisStudy$tables$CHR.tbl$N)
}else{
.QC$thisStudy$tables$CHR.tbl <- NA
.QC$thisStudy$tables$CHR.tbl$CHR <- ''
.QC$thisStudy$tables$CHR.tbl$N <- '0'
}
# count alleles after flipping and switching and compare to pre-match allele count
# for report
.QC$thisStudy$tables$EFFECT_ALL.post.matching.tbl <- input.data[VT == 1,.(.N),keyby = EFFECT_ALL]
.QC$thisStudy$tables$OTHER_ALL.post.matching.tbl <- input.data[VT == 1,.(.N),keyby = OTHER_ALL] ## count alleles
if('IMPUTED' %in% colnames(input.data))
{
.QC$thisStudy$tables$imputed.tbl <- input.data[,.(.N),keyby = IMPUTED] ## count variants per chromosome
.QC$thisStudy$tables$imputed.tbl$IMPUTED <- as.character(.QC$thisStudy$tables$imputed.tbl$IMPUTED)
.QC$thisStudy$tables$imputed.tbl[is.na(IMPUTED) , IMPUTED := 'missing/invalid']
.QC$thisStudy$tables$imputed.tbl[IMPUTED == 0 , IMPUTED := 'genotyped']
.QC$thisStudy$tables$imputed.tbl[IMPUTED == 1 , IMPUTED := 'imputed']
}
##
## =================
existing.columns <- intersect(c('PVALUE','HWE_PVAL','CALLRATE','EFF_ALL_FREQ','IMP_QUALITY','EFFECT','STDERR') ,
names(input.data))
.QC$thisStudy$tables$variable.summary <- cbind(sapply(input.data[,existing.columns,with= FALSE],
function(x) return(summary(x)[1:6])))
.QC$thisStudy$tables$variable.summary <- formatC(.QC$thisStudy$tables$variable.summary, format = 'g')
# change effect column name to BETA or LN(OR)
colnames(.QC$thisStudy$tables$variable.summary)[colnames(.QC$thisStudy$tables$variable.summary) == 'EFFECT'] <- .QC$config$input_parameters$effect_type_string
## get frequency table for multi-allelic variants
.QC$thisStudy$tables$multi_allele_count_preProcess <- getMultiAlleleCountTbl(input.data)
# variables that are found in standard reference file
.QC$thisStudy$found.rows.std <- input.data[match_result != 9L & SOURCE == 'Std_ref' , .N]
.QC$thisStudy$switched.rows.std <- input.data[match_result == 3L & SOURCE == 'Std_ref' , .N]
.QC$thisStudy$flipped.rows.std <- input.data[match_result == 2L & SOURCE == 'Std_ref' , .N]
# variables that are found in alternate reference file
.QC$thisStudy$found.rows.alt <- input.data[match_result != 9L & SOURCE != 'Std_ref' , .N]
.QC$thisStudy$switched.rows.alt <- input.data[match_result == 3L & SOURCE != 'Std_ref' , .N]
.QC$thisStudy$flipped.rows.alt <- input.data[match_result == 2L & SOURCE != 'Std_ref' , .N]
# variables that are not found in standard reference file
.QC$thisStudy$not.found.rows.std <- nrow(input.data) - .QC$thisStudy$found.rows.std
# variables that are not found in either standard or alternate reference file
.QC$thisStudy$not.found.rows.alt <- nrow(input.data) - .QC$thisStudy$found.rows.std - .QC$thisStudy$found.rows.alt
}
variable_statistics_post_matching_HQ <- function(input.data)
{
existing.columns <- intersect(c('PVALUE','HWE_PVAL','CALLRATE','EFF_ALL_FREQ','IMP_QUALITY','EFFECT','STDERR') ,
names(input.data))
tbl <- cbind(sapply(input.data[HQ == 1,existing.columns,with= FALSE],
function(x) return(summary(x)[1:6])))
tbl <- formatC(tbl, format = 'g')
# change effect column name to BETA or LN(OR)
colnames(tbl)[colnames(tbl) == 'EFFECT'] <- .QC$config$input_parameters$effect_type_string
return(tbl)
}
addEmptyStudy_studyInstance <- function(study)
{
faultyStudy <- new("Study")
faultyStudy@File$file.path <- study$file.path
faultyStudy@Successful_run <- FALSE
faultyStudy@Counts$input.data.rowcount <- study$input.data.rowcount
faultyStudy@Counts$rowcount.step1 <- study$rowcount.step1
faultyStudy@Counts$rowcount.step3 <- study$rowcount.step3
faultyStudy@Counts$found.rows <- NA
faultyStudy@Correlations$AFcor.std_ref <- NA
faultyStudy@Correlations$PVcor <- NA
faultyStudy@Statistics$lambda <- NA
.QC$StudyList@studyList <- append(.QC$StudyList@studyList , faultyStudy)
.QC$StudyList@studyCount <- length(.QC$StudyList@studyList)
}
addEmptyStudy_pathOnly <- function(study.path)
{
faultyStudy <- new("Study")
faultyStudy@File$file.path <- study.path
faultyStudy@Successful_run <- FALSE
faultyStudy@Counts$input.data.rowcount <- NA
faultyStudy@Counts$rowcount.step1 <- NA
faultyStudy@Counts$rowcount.step3 <- NA
faultyStudy@Counts$found.rows <- NA
faultyStudy@Correlations$AFcor.std_ref <- NA
faultyStudy@Correlations$PVcor <- NA
faultyStudy@Statistics$lambda <- NA
.QC$StudyList@studyList <- append(.QC$StudyList@studyList , faultyStudy)
.QC$StudyList@studyCount <- length(.QC$StudyList@studyList)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.