R/studyFunctions.R
In GWASinspector: Comprehensive and Easy to Use Quality Control of GWAS Results

#### added columns to dataset
# found	, whether found in any reference files
# palindromic
# match	, no nedd to flip or switch
# flip
# switch
# wrong	, wrong variant based on alleles and matching with references
# highDiffEAF
# HQ
# PVALUE.calculated

process_each_file <- function(study){


  print_and_log(sprintf('============== [ File %s from %s ] ==============',
                        .QC$file.counter,
                        length(.QC$qc.study.list)),
                'info',
                cat = TRUE)





  ## processing crucial columns and creating report variables
  config <- .QC$config

  # study variable contains all variables and paths
  .QC$thisStudy <- study

  .QC$thisStudy$starttime <- Sys.time()

  .QC$thisStudy$number <- .QC$file.counter
  .QC$thisStudy$effect_type_string <- .QC$config$input_parameters$effect_type_string

  .QC$file.counter <- .QC$file.counter + 1 ## counter that displays how many files are processed

  #print_and_log(mem_used(),'info',cat= FALSE)

  ### uplolading and processing file
  ## ==============================================
  input.data <- uploadInputFile()

  #print_and_log(mem_used(),'info',cat= FALSE)


  # return null if file could not be read
  #====================================
  if(is.null(input.data)){
    print_and_log('File removed from QC analysis due to error in loading!','warning')
    addEmptyStudy_studyInstance(.QC$thisStudy)
    return(NULL)
  }

  # invisible(gc())


  # remove duplicate lines of data
  # study$dup_lines_count is set
  #TODO ignored because it is very time consuming
  #input.data <- removeDuplicatedLines(input.data)



  # IMPORTANT: variants may be removed in 2 steps
  # 1- missing crucial variables
  # 2- monomorphic, duplicate, chromosomes
  ## ==============================================

  input.data <- processInputFile(input.data)
  #print_and_log(mem_used(),'info',cat= FALSE)

  # return null if error encountered
  #====================================
  if(is.null(input.data)){
    print_and_log('File removed from QC analysis due to error in processing!','warning')
    addEmptyStudy_studyInstance(.QC$thisStudy)
    return(NULL)
  }

  # variable_statistics_pre_matching(input.data) moved to process_matched_data()

  # invisible(gc())



  ### matching with reference dataset
  ## ==============================================
  print_and_log('Comparing input file with reference file (this might take long) ...','info')
  input.data<-tryCatch(compareInputfileWithReferenceData(input.data),
                       error = function(err) {
                         print_and_log(paste('Error in comparing process:' , err$message), 'warning')
                         return(NULL)
                       }
  )

  #====================================
  if(is.null(input.data)){
    print_and_log('File removed from QC analysis due to error in comparing process!','warning')
    addEmptyStudy_studyInstance(.QC$thisStudy)
    return(NULL)
  }

  # invisible(gc())
  #print_and_log(mem_used(),'info',cat= FALSE)


  ## allele matching
  ## ==============================================
  print_and_log('Allele matching vs Allele Freq reference dataset ...','info')

  input.data <- tryCatch(input.data[,c("match_result","palindromic") :=
                                      variant_match(EFFECT_ALL,OTHER_ALL,ALT,REF,VT),
                                    by = list(EFFECT_ALL,OTHER_ALL,ALT,REF,VT)],
                         error = function(err) {
                           print_and_log(paste('Error in allele matching:' , err$message), 'warning')
                           return(NULL)
                         }
  )

  #====================================
  if(is.null(input.data)){
    print_and_log('File removed from QC analysis due to error in allele matching!','warning')
    addEmptyStudy_studyInstance(.QC$thisStudy)
    return(NULL)
  }

  # invisible(gc())
  #print_and_log(mem_used(),'info',cat= FALSE)



  ## flip and switch and analyzing
  # IMPORTANT: variants may be removed in 1 step
  # 3- mismatched variants
  ## ==============================================
  print_and_log('Data processing ...','info')
  input.data <- tryCatch(process_matched_data(input.data),
                         error = function(err) {
                           print_and_log(paste('Error in variant processing:' , err$message), 'warning')
                           return(NULL)
                         }
  )

  #====================================
  if(is.null(input.data)){
    print_and_log('File removed from QC analysis due to error in variant processing!','warning')
    addEmptyStudy_studyInstance(.QC$thisStudy)
    return(NULL)
  }


  # invisible(gc())



  ## Calculations
  ## ==============================================
  ## calculate Pvalue from stderr and effect
  input.data <- calculate_PVALUE(input.data) #pValueFunctions.R

  ## calculate correlation between the 2 Pvalues
  calculate_pvalue_correlation(input.data)

  # calcualte AF correlation between study and std ref
  calculate_af_correlation_std_ref(input.data) #calculationFunctions.R

  # calcualte AF correlation between study and alt ref
  if(!is.na(config$supplementaryFiles$allele_ref_alt))
    calculate_af_correlation_alt_ref(input.data) #calculationFunctions.R

  # selcet high quality variants based on 4 filters
  input.data <- applyHQfilter(input.data) #calculationFunctions.R


  # generate distribution statistcis for HQ variants
  # summary of important columns
  if(.QC$thisStudy$HQ.count == .QC$thisStudy$rowcount.step3)
  {
    .QC$thisStudy$tables$variable.summary.HQ <- .QC$thisStudy$tables$variable.summary
  }
  else if(.QC$thisStudy$HQ.count > 0)
  {
    .QC$thisStudy$tables$variable.summary.HQ <- variable_statistics_post_matching_HQ(input.data)
  }

  ##TODO  outliers are defined as skewness > 0.1 or < -0.1, or kurtosis > 10
  calculateSkewness(input.data)
  calculateSkewness_HQ(input.data)
  calculateKurtosis(input.data)
  calculateKurtosis_HQ(input.data)
  calculateVischerStats(input.data)
  calculateVischerStats_HQ(input.data)
  calculateLambda(input.data)




  ### ===================== variables for multifile comparison ==============
  ## --------- effect plot for box plot -------------
  eff.col <- as.data.table(input.data[HQ==TRUE]$EFFECT)
  ##------ variables for precision plot ---------
  ## ============================================
  .QC$thisStudy$STDERR.mean.HQ <- mean(input.data[HQ == TRUE]$STDERR,na.rm = TRUE)


  df <- data.frame(
    x = 1,
    y0 = min(eff.col$V1),
    y25 = stats::quantile(eff.col$V1, 0.25),
    y50 = stats::median(eff.col$V1),
    y75 = stats::quantile(eff.col$V1, 0.75),
    y100 = max(eff.col$V1)
  )

  # upper whisker = largest observation less than or equal to upper hinge + 1.5 * IQR
  y_upper = df$y75 + 1.5 * stats::IQR(eff.col$V1)

  #lower whisker = smallest observation greater than or equal to lower hinge - 1.5 * IQR
  y_lower = df$y25 - 1.5 * stats::IQR(eff.col$V1)

  #TODO maybe show values inside plot
  .QC$thisStudy$effect.plot.df <- df
  .QC$thisStudy$effect.plot.df_y_upper <- y_upper
  .QC$thisStudy$effect.plot.df_y_lower <- y_lower

  # generate the plot usin generateEffectSizePlot() function instead of saving the plot
  # file.number = .QC$thisStudy$number
  #
  # if("N_CASES" %in% .QC$thisStudy$renamed.File.Columns)
  # {
  #   file.N.max = .QC$thisStudy$MAX_N_CASES
  #   print_and_log("N_CASES will be used for MAX_N value.")
  # }
  # else
  #   file.N.max = .QC$thisStudy$MAX_N_TOTAL
  #
  #
  #
  # .QC$thisStudy$effect.plot = ggplot(df, aes(x)) +
  #   geom_boxplot(
  #     aes(ymin = y0, lower = y25, middle = y50, upper = y75, ymax = y100),
  #     stat = "identity") +
  #   geom_errorbar(aes(ymin=y_lower, ymax=y_upper), width=0.6,
  #                 position=position_dodge(.9)) +
  #   labs(x= file.number ,y="effect size",subtitle = sprintf('N = %s', file.N.max)) +
  #   theme_classic(base_size = 8)+
  #   coord_cartesian(ylim = c(-0.6,0.6)) +
  #   geom_hline(yintercept = 0.1,linetype = 2,color='red') +
  #   geom_hline(yintercept = -0.1,linetype = 2,color='red') +
  #   theme(axis.text.x=element_blank())


  ### ===================== END of multi study variables =====================


  ## updating alternate refrence file
  ## ==============================================
  #cat('\n--- [evaluating alternate reference ...] ---',fill=TRUE)
  if(!is.na(config$supplementaryFiles$allele_ref_alt))
    update_alternate_reference(input.data)



  ## data cleaning 1
  ## ==============
  rm(eff.col) # was used for eff.plot

  # added while checking with reference sets
  input.data[, CHR.y := NULL]
  input.data[, POS := NULL]
  input.data[, ALT := NULL]
  input.data[, REF := NULL]
  # input.data[, AF := NULL] required for alele freq plot and correlation calculation
  input.data[, DATE_ADDED := NULL]
  # input.data[, SOURCE := NULL] required for alele freq plot and correlation calculation

  # added while matching with refrence sets
  input.data[, match := NULL]
  input.data[, flip := NULL]
  input.data[, switch := NULL]
  input.data[, wrong := NULL]

  # invisible(gc())
  #print_and_log(mem_used(),'info',cat= FALSE)





  ## 6- plots
  ## ==============================================
  print_and_log('Plotting ...','info')
  drawPlots(input.data)

  # invisible(gc())


  ## ==============================================
  print_and_log('Saving data set ...','info')
  ## missing pvalues set from calculated pvalues
  input.data <-fill_missing_pvalues_from_calculated_pvalues(input.data)


  saveDataSet_final(input.data) #'saveFilesFunctions.R'



  # invisible(gc())
  #print_and_log(mem_used(),'info',cat= FALSE)


  ## data cleaning 2
  ## ==============
  # these items contain long list of missing and invalid variant indexes
  # keeps huge memory
  # not required after creating the report
  .QC$thisStudy$column.NA.list <- lapply(.QC$thisStudy$column.NA.list, length)
  .QC$thisStudy$column.INVALID.list <- lapply(.QC$thisStudy$column.INVALID.list, length)


  .QC$thisStudy$endtime <- Sys.time()

  # save txt report file
  report_to_txt_file(.QC$thisStudy)




  ## Compare Beta(Effect) values with refrence set and draw plots
  # =============================================================


  if(nrow(.QC$reference.data.effect) > 0 ){

    print_and_log('Comparing input file with Effect-Size reference file ...','info')
    input.data<-tryCatch(compareInputfileWithBetaReferenceFile(input.data),
                         error = function(err) {
                           print_and_log(paste('Error in comparing with Effect-Size reference:' , err$message), 'warning')
                           return(data.table())
                         }
    )

    # only variants that could be matched are returned.

    # gc not done in previous function
    # invisible(gc())

    ## allele matching for effect size plot
    ## ==============================================
    if(nrow(input.data) > 0){



      # THIS STEP IS NOT REQUIRED, BECAUSE DATA IS ALREADY MATCHED WITH REFERENCE DATASET
      # print_and_log('Allele matching vs Effect-Size reference dataset ...','info')
      # input.data <- tryCatch(allele.match.effectPlot(input.data),
      #                        error = function(err) {
      #                          print_and_log(paste('Error in allele matching:' , err$message), 'warning')
      #                          return(NULL)
      #                        }
      # )




      if(config$plot_specs$make_plots & !is.null(input.data))
        tryCatch(plot_DataEFFECT_vs_RefEFFECT(input.data,
                                              .QC$thisStudy$effPlotPath,
                                              .QC$thisStudy$plot.title),
                 error = function(err) {
                   print_and_log(paste('Error plotting effect-size correlation plot:' , err$message), 'warning')
                   return(NULL)
                 }
        )


    }else
    {
      print_and_log('No variants were found in Effect-size reference dataset!','warning')
      print_and_log('Effect-size comparison plot is skipped!','warning')
      .QC$thisStudy$effect.rho_4 <- 'NA (no variants were found)'
    }


    # write(x = '\n\n[Comparing Input File with Effect-size reference dataset (HQ variants)]',
    #     file = .QC$thisStudy$txt.report.path ,
    #     sep =  '\n',
    #     append = TRUE)

    writeTXTreport('\n\nComparing Input File with Effect-size reference dataset (HQ variants)')
    writeTXTreport(kable(.QC$thisStudy$tables$betaCor.tbl, align = "l",format = "rst"))

    writeTXTreport('\n* Data is presented as r(N). Variants were filtered on reference data P-values. ')
    writeTXTreport('** Data is presented as r(N). Variants were filtered on input result file P-values. ')

#     write(x = paste('r (P-value < 0.001) =' ,  .QC$thisStudy$effect.rho_3),
#         file = .QC$thisStudy$txt.report.path ,
#         sep =  '\n',
#         append = TRUE)
#
#     write(x = paste('r (P-value < 0.0001) =' ,  .QC$thisStudy$effect.rho_4),
#         file = .QC$thisStudy$txt.report.path ,
#         sep =  '\n',
#         append = TRUE)
#
#     write(x = paste('r (P-value < 0.00001) =' ,  .QC$thisStudy$effect.rho_5),
# 	    file = .QC$thisStudy$txt.report.path ,
# 	    sep =  '\n',
# 	    append = TRUE)
#
#     write(x = paste('r (P-value < 0.000001) =' ,  .QC$thisStudy$effect.rho_6),
# 	    file = .QC$thisStudy$txt.report.path ,
# 	    sep =  '\n',
# 	    append = TRUE)

  }

  ## save significant variants ( p-value  < 1e-8  & HQ & rsID) ==> CHR-POS-MARKER-PVALUE
  #save_significant_variants(input.data)


  # save rds file
  studyClass <- create_Study(.QC$thisStudy)
  # .QC$StudyList <- append( .QC$StudyList , studyClass)
  .QC$StudyList@studyList <- append(.QC$StudyList@studyList , studyClass)
  .QC$StudyList@studyCount <- length(.QC$StudyList@studyList)

  save_rds_file(.QC$thisStudy)

  rm(input.data)
  # invisible(gc())

  ## ended
  ##==============
  print_and_log('\n','info')

  ## add info to text report
  writeTXTreport("\n==============================================")
  writeTXTreport(sprintf('Generated by %s package - v.%s',.QC$package.name, .QC$script.version))
  writeTXTreport(as.character(Sys.time()))

  return(.QC$thisStudy)
}






create_file_specific_config <- function(file.name){

  ### these varaibles are used for each file that is QC'ed ###

  # study$starttime
  # study$endtime
  #
  #
  # study$AFcor.std_ref
  # study$AFcor.non.palindromic.std_ref
  # study$AFcor.palindromic.std_ref

  # study$AFcor.alt_ref
  # study$AFcor.non.palindromic.alt_ref
  # study$AFcor.palindromic.alt_ref


  # study$duplicate.count
  # study$HQ.count
  # study$input.data.rowcount
  # study$kurtosis
  # study$kurtosis.HQ
  # study$lambda
  # study$lambda.gen
  # study$lambda.imp
  # study$LQ.count
  # study$missing.crucial.rowcount
  # study$multiAlleleVariants.rowcount
  # study$missing.alleles.rowcount
  # study$monomorphic.count
  # study$neg.strand.count
  # study$PVcor
  # study$PVcor.palindromic
  # study$skewness
  # study$skewness.HQ
  # study$Visschers.stat
  # study$Visschers.stat.HQ
  # Study$MAX_N_TOTAL
  # study$hasINDEL
  # study$hID.added
  # check if there are characters in chromosome column.
  # they are converted to number and should be deconverted before saving final dataset
  # study$character.chromosome

  # if study has none base character as alleles => so they sholod not be regarded as mismatches
  # study$hanNoneBaseAlleles

  # study$column.INVALID.list$CALLRATE
  # study$column.INVALID.list$CHR
  # study$column.INVALID.list$EFF_ALL_FREQ
  # study$column.INVALID.list$EFFECT
  # study$column.INVALID.list$EFFECT_ALL
  # study$column.INVALID.list$HWE_PVAL
  # study$column.INVALID.list$IMP_QUALITY
  # study$column.INVALID.list$IMPUTED
  # study$column.INVALID.list$minusone.CALLRATE
  # study$column.INVALID.list$minusone.EFF_ALL_FREQ
  # study$column.INVALID.list$minusone.HWE_PVAL
  # study$column.INVALID.list$minusone.PVALUE
  # study$column.INVALID.list$N_TOTAL
  # study$column.INVALID.list$one.EFF_ALL_FREQ
  # study$column.INVALID.list$OTHER_ALL
  # study$column.INVALID.list$POSITION
  # study$column.INVALID.list$PVALUE
  # study$column.INVALID.list$STDERR
  # study$column.INVALID.list$zero.EFF_ALL_FREQ
  # study$column.INVALID.list$zero.STDERR
  #
  #
  #
  #
  # study$column.NA.list$CALLRATE
  # study$column.NA.list$CHR
  # study$column.NA.list$EFF_ALL_FREQ
  # study$column.NA.list$EFFECT
  # study$column.NA.list$EFFECT_ALL
  # study$column.NA.list$HWE_PVAL
  # study$column.NA.list$IMP_QUALITY
  # study$column.NA.list$IMPUTED
  # study$column.NA.list$MARKER
  # study$column.NA.list$N_TOTAL
  # study$column.NA.list$OTHER_ALL
  # study$column.NA.list$POSITION
  # study$column.NA.list$PVALUE
  # study$column.NA.list$STDERR
  # study$column.NA.list$STRAND
  #
  #
  #
  # study$tables$CHR.tbl
  # study$tables$VT.tbl
  # study$tables$EFFECT_ALL.tbl
  # study$tables$imputed.tbl
  # study$tables$OTHER_ALL.tbl
  # study$tables$match.ref.table  whcih ref is used for matching
  # study$tables$VT.ref.table     how many variants of each type and multi allelic or not


  ## multi file comparison
  # study$STDERR.mean.HQ
  # study$N.max
  # study$effect.plot

  # title if the plots . it is either the name of input file or set by user
  # it is automatrically set as file name if there are multiple files
  # study$plot.title


  study <- c()
  config <- .QC$config

  file.name <- as.character(file.name[[1]])

  study$file.path <- file.name

  study$file.extension <- file_ext(file.name)

  study$zipped.File <- FALSE

  if(study$file.extension %in% c('gz','zip','bz2'))
    study$zipped.File <- TRUE



  ##-- getting filename without extensions
  study$file.name <- tools::file_path_sans_ext(basename(file.name))

  # remove the extension before gz
  #if(study$file.extension == 'gz' && any(endsWith(x =  study$file.name,suffix = c('.txt','.csv')))) ## replaced to be compatible with older R
  if(study$file.extension %in% c('gz','bz2') && any( grepl("(txt|csv)$", x = study$file.name)))
  {
    study$file.name <- sub(x= study$file.name , pattern = '\\.(\\w{3})$', replacement = '')
  }


  ###------
  print_and_log(sprintf("Checking Study file : '%s'",file.name),
                'info')

  # get_file_line_count if file is not zipped
  # if(config$test.run) # do not count the lines if it is a test run
  #   study$file.line.count <- 'NA (test run)'
  # else if(study$zipped.File)  # do not count the lines if it is zipped
  #   study$file.line.count <- 'NA (zipped file)'
  # else
  #   study$file.line.count <- get_file_line_count(file.name)

  # get_file_line_count if file is not zipped
  if(study$file.extension != 'zip') # do not count the lines if it is a test run
  {
    fileInspection <- get_file_line_count_RUtils(file.name)
    study$file.line.count <- fileInspection[1]
    study$file.endsWithNewLine <- fileInspection[2]
  }


  ###### ==== file names ==== ##########
  # this prefix is put infront of all files. it does not have any extension
  # /home/user/  +  QC  +  _  +  studyfile
  files.prefix <- sprintf('%s/%s_%s',
                          config$paths$dir_output,
                          config$paths$filename_output_tag,
                          study$file.name)



  ## add plot paths
  study$manPlotPath<-paste0(files.prefix, '_graph_M', .QC$img.extension)


  study$histPlotPath<-paste0(files.prefix, '_graph_histogram' , .QC$img.extension)


  study$stdMafPlotPath<-paste0(files.prefix, '_graph_EAF_SR' , .QC$img.extension)

  study$stdMafSmPlotPath<-paste0(files.prefix, '_graph_EAF_SR_sm' , .QC$img.extension)

  study$altMafPlotPath<-paste0(files.prefix, '_graph_EAF_AR' , .QC$img.extension)


  study$pvalCorPlotPath<-paste0(files.prefix, '_graph_p_correlation' , .QC$img.extension)

  study$pvalCorSmPlotPath<-paste0(files.prefix, '_graph_p_correlation_sm' , .QC$img.extension)


  study$QQPlotPath<-paste0(files.prefix, '_graph_QQ' , .QC$img.extension)

  study$effPlotPath<-paste0(files.prefix, '_beta' , .QC$img.extension)

  study$plot.title <- ifelse(length(config$paths$input_files) > 1 | config$plot_specs$plot_title == 'none',
                             study$file.name,
                             config$plot_specs$plot_title)

  ## add dataset paths

  #variant with missing crucial values
  study$SNPs_invalid.path<-paste(files.prefix, 'vars_invalid_allele.txt' , sep='_')


  #variant with invalid allele values
  study$SNPs_removed.path<-paste(files.prefix, 'vars_removed.txt' , sep='_')

  #variant with invalid both allele values
  study$SNPs_invalid_both.path<-paste(files.prefix, 'vars_invalid_alleles.txt' , sep='_')

  #variant with missing non-crucial values
  study$SNPs_improbable_values.path<-paste(files.prefix, 'vars_improbable_values.txt' , sep='_')

  # duplicate variants
  study$SNPs_duplicates.path<-paste(files.prefix, 'vars_duplicates.txt' , sep='_')
  study$SNPs_duplicates_postMatch.path<-paste(files.prefix, 'vars_duplicates_post_match.txt' , sep='_')

  # mismatched bi-allelic variants
  study$SNPs_mismatches.path<-paste(files.prefix, 'vars_mismatches_BA.txt' , sep='_')

  # monomorphic variants
  study$SNPs_monomorphic.path<-paste(files.prefix, 'vars_monomorphic.txt' , sep='_')

  # multi_allelic variants
  # vairants that are found in reference but cannot be matched because of many variants on the same position
  # mismatched multi-allelic variants
  study$SNPs_multi_allelic.path<-paste(files.prefix, 'vars_mismatches_MA.txt' , sep='_')

  # duplicated match variants
  study$SNPs_ambiguous.path<-paste(files.prefix, 'vars_ambiguous.txt' , sep='_')

  # significant variants
  study$SNPs_significant.path<-paste(files.prefix, 'vars_significant.txt' , sep='_')


  # cleaned output file
  study$output.path<-paste(files.prefix, '.txt' , sep='')


  # effect-size reference dataset path
  study$effect_size_ref.output.path<-paste(files.prefix, 'effect-size_ref.rds' , sep='_')


  # html report file
  study$html.report.path<-paste(files.prefix, 'report.html' , sep='_')

  # txt report file
  study$txt.report.path<-paste(files.prefix, 'report.txt' , sep='_')

  # rdata object file
  study$rds.study.rds.path<-paste(files.prefix, 'object.rds' , sep='_')



  ## ====== column variables ======== ##

  study$original.File.Columns <- character(0)
  study$original.File.Columns.sorted <- character(0)
  study$renamed.File.Columns.sorted  <- character(0)
  study$wanted.columns.index <- character(0)
  study$renamed.File.Columns.classes <- character(0)
  study$missing.Columns <- 'none'

  study$missing.PVALUE.column <- FALSE



  study$hasINDEL <- FALSE
  study$hID.added <- FALSE
  study$hanNoneBaseAlleles <- FALSE
  study$HQ.count  <- 0
  study$LQ.count  <- 0

  study$input.data.rowcount <- 0 # originla file row number - before any processing
  study$rowcount.step1 <- 0      # after processing crucial columns
  study$rowcount.step2 <- 0      # aftere removing duplicates, monomorphics and chromosal variants
  study$rowcount.step3 <- 0      # aftere

  study$missing.crucial.rowcount<- 0
  study$missing.alleles.rowcount <- 0
  study$neg.strand.count<- 0

  study$monomorphic.count <- 0
  study$duplicate.count<- 0


  study$found.rows <- 0
  study$mismatched.rows<- 0
  study$ambiguos.rows<- 0
  study$not.found.rows <- 0
  study$switched.rows <- 0
  study$flipped.rows <- 0
  study$palindromic.rows <- 0
  study$non.palindromic.rows<- 0

  study$skewness <- 'NA'
  study$skewness.HQ <- 'NA'
  study$Visschers.stat <- 'NA'
  study$Visschers.stat.HQ <- 'NA'
  study$MAX_N_TOTAL <- 'NA'


  study$tables$imputed.tbl <-  as.data.table(matrix(NA,2,2))
  study$tables$multi_allele_count_preProcess <- data.table()
  study$tables$multi_allele_count_postProcess <- data.table()
  study$tables$variable.summary.HQ <- data.table()

  colnames(study$tables$imputed.tbl) <- c('IMPUTED','N')
  study$tables$imputed.tbl$IMPUTED[1] <- 'Imputed'
  study$tables$imputed.tbl$IMPUTED[2] <- 'Genotyped'

  study$lambda <- 0
  study$lambda.gen <- 'not available'
  study$lambda.imp <- 'not available'

  study$PVcor <- 'not available (missing P-value column)'
  study$PVcor.palindromic <- 'not available (missing P-value column)'
  study$rownum.PVcor <- 0


  study$fixed.hwep <- 'NA'
  study$fixed.callrate <- 'NA'
  study$fixed.n_total <- 'NA'
  study$fixed.impq <- 'NA'

  study$AFcor.std_ref <- 'NA'
  study$AFcor.non.palindromic.std_ref <- 'NA'
  study$AFcor.palindromic.std_ref <- 'NA'
  study$AFcor.std_ref.indel <- 'NA'
  study$AFcor.std_ref.CHR <- 'NA'


  study$AFcor.alt_ref <- 'NA'
  study$AFcor.non.palindromic.alt_ref <- 'NA'
  study$AFcor.palindromic.alt_ref <- 'NA'
  study$AFcor.alt_ref.indel <- 'NA'

  study$character.chromosome <- FALSE

  ## multi file comparison
  study$STDERR.mean.HQ <- 0
  study$effect.rho_4 <- 'NA (not calculated)'
  # study$column.INVALID.list$CALLRATE <- numeric(length = 0L)
  # study$column.INVALID.list$CHR <- numeric(length = 0L)
  # study$column.INVALID.list$EFF_ALL_FREQ <- numeric(length = 0L)
  # study$column.INVALID.list$EFFECT <- numeric(length = 0L)
  # study$column.INVALID.list$EFFECT_ALL <- numeric(length = 0L)
  # study$column.INVALID.list$HWE_PVAL <- numeric(length = 0L)
  # study$column.INVALID.list$IMP_QUALITY <- numeric(length = 0L)
  # study$column.INVALID.list$IMPUTED <- numeric(length = 0L)
  # study$column.INVALID.list$N_TOTAL <- numeric(length = 0L)
  # study$column.INVALID.list$OTHER_ALL <- numeric(length = 0L)
  # study$column.INVALID.list$POSITION <- numeric(length = 0L)
  # study$column.INVALID.list$PVALUE <- numeric(length = 0L)
  # study$column.INVALID.list$STDERR <- numeric(length = 0L)

  study$column.INVALID.list$minusone.CALLRATE <- numeric(length = 0L)
  study$column.INVALID.list$zero.EFF_ALL_FREQ <- numeric(length = 0L)
  study$column.INVALID.list$one.EFF_ALL_FREQ <- numeric(length = 0L)
  study$column.INVALID.list$minusone.EFF_ALL_FREQ <- numeric(length = 0L)
  study$column.INVALID.list$minusone.HWE_PVAL <- numeric(length = 0L)
  study$column.INVALID.list$minusone.PVALUE <- numeric(length = 0L)
  study$column.INVALID.list$zero.STDERR <- numeric(length = 0L)


  study$column.INVALID.list$CALLRATE = numeric(length = 0L)
  study$column.INVALID.list$CHR = numeric(length = 0L)
  study$column.INVALID.list$EFF_ALL_FREQ = numeric(length = 0L)
  study$column.INVALID.list$EFFECT = numeric(length = 0L)
  study$column.INVALID.list$EFFECT_ALL = numeric(length = 0L)
  study$column.INVALID.list$HWE_PVAL = numeric(length = 0L)
  study$column.INVALID.list$IMP_QUALITY = numeric(length = 0L)
  study$column.INVALID.list$IMPUTED = numeric(length = 0L)
  study$column.INVALID.list$MARKER = numeric(length = 0L)
  study$column.INVALID.list$N_TOTAL = numeric(length = 0L)
  study$column.INVALID.list$OTHER_ALL = numeric(length = 0L)
  study$column.INVALID.list$BOTH_ALL = numeric(length = 0L)
  study$column.INVALID.list$POSITION = numeric(length = 0L)
  study$column.INVALID.list$PVALUE = numeric(length = 0L)
  study$column.INVALID.list$STDERR = numeric(length = 0L)
  study$column.INVALID.list$STRAND = numeric(length = 0L)
  study$column.NA.list$CALLRATE = numeric(length = 0L)
  study$column.NA.list$CHR = numeric(length = 0L)
  study$column.NA.list$EFF_ALL_FREQ = numeric(length = 0L)
  study$column.NA.list$EFFECT = numeric(length = 0L)
  study$column.NA.list$EFFECT_ALL = numeric(length = 0L)
  study$column.NA.list$HWE_PVAL = numeric(length = 0L)
  study$column.NA.list$IMP_QUALITY = numeric(length = 0L)
  study$column.NA.list$IMPUTED = numeric(length = 0L)
  study$column.NA.list$MARKER = numeric(length = 0L)
  study$column.NA.list$N_TOTAL = numeric(length = 0L)
  study$column.NA.list$OTHER_ALL = numeric(length = 0L)
  study$column.NA.list$POSITION = numeric(length = 0L)
  study$column.NA.list$PVALUE = numeric(length = 0L)
  study$column.NA.list$STDERR = numeric(length = 0L)
  study$column.NA.list$STRAND = numeric(length = 0L)

  # study$column.NA.list$CALLRATE <- numeric(length = 0L)
  # study$column.NA.list$CHR <- numeric(length = 0L)
  # study$column.NA.list$EFF_ALL_FREQ <- numeric(length = 0L)
  # study$column.NA.list$EFFECT <- numeric(length = 0L)
  # study$column.NA.list$EFFECT_ALL <- numeric(length = 0L)
  # study$column.NA.list$HWE_PVAL <- numeric(length = 0L)
  # study$column.NA.list$IMP_QUALITY <- numeric(length = 0L)
  # study$column.NA.list$IMPUTED <- numeric(length = 0L)
  # study$column.NA.list$MARKER <- numeric(length = 0L)
  # study$column.NA.list$N_TOTAL <- numeric(length = 0L)
  # study$column.NA.list$OTHER_ALL <- numeric(length = 0L)
  # study$column.NA.list$POSITION <- numeric(length = 0L)
  # study$column.NA.list$PVALUE <- numeric(length = 0L)
  # study$column.NA.list$STDERR <- numeric(length = 0L)
  # study$column.NA.list$STRAND <- numeric(length = 0L)

  # specific chromosome count in file
  study$x.chr.count.removed <- 0
  study$y.chr.count.removed <- 0
  study$xy.chr.count.removed <- 0
  study$m.chr.count.removed <- 0


  study$dup_lines_count <- 0

  # get variable summary statistics from table
  # ===================
  # study$tables$variable.summary['Min.','EFFECT']
  # study$tables$variable.summary['1st Qu.','EFFECT']
  # study$tables$variable.summary['Median','EFFECT']
  # study$tables$variable.summary['Mean','EFFECT']
  # study$tables$variable.summary['3rd Qu.','EFFECT']
  # study$tables$variable.summary['Max.','EFFECT']


  study$effect.plot <- textGrob("Empty Effect Plot", gp = gpar(fontsize=12,col='red', fontface='bold'))

  # study$effect.plot <- ggdraw() +
  #   draw_label('Empty Effect Plot', x = .5, y = .5,
  #              vjust = .5, hjust = .5, size = 10, fontface = 'bold',colour = 'darkred')


  # check header of the input file
  study <-  tryCatch(checkRequiredColumnNames(file.name,study),
                     error = function(err)
                     {
                       print_and_log(paste('Could not read file header:',err$message),'warning')
                       return(NULL)
                     }
  )

  # print_and_log("\nColumn check is done!",'info')
  # cat('----------------------------------------------------',fill = TRUE)
  if(!is.null(study))
    return(study)

}

get_file_line_count <- function(file.path)
{

  ## get line count of input file
  # this values is used to compare count of file lines with count of vatriants in input datset
  # if they  do not match , it means some lines are not read
  # skip if input file is zipped (because line number is binary line count)
  if(.QC$wc.exists){
    line <- system(sprintf("wc -l %s",  file.path), intern = TRUE)

    lineCount.regex.value <- regexec('([0-9]*)([A-z ]*)',line)[[1]]
    start.char <- lineCount.regex.value[2]
    end.char <- start.char + attributes(lineCount.regex.value)$match.length[2] - 1


    # removed to exclude stringr package
    # file.line.count <- format(as.numeric(str_match(line, "([0-9]*)([A-z ]*)")[,2]),
    #                           big.mark="," ,
    #                           scientific = FALSE)

    file.line.count <- format(as.numeric(substr(line, start.char, end.char)),
                              big.mark="," ,
                              scientific = FALSE)

  }
  else
  {
    file.line.count <- 'NA (command not accessible)'
  }

  return(file.line.count)
}

get_file_line_count_RUtils <- function(file.path)
{

  ## get line count of input file
  # this values is used to compare count of file lines with count of vatriants in input datset
  # if they  do not match , it means some lines are not read
  # skip if input file is zipped (because line number is binary line count)

  file.line.count <- tryCatch({

    line <- R.utils::countLines(file.path)

    file_ends_withNewline <- as.character(attr(line,"lastLineHasNewline"))

    line <- format(line,
                   big.mark="," ,
                   scientific = FALSE)

    return(list(line,file_ends_withNewline))
  },
  error = function(err) return("NA (command not accessible)","NA")
  )

  return(file.line.count)
}

get_study_name <- function(study) {

  return(study$file.name)
}


get_skewness_kurtosis <- function(study) {
  vec <- data.table('kurtosis' = study$kurtosis.HQ,
                    'skewness' = study$skewness.HQ,
                    'order' = study$number)

  return(vec)
}

get_precision_plot_values <- function(study) {
  vec <- data.table('SE.mean.HQ' = 1 / study$STDERR.mean.HQ,
                    'sqrt.n' = sqrt(study$MAX_N_TOTAL),
                    'order' = study$number)

  return(vec)
}

verify_files_with_user <- function(qc.study.list, user.verification)
{

  study.table <- sapply(qc.study.list,function(study)
    return(c('File Name' = basename(study$file.path),
             'Lines (including header)'= study$file.line.count,
             'missing items (first 100 lines)'= study$file.header.na,
             'Missing Columns'= paste(study$missing.Columns,collapse = ' | ')
    )
    )
  )

  if(is.null(ncol(study.table)))
    print_and_log('Error in filenames. check if filename contains space or dash or another uknown character.','fatal')

  study.table <- cbind(seq(1:ncol(study.table)),t(study.table))
  colnames(study.table)[1] <- '#'

  print_and_log(kable(study.table,format = "rst"),
                'info',
                cat= FALSE)


  if(user.verification)
  {
    # warn user that existing files will be overwritten if output folder containts files that match the input file pattern
    if(.QC$config$new_items$non.empty.output.folder)
      warning('existing files in the output folder will be overwritten!!!')

    if(menu(c("Yes", "No"),
            title=sprintf('Do you want to continue (1 = \'Yes\' | 2 = \'No\' )?'))
       == 2)
      runStopCommand('QC ended by user!')
  }
}


variable_statistics_pre_matching <- function(input.data)
{

  # count alleles for report

  .QC$thisStudy$tables$EFFECT_ALL.tbl <- input.data[VT == 1,.(.N),keyby = EFFECT_ALL]  ## count alleles
  .QC$thisStudy$tables$OTHER_ALL.tbl <- input.data[VT == 1,.(.N),keyby = OTHER_ALL]  ## count alleles

}

variable_statistics_post_matching <- function(input.data)
{
  # pallindromics
  .QC$thisStudy$palindromic.rows <- input.data[palindromic ==TRUE,.N]
  .QC$thisStudy$non.palindromic.rows <- nrow(input.data) - .QC$thisStudy$palindromic.rows


  if(is.element('CHR' , names(input.data))){
    .QC$thisStudy$tables$CHR.tbl <- input.data[,.(.N),keyby = CHR]  ## count variants per chromosome


    ### looking for missing chromosomes
    chr_range = range(as.numeric(.QC$thisStudy$tables$CHR.tbl$CHR),na.rm = T)

    chr_range = if(chr_range[2] <= 23)
      seq(chr_range[1]:23)
    else
      seq(chr_range[1]:chr_range[2])

    .QC$thisStudy$missing_chromosomes <- which(chr_range %notin% .QC$thisStudy$tables$CHR.tbl$CHR)
    ###################################

    .QC$thisStudy$tables$CHR.tbl$CHR <- as.character(.QC$thisStudy$tables$CHR.tbl$CHR)
    .QC$thisStudy$tables$CHR.tbl[is.na(CHR), CHR:= 'missing']

    .QC$thisStudy$tables$CHR.tbl$N <- thousand_sep(.QC$thisStudy$tables$CHR.tbl$N)
  }else{
    .QC$thisStudy$tables$CHR.tbl <- NA
    .QC$thisStudy$tables$CHR.tbl$CHR <- ''
    .QC$thisStudy$tables$CHR.tbl$N <- '0'
  }
  # count alleles after flipping and switching and compare to pre-match allele count
  # for report
  .QC$thisStudy$tables$EFFECT_ALL.post.matching.tbl <- input.data[VT == 1,.(.N),keyby = EFFECT_ALL]
  .QC$thisStudy$tables$OTHER_ALL.post.matching.tbl <- input.data[VT == 1,.(.N),keyby = OTHER_ALL]  ## count alleles


  if('IMPUTED' %in% colnames(input.data))
  {
    .QC$thisStudy$tables$imputed.tbl <- input.data[,.(.N),keyby = IMPUTED]  ## count variants per chromosome

    .QC$thisStudy$tables$imputed.tbl$IMPUTED <- as.character(.QC$thisStudy$tables$imputed.tbl$IMPUTED)
    .QC$thisStudy$tables$imputed.tbl[is.na(IMPUTED) , IMPUTED := 'missing/invalid']
    .QC$thisStudy$tables$imputed.tbl[IMPUTED == 0 , IMPUTED := 'genotyped']
    .QC$thisStudy$tables$imputed.tbl[IMPUTED == 1 , IMPUTED := 'imputed']
  }

  ##
  ## =================
  existing.columns <- intersect(c('PVALUE','HWE_PVAL','CALLRATE','EFF_ALL_FREQ','IMP_QUALITY','EFFECT','STDERR') ,
                                names(input.data))

  .QC$thisStudy$tables$variable.summary <- cbind(sapply(input.data[,existing.columns,with= FALSE],
                                                        function(x) return(summary(x)[1:6])))

  .QC$thisStudy$tables$variable.summary <- formatC(.QC$thisStudy$tables$variable.summary, format = 'g')


  # change effect column name to BETA or LN(OR)
  colnames(.QC$thisStudy$tables$variable.summary)[colnames(.QC$thisStudy$tables$variable.summary) == 'EFFECT'] <- .QC$config$input_parameters$effect_type_string

  ## get frequency table for multi-allelic variants
  .QC$thisStudy$tables$multi_allele_count_preProcess <- getMultiAlleleCountTbl(input.data)






  # variables that are found in standard reference file
  .QC$thisStudy$found.rows.std <- input.data[match_result != 9L & SOURCE == 'Std_ref' , .N]
  .QC$thisStudy$switched.rows.std <- input.data[match_result == 3L & SOURCE == 'Std_ref' , .N]
  .QC$thisStudy$flipped.rows.std <- input.data[match_result == 2L & SOURCE == 'Std_ref' , .N]

  # variables that are found in alternate reference file
  .QC$thisStudy$found.rows.alt <- input.data[match_result != 9L & SOURCE != 'Std_ref' , .N]
  .QC$thisStudy$switched.rows.alt <- input.data[match_result == 3L & SOURCE != 'Std_ref' , .N]
  .QC$thisStudy$flipped.rows.alt <- input.data[match_result == 2L & SOURCE != 'Std_ref' , .N]

  # variables that are not found in standard reference file
  .QC$thisStudy$not.found.rows.std <- nrow(input.data) - .QC$thisStudy$found.rows.std

  # variables that are not found in either standard or alternate reference file
  .QC$thisStudy$not.found.rows.alt <- nrow(input.data) - .QC$thisStudy$found.rows.std - .QC$thisStudy$found.rows.alt


}

variable_statistics_post_matching_HQ <- function(input.data)
{

  existing.columns <- intersect(c('PVALUE','HWE_PVAL','CALLRATE','EFF_ALL_FREQ','IMP_QUALITY','EFFECT','STDERR') ,
                                names(input.data))

  tbl <- cbind(sapply(input.data[HQ == 1,existing.columns,with= FALSE],
                                                        function(x) return(summary(x)[1:6])))

  tbl <- formatC(tbl, format = 'g')


  # change effect column name to BETA or LN(OR)
  colnames(tbl)[colnames(tbl) == 'EFFECT'] <- .QC$config$input_parameters$effect_type_string

  return(tbl)
}


addEmptyStudy_studyInstance <- function(study)
{
  faultyStudy <- new("Study")
  faultyStudy@File$file.path <- study$file.path
  faultyStudy@Successful_run <- FALSE
  faultyStudy@Counts$input.data.rowcount <- study$input.data.rowcount
  faultyStudy@Counts$rowcount.step1 <- study$rowcount.step1
  faultyStudy@Counts$rowcount.step3 <- study$rowcount.step3
  faultyStudy@Counts$found.rows <- NA
  faultyStudy@Correlations$AFcor.std_ref <- NA
  faultyStudy@Correlations$PVcor <- NA
  faultyStudy@Statistics$lambda <- NA

  .QC$StudyList@studyList <- append(.QC$StudyList@studyList , faultyStudy)
  .QC$StudyList@studyCount <- length(.QC$StudyList@studyList)
}

addEmptyStudy_pathOnly <- function(study.path)
{
  faultyStudy <- new("Study")
  faultyStudy@File$file.path <- study.path
  faultyStudy@Successful_run <- FALSE
  faultyStudy@Counts$input.data.rowcount <- NA
  faultyStudy@Counts$rowcount.step1 <- NA
  faultyStudy@Counts$rowcount.step3 <- NA
  faultyStudy@Counts$found.rows <- NA
  faultyStudy@Correlations$AFcor.std_ref <- NA
  faultyStudy@Correlations$PVcor <- NA
  faultyStudy@Statistics$lambda <- NA

  .QC$StudyList@studyList <- append(.QC$StudyList@studyList , faultyStudy)
  .QC$StudyList@studyCount <- length(.QC$StudyList@studyList)
}
Any scripts or data that you put into this service are public.
GWASinspector documentation built on April 4, 2025, 4:35 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
GWASinspector
Comprehensive and Easy to Use Quality Control of GWAS Results

R/studyFunctions.R
In GWASinspector: Comprehensive and Easy to Use Quality Control of GWAS Results

Defines functions variable_statistics_pre_matching verify_files_with_user get_precision_plot_values get_skewness_kurtosis get_study_name get_file_line_count_RUtils get_file_line_count create_file_specific_config process_each_file

Try the GWASinspector package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

GWASinspector Comprehensive and Easy to Use Quality Control of GWAS Results

R/studyFunctions.R In GWASinspector: Comprehensive and Easy to Use Quality Control of GWAS Results

Defines functions variable_statistics_pre_matching verify_files_with_user get_precision_plot_values get_skewness_kurtosis get_study_name get_file_line_count_RUtils get_file_line_count create_file_specific_config process_each_file

Try the GWASinspector package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

GWASinspector
Comprehensive and Easy to Use Quality Control of GWAS Results

R/studyFunctions.R
In GWASinspector: Comprehensive and Easy to Use Quality Control of GWAS Results