R/headerRelatedFunctions.R
In GWASinspector: Comprehensive and Easy to Use Quality Control of GWAS Results

## this file includes functions for reading header file
# functions included are:
# 1- checkRequiredColumnNames => check if input file has the required columns and rename colnames
# 2- getCrucialColumnNames => colnames that are necessary and will cause the function to end
## TODO this function items (necessary colnames) might be added to config file

# 3- getRequiredColumnNames => colnames that are selected from input file (not all columns will be loaded)
## TODO this function items (colnames) might be added to config file

# 4- getFileHeaderKV => check if header file has two columns with no duplicates
# create a key-value variable for checking and renaming header columns

##below two functions get a hashset of keyvalues from header file for searching and replacing input colnames
# 5- replaceInputStringSetFromHashSet => compare all values of a list to hashset and return the keys
# 6- replaceInputStringFromHashSet => compare a single value to hashset and return the key



checkRequiredColumnNames <- function(inputFile, study){

  file.extension<-study$file.extension #file_ext(inputFile)

  ##set NA string set from config
  na.string <- .QC$config$input_parameters$na.string

  ##set column separator from config file
  sep.strings <- .QC$config$input_parameters$column_separator

  # if(file.extension %in% c('gz','zip')){
  #
  #   inputFilePath<-sprintf("gzip -dc %s", inputFile)
  #   data<-fread( inputFilePath,
  #                nrows=100,
  #                header = TRUE,
  #                fill=TRUE,
  #                sep='auto')

  if(file.extension == "zip") {

    # name if txt file inside zip file
    # FIXME how to know the exact name!?!?
    embeded.file <- paste0(tools::file_path_sans_ext(basename(inputFile)),'.txt')

    sep.strings <- ifelse(sep.strings == 'auto' , '' , sep.strings)

    data <- read.table(unz(description = inputFile,
                           filename = embeded.file),
                       sep = sep.strings ,
                       header = TRUE,
                       na.strings = na.string,
                       nrows=100,
                       fill = TRUE,
                       check.names = FALSE)

    close(unz(
      description = inputFile,
      filename = embeded.file)
    )

  # } else if (file.extension == "gz"){
  #
  #   sep.strings <- ifelse(sep.strings == 'auto' , '' , sep.strings)
  #
  #   data <- read.table(gzfile(inputFile),
  #                      sep = sep.strings,
  #                      header = TRUE,
  #                      nrows = 100,
  #                      na.strings = na.string,
  #                      fill = TRUE,
  #                      check.names = FALSE)
  #
  #   close(gzfile(inputFile))

  }else if(file.extension %in% c('txt','dat','csv','gz','bz2'))
  {
   # sep.strings <- ifelse(sep.strings == 'auto' , '' , sep.strings)

    data <- fread(file = inputFile,
                       sep = sep.strings,
                       header = TRUE,
                       nrows = 100,
                       na.strings = na.string,
                       fill = TRUE,
                       check.names = FALSE)

    # data<-fread(inputFile,
    #             nrows=100,
    #             header = TRUE,
    #             fill = TRUE,
    #             sep='auto',
    #             na.strings = na.string)

  }

  ###
  # convert to data.table from read.table function
  if(!is.data.table(data))
    data <- as.data.table(data)

  if(is.null(nrow(data)) | nrow(data) == 0)
  {
    print_and_log("File will be ignored. No data found.",'warning')
    addEmptyStudy_pathOnly(inputFile)
    return(NULL)

  }

  wanted.columns<-getRequiredColumnNames()
  original.File.Columns.upper<-toupper(colnames(data)) ## upper is required for translating column names
  original.File.Columns <- colnames(data) ## for loading dataset and saving final data

  # headerKV<-getFileHeaderKV()

  ###
  renamed.File.Columns<-replaceInputStringSetFromHashSet(.QC$headerKV, original.File.Columns.upper)
  # check if file header is correctly translated, return NULL if there is a duplicate in header names
  if(is.null(renamed.File.Columns))
    return(NULL)




  #### check if all required comuns exist ####
  ## if not, script is stopped

  crucial.columns<-getCrucialColumnNames_onFileLoading()
  missing.crucial.column.indexes<-which(crucial.columns %notin% renamed.File.Columns)

  if(length(missing.crucial.column.indexes) > 0){
    print_and_log(sprintf("File will be ignored. columns \'%s\' were not found.",
                          paste(crucial.columns[missing.crucial.column.indexes], collapse = '|')),
                  'warning')
    addEmptyStudy_pathOnly(inputFile)
    return(NULL)
  }


  ## check if file already has an EFFECT column. this is problematic and can not be continued
  if(is.element('EFFECT',original.File.Columns.upper))
    print_and_log('Column EFFECT will be renamed to BETA!'
                          ,'warning')

  if(is.element('EFFECT',renamed.File.Columns))
    print_and_log('Input file can not have a column that is named EFFECT.'
                  ,'fatal')


  ## check if file has the selected effect column , either BETA or OR
  # if(!is.element(.QC$config$input_parameters$effect_type,renamed.File.Columns))
  #   print_and_log(sprintf('You have selected \"%s\" as the effect column, which is not found in the input file.' ,
  #                         .QC$config$input_parameters$effect_type ),'fatal')



  ### check other columns and notify that one is missing
  missing.columns<-wanted.columns[which(wanted.columns  %notin% renamed.File.Columns )]

  if(length(missing.columns) > 0 )
  {
    study$missing.Columns <- missing.columns # add to config fo report


    if('PVALUE' %in% missing.columns){
      study$missing.PVALUE.column <- TRUE

      print_and_log('PVALUE column is missing. calculated values will be used!',
                    'warning')
      print_and_log('Pvalue correlation plot will not be saved!',
                    'warning')
    }

    # CHR is a crucial column now
#     if('CHR' %in% missing.columns)
#       print_and_log('Manhattan plot will not be saved (CHR column is missing)!',
#                     'warning')

    if('EFF_ALL_FREQ' %in% missing.columns)
      print_and_log('Allele frequency plots will not be saved (EFF_ALL_FREQ column is missing)!',
                    'warning')


  }

  ##----
  ## find colmn classes of input file to use when loading the file
  renamed.File.Columns.classes <- sapply(renamed.File.Columns, function(x) setColumnClass(x))
  names(renamed.File.Columns.classes) <- colnames(data)
  # remove nulls that mean unmatched columns
  renamed.File.Columns.classes = renamed.File.Columns.classes[!sapply(renamed.File.Columns.classes,
                                                                      is.null)]
  #----

  ## saving variables in config file
  col.index.all <- c(1:ncol(data))
  wanted.columns.index<-na.omit(match(wanted.columns,renamed.File.Columns))
  ## add column names to config file
  study$original.File.Columns <- original.File.Columns
  study$renamed.File.Columns <- renamed.File.Columns

  ##used for loadinh the input file, columns are loaded as required order - unknown columns are pushed to the end
  study$original.File.Columns.sorted <- c(original.File.Columns[wanted.columns.index],
                                          original.File.Columns[col.index.all[-wanted.columns.index]])

  #used for renaming the original file header after loading it and saving datasets - it is sorted
  study$renamed.File.Columns.sorted <- c(renamed.File.Columns[wanted.columns.index],
                                         renamed.File.Columns[col.index.all[-wanted.columns.index]])

  ## index of columns , first the known and then the unknowm
  study$wanted.columns.index <- c(wanted.columns.index,
                                  col.index.all[-wanted.columns.index])

  # classes for loading the columns- for the 15 known classes, these items are set. for the rest it is based on fread function
  study$renamed.File.Columns.classes <- unlist(renamed.File.Columns.classes)


  # VERY IMPORTANT: number of NA values in the first 100 lines of file
  # this is so important because it shows if columns are correctly seperated or not
  # i.e. header row may be separated based on space character while other rows are separated by tab
  study$file.header.na <- count_NA(data)

  return(study)
}

count_NA <- function(input.data){

  na.count <- length(which(is.na(input.data)))
  d <- dim(input.data)
  na.percent <- calculatePercent(na.count,  (d[1] * d[2]))

  return(na.percent)
}

##algorithm stops if one of these columns are missing
getCrucialColumnNames_onFileLoading<-function()
{
  wanted.columns<-c(
    "CHR",
    "POSITION",
    "EFFECT_ALL",
    "OTHER_ALL",
   # "EFFECT",  added as a separate test (either BETA or OR)
    "STDERR"
  )

  if(.QC$config$input_parameters$effect_type == 'BETA')
    wanted.columns <- append(wanted.columns,'BETA')
  else  if(.QC$config$input_parameters$effect_type == 'OR')
    wanted.columns <- append(wanted.columns,'OR')

  return(wanted.columns)
}

getCrucialColumnNames_onFileAnalysis<-function()
{
  wanted.columns<-c(
    "CHR",
    "POSITION",
    "EFFECT_ALL",
    "OTHER_ALL",
    "EFFECT",
    "STDERR"
  )

  return(wanted.columns)
}

getNonCrucialColumnNames<-function()
{
  wanted.columns<-c("STRAND",
                    "PVALUE",
                    "EFF_ALL_FREQ",
                    "HWE_PVAL",
                    "IMP_QUALITY",
                    "IMPUTED",
                    "CALLRATE",
                    "N_TOTAL",
                    "MARKER")

  return(wanted.columns)
}

getRequiredColumnNames<-function()
{
  wanted.columns<-c("CHR",
                    "MARKER",
                    "POSITION",
                    "STRAND",
                    "EFFECT_ALL",
                    "OTHER_ALL",
                   # "EFFECT",
                    "STDERR",
                    "PVALUE",
                    "EFF_ALL_FREQ",
                    "HWE_PVAL",
                    "IMP_QUALITY",
                    "IMPUTED",
                    "CALLRATE",
                    "N_TOTAL")

  if(.QC$config$input_parameters$effect_type == 'BETA')
    wanted.columns <- append(wanted.columns,'BETA')
  else  if(.QC$config$input_parameters$effect_type == 'OR')
    wanted.columns <- append(wanted.columns,'OR')

  return(wanted.columns)
}


setColumnClass<-function(column){
  switch(column,
         "CHR"='character',
         "MARKER" ='character' ,
         "POSITION"='numeric',
         "STRAND"='character',
         "EFFECT_ALL"='character',
         "OTHER_ALL"='character',
         "EFFECT"='numeric',
         "STDERR"='numeric',
         "PVALUE"='numeric',
         "EFF_ALL_FREQ"='numeric',
         "HWE_PVAL"='numeric',
         "IMP_QUALITY"='numeric',
         "IMPUTED"='character',
         "CALLRATE"='numeric',
         "N_TOTAL"='numeric'
         ## TODO,'CHR' check this for unknown columns
  )
}

getFileHeaderKV<-function()
{
  headerTable <- read.table(file = .QC$config$supplementaryFiles$header_translations,
                            sep='',
                            header = FALSE,
                            stringsAsFactors = FALSE)
  headerTable$V1 = toupper(headerTable$V1)
  headerTable$V2 = toupper(headerTable$V2)
# TODO delete
#   ###checking header file
#   if(ncol(headerTable) != 2L) {
#     print_and_log(sprintf('\'headerTable\' should have two columns but has %s!',ncol(headerTable)),
#                   'fatal')
#   }
#
#   if(any(duplicated(headerTable[ ,2]))) {
#     print_and_log('Duplicated items found in header table!',
#                   'fatal')
#   }

  ##creating hashset of header elements
  headerKV <- hash::hash(keys = as.matrix(headerTable[, 2]),
                         values = as.matrix(headerTable[, 1]))
  # flog.info("header KV file loaded!", name= .QC$log.name.main)

  print_and_log('Header translation table created!',
                'info')

  return(headerKV)
}


### create header hashset
replaceInputStringSetFromHashSet <- function(hashset , inputStringSet)
{
  if (!hash::is.hash(hashset))
  {
    stop('hashset not loaded correctly')
  }

  returnSet<-NULL
  if (all(hash::has.key(inputStringSet, hashset)))
  {
    returnSet<-hash::values(hashset, inputStringSet)
  } else
  {
    returnSet<-sapply(inputStringSet, function(x)
      replaceInputStringFromHashSet(hashset, x))
  }

  returnSet <- as.vector(returnSet)

  tbl <- data.table('header' = inputStringSet , 'translated_header' = returnSet)

  print_and_log('Header translation in input file...','info',display=.QC$config$debug$verbose)
  print_and_log(kable(tbl,format = "rst"),
                'info',
                cat= FALSE,
                display= .QC$config$debug$verbose)


  if(any(duplicated(returnSet)))
  {

    print_and_log('duplicated values are found in translated header!','warning',display=.QC$config$debug$verbose)
    return(NULL)
  }
  else
  {
    return(as.vector(returnSet))
  }
}

replaceInputStringFromHashSet <- function(hashset, inputString)
{
  if (!hash::is.hash(hashset))
  {
    stop('hashset not loaded correctly')
  }

  if (hash::has.key(inputString, hashset))
  {
    hash::values(hashset, inputString)
  } else
  {
    inputString
  }
}

Any scripts or data that you put into this service are public.

GWASinspector documentation built on Sept. 28, 2023, 1:06 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

GWASinspector
Comprehensive and Easy to Use Quality Control of GWAS Results

R/headerRelatedFunctions.R
In GWASinspector: Comprehensive and Easy to Use Quality Control of GWAS Results

Defines functions replaceInputStringFromHashSet replaceInputStringSetFromHashSet getFileHeaderKV setColumnClass getRequiredColumnNames getNonCrucialColumnNames getCrucialColumnNames_onFileAnalysis getCrucialColumnNames_onFileLoading count_NA checkRequiredColumnNames

Try the GWASinspector package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

GWASinspector Comprehensive and Easy to Use Quality Control of GWAS Results

R/headerRelatedFunctions.R In GWASinspector: Comprehensive and Easy to Use Quality Control of GWAS Results

Defines functions replaceInputStringFromHashSet replaceInputStringSetFromHashSet getFileHeaderKV setColumnClass getRequiredColumnNames getNonCrucialColumnNames getCrucialColumnNames_onFileAnalysis getCrucialColumnNames_onFileLoading count_NA checkRequiredColumnNames

Try the GWASinspector package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

GWASinspector
Comprehensive and Easy to Use Quality Control of GWAS Results

R/headerRelatedFunctions.R
In GWASinspector: Comprehensive and Easy to Use Quality Control of GWAS Results