R/saveFilesFunctions.R
In GWASinspector: Comprehensive and Easy to Use Quality Control of GWAS Results

#this file includes functions for saving data sets and logging the job
# 1- saveDataSet()  => general function
# 2- saveDataSet_final() => saving matched data at the end of algorithm IF TRUE in config file





# Save a dataset to output directory
saveDataSet <- function(dataset,
                        file.path,
                        columnSeparator="\t",
                        naValue = "NA",
                        decValue = '.',
                        zipped = FALSE,
                        ordered = FALSE)
{

  # ORDER THE  output rows on CHR-position
  # order on RS number if CHR is not present
  if(ordered == TRUE){
    if(is.element('CHR', colnames(dataset)) && is.element('POSITION', colnames(dataset)) )
      dataset <- dataset[order(CHR,POSITION)]
    else if(is.element('MARKER', colnames(dataset)))
      dataset <- dataset[order(MARKER)]
  }


  ## 1- save dataset
  # zip if selected by user and Rtools is installed
  if(zipped){
    # much slower tha fwrite
    tryCatch(
      # write.table(dataset,
      #             dec = decValue,
      #             na = naValue,
      #             quote = FALSE,
      #             sep = columnSeparator,
      #             row.names = FALSE,
      #             file = gzfile(file.path ,compression = 3)),
      fwrite(x = dataset,
             file = file.path,
             sep = columnSeparator,
             na = naValue,
             dec = decValue,
             quote = FALSE,
             compress = "auto"),
      error = function(err)
      {
        print_and_log(sprintf("Error saving file! check below message for more information:\n%s",
                              err),
                      'warning')
      }
    )
  }else {
    tryCatch(
      fwrite(x = dataset,
             file = file.path,
             sep = columnSeparator,
             na = naValue,
             dec = decValue,
             quote = FALSE),

      error = function(err)
      {
        print_and_log(sprintf("Error saving file! check below message for more information:\n%s",
                              err),
                      'warning')
      }
    )
  }


  ## 2- create log
  if(zipped)
    print_and_log(sprintf("A zipped data file is saved as '%s' with '%s' rows!",
                          file.path,
                          thousand_sep(nrow(dataset))),
                  'info')
  else
    print_and_log(sprintf("A data file is saved as '%s' with '%s' rows!",
                          file.path,
                          thousand_sep(nrow(dataset))),
                  'info')

}



## TODO use 'filename_output' for saved file
saveDataSet_final<-function(dataset)
{
  config <- .QC$config

  if(config$output_parameters$save_final_dataset){

    ## save and effect-size reference rds object from input file
    ## only significant variants are saved based on pvalue
    if(config$output_parameters$save_as_effectSize_reference)
      effect_size_ref_make(dataset,
                           .QC$thisStudy$effect_size_ref.output.path)


    ## remove reference added columns from final dataset
    ## MULTOI_ALLELIC column is needed if it is generated
    ## =========================================

    requiredColNames <- .QC$thisStudy$renamed.File.Columns.sorted

    ## LOG10P is previously converted to PVALUE by 10^(-log10p)
    # so the column name vector should have PVALUE as well
    if(is.element('PVALUE',names(dataset)) && !is.element('PVALUE',requiredColNames))
      requiredColNames <- c(requiredColNames,'PVALUE')

    if(is.element('MULTI_ALLELIC',names(dataset)) && config$output_parameters$add_column_multiallelic == TRUE)
      requiredColNames <- c(requiredColNames,'MULTI_ALLELIC')

    if(is.element('HQ',names(dataset)) && config$output_parameters$add_column_HQ == TRUE)
    {
      requiredColNames <- c(requiredColNames,'HQ')
      dataset[,HQ := as.numeric(HQ)] # convert TRUE/FALSE to 1/0
    }

    if(is.element('highDiffEAF',names(dataset)) && config$output_parameters$add_column_AFmismatch == TRUE)
      requiredColNames <- c(requiredColNames,'highDiffEAF')

    if(is.element('AF',names(dataset)) && config$output_parameters$add_column_AF == TRUE)
      requiredColNames <- c(requiredColNames,'AF')

    if(is.element('REF_RSID',names(dataset)) && config$output_parameters$add_column_rsid == TRUE)
      requiredColNames <- c(requiredColNames,'REF_RSID')


    #select the required columns
    dataset <- subset(dataset , select = requiredColNames)

    #add hid column: CHR-pos-alleles
    if(config$output_parameters$add_column_hid == TRUE)
      {
        if(!is.data.table(dataset))
          setDT(dataset)

        varID = NULL

        dataset[, varID := paste(CHR,
                                 format(POSITION,scientific = FALSE,trim = TRUE),
                                 EFFECT_ALL,
                                 OTHER_ALL,
                                 sep = ':')]
        }


    # check if there were characters in chromosome column that should be reconverted to characters
    ## =========================================
    if(.QC$thisStudy$character.chromosome)
      dataset <- deconvert_column_CHR(dataset)

    # make sure position is not saved as scientific number ,e.g. 8e-6
    dataset$POSITION<-format(dataset$POSITION,scientific = FALSE,trim = TRUE)


    ## rename columns based on user preference
    #PLINK, GWAMA, GENABEL,...
    ## =========================================
    selected.header.format <- config$output_parameters$out_header

    if(selected.header.format == 'GENABEL'){

      if('build' %notin% colnames(data))
        dataset[, build := "unknown"]


      if("pgc" %notin% colnames(data)) {
        if('sebeta' %in% colnames(data)) # this is not in our standard columns
          dataset[, pgc := pchisq((EFFECT/(sebeta * sqrt(.QC$thisStudy$lambda) ))^2,
                                  1, lower.tail=FALSE)]
        else
          dataset[, pgc := NA]
      }

      if(!"lambda.estimate" %notin% colnames(data))
        dataset[, lambda.estimate := .QC$thisStudy$lambda]

      if(!"lambda.se" %notin% colnames(data))
        dataset[, lambda.se := NA]

    }

    new.col.names <- changeColumnNames(colnames(dataset), selected.header.format)
    colnames(dataset) <- new.col.names


    ## =========================================

    # add zip extension to output file path
    if(config$output_parameters$gzip_final_dataset)
      .QC$thisStudy$output.path <- paste0(.QC$thisStudy$output.path, '.gz')


    saveDataSet(dataset,
                .QC$thisStudy$output.path,
                columnSeparator = config$output_parameters$out_sep,
                naValue = config$output_parameters$out_na,
                decValue = config$output_parameters$out_dec,
                zipped = config$output_parameters$gzip_final_dataset,
				ordered = .QC$config$output_parameters$ordered)


  }else{
    print_and_log('Saving final dataset is skipped!','warning',display=.QC$config$debug$verbose)
  }
}


save_NA_Dataset <- function(input.data,input.data.backup) {

  #get the list of noncrucial columns that should be checked for NA values
  nonCrucialColumns <- getNonCrucialColumnNames()
  crucialColumns <- getCrucialColumnNames_onFileAnalysis()

  # get column names of input file
  current.col.names <- colnames(input.data)

  #
  row.count <- nrow(input.data)

  # we need to check the columns for NA values:
  # all column should not be NA
  # only required columns that are NOT all NA are checked for NA items
  wantedColumnsList <- sapply(nonCrucialColumns, function(x)
  {
    if(x %in% current.col.names){
      #if(input.data[is.na(eval(parse(text = x))), .N] == row.count)
      if(all(is.na(input.data[,eval(parse(text = x))])))
      {
        print_and_log(sprintf('Column \'%s\' removed from improbable_variant file because all where NA!',x)
                      ,'warning',display=.QC$config$debug$verbose)
        return(FALSE)
      }
      else
      {
        return(TRUE)
      }
    }else
    {
      return(FALSE)
    }
  })

  # get the list of required columns
  .QC$unwantedColumnsList <- names(wantedColumnsList[which(wantedColumnsList == FALSE)])
  wantedColumnsList <- names(wantedColumnsList[which(wantedColumnsList == TRUE)])



  na.rows <- which(!complete.cases(input.data[,wantedColumnsList,with=FALSE]))

  ## na.rows <- apply(input.data,1,anyNA) => very slow
  # na.rows <- which(!complete.cases(input.data))

  if(length(na.rows) > 0){

    # only require the first 100 rows of crucial columns and those that are not totally NA
    input.data <-  head(input.data[na.rows,union(crucialColumns,wantedColumnsList),with =FALSE],100)
    input.data.backup <-  head(input.data.backup[na.rows,union(crucialColumns,wantedColumnsList),with =FALSE],100)

    INVALID_COLUMN<-apply(input.data,
                          1,
                          function(x)
                          {
                            c<- which(is.na(x))
                            paste(names(c),collapse = '|')
                          })
    input.data.backup <- cbind(input.data.backup,INVALID_COLUMN)
    #	[filename_output]_SNPs_improbable_values.txt
    saveDataSet(input.data.backup,
                .QC$thisStudy$SNPs_improbable_values.path,
                columnSeparator = .QC$config$output_parameters$out_sep,
                naValue = .QC$config$output_parameters$out_na,
                decValue = .QC$config$output_parameters$out_dec,
				ordered = .QC$config$output_parameters$ordered)

  }
}


save_and_remove_unusable_variants <- function(input.data,input.data.backup) {

  #TODO break to two functions
  ## === 1 - first check allele columns => data is saved as SNPS_invalid_alleles.txt
  ## === 2 - check other crucial columns => data is saved as SNPS_removed.txt
  ## === 3 - remove from dataset

  ## 1
  ## save invalid allele variants as dataset

  invalid.allele.index.union <- union( .QC$thisStudy$column.INVALID.list$EFFECT_ALL,
                                       .QC$thisStudy$column.INVALID.list$OTHER_ALL)

  if(length(invalid.allele.index.union) > 0)
  {

    # [filename_output]_SNPs_invalid_allele.txt
    saveDataSet(input.data.backup[head(invalid.allele.index.union,100)],
                file.path = .QC$thisStudy$SNPs_invalid.path,
                columnSeparator = .QC$config$output_parameters$out_sep,
                naValue = .QC$config$output_parameters$out_na,
                decValue = .QC$config$output_parameters$out_dec,
				ordered = .QC$config$output_parameters$ordered)
  }

  ## find rows with NA in allele columns
  ## data is removed from dataset
  # invalid alleles are set to NA in processCOlumns() => process_column_EFFECT_ALL()
  # so, invalid.allele.index is a subset of missing.allele.index
  #


  ## 2
  ## find rows with NA in crucial columns (not alleles)
  ## first 100 is saved
  ## data is removed from dataset

  crucial.columns <- getCrucialColumnNames_onFileAnalysis()
  # allele columns are checked in previous phase (SNPs_invalid_allele) and not needed here
  crucial.columns <- crucial.columns[!crucial.columns %in% c('EFFECT_ALL','OTHER_ALL')]

  # add EFFECT to crucial columns if not present
  # this was removed from initial set because effect could be either beta , or
  if(!is.element('EFFECT',crucial.columns))
    crucial.columns <- append(crucial.columns,'EFFECT')


  crucial.col.index <- match(crucial.columns ,names(input.data))


  # get the list of NA values in each crucial column
  # column operaion is faster than row operation
  NA.list <- sapply(crucial.col.index, function(x)
    {
      which(is.na(input.data[,x,with=FALSE]))
    }
  )


  # check the union of missing rows indexes for each crucial column
  # this result set shows which rows have at least one missing values in a crucial column
  # NA.list.union <- unlist(NA.list[1])
  # for(i in 1:length(NA.list)){
  #   NA.list.union<-union(NA.list.union,
  #                        unlist(NA.list[[i]]))
  # }

  NA.list.union <- Reduce(union,NA.list) ## easier than above loop


  if(length(NA.list.union) > 0)
  {
    sample.data <- input.data.backup[head(NA.list.union,100)] ## 100 samples are saved from backup file, before invalid items are converted to NA

    # [filename_output]_SNPs_removed.txt
    saveDataSet(sample.data,
                file.path = .QC$thisStudy$SNPs_removed.path,
                columnSeparator = .QC$config$output_parameters$out_sep,
                naValue = .QC$config$output_parameters$out_na,
                decValue = .QC$config$output_parameters$out_dec,
				ordered = .QC$config$output_parameters$ordered)
  }

  ## 3
  # find union of missing values in crucial clumns with either allele columns for removing from dataset
  total.NA.list.union <- Reduce(union, list(NA.list.union,
                                            .QC$thisStudy$column.NA.list$OTHER_ALL,
                                            .QC$thisStudy$column.NA.list$EFFECT_ALL
  )
  )

  if(length(total.NA.list.union) > 0)
  {
    print_and_log(sprintf('%s variants that missed a crucial value were removed from dataset (step 1)!',thousand_sep(length(total.NA.list.union))),
                  'warning',display=.QC$config$debug$verbose)
    input.data <- input.data[!total.NA.list.union]
    .QC$thisStudy$missing.crucial.rowcount <- length(total.NA.list.union)
  }


  # also remove variants wher nchar(EA) > 1 and nchar(OA) > 1
  # e.g.    EA = GCGCGCGTA and OA = TGTG
  # input.data <- process_column_BOTH_ALL(input.data)


  return(input.data)
}

save_significant_variants <- function(input.data) {

  sig.variants <- input.data[HQ == TRUE &
                               PVALUE < 5e-08 &
                               startsWith(MARKER,'rs'),
                             list(CHR,POSITION,MARKER,PVALUE)]

  saveDataSet(sig.variants,
              .QC$thisStudy$SNPs_significant.path,
              columnSeparator = .QC$config$output_parameters$out_sep,
				ordered = .QC$config$output_parameters$ordered)

}



# this is for debugging purposes only
save_pre_modification_file <- function(pre.modification.file)
{

  if(.QC$config$debug$save_pre_modification_file)
    data.table::fwrite(pre.modification.file ,
                       file = .QC$config$paths$save_pre_modification_file ,
                       sep = '\t')
}
Any scripts or data that you put into this service are public.
GWASinspector documentation built on April 4, 2025, 4:35 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
GWASinspector
Comprehensive and Easy to Use Quality Control of GWAS Results

R/saveFilesFunctions.R
In GWASinspector: Comprehensive and Easy to Use Quality Control of GWAS Results

Defines functions save_pre_modification_file save_significant_variants save_and_remove_unusable_variants save_NA_Dataset saveDataSet_final saveDataSet

Try the GWASinspector package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

GWASinspector Comprehensive and Easy to Use Quality Control of GWAS Results

R/saveFilesFunctions.R In GWASinspector: Comprehensive and Easy to Use Quality Control of GWAS Results

Defines functions save_pre_modification_file save_significant_variants save_and_remove_unusable_variants save_NA_Dataset saveDataSet_final saveDataSet

Try the GWASinspector package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

GWASinspector
Comprehensive and Easy to Use Quality Control of GWAS Results

R/saveFilesFunctions.R
In GWASinspector: Comprehensive and Easy to Use Quality Control of GWAS Results