R/saveFilesFunctions.R

Defines functions save_pre_modification_file save_significant_variants save_and_remove_unusable_variants save_NA_Dataset saveDataSet_final saveDataSet

#this file includes functions for saving data sets and logging the job
# 1- saveDataSet()  => general function
# 2- saveDataSet_final() => saving matched data at the end of algorithm IF TRUE in config file





# Save a dataset to output directory
saveDataSet <- function(dataset,
                        file.path,
                        columnSeparator="\t",
                        naValue = "NA",
                        decValue = '.',
                        zipped = FALSE,
                        ordered = FALSE)
{

  # ORDER THE  output rows on CHR-position
  # order on RS number if CHR is not present
  if(ordered == TRUE){
    if(is.element('CHR', colnames(dataset)) && is.element('POSITION', colnames(dataset)) )
      dataset <- dataset[order(CHR,POSITION)]
    else if(is.element('MARKER', colnames(dataset)))
      dataset <- dataset[order(MARKER)]
  }


  ## 1- save dataset
  # zip if selected by user and Rtools is installed
  if(zipped){
    # much slower tha fwrite
    tryCatch(
      # write.table(dataset,
      #             dec = decValue,
      #             na = naValue,
      #             quote = FALSE,
      #             sep = columnSeparator,
      #             row.names = FALSE,
      #             file = gzfile(file.path ,compression = 3)),
      fwrite(x = dataset,
             file = file.path,
             sep = columnSeparator,
             na = naValue,
             dec = decValue,
             quote = FALSE,
             compress = "auto"),
      error = function(err)
      {
        print_and_log(sprintf("Error saving file! check below message for more information:\n%s",
                              err),
                      'warning')
      }
    )
  }else {
    tryCatch(
      fwrite(x = dataset,
             file = file.path,
             sep = columnSeparator,
             na = naValue,
             dec = decValue,
             quote = FALSE),

      error = function(err)
      {
        print_and_log(sprintf("Error saving file! check below message for more information:\n%s",
                              err),
                      'warning')
      }
    )
  }


  ## 2- create log
  if(zipped)
    print_and_log(sprintf("A zipped data file is saved as '%s' with '%s' rows!",
                          file.path,
                          thousand_sep(nrow(dataset))),
                  'info')
  else
    print_and_log(sprintf("A data file is saved as '%s' with '%s' rows!",
                          file.path,
                          thousand_sep(nrow(dataset))),
                  'info')

}



## TODO use 'filename_output' for saved file
saveDataSet_final<-function(dataset)
{
  config <- .QC$config

  if(config$output_parameters$save_final_dataset){

    ## save and effect-size reference rds object from input file
    ## only significant variants are saved based on pvalue
    if(config$output_parameters$save_as_effectSize_reference)
      effect_size_ref_make(dataset,
                           .QC$thisStudy$effect_size_ref.output.path)


    ## remove reference added columns from final dataset
    ## MULTOI_ALLELIC column is needed if it is generated
    ## =========================================

    requiredColNames <- .QC$thisStudy$renamed.File.Columns.sorted

    if(is.element('MULTI_ALLELIC',names(dataset)) && config$output_parameters$add_column_multiallelic == TRUE)
      requiredColNames <- c(requiredColNames,'MULTI_ALLELIC')

    if(is.element('HQ',names(dataset)) && config$output_parameters$add_column_HQ == TRUE)
    {
      requiredColNames <- c(requiredColNames,'HQ')
      dataset[,HQ := as.numeric(HQ)] # convert TRUE/FALSE to 1/0
    }

    if(is.element('highDiffEAF',names(dataset)) && config$output_parameters$add_column_AFmismatch == TRUE)
      requiredColNames <- c(requiredColNames,'highDiffEAF')

    if(is.element('AF',names(dataset)) && config$output_parameters$add_column_AF == TRUE)
      requiredColNames <- c(requiredColNames,'AF')

    if(is.element('REF_RSID',names(dataset)) && config$output_parameters$add_column_rsid == TRUE)
      requiredColNames <- c(requiredColNames,'REF_RSID')


    #select the required columns
    dataset <- subset(dataset , select = requiredColNames)

    #add hid column: CHR-pos-alleles
    if(config$output_parameters$add_column_hid == TRUE)
      {
        if(!is.data.table(dataset))
          setDT(dataset)

        varID = NULL

        dataset[, varID := paste(CHR,
                                 format(POSITION,scientific = FALSE,trim = TRUE),
                                 EFFECT_ALL,
                                 OTHER_ALL,
                                 sep = ':')]
        }


    # check if there were characters in chromosome column that should be reconverted to characters
    ## =========================================
    if(.QC$thisStudy$character.chromosome)
      dataset <- deconvert_column_CHR(dataset)

    # make sure position is not saved as scientific number ,e.g. 8e-6
    dataset$POSITION<-format(dataset$POSITION,scientific = FALSE,trim = TRUE)


    ## rename columns based on user preference
    #PLINK, GWAMA, GENABEL,...
    ## =========================================
    selected.header.format <- config$output_parameters$out_header

    if(selected.header.format == 'GENABEL'){

      if('build' %notin% colnames(data))
        dataset[, build := "unknown"]


      if("pgc" %notin% colnames(data)) {
        if('sebeta' %in% colnames(data)) # this is not in our standard columns
          dataset[, pgc := pchisq((EFFECT/(sebeta * sqrt(.QC$thisStudy$lambda) ))^2,
                                  1, lower.tail=FALSE)]
        else
          dataset[, pgc := NA]
      }

      if(!"lambda.estimate" %notin% colnames(data))
        dataset[, lambda.estimate := .QC$thisStudy$lambda]

      if(!"lambda.se" %notin% colnames(data))
        dataset[, lambda.se := NA]

    }

    new.col.names <- changeColumnNames(colnames(dataset), selected.header.format)
    colnames(dataset) <- new.col.names


    ## =========================================

    # add zip extension to output file path
    if(config$output_parameters$gzip_final_dataset)
      .QC$thisStudy$output.path <- paste0(.QC$thisStudy$output.path, '.gz')


    saveDataSet(dataset,
                .QC$thisStudy$output.path,
                columnSeparator = config$output_parameters$out_sep,
                naValue = config$output_parameters$out_na,
                decValue = config$output_parameters$out_dec,
                zipped = config$output_parameters$gzip_final_dataset,
				ordered = .QC$config$output_parameters$ordered)


  }else{
    print_and_log('Saving final dataset is skipped!','warning',display=.QC$config$debug$verbose)
  }
}


save_NA_Dataset <- function(input.data,input.data.backup) {

  #get the list of noncrucial columns that should be checked for NA values
  nonCrucialColumns <- getNonCrucialColumnNames()
  crucialColumns <- getCrucialColumnNames_onFileAnalysis()

  # get column names of input file
  current.col.names <- colnames(input.data)

  #
  row.count <- nrow(input.data)

  # we need to check the columns for NA values:
  # all column should not be NA
  # only required columns that are NOT all NA are checked for NA items
  wantedColumnsList <- sapply(nonCrucialColumns, function(x)
  {
    if(x %in% current.col.names){
      #if(input.data[is.na(eval(parse(text = x))), .N] == row.count)
      if(all(is.na(input.data[,eval(parse(text = x))])))
      {
        print_and_log(sprintf('Column \'%s\' removed from improbable_variant file because all where NA!',x)
                      ,'warning',display=.QC$config$debug$verbose)
        return(FALSE)
      }
      else
      {
        return(TRUE)
      }
    }else
    {
      return(FALSE)
    }
  })

  # get the list of required columns
  .QC$unwantedColumnsList <- names(wantedColumnsList[which(wantedColumnsList == FALSE)])
  wantedColumnsList <- names(wantedColumnsList[which(wantedColumnsList == TRUE)])



  na.rows <- which(!complete.cases(input.data[,wantedColumnsList,with=FALSE]))

  ## na.rows <- apply(input.data,1,anyNA) => very slow
  # na.rows <- which(!complete.cases(input.data))

  if(length(na.rows) > 0){

    # only require the first 100 rows of crucial columns and those that are not totally NA
    input.data <-  head(input.data[na.rows,union(crucialColumns,wantedColumnsList),with =FALSE],100)
    input.data.backup <-  head(input.data.backup[na.rows,union(crucialColumns,wantedColumnsList),with =FALSE],100)

    INVALID_COLUMN<-apply(input.data,
                          1,
                          function(x)
                          {
                            c<- which(is.na(x))
                            paste(names(c),collapse = '|')
                          })
    input.data.backup <- cbind(input.data.backup,INVALID_COLUMN)
    #	[filename_output]_SNPs_improbable_values.txt
    saveDataSet(input.data.backup,
                .QC$thisStudy$SNPs_improbable_values.path,
                columnSeparator = .QC$config$output_parameters$out_sep,
                naValue = .QC$config$output_parameters$out_na,
                decValue = .QC$config$output_parameters$out_dec,
				ordered = .QC$config$output_parameters$ordered)

  }
}


save_and_remove_unusable_variants <- function(input.data,input.data.backup) {

  #TODO break to two functions
  ## === 1 - first check allele columns => data is saved as SNPS_invalid_alleles.txt
  ## === 2 - check other crucial columns => data is saved as SNPS_removed.txt
  ## === 3 - remove from dataset

  ## 1
  ## save invalid allele variants as dataset

  invalid.allele.index.union <- union( .QC$thisStudy$column.INVALID.list$EFFECT_ALL,
                                       .QC$thisStudy$column.INVALID.list$OTHER_ALL)

  if(length(invalid.allele.index.union) > 0)
  {

    # [filename_output]_SNPs_invalid_allele.txt
    saveDataSet(input.data.backup[head(invalid.allele.index.union,100)],
                file.path = .QC$thisStudy$SNPs_invalid.path,
                columnSeparator = .QC$config$output_parameters$out_sep,
                naValue = .QC$config$output_parameters$out_na,
                decValue = .QC$config$output_parameters$out_dec,
				ordered = .QC$config$output_parameters$ordered)
  }

  ## find rows with NA in allele columns
  ## data is removed from dataset
  # invalid alleles are set to NA in processCOlumns() => process_column_EFFECT_ALL()
  # so, invalid.allele.index is a subset of missing.allele.index
  #


  ## 2
  ## find rows with NA in crucial columns (not alleles)
  ## first 100 is saved
  ## data is removed from dataset

  crucial.columns <- getCrucialColumnNames_onFileAnalysis()
  # allele columns are checked in previous phase (SNPs_invalid_allele) and not needed here
  crucial.columns <- crucial.columns[!crucial.columns %in% c('EFFECT_ALL','OTHER_ALL')]

  # add EFFECT to crucial columns if not present
  # this was removed from initial set because effect could be either beta , or
  if(!is.element('EFFECT',crucial.columns))
    crucial.columns <- append(crucial.columns,'EFFECT')


  crucial.col.index <- match(crucial.columns ,names(input.data))


  # get the list of NA values in each crucial column
  # column operaion is faster than row operation
  NA.list <- sapply(crucial.col.index, function(x)
    {
      which(is.na(input.data[,x,with=FALSE]))
    }
  )


  # check the union of missing rows indexes for each crucial column
  # this result set shows which rows have at least one missing values in a crucial column
  # NA.list.union <- unlist(NA.list[1])
  # for(i in 1:length(NA.list)){
  #   NA.list.union<-union(NA.list.union,
  #                        unlist(NA.list[[i]]))
  # }

  NA.list.union <- Reduce(union,NA.list) ## easier than above loop


  if(length(NA.list.union) > 0)
  {
    sample.data <- input.data.backup[head(NA.list.union,100)] ## 100 samples are saved from backup file, before invalid items are converted to NA

    # [filename_output]_SNPs_removed.txt
    saveDataSet(sample.data,
                file.path = .QC$thisStudy$SNPs_removed.path,
                columnSeparator = .QC$config$output_parameters$out_sep,
                naValue = .QC$config$output_parameters$out_na,
                decValue = .QC$config$output_parameters$out_dec,
				ordered = .QC$config$output_parameters$ordered)
  }

  ## 3
  # find union of missing values in crucial clumns with either allele columns for removing from dataset
  total.NA.list.union <- Reduce(union, list(NA.list.union,
                                            .QC$thisStudy$column.NA.list$OTHER_ALL,
                                            .QC$thisStudy$column.NA.list$EFFECT_ALL
  )
  )

  if(length(total.NA.list.union) > 0)
  {
    print_and_log(sprintf('%s variants that missed a crucial value were removed from dataset (step 1)!',thousand_sep(length(total.NA.list.union))),
                  'warning',display=.QC$config$debug$verbose)
    input.data <- input.data[!total.NA.list.union]
    .QC$thisStudy$missing.crucial.rowcount <- length(total.NA.list.union)
  }


  # also remove variants wher nchar(EA) > 1 and nchar(OA) > 1
  # e.g.    EA = GCGCGCGTA and OA = TGTG
  # input.data <- process_column_BOTH_ALL(input.data)


  return(input.data)
}

save_significant_variants <- function(input.data) {

  sig.variants <- input.data[HQ == TRUE &
                               PVALUE < 5e-08 &
                               startsWith(MARKER,'rs'),
                             list(CHR,POSITION,MARKER,PVALUE)]

  saveDataSet(sig.variants,
              .QC$thisStudy$SNPs_significant.path,
              columnSeparator = .QC$config$output_parameters$out_sep,
				ordered = .QC$config$output_parameters$ordered)

}



# this is for debugging purposes only
save_pre_modification_file <- function(pre.modification.file)
{

  if(.QC$config$debug$save_pre_modification_file)
    data.table::fwrite(pre.modification.file ,
                       file = .QC$config$paths$save_pre_modification_file ,
                       sep = '\t')
}

Try the GWASinspector package in your browser

Any scripts or data that you put into this service are public.

GWASinspector documentation built on Sept. 28, 2023, 1:06 a.m.