R/processColumns.R

Defines functions process_column_MARKER process_column_N_TOTAL process_column_N_CASES process_column_CALLRATE process_column_IMP_QUALITY process_column_HWE_PVAL process_column_EFF_ALL_FREQ process_column_PVALUE process_column_STDERR process_column_EFFECT process_column_POSITION process_column_STRAND process_column_IMPUTED process_column_BOTH_ALL process_column_OTHER_ALL process_column_EFFECT_ALL deconvert_column_CHR process_column_CHR

#### COLUMN NAMES
# CALLRATE
# CHR
# EFF_ALL_FREQ
# EFFECT
# EFFECT_ALL
# HWE_PVAL
# IMP_QUALITY
# IMPUTED
# MARKER
# N_TOTAL
# OTHER_ALL
# POSITION
# PVALUE
# STDERR
# STRAND




##CHR column should be as character type
process_column_CHR <- function(input.data){

  .QC$thisStudy$character.chromosome <- FALSE

  if(!is.numeric(input.data$CHR))
  {

    # check if column has character values
    # they are converted to numbers here, but should be deconverted to character before saving final dataset
    .QC$thisStudy$character.chromosome <- any(input.data$CHR %in% c('0X' ,'0Y','X','Y','XY','M','MT'))

    input.data[CHR == '0X', CHR := '23']
    input.data[CHR == 'X'  , CHR := '23']
    input.data[CHR == 'Y'  , CHR := '24']
    input.data[CHR == '0Y'  , CHR := '24']
    input.data[CHR == 'XY' , CHR := '25']
    input.data[CHR == 'M'  , CHR := '26']
    input.data[CHR == 'MT' , CHR := '26']


    #  input.data$CHR<-as.numeric(input.data$CHR)
    input.data[,CHR := as.numeric(CHR)]
  }

  # convert out of range chromosome values wiht NA
  invalid.items <- which(input.data$CHR < 1 | input.data$CHR > 26)

  .QC$thisStudy$column.INVALID.list$CHR <- invalid.items

  if(length(invalid.items) > 0){
    input.data[invalid.items , CHR := NA]
  }

  .QC$thisStudy$column.NA.list$CHR <- which(is.na(input.data$CHR))


  return(input.data)
}

## convert chromosomes from numeric to character
deconvert_column_CHR <- function(input.data){

  if(!is.character(input.data$CHR))
    input.data[,CHR := as.character(CHR)]
  #input.data$CHR<-as.character(input.data$CHR)

  input.data[CHR == '23'  , CHR := 'X']
  input.data[CHR == '24'  , CHR := 'Y']
  input.data[CHR == '25' , CHR := 'XY']
  input.data[CHR == '26'  , CHR := 'M']

  return(input.data)
}

##########
process_column_EFFECT_ALL<- function(input.data){

  ## convert empty strings to NA
  input.data[trimws(EFFECT_ALL) == '' , EFFECT_ALL := NA]

  invalid.items <- which(!is.na(input.data$EFFECT_ALL) & !grepl(pattern = '^[ATCGIDR]+$', x = input.data$EFFECT_ALL ,perl = TRUE))

  .QC$thisStudy$column.INVALID.list$EFFECT_ALL <- invalid.items

  if(length(invalid.items) > 0){
    input.data[invalid.items, EFFECT_ALL := NA]
  }

  .QC$thisStudy$column.NA.list$EFFECT_ALL <- which(is.na(input.data$EFFECT_ALL))

  # chekc if allele has none Base characters
  if(any(is.element(input.data$EFFECT_ALL,c('D','I','R'))))
  {
    .QC$thisStudy$hanNoneBaseAlleles <- TRUE
    print_and_log('File uses non-standard characters for alleles!','warning',display=.QC$config$debug$verbose)
  }

  return(input.data)
}

##########
process_column_OTHER_ALL<- function(input.data){

  ## convert empty strings to NA
  input.data[trimws(OTHER_ALL) == '' , OTHER_ALL := NA]

  invalid.items <- which(!is.na(input.data$OTHER_ALL) & !grepl(pattern = '^[ATCGIDR]+$', x = input.data$OTHER_ALL ,perl = TRUE))

  .QC$thisStudy$column.INVALID.list$OTHER_ALL <- invalid.items

  if(length(invalid.items) > 0){
    input.data[invalid.items, OTHER_ALL := NA]
  }

  .QC$thisStudy$column.NA.list$OTHER_ALL <- which(is.na(input.data$OTHER_ALL))

  return(input.data)
}


##########

process_column_BOTH_ALL <- function(input.data) {

  invalid.items <- which(nchar(input.data$OTHER_ALL) > 1 & nchar(input.data$EFFECT_ALL) > 1)

  if(length(invalid.items) > 0){

    .QC$thisStudy$column.INVALID.list$BOTH_ALL <- invalid.items
    reqColumns <- .QC$thisStudy$renamed.File.Columns.sorted
    sample.data <- input.data[head(invalid.items,100), ..reqColumns] ## 100 samples are saved

    saveDataSet(sample.data,
                  file.path = .QC$thisStudy$SNPs_invalid_both.path,
                  columnSeparator = .QC$config$output_parameters$out_sep,
                  naValue = .QC$config$output_parameters$out_na,
                  decValue = .QC$config$output_parameters$out_dec,
                  ordered = .QC$config$output_parameters$ordered)


    print_and_log(sprintf('%s variants with conflicting alleles were found.',
                          thousand_sep(length(invalid.items))),
                  'warning',
                  display=.QC$config$debug$verbose)

	input.data <- input.data[!invalid.items,]

  }
  return(input.data)
}

##########
process_column_IMPUTED<- function(input.data){
  config <- .QC$config
  # FIXME check if convert to uppercase is required for as.logical function
  # FIXME TRUE FALSE or 0 1


  # input.data$IMPUTED<-as.logical(input.data$IMPUTED) ## this is new conversion fron char to logical
  #converting from logical to numeric converts T to 1

  if(!is.numeric(input.data$IMPUTED)){

    input.data[,IMPUTED := toupper(IMPUTED)]

    input.data[!is.na(IMPUTED),  IMPUTED := gsub(pattern = config$input_parameters$imputed_T,
                                                 x = IMPUTED,
                                                 replacement = 1)]

    input.data[!is.na(IMPUTED),  IMPUTED := gsub(pattern = config$input_parameters$imputed_F,
                                                 x = IMPUTED,
                                                 replacement = 0)]


    # input.data$IMPUTED= gsub(pattern = config$input_parameters$imputed_T,
    #                          x = input.data$IMPUTED,
    #                          replacement = 1)
    # input.data$IMPUTED= gsub(pattern = config$input_parameters$imputed_F,
    #                          x = input.data$IMPUTED,
    #                          replacement = 0)

    #input.data$IMPUTED <- as.numeric(input.data$IMPUTED)
    input.data[,IMPUTED := as.numeric(IMPUTED)]
  }

  ## Check for inavlid or wrong or missing items => set them to NA
  ## used for report
  invalid.items <- which(input.data$IMPUTED != 1 & input.data$IMPUTED != 0)

  .QC$thisStudy$column.INVALID.list$IMPUTED <- invalid.items

  if(length(invalid.items) > 0){
    input.data[invalid.items, IMPUTED := NA]
  }


  .QC$thisStudy$column.NA.list$IMPUTED <- which(is.na(input.data$IMPUTED))



  return(input.data)
}


##########
process_column_STRAND<- function(input.data){

  input.data<-switchNegativeStrandsToPositive(input.data) ## variantModifierFUnction.R

  # convert empty strings to NA
  input.data[trimws(STRAND) == '' ,STRAND := NA]

  # all negative strands are converted to positive, so strand  is invalid if not NA or +
  invalid.items <- which(!is.na(input.data$STRAND) & input.data$STRAND != '+')
  .QC$thisStudy$column.INVALID.list$STRAND <- invalid.items

  if(length(invalid.items) > 0){
    input.data[invalid.items, STRAND := NA]
  }


  .QC$thisStudy$column.NA.list$STRAND <- which(is.na(input.data$STRAND))

  return(input.data)
}


##########
process_column_POSITION<- function(input.data){
  if(!is.numeric(input.data$POSITION))
    input.data[,POSITION := as.numeric(POSITION)]


  ## Check for inavlid or wrong or missing items => set them to NA
  ## used for report
  invalid.items <- which(input.data$POSITION <= 0)

  .QC$thisStudy$column.INVALID.list$POSITION <- invalid.items

  if(length(invalid.items) > 0){
    input.data[invalid.items, POSITION := NA]
  }

  ##
  .QC$thisStudy$column.NA.list$POSITION <- which(is.na(input.data$POSITION))

  return(input.data)
}

##########
process_column_EFFECT<- function(input.data){

  # ==== convert Beta to OR
  # odds_ratio = exp(Beta)
  # Beta = natural log(odds_taio)

  if(.QC$config$input_parameters$effect_type == 'BETA')
  {
    # change the name of beta column to EFFECT in input data
    names(input.data)[names(input.data) == 'BETA'] <- 'EFFECT'

    # change the name of beta column to EFFECT in renamed columns
    .QC$thisStudy$renamed.File.Columns.sorted[grepl(x = .QC$thisStudy$renamed.File.Columns.sorted, pattern = 'BETA')] =
      'EFFECT'
  }
  else
  {
    if(!is.numeric(input.data$OR))
      input.data[,OR := as.numeric(OR)]

    input.data[,EFFECT := log(OR)]

    # check if any of the EFFECT values are Inf and remove them
    if(any(is.infinite(input.data$EFFECT)))
    {
      infinite.ORs <- which(is.infinite(input.data$EFFECT))

      if(length(infinite.ORs) > 0)
      {
        print_and_log(paste0('variants with Infinite OR value are removed from input file: ',length(infinite.ORs)),'warning',display=.QC$config$debug$verbose)
        input.data[is.infinite(EFFECT), EFFECT := NA]
        # input.data <- input.data[!infinite.ORs,]
      }
    }
  }

  # ===

  if(!is.numeric(input.data$EFFECT))
    input.data[,EFFECT := as.numeric(EFFECT)]



  # EFFECT = -1 is not invalid anymore in recent analysis
  # invalid.items <- which(input.data$EFFECT == -1 &
  #                          (input.data$PVALUE == -1 | input.data$STDERR == -1))
  #
  # .QC$thisStudy$column.INVALID.list$EFFECT <- invalid.items
  #
  # if(length(invalid.items) > 0){
  #   input.data[invalid.items, EFFECT := NA]
  # }

  .QC$thisStudy$column.NA.list$EFFECT <- which(is.na(input.data$EFFECT))


  return(input.data)
}

##########
process_column_STDERR<- function(input.data){
  if(!is.numeric(input.data$STDERR))
    input.data[,STDERR := as.numeric(STDERR)]
  #input.data$STDERR<-as.numeric(input.data$STDERR)


  ## Check for inavlid or wrong or missing items => set them to NA
  ## used for report
  uncertain.items <- which(input.data$STDERR == 0) ## it may be due too poor rounding

  .QC$thisStudy$column.INVALID.list$zero.STDERR <- uncertain.items
  if(length(uncertain.items) > 0){
    input.data[uncertain.items, STDERR := NA]
  }


  invalid.items <- which(input.data$STDERR < 0 | input.data$STDERR == Inf) ## -1 is covered as < 0
  .QC$thisStudy$column.INVALID.list$STDERR <- invalid.items

  if(length(invalid.items) > 0){
    input.data[invalid.items, STDERR := NA]
  }

  .QC$thisStudy$column.NA.list$STDERR <- which(is.na(input.data$STDERR))


  return(input.data)
}


##########
process_column_PVALUE<- function(input.data){
  if(!is.numeric(input.data$PVALUE))
    input.data[,PVALUE := as.numeric(PVALUE)]
  # input.data$PVALUE<-as.numeric(input.data$PVALUE)


  ## Check for inavlid or wrong or missing items => set them to NA
  ## used for report
  uncertain.items <- which(input.data$PVALUE == -1 )
  .QC$thisStudy$column.INVALID.list$minusone.PVALUE <- uncertain.items

  if(length(uncertain.items) > 0){
    input.data[uncertain.items, PVALUE := NA]
  }


  invalid.items <- which(input.data$PVALUE > 1 | input.data$PVALUE <= 0)
  .QC$thisStudy$column.INVALID.list$PVALUE <- invalid.items

  if(length(invalid.items) > 0){
    input.data[invalid.items, PVALUE := NA]
  }

  .QC$thisStudy$column.NA.list$PVALUE <- which(is.na(input.data$PVALUE))

  return(input.data)
}

##########
process_column_EFF_ALL_FREQ<- function(input.data){
  if(!is.numeric(input.data$EFF_ALL_FREQ))
    input.data[,EFF_ALL_FREQ := as.numeric(EFF_ALL_FREQ)]
  #input.data$EFF_ALL_FREQ<-as.numeric(input.data$EFF_ALL_FREQ)


  .QC$thisStudy$column.INVALID.list$one.EFF_ALL_FREQ <- which(input.data$EFF_ALL_FREQ == 1)
  .QC$thisStudy$column.INVALID.list$zero.EFF_ALL_FREQ <- which(input.data$EFF_ALL_FREQ == 0)

  ## Check for inavlid or wrong or missing items => set them to NA
  ## used for report
  uncertain.items <- which(input.data$EFF_ALL_FREQ == -1 )
  .QC$thisStudy$column.INVALID.list$minusone.EFF_ALL_FREQ <- uncertain.items

  if(length(uncertain.items) > 0){
    input.data[uncertain.items, EFF_ALL_FREQ := NA]
  }



  invalid.items <- which(input.data$EFF_ALL_FREQ > 1 |
                           input.data$EFF_ALL_FREQ < 0) ## equal to 1 or 0 is monomorphic

  .QC$thisStudy$column.INVALID.list$EFF_ALL_FREQ <- invalid.items

  if(length(invalid.items) > 0){
    input.data[invalid.items, EFF_ALL_FREQ := NA]
  }


  .QC$thisStudy$column.NA.list$EFF_ALL_FREQ <- which(is.na(input.data$EFF_ALL_FREQ))

  return(input.data)
}

##########
process_column_HWE_PVAL<- function(input.data){
  if(!is.numeric(input.data$HWE_PVAL))
    input.data[,HWE_PVAL := as.numeric(HWE_PVAL)]
  #input.data$HWE_PVAL<-as.numeric(input.data$HWE_PVAL)



  ## Check for inavlid or wrong or missing items => set them to NA
  ## used for report
  uncertain.items <- which(input.data$HWE_PVAL == -1 )

  .QC$thisStudy$column.INVALID.list$minusone.HWE_PVAL <- uncertain.items

  if(length(uncertain.items) > 0){
    input.data[uncertain.items, HWE_PVAL := NA]

  }


  invalid.items <- which(input.data$HWE_PVAL > 1 |
                           input.data$HWE_PVAL <= 0)

  .QC$thisStudy$column.INVALID.list$HWE_PVAL <- invalid.items

  if(length(invalid.items) > 0){
    input.data[invalid.items, HWE_PVAL := NA]
  }


  .QC$thisStudy$column.NA.list$HWE_PVAL <- which(is.na(input.data$HWE_PVAL))

  # Fixed HWE P-value
  if(length(unique(input.data$HWE_PVAL)) == 1)
    .QC$thisStudy$fixed.hwep <- sprintf('YES (%s)' , input.data[1]$HWE_PVAL)
  else
    .QC$thisStudy$fixed.hwep <- 'No'


  return(input.data)
}

##########
process_column_IMP_QUALITY<- function(input.data){
  config <- .QC$config

  if(!is.numeric(input.data$IMP_QUALITY))
    input.data[,IMP_QUALITY := as.numeric(IMP_QUALITY)]
  #input.data$IMP_QUALITY<-as.numeric(input.data$IMP_QUALITY)


  ## Check for inavlid or wrong or missing items => set them to NA
  ## used for report
  invalid.items <- which(input.data$IMP_QUALITY <= config$filters$minimal_impQ_value |
                           input.data$IMP_QUALITY >= config$filters$maximal_impQ_value  |
                           input.data$IMP_QUALITY == Inf)

  .QC$thisStudy$column.INVALID.list$IMP_QUALITY <- invalid.items

  if(length(invalid.items) > 0){
    input.data[invalid.items, IMP_QUALITY := NA]
  }

  .QC$thisStudy$column.NA.list$IMP_QUALITY <- which(is.na(input.data$IMP_QUALITY))



  # Fixed imputation quality
  if(length(unique(input.data$IMP_QUALITY)) == 1)
    .QC$thisStudy$fixed.impq <-  sprintf('YES (%s)' , input.data[1]$IMP_QUALITY)
  else
    .QC$thisStudy$fixed.impq <- 'No'



  return(input.data)
}

##########
process_column_CALLRATE<- function(input.data){
  if(!is.numeric(input.data$CALLRATE))
    input.data[,CALLRATE := as.numeric(CALLRATE)]
  #input.data$CALLRATE<-as.numeric(input.data$CALLRATE)


  ## Check for inavlid or wrong or missing items => set them to NA
  ## used for report
  uncertain.items <- which(input.data$CALLRATE == -1 )
  .QC$thisStudy$column.INVALID.list$minusone.CALLRATE <- uncertain.items

  if(length(uncertain.items) > 0){
    input.data[uncertain.items, CALLRATE := NA]
  }


  invalid.items <- which(input.data$CALLRATE > 1 | input.data$CALLRATE < 0)

  .QC$thisStudy$column.INVALID.list$CALLRATE <- invalid.items

  if(length(invalid.items) > 0){
    input.data[invalid.items, CALLRATE := NA]
  }


  .QC$thisStudy$column.NA.list$CALLRATE <- which(is.na(input.data$CALLRATE))


  #  Fixed call rate
  if(length(unique(input.data$CALLRATE)) == 1)
    .QC$thisStudy$fixed.callrate <- sprintf('YES (%s)' , input.data[1]$CALLRATE)
  else
    .QC$thisStudy$fixed.callrate <- 'No'



  return(input.data)
}

##########

process_column_N_CASES<- function(input.data){

  if(!is.numeric(input.data$N_CASES))
    input.data[,N_CASES := as.numeric(N_CASES)]

  .QC$thisStudy$MAX_N_CASES <- max(input.data$N_CASES,na.rm = TRUE)

  if(.QC$thisStudy$MAX_N_CASES == Inf | .QC$thisStudy$MAX_N_CASES == -Inf)
  {
    print_and_log('Maximum of N_CASES was Inf and was converted to NA!',
                  'warning',
                  display=.QC$config$debug$verbose)
    .QC$thisStudy$MAX_N_CASES = NA
  }
  else if(.QC$thisStudy$MAX_N_CASES %% 1 != 0) # round the value if it has decimal points
  {
    print_and_log(sprintf("%s %s ==> %s",
                          'Maximum of N_CASES had decimal point and was rounded!',
                          .QC$thisStudy$MAX_N_CASES,
                          round(.QC$thisStudy$MAX_N_CASES ,digits = 0)),
                  'warning',display=.QC$config$debug$verbose)

    .QC$thisStudy$MAX_N_CASES = round(.QC$thisStudy$MAX_N_CASES ,digits = 0)
  }


  # Fixed sample size
  if(length(unique(input.data$N_CASES)) == 1)
    print_and_log(sprintf("%s %s",
                          'N_CASES is fixed!',
                          input.data[1]$N_CASES),
                  'warning',display=.QC$config$debug$verbose)

}

########################################

process_column_N_TOTAL<- function(input.data){
  if(!is.numeric(input.data$N_TOTAL))
    input.data[,N_TOTAL := as.numeric(N_TOTAL)]
  #input.data$N_TOTAL<-as.numeric(input.data$N_TOTAL)


  ## Check for inavlid or wrong or missing items => set them to NA
  ## used for report
  invalid.items <- which(input.data$N_TOTAL <= 0)

  .QC$thisStudy$column.INVALID.list$N_TOTAL <- invalid.items

  if(length(invalid.items) > 0){
    input.data[invalid.items, N_TOTAL := NA]
  }

  .QC$thisStudy$column.NA.list$N_TOTAL <- which(is.na(input.data$N_TOTAL))

  # maximum number of N
  .QC$thisStudy$MAX_N_TOTAL <- max(input.data$N_TOTAL,na.rm = TRUE)

  if(.QC$thisStudy$MAX_N_TOTAL == Inf | .QC$thisStudy$MAX_N_TOTAL == -Inf)
  {
    print_and_log(sprintf("%s %s %s",
                          'Maximum of N_TOTAL was ', .QC$thisStudy$MAX_N_TOTAL,' and was converted to NA!'),
                  'warning',
                  display=.QC$config$debug$verbose)

    .QC$thisStudy$MAX_N_TOTAL = NA
  }
  else if(.QC$thisStudy$MAX_N_TOTAL %% 1 != 0) # round the value if it has decimal points
  {
    print_and_log(sprintf("%s %s ==> %s",
                          'Maximum of N_TOTAL had decimal point and was rounded!',
                          .QC$thisStudy$MAX_N_TOTAL,
                          round(.QC$thisStudy$MAX_N_TOTAL ,digits = 0)),
                  'warning',display=.QC$config$debug$verbose)

    .QC$thisStudy$MAX_N_TOTAL = round(.QC$thisStudy$MAX_N_TOTAL ,digits = 0)
  }


  # Fixed sample size
  if(length(unique(input.data$N_TOTAL)) == 1)
    .QC$thisStudy$fixed.n_total <- sprintf('YES (%s)' , input.data[1]$N_TOTAL)
  else
    .QC$thisStudy$fixed.n_total <- 'No'


  return(input.data)
}


process_column_MARKER<- function(input.data){

  ## FIXME nothing set for invalid marker
  .QC$thisStudy$column.INVALID.list$MARKER <- numeric(0L)

  # TODO we are using hID so no need for rsID checking ?!?!
  .QC$thisStudy$column.NA.list$MARKER <- which(is.na(input.data$MARKER))

  return(input.data)
}

Try the GWASinspector package in your browser

Any scripts or data that you put into this service are public.

GWASinspector documentation built on Sept. 28, 2023, 1:06 a.m.