R/variantMatchingFunctions.R

Defines functions compareInputfileWithBetaReferenceFile compareInputfileWithReferenceFile compareInputfileWithAlternateReferenceFile compareInputfileWithReferenceData

compareInputfileWithReferenceData <- function(input.data)
{
  #  .QC$reference.data ==> this can be a data table or a database object
  #  .QC$alt.reference.daOta
  ## 1
  if(!is.null(.QC$stored.reference.data))
    input.data <- compareInputfileWithStoredReferenceFile(input.data)
  else if(is.data.table(.QC$reference.data))
    input.data <- compareInputfileWithReferenceFile(input.data)
  else
    input.data <- compareInputfileWithReferenceDataBase(input.data)


  ## 2
  ## check the unknown variants with alternative reference file
  ## this refrence file will be empty if is not set by user and this step is automatically skipped

  input.data <- tryCatch(compareInputfileWithAlternateReferenceFile(input.data),
                         error = function(x)
                         {
                           print_and_log(paste('Error in searching alternate reference database:',x$message),
                                         'warning')
                           return(input.data)
                         })

  return(input.data)
}

compareInputfileWithAlternateReferenceFile <- function(input.data)
{
  #check if alt ref data has any rows & there are unfound variants
  if(nrow(.QC$alt.reference.data) > 0 & input.data[is.na(REF),.N] > 0 )
  {
    print_and_log('Comparing input file with alternate reference file ...','info')
    # we only want to check unfound variants in the alt ref dataset
    # we only need original file columns, so , added columns from checking with reference file from previous step should be deleted
    if(.QC$thisStudy$hID.added)
    {

      # double check KEY property for merging data
      if(is.null(data.table::key(.QC$alt.reference.data)) || data.table::key(.QC$alt.reference.data) != 'hID')
        data.table::setkey(.QC$alt.reference.data , hID)

      if(is.null(data.table::key(input.data)) || data.table::key(input.data) != 'hID')
        data.table::setkey(input.data , hID)


      # FIXME change merge to data table join
      tmp.data <- merge(x = input.data[is.na(REF) ,!c("REF","ALT","SOURCE","DATE_ADDED","AF")],
                        y = .QC$alt.reference.data,
                        by.x = "hID",
                        by.y = "hID",
                        all.x = TRUE)
    }
    else
    {

      setkey(input.data,"MARKER")
      setkey(.QC$alt.reference.data,"ID")

      tmp.data <- merge(x = input.data[is.na(REF) ,!c("REF","ALT","SOURCE","DATE_ADDED","AF")],
                        y = subset(.QC$alt.reference.data,
                                   select = c('ID','REF','ALT','AF','DATE_ADDED','SOURCE')),
                        by.x = "MARKER",
                        by.y = "ID",
                        all.x = TRUE)
    }



    # reomve ID column from alternate reference file
    if(is.element('ID', names(tmp.data)))
      tmp.data[,ID := NULL] ## removed for consistency with reference data

    if(!is.element('MULTI_ALLELIC', names(tmp.data)))
      tmp.data[,MULTI_ALLELIC := as.numeric(NA)] ## added for consistency with reference data

    # if(!is.element('ignore', names(tmp.data)))
    #   tmp.data[,ignore := NA] ## added for consistency with reference data

    tmp.data[,DATE_ADDED := as.character(DATE_ADDED)]
    input.data[,DATE_ADDED := as.character(DATE_ADDED)]

    # bind matched data with reference set with  matched data from alt reference set
    input.data<-rbind(input.data[!is.na(REF),],tmp.data)
    rm(tmp.data)
  }

  return(input.data)
}


compareInputfileWithReferenceFile<-function(input.data)
{

   # merge using hID if it Exists
  if(.QC$thisStudy$hID.added)
  {

    #set key for fast access
    setkey(.QC$reference.data,"hID")
    setkey(input.data,"hID")

    input.data<-merge(x=input.data,
                      y=.QC$reference.data,
                      by.x="hID",
                      by.y="hID",
                      all.x=TRUE)


  }
  else # merge using rsID if hID does not Exist
  {

    setkey(input.data,"MARKER")
    setkey(.QC$reference.data,"ID")

    input.data<-merge(x=input.data,
                      y=subset(.QC$reference.data,
                               select = c('ID','REF','ALT','AF','DATE_ADDED','SOURCE')),
                      by.x="MARKER",
                      by.y="ID",
                      all.x=TRUE)

  }

  # find multi-allelic variants
  input.data[, MULTI_ALLELIC := ifelse(grepl(',',AF),1,0)]

  ## get frequency table for multi-allelic variants
  ## moved to after step 3
  ##.QC$thisStudy$tables$multi_allele_count_preProcess <- getMultiAlleleCountTbl(input.data,'AF')


  ## TODO the following sectoin is similar to lines 194-234 rSQLiteFunctions.R
  ## try allele matching on multi-allelic variants
  # if(is.element('Yes',.QC$thisStudy$tables$multi_allele_count_preProcess$`Multi-allelic`))
  if(any(input.data$MULTI_ALLELIC == 1, na.rm = TRUE))
  {
    input.data[MULTI_ALLELIC == 1,
               c('ALT','AF') := clean_multi_alleles(EFFECT_ALL , OTHER_ALL, REF, ALT, AF) ,
               by = list(EFFECT_ALL , OTHER_ALL,REF, ALT,AF)]

    # some multi-allele INDEL AFs are all 0 and will be returned the same way due to missing alleles
    # e.g. AAC,AA,TT  0,0,0   ==> this AF can be converted to 0
    #input.data[VT == 2 & MULTI_ALLELIC == 1 &  grepl(',', AF) & all(strsplit(AF,',')[[1]] == "0") , AF := "0" ]
    input.data[VT == 2 & MULTI_ALLELIC == 1 &  grepl(',', AF) & !grepl('[1-9]',AF) , AF := "0" ]

    ## get frequency table for multi-allelic variants
    #.QC$thisStudy$tables$multi_allele_count_postProcess <- getMultiAlleleCountTbl(input.data,'AF')

  }




  # AF column may be character type due to remaining ',' => convert to numeric
  # AF of multi-allelics than could notbe matched are set as NA
  if(!is.numeric(input.data$AF))
    input.data[, AF := as.numeric(AF)]


  # FIXME do not convert ALT to NA because it is used to count unmatched multiallelic variants
  #input.data[is.na(AF) , `:=` (REF = NA , ALT = NA)]
  # input.data[is.na(AF) , REF := NA ]




  ## add column for consistency with table version
  input.data[,DATE_ADDED := NA ]

  ## add std_ref to found variants
  input.data[!is.na(REF), SOURCE := 'Std_ref' ]

  # ' REF , ALT , AF , DATE_ADDED , SOURCE' columns are added to input data

  return(input.data)
}



compareInputfileWithBetaReferenceFile<-function(input.data)
{


  m_b <- mean(input.data$EFFECT)
  sd_b <- sd(input.data$EFFECT)
  ci_b <- m_b + (5 * sd_b)


  if(is.element('hID', names(input.data)))
  {
    input.data <- input.data[,c('EFFECT_ALL', 'OTHER_ALL',  'EFFECT','HQ','hID','PVALUE')]
    setkey(.QC$reference.data.effect, "hID")
    #setkey(input.data, "hID")

    ## 1
    # merge data with reference file
    matched.data<-merge(x=.QC$reference.data.effect,
                        y=input.data,
                        by="hID",
                        all.x=TRUE)

  }
  else
  {
    print_and_log('hID is missing. Can not compare input file with reference dataset.','warning',display=.QC$config$debug$verbose)
    return(data.table())
  }


  matched.data <- matched.data[HQ == TRUE &
                                 !is.na(EFFECT.y) &
                                 EFFECT_ALL == ALT &
                                 OTHER_ALL == REF &
                                 EFFECT.y > (-1 * ci_b) &
                                 EFFECT.y < ci_b , ]



  # p-value of the variants in reference dataset is < 0.001
  # X: reference Beta
  # Y: input file Beta
  .QC$thisStudy$effect.rho_3 <- signif(cor(matched.data$EFFECT.x ,matched.data$EFFECT.y),4)
  .QC$thisStudy$effect.rho_3.n <- nrow(matched.data)

  temp_data <- matched.data[PVALUE.x < 0.0001,]
  .QC$thisStudy$effect.rho_4 <- signif(cor(temp_data$EFFECT.x ,temp_data$EFFECT.y),4)
  .QC$thisStudy$effect.rho_4.n <- nrow(temp_data)

  temp_data <- matched.data[PVALUE.x < 0.00001,]
  .QC$thisStudy$effect.rho_5 <- signif(cor(temp_data$EFFECT.x ,temp_data$EFFECT.y),4)
  .QC$thisStudy$effect.rho_5.n <- nrow(temp_data)


  temp_data <- matched.data[PVALUE.x < 0.000001,]
  .QC$thisStudy$effect.rho_6 <- signif(cor(temp_data$EFFECT.x ,temp_data$EFFECT.y),4)
  .QC$thisStudy$effect.rho_6.n <- nrow(temp_data)


    # filtering the variants in input rsult files on P_value
  temp_data <- matched.data[PVALUE.y < 0.001,]
  .QC$thisStudy$effect.rho_3.y <- signif(cor(temp_data$EFFECT.x ,temp_data$EFFECT.y),4)
  .QC$thisStudy$effect.rho_3.n.y <- nrow(temp_data)


  temp_data <- matched.data[PVALUE.y < 0.0001,]
  .QC$thisStudy$effect.rho_4.y <- signif(cor(temp_data$EFFECT.x ,temp_data$EFFECT.y),4)
  .QC$thisStudy$effect.rho_4.n.y <- nrow(temp_data)

  temp_data <- matched.data[PVALUE.y < 0.00001,]
  .QC$thisStudy$effect.rho_5.y <- signif(cor(temp_data$EFFECT.x ,temp_data$EFFECT.y),4)
  .QC$thisStudy$effect.rho_5.n.y <- nrow(temp_data)


  temp_data <- matched.data[PVALUE.y < 0.000001,]
  .QC$thisStudy$effect.rho_6.y <- signif(cor(temp_data$EFFECT.x ,temp_data$EFFECT.y),4)
  .QC$thisStudy$effect.rho_6.n.y <- nrow(temp_data)


  .QC$thisStudy$tables$betaCor.tbl <- t(data.table( 'P-value < 0.001' = c(sprintf("%s (%s)",.QC$thisStudy$effect.rho_3 , .QC$thisStudy$effect.rho_3.n),
                                                                          sprintf("%s (%s)",.QC$thisStudy$effect.rho_3.y , .QC$thisStudy$effect.rho_3.n.y)),
                                                    'P-value < 0.0001' = c(sprintf("%s (%s)",.QC$thisStudy$effect.rho_4 , .QC$thisStudy$effect.rho_4.n),
                                                                           sprintf("%s (%s)",.QC$thisStudy$effect.rho_4.y , .QC$thisStudy$effect.rho_4.n.y)),
                                                    'P-value < 0.00001' = c(sprintf("%s (%s)",.QC$thisStudy$effect.rho_5 , .QC$thisStudy$effect.rho_5.n),
                                                                            sprintf("%s (%s)",.QC$thisStudy$effect.rho_5.y , .QC$thisStudy$effect.rho_5.n.y)),
                                                    'P-value < 0.000001' = c(sprintf("%s (%s)",.QC$thisStudy$effect.rho_6 , .QC$thisStudy$effect.rho_6.n),
                                                                             sprintf("%s (%s)",.QC$thisStudy$effect.rho_6.y , .QC$thisStudy$effect.rho_6.n.y))))

  colnames(.QC$thisStudy$tables$betaCor.tbl) <- c('r*','r**')

  return(matched.data[PVALUE.x < 0.0001,])
}

Try the GWASinspector package in your browser

Any scripts or data that you put into this service are public.

GWASinspector documentation built on Sept. 28, 2023, 1:06 a.m.