Nothing
getRSQLiteDatabase <- function(database.path){
if(file.exists(database.path)){
DB<-RSQLite::dbConnect(RSQLite::SQLite(),database.path)
return(DB)
}
else{
print_and_log(paste('database file not found at:',database.path),'fatal')
return(NULL)
}
}
getRSQLiteDatabase_tableCount <- function(database){
tblCount <- length(RSQLite::dbListTables(database))
if(tblCount > 0)
return(tblCount)
else
print_and_log('Database is empty!','fatal')
}
getRSQLiteDatabase_hasINDEL <- function(database){
# get the first table of database
tbl <- RSQLite::dbListTables(database)[1]
# check if it has INDELS
INDEL.count <- RSQLite::dbGetQuery(database,sprintf('select TSA from %s where TSA in ("insertion","deletion") LIMIT 5',tbl))
if(nrow(INDEL.count) == 5)
return(TRUE)
else
return(FALSE)
}
getRSQLiteDatabase_SubPopulationExists <- function(database){
population.Column = switch(.QC$config$supplementaryFiles$allele_ref_std_population,
'AMR'= 'AMR_AF',
'EUR'= 'EUR_AF',
'SAS'= 'SAS_AF',
'EAS' = 'EAS_AF',
'AFR'='AFR_AF',
'COMMON' = 'AF')
# get table fields from the first table of database
tblFields <- RSQLite::dbListFields(database, RSQLite::dbListTables(database)[1])
if(!is.element(population.Column , tblFields))
{
# display available population data
available.population <- c(character(0))
if(is.element('AF' , tblFields))
available.population <- cbind(available.population,'COMMON')
if(is.element('EUR_AF' , tblFields))
available.population <- cbind(available.population,'EUR')
if(is.element('AMR_AF' , tblFields))
available.population <- cbind(available.population,'AMR')
if(is.element('SAS_AF' , tblFields))
available.population <- cbind(available.population,'SAS')
if(is.element('EAS_AF' , tblFields))
available.population <- cbind(available.population,'EAS')
if(is.element('AMR_AF' , tblFields))
available.population <- cbind(available.population,'AMR')
available.population <- paste(available.population,collapse = ' | ')
available.population <- paste('Available population data:' , available.population)
print_and_log(sprintf('Allele frequency data for \'%s\' population is not found in database! %s',
.QC$config$supplementaryFiles$allele_ref_std_population,
available.population),
'fatal')
# print_and_log('COMMON super population code is selected by default.','warning')
#
# .QC$config$supplementaryFiles$allele_ref_std_population <- 'AF'
}
else
{
print_and_log(sprintf('Allele frequency data for \'%s\' population will be used.',
.QC$config$supplementaryFiles$allele_ref_std_population),
'info')
.QC$config$supplementaryFiles$allele_ref_std_population <- population.Column
}
}
getRSQLiteDatabase_tableNames <- function(database)
{
tableNames <- RSQLite::dbListTables(database)
return(as.data.table(tableNames))
}
compareInputfileWithReferenceDataBase <- function(input.data)
{
if(!is.element('CHR',names(input.data)))
{
print_and_log('CHR column is required for matching with reference database!','warning')
print_and_log('input file is ignored!','warning')
return(NULL)
}
rn <- unlist(input.data$hID)
if(.QC$config$output_parameters$add_column_rsid && ## get REF_RSID if user has selected in config file
is.element("REF_RSID",RSQLite::dbListFields(.QC$reference.data,'variants')))
{
rs <- RSQLite::dbGetQuery(.QC$reference.data,
sprintf('SELECT hID,REF_RSID,REF,ALT , %s as AF FROM variants WHERE "hID" = :x' ,
.QC$config$supplementaryFiles$allele_ref_std_population) ,
param = list(x = rn))
} else {
rs <- RSQLite::dbGetQuery(.QC$reference.data,
sprintf('SELECT hID,REF,ALT , %s as AF FROM variants WHERE "hID" = :x' ,
.QC$config$supplementaryFiles$allele_ref_std_population) ,
param = list(x = rn))
}
## merging data
if(is.null(data.table::key(input.data)) || any(data.table::key(input.data) != 'hID'))
data.table::setkey(input.data,hID)
rs <- data.table::setDT(rs , key = 'hID')
##input.data <- merge(x = input.data, y = rs, by.x = 'hID', by.y = 'hID',all.x=TRUE)
# removed MERGE with data table join
input.data <- rs[input.data]
# multi allelic variants will be matched more than once and create duplicated lines.
# only the first one is requiored and the rest should be removed.
dup.allele <- which(duplicated(input.data,by=c('hID','EFFECT_ALL','OTHER_ALL','REF','ALT')))
if(length(dup.allele) > 0)
{
#print_and_log(paste('found duplicates in reference matching :',length(dup.allele)),'warning')
input.data <- input.data[!dup.allele]
}
.QC$stored.reference.data <- rs
.QC$searched.hIDs <- rn
rm(rs)
rm(rn)
# a row may match to more than one result due to multiple values in refrence
# first check if there are any duplicated rows that AF of that population is zero. keep the one with AF>0
# FIXME second, check if there are still duplicated hIDs. delete all
# 1
# e.g A|GT 0
# A|GTGT 0.1
# duplicate.hID.zeroAF <- input.data[which(duplicated(input.data$hID) |
# duplicated(input.data$hID , fromLast = TRUE))][AF == 0 ,]$hID
# ignore variable of the duplicated variants on the same position is set to NA becuase one of them is removed in the next step
#input.data[duplicate.hID.zeroAF, ignore := as.numeric(NA)]
# input.data <- input.data[!which(is.element(input.data$hID, duplicate.hID.zeroAF) & AF == 0), ]
# remove varaints on the same chr:pos if it can not be found out
# these are
# 1- INS or DEL on the same position with different alleles
# 2- different INDEL types on the same position (INS/DEL)
input.data <- save_remove_ambiguous_variants(input.data)
# find multi-allelic variants
input.data[, MULTI_ALLELIC := ifelse(is.na(AF),
NA,
ifelse(grepl(',',AF),
1,
0))]
## try allele matching on multi-allelic variants
# if(is.element('Yes',.QC$thisStudy$tables$multi_allele_count_preProcess$`Multi-allelic`))
if(any(input.data$MULTI_ALLELIC == 1, na.rm = TRUE))
{
input.data[MULTI_ALLELIC == 1,
c('ALT','AF') := clean_multi_alleles(EFFECT_ALL , OTHER_ALL, REF, ALT, AF) ,
by = list(EFFECT_ALL , OTHER_ALL,REF, ALT,AF)]
# some multi-allele INDEL AFs are all 0 and will be returned the same way due to missing alleles
# e.g. AAC,AA,TT 0,0,0 ==> this AF can be converted to 0
#input.data[VT == 2 & MULTI_ALLELIC == 1 & grepl(',', AF) & all(strsplit(AF,',')[[1]] == "0") , AF := "0" ]
input.data[VT == 2 & MULTI_ALLELIC == 1 & grepl(',', AF) & !grepl('[1-9]',AF) , AF := "0" ]
## get frequency table for multi-allelic variants
#.QC$thisStudy$tables$multi_allele_count_postProcess <- getMultiAlleleCountTbl(input.data,'AF')
}
# AF column may be character type due to remaining ',' => convert to numeric
# AF of multi-allelics that could notbe matched are set to NA
if(!is.numeric(input.data$AF))
input.data[, AF := as.numeric(AF)]
# FIXME do not convert ALT to NA because it is used to count unmatched multiallelic variants
#input.data[is.na(AF) , `:=` (REF = NA , ALT = NA)]
# input.data[is.na(AF) , REF := NA ]
## add column for consistency with table version
input.data[,DATE_ADDED := NA ]
## add std_ref to found variants
input.data[!is.na(REF), SOURCE := 'Std_ref' ]
# ' REF , ALT , AF , DATE_ADDED , SOURCE' columns are added to input data
return(input.data)
}
compareInputfileWithStoredReferenceFile <- function(input.data)
{
# search variants that are new in this file and are not previously searched
new.hIDs <- unlist(input.data[!is.element(hID , .QC$searched.hIDs),hID])
# search new hIDs
rs <- search_database(new.hIDs)
# some information
# show how many variant information is in memory
# print_and_log(paste0('Variant info from previous search: ',nrow(.QC$stored.reference.data)),
# 'info',
# display=.QC$config$debug$verbose)
#
# print_and_log(paste0('Variants searched before: ',length(.QC$searched.hIDs)),
# 'info',
# display=.QC$config$debug$verbose)
#
# print_and_log(paste0('Variant not seen before: ',length(new.hIDs)),
# 'info',
# display=.QC$config$debug$verbose)
#
# print_and_log(paste0('New found variants: ',nrow(rs)),
# 'info',
# display=.QC$config$debug$verbose)
## ==========
# add new hIDs to the previously searched hID list
.QC$searched.hIDs <- c(.QC$searched.hIDs , new.hIDs)
# bind old and new found reference data
if(nrow(rs) > 0)
{
.QC$stored.reference.data <- list(.QC$stored.reference.data , rs)
.QC$stored.reference.data <- rbindlist(.QC$stored.reference.data)
}
# add key if not existing
if(is.null(data.table::key(.QC$stored.reference.data)) || any(data.table::key(.QC$stored.reference.data) != 'hID'))
data.table::setkey(.QC$stored.reference.data , hID)
if(is.null(data.table::key(input.data)) || any(data.table::key(input.data) != 'hID'))
data.table::setkey(input.data , hID)
input.data <- .QC$stored.reference.data[input.data]
# multi allelic variants will be matched more than once and create duplicated lines.
# only the first one is required and the rest should be removed.
dup.allele <- which(duplicated(input.data,by=c('hID','EFFECT_ALL','OTHER_ALL','REF','ALT')))
if(length(dup.allele) > 0)
{
#print_and_log(paste('found duplicates in reference matching :',length(dup.allele)),'warning')
input.data <- input.data[!dup.allele]
}
# a row may match to more than one result due to multiple values in reference
# first check if there are any duplicated rows that AF of that population is zero. keep the one with AF>0
# FIXME second, check if there are still duplicated hIDs. delete all
# 1
# e.g A|GT 0
# A|GTGT 0.1
# duplicate.hID.zeroAF <- input.data[which(duplicated(input.data$hID) |
# duplicated(input.data$hID , fromLast = TRUE))][AF == 0 ,]$hID
# ignore variable of the duplicated variants on the same position is set to NA becuase one of them is removed in the next step
#input.data[duplicate.hID.zeroAF, ignore := as.numeric(NA)]
# input.data <- input.data[!which(is.element(input.data$hID, duplicate.hID.zeroAF) & AF == 0), ]
# remove variants on the same chr:pos if it can not be found out
# these are
# 1- INS or DEL on the same position with different alleles
# 2- different INDEL types on the same position (INS/DEL)
input.data <- save_remove_ambiguous_variants(input.data)
# find multi-allelic variants
input.data[, MULTI_ALLELIC := ifelse(is.na(AF),
NA,
ifelse(grepl(',',AF),
1,
0))]
## try allele matching on multi-allelic variants
# if(is.element('Yes',.QC$thisStudy$tables$multi_allele_count_preProcess$`Multi-allelic`))
if(any(input.data$MULTI_ALLELIC == 1, na.rm = TRUE))
{
input.data[MULTI_ALLELIC == 1,
c('ALT','AF') := clean_multi_alleles(EFFECT_ALL , OTHER_ALL, REF, ALT, AF) ,
by = list(EFFECT_ALL , OTHER_ALL,REF, ALT,AF)]
# some multi-allele INDEL AFs are all 0 and will be returned the same way due to missing alleles
# e.g. AAC,AA,TT 0,0,0 ==> this AF can be converted to 0
#input.data[VT == 2 & MULTI_ALLELIC == 1 & grepl(',', AF) & all(strsplit(AF,',')[[1]] == "0") , AF := "0" ]
input.data[VT == 2 & MULTI_ALLELIC == 1 & grepl(',', AF) & !grepl('[1-9]',AF) , AF := "0" ]
## get frequency table for multi-allelic variants
#.QC$thisStudy$tables$multi_allele_count_postProcess <- getMultiAlleleCountTbl(input.data,'AF')
}
# AF column may be character type due to remaining ',' => convert to numeric
# AF of multi-allelics that could not be matched are set to NA
if(!is.numeric(input.data$AF))
input.data[, AF := as.numeric(AF)]
# FIXME do not convert ALT to NA because it is used to count unmatched multiallelic variants
#input.data[is.na(AF) , `:=` (REF = NA , ALT = NA)]
# input.data[is.na(AF) , REF := NA ]
## add column for consistency with table version
input.data[,DATE_ADDED := NA ]
## add std_ref to found variants
input.data[!is.na(REF), SOURCE := 'Std_ref' ]
# ' REF , ALT , AF , DATE_ADDED , SOURCE' columns are added to input data
return(input.data)
}
search_database <- function(variantIDs)
{
if(.QC$config$output_parameters$add_column_rsid && ## get REF_RSID if user has selected in config file
is.element("REF_RSID",RSQLite::dbListFields(.QC$reference.data,'variants')))
{
rs <- RSQLite::dbGetQuery(.QC$reference.data,
sprintf('SELECT hID,REF_RSID,REF,ALT , %s as AF FROM variants WHERE "hID" = :x' ,
.QC$config$supplementaryFiles$allele_ref_std_population) ,
param = list(x = variantIDs))
} else {
rs <- RSQLite::dbGetQuery(.QC$reference.data,
sprintf('SELECT hID,REF,ALT , %s as AF FROM variants WHERE "hID" = :x' ,
.QC$config$supplementaryFiles$allele_ref_std_population) ,
param = list(x = variantIDs))
}
rs <- data.table::setDT(rs,key = 'hID')
return(rs)
}
merge_rs <- function(id,rs)
{
# var <- rs[fmatch(id,rs$hID),]
# return(list(var$REF,var$ALT,var$AF))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.