#' Title
#'
#' @param variants object of class vcfR-class with the patient variants.
#' @param sampleName character name with the patient code as is written in the VCF file.
#' @param filter character name with the desired filter to select the variants. It should be as is written in the FILTER column in the vcf file.
#' @param isCodingVar logical indicating whether to consider only variants in coding regions. Default TRUE.
#' @param frequenceAR numerical value indicating the MAF(minor allele frequence) to filter out the variants in autosomal recessive and X-linked inheritance. Default 0.01
#' @param frequenceAD numerical value indicating the MAF(minor allele frequence) to filter out the variants in autosomal dominant recessive inheritance. Default 0.00005
#' @param GlobalPhenotypicScore matrix with the phenotypic metrics obtained from the MatrixPropagation function.
#' @param assembly Genome assembly used. Default assembly human GRCh37.
#' @param processors Default=1.
#'
#' @return data frame with the patient's variants and their associated information classified from most to least likely to be the cause of the patient's phenotype.
#' @export
#'
#' @examples
#' library(vcfR)
#' patientHPOsFile <- paste(system.file("extdata/example", package = "ClinPrior"),"HPOpatient.txt",sep="/")
#' HPOpatient <- unique(read.csv(patientHPOsFile, header = FALSE, sep = "\t")[, 1])
#' Y<-proteinScore(HPOpatient)
#' ClinPriorGeneScore<-MatrixPropagation(Y,alpha=0.2)
#'
#' vcfFile = paste(system.file("extdata/example", package = "ClinPrior"),"HG001_GRCh37_1_22_v4.2.1_benchmark.vep01.KCNQ2Met546Thr.vcf.gz",sep="/")
#' variants <- read.vcfR(vcfFile)
#' variantsFiltered <- readVCF(sampleName = "HG001",variants=variants)
#' result = priorBestVariantVcfR(variants = variantsFiltered, sampleName = "HG001",GlobalPhenotypicScore = ClinPriorGeneScore)
priorBestVariantVcfR<-function(variants, sampleName, filter = "",isCodingVar = TRUE, frequenceAR = 0.01,frequenceAD = 0.00005, GlobalPhenotypicScore, assembly="assembly37",processors=1)
{
variants <- variants
assembly <<- assembly
#load files
dest = system.file(paste("extdata",assembly,sep = "/"), package = "ClinPrior")
ClinPriorfiles<-list.files(path = dest, full.names = TRUE)
CCRautosomesFile = ClinPriorfiles[grep("ccrs.autosomes.*gz$",ClinPriorfiles)]
CCRxchromFile = ClinPriorfiles[grep("ccrs.xchrom.*gz$",ClinPriorfiles, perl = TRUE)]
blacklistFile = ClinPriorfiles[grep("blacklist",ClinPriorfiles)]
#CADDsnvFile = ClinPriorfiles[grep("CADD_SNV.*gz$",ClinPriorfiles, perl = TRUE)]
#CADDindelFile = ClinPriorfiles[grep("CADD_Indels.*gz$",ClinPriorfiles, perl = TRUE)]
#exomeFile = ClinPriorfiles[grep("exomes.*Below001.*gz$",ClinPriorfiles, perl = TRUE)]
#genomeFile = ClinPriorfiles[grep("genomes.*Below001.*gz$",ClinPriorfiles, perl = TRUE)]
#clinvarFile = ClinPriorfiles[grep("clinvar.*gz$",ClinPriorfiles, perl = TRUE)]
loadVariants<-function(variants)
{
library(vcfR)
ObjectVcfR<-read.vcfR(file=vcfFile)
gt = variants@gt
pos = match(sampleName, colnames(gt))
gtPOS = grep("0/0",gt[,pos],invert=TRUE)
variantsObject = variants[gtPOS]
return(variantsObject)
}
#functions
getAlt_forMutliAllelic<-function(variants)
{
gt = variants@gt
pos = match(sampleName, colnames(gt))
options(warn=-1)
genotype = do.call(rbind, strsplit(as.character(gt[,pos]), ":"))[,1]
options(warn=0)
m<-regexpr("(\\d{1,2}\\/)|(\\d{1,2}\\|)",genotype,perl = TRUE)
genotype1<-regmatches(genotype, m)
genotype1<-gsub("\\||\\/", "",genotype1,perl=TRUE)
m<-regexpr("(\\/\\d{1,2})|(\\|\\d{1,2})",genotype,perl = TRUE)
genotype2<-regmatches(genotype, m)
genotype2<-gsub("\\||\\/", "",genotype2,perl=TRUE)
alternative = getFIX(variants)[,5]
m<-regexpr("(\\w+\\,|\\w+)",alternative,perl = TRUE)
allel1<-regmatches(alternative, m)
allel1<-gsub("\\,", "",allel1,perl=TRUE)
m<-regexpr("(\\,\\w+)",alternative,perl = TRUE)
posAllel2<-grep("(\\,\\w+)",alternative,perl = TRUE)
allel2<-regmatches(alternative, m)
allel2<-gsub("\\,", "",allel2,perl=TRUE)
#GENOTYPE2
alt<-matrix(0,ncol = 1, nrow = dim(variants)[1])
#0/1
if(length(grep("1",genotype2))>0){alt[grep("1",genotype2)]<-allel1[grep("1",genotype2)]}
#0/2
if(length(grep("2",genotype2))>0){alt[posAllel2[grep("2",genotype2[posAllel2])]]<-allel2[grep("2",genotype2[posAllel2])]}
#0/3
if(length(grep("3",genotype2))>0){alt[posAllel2[grep("3",genotype2[posAllel2])]]<-allel2[grep("3",genotype2[posAllel2])]}
if(length(grep("4",genotype2))>0){alt[posAllel2[grep("4",genotype2[posAllel2])]]<-allel2[grep("4",genotype2[posAllel2])]}
if(length(grep("5",genotype2))>0){alt[posAllel2[grep("5",genotype2[posAllel2])]]<-allel2[grep("5",genotype2[posAllel2])]}
if(length(grep("6",genotype2))>0){alt[posAllel2[grep("6",genotype2[posAllel2])]]<-allel2[grep("6",genotype2[posAllel2])]}
if(length(grep("7",genotype2))>0){alt[posAllel2[grep("7",genotype2[posAllel2])]]<-allel2[grep("7",genotype2[posAllel2])]}
if(length(grep("8",genotype2))>0){alt[posAllel2[grep("8",genotype2[posAllel2])]]<-allel2[grep("8",genotype2[posAllel2])]}
if(length(grep("9",genotype2))>0){alt[posAllel2[grep("9",genotype2[posAllel2])]]<-allel2[grep("9",genotype2[posAllel2])]}
if(length(grep("10",genotype2))>0){alt[posAllel2[grep("10",genotype2[posAllel2])]]<-allel2[grep("10",genotype2[posAllel2])]}
#GENOTYPE1
#alt<-matrix(0,ncol = 1, nrow = dim(variants)[1])
#1/0
if(length(grep("1",genotype1))>0){alt[grep("1",genotype1)]<-allel1[grep("1",genotype1)]}
#2/0
if(length(grep("2",genotype1))>0){alt[posAllel2[grep("2",genotype1[posAllel2])]]<-allel2[grep("2",genotype1[posAllel2])]}
#3/0
if(length(grep("3",genotype1))>0){alt[posAllel2[grep("3",genotype1[posAllel2])]]<-allel2[grep("3",genotype1[posAllel2])]}
if(length(grep("4",genotype1))>0){alt[posAllel2[grep("4",genotype1[posAllel2])]]<-allel2[grep("4",genotype1[posAllel2])]}
if(length(grep("5",genotype1))>0){alt[posAllel2[grep("5",genotype1[posAllel2])]]<-allel2[grep("5",genotype1[posAllel2])]}
if(length(grep("6",genotype1))>0){alt[posAllel2[grep("6",genotype1[posAllel2])]]<-allel2[grep("6",genotype1[posAllel2])]}
if(length(grep("7",genotype1))>0){alt[posAllel2[grep("7",genotype1[posAllel2])]]<-allel2[grep("7",genotype1[posAllel2])]}
if(length(grep("8",genotype1))>0){alt[posAllel2[grep("8",genotype1[posAllel2])]]<-allel2[grep("8",genotype1[posAllel2])]}
if(length(grep("9",genotype1))>0){alt[posAllel2[grep("9",genotype1[posAllel2])]]<-allel2[grep("9",genotype1[posAllel2])]}
if(length(grep("10",genotype1))>0){alt[posAllel2[grep("10",genotype1[posAllel2])]]<-allel2[grep("10",genotype1[posAllel2])]}
variants@fix[,5]<-alt
return(variants)
}
deleteGnomadBlacklistVariants<-function(variants)
{
gt = variants@gt
pos = match(sampleName, colnames(gt))
options(warn=-1)
genotype = do.call(rbind, strsplit(as.character(gt[,pos]), ":"))[,1]
options(warn=0)
m<-regexpr("(\\d{1,2}\\/)|(\\d{1,2}\\|)",genotype,perl = TRUE)
genotype1<-regmatches(genotype, m)
genotype1<-gsub("\\||\\/", "",genotype1,perl=TRUE)
m<-regexpr("(\\/\\d{1,2})|(\\|\\d{1,2})",genotype,perl = TRUE)
genotype2<-regmatches(genotype, m)
genotype2<-gsub("\\||\\/", "",genotype2,perl=TRUE)
alternative = getFIX(variants)[,5]
m<-regexpr("(\\w+\\,|\\w+)",alternative,perl = TRUE)
allel1<-regmatches(alternative, m)
allel1<-gsub("\\,", "",allel1,perl=TRUE)
m<-regexpr("(\\,\\w+)",alternative,perl = TRUE)
posAllel2<-grep("(\\,\\w+)",alternative,perl = TRUE)
allel2<-regmatches(alternative, m)
allel2<-gsub("\\,", "",allel2,perl=TRUE)
#shortVariant = do.call(paste, c(longVariant, sep="-"))
#longVariant = gsub(" ", "", longVariant, fixed = TRUE)
#GENOTYPE1
alt<-matrix(0,ncol = 1, nrow = dim(variants)[1])
#1/0
if(length(grep("1",genotype1))>0){alt[grep("1",genotype1)]<-allel1[grep("1",genotype1)]}
#2/0
if(length(grep("2",genotype1))>0){alt[posAllel2[grep("2",genotype1[posAllel2])]]<-allel2[grep("2",genotype1[posAllel2])]}
#3/0
if(length(grep("3",genotype1))>0){alt[posAllel2[grep("3",genotype1[posAllel2])]]<-allel2[grep("3",genotype1[posAllel2])]}
if(length(grep("4",genotype1))>0){alt[posAllel2[grep("4",genotype1[posAllel2])]]<-allel2[grep("4",genotype1[posAllel2])]}
if(length(grep("5",genotype1))>0){alt[posAllel2[grep("5",genotype1[posAllel2])]]<-allel2[grep("5",genotype1[posAllel2])]}
if(length(grep("6",genotype1))>0){alt[posAllel2[grep("6",genotype1[posAllel2])]]<-allel2[grep("6",genotype1[posAllel2])]}
if(length(grep("7",genotype1))>0){alt[posAllel2[grep("7",genotype1[posAllel2])]]<-allel2[grep("7",genotype1[posAllel2])]}
if(length(grep("8",genotype1))>0){alt[posAllel2[grep("8",genotype1[posAllel2])]]<-allel2[grep("8",genotype1[posAllel2])]}
if(length(grep("9",genotype1))>0){alt[posAllel2[grep("9",genotype1[posAllel2])]]<-allel2[grep("9",genotype1[posAllel2])]}
if(length(grep("10",genotype1))>0){alt[posAllel2[grep("10",genotype1[posAllel2])]]<-allel2[grep("10",genotype1[posAllel2])]}
longVariant = cbind.data.frame(variants@fix[,c(1,2,4)],alt)
longVariant = do.call(paste, c(longVariant, sep="-"))
longVariant = gsub(" ", "", longVariant, fixed = TRUE)
deleteVariants1<-match(ensemble,longVariant)[!is.na(match(ensemble,longVariant))]
#GENOTYPE2
alt<-matrix(0,ncol = 1, nrow = dim(variants)[1])
#0/1
if(length(grep("1",genotype2))>0){alt[grep("1",genotype2)]<-allel1[grep("1",genotype2)]}
#0/2
if(length(grep("2",genotype2))>0){alt[posAllel2[grep("2",genotype2[posAllel2])]]<-allel2[grep("2",genotype2[posAllel2])]}
#0/3
if(length(grep("3",genotype2))>0){alt[posAllel2[grep("3",genotype2[posAllel2])]]<-allel2[grep("3",genotype2[posAllel2])]}
if(length(grep("4",genotype2))>0){alt[posAllel2[grep("4",genotype2[posAllel2])]]<-allel2[grep("4",genotype2[posAllel2])]}
if(length(grep("5",genotype2))>0){alt[posAllel2[grep("5",genotype2[posAllel2])]]<-allel2[grep("5",genotype2[posAllel2])]}
if(length(grep("6",genotype2))>0){alt[posAllel2[grep("6",genotype2[posAllel2])]]<-allel2[grep("6",genotype2[posAllel2])]}
if(length(grep("7",genotype2))>0){alt[posAllel2[grep("7",genotype2[posAllel2])]]<-allel2[grep("7",genotype2[posAllel2])]}
if(length(grep("8",genotype2))>0){alt[posAllel2[grep("8",genotype2[posAllel2])]]<-allel2[grep("8",genotype2[posAllel2])]}
if(length(grep("9",genotype2))>0){alt[posAllel2[grep("9",genotype2[posAllel2])]]<-allel2[grep("9",genotype2[posAllel2])]}
if(length(grep("10",genotype2))>0){alt[posAllel2[grep("10",genotype2[posAllel2])]]<-allel2[grep("10",genotype2[posAllel2])]}
longVariant = cbind.data.frame(variants@fix[,c(1,2,4)],alt)
longVariant = do.call(paste, c(longVariant, sep="-"))
longVariant = gsub(" ", "", longVariant, fixed = TRUE)
deleteVariants2<-match(ensemble,longVariant)[!is.na(match(ensemble,longVariant))]
deleteVariants<-union(deleteVariants1,deleteVariants2)
return(deleteVariants)
}
deleteIndelVariantsSamePos<-function(variants)
{
#delete indel variants in same position
shortVariant = getFIX(variants)[,c(1,2)]
shortVariant = apply(shortVariant[,1:2],1,function(x) paste(x,collapse="-"))
m<-regexpr("(^\\S+\\-\\d+)",ensemble,perl = TRUE)
shortEnsemble<-regmatches(ensemble, m)
m<-regexpr("(\\w+\\-\\w+$)",ensemble,perl = TRUE)
regexEnsembl<-regmatches(ensemble, m)
refEnsembl = gsub("(\\w+)(-)(\\w+$)", "\\1", regexEnsembl, perl = TRUE)
AltEnsembl = gsub("(\\w+)(-)(\\w+$)", "\\3", regexEnsembl, perl = TRUE)
pos1<-match(shortEnsemble,shortVariant)[!is.na(match(shortEnsemble,shortVariant))]
posInShortEnsembl<-c(1:length(shortEnsemble))[!is.na(match(shortEnsemble,shortVariant[pos1]))]
# ref ==, Alt !=
posRef = c(1:length(posInShortEnsembl))[nchar(variants@fix[pos1,4]) == nchar(refEnsembl[posInShortEnsembl])]
posAlt = c(1:length(posInShortEnsembl))[nchar(variants@fix[pos1,5]) != nchar(AltEnsembl[posInShortEnsembl])]
posIntersect1 = intersect(posRef,posAlt)
# ref !=, Alt ==
posRef = c(1:length(posInShortEnsembl))[nchar(variants@fix[pos1,4]) != nchar(refEnsembl[posInShortEnsembl])]
posAlt = c(1:length(posInShortEnsembl))[nchar(variants@fix[pos1,5]) == nchar(AltEnsembl[posInShortEnsembl])]
posIntersect2 = intersect(posRef,posAlt)
# ref !=, Alt !=
posRef = c(1:length(posInShortEnsembl))[nchar(variants@fix[pos1,4]) != nchar(refEnsembl[posInShortEnsembl])]
posAlt = c(1:length(posInShortEnsembl))[nchar(variants@fix[pos1,5]) != nchar(AltEnsembl[posInShortEnsembl])]
posIntersect3 = intersect(posRef,posAlt)
deleteVariants<-pos1[unique(c(posIntersect1,posIntersect2,posIntersect3))]
return(deleteVariants)
}
qualVariants<-function(qualityThreshold,variants)
{
posDiscard = 0
posGQDiscard <- integer(0)
posGQ<-match("GQ",strsplit(as.character(variants[1,9]),":")[[1]])
if(!is.na(posGQ))
{
options(warn=-1)
GeneQuality = do.call(rbind, strsplit(as.character(variants[,10]), ":"))[,posGQ]
posGQDiscard<-c(1:length(GeneQuality))[as.numeric(GeneQuality)<=20]
options(warn=0)
}
posAllelicDepth<-match("DP",strsplit(as.character(variants[1,9]),":")[[1]])
options(warn=-1)
deep = do.call(rbind, strsplit(as.character(variants[,10]), ":"))[,posAllelicDepth]
options(warn=0)
posDeepDiscard0<-grep("\\d+",deep,perl=TRUE,invert = TRUE)
#deep<-sapply(variants[,10],function(x) strsplit(as.character(x),":")[[1]][posAllelicDepth])
#deep<-sapply(deep,function(x) sum(as.numeric(strsplit(x,",")[[1]])))
deep[posDeepDiscard0]<-0
posDeepDiscard<-c(1:length(deep))[as.numeric(deep)<=10]
#Discard bias <25% and >75% in HET variants
posADvariants<-getADvariants()
posAllelicDepth<-match("AD",strsplit(as.character(variants[1,9]),":")[[1]])
options(warn=-1)
deepAD = do.call(rbind, strsplit(as.character(variants[,10]), ":"))[,posAllelicDepth]
options(warn=0)
HETvariants<-deepAD[posADvariants]
m<-regexpr("^(\\d{1,2}\\/\\d{1,2})|^(\\d{1,2}\\|\\d{1,2})",variants[,10],perl = TRUE)
genotype<-regmatches(variants[,10], m)
m<-regexpr("(\\d{1,2}\\/)|(\\d{1,2}\\|)",genotype,perl = TRUE)
genotype1<-regmatches(genotype, m)
genotype1<-gsub("\\||\\/", "",genotype1,perl=TRUE)
genotype1<-genotype1[posADvariants]
m<-regexpr("(\\/\\d)|(\\|\\d)",genotype,perl = TRUE)
genotype2<-regmatches(genotype, m)
genotype2<-gsub("\\||\\/", "",genotype2,perl=TRUE)
genotype2<-genotype2[posADvariants]
#GENOTYPE1
freqBias<-matrix(0.5,ncol = 1, nrow = length(posADvariants))
options(warn=-1)
#1/0
if(length(grep("1",genotype1))>0){freqBias[grep("1",genotype1)]<-as.numeric(do.call(rbind, strsplit(as.character(HETvariants)[grep("1",genotype1)], ","))[,2])/as.numeric(deep)[posADvariants][grep("1",genotype1)]}
#2/0
if(length(grep("2",genotype1))>0){freqBias[grep("2",genotype1)]<-as.numeric(do.call(rbind, strsplit(as.character(HETvariants)[grep("2",genotype1)], ","))[,3])/as.numeric(deep)[posADvariants][grep("2",genotype1)]}
#3/0..
if(length(grep("3",genotype1))>0){freqBias[grep("3",genotype1)]<-as.numeric(do.call(rbind, strsplit(as.character(HETvariants)[grep("3",genotype1)], ","))[,4])/as.numeric(deep)[posADvariants][grep("3",genotype1)]}
if(length(grep("4",genotype1))>0){freqBias[grep("4",genotype1)]<-as.numeric(do.call(rbind, strsplit(as.character(HETvariants)[grep("4",genotype1)], ","))[,5])/as.numeric(deep)[posADvariants][grep("4",genotype1)]}
if(length(grep("5",genotype1))>0){freqBias[grep("5",genotype1)]<-as.numeric(do.call(rbind, strsplit(as.character(HETvariants)[grep("5",genotype1)], ","))[,6])/as.numeric(deep)[posADvariants][grep("5",genotype1)]}
if(length(grep("6",genotype1))>0){freqBias[grep("6",genotype1)]<-as.numeric(do.call(rbind, strsplit(as.character(HETvariants)[grep("6",genotype1)], ","))[,7])/as.numeric(deep)[posADvariants][grep("6",genotype1)]}
if(length(grep("7",genotype1))>0){freqBias[grep("7",genotype1)]<-as.numeric(do.call(rbind, strsplit(as.character(HETvariants)[grep("7",genotype1)], ","))[,8])/as.numeric(deep)[posADvariants][grep("7",genotype1)]}
if(length(grep("8",genotype1))>0){freqBias[grep("8",genotype1)]<-as.numeric(do.call(rbind, strsplit(as.character(HETvariants)[grep("8",genotype1)], ","))[,9])/as.numeric(deep)[posADvariants][grep("8",genotype1)]}
if(length(grep("9",genotype1))>0){freqBias[grep("8",genotype1)]<-as.numeric(do.call(rbind, strsplit(as.character(HETvariants)[grep("9",genotype1)], ","))[,10])/as.numeric(deep)[posADvariants][grep("9",genotype1)]}
if(length(grep("10",genotype1))>0){freqBias[grep("8",genotype1)]<-as.numeric(do.call(rbind, strsplit(as.character(HETvariants)[grep("10",genotype1)], ","))[,11])/as.numeric(deep)[posADvariants][grep("10",genotype1)]}
#GENOTYPE2
freqBias2<-matrix(0.5,ncol = 1, nrow = length(posADvariants))
#1/0
if(length(grep("1",genotype2))>0){freqBias2[grep("1",genotype2)]<-as.numeric(do.call(rbind, strsplit(as.character(HETvariants)[grep("1",genotype2)], ","))[,2])/as.numeric(deep)[posADvariants][grep("1",genotype2)]}
#2/0
if(length(grep("2",genotype2))>0){freqBias2[grep("2",genotype2)]<-as.numeric(do.call(rbind, strsplit(as.character(HETvariants)[grep("2",genotype2)], ","))[,3])/as.numeric(deep)[posADvariants][grep("2",genotype2)]}
#3/0..
if(length(grep("3",genotype2))>0){freqBias2[grep("3",genotype2)]<-as.numeric(do.call(rbind, strsplit(as.character(HETvariants)[grep("3",genotype2)], ","))[,4])/as.numeric(deep)[posADvariants][grep("3",genotype2)]}
if(length(grep("4",genotype2))>0){freqBias2[grep("4",genotype2)]<-as.numeric(do.call(rbind, strsplit(as.character(HETvariants)[grep("4",genotype2)], ","))[,5])/as.numeric(deep)[posADvariants][grep("4",genotype2)]}
if(length(grep("5",genotype2))>0){freqBias2[grep("5",genotype2)]<-as.numeric(do.call(rbind, strsplit(as.character(HETvariants)[grep("5",genotype2)], ","))[,6])/as.numeric(deep)[posADvariants][grep("5",genotype2)]}
if(length(grep("6",genotype2))>0){freqBias2[grep("6",genotype2)]<-as.numeric(do.call(rbind, strsplit(as.character(HETvariants)[grep("6",genotype2)], ","))[,7])/as.numeric(deep)[posADvariants][grep("6",genotype2)]}
if(length(grep("7",genotype2))>0){freqBias2[grep("7",genotype2)]<-as.numeric(do.call(rbind, strsplit(as.character(HETvariants)[grep("7",genotype2)], ","))[,8])/as.numeric(deep)[posADvariants][grep("7",genotype2)]}
if(length(grep("8",genotype2))>0){freqBias2[grep("8",genotype2)]<-as.numeric(do.call(rbind, strsplit(as.character(HETvariants)[grep("8",genotype2)], ","))[,9])/as.numeric(deep)[posADvariants][grep("8",genotype2)]}
if(length(grep("9",genotype2))>0){freqBias2[grep("9",genotype2)]<-as.numeric(do.call(rbind, strsplit(as.character(HETvariants)[grep("9",genotype2)], ","))[,10])/as.numeric(deep)[posADvariants][grep("9",genotype2)]}
if(length(grep("10",genotype2))>0){freqBias2[grep("10",genotype2)]<-as.numeric(do.call(rbind, strsplit(as.character(HETvariants)[grep("10",genotype2)], ","))[,11])/as.numeric(deep)[posADvariants][grep("10",genotype2)]}
#freqBias = as.numeric(do.call(rbind, strsplit(as.character(HETvariants), ","))[,1])/as.numeric(deep[posADvariants])
#freqBias<-sapply(c(1:length(HETvariants)),function(x) as.numeric(strsplit(names(HETvariants[x]),",")[[1]][1])/HETvariants[x])
options(warn=0)
posnoNA<-c(1:length(freqBias))[!is.na(freqBias)]
freqBias<-freqBias[posnoNA]
posDiscard<-union(c(1:length(freqBias))[freqBias>0.75],c(1:length(freqBias))[freqBias<0.25])
posBiasDiscard1<-c(1:length(deep))[posADvariants][posnoNA][posDiscard]
posnoNA<-c(1:length(freqBias2))[!is.na(freqBias2)]
freqBias2<-freqBias2[posnoNA]
posDiscard<-union(c(1:length(freqBias2))[freqBias2>0.75],c(1:length(freqBias2))[freqBias2<0.25])
posBiasDiscard2<-c(1:length(deep))[posADvariants][posnoNA][posDiscard]
lengthMajor1 <- sapply(as.character(HETvariants), function(x) length(c(1:length(as.numeric(strsplit(x, ",")[[1]])))[as.numeric(strsplit(x, ",")[[1]])>1]))
pos<-c(1:length(lengthMajor1))[lengthMajor1>2]
posBiasDiscard3<-c(1:length(deep))[posADvariants][pos]
posDiscard<-unique(c(posGQDiscard,posDeepDiscard,posBiasDiscard1,posBiasDiscard2,posBiasDiscard3))
if(qualityThreshold>0)
{
pos100<-c(1:length(variants[,6]))[as.numeric(gsub(",",".",variants[,6]))<qualityThreshold]
posDiscard<-union(posDiscard,pos100[!is.na(pos100)])
}
return(posDiscard)
}
getHOMvariants<-function(variants)
{
gt = variants@gt
pos = match(sampleName, colnames(gt))
posHOMvariants<-grep("1/1|2/2|3/3",gt[,pos])
posHOMvariants<-posHOMvariants[grep("\\bX\\b",variants@fix[posHOMvariants,1],invert=TRUE)]
return(posHOMvariants)
}
getPhysicalPhasing<-function(selectedVariants,variants)
{
gt = variants@gt
pos = match(sampleName, colnames(gt))
phaseVariants = matrix(".", ncol = 1, nrow = length(selectedVariants))
posPhaseVariants<-grep("\\|",gt[selectedVariants,pos],perl =TRUE)
if(length(posPhaseVariants)>0)
{
m<-regexpr("\\d+\\_\\S+\\_[A-Z]{1,300}",gt[selectedVariants,pos][posPhaseVariants],perl = TRUE)
if(m[1]>-1){
phaseVariants[posPhaseVariants] = regmatches(gt[selectedVariants,pos][posPhaseVariants], m)
}
}
return(phaseVariants)
}
adjustPhysicalPhasing<-function(posCmpHETvariants,variants)
{
gt = variants@gt
pos = match(sampleName, colnames(gt))
posPhaseVariants<-grep("\\|",gt[posCmpHETvariants,pos],perl =TRUE)
posPhaseVariantsNOT<-grep("\\|",gt[posCmpHETvariants,pos],invert = TRUE,perl =TRUE)
if(length(posPhaseVariants)>0)
{
m<-regexpr("\\d+\\_\\S+\\_[A-Z]{1,300}",gt[posCmpHETvariants,pos][posPhaseVariants],perl = TRUE)
if(m[1]>-1){
phaseVariants = regmatches(gt[posCmpHETvariants,pos][posPhaseVariants], m)
posDups = match(unique(phaseVariants),phaseVariants)
posCmpHETvariants<-sort(union(posCmpHETvariants[posPhaseVariantsNOT],posCmpHETvariants[posPhaseVariants][posDups]))
}
}
return(posCmpHETvariants)
}
getGTsamples<-function(selectedVariants,variants)
{
gt = variants@gt
GTsamples=0
for(x in 2:length(colnames(gt)))
{
options(warn=-1)
GTsamples = cbind(GTsamples,do.call(rbind, strsplit(as.character(gt[selectedVariants,x]), ":"))[,1])
options(warn=0)
}
#GTsamples = GTsamples[,-1]
return(GTsamples)
}
getCmpHETvariants<-function(variants)
{
posADvariants<-getADvariants(variants)
VariantAnnotation<-getVariant(posADvariants,variants)
geneCmpHET<-getGeneSymbol(VariantAnnotation,variants)
posUniqueRemove = match(names(table(geneCmpHET))[table(geneCmpHET)==1],geneCmpHET)
posCmpHETvariants<-posADvariants[-posUniqueRemove]
posADvariants<-adjustPhysicalPhasing(posCmpHETvariants,variants)
VariantAnnotation<-getVariant(posADvariants,variants)
geneCmpHET<-getGeneSymbol(VariantAnnotation,variants)
posCmpHETvariants = posADvariants
posNULL = c(1:length(geneCmpHET))[grep(".",geneCmpHET,invert = TRUE)]
if(length(posNULL)>0){geneCmpHET = geneCmpHET[-posNULL]
posCmpHETvariants = posADvariants[-posNULL]}
posUniqueRemove = match(names(table(geneCmpHET))[table(geneCmpHET)==1],geneCmpHET)
if(length(posUniqueRemove)>0){posCmpHETvariants<-posCmpHETvariants[-posUniqueRemove]}
posCmpHETvariants<-posCmpHETvariants[grep("\\bX\\b",variants@fix[posCmpHETvariants,1],invert=TRUE)]
#geneCmpHET<-getGeneName(VariantAnnotation)
#dups<-duplicated(geneCmpHET)
#geneDuplicated<-unique(geneCmpHET[dups])
#options(warn=-1)
#matchItem<-do.call(paste, c(cbind.data.frame("\\b",geneDuplicated,"\\b"), sep=""))
#options(warn=0)
#cl <- parallel::makeCluster(processors)
#parallel::clusterExport(cl,c("geneCmpHET"))
#parallel::clusterEvalQ(cl, library(igraph))
#posGenes<-parSapply(cl,matchItem, function(x) grep(x,geneCmpHET))
#stopCluster(cl)
#print(Sys.time() - strt)
#quant<-sapply(c(1:length(posGenes)), function(x) length(posGenes[[x]]))
#posCmpHETvariants<-posADvariants[unlist(posGenes[quant<4])]
return(posCmpHETvariants)
}
getADvariants<-function(variants)
{
gt = variants@gt
pos = match(sampleName, colnames(gt))
posADvariants<-grep("0/1|0/2|0/3|1/2|2/3|1/3",gt[,pos])
posADvariants<-posADvariants[grep("\\bX\\b",variants@fix[posADvariants,1],invert=TRUE)]
return(posADvariants)
}
getXLvariants<-function(variants)
{
gt = variants@gt
pos = match(sampleName, colnames(gt))
posXLvariants<-grep("0/0",gt[,pos],invert = TRUE)
posXLvariants<-posXLvariants[grep("\\bX\\b",variants@fix[posXLvariants,1])]
return(posXLvariants)
}
getVariant<-function(selectedVariants,variants)
{
VariantAnnotation = variants@fix[selectedVariants,8]
return(VariantAnnotation)
}
getProtein<-function(VariantAnnotation,variants)
{
#meta = queryMETA(variants,"CSQ")[[3]][4]
meta = queryMETA(variants,"\\bCSQ\\b")[[1]][4]
meta = unlist(strsplit(meta, "[|]"))
ProteinPOS = match("HGVSp",meta)
options(warn=-1)
ProtAnnotation = do.call(rbind, strsplit(as.character(gsub(".*CSQ=","",VariantAnnotation)), "[|]"))[,ProteinPOS]
options(warn=0)
#ProtAnnotation<-sapply(VariantAnnotation,function(x) strsplit(as.character(x),"[|]")[[1]][11])
return(ProtAnnotation)
}
getcDNA<-function(VariantAnnotation,variants)
{
#meta = queryMETA(variants,"CSQ")[[3]][4]
meta = queryMETA(variants,"\\bCSQ\\b")[[1]][4]
meta = unlist(strsplit(meta, "[|]"))
cDNAPOS = match("HGVSc",meta)
options(warn=-1)
cDNAannotation = do.call(rbind, strsplit(as.character(gsub(".*CSQ=","",VariantAnnotation)), "[|]"))[,cDNAPOS]
options(warn=0)
return(cDNAannotation)
}
getEnsemblGene<-function(VariantAnnotation,variants)
{
#meta = queryMETA(variants,"CSQ")[[3]][4]
meta = queryMETA(variants,"\\bCSQ\\b")[[1]][4]
meta = unlist(strsplit(meta, "[|]"))
EnsemblPOS = match("Gene",meta)
options(warn=-1)
Ensemblannotation = do.call(rbind, strsplit(as.character(gsub(".*CSQ=","",VariantAnnotation)), "[|]"))[,EnsemblPOS]
options(warn=0)
return(Ensemblannotation)
}
getConsequence<-function(VariantAnnotation,variants)
{
#meta = queryMETA(variants,"CSQ")[[3]][4]
meta = queryMETA(variants,"\\bCSQ\\b")[[1]][4]
meta = unlist(strsplit(meta, "[|]"))
ConsequencePOS = match("Consequence",meta)
options(warn=-1)
consequence = do.call(rbind, strsplit(as.character(gsub(".*CSQ=","",VariantAnnotation)), "[|]"))[,ConsequencePOS]
options(warn=0)
return(consequence)
}
getGeneName<-function(VariantAnnotation,variants)
{
#meta = queryMETA(variants,"CSQ")[[3]][4]
meta = queryMETA(variants,"\\bCSQ\\b")[[1]][4]
meta = unlist(strsplit(meta, "[|]"))
symbolPOS = match("Gene",meta)
options(warn=-1)
geneName = do.call(rbind, strsplit(as.character(gsub(".*CSQ=","",VariantAnnotation)), "[|]"))[,symbolPOS]
options(warn=0)
geneName<-totalEnsembl[match(geneName,totalEnsembl[,3]),2]
return(geneName)
}
getGeneSymbol<-function(VariantAnnotation,variants)
{
#meta = queryMETA(variants,"CSQ")[[3]][4]
meta = queryMETA(variants,"\\bCSQ\\b")[[1]][4]
meta = unlist(strsplit(meta, "[|]"))
symbolPOS = match("Gene",meta)
options(warn=-1)
geneName = do.call(rbind, strsplit(as.character(gsub(".*CSQ=","",VariantAnnotation)), "[|]"))[,symbolPOS]
options(warn=0)
geneName<-totalEnsembl[match(geneName,totalEnsembl[,3]),1]
return(geneName)
}
isCoding<-function(VariantAnnotation,variants)
{
#meta = queryMETA(variants,"CSQ")[[3]][4]##!!!!!!!
meta = queryMETA(variants,"\\bCSQ\\b")[[1]][4]
meta = unlist(strsplit(meta, "[|]"))
posCoding = match("BIOTYPE",meta)
options(warn=-1)
coding = do.call(rbind, strsplit(as.character(gsub(".*CSQ=","",VariantAnnotation)), "[|]"))[,posCoding]
options(warn=0)
posCoding = c(1:length(VariantAnnotation))[grep("\\bprotein_coding\\b",coding)]
return(posCoding)
}
getisCoding<-function(VariantAnnotation,variants)
{
#meta = queryMETA(variants,"CSQ")[[3]][4]
meta = queryMETA(variants,"\\bCSQ\\b")[[1]][4]
meta = unlist(strsplit(meta, "[|]"))
posCoding = match("BIOTYPE",meta)
options(warn=-1)
coding = do.call(rbind, strsplit(as.character(gsub(".*CSQ=","",VariantAnnotation)), "[|]"))[,posCoding]
options(warn=0)
return(coding)
}
isModerate<-function(VariantAnnotation,variants)
{
#meta = queryMETA(variants,"CSQ")[[3]][4]
meta = queryMETA(variants,"\\bCSQ\\b")[[1]][4]
meta = unlist(strsplit(meta, "[|]"))
impactPOS = match("IMPACT",meta)
options(warn=-1)
impact = do.call(rbind, strsplit(as.character(gsub(".*CSQ=","",VariantAnnotation)), "[|]"))[,impactPOS]
options(warn=0)
posModerate<-grep("MODERATE",impact)
ModerateMatrix<-matrix(".",ncol = 1,nrow = length(VariantAnnotation))
if(length(posModerate)>0)
{
ModeratecDNA<-getcDNA(VariantAnnotation[posModerate],variants)
ModerateProtein<-getProtein(VariantAnnotation[posModerate],variants)
options(warn=-1)
ModerateAnnotation<-do.call(paste, c(cbind.data.frame(ModeratecDNA,ModerateProtein), sep="|"))
options(warn=0)
#ModerateAnnotation<-sapply(c(1:length(posModerate)),function(x) paste(ModeratecDNA[x],ModerateProtein[x],sep = "|"))
ModerateMatrix[posModerate]<-ModerateAnnotation
}
return(ModerateMatrix)
}
isHigh<-function(VariantAnnotation,variants)
{
#meta = queryMETA(variants,"CSQ")[[3]][4]
meta = queryMETA(variants,"\\bCSQ\\b")[[1]][4]
meta = unlist(strsplit(meta, "[|]"))
impactPOS = match("IMPACT",meta)
options(warn=-1)
impact = do.call(rbind, strsplit(as.character(gsub(".*CSQ=","",VariantAnnotation)), "[|]"))[,impactPOS]
options(warn=0)
posHigh<-grep("HIGH",impact)
#pos5UTR<-c(1:length(VariantAnnotation))[grep("5_prime_UTR",VariantAnnotation)]
#posHigh<-setdiff(posHigh,pos5UTR)
HighMatrix<-matrix(".",ncol = 1,nrow = length(VariantAnnotation))
if(length(posHigh)>0)
{
HIGHcDNA<-getcDNA(VariantAnnotation[posHigh],variants)
HighProtein<-getProtein(VariantAnnotation[posHigh],variants)
options(warn=-1)
HighAnnotation<-do.call(paste, c(cbind.data.frame(HIGHcDNA,HighProtein), sep="|"))
options(warn=0)
#HighAnnotation<-sapply(c(1:length(posHigh)),function(x) paste(HIGHcDNA[x],HighProtein[x],sep = "|"))
HighMatrix[posHigh]<-HighAnnotation
}
return(HighMatrix)
}
isSplicingOld<-function(VariantAnnotation,shortVariant,longVariant, Threshold,variants)
{
SplicingMatrix<-matrix(".",ncol = 1,nrow = length(VariantAnnotation))
consequence<-getConsequence(VariantAnnotation,variants)
posSplicing1<-grep("splice",consequence,perl=TRUE)
posSplicing2<-grep("intron_variant",consequence,perl=TRUE)
posSplicing3<-grep("synonymous_variant",consequence,perl=TRUE)
if(length(posSplicing2)>0)
{
branchpoint<-ClinPrior::branchpointer(longVariant[posSplicing2],assembly,0.2)
posBranchPoint<-grep("branchpoint",branchpoint)
if(length(posBranchPoint)>0){SplicingMatrix[posSplicing2][posBranchPoint]<-branchpoint[posBranchPoint]}
cDNA<-getcDNA(VariantAnnotation[posSplicing2],variants)
names(cDNA)<-c()
m <- regexpr("\\+[\\d+]{1,8}|\\-[\\d+]{1,8}", cDNA,perl=TRUE)
posMatch <- grep("\\+[\\d+]{1,8}|\\-[\\d+]{1,8}", cDNA,perl=TRUE)
match<-regmatches(cDNA, m)
distSplic<-as.numeric(gsub("\\+|\\-", "",match,perl=TRUE))
posSplicThreshold<-c(1:length(distSplic))[distSplic<Threshold]
posSplicing2<-posSplicing2[posMatch][posSplicThreshold]
posSplicing<<-unique(c(posSplicing1,posSplicing2,posSplicing3))
if(length(posSplicing)>0)
{
#SplicingcDNA<-getcDNA(VariantAnnotation[posSplicing])
#strt <- Sys.time()
posMT<<-grep("MT",shortVariant[posSplicing], invert = TRUE)
if(length(posMT)>0)
{
VariantAnnotation<<-VariantAnnotation
shortVariant<<-shortVariant
longVariant<<-longVariant
assembly<<-assembly
if(length(posSplicing[posMT])>20)
{
cl <- parallel::makeCluster(processors)
parallel::clusterExport(cl,c("VariantAnnotation","posSplicing","shortVariant","posMT","longVariant","assembly"))
parallel::clusterEvalQ(cl, library(ClinPrior))
SplicingMaxentScan<-parSapply(cl,c(1:length(posSplicing[posMT])), function(x){
getGeneName<-function(VariantAnnotation,variants)
{
options(warn=-1)
geneName = do.call(rbind, strsplit(as.character(gsub(".*CSQ=","",VariantAnnotation)), "[|]"))[,4]
options(warn=0)
geneName<-total_unique[match(geneName,total_unique[,1]),2]
#geneName<-unname(sapply(VariantAnnotation,function(x) strsplit(as.character(x),"[|]")[[1]][4]))
return(geneName)
}
#SplicingMaxentScan<-sapply(c(1:length(posSplicing[posMT])),function(x){
geneName<-getGeneName(VariantAnnotation[posSplicing][posMT][x],variants)
chr<-paste("chr",strsplit(shortVariant[posSplicing][posMT][x],":")[[1]][1],sep="")
start<-as.numeric(strsplit(shortVariant[posSplicing][posMT][x],"-")[[1]][2])
Ref<-strsplit(longVariant[posSplicing][posMT][x],"-")[[1]][2]
Alt<-strsplit(longVariant[posSplicing][posMT][x],"-")[[1]][3]
ClinPrior::MaxentScanClinPrior(geneName,chr,start,Ref,Alt,assembly,0.2)
})
stopCluster(cl)
gc()
}else{
SplicingMaxentScan<-sapply(c(1:length(posSplicing[posMT])),function(x){
geneName<-getGeneName(VariantAnnotation[posSplicing][posMT][x],variants)
chr<-paste("chr",strsplit(shortVariant[posSplicing][posMT][x],":")[[1]][1],sep="")
start<-as.numeric(strsplit(shortVariant[posSplicing][posMT][x],"-")[[1]][2])
Ref<-strsplit(longVariant[posSplicing][posMT][x],"-")[[1]][2]
Alt<-strsplit(longVariant[posSplicing][posMT][x],"-")[[1]][3]
ClinPrior::MaxentScanClinPrior(geneName,chr,start,Ref,Alt,assembly,0.2)
})}
#print(Sys.time() - strt)
discard<-c(1:length(posSplicing[posMT]))[colSums(SplicingMaxentScan)==0]
SplicingMaxentScan<-SplicingMaxentScan[,-discard]
options(warn=-1)
SplicingMatrix[posSplicing][posMT][-discard]<-do.call(paste, c(as.data.frame(t(SplicingMaxentScan))))
options(warn=0)
}
}
}
return(SplicingMatrix)
}
isSplicing<-function(VariantAnnotation,variants)
{
#meta = queryMETA(variants,"CSQ")[[3]][4]
meta = queryMETA(variants,"\\bCSQ\\b")[[1]][4]
meta = unlist(strsplit(meta, "[|]"))
MaxEntAlt = match("MaxEntScan_alt",meta)
MaxEntDiff = match("MaxEntScan_diff",meta)
MaxEntRef = match("MaxEntScan_ref",meta)
VariantAnnotation = paste(VariantAnnotation,"|",sep="")
options(warn=-1)
MaxEnt = do.call(rbind, strsplit(as.character(gsub(".*CSQ=","",VariantAnnotation)), "[|]"))[,c(MaxEntDiff,MaxEntRef,MaxEntAlt)]
options(warn=0)
MaxEnt = rbind(c("","",""),MaxEnt)
percentImp = as.numeric(MaxEnt[,1])/as.numeric(MaxEnt[,2])
pos = c(1:length(percentImp))[abs(percentImp)>=0.2][!is.na(c(1:length(percentImp))[abs(percentImp)>=0.2])]
MaxEnt2 = matrix("",ncol = 3,nrow = length(percentImp))
MaxEnt2[pos,] = MaxEnt[pos,]
SplicingMatrix = apply(MaxEnt2,1,function(x) paste(x,collapse="|"))
SplicingMatrix = SplicingMatrix[-1]
return(SplicingMatrix)
}
clinvarPathoScore<-function(VariantScore,clinvar,phenoScore)
{
pos<-intersect(grep("Pathogenic|Likely_pathogenic",clinvar), c(1:length(VariantScore))[phenoScore[,1] <= 2000][!is.na(c(1:length(VariantScore))[phenoScore[,1] <= 2000])])
# if(length(pos)>0){VariantScore[pos] = VariantScore[pos] + 1}
if(length(pos)>0){VariantScore[pos] = VariantScore[pos] + 0} #treure clinvar PATH score
return(VariantScore)
}
#Calculte Variant score
calculateVariantScore<-function(HIGH,Splicing,Moderate,CADD,CCR,clinvar,metrics,inheritance,gtPOS)
{
VariantScore = 0
if(HIGH !=".")
{
VariantScore = VariantScore + 0.4
if(!is.na(metrics$pLI)){if(metrics$pLI>0.9){VariantScore = VariantScore + 0.2}}
if(!is.na(metrics$lof_z)){if(metrics$lof_z>3){VariantScore = VariantScore + 0.1}}
if(CADD >=20){VariantScore = VariantScore + 0.05}
if(CADD >=25){VariantScore = VariantScore + 0.05}
if(CADD >=30){VariantScore = VariantScore + 0.1}
}
if((Moderate !=".") && (Splicing =="||") && (HIGH =="."))
{
VariantScore = VariantScore + 0.2
if(!is.na(metrics$pLI)){if(metrics$pLI>0.9){VariantScore = VariantScore + 0.1}}
if(!is.na(metrics$mis_z)){if(metrics$mis_z>3){VariantScore = VariantScore + 0.05}}
if(CADD >=20){VariantScore = VariantScore + 0.05}
if(CADD >=25){VariantScore = VariantScore + 0.05}
if(CADD >=30){VariantScore = VariantScore + 0.1}
if(CCR >=85){VariantScore = VariantScore + 0.1}
if(CCR >=95){VariantScore = VariantScore + 0.2}
}
if((Splicing !="||") && (HIGH =="."))
{
VariantScore = VariantScore + 0.2
if(!is.na(metrics$pLI)){if(metrics$pLI>0.9){VariantScore = VariantScore + 0.1}}
if(!is.na(metrics$lof_z)){if(metrics$lof_z>3){VariantScore = VariantScore + 0.1}}
if(CADD >=20){VariantScore = VariantScore + 0.1}
if(CADD >=25){VariantScore = VariantScore + 0.1}
if(CADD >=30){VariantScore = VariantScore + 0.1}
}
if(inheritance == "HOM")
{
VariantScore = VariantScore + 0.5
if(!is.na(metrics$pRec)){if(metrics$pRec>0.7){VariantScore = VariantScore + 0.1}}
}
if(inheritance == "CmpHET")
{
VariantScore = VariantScore + 0.2
if(!is.na(metrics$pRec)){if(metrics$pRec>0.7){VariantScore = VariantScore + 0.05}}
}
if((inheritance == "HET") && (Splicing =="||") && (HIGH ==".") && (CCR ==0)) {VariantScore = VariantScore/10}
if((Splicing =="||") && (HIGH ==".") && (Moderate ==".")){VariantScore = 0}
if((inheritance =="XLink") && (gtPOS =="0/1")){VariantScore = 0}
#if(length(grep("benign",clinvar,ignore.case = TRUE))>0){VariantScore = 0}
return(round(VariantScore, digits = 4))
}
createVCFshortVariantCCR<-function(selectedVariants,variants)
{
shortVariant <- getFIX(variants[selectedVariants])[,1:2]
position <- as.numeric(getFIX(variants[selectedVariants])[,2])
alt <- getFIX(variants[selectedVariants])[,5]
#shortVariant = variants[selectedVariants,c(1,2)]
shortVariant = apply(shortVariant,1,function(x) paste(x,collapse=":"))
lengthALT<-as.numeric(sapply(alt,function(x) nchar(as.character(x))))
shortVariant = apply(cbind(shortVariant,(position+lengthALT-1)),1,function(x) paste(x,collapse="-"))
#process <- function(x) {
# shortVariant <- getFIX(variants[selectedVariants[x]])[1:2]
# position <- as.numeric(getFIX(variants[selectedVariants[x]])[2])
# alt <- getFIX(variants[selectedVariants[x]])[5]
# shortVariant <- paste(shortVariant,collapse=":")
# lengthALT <- as.numeric(nchar(as.character(alt)))
# shortVariant <- paste(cbind(shortVariant,(position+lengthALT-1)),collapse="-")
#}
#shortVariant <-do.call(rbind, lapply(c(1:length(selectedVariants)), process))
shortVariant = gsub(" ", "", shortVariant, fixed = TRUE)
return(shortVariant)
}
createVCFshortVariant<-function(selectedVariants,variants)
{
shortVariant = getFIX(variants[selectedVariants])[,c(1,2,2)]
shortVariant1 = apply(shortVariant[,1:2],1,function(x) paste(x,collapse=":"))
shortVariant = apply(cbind(shortVariant1,shortVariant[,2]),1,function(x) paste(x,collapse="-"))
shortVariant = gsub(" ", "", shortVariant, fixed = TRUE)
return(shortVariant)
}
createVCFlongVariant<-function(selectedVariants,variants)
{
longVariant = getFIX(variants[selectedVariants])[,c(1,2,4,5)]
longVariant1 = apply(longVariant[,1:2],1,function(x) paste(x,collapse=":"))
longVariant = apply(cbind(longVariant1,longVariant[,c(3,4)]),1,function(x) paste(x,collapse="-"))
longVariant = gsub(" ", "", longVariant, fixed = TRUE)
return(longVariant)
}
ClinVarConstructor<-function(VariantAnnotation,variants)
{
#meta = queryMETA(variants,"CSQ")[[3]][4]
meta = queryMETA(variants,"\\bCSQ\\b")[[1]][4]
meta = unlist(strsplit(meta, "[|]"))
clinvarPOS = match("CLIN_SIG",meta)
options(warn=-1)
clinvar = do.call(rbind, strsplit(as.character(gsub(".*CSQ=","",VariantAnnotation)), "[|]"))[,clinvarPOS]
options(warn=0)
return(clinvar)
}
CCRconstructor<-function(VariantAnnotation,selectedVariants,variants)
{
consequence<-getConsequence(VariantAnnotation,variants)
posCCR<-grep("missense_variant|inframe",consequence,perl=TRUE)
shortVariantCCR<-createVCFshortVariantCCR(selectedVariants,variants)
outputCCR<-matrix(".",ncol = 1,nrow = length(selectedVariants))
if(length(posCCR)>0)
{
resCCR<-sapply(shortVariantCCR[posCCR],function(x) seqminer::tabix.read.table(CCRautosomesFile, x))
if(length(resCCR)==length(posCCR))
{
posSelectedVariantCCR<-c(1:length(resCCR))[sapply(c(1:length(resCCR)),function(x) length(resCCR[[x]]))>0]
TabixCCRscore<-unlist(sapply(posSelectedVariantCCR,function(x) mean(resCCR[[x]][[4]])))
#TabixCCRscore<-resCCR[4,]
outputCCR[posCCR][posSelectedVariantCCR]<-round(as.numeric(TabixCCRscore), digits = 4)
}
if(length(resCCR)>length(posCCR))
{
#TabixCCRscore<-resCCR[4,]
TabixCCRscore<-unlist(sapply(resCCR[4,],function(x) max(x)))
outputCCR[posCCR]<-round(as.numeric(TabixCCRscore), digits = 4)
}
if(length(grep("X",shortVariantCCR[posCCR]))>0)
{
resCCR<-sapply(shortVariantCCR[posCCR],function(x) seqminer::tabix.read.table(CCRxchromFile, x))
if(length(resCCR)==length(posCCR))
{
posSelectedVariantCCR<-c(1:length(resCCR))[sapply(c(1:length(resCCR)),function(x) length(resCCR[[x]]))>0]
TabixCCRscore<-unlist(sapply(posSelectedVariantCCR,function(x) mean(resCCR[[x]][[4]])))
#TabixCCRscore<-resCCR[4,]
outputCCR[posCCR][posSelectedVariantCCR]<-round(as.numeric(TabixCCRscore), digits = 4)
}
if(length(resCCR)>length(posCCR))
{
TabixCCRscore<-unlist(sapply(resCCR[4,],function(x) max(x)))
outputCCR[posCCR]<-round(as.numeric(TabixCCRscore), digits = 4)
}
}
}
return(outputCCR)
}
CADDconstructor<-function(VariantAnnotation,variants)
{
#outputCADD<-matrix(".",ncol = 1,nrow = length(shortVariant))
#meta = queryMETA(variants,"CSQ")[[3]][4]
meta = queryMETA(variants,"\\bCSQ\\b")[[1]][4]
meta = unlist(strsplit(meta, "[|]"))
CADD_PHRED = match("CADD_PHRED",meta)
options(warn=-1)
outputCADD = do.call(rbind, strsplit(as.character(gsub(".*CSQ=","",VariantAnnotation)), "[|]"))[,CADD_PHRED]
options(warn=0)
return(outputCADD)
}
gnomADConstructor<-function(VariantAnnotation,variants)
{
#meta = queryMETA(variants,"CSQ")[[3]][4]
meta = queryMETA(variants,"\\bCSQ\\b")[[1]][4]
meta = unlist(strsplit(meta, "[|]"))
gnomADPOS = match("AF",meta)
gnomADgenPOS = match("gnomADg_AF",meta)
gnomADexoPOS = match("gnomADe_AF",meta)
process <- function(x) {
MAF = strsplit(as.character(gsub(".*CSQ=","",VariantAnnotation[x])), "[|]")[[1]][c(gnomADPOS)]
gen = strsplit(as.character(gsub(".*CSQ=","",VariantAnnotation[x])), "[|]")[[1]][c(gnomADgenPOS)]
exome = strsplit(as.character(gsub(".*CSQ=","",VariantAnnotation[x])), "[|]")[[1]][c(gnomADexoPOS)]
c(MAF,gen,exome)[order(c(MAF,gen,exome),decreasing = TRUE)][1]
}
gnomAD <-do.call(c, lapply(c(1:length(VariantAnnotation)), process))
return(gnomAD)
}
gnomADexomeConstructor<-function(shortVariant,longVariant)
{
#gnomadExome
gnomADexome<-matrix(".",ncol = 1,nrow = length(shortVariant))
gnomADTabix <- seqminer::tabix.read.table(exomeFile, shortVariant)
if(length(gnomADTabix)>0)
{
options(warn=-1)
gnomadlongVariant = do.call(paste, c(gnomADTabix[,c(1,2,4,5)], sep="-"))
options(warn=0)
gnomadlongVariant = gsub("^(\\d+)-(\\d+)-(\\w+)-(\\w+)", "\\1:\\2-\\3-\\4", gnomadlongVariant, perl = TRUE)
#gnomadlongVariant = apply(gnomADTabix[,1:2],1,function(x) paste(x,collapse=":"))
#gnomadlongVariant = apply(cbind(gnomadlongVariant,gnomADTabix[,c(4,5)]),1,function(x) paste(x,collapse="-"))
#gnomadlongVariant = gsub(" ", "", gnomadlongVariant, fixed = TRUE)
posGnomadTabix<-c(1:length(gnomadlongVariant))[!is.na(match(gnomadlongVariant,longVariant))]
posSelectedVariants<-match(gnomadlongVariant,longVariant)[!is.na(match(gnomadlongVariant,longVariant))]
m<-regexpr("controls_AF_popmax=\\S+?;", gnomADTabix[posGnomadTabix,8],perl=TRUE)
match<-regmatches(gnomADTabix[posGnomadTabix,8], m)
match<-gsub("controls_AF_popmax=", "",match,perl=TRUE)
matchGnomAD<-gsub("\\;", "",match,perl=TRUE)
gnomADexome[posSelectedVariants]<-matchGnomAD
}
return(gnomADexome)
}
getMetrics<-function(VariantAnnotation,variants)
{
options(warn=-1)
genesList = do.call(rbind, strsplit(as.character(gsub(".*CSQ=","",VariantAnnotation)), "[|]"))[,4]
options(warn=0)
genes<-getGeneName(VariantAnnotation,variants)
pos<-match(genes,geneConstraints$gene)
geneMetrics<-geneConstraints[pos,2:6]
#genesList<-as.matrix(geneConstraints[pos,1])
#genesList[is.na(pos),1]<-genes[is.na(pos)]
#genesList<-total_unique[match(genesList,total_unique[,2]),1]
geneMetrics<-cbind(genesList,geneMetrics)
return(geneMetrics)
}
getPhenoScore<-function(VariantAnnotation,GlobalPhenotypicScore,variants)
{
genes<-getGeneName(VariantAnnotation,variants)
FunctPos<-match(genes,GlobalPhenotypicScore[,2])
PhysPos<-match(genes,GlobalPhenotypicScore[,5])
FunctScore<-GlobalPhenotypicScore[FunctPos,3]
PhysScore<-GlobalPhenotypicScore[PhysPos,6]
FunctScore[is.na(FunctScore)]<-0.0001
PhysScore[is.na(PhysScore)]<-0.0001
FunctPos[is.na(FunctPos)]<-dim(GlobalPhenotypicScore)[1]/2
PhysPos[is.na(PhysPos)]<-dim(GlobalPhenotypicScore)[1]/2
FunctScore<-round(as.numeric(FunctScore),digits = 4)
PhysScore<-round(as.numeric(PhysScore),digits = 4)
PhenoScore<-cbind(FunctPos,FunctScore,PhysPos,PhysScore)
return(PhenoScore)
}
getKnownDisease<-function(VariantAnnotation,variants)
{
genes<-c("")
genes<-c(genes,getGeneName(VariantAnnotation,variants))
#indices<-sapply(unique(genes), function(x) which(HPO2genes[,1] == x))
genes<-genes[-1]
#indices<-indices[-1]
HPOrecessive<-"HP:0000007|HP:0001416|HP:0001526"
HPOdominant<-"HP:0000006|HP:0001415|HP:0001447|HP:0001448|HP:0001451|HP:0001455|HP:0001456|HP:0001463"
HPOXlink<-"HP:0001417|HP:0001418|HP:0001419|HP:0001423"
#recessive<-unlist(sapply(c(1:length(indices)), function(x) grep(HPOrecessive,HPO2genes[indices[[x]],2])[1]))
#dominant<-unlist(sapply(c(1:length(indices)), function(x) grep(HPOdominant,HPO2genes[indices[[x]],2])[1]))
#xlink<-unlist(sapply(c(1:length(indices)), function(x) grep(HPOXlink,HPO2genes[indices[[x]],2])[1]))
recessive<-match(genes,HPO2genes[grep(HPOrecessive,HPO2genes[,2]),1])
dominant<-match(genes,HPO2genes[grep(HPOdominant,HPO2genes[,2]),1])
xlink<-match(genes,HPO2genes[grep(HPOXlink,HPO2genes[,2]),1])
knownDisease <- matrix("",nrow = length(genes), ncol = 1)
knownDisease[!is.na(recessive)]<-paste(knownDisease[!is.na(recessive)],"|AR|",sep="")
knownDisease[!is.na(dominant)]<-paste(knownDisease[!is.na(dominant)],"|AD|",sep="")
knownDisease[!is.na(xlink)]<-paste(knownDisease[!is.na(xlink)],"|XLINK|",sep="")
return(knownDisease)
}
getClinPriorScore<-function(VariantScore,phenoScore)
{
#ClinPriorScore<-(0.2*as.numeric(VariantScore)) + (10*(as.numeric(phenoScore[,2])/as.numeric(phenoScore[,1])))
#ClinPriorScore<-(0.2*as.numeric(VariantScore)) + 0.8*(log10(1 + exp(-800*((as.numeric(phenoScore[,1])/20146)-0.2)))*(as.numeric(phenoScore[,2])))
position = do.call(pmin,as.data.frame(phenoScore[,c(1,3)]))
score = do.call(pmax,as.data.frame(phenoScore[,c(2,4)]))
#ClinPriorScore<-0.9*(log10(1 + exp(-80*((as.numeric(as.numeric(min(phenoScore[,c(1,3)])/dim(total_unique)[1])-0.2)))*(as.numeric(max(phenoScore[,c(2,4)])))))+0.1*as.numeric(VariantScore))
ClinPriorScore<-10*(log10(1 + exp(-30*(as.numeric(position/dim(total_unique)[1])-0.3)))*score)*as.numeric(VariantScore)
ClinPriorScore[VariantScore==0]<-ClinPriorScore[VariantScore==0]/20
return(round(ClinPriorScore,digits = 4))
}
filterGnomAD<-function(gnomAD,freqThreshold)
{
posNa = c(1:length(gnomAD))[is.na(as.numeric(gnomAD))]
posThreshold = c(1:length(gnomAD))[!is.na(as.numeric(gnomAD))][as.numeric(gnomAD)[!is.na(as.numeric(gnomAD))]<=as.numeric(freqThreshold)]
selectedVariants = sort(union(posNa,posThreshold))
return(selectedVariants)
}
orderTable<-function(tableRaw, inheritance)
{
tableOrder<-tableRaw
if(inheritance == "HOM" || inheritance == "HET" || inheritance == "XLink")
{
tableRaw<-tableRaw[order(tableRaw$ClinPriorScore, decreasing=TRUE),]
#inheritanceRaw<-sapply(c(1:length(tableRaw$ClinPriorScore)),function(x) paste(x, inheritance,sep = "-"))
remove<-tableRaw$ClinPriorScore
#tableOrder<-cbind(tableRaw,inheritanceRaw,remove)
tableOrder<-cbind(tableRaw,inheritance,remove)
}
if(inheritance == "CmpHET")
{
geneUnique<-unique(tableRaw$genesList[duplicated(tableRaw$genesList)])
posNULL = match("",geneUnique)
if(!is.na(posNULL)){geneUnique = geneUnique[-posNULL]}
process <- function(x) {
posGU = grep(paste("\\b",geneUnique[x],"\\b",sep=""),tableRaw$genesList)
score = matrix(mean(tableRaw$ClinPriorScore[posGU][order(tableRaw$ClinPriorScore[posGU],decreasing=TRUE)][1:2]),ncol=1,nrow = length(posGU))
posGU
}
pos <-do.call(c, lapply(c(1:length(geneUnique)), process))
process <- function(x) {
posGU = grep(paste("\\b",geneUnique[x],"\\b",sep=""),tableRaw$genesList)
score = matrix(mean(tableRaw$ClinPriorScore[posGU][order(tableRaw$ClinPriorScore[posGU],decreasing=TRUE)][1:2]),ncol=1,nrow = length(posGU))
score
}
remove <-do.call(c, lapply(c(1:length(geneUnique)), process))
tableOrder<-cbind(tableRaw[pos,],inheritance,remove)
tableOrder = tableOrder[order(tableOrder$remove,decreasing = TRUE),]
}
return(tableOrder)
}
constructTable<-function(selectedVariants,inheritance, freqThreshold, isCodingVar,variants)
{
Ordertable<-c()
gt = variants@gt
pos = match(sampleName, colnames(gt))
VariantAnnotation<-getVariant(selectedVariants,variants)
if(isCodingVar){selectedVariants2 = isCoding(VariantAnnotation,variants)
selectedVariants = selectedVariants[selectedVariants2]
}
VariantAnnotation<-getVariant(selectedVariants,variants)
gnomAD<-gnomADConstructor(VariantAnnotation,variants)
selectedVariants2 = filterGnomAD(gnomAD,freqThreshold)
selectedVariants = selectedVariants[selectedVariants2]
if(length(selectedVariants)>0)
{
selectedVariants = c(1,2,selectedVariants)
VariantAnnotation<-getVariant(selectedVariants,variants)
gnomAD<-gnomADConstructor(VariantAnnotation,variants)
Moderate<-isModerate(VariantAnnotation,variants)
HIGH<-isHigh(VariantAnnotation,variants)
Splicing<-isSplicing(VariantAnnotation,variants)
metrics<-getMetrics(VariantAnnotation,variants)
cDNA<-getcDNA(VariantAnnotation,variants)
Protein<-getProtein(VariantAnnotation,variants)
Consequence<-getConsequence(VariantAnnotation,variants)
clinvar<-ClinVarConstructor(VariantAnnotation,variants)
CADD<-CADDconstructor(VariantAnnotation,variants)
CCR<-CCRconstructor(VariantAnnotation,selectedVariants,variants)
GTsamples = getGTsamples(selectedVariants,variants)
VariantScore<-sapply(c(1:length(selectedVariants)),function(x) calculateVariantScore(HIGH[x], Splicing[x], Moderate[x], CADD[x], CCR[x], clinvar[x], metrics[x,], inheritance,GTsamples[x,pos]))
phenoScore<-getPhenoScore(VariantAnnotation,GlobalPhenotypicScore,variants)
ClinPriorScore<-getClinPriorScore(VariantScore,phenoScore)
knownDisease<-getKnownDisease(VariantAnnotation,variants)
phaseGT = getPhysicalPhasing(selectedVariants,variants)
coding = getisCoding(VariantAnnotation,variants)
GTsamplesFull = gt[selectedVariants,pos]
table<-cbind(getFIX(variants[selectedVariants])[,c(1:5)],metrics,clinvar,knownDisease,coding,Consequence,cDNA,Protein,HIGH,Splicing,Moderate,CADD,CCR,gnomAD,phenoScore,VariantScore,ClinPriorScore,GTsamples,GTsamplesFull,phaseGT)
Ordertable<-orderTable(table, inheritance)
#delete extra
posEXTRA = match(variants@fix[selectedVariants[c(1:2)],2],Ordertable$POS)[!is.na(match(variants@fix[selectedVariants[c(1:2)],2],Ordertable$POS))]
if(length(posEXTRA)>0){Ordertable = Ordertable[-posEXTRA,]}
}
return(Ordertable)
}
#variants<-filterVariantsVcfR(variants,filter)
#Blacklist
# blacklist = read.csv(blacklistFile, sep="\t", header=FALSE)
# ensemble = do.call(paste, c(blacklist, sep="-"))
#delete ensemble exome,genome Gnomad and blacklist variants
#ensemble<-unique(c(shortSNPexome,shortSNPgenome,blacklist))
# deleteVariants<-deleteGnomadBlacklistVariants()
#if(length(deleteVariants)>0){variants<-variants[-deleteVariants,]}
#variants<-getAlt_forMutliAllelic(variants)
#delete indels in same genomic position
#deleteVariants<-deleteIndelVariantsSamePos()
#if(length(deleteVariants)>0){variants<-variants[-deleteVariants,]}
#HOM variants
HOMvariants<-getHOMvariants(variants)
tableHOM<-c()
if(length(HOMvariants)>0){tableHOM<-constructTable(HOMvariants,"HOM", frequenceAR,isCodingVar,variants)}
#AD variants
ADvariants<-getADvariants(variants)
tableAD<-c()
if(length(ADvariants)>0){tableAD<-constructTable(ADvariants,"HET", frequenceAD,isCodingVar,variants)}
#Compound heterozygous variants
CMPHETvariants<-getCmpHETvariants(variants)
tableCMPHET<-c()
if(length(CMPHETvariants)>0){tableCMPHET<-constructTable(CMPHETvariants,"CmpHET", frequenceAR,isCodingVar,variants)}
#XLINK variants
XLvariants<-getXLvariants(variants)
tableXL<-c()
if(length(XLvariants)>0){tableXL<-constructTable(XLvariants,"XLink",frequenceAR,isCodingVar,variants)}
tableEnsemble<-rbind(tableHOM,tableAD,tableCMPHET,tableXL)
tableEnsemble<-tableEnsemble[order(tableEnsemble$remove,tableEnsemble$genesList,decreasing=TRUE),]
del<-match("remove",colnames(tableEnsemble))
tableEnsemble<-tableEnsemble[,-del]
ClinPriorPosition <- c(1:dim(tableEnsemble)[1])
tableEnsemble<-cbind(ClinPriorPosition,tableEnsemble)
gc()
return(tableEnsemble)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.