#######################################################
### Allele's imputation from SNPs ###
#######################################################
#' Impute HLA allele by using HiBAG package
#' @param bedFile .bed file containing the packed binary SNP genotype data
#' @param bimFile .bim file containingthe SNP descriptions
#' @param famFile .fam file containing subject (and, possibly, family) identifiers. This is basically a tab-delimited "pedfile"
#' @param hlaLoci vector of HLA loci, by default it is: A B C DPR1
#' @param probThreshold Threshold of Postprob from Hibag to keep. By default 0.5
#' @param trainingModel Model for hibag training. Download pre-fit Model https://github.com/zhengxwen/HIBAG
#' @param pcaFile FIle with two principal components of PCA
#' @param outputFolder folder where to save the result
#' @import HIBAG
#' @importFrom utils read.table write.table
#' @importFrom stats predict
#' @importFrom snpStats read.plink
#' @importFrom stringr str_sub
#' @return All HLA alleles with imputation probabilities
#' @export
#'
hlaImputation <- function( bedFile , bimFile , famFile , hlaLoci= c("A", "B", "C", "DRB1", "DQB1"), trainingModel = NULL ,pcaFile , outputFolder, probThreshold = 0.5 )
{
## Check Parameters
if (!file.exists(bedFile)){ stop("bedFile not found") }
if (!file.exists(bimFile)){ stop("bimFile not found") }
if (!file.exists(famFile)){ stop("famFile not found") }
if (!file.exists(pcaFile)){ stop("pcaFile not found") }
if (!dir.exists(outputFolder)){ stop("outputFolder directory doesn't exist.") }
if(str_sub(bedFile,-4,-1 ) != ".bed"){ stop("bedFile doesn't have .bed extension")}
if(str_sub(bimFile,-4,-1 ) != ".bim"){ stop("bimFile doesn't have .bim extension")}
if(str_sub(famFile,-4,-1 ) != ".fam"){ stop("famFile doesn't have .fam extension")}
if (is.null(trainingModel))
{ ##Build A model
stop("Model Building not yet available. Please retry and load a model.")
}else{
if (!file.exists(trainingModel)) {stop("trainingModel file doesn't exist.")}
## Training Model preparation
model.list <- get(load(trainingModel))
## HLA Prediction
data <- HIBAG::hlaBED2Geno(bed.fn= bedFile , fam.fn = famFile, bim.fn = bimFile ) # Import Plink files
for(hla.id in hlaLoci)
{
model <- HIBAG::hlaModelFromObj(model.list[[hla.id]]) # load from model, SNP of one gene (hla.id)
pred.guess <- stats::predict(model, data, type="response", match.type="Position") # predict most probable alleles by using SNPs. SNPs of the model and dataset are matched by position.
# Saving of best guess genotype and its postprob
utils::write.table( pred.guess$value , paste( outputFolder , "HLA-", hla.id ,".csv",sep =""), row.names=F, quote=F, col.names=T, sep =",",dec = "." , na = "NA")
# # all possible combinaison of HLA alleles for each individuals
# all_genotypes <- as.data.frame(pred.guess$postprob)
# all_genotypes <- cbind(rownames(all_genotypes), all_genotypes)
# colnames(all_genotypes)[1] <- "Genotypes" ## Add a colunm with combinaison of alleles example 01:25/01:01
# all_genotypes$allele1 <- gsub("/.*$", "",all_genotypes$Genotypes) # select allele before "/" example 01:25
# all_genotypes$allele2 <- gsub("^.*?/", "",all_genotypes$Genotypes) # select allele after "/" example 01:01
# df <- data.frame(all_genotypes[which(names(all_genotypes)=="allele1")] , all_genotypes[which(names(all_genotypes)=="allele2")] , all_genotypes[2: (ncol(all_genotypes)-2)] )
# utils::write.table( df, paste( outputFolder , "HLA-", hla.id ,"_allGenotypes.csv",sep =""), row.names=F, quote=F, col.names=T, sep =",",dec = "." , na = "NA")
}
plink <- snpStats::read.plink( bed = bedFile , bim= bimFile , fam = famFile , na.strings = "-9" ) ##return a dataframe corrresponding to the .fam
statut <- plink$fam[,c(2,6)] # subject ID ("member") and their disease phenotype ("affected")
names(statut) <- c("member","Disease")
pca <- utils::read.table( pcaFile , stringsAsFactors = F, header = T, sep=',')
names(pca) <- c("member","PC1", "PC2")
mergedFile <- merge(statut,pca[,1:3], by="member", all = TRUE) ## add PC1, PC2 for each individuals
for(hla.id in hlaLoci)
{
# Merge of the 5 files returned by hibag (best guess of genotype)
file <- utils::read.table( paste( outputFolder , "HLA-", hla.id ,".csv",sep =""), stringsAsFactors = F, header = T, sep=',')
file <- subset(file, prob>=probThreshold )# keep genotype with a postprob upper than 0.5
colnames(file) <- c("member" , hla.id ,hla.id)
mergedFile <- merge(mergedFile , file [1:3] , by="member", all = TRUE)
}
names(mergedFile) <- c("subjectID" , "Disease" ,"PC1" , "PC2", "A","A","B","B","C","C","DRB1","DRB1","DQB1" ,"DQB1")
utils::write.table( mergedFile ,paste( outputFolder, "HLAalleles_bestGuess.csv", sep="") ,row.names=F, quote=F, col.names=T , sep = ",")
# deletion of the 5 files HLA-A.csv , HLA-B.csv ... HLA-DQB1.csv
system (paste("rm ", outputFolder , "HLA-*.csv ",sep =""))
}
}
#' Count HLA allele (allele dose) of interest and display their distribution across group
#'
#' Take a file containing at least colunms: subjectId , allele1 , allele 2, prob
#'
#' @param file The File containing Hla allele for each individuals and probability to have it
#' @param selection vector of allele of interest
#' @param outputFile The file that will contain the input info and
#' @return file with with all input file's data and 2xselection colunms: one for allele count(0 1 2 ) and the other with fractionned dose
#' @import ggplot2
#' @export
#'
allele_dose <- function( file , selection , outputFile )
{
data <- as.data.frame( utils::read.table( file , header = T, sep=',') )
result <- as.data.frame(data$sample.id)
colnames(result)[1] <- "sample.id"
for(allele in selection )
{
if( data$allele1 == allele )
{
data[, paste(allele , "_dose" , sep = "") ] <- ifelse(data$allele2 == allele ,2,1)
data[,paste(allele ,"_Fdose",sep = "")] <- data[,paste(allele ,"_dose",sep = "")] * data[,"prob"]
}else{
data[, paste( allele , "_dose" , sep = "") ] <- ifelse(data$allele2 == allele ,1,0)
data[,paste(allele ,"_Fdose",sep = "")] <- data[,paste(allele ,"_dose",sep = "")] * data[,"prob"]
}
result[,paste(allele , "_dose" , sep = "")] <- data[, paste(allele , "_dose" , sep = "") ]
utils::write.table(data , outputFile , row.names = F , sep = ",")
}
# Keep only Alleles Count columns And Do the total of each allele
rownames(result) <- result[,1]
result <- result[,-1]
result <- rbind(result, colSums(result[,]))
rownames(result)[nrow(result)] <- "Total.Count"
# Replace colnames and rownames as row and columns
result <- cbind(rownames(result),result)
colnames(result)[1]<- "Sample.ID"
rownames(result) <- NULL # Remove rownames
# Subset need info for the plot
result <- rbind(colnames(result),result)
displ <- result[nrow(result),2:length(selection)]
r <- as.data.frame(t(displ))
d <- cbind(row.names(r),r)
colnames(d) <-c("alleles" , "values")
plot <- ggplot(d, aes(x = alleles, y = values ,fill= alleles)) + geom_bar(stat = "identity", position = position_dodge())
print(plot)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.