HLAfix: HLA Analysis Pipeline for large-scale Association Studies

Documented in hlaAlleleAnalysis hlaAminoAcidAnalysis hlaCexpressionAnalysis hlaHaplotypeAnalysis hlaKIRAnalysis pvalueCorrection

if(getRversion() >= "2.15.1") utils::globalVariables(".")
globalVariables(c("pheno", "Disease","P_allelic","P_dominant","Freq","freq","pdf","freq_control","freq_case","dev.off","AA","threshold","occurency", "outputFile", "HLA_KIR","capture.output","haploReg","Allele","val1","val2","Nctrl","Ncase","subdata","prob","Mean_pheno","ggplot2"))


#####################################################################
### Allele association analysis with trait(s) (-/+ Covariates) ######
#####################################################################

#' Analyze association of HLA alleles with many variate phenotypes (like case/control or quantitative trait)
#'
#' Take as input :
#' (1) a file containing the following 14 colunms :  subjectID Disease PC1 PC2  A A B B C C DRB1 DRB1 DQB1 DQB1 ("," separator between fields)
#' (2 optional) file with differents quantitative phenotypes : subjectID traitName1 traitName2 ... traitName(n)
#' (3 optional) file with covariable to correct regressions
#'
#' @param dataPath HLA alleles file
#' @param phenoPath phenotypes file with different trait's names in columns
#' @param covarPath must contains SubjectID followed by covariables
#' @param outputFolder folder where to save the result
#' @param outputFile result's filename
#' @param type either "disease" for case/control analysis or "pheno" with quantitative phenotype(s)
#' @return Linear Regression result and many plots : QQplot, personnalized manhattan plot and/or variation of phenotype's expression
#' @import ggplot2
#' @import plotrix
#' @import graphics
#' @importFrom  qqman qq
#' @importFrom stats binomial glm na.omit p.adjust lm
#' @importFrom utils read.table write.table capture.output
#' @importFrom stats binomial glm na.omit
#' @importFrom grDevices dev.off  pdf
#' @export
#'

hlaAlleleAnalysis <- function( dataPath , phenoPath=NULL , covarPath=NULL  , outputFolder , outputFile  , type = "pheno" )
{
  # Check Parameters
  if(!file.exists(dataPath)){stop("dataPath is not found ")}
  if(!dir.exists(outputFolder)){stop("outputFolder directoy doesn't exist. ")}
  if(!is.character(outputFile)){stop("outputFile must be character")}
  if( type=="pheno")
  {
    if(is.null(phenoPath)){stop("For type=pheno you must provide phenoPath parameter.")}
    if(!file.exists(phenoPath)){stop("phenoPath is not found ")}
  }

  ################  DATA IMPORTATION   ################
  all_data <-  as.data.frame(utils::read.table(dataPath  , stringsAsFactors = F, header = T, sep=','))
  if(ncol(all_data)!= 14){stop("data must contains 14 colunms :subjectID Disease PC1 PC2  A A B B C C DRB1 DRB1 DQB1 DQB1 ("," separator between fields)")}
  names(all_data) <- c("subjectID" , "Disease","PC1","PC2" ,"A","A.1","B","B.1","C","C.1","DRB1","DRB1.1","DQB1","DQB1.1")
  # if((!is.element(0,all_data$Disease) & !is.element(1,all_data$Disease))
  # {
  #   all_data$Disease <- ifelse(all_data$Disease=="Ctrl" || all_data$Disease=="ctrl" ||all_data$Disease=="control" || all_data$Disease=="Control" || all_data$Disease=="controle" ,0,1)  # create pheno and 0 for ctrl or 1 for cases
  #   if (sum(is.na(all_data$Disease)) == length(all_data$Disease)) {stop("Please code 0 and 1 the Disease statut of your individuals.")}
  # }
  #
  ################ OPTIONAL PARAMETERS  ################
  if(!is.null(covarPath)){
    covarCorrection <- stats::na.omit(utils::read.table(covarPath ,  header = TRUE, sep = "," , dec = "."))
    colnames(covarCorrection)[1] <- "subjectID"
    covarNames <- stats::na.omit(names(covarCorrection)) # extract name of covariates.
    covarNames <- covarNames[covarNames!="subjectID"]
    covariables <- noquote(paste(covarNames, collapse = " + "))
    all_data <- merge(all_data, covarCorrection, by ="subjectID")
    names(all_data) <- c("subjectID" , "Disease","PC1","PC2" ,"A","A.1","B","B.1","C","C.1","DRB1","DRB1.1","DQB1","DQB1.1", "AGE.V0", "SEX","CMV.V0","NBYTABAC")
  }

  # THRESHOLDS
  freq_threshold <- readline(prompt = "\n \n Enter a frequency threshold of alleles to displays in manhattan plot\n Frequency : " )
  threshold <- readline(prompt = "In order to display less alleles and more clearly. Define a p value treshold under which one alleles are not displayed \n P-value:  ")

  switch(type,
         disease={

           HLA_genes <- c("A", "B", "C", "DRB1", "DQB1") # Set up HLA genes
           res <-  c("Locus","HLA_allele", "Frequency", "Frequency_control", "Frequency_case", "N", "P_allelic", "OR_allelic", "lower_0.95_CI_allelic", "upper_0.95_CI_allelic", "Beta_allelic", "SE_allelic", "P_dominant", "OR_dominant", "lower_0.95_CI_dominant", "upper_0.95_CI_dominant", "Beta_dominant", "SE_dominant") ## Create header to store results

           # File that will contains best Pvalue for all phenotypes
           X <- c("Locus","HLA_allele", "Frequency", "Frequency_control", "Frequency_case", "N", "P_allelic", "OR_allelic", "lower_0.95_CI_allelic", "upper_0.95_CI_allelic", "Beta_allelic", "SE_allelic", "P_dominant", "OR_dominant", "lower_0.95_CI_dominant", "upper_0.95_CI_dominant", "Beta_dominant", "SE_dominant")
           write.table(t(X), file=paste(outputFolder,"BestPvalue_AllelesDiseaseAnalysis.csv ",sep = "") , row.names = FALSE ,   col.names = FALSE  ,quote=F, sep = ",")

           for(gene in HLA_genes) # Make a loop with all the HLA genes ## Create the loop to analyze the data one gene at a time and inside another loop that takes each alleles
           {
             subdata <- na.omit(all_data[,c("subjectID","Disease","PC1","PC2",gene,paste(gene,".1",sep = ""))])
             names(subdata) <- c("subjectID","Disease","PC1","PC2", "allele1" , "allele2" )
             if(is.null(subdata) | length(subdata) == 0) {stop(" \n No rows raimining in your dataset, too much NA. Is your disease statut coded 0 and 1 ? \n ")}
             HLA <- stats::na.omit(as.character(unique(c(unique(subdata$allele1), unique(subdata$allele2))))) # Make a list of all existing HLA alleles in each file
             for(allele in HLA)
             { # Make a loop with all the HLA alleles for each gene, Convert the allele to 1,2 code (val1, val2), then to genotype (val3), and finally to recessive (val4) or dominant (val5) testable model
               val1 <- paste("allele1_", allele, sep="")
               subdata[,val1] <- ifelse(subdata$allele1 == allele, 1,
                                        ifelse(subdata$allele1 != allele, 2, NA))

               val2 <- paste("allele2_", allele, sep="")
               subdata[,val2] <- ifelse(subdata$allele2 == allele, 1,
                                        ifelse(subdata$allele2 != allele, 2, NA))

               val3 <- paste("genotype_", allele, sep="")
               subdata[,val3] <- paste(subdata[,val1], subdata[,val2], sep="")

               val4 <- paste("genoA_", allele, sep="")
               subdata[,val4] <- ifelse(subdata[,val3] == '11', 1,
                                        ifelse(subdata[,val3] == '12', 0,
                                               ifelse(subdata[,val3] == '21', 0,
                                                      ifelse(subdata[,val3] == '22', -1, NA))))

               val5 <- paste("genoD_", allele, sep="")
               subdata[,val5] <- ifelse(subdata[,val3] == '11', 1,
                                        ifelse(subdata[,val3] == '12', 1,
                                               ifelse(subdata[,val3] == '21', 1,
                                                      ifelse(subdata[,val3] == '22', 0, NA))))


               freq <- sum(subdata[,val1]==1,subdata[,val2]==1)/(nrow(subdata)*2) # Overall frequency
               n_ind_ctrl <- nrow(subset(subdata, Disease == 0))
               n_ind_cas <-nrow(subset(subdata, Disease == 1))

               if( n_ind_ctrl != 0 & n_ind_cas != 0 ) # Frequency are calculated for case and control
               {
                 # N <- 4/((1/nrow(subset(subdata, Disease == 1)))+(1/nrow(subset(subdata, Disease == 0))))
                 N <- nrow(subdata[,!is.na(Disease)])
                 freq_control <- sum(subset(subdata, Disease == 0)[,val1]==1,subset(subdata, Disease == 0)[,val2]==1)/(nrow(subset(subdata, Disease == 0))*2) # Controls frequency
                 freq_case <- sum(subset(subdata, Disease == 1)[,val1]==1,subset(subdata, Disease == 1)[,val2]==1)/(nrow(subset(subdata, Disease == 1))*2) # Cases frequency with homozygote for the allele (11)

               }else{
                 if( n_ind_ctrl == 0 & n_ind_cas != 0){ # won't calculate frequency for ctrl and set to 0
                   freq_control <- 0
                   freq_case <- sum(subset(subdata, Disease == 1)[,val1]==1,subset(subdata, Disease == 1)[,val2]==1)/(nrow(subset(subdata, Disease == 1))*2) # Cases frequency with homozygote for the allele (11)
                   N <- nrow(subset(subdata, Disease == 1))

                 }else{
                   if( n_ind_ctrl != 0 & n_ind_cas == 0){ # won't calculate frequency for ctrl and set to 0
                     freq_control <- sum(subset(subdata, Disease == 0)[,val1]==1,subset(subdata, Disease == 0)[,val2]==1)/(nrow(subset(subdata, Disease == 0))*2) # Controls frequency
                     freq_case <- 0
                     N <- nrow(subset(subdata, Disease == 0))

                   }else{ # Unknown case control Status
                     stop("For a Case/Control analysis, the Disease colunm is required. Please note 1=case and 0=control")

                   }
                 }
               }

               # Adapt the regression correction co-variables
               subdata[, val4] <- as.factor(subdata[, val4]) # convert genoA and genoD to factor for a logistic regression
               subdata[, val5] <- as.factor(subdata[, val5])
               subdata[,"Disease"] <- as.factor(subdata[,"Disease"])

               if(!is.null(covarPath)){
                 reg_allelic <- summary(stats::glm(Disease ~ subdata[, val4] + PC1 + PC2 + AGE.V0 + SEX + CMV.V0 + NBYTABAC , data = subdata, family=stats::binomial(link="logit"),maxit=100)) # Logistic regression with allelic model
                 reg_dominant <- summary(stats::glm(Disease ~ subdata[, val5] + PC1 + PC2 + AGE.V0 + SEX + CMV.V0 + NBYTABAC , data = subdata, family=stats::binomial(link="logit"),maxit=100)) # Logistic regression with dominant model

               }else{
                 reg_allelic <- summary(stats::glm(Disease ~ subdata[, val4] + PC1 + PC2, data = subdata, family=stats::binomial(link="logit"),maxit=100)) # Logistic regression with allelic model
                 reg_dominant <- summary(stats::glm(Disease ~ subdata[, val5] + PC1 + PC2, data = subdata, family=stats::binomial(link="logit"),maxit=100)) # Logistic regression with dominant model

               }

               N <- 4/((1/nrow(subset(subdata, Disease == 1)))+(1/nrow(subset(subdata, Disease == 0))))
               P_allelic <- reg_allelic$coefficients[2,4]
               OR_allelic <- exp(reg_allelic$coefficients[2,1])
               lower_95_CI_allelic <- exp(reg_allelic$coefficients[2,1]-1.96*reg_allelic$coefficients[2,2])
               upper_95_CI_allelic <- exp(reg_allelic$coefficients[2,1]+1.96*reg_allelic$coefficients[2,2])
               Beta_allelic <- reg_allelic$coefficients[2,1]
               SE_allelic <- reg_allelic$coefficients[2,2]
               P_dominant <- reg_dominant$coefficients[2,4]
               OR_dominant <- exp(reg_dominant$coefficients[2,1])
               lower_95_CI_dominant <- exp(reg_dominant$coefficients[2,1]-1.96*reg_dominant$coefficients[2,2])
               upper_95_CI_dominant <- exp(reg_dominant$coefficients[2,1]+1.96*reg_dominant$coefficients[2,2])
               Beta_dominant <- reg_dominant$coefficients[2,1]
               SE_dominant <- reg_dominant$coefficients[2,2]

               # add the regression result to the "res" vector
               res <- rbind(res, c(gene, allele , freq , freq_control , freq_case, N, P_allelic, OR_allelic, lower_95_CI_allelic, upper_95_CI_allelic, Beta_allelic, SE_allelic, P_dominant, OR_dominant, lower_95_CI_dominant, upper_95_CI_dominant, Beta_dominant, SE_dominant)) # See header of res for elements explanation

             }

           }
           # Save results
           # if(!is.null(res) & length(res) > 18){
           utils::write.table(res, paste(outputFolder , outputFile,"DiseaseAnalysis.csv" , sep=""), row.names=F, quote=F, col.names=F, sep = ",")
           cat("Regression at allele levels is processed successfully \n")
           cat(paste("File", paste(outputFolder , outputFile,"DiseaseAnalysis.csv" , sep="") ,'is created.  \n', sep = " "))

           ##########################################################################################################################################
           information <-   utils::read.table( paste(outputFolder , outputFile,"DiseaseAnalysis.csv" , sep=""), stringsAsFactors = F, header = T, sep=',')
           seuil <- pvalueCorrection(alleleRegResult = paste(outputFolder , outputFile,"DiseaseAnalysis.csv" , sep=""), correction = "bonferroni")
           if(is.null(seuil)){stop("Seuil calculated with pvalueCorrection() is null")}

           grDevices::pdf(file = paste(outputFolder ,outputFile,"Alleles_Disease_Analysis_freq",freq_threshold,"_pvalManhattan",threshold , ".pdf" ,sep=""))
           ################### Best Pvalue selection gathered in an unique file
           resultLoop <- as.data.frame(information)
           resultLoop <- resultLoop[resultLoop$Frequency >= freq_threshold  & (resultLoop$P_allelic <= 0.05 | resultLoop$P_dominant <= 0.05) ,]
           if(!is.null(resultLoop) & length(resultLoop)> 0){
             suppressWarnings({
               resultLoop <-resultLoop[,c("Locus","HLA_allele", "Frequency", "Frequency_control", "Frequency_case", "N", "P_allelic", "OR_allelic", "lower_0.95_CI_allelic", "upper_0.95_CI_allelic", "Beta_allelic", "SE_allelic", "P_dominant", "OR_dominant", "lower_0.95_CI_dominant", "upper_0.95_CI_dominant", "Beta_dominant", "SE_dominant")]
               write.table(resultLoop, file=paste(outputFolder,"BestPvalue_AllelesDiseaseAnalysis.csv ",sep = "") , append = TRUE, row.names = FALSE ,  col.names = FALSE ,quote=F, sep = ",")
             })
           }
           ################### QQPLOT
           ## Lambda: genomic inflation factor, show the presence of bias
           # allelic Model
           observedR <- sort(information$P_allelic)
           lobsR <- -(log10(observedR))
           expectedR <- c(1:length(observedR))
           lexpR <- -(log10(expectedR / (length(expectedR)+1)))
           lambdaR<- summary(stats::lm(lobsR~lexpR))

           # Dominant Model
           observedD <- sort(information$P_dominant)
           lobsD <- -(log10(observedD))
           expectedD <- c(1:length(observedD))
           lexpD <- -(log10(expectedD / (length(expectedD)+1)))
           lambdaD<- summary(stats::lm(lobsD~lexpD))

           # QQ PLOT
           qq1 <- qq(information$P_allelic , main="Disease analysis of alleles in allelic model ", sub= paste("Lambda = ", round(lambdaR$coefficients[2,1],2),sep = ""))
           qq2 <- qq(information$P_dominant , main="Disease analysis of alleles in dominant model ", sub= paste("Lambda = ", round(lambdaD$coefficients[2,1],2),sep = ""))
           capture.output( print(qq1),  file='NUL' )
           capture.output( print(qq2) , file='NUL')

           ################### MANHATTAN PLOT
           selection <-  na.omit(information[ information$Frequency >= freq_threshold & (information$P_allelic<=threshold | information$P_dominant<=threshold),c(1,2,7,13)]) # "Locus" "HLA_allele" "P_allelic" "P_dominant"
           if(nrow(selection)==0){
             cat(paste("None alleles to display. You can change your threshold p-value. Set at ",threshold,sep = ""))
           }else{
             options(scipen = 999)
             colnames(selection) <- c("Locus", "Allele" , "P_allelic", "P_dominant")
             maxYvalPallelic <- (max(-log10(selection$P_allelic))+1)
             maxYvalPdom <- (max(-log10(selection$P_dominant))+1)

             # Adapt x axis legend with the number of alleles to plot
             n <- nrow(selection)
             if(n<50){
               size_txt_x_axis <- 10
               size_point <- 2
             }else{
               if(n>50 & n<95){
                 size_txt_x_axis <- 5.5
                 size_point <- 2
               }else{
                 if(n>95){
                   size_txt_x_axis <- 3.5
                   size_point <- 1
                 }
               }
             }
             plot1 <-ggplot( selection , aes(x=Allele, y=  -(log10(P_allelic)) )) + geom_point(na.rm = TRUE, size=size_point, color=ifelse((-log10(selection$P_allelic)) >= seuil, "red","black")) + scale_y_continuous( limits = c(0 , maxYvalPallelic)) +  theme(axis.text.x = element_text(size = size_txt_x_axis, angle=90)) + facet_grid(.~Locus , scales = "free" , space = "free_x") + geom_hline(yintercept= -log10(0.05) , color = "blue")  + geom_hline(yintercept= seuil , color = "red")  + ggtitle(paste("Disease analysis of alleles in allelic model",sep = ""))
             plot2 <-ggplot( selection , aes(x=Allele, y=  -(log10(P_dominant)) ))  + geom_point(na.rm = TRUE, size=size_point, color=ifelse((-log10(selection$P_dominant)) >= seuil, "red","black")) + scale_y_continuous( limits = c(0 , maxYvalPdom)) +  theme(axis.text.x = element_text(size = size_txt_x_axis, angle=90)) + facet_grid(.~Locus , scales = "free" , space = "free_x") + geom_hline(yintercept= -log10(0.05) , color = "blue")  + geom_hline(yintercept= seuil , color = "red") + ggtitle(paste("Disease analysis of alleles in dominant model",sep = ""))
           }
           suppressWarnings({
             print(plot1)
             print(plot2)
           })

           grDevices::dev.off()

         }, # end case = disease
  pheno ={
           #List of phenotypes
           phenoFile <-  as.data.frame(utils::read.table(phenoPath  , stringsAsFactors = F, header = T, sep=','))
           colnames(phenoFile)[1] <- "subjectID"
           phenotypes <- stats::na.omit(unique(c(names(phenoFile)))) # Make a list of of phenotypes in the file from colnames
           phenotypes <- phenotypes[phenotypes != "subjectID"]

           # File that will contains best Pvalue for all phenotypes
           X <- c("Phenotype","Locus","HLA_allele", "Frequency", "Frequency_control", "Frequency_case", "N","Mean_pheno", "P_allelic", "Beta_allelic", "SE_allelic", "R2_allelic","P_dominant", "Beta_dominant", "SE_dominant","R2_dominant")
           write.table(t(X), file=paste(outputFolder,"BestPvalue_AllelesPhenoAnalysis.csv ",sep = "") , row.names = FALSE ,   col.names = FALSE  ,quote=F, sep = ",")
           grDevices::pdf(file = paste(outputFolder ,outputFile,"Alleles_Pheno_Analysis_freq",freq_threshold,"_pvalManhattan",threshold , ".pdf" ,sep=""))

           for(pheno in phenotypes)
           {
             print(paste(pheno, " is currently being analysed.", sep = ""))
             HLA_genes <- c("A", "B", "C", "DRB1", "DQB1")
             res <- c("Locus","HLA_allele", "Frequency", "Frequency_control", "Frequency_case", "N","Mean_pheno", "P_allelic", "Beta_allelic", "SE_allelic", "R2_allelic","P_dominant", "Beta_dominant", "SE_dominant","R2_dominant")
             phenoGeno <- merge(all_data,phenoFile[,c("subjectID",pheno)],by = "subjectID") # could contain co-variables if provided
             colnames(phenoGeno)[ncol(phenoGeno)] <- "trait"
             phenoGeno <- na.omit(phenoGeno)
             Mean_pheno <- mean(phenoGeno$trait)

             if(nrow(phenoGeno)==0){stop("After merging genetic data with pheno and removing NA there is no data left to run an analysis. Maybe an incompatible sample.ID")}
             if(dir.exists(paste(outputFolder,"expression_HLA/",sep = ""))){
               grDevices::pdf(file = paste(outputFolder ,"expression_HLA/",outputFile,pheno,"-expression-vs-alleles " , ".pdf" ,sep=""))
             }else{
               dir.create(paste(outputFolder,"expression_HLA/",sep = ""))
               grDevices::pdf(file = paste(outputFolder ,"expression_HLA/",outputFile,pheno,"-expression-vs-alleles " , ".pdf" ,sep=""))
             }
             for(gene in HLA_genes) # Make a loop with all the HLA genes
             {
               if(!is.null(covarPath)){
                 subdata <- na.omit (subset(phenoGeno, select=c("subjectID","Disease","PC1","PC2",gene,paste(gene,".1",sep = ""),"trait", "AGE.V0", "SEX","CMV.V0","NBYTABAC")))
                 names(subdata) <- c("subjectID","Disease", "PC1","PC2","allele1" , "allele2", "trait" , "AGE.V0", "SEX","CMV.V0","NBYTABAC")
               }else{
                 subdata <- na.omit (subset(phenoGeno, select=c("subjectID","Disease","PC1","PC2",gene,paste(gene,".1",sep = ""),"trait")))
                 names(subdata) <- c("subjectID","Disease", "PC1","PC2","allele1" , "allele2", "trait")
               }
               HLA <- stats::na.omit(as.character(unique(c(unique(subdata$allele1), unique(subdata$allele2))))) # Make a list of all existing HLA alleles in each file
               for(allele in HLA)
               {
                 val1 <- paste("allele1_", allele, sep="")
                 subdata[,val1] <- ifelse(subdata$allele1 == allele, 1,
                                          ifelse(subdata$allele1 != allele, 2, NA))

                 val2 <- paste("allele2_", allele, sep="")
                 subdata[,val2] <- ifelse(subdata$allele2 == allele, 1,
                                          ifelse(subdata$allele2 != allele, 2, NA))

                 val3 <- paste("genotype_", allele, sep="")
                 subdata[,val3] <- paste(subdata[,val1], subdata[,val2], sep="")

                 val4 <- paste("genoA_", allele, sep="")
                 subdata[,val4] <- ifelse(subdata[,val3] == '11', 1,
                                          ifelse(subdata[,val3] == '12', 0,
                                                 ifelse(subdata[,val3] == '21', 0,
                                                        ifelse(subdata[,val3] == '22', -1, NA))))

                 val5 <- paste("genoD_", allele, sep="")
                 subdata[,val5] <- ifelse(subdata[,val3] == '11', 1,
                                          ifelse(subdata[,val3] == '12', 1,
                                                 ifelse(subdata[,val3] == '21', 1,
                                                        ifelse(subdata[,val3] == '22', 0, NA))))


                 freq <- sum(subdata[,val1]==1,subdata[,val2]==1)/(nrow(subdata)*2) # Overall frequency
                 n_ind_ctrl <- nrow(subset(subdata, Disease == 0))
                 n_ind_cas <-nrow(subset(subdata, Disease == 1))

                 if( n_ind_ctrl != 0 & n_ind_cas != 0 ) # Frequency are calculated for case and control
                 {
                   # N <- 4/((1/nrow(subset(subdata, Disease == 1)))+(1/nrow(subset(subdata, Disease == 0))))
                   N <- nrow(subdata[,!is.na(Disease)])
                   freq_control <- sum(subset(subdata, Disease == 0)[,val1]==1,subset(subdata, Disease == 0)[,val2]==1)/(nrow(subset(subdata, Disease == 0))*2) # Controls frequency
                   freq_case <- sum(subset(subdata, Disease == 1)[,val1]==1,subset(subdata, Disease == 1)[,val2]==1)/(nrow(subset(subdata, Disease == 1))*2) # Cases frequency with homozygote for the allele (11)

                 }else{
                   if( n_ind_ctrl == 0 & n_ind_cas != 0){ # won't calculate frequency for ctrl and set to 0
                     freq_control <- 0
                     freq_case <- sum(subset(subdata, Disease == 1)[,val1]==1,subset(subdata, Disease == 1)[,val2]==1)/(nrow(subset(subdata, Disease == 1))*2) # Cases frequency with homozygote for the allele (11)
                     N <- nrow(subset(subdata, Disease == 1))

                   }else{
                     if( n_ind_ctrl != 0 & n_ind_cas == 0){ # won't calculate frequency for ctrl and set to 0
                       freq_control <- sum(subset(subdata, Disease == 0)[,val1]==1,subset(subdata, Disease == 0)[,val2]==1)/(nrow(subset(subdata, Disease == 0))*2) # Controls frequency
                       freq_case <- 0
                       N <- nrow(subset(subdata, Disease == 0))

                     }else{ # Unknown case control Status
                       N <- 0
                       freq_control <- 0
                       freq_case <- 0
                     }
                   }
                 }

                 # Adapt the regression correction co-variables
                 if(!is.null(covarPath)){
                   reg_allelic <- summary(stats::lm(trait ~ subdata[, val4] + PC1 + PC2 + AGE.V0 + SEX + CMV.V0 + NBYTABAC, data = subdata  )) # Linear regression with allelic model
                   reg_dominant <- summary(stats::lm(trait ~ subdata[, val5] + PC1 + PC2 + AGE.V0 + SEX + CMV.V0 + NBYTABAC , data = subdata )) # Linear regression with dominant model

                 }else{
                   reg_allelic <- summary(stats::lm(trait ~ subdata[, val4] + PC1 + PC2, data = subdata)) # Linear regression with allelic model
                   reg_dominant <- summary(stats::lm(trait ~ subdata[, val5] + PC1 + PC2, data = subdata)) # Linear regression with dominant model

                 }

                 P_allelic <- reg_allelic$coefficients[2,4]
                 Beta_allelic <- round(reg_allelic$coefficients[2,1],2)
                 SE_allelic <- round(reg_allelic$coefficients[2,2],2)
                 Rsquare_allelic <-round(reg_allelic$adj.r.squared, 3)

                 P_dominant <- reg_dominant$coefficients[2,4]
                 Beta_dominant <- round(reg_dominant$coefficients[2,1],2)
                 SE_dominant <- round(reg_dominant$coefficients[2,2],2)
                 Rsquare_dominant <-round(reg_dominant$adj.r.squared, 3)
                 # add the regression result to the "res" vector
                 options(scipen = 999)
                 res <- rbind(res, c(gene, allele , freq, freq_control, freq_case, N, Mean_pheno, P_allelic, Beta_allelic, SE_allelic, Rsquare_allelic, P_dominant, Beta_dominant, SE_dominant, Rsquare_dominant)) # See header of res for elements explanation

               }

               ################  ehPlot for visualization of quantitative phenotype as a function of allele ################
               selection <- phenoGeno[,c("subjectID" , gene, paste(gene,".1",sep=""),"trait")]
               names(selection) <- c("subjectID" , "allele1","allele2","trait")
               selection$genotype <- ifelse(selection$allele1==selection$allele2,2,1) # coded 1:heterozugous 2:homozygous
               selection$genotype <- as.factor(selection$genotype)
               names(selection) <- c("subjectID" , "allele1","allele2","trait","genotype")

               # alleles independally
               HLA_a1 <- na.omit(selection[,c("trait","allele1", "genotype")]) #allele 1 without NA
               names(HLA_a1)[2] <- "HLA_alleles" #rename column
               HLA_a2 <- na.omit(selection[,c("trait","allele2", "genotype")]) #allele 2 without NA
               names(HLA_a2)[2] <- "HLA_alleles" #rename column
               HLA_all_alleles <- rbind(HLA_a1, HLA_a2) #combine data
               names(HLA_all_alleles) <- c("trait","All","genotype")
               HLA_all_alleles$HLA_alleles_2d <- substr(HLA_all_alleles$All, 1, 2) # Create 2 digits column

               #Remove alleles with less than 5 data point
               alleles <- unique(HLA_all_alleles$HLA_alleles_2d)
               for(i in alleles){
                 if(sum(HLA_all_alleles$HLA_alleles_2d == i) < 5){
                   HLA_all_alleles <- HLA_all_alleles[! HLA_all_alleles$HLA_alleles_2d == i,]
                 }
               }
               if(nrow(HLA_all_alleles)!=0)
               { # try to draw the plot only if data are available
                 #Get the stat
                 HLA_all_alleles_boxplot <- boxplot(HLA_all_alleles$trait ~ HLA_all_alleles$HLA_alleles_2d, plot=F)
                 HLAstat <- HLA_all_alleles_boxplot$stats
                 colnames(HLAstat) <- HLA_all_alleles_boxplot$names
                 rownames(HLAstat) <- c('min','lower quartile','median','upper quartile','max')
                 HLAstat <- t(HLAstat)
                 HLAstat <- HLAstat[order(HLAstat[,3]),]
                 HLAstat <- cbind(HLAstat, as.matrix(tapply(HLA_all_alleles$trait, HLA_all_alleles$HLA_alleles_2d, mean)))
                 colnames(HLAstat)[6] <- "mean"
                 # write.table(HLAstat, "HLA_DR_2digits_expr_MI_data_280318.txt", sep="\t")
                 HLA_all_alleles$HLA_alleles_2d <- factor(HLA_all_alleles$HLA_alleles_2d, levels=rownames(HLAstat)) #set the HLA-C alleles as factors to show the graph in increasing order

                 #Draw the graph
                 suppressWarnings({
                   myEhplot <- plotrix::ehplot(HLA_all_alleles$trait, HLA_all_alleles$HLA_alleles_2d, xlab=paste("HLA-", gene," alleles",sep = ""), ylab=paste(pheno," expression"), main=paste(pheno," expression depending on alleles"), pch=20, interval=5, col=HLA_all_alleles$genotype)
                   capture.output( print(myEhplot) , file='NUL')
                 })
               }
             } # end gene loop
             grDevices::dev.off()

             # save a result file for one phenotype.
             utils::write.table(res, paste(outputFolder , outputFile , "-", pheno , ".csv" ,sep=""), row.names=F, quote=F, col.names=F, sep = ",")
             cat(paste("Regression at allele levels and with : ", pheno , " as covariates  is processed successfully \n" , sep = ""))
             cat( paste("File", paste(outputFolder , outputFile, "-", pheno   , sep="") ,'is created.  \n', sep = " "))

             ##########################################################################################################################################

             information <-   utils::read.table(paste(outputFolder , outputFile , "-", pheno , ".csv" ,sep=""), stringsAsFactors = F, header = T, sep=',')
             seuil <- pvalueCorrection(alleleRegResult = paste(outputFolder , outputFile , "-", pheno , ".csv" ,sep=""), correction = "bonferroni")
             if(is.null(seuil)){stop("Seuil calculated with pvalueCorrection() is null")}

             # grDevices::dev.off()
             ################### Best Pvalue selection gathered in an unique file
             resultLoop <- as.data.frame(information)
             resultLoop <- resultLoop[resultLoop$Frequency >= freq_threshold  & (resultLoop$P_allelic <= 0.05 | resultLoop$P_dominant <= 0.05) ,]
             if(nrow(resultLoop)!=0){
               suppressWarnings({
                 options(scipen = 999)
                 resultLoop$Phenotype <- pheno
                 resultLoop <-resultLoop[,c("Phenotype","Locus","HLA_allele", "Frequency", "Frequency_control", "Frequency_case", "N","Mean_pheno", "P_allelic", "Beta_allelic", "SE_allelic", "R2_allelic","P_dominant", "Beta_dominant", "SE_dominant","R2_dominant")]
                 write.table(resultLoop, file=paste(outputFolder,"BestPvalue_AllelesPhenoAnalysis.csv ",sep = "") , append = TRUE, row.names = FALSE ,  col.names = FALSE ,quote=F, sep = ",")
               })
             }
             ################### QQPLOT

             ## Lambda: genomic inflation factor, show the presence of bias
             # allelic Model
             observedR <- sort(information$P_allelic)
             lobsR <- -(log10(observedR))
             expectedR <- c(1:length(observedR))
             lexpR <- -(log10(expectedR / (length(expectedR)+1)))
             lambdaR<- summary(stats::lm(lobsR~lexpR))

             # Dominant Model
             observedD <- sort(information$P_dominant)
             lobsD <- -(log10(observedD))
             expectedD <- c(1:length(observedD))
             lexpD <- -(log10(expectedD / (length(expectedD)+1)))
             lambdaD<- summary(stats::lm(lobsD~lexpD))

             # QQ PLOT
             qq1 <- qq(information$P_allelic , main=paste(pheno," in allelic modele ",sep = ""), sub= paste("Lambda = ", round(lambdaR$coefficients[2,1],2),sep = ""))
             qq2 <- qq(information$P_dominant , main=paste(pheno,"Alleles in Dominant modele ",sep = ""), sub= paste("Lambda = ", round(lambdaD$coefficients[2,1],2),sep = ""))
             capture.output( print(qq1),  file='NUL' )
             capture.output( print(qq2) , file='NUL')

             ################### MANHATTAN PLOT
             selection <-  information[ information$Frequency >= freq_threshold & (information$P_allelic <= threshold | information$P_dominant <= threshold),c(1,2,8,12)] # "Locus" "HLA_allele" "P_allelic" "P_dominant"
             if(nrow(selection)==0){
               cat(paste("None alleles to display. You can change your threshold p-value. Set at ",threshold,sep = ""))
             }else{
               options(scipen = 999)
               colnames(selection) <- c("Locus", "Allele" , "P_allelic", "P_dominant")
               maxYvalPallelic <- (max(-log10(selection$P_allelic))+1)
               maxYvalPdom <- (max(-log10(selection$P_dominant))+1)

               # Adapt x axis legend with the number of alleles to plot
               n <- nrow(selection)
               if(n<50){
                 size_txt_x_axis <- 10
                 size_point <- 2
               }else{
                 if(n>50 & n<95){
                   size_txt_x_axis <- 5.5
                   size_point <- 2
                 }else{
                   if(n>95){
                     size_txt_x_axis <- 3.5
                     size_point <- 1
                   }
                 }
               }
               plot1 <-ggplot( selection , aes(x=Allele, y=  -(log10(P_allelic)) )) + geom_point(na.rm = TRUE, size=size_point, color=ifelse((-log10(selection$P_allelic)) > seuil, "red","black")) + scale_y_continuous( limits = c(0 , maxYvalPallelic)) +  theme(axis.text.x = element_text(size = size_txt_x_axis, angle=90)) + facet_grid(.~Locus , scales = "free" , space = "free_x") + geom_hline(yintercept= -log10(0.05) , color = "blue")  + geom_hline(yintercept= seuil , color = "red")  + ggtitle(paste( pheno ," analysis in allelic Model",sep = ""))
               plot2 <-ggplot( selection , aes(x=Allele, y=  -(log10(P_dominant)) ))  + geom_point(na.rm = TRUE, size=size_point, color=ifelse((-log10(selection$P_dominant)) > seuil, "red","black")) + scale_y_continuous( limits = c(0 , maxYvalPdom)) +  theme(axis.text.x = element_text(size = size_txt_x_axis, angle=90)) + facet_grid(.~Locus , scales = "free" , space = "free_x") + geom_hline(yintercept= -log10(0.05) , color = "blue")  + geom_hline(yintercept= seuil , color = "red") + ggtitle(paste( pheno ," analysis in Dominant Model",sep = ""))
             }
             suppressWarnings({
               print(plot1)
               print(plot2)
             })
           } # end pheno loop

           grDevices::dev.off()
         }) # end case = pheno
}


#####################################################################
### Haplotype association analysis with trait(s) (-/+ Covariates)###
#####################################################################

#' Analyze association of HLA haplotypes with many variate phenotypes (like case/control or quantitative trait)
#' Take as input:
#' (1 required) haplotypes :  subjectID haplotype1 freq1  haplotype2 freq2 postprob ("," separator between fields)
#' (2 optional) differents quantitative phenotypes : subjectID traitName1 traitName2 ... traitName(n)
#' (3 required) Disease information : 0= control 1=case (needed for frequencies and/or type=disease analysis)
#' (4 required) the 2 first principals components of the PCA
#' (5 optional) covariable to correct regressions
#'
#'
#' @param haploDataPath HLA haplotypes data exactly as the one provided by easy-HLA webtool
#' @param phenoPath phenotypes file with different trait's names in columns
#' @param statut file conaining subject ID and Disease statut
#' @param pcaPath pca's result: subjectID, PC1 and PC2
#' @param covarPath must contains SubjectID followed by covariables
#' @param outputFolder folder where to save the result
#' @param outputFile result's filename
#' @param type  either "disease" for case/control analysis or "pheno" with quantitative phenotype(s)
#' @return Linear Regression result and many plots : QQplot and personnalized manhattan plot
#' @import ggplot2
#' @import ggrepel
#' @importFrom  qqman qq
#' @importFrom stats binomial glm na.omit p.adjust lm
#' @importFrom utils read.table write.table capture.output
#' @importFrom grDevices dev.off pdf
#' @export
#'


hlaHaplotypeAnalysis <- function( haploDataPath , statut , phenoPath= NULL, pcaPath=NULL, covarPath=NULL , outputFolder , outputFile, type="pheno")
{
  # Check Parameters
  if(!file.exists(haploDataPath)){stop("haploDataPath is not found ")}
  if(!file.exists(phenoPath)){stop("phenoPath is not found ")}
  if(!file.exists(statut)){stop("statut is not found ")}
  if(!dir.exists(outputFolder)){stop("outputFolder directoy doesn't exist. ")}
  if(!is.character(outputFile)){stop("outputFile must be character")}

  if( type=="pheno")
  {
    if(is.null(phenoPath)){stop("For type=pheno you must provide phenoPath parameter.")}
    if(!file.exists(phenoPath)){stop("phenoPath is not found ")}
  }

  ################  DATA IMPORTATION   ################
  haploData <- as.data.frame(utils::read.table(haploDataPath  , stringsAsFactors = F, header = T, sep=','))
  phenoFile <-  as.data.frame(utils::read.table(phenoPath  , stringsAsFactors = F, header = T, sep=','))
  statut <- stats::na.omit(utils::read.table(statut , header = TRUE , sep = "," , dec = "."))
  if(ncol(statut)!=2){stop("statut must contains 2 colunms : subjectID and Affection statut")}
  names(statut) <- c("subjectID","Disease")
  # statut$Disease <- ifelse(statut$Disease=="Ctrl" || statut$Disease=="ctrl" ||statut$Disease=="control" || statut$Disease=="Control" || statut$Disease=="controle" ,0,1) # create pheno and 0 for ctrl or 1 for cases

  names(haploData) <- c("subjectID" ,"A.B.C.DR.DQ.1","freq1","A.B.C.DR.DQ.2","freq2","postProb" ,"precision")
  colnames(phenoFile)[1] <- "subjectID"

  all_data <- merge(haploData, statut, by ="subjectID")
  colnames(all_data)[ncol(all_data)] <- "Disease"

  ################ OPTIONAL PARAMETERS  ################
  if(!is.null(pcaPath)){
    PCs <- stats::na.omit(utils::read.table(pcaPath ,  header = TRUE, sep = "," , dec = "."))
    if(ncol(PCs)!=3){stop("PCs must contains 3 colunms : subjectID, PC1 and PC2.")}
    colnames(PCs)[1]<-"subjectID"
    colnames(PCs)[2]<-"PC1"
    colnames(PCs)[3]<-"PC2"
    all_data <- merge(all_data, PCs[,1:3], by ="subjectID")
  }
  if(!is.null(covarPath)){
    covarCorrection <- stats::na.omit(utils::read.table(covarPath ,  header = TRUE, sep = "," , dec = "."))
    colnames(covarCorrection)[1] <- "subjectID"
    all_data <- na.omit(merge(all_data, covarCorrection ,by = "subjectID"))
    colnames(all_data)[ncol(all_data)-1] <- "NBYTABAC"
    colnames(all_data)[ncol(all_data)-2] <- "CMV.V0"
    colnames(all_data)[ncol(all_data)-3] <- "SEX"
    colnames(all_data)[ncol(all_data)-4] <- "AGE.V0"

    # covarNames <- stats::na.omit(names(covarCorrection))  # extract name of covariates.
    # covarNames <- covarNames[covarNames!="subjectID"]
    # covariables <- noquote(paste(covarNames, collapse = " + "))
  }

  # THRESHOLDS
  freq_threshold <- readline(prompt = "\n \n Enter a frequency threshold of haplotype to displays in manhattan plot\n Frequency: " )
  threshold <- readline(prompt = "\n \n In order to display most interesting haplotypes and in clearly way. \n Define a p value treshold under which one haplotypes are not displayed in manhattan plot \n P-value:  ")
  occurrency <- readline(prompt = "\n \n Enter number of haplotype occurency in the cohort \n Occurency: " )

  switch(type,
         disease={
           res <- c("HLA_haplotype", "Frequency", "Frequency_control", "Frequency_case", "N", "P_allelic", "OR_allelic", "lower_0.95_CI_allelic", "upper_0.95_CI_allelic", "Beta_allelic", "SE_allelic", "P_dominant", "OR_dominant", "lower_95%_CI_dominant", "upper_95%_CI_dominant", "Beta_dominant", "SE_dominant")

           ## Determine haplotypes to analyze; more than 2 occurences
           df <- as.data.frame(table(stats::na.omit(c( as.character(all_data$A.B.C.DR.DQ.1) , as.character(all_data$A.B.C.DR.DQ.2))))) ## number of occurences
           haplo <-subset(df , Freq >= occurency) ## select haplo that appears more than 2 times
           grDevices::pdf(file = paste(outputFolder ,outputFile,"Haplo_Disease_analysis_freq",freq_threshold,"_occurency", occurrency ,"_pvalManhattan",threshold , ".pdf" ,sep=""))

           for(hap in haplo[,1])
           { # Make a loop with all the HLA haplotypes, Convert the haplotypes to 1,2 code (val1, val2), then to genotype (val3), and finally to recessive (val4) or dominant (val5) testable model
             val1 <- paste("allele1_", hap, sep="")
             all_data[,val1] <- ifelse(all_data$A.B.C.DR.DQ.1 == hap, 1,
                                       ifelse(all_data$A.B.C.DR.DQ.1 != hap, 2, NA))

             val2 <- paste("allele2_", hap, sep="")
             all_data[,val2] <- ifelse(all_data$A.B.C.DR.DQ.2 == hap, 1,
                                       ifelse(all_data$A.B.C.DR.DQ.2 != hap, 2, NA))

             val3 <- paste("genotype_", hap, sep="")
             all_data[,val3] <- paste(all_data[,val1], all_data[,val2], sep="")

             val4 <- paste("genoA_", hap, sep="")
             all_data[,val4] <- ifelse(all_data[,val3] == '11', 1,
                                       ifelse(all_data[,val3] == '12', 0,
                                              ifelse(all_data[,val3] == '21', 0,
                                                     ifelse(all_data[,val3] == '22', -1, NA))))

             val5 <- paste("genoD_", hap, sep="")
             all_data[,val5] <- ifelse(all_data[,val3] == '11', 1,
                                       ifelse(all_data[,val3] == '12', 1,
                                              ifelse(all_data[,val3] == '21', 1,
                                                     ifelse(all_data[,val3] == '22', 0, NA))))


             ## Frequencies
             freq <- sum(all_data[,val1]==1,all_data[,val2]==1)/(nrow(all_data)*2) # Overall frequency
             n_ind_ctrl <- nrow(subset(all_data, Disease == 0))
             n_ind_cas <-nrow(subset(all_data, Disease == 1))

             if( n_ind_ctrl != 0 & n_ind_cas != 0 )
             { # Frequency are calculated for case and control
               # N <- 4/((1/nrow(subset(all_data, Disease == 1)))+(1/nrow(subset(all_data, Disease == 0))))
               N <- nrow(subdata[,!is.na(Disease)])
               freq_control <- sum(subset(all_data, Disease == 0)[,val1]==1,subset(all_data, Disease == 0)[,val2]==1)/(nrow(subset(all_data, Disease == 0))*2) # Controls frequency
               freq_case <- sum(subset(all_data, Disease == 1)[,val1]==1,subset(all_data, Disease == 1)[,val2]==1)/(nrow(subset(all_data, Disease == 1))*2) # Cases frequency

             }else{
               if( n_ind_ctrl == 0 & n_ind_cas != 0){ # won't calculate frequency for ctrl and set to 0
                 freq_control <- 0
                 freq_case <- sum(subset(all_data, Disease == 1)[,val1]==1,subset(all_data, Disease == 1)[,val2]==1)/(nrow(subset(all_data, Disease == 1))*2) # Cases frequency
                 N <- nrow(subset(all_data, Disease == 1))

               }else{
                 if( n_ind_ctrl != 0 & n_ind_cas == 0){ # won't calculate frequency for case and set to 0
                   freq_control <- sum(subset(all_data, Disease == 0)[,val1]==1,subset(all_data, Disease == 0)[,val2]==1)/(nrow(subset(all_data, Disease == 0))*2) # Controls frequency
                   freq_case <- 0
                   N <- nrow(subset(all_data, Disease == 0))
                 }else{ # Unknown case control Status
                   stop("For a Case/Control analysis, the Disease colunm is required. Please note 1=case and 0=control")
                 }
               }
             }
             # Adapt the regression correction co-variables
             if(is.null(covarPath) & is.null(pcaPath)){
               reg_allelic <- summary(stats::glm(Disease ~ all_data[, val4] , data = all_data, family=stats::binomial(link="logit"), maxit = 100)) # Logistic regression with allelic model
               reg_dominant <- summary(stats::glm(Disease ~  all_data[, val5] , data = all_data, family=stats::binomial(link="logit"), maxit = 100 )) # Logistic regression with dominant model
             }else{
               if(!is.null(covarPath) & is.null(pcaPath)){
                 reg_allelic <- summary(stats::glm(Disease ~ all_data[, val4] + AGE.V0 + SEX + CMV.V0 + NBYTABAC  , data = all_data, family=stats::binomial(link="logit"), maxit = 100)) # Logistic regression with allelic model
                 reg_dominant <- summary(stats::glm(Disease ~  all_data[, val5] + AGE.V0 + SEX + CMV.V0 + NBYTABAC , data = all_data , family=stats::binomial(link="logit"), maxit = 100)) # Logistic regression with dominant model
               }else{
                 if(is.null(covarPath) & !is.null(pcaPath)){
                   reg_allelic <- summary(stats::glm(Disease ~ all_data[, val4] + PC1 + PC2 , data = all_data, family=stats::binomial(link="logit"), maxit = 100)) # Logistic regression with allelic model
                   reg_dominant <- summary(stats::glm(Disease ~ all_data[, val5] + PC1 + PC2 , data = all_data, family=stats::binomial(link="logit"), maxit = 100)) # Logistic regression with dominant model


                 }else{
                   reg_allelic <- summary(stats::glm(Disease ~ all_data[, val4] + PC1 + PC2 +  AGE.V0 + SEX + CMV.V0 + NBYTABAC  , data = all_data, family=stats::binomial(link="logit"), maxit = 100)) # Logistic regression with allelic model
                   reg_dominant <- summary(stats::glm(Disease ~  all_data[, val5] + PC1 + PC2 + AGE.V0 + SEX + CMV.V0 + NBYTABAC , data = all_data , family=stats::binomial(link="logit"), maxit = 100)) # Logistic regression with dominant model
                 }
               }
             }
             #reg_allelic <- summary(stats::glm(Disease ~ all_data[, val4] + PC1 + PC2 + covariables , data = all_data, family=stats::binomial(link="logit"), maxit = 100)) # Logistic regression with allelic model
             #reg_dominant <- summary(stats::glm(Disease ~ all_data[, val5] + PC1 + PC2 + covariables, data = all_data, family=stats::binomial(link="logit"), maxit = 100)) # Logistic regression with dominant model

             P_allelic <- reg_allelic$coefficients[2,4]
             OR_allelic <- round(exp(reg_allelic$coefficients[2,1]),2)
             lower_95_CI_allelic <- round(exp(reg_allelic$coefficients[2,1]-1.96*reg_allelic$coefficients[2,2]),2)
             upper_95_CI_allelic <- round(exp(reg_allelic$coefficients[2,1]+1.96*reg_allelic$coefficients[2,2]),2)
             Beta_allelic <- round(reg_allelic$coefficients[2,1],2)
             SE_allelic <- round(reg_allelic$coefficients[2,2],2)
             P_dominant <- reg_dominant$coefficients[2,4]
             OR_dominant <- round(exp(reg_dominant$coefficients[2,1]),2)
             lower_95_CI_dominant <- round(exp(reg_dominant$coefficients[2,1]-1.96*reg_dominant$coefficients[2,2]),2)
             upper_95_CI_dominant <- round(exp(reg_dominant$coefficients[2,1]+1.96*reg_dominant$coefficients[2,2]),2)
             Beta_dominant <- round(reg_dominant$coefficients[2,1],2)
             SE_dominant <- round(reg_dominant$coefficients[2,2],2)
             # add the regression result to the "res" vector
             options(scipen = 999)
             res <- rbind(res, c(hap, freq , freq_control , freq_case, N, P_allelic, OR_allelic, lower_95_CI_allelic, upper_95_CI_allelic, Beta_allelic, SE_allelic, P_dominant, OR_dominant, lower_95_CI_dominant, upper_95_CI_dominant, Beta_dominant, SE_dominant)) # See header of res for elements explanation

           }

           # Save results
           utils::write.table(res,paste(outputFolder , outputFile,"DiseaseAnalysis.csv" ,sep=""), sep=",",row.names=F, quote=F, col.names=F)
           debuggingState(on=FALSE)

           if(file.exists(paste(outputFolder , outputFile,"DiseaseAnalysis.csv" ,sep="")))
           {
             cat("Regression at haplotype levels is Processed successfully \n")
             cat( paste("File", paste(outputFolder , outputFile ,"DiseaseAnalysis.csv" , sep="") ,'is created.  \n', sep = " "))
           }

           ##################################################### PLOT RESULT ############################################################
           information <-   utils::read.table(paste(outputFolder , outputFile,"DiseaseAnalysis.csv" ,sep=""), stringsAsFactors = F, header = T, sep=',')
           seuil <- pvalueCorrection(haploRegResult = paste(outputFolder , outputFile , "-", pheno , ".csv" , sep=""),  correction = "bonferroni")
           if(is.null(seuil)){stop("Seuil calculated with pvalueCorrection() is null")}

           ################### Best Pvalue selection gathered in an unique file
           resultLoop <- as.data.frame(information)
           resultLoop <- resultLoop[resultLoop$Frequency >= freq_threshold & (resultLoop$P_allelic <= 0.05 | resultLoop$P_dominant <= 0.05 ),]
           if(nrow(resultLoop)!=0){
             suppressWarnings({
               options(scipen = 999)
               resultLoop$Phenotype <- pheno
               resultLoop <-resultLoop[,c("Phenotype","HLA_haplotype", "Frequency", "Frequency_control", "Frequency_case", "N", "P_allelic", "OR_allelic", "lower_0.95_CI_allelic", "upper_0.95_CI_allelic", "Beta_allelic", "SE_allelic", "P_dominant", "OR_dominant", "lower_0.95_CI_dominant", "upper_0.95_CI_dominant", "Beta_dominant", "SE_dominant")]
               names(resultLoop) <- c("Phenotype","HLA_haplotype", "Frequency", "Frequency_control", "Frequency_case", "N", "P_allelic", "OR_allelic", "lower_0.95_CI_allelic", "upper_0.95_CI_allelic", "Beta_allelic", "SE_allelic", "P_dominant", "OR_dominant", "lower_0.95_CI_dominant", "upper_0.95_CI_dominant", "Beta_dominant", "SE_dominant")
               write.table(resultLoop, file=paste(outputFolder,"BestPvalue_HaploAnalysis.csv ",sep = "") , append = TRUE, row.names = FALSE ,  col.names = FALSE ,quote=F, sep = ",")
             })
           }

           ################### QQPLOT
           ## Lambda: genomic inflation factor, show the presence of bias
           # allelic Model
           observedR <- sort(information$P_allelic)
           lobsR <- -(log10(observedR))
           expectedR <- c(1:length(observedR))
           lexpR <- -(log10(expectedR / (length(expectedR)+1)))
           lambdaR<-summary(stats::lm(lobsR~lexpR))

           # Dominant Model
           observedD <- sort(information$P_dominant)
           lobsD <- -(log10(observedD))
           expectedD <- c(1:length(observedD))
           lexpD <- -(log10(expectedD / (length(expectedD)+1)))
           lambdaD<- summary(stats::lm(lobsD~lexpD))

           qq1 <- qq(information$P_allelic , main=paste(pheno,"-haplotype (allelic model)",sep = ""), sub= paste("Lambda = ", round(lambdaR$coefficients[2,1],2),sep = ""))
           qq2 <- qq(information$P_dominant , main=paste(pheno,"-haplotype (dominant model)",sep = ""), sub= paste("Lambda = ", round(lambdaD$coefficients[2,1],2),sep = ""))
           capture.output(print(qq1) , file = "NUL")
           capture.output(print(qq2) , file = "NUL")

           ################### MANHATTAN PLOT
           haploSel <- information[information$Frequency >= freq_threshold & (information$P_allelic<=threshold | information$P_dominant<=threshold),]
           if(nrow(haploSel)==0){
             cat(paste("None haplotype to display. You can change your threshold p-value. Set at ",threshold ,sep = ""))
           }else{
             haploSel$order <- c(1:nrow(haploSel))
             options(scipen = 999)
             maxYvalPallelic <- ((max(-log10(haploSel$P_allelic)))+1)
             maxYvalPdom <- ((max(-log10(haploSel$P_dominant)))+1)

             n <- nrow(haploSel)
             if(n<50){ size_point <- 2 }else{ if(n>50 & n<95){ size_point <- 2 }else{ if(n>95){ size_point <- 1}}}

             plot1 <- ggplot( haploSel , aes(x=order, y=  -(log10(P_allelic)) )) + theme(panel.background = element_rect(fill = "#FFE4C4", colour = "#6D9EC1",size = 2, linetype = "solid"))+ geom_point(na.rm = TRUE, size=size_point, color=ifelse((-log10(haploSel$P_allelic)) > seuil, "red","black")) + scale_x_continuous( limits = c(1 , nrow(haploSel))) + scale_y_continuous( limits = c(0 , maxYvalPallelic)) + geom_hline(yintercept= -log10(0.05) , color = "blue") + geom_hline(yintercept= seuil , color = "red")+ ggtitle(paste(pheno, "-haplotype (allelic model)",sep="" ))
             plot2 <- ggplot( haploSel , aes(x=order, y=  -(log10(P_dominant)) )) + theme(panel.background = element_rect(fill = "#FFE4C4", colour = "#6D9EC1",size = 2, linetype = "solid")) + geom_point(na.rm = TRUE, size=size_point, color=ifelse((-log10(haploSel$P_dominant)) > seuil, "red","black")) + scale_x_continuous( limits = c(1 ,nrow(haploSel)))  + scale_y_continuous( limits = c(0 , maxYvalPdom)) + geom_hline(yintercept= -log10(0.05) , color = "blue") + geom_hline(yintercept= seuil , color = "red")+ ggtitle(paste(pheno, "-haplotype (dominant model)",sep="" ))
             suppressWarnings({
               print(plot1)
               print(plot2)
             })
           }

           grDevices::dev.off()
         }, # end case = disease
         pheno={
           #List of phenotypes
           phenotypes <- stats::na.omit(unique(c(names(phenoFile)))) # Make a list of of phenotypes in the file with colnames
           phenotypes <- phenotypes[phenotypes != "subjectID"]

           # File that will contains best Pvalue for all phenotypes
           X <- c("Phenotype","HLA_haplotype", "Frequency", "Frequency_control", "Frequency_case", "N","Mean_pheno", "P_allelic", "Beta_allelic", "SE_allelic", "R2_allelic","P_dominant", "Beta_dominant", "SE_dominant","R2_dominant")
           write.table(t(X), file=paste(outputFolder,"BestPvalue_HaploAnalysis.csv ",sep = "") , row.names = FALSE ,   col.names = FALSE  ,quote=F, sep = ",")

           grDevices::pdf(file = paste(outputFolder ,outputFile,"Haplo_Pheno_analysis_freq",freq_threshold,"_occurency", occurrency ,"_pvalManhattan",threshold , ".pdf" ,sep=""))

           for(pheno in phenotypes)
           {
             print(paste(pheno, "  currently being analysed.", sep = ""))
             ## Merge haplotypes and the phenotype analyzed in this loop
             merge_data_pheno <- merge(all_data,phenoFile[,c("subjectID",pheno)],by = "subjectID")
             colnames(merge_data_pheno)[ncol(merge_data_pheno)] <- "trait"
             phenoGeno <- na.omit(merge_data_pheno)
             Mean_pheno <- mean(phenoGeno$trait)

             ## Determine list of haplotypes to analyze; more than 2 occurences
             df <- as.data.frame(table(stats::na.omit(c(as.character(phenoGeno$A.B.C.DR.DQ.1) , as.character(phenoGeno$A.B.C.DR.DQ.2)))))
             names(df) <- c("haplotypes","Freq")
             haplo <-subset(df , Freq >= occurrency)  ## do a list of haplotypes having at least an "occurrency" times

             if(nrow(haplo)==0){ stop(paste("None haplotype present in your sample appears more than ", occurrency , " times (occurency threshold" , sep=""))}
             res <- c("HLA_haplotype", "Frequency",  "Frequency_control", "Frequency_case", "N","Mean_pheno", "P_allelic", "Beta_allelic", "SE_allelic", "R2_allelic","P_dominant", "Beta_dominant", "SE_dominant","R2_dominant")

             for(hap in haplo[,1])
             { # Make a loop with all the HLA haplotypes, Convert the haplotypes to 1,2 code (val1, val2), then to genotype (val3), and finally to recessive (val4) or dominant (val5) testable model
               val1 <- paste("allele1_", hap, sep="")
               phenoGeno[,val1] <- ifelse(phenoGeno$A.B.C.DR.DQ.1 == hap, 1,
                                          ifelse(phenoGeno$A.B.C.DR.DQ.1 != hap, 2, NA))

               val2 <- paste("allele2_", hap, sep="")
               phenoGeno[,val2] <- ifelse(phenoGeno$A.B.C.DR.DQ.2 == hap, 1,
                                          ifelse(phenoGeno$A.B.C.DR.DQ.2 != hap, 2, NA))

               val3 <- paste("genotype_", hap, sep="")
               phenoGeno[,val3] <- paste(phenoGeno[,val1], phenoGeno[,val2], sep="")

               val4 <- paste("genoA_", hap, sep="")
               phenoGeno[,val4] <- ifelse(phenoGeno[,val3] == '11', 1,
                                          ifelse(phenoGeno[,val3] == '12', 0,
                                                 ifelse(phenoGeno[,val3] == '21', 0,
                                                        ifelse(phenoGeno[,val3] == '22', -1, NA))))

               val5 <- paste("genoD_", hap, sep="")
               phenoGeno[,val5] <- ifelse(phenoGeno[,val3] == '11', 1,
                                          ifelse(phenoGeno[,val3] == '12', 1,
                                                 ifelse(phenoGeno[,val3] == '21', 1,
                                                        ifelse(phenoGeno[,val3] == '22', 0, NA))))
               ## Frequencies
               freq <- sum(phenoGeno[,val1]==1,phenoGeno[,val2]==1)/(nrow(phenoGeno)*2) # Overall frequency
               n_ind_ctrl <- nrow(subset(phenoGeno, Disease == 0))
               n_ind_cas <-nrow(subset(phenoGeno, Disease == 1))

               if( n_ind_ctrl != 0 & n_ind_cas != 0 )
               { # Frequency are calculated for case and control
                 # N <- 4/((1/nrow(subset(phenoGeno, Disease == 1)))+(1/nrow(subset(phenoGeno, Disease == 0))))
                 N <- nrow(subdata[,!is.na(Disease)])
                 freq_control <- sum(subset(phenoGeno, Disease == 0)[,val1]==1,subset(phenoGeno, Disease == 0)[,val2]==1)/(nrow(subset(phenoGeno, Disease == 0))*2) # Controls frequency
                 freq_case <- sum(subset(phenoGeno, Disease == 1)[,val1]==1,subset(phenoGeno, Disease == 1)[,val2]==1)/(nrow(subset(phenoGeno, Disease == 1))*2) # Cases frequency

               }else{
                 if( n_ind_ctrl == 0 & n_ind_cas != 0){ # won't calculate frequency for ctrl and set to 0
                   freq_control <- 0
                   freq_case <- sum(subset(phenoGeno, Disease == 1)[,val1]==1,subset(phenoGeno, Disease == 1)[,val2]==1)/(nrow(subset(phenoGeno, Disease == 1))*2) # Cases frequency
                   N <- nrow(subset(phenoGeno, Disease == 1))

                 }else{
                   if( n_ind_ctrl != 0 & n_ind_cas == 0){ # won't calculate frequency for case and set to 0
                     freq_control <- sum(subset(phenoGeno, Disease == 0)[,val1]==1,subset(phenoGeno, Disease == 0)[,val2]==1)/(nrow(subset(phenoGeno, Disease == 0))*2) # Controls frequency
                     freq_case <- 0
                     N <- nrow(subset(phenoGeno, Disease == 0))
                   }else{ # Unknown case control Status
                     N <- 0
                     freq_control <- 0
                     freq_case <- 0
                   }
                 }
               }
               # Adapt the regression correction co-variables
               if(is.null(covarPath) & is.null(pcaPath)){
                 reg_allelic <- summary(stats::lm(trait ~ phenoGeno[, val4] , data = phenoGeno )) # Linear regression with allelic model
                 reg_dominant <- summary(stats::lm(trait ~  phenoGeno[, val5] , data = phenoGeno )) # Linear regression with dominant model
               }else{
                 if(!is.null(covarPath) & is.null(pcaPath)){
                   reg_allelic <- summary(stats::lm(trait ~ phenoGeno[, val4] + AGE.V0 + SEX + CMV.V0 + NBYTABAC  , data = phenoGeno)) # Linear regression with allelic model
                   reg_dominant <- summary(stats::lm(trait ~  phenoGeno[, val5] + AGE.V0 + SEX + CMV.V0 + NBYTABAC , data = phenoGeno )) # Linear regression with dominant model
                 }else{
                   if(is.null(covarPath) & !is.null(pcaPath)){
                     reg_allelic <- summary(stats::lm(trait ~ phenoGeno[, val4] + PC1 + PC2, data = phenoGeno)) # Linear regression with allelic model
                     reg_dominant <- summary(stats::lm(trait ~  phenoGeno[, val5] + PC1 + PC2 , data = phenoGeno)) # Linear regression with dominant model
                   }else{
                     reg_allelic <- summary(stats::lm(trait ~ phenoGeno[, val4] + PC1 + PC2 +  AGE.V0 + SEX + CMV.V0 + NBYTABAC  , data = phenoGeno)) # Linear regression with allelic model
                     reg_dominant <- summary(stats::lm(trait ~  phenoGeno[, val5] + PC1 + PC2 + AGE.V0 + SEX + CMV.V0 + NBYTABAC , data = phenoGeno)) # Linear regression with dominant model
                   }
                 }
               }
               # reg_allelic <- summary(stats::lm(trait ~ phenoGeno[, val4] + PC1 + PC2 +  covariables  , data = phenoGeno)) # Logistic regression with allelic model
               # reg_dominant <- summary(stats::lm(trait ~  phenoGeno[, val5] + PC1 + PC2 + covariables , data = phenoGeno )) # Logistic regression with dominant model

               P_allelic <- reg_allelic$coefficients[2,4]
               Beta_allelic <- round(reg_allelic$coefficients[2,1],2)
               SE_allelic <- round(reg_allelic$coefficients[2,2],2)
               Rsquare_allelic <-round(reg_allelic$adj.r.squared, 3)

               P_dominant <- reg_dominant$coefficients[2,4]
               Beta_dominant <- round(reg_dominant$coefficients[2,1],2)
               SE_dominant <- round(reg_dominant$coefficients[2,2],2)
               Rsquare_dominant <-round(reg_dominant$adj.r.squared, 3)

               # add the regression result to the "res" vector
               res <- rbind(res, c(hap, freq , freq_control , freq_case, N, Mean_pheno, P_allelic, Beta_allelic, SE_allelic, Rsquare_allelic, P_dominant, Beta_dominant, SE_dominant, Rsquare_dominant)) # See header of res for elements explanation

             }
             # Save results
             utils::write.table(res, paste(outputFolder , outputFile , "-", pheno , ".csv" ,sep=""), row.names=F, quote=F, col.names=F, sep = ",")

             if(file.exists(paste(outputFolder , outputFile , "-", pheno , ".csv" ,sep="")))
             {
               cat(paste("Regression at haplotype levels with : ", pheno , " expression is processed successfully \n" , sep = ""))
               cat(paste("File", paste(outputFolder , outputFile, "-", pheno , ".csv"  , sep="") ,'is created.  \n', sep = " "))
             }

             ##################################################### PLOT RESULT ############################################################
             information <-   utils::read.table(paste(outputFolder , outputFile , "-", pheno , ".csv" ,sep=""), stringsAsFactors = F, header = T, sep=',')
             seuil <- pvalueCorrection(haploRegResult = paste(outputFolder , outputFile , "-", pheno , ".csv" , sep=""),  correction = "bonferroni")
             if(is.null(seuil)){stop("Seuil calculated with pvalueCorrection() is null")}

             ################### Best Pvalue selection gathered in an unique file
             resultLoop <- as.data.frame(information)
             resultLoop <- resultLoop[resultLoop$Frequency >= freq_threshold & (resultLoop$P_allelic <= 0.05 | resultLoop$P_dominant <= 0.05 ),]
             if(nrow(resultLoop)!=0){
               suppressWarnings({
                 resultLoop$Phenotype <- pheno
                 resultLoop <-resultLoop[,c("Phenotype","HLA_haplotype","Frequency", "Frequency_control", "Frequency_case", "N","Mean_pheno", "P_allelic", "Beta_allelic", "SE_allelic", "R2_allelic","P_dominant", "Beta_dominant", "SE_dominant","R2_dominant")]
                 names(resultLoop) <- c("Phenotype","HLA_haplotype", "Frequency", "Frequency_control", "Frequency_case", "N","Mean_pheno", "P_allelic", "Beta_allelic", "SE_allelic", "R2_allelic","P_dominant", "Beta_dominant", "SE_dominant","R2_dominant")
                 write.table(resultLoop, file=paste(outputFolder,"BestPvalue_HaploAnalysis.csv ",sep = "") , append = TRUE, row.names = FALSE ,  col.names = FALSE ,quote=F, sep = ",")
               })
             }

             ################### QQPLOT
             ## Lambda: genomic inflation factor, show the presence of bias
             # allelic Model
             observedR <- sort(information$P_allelic)
             lobsR <- -(log10(observedR))
             expectedR <- c(1:length(observedR))
             lexpR <- -(log10(expectedR / (length(expectedR)+1)))
             lambdaR<-summary(stats::lm(lobsR~lexpR))

             # Dominant Model
             observedD <- sort(information$P_dominant)
             lobsD <- -(log10(observedD))
             expectedD <- c(1:length(observedD))
             lexpD <- -(log10(expectedD / (length(expectedD)+1)))
             lambdaD<- summary(stats::lm(lobsD~lexpD))

             qq1 <- qq(information$P_allelic , main=paste(pheno,"-haplotype (allelic model)",sep = ""), sub= paste("Lambda = ", round(lambdaR$coefficients[2,1],2),sep = ""))
             qq2 <- qq(information$P_dominant , main=paste(pheno,"-haplotype (dominant model)",sep = ""), sub= paste("Lambda = ", round(lambdaD$coefficients[2,1],2),sep = ""))
             capture.output(print(qq1) , file = "NUL")
             capture.output(print(qq2) , file = "NUL")

             ################### MANHATTAN PLOT
             haploSel <- information[information$Frequency >= freq_threshold & (information$P_allelic<=threshold | information$P_dominant<=threshold),]
             if(nrow(haploSel)==0){
               cat(paste("None haplotype to display. You can change your threshold p-value. Set at ",threshold ,sep = ""))
             }else{
               haploSel$order <- c(1:nrow(haploSel))
               options(scipen = 999)
               maxYvalPallelic <- ((max(-log10(haploSel$P_allelic)))+1)
               maxYvalPdom <- ((max(-log10(haploSel$P_dominant)))+1)

               n <- nrow(haploSel)
               if(n<50){ size_point <- 2 }else{ if(n>50 & n<95){ size_point <- 2 }else{ if(n>95){ size_point <- 1}}}

               plot1 <- ggplot( haploSel , aes(x=order, y=  -(log10(P_allelic)) )) + theme(panel.background = element_rect(fill = "#FFE4C4", colour = "#6D9EC1",size = 2, linetype = "solid"))+ geom_point(na.rm = TRUE, size=size_point, color=ifelse((-log10(haploSel$P_allelic)) > seuil, "red","black")) + scale_x_continuous( limits = c(1 , nrow(haploSel))) + scale_y_continuous( limits = c(0 , maxYvalPallelic)) + geom_hline(yintercept= -log10(0.05) , color = "blue") + geom_hline(yintercept= seuil , color = "red")+ ggtitle(paste(pheno, "-haplotype (allelic model)",sep="" ))
               plot2 <- ggplot( haploSel , aes(x=order, y=  -(log10(P_dominant)) )) + theme(panel.background = element_rect(fill = "#FFE4C4", colour = "#6D9EC1",size = 2, linetype = "solid")) + geom_point(na.rm = TRUE, size=size_point, color=ifelse((-log10(haploSel$P_dominant)) > seuil, "red","black")) + scale_x_continuous( limits = c(1 ,nrow(haploSel)))  + scale_y_continuous( limits = c(0 , maxYvalPdom)) + geom_hline(yintercept= -log10(0.05) , color = "blue") + geom_hline(yintercept= seuil , color = "red")+ ggtitle(paste(pheno, "-haplotype (dominant model)",sep="" ))
               suppressWarnings({
                 print(plot1)
                 print(plot2)
               })
             }

           } # end pheno loop
           grDevices::dev.off()
         }) # end case pheno
}



#######################################################################
###  Amino-acids association analysis with trait(s) (-/+ Covariates)###
#######################################################################

#' Analyze association of HLA Amino-acids with many variate phenotypes (like case/control or quantitative trait)
#' Take as input:
#' (1 required) amino-acids :  subjectID haplotype1 freq1  haplotype2 freq2 postprob followed by many columns of amino-acids("," separator between fields)
#' (2 optional) differents quantitative phenotypes : subjectID traitName1 traitName2 ... traitName(n)
#' (3 required) Disease information : 0= control 1=case (needed for frequencies and/or type=disease analysis)
#' (4 required) the 2 first principals components of the PCA
#' (5 optional) covariable to correct regressions
#'
#' @param aminoAcidFile  HLA Amino-acids data exactly as the one provided by easy-HLA webtool
#' @param phenoPath phenotypes file with different trait names in columns
#' @param statut file conaining subject ID and Disease statut
#' @param pcaPath pca's result: subjectID, PC1 and PC2
#' @param covarPath must contains SubjectID followed by covariables
#' @param outputFolder folder where to save the result
#' @param outputFile result's filename
#' @param type  either "disease" for case/control analysis or "pheno" with quantitative phenotype(s)
#' @return  Linear Regression result and many plots : QQplot and personnalized manhattan plot
#' @import ggplot2
#' @import cowplot
#' @importFrom qqman qq
#' @importFrom utils read.csv write.table capture.output
#' @export


hlaAminoAcidAnalysis  <- function(aminoAcidFile, phenoPath=NULL, statut, pcaPath=NULL, covarPath=NULL, outputFolder ,outputFile, type="pheno")
{
  ## Check parameters
  if(!file.exists(aminoAcidFile)){stop("aminoAcidFile doesn't exist")}
  if(!file.exists(statut)){stop("statut is not found ")}
  if(!is.character(type)){stop("outputFile must be character")}
  if(!dir.exists(outputFolder)){stop("outputFolder directoy doesn't exist. ")}
  if(!is.character(outputFile)){stop("outputFile must be character")}


  ################  DATA IMPORTATION   ################
  amino_acid <- utils::read.csv(aminoAcidFile, stringsAsFactors = F,sep=",", header = TRUE) # Amino acid file from easy-HLA contains : code_patient	A-B-C-DR-DQ-1	freq1	A-B-C-DR-DQ-2	freq2	PostP	resolution FOLLOWED BY Amino acids
  data <- amino_acid[, apply(amino_acid, 2, function(x) length(unique(stats::na.omit(x)))) > 1] # Keep amino acid with information
  colnames(data)[1] <- "subjectID"
  list_aa <- colnames(data)[-c(1:6,(ncol(data)-6):ncol(data))] # list of amino-acids from colnames (first 7 seven colnames are ignored)

  statut <- stats::na.omit(utils::read.table(statut , header = TRUE , sep = "," , dec = "."))
  if(ncol(statut)!= 2){stop("statut must contains 2 colunms : subjectID and Affection statut")}
  names(statut) <- c("subjectID","Disease")
  data <- merge(data, statut, by ="subjectID")

  ################  OPTIONAL PARAMETERS   ################
  if(!is.null(pcaPath)){
    PCs <- stats::na.omit(utils::read.table(pcaPath ,  header = TRUE, sep = "," , dec = "."))
    if(ncol(PCs)!=3){stop("pcaPath should ocntain only 3 columns: subjectID, PC1 and PC2")}
    colnames(PCs)[1]<-"subjectID"
    colnames(PCs)[2]<-"PC1"
    colnames(PCs)[3]<-"PC2"
    data <- merge(data, PCs[,1:3], by ="subjectID")
  }

  if(!is.null(covarPath)){
    covarCorrection <- stats::na.omit(utils::read.table(covarPath ,  header = TRUE, sep = "," , dec = "."))
    colnames(covarCorrection)[1] <- "subjectID"
    covarNames <- stats::na.omit(names(covarCorrection))
    covarNames <- covarNames[covarNames!="subjectID"]
    covariables <- noquote(paste(covarNames, collapse = " + ")) #add as.character ??
    data <- merge(data, covarCorrection, by ="subjectID")
  }


  if(type=="pheno")
  {
    if(is.null(phenoPath)){stop('phenoPath must be provided when type="pheno"')}
    if(!file.exists(phenoPath)){stop("phenoPath file doesn't exist")}
    phenoFile <-  as.data.frame(utils::read.table(phenoPath , stringsAsFactors = F, header = T, sep=','))
    colnames(phenoFile)[1] <- "subjectID"

    #List of phenotypes and covarNames
    phenotypes <- stats::na.omit(unique(c(names(phenoFile)))) # Make a list of of phenotypes in the file with colnames
    phenotypes <- phenotypes[phenotypes != "subjectID"]

  }
  # Thresholds
  freq_threshold <- readline(prompt = "\n \n Enter a frequency threshold of Amino-acids to displays in manhattan plot\n Frequency: " )
  threshold <- readline(prompt = "\n \n In order to display most interesting Amino-acids and in clearly way. \n Define a p value treshold under which one Amino-acids are not displayed in manhattan plot \n P-value:  ")

  switch(type,
         disease={
           res <- c("HLA_Amino_Acid", "Frequency", "Frequency_control", "Frequency_case", "N", "P_allelic", "OR_allelic",  "lower_0.95_CI_allelic", "upper_0.95_CI_allelic", "Beta_allelic", "SE_allelic", "P_dominant", "OR_dominant","lower_0.95_CI_dominant", "upper_0.95_CI_dominant", "Beta_dominant", "SE_dominant")
           X <- c("HLA_Amino_Acid", "Frequency", "Frequency_control", "Frequency_case", "N", "P_allelic", "OR_allelic", "lower_0.95_CI_allelic", "upper_0.95_CI_allelic", "Beta_allelic", "SE_allelic", "P_dominant", "OR_dominant", "lower_0.95_CI_dominant", "upper_0.95_CI_dominant", "Beta_dominant", "SE_dominant")
           write.table(t(X), file=paste(outputFolder,"BestPvalue-AA-DiseaseAssoc.csv ",sep = "") , row.names = FALSE ,   col.names = FALSE  ,quote=F, sep = ",")
           grDevices::pdf(file=paste(outputFolder , outputFile , "-AA-DiseaseAssoc_Plot.pdf" ,sep=""))
           for(aa in list_aa) # Make a loop with all amino acids
           {
             ## Frequencies of amino acid
             freq <-  mean(stats::na.omit(data[,aa]))/2 # Overall frequency
             n_ind_ctrl <- nrow(subset(data, Disease == 0))
             n_ind_cas <-nrow(subset(data, Disease == 1))

             if( n_ind_ctrl != 0 & n_ind_cas != 0 )
             { # Frequency are calculated for case and control
               N <- nrow(data[,!is.na(Disease)])
               freq_control <- mean(stats::na.omit(subset(data, Disease == 0)[,aa]))/2 # Controls frequency
               freq_case <- mean(stats::na.omit(subset(data, Disease == 1)[,aa]))/2 # Cases frequency

             }else{
               if( n_ind_ctrl == 0 & n_ind_cas != 0){ # won't calculate frequency for ctrl and set to 0
                 freq_control <- 0
                 freq_case <- mean(stats::na.omit(subset(data, Disease == 1)[,aa]))/2 # Cases frequency
                 N <- nrow(subset(data, Disease == 1))

               }else{
                 if( n_ind_ctrl != 0 & n_ind_cas == 0){ # won't calculate frequency for case and set to 0
                   freq_control <-  mean(stats::na.omit(subset(data, Disease == 0)[,aa]))/2 # Controls frequency
                   freq_case <- 0
                   N <- nrow(subset(data, Disease == 0))
                 }else{ # Unknown case control Status
                   stop("For a Case/Control analysis, the Disease colunm is required. Please note 1=case and 0=control")
                 }
               }
             }


             if(is.null(covarPath) & is.null(pcaPath)){
               # Convert amino acid (0,1,2) to (-1,0,1) to run the regression
               data[,aa][data[,aa] == 0] <- -1
               data[,aa][data[,aa] == 1] <- 0
               data[,aa][data[,aa] == 2] <- 1
               reg_allelic <- summary(stats::glm(Disease ~ data[, aa] , data = data, family=stats::binomial(link="logit"), maxit = 100)) # Logistic regression with allelic model

               # Convert (-1,0,1) to (0,1) to run the regression in dominant model
               data[,aa][data[,aa] == 0] <- 1
               data[,aa][data[,aa] == -1] <- 0
               reg_dominant <- summary(stats::glm(Disease ~ data[, aa] , data = data, family=stats::binomial(link="logit"), maxit = 100)) # Logistic regression with allelic model

             }else{
               if(!is.null(covarPath) & is.null(pcaPath)){
                 # Convert amino acid (0,1,2) to (-1,0,1) to run the regression
                 data[,aa][data[,aa] == 0] <- -1
                 data[,aa][data[,aa] == 1] <- 0
                 data[,aa][data[,aa] == 2] <- 1
                 reg_allelic <- summary(stats::glm(Disease ~ data[, aa] + AGE.V0 + SEX + CMV.V0 + NBYTABAC, data = data, family=stats::binomial(link="logit"), maxit = 100)) # Logistic regression with allelic model

                 # Convert (-1,0,1) to (0,1) to run the regression in dominant model
                 data[,aa][data[,aa] == 0] <- 1
                 data[,aa][data[,aa] == -1] <- 0
                 reg_dominant <- summary(stats::glm(Disease ~ data[, aa] + AGE.V0 + SEX + CMV.V0 + NBYTABAC, data = data, family=stats::binomial(link="logit"), maxit = 100)) # Logistic regression with allelic model
               }else{
                 if(is.null(covarPath) & !is.null(pcaPath)){
                   # Convert amino acid (0,1,2) to (-1,0,1) to run the regression
                   data[,aa][data[,aa] == 0] <- -1
                   data[,aa][data[,aa] == 1] <- 0
                   data[,aa][data[,aa] == 2] <- 1
                   reg_allelic <- summary(stats::glm(Disease ~ data[, aa] + PC1 + PC2, data = data, family=stats::binomial(link="logit"), maxit = 100)) # Logistic regression with allelic model

                   # Convert (-1,0,1) to (0,1) to run the regression in dominant model
                   data[,aa][data[,aa] == 0] <- 1
                   data[,aa][data[,aa] == -1] <- 0
                   reg_dominant <- summary(stats::glm(Disease ~ data[, aa] + PC1 + PC2, data = data, family=stats::binomial(link="logit"), maxit = 100)) # Logistic regression with allelic model

                 }else{
                   # Convert amino acid (0,1,2) to (-1,0,1) to run the regression
                   data[,aa][data[,aa] == 0] <- -1
                   data[,aa][data[,aa] == 1] <- 0
                   data[,aa][data[,aa] == 2] <- 1
                   reg_allelic <- summary(stats::glm(Disease ~ data[, aa] + PC1 + PC2 + AGE.V0 + SEX + CMV.V0 + NBYTABAC, data = data, family=stats::binomial(link="logit"), maxit = 100)) # Logistic regression with allelic model

                   # Convert (-1,0,1) to (0,1) to run the regression in dominant model
                   data[,aa][data[,aa] == 0] <- 1
                   data[,aa][data[,aa] == -1] <- 0
                   reg_dominant <- summary(stats::glm(Disease ~ data[, aa] + PC1 + PC2 + AGE.V0 + SEX + CMV.V0 + NBYTABAC, data = data, family=stats::binomial(link="logit"), maxit = 100)) # Logistic regression with allelic model

                 }}}
             # add the regression result to the "res" vector
             res <- rbind(res, c(aa, freq, freq_control, freq_case,N ,reg_allelic$coefficients[2,4], exp(reg_allelic$coefficients[2,1]), exp(reg_allelic$coefficients[2,1]-1.96*reg_allelic$coefficients[2,2]), exp(reg_allelic$coefficients[2,1]+1.96*reg_allelic$coefficients[2,2]), reg_allelic$coefficients[2,1], reg_allelic$coefficients[2,2], reg_dominant$coefficients[2,4], exp(reg_dominant$coefficients[2,1]), exp(reg_dominant$coefficients[2,1]-1.96*reg_dominant$coefficients[2,2]), exp(reg_dominant$coefficients[2,1]+1.96*reg_dominant$coefficients[2,2]), reg_dominant$coefficients[2,1], reg_dominant$coefficients[2,2])) # See header of res for elements explanation

           }
           grDevices::dev.off()
           # Save results
           utils::write.table(res, paste(outputFolder , outputFile,"-AA-DiseaseAssoc-Analysis" ,".csv", sep=""), row.names=F, quote=F, col.names=F, sep = ",")
           if(file.exists( paste(outputFolder , outputFile,"-AA-DiseaseAssoc-Analysis" ,".csv", sep=""))){
             cat("Regression at Amino-Acid levels is Processed successfully \n")
             cat(paste("File", paste(outputFolder , outputFile , sep="") ,'is created. \n', sep = " "))
           }
           ## Import Files
           aaReg_result <- utils::read.table(paste(outputFolder , outputFile,"-AA-DiseaseAssoc-Analysis" ,".csv", sep="") , header= TRUE , sep = "," , dec = "." , na.strings = NA)
           seuil <- pvalueCorrection(AARegResult = paste(outputFolder , outputFile,"-AA-DiseaseAssoc-Analysis" ,".csv", sep=""), correction = "bonferroni")
           if(is.null(seuil)){stop("Seuil calculated with pvalueCorrection() is null")}

           # Ne garder que la position et AA
           aaReg_result$AA <- gsub("^[A-Z]{2}_[A-Z]_","",aaReg_result$HLA_Amino_Acid )
           aaReg_result$AA <- gsub("^[A-Z]{2}_[A-Z]{1,3}\\d_","",aaReg_result$AA)

           aaReg_result$Locus <- gsub("^[A-Z][A-Z]_", "", aaReg_result$HLA_Amino_Acid) # keep gene's name
           aaReg_result$Locus <- gsub("_\\d{1,}_[A-Z]", "",  aaReg_result$Locus)
           aaReg_result$Locus <- gsub("_[[:punct:]]\\d{1,}_[A-Z]", "",  aaReg_result$Locus)
           aaReg_result$Locus <- gsub("_\\d{1,}_[[:punct:]]", "",  aaReg_result$Locus)

           ################### Best Pvalue selection gathered in an unique file
           resultLoop <- as.data.frame(aaReg_result)
           resultLoop <- resultLoop[resultLoop$Frequency >= freq_threshold  & (resultLoop$P_allelic <= 0.05 | resultLoop$P_dominant <= 0.05) ,]
           if(nrow(resultLoop)!=0){
             suppressWarnings({
               resultLoop$Phenotype <- pheno
               resultLoop <-resultLoop[,c("Phenotype","Locus","AA", "Frequency", "Frequency_control", "Frequency_case", "N", "P_allelic", "OR_allelic", "lower_0.95_CI_allelic", "upper_0.95_CI_allelic", "Beta_allelic", "SE_allelic", "P_dominant", "OR_dominant", "lower_0.95_CI_dominant", "upper_0.95_CI_dominant", "Beta_dominant", "SE_dominant")]
               write.table(resultLoop, file=paste(outputFolder,"BestPvalue-AA-DiseaseAssoc.csv ",sep = "") , append = TRUE, row.names = FALSE ,  col.names = FALSE ,quote=F, sep = ",")
             })

           }
           ################### QQPLOT
           ## Lambda: genomic inflation factor, show the presence of bias
           # allelic Model
           observedR <- na.omit(sort(aaReg_result$P_allelic))
           lobsR <- -(log10(observedR))
           expectedR <- c(1:length(observedR))
           lexpR <- -(log10(expectedR / (length(expectedR)+1)))
           lambdaR<-summary(lm(lobsR~lexpR))

           # Dominant Model
           observedD <- sort(aaReg_result$P_dominant)
           lobsD <- -(log10(observedD))
           expectedD <- c(1:length(observedD))
           lexpD <- -(log10(expectedD / (length(expectedD)+1)))
           lambdaD<- summary(lm(lobsD~lexpD))

           ## QQ PLOT
           qq1 <- qq(aaReg_result$P_allelic ,main="Allelic model Disease-Amino-acids assoc" ,  sub= paste("Lambda = ", round(lambdaR$coefficients[2,1],2),sep = ""))
           qq2 <- qq(aaReg_result$P_dominant , main="Dominant model Disease-Amino-acids assoc" ,  sub= paste("Lambda = ", round(lambdaR$coefficients[2,1],2),sep = ""))
           capture.output(print(qq1),file='NUL')
           capture.output(print(qq2),file='NUL')

           ## Graphic for visualizing P value of AA
           aaSel <- aaReg_result[aaReg_result$Frequency >= freq_threshold & (aaReg_result$P_allelic<=threshold | aaReg_result$P_dominant<=threshold),]
           aSel <- aaSel[, c("AA","Locus","P_allelic","P_dominant")]
           if(nrow(aaSel)==0){ message(paste("None Amino-Acid to display. You can change your frequency and threshold p-value.\n Set at :\n Frequency : ", freq_threshold, "\n Pvalue threshold :",threshold,sep = ""))}
           plot5 <- ggplot( aaSel, aes(x=AA, y=  -(log10(P_allelic)) )) + geom_point() + facet_grid(.~Locus , scales = "free" , space = "free_x") + geom_hline(yintercept= -log10(0.05) , color = "blue") + geom_hline(yintercept= seuil , color = "red") + ggtitle(" Amino-Acid Analysis in allelic Model")
           plot6 <- ggplot( aaSel , aes(x=AA, y=  -(log10(P_dominant)) )) + geom_point() + facet_grid(.~Locus , scales = "free" , space = "free_x") + geom_hline(yintercept= -log10(0.05) , color = "blue") + geom_hline(yintercept= seuil , color = "red")+ ggtitle(" Amino-Acid Analysis in Dominant Model")

           suppressWarnings({
             print(plot5)
             print(plot6)
           })

           grDevices::dev.off()
         }, # end case = disease
         pheno ={

           # File that will contains best Pvalue for all phenotypes
           X <- c("Phenotype","Locus","AA", "Frequency", "Frequency_control", "Frequency_case", "N","Mean_pheno", "P_allelic", "Beta_allelic", "SE_allelic", "R2_allelic","P_dominant", "Beta_dominant", "SE_dominant","R2_dominant")
           write.table(t(X), file=paste(outputFolder,"BestPvalue-AA-PhenoAssoc.csv",sep = "") , row.names = FALSE ,   col.names = FALSE  ,quote=F, sep = ",")

           grDevices::pdf(file=paste(outputFolder , outputFile , "-AA-PhenotypeAssoc_regressionPlot.pdf" ,sep=""),10,10)
           for(pheno in phenotypes)
           {
             res <- c("HLA_Amino_Acid", "Frequency", "Frequency_control", "Frequency_case", "N","Mean_pheno", "P_allelic", "Beta_allelic", "SE_allelic", "R2_allelic","P_dominant", "Beta_dominant", "SE_dominant","R2_dominant")
             print(paste(pheno, " is currently being analysed.", sep = ""))
             ## Merge AA and the phenotype analyzed in this loop
             merge_data_pheno <- merge(data,phenoFile[,c("subjectID",pheno)],by = "subjectID")
             colnames(merge_data_pheno)[ncol(merge_data_pheno)] <- "trait"
             phenoGeno <- na.omit(merge_data_pheno)
             Mean_pheno <- mean(phenoGeno$trait)

             for(aa in list_aa) # Make a loop with all amino acids
             {
               # Caculate frequency of amino acid
               freq <- mean(stats::na.omit(phenoGeno[,aa]))/2

               n_ind_ctrl <- nrow(subset(phenoGeno, Disease == 0))
               n_ind_cas <-nrow(subset(phenoGeno, Disease == 1))

               if( n_ind_ctrl != 0 & n_ind_cas != 0 )
               { # Frequency are calculated for case and control
                 N <- 4/((1/nrow(subset(phenoGeno, Disease == 1)))+(1/nrow(subset(phenoGeno, Disease == 0))))
                 freq_control <- mean(stats::na.omit(subset(phenoGeno, Disease == 0)[,aa]))/2
                 freq_case <- mean(stats::na.omit(subset(phenoGeno, Disease == 1)[,aa]))/2

               }else{
                 if( n_ind_ctrl == 0 & n_ind_cas != 0){ # won't calculate frequency for ctrl
                   freq_control <- 0
                   freq_case <-mean(stats::na.omit(subset(phenoGeno, Disease == 1)[,aa]))/2
                   N <- nrow(subset(phenoGeno, Disease == 1))

                 }else{
                   if( n_ind_ctrl != 0 & n_ind_cas == 0){ # won't calculate frequency for case
                     # N <- 4/(1/nrow(subset(subdata, Disease == 0)))
                     freq_control <- mean(stats::na.omit(subset(phenoGeno, Disease == 0)[,aa]))/2
                     freq_case <- 0
                     N <- nrow(subset(phenoGeno, Disease == 0))

                   }else{ # Unknown case control Status. This info is not essential for the analysis
                     N <- 0
                     freq_control <- 0
                     freq_case <- 0
                   }}}

               # Convert amino acid (0,1,2) to (-1,0,1) to run the regression
               # phenoGeno[,aa][phenoGeno[,aa] == 0] <- -1
               # phenoGeno[,aa][phenoGeno[,aa] == 1] <- 0
               # phenoGeno[,aa][phenoGeno[,aa] == 2] <- 1

               # genoA <- paste("genoA_",aa,sep="")
               # phenoGeno[,genoA] <- ifelse(phenoGeno[,aa][phenoGeno[,aa] == 0],-1,
               # ifelse(phenoGeno[,aa][phenoGeno[,aa] == 1],0,
               # ifelse(phenoGeno[,aa][phenoGeno[,aa] == 2],1, NA)))
               # Convert (-1,0,1) to (0,1) to run the regression in dominant model
               # phenoGeno[,aa][phenoGeno[,aa] == 0] <- 1
               # phenoGeno[,aa][phenoGeno[,aa] == -1] <- 0
               # genoD <- paste("genoD_",aa,sep="")
               # phenoGeno[, genoD] <- ifelse(phenoGeno[,aa][phenoGeno[,aa] == 0],1,
               #                                ifelse(phenoGeno[,aa][phenoGeno[,aa] == -1],0,NA))
               # print(phenoGeno[,genoA])
               # print(phenoGeno[,genoD])
               if(is.null(covarPath) & is.null(pcaPath)){

                 reg_allelic <- summary(stats::lm(trait ~ phenoGeno[, aa] , data = phenoGeno)) # Linear regression with allelic model
                 reg_dominant <- summary(stats::lm(trait ~ phenoGeno[, aa] , data = phenoGeno)) # Linear regression with allelic model

               }else{
                 if(!is.null(covarPath) & is.null(pcaPath)){
                   # Convert amino acid (0,1,2) to (-1,0,1) to run the regression
                   phenoGeno[,aa][phenoGeno[,aa] == 0] <- -1
                   phenoGeno[,aa][phenoGeno[,aa] == 1] <- 0
                   phenoGeno[,aa][phenoGeno[,aa] == 2] <- 1
                   reg_allelic <- summary(stats::lm(trait ~ phenoGeno[, aa] + AGE.V0 + SEX + CMV.V0 + NBYTABAC, data = phenoGeno )) # Linear regression with allelic model

                   # Convert (-1,0,1) to (0,1) to run the regression in dominant model
                   phenoGeno[,aa][phenoGeno[,aa] == 0] <- 1
                   phenoGeno[,aa][phenoGeno[,aa] == -1] <- 0
                   phenoGeno[, aa] <-  as.factor(phenoGeno[, aa])
                   reg_dominant <- summary(stats::lm(trait ~ data[, aa] + AGE.V0 + SEX + CMV.V0 + NBYTABAC, data = phenoGeno )) # Linear regression with allelic model
                 }else{
                   if(is.null(covarPath) & !is.null(pcaPath)){
                     # Convert amino acid (0,1,2) to (-1,0,1) to run the regression
                     phenoGeno[,aa][phenoGeno[,aa] == 0] <- -1
                     phenoGeno[,aa][phenoGeno[,aa] == 1] <- 0
                     phenoGeno[,aa][phenoGeno[,aa] == 2] <- 1
                     reg_allelic <- summary(stats::lm(trait ~ phenoGeno[, aa] + PC1 + PC2, data = phenoGeno)) # Linear regression with allelic model

                     # Convert (-1,0,1) to (0,1) to run the regression in dominant model
                     phenoGeno[,aa][phenoGeno[,aa] == 0] <- 1
                     phenoGeno[,aa][phenoGeno[,aa] == -1] <- 0
                     reg_dominant <- summary(stats::lm(trait ~ phenoGeno[, aa] + PC1 + PC2, data = phenoGeno)) # Linear regression with allelic model

                   }else{
                     # Convert amino acid (0,1,2) to (-1,0,1) to run the regression
                     phenoGeno[,aa][phenoGeno[,aa] == 0] <- -1
                     phenoGeno[,aa][phenoGeno[,aa] == 1] <- 0
                     phenoGeno[,aa][phenoGeno[,aa] == 2] <- 1
                     reg_allelic <- summary(stats::lm(trait ~ phenoGeno[, aa] + PC1 + PC2 + AGE.V0 + SEX + CMV.V0 + NBYTABAC, data = phenoGeno)) # Linear regression with allelic model

                     # Convert (-1,0,1) to (0,1) to run the regression in dominant model
                     phenoGeno[,aa][phenoGeno[,aa] == 0] <- 1
                     phenoGeno[,aa][phenoGeno[,aa] == -1] <- 0
                     reg_dominant <- summary(stats::lm(trait ~ phenoGeno[, aa] + PC1 + PC2 + AGE.V0 + SEX + CMV.V0 + NBYTABAC, data = phenoGeno )) # Linear regression with allelic model

                   }}}

               P_allelic <-  reg_allelic$coefficients[2,4]
               Beta_allelic <- round(reg_allelic$coefficients[2,1],3)
               SE_allelic <- round(reg_allelic$coefficients[2,2],3)
               Rsquare_allelic <-round(reg_allelic$adj.r.squared, 3)

               P_dominant <- reg_dominant$coefficients[2,4]
               Beta_dominant <- round(reg_dominant$coefficients[2,1],3)
               SE_dominant <- round(reg_dominant$coefficients[2,2],3)
               Rsquare_dominant <-round(reg_dominant$adj.r.squared, 3)

               # add the regression result to the "res" vector
               res <- rbind(res, c(aa,  freq, freq_control, freq_case, N, Mean_pheno, P_allelic, Beta_allelic, SE_allelic, Rsquare_allelic, P_dominant, Beta_dominant, SE_dominant, Rsquare_dominant)) # See header of res for elements explanation

             }

             # Save results
             utils::write.table(res, paste(outputFolder , outputFile,"-AA-", pheno,"-Analysis" ,".csv", sep=""), row.names=F, quote=F, col.names=F, sep = ",")
             if(file.exists( paste(outputFolder , outputFile,"-AApheno-Analysis" ,".csv", sep=""))){
               cat(paste("Regression at amino-acid levels with : ", pheno , " expression is processed successfully \n" , sep = ""))
               cat(paste("File", paste(outputFolder , outputFile , sep="") ,'is created. \n', sep = " "))
             }

             ## Import Files
             aaReg_result <- utils::read.table( paste(outputFolder , outputFile,"-AA-", pheno,"-Analysis" ,".csv", sep="") , header= TRUE , sep = "," , dec = "." , na.strings = NA)
             seuil <- pvalueCorrection(AARegResult =  paste(outputFolder , outputFile,"-AA-", pheno,"-Analysis" ,".csv", sep=""), correction = "bonferroni")
             if(is.null(seuil)){stop("Bonferroni threshold is null ! Line 1333 ")}
             # Ne garder que la position et AA
             aaReg_result$AA <- gsub("^[A-Z]{2}_[A-Z]_","",aaReg_result$HLA_Amino_Acid )
             aaReg_result$AA <- gsub("^[A-Z]{2}_[A-Z]{1,3}\\d_","",aaReg_result$AA)
             aaReg_result$Locus <- gsub("^[A-Z][A-Z]_", "", aaReg_result$HLA_Amino_Acid) # keep gene's name
             aaReg_result$Locus <- gsub("_\\d{1,}_[A-Z]", "",  aaReg_result$Locus)
             aaReg_result$Locus <- gsub("_[[:punct:]]\\d{1,}_[A-Z]", "",  aaReg_result$Locus)
             aaReg_result$Locus <- gsub("_\\d{1,}_[[:punct:]]", "",  aaReg_result$Locus)
             aaReg_result$Locus <- as.factor(aaReg_result$Locus)

             ################### Best Pvalue selection gathered in an unique file
             resultLoop <- as.data.frame(aaReg_result)
             resultLoop <- resultLoop[resultLoop$Frequency >= freq_threshold  & (resultLoop$P_allelic <= 0.05 | resultLoop$P_dominant <= 0.05) ,]
             if(nrow(resultLoop)!=0){
               suppressWarnings({
                 resultLoop$Phenotype <- pheno
                 resultLoop <-resultLoop[,c("Phenotype","Locus","AA","Frequency", "Frequency_control", "Frequency_case", "N","Mean_pheno", "P_allelic", "Beta_allelic", "SE_allelic", "R2_allelic","P_dominant", "Beta_dominant", "SE_dominant","R2_dominant")]
                 write.table(resultLoop, file=paste(outputFolder,"BestPvalue-AA-PhenoAssoc.csv",sep = ""), append = TRUE , row.names = FALSE ,   col.names = FALSE  ,quote=F, sep = ",")
               })
             }
             ################### QQPLOT
             ## Lambda: genomic inflation factor, show the presence of bias
             # allelic Model
             observedR <- na.omit(sort(aaReg_result$P_allelic))
             lobsR <- -(log10(observedR))
             expectedR <- c(1:length(observedR))
             lexpR <- -(log10(expectedR / (length(expectedR)+1)))
             lambdaR<-summary(lm(lobsR~lexpR))

             # Dominant Model
             observedD <- sort(aaReg_result$P_dominant)
             lobsD <- -(log10(observedD))
             expectedD <- c(1:length(observedD))
             lexpD <- -(log10(expectedD / (length(expectedD)+1)))
             lambdaD<- summary(lm(lobsD~lexpD))

             ## QQ PLOT
             qq1 <- qq(aaReg_result$P_allelic ,main=paste(" AA vs ",pheno," (allelic model)",sep = "") ,  sub= paste("Lambda = ", round(lambdaR$coefficients[2,1],2),sep = ""))
             qq2 <- qq(aaReg_result$P_dominant , main=paste(" AA vs ",pheno," (dominant model)",sep = "") ,  sub= paste("Lambda = ", round(lambdaR$coefficients[2,1],2),sep = ""))
             capture.output(print(qq1),file='NUL')
             capture.output(print(qq2),file='NUL')

             ## Graphic for visualizing P value of AA
             aaSel <- aaReg_result[aaReg_result$Frequency >= freq_threshold & (aaReg_result$P_allelic<=threshold | aaReg_result$P_dominant<=threshold),]
             aSel <- aaSel[, c("AA","Locus","P_allelic","P_dominant")]
             if(nrow(aaSel)==0){ message(paste("None Amino-Acid to display. You can change your frequency and threshold p-value.\n Set at :\n Frequency : ", freq_threshold, "\n Pvalue threshold :",threshold,sep = ""))}

             ######### FONCTIONNE MAIS ILLISIBLE AXE X
             ## Adapt x axis legend with the number of alleles to plot
             n <- nrow(aaSel)
             if(n<50){
               size_txt_x_axis <- 10
               size_point <- 3
             }else{
               if(n>50 & n<95){
                 size_txt_x_axis <- 8
                 size_point <- 3
               }else{
                 if(n>95 & n<130){
                   size_txt_x_axis <- 7
                   size_point <- 2
                 }else{
                   if(n>130){
                     size_txt_x_axis <- 6
                     size_point <- 2
                   }}}}

             panelNames <- list( # change ggplot panel's names
               "A" = "HLA-A",
               "B" = "HLA-B ",
               "C" = "HLA-C",
               "DRB1" = "HLA-DRB1 ",
               "DQB1" = "HLA-DQB1"
             )
             variable_labeller <- function(variable,value){
               return(panelNames[value])
             }

             ## allelic model
             aaA <-  aaSel[aaSel$Locus=="A",]
             plot1 <- ggplot( aaA, aes(x=AA, y=  -(log10(P_allelic)) )) + geom_point(na.rm = TRUE, size=size_point, color=ifelse((-log10(aaA$P_allelic)) > seuil, "red","black")) + facet_grid( .~ Locus  ) +geom_hline(yintercept= -log10(0.05) , color = "blue") + geom_hline(yintercept= seuil , color = "red")+ xlab("position and amino-acid")  + theme(axis.title.x = element_text(color="#2b79be", size=11), axis.text.x = element_text(size = size_txt_x_axis, angle=90), axis.title.y = element_text(color="#2b79be", size=11),strip.text = element_text(face="bold",size = 10), strip.background = element_rect(fill="lightblue"))
             aaB <-  aaSel[aaSel$Locus=="B",]
             plot2 <- ggplot( aaB, aes(x=AA, y=  -(log10(P_allelic)) )) + geom_point(na.rm = TRUE, size=size_point, color=ifelse((-log10(aaB$P_allelic)) > seuil, "red","black"))  + facet_grid( .~ Locus  ) + geom_hline(yintercept= -log10(0.05) , color = "blue") + geom_hline(yintercept= seuil , color = "red") + xlab("position and amino-acid")  + theme(axis.title.x = element_text(color="#2b79be", size=11), axis.text.x = element_text(size = size_txt_x_axis-1, angle=90), axis.title.y = element_text(color="#2b79be", size=11),strip.text = element_text(face="bold",size = 10), strip.background = element_rect(fill="lightblue"))
             aaC <-  aaSel[aaSel$Locus=="C",]
             plot3 <- ggplot( aaC, aes(x=AA, y=  -(log10(P_allelic)) )) + geom_point(na.rm = TRUE, size=size_point, color=ifelse((-log10(aaC$P_allelic)) > seuil, "red","black")) + facet_grid( .~ Locus  ) + geom_hline(yintercept= -log10(0.05) , color = "blue") + geom_hline(yintercept= seuil , color = "red") + xlab("position and amino-acid")  + theme(axis.title.x = element_text(color="#2b79be", size=11), axis.text.x = element_text(size = size_txt_x_axis-1, angle=90), axis.title.y = element_text(color="#2b79be", size=11),strip.text = element_text(face="bold",size = 10), strip.background = element_rect(fill="lightblue"))
             aaDR <-  aaSel[aaSel$Locus=="DRB1",]
             plot4 <- ggplot( aaDR, aes(x=AA, y=  -(log10(P_allelic)) )) + geom_point(na.rm = TRUE, size=size_point, color=ifelse((-log10(aaDR$P_allelic)) > seuil, "red","black")) + facet_grid( .~ Locus  ) + geom_hline(yintercept= -log10(0.05) , color = "blue") + geom_hline(yintercept= seuil , color = "red") + xlab("position and amino-acid")  + theme(axis.title.x = element_text(color="#2b79be", size=11), axis.text.x = element_text(size = size_txt_x_axis, angle=90), axis.title.y = element_text(color="#2b79be", size=11),strip.text = element_text(face="bold",size = 10), strip.background = element_rect(fill="lightblue"))
             aaDQ <-  aaSel[aaSel$Locus=="DQB1",]
             plot5 <- ggplot( aaDQ, aes(x=AA, y=  -(log10(P_allelic)) )) + geom_point(na.rm = TRUE, size=size_point, color=ifelse((-log10(aaDQ$P_allelic)) > seuil, "red","black")) + facet_grid( .~ Locus  ) + geom_hline(yintercept= -log10(0.05) , color = "blue") + geom_hline(yintercept= seuil , color = "red")  + xlab("position and amino-acid")  + theme(axis.title.x = element_text(color="#2b79be", size=11), axis.text.x = element_text(size = size_txt_x_axis, angle=90), axis.title.y = element_text(color="#2b79be", size=11),strip.text = element_text(face="bold",size = 10), strip.background = element_rect(fill="lightblue"))
             plotclass1 <- ggdraw() +
               draw_label(paste(" AA vs ",pheno," (allelic model)",sep = ""), fontface='bold',x = 0.5, y = .97, hjust = 0.5, vjust = 0.5,color="#ec6649",size=12) +
               draw_plot(plot1, 0, .5, 1, .45) +
               draw_plot(plot2, 0, 0, .5, .5) +
               draw_plot(plot3, .5, 0, .5, .5)
             plotclass2 <- ggdraw() +
               draw_label(paste(" AA vs ",pheno," (allelic model)",sep = ""), fontface='bold',x = 0.5, y =  .97, hjust = 0.5, vjust = 0.5,color="#ec6649",size=12) +
               draw_plot(plot4, 0, .5, 1, .45) +
               draw_plot(plot5, 0, 0, 1, .5)
             suppressWarnings({
               print(plotclass1)
               print(plotclass2)
             })

             ## dominant model
             aaA <-  aaSel[aaSel$Locus=="A",]
             plot1 <- ggplot( aaA, aes(x=AA, y=  -(log10(P_dominant)) )) + geom_point(na.rm = TRUE, size=size_point, color=ifelse((-log10(aaA$P_dominant)) > seuil, "red","black")) + facet_grid( .~ Locus  ) + geom_hline(yintercept= -log10(0.05) , color = "blue") + geom_hline(yintercept= seuil , color = "red")+ xlab("position and amino-acid")  + theme(axis.title.x = element_text(color="#2b79be", size=11), axis.text.x = element_text(size = size_txt_x_axis, angle=90), axis.title.y = element_text(color="#2b79be", size=11),strip.text = element_text(face="bold",size = 10), strip.background = element_rect(fill="lightblue"))
             aaB <-  aaSel[aaSel$Locus=="B",]
             plot2 <- ggplot( aaB, aes(x=AA, y=  -(log10(P_dominant)) )) + geom_point(na.rm = TRUE, size=size_point, color=ifelse((-log10(aaB$P_dominant)) > seuil, "red","black"))  + facet_grid( .~ Locus  ) + geom_hline(yintercept= -log10(0.05) , color = "blue") + geom_hline(yintercept= seuil , color = "red") + xlab("position and amino-acid")  + theme(axis.title.x = element_text(color="#2b79be", size=11), axis.text.x = element_text(size = size_txt_x_axis-1, angle=90), axis.title.y = element_text(color="#2b79be", size=11),strip.text = element_text(face="bold",size = 10), strip.background = element_rect(fill="lightblue"))
             aaC <-  aaSel[aaSel$Locus=="C",]
             plot3 <- ggplot( aaC, aes(x=AA, y=  -(log10(P_dominant)) )) + geom_point(na.rm = TRUE, size=size_point, color=ifelse((-log10(aaC$P_dominant)) > seuil, "red","black")) + facet_grid( .~ Locus  ) + geom_hline(yintercept= -log10(0.05) , color = "blue") + geom_hline(yintercept= seuil , color = "red") + xlab("position and amino-acid")  + theme(axis.title.x = element_text(color="#2b79be", size=11), axis.text.x = element_text(size = size_txt_x_axis-1, angle=90), axis.title.y = element_text(color="#2b79be", size=11),strip.text = element_text(face="bold",size = 10), strip.background = element_rect(fill="lightblue"))
             aaDR <-  aaSel[aaSel$Locus=="DRB1",]
             plot4 <- ggplot( aaDR, aes(x=AA, y=  -(log10(P_dominant)) )) + geom_point(na.rm = TRUE, size=size_point, color=ifelse((-log10(aaDR$P_dominant)) > seuil, "red","black")) + facet_grid( .~ Locus  ) + geom_hline(yintercept= -log10(0.05) , color = "blue") + geom_hline(yintercept= seuil , color = "red") + xlab("position and amino-acid")  + theme(axis.title.x = element_text(color="#2b79be", size=11), axis.text.x = element_text(size = size_txt_x_axis, angle=90), axis.title.y = element_text(color="#2b79be", size=11),strip.text = element_text(face="bold",size = 10), strip.background = element_rect(fill="lightblue"))
             aaDQ <-  aaSel[aaSel$Locus=="DQB1",]
             plot5 <- ggplot( aaDQ, aes(x=AA, y=  -(log10(P_dominant)) )) + geom_point(na.rm = TRUE, size=size_point, color=ifelse((-log10(aaDQ$P_dominant)) > seuil, "red","black")) + facet_grid( .~ Locus  ) + geom_hline(yintercept= -log10(0.05) , color = "blue") + geom_hline(yintercept= seuil , color = "red")  + xlab("position and amino-acid")  + theme(axis.title.x = element_text(color="#2b79be", size=11), axis.text.x = element_text(size = size_txt_x_axis, angle=90), axis.title.y = element_text(color="#2b79be", size=11),strip.text = element_text(face="bold",size = 10), strip.background = element_rect(fill="lightblue"))
             # pour changer en rouge le texte de l'axe x comme les points, mais ne foncionne pas
             #theme(axis.title.x = element_text(color="#2b79be", size=11), axis.text.x = element_text(size = size_txt_x_axis, angle=90, color=ifelse((-log10(aaDQ$P_dominant)) > seuil, "red","black"))
             plotclass1 <- ggdraw() +
               draw_label(paste(" AA vs ",pheno," (dominant model)",sep = ""), fontface='bold',x = 0.5, y = .97, hjust = 0.5, vjust = 0.5,color="#ec6649",size=12) +
               draw_plot(plot1, 0, .5, 1, .45) +
               draw_plot(plot2, 0, 0, .5, .5) +
               draw_plot(plot3, .5, 0, .5, .5)
             plotclass2 <- ggdraw() +
               draw_label(paste(" AA vs ",pheno," (dominant model)",sep = ""), fontface='bold',x = 0.5, y =  .97, hjust = 0.5, vjust = 0.5,color="#ec6649",size=12) +
               draw_plot(plot4, 0, .5, 1, .45) +
               draw_plot(plot5, 0, 0, 1, .5)
             suppressWarnings({
               print(plotclass1)
               print(plotclass2)
             })

           }})# end case = pheno
  grDevices::dev.off()
}




############################################################################
####  KIR ligands association analysis with trait(s) (-/+ Covariates)   ####
############################################################################

#' Analyze association of HLA KIR ligands with many variate phenotypes (like case/control or quantitative trait)
#' Take as input:
#' (1 required) KIR ligands  :  subjectID haplotype1 freq1  haplotype2 freq2 postprob ("," separator between fields)
#' (2 optional) differents quantitative phenotypes : subjectID traitName1 traitName2 ... traitName(n)
#' (3 required) Disease information : 0= control 1=case (needed for frequencies and/or type=disease analysis)
#' (4 required) the 2 first principals components of the PCA
#' (5 optional) covariable to correct regressions
#'
#' @param datapath HLA File from Easy-HLA
#' @param phenoPath phenotypes file with different trait names in columns
#' @param statut file conaining subject ID and Disease statut
#' @param pcaPath pca result for PC1 and PC2
#' @param covarPath must contains SubjectID followed by covariables
#' @param outputFolder Folder's pathways where to save the result file
#' @param outputFile Name of the result file
#' @param type etiher "disease" for analysis in case/control or "pheno" for analysis with an quantitative phenotype
#' @import ggplot2
#' @importFrom qqman qq
#' @importFrom utils read.csv write.csv capture.output
#' @return Regression result in a file with columns: Amino_Acid	Frequency	Frequency_control	Frequency_case	N	P_allelic	OR_allelic	lower_95%_CI_allelic	upper_95%_CI_allelic	Beta_allelic	SE_allelic	P_dominant	OR_dominant	lower_95%_CI_dominant	upper_95%_CI_dominant	Beta_dominant	SE_dominant
#' @export


hlaKIRAnalysis <- function(datapath, phenoPath=NULL, statut, pcaPath=NULL, covarPath=NULL, outputFolder ,outputFile, type="pheno")
{

  ## Check parameters
  if(is.null(datapath)){stop("datapath must be provided")}
  if(!file.exists(datapath)){stop("datapath doesn't exist")}
  if(is.null(statut)){stop("statut must be provided of both analysis type")}

  ## Import data
  # Import hlaKIR file from easy-HLA code_patient	A-B-C-DR-DQ-1	freq1	A-B-C-DR-DQ-2	freq2	PostP	resolution 3DL2 Bw4 Bw4I Bw4T Bw6 C1 C2
  kirLigand <- utils::read.csv(datapath, stringsAsFactors = F,sep=",", header = TRUE) # Import kir ligand acid data
  data <- kirLigand[, apply(kirLigand, 2, function(x) length(unique(stats::na.omit(x)))) > 1] # Keep kir ligand colums with information
  if(ncol(data) < 7){stop("datapath must contains more than 7 colunms like the file provided by easy-HLA :code_patient,	A-B-C-DR-DQ-1,	freq1,	A-B-C-DR-DQ-2,	freq2,	PostP,	resolution, followed by kir ligand. For example (3DL2, Bw4, Bw4I, Bw4T, Bw6, C1, C2)")}

  colnames(data)[1] <- "subjectID"
  list <- colnames(data)
  list_ligand <- list[7:length(list)]

  ## Merge
  # Optional parameters
  if(!is.null(pcaPath)){
    if(!file.exists(pcaPath)){stop("pcaPath doesn't exist")}
    PCs <- stats::na.omit(utils::read.table(pcaPath ,  header = TRUE, sep = "," , dec = "."))
    if(ncol(PCs)!=3){stop("pcaPath should ocntain only 3 columns: subjectID, PC1 and PC2")}
    colnames(PCs)[1]<-"subjectID"
    colnames(PCs)[2]<-"PC1"
    colnames(PCs)[3]<-"PC2"
    data <- merge(data, PCs[,1:3], by ="subjectID")
  }

  if(!is.null(covarPath)){
    if(!file.exists(covarPath)){stop("covarPath doesn't exist")}
    covarCorrection <- stats::na.omit(utils::read.table(covarPath ,  header = TRUE, sep = "," , dec = "."))
    colnames(covarCorrection)[1] <- "subjectID"
    covarNames <- stats::na.omit(names(covarCorrection))
    covarNames <- covarNames[covarNames!="subjectID"]
    covariables <- noquote(paste(covarNames, collapse = " + ")) #add as.character ??
    data <- merge(data, covarCorrection, by ="subjectID")
  }

  if(type=="disease")
  {
    statut <- stats::na.omit(utils::read.table(statut , header = TRUE , sep = "," , dec = "."))
    if(ncol(statut)!= 2){stop("statut must contains 2 colunms : subjectID and Affection statut")}
    names(statut) <- c("subjectID","Disease")
    data <- merge(data, statut, by ="subjectID")

  }else{
    if(type=="pheno")
    {
      if(is.null(phenoPath)){stop('phenoPath must be provided when type="pheno"')}
      if(!file.exists(phenoPath)){stop("phenoPath file doesn't exist")}
      phenoFile <-  as.data.frame(utils::read.table(phenoPath , stringsAsFactors = F, header = T, sep=','))
      colnames(phenoFile)[1] <- "subjectID"
      # List of phenotypes and covarNames
      phenotypes <- stats::na.omit(unique(c(names(phenoFile)))) # Make a list of of phenotypes in the file with colnames
      phenotypes <- phenotypes[phenotypes != "subjectID"]

      statut <- stats::na.omit(utils::read.table(statut , header = TRUE , sep = "," , dec = "."))

      if(ncol(statut)!=2){stop("statut mut contains 2 colunms : subjectID and Affection statut")}
      names(statut) <- c("subjectID","Disease")
      data <- merge(data, statut, by ="subjectID")

    }
  }
  switch(type,
         disease={
           res <- c("HLA_KIR", "Frequency", "Frequency_control", "Frequency_case", "N", "P_allelic", "OR_allelic",  "lower_0.95_CI_allelic", "upper_0.95_CI_allelic", "Beta_allelic", "SE_allelic", "P_dominant", "OR_dominant","lower_0.95_CI_dominant", "upper_0.95_CI_dominant", "Beta_dominant", "SE_dominant")
           freq_threshold <- readline(prompt = "\n \n Enter a frequency threshold of KIR-ligand to displays in manhattan plot\n Frequency: " )
           threshold <- readline(prompt = "\n \n In order to display most interesting KIR-ligand and in clearly way. \n Define a p value treshold under which one haplotypes are not displayed in manhattan plot \n P-value:  ")
           X <- c("HLA_KIR", "Frequency", "Frequency_control", "Frequency_case", "N", "P_allelic", "OR_allelic", "lower_0.95_CI_allelic", "upper_0.95_CI_allelic", "Beta_allelic", "SE_allelic", "P_dominant", "OR_dominant", "lower_0.95_CI_dominant", "upper_0.95_CI_dominant", "Beta_dominant", "SE_dominant")
           write.table(t(X), file=paste(outputFolder,"BestPvalue-kir-DisAssoc.csv",sep = "") , row.names = FALSE ,   col.names = FALSE  ,quote=F, sep = ",")

           pdf(file=paste(outputFolder , outputFile , "-KIR-DiseaseAssocPlot.pdf" ,sep=""),10,10)

           ## Merge kir and the phenotype analyzed in this loop
           phenoGeno <- na.omit(data)

           for(kir in list_ligand) # Make a loop with all kir ligands
           {
             # Caculate frequency of the kir ligand
             freq <- mean(stats::na.omit(phenoGeno[, kir]))/2

             n_ind_ctrl <- nrow(subset(phenoGeno, Disease == 0))
             n_ind_cas <-nrow(subset(phenoGeno, Disease == 1))

             if( n_ind_ctrl != 0 & n_ind_cas != 0 )
             { # Frequency are calculated for case and control
               N <- 4/((1/nrow(subset(phenoGeno, Disease == 1)))+(1/nrow(subset(phenoGeno, Disease == 0))))
               freq_control <- mean(stats::na.omit(subset(phenoGeno, Disease == 0)[,kir]))/2
               freq_case <- mean(stats::na.omit(subset(phenoGeno, Disease == 1)[,kir]))/2

             }else{
               if( n_ind_ctrl == 0 & n_ind_cas != 0){ # won't calculate frequency for ctrl
                 freq_control <- 0
                 freq_case <-mean(stats::na.omit(subset(phenoGeno, Disease == 1)[,kir]))/2
                 N <- nrow(subset(phenoGeno, Disease == 1))

               }else{
                 if( n_ind_ctrl != 0 & n_ind_cas == 0){ # won't calculate frequency for case
                   # N <- 4/(1/nrow(subset(subdata, Disease == 0)))
                   freq_control <- mean(stats::na.omit(subset(phenoGeno, Disease == 0)[,kir]))/2
                   freq_case <- 0
                   N <- nrow(subset(phenoGeno, Disease == 0))

                 }else{ # Unknown case control Status
                   N <- 0
                   freq_control <- 0
                   freq_case <- 0
                 }}}

             if(is.null(covarPath) & is.null(pcaPath)){
               # Convert amino acid (0,1,2) to (-1,0,1) to run the regression
               phenoGeno[,kir][phenoGeno[,kir] == 0] <- -1
               phenoGeno[,kir][phenoGeno[,kir] == 1] <- 0
               phenoGeno[,kir][phenoGeno[,kir] == 2] <- 1
               reg_allelic <- summary(stats::glm(Disease ~ phenoGeno[, kir] , data = phenoGeno, family=stats::binomial(link="logit"), maxit = 100)) # Logistic regression with allelic model

               # Convert (-1,0,1) to (0,1) to run the regression in dominant model
               phenoGeno[,kir][phenoGeno[,kir] == 0] <- 1
               phenoGeno[,kir][phenoGeno[,kir] == -1] <- 0
               reg_dominant <- summary(stats::glm(Disease ~ phenoGeno[, kir] , data = phenoGeno, family=stats::binomial(link="logit"), maxit = 100)) # Logistic regression with allelic model

             }else{
               if(!is.null(covarPath) & is.null(pcaPath)){
                 # Convert amino acid (0,1,2) to (-1,0,1) to run the regression
                 phenoGeno[,kir][phenoGeno[,kir] == 0] <- -1
                 phenoGeno[,kir][phenoGeno[,kir] == 1] <- 0
                 phenoGeno[,kir][phenoGeno[,kir] == 2] <- 1
                 reg_allelic <- summary(stats::glm(Disease ~ phenoGeno[, kir] + AGE.V0 + SEX + CMV.V0 + NBYTABAC, data = phenoGeno, family=stats::binomial(link="logit"), maxit = 100 )) # Logistic regression with allelic model

                 # Convert (-1,0,1) to (0,1) to run the regression in dominant model
                 phenoGeno[,kir][phenoGeno[,kir] == 0] <- 1
                 phenoGeno[,kir][phenoGeno[,kir] == -1] <- 0
                 phenoGeno[, kir] <-  as.factor(phenoGeno[, kir])
                 reg_dominant <- summary(stats::glm(Disease ~ data[, kir] + AGE.V0 + SEX + CMV.V0 + NBYTABAC, data = phenoGeno , family=stats::binomial(link="logit"), maxit = 100)) # Logistic regression with allelic model
               }else{
                 if(is.null(covarPath) & !is.null(pcaPath)){
                   # Convert amino acid (0,1,2) to (-1,0,1) to run the regression
                   phenoGeno[,kir][phenoGeno[,kir] == 0] <- -1
                   phenoGeno[,kir][phenoGeno[,kir] == 1] <- 0
                   phenoGeno[,kir][phenoGeno[,kir] == 2] <- 1
                   reg_allelic <- summary(stats::glm(Disease ~ phenoGeno[, kir] + PC1 + PC2, data = phenoGeno, family=stats::binomial(link="logit"), maxit = 100)) # Logistic regression with allelic model

                   # Convert (-1,0,1) to (0,1) to run the regression in dominant model
                   phenoGeno[,kir][phenoGeno[,kir] == 0] <- 1
                   phenoGeno[,kir][phenoGeno[,kir] == -1] <- 0
                   reg_dominant <- summary(stats::glm(Disease ~ phenoGeno[, kir] + PC1 + PC2, data = phenoGeno, family=stats::binomial(link="logit"), maxit = 100)) # Logistic regression with allelic model

                 }else{
                   # Convert amino acid (0,1,2) to (-1,0,1) to run the regression
                   phenoGeno[,kir][phenoGeno[,kir] == 0] <- -1
                   phenoGeno[,kir][phenoGeno[,kir] == 1] <- 0
                   phenoGeno[,kir][phenoGeno[,kir] == 2] <- 1
                   reg_allelic <- summary(stats::glm(Disease ~ phenoGeno[, kir] + PC1 + PC2 + AGE.V0 + SEX + CMV.V0 + NBYTABAC, data = phenoGeno, family=stats::binomial(link="logit"), maxit = 100)) # Logistic regression with allelic model

                   # Convert (-1,0,1) to (0,1) to run the regression in dominant model
                   phenoGeno[,kir][phenoGeno[,kir] == 0] <- 1
                   phenoGeno[,kir][phenoGeno[,kir] == -1] <- 0
                   reg_dominant <- summary(stats::glm(Disease ~ phenoGeno[, kir] + PC1 + PC2 + AGE.V0 + SEX + CMV.V0 + NBYTABAC, data = phenoGeno )) # Logistic regression with allelic model

                 }}}

             P_allelic <-  reg_allelic$coefficients[2,4]
             OR_allelic <-  round(exp(reg_allelic$coefficients[2,1]),3)
             lower_95_CI_allelic <- round(exp(reg_allelic$coefficients[2,1]-1.96*reg_allelic$coefficients[2,2]),3)
             upper_95_CI_allelic <- round(exp(reg_allelic$coefficients[2,1]+1.96*reg_allelic$coefficients[2,2]),3)
             Beta_allelic <- round(reg_allelic$coefficients[2,1],3)
             SE_allelic <- round(reg_allelic$coefficients[2,2],3)
             P_dominant <- reg_dominant$coefficients[2,4]
             OR_dominant <- round(exp(reg_dominant$coefficients[2,1]),3)
             lower_95_CI_dominant <- round(exp(reg_dominant$coefficients[2,1]-1.96*reg_dominant$coefficients[2,2]),3)
             upper_95_CI_dominant <- round(exp(reg_dominant$coefficients[2,1]+1.96*reg_dominant$coefficients[2,2]),3)
             Beta_dominant <- round(reg_dominant$coefficients[2,1],3)
             SE_dominant <- round(reg_dominant$coefficients[2,2],3)
             options(scipen = 999)
             # add the regression result to the "res" vector
             res <- rbind(res, c(kir, freq, freq_control, freq_case, N ,P_allelic,OR_allelic,lower_95_CI_allelic,upper_95_CI_allelic,Beta_allelic,SE_allelic,P_dominant,OR_dominant,lower_95_CI_dominant,upper_95_CI_dominant,Beta_dominant,SE_dominant))
           }

           # Save results
           utils::write.table(res, paste(outputFolder , outputFile,"-kir-Disease-Analysis" ,".csv", sep=""), row.names=F, quote=F, col.names=F, sep = ",")
           if(file.exists( paste(outputFolder , outputFile,"-kir-Disease-Analysis" ,".csv", sep=""))){
             cat("Regression of KIR ligands is Processed successfully \n")
             cat(paste("File", paste(outputFolder , outputFile , sep="") ,'is created. \n', sep = " "))
           }

           ## Import Files
           kirReg_result <- utils::read.table( paste(outputFolder , outputFile,"-kir-Disease-Analysis" ,".csv", sep="") , header= TRUE , sep = "," , dec = "." , na.strings = NA)
           seuil <- pvalueCorrection(KIR =  paste(outputFolder , outputFile,"-kir-Disease-Analysis" ,".csv", sep=""), correction = "bonferroni")
           if(is.null(seuil)){stop("Bonferroni threshold is null ! ")}

           ################### Best Pvalue selection gathered in an unique file
           resultLoop <- as.data.frame(kirReg_result)
           resultLoop <- resultLoop[resultLoop$Frequency >= freq_threshold  & (resultLoop$P_allelic <= 0.05 | resultLoop$P_dominant <= 0.05) ,]
           # if(resultLoop!=factor(0)){ # if ==factor(0) error is displayed "argument is of length zero"
           if(!is.null(resultLoop) & nrow(resultLoop)!=0){
             suppressWarnings({
               resultLoop$Phenotype <- pheno
               resultLoop <-resultLoop[,c("HLA_KIR", "Frequency", "Frequency_control", "Frequency_case", "N", "P_allelic", "OR_allelic", "lower_0.95_CI_allelic", "upper_0.95_CI_allelic", "Beta_allelic", "SE_allelic", "P_dominant", "OR_dominant", "lower_0.95_CI_dominant", "upper_0.95_CI_dominant", "Beta_dominant", "SE_dominant")]
               write.table(resultLoop, file=paste(outputFolder,"BestPvalue-kir-DisAssoc.csv",sep = ""), append = TRUE , row.names = FALSE ,   col.names = FALSE  ,quote=F, sep = ",")
             })
           }
           # }
           ################### QQPLOT
           ## Lambda: genomic inflation factor, show the presence of bias
           # allelic Model
           observedR <- na.omit(sort(kirReg_result$P_allelic))
           lobsR <- -(log10(observedR))
           expectedR <- c(1:length(observedR))
           lexpR <- -(log10(expectedR / (length(expectedR)+1)))
           lambdaR<-summary(lm(lobsR~lexpR))

           # Dominant Model
           observedD <- sort(kirReg_result$P_dominant)
           lobsD <- -(log10(observedD))
           expectedD <- c(1:length(observedD))
           lexpD <- -(log10(expectedD / (length(expectedD)+1)))
           lambdaD<- summary(lm(lobsD~lexpD))

           ## QQ PLOT
           qq1 <- qq(kirReg_result$P_allelic ,main=paste(" kir vs ",pheno," (allelic model)",sep = "") ,  sub= paste("Lambda = ", round(lambdaR$coefficients[2,1],2),sep = ""))
           qq2 <- qq(kirReg_result$P_dominant , main=paste(" kir vs ",pheno," (dominant model)",sep = "") ,  sub= paste("Lambda = ", round(lambdaR$coefficients[2,1],2),sep = ""))
           capture.output(print(qq1),file='NUL')
           capture.output(print(qq2),file='NUL')

           ## Graphic for visualizing P value of kir
           kirSel <- kirReg_result[kirReg_result$Frequency >= freq_threshold & (kirReg_result$P_allelic<=threshold | kirReg_result$P_dominant<=threshold),]
           kirSel <- kirSel[, c("HLA_KIR","P_allelic","P_dominant")]
           if(nrow(kirSel)==0){ message(paste("None KIR ligand to display for ", pheno , ". You can change your frequency and threshold p-value.\n Set at :\n Frequency : ", freq_threshold, "\n Pvalue threshold :",threshold,sep = ""))}

           ######### FONCTIONNE MAIS ILLISIBLE AXE X
           ## Adapt x axis legend with the number of alleles to plot


           ## allelic model
           plot1 <- ggplot( kirSel, aes(x=HLA_KIR, y=  -(log10(P_allelic)))) + geom_point(na.rm = TRUE, color=ifelse((-log10(kirSel$P_allelic)) >= seuil, "red","black")) + geom_hline(yintercept= -log10(0.05) , color = "blue") + ggtitle(paste(" kir vs ",pheno," (allelic model)",sep = ""))+  geom_hline(yintercept= seuil , color = "red")+ xlab("KIR ligand")  + theme(axis.title.x = element_text(color="#2b79be", size=11), axis.text.x = element_text(angle=0), axis.title.y = element_text(color="#2b79be", size=11))
           ## dominant model
           plot2 <- ggplot( kirSel, aes(x=HLA_KIR, y=  -(log10(P_dominant)))) + geom_point(na.rm = TRUE, color=ifelse((-log10(kirSel$P_dominant)) >= seuil, "red","black")) + geom_hline(yintercept= -log10(0.05) , color = "blue") + ggtitle(paste(" kir vs ",pheno," (dominant model)",sep = ""))+  geom_hline(yintercept= seuil , color = "red")+ xlab("KIR ligand")  + theme(axis.title.x = element_text(color="#2b79be", size=11), axis.text.x = element_text(angle=0), axis.title.y = element_text(color="#2b79be", size=11))
           suppressWarnings({
             print(plot1)
             print(plot2)
           })


           dev.off()

         },
         pheno ={
           freq_threshold <- readline(prompt = "\n \n Enter a frequency threshold of KIR-ligand to displays in manhattan plot\n Frequency: " )
           threshold <- readline(prompt = "\n \n In order to display most interesting KIR-ligand and in clearly way. \n Define a p value treshold under which one haplotypes are not displayed in manhattan plot \n P-value:  ")
           X <- c("Phenotype","HLA_KIR","Frequency", "Frequency_control", "Frequency_case", "N","Mean_pheno", "P_allelic", "Beta_allelic", "SE_allelic", "R2_allelic","P_dominant", "Beta_dominant", "SE_dominant","R2_dominant")
           write.table(t(X), file=paste(outputFolder,"BestPvalue-kir-PhenoAssoc.csv",sep = "") , row.names = FALSE ,   col.names = FALSE  ,quote=F, sep = ",")

           pdf(file=paste(outputFolder , outputFile , "-KIR-PhenotypeAssocPlot.pdf" ,sep=""),10,10)
           for(pheno in phenotypes)
           {
             # pheno <- "MFI_CD16_in_CD16hi_of_NKnew.panel4"
             res <- c("HLA_KIR", "Frequency", "Frequency_control", "Frequency_case", "N","Mean_pheno", "P_allelic", "Beta_allelic", "SE_allelic", "R2_allelic","P_dominant", "Beta_dominant", "SE_dominant","R2_dominant")

             ## Merge kir and the phenotype analyzed in this loop
             merge_data_pheno <- merge(data,phenoFile[,c("subjectID",pheno)],by = "subjectID")
             colnames(merge_data_pheno)[ncol(merge_data_pheno)] <- "trait"
             phenoGeno <- na.omit(merge_data_pheno)

             for(kir in list_ligand) # Make a loop with all amino acids
             {
               # Caculate frequency of amino acid
               freq <- mean(stats::na.omit(phenoGeno[, kir]))/2

               n_ind_ctrl <- nrow(subset(phenoGeno, Disease == 0))
               n_ind_cas <-nrow(subset(phenoGeno, Disease == 1))

               if( n_ind_ctrl != 0 & n_ind_cas != 0 )
               { # Frequency are calculated for case and control
                 N <- 4/((1/nrow(subset(phenoGeno, Disease == 1)))+(1/nrow(subset(phenoGeno, Disease == 0))))
                 freq_control <- mean(stats::na.omit(subset(phenoGeno, Disease == 0)[,kir]))/2
                 freq_case <- mean(stats::na.omit(subset(phenoGeno, Disease == 1)[,kir]))/2

               }else{
                 if( n_ind_ctrl == 0 & n_ind_cas != 0){ # won't calculate frequency for ctrl
                   freq_control <- 0
                   freq_case <-mean(stats::na.omit(subset(phenoGeno, Disease == 1)[,kir]))/2
                   N <- nrow(subset(phenoGeno, Disease == 1))

                 }else{
                   if( n_ind_ctrl != 0 & n_ind_cas == 0){ # won't calculate frequency for case
                     # N <- 4/(1/nrow(subset(subdata, Disease == 0)))
                     freq_control <- mean(stats::na.omit(subset(phenoGeno, Disease == 0)[,kir]))/2
                     freq_case <- 0
                     N <- nrow(subset(phenoGeno, Disease == 0))

                   }else{ # Unknown case control Status
                     N <- 0
                     freq_control <- 0
                     freq_case <- 0
                   }}}

               if(is.null(covarPath) & is.null(pcaPath)){
                 # Convert amino acid (0,1,2) to (-1,0,1) to run the regression
                 phenoGeno[,kir][phenoGeno[,kir] == 0] <- -1
                 phenoGeno[,kir][phenoGeno[,kir] == 1] <- 0
                 phenoGeno[,kir][phenoGeno[,kir] == 2] <- 1
                 reg_allelic <- summary(stats::lm(trait ~ phenoGeno[, kir] , data = phenoGeno)) # Linear regression with allelic model

                 # Convert (-1,0,1) to (0,1) to run the regression in dominant model
                 phenoGeno[,kir][phenoGeno[,kir] == 0] <- 1
                 phenoGeno[,kir][phenoGeno[,kir] == -1] <- 0
                 reg_dominant <- summary(stats::lm(trait ~ phenoGeno[, kir] , data = phenoGeno)) # Linear regression with allelic model

               }else{
                 if(!is.null(covarPath) & is.null(pcaPath)){
                   # Convert amino acid (0,1,2) to (-1,0,1) to run the regression
                   phenoGeno[,kir][phenoGeno[,kir] == 0] <- -1
                   phenoGeno[,kir][phenoGeno[,kir] == 1] <- 0
                   phenoGeno[,kir][phenoGeno[,kir] == 2] <- 1
                   reg_allelic <- summary(stats::lm(trait ~ phenoGeno[, kir] + AGE.V0 + SEX + CMV.V0 + NBYTABAC, data = phenoGeno )) # Linear regression with allelic model

                   # Convert (-1,0,1) to (0,1) to run the regression in dominant model
                   phenoGeno[,kir][phenoGeno[,kir] == 0] <- 1
                   phenoGeno[,kir][phenoGeno[,kir] == -1] <- 0
                   phenoGeno[, kir] <-  as.factor(phenoGeno[, kir])
                   reg_dominant <- summary(stats::lm(trait ~ data[, kir] + AGE.V0 + SEX + CMV.V0 + NBYTABAC, data = phenoGeno )) # Linear regression with allelic model
                 }else{
                   if(is.null(covarPath) & !is.null(pcaPath)){
                     # Convert amino acid (0,1,2) to (-1,0,1) to run the regression
                     phenoGeno[,kir][phenoGeno[,kir] == 0] <- -1
                     phenoGeno[,kir][phenoGeno[,kir] == 1] <- 0
                     phenoGeno[,kir][phenoGeno[,kir] == 2] <- 1
                     reg_allelic <- summary(stats::lm(trait ~ phenoGeno[, kir] + PC1 + PC2, data = phenoGeno)) # Linear regression with allelic model

                     # Convert (-1,0,1) to (0,1) to run the regression in dominant model
                     phenoGeno[,kir][phenoGeno[,kir] == 0] <- 1
                     phenoGeno[,kir][phenoGeno[,kir] == -1] <- 0
                     reg_dominant <- summary(stats::lm(trait ~ phenoGeno[, kir] + PC1 + PC2, data = phenoGeno)) # Linear regression with allelic model

                   }else{
                     # Convert amino acid (0,1,2) to (-1,0,1) to run the regression
                     phenoGeno[,kir][phenoGeno[,kir] == 0] <- -1
                     phenoGeno[,kir][phenoGeno[,kir] == 1] <- 0
                     phenoGeno[,kir][phenoGeno[,kir] == 2] <- 1
                     reg_allelic <- summary(stats::lm(trait ~ phenoGeno[, kir] + PC1 + PC2 + AGE.V0 + SEX + CMV.V0 + NBYTABAC, data = phenoGeno)) # Linear regression with allelic model

                     # Convert (-1,0,1) to (0,1) to run the regression in dominant model
                     phenoGeno[,kir][phenoGeno[,kir] == 0] <- 1
                     phenoGeno[,kir][phenoGeno[,kir] == -1] <- 0
                     reg_dominant <- summary(stats::lm(trait ~ phenoGeno[, kir] + PC1 + PC2 + AGE.V0 + SEX + CMV.V0 + NBYTABAC, data = phenoGeno )) # Linear regression with allelic model

                   }}}

               P_allelic <-  reg_allelic$coefficients[2,4]
               Beta_allelic <- round(reg_allelic$coefficients[2,1],3)
               SE_allelic <- round(reg_allelic$coefficients[2,2],3)
               Rsquare_allelic <-round(reg_allelic$adj.r.squared, 3)

               P_dominant <- reg_dominant$coefficients[2,4]
               Beta_dominant <- round(reg_dominant$coefficients[2,1],3)
               SE_dominant <- round(reg_dominant$coefficients[2,2],3)
               Rsquare_dominant <-round(reg_dominant$adj.r.squared, 3)

               options(scipen = 999)

               res <- rbind(res, c(kir, freq, freq_control, freq_case, N, Mean_pheno, P_allelic, Beta_allelic, SE_allelic, Rsquare_allelic, P_dominant, Beta_dominant, SE_dominant, Rsquare_dominant)) # See header of res for elements explanation

             }

             # Save results
             utils::write.table(res, paste(outputFolder , outputFile,"-kir-", pheno,"-Analysis" ,".csv", sep=""), row.names=F, quote=F, col.names=F, sep = ",")
             if(file.exists( paste(outputFolder , outputFile,"-kirpheno-Analysis" ,".csv", sep=""))){
               cat("Regression on KIR ligands is Processed successfully \n")
               cat(paste("File", paste(outputFolder , outputFile , sep="") ,'is created. \n', sep = " "))
             }

             ## Import Files
             kirReg_result <- utils::read.table( paste(outputFolder , outputFile,"-kir-", pheno,"-Analysis" ,".csv", sep="") , header= TRUE , sep = "," , dec = "." , na.strings = NA)
             seuil <- pvalueCorrection(kirRegResult =  paste(outputFolder , outputFile,"-kir-", pheno,"-Analysis" ,".csv", sep=""), correction = "bonferroni")
             if(is.null(seuil)){stop("Bonferroni threshold is null ! ")}

             ################### Best Pvalue selection gathered in an unique file
             resultLoop <- as.data.frame(kirReg_result)
             resultLoop <- resultLoop[resultLoop$Frequency >= freq_threshold  & (resultLoop$P_allelic <= 0.05 | resultLoop$P_dominant <= 0.05) ,]
             # if(resultLoop!=factor(0)){ # if ==factor(0) error is displayed "argument is of length zero"
             if(!is.null(resultLoop) & nrow(resultLoop)!=0){
               suppressWarnings({
                 resultLoop$Phenotype <- pheno
                 resultLoop <-resultLoop[,c("Phenotype","HLA_KIR", "Frequency", "Frequency_control", "Frequency_case", "N","Mean_pheno", "P_allelic", "Beta_allelic", "SE_allelic", "R2_allelic","P_dominant", "Beta_dominant", "SE_dominant","R2_dominant")]
                 write.table(resultLoop, file=paste(outputFolder,"BestPvalue-kir-PhenoAssoc.csv",sep = ""), append = TRUE , row.names = FALSE ,   col.names = FALSE  ,quote=F, sep = ",")
               })
             }
             # }
             ################### QQPLOT
             ## Lambda: genomic inflation factor, show the presence of bias
             # allelic Model
             observedR <- na.omit(sort(kirReg_result$P_allelic))
             lobsR <- -(log10(observedR))
             expectedR <- c(1:length(observedR))
             lexpR <- -(log10(expectedR / (length(expectedR)+1)))
             lambdaR<-summary(lm(lobsR~lexpR))

             # Dominant Model
             observedD <- sort(kirReg_result$P_dominant)
             lobsD <- -(log10(observedD))
             expectedD <- c(1:length(observedD))
             lexpD <- -(log10(expectedD / (length(expectedD)+1)))
             lambdaD<- summary(lm(lobsD~lexpD))

             ## QQ PLOT
             qq1 <- qq(kirReg_result$P_allelic ,main=paste(" kir vs ",pheno," (allelic model)",sep = "") ,  sub= paste("Lambda = ", round(lambdaR$coefficients[2,1],2),sep = ""))
             qq2 <- qq(kirReg_result$P_dominant , main=paste(" kir vs ",pheno," (dominant model)",sep = "") ,  sub= paste("Lambda = ", round(lambdaR$coefficients[2,1],2),sep = ""))
             capture.output(print(qq1),file='NUL')
             capture.output(print(qq2),file='NUL')

             ## Graphic for visualizing P value of kir
             kirSel <- kirReg_result[kirReg_result$Frequency >= freq_threshold & (kirReg_result$P_allelic<=threshold | kirReg_result$P_dominant<=threshold),]
             kirSel <- kirSel[, c("HLA_KIR","P_allelic","P_dominant")]
             if(nrow(kirSel)==0){ message(paste("None KIR ligand to display for ", pheno , ". You can change your frequency and threshold p-value.\n Set at :\n Frequency : ", freq_threshold, "\n Pvalue threshold :",threshold,sep = ""))}

             ######### FONCTIONNE MAIS ILLISIBLE AXE X
             ## Adapt x axis legend with the number of alleles to plot


             ## allelic model
             plot1 <- ggplot( kirSel, aes(x=HLA_KIR, y=  -(log10(P_allelic)))) + geom_point(na.rm = TRUE, color=ifelse((-log10(kirSel$P_allelic)) >= seuil, "red","black")) + geom_hline(yintercept= -log10(0.05) , color = "blue") + ggtitle(paste(" kir vs ",pheno," (allelic model)",sep = ""))+  geom_hline(yintercept= seuil , color = "red")+ xlab("KIR ligand")  + theme(axis.title.x = element_text(color="#2b79be", size=11), axis.text.x = element_text(angle=0), axis.title.y = element_text(color="#2b79be", size=11))
             ## dominant model
             plot2 <- ggplot( kirSel, aes(x=HLA_KIR, y=  -(log10(P_dominant)))) + geom_point(na.rm = TRUE, color=ifelse((-log10(kirSel$P_dominant)) >= seuil, "red","black")) + geom_hline(yintercept= -log10(0.05) , color = "blue") + ggtitle(paste(" kir vs ",pheno," (dominant model)",sep = ""))+  geom_hline(yintercept= seuil , color = "red")+ xlab("KIR ligand")  + theme(axis.title.x = element_text(color="#2b79be", size=11), axis.text.x = element_text(angle=0), axis.title.y = element_text(color="#2b79be", size=11))

             suppressWarnings({
               print(plot1)
               print(plot2)
             })

           }
           dev.off()
         })


}


#######################################################
###  HLA-C expression analysis with many phenotypes ###
#######################################################


#' HLA-C expression analys many variate phenotypes
#'
#' @param dataPath file with subject.ID and HLA-c expression "quantitative"
#' @param phenoPath  file with subject.ID and as many columns as quantitative phenotypes
#' @param statut file conaining subject ID and Disease statut
#' @param pcaPath pca result for PC1 and PC2
#' @param covarPath must contains SubjectID followed by covariables
#' @param outputFolder folder where to save the result
#' @param outputFile Name for the result csv file with columns: subjectID	Disease	PC1	PC2	A	A	B	B	C	C	DRB1	DRB1	DQB1	DQB1
#' @param type etiher "disease" for analysis in case/control or "pheno" for analysis with an quantitative phenotype
#' @return Linear Regression result with the phenotypes or case/control and corrected either with PCs or with covariates
#' @import ggplot2
#' @importFrom utils read.table write.table capture.output
#' @importFrom qqman qq
#' @importFrom stats binomial glm na.omit p.adjust lm
#' @importFrom grDevices dev.off  pdf
#' @export
#'

hlaCexpressionAnalysis <- function( dataPath , phenoPath=NULL  , statut=NULL, pcaPath=NULL,  covarPath=NULL , outputFolder , outputFile , type="pheno")
{
  # Obligatory
  data <-  as.data.frame(utils::read.table(dataPath , stringsAsFactors = F, header = T, sep=','))
  if(ncol(data)!=2){stop("datapath should ocntain only 2 columns: subjectID and HLAC expression")}
  names(data) <- c("subjectID" , "HLAexprC")

  # Option parameters
  if(!is.null(pcaPath)){
    PCs <- stats::na.omit(utils::read.table(pcaPath ,  header = TRUE, sep = "," , dec = "."))
    if(ncol(PCs)!=3){stop("pcaPath should ocntain only 3 columns: subjectID, PC1 and PC2")}
    colnames(PCs)[1]<-"subjectID"
    colnames(PCs)[2]<-"PC1"
    colnames(PCs)[3]<-"PC2"
    data <- merge(data, PCs[,1:3], by ="subjectID")
  }

  if(!is.null(covarPath)){
    covarCorrection <- stats::na.omit(utils::read.table(covarPath ,  header = TRUE, sep = "," , dec = "."))
    colnames(covarCorrection)[1] <- "subjectID"
    covarNames <- stats::na.omit(names(covarCorrection))
    covarNames <- covarNames[covarNames!="subjectID"]
    covariables <- noquote(paste(covarNames, collapse = " + ")) #add as.character ??
    data <- merge(data, covarCorrection, by ="subjectID")
  }

  if(type=="disease")
  {
    if(is.null(statut)){stop("statut must be provided")}
    if(!file.exists(statut)){stop("statut doesn't exist")}
    statut <- stats::na.omit(utils::read.table(statut , header = TRUE , sep = "," , dec = "."))
    if(ncol(statut)!=2){stop("statut mut contains 2 colunms : subjectID and Affection statut")}
    names(statut) <- c("subjectID","Disease")
    data <- merge(data, statut, by ="subjectID")

  }else{
    if(type=="pheno")
    {
      if(is.null(phenoPath)){stop('phenoPath must be provided when type="pheno"')}
      if(!file.exists(phenoPath)){stop("phenoPath file doesn't exist")}
      phenoFile <-  as.data.frame(utils::read.table(phenoPath , stringsAsFactors = F, header = T, sep=','))
      colnames(phenoFile)[1] <- "subjectID"
      #List of phenotypes and covarNames
      phenotypes <- stats::na.omit(unique(c(names(phenoFile)))) # Make a list of of phenotypes in the file with colnames
      phenotypes <- phenotypes[phenotypes != "subjectID"]

    }
  }
  ggplotRegression <- function (fit) {
    requireNamespace(ggplot2)
    ggplot(fit$model, aes_string(x = names(fit$model)[2], y = names(fit$model)[1])) +
      geom_point() +
      stat_smooth(method = "lm", col = "red") +
      labs(title = paste(pheno," vs HLA-C Expression"),
           subtitle=paste("Adj R2 = ",signif(summary(fit)$adj.r.squared, 5),
                          "Intercept =",signif(fit$coef[[1]],5 ),
                          " Slope =",signif(fit$coef[[2]], 5),
                          " P =",signif(summary(fit)$coef[2,4], 5))) +
      xlab("HLA-C expression ") + ylab(paste(pheno," expression", sep=""))+
      theme(
        axis.title.x = element_text(color="#2b79be", size=11),
        axis.title.y = element_text(color="#2b79be", size=11),
        plot.title = element_text(hjust = 0.5, size = 12, color = "#ec6649", face = "bold"),    # Center title position and size
        plot.subtitle = element_text(hjust = 0.5,size=10, color = "#07985b",face = "italic"))            # Center subtitle
  }


  switch(type,
         disease={
           res <- c("Mean_HLACexpr","N","N_ctrl","N_case", "P", "Beta", "SE","R2")
           pdf(file=paste(outputFolder , outputFile , "-HLA-KIR_Disease_analysis_regressionPlot.pdf" ,sep=""))

           ## Frequencies
           freq <- sum(phenoGeno[,val1]==1,phenoGeno[,val2]==1)/(nrow(phenoGeno)*2) # Overall frequency
           n_ind_ctrl <- nrow(subset(phenoGeno, Disease == 0))
           n_ind_cas <-nrow(subset(phenoGeno, Disease == 1))

           if( n_ind_ctrl != 0 & n_ind_cas != 0 )
           { # Frequency are calculated for case and control
             # N <- 4/((1/nrow(subset(all_data, Disease == 1)))+(1/nrow(subset(all_data, Disease == 0))))
             N <- nrow(phenoGeno[,!is.na(Disease)])
             freq_control <- sum(subset(phenoGeno, Disease == 0)[,val1]==1,subset(phenoGeno, Disease == 0)[,val2]==1)/(nrow(subset(phenoGeno, Disease == 0))*2) # Controls frequency
             freq_case <- sum(subset(phenoGeno, Disease == 1)[,val1]==1,subset(phenoGeno, Disease == 1)[,val2]==1)/(nrow(subset(phenoGeno, Disease == 1))*2) # Cases frequency

           }else{
             if( n_ind_ctrl == 0 & n_ind_cas != 0){ # won't calculate frequency for ctrl and set to 0
               freq_control <- 0
               freq_case <- sum(subset(phenoGeno, Disease == 1)[,val1]==1,subset(phenoGeno, Disease == 1)[,val2]==1)/(nrow(subset(phenoGeno, Disease == 1))*2) # Cases frequency
               N <- nrow(subset(phenoGeno, Disease == 1))

             }else{
               if( n_ind_ctrl != 0 & n_ind_cas == 0){ # won't calculate frequency for case and set to 0
                 freq_control <- sum(subset(phenoGeno, Disease == 0)[,val1]==1,subset(phenoGeno, Disease == 0)[,val2]==1)/(nrow(subset(phenoGeno, Disease == 0))*2) # Controls frequency
                 freq_case <- 0
                 N <- nrow(subset(phenoGeno, Disease == 0))
               }else{ # Unknown case control Status
                 N <- 0
                 freq_control <- 0
                 freq_case <- 0
               }
             }
           }
           if(is.null(covarPath) & is.null(pcaPath)){
             reg <- summary(stats::glm(Disease~HLAexprC, data = phenoGeno, family=stats::binomial(link="logit"), maxit = 100)) # explain the phenotype with HLAC expression
           }else{
             if(!is.null(covarPath) & is.null(pcaPath)){
               reg <- summary(stats::glm(Disease~HLAexprC + AGE.V0 + SEX + CMV.V0 + NBYTABAC , data = phenoGeno, family=stats::binomial(link="logit"), maxit = 100)) # explain the phenotype with HLAC expression
             }else{
               if(is.null(covarPath) & !is.null(pcaPath)){
                 reg <- stats::glm(Disease~HLAexprC + PC1 + PC2 , data = phenoGeno, family=stats::binomial(link="logit"), maxit = 100)
               }else{
                 reg <- summary(stats::glm(Disease~HLAexprC + AGE.V0 + SEX + CMV.V0 + NBYTABAC + PC1 + PC2 , data = phenoGeno, family=stats::binomial(link="logit"), maxit = 100))
               }}}

           # print(reg)
           P <- reg$coefficients[2,4]
           OR <- round(exp(reg$coefficients[2,1]),2)
           lower_95_CI <- round(exp(reg$coefficients[2,1]-1.96*reg$coefficients[2,2]),2)
           upper_95_CI <- round(exp(reg$coefficients[2,1]+1.96*reg$coefficients[2,2]),2)
           Beta <- round(reg$coefficients[2,1],2)
           SE <- round(reg$coefficients[2,2],2)
           MeanHLACexpr <- mean(phenoGeno$HLAexprC)
           Rsquare <-signif(reg$adj.r.squared, 5)
           # add the regression result to the "res" vector
           res <- rbind(res, c(MeanHLACexpr, N,Nctrl, Ncase, P, OR,Rsquare, lower_95_CI, upper_95_CI, Beta, SE)) # See header of res for elements explanation

           print(ggplotRegression(reg))
           dev.off()
           utils::write.table(res, paste(outputFolder , outputFile , "-HLAC-expr-Analysis" , ".csv" ,sep=""), row.names=F, quote=F, col.names=F, sep = ",")
           if(file.exists(paste(outputFolder , outputFile , "-HLAC-expr-Analysis" , ".csv" ,sep=""))){
             cat(paste("File", paste(outputFolder , outputFile, "-", pheno , ".csv"  , sep="") ,'is created.  \n', sep = " "))
           }
         },
         pheno ={
           res <- c("Phenotype","Mean_phenotype","Mean_HLACexpr","N", "P", "OR","R^2", "lower_0.95_CI", "upper_0.95_CI", "Beta", "SE")
           pdf(file=paste(outputFolder , outputFile , "-regressionPlot.pdf" ,sep=""))

           for(pheno in phenotypes)
           {
             # Merge of pheno columns and alleles
             phenoGeno <- merge(data,phenoFile[,c("subjectID",pheno)],by = "subjectID")
             colnames(phenoGeno)[ncol(phenoGeno)] <- "trait"
             phenoGeno <- na.omit(phenoGeno)

             N <- nrow(phenoGeno)
             if(is.null(covarPath) & is.null(pcaPath)){
               reg <- summary(stats::lm(trait~HLAexprC, data = phenoGeno)) # explain the phenotype with HLAC expression

             }else{
               if(!is.null(covarPath) & is.null(pcaPath)){
                 reg <- summary(stats::lm(trait~HLAexprC + AGE.V0 + SEX + CMV.V0 + NBYTABAC , data = phenoGeno)) # explain the phenotype with HLAC expression
               }else{
                 if(is.null(covarPath) & !is.null(pcaPath)){
                   reg <- stats::lm(trait~HLAexprC + PC1 + PC2 , data = phenoGeno)
                 }else{reg <- summary(stats::lm(trait~HLAexprC + AGE.V0 + SEX + CMV.V0 + NBYTABAC + PC1 + PC2 , data = phenoGeno))}}}

             P <- summary(reg)$coefficients[2,4]
             lower_95_CI <- round(exp(summary(reg)$coefficients[2,1]-1.96*summary(reg)$coefficients[2,2]),2)
             upper_95_CI <- round(exp(summary(reg)$coefficients[2,1]+1.96*summary(reg)$coefficients[2,2]),2)
             Beta <- round(summary(reg)$coefficients[2,1],2)
             SE <- round(summary(reg)$coefficients[2,2],2)
             Mean_pheno <- mean(phenoGeno$trait)
             MeanHLACexpr <- mean(phenoGeno$HLAexprC)
             Rsquare <-signif(summary(reg)$adj.r.squared, 5)
             # add the regression result to the "res" vector
             res <- rbind(res, c(pheno,Mean_pheno,MeanHLACexpr, N, P,Rsquare, lower_95_CI, upper_95_CI, Beta, SE)) # See header of res for elements explanation

             print(ggplotRegression(reg))

           }
           dev.off()
           utils::write.table(res, paste(outputFolder , outputFile , "-HLAC-expr-Analysis" , ".csv" ,sep=""), row.names=F, quote=F, col.names=F, sep = ",")
           if(file.exists(paste(outputFolder , outputFile , "-HLAC-expr-Analysis" , ".csv" ,sep=""))){
             cat("Regression of HLA-C expression is Processed successfully \n")
             cat(paste("File", paste(outputFolder , outputFile , "-HLAC-expr-Analysis" , ".csv"  , sep="") ,'is created.  \n', sep = " "))
           }})

}


#' Function that correct P-value for HLA alleles, haplotypes or Amino Acids, kir ligand and HLA-C expression. Propose 2 differents methods: bonferroni with allele's regression result or Permutation by redoing regression over 1000 times or different number of loop.
#' @param allele (permutation) Imputation's result file with 14 colunms : subjectID, Disease, PC1, PC2, prob, A, A, B, B, C, C, DRB1, DRB1, DQB1, DQB1 ("," separator between fields)
#' @param haplo (permutation) Haplotypes file with at least these columns : subjectID,	A-B-C-DR-DQ-1,	freq1,	A-B-C-DR-DQ-2,	freq2,	PostP ("," separator between fields)
#' @param AA  (permutation) Amino-Acid File from easy-hla
#' @param KIR (permutation) KIR ligand's file from easy-hla
#' @param HLAC (permutation) HLAC expression as provided by easy-hla
#' @param alleleRegResult (bonferroni) result file of Regression on allele : function
#' @param haploRegResult (bonferroni) result file of Regression on haplotypes : function
#' @param AARegResult (bonferroni) result file of Regression on AA : function
#' @param kirRegResult (bonferroni) result file of Regression on kir ligands : function
#' @param hlaCRegResult (bonferroni) result file of Regression on kir ligands : function
#' @param pcaFile FIle with sample id and two principal components of PCA
#' @param statut File with sample id and disease statut (coded 0 1)
#' @param correction Method to correct P value: permutation or bonferroni (default).
#' @return Return a list with the new critical P value for each level of analysis: seuilAllele, seuilHaplo and seuilAA
#' @importFrom utils read.table write.table capture.output
#' @export
#'

pvalueCorrection <- function(allele = NULL, haplo = NULL, AA = NULL, KIR = NULL, HLAC= NULL
                             ,alleleRegResult = NULL, haploRegResult = NULL, AARegResult = NULL, kirRegResult=NULL, hlaCRegResult=NULL
                             ,pcaFile = NULL , statut = NULL , correction = "bonferroni"){
  # 1er choix la correction
  # 2eme choix le niveau d'analyse
  # Calcul d'un seul seuil puis retourner par la fonction
  # #' @param numberLoop Number of loop for regression when performing permutation correction of P value.
  # , numberLoop=100
  if( correction == "bonferroni" ){
    if(!is.null(alleleRegResult)){
      data <- utils::read.table(alleleRegResult , header= TRUE , sep = "," , dec = "." , na.strings = NA)
      seuil <- -log10((0.05/(nrow(data)-1))) # nombre d'allèles uniques exsitants
    }
    if(!is.null(haploRegResult)){
      data <- utils::read.table(haploRegResult , header= TRUE , sep = "," , dec = "." , na.strings = NA)
      seuil <- -log10((0.05/(nrow(data)-1)))

    }
    if(!is.null(AARegResult)){
      data <- utils::read.table(AARegResult , header= TRUE , sep = "," , dec = "." , na.strings = NA)
      seuil <- -log10((0.05/(nrow(data)-1)))
    }
    if(!is.null(kirRegResult)){
      data <- utils::read.table(kirRegResult , header= TRUE , sep = "," , dec = "." , na.strings = NA)
      seuil <- -log10((0.05/(nrow(data)-1)))
    }
    if(!is.null(hlaCRegResult)){
      data <- utils::read.table(hlaCRegResult , header= TRUE , sep = "," , dec = "." , na.strings = NA)
      seuil <- -log10((0.05/(nrow(data)-1)))
    }

  }else{

    if( correction == "permutation"){

      if(!is.null(allele)){
        data <- utils::read.table(allele , header= TRUE , sep = "," , dec = "." , na.strings = NA)
        HLA_genes <- c("A", "B", "C", "DRB1", "DQB1")
        res <- c("HLA_allele", "Frequency", "Frequency_control", "Frequency_case", "N", "P_allelic", "OR_allelic", "lower_95%_CI_allelic", "upper_95%_CI_allelic", "Beta_allelic", "SE_allelic", "P_dominant", "OR_dominant", "lower_95%_CI_dominant", "upper_95%_CI_dominant", "Beta_dominant", "SE_dominant")
        p <- c()
        best_Psample <- c()
        for( i in 1:1000) # set to 1000 by default
        {
          print(i)
          pheno <- sample(data$Disease, replace = TRUE )
          data$Disease <- pheno

          for(gene in HLA_genes) # Make a loop with all the HLA genes
          {
            subdata <- data[,c("subjectID","Disease","PC1","PC2",gene,paste(gene,".1",sep = ""))]
            names(subdata) <- c("subjectID","Disease","PC1","PC2", "allele1" , "allele2" )
            HLA <- na.omit(as.character(unique(c(unique(subdata$allele1), unique(subdata$allele2))))) # Make a list of all existing HLA alleles in each file

            for(allele in HLA)
            {
              val1 <- paste("allele1_", allele, sep="")
              subdata[,val1] <- ifelse(subdata$allele1 == allele, 1,
                                       ifelse(subdata$allele1 != allele, 2, NA))
              val2 <- paste("allele2_", allele, sep="")
              subdata[,val2] <- ifelse(subdata$allele2 == allele, 1,
                                       ifelse(subdata$allele2 != allele, 2, NA))
              val3 <- paste("genotype_", allele, sep="")
              subdata[,val3] <- paste(subdata[,val1], subdata[,val2], sep="")
              val4 <- paste("genoA_", allele, sep="")
              subdata[,val4] <- ifelse(subdata[,val3] == '11', 1,
                                       ifelse(subdata[,val3] == '12', 0,
                                              ifelse(subdata[,val3] == '21', 0,
                                                     ifelse(subdata[,val3] == '22', -1, NA))))
              val5 <- paste("genoD_", allele, sep="")
              subdata[,val5] <- ifelse(subdata[,val3] == '11', 1,
                                       ifelse(subdata[,val3] == '12', 1,
                                              ifelse(subdata[,val3] == '21', 1,
                                                     ifelse(subdata[,val3] == '22', 0, NA))))
              reg_allelic <- summary(glm(Disease ~ subdata[, val4] + PC1 + PC2, data = subdata, family=binomial(link="logit"))) # Logistic regression with allelic model
              reg_dominant <- summary(glm(Disease ~ subdata[, val5] + PC1 + PC2, data = subdata, family=binomial(link="logit"))) # Logistic regression with dominant model

              freq <- sum(subdata[,val1]==1,subdata[,val2]==1)/(nrow(subdata)*2) # Overall frequency
              freq_control <- sum(subset(subdata, Disease == 0)[,val1]==1,subset(subdata, Disease == 0)[,val2]==1)/(nrow(subset(subdata, Disease == 0))*2) # Controls frequency
              freq_case <- sum(subset(subdata, Disease == 1)[,val1]==1,subset(subdata, Disease == 1)[,val2]==1)/(nrow(subset(subdata, Disease == 1))*2) # Cases frequency with homozygote for the allele (11)
              res <- rbind(res, c(allele , freq , freq_control , freq_case, 4/((1/nrow(subset(data, Disease == 1)))+(1/nrow(subset(data, Disease == 0)))), reg_allelic$coefficients[2,4], exp(reg_allelic$coefficients[2,1]), exp(reg_allelic$coefficients[2,1]-1.96*reg_allelic$coefficients[2,2]), exp(reg_allelic$coefficients[2,1]+1.96*reg_allelic$coefficients[2,2]), reg_allelic$coefficients[2,1], reg_allelic$coefficients[2,2], reg_dominant$coefficients[2,4], exp(reg_dominant$coefficients[2,1]), exp(reg_dominant$coefficients[2,1]-1.96*reg_dominant$coefficients[2,2]), exp(reg_dominant$coefficients[2,1]+1.96*reg_dominant$coefficients[2,2]), reg_dominant$coefficients[2,1], reg_dominant$coefficients[2,2])) # See header of res for elements explanation
            } # fin boucle alleles pour 1 gène
            a <- sort( as.vector(as.numeric(res[-1,6])) , decreasing = FALSE)
            p <- sort( append(p , a[1], after = length(p)) , decreasing = FALSE)  # List of Pvalue obtained for all 5 genes
          } # fin boucle pour 5 gènes
          best_Psample <- sort( append( best_Psample , p[1] , after = length(best_Psample))) ## 1000 lowest P value from each
        }
        seuil <- round(best_Psample[50] , 9) # set to 50
        seuil <- -log10(seuil)
      }



      if(!is.null(haplo) &!is.null(pcaFile) & !is.null(statut)){
        data <- utils::read.table(haplo , header= TRUE , sep = "," , dec = "." , na.strings = NA)
        statut <- stats::na.omit(utils::read.table(statut , header = TRUE , sep = "," , dec = "."))
        if(ncol(statut)!=2){stop("statut must contains 2 colunms : subjectID and affection statut.")}
        PCs <- stats::na.omit(utils::read.table(pcaFile ,  header = TRUE))
        if(ncol(pca)!=3){stop("pcaFile must contains 3 colunms : subjectID, PC1 and PC2.")}
        colnames(data)[1] <- "sample.id"
        colnames(statut)[1] <- "sample.id"
        colnames(PCs)[1] <- "sample.id"
        data <- merge(data, statut, by ="sample.id")
        all_data <- merge(data ,PCs[,1:3], by ="sample.id")
        colnames(all_data)[ncol(all_data)-2] <- "Statut"
        all_data$pheno <- ifelse(all_data$Statut=="Ctrl",0,1)
        res <- c("HLA_haplotype", "Frequency", "Frequency_control", "Frequency_case", "N", "P_allelic", "OR_allelic", "lower_0.95_CI_allelic", "upper_0.95_CI_allelic", "Beta_allelic", "SE_allelic", "P_dominant", "OR_dominant", "lower_95%_CI_dominant", "upper_95%_CI_dominant", "Beta_dominant", "SE_dominant")

        ## Determine haplotypes to analyze; more than 2 occurences
        df <- as.data.frame(table(stats::na.omit(c( as.character(all_data$A.B.C.DR.DQ.1) , as.character(all_data$A.B.C.DR.DQ.2))))) ## number of occurences
        haplo <-subset(df , Freq >= 2) ## select haplo that appears more than 2 times
        p <- c()
        for( i in 1:1000) # set to 1000
        {
          for(hap in haplo[,1]){ # Make a loop with all the HLA haplotypes, Convert the haplotypes to 1,2 code (val1, val2), then to genotype (val3), and finally to recessive (val4) or dominant (val5) testable model
            val1 <- paste("allele1_", hap, sep="")
            all_data[,val1] <- ifelse(all_data$A.B.C.DR.DQ.1 == hap, 1,
                                      ifelse(all_data$A.B.C.DR.DQ.1 != hap, 2, NA))

            val2 <- paste("allele2_", hap, sep="")
            all_data[,val2] <- ifelse(all_data$A.B.C.DR.DQ.2 == hap, 1,
                                      ifelse(all_data$A.B.C.DR.DQ.2 != hap, 2, NA))

            val3 <- paste("genotype_", hap, sep="")
            all_data[,val3] <- paste(all_data[,val1], all_data[,val2], sep="")

            val4 <- paste("genoA_", hap, sep="")
            all_data[,val4] <- ifelse(all_data[,val3] == '11', 1,
                                      ifelse(all_data[,val3] == '12', 0,
                                             ifelse(all_data[,val3] == '21', 0,
                                                    ifelse(all_data[,val3] == '22', -1, NA))))

            val5 <- paste("genoD_", hap, sep="")
            all_data[,val5] <- ifelse(all_data[,val3] == '11', 1,
                                      ifelse(all_data[,val3] == '12', 1,
                                             ifelse(all_data[,val3] == '21', 1,
                                                    ifelse(all_data[,val3] == '22', 0, NA))))

            freq <- sum(all_data[,val1]==1,all_data[,val2]==1)/(nrow(all_data)*2) # Overall frequency
            freq_control <- sum(subset(all_data, pheno == 0)[,val1]==1,subset(all_data, pheno == 0)[,val2]==1)/(nrow(subset(all_data, pheno == 0))*2) # Controls frequency
            freq_case <- sum(subset(all_data, pheno == 1)[,val1]==1,subset(all_data, pheno == 1)[,val2]==1)/(nrow(subset(all_data, pheno == 1))*2) # Cases frequency

            reg_allelic <- summary(stats::glm(pheno ~ all_data[, val4] + PC1 + PC2 , data = all_data, family=stats::binomial(link="logit"))) # Logistic regression with allelic model
            reg_dominant <- summary(stats::glm(pheno ~ all_data[, val5] + PC1 + PC2 , data = all_data, family=stats::binomial(link="logit"))) # Logistic regression with dominant model

            res <- rbind(res, c(hap, freq, freq_control, freq_case, 4/((1/nrow(subset(all_data, pheno == 1)))+(1/nrow(subset(all_data, pheno == 0)))), reg_allelic$coefficients[2,4], exp(reg_allelic$coefficients[2,1]), exp(reg_allelic$coefficients[2,1]-1.96*reg_allelic$coefficients[2,2]), exp(reg_allelic$coefficients[2,1]+1.96*reg_allelic$coefficients[2,2]), reg_allelic$coefficients[2,1], reg_allelic$coefficients[2,2], reg_dominant$coefficients[2,4], exp(reg_dominant$coefficients[2,1]), exp(reg_dominant$coefficients[2,1]-1.96*reg_dominant$coefficients[2,2]), exp(reg_dominant$coefficients[2,1]+1.96*reg_dominant$coefficients[2,2]), reg_dominant$coefficients[2,1], reg_dominant$coefficients[2,2])) # See header of res for elements explanation

          }# fin boucle pour TOUT les haplotypes
          a <- sort( as.vector(as.numeric(res[-1,6])) , decreasing = FALSE)
          p <- sort( append(p , a[1], after = length(p)) , decreasing = FALSE)  # Autant de p value que d'haplotype ON garde une p value à chaque tour de boucle
        }
        seuil <-round(p[50] , 9) # p contains 1000 p values # set to 50
        seuil <- -log10(seuil)
      }


      if(!is.null(AA)){
        if(is.null(pcaFile)){stop(" \n pvalueCorrection function need PCA data of your cohort. ")}
        amino_acid <- utils::read.csv(AA, stringsAsFactors = F,sep=",", header = TRUE) # Import amino acid data
        amino_acid <- amino_acid[, apply(amino_acid, 2, function(x) length(unique(stats::na.omit(x)))) > 1] # Keep amino acid with information
        list_aa <- colnames(amino_acid)[-c(1:6,(ncol(amino_acid)-6):ncol(amino_acid))]

        statut <- stats::na.omit(utils::read.table(statut , header = TRUE , sep = "," , dec = "."))
        if(ncol(statut)!=2){stop("statut must contains 2 colunms : subjectID and affection statut.")}
        pca <- utils::read.table(pcaFile, header=T, stringsAsFactors = F, sep=",") # Load each HLA imputed file for each cohort
        if(ncol(pca)!=3){stop("pcaFile must contains 3 colunms : subjectID, PC1 and PC2.")}
        colnames(amino_acid)[1] <- "sample.id"
        names(statut) <- c("sample.id","Disease")
        names(pca) <- c("sample.id","PC1","PC2")

        data <- merge(amino_acid, pca[,1:3], by="sample.id")
        all_data <-  merge(data, statut, by="sample.id")
        res <- c("HLA_Amino_Acid", "Frequency", "Frequency_control", "Frequency_case", "N", "P_allelic", "OR_allelic", "lower_95%_CI_allelic", "upper_95%_CI_allelic", "Beta_allelic", "SE_allelic", "P_dominant", "OR_dominant", "lower_95%_CI_dominant", "upper_95%_CI_dominant", "Beta_dominant", "SE_dominant")
        p <- c()
        for( i in 1:1000)
        {
          print(i)
          for(aa in list_aa) # Make a loop with all amino acids
          {
            # Caculate frequency of amino acid
            freq <- mean(stats::na.omit(all_data[,aa]))/2
            freq_control <- mean(stats::na.omit(subset(all_data, Disease == 0)[,aa]))/2
            freq_case <- mean(stats::na.omit(subset(all_data, Disease == 1)[,aa]))/2

            # Convert amino acid (0,1,2) to (-1,0,1) to run the regression
            all_data[,aa][all_data[,aa] == 0] <- -1
            all_data[,aa][all_data[,aa] == 1] <- 0
            all_data[,aa][all_data[,aa] == 2] <- 1

            # Run the allelic regression
            reg_allelic <- summary(stats::glm(Disease ~ all_data[, aa] + PC1 + PC2, data = all_data, family=stats::binomial(link="logit"))) # Logistic regression with allelic model

            # Convert (-1,0,1) to (0,1) to run the regression in dominant model
            all_data[,aa][all_data[,aa] == 0] <- 1
            all_data[,aa][all_data[,aa] == -1] <- 0

            # Run the dominant regression
            reg_dominant <- summary(stats::glm(Disease ~ all_data[, aa] +  PC1 + PC2, data = all_data, family=stats::binomial(link="logit"))) # Logistic regression with allelic model
            res <- rbind(res, c(aa, freq, freq_control, freq_case, 4/((1/nrow(subset(all_data, Disease == 1)))+(1/nrow(subset(all_data, Disease == 0)))), reg_allelic$coefficients[2,4], exp(reg_allelic$coefficients[2,1]), exp(reg_allelic$coefficients[2,1]-1.96*reg_allelic$coefficients[2,2]), exp(reg_allelic$coefficients[2,1]+1.96*reg_allelic$coefficients[2,2]), reg_allelic$coefficients[2,1], reg_allelic$coefficients[2,2], reg_dominant$coefficients[2,4], exp(reg_dominant$coefficients[2,1]), exp(reg_dominant$coefficients[2,1]-1.96*reg_dominant$coefficients[2,2]), exp(reg_dominant$coefficients[2,1]+1.96*reg_dominant$coefficients[2,2]), reg_dominant$coefficients[2,1], reg_dominant$coefficients[2,2])) # See header of res for elements explanation
          }
          a <- sort( as.vector(as.numeric(res[-1,6])) , decreasing = FALSE) # Pvalues of all AA
          p <- sort( append(p , a[1], after = length(p)) , decreasing = FALSE)  # One P value for each loop

        }
        # x <- 0.05*numberLoop
        # n <- round(x,digits = -1)
        # if(n==0){n<-1}
        # seuil <- round(p[n] , 9)
        #
        seuil <- round(p[50] , 9)
        seuil <- -log10(seuil)
      }# END of AA levels




    } # END if correction == "permutation"
  } # else

  return(seuil)
}
benabidlina/HLAfix documentation built on Aug. 20, 2021, 11:53 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
benabidlina/HLAfix
HLA Analysis Pipeline for large-scale Association Studies

R/part3.R
In benabidlina/HLAfix: HLA Analysis Pipeline for large-scale Association Studies

Defines functions pvalueCorrection hlaCexpressionAnalysis hlaKIRAnalysis hlaAminoAcidAnalysis hlaHaplotypeAnalysis hlaAlleleAnalysis

Documented in hlaAlleleAnalysis hlaAminoAcidAnalysis hlaCexpressionAnalysis hlaHaplotypeAnalysis hlaKIRAnalysis pvalueCorrection

R Package Documentation

Browse R Packages

We want your feedback!

benabidlina/HLAfix HLA Analysis Pipeline for large-scale Association Studies

R/part3.R In benabidlina/HLAfix: HLA Analysis Pipeline for large-scale Association Studies

Defines functions pvalueCorrection hlaCexpressionAnalysis hlaKIRAnalysis hlaAminoAcidAnalysis hlaHaplotypeAnalysis hlaAlleleAnalysis

Documented in hlaAlleleAnalysis hlaAminoAcidAnalysis hlaCexpressionAnalysis hlaHaplotypeAnalysis hlaKIRAnalysis pvalueCorrection

R Package Documentation

Browse R Packages

We want your feedback!

benabidlina/HLAfix
HLA Analysis Pipeline for large-scale Association Studies

R/part3.R
In benabidlina/HLAfix: HLA Analysis Pipeline for large-scale Association Studies