#' BIGDAWG Main Wrapper Function
#'
#' This is the main wrapper function for each analysis.
#' @param Data Name of the genotype data file.
#' @param HLA Logical Indicating whether data is HLA class I/II genotyping data only.
#' @param Run.Tests Specifics which tests to run.
#' @param Loci.Set Input list defining which loci to use for analyses (combinations permitted).
#' @param All.Pairwise Logical indicating whether all pairwise loci should be analyzed in haplotype analysis.
#' @param Trim Logical indicating if HLA alleles should be trimmed to a set resolution.
#' @param Res Numeric setting what desired resolution to trim HLA alleles.
#' @param EVS.rm Logical indicating if expression variant suffixes should be removed.
#' @param Missing Numeric setting allowable missing data for running analysis (may use "ignore").
#' @param Cores.Lim Interger setting the number of cores accessible to BIGDAWG (Windows limit is 1 core).
#' @param Results.Dir Optional, string of full path directory name for BIGDAWG output.
#' @param Return Logical Should analysis results be returned as list.
#' @param Output Logical Should analysis results be written to output directory.
#' @param Merge.Output Logical Should analysis results be merged into a single file for easy access.
#' @param Verbose Logical Should a summary of each analysis be displayed in console.
#' @examples
#' ### The following examples use the synthetic data set bundled with BIGDAWG
#'
#' # Haplotype analysis with no missing genotypes for two loci sets
#' # Significant haplotypic association with phenotype
#' # BIGDAWG(Data="HLA_data", Run.Tests="H", Missing=0, Loci.Set=list(c("DRB1","DQB1")))
#'
#' # Hardy-Weinberg and Locus analysis ignoring missing data
#' # Significant locus associations with phenotype at all but DQB1
#' # BIGDAWG(Data="HLA_data", Run.Tests="L", Missing="ignore")
#'
#' # Hardy-Weinberg analysis trimming data to 2-Field resolution with no output to files (console only)
#' # Significant locus deviation at DQB1
#' BIGDAWG(Data="HLA_data", Run.Tests="HWE", Trim=TRUE, Res=2, Output=FALSE)
BIGDAWG <- function(Data, HLA=TRUE, Run.Tests, Loci.Set, All.Pairwise=FALSE, Trim=FALSE, Res=2, EVS.rm=FALSE, Missing=2, Cores.Lim=1L, Results.Dir, Return=FALSE, Output=TRUE, Merge.Output=FALSE, Verbose=TRUE) {
options(warn=-1)
MainDir <- getwd()
on.exit(setwd(MainDir), add = TRUE)
# CHECK PARAMETERS
if( missing(Data) ) { Err.Log("P.Missing","Data") ; stop("Conversion Stopped.",call.=FALSE) }
Check.Params(HLA, All.Pairwise, Trim, Res, EVS.rm, Missing, Cores.Lim, Return, Output, Merge.Output, Verbose)
# MULTICORE LIMITATIONS
Cores <- Check.Cores(Cores.Lim, Output)
cat(rep("=",40))
cat("\n BIGDAWG: Bridging ImmunoGenomic Data Analysis Workflow Gaps\n")
cat(rep("=",40),"\n")
cat("\n>>>>>>>>>>>>>>>>>>>>>>>>> BEGIN Analysis <<<<<<<<<<<<<<<<<<<<<<<<<\n\n")
# Define Output object
BD.out <- list()
# ===================================================================================================================================== ####
# Read in Data ________________________________________________________________________________________________________________________ ####
NAstrings=c("NA","","****","-","na","Na")
if(is.character(Data)) {
if (Data=="HLA_data") {
# Using internal synthetic set
Tab <- BIGDAWG::HLA_data
Data.Flag <- "Internal Synthetic Data Set"
} else {
# Read in data file
if(!file.exists(Data)) { Err.Log(Output,"Bad.Filename", Data) ; stop("Analysis stopped.",call.=F) }
Tab <- read.table(Data, header = T, sep="\t", stringsAsFactors = F, na.strings=NAstrings, fill=T, comment.char = "#", strip.white=T, blank.lines.skip=T, colClasses="character")
Data.Flag <- Data # use data file name
}
} else {
# Using R object
Tab <- Data
Data.Flag <- "R Data Object"
}
# Declare Data Input Parameter
cat("Data Input:",Data.Flag,"\n\n\n")
# Convert GLS data
if( ncol(Tab)==3 && !HLA ) { Err.Log(Output,"notHLA.GLS") }
if( ncol(Tab)==3 && HLA ) {
cat("Converting Gene List Strings to Tabular Format...\n\n")
Tab <- GLSconvert(Tab,Convert="GL2Tab",System="HLA",File.Output="R",Strip.Prefix=T,Abs.Fill=T,Cores.Lim=Cores)
}
# Prep Data for processing and checks
Tab <- prepData(Tab)
# Create and Change to the required output directory
if (Output) {
if( missing(Results.Dir) ) {
OutDir <- paste(MainDir,"/output ",format(Sys.time(), "%d%m%y %H%M%S"),sep="")
dir.create(OutDir)
} else {
OutDir <- Results.Dir
}
}
if(Output) { setwd(OutDir) }
# ===================================================================================================================================== ####
# Data Processing and Sanity Checks ___________________________________________________________________________________________________ ####
cat(">>>> DATA PROCESSING AND CHECKS.\n")
#### General processing and checks for all data
# Define Data Columns
Data.Col <- seq(3,ncol(Tab))
# RUN TESTS DEFINITIONS
if (missing(Run.Tests)) { Run <- c("HWE","H","L","A") } else { Run <- Run.Tests }
if(!HLA) {
if("A" %in% Run) {
cat("Not HLA data. Skipping Amino Acid Analysis.\n")
Run <- Run[-which(Run=="A")]
}
}
# BAD DATA DEFINITIONS - No 1's or 0's
if( length(which(Tab[,Data.Col]==0))>0 || length(which(Tab[,Data.Col]==0))>1 ) {
Err.Log(Output,"Bad.Data")
stop("Analysis Stopped.",call. = F)
}
# MISSING DATA
if(Missing == "ignore") {
cat("Ignoring any missing data.\n")
Err.Log(Output,"Ignore.Missing")
rows.rm <- NULL
} else {
if (Missing > 2) {
if ("H" %in% Run) { Err.Log(Output,"Big.Missing") }
}
cat("Removing any missing data. This will affect Hardy-Weinberg Equilibrium test.\n")
geno.desc <- summaryGeno.2(Tab[,Data.Col], miss.val=NAstrings)
test <- geno.desc[,2] + 2*geno.desc[,3]
rows.rm <- which(test > Missing)
if( length(rows.rm) > 0 ) {
rows.rm <- which(test > Missing)
ID.rm <- Tab[rows.rm,1]
Tab <- Tab[-rows.rm,]
if(Output) { write.table(ID.rm, file="Removed_SampleIDs.txt", sep="\t", row.names=F, col.names=F, quote=F) }
}
rm(geno.desc,test,ID.rm)
if(nrow(Tab)==0) { Err.Log(Output,"TooMany.Missing") ; stop("Analysis Stopped.",call. = F) }
}
# MULTIPLE SETS AND ANALYSIS DUPLICATION
if(!missing(Loci.Set)) {
if( length(Loci.Set)>1 && (All.Pairwise | "L" %in% Run | "A" %in% Run ) ) { Err.Log(Output,"MultipleSets") }
}
# DATA MERGE AND NUMBER OF LOCI
if(Output && Merge.Output && All.Pairwise) {
if(ncol(Tab)>52) { Err.Log(Output,"AllPairwise.Merge") }
}
##### HLA specific checks
#Check for the updated ExonPtnList 'UpdatePtnList' and use if found.
UpdatePtnList <- NULL
UPL <- paste(path.package('BIGDAWG'),"/data/UpdatePtnAlign.RData",sep="")
if( file.exists(UPL) ) {
load(UPL)
EPL <- UpdatePtnList
rm(UpdatePtnList)
UPL.flag=T
} else {
rm(UpdatePtnList,UPL)
EPL <- BIGDAWG::ExonPtnList
UPL.flag=F }
if(Trim & !HLA) { Err.Log(Output,"NotHLA.Trim") }
if(EVS.rm & !HLA) { Err.Log(Output,"NotHLA.EVS.rm") }
if(!HLA) { DRBFLAG <- NULL } else { DRB345.test <- length(grep("DRB345",colnames(Tab)))>0 }
if(HLA) {
if(Trim | EVS.rm | "A" %in% Run | DRB345.test ) { cat("Running HLA specific check functions...\n") }
# Check Locus*Allele Formatting across all loci
CheckCol <- sum( unlist( apply(Tab[,Data.Col], MARGIN=c(1,2), FUN = function(x) grepl("\\*",na.omit(x))) ) )
TotalCol <- ( dim(Tab[,Data.Col])[1] * dim(Tab[,Data.Col])[2] ) - length(which(Tab[,Data.Col]=="^")) - sum(is.na(Tab[,Data.Col]))
if( CheckCol>0 && CheckCol!=TotalCol ) {
Err.Log(Output,"Bad.Format.HLA")
stop("Analysis Stopped.",call. = F)
}
# Separate DRB345 if exists as single column pair and check zygosity
if(DRB345.test) {
cat("Processing DRB345 column data.\n")
DRBFLAG <- T
# Expand DRB3/4/5 to separate column pairs
Tab <- DRB345.parser(Tab)
colnames(Tab) <- sapply(colnames(Tab),FUN=gsub,pattern="\\.1",replacement="")
# Redefine Data Columns
Data.Col <- seq(3,ncol(Tab))
# Define DR Loci to Process
getCol <- grep("DRB",colnames(Tab))
Loci.DR <- unique(colnames(Tab)[getCol])
# Process Loci
Tab.list <- lapply(seq_len(nrow(Tab)), FUN=function(z) Tab[z,getCol])
Tab.tmp <- mclapply(Tab.list,FUN=DRB345.Check.Wrapper,Loci.DR=Loci.DR,mc.cores=Cores)
Tab.tmp <- do.call(rbind,Tab.tmp)
Tab[,getCol] <- Tab.tmp[,grep("DRB",colnames(Tab.tmp))]
Tab <- cbind(Tab,Tab.tmp[,'DR.HapFlag']) ; colnames(Tab)[ncol(Tab)] <- "DR.HapFlag"
#Identify DR345 flagged haplotypes and Write to File
DR.Flags <- Tab[which(Tab[,'DR.HapFlag']!=""),c(1,2,getCol,ncol(Tab))] ; row.names(DR.Flags) <- NULL
if(Output) {
if(!is.null(DR.Flags)) {
Err.Log(Output,"Bad.DRB345.hap") ; cat("\n")
write.table(DR.Flags,file="Flagged_DRB345_Haplotypes.txt",sep="\t",quote=F,row.names=F,col.names=T)
}
}
cat("\n")
} else { DRBFLAG <- F }
# Separate locus and allele names if data is formatted as Loci*Allele
Tab[,Data.Col] <- apply(Tab[,Data.Col],MARGIN=c(1,2),FUN=Stripper)
# Sanity Check for Resoltion if Trim="T" and Trim Data
if(Trim & CheckHLA(Tab[,Data.Col])) {
cat("--Trimming Data.\n")
#Tab.untrim <- Tab
Tab[,Data.Col] <- apply(Tab[,Data.Col],MARGIN=c(1,2),GetField,Res=Res)
rownames(Tab) <- NULL
} else if (Trim) {
Err.Log(Output,"Bad.Format.Trim")
stop("Analysis Stopped.",call. = F)
}
# Sanity Check for Expresion Variant Suffix Stripping
if(EVS.rm & CheckHLA(Tab[,Data.Col])) {
cat("--Stripping Expression Variants Suffixes.\n")
Tab[,Data.Col] <- apply(Tab[,Data.Col],MARGIN=c(1,2),gsub,pattern="[[:alpha:]]",replacement="")
EVS.loci <- as.list(names(EPL))
EPL <- lapply(EVS.loci,EVSremoval,EPList=EPL)
names(EPL) <- EVS.loci ; rm(EVS.loci)
} else if (EVS.rm) {
Err.Log(Output,"Bad.Format.EVS")
stop("Analysis Stopped.",call. = F)
}
# Sanity Check for Amino Acid Test Feasibility
if ("A" %in% Run) {
cat("Running Amino Acid Analysis specific checks functions...\n")
Release <- as.character(unlist(EPL[['Release']]))
# Sanity Check for Known HLA loci in Bundled Database Release
cat(paste("--Checking loci against ",Release,".\n",sep=""))
test <- CheckLoci(names(EPL),unique(colnames(Tab)[Data.Col]))
if( test$Flag ) { Err.Log(Output,"Bad.Locus.HLA",test$Loci) ; stop("Analysis stopped.",call. = F) }
# Sanity Check for Known HLA alleles in Bundled Database Release
cat(paste("--Checking alleles against ",Release,".\n",sep=""))
test <- CheckAlleles(EPL, Tab[,Data.Col])
if( test$Flag ) { Err.Log(Output,"Bad.Allele.HLA",test$Alleles) ; stop("Analysis stopped.",call. = F) }
# Sanity Check for Analysis and HLA Allele Resolution (MUST perform THIS STEP AFTER TRIM!!!!)
if(Res<2 | !CheckHLA(Tab[,Data.Col])) {
Err.Log(Output,"Low.Res")
cat("You have opted to run the amino acid analysis.\n")
stop("Analysis stopped.",call. = F)
}
} # End A if statement
} # End HLA if statement and HLA specific functionalities
# LOCI SET COLUMN DEFINITIONS
# This MUST follow DRB345 processing on the chance that DRB345 is formatted as single column
# and DRB3, DRB4, or DRB5 is defined in Loci.Set.
if(missing(Loci.Set)) {
Set <- list(Data.Col)
} else {
Loci.Set <- lapply(Loci.Set,FUN=function(x) sapply(x,toupper))
Set <- lapply(Loci.Set,FUN=function(x) seq(1,ncol(Tab))[colnames(Tab) %in% x])
}
# ===================================================================================================================================== ####
# Case-Control Summary ________________________________________________________________________________________________________________ ####
cat("\n>>>> CASE - CONTROL SUMMARY STATISTICS\n")
#cat(paste(rep("_",50),collapse=""),"\n")
if (Trim) { rescall <- paste(Res,"-Field",sep="") } else { rescall <- "Not Defined" }
Check <- PreCheck(Tab,colnames(Tab),rescall,HLA,Verbose,Output)
if(Output) { write.table(Check,file="Data_Summary.txt",sep=": ",col.names=F,row.names=T,quote=F); rm(Check,rescall) }
# ===================================================================================================================================== ####
# Write to Parameter File _____________________________________________________________________________________________________________ ####
if(Output) {
if(HLA && !is.null(DRBFLAG)) { DRB345.tmp <- DRBFLAG } else { DRB345.tmp <- NULL }
if(HLA) { Trim.tmp <- Trim } else { Trim.tmp <- NULL }
if(HLA && Trim) { Res.tmp <- Res } else { Res.tmp <- NULL }
if(HLA) { EVS.rm.tmp <- EVS.rm } else { EVS.rm.tmp <- NULL }
Params.Run <- list(Time = format(Sys.time(), "%a %b %d %X %Y"),
BD.Version = as.character(packageVersion("BIGDAWG")),
Cores.Used = Cores,
File = Data.Flag,
Output.Results = Output,
Merge = Merge.Output,
Return.Object = Return,
Display.Results = Verbose,
HLA.Data = HLA,
DRB345.Parsed = DRB345.tmp,
Tests = paste(Run,collapse=","),
All.Pairwise = All.Pairwise,
Trim = Trim.tmp,
Resolution = Res.tmp,
Suffix.Stripping = EVS.rm.tmp,
Missing.Allowed = Missing,
Samples.Removed = length(rows.rm))
Params.Run <- do.call(rbind,Params.Run)
write.table(Params.Run,file="Run_Parameters.txt",sep=": ", row.names=T, col.names=F, quote=F)
}
# ===================================================================================================================================== ####
# Hardy Weignberg Equilibrium 'HWE' ___________________________________________________________________________________________________ ####
if ("HWE" %in% Run) {
cat("\n>>>> STARTING HARDY-WEINBERG ANALYSIS...\n")
#cat(paste(rep("_",50),collapse=""),"\n")
if(HLA && Trim) {
cat("HWE performed at user defined resolution.\n")
} else if (HLA) {
cat("HWE performed at maximum available resolution.\n")
}
HWE <- HWE.wrapper(Tab,colnames(Tab),Output,Verbose)
BD.out[['HWE']] <- HWE
rm(HWE)
} #END HARDY-WEINBERG
# ===================================================================================================================================== ####
# Set Loop Begin (loop through each defined locus/loci set) ___________________________________________________________________________ ####
if ( sum( c("H","L","A") %in% Run ) > 0 ) {
cat("\n>>>>>>>>>>>>>>>>>>>>>>>>> Begin Locus Sets <<<<<<<<<<<<<<<<<<<<<<<<<\n\n")
if(length(Set)==1) {
cat("Your analysis has a 1 set to analyze.\n")
} else {
cat(paste("Your analysis has ", length(Set), " sets to analyze.", sep=""),"\n")
}
for(k in 1:length(Set)) {
cat("\n")
cat(paste(rep(">",35),collapse=""),"Running Set",k,"\n")
cols <- Set[[k]]
Tabsub <- Tab[,c(1,2,cols)]
#Set Specific Global Variables
SID <- Tabsub[,1] # sample IDs
genos <- Tabsub[,3:ncol(Tabsub)] # genotypes
genos[genos==""] <- NA
grp <- Tabsub[, 2] # phenotype
nGrp0 <- length(which(grp==0))*2 #nalleles
nGrp1 <- length(which(grp==1))*2 #nalleles
loci <- unique(gsub(".1","",colnames(genos),fixed=T)) # name of loci
loci.ColNames <- gsub(".1","",colnames(genos),fixed=T) # column names
nloci <- as.numeric(length(loci)) # number of loci
SetName <- paste('Set',k,sep="")
if(HLA==T) { genos[genos=='^'] <- "00:00" }
if(Output) {
OutSetDir <- paste(OutDir,"/Set",k,sep="")
dir.create(OutSetDir)
setwd(OutSetDir)
Params.set <- list(Set = paste("Set",k),
Loci.Run = paste(loci,collapse=","))
Params.set <- do.call(rbind,Params.set)
write.table(Params.set,file="Set_Parameters.txt",sep=": ", row.names=T, col.names=F, quote=F)
}
SAFE <- c(ls(),"SAFE")
# ===================================================================================================================================== ####
# Haplotype Analysis 'H' ______________________________________________________________________________________________________________ ####
if ("H" %in% Run) {
#cat(paste(rep("_",50),collapse="","\n"))
# Sanity check for set length and All.Pairwise=T
if (nloci<2) {
Err.Log(Output,"Loci.No")
stop("Analysis Stopped.", call. = F)
} else if (All.Pairwise & nloci<=2) {
Err.Log(Output,"Loci.No.AP")
stop("Analysis Stopped.", call. = F) }
Haps.list <- H.MC.wrapper(SID,Tabsub,loci,loci.ColNames,genos,grp,All.Pairwise,Output,Verbose,Cores)
if(All.Pairwise) {
if(length(BD.out[['H']])>0) { BD.out[['H']] <- c(BD.out[['H']],Haps.list) } else { BD.out[['H']] <- Haps.list }
} else {
BD.out[['H']][[SetName]] <- Haps.list
}
rm(list=ls()[!(ls() %in% SAFE)])
} #END HAPLOTYPE
# ===================================================================================================================================== ####
# Locus Level 'L' _____________________________________________________________________________________________________________________ ####
if ("L" %in% Run) {
#cat(paste(rep("_",50),collapse=""))
L.list <- L.wrapper(nloci,loci,loci.ColNames,genos,grp,nGrp0,nGrp1,Output,Verbose)
BD.out[['L']][[SetName]] <- list(binned=L.list[['AB']],
freq=L.list[['AF']],
OR=L.list[['OR']],
chisq=L.list[['CS']],
table=L.list[['FB']])
rm(list=ls()[!(ls() %in% SAFE)])
} #END LOCUS
# ===================================================================================================================================== ####
# Amino Acid Level 'A' ________________________________________________________________________________________________________________ ####
if(HLA) {
if ("A" %in% Run) {
#cat(paste(rep("_",50),collapse=""))
if(UPL.flag) { cat("Using updated protein exon alignments for amino acid analysis.\n") }
A.list <- A.wrapper(loci,loci.ColNames,genos,grp,nGrp0,nGrp1,EPL,Cores,Output,Verbose)
if(Output) {
## write to file
write.table(Release, file = "Set_Parameters.txt", sep="\t", row.names = F, col.names=F, quote = F, append=T)
}
BD.out[['A']][[SetName]] <- list(log=A.list[['AL']],
binned=A.list[['AB']],
freq=A.list[['AF']],
OR=A.list[['OR']],
chisq=A.list[['CS']],
table=A.list[['FB']])
rm(list=ls()[!(ls() %in% SAFE)])
} #END AMINO ACID
}#END if(HLA)
# ===================================================================================================================================== ####
# End Analyses ________________________________________________________________________________________________________________________ ####
}; rm(k)
}# END SET LOOP
if(Output) {
if(Merge.Output) {
cat("\nMerging data files ...\n")
if("HWE" %in% Run) { Run <- Run[-which(Run=="HWE")] }
if( length(Run)>=1 ) { MergeData_Output(BD.out,Run,OutDir) }
}
}
# ===================================================================================================================================== ####
cat("\n>>>>>>>>>>>>>>>>>>>>>>>>>> End Analysis <<<<<<<<<<<<<<<<<<<<<<<<<<\n")
if(Output) { setwd(OutDir); save(BD.out, file="Analysis.RData") }
options(warn=0)
if(Return) { return(BD.out) }
}# END FUNCTION
# last update: 06/22/18
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.