R/LD.heatmap.R
In pould: Phased or Unphased Linkage Disequilibrium

Documented in LD.heat.map

## LD.heat.map -- Steven J. Mack April 6, 2020 v1.0
## Displays a heatmap of the matrix for phased and unphased LD values returned by LDWrap()

#' Generates heat-maps for four linkage disequilibrium (LD) values ({D'}, {Wn}, {WLoc1/Loc2} and {WLoc2/Loc1}) generated for all pairs of phased and unphased two-locus haplotypes by LDWrap().
#' 
#' This function accepts *_LD_results.csv files generated by LDWrap() as input, and generates a PNG-formatted heat-map plot file for each LD measure.
#' 
#' @param dataName The "base" name of the _LD_result.csv files generated by LDWrap() without the "_Phased_LD_results.csv" or "_Unphased_LD_results.csv" suffixes. See Examples, below. If both corresponding "<dataName>_Phased_LD_results.csv" and "<dataName>_Unphased_LD_results.csv" files are not found, the funciton will halt with a notification. However, if only one of those files is found in the working director, half-matrix heat map plots will be generated.
#' @param phasedData The complete name of a file of phased LD results generated by LDWrap(). Provide this filename if no base name is provided for dataName and you want to generate heat-maps for a specific set of phased LD values.
#' @param unphasedData The complete name of a file of unphased LD results generated by LDWrap(). Provide this filename if no base name is provided for dataName and you want to generate heat-maps for a specific set of unphased LD values.
#' @param phasedLabel The label that should appear on the heat-map plots for the upper, phased half of the plot. The default option is 'Phased'.
#' @param unphasedLabel The label that should appear on the heat-map plots for the lower, unphased half of the plot. The default option is 'EM-estimated'.
#' @param color A logical parameter that identifies if the heat-maps should be plotted in color (TRUE) or greyscale (FALSE). The default option is TRUE. 
#' @param writePlot A logical parameter that identifies if the heat-map plots should be automatically saved after they are generated. The default is 'writePlot=FALSE'.
#' @param writeDir The directory into which the heat-map plots should be saved when 'writePlot=TRUE'. The default is the directory specified by tempdir().
#' @keywords heat map linkage disequilibrium
#' @importFrom utils read.csv
#' @importFrom ggplot2 ggsave ggplot geom_tile geom_text scale_fill_gradient2 theme_minimal theme guides xlab ggtitle coord_fixed aes element_text element_blank guide_colorbar
#' @importFrom reshape2 melt
#' @keywords linkage disequilibrium heat map heat-map heatmap
#' @export
#' @examples 
#' # Using the results of LDWrap() for the first 10 rows of the drb1.dqb1.demo dataset.
#' # Results are saved in the temporary directory as 
#' # "hla-family-data_Phased_LD_results.csv" and 
#' # "hla-family-data_Unphased_LD_results.csv", respectively.
#' LDWrap(drb1.dqb1.demo[1:10,])
#' LDWrap(drb1.dqb1.demo[1:10,],phased=FALSE)
#' exampleData <- paste(tempdir(),"hla-family-data",sep=.Platform$file.sep)
#' LD.heat.map(exampleData)
#' # Alternatively, these files can be sepcified individually to generate a half-matrix. 
#' LD.heat.map(phasedData=paste(exampleData,"_Phased_LD_results.csv",sep=""),unphasedLabel="")
#' # Further, two different sets of results for the same loci can be plotted; e.g., using
#' # phasedData="my_Phased_LD_results.csv" and unphasedData="your_Phased_LD_results.csv".
#' @references Osoegawa et al. Hum Immunol. 2019;80(9):644 https://doi.org/10.1016/j.humimm.2019.05.018

LD.heat.map <- function(dataName="",phasedData="", unphasedData="",phasedLabel="Phased",unphasedLabel="EM-estimated", color=TRUE, writePlot=FALSE, writeDir=tempdir()) {

Var1 <- Var2 <- value <- NULL
## Importing LDWrap() output data and processing to populate matrices
dTop <- dBot <- TRUE
phaseDataRaw <- unphaseDataRaw <- ""
if(dataName!=""){
if(file.exists(paste(dataName,"_Phased_LD_results.csv",sep=""))) { phaseDataRaw <- read.csv(file=paste(dataName,"_Phased_LD_results.csv",sep=""),header = TRUE,stringsAsFactors = FALSE)}
if(file.exists(paste(dataName,"_Unphased_LD_results.csv",sep=""))) { unphaseDataRaw <- read.csv(file=paste(dataName,"_Unphased_LD_results.csv",sep=""),header=TRUE,stringsAsFactors = FALSE)}
  # At lease one file of phased or unphased data must be available
} else {
    if(file.exists(phasedData)) { phaseDataRaw <- read.csv(file=phasedData,header = TRUE,stringsAsFactors = FALSE)}
    if(file.exists(unphasedData)) { unphaseDataRaw <- read.csv(file=unphasedData,header = TRUE,stringsAsFactors = FALSE)}
  }
if(length(phaseDataRaw) == 1) {dTop <- FALSE} 
if(length(unphaseDataRaw) == 1) {dBot <- FALSE} 
if(!dTop && !dBot) { 
  if(dataName!="") { warning(paste("Files ",dataName,"_Phased_LD_results.csv"," and ",dataName,"_Unphased_LD_results.csv"," are not found.",sep=""))
      } else {warning(paste("The specified phased and unphased datafiles are not found.",sep=""))}
  } else {

# Some LDWrap() output files will contain monomorphic positions, and # subects < threshold for specific locus-combinations; these wil result in NA values in column 6.
if(dTop) { phaseDataRaw <- phaseDataRaw[!is.na(phaseDataRaw$N_Haplotypes),]}    ## Be sure to check to see if dTop and bTop are True
if(dBot) { unphaseDataRaw <- unphaseDataRaw[!is.na(unphaseDataRaw$N_Haplotypes),] }
    
# Assuming both phased and unphased files are the same, use whichever you have and extract the locus names
locNames <- unique(as.vector(unlist(strsplit(unlist(ifelse(dTop,phaseDataRaw,unphaseDataRaw)),"~",TRUE))))

plotDo <- TRUE # With support for two distinct files, need to check to make sure the loci are the same
if(dTop && dBot && dataName =="") { 
  locNames2 <- unique(as.vector(unlist(strsplit(unlist(unphaseDataRaw[[1]]),"~",TRUE))))
  if(length(locNames) != length(locNames2)) { 
      plotDo <- FALSE } else {
  if(TRUE %in% (locNames != locNames2)) { plotDo <- FALSE #v0.5 correct evaluation of locus match
      }
    }
  if(!plotDo) {warning(paste("Different loci included in ", phasedData, " and ", unphasedData,". Cannot produce heatmaps.", sep=""))}
    
}

if(plotDo) {
measureNames <- c("D'","Wn","Wa/b","Wb/a")

# building full matrixes for each LD measure 
matrices <- list()
for(i in 1:4) {
  # making 1 matrix for each LD measure 
  matrices[[i]] <- matrix(NA,length(locNames),length(locNames),FALSE,list(locNames,locNames))
  
  # if rows without LD data had to be eliminated, the data columns have to be converted to numeric
  if(dTop) {phaseDataRaw[[i+1]] <- as.numeric(phaseDataRaw[[i+1]])}
  if(dBot) {unphaseDataRaw[[i+1]] <- as.numeric(unphaseDataRaw[[i+1]])}
  
  for(j in 1:(length(locNames)-1)){
    for(k in (j+1):length(locNames)) {
              ### v0.6 Some datasets don't have LD values for all locus pairs. 
              ### This resulted in this match returnig a value of 'numeric(0)', which had length of 0. NA has a length of 1.
    if(dTop) { matrices[[i]][j,k] <- phaseDataRaw[match(paste(locNames[j],locNames[k],sep="~"),phaseDataRaw$Loc1.Loc2,NA_integer_),i+1] }
    if(dBot) { matrices[[i]][k,j] <- unphaseDataRaw[match(paste(locNames[j],locNames[k],sep="~"),unphaseDataRaw$Loc1.Loc2,NA_integer_),i+1] }
      
    }
  }

  ## Building and saving the plots
  #ggplot2
  currTest <- paste(measureNames[i],"value",sep="\n")
  melted <- melt(matrices[[i]],na.rm=TRUE)
 
  minCol <- maxCol <- midCol <- ""
  if(color) { 
    minCol <- "blue"
    midCol <- "white" 
    maxCol <- "red"
    } else {
    minCol <- "grey15"
    midCol <- "grey53" 
    maxCol <- "grey91"
    }
  
  heatPlot <- ggplot(data=melted,aes(x=Var1,y=Var2,fill=value)) + 
    geom_tile(color="white") + 
    geom_text(aes(Var1,Var2,label=formatC(value,2,format="f")), color="black", size=4) +
      scale_fill_gradient2(low=minCol,high=maxCol,mid=midCol,
                         midpoint=0.5,limit=c(0,1),space="Lab",name=currTest) + 
    theme_minimal() +
    theme(axis.text.x = element_text(size=12), axis.text.y = element_text(size=12),
          axis.title.x = element_text(size=12,vjust = -2), axis.title.y = element_blank(),
          panel.grid.major = element_blank(), panel.border = element_blank(),
          panel.background = element_blank(), axis.ticks = element_blank(),
          legend.justification = c(1,0),legend.direction = "vertical", legend.position="right",  
          plot.title=element_text(size=12,hjust=0.5),axis.title=element_text(size=12)) +
    guides(fill=guide_colorbar(barwidth=1,barheight = 7,
          title.position = "top",title.hjust = 0.5)) +
    xlab(label=unphasedLabel) + 
    ggtitle(phasedLabel) +
    coord_fixed() 
 
  plot(heatPlot)
 
  if(writePlot){
                  # If no dataName was specified combine the names of the specified phased and unphased files
  if(dataName=="") {dataName <- paste(gsub(".csv","",phasedData,fixed=TRUE),gsub(".csv","",unphasedData,fixed=TRUE),sep="-")}
  # Save each plot as it is defined
  ggsave(paste(writeDir,paste(basename(dataName),gsub("/","-",measureNames[i],fixed=TRUE),"heatmap.png",sep="_"),sep=.Platform$file.sep),heatPlot,width=7.78,height = 7.39)
        }
      } 
    }
  } 
}