R/PALM.R
In SSHAARP: Searching Shared HLA Amino Acid Residue Prevalence

Documented in PALM

## Population Allele Locating Mapmaker v1 16APR2020
#'Population Allele Locating Mapmaker
#'
#'Produces a frequency heatmap for a specified amino-acid motif, based on the allele frequency data in the Solberg dataset.
#'
#'@param filename The filename of the local copy of the Solberg dataset - the defaulted filename is the solberg_dataset in the SSHAARP package.
#'@param motif An amino acid motif in the following format: Locus*##$~##$~##$, where ## identifies a peptide position, and $ identifies an amino acid residue. Motifs can include any number of amino acids.
#'@param color A logical parameter that identifies if the heat maps should be made in color (TRUE) or gray scale (FALSE). The default option is TRUE.
#'@param filterMigrant A logical parameter that determines if admixed populations (OTH) and migrant populations (i.e. any complexities with the 'mig') should be excluded from the dataset. The default option is TRUE.
#'@param direct The directory into which the map produced is written. The default directory is the user's working directory.
#'
#'@importFrom data.table rbindlist
#'@importFrom gmt gmt.system r2gmt
#'@importFrom DescTools RoundTo
#'@importFrom utils sessionInfo
#'@importFrom dplyr filter %>%
#'@importFrom stringr str_extract_all
#'@importFrom filesstrings file.move
#'
#'@note The produced frequency heatmap is generated by using the Generic Mapping Tools (GMT) R Package, which is an interface between R and the GMT Map-Making software.
#'@note The Solberg dataset is the tab-delimited ‘1-locus-alleles.dat’ text file in the results.zip archive at http://pypop.org/popdata/.
#'@note The Solberg dataset is also prepackaged into SSHAARP as 'solberg_dataset'.
#'@note While the map legend identifies the highest frequency value, values in this range may not be represented on the map due to frequency averaging over neighboring populations.
#'@export
#'
#'@return The specified motif and the directory into which the heatmap was written are returned in an invisible character vector. If the user enters a motif that is not found in the Solberg dataset, or that does not exist, a warning message is returned. If an incorrectly formatted motif is entered, or the user does not have the GMT software installed on their operating system, a vector with a warning message is returned. The produced heatmap is written to the user's specified directory (default is user's working directory) as a .jpg file, where the filename is "'motif'.jpg".
#'
#'@examples
#'#example to produce a color frequency heat map where migrant populations are filtered out
#'PALM("DRB1*26F~28E~30Y",filename = solberg_dataset[85:100,], filterMigrant=TRUE)
#'#example to produce a greyscale heat map where migrant populations are not filtered out
#'PALM("DRB1*26F~28E~30Y", filename = solberg_dataset[85:100,], color=FALSE, filterMigrant=FALSE)
#'
#'@references Solberg et.al. (2008) <doi: 10.1016/j.humimm.2008.05.001>
#'
#'
PALM<-function(motif, filename=SSHAARP::solberg_dataset, direct=getwd(), color=TRUE, filterMigrant=TRUE){

  wd <- getwd()
  on.exit(setwd(wd))

  #checks if the directory user entered is valid
  if(dir.exists(direct)==FALSE){
    return(warning("The directory you specified does not exist. Please enter a valid directory to write your map to."))
  }

  OS<-strsplit(sessionInfo()$running, " ")[[1]][[1]]

  #uses dataSubset to read and manipulate the Solberg dataset
  solberg_DS<-suppressWarnings(dataSubset(motif, filename))

  #if output of solberg_DS after dataSubset is not a dataframe, it is an error
  #return solberg_DS, which contains the error message
  if(is.data.frame(solberg_DS)==FALSE){
    return(warning(solberg_DS))
  }

  #filters out migrant populations if filter_migrant==TRUE
  if(filterMigrant==TRUE){
    #filters out admixed, migrant OTH populations
    solberg_DS<-solberg_DS[solberg_DS$contin!="OTH",]

    #filters out migrant populations in complexity column
    solberg_DS<-solberg_DS[which((grepl("mig", solberg_DS$complex))==FALSE),]}

  if(filterMigrant==FALSE){
    solberg_DS<-solberg_DS}

  fMresults<-suppressWarnings(findMotif(motif))

  if(is.vector(fMresults)==TRUE){
    return(warning(fMresults))
  }

  else{
    #makes an empty list named unique_AWM, where the name of each element is after a unique AWM,
    #which is acquired by using the motif_finder function
    unique_AWM<-sapply(unique(fMresults$trimmed_allele), function(x) NULL)}

  if(length(unique_AWM)==0){
    return(warning(paste(motif, "No alleles possess this motif", sep =" : ")))
  }

  #finds unique_AWMs in Solberg dataset and extracts the allele frequency and locus_allele column
  for(y in 1:length(unique_AWM)){
    unique_AWM[[y]]<-solberg_DS[,c(2,3,4,5,6,11,14)][solberg_DS$locus_allele %in% names(unique_AWM[y]),]}

  #subsets out locus_allele pairs with the motif from the alignment but aren't present in the Solberg ds
  unique_AWM<-unique_AWM[sapply(unique_AWM, nrow)>0]

  if(length(unique_AWM)==0){
    return(warning(paste(motif, "No alleles possess this motif", sep =" : ")))
  }

  #melts dataframes in list into one big dataframe
  unique_AWM<-rbindlist(unique_AWM)

  #reorders dataframe based on popname alphabetical order
  unique_AWM<-unique_AWM[order(unique_AWM$popname),]

  #makes row names NULL so they are back in sequential enumeration
  row.names(unique_AWM)<-NULL

  #creates a variable named Population Allele Frequencies (PAF), where each element is named after a
  #unique popname for the targeted locus
  #makes each element a list to take in allele frequencies in the next for loop
  PAF<-sapply(unique(solberg_DS$popname), function(x) NULL)

  #for loop for finding allele frequencies for each named element in PAF
  #if multiple entries are present for a given population name, the allele frequencies are added up
  #if the population does not have the motif, the allele frequency is 0, since the entry is not found in unique_AWM
  for(i in 1:length(PAF)){
    PAF[[i]]<-sum(as.numeric(unique_AWM$allele.freq[which((names(PAF[i])==unique_AWM$popname))]))}

  #melts PAF into a two columned df
  PAF<-data.frame(allele_freq = unlist(PAF),
                  popname = rep(names(PAF), sapply(PAF, length)), row.names = NULL, stringsAsFactors = F)

  #merges PAF with solberg_DS data to a new variable, tbm_ds
  #which contains summed up allele frequencies, and relevant contin, complex, locus*allele, and coordinate information
  tbm_ds<-merge(PAF, solberg_DS[!duplicated(solberg_DS$popname),], by="popname")[,c("popname","contin", "complex", "latit", "longit", "allele_freq")]


  #specifies certain rows from motif_map_df to go into a new variable, gmt_converted_data
  gmt_converted_data<-tbm_ds[,c(1,4,5,6)]

  #rerranges format
  gmt_converted_data<-gmt_converted_data[,c("longit", "latit", "allele_freq", "popname")]

  #if max allele frequency has more than 3 leading 0's after the decimal point, return message that map cannot be made
  if((length(str_extract_all(strsplit(as.character(max(gmt_converted_data$allele_freq)), ".", fixed = T)[[1]][[2]], "0")[[1]])>=3) & (strsplit(as.character(max(gmt_converted_data$allele_freq)), ".", fixed = T)[[1]][[1]]==0)){
    #obtain max allele information to see if it is large enough to produce an accurate map for
    max_allele_info<-gmt_converted_data %>% filter(gmt_converted_data$allele_freq==max(gmt_converted_data$allele_freq))

    #convert latit and longit back to
    ifelse(grepl("-", max_allele_info$longit)==TRUE, max_allele_info$longit<-paste(max_allele_info$longit, "W", sep=""), max_allele_info$longit<-paste(max_allele_info$longit, "E", sep=""))
    max_allele_info$longit<-gsub("-", "", max_allele_info$longit)

    ifelse(grepl("-", max_allele_info$latit)==TRUE, max_allele_info$latit<-paste(max_allele_info$latit, "S", sep=""), max_allele_info$latit<-paste(max_allele_info$latit, "N", sep=""))
    max_allele_info$latit<-gsub("-", "", max_allele_info$latit)

    return(paste(paste(paste("The motif you entered has a maximum allele frequency of", max_allele_info$allele_freq), "which is too low of a frequency to produce an accurate map for.", sep=", "), "There are/is", nrow(gmt_converted_data), "population/populations at or below this frequency. The most frequent is found at", max_allele_info$longit, max_allele_info$latit,"."))
  }

  #set wd to tempdir() to store output files
  setwd(tempdir())

  #Windows specific arguments on extracting motif and allele frequency information
  if(OS=="Windows"){

    max_allelefreq<-max(sort(gmt_converted_data$allele_freq))

    #calls function to convert R object gmt_converted_data to GMT formatted data
    #outputs converted data into local environment
    write(max_allelefreq, "upperbound")

    #changes asterisk to '-' in filename since *s are forbidden in Windows file names
    write(gsub("\\*", "-", motif), "filename")
  }


  #writes motif to working directory so bash script can extract
  #and use it as the title for the map
  write(motif, "motif")

  #calls function to convert R object gmt_converted_data to GMT formatted data
  #outputs converted data into local environment
  r2gmt(gmt_converted_data, "motif.xyz")


  #checks if user's machine has GMT software installed
  #if not, an error message is returned and the function stops
  gmtOut<-tryCatch({gmt.system("gmt blockmean motif.xyz -R-180/180/-60/80 -I3 > motif.block")}, error=function(cond){gmtOut<<-"Error"})

  if(length(gmtOut)==1){
    return("The GMT software is not installed at the level of your operating system. Please install GMT from http://gmt.soest.hawaii.edu/projects/gmt/wiki/Installing in order to use this function. Refer to the vignette for more information.")
  }

  #finds blockmean of data
  #-60 for lower bound lat
  #+80 for upper bound lat
  #I for every 3x3 mean value, as obtained from gmt.sh file
  gmt.system("gmt blockmean motif.xyz -R-180/180/-60/80 -I3 > motif.block")

  #grids table using surface
  gmt.system("gmt surface motif.block -R-180/180/-60/80 -Gmotif.grd -I0.5 -T.7 -Ll0")



  ###############WINDOWS SPECIFIC COMMAND LINE ARGUMENTS FOR MAP GENERATION###############

  if(OS=="Windows"){

    #creates basemap with correct motif as title, and output file as the motif but with a '-'
    #separating the locus and amino acid combination; this is done because asterisks are
    #forbidden in Windows naming conventions
    shell("for /f 'usebackq' %C in (`type filename`) do for /f 'usebackq' %A in (`type motif`) do gmt psbasemap -JM6i -R-180/180/-60/80 -B0 -B+t%A -K >> %C.ps")

    if(color==TRUE){
      #uses white background to fill landmasses if color=T
      shell("for /f 'usebackq' %C in (`type filename`) do gmt pscoast -JM6i -R-180/180/-60/80 -A30000 -B0 -G200 -W0.25p -O -K >> %C.ps")
    }

    #uses a hashed background to fill landmasses if color=F
    if(color==FALSE){
      shell("for /f 'usebackq' %C in (`type filename`) do gmt pscoast -JM6i -R-180/180/-60/80 -A30000 -B0 -Gp61 -W0.25p -O -K >> %C.ps")
    }


    #uses readLines to obtain rounded upperbound information; rounds the last digit to the nearest 0 or 5
    upper_round<-shortRoundUp(as.numeric(readLines("upperbound")))
    upper_decile<- shortRoundUp(as.numeric(readLines("upperbound"))/10)


    #the Solberg dataset has some rounding errors that produce an allele frequency greater than 1;
    #for those populations, use a set 1.000 and 0.1 for the decile interval
    if(max(gmt_converted_data$allele_freq) >=1){
      upper_round<-as.numeric("1")
      upper_decile<-as.numeric("0.1")
    }

    #writes max allelic frequency (rounded) and increment needed to form deciles
    write(upper_round, "upper_round")
    write(upper_decile, "upper_decile")


    #color palette generation
    #uses seis color palette if color=T
    if(color==TRUE){
      #makes custom CPT with increments of max_frequency/10 for deciles
      #specifically calls on max_cpt for max frequency and decile increments
      shell("for /f 'usebackq' %E in (`type upper_decile`) do for /f 'usebackq' %D in (`type upper_round`) do gmt makecpt -Cseis -Iz -T0/%D/%E > decile.cpt")
    }

    #uses grayscale color palette if color=F
    if(color==FALSE){
      shell("for /f 'usebackq' %E in (`type upper_decile`) do for /f 'usebackq' %D in (`type upper_round`) do gmt makecpt -Cgray -Iz -T0/%D/%E > decile.cpt")

    }


    #creates a vector called decile_interval, which gives decile increments based on upper_round and upper_decile information
    decile_interval<-seq(0, upper_round, upper_decile)


    #writes decile interval without any line breaks
    cat(decile_interval, file="deciles")

    #use smaller font size if max allele frequency has more digits
    if(nchar(strsplit(as.character(decile_interval[2]), ".", fixed=T)[[1]][[2]]) >=5){
      #adds color scale to basemap based on cpt provided -- makes font on legend smaller
      shell("for /f 'usebackq' %C in (`type filename`) do gmt psscale -D0.1i/1.1i/2i/0.3i -Cdecile.cpt --FONT_ANNOT_PRIMARY=6.5p,Helvetica,black -Np -L -O -K >> %C.ps")
    }

    if(nchar(strsplit(as.character(decile_interval[2]), ".", fixed=T)[[1]][[2]]) <5){
      #adds color scale to basemap based on cpt provided -- makes font on legend smaller
      shell("for /f 'usebackq' %C in (`type filename`) do gmt psscale -D0.1i/1.1i/2i/0.3i -Cdecile.cpt --FONT_ANNOT_PRIMARY=10.0p,Helvetica,black -Np -L -O -K >> %C.ps")
    }

    shell("for /f 'usebackq' %C in (`type filename`) do gmt pscoast -JM6i -R-180/180/-60/80 -A30000 -Gc -O -K >> %C.ps")

    #clips/masks map areas with no data table coverage -- radius of influence increased to 900 km
    shell("for /f 'usebackq' %C in (`type filename`) do gmt psmask motif.xyz -R-180/180/-60/80 -I3 -JM6i -S850k -O -K >> %C.ps")


    #grids .grd file onto map
    shell("for /f 'usebackq' %C in (`type filename`) do gmt grdimage motif.grd -Cdecile.cpt  -JM6i -R-180/180/-60/80 -O -K >> %C.ps")


    #makes a contour map using a .grd file
    shell("for /f 'usebackq' %C in (`type filename`) do gmt grdcontour motif.grd -JM6i -Cdecile.cpt -A- -R-180/180/-60/80 -O -K >> %C.ps")

    #plots longtitude/latitude coordinates onto basemap
    shell("for /f 'usebackq' %C in (`type filename`) do gmt psxy motif.block -R-180/180/-60/80 -JM6i -A -G255 -W0.5p -Sc.05 -O -K >> %C.ps")

    #calls psmask again to terminate clip path with -C parameter
    shell("for /f 'usebackq' %C in (`type filename`) do gmt psmask motif.xyz -R-180/180/-60/80 -I3 -JM6i -S850k -C -O -K >> %C.ps")

    #calls pcoast again to re-establish coastlines and -Q parameter to quit clipping
    shell("for /f 'usebackq' %C in (`type filename`) do gmt pscoast -JM6i -R-180/180/-60/80 -A30000 -W0.5 -O -Q  >>  %C.ps")

    #converts ps map to jpg
    #requires Ghostscript in order to execute command
    shell("for /f 'usebackq' %C in (`type filename`) do gmt psconvert %C.ps -A -Tj -P -Qg4 -E2000")

    motif <-gsub("*", "-", motif, fixed=T)
  }

  ###############MAC SPECIFIC COMMAND LINE ARGUMENTS FOR MAP GENERATION###############
  else{
    #creates basemap with correct motif
    gmt.system("gmt psbasemap -JM6i -R-180/180/-60/80 -B0:.`cat motif`: -K > `cat motif`.ps")

    #uses white background to fill landmasses if color=T
    if(color==TRUE){
      #overlays relevant continents onto basemap
      gmt.system("gmt pscoast -JM6i -R-180/180/-60/80 -A30000 -B0 -G200 -W0.25p -O -K >> `cat motif`.ps")}

    #uses a hashed background to fill landmasses if color=F
    if(color==FALSE){
      gmt.system("gmt pscoast -JM6i -R-180/180/-60/80 -A30000 -B0 -Gp61 -W0.25p -O -K >> `cat motif`.ps")
    }

    #gets upperbound for allele frequencies
    gmt.system("awk '{print $3}' motif.xyz | sort -r | head -1 > upperbound")

    #the Solberg dataset has some rounding errors that produce an allele frequency greater than 1;
    #for those populations, use a set 1.000 and 0.1 for the decile interval
    if(max(gmt_converted_data$allele_freq) >=1){
      cpt_interval<-c(1.000, 0.1)
    }

    #if the max allele frequency for a motif is under 1, use shortRoundUp, which rounds the last
    #digit in a decimal to either a 5 or 10
    if(max(gmt_converted_data$allele_freq) <1){
      #makes a vector called cpt_interval which contains max allelic information and decile increment needed
      #to form deciles
      #uses readLines to obtain upperbound information from bash, and rounds it to the nearest 0.5
      #uses readLines to obtain decile needed
      cpt_interval<-c(shortRoundUp(as.numeric(readLines("upperbound"))), shortRoundUp(as.numeric(readLines("upperbound"))/10))
    }

    #creates a vector called decile_interval, which gives decile increments based on cpt_interval information
    decile_interval<-seq(0, cpt_interval[1], cpt_interval[2])

    #writes max allelic frequency (rounded) and increment needed to form deciles
    write(cpt_interval, "max_cpt")

    #writes decile interval without any line breaks
    cat(decile_interval, file="deciles", sep=",")

    #uses seis color palette if color=T
    if(color==TRUE){
      #makes custom CPT with increments of max_frequency/10 for deciles
      #specifically calls on max_cpt for max frequency and decile increments
      gmt.system("gmt makecpt -Cseis -Iz -T0/`awk '{print $1}' max_cpt`/`awk '{print $2}' max_cpt` > decile.cpt")

    }

    #uses grayscale color palette if color=F
    if(color==FALSE){
      gmt.system("gmt makecpt -Cgray -Iz -T0/`awk '{print $1}' max_cpt`/`awk '{print $2}' max_cpt` > decile.cpt")

    }
    #use smaller font size if max allele frequency has more digits
    if(nchar(strsplit(as.character(decile_interval[2]), ".", fixed=T)[[1]][[2]]) >=5){
      gmt.system("gmt psscale -D0.1i/1.1i/2i/0.3i -Cdecile.cpt --FONT_ANNOT_PRIMARY=6.5p,Helvetica,black -Np -L -O -K >> `cat motif`.ps")
    }

    if(nchar(strsplit(as.character(decile_interval[2]), ".", fixed=T)[[1]][[2]]) <5){
      #adds color scale to basemap based on cpt provided -- makes font on legend smaller
      gmt.system("gmt psscale -D0.1i/1.1i/2i/0.3i -Cdecile.cpt --FONT_ANNOT_PRIMARY=10p,Helvetica,black -Np -L -O -K >> `cat motif`.ps")}

    #overlays more coastlines with pscoast
    gmt.system("gmt pscoast -JM6i -R-180/180/-60/80 -A30000 -Gc -O -K >> `cat motif`.ps")

    #clips/masks map areas with no data table coverage -- radius of influence increased to 900 km
    gmt.system("gmt psmask motif.xyz -R-180/180/-60/80 -I3 -JM6i -S850k -O -K >> `cat motif`.ps")

    #grids .grd file onto map
    gmt.system("gmt grdimage motif.grd -Cdecile.cpt  -JM6i -R-180/180/-60/80 -O -K >> `cat motif`.ps")

    #makes a contour map using a .grd file
    gmt.system("gmt grdcontour motif.grd -JM6i -Cdecile.cpt -A- -R-180/180/-60/80 -O -K >> `cat motif`.ps")

    #plots longtitude/latitude coordinates onto basemap
    gmt.system("gmt psxy motif.block -R-180/180/-60/80 -JM6i -A -G255 -W0.5p -Sc.05 -O -K >> `cat motif`.ps")

    #calls psmask again to terminate clip path with -C parameter
    gmt.system("gmt psmask motif.xyz -R-180/180/-60/80 -I3 -JM6i -S850k -C -O -K >> `cat motif`.ps")

    #calls pcoast again to re-establish coastlines and -Q parameter to quit clipping
    gmt.system("gmt pscoast -JM6i -R-180/180/-60/80 -A30000 -W0.5 -O -Q  >>  `cat motif`.ps")

    #converts ps map to jpg -- saves into local environment
    #requires Ghostscript in order to execute command
    gmt.system("gmt psconvert `cat motif`.ps -A -Tj -P -Qg4 -E2000")

  }

  #move final output of motif.jpg directory specified by the direct parameter; default is
  #set to the user's working directory
  #silence file.move message
  suppressMessages(file.move(paste(tempdir(), .Platform$file.sep , motif, ".jpg", sep=""), direct, overwrite = T))
  invisible(paste(motif,".jpg written to ",direct,".",sep=""))

}

shortRoundUp <- function(frVal){
  valPlus <- (5-as.numeric(substr(as.character(frVal),nchar(as.character(frVal)),nchar(as.character(frVal)))))
  return(ceiling(frVal*10^(nchar(as.character(frVal))-2)+ifelse(valPlus<0,valPlus+5,valPlus))/(10^(nchar(as.character(frVal))-2)))
}