R/hydrophobicity_plot.R
In Fiscore: Effective Protein Structural Data Visualisation and Exploration

Documented in hydrophobicity_plot

#' @title hydrophobicity_plot
#'
#' @description Function to plot amino acid sequence hydrophobicity profile using Kyte-Doolittle hydrophobicity scale; Reference= J. Mol. Biol. 157=105-132;1982. The Kyte-Doolittle scale is used for detecting hydrophobic regions in proteins. Regions with a positive value are hydrophobic ans those with negative values are hydrophylic. This scale can be used to identify both surface-exposed regions as well as transmembrane regions, depending on the window size used.
#'
#' @param pdb_df Requires a pdb data frame generated by PDB_prepare
#' @param window Size of a window between 3 and 21, default is 21
#' @param weight Relative weight of the window edges compared to the window center in percent; default=100
#' @param model Weight variation model either "linear" or "exponential", if the relative weight at the edges is selected to be < 100 percent; default="linear"
#' @return  Scaled line graph
#' @ImportFrom ggplot2 aes
#' @ImportFrom ggplot2 ggplot
#' @ImportFrom ggplot2 geom_line
#' @ImportFrom ggplot2 scale_x_continuous
#' @ImportFrom ggplot2 xlab
#' @ImportFrom ggplot2 ylab
#' @ImportFrom ggplot2 ggtitle
#' @export
#' @examples
#' path_to_processed_PDB<- system.file("extdata", "pdb_df.tabular", package="Fiscore")
#' # basic usage of hydrophobicity_plot
#' pdb_df<-read.table(path_to_processed_PDB)
#' hydrophobicity_plot(pdb_df)
hydrophobicity_plot<-function(pdb_df, window=3, weight=100, model="exponential"){



#Kyte-Doolittle hydrophobicity scale values------
  KD_values<-list("ALA"= 1.800,"ARG"= -4.500,"ASN"= -3.500,"ASP"= -3.500, "CYS"=  2.500, "GLN"= -3.500, "GLU"= -3.500,"GLY"= -0.400,"HIS"= -3.200,"ILE"=  4.500, "LEU"=  3.800,"LYS"= -3.900,"MET"=  1.900, "PHE"=  2.800,  "PRO"= -1.600,"SER"= -0.800,"THR"= -0.700,"TRP"= -0.900,"TYR"= -1.300, "VAL"=  4.200)

#Warnings------
  if(((window<3) || (window>21))||(window%%2==0)) {stop("Please select a window size between values 3 and 21 where the value is odd")}
  if((weight<0) || (weight>100)) {stop("Please select weight values as a percentage, e.g. 50 for 50%")}
  if(!(model %in%c("exponential","linear"))) {stop("You can only select linear or exponential models, e.g., 'exponential' ")}


#Helper functions--------

  #***linear fit-----

  linear_fit<-function(value, size){

    #value - weight
    #size - window size
    #function k*x+b=y
    k=(100-value)/((size-1)/2) #establish a coefficient
    b= value-k     #calculate the intercept for linear function
    values<-c()
    #calculate k>0 values
    for(i in 1:((size-1)/2)){
      values<-c(values,k*i+b)
    }

    # add 100%
    values<-c(values,100, rev(values))
    # add the reverse values to complete the window


    return(values)
  }

  #***linear fit-----

  exp_fit<-function(value, size){

    #value - weight
    #size - window size
    #function a*b^x=y
    b=(100/value)^(2/(size-1))
    a= value/b     #calculate the intercept for linear function
    values<-c()
    #calculate k>0 values
    for(i in 1:((size-1)/2)){
      values<-c(values,a*b^i)
    }

    # add 100%
    values<-c(values,100, rev(values))
    # add the reverse values to complete the window


    return(values)
  }

  MINMAX_normalization_func<-function(array){

    #MIN-MAX normalisation based on the input array
    #input numeric array
    #returns normalised array values

    return ((array-min(array))/(max(array)-min(array)))
  }

#Extract Protein sequence and build temporary data frame for analysis

  df<-pdb_df[,c("df_resno","df_res" , "Type")]
  rownames(df)<-1:nrow(df)
  df$"Type"<-as.factor(df$"Type")

#Prepare weights
  if(model=="exponential"){values<-exp_fit(weight, window)  }
  if(model=="linear"){  values<-linear_fit(weight, window)  }
  values<-values/100 #scale values
  scores<-c()

  weight_dist<-c()
  half_window<-(window-1)/2

  #process data within sliding window, i.e. amino acids that can be analysed this way
  for(i in 1+half_window:(nrow(df)-half_window-1)){
    scores_temp<-c()
    score<-0
    #calculate score for amino acid based on the window
    for(j in (i-half_window):(i+half_window)){
      aa<-df[j,"df_res"]
      scores_temp<-c(scores_temp,KD_values[[aa]])
    }
    #add weighed scores for amino acid where values provide earlier calculated scores
    for(n in 1:length(values)){
      score<-score+scores_temp[n]*values[n]    }

     scores<-c(scores,score)

  }

# add missing residues for beggining and end of the protein by just adding the value from the scale without weighing
  scores_temp<-c()
  for(i in 1:half_window){
    aa<-df[i,"df_res"]

    scores_temp<-c(scores_temp,KD_values[[aa]])
  }
  scores<-c(scores_temp,scores)
  scores_temp<-c()
  for(i in (nrow(df)-half_window+1):nrow(df)){
    aa<-df[i,"df_res"]
    scores_temp<-c(scores_temp,KD_values[[aa]])
  }
  scores<-c(scores,scores_temp)

#scale scores
  normalised_scores<-MINMAX_normalization_func(scores)

  df$"Score"<-normalised_scores

  #to avoid namespace conflucts
  df_resno_val<-df$"df_resno"
  Score_val<-df$"Score"
  Type_val<-df$"Type"

  #Plot will capture breakage regions with empty
ggplot2::ggplot() +ggplot2::geom_line(data = df,ggplot2::aes(df_resno_val, Score_val, group = 1, color = Type_val))+ggplot2::scale_x_continuous(breaks=seq(min(df$"df_resno"),max(df$"df_resno"),25), limits=c(min(df$"df_resno"),max(df$"df_resno")))+ggplot2::ggtitle(label="Kyte-Doolittle hydrophobicity plot")+ ggplot2::xlab(label = "Residue number")+ggplot2::ylab(label = "Normalised score")






}