hyQSAR: Hybrid QSAR model

Documented in hat_matrix plot_wp williams_plot williams_plot_linear

#' Plot williams Plot
#'
#' This function perform the random split analysis method
#' @param data_to_plot is a list containing williamps plot data created y the function evaluating_model
#' @keywords plot williams plot
#' @export
#' @examples
#'
plot_wp = function(data_to_plot){
  plot(x = data_to_plot$lev,y = data_to_plot$res,
       col =data_to_plot$col, ylim = c(min(-4.5,data_to_plot$res),max(3.5,data_to_plot$res)),
       xlim = c(0,max(max(data_to_plot$lev),data_to_plot$h_star)+0.1), xlab = "Leverage",ylab = "Standardized Residual")
  abline(v=data_to_plot$h_star)
  abline(h = 3, lty=3)
  abline(h = -3, lty=3)
  legend(x = "bottomright",legend = c("Train","Test"),fill = c("black","red"),bty = "n")
}


#' A function to compute hat matrix
#'
#' This function computes the hat matrix of regression
#' @param X is the dataset matrix with samples on rows and features on columns.
#' @return hat matrix whose diagonal represents the leverage of the regression
#' @keywords hat matrix, leverages
#' @export
#' @examples
#'

hat_matrix = function(X){
  xtx = t(X) %*% X
  diag(xtx) = diag(xtx) + runif(n = length(diag(xtx)),min = 0.001,max = 0.002)
  ixtx = inv(xtx)
  return(X %*% (ixtx %*% t(X)))
}

#' A function to create the williams plot and compute the percentage of applicability domain
#'
#' This function create the williams plot for a specific model and compute the percentage of training and test point falling in the applicability domain
#' @param X_train is the training dataset matrix with samples on rows and features on columns.
#' @param X_test is the test dataset matrix with samples on rows and features on columns.
#' @param y_train is the vector of the training response.
#' @param y_test is the vector of the test response.
#' @param model is a fitted linear model for which we want to evaluate the applicability domain
#' @param beta is a vector of the relevant beta values for the regression model
#' @return ADVal is a vector with the percentage of training and test sample falling in the applicability domain
#' @return DTP is a list containing the data to plot the williams plot. In particular it contain the leverages (lev)
#' the residual (res), the colors of training/test points (col) and the h* value (h_star)
#' @keywords williams plot, applicability domain
#' @export
#' @examples
#'

williams_plot = function(X_train, X_test, y_train, y_test, model,beta){
  H_train= hat_matrix (rbind(X_train[,names(beta)],X_test[,names(beta)]))
  #H_train= hat_matrix (X_train[,names(beta)])

  y_pred_train = predict(model,X_train)
  y_pred_test = predict(model,X_test)


  residual_train= abs(y_train - y_pred_train)
  residual_test= abs(y_test - y_pred_test)
  s_residual_train = ((residual_train) - mean(residual_train)) / sd(residual_train)
  s_residual_test = (residual_test - mean(residual_test))/ sd(residual_test)

  leverage_train = diag(H_train)[rownames(H_train) %in% rownames(X_train)]
  leverage_test = diag(H_train)[rownames(H_train) %in% rownames(X_test)]

  # idx = which(rownames(H_train) %in% rownames(X_test))
  # leverage_train = leverage_train[-idx]

  p = length(beta) + 1#features
  n = nrow(X_train)#+nrow(X_test)#training compounds
  h_star = (3*p)/n

  AD_train = 100 * (sum(leverage_train < h_star & abs(s_residual_train)<3) / length(leverage_train))
  AD_test = 100 * (sum(leverage_test < h_star & abs(s_residual_test)<3) / length(leverage_test))
  lev=  c(leverage_train,leverage_test)
  res = c(s_residual_train,s_residual_test)
  col = c(rep("black",nrow(X_train)),rep("red",nrow(X_test)))

  data_to_plot = list(lev=lev,res=res,col=col,h_star=h_star)

  return(list(ADVal = c(AD_train,AD_test), DTP = data_to_plot))
}

#' A function to create the williams plot and compute the percentage of applicability domain
#'
#' This function create the williams plot for a specific model and compute the percentage of training and test point falling in the applicability domain
#' @param X_train is the training dataset matrix with samples on rows and features on columns.
#' @param X_test is the test dataset matrix with samples on rows and features on columns.
#' @param y_train is the vector of the training response.
#' @param y_test is the vector of the test response.
#' @param model is a fitted linear model for which we want to evaluate the applicability domain
#' @param beta is a vector of the relevant beta values for the regression model
#' @return ADVal is a vector with the percentage of training and test sample falling in the applicability domain
#' @return DTP is a list containing the data to plot the williams plot. In particular it contain the leverages (lev)
#' the residual (res), the colors of training/test points (col) and the h* value (h_star)
#' @keywords williams plot, applicability domain
#' @export
#' @examples
#'

williams_plot_linear = function(X_train, X_test, y_train, y_test, model){
  H_train= hat_matrix (rbind(X_train,X_test))

  y_pred_train = predict(model,as.data.frame(X_train))
  y_pred_test = predict(model,as.data.frame(X_test))

  residual_train= abs(y_train - y_pred_train)
  residual_test= abs(y_test - y_pred_test)
  s_residual_train = ((residual_train) - mean(residual_train)) / sd(residual_train)
  s_residual_test = (residual_test - mean(residual_test))/ sd(residual_test)

  leverage_train = diag(H_train)[rownames(H_train) %in% rownames(X_train)]
  leverage_test = diag(H_train)[rownames(H_train) %in% rownames(X_test)]

  p = ncol(X_train) + 1#features
  n = nrow(X_train)#+nrow(X_test)#training compounds
  h_star = (3*p)/n

  AD_train = 100 * (sum(leverage_train < h_star & abs(s_residual_train)<3) / length(leverage_train))
  AD_test = 100 * (sum(leverage_test < h_star & abs(s_residual_test)<3) / length(leverage_test))
  lev=  c(leverage_train,leverage_test)
  res = c(s_residual_train,s_residual_test)
  col = c(rep("black",nrow(X_train)),rep("red",nrow(X_test)))

  data_to_plot = list(lev=lev,res=res,col=col,h_star=h_star)

  return(list(ADVal = c(AD_train,AD_test), DTP = data_to_plot))
}