R/qsar_metrics.R

Defines functions validation_metrics validation_metrics_linear ojha_validation_metrics ojha_validation_metrics_linear_model tropsha_metrics compute_slope_and_intercept random_split_Y_scramble Y_scrambled_mse Q2F1_func Q2F2_func Q2F3_func CCC_func R2_func R2_func_2

Documented in CCC_func compute_slope_and_intercept Q2F1_func Q2F2_func Q2F3_func R2_func random_split_Y_scramble validation_metrics validation_metrics_linear Y_scrambled_mse

#' Function to compute the Q2, CCC, Q2F1, Q2F2, Q2F3, and rm2 metrics
#'
#' This function computes the mean Q2, CCC, Q2F1, Q2F2, Q2F3, and rm2 values for a certain number of random split repetition
#' @import glmnet
#' @param y is the reponse vector of the training set
#' @param X is the matrix of the dataset with samples on the rows and features on the column
#' @param train.prop is percentage of point to be used as training
#' @param lambda is the shrinking parameter of the lasso model
#' @param nPerm is the number of permutation to perform
#' @return a vector of numeric values representing the mean Q2, CCC, Q2F1, Q2F2, Q2F3, and rm2
#' @keywords Q2, CCC, Q2F1, Q2F2, Q2F3, and rm2
#' @export
#' @examples
#'

validation_metrics = function(y,X,train.prop=0.9,  nPerm = 100,lambda){
  n = length(y)
  ntrain   <- ceiling(train.prop * n)
  ntest = n-ntrain
  CVmse <- Q2 <- Q2F1 <- Q2F2 <- Q2F3 <- CCC  <- c()
  print("External Measures computation")

  pb = txtProgressBar(min=1,max=nPerm,style=3)

  for(i in 1:nPerm){

    idx_train <- sample(1:n, size = ntrain, replace = FALSE)
    idx_test = 1:n
    idx_test = idx_test[-idx_train]

    X_train_train = X[idx_train,]
    X_train_test = X[-idx_train,]
    train_class = y[idx_train]

    #model with intercept
    fit = glmnet(x = X_train_train,y = train_class,lambda = lambda,intercept = TRUE)
    #prediction with intercept
    yhat = predict(fit,X_train_test)

    RSS <- sum( { y[-idx_train]  -  yhat }^2 )
    TSS <- sum( { y[-idx_train]  -  mean(y) }^2 )

    CVmse = c(CVmse,RSS / ntest)
    Q2 = c(Q2,1 - RSS / TSS)

    Q2F1 = c(Q2F1,Q2F1_func(observed_train = y[idx_train],observed_test = y[-idx_train],predicted_test = yhat))
    Q2F2 = c(Q2F2,Q2F2_func(observed_train = y[idx_train],observed_test = y[-idx_train],predicted_test = yhat))
    Q2F3 = c(Q2F3,Q2F3_func(observed_train = y[idx_train],observed_test = y[-idx_train],predicted_test = yhat))
    CCC  = c(CCC,CCC_func(observed_train = y[idx_train],observed_test = y[-idx_train],predicted_test = yhat))
    setTxtProgressBar(pb,i)
  }
  close(pb)

  Q2 = mean(Q2)
  CCC = mean(CCC)
  Q2F1 = mean(Q2F1)
  Q2F2 = mean(Q2F2)
  Q2F3 = mean(Q2F3)

  return(c(Q2,CCC,Q2F1,Q2F2,Q2F3,0))
}

#' Function to compute the Q2, CCC, Q2F1, Q2F2, Q2F3, and rm2 metrics
#'
#' This function computes the mean Q2, CCC, Q2F1, Q2F2, Q2F3, and rm2 values for a certain number of random split repetition
#' @param y is the reponse vector of the training set
#' @param X is the matrix of the dataset with samples on the rows and features on the column
#' @param train.prop is percentage of point to be used as training
#' @param nPerm is the number of permutation to perform
#' @return a vector of numeric values representing the mean Q2, CCC, Q2F1, Q2F2, Q2F3, and rm2
#' @keywords Q2, CCC, Q2F1, Q2F2, Q2F3, and rm2
#' @export
#' @examples
#'
validation_metrics_linear = function(y,X,train.prop=0.9,  nPerm = 100){
  n = length(y)
  ntrain   <- ceiling(train.prop * n)
  ntest = n-ntrain
  CVmse <- Q2 <- Q2F1 <- Q2F2 <- Q2F3 <- CCC  <- c()
  print("External Measures computation")

  pb = txtProgressBar(min=1,max=nPerm,style=3)

  for(i in 1:nPerm){
    idx_train <- sample(1:n, size = ntrain, replace = FALSE)
    idx_test = 1:n
    idx_test = idx_test[-idx_train]

    X_train_train = X[idx_train,]
    X_train_test = X[-idx_train,]
    train_class = y[idx_train]

    #model with intercept
    fit = lm(train_class~., data = as.data.frame(X_train_train ))
    #prediction with intercept
    yhat = predict(fit,as.data.frame(X_train_test))

    RSS <- sum( { y[-idx_train]  -  yhat }^2 )
    TSS <- sum( { y[-idx_train]  -  mean(y) }^2 )

    CVmse = c(CVmse,RSS / ntest)
    Q2 = c(Q2,1 - RSS / TSS)

    Q2F1 = c(Q2F1,Q2F1_func(observed_train = y[idx_train],observed_test = y[-idx_train],predicted_test = yhat))
    Q2F2 = c(Q2F2,Q2F2_func(observed_train = y[idx_train],observed_test = y[-idx_train],predicted_test = yhat))
    Q2F3 = c(Q2F3,Q2F3_func(observed_train = y[idx_train],observed_test = y[-idx_train],predicted_test = yhat))
    CCC  = c(CCC,CCC_func(observed_train = y[idx_train],observed_test = y[-idx_train],predicted_test = yhat))
    setTxtProgressBar(pb,i)
  }
  close(pb)

  Q2 = mean(Q2)
  CCC = mean(CCC)
  Q2F1 = mean(Q2F1)
  Q2F2 = mean(Q2F2)
  Q2F3 = mean(Q2F3)

  return(c(Q2,CCC,Q2F1,Q2F2,Q2F3,0))
}


# compute r2 and delta_r2 metrics
ojha_validation_metrics = function(X_train, X_test, y_train, y_test,lambda){
    #model with intercept
    print("    fit glmnet with intercept")
    fit = glmnet(x = X_train,y = y_train,lambda = lambda,intercept = TRUE,nlambda = 10)
    #model without intercept
    print("    fit glmnet with no intercept")
    fit0 = glmnet(x = X_train,y = y_train,lambda = lambda,intercept = FALSE,nlambda = 10)
    #prediction with intercept
    yhat = predict(fit,X_test)
    #prediction without intercept
    yhat0 = predict(fit0,X_test)

    print("computed prediciton")
    r_2 = R2_func(observed = y_test,predicted = yhat)
    r_20 = R2_func(observed = y_test,predicted = yhat0)
    if((r_2 - r_20) == 0){
      rm2 = r_2
    }else{
      rm2 = r_2 * (1 - sqrt(abs(r_2 - r_20)))
    }

    r_2p = summary(lm(y_test~yhat))$adj.r.squared
    r_20p = summary(lm(y_test~yhat0))$adj.r.squared
    print("computed summary")
    if((r_2 - r_20) == 0){
      rm2p = r_2p
    }else{
      rm2p = r_2p * (1 - sqrt(abs(r_2p - r_20p)))
    }
    dr2m = abs(rm2-rm2p)
    print("end")

  return(c(rm2=rm2,dr2m=dr2m))
}

# compute r2 and delta_r2 metrics
ojha_validation_metrics_linear_model = function(X_train, X_test, y_train, y_test){
  #model with intercept
  fit = lm(formula = y_train~.,data = as.data.frame(X_train))
  #model without intercept
  fit0 = lm(formula = y_train~.-1,data = as.data.frame(X_train))
  #prediction with intercept
  yhat = predict(fit,as.data.frame(X_test))
  #prediction without intercept
  yhat0 = predict(fit0,as.data.frame(X_test))

  r_2 = R2_func(observed = y_test,predicted = yhat)
  r_20 = R2_func(observed = y_test,predicted = yhat0)

  if((r_2 - r_20) == 0){
    rm2 = r_2
  }else{
    rm2 = r_2 * (1 - sqrt(abs(r_2 - r_20)))
  }

  r_2p = summary(lm(y_test~yhat))$adj.r.squared
  r_20p = summary(lm(y_test~yhat0))$adj.r.squared

  if((r_2 - r_20) == 0){
    rm2p = r_2p
  }else{
    rm2p = r_2p * (1 - sqrt(abs(r_2p - r_20p)))
  }

  dr2m = abs(rm2-rm2p)
  return(c(rm2=rm2,dr2m=dr2m))
}


# compute thropsha metrics
tropsha_metrics = function(observed, predicted){

  ### data for tropsha criterias
  k = sum(observed * predicted)/sum(predicted^2) # sum(observed * predicted)/sum(predicted^2)
  k1 = sum(observed * predicted)/sum(observed^2) # sum(observed * predicted)/sum(observed^2)
  yr0 = k * predicted # regression line through the origin defined by yr0 = k * predicted
  yr0_ = k1 * observed # regression line through the origin defined by yr0_ = k1 * observed

  low = min(min(predicted), min(observed))
  up = max(max(predicted),max(observed))
  plot(predicted, observed,ylim = c(low,up),xlim=c(low,up))
  abline(v=0, lty=3)
  abline(h=0, lty=3)
  lines(predicted, yr0)
  if(sd(predicted)>0.01){
    abline(lm(observed~predicted), col="red")
  }else{
    abline(v = mean(predicted), col = "red")
  }

  plot(observed, predicted, ylim = c(low,up),xlim=c(low,up))
  abline(v=0, lty=3)
  abline(h=0, lty=3)
  lines(observed, yr0_)
  abline(lm(predicted~observed), col="red")

  R20 = 1 - ( (sum(predicted-yr0)^2) / (sum((predicted-mean(predicted))^2)) )
  R201 = 1 - ( (sum(observed-yr0_)^2) / (sum((observed-mean(observed))^2)) )

  R = (sum((observed - mean(observed))*(predicted-mean(predicted))))/(sqrt((sum((observed-mean(observed))^2))*(sum((predicted-mean(predicted))^2))))

  trm1 = ((R^2 - R20)/R^2)
  trm2 = ((R^2 - R201)/R^2)
  trm3 = abs(R^2 - R201)

  return(c(trm1 = trm1, k = k, trm2 = trm2, k1=k1, trm3 = trm3))
}


#' this function compute slope a and intercept b for the regression line
#' yr = ay + b for the Golbraikh and Tropsha criteria, from Beware of q2!
#' Alexander Golbraikh, Alexander Tropsha (2002)

#' @param predicted vector of predicted values
#' @param observed vector of observed values
#' @return a vector conntaining the slope and intercept values
compute_slope_and_intercept = function(predicted, observed){
  num = sum((predicted - mean(predicted)) * (observed - mean(observed)))
  den = sum(predicted - mean(predicted))^2
  a = num/den

  b = mean(observed) - a * mean(predicted)

  return(c(slope = a, intercept = b))
}

#' Function to compute the scrambled R2 and Q2
#'
#' This function computes the mean R2 and Q2 values for a certain number of repetition of the scrambling test
#' @param y is the reponse vector of the training set
#' @param X is the matrix of the dataset with samples on the rows and features on the column
#' @param train.prop is percentage of point to be used as training
#' @param lambda is the shrinking parameter of the lasso model
#' @param nPerm is the number of permutation to perform
#' @return a vector of numeric values representing the mean R2 and Q2 for the scrambling test
#' @keywords R2, Q2, scrambling test
#' @export
#' @examples
#'
random_split_Y_scramble = function(y,X,train.prop=0.9,  nPerm = 100,lambda){
  n = length(y)
  ntrain   <- ceiling(train.prop * n)
  ntest = n-ntrain
  R2 <- Q2 <-  c()
  print("Y scambling")
  pb = txtProgressBar(min=1,max=nPerm,style=3)
  for(i in 1:nPerm){
    y = sample(y)

    fit0 = glmnet(x = X,y = y,lambda = lambda,intercept = TRUE)
    yhat0 = predict(fit0,X)
    RSS <- sum( { y  -  yhat0 }^2 )
    TSS <- sum( { y  -  mean(y) }^2 )
    R2 = c(R2,1-(RSS/TSS))

    # idx_test <- sample(5:44, size = 4, replace = FALSE)
    # idx_train = 1:n
    # idx_train = idx_train[-idx_test]
    idx_train <- sample(1:n, size = ntrain, replace = FALSE)
    idx_test = 1:n
    idx_test = idx_test[-idx_train]

    X_train_train = X[idx_train,]
    X_train_test = X[-idx_train,]
    train_class = y[idx_train]
    fit = glmnet(x = X_train_train,y = train_class,lambda = lambda,intercept = TRUE)

    yhat = predict(fit,X_train_test)

    RSS <- sum( { y[-idx_train]  -  yhat }^2 )
    TSS <- sum( { y[-idx_train]  -  mean(y) }^2 )

    Q2 = c(Q2,1 - RSS / TSS)
    setTxtProgressBar(pb,i)
  }
  close(pb)

  Q2 = mean(Q2)
  R2 = mean(R2)
  return(c(R2,Q2))
}

#' Function to compute the scrambled MSE
#'
#' This function computes the mean MSE value for a certain number of repetition of the scrambling test
#' @import utils
#' @import glmnet
#' @import stats
#' @param y is the reponse vector of the training set
#' @param X is the matrix of the dataset with samples on the rows and features on the column
#' @param train.prop is percentage of point to be used as training
#' @param lambda is the shrinking parameter of the lasso model
#' @param nPerm is the number of permutation to perform
#' @return numeric value representing the mean MSE for the scrambling test
#' @keywords MSE scrambling test
#' @export
#' @examples
#'

Y_scrambled_mse = function(y,X,train.prop=0.9,  nPerm = 100,lambda){
  n = length(y)
  ntrain   <- ceiling(train.prop * n)
  ntest = n-ntrain
  MSE <-  c()
  print("Y scambling")
  pb = txtProgressBar(min=1,max=nPerm,style=3)
  for(i in 1:nPerm){
    y = sample(y)
    idx_test <- sample(1:n, size = ntest, replace = FALSE)
    idx_train = 1:n
    idx_train = idx_train[-idx_test]
    X_train_train = X[idx_train,]
    X_train_test = X[-idx_train,]
    train_class = y[idx_train]
    fit = glmnet(x = X_train_train,y = train_class,lambda = lambda,intercept = TRUE)
    yhat = predict(fit,X_train_test)
    RSS <- sum( { y[-idx_train]  -  yhat }^2 )
    MSE = c(MSE,RSS / ntest)

    setTxtProgressBar(pb,i)
  }
  close(pb)


  MSE = mean(MSE)
  return(MSE)
}

#' Q2F1 function
#'
#' This function computes the Q2F1 metric between observed and predicted value
#' @param observed_train is the vector of the observed reponses of the training set
#' @param observed_test is the vector of the observed reponses of the test set
#' @param predicted_test is the vector of the predicted reponses on the test set
#' @return numeric value representing the Q2F1 metric
#' @keywords Q2F1
#' @export
#' @examples
#'
Q2F1_func = function(observed_train, observed_test, predicted_test){
  SSRes =  sum((observed_test-predicted_test)^2)
  SStot = sum((observed_test-mean(observed_train))^2)
  r2 = 1 - (SSRes/SStot)
  #if(r2<0){r2=abs(r2)}
  return(r2)
}

#' Q2F2 function
#'
#' This function computes the Q2F2 metric between observed and predicted value
#' @param observed_train is the vector of the observed reponses of the training set
#' @param observed_test is the vector of the observed reponses of the test set
#' @param predicted_test is the vector of the predicted reponses on the test set
#' @return numeric value representing the Q2F2 metric
#' @keywords Q2F2
#' @export
#' @examples
#'
Q2F2_func = function(observed_train, observed_test, predicted_test){
  SSRes =  sum((observed_test-predicted_test)^2)
  SStot = sum((observed_test-mean(observed_test))^2)
  r2 = 1 - (SSRes/SStot)
  #if(r2<0){r2=abs(r2)}

  return(r2)
}

#' Q2F3 function
#'
#' This function computes the Q2F3 metric between observed and predicted value
#' @param observed_train is the vector of the observed reponses of the training set
#' @param observed_test is the vector of the observed reponses of the test set
#' @param predicted_test is the vector of the predicted reponses on the test set
#' @return numeric value representing the Q2F3 metric
#' @keywords Q2F3
#' @export
#' @examples
#'
Q2F3_func = function(observed_train, observed_test, predicted_test){
  SSRes =  sum((observed_test-predicted_test)^2)
  SSRes = SSRes/length(observed_test)
  SStot = sum((observed_train-mean(observed_train))^2)
  SStot = SStot/length(observed_train)

  r2 = 1 - (SSRes/SStot)

  return(r2)
}

#' CCC function
#'
#' This function computes the CCC metric between observed and predicted value
#' @param observed_train is the vector of the observed reponses of the training set
#' @param observed_test is the vector of the observed reponses of the test set
#' @param predicted_test is the vector of the predicted reponses on the test set
#' @return numeric value representing the CCC metric
#' @keywords CCC
#' @export
#' @examples
#'
CCC_func = function(observed_train, observed_test, predicted_test){
  SSRes =  sum((observed_test-mean(observed_test))*(predicted_test-mean(predicted_test))) * 2
  SStot = sum((observed_test-mean(observed_test))^2) +
    sum((predicted_test-mean(predicted_test))^2) +
    length(predicted_test) * (mean(observed_test)-mean(predicted_test))^2

  r2 = SSRes/SStot
  #if(r2<0){r2=abs(r2)}

  return(r2)
}

#' R2 function
#'
#' This function computes the R2 metric between observed and predicted value
#' @param observed is the vector of the observed reponses
#' @param predicted is the vector of the predicted reponses
#' @return numeric value representing the R2 metric
#' @keywords R2
#' @export
#' @examples
#'
R2_func = function(observed, predicted){

  SSRes =  sum((predicted-observed)^2)
  SStot = sum((observed-mean(observed))^2)
  r2 = 1 - (SSRes/SStot)
  #if(r2<0){r2=abs(r2)}

  return(r2)
}

R2_func_2 = function(observed, predicted){
  tss <- sum((predicted - mean(observed)) ^ 2)  ## total sum of squares
  regss <- sum((observed - mean(observed)) ^ 2) ## regression sum of squares
  return(regss / tss)
}
angy89/hyQSAR documentation built on Sept. 24, 2019, 7:31 a.m.