R/InternalRisk.fun.R

#################################  This function develops the Risk score (step 1) - Internal Method #########################################################
#############################################################################################################################################################

internalRisk.fun=function(dataset) {
  ##loading libraries
  library(glmnet)
  library(Hmisc)
  library(rms)
  
  ################# LASSO preperation ##########################################
  ################################################################
  #drop STUDYID, USIBJID (no needed for the matrix), drop TRT01A (blinded to treatment)
  ##### drop RELAPSE2year - we need only the analysis foor 1 year
  todrop<-c("STUDYID","USUBJID","TRT01A","RELAPSE2year")
  X<-dataset[ , !(names(dataset) %in% todrop)]
  ### develop the matrix for the model
  X.both<-model.matrix(X$RELAPSE1year~.,data=X)
  #### delete NAs values (LASSO requirement)
  X.both<-na.omit(X.both)
  X<-na.omit(X)
  set.seed(1)
  #############################LASSO###########################################
  ########################################################################
  ###LASSO (alpha=1-default) with 10 cross validations
  cv.fit.both<-cv.glmnet(x=X.both,y=X$RELAPSE1year,family="binomial")
  ###coefficients of variables via LASSO
  cv.coef.both<-coef(cv.fit.both,s="lambda.1se")
  ###############RESULTS
  ### Non zero coefficients -> selected variables
  
  cv.pf.em.both<-rownames(cv.coef.both)[as.numeric(cv.coef.both)!=0]
  
  ###selected variables
  cv.pf.em.both
  ##plot lambda
  plot(cv.fit.both)
  ##plot coeff
  plot(coef(cv.fit.both))
  ###logistic model based on LASSO selected variables
  model<-lrm(RELAPSE1year~AGE+RLPS3YR+REGION+GDLESBL+SFPCSBL+SENSORBL,data=X)
  model
  ##check the assumption of linearity
  a<-lrm(RELAPSE1year~rcs(AGE,5)+rcs(RLPS3YR,5)+REGION+rcs(GDLESBL,5)+rcs(SFPCSBL,5)+SENSORBL,data=X)
  anova(a)
  b<-lrm(RELAPSE1year~rcs(AGE,4)+rcs(RLPS3YR,4)+REGION+rcs(GDLESBL,4)+rcs(SFPCSBL,4)+SENSORBL,data=X)
  anova(b)
  AIC(a)
  AIC(b) ###best choice of knots is knots=4
  
  ###final model (as anova does not indicate nonlinear relationship)
  model<-lrm(RELAPSE1year~AGE+RLPS3YR+REGION+GDLESBL+SFPCSBL+SENSORBL,data=X)
  model
  df<-model[["stats"]][["d.f."]]
  events<- nrow(X[which(X$RELAPSE1year==1),])
  ####return back
  cat("The selected variables are:" , cv.pf.em.both, fill = TRUE)
  EPV<-events/df
  cat("The EPV of the model is", EPV, fill=TRUE)
  if (EPV>10){cat("The EPV of the model is >10, as recommended.", fill=TRUE)}
  if (EPV<10){cat("The EPV of the model is <10, possibility of overfitting problem.", fill = TRUE)}
  
  cat("The final model is:",fill=TRUE)
  return(model)
}
htx-r/RiskModelNMApredictions documentation built on June 12, 2019, 9:52 a.m.