R/linearitycheck.R

fmt <- function(dsn,X,k){
  Cut.off.points <- quantile(dsn[,X],seq(0,1,length.out = (k+1)),type = 1)
  print(noquote(paste("Cut points for",X,"with",k,"categories:")))
  print(Cut.off.points)
  dsn.new <- dsn[order(dsn[,X]),]
  Cut.off.loc <- rep(NA,k)
  for(i in 1:k){
    Cut.off.loc[i] <- which(dsn.new[,X]==Cut.off.points[i+1])[1]
  }
  Cut.off.loc <- c(0,Cut.off.loc)
  Cut.off.label <- c("low",as.character(round(Cut.off.points,5))[2:k],"high")
  New.X <- rep(NA,nrow(dsn.new))
  for(i in 1:k){
    New.X[(Cut.off.loc[i]+1):Cut.off.loc[i+1]] <- paste(Cut.off.label[i],"-",Cut.off.label[i+1])
  }
  dsn.new <- cbind(dsn.new,New.X)
  Cut.off.points <- as.numeric(Cut.off.points)
  fmt.output <- list(Cut.off.points = Cut.off.points, dsn.new = dsn.new)
  return(fmt.output)
}

lk.func <- function(link.type,Group.Means){
  link.name <- NULL
  Group.Means[which(Group.Means==0)]<- 0.000001
  Group.Means[which(Group.Means==1)]<- 0.999999
  if(link.type=="identity"){
    link.name <- "identity link"
    Group.Means.Y <- Group.Means
  }
  else if(link.type=="logit"){
    link.name <- "logit link"
    Group.Means.Y <- log(Group.Means/(1-Group.Means))
  }
  else if(link.type=="probit"){
    link.name <- "probit link"
    Group.Means.Y <- pnorm(Group.Means)
  }
  else if(link.type=="log"){
    link.name= "log link"
    Group.Means.Y <- log(Group.Means)
  }
  else if(link.type=="clog-log"){
    link.name= "complementary log-log link"
    Group.Means.Y <- log(-log(1-Group.Means))
  }
  else if(link.type=="log-log"){
    link.name= "log-log link"
    Group.Means.Y <- -log(-log(Group.Means))
  }
  else if(link.type=="reciprocal"){
    link.name="reciprocal link"
    Group.Means.Y <- 1/Group.Means
  }
  link.list <- list(link.name = link.name, Y.Means = Group.Means.Y)
  return(link.list)
}

#' @title Check linearity assumption
#' @description  Check linearity assumption in a model using a given link function.
#' @usage LFL(dsn,X,Y,nkat,link.type,outlier)
#' @param dsn data set name containing X and Y;
#' @param X continuous covariate
#' @param Y outcome variable
#' @param nkat number of categories into which X should be divided (an even number)
#' @param link.type link function to be used:
#' link.type is coded as follows:
#' "identity" = identity link;
#' "logit" = logit link;
#' "probit" = probit link;
#' "log" = log link;
#' "clog-log" = complementary log-log link;
#' "log-log" = log-log link;
#' "reciprocal" = reciprocal link;
#' @param outlier exclude outliers from the plot.
#' @return Return a matrix with 6 columns: 
#' \item{X}{intervals of continuous covariate.}
#' \item{Frequency}{number of observations in each group.}
#' \item{mean}{group means of outcome variable.}
#' \item{link}{name of link function.}
#' \item{gmu}{group means of outcome variable with link function.}
#' \item{midpt}{median of continuous covariate in each group.}
#' @examples 
#' Data.X <- runif(1000)
#' Data.Y <- exp(Data.X+1)+rnorm(1000,0,1)
#' Data <- data.frame(cbind(Data.X,Data.Y))
#' LFL(Data,"Data.Y","Data.X",20,4)
#' ##Use NH data:
#' NH$quality.rating <- (scale(NH$quality.rating)/2+0.5)
#' LFL(dsn = NH,Y = "quality.rating", X = "RNHRD", nkat = 40, link.type = "logit", outlier = F)
#' @export
LFL <- function(dsn,X,Y,nkat,link.type,outlier=FALSE){
  fmt.out <- fmt(dsn,X,nkat)
  dsn.new <- fmt.out$dsn.new
  Cut.off.points <- fmt.out$Cut.off.points
  Group.means <- aggregate(dsn.new[,Y],list(dsn.new[,"New.X"]),mean)
  midpt <- aggregate(dsn.new[,X],list(dsn.new[,"New.X"]),median)
  lk.output <- lk.func(link.type,Group.means[,2])
  link.name <- rep(lk.output$link.name,nkat)
  LFL.output <- cbind(as.character(Group.means$Group.1),
                      as.numeric(table(dsn.new$New.X)),
                      Group.means[,2],
                      link.name,
                      lk.output$Y.Means,
                      midpt$x)
  LFL.output <- rbind(LFL.output[nkat,],LFL.output[1:(nkat-1),])
  colnames(LFL.output) <- c("X","Frequency","mean","link","gmu","midpt")
  LFL.output[,3] <- round(as.numeric(LFL.output[,3]),5)
  LFL.output[,5] <- round(as.numeric(LFL.output[,5]),5)
  LFL.output[,6] <- round(as.numeric(LFL.output[,6]),5)
  LFL.output <- LFL.plot <- data.frame(LFL.output)
  if(outlier){
    Model.outliers <- 1
    while(Model.outliers==1){
      LFL.model <- lm(as.numeric(as.character(LFL.plot$gmu))~as.numeric(as.character(LFL.plot$midpt)))
      Outliers <- as.numeric(which(cooks.distance(LFL.model)>4*mean(cooks.distance(LFL.model))))
      if(length(Outliers)>0){
        LFL.plot <- LFL.plot[-Outliers,]
      }
      else if(length(Outliers) == 0){
        Model.outliers <- 0
      }
    }
    plot(as.numeric(as.character(LFL.plot$midpt)),as.numeric(as.character(LFL.plot$gmu)),xlab = "midpt",ylab = "gmu",main = paste("Check linearity of",lk.output$link.name,Y, "by",X,"with",nkat,"categories"))
    abline(LFL.model)
  }
  else{
    LFL.model <- lm(as.numeric(as.character(LFL.plot$gmu))~as.numeric(as.character(LFL.plot$midpt)))
    plot(as.numeric(as.character(LFL.plot$midpt)),as.numeric(as.character(LFL.plot$gmu)),xlab = "midpt",ylab = "gmu",main = paste("Check linearity of",lk.output$link.name,Y, "by",X,"with",nkat,"categories"))
    abline(LFL.model)
  }
  return(LFL.output)
}
CastleLi/BetaPSS documentation built on May 15, 2019, 10:34 p.m.