R/dfrr.R
In dfrr: Dichotomized Functional Response Regression

Documented in dfrr

#' Dichotomized Functional Response Regression (dfrr)
#'
#' Implementing Function-on-Scalar Regression model, in which the response function
#'  is dichotomized and observed sparsely.
#'  This function fits the dichotomized functional response regression (dfrr) model as
#'  \deqn{Y_{i}(t)=I(\beta_0(t)+\beta_1(t)*x_{1i}+\ldots+\beta_{q-1}(t)*x_{(q-1)i}+\varepsilon_{i}(t)+\epsilon_{i}(t)\times\sigma^2>0),}
#'  where \eqn{I(.)} is the indicator function, \eqn{\varepsilon_{i}} is a Gaussian random function, and \eqn{\epsilon_{i}(t)} are iid standard normal for each \eqn{i} and \eqn{t} independent of \eqn{\varepsilon_{i}}.
#'  \eqn{\beta_k} and \eqn{x_k} for \eqn{k=0,1,\ldots,q-1} are the functional regression coefficients and scalar covariates, respectively.
#'
#'@return
#'  The output is a \code{dfrr}-object, which then can be injected into other methods/functions
#'  to postprocess the fitted model, including:
#'  \code{\link{coef.dfrr}},\code{\link{fitted.dfrr}}, \code{\link{basis}}, \code{\link{residuals.dfrr}}, \code{\link{predict.dfrr}},
#'   \code{\link{fpca}}, \code{\link{summary.dfrr}}, \code{\link{model.matrix.dfrr}},
#'   \code{\link{plot.coef.dfrr}}, \code{\link{plot.fitted.dfrr}}, \code{\link{plot.residuals.dfrr}},
#'   \code{\link{plot.predict.dfrr}}, \code{\link{plot.fpca.dfrr}}
#'
#'@importFrom stats as.formula model.matrix terms
#'@examples
#' set.seed(2000)
#' \donttest{N<-50;M<-24

#' X<-rnorm(N,mean=0)
#' time<-seq(0,1,length.out=M)
#' Y<-simulate_simple_dfrr(beta0=function(t){cos(pi*t+pi)},
#'                         beta1=function(t){2*t},
#'                         X=X,time=time)
#' #The argument T_E indicates the number of EM algorithm.
#' #T_E is set to 1 for the demonstration purpose only.
#' #Remove this argument for the purpose of converging the EM algorithm.
#' dfrr_fit<-dfrr(Y~X,yind=time,T_E=1)
#' plot(dfrr_fit)
#'}
#' #Fitting dfrr model to the Madras Longitudinal Schizophrenia data
#' data(madras)
#' ids<-unique(madras$id)
#' N<-length(ids)
#'
#' ydata<-data.frame(.obs=madras$id,.index=madras$month,.value=madras$y)
#'
#' xdata<-data.frame(Age=rep(NA,N),Gender=rep(NA,N))
#' for(i in 1:N){
#'   dt<-madras[madras$id==ids[i],]
#'   xdata[i,]<-c(dt$age[1],dt$gender[1])
#' }
#' rownames(xdata)<-ids
#'
#' #The argument T_E indicates the number of EM algorithm.
#' #T_E is set to 1 for the demonstration purpose only.
#' #Remove this argument for the purpose of converging the EM algorithm.
#' #J is the number of basis functions that will be used in estimating the functional parameters.
#' madras_dfrr<-dfrr(Y~Age+Gender+Age*Gender, data=xdata, ydata=ydata, J=11,T_E=1)
#'
#' \donttest{
#' coefs<-coef(madras_dfrr)
#' plot(coefs)
#'
#' fpcs<-fpca(madras_dfrr)
#' plot(fpcs)
#' plot(fpcs,plot.eigen.functions=FALSE,plot.contour=TRUE,plot.3dsurface = TRUE)
#'
#' oldpar<-par(mfrow=c(2,2))
#'
#' fitteds<-fitted(madras_dfrr) #Plot first four fitted functions
#'   plot(fitteds,id=c(1,2,3,4))
#' par(oldpar)
#'
#' resids<-residuals(madras_dfrr)
#' plot(resids)
#'
#'
#' newdata<-data.frame(Age=c(1,1,0,0),Gender=c(1,0,1,0))
#'   preds<-predict(madras_dfrr,newdata=newdata)
#'   plot(preds)
#'
#'
#' newdata<-data.frame(Age=c(1,1,0,0),Gender=c(1,0,1,0))
#' newydata<-data.frame(.obs=rep(1,5),.index=c(0,1,3,4,5),.value=c(1,1,1,0,0))
#' preds<-predict(madras_dfrr,newdata=newdata,newydata = newydata)
#' plot(preds)
#' }
#'
#' @param times_to_evaluate a numeric vector indicating the set of time points for evaluating the functional regression coefficients and principal components.
#' @param formula an object of class "\code{\link[stats]{formula}}" (or one that can be coerced to that class with \code{as.formula}:
#'  a symbolic description of the model to be fitted.
#' @param yind a vector with length equal to the number of columns of the matrix of functional
#'  responses giving the vector of evaluation points \eqn{(t_1,...,t_{G})}.
#'  If not supplied, \code{yind} is set to \code{1:ncol(<response>)}.
#' @param data an (optional) \code{data.frame} containing the covariate data.
#' the variable terms will be searched from the columns of \code{data},
#' covariates also can be read from the workspace if it is not available in \code{data}.
#' @param ydata an (optional) \code{data.frame} consists of three columns \code{.obs}, \code{.index} and \code{.value},
#'  supplying the functional responses that are not observed on a regular grid.
#'  ydata must be provided if the sampling design is irregular.
#' @param method detrmines the estimation method of functional parameters. Defaults to "\code{REML}" estimation.
#' @param rangeval an (optional) vector of length two, indicating the lower and upper limit of the domain of
#' latent functional response. If not specified, it will set by minimum and maximum of \code{yind} or \code{.index} column of \code{ydata}.
#' @param ... other arguments that can be passed to the inner function \code{AMCEM}.
#' @param basis an (optional) object of class \code{'basisfd'}. Defaults to cubic bspline basis.
#' @export
dfrr <-
function(formula, yind=NULL, data = NULL, ydata = NULL,
               method = c("REML","ML"),rangeval=NULL,basis=NULL,times_to_evaluate=NULL,...){

    formula<-as.formula(formula)


  method<-match.arg(method)

  is.regular<-is.null(ydata)

  if(is.regular){
    if(!is.null(ydata))
    {
      if(!all(c(".obs",".index",".value") %in% colnames(ydata)))
        stop("Argument 'ydata' is not of the expected structure. See the help for more details")
      if(!all(unique(ydata$.index) %in% yind))
        stop("'yind' does not contain unique values of '.index' column of ydata")
    }

  }else{
    if(is.null(ydata))
      stop("User must provide either ydata or yind")
    if(!all(c(".obs",".index",".value") %in% colnames(ydata)))
      stop("Argument 'ydata' is not of the expected structure. See the help for more details")
  }

  if(!is.null(data))
    if(is.null(rownames(data)))
      rownames(data)<-1:N

  data_ids<-rownames(data)

  if(!is.null(ydata)){
    if(!is.null(data))
    {
      int<-intersect(unique(ydata$.obs) ,data_ids)
      q<-length(attr(terms(formula),"term.labels"))
      if(length(int)==0){
        stop("Rownames of argument 'data' does not match with the unique values of column '.obs' in 'ydata'")
      }else if(length(int)<=q){
        stop("Rownames of argument 'data' does not match with the unique values of column '.obs' in 'ydata'.\\r\\n Not enough sample to fit the model.")
      }else if(length(int)!=nrow(data)){
        data<-data[int,]
        N<-nrow(data)
        excluded_ids<-setdiff(unique(ydata$.obs),int)
        excluded_ids<-paste0(excluded_ids,", ",collapse = "")
        excluded_ids<-substr(excluded_ids,1,nchar(excluded_ids)-2)
        warning(paste0("Rownames of argument 'data' does not completely match with the unique values of column '.obs' in 'ydata'.\\r\\n",
                       "The following ", nrow(data)-length(int)," cases excluded from ydata. \\r\\n .obs=",excluded_ids))
      }

    }
  }

  #Analyze formula
  # Specify response variable
  # specify coariates' varialbes
  # check for variables' environments

  #Identifying response variable
  res<-paste0(formula)
  resp_varname<-""
  if(length(res)==3)
    resp_varname<-res[2]

  if(resp_varname=="" & is.null(ydata))
    stop("response variable must be specified")

  check_wsp<-FALSE
  y_matrix<-NULL
  if(is.null(ydata)){
    check_wsp<-TRUE
    y_matrix<-eval({as.symbol(resp_varname)},envir = parent.frame())
  }else{
    # if(!is.na(resp_varname))
    #   if(resp_varname!="")
    #     invisible(cat(paste0("Argument 'ydata' is provided. The variable name '",resp_varname,"' from the left-hand side of the formula is is read from '.value' column of 'ydata'\\r\\n")))
  }

  tof<-terms(formula)
  var_names<-rownames(attr(tof,"factors"))
  var_names<-setdiff(var_names,resp_varname)

  if(!is.null(ydata))
    N<-length(unique(ydata$.obs))
  else
    N<-nrow(y_matrix)

  if(is.null(data))
    data<-data.frame(.int=rep(1,N))


  if(!is.null(data))
  {
    if(nrow(data)!=N)
      stop(paste0("The number of rows in data frame is different from number of samples in the response (N=",N,")"))

  }

  if(!is.null(var_names)){
    wsp_var_names<-setdiff(var_names,colnames(data))

    if(!is.null(wsp_var_names) & length(wsp_var_names)>0){
      for(i1 in 1:length(wsp_var_names)){
        tmp_<-eval({as.symbol(wsp_var_names[i1])},envir = parent.frame())
        if(length(tmp_)!=N)
          stop(paste0("Variable '",wsp_var_names[i1],"' has a length different from number of samples in the response (N=",N,")"))
        data[,wsp_var_names[i1]]<-tmp_
      }

      if(!is.null(ydata)){
        ids<-intersect(1:N,unique(ydata$.obs))
        if(length(ids)!=N)
          stop(paste0("The ids in '.obs' column in ydata must be the sequence 1,...,N; N=",N, " is the nmber of samples"))
      }
    }


  }



  if(!is.null(ydata)){
    ids<-sort(unique(ydata$.obs),decreasing = FALSE)
    if(is.null(rownames(data)))
      stop("'rownmaes(data)' must agree with the '.obs' column of ydata")
    rnm<-rownames(data)
    data2<-as.data.frame(matrix(NA,nrow=length(ids),ncol=ncol(data)))
    colnames(data2)<-colnames(data)
    for(ii in 1:length(ids)){
      inds<-which(rnm==ids[ii])
      if(length(inds)==0)
        data2[ii,]<-NA
      else
        data2[ii,]<-data[inds,]
    }
    data<-data2
  }else
  if(is.null(ydata)){
    if(is.null(rownames(y_matrix))){
      ids<-1:N
      rownames(y_matrix)<-1:nrow(y_matrix)
    }else{
      ids<-rownames(y_matrix)
      u<-intersect(rownames(y_matrix),data_ids)
      if(length(u)!=length(rownames(y_matrix)))
        stop("Rownames of response matrix does not meatch with rownames of 'data' matrix")

      #data<-data[rownames(y_matrix),]
    }
  }


  formula2<-formula
  if(length(res)==3)
  {
    formula_string<-paste0("~",res[3])
    formula2<-formula(formula_string)
  }

  xData<-model.matrix(formula2,data=data)
  if(ncol(xData)==0)
    xData<-NULL


  time<-list()
  Y<-list()
  ids_nna<-c()
  i_nna<-c()

  kk<-0
  if(!is.null(ydata)){

    for(i in 1:N){
      if(is.null(xData) | (!is.null(xData) & all(!is.na(xData[i,])))){
        ys<-ydata[ydata$.obs==ids[i],]
        ys<-ys[!is.na(ys$.value) & !is.na(ys$.index),]
        if(length(ys)==0)
          next
        kk<-kk+1
        time[[kk]]<-ys$.index
        Y[[kk]]<-ys$.value
        ids_nna[kk]<-ids[i]
        i_nna[kk]<-i
      }
    }

  }else{

    for(i in 1:N){
      if(is.null(xData) | (!is.null(xData) & all(!is.na(xData[i,])))){
        ys<-y_matrix[i,]
        ty<-yind[!is.na(ys)]
        if(length(ty)==0)
          next
        kk<-kk+1
        time[[kk]]<-ty
        Y[[kk]]<-ys[!is.na(ys)]
        ids_nna[kk]<-ids[i]
        i_nna[kk]<-i
      }
    }

  }


  if(kk==0)
    stop("There is no sample with complete data")

  if(length(i_nna)!=N)
    if(!is.null(xData))
      xData<-xData[i_nna,]

  if(!is.null(xData))
    if(qr(xData)$rank<ncol(xData))
      stop("xData does not have full column rank")

  timeT<-sort(unique(unlist(time)),decreasing = FALSE)
  if(is.null(rangeval)){
    rangeval<-c(min(timeT),max(timeT))
  }


  if(!is.null(times_to_evaluate))
    if(min(times_to_evaluate)<rangeval[1] | max(times_to_evaluate)>rangeval[2])
      stop(paste0("The argument 'times_to_evaluate' does not match with the minumum and maximum of times in which the samples are observed.\r\n",
                  "Please rectify the 'times_to_evalute' argument or 'rangeval' argument.\r\n",
                  "The 'times_to_evaluate' argument must be included in the interval specified by the 'rangeval' argument."))

  if(length(rangeval)!=2)
    stop("rangeval must be a vector of length 2")

  if(rangeval[1]>min(timeT) | rangeval[2]<max(timeT))
    stop("rangeval does not include some of time points")

  if(is.null(times_to_evaluate))
    times_to_evaluate<-seq(rangeval[1],rangeval[2],length.out = 100)

  #Pass Y, time, xData, ids_nna to AMCEM3


  # if(!is.null(ydata)){
  #   timeT<-sort(unique(unlist(time)),decreasing = FALSE)
  #   if(all(sapply(1:N, function(i){length(intersect(tim[[i]],timeT))==length(timeT)})))
  #     design<-"R"
  # }

  res<-AMCEM(Y,time,xData,ids = ids_nna,method = method,
             rangeval=rangeval,times_to_evaluate=times_to_evaluate,basis=basis,...)

  if(!is.null(ydata))
    res$ydata<-ydata
  else{
    res$y_matrix<-y_matrix
    res$yind<-yind
    res$ids_rows<-ids
  }
  class(res)<-"dfrr"
  attr(res,"formula")<-formula2
  res$modelMatrix<-xData

  res

}