R/spm.impute.R

Defines functions spm.impute getNextY.cont func1 getPrevY.discr getPrevY.discr.m getNextY.discr.m getNextY.discr2 getNextY.discr getNextY.cont2 mu m sigma_sq

Documented in func1 getNextY.cont getNextY.cont2 getNextY.discr getNextY.discr.m getPrevY.discr getPrevY.discr.m m mu sigma_sq spm.impute

#' An internal function to compute sigma square analytically
#' @param t1 t1
#' @param t2 t2
#' @param b b (see Yashin et. al, 2007)
#' @return sigma_square (see Akushevich et. al, 2005)
sigma_sq <- function(t1, t2, b) {
    # t2 = t_{j}, t1 = t_{j-1}
    ans <- as.numeric(b) %*% as.numeric(t2-t1)
    ans
}

#' An internal function to compute m from 
#' @param y Current value of Y
#' @param t1 t1
#' @param t2 t2
#' @param a a (see Yashin et. al, 2007)
#' @param f1 f1 (see Yashin et. al, 2007)
#' @return m m (see Yashin et. al, 2007)
m <- function(y, t1, t2, a, f1) {
    # y = y_{j-1}, t1 = t_{j-1}, t2 = t_{j}
    ans <- y + as.numeric(a) %*% t(as.numeric(y) - as.numeric(f1)) %*% as.numeric((t2 - t1))
    ans
}

#' An internal function to compute mu
#' @param y Current value of y
#' @param mu0 mu0 (see Yashin et. al, 2007)
#' @param b b (see Yashin et. al, 2007)
#' @param Q Q (see Yashin et. al, 2007)
#' @param theta theta (see Yashin et. al, 2007)
#' @param tt t (time)
#' @return mu Next value of mu
mu <- function(y, mu0, b, Q, theta, tt) {
    ans <- (mu0 + t(y) %*% b + t(y) %*% Q %*% y)*exp(theta*tt)
    ans
}

#mu <- function(tt, y1, gamma1, f, f1, mu0, theta, Q) {
#  hf <- f - y1
#  hf1 <- f1 - y1
#  #if(gomp) {
#  #  mu0Ht = mu0H*exp(thetaH*t);
#  #} else {
#  #  mu0Ht = mu0H;
#  #}
#  #print(t)
#  #print(theta)
#  mu0Ht <- mu0*exp(theta*tt)
#  QH_gamma1 <- Q %*% gamma1
#  mu <- mu0Ht + (t(hf) %*% Q) %*% hf + sum(diag(QH_gamma1))
#  mu
#}

#' An internal function to compute next value of physiological variable Y
#' @param y1 y1
#' @param t1 t1
#' @param t2 t2
#' @param b b (see Yashin et. al, 2007)
#' @param a a (see Yashin et. al, 2007)
#' @param f1 f1 (see Yashin et. al, 2007)
#' @return y.next Next value of y
getNextY.cont2 <- function(y1, t1, t2, b, a, f1) {
    ssd <- sigma_sq(t1, t2, b)
    mean <- m(as.numeric(y1), t1, t2, a, f1)
    dt <- as.numeric(t2-t1)
    #y2 <- sapply(1:length(y1), function(i) {rnorm(1,mean=mean[i], sd=sqrt(b[i])/dt)})
    y2 <- sapply(1:length(y1), function(i) {rnorm(1,mean=mean[i], sd=sqrt(ssd[i]))})
    #y2 <- sapply(1:length(y1), function(i) {rnorm(1,mean=mean[i], sd=b[i])})
    y2
}

#' An internal function to compute the next value of physiological variable Y
#' based on discrete-time model (Akushevich et. al., 2005)
#' @param y1 y1
#' @param u u (see Akushevich et. al, 2005)
#' @param R R (see Akushevich et. al, 2005)
#' @param Sigma Sigma (see Akushevich et. al, 2005)
#' @return y.next Next value of y
getNextY.discr <- function(y1, u, R, Sigma) {
    eps<-matrix(nrow=dim(R)[1], ncol=1)
    #eps[,1] <- sapply(1:length(Sigma), function(i) {rnorm(1, mean=0.0, sd=sqrt(Sigma[i]))})
    eps[,1] <- sapply(1:length(Sigma), function(i) {rnorm(1, mean=0.0, sd=Sigma[i])})
    y2 <- u + R %*% y1 + eps
    return(y2)
}

getNextY.discr2 <- function(y1, u, R, Sigma) {
    y2 <- sapply(1:length(Sigma), function(i) {rnorm(1, mean=getNextY.discr.m(y1, u, R)[i], sd=sqrt(Sigma[i])/2)})
    y2
}


#' An internal function to compute next m based on dicrete-time model 
#' @param y1 y1
#' @param u u
#' @param R R
#' @return m Next value of m (see Yashin et. al, 2007)
getNextY.discr.m <- function(y1, u, R) {
    m <- u + R %*% y1
    return(m)
}

#' An internal function to compute previous m based on discrete-time model
#' @param y2 y2
#' @param u u
#' @param R R
#' @return m Next value of m (see Yashin et. al, 2007)
getPrevY.discr.m <- function(y2, u, R) {
  m <- solve(R) %*% (y2 - u)
  m
}

#' An internal function to compute previous value of
#' physiological variable Y based on 
#' discrete-time model
#' @param y2 y2
#' @param u u
#' @param R R
#' @param Sigma Sigma
#' @return y1 Previous value of y
getPrevY.discr <- function(y2, u, R, Sigma) {
    eps<-matrix(nrow=dim(R)[1], ncol=1)
    eps[,1] <- sapply(1:dim(eps)[1], function(i) {rnorm(1, mean=0.0, sd=Sigma[i])})
    
    y1 <- solve(R) %*% (y2 - u - eps)
    y1
}


#' An internal function to compute m and gamma based on 
#' continuous-time model (Yashin et. al., 2007)
#' @param tt tt - time
#' @param y y 
#' @param a a (see Yashin et. al, 2007)
#' @param f1 f1 (see Yashin et. al, 2007)
#' @param Q Q (see Yashin et. al, 2007)
#' @param f f (see Yashin et. al, 2007)
#' @param b b (see Yashin et. al, 2007)
#' @param theta theta
#' @return list(m, gamma) Next values of m and gamma (see Yashin et. al, 2007)
func1 <- function(tt, y, a, f1, Q, f, b, theta) {
    hf <- f - as.numeric(y[[1]])
    hf1 <- f1 - as.numeric(y[[1]])
    
    dm <- -1.00 * (a %*% as.matrix(hf1)) + (2.00 * as.numeric(y[[2]])) %*% Q %*% hf
    dgamma <- a %*% as.numeric(y[[2]]) + as.numeric(y[[2]]) %*% t(a) + b %*% t(b) - 2.00 * ((as.numeric(y[[2]]) %*% Q) %*% as.numeric(y[[2]]))
    
    res <- list(m=dm, gamma=dgamma)
    return(res)
}

#' An internal function to compute next Y based on 
#' continous-time model (Yashin et. al., 2007)
#' @param y1 y1
#' @param t1 t1
#' @param t2 t2
#' @param a a (see Yashin et. al, 2007)
#' @param f1 f1 (see Yashin et. al, 2007)
#' @param Q Q (see Yashin et. al, 2007)
#' @param f f (see Yashin et. al, 2007)
#' @param b b (see Yashin et. al, 2007)
#' @param theta theta (see Yashin et. al, 2007)
#' @return y.next Next value of Y
getNextY.cont <- function(y1, t1, t2, a, f1, Q, f, b, theta) {
    nsteps <- 4
    tdiff <- t2-t1
    h <- tdiff/nsteps
    gamma1 <- matrix(nrow=dim(a)[1], ncol=dim(a)[1], 0) # set gamma1 to zero matrix
    #gamma1 <- matrix(nrow=1, ncol=1, 0) # set gamma1 to zero matrix
    tt <- t1
  
    out <- list()
    out[[1]] <- y1
    out[[2]] <- gamma1
  
  
    for(j in 1:nsteps) {
        #Runge-Kutta method:
        yn <- out
        k1 <- func1(tt, yn, a, f1, Q, f, b, theta)
    
        yn[[1]] <- out[[1]] + h/2 * k1[[1]]
        yn[[2]] <- out[[2]] + h/2 * k1[[2]]
        k2 <- func1(tt+h/2, yn, a, f1, Q, f, b, theta)
    
        yn[[1]] <- out[[1]] + h/2 * k2[[1]]
        yn[[2]] <- out[[2]] + h/2 * k2[[2]]
        k3 <- func1(tt+h/2, yn, a, f1, Q, f, b, theta)
    
        yn[[1]] <- out[[1]] + h * k3[[1]]
        yn[[2]] <- out[[2]] + h * k3[[2]]
        k4 <- func1(tt+h, yn, a, f1, Q, f, b, theta)
    
        out[[1]] <- out[[1]] + h/6 * (k1[[1]] + 2*k2[[1]] + 2*k3[[1]] + k4[[1]])
        out[[2]] <- out[[2]] + h/6 * (k1[[2]] + 2*k2[[2]] + 2*k3[[2]] + k4[[2]])
    
        tt <- tt + h
    }
  
    m2 <- out[[1]]
    gamma2 <- out[[2]]
  
    # New y2:
    y2 <- matrix(nrow=dim(m2)[1], ncol=1, 0)
    for(ii in 1:dim(m2)[1]) {
        y2[ii,1] <- rnorm(1, mean=m2[ii,1], sd=sqrt(gamma2[ii,ii])) 
        #y2[ii,1] <- rnorm(1, mean=m2[ii,1], sd=gamma2[ii,ii]) 
    }
    y2
}

#'Multiple Data Imputation with SPM
#'@param x A longitudinal dataset with missing observations
#'@param id A name (text) or index (numeric) of ID column. Default: 1
#'@param case A case status column name (text) or index (numeric). Default: 2
#'@param t1 A t1 (or t if short format is used) column name (text) or index (numeric). Default: 3
#'@param t2 A t2 column name (if long format is used) (text) or index (numeric). Default: 4
#'@param covariates A list of covariate column names or indices. Default: 5
#'@param minp Number of imputations. Default: 5
#'@param theta_range A range of parameter theta used for optimization, default: seq(0.01, 0.15, by=0.001).
#'@return A list(imputed, imputations)
#'@return imputed An imputed dataset.
#'@return imputations Temporary imputed datasets used in multiple imputaitons.
#'@export
#'@examples \dontrun{
#'library(stpm) 
#'##Data preparation ##
#'data <- simdata_discr(N=1000, dt = 2)
#'miss.id <- sample(x=dim(data)[1], size=round(dim(data)[1]/4)) # ~25% missing data
#'incomplete.data <- data
#'incomplete.data[miss.id,5] <- NA
#'incomplete.data[miss.id-1,6] <- NA
#'## End of data preparation ##
#'
#'# Estimate parameters from the complete dataset #
#'p <- spm_discrete(data, theta_range = seq(0.075, 0.09, by=0.001))
#'p
#'
#'##### Multiple imputation with SPM #####
#'imp.data <- spm.impute(x=incomplete.data, 
#'                       minp=5, 
#'                       theta_range=seq(0.075, 0.09, by=0.001))$imputed
#'head(imp.data)
#'## Estimate SPM parameters from imputed data and compare them to the p ##
#'pp.test <- spm_discrete(imp.data, theta_range = seq(0.075, 0.09, by=0.001))
#'pp.test
#'}
spm.impute <- function(x, 
                       id=1, 
                       case=2,
                       t1=3, 
                       t2=3, 
                       covariates=4,
                       minp=5, 
                       theta_range=seq(0.01, 0.2, by=0.001)) 
{
    
    # Check input parameters for correctness
    if(!is.data.frame(x)) {
        stop("Class of dataset must be a 'data.frame'.")
    }
  
    datasets <- list() # To hold imputed datasets
    
    col.id.ind <- ifelse(is.character(id), get.column.index(x, id), id)
    if(col.id.ind == 0) stop(paste("Column",id, "not found in data table!"))
    
    col.status.ind <- ifelse(is.character(case), get.column.index(x, case), case)
    if(col.status.ind == 0) stop(paste("Column",case, "not found in data table!"))
    
    col.age.ind <- ifelse(is.character(t1), get.column.index(x, t1), t1)
    if(col.age.ind == 0) stop(paste("Column",t1, "not found in data table!"))
    
    col.age.event.ind <- ifelse(is.character(t2), get.column.index(x, t2), t2)
    if(col.age.event.ind == 0) stop(paste("Column", t2, "not found in data table!"))
    
    col.covar.ind <- c()
    for(c in covariates) {
        c.ind <- ifelse(is.character(c), get.column.index(x, c), c)
        if(c.ind == 0) stop(paste("Column", c, "not found in data table!"))
        col.covar.ind <- c(col.covar.ind, c.ind)
    } 
    
    if(length(which(is.na(x[, col.id.ind]))) != 0) {
        stop("Dataset contains NAs in ID column.")
    }
    # Remove rows with id = NA/NULL
    #x <- x[which(!is.na(x[, col.id.ind])), ]
    
    # Constructing working dataset
    dataset.work <- data.frame(x[, c(col.id.ind, col.status.ind, col.age.ind, col.covar.ind)])
    
    Ncol <- dim(dataset.work)[2]
    
    # Estimate parameters from raw data
    dataset <- prepare_data_cont(x, 
                                col.id.ind=col.id.ind, 
                                col.status.ind=col.status.ind,
                                col.age.ind=col.age.ind, 
                                col.age.event.ind=col.age.event.ind, 
                                col.covar.ind=covariates, 
                                dt=1, 
                                verbose=FALSE)
    
    pp <- spm_discrete(dataset, theta_range = theta_range)
    
    # Individual IDs
    ids <- unique(dataset[,1])
    #print(pp)
    for(m in 1:minp) {
      
        ####
        # for u:
        pp.dmodel.u <- pp$dmodel$u
        for(i in 1:length(pp$dmodel$u)) {
            mu <- rnorm(10000, mean=pp$dmodel$u[i], sd=sqrt(pp$dmodel$u.std.err[i]))
            z <- rnorm(10000, mean=mu, sd=pp$dmodel$u.std.err[i])
            pp.dmodel.u[i] <- mean(z)
        }
        # For R:
        pp.dmodel.R <- pp$dmodel$R
        for(i in dim(pp$dmodel$R)[1]) {
            for(j in dim(pp$dmodel$R)[2]) {
                mu <- rnorm(10000, mean=pp$dmodel$R[i,j], sd=sqrt(pp$dmodel$R.std.err[i,j]))
                z <- rnorm(10000, mean=mu, sd=pp$dmodel$R.std.err[i,j])
                pp.dmodel.R[i,j] <- mean(z)
            }
        }
        ####
        ## Temporary working dataset:
        dat <- dataset.work
        
        for(k in ids) {
            ########## Forward #########
            # Retrieve records for single subject
            df <- dat[which(dat[,1] == k), ]
            
            Nrec <- dim(df)[1]
      
            if(Nrec == 1) {
                row.cur <- df
                for(j in 4:Ncol) {
                    l <- j-3
                    if(is.na(row.cur[j])) {
                        y.start <- rnorm(1, mean = pp$cmodel$f[l], sd=pp$dmodel$Sigma[l])
                        row.cur[j] <- y.start
                    }
                }
                
                df <- row.cur
                dat[which(dat[,1] == k), ] <- df
                
                next
            }
      
            # Check the first row #
            row.cur <- df[1, ]
            row.next <- df[2, ]
  
            for(j in 4:Ncol) {
                l <- j-3
                if(is.na(row.cur[j]) & !is.na(row.next[j])) {
                    y2 <- row.next[j]
                    # Only one-dimension
                    y1 <- getPrevY.discr(as.matrix(y2), pp.dmodel.u[l], as.matrix(pp.dmodel.R[l,l]), pp$dmodel$Sigma[l])
                    row.cur[j] <- y1
                } else if(is.na(row.cur[j]) & is.na(row.next[j])) {
                    y.start <- rnorm(1, mean = pp$cmodel$f1[l], sd=pp$cmodel$b[l])
                    row.cur[j] <- y.start
                }
            }
            
            df[1, ] <- row.cur
            
      
            #### Main imputation loop ####
        
            for(i in 2:Nrec) {
                row.prev <- df[i-1,]
                row.cur <- df[i,]
                
                if(i != Nrec & Nrec >2) {row.next <- df[i+1,]}
                
                y1 <- row.prev[4:Ncol]
                
                if(any(is.na(row.cur))) {
                    y.next <- getNextY.discr.m(as.matrix(as.numeric(y1)), as.matrix(pp.dmodel.u), pp.dmodel.R)
                    row.cur.na <- which(is.na(row.cur))
                    row.cur[row.cur.na] <- y.next[row.cur.na - 3]
                }
            
                df[i, ] <- row.cur
            }
      
            ### Last record in a dataset ###
            row.cur <- df[Nrec, ]
            row.prev <- df[Nrec-1,]
            if(any(is.na(row.cur[4:Ncol])) & row.cur[2] == 0) {
                y1 <- row.prev
                y.next <- getNextY.discr.m(as.matrix(as.numeric(y1)), as.matrix(pp.dmodel.u), pp.dmodel.R)
                row.cur.na <- which(is.na(row.cur))
                row.cur[row.cur.na] <- y.next[row.cur.na - 3]
            }
      
            df[Nrec, ] <- row.cur
            
            
            tryCatch({
                dat[which(dat[,1] == k), ] <- df
            },error=function(e) {
                print(e)
                print("dat:")
                print(dat[which(dat[,1] == k), ])
                print("df:")
                print(df)
                stop()
            }, warning=function(w){
                print(w)
                print("dat:")
                print(dat[which(dat[,1] == k), ])
                print("df:")
                print(df)
                dat[which(dat[,1] == k), ] <- df
            })
        }
        #################################################################################
        datasets[[m]] <- dat
    }
  
    ### Summarizing imputed datasets together by averaging missing values, i.e. 'completion' ###
    final.dataset <- x
    for(j in 4:Ncol) {
        data.tmp <- matrix(nrow = dim(dataset.work)[1], ncol=0)
        for(i in 1:minp) {
            data.tmp <- cbind(data.tmp, datasets[[i]][,j])
        }
        data.tmp.2.mean <- apply(X = data.tmp, FUN = mean, MARGIN = 1, na.rm=T)
        dataset.work[,j] <- data.tmp.2.mean
    }
    
    colnames.x <- colnames(x)
    final.dataset[, col.covar.ind] <- dataset.work[,4:Ncol]
    colnames(final.dataset) <- colnames(x)
    
    res <- list(imputed=final.dataset, imputations=datasets, data.long.format=dataset)
    res
}

Try the stpm package in your browser

Any scripts or data that you put into this service are public.

stpm documentation built on Sept. 5, 2022, 5:06 p.m.