Nothing
dataXY <- function(formula, data,
family = "gaussian",
trialscol = NULL,
trans.power = NULL,
trans.shift = 0,
distord,
CorModels
)
{
##start function here, pass distord to function
## data.orig <- data ## not used again so commented out
##make sure the order of the data matches order in distance matrix
data <- data[distord,]
# get a list of response and covariate names
mod.names <- as.character(attr(terms(formula, data = data),"variables"))
tt <- model.matrix(formula,data)
if(!all(mod.names[-1] %in% colnames(data))) stop("The specified formula causes problems in the dataXY function, try to use a simple formula, avoid indicator functions if you can.")
## Check random and fixed effects are different
if(any(mod.names[-1] %in% CorModels)) stop("Random effects and fixed effects overlap")
## get the number of names ( + 1, as the first is always "list")
nc.tmp <- length(mod.names)
# name of the response variable
response.col <- mod.names[2]
# total number of observations
n.all <- length(data[,1])
# create a vector of all TRUE values
ind.all <- rep(TRUE, times = n.all)
ind.allcov <- ind.all
# if there are any covariates ...
if(nc.tmp > 2) {
# create a FALSE for a record with missing values of the covariates
for(i in 3:nc.tmp) ind.allcov <- ind.allcov & !is.na(data[,mod.names[i]])
}
## Check the random effects also
REind <- which(names(data) %in% CorModels)
if(length(REind)) {
for(ii in REind) ind.allcov <- ind.allcov & !is.na(data[,ii])
}
# sample size without missing covariates
n.allcov <- sum(ind.allcov)
# remove records that had any missing values for any of the covariates
data1 <- data[ind.allcov,]
#standardize numeric covariates and turn character fields to factors
covnames <- NULL
if(nc.tmp > 2) covnames <- mod.names[3:nc.tmp]
# StdXDataFrame <- NULL
if(!is.null(covnames)) {
for(i in 1:length(covnames)) {
# if(is.numeric(data1[,covnames[i]])) {
# xmean <- mean(data1[,covnames[i]])
# xsdev <- sqrt(var(data1[,covnames[i]]))
# data1[,covnames[i]] <- (data1[,covnames[i]] - xmean)/xsdev
# StdXDataFrame <- rbind(StdXDataFrame,
# data.frame(VariableName = covnames[i], Mean = xmean, StdDev = xsdev))
# }
if(is.character(data1[,covnames[i]])) {
data1[,covnames[i]] <- as.factor(data1[,covnames[i]])
}
}
}
# replace response column with all 1's for design matrix of all records
z1 <- data1[,response.col]
data1[,response.col] <- rep(1, times = n.allcov)
mf <- model.frame(formula, data = data1)
mt <- attr(mf, "terms")
X1 <- model.matrix(mt, mf)
# get names for all terms and interactions, including those set to zero
terms.list <- attr(mt,"term.labels")
if(attr(mt,"intercept") == 1) effnames <- "(Intercept)"
else effnames <- "NULL"
if(!is.null(covnames)) {
for (i in 1:length(terms.list)) {
form1 <- formula(paste("~ ", terms.list[i], " - 1"))
Xt <- model.matrix(form1, data = data1)
effnames <- c(effnames, colnames(Xt))
}
}
setzero <- !(effnames %in% colnames(X1))
# check for estimability
X1re <- rref(X1)
# create X1 with estimable functions
cutX1toX2 <- apply(abs(X1re),2,sum) == 1
if(any(cutX1toX2 == FALSE)) X1 <- X1[,cutX1toX2]
setNA <- !(effnames %in% colnames(X1)) & !setzero
data1[,response.col] <- z1
#indicator vector of all records without missing covariates or response
ind.allxy <- ind.allcov & !is.na(data[,response.col])
#sample size of all records without missing covariates or response
n.allxy <- sum(ind.allxy)
# indicator vector of data1 without missing covariates
ind.ysubx <- !is.na(data1[response.col])
# data set of all records without missing covariates or response
data2 <- data[ind.allxy,]
# create a working column of the response variable
data2[,"work.col"] <- data2[,response.col]
# transformations on the working column
if(!is.null(trans.power)) {
if(family != "gaussian")
return(print("Power transformation can only be used with gaussian family"))
if(trans.power < 0) return(print("Power transformation must be > 0"))
if(trans.power > .5) return(print("Power transformation must be < 0.5"))
if(trans.power == 0) {
if(any((data2[,response.col] + trans.shift) <= 0))
return(print("Data must be > 0 to use log transformation"))
data2[,"work.col"] <- log(data2[,response.col] + trans.shift)
} else
data2[,"work.col"] <- (data[,response.col] + trans.shift)^trans.power
}
X1 <- as.matrix(X1)
X2 <- X1[ind.ysubx,]
X2 <- as.matrix(X2)
# check for estimability
X2re <- rref(X2)
# create X1 with estimable functions
cutX1toX2 <- apply(abs(X2re),2,sum) == 1
if(any(cutX1toX2 == FALSE)) X2 <- X2[,cutX1toX2]
setNA2 <- !(effnames %in% colnames(X2))
# data set of observed data only
if(is.factor(data2[,"work.col"]))
data2[,"work.col"] <- as.numeric(as.character(data2[,"work.col"]))
if(is.character(data2[,"work.col"]))
data2[,"work.col"] <- as.numeric(data2[,"work.col"])
# vector of observed response variable
z <- as.matrix(data2[,"work.col"], ncol = 1)
attr(z,"pid") <- data2[,"pid"]
trialsvec <- NULL
# if response variable is binomial with n trials change z to proportion
if(!is.null(trialscol) & family == "binomial"){
if(is.factor(data2[,"trialscol"]))
data2[,"trialscol"] <- as.numeric(as.character(data2[,"trialscol"]))
if(is.character(data2[,"trialscol"]))
data2[,"trialscol"] <- as.numeric(data2[,"trialscol"])
trialsvec <- data2[,"trialscol"]
z <- as.matrix(z/trialsvec, ncol = 1)
}
# else if response is Bernoulli, set trialsvec to all ones
if(is.null(trialscol) & family == "binomial")
trialsvec <- rep(1, times = n.allxy)
# if any missing response values, design matrix of missing records
data.na <- NULL
ind.RespNA <- NULL
if((n.allcov - n.allxy) > 0) {
data.na <- data1[!ind.ysubx,]
ind.RespNA <- data[,"pid"] %in% data.na[,"pid"]
}
# get the rank of the design matrix
p <- sum(svd(X1)$d>1e-10)
## GET REs
REs <- NULL
REmodelmatrices <- NULL
REind <- which(names(data) %in% CorModels)
if(length(REind)) {
REs <- list()
REnames <- sort(names(data)[REind])
## model matrix for a RE factor
for(ii in 1:length(REind)) REmodelmatrices[[REnames[ii]]] <- model.matrix(~data[ind.allxy,REnames[ii]] - 1)
## corresponding block matrix
for(ii in 1:length(REind)) REs[[ii]] <- tcrossprod(REmodelmatrices[[ii]])
names(REs) <- REnames
names(REmodelmatrices) <- REnames
}
outpt <- list(
datasets = list(
data0 = data,
data1 = data1,
data2 = data2,
data.na = data.na
),
indvecs = list(
ind.allcov = ind.allcov,
ind.allxy = ind.allxy,
ind.ysubx = ind.ysubx,
ind.RespNA = ind.RespNA,
setzero = setzero,
setNA = setNA,
setNA2 = setNA2,
cutX1toX2 = cutX1toX2
),
sampsizes = list(
n.all = n.all,
n.allcov = n.allcov,
n.allxy = n.allxy
),
Xmats = list(
X1 = X1,
X2 = X2,
p = p,
## StdXDataFrame = StdXDataFrame,
effnames = effnames
),
respvecs = list(
response.col = response.col,
z = as.matrix(z, ncol = 1),
trialsvec = trialsvec
),
REs = REs,
REmodelmatrices = REmodelmatrices
)
outpt
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.