TrueTrait <- function(datX, y, datXtest = NULL, corFnc = "bicor", corOptions = "use = 'pairwise.complete.obs'", LeaveOneOut.CV = FALSE, skipMissingVariables = TRUE, addLinearModel = FALSE) {
datX <- as.matrix(datX)
no.variables <- dim(as.matrix(datX))[[2]]
datVariableInfo <- data.frame(matrix(NA, nrow = no.variables, ncol = 4))
names(datVariableInfo) <- c("Variable", "center", "scale", "weights.y.true2")
if (is.null(colnames(datX))) {
datVariableInfo$Variable <- 1:no.variables
} else {
datVariableInfo$Variable <- colnames(datX)
}
no.observations <- dim(as.matrix(datX))[[1]]
if (no.observations != length(y)) {
stop("The number of rows of datX does not correspond to the length of y. Consider transposing your input matrix or use a different input.")
}
if (no.observations == 1) {
warning("Only 1 observations, i.e. the length of y is 1. The function cannot be used. For your convenience, the estimated true values will be set to the input value of y.")
y.true1 <- y
y.true2 <- y
y.true3 <- y
}
if (no.observations > 1) {
y.true1 <- rep(NA, length(y))
y.true2 <- rep(NA, length(y))
y.true3 <- rep(NA, length(y))
y.lm <- rep(NA, length(y))
restNonMissingY <- !is.na(y)
r.characteristic <- NA
SD.ytrue2 <- NA
SD.ytrue3 <- NA
SsquaredBE <- NA
if (sum(restNonMissingY, na.rm = TRUE) > 3) {
corX <- parse(text = paste(corFnc, "(datX,y ", prepComma(corOptions), ")"))
rVector <- as.numeric(eval(corX))
datCoef <- t(coef(lm(datX ~ y, na.action = "na.exclude")))
datVariableInfo$center <- datCoef[, 1] # intercept
datVariableInfo$scale <- datCoef[, 2] # slope
datXscaled <- scale(datX, center = datCoef[, 1], scale = datCoef[, 2])
weights0 <- rVector^2 / ((1 - rVector^2) * var(y, na.rm = TRUE)) # Steve, this is where I made the one change
weights <- weights0 / sum(weights0)
datVariableInfo$weights.y.true2 <- weights
y.true1 <- as.numeric(apply(as.matrix(datXscaled), 1, mean))
y.true2 <- as.numeric(as.matrix(datXscaled) %*% weights)
if (skipMissingVariables) {
y.true1 <- as.numeric(apply(as.matrix(datXscaled), 1, mean, na.rm = TRUE))
weightsMatrix <- matrix(weights, byrow = TRUE, nrow = dim(as.matrix(datXscaled))[[1]], ncol = length(weights))
weightsMatrix[is.na(datXscaled)] <- 0
rowsum.weightsMatrix <- apply(as.matrix(weightsMatrix), 1, sum)
weightsMatrix <- t(scale(t(as.matrix(weightsMatrix)), center = F, scale = rowsum.weightsMatrix))
datXscaledweighted <- as.matrix(datXscaled * weightsMatrix)
# this corresponds to formula 25 in Klemera et al 2006
y.true2 <- as.numeric(apply(datXscaledweighted, 1, sum, na.rm = TRUE))
} # end of if (skipMissingVariables )
# the following is different from Klemera in that it has an absolute value
r.characteristic <- sum(rVector^2 / sqrt(1 - rVector^2)) / sum(abs(rVector) / sqrt(1 - rVector^2))
no.missing <- sum(apply(as.matrix(is.na(datX)), 1, sum))
if (sum(no.missing) > 0) {
warning("The input datX contains missing values.\n
I recommend you impute missing values in datX before running this function. ")
} # end of if (sum(no.missing)>0)
# formula 37 from Klemera
SsquaredBE <- var(y.true2 - y, na.rm = TRUE) - (1 - r.characteristic^2) / r.characteristic^2 * var(y, na.rm = TRUE) / no.variables
# this corresponds to formula 34 in Klemera
y.true3 <- (as.numeric(as.matrix(datXscaled) %*% weights0) + y / SsquaredBE) / (sum(weights0) + 1 / SsquaredBE)
y.true3[is.na(y.true3)] <- y[is.na(y.true3)]
} # end of if (no.observations>1 )
SD.ytrue2 <- sqrt(1 - r.characteristic^2) / r.characteristic * sqrt(var(y, na.rm = TRUE) / no.variables)
# now formula 42
SD.ytrue3 <- SD.ytrue2 / sqrt(1 + SD.ytrue2^2 / SsquaredBE)
} # end of if (sum(restNonMissingY,na.rm=TRUE) >3 )
datEstimates <- data.frame(y, y.true1, y.true2, y.true3)
if (!is.null(datXtest)) {
datXtest <- as.matrix(datXtest)
no.variablestest <- dim(as.matrix(datXtest))[[2]]
if (no.variablestest != no.variables) {
stop("the number of variables in the test data is not the same as in the training data")
}
y.true1test <- rep(NA, length(y))
y.true2test <- rep(NA, length(y))
y.true3test <- rep(NA, length(y))
restNonMissingY <- !is.na(y)
if (sum(restNonMissingY, na.rm = TRUE) > 3) {
datXtestscaled <- scale(datXtest, center = datCoef[, 1], scale = datCoef[, 2])
y.true1test <- as.numeric(apply(as.matrix(datXtestscaled), 1, mean))
y.true2test <- as.numeric(as.matrix(datXtestscaled) %*% weights)
if (skipMissingVariables) {
y.true1test <- as.numeric(apply(as.matrix(datXtestscaled), 1, mean, na.rm = TRUE))
weightsMatrixtest <- matrix(weights, byrow = TRUE, nrow = dim(as.matrix(datXtestscaled))[[1]], ncol = length(weights))
weightsMatrixtest[is.na(datXtestscaled)] <- 0
rowsum.weightsMatrixtest <- apply(as.matrix(weightsMatrixtest), 1, sum)
weightsMatrixtest <- t(scale(t(as.matrix(weightsMatrixtest)), center = F, scale = rowsum.weightsMatrixtest))
datXscaledweightedtest <- as.matrix(datXtestscaled * weightsMatrixtest)
# this corresponds to formula 25 in Klemera et al 2006
y.true2test <- as.numeric(apply(datXscaledweightedtest, 1, sum, na.rm = TRUE))
} # end of if (skipMissingVariables )
} # end of if (sum(restNonMissingY,na.rm=TRUE) >3 )
datEstimatestest <- data.frame(y.true1 = y.true1test, y.true2 = y.true2test)
} # end of if (!is.null(datXtest))
if (LeaveOneOut.CV) {
y.true1test.LOO <- rep(NA, no.observations)
y.true2test.LOO <- rep(NA, no.observations)
y.lmLOO <- rep(NA, no.observations)
for (i in 1:no.observations) {
rm(datCoef)
rm(corX)
datX.LOO <- datX[-i, ]
datXtest.LOO <- matrix(datX[i, ], nrow = 1)
y.LOO <- y[-i]
no.variables <- dim(as.matrix(datX.LOO))[[2]]
no.observations <- dim(as.matrix(datX.LOO))[[1]]
if (no.observations == 1) {
warning("When dealing with leave one out cross validation, there is only 1 observations in the training data")
}
if (no.observations > 1) {
if (addLinearModel) {
lmLOO <- lm(y.LOO ~ ., data = data.frame(datX.LOO), na.action = na.exclude)
y.lmLOO[i] <- sum(datXtest.LOO * lmLOO$coeff[-1]) + lmLOO$coeff[[1]]
}
corX <- parse(text = paste(
corFnc, "(datX.LOO,y.LOO ",
prepComma(corOptions), ")"
))
rVector <- as.numeric(eval(corX))
datCoef <- t(coef(lm(datX.LOO ~ y.LOO, na.action = "na.exclude")))
datX.LOOscaled <- scale(datX.LOO, center = datCoef[, 1], scale = datCoef[, 2])
weights0 <- rVector^2 / (1 - rVector^2)
weights <- weights0 / sum(weights0)
datXtest.LOOscaled <- (datXtest.LOO - datCoef[, 1]) / datCoef[, 2]
y.true1test.LOO[i] <- mean(datXtest.LOOscaled)
y.true2test.LOO[i] <- sum(datXtest.LOOscaled * weights)
if (skipMissingVariables) {
y.true1test.LOO[i] <- mean(datXtest.LOOscaled, na.rm = TRUE)
weightsMatrixLOO <- weights
weightsMatrixLOO[is.na(datXtest.LOOscaled)] <- 0
rowsum.weightsMatrixLOO <- sum(weightsMatrixLOO)
weightsMatrixLOO <- weightsMatrixLOO / rowsum.weightsMatrixLOO
datXscaledweightedLOO <- datXtest.LOOscaled * weightsMatrixLOO
y.true2test.LOO[i] <- sum(datXscaledweightedLOO, na.rm = TRUE)
} # end of if (skipMissingVariables )
} # end of for loop
} # end of if (no.observations>1 )
datEstimates.LeaveOneOut.CV <- data.frame(
y.true1 = y.true1test.LOO,
y.true2 = y.true2test.LOO
)
} # end of if ( LeaveOneOut.CV )
if (addLinearModel) {
y.lmTest <- rep(NA, dim(as.matrix(datXtest))[[1]])
restNonMissingY <- !is.na(y)
if (sum(restNonMissingY, na.rm = TRUE) > 3) {
lm1 <- lm(y ~ ., data = data.frame(datX), na.action = na.exclude)
y.lmTraining <- predict(lm1)
y.lmTraining <- predict(lm1)
if (!is.null(datXtest)) {
y.lmTest <- predict(lm1, newdata = data.frame(datXtest))
}
}
}
if (!is.null(datXtest) & LeaveOneOut.CV & !addLinearModel) {
out <- list(
datEstimates = datEstimates, datEstimatestest = datEstimatestest,
datEstimates.LeaveOneOut.CV = datEstimates.LeaveOneOut.CV, SD.ytrue2 = SD.ytrue2, SD.ytrue3 = SD.ytrue3, datVariableInfo = datVariableInfo
)
}
if (!is.null(datXtest) & !LeaveOneOut.CV & !addLinearModel) {
out <- list(datEstimates = datEstimates, datEstimatestest = datEstimatestest, SD.ytrue2 = SD.ytrue2, SD.ytrue3 = SD.ytrue3, datVariableInfo = datVariableInfo)
}
if (is.null(datXtest) & LeaveOneOut.CV & !addLinearModel) {
out <- list(datEstimates = datEstimates, datEstimates.LeaveOneOut.CV = datEstimates.LeaveOneOut.CV, SD.ytrue2 = SD.ytrue2, SD.ytrue3 = SD.ytrue3, datVariableInfo = datVariableInfo)
}
if (is.null(datXtest) & !LeaveOneOut.CV & !addLinearModel) {
out <- list(datEstimates = datEstimates, SD.ytrue2 = SD.ytrue2, SD.ytrue3 = SD.ytrue3, datVariableInfo = datVariableInfo)
}
if (!is.null(datXtest) & LeaveOneOut.CV & addLinearModel) {
out <- list(
datEstimates = data.frame(datEstimates, y.lm = y.lmTraining), datEstimatestest = data.frame(datEstimatestest, y.lm = y.lmTest),
datEstimates.LeaveOneOut.CV = data.frame(datEstimates.LeaveOneOut.CV, y.lm = y.lmLOO), SD.ytrue2 = SD.ytrue2, SD.ytrue3 = SD.ytrue3, datVariableInfo = datVariableInfo
)
}
if (!is.null(datXtest) & !LeaveOneOut.CV & addLinearModel) {
out <- list(datEstimates = data.frame(datEstimates, y.lm = y.lmTraining), datEstimatestest = data.frame(datEstimatestest, y.lm = y.lmTest), SD.ytrue2 = SD.ytrue2, SD.ytrue3 = SD.ytrue3, datVariableInfo = datVariableInfo)
}
if (is.null(datXtest) & LeaveOneOut.CV & addLinearModel) {
out <- list(
datEstimates = data.frame(datEstimates, y.lm = y.lmTraining), datEstimates.LeaveOneOut.CV = data.frame(datEstimates.LeaveOneOut.CV, y.lm = y.lmLOO),
SD.ytrue2 = SD.ytrue2, SD.ytrue3 = SD.ytrue3, datVariableInfo = datVariableInfo
)
}
if (is.null(datXtest) & !LeaveOneOut.CV & addLinearModel) {
out <- list(datEstimates = data.frame(datEstimates, y.lm = y.lmTraining), SD.ytrue2 = SD.ytrue2, SD.ytrue3 = SD.ytrue3, datVariableInfo = datVariableInfo)
}
out
} # end of function
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.