Nothing
.onAttach <- function(...) {
packageStartupMessage("User manual in ../library/REGENT/doc/REGENT_usermanual.pdf")
}
REGENT.model<-function (AnalysisName, LocusFile = NULL, EnvFile = NULL, prev = 0.001,
cv = 0.05, alpha = 0.05, sims = 1e+05, indsims = 1e+05, SmallSampAdjust = 0.5,
BaseRange = 0.01, PlotMax = 5, Block = 100)
{
print(paste("Analysis started at", date(), sep = " "))
log = paste("Analysis started at", date(), sep = " ")
print("Reading input...")
log = c(log, "Reading input...")
if (is.null(LocusFile) && is.null(EnvFile)) {
print("ERROR: No data available - specify LocusFile or EnvFile")
log = c(log, "ERROR: No data available - specify LocusFile or EnvFile")
return(NULL)
}
if ((sims < 1e+05) && (sims > 9999)) {
print("Warning: low number of simulations, confidence intervals may be inaccurate")
log = c(log, "Warning: low number of simulations, confidence intervals may be inaccurate")
}
if (sims < 10000) {
print("Warning: very low number of simulations, confidence intervals likely to be inaccurate!!")
log = c(log, "Warning: very low number of simulations, confidence intervals likely to be inaccurate!!")
}
if ((indsims < 1e+05) && (indsims > 9999)) {
print("Warning: low number of individuals simulationed, relatively common multilocus genotypes may be missed")
log = c(log, "Warning: low number of individuals simulationed, relatively common multilocus genotypes may be missed")
}
if (indsims < 10000) {
print("Warning: very low number of simulations, relatively common multilocus genotypes likely to be missed!!")
log = c(log, "Warning: very low number of simulations, relatively common multilocus genotypes likely to be missed!!")
}
Geno = TRUE
if (is.null(LocusFile))
Geno = FALSE
Env = TRUE
if (is.null(EnvFile))
Env = FALSE
if (Geno) {
GenoIn = read.table(LocusFile, header = TRUE)
path = unlist(strsplit(LocusFile, split = "/"))
}
else {
path = unlist(strsplit(EnvFile, split = "/"))
}
path = path[-length(path)]
if (length(path) > 0)
setwd(paste(path, collapse = "/"))
if (Geno) {
if (sum(c("SNP", "MAF", "Ncase", "Ncontrol") %in% colnames(GenoIn)) <
4) {
print("ERROR: Variables missing from input. Ensure SNP name (SNP), Minor Allele Frequency (MAF), number of cases (Ncase) and number of controls (Ncontrol) are specified in locus file")
log = c(log, "ERROR: Variables missing from input. Ensure SNP name (SNP), Minor Allele Frequency (MAF), number of cases (Ncase) and number of controls (Ncontrol) are specified in locus file")
return(NULL)
}
p = GenoIn[, match("MAF", colnames(GenoIn))]
ncase = GenoIn[, match("Ncase", colnames(GenoIn))]
ncontr = GenoIn[, match("Ncontrol", colnames(GenoIn))]
if ("RR" %in% colnames(GenoIn)) {
OR = GenoIn[, match("RR", colnames(GenoIn))]
OR[p > 0.5] = 1/OR[p > 0.5]
MltplctvOR = TRUE
}
else {
if (("RR_het" %in% colnames(GenoIn)) && ("RR_hom" %in%
colnames(GenoIn))) {
OR = cbind(GenoIn[, match("RR_het", colnames(GenoIn))],
GenoIn[, match("RR_hom", colnames(GenoIn))])
OR[p > 0.5, ] = 1/OR[p > 0.5, ]
MltplctvOR = FALSE
}
else {
print("ERROR: Ensure risk ratios are present, with column name OR for allelic risk ratios or two columns RR_het and RR_hom for genotypic risk ratios")
log = c(log, "ERROR: Ensure risk ratios are present, with column name RR for allelic risk ratios or two columns RR_het and RR_hom for genotypic odds ratios")
return(NULL)
}
}
p[p > 0.5] = 1 - p[p > 0.5]
q = 1 - p
nSNP = length(as.matrix(OR)[, 1])
}
else {
nSNP = 0
p = integer(0)
q = integer(0)
MltplctvOR = TRUE
OR = NULL
GenoIn = NULL
}
MlvlEF = TRUE
if (Env) {
EnvIn = read.table(EnvFile, header = TRUE)
if (!("Factor" %in% colnames(EnvIn))) {
print("ERROR: Ensure factor name (Factor)is specified in environmental file")
log = c(log, "ERROR: Ensure factor name (Factor)is specified in environmental file")
return(NULL)
}
ORnames = paste("RR", 1:((dim(EnvIn)[2] - 1)/3), sep = "")
if (length(ORnames) == 1) {
ORnames = "RR"
MlvlEF = FALSE
}
Exnames = paste("Exposed", 1:((dim(EnvIn)[2] - 1)/3),
sep = "")
if (length(Exnames) == 1) {
Exnames = "Exposed"
MlvlEF = FALSE
}
SEnames = paste("SE", 1:((dim(EnvIn)[2] - 1)/3), sep = "")
if (length(SEnames) == 1) {
SEnames = "SE"
MlvlEF = FALSE
}
if (MltplctvOR) {
if (!MlvlEF) {
OR = c(OR, EnvIn[, match(ORnames, colnames(EnvIn))])
}
else {
if (length(OR) > 0) {
OR = rbind(as.matrix(cbind(OR, matrix(rep(NA,
length(OR) * (length(ORnames) - 1)), nrow = length(OR)))),
as.matrix(EnvIn[, match(ORnames, colnames(EnvIn))]))
}
else {
OR = as.matrix(EnvIn[, match(ORnames, colnames(EnvIn))])
}
}
}
else {
if (!MlvlEF) {
OR = rbind(OR, cbind(EnvIn[, match("RR", colnames(EnvIn))],
rep(NA, sum("RR" %in% colnames(EnvIn)))))
}
else {
if (length(OR) > 0) {
OR = rbind(as.matrix(cbind(OR, matrix(rep(NA,
length(OR) * (length(ORnames) - 2)), nrow = length(OR)))),
as.matrix(EnvIn[, match(ORnames, colnames(EnvIn))]))
}
else {
OR = as.matrix(EnvIn[, match(ORnames, colnames(EnvIn))])
}
}
}
colnames(OR) = NULL
Ex = as.matrix(EnvIn[, match(Exnames, colnames(EnvIn))])
nEx = 1 - Ex
SE = as.matrix(EnvIn[, match(SEnames, colnames(EnvIn))])
nEnv = dim(as.matrix(OR))[1] - nSNP
if (sum(rowSums(Ex) > 1) > 0) {
print("ERROR: sum of environmental exposure levels greater than 1")
log = c(log, "ERROR: sum of environmental exposure levels greater than 1")
return(NULL)
}
}
else {
nEnv = 0
Ex = integer(0)
nEx = integer(0)
EnvIn = NULL
}
nVar = nSNP + nEnv
print("Simulating population of genotypes...")
log = c(log, "Simulating population of genotypes...")
pop = matrix(nrow = indsims, ncol = nVar, rep(0, nVar * indsims))
if (Geno) {
for (i in 1:nSNP) {
rand = runif(indsims, 0, 1)
pop[rand <= p[i], i] = 1
rand = runif(indsims, 0, 1)
pop[rand <= p[i], i] = pop[rand <= p[i], i] + 1
}
}
if (Env) {
for (i in 1:nEnv) {
rand = runif(indsims, 0, 1)
for (j in dim(Ex)[2]:1) {
pop[rand <= sum(Ex[i, 1:j]), nSNP + i] = j
}
}
}
Mp = table(apply(X = pop, MARGIN = 1, FUN = paste, collapse = ""))/indsims
Combos = t(matrix(as.numeric(unlist(strsplit(rownames(Mp),
split = ""))), nrow = nVar))
nGeno = dim(Combos)[1]
rownames(Mp) = NULL
pop = NULL
print("Calculating point estimates...")
log = c(log, "Calculating point estimates...")
MGRR = rep(1, nGeno)
rsim = rnorm(sims, 1, sd = cv)
if (Geno) {
gsim = array(dim = c(sims, nSNP, 3))
gsim[, , 1] = 1
if (MltplctvOR) {
for (i in 1:nSNP) {
MGRR = MGRR * OR[i]^Combos[, i]
denom = (1 + (OR[i] - 1) * p[i])^2
concas0 = 1/(q[i]^2 * ncontr[i]) + denom/(q[i]^2 *
ncase[i])
gsim[, i, 2] = rlnorm(sims, meanlog = log(OR[i]),
sdlog = sqrt(concas0 + 1/(2 * p[i] * q[i] *
ncontr[i]) + denom/(2 * p[i] * q[i] * OR[i] *
ncase[i])))
gsim[, i, 3] = rlnorm(sims, meanlog = log(OR[i]^2),
sdlog = sqrt(concas0 + 1/(p[i]^2 * ncontr[i] +
SmallSampAdjust) + 1/(((p[i] * OR[i])^2 *
ncase[i]/denom) + SmallSampAdjust)))
lsim = rnorm(sims, mean = p[i], sd = sqrt((p[i] *
q[i])/ncontr[i]))
lsim = (1 + (gsim[, i, 2] - 1) * lsim)^2
lsim = mean(lsim)/lsim
gsim[, i, ] = gsim[, i, ] * lsim
}
}
else {
for (i in 1:nSNP) {
MGRR = MGRR * cbind(rep(1, nSNP), as.matrix(OR)[1:nSNP,
])[i, Combos[, i] + 1]
denom = (1 + (OR[i, 1] - 1) * p[i])^2
concas0 = 1/(q[i]^2 * ncontr[i]) + denom/(q[i]^2 *
ncase[i])
gsim[, i, 2] = rlnorm(sims, meanlog = log(OR[i,
1]), sdlog = sqrt(concas0 + 1/(2 * p[i] * q[i] *
ncontr[i]) + denom/(2 * p[i] * q[i] * OR[i,
1] * ncase[i])))
gsim[, i, 3] = rlnorm(sims, meanlog = log(OR[i,
2]), sdlog = sqrt(concas0 + 1/(p[i]^2 * ncontr[i] +
SmallSampAdjust) + 1/(((p[i]^2 * OR[i, 2]) *
ncase[i]/denom) + SmallSampAdjust)))
lsim = rnorm(sims, mean = p[i], sd = sqrt((p[i] *
q[i])/ncontr[i]))
lsim = (1 + (gsim[, i, 2] - 1) * lsim)^2
lsim = mean(lsim)/lsim
gsim[, i, ] = gsim[, i, ] * lsim
}
}
}
if (Env) {
esim = array(dim = c(sims, nEnv, dim(Ex)[2] + 1))
esim[, , 1] = 1
for (i in 1:nEnv) {
MGRR = MGRR * (c(1, as.matrix(OR)[nSNP + i, ])[!is.na(c(1,
as.matrix(OR)[nSNP + i, ]))][Combos[, nSNP +
i] + 1])
for (j in 2:(dim(Ex)[2] + 1)) {
esim[, i, j] = rlnorm(sims, meanlog = log(as.matrix(OR)[nSNP +
i, j - 1]), sdlog = as.matrix(SE)[i, j - 1])
}
}
}
print("Simulating for confidence intervals...")
log = c(log, "Simulating for confidence intervals...")
upper = vector(length = nGeno)
lower = upper
for (i in seq(0, nGeno - 1, by = Block)) {
if (i <= (nGeno) - Block) {
risksim = matrix(rep(rsim, Block), nrow = sims, ncol = Block)
if (Geno) {
for (j in 1:nSNP) {
risksim = risksim * gsim[, j, Combos[(i + 1):(i +
Block), j] + 1]
}
}
if (Env) {
for (j in 1:nEnv) {
risksim = risksim * esim[, j, Combos[(i + 1):(i +
Block), nSNP + j] + 1]
}
}
sorted = as.matrix(apply(X = risksim, MARGIN = 2,
FUN = quantile, probs = c(alpha/2, 1 - (alpha/2))),
nrow = 2)
lower[(i + 1):(i + Block)] = sorted[1, ]
upper[(i + 1):(i + Block)] = sorted[2, ]
}
else {
risksim = matrix(rep(rsim, nGeno - i), nrow = sims,
ncol = nGeno - i)
if (Geno) {
for (j in 1:nSNP) {
risksim = risksim * gsim[, j, Combos[(i + 1):nGeno,
j] + 1]
}
}
if (Env) {
for (j in 1:nEnv) {
risksim = risksim * esim[, j, Combos[(i + 1):nGeno,
nSNP + j] + 1]
}
}
sorted = as.matrix(apply(X = risksim, MARGIN = 2,
FUN = quantile, probs = c(alpha/2, 1 - (alpha/2))),
nrow = 2)
lower[(i + 1):nGeno] = sorted[1, ]
upper[(i + 1):nGeno] = sorted[2, ]
}
}
ranks = order(MGRR, decreasing = FALSE)
MGRR = MGRR[ranks]
Mp = Mp[ranks]
upper = upper[ranks]
lower = lower[ranks]
Combos = Combos[ranks, ]
b = order(abs(MGRR - sum(Mp * MGRR)), decreasing = FALSE)[1]
tot = Mp[b]
b2 = b
i = 1
while (tot < BaseRange) {
tot = tot + Mp[b + i]
b2 = c(b2, b + i)
tot = tot + Mp[b - i]
b2 = c(b - i, b2)
i = i + 1
}
baseline = (sum(MGRR[b2] * Mp[b2])/sum(Mp[b2]))
zeroGenType = rep(0, nSNP)
zeroGenType[GenoIn$MAF > 0.5] = 2
zeroGenType = c(zeroGenType, rep(0, nEnv))
baseline2 = 1
if (MltplctvOR) {
if (Geno)
baseline2 = prod(as.matrix(OR)[1:nSNP, 1]^zeroGenType[1:nSNP])
if (Env) {
for (i in 1:nEnv) baseline2 = baseline2 * (c(1, as.matrix(OR)[nSNP +
i, ])[!is.na(c(1, as.matrix(OR)[nSNP + i, ]))][zeroGenType[nSNP +
i] + 1])
}
}
else {
if (Geno)
baseline2 = baseline2 * prod(as.vector(t(cbind(rep(1,
nSNP), OR[1:nSNP, ])))[(0:2)[zeroGenType[1:nSNP] +
1] + seq(1, 3 * nSNP, by = 3)])
if (Env) {
for (i in 1:nEnv) baseline2 = baseline2 * (c(1, as.matrix(OR)[nSNP +
i, ])[!is.na(c(1, as.matrix(OR)[nSNP + i, ]))][zeroGenType[nSNP +
i] + 1])
}
}
baseline2 = baseline/baseline2
MGRRb = MGRR/baseline
upper = upper/baseline
lower = lower/baseline
print("Calculating risk categories...")
log = c(log, "Calculating risk categories...")
LowerAv = sum(lower[b2] * Mp[b2])/sum(Mp[b2])
UpperAv = sum(upper[b2] * Mp[b2])/sum(Mp[b2])
LowerHi = upper[(1:nGeno)[lower > UpperAv][1]]
bins = seq(log(min(MGRR)), log(max(MGRR)), by = (log(max(MGRR)) -
log(min(MGRR)))/1000)
binfreq = vector(length = length(bins) - 1)
binlow = binfreq
binhigh = binfreq
for (i in 1:(length(bins) - 2)) {
index = (1:nGeno)[log(MGRR) >= bins[i]]
binfreq[i] = sum(Mp[index[log(MGRR)[index] < bins[i +
1]]])
binhigh[i] = sum(upper[index[log(MGRR)[index] < bins[i +
1]]] * Mp[index[log(MGRR)[index] < bins[i + 1]]])/binfreq[i]
binlow[i] = sum(lower[index[log(MGRR)[index] < bins[i +
1]]] * Mp[index[log(MGRR)[index] < bins[i + 1]]])/binfreq[i]
}
i = i + 1
index = (1:nGeno)[log(MGRR) >= bins[i]]
binfreq[i] = sum(Mp[index[log(MGRR)[index] <= bins[i + 1]]])
binhigh[i] = sum(upper[index[log(MGRR)[index] <= bins[i +
1]]] * Mp[index[log(MGRR)[index] <= bins[i + 1]]])/binfreq[i]
binlow[i] = sum(lower[index[log(MGRR)[index] <= bins[i +
1]]] * Mp[index[log(MGRR)[index] <= bins[i + 1]]])/binfreq[i]
write.table(cbind(exp(bins[-length(bins)])/baseline, exp(bins[-1])/baseline,
binfreq, binlow, binhigh), file = paste(AnalysisName,
"_RRDist.txt", sep = ""), col.names = c("Lower_risk",
"Upper_risk", "Frequency", "Mean_lowerCI", "Mean_UpperCI"),
row.names = FALSE, quote = FALSE)
out = matrix(nrow = 4, ncol = 2)
rownames(out) = c("Reduced", "Average", "Elevated", "High")
colnames(out) = c("LowerCI", "UpperCI")
out[1, ] = c(0, LowerAv)
out[2, ] = c(LowerAv, UpperAv)
out[3, ] = c(UpperAv, LowerHi)
out[4, ] = c(LowerHi, Inf)
out = round(out, digits = 3)
options(warn = -1)
write.table(x = paste("##REGENTv1.0 by G. Goddard, D. Crouch & C. Lewis##",
sep = ""), file = paste(AnalysisName, ".txt", sep = ""),
quote = FALSE, row.names = FALSE, col.names = FALSE)
write.table(x = out, file = paste(AnalysisName, ".txt", sep = ""),
quote = FALSE, row.names = TRUE, col.names = TRUE, append = TRUE)
if (Geno) {
GenoIn = GenoIn[, colnames(GenoIn) %in% c("SNP", "MAF",
"RR", "RR_het", "RR_hom", "Ncase", "Ncontrol")]
if ("RR" %in% colnames(GenoIn)) {
GenoIn = GenoIn[, match(colnames(GenoIn), c("SNP",
"MAF", "RR", "Ncase", "Ncontrol"))]
}
else {
GenoIn = GenoIn[, match(colnames(GenoIn), c("SNP",
"MAF", "RR_het", "RR_hom", "Ncase", "Ncontrol"))]
}
}
if (Env) {
RRvec = grep("RR", colnames(EnvIn))
SEvec = grep("SE", colnames(EnvIn))
EXvec = grep("Exposed", colnames(EnvIn))
EnvIn = EnvIn[, c(match("Factor", colnames(EnvIn)), EXvec,
RRvec, SEvec)]
}
out = list(categories = as.matrix(out), baseline = round(baseline2,
digits = 3), LocusFile = GenoIn, EnvFile = EnvIn)
write.table(x = paste(rep("#", 50), collapse = ""), file = paste(AnalysisName,
".txt", sep = ""), quote = FALSE, row.names = FALSE,
col.names = FALSE, append = TRUE)
EnvName = EnvFile
if (is.null(EnvName))
EnvName = "-"
GenoName = LocusFile
if (is.null(GenoName))
GenoName = "-"
write.table(cbind(c("Analysis name:", "Locus file:", "Environment file:",
"Disease prevalence:", "Coefficient of variation:", "alpha value:",
"Number of RR simulations:", "Size of simulated population:",
"Genotypes in RAM ('Block'):", "Small sample adjustment:",
"Proportion of population used to calculate baseline confidence intervals:",
"Baseline RR:"), c(AnalysisName, GenoName, EnvName, prev,
cv, alpha, sims, indsims, Block, SmallSampAdjust, BaseRange,
round(baseline2, digits = 3))), file = paste(AnalysisName,
".txt", sep = ""), quote = FALSE, col.names = FALSE,
row.names = FALSE, append = TRUE)
write.table(x = paste(rep("#", 50), collapse = ""), file = paste(AnalysisName,
".txt", sep = ""), quote = FALSE, row.names = FALSE,
col.names = FALSE, append = TRUE)
#prev = prev/(1 - prev)
vy = c(0, cumsum(rev(Mp * MGRR/sum(MGRR * Mp))))
vx = c(0, cumsum(rev(Mp * (1 - prev * MGRR/sum(MGRR * Mp))/(1 -
prev))))
auc = round(sum((vx[2:length(vx)] - vx[1:(length(vx) - 1)]) *
(vy[2:length(vy)] + vy[1:(length(vy) - 1)]) * 0.5), digits = 3)
Categories = rep(2, nGeno)
Categories[upper < LowerAv] = 1
Categories[lower > UpperAv] = 3
Categories[lower > LowerHi] = 4
write.table(cbind(c("Reduced", "Average", "Elevated", "High",
"", "AUC"), c(round(sum(Mp[Categories == 1]), digits = 3),
1 - sum(c(round(sum(Mp[Categories == 1]), digits = 3),
round(sum(Mp[Categories == 3]), digits = 3), round(sum(Mp[Categories ==
4]), digits = 3))), round(sum(Mp[Categories ==
3]), digits = 3), round(sum(Mp[Categories == 4]),
digits = 3), "", auc)), file = paste(AnalysisName,
".txt", sep = ""), quote = FALSE, col.names = c("Risk_Category",
"Proportion_of_population"), row.names = FALSE, append = TRUE)
write.table(x = paste(rep("#", 50), collapse = ""), file = paste(AnalysisName,
".txt", sep = ""), quote = FALSE, row.names = FALSE,
col.names = FALSE, append = TRUE)
if (Geno) {
write.table(GenoIn, file = paste(AnalysisName, ".txt",
sep = ""), append = TRUE, col.names = TRUE, row.names = FALSE,
quote = FALSE)
write.table(x = paste(rep("#", 50), collapse = ""), file = paste(AnalysisName,
".txt", sep = ""), quote = FALSE, row.names = FALSE,
col.names = FALSE, append = TRUE)
}
if (Env) {
write.table(EnvIn, file = paste(AnalysisName, ".txt",
sep = ""), append = TRUE, col.names = TRUE, row.names = FALSE,
quote = FALSE)
write.table(x = paste(rep("#", 50), collapse = ""), file = paste(AnalysisName,
".txt", sep = ""), quote = FALSE, row.names = FALSE,
col.names = FALSE, append = TRUE)
}
options(warn = 0)
print("Printing graphs...")
log = c(log, "Printing graphs...")
PlotMax2 = PlotMax
if (PlotMax > max(MGRRb))
PlotMax2 = max(MGRRb)
colvec = rep(4, nGeno)
colvec[Categories == 1] = 3
colvec[Categories == 3] = 2
colvec[Categories == 4] = 1
colvec2 = rainbow(6, alpha = 0.5)[colvec]
colvec3 = rainbow(6)[colvec]
colkey = rainbow(6)[c(1, 2, 4, 3)][c(4, 3, 2, 1) %in% Categories]
tiff(filename = paste(AnalysisName, "_RRDistCol.TIF", sep = ""),
pointsize = 30, width = 1500, height = 1500, units = "px",
antialias = "default")
barplot(MGRRb, width = Mp, border = NA, ylab = "Risk Ratio (Rebased)",
ylim = c(0, PlotMax2), space = 0, xlab = "Percentage of population",
col = colvec2, density = -100)
legend(x = 0, y = PlotMax2, as.factor(c("High", "Elevated",
"Average", "Reduced")[c(4, 3, 2, 1) %in% Categories]),
cex = 0.8, col = colkey, pch = 19, title = "Risk")
axis(1, seq(0, 1, by = 0.1), c("0%", "", "", "", "", "50%",
"", "", "", "", "100%"))
abline(1, 0, col = grey(0.75))
if (sum(MGRRb > PlotMax) > 0) {
par(xpd = TRUE)
arrows(1.03, PlotMax - (PlotMax/5), 1.03, PlotMax, col = colkey[1])
}
quiet <- dev.off()
tiff(filename = paste(AnalysisName, "_RRDistGrey.TIF", sep = ""),
pointsize = 30, width = 1500, height = 1500, units = "px",
antialias = "default")
barplot(MGRRb, width = Mp, border = NA, ylab = "Risk Ratio (Rebased)",
ylim = c(0, PlotMax2), space = 0, xlab = "Percentage of population",
density = -100)
axis(1, seq(0, 1, by = 0.1), c("0%", "", "", "", "", "50%",
"", "", "", "", "100%"))
abline(1, 0, col = 1)
if (sum(MGRRb > PlotMax) > 0) {
par(xpd = TRUE)
arrows(1.03, PlotMax - (PlotMax/5), 1.03, PlotMax, col = grey(0.75))
}
quiet <- dev.off()
print(paste("Analysis completed at", date(), sep = " "))
log = c(log, paste("Analysis completed at", date(), sep = " "))
write.table(as.matrix(log), file = paste(AnalysisName, ".txt",
sep = ""), quote = FALSE, row.names = FALSE, col.names = FALSE,
append = TRUE)
return(out)
}
REGENT.predict<-function(AnalysisName,model,ind,prev=0.001,cv=0.05,sims=100000,Block=100,alpha=0.05,SmallSampAdjust=0.5){
#######################################################################
#Read in the model and locus files/objects
#######################################################################
print(paste("Analysis started at",date(),sep=" "))
log=paste("Analysis started at",date(),sep=" ")
print("Reading input...");log=c(log,"Reading input...")
ModelIn=NULL
if(is.character(model)){
ModelIn$categories=read.table(model,comment.char="#",nrows=4)
ModelIn$baseline=as.numeric(scan(model,skip=18,comment.char="#",what=character(0),quiet=TRUE)[3])
ModelIn2=scan(model,what=character(0),comment.char="#",quiet=TRUE,skip=27)
if("SNP"%in%ModelIn2){
ModelIn3=ModelIn2[1: sort(match(c("Factor","Analysis"),ModelIn2)[!is.na(match(c("Factor","Analysis"),ModelIn2))],decreasing=FALSE)[1]-1]
cols=6
if("RR"%in%ModelIn3)cols=5
ModelIn3=matrix(ModelIn3,ncol=cols,byrow=TRUE)
if(dim(ModelIn3)[1]<3){ModelIn3=rbind(ModelIn3,ModelIn3[2,]);oneRow=TRUE}else{oneRow=FALSE}
cols=ModelIn3[1,];ModelIn3=ModelIn3[-1,]
rows=as.matrix(ModelIn3)[,1];ModelIn3=as.matrix(ModelIn3)[,-1];mode(ModelIn3)='numeric'
ModelIn3=data.frame(rows,ModelIn3);colnames(ModelIn3)=cols
if(oneRow)ModelIn3=ModelIn3[-dim(ModelIn3)[1],]
ModelIn$LocusFile=ModelIn3
}else{ModelIn$LocusFile=NULL}
if("Factor"%in%ModelIn2){
ModelIn3=ModelIn2[match("Factor",ModelIn2):(match("Analysis",ModelIn2)-1)]
cols=sort(grep("RR",ModelIn3))
cols=cols[cols<grep("SE",ModelIn3)[1]]
ModelIn3=matrix(ModelIn3,ncol=max(cols+length(cols)),byrow=TRUE)
if(dim(ModelIn3)[1]<3){ModelIn3=rbind(ModelIn3,ModelIn3[2,]);oneRow=TRUE}else{oneRow=FALSE}
cols=ModelIn3[1,];ModelIn3=ModelIn3[-1,]
rows=ModelIn3[,1];ModelIn3=ModelIn3[,-1];mode(ModelIn3)='numeric'
ModelIn3=data.frame(rows,ModelIn3);colnames(ModelIn3)=cols
if(oneRow)ModelIn3=ModelIn3[-dim(ModelIn3)[1],]
ModelIn$EnvFile=ModelIn3
}else{ModelIn$EnvFile=NULL}
}else{
ModelIn=model
}
if((sims<100000)&&(sims>9999)){
print("Warning: low number of simulations, confidence intervals may be inaccurate");log=c(log,"Warning: low number of simulations, confidence intervals may be inaccurate")
}
if(sims<10000){
print("Warning: very low number of simulations, confidence intervals likely to be inaccurate!!");log=c(log,"Warning: very low number of simulations, confidence intervals likely to be inaccurate!!")
}
GenoIn=ModelIn$LocusFile;EnvIn=ModelIn$EnvFile
Geno=TRUE;if(is.null(GenoIn)) Geno=FALSE
Env=TRUE;if(is.null(EnvIn)) Env=FALSE
path=unlist(strsplit(AnalysisName,split="/"))
path=path[-length(path)]
if(length(path)>0) setwd(paste(path,collapse="/"))
if(Geno){
if(sum(c("SNP","MAF","Ncase","Ncontrol")%in%colnames(GenoIn))<4){
print("Variables missing from input. Ensure SNP name (SNP), Minor Allele Frequency (MAF), number of cases (Ncase) and number of controls (Ncontrol) are specified in locus file")
log=c(log,"Variables missing from input. Ensure SNP name (SNP), Minor Allele Frequency (MAF), number of cases (Ncase) and number of controls (Ncontrol) are specified in locus file")
return(0)
}
p=GenoIn[,match("MAF",colnames(GenoIn))]
ncase=GenoIn[,match("Ncase",colnames(GenoIn))]
ncontr=GenoIn[,match("Ncontrol",colnames(GenoIn))]
if("RR"%in%colnames(GenoIn)){
OR=GenoIn[,match("RR",colnames(GenoIn))]
MltplctvOR=TRUE
}else{
if(("RR_het"%in%colnames(GenoIn))&& ("RR_hom"%in%colnames(GenoIn))){
OR=cbind(GenoIn[,match("RR_het",colnames(GenoIn))],GenoIn[,match("RR_hom",colnames(GenoIn))])
MltplctvOR=FALSE
}else{
print("Ensure odds ratios are present, with column name RR for allelic odds ratios or two columns RR_het and RR_hom for genotypic odds ratios");log=c(log,"Ensure odds ratios are present, with column name RR for allelic odds ratios or two columns rR_het and RR_hom for genotypic odds ratios")
}
}
q=1-p
nSNP=dim(as.matrix(OR))[1]
}else{GenoIn=NULL;nSNP=0;OR=NULL;p=integer(0);q=integer(0);MltplctvOR=TRUE}
MlvlEF=TRUE
if(Env){
ORnames=paste("RR",1:((dim(EnvIn)[2]-1)/3),sep="")
if(length(ORnames)==1){
ORnames="RR"
MlvlEF=FALSE
}
Exnames=paste("Exposed",1:((dim(EnvIn)[2]-1)/3),sep="")
if(length(Exnames)==1){
Exnames="Exposed"
MlvlEF=FALSE
}
SEnames=paste("SE",1:((dim(EnvIn)[2]-1)/3),sep="")
if(length(SEnames)==1){
SEnames="SE"
MlvlEF=FALSE
}
if(MltplctvOR){
if(!MlvlEF){
OR=c(OR,EnvIn[,match(ORnames,colnames(EnvIn))])
}else{
if(length(OR)>0){OR=rbind(as.matrix(cbind(OR, matrix(rep(NA,length(OR)*(length(ORnames)-1)),nrow=length(OR)))), as.matrix(EnvIn[,match(ORnames,colnames(EnvIn))]))}else{OR=as.matrix(EnvIn[,match(ORnames,colnames(EnvIn))])}
}
}else{
if(!MlvlEF){
OR=rbind(OR,cbind(EnvIn[,match("RR",colnames(EnvIn))],rep(NA,sum("RR"%in%colnames(EnvIn)))))
}else{
if(length(OR)>0){OR=rbind(as.matrix(cbind(OR, matrix(rep(NA,dim(OR)[1]*(length(ORnames)-2)),nrow=dim(OR)[1]))), as.matrix(EnvIn[,match(ORnames,colnames(EnvIn))]))}else{OR=as.matrix(EnvIn[,match(ORnames,colnames(EnvIn))])}
}
}
colnames(OR)=NULL
Ex=as.matrix(EnvIn[,match(Exnames,colnames(EnvIn))])
nEx=1-Ex
SE=as.matrix(EnvIn[,match(SEnames,colnames(EnvIn))])
nEnv=dim(as.matrix(OR))[1]-nSNP
if(sum(rowSums(Ex)>1)>0){
print("ERROR: sum of environmental exposure levels greater than 1");log=c(log,"ERROR: sum of environmental exposure levels greater than 1")
return(0)
}
}else{nEnv=0;Ex=integer(0);nEx=integer(0);EnvIn=NULL}
nVar=nSNP+nEnv
#######################################################################
#Read in the individual file and check that the variables
#correspond with those in the locus file
#######################################################################
IndIn=read.table(ind,check.names=FALSE)
IndInStore=IndIn
nInd=dim(IndIn)[1]
names=row.names(IndIn)
if(sum(colnames(IndIn)%in%GenoIn[,1]+colnames(IndIn)%in%EnvIn[,1])>0){
if(sum(colnames(IndIn)%in%GenoIn[,1]+colnames(IndIn)%in%EnvIn[,1])<nVar){
print("ERROR: variables missing from individual file. Rebuild the model with only the available variables, using REGENT.model.");log=c(log,"ERROR: variables missing from individual file. Rebuild the model with only the available variables, using REGENT.model.")
return(NULL)
}
if(sum((colnames(IndIn)%in%GenoIn[,1]+colnames(IndIn)%in%EnvIn[,1])<1)>0){
print("Warning: variables in the individual not present in model. Continuing without these variables");log=c(log,"Warning: variables in the individual not present in model. Continuing without these variables")
IndIn=IndIn[,(colnames(IndIn)%in%GenoIn[,1]+colnames(IndIn)%in%EnvIn[,1])>0]
}
}else{
print("ERROR: No modelled variables in individual file");log=c(log,"ERROR: No modelled variables in individual file")
return(NULL)
}
if(is.matrix(IndIn)){ #Don't need to do column matching if we only have 1 column
if(Geno)IndIn[,colnames(IndIn)%in%GenoIn[,1]]=IndIn[,match(colnames(IndIn[colnames(IndIn)%in%GenoIn[,1]]),GenoIn[,1])]
if(Env)IndIn[,colnames(IndIn)%in%EnvIn[,1]]=IndIn[,sum(!(colnames(IndIn)%in%EnvIn[,1]))+match(colnames(IndIn[colnames(IndIn)%in%EnvIn[,1]]),EnvIn[,1])]
}
if(sum(is.na(IndIn))>0){
print("ERROR: variables missing from individual file. Rebuild the model using only the available variables, using REGENT.model.");log=c(log,"ERROR: variables missing from individual file. Rebuild the model using only the available variables, using REGENT.model.")
}
#######################################################################
#Calculate the multifactoral risks for the test individuals
#######################################################################
print("Calculating point estimates...");log=c(log,"Calculating point estimates...")
rsim=rnorm(sims,1,sd=cv)
MGRR=rep(1,nInd)
ZeroGenType=rep(0,nSNP);ZeroGenType[p>0.5]=2;ZeroGenType=c(ZeroGenType,rep(0,nEnv))
baseline2=1
if(Geno){
gsim=array(dim=c(sims,nSNP,3))
gsim[,,1]=1
if(MltplctvOR){
baseline2=prod(OR[1:nSNP]^ZeroGenType[1:nSNP])
for(i in 1:nSNP){
OR2=OR[i];p2=p[i];OR2[p2>0.5]=1/OR2;p2[p2>0.5]=1-p2;q2=1-p2
MGRR=MGRR*OR[i]^IndIn[,i]
denom=(1+(OR2-1)*p2)^2
concas0=1/(q2^2*ncontr[i])+denom/(q2^2*ncase[i])
gsim[,i,2]=rlnorm(sims,meanlog=log(OR2),sdlog=sqrt( concas0+1/(2*p2*q2*ncontr[i])+denom/(2*p2*q2*OR2*ncase[i])))
gsim[,i,3]=rlnorm(sims,meanlog=log(OR2^2),sdlog= sqrt(concas0+1/(p2^2*ncontr[i]+SmallSampAdjust)+1/(((p2*OR2)^2*ncase[i]/denom)+SmallSampAdjust)))
lsim=rnorm(sims,mean=p2,sd=sqrt((p2*q2)/ncontr[i]))#Simulate allele frequencies
lsim=(1+(gsim[,i,2]-1)*lsim)^2 #Simulate denominator of risk eqation
lsim=mean(lsim)/lsim
gsim[,i,]=gsim[,i,]*lsim
}
}else{
for(i in 1:nSNP){
baseline2=baseline2*c(1,OR[i,])[ZeroGenType[i]+1]
OR2=OR[i,];p2=p[i];if(p2>0.5)OR2=1/OR2;p2[p2>0.5]=1-p2;q2=1-p2
MGRR=MGRR*c(1,OR[i,])[IndIn[,i]+1]
denom=(1+(OR2[1]-1)*p2)^2
concas0=1/(q2^2*ncontr[i])+denom/(q2^2*ncase[i])
gsim[,i,2]=rlnorm(sims,meanlog=log(OR2[1]),sdlog=sqrt( concas0+1/(2*p2*q2*ncontr[i])+denom/(2*p2*q2*OR2[1]*ncase[i])))
gsim[,i,3]=rlnorm(sims,meanlog=log(OR2[2]),sdlog= sqrt(concas0+1/(p2^2*ncontr[i]+SmallSampAdjust)+1/((p2^2*OR2[2]*ncase[i]/denom)+SmallSampAdjust)))
lsim=rnorm(sims,mean=p2,sd=sqrt((p2*q2)/ncontr[i]))#Simulate allele frequencies
lsim=(1+(gsim[,i,2]-1)*lsim)^2 #Simulate denominator of risk eqation
lsim=mean(lsim)/lsim
gsim[,i,]=gsim[,i,]*lsim
}
}
}#End of condition is(geno)
if(Env){
esim=array(dim=c(sims,nEnv,dim(Ex)[2]+1))
esim[,,1]=1
for(i in 1:nEnv){
MGRR=MGRR*(c(1,as.matrix(OR)[nSNP+i,])[!is.na(c(1,as.matrix(OR)[nSNP+i,]))][as.matrix(IndIn)[,nSNP+i]+1])
baseline2=baseline2*c(1,as.matrix(OR)[nSNP+i,])[ZeroGenType[nSNP+i]+1]
for(j in 2:(dim(Ex)[2]+1)){
esim[,i,j]=rlnorm(sims,meanlog=log(as.matrix(OR)[nSNP+i,j-1]),sdlog=SE[i,j-1])
}
}
}
baseline2=1/(baseline2/ModelIn$baseline)
print("Simulating for confidence intervals...");log=c(log,"Simulating for confidence intervals...")
#######################################################################
#Obtain confidence intervals for multilocus GRRs by simulation
#######################################################################
rare=(1:nSNP)[p<0.1];rare2=(1:nSNP)[q<0.1]
if(length(c(rare,rare2))>0){
if(length(rare)>0){HetInds=(1:nInd)[rowSums(as.matrix(as.matrix(IndIn[,1:nSNP])[,rare])==2)>0]}else{HetInds=NULL}
if(length(rare2)>0)HetInds=unique(sort(c(HetInds,(1:nInd)[rowSums(as.matrix(as.matrix(IndIn[,1:nSNP])[,rare2])==0)>0])))
IndIn2=as.matrix(IndIn[HetInds,])
nInd2=dim(IndIn2)[1]
if(length(rare)>0){
if(nSNP==1){IndIn2[,1][IndIn2[,1]==2]=1}else{
IndIn2[,1:nSNP][,rare][IndIn2[,1:nSNP][,rare]==2]=1
}}
if(length(rare2)>0){
if(nSNP==1){IndIn2[,1][IndIn2[,1]==0]=1}else{
IndIn2[,1:nSNP][,rare2][IndIn2[,1:nSNP][,rare2]==0]=1
}}
IndIn=rbind(as.matrix(IndIn),as.matrix(IndIn2))
}else{nInd2=0;HetInds=rep(FALSE,length=nInd)}
upper=vector(length=nInd+nInd2);lower=upper;lower2=lower;lower3=lower;upper2=upper;upper3=upper
IndIn2=IndIn
if(sum(p>0.5)>0){
if(nSNP>1){IndIn2[,1:nSNP][,p>0.5]=2-as.matrix(IndIn2[,1:nSNP])[,p>0.5]}else{IndIn2[,1][p>0.5]=2-as.matrix(IndIn2[,1])[p>0.5]}
}
for(i in seq(0,(nInd+nInd2)-1,by=Block)){
if(i<=(nInd+nInd2)-Block){
risksim=matrix(rep(rsim,Block),nrow=sims,ncol=Block)
if(Geno){
for(j in 1:nSNP){
risksim=risksim*gsim[,j,as.matrix(IndIn2)[(i+1):(i+Block),j]+1]
}
}
if(Env){
for(j in 1:nEnv){
risksim=risksim*esim[,j,as.matrix(IndIn2)[(i+1):(i+Block),nSNP+j]+1]
}
}
sorted=as.matrix(apply(X=risksim,MARGIN=2,FUN=quantile,probs=c(alpha/2,1-(alpha/2))),nrow=2)
lower[(i+1):(i+Block)]=sorted[1,]
upper[(i+1):(i+Block)]=sorted[2,]
sorted=as.matrix(apply(X=risksim,MARGIN=2,FUN=quantile,probs=c((alpha+0.01)/2,1-((alpha+0.01)/2))),nrow=2)
lower2[(i+1):(i+Block)]=sorted[1,]
upper2[(i+1):(i+Block)]=sorted[2,]
sorted=as.matrix(apply(X=risksim,MARGIN=2,FUN=quantile,probs=c((alpha-0.01)/2,1-((alpha-0.01)/2))),nrow=2)
lower3[(i+1):(i+Block)]=sorted[1,]
upper3[(i+1):(i+Block)]=sorted[2,]
}else{
risksim=matrix(rep(rsim,(nInd+nInd2)-i),nrow=sims,ncol=(nInd+nInd2)-i)
if(Geno){
for(j in 1:nSNP){
risksim=risksim*gsim[,j,as.matrix(IndIn2)[(i+1):(nInd+nInd2),j]+1]
}
}
if(Env){
for(j in 1:nEnv){
risksim=risksim*esim[,j,as.matrix(IndIn2)[(i+1):(nInd+nInd2),nSNP+j]+1]
}
}
sorted=as.matrix(apply(X=risksim,MARGIN=2,FUN=quantile,probs=c(alpha/2,1-(alpha/2))),nrow=2)
lower[(i+1):(nInd+nInd2)]= sorted[1,]
upper[(i+1):(nInd+nInd2)]= sorted[2,]
sorted=as.matrix(apply(X=risksim,MARGIN=2,FUN=quantile,probs=c((alpha+0.01)/2,1-((alpha+0.01)/2))),nrow=2)
lower2[(i+1):(nInd+nInd2)]=sorted[1,]
upper2[(i+1):(nInd+nInd2)]=sorted[2,]
sorted=as.matrix(apply(X=risksim,MARGIN=2,FUN=quantile,probs=c((alpha-0.01)/2,1-((alpha-0.01)/2))),nrow=2)
lower3[(i+1):(nInd+nInd2)]=sorted[1,]
upper3[(i+1):(nInd+nInd2)]=sorted[2,]
}#End of condition
}#End of loop over seq(0,nInd,by=Block)
########################################################################Find risk categories
#######################################################################
print("Finding risk categories...");log=c(log,"Finding risk categories...")
MGRR=MGRR/ModelIn$baseline;upper=upper/baseline2;lower=lower/baseline2;upper2=upper2/baseline2;upper3=upper3/baseline2;lower2=lower2/baseline2;lower3=lower3/baseline2
Categories=rep(2,nInd+nInd2);Categories2=Categories;Categories3=Categories
Categories[upper<ModelIn$categories[1,2]]=1
Categories[lower>ModelIn$categories[2,2]]=3
Categories[lower>ModelIn$categories[3,2]]=4
if(sum(Categories[1:nInd][HetInds]<Categories[(nInd+1):(nInd+nInd2)])>0){
print(paste("Warning: Individual(s) ",names[1:nInd][HetInds][ Categories[1:nInd][HetInds]<Categories[(nInd+1):(nInd+nInd2)]]," move to higher risk category when homozygotes at SNP(s) ", paste(colnames(as.matrix(IndIn[,1:nSNP]))[c(rare,rare2)],collapse=", ")," are changed to heterozygotes",sep=""));log=c(log,paste("Warning: Individual(s) ",names[1:nInd][HetInds][ Categories[1:nInd][HetInds]<Categories[(nInd+1):(nInd+nInd2)]]," move to higher risk category when homozygotes at SNP(s) ", paste(colnames(as.matrix(IndIn[,1:nSNP]))[c(rare,rare2)],collapse=", ")," are changed to heterozygotes",sep=""))
}
Categories=Categories[1:nInd];upper=upper[1:nInd];lower=lower[1:nInd];upper2=upper2[1:nInd];upper3=upper[1:nInd];lower2=lower2[1:nInd];lower3=lower3[1:nInd];Categories2=Categories2[1:nInd];Categories3=Categories3[1:nInd]
Categories2[upper2<ModelIn$categories[1,2]]=1
Categories2[lower2>ModelIn$categories[2,2]]=3
Categories2[lower2>ModelIn$categories[3,2]]=4
Categories3[upper3<ModelIn$categories[1,2]]=1
Categories3[lower3>ModelIn$categories[2,2]]=3
Categories3[lower3>ModelIn$categories[3,2]]=4
Librl=Categories3!=Categories
Consv=Categories2!=Categories
Borderline=rep("No",length=nInd)
Borderline[(Librl+Consv)>0]="Yes"
#prev=prev/(1-prev)
AR=MGRR*prev
out=matrix(nrow=nInd,ncol=6)
out[,1]=round(AR,digits=3);out[,2]=round(MGRR,digits=3);out[,3]=round(lower,digits=3);out[,4]=round(upper,digits=3);out[,5]=c("Reduced","Average","Elevated","High")[Categories];out[,6]=Borderline
colnames(out)=c("Absolute_Risk","Genotype_Relative_Risk","Lower_CI","Upper_CI","Risk_Category","Category_Boderline");rownames(out)=names
print("Writing output...");log=c(log,"Writing output...")
options(warn=-1) #Suppress warning about appending column names
write.table(x=paste("##REGENTv1.0 by G. Goddard, D. Crouch & C. Lewis##",sep=""),file=paste(AnalysisName,"_Predictions.txt",sep=""),quote=FALSE,row.names=FALSE,col.names=FALSE)
write.table(out,file=paste(AnalysisName,"_Predictions.txt",sep=""),quote=FALSE,col.names=TRUE,row.names=TRUE,append=TRUE)
write.table(paste(rep("#",50),collapse=""),file=paste(AnalysisName,"_Predictions.txt",sep=""),quote=FALSE,row.names=FALSE,col.names=FALSE,append=TRUE)
write.table(IndInStore,file=paste(AnalysisName,"_Predictions.txt",sep=""),col.names=TRUE,row.names=TRUE,quote=FALSE,append=TRUE)
write.table(paste(rep("#",50),collapse=""),file=paste(AnalysisName,"_Predictions.txt",sep=""),quote=FALSE,row.names=FALSE,col.names=FALSE,append=TRUE)
options(warn=0)
print(paste("Analysis completed at",date(),sep=" "))
log=c(log,paste("Analysis completed at",date(),sep=" "))
options(warn=-1) #Suppress warning about appending column names
if(Geno){
write.table(GenoIn,file=paste(AnalysisName,"_Predictions.txt",sep=""),quote=FALSE,row.names=FALSE,col.names=TRUE,append=TRUE)
write.table(paste(rep("#",50),collapse=""),file=paste(AnalysisName,"_Predictions.txt",sep=""),quote=FALSE,row.names=FALSE,col.names=FALSE,append=TRUE)
}
if(Env){
write.table(EnvIn,file=paste(AnalysisName,"_Predictions.txt",sep=""),quote=FALSE,row.names=FALSE,col.names=TRUE,append=TRUE)
write.table(paste(rep("#",50),collapse=""),file=paste(AnalysisName,"_Predictions.txt",sep=""),quote=FALSE,row.names=FALSE,col.names=FALSE,append=TRUE)
}
write.table(paste("AnalysisName: ",AnalysisName,sep=""), file=paste(AnalysisName,"_Predictions.txt",sep=""),quote=FALSE,row.names=FALSE,col.names=FALSE,append=TRUE)
write.table(paste(rep("#",50),collapse=""),file=paste(AnalysisName,"_Predictions.txt",sep=""),quote=FALSE,row.names=FALSE,col.names=FALSE,append=TRUE)
write.table(ModelIn$categories, file=paste(AnalysisName,"_Predictions.txt",sep=""),quote=FALSE,row.names=TRUE,col.names=TRUE,append=TRUE)
options(warn=0) #Restore default
write.table(paste(rep("#",50),collapse=""),file=paste(AnalysisName,"_Predictions.txt",sep=""),quote=FALSE,row.names=FALSE,col.names=FALSE,append=TRUE)
write.table(log,file=paste(AnalysisName,"_Predictions.txt",sep=""),quote=FALSE,row.names=FALSE,col.names=FALSE,append=TRUE)
return(data.frame(out))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.