# R/marginal_assoc_tests.r In CCpop: One and two locus GWAS of binary phenotype with case-control-population design

#### Documented in marginal.assoc.test.hwe.in.controlsmarginal.assoc.test.kpx.kpymarginal.assoc.test.pop.hwe.kpymarginal.assoc.test.unconstrained.chisqmarginal.assoc.test.unconstrained.gsq

```# Tests for association (or independence) between a SNP locus and a binary
# disease phenotype.

marginal.assoc.test.unconstrained.chisq = function(t0, t1) {
# The marginal Pearson chi-squared test on the 2x3 (or less) contingency table.

# NOTE: the tables must be pruned so that they do not contain columns with all
# zeros.

n0 = sum(t0)
n1 = sum(t1)
nx = t0 + t1
n = n0 + n1
e0 = (n0 / n) * nx
e1 = (n1 / n) * nx

statistic = sum((t0 - e0) ^ 2 / e0 + (t1 - e1) ^ 2 / e1)
p.value = pchisq(statistic, df = length(t0), lower.tail = F)

return (list(pen = t1 / (t0 + t1), statistic = statistic, p.value = p.value))
}

marginal.assoc.test.unconstrained.gsq = function(t0, t1) {
# The marginal G^2 test (GLRT conditioned on the observed marginals of the 2x3
# contingency table)

pen.full = t1 / (t0 + t1)
pen.null = sum(t1) / sum(t1 + t0)

statistic = 2 * (sum(t0 * log(1 - pen.full) + t1 * log(pen.full), na.rm = T) -
sum(t0 * log(1 - pen.null) + t1 * log(pen.null), na.rm = T))
p.value = pchisq(statistic, df = length(t0), lower.tail = F)

return (list(pen = pen.full, statistic = statistic, p.value = p.value))
}

marginal.assoc.test.hwe.in.controls = function(t0, t1) {
# Assuming HWE among the controls (justifiable if the disease is rare). This is
# a Wald test for marginal association assuming HWE among the controls (Chen and
# Chatterjee 2007)

t1[t1 == 0] = 1 # is this a good solution to the case where I have zero t1 cells?
n0 = sum(t0)

ef = (t0 + 2 * t0) / (2 * n0)
et0 = n0 * c((1 - ef) ^ 2, 2 * ef * (1 - ef), ef ^ 2)
eor.hom = (t1 * et0) / (et0 * t1)
eor.het = (t1 * et0) / (et0 * t1)

beta = log(c(eor.hom, eor.het))

beta.cov = matrix(c(
1/t1 + 1/t1 + 1/(2*et0 + et0) + 1/(2*et0 + et0),
1/t1 + 1/n0 * 1/(ef * (1 - ef)),
1/t1 + 1/n0 * 1/(ef * (1 - ef)),
1/t1 + 1/t1 + 4/(2*et0 + et0) + 4/(2*et0 + et0)), nrow = 2)

statistic = t(beta) %*% solve(beta.cov) %*% beta
p.value = pchisq(statistic, df = 2, lower.tail = F)

return (list(statistic = statistic, p.value = p.value))
}

marginal.assoc.test.pop.hwe.kpy = function(t0, t1, tp, prevalence, pen.initial = NULL, f.initial = NULL) {
# Assuming HWE in the general population, and that disease prevalence is
# known. This test uses an extraneous population sample to improve estimation
# of population parameters such as SNP distributions.

tx = t0 + t1 + tp

f.null.est = min(0.5, (0.5 * tx + tx) / sum(tx))
px.null.est = c((1 - f.null.est) ^ 2, 2 * f.null.est * (1 - f.null.est), f.null.est ^ 2)
null.loglik = sum(t0 * log(1 - prevalence) + t1 * log(prevalence) + tx * log(px.null.est))

if (is.null(pen.initial)) {
pars.initial = c(rep(prevalence, 3), f.null.est)
} else {
pars.initial = c(pen.initial, f.initial)
}

eval_f = function(pars) {
pen = pars[1:3]
f = pars

px = c((1 - f) ^ 2, 2 * f * (1 - f), f ^ 2)
dpx.df = c(2 * f - 2, 2 - 4 * f, 2 * f)

objective = null.loglik - sum(t0 * log(1 - pen) + t1 * log(pen)) - sum(tx * log(px))
gradient = -c(t1 / pen - t0 / (1 - pen), sum(tx / px * dpx.df))

}

lb = rep(.Machine\$double.eps, 4)
ub = c(rep(1, 3) - .Machine\$double.neg.eps, 0.5)

eval_g_eq = function(pars) {
pen = pars[1:3]
f = pars

px = c((1 - f) ^ 2, 2 * f * (1 - f), f ^ 2)
dpx.df = c(2 * f - 2, 2 - 4 * f, 2 * f)

constraints = sum(pen * px) - prevalence
jacobian = matrix(c(px, sum(pen * dpx.df)), nrow = 1)

return (list(constraints = constraints, jacobian = jacobian))
}

sol = nloptr(x0 = pars.initial, eval_f = eval_f, eval_g_eq = eval_g_eq, lb = lb, ub = ub,
opts = list(algorithm = 'NLOPT_LD_SLSQP', ftol_abs = 1e-8, xtol_rel = 0, maxeval = 1000))

pen = t0 * 0 + sol\$solution[1:3]
f = sol\$solution
px = c((1 - f) ^ 2, 2 * f * (1 - f), f ^ 2)

statistic = -2 * sol\$objective
p.value = pchisq(statistic, df = 2, lower.tail = F)

return (list(pen = pen, statistic = statistic, p.value = p.value))
}

marginal.assoc.test.kpx.kpy = function(t0, t1, prevalence, px, pen.initial = NULL) {
# Assuming the SNP distribution is known, and that disease prevalence is known

# The GLRT for testing a marginal association given a known SNP distribution
# (specified as genotypic, but can come from a HWE assumption and known MAF)

# NOTE: this is exactly the same algorithm as that for a the pairwise test with
# known pxx.

m = length(px)
df = m - 1

if (is.null(pen.initial)) {
pars.initial = rep(prevalence, df)
} else {
pen.initial = pen.initial[2:m]
}

fn = function(x) {
pen = c((prevalence - sum(x * px[2:m])) / px, x)
return (-sum(t0 * log(1 - pen) + t1 * log(pen)))
}

gr = function(x) {
x0 = (prevalence - sum(x * px[2:m])) / px
g = t1[2:m] / x - t0[2:m] / (1 - x) - t1 / x0 / px * px[2:m] + t0 / (1 - x0) / px * px[2:m]
return (-g)
}

ui = rbind(diag(df), -diag(df), -px[2:m], px[2:m])
ci = c(rep(.Machine\$double.eps, df), rep(-(1 - .Machine\$double.neg.eps), df), .Machine\$double.eps - prevalence, prevalence - px * (1 - .Machine\$double.neg.eps))
sol = constrOptim(pars.initial, fn, gr, ui, ci)

pen = t0 * 0 + c((prevalence - sum(sol\$par * px[2:m])) / px, sol\$par)

statistic = 2 * sum(t0 * log((1 - pen) / (1 - prevalence)) + t1 * log(pen / prevalence))
p.value = pchisq(statistic, df = df, lower.tail = F)

return (list(pen = pen, statistic = statistic, p.value = p.value))
}
```

## Try the CCpop package in your browser

Any scripts or data that you put into this service are public.

CCpop documentation built on May 2, 2019, 3:25 a.m.