R/utils.R
In radmixture: Calculate Population Stratification

cacheEnv <- new.env()
assign("lb", 1e-5, envir = cacheEnv)
assign("ub", 0.99999, envir = cacheEnv)

lb <- get("lb", envir = cacheEnv)
ub <- get("ub", envir = cacheEnv)
# The function lfun() is used to calculate log-likelihood value.
lfun <- function(g, q, f) {
    l1 <- g * log(q %*% f)
    l2 <- (2 - g) * log(q %*% (1 - f))
    l <- l1 + l2
    return(sum(l))
}
# update q
em.q <- function(g, q0, q, f0, p, s) {
    for (k in 1:ncol(q0)) {
        c1 <- g[s, ] * q0[s, k] * f0[k, ]
        c2 <- q0[s, ] %*% f0
        c3 <- (2 - g[s, ]) * q0[s, k] * (1 - f0[k, ])
        c4 <- q0[s, ] %*% (1 - f0)
        c <- c1 / c2 + c3 / c4
        q[s, k] <- sum(c) / (2 * p)
    }
    q[q < lb] <- lb
    q[q > ub] <- ub
    q[s, ] <- q[s, ] / sum(q[s, ])
    return(q = q)
}

# Calculate Jacobian and Hessian for F when Q fixed

diff.f <- function(g, q, f, s) {
    qf <- q %*% f[, s]
    qf2 <- q %*% (1 - f[, s])
    gq <- crossprod(g[, s] / qf, q)
    gq2 <- crossprod( (2 - g[, s]) / qf2, q)
    df <- t(gq - gq2)
    qqf <- qf * qf
    qqf2 <- qf2 * qf2
    q_temp <- sweep(t(q), MARGIN = 2, g[, s] / qqf, "*")
    q_temp2 <- sweep(t(q), MARGIN = 2, (2 - g[, s]) / qqf2, "*")
    ddf <- -1 * (q_temp %*% q + q_temp2 %*% q)
    res <- list(Hess = ddf, Jaco = t(df))
    return(res)
}

# Calculate Jacobian and Hessian for Q when F fixed

diff.q <- function(g, q, f, s) {
    qf <- q[s, ] %*% f
    qf2 <- q[s, ] %*% (1 - f)
    gf <- tcrossprod(g[s, ] / qf, f)
    gf2 <- tcrossprod( (2 - g[s, ]) / qf2, 1 - f)
    dq <- gf + gf2
    qqf <- qf * qf
    qqf2 <- qf2 * qf2
    f_temp <- sweep(f, MARGIN = 2, g[s, ] / qqf, "*")
    f_temp2 <- sweep( (1 - f), MARGIN = 2, (2 - g[s, ]) / qqf2, "*")
    ddq <- -1 * (tcrossprod(f_temp, f) + tcrossprod(f_temp2, (1 - f)))
    res <- list(Hess = ddq, Jaco = dq)
    return(res)
}

# Quadratic Programming (QP) for F
brf <- function(g, q, f) {
    lb <- 1e-5
    ub <- 1 - 1e-5
    Amatf <- rbind(diag(ncol(q)), -1 * diag(ncol(q)))
    flongvec <- matrix(NA, nrow(f), ncol(f))
    for (j in 1:ncol(f)) {
        bvec <- c(-f[, j], f[, j] - 1)
        flongvec[, j] <- f[, j] + solve.QP(-diff.f(g, q, f, s = j)$Hess * 1e-2,
                                           diff.f(g, q, f, s = j)$Jaco * 1e-2,
                                           t(Amatf), bvec)$solution
    }
    flongvec[flongvec < lb] <- lb
    flongvec[flongvec > ub] <- ub
    return(f = flongvec)
}

# Quadratic Programming (QP) for Q
brq <- function(g, q, f, model) {
    lb <- 1e-5
    ub <- 1 - 1e-5
    Amatq <- rbind(rep(1, ncol(q)), diag(ncol(q)))
    if (model == "supervised") {
        bvec <- c(0, -q[nrow(q), ])
        q[nrow(q), ] <- q[nrow(q), ] + solve.QP(-diff.q(g, q, f, nrow(q))$Hess * 1e-2,
                                                diff.q(g, q, f, nrow(q))$Jaco * 1e-2,
                                                t(Amatq), bvec,
                                                meq = 1)$solution
        q[q < lb] <- lb
        q[q > ub] <- ub
        q[nrow(q), ] <- q[nrow(q), ] / sum(q[nrow(q), ])
    } else {
        qlongvec <- matrix(NA, nrow(q), ncol(q))
        for (i in 1:nrow(q)) {
            bvec <- c(0, -q[i, ])
            qlongvec[i, ] <- q[i, ] + solve.QP(-diff.q(g, q, f, i)$Hess * 1e-2,
                                               diff.q(g, q, f, i)$Jaco * 1e-2,
                                               t(Amatq), bvec, meq = 1)$solution
            qlongvec[qlongvec < lb] <- lb
            qlongvec[qlongvec > ub] <- ub
            qlongvec[i, ] <- qlongvec[i, ] / sum(qlongvec[i, ])
        }
        q <- qlongvec
    }
    return(q = q)
}

# quasi-newton (update Q)
qnq <- function(q, qvector, model) {
    if (model == "supervised") {
        u <- qvector[, 2] - qvector[, 1]
        v <- qvector[, 3] - qvector[, 2]
        if (u == 0 && v == 0) {
            c <- 1
        } else {
            c <- - (t(u) %*% u / t(u) %*% (v - u))
        }
        if (c == Inf || c == -Inf || is.nan(c)) {
            c <- 1
        }
        updateq <- (1 - c) * qvector[, 2] + c * qvector[, 3]
        updateq[updateq < lb] <- lb
        updateq[updateq > ub] <- ub
        updateq <- updateq / sum(updateq)
    } else {
        q1 <- matrix(qvector[, 1], nrow(q), ncol(q))
        q2 <- matrix(qvector[, 2], nrow(q), ncol(q))
        q3 <- matrix(qvector[, 3], nrow(q), ncol(q))
        updateq <- matrix(NA, nrow(q), ncol(q))
        for (i in 1:nrow(q)) {
            u <- q2[i, ] - q1[i, ]
            v <- q3[i, ] - q2[i, ]
            if (u == 0 && v == 0) {
                c <- 1
            } else {
                c <- - (t(u) %*% u / t(u) %*% (v - u))
            }
            if (c == Inf || c == -Inf || is.nan(c)) {
                c <- 1
            }
            updateq[i, ] <- (1 - c) * q2[i, ] + c * q3[i, ]
        }
        updateq[updateq < lb] <- lb
        updateq[updateq > ub] <- ub
        updateq[i, ] <- updateq[i, ] / sum(updateq[i, ])
    }
    return(updateq)
}
# quasi-newton (update F)
qnf <- function(f, fvector) {
    f1 <- matrix(fvector[, 1], nrow(f), ncol(f))
    f2 <- matrix(fvector[, 2], nrow(f), ncol(f))
    f3 <- matrix(fvector[, 3], nrow(f), ncol(f))
    updatef <- matrix(NA, nrow(f), ncol(f))
    for (i in 1:ncol(f)) {
        u <- f2[, i] - f1[, i]
        v <- f3[, i] - f2[, i]
        if (u == 0 && v == 0) {
            c <- 1
        } else {
            c <- - (t(u) %*% u / t(u) %*% (v - u))
        }
        if (c == Inf || c == -Inf || is.nan(c)) {
            c <- 1
        }
        updatef[, i] <- (1 - c) * f2[, i] + c * f3[, i]
    }
    updatef[updatef < lb] <- lb
    updatef[updatef > ub] <- ub
    return(updatef)
}

# index for ped

indp <- function(index) {
    res <- numeric(length = length(index) * 2)
    for(i in 1:length(index)) {
        res[2 * i - 1] <- index[i] * 2 - 1
        res[2 * i] <- index[i] * 2
    }
    res
}


gg <- function(ped, f) {
    # calculate allele counts
    a1 <- ped[, seq(1, ncol(ped) - 1, 2)]
    a2 <- ped[, seq(2, ncol(ped), 2)]
    a1 <- as.matrix(a1)
    a2 <- as.matrix(a2)
    a <- rbind(a1, a2)
    # find major allele
    major <- rep(NA, ncol(a))
    minor <- rep(NA, ncol(a))
    del <- numeric()
    for(i in 1:ncol(a)) {
        freqco <- count(a[, i])
        if(nrow(freqco) != 2 || freqco[1, 2] == freqco[2, 2] || 0 %in% freqco[, 1]) {
            del[i] <- i
            major[i] <- NA
            minor[i] <- NA
        } else {
            major[i] <- freqco[which.max(freqco[, 2]), 1]
            minor[i] <- freqco[which.min(freqco[, 2]), 1]
        }
    }
    del <- which(!is.na(del))
    # generate two genotypes
    if (length(del) == 0) {
        a1 <- a1
        a2 <- a2
        a <- a
        major <- major
        minor <- minor
    } else {
        a1 <- a1[, -del]
        a2 <- a2[, -del]
        a <- a[, -del]
        major <- major[-del]
        minor <- minor[-del]
    }
    genotype <- matrix(NA, 2, ncol(a))
    genotype[1, ] <- 10 * major + major
    genotype[2, ] <- 10 * minor + minor
    # generate G matrix
    genotype1 <- 10 * a1 + a2
    g <- matrix(NA, nrow(ped), ncol(a))
    for (i in 1:ncol(a)) {
        g[which(genotype1[, i] == genotype[1, i]), i] <- 0
        g[which(genotype1[, i] == genotype[2, i]), i] <- 2
    }
    g[is.na(g)] <- 1
    f <- f[, -del]
    return(list(g = g, f = f, del = del))
}

tfrd <- function(genotype, map, referenceped, f) {
    cha <- paste(genotype[, 2], genotype[, 3], sep = ",")
    cha1 <- paste(map[, 1], map[, 2], sep = ",")
    overlap <- intersect(cha, cha1)
    index_user <- match(overlap, cha)
    index_wg <- match(overlap, cha1)
    ped <- referenceped[, - (1:6)] %>%
        as.matrix()
    ped <- ped[, indp(index_wg)]
    f <- f[, index_wg]
    genotype <- genotype[index_user, 4]
    genotype <- as.character(genotype)
    newped <- paste(genotype, collapse = "") %>%
        strsplit(split = "") %>%
        unlist()
    newped[newped == "A"] <- 1
    newped[newped == "C"] <- 2
    newped[newped == "G"] <- 3
    newped[newped == "T"] <- 4
    newped[newped == "-"] <- 0
    newped[newped == "_"] <- 0
    newped[newped == "I"] <- 0
    newped[newped == "D"] <- 0
    newped <- as.numeric(newped)
    del <- which(is.na(newped))
    newped[del] <- 0
    ped <- rbind(ped, newped)
    gf <- gg(ped, f)
    if (ncol(gf$g) < 40000) {
        warning("The number of SNPs is
                inappropriate for ancestry estimation!")
    } else if (ncol(gf$g) < 50000) {
        warning("The number of SNPs used for calculating is small,
                so your result might be unreasonable!")
    }
    return(list(g = gf$g, f = gf$f))
}