setGeneric("pcrelate", function(gdsobj, ...) standardGeneric("pcrelate"))
setMethod("pcrelate",
"GenotypeIterator",
function(gdsobj,
pcs,
scale = c('overall', 'variant', 'none'),
ibd.probs = TRUE,
sample.include = NULL,
training.set = NULL,
sample.block.size = 5000,
maf.thresh = 0.01,
maf.bound.method = c('filter', 'truncate'),
small.samp.correct = TRUE,
BPPARAM = bpparam(),
verbose = TRUE) {
.pcrelate(gdsobj,
pcs = pcs,
scale = scale,
ibd.probs = ibd.probs,
sample.include = sample.include,
training.set = training.set,
sample.block.size = sample.block.size,
maf.thresh = maf.thresh,
maf.bound.method = maf.bound.method,
small.samp.correct = small.samp.correct,
BPPARAM = BPPARAM,
verbose = verbose)
})
setMethod("pcrelate",
"SeqVarIterator",
function(gdsobj,
pcs,
scale = c('overall', 'variant', 'none'),
ibd.probs = TRUE,
sample.include = NULL,
training.set = NULL,
sample.block.size = 5000,
maf.thresh = 0.01,
maf.bound.method = c('filter', 'truncate'),
small.samp.correct = TRUE,
BPPARAM = bpparam(),
verbose = TRUE) {
filt <- seqGetFilter(gdsobj)
out <- .pcrelate(gdsobj,
pcs = pcs,
scale = scale,
ibd.probs = ibd.probs,
sample.include = sample.include,
training.set = training.set,
sample.block.size = sample.block.size,
maf.thresh = maf.thresh,
maf.bound.method = maf.bound.method,
small.samp.correct = small.samp.correct,
BPPARAM = BPPARAM,
verbose = verbose)
seqSetFilter(gdsobj,
sample.sel=filt$sample.sel,
variant.sel=filt$variant.sel,
verbose=FALSE)
out
})
.pcrelate <- function(gdsobj,
pcs,
scale = c('overall', 'variant', 'none'),
ibd.probs = TRUE,
sample.include = NULL,
training.set = NULL,
sample.block.size = 5000,
maf.thresh = 0.01,
maf.bound.method = c('filter', 'truncate'),
small.samp.correct = TRUE,
BPPARAM = bpparam(),
verbose = TRUE){
# checks
scale <- match.arg(scale)
maf.bound.method <- match.arg(maf.bound.method)
sample.include <- samplesGdsOrder(gdsobj, sample.include)
.pcrelateChecks(pcs = pcs, scale = scale, ibd.probs = ibd.probs, sample.include = sample.include, training.set = training.set,
maf.thresh = maf.thresh)
if(verbose) message('Using ', bpnworkers(BPPARAM), ' CPU cores')
# number of sample blocks
nsampblock <- ceiling(length(sample.include)/sample.block.size)
# check for small sample correction
if(small.samp.correct){
small.samp.correct <- (nsampblock == 1) & (scale != 'none')
if(!small.samp.correct) {
warning('small.samp.correct can only be used when all samples are analyzed in one block and `scale != none`')
}
}
# list of samples in each block
if(nsampblock > 1){
samp.blocks <- unname(split(sample.include, cut(1:length(sample.include), nsampblock)))
}else{
samp.blocks <- list(sample.include)
}
if(nsampblock == 1){
if(verbose) message(length(sample.include), ' samples to be included in the analysis...')
# create matrix of PCs
V <- .createPCMatrix(pcs = pcs, sample.include = sample.include)
# matrix product of V
VVtVi <- .calcISAFBetaPCProd(V = V, training.set = training.set, verbose = verbose)
snp.blocks <- .snpBlocks(gdsobj)
nsnpblock <- length(snp.blocks)
if(verbose) {
message('Running PC-Relate analysis for ', length(sample.include), ' samples using ',
length(unlist(snp.blocks)), ' SNPs in ', nsnpblock, ' blocks...')
}
# iterator function to return genotype blocks
k <- 1
Giter <- function() {
if (k > nsnpblock) return(NULL)
G <- .readGeno(gdsobj, sample.include, snp.index = snp.blocks[[k]])
k <<- k + 1
return(G)
}
# for each snp block
#matList <- foreach(k = 1:nsnpblock, .combine = .matListCombine, .inorder = FALSE, .multicombine = FALSE) %do% {
snpBlock <- function(G, ...) {
#if(verbose) message(' Running block ', k, '...')
# read genotype data for the block
#G <- .readGeno(gdsobj, sample.include, snp.index = snp.blocks[[k]])
# calculate ISAF betas
beta <- .calcISAFBeta(G = G, VVtVi = VVtVi)
# calculate PC-Relate estimates
.pcrelateVarBlock(G = G, beta = beta, V = V, idx = 1:nrow(V), jdx = 1:nrow(V), scale = scale,
ibd.probs = ibd.probs, maf.thresh = maf.thresh, maf.bound.method = maf.bound.method)
}
matList <- bpiterate(Giter, snpBlock, REDUCE = .matListCombine, reduce.in.order = FALSE, BPPARAM = BPPARAM)
# take ratios to compute final estimates
estList <- .pcrelateCalcRatio(matList = matList, scale = scale, ibd.probs = ibd.probs)
rm(matList)
# cast to data.tables
kinSelf <- data.table(ID = rownames(estList$kin), f = 2*diag(estList$kin) - 1, nsnp = diag(estList$nsnp))
kinBtwn <- .estListToDT(estList, drop.lower = TRUE)
rm(estList)
}else if(nsampblock > 1){
if(verbose) {
message(length(sample.include), ' samples to be included in the analysis, split into ', nsampblock, ' blocks...')
}
# calculate betas for individual specific allele frequencies
beta <- calcISAFBeta(gdsobj = gdsobj, pcs = pcs, sample.include = sample.include,
training.set = training.set, verbose = verbose, BPPARAM = BPPARAM)
### beta is a matrix of variants x PCs; needs to be saved, not sure of the best format or the best way to split this up ###
# compute estimates for current (pair of) sample block(s)
### this is where we would parallelize with multiple jobs ###
kinSelf <- NULL
kinBtwn <- NULL
for(i in 1:nsampblock){
for(j in i:nsampblock){
if(verbose) message('Running PC-Relate analysis for sample block pair (', i, ',', j, ')')
# compute estimates for the (pair of) sample block(s)
tmp <- pcrelateSampBlock(gdsobj = gdsobj, pcs = pcs, betaobj = beta,
sample.include.block1 = samp.blocks[[i]],
sample.include.block2 = samp.blocks[[j]],
scale = scale, ibd.probs = ibd.probs,
maf.thresh = maf.thresh,
maf.bound.method = maf.bound.method,
BPPARAM = BPPARAM, verbose = verbose)
# update results with this (pair of) sample block(s)
if(i == j) kinSelf <- rbind(kinSelf, tmp$kinSelf)
kinBtwn <- rbind(kinBtwn, tmp$kinBtwn)
}
}
}
### post processing after putting all samples together
# order samples
setkeyv(kinSelf, 'ID')
setkeyv(kinBtwn, c('ID1', 'ID2'))
# correct kinship - small sample
if(small.samp.correct){
if(verbose) message('Performing Small Sample Correction...')
out <- correctKin(kinBtwn = kinBtwn, kinSelf = kinSelf, pcs = pcs,
sample.include = sample.include)
kinBtwn <- out$kinBtwn
kinSelf <- out$kinSelf
}
# correct k2 - HW departure and small sample
if(ibd.probs) kinBtwn <- correctK2(kinBtwn = kinBtwn, kinSelf = kinSelf,
small.samp.correct = small.samp.correct, pcs = pcs,
sample.include = sample.include)
# use alternate k0 estimator for non-1st degree relatives
if(ibd.probs) kinBtwn <- correctK0(kinBtwn = kinBtwn)
# return output
out <- list(kinBtwn = as.data.frame(kinBtwn), kinSelf = as.data.frame(kinSelf))
class(out) <- "pcrelate"
return(out)
}
.pcrelateChecks <- function(pcs, scale, ibd.probs, sample.include, training.set, maf.thresh){
# check parameters
if(scale == 'none' & ibd.probs) stop('`ibd.probs` must be FALSE when `scale` == none')
if(maf.thresh < 0 | maf.thresh > 0.5) stop('maf.thresh must be in [0,0.5]')
# check training.set
if(!is.null(training.set) & !all(training.set %in% sample.include)) {
stop('All samples in training.set must be in sample.include')
}
# check pcs
if(!is.matrix(pcs) | is.null(rownames(pcs))) {
stop('pcs should be a matrix of PCs with rownames set to sample.ids')
}
if(!all(sample.include %in% rownames(pcs))) {
stop('All samples in sample.include must be in pcs')
}
}
### get sample ids in same order as gdsobj
samplesGdsOrder <- function(gdsobj, sample.include) {
sample.id <- .readSampleId(gdsobj)
if (!is.null(sample.include)) {
sample.id <- intersect(sample.id, sample.include)
}
return(as.character(sample.id))
}
### function to match samples and create PC matrix
.createPCMatrix <- function(pcs, sample.include){
# subset and re-order pcs if needed
V <- pcs[match(sample.include, rownames(pcs)), , drop = FALSE]
# append intercept
V <- cbind(rep(1, nrow(V)), V)
return(V)
}
# function to get
.calcISAFBetaPCProd <- function(V, training.set, verbose = TRUE){
if(!is.null(training.set)){
idx <- rownames(V) %in% training.set
VVtVi <- tcrossprod(V[idx,], chol2inv(chol(crossprod(V[idx,]))))
if(verbose) {
message('Betas for ', ncol(V) - 1, ' PC(s) will be calculated using ',
sum(idx), ' samples in training.set...')
}
}else{
idx <- NULL
VVtVi <- tcrossprod(V, chol2inv(chol(crossprod(V))))
if(verbose) {
message('Betas for ', ncol(V) - 1, ' PC(s) will be calculated using all ',
nrow(V), ' samples in sample.include...')
}
}
return(list(val = VVtVi, idx = idx))
}
# function to do actual calculation of betas
.calcISAFBeta <- function(G, VVtVi){
# impute missing genotype values
G <- .meanImpute(G, freq=0.5*colMeans(G, na.rm=TRUE))
# calculate beta
if(is.null(VVtVi$idx)){
if(!identical(rownames(G), rownames(VVtVi$val))) {
stop('sample order in genotypes and pcs do not match')
}
beta <- crossprod(G, VVtVi$val)
}else{
if(!identical(rownames(G)[VVtVi$idx], rownames(VVtVi$val))) {
stop('sample order in genotypes and pcs do not match')
}
beta <- crossprod(G[VVtVi$idx, ], VVtVi$val)
}
return(beta)
}
### this function does the pcrelate estimation for a variant block
.pcrelateVarBlock <- function(G, beta, V, idx, jdx, scale, ibd.probs, maf.thresh, maf.bound.method){
# make sure order of G, beta, and V all line up
if(!identical(colnames(G), rownames(beta))) stop('G and beta do not match')
if(!identical(rownames(G), rownames(V))) stop('G and V do not match')
# estimate individual specific allele frequencies
mu <- .estISAF(beta = beta, V = V, bound.thresh = maf.thresh, bound.method = maf.bound.method)
# compute values for estimates
if(scale == 'overall'){
matList <- .pcrCalcOvr(G = G, mu = mu, idx = idx, jdx = jdx, ibd.probs = ibd.probs)
}else if(scale == 'variant'){
matList <- .pcrCalcVar(G = G, mu = mu, idx = idx, jdx = jdx, ibd.probs = ibd.probs)
}else if(scale == 'none'){
matList <- .pcrCalcNone(G = G, mu = mu, idx = idx, jdx = jdx)
}
return(matList)
}
.matListCombine <- function(...){
mapply(FUN = "+", ..., SIMPLIFY = FALSE)
}
### these functions are for estimating individual specific allele frequencies
.estISAF <- function(beta, V, bound.thresh, bound.method){
# get ISAF estimates (i.e. 0.5*fitted values)
mu <- 0.5*tcrossprod(V, beta)
# fix boundary cases
if(bound.method == 'truncate'){
mu <- apply(mu, 2, .muBoundaryTrunc, thresh = bound.thresh)
}else if(bound.method == 'filter'){
mu <- apply(mu, 2, .muBoundaryFilt, thresh = bound.thresh)
}else{
stop('bound.method must be one of `truncate` or `filter`')
}
return(mu)
}
# function to truncate boundary issues with ISAFs
.muBoundaryTrunc <- function(x, thresh){
x[x < thresh] <- thresh
x[x > (1 - thresh)] <- (1 - thresh)
return(x)
}
# function to filter boundary issues with ISAFs
.muBoundaryFilt <- function(x, thresh){
x[x < thresh] <- NA
x[x > (1 - thresh)] <- NA
return(x)
}
### all of the functions below are used for computing kinship and ibd estimates ###
.pcrCalcOvr <- function(G, mu, ibd.probs, idx, jdx){
# compute mu(1 - mu)
muqu <- mu*(1 - mu)
# compute number of observed snps by pair
nonmiss <- !(is.na(G) | is.na(mu))
nsnp <- tcrossprod(nonmiss[idx,,drop=F], nonmiss[jdx,,drop=F])
# index of missing values
filt.idx <- which(!nonmiss)
# compute kinship values
kinList <- .pcrCalcKinOvr(G, mu, muqu, filt.idx, idx, jdx)
if(ibd.probs){
# compute ibd values
ibdList <- .pcrCalcIBDOvr(G, mu, muqu, nonmiss, filt.idx, idx, jdx)
return(list(kinNum = kinList$kinNum,
kinDen = kinList$kinDen,
k0Num = ibdList$k0Num,
k0Den = ibdList$k0Den,
k2Num = ibdList$k2Num,
k2Den = ibdList$k2Den,
nsnp = nsnp))
}else{
return(list(kinNum = kinList$kinNum,
kinDen = kinList$kinDen,
nsnp = nsnp))
}
}
.pcrCalcKinOvr <- function(G, mu, muqu, filt.idx, idx, jdx){
# residuals
R <- G - 2*mu
# set missing values to 0 (i.e. no contribution from that variant)
R[filt.idx] <- 0
muqu[filt.idx] <- 0
# numerator (crossprod of residuals)
kinNum <- tcrossprod(R[idx,,drop=F], R[jdx,,drop=F])
# denominator
kinDen <- tcrossprod(sqrt(muqu[idx,,drop=F]), sqrt(muqu[jdx,,drop=F]))
return(list(kinNum = kinNum, kinDen = kinDen))
}
.pcrCalcIBDOvr <- function(G, mu, muqu, nonmiss, filt.idx, idx, jdx){
# opposite allele frequency
qu <- 1 - mu
# indicator matrices of homozygotes
Iaa <- G == 0 & nonmiss
IAA <- G == 2 & nonmiss
# dominance coded genotype matrix
Gd <- mu
Gd[G == 1 & nonmiss] <- 0
Gd[IAA] <- qu[IAA]
Gd <- Gd - muqu
# set missing values to 0 (i.e. no contribution from that variant)
Iaa[filt.idx] <- 0
IAA[filt.idx] <- 0
Gd[filt.idx] <- 0
mu[filt.idx] <- 0
qu[filt.idx] <- 0
muqu[filt.idx] <- 0
# k0
k0Num <- tcrossprod(IAA[idx,,drop=F], Iaa[jdx,,drop=F]) + tcrossprod(Iaa[idx,,drop=F], IAA[jdx,,drop=F])
k0Den <- tcrossprod(mu[idx,,drop=F]^2, qu[jdx,,drop=F]^2) + tcrossprod(qu[idx,,drop=F]^2, mu[jdx,,drop=F]^2)
# k2
k2Num <- tcrossprod(Gd[idx,,drop=F], Gd[jdx,,drop=F])
k2Den <- tcrossprod(muqu[idx,,drop=F], muqu[jdx,,drop=F])
return(list(k0Num = k0Num, k0Den = k0Den, k2Num = k2Num, k2Den = k2Den))
}
.pcrCalcVar <- function(G, mu, ibd.probs, idx, jdx){
# compute mu(1 - mu)
muqu <- mu*(1 - mu)
# compute number of observed snps by pair
nonmiss <- !(is.na(G) | is.na(mu))
nsnp <- tcrossprod(nonmiss[idx,,drop=F], nonmiss[jdx,,drop=F])
# index of missing values
filt.idx <- which(!nonmiss)
# compute kinship values
kinNum <- .pcrCalcKinVar(G, mu, muqu, filt.idx, idx, jdx)
if(ibd.probs){
# compute ibd values
ibdList <- .pcrCalcIBDVar(G, mu, muqu, nonmiss, filt.idx, idx, jdx)
return(list(kinNum = kinNum,
k0Num = ibdList$k0Num,
k2Num = ibdList$k2Num,
nsnp = nsnp))
}else{
return(list(kinNum = kinNum, nsnp = nsnp))
}
}
.pcrCalcKinVar <- function(G, mu, muqu, filt.idx, idx, jdx){
# scaled residuals
R <- (G - 2*mu)/sqrt(muqu)
# set missing values to 0 (i.e. no contribution from that variant)
R[filt.idx] <- 0
# numerator (crossprod of scaled residuals)
kinNum <- tcrossprod(R[idx,,drop=F], R[jdx,,drop=F])
return(kinNum)
}
.pcrCalcIBDVar <- function(G, mu, muqu, nonmiss, filt.idx, idx, jdx){
# opposite allele frequency
qu <- 1 - mu
# indicator matrices of homozygotes
Iaa <- G == 0 & nonmiss
IAA <- G == 2 & nonmiss
# dominance coded genotype matrix
Gd <- mu
Gd[G == 1 & nonmiss] <- 0
Gd[IAA] <- qu[IAA]
# scale
Iaa <- 0.5*Iaa/qu^2 # factor of 1/2 for IAA*Iaa comes in from splitting up the two terms AA,aa vs aa,AA
IAA <- IAA/mu^2
Gd <- Gd/muqu - 1
# set missing values to 0 (i.e. no contribution from that variant)
Iaa[filt.idx] <- 0
IAA[filt.idx] <- 0
Gd[filt.idx] <- 0
# k0
k0Num <- tcrossprod(IAA[idx,,drop=F], Iaa[jdx,,drop=F]) + tcrossprod(Iaa[idx,,drop=F], IAA[jdx,,drop=F])
# k2
k2Num <- tcrossprod(Gd[idx,,drop=F], Gd[jdx,,drop=F])
return(list(k0Num = k0Num, k2Num = k2Num))
}
.pcrCalcNone <- function(G, mu, idx, jdx){
# compute number of observed snps by pair
nonmiss <- !(is.na(G) | is.na(mu))
nsnp <- tcrossprod(nonmiss[idx,,drop=F], nonmiss[jdx,,drop=F])
# index of missing values
filt.idx <- which(!nonmiss)
rm(nonmiss)
# compute kinship values
kin <- .pcrCalcKinNone(G, mu, filt.idx, idx, jdx)
return(list(kin = kin, nsnp = nsnp))
}
.pcrCalcKinNone <- function(G, mu, filt.idx, idx, jdx){
# residuals
R <- G - 2*mu
# set missing values to 0 (i.e. no contribution from that variant)
R[filt.idx] <- 0
# crossprod of scaled residuals
kin <- tcrossprod(R[idx,,drop=F], R[jdx,,drop=F])
return(kin)
}
### functions for post processing on variant block level
.pcrelateCalcRatio <- function(matList, scale, ibd.probs){
# compute final estimates
if(scale == 'overall'){
kin <- matList$kinNum/(4*matList$kinDen)
if(ibd.probs){
k2 <- matList$k2Num/matList$k2Den
k0 <- matList$k0Num/matList$k0Den
return(list(kin = kin, k0 = k0, k2 = k2, nsnp = matList$nsnp))
}else{
return(list(kin = kin, nsnp = matList$nsnp))
}
}else{
kin <- matList$kin/(4*matList$nsnp)
if(ibd.probs){
k2 <- matList$k2/matList$nsnp
k0 <- matList$k0/matList$nsnp
return(list(kin = kin, k0 = k0, k2 = k2, nsnp = matList$nsnp))
}else{
return(list(kin = kin, nsnp = matList$nsnp))
}
}
}
### functions for post processing on sample block level
.estListToDT <- function(x, drop.lower){
estDT <- lapply(x, meltMatrix, drop.lower = drop.lower, drop.diag = drop.lower)
for(k in 1:length(estDT)){
setnames(estDT[[k]], 'value', names(estDT)[k])
}
# merge those data.tables into one data.table
estDT <- Reduce(merge, estDT)
return(estDT)
}
### functions for final processing
correctKin <- function(kinBtwn, kinSelf, pcs, sample.include = NULL){
# keep R CMD check from warning about undefined global variables
f <- ID1 <- ID2 <- kin <- newval <- value <- NULL
# temporary data.table to store values
tmp <- kinSelf[, c('ID', 'f')]
setnames(tmp, c('ID','f'), c('ID1', 'kin'))
tmp[, ID2 := ID1]
tmp <- rbind(kinBtwn[, c('ID1', 'ID2', 'kin')], tmp)
setnames(tmp, 'kin', 'newval')
setkeyv(tmp, c('ID1', 'ID2'))
# get the PC matrix
V <- .createPCMatrix(pcs = pcs, sample.include = sample.include)
# make adjustment for each PC
for(k in 2:ncol(V)){
Acov <- tcrossprod(V[,k])
rownames(Acov) <- rownames(V)
colnames(Acov) <- rownames(V)
Avec <- meltMatrix(Acov, drop.lower = TRUE, drop.diag = FALSE)
tmp <- Avec[tmp, on = c('ID1', 'ID2')]
coef <- lm(formula = as.formula(newval ~ value), data = tmp[newval < 2^(-11/2)])$coef
tmp[, newval := newval - coef[1] - coef[2]*value]
tmp[, value := NULL]
}
# merge back into kinBtwn
kinBtwn <- tmp[kinBtwn, on = c('ID1', 'ID2')]
kinBtwn[, kin := newval][, newval := NULL]
# merge back into kinSelf
tmp <- tmp[ID1 == ID2][, ID2 := NULL]
setnames(tmp, 'ID1', 'ID')
kinSelf <- tmp[kinSelf, on = 'ID']
kinSelf[, f := newval][, newval := NULL]
return(list(kinBtwn = kinBtwn, kinSelf = kinSelf))
}
correctK2 <- function(kinBtwn, kinSelf, pcs, sample.include = NULL, small.samp.correct = TRUE){
# keep R CMD check from warning about undefined global variables
f.1 <- f.2 <- kin <- k2 <- newval <- value <- NULL
# correct k2 for HW departure
kinBtwn <- merge(kinBtwn, kinSelf[, c('ID', 'f')], by.x = 'ID2', by.y = 'ID')
setnames(kinBtwn, 'f', 'f.2')
kinBtwn <- merge(kinBtwn, kinSelf[, c('ID', 'f')], by.x = 'ID1', by.y = 'ID')
setnames(kinBtwn, 'f', 'f.1')
kinBtwn[, k2 := k2 - f.1*f.2][, `:=`(f.1 = NULL, f.2 = NULL)]
if(small.samp.correct){
# temporary data.table to store values
tmp <- kinBtwn[, c('ID1', 'ID2', 'kin', 'k2')]
setnames(tmp, 'k2', 'newval')
setkeyv(tmp, c('ID1', 'ID2'))
# get the PC matrix
V <- .createPCMatrix(pcs = pcs, sample.include = sample.include)
# make adjustment for each PC
for(k in 2:ncol(V)){
Acov <- tcrossprod(V[,k])
rownames(Acov) <- rownames(V)
colnames(Acov) <- rownames(V)
Avec <- meltMatrix(Acov, drop.lower = TRUE, drop.diag = FALSE)
tmp <- Avec[tmp, on = c('ID1', 'ID2')]
coef <- lm(formula = as.formula(newval ~ value + I(value^2)), data = tmp[kin < 2^(-11/2)])$coef
tmp[, newval := newval - coef[1] - coef[2]*value - coef[3]*value^2]
tmp[, value := NULL]
}
# make adjustment for kinship
coef <- lm(formula = as.formula(newval ~ kin), data = tmp[newval < 2^(-9/2)])$coef
tmp[, newval := newval - coef[1] - coef[2]*kin]
tmp[, kin := NULL]
# merge back into kinBtwn
kinBtwn <- tmp[kinBtwn, on = c('ID1', 'ID2')]
kinBtwn[!is.na(newval), k2 := newval][, newval := NULL]
}
return(kinBtwn)
}
correctK0 <- function(kinBtwn){
# keep R CMD check from warning about undefined global variables
kin <- k0 <- k2 <- NULL
# use alternate k0 estimator for non-1st degree relatives
kinBtwn[kin < 2^(-5/2), k0 := 1 - 4*kin + k2]
return(kinBtwn)
}
setGeneric("meltMatrix", function(x, ...) standardGeneric("meltMatrix"))
setMethod("meltMatrix",
"matrix",
function(x, drop.lower = FALSE, drop.diag = FALSE){
ID1 <- ID2 <- NULL
if(drop.lower){
x[lower.tri(x, diag = drop.diag)] <- NA
}
x <- as.data.table(reshape2::melt(x, varnames = c('ID1', 'ID2'), na.rm = TRUE, as.is = TRUE))
x <- x[,`:=`(ID1 = as.character(ID1), ID2 = as.character(ID2))]
setkeyv(x, c('ID1', 'ID2'))
})
setMethod("meltMatrix",
"Matrix",
function(x, drop.lower = FALSE, drop.diag = FALSE){
if(drop.lower){
x[lower.tri(x, diag = drop.diag)] <- NA
}
# this modeled after reshape2:::melt.matrix
labels <- as.data.table(expand.grid(dimnames(x), KEEP.OUT.ATTRS = FALSE, stringsAsFactors = FALSE))
setnames(labels, c('Var1', 'Var2'), c('ID1', 'ID2'))
missing <- is.na(as.vector(x))
x <- cbind(labels[!missing,], data.table(value = x[!missing]))
setkeyv(x, c('ID1', 'ID2'))
})
### exported function for computing PC betas for individual specific allele frequency calculations ###
calcISAFBeta <- function(gdsobj, pcs, sample.include, training.set = NULL, BPPARAM = bpparam(), verbose = TRUE){
# checks - add some
if(verbose) message('Using ', bpnworkers(BPPARAM), ' CPU cores')
# create matrix of PCs
V <- .createPCMatrix(pcs = pcs, sample.include = sample.include)
# matrix product of V
VVtVi <- .calcISAFBetaPCProd(V = V, training.set = training.set, verbose = verbose)
snp.blocks <- .snpBlocks(gdsobj)
nsnpblock <- length(snp.blocks)
if(verbose) {
message('Calculating Indivdiual-Specific Allele Frequency betas for ',
length(unlist(snp.blocks)), ' SNPs in ', nsnpblock, ' blocks...')
}
# iterator function to return genotype blocks
k <- 1
Giter <- function() {
if (k > nsnpblock) return(NULL)
G <- .readGeno(gdsobj, sample.include, snp.index = snp.blocks[[k]])
k <<- k + 1
return(G)
}
# beta <- foreach(k = 1:nsnpblock, .combine = rbind, .inorder = FALSE, .multicombine = TRUE) %do% {
# snpBlock <- function(G, ...) {
# if(verbose) message(' Running block ', k, '...')
# # read genotype data for the block
# G <- .readGeno(gdsobj, sample.include, snp.index = snp.blocks[[k]])
#
# # calculate ISAF betas
# .calcISAFBeta(G = G, VVtVi = VVtVi)
# }
### rather than returning and rbinding here, we may want to be writing the output to something
beta <- bpiterate(Giter, .calcISAFBeta, VVtVi = VVtVi, REDUCE = rbind, reduce.in.order = FALSE, BPPARAM = BPPARAM)
return(beta)
}
### exported function that does the pcrelate estimation for a (pair of) sample block(s)
pcrelateSampBlock <- function(gdsobj, betaobj, pcs, sample.include.block1, sample.include.block2,
scale = c('overall', 'variant', 'none'), ibd.probs = TRUE,
maf.thresh = 0.01, maf.bound.method = c('filter', 'truncate'),
BPPARAM = bpparam(), verbose = TRUE){
scale <- match.arg(scale)
maf.bound.method <- match.arg(maf.bound.method)
if(verbose) message('Using ', bpnworkers(BPPARAM), ' CPU cores')
# create (joint) PC matrix and indices
sample.include <- unique(c(sample.include.block1, sample.include.block2))
V <- .createPCMatrix(pcs = pcs, sample.include = sample.include)
idx <- which(rownames(V) %in% sample.include.block1)
jdx <- which(rownames(V) %in% sample.include.block2)
oneblock <- setequal(idx, jdx)
### slight inefficiency above because we create V for samples in block1 for each block2 when we don't have to if we are running serially;
### but this seems more straightforward to parallelize
# iterator function to return genotype blocks
k <- 1
Giter <- function() {
if (k > nsnpblock) return(NULL)
G <- .readGeno(gdsobj, sample.include, snp.index = snp.blocks[[k]])
k <<- k + 1
return(G)
}
snp.blocks <- .snpBlocks(gdsobj)
nsnpblock <- length(snp.blocks)
if(verbose) {
message('Running PC-Relate analysis using ', length(unlist(snp.blocks)), ' SNPs in ', nsnpblock, ' blocks...')
}
# compute estimates for each variant block; sum along the way
#matList <- foreach(k = 1:nsnpblock, .combine = .matListCombine, .inorder = FALSE, .multicombine = FALSE) %do% {
snpBlock <- function(G, ...) {
#if(verbose) message(' Running block ', k, '...')
# read genotype data for the block
#G <- .readGeno(gdsobj, sample.include, snp.index = snp.blocks[[k]])
# load betas for the current block of variants
beta.block <- betaobj[colnames(G), , drop = FALSE]
### this line of code will probably be different if we save the betas; need to load correct betas
# calculate PC-Relate estimates
.pcrelateVarBlock(G = G, beta = beta.block, V = V, idx = idx, jdx = jdx, scale = scale,
ibd.probs = ibd.probs, maf.thresh = maf.thresh, maf.bound.method = maf.bound.method)
}
matList <- bpiterate(Giter, snpBlock, REDUCE = .matListCombine, reduce.in.order = FALSE, BPPARAM = BPPARAM)
# take ratios to compute final estimates
estList <- .pcrelateCalcRatio(matList = matList, scale = scale, ibd.probs = ibd.probs)
rm(matList)
# cast to data.tables
if(oneblock){
# self table
kinSelf <- data.table(ID = rownames(estList$kin), f = 2*diag(estList$kin) - 1, nsnp = diag(estList$nsnp))
setkeyv(kinSelf, 'ID')
# between samples table
kinBtwn <- .estListToDT(estList, drop.lower = TRUE)
}else{
kinSelf <- NULL
# between samples table
kinBtwn <- .estListToDT(estList, drop.lower = FALSE)
}
rm(estList)
setkeyv(kinBtwn, c('ID1', 'ID2'))
return(list(kinSelf = kinSelf, kinBtwn = kinBtwn))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.