# R/eila.R In EILA: Efficient Inference of Local Ancestry

#### Documented in eila

```## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## Subset AIMs
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

comp.anc.dist <- function(anc1,anc2){
m <- nrow(anc1)
res <- rep(NA,m)
for (i in 1:m){
res[i] <- wilcox.test(anc1[i,],anc2[i,])\$p.value
}
return(res)
}

get.thin.idx <- function(pvalues, position, V=50000, C=0.2){
pos.int <- floor((position-min(position)+0.5)/V)

res <- rep(FALSE,length(position))
for (i in unique(pos.int)){
idx <- pos.int != i
p <- pvalues
p[idx] <-  1
res[which.min(p)] <- TRUE
}

res[pvalues > C] <- FALSE
return(res)
}

subset.data <- function(obj,idx){
m <- length(obj)
res <- obj
for (i in 1:m){
x <- obj[[i]]
if(is.vector(x)){
if(length(x) > 1){
res[[i]] <- x[idx]
}else{
res[[i]] <- x
}
}else if(is.matrix(x)){
res[[i]] <- x[idx,]
}
}
return(res)
}

get.AIMs <- function(obj){

pval <- comp.anc.dist(obj\$anc1,obj\$anc2)
idx <- get.thin.idx(pvalues=pval,position=obj\$position)
res <- subset.data(obj,idx)

return(res)
}

## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## dec genotype to probability
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

get.prob.score <- function(obj){
anc1 <- obj\$anc1
anc2 <- obj\$anc2

old.o <- options(warn= -1)
prob <- matrix(NA, nrow=r, ncol=c)
y <- c(rep(1,ncol(anc1)),rep(0,ncol(anc2)))
ANC <- cbind(anc1,anc2)
for (i in 1:r){
x <- ANC[i,]
type="response")
}
options(old.o)

res <- list(prob=prob,
position=obj\$position)
return(res)
}

## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## partition chromosome
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

partition.chr <- function(pos){

EPI <- 0.1
b0 <- min(pos)
b1 <- max(pos)
res <- c(b0-EPI,b1+EPI)
D <- 300
if(length(pos)> D){
M <- ceiling(length(pos)/D)
K <- (b1-b0)/M
res <- c(res,seq(1,M-1)*K)
res <- sort(res)
}
return(res)
}

## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## Calculate dec QR est
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

quantreg.est <- function(y, lambda, eps=1e-3, tau=0.5) {
m <- length(y)
E <- diag(m)
D <- diff(E)
B <- rbind(E, lambda * D)

uniq.y <- unique(y)
if(length(uniq.y) < length(y)){
d0 <- min(diff(sort(uniq.y))) / 100
y <- y + rnorm(length(y), mean=0, sd=d0)
}
yhat <- c(y,rep(0,m-1))
res.rq.fit <- rq.fit.fnb(B,yhat, eps=eps, tau=tau)
rq.est <- res.rq.fit\$coef
return(rq.est)
}

prob.to.qr <- function(prob, lambda){
R <- nrow(prob)
C <- ncol(prob)
res <- matrix(NA, nrow=R, ncol=C)

for (i in 1:C){
res[,i] <- quantreg.est(prob[,i], lambda=lambda)
}
return(res)
}

get.fused.qr <- function(obj,lambda){

partition.points <- partition.chr(obj\$position)

N <- nrow(obj\$prob)
C <- ncol(obj\$prob)
dec.qr <- matrix(NA,nrow=N,ncol=C)
for (k in 1:(length(partition.points)-1)){
##cat(k," out of ",length(partition.points)," ......\n")
idx <- obj\$position > partition.points[k] &
obj\$position <= partition.points[k+1]
dec.qr[idx,] <- prob.to.qr(obj\$prob[idx,],lambda)

}
position = obj\$position)

return(res)
}

## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## Get breakpoints(jumps)
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
get.breakpoints <- function(obj){
position <- obj\$position
EPS <- 10^(-2)
N <- ncol(dec.qr)
res <- list()
for (i in 1:N){
left.idx <- which(abs(diff(dec.qr[,i])) > EPS)
res[[i]] <- (position[left.idx] + position[left.idx + 1])/2
}
return(res)
}

## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## cluster each segment
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

n1 <- nrow(obj1)
n2 <- nrow(obj2)
n <- min(c(n1,n2))
idx1 <- sample(n1,n)
idx2 <- sample(n2,n)
res <- matrix(NA, nrow=n, ncol=ncol(obj1))
for (i in 1:n){
x1 <- obj1[idx1[i],]
x2 <- obj2[idx2[i],]
idx1.0 <- x1 == 0
idx1.1 <- x1 == 1
idx1.2 <- x1 == 2
idx2.0 <- x2 == 0
idx2.1 <- x2 == 1
idx2.2 <- x2 == 2
z <- (x1 + x2)/2

idx <- (idx1.0 & idx2.1) | (idx1.1 & idx2.0)
if(any(idx)) z[idx] <- sample(c(0,1),sum(idx), replace=T)

idx <- (idx1.1 & idx2.2) | (idx1.2 & idx2.1)
if(any(idx)) z[idx] <- sample(c(1,2),sum(idx), replace=T)

idx <- idx1.1 & idx2.1
if(any(idx)) z[idx] <- sample(c(0,1,2), size=sum(idx),
replace=T, prob=c(0.25,0.5,0.25))
res[i,] <- z
}
return(res)
}

cluster.segment <- function(obj){

test <- obj\$test
train <- obj\$train
train.lab <- obj\$train.lab
ini <- obj\$ini

get.mode <- function(x){
y <- table(x)
names(y[which.max(y)])
}

cluster <- try(kmeans(rbind(train,test),centers=ini)\$cluster,T)
if(class(cluster) == "try-error"){
res <- knn(train, test, train.lab,k=7)
res <- as.character(res)
}else{
test.cluster <- cluster[length(cluster)]
train.cluster <- cluster[-length(cluster)]
names(train.cluster) <- train.lab
res <- get.mode(names(train.cluster[train.cluster ==
test.cluster]))
if(length(res) == 0){
res <- knn(train, test, train.lab,k=7)
res <- as.character(res)
}
}
res <- as.integer(res)
return(res)
}

get.train.test.data <- function(obj){
anc1 <- obj\$anc1
anc2 <- obj\$anc2
anc3.NULL <- obj\$anc3.NULL

if(!anc3.NULL){
anc3 <- obj\$anc3
n3 <- nrow(anc3)
ini <- rbind(apply(anc1,2,mean),
apply(anc2,2,mean),
apply(anc3,2,mean),
}else{
anc3 <- NULL
n3 <- n13 <- n23 <- 0
ini <- rbind(apply(anc1,2,mean),
apply(anc2,2,mean),
)
}
train.lab <- c(rep(11,nrow(anc1)),
rep(22,nrow(anc2)),
rep(33,n3),
rep(13,n13),
rep(23,n23))

return(list(test=dec,
train=train,
train.lab=train.lab,
ini=ini)
)
}

## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## Infer local ancestry
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

subset.segment <- function(obj,i,idx){
list(test      = obj\$test[i,idx,drop=F],
train     = obj\$train[,idx],
train.lab = obj\$train.lab,
ini       = obj\$ini[,idx]
)
}
transpose.matrix <- function(obj){
m <- length(obj)
res <- obj
for (i in 1:m){
x <- obj[[i]]
if(is.matrix(x)) res[[i]] <- t(x)
}
return(res)
}

get.local.ancestry <- function(obj, breakpoints){

obj.t <- transpose.matrix(obj) ## From GxN to NxG
my.data <- get.train.test.data(obj.t)

pos <- obj\$position
b0 <- min(pos) - 1
bm <- max(pos) + 1
R <- nrow(my.data\$test)
C <- ncol(my.data\$test)
res <- matrix(NA, nrow=R, ncol=C)

for (i in 1:R){
##cat("\ni=",i,"\n")
bps <- c(b0, breakpoints[[i]], bm)
m <- length(bps)
for (b in 1:(m-1)){
##cat("\tb=",b,"\n")
idx <- pos > bps[b]  & pos <= bps[b+1]
##cat("\tidx length =",sum(idx),"\n")
if(sum(idx) > 1){
obj.ib <- subset.segment(my.data,i,idx)
res.cluster <- cluster.segment(obj.ib)
res[i,idx] <- res.cluster
}
}
}
res <- t(res) ## Transpose back to match the input data format:
return(res)
}

## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## MAIN program
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

infer.breakpoints <- function(obj,lambda){
#    cat("\n\nget AIMs ......")
AIM <- get.AIMs(obj)

#    cat("\n\nget prob score ......")
PROB <- get.prob.score(AIM)

#    cat("\n\nget fused quantile regression ......\n")
FQR <- get.fused.qr(PROB, lambda=lambda)

#    cat("\n\nget breakpoints ......")
res <- get.breakpoints(FQR)
return(res)
}

infer.local.ancestry1 <- function(obj,lambda){

Breakpoints <- infer.breakpoints(obj,lambda)

#    cat("\n\ninfer local ancestry ......")
res <- get.local.ancestry(obj, Breakpoints)
return(res)

}

infer.local.ancestry2 <- function(obj,obj1,obj2,obj3,lambda){

Breakpoints1 <- infer.breakpoints(obj1,lambda)
Breakpoints2 <- infer.breakpoints(obj2,lambda)
Breakpoints3 <- infer.breakpoints(obj3,lambda)

m <- length(Breakpoints1)
Breakpoints <- list()
for (i in 1:m){
z <- c(Breakpoints1[[i]],
Breakpoints2[[i]],
Breakpoints3[[i]])
Breakpoints[[i]] <- sort(z)
}

cat("infer local ancestry ......\n")
res <- get.local.ancestry(obj, Breakpoints)
return(res)
}

lambda=15,rng.seed=172719943){

rng.state <- NULL
if(exists(".Random.seed")) rng.state <- .Random.seed
set.seed(rng.seed)

anc3.NULL <- ifelse(is.null(anc3), TRUE, FALSE)

old.op <- options(error = NULL)
if(!hasArg(position)) stop("Argument position is missing")
if(!hasArg(anc1))     stop("Argument anc1 is missing")
if(!hasArg(anc2))     stop("Argument anc2 is missing")

if(!is.vector(position)) position <- as.vector(position)
if(!is.matrix(anc1)) anc1 <- as.matrix(anc1)
if(!is.matrix(anc2)) anc2 <- as.matrix(anc2)

G1 <- nrow(anc1)
G2 <- nrow(anc2)

if(length(unique(c(G0,G1,G2))) > 1 )
stop("Numbers of rows between admxied, anc1, or anc2 do not match")

stop("Current verison of eila requires at least five columns in admixed, anc1, and anc2")

if(any(is.na(position))) stop("There is a missing value in position")
stop("There is a missing value in admixed. Please impute missing (eg. impute package)")
if(any(is.na(anc1)))
stop("There is a missing value in anc1. Please impute missing (eg. impute pacakge)")
if(any(is.na(anc2)))
stop("There is a missing value in anc2. Please impute missing (eg. impute pacakge)")

stop("The values in anc1 are not valid")
if(any(anc1 < 0) || any(anc1 > 2))
stop("The values in anc1 are not valid")
if(any(anc2 < 0) || any(anc2 > 2))
stop("The values in anc1 are not valid")

if(lambda < 0) stop("lambda has to be a positive number")

position=position,
anc1=anc1,
anc2=anc2,
anc3.NULL=anc3.NULL)

if(!anc3.NULL){
if(!is.matrix(anc3)) anc3 <- as.matrix(anc3)
G3 <- nrow(anc3)
if(G0 != G3) stop("Numbers of rows between admxied and anc3 do not match")
if(any(is.na(anc3)))
stop("There is a missing value in anc3. Please impute missing (eg. impute pacakge)")

obj <- c(obj, list(anc3=anc3))
}

if(anc3.NULL){
local.ancestry <- infer.local.ancestry1(obj,lambda)
}else{
local.ancestry <- infer.local.ancestry2(obj,obj1,obj2,obj3,lambda)
}
options(old.op)

return(list(local.ancestry=local.ancestry,
rng.seed=rng.seed, rng.state=rng.state))
}
```

## Try the EILA package in your browser

Any scripts or data that you put into this service are public.

EILA documentation built on May 29, 2017, 11:46 a.m.