#' Hidden Ancestries function
#'
#' Estimates ancestry proportions in heterogeneous allele frequency data
#'
#' @param data Dataframe: This dataframe should be in the format described in our data formatting document.
#' @param k Vector of character strings:
#' Column names of reference ancestries to be included in the model.
#' @param t Character sting: column name of the observe ancestry.
#' @param x0 Numeric vector: x_0 is the starting guess for the SLSQP algorithm.
#' @return Estimated ancestry proportions
#'
#' @author Gregory Matesi, \email{gregory.matesi@ucdenver.edu}
#' @reference
#' @keywords genomics
#'
#' @examples
#' # load the data
#' data("ancestryData")
#'
#' # Estimate 5 reference ancestry proportion values for the gnomAD african american ancestry group
#' # using a starting guess of .2 for each ancestry proportion.
#' ancestr( data = ancestryData,
#' k=c("ref_AF_afr_1000G",
#' "ref_AF_eur_1000G",
#' "ref_AF_sas_1000G",
#' "ref_AF_iam_1000G",
#' "ref_AF_eas_1000G"),
#' t="gnomAD_AF_afr",
#' x0 = c(.2, .2, .2, .2, .2) )
#'
#' @export
#' @importFrom nloptr slsqp
ancestr = function(data=NULL, k=c(NA), t=NA, x0=NULL){
if(class(data)!="data.frame"){
stop("ERROR: data must be a data.frame as described in the vignette")
}
if(typeof(t)!="character"){
stop("ERROR: t must be the column name of the observed ancestry from data")
}
if(!t %in% names(data)){
stop("ERROR: t must be the column name of the observed anestry from data")
}
if(typeof(k)!="character"){
stop("ERROR: k must be an array of column names from data to be used in the reference")
}
if(all(k %in% names(data))==FALSE){
stop("ERROR: k must be an array of column names from data to be used in the reference")
}
# Filter NA allele frequencies out of the observed column
filteredNA <- length(which(is.na(data[,t]==TRUE)))
# The math in the ancestr function only uses the
# observed allele frequency vector and the reference panel
observed <- as.data.frame( data[which(is.na(data[t])==FALSE),t] )
refmatrix <- as.data.frame( data[which(is.na(data[t])==FALSE),k] )
#
if(length(x0) != 0){
#########################
# Check if x0 is numeric
if (is.numeric(x0)==FALSE){
stop("ERROR: Please make sure x0 is a positive numeric vector of length k that sums to one")
}
# Check if length(x0)==k
if (length(x0)!= length(k)){
stop("ERROR: Please make sure x0 is a positive numeric vector of length k that sums to one")
}
# Check that x0 is positive
if (all(x0>0) == FALSE){
stop("ERROR: Please make sure x0 is a positive numeric vector of length k that sums to one")
}
# Check if sum(x0) = 1
if (sum(x0)!=1){
stop("ERROR: Please make sure x0 is a positive numeric vector of length k that sums to one")
}
#########################
###############################
# Set the starting guess to x_0
###############################
starting = x0
} else{
starting = rep( 1/ncol(refmatrix), ncol(refmatrix) )
}
# Here we are defining the objective function. This function is evaluated at a
# k-dimensional set x . Each of our K reference allele frequencies are multiplied by
# our current best guess for the ancestry proportion.
# We then subtract the allele frequency values from the observed homogeneous population.
# And finally this sum is squared to achieve a least squares form.
#########################
fn.ancmix = function(x){
minfunc = 0
for (i in 1:ncol(refmatrix)){
minfunc = minfunc + x[i]*refmatrix[,i]
}
minfunc = minfunc - observed
minfunc = sum((minfunc)**2)
return(minfunc)
}
########################
#fn.ancmix = function(x){
# minfunc = 0
# for (i in 1:ncol(refmatrix)){
# minfunc = minfunc + x[i]*refmatrix[,i]
# }
# minfunc = minfunc - observed
# minfunc = sum((minfunc)**2)
# return(minfunc)
#}
# Here we are defining the gradient of the objective function.
gr.ancmix <- function(x){
gradvec = matrix(0,ncol(refmatrix),1)
gradfunc = 0
for (i in 1:ncol(refmatrix)){
gradfunc = gradfunc + x[i]*refmatrix[,i]
}
gradfunc = gradfunc - observed
for (i in 1:ncol(refmatrix)){
gradvec[i] = sum(2 * refmatrix[,i] * gradfunc)
}
return(gradvec)
}
# H equality
# This function returns the equality constraints for the nloptr slsqp algorithm
# We sum up the K current proportion estimate values and subtract 1. If the estimated proportion values sum to 1 than this value should equal zero.
heq.ancmix = function(x){
equality = 0
for (i in 1:ncol(refmatrix)){
equality = equality + x[i]
}
return(equality - 1)
}
# H inequality
# This function returns a K vector of
hin.ancmix <- function(x){
h = numeric(ncol(refmatrix))
for (i in 1:ncol(refmatrix)){
h[i] = x[i]
}
return(h)
}
# We use the start_time function base function
#to record the run time for our convex optimization algorithm.
# The output for the nloptr slsqp function is stored in the variable S.
# This function requires 5 inputs:
# 1. fn.ancmix is the objective function.
# 2. gr.ancmix is the gradient of the objective function.
# 3. hin.ancmix defining the inequality constraints
# 4. heq.ancmix defining the equality constraints
start_time = Sys.time()
S = suppressMessages( slsqp(starting,
fn = fn.ancmix,
gr = gr.ancmix,
hin = hin.ancmix,
heq = heq.ancmix)
)
end_time = Sys.time()
ttime = end_time - start_time
# par is the K optimal ancestry propotion estimates
# value is the minimization function evaluated at par
# iter is the number of iterations that the algorithm took to reach the optimal solution of par
# finally ttime is the run time for the algorithm
d <- data.frame(matrix(ncol = length(k)+4, nrow = 1))
colnames(d) <- c("objective", "iterations", "time",
"filtered", colnames(refmatrix))
d[1] <- S$value
d[2] <- S$iter
d[3] <- ttime
d[4] <- filteredNA
for(i in 1:length(k)){
d[4+i] <- S$par[i]
}
return(d)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.