RHiddenAncestries: Hidden Ancestries Group R Package

Documented in ancestr

#' Hidden Ancestries function
#'
#' Estimates ancestry proportions in heterogeneous allele frequency data
#'
#' @param data Dataframe: This dataframe should be in the format described in our data formatting document.
#' @param k Vector of character strings: 
#'          Column names of reference ancestries to be included in the model.
#' @param t Character sting: column name of the observe ancestry.
#' @param x0 Numeric vector: x_0 is the starting guess for the SLSQP algorithm.
#' @return Estimated ancestry proportions
#'
#' @author Gregory Matesi, \email{gregory.matesi@ucdenver.edu}
#' @reference
#' @keywords genomics
#' 
#' @examples
#' # load the data
#' data("ancestryData")
#' 
#' # Estimate 5 reference ancestry proportion values for the gnomAD african american ancestry group
#' # using a starting guess of .2 for each ancestry proportion.
#' ancestr( data = ancestryData, 
#'     k=c("ref_AF_afr_1000G", 
#'         "ref_AF_eur_1000G", 
#'         "ref_AF_sas_1000G", 
#'         "ref_AF_iam_1000G", 
#'         "ref_AF_eas_1000G"), 
#'     t="gnomAD_AF_afr", 
#'     x0 = c(.2, .2, .2, .2, .2) )
#'
#' @export
#' @importFrom nloptr slsqp


ancestr = function(data=NULL, k=c(NA), t=NA, x0=NULL){
  
  if(class(data)!="data.frame"){
    stop("ERROR: data must be a data.frame as described in the vignette")
  }
  
  if(typeof(t)!="character"){
    stop("ERROR: t must be the column name of the observed ancestry from data")
  }
  if(!t %in% names(data)){
    stop("ERROR: t must be the column name of the observed anestry from data")
  }
  if(typeof(k)!="character"){
    stop("ERROR: k must be an array of column names from data to be used in the reference")
  }
  if(all(k %in% names(data))==FALSE){
    stop("ERROR: k must be an array of column names from data to be used in the reference")
  }
  
  # Filter NA allele frequencies out of the observed column
  filteredNA <- length(which(is.na(data[,t]==TRUE)))
  # The math in the ancestr function only uses the 
  # observed allele frequency vector and the reference panel
  observed  <- as.data.frame( data[which(is.na(data[t])==FALSE),t] )
  refmatrix <- as.data.frame( data[which(is.na(data[t])==FALSE),k] )
  
  # 
  if(length(x0) != 0){
    #########################
    # Check if x0 is numeric
    if (is.numeric(x0)==FALSE){
      stop("ERROR: Please make sure x0 is a positive numeric vector of length k that sums to one")
    }
    # Check if length(x0)==k
    if (length(x0)!= length(k)){
      stop("ERROR: Please make sure x0 is a positive numeric vector of length k that sums to one")
    }
    # Check that x0 is positive
    if (all(x0>0) == FALSE){
      stop("ERROR: Please make sure x0 is a positive numeric vector of length k that sums to one")
    }
    # Check if sum(x0) = 1
    if (sum(x0)!=1){
      stop("ERROR: Please make sure x0 is a positive numeric vector of length k that sums to one")
    }
    #########################
    ###############################
    # Set the starting guess to x_0
    ###############################
    starting = x0
  } else{
    starting = rep( 1/ncol(refmatrix), ncol(refmatrix) )
  }
  # Here we are defining the objective function. This function is evaluated at a
  # k-dimensional set  x . Each of our K reference allele frequencies are multiplied by 
  # our current best guess for the ancestry proportion.
  # We then subtract the allele frequency values from the observed homogeneous population.
  # And finally this sum is squared to achieve a least squares form.
  
  #########################
  fn.ancmix = function(x){
    minfunc = 0
    for (i in 1:ncol(refmatrix)){
      minfunc = minfunc + x[i]*refmatrix[,i]
    }
    minfunc = minfunc - observed
    minfunc = sum((minfunc)**2)
    return(minfunc)
  }
  ########################
  
  #fn.ancmix = function(x){
  #  minfunc = 0
  #  for (i in 1:ncol(refmatrix)){
  #    minfunc = minfunc + x[i]*refmatrix[,i]
  #  }
  #  minfunc = minfunc - observed
  #  minfunc = sum((minfunc)**2)
  #  return(minfunc)
  #}
  
  # Here we are defining the gradient of the objective function.
  gr.ancmix <- function(x){
    gradvec = matrix(0,ncol(refmatrix),1)
    gradfunc = 0
    for (i in 1:ncol(refmatrix)){
      gradfunc = gradfunc + x[i]*refmatrix[,i]
    }
    gradfunc = gradfunc - observed
    
    for (i in 1:ncol(refmatrix)){
      gradvec[i] = sum(2 * refmatrix[,i] * gradfunc)
    }
    return(gradvec)
  }
  
  # H equality
  # This function returns the equality constraints for the nloptr slsqp algorithm
  # We sum up the K current proportion estimate values and subtract 1. If the estimated proportion values sum to 1 than this value should equal zero.
  heq.ancmix = function(x){
    equality = 0
    for (i in 1:ncol(refmatrix)){
      equality = equality + x[i]
    }
    return(equality - 1)
  }
  
  # H inequality
  # This function returns a K vector of 
  hin.ancmix <- function(x){
    h = numeric(ncol(refmatrix))
    for (i in 1:ncol(refmatrix)){
      h[i] = x[i]
    }
    return(h)
  }
  
  # We use the start_time function base function 
  #to record the run time for our convex optimization algorithm.
  # The output for the nloptr slsqp function is stored in the variable S.
  # This function requires 5 inputs:
  #   1. fn.ancmix is the objective function.
  #   2. gr.ancmix is the gradient of the objective function.
  #   3. hin.ancmix defining the inequality constraints
  #   4. heq.ancmix defining the equality constraints
  start_time = Sys.time()
  S = suppressMessages( slsqp(starting,
            fn = fn.ancmix,
            gr = gr.ancmix,
            hin = hin.ancmix,
            heq = heq.ancmix)
      )
  end_time = Sys.time()
  ttime = end_time - start_time
  
  # par is the K optimal ancestry propotion estimates
  # value is the minimization function evaluated at par
  # iter is the number of iterations that the algorithm took to reach the optimal solution of par
  # finally ttime is the run time for the algorithm
  
  d <- data.frame(matrix(ncol = length(k)+4, nrow = 1)) 
  colnames(d) <- c("objective", "iterations", "time",
                        "filtered", colnames(refmatrix))
  
  d[1] <- S$value
  d[2] <- S$iter
  d[3] <- ttime
  d[4] <- filteredNA
  for(i in 1:length(k)){
    d[4+i] <- S$par[i]
  }
  return(d)
}