#' update allele frequencies function
#'
#' updates allele frequency data in heterogeneous genetic variant data
#'
#' @param data dataframe
#' @param reference Array of character strings.
#' Column names for the ancestries used in the model
#' @param observed Character string. Column name of observed ancestry
#' @param pi_target Numeric vector.
#' Ancestry proportion values for the target ancestry
#' @param pi_observed Numeric vector.
#' Ancestry proportion values for the observed ancestry.
#' @param panel array of character strings. The reference ancestry to be used in the ancestr() function
#' If the user does not have pi_observed values already.
#' @return data.frame data with an additional column at the end containing updated allele frequencies.
#'
#' @author Gregory Matesi, \email{gregory.matesi@ucdenver.edu}
#'
#' @reference
#' @keywords genomics
#'
#' @examples
#' data(ancestryData)
#' updateAF(data = ancestryData,
#' reference = c("ref_AF_eur_1000G", "gnomAD_AF_afr"),
#' observed = "gnomAD_AF_afr",
#' pi_target = c(1,0),
#' pi_observed = c(.15,.18))
#' @export
# INPUTS:
# data (dataframe):
# CHR RSID POS A1 A2 ref_eur ... ref_iam target_afr obs_amr obs_oth
# 1 rs2887286 1156131 C T 0.173275495 ... 0.7093 0.4886100 0.52594300 0.22970500
# 1 rs41477744 2329564 A G 0.001237745 ... 0.0000 0.0459137 0.00117925 0.00827206
# 1 rs9661525 2952840 G T 0.168316089 ... 0.2442 0.1359770 0.28605200 0.15561700
# 1 rs2817174 3044181 C T 0.428212624 ... 0.5000 0.8548790 0.48818000 0.47042500
# 1 rs12139206 3504073 T C 0.204214851 ... 0.3372 0.7241780 0.29550800 0.25874800
# 1 rs7514979 3654595 T C 0.004950604 ... 0.0000 0.3362490 0.01650940 0.02481620
#
# reference: k reference ancestry column names from data
#
# observed:
#
# pi_target:
#
# pi_reference:
#
# pi_observed:
#
#
#
#
updateAF <- function( data ,
reference ,
observed ,
pi_target ,
pi_observed = c( NA),
panel = c("None")
){
# If they are both empty, Error
if(all(panel == "None") & all(is.na(pi_observed))){
stop("ERROR: Either pi_observed or panel is required.")
}
# If only panal was entered, call ancestr()
if( all(panel != "None") & all(is.na(pi_observed)) ){
# Estimate observed proportion values
pi_observed <- ancestr(data=data, k=panel, t=observed)[,-c(1:4)]
# Else if both pi_observed and panel were entered, Error
}else if( !all(is.na(pi_observed)) & all(panel !="None") ){
stop("Error: Either panel or pi_observed must be entered. not both.")
}
if( !all(colnames(data)[1:5] == c("CHR", "RSID", "POS", "A1", "A2") ) ){
stop("ERROR: Please make sure the first five columns of the data are names according to the documentation.")
}
if(class(data)!="data.frame"){
stop("ERROR: data must be a data.frame as described in the vignette")
}
if(typeof(observed)!="character"){
stop("ERROR: please enter the column name of the observed ancestry for t")
}
if( !( observed %in% names(data) ) ){
stop("ERROR: please enter the column name of the observed ancestry for t")
}
if(typeof(reference)!="character"){
stop("ERROR: please enter the column names for the reference ancestries for k")
}
if(all(reference %in% names(data))==FALSE){
stop("ERROR: please enter the column names for the reference ancestries for k")
}
if(length(pi_target) != length(reference)){
stop("ERROR: please enter a vector of proportion estimates for the target ancestry")
}
if(length(pi_observed) != length(reference)){
stop("ERROR: pi_observed was not entered correctly. This will prompt the function to estimate them using sequential quadratic programming.")
}
if( !(all(pi_observed<=1) & all(pi_target<=1)) | !(all(pi_observed>=0) & all(pi_target>=0)) ){
stop("ERROR: pi_observed and pi_target must contain ratios between 0 and 1.")
}
# Output the users ancestries and pi values in a dataframe as a check.
# Check to see if pi_reference are NA.
# If yes, use the HA function.
# Normalize pi. We need the pi_reference and pi_target to sum to 1
pi_target <- pi_target / sum(pi_target)
pi_observed <- pi_observed / sum(pi_observed)
# We need to sum the reference ancestry allele frequencies multiplied by pi_target.
# We will call this sum "starred"
# We also need to sum up the reference ancestry allele frequencies multiplied by pi_reference.
# We will call this sum "hatted"
# Initialize "hatted" and "starred"
hatted <- vector(mode = "double", length = dim(data)[1])
starred <- vector(mode = "double", length = dim(data)[1])
# Sum the K-1 reference ancestries multiplied by pi_hat.
# Also the sum the k-1 reference ancestries multiplied by pi_star.
# I'd like to find better names for "leaveoneout," "hatted," and "starred."
leaveoneout <- reference[-c(which(reference==observed))]
for ( k in 1:length(leaveoneout) ){
hatted <- hatted + pi_observed[which(reference!=observed)][k] * data[,leaveoneout[k]]
starred <- starred + pi_target[ which(reference!=observed)][k] * data[,leaveoneout[k]]
}
# Add a new column to D:
# Ancestry adjusted allele frequency time the ratio or pi_star/pi_hat
# plus the sum "starred"
data$adjustedAF <- (
( pi_target[which(reference==observed)] / pi_observed[which(reference==observed)] ) *
( data[,observed] - hatted )
+ starred )
data <- cbind(data[,c("CHR", "RSID", "POS", "A1", "A2")], data[,c(reference, "adjustedAF")] )
return(data)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.