#' get and parse ExAC variants within a gene
#'
#' @param hgnc HGNC symbol for the gene that we want data for eg "ARID1B"
#' @param chrom chromosome string e.g. "6"
#' @param start start position of region to be investigated (if checking for
#' gene region defined by chromosome coordinates rather than a HGNC-based
#' gene region).
#' @param end end position of region to be investigated (if checking for
#' gene region defined by chromosome coordinates rather than a HGNC-based
#' gene region).
#' @param fileName path to tabis-indexed ExAC VCF file.
#' @param check_last_base boolean for whether to check if last base non-lofs can
#' be LoF.
#' @param remove_benign boolean for whether to exclude missense_variants with
#' polyphen tolerated predictions.
#' @export
#'
#' @return data frame of variants in gene
get_exac_variants_for_gene <- function(hgnc, chrom, start=NULL, end=NULL,
fileName=NULL, check_last_base=FALSE, remove_benign=FALSE) {
if (!is.null(hgnc) & is.null(start) & is.null(end)) {
rows = get_gene_coordinates(hgnc, chrom)
start=rows$start
end=rows$stop
}
if (is.null(fileName)) {
fileName = get("EXAC", envir=exacPathEnv)
}
if (!file.exists(fileName)) {
stop(paste("Cannot find the ExAC file. Check that the file exists at: ",
fileName, ", or obtain the ExAC datasets, ",
"then run set_exac_path(YOUR_PATH) before trying this function ",
"again.", sep=""))
}
# define the populations of interest within the ExAC dataset
populations = list(AFR="African/African American", AMR="American",
EAS="East Asian", FIN="Finnish", NFE="Non-Finnish European",
SAS="South Asian")
# extract variants within the region from the VCF
vars = seqminer::readVCFToListByRange(fileName,
range=paste(chrom, ":", start, "-", end, sep=""),
annoType="",
vcfColumn=c("CHROM", "POS", "REF", "ALT"),
vcfInfo=c("AC", "AN", paste("AC_", names(populations), sep=""),
paste("AN_", names(populations), sep=""), "CSQ"),
vcfIndv=c())
# if the gene doesn't contain any variants in ExAC, then return a list of
# empty dataframes
if (length(vars$CHROM) == 0) {
empty = data.frame('CHROM'=character(0), 'POS'=character(0),
'REF'=character(0), 'ALT'=character(0), 'AC'=character(0),
'AN'=character(0), 'CSQ'=character(0))
vars = list()
for (x in names(populations)) { vars[[x]] = empty }
return(vars)
}
# convert the vars list to a data frame, then extract the necessary info
# from the VEP data field.
vars$sampleId = NULL
vars = data.frame(vars, stringsAsFactors=FALSE)
vars$CQ = apply(vars, 1, parse_vep_output, hgnc=hgnc, remove_benign=remove_benign)
vars$CSQ = NULL
if (check_last_base & !is.null(hgnc)) {
ends = get_exon_ends(hgnc)
exon_ends = ends$all_ends
strand = ends$strand
vars$CQ = apply(vars, 1, check_for_last_base_in_exon, exon_ends=exon_ends, strand=strand)
}
vars = get_list_of_population_frequencies(vars, populations)
return(vars)
}
#' create a list of allele frequency dataframes for each population.
#'
#' @param vars data frame of variants in gene, with allele counts and totals for
#' several ExAC sub-groups.
#' @param populations list of populations (e.g. list(EAS="East Asian",
#' FIN="Finnish")). Perhaps this should be a simple vector of names, but
#' this way I have more information about what the populations are.
#' @export
#'
#' @return list of dataframes, one for each population. The dataframes contain
#' allele count, allele number and VEP consequence strings for each
#' allele in the input dataframe. The list is named for each of the
#' populations.
get_list_of_population_frequencies <- function(vars, populations) {
# define a blank list to place the formatted dataframes into
groups = vector(mode="list", length=length(populations))
names(groups) = names(populations)
for (population in names(populations)) {
# define the allele count and total alleles for the population
vars$AC = vars[[paste("AC_", population, sep="")]]
vars$AN = vars[[paste("AN_", population, sep="")]]
# split out multiple alleles, drop all unecessary columns, and add the
# dataframe to the list
by_allele = standardise_multiple_alt_variants(vars)
by_allele = by_allele[, c("CHROM", "POS", "REF", "ALT", "AC", "AN", "CQ")]
groups[[population]] = by_allele
}
return(groups)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.