#' @title Format UKBB MFI data for Ensembl VEP
#' @description Converts UKBB MFI variant data into a format that is readable
#' by VEP 94. When the data is read from \code{path}, the file is assumed to
#' have no header and the column order
#' ('ALT_ID', 'RS_ID', 'POS', 'A1', 'A2', 'MAF', 'MA', 'INFO').
#' @details
#' The columns produced are:
#' \itemize{
#'   \item \code{ENS_CHR}: the text before the first ':' of the FIRST
#'     \code{ALT_ID}, applied to every row (one chromosome per input is
#'     assumed).
#'   \item \code{ENS_START}: \code{POS} as integer.
#'   \item \code{ENS_END}: \code{POS}, or \code{POS + nchar(A2) - 1} when
#'     \code{A2} is longer than one character.
#'   \item \code{ENS_ALLELE}: \code{A2/A1}.
#'   \item \code{ENS_STRAND}: the value of \code{strand}.
#'   \item \code{ENS_ID}: \code{ALT_ID} or \code{RS_ID}, see \code{use.alt.id}.
#'   \item \code{ENS_CHR_POS_A1_A2}: \code{<chr>:<pos>_<A1>_<A2>}.
#' }
#' Two progress lines are written to stdout.
#' @param path string. Path to file. If NULL, \code{path} is ignored and the
#' argument \code{df} is expected instead.
#' @param df a data.frame with at least the columns ALT_ID, POS, A1 and A2
#' (and RS_ID when \code{use.alt.id} is FALSE). Only used when \code{path} is
#' NULL. A copy is taken, so the caller's object is left untouched.
#' @param strand string written to the ENS_STRAND column. Defaults to '+'.
#' @param data.table.fill See data.table::fread (fill)
#' @param data.table.verbose See data.table::fread (verbose). Should
#' data.table stats be printed?
#' @param use.alt.id Should the alternative (unique) ID be used as ENS_ID
#' instead of the RS_ID?
#' @return a data.table holding every column whose name contains 'ENS'.
format_ensembl_vep <- function(path = NULL, df = NULL, strand = '+',
                               data.table.fill = TRUE,
                               data.table.verbose = TRUE,
                               use.alt.id = TRUE) {
  mfi_cols <- c("ALT_ID", "RS_ID", "POS", "A1", "A2", "MAF", "MA", "INFO")
  use_alt <- isTRUE(as.logical(use.alt.id))

  if (!is.null(path)) {
    df <- data.table::fread(
      path,
      fill = data.table.fill,
      verbose = data.table.verbose,
      header = FALSE
    )
    data.table::setnames(df, mfi_cols)
  } else if (!is.null(df)) {
    # as.data.table() copies; setDT() would convert the caller's object
    # in place, which is a surprising side effect for a formatting helper.
    df <- data.table::as.data.table(df)
  } else {
    stop("Either 'path' or 'df' must be supplied.", call. = FALSE)
  }
  write(paste(nrow(df), "lines were loaded..\n"), stdout())

  required <- c("ALT_ID", "POS", "A1", "A2", if (!use_alt) "RS_ID")
  absent <- setdiff(required, colnames(df))
  if (length(absent) > 0) {
    stop(
      "Input is missing required column(s): ",
      paste(absent, collapse = ", "),
      call. = FALSE
    )
  }
  if (nrow(df) == 0) {
    stop(
      "Input contains no rows; cannot derive the chromosome from ALT_ID.",
      call. = FALSE
    )
  }

  # as.character() guards against factor columns, on which strsplit() and
  # nchar() would fail.
  alt_id <- as.character(df[["ALT_ID"]])
  a1 <- as.character(df[["A1"]])
  a2 <- as.character(df[["A2"]])
  pos <- as.integer(df[["POS"]])

  # Assumes that the first ALT_ID contains the chromosome name. Only the
  # first row is inspected because other rows may carry an ID without a
  # chromosome prefix.
  if (!grepl(":", alt_id[1], fixed = TRUE)) {
    warning(
      "First ALT_ID does not contain ':'; ENS_CHR may not be a chromosome.",
      call. = FALSE
    )
  }
  chr <- strsplit(alt_id[1], split = ":", fixed = TRUE)[[1]][1]

  # Deletions / insertions: the end position is extended by the length of A2,
  # the allele listed first in ENS_ALLELE. which() skips rows whose allele
  # length is NA.
  a2_len <- nchar(a2)
  ens_end <- pos
  multi <- which(a2_len > 1L)
  ens_end[multi] <- pos[multi] + a2_len[multi] - 1L

  # The alternate ID is used to merge SNPs for multi-allelic sites.
  ens_id <- if (use_alt) df[["ALT_ID"]] else df[["RS_ID"]]

  ens_cols <- list(
    ENS_CHR = rep(as.factor(chr), nrow(df)),
    ENS_START = pos,
    ENS_END = ens_end,
    ENS_ALLELE = paste0(a2, "/", a1),
    ENS_STRAND = rep(as.factor(strand), nrow(df)),
    ENS_ID = ens_id,
    ENS_CHR_POS_A1_A2 = paste0(chr, ":", pos, "_", a1, "_", a2)
  )
  # One by-reference assignment instead of repeated `df$col <- ...`, each of
  # which copies the whole table.
  data.table::set(df, j = names(ens_cols), value = ens_cols)

  write(paste(nrow(df), "Formatting complete..\n"), stdout())

  keep <- grep("ENS", colnames(df), fixed = TRUE, value = TRUE)
  df[, keep, with = FALSE]
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.