R/format_ensembl_vep.R

Defines functions format_ensembl_vep

Documented in format_ensembl_vep

#' @title format ensembl vep
#' @description Converts UKBB MFI data into a format that is readable by
#' VEP 94. When reading from \code{path}, the file is assumed to have no header
#' and its columns are named, in order, 'ALT_ID', 'RS_ID', 'POS', 'A1', 'A2',
#' 'MAF', 'MA', 'INFO'. When \code{df} is supplied instead, it must already
#' carry the columns 'ALT_ID', 'POS', 'A1' and 'A2' (plus 'RS_ID' when
#' \code{use.alt.id} is not TRUE). The chromosome is taken from the part of the
#' first 'ALT_ID' that precedes the first ':' and is applied to every row.
#' @param path string. Path to file. If NULL, \code{path} is ignored and the
#' argument \code{df} is expected instead.
#' @param df a data.frame. Only used when \code{path} is NULL. The object
#' passed in is not modified; the function works on a copy.
#' @param strand string. Assumes + strand.
#' @param data.table.fill See data.table::fread (fill)
#' @param data.table.verbose See data.table::fread (verbose). Should
#' data.table stats be printed?
#' @param use.alt.id Should alternative (unique ID) be used instead of the
#' RS_ID for the output ID column?
#' @return a data.table holding every column whose name contains 'ENS':
#' ENS_CHR, ENS_START, ENS_END, ENS_ALLELE, ENS_STRAND, ENS_ID and
#' ENS_CHR_POS_A1_A2.

format_ensembl_vep <- function(path = NULL, df = NULL, strand = '+',
                               data.table.fill = TRUE,
                               data.table.verbose = TRUE,
                               use.alt.id = TRUE) {

  # Ensembl format
  # Column 1: chr
  # Column 2: start pos
  # Column 3: end pos, which differs from start pos when the first listed
  #           allele (A2) is longer than one character:
  #           start pos + number of characters in A2 - 1
  # Column 4: A2/A1
  # Column 5: strand (always + for UKBB)
  # Column 6: rsid, or the alternative unique ID

  if (!is.null(path)) {
    df <- data.table::fread(path, fill = data.table.fill,
                            verbose = data.table.verbose, header = FALSE)
    data.table::setnames(
      df, c('ALT_ID', 'RS_ID', 'POS', 'A1', 'A2', 'MAF', 'MA', 'INFO')
    )
  } else {
    if (is.null(df)) {
      stop("Either 'path' or 'df' must be supplied.", call. = FALSE)
    }
    # as.data.table() returns a copy, so the caller's object is neither
    # converted in place nor extended with the helper columns added below.
    df <- data.table::as.data.table(df)
  }

  # Which input column provides the output ID. The alternate ID is used to
  # combine / merge SNPs for multi-allelic sites.
  id_col <- if (use.alt.id == TRUE) 'ALT_ID' else 'RS_ID'

  needed <- unique(c('ALT_ID', 'POS', 'A1', 'A2', id_col))
  absent <- setdiff(needed, colnames(df))
  if (length(absent) > 0L) {
    stop("Input is missing required column(s): ",
         paste(absent, collapse = ', '), call. = FALSE)
  }

  n_rows <- nrow(df)
  write(paste(nrow(df), 'lines were loaded..\n'),stdout())

  # Coerce to character so factor columns (e.g. from older data.frames) work
  # with strsplit() / nchar().
  alt_id <- as.character(df[['ALT_ID']])
  a1 <- as.character(df[['A1']])
  a2 <- as.character(df[['A2']])

  # Assumes that the first ALT_ID also contains the chromosome name.
  # Guard against zero-row input, where there is no first element to split.
  chr <- if (n_rows > 0L) {
    strsplit(alt_id[1L], split = ':', fixed = TRUE)[[1L]][1L]
  } else {
    NA_character_
  }

  start_pos <- as.integer(df[['POS']])

  # Deletions / insertions: extend the end position by the length of A2.
  # which() skips NA lengths, and integer arithmetic (1L) keeps ENS_END an
  # integer like ENS_START; a double column risks positions being written in
  # scientific notation (e.g. 1e+06) when the table is exported.
  a2_len <- nchar(a2)
  multi <- which(a2_len > 1L)
  end_pos <- start_pos
  end_pos[multi] <- start_pos[multi] + a2_len[multi] - 1L

  # paste0() would return a single string for all-empty input, so build the
  # pasted columns only when there are rows.
  if (n_rows > 0L) {
    allele <- paste0(a2, '/', a1)
    chr_pos_a1_a2 <- paste0(chr, ':', start_pos, '_', a1, '_', a2)
  } else {
    allele <- character(0)
    chr_pos_a1_a2 <- character(0)
  }

  ens_values <- list(
    ENS_CHR = rep(as.factor(chr), times = n_rows),
    ENS_START = start_pos,
    ENS_END = end_pos,
    ENS_ALLELE = allele,
    ENS_STRAND = rep(as.factor(strand), times = n_rows),
    ENS_ID = df[[id_col]],
    ENS_CHR_POS_A1_A2 = chr_pos_a1_a2
  )

  # Add the columns by reference; `df$x <- value` on a data.table copies the
  # whole table on every assignment.
  for (col_name in names(ens_values)) {
    data.table::set(df, j = col_name, value = ens_values[[col_name]])
  }

  write(paste(nrow(df), 'Formatting complete..\n'),stdout())

  # Return every column whose name contains 'ENS'
  ens_cols <- grep('ENS', colnames(df), fixed = TRUE, value = TRUE)
  df[, ens_cols, with = FALSE]
}
frhl/our documentation built on Feb. 5, 2021, 7:30 p.m.