R/get_variants_in_genes.R

Defines functions get_variants_in_genes

Documented in get_variants_in_genes

#' @title get_variants_in_genes
#' @description Returns all variants that are within a gene (defined by start/stop
#' in the reference dataset).
#' @param df data.frame with three columns: chr, bp and id.
#' @param reference a reference dataset that contains 6 columns:
#' ensembl_gene_id, ensembl_transcript_id, hgnc_symbol,
#' chromosome_name, start_position, end_position. This could, for
#' instance be retrieved by biomart.
#' @param window integer. How many bases should the window be extended upstrea/downstream of the gene.
#' @note Assumes genes are aligned to positive strand.
#' @author flassen
#' @export

get_variants_in_genes = function(df, reference, window = 0){

  if (!is.numeric(window)) stop('arg window must be integer')
  if (window < 0) warning('arg window is less than zero! Truncating genes at start/end.')

  #require(data.table)
  # df: two columns, chromosome, bpstart, variant id
  # reference: data. frame with columns:
  colnames(df) = c('chr','bp','id')
  colnames(reference) = c("ensembl_gene_id", "ensembl_transcript_id", "hgnc_symbol", "chromosome_name", "start_position", "end_position")
  reference = setDT(reference)
  n = nrow(df)

  result = lapply(1:n, function(i){

    # find overlap
    overlap_reference = reference[reference$chromosome_name == df$chr[i] &
                                    (reference$start_position - window) <= df$bp[i] &
                                    (reference$end_position + window) >= df$bp[i], ]

    # collapse into one single line
    transcript_ids = paste(overlap_reference$ensembl_transcript_id, collapse = ';')
    overlap_reference$ensembl_transcript_id = NULL
    overlap_reference = overlap_reference[!duplicated(overlap_reference),]
    overlap_reference$ensembl_transcript_id = transcript_ids

    # append with variant ID
    overlap_reference$variant_position = df$bp[i]
    overlap_reference$variant_id = df$id[i]
    return(overlap_reference)
  })

  merged = do.call(rbind, result)
  return(merged)

}
frhl/our documentation built on Feb. 5, 2021, 7:30 p.m.