R/getClinvarData.R

#' getClinvarData
#'
#' Function to load from HGMD MySQL database. Assumes data schema as published by HGMD. Requires RMySQL package
#' @param clinvarFile Gene name to load
#' @param transcript VEP transcript object or list. Needs to contain 'id', 'seq_region_name','start' and 'end' elements
#' @param password Return filtered-out sites (i.e. sites marked with anything other than PASS on vcf). Defaults to FALSE
#' @param host Max number of records to load. Set to -1 for no limit.
#' @param port Reference version. Defaults to "GRCh37".
#' @param dbname Reference version. Defaults to "GRCh37".
#' @keywords cats
#' @export
#' @examples
#' hgmddata<- getClinvarData('clinvar.vcf',transcript)

getClinvarData <- function(clinvarFile, transcript, useTableData) {
  if (useTableData) {
    clinvarTableData <- read.table(clinvarFile,
                                   header = T,sep = "\t",comment.char = "")

    clinvarGeneData <- subset(clinvarTableData,Assembly=="GRCh37" & GeneSymbol == geneName)
    clinvarGeneData <- clinvarGeneData[grep("Benign",clinvarGeneData$ClinicalSignificance,invert = T,ignore.case = T),]
    #clinvarGeneData <- clinvarGeneData[grep(clinvarPrefix,clinvarGeneData$Name, fixed=T),]
    clinvarMutations <- sub("^NM.*:(c.*)","\\1",clinvarGeneData$HGVS.c..,perl = T)
    #v2 <- sub("\\s.*" ,"", v1, perl=T)
    clinvarMutations
  } else {
    packedData<-loadVcfFileForGene(clinvarFile, transcript,n = -1,includeFilteredSites = T)
    unpackData(packedData)
  }

}

unpackData <- function(packedData) {
  init=F
  # unpack multiallelic data and form nicely shaped data frame
  for (i in 1:nrow(packedData) ) {
    # expand multiallelics
    unpackedRow<-unpackRow(packedData[i,])
    if (!init) {
      unpackedData=unpackedRow
      init=T
    } else unpackedData=rbind(unpackedData,unpackedRow)

  }

  unpackedData
}

unpackRow <- function(vcfRow) {
  #print(vcfRow)
  # unpack multiallelics into one row per alt allele
  fieldsToUnpack <- names(vcfRow)[sapply(sapply(vcfRow,unlist),length)>1]

  df <- as.data.frame(vcfRow)
  # build DF with all fields except the ones we need to unpack and convert from list to scalar
  altIdx=which(names(vcfRow) %in% fieldsToUnpack)
  newRows <- if (length(altIdx)>0) df[,-altIdx] else df

  altAlleles<-as.character(vcfRow$ALT[[1]])

  allAlleles <- c(newRows$REF,altAlleles)
  commonBases <- sapply(altAlleles,function(x,ref) commonSubstring(c(ref,x)), newRows$REF)

  for (i in 1:length(altAlleles)) {
    newDF<-newRows
    # change alt,
    newDF$ALT=altAlleles[[i]]
    # for all columns which are lists, pick element i
    for (fi in fieldsToUnpack) {
      valList=df[,fi]
      #print(valList)
      newDF[,fi]=valList[[1]][i]
    }
    #newDF <- newDF[0,]

    result<- if (i==1) newDF else rbind(result,newDF)
  }

  result
}
gdelangel/genomicRTools documentation built on May 16, 2019, 11:14 p.m.