#This script requires that the script & relevant data are in the same folder, presumed to be denovolyzeR/data-raw library(devtools) library(knitr) library(reshape) library(biomaRt) library(dplyr) opts_chunk$set(collapse=TRUE)
cat("TURN OFF CACHE BEFORE SAVING DEFINITIVE DATA\n") opts_chunk$set(cache=T) # disable before saving data
probTableCurrent_raw <- "hgnc_annotated_depthadj_pmuts_fs_canonical_transcripts_div_Feb07.txt" probTable2014_raw <- "fixed_mut_prob_fs_adjdepdiv.txt" autismDeNovos_raw <- "annotated_asd_trios1078.txt" fmrpGenes_raw <- "fmrp_gencode.txt"
-default prob table = r probTableCurrent_raw
-extra (original) prob table = r probTable2014_raw
-example variants = r autismDeNovos_raw
-example gene list = r fmrpGenes_raw
Import latest (Gencode-based) probabilities. Update other identifiers using biomaRt (from ENST)
#Gencode probability table pDNM <- read.table(probTableCurrent_raw,header=T) #Use biomaRt to update annotations: grch37 <- biomaRt::useMart(biomart="ENSEMBL_MART_ENSEMBL", host="grch37.ensembl.org", path="/biomart/martservice", dataset="hsapiens_gene_ensembl") #biomart does not retain transcript versions: trim to 15 digit unversioned ENST pDNM$enst <- pDNM$transcript %>% substr(1,15) #bmAttributes <- listAttributes(grch37) #returns all available fields #retrieve matching gene IDs enstToEnsg <- biomaRt::getBM( attributes=c("ensembl_gene_id","ensembl_transcript_id","hgnc_id","hgnc_symbol","external_gene_name"), filters="ensembl_transcript_id", values=pDNM$enst, mart=grch37 ) ## Some QC on the new annotations cat("QC duplicate entries (imply 1:many mappings):\n") apply(enstToEnsg,MARGIN=2,FUN=function(x){ paste("Dups:",sum(duplicated(x)),"NAs:",sum(is.na(x)))}) cat("Explore duplicated ENSG / ENST:\n") duplicated <- enstToEnsg$ensembl_transcript_id[duplicated(enstToEnsg$ensembl_transcript_id)] filter(enstToEnsg,ensembl_transcript_id %in% duplicated) cat("In latest Ensembl (79, GRCh38.p2), this ENST & ENSG maps to VTN. SEBOX maps elsewhere. Discard SEBOX.") cat("Explore duplicated external gene name:\n") duplicated <- enstToEnsg$external_gene_name[duplicated(enstToEnsg$external_gene_name)] filter(enstToEnsg,external_gene_name %in% duplicated) cat("2 genes: VTN & NPIPA7. VTN already sorted. 3 transcripts map to NPIPA7. Retain the transcript with HGNC data.") duplicated <- pDNM$external_gene_name[duplicated(enstToEnsg$hgnc_symbol)] %>% unique # REMOVE DODGY CASES #1 remove SEBOX enstToEnsg <- filter(enstToEnsg,hgnc_symbol!="SEBOX") #2 remove NPIPA7 transcripts enstToEnsg <- filter(enstToEnsg, ensembl_transcript_id!="ENST00000381497" & ensembl_transcript_id!="ENST00000427999") nrow(enstToEnsg) cat("Explore duplicated HGNC:\n") duplicated <- enstToEnsg$hgnc_symbol[duplicated(enstToEnsg$hgnc_symbol)] %>% unique duplicated cat("There are no duplicated hgncID. ",sum(is.na(enstToEnsg$hgnc_id)),"ENST have no matching hgncID.\n") ## Join to prob table, and keep chosen fields pDNM <- left_join(enstToEnsg,pDNM,by=c("ensembl_transcript_id"="enst")) pDNM <- pDNM %>% dplyr::select(ensgID = ensembl_gene_id, enstID = ensembl_transcript_id, hgncID = hgnc_id, hgncSymbol = hgnc_symbol, geneName = external_gene_name, syn = p_syn, mis = p_mis, non = p_non, splice = p_css, frameshift = p_fs) # already antilogged... # pDNM$splice[is.na(pDNM$splice)] <- 0 # not required: there are no NAs... # summarise protein-altering and lof variants as defined below pDNM$lof <- apply(pDNM[,c("non","splice","frameshift")],MARGIN=1,FUN=sum,na.rm=T) pDNM$prot <- apply(pDNM[,c("mis","lof")],MARGIN=1,FUN=sum,na.rm=T) pDNM$all <- apply(pDNM[,c("syn","prot")],MARGIN=1,FUN=sum,na.rm=T) # show head & numbers of vars pDNM %>% head cat("pDNM:\n", "Number rows:",nrow(pDNM),"\n", "Number unique enstID:",length(unique(pDNM$enstID)),"\n", "Number unique hgncID:",length(unique(pDNM$hgncID)),"\n", "Number unique geneSymbol:",length(unique(pDNM$geneName)),"\n" ) write.table(pDNM,"probTable_gencode_reannotatedIDs.txt",quote=F,row.names=F,sep="\t") pDNM <- melt(pDNM,id.vars=c("hgncID","hgncSymbol","enstID","ensgID","geneName"),variable_name="class")
Import original (RefSeq-based) probabilities.
#Previously published de novo probability table probTable_Samocha2014 <- read.table(probTable2014_raw,header=T) # probTable_Samocha2014 <- probTable_Samocha2014 %>% dplyr::rename(splice = css, geneID = gene, refseqID = transcript) %>% dplyr::select(-bp, -covered_bp, -rdt, -all) # antilog raw data probTable_Samocha2014[,3:7] <- 10^probTable_Samocha2014[,3:7] probTable_Samocha2014$splice[is.na(probTable_Samocha2014$splice)] <- 0 # summarise protein-altering and lof variants as defined below probTable_Samocha2014$lof <- apply(probTable_Samocha2014[,c("non","splice","frameshift")],MARGIN=1,FUN=sum,na.rm=T) probTable_Samocha2014$prot <- apply(probTable_Samocha2014[,c("mis","lof")],MARGIN=1,FUN=sum,na.rm=T) probTable_Samocha2014$all <- apply(probTable_Samocha2014[,c("syn","prot")],MARGIN=1,FUN=sum,na.rm=T) probTable_Samocha2014 %>% head cat("probTable_Samocha2014:\n", "Number rows:",nrow(probTable_Samocha2014),"\n", "Number unique genes:",length(unique(probTable_Samocha2014$geneID)),"\n" ) probTable_Samocha2014 <- melt(probTable_Samocha2014,id.vars=c("refseqID","geneID"),variable_name="class")
Current classification of variant consequences: ignore rdt rename css -> splice lof = non + splice + frameshift prot = lof + mis * total = prot + syn
Variants with p_css = NA have no splice boundaries, so are converted to p=0
# Autism (1,078 trios) de novo list from Nature Genetics publication. autismDeNovos <- read.table(autismDeNovos_raw) names(autismDeNovos) <- c("coords","gene","class") cat("autism demo data:\n", "Number variants:",nrow(autismDeNovos),"\n", "Number unique genes:",length(unique(autismDeNovos$gene)),"\n", "Tabulate by variant class:\n" ) table(autismDeNovos$class) %>% data.frame # truncate variant class names & filter unrecognised classes updateClassNames <- data.frame( oldclasses=c(".","3_prime_UTR_variant","frameshift_variant", "inframe_deletion","intron_variant","missense_variant", "non_coding_transcript_exon_variant","splice_acceptor_variant","splice_donor_variant", "splice_region_variant","stop_gained","synonymous_variant"), newclasses=c("filter","filter","frameshift", "filter","filter","mis", "filter","splice","splice", "syn","non","syn") ) cat("Re-mapping variant classes using the following classification:") updateClassNames autismDeNovos <- left_join(autismDeNovos,updateClassNames,by=c("class"="oldclasses")) %>% dplyr::select(coords,gene,class=newclasses) cat("removing unrecognised variant classes:\n") autismDeNovos %>% filter(class=="filter") if(sum(is.na(autismDeNovos$class))>0){warning("unrecognised class names present")} autismDeNovos <- autismDeNovos %>% filter(class %in% c("syn","mis","non","splice","frameshift"))
There are r nrow(autismDeNovos)
coding variants in the dataset.
Looking for genes that are not in probability table:
missing <- autismDeNovos$gene[!toupper(autismDeNovos$gene) %in% toupper(pDNM$hgncSymbol)] missing <- missing[!missing %in% toupper(pDNM$geneName)]
The following genes in autismDeNovos
do not match to geneName or hgncSymbol in the gencode-based probability table:
r missing
These will be removed from autism de novo list
autismDeNovos <- autismDeNovos %>% filter(!gene %in% missing) %>% select(-coords)
number variants = r nrow(autismDeNovos)
number unique genes = r length(unique(autismDeNovos$gene))
# List of FMRP genes fmrpGenes_enst <- read.table(pipe(paste("cut -f1",fmrpGenes_raw)))$V1 close(pipe(paste("cut -f1",fmrpGenes_raw))) fmrpGenes_geneName <- read.table(pipe(paste("cut -f2",fmrpGenes_raw)))$V1 close(pipe(paste("cut -f2",fmrpGenes_raw))) fmrpGenes <- data.frame(enst=fmrpGenes_enst,geneName=fmrpGenes_geneName) # NA for missing ENST fmrpGenes[grep("ENST",fmrpGenes$enst,invert=T),"enst"] <- NA #biomart does not retain transcript versions: trim to 15 digit unversioned ENST fmrpGenes$enst <- fmrpGenes$enst %>% substr(1,15) #Note NEURL should be NEURL1 fmrpGenes$geneName[fmrpGenes$geneName=="NEURL"] <- "NEURL1" #Complete missing enst fmrpGenes <- left_join(fmrpGenes, enstToEnsg, by=c("geneName"="external_gene_name")) #QC: Do all of the old ENST match the new ENST (after joining on geneName) sum(!is.na(fmrpGenes$enst)) == sum(fmrpGenes$enst==fmrpGenes$ensembl_transcript_id,na.rm=T) fmrpGenes <- fmrpGenes %>% select(ensgID = ensembl_gene_id, enstID = ensembl_transcript_id, hgncID = hgnc_id, hgncSymbol = hgnc_symbol, geneName) cat("FMRP genes:\n", "Number genes:",nrow(fmrpGenes),"\n", "Number unique:",length(unique(fmrpGenes$geneName)),"\n", "Number in probTable:",sum(fmrpGenes$geneName %in% pDNM$geneName),"\n" )
setwd("..") # Add data to data folder devtools::use_data(autismDeNovos,fmrpGenes, overwrite=T) # Add data to sysdata.R devtools::use_data(pDNM, internal = TRUE, overwrite=T) save(probTable_Samocha2014, file=file.path("alternativeProbabilityTables","probTable_Samocha2014.rda"))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.