# by FRHL
## merge with data and setup INFP
df <- read.table('~/Desktop/210131_ukbb_mfi_vep_all_HIGH_maf_info.txt')
colnames(df) <- c('CHR','BP1','SNPID','RSID','REF','ALT','MAF','MA','INFO','BP2','MINOR','MAJOR','CONS')
df <- df[df$INFO > 0.9,]
df <- df[!is.na(df$BP1), ]
# extract consequence
cons <- as.data.frame(do.call(rbind, strsplit(df$CONS, '(CSQ\\=)|(\\|)')))
cons[[1]] <- NULL
colnames(cons) <- c('ENSGID','ENST','TRANS','TYPE','IMPACT','GENE','HGNC','AFFECT','YES', "CDS","AA_POS",'EXON','INTRON')
cons$TYPE_SIMPLE <- unlist(lapply(strsplit(cons$TYPE, split = '\\&'), function(x) x[1]))
df = cbind(df, cons)
# what do we need (cols)
df <-df[,c('ENSGID','ENST','TYPE_SIMPLE','AFFECT', 'CHR','BP1','SNPID', 'MAF','CDS')]
colnames(df) <- c("ensembl_gene_id", "ensembl_transcript_id", "consequence",'how', "variant_chr", "variant_pos",'snpid','maf','cds')
df <- df[df$how == 'protein_coding' & df$consequence %in% c("stop_gained", "frameshift_variant"),]
## 1) position in sequence
cds = gsub('\\?','',df$cds)
cds = gsub(".*\\-",'',cds)
cds = as.data.frame(do.call(rbind, strsplit(cds, split = '\\/')))
cds = as.data.frame(sapply(cds, as.numeric))
cds = data.frame(value = (cds$V1 / cds$V2) * 100)
## ) get MAF bins
cds_mrg = cbind(df[,c('maf')], cds)
colnames(cds_mrg) <- c('maf','cds')
cds_mrg$cat = ifelse(cds_mrg$maf < 0.001, 'MAF < 0.001', 'MAF > 0.001 & MAF < 0.02')
cds_mrg$cat = ifelse(cds_mrg$maf > 0.02, 'MAF > 0.02', cds_mrg$cat)
## 2) some stats
cds_stats_genes = length(unique(df$ensembl_gene_id))
cds_stats_variants = length(unique(df$snpid))
## 3) merge cds
cds_mrg$group <- as.factor(cut(x=cds_mrg$cds, breaks=seq(from=0, to=100, by = 0.05)))
# plot data
p1 <- ggplot(cds_mrg, aes(x = cds, fill = cat)) +
geom_histogram(bins = 25, color = 'black', alpha = 0.7) +
xlab('Position along gene coding sequence %') +
ylab('Frameshift and stop-gain count') +
ggtitle(paste(cds_stats_variants,'LoF variants in', cds_stats_genes, 'genes.'),
paste('(INFO > 0.9)')) +
labs(fill = "Type") +
theme_minimal()
ggsave(filename = '~/Desktop/210128_cds_position.png', width = 10, height = 6)
#library(biomaRt)
#listMarts()
#
# get exons
#ensembl = useEnsembl(biomart="ensembl", dataset="hsapiens_gene_ensembl", GRCh = 37)
#attr = attributes.1 = c("chromosome_name", "exon_chrom_start", "exon_chrom_end", "transcript_length","strand", "ensembl_gene_id", "ensembl_transcript_id","ensembl_exon_id")
#all_genes <- getBM(attributes=attr, mart = ensembl)
#all_genes <- all_genes[all_genes$chromosome_name %in% c(1:22,'X','XY','Y'),]
#all_genes <- all_genes[!duplicated(all_genes),]
#all_genes <- all_genes[order(all_genes$ensembl_gene_id),]
# let's start with (+) strand
#genes = unique(mrg$ensembl_gene_id)
#result = lapply(genes, function(g){
#
# # get the unique gene
# res_gene = mrg[mrg$ensembl_gene_id == g,]
# variant_id = unique(res_gene$snpid)
#
# res2 = lapply(variant_id, function(v){
#
# # go over each variant
# res_variant = res_gene[res_gene$snpid == v,]
# res_variant = res_variant[order(res_variant$exon_chrom_start),]
#
# # get bounds
# res_variant$exonic_lens = res_variant$exon_chrom_end-res_variant$exon_chrom_start
# lower_bound = min(res_variant$exon_chrom_start)
# upper_bound = max(res_variant$exon_chrom_end)
# total_len = upper_bound-lower_bound
# exon_len = sum(abs(res_variant$exon_chrom_end-res_variant$exon_chrom_start))
# intron_len = (max(res_variant$exon_chrom_end) - min(res_variant$exon_chrom_start))-exon_len
# var_pos = unique(res_variant$variant_pos)
# n_exons = nrow(res_variant)
#
# # ensure it adds up
# stopifnot(total_len - exon_len == intron_len)
# stopifnot(lower_bound < var_pos)
# stopifnot(upper_bound > var_pos)
#
# # proecss specific exon afflicted
# bool = var_pos <= res_variant$exon_chrom_end & var_pos >= res_variant$exon_chrom_start
# stopifnot(any(bool))
# stopifnot(sum(bool) == 1)
# exon = res_variant[bool, ]
# pos_in_exon = exon$exon_chrom_end-exon$variant_pos
# cds_pos = pos_in_exon
#
# # position in gene
# gene_pos_pct = (exon$variant_pos - lower_bound) / (upper_bound - lower_bound)
#
# # position in exon (coding region)
# if (bool[1] != TRUE){
# ranges = 1:(which(bool==TRUE)-1)
# cds_pos = cds_pos + sum(res_variant$exonic_lens[ranges])
# }
# exon_pos_pct = cds_pos / exon_len
#
# d = data.frame(gene_pos_pct = gene_pos_pct, exon_pos_pct = exon_pos_pct, af = exon$maf)
# return(d)
# })
#
# res2 = as.data.frame(do.call(rbind, res2))
#
# return(res2)
#})
#
#data = as.data.frame(do.call(rbind, result))
#data = data[order(data$gene_pos_pct),]
#
#library(reshape2)
#library(ggplot2)
#mydf = melt(data[,c(1)])
#mydf = data.frame(value=dat$V1/dat$V2)
#mrg = merge(df, all_genes, by = c("ensembl_gene_id", "ensembl_transcript_id"))
#mrg = mrg[mrg$how == 'protein_coding' & mrg$consequence %in% c("stop_gained", "frameshift_variant"),]
#mrg = mrg[mrg$strand == 1, ]
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.