our: genetic data analysis

# by FRHL

## merge with data and setup INFP
df <- read.table('~/Desktop/210131_ukbb_mfi_vep_all_HIGH_maf_info.txt')
colnames(df) <- c('CHR','BP1','SNPID','RSID','REF','ALT','MAF','MA','INFO','BP2','MINOR','MAJOR','CONS')
df <- df[df$INFO > 0.9,]
df <- df[!is.na(df$BP1), ]

# extract consequence
cons <- as.data.frame(do.call(rbind, strsplit(df$CONS, '(CSQ\\=)|(\\|)')))
cons[[1]] <- NULL
colnames(cons) <- c('ENSGID','ENST','TRANS','TYPE','IMPACT','GENE','HGNC','AFFECT','YES', "CDS","AA_POS",'EXON','INTRON')
cons$TYPE_SIMPLE <- unlist(lapply(strsplit(cons$TYPE, split = '\\&'), function(x) x[1]))
df = cbind(df, cons)

# what do we need (cols)
df <-df[,c('ENSGID','ENST','TYPE_SIMPLE','AFFECT', 'CHR','BP1','SNPID', 'MAF','CDS')]
colnames(df) <- c("ensembl_gene_id",  "ensembl_transcript_id", "consequence",'how', "variant_chr", "variant_pos",'snpid','maf','cds')
df <- df[df$how == 'protein_coding' & df$consequence %in% c("stop_gained", "frameshift_variant"),]

## 1) position in sequence
cds = gsub('\\?','',df$cds)
cds = gsub(".*\\-",'',cds)
cds = as.data.frame(do.call(rbind, strsplit(cds, split = '\\/')))
cds = as.data.frame(sapply(cds, as.numeric))
cds = data.frame(value = (cds$V1 / cds$V2) * 100)

## ) get MAF bins
cds_mrg = cbind(df[,c('maf')], cds)
colnames(cds_mrg) <- c('maf','cds')
cds_mrg$cat = ifelse(cds_mrg$maf < 0.001, 'MAF < 0.001', 'MAF > 0.001 & MAF < 0.02')
cds_mrg$cat = ifelse(cds_mrg$maf > 0.02, 'MAF > 0.02', cds_mrg$cat)

## 2) some stats
cds_stats_genes = length(unique(df$ensembl_gene_id))
cds_stats_variants = length(unique(df$snpid))

## 3) merge cds
cds_mrg$group <- as.factor(cut(x=cds_mrg$cds, breaks=seq(from=0, to=100, by = 0.05)))

# plot data
p1 <- ggplot(cds_mrg, aes(x = cds, fill = cat)) +
  geom_histogram(bins = 25, color = 'black', alpha = 0.7) +
  xlab('Position along gene coding sequence %') +
  ylab('Frameshift and stop-gain count') +
  ggtitle(paste(cds_stats_variants,'LoF variants in', cds_stats_genes, 'genes.'),
          paste('(INFO > 0.9)')) +
  labs(fill = "Type") +
  theme_minimal()

ggsave(filename = '~/Desktop/210128_cds_position.png', width = 10, height = 6)








#library(biomaRt)
#listMarts()
#
# get exons
#ensembl = useEnsembl(biomart="ensembl", dataset="hsapiens_gene_ensembl", GRCh = 37)
#attr = attributes.1 = c("chromosome_name", "exon_chrom_start", "exon_chrom_end", "transcript_length","strand", "ensembl_gene_id",  "ensembl_transcript_id","ensembl_exon_id")
#all_genes <- getBM(attributes=attr, mart = ensembl)
#all_genes <- all_genes[all_genes$chromosome_name %in% c(1:22,'X','XY','Y'),]
#all_genes <- all_genes[!duplicated(all_genes),]
#all_genes <- all_genes[order(all_genes$ensembl_gene_id),]



# let's start with (+) strand
#genes = unique(mrg$ensembl_gene_id)
#result = lapply(genes, function(g){
#
#  # get the unique gene
#  res_gene = mrg[mrg$ensembl_gene_id == g,]
#  variant_id = unique(res_gene$snpid)
#
#  res2 = lapply(variant_id, function(v){
#
#    # go over each variant
#    res_variant = res_gene[res_gene$snpid == v,]
#    res_variant = res_variant[order(res_variant$exon_chrom_start),]
#
#    # get bounds
#    res_variant$exonic_lens = res_variant$exon_chrom_end-res_variant$exon_chrom_start
#    lower_bound = min(res_variant$exon_chrom_start)
#    upper_bound = max(res_variant$exon_chrom_end)
#    total_len = upper_bound-lower_bound
#    exon_len = sum(abs(res_variant$exon_chrom_end-res_variant$exon_chrom_start))
#    intron_len = (max(res_variant$exon_chrom_end) - min(res_variant$exon_chrom_start))-exon_len
#    var_pos = unique(res_variant$variant_pos)
#    n_exons = nrow(res_variant)
#
#    # ensure it adds up
#    stopifnot(total_len - exon_len == intron_len)
#    stopifnot(lower_bound < var_pos)
#    stopifnot(upper_bound > var_pos)
#
#    # proecss specific exon afflicted
#    bool = var_pos <= res_variant$exon_chrom_end & var_pos >= res_variant$exon_chrom_start
#    stopifnot(any(bool))
#    stopifnot(sum(bool) == 1)
#    exon = res_variant[bool, ]
#    pos_in_exon = exon$exon_chrom_end-exon$variant_pos
#    cds_pos = pos_in_exon
#
#    # position in gene
#    gene_pos_pct = (exon$variant_pos - lower_bound) / (upper_bound - lower_bound)
#
#    # position in exon (coding region)
#    if (bool[1] != TRUE){
#      ranges = 1:(which(bool==TRUE)-1)
#      cds_pos = cds_pos + sum(res_variant$exonic_lens[ranges])
#    }
#    exon_pos_pct = cds_pos / exon_len
#
#    d = data.frame(gene_pos_pct = gene_pos_pct, exon_pos_pct = exon_pos_pct, af = exon$maf)
#    return(d)
#  })
#
#  res2 = as.data.frame(do.call(rbind, res2))
#
#  return(res2)
#})
#
#data = as.data.frame(do.call(rbind, result))
#data = data[order(data$gene_pos_pct),]
#
#library(reshape2)
#library(ggplot2)
#mydf = melt(data[,c(1)])
#mydf = data.frame(value=dat$V1/dat$V2)
#mrg = merge(df, all_genes, by = c("ensembl_gene_id",  "ensembl_transcript_id"))
#mrg = mrg[mrg$how == 'protein_coding' & mrg$consequence %in% c("stop_gained", "frameshift_variant"),]
#mrg = mrg[mrg$strand == 1, ]