if (F){
# last edited
setwd('~/Projects/04_HUMAN_KO/')
require(ggplot2)
# get high impact table
df <- read.table('~/Desktop/210131_ukbb_mfi_vep_all_HIGH_maf_info.txt')
colnames(df) <- c('CHR','BP1','SNPID','RSID','REF','ALT','MAF','MA','INFO','BP2','MINOR','MAJOR','CONS')
df <- df[df$ALT == df$MA,]
df <- df[df$INFO > 0.9,] # setup INFO
df <- df[!is.na(df$BP1), ] # remove some NA's
## overview of all MAF < 0.005
df_maf_reduced = df[df$MAF < 0.005,]
pct = paste0(round((nrow(df_maf_reduced)/nrow(df))*100,1),'%')
title = paste0('Variants with HIGH impact - All samples')
subtitle = paste0(nrow(df_maf_reduced),'/',nrow(df),' (~',pct,') variants with MAF < 0.005')
p1 <- ggplot(df_maf_reduced, aes(x=MAF)) +
geom_histogram(color='black', fill = 'lightblue', bins = 50) +
ggtitle(title, subtitle) +
theme_minimal()
## overview of maf in our thresholds
df_maf_our = df[df$MAF < 0.02 & df$MAF > 0.001,]
pct = paste0(round((nrow(df_maf_our)/nrow(df))*100,1),'%')
title = paste0('Variants with HIGH impact - All samples')
subtitle = paste0(nrow(df_maf_our),'/',nrow(df),' (~',pct,') variants with MAF < 0.02 & MAF > 0.001')
p2 <- ggplot(df_maf_our, aes(x=MAF)) +
geom_histogram(color='black', fill = 'lightblue', bins = 50) +
ggtitle(title, subtitle) +
theme_minimal()
# save data
ggsave('~/Desktop/ukb_all_HIGH_INFO_maf_lt_0_005.png', p1, width = 8, height = 8)
ggsave('~/Desktop/ukb_all_HIGH_INFO_maf_our_treshold.png', p2, width = 8, height = 8)
## tables
# *) simply variants, so that only the one with the highest impact is retained
cons <- as.data.frame(do.call(rbind, strsplit(df$CONS, '\\|')))
colnames(cons) <- c('CSQ','ENST','TRANS','TYPE','IMPACT','GENE','HGNC','AFFECT','YES',"CDS","AA_POS",'EXON','INTRON')
cons$TYPE_SIMPLE <- unlist(lapply(strsplit(cons$TYPE, split = '\\&'), function(x) x[1]))
#types = as.data.frame(sort(table(cons$TYPE)), stringsAsFactors = F)
types = as.data.frame(sort(table(cons$TYPE_SIMPLE)), stringsAsFactors = F)
colnames(types) <- c('term', 'total')
# which ones are indels?
combined = cbind(df, cons)
combined$INDEL = as.numeric(apply(cbind(unlist(lapply(combined$REF, nchar)), unlist(lapply(combined$ALT, nchar))), 1, max) > 1)
# stratify according to indel / snv
stratify_type <- function(df){
maf_indel = sum(df$INDEL)
maf_snv = sum(df$INDEL == 0)
return(c(maf_snv, maf_indel))
}
# iterate over data
overview <- lapply(types$term, function(t){
row <- c(
stratify_type(combined[combined$TYPE_SIMPLE %in% t& combined$MAF < 0.001,]),
stratify_type(combined[combined$TYPE_SIMPLE %in% t & combined$MAF < 0.02 & combined$MAF > 0.001,]),
stratify_type(combined[combined$TYPE_SIMPLE %in% t & combined$MAF > 0.02,]),
stratify_type(combined[combined$TYPE_SIMPLE %in% t,]),
length(unique(combined$GENE[combined$TYPE_SIMPLE %in% t]))
)
row <- t(data.frame(row))
colnames(row) <- c("MAF < 0.001 (SNVs)",
"MAF < 0.001 (INDELs)",
"MAF < 0.02 & MAF > 0.001 (SNVs)",
"MAF < 0.02 & MAF > 0.001 (INDELs)",
"MAF > 0.02 (SNVs)",
"MAF > 0.02 (INDELs)",
'Total SNVs',
'Total Indels',
'Genes')
return(row)
})
# write out table
newoverview = cbind(types, do.call(rbind, overview))
print(newoverview)
write('Writing overview to file..',stdout())
write.csv(newoverview, '~/Desktop/210131_lof_by_term_and_maf_INFO_ma_eq_alt.csv', quote = F)
# count how many genes have x number of mutations
res = sort(table(combined$GENE))
print(paste("Genes with 1 LoF Mutation:",sum(res==1)))
print(paste("Genes with 2 LoF Mutations:",sum(res==2)))
print(paste("Genes with 3 or more LoF Mutations:",sum(res>=3)))
# where in the coding sequence
sort(table(combined$AFFECT))
# how many unique/non-unqiue genes
length(unique(combined$SNPID))
length(unique(combined$GENE))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.