workflows/210126_plot_vep_imact_stats.R

if (F){

  # last edited
  setwd('~/Projects/04_HUMAN_KO/')

  require(ggplot2)

  # get high impact table
  df <- read.table('~/Desktop/210131_ukbb_mfi_vep_all_HIGH_maf_info.txt')
  colnames(df) <- c('CHR','BP1','SNPID','RSID','REF','ALT','MAF','MA','INFO','BP2','MINOR','MAJOR','CONS')
  df <- df[df$ALT == df$MA,]
  df <- df[df$INFO > 0.9,] # setup INFO
  df <- df[!is.na(df$BP1), ] # remove some NA's

  ## overview of all MAF < 0.005
  df_maf_reduced = df[df$MAF < 0.005,]
  pct = paste0(round((nrow(df_maf_reduced)/nrow(df))*100,1),'%')
  title = paste0('Variants with HIGH impact - All samples')
  subtitle = paste0(nrow(df_maf_reduced),'/',nrow(df),' (~',pct,') variants with MAF < 0.005')
  p1 <- ggplot(df_maf_reduced, aes(x=MAF)) +
    geom_histogram(color='black', fill = 'lightblue', bins = 50) +
    ggtitle(title, subtitle) +
    theme_minimal()

  ## overview of maf in our thresholds
  df_maf_our = df[df$MAF < 0.02 & df$MAF > 0.001,]
  pct = paste0(round((nrow(df_maf_our)/nrow(df))*100,1),'%')
  title = paste0('Variants with HIGH impact - All samples')
  subtitle = paste0(nrow(df_maf_our),'/',nrow(df),' (~',pct,') variants with MAF < 0.02 & MAF > 0.001')
  p2 <- ggplot(df_maf_our, aes(x=MAF)) +
    geom_histogram(color='black', fill = 'lightblue', bins = 50) +
    ggtitle(title, subtitle) +
    theme_minimal()

  # save data
  ggsave('~/Desktop/ukb_all_HIGH_INFO_maf_lt_0_005.png', p1, width = 8, height = 8)
  ggsave('~/Desktop/ukb_all_HIGH_INFO_maf_our_treshold.png', p2, width = 8, height = 8)


  ## tables
  # *) simply variants, so that only the one with the highest impact is retained
  cons <- as.data.frame(do.call(rbind, strsplit(df$CONS, '\\|')))
  colnames(cons) <- c('CSQ','ENST','TRANS','TYPE','IMPACT','GENE','HGNC','AFFECT','YES',"CDS","AA_POS",'EXON','INTRON')
  cons$TYPE_SIMPLE <- unlist(lapply(strsplit(cons$TYPE, split = '\\&'), function(x) x[1]))
  #types = as.data.frame(sort(table(cons$TYPE)), stringsAsFactors = F)
  types = as.data.frame(sort(table(cons$TYPE_SIMPLE)), stringsAsFactors = F)
  colnames(types) <- c('term', 'total')

  # which ones are indels?
  combined = cbind(df, cons)
  combined$INDEL = as.numeric(apply(cbind(unlist(lapply(combined$REF, nchar)), unlist(lapply(combined$ALT, nchar))), 1, max) > 1)

  # stratify according to indel / snv
  stratify_type <- function(df){
    maf_indel = sum(df$INDEL)
    maf_snv = sum(df$INDEL == 0)
    return(c(maf_snv, maf_indel))
  }

  # iterate over data
  overview <- lapply(types$term, function(t){

    row <- c(
      stratify_type(combined[combined$TYPE_SIMPLE %in% t& combined$MAF < 0.001,]),
      stratify_type(combined[combined$TYPE_SIMPLE %in% t & combined$MAF < 0.02 & combined$MAF > 0.001,]),
      stratify_type(combined[combined$TYPE_SIMPLE %in% t & combined$MAF > 0.02,]),
      stratify_type(combined[combined$TYPE_SIMPLE %in% t,]),
      length(unique(combined$GENE[combined$TYPE_SIMPLE %in% t]))
    )
    row <- t(data.frame(row))
    colnames(row) <- c("MAF < 0.001 (SNVs)",
                       "MAF < 0.001 (INDELs)",
                       "MAF < 0.02 & MAF > 0.001 (SNVs)",
                       "MAF < 0.02 & MAF > 0.001 (INDELs)",
                       "MAF > 0.02 (SNVs)",
                       "MAF > 0.02 (INDELs)",
                       'Total SNVs',
                       'Total Indels',
                       'Genes')

    return(row)
  })

  # write out table
  newoverview = cbind(types, do.call(rbind, overview))
  print(newoverview)
  write('Writing overview to file..',stdout())
  write.csv(newoverview, '~/Desktop/210131_lof_by_term_and_maf_INFO_ma_eq_alt.csv', quote = F)

  # count how many genes have x number of mutations
  res = sort(table(combined$GENE))
  print(paste("Genes with 1 LoF Mutation:",sum(res==1)))
  print(paste("Genes with 2 LoF Mutations:",sum(res==2)))
  print(paste("Genes with 3 or more LoF Mutations:",sum(res>=3)))

  # where in the coding sequence
  sort(table(combined$AFFECT))

  # how many unique/non-unqiue genes
  length(unique(combined$SNPID))
  length(unique(combined$GENE))



}
frhl/our documentation built on Feb. 5, 2021, 7:30 p.m.