workflows/210126_plot_homozygous_carrier_count.R

# this file was generated by combining and extracting all homozygous carriers.
# it requires first, that 05_find_homo_plof.sh to be run. And then, the
# resulting data to be combined row-wise.

# todo: executeable script


#plot_vpc_homozygous <- function(df, n = 480000, cols = c('ID1', 'ID2', 'CHROM','BP','SNP','MAF')){


  ########################
  ### run on the server ##
  ########################

  x <- lapply(1:22, function(x) read.table(paste0('ukb_mfi_homo_plof_chr',x,'.txt'), skip = 2))
  df <- do.call(rbind, x)
  colnames(df) <- c('ID1', 'ID2', 'CHROM','BP','SNP','MAF')

  n = 479230 # all
  #n = 402133 # wb

  # summarize data
  inds <- as.data.frame(table(df$ID2, df$SNP))
  snp_count <- aggregate(Freq ~ Var2, data = inds, FUN = sum)
  colnames(snp_count) <- c('SNP','observed_snps')
  snp_maf <- df[,c('SNP','MAF')]
  snp_maf <- snp_maf[!duplicated(snp_maf),]
  res <- merge(snp_count, snp_maf)
  res$homo_freq <- res$MAF ** 2
  res$expected_snps <-  res$homo_freq * n
  write.csv(res, '/well/lindgren/flassen/210131_ukbb_homozyogus_carriers_count.csv', quote = F, row.names = F)

  ###############
  # run locally #
  ###############

  # plot data
  res <- read.csv('~/Desktop/210131_ukbb_homozyogus_carriers_count.csv')
  res$AF <- res$MAF
  title = paste0('Observed vs Expected high impact variant count in UKBB (All)')
  subtitle = paste0('Samples: ',479230,' SNPs: ', length(unique(res$SNP)))
  p1 <- ggplot(res, aes(x=observed_snps, y=expected_snps, color = AF)) +
    geom_point() +
    geom_abline(slope=1, intercept=0, col = 'red') +
    annotate("text", x = 184, y = 170, col = 'red', label = 'x=y') +
    xlab('Observed homozygous carriers (n)') +
    ylab('Expected homozygous carries (n)') +
    ggtitle(title, subtitle) +
    theme_bw()

  # plot data WB
  wb <- read.csv('~/Desktop/210131_ukbb_wb_homozyogus_carriers_count.csv')
  wb$AF <- wb$MAF
  title = paste0('Observed vs Expected high impact variant count in UKBB (WB)')
  subtitle = paste0('Samples: ',402133,' SNPs: ', length(unique(wb$SNP)))
  p2 <- ggplot(wb, aes(x=observed_snps, y=expected_snps, color = AF)) +
    geom_point() +
    geom_abline(slope=1, intercept=0, col = 'red') +
    annotate("text", x = 180, y = 170, col = 'red', label = 'x=y') +
    xlab('Observed homozygous carries (n)') +
    ylab('Expected homozygous carries (n)') +
    ggtitle(title, subtitle) +
    theme_bw()

  ggsave('210131_ukb_all_samples_obs_vs_expected_high_imapct_count.png', p1, width = 6, height = 6)
  ggsave('210131_ukb_all_samples_obs_vs_expected_high_imapct_WB_count.png', p2, width = 6, height = 6)


#}
frhl/our documentation built on Feb. 5, 2021, 7:30 p.m.