R/Sarah-Seq.R

Defines functions read_23andMe plot_karyotype zoom_plot print_DNA vcf_to_23 rainfall_karyoplot

# function for reading in 23andMe files
read_23andMe <- function(filepath){
  data <- read.table(filepath, col.names = c('id', 'chr', 'pos', 'gt'))
  print(paste('Processed file with ', nrow(data), ' variants!'))
  return(data)
}

# karyoplot function
plot_karyotype <- function(data, chrs = "auto", main = NULL){
  if(chrs != "auto"){
    chrs <- paste0('chr', chrs)}
  sub <- subset(data, data$chr %in% c(seq(1,22),'X', 'Y'))
  gr <- GRanges(seqnames = paste0('chr', sub$chr), ranges = IRanges(start = sub$pos, width = 1, names = sub$id))
  c_tab <- c(rep('#68DAFC', 9), '#014571', '#68DAFC', '#030303')
  names(c_tab) <- names(getCytobandColors(color.schema = 'only.centromeres'))
  pp <- getDefaultPlotParams(plot.type = 1)
  pp$topmargin <- 280
  kp <- plotKaryotype("hg19", chromosomes = chrs, ideogram.plotter = NULL, main = main, plot.params = pp)
  kpAddCytobandsAsLine(kp,lwd = 4, color.table = c_tab)
  kpPlotDensity(kp, data=gr, col = '#FAA3A3')
}

# zoom plot function
zoom_plot <- function(data, chr_num, range){
  plot_data <- subset(data, data$chr == as.character(chr_num) & data$pos >= range[1] & data$pos <= range[2])
  plot_data <- separate(plot_data, gt, into = c('allele1', 'allele2'), 1)
  plot_data <- pivot_longer(plot_data, cols = c(allele1, allele2), names_to = 'allele')
  allele_cols <- c('#595454', '#eb0505', '#0f46db', '#09bd21', '#fafa2a')
  names(allele_cols) <- c('-', 'A', 'C', 'G', 'T')
  plot_data$value <- factor(plot_data$value, levels = c('A', 'C', 'G', 'T', '-'))
  if(nrow(plot_data) < 2000){
    tryCatch(
      ggplotly(ggplot(plot_data, aes(x = pos, fill = value, text = id)) +
                   geom_segment(aes(x=pos, xend=pos, y=0.53, yend=1), size = 0.1, show.legend = F) +
                   geom_segment(aes(x=pos, xend=pos, y=0, yend=0.47), size = 0.1, show.legend = F) +
                   geom_point(data = subset(plot_data, plot_data$allele == 'allele1'), size=5,  alpha=0.8, shape=21, stroke=0.1, aes(y = 1, fill = value))+
                   geom_point(data = subset(plot_data, plot_data$allele == 'allele2'), size=5,  alpha=0.8, shape=21, stroke=0.1, aes(y = 0, fill = value))+
                   #geom_hline(yintercept = 0.53, col = 'black', size = 8, lty =1) +
                   geom_hline(yintercept = 0.53, col = "#C8C8C8", size = 2.4) +
                   #geom_hline(yintercept = 0.47, col = 'black', size = 0.5, lty =1) +
                   geom_hline(yintercept = 0.47, col = "#C8C8C8", size = 2.4) +
                   scale_fill_manual(values = allele_cols[levels(plot_data$value)])+
                   scale_y_continuous(limits = c(-0.25,1.25))+
                   scale_x_continuous(labels = comma) +
                   labs(title = paste(' Chromosome', chr_num, ':\n', prettyNum(range[1],big.mark=",", scientific = F), '-', prettyNum(range[2], big.mark=",", scientific = F)), x = NULL, y = NULL, fill = NULL) +
                   theme_classic() +
                   theme(axis.text.x = element_text(angle = 90),
                         axis.text.y = element_blank(), 
                         axis.ticks.y = element_blank(),
                         axis.line.y = element_blank(),
                         legend.position = 'none'), tooltip = 'text'), 
      error = function(e) {print("This region is either not valid or doesn't contain any variants!")}
    )
  } else {
    print('Choose a smaller genomic region')
  }}
# print my DNA function
print_DNA <- function(data){
  # create list to store plots
  plots <- list()
  
  # loop through data one chr at a time
  chrs <- c(seq(1:22), 'X', 'Y', 'MT')
  data <- subset(data, data$chr %in% chrs)
  gt_cols <- paletteer_d('Polychrome::glasbey', length(levels(data$gt))) # create colour palette
  gt_cols <- sample(gt_cols) # shuffle colours in palette
  names(gt_cols) <- levels(data$gt) # name colours
  
  for(i in chrs){
    plot_data <- subset(data, data$chr == i) # subset to chr
    plot_data <- plot_data %>% group_by(gt) %>% tally() # get counts for each genotype
    plot_data$percent <- plot_data$n/sum(plot_data$n) # calculate percentages for each gt
    plot_data$scale_n <- log2(plot_data$n) # log scale counts to decrease range
    plot_data$scale_per <- plot_data$percent*360 # scale percentage so that it should cover full 360 degrees
    plot_data <- plot_data[order(-plot_data$n),] # order by size
    plot_data$gt <- factor(plot_data$gt, levels = plot_data$gt) # set factor levels so biggest gets plotted first
    plot <- eval(substitute(ggplot(plot_data) + # initialize plot
                              geom_ellipse(aes(x0 = 0, y0 = 0, a = scale_n, b = scale_n*2.5, angle = scale_per, fill = gt, col = gt), alpha = 0.7, show.legend = F) + # draw ellipses
                              geom_text( x = -Inf, y = -Inf, hjust = 0, vjust = 0, label = i, col = 'gray95', size = 3) + # annotate with chr number
                              scale_fill_manual(values=gt_cols[levels(plot_data$gt)]) + # set fill colours
                              scale_color_manual(values = gt_cols[levels(plot_data$gt)]) + # set outline colours
                              theme_classic() +
                              theme(axis.title = element_blank(), # get rid of axes
                                    axis.text = element_blank(),
                                    axis.line = element_blank(),
                                    axis.ticks = element_blank(),
                                    panel.background = element_rect(fill = "grey8", colour = NA), # set black background
                                    plot.background = element_rect(fill = "grey8", colour = NA),
                                    plot.margin=margin(l=-1, r = -1, unit="cm")) + # reduce width of plots
                              coord_fixed(ratio = 1),list(i = i)))
    plots[[i]] <- plot # save plot to list
  }
  
  # arrange all plots in a grid with black background
  g <- plot_grid(plotlist = plots, align = 'hv')
  ggdraw(g) + 
    theme(plot.background = element_rect(fill="grey8", color = NA))
}

# function to read in vcf and convert to 23andMe format
vcf_to_23 <- function(input_filepath, return_df = F, zip = T){
  vcf <- read.vcfR(input_filepath, verbose = F) # read in vcf
  id <- vcf@fix[,'ID'] # get rsIDs
  chr <- gsub('chr', '', vcf@fix[,'CHROM']) # get chr column
  pos <- vcf@fix[,'POS'] # get pos column
  alleles <- as.data.frame(extract.gt(vcf, return.alleles = T, IDtoRowNames = F)) %>% separate(1, c('A1', 'A2'), '[/|]', fill = 'right')  %>% replace_na(list(A2='-')) # get genotypes for all variants and separate into A1 and A2 columns
  alleles$gt <- ifelse(alleles$A1 == alleles$A2, 1, 0) # determine whether each allele is homo/heterozygous
  multialleles <- intersect(which(grepl(',',vcf@fix[,'ALT'], fixed = T)), which(nchar(gsub(',', '', vcf@fix[,'ALT'])) == 2)) # find multiallelic sites (should contain ',')
  ins <- setdiff(which(nchar(vcf@fix[,'ALT']) > 1 ), multialleles) # find insertions
  dels <- which(nchar(vcf@fix[,'REF']) > 1) # find deletions
  hom_ins <- intersect(which(alleles$gt == 1), ins) # determine which insertions are homozygous
  hom_dels <- intersect(which(alleles$gt == 1), dels) # determine which deletions are homozygous
  alleles[ins, 'A2'] <- '+' # replace all indels with +/_ 
  alleles[hom_ins, 'A1'] <- '+'
  alleles[dels, 'A2'] <- '-'
  alleles[hom_dels, 'A1'] <- '-'
  het_dels <- which(nchar(alleles$A1) > 1)
  alleles[het_dels, 'A1'] <- substr(alleles[het_dels, 'A1'],1,1 )
  final_gts <- rep('NA', nrow(alleles)) # get list of gts in 23andMe format
  final_gts[which(alleles[,'gt'] == 0)]<- paste0(alleles[which(alleles[,'gt'] == 0), 'A1'], alleles[which(alleles[,'gt'] == 0), 'A2'])
  final_gts[which(alleles[,'gt'] == 1)] <- paste0(alleles[which(alleles[,'gt'] == 1), 'A2'], alleles[which(alleles[,'gt'] == 1), 'A2'])
  data <- data.frame( id, chr, pos, final_gts) # construct 23andMe df
  colnames(data) <- c('# rsid', 'chromosome', 'position', 'genotype')
  if(return_df == F){
    output_txt <- gsub("\\.vcf|\\.vcf.gz",'_23andMe.txt', input_filepath) # create name for output files
    output_zip <- gsub("\\.vcf|\\.vcf.gz",'_23andMe.zip', input_filepath)
    write.table(data, output_txt, sep = '\t', quote = F, row.names = F) # write output file
    if(zip == T){ 
      system(paste('zip ', output_zip, output_txt)) # zip output file
      output_filepath <- output_zip } else{
        output_filepath <- output_txt
      }
    print(paste('Created file', as.character(output_filepath), 'with', nrow(data), 'variants!'))
  } else {
    return(data)
  }
}

# rainfall karyoplot
rainfall_karyoplot <- function(data, chrs = "auto", range = NULL){
  if(chrs != "auto"){
    chrs <- paste0('chr', chrs)}
  if(!is.null(range)){
    range <- GRanges(seqnames = rep(chrs, 2), ranges = IRanges(start = range[1], width = range[2]- range[1]))
  }
  sub <- subset(data, data$chr %in% c(seq(1,22),'X', 'Y'))
  gr <- GRanges(seqnames = paste0('chr', sub$chr), ranges = IRanges(start = sub$pos, width = 1, names = sub$id))
  kp <- plotKaryotype("hg19", chromosomes = chrs, plot.type=4, labels.plotter = NULL, zoom = range)
  kpPlotRainfall(kp, data=gr, col = 1:5)
  kpAddChromosomeNames(kp, srt=45)
}
Sarah145/Sarah-Seq documentation built on Jan. 8, 2020, 11:02 p.m.