R/methylation_calling.R

Defines functions plot_split_reports plot_site_counts get_met_call_counts process_bismark_bias_report process_bismark_bias_reports process_bismark_split_reports call_methylation_sites_for_cell find_failed_met_calls call_methylation_sites_for_sample

# library(doSNOW)
# library(readr)
# library(scales)

call_methylation_sites_for_sample <- function(alignment_dir, methylation_calls_dir, log_dir, bme_param_settings)
{
  dir.create(methylation_calls_dir, showWarnings = F, recursive = T)
  setwd(alignment_dir)

  bam_files = list.files(alignment_dir, pattern = "*.organism.bam$")

  if(num_cores > 1)
  {
    # cl <- parallel::makeCluster(num_cores, outfile="", type = 'SOCK')
    # doSNOW::registerDoSNOW(cl)
    foreach::`%dopar%`(foreach::foreach(i=1:length(bam_files), .export= ls(globalenv())), {
        bam_file = bam_files[i]
        cell_id = gsub('.organism.bam', '', bam_file)

        call_methylation_sites_for_cell(alignment_dir = alignment_dir,
                                        methylation_calls_dir = methylation_calls_dir,
                                        cell_id = cell_id,
                                        bme_param_settings = bme_param_settings,
                                        log_dir = log_dir)

      })#foreach::foreach
  }else#if(num_cores > 1)
  {
    for(i in 1:length(bam_files))
    {
      bam_file = bam_files[i]
      cell_id = gsub('.organism.bam', '', bam_file)

      call_methylation_sites_for_cell(alignment_dir = alignment_dir,
                                      methylation_calls_dir = methylation_calls_dir,
                                      cell_id = cell_id,
                                      bme_param_settings = bme_param_settings,
                                      log_dir = log_dir)

    }#foreach::foreach

  }#else

  failed_cells = find_failed_met_calls(methylation_calls_dir)
  for(cell_id in failed_cells)
  {
    call_methylation_sites_for_cell(alignment_dir, methylation_calls_dir, cell_id, bme_param_settings, log_dir)
  }


}

find_failed_met_calls <- function(methylation_calls_dir)
{
  cpg_met_files = list.files(methylation_calls_dir, pattern = "CpG_calls.*cov.gz")
  cpg_met_paths = paste0(methylation_calls_dir, cpg_met_files)
  cell_ids = cpg_met_files
  cell_ids = gsub("CpG_calls.", "", cell_ids)
  cell_ids = gsub(".organism.cov.gz", "", cell_ids)
  names(cpg_met_paths) = cell_ids
  file.sizes = file.info(cpg_met_paths)$size
  names(file.sizes) = cell_ids
  zero.size.files = cell_ids[file.sizes == 0]


  setwd(methylation_calls_dir)
  pattern = paste0('.', 'organism', "_splitting_report.txt")
  report_files = list.files(methylation_calls_dir, pattern = paste0("*", pattern) )
  if(length(report_files) == 0 ) {return(NULL) }
  row_names = c()
  result_list = list()
  inf_line_counts = c()
  for(report_file in report_files)
  {
    print(report_file)
    cell_id = gsub(pattern = pattern, '', report_file)
    lines = readLines(report_file)
    informative_lines = lines[grep('\t', lines)]
    informative_lines = informative_lines[!grepl('strand', informative_lines)]
    inf_line_counts[cell_id] = length(informative_lines)
  }

  zero.inf_lines = names(inf_line_counts[inf_line_counts == 0])

  failed_cells = union(zero.size.files, zero.inf_lines)
  return(failed_cells)
}

call_methylation_sites_for_cell <- function(alignment_dir, methylation_calls_dir, cell_id, bme_param_settings, log_dir)
{
  setwd(alignment_dir)

  lambda_file = paste0(cell_id,'.lambda.bam')
  organism_file = paste0(cell_id,'.organism.bam')

  log_sub_dir = paste0(log_dir, '/call_methylation_sites/')
  dir.create(log_sub_dir, recursive = T, showWarnings = F)
  log_file = paste0(log_sub_dir, cell_id, '.log')


  sys_command = paste0('rm ', methylation_calls_dir, '/*', cell_id, '*')
  command_result_0 = system(sys_command)

  #Extract methylation sites
  sys_command = paste0(bismark_path, '/bismark_methylation_extractor ',bme_param_settings, ' ',
                       lambda_file,' --output  ', methylation_calls_dir, '/'
                       , ' > ', log_file)
  print(sys_command)
  command_result_1 = system(sys_command)

  sys_command = paste0(bismark_path, '/bismark_methylation_extractor ',bme_param_settings, ' ',
                       organism_file,' --output  ', methylation_calls_dir, '/'
                       , ' >> ', log_file)
  print(sys_command)
  command_result_2 = system(sys_command)

  #Convert to bismark cov format (Only for organism, not for lambda control)
  setwd(methylation_calls_dir)

  sink(log_file, append = T)

  sys_command = paste0(bismark_path, '/bismark2bedGraph -o CpG_calls.',cell_id,'.organism ', 'CpG_context_',cell_id,'.organism.txt.gz')
  command_result_3 = system(sys_command)

  file.rename(paste0('CpG_calls.',cell_id,'.organism.gz.bismark.cov.gz'),  paste0('CpG_calls.',cell_id,'.organism.cov.gz'))

  if(file.exists(paste0('Non_CpG_context_',cell_id,'.organism.txt.gz')))
  {
    sys_command = paste0(bismark_path, '/bismark2bedGraph --CX -o CH_calls.',cell_id,'.organism ', 'Non_CpG_context_',cell_id,'.organism.txt.gz')
    command_result_4 = system(sys_command)
    file.rename(paste0('CH_calls.',cell_id,'.organism.gz.bismark.cov.gz'),  paste0('CH_calls.',cell_id,'.organism.cov.gz'))

  }

  if(file.exists(paste0('CHG_context_',cell_id,'.organism.txt.gz')))
  {
    sys_command = paste0(bismark_path, '/bismark2bedGraph --CX -o CHG_calls.',cell_id,'.organism ', 'CHG_context_',cell_id,'.organism.txt.gz')
    command_result_5 = system(sys_command)

    sys_command = paste0(bismark_path, '/bismark2bedGraph --CX -o CHH_calls.',cell_id,'.organism ', 'CHH_context_',cell_id,'.organism.txt.gz')
    command_result_6 = system(sys_command)

    file.rename(paste0('CHG_calls.',cell_id,'.organism.gz.bismark.cov.gz'),  paste0('CHG_calls.',cell_id,'.organism.cov.gz'))
    file.rename(paste0('CHH_calls.',cell_id,'.organism.gz.bismark.cov.gz'),  paste0('CHH_calls.',cell_id,'.organism.cov.gz'))

  }


  #delete_command = paste0('rm *CpG_context_',cell_id, '.organism.txt.gz')
  #command_result_9 = system(delete_command)

  sink()

  command_result = command_result_1 + command_result_2 + command_result_3


  if(command_result == 0)
  {
    cat('Methylation calling is successful for cell ', cell_id, '\n')
  }else
  {
    stop('Methylation calling failed to run for ', cell_id, '.\n Exiting the pipeline. Please see the output log file :', log_file)
  }

}



process_bismark_split_reports <- function(methylation_calls_dir, genome_type = 'organism')
{
  setwd(methylation_calls_dir)
  pattern = paste0('.', genome_type, "_splitting_report.txt")
  report_files = list.files(methylation_calls_dir, pattern = paste0("*", pattern) )
  if(length(report_files) == 0 ) {return(NULL) }
  row_names = c()
  result_list = list()
  for(report_file in report_files)
  {
    print(report_file)
    cell_id = gsub(pattern = pattern, '', report_file)
    lines = readLines(report_file)
    informative_lines = lines[grep('\t', lines)]
    informative_lines = informative_lines[!grepl('strand', informative_lines)]

    df_report = read.delim(text = informative_lines, sep = '\t',  header = F)
    class(df_report)
    row_names = gsub(':', '', df_report$V1)
    result_list[[cell_id]] = as.character(df_report$V2)
  }

  df_result = do.call(cbind, result_list)
  rownames(df_result) = row_names
  t_df_result =  t(df_result)
  final_table = data.frame(Cell_ID = rownames(t_df_result),  t_df_result, stringsAsFactors = F)

  return(final_table)
}



process_bismark_bias_reports <- function(methylation_calls_dir, genome_type = 'organism')
{
  setwd(methylation_calls_dir)
  pattern = paste0('.', genome_type, ".M-bias.txt")
  report_files = list.files(methylation_calls_dir, pattern = paste0("*", pattern) )
  row_names = c()
  result_list = list()
  CpG_met_rates_list = list()
  CH_met_rates_list = list()
  for(report_file in report_files)
  {
    print(report_file)
    cell_id = gsub(pattern = pattern, '', report_file)
    result_list[[cell_id]] = process_bismark_bias_report(cell_id, report_file)
    CpG_met_rates_list[[cell_id]]  = result_list[[cell_id]]$CpG_met_rate
    CH_met_rates_list[[cell_id]]  = result_list[[cell_id]]$CH_met_rate
  }

  CpG_met_rates  = do.call("cbind", CpG_met_rates_list)
  CH_met_rates  = do.call("cbind", CH_met_rates_list)

  return(list(CpG_met_rates = CpG_met_rates, CH_met_rates = CH_met_rates))
}

#report_file = 'Test___Lane1_CAGATC.lambda.M-bias.txt'
#cell_id = 'Test___Lane1_CAGATC'
process_bismark_bias_report <- function(cell_id, report_file)
{

  x <- readLines(report_file)

  start <- grep("C", x)
  mark <- vector('integer', length(x))
  mark[start] <- 1
  # determine limits of each table
  mark <- cumsum(mark)
  # split the data for reading
  df_list <- lapply(split(x, mark), function(.data){
    .input <- read.table(textConnection(.data), skip=2, header=TRUE, sep = '\t')
    attr(.input, 'name') <- .data[1]  # save the name
    .input
  })
  # rename the list
  names(df_list) <- sapply(df_list, attr, 'name')
  df_list
  df_list$`CH context` = df_list$`CHG context` + df_list$`CHH context`
  df_list$`CH context`$X..methylation = 100 * df_list$`CH context`$count.methylated / (df_list$`CH context`$count.methylated + df_list$`CH context`$count.unmethylated)
  df_list$`CH context`$X..methylation = round(df_list$`CH context`$X..methylation, 2)

  result_df = data.frame(position = df_list$`CpG context`$position,
                         CpG_met_rate = df_list$`CpG context`$X..methylation,
                         CH_met_rate = df_list$`CH context`$X..methylation
  )

  result_df$position = NULL
  return(result_df)
}


get_met_call_counts <- function(methylation_calls_dir, met_type = 'CpG')
{

  print('!!! get_met_call_counts')
  print('--------------------------------------------------------------------')

  cov_files= list.files(methylation_calls_dir, pattern = paste0( met_type, '.*', 'cov.gz'))
  met_call_counts = c()
  counter = 0

  print('***************************************')
  print(methylation_calls_dir)
  print('***************************************')

  #cl <-parallel::makeCluster(num_cores, type = 'SOCK')
  #parallel::clusterExport(cl, ls(.GlobalEnv))
  #doSNOW::registerDoSNOW(cl)
  #parallel::clusterExport(cl, ls(.GlobalEnv))


  #met_call_counts = foreach::foreach(i=1:length(cov_files), .export= ls(globalenv()) ) %dopar%
  for(cov_file in cov_files)
  {
    counter = counter + 1
    print(counter)
    print(cov_file)
    cell_id = cov_file
    cell_id = sub('.organism.cov.gz', '', cell_id)
    cell_id = sub(paste0(met_type, '_calls.'), '', cell_id)
    print(cell_id)

    df_cov.dummy = data.frame(chrom = 'chrZz',
                        start = 1,
                        end = 2,
                        met_rate = -.5,
                        met = 1,
                        demet = 1   )

    num_calls = 0

    num_calls = tryCatch({
     nrow(data.table::fread(paste0(methylation_calls_dir,  cov_file), tmpdir = methylation_calls_dir ))

    }, warning = function(w) {
      0
    }, error = function(e) {
      0
    }, finally = {

    }
    )



    met_call_counts[cell_id] = num_calls
  }

  return(met_call_counts)
}


plot_site_counts <- function(call_counts, log_scale = F)
{
  #boxplot(list_met_call_counts, outline = F)

  call_count = list_met_call_counts$CpG

  if(log_scale)
  {
    hist(log10(call_count + 1), col = '#bebada',
         main = 'Sites called',
         xlab = 'Sites (log10)',
         ylab = 'Number of cells',
         breaks = 25,
         cex.lab = 1.50, cex.axis = 1.5)

    abline(v = median(log10(call_count + 1)), col = 'red', lty = 2)

  }else
  {
    hist(call_count/1000000, col = '#bebada',
         main = 'Sites called',
         xlab = 'million sites',
         ylab = 'Number of cells',
         breaks = 25,
         cex.lab = 1.50, cex.axis = 1.5)

    abline(v = median(call_count/1000000), col = 'red', lty = 2)

  }




}


plot_split_reports <- function(sample_name = '', df_org_split_reports, df_lambda_split_reports, list_org_bias_reports,
                               list_met_call_counts, log_scale = F, lambda_flag = F)
{

  numeric_column_index = 2:ncol(df_org_split_reports)

  lambda_flag = !is.null(df_lambda_split_reports)

  for(i in numeric_column_index)
  {
    if(is.character(df_org_split_reports[,i]) )
    {
      df_org_split_reports[,i] = readr::parse_number(df_org_split_reports[,i])
    }
    if(is.character(df_lambda_split_reports[,i]) & lambda_flag )
    {
      df_lambda_split_reports[,i] = readr::parse_number(df_lambda_split_reports[,i])
    }
  }

  #Conversion rate

  if(lambda_flag)
  {
    conv_CpG = 100 - df_lambda_split_reports$C.methylated.in.CpG.context
    pcts_conv = list(conv_CpG)

  }

  met_CpG = df_org_split_reports$C.methylated.in.CpG.context
  x_labels = c('CpG')
  pcts_met = list(met_CpG)

  if(!is.null(df_org_split_reports$C.methylated.in.non.CpG.context))
  {
    met_CH = 100 - df_org_split_reports$C.methylated.in.non.CpG.context
    x_labels = c(x_labels, 'CH')
    pcts_met[[length(pcts_met)+1]] = met_CH

    if(lambda_flag)
    {
      conv_CH = 100 - df_lambda_split_reports$C.methylated.in.non.CpG.context
      other_conv  = list(conv_CH)
      pcts_conv[[length(pcts_conv)+1]] = conv_CH
    }

  }


  if(!is.null(df_org_split_reports$C.methylated.in.CHG.context))
  {


    met_CHG = df_org_split_reports$C.methylated.in.CHG.context
    met_CHH = df_org_split_reports$C.methylated.in.CHH.context

    pcts_met[[length(pcts_met)+1]] = met_CHG
    pcts_met[[length(pcts_met)+1]] = met_CHH

    x_labels = c(x_labels, 'CHG', 'CHH')


    if(lambda_flag)
    {

      conv_CHG = 100 - df_lambda_split_reports$C.methylated.in.CHG.context
      conv_CHH = 100 - df_lambda_split_reports$C.methylated.in.CHH.context

      pcts_conv[[length(pcts_conv)+1]] = conv_CHG
      pcts_conv[[length(pcts_conv)+1]] = conv_CHH

    }


  }

  #######################Potting#############
  par(mfrow = c(2,3))

  font.face = 'Helvetica'
  font.size = 1.5
  color_vec = scales::hue_pal()(3)
  sep.lwd = 1
  #sep.color = "#33a02c"

  sep.color = "#bebada"

  par(mar = c(4,8,6,4))


  #############Conversion rates################

  if(lambda_flag)
  {

    roof = max(unlist(pcts_conv))
    floor = min(unlist(pcts_conv))

    if(floor > 95) {floor = 95}


    #vioplot(
    boxplot(
      pcts_conv, ylim = c(floor, roof),
      col = "#fb8072", names = x_labels, cex.main = font.size, cex.sub = font.size,
      #main = '% Conversion rate',
      cex.lab = font.size, cex.names = font.size, cex.axis = font.size, cex = font.size,
      ylab = '% conversion', outline = F
      , las = 1
    )

    mtext(line=1, cex.main=font.size, family =font.face, font=2, "Conversion rate")

    box("figure", col=sep.color,  lwd = sep.lwd)


  }


  ##################Bias#####################

  org_bias_reports_CpG = list_org_bias_reports$CpG_met_rates
  org_bias_reports_CH = list_org_bias_reports$CH_met_rates

  class(org_bias_reports_CpG)
  dim(org_bias_reports_CpG)

  par(family = 'Helvetica')

  x = 1:nrow(org_bias_reports_CpG)
  y = rowMeans(org_bias_reports_CpG, na.rm = T)
  roof = max(y) * 1.05
  floot = min(y) * 1.05

  plot(x, y, ylim = c(0,100),
       type = 'l', xlab = 'Position in read', ylab = 'Met. rate (%)', col = 'red',
       main = '',
       #legend = c('CpG', 'CH') ,
       cex.lab = font.size,
       cex.axis = font.size,
       las = 1,


  )

  x = 1:nrow(org_bias_reports_CH)
  y = rowMeans(org_bias_reports_CH)
  roof = max(y) * 1.05
  floot = min(y) * 1.05
  lines(x, y, ylim = c(0,10),
        type = 'l', xlab = 'position', ylab = 'Met. rate (%)', col = 'blue')

  legend('right', legend = c('CpG', 'CH'), col = c('red', 'blue'), lty = 1, bty = "n", cex = font.size)

  mtext(line=1, cex.main=font.size, family =font.face, font=2, "Position bias")

  box("figure", col=sep.color,  lwd = sep.lwd)


  ####Methylation rate##########################3


  roof = max(unlist(pcts_met))

  #vioplot(
  boxplot(
    pcts_met,
    col = "lightgreen", names = x_labels,
    cex.main = font.size,
    #main = '% Methylation rate',
    ylab = '% Methylated C', cex.lab = font.size,
    cex.axis = font.size, cex.names = font.size
    , outline = F
    , las = 1
  )

  mtext(line=1, cex.main=font.size, family =font.face, font=2, "Methylation rate")


  ##########Call Counts###################

  par(mar = c(7,8.5,3,4))

  #c_type = 'CpG'

  c_types = c('CpG', 'CHG', 'CHH')

  for(c_type in c_types)
  {

    call_counts =list_met_call_counts[[c_type]] / 1000000
    dens = density(call_counts)

    plot(dens, main="", xlab = "million sites called", ylab = "", las = 1,
         cex.lab = font.size, cex.axis = font.size)
    polygon(dens, col="#a6cee3", border="#a6cee3")

    title(ylab="Density", line=4, cex.lab=font.size, family=font.face)

    median_call_count = median(call_counts)
    abline(v = median_call_count, col = 'red', lty = 2, lwd = 2)

    legend('topright', legend = c('Med.'),
           col = c('red'), cex = font.size * 0.9,
           lty = 2, bty = "n", inset = c(0, 0))

    box("figure", col=sep.color,  lwd = sep.lwd)


    mtext(line=1, cex.main=font.size, family =font.face, font=2, c_type)


  }



  box("outer", col=sep.color,  lwd = 75)
  box("outer", col="black",  lwd = 3)

  title(paste(sample_name, ': Methylation calling results'), line = -1.5, outer = TRUE,
        cex.main=font.size * 1.2, family=font.face)

}
yasin-uzun/SINBAD documentation built on March 20, 2022, 11:48 p.m.