R/preprocessing.R

Defines functions plot_preprocessing_results count_fastq_reads process_cutadapt_logs trim_fastq_files read_demux_logs demux_fastq_files get_r2_indeces_from_r1 get_fast_files

# library(ShortRead)
# library(scales)


get_fast_files <- function(fastq_dir, pattern = '') {
  fastq_files_1 = list.files(fastq_dir, pattern = "\\.fastq.gz$")
  fastq_files_2 = list.files(fastq_dir, pattern = "\\.fastq$")

  fastq_files = union(fastq_files_1, fastq_files_2)
  fastq_files = fastq_files[grepl(pattern = pattern, fastq_files)]

  return(fastq_files)
}

get_r2_indeces_from_r1 <- function(r1_fastq_dir, r2_input_fastq_dir, r2_output_fastq_dir, sample_name = '') {
  setwd(r1_fastq_dir)
  dir.create(r2_output_fastq_dir, recursive = T)

  r1_fastq_files = get_fast_files(fastq_dir = r1_fastq_dir, pattern = '_R1')
  r1_fastq_files = r1_fastq_files[!grepl(pattern = 'Undetermined', r1_fastq_files)]

  r2_fastq_files = get_fast_files(fastq_dir = r2_input_fastq_dir, pattern = '_R2')
  r2_fastq_files = r2_fastq_files[!grepl(pattern = 'Undetermined', r2_fastq_files)]

  r1_fastq_files = r1_fastq_files[grepl(sample_name, r1_fastq_files)]
  r2_fastq_files = r2_fastq_files[grepl(sample_name, r2_fastq_files)]

  r1_fastq_file_count = length(r1_fastq_files)

  # overwrite Perl index script path
  perl_index_transfer_path <- file.path(find.package("SINBAD"), "extdata", "get_r2_indeces_from_r1.pl")

  if (num_cores > 1) {
    thread_count = min(r1_fastq_file_count, num_cores)
    # cl <- parallel::makeCluster(thread_count, outfile="", type = 'SOCK')
    # doSNOW::registerDoSNOW(cl)

    foreach::`%dopar%`(foreach::foreach(i=1:r1_fastq_file_count, .export = ls(globalenv())), {
      r1_fastq_file = r1_fastq_files[i]
      print(r1_fastq_file)

      r2_fastq_file = gsub('_R1', '_R2', r1_fastq_file)

      command = paste0('perl ', perl_index_transfer_path, ' ',
                       ' --input_r1_fastq_file ',  r1_fastq_dir, '/', r1_fastq_file ,
                       ' --input_r2_fastq_file ', r2_input_fastq_dir, '/', r2_fastq_file ,
                       ' --demux_index_length ', demux_index_length,
                       ' --output_r2_fastq_file ', r2_output_fastq_dir, '/', r2_fastq_file )

      print(command)
      system(command)

    })

    # parallel::stopCluster(cl)

  }
  else
  {

    for(i in 1:r1_fastq_file_count)
    {
      r1_fastq_file = r1_fastq_files[i]
      print(r1_fastq_file)

      r2_fastq_file = gsub('_R1', '_R2', r1_fastq_file)

      command = paste0('perl ', perl_index_transfer_path, ' ',
                       ' --input_r1_fastq_file ',  r1_fastq_dir, '/', r1_fastq_file ,
                       ' --input_r2_fastq_file ', r2_input_fastq_dir, '/', r2_fastq_file ,
                       ' --demux_index_length ', demux_index_length,
                       ' --output_r2_fastq_file ', r2_output_fastq_dir, '/', r2_fastq_file )

      print(command)
      system(command)

    }#foreach::foreach(i=1:length(raw_fastq_files))


  }

}#demux_fastq_files

demux_fastq_files <- function(raw_fastq_dir, demux_index_file, demux_index_length, demux_fastq_dir,
                              main_log_dir, read_type = '', sample_name = '')
{
  setwd(raw_fastq_dir)

  fastq_files_1 = list.files(raw_fastq_dir, pattern = "\\.fastq.gz$")
  fastq_files_2 = list.files(raw_fastq_dir, pattern = "\\.fastq$")

  raw_fastq_files = union(fastq_files_1, fastq_files_2)
  raw_fastq_files = raw_fastq_files[!grepl('Undetermined', raw_fastq_files)]

  raw_fastq_files = raw_fastq_files[grepl(sample_name, raw_fastq_files)]
  raw_fastq_files = raw_fastq_files[grepl(read_type, raw_fastq_files)]

  demux_log_dir = paste0(main_log_dir, '/demux/')
  dir.create(demux_log_dir, showWarnings = F, recursive = T)

  if(num_cores > 1)
  {
    raw_fastq_file_count = length(raw_fastq_files)
    thread_count = min(raw_fastq_file_count, num_cores)
    # cl <-parallel::makeCluster(thread_count, outfile="", type = 'SOCK')
    # doSNOW::registerDoSNOW(cl)
    foreach::`%dopar%`(foreach::foreach(i=1:raw_fastq_file_count, .export = ls(globalenv())), {
      raw_fastq_file = raw_fastq_files[i]
      print(raw_fastq_file)
      output_prefix = gsub('.fastq.gz', '', raw_fastq_file)
      #ssdemux_log_file = paste0(demux_log_dir, '/', raw_fastq_file, '.log')

      perl_demux_path <- file.path(find.package("SINBAD"), "extdata", "demultiplex_fastq.pl")

      demux_command = paste0('perl ', perl_demux_path, ' ',
                             ' --demux_index_file ',  demux_index_file ,
                             ' --raw_fastq_file ', raw_fastq_dir, '/', raw_fastq_file ,
                             ' --demux_index_length ', demux_index_length,
                             ' --output_dir ', demux_fastq_dir ,
                             ' --output_prefix ', output_prefix,
                             ' --log_dir ', demux_log_dir)

      print(demux_command)
      system(demux_command)

    })#foreach::foreach(i=1:length(raw_fastq_files))

    # parallel::stopCluster(cl)

  }#if(num_cores > 1)



}#demux_fastq_files

read_demux_logs <- function(main_log_dir)
{

  demux_log_dir = paste0(main_log_dir, '/demux/')
  demux_log_files = list.files(demux_log_dir, pattern = "\\.log$")
  demux_log_files = demux_log_files[!grepl('Undetermined', demux_log_files)]
  setwd(demux_log_dir)
  list_df_demux_combined = list()

  for(demux_log_file in demux_log_files)
  {
    print(demux_log_file)
    lane_id = gsub('.log', '', demux_log_file)
    list_df_demux_combined[[lane_id]] = read.table(demux_log_file, header = T)
  }#for

  df_demux_combined = do.call('rbind', list_df_demux_combined)
  df_demux_combined = data.frame(Group = rownames(df_demux_combined), df_demux_combined)
  head(df_demux_combined)

  return(df_demux_combined)
}#read_demux_logs


trim_fastq_files <- function(demux_fastq_dir, trimmed_fastq_dir, main_log_dir)
{

  print('Trimming fastq files')
  setwd(demux_fastq_dir)

  fastq_files_1 = list.files(demux_fastq_dir, pattern = ".fastq.gz")
  fastq_files_2 = list.files(demux_fastq_dir, pattern = ".fastq")

  demux_fastq_files = union(fastq_files_1, fastq_files_2)
  demux_fastq_files = demux_fastq_files[!grepl('No_matching_index', demux_fastq_files)]
  print('demux_fastq_files :')
  #print( demux_fastq_files)
  trimming_log_dir = paste0(main_log_dir, '/trimming/')
  dir.create(trimming_log_dir, showWarnings = F, recursive = T)

  if(num_cores > 1)
  {
    # cl <-parallel::makeCluster(num_cores, outfile="", type = 'SOCK')
    # doSNOW::registerDoSNOW(cl)
    # parallel::clusterExport(cl, ls(), envir = environment())


    if(sequencing_type  == 'paired')
    {
      demux_fastq_files = demux_fastq_files[grepl('R1', demux_fastq_files)]
    }

    print('*******************Trimming....')
    foreach::`%dopar%`(foreach::foreach(i=1:length(demux_fastq_files), .export = ls(globalenv())), {
      fastq_file = demux_fastq_files[i]
      print(paste('Input fastq: ', fastq_file) )
      log_file = paste0(trimming_log_dir, fastq_file, '.log')

      if(trimmer == 'cutadapt')
      {
        if(sequencing_type  == 'paired')
        {
          fastq_file.left = fastq_file
          fastq_file.right = gsub('R1', 'R2', fastq_file)
          command = paste0(cutadapt_path, '/cutadapt ',cutadapt_param_settings,
                           ' -o ', trimmed_fastq_dir, '/', fastq_file.left,
                           ' -p ', trimmed_fastq_dir, '/', fastq_file.right,
                           '  ',  demux_fastq_dir, '/',fastq_file.left ,
                           '  ',  demux_fastq_dir, '/',fastq_file.right ,
                           ' >  ',  log_file
          )

        }else
        {
           command = paste0(cutadapt_path, '/cutadapt ',cutadapt_param_settings,
                             ' -o ', trimmed_fastq_dir, '/', fastq_file,
                             '  ',  demux_fastq_dir, '/',fastq_file ,
                             ' >  ',  log_file
                             )
      }
          print(command)
          system(command)


      }else if(trimmer == 'trim_galore')
      {

        command = paste0(trim_galore_path, '/trim_galore ',trim_galore_param_settings,
                           ' -o ', trimmed_fastq_dir,
                           '  ',  demux_fastq_dir, '/',fastq_file ,
                           ' >  ',  log_file
                          )
        print(command)
        system(command)
        trimmed_fq_name = sub('.fastq.gz', '_trimmed.fq.gz',  fastq_file)
        mv_command = paste0('mv ', trimmed_fastq_dir, trimmed_fq_name, ' ',  trimmed_fastq_dir, fastq_file)
        system(mv_command)

      }else if(trimmer == 'Trimmomatic')
      {

        command = paste0('java -jar ', Trimmomatic_jar_path,
                         ' ', Trimmomatic_param_settings,
                         ' -trimlog ', trimming_log_dir, '/',log_file,
                         ' ', demux_fastq_dir, '/',fastq_file ,
                         ' ',  fastq_file
                         )

        system(command)


      }else
      {
        stop('Invalid  trimmer name, exiting. Trimmer must be one of these: cutadapt trim_galore Trimmomatic')
      }


      #ILLUMINACLIP:${ADAPTER_SEQ}:2:30:10:2:keepBothReads LEADING:3 TRAILING:3 MINLEN:25



    }) #foreach::foreach(i=1:length(raw_fastq_files))

  }#if(num_cores > 1)



}


process_cutadapt_logs <- function(main_log_dir)
{
  cutadapt_log_dir = paste0(log_dir, '/cutadapt/')
  cutadapt_log_files = list.files(cutadapt_log_dir, pattern = "*.log")
  setwd(cutadapt_log_dir)

  row_names = c()
  result_list = list()
  for(report_file in cutadapt_log_files)
  {
    print(report_file)
    cell_id = gsub('.fastq.gz.log', '', report_file)
    lines = readLines(report_file)
    informative_lines = lines[grep(':', lines)]
    informative_lines = informative_lines[!grepl('strand', informative_lines)]
    informative_lines = gsub('\\(', ':', informative_lines)
    informative_lines = gsub('\\)', ':', informative_lines)

    df_report = read.delim(text = informative_lines, sep = '\t',  header = F)
    class(df_report)
    row_names = gsub(':', '', df_report$V1)
    result_list[[cell_id]] = as.character(df_report$V2)
  }

  row_names[1] = "Total_reads"
  row_names[2] = "Alignments"
  row_names[3] = "Alignment_rate"

  row_names[4] = "Sequences_with_no_alignments"
  row_names[6] = "Discarded_sequences"

  df_result = do.call(cbind, result_list)
  rownames(df_result) = row_names
  t_df_result =  t(df_result)
  final_table = data.frame(Cell_ID = rownames(t_df_result),  t_df_result, stringsAsFactors = F)

}

count_fastq_reads <- function(fastq_dir)
{
  print(paste('Reading fastq files from the directory:', fastq_dir))
  fastq_files_1 = list.files(fastq_dir, pattern = ".fastq.gz")
  fastq_files_2 = list.files(fastq_dir, pattern = ".fastq")

  fastq_files = union(fastq_files_1, fastq_files_2)
  fastq_files = fastq_files[!grepl('No_matching_index', fastq_files)]

  num_fastq = length(fastq_files)
  print(paste('I found:', num_fastq, ' fastq files: '))
  print(fastq_files)
  print(paste('Now counting reads inside fastqs '))


  # cl <-parallel::makeCluster(num_cores, outfile="", type = 'SOCK')
  # doSNOW::registerDoSNOW(cl)
  # parallel::clusterExport(cl, ls(), envir = environment())
  #parallel::clusterExport(cl, varlist = ls(), envir = environment())

  #read_counts = c()
  line_counts_list = foreach::`%dopar%`(foreach::foreach(i=1:length(fastq_files), .export = ls(globalenv())), {
    # library(ShortRead)
    fastq_file = fastq_files[i]
    print(fastq_file)
    ShortRead::countLines(fastq_dir, fastq_file)
  })

  line_counts = unlist(line_counts_list)
  read_counts = line_counts / 4
  names(read_counts) = gsub('.fastq.gz', '', names(read_counts))
  names(read_counts) = gsub('.fastq', '', names(read_counts))

  return(read_counts)

}

plot_preprocessing_results <- function(sample_name, demux_reports, demux_read_counts, trimmed_read_counts)
{
  font.face = 'Helvetica'
  font.size = 1.5
  color_vec = scales::hue_pal()(3)
  sep.lwd = 1

  #sep.color = "#33a02c"

  sep.color = "#fdbf6f"


  #layout(mat = matrix(c(1, 2, 1, 3),
  #                    2, 2, byrow = TRUE))

  #par(oma = c(2,2,2,2))

  par(mfrow = c(2,3))


  ###########DEMUX STATS####################

  par(mar = c(7,7,8,5))

  count_multiplexed_fastq = nrow(demux_reports)
  count_demultiplexed_fastq = length(demux_read_counts)

  Total_reads.sum = sum(demux_reports$Total_reads)
  Reads_with_matching_index.sum = sum(demux_reports$Reads_with_matching_index)
  Reads_without_matching_index.sum = sum(demux_reports$Reads_without_matching_index)

  par(mar = c(2,4,6,2))

  plot.new()
  overall_stat = c()

  overall_stat["Sample_Name"]  = sample_name

  #title("Demultiplexing Statistics:", line=1, cex = font.size, family =font.face, adj=0)
  mtext(side =3 ,line=1, cex.main=font.size, family =font.face, font=2, adj=0, "Demultiplexing Statistics:")

  lbl = paste0("Input count: ",format(count_multiplexed_fastq, scientific = F, big.mark = ","))
  mtext(side =3 , line=-2, cex.main=font.size,adj=0,
        family=font.face,  font.main =1, lbl)
  overall_stat["Input_count"]  = count_multiplexed_fastq

  lbl = paste0("Output count: ",format(count_demultiplexed_fastq, scientific = F, big.mark = ","))
  mtext(side =3 , line=-4, cex.main=font.size,adj=0,
        family=font.face,  font.main =1, lbl)
  overall_stat["Output_count"]  = count_demultiplexed_fastq

  lbl = paste0("Total reads: ",format(Total_reads.sum, scientific = F, big.mark = ","))
  mtext(side =3 , line=-6, cex.main=font.size,adj=0,
        family=font.face,  font.main =1, lbl)
  overall_stat["Total_reads"]  = Total_reads.sum


  lbl = paste0("Matching index: ",format(Reads_with_matching_index.sum, scientific = F, big.mark = ","))
  mtext(side =3 , line=-8, cex.main=font.size,adj=0,
        family=font.face,  font.main =1, lbl)
  overall_stat["Matching_index"]  = Reads_with_matching_index.sum


  lbl = paste0("No matching index: ",format(Reads_without_matching_index.sum, scientific = F, big.mark = ","))
  mtext(side =3 , line=-10, cex.main=font.size,adj=0,
        family=font.face,  font.main =1, lbl)
  overall_stat["No_matching_index"]  = Reads_without_matching_index.sum



  box("figure", col=sep.color,  lwd = sep.lwd)


  ###########DEMUX BOXPLOT####################
  par(mar = c(6,7,6,5))


  list_demux_counts = demux_reports[c('Total_reads', 'Reads_with_matching_index')] /1000000
  roof = max(list_demux_counts) * 1.2

  boxplot(list_demux_counts, names = c('Total', 'With index'), #main = 'Demultiplexing statistics',
           col = color_vec[2], outline = F, las = 1, ylab = 'million reads',
          cex.lab = font.size, cex.axis = font.size, ylim = c(0, roof)
          ,  medlwd = 1)
  #title("Demultiplexing", line=1, cex.lab=font.size, cex.main = font.size,  family=font.face)
  mtext(line=1, cex.main=font.size, family =font.face, font=2, "Demultiplexing")

  box("figure", col=sep.color,  lwd = sep.lwd)



  ###########DEMUX LOSS RATES####################

  par(mar = c(6,6,6,4))


  index_loss_rates = (1 - list_demux_counts$Reads_with_matching_index / list_demux_counts$Total_reads ) * 100

  #barplot(sort(index_loss_rates))
  roof = max(10, max(index_loss_rates * 1.2))
  barplot(sort(index_loss_rates, decreasing = T),
       #xlab = 'Multiplexed input',
       ylab = '% of total reads',  cex.lab = font.size, cex.axis = font.size,
       col = 'brown1',

       las = 2,
       #main = 'Pct of reads without index',
       ylim = c(0, roof)

       #, cex.main = 0.8
  )
  title(xlab="Multiplexed input", line=1, cex.lab=font.size, family=font.face)
  #title("Pct. of reads without index", line=1, cex.lab=font.size, family=font.face)
  mtext(line=1, cex.main=font.size, family =font.face, font=2, "Pct. of reads without index")

  box("figure", col=sep.color,  lwd = sep.lwd)



  ###########TRIM STATS####################



  count_multiplexed_fastq = nrow(demux_reports)
  count_demultiplexed_fastq = length(demux_read_counts)

  Total_reads.trim = sum(trimmed_read_counts)
  Mean_reads.trim = mean(trimmed_read_counts)
  Pct.read.trimmed.out = (demux_read_counts - trimmed_read_counts) / demux_read_counts * 100
  Pct.read.trimmed.out.mean = round(mean(Pct.read.trimmed.out), 1)

  par(mar = c(2,4,6,2))

  plot.new()
  #title("Demultiplexing Statistics:", line=1, cex = font.size, family =font.face, adj=0)
  mtext(side =3 ,line=1, cex.main=font.size, family =font.face, font=2, adj=0, "Trimming Statistics:")

  lbl = paste0("Total trimmed reads: ",format(Total_reads.trim, scientific = F, big.mark = ","))
  mtext(side =3 , line=-2, cex.main=font.size,adj=0,
        family=font.face,  font.main =1, lbl)
  overall_stat["Total_trimmed_reads"]  = Total_reads.trim


  lbl = paste0("Mean trimmed reads: ",format(Mean_reads.trim, scientific = F, big.mark = ","))
  mtext(side =3 , line=-4, cex.main=font.size,adj=0,
        family=font.face,  font.main =1, lbl)
  overall_stat["Mean_trimmed_reads"]  = round(Mean_reads.trim)

  lbl = paste0("Mean pct. of reads filtered out: ",format(Pct.read.trimmed.out.mean, scientific = F, big.mark = ","), "%")
  mtext(side =3 , line=-6, cex.main=font.size,adj=0,
        family=font.face,  font.main =1, lbl)
  box("figure", col=sep.color,  lwd = sep.lwd)
  overall_stat["Pct_removed_reads"]  = Pct.read.trimmed.out.mean



  ###########TRIM LINE PLOT####################

  par(mar = c(8,7,4,4))

  #trimmed_read_counts = trimmed_read_counts[!grepl('R2', names(trimmed_read_counts))]
  #demux_read_counts = demux_read_counts[!grepl('R2', names(demux_read_counts))]


  trim_loss_rates = (1 - trimmed_read_counts / demux_read_counts[names(trimmed_read_counts)] ) * 100

  #barplot(sort(loss_rates))
  plot(sort(trim_loss_rates), type = 'l',
       xlab = 'Demultiplexed input', ylab = '% reads filtered out', col = 'blue',
       #main = 'Adapter trimming'
       #, cex.main = 0.8
       , cex.lab=font.size
       , las =1
       , cex.axis = font.size
  )
  #title("Adapter trimming", line=1, cex.lab=font.size, family=font.face)
  mtext(line=1, cex.main=font.size, family =font.face, font=2, "Adapter trimming")

  box("figure", col=sep.color,  lwd = sep.lwd)




  ###########TRIM BOXPLOT####################

  par(mar = c(8,7,4,5))


  x_labels = c('Demux', 'Trimmed')
  bb = boxplot(
    demux_read_counts/1000000,
    trimmed_read_counts/1000000, plot = F)

  roof = max(bb$stats) * 1.2

  boxplot(
    demux_read_counts/1000000,
    trimmed_read_counts/1000000,
    col = color_vec[3], names = x_labels,
    ylab = 'million reads'
    ,cex.lab = font.size, cex.axis = font.size
    #,main = 'Read statistics'
    , outpch = 19, outcex = 0.2
    , las = 1
    , ylim = c(0, roof)
    #,cex.axis = 1.25, cex.names = 1.25,
    #cex.main = 1.25, cex.lab = 1.25
    , outline =F
    ,  medlwd = 1
  )    #Plot number of reads
  #title("Trimming", line=1, cex.lab=font.size, family=font.face)
  mtext(line=1, cex.main=font.size, family =font.face, font=2, "Trimming")

  box("figure", col=sep.color,  lwd = sep.lwd)



   ##############OUTER BOX########################


  #box("inner", col="black",  lwd = 1)


  box("outer", col=sep.color,  lwd = 75)
  box("outer", col="black",  lwd = 3)

  title(paste(sample_name, ': Preprocessing results'), line = -1.5,
        outer = TRUE, cex.main=font.size * 1.2, family=font.face)

  return(overall_stat)


}
yasin-uzun/SINBAD documentation built on March 20, 2022, 11:48 p.m.