inst/Kronos_scRT/modules/Kronos_RT.R

#options
options(
  dplyr.summarise.inform = FALSE,
  scipen = 999
)
#parse input
option_list = list(
  optparse::make_option(
    c("-K", "--Kronos_conf_file"),
    type = "character",
    default = NULL,
    help = "Kronos setting file. If provided -F,-T,-S,-b and -g are ignored. Tab file containing: Per cell stat file <TAB> tracks file <TAB> settings file <TAB> basename (optional) <TAB> group (optional) ",
    metavar = "character"
  ),
  optparse::make_option(
    c("-F", "--file"),
    type = "character",
    default = NULL,
    help = "Per cell stat file , if multiple files are provided they have to be separated by a comma",
    metavar = "character"
  ),
  optparse::make_option(
    c("-T", "--tracks"),
    type = "character",
    default = NULL,
    help = "Tracks file,  if multiple files are provided they have to be separated by a comma",
    metavar = "character"
  ),
  optparse::make_option(
    c("-C", "--chrSizes"),
    type = "character",
    default = NULL,
    help = "Chromosome size file",
    metavar = "character"
  ),
  optparse::make_option(
    c("-o", "--out"),
    type = "character",
    default = "output",
    help = "Output directory [default= %default]",
    metavar = "character"
  ),
  optparse::make_option(
    c("-f", "--output_file_base_name"),
    type = "character",
    default = "out",
    help = "Base name for the output file [default= %default]",
    metavar = "character"
  ),
  optparse::make_option(
    c("-b", "--base_name"),
    type = "character",
    default = NULL,
    help = "Basename of an experiment, if multiple basenames are provided they have to be separated by a comma",
    metavar = "character"
  ),
  optparse::make_option(
    c("-g", "--groups"),
    type = "character",
    default = NULL,
    help = "Group name of an experiment, if multiple groups are provided they have to be separated by a comma",
    metavar = "character"
  ),
  optparse::make_option(
    c("-S", "--settings_file"),
    type = "character",
    help = "File generated by Kronos diagnostic, if multiple files are provided they have to be separated by a comma",
    metavar = "character"
  ),
  optparse::make_option(
    c("-B", "--binSize"),
    type = "character",
    default = '500Kb',
    help = "RT resolution (supports units) [default= %default] ",
    metavar = "character"
  ),
  optparse::make_option(
    c("-c", "--cores"),
    type = "integer",
    default = 3,
    help = "Number of parallel jobs to run [default= %default]",
    metavar = "integer"
  ),
  optparse::make_option(
    c("--min_correlation"),
    type = "double",
    default = 0.25,
    help = "Minimum correlation value between one cell and its best correlating cell for this cell to not be discarded [default= %default]",
    metavar = "double"
  ),
  optparse::make_option(
    c("--extract_G1_G2_cells"),
    type = "logical",
    default = F,
    action = "store_true",
    help = "Extract G1/G2 single cells copy numebr file [default= %default]",
    metavar = "logical"
  ),
  optparse::make_option(
    c("--chr_prefix"),
    type = "character",
    action = 'store',
    help = "Chromosome prefix, if there is no prefix use none [default= %default]",
    default = "chr",
    metavar = "character"
  ),
  optparse::make_option(
    c("--chr_range"),
    type = "character",
    action = 'store',
    help = "Chromosomes to consider in the analysis (example 1:5,8,15:18,X) [default= %default]",
    default = "1:22",
    metavar = "character"
  )
)

#recover inputs
opt = optparse::parse_args(object = optparse::OptionParser(option_list = option_list))

#set plotting theme
ggplot2::theme_set(ggplot2::theme_bw())

#define operators
`%do%` = foreach::`%do%`
`%>%` = tidyr::`%>%`
`%dopar%` = foreach::`%dopar%`


#check inputs
if ('Kronos_conf_file' %in% names(opt)) {
  if (!file.exists(opt$Kronos_conf_file)) {
    stop('Provided setting file does not exist')
  }

  settings = tryCatch(
    expr = readr::read_tsv(
      opt$Kronos_conf_file,
      col_names = c('file', 'traks', 'settings', 'basename', 'groups'),
      col_types = readr::cols()
    ) %>%
      dplyr::mutate(
        basename = ifelse(
          is.na(basename),
          paste0('exp', dplyr::row_number()),
          basename
        ),
        groups = ifelse(is.na(groups), basename, groups)
      ),
    error = function(e) {
      stop('Settings file does not exitst. See script usage (--help)')
    },
    warning = function(w) {
      tmp = suppressWarnings(readr::read_tsv(
        opt$Kronos_conf_file,
        col_names = c('file', 'traks', 'settings'),
        col_types = readr::cols()
      ))
      return(tmp)
    }
  )

  #reformat files
  opt$file = settings$file

  opt$settings_file = settings$settings

  opt$tracks = settings$traks

  if ('basename' %in% names(settings)) {
    opt$base_name = settings$basename
  }
  if ('basename' %in% names(settings)) {
    opt$groups = settings$groups
  }
} else{
  if (!'file' %in% names(opt)) {
    stop("Per cell stat file or Kronos setting file must be provided. See script usage (--help)")
  }

  if (!'tracks' %in% names(opt)) {
    stop(
      "File containing cells CNV or Kronos setting file must be provided. See script usage (--help)"
    )
  }

  if (!'settings_file' %in% names(opt)) {
    stop("File containing settings sizes must be provided. See script usage (--help)")
  }

  if (!'groups' %in% names(opt)) {
    opt$groups = opt$base_name
  }

  #reformat files
  opt$file = stringr::str_split(opt$file, ',', simplify = T)

  opt$settings_file = stringr::str_split(opt$settings_file, ',', simplify = T)

  opt$tracks = stringr::str_split(opt$tracks, ',', simplify = T)

  if (!is.null(opt$base_name)) {
    opt$base_name = stringr::str_split(opt$base_name, ',', simplify = T)
    #if provided base names are not enough
    if (length(opt$base_name) != length(opt$file)) {
      opt$base_name = NULL
      opt$groups = NULL
    }
  }
  if (!is.null(opt$groups)) {
    opt$groups = stringr::str_split(opt$groups, ',', simplify = T)
    #if provided groups are not enough
    if (length(opt$groups) != length(opt$file)) {
      opt$groups = NULL
      opt$base_name = NULL
    }
  }
}

if (!'chrSizes' %in% names(opt)) {
  stop("File containing Chromosomes sizes must be provided. See script usage (--help)")
}

#check per cell files
results = paste(lapply(opt$file, function(x)
  Kronos.scRT::right_format(
    file_path = x,
    delim = ',',
    columns_to_check = c(
      'Cell',
      'normalized_dimapd',
      'mean_ploidy',
      'ploidy_confidence',
      'is_high_dimapd',
      'is_noisy',
      'coverage_per_1Mbp',
      "basename",
      "group"
    ),
    wrong_message = paste(x, ',provided as a per cell file, does not have the right format. ')
  )),
  collapse = '')
if (results != '') {
  stop(results)
}

#check tracks files
results = paste(lapply(opt$tracks, function(x)
  Kronos.scRT::right_format(
    file_path = x,
    delim = '\t',
    columns_to_check = c(
      'Cell',
      'chr',
      'start',
      'end',
      'copy_number',
      'reads',
      "basename",
      "group"
    ),
    wrong_message = paste(x, ',provided as a track file, does not have the right format')
  )),
  collapse = '')

if (results != '') {
  stop(results)
}
#check settings files
results = paste(lapply(opt$settings_file, function(x)
  Kronos.scRT::right_format(
    file_path = x,
    delim = '\t',
    columns_to_check = c(
      'threshold_Sphase',
      'threshold_G1G2phase',
      'Sphase_first_part',
      'Sphase_second_part',
      'RPM_TH',
      'Ploidy',
      "basename",
      "group"
    ),
    wrong_message = paste(x, ',provided as a setting file, does not have the right format')
  )),
  collapse = '')

if (results != '') {
  stop(results)
}

#check chr size file
results = paste(
  Kronos.scRT::right_format(
    file_path = opt$chrSizes,
    delim = '\t',
    columns_to_check = 2,
    wrong_message = paste(
      opt$chrSizes,
      ',provided as a chromosome size file, does not have the right format'
    )
  ),
  collapse = ''
)

if (results != '') {
  stop(results)
}

# convert binsize to numeric
extract_unit = stringr::str_extract(opt$binSize, pattern = '.{2}$')
resolution = as.numeric(stringr::str_remove(opt$binSize, "[Bb][Pp]|[Kk][Bb]|[Mm][Bb]")) * dplyr::case_when(
  grepl(x = extract_unit, pattern =  '[Kk][Bb]') ~ 1000,
  grepl(x = extract_unit, pattern = '[Mm][Bb]') ~ 1000000,
  grepl(x = extract_unit, pattern = '[Bp][Pp]') ~ 1,
  grepl(x = extract_unit, pattern =  '[0-9][0-9]') ~ 1
)

if (any(is.na(resolution))) {
  stop('binsize have an incorrect format')
}

# prepare name file
if (grepl(x = extract_unit, pattern =  '[0-9][0-9]')) {
  n_of_zeros = stringr::str_length(stringr::str_extract(opt$binSize, '0{1,10}$'))
  opt$binSize = dplyr::case_when(
    is.na(n_of_zeros) ~ paste0(opt$binSize, 'bp'),
    n_of_zeros < 3 ~ paste0(opt$binSize, 'bp'),
    n_of_zeros < 6 ~ paste0(stringr::str_remove(opt$binSize, '0{3}$'), 'Kb'),
    n_of_zeros >= 6 ~ paste0(stringr::str_remove(opt$binSize, '0{6}$'), 'Mp')
  )
} else{
  opt$binSize = opt$binSize
}

#create directory
if (!dir.exists(opt$out)) {
  dir.create(opt$out, recursive = T)
}

# check inputs
if (length(opt$tracks) != length(opt$file)) {
  stop("The number stat files does not match provided trakcs. See script usage (--help)")
}

#select chrs
chr_list = paste0(
  ifelse(opt$chr_prefix == 'none', '', opt$chr_prefix),
  unlist(Kronos.scRT::String_to_Range(
    stringr::str_split(opt$chr_range, ',', simplify = T)
  ))
)

#load genome sizes
Chr_Size = readr::read_tsv(
  opt$chrSizes,
  col_names = c('chr', 'size'),
  col_types = readr::cols(chr = 'c')
) %>%
  dplyr::filter(chr %in% chr_list) %>%
  dplyr::mutate(chr = factor(x =  chr, levels = chr_list)) %>%
  tidyr::drop_na()

chr_list = chr_list[chr_list %in% unique(Chr_Size$chr)]


#load single cells info
data <- foreach::foreach(i = 1:length(opt$file),
                         .combine = 'rbind') %do% {
                           tmp = readr::read_csv(opt$file[i], col_types = readr::cols())
                           if (!is.null(opt$base_name)) {
                             tmp %>%
                               dplyr::mutate(
                                 basename = factor(opt$base_name[i], levels = opt$base_name),
                                 group = factor(opt$groups[i], levels = opt$groups)
                               )
                           } else{
                             tmp
                           }
                         }

#load settings
settings <- foreach::foreach(i = 1:length(opt$settings_file),
                             .combine = 'rbind') %do% {
                               tmp = readr::read_tsv(opt$settings_file[i], col_types = readr::cols())
                               if (!is.null(opt$base_name)) {
                                 tmp %>%
                                   dplyr::mutate(
                                     basename = factor(opt$base_name[i], levels = opt$base_name),
                                     group = factor(opt$groups[i], levels = opt$groups)
                                   )
                               } else{
                                 tmp
                               }
                             }

#load tracks
all_tracks <-
  foreach::foreach(i = 1:length(opt$tracks),
                   .combine = 'rbind') %do% {
                     tmp = readr::read_delim(opt$tracks[i],
                                             delim = '\t',
                                             col_types = readr::cols(chr = 'c'))

                     if (!is.null(opt$base_name)) {
                       tmp %>%
                         dplyr::mutate(
                           basename = factor(opt$base_name[i], levels = opt$base_name),
                           group = factor(opt$groups[i], levels = opt$groups),
                           chr = factor(x =  chr, levels = chr_list)
                         ) %>%
                         tidyr::drop_na()
                     } else{
                       tmp %>%
                         dplyr::mutate(chr = factor(x =  chr, levels = chr_list)) %>%
                         tidyr::drop_na()
                     }
                   }

#Adjust Percell
data = Kronos.scRT::AdjustPerCell(PerCell = data,
                                  Settings = settings)

#adjust CNV
all_tracks = Kronos.scRT::AdjustCN(PerCell = data, scCN = all_tracks)

#create genome bins for the final output
#load chromosome file
bins = Kronos.scRT::GenomeBinning(
  Chr_size = Chr_Size,
  size = resolution,
  Chr_filter = chr_list,
  Cores = opt$cores
)

#save bins to resize References
saveRDS(
  object = list(bins = bins, bs = opt$binSize),
  file = paste0(file.path(opt$out,
                          opt$output_file_base_name),
                '_bins.rds')
)

#rebin data
signal_smoothed = Kronos.scRT::Rebin(
  PerCell = data,
  scCN = all_tracks,
  Bins = bins,
  Sphase = T
)


G1G2_smoothed = Kronos.scRT::Rebin(
  PerCell = data,
  scCN = all_tracks,
  Bins = bins,
  Sphase = F
)

#free some memory
rm('all_tracks')

#calculate background
backgroud_smoothed = Kronos.scRT::BackGround(G1_scCN = G1G2_smoothed)

# binarizes data into Replicated or not replicated bins
signal_smoothed = Kronos.scRT::Replication_state(
  Samples = signal_smoothed ,
  background = backgroud_smoothed,
  Chr_filter = chr_list,
  cores = opt$cores
)

if (opt$extract_G1_G2_cells) {
  # create single G1/G2 single cell file
  G1G2_smoothed = Kronos.scRT::Replication_state(
    Samples = G1G2_smoothed,
    background = backgroud_smoothed,
    Chr_filter = chr_list,
    cores = opt$cores
  )
  #write results
  G1G2_smoothed %>%
    dplyr::select(
      chr,
      start,
      end,
      CN,
      background,
      CN_bg,
      th,
      Rep,
      PercentageReplication,
      Cell,
      basename,
      group,
      newIndex
    ) %>%
    readr::write_delim(
      file = paste0(
        file.path(opt$out,
                  opt$output_file_base_name),
        '_G1_G2_single_cells_CNV_',
        opt$binSize,
        '.tsv'
      ),
      delim = '\t',
      col_names = T
    )
}

# remove control track
rm('backgroud_smoothed')
rm('G1G2_smoothed')
#plot profile binning
plot = signal_smoothed %>%
  dplyr::group_by(index, group) %>%
  dplyr::summarise(Rep_percentage = mean(Rep)) %>%
  ggplot2::ggplot(ggplot2::aes(Rep_percentage, color = group)) +
  ggplot2::geom_density(ggplot2::aes(y = ..scaled..)) +
  ggplot2::scale_x_continuous(labels = scales::percent) +
  ggplot2::xlab('Percentage of the genome that has been replicated') +
  ggplot2::ylab('density') + ggplot2::coord_cartesian(xlim = c(0, 1))

suppressMessages(ggplot2::ggsave(plot = plot,
                                 filename = file.path(
                                   opt$out,
                                   paste0(
                                     opt$output_file_base_name,
                                     '_percentage_of_replicating_cells.pdf'
                                   )
                                 )))

#Filter and save plots
signal_smoothed = Kronos.scRT::FilterCells(
  scCN = signal_smoothed,
  min_cor = opt$min_correlation,
  save_plot_basename = file.path(opt$out, paste0(opt$output_file_base_name,
                                                 '_S_phase'))
)


#plot dist after filtering
rep_percentage = signal_smoothed %>%
  dplyr::group_by(Cell, basename, group, newIndex) %>%
  dplyr::summarise(Rep_percentage = mean(Rep))

plot = rep_percentage %>%
  ggplot2::ggplot(ggplot2::aes(Rep_percentage, color = group)) +
  ggplot2::geom_density(ggplot2::aes(y = ..scaled..)) +
  ggplot2::scale_x_continuous(labels = scales::percent) +
  ggplot2::xlab('Percentage of the genome that has been replicated') +
  ggplot2::ylab('density') + ggplot2::coord_cartesian(xlim = c(0, 1))

suppressMessages(ggplot2::ggsave(
  plot = plot,
  filename = paste0(
    file.path(opt$out,
              opt$output_file_base_name),
    '_percentage_of_replicating_cells_after_filtering.pdf'
  )
))

#new index
readr::write_delim(
  x = signal_smoothed,
  file = paste0(
    file.path(opt$out,
              opt$output_file_base_name),
    '_single_cells_CNV_',
    opt$binSize,
    '.tsv'
  ),
  delim = '\t',
  col_names = T
)

#calculate pseudo bulk RT
scRT = Kronos.scRT::pseudoBulkRT(S_scCN = signal_smoothed)

#write scRT
readr::write_delim(
  x = scRT,
  file = paste0(
    file.path(opt$out,
              opt$output_file_base_name),
    '_calculated_replication_timing_',
    opt$binSize,
    '.tsv'
  ),
  delim = '\t',
  col_names = T
)
# Variability S-phase
Var = Kronos.scRT::Variability(S_scCN = signal_smoothed, scRT = scRT)

Var %>%
  readr::write_delim(
    file = paste0(
      file.path(opt$out,
                opt$output_file_base_name),
      '_variability.tsv'
    ),
    delim = '\t',
    col_names = T
  )

#free some space
rm('Var')

#select used data and save the new per cell files
data =
  data %>%
  dplyr::filter(Type == 'G1/G2-phase cells' |
                  Cell %in% unique(signal_smoothed$Cell))

if (!dir.exists(file.path(opt$out, 'Cells_used_in_the_analysis_info'))) {
  dir.create(file.path(opt$out, 'Cells_used_in_the_analysis_info'))
}

bs = data %>%
  dplyr::select(basename, group) %>%
  unique()

bs = foreach::foreach(i = 1:nrow(bs)) %do% {
  data %>%
    dplyr::filter(basename == bs$basename[i] &
                    group == bs$group[i]) %>%
    dplyr::select(
      Cell,
      normalized_dimapd,
      mean_ploidy,
      ploidy_confidence,
      is_high_dimapd,
      is_noisy,
      coverage_per_1Mbp,
      basename,
      group
    ) %>%
    readr::write_csv(paste0(
      file.path(opt$out,
                'Cells_used_in_the_analysis_info',
                bs$basename[i]),
      '_',
      bs$group[i],
      '_per_Cell_summary_metrics.csv'
    ))
  i

}

print('done')
Derfen3001/Kronos.scRT documentation built on Aug. 5, 2022, 8:30 a.m.