R/tlx.R

Defines functions tlxcov_write_bedgraph tlx_read_many tlx_read tlx_read_samples tlx_generate_filename_col tlx_blank tlx_cols tlx_get_group_cols validate_exttype validate_normalization_target validate_bed_mode validate_group_between validate_group_within validate_group

validate_group = function(group) {
  valid_group = c("all", "group", "treatment", "none")
  if(!(group %in% valid_group)) { stop(simpleError(paste0("group should be one of: ", paste(valid_group, collapse=", "), " (Actual:", group, ")"))) }
}

validate_group_within = function(within) {
  within_group = c("group", "treatment", "none")
  if(!(within %in% within_group)) { stop(simpleError(paste0("group should be one of: ", paste(within_group, collapse=", "), " (Actual:", within, ")"))) }
}

validate_group_between = function(within) {
  within_group = c("all", "group", "treatment", "none")
  if(!(within %in% within_group)) { stop(simpleError(paste0("group should be one of: ", paste(within_group, collapse=", "), " (Actual:", within, ")"))) }
}

validate_bed_mode = function(mode) {
  valid_mode = c("junction", "alignment")
  if(!(mode %in% valid_mode)) { stop(simpleError(paste0("group should be one of: ", paste(valid_mode, collapse=", "), " (Actual:", mode, ")"))) }
}

validate_normalization_target = function(target) {
  valid_targets = c("min", "max", "identity", "samplecount_min", "samplecount_max")
  if(!(target %in% valid_targets)) { stop(simpleError(paste0("normalization target should be one of: ", paste(valid_targets, collapse=", "), " (Actual:", target, ")"))) }
}


validate_exttype = function(exttype) {
  valid_exttype = c("symmetrical", "along", "opposite", "none")
  if(!(exttype %in% valid_exttype)) { stop(simpleError(paste0("exttype should be one of: ", paste(valid_exttype, collapse=", "), " (Actual:", exttype, ")"))) }
}

tlx_get_group_cols = function(group, ignore.strand=T, ignore.control=F) {
  validate_group(group)
  if(group=="all") group_cols = c("")[-1]
  if(group=="group") group_cols = c("tlx_group")
  if(group=="treatment") group_cols = c("tlx_group", "tlx_control")
  if(group %in% c("sample", "none")) group_cols = c("tlx_group", "tlx_group_i", "tlx_sample")

  if(!ignore.control) group_cols = unique(c(group_cols, "tlx_control"))
  if(!ignore.strand) group_cols = unique(c(group_cols, "tlx_strand"))

  return(group_cols)
}

tlx_cols = function() {
  readr::cols(
    Qname=readr::col_character(), JuncID=readr::col_character(), Rname=readr::col_character(), Junction=readr::col_double(),
    Strand=readr::col_character(), Rstart=readr::col_double(), Rend=readr::col_double(),
    B_Rname=readr::col_character(), B_Rstart=readr::col_double(), B_Rend=readr::col_double(), B_Strand=readr::col_double(),
    B_Qstart=readr::col_double(), B_Qend=readr::col_double(), Qstart=readr::col_double(), Qend=readr::col_double(), Qlen=readr::col_double(),
    B_Cigar=readr::col_character(), Cigar=readr::col_character(), Seq=readr::col_character(), J_Seq=readr::col_character(), Barcode=readr::col_logical(),
    unaligned=readr::col_double(), baitonly=readr::col_double(), uncut=readr::col_double(), misprimed=readr::col_double(), freqcut=readr::col_double(),
    largegap=readr::col_double(), mapqual=readr::col_double(), breaksite=readr::col_double(), sequential=readr::col_double(), repeatseq=readr::col_double(), duplicate=readr::col_double()
  )
}

tlx_blank = function() {
  blank_tibble(tlx_cols()) %>%
    dplyr::mutate(tlx_sample=NA_character_, tlx_path=NA_character_, tlx_group=NA_character_, tlx_control=NA) %>%
    dplyr::mutate(tlx_is_bait_chromosome=NA, tlx_is_bait_junction=NA, tlx_is_offtarget=NA) %>%
    dplyr::slice(0)
}

tlx_generate_filename_col = function(df, include_sample=F, include_group=F, include_strand=F, include_treatment=T) {
  if(!include_sample & !include_group & !include_strand & !include_treatment) {
    stop(simpleError("At least one parameter must be used to generate file name (include_sample, include_group, include_strand, include_treatment"))
  }
  if(include_sample & !("tlx_sample" %in% colnames(df))) stop(simpleError("tlx_sample column is not pressent in data.frame"))
  if(include_group & !("tlx_group" %in% colnames(df))) stop(simpleError("tlx_group column is not pressent in data.frame"))
  if(include_strand & !("tlx_strand" %in% colnames(df))) stop(simpleError("tlx_strand column is not pressent in data.frame"))
  if(include_treatment & !("tlx_control" %in% colnames(df))) stop(simpleError("tlx_control column is not pressent in data.frame"))

  map_strand = c("+"="plus", "-"="minus", "*"="both")
  filenames_df = df %>% dplyr::select()
  if(include_group) filenames_df$group = df$tlx_group
  if(include_sample) filenames_df$sample = df$tlx_sample
  if(include_treatment) filenames_df$control = ifelse(df$tlx_control, "ctrl", "trmnt")
  if(include_strand) filenames_df$strand = map_strand[df$tlx_strand]

  generated_path = do.call(paste, c(filenames_df, sep = "_"))
  unique_generated_path = unique(generated_path)
  map_unique_generated_path = gsub("[/.]", "-", unique_generated_path)
  map_unique_generated_path = gsub("[^A-Za-z0-9 -]+", "_", map_unique_generated_path)
  map_unique_generated_path = gsub("(_| )+", "\\1", map_unique_generated_path)
  names(map_unique_generated_path) = unique_generated_path
  unname(map_unique_generated_path[generated_path])
}

#' @title tlx_read_samples
#' @export
#' @description Read tab-separated-file with description and location of TLX files (generated with HTGTS), one sample per row
#'
#' @param annotation_path Path of tab-separated-file with samples information
#' @param samples_path Path to a folder where actual TLX files are located in case the information in samples tab-separated-information doesn't contain a full path
#'
#' @details
#' Original HTGTS pipeline is hosted on \href{https://github.com/robinmeyers/transloc_pipeline}{github}. A docker image is available
#' through \href{https://hub.docker.com/repository/docker/sandrejev/htgts}{DockerHUB} (docker://sandrejev/htgts:latest)
#'
#' The file should contain following columns:
#'   \strong{path} - Path to the TLX file. TLX file path is appended to samples_path to get an actual path to TLX sample
#'   \strong{sample} - Name of the sample
#'   \strong{control} - A boolean column that specifies whether sample is a control sample or not (values TRUE/FALSE)
#'   \strong{group} - Name of a group to which a sample belongs (for example it can be different experiments)
#'
#' @seealso tlx_read tlx_read_many
#'
#' @return A data frame with sample information
tlx_read_samples = function(annotation_path, samples_path=".") {
  samples_df = readr::read_tsv(annotation_path, comment="#") %>%
    dplyr::mutate(path=file.path(samples_path, path)) %>%
    dplyr::mutate(tlx_exists=file.exists(path))
  samples_df
}

#' @title tlx_read
#' @export
#' @description Read TLX file (generated with HTGTS)
#'
#' @param sample Path to the TLX file
#' @param sample Name of the sample
#' @param group Name of a group to which a sample belongs (for example it can be different experiments)
#' @param group_i Index of the sample inside the group
#' @param control A boolean value specifying whether sample is a control sample or not
#'
#' @details
#' Original HTGTS pipeline is hosted on \href{https://github.com/robinmeyers/transloc_pipeline}{github}. A docker image is available
#' through \href{https://hub.docker.com/repository/docker/sandrejev/htgts}{DockerHUB} (docker://sandrejev/htgts:latest)
#'
#' @seealso tlx_read_many tlx_read_samples
#'
#' @return Data frame from TLX file
tlx_read = function(path, sample, group="", group_i=1, control=F) {
  tlx_single_df = readr::read_tsv(path, comment="#", skip=1, col_names=names(tlx_cols()$cols), col_types=tlx_cols()) %>%
    dplyr::mutate(tlx_strand=as.character(ifelse(Strand<0, "-", "+"))) %>%
    dplyr::mutate(Seq_length=as.numeric(nchar(Seq)), tlx_sample=as.character(sample), tlx_path=as.character(path), tlx_group=as.character(group), tlx_group_i=as.numeric(group_i), tlx_control=as.logical(control)) %>%
    dplyr::mutate(tlx_duplicated=duplicated(paste0(Rname, B_Rstart, B_Rend, ifelse(tlx_strand=="+", Rstart, Rend))) | duplicated(Seq)) %>%
    dplyr::mutate(QSeq=substr(Seq, Qstart, Qend))


  if(any(is.na(tlx_single_df$misprimed))) {
    warning(paste0("[", sample ,"] ", sum(is.na(tlx_single_df$misprimed)), " of entries had misprimed column not assigned (NA). Replacing with 0 ..."))
    tlx_single_df$misprimed[is.na(tlx_single_df$misprimed)] = 0
  }

  tlx_single_df
}


#' @title tlx_read_many
#' @export
#' @description Read multiple TLX files (generated with HTGTS) using information in samples data, one sample per row
#'
#' @param samples_df Path of tab-separated-file with samples information
#' @param threads Use multiple threads to read TLX files in paralel
#'
#' @details
#' Original HTGTS pipeline is hosted on \href{https://github.com/robinmeyers/transloc_pipeline}{github}. A docker image is available
#' through \href{https://hub.docker.com/repository/docker/sandrejev/htgts}{DockerHUB} (docker://sandrejev/htgts:latest)
#'
#' Data frame with sample information should contain following columns:
#'   \strong{path} - Path to the TLX file. TLX file path is appended to samples_path to get an actual path to TLX sample
#'   \strong{sample} - Name of the sample
#'   \strong{control} - A boolean column that specifies whether sample is a control sample or not (values TRUE/FALSE)
#'   \strong{group} - Name of a group to which a sample belongs (for example it can be different experiments)
#'
#' @seealso tlx_read tlx_read_samples
#'
#' @return Data frame from multiple TLX file
tlx_read_many = function(samples_df, threads=1) {
  if(!all(samples_df$tlx_exists)) {
    files_str = paste(samples_df %>% dplyr::filter(!file.exists(samples_df$path)) %>% .$path, collapse="\n")
    stop(paste0("Some TLX files do not exist: \n", files_str))
  }

  tlx_df.all = data.frame()
  if(threads > 1) {
    doParallel::registerDoParallel(cores=threads)
    tlx_df.all = foreach(f=1:nrow(samples_df)) %dopar% {

      df = tlx_read(
        path=samples_df$path[f],
        sample=samples_df$sample[f],
        control=samples_df$control[f],
        group=samples_df$group[f],
        group_i=ifelse("group_i" %in% colnames(samples_df), samples_df$group_i[f], 1))
      log("Read tlx file ", f, "/", nrow(samples_df), ": ",  samples_df$path[f])
      df
    }
    log("Merging TLX files...")
    tlx_df.all = do.call(dplyr::bind_rows, tlx_df.all)
  } else {
    for(f in 1:nrow(samples_df)) {
      log("Reading tlx file ", f, "/", nrow(samples_df), ": ",  samples_df$path[f])
      tlx_df.f = tlx_read(
        path=samples_df$path[f],
        sample=samples_df$sample[f],
        control=samples_df$control[f],
        group=samples_df$group[f],
        group_i=ifelse("group_i" %in% colnames(samples_df), samples_df$group_i[f], 1))
      tlx_df.all = dplyr::bind_rows(tlx_df.all, tlx_df.f)
    }
  }

  tlx_df.all %>%
    dplyr::inner_join(samples_df, by=c("tlx_sample"="sample"))
}

#' @export
#'
tlxcov_write_bedgraph = function(tlxcov_df, path, group, ignore.strand=T) {
  writeLines("calculating filenames(s)...")
  path = paste0(path, "XXXXXXXXXXXX")
  if(group=="all") tlxcov_df = tlxcov_df %>%
    dplyr::mutate(g_dir=paste0(dirname(path), "/", basename(path), ifelse(basename(path)=="XXXXXXXXXXXX", "", "-"))) %>%
    dplyr::mutate(g=paste0(g_dir, tlx_generate_filename_col(., include_group=F, include_sample=F, include_treatment=T, include_strand=!ignore.strand), ".bedgraph")) %>%
    dplyr::mutate(g=gsub("XXXXXXXXXXXX", "", g))
  if(group=="group") tlxcov_df = tlxcov_df %>%
    dplyr::mutate(g_dir=paste0(dirname(path), "/", basename(path), ifelse(basename(path)=="XXXXXXXXXXXX", "", "-"))) %>%
    dplyr::mutate(g=paste0(g_dir, tlx_generate_filename_col(., include_group=T, include_sample=F, include_treatment=T, include_strand=!ignore.strand), ".bedgraph")) %>%
    dplyr::mutate(g=gsub("XXXXXXXXXXXX", "", g))
  if(group=="sample") tlxcov_df = tlxcov_df %>%
    dplyr::mutate(g_dir=paste0(dirname(path), "/", basename(path), ifelse(basename(path)=="XXXXXXXXXXXX", "", "-"))) %>%
    dplyr::mutate(g=paste0(g_dir, tlx_generate_filename_col(., include_group=F, include_sample=T, include_treatment=T, include_strand=!ignore.strand), ".bedgraph")) %>%
    dplyr::mutate(g=gsub("XXXXXXXXXXXX", "", g))
  if(!ignore.strand) tlxcov_df = tlxcov_df %>% dplyr::mutate(tlxcov_pileup=ifelse(tlx_strand %in% c("+", "*"), 1, -1)*tlxcov_pileup)


  writeLines("Writing bedgraph file(s)...")
  tlxcov_df %>%
    dplyr::group_by(g) %>%
    dplyr::do((function(z){
      # z = tlxcov_df %>% dplyr::filter(g=="reports/detect_offtargets/off-Chr5_101Mb_trmnt_both.bedgraph")
      z.out = z %>% dplyr::select(tlxcov_chrom, tlxcov_start, tlxcov_end, tlxcov_pileup)
      z.path = z$g[1]
      if(!dir.exists(dirname(z.path))) dir.create(dirname(z.path), recursive=T)
      writeLines(paste0("Writing to file '", z.path, "'"))
      readr::write_tsv(z.out, file=z.path, col_names=F)
      data.frame()
    })(.))

  tlxcov_df %>%
    dplyr::select(bedgraph_path=g) %>%
    dplyr::distinct(bedgraph_path, .keep_all=T) %>%
    data.frame()
}

test = function()
{
  devtools::load_all('~/Workspace/breaktools/')
  samples_df = tlx_read_samples("~/Workspace/Datasets/HTGTS/samples/All_samples.tsv", "~/Workspace/Datasets/HTGTS") %>%
    dplyr::filter(grepl("concentration", experiment))

  tlx_df = tlx_read_many(samples_df, threads=30)
  tlx_all_df = tlx_df %>%
    dplyr::mutate(tlx_group=ifelse(tlx_group=="DMSO", "APH 0.2 uM 96h", tlx_group))
  tlx_libfactors = tlx_libfactors(tlx_all_df, group="group", normalize_within="treatment", normalize_between="all")

  samples_df = tlx_read_samples("samples.tsv", "TLX")
  tlx_df = tlx_read_many(samples_df, threads=30)
  tlx_libfactors = tlx_libfactors(tlx_all_df, normalize_within="treatment", normalize_between="treatment")
  tlxcov_df = tlx_coverage(tlx_df, group="treatment", extsize=20e3, exttype="symmertrical", libfactors_df=tlx_libfactors, ignore.strand=T)

  libsizes_df = tlx_df %>%
    tlx_libsizes(within=c("tlx_group", "tlx_control"), between="tlx_group") %>%
    tlx_libfactors_within(max(library_size)/library_size) %>%
    tlx_libfactors_between(max(library_between_samples)/library_between_samples)
}

tlx_libsizes = function(tlx_df, within=NULL, between=NULL) {
  sample_cols = setdiff(colnames(tlx_df), names(tlx_cols()$cols))
  sample_cols = setdiff(sample_cols, c("misprimed_max", "tlx_duplicated", "tlx_strand", "QSeq", "Qname", "Seq_length",  "tlx_bait_start", "tlx_bait_end", "tlx_is_bait_chrom", "tlx_is_bait_junction", "tlx_dust_dust_regions", "tlx_dust_length", "tlx_dust_coordinates", "tlx_has_dust", "tlx_dust_prop", "tlx_copynumber", "tlx_id", "tlx_is_offtarget"))
  sample_cols = unique(c(sample_cols, "library_size"))

  tlx_df %>%
    dplyr::ungroup() %>%
    dplyr::group_by(tlx_sample) %>%
    dplyr::mutate(library_size=dplyr::n()) %>%
    dplyr::select_at(sample_cols) %>%
    dplyr::slice(1) %>%
    dplyr::ungroup() %>%
    dplyr::group_by_at(within) %>%
    dplyr::mutate(library_within_group=paste0(within, "=", dplyr::cur_group(), collapse=","), library_within_samples=length(unique(tlx_sample))) %>%
    dplyr::group_by_at(between) %>%
    dplyr::mutate(library_between_group=paste0(between, "=", dplyr::cur_group(), collapse=","), library_between_samples=length(unique(tlx_sample))) %>%
    dplyr::ungroup() %>%
    dplyr::mutate(library_within_factor=1, library_between_factor=1, library_factor=1) %>%
    dplyr::arrange(library_between_group, library_within_group) %>%
  dplyr::select(tlx_group, tlx_control, tlx_sample, dplyr::starts_with("library"))
}

tlx_libfactors_within = function(libsizes_df, factor) {
  libsizes_df %>%
    dplyr::group_by(library_within_group) %>%
    dplyr::mutate(library_within_factor={{ factor }}, library_factor=library_within_factor*library_between_factor, library_within_size=library_size*library_within_factor) %>%
    dplyr::mutate(library_within_groupsize=sum(library_within_size)) %>%
    dplyr::ungroup()
}

tlx_libfactors_between = function(libsizes_df, factor) {
  libsizes_df %>%
    dplyr::group_by(library_between_group) %>%
    dplyr::mutate(library_between_factor={{ factor }}, library_factor=library_within_factor*library_between_factor) %>%
    ungroup()
}


#' @title tlx_libfactors
#' @export
#' @description Calculate normalization factors for samples
#'
#' @param tlx_df Data frame with information from multiple TLX files
#' @param normalize_within Strategy of data normalization within each group (possible values are:
#'   \strong{none} - do not perform any sample normalization
#'   \strong{group} - normalize all samples within group realative to their size
#'   \strong{treatment} - normalizes samples within each group like group option but analyze control and treatment separately)
#' @param normalize_between Strategy of data normalization between groups or treatments (possible values are:
#'   \strong{none} - do not perform any between-groups normalization
#'   \strong{treatment} - normalize treatments within each group
#'   \strong{group} - normalizes all groups and treatments accross all dataset
#' @param normalization_target Name of the function that should be used for normalization (\strong{group} - normalize to smallest group/sample, \strong{max} - normalize to largest group/sample)
#' @param normalization_samplecount Whether each group have to be additionally normalized with samples number in that group
#'
#' @seealso tlx_coverage
#'
#' @return Data frame with normalization factors for each sample
tlx_libfactors = function(tlx_df, normalize_within, normalize_between, normalization_within_target="min", normalization_between_target="min")
{
  validate_group_within(normalize_within)
  validate_group_between(normalize_between)
  validate_normalization_target(normalization_within_target)
  validate_normalization_target(normalization_between_target)

  if(normalize_within=="group") normalize_within_cols = c("tlx_group")
  if(normalize_within=="treatment") normalize_within_cols = c("tlx_group", "tlx_control")
  if(normalize_within=="none") normalize_within_cols = c("tlx_sample")
  if(normalize_between=="group") {
    normalize_group_cols = c("tlx_group")
    normalize_between_cols = ""[-1]
  }
  if(normalize_between=="treatment") {
    normalize_group_cols = c("tlx_group", "tlx_control")
    normalize_between_cols = c(normalize_within_cols, "tlx_control")
  }
  if(normalize_between=="none") {
    normalize_group_cols = c("tlx_sample")
    normalize_between_cols = normalize_within_cols
  }

  normalization_within_target_fun = match.fun(normalization_within_target)
  if(grepl("^samplecount", normalization_between_target)) {
    normalization_between_target_fun = match.fun(gsub("samplecount_", "", normalization_between_target))
  } else {
    normalization_between_target_fun = match.fun(normalization_between_target)
  }

  # Calculate library sizes for each sample and a normalization factor according to normalize argument
  libsizes_df = tlx_df %>%
    dplyr::ungroup() %>%
    dplyr::mutate(tlx_control=ifelse(tlx_control, "Control", "Treatment")) %>%
    dplyr::group_by(tlx_sample, tlx_group, tlx_group_i, tlx_control) %>%
    dplyr::summarize(library_size=dplyr::n(), .groups="keep") %>%
    dplyr::ungroup() %>%
    dplyr::group_by_at(normalize_within_cols) %>%
    dplyr::mutate(library_within_factor=normalization_within_target_fun(library_size)/library_size) %>%
    dplyr::ungroup()
  groupsizes_df = libsizes_df %>%
    dplyr::group_by_at(normalize_group_cols) %>%
    dplyr::summarize(library_between_samplecount=length(unique(tlx_sample)), library_groupsize=sum(library_size*library_within_factor)) %>%
    dplyr::ungroup() %>%
    dplyr::group_by_at(normalize_between_cols) %>%
    dplyr::mutate(
      library_between_samplefactor=normalization_between_target_fun(library_between_samplecount)/library_between_samplecount,
      library_between_junctionfactor=normalization_between_target_fun(library_groupsize)/library_groupsize,
      library_between_factor=dplyr::case_when(grepl("samplecount", normalization_between_target)~library_between_samplefactor, T~library_between_junctionfactor)) %>%
    data.frame()
  libsizes_df = libsizes_df %>%
    dplyr::inner_join(groupsizes_df, by=intersect(colnames(libsizes_df), colnames(groupsizes_df))) %>%
    dplyr::mutate(library_factor=library_within_factor*library_between_factor)

  libsizes_df
}

#' @title tlx_write_bed
#' @export
#' @description Export TLX data frame as single, multiple groupped or multiple single-sample BED file(s)
#'
#' @param tlx_df Data frame with information from multiple TLX files
#' @param path Path to a folder for exporting BED file(s)
#' @param group Data groupping strategy
#'   \strong{none} - Export each sample separately
#'   \strong{group} - Export each group separately
#'   \strong{all} - Pull data from all samples together
#' @param mode Specifies which coordinates to use in BED file. Possible values are:
#'   \strong{junction} - Single nucleotide position of a junction
#'   \strong{alignment} - Coordinates of aligned prey sequence
#' @param ignore.strand Boolean value indicating whether a separate BED file for each strand junctions should be created
#' @param include_strand Boolean value indicating whether a separate BED file for each control/treatment should be created
tlx_write_bed = function(tlx_df, path, group="all", mode="junction", ignore.strand=F, ignore.treatment=F) {
  validate_group(group)
  validate_bed_mode(mode)

  if(mode=="junction") tlx_bed_df = tlx_df %>% dplyr::mutate(start=Junction, end=Junction, name=paste0(Qname, " (", tlx_sample, ")"))
  if(mode=="alignment") tlx_bed_df = tlx_df %>% dplyr::mutate(start=Rstart, end=Rend, name=paste0(Qname, " (", tlx_sample, ")"))

  writeLines("calculating filenames(s)...")
  if(group=="all") tlx_bed_df = tlx_bed_df %>% dplyr::mutate(g="all")
  if(group=="group") tlx_bed_df = tlx_bed_df %>% dplyr::mutate(g=tlx_generate_filename_col(., include_group=T, include_sample=F, include_treatment=!ignore.treatment, include_strand=!ignore.strand))
  if(group %in% c("sample", "none")) tlx_bed_df = tlx_bed_df %>% dplyr::mutate(g=tlx_generate_filename_col(., include_group=F, include_sample=T, include_treatment=!ignore.treatment, include_strand=!ignore.strand))

  writeLines("Writing bedgraph file(s)...")
  tlx_bed_df %>%
    dplyr::group_by(g) %>%
    dplyr::do((function(z){
      z.out = z %>%
        dplyr::mutate(thickStart=dplyr::case_when(mode=="alignment"~Junction, T~start), thickEnd=dplyr::case_when(mode=="alignment"~Junction+1, T~end), score=1, rgb=dplyr::case_when(tlx_strand=="+"~"255,0,0", tlx_strand=="-"~"0,0,255", T~"0,0,0")) %>%
        dplyr::select(Rname, start, end, name, score, tlx_strand, thickStart, thickEnd, rgb) %>%
        dplyr::arrange(Rname, start, end, tlx_strand)
      z.path = paste0(path, "XXXXXXXXXXXX")
      if(!dir.exists(dirname(z.path))) dir.create(dirname(z.path), recursive=T)
      z.path = file.path(dirname(z.path), paste0(basename(z.path), ifelse(basename(z.path)=="XXXXXXXXXXXX", "", "-"), z$g[1], ".bed"))
      z.path = gsub("XXXXXXXXXXXX", "", z.path)
      writeLines(paste0("Writing to file '", z.path, "'"))
      # writeLines('track itemRgb=On visibility=2 colorByStrand="255,0,0 0,0,255"', con=z.path)
      readr::write_tsv(z.out, file=z.path, col_names=F, append=F)
      data.frame()
    })(.))

  return(NULL)
}

#' @title tlx_coverage
#' @export
#'
#' @description Calculates coverage (pileup) from tlx data frame. The coverage can be pulled from multiple samples based on \code{group} parameter and
#' normalized base factors specified for each sample in \code{libfactors_df}. The pileup is constructed upon summing overlapping
#' extended (extended based on \code{extsize} and \code{exttype} parameters) junctions.
#'
#' @param tlx_df Data frame with information from multiple TLX files
#' @param group Grouping columns. Possible values are:
#'   \strong{all} - group together all groups
#'   \strong{group} - group together all samples within each group
#'   \strong{treatment} - group together all samples within each group, separately for control and treatment
#'   \strong{none} - do not perform any groupping
#' @param extsize To build coverage (pileup) data frame it is often usefull to extend the size of the junction (which is just one nucleotide representing the double stranded break).
#'    Single nucleotide position is extended to a size large enough to overlap with other junctions, so that density could be calculated
#' @param exttype Type of extension
#'    \strong{symmetrical} - Extention is done around the center of the junction extending by \code{extsize/2} each direction
#'    \strong{along} - Extention is done in the direction corresponding to pray strand and extending \code{extsize} that direction
#'    \strong{opposite} - Extention is done opposite of pray strand direction and extending \code{extsize}
#'    \strong{none} - No extention is performed
#' @param libfactors_df A data frame with \code{tlx_sample} and \code{library_factor} columns used for sample normalization
#' @param ignore.strand Ignore strand information or calculate coverage for each strand individually
#' @param min_sample_pileup
#' @param recalculate_duplicate_samples
#'
#' @return A data frame with coverages for each sample or group
#' @examples
#' samples_df = tlx_read_samples("samples.tsv", "TLX")
#' tlx_df = tlx_read_many(samples_df, threads=30)
#' tlx_libfactors = tlx_libfactors(tlx_all_df, normalize_within="treatment", normalize_between="treatment")
#' tlxcov_df = tlx_coverage(tlx_df, group="treatment", extsize=20e3, exttype="symmertrical", libfactors_df=tlx_libfactors, ignore.strand=T)
tlx_coverage = function(tlx_df, group, extsize, exttype, libfactors_df=NULL, ignore.strand=T, min_sample_pileup=0, recalculate_duplicate_samples=T) {
  validate_group(group)
  validate_exttype(exttype)

  if(is.null(libfactors_df)) {
    libfactors_df = tlx_df %>%
      dplyr::mutate(library_factor=1) %>%
      dplyr::distinct(tlx_sample, library_factor)
  }

  if(!all(tlx_df$tlx_sample %in% libfactors_df$tlx_sample)) {
    stop(paste0("Samples are missing from libfactors_df data.frame: ", paste(setdiff(tlx_df$tlx_sample, libfactors_df$tlx_sample), collapse=", ")))
  }

  tlx_coverage_ = function(x, extsize, exttype) {
    if(exttype[1]=="along") {
      x_ranges  = GenomicRanges::makeGRangesFromDataFrame(x %>% dplyr::mutate(seqnames=Rname, start=ifelse(tlx_strand=="-", Junction-extsize, Junction-1), end=ifelse(tlx_strand=="-", Junction, Junction+extsize-1)) %>% dplyr::select(seqnames, start, end), ignore.strand=T, keep.extra.columns=T)
    } else {
      if(exttype[1]=="opposite") {
        x_ranges  = GenomicRanges::makeGRangesFromDataFrame(x %>% dplyr::mutate(seqnames=Rname, start=ifelse(tlx_strand=="+", Junction-extsize, Junction-1), end=ifelse(tlx_strand=="+", Junction, Junction+extsize-1)) %>% dplyr::select(seqnames, start, end), ignore.strand=T, keep.extra.columns=T)
      } else {
        if(exttype[1]=="symmetrical") {
          x_ranges  = GenomicRanges::makeGRangesFromDataFrame(x %>% dplyr::mutate(seqnames=Rname, start=Junction-ceiling(extsize/2), end=Junction+ceiling(extsize/2)) %>% dplyr::select(seqnames, start, end), ignore.strand=T, keep.extra.columns=T)
        } else {
          x_ranges  = GenomicRanges::makeGRangesFromDataFrame(x %>% dplyr::mutate(seqnames=Rname, start=Junction, end=Junction+1) %>% dplyr::select(seqnames, start, end), ignore.strand=T, keep.extra.columns=T)
        }
      }
    }

    cov_ranges = as(GenomicRanges::coverage(x_ranges), "GRanges")
    ret_df = as.data.frame(cov_ranges) %>%
      dplyr::rename(tlxcov_chrom="seqnames", tlxcov_start="start", tlxcov_end="end", tlxcov_pileup="score") %>%
      dplyr::inner_join(x %>% dplyr::distinct(Rname, tlxcov_is_bait_chrom=tlx_is_bait_chrom), by=c("tlxcov_chrom"="Rname")) %>%
      dplyr::select(matches("tlxcov_"))
    ret_df
  }

  group_cols = tlx_get_group_cols(group, ignore.strand=ignore.strand)

  # Calculate coverage for each sample
  writeLines("Calculating each sample coverage...")
  tlxcov_cols = c("tlx_group", "tlx_group_i", "tlx_sample", "tlx_control", "tlx_path", "tlx_strand")
  if(ignore.strand) tlxcov_cols = setdiff(tlxcov_cols, "tlx_strand")
  if(recalculate_duplicate_samples) {
    tlxcov_df = tlx_df %>%
      # dplyr::filter(Rname=="chr1" & Junction>=9970000 & Junction<=10030000 & tlx_sample%in% c("VI043")) %>%
      dplyr::group_by_at(tlxcov_cols) %>%
      dplyr::do(tlx_coverage_(., extsize=extsize, exttype=exttype)) %>%
      dplyr::ungroup() %>%
      dplyr::mutate(tlxcov_pileup=ifelse(tlxcov_pileup<min_sample_pileup, 0, tlxcov_pileup))
  } else {
    sample_cols = "tlx_sample"
    if(!ignore.strand) sample_cols = c(sample_cols, "tlx_strand")
    tlxcov_sample_df = tlx_df %>%
      dplyr::distinct(tlx_sample, tlx_strand, Rname, Qname, .keep_all=T) %>%
      dplyr::group_by_at(sample_cols) %>%
      dplyr::do(tlx_coverage_(., extsize=extsize, exttype=exttype)) %>%
      dplyr::ungroup() %>%
      dplyr::mutate(tlxcov_pileup=ifelse(tlxcov_pileup<min_sample_pileup, 0, tlxcov_pileup))
    tlxcov_df = tlx_df %>%
      dplyr::rename(tlxcov_chrom="Rname") %>%
      dplyr::distinct_at(c(tlxcov_cols, "tlxcov_chrom")) %>%
      dplyr::inner_join(tlxcov_sample_df, by=c(sample_cols, "tlxcov_chrom"))
  }

  tlxcov_df = tlxcov_df %>%
    dplyr::left_join(libfactors_df %>% dplyr::select(tlx_sample, library_factor), by="tlx_sample") %>%
    dplyr::mutate(tlxcov_pileup.norm=tlxcov_pileup*library_factor)

  # Summarize group coverage by summing all samples in the group with each sample having a weight decided by library size
  writeLines("Adding up coverages from sample(s)...")
  tlxcov_ret_df = tlxcov_df %>%
    dplyr::group_by_at(group_cols) %>%
    dplyr::do((function(z){
      zz <<- z
      z_ranges = GenomicRanges::makeGRangesFromDataFrame(z %>% dplyr::mutate(seqnames=tlxcov_chrom, start=tlxcov_start, end=tlxcov_end), ignore.strand=T, keep.extra.columns=T)
      cov_ranges = as(GenomicRanges::coverage(z_ranges, weight=z$tlxcov_pileup.norm), "GRanges")
      ret_df = as.data.frame(cov_ranges) %>%
        dplyr::rename(tlxcov_chrom="seqnames", tlxcov_start="start", tlxcov_end="end", tlxcov_pileup="score") %>%
        dplyr::select(matches("tlxcov_"))
      ret_df
    })(.)) %>%
    dplyr::ungroup()
  if(!("tlx_strand" %in% colnames(tlxcov_ret_df))) {
    tlxcov_ret_df$tlx_strand = "*"
  }
  tlxcov_ret_df
}

#' @title tlx_remove_rand_chromosomes
#' @export
#'
#' @param tlx_df Data frame with information from multiple TLX files
#' @description Remove non-conventional WGS sequence scaffolds (like Chr4_JH584294_random)
#' @seealso tlx_mark_rand_chromosomes
#'
#' @return Data frame with rand scaffolds removed
tlx_remove_rand_chromosomes = function(tlx_df) {
  tlx_df %>%
    dplyr::filter(Rname %in% paste0("chr", c(1:40, "X", "Y")))
}

#' @title tlx_mark_rand_chromosomes
#' @export
#'
#' @description Mark non-conventional WGS sequence scaffolds (like Chr4_JH584294_random). Junctions detected in these
#' chromosomes are marked appropriately in \code{tlx_is_rand_chrom} column
#'
#' @param tlx_df Data frame with information from multiple TLX files
#' @seealso tlx_remove_rand_chromosomes
#' @return Data frame with rand scaffolds removed
tlx_mark_rand_chromosomes = function(tlx_df) {
  tlx_df %>%
    dplyr::mutate(tlx_is_rand_chrom = !(Rname %in% paste0("chr", c(1:40, "X", "Y"))))
}


#' @title tlx_calc_copynumber
#' @export
#' @description Calculate copy number for each pray sequence in reference genome and save this value in \code{tlx_copynumber} column. The results are cached.
#'
#' @param tlx_df Data frame with information from multiple TLX files
#' @param bowtie2_index Path to bowtie2 index
#' @param max_hits Maximum number of matches to search for using bowtie2
#' @param threads Number of threads used in bowtie2 when aligning pray sequences
#' @param tmp_dir Path to folder with cached results
#' @param max_age Max age of files in seconds. Remove everything older than specified age
#'
#' @return Data frame with copy number in \code{tlx_copynumber} column
tlx_calc_copynumber = function(tlx_df, bowtie2_index, max_hits=500, threads=8, tmp_dir="tmp", max_age=NULL) {
  if(!bedr::check.binary("bowtie2")) {
    stop("dustmasker executible not found")
  }

  dir.create(tmp_dir, recursive=T, showWarnings=F)
  clear_tmpdir(tmp_dir=tmp_dir, max_age=max_age)

  qseq_vec = unique(tlx_df$QSeq)
  qseq_hash = openssl::md5(paste0(qseq_vec, collapse=""))
  qseq_fasta = file.path(tmp_dir, paste0(qseq_hash, ".fa"))
  qseq_count = file.path(tmp_dir, paste0(qseq_hash, ".count"))
  qseq_cumcount = file.path(tmp_dir, paste0(qseq_hash, ".cumcount"))

  if(!file.exists(qseq_cumcount)) {
    if(!file.exists(qseq_count)) {
      if(!file.exists(qseq_fasta)) {
        writeLines(with(tlx_df, paste0(">", qseq_vec, "\n", qseq_vec)), con=qseq_fasta)
      }

      writeLines("Using bowtie2 to find number of copies in reference fasta file...")
      cmd = paste0("bowtie2 -f -x ", bowtie2_index, " -U ", qseq_fasta ," -k ", max_hits, " --threads ", threads, " -S ", qseq_count)
      writeLines(paste0("> ", cmd))
      system(cmd)
    }

    qseq_count_df = readr::read_tsv(qseq_count, col_names=F, skip=68)
    qseq_cumcount_df = qseq_count_df %>%
      dplyr::group_by(X1) %>%
      dplyr::summarize(n=dplyr::n())%>%
      setNames(c("QSeq", "tlx_copynumber"))
    readr::write_tsv(qseq_cumcount_df, file=qseq_cumcount)
  } else {
    qseq_cumcount_df = readr::read_tsv(qseq_cumcount)
  }

  Sys.setFileTime(qseq_cumcount, Sys.time())

  tlx_df %>%
    dplyr::select(-dplyr::matches("tlx_copynumber")) %>%
    dplyr::left_join(qseq_cumcount_df, by="QSeq") %>%
    dplyr::mutate(tlx_copynumber=pmax(1, tidyr::replace_na(tlx_copynumber, 0)))
}


#' @title tlx_mark_dust
#' @export
#' @description Predict "dust" using dustmasker
#'
#' @param tlx_df Data frame with information from multiple TLX files
#' @param tmp_dir Path to folder with cached results
#' @param max_age Max age of files in seconds. Remove everything older than specified age
#'
#' @return Data frame with dust marked using \code{tlx_has_dust} column
tlx_mark_dust = function(tlx_df, tmp_dir="tmp", max_age=NULL) {
  if(!bedr::check.binary("dustmasker")) {
    stop("dustmasker executible not found")
  }

  dir.create(tmp_dir, recursive=T, showWarnings=F)
  clear_tmpdir(tmp_dir=tmp_dir, max_age=max_age)

  qseq_vec = unique(tlx_df$QSeq)
  qseq_hash = openssl::md5(paste(qseq_vec, collapse=""))
  qseq_fasta = file.path(tmp_dir, paste0(qseq_hash, ".fa"))
  qseq_dust = file.path(tmp_dir, paste0(qseq_hash, ".dust"))

  if(!file.exists(qseq_dust)) {
    if(!file.exists(qseq_fasta)) {
      writeLines("Writing temporary fasta file with sequences...")
      writeLines(with(tlx_df, paste0(">", QSeq, "\n", QSeq)), con=qseq_fasta)
    }
    writeLines("Using dustmasker to calculate low complexity regions...")
    system(paste0("dustmasker -in ", qseq_fasta, " -out ", qseq_dust, " -outfmt acclist"))
  }

  dust_cols = readr::cols(QSeq=readr::col_character(), dust_start=readr::col_double(), dust_end=readr::col_double())
  tlx_dust_df = readr::read_tsv(qseq_dust, col_names=names(dust_cols$cols), col_types=dust_cols) %>%
    dplyr::mutate(dust_length=dust_end-dust_start+1, QSeq=gsub("^>", "", QSeq)) %>%
    dplyr::group_by(QSeq) %>%
    dplyr::summarize(tlx_dust_dust_regions=dplyr::n(), tlx_dust_length=sum(dust_length), tlx_dust_coordinates=paste0(dust_start, "-", dust_end, collapse="; ")) %>%
    dplyr::mutate(tlx_has_dust=T) %>%
    dplyr::ungroup()
  tlx_df = tlx_df %>%
    dplyr::select(-dplyr::matches("tlx_dust_dust_regions|tlx_dust_length|tlx_dust_prop|tlx_dust_coordinates|tlx_has_dust")) %>%
    dplyr::left_join(tlx_dust_df, by="QSeq") %>%
    dplyr::mutate(tlx_has_dust=tidyr::replace_na(tlx_has_dust, F), tlx_dust_prop=tlx_dust_length/Seq_length)
  Sys.setFileTime(qseq_dust, Sys.time())

  tlx_df
}

#' @export
tlx_identify_baits = function(tlx_df, genome_fasta="") {
  if(is.null(tlx_df) || nrow(tlx_df)==0) {
    return(data.frame(bait_sample=NA, bait_chrom=NA, bait_strand=NA, bait_start=NA, bait_end=NA) %>% dplyr::slice(0))
  }

  baits_df = tlx_df %>%
    dplyr::group_by(tlx_group, tlx_sample, tlx_bait_chrom, tlx_bait_strand, tlx_bait_start, tlx_bait_end) %>%
    dplyr::summarize(n=dplyr::n()) %>%
    dplyr::ungroup() %>%
    dplyr::arrange(dplyr::desc(n)) %>%
    dplyr::select(bait_group=tlx_group, bait_sample=tlx_sample, bait_chrom=tlx_bait_chrom, bait_strand=tlx_bait_strand, bait_start=tlx_bait_start, bait_end=tlx_bait_end)

  if(genome_fasta!="") {
    if(!file.exists(genome_fasta)) {
      log("Could not find genome file '", genome_fasta, "'")
    }
    baits_ranges = GenomicRanges::makeGRangesFromDataFrame(baits_df %>% dplyr::select(seqnames=bait_chrom, start=bait_start, end=bait_end, strand=bait_strand))
    baits_df$bait_sequence = get_seq(genome_fasta, baits_ranges)$sequence
  }

  baits_df
}

#' @export
tlx_test_hits = function(tlx_df, hits_ranges, paired_samples=T, paired_controls=T, extsize=10000, exttype="along") {
  validate_exttype(exttype)

  if(exttype[1]=="along") {
    tlx_ranges  = GenomicRanges::makeGRangesFromDataFrame(tlx_df %>% dplyr::mutate(seqnames=Rname, sstart=ifelse(Strand=="-1", Junction-extsize, Junction-1), end=ifelse(Strand=="-1", Junction, Junction+extsize-1)), ignore.strand=T, keep.extra.columns=T)
  } else {
    if(exttype[1]=="symmetrical") {
      tlx_ranges  = GenomicRanges::makeGRangesFromDataFrame(tlx_df %>% dplyr::mutate(seqnames=Rname, start=Junction-ceiling(extsize/2), end=Junction+ceiling(extsize/2)), ignore.strand=T, keep.extra.columns=T)
    } else {
      tlx_ranges  = GenomicRanges::makeGRangesFromDataFrame(tlx_df %>% dplyr::mutate(seqnames=Rname, start=Junction, end=Junction+1), ignore.strand=T, keep.extra.columns=T)
    }
  }

  hits_df = as.data.frame(hits_ranges) %>% dplyr::mutate(compare_chrom=seqnames, compare_start=start, compare_end=end)
  hits_ranges = GenomicRanges::makeGRangesFromDataFrame(hits_df, keep.extra.columns=T)

  hits_ranges_reduced = GenomicRanges::makeGRangesFromDataFrame(as.data.frame(GenomicRanges::reduce(hits_ranges)) %>% dplyr::mutate(compare_chrom=seqnames, compare_start=start, compare_end=end), keep.extra.columns=T)
  hits_reduced_df = as.data.frame(hits_ranges_reduced) %>% dplyr::select(compare_chrom, compare_start, compare_end)

  tlxsum_df = tlx_df %>%
    dplyr::group_by(tlx_sample, .drop=F) %>%
    dplyr::summarize(compare_total=sum(!tlx_is_bait_junction)) %>%
    dplyr::ungroup()

  # Prepare overlap counts table (add compare_n=0 for missing entries)
  counts_df_incomplete = as.data.frame(IRanges::mergeByOverlaps(hits_ranges_reduced, tlx_ranges)) %>%
    dplyr::rename(compare_group="tlx_group", compare_group_i="tlx_group_i", compare_sample="tlx_sample") %>%
    dplyr::group_by(compare_chrom, compare_start, compare_end, compare_group, compare_group_i, compare_sample, tlx_control, .drop=F) %>%
    dplyr::summarize(compare_n=dplyr::n())
  counts_df = dplyr::bind_rows(
    counts_df_incomplete,
    hits_reduced_df %>%
      dplyr::select(compare_chrom, compare_start, compare_end) %>%
      tidyr::crossing(tlx_df %>% dplyr::distinct(compare_group=tlx_group, compare_group_i=tlx_group_i, compare_sample=tlx_sample, tlx_control)) %>%
      dplyr::mutate(compare_n=0)) %>%
    dplyr::distinct(compare_chrom, compare_start, compare_end, compare_group, compare_group_i, compare_sample, tlx_control, .keep_all=T) %>%
    dplyr::inner_join(tlxsum_df, by=c("compare_sample"="tlx_sample")) %>%
    data.frame()

  #
  # Calculate breaks count adjusted with control (by substracting control breaks)
  #
  counts_df.input = counts_df %>% dplyr::filter(!tlx_control)
  counts_df.control = counts_df %>% dplyr::filter(tlx_control)
  if(paired_controls) {
   normcounts_df = counts_df.input %>%
     dplyr::inner_join(counts_df.control %>% select(compare_chrom, compare_start, compare_end, compare_group, compare_group_i, compare_total.control=compare_total, compare_n.control=compare_n), by=c("compare_chrom", "compare_start", "compare_end", "compare_group", "compare_group_i")) %>%
     dplyr::mutate(compare_n.control_adj=compare_n.control*(compare_total/compare_total.control),  compare_n.norm=compare_n-compare_n.control_adj, compare_frac.norm=compare_n.norm/compare_total, compare_frac=compare_n/compare_total) %>%
     dplyr::arrange(compare_group, compare_group_i)
  } else {
   normcounts_df = counts_df.input %>%
     dplyr::left_join(counts_df.control %>% dplyr::group_by(compare_chrom, compare_start, compare_end, compare_group) %>% summarize(compare_total.control=sum(compare_total), compare_n.control=sum(compare_n)), by=c("compare_chrom", "compare_start", "compare_end", "compare_group")) %>%
     dplyr::mutate(compare_total.control=ifelse(!is.na(compare_total.control), compare_total.control, compare_total), compare_n.control=tidyr::replace_na(compare_n.control, 0)) %>%
     dplyr::mutate(compare_n.control_adj=compare_n.control*(compare_total/compare_total.control),  compare_n.norm=compare_n-compare_n.control_adj, compare_frac.norm=compare_n.norm/compare_total, compare_frac=compare_n/compare_total) %>%
     dplyr::arrange(compare_group, compare_group_i)
  }

  if(paired_samples)
  {
    normcounts_sum_df = normcounts_df %>%
      dplyr::select(compare_chrom, compare_start, compare_end, compare_group, compare_group_i, compare_n.norm, compare_n, compare_total, compare_n.control, compare_total.control, compare_n.control_adj)
  } else {
    normcounts_sum_df = normcounts_df %>%
      dplyr::group_by(compare_chrom, compare_start, compare_end, compare_group) %>%
      dplyr::summarize(compare_group_i=1, compare_n=sum(compare_n), compare_n.norm=sum(compare_n.norm), compare_total=sum(compare_total), compare_n.control=sum(compare_n.control), compare_total.control=sum(compare_total.control)) %>%
      dplyr::mutate(compare_n.control_adj=compare_n.control*(compare_total/compare_total.control))
  }

  if(length(unique(tlx_df$tlx_group)))
  {
    z_sum.test = as.data.frame(t(apply(combn(unique(normcounts_sum_df$compare_group), 2), 2, sort))) %>%
      dplyr::rename(compare_group1="V1", compare_group2="V2") %>%
      dplyr::inner_join(normcounts_sum_df %>% dplyr::rename(compare_n.norm1="compare_n.norm", compare_n1="compare_n", compare_total1="compare_total", compare_n.control1="compare_n.control", compare_n.control_adj1="compare_n.control_adj", compare_total.control1="compare_total.control"), by=c("compare_group1"="compare_group")) %>%
      dplyr::inner_join(normcounts_sum_df %>% dplyr::rename(compare_n.norm2="compare_n.norm", compare_n2="compare_n", compare_total2="compare_total", compare_n.control2="compare_n.control", compare_n.control_adj2="compare_n.control_adj", compare_total.control2="compare_total.control"), by=c("compare_group2"="compare_group", "compare_group_i", "compare_chrom", "compare_start", "compare_end")) %>%
      dplyr::group_by(compare_group1, compare_group2, compare_chrom, compare_start, compare_end) %>%
      dplyr::do((function(z){
        zz<<-z

        z.groups = c(z$compare_group1[1], z$compare_group2[1])
        z.fold = mean(z$compare_n1/z$compare_total1) / mean(z$compare_n2/z$compare_total2)

        # Reapeated measures ANOVA
        z.test_data = z %>%
          reshape2::melt(measure.vars=c("compare_n1", "compare_n.control_adj1", "compare_n2", "compare_n.control_adj2")) %>%
          dplyr::select(compare_group_i, treatment=variable, breaks=value) %>%
          dplyr::mutate(group=z.groups[as.numeric(gsub(".*([0-9])$", "\\1", treatment))], treatment=gsub("([0-9])$", "", treatment)) %>%
          dplyr::mutate(treatment=c(compare_n.control="control", compare_n="treatment", compare_n.control_adj="control")[treatment]) %>%
          dplyr::mutate(group=factor(group), treatment=factor(treatment), compare_group_i=factor(compare_group_i)) %>%
          tibble::tibble()
        z.aov = rstatix::anova_test(data=z.test_data, dv=breaks, wid=compare_group_i, within=c(treatment, group))
        z.aov_pval = data.frame(z.aov) %>% dplyr::filter(Effect=="group") %>% .$p

        i.contignency = lapply(split(z, 1:nrow(z)), function(y) matrix(as.numeric(y[c("compare_n1", "compare_n2", "compare_total1", "compare_total2")]), ncol=2))
        if(length(i.contignency)>=2) {
          i.contignency = abind::abind(i.contignency, along=3)
          i.test = mantelhaen.test(i.contignency)
        } else {
          i.test = fisher.test(i.contignency[[1]])
        }

        z %>%
          dplyr::slice(1) %>%
          dplyr::mutate(compare_pvalue=i.test$p.value, compare_odds=i.test$estimate, compare_aov_pvalue=z.aov_pval, compare_fold=z.fold) %>%
          dplyr::select(compare_group1, compare_group2, compare_chrom, compare_start, compare_end, compare_pvalue, compare_odds, compare_aov_pvalue, compare_fold)
      })(.)) %>%
      data.frame()
  } else {
    compare_test.cols = readr::cols(compare_group1=readr::col_character(), compare_group2=readr::col_character(), compare_chrom=readr::col_character(), compare_start=readr::col_double(), compare_end=readr::col_double(), compare_pvalue=readr::col_double(), compare_odds=readr::col_double(), compare_aov_pvalue=readr::col_double(), compare_fold=readr::col_double())
    z_sum.test = blank_tibble(compare_test.cols)
  }

  list(test=z_sum.test, data=normcounts_df)
}

#' @export
tlx_extract_bait = function(tlx_df, bait_size, bait_region) {
  tlx_df %>%
    dplyr::select(-dplyr::matches("^(tlx_bait_chrom|tlx_bait_start|tlx_bait_end|tlx_bait_strand|tlx_is_bait_junction)$")) %>%
    dplyr::group_by(tlx_group, tlx_sample, B_Rname, B_Strand) %>%
    dplyr::mutate(misprimed_max=max(misprimed-uncut, na.rm=T)) %>%
    dplyr::mutate(tlx_bait_start=ifelse(B_Strand<0, B_Rstart + misprimed - misprimed_max-2, B_Rend - misprimed + misprimed_max)) %>%
    dplyr::mutate(tlx_bait_end=tlx_bait_start+bait_size - 1) %>%
    dplyr::mutate(tlx_bait_chrom=B_Rname, tlx_bait_strand=ifelse(B_Strand<0, "-", "+")) %>%
    dplyr::ungroup() %>%
    dplyr::mutate(tlx_is_bait_chrom=B_Rname==Rname) %>%
    dplyr::mutate(tlx_is_bait_junction=B_Rname==Rname & (abs(tlx_bait_start-Junction)<=bait_region/2 | abs(tlx_bait_end-Junction)<=bait_region/2))
}

#' @export
tlx_mark_offtargets = function(tlx_df, offtargets_df, offtarget_region=1000, bait_region=1000) {
  tlx_df$tlx_id = 1:nrow(tlx_df)
  tlx_bait_ranges = tlx_df %>% df2ranges(tlx_bait_chrom, tlx_bait_start, tlx_bait_end)
  tlx_junc_ranges = tlx_df %>% df2ranges(Rname, Junction, Junction)
  offtargets_bait_ranges = offtargets_df %>% dplyr::mutate(bait_region=bait_region) %>% df2ranges(offtarget_bait_chrom, offtarget_bait_start-bait_region/2, offtarget_bait_end+bait_region/2)
  offtargets_offt_ranges = offtargets_df %>% dplyr::mutate(offtarget_region=offtarget_region) %>% df2ranges(offtarget_chrom, offtarget_start-offtarget_region/2, offtarget_end+offtarget_region/2)

  tlx_offtarget_ids = as.data.frame(IRanges::findOverlaps(tlx_bait_ranges, offtargets_bait_ranges)) %>%
    dplyr::rename(tlx_id="queryHits", o2b_id="subjectHits") %>%
    dplyr::inner_join(as.data.frame(IRanges::findOverlaps(tlx_junc_ranges, offtargets_offt_ranges)), by=c(tlx_id="queryHits", o2b_id="subjectHits")) %>%
    dplyr::distinct(tlx_id) %>%
    .$tlx_id

  tlx_df$tlx_is_offtarget = tlx_df$tlx_id %in% tlx_offtarget_ids | tlx_df$tlx_is_bait_junction

  tlx_df
}


#' @export
tlx_strand_crosscorrelation = function(tlxcov_df, step=1000, max_rellag=0.25, min_points=5, method="pearson", negative_correlation=F, negative_lag=T)
{
  z.range = c(min(tlxcov_df$tlxcov_start), max(tlxcov_df$tlxcov_end-1))
  z.sense = tlxcov_df %>%
    dplyr::filter(tlx_strand=="+") %>%
    dplyr::mutate(start=tlxcov_start, end=tlxcov_end-1) %>%
    reshape2::melt(measure.vars=c("start", "end"), value.name="position") %>%
    dplyr::arrange(position) %>%
    dplyr::distinct(position, .keep_all=T)
  z.anti = tlxcov_df %>%
    dplyr::filter(tlx_strand=="-") %>%
    dplyr::mutate(start=tlxcov_start, end=tlxcov_end-1) %>%
    reshape2::melt(measure.vars=c("start", "end"), value.name="position") %>%
    dplyr::arrange(position) %>%
    dplyr::distinct(position, .keep_all=T)

  if(nrow(z.sense)>=min_points & nrow(z.anti)>=min_points) {
    pos = seq(z.range[1], z.range[2], by=step)
    p.sense = approx(x=z.sense$position, y=z.sense$tlxcov_pileup, xout=pos, yleft=0, yright=0)$y
    p.anti = approx(x=z.anti$position, y=z.anti$tlxcov_pileup, xout=pos, yleft=0, yright=0)$y

    n = length(p.sense)
    max_lag = floor(length(p.anti)*max_rellag)
    ccf.sense = data.frame(lag=-max_lag:max_lag)
    cor_fun = function(t) {
      tt<<-t
      p.cor = list(estimate=NA_real_, p.value=NA_real_)
      if(t>0) {
        if(length(unique(na.omit(p.sense[(1 + t):n])))>=min_points & length(unique(na.omit(p.anti[1:(n - t)])))>=min_points) {
          p.cor = cor.test(p.sense[(1 + t):n], p.anti[1:(n - t)], method=method)
        }
      } else {
        if(t<0) {
          if(length(unique(na.omit(p.sense[1:(n + t)])))>=min_points & length(unique(na.omit(p.anti[(1 - t):n])))>=min_points) {
            p.cor = cor.test(p.sense[1:(n + t)], p.anti[(1 - t):n], method=method)
          }
        } else {
          if(length(unique(na.omit(p.sense)))>=min_points & length(unique(na.omit(p.anti)))>=min_points) {
            p.cor = cor.test(p.sense, p.anti, method=method)
          }
        }
      }
      data.frame(acf=p.cor$estimate, pvalue=p.cor$p.value)
    }
    ccf.sense = cbind(ccf.sense, do.call(rbind, lapply(ccf.sense$lag, cor_fun)))
    # plot(y=ccf.sense$acf, x=ccf.sense$lag, type="l")

    # ccf.sense = stats::ccf(p.sense, p.anti,type="covariance", lag.max=max_lag, plot=F, na.action=stats::na.pass)
    acf_values = ccf.sense$acf
    if(!negative_lag) acf_values[ccf.sense$lag<0] = NA_real_
    if(negative_correlation) {
      # t = ccf.sense$lag[f]
      # 1 / length(p.sense) * sum((p.sense[(1 + t):length(p.sense)] - mean(p.sense)) * (p.anti[1:(length(p.sense) - t)] - mean(p.anti)))
      f = which.max(abs(acf_values))
    } else {
      f = which.max(acf_values)
    }

    if(length(f)==0) {
      res = data.frame(crosscorrelation_lag=NA_real_, crosscorrelation_rellag=NA_real_, crosscorrelation_value=NA_real_, crosscorrelation_cor=NA_real_, crosscorrelation_cor_pvalue=NA_real_)
    } else {
      res = data.frame(crosscorrelation_lag=ccf.sense$lag[f]*step, crosscorrelation_rellag=ccf.sense$lag[f]/length(p.sense), crosscorrelation_value=ccf.sense$acf[f], crosscorrelation_cor=ccf.sense$acf[f], crosscorrelation_cor_pvalue=ccf.sense$pvalue[f])
    }
  } else {
    res = data.frame(crosscorrelation_lag=NA_real_, crosscorrelation_rellag=NA_real_, crosscorrelation_value=NA_real_, crosscorrelation_cor=NA_real_, crosscorrelation_cor_pvalue=NA_real_)
  }

  res
}



#' @export
tlx_mark_repeats = function(tlx_df, repeatmasker_df) {
  # @todo: make group_by faster using data.table
  repeatmasker_ranges = GenomicRanges::makeGRangesFromDataFrame(repeatmasker_df %>% dplyr::mutate(seqnames=repeatmasker_chrom, start=repeatmasker_start, end=repeatmasker_end), keep.extra.columns=T)
  tlx_df = tlx_df %>%
    dplyr::mutate(tlx_id=1:n()) %>%
    dplyr::select(-dplyr::matches("tlx_repeatmasker_")) %>%
    dplyr::mutate(seqnames=Rname, start=Rstart, end=Rend)
  tlx_ranges = GenomicRanges::makeGRangesFromDataFrame(tlx_df, keep.extra.columns=T, ignore.strand=T)
  r1 = as.data.frame(IRanges::findOverlaps(tlx_ranges, repeatmasker_ranges)) %>%
    dplyr::inner_join(repeatmasker_df, by=c("subjectHits"="repeatmasker_id"))

  data.table::setDT(r1)[,list(tlx_repeatmasker_class=paste0(unique(repeatmasker_class),collapse=", ")), by=list(queryHits)] %>%
    dplyr::right_join(tlx_df, by=c("queryHits"="tlx_id")) %>%
    dplyr::select(-queryHits) %>%
    data.frame()
}

#' @export
geom_tlxcov = function(tlxcov_df, grouping_cols=c("rdc_chrom", "rdc_name", "tlx_strand"), scale=1, alpha=0.7) {
  distinct_cols = unique(c(grouping_cols, "tlxcov_chrom", "tlxcov_pos", "tlx_strand"))
  tlxcov_extdf = tlxcov_df %>%
    dplyr::arrange(tlxcov_chrom, tlx_strand, tlxcov_end) %>%
    dplyr::group_by_at(unique(c(grouping_cols, "tlxcov_chrom", "tlx_strand"))) %>%
    # dplyr::filter(tlxcov_start - dplyr::lag(tlxcov_end)<0 & tlxcov_strand=="+")
    dplyr::mutate(tlxcov_diff=tlxcov_start - dplyr::lag(tlxcov_end)) %>%
    dplyr::ungroup() %>%
    dplyr::filter(tlxcov_diff<0)

  if(length(which(tlxcov_extdf$tlxcov_diff<0))>0) stop("Overlapping tlxcov regions")
  x_area = tlxcov_df %>%
    reshape2::melt(measure.vars=c("tlxcov_start", "tlxcov_end"), value.name="tlxcov_pos") %>%
    dplyr::distinct_at(distinct_cols, .keep_all=T)

  geom_ribbon(aes(x=tlxcov_pos, ymin=0, ymax=tlxcov_pileup*scale, fill=tlx_strand), alpha=alpha, data=x_area)
}


#' @export
tlxcov_macs2 = function(tlxcov_df, group, bgmodel_df, params, extended_islands=F, extended_islands_dist=1e6, extended_islands_significance=0.1, debug_plots=F) {
  validate_group(group)
  group_cols = tlx_get_group_cols(group, ignore.strand=T, ignore.control=T)

  results_df = tlxcov_df %>%
    dplyr::group_by_at(group_cols) %>%
    dplyr::mutate(macs2_group_id=dplyr::cur_group_id()) %>%
    dplyr::ungroup() %>%
    dplyr::mutate(macs2_group_count=max(macs2_group_id)) %>%
    dplyr::group_by_at(group_cols) %>%
    dplyr::do((function(z){
      zz<<-z

      tlxcov_ranges = z %>% dplyr::mutate(score=tlxcov_pileup) %>% df2ranges(tlxcov_chrom, tlxcov_start, tlxcov_end, tlx_strand)
      sample_ranges = tlxcov_ranges[!tlxcov_ranges$tlx_control]
      control_ranges = tlxcov_ranges[tlxcov_ranges$tlx_control]

      group_desc = paste0(group_cols, rep("=", length(group_cols)), sapply(distinct(z[,group_cols]), as.character), collapse=",")
      writeLines(paste0("Running MACS for group ", z$macs2_group_id[1], "/", z$macs2_group_count[1], " {", group_desc, "} | ", "Finding islands with ", length(sample_ranges), " sample value points and ", length(control_ranges), " control value points..."))
      if(length(sample_ranges)==0) {
        stop("Sample data not found!")
      }

      z.bgmodel_df = bgmodel_df %>% dplyr::filter(tlx_group==z$tlx_group[1]) %>% dplyr::select(-dplyr::matches("tlx_group"))
      results = macs2_coverage(sample_ranges=sample_ranges, control_ranges=control_ranges, params=params, extended_islands=extended_islands, extended_islands_dist=extended_islands_dist, extended_islands_significance=extended_islands_significance, bgmodel_df=z.bgmodel_df, debug_plots=debug_plots)
      results_df = dplyr::bind_cols(z[1,group_cols], dplyr::bind_rows(results[["islands"]], results[["qvalues"]])) %>%
        dplyr::relocate(dplyr::matches("qvalue_|island_"), .after=tidyselect::last_col())
      results_df
    })(.)) %>%
    dplyr::ungroup()

  islands_df = results_df %>%
    dplyr::filter_at(dplyr::vars(dplyr::starts_with("island_")), dplyr::any_vars(!is.na(.))) %>%
    dplyr::select(-dplyr::starts_with("qvalue_")) %>%
    dplyr::mutate(island_name=paste0("RDC_", stringr::str_pad((0:(dplyr::n()))[-1], 3, pad="0"))) %>%
    dplyr::relocate(island_name) %>%
    dplyr::select(dplyr::matches(paste0("^", paste(group_cols, collapse="|"), "$")), dplyr::starts_with("island_"))

  qvalues_df = results_df %>%
    dplyr::filter_at(dplyr::vars(dplyr::starts_with("qvalue_")), dplyr::any_vars(!is.na(.))) %>%
    dplyr::select(-dplyr::starts_with("island_"))
  list(qvalues=qvalues_df, islands=islands_df)
}
brainbreaks/breaktools documentation built on Jan. 3, 2023, 3:32 a.m.