R/trackplot.R

Defines functions FUN extract_summary profile_plot profile_extract track_plot track_extract

Documented in extract_summary profile_extract profile_plot track_extract track_plot

# This R script contains functions for bigWig visualization
#
# track_extract() -> track_plot() : to generate IGV style track plots (aka locus plots) from bigWig files.
# profile_extract() -> profile_plot() : to generate profile-plots from bigWig files.
# extract_signal() -> pca_plot() : to perform PCA analysis based on genomic regions of interest or around TSS sites.
#
# Source code: https://github.com/PoisonAlien/trackplot
#
# MIT License
# Copyright (c) 2020 Anand Mayakonda <anandmt3@gmail.com>
#
# Version: 1.3.0
#
# Changelog:
# Version: 1.0.0 [2020-11-27]
#   * Initial release
# Version: 1.1.0 [2020-12-01]
#   * Added profileplot()
# Version: 1.1.1 [2020-12-04]
#   * trackplot() now plots ideogram of target chromosome
# Version: 1.1.11 [2020-12-07]
#   * Bug fixes in profileplot(): Typo for .check_dt() and startFrom estimation
# Version: 1.2.0 [2020-12-09]
#   * Added bwpcaplot()
# Version: 1.3.0 [2021-03-26]
#   * modularize the code base to avoid repetitve data extraction and better plotting

#------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

#' Extract bigWig track data for the given loci
#' @param bigWigs bigWig files. Default NULL. Required.
#' @param loci target region to plot. Should be of format "chr:start-end". e.g; chr3:187715903-187752003 OR chr3:187,715,903-187,752,003
#' @param binsize bin size to extract signal. Default 50 (bps).
#' @param nthreads Default 1. Number of threads to use.
#' @param custom_names Default NULL and Parses from the file names.
#' @export
track_extract = function(bigWigs = NULL, loci = NULL, binsize = 50, custom_names = NULL, nthreads = 1){
  .check_windows()
  .check_bwtool()
  .check_dt()
  
  options(warn = -1)
  op_dir = tempdir() #For now
  
  message("Parsing loci..")
  if(is.null(loci)){
    stop("Missing loci. Provide a target region to plot.\n  Should be of format \"chr:start-end\". e.g; chr3:187715903-187752003 OR chr3:187,715,903-187,752,003")
  }else{
    chr = as.character(unlist(data.table::tstrsplit(x = loci, spli = ":", keep = 1)))
    start = unlist(data.table::tstrsplit(x = unlist(data.table::tstrsplit(x = loci, spli = ":", keep = 2)), split = "-"))[1]
    start = as.numeric(as.character(gsub(pattern = ",", replacement = "", x = as.character(start))))
    end = unlist(data.table::tstrsplit(x = unlist(data.table::tstrsplit(x = loci, spli = ":", keep = 2)), split = "-"))[2]
    end = as.numeric(as.character(gsub(pattern = ",", replacement = "", x = as.character(end))))
  }
  
  if(start >= end){
    stop("End must be larger than Start!")
  }
  message("    Queried region: ", chr, ":", start, "-", end, " [", end-start, " bps]")
  
  if(is.null(bigWigs)){
    stop("Provide at-least one bigWig file")
  }
  message("Checking for files..")
  for(i in 1:length(bigWigs)){
    if(!file.exists(as.character(bigWigs)[i])){
      stop(paste0(as.character(bigWigs)[i], " does not exist!"))
    }
  }
  
  if(!is.null(custom_names)){
    if(length(custom_names) != length(bigWigs)){
      stop("Please provide names for all bigWigs")
    }
  }
  
  windows = .gen_windows(chr = chr, start = start, end = end, window_size = binsize, op_dir = op_dir)
  track_summary = .get_summaries(bedSimple = windows, bigWigs = bigWigs, op_dir = op_dir, nthreads = nthreads)
  track_summary$loci = c(chr, start, end)
  track_summary
}

#' Generate IGV style locus tracks with ease
#' @param summary_list Output from track_extract
#' @param draw_gene_track Default FALSE. If TRUE plots gene models overlapping with the queried region
#' @param query_ucsc Default FALSE. But switches to TRUE when `gene_model` is not given. Requires `mysql` installation.
#' @param show_ideogram Default TRUE. If TRUE plots ideogram of the target chromosome with query loci highlighted. Works only when `query_ucsc` is TRUE. 
#' @param build Genome build. Default `hg19`
#' @param txname transcript name to draw. Default NULL. Plots all transcripts overlapping with the queried region
#' @param genename gene name to draw. Default NULL. Plots all genes overlapping with the queried region
#' @param collapse_txs Default FALSE. Whether to collapse all transcripts belonging to same gene into a unified gene model
#' @param gene_model File with gene models. Can be a gtf file or UCSC file format. If you have read them into R as a data.frame, that works as well. Default NULL, automatically fetches gene models from UCSC server
#' @param isGTF Default FALSE. Set to TRUE if the `gene_model` is a gtf file.
#' @param groupAutoScale Default TRUE
#' @param gene_fsize Font size. Default 1
#' @param gene_track_height Default 2 
#' @param scale_track_height Default 1
#' @param col Color for tracks. Default `#2f3640`. Multiple colors can be provided for each track
#' @param show_axis Default FALSE
#' @param track_names Default NULL
#' @param track_names_pos Default 0 (corresponds to left corner)
#' @param regions genomic regions to highlight. A data.frame with at-least three columns containing chr, start and end positions.
#' @param boxcol color for highlighted region. Default "#192A561A"
#' @param boxcolalpha Default 0.5
#' @export
track_plot = function(summary_list = NULL,
                      draw_gene_track = TRUE,
                      query_ucsc = TRUE,
                      show_ideogram = FALSE,
                      build = "hg19",
                      col = "gray70",
                      groupAutoScale = TRUE,
                      txname = NULL,
                      genename = NULL,
                      gene_track_height = 2,
                      scale_track_height = 1,
                      show_axis = TRUE,
                      gene_fsize = 0.6,
                      track_names = NULL,
                      track_names_pos = 0,
                      regions = NULL,
                      region_width = 1,
                      collapse_txs = FALSE,
                      boxcol = "#192A561A",
                      boxcolalpha = 0.2,
                      gene_model = NULL,
                      isGTF = FALSE
){
  
  if(is.null(summary_list)){
    stop("Missing input! Expecting output from track_extract()")
  }
  
  chr = summary_list$loci[1]
  start = as.numeric(summary_list$loci[2])
  end = as.numeric(summary_list$loci[3])
  
  summary_list$loci = NULL
  
  if(length(col) != length(summary_list)){
    col = rep(x = col, length(summary_list))
  }
  
  plot_regions = FALSE
  if(!is.null(regions)){
    if(is(object = regions, class2 = "data.frame")){
      regions = data.table::as.data.table(x = regions)
      colnames(regions)[1:3] = c("chromsome", "startpos", "endpos")
      regions = regions[chromsome %in% chr]
      if(nrow(regions) == 0){
        warning("None of the regions are within the requested chromosme: ", chr)
        plot_regions = TRUE
      }else{
        plot_regions = TRUE  
      }
    }else{
      stop("'mark_regions' must be a data.frame with first 3 columns containing : chr, start, end")
    }
  }
  
  if(!is.null(track_names)){
    names(summary_list) = track_names
  }
  
  if(groupAutoScale){
    plot_height = max(unlist(lapply(summary_list, function(x) max(x$max, na.rm = TRUE))), na.rm = TRUE)
    plot_height = rep(plot_height, length(summary_list))
  }else{
    plot_height = unlist(lapply(summary_list, function(x) max(x$max, na.rm = TRUE)))
  }
  
  plot_height = round(plot_height, digits = 2)
  
  ntracks = length(summary_list)
  if(draw_gene_track){
    if(is.null(gene_model)){
      if(query_ucsc){
        message("Missing gene model. Trying to query UCSC genome browser..")
        etbl = .extract_geneModel_ucsc(chr, start = start, end = end, refBuild = build, txname = txname, genename = genename)
        if(show_ideogram){
          #lo = layout(mat = matrix(data = seq_len(ntracks+2)), heights = c(region_width, rep(3, ntracks), gene_track_height, scale_track_height))
          lo = layout(mat = matrix(data = seq_len(ntracks+3)), heights = c(rep(3, ntracks), gene_track_height, scale_track_height, 1))
        }else{
          lo = layout(mat = matrix(data = seq_len(ntracks+2)), heights = c(rep(3, ntracks), gene_track_height, scale_track_height))  
        }
      }else{
        etbl = NULL
        lo = layout(mat = matrix(data = seq_len(ntracks+1)), heights = c(rep(3, ntracks), scale_track_height))  
      }
    }else{
      if(gtf){
        etbl = .parse_gtf(gtf = gene_model, chr = chr, start = start, end = end, txname = txname, genename = genename)  
      }else{
        etbl = .extract_geneModel(ucsc_tbl = gene_model, chr = chr, start = start, end = end, txname = txname, genename = genename)  
      }
      
      lo = layout(mat = matrix(data = seq_len(ntracks+2)), heights = c(rep(3, ntracks), gene_track_height, scale_track_height))
    }
  }else{
    if(all(c(query_ucsc, show_ideogram))){
      lo = layout(mat = matrix(data = seq_len(ntracks+2)), heights = c(rep(3, ntracks), scale_track_height, 1))  
    }else{
      lo = layout(mat = matrix(data = seq_len(ntracks+1)), heights = c(rep(3, ntracks), scale_track_height))    
    }
  }
  
  #Draw bigWig signals
  lapply(1:length(summary_list), function(idx){
    x = summary_list[[idx]]
    if(show_axis){
      par(mar = c(0.5, 4, 2, 1))
    }else{
      par(mar = c(0.5, 1, 2, 1))  
    }
    
    plot(NA, xlim = c(start, end), ylim = c(0, plot_height[idx]), frame.plot = FALSE, axes = FALSE, xlab = NA, ylab = NA)
    rect(xleft = x$start, ybottom = 0, xright = x$end, ytop = x$max, col = col[idx], border = col[idx])
    if(show_axis){
      axis(side = 2, at = c(0, plot_height[idx]), las = 2)  
    }else{
      text(x = start, y = plot_height[idx], labels = paste0("[0-", plot_height[idx], "]"), adj = 0, xpd = TRUE)
    }
    #plot(NA, xlim = c(start, end), ylim = c(0, nrow(regions)), frame.plot = FALSE, axes = FALSE, xlab = NA, ylab = NA)
    
    if(plot_regions){
      # boxcol = "#192a56"
      boxcol = grDevices::adjustcolor(boxcol, alpha.f = boxcolalpha)
      if(nrow(regions) > 0){
        for(i in 1:nrow(regions)){
          if(idx == length(summary_list)){
            #If its a last plot, draw rectangle till 0
            rect(xleft = regions[i, startpos], ybottom = 0, xright = regions[i, endpos], ytop = plot_height[idx]+10, col = boxcol, border = NA, xpd = TRUE)
          }else if (idx == 1){
            if(ncol(regions) > 3){
              text(x = mean(c(regions[i, startpos], regions[i, endpos])), y = plot_height[idx]+(plot_height[idx]*0.1), labels = regions[i, 4], adj = 0.5, xpd = TRUE, font = 1, cex = 1.2)
            }else{
              text(x = mean(c(regions[i, startpos], regions[i, endpos])), y = plot_height[idx]+(plot_height[idx]*0.1), labels = paste0(regions[i, startpos], "-", regions[i, endpos]), adj = 0.5, xpd = TRUE, font = 1, cex = 1.2)
            }
            rect(xleft = regions[i, startpos], ybottom = -10, xright = regions[i, endpos], ytop = plot_height[idx], col = boxcol, border = NA, xpd = TRUE)
          }else{
            rect(xleft = regions[i, startpos], ybottom = -10, xright = regions[i, endpos], ytop = plot_height[idx]+10, col = boxcol, border = NA, xpd = TRUE)  
          }
        }
      }
    }
    
    title(main = names(summary_list)[idx], adj = track_names_pos, font.main = 3)
  })
  
  #Draw gene models
  if(draw_gene_track){
    if(!is.null(etbl)){
      if(collapse_txs){
        etbl = .collapse_tx(etbl)
      }
      
      if(show_axis){
        par(mar = c(0.25, 4, 0, 0.5))
      }else{
        par(mar = c(0.25, 1, 0, 0.5))  
      }
      
      plot(NA, xlim = c(start, end), ylim = c(0, length(etbl)), frame.plot = FALSE, axes = FALSE, xlab = NA, ylab = NA)
      exon_col = "#192a56"
      for(tx_id in 1:length(etbl)){
        txtbl = etbl[[tx_id]]
        segments(x0 = attr(txtbl, "start"), y0 = tx_id-0.45, x1 = attr(txtbl, "end"), y1 = tx_id-0.45, col = exon_col, lwd = 1)
        
        if(is.na(attr(txtbl, "tx"))){
          text(x = start, y = tx_id-0.45, labels = paste0(attr(txtbl, "gene")), adj = 0, cex = gene_fsize)
        }else{
          text(x = start, y = tx_id-0.45, labels = paste0(attr(txtbl, "tx"), " [", attr(txtbl, "gene"), "]"), cex = gene_fsize, adj = 0)  
        }
        
        rect(xleft = txtbl[[1]], ybottom = tx_id-0.75, xright = txtbl[[2]], ytop = tx_id-0.25, col = exon_col, border = NA)
        if(attr(txtbl, "strand") == "+"){
          dirat = pretty(x = c(min(txtbl[[1]]), max(txtbl[[2]])))
          dirat[1] = min(txtbl[[1]]) #Avoid drawing arrows outside gene length
          dirat[length(dirat)] = max(txtbl[[2]])
          points(x = dirat, y = rep(tx_id-0.45, length(dirat)), pch = ">", col = exon_col)
        }else{
          dirat = pretty(x = c(min(txtbl[[1]]), max(txtbl[[2]])))
          dirat[1] = min(txtbl[[1]]) #Avoid drawing arrows outside gene length
          dirat[length(dirat)] = max(txtbl[[2]])
          points(x = dirat, y = rep(tx_id-0.45, length(dirat)), pch = "<", col = exon_col)
        }
      }
    }
  }
  
  if(show_axis){
    par(mar = c(0, 4, 0, 0))  
  }else{
    par(mar = c(0, 1, 0, 0))
  }
  lab_at = pretty(c(start, end))
  plot(NA, xlim = c(start, end), ylim = c(0, 1), frame.plot = FALSE, axes = FALSE, xlab = NA, ylab = NA)
  rect(xleft = start, ybottom = 0.5, xright = end, ytop = 0.5, lty = 2, xpd = TRUE)
  rect(xleft = lab_at, ybottom = 0.45, xright = lab_at, ytop = 0.5, xpd = TRUE)
  text(x = lab_at, y = 0.3, labels = lab_at)
  #axis(side = 1, at = lab_at, lty = 2, line = -3)
  text(x = end, y = 0.75, labels = paste0(chr, ":", start, "-", end), adj = 1, xpd = TRUE)
  
  if(all(c(query_ucsc, show_ideogram))){
    cyto = .extract_cytoband(chr = chr, refBuild = build)
    #par(fig = c(0, 1, 0, 0.05), new = TRUE)
    if(show_axis){
      par(mar = c(0, 4, 0, 0))  
    }else{
      par(mar = c(0, 1, 0, 0))
    }
    plot(NA, xlim = c(0, max(cyto$end)), ylim = c(0, 1), axes = FALSE, frame.plot = FALSE, xlab = NA, ylab = NA)
    rect(xleft = cyto$start, ybottom = 0.1, xright = cyto$end, ytop = 0.6, col = cyto$color, border = "#34495e")
    rect(xleft = start, ybottom = 0, xright = end, ytop = 0.7, col = "#d35400", lwd = 2, border = "#d35400")
    text(x = 0, y = 0.8, labels = chr, adj = 0, font = 2)
  }
}

# profileplot is an ultra-fast, simple, and minimal dependency R script to generate profile-plots from bigWig files
#' Generate profile plots with ease
#' @param bigWigs bigWig files. Default NULL. Required.
#' @param bed bed file or a data.frame with first 3 column containing chromosome, star, end positions. 
#' @param binSize bin size to extract signal. Default 50 (bps). Should be >1
#' @param startFrom Default "center". Can be "center", "start" or "end"
#' @param up extend upstream by this many bps from `startFrom`. Default 2500
#' @param down extend downstream by this many bps from `startFrom`. Default 2500
#' @param ucsc_assembly If `bed` file not provided, setting `ucsc_assembly` to ref genome build will fetch transcripts from UCSC genome browser. e.g; 'hg19'
#' @param nthreads Default 4
#' @param custom_names Default NULL and Parses from the file names.
#' @export

profile_extract = function(bigWigs = NULL, bed = NULL, binSize = 50, startFrom = "center",
                           up = 2500, down = 2500, ucsc_assembly = NULL, nthreads = 4, 
                           custom_names = NULL){
  .check_windows()
  .check_bwtool()
  .check_dt()
  
  if(is.null(bigWigs)){
    stop("Provide at-least one bigWig file")
  }
  
  message("Checking for files..")
  for(i in 1:length(bigWigs)){
    if(!file.exists(as.character(bigWigs)[i])){
      stop(paste0(as.character(bigWigs)[i], " does not exist!"))
    }
  }
  
  if(!is.null(custom_names)){
    if(length(custom_names) != length(bigWigs)){
      stop("Please provide names for all bigWigs")
    }
  }
  
  op_dir = tempdir() #For now
  
  
  if(is.null(bed)){
    if(is.null(ucsc_assembly)){
      stop("Please provide either a BED file or ucsc_assembly name")
    }else{
      if(length(ucsc_assembly) > 1){
        warning("Multiple assemblies provided. Using first one..")
        ucsc_assembly = ucsc_assembly[1]
      }
      bed = .make_genome_bed(refBuild = ucsc_assembly, up = as.numeric(up), down = as.numeric(down), tss = startFrom, op_dir = op_dir, pc_genes = FALSE)
    }
  }else{
    startFrom = match.arg(arg = startFrom, choices = c("start", "end", "center"))
    bed = .make_bed(bed = bed, op_dir = op_dir, up = as.numeric(up), down = as.numeric(up), tss = startFrom)
  }
  
  message("Extracting signals..")
  mats = parallel::mclapply(bigWigs, function(x){
    .bwt_mats(bw = x, binSize = binSize, bed = bed, size = paste0(up, ":", down), startFrom = startFrom, op_dir = op_dir)
  }, mc.cores = nthreads)
  
  mats = as.character(unlist(x = mats))
  sig_list = lapply(mats, data.table::fread)
  
  if(!is.null(custom_names)){
    names(sig_list) = custom_names
  }else{
    names(sig_list) = gsub(pattern = "*\\.matrix$", replacement = "", x = basename(path = mats))
  }
  #return(sig_list)
  
  sig_list$args = c(up, down)
  
  #Remove intermediate files
  lapply(mats, function(x) system(command = paste0("rm ", x), intern = TRUE))
  
  sig_list
}

#' Profile Plot
#' @param sig_list Output generated from profile_extract
#' @param color Manual colors for each bigWig. Default NULL. 
#' @param condition Default. Condition associated with each bigWig. Lines will colord accordingly.
#' @param condition_colors Manual colors for each level in condition. Default NULL. 
#' @param collapse_replicates Default FALSE. If TRUE and when `condition` is given, collapse signals samples belonging to same condition
#' @param plot_se Default FALSE. If TRUE plots standard error shading
#' @param line_size Default 1
#' @param legend_fs Legend font size. Default 1
#' @param axis_fs Axis font size. Default 1
#' @export

profile_plot = function(sig_list = NULL, color = NULL, condition = NULL, condition_colors = NULL, 
                        collapse_replicates = FALSE, plot_se = FALSE, line_size = 1, legend_fs = 1, axis_fs = 1){
  
  
  if(is.null(sig_list)){
    stop("Missing input! Expecting output from profile_extract()")
  }
  
  up = as.numeric(sig_list$args[1])
  down = as.numeric(sig_list$args[2])
  sig_list$args = NULL
  
  message("Summarizing..")
  sig_summary = .summarizeMats(mats = sig_list, group = condition, collapse_reps = collapse_replicates)
  
  if(is.null(color)){
    color = c("#A6CEE3FF", "#1F78B4FF", "#B2DF8AFF", "#33A02CFF", "#FB9A99FF", 
              "#E31A1CFF", "#FDBF6FFF", "#FF7F00FF", "#CAB2D6FF", "#6A3D9AFF", 
              "#FFFF99FF", "#9E0142FF", "#D53E4FFF", "#F46D43FF", "#000000FF", 
              "#EE82EEFF", "#4169E1FF", "#7B7060FF", "#535C68FF")
    color = color[1:length(sig_list)]
  }
  
  
  sig_se_summary = NULL
  if(plot_se){
    sig_se_summary = .estimateCI(mats = sig_list, group = condition, collapse_reps = collapse_replicates)
  }
  
  if(!is.null(condition)){
    group_df = data.table::data.table(sample = names(sig_summary), condition = condition)
    if(is.null(condition_colors)){
      condition_colors = c("#A6CEE3FF", "#1F78B4FF", "#B2DF8AFF", "#33A02CFF", "#FB9A99FF", 
                           "#E31A1CFF", "#FDBF6FFF", "#FF7F00FF", "#CAB2D6FF", "#6A3D9AFF", 
                           "#FFFF99FF", "#9E0142FF", "#D53E4FFF", "#F46D43FF", "#000000FF", 
                           "#EE82EEFF", "#4169E1FF", "#7B7060FF", "#535C68FF")[1:nrow(group_df[,.N,condition])]
    }
    names(condition_colors) = group_df[,.N,condition][,condition]
    color = condition_colors[group_df$condition]
  }
  
  y_max = max(unlist(lapply(sig_summary, max, na.rm = TRUE)))
  y_min = min(unlist(lapply(sig_summary, min, na.rm = TRUE)))
  ylabs = pretty(c(y_min, y_max), n = 5)
  
  x_max = max(unlist(lapply(sig_summary, length)))
  xlabs = c(up, 0, down)
  xticks = xticks = c(0,
                      as.integer(length(sig_summary[[1]])/sum(as.numeric(xlabs[1]), as.numeric(xlabs[3])) * as.numeric(xlabs[1])),
                      length(sig_summary[[1]]))
  
  #line_size = 1
  par(mar = c(3, 3, 2, 1))
  plot(NA, xlim = c(0, x_max), ylim = c(min(ylabs), max(ylabs)), frame.plot = FALSE, axes = FALSE, xlab = NA, ylab = NA)
  abline(h = ylabs, v = pretty(xticks), col = "gray90", lty = 2)
  
  lapply(1:length(sig_summary), function(idx){
    points(sig_summary[[idx]], type = 'l', lwd = line_size, col = color[idx])
    if(plot_se){
      polygon(x = c(1:length(sig_summary[[idx]]), rev(1:length(sig_summary[[idx]]))),
              y = c(sig_summary[[idx]]-sig_se_summary[[idx]], rev(sig_summary[[idx]]+sig_se_summary[[idx]])),
              col = grDevices::adjustcolor(col = color[idx], #color[names(mat_list)[[i]]],
                                           alpha.f = 0.4), border = NA)
    }
  })
  axis(side = 1, at = xticks, labels = xlabs, cex.axis = axis_fs)
  axis(side = 2, at = ylabs, las = 2, cex.axis = axis_fs)
  
  if(!is.null(condition)){
    legend(x = "topright", legend = unique(names(color)), col = unique(color), bty = "n", lty = 1, lwd = 1.2, cex = legend_fs, xpd = TRUE)
  }else{
    legend(x = "topright", legend = names(sig_summary), col = color, bty = "n", lty = 1, lwd = 1.2, cex = legend_fs, xpd = TRUE)
  }
  
  invisible(list(mean_signal = sig_summary, std_err = sig_se_summary, color_codes = color, xticks = xticks, xlabs = xlabs))
  
}

#------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# bwpcaplot function to perform PCA analysis based on genomic regions of interest or around TSS sites.

#' Perform PCA analysis
#' @param bigWigs bigWig files. Default NULL. Required.
#' @param bed bed file or a data.frame with first 3 column containing chromosome, star, end positions. 
#' @param binSize bin size to extract signal. Default 50 (bps). Should be >1
#' @param ucsc_assembly If `bed` file not provided, setting `ucsc_assembly` to ref genome build will fetch transcripts from UCSC genome browser. e.g; 'hg19'
#' @param ucsc_startFrom Default "center". Can be "center", "start" or "end"
#' @param ucsc_up extend upstream by this many bps from `startFrom`. Default 2500
#' @param ucsc_down extend downstream by this many bps from `startFrom`. Default 2500
#' @param nthreads Default 4
#' @param custom_names Default NULL and Parses from the file names.
#' @export
extract_summary = function(bigWigs = NULL, bed = NULL, binSize = 50, top = 1000,
                           nthreads = 4, ucsc_assembly = NULL, ucsc_startFrom = "start", ucsc_up = 2500, ucsc_down = 2500, custom_names = NULL){
  
  .check_windows()
  .check_bwtool()
  .check_dt()
  
  if(is.null(bigWigs)){
    stop("Missing bigWig files")
  }
  
  message("Checking for files..")
  for(i in 1:length(bigWigs)){
    if(!file.exists(as.character(bigWigs)[i])){
      stop(paste0(as.character(bigWigs)[i], " does not exist!"))
    }
  }
  
  if(!is.null(custom_names)){
    if(length(custom_names) != length(bigWigs)){
      stop("Please provide names for all bigWigs")
    }
  }else{
    custom_names = gsub(pattern = "\\.bw$|\\.bigWig$", replacement = "", x = basename(bigWigs))
  }
  
  op_dir = tempdir() #For now
  if(!dir.exists(paths = op_dir)){
    dir.create(path = op_dir, showWarnings = FALSE, recursive = TRUE)
  }
  
  
  if(!is.null(ucsc_assembly)){
    #If assembly is requested, fetch protein coding transcripts (+/- 2500bp.
    temp_op_bed = .make_genome_bed(refBuild = ucsc_assembly, tss = "start", pc_genes = TRUE, up = ucsc_up, down = ucsc_down)
  }else{
    temp_op_bed = tempfile(pattern = "pcaplot", tmpdir = op_dir, fileext = ".bed")
    if(is.data.frame(bed)){
      bed = data.table::as.data.table(x = bed)
      #data.table::setDT(x = bed)
      colnames(bed)[1:3] = c("chr", "start", "end")
      bed[, chr := as.character(chr)]
      bed[, start := as.numeric(as.character(start))]
      bed[, end := as.numeric(as.character(end))]
    }else if(file.exists(bed)){
      bed = data.table::fread(file = bed, select = list(character = 1, numeric = c(2, 3)), col.names = c("chr", "start", "end"))
    }
    data.table::fwrite(x = bed[,.(chr, start, end)], file = temp_op_bed, sep = "\t", col.names = FALSE)
  }
  
  message("Extracting BED summaries..")
  summaries = parallel::mclapply(bigWigs, FUN = function(bw){
    bn = gsub(pattern = "\\.bw$|\\.bigWig$", replacement = "", x = basename(bw))
    message(paste0("    Processing ", bn, " .."))
    cmd = paste("bwtool summary -with-sum -keep-bed -header", temp_op_bed, bw, paste0(op_dir, bn, ".summary"))
    system(command = cmd, intern = TRUE)
    paste0(op_dir, bn, ".summary")
  }, mc.cores = nthreads)
  
  summary_list = lapply(summaries, function(x){
    x = data.table::fread(x)
    colnames(x)[1] = 'chromosome'
    x = x[,.(chromosome, start, end, size, sum)]
    x[,id := paste0(chromosome, ":", start, "-", end)]
    x
  })
  
  names(summary_list) = custom_names
  
  summary_list
  
}

#' PCA plot
#' @param summary_list output from extract_summary
#' @param top Top most variable peaks to consider for PCA. Default 1000
#' @param color Manual colors for each bigWig. Default NULL. 
#' @param condition Default. Condition associated with each bigWig. Lines will colord accordingly.
#' @param condition_colors Manual colors for each level in condition. Default NULL. 
#' @param show_cree If TRUE draws a cree plot. Default TRUE
#' @export
pca_plot = function(summary_list = NULL, top = 1000, color = NULL, condition = NULL, condition_colors = NULL, show_cree = TRUE){
  
  
  if(is.null(summary_list)){
    stop("Missing input! Expecting output from extract_summary()")
  }
  
  sum_tbl = data.table::rbindlist(l = summary_list, idcol = "sample", use.names = TRUE, fill = TRUE)
  sum_tbl = data.table::dcast(data = sum_tbl, formula =  id ~ sample, value.var = "sum", fun.aggregate = max)
  data.table::setDF(x = sum_tbl, rownames = sum_tbl$id)
  sum_tbl$id = NULL
  sum_tbl
  
  if(is.null(color)){
    color = c("#A6CEE3FF", "#1F78B4FF", "#B2DF8AFF", "#33A02CFF", "#FB9A99FF", 
              "#E31A1CFF", "#FDBF6FFF", "#FF7F00FF", "#CAB2D6FF", "#6A3D9AFF", 
              "#FFFF99FF", "#9E0142FF", "#D53E4FFF", "#F46D43FF", "#000000FF", 
              "#EE82EEFF", "#4169E1FF", "#7B7060FF", "#535C68FF")
    color = color[1:length(summary_list)]
  }
  
  if(!is.null(condition)){
    group_df = data.table::data.table(sample = names(summary_list), condition = condition)
    if(is.null(condition_colors)){
      condition_colors = c("#A6CEE3FF", "#1F78B4FF", "#B2DF8AFF", "#33A02CFF", "#FB9A99FF", 
                           "#E31A1CFF", "#FDBF6FFF", "#FF7F00FF", "#CAB2D6FF", "#6A3D9AFF", 
                           "#FFFF99FF", "#9E0142FF", "#D53E4FFF", "#F46D43FF", "#000000FF", 
                           "#EE82EEFF", "#4169E1FF", "#7B7060FF", "#535C68FF")[1:nrow(group_df[,.N,condition])]
    }
    names(condition_colors) = group_df[,.N,condition][,condition]
    group_df$color = condition_colors[group_df$condition]
  }else{
    group_df = data.table::data.table(sample = names(summary_list), color = color)
  }
  
  
  
  sum_tbl = sum_tbl[complete.cases(sum_tbl),, drop = FALSE]
  if(nrow(sum_tbl) < top){
    pca = prcomp(t(.order_by_sds(mat = sum_tbl)))
  }else{
    pca = prcomp(t(.order_by_sds(mat = sum_tbl)[1:top,]))  
  }
  
  pca_dat = as.data.frame(pca$x)
  pca_var_explained = pca$sdev^2/sum(pca$sdev^2)
  data.table::setDT(x = pca_dat, keep.rownames = "sample")
  pca_dat = merge(pca_dat, group_df, by = 'sample')
  attr(pca_dat, "percentVar") <- round(pca_var_explained, digits = 3)
  
  if(show_cree){
    lo = layout(mat = matrix(data = c(1, 2), ncol = 2))
  }
  
  grid_cols = "gray90"
  par(mar = c(3, 4, 1, 1))
  plot(NA, axes = FALSE, xlab = NA, ylab = NA, cex = 1.2, xlim = range(pretty(pca_dat$PC1)), ylim = range(pretty(pca_dat$PC2)))
  abline(h = pretty(pca_dat$PC2), v = pretty(pca_dat$PC1), col = grid_cols, lty = 2)
  points(x = pca_dat$PC1, y = pca_dat$PC2, col = "black", bg = pca_dat$color, pch = 21, cex = 1.25)
  axis(side = 1, at = pretty(pca_dat$PC1), cex.axis = 0.8)
  axis(side = 2, at = pretty(pca_dat$PC2), las = 2, cex.axis = 0.8)
  mtext(text = paste0("PC2 [", round(pca_var_explained[2], digits = 2), "]"), side = 2, line = 2.5, cex = 0.8)
  mtext(text = paste0("PC1 [", round(pca_var_explained[1], digits = 2), "]"), side = 1, line = 2, cex = 0.8)
  text(x = pca_dat$PC1, y = pca_dat$PC2, labels = pca_dat$sample, pos = 3, col = pca_dat$color, xpd = TRUE, cex = 0.8)
  #title(main = NA, sub = paste0("N = ", top, " peaks"), adj = 0, outer = TRUE)
  
  if(!is.null(condition)){
    legend(x = "topright", legend = unique(pca_dat$condition), col = unique(pca_dat$color), bty = "n", pch = 19, cex = 0.8, xpd = TRUE, ncol = 2)
  }
  
  if(show_cree){
    par(mar = c(3, 4, 1, 4))
    b = barplot(height = pca_var_explained, names.arg = NA, col = "#2c3e50", ylim = c(0, 1), las = 2, axes = FALSE)
    points(x = b, y = cumsum(pca_var_explained), type = 'l', lty = 2, lwd = 1.2, xpd = TRUE, col = "#c0392b")
    points(x = b, y = cumsum(pca_var_explained), type = 'p', pch = 19, xpd = TRUE, col = "#c0392b")
    mtext(text = paste0("PC", 1:length(pca_var_explained)), side = 1, at = b, las = 2, line = 0.5, cex = 0.8)
    axis(side = 2, at = seq(0, 1, 0.1), line = 0, las = 2, cex.axis = 0.8)
    mtext(text = "var. explained", side = 2, line = 2.5)
    axis(side = 4, at = seq(0, 1, 0.1), line = 0, las = 2, cex.axis = 0.8)
    mtext(text = "cumulative var. explained", side = 4, line = 2.5)
  }
  
  invisible(x = list(pca_data = pca_dat, bed_summary = sum_tbl))
}

#------------------------------------------------------------------------------------------------------------------------------------

.gen_windows = function(chr = NA, start, end, window_size = 50, op_dir = getwd()){
  #chr = "chr19"; start = 15348301; end = 15391262; window_size = 50; op_dir = getwd()
  message(paste0("Generating windows ", "[", window_size, " bp window size]"))
  
  window_dat = data.table::data.table()
  #temp = start;
  while(start <= end){
    window_dat = data.table::rbindlist(l = list(window_dat, data.table::data.table(start, end = start + window_size)), fill = TRUE)
    start = start + window_size
  }
  window_dat[, chr := chr]
  window_dat = window_dat[, .(chr, start, end)]
  
  op_dir = paste0(op_dir, "/")
  
  if(!dir.exists(paths = op_dir)){
    dir.create(path = op_dir, showWarnings = FALSE, recursive = TRUE)
  }
  
  temp_op_bed = tempfile(pattern = "trackr", tmpdir = op_dir, fileext = ".bed")
  data.table::fwrite(x = window_dat, file = temp_op_bed, sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE)
  temp_op_bed
}


.get_summaries = function(bedSimple, bigWigs, op_dir = getwd(), nthreads = 1){
  #bedSimple = temp_op_bed; bigWigs = list.files(path = "./", pattern = "bw"); op_dir = getwd(); nthreads = 1
  op_dir = paste0(op_dir, "/")
  
  if(!dir.exists(paths = op_dir)){
    dir.create(path = op_dir, showWarnings = FALSE, recursive = TRUE)
  }
  
  message(paste0("Extracting signals"))
  
  summaries = parallel::mclapply(bigWigs, FUN = function(bw){
    bn = gsub(pattern = "\\.bw$|\\.bigWig$", replacement = "", x = basename(bw))
    message(paste0("    Processing ", bn, " .."))
    cmd = paste("bwtool summary -with-sum -keep-bed -header", bedSimple, bw, paste0(op_dir, bn, ".summary"))
    system(command = cmd, intern = TRUE)
    paste0(op_dir, bn, ".summary")
  }, mc.cores = nthreads)
  
  summary_list = lapply(summaries, function(x){
    x = data.table::fread(x)
    colnames(x)[1] = 'chromosome'
    x = x[,.(chromosome, start, end, size, max)]
    if(all(is.na(x[,max]))){
      message("No signal! Possible cause: chromosome name mismatch between bigWigs and queried loci.")
      x[, max := 0]
    }
    x
  })
  
  
  #Remove intermediate files
  lapply(summaries, function(x) system(command = paste0("rm ", x), intern = TRUE))
  system(command = paste0("rm ", bedSimple), intern = TRUE)
  
  names(summary_list) = gsub(pattern = "*\\.summary$", replacement = "", x = basename(path = unlist(summaries)))
  summary_list
}


.extract_geneModel = function(ucsc_tbl = NULL, chr = NULL, start = NULL, end = NULL, txname = txname, genename = genename){
  
  message("Parsing UCSC file..")
  if(is(object = ucsc_tbl, class2 = "data.frame")){
    ucsc = data.table::as.data.table(ucsc_tbl)
  }else if(file.exists(ucsc_tbl)){
    ucsc = data.table::fread(file = ucsc_tbl)  
  }else{
    stop("Gene model must be a file or a data.frame")
  }
  
  query = data.table::data.table(chr, start, end)
  query[,chr := as.character(chr)]
  query[,start := as.numeric(as.character(start))]
  query[,end := as.numeric(as.character(end))]
  data.table::setkey(x = query, chr, start, end)
  
  colnames(ucsc)[c(3, 5, 6)] = c("chr", "start", "end")
  data.table::setkey(x = ucsc, chr, start, end)
  
  gene_models = data.table::foverlaps(x = query, y = ucsc, type = "any", nomatch = NULL)
  
  if(nrow(gene_models) == 0){
    message("No features found within the requested loci! If you are not sure why..\n    1.Make sure there are no discripancies in chromosome names i.e, chr prefixes\n")
    return(NULL)  
  }else{
    if(!is.null(txname)){
      gene_models = gene_models[name %in% txname]
    }
    
    if(!is.null(genename)){
      gene_models = gene_models[name2 %in% genename]
    }
    
    exon_tbls = lapply(seq_along(along.with = 1:nrow(gene_models)), function(idx){
      exon_start = as.numeric(unlist(data.table::tstrsplit(x = gene_models[idx, exonStarts], split = ",")))
      exon_end = as.numeric(unlist(data.table::tstrsplit(x = gene_models[idx, exonEnds], split = ",")))
      exon_tbl = data.frame(start = exon_start, end = exon_end)
      attributes(exon_tbl) = list(start = gene_models[idx, start], end = gene_models[idx, end], strand = gene_models[idx, strand], tx = gene_models[idx, name], gene = gene_models[idx, name2])
      exon_tbl
    })
    
    return(exon_tbls)
  }
}

.extract_cytoband = function(chr = NULL, refBuild = "hg19"){
  
  if(!grepl(pattern = "^chr", x = chr)){
    message("Adding chr prefix to target chromosome for UCSC query..")
    chr = paste0("chr", chr)
  }
  
  cmd = paste0(
    "mysql --user genome --host genome-mysql.cse.ucsc.edu -NAD ",
    refBuild,
    " -e 'select chrom, chromStart, chromEnd, name, gieStain from cytoBand WHERE chrom =\"",
    chr,
    "\"'"
  )
  message(paste0("Extracting cytobands from UCSC:\n", "    chromosome: ", chr, "\n", "    build: ", refBuild, "\n    query: ", cmd))
  
  cyto = data.table::fread(cmd = cmd, colClasses = c("character", "numeric", "numeric", "character", "character"))
  colnames(cyto) = c("chr", "start", "end", "band", "stain")
  data.table::setkey(x = cyto, chr, start, end)
  
  #Color codes from https://github.com/jianhong/trackViewer (Thank you..)
  ### gieStain #############################
  # #FFFFFF - gneg    - Giemsa negative bands
  # #999999 - gpos25  - Giemsa positive bands
  # #666666 - gpos50  - Giemsa positive bands
  # #333333 - gpos75  - Giemsa positive bands
  # #000000 - gpos100 - Giemsa positive bands
  # #660033 - acen    - centromeric regions
  # #660099 - gvar    - variable length heterochromatic regions
  # #6600cc - stalk   - tightly constricted regions on the short arms of
  #                     the acrocentric chromosomes
  colorSheme = c(
    "gneg"    = "#FFFFFF",
    "acen"    = "#660033",
    "gvar"    = "#660099",
    "stalk"   = "#6600CC"
  )
  gposCols <- sapply(1:100, function(i){
    i <- as.hexmode(round(256-i*2.56, digits = 0))
    i <- toupper(as.character(i))
    if(nchar(i)==1) i <- paste0("0", i)
    return(paste0("#", i, i, i))
  })
  names(gposCols) <- paste0("gpos", 1:100)
  colorSheme <- c(gposCols, colorSheme)
  cyto$color = colorSheme[cyto[,stain]]
  cyto
  
  cyto
}

.extract_geneModel_ucsc = function(chr, start = NULL, end = NULL, refBuild = "hg19", txname = NULL, genename = NULL){
  .check_mysql()
  op_file = tempfile(pattern = "ucsc", fileext = ".tsv")
  
  if(!grepl(pattern = "^chr", x = chr)){
    message("Adding chr prefix to target chromosome for UCSC query..")
    tar_chr = paste0("chr", chr)
  }else{
    tar_chr = chr
  }
  
  cmd = paste0("mysql --user genome --host genome-mysql.cse.ucsc.edu -NAD ",  refBuild,  " -e 'select chrom, txStart, txEnd, strand, name, name2, exonStarts, exonEnds from refGene WHERE chrom =\"", tar_chr, "\"'")
  message(paste0("Extracting gene models from UCSC:\n", "    chromosome: ", tar_chr, "\n", "    build: ", refBuild, "\n    query: ", cmd))
  #system(command = cmd)
  ucsc = data.table::fread(cmd = cmd)
  if(nrow(ucsc) == 0){
    message("No features found within the requested loci!")
    return(NULL)
  }
  colnames(ucsc) = c("chr", "start", "end", "strand", "name", "name2", "exonStarts", "exonEnds")
  if(!grepl(pattern = "^chr", x = chr)){
    ucsc[, chr := gsub(pattern = "^chr", replacement = "", x = chr)]
  }
  data.table::setkey(x = ucsc, chr, start, end)
  
  query = data.table::data.table(chr = chr, start = start, end = end)
  data.table::setkey(x = query, chr, start, end)
  
  gene_models = data.table::foverlaps(x = query, y = ucsc, type = "any", nomatch = NULL)
  
  if(nrow(gene_models) == 0){
    message("No features found within the requested loci! If you are not sure why..\n    1.Make sure there are no discripancies in chromosome names i.e, chr prefixes\n")
    return(NULL) 
  }else{
    
    if(!is.null(txname)){
      gene_models = gene_models[name %in% txname]
    }
    
    if(nrow(gene_models) == 0){
      message("    Requested transcript ", txname, " does not exist within the queried region!\n    Skipping gene track plotting..")
      return(NULL)
    }
    
    if(!is.null(genename)){
      gene_models = gene_models[name2 %in% genename]
    }
    
    if(nrow(gene_models) == 0){
      message("    Requested gene ", genename, " does not exist within the queried region!\n    Skipping gene track plotting..")
      return(NULL)
    }
    
    exon_tbls = lapply(seq_along(along.with = 1:nrow(gene_models)), function(idx){
      exon_start = as.numeric(unlist(data.table::tstrsplit(x = gene_models[idx, exonStarts], split = ",")))
      exon_end = as.numeric(unlist(data.table::tstrsplit(x = gene_models[idx, exonEnds], split = ",")))
      exon_tbl = data.frame(start = exon_start, end = exon_end)
      attributes(exon_tbl) = list(start = gene_models[idx, start], end = gene_models[idx, end], strand = gene_models[idx, strand], tx = gene_models[idx, name], gene = gene_models[idx, name2])
      exon_tbl
    })
    
    return(exon_tbls)
  }
}

.collapse_tx = function(exon_tbls){
  message("Collapsing transcripts..")
  tx_tbl = lapply(exon_tbls, function(x){
    xdt = data.table::data.table(start = x[[1]], end = x[[2]])
    xdt$tx = attr(x = x, which = "tx")
    xdt$gene = attr(x = x, which = "gene")
    xdt$strand = attr(x = x, which = "strand")
    xdt$tx_start = attr(x = x, which = "start")
    xdt$tx_end = attr(x = x, which = "end")
    xdt
  })
  tx_tbl = data.table::rbindlist(l = tx_tbl)
  tx_tbl = tx_tbl[,id := paste0(start, ":", end)][!duplicated(id), ,.(gene)]
  
  exon_tbls = lapply(split(tx_tbl, as.factor(as.character(tx_tbl$gene))), function(x){
    x = x[order(start)]
    exon_start = as.numeric(x[,start])
    exon_end = as.numeric(x[,end])
    gene_tbl = data.frame(start = exon_start, end = exon_end)
    attributes(gene_tbl) = list(start = min(x[, tx_start]), end = max(x[, tx_end]), strand = unique(x[, strand]), tx = NA, gene = unique(x[, gene]))
    gene_tbl
  })
  
  exon_tbls
}

.parse_gtf = function(gtf = NULL, chr, start = NULL, end = NULL, refBuild = "hg19", txname = NULL, genename = NULL){
  message("Parsing gtf file..")
  if(is(object = gtf, class2 = "data.frame")){
    gtf = data.table::as.data.table(gtf)
  }else if(file.exists(gtf)){
    gtf = data.table::fread(file = gtf)  
  }else{
    stop("Gene model must be a file or a data.frame")
  }
  
  colnames(gtf) = c("chr", "source", "feature", "start", "end", "ph", "strand", "ph2", "info")
  gtf[,chr := as.character(chr)]
  gtf[,start :=as.numeric(as.character(start))]
  gtf[,end := as.numeric(as.character(end))]
  data.table::setkey(x = gtf, chr, start, end)
  
  query = data.table::data.table(chr, start, end)
  data.table::setkey(x = query, chr, start, end)
  
  gene_models = data.table::foverlaps(x = query, y = gtf, type = "any", nomatch = NULL)
  
  if(nrow(gene_models) == 0){
    message("No features found within the requested loci! If you are not sure why..\n    1.Make sure there are no discripancies in chromosome names i.e, chr prefixes\n")
    return(NULL)  
  }
  gene_models_exon = gene_models[feature %in% c("exon", "transcript")]
  #gene_models_rest = gene_models[!feature %in% "exon"]
  
  feature_ids = data.table::tstrsplit(x = gene_models_exon$info, split = "; ")
  feature_id_names = lapply(feature_ids, function(x){
    #x = x[1:50] #sample rows
    x = unique(unlist(data.table::tstrsplit(x = x, split = " ", keep = 1)))
    x = x[complete.cases(x)]
    x[1]
  })
  names(feature_ids) = feature_id_names
  req_fields = c("gene_id", "transcript_id")
  req_fields = req_fields[req_fields %in% unlist(feature_id_names)]
  feature_ids = feature_ids[req_fields]
  
  feature_ids = sapply(feature_ids, function(x){
    gsub(pattern = "\"|;", replacement = "", x = unlist(data.table::tstrsplit(x = x, split = " ", keep = 2)))
  })
  feature_ids = data.frame(feature_ids)
  colnames(feature_ids) = c("gene", "tx")
  gene_models_exon = cbind(gene_models_exon, feature_ids)
  #gene_models = data.table::rbindlist(list(gene_models_rest, gene_models_exon), use.names = TRUE, fill = TRUE)
  gene_models = gene_models_exon[order(as.numeric(as.character(start)))]
  
  if(nrow(gene_models) == 0){
    warning("No features found within the requested loci!")
    return(NULL)
  }else{
    if(!is.null(txname)){
      gene_models = gene_models[tx %in% txname]
    }
    
    if(!is.null(genename)){
      gene_models = gene_models[gene %in% genename]
    }
    
    if(nrow(gene_models) == 0){
      warning("Requested gene or transcript could not be found within the requested loci!")
      return(NULL)
    }
    
    gene_models = split(gene_models, as.factor(as.character(gene_models$tx)))
    
    exon_tbls = lapply(seq_along(along.with = 1:length(gene_models)), function(idx){
      x = gene_models[[idx]]
      exon_start = as.numeric(as.character(x[feature %in% "exon"][, start]))
      exon_end = as.numeric(as.character(x[feature %in% "exon"][, end]))
      exon_tbl = data.frame(start = exon_start, end = exon_end)
      attributes(exon_tbl) = list(start = min(x[,start], na.rm = TRUE), end = max(x[,end], na.rm = TRUE), strand = unique(x[,strand]), tx = unique(x[, tx]), gene = unique(x[, gene]))
      exon_tbl
    })
  }
  exon_tbls
}

.make_genome_bed = function(refBuild = "hg19", tss = "start", up = 2500, down = 2500, op_dir = tempdir(), pc_genes = FALSE){
  if(!dir.exists(paths = op_dir)){
    dir.create(path = op_dir, showWarnings = FALSE, recursive = TRUE)
  }
  
  tss = match.arg(arg = tss, choices = c("start", "end"))
  
  temp_op_bed = tempfile(pattern = "profileplot_ucsc", tmpdir = op_dir, fileext = ".bed")
  
  cmd = paste0("mysql --user genome --host genome-mysql.cse.ucsc.edu -NAD ",  refBuild,  " -e 'select chrom, txStart, txEnd, strand, name, name2 from refGene'")
  message(paste0("Extracting gene models from UCSC:\n", "    build: ", refBuild, "\n    query: ", cmd))
  #system(command = cmd)
  ucsc = data.table::fread(cmd = cmd)
  colnames(ucsc) = c("chr", "start", "end", "strand", "tx_id", "gene_id")
  
  main_contigs = paste0("chr", c(1:22, "X", "Y"))
  ucsc = ucsc[chr %in% main_contigs]
  
  if(pc_genes){
    ucsc = ucsc[tx_id %like% "^NM"]
  }
  
  message("Fetched ", nrow(ucsc), " transcripts from ", nrow(ucsc[,.N,.(chr)]), " contigs")
  
  if(tss == "end"){
    colnames(ucsc)[2:3] = c("end", "start")
  }
  
  ucsc_minus = ucsc[strand %in% "-"]
  if(nrow(ucsc_minus) > 0){
    ucsc_minus[, bed_start := end-up]
    ucsc_minus[, bed_end := end+down]
  }
  
  ucsc_plus = ucsc[strand %in% "+"]
  if(nrow(ucsc_plus) > 0){
    ucsc_plus[, bed_start := start-up]
    ucsc_plus[, bed_end := start+down]
  }
  
  ucsc_bed = data.table::rbindlist(l = list(ucsc_plus[, .(chr, bed_start, bed_end)], ucsc_minus[, .(chr, bed_start, bed_end)]), use.names = TRUE, fill = TRUE)
  colnames(ucsc_bed) = c("chr", "start", "end")
  data.table::setkey(x = ucsc_bed, chr, start, end)
  
  data.table::fwrite(x = ucsc_bed[,1:3], file = temp_op_bed, sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE)
  
  temp_op_bed
}

.make_bed = function(bed, op_dir = tempdir(), up = 2500, down = 2500, tss = "center"){
  #bwtool tool requires only three columns
  
  tss = match.arg(arg = tss, choices = c("start", "end", "center"))
  
  if(!dir.exists(paths = op_dir)){
    dir.create(path = op_dir, showWarnings = FALSE, recursive = TRUE)
  }
  
  temp_op_bed = tempfile(pattern = "profileplot", tmpdir = op_dir, fileext = ".bed")
  
  if(is.data.frame(bed)){
    bed = data.table::as.data.table(x = bed)
    #data.table::setDT(x = bed)
    colnames(bed)[1:3] = c("chr", "start", "end")
    bed[, chr := as.character(chr)]
    bed[, start := as.numeric(as.character(start))]
    bed[, end := as.numeric(as.character(end))]
  }else if(file.exists(bed)){
    bed = data.table::fread(file = bed, select = list(character = 1, numeric = c(2, 3)), col.names = c("chr", "start", "end"))
  }
  
  if(tss == "center"){
    bed[, focal_point := as.integer(apply(bed[,2:3], 1, mean))]
    bed[, bed_start := focal_point-up]
    bed[, bed_end := focal_point+down]
  }else if(tss == "start"){
    bed[, bed_start := start-up]
    bed[, bed_end := start+down]
  }else{
    bed[, bed_start := end-up]
    bed[, bed_end := end+down]
  }
  bed = bed[,.(chr, bed_start, bed_end)]
  data.table::setkey(x = bed, chr, bed_start, bed_end)
  
  data.table::fwrite(x = bed, file = temp_op_bed, sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE)
  
  return(temp_op_bed)
}

.bwt_mats = function(bw, binSize, bed, size, startFrom, op_dir){
  
  bn = gsub(pattern = "\\.bw$|\\.bigWig$", replacement = "",
            x = basename(bw), ignore.case = TRUE)
  message(paste0("Processing ", bn, ".."))
  
  bw = gsub(pattern = " ", replacement = "\\ ", x = bw, fixed = TRUE) #Change spaces with \ for unix style paths
  
  cmd = paste0("bwtool matrix -tiled-averages=", binSize, " ", size, " " , bed , " ", bw, " ", paste0(op_dir, "/", bn, ".matrix"))
  system(command = cmd, intern = TRUE)
  paste0(op_dir, "/", bn, ".matrix")
}


.summarizeMats = function(mats = NULL, summarizeBy = 'mean', group = NULL, collapse_reps = FALSE){
  
  if(!is.null(group)){
    # gdf = data.table::data.table(sample = names(mats), condition = group)
    # gdf = gdf[order(group, sample)]
    group_u = unique(group)
    if(collapse_reps){
      summarizedMats = lapply(group_u, function(g){
        x = apply(data.table::rbindlist(l = mats[which(group == g)], fill = TRUE, use.names = TRUE), 2, summarizeBy, na.rm = TRUE)
        x
      })
      names(summarizedMats) = group_u
    }else{
      summarizedMats = lapply(mats[1:(length(mats))], function(x){
        if(!is.null(dim(x))){
          x = apply(x, 2, summarizeBy, na.rm = TRUE)
        }
        x
      })
    }
  }else{
    summarizedMats = lapply(mats[1:(length(mats))], function(x){
      if(!is.null(dim(x))){
        x = apply(x, 2, summarizeBy, na.rm = TRUE)
      }
      x
    })
  }
  
  #summarizedMats$param = mats$param
  summarizedMats
}


# estimate tandard deviation for CI
.estimateCI = function(mats = NULL, group = NULL, collapse_reps = FALSE){
  
  if(!is.null(group)){
    if(collapse_reps){
      group_u = unique(group)
      ciMats = lapply(group_u, function(g){
        x = apply(data.table::rbindlist(l = mats[which(group == g)], fill = TRUE, use.names = TRUE), 2, function(y){
          sd(y, na.rm = TRUE)/sqrt(length(y))
        })
        x
      })
      names(ciMats) = group_u
    }else{
      ciMats = lapply(mats[1:(length(mats))], function(x){
        if(!is.null(dim(x))){
          x = apply(x, 2, function(y){
            sd(y, na.rm = TRUE)/sqrt(length(y))
          })
        }
        x
      })
    }
  }else{
    ciMats = lapply(mats[1:(length(mats))], function(x){
      if(!is.null(dim(x))){
        x = apply(x, 2, function(y){
          sd(y, na.rm = TRUE)/sqrt(length(y))
        })
      }
      x
    })
  }
  
  ciMats
}

.order_by_sds = function(mat, keep_sd = FALSE){
  mat_sd = apply(as.matrix(mat), 1, sd, na.rm = TRUE) #order it based on SD
  mat_sd = sort(mat_sd, decreasing = TRUE)
  mat = mat[names(mat_sd),, drop = FALSE]
  mat
}

.extract_summary = function(bw, binSize, bed, op_dir){
  bn = gsub(pattern = "\\.bw$|\\.bigWig$", replacement = "",
            x = basename(bw), ignore.case = TRUE)
  message(paste0("    Processing ", bn, ".."))
  
  cmd = paste("bwtool summary -with-sum -keep-bed -header", bedSimple, bw, paste0(op_dir, bn, ".summary"))
  paste0(op_dir, "/", bn, ".summary")
}

.loci2df = function(loci){
  chr = as.character(unlist(data.table::tstrsplit(x = loci, spli = ":", keep = 1)))
  start = unlist(data.table::tstrsplit(x = unlist(data.table::tstrsplit(x = loci, split = ":", keep = 2)), split = "-", keep = 1))
  start = as.numeric(as.character(gsub(pattern = ",", replacement = "", x = as.character(start))))
  end = unlist(data.table::tstrsplit(x = unlist(data.table::tstrsplit(x = loci, split = ":", keep = 2)), split = "-", keep = 2))
  end = as.numeric(as.character(gsub(pattern = ",", replacement = "", x = as.character(end))))
  data.table::data.table(chr, start, end)
}

.check_bwtool = function(warn = TRUE){
  check = as.character(Sys.which(names = 'bwtool'))[1]
  if(check != ""){
    if(warn){
      message("Checking for bwtool installation")
      message(paste0("    All good! Found bwtool at: ", check))
    }else{
      return(invisible(0))
    }
  }else{
    stop("Could not locate bwtool. Download it from here: https://github.com/CRG-Barcelona/bwtool/releases")
  }
}

.check_mysql = function(warn = TRUE){
  check = as.character(Sys.which(names = 'mysql'))[1]
  if(check != ""){
    if(warn){
      message("Checking for mysql installation")
      message(paste0("    All good! Found mysql at: ", check))
    }else{
      return(invisible(0))
    }
  }else{
    stop("Could not locate mysql.\nInstall:\n apt install mysql-server [Debian]\n yum install mysql-server [centOS]\n brew install mysql [macOS]\n conda install -c anaconda mysql [conda]")
  }
}

.check_dt = function(){
  if(!requireNamespace("data.table", quietly = TRUE)){
    message("Could not find data.table library. Attempting to install..")
    install.packages("data.table")
  }
  suppressPackageStartupMessages(expr = library("data.table", quietly = TRUE, warn.conflicts = FALSE, verbose = FALSE))
}

.check_windows = function(){
  if(Sys.info()["sysname"] == "Windows"){
    stop("Windows is not supported :(")
  }
}

#------------------------------------------------------------------------------------------------------------------------------------

#Test
# .run_test_trackplot = function(){
#   bigWigs = list.files(path = "/Volumes/datadrive/bws/trackR/", pattern = "*\\.bw", full.names = TRUE)
#bigWigs = list.files(path = "/bigdisk/Aurore/Epigenomic_landscape_Adult_T-ALL/data/extdata/BLUEPRINT_final_data/H3K27ac/bigWigs/", pattern = "*\\.bw", full.names = TRUE)
#   #bigWigs = bigWigs[grep(pattern = "^[0-9]", x = basename(bigWigs), invert = TRUE)][1:5]
#   bigWigs = bigWigs[grep(pattern = "^GSM", x = basename(bigWigs), invert = TRUE)]
# markregions = data.frame(
#   chr = c("chr3", "chr3"),
#   start = c(187743255, 187735888),
#   end = c(187747473, 187736777),
#   name = c("Promoter-1", "Promoter-2")
# )
# trackplot(
#   bigWigs = bigWigs,
#   loci = "chr3:187,715,903-187,752,003",
#   draw_gene_track = TRUE,
#   build = "hg38",
#   mark_regions = markregions,
#   custom_names = c("CD34", "EC", "LC", "CD4+", "CD8+")
# )
# }

# .run_test_profileplot = function(){
#   #bigWigs = list.files(path = "/Volumes/datadrive/bws/trackR/", pattern = "*\\.bw", full.names = TRUE)
#bigWigs = list.files(path = "/bigdisk/Aurore/Epigenomic_landscape_Adult_T-ALL/data/extdata/BLUEPRINT_final_data/H3K27ac/bigWigs/", pattern = "*\\.bw", full.names = TRUE)
#   #bigWigs = bigWigs[grep(pattern = "^[0-9]", x = basename(bigWigs), invert = TRUE)][1:5]
#   bigWigs = bigWigs[grep(pattern = "^GSM", x = basename(bigWigs), invert = TRUE)]
#   bed = "/Volumes/datadrive/bws/trackR/sample.narrowPeak"
# 
#   profileplot(bigWigs = bigWigs, bed = bed, genome = "tss")
# }

#------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
scfurl/seqGlue documentation built on Sept. 1, 2024, 11:20 a.m.