R/somenone_facets.R

Defines functions plot_out_list output_out_list anno_cgc_cna anno_ens_cna facets_jointsegs_parse_to_gr process_in_list facets_cna_consensus facets_cna_call

Documented in anno_cgc_cna anno_ens_cna facets_cna_call facets_cna_consensus facets_jointsegs_parse_to_gr output_out_list plot_out_list process_in_list

#' Standard Facets CNA call wrapper
#'
#' @import pctGCdata
#' @param input_csv path to CSV format file as per requirements of Facets
#' @param work_dir character path of where files should be found, written (default: ./)
#' @return none, saves plots and writes output in current dir
#' @export

facets_cna_call <- function(input_csv, work_dir = NULL){

  if(is.null(work_dir)){
    work_dir <- "./"
  }
  attachNamespace("pctGCdata")
  tumour <- stringr::str_split(input_csv, "\\.")[[1]][1]
  set.seed(1234)
  snpmat <- facets::readSnpMatrix(input_csv)
  pps <- facets::preProcSample(snpmat)
  oo <- facets::procSample(pps)
  diplogr <- oo$dipLogR
  fit <- facets::emcncf(oo)
  ploidpurdf <- data.frame(PLOIDY = round(fit$ploidy, digits=3),
                           PURITY = round(fit$purity, digits=3))
  readr::write_tsv(ploidpurdf, file = paste0(work_dir, "/", tumour,".fit_ploidy_purity.tsv"))
  readr::write_tsv(round(fit$cncf, 3), file = paste0(work_dir, "/", tumour,".fit_cncf_jointsegs.tsv"))
  pcgr_out <- tibble::tibble(Chromosome = fit$cncf$chrom,
                             Start = as.integer(fit$cncf$start),
                             End = as.integer(fit$cncf$end),
                             Segment_Mean = fit$cncf$cnlr.median)
  readr::write_tsv(pcgr_out, file = paste0(work_dir, "/", tumour,".cncf_jointsegs.pcgr.tsv"))
  grDevices::pdf(file = paste0(work_dir, "/", tumour, ".facets_CNA.pdf"))
    facets::plotSample(x = oo, emfit = fit, plot.type = "both", sname = tumour)
  grDevices::dev.off()
}

#' Consensus CNA estimation and plotting with Facets input
#' All files being matched to be contained in work_dir
#' @import pctGCdata
#' @param cncf_match character matched for a cncf format output of facets
#' @param pp_match character path to a ploidy/purity format output of facets
#' @param dict_file character string name of dictionary file for the fasta used to align data
#' @param tag a string used to tag output
#' @param cgc_bed bedfile from the Cancer Gene Census from COSMIC
#' @param work_dir character path of where files should be found, written (default: ./)
#' @return none, saves plots and data in current dir
#' @export

facets_cna_consensus <- function(cncf_match, pp_match, dict_file, tag, cgc_bed = NULL, work_dir = NULL) {

  ##housekeeping
  options(scipen = 999)
  options(stringAsFactors = FALSE)

  if(is.null(work_dir)){
    work_dir <- "./"
  }

  ##set genome assembly version based on dictfile
  which_genome <- "hg19"
  if(length(grep("38", dict_file))==1) {
    which_genome <- "hg38"
  }

  ##load Cancer Gene Census bed file if supplied and run process
  cgc_gr <- NULL
  if(!is.null(cgc_bed)){
    print("Working on CGC")
    cgc <- utils::read.table(paste0(work_dir, "/", cgc_bed))
    cgc_gr <- GenomicRanges::GRanges(seqnames = cgc[,1],
                                    ranges = IRanges::IRanges(cgc[,2], cgc[,3]),
                                    CGC_gene = unlist(lapply(as.vector(cgc[,4]), function(f){
                                                        strsplit(f, ";")[[1]][1]
                                                      })))
    in_list <- as.list(dir(work_dir, pattern = paste0(cncf_match), full.names = TRUE))
    out_list <- lapply(in_list, function(fin_list){
      somenone::process_in_list(fin_list = fin_list,
                                which_genome = which_genome,
                                pp_match = pp_match,
                                cgc_gr = cgc_gr,
                                work_dir = work_dir)
    })
    somenone::output_out_list(out_list, in_list, dict_file, which_genome, tag = paste0(tag, ".CGC"), cgc_gr = cgc_gr, work_dir = work_dir)
  }

  ##using ENS annotation otherwise/also
  ##set input list based on file pattern
  print("Working on ENS")
  in_list <- as.list(dir(work_dir, pattern = cncf_match, full.names = TRUE))
  out_list <- lapply(in_list, function(fin_list){
    somenone::process_in_list(fin_list, which_genome, pp_match, cgc_gr = NULL, work_dir = work_dir)
  })

  somenone::output_out_list(out_list, in_list, dict_file, which_genome, tag = paste0(tag, ".ENS"), work_dir = work_dir)
}

#' Processing list of Facets input
#'
#' @param fin_list of data from Facets
#' @param which_genome genoem assembly being used "hg19" or "hg38"
#' @param pp_match character path to a ploidy/purity format output of facets
#' @param cgc_gr GRanges of cancer gene census from COSMIC
#' @param work_dir chanracter dir in which files found
#' @return purity-ploidy dataframe, also write rds used
#' @export

process_in_list <- function(fin_list, which_genome, pp_match, cgc_gr, work_dir){

  ##allow CGC, else full Ensembl annotations
  anno <- "ENS"
  if(!is.null(cgc_gr)){
    anno <- "CGC"
  }

  ##iterate over samples
  for(samp in 1:length(fin_list)){
    jointseg <- fin_list[[samp]]
    sampleID <- out_name <- stringr::str_split(
                              rev(strsplit(jointseg, "/")[[1]])[1],
                              "\\.")[[1]][1]

    ##also read and store ploidy:
    ploidypurity <- grep(sampleID, dir(work_dir, pattern = pp_match, full.names = TRUE), value = TRUE)
    if(!file.exists(ploidypurity)){
      stop("Ploidypurity file not found")
    }

    ploidy <- as.vector(utils::read.table(ploidypurity)[2,1])
    purity <- as.vector(utils::read.table(ploidypurity)[2,2])
    pp_df <- data.frame(PLOIDY = ploidy, PURITY = purity)

    print(paste0("Working on: ", sampleID))

    ##run function to make GRangesList
    ##(jointsegs_in, which_genome, cgc_gr = NULL, anno = NULL, bsgenome = NULL)
    grl <- somenone::facets_jointsegs_parse_to_gr(jointseg, sampleID, which_genome, anno = anno, cgc_gr = cgc_gr, work_dir = work_dir)
  }
  return(list(grl, pp_df))
}

#' Parse jointsegs into GRanges object
#'
#' @param jointseg filename (output from Facets) with columns:
#'    "seg", "num.mark", "nhet", "cnlr.median", "mafR", "segclust", "cnlr.median.clust", "mafR.clust", "cf.em", "tcn.em", "lcn.em"
#' @param sampleID the sample identifier, taken as the string before first dot in jointseg
#' @param which_genome the genome assembly used, "hg19" or "hg38"
#' @param cgc_gr GRanges object of cancer gene census from COSMIC
#' @param anno annotation strategy, "ENS" or "CGC"
#' @param bsgenome which version of bsgenome to use
#' @param work_dir character path of where files should be found, written (default: ./)
#' @return a GRanges object with annotated jointsegs from input
#' @export

facets_jointsegs_parse_to_gr <- function(jointseg, sampleID, which_genome, anno = NULL, cgc_gr = NULL, bsgenome = NULL, work_dir){

  ##bed file from: https://cancer.sanger.ac.uk/census
  ##NB requires login hence not provided (but parameters exist to supply
  ##credentials for download-references.nf in somatic_n-of-1 workflow)
  if(!is.null(cgc_gr)){
    anno <- "CGC"
  }

  ##annotation source
  if(is.null(anno)){
    anno <- "ENS"
  }

  ##default genome version is hg38
  if(which_genome == "hg19"){
    bsgenome <- BSgenome.Hsapiens.UCSC.hg19::BSgenome.Hsapiens.UCSC.hg19
    gene_symbols <- c("GRCh37_v75_SYMBOL", "count_GRCh37_v75_SYMBOLs")
  } else {
    bsgenome <- BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38
    gene_symbols <- c("GRCh38_v86_SYMBOL", "count_GRCh38_v86_SYMBOLs")
  }

  ##read input
  js <- utils::read.table(jointseg, header=T)

  ##convert chr23 -> X
  if("23" %in% js$chrom){
    js$chrom[js$chrom == 23] <- "X"
  }
  gr <- GenomicRanges::GRanges(seqnames = js$chrom,
                ranges = IRanges::IRanges(start = js$start, end = js$end),
                mcols = js[, c("seg", "num.mark", "nhet", "cnlr.median", "mafR", "segclust", "cnlr.median.clust", "mafR.clust", "cf.em", "tcn.em", "lcn.em")])

  if(is.matrix(GenomeInfoDb::mapSeqlevels(GenomeInfoDb::seqlevels(gr), "NCBI"))){
    if(dim(GenomeInfoDb::mapSeqlevels(GenomeInfoDb::seqlevels(gr), "NCBI"))[1]>1) {
      GenomeInfoDb::seqlevels(gr) <- GenomeInfoDb::mapSeqlevels(GenomeInfoDb::seqlevels(gr), "NCBI")[1,]
    } else {
      GenomeInfoDb::seqlevels(gr) <- GenomeInfoDb::mapSeqlevels(GenomeInfoDb::seqlevels(gr), "NCBI")
    }
  } else {
    GenomeInfoDb::seqlevels(gr) <- GenomeInfoDb::mapSeqlevels(GenomeInfoDb::seqlevels(gr), "NCBI")
  }
  GenomeInfoDb::seqinfo(gr) <- GenomeInfoDb::seqinfo(bsgenome)[GenomeInfoDb::seqlevels(gr)]
#  GenomeInfoDb::seqlevelsStyle(gr) <- "NCBI"

  ##overlaps
  if(anno == "ENS"){
    print("Annotating Ensembl genes")
    gr_anno <- anno_ens_cna(gr, which_genome)

    ##rename S4Vectors::mcols
    names(S4Vectors::mcols(gr_anno)) <- c(names(S4Vectors::mcols(gr_anno))[1:9], "Total_Copy_Number", "Minor_Copy_Number", gene_symbols)
    readr::write_tsv(as.data.frame(gr_anno), file = paste0(work_dir, "/", sampleID, ".facets.CNA.ENS.tsv"))
    saveRDS(gr_anno, file = paste0(work_dir, "/", sampleID, ".facets.CNA.ENS.RData"))
  }

  if(anno == "CGC"){
    print("Annotating Cancer Gene Census genes")
    gr_anno <- somenone::anno_cgc_cna(gr, cgc_gr, which_genome)

    ##rename S4Vectors::mcols
    names(S4Vectors::mcols(gr_anno)) <- c(names(S4Vectors::mcols(gr_anno))[1:9], "Total_Copy_Number", "Minor_Copy_Number", "CGC_SYMBOL", "n_CGC_SYMBOLs")
    readr::write_tsv(as.data.frame(gr_anno), file = paste0(work_dir, "/", sampleID, ".facets.CNA.CGC.tsv"))
    saveRDS(gr_anno, file = paste0(work_dir, "/", sampleID, ".facets.CNA.CGC.RData"))
  }

  return(gr_anno)
}

#' Annotate CNA with ENSembl IDs
#'
#' @param gr GRanges object
#' @param which_genome the genome assembly used, "hg19" or "hg38"
#' @return GRanges object with annotation
#' @export

anno_ens_cna <- function(gr, which_genome){
    ##annotation
    if(which_genome == "hg19"){
      genes <- ensembldb::genes(EnsDb.Hsapiens.v75::EnsDb.Hsapiens.v75)
      gene_symbols <- c("GRCh37_v75_SYMBOL", "count_GRCh37_v75_SYMBOLs")
    } else {
      genes <- ensembldb::genes(EnsDb.Hsapiens.v86::EnsDb.Hsapiens.v86)
      gene_symbols <- c("GRCh38_v86_SYMBOL", "count_GRCh38_v86_SYMBOLs")
    }

    ##used
    genes <- genes[GenomeInfoDb::seqnames(GenomeInfoDb::seqinfo(genes)) %in% GenomeInfoDb::seqnames(GenomeInfoDb::seqinfo(gr)),]
    GenomeInfoDb::seqlevels(genes, pruning.mode="coarse") <- GenomeInfoDb::seqlevels(gr)
    GenomeInfoDb::seqinfo(genes) <- GenomeInfoDb::seqinfo(gr)
    hits <- as.data.frame(GenomicRanges::findOverlaps(gr, genes, ignore.strand=TRUE))

    hits$SYMBOL <- biomaRt::select(org.Hs.eg.db::org.Hs.eg.db,
                                   as.character(genes[hits$subjectHits]$entrezid),
                                   "SYMBOL")$SYMBOL
    gr$SYMBOL <- "-"
    gr$SYMBOLs <- "-"

    ##loop to collapse symbols per region
    print("collapsing gene symbol annotations per region")
    if(dim(hits)[1] > 0){

      for(x in 1:max(hits$queryHits)){
        hitsx <- sort(unique(hits$SYMBOL[hits$queryHits==x]))
        gr$SYMBOL[x] <- paste(hitsx[2:length(hitsx)], collapse=";")
        gr$SYMBOLs[x] <- length(hitsx)-1;
      }
    }

    colnames(S4Vectors::mcols(gr))[colnames(S4Vectors::mcols(gr)) %in% c("SYMBOL", "SYMBOLs")] <- gene_symbols
    return(gr)
}

#' Annotate CNA with cgc_gr IDs
#'
#' @param gr GRanges object
#' @param cgc_gr cancer gene census GRanges
#' @param which_genome the genome assembly used, "hg19" or "hg38"
#' @return GRanges object with annotation
#' @export

anno_cgc_cna <- function(gr, cgc_gr, which_genome){
  hits <- as.data.frame(GenomicRanges::findOverlaps(gr, cgc_gr, ignore.strand=TRUE))
  hits$SYMBOL <- cgc_gr[hits$subjectHits]$CGC_gene
  gr$CGC_SYMBOL <- "-"
  gr$CGC_SYMBOLs <- "-"
  if(dim(hits)[1] > 0){
    ##loop to collapse symbols per region
    for(x in 1:max(hits$queryHits)){
      hitsx <- as.vector(sort(unique(hits$SYMBOL[hits$queryHits==x])))
      hitsx <- hitsx[!is.na(hitsx)]
      if(length(hitsx)==0){gr$CGC_SYMBOL[x] <- NA; gr$CGC_SYMBOLs[x] <- 0}
      else{
        gr$CGC_SYMBOL[x] <- paste(hitsx[2:length(hitsx)], collapse=";")
        gr$CGC_SYMBOLs[x] <- length(hitsx)-1;
      }
    }
  }

  return(gr)
}

#' Plotting and Output from 'out_list' object
#'
#' @param out_list resulting from process_in_list
#' @param in_list input to process_in_list
#' @param dict_file dictionary file of fasta used to align BAMs
#' @param which_genome the genome assembly used, "hg19" or "hg38"
#' @param tag runID and annotation strategy, "ENS" or "CGC"
#' @param cgc_gr GRanges of cancer gene census
#' @param work_dir character path of where files should be found, written
#' @return none writes output files
#' @export

output_out_list <- function(out_list, in_list, dict_file, which_genome, tag, cgc_gr = NULL, work_dir){

  ##separate elements in list object into list objects
  facets_list <- lapply(out_list, function(f){
    ff <- f[[1]]
    names(ff) <- ff$mcols.seg
    return(ff)
  })
  pp_list <- lapply(out_list, function(f){
    return(f[[2]])
  })

  samples <- names(pp_list) <- names(facets_list) <- unlist(lapply(in_list, function(f){
    stringr::str_split(basename(f),"\\.")[[1]][1]
  }))

  ##list of only those CNA (i.e. not TCN == 2)
  cna_list <- lapply(facets_list, function(f){
    fo <- f[S4Vectors::mcols(f)$Total_Copy_Number != 2]
    sort(unique(c(fo)))
  })

  ##make bed, bedr:: and GRanges from that
  ps_vec <- c("mcols.seg",
              "mcols.cnlr.median",
              "Total_Copy_Number",
              "Minor_Copy_Number")

  cna_master_gr <- somenone::master_intersect_cna_grlist(cna_list, ps_vec, which_genome)

  ##annotate
  if(length(cna_master_gr)>0){
    anno <- "ENS"
    if(!is.null(cgc_gr)){
      anno <- "CGC"
      cna_master_anno_gr <-  somenone::anno_cgc_cna(cna_master_gr, cgc_gr, which_genome)
    } else {
      cna_master_anno_gr <-  somenone::anno_ens_cna(cna_master_gr, which_genome)
    }

    GenomeInfoDb::seqinfo(cna_master_anno_gr) <- GenomeInfoDb::seqinfo(cna_list[[1]])
    cna_master_anno_df <- as.data.frame(cna_master_anno_gr)
    cna_master_anno_df$width <- cna_master_anno_df$end - cna_master_anno_df$start
    readr::write_tsv(cna_master_anno_df, file = paste0(work_dir, "/", tag, ".facets.CNA.master.tsv"))

    ##write output of CNA analysis to XLSX
    cna_df_list <- lapply(cna_list, as.data.frame)
    cna_df_list$master_anno_df <- cna_master_anno_df

    ##replace NAs
    cna_df_list_na <- lapply(cna_df_list, function(f){
      f[is.na(f)] <- "-"
      return(tibble::as_tibble(f))
    })

    ##summarise master gr
    summ_tb <- somenone::summarise_master(cna_master_anno_gr)
    cna_df_list_na$summary <- as.data.frame(summ_tb)

    openxlsx::write.xlsx(cna_df_list_na, file = paste0(work_dir, "/", tag, ".facets.CNA.full.xlsx"))

    ##plot
    somenone::plot_out_list(cna_list, pp_list, dict_file, which_genome, tag, samples, write_out = TRUE, max_cna_maxd = 8, sample_map = NULL, work_dir)
  } else {
    cna_master_df <- as.data.frame(cna_master_gr)
    readr::write_tsv(cna_master_df, file = paste0(work_dir, "/", tag, ".facets.CNA.master.tsv"))
  }
}

#' Plotting from 'out_list' object
#'
#' @param cna_list GRanges in list of facets results
#' @param pp_list ploidy/purity list
#' @param dict_file dictionary file of fasta used to align BAMs
#' @param which_genome the genome assembly used, "hg19" or "hg38"
#' @param tag runID and annotation strategy, "ENS" or "CGC"
#' @param samples vector of sampleIDs (use sample_map to change naming in plots)
#' @param write_out runID and annotation strategy, "ENS" or "CGC"
#' @param max_cna_maxd runID and annotation strategy, "ENS" or "CGC"
#' @param sample_map named vector, names are all in samples, elements are new names
#' @param work_dir character path of where files should be found, written
#' @return none writes output files
#' @export

plot_out_list <- function(cna_list, pp_list, dict_file, which_genome, tag, samples, write_out = TRUE, max_cna_maxd = 8, sample_map = NULL, work_dir){

  if(length(dir(pattern = paste0(work_dir, "/", tag, ".facets_consensus.plot.pdf"))) == 1){
    print(paste0("tag: ", tag, " has already been used, please use another"))
  } else {

    ##make CNA df
    cna_df_list <- lapply(seq_along(samples), function(x){
      if(length(cna_list[[x]])>0){
        print(samples[x])
        cna_dfb <- as.data.frame(cna_list[[x]], row.names = seq(1:length(cna_list[[x]])))
        cna_dfb$purity <- unlist(rep(pp_list[[x]][2], length(cna_dfb[,1])))
        cna_dfb$ploidy <- unlist(rep(pp_list[[x]][1], length(cna_dfb[,1])))
        if(is.null(sample_map)){
          cna_dfb$sampleID <- samples[x]
        } else {
          cna_dfb$sampleID <- as.vector(sample_map[samples[x]])
        }
        if(write_out == TRUE){
          readr::write_tsv(cna_dfb,file = paste0(work_dir, "/", samples[x],".facets.CNA.jointsegs.tsv"))
        }
        return(cna_dfb)
      } else {
        if(write_out == TRUE){
          readr::write_tsv(as.data.frame(cna_list[[x]]), file = paste0(work_dir, "/", samples[x], ".facets.CNA.jointsegs.tsv"))
        }
      }
    })

    cna_df <- do.call(rbind, cna_df_list)

    ##remove character chromosomes NB no MT, GL...
    sqn <- gsub("chr", "", as.vector(cna_df[,1]))
    sqn[sqn=="X"] <- 23
    sqn[sqn=="Y"] <- 24
    cna_df$seqnames <- sqn

    seqlengths <- somenone::seqlengths_df(unique(cna_df[,1]), dict_file, which_genome)

    cna_df <- cna_df[cna_df$seqnames %in% seqlengths$seqnames,]
    cum_sum_add <- unlist(apply(cna_df, 1, function(x) {
                cum_sum_off <- seqlengths$cum_sum_0[seqlengths$seqnames %in% x[1]]
                return(cum_sum_off)
    }))

    ##plotting
    plot_start <- plot_end <- cna_call <- minor_call <- purity <- ploidy <- diploid <- colour <- sample <- seqlength <- cum_sum_0 <- cum_sum_centro <- NULL

    ##set maximum plotted CNA
    cna_maxd <- cna_df$Total_Copy_Number
    if(max(cna_maxd) > max_cna_maxd){
      cna_maxd[cna_maxd > max_cna_maxd] <- max_cna_maxd
    }

    if(length(cna_maxd)>0){
      ##colours for plotting

      colz <- c("blue", "darkblue", "black", "darkred", "firebrick3", "red2", rep("red",199))
      names(colz) <- c(seq(from=0,to=198,by=1))
      cnames <- sort(unique(cna_maxd))
      colz <- colz[is.element(names(colz), cnames)]

      ##plot data.frame
      plot_df <- data.frame(row.names=seq(from = 1,to = dim(cna_df)[1], by = 1),
                            seqnames = cna_df[,1],
                            plot_start = cna_df[,2] + (cum_sum_add - 1),
                            plot_end = (cna_df[,3] - 1) + cum_sum_add,
                            cna_call = as.numeric(cna_maxd),
                            minor_call = as.numeric(unlist(cna_df$Minor_Copy_Number)),
                            purity = round(as.numeric(cna_df$purity)),
                            ploidy = round(as.numeric(cna_df$ploidy)),
                            diploid = 2,
                            colour = plyr::mapvalues(cna_maxd, cnames, colz),
                            sample = unlist(cna_df$sampleID))

      ##plot
      ggp <- ggplot2::ggplot() +
             ggplot2::geom_hline(data = plot_df,
                    ggplot2::aes(yintercept = ploidy),
                        linetype = 1,
                        color = "purple",
                        size = 0.5) +
             ggplot2::geom_hline(data = plot_df,
                    ggplot2::aes(yintercept = diploid),
                        linetype = 1,
                        color = "orange",
                        size = 0.5) +
             ggplot2::geom_vline(data = seqlengths,
                         ggplot2::aes(xintercept = cum_sum_0),
                         linetype = 1,
                         color = "dodgerblue",
                         size = 0.2) +
             ggplot2::geom_vline(data = seqlengths,
                         ggplot2::aes(xintercept = cum_sum_centro),
                         linetype = 4,
                         color = "grey",
                         size = 0.2) +
             ggplot2::geom_segment(data = plot_df,
                        ggplot2::aes(x = plot_start,
                         xend = plot_end,
                         y = cna_call,
                         yend = cna_call,
                         colour = colour,
                         size = 4.5)) +
             ggplot2::scale_colour_identity() +
             ggplot2::geom_segment(data = plot_df,
                        ggplot2::aes(x = plot_start,
                         xend = plot_end,
                         y = minor_call,
                         yend = minor_call,
                         colour = "forestgreen",
                         size = 2)) +
             ggplot2::scale_x_continuous(name = "Chromosome",
                         labels = as.vector(seqlengths$seqnames),
                         breaks = seqlengths$cum_sum_centro) +
             ggplot2::scale_y_continuous(name = "Total CNA (facets tcn.em)",
                         labels = seq(from = 0, to = max_cna_maxd, by = 1),
                         breaks = seq(from = 0, to = max_cna_maxd, by = 1),
                         limits = c(0, max_cna_maxd)) +
             ggplot2::theme(axis.text.x = ggplot2::element_text(size = 5),
                         panel.grid.major = ggplot2::element_blank(),
                         panel.grid.minor = ggplot2::element_blank(),
                         legend.position="none") +
             ggplot2::facet_grid(sample ~ .)

      ggplot2::ggsave(filename = paste0(work_dir, "/", tag, ".facets_consensus.plot.pdf"), plot = ggp)
    } else {
      pdf(paste0(work_dir, "/", tag, ".EMPTY.facets_consensus.plot.pdf"))
      plot.new()
      dev.off()
    }
  }
}

#' Create lengths of chromosomes for plotting
#'
#' @param in_seqs a jointsegs filename (output from Facets) with columns:
#'    "seg", "num.mark", "nhet", "cnlr.median", "mafR", "segclust", "cnlr.median.clust", "mafR.clust", "cf.em", "tcn.em", "lcn.em"
#' @param dict_file dictionary file for fasta used in alignment
#' @param which_genome the genome assembly used, "hg19" or "hg38"
#' @return a GRanges object with annotated jointsegs from input
#' @export

seqlengths_df <- function(in_seqs, dict_file, which_genome){

  V1 <- V3 <- V4 <- V5 <- seqnames <- NULL

  ##create centromere data
  url19 <- paste0("http://hgdownload.cse.ucsc.edu/goldenpath/hg19/database/cytoBand.txt.gz")
  url38 <- paste0("http://hgdownload.cse.ucsc.edu/goldenpath/hg38/database/cytoBand.txt.gz")
  temp19 <- tempfile()
  utils::download.file(url19, temp19)
  centroms19 <- tibble::as_tibble(utils::read.table(temp19))
  temp38 <- tempfile()
  utils::download.file(url38, temp38)
  centroms38 <- tibble::as_tibble(utils::read.table(temp38, fill=TRUE))

  centromeres <- function(chr, which_genome){
    if(which_genome == "hg19"){
      centrom <- centroms19
    } else {
      centrom <- centroms38
    }
    centrout <- dplyr::mutate(centrom, seqnames = gsub("chr", "", V1))
    centrout <- dplyr::select(centrout, seqnames, -V1, dplyr::everything())
    centrout <- dplyr::filter(centrout, V5 %in% "acen")
    centrout <- dplyr::filter(centrout, V4 %in% grep("p11", V4, value=T))
    centrout <- dplyr::filter(centrout, seqnames %in% chr)
    centrout <- dplyr::select(centrout, seqnames, centromere = V3)
    return(centrout)
  }

  ##parse dict
  dict_seqs <- utils::read.table(dict_file, skip=1)
  seqlens <- data.frame(seqnames = gsub("SN:","", dict_seqs[,2]),
                        start = 1,
                        end = as.numeric(gsub("LN:","", dict_seqs[,3])),
                        stringsAsFactors = FALSE)

  ##find those chromosomes in inSeqs
  seqlengths <- seqlens[is.element(seqlens[,1],
                     unique(as.vector(in_seqs))),]

  if(dim(seqlengths)[1] > 0) {
    ## make centromere data from function
    ##load centromere
    centromere <- centromeres(seqlengths[,1], which_genome)
    seqlengths <- dplyr::left_join(seqlengths, centromere, by = "seqnames")

    ##non-numeric chr IDs are numeric as rownames!
    ##need to output a table to convert between inSeqs and newSeqs
    seqlengths$cum_sum_0 <- c(1,cumsum(as.numeric(seqlengths$end))[1:length(seqlengths[,1])-1])
    seqlengths$cum_sum_1 <- c(cumsum(as.numeric(seqlengths$end)))
    seqlengths$cum_sum_centro <- seqlengths$centromere + seqlengths$cum_sum_0
  }

  return(seqlengths)

}

#' Find intersect from list of CNA GRanges
#'
#' @param cna_list list of named GRanges objects
#' @param ps_vec mcols columns to keep p(er) s(ample; appended with list element's name)
#' @param which_genome the genome assembly used, "hg19" or "hg38"
#' @return GRanges object of all CNA, intersecting or not
#'         includes the ranges and each mcol from ps_vec, annotated `ps_vec[x].sample`
#' @export

master_intersect_cna_grlist <- function(gr_list, ps_vec, which_genome){

  ##check gr is A GRanges object
  if(!as.vector(class(gr_list)) %in% c("GRangesList", "list")){
    stop("Input \'gr_list\' is not a GRangesList nor list object, retry")
  }

  ##use bedr::bedr.join.multiple.region to make a complete set and samples
  ##lapply over gr_list which creates character vector list in "chrX:1-100" format
  ##thinks it's 0-based so add 1...
  if(length(gr_list) > 1){

    join_chr_all_gr <- collapse_gr_list(gr_list, which_genome)

    ##sample names
    samples <- unique(unlist(lapply(unique(join_chr_all_gr$sampleIDs), function(s){
        stringr::str_split(s, ",")[[1]]
      })))

    ##using the above as a master GRanges object, walk through per sample
    ##create per sample df to be added to mcol
    S4Vectors::mcols(join_chr_all_gr) <- c(S4Vectors::mcols(join_chr_all_gr), do.call(cbind, master_mcols(gr_list, join_chr_all_gr, ps_vec)))

    ##return GRanges with mcols per sample

  } else{
    join_chr_all_tb <- tibble::as_tibble(gr_list[[1]])
    join_chr_all_tb$sampleIDs <- rep(names(gr_list), times = dim(join_chr_all_tb)[1])
    join_chr_all_gr <- GenomicRanges::GRanges(join_chr_all_tb)

    ##return single GRanges

  }

  if(length(join_chr_all_gr)!=0){
    join_chr_all_gr <- loh_summary(join_chr_all_gr)
  }
  return(join_chr_all_gr)
}

#' Function to make master table mcols
#'
#' @param gr_list same input as to master_intersect_cna_grlist()
#' @param which_genome the genome assembly used, "hg19" or "hg38"
#' @return join_chr_all_gr, GRanges of joined samples
#' @export

collapse_gr_list <- function(gr_list, which_genome) {

  ##iterate over gr_list and return bed object list
  chr_list <- lapply(seq_along(gr_list), function(f){
      print(names(gr_list)[f])
      if(length(gr_list[[f]])>0){
        fb <- apply(as.data.frame(gr_list[[f]]), 1, function(ff){
            ff <- unlist(ff)
            paste0("chr", ff[1], ":", gsub(" ", "", ff[2]), "-", gsub(" ", "", ff[3]))
          })
        bedr::bedr.sort.region(fb)
      }
    })

  ##name chr_list
  names(chr_list) <- names(gr_list)

  ##remove any NULL (i.e. no regions in that sample)
  chr_nn_list <- plyr::compact(chr_list)

  ##join non-NULL
  join_chr_all_tb <- tibble::as_tibble(bedr::bedr.join.multiple.region(chr_nn_list, build = which_genome))

  print("Joining into single GRanges...")
  join_chr_all_gr_tb <- tidyr::separate(data = join_chr_all_tb,
                                        col =  index,
                                        into = c("seqnames", "start", "end"),
                                        sep = "[:-]")
  join_chr_all_gr_tb <- dplyr::select(.data = join_chr_all_gr_tb,
                                      seqnames,
                                      start, end,
                                      "samples_n" = n.overlaps,
                                      "sampleIDs" = names)
  join_chr_all_gr <- GenomicRanges::GRanges(seqnames = gsub("chr", "", unlist(join_chr_all_gr_tb[,1])),
                     ranges = IRanges::IRanges(start = as.numeric(unlist(join_chr_all_gr_tb[,2])),
                                               end = as.numeric(unlist(join_chr_all_gr_tb[,3]))),
                     strand = NULL,
                     mcols = join_chr_all_gr_tb[,c("samples_n", "sampleIDs")],
                     seqinfo = GenomicRanges::seqinfo(gr_list[[1]]))
  join_chr_all_gr <- GenomeInfoDb::sortSeqlevels(join_chr_all_gr)
  join_chr_all_gr <- sort(join_chr_all_gr)
  adr <- as.data.frame(join_chr_all_gr)
  GenomicRanges::width(join_chr_all_gr) <- adr$end - adr$start

  colnames(S4Vectors::mcols(join_chr_all_gr)) <- gsub("mcols.", "", colnames(S4Vectors::mcols(join_chr_all_gr)))

  return(join_chr_all_gr)
}

#' Function to make master table mcols
#' @param gr_list same input as to master_intersect_cna_grlist()
#' @param gr_master object resulting from
#' @param ps_vec the genome assembly used, "hg19" or "hg38"
#' @return list of master_df, data frame of master results
#' @export

master_mcols <- function(gr_list, gr_master, ps_vec){
  lapply(seq_along(gr_list), function(ff){
    ##first GRanges object
    gr_ff <- gr_list[[ff]]

    ##where gr_ff intersects with the supplied ranges
    ##NB this will almost certainly be a subset of gr_master ranges
    hits <- base::as.data.frame(GenomicRanges::findOverlaps(gr_ff, gr_master, ignore.strand = TRUE, minoverlap = 2))

    ##due to being a subset, we insert NA rows for those missing in subjectHits (master_gr)
    hits_list <- lapply(seq_along(gr_master), function(x){
      if(x %in% hits$subjectHits){
        ho <- hits[hits$subjectHits == x,]
        rownames(ho) <- x
        return(ho)
      } else {
        ho <- data.frame(queryHits = NA, subjectHits = x)
        rownames(ho) <- x
        return(ho)
      }
    })

    hits_na <- do.call(rbind, hits_list)

    ##mcols of those hits
    # mchits <- S4Vectors::mcols(gr_ff[, ps_vec][hits_na$queryHits])
    mchits <- S4Vectors::mcols(gr_ff[, ps_vec])

    ##name based on list names and return
    names(mchits) <- paste0(names(gr_list)[ff], ".", gsub("mcols.", "", names(mchits)))

    ##create master output, same size as gr_master and with mcols from queryHits
    ##in the place where subjectHits matched
    master_df <- as.data.frame(matrix(nrow = length(gr_master),
                                      ncol = length(names(mchits))))
    colnames(master_df) <- names(mchits)

    ##place mchits annotation as per subjectHits
    for(x in 1:length(gr_master)){
      sh <- hits$subjectHits
      master_df[hits[sh %in% x, 2],] <- unlist(mchits[hits[sh %in% x, 1],])
    }
    return(master_df)
  })
}

#' Allow tunable reformatting of plots
#' @param pattern string e.g. "facets.CNA.CGC.tsv" as output by main functions
#' @param dict_file dictionary file of fasta used to align BAMs
#' @param which_genome the genome assembly used, "hg19" or "hg38"
#' @param tag write output with this tag prefix
#' @param write_out write tsv output?
#' @param max_cna_maxd maximum CNA to show on plot
#' @param sample_map named vector, names are filenames before first dot, elements are new names
#' @param work_dir character path of where files should be found, written (default: ./)
#' @return none, prints a plot and writes output if flag used
#' @export

plot_from_tsv <- function(pattern, dict_file, which_genome, tag, write_out = FALSE, max_cna_maxd = 8, sample_map = NULL, work_dir){
  files_in <- dir(pattern = pattern)
  out_list <- lapply(files_in, function(f){
    ff <- readr::read_tsv(f)
    colnames(ff) <- gsub("mcols.", "", colnames(ff))
    gr <- GenomicRanges::GRanges(seqnames = ff$seqnames,
                  ranges = IRanges::IRanges(start = ff$start, end = ff$end),
                  mcols = ff[, c("seg", "num.mark", "nhet", "cnlr.median", "mafR", "segclust", "cnlr.median.clust", "mafR.clust", "cf.em", "Total_Copy_Number", "Minor_Copy_Number", rev(colnames(ff))[5], rev(colnames(ff))[4])])
    return(list(gr, data.frame(PLOIDY = unique(ff$ploidy), PURITY = unique(ff$purity))))
    })
  names(out_list) <- samples <- gsub("\\.", "", gsub(pattern, "", files_in))

  facets_list <- lapply(out_list, function(f){
    ff <- f[[1]]
    names(ff) <- ff$mcols.seg
    return(ff)
  })
  pp_list <- lapply(out_list, function(f){
    return(f[[2]])
  })

  ##list of only those CNA (i.e. not TCN == 2)
  cna_list <- lapply(facets_list, function(f){
    fo <- f[S4Vectors::mcols(f)$Total_Copy_Number != 2]
    sort(unique(c(fo)))
  })

  somenone::plot_out_list(cna_list, pp_list, dict_file, which_genome, tag, samples, write_out = TRUE, max_cna_maxd, sample_map, work_dir)
}

#' Summarise master table
#' @param cna_master_anno_gr annotated GRanges from master_intersect_cna_grlist + anno_cgc_cna
#' @return list of various delights per sampleIDs combinations extant
#' @export

summarise_master <- function(cna_master_anno_gr){

  cna_master_anno_tb <- tibble::as_tibble(cna_master_anno_gr)
  genome_size <- sum(GenomeInfoDb::seqlengths(GenomeInfoDb::seqinfo(cna_master_anno_gr)))

  ##find widths for each set of sampleIDs found overlapped
  width_unique_list <- lapply(unique(cna_master_anno_tb$sampleIDs), function(f){
    ff <- dplyr::filter(.data = cna_master_anno_tb, sampleIDs %in% f)
    dplyr::summarise(.data = ff, width)
  })

  ##sum those to get summary
  width_sum_unique_list <- lapply(width_unique_list, function(f){
    return(list(sum = sum(f), prop = sum(f)/genome_size))
  })

  ##genes affected by those
  symbol <- grep("_SYMBOL", colnames(cna_master_anno_tb), value = TRUE)[1]

  genes_affected_list <- lapply(unique(cna_master_anno_tb$sampleIDs), function(f){
    ff <- dplyr::filter(.data = cna_master_anno_tb, sampleIDs %in% f)
    fg <- dplyr::summarise(.data = ff, !!as.symbol(symbol))
    fgo <- unlist(na.omit(fg))
    fgo <- gsub("NA;", "", fgo)
    fgo[fgo!="-"]
  })

  #losses and gains
  summarise_tcn <- function(rowin){
    apply(rowin, 1, function(r){
      rowina <- na.omit(r)
      if(all(rowina>2)){
        return("GAIN")
      } else if(all(rowina < 2)){
        return("LOSS")
      } else {
        return("BOTH")
      }
    })
  }

  tcn_summary_list <- lapply(unique(cna_master_anno_tb$sampleIDs), function(f){
    ff <- dplyr::filter(.data = cna_master_anno_tb, sampleIDs %in% f)
    ft <- dplyr::select(.data = ff, tidyselect::ends_with("Total_Copy_Number"))
    table(summarise_tcn(ft))
  })

  ##name all the same
  names(tcn_summary_list) <- names(genes_affected_list) <- names(width_unique_list) <- names(width_sum_unique_list) <- unique(cna_master_anno_tb$sampleIDs)

  ##create table output
  genesaf <- unlist(lapply(genes_affected_list, function(f){paste(sort(f), collapse = ",")}))
  genesaf <- gsub(",NA", "", gsub("NA,", "", genesaf))

  summ_tb <- tibble::tibble(sampleIDs = unique(cna_master_anno_tb$sampleIDs),
                            tcn_summary = unlist(lapply(tcn_summary_list, function(f){paste(names(f), collapse = ";")})),
                            tcn_ns = unlist(lapply(tcn_summary_list, function(f){paste(f, collapse = ";")})),
                            width_sum_total = unlist(lapply(width_sum_unique_list, function(f){f$sum})),
                            width_sum_prop = unlist(lapply(width_sum_unique_list, function(f){f$prop})),
                            n_genes_affected = unlist(lapply(genesaf, function(f){length(strsplit(f, ",")[[1]])})),
                            genes_affected = genesaf)

  return(dplyr::arrange(.data = summ_tb, desc(width_sum_prop)))
}

#' Take input of a GRanges object, and use that seqinfo to bin by bin_size
#'
#' @param gr GRanges object with seqinfo used to make bins
#' @param genome use this to add seqinfo if none extant (e.g. "hg19", "hg38")
#' @param bin_size base pairs by which to bin
#' @param mcol_in string naming mcols to use for binning (one only)
#' @param mcol_in_lim numeric on which to screen mcol_in column; if length>1, take all outside the range suplied
#' @param one_to_x logical indicating whether to use chr1..chrX for bin, as opposed to what is in input GRanges
#' @return GRanges binned on mcol_in
#' @export

bin_granges <- function(gr, genome = NULL, bin_size=NULL, mcol_in, mcol_in_lim = NULL, one_to_x = TRUE){

  ##check gr is A GRanges object
  if(!as.vector(class(gr)) %in% c("GRanges", "GRangesList", "list")){
    stop("Input \'gr\' is not a GRanges, GRangesList nor list object, retry")
  }

  ##if grList, set gr to a single entity therein for binning
  if(as.vector(class(gr)) %in% c("GRangesList", "list")){
    gri <- gr[[1]]
  } else {
    if(!as.vector(class(gr)) == "GRanges"){
      stop("Input \'gr\' list does not contain a GRanges object, retry")
    } else {
      gri <- gr
    }
  }

  ##test seqinfo
  print("Testing seqinfo on GRanges input...")
  gri <- test_seqinfo(gr = gri, genome = genome)

  ##set default
  if(is.null(bin_size)){
    print("Bin size is not specified, setting to 10MB")
    bin_size <- 10000000
  } else {
    bin_size <- as.numeric(bin_size)
  }

  ##make bin gr
  print("Making bins...")
  grb <- bin_maker(grr = gri, bin_size = bin_size, genome = genome, one_to_x = TRUE)

  ##screen based on mcols_in_lim
  if(!is.null(mcol_in_lim)){
    if(length(mcol_in_lim) == 1) {
      grf <- gri[unlist(S4Vectors::mcols(gri[,mcol_in]))!=mcol_in_lim]
    } else {
      grf <- gri[unlist(S4Vectors::mcols(gri[,mcol_in]))<mcol_in_lim[1] & unlist(S4Vectors::mcols(gri[,mcol_in]))>mcol_in_lim[2] ]
    }
  } else {
    grf <- gri
  }

  ##findoverlaps, mcols on mcol_in
  print("Finding overlaps...")
  hits <- as.data.frame(GenomicRanges::findOverlaps(grf, grb, ignore.strand=TRUE, type = "any"))
  bin_mcol_qh <- S4Vectors::mcols(grf)[hits$queryHits, mcol_in]
  S4Vectors::mcols(grb)[mcol_in] <- NA
  S4Vectors::mcols(grb)[hits$subjectHits, mcol_in] <- bin_mcol_qh

  return(grb)

  ##old
  ##N.B. that hits can include multiple 'subjectHits', i.e. bins can be hit by more that one CNA
  ##here we test if: those are the same value (and include just that value)
  ##different value with one == 2, include other
  ##fail, reduce bin size below
  # grb$x <- S4Vectors::mcols(grf[hits$queryHits, mcol_in])
  # gr$CGC_SYMBOLs <- "-"
  # ##loop to collapse symbols per region
  # for(x in 1:max(hits$queryHits)){
  #   hitsx <- as.vector(sort(unique(hits$SYMBOL[hits$queryHits==x])))
  #   hitsx <- hitsx[!is.na(hitsx)]
  #   if(length(hitsx)==0){gr$CGC_SYMBOL[x] <- NA; gr$CGC_SYMBOLs[x] <- 0}
  #   else{
  #     gr$CGC_SYMBOL[x] <- base::paste(hitsx[2:length(hitsx)], collapse=";")
  #     gr$CGC_SYMBOLs[x] <- length(hitsx)-1;
  #   }
  # }
}

#' Take input of a GRanges object, and use that seqinfo to bin by bin_size
#'
#' @param grr GRanges object with seqinfo used to make bins
#' @param bin_size base pairs by which to bin
#' @param genome use this to add seqinfo if none extant (e.g. "hg19", "hg38")
#' @param one_to_x logical indicating whether to use chr1..chrX for bin, as opposed to what is in input GRanges
#' @return GRanges bin
#' @export

bin_maker <- function(grr, bin_size = 30000, genome = NULL, one_to_x = TRUE){

  if(one_to_x == FALSE){
    ##new GRanges across that bin based on that seqinfo
    grri <- test_seqinfo(grr, genome = genome)

    sn <- GenomeInfoDb::seqnames(GenomeInfoDb::seqinfo(grri))
    sl <- GenomeInfoDb::seqlengths(GenomeInfoDb::seqinfo(grri))
    seqinf <- GenomeInfoDb::seqinfo(grri)
    genome <- as.vector(GenomeInfoDb::genome(gri)[1])

  } else {
    sn <- c(1:22, "X")
    seqinf <- GenomeInfoDb::fetchExtendedChromInfoFromUCSC(genome)
    seqinf <- seqinf[seqinf$NCBI_seqlevel %in% sn,]
    sl <- seqinf$UCSC_seqlength
    genome <- genome
  }
  ##create IRanges for seqnames and seqlengths
  ##combine into bin GRange
  print(sn)
  print(sl)
  print(seqinf)

  grbList <- lapply(seq_along(sn), function(f){
      st <- seq(from = 0, to = sl[f], by = bin_size)
      nd <- c(seq(st[2], sl[f], by = bin_size), as.vector(sl[f])+1)-1
      grsn <- GenomicRanges::GRanges(seqnames = sn[f],
                                     IRanges::IRanges(start = st, end = nd),
                                     strand = "*")
      GenomeInfoDb::genome(grsn) <- genome
      GenomeInfoDb::seqlengths(grsn) <- sl[f]
      return(grsn)
  })
  grb <- unlist(as(grbList, "GRangesList"))
  S4Vectors::mcols(grb) <- bin_size
  names(S4Vectors::mcols(grb)) <- "bin_size"
  return(grb)
}

#' Test for seqinfo

#' @param gr GRanges to test for seqinfo
#' @param genome string indicating genome to add seqinfo from if missing
#' @return GRanges gr with seqinfo from genome if required
#' @export

test_seqinfo <- function(gr, genome = NULL){
  adf <- as.data.frame(GenomeInfoDb::seqinfo(gr))[1,1]
  if(is.na(adf)){
    if(is.null(genome)){
      stop("Input has no seqinfo defined, and no genome specified, please retry with seqinfo or genome")
    } else {
      print(paste0("Using ", genome, " to get seqinfo"))
      seqinf <- GenomeInfoDb::fetchExtendedChromInfoFromUCSC(genome)
      seqinf <- seqinf[seqinf$NCBI_seqlevel %in% GenomeInfoDb::seqlevels(gr),]
      GenomeInfoDb::seqlengths(gr) <- seqinf$UCSC_seqlength
      GenomeInfoDb::genome(gr) <- genome
      return(gr)
    }
  } else {
    print("Input has seqinfo already")
    return(gr)
  }
}

#' LOH summary: find LOH in GRanges

#' @param join_chr_all_gr GRanges from master_mcols do.call cbind
#' @return GRanges gr with LOH_sum,samples
#' @export

loh_summary <- function(join_chr_all_gr){

  ##find 0 Minor_Copy_Number, create column per sample and overall column
  ##overall indicates if every position in each sample is 0

  join_chr_all_tb <- tibble::as_tibble(join_chr_all_gr)
  mcn <- dplyr::select(.data = join_chr_all_tb, tidyselect::contains("Minor_Copy_Number"))

  ##total sums of mcn
  mut_ret <- function(ro){ sum(na.omit(ro)) }
  LOH_sum <- unlist(apply(mcn, 1, mut_ret))

  LOH_samples <- apply(mcn, 1, function(f){
    flist <- lapply(names(f), function(ff){
      pn <- c()
      if(!is.na(f[ff]) & as.numeric(f[ff])==0){
        pn <- c(pn, ff)
      }
    })
    flist[sapply(flist, is.null)] <- NULL
    paste(flist, collapse=",")
  })

  S4Vectors::mcols(join_chr_all_gr) <- data.frame(S4Vectors::mcols(join_chr_all_gr), LOH_sum = LOH_sum, LOH_samples = LOH_samples)
  return(join_chr_all_gr)
}
brucemoran/somenone documentation built on Nov. 1, 2022, 3:56 p.m.