R/module_Circos.R

Defines functions v_Circos v_prepareVdata_Circos

Documented in v_Circos v_prepareVdata_Circos

#' Prepare data for module: Circos
#'
#' The workflow of vigilante is highly module-based. To ensure a successful and smooth run, vigilante needs to prepare input data before continuing.
#'
#' @param doBAF logic, whether to prepare B Allele Frequency (BAF) data, if FALSE, no BAF data will be available for downstream v_Circos function and no BAF track will be shown on Circos plots; if TRUE, please make sure BAF data files are properly named, see Details for more information about file naming.
#' @param doCNA logic, whether to prepare Copy Number Alteration (CNA) data, if FALSE, no CNA data will be available for downstream v_Circos function and no CNA track will be shown on Circos plots; if TRUE, please make sure CNA data files are properly named, see Details for more information about file naming.
#' @param doMB mixed vector, c(FALSE, "somatic", "germline"), whether to prepare Mutation Burden (MB) data, if FALSE, no MB data will be available for downstream v_Circos function and no MB track will be shown on Circos plots; if TRUE, please follow the instruction and run v_llpMultiLoader function first. Also, if TRUE, specify the mutation type, can be either "somatic" or "germline" or both, each type will be shown as a separate track on Circos plots.
#' @param doGE mixed vector, c(FALSE, "log"), whether to prepare Gene Expression (GE) data, if FALSE, no GE data will be available for downstream v_Circos function and no GE track will be shown on Circos plots; if TRUE, please make sure GE data files are properly named, see Details for more information about file naming. Also, if TRUE, specify the value format, can be either "log" or "TPM", choose "TPM" for showing raw TPM value, choose "log" for showing value after log10 conversion (0 will be converted into negative value based on the value range across all samples).
#' @param doFL logic, whether to prepare Fusion Link (FL) data, if FALSE, no FL data will be available for downstream v_Circos function and no FL track will be shown on Circos plots; if TRUE, please make sure FL data files are properly named, see Details for more information about file naming.
#' @param unifyTrackYlim logic, whether to unify each track's y-axis range based on the value range across all samples, default TRUE.
#'
#' @details
#' Oftentimes input data files generated by upstream tools came with diverse naming conventions. It might be easy for the user to recognize those files, but not for vigilante if there is no recognizable patterns.
#'
#' To make input data files clear to vigilante, it would be nice to have them named something like "studyID_sampleID_(other descriptions).file extension". Here "studyID" is the name of the study or project, and it will be used in multiple naming situations (such as on the plot, or in the output file names), so it is recommended to be concise and meaningful.
#'
#' For module Circos, currently supported input data files are listed below, please contact the author if you want to add more files to the supported list:
#' B Allele Frequency (BAF): *baf.tsv
#' Copy Number Alteration (CNA): *cna.tsv
#' Mutation Burden (MB): see the help file of v_llpMultiLoader function for more information
#' Gene Expression (GE): *cDNA_genes.sf for Salmon
#' Fusion Link (FL): *-fusion-genes.txt for FusionCatcher, *star-fusion*abridged* for STAR-Fusion
#'
#' @return list, because R CMD check discourages assignments to the global environment within functions, user needs to run the function with explicitly assigning the return value to a global variable named "prepareVdata_Circos_returnList", which will be a list containing the required variables for downstream analyses.
#'
#' @import stats magrittr
#'
#' @export
#'
# v_prepareVdata_Circos function
v_prepareVdata_Circos = function(doBAF = FALSE, doCNA = FALSE, doMB = c(FALSE, "somatic", "germline"), doGE = c(FALSE, "log"), doFL = FALSE, unifyTrackYlim = TRUE) {

  # ask user to make sure v_prepareVdata_Circos function return value is assigned to the global variable named "prepareVdata_Circos_returnList"
  status_returnValue = menu(choices = c("Yes", "No"), title = "\nIs v_prepareVdata_Circos function return value assigned to the global variable named 'prepareVdata_Circos_returnList'?")
  if (status_returnValue == 1) {
    print("Function return value is properly assigned, now continue to the next step")
  } else if (status_returnValue == 2) {
    print("Please follow the instruction to assign the function return value before continue to the next step")
    stop()
  } else {
    print("Please choose a valid answer")
    stop()
  }
  rm(status_returnValue)

  # check if globalSettings_returnList exists
  if (!exists("globalSettings_returnList")) {
    print("globalSettings_returnList not detected, please follow the instruction and run v_globalSettings function before continue")
    stop()
  } else {
    print("globalSettings_returnList detected, start extracting global settings")
  }

  # extract global settings from return list
  for (i in 1:length(globalSettings_returnList)) {
    assign(x = names(globalSettings_returnList)[i], value = globalSettings_returnList[[i]])
  }
  rm(i)

  # preset variables for temp_prepareVdata_Circos_returnList
  baf.list.c = NULL
  cna.list.c = NULL
  cna_track_ylim = NULL
  mb_somatic_d.list.c = NULL
  mb_germline_d.list.c = NULL
  mb_track_ylim = NULL
  ge.list.c = NULL
  flf.list.c = NULL
  flt.list.c = NULL
  fl_intraORinter.list.c = NULL
  lb.list.c = NULL
  lb_intraORinter.list.c = NULL

  # if condition for whether to reform BAF data
  if (doBAF == TRUE) {

    # check if BAF data exist
    baf_file_name = dir(path = folder_name, pattern = glob2rx("*baf.tsv"))
    if (length(baf_file_name) == 0) {
      print("BAF data not detected, please double check if BAF data files are properly named, or set doBAF = FALSE to skip BAF data processing")
      stop()
    } else {
      print("Start processing BAF data")
    }

    # load and extract BAF data
    baf_path = paste0("./", folder_name, "/", baf_file_name)
    baf.list = lapply(baf_path, read.table, header = TRUE, stringsAsFactors = FALSE)
    baf.list = lapply(baf.list, function(x) {
      x = cbind(x, x$Position)
      x[, 4] = x[, 3]
      x[, 3] = x[, 2]
      if (length(x[, 1]) > 0) {
        x[, 1] = gsub("chr", "", x[, 1])
        x[, 1] = paste0("chr", x[, 1])
      }
      colnames(x) = c("Chr", "Start", "End", "BAF")
      return(x)
    })
    names(baf.list) = assayID

    # calculate average BAF per sample
    baf.list = lapply(baf.list, function(x) {
      x = dplyr::group_by(x, Chr, Start, End)
      x = dplyr::summarise(x, BAF = mean(BAF))
      x = as.data.frame(x)
      return(x)
    })

    # grouping and calculate average BAF per group
    baf.grp = list()
    for (i in 1:length(grpName)) {
      baf.grp[[i]] = baf.list[names(baf.list) %in% grp.list[[i]]]
    }
    rm(i)
    baf.grp.avg = list()
    baf.grp.avg = lapply(baf.grp, dplyr::bind_rows)
    baf.grp.avg = lapply(baf.grp.avg, function(x) {
      x = dplyr::group_by(x, Chr, Start, End)
      x = dplyr::summarise(x, BAF = mean(BAF))
      x = as.data.frame(x)
    })

    # append grouped list to original list
    baf.list.c = c(baf.list, baf.grp.avg)
    names(baf.list.c) = c(assayID, grpName)

    # remove temp baf data
    rm(baf.grp, baf.grp.avg, baf.list, baf_file_name, baf_path)
    print("BAF data processing completed")
  }

  # if condition for whether to reform CNA data
  if (doCNA == TRUE) {

    # check if CNA data exist
    cna_file_name = dir(path = folder_name, pattern = glob2rx("*cna.tsv"))
    if (length(cna_file_name) == 0) {
      print("CNA data not detected, please double check if CNA data files are properly named, or set doCNA = FALSE to skip CNA data processing")
      stop()
    } else {
      print("Start processing CNA data")
    }

    # load and extract CNA data
    cna_path = paste0("./", folder_name, "/", cna_file_name)
    cna.list = lapply(cna_path, read.table, header = TRUE, stringsAsFactors = FALSE)
    cna.list = lapply(cna.list, function(x) {
      x = cbind(x, x$Position)
      x[, 4] = x[, 3]
      x[, 3] = x[, 2]
      if (length(x[, 1]) > 0) {
        x[, 1] = gsub("chr", "", x[, 1])
        x[, 1] = paste0("chr", x[, 1])
      }
      colnames(x) = c("Chr", "Start", "End", "CNA")
      return(x)
    })
    names(cna.list) = assayID

    # calculate average CNA per sample
    cna.list = lapply(cna.list, function(x) {
      x = dplyr::group_by(x, Chr, Start, End)
      x = dplyr::summarise(x, CNA = mean(CNA))
      x = as.data.frame(x)
      return(x)
    })

    # grouping and calculate average CNA per group
    cna.grp = list()
    for (i in 1:length(grpName)) {
      cna.grp[[i]] = cna.list[names(cna.list) %in% grp.list[[i]]]
    }
    rm(i)
    cna.grp.avg = list()
    cna.grp.avg = lapply(cna.grp, dplyr::bind_rows)
    cna.grp.avg = lapply(cna.grp.avg, function(x) {
      x = dplyr::group_by(x, Chr, Start, End)
      x = dplyr::summarise(x, CNA = mean(CNA))
      x = as.data.frame(x)
    })

    # append grouped list to original list
    cna.list.c = c(cna.list, cna.grp.avg)
    names(cna.list.c) = c(assayID, grpName)

    # set cna_track_ylim for downstream render plot
    cna_track_ylim = lapply(cna.list.c, function(x) {
      x = ceiling(max(abs(x[, 4])))
      return(x)
    })
    if (unifyTrackYlim == TRUE) {
      cna_track_ylim_max = max(unlist(cna_track_ylim))
      cna_track_ylim = lapply(cna_track_ylim, function(x) {
        x = cna_track_ylim_max
        return(x)
      })
      rm(cna_track_ylim_max)
    }

    # remove temp cna data
    rm(cna.grp, cna.grp.avg, cna.list, cna_file_name, cna_path)
    print("CNA data processing completed")
  }

  # if condition for whether to reform MB data
  if (TRUE %in% doMB) {

    # check if required data input exists
    if (!(exists("llpMultiLoader_returnList") | exists("maf_germline.p"))) {
      print("Required llpMultiLoader_returnList or maf_germline.p does not exist, please follow the instruction and run v_llpMultiLoader function first, or set doMB = FALSE to skip MB data processing")
      stop()
    } else {
      print("Start processing MB data")
    }

    # set chr_len_auto and chr_len_max
    if (speciesID %in% c("hg38", "hg19")) {
      chr_len_auto = chr_len_H
    } else {
      chr_len_auto = chr_len_M
    }
    chr_len_max = max(chr_len_auto) + 1e7

    # preset mb_somatic_d.list.c and mb_germline_d.list.c
    mb_empty_template = as.data.frame(matrix(rep(NA, 4), nrow = 1, byrow = TRUE), stringsAsFactors = FALSE)
    mb_empty_template = na.omit(mb_empty_template)
    colnames(mb_empty_template) = c("Chr", "Start", "End", "Count")
    mb_somatic_d.list.c = c(rep(list(mb_empty_template), times = length(plot_name)))
    names(mb_somatic_d.list.c) = c(assayID, grpName)
    mb_germline_d.list.c = c(rep(list(mb_empty_template), times = length(plot_name)))
    names(mb_germline_d.list.c) = c(assayID, grpName)
    rm(mb_empty_template)

    # load, extract and reform MB data, somatic, remember to add "chr" prefix!!!
    if ("somatic" %in% doMB) {
      # load and extract MB data, somatic
      mb_somatic = llpMultiLoader_returnList[[length(llpMultiLoader_returnList)]]@data
      mb_somatic = mb_somatic[, c("Chromosome", "Start_Position", "End_Position", "Tumor_Sample_Barcode")]
      colnames(mb_somatic)[1:3] = c("Chr", "Start", "End")
      mb_somatic$Chr = gsub("chr", "", mb_somatic$Chr)
      mb_somatic$Chr = paste0("chr", mb_somatic$Chr)
      mb_somatic.list.c = list()
      for (i in 1:nrow(groupInfo)) {
        mb_somatic.list.c[[i]] = subset(mb_somatic, subset = Tumor_Sample_Barcode == groupInfo$aliasID[i])
      }
      rm(i, mb_somatic)
      names(mb_somatic.list.c) = assayID

      # grouping MB data, somatic
      mb.list.grp = list()
      for (i in 1:length(grpName)) {
        mb.list.grp[[i]] = dplyr::bind_rows(mb_somatic.list.c[names(mb_somatic.list.c) %in% grp.list[[i]]])
        mb.list.grp[[i]] = unique(mb.list.grp[[i]])
      }
      rm(i)
      mb_somatic.list.c = c(mb_somatic.list.c, mb.list.grp)
      rm(mb.list.grp)
      names(mb_somatic.list.c) = c(assayID, grpName)

      # set empty density list to be filled, somatic
      mb_somatic_d.list.c = lapply(mb_somatic.list.c, function(x) {
        x = circlize::genomicDensity(x, window.size = 1e7, overlap = FALSE, chr.len = chr_len_auto + 1e7)
        x[, 4] = 0
        colnames(x) = c("Chr", "Start", "End", "Count")
        x[, 1] = as.character(x[, 1])
        x[, "unique_identifier"] = paste0(x$Chr, ":", x$Start, ":", x$End)
        return(x)
      })

      # set cut list for downstream filling, somatic
      mb_somatic_cut.list.c = lapply(mb_somatic.list.c, function(x) {
        x[, 2] = cut(unlist(x[, 2]), breaks = seq(0, chr_len_max, 1e7), include.lowest = TRUE)
        x[, 3] = cut(unlist(x[, 3]), breaks = seq(0, chr_len_max, 1e7), include.lowest = TRUE)
        x = x %>% dplyr::group_by(Chr, Start, End) %>% dplyr::summarize(count = dplyr::n())
        x[, 2] = gsub("\\(", "", unlist(x[, 2]))
        x[, 2] = gsub("\\[", "", unlist(x[, 2]))
        x[, 2] = gsub("\\]", "", unlist(x[, 2]))
        x[, "End"] = NULL
        colnames(x)[3] = "Count"
        x = tidyr::separate(x, col = "Start", into = c("Start", "End"), sep = ",", remove = TRUE)
        x[, 2] = as.integer(unlist(x[, 2])) + as.integer(1)
        x[, 3] = as.integer(unlist(x[, 3]))
        x[, "unique_identifier"] = paste0(x$Chr, ":", x$Start, ":", x$End)
        return(x)
      })

      # fill empty density list with cut list, somatic
      mb_somatic_d.list.c = mapply(x = mb_somatic_d.list.c, y = mb_somatic_cut.list.c, SIMPLIFY = FALSE, function(x, y) {
        x[, "Count"] = plyr::mapvalues(x$unique_identifier, from = y$unique_identifier, to = y$Count, warn_missing = FALSE)
        x[, "Count"] = gsub("^.*chr.*$", "0", x[, "Count"])
        x[, "Count"] = as.numeric(x[, "Count"])
        x[, "unique_identifier"] = NULL
        return(x)
      })

      # remove temp MB data variables, somatic
      rm(mb_somatic.list.c, mb_somatic_cut.list.c)
    }

    # load, extract and reform MB data, germline, remember to add "chr" prefix!!!
    if ("germline" %in% doMB) {
      # load and extract MB data, germline
      mb_germline = maf_germline.p@data
      mb_germline = mb_germline[, c("Chromosome", "Start_Position", "End_Position", "Tumor_Sample_Barcode")]
      colnames(mb_germline)[1:3] = c("Chr", "Start", "End")
      mb_germline$Chr = gsub("chr", "", mb_germline$Chr)
      mb_germline$Chr = paste0("chr", mb_germline$Chr)
      mb_germline.list.c = list()
      for (i in 1:nrow(groupInfo)) {
        mb_germline.list.c[[i]] = subset(mb_germline, subset = Tumor_Sample_Barcode == groupInfo$aliasID[i])
      }
      rm(i, mb_germline)
      names(mb_germline.list.c) = assayID

      # grouping MB data, germline
      mb.list.grp = list()
      for (i in 1:length(grpName)) {
        mb.list.grp[[i]] = dplyr::bind_rows(mb_germline.list.c[names(mb_germline.list.c) %in% grp.list[[i]]])
        mb.list.grp[[i]] = unique(mb.list.grp[[i]])
      }
      rm(i)
      mb_germline.list.c = c(mb_germline.list.c, mb.list.grp)
      rm(mb.list.grp)
      names(mb_germline.list.c) = c(assayID, grpName)

      # set empty density list to be filled, germline
      mb_germline_d.list.c = lapply(mb_germline.list.c, function(x) {
        x = circlize::genomicDensity(x, window.size = 1e7, overlap = FALSE, chr.len = chr_len_auto + 1e7)
        x[, 4] = 0
        colnames(x) = c("Chr", "Start", "End", "Count")
        x[, 1] = as.character(x[, 1])
        x[, "unique_identifier"] = paste0(x$Chr, ":", x$Start, ":", x$End)
        return(x)
      })

      # set cut list for downstream filling, germline
      mb_germline_cut.list.c = lapply(mb_germline.list.c, function(x) {
        x[, 2] = cut(unlist(x[, 2]), breaks = seq(0, chr_len_max, 1e7), include.lowest = TRUE)
        x[, 3] = cut(unlist(x[, 3]), breaks = seq(0, chr_len_max, 1e7), include.lowest = TRUE)
        x = x %>% dplyr::group_by(Chr, Start, End) %>% dplyr::summarize(count = dplyr::n())
        x[, 2] = gsub("\\(", "", unlist(x[, 2]))
        x[, 2] = gsub("\\[", "", unlist(x[, 2]))
        x[, 2] = gsub("\\]", "", unlist(x[, 2]))
        x[, "End"] = NULL
        colnames(x)[3] = "Count"
        x = tidyr::separate(x, col = "Start", into = c("Start", "End"), sep = ",", remove = TRUE)
        x[, 2] = as.integer(unlist(x[, 2])) + as.integer(1)
        x[, 3] = as.integer(unlist(x[, 3]))
        x[, "unique_identifier"] = paste0(x$Chr, ":", x$Start, ":", x$End)
        return(x)
      })

      # fill empty density list with cut list, germline
      mb_germline_d.list.c = mapply(x = mb_germline_d.list.c, y = mb_germline_cut.list.c, SIMPLIFY = FALSE, function(x, y) {
        x[, "Count"] = plyr::mapvalues(x$unique_identifier, from = y$unique_identifier, to = y$Count, warn_missing = FALSE)
        x[, "Count"] = gsub("^.*chr.*$", "0", x[, "Count"])
        x[, "Count"] = as.numeric(x[, "Count"])
        x[, "unique_identifier"] = NULL
        return(x)
      })

      # remove temp MB data variables, germline
      rm(mb_germline.list.c, mb_germline_cut.list.c)
    }

    # set mb_track_ylim for downstream render plot
    mb_track_ylim = mapply(x = mb_somatic_d.list.c, y = mb_germline_d.list.c, SIMPLIFY = FALSE, function(x, y) {
      if (nrow(x) >= 1) {
        x = ceiling(max(abs(x[, 4])))
      } else {
        x = 0
      }
      if (nrow(y) >= 1) {
        y = ceiling(max(abs(y[, 4])))
      } else {
        y = 0
      }
      mb_track_ylim_both = max(x, y)
      if (mb_track_ylim_both %% 2 != 0) {
        mb_track_ylim_both = mb_track_ylim_both + 1
      }
      return(mb_track_ylim_both)
    })
    if (unifyTrackYlim == TRUE) {
      mb_track_ylim_max = max(unlist(mb_track_ylim))
      mb_track_ylim = lapply(mb_track_ylim, function(x) {
        x = mb_track_ylim_max
        return(x)
      })
      rm(mb_track_ylim_max)
    }

    # remove temp MB data variables, other remainings
    rm(chr_len_auto, chr_len_max)
    print("MB data processing completed")
  }

  # if condition for whether to reform GE data
  if (TRUE %in% doGE) {

    # check if GE data exist
    ge_file_name = dir(path = folder_name, pattern = glob2rx("*cDNA_genes.sf"))
    if (length(ge_file_name) == 0) {
      print("GE data not detected, please double check if GE data files are properly named, or set doGE = FALSE to skip GE data processing")
      stop()
    } else {
      print("Start processing GE data")
    }

    # load and extract gene expression data
    ge_path = paste0("./", folder_name, "/", ge_file_name)
    ge.list = lapply(ge_path, read.table, header = TRUE, stringsAsFactors = FALSE)
    ge.list = lapply(ge.list, function(x) x[(names(x) %in% c("Name", "TPM"))])
    ge_header = c("ENSG", "TPM")
    ge.list = lapply(ge.list, setNames, ge_header)

    # filter out unexpected ENST/transcripts in ge.list
    ge.list = lapply(ge.list, function(x) {
      temp_ENST = grepl("^ENST.+$", x$ENSG)
      x = subset(x, subset = !temp_ENST)
      return(x)
    })

    # remove ENSG version suffix in ge.list
    ge.list = lapply(ge.list, function(x) {
      x$ENSG = gsub("\\.[[:digit:]]+$", "", x$ENSG)
      return(x)
    })

    # map gene expression data to GRCh38/GRCm38
    if (speciesID == "hg38") {
      ge.list = lapply(ge.list, merge, x = GRCh38G, by = "ENSG")
    } else if (speciesID == "hg19") {
      ge.list = lapply(ge.list, merge, x = GRCh37G, by = "ENSG")
    } else {
      ge.list = lapply(ge.list, merge, x = GRCm38G, by = "ENSG")
    }
    ge.list = lapply(ge.list, function(x) x[, c(3, 4, 5, 6)])
    names(ge.list) = assayID

    # calculate average GE per sample
    ge.list = lapply(ge.list, function(x) {
      x = dplyr::group_by(x, Chr, Start, End)
      x = dplyr::summarise(x, TPM = mean(TPM))
      x = as.data.frame(x)
      return(x)
    })

    # grouping and calculate average GE per group
    ge.grp = list()
    for (i in 1:length(grpName)) {
      ge.grp[[i]] = ge.list[names(ge.list) %in% grp.list[[i]]]
    }
    rm(i)
    ge.grp.avg = list()
    ge.grp.avg = lapply(ge.grp, dplyr::bind_rows)
    ge.grp.avg = lapply(ge.grp.avg, function(x) {
      x = dplyr::group_by(x, Chr, Start, End)
      x = dplyr::summarise(x, TPM = mean(TPM))
      x = as.data.frame(x)
    })

    # append grouped list to original list
    ge.list.c = c(ge.list, ge.grp.avg)
    names(ge.list.c) = c(assayID, grpName)

    # calculate log10ratio for ge.list.c
    if (unifyTrackYlim == TRUE) {
      min0offset_circos_unif = lapply(ge.list.c, function(x) {
        x = max(x[, 4])
        return(x)
      })
      min0offset_circos_unif = dplyr::bind_rows(min0offset_circos_unif)
      min0offset_circos_unif = 10 ^ (-ceiling(log10(max(min0offset_circos_unif))))
    }
    ge.list.c = lapply(ge.list.c, function(x) {
      x[is.na(x[, 4]), 4] = 0 # replace NA with 0
      x[x[, 4] < 0, 4] = 0 # replace negative value with 0 (for subtraction-normolized group)
      min0offset_circos = 10 ^ (-ceiling(log10(max(x[, 4]))))
      x[, 4] = x[, 4] + ifelse(unifyTrackYlim == TRUE, min0offset_circos_unif, min0offset_circos) # add 1e-3 to fix log(0) issue based on unifyTrackYlim condition
      x["log10ratio"] = log10(x[, 4]) # convert TPM to log10 ratio
      return(x)
    })

    # remove temp ge data
    suppressWarnings(rm(ge.grp, ge.grp.avg, ge.list, ge_file_name, ge_path, ge_header, min0offset_circos_unif))
    print("GE data processing completed")
  }

  # if condition for whether to reform FL data
  if (doFL == TRUE) {

    # check if FL data exist
    fl_file_name_fuca = dir(path = folder_name, pattern = glob2rx("*-fusion-genes.txt"))
    fl_file_name_star = dir(path = folder_name, pattern = glob2rx("*star-fusion*abridged*"))
    if (length(fl_file_name_fuca) == 0 | length(fl_file_name_star) == 0) {
      print("FL data not detected, or missing for at least one fusion caller, please double check if FL data files from all fusion callers are properly named, or set doFL = FALSE to skip FL data processing")
      stop()
    } else {
      print("Start processing FL data")
    }

    # load and extract fusion link data, for Fusion Catcher
    fl_path_fuca = paste0("./", folder_name, "/", fl_file_name_fuca)
    fl.list.fuca = lapply(fl_path_fuca, read.delim, header = TRUE, stringsAsFactors = FALSE, fill = TRUE)
    fl.list.fuca = lapply(fl.list.fuca, function(x) x[, c(11, 12)])
    fl_header = c("From", "To")
    fl.list.fuca = lapply(fl.list.fuca, setNames, fl_header)
    fl.list.fuca = lapply(fl.list.fuca, function(x) {
      x = unique(x[fl_header])
    })

    # load and extract fusion link data, for Star Fusion
    fl_path_star = paste0("./", folder_name, "/", fl_file_name_star)
    fl.list.star = lapply(fl_path_star, read.delim, header = TRUE, stringsAsFactors = FALSE, fill = TRUE)
    fl.list.star = lapply(fl.list.star, function(x) x[, c(5, 7)])
    for (i in 1:length(fl.list.star)) {
      fl.list.star[[i]][, 1] = gsub(".*ENSG", "ENSG", fl.list.star[[i]][, 1])
      fl.list.star[[i]][, 1] = gsub("\\..*", "", fl.list.star[[i]][, 1])
      fl.list.star[[i]][, 2] = gsub(".*ENSG", "ENSG", fl.list.star[[i]][, 2])
      fl.list.star[[i]][, 2] = gsub("\\..*", "", fl.list.star[[i]][, 2])
    }
    rm(i)
    fl.list.star = lapply(fl.list.star, setNames, fl_header)
    fl.list.star = lapply(fl.list.star, function(x) {
      x = unique(x[fl_header])
    })

    # convert logical to character for samples with empty fusion data (to avoid grouping error)
    fl.list.fuca = lapply(fl.list.fuca, function(x) {
      x$From = as.character(x$From)
      x$To = as.character(x$To)
      return(x)
    })
    fl.list.star = lapply(fl.list.star, function(x) {
      x$From = as.character(x$From)
      x$To = as.character(x$To)
      return(x)
    })

    # inner-join Fusion Catcher and Star Fusion, find common fusion genes
    fl.list.join = list()
    for (i in 1:length(fl.list.fuca)) {
      fl.list.join[[i]] = merge(fl.list.fuca[[i]], fl.list.star[[i]], by = c("From", "To"))
    }
    rm(i)

    # select only matched gene (match in GRCh38G/GRCh37G)
    if (speciesID == "hg38") {
      matched = lapply(fl.list.join, function(x) {
        x = x[, 1] %in% GRCh38G[, 1] & x[, 2] %in% GRCh38G[, 1]
      })
    } else if (speciesID == "hg19") {
      matched = lapply(fl.list.join, function(x) {
        x = x[, 1] %in% GRCh37G[, 1] & x[, 2] %in% GRCh37G[, 1]
      })
    }
    fl.list.join = mapply(x = fl.list.join, y = matched, SIMPLIFY = FALSE, function(x, y) {
      x = subset(x, subset = y)
      return(x)
    })
    rm(matched)
    names(fl.list.join) = assayID

    # grouping fusion link
    fl.list.grp = list()
    for (i in 1:length(grpName)) {
      fl.list.grp[[i]] = dplyr::bind_rows(fl.list.join[names(fl.list.join) %in% grp.list[[i]]])
      fl.list.grp[[i]] = unique(fl.list.grp[[i]])
    }
    rm(i)
    fl.list.c = c(fl.list.join, fl.list.grp)
    names(fl.list.c) = c(assayID, grpName)

    # split fusion link list into "From" and "To"
    flf.list = list()
    flt.list = list()
    flf.list = lapply(fl.list.c, function(x) as.data.frame(x[, 1], stringsAsFactors = FALSE))
    flt.list = lapply(fl.list.c, function(x) as.data.frame(x[, 2], stringsAsFactors = FALSE))
    flf.list = lapply(flf.list, setNames, "ENSG")
    flt.list = lapply(flt.list, setNames, "ENSG")

    # map fusion link data to GRCh38/GRCm38, remember to keep the original order!!!
    for (i in 1:length(flf.list)) {
      if (speciesID == "hg38") {
        flf.list[[i]] = plyr::join(flf.list[[i]], GRCh38G, by = "ENSG", type = "inner")
      } else if (speciesID == "hg19") {
        flf.list[[i]] = plyr::join(flf.list[[i]], GRCh37G, by = "ENSG", type = "inner")
      } else {
        flf.list[[i]] = plyr::join(flf.list[[i]], GRCm38G, by = "ENSG", type = "inner")
      }
    }
    rm(i)

    for (i in 1:length(flt.list)) {
      if (speciesID == "hg38") {
        flt.list[[i]] = plyr::join(flt.list[[i]], GRCh38G, by = "ENSG", type = "inner")
      } else if (speciesID == "hg19") {
        flt.list[[i]] = plyr::join(flt.list[[i]], GRCh37G, by = "ENSG", type = "inner")
      } else {
        flt.list[[i]] = plyr::join(flt.list[[i]], GRCm38G, by = "ENSG", type = "inner")
      }
    }
    rm(i)

    flf.list = lapply(flf.list, function(x) x[, c(3, 4, 5, 2)])
    flt.list = lapply(flt.list, function(x) x[, c(3, 4, 5, 2)])
    flf.list.c = flf.list
    flt.list.c = flt.list
    names(flf.list.c) = c(assayID, grpName)
    names(flt.list.c) = c(assayID, grpName)
    rm(flf.list, flt.list)

    # classify fuison link into intra-/inter-chr types
    fl_intraORinter.list.c = mapply(x = flf.list.c, y = flt.list.c, SIMPLIFY = FALSE, function(x, y) {
      fl_intraORinter.list.c = x[, 1] == y[, 1]
    })
    fl_intraORinter.list.c = lapply(fl_intraORinter.list.c, function(x) {
      x = gsub(TRUE, "gray33", x)
      x = gsub(FALSE, "firebrick3", x)
    })

    # set gene labels for downstream intra-/inter-chr classification
    lb.list.c = mapply(x = flf.list.c, y = flt.list.c, SIMPLIFY = FALSE, function(x, y) {
      lb.list.c = dplyr::bind_rows(x, y)
    })
    names(lb.list.c) = c(assayID, grpName)

    # classify gene label into intra-/inter-chr types
    lb_intraORinter.list.c = mapply(x = lb.list.c, y = fl_intraORinter.list.c, SIMPLIFY = FALSE, function(x, y) {
      lb_intraORinter.list.c = cbind(x, rep(y, 2))
    })
    lb_intraORinter.list.c = lapply(lb_intraORinter.list.c, function(x) {
      if (speciesID %in% c("hg38", "hg19")) {
        x[, 1] = factor(x[, 1], levels = paste0("chr", c(1:22, "X", "Y")))
      } else if (speciesID %in% c("mm10", "mm9")) {
        x[, 1] = factor(x[, 1], levels = paste0("chr", c(1:19, "X", "Y")))
      }
      x = plyr::arrange(x, Chr, Start, End)
      x = unique(x)
    })

    # extract lb.list.c from lb_intraORinter.list.c and reset lb_intraORinter.list.c to make sure they match each other
    lb.list.c = lapply(lb_intraORinter.list.c, function(x) {
      x = subset(x, select = 1:4)
      return(x)
    })
    lb_intraORinter.list.c = lapply(lb_intraORinter.list.c, function(x) {
      x = as.character(unlist(x[, 5]))
      return(x)
    })

    # remove temp fl data
    rm(fl_file_name_fuca, fl_file_name_star, fl_header, fl_path_fuca, fl_path_star, fl.list.grp, fl.list.join, fl.list.fuca, fl.list.star, fl.list.c)
    print("FL data processing completed")
  }

  # add variables to temp_prepareVdata_Circos_returnList
  temp_prepareVdata_Circos_returnList = list(
    "baf.list.c" = baf.list.c,
    "cna.list.c" = cna.list.c,
    "cna_track_ylim" = cna_track_ylim,
    "mb_somatic_d.list.c" = mb_somatic_d.list.c,
    "mb_germline_d.list.c" = mb_germline_d.list.c,
    "mb_track_ylim" = mb_track_ylim,
    "ge.list.c" = ge.list.c,
    "flf.list.c" = flf.list.c,
    "flt.list.c" = flt.list.c,
    "fl_intraORinter.list.c" = fl_intraORinter.list.c,
    "lb.list.c" = lb.list.c,
    "lb_intraORinter.list.c" = lb_intraORinter.list.c
  )

  # remove variables already added to temp_prepareVdata_Circos_returnList
  rm(list = names(temp_prepareVdata_Circos_returnList))

  # filter out empty variables, using backwards for loop
  for (i in length(temp_prepareVdata_Circos_returnList):1) {
    if (is.null(temp_prepareVdata_Circos_returnList[[i]])) {
      temp_prepareVdata_Circos_returnList[[i]] = NULL
    }
  }
  rm(i)

  # remove extracted global settings
  rm(list = names(globalSettings_returnList))

  # end of v_prepareVdata_Circos function
  print("v_prepareVdata_Circos run completed, return value saved to the global environment in **prepareVdata_Circos_returnList**")
  return(temp_prepareVdata_Circos_returnList)
}



#' Main function for module: Circos
#'
#' Extract, clean and reform NGS data, and then generate comprehensive Circos plot with each track (circle) representing different types of data. Currently supported tracks: BAF (B allele frequency), CNA (copy number alterations), mutation burden (somatic/germline mutations), gene expression (TPM) and gene fusions (cross-check validation is available when 2 or more fusion callers are in use; also, fusion status check and validation through statistics are available via module CSE).
#'
#' @param outputFolderPath string, relative or absolute path to the output folder, trailing slash "/" required, can be set to NULL (no output file will be written to the file system), default "./_VK/_Circos/" for module Circos.
#' @param startNum integer, by default, vigilante will go through every individual sample and sample group, generating a Circos plot per each item in the list. If an integer is specified, vigilante will instead start generating Circos plots at the specified index (e.g. there 10 samples and 2 groups in the list, set startNum to 11 will make vigilante start generating Circos plots at the first group). If set to NULL, vigilante will automatically detect Circos pltos already existing in the output folder and continue at the most recent generated one (e.g. there are 12 Circos plots in total to be generated and 6 of them already exist in the output folder, to make sure all plots are fully-rendered, vigilante will continue at the 6th plot and overwrite it). Since generating comprehensive Circos plots is very computational-intensive, it is possible that function runs get interrupted and half-rendered plots get generated because of running out of memory. Therefore, it is recommended to set startNum to NULL so that vigilante can ensure every item in the list to be plotted get fully-rendered.
#' @param endNum integer, by default, vigilante will go through every individual sample and sample group, generating a Circos plot per each item in the list. If an integer is specified, vigilante will instead stop generating Circos plots at the specified index (e.g. there 10 samples and 2 groups in the list, set endNum to 6 will make the 6th sample as the last one to be plotted, and there will be no Circos plots for items after the 6th sample).
#' @param customPlotNum integer vector, optional, can be set to NULL; if provided, will override "startNum" and "endNum", and only Circos plots at the specified index will be generated (e.g. there 10 samples and 2 groups in the list, set customPlotNum to c(3, 11) will make only the 3rd sample and the 1st group to be plotted).
#' @param doBAF logic, whether to show BAF track on Circos plots.
#' @param doCNA logic, whether to show CNA track on Circos plots.
#' @param doMB mixed vector, c(FALSE, "somatic", "germline"), whether to show MB track on Circos plots; if TRUE, specify the mutation type, can be either "somatic" or "germline" or both, each type will be shown as a separate track on Circos plots.
#' @param doGE mixed vector, c(FALSE, "log"), whether to show GE track on Circos plots; if TRUE, specify the value format, can be either "log" or "TPM", choose "TPM" for showing raw TPM value, choose "log" for showing value after log10 conversion (0 will be converted into negative value based on the value range across all samples), must match the value format chosen in v_prepareVdata_Circos function.
#' @param doFL logic, whether to show FL track will be shown on Circos plots.
#'
#' @details
#' Please be aware that generating comprehensive Circos plots may become very computational-intensive and time-consuming depending on input data, and it is possible that function runs get interrupted and half-rendered plots get generated because of running out of memory. But no need to worry, vigilante has already taken this possible issue into consideration and come with an internal mechanism to help minimize the interruption. Please see the description of argument "startNum" and properly set it to better utilize the interal helper mechanism.
#'
#'
#' @return NULL, when a valid outputFolderPath is provided, output Circos plots will be generated and saved in the provided location, otherwise function run will stop and nothing will be written into the file system.
#'
#' @import circlize grDevices graphics ComplexHeatmap grid
#'
#' @export
#'
# v_Circos function
v_Circos = function(outputFolderPath = "./_VK/_Circos/", startNum = NULL, endNum = NULL, customPlotNum = NULL, doBAF = FALSE, doCNA = FALSE, doMB = c(FALSE, "somatic", "germline"), doGE = c(FALSE, "log"), doFL = FALSE) {

  # check if outputFolderPath is set
  if (is.null(outputFolderPath)) {
    print("v_Circos run stopped: based on user's choice, output plots will not be generated")
    stop()
  } else {
    print(as.character(glue::glue("Based on user's choice, output plots will be generated and saved in {outputFolderPath}")))
  }

  # check if globalSettings_returnList exists
  if (!exists("globalSettings_returnList")) {
    print("globalSettings_returnList not detected, please follow the instruction and run v_globalSettings function before continue")
    stop()
  } else {
    print("globalSettings_returnList detected, start extracting global settings")
  }

  # extract global settings from return list
  for (i in 1:length(globalSettings_returnList)) {
    assign(x = names(globalSettings_returnList)[i], value = globalSettings_returnList[[i]])
  }
  rm(i)

  # check if prepareVdata_Circos_returnList exists
  if (!exists("prepareVdata_Circos_returnList")) {
    print("prepareVdata_Circos_returnList not detected, please follow the instruction and run v_prepareVdata_Circos function before continue")
    stop()
  } else {
    print("prepareVdata_Circos_returnList detected, start extracting Circos variables")
  }

  # extract Circos variables from return list
  for (i in 1:length(prepareVdata_Circos_returnList)) {
    assign(x = names(prepareVdata_Circos_returnList)[i], value = prepareVdata_Circos_returnList[[i]])
  }
  rm(i)

  # set starting number based on already existing CircosPlot file number
  if (is.null(startNum)) {
    circosPlotNum = dir(path = outputFolderPath, pattern = "CircosPlot4K_.+\\.png$")
    if (length(circosPlotNum) <= 1) {
      startNum = 1
      print(as.character(glue::glue("No existing Circos plots detected in {outputFolderPath}, will start from plot 1")))
    } else {
      startNum = length(circosPlotNum) # in case loop is aborted, rerun the script to restart at (recreate) the last existing one regardless it is fully completed or partially created
      print(as.character(glue::glue("{startNum} existing Circos plots detected in {outputFolderPath}, will continue from plot {startNum}")))
    }
  } else {
    startNum = startNum
  }

  # set ending number
  if (is.null(endNum)) {
    endNum = length(plot_name)
  } else {
    endNum = endNum
    if (endNum < startNum) {
      endNum = startNum
    }
  }

  # set finalPlotNum, override startNum and endNum based on customPlotNum
  if (!is.null(customPlotNum)) {
    finalPlotNum = customPlotNum
  } else {
    finalPlotNum = startNum:endNum
  }

  # beginning of the loop
  for (i in finalPlotNum) {

    # progress statement, start, with timestamp
    if (!is.null(customPlotNum)) {
      print(as.character(glue::glue("Start generating plot {i}, {length(finalPlotNum) - grep(i, finalPlotNum)[1]} more in queue, {Sys.time()}")))
    } else {
      print(as.character(glue::glue("Start generating plot {i}, {endNum - i} more in queue, {Sys.time()}")))
    }

    # adjust track.height based on track numbers
    track_height_par = suppressWarnings(min(0.4 / sum(doBAF, doCNA, any(doMB, na.rm = TRUE), any(doGE, na.rm = TRUE), doFL), 0.2))

    # adjust labels.cex based on track index
    labels_cex = c(0.25, 0.25, 0.2, 0.15)
    labels_cex_index = 0

    # save current circos plot to PNG file under folder "Viz"
    png(filename = paste0(outputFolderPath, "CircosPlot4K_", studyID, "_", plot_name[i], ".png"), width = 3339, height = 2160, units = "px", res = 300)

    # run render circos plot here
    circos.clear()

    # set gap based on speciesID
    if (speciesID == "hg38" | speciesID == "hg19") {
      circos.par(track.height = track_height_par, start.degree = 90, points.overflow.warning = FALSE, clock.wise = TRUE, canvas.xlim = c(-1.1, 1.1), canvas.ylim = c(-1.1, 1.1), gap.degree = c(rep(2, 8), 4, rep(2, 14), 4), track.margin = c(0.01, 0.01))
    } else if (speciesID == "mm10" | speciesID == "mm9") {
      circos.par(track.height = track_height_par, start.degree = 90, points.overflow.warning = FALSE, clock.wise = TRUE, canvas.xlim = c(-1.1, 1.1), canvas.ylim = c(-1.1, 1.1), gap.degree = c(rep(2, 8), 4, rep(2, 11), 4), track.margin = c(0.01, 0.01))
    }

    # initialize with Ideogram
    circos.initializeWithIdeogram(plotType = c("ideogram", "labels"), species = speciesID)

    # render 1st circle: B Allele Frequency (BAF)
    if (doBAF == TRUE && length(baf.list.c[[i]][, 1]) > 0) {
      labels_cex_index = labels_cex_index + 1
      circos.genomicTrack(data = baf.list.c[[i]], numeric.column = 4, ylim = c(0, 1), panel.fun = function(region, value, ...) {
        circos.genomicPoints(region, value, col = "#0d80d8", pch = 16, cex = 0.25, ...)
        circos.lines(CELL_META$cell.xlim, c(0.5, 0.5), lty = 1, col = "#00000040", lwd = 0.25) # center line, gray
        circos.lines(CELL_META$cell.xlim, c(1, 1), lty = 1, col = "#0dd84980", lwd = 0.75) # top line, green
        circos.lines(CELL_META$cell.xlim, c(0, 0), lty = 1, col = "#ff000080", lwd = 0.75) # bottom line, red
        circos.lines(CELL_META$cell.xlim, c(0.75, 0.75), lty = 1, col = "#00000040", lwd = 0.1) # 2nd quantile line, gray
        circos.lines(CELL_META$cell.xlim, c(0.25, 0.25), lty = 1, col = "#00000040", lwd = 0.1) # 4th quantile line, gray
        if (CELL_META$sector.index %in% "chr1") {circos.yaxis(side = "left", at = c(0, 0.5, 1), labels = TRUE, tick = TRUE, labels.cex = labels_cex[labels_cex_index], tick.length = convert_x(0.75, "mm"), lwd = 0.25)} else if (CELL_META$sector.index %in% "chr9") {circos.yaxis(side = "right", at = c(0, 0.5, 1), labels = TRUE, tick = TRUE, labels.cex = labels_cex[labels_cex_index], tick.length = convert_x(0.75, "mm"), lwd = 0.25)}
      })
    }

    # render 2nd circle: Copy Number Alternations (CNA, somatic?), remember to change ylim!!!
    if (doCNA == TRUE && length(cna.list.c[[i]][, 1]) > 0) {
      labels_cex_index = labels_cex_index + 1
      circos.genomicTrack(data = cna.list.c[[i]], numeric.colum = 4, ylim = c(-cna_track_ylim[[i]], cna_track_ylim[[i]]), panel.fun = function(region, value, ...) {
        circos.genomicLines(region, value, type = "h", baseline = 0, col = ifelse(value[[1]] > 0, "#0dd849", "red"), ...)
        circos.lines(CELL_META$cell.xlim, c(0, 0), lty = 1, col = "#00000040", lwd = 0.25) # center line, gray
        circos.lines(CELL_META$cell.xlim, c(cna_track_ylim[[i]], cna_track_ylim[[i]]), lty = 1, col = "#00000040", lwd = 0.75) # top line, gray
        circos.lines(CELL_META$cell.xlim, c(-cna_track_ylim[[i]], -cna_track_ylim[[i]]), lty = 1, col = "#00000040", lwd = 0.75) # bottom line, gray
        circos.lines(CELL_META$cell.xlim, c(cna_track_ylim[[i]] * 0.5, cna_track_ylim[[i]] * 0.5), lty = 1, col = "#00000040", lwd = 0.1) # 2nd quantile line, gray
        circos.lines(CELL_META$cell.xlim, c(-cna_track_ylim[[i]] * 0.5, -cna_track_ylim[[i]] * 0.5), lty = 1, col = "#00000040", lwd = 0.1) # 4th quantile line, gray
        if (CELL_META$sector.index %in% "chr1") {circos.yaxis(side = "left", at = c(-cna_track_ylim[[i]], 0, cna_track_ylim[[i]]), labels = TRUE, tick = TRUE, labels.cex = labels_cex[labels_cex_index], tick.length = convert_x(0.75, "mm"), lwd = 0.25)} else if (CELL_META$sector.index %in% "chr9") {circos.yaxis(side = "right", at = c(-cna_track_ylim[[i]], 0, cna_track_ylim[[i]]), labels = TRUE, tick = TRUE, labels.cex = labels_cex[labels_cex_index], tick.length = convert_x(0.75, "mm"), lwd = 0.25)}
      })
    }

    # render 3rd & 4th circle: Mutation Burden
    if (TRUE %in% doMB) {
      if ("somatic" %in% doMB && length(mb_somatic_d.list.c[[i]][, 1]) > 0) {
        labels_cex_index = labels_cex_index + 1
        circos.genomicTrack(data = mb_somatic_d.list.c[[i]], numeric.colum = 4, ylim = c(0, mb_track_ylim[[i]]), panel.fun = function(region, value, ...) {
          circos.genomicLines(region, value, area = TRUE, baseline = 0, col = "darkorchid", border = NA, ...)
          circos.lines(CELL_META$cell.xlim, c(mb_track_ylim[[i]] * 0.5, mb_track_ylim[[i]]  * 0.5), lty = 1, col = "#00000040", lwd = 0.25) # center line, gray
          circos.lines(CELL_META$cell.xlim, c(mb_track_ylim[[i]], mb_track_ylim[[i]]), lty = 1, col = "#00000040", lwd = 0.75) # top line, gray
          circos.lines(CELL_META$cell.xlim, c(0, 0), lty = 1, col = "#00000040", lwd = 0.75) # bottom line, gray
          circos.lines(CELL_META$cell.xlim, c(mb_track_ylim[[i]] * 0.25, mb_track_ylim[[i]] * 0.25), lty = 1, col = "#00000040", lwd = 0.1) # 2nd quantile line, gray
          circos.lines(CELL_META$cell.xlim, c(mb_track_ylim[[i]] * 0.75, mb_track_ylim[[i]] * 0.75), lty = 1, col = "#00000040", lwd = 0.1) # 4th quantile line, gray
          if (CELL_META$sector.index %in% "chr1") {circos.yaxis(side = "left", at = c(0, mb_track_ylim[[i]] * 0.5, mb_track_ylim[[i]]), labels = TRUE, tick = TRUE, labels.cex = labels_cex[labels_cex_index], tick.length = convert_x(0.75, "mm"), lwd = 0.25)} else if (CELL_META$sector.index %in% "chr9") {circos.yaxis(side = "right", at = c(0, mb_track_ylim[[i]] * 0.5, mb_track_ylim[[i]]), labels = TRUE, tick = TRUE, labels.cex = labels_cex[labels_cex_index], tick.length = convert_x(0.75, "mm"), lwd = 0.25)}
        })
      }
      if ("germline" %in% doMB && length(mb_germline_d.list.c[[i]][, 1]) > 0) {
        labels_cex_index = labels_cex_index + 1
        circos.genomicTrack(data = mb_germline_d.list.c[[i]], numeric.colum = 4, ylim = c(0, mb_track_ylim[[i]]), panel.fun = function(region, value, ...) {
          circos.genomicLines(region, value, area = TRUE, baseline = 0, col = "plum", border = NA, ...)
          circos.lines(CELL_META$cell.xlim, c(mb_track_ylim[[i]] * 0.5, mb_track_ylim[[i]]  * 0.5), lty = 1, col = "#00000040", lwd = 0.25) # center line, gray
          circos.lines(CELL_META$cell.xlim, c(mb_track_ylim[[i]], mb_track_ylim[[i]]), lty = 1, col = "#00000040", lwd = 0.75) # top line, gray
          circos.lines(CELL_META$cell.xlim, c(0, 0), lty = 1, col = "#00000040", lwd = 0.75) # bottom line, gray
          circos.lines(CELL_META$cell.xlim, c(mb_track_ylim[[i]] * 0.25, mb_track_ylim[[i]] * 0.25), lty = 1, col = "#00000040", lwd = 0.1) # 2nd quantile line, gray
          circos.lines(CELL_META$cell.xlim, c(mb_track_ylim[[i]] * 0.75, mb_track_ylim[[i]] * 0.75), lty = 1, col = "#00000040", lwd = 0.1) # 4th quantile line, gray
          if (CELL_META$sector.index %in% "chr1") {circos.yaxis(side = "left", at = c(0, mb_track_ylim[[i]] * 0.5, mb_track_ylim[[i]]), labels = TRUE, tick = TRUE, labels.cex = labels_cex[labels_cex_index], tick.length = convert_x(0.75, "mm"), lwd = 0.25)} else if (CELL_META$sector.index %in% "chr9") {circos.yaxis(side = "right", at = c(0, mb_track_ylim[[i]] * 0.5, mb_track_ylim[[i]]), labels = TRUE, tick = TRUE, labels.cex = labels_cex[labels_cex_index], tick.length = convert_x(0.75, "mm"), lwd = 0.25)}
        })
      }
    }

    # render 5th circle: Gene Expression
    if (TRUE %in% doGE && length(ge.list.c[[i]][, 1]) > 0) {
      if ("TPM" %in% doGE & !"log" %in% doGE) {
        CCCM.TPM = colorRamp2(c(0, 100), c("white", "#e51091"))
        circos.genomicHeatmap(ge.list.c[[i]], col = CCCM.TPM, numeric.column = 4, side = "inside", border = NA, connection_height = 0.0001, heatmap_height = track_height_par - 0.0001, line_col = NA)
      } else if ("log" %in% doGE & !"TPM" %in% doGE) {
        CCCM.breakpoint = lapply(ge.list.c, function(x) {
          x = max(abs(x[, 5]))
          return(x)
        })
        CCCM.breakpoint = dplyr::bind_rows(CCCM.breakpoint)
        CCCM.breakpoint = max(CCCM.breakpoint)
        CCCM.log10 = colorRamp2(c(-CCCM.breakpoint, 0, CCCM.breakpoint), c("royalblue3", "white", "red3"))
        circos.genomicHeatmap(ge.list.c[[i]], col = CCCM.log10, numeric.column = "log10ratio", side = "inside", border = NA, connection_height = 0.0001, heatmap_height = track_height_par - 0.0001, line_col = NA)
      }
    }

    # render 6th area: Fusion Links
    if (doFL == TRUE && length(flf.list.c[[i]][, 1]) > 0) {
      circos.genomicLink(flf.list.c[[i]], flt.list.c[[i]], col = fl_intraORinter.list.c[[i]], border = NA, directional = 1, arr.width = 0.05, arr.length = 0.1, arr.type = "triangle")
    }

    # render 7th area: Gene Labels
    if (doFL == TRUE && length(flf.list.c[[i]][, 1]) > 0) {
      par(new = TRUE)
      circos.clear()
      if (speciesID == "hg38" | speciesID == "hg19") {
        circos.par(track.height = track_height_par, start.degree = 90, points.overflow.warning = FALSE, clock.wise = TRUE, canvas.xlim = c(-0.775, 0.775), canvas.ylim = c(-0.775, 0.775), gap.degree = c(rep(2, 8), 4, rep(2, 14), 4))
      } else if (speciesID == "mm10" | speciesID == "mm9") {
        circos.par(track.height = track_height_par, start.degree = 90, points.overflow.warning = FALSE, clock.wise = TRUE, canvas.xlim = c(-0.775, 0.775), canvas.ylim = c(-0.775, 0.775), gap.degree = c(rep(2, 8), 4, rep(2, 11), 4))
      }
      # canvas.x/ylim: bigger value = closer to center
      circos.initializeWithIdeogram(plotType = NULL, species = speciesID)
      circos.genomicLabels(lb.list.c[[i]], labels.column = 4, side = "outside", col = lb_intraORinter.list.c[[i]], line_col = lb_intraORinter.list.c[[i]], cex = 0.7, connection_height = 0.05, labels_height = 0.275)
    }

    # render 8th area: Title
    title(main = plot_name[i], line = -4, adj = 0.08)

    # render 9th area: Legend
    if (doBAF == TRUE && length(baf.list.c[[i]][, 1]) > 0) {
      lgd.baf = Legend(at = "BAF", type = "points", legend_gp = gpar(col = "#0d80d8"), title_position = "topleft", title = "\nB-Allele\nFrequency") # legend for BAF
    }

    if (doCNA == TRUE && length(cna.list.c[[i]][, 1]) > 0) {
      lgd.cna = Legend(at = c("Gain", "Loss"), legend_gp = gpar(fill = c("#0dd849", "red")), title_position = "topleft", title = "\nCopy Number\nAlterations") # legend for CNA
    }

    if (TRUE %in% doMB) {
      if (all(c("somatic", "germline") %in% doMB) && (length(mb_somatic_d.list.c[[i]][, 1]) > 0 & length(mb_germline_d.list.c[[i]][, 1]) > 0)) {
        lgd.mb = Legend(at = c("Somatic", "Germline"), legend_gp = gpar(fill = c("darkorchid", "plum")), title_position = "topleft", title = "\nMutation\nBurden") # legend for MB, both tracks
      } else if ("somatic" %in% doMB && length(mb_somatic_d.list.c[[i]][, 1]) > 0) {
        lgd.mb = Legend(at = c("Somatic"), legend_gp = gpar(fill = c("darkorchid")), title_position = "topleft", title = "\nMutation\nBurden") # legend for MB, somatic track only
      } else if ("germline" %in% doMB && length(mb_germline_d.list.c[[i]][, 1]) > 0) {
        lgd.mb = Legend(at = c("Germline"), legend_gp = gpar(fill = c("plum")), title_position = "topleft", title = "\nMutation\nBurden") # legend for MB, germline track only
      }
    }

    if (TRUE %in% doGE && length(ge.list.c[[i]][, 1]) > 0) {
      if ("TPM" %in% doGE & !"log" %in% doGE) {
        lgd.ge = Legend(at = c(0, 100), labels = c("0", "100+"), col_fun = CCCM.TPM, title_position = "topleft", title = "\nTPM") # legend for GE TPM
      } else if ("log" %in% doGE & !"TPM" %in% doGE) {
        lgd.ge = Legend(at = c(-CCCM.breakpoint, 0, CCCM.breakpoint), labels = c(-CCCM.breakpoint, 0, CCCM.breakpoint), col_fun = CCCM.log10, title_position = "topleft", title = "\nExpression\nLog Ratio") # legend for GE log10ratio
      }
    }

    if (doFL == TRUE && length(flf.list.c[[i]][, 1]) > 0) {
      if (all(c("gray33", "firebrick3") %in% lb_intraORinter.list.c[[i]])) {
        lgd.fl = Legend(at = c("Intra-Chr", "Inter-Chr"), type = "lines", legend_gp = gpar(col = c("gray33", "firebrick3"), lwd = 2), title_position = "topleft", title = "\nFusion\nLinks") # legend for FL, both types
      } else if ("gray33" %in% lb_intraORinter.list.c[[i]]) {
        lgd.fl = Legend(at = c("Intra-Chr"), type = "lines", legend_gp = gpar(col = c("gray33"), lwd = 2), title_position = "topleft", title = "\nFusion\nLinks") # legend for FL, intra-chr type only
      } else if ("firebrick3" %in% lb_intraORinter.list.c[[i]]) {
        lgd.fl = Legend(at = c("Inter-Chr"), type = "lines", legend_gp = gpar(col = c("firebrick3"), lwd = 2), title_position = "topleft", title = "\nFusion\nLinks") # legend for FL, inter-chr type only
      }
    }

    circos_moduleStatus = c(lgd.baf = exists("lgd.baf"), lgd.cna= exists("lgd.cna"), lgd.mb = exists("lgd.mb"), lgd.ge = exists("lgd.ge"), lgd.fl = exists("lgd.fl"))
    circos_legend = ""
    for (j in 1:length(circos_moduleStatus)) {
      if (circos_moduleStatus[j] == TRUE) {
        if (nchar(circos_legend) == 0 ) {
          circos_legend = paste0(circos_legend, names(circos_moduleStatus)[j])
        } else {
          circos_legend = paste0(circos_legend, ", ", names(circos_moduleStatus)[j])
        }
      }
    }
    rm(j)
    lgd.list = glue::glue("packLegend({circos_legend})")
    lgd.list = eval(parse(text = lgd.list))
    suppressWarnings(rm(circos_legend, circos_moduleStatus, lgd.baf, lgd.cna, lgd.mb, lgd.ge, lgd.fl))

    pushViewport(viewport(x = 0.005, y = 0.4, width = 0.25, height = 0.25, just = c("left", "bottom"))) # move away from center(0, 0)
    grid.draw(lgd.list)

    # Clear
    circos.clear()

    # after running render, save the plot to a png.file
    dev.off()

    # remove temp variables
    suppressWarnings(rm(track_height_par, labels_cex, labels_cex_index, CCCM.TPM, min0offset_circos, CCCM.breakpoint, CCCM.log10, lgd.list))

    # progress statement, complete, with timestamp
    print(as.character(glue::glue("plot {i} saved in {outputFolderPath}, {Sys.time()}")))

    # end of the loop, always remember to reset loop variable "i"
  }
  rm(i)

  # remove extracted global settings
  rm(list = names(globalSettings_returnList))

  # remove extracted prepareVdata_Circos_returnList variables
  rm(list = names(prepareVdata_Circos_returnList))

  # end of v_Circos function
  print(as.character(glue::glue("v_Circos run completed, output plots saved in {outputFolderPath}")))
  return()
}

# preset globalVariables for R CMD check
utils::globalVariables(c("BAF", "CNA", "Chr", "End", "GRCh37G", "GRCh38G", "GRCm38G", "Start", "TPM", "Tumor_Sample_Barcode", "assayID", "chr_len_H", "chr_len_M", "folder_name", "globalSettings_returnList", "groupInfo", "group_by", "grp.list", "grpName", "llpMultiLoader_returnList", "maf_germline.p", "plot_name", "speciesID", "baf.list.c", "cna.list.c", "cna_track_ylim", "fl_intraORinter.list.c", "flf.list.c", "flt.list.c", "ge.list.c", "lb.list.c", "lb_intraORinter.list.c", "mb_germline_d.list.c", "mb_somatic_d.list.c", "mb_track_ylim", "min0offset_circos", "prepareVdata_Circos_returnList", "studyID"))
yilixu/vigilante documentation built on June 4, 2021, 5:07 a.m.