Nothing
#' @importFrom dplyr %>%
NULL
#' @importFrom utils packageVersion data
#' @export blastp_df
#' @export seq_data
#' @export photosynthesis_gene_list
#' @export PGC_group
#' @export eggnog_df
#' @export KO_group
NULL
#' @title Gene-Cluster Discovery, Annotation and Visualization
#'
#' @description
#' `gclink()` performs **all steps** from raw blastp output to a publication-ready
#' gene-cluster plot **in one call**. It dynamically adapts its workflow:
#' * If `in_seq_data` is provided: Extracts coordinates/sequences, merges data, and generates plots.
#' * If `in_seq_data` is `NULL`: Skips sequence-dependent steps, returning only the cluster table.
#' * If `in_GC_group` is `NULL`: Omits functional-group merging and plotting.
#' * Supports custom gene lists (e.g., photosynthesis, viral genes) via `in_gene_list` for universal
#' cluster detection in bacteria, archaea, or phages.
#'
#' Gene clusters are identified via a density threshold (`AllGeneNum` and `MinConSeq`) due to
#' incomplete gene annotation coverage.
#'
#' @param in_blastp_df A `data.frame` from blast/blastp-style output with columns:
#' \itemize{
#' \item `qaccver`: Genome + contig name (separated by `"---"`), e.g.,
#' `"Kuafubacteriaceae--GCA_016703535.1---JADJBV010000001.1_150"`:
#' \itemize{
#' \item Genome: `"Kuafubacteriaceae--GCA_016703535.1"` (`"--"` separator),
#' \item Contig: `"JADJBV010000001.1"`,
#' \item ORF: `"JADJBV010000001.1_150"` (`"_"` separator),
#' \item Position: `"150"`.
#' }
#' \item `saccver`: Gene name + metadata (separated by `"_"`), e.g.,
#' `"bchC_Methyloversatilis_sp_RAC08_BSY238_2447_METR"`:
#' \itemize{
#' \item Gene: `"bchC"`,
#' \item Metadata(Optional): `"Methyloversatilis_sp_RAC08_BSY238_2447_METR"`.
#' }
#' }
#' eggNOG (evolutionary gene genealogy Nonsupervised Orthologous Groups) format results
#' are supported by renaming annotation columns (e.g., `"GOs"`) to `saccver`.
#' Default: `blastp_df`.
#'
#' @param in_seq_data `NULL` or a `data.frame` with:
#' \describe{
#' \item{`SeqName`}{ORF identifier (Prodigal format: `>ORF_id # start # end # strand # ...`).}
#' \item{`Sequence`}{ORF sequence.}
#' }
#' Example:
#' `"Kuafubacteriaceae--GCA_016703535.1---JADJBV010000001.1_1 # 74 # 1018 # 1 # ..."`
#' Can be imported from **Prodigal** FASTA using:
#' ```r
#' seq_data <- Biostrings::readBStringSet("Prodigal.fasta",format="fasta", nrec=-1L, skip=0L, seek.first.rec=FALSE, use.names=TRUE) %>%
#' data.frame(Sequence = .) %>%
#' tibble::rownames_to_column("SeqName")
#' ```
#' `NULL` skips coordinate extraction and plotting. Default: `seq_data`.
#'
#' @param in_gene_list Character vector of reference genes for cluster detection.
#' Default: `photosynthesis_gene_list`.
#' @param in_GC_group `NULL` or a `data.frame` mapping genes to functional groups
#' (columns: `gene`, `gene_group`). `NULL` skips plotting.
#' Default: `PGC_group`.
#' @param AllGeneNum Integer; max ORFs per cluster. Default: `50`.
#' @param MinConSeq Integer; min consecutive reference genes per cluster. Default: `25`.
#' @param apply_length_filter Logical; whether to apply length filtering based on IQR.
#' If `FALSE`, skips `down_IQR` and `up_IQR` filtering.
#' Default: `FALSE`.
#' @param down_IQR Numeric; lower-bound scale factor for IQR length
#' filtering (see `length_filter`). Default: `10`.
#' @param up_IQR Numeric; upper-bound scale factor for IQR length
#' filtering (see `length_filter`). Default: `10`.
#' @param apply_evalue_filter Logical; whether to filter BLAST hits by e-value cutoff.
#' Requires column `evalue` in input data. Default: `FALSE`.
#' @param min_evalue Numeric; maximum e-value threshold for retaining BLAST hits.
#' Hits with e-value > cutoff will be removed.
#' Typical values: 1e-3 (loose) to 1e-10 (strict).
#' Default: `1`.
#' @param apply_score_filter Logical; whether to filter BLAST hits by bitscore cutoff.
#' Requires column `bitscore` in input data. Default: `FALSE`.
#' @param min_score Numeric; minimum bitscore threshold for retaining BLAST hits.
#' Hits with bitscore < cutoff will be removed.
#' For short proteins (~100 aa), 30-40 is typical;
#' for long proteins (>300 aa), 50-100 is recommended.
#' Default: `10`.
#' @param orf_before_first Integer; number of hypothetical ORFs to insert
#' **before the first gene** of each detected cluster.
#' Useful when the upstream annotation is incomplete or
#' low-confidence; insertion is bounded by the first ORF
#' of the contig. Default: `0`.
#' @param orf_after_last Integer; number of hypothetical ORFs to append
#' **after the last gene** of each detected cluster.
#' Helpful when the downstream annotation is incomplete
#' or low-confidence; insertion is bounded by the last
#' ORF of the contig. Default: `0`.
#' @param orf_range Character. ORF inclusion mode:
#' \itemize{
#' \item `"All"`: Include all ORFs + annotations (default)
#' \item `"OnlyAnnotated"`: Keep only annotated ORFs
#' \item `"IgnoreAnnotated"`: Include all ORFs without annotation merging
#' }
#' @param levels_gene_group Character vector; factor levels for gene-group
#' legends (must include "hypothetical ORF" in case
#' some genes remain unclassified).
#' Ignored if `in_GC_group` is `NULL`.
#' Default: `c('bch','puh','puf','crt','acsF','assembly','regulator','hypothetical ORF')`.
#'
#' @param color_theme Character vector; colours for `gc_plot`, matched to
#' the length and order of `levels_gene_group`.
#' Ignored if plotting is skipped.
#' Default: `c('#3BAA51','#6495ED','#DD2421','#EF9320','#F8EB00','#FF0683','#956548','grey')`.
#' @param genome_subset Character vector or `NULL`; genomes to retain. If `NULL`, all genomes
#' are retained. Default: `NULL`.
#' @return A named list with:
#' \describe{
#' \item{`GC_meta`}{Annotated cluster table (`data.frame`).}
#' \item{`GC_seq`}{FASTA sequences (if `in_seq_data` provided).}
#' \item{`GC_plot`}{ggplot object (if `in_seq_data` and `in_GC_group` provided).}
#' }
#' @export
#' @examples
#' # Case 1: Using blastp result with Full pipeline (Find Cluster + Extract FASTA + Plot Cluster)
#' data(blastp_df)
#' data(seq_data)
#' data(photosynthesis_gene_list)
#' data(PGC_group)
#' gc_list <- gclink(in_blastp_df = blastp_df,
#' in_seq_data = seq_data,
#' in_gene_list = photosynthesis_gene_list,
#' in_GC_group = PGC_group,
#' AllGeneNum = 50,
#' MinConSeq = 25,
#' apply_length_filter = TRUE,
#' down_IQR = 10,
#' up_IQR = 10,
#' orf_before_first = 0,
#' orf_after_last = 0,
#' levels_gene_group = c('bch','puh','puf','crt','acsF','assembly','regulator',
#' 'hypothetical ORF'),
#' color_theme = c('#3BAA51','#6495ED','#DD2421','#EF9320','#F8EB00',
#' '#FF0683','#956548','grey'),
#' genome_subset = NULL)
#' gc_meta = gc_list[["GC_meta"]]
#' gc_seq = gc_list[["GC_seq"]]
#' gc_plot = gc_list[["GC_plot"]]
#' head(gc_meta)
#' head(gc_seq)
#' print(gc_plot)
#'
#' # Case 2: Using eggNOG result with Full pipeline (Find Cluster + Extract FASTA + Plot Cluster)
#' data(eggnog_df)
#' data(seq_data)
#' data(KO_group)
#' KOs = c("K02291","K09844","K20611","K13789",
#' "K09846","K08926","K08927","K08928",
#' "K08929","K13991","K04035","K04039",
#' "K11337","K03404","K11336","K04040",
#' "K03403","K03405","K04037","K03428",
#' "K04038","K06049","K10960","K11333",
#' "K11334","K11335","K08226","K08226",
#' "K09773")
#' rename_KOs = paste0("ko:", KOs)
#' eggnog_df$qaccver = eggnog_df$`#query`
#' eggnog_df$saccver = eggnog_df$KEGG_ko
#' eggnog_df$evalue = eggnog_df$evalue
#' eggnog_df$bitscore = eggnog_df$score
#' eggnog_df$gene = eggnog_df$KEGG_ko
#' gc_list_2 = gclink(in_blastp_df = eggnog_df,
#' in_seq_data = seq_data,
#' in_gene_list = rename_KOs,
#' in_GC_group = KO_group,
#' AllGeneNum = 50,
#' MinConSeq = 25,
#' apply_evalue_filter = FALSE,
#' min_evalue = 1,
#' apply_score_filter = TRUE,
#' min_score = 10,
#' orf_before_first = 1,
#' orf_after_last = 1,
#' levels_gene_group = c('bch','puh','puf','crt',
#' 'acsF','assembly','hypothetical ORF'),
#' color_theme = c('#3BAA51','#6495ED','#DD2421','#EF9320',
#' '#F8EB00','#FF0683','grey'))
#' gc_meta_2 = gc_list_2[["GC_meta"]]
#' gc_seq_2 = gc_list_2[["GC_seq"]]
#' gc_plot_2 = gc_list_2[["GC_plot"]]
#' head(gc_meta_2)
#' head(gc_seq_2)
#' print(gc_plot_2)
#'
#' # SOLUTION FOR "Viewport has zero dimension(s)" ERROR in print(gc_plot)
#' # --------------------------------------------------
#' # When you see this error, try these 3 solutions:
#'
#' # 1. RESIZE RSTUDIO PLOT PANEL (Quickest fix)
#' # Simply click and drag the right border of the plot panel wider
#'
#' # 2. LAUNCH IN NEW WINDOW
#' # dev.new()
#' # print(gc_plot)
#'
#' # 3. SAVE DIRECTLY TO FILE
#' # ggplot2::ggsave('gc_plot.pdf', gc_plot, width=20,height=3)
gclink <- function(in_blastp_df = blastp_df,
in_seq_data = seq_data,
in_gene_list = photosynthesis_gene_list,
in_GC_group = PGC_group,
AllGeneNum = 50,
MinConSeq = 25,
apply_length_filter = FALSE,
down_IQR = 10,
up_IQR = 10,
apply_evalue_filter = FALSE,
min_evalue = 1,
apply_score_filter = FALSE,
min_score = 10,
orf_before_first = 0,
orf_after_last = 0,
orf_range = "All",
levels_gene_group = c('bch','puh','puf','crt','acsF','assembly','regulator',
'hypothetical ORF'),
color_theme = c('#3BAA51','#6495ED','#DD2421','#EF9320','#F8EB00',
'#FF0683','#956548','grey'),
genome_subset = NULL
){
message(paste0('[',format(Sys.time(), "%Y-%m-%d %H:%M:%S"),'] '),"0. Starting gclink!")
# 1 Input blastp result, extract orf, Filter based on gene length, evalue, and bitscore from blastp result
message(paste0('[',format(Sys.time(), "%Y-%m-%d %H:%M:%S"),'] '),"1. Input blastp result and extract orf!")
bin_genes <- in_blastp_df %>%
orf_extract()
# 1.1
if (apply_length_filter) {
message(paste0('[',format(Sys.time(), "%Y-%m-%d %H:%M:%S"),'] ')," Filter based on gene length!")
bin_genes <- bin_genes %>%
length_filter(down_IQR = down_IQR, up_IQR = up_IQR)
}
# 1.2
if (apply_evalue_filter) {
message(paste0('[',format(Sys.time(), "%Y-%m-%d %H:%M:%S"),'] ')," Filter based on evalue!")
bin_genes <- bin_genes %>%
dplyr::filter(evalue <= min_evalue)
}
# 1.3
if (apply_score_filter) {
message(paste0('[',format(Sys.time(), "%Y-%m-%d %H:%M:%S"),'] ')," Filter based on bitscore!")
bin_genes <- bin_genes %>%
dplyr::filter(bitscore >= min_score)
}
# 2 Finding gene cluster
message(paste0('[',format(Sys.time(), "%Y-%m-%d %H:%M:%S"),'] '),"2. Search gene cluster!")
sbgc <- gc_cal(Data = bin_genes,
in_gene_list = in_gene_list,
AllGeneNum = AllGeneNum,
MinConSeq = MinConSeq)
# 3 Add informantion for unannotated ORFs
message(paste0('[',format(Sys.time(), "%Y-%m-%d %H:%M:%S"),'] '),"3. Add informantion for unannotated ORFs!")
sbgc_add <- gc_add(Data = sbgc, Annotation = bin_genes, orf_before_first = orf_before_first, orf_after_last = orf_after_last, orf_range = orf_range)
if (!is.null(in_seq_data)) {
# 4 Get ORF loci from CDS FASTA
message(paste0('[',format(Sys.time(), "%Y-%m-%d %H:%M:%S"),'] '),"4. Get ORF loci from CDS FASTA!")
GC_location <- orf_locate(in_seq_data)
# 5 Merge orf_position and annotion
message(paste0('[',format(Sys.time(), "%Y-%m-%d %H:%M:%S"),'] '),"5. Merge orf_position and annotion!")
GC_gene = merge(sbgc_add, GC_location, by='qaccver', all.x = TRUE) %>%
{.[order(.$gene_cluster, .$orf_position), ]}
GC_gene$gene[is.na(GC_gene$gene)] <- 'unrelated'
# 6 Scale data format
message(paste0('[',format(Sys.time(), "%Y-%m-%d %H:%M:%S"),'] '),"6. Scale data format!")
GC_seq <- GC_gene %>%
dplyr::mutate(gene_name_seq = paste0(">", gene, "_", gene_cluster,
"\n", Sequence)) %>%
{.[!(is.na(.$Sequence)),]} %>%
{.$gene_name_seq}
if (!is.null(in_GC_group)) {
# 7 Merge GC_group
message(paste0('[',format(Sys.time(), "%Y-%m-%d %H:%M:%S"),'] '),"7. Merge GC_group!")
GC_meta <- merge(GC_gene, in_GC_group, by = 'gene', all.x = TRUE)
# 8 Filter based on genome_subset
message(paste0('[',format(Sys.time(), "%Y-%m-%d %H:%M:%S"),'] '),"8. Filter based on genome_subset!")
if (!is.null(genome_subset)) {
GC_meta <- base::subset(GC_meta, genome %in% genome_subset)
}
# 9 Scale and plot
message(paste0('[',format(Sys.time(), "%Y-%m-%d %H:%M:%S"),'] '),"9. Plot!")
GC_meta <- gc_scale(GC_meta, levels_gene_group = levels_gene_group)
GC_plot <- gc_plot(GC_meta, color_theme = color_theme)
# 10 reture list
results = list(
GC_meta = GC_meta,
GC_seq = GC_seq,
GC_plot = GC_plot
)
} else {
results = list(
GC_meta = GC_gene,
GC_seq = GC_seq,
GC_plot = NULL
)
}
} else {
results = list(
GC_meta = sbgc_add,
GC_seq = NULL,
GC_plot = NULL
)
}
message(paste0('[',format(Sys.time(), "%Y-%m-%d %H:%M:%S"),'] '),"10. gclink finished!")
return(results)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.