# ================================================================= #
# C O N F I G C L A S S E S #
# ================================================================= #
#' Significance thresholds for alignments
#'
#' Holds one post-correction p-value cutoff per slot; every cutoff defaults to
#' 0.05. For details on inputs see main package documentation.
#'
#' @slot prot2prot numeric post-correction p-value threshold
#' @slot prot2allorf numeric post-correction p-value threshold
#' @slot prot2transorf numeric post-correction p-value threshold
#' @slot dna2dna numeric post-correction p-value threshold
config_alignment_thresholds <- setClass(
  Class = "config_alignment_thresholds",
  slots = c(
    prot2prot     = "numeric",
    prot2allorf   = "numeric",
    prot2transorf = "numeric",
    dna2dna       = "numeric"
  ),
  prototype = prototype(
    prot2prot     = 0.05,
    prot2allorf   = 0.05,
    prot2transorf = 0.05,
    dna2dna       = 0.05
  )
)
#' Sample sizes for the alignment simulations
#'
#' Each slot stores an integer sample size and defaults to 1000. For details
#' on inputs see main package documentation.
#'
#' @slot prot2prot integer sample size
#' @slot prot2allorf integer sample size
#' @slot prot2transorf integer sample size
config_alignment_simulation <- setClass(
  Class = "config_alignment_simulation",
  slots = c(
    prot2prot     = "integer",
    prot2allorf   = "integer",
    prot2transorf = "integer"
  ),
  prototype = prototype(
    prot2prot     = 1000L,
    prot2allorf   = 1000L,
    prot2transorf = 1000L
  )
)
#' Alignment settings and thresholds
#'
#' For details on inputs see main package documentation
#'
#' @slot thresholds config_alignment_thresholds p-value thresholds for hit
#'   significance
#' @slot simulation config_alignment_simulation simulation parameters
#' @slot substitution_matrix character the Smith-Waterman amino acid
#'   substitution matrix (default "BLOSUM80")
#' @slot padjust_method character name of the p-value adjustment method
#'   (default "holm"); presumably a method accepted by stats::p.adjust --
#'   TODO confirm against the code that reads this slot
#' @slot dna2dna_maxspace integer largest size of string to compare
#'   (default 1e8)
#' @slot indel_threshold numeric threshold (default 0.25); its exact role is
#'   not described in this file -- TODO document once confirmed against the
#'   code that reads this slot
config_alignment <- setClass(
  Class = "config_alignment",
  slots = c(
    thresholds          = "config_alignment_thresholds",
    simulation          = "config_alignment_simulation",
    substitution_matrix = "character",
    padjust_method      = "character",
    dna2dna_maxspace    = "integer",
    indel_threshold     = "numeric"
  ),
  prototype = prototype(
    thresholds          = config_alignment_thresholds(),
    simulation          = config_alignment_simulation(),
    substitution_matrix = "BLOSUM80",
    padjust_method      = "holm",
    dna2dna_maxspace    = 1e8L,
    indel_threshold     = 0.25
  )
)
#' Input paths and the name of the focal species
#'
#' The per-species inputs default to empty lists and every scalar path
#' defaults to NA. For details on input requirements, see the main package
#' documentation.
#'
#' @slot gff named list of GFF3 files (.gff or gff3 extensions)
#' @slot fna named list of genome files (fasta format)
#' @slot syn named list of synteny maps
#' @slot tree path of the phylogenetic tree (read with ape::read.tree by
#'   validate_config)
#' @slot focal_species name of the focal species
#' @slot query_gene_list file containing query genes (e.g. orphan candidates)
#' @slot control_gene_list file containing control genes
config_input <- setClass(
  Class = "config_input",
  slots = c(
    gff               = "list",
    fna               = "list",
    syn               = "list",
    tree              = "character",
    focal_species     = "character",
    query_gene_list   = "character",
    control_gene_list = "character"
  ),
  prototype = prototype(
    gff               = list(),
    fna               = list(),
    syn               = list(),
    tree              = NA_character_,
    focal_species     = NA_character_,
    query_gene_list   = NA_character_,
    control_gene_list = NA_character_
  )
)
#' Synder settings
#'
#' For details on inputs see main package documentation
#'
#' @slot offsets integer base offsets for synder (default c(1, 1))
#' @slot k integer how many interlopers to allow (default 0)
#' @slot r numeric (default 0); not described in this file -- presumably a
#'   synder parameter, confirm against the synder documentation
#' @slot trans character (default "i"); not described in this file --
#'   presumably a synder parameter, confirm against the synder documentation
config_synder <- setClass(
  Class = "config_synder",
  slots = c(
    offsets = "integer",
    k       = "integer",
    r       = "numeric",
    trans   = "character"
  ),
  prototype = prototype(
    offsets = c(1L, 1L),
    k       = 0L,
    r       = 0,
    trans   = "i"
  )
)
#' Open Reading Frame search settings
#'
#' @slot start character start codon pattern (e.g. "ATG|TTG|CTG"); default
#'   "ATG"
#' @slot stop character stop codon pattern (e.g. "TAA|TGA|TAG"); default
#'   "TAA|TGA|TAG"
#' @slot minlen integer minimum ORF length, not including start and stop
#'   codons; default 30
config_orf <- setClass(
  Class = "config_orf",
  slots = c(
    start  = "character",
    stop   = "character",
    minlen = "integer"
  ),
  prototype = prototype(
    start  = "ATG",
    stop   = "TAA|TGA|TAG",
    minlen = 30L
  )
)
# Default decision tree, kept as YAML text. The fagin_config prototype parses
# this string with yaml::yaml.load() to fill its decision_tree slot.
#
# Entries carry either a `divisor` (the question asked at that node) or a
# `primary` / `secondary` / `label` triple.
#
# Whitespace inside the string is part of the YAML and therefore part of the
# program's behaviour -- do not reformat it.
#
# NOTE(review): the N2 entry has `secondary: N1`, while every other entry's
# `secondary` repeats its own key. This looks like a typo for N2 -- confirm
# against the code that consumes the tree before changing it.
default_decision_tree <- '
gen:
divisor: Is AA similar to a known gene?
O1:
primary: ORFic
secondary: O1
label: ORFic - known protein
trn:
divisor: Is AA similar to any ORF on known mRNA?
O2:
primary: ORFic
secondary: O2
label: ORFic - transcribed ORF
orf:
divisor: Is AA similar to any ORF anywhere?
O3:
primary: ORFic
secondary: O3
label: ORFic - unknown ORF
nuc:
divisor: Is DNA similar to anything?
cds:
divisor: Does match overlap a CDS?
N1:
primary: Non-ORFic
secondary: N1
label: DNA match to CDS
exo:
divisor: Does match overlap an exon?
N2:
primary: Non-ORFic
secondary: N1
label: SI overlaps exon
rna:
divisor: Does match overlap a known transcript?
N3:
primary: Non-ORFic
secondary: N3
label: DNA match to transcript
N4:
primary: Non-ORFic
secondary: N4
label: No DNA match to any known gene
tec:
U7:
primary: Unknown
secondary: U7
label: skipped
una:
divisor: Query maps off target scaffold
U1:
primary: Unknown
secondary: U1
label: unassembled
ind:
divisor: Query maps to zero-length target interval
U2:
primary: Unknown
secondary: U2
label: possible indel
nst:
divisor: Query maps to N-string
U3:
primary: Unknown
secondary: U3
label: possibly in unknown region
scr:
divisor: Query maps inbetween contiguous block
U5:
primary: Unknown
secondary: U5
label: scrambled synteny
U6:
primary: Unknown
secondary: U6
label: good syntenic match, no homology
'
#' The top configuration class
#'
#' The main thing you will need to change is the input.
#'
#' @slot input config_input Paths to inputs and the focal species name
#' @slot synder config_synder Parameters for synder
#' @slot orf config_orf Settings for Open Reading Frame finding
#' @slot alignment config_alignment Alignment configurations
#' @slot decision_tree list obtained by parsing `default_decision_tree` with
#'   yaml::yaml.load
#' @slot archive character (default 'ARCHIVE'); presumably the directory where
#'   results are archived -- TODO confirm against the code that reads it
#' @slot cache logical (default FALSE); presumably toggles reuse of cached
#'   results -- TODO confirm against the code that reads it
fagin_config <- setClass(
  Class = "fagin_config",
  slots = c(
    input         = "config_input",
    synder        = "config_synder",
    orf           = "config_orf",
    alignment     = "config_alignment",
    decision_tree = "list",
    archive       = "character",
    cache         = "logical"
  ),
  prototype = prototype(
    input         = config_input(),
    synder        = config_synder(),
    # Spelled out for consistency with the other config slots; this equals the
    # default the slot would otherwise take from the config_orf prototype.
    orf           = config_orf(),
    alignment     = config_alignment(),
    decision_tree = yaml::yaml.load(default_decision_tree),
    archive       = 'ARCHIVE',
    cache         = FALSE
  )
)
#' Get a default configuration object
#'
#' Builds a `fagin_config` from the class defaults and replaces any spaces in
#' the focal species name with underscores. No files are inspected here; use
#' `validate_config` to check a filled-in configuration.
#'
#' @return a `fagin_config` object
#' @export
config <- function() {
  con <- fagin_config()
  # The default focal species is NA, which gsub() passes through unchanged.
  con@input@focal_species <- gsub(" ", "_", con@input@focal_species)
  con
}
#' Validate a configuration
#'
#' Checks that every file listed in the `gff`, `fna` and `syn` inputs exists,
#' that the tree, query gene list and control gene list files exist, that the
#' tree can be read with `ape::read.tree`, and that the focal species is one
#' of the tree's tip labels. Stops with an error at the first problem found;
#' otherwise prints the species found in the tree.
#'
#' @param con a configuration object whose `input` slot holds the paths to
#'   check (for example the object returned by `config`, once filled in)
#' @return `NULL`, invisibly; the function is called for its checks and the
#'   report it prints
#' @export
validate_config <- function(con) {
  # TRUE only for exactly one path that exists; a zero-length or multi-valued
  # slot is reported as "not found" instead of breaking the `if` condition.
  is_single_file <- function(path) {
    length(path) == 1L && file.exists(path)
  }

  check_files <- function(xs) {
    # Iterate by position rather than by name so that unnamed (or partially
    # named) lists are still checked; unnamed entries are reported by index.
    labels <- names(xs)
    if (is.null(labels)) {
      labels <- rep("", length(xs))
    }
    unnamed <- is.na(labels) | labels == ""
    labels[unnamed] <- as.character(seq_along(xs))[unnamed]

    for (i in seq_along(xs)) {
      paths <- xs[[i]]
      if (length(paths) == 0L) {
        stop(sprintf("No file given for species '%s'", labels[i]))
      }
      # An entry may hold several paths; report every one that is absent.
      absent <- paths[!file.exists(paths)]
      if (length(absent) > 0L) {
        stop(sprintf(
          "Cannot find file '%s' for species '%s'",
          paste(absent, collapse = ", "),
          labels[i]
        ))
      }
    }
  }

  check_files(con@input@gff)
  check_files(con@input@fna)
  check_files(con@input@syn)

  if (!is_single_file(con@input@tree)) {
    stop("Tree file not found")
  }
  if (!is_single_file(con@input@query_gene_list)) {
    stop("Query gene list not found")
  }
  if (!is_single_file(con@input@control_gene_list)) {
    stop("Control gene list not found")
  }

  # check whether the tree can be loaded
  tree <- ape::read.tree(con@input@tree)
  focal <- con@input@focal_species
  if (!(length(focal) == 1L && focal %in% tree$tip.label)) {
    stop("Focal species is not in tree")
  }

  cat("Found tree with species:", paste(tree$tip.label, collapse=", "), "\n")
  cat("Everything looks good so far\n")
  invisible(NULL)
}
# ================================================================= #
# D A T A C L A S S E S #
# ================================================================= #
# Register the S3 classes that appear as slot types in the S4 classes below.
for (s3_class in c("density", "htest", "phylo", "Seqinfo")) {
  setOldClass(s3_class)
}
#' Descriptive summary of a numeric vector
#'
#' @slot min numeric minimum value
#' @slot q25 numeric 25th quantile
#' @slot median numeric median
#' @slot q75 numeric 75th quantile
#' @slot max numeric maximum value
#' @slot mean numeric mean
#' @slot sd numeric standard deviation
#' @slot n integer total number of elements
#' @slot density density kernel density
numeric_summary <- setClass(
  Class = "numeric_summary",
  slots = c(
    min     = "numeric",
    q25     = "numeric",
    median  = "numeric",
    q75     = "numeric",
    max     = "numeric",
    mean    = "numeric",
    sd      = "numeric",
    n       = "integer",
    density = "density"
  )
)
#' Summary of one synteny map
#'
#' @slot nrow integer
#' @slot width numeric_summary
#' @slot score numeric_summary
#' @slot query_target_log2_ratio numeric_summary
synmap_summary <- setClass(
  Class = "synmap_summary",
  slots = c(
    # length_to_score_coref = "matrix"
    nrow                    = "integer",
    width                   = "numeric_summary",
    score                   = "numeric_summary",
    query_target_log2_ratio = "numeric_summary"
  )
)
#' Summary of a set of sequences
#'
#' Parent class of both the DNA and the protein summary classes.
#'
#' @slot table data.frame of sequence names and lengths
#' @slot comp matrix base composition
seq_summary <- setClass(
  Class = "seq_summary",
  slots = c(
    table = "data.frame",
    comp  = "matrix"
  )
)
#' Summary of a collection of DNA sequences
#'
#' Extends `seq_summary`, so the `table` and `comp` slots are inherited.
#'
#' @slot n_triple integer
#' @slot initial_codon integer
#' @slot final_codon integer
dna_summary <- setClass(
  Class = "dna_summary",
  slots = c(
    n_triple      = "integer",
    initial_codon = "integer",
    final_codon   = "integer"
  ),
  contains = "seq_summary"
)
#' Summary of a collection of protein sequences
#'
#' Extends `seq_summary`, so the `table` and `comp` slots are inherited.
#'
#' @slot initial_residue integer
#' @slot has_internal_stop logical
#' @slot final_residue integer
faa_summary <- setClass(
  Class = "faa_summary",
  slots = c(
    initial_residue   = "integer",
    has_internal_stop = "logical",
    final_residue     = "integer"
  ),
  contains = "seq_summary"
)
#' CDS phase summary
#'
#' @slot table table; presumably a tabulation of CDS phases -- TODO confirm
#'   against the code that fills this slot
#' @slot incomplete_models character; presumably identifiers of gene models
#'   flagged as incomplete -- TODO confirm against the code that fills this
#'   slot
phase_summary <- setClass(
  Class = "phase_summary",
  slots = c(
    table             = "table",
    incomplete_models = "character"
  )
)
#' Summary of the contents of a GFF file
#'
#' @slot table data.frame
#' @slot mRNA_length numeric_summary
#' @slot CDS_length numeric_summary
#' @slot exon_length numeric_summary
gff_summary <- setClass(
  Class = "gff_summary",
  slots = c(
    table       = "data.frame",
    mRNA_length = "numeric_summary",
    CDS_length  = "numeric_summary",
    exon_length = "numeric_summary"
  )
)
#' Summary of an IRanges or GRanges object (without the types of a GFF)
#'
#' @slot table data.frame
#' @slot width numeric_summary
granges_summary <- setClass(
  Class = "granges_summary",
  slots = c(
    table = "data.frame",
    width = "numeric_summary"
  )
)
#' Names of the RData files stored for one species
#'
#' @slot gff.file character
#' @slot dna.file character
#' @slot aa.file character
#' @slot trans.file character
#' @slot orfgff.file character
#' @slot orffaa.file character
#' @slot transorfgff.file character
#' @slot transorffaa.file character
#' @slot nstring.file character
#' @slot specsum.file character
species_data_files <- setClass(
  Class = "species_data_files",
  slots = c(
    gff.file         = "character",
    dna.file         = "character",
    aa.file          = "character",
    trans.file       = "character",
    orfgff.file      = "character",
    orffaa.file      = "character",
    transorfgff.file = "character",
    transorffaa.file = "character",
    nstring.file     = "character",
    specsum.file     = "character"
  )
)
#' Summaries of the data stored for each species
#'
#' @slot gff.summary gff_summary
#' @slot dna.summary dna_summary
#' @slot aa.summary faa_summary
#' @slot trans.summary dna_summary
#' @slot orfgff.summary granges_summary
#' @slot orffaa.summary faa_summary
#' @slot transorfgff.summary granges_summary
#' @slot transorffaa.summary faa_summary
#' @slot nstring.summary numeric. NOTE(review): this slot is declared as a
#'   plain numeric, not a numeric_summary -- confirm which is intended
#' @slot model_phases list; presumably phase summaries of the gene models --
#'   TODO confirm against the code that fills this slot
species_summaries <- setClass(
  Class = "species_summaries",
  slots = c(
    gff.summary         = "gff_summary",
    dna.summary         = "dna_summary",
    aa.summary          = "faa_summary",
    trans.summary       = "dna_summary",
    orfgff.summary      = "granges_summary",
    orffaa.summary      = "faa_summary",
    transorfgff.summary = "granges_summary",
    transorffaa.summary = "faa_summary",
    nstring.summary     = "numeric",
    model_phases        = "list"
  )
)
#' Per-species data summaries and references to the full data
#'
#' @slot files species_data_files RData files containing the full data
#' @slot summaries character Filename for saved detailed summaries of all data
#' @slot seqinfo Seqinfo Genomic info
species_meta <- setClass(
  Class = "species_meta",
  slots = c(
    files     = "species_data_files",
    summaries = "character",
    seqinfo   = "Seqinfo"
  )
)
#' Summary of a synteny map and a reference to its full data
#'
#' @slot synmap.file character filename
#' @slot synmap.summary synmap_summary
synteny_meta <- setClass(
  Class = "synteny_meta",
  slots = c(
    synmap.file    = "character",
    synmap.summary = "synmap_summary"
  )
)
#' Class containing all data required for a Fagin run
#'
#' @slot tree phylo Phylogenetic tree of all species in the analysis
#' @slot focal_species character The focal species
#' @slot queries A character vector of gene ids (orphan candidates, or whatever)
#' @slot control A character vector of gene ids (control)
#' @slot species A list of species_meta objects
#' @slot synmaps A list of synteny maps
derived_input <- setClass(
  Class = "derived_input",
  slots = c(
    tree          = "phylo",
    focal_species = "character",
    queries       = "character",
    control       = "character",
    species       = "list",
    synmaps       = "list"
  )
)
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.