library("optparse")
#Read command-line options
option_list <- list(
make_option(c("--annot"), type="character", default=NULL,
help="Path to the annotations file made by prepareAnnotations.R.", metavar = "path"),
make_option(c("--batch"), type="character", default = "NULL",
help = "Specify analysis batch. Value '1 200' would split the gene ids into 200 batches and run the first batch.", metavar = "path"),
make_option(c("--out"), type="character", default = "NULL",
help = "Path to the output folder.", metavar = "path"),
make_option(c("--fill"), type="logical", default = TRUE,
help = "Fill alternative internal exons for promoter and 3'end events..", metavar = "path"),
make_option(c("--cage"), type="character", default = NULL,
help = "Path to the CAGE annotations file.", metavar = "path"),
make_option(c("--start_end_diff"), type="integer", default = 20,
help = "Minimal difference (in basepairs) between the alternative promoters or 3'ends.", metavar = "path")
)
opt <- parse_args(OptionParser(option_list=option_list))
print(opt)
annot_file = opt$annot
batch_string = opt$batch
out_dir = opt$out
fill_internal_exons = opt$fill
cage_file = opt$cage
start_end_diff = opt$start_end_diff
dir.create(out_dir)
#Import other dependencies
library("txrevise")
library("purrr")
suppressMessages(library("dplyr"))
suppressMessages(library("rtracklayer"))
#Import prepared transcript annotations
txrevise_data = readRDS(annot_file)
#Import CAGE promoter annotations and merge them into Ensembl annotations
if(!is.null(cage_file)){
cage_data = readRDS(cage_file)
new_exons = c(txrevise_data$exons, cage_data$exons)
new_metadata = dplyr::select(txrevise_data$transcript_metadata,
ensembl_gene_id, ensembl_transcript_id,
longest_start, longest_end, cds_start_NF,
cds_end_NF, cds_start_end_NF)
new_metadata = dplyr::bind_rows(new_metadata, cage_data$transcript_metadata)
txrevise_data = list(exons = new_exons, cdss = txrevise_data$cdss, transcript_metadata = new_metadata)
}
#### Split genes into batches ####
batch_vector = as.integer(unlist(strsplit(batch_string, split = " ")))
gene_ids = unique(txrevise_data$transcript_metadata$ensembl_gene_id)
batch_size = ceiling(length(gene_ids)/batch_vector[2])
batches = txrevise::splitIntoBatches(length(gene_ids), batch_size)
selection = batches == batch_vector[1]
selected_gene_ids = gene_ids[selection]
#Set up output file names
batch_id = paste(batch_vector, collapse = "_")
error_file = file.path(out_dir, paste0("failed_genes.batch_",batch_id, ".txt"))
#Only proceed with event construction if there are any genes in the list
if (length(selected_gene_ids) > 0){
#Construct events
gene_ids_list = setNames(as.list(selected_gene_ids), selected_gene_ids)
#Construct alternative events and remove failed genes
safe_construct = purrr::safely(constructAlternativeEventsWrapper)
alt_events = purrr::map(gene_ids_list, ~safe_construct(., txrevise_data$transcript_metadata,
txrevise_data$exons,
txrevise_data$cdss,
max_internal_diff = 10,
max_start_end_diff = start_end_diff,
fill_internal = fill_internal_exons)$result)
failed_genes = purrr::map_lgl(alt_events, is.null)
alt_events = alt_events[!failed_genes] #Remove failed genes
#Flatten
alt_events = purrr::flatten(alt_events) %>% txrevise::flattenAlternativeEvents(min_alt_event_count = 1)
#Construct event metadata
event_metadata = txrevise::constructEventMetadata(names(alt_events))
#Iterate over groups and positions and export annotations to gff files
for (group in c("grp_1", "grp_2")){
for (event in c("upstream", "downstream", "contained")){
selected_events = dplyr::filter(event_metadata, grp_id == group, event_type == event)
out_file = file.path(out_dir, paste("txrevise",group, event, batch_id, "gff3", sep = "."))
#Only write output if there are any events
if(nrow(selected_events) > 0){
gff_granges = txrevise::transcriptsToAnnotations(alt_events[selected_events$transcript_id], event_metadata)
rtracklayer::export.gff3(gff_granges, out_file)
} else { #Write empty output files
file.create(out_file)
}
}
}
#Write failed genes
failed_names = names(which(failed_genes))
if(length(failed_names) > 0){
write.table(failed_names, error_file, row.names = FALSE, col.names = FALSE, quote = FALSE)
}
} else {
#If there are now genes in the batch then write empty output files
for (group in c("grp_1", "grp_2")){
for (event in c("upstream", "downstream", "contained")){
out_file = file.path(out_dir, paste("txrevise",group, event, batch_id, "gff3", sep = "."))
file.create(out_file)
}
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.