R/sequences_to_tree.R

Defines functions sequences_to_tree

Documented in sequences_to_tree

# -*- tab-width:2;indent-tabs-mode:t;show-trailing-whitespace:t;rm-trailing-spaces:t -*-
# vi: set ts=2 noet:


#' This is the full pipeline taking path to a fasta sequence or a
#' seqinr AA file and generating a newick tree
#'
#' @export
sequences_to_tree <- function(
	sequences,
	output_path,
	run_id,
	user_agent_arg,
	sequence_name_to_tree_name = function(x){tibble::data_frame(tree_name=x)},
	force=FALSE,
	verbose=TRUE
){
	output_prefix <- paste(
		output_path,
		paste(run_id, date_code(), sep="_"),
		sep="/")

	if(file.exists(output_prefix)){
		if(force){
			if(verbose){
				cat("Output path already exists, cleaning it ... \n")
			}
			unlink(output_prefix, recursive=T)
		} else {
			stop(paste0("Output path '", output_prefix, "' already exists, if you would like to overwrite it, please use force=TRUE", sep=""))
		}
	}

	dir.create(output_prefix, recursive=T)
	name_map_fname <- paste0(output_prefix, "/name_map.tsv")
	clustalo_input_fname <- paste0(output_prefix, "/clustalo_input.fasta")
	clustalo_output_fname <- paste0(output_prefix, "/clustalo_output.fasta")
	phyml_input_fname <- paste0(output_prefix, "/phyml_input.phylip")
	phyml_output_tree_fname <- paste0(phyml_input_fname, "_phyml_tree_", run_id, ".txt")
	output_tree_fname <- paste0(output_prefix, "/", run_id, ".newick")
	run_script_fname <- paste0(output_prefix, "/run.sh")


	if(class(sequences) == "character"){
		# assume sequences is actually a path to a (possibly zipped) fasta file
		sequences_input_fname <- sequences
		sequences_output_fname <- paste0(
			output_prefix, "/",
			sequences_input_fname %>%
				stringr::str_split("/") %>%
				unlist %>%
				tail(1))
		file.copy(sequences_input_fname, sequences_output_fname)
		if(verbose){
			cat("Reading in sequences from '", sequences_input_fname, "' ...\n", sep="")
		}

		sequences <- seqinr::read.fasta(sequences_input_fname, "AA")
	} else if(
		# assume sequences is actually something read in from seqinr::read.fasta
		class(sequences) == "list" &&
		length(sequences) > 0 &&
		class(sequences[[1]]) == "SeqFastaAA"){
		sequences_output_fname <- paste0(output_prefix, "/sequences.fa")
		seqinr::write.fasta(
			sequences=sequences,
			names=names(sequences),
			file.out=sequences_output_fname)
	} else {
		stop(paste0("Unable to handle input sequences of class ", class(sequences), ".\n", sep=""))
	}

	if(verbose){
		cat("Generating tree for ", length(sequences), " uniprot sequences ... \n", sep="")
	}

	if(verbose){
		cat("Generating phylip and tree names for sequences ... \n")
	}

  name_map <- sequences %>%
		seqinr::getName() %>%
		sequence_name_to_tree_name() %>%
		dplyr::mutate(phylip_name = sequences %>% length %>% generate_phylip_names)

	readr::write_tsv(name_map, name_map_fname)


	cat(
		"#!/bin/sh\n\n",
		"### Autogenerated from Bethany::sequences_to_tree ####\n",
		"### date: ", date_code(), "\n",
		"### run_id: ", run_id, "\n\n",
		sep="",
		file=run_script_fname)

	if(verbose){
		cat("Writing out fasta for input to Clustal Omega: '", clustalo_input_fname, "' ...\n", sep="")
	}
	seqinr:::write.fasta(
		sequences = sequences,
		names = name_map$phylip_name %>% plyr::llply(identity), # it wants a list
		file.out= clustalo_input_fname)

	if(verbose){
		cat("Generating alignment with Clustal Omega: '", clustalo_input_fname, "' -> '", clustalo_output_fname, "' ... \n", sep="")
		cat("#########################\n")
	}
	clustalo(clustalo_input_fname, clustalo_output_fname, run_script_fname)
	if(verbose){
		cat("#########################\n")
		cat("Done generating alignment with Clustal Omega\n\n")
	}

	if(verbose){
		cat("Converting Clustal Omeaga output from fasta to phylip4 format: '", phyml_input_fname, "' ...\n", sep="")
		cat("#########################\n")
	}
	readseq(
		clustalo_output_fname,
		phyml_input_fname,
		run_script_fname = run_script_fname,
		informat="fasta",
		format="Phylip4")
	if(verbose){
		cat("#########################\n")
		cat("Done converting alignment to phylip4 format\n\n")
	}

	if(verbose){
		cat("Generating phylogeny with PhyML, run_id: '", run_id, "' ... \n", sep="")
		cat("#########################\n")
	}
	phyml(
		phyml_input_fname,
		run_id=run_id,
		run_script_fname=run_script_fname)
	if(verbose){
		cat("#########################\n")
		cat("Done generating phylogeny\n\n")
	}

	if(verbose){
		cat("Renaming generated tree with for viewing ... \n")
	}

	tree <- ape::read.tree(phyml_output_tree_fname)
	if(tree %>% is.null){
		stop(paste0("Failed to generate a phylogeny. Check '", 	output_prefix, "' for output files.\n"))
	}

	tree_2 <- tree %>%
		rename_tree(name_map)

	if(verbose){
		cat("Writing out final tree: '", output_tree_fname, "' ... \n", sep="")
	}
	ape::write.tree(tree_2, output_tree_fname)

}
momeara/Bethany documentation built on Aug. 6, 2019, 9:26 a.m.