PR.PACKAGE <- "Phyloscanner.R.utilities"
PR.read.processed.phyloscanner.output.in.directory <- paste('Rscript', system.file(package=PR.PACKAGE, "phsc.read.processed.phyloscanner.output.in.directory.Rscript"))
PR.pairwise.relationships <- paste('Rscript', system.file(package=PR.PACKAGE, "phsc.pairwise.relationships.Rscript"))
#PR.pairwise.relationships <- paste('Rscript', '/Users/Oliver/git/Phyloscanner.R.utilities/inst/phsc.pairwise.relationships.Rscript')
.onAttach <- function(...)
{
packageStartupMessage("Loaded utility functions for phyloscanner (https://github.com/olli0601/Phyloscanner.R.utilities)")
}
.onLoad <- function(...)
{
suppressMessages(library(ape))
suppressMessages(library(argparse))
#suppressMessages(library(phytools))
##suppressMessages(library(phangorn))
suppressMessages(library(reshape2))
suppressMessages(library(data.table))
suppressMessages(library(RColorBrewer))
suppressMessages(library(grid))
suppressMessages(library(gridExtra))
suppressMessages(library(colorspace))
suppressMessages(library(scales))
suppressMessages(library(ggplot2))
suppressMessages(library(ggtree))
##suppressMessages(library(zoo))
}
phsc.cmd.mafft.add<- function(infile, reffile, outfile, options='')
{
#mafft --reorder --anysymbol --add new_sequences --auto input
tmp <- c( gsub(' ','\\ ',gsub('(','\\(',gsub(')','\\)',infile,fixed=T),fixed=T),fixed=T),
gsub(' ','\\ ',gsub('(','\\(',gsub(')','\\)',reffile,fixed=T),fixed=T),fixed=T),
gsub(' ','\\ ',gsub('(','\\(',gsub(')','\\)',outfile,fixed=T),fixed=T),fixed=T)
)
cmd <- paste('mafft --anysymbol ',options,' --add ',tmp[1],' --auto ',tmp[2],' > ',tmp[3], sep='')
cmd
}
phsc.cmd.blacklist.reads<- function(pr, inputFileName, outputFileName, rawThreshold=1, ratioThreshold=1/200, tipRegex=NA)
{
cmd <- paste(pr, rawThreshold, ratioThreshold, inputFileName, outputFileName)
if(!is.na(tipRegex))
cmd <- paste0(cmd, ' --tipRegex "', tipRegex,'"')
cmd
}
phsc.cmd.NormalisationLookupWriter<- function(pr, scriptdir, inputTreeFileName, normFileName, outputFileName, normFileVar, standardize=FALSE, verbose=NA)
{
cmd <- paste0(pr, ' --scriptdir ',scriptdir,' "',inputTreeFileName, '" "', normFileName, '" "',outputFileName, '" "',normFileVar,'" ')
if(!is.na(standardize) & standardize)
cmd <- paste0(cmd, ' --standardize')
if(!is.na(verbose) & verbose)
cmd <- paste0(cmd, ' --verbose')
cmd
}
phsc.cmd.blacklist.dualinfections<- function(pr, inputFileNameDuals, outputFileName, blacklistFileName=NA, summaryFileName=NA, treeFileName=NA, dual.prop.threshold=0.01, windowCount=NA, verbose=NA)
{
cmd <- paste(pr, dual.prop.threshold, inputFileNameDuals, outputFileName)
if(!is.na(treeFileName))
cmd <- paste(cmd, '--treePrefix', treeFileName)
if(!is.na(windowCount))
cmd <- paste(cmd, '--windowCount', windowCount)
if(!is.na(blacklistFileName))
cmd <- paste(cmd, '--existingBlacklistsPrefix', blacklistFileName)
if(!is.na(summaryFileName))
cmd <- paste(cmd, '--summaryFile', summaryFileName)
if(!is.na(verbose) & verbose)
cmd <- paste0(cmd, ' --verbose')
cmd
}
phsc.cmd.blacklist.parsimonybased<- function(pr, inputFileName, outputFileName, dualCandidatesOutputFileName=NA, blackListFileName=NA, rawThreshold=1, ratioThreshold=1/200, sankhoffK=20, multifurcation.threshold=1e-5, outgroupName=NA, tipRegex=NA, branchLengthNormalisation=NA, verbose=NA)
{
cmd <- paste0(pr, ' ',rawThreshold,' ',ratioThreshold,' ',sankhoffK, ' "', inputFileName, '" "',outputFileName,'"')
if(!is.na(dualCandidatesOutputFileName))
cmd <- paste0(cmd, ' --dualsOutputFile "', dualCandidatesOutputFileName,'"')
if(!is.na(outgroupName))
cmd <- paste0(cmd, ' --outgroupName ', outgroupName)
if(!is.na(blackListFileName))
cmd <- paste0(cmd, ' --blacklist "', blackListFileName,'"')
if(!is.na(tipRegex))
cmd <- paste0(cmd, ' --tipRegex "', tipRegex, '"')
if(!is.na(multifurcation.threshold))
cmd <- paste0(cmd, ' --multifurcationThreshold ', multifurcation.threshold,' ')
if(!is.na(branchLengthNormalisation) & class(branchLengthNormalisation)=='character')
cmd <- paste0(cmd, ' --branchLengthNormalisation "', branchLengthNormalisation, '"')
if(!is.na(branchLengthNormalisation) & class(branchLengthNormalisation)=='numeric')
cmd <- paste0(cmd, ' --branchLengthNormalisation ', branchLengthNormalisation)
if(!is.na(verbose) & verbose)
cmd <- paste0(cmd, ' --verbose')
cmd
}
phsc.cmd.blacklist.rogue.geneticdistance<- function(pr, scriptdir, inputTreeFileName, outputFileName, longestBranchLength=0.04, dropProportion=1/100, blackListFileName=NA, outgroupName=NA, tipRegex=NA, verbose=NA)
{
cmd <- paste(pr, '--scriptdir',scriptdir, dropProportion, longestBranchLength, inputTreeFileName, outputFileName)
if(!is.na(tipRegex))
cmd <- paste0(cmd, ' --tipRegex "', tipRegex,'"')
if(!is.na(outgroupName))
cmd <- paste(cmd, '--outgroupName', outgroupName)
if(!is.na(blackListFileName))
cmd <- paste(cmd, '--blacklist', blackListFileName)
if(!is.na(verbose) & verbose)
cmd <- paste0(cmd, ' --verbose')
cmd
}
phsc.cmd.blacklist.rogue.extremeprob<- function(pr, scriptdir, inputTreeFileName, outputFileName, probThreshold=0.001, longestBranchLength=0.04, dropProportion=1/100, blackListFileName=NA, outgroupName=NA, tipRegex=NA, verbose=NA)
{
cmd <- paste(pr, '--scriptdir',scriptdir, dropProportion, probThreshold, longestBranchLength, inputTreeFileName, outputFileName)
if(!is.na(tipRegex))
cmd <- paste0(cmd, ' --tipRegex "', tipRegex,'"')
if(!is.na(outgroupName))
cmd <- paste(cmd, '--outgroupName', outgroupName)
if(!is.na(blackListFileName))
cmd <- paste(cmd, '--blacklist', blackListFileName)
if(!is.na(verbose) & verbose)
cmd <- paste0(cmd, ' --verbose')
cmd
}
#' @keywords internal
#' @import data.table
#' @title Generate bash commands to process phyloscanner output
#' @description This function generates bash commands that combine the various Rscripts in the phyloscanner toolkit
#' @param tmp.dir Directory with phyloscanner output.
#' @param file.patients File name of the file that contains the list of unique individuals/units that the bam files correspond to, possibly after re-naming.
#' @param pty.args List of phyloscanner input variables.
#' @return character string of bash commands.
phsc.cmd.process.phyloscanner.output.in.directory<- function(tmp.dir, file.patients, pty.args)
{
#run.id <- 'ptyr5'; tmp.dir <- '$CWD/pty_16-09-08-07-32-26'; file.bam <- '/Users/Oliver/duke/2016_PANGEAphylotypes/Rakai_ptinput/ptyr5_bam.txt'
# define variables
prog.pty <- pty.args[['prog.pty']]
root.name <- pty.args[['alignments.root']]
split.rule <- pty.args[['split.rule']]
split.kParam <- pty.args[['split.kParam']]
split.proximityThreshold <- pty.args[['split.proximityThreshold']]
split.readCountsMatterOnZeroBranches <- pty.args[['split.readCountsMatterOnZeroBranches']]
split.tiesRule <- pty.args[['split.tiesRule']]
use.blacklisters <- pty.args[['use.blacklisters']]
contaminant.read.threshold <- pty.args[['contaminant.read.threshold']]
contaminant.prop.threshold <- pty.args[['contaminant.prop.threshold']]
dual.prop.threshold <- pty.args[['dual.prop.threshold']]
roguesubtree.prop.threshold <- pty.args[['roguesubtree.prop.threshold']]
roguesubtree.read.threshold <- pty.args[['roguesubtree.read.threshold']]
roguesubtree.kParam <- pty.args[['roguesubtree.kParam']]
rogue.prop.threshold <- pty.args[['rogue.prop.threshold']]
rogue.longestBranchLength <- pty.args[['rogue.longestBranchLength']]
rogue.probThreshold <- pty.args[['rogue.probThreshold']]
dwns.maxReadsPerPatient <- pty.args[['dwns.maxReadsPerPatient']]
bl.normalising.reference.file <- pty.args[['bl.normalising.reference.file']]
bl.normalising.reference.var <- pty.args[['bl.normalising.reference.var']]
tip.regex <- pty.args[['tip.regex']]
mem.save <- pty.args[['mem.save']]
multifurcation.threshold <- pty.args[['multifurcation.threshold']]
split.pruneBlacklist <- pty.args[['split.pruneBlacklist']]
trms.allowMultiTrans <- pty.args[['trms.allowMultiTrans']]
trmw.min.reads <- pty.args[['pw.trmw.min.reads']]
trmw.min.tips <- pty.args[['pw.trmw.min.tips']]
trmw.close.brl <- pty.args[['pw.trmw.close.brl']]
trmw.distant.brl <- pty.args[['pw.trmw.distant.brl']]
prior.keff <- pty.args[['pw.prior.keff']]
prior.neff <- pty.args[['pw.prior.neff']]
prior.keff.dir <- pty.args[['pw.prior.keff.dir']]
prior.neff.dir <- pty.args[['pw.prior.neff.dir']]
prior.calibrated.prob <- pty.args[['pw.prior.calibrated.prob']]
process.window <- pty.args[['process.window']]
combine.processed.windows <- pty.args[['combine.processed.windows']]
verbose <- pty.args[['verbose']]
#
pty.tools.dir.deprecated <- file.path(dirname(prog.pty),'deprecated')
pty.tools.dir <- file.path(dirname(prog.pty),'tools')
prog.pty.bl.normaliser <- paste('Rscript ',file.path(pty.tools.dir.deprecated,'NormalisationLookupWriter.R'),sep='')
prog.pty.readblacklist <- paste('Rscript ',file.path(pty.tools.dir.deprecated,'MakeReadBlacklist.R'),sep='')
prog.pty.readblacklistsankoff <- paste('Rscript ',file.path(pty.tools.dir,'parsimony_based_blacklister.R'),sep='')
prog.pty.rogueblacklist <- paste('Rscript ',file.path(pty.tools.dir.deprecated,'MakeRogueBlacklist.R'),sep='')
prog.pty.roguewblacklist <- paste('Rscript ',file.path(pty.tools.dir.deprecated,'MakeRogueBlacklistWeibull.R'),sep='')
prog.pty.dualblacklist <- paste('Rscript ',file.path(pty.tools.dir.deprecated,'DualPatientBlacklister.R'),sep='')
prog.pty.downsample <- paste('Rscript ',file.path(pty.tools.dir,'downsample_reads.R'),sep='')
prog.pty.split <- paste('Rscript ',file.path(pty.tools.dir,'split_hosts_to_subgraphs.R'),sep='')
prog.pty.smry <- paste('Rscript ',file.path(pty.tools.dir,'summary_statistics.R'),sep='')
prog.pty.lkltrm <- paste('Rscript ',file.path(pty.tools.dir,'classify_relationships.R'),sep='')
prog.pty.lkl.smry <- paste('Rscript ',file.path(pty.tools.dir.deprecated,'TransmissionSummary.R'),sep='')
run.id <- gsub('_rename.txt|_bam.txt|_patient.txt|_patients.txt','',basename(file.patients))
run.id_ <- ifelse(grepl('[a-z0-9]',substring(run.id, nchar(run.id))), paste(run.id,'_',sep=''), run.id)
blacklistFiles <- NA_character_
bl.normalising.file <- NA_character_
#
stopifnot( !is.null(use.blacklisters) & !is.na(use.blacklisters) )
if(any(grepl('MakeReadBlacklist', use.blacklisters)))
stopifnot( !is.null(contaminant.prop.threshold) & !is.na(contaminant.prop.threshold),
!is.null(contaminant.read.threshold) & !is.na(contaminant.read.threshold))
if(any(grepl('ParsimonyBasedBlacklister|parsimony_based_blacklister', use.blacklisters)))
stopifnot( !any(grepl('MakeRogueBlacklist', use.blacklisters)),
!is.null(roguesubtree.prop.threshold) & !is.na(roguesubtree.prop.threshold),
!is.null(roguesubtree.read.threshold) & !is.na(roguesubtree.read.threshold),
!is.null(roguesubtree.kParam) & !is.na(roguesubtree.kParam))
if(any(grepl('MakeRogueBlacklist', use.blacklisters)))
stopifnot( !any(grepl('MakeRogueBlacklistWeibull', use.blacklisters)),
!is.null(rogue.prop.threshold) & !is.na(rogue.prop.threshold),
!is.null(rogue.longestBranchLength) & !is.na(rogue.longestBranchLength))
if(any(grepl('MakeRogueBlacklistWeibull', use.blacklisters)))
stopifnot( !any(grepl('ParsimonyBasedBlacklister', use.blacklisters)),
!is.null(rogue.prop.threshold) & !is.na(rogue.prop.threshold),
!is.null(rogue.longestBranchLength) & !is.na(rogue.longestBranchLength),
!is.null(rogue.probThreshold) & !is.na(rogue.probThreshold))
if(any(grepl('DualPatientBlacklister', use.blacklisters)))
stopifnot(!is.null(dual.prop.threshold) & !is.na(dual.prop.threshold))
if(any(grepl('DownsampleReads|downsample_reads', use.blacklisters)))
stopifnot(!is.null(dwns.maxReadsPerPatient) & !is.na(dwns.maxReadsPerPatient))
if(is.null(tip.regex))
tip.regex <- NA_character_
use.sankhoff.blacklister <- any(grepl('ParsimonyBasedBlacklister|parsimony_based_blacklister', use.blacklisters))
#
cmd <- ''
#
#if(1) # OLD CODE (as long as we work with prev generated zip files) TODO: swith off
#{
# cmd <- paste(cmd, 'for file in DuplicateReadCountsProcessed_*.csv; do\n\tmv "$file" "${file//DuplicateReadCountsProcessed_/',run.id_,'DuplicateReadCounts_}"\ndone',sep='')
#}
#
# bash command to define normalising constants, if
# a reference file and a reference column name in that file are specified
#
if( !is.null(bl.normalising.reference.file) & !is.null(bl.normalising.reference.var) )
{
if(is.null(combine.processed.windows) || combine.processed.windows==0)
{
tmp <- phsc.cmd.NormalisationLookupWriter( prog.pty.bl.normaliser,
pty.tools.dir.deprecated,
file.path(tmp.dir,paste0(run.id_,'InWindow_')),
bl.normalising.reference.file,
file.path(tmp.dir,paste0(run.id_,'normconst.csv')),
bl.normalising.reference.var,
standardize=TRUE
)
cmd <- paste(cmd, tmp, sep='\n')
}
bl.normalising.file <- file.path(tmp.dir,paste0(run.id_,'normconst.csv'))
}
#
# bash command to blacklist taxa with duplicate counts that suggest contaminants
#
if( any(grepl('MakeReadBlacklist', use.blacklisters)) )
{
if(is.null(combine.processed.windows) || combine.processed.windows==0)
{
cmd <- paste(cmd, '\nfor file in ', run.id_,'DuplicateReadCounts_*.csv; do\n\t',sep='')
tmp <- phsc.cmd.blacklist.reads( prog.pty.readblacklist,
'"$file"',
paste('"${file//DuplicateReadCounts/blacklist}"',sep=''),
contaminant.read.threshold,
contaminant.prop.threshold,
tipRegex=tip.regex)
cmd <- paste(cmd, tmp, '\ndone', sep='')
}
blacklistFiles <- file.path(tmp.dir, paste(run.id_,'blacklist_InWindow_',sep=''))
}
#
# bash command to blacklist taxa with duplicate counts that suggest contaminants
# and identifying contaminants through a Sankhoff parsimony reconstruction
#
if( any(grepl('ParsimonyBasedBlacklister|parsimony_based_blacklister', use.blacklisters)))
{
if(is.null(combine.processed.windows) || combine.processed.windows==0)
{
tmp <- phsc.cmd.blacklist.parsimonybased( prog.pty.readblacklistsankoff,
file.path(tmp.dir,paste0(run.id_,'InWindow_')),
file.path(tmp.dir,paste0(run.id_,'blacklistsank_InWindow')),
dualCandidatesOutputFileName=file.path(tmp.dir,paste0(run.id_,'duallistsank_InWindow')),
blackListFileName=blacklistFiles,
rawThreshold=roguesubtree.read.threshold,
ratioThreshold=roguesubtree.prop.threshold,
sankhoffK=roguesubtree.kParam,
multifurcation.threshold=multifurcation.threshold,
outgroupName=root.name,
tipRegex=tip.regex,
branchLengthNormalisation=bl.normalising.file,
verbose=verbose
)
cmd <- paste(cmd, tmp, sep='\n')
}
blacklistFiles <- file.path(tmp.dir, paste(run.id_,'blacklistsank_',sep=''))
}
#
# bash command to make blacklists of rogue taxa for each window based on branch lengths
#
if( any(grepl('MakeRogueBlacklist', use.blacklisters)))
{
if(is.null(combine.processed.windows) || combine.processed.windows==0)
{
cmd <- paste(cmd, '\nfor file in ', file.path(tmp.dir,run.id_),'*tree; do\n\t',sep='')
cmd <- paste(cmd,'TMP=${file//tree/csv}\n\t',sep='')
tmp <- ifelse(any(is.na(blacklistFiles)), NA_character_, '"${TMP//InWindow/blacklist_InWindow}"')
tmp <- phsc.cmd.blacklist.rogue.geneticdistance( prog.pty.rogueblacklist,
pty.tools.dir,
'"$file"',
'"${TMP//InWindow/blacklistrogue_InWindow}"',
longestBranchLength=rogue.longestBranchLength,
dropProportion=rogue.prop.threshold,
blackListFileName=tmp,
outgroupName=root.name,
tipRegex=tip.regex)
cmd <- paste(cmd, tmp, '\n\t','mv "${TMP//InWindow/blacklistrogue_InWindow}" "${TMP//InWindow/blacklist_InWindow}"','\n','done', sep='')
}
blacklistFiles <- file.path(tmp.dir, paste(run.id_,'blacklist_',sep=''))
}
#
# bash command to make blacklists of rogue taxa for each window based on Weibull extreme value probability of branch lengths
#
if( any(grepl('MakeRogueBlacklistWeibull', use.blacklisters)))
{
if((is.null(combine.processed.windows) || combine.processed.windows==0))
{
cmd <- paste(cmd, '\nfor file in ', file.path(tmp.dir,run.id_),'*tree; do\n\t',sep='')
cmd <- paste(cmd,'TMP=${file//tree/csv}\n\t',sep='')
tmp <- ifelse(is.na(blacklistFiles), NA_character_, '"${TMP//InWindow/blacklist_InWindow}"')
tmp <- phsc.cmd.blacklist.rogue.extremeprob( prog.pty.roguewblacklist,
pty.tools.dir,
'"$file"',
'"${TMP//InWindow/blacklistrogue_InWindow}"',
probThreshold=rogue.probThreshold,
longestBranchLength=rogue.longestBranchLength,
dropProportion=rogue.prop.threshold,
blackListFileName=tmp,
outgroupName=root.name,
tipRegex=tip.regex)
cmd <- paste(cmd, tmp, '\n\t','mv "${TMP//InWindow/blacklistrogue_InWindow}" "${TMP//InWindow/blacklist_InWindow}"','\n','done', sep='')
}
blacklistFiles <- file.path(tmp.dir, paste(run.id_,'blacklist_',sep=''))
}
#
# bash command to make blacklists of duplicate taxa based on candidate duplicates output from ParsimonyBlacklist.R
#
if( any(grepl('DualPatientBlacklister', use.blacklisters)))
{
if(is.null(combine.processed.windows) || combine.processed.windows==0)
{
tmp <- phsc.cmd.blacklist.dualinfections( prog.pty.dualblacklist,
file.path(tmp.dir,paste0(run.id_,'duallistsank_')),
file.path(tmp.dir,paste0(run.id_,'blacklistdual_')),
treeFileName=file.path(tmp.dir,paste0(run.id_,'.*tree')),
summaryFileName=file.path(tmp.dir,paste0(run.id_,'dualsummary.csv')),
blacklistFileName=blacklistFiles,
dual.prop.threshold=dual.prop.threshold)
cmd <- paste(cmd, tmp, sep='\n')
cmd <- paste0(cmd, '\n','for file in ', basename(blacklistFiles),'*csv; do\n\t','cp "$file" "${file//',basename(blacklistFiles),'/',paste0(run.id_,'blacklistfinal_'),'}"','\n','done')
cmd <- paste0(cmd, '\n','for file in ', paste0(run.id_,'blacklistdual_'),'*csv; do\n\t','cp "$file" "${file//blacklistdual/blacklistfinal}"','\n','done')
}
blacklistFiles <- file.path(tmp.dir, paste(run.id_,'blacklistfinal_',sep=''))
}
#
# bash command to downsample tips (add to blacklist)
#
if( any(grepl('DownsampleReads|downsample_reads', use.blacklisters)))
{
if(is.null(combine.processed.windows) || combine.processed.windows==0)
{
tmp <- phsc.cmd.blacklist.downsample( prog.pty.downsample,
file.path(tmp.dir,run.id_),
file.path(tmp.dir,paste(run.id_,'blacklistdwns_',sep='')),
maxReadsPerPatient=dwns.maxReadsPerPatient,
blackListFileName=blacklistFiles,
tipRegex=tip.regex,
seed=42L,
hosts=NA,
excludeUnderrepresented=FALSE,
verbose=verbose)
cmd <- paste(cmd, tmp, sep='\n')
}
blacklistFiles <- file.path(tmp.dir, paste(run.id_,'blacklistdwns_InWindow_',sep=''))
#cmd <- paste(cmd, '\n','for file in ', file.path(tmp.dir,paste(run.id_,'blacklistdwns_*',sep='')),'; do\n\t',sep='')
#cmd <- paste(cmd, 'mv "$file" "${file//blacklistdwns/blacklist}"\ndone',sep='')
}
#
# bash command to plot trees and calculate splits
#
if(is.null(combine.processed.windows) || combine.processed.windows==0)
{
tmp <- phsc.cmd.SplitPatientsToSubGraphs( prog.pty.split,
file.path(tmp.dir,run.id_),
run.id,
outputdir=tmp.dir,
blacklistFiles=gsub('InWindow_','',blacklistFiles),
outgroupName=root.name,
splitsRule=split.rule,
sankhoff.k=split.kParam,
proximityThreshold=split.proximityThreshold,
readCountsMatterOnZeroBranches=split.readCountsMatterOnZeroBranches,
multifurcation.threshold=multifurcation.threshold,
branchLengthNormalisation=bl.normalising.file,
tipRegex=tip.regex,
pdfwidth=30,
pdfrelheight=0.15,
useff=FALSE,
pruneBlacklist=split.pruneBlacklist,
idFile=file.path(tmp.dir, basename(file.patients)),
verbose=verbose)
cmd <- paste(cmd, tmp, sep='\n')
}
#
# bash command to get likely transmissions
#
if(is.null(combine.processed.windows) || combine.processed.windows==0)
{
tmp <- phsc.cmd.LikelyTransmissions( prog.pty.lkltrm,
file.path(tmp.dir,paste('ProcessedTree_',split.rule,'_',run.id_,sep='')),
file.path(tmp.dir,paste('subgraphs_',split.rule,'_',run.id_,sep='')),
file.path(tmp.dir,substr(run.id_,1,nchar(run.id_)-1)),
branchLengthNormalisation=bl.normalising.file,
collapsedTree=FALSE,
verbose=verbose
)
cmd <- paste(cmd, tmp, sep='\n')
}
#
# bash command to calculate patient stats
#
#file.bam <- paste(run.id_,'bam.txt',sep='')
#cmd <- paste(cmd,"\nsed 's/.*\\///' \"", file.path(tmp.dir,basename(file.bam)), '" > "',file.path(tmp.dir,file.patients),'"', sep='')
if(is.null(process.window) || is.na(process.window))
{
tmp <- phsc.cmd.SummaryStatistics( prog.pty.smry,
pty.tools.dir,
file.path(tmp.dir, basename(file.patients)),
file.path(tmp.dir, paste('ProcessedTree_',split.rule,'_',run.id_,'InWindow_',sep='')),
file.path(tmp.dir, paste('subgraphs_',split.rule,'_',run.id_,'InWindow_',sep='')),
file.path(tmp.dir, run.id_),
tipRegex=tip.regex,
blacklistFiles=blacklistFiles,
windowCoords=NA,
recombinationFiles=NA,
noReadCounts=FALSE,
verbose=verbose
)
cmd <- paste(cmd, tmp, sep='\n')
}
#
# add bash command to get likely transmissions summary
#
if(is.null(process.window) || is.na(process.window))
{
tmp <- phsc.cmd.LikelyTransmissionsSummary( prog.pty.lkl.smry,
pty.tools.dir.deprecated,
file.path(tmp.dir, basename(file.patients)),
file.path(tmp.dir, paste(run.id_,'patStatsFull.csv',sep='')),
file.path(tmp.dir, paste(run.id_,'classification_InWindow_',sep='')),
file.path(tmp.dir, paste(run.id_,'trmStats.csv',sep='')),
file.path(tmp.dir, paste(run.id_,'trmStatsPerWindow.rda',sep='')),
min.threshold=1,
allow.MultiTrans=trms.allowMultiTrans,
verbose=verbose)
cmd <- paste(cmd, tmp, sep='\n')
}
#
# add bash command to calculate pairwise relationships
#
if(is.null(process.window) || is.na(process.window))
{
tmp <- phsc.cmd.pairwise.relationships( file.path(tmp.dir, paste(run.id_,'trmStatsPerWindow.rda',sep='')),
file.path(tmp.dir, paste(run.id_,'pairwise_relationships.rda',sep='')),
trmw.min.reads=trmw.min.reads,
trmw.min.tips=trmw.min.tips,
trmw.close.brl=trmw.close.brl,
trmw.distant.brl=trmw.distant.brl,
prior.keff=prior.keff,
prior.neff=prior.neff,
prior.keff.dir=prior.keff.dir,
prior.neff.dir=prior.neff.dir,
prior.calibrated.prob=prior.calibrated.prob,
rel.likely.pair=TRUE,
rel.likely.pair.by.distance.only=TRUE,
rel.likely.pair.by.topology.only=TRUE,
rel.likely.pair.by.cross.table=TRUE,
rel.direction=TRUE,
rel.chain=TRUE)
cmd <- paste(cmd, tmp, sep='\n')
}
#
# add bash command to compress phyloscanner output
#
if(is.null(process.window) || is.na(process.window))
{
tmp <- phsc.cmd.read.processed.phyloscanner.output.in.directory(file.path(tmp.dir, run.id_),
file.path(tmp.dir, run.id_),
read.likelytransmissions=TRUE,
read.trees=TRUE,
read.subtrees=TRUE,
resume=FALSE,
zip=TRUE)
cmd <- paste(cmd, tmp, sep='\n')
}
#
# if mem-save output delete: .*subtrees_[scfr]_csv.zip .*subtrees_[scfr]_rda.zip .*DuplicateReadCounts.zip .*_blacklist.zip .*_duallist.zip .*_collapsed.zip .*_LikelyTransmissions.zip
#
if(!is.null(mem.save) & mem.save==1)
{
tmp <- c('*_subtrees_r.rda','*subtrees_[scrf]_rda.zip','*DuplicateReadCounts.zip','*_blacklist.zip','*_duallist.zip','*_collapsed.zip','*_LikelyTransmissions.zip')
tmp <- paste('rm ',tmp, sep='',collapse='\n')
cmd <- paste(cmd, tmp, sep='\n')
}
cmd
}
phsc.drop.tip<- function (phy, tip, trim.internal = TRUE, subtree = FALSE, root.edge = 0, rooted = is.rooted(phy))
{
# code from ape
if (!inherits(phy, "phylo"))
stop("object \"phy\" is not of class \"phylo\"")
Ntip <- length(phy$tip.label)
if (is.character(tip))
tip <- which(phy$tip.label %in% tip)
out.of.range <- tip > Ntip
if (any(out.of.range))
{
warning("some tip numbers were larger than the number of tips: they were ignored")
tip <- tip[!out.of.range]
}
if (!length(tip))
return(phy)
if (length(tip) == Ntip)
{
warning("drop all tips of the tree: returning NULL")
return(NULL)
}
wbl <- !is.null(phy$edge.length)
if (!rooted && subtree)
{
phy <- root(phy, (1:Ntip)[-tip][1])
root.edge <- 0
}
phy <- reorder(phy)
NEWROOT <- ROOT <- Ntip + 1
Nnode <- phy$Nnode
Nedge <- dim(phy$edge)[1]
if (subtree)
{
trim.internal <- TRUE
tr <- reorder(phy, "postorder")
N <- .C(node_depth, as.integer(Ntip), as.integer(Nnode),
as.integer(tr$edge[, 1]), as.integer(tr$edge[, 2]),
as.integer(Nedge), double(Ntip + Nnode), 1L)[[6]]
}
edge1 <- phy$edge[, 1]
edge2 <- phy$edge[, 2]
keep <- !logical(Nedge)
keep[match(tip, edge2)] <- FALSE
if (trim.internal)
{
ints <- edge2 > Ntip
repeat
{
sel <- !(edge2 %in% edge1[keep]) & ints & keep
if (!sum(sel))
break
keep[sel] <- FALSE
}
if (subtree)
{
subt <- edge1 %in% edge1[keep] & edge1 %in% edge1[!keep]
keep[subt] <- TRUE
}
if (root.edge && wbl)
{
degree <- tabulate(edge1[keep])
if (degree[ROOT] == 1)
{
j <- integer(0)
repeat
{
i <- which(edge1 == NEWROOT & keep)
j <- c(i, j)
NEWROOT <- edge2[i]
degree <- tabulate(edge1[keep])
if (degree[NEWROOT] > 1)
break
}
keep[j] <- FALSE
if (length(j) > root.edge)
j <- 1:root.edge
NewRootEdge <- sum(phy$edge.length[j])
if (length(j) < root.edge && !is.null(phy$root.edge))
NewRootEdge <- NewRootEdge + phy$root.edge
phy$root.edge <- NewRootEdge
}
}
}
if (!root.edge)
phy$root.edge <- NULL
phy$edge <- phy$edge[keep, ]
if (wbl)
phy$edge.length <- phy$edge.length[keep]
TERMS <- !(phy$edge[, 2] %in% phy$edge[, 1]) #all nodes that do not appear in first column are new tips
oldNo.ofNewTips <- phy$edge[TERMS, 2]
if (subtree)
{
i <- which(tip %in% oldNo.ofNewTips)
if (length(i))
{
phy$tip.label[tip[i]] <- "[1_tip]"
tip <- tip[-i]
}
}
n <- length(oldNo.ofNewTips)
phy$edge[TERMS, 2] <- rank(phy$edge[TERMS, 2])
phy$tip.label <- phy$tip.label[-tip]
if (subtree || !trim.internal)
{
node2tip <- oldNo.ofNewTips[oldNo.ofNewTips > Ntip]
new.tip.label <- if(subtree)
{
paste("[", N[node2tip], "_tips]", sep = "")
}
else
{
if (is.null(phy$node.label))
rep("NA", length(node2tip))
else phy$node.label[node2tip - Ntip]
}
phy$tip.label <- c(phy$tip.label, new.tip.label)
# the order of oldNo.ofNewTips does not mean anything. Need to sort.
# also: node2tip may not have a colour, whereas it should have the colour of one of the tips, so use 1 tip
oldNo.ofNewTips <- c( sort(oldNo.ofNewTips[-which(oldNo.ofNewTips > Ntip)]), tip[1] )
}
phy$Nnode <- dim(phy$edge)[1] - n + 1L
newNb <- integer(Ntip + Nnode)
newNb[NEWROOT] <- n + 1L
sndcol <- phy$edge[, 2] > n
newNb[sort(phy$edge[sndcol, 2])] <- (n + 2):(n + phy$Nnode)
phy$edge[sndcol, 2] <- newNb[phy$edge[sndcol, 2]]
phy$edge[, 1] <- newNb[phy$edge[, 1]]
storage.mode(phy$edge) <- "integer"
if (!is.null(phy$node.label))
phy$node.label <- phy$node.label[which(newNb > 0) - Ntip]
# add new tip numbers to newNb
# want to reorder oldNo.ofNewTips so this corresponds to new tips
newNb[oldNo.ofNewTips] <- seq_along(oldNo.ofNewTips)
# select remaining nodes and put into right order
newNb <- subset(data.table(IDX_O=seq_along(newNb), IDX_N=newNb), IDX_N>0)
setkey(newNb, IDX_N)
attr(phy, "INDIVIDUAL") <- attr(phy, "INDIVIDUAL")[ newNb[, IDX_O] ]
attr(phy, "NODE_SHAPES")<- attr(phy, "NODE_SHAPES")[ newNb[, IDX_O] ]
attr(phy, "SPLIT") <- attr(phy, "SPLIT")[ newNb[, IDX_O] ]
# collapse singles
# code from ape
n <- length(phy$tip.label)
e1 <- phy$edge[, 1]
e2 <- phy$edge[, 2]
tab <- tabulate(e1)
if(all(tab > 1))
return(phy)
if(is.null(phy$edge.length))
{
root.edge <- FALSE
wbl <- FALSE
}
if(!is.null(phy$edge.length))
{
wbl <- TRUE
el <- phy$edge.length
}
if (root.edge)
ROOTEDGE <- 0
ROOT <- n + 1L
while (tab[ROOT] == 1)
{
i <- which(e1 == ROOT)
ROOT <- e2[i]
if (wbl) {
if (root.edge)
ROOTEDGE <- ROOTEDGE + el[i]
el <- el[-i]
}
e1 <- e1[-i]
e2 <- e2[-i]
}
singles <- which(tabulate(e1) == 1)
while (length(singles))
{
i <- which(e1 == singles[1])
j <- which(e2 == e1[i])
e2[j] <- e2[i]
if (wbl) {
el[j] <- el[j] + el[i]
el <- el[-i]
}
e1 <- e1[-i]
e2 <- e2[-i]
singles <- which(tabulate(e1) == 1)
}
Nnode <- length(e1) - n + 1L
oldnodes <- unique(e1)
if (!is.null(phy$node.label))
phy$node.label <- phy$node.label[oldnodes - n]
newNb <- integer(max(oldnodes))
newNb[ROOT] <- n + 1L
sndcol <- e2 > n
e2[sndcol] <- newNb[e2[sndcol]] <- n + 2:Nnode
e1 <- newNb[e1]
phy$edge <- cbind(e1, e2, deparse.level = 0)
phy$Nnode <- Nnode
if (wbl)
{
if (root.edge)
phy$root.edge <- ROOTEDGE
phy$edge.length <- el
}
# select remaining nodes and put into right order
attr(phy, "INDIVIDUAL") <- attr(phy, "INDIVIDUAL")[c(seq_len(Ntip(phy)), oldnodes)]
attr(phy, "NODE_SHAPES") <- attr(phy, "NODE_SHAPES")[c(seq_len(Ntip(phy)), oldnodes)]
attr(phy, "SPLIT") <- attr(phy, "SPLIT")[c(seq_len(Ntip(phy)), oldnodes)]
phy
}
phsc.extract.clade <- function(phy, node, root.edge = 0, interactive = FALSE)
{
n <- length(phy$tip.label)
if (interactive) {
cat("Click close to the node...\n")
node <- identify(phy)$nodes
} else {
if (length(node) > 1) {
node <- node[1]
warning("only the first value of 'node' has been considered")
}
if (is.character(node)) {
if (is.null(phy$node.label))
stop("the tree has no node labels")
node <- match(node, phy$node.label) + n
if (is.na(node)) stop("'node' not among the node labels.")
}
if (node <= n)
stop("node number must be greater than the number of tips")
}
if (node == n + 1L) return(phy)
keep <- prop.part(phy)[[node - n]]
drop.tip(phy, (1:n)[-keep], root.edge = root.edge, rooted = TRUE)
}
#' @import data.table ape
#' @importFrom phangorn Ancestors Children Siblings Descendants
phsc.phy.collapse.monophyletic.clades<- function(ph, drop.blacklisted=TRUE, tip.regex='(.*)_read_([0-9]+)_count_([0-9]+)')
{
#ph<- phs[[1]]
#references.pattern<- 'REF'
phb <- data.table( IDX=seq_along(ph$tip.label),
BAM=ph$tip.label,
IND= gsub(tip.regex,'\\1',ph$tip.label),
COUNT=sub(tip.regex,'\\3',ph$tip.label),
BLACKLISTED= is.na(attr(ph, 'INDIVIDUAL')[seq_len(Ntip(ph))])
)
set(phb, phb[, which(IND==COUNT)], 'COUNT', NA_character_)
set(phb, NULL, 'COUNT', phb[, as.integer(COUNT)])
set(phb, NULL, 'REF', phb[, is.na(COUNT)])
set(phb, phb[, which(REF)],'IND','REFERENCE')
tmp <- subset(phb, BLACKLISTED & !REF)[,IDX]
if(drop.blacklisted & length(tmp))
{
ph <- phsc.drop.tip(ph, tmp)
phb <- data.table( IDX=seq_along(ph$tip.label),
BAM=ph$tip.label,
IND= gsub(tip.regex,'\\1',ph$tip.label),
COUNT=sub(tip.regex,'\\3',ph$tip.label) )
set(phb, phb[, which(IND==COUNT)], 'COUNT', NA_character_)
set(phb, NULL, 'COUNT', phb[, as.integer(COUNT)])
set(phb, NULL, 'REF', phb[, is.na(COUNT)])
set(phb, phb[, which(REF)],'IND','REFERENCE')
}
# for each patient define mrca (MRCA)
# for each patient,
# determine MRCA (mrca)
# calculate number of other individuals in clade below MRCA (diff)
z <- phb[, {
#IND<- "15861_1_37.bam"
#IDX<- subset(phb, IND=="15861_1_37.bam")[, IDX]
#print(IND)
mrca <- IDX
diff <- 0L
if(length(IDX)>1)
{
mrca <- as.integer(getMRCA(ph, IDX))
tmp <- phsc.extract.clade(ph, mrca, root.edge=1)
#print(tmp)
diff <- length(setdiff(gsub(tip.regex,'\\1',tmp$tip.label), IND))
}
list(MRCA=mrca, DIFF_IND=diff)
}, by=c('IND')]
z <- subset(z, IND!='REFERENCE')
# for all patients that are not monophyletic, there could be several clades
# trace back all ancestors between tips of other individuals and MRCA
# for each such tip, construct the unique path to MRCA that is not on a previous path
# find change points on these unique paths below which the subtree contains reads from the current patient
# the idea is that all monophyletic clades of this patient must break off from one of these paths
tmp <- subset(z, DIFF_IND>0, c(IND, MRCA))
if(!nrow(tmp))
z[, CLADE:=NA_integer_]
if(nrow(tmp))
{
# find all tips that belong to another patient than the current individual
tmp <- tmp[, {
if(MRCA<=Ntip(ph))
tmp <- ph$tip.label[MRCA]
if(MRCA>Ntip(ph))
tmp <- phsc.extract.clade(ph,MRCA)$tip.label
list(MRCA=MRCA, MRCA_IND=IND, BAM=tmp)
}, by='IND']
tmp[, IND:=NULL]
zz <- merge(tmp, subset(phb, select=c(BAM, IND, IDX)), by='BAM')
zz <- subset(zz, MRCA_IND!=IND)
# determine change points
zz <- zz[, {
#IDX<- c(980,910,912,950); MRCA<- 1580; IND<- 'R1_RES669_S20_L001'; MRCA_IND<- 'R1_RES827_S23_L001'
# determine paths to MRCA from each tip
anc.group <- Ancestors(ph, IDX)
if(!is.list(anc.group))
anc.group <- list(anc.group)
anc.group <- lapply(seq_along(anc.group), function(i) anc.group[[i]][ seq_len(which(anc.group[[i]]==MRCA[1])-1)] )
# determine unique paths until we hit a path that is already visited
anc.join <- lapply(seq_along(anc.group), function(i){ unique(unlist(lapply(seq_len(i), function(j) anc.group[[j]]))) })
anc.join <- c(NA,anc.join)
anc.group <- lapply(seq_along(anc.group), function(i) setdiff(anc.group[[i]],anc.join[[i]]) )
# check which clades defined by the mrcas on the ancestor path contain at least one read from MRCA_IND
tmp <- lapply(seq_along(anc.group), function(i) sapply(anc.group[[i]], function(j) any(grepl(MRCA_IND, phsc.extract.clade(ph,j)$tip.label)) ) )
# determine lowest node (counting from tips) which contains at least one read from MRCA_IND
tmp <- lapply(seq_along(tmp), function(i){
ans <- NA_integer_
tmp2 <- integer(0)
if(length(tmp[[i]]))
tmp2<- which(tmp[[i]])
if(length(tmp2))
ans <- as.integer(anc.group[[i]][tmp2[1]])
ans
})
list(IDX=IDX, CHANGE_NODE=unlist(tmp))
},by=c('IND','MRCA','MRCA_IND')]
zz <- subset(zz, !is.na(CHANGE_NODE))
# each ancestor before a change node could have as one of its children a monophyletic clade from this patient
zz <- unique(zz, by=c('MRCA_IND','CHANGE_NODE'))
if(!nrow(zz))
{
set(zz, NULL, c('IDX','MRCA','IND'), NULL)
zz[, CLADE:= rep(NA_integer_,0)]
}
if(nrow(zz))
zz <- zz[, {
#CHANGE_NODE<- 2053; MRCA<- 1580; MRCA_IND<- 'R1_RES827_S23_L001'
#MRCA<- 1212; MRCA_IND<- 'R1_RES669_S20_L001'; CHANGE_NODE<- 1218
# the first two potentially monophyletic clades are the children of the CHANGE_NODE
tmp <- Children(ph, CHANGE_NODE)
# define path from change node to just before mrca
path <- c(CHANGE_NODE, setdiff( Ancestors(ph,CHANGE_NODE), c(MRCA,Ancestors(ph,MRCA)) ))
# monophyletic clades could break off the path ending at MRCA
# the mrca's of these clades are the siblings of the path constructed above
pot.clade <- c(tmp, unlist(Siblings(ph, path)))
# check if potential clades are monophyletic
tmp <- sapply(pot.clade, function(x){
if(x<=Ntip(ph))
ans <- grepl(MRCA_IND, ph$tip.label[x])
if(x>Ntip(ph))
ans <- all(grepl(MRCA_IND, phsc.extract.clade(ph,x)$tip.label))
ans
})
# return monophyletic clades
list(CLADE=as.integer(pot.clade[tmp]))
}, by=c('MRCA_IND','CHANGE_NODE')]
zz[, CHANGE_NODE:=NULL]
setnames(zz,'MRCA_IND','IND')
# merge all monophyletic clades
z <- merge(z, unique(zz), all=1, by='IND', allow.cartesian=TRUE)
}
tmp <- z[, which(DIFF_IND==0)]
set(z, tmp, 'CLADE', z[tmp,MRCA])
z <- subset(z, !is.na(CLADE))
# double check CLADEs (the MONOPH_PA condition is key)
if(nrow(z))
{
tmp <- subset(z, !is.na(CLADE))[,{
if(CLADE<=Ntip(ph))
tmp <- grepl(IND, ph$tip.label[CLADE])
if(CLADE>Ntip(ph))
tmp <- all(grepl(IND, phsc.extract.clade(ph,CLADE)$tip.label))
list(MONOPH=tmp, MONOPH_PA=all(grepl(IND, phsc.extract.clade(ph,Ancestors(ph, CLADE, type="parent"))$tip.label)))
}, by=c('IND','CLADE')]
stopifnot(tmp[, all(MONOPH)], tmp[, !any(MONOPH_PA)])
#
# collapse clades
#
zz <- z[, {
#CLADE<- 292
list(IDX=Descendants(ph, CLADE, type='tips')[[1]])
}, by='CLADE']
zz <- merge(phb, zz, by='IDX')
zz <- merge(zz, zz[, list(CLADE_N_TAXA=length(COUNT), CLADE_LABEL=paste0(IND[1],'_read_',CLADE[1],'_count_',sum(COUNT))), by='CLADE'],by='CLADE')
zz <- subset(zz, CLADE_N_TAXA>1)
for(clu.id in zz[, unique(CLADE)])
{
tmp <- subset(zz, CLADE==clu.id)[, BAM]
z <- phsc.drop.tip(ph, tip=tmp, subtree=TRUE)
if(!is.null(z))
{
ph <- z
ph$tip.label[ which(grepl("[",ph$tip.label,fixed=TRUE)) ] <- subset(zz, CLADE==clu.id)[1,CLADE_LABEL]
}
}
}
ph
}
#' @keywords internal
#' @title Read processed phyloscanner output
#' @description This function creates R data.tables from processed phyloscanner output, for further data analysis in R.
#' @param prefix.infiles Full path name to processed phyloscanner output
#' @param save.file.base Output will be stored to files that start with 'save.file.base'.
#' @param read.likelytransmissions If TRUE, read and process likely transmissions
#' @param read.trees If TRUE, read and process trees
#' @param read.subtrees If TRUE, read and process subtree files
#' @param resume If TRUE, the function does not process existing rda files.
#' @param zip If TRUE, the function zips processed phyloscanner output, and then deletes the zipped, processed phyloscanner output files.
#' @return Nothing, rda objects written to file.
phsc.read.processed.phyloscanner.output.in.directory<- function(prefix.infiles, save.file.base, read.likelytransmissions=TRUE, read.trees=TRUE, read.subtrees=TRUE, resume=FALSE, zip=FALSE)
{
#
# read all likely transmissions in indir (these are files ending in _trmStats.csv)
#
if(read.likelytransmissions)
{
save.file <- paste(save.file.base,'trmStats.rda',sep='')
phsc.read.likelytransmissions(prefix.infiles, prefix.run='ptyr', regexpr.lklsu="classification_.*.csv$", save.file=save.file, resume=resume, zip=zip)
}
#
# read trees
#
if(read.trees)
{
save.file <- paste(save.file.base,'trees.rda',sep='')
tmp <- phsc.read.trees(prefix.infiles, prefix.run='ptyr', regexpr.trees='subgraphs_[crsf]_.*\\.rda$', prefix.wfrom='Window_', prefix.wto='Window_[0-9]+_to_', save.file=save.file, resume=resume, zip=zip)
tmp <- NULL
}
#
# read subtrees files and save (must be run after phsc.read.trees!)
#
if(read.subtrees)
{
save.file <- paste(save.file.base,'subtrees_r.rda',sep='')
stat.subtrees <- phsc.read.subtrees(prefix.infiles, prefix.run='ptyr', regexpr.subtrees='subgraphs_[crsf]_.*\\.rda$', prefix.wfrom='Window_', prefix.wto='Window_[0-9]+_to_', save.file=save.file, resume=resume, zip=zip)
stat.subtrees <- NULL
}
NULL
}
#' @keywords internal
#' @title Calculate prior parameter n0
#' @description This function calculates the prior parameter n0 such that the minimum number of windows needed is at least n.obs in order to select a pair of individuals with confidence at least confidence.cut.
#' @param n.states Number of relationship states.
#' @param n.obs Minimum number of windows to select a pair of individuals with confidence at least confidence.cut.
#' @param confidence.cut Confidence cut off.
#' @return Prior parameter n0
phsc.get.prior.parameter.n0<- function(n.states, keff=2, neff=3, confidence.cut=0.66)
{
#phsc.find.n0.aux<- function(n0, n.states, keff, neff, confidence.cut)
#{
# abs( (n0+n.states*(keff-1))/ (n.states*(neff+n0-2)) - confidence.cut )
#}
phsc.find.n0.aux<- function(n0, n.states, keff, neff, confidence.cut)
{
abs( (n0+n.states*keff)/ (n.states*(neff+n0)) - confidence.cut )
}
ans <- optimize(phsc.find.n0.aux, c(.001,1e2), n.states=n.states, keff=keff, neff=neff, confidence.cut=confidence.cut)
ans <- round(ans$minimum, d=4)
ans
}
#' @keywords internal
phsc.cmd.blacklist.downsample<- function(pr, inputTreeFileName, outputFileName, maxReadsPerPatient=200, blackListFileName=NA, tipRegex=NA, seed=42L, rename=NA, hosts=NA, excludeUnderrepresented=FALSE, verbose=NA)
{
cmd <- paste(pr, maxReadsPerPatient, inputTreeFileName, outputFileName)
if(!is.na(blackListFileName))
cmd <- paste(cmd, '--blacklist', blackListFileName)
if(!is.na(tipRegex))
cmd <- paste0(cmd, ' --tipRegex "', tipRegex,'"')
if(!is.na(hosts))
cmd <- paste(cmd, '--hosts', hosts)
if(!is.na(rename))
cmd <- paste(cmd, '--rename', rename)
if(!is.na(seed))
cmd <- paste(cmd, '--seed', seed)
if(excludeUnderrepresented)
cmd <- paste(cmd, '--excludeUnderrepresented')
if(!is.na(verbose) & verbose)
cmd <- paste0(cmd, ' --verbose')
cmd
}
#' @keywords internal
phsc.cmd.SummaryStatistics<- function(pr, scriptdir, file.patients, treeFiles, splitsFiles, outputBaseName, tipRegex=NA, blacklistFiles=NA, windowCoords=NA, recombinationFiles=NA, noReadCounts=NA, verbose=FALSE)
{
cmd <- paste(pr,' --scriptDir ',scriptdir,' "', file.patients, '" "', treeFiles, '" "',splitsFiles, '" "',outputBaseName, '"', sep='')
if(!is.na(tipRegex))
cmd <- paste(cmd, ' --tipRegex "',tipRegex,'"', sep='')
if(!is.na(blacklistFiles))
cmd <- paste(cmd, ' --blacklists "',blacklistFiles,'"', sep='')
if(!is.na(recombinationFiles))
cmd <- paste(cmd, ' --recombinationFiles "',recombinationFiles,'"', sep='')
if(!is.na(windowCoords))
cmd <- paste(cmd, ' --windowCoords "',windowCoords,'"', sep='')
if(!is.na(noReadCounts) & noReadCounts==TRUE)
cmd <- paste0(cmd, ' --noReadCounts')
if(!is.na(verbose) & verbose)
cmd <- paste0(cmd, ' --verbose')
cmd
}
#' @keywords internal
phsc.cmd.pairwise.relationships<- function(infile, outfile, trmw.min.reads=20, trmw.min.tips=1, trmw.close.brl=0.035, trmw.distant.brl=0.08, prior.keff=2, prior.neff=3, prior.keff.dir=2, prior.neff.dir=3, prior.calibrated.prob=0.5,
rel.likely.pair=TRUE, rel.likely.pair.by.distance.only=FALSE,rel.likely.pair.by.topology.only=FALSE,rel.likely.pair.by.cross.table=FALSE,rel.direction=TRUE,rel.chain=FALSE, verbose=NA,
pr=PR.pairwise.relationships)
{
cmd <- paste0(pr,' --infile "',infile,'" --outfile "',outfile,'"')
if(!is.na(trmw.min.reads))
cmd <- paste0(cmd, ' --trmw.min.reads ', trmw.min.reads)
if(!is.na(trmw.min.tips))
cmd <- paste0(cmd, ' --trmw.min.tips ', trmw.min.tips)
if(!is.na(trmw.close.brl))
cmd <- paste0(cmd, ' --trmw.close.brl ', trmw.close.brl)
if(!is.na(trmw.distant.brl))
cmd <- paste0(cmd, ' --trmw.distant.brl ', trmw.distant.brl)
if(!is.na(prior.keff))
cmd <- paste0(cmd, ' --prior.keff ', prior.keff)
if(!is.na(prior.neff))
cmd <- paste0(cmd, ' --prior.neff ', prior.neff)
if(!is.na(prior.keff.dir))
cmd <- paste0(cmd, ' --prior.keff.dir ', prior.keff.dir)
if(!is.na(prior.neff.dir))
cmd <- paste0(cmd, ' --prior.neff.dir ', prior.neff.dir)
if(!is.na(prior.calibrated.prob))
cmd <- paste0(cmd, ' --prior.calibrated.prob ', prior.calibrated.prob)
if(rel.likely.pair)
cmd <- paste(cmd, '--rel.likely.pair')
if(rel.likely.pair.by.distance.only)
cmd <- paste(cmd, '--rel.likely.pair.by.distance.only')
if(rel.likely.pair.by.topology.only)
cmd <- paste(cmd,'--rel.likely.pair.by.topology.only')
if(rel.likely.pair.by.cross.table)
cmd <- paste(cmd,'--rel.likely.pair.by.cross.table')
if(rel.direction)
cmd <- paste(cmd,'--rel.direction')
if(rel.chain)
cmd <- paste(cmd,'--rel.chain')
if(!is.na(verbose) & verbose)
cmd <- paste0(cmd, ' --verbose')
cmd
}
#' @keywords internal
phsc.cmd.read.processed.phyloscanner.output.in.directory<- function(prefix.infiles, save.file.base, read.likelytransmissions=TRUE, read.trees=TRUE, read.subtrees=TRUE, resume=FALSE, zip=FALSE, pr=PR.read.processed.phyloscanner.output.in.directory)
{
cmd <- paste(pr,' --prefix.infiles "',prefix.infiles,'" --save.file.base "',save.file.base,'"',sep='')
if(read.likelytransmissions)
cmd <- paste(cmd, '--read.likelytransmissions')
if(read.trees)
cmd <- paste(cmd, '--read.trees')
if(read.subtrees)
cmd <- paste(cmd, '--read.subtrees')
if(resume)
cmd <- paste(cmd,'--resume')
if(zip)
cmd <- paste(cmd,'--zip')
cmd
}
#' @keywords internal
phsc.cmd.SplitPatientsToSubtrees<- function(pr, scriptdir, infile, outputdir=NA, blacklistFiles=NA, outputFileIdentifier=NA, outgroupName=NA, splitsRule=NA, sankhoff.k=NA, tiesRule=NA, tipRegex=NA, pdfwidth=30, pdfrelheight=0.15, verbose=NA)
{
cmd <- paste(pr, ' --inputFile "', infile, '"', ' --scriptdir "', scriptdir,'"', sep='')
if(!is.na(blacklistFiles))
cmd <- paste(cmd, ' --blacklist "', blacklistFiles,'"', sep='')
if(!is.na(outputdir))
cmd <- paste(cmd, ' --outputdir "', outputdir,'"', sep='')
if(!is.na(outputFileIdentifier))
cmd <- paste(cmd, ' --outputfileid "', outputFileIdentifier, '"', sep='')
if(!is.na(outgroupName))
cmd <- paste(cmd, ' --outgroupName ', outgroupName, sep='')
if(!is.na(splitsRule))
cmd <- paste(cmd, ' --splitsRule ', splitsRule, sep='')
if(!is.na(sankhoff.k))
cmd <- paste(cmd, ' --kParam ', sankhoff.k, sep='')
if(!is.na(tiesRule))
cmd <- paste(cmd, ' --tiesRule ', tiesRule, sep='')
if(!is.na(tipRegex))
cmd <- paste(cmd, ' --tipRegex "', tipRegex, '"', sep='')
if(!is.na(pdfwidth))
cmd <- paste(cmd, ' --pdfwidth ', pdfwidth, sep='')
if(!is.na(pdfrelheight))
cmd <- paste(cmd, ' --pdfrelheight ', pdfrelheight, sep='')
if(!is.na(verbose) & verbose)
cmd <- paste0(cmd, ' --verbose')
cmd
}
#' @keywords internal
phsc.cmd.SplitPatientsToSubGraphs<- function(pr, infile, outputFileIdentifier, outputdir=NA, blacklistFiles=NA, idFile=NA, outgroupName=NA, splitsRule=NA, sankhoff.k=NA, proximityThreshold=NA, readCountsMatterOnZeroBranches=NA, tipRegex=NA, branchLengthNormalisation=NA, pruneBlacklist=FALSE, useff=FALSE, outputAsRDA=1, multifurcation.threshold=1e-5, pdfwidth=30, pdfrelheight=0.15, verbose=NA)
{
cmd <- paste(pr, ' "', infile, '" "',outputFileIdentifier,'"', sep='')
if(!is.na(blacklistFiles))
cmd <- paste(cmd, ' --blacklist "', blacklistFiles,'"', sep='')
if(!is.na(outputdir))
cmd <- paste(cmd, ' --outputdir "', outputdir,'"', sep='')
if(!is.na(idFile))
cmd <- paste(cmd, ' --idFile "', idFile,'"', sep='')
if(!is.na(outgroupName))
cmd <- paste(cmd, ' --outgroupName ', outgroupName, sep='')
if(!is.na(splitsRule))
cmd <- paste(cmd, ' --splitsRule ', splitsRule, sep='')
if(!is.na(sankhoff.k))
cmd <- paste(cmd, ' --kParam ', sankhoff.k, sep='')
if(!is.na(proximityThreshold))
cmd <- paste(cmd, ' --proximityThreshold ', proximityThreshold, sep='')
if(!is.na(readCountsMatterOnZeroBranches) & readCountsMatterOnZeroBranches)
cmd <- paste(cmd, ' --readCountsMatterOnZeroBranches', sep='')
if(!is.na(tipRegex))
cmd <- paste(cmd, ' --tipRegex "', tipRegex, '"', sep='')
if(!is.na(multifurcation.threshold))
cmd <- paste(cmd, ' --multifurcationThreshold ', multifurcation.threshold, sep='')
if(!is.na(branchLengthNormalisation) & class(branchLengthNormalisation)=='character')
cmd <- paste(cmd, ' --branchLengthNormalisation "', branchLengthNormalisation, '"',sep='')
if(!is.na(branchLengthNormalisation) & class(branchLengthNormalisation)=='numeric')
cmd <- paste(cmd, ' --branchLengthNormalisation ', branchLengthNormalisation, sep='')
if(!is.na(outputAsRDA) & outputAsRDA==TRUE)
cmd <- paste(cmd, ' --outputAsRDA', sep='')
if(!is.na(useff) & useff==TRUE)
cmd <- paste(cmd, ' --useff', sep='')
if(!is.na(pruneBlacklist) & pruneBlacklist==TRUE)
cmd <- paste(cmd, ' --pruneBlacklist', sep='')
if(!is.na(pdfwidth))
cmd <- paste(cmd, ' --pdfwidth ', pdfwidth, sep='')
if(!is.na(pdfrelheight))
cmd <- paste(cmd, ' --pdfrelheight ', pdfrelheight, sep='')
if(!is.na(verbose) & verbose)
cmd <- paste0(cmd, ' --verbose')
cmd
}
#' @keywords internal
phsc.cmd.LikelyTransmissions<- function(pr, file.tree, file.splits, file.out, tipRegex=NA, collapsedTree=NA, branchLengthNormalisation=NA, verbose=NA)
{
cmd<- paste0(pr, ' "', file.tree, '" "', file.splits, '" "', file.out,'"')
if(!is.na(tipRegex))
cmd <- paste0(cmd, ' --tipRegex "', tipRegex,'"')
if(!is.na(branchLengthNormalisation) & class(branchLengthNormalisation)=='character')
cmd <- paste0(cmd, ' --branchLengthNormalisation "', branchLengthNormalisation, '"')
if(!is.na(branchLengthNormalisation) & class(branchLengthNormalisation)=='numeric')
cmd <- paste0(cmd, ' --branchLengthNormalisation ', branchLengthNormalisation)
if(!is.na(collapsedTree) & collapsedTree==TRUE)
cmd <- paste0(cmd, ' --collapsedTree')
if(!is.na(verbose) & verbose)
cmd <- paste0(cmd, ' --verbose')
cmd
}
#' @keywords internal
phsc.cmd.LikelyTransmissionsSummary<- function(pr, scriptdir, file.patients, file.summary, file.lkl, file.out, file.detailed.out=NA, min.threshold=1, allow.MultiTrans=FALSE, verbose=NA)
{
cmd<- paste(pr, ' "',file.patients, '" "', file.lkl, '" "', file.out,'" --scriptdir ', scriptdir, ' --summaryFile "', file.summary, '" --minThreshold ', min.threshold, sep='')
if(!is.na(file.detailed.out))
cmd <- paste(cmd, ' --detailedOutput "', file.detailed.out, '"', sep='')
if(allow.MultiTrans)
cmd <- paste(cmd, ' --allowMultiTrans', sep='')
if(!is.na(verbose) & verbose)
cmd <- paste0(cmd, ' --verbose')
cmd
}
#' @keywords internal
#' @import data.table
#' @title Get transmission assignments per window
#' @description This function extracts all transmission assignments per window for two individuals.
#' @param id1 Identifier of the first individual. This is the entry for that individual in the phyloscanner .bam file.
#' @param id2 Identifier of the second individual.
#' @param infiles One or more file names of transmission stats rda files. These end typically with '_trmStatsPerWindow.rda'.
#' @return data.table with columns ID1 ID2 W_FROM (window start) W_TO (window end), and further columns for each infile that give the transmission assignments per window.
phsc.get.assignments.by.window.for.couple<- function(id1, id2, infiles)
{
tmp <- data.table(F= infiles)
tmp[, RUN:=tmp[, basename(dirname(F))]]
df <- tmp[, {
#load( '~/Dropbox (SPH Imperial College)/2015_PANGEA_DualPairsFromFastQIVA/Rakai_ptoutput_161027_couples_w270_d20_rerun/ptyr66_trmStatsPerWindow.rda' )
load(F)
df <- subset(window.table, (pat.1==id1 & pat.2==id2) | (pat.1==id2 & pat.2==id1))
setnames(df, c('pat.1','pat.2','type'), c('ID1','ID2','TYPE'))
set(df, NULL, 'SOURCE_FILE', NULL)
set(df, df[, which(is.na(TYPE))], 'TYPE', 'disconnected')
set(df, df[, which(ID1==id1 & TYPE=='anc')], 'TYPE', 'anc_12')
set(df, df[, which(ID1==id2 & TYPE=='anc')], 'TYPE', 'anc_21')
set(df, NULL, 'ID1', id1)
set(df, NULL, 'ID2', id2)
df
}, by='RUN']
setkey(df, ID1, ID2, RUN, W_FROM)
df <- dcast.data.table(df, ID1+ID2+W_FROM+W_TO~RUN, value.var='TYPE')
df
}
#' @keywords internal
#' @title Reconstruct most likely transmission chains, greedy algorithm
#' @description This function reconstructs a maximum probility transmission
#' network from the scores associated with directed and undirected edges.
#' The algorithm starts by keeping the edge with highest score.
#' It then removes the competitor in the opposite direction, and any conflicting edges that would result in indegrees larger than one.
#' By construction, all removed edges have lower probability.
#' The algorithm proceeds until all edges have been processed.
#' @importFrom igraph get.adjacency graph.data.frame
#' @importFrom sna kcycle.census
#' @param rtn data.table with network scores for all individuals that could form a network. Must contain columns 'ID1','ID2','IDCLU','GROUP','TYPE','POSTERIOR_SCORE','KEFF'.
#' @return new data.table with added columns LINK_12 LINK_21 (either 1 or 0), and MX_PROB_12 MX_PROB_21 (associated posterior probabilities)
phsc.get.most.likely.transmission.chains.greedy<- function(rtnn, verbose=0)
{
stopifnot(c('ID1','ID2','IDCLU','TYPE','POSTERIOR_SCORE','KEFF')%in%colnames(rtnn))
stopifnot( !length(setdiff(c('ambiguous','21','12'), rtnn[, unique(TYPE)])) )
rtnn[, ID1_IN_WEIGHT:=0]
set(rtnn, rtnn[, which(TYPE=='ambiguous')],'ID1_IN_WEIGHT', 0.5)
set(rtnn, rtnn[, which(TYPE=='21')],'ID1_IN_WEIGHT', 1)
rtnn[, ID2_IN_WEIGHT:=0]
set(rtnn, rtnn[, which(TYPE=='ambiguous')],'ID2_IN_WEIGHT', 0.5)
set(rtnn, rtnn[, which(TYPE=='12')],'ID2_IN_WEIGHT', 1)
rtm <- rtnn[, list( PROB_21= sum(POSTERIOR_SCORE*ID1_IN_WEIGHT),
KEFF_21= sum(KEFF*ID1_IN_WEIGHT),
PROB_12= sum(POSTERIOR_SCORE*ID2_IN_WEIGHT),
KEFF_12= sum(KEFF*ID2_IN_WEIGHT)), by=c('IDCLU','ID1','ID2')]
# split into separate networks
# this should save time when networks are large because the searching will be much faster
rtmm <- lapply(rtm[, unique(IDCLU)], function(x) subset(rtm, IDCLU==x))
for(i in seq_along(rtmm))
{
# reconstruct maximum path network
ans <- rtmm[[i]]
if(verbose) cat('\nprocessing network',i)
# iterate: pick most likely edge and remove competing ones
set(ans, NULL, c('LINK_12','LINK_21'), 0L)
set(ans, NULL, 'PROB_MAX', ans[, pmax(PROB_12,PROB_21)])
while( ans[, any(PROB_MAX>0)] )
{
z <- ans[, which.max(PROB_MAX)]
#check adding edge at z does not create a cycle
set(ans, z, ifelse(ans[z, PROB_12>PROB_21], 'LINK_12', 'LINK_21'), 1L)
tmp <- subset(ans, LINK_21==1, c(ID1,ID2))
setnames(tmp,c('ID1','ID2'),c('ID2','ID1'))
tmp <- rbind(subset(ans, LINK_12==1, c(ID1,ID2)), tmp)
tmp <- get.adjacency(graph.data.frame(tmp), sparse=FALSE)
ccls <- kcycle.census(tmp,maxlen=ncol(tmp))$cycle.count[,'Agg']
#if adding link at z creates cycle, remove link and set prob to zero
if(any(ccls>0))
{
set(ans, z, ifelse(ans[z, PROB_12>PROB_21], 'LINK_12', 'LINK_21'), 0L)
set(ans, z, ifelse(ans[z, PROB_12>PROB_21], 'PROB_12', 'PROB_21'), 0L)
}
if(all(ccls==0))
{
#delete direct competitor in pair
set(ans, z, ifelse(ans[z, PROB_12>PROB_21], 'PROB_21', 'PROB_12'), 0)
#delete any conflicting incoming edge probabilities into recipient
rec <- ifelse(ans[z, LINK_12==1], 'ID2', 'ID1')
conflicting.inprob.name <- ifelse(rec=='ID2', 'PROB_12', 'PROB_21')
conflicting.inprob.idx <- setdiff(which( ans[[rec]]==ans[[rec]][z] ),z)
set(ans, conflicting.inprob.idx, conflicting.inprob.name, 0)
#is the recipient also possibly in the other column?
rec2 <- ifelse(ans[z, LINK_12==1], 'ID1', 'ID2')
conflicting.inprob.name <- ifelse(rec2=='ID1', 'PROB_21', 'PROB_12')
conflicting.inprob.idx <- setdiff(which( ans[[rec2]]==ans[[rec]][z] ),z)
set(ans, conflicting.inprob.idx, conflicting.inprob.name, 0)
#reset probs (we have dealt with this edge)
set(ans, z, c('PROB_MAX'), 0)
}
# re-calculate max probs
tmp <- ans[, which(LINK_12==0 & LINK_21==0)]
set(ans, tmp, 'PROB_MAX', ans[tmp, pmax(PROB_12,PROB_21)])
if(verbose) cat('\nnumber of edges to process ', ans[, length(which(PROB_MAX>0))])
}
rtmm[[i]] <- ans
}
rtm <- do.call('rbind',rtmm)
setnames(rtm, c('PROB_21','PROB_12','KEFF_21','KEFF_12'), c('MX_PROB_21','MX_PROB_12','MX_KEFF_21','MX_KEFF_12'))
set(rtm, NULL, 'PROB_MAX', NULL)
rtnn <- merge(rtnn, rtm, by=c('ID1','ID2','IDCLU'))
set(rtnn, NULL, c('ID1_IN_WEIGHT','ID2_IN_WEIGHT'), NULL)
rtnn
}
#' @keywords internal
phsc.get.pairwise.relationships.numbers<- function()
{
tmp <- matrix(c('TYPE_PAIR_DI2_NOAMB','2',
'TYPE_PAIR_DI2','3',
'TYPE_PAIR_TO','2',
'TYPE_PAIR_TODI2x2','4',
'TYPE_PAIR_TODI2_NOAMB','2',
'TYPE_PAIR_TODI2','3',
'TYPE_DIR_TODI2','2',
'TYPE_CHAIN_TODI_NOAMB','2',
'TYPE_CHAIN_TODI','3',
'TYPE_ADJ_DIR_TODI2','2',
'TYPE_NETWORK_SCORES','4',
'TYPE_ADJ_NETWORK_SCORES','4',
'TYPE_BASIC','24'), ncol=2,byrow=TRUE)
colnames(tmp) <- c('GROUP','N_TYPE')
tmp <- as.data.table(tmp)
set(tmp, NULL, 'N_TYPE', tmp[, as.integer(N_TYPE)])
tmp
}
#' @keywords internal
#' @import data.table ggplot2
#' @title Read likely transmissions summary files into a data.table
#' @description This function reads likely transmissions summary files from the phyloscanner toolkit.
#' @param prefix.infiles Full path name identifying likely transmissions summary files
#' @param prefix.run Character string to identify separate phyloscanner runs. After the prefix, an integer number is expected. For example, if prefix.run='ptyr', then files are expected to start like 'ptyr123_'.
#' @param regexpr.lklsu Regular expression that identifies likely transmissions summary files in the directory.
#' @param save.file If not missing, function output (a data.table) will be stored to 'save.file', input files will be zipped, and input files will be deleted.
#' @param resume If TRUE and save.file is not missing, the function loads and returns trees stored in save.file.
#' @return Data table with columns PAIR_ID, ID1, ID2, TYPE, WIN_OF_TYPE, PTY_RUN, WIN_TOTAL, SCORE
phsc.read.likelytransmissions<- function(prefix.infiles, prefix.run='ptyr', regexpr.lklsu='trmStats.csv$', save.file=NA, resume=FALSE, zip=FALSE)
{
if(!is.na(save.file) & resume)
{
options(show.error.messages = FALSE)
tmp <- try(suppressWarnings(load(save.file)))
options(show.error.messages = TRUE)
if(!inherits(tmp, "try-error"))
return(df)
}
cat('\nReading files starting', prefix.infiles,'...\n')
dfr <- data.table(FILE_TRSU= list.files(dirname(prefix.infiles), pattern=paste(basename(prefix.infiles), '.*', regexpr.lklsu, sep=''), full.names=TRUE))
dfr[, PTY_RUN:= as.integer(gsub(prefix.run,'',regmatches(FILE_TRSU,regexpr(paste(prefix.run,'[0-9]+',sep=''), FILE_TRSU))))]
setkey(dfr, PTY_RUN)
cat('\nFound likely transmission summary files, n=', nrow(dfr),'...\n')
if(zip & !is.na(save.file))
{
tmp <- copy(dfr)
if(nrow(tmp))
{
tmp2 <- paste(gsub('\\.rda','',save.file),'.zip',sep='')
cat('\nZip to file', tmp2,'...\n')
invisible( tmp[, list(RTN= zip( tmp2, FILE_TRSU, flags = "-ur9XTjq")), by='FILE_TRSU'] )
invisible( file.remove( dfr[, FILE_TRSU] ) )
}
}
}
#' @keywords internal
#' @import data.table ape
#' @title Read subtree information
#' @description This function reads the subtree information that is generated with the phyloscanner toolkit.
#' @param prefix.infiles Full path name that identifies subtree files
#' @param prefix.run Character string to identify separate phyloscanner runs. After the prefix, an integer number is expected. For example, if prefix.run='ptyr', then files are expected to start like 'ptyr123_'.
#' @param regexpr.subtrees Regular expression that identifies subtree files in the directory. By default, this is 'Subtrees_r_.*\\.rda$' or 'Subtrees_c_.*\\.rda$' from the phyloscanner toolkit.
#' @param prefix.wfrom Character string to identify the start of a short read window. After the prefix, an integer number is expected. For example, if prefix.wfrom='Window_', then 'Window_800_to_1100' has start coordinate 800.
#' @param prefix.wto Character string to identify the end of a short read window. After the prefix, an integer number is expected. For example, if prefix.wto='Window_[0-9]+_to_', then 'Window_800_to_1100' has end coordinate 1100.
#' @param save.file If not missing, function output (a data.table) will be stored to 'save.file', input files will be zipped, and input files will be deleted.
#' @param resume If TRUE and save.file is not missing, the function loads and returns subtree info stored in save.file.
#' @param zip If TRUE and save.file is not missing, the function zips and removes subtree files that match the regular expression for subtrees.
#' @return data.table with columns PTY_RUN W_FROM W_TO orig.patients patient.splits tip.names.
phsc.read.subtrees<- function(prefix.infiles, prefix.run='ptyr', regexpr.subtrees='Subtrees_r_.*\\.rda$', prefix.wfrom='Window_', prefix.wto='Window_[0-9]+_to_', save.file=NA, resume=FALSE, zip=FALSE)
{
if(!is.na(save.file) & resume)
{
options(show.error.messages = FALSE)
tmp <- try(suppressWarnings(load(save.file)))
options(show.error.messages = TRUE)
if(!inherits(tmp, "try-error"))
{
cat('\nLoaded from file', save.file,'...\n')
return(rs.subtrees)
}
}
cat('\nReading tree summary files starting', prefix.infiles,'...\n')
dfr <- data.table(FILE_TR= list.files(dirname(prefix.infiles), pattern=regexpr.subtrees, full.names=TRUE))
dfr <- subset(dfr, grepl(basename(prefix.infiles), basename(FILE_TR), fixed=1))
dfr[, PTY_RUN:= as.integer(gsub(prefix.run,'',regmatches(FILE_TR,regexpr(paste(prefix.run,'[0-9]+',sep=''), FILE_TR))))]
dfr[, W_FROM:= as.integer(gsub(prefix.wfrom,'',regmatches(FILE_TR,regexpr(paste(prefix.wfrom,'[0-9]+',sep=''), FILE_TR))))]
dfr[, W_TO:= as.integer(gsub(prefix.wto,'',regmatches(FILE_TR,regexpr(paste(prefix.wto,'[0-9]+',sep=''), FILE_TR))))]
setkey(dfr, PTY_RUN, W_FROM, W_TO)
cat('\nFound tree summary files, n=', nrow(dfr),'...\n')
rs.subtrees <- lapply(seq_len(nrow(dfr)), function(i){
#z <- '/Users/Oliver/Dropbox (SPH Imperial College)/2015_PANGEA_DualPairsFromFastQIVA/pty_Rakai_160825/ptyr1_InWindow_1050_to_1299_subtrees_r.rda'
load(dfr[i,FILE_TR])
rs.subgraphs <- as.data.table(rs.subgraphs)
rs.subgraphs[, PTY_RUN:= dfr[i, PTY_RUN]]
rs.subgraphs[, W_FROM:= dfr[i, W_FROM]]
rs.subgraphs[, W_TO:= dfr[i, W_TO]]
rs.subgraphs
})
rs.subtrees <- do.call('rbind', rs.subtrees)
if(!is.na(save.file))
{
cat('\nSave to file', save.file,'...\n')
save(rs.subtrees, file=save.file)
}
if(zip & !is.na(save.file))
{
tmp <- copy(dfr)
if(nrow(tmp))
{
tmp2 <- paste(gsub('\\.rda','',save.file),'_rda.zip',sep='')
cat('\nZip to file', tmp2,'...\n')
invisible( tmp[, list(RTN= zip( tmp2, FILE_TR, flags = "-ur9XTjq")), by='FILE_TR'] )
invisible( file.remove( tmp[, FILE_TR] ) )
}
tmp <- data.table(FILE= list.files(dirname(prefix.infiles), pattern=gsub('\\.rda','\\.csv',regexpr.subtrees), full.names=TRUE))
tmp <- subset(tmp, grepl(basename(prefix.infiles), basename(FILE), fixed=1))
if(nrow(tmp))
{
tmp2 <- paste(gsub('\\.rda','',save.file),'_csv.zip',sep='')
cat('\nZip to file', tmp2,'...\n')
invisible( tmp[, list(RTN= zip( tmp2, FILE, flags = "-ur9XTjq")), by='FILE'] )
invisible( file.remove( tmp[, FILE] ) )
}
}
rs.subtrees
}
#' @keywords internal
#' @import data.table ape
#' @title Read short read trees
#' @description This function reads short read trees that are generated with the phyloscanner toolkit.
#' @param prefix.infiles Full path name identifying likely transmissions summary files
#' @param prefix.run Character string to identify separate phyloscanner runs. After the prefix, an integer number is expected. For example, if prefix.run='ptyr', then files are expected to start like 'ptyr123_'.
#' @param regexpr.trees Regular expression that identifies tree summary files in the directory. By default, this is 'Subtrees_r_.*\\.rda$' or 'Subtrees_c_.*\\.rda$' from the phyloscanner toolkit.
#' @param prefix.wfrom Character string to identify the start of a short read window. After the prefix, an integer number is expected. For example, if prefix.wfrom='Window_', then 'Window_800_to_1100' has start coordinate 800.
#' @param prefix.wto Character string to identify the end of a short read window. After the prefix, an integer number is expected. For example, if prefix.wto='Window_[0-9]+_to_', then 'Window_800_to_1100' has end coordinate 1100.
#' @param save.file If not missing, function output (a data.table) will be stored to 'save.file', input files will be zipped, and input files will be deleted.
#' @param resume If TRUE and save.file is not missing, the function loads and returns trees stored in save.file.
#' @param zip If TRUE and save.file is not missing, the function zips and removes trees and tree pdfs that match the regular expression for trees.
#' @return list of named trees in ape format.
phsc.read.trees<- function(prefix.infiles, prefix.run='ptyr', regexpr.trees='Subtrees_r_.*\\.rda$', prefix.wfrom='Window_', prefix.wto='Window_[0-9]+_to_', save.file=NA, resume=FALSE, zip=FALSE)
{
if(!is.na(save.file) & resume)
{
options(show.error.messages = FALSE)
tmp <- try(suppressWarnings(load(save.file)))
options(show.error.messages = TRUE)
if(!inherits(tmp, "try-error"))
{
cat('\nLoaded from file', save.file,'...\n')
ans <- list(phs=phs, dfr=dfr)
return(ans)
}
}
cat('\nReading tree summary files starting', prefix.infiles,'...\n')
dfr <- data.table(FILE_TR= list.files(dirname(prefix.infiles), pattern=regexpr.trees, full.names=TRUE))
dfr <- subset(dfr, grepl(basename(prefix.infiles), basename(FILE_TR),fixed=1))
dfr[, PTY_RUN:= as.integer(gsub(prefix.run,'',regmatches(FILE_TR,regexpr(paste(prefix.run,'[0-9]+',sep=''), FILE_TR))))]
dfr[, W_FROM:= as.integer(gsub(prefix.wfrom,'',regmatches(FILE_TR,regexpr(paste(prefix.wfrom,'[0-9]+',sep=''), FILE_TR))))]
dfr[, W_TO:= as.integer(gsub(prefix.wto,'',regmatches(FILE_TR,regexpr(paste(prefix.wto,'[0-9]+',sep=''), FILE_TR))))]
setkey(dfr, PTY_RUN, W_FROM, W_TO)
dfr[, IDX:= seq_len(nrow(dfr))]
cat('\nFound tree summary files, n=', nrow(dfr),'...\n')
phs <- lapply(seq_len(nrow(dfr)), function(i){
#z <- '/Users/Oliver/Dropbox (SPH Imperial College)/2015_PANGEA_DualPairsFromFastQIVA/pty_Rakai_160825/ptyr1_InWindow_1050_to_1299_subtrees_r.rda'
load(dfr[i,FILE_TR])
tree
})
names(phs) <- dfr[, paste(PTY_RUN,'_',W_FROM,'_',W_TO,sep='')]
if(!is.na(save.file))
{
cat('\nSave to file', save.file,'...\n')
save(phs, dfr, file=save.file)
}
if(zip & !is.na(save.file))
{
tmp <- data.table(FILE= list.files(dirname(prefix.infiles), pattern='.*Tree.*.pdf$|tree.*.pdf$', full.names=TRUE))
tmp <- subset(tmp, grepl(basename(prefix.infiles), basename(FILE),fixed=1))
if(nrow(tmp))
{
tmp2 <- paste(gsub('\\.rda','',save.file),'_pdf.zip',sep='')
cat('\nZip to file', tmp2,'...\n')
invisible( tmp[, list(RTN= zip( tmp2, FILE, flags = "-ur9XTjq")), by='FILE'] )
invisible( file.remove( tmp[, FILE] ) )
}
tmp <- data.table(FILE= list.files(dirname(prefix.infiles), pattern=paste('^',basename(prefix.infiles),'.*tree$',sep=''), full.names=TRUE))
if(nrow(tmp))
{
tmp2 <- paste(gsub('\\.rda','',save.file),'_newick.zip',sep='')
cat('\nZip to file', tmp2,'...\n')
invisible( tmp[, list(RTN= zip( tmp2, FILE, flags = "-ur9XTjq")), by='FILE'] )
invisible( file.remove( tmp[, FILE] ) )
}
tmp <- data.table(FILE= list.files(dirname(prefix.infiles), pattern=paste('^ProcessedTree_.*',basename(prefix.infiles),'.*tree$',sep=''), full.names=TRUE))
if(nrow(tmp))
{
tmp2 <- paste(gsub('\\.rda','',save.file),'_processed_nexus.zip',sep='')
cat('\nZip to file', tmp2,'...\n')
invisible( tmp[, list(RTN= zip( tmp2, FILE, flags = "-ur9XTjq")), by='FILE'] )
invisible( file.remove( tmp[, FILE] ) )
}
tmp <- data.table(FILE= list.files(dirname(prefix.infiles), pattern=paste('^',basename(prefix.infiles),'.*fasta$',sep=''), full.names=TRUE))
if(nrow(tmp))
{
tmp2 <- paste(gsub('\\.rda','',save.file),'_fasta.zip',sep='')
cat('\nZip to file', tmp2,'...\n')
invisible( tmp[, list(RTN= zip( tmp2, FILE, flags = "-ur9XTjq")), by='FILE'] )
invisible( file.remove( tmp[, FILE] ) )
}
tmp <- data.table(FILE= list.files(dirname(prefix.infiles), pattern=paste('^',basename(prefix.infiles),'.*DuplicateReadCounts.*csv$',sep=''), full.names=TRUE))
if(nrow(tmp))
{
tmp2 <- paste(gsub('\\.rda','',save.file),'_DuplicateReadCounts.zip',sep='')
cat('\nZip to file', tmp2,'...\n')
invisible( tmp[, list(RTN= zip( tmp2, FILE, flags = "-ur9XTjq")), by='FILE'] )
invisible( file.remove( tmp[, FILE] ) )
}
tmp <- data.table(FILE= list.files(dirname(prefix.infiles), pattern=paste(basename(prefix.infiles),'.*blacklist.*csv$',sep=''), full.names=TRUE))
if(nrow(tmp))
{
tmp2 <- paste(gsub('\\.rda','',save.file),'_blacklist.zip',sep='')
cat('\nZip to file', tmp2,'...\n')
invisible( tmp[, list(RTN= zip( tmp2, FILE, flags = "-ur9XTjq")), by='FILE'] )
invisible( file.remove( tmp[, FILE] ) )
}
tmp <- data.table(FILE= list.files(dirname(prefix.infiles), pattern=paste(basename(prefix.infiles),'.*duallist.*csv$',sep=''), full.names=TRUE))
if(nrow(tmp))
{
tmp2 <- paste(gsub('\\.rda','',save.file),'_duallist.zip',sep='')
cat('\nZip to file', tmp2,'...\n')
invisible( tmp[, list(RTN= zip( tmp2, FILE, flags = "-ur9XTjq")), by='FILE'] )
invisible( file.remove( tmp[, FILE] ) )
}
tmp <- data.table(FILE= list.files(dirname(prefix.infiles), pattern=paste(basename(prefix.infiles),'.*collapsed.*csv$',sep=''), full.names=TRUE))
if(nrow(tmp))
{
tmp2 <- paste(gsub('\\.rda','',save.file),'_collapsed.zip',sep='')
cat('\nZip to file', tmp2,'...\n')
invisible( tmp[, list(RTN= zip( tmp2, FILE, flags = "-ur9XTjq")), by='FILE'] )
invisible( file.remove( tmp[, FILE] ) )
}
}
list(phs=phs, dfr=dfr)
}
#' @keywords internal
simplify.summary <- function(summary, arrow.threshold, total.trees, plot = F){
done <- rep(FALSE, nrow(summary))
for(line in 1:nrow(summary)){
if(!done[line]){
forwards.rows <- which(summary$host.1 == summary$host.1[line] & summary$host.2 == summary$host.2[line])
backwards.rows <- which(summary$host.1 == summary$host.2[line] & summary$host.2 == summary$host.1[line])
done[forwards.rows] <- T
done[backwards.rows] <- T
summary$ancestry[intersect(forwards.rows, which(summary$ancestry=="trans"))] <- "trans12"
summary$ancestry[intersect(forwards.rows, which(summary$ancestry=="multi_trans"))] <- "multi_trans12"
summary$ancestry[intersect(backwards.rows, which(summary$ancestry=="trans"))] <- "trans21"
summary$ancestry[intersect(backwards.rows, which(summary$ancestry=="multi_trans"))] <- "multi_trans21"
summary[backwards.rows,c(1,2)] <- summary[backwards.rows,c(2,1)]
}
}
summary.wide <- reshape(summary, direction="w", idvar=c("host.1", "host.2"), timevar = "ancestry", v.names = "ancestry.tree.count", drop=c("fraction"))
summary.wide[is.na(summary.wide)] <- 0
summary.wide$total.equiv <- summary.wide$ancestry.tree.count.none + summary.wide$ancestry.tree.count.complex
if("ancestry.tree.count.trans12" %in% colnames(summary.wide)){
summary.wide$total.12 <- summary.wide$ancestry.tree.count.trans12
} else {
summary.wide$total.12 <- rep(0, nrow(summary.wide))
}
if(!is.null(summary.wide$ancestry.tree.count.multi_trans12)){
summary.wide$total.12 <- summary.wide$total.12 + summary.wide$ancestry.tree.count.multi_trans12
}
if("ancestry.tree.count.trans21" %in% colnames(summary.wide)){
summary.wide$total.21 <- summary.wide$ancestry.tree.count.trans21
} else {
summary.wide$total.21 <- rep(0, nrow(summary.wide))
}
if(!is.null(summary.wide$ancestry.tree.count.multi_trans21)){
summary.wide$total.21 <- summary.wide$total.21 + summary.wide$ancestry.tree.count.multi_trans21
}
summary.wide$total <- summary.wide$total.21 + summary.wide$total.12 + summary.wide$total.equiv
dir <- summary.wide$total.12 >= arrow.threshold*total.trees | summary.wide$total.21 >= arrow.threshold*total.trees
summary.wide$arrow[!dir] <- "none"
if(length(which(dir)>0)){
summary.wide$arrow[dir] <- sapply(which(dir), function(x) if(summary.wide$total.12[x]>summary.wide$total.21[x]) "forwards" else "backwards")
summary.wide$label[dir] <- paste0(round(pmax(summary.wide$total.12[dir],summary.wide$total.21[dir])/total.trees, 2),"/", round(summary.wide$total[dir]/total.trees, 2))
}
summary.wide$label[!dir] <- as.character(round(summary.wide$total[!dir]/total.trees, 2))
out.table <- summary.wide[,c("host.1","host.2","arrow","label")]
out.table[which(out.table$arrow=="backwards"),c(1,2)] <- out.table[which(out.table$arrow=="backwards"),c(2,1)]
out.table$arrow <- out.table$arrow!="none"
out <- list(simp.table = out.table)
if(plot){
# okay so we're doing this
arrangement <- ggnet2(out.table[,c(1,2)])$data[,c("label", "x", "y")]
out.table$x.start <- sapply(out.table$host.1, function(x) arrangement$x[match(x, arrangement$label)])
out.table$y.start <- sapply(out.table$host.1, function(x) arrangement$y[match(x, arrangement$label)])
out.table$x.end <- sapply(out.table$host.2, function(x) arrangement$x[match(x, arrangement$label)])
out.table$y.end <- sapply(out.table$host.2, function(x) arrangement$y[match(x, arrangement$label)])
out.table$x.midpoint <- (out.table$x.end + out.table$x.start)/2
out.table$y.midpoint <- (out.table$y.end + out.table$y.start)/2
out.diagram <- ggplot() +
geom_segment(data=out.table[which(out.table$arrow),], aes(x=x.start, xend = x.end, y=y.start, yend = y.end), arrow = arrow(length = unit(0.01, "npc"), type="closed"), col="steelblue3", size=1.5, lineend="round") +
geom_segment(data=out.table[which(!out.table$arrow),], aes(x=x.start, xend = x.end, y=y.start, yend = y.end), col="chartreuse3", size=1.5, lineend="round") +
geom_label(data=arrangement, aes(x=x, y=y, label=label), alpha=0.25, fill="darkgoldenrod3") +
geom_text(data=out.table, aes(x=x.midpoint, y=y.midpoint, label=label)) +
theme_void()
out$simp.diagram <- out.diagram
}
return(out)
}
#' @keywords internal
#' @import data.table ggplot2 scales
#' @title Compare window assignments by different phyloscanner runs
#' @description This function plots the window assignments for each pair of individuals and for several phyloscanner runs.
#' @param rplkl2 Posterior probability assignments for pairs of individuals. This is specified as a data.table with columns RUN, PTY_RUN, MALE_SANGER_ID, FEMALE_SANGER_ID, TYPE, KEFF, POSTERIOR_ALPHA, POSTERIOR_BETA
#' @param plot.file Name of output file
#' @param cols Colours for phyloscanner runs. This is specified as a named vector which must not be missing.
#' @param cols Colours for relationship types. This is specified as a named vector, which can be null.
#' @param group Relationship group name. This is used to define default colours if cols==NULL.
#' @param height Total height of pdf in inches. This is overwritten if group is specified.
#' @return Plots to file.
phsc.plot.windowassignments.by.runs <- function(rplkl2, plot.file, plot.prob.select, cols.run, cols=NULL, group=NA, height=40)
{
# height<- 40
g_legend<-function(a.gplot)
{
tmp <- ggplot_gtable(ggplot_build(a.gplot))
leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
legend <- tmp$grobs[[leg]]
legend
}
# define colours to be used
if(is.null(cols))
{
stopifnot(!is.na(group))
cols.type <- phsc.plot.default.colours.for.relationtypes()
stopifnot(group%in%names(cols.type))
cols <- cols.type[[group]]
}
# re-order pairs
setkey(rplkl2, LABEL_SH)
tmp <- subset(rplkl2, TYPE%in%plot.prob.select)
tmp2 <- tmp[, list(D=mean(POSTERIOR_ALPHA/(POSTERIOR_ALPHA+POSTERIOR_BETA))), by=c('PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','LABEL_SH')]
tmp2 <- tmp2[order(-D),]
tmp2[, PLOT_ID:= 1:nrow(tmp2)]
set(tmp2, NULL, 'PLOT_ID', tmp2[, factor(PLOT_ID, levels=PLOT_ID, labels=LABEL_SH)])
setnames(tmp2, c('PLOT_ID','LABEL_SH'),c('LABEL_SH','OLD_LABEL_SH'))
set(tmp2, NULL, c('OLD_LABEL_SH','D'), NULL)
set(rplkl2, NULL, c('LABEL_SH'), NULL)
rplkl2 <- merge(rplkl2, tmp2, by=c('PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID'))
setkey(rplkl2, LABEL)
p1 <- ggplot(rplkl2, aes(x=RUN, y=KEFF, fill=TYPE, colour=RUN)) +
geom_bar(stat='identity',position='stack') +
scale_y_continuous(expand=c(0,0)) +
scale_fill_manual(values=cols) +
scale_colour_manual(values=cols.run) +
theme_bw() +
theme( legend.position='bottom', axis.text.y=element_blank(),
axis.ticks.y=element_blank(),
panel.spacing=unit(0.4, "lines"), strip.text.y = element_text(angle=180),
strip.background=element_rect(fill="transparent", colour="transparent"),
panel.border=element_rect(color="transparent")) +
facet_grid(LABEL_SH~., switch='y') +
coord_flip() +
guides(fill=guide_legend(ncol=3, byrow = TRUE), colour=guide_legend(ncol=1, byrow = TRUE)) +
labs( x='',
y='non-overlapping windows\n(number)\n',
fill='phylogenetic\nrelationship')
tmp <- subset(rplkl2, TYPE%in%plot.prob.select)
p2 <- ggplot(tmp, aes(x=RUN, fill=TYPE, colour=RUN,
middle= qbeta(0.5, POSTERIOR_ALPHA, POSTERIOR_BETA),
lower=qbeta(0.25, POSTERIOR_ALPHA, POSTERIOR_BETA),
upper=qbeta(0.75, POSTERIOR_ALPHA, POSTERIOR_BETA),
ymin=qbeta(0.025, POSTERIOR_ALPHA, POSTERIOR_BETA),
ymax=qbeta(0.975, POSTERIOR_ALPHA, POSTERIOR_BETA))) +
geom_boxplot(stat='identity') +
scale_y_continuous(expand=c(0,0), limits=c(0,1), breaks=seq(0,1,0.2), labels=percent) +
scale_fill_manual(values=cols) +
scale_colour_manual(values=cols.run) +
theme_bw() +
theme( legend.position='bottom', axis.text.y=element_blank(),
axis.ticks.y=element_blank(),
panel.spacing=unit(0.4, "lines"), strip.text.y=element_blank(),
strip.background=element_blank(),
panel.border=element_rect(color="transparent")) +
facet_grid(LABEL_SH~., switch='y') +
coord_flip() +
guides(fill=guide_legend(ncol=3, byrow = TRUE), colour=guide_legend(ncol=1, byrow = TRUE)) +
labs( x='',
y='posterior probability\n\n',
fill='phylogenetic\nrelationship')
p3 <- g_legend(p1)
p3$vp <- viewport(layout.pos.row=2, layout.pos.col=1:2)
pdf(file=plot.file, w=12, h=height)
grid.newpage()
pushViewport(viewport(layout = grid.layout(2, 2, heights=unit(c(40,1), "null"), widths=unit(c(7, 3), "null"))))
print(p1+theme(legend.position = 'none'), vp = viewport(layout.pos.row = 1, layout.pos.col = 1))
print(p2+theme(legend.position = 'none'), vp = viewport(layout.pos.row = 1, layout.pos.col = 2))
grid.draw(p3)
#pushViewport(viewport(layout = grid.layout(1, 2, heights=unit(c(10,1), "null"), widths=unit(c(7, 3), "null"))))
#print(p1+theme(legend.position = 'none'), vp = viewport(layout.pos.row = 1, layout.pos.col = 1))
#print(p2+theme(legend.position = 'none'), vp = viewport(layout.pos.row = 1, layout.pos.col = 2))
dev.off()
}
#' @keywords internal
#' @import data.table ggplot2 scales
#' @title Plot window summaries for pairs of individuals
#' @description This function plots the window summaries for each pair of individuals.
#' @param plot.select Select pairs for plotting. This is specified as a data.table with columns PTY_RUN, MALE_SANGER_ID, FEMALE_SANGER_ID, LABEL. Each pair has assigned a label that is used as title.
#' @param rpw2 Window assignments for pairs of individuals. This is specified as a data.table with columns PTY_RUN, MALE_SANGER_ID, FEMALE_SANGER_ID, W_FROM, ID_R_MAX, ID_R_MIN, TYPE.
#' @param rplkl2 Posterior probability assignments for pairs of individuals. This is specified as a data.table with columns PTY_RUN, MALE_SANGER_ID, FEMALE_SANGER_ID, TYPE, KEFF, POSTERIOR_ALPHA, POSTERIOR_BETA
#' @param plot.file Name of output file
#' @param cols Colours for relationship types. This is specified as a named vector, which can be null.
#' @param group Relationship group name. This is used to define default colours if cols==NULL.
#' @param widhts Widths of subfigures in grid.layout. This is overwritten if group is specified.
#' @param heights Heights of subfigures in grid.layout. This is overwritten if group is specified.
#' @param height Total height of pdf in inches. This is overwritten if group is specified.
#' @return Plots to file.
phsc.plot.windowsummaries.for.pairs<- function(plot.select, rpw2, rplkl2, plot.file, cols=NULL, group=NA, widths=unit(c(4, 6), "null"), heights=unit(c(2, 3.5, 4, 5), "null"), height=9)
{
# widths <- unit(c(4, 6), "null"); heights <- unit(c(2, 3.5, 4, 5), "null"); height <- 9
# define colours to be used
if(is.null(cols))
{
stopifnot(!is.na(group))
cols.type <- phsc.plot.default.colours.for.relationtypes()
stopifnot(group%in%names(cols.type))
cols <- cols.type[[group]]
print(cols)
}
# re-define dimensions if group specified
if(!is.na(group))
{
if(group%in%c('TYPE_DIR_TODI7x3','TYPE_BASIC'))
{
widths <- unit(c(4, 6), "null")
heights <- unit(c(2, 3.5, 4, 15), "null")
height <- 17
}
if(group%in%c('TYPE_PAIR_TODI2x2','TYPE_PAIR_TODI','TYPE_PAIR_TODI2'))
{
heights <- unit(c(2, 3.5, 4, 3.75), "null")
height <- 8
}
if(group%in%c('TYPE_PAIR_TODI3','TYPE_PAIR_DI','TYPE_CHAIN_TODI','TYPE_PAIR_TODI2','TYPE_DIRSCORE_TODI3','TYPE_DIR_TODI3'))
{
heights <- unit(c(2, 3.5, 4, 3.5), "null")
height <- 7
}
}
pdf(file=plot.file, w=10, h=height)
setkey(plot.select, LABEL)
for(i in seq_len(nrow(plot.select)))
{
#i<- 85
#pty_run <- 38; id1 <- '16016_1_4'; id2 <- '15105_1_35'
pty_run <- plot.select[i, PTY_RUN]; id1 <- plot.select[i, FEMALE_SANGER_ID]; id2 <- plot.select[i, MALE_SANGER_ID]
tmp <- subset(rpw2, PTY_RUN==pty_run & FEMALE_SANGER_ID==id1 & MALE_SANGER_ID==id2)
set(tmp, NULL, 'TYPE', tmp[, gsub(' ',' ',TYPE)])
set(tmp, tmp[, which(PATRISTIC_DISTANCE<1.1e-3)], 'PATRISTIC_DISTANCE', 1.1e-3)
p1 <- ggplot(tmp, aes(x=W_FROM)) +
geom_bar(aes(y=ID_R_MAX, colour=TYPE), stat='identity', fill='transparent') +
geom_bar(aes(y=ID_R_MIN, fill=TYPE), stat='identity', colour='transparent') +
labs(x='', y='number of reads', fill='phylogenetic\nrelationship\n', colour='phylogenetic\nrelationship\n') +
scale_fill_manual(values=cols) +
scale_colour_manual(values=cols) +
scale_x_continuous(breaks=seq(0,1e4,500), minor_breaks=seq(0,1e4,100), limits=c(rpw2[, min(W_FROM)-100], rpw2[, max(W_FROM)+100])) +
scale_y_log10(breaks=c(10,100,1000,1e4,1e5)) +
theme_bw() + theme(legend.position='left') +
guides(fill=FALSE, colour=FALSE)
p2 <- ggplot(tmp, aes(x=W_FROM, y=PATRISTIC_DISTANCE)) +
geom_point(size=1) +
labs(x='window start\n\n', y='patristic distance') +
scale_x_continuous(breaks=seq(0,1e4,500), minor_breaks=seq(0,1e4,100), limits=c(rpw2[, min(W_FROM)], rpw2[, max(W_FROM)])) +
scale_y_log10(labels=percent, limits=c(0.001, 0.7), expand=c(0,0), breaks=c(0.001, 0.0025, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25)) +
theme_bw() + theme(legend.position='left')
tmp <- subset(rplkl2, PTY_RUN==pty_run & FEMALE_SANGER_ID==id1 & MALE_SANGER_ID==id2)
set(tmp, NULL, 'TYPE', tmp[, gsub(' ',' ',TYPE)])
p3 <- ggplot(tmp, aes(x=TYPE, y=KEFF, fill=TYPE)) + geom_bar(stat='identity') +
scale_fill_manual(values=cols) +
theme_bw() + theme(legend.position='bottom') +
coord_flip() + guides(fill=FALSE) +
labs(x='', y='\nnon-overlapping windows\n(number)', fill='phylogenetic\nrelationship\n')
p4 <- ggplot(tmp, aes(x=TYPE, middle=qbeta(0.5, POSTERIOR_ALPHA, POSTERIOR_BETA),
ymin=qbeta(0.025, POSTERIOR_ALPHA, POSTERIOR_BETA),
ymax=qbeta(0.975, POSTERIOR_ALPHA, POSTERIOR_BETA),
lower=qbeta(0.25, POSTERIOR_ALPHA, POSTERIOR_BETA),
upper=qbeta(0.75, POSTERIOR_ALPHA, POSTERIOR_BETA),
fill=TYPE)) +
geom_boxplot(stat='identity') +
scale_y_continuous(labels=percent, breaks=seq(0,1,0.2), limits=c(0,1), expand=c(0,0)) +
scale_fill_manual(values=cols) +
theme_bw() + theme(legend.position='right', legend.margin=margin(0, .1, 0, 1, "cm")) +
coord_flip() + guides(fill=guide_legend(ncol=1)) +
labs(x='', y='\nposterior probability\n', fill='phylogenetic\nrelationship\n')
grid.newpage()
pushViewport(viewport(layout = grid.layout(4, 2, heights=heights, widths=widths)))
grid.text(plot.select[i,LABEL], gp=gpar(fontsize=10), vp=viewport(layout.pos.row = 1, layout.pos.col = 1:2))
print(p1, vp = viewport(layout.pos.row = 2, layout.pos.col = 1:2))
print(p2, vp = viewport(layout.pos.row = 3, layout.pos.col = 1:2))
print(p3, vp = viewport(layout.pos.row = 4, layout.pos.col = 1))
print(p4, vp = viewport(layout.pos.row = 4, layout.pos.col = 2))
}
dev.off()
}
#' @keywords internal
#' @import data.table
#' @title Default colours for relationship types
#' @description This function returns default colours for each relationship type in one of the defined relationship groups.
#' @return Named list with names corresponding to RELATIONSHIP GROUPS. For each relationship group, the list contains a named vector. Entries in this vector are colours, and names are relationship types.
phsc.plot.default.colours.for.relationtypes<- function()
{
cols.type <- list()
tmp2 <- do.call('rbind',list(
data.table( TYPE= c("chain fm\nno intermediate\nclose","chain fm\nno intermediate","chain fm\nno intermediate\ndistant",
"chain 21\nno intermediate\nclose","chain 21\nno intermediate","chain 21\nno intermediate\ndistant"),
COLS= brewer.pal(11, 'PiYG')[c(1,2,4)]),
data.table( TYPE= c("chain mf\nno intermediate\nclose","chain mf\nno intermediate","chain mf\nno intermediate\ndistant",
"chain 12\nno intermediate\nclose","chain 12\nno intermediate","chain 12\nno intermediate\ndistant"),
COLS= brewer.pal(11, 'PuOr')[c(1,2,4)]),
data.table( TYPE= c("intermingled\nno intermediate\nclose","intermingled\nno intermediate","intermingled\nno intermediate\ndistant"),
COLS= brewer.pal(11, 'PRGn')[c(1,2,4)]),
data.table( TYPE= c("chain fm\nwith intermediate\nclose","chain fm\nwith intermediate","chain fm\nwith intermediate\ndistant",
"chain 21\nwith intermediate\nclose","chain 21\nwith intermediate","chain 21\nwith intermediate\ndistant"),
COLS= rev(brewer.pal(11, 'BrBG'))[c(3,4,5)]),
data.table( TYPE= c("chain mf\nwith intermediate\nclose","chain mf\nwith intermediate","chain mf\nwith intermediate\ndistant",
"chain 12\nwith intermediate\nclose","chain 12\nwith intermediate","chain 12\nwith intermediate\ndistant"),
COLS= rev(brewer.pal(11, 'PRGn'))[c(3,4,5)]),
data.table( TYPE= c("intermingled\nwith intermediate\nclose","intermingled\nwith intermediate","intermingled\nwith intermediate\ndistant"),
COLS= rev(brewer.pal(11, 'RdBu'))[c(3,4,5)]),
data.table( TYPE= c("other close","other","other distant"),
COLS= rev(brewer.pal(11, 'RdGy'))[c(3,4,5)]),
data.table( TYPE= c("sibling no intermediate\nclose","sibling no intermediate","sibling no intermediate\ndistant"),
COLS= brewer.pal(11, 'BrBG')[c(5,4,3)]),
data.table( TYPE= c("sibling with intermediate\nclose","sibling with intermediate","sibling with intermediate\ndistant"),
COLS= brewer.pal(11, 'PuOr')[c(4,2,1)])))
cols.type[['TYPE_DIR_TODI7x3']] <- { tmp<- tmp2[, COLS]; names(tmp) <- tmp2[, TYPE]; tmp }
cols.type[['TYPE_BASIC']] <- cols.type[['TYPE_DIR_TODI7x3']]
tmp2 <- do.call('rbind',list(
data.table( TYPE= c("pair close","pair","pair distant"),
COLS= brewer.pal(11, 'PuOr')[c(1,2,4)]),
data.table( TYPE= c("withintermediate close","withintermediate","withintermediate distant"),
COLS= rev(brewer.pal(11, 'RdBu'))[c(3,4,5)]),
data.table( TYPE= c("other close","other","other distant"),
COLS= rev(brewer.pal(11, 'RdGy'))[c(3,4,5)])))
tmp2 <- { tmp<- tmp2[, COLS]; names(tmp) <- tmp2[, TYPE]; tmp }
cols.type[['TYPE_PAIR_TODI3x3']] <- tmp2
tmp2 <- do.call('rbind',list(
data.table( TYPE= c('close ancestral/\nintermingled\nsibling', 'not close ancestral/\nintermingled\nsibling'),
COLS= brewer.pal(11, 'PRGn')[c(2,4)]),
data.table( TYPE= c('close other','not close other'),
COLS= rev(brewer.pal(11, 'RdGy'))[c(3,5)])))
tmp2 <- { tmp<- tmp2[, COLS]; names(tmp) <- tmp2[, TYPE]; tmp }
cols.type[['TYPE_PAIR_TODI2x2']] <- tmp2
tmp2 <- do.call('rbind',list(
data.table( TYPE= c('likely pair', 'chain'),
COLS= brewer.pal(11, 'PRGn')[c(2,4)]),
data.table( TYPE= c('intermediate distance','distant'),
COLS= rev(brewer.pal(11, 'RdGy'))[c(3,5)])))
tmp2 <- { tmp<- tmp2[, COLS]; names(tmp) <- tmp2[, TYPE]; tmp }
cols.type[['TYPE_PAIR_TODI']] <- tmp2
tmp2 <- data.table( TYPE= c("chain", "intermediate distance", "distant"),
COLS= c(brewer.pal(11, 'RdBu')[c(2,4)], rev(brewer.pal(11, 'RdGy'))[4]))
tmp2 <- { tmp<- tmp2[, COLS]; names(tmp) <- tmp2[, TYPE]; tmp }
cols.type[['TYPE_CHAIN_TODI']] <- tmp2
tmp2 <- data.table( TYPE= c("no intermediate\n and close", "no intermediate\n but not close", "with intermediate\nor distant"),
COLS= c(brewer.pal(11, 'RdBu')[c(2,4)], rev(brewer.pal(11, 'RdGy'))[4]))
tmp2 <- { tmp<- tmp2[, COLS]; names(tmp) <- tmp2[, TYPE]; tmp }
cols.type[['TYPE_PAIR_TODI3']] <- tmp2
tmp2 <- data.table( TYPE= c("close", "intermediate\ndistance", "distant"),
COLS= c(rev(brewer.pal(11, 'RdBu'))[c(2,4)], rev(brewer.pal(11, 'RdGy'))[4]))
tmp2 <- { tmp<- tmp2[, COLS]; names(tmp) <- tmp2[, TYPE]; tmp }
cols.type[['TYPE_PAIR_DI']] <- tmp2
tmp2 <- data.table( TYPE= c("ancestral/\nintermingled", "other"),
COLS= c(rev(brewer.pal(9, 'Greens'))[2], rev(brewer.pal(11, 'RdGy'))[4]))
tmp2 <- { tmp<- tmp2[, COLS]; names(tmp) <- tmp2[, TYPE]; tmp }
cols.type[['TYPE_PAIR_TO']] <- tmp2
tmp2 <- data.table( TYPE= c("linked", "unlinked"),
COLS= c(rev(brewer.pal(9, 'Greens'))[2], rev(brewer.pal(11, 'RdGy'))[4]))
tmp2 <- { tmp<- tmp2[, COLS]; names(tmp) <- tmp2[, TYPE]; tmp }
cols.type[['TYPE_PAIR_TODI2']] <- tmp2
tmp2 <- data.table( TYPE= c("mf","12","fm","21",'ambiguous'),
COLS= c('steelblue2','steelblue2', 'hotpink2','hotpink2','grey50'))
tmp2 <- { tmp<- tmp2[, COLS]; names(tmp) <- tmp2[, TYPE]; tmp }
cols.type[['TYPE_DIRSCORE_TODI3']] <- tmp2
cols.type[['TYPE_DIR_TODI3']] <- cols.type[['TYPE_DIRSCORE_TODI3']]
cols.type
}
#' @keywords internal
#' @title Generate bash command to combine single window resumes of a phyloscanner run
#' @param prefix.infiles File name that points phyloscanner output.
#' @param pty.args List of phyloscanner input variables. See examples.
#' @return Character string of phyloscanner commands.
#' @description This function generates bash commands to resume a single phyloscanner run, from the point where all read alignments and read phylogenies were created. The bash script can be called via 'system' in R, or written to file to run on a UNIX system.
phsc.cmd.phyloscanner.one.resume.combinewindows<- function(prefix.infiles, pty.args)
{
stopifnot(pty.args$combine.processed.windows==1)
# create local tmp dir
cmd <- paste("CWD=$(pwd)\n",sep='\n')
cmd <- paste(cmd,"echo $CWD\n",sep='')
tmpdir <- paste('pty','_',format(Sys.time(),"%y-%m-%d-%H-%M-%S"),sep='')
tmpdir <- paste("$CWD/",tmpdir,sep='')
cmd <- paste(cmd,'mkdir -p "',tmpdir,'"\n',sep='')
# copy required files to local tmp dir
file.patient<- list.files(dirname(prefix.infiles), pattern=paste(basename(prefix.infiles),'.*patients.txt',sep=''), full.names=TRUE)
stopifnot(length(file.patient)==1)
cmd <- paste(cmd,'cp "',file.patient,'" "',tmpdir,'"\n',sep='')
tmp <- list.files(dirname(prefix.infiles), pattern=paste('^',basename(prefix.infiles),'output_Window.*',sep=''), full.names=TRUE)
for(i in seq_along(tmp))
cmd <- paste(cmd,'unzip -n "',tmp[i],'" -d "',tmpdir,'"\n',sep='')
tmp <- list.files(dirname(prefix.infiles), pattern=paste('^',basename(prefix.infiles),'otherstuff_Window.*',sep=''), full.names=TRUE)
for(i in seq_along(tmp))
cmd <- paste(cmd,'unzip -n "',tmp[i],'" -d "',tmpdir,'"\n',sep='')
# cd to tmp dir
cmd <- paste(cmd, 'cd "',tmpdir,'"\n', sep='')
# add all toolkit commands according to pty.args
cmd <- paste(cmd, phsc.cmd.process.phyloscanner.output.in.directory(tmpdir, file.patient, pty.args), collapse='\n',sep='')
# move all files starting with current run ID
run.id <- gsub('_patients.txt','',basename(file.patient))
cmd <- paste(cmd, '\nmv ',run.id,'* "',pty.args$out.dir,'"\n',sep='')
# zip up everything else
tmp <- paste(run.id,'_otherstuff.zip',sep='')
cmd <- paste(cmd, 'for file in *; do\n\tzip -ur9XTj ',tmp,' "$file"\ndone\n',sep='')
cmd <- paste(cmd, 'mv ',tmp,' "',pty.args$out.dir,'"\n',sep='')
# clean up
cmd <- paste(cmd,'cd $CWD\nrm -r "',tmpdir,'"\n',sep='')
cmd
}
#' @keywords internal
#' @title Generate bash command to resume a single window of a phyloscanner run
#' @param prefix.infiles File name that points phyloscanner output.
#' @param pty.args List of phyloscanner input variables. See examples.
#' @return Character string of phyloscanner commands.
#' @description This function generates bash commands to resume a single phyloscanner run, from the point where all read alignments and read phylogenies were created. The bash script can be called via 'system' in R, or written to file to run on a UNIX system.
phsc.cmd.phyloscanner.one.resume.onewindow<- function(prefix.infiles, pty.args)
{
stopifnot(!is.na(pty.args$process.window))
# create local tmp dir
cmd <- paste("CWD=$(pwd)\n",sep='\n')
cmd <- paste(cmd,"echo $CWD\n",sep='')
tmpdir <- paste('pty','_',format(Sys.time(),"%y-%m-%d-%H-%M-%S"),sep='')
tmpdir <- paste("$CWD/",tmpdir,sep='')
cmd <- paste(cmd,'mkdir -p "',tmpdir,'"\n',sep='')
# copy required files to local tmp dir
file.patient<- list.files(dirname(prefix.infiles), pattern=paste(basename(prefix.infiles),'.*patients.txt',sep=''), full.names=TRUE)
stopifnot(length(file.patient)==1)
cmd <- paste(cmd,'cp "',file.patient,'" "',tmpdir,'"\n',sep='')
tmp <- list.files(dirname(prefix.infiles), pattern=paste('^',basename(prefix.infiles),'.*fasta.zip',sep=''), full.names=TRUE)
stopifnot(length(tmp)==1)
cmd <- paste(cmd,'unzip -j "',tmp,'" "*',pty.args$process.window,'*" -d "',tmpdir,'"\n',sep='')
tmp <- list.files(dirname(prefix.infiles), pattern=paste('^',basename(prefix.infiles),'.*newick.zip',sep=''), full.names=TRUE)
stopifnot(length(tmp)==1)
cmd <- paste(cmd,'unzip -j "',tmp,'" "*',pty.args$process.window,'*" -d "',tmpdir,'"\n',sep='')
# cd to tmp dir
cmd <- paste(cmd, 'cd "',tmpdir,'"\n', sep='')
# add all toolkit commands according to pty.args
cmd <- paste(cmd, phsc.cmd.process.phyloscanner.output.in.directory(tmpdir, file.patient, pty.args), collapse='\n',sep='')
# zip up window output
run.id <- gsub('_patients.txt','',basename(file.patient))
tmp <- paste(run.id,'_output_Window',pty.args$process.window,'.zip',sep='')
cmd <- paste(cmd, '\nfor file in ',run.id,'*; do\n\tzip -ur9XTj ',tmp,' "$file"\ndone\n',sep='')
cmd <- paste(cmd, 'mv ',tmp,' "',pty.args$out.dir,'"\n',sep='')
# zip up everything else
tmp <- paste(run.id,'_otherstuff_Window',pty.args$process.window,'.zip',sep='')
cmd <- paste(cmd, 'for file in *; do\n\tzip -ur9XTj ',tmp,' "$file"\ndone\n',sep='')
cmd <- paste(cmd, 'mv ',tmp,' "',pty.args$out.dir,'"\n',sep='')
# clean up
cmd <- paste(cmd,'cd $CWD\nrm -r "',tmpdir,'"\n',sep='')
cmd
}
#' @keywords internal
#' @title Count observed relationship states
#' @description This function counts for each pair of individuals their relationship states across all windows (KEFF), and the total number of windows (NEFF). Since windows can be overlapping, the count is a real value.
#' @param df data.table with basic relationship types for paired individuals across windows. Must contain columns 'ID1','ID2','W_FROM','W_TO','TYPE_BASIC'.
#' @param get.groups names of relationship groups
#' @return new data.table with columns ID1 ID2 GROUP TYPE K KEFF N NEFF.
phsc.get.pairwise.relationships.keff.and.neff<- function(df, get.groups, w.slide=NA)
{
stopifnot(c('ID1','ID2','W_FROM','W_TO','TYPE_BASIC')%in%colnames(df))
#
# identify chunks of contiguous windows
#
setkey(df, ID1, ID2, W_FROM)
if(is.na(w.slide))
{
w.slide <- df[, {
ans <- NA_integer_
tmp <- diff(W_FROM)
if(length(tmp))
ans <- min(tmp)
list(W_SLIDE=ans)
}, by=c('ID1','ID2')]
w.slide <- subset(w.slide, !is.na(W_SLIDE))
w.slide <- ifelse(nrow(w.slide), w.slide[, min(W_SLIDE)], 1L)
}
# define chunks
setkey(df, ID1, ID2, W_FROM)
tmp <- df[, {
tmp<- as.integer( c(TRUE,(W_FROM[-length(W_FROM)]+w.slide)!=W_FROM[-1]) )
list(W_FROM=W_FROM, W_TO=W_TO, CHUNK=cumsum(tmp))
}, by=c('ID1','ID2')]
df <- merge(df,tmp,by=c('ID1','ID2','W_FROM','W_TO'))
# define chunk length in terms of non-overlapping windows & number of windows in chunk
tmp <- df[, {
list(W_FROM=W_FROM, W_TO=W_TO, CHUNK_L=(max(W_TO+1L)-min(W_FROM))/(W_TO[1]+1L-W_FROM[1]), CHUNK_N=length(W_FROM))
}, by=c('ID1','ID2','CHUNK')]
df <- merge(df,tmp,by=c('ID1','ID2','CHUNK','W_FROM','W_TO'))
# for each chunk, count: windows by type and effective length of chunk
# then sum chunks
rplkl <- df[, list( K= length(W_FROM), KEFF= length(W_FROM)/CHUNK_N[1] * CHUNK_L[1]), by=c('ID1','ID2','CHUNK','TYPE_BASIC')]
rplkl <- rplkl[, list(STAT=c('K','KEFF'), V=c(sum(K),sum(KEFF))), by=c('ID1','ID2','TYPE_BASIC')]
#
# add relationship types
#
setnames(rplkl, 'TYPE_BASIC', 'TYPE_DIR_TODI7x3') #for backwards compatibility
rplkl <- phsc.get.pairwise.relationships(rplkl, get.groups=get.groups, make.pretty.labels=FALSE)
#for(x in get.groups)
# set(rplkl, NULL, x, gsub('_',' ',rplkl[[x]]))
setnames(rplkl, 'TYPE_DIR_TODI7x3', 'TYPE_BASIC')
# melt relationship groups
rplkl <- melt(rplkl, measure.vars=c(get.groups,'TYPE_BASIC'), variable.name='GROUP', value.name='TYPE')
# # get.groups are not col of rplkl
# rplkl <- melt(rplkl, measure.vars='TYPE_BASIC', variable.name='GROUP', value.name='TYPE')
rplkl <- subset(rplkl, !is.na(TYPE))
# sum K and KEFF of same relationship state
rplkl <- rplkl[, list(V=sum(V)), by=c('ID1','ID2','GROUP','TYPE','STAT')]
# add zero-count relationship states (change to wide table and set NA's to zero's)
tmp <- unique(subset(rplkl, select=c(GROUP,TYPE)))
tmp2 <- unique(subset(rplkl, select=c(ID1,ID2, STAT)))
tmp[, DUMMY:=1L]
tmp2[, DUMMY:=1L]
tmp <- merge(tmp, tmp2, by='DUMMY',allow.cartesian=TRUE)
set(tmp, NULL, 'DUMMY', NULL)
rplkl <- merge(tmp, rplkl, all.x=1, by=c('ID1','ID2','GROUP','TYPE','STAT'))
set(rplkl, rplkl[,which(is.na(V))], 'V', 0)
# expand KEFF and K columns now that everything is done
rplkl <- dcast.data.table(rplkl, ID1+ID2+GROUP+TYPE~STAT, value.var='V')
# calculate N and NEFF
tmp <- rplkl[, list(N= sum(K), NEFF= sum(KEFF)), by=c('ID1','ID2','GROUP')]
rplkl <- merge(rplkl, tmp, by=c('ID1','ID2','GROUP'))
rplkl
}
#' @keywords internal
#' @title Calculate basic pairwise relationships
#' @description This function calculates basic pairwise relationships of two individuals in any window. Several different relationship groups can be calculated, for example just using pairwise distance, or using both pairwise distance and topology to define likely pairs.
#' @param df data.table to which basic pairwise relationships will be added. Must contain columns with name 'ADJACENT','TYPE_RAW','PATRISTIC_DISTANCE','PATHS_12','PATHS_21'.
#' @param trmw.close.brl Maximum patristic distance between any two read trees from both individuals in a window to classify the individuals as phylogenetically close.
#' @param trmw.distant.brl Minimum patristic distance between any two read trees from both individuals in a window to classify the individuals as phylogenetically distant.
#' @return input data.table with new relationship column TYPE_BASIC.
phsc.get.basic.pairwise.relationships<- function(df, trmw.close.brl, trmw.distant.brl, verbose=TRUE)
{
df[, TYPE_BASIC:= TYPE_RAW]
stopifnot(c('ADJACENT','CONTIGUOUS','TYPE_BASIC','PATRISTIC_DISTANCE','PATHS_12','PATHS_21')%in%colnames(df))
#
# chains_12
#
tmp <- df[, which(PATHS_12>0 & PATHS_21==0 & ADJACENT & CONTIGUOUS)]
if(verbose) cat('\nFound PATHS_12>0 & PATHS_21==0 & CONTIGUOUS, n=', length(tmp),'--> chain_12 with no intermediate')
set(df, tmp, 'TYPE_BASIC', 'chain_12_nointermediate')
tmp <- df[, which(PATHS_12>0 & PATHS_21==0 & ADJACENT & !CONTIGUOUS)]
if(verbose) cat('\nFound PATHS_12>0 & PATHS_21==0 & !CONTIGUOUS, n=', length(tmp),'--> chain_12 with intermediate')
set(df, tmp, 'TYPE_BASIC', 'chain_12_withintermediate')
#
# chains_21
#
tmp <- df[, which(PATHS_12==0 & PATHS_21>0 & ADJACENT & CONTIGUOUS)]
if(verbose) cat('\nFound PATHS_12==0 & PATHS_21>0 & CONTIGUOUS, n=', length(tmp),'--> chain_21 with no intermediate')
set(df, tmp, 'TYPE_BASIC', 'chain_21_nointermediate')
tmp <- df[, which(PATHS_12==0 & PATHS_21>0 & ADJACENT & !CONTIGUOUS)]
if(verbose) cat('\nFound PATHS_12==0 & PATHS_21>0 & !CONTIGUOUS, n=', length(tmp),'--> chain_21 with intermediate')
set(df, tmp, 'TYPE_BASIC', 'chain_21_withintermediate')
#
# intermingled
#
tmp <- df[, which(PATHS_12>0 & PATHS_21>0 & ADJACENT & CONTIGUOUS)]
if(verbose) cat('\nFound PATHS_12==0 & PATHS_21==0 & CONTIGUOUS, n=', length(tmp),'--> intermingled with no intermediate')
set(df, tmp, 'TYPE_BASIC', 'intermingled_nointermediate')
tmp <- df[, which(PATHS_12>0 & PATHS_21>0 & ADJACENT & !CONTIGUOUS)]
if(verbose) cat('\nFound PATHS_12==0 & PATHS_21==0 & !CONTIGUOUS, n=', length(tmp),'--> intermingled with intermediate')
set(df, tmp, 'TYPE_BASIC', 'intermingled_withintermediate')
#
# sibling
#
tmp <- df[, which(PATHS_12==0 & PATHS_21==0 & ADJACENT & CONTIGUOUS)]
if(verbose) cat('\nFound PATHS_12==0 & PATHS_21==0 & ADJACENT & CONTIGUOUS, n=', length(tmp),'--> sibling with no intermediate')
set(df, tmp, 'TYPE_BASIC', 'sibling_nointermediate')
tmp <- df[, which(PATHS_12==0 & PATHS_21==0 & ADJACENT & !CONTIGUOUS)]
if(verbose) cat('\nFound PATHS_12==0 & PATHS_21==0 & ADJACENT & !CONTIGUOUS, n=', length(tmp),'--> sibling with intermediate')
set(df, tmp, 'TYPE_BASIC', 'sibling_withintermediate')
#
# other
#
tmp <- df[, which(PATHS_12==0 & PATHS_21==0 & !ADJACENT )]
if(verbose) cat('\nFound PATHS_12==0 & PATHS_21==0 & !ADJACENT, n=', length(tmp),'--> other')
set(df, tmp, 'TYPE_BASIC', 'other')
tmp <- df[, which(PATHS_12>0 & PATHS_21==0 & !ADJACENT )]
if(verbose) cat('\nFound PATHS_12>0 & PATHS_21==0 & !ADJACENT, n=', length(tmp),'--> other')
set(df, tmp, 'TYPE_BASIC', 'other')
tmp <- df[, which(PATHS_12==0 & PATHS_21>0 & !ADJACENT )]
if(verbose) cat('\nFound PATHS_12==0 & PATHS_21>0 & !ADJACENT, n=', length(tmp),'--> other')
set(df, tmp, 'TYPE_BASIC', 'other')
tmp <- df[, which(PATHS_12>0 & PATHS_21>0 & !ADJACENT )]
if(verbose) cat('\nFound PATHS_12>0 & PATHS_21>0 & !ADJACENT, n=', length(tmp),'--> other')
set(df, tmp, 'TYPE_BASIC', 'other')
# check
stopifnot( !nrow(subset(df, TYPE_BASIC=='none')) )
#
# add distance as second dimension
#
if(!is.na(trmw.close.brl) & is.finite(trmw.close.brl))
{
if(verbose) cat('\nidentifying close pairwise assignments using distance=',trmw.close.brl)
tmp <- df[, which(PATRISTIC_DISTANCE<trmw.close.brl)]
if(verbose) cat('\nFound close, n=', length(tmp))
set(df, tmp, 'TYPE_BASIC', df[tmp, paste0(TYPE_BASIC,'_close')])
}
if(!is.na(trmw.distant.brl) & is.finite(trmw.distant.brl))
{
if(verbose) cat('\nidentifying distant pairwise assignments using distance=',trmw.distant.brl)
tmp <- df[, which(PATRISTIC_DISTANCE>=trmw.distant.brl)]
if(verbose) cat('\nFound distant, n=', length(tmp))
set(df, tmp, 'TYPE_BASIC', df[tmp, paste0(TYPE_BASIC,'_distant')])
}
df
}
#' @keywords internal
#' @title Calculate marginal posterior probability for two individuals being in a particular relationship state
#' @description This function calculates the parameters that specify the marginal posterior probability for two individuals being in a particular relationship state. The marginal posterior is Beta distributed and this function calculates the ALPHA and BETA parameters.
#' @param df Input data.table
#' @param n.type Calibration parameter for the prior: minimum number of windows of state to select a pair of individuals with confidence of at least at least confidence.cut, if the total number of windows is n.obs
#' @param n.obs Calibration parameter for the prior: total number of windows.
#' @param confidence.cut Calibration parameter for the prior: confidence cut off.
#' @return Input data.table with two additional columns POSTERIOR_ALPHA and POSTERIOR_BETA
phsc.get.pairwise.relationships.posterior<- function(df, n.type=2, n.obs=3, n.type.dir=n.type, n.obs.dir=n.obs, confidence.cut=0.5)
{
stopifnot(c('GROUP')%in%colnames(df))
tmp <- phsc.get.pairwise.relationships.numbers()
tmp <- tmp[, {
#if(!grepl('_DIR',GROUP))
# z<- phsc.get.prior.parameter.n0(N_TYPE, keff=n.type, neff=n.obs, confidence.cut=confidence.cut)
#if(grepl('_DIR',GROUP))
# z<- phsc.get.prior.parameter.n0(N_TYPE, keff=n.type.dir, neff=n.obs.dir, confidence.cut=confidence.cut)
#list(PAR_PRIOR=z)
list(PAR_PRIOR=N_TYPE)
}, by=c('GROUP','N_TYPE')]
df <- merge(df, tmp, by=c('GROUP'))
df[, POSTERIOR_ALPHA:= PAR_PRIOR/N_TYPE+KEFF]
df[, POSTERIOR_BETA:= PAR_PRIOR*(1-1/N_TYPE)+NEFF-KEFF]
df
}
#' @keywords internal
#' @title Calculate pairwise relationships
#' @description This function calculates pairwise relationships of two individuals in any window. Several different relationship groups can be calculated, for example just using pairwise distance, or using both pairwise distance and topology to define likely pairs.
#' @param df data.table to which new columns of relationship groups will be added. Must contain a column with name TYPE_DIR_TODI7x3. This column contains fundamental relationship states for every window, from which other relationships are derived.
#' @param get.groups names of relationship groups
#' @param make.pretty.labels Logical
#' @return input data.table with new columns. Each new column defines relationship states for a specific relationship group.
phsc.get.pairwise.relationships<- function(df, get.groups=c('TYPE_PAIR_DI2','TYPE_PAIR_TO','TYPE_PAIR_TODI2x2','TYPE_PAIR_TODI2','TYPE_DIR_TODI2','TYPE_NETWORK_SCORES','TYPE_CHAIN_TODI','TYPE_ADJ_NETWORK_SCORES','TYPE_ADJ_DIR_TODI2'), make.pretty.labels=TRUE)
{
#
# group to define likely pair just based on distance
#
#if('TYPE_PAIR_DI'%in%get.groups)
#{
# df[, TYPE_PAIR_DI:= 'intermediate\ndistance']
# set(df, df[, which(grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_PAIR_DI', 'close')
# set(df, df[, which(grepl('distant',TYPE_DIR_TODI7x3))], 'TYPE_PAIR_DI', 'distant')
#}
#if('TYPE_PAIRSCORE_DI'%in%get.groups)
#{
# df[, TYPE_PAIRSCORE_DI:= NA_character_]
# set(df, df[, which(grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_PAIRSCORE_DI', 'close')
# set(df, df[, which(grepl('distant',TYPE_DIR_TODI7x3))], 'TYPE_PAIRSCORE_DI', 'distant')
#}
if('TYPE_PAIR_DI2_NOAMB'%in%get.groups)
{
df[, TYPE_PAIR_DI2:= 'not close']
set(df, df[, which(grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_PAIR_DI2', 'close')
}
if('TYPE_PAIR_DI2'%in%get.groups)
{
df[, TYPE_PAIR_DI2:= 'ambiguous']
set(df, df[, which(grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_PAIR_DI2', 'close')
set(df, df[, which(grepl('distant',TYPE_DIR_TODI7x3))], 'TYPE_PAIR_DI2', 'distant')
}
#
# group to define likely pair just based on topology
#
if('TYPE_PAIR_TO'%in%get.groups)
{
df[, TYPE_PAIR_TO:= 'other']
set(df, df[, which(grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('chain',TYPE_DIR_TODI7x3))], 'TYPE_PAIR_TO', 'ancestral/\nintermingled')
set(df, df[, which(grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('intermingled',TYPE_DIR_TODI7x3))], 'TYPE_PAIR_TO', 'ancestral/\nintermingled')
}
#
# group for full 2x2 table of distance and topology
#
if('TYPE_PAIR_TODI2x2'%in%get.groups)
{
df[, TYPE_PAIR_TODI2x2:= 'not close other']
set(df, df[, which(grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_PAIR_TODI2x2', 'close ancestral/\nintermingled/\nsibling')
set(df, df[, which(grepl('nointermediate',TYPE_DIR_TODI7x3) & !grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_PAIR_TODI2x2', 'not close ancestral/\nintermingled/\nsibling')
set(df, df[, which(!grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_PAIR_TODI2x2', 'close other')
}
#
# group to determine linkage - only 2 states, linked / unlinked
#
if('TYPE_PAIR_TODI2_NOAMB'%in%get.groups)
{
df[, TYPE_PAIR_TODI2:= 'unlinked']
set(df, df[, which(grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_PAIR_TODI2', 'linked')
}
if('TYPE_PAIR_TODI2'%in%get.groups)
{
df[, TYPE_PAIR_TODI2:= 'unlinked']
set(df, df[, which(grepl('nointermediate',TYPE_DIR_TODI7x3) & !grepl('close',TYPE_DIR_TODI7x3) & !grepl('distant',TYPE_DIR_TODI7x3))], 'TYPE_PAIR_TODI2', 'ambiguous')
set(df, df[, which(grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_PAIR_TODI2', 'linked')
}
#
# group to determine likely pairs
#
#if('TYPE_PAIR_TODI'%in%get.groups)
#{
# df[, TYPE_PAIR_TODI:= 'distant/disconnected']
# set(df, df[, which(!grepl('distant',TYPE_DIR_TODI7x3) & !grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_PAIR_TODI', 'intermediate distance')
# set(df, df[, which(grepl('withintermediate',TYPE_DIR_TODI7x3))], 'TYPE_PAIR_TODI', 'distant/disconnected')
# set(df, df[, which(grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_PAIR_TODI', 'likely pair')
#}
# same as TYPE_PAIR_TODI but do not count ambiguous distance
#if('TYPE_PAIRSCORE_TODI'%in%get.groups)
#{
# df[, TYPE_PAIRSCORE_TODI:= 'distant/disconnected']
# set(df, df[, which(grepl('withintermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_PAIRSCORE_TODI', 'distant/disconnected')
# set(df, df[, which(grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_PAIRSCORE_TODI', 'likely pair')
# set(df, df[, which(!grepl('distant',TYPE_DIR_TODI7x3) & !grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_PAIRSCORE_TODI', NA_character_)
#}
#
# group to determine direction of transmission in likely pairs
#
#if('TYPE_DIR_TODI3'%in%get.groups)
#{
# df[, TYPE_DIR_TODI3:= NA_character_] # non-NA: all relationships that are used for likely pair
# set(df, df[, which(grepl('other',TYPE_DIR_TODI7x3) & grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TODI3', 'ambiguous')
# set(df, df[, which(grepl('intermingled',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TODI3', 'ambiguous')
# set(df, df[, which(grepl('chain_',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TODI3', 'ambiguous')
# set(df, df[, which(grepl('chain_fm',TYPE_DIR_TODI7x3) & grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TODI3', 'fm')
# set(df, df[, which(grepl('chain_mf',TYPE_DIR_TODI7x3) & grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TODI3', 'mf')
# set(df, df[, which(grepl('chain_12',TYPE_DIR_TODI7x3) & grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TODI3', '12')
# set(df, df[, which(grepl('chain_21',TYPE_DIR_TODI7x3) & grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TODI3', '21')
#}
#
# group to determine direction of transmission in likely pairs
# based on contiguous linkage
#
if('TYPE_DIR_TODI2'%in%get.groups)
{
df[, TYPE_DIR_TODI2:= NA_character_] # non-NA: all relationships of a likely pair that have a direction assigned
set(df, df[, which(grepl('chain_fm',TYPE_DIR_TODI7x3) & grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TODI2', 'fm')
set(df, df[, which(grepl('chain_mf',TYPE_DIR_TODI7x3) & grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TODI2', 'mf')
set(df, df[, which(grepl('chain_12',TYPE_DIR_TODI7x3) & grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TODI2', '12')
set(df, df[, which(grepl('chain_21',TYPE_DIR_TODI7x3) & grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TODI2', '21')
}
#
# group to determine direction of transmission in likely pairs
# based on adjacent linkage
#
if('TYPE_ADJ_DIR_TODI2'%in%get.groups)
{
df[, TYPE_ADJ_DIR_TODI2:= NA_character_] # non-NA: all relationships of a likely pair that have a direction assigned
set(df, df[, which(grepl('chain_fm',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_ADJ_DIR_TODI2', 'fm')
set(df, df[, which(grepl('chain_mf',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_ADJ_DIR_TODI2', 'mf')
set(df, df[, which(grepl('chain_12',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_ADJ_DIR_TODI2', '12')
set(df, df[, which(grepl('chain_21',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_ADJ_DIR_TODI2', '21')
}
#
# group to determine probabilities for transmission networks
# based on contiguous linkage
#
if('TYPE_NETWORK_SCORES'%in%get.groups)
{
df[, TYPE_NETWORK_SCORES:= 'not close/disconnected'] # non-NA: all relationships that are used for likely pair
set(df, df[, which(grepl('sibling',TYPE_DIR_TODI7x3) & grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_NETWORK_SCORES', 'ambiguous')
set(df, df[, which(grepl('intermingled',TYPE_DIR_TODI7x3) & grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_NETWORK_SCORES', 'ambiguous')
set(df, df[, which(grepl('chain_fm',TYPE_DIR_TODI7x3) & grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_NETWORK_SCORES', 'fm')
set(df, df[, which(grepl('chain_mf',TYPE_DIR_TODI7x3) & grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_NETWORK_SCORES', 'mf')
set(df, df[, which(grepl('chain_12',TYPE_DIR_TODI7x3) & grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_NETWORK_SCORES', '12')
set(df, df[, which(grepl('chain_21',TYPE_DIR_TODI7x3) & grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_NETWORK_SCORES', '21')
}
#
# group to determine probabilities for transmission networks
# based on adjacent linkage
#
if('TYPE_ADJ_NETWORK_SCORES'%in%get.groups)
{
df[, TYPE_ADJ_NETWORK_SCORES:= 'not close/disconnected'] # non-NA: all relationships that are used for likely pair
set(df, df[, which(grepl('sibling',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_ADJ_NETWORK_SCORES', 'ambiguous')
set(df, df[, which(grepl('intermingled',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_ADJ_NETWORK_SCORES', 'ambiguous')
set(df, df[, which(grepl('chain_fm',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_ADJ_NETWORK_SCORES', 'fm')
set(df, df[, which(grepl('chain_mf',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_ADJ_NETWORK_SCORES', 'mf')
set(df, df[, which(grepl('chain_12',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_ADJ_NETWORK_SCORES', '12')
set(df, df[, which(grepl('chain_21',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_ADJ_NETWORK_SCORES', '21')
}
#
# group to transmission networks
#
if('TYPE_CHAIN_TODI_NOAMB'%in%get.groups)
{
df[, TYPE_CHAIN_TODI:= 'distant']
set(df, df[, which(grepl('withintermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_CHAIN_TODI', 'chain')
set(df, df[, which(grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_CHAIN_TODI', 'chain')
}
if('TYPE_CHAIN_TODI'%in%get.groups)
{
df[, TYPE_CHAIN_TODI:= 'distant']
set(df, df[, which(grepl('withintermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_CHAIN_TODI', 'chain')
set(df, df[, which(grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_CHAIN_TODI', 'chain')
set(df, df[, which(grepl('withintermediate',TYPE_DIR_TODI7x3) & !grepl('close',TYPE_DIR_TODI7x3) & !grepl('distant',TYPE_DIR_TODI7x3))], 'TYPE_CHAIN_TODI', 'ambiguous')
set(df, df[, which(grepl('nointermediate',TYPE_DIR_TODI7x3) & !grepl('close',TYPE_DIR_TODI7x3) & !grepl('distant',TYPE_DIR_TODI7x3))], 'TYPE_CHAIN_TODI', 'ambiguous')
}
#
# TYPE_DIR_TODI7x3 make pretty labels
#
if(make.pretty.labels)
{
set(df, NULL, 'TYPE_DIR_TODI7x3', df[, gsub('other_close','other',gsub('other_distant','other',TYPE_DIR_TODI7x3))])
set(df, NULL, 'TYPE_DIR_TODI7x3', df[, gsub('([ho])intermediate','\\1 intermediate',gsub('intermediate_','intermediate\n',gsub('intermingled_','intermingled\n',gsub('(chain_[fm][mf])_','\\1\n',gsub('(chain_[12][21])_','\\1\n',TYPE_DIR_TODI7x3)))))])
#set(df, NULL, 'TYPE_DIR_TODI7x3', df[, gsub('other_withintermediate_distant','other_distant',gsub('other_withintermediate_close','other_close',gsub('other_withintermediate$','other',gsub('other_nointermediate$','other',gsub('other_nointermediate_distant','other_distant',TYPE_DIR_TODI7x3)))))])
#set(df, NULL, 'TYPE_DIR_TODI7x3', df[, gsub('other_no','other\nno',gsub('([ho])intermediate','\\1 intermediate',gsub('intermediate_','intermediate\n',gsub('intermingled_','intermingled\n',gsub('(chain_[fm][mf])_','\\1\n',gsub('(chain_[12][21])_','\\1\n',TYPE_DIR_TODI7x3))))))])
for(x in c('TYPE_DIR_TODI7x3',get.groups))
set(df, NULL, x, gsub('_',' ',df[[x]]))
}
df
}
#' @keywords internal
#' @title Wrapper script to calculate pairwise relationships and likelihoods
phsc.get.pairwise.relationships.likelihoods<- function(dwin, trmw.min.reads, trmw.min.tips, trmw.close.brl, trmw.distant.brl, prior.keff, prior.neff, prior.calibrated.prob, relationship.types, prior.keff.dir=prior.keff, prior.neff.dir=prior.neff, verbose=TRUE)
{
setnames(dwin, c('PAT.1','PAT.2','PAT.1_TIPS','PAT.2_TIPS','PAT.1_READS','PAT.2_READS','PATHS.12','PATHS.21'),
c('ID1','ID2','ID1_L','ID2_L','ID1_R','ID2_R','PATHS_12','PATHS_21'))
set(dwin, NULL, 'PATRISTIC_DISTANCE', dwin[, as.numeric(PATRISTIC_DISTANCE)])
#
# selection windows
#
if(verbose) cat('\nReducing transmission window stats to windows with at least',trmw.min.reads,'reads and at least',trmw.min.tips,'tips ...')
dwin <- subset(dwin, ID1_R>=trmw.min.reads & ID2_R>=trmw.min.reads & ID1_L>=trmw.min.tips & ID2_L>=trmw.min.tips)
if(verbose) cat('\nTotal number of windows with trm assignments is',nrow(dwin),'...')
#
# define basic relationship types
#
setnames(dwin, 'TYPE', 'TYPE_RAW')
if(verbose) cat('\nCalculate basic pairwise relationships for windows n=',nrow(dwin),'...')
dwin <- Phyloscanner.R.utilities:::phsc.get.basic.pairwise.relationships(dwin, trmw.close.brl, trmw.distant.brl)
#
# derive other relationship types
#
if(verbose) cat('\nCalculate derived pairwise relationships for windows n=',nrow(dwin),'...')
setnames(dwin, 'TYPE_BASIC', 'TYPE_DIR_TODI7x3') #for backwards compatibility
dwin <- Phyloscanner.R.utilities:::phsc.get.pairwise.relationships(dwin, get.groups=relationship.types, make.pretty.labels=FALSE)
setnames(dwin, 'TYPE_DIR_TODI7x3', 'TYPE_BASIC')
#
# calculate effective K.
# this is based on windows and contiguous chunks of windows
#
# guess W_FROM W_TO from SUFFIX
if(verbose) cat('\nCalculate KEFF and NEFF for windows n=',nrow(dwin),'...')
set(dwin, NULL, 'W_FROM', dwin[, as.integer(gsub('[^0-9]*([0-9]+)_to_([0-9]+).*','\\1', SUFFIX))])
set(dwin, NULL, 'W_TO', dwin[, as.integer(gsub('[^0-9]*([0-9]+)_to_([0-9]+).*','\\2', SUFFIX))])
rplkl <- Phyloscanner.R.utilities:::phsc.get.pairwise.relationships.keff.and.neff(dwin, relationship.types)
#
# calculate marginal posterior for each pairwise relationship state
# this needs a prior which is calibrated as desired.
# the default calibration is KEFF=2 out of NEFF=3 windows are of type t so that the pair is selected to be of type t with posterior probability=50%
#
if(verbose) cat('\nCalculate posterior state probabilities for pairs and relationship groups n=',nrow(rplkl),'...')
rplkl <- Phyloscanner.R.utilities:::phsc.get.pairwise.relationships.posterior(rplkl, n.type=prior.keff, n.obs=prior.neff, n.type.dir=prior.keff.dir, n.obs.dir=prior.neff.dir, confidence.cut=prior.calibrated.prob)
#
# make TYBE_BASIC labels nice
#
tmp <- rplkl[, which(GROUP=='TYPE_BASIC')]
set(rplkl, tmp, 'TYPE', rplkl[tmp, gsub('other_withintermediate_distant','other_distant',gsub('other_withintermediate_close','other_close',gsub('other_withintermediate$','other',gsub('other_nointermediate$','other',gsub('other_nointermediate_distant','other_distant',TYPE)))))])
set(rplkl, tmp, 'TYPE', rplkl[tmp, gsub('other_no','other\nno',gsub('([ho])intermediate','\\1 intermediate',gsub('intermediate_','intermediate\n',gsub('intermingled_','intermingled\n',gsub('(chain_[fm][mf])_','\\1\n',gsub('(chain_[12][21])_','\\1\n',TYPE))))))])
set(rplkl, tmp, 'TYPE', rplkl[tmp, gsub('_',' ',TYPE)])
set(dwin, NULL, 'TYPE_BASIC', dwin[, gsub('other_withintermediate_distant','other_distant',gsub('other_withintermediate_close','other_close',gsub('other_withintermediate$','other',gsub('other_nointermediate$','other',gsub('other_nointermediate_distant','other_distant',TYPE_BASIC)))))])
set(dwin, NULL, 'TYPE_BASIC', dwin[, gsub('other_no','other\nno',gsub('([ho])intermediate','\\1 intermediate',gsub('intermediate_','intermediate\n',gsub('intermingled_','intermingled\n',gsub('(chain_[fm][mf])_','\\1\n',gsub('(chain_[12][21])_','\\1\n',TYPE_BASIC))))))])
set(dwin, NULL, 'TYPE_BASIC', dwin[, gsub('_',' ',TYPE_BASIC)])
list(dwin=dwin, rplkl=rplkl)
}
#' @keywords internal
#' @importFrom igraph graph.data.frame igraph.to.graphNEL
#' @importFrom RBGL edmondsOptimumBranching
#' @title Construct maximum probability transmission network
#' @description This function reconstructs a maximum probility transmission
#' network from the scores associated with directed and undirected edges.
#' The algorithm starts by keeping the edge with highest score.
#' It then removes the competitor in the opposite direction, and any conflicting edges that would result in indegrees larger than one.
#' By construction, all removed edges have lower probability.
#' The algorithm proceeds until all edges have been processed.
#' @param rtn data.table with network scores for all individuals that could form a network. Must contain columns 'ID1','ID2','IDCLU','GROUP','TYPE','POSTERIOR_SCORE','KEFF'.
#' @return new data.table with added columns LINK_12 LINK_21 (either 1 or 0), and MX_PROB_12 MX_PROB_21 (associated posterior probabilities)
phsc.get.most.likely.transmission.chains.RBGLedmonds<- function(rtnn, verbose=0)
{
stopifnot(c('ID1','ID2','IDCLU','TYPE','POSTERIOR_SCORE','KEFF')%in%colnames(rtnn))
stopifnot( !length(setdiff(c('ambiguous','21','12'), rtnn[, unique(TYPE)])) )
rtnn[, ID1_IN_WEIGHT:=0]
set(rtnn, rtnn[, which(TYPE=='ambiguous')],'ID1_IN_WEIGHT', 0.5)
set(rtnn, rtnn[, which(TYPE=='21')],'ID1_IN_WEIGHT', 1)
rtnn[, ID2_IN_WEIGHT:=0]
set(rtnn, rtnn[, which(TYPE=='ambiguous')],'ID2_IN_WEIGHT', 0.5)
set(rtnn, rtnn[, which(TYPE=='12')],'ID2_IN_WEIGHT', 1)
rtm <- rtnn[, list( PROB_21= sum(POSTERIOR_SCORE*ID1_IN_WEIGHT),
KEFF_21= sum(KEFF*ID1_IN_WEIGHT),
PROB_12= sum(POSTERIOR_SCORE*ID2_IN_WEIGHT),
KEFF_12= sum(KEFF*ID2_IN_WEIGHT)), by=c('IDCLU','CLU_SIZE','ID1','ID2')]
#
# handle networks of size 2 - this is easy
#
ans <- subset(rtm, CLU_SIZE==2)
set(ans, NULL, c('LINK_12','LINK_21'), 0L)
set(ans, ans[, which(PROB_12>PROB_21)], 'LINK_12', 1L)
set(ans, ans[, which(PROB_21>PROB_12)], 'LINK_21', 1L)
#
# handle networks of size >2 - use Edmonds algorithm
#
rtm <- subset(rtm, CLU_SIZE>2)
rtmm <- lapply(rtm[, unique(IDCLU)], function(x) subset(rtm, IDCLU==x))
for(i in seq_along(rtmm))
{
#
#i <- 13
if(verbose)
cat('\nIDCLU ',i)
adj <- rtmm[[i]]
adj2<- melt(adj, id.vars=c('ID1','ID2'), measure.vars=c('PROB_12','PROB_21'), value.name='weight')
tmp <- subset(adj2, variable=='PROB_21')
setnames(tmp, c('ID1','ID2'), c('ID2','ID1'))
adj2<- rbind(subset(adj2, variable=='PROB_12'), tmp)
adj2<- subset(adj2, weight>0)
# maximisation is over sum of weights, so transform to log prob
# since edmonds cannot handle negative values or zeros, add some constant
set(adj2, NULL, 'weight', adj2[, log(weight) - min(log(weight)) + 1])
adj2[, variable:=NULL]
g <- graph.data.frame(adj2)
g2 <- igraph.to.graphNEL(g)
g3 <- edmondsOptimumBranching(g2)
g3 <- as.data.table(t(g3$edgeList))
setnames(g3, c('from','to'), c('ID1','ID2'))
g3[, LINK_12:= 1L]
g3[, LINK_21:= 0L]
tmp <- copy(g3)
setnames(tmp, c('ID1','ID2', 'LINK_12', 'LINK_21'), c('ID2','ID1', 'LINK_21', 'LINK_12'))
tmp <- rbind(g3, tmp)
adj <- merge(adj, tmp, by=c('ID1','ID2'))
rtmm[[i]] <- adj
}
rtm <- do.call('rbind',rtmm)
ans <- rbind(ans, rtm)
set(ans, ans[, which(LINK_12==1)], 'PROB_21', 0)
set(ans, ans[, which(LINK_21==1)], 'PROB_12', 0)
setnames(ans, c('PROB_21','PROB_12','KEFF_21','KEFF_12'), c('MX_PROB_21','MX_PROB_12','MX_KEFF_21','MX_KEFF_12'))
ans <- merge(rtnn, ans, by=c('ID1','ID2','IDCLU','CLU_SIZE'))
ans
}
#' @keywords internal
#' @title Reconstruct most likely transmission chains
#' @description This function reconstructs most likely transmission chains
#' from the scores associated with directed and undirected edges.
#' @param rtn data.table with network scores for all individuals that could form a network. Must contain columns 'ID1','ID2','IDCLU','GROUP','TYPE','POSTERIOR_SCORE','KEFF'.
#' @return new data.table with added columns LINK_12 LINK_21 (either 1 or 0), and MX_PROB_12 MX_PROB_21 (associated posterior probabilities)
phsc.get.most.likely.transmission.chains<- function(rtnn, verbose=0, method='Edmonds')
{
if(method=='greedy')
return(Phyloscanner.R.utilities:::phsc.get.most.likely.transmission.chains.greedy(rtnn, verbose=verbose))
if(method=='Edmonds')
return(Phyloscanner.R.utilities:::phsc.get.most.likely.transmission.chains.RBGLedmonds(rtnn, verbose=verbose))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.