pty.MRC.stage1.find.pairs.in.networks<- function()
{
require(tidyverse)
require(data.table)
require(phyloscannerR)
require(igraph)
# set up working environment
HOME <<- '/rds/general/project/ratmann_pangea_analyses_mrc_uvri/live'
indir <- file.path(HOME,'MRCPopSample_phsc_stage1_190715')
outfile <- file.path(indir, 'MRC_phscnetworks_allpairs_by_distance_190715.rda')
control <- list( linked.group='proximity.3.way.cat',
linked.no='distant',
linked.yes='close',
conf.cut=0.6,
neff.cut=3
)
tmp <- find.pairs.in.networks(indir, batch.regex='^ptyr([0-9]+)_.*', control=control, verbose=TRUE)
dpl <- copy(tmp$network.pairs)
dc <- copy(tmp$relationship.counts)
dw <- copy(tmp$windows)
save(dpl, dc, dw, file=outfile)
}
pty.MRC.stage2.identify.potential.networks<- function()
{
require(tidyverse)
require(data.table)
require(phyloscannerR)
require(igraph)
#
# construct potential transmission networks
# need at least 50% of windows in which two pairs have patristic distance <2.5%
#
infile <- file.path('/rds/general/project/ratmann_pangea_analyses_mrc_uvri/live','MRCPopSample_phsc_stage1_190715', 'MRC_phscnetworks_allpairs_by_distance_190715.rda')
control <- list( linked.group='proximity.3.way.cat',
linked.no='distant',
linked.yes='close',
conf.cut=0.6,
neff.cut=3
)
verbose <- TRUE
load(infile)
linked.group <- control$linked.group
linked.no <- control$linked.no
linked.yes <- control$linked.yes
neff.cut <- control$neff.cut
dnet <- dc %>%
filter( CATEGORISATION==linked.group & TYPE==linked.yes & N_EFF>neff.cut) %>%
select(-c(CATEGORICAL_DISTANCE, K, K_EFF, N, N_EFF))
dnet <- dnet %>% filter(SCORE>0.5)
# define potential transmission network membership
if(verbose) cat('\nReconstruct transmission networks among linked pairs, n=',nrow(dnet))
tmp <- dnet %>% select(H1, H2)
tmp <- igraph:::graph.data.frame(tmp, directed=FALSE, vertices=NULL)
rtc <- tibble(ID=V(tmp)$name, CLU=clusters(tmp, mode="weak")$membership)
rtc <- rtc %>%
group_by(CLU) %>%
summarise( CLU_SIZE:=length(ID) ) %>%
arrange(desc(CLU_SIZE)) %>%
ungroup() %>%
mutate( IDCLU:=seq_along(CLU_SIZE) ) %>%
inner_join(rtc, by='CLU') %>%
select(-CLU)
# add info on edges: network membership
setnames(rtc, c('ID'), c('H1'))
dnet <- dnet %>% inner_join(rtc, by='H1')
rtc <- rtc %>% select(-CLU_SIZE)
setnames(rtc, c('H1'), c('H2'))
dnet <- dnet %>% inner_join(rtc, by=c('H2','IDCLU'))
# tabulate size of potential transmission networks
dnet %>% select(IDCLU, CLU_SIZE) %>% distinct() %>% count(CLU_SIZE)
#
# to each potential transmission network add close controls
#
indir <- file.path('/rds/general/project/ratmann_pangea_analyses_mrc_uvri/live','MRCPopSample_phsc_stage1_190715')
batch.regex <- '^ptyr([0-9]+)_.*'
outfile <- file.path('/rds/general/project/ratmann_pangea_analyses_mrc_uvri/live','MRCPopSample_phsc_stage1_190715', 'MRC_phscnetworks_closestpairs_by_distance_190715.rda')
dquery <- dnet %>% select(H1,H2) %>% gather() %>% select(value) %>% rename(host=value) %>% distinct()
linked.group <- control$linked.group
linked.no <- control$linked.no
linked.yes <- control$linked.yes
conf.cut <- control$conf.cut
neff.cut <- control$neff.cut
verbose <- TRUE
infiles <- tibble(F=list.files(indir, pattern='workspace.rda', full.names=TRUE)) %>%
mutate( PTY_RUN:= as.integer(gsub(batch.regex,'\\1',basename(F))) ) %>%
arrange(PTY_RUN)
if(verbose) cat('\nFound phylogenetic relationship files, n=', nrow(infiles))
if(verbose) cat('\nProcessing files...')
dcls <- vector('list', nrow(infiles))
for(i in seq_len(nrow(infiles)))
{
if(verbose) cat('\nReading ', infiles$F[i])
tmp <- load( infiles$F[i] )
# ensure we have multinomial output in workspace
if(!'dwin'%in%tmp)
stop('Cannot find object "dwin". Check that Analyze.trees was run with multinomial=TRUE.')
# ensure IDs are characters
if(!all(c( class( dwin$host.1 )=='character',
class( dwin$host.2 )=='character'
)))
stop('host.1 or host.2 not of character. This is unexpected, contact maintainer.')
# calculate average patristic distance
dwin <- dwin %>% select(host.1,host.2,patristic.distance)
dwin <- dwin %>%
rename(host.1=host.2, host.2=host.1) %>%
bind_rows(dwin) %>%
rename(host=host.1) %>%
inner_join(dquery, by='host')
dcl <- dwin %>%
group_by(host, host.2) %>%
summarise(PDM= mean(patristic.distance)) %>%
arrange(PDM) %>%
top_n(10, -PDM)
# save
dcls[[i]] <- dcl
}
dcls <- do.call('rbind', dcls)
save(dcls, file=outfile)
# end of pre-processing, continue
infile <- file.path('/rds/general/project/ratmann_pangea_analyses_mrc_uvri/live','MRCPopSample_phsc_stage1_190715', 'MRC_phscnetworks_closestpairs_by_distance_190715.rda')
load(infile)
dcls <- dcls %>%
group_by(host, host.2) %>%
summarise(PDM= mean(PDM)) %>%
arrange(PDM) %>%
top_n(10, -PDM)
dnet <- dnet %>%
select(IDCLU,CLU_SIZE,H1,H2) %>%
gather('variable','host',-IDCLU,-CLU_SIZE) %>%
select(-variable) %>%
distinct() %>%
arrange(IDCLU)
dcntrl <- dnet %>%
full_join(dcls, by='host') %>%
group_by(IDCLU, CLU_SIZE) %>%
filter(!host.2 %in% host) %>%
select(-host) %>%
group_by(IDCLU, host.2) %>%
summarise(PDM= mean(PDM)) %>%
arrange(IDCLU,PDM) %>%
top_n(4, -PDM) %>%
select(IDCLU, host.2) %>%
rename(host= host.2) %>%
mutate(CONTROL= 'yes')
dnet <- dnet %>%
select(IDCLU, host) %>%
mutate(CONTROL= 'no') %>%
bind_rows(dcntrl) %>%
group_by(IDCLU) %>%
arrange(IDCLU,CONTROL)
# add BAM files
infile <- file.path('/rds/general/project/ratmann_pangea_analyses_mrc_uvri/live','MRCPopSample_phsc_stage1_input','phsc_runs_MRC_stage1_n2531_181026.csv')
outfile <- file.path('/rds/general/project/ratmann_pangea_analyses_mrc_uvri/live','MRCPopSample_phsc_stage2_input','phsc_runs_MRC_stage2_n825_190719.csv')
dbam <- as_tibble(read.csv(infile, stringsAsFactors=FALSE)) %>%
select(IND,BAM) %>%
distinct()
pty.runs <- dnet %>%
rename(PTY_RUN:= IDCLU, IND:=host) %>%
inner_join(dbam, by='IND')
# write csv file
write.csv(pty.runs, file=outfile)
}
pty.MRC.stage3.assortativity<- function()
{
ass <- function(m)
{
m <- m / sum(m)
total.sources <- apply(m, 1, sum)
total.recipients <- apply(m, 2, sum)
ans <- sum( diag(m) ) - sum( total.sources * total.recipients )
ans <- ans / (1 - sum(total.sources * total.recipients))
ans
}
# sources in rows, recipients in cols
m <- matrix(c(50,50,50,50), 2, 2)
ass(m)
# 0
m <- matrix(c(200,0,0,0), 2, 2)
ass(m)
# NaN
m <- matrix(c(100,0,0,100), 2, 2)
ass(m)
# 1
m <- matrix(c(0,100,100,0), 2, 2)
ass(m)
# -1
# all the same assortativiy, but
# epidemiologic very different meaning
m <- matrix(c(10,20,10,30), 2, 2)
ass(m)
# 0.08
m <- matrix(c(30,20,10,10), 2, 2)
ass(m)
# 0.08
m <- matrix(c(10,10,20,30), 2, 2)
ass(m)
# 0.08
}
pty.MRC.stage2.make.reference.alignment<- function()
{
require(ape)
require(big.phylo)
# new LANL 2019 subtype consensus sequences
infile <- '~/Box Sync/OR_Work/2019/2019_PANGEA_BBosa/2019_New_ConsensusGenomesOneEach_GeneCut.fasta'
outfile <- '~/git/phyloscanner/phyloscannerR/inst/extdata/HIV1_LANL_2019_consensus_seqs.fasta'
seq <- read.dna(infile, format='fasta')
seq <- seq[c( "B.FR.83.HXB2_LAI_IIIB_BRU_K03455", "CON_A1(172)", "CON_A2(4)", "CON_A3(3)",
"A4.CD.97.97CD_KTB13_AM000054", "CON_A6(42)", "CON_B(1204)", "CON_C(720)",
"CON_D(71)", "CON_F1(39)", "CON_F2(9)", "CON_G(79)",
"CON_H(9)", "CON_J(6)", "CON_K(3)", "CON_OF_CONS"),]
seq <- big.phylo:::seq.rmgaps(seq, rm.only.col.gaps=1, verbose=0)
tmp <- gsub('CON','REF',gsub('\\)','',gsub('\\(','_s',rownames(seq))))
tmp[grepl('HXB2',tmp)] <- 'REF_HXB2'
tmp[grepl('A4',tmp)] <- 'REF_A4_s1'
tmp[grepl('REF_OF_REFS',tmp)] <- 'REF_consensus'
rownames(seq) <- tmp
write.dna(seq, file=outfile, format='fasta', colsep='', nbcol=-1)
}
pty.MRC.stage2.generate.read.alignments.ref.choice<- function()
{
require(data.table)
require(ape)
indir <- '~/Box Sync/OR_Work/2019/2019_PANGEA_BBosa'
infiles <- data.table(F=list.files(indir, pattern='tree$', full.names=TRUE, recursive=TRUE))
infiles[, ANA:= basename(dirname(F))]
infiles[, W_FROM:= as.integer(gsub('^.*InWindow_([0-9]+)_to.*$','\\1',basename(F)))]
dd <- infiles[, {
#infile <- infiles[670,F]
infile <- F
tree <- read.tree(infile)
# reroot 2012 files ( I specified inccorrect root )
if(any(tree$tip.label=='REF_CPX_AF460972'))
tree <- root(tree, 'REF_CPX_AF460972')
# plot pdf
tip.col <- rep('black', Ntip(tree))
tip.col[grepl('REF',tree$tip.label)] <- 'red'
outfile <- gsub('\\.tree','_tree.pdf',infile)
pdf(outfile, w=10, h=30)
plot(tree, tip.col=tip.col, cex=0.5)
dev.off()
# calculate distance to root
tmp <- node.depth.edgelength(tree)
list( DR_MAX= max(tmp), DR_MED=median(tmp) )
}, by=c('ANA','W_FROM')]
ggplot(dd, aes(x=W_FROM, colour=ANA, y=DR_MED, group=ANA)) + geom_point() + geom_line()
outfile <- '/Users/or105/Box/OR_Work/2019/2019_PANGEA_BBosa/pilot_refalignment_LANL2012selected_HKY/dist_to_root_max.pdf'
ggsave(file=outfile, w=10, h=6)
#
}
pty.MRC.stage2.generate.read.alignments<- function()
{
# set up working environment
require(Phyloscanner.R.utilities)
require(phyloscannerR)
require(data.table)
if(1)
{
HOME <<- '/rds/general/project/ratmann_pangea_analyses_mrc_uvri/live'
data.dir <- '/rds/general/project/ratmann_pangea_deepsequencedata/live/PANGEA2_MRC'
prog.pty <- '/rds/general/user/or105/home/libs_sandbox/phyloscanner/phyloscanner_make_trees.py'
}
if(0)
{
HOME <<- '~/sandbox/DeepSeqProjects'
data.dir <- '~/sandbox/DeepSeqProjects/MRCPopSample_data'
prog.pty <- '/Users/Oliver/git/phylotypes/phyloscanner_make_trees.py'
}
in.dir <- file.path(HOME,'MRCPopSample_phsc_stage2_input')
work.dir <- file.path(HOME,"MRCPopSample_phsc_work")
out.dir <- file.path(HOME,"MRCPopSample_phsc_stage2_output_newali_250_HKC")
#out.dir <- file.path(HOME,"MRCPopSample_phsc_stage2_output_newali_300")
dir.create(in.dir)
dir.create(work.dir)
dir.create(out.dir)
pty.runs <- as.data.table(read.csv(file.path(in.dir, "phsc_runs_MRC_stage2_n825_190719.csv")))
#pty.runs <- subset(pty.runs, PTY_RUN==130)
pty.runs <- unique(pty.runs,by=c('PTY_RUN','IND'))
#
# define phyloscanner input args to generate read alignments
#
pty.select <- NA
pty.args <- list( prog.pty=prog.pty,
prog.mafft='mafft',
data.dir=data.dir,
work.dir=work.dir,
out.dir=out.dir,
#alignments.file=system.file(package="Phyloscanner.R.utilities", "HIV1_compendium_AD_B_CPX_v2.fasta"),
#alignments.root='REF_CPX_AF460972',
#alignments.pairwise.to='REF_B_K03455',
alignments.file=system.file(package='phyloscannerR', file.path('extdata','HIV1_LANL_2019_consensus_seqs.fasta')),
alignments.pairwise.to='REF_HXB2',
alignments.root='REF_consensus',
window.automatic='',
merge.threshold=0,
min.read.count=1,
quality.trim.ends=23,
min.internal.quality=23,
merge.paired.reads=TRUE,
no.trees=TRUE,
dont.check.duplicates=FALSE,
dont.check.recombination=TRUE,
win=c(800,9400,25,250),
#win=c(800,9400,30,300),
keep.overhangs=FALSE,
mem.save=0,
verbose=TRUE,
select=pty.select #of 240
)
#save(pty.args, file=file.path(in.dir, 'phsc_args_stage2_create_read_alignments.rda'))
#
# define bash scripts to generate read alignments for all pairs of batches
#
# check which (if any) batches have already been processed, and remove from TODO list
tmp <- data.table(FILE_FASTA=list.files(out.dir, pattern='^ptyr[0-9]+_trees$', full.names=TRUE))
tmp[, PTY_RUN:= as.integer(gsub('ptyr([0-9]+)_.*','\\1',basename(FILE_FASTA)))]
tmp <- merge(tmp, tmp[, list(FILE_FASTA_N= length(list.files(FILE_FASTA, pattern='fasta$'))), by='PTY_RUN'], by='PTY_RUN')
pty.runs <- merge(pty.runs, tmp, by='PTY_RUN', all.x=1)
#pty.runs <- subset(pty.runs, is.na(FILE_FASTA_N) || FILE_FASTA_N==0)
#pty.runs <- subset(pty.runs, FILE_FASTA_N==0)
#pty.runs <- subset(pty.runs, is.na(FILE_FASTA_N))
# search for bam files and references and merge with runs
pty.runs <- subset(pty.runs, select=c(PTY_RUN, IND))
tmp <- phsc.find.bam.and.references(pty.args[['data.dir']], regex.person='^([A-Z0-9]+-[A-Z0-9]+)-.*$')
pty.runs <- merge(pty.runs, tmp, by='IND')
# create UNIX bash scripts
pty.runs <- unique(pty.runs, by=c('PTY_RUN','SAMPLE','BAM'))
pty.runs <- pty.runs[, list(BAM=BAM, REF=REF, SAMPLE=SAMPLE, RENAME_ID=paste0(IND,'-fq',seq_len(length(BAM)))), by=c('PTY_RUN','IND')]
setkey(pty.runs, PTY_RUN)
setnames(pty.runs, c('IND','SAMPLE'), c('UNIT_ID','SAMPLE_ID'))
pty.c <- phsc.cmd.phyloscanner.multi(pty.runs, pty.args)
pty.c[, CASE_ID:= seq_len(nrow(pty.c))]
#
# submit jobs to PBS job scheduler
#
# define PBS variables
hpc.load <- "module load intel-suite/2015.1 mpi raxml/8.2.9 mafft/7 anaconda/2.3.0 samtools" # make third party requirements available
hpc.select <- 1 # number of nodes
hpc.nproc <- 1 # number of processors on node
hpc.walltime <- 71 # walltime
hpc.q <- "pqeelab" # PBS queue
hpc.mem <- "6gb" # RAM
hpc.array <- pty.c[, max(CASE_ID)] # number of runs for job array
# define PBS header for job scheduler. this will depend on your job scheduler.
pbshead <- "#!/bin/sh"
tmp <- paste("#PBS -l walltime=", hpc.walltime, ":59:00,pcput=", hpc.walltime, ":45:00", sep = "")
pbshead <- paste(pbshead, tmp, sep = "\n")
tmp <- paste("#PBS -l select=", hpc.select, ":ncpus=", hpc.nproc,":mem=", hpc.mem, sep = "")
pbshead <- paste(pbshead, tmp, sep = "\n")
pbshead <- paste(pbshead, "#PBS -j oe", sep = "\n")
if(!is.na(hpc.array))
pbshead <- paste(pbshead, "\n#PBS -J 1-", hpc.array, sep='')
if(!is.na(hpc.q))
pbshead <- paste(pbshead, paste("#PBS -q", hpc.q), sep = "\n")
pbshead <- paste(pbshead, hpc.load, sep = "\n")
cat(pbshead)
# create PBS job array
cmd <- pty.c[, list(CASE=paste0(CASE_ID,')\n',CMD,';;\n')), by='CASE_ID']
cmd <- cmd[, paste0('case $PBS_ARRAY_INDEX in\n',paste0(CASE, collapse=''),'esac')]
cmd <- paste(pbshead,cmd,sep='\n')
# submit job
outfile <- gsub(':','',paste("readali",paste(strsplit(date(),split=' ')[[1]],collapse='_',sep=''),'sh',sep='.'))
outfile <- file.path(pty.args[['work.dir']], outfile)
cat(cmd, file=outfile)
cmd <- paste("qsub", outfile)
cat(cmd)
cat(system(cmd, intern= TRUE))
}
pty.MRC.stage1.generate.read.alignments<- function()
{
# set up working environment
require(Phyloscanner.R.utilities)
#HOME <<- '/rds/general/project/ratmann_pangea_analyses_mrc_uvri/live'
#data.dir <- '/rds/general/project/ratmann_pangea_deepsequencedata/live/PANGEA2_MRC'
#prog.pty <- '/rds/general/user/or105/home/phyloscanner/phyloscanner_make_trees.py'
HOME <<- '~/sandbox/DeepSeqProjects'
data.dir <- '~/sandbox/DeepSeqProjects/MRCPopSample_data'
prog.pty <- '/Users/Oliver/git/phylotypes/phyloscanner_make_trees.py'
in.dir <- file.path(HOME,'MRCPopSample_phsc_stage1_input')
work.dir <- file.path(HOME,"MRCPopSample_phsc_work")
out.dir <- file.path(HOME,"MRCPopSample_phsc_stage1_output")
dir.create(in.dir)
dir.create(work.dir)
dir.create(out.dir)
#
# define phyloscanner runs for all pairs of batches of individuals of the population-based sample
#
if(0)
{
infile <- 'phsc_runs_MRC_data.csv' # file listing all available bam files and corresponding individual IDs
outfile <- 'phsc_runs_MRC_stage1_n2531_181026.csv'
dbam <- as.data.table(read.csv(file.path(in.dir,infile)))
dbam <- subset(dbam, grepl('remap\\.bam',BAM))
setnames(dbam,'INDIVIDUAL','IND')
# make data.table with just individuals
dind <- unique(subset(dbam, select=c(IND)))
# create batches and create runs of pairs of batches
pty.runs<- phsc.define.stage1.analyses(dind$IND, batch.size=50)
# add BAM files per individual
pty.runs<- merge(pty.runs, subset(dbam, select=c(IND, BAM)), by='IND', allow.cartesian=TRUE)
setkey(pty.runs, PTY_RUN)
# save to file
write.csv(pty.runs, file=file.path(in.dir,outfile), row.names=FALSE)
}
if(1)
{
pty.runs<- as.data.table(read.csv(file.path(in.dir, "phsc_runs_MRC_stage1_n2531_181026.csv")))
}
#
# define phyloscanner input args to generate read alignments
#
pty.select <- NA #1:2
pty.args <- list( prog.pty=prog.pty,
prog.mafft='mafft',
data.dir=data.dir,
work.dir=work.dir,
out.dir=out.dir,
alignments.file=system.file(package="Phyloscanner.R.utilities", "HIV1_compendium_AD_B_CPX_v2.fasta"),
alignments.root='REF_CPX_AF460972',
alignments.pairwise.to='REF_B_K03455',
window.automatic='',
merge.threshold=2,
min.read.count=1,
quality.trim.ends=23,
min.internal.quality=23,
merge.paired.reads=TRUE,
no.trees=TRUE,
dont.check.duplicates=FALSE,
dont.check.recombination=TRUE,
win=c(800,9400,25,250),
keep.overhangs=FALSE,
mem.save=0,
verbose=TRUE,
select=pty.select #of 240
)
save(pty.args, file=file.path(in.dir, 'phsc_args_stage1_create_read_alignments.rda'))
#
# define bash scripts to generate read alignments for all pairs of batches
#
# check which (if any) batches have already been processed, and remove from TODO list
tmp <- data.table(FILE_FASTA=list.files(out.dir, pattern='^ptyr[0-9]+_trees$', full.names=TRUE))
tmp[, PTY_RUN:= as.integer(gsub('ptyr([0-9]+)_.*','\\1',basename(FILE_FASTA)))]
tmp <- merge(tmp, tmp[, list(FILE_FASTA_N= length(list.files(FILE_FASTA, pattern='fasta$'))), by='PTY_RUN'], by='PTY_RUN')
pty.runs <- merge(pty.runs, tmp, by='PTY_RUN', all.x=1)
pty.runs <- subset(pty.runs, !is.na(FILE_FASTA_N))
pty.runs <- subset(pty.runs, FILE_FASTA_N==0)
# search for bam files and references and merge with runs
pty.runs <- subset(pty.runs, select=c(PTY_RUN, IND))
tmp <- phsc.find.bam.and.references(pty.args[['data.dir']], regex.person='^([A-Z0-9]+-[A-Z0-9]+)-.*$')
pty.runs <- merge(pty.runs, tmp, by='IND')
# create UNIX bash scripts
pty.runs <- unique(pty.runs, by=c('PTY_RUN','UNIT_ID','BAM'))
pty.runs <- pty.runs[, list(BAM=BAM, REF=REF, SAMPLE=SAMPLE, RENAME_ID=paste0(IND,'-fq',seq_len(length(BAM)))), by=c('PTY_RUN','IND')]
setkey(pty.runs, PTY_RUN)
setnames(pty.runs, c('IND','SAMPLE'), c('UNIT_ID','SAMPLE_ID'))
pty.c <- phsc.cmd.phyloscanner.multi(pty.runs, pty.args)
pty.c[, CASE_ID:= seq_len(nrow(pty.c))]
#
# submit jobs to PBS job scheduler
#
# define PBS variables
hpc.load <- "module load intel-suite/2015.1 mpi raxml/8.2.9 mafft/7 anaconda/2.3.0 samtools" # make third party requirements available
hpc.select <- 1 # number of nodes
hpc.nproc <- 1 # number of processors on node
hpc.walltime <- 171 # walltime
hpc.q <- "pqeelab" # PBS queue
hpc.mem <- "12gb" # RAM
hpc.array <- pty.c[, max(CASE_ID)] # number of runs for job array
# define PBS header for job scheduler. this will depend on your job scheduler.
pbshead <- "#!/bin/sh"
tmp <- paste("#PBS -l walltime=", hpc.walltime, ":59:00,pcput=", hpc.walltime, ":45:00", sep = "")
pbshead <- paste(pbshead, tmp, sep = "\n")
tmp <- paste("#PBS -l select=", hpc.select, ":ncpus=", hpc.nproc,":mem=", hpc.mem, sep = "")
pbshead <- paste(pbshead, tmp, sep = "\n")
pbshead <- paste(pbshead, "#PBS -j oe", sep = "\n")
if(!is.na(hpc.array))
pbshead <- paste(pbshead, "\n#PBS -J 1-", hpc.array, sep='')
if(!is.na(hpc.q))
pbshead <- paste(pbshead, paste("#PBS -q", hpc.q), sep = "\n")
pbshead <- paste(pbshead, hpc.load, sep = "\n")
cat(pbshead)
# create PBS job array
cmd <- pty.c[, list(CASE=paste0(CASE_ID,')\n',CMD,';;\n')), by='CASE_ID']
cmd <- cmd[, paste0('case $PBS_ARRAY_INDEX in\n',paste0(CASE, collapse=''),'esac')]
cmd <- paste(pbshead,cmd,sep='\n')
# submit job
outfile <- gsub(':','',paste("readali",paste(strsplit(date(),split=' ')[[1]],collapse='_',sep=''),'sh',sep='.'))
outfile <- file.path(pty.args[['work.dir']], outfile)
cat(cmd, file=outfile)
cmd <- paste("qsub", outfile)
cat(cmd)
cat(system(cmd, intern= TRUE))
}
pty.MRC.stage1.zip.trees<- function()
{
require(data.table)
require(Phyloscanner.R.utilities)
# set up working environment
HOME <<- '/rds/general/project/ratmann_pangea_analyses_mrc_uvri/live'
data.dir <- '/rds/general/project/ratmann_pangea_deepsequencedata/live/PANGEA2_MRC'
#in.dir <- file.path(HOME,'MRCPopSample_phsc_stage1_output')
#work.dir <- file.path(HOME,"MRCPopSample_phsc_work")
#out.dir <- file.path(HOME,"MRCPopSample_phsc_stage1_output")
#
# zip trees
if(1)
{
indirs <- file.path(HOME,'MRCPopSample_phsc_stage1_output')
indirs <- file.path(HOME,'MRCPopSample_phsc_stage2_output_newali_250_HKC')
#
indirs <- list.files(indirs, pattern='^ptyr[0-9]+_trees$', full.names=TRUE)
allwin <- data.table(W_FROM=seq(800,9150,25))
#allwin <- data.table(W_FROM=seq(890,9150,30))
#allwin <- data.table(W_FROM=seq(800,9050,125))
#indirs <- '/work/or105/Gates_2014/2015_PANGEA_DualPairsFromFastQIVA/RakaiAll_output_170301_w250_s20_p35_stagetwo/ptyr97_trees'
for(i in seq_along(indirs))
{
indir <- indirs[i]
pty.run <- as.integer(gsub('^ptyr([0-9]+)_.*','\\1',basename(indir)))
# check if we have all fasta and tree files
infiles <- data.table(F=list.files(indir,pattern='ptyr.*fasta$',full.names=TRUE))
infiles[, W_FROM:= as.integer(gsub('.*_InWindow_([0-9]+)_.*','\\1',basename(F)))]
infiles <- merge(allwin, infiles, by='W_FROM', all.x=1)
missfs <- subset(infiles, is.na(F))[, W_FROM]
if(length(missfs))
cat('\nIn',indir,'Found missing fasta files for',paste(missfs,collapse=', '))
infiles <- data.table(F=list.files(indir,pattern='ptyr.*tree$',full.names=TRUE))
infiles[, W_FROM:= as.integer(gsub('.*_InWindow_([0-9]+)_.*','\\1',basename(F)))]
infiles <- merge(allwin, infiles, by='W_FROM', all.x=1)
misstrs <- subset(infiles, is.na(F))[, W_FROM]
if(length(misstrs))
cat('\nIn',indir,'Found missing tree files for',paste(misstrs,collapse=', '))
zipit <- 0
if(!length(missfs) & !length(misstrs))
{
cat('\nIn',indir,'Found all fasta and tree files')
zipit <- 1
}
#if(!length(setdiff(misstrs,missfs)))
#{
# cat('\nIn',indir,'Found all tree files for which there is a fasta file')
# zipit <- 1
#}
zipit <- 1
#
if(zipit)
{
cat('\nProcess',indir)
# first combine all zip files into ptyrXXX_otherstuff.zip
infiles <- data.table(F=list.files(indir,pattern='zip$',full.names=TRUE))
infiles[, PTY_RUN:= gsub('^ptyr([0-9]+)_.*','\\1',basename(F))]
stopifnot(!nrow(subset(infiles, is.na(PTY_RUN))))
tmp <- infiles[1, file.path(dirname(F),paste0('ptyr',PTY_RUN,'_otherstuff.zip'))]
cat('\nZip to file', tmp,'...\n')
suppressWarnings(invisible( infiles[, list(RTN= unzip(F, overwrite=FALSE, exdir=file.path(indir,'tmp42'))), by='F'] ))
invisible( infiles[, list(RTN= file.remove(F)), by='F'] )
invisible( zip( tmp, file.path(indir,'tmp42'), flags = "-umr9XTjq") )
# now zip fasta files
infiles <- data.table(F=list.files(indir,pattern='ptyr.*fasta$',full.names=TRUE))
infiles[, PTY_RUN:= gsub('^ptyr([0-9]+)_.*','\\1',basename(F))]
invisible( infiles[, file.rename(F, file.path(indir,'tmp42',basename(F))), by='F'] )
tmp <- infiles[1, file.path(dirname(F),paste0('ptyr',PTY_RUN,'_trees_fasta.zip'))]
invisible( zip( tmp, file.path(indir,'tmp42'), flags = "-umr9XTjq") )
# now zip tree files
infiles <- data.table(F=list.files(indir,pattern='^ptyr.*tree$',full.names=TRUE))
infiles[, PTY_RUN:= gsub('^ptyr([0-9]+)_.*','\\1',basename(F))]
invisible( infiles[, file.rename(F, file.path(indir,'tmp42',basename(F))), by='F'] )
tmp <- infiles[1, file.path(dirname(F),paste0('ptyr',PTY_RUN,'_trees_newick.zip'))]
invisible( zip( tmp, file.path(indir,'tmp42'), flags = "-umr9XTjq") )
# remove tmp dir
invisible( file.remove( file.path(indir,'tmp42') ) )
# move one level down
infiles <- data.table(F=list.files(indir, full.names=TRUE))
invisible( infiles[, file.rename(F, file.path(dirname(indir),basename(F))), by='F'] )
cat('\nDone',indir)
}
#if(!length(misstrs))
if(zipit)
invisible(unlink(indir, recursive=TRUE))
# expand again if asked to
#if(length(misstrs))
if(0)
{
cat('\nExtract',file.path(dirname(indir),paste0('ptyr',pty.run,'_trees_fasta.zip')))
unzip(file.path(dirname(indir),paste0('ptyr',pty.run,'_trees_fasta.zip')), junkpaths=TRUE, exdir=indir)
cat('\nExtract',file.path(dirname(indir),paste0('ptyr',pty.run,'_trees_newick.zip')))
unzip(file.path(dirname(indir),paste0('ptyr',pty.run,'_trees_newick.zip')), junkpaths=TRUE, exdir=indir)
}
}
}
}
pty.MRC.stage1.generate.trees<- function()
{
require(data.table)
require(Phyloscanner.R.utilities)
# set up working environment
HOME <<- '/rds/general/project/ratmann_pangea_analyses_mrc_uvri/live'
data.dir <- '/rds/general/project/ratmann_pangea_deepsequencedata/live/PANGEA2_MRC'
prog.pty <- '/rds/general/user/or105/home/phyloscanner/phyloscanner_make_trees.py'
#HOME <<- '~/sandbox/DeepSeqProjects'
#data.dir <- '~/sandbox/DeepSeqProjects/MRCPopSample_data'
#prog.pty <- '/Users/Oliver/git/phylotypes/phyloscanner_make_trees.py'
in.dir <- file.path(HOME,'MRCPopSample_phsc_stage1_output')
work.dir <- file.path(HOME,"MRCPopSample_phsc_work")
out.dir <- file.path(HOME,"MRCPopSample_phsc_stage1_output")
#
# generate trees
infiles <- data.table(FI=list.files(in.dir, pattern='fasta$', full.names=TRUE, recursive=TRUE))
infiles[, FO:= gsub('fasta$','tree',FI)]
infiles[, PTY_RUN:= as.integer(gsub('^ptyr([0-9]+)_.*','\\1',basename(FI)))]
infiles[, W_FROM:= as.integer(gsub('.*InWindow_([0-9]+)_.*','\\1',basename(FI)))]
# check which (if any) trees have already been processed, and remove from TODO list
tmp <- data.table(FT=list.files(out.dir, pattern='^ptyr.*tree$', full.names=TRUE, recursive=TRUE))
tmp[, PTY_RUN:= as.integer(gsub('^ptyr([0-9]+)_.*','\\1',basename(FT)))]
tmp[, W_FROM:= as.integer(gsub('.*InWindow_([0-9]+)_.*','\\1',basename(FT)))]
infiles <- merge(infiles, tmp, by=c('PTY_RUN','W_FROM'), all.x=1)
infiles <- subset(infiles, is.na(FT))
setkey(infiles, PTY_RUN, W_FROM)
# determine number of taxa per fasta file, and sort alignments by size
# so that trees with smallest size are generated first
#tmp <- infiles[, list(N_TAXA=as.integer(system(paste0('grep -c "^>" ', FI), intern=TRUE))), by=c('FI')]
#infiles <- merge(infiles, tmp, by='FI')
#infiles <- infiles[order(N_TAXA),]
#
# define pbs header, max 10,000 trees (re-start a few times)
# create header first because RAXML call depends on single-core / multi-core
# TODO:
# 30 runs are roughly 10,000 trees
# every time you set up a new job array, select the next 30 runs.
# For example, the job array will be 151:180
#infiles <- subset(infiles, PTY_RUN%in%c(1101:1400))
#infiles <- subset(infiles, PTY_RUN%in%c(1001:1400))
infiles[, CASE_ID2:= seq_len(nrow(infiles))]
infiles[, CASE_ID:= ceiling(CASE_ID2/1)]
print(infiles)
#infiles[, CASE_ID:= ceiling(CASE_ID2/1)]
stop()
hpc.load <- "module load intel-suite/2015.1 mpi raxml/8.2.9" # make third party requirements available
hpc.select <- 1 # number of nodes
hpc.nproc <- 1 # number of processors on node
hpc.walltime <- 123 # walltime
# TODO:
# run either this block to submit a job array to college machines
# the choice depends on whether the previous job array on college machines is done,
# or on whether the previous job array on Oliver's machines is done.
if(0)
{
hpc.q <- NA # PBS queue
hpc.mem <- "2gb" # RAM
raxml.pr <- ifelse(hpc.nproc==1, 'raxmlHPC-SSE3', 'raxmlHPC-PTHREADS-SSE3') #on older machines without AVX instructions
}
# or run this block to submit a job array to Oliver's machines
if(1)
{
hpc.q <- "pqeelab" # PBS queue
hpc.mem <- "6gb" # RAM
raxml.pr <- ifelse(hpc.nproc==1, 'raxmlHPC-AVX','raxmlHPC-PTHREADS-AVX') #on newer machines with AVX instructions
}
#
#
hpc.array <- infiles[, max(CASE_ID)] # number of runs for job array
# define PBS header for job scheduler. this will depend on your job scheduler.
pbshead <- "#!/bin/sh"
tmp <- paste("#PBS -l walltime=", hpc.walltime, ":59:00,pcput=", hpc.walltime, ":45:00", sep = "")
pbshead <- paste(pbshead, tmp, sep = "\n")
tmp <- paste("#PBS -l select=", hpc.select, ":ncpus=", hpc.nproc,":mem=", hpc.mem, sep = "")
pbshead <- paste(pbshead, tmp, sep = "\n")
pbshead <- paste(pbshead, "#PBS -j oe", sep = "\n")
if(!is.na(hpc.array))
pbshead <- paste(pbshead, "\n#PBS -J 1-", hpc.array, sep='')
if(!is.na(hpc.q))
pbshead <- paste(pbshead, paste("#PBS -q", hpc.q), sep = "\n")
pbshead <- paste(pbshead, hpc.load, sep = "\n")
cat(pbshead)
#
# create UNIX bash script to generate trees with RAxML
#
raxml.args <- ifelse(hpc.nproc==1, '-m GTRCAT --HKY85 -p 42 -o REF_B_K03455', paste0('-m GTRCAT --HKY85 -T ',hpc.nproc,' -p 42 -o REF_B_K03455'))
pty.c <- infiles[, list(CMD=raxml.cmd(FI, outfile=FO, pr=raxml.pr, pr.args=raxml.args)), by=c('CASE_ID','CASE_ID2')]
pty.c <- pty.c[, list(CMD=paste(CMD, collapse='\n')), by='CASE_ID']
pty.c[1,cat(CMD)]
#
# make array job
cmd <- pty.c[, list(CASE=paste0(CASE_ID,')\n',CMD,';;\n')), by='CASE_ID']
cmd <- cmd[, paste0('case $PBS_ARRAY_INDEX in\n',paste0(CASE, collapse=''),'esac')]
cmd <- paste(pbshead,cmd,sep='\n')
# submit job
outfile <- gsub(':','',paste("trs",paste(strsplit(date(),split=' ')[[1]],collapse='_',sep=''),'sh',sep='.'))
outfile <- file.path(work.dir, outfile)
cat(cmd, file=outfile)
cmd <- paste("qsub", outfile)
cat(cmd)
cat(system(cmd, intern= TRUE))
}
pty.MRC.stage2.generate.trees<- function()
{
require(data.table)
require(Phyloscanner.R.utilities)
# set up working environment
if(1)
{
HOME <<- '/rds/general/project/ratmann_pangea_analyses_mrc_uvri/live'
data.dir <- '/rds/general/project/ratmann_pangea_deepsequencedata/live/PANGEA2_MRC'
}
if(0)
{
HOME <<- '~/sandbox/DeepSeqProjects'
data.dir <- '~/sandbox/DeepSeqProjects/MRCPopSample_data'
}
if(0)
{
in.dir <- file.path(HOME,'MRCPopSample_phsc_stage2_output_newali_HKC')
out.dir <- file.path(HOME,"MRCPopSample_phsc_stage2_output_newali_HKC")
work.dir <- file.path(HOME,"MRCPopSample_phsc_work")
ali.root <- 'REF_consensus'
tree.args <- '-m GTRCAT --HKY85'
}
if(1)
{
in.dir <- file.path(HOME,'MRCPopSample_phsc_stage2_output_newali_250_HKC')
out.dir <- file.path(HOME,"MRCPopSample_phsc_stage2_output_newali_250_HKC")
work.dir <- file.path(HOME,"MRCPopSample_phsc_work")
ali.root <- 'REF_consensus'
tree.args <- '-m GTRCAT --HKY85'
}
if(0)
{
in.dir <- file.path(HOME,'MRCPopSample_phsc_stage2_output_newali_300')
work.dir <- file.path(HOME,"MRCPopSample_phsc_work")
out.dir <- file.path(HOME,"MRCPopSample_phsc_stage2_output_newali_300")
ali.root <- 'REF_consensus'
tree.args <- '-m GTRCAT --HKY85'
}
#
# generate trees
infiles <- data.table(FI=list.files(in.dir, pattern='fasta$', full.names=TRUE, recursive=TRUE))
infiles[, FO:= gsub('fasta$','tree',FI)]
infiles[, PTY_RUN:= as.integer(gsub('^ptyr([0-9]+)_.*','\\1',basename(FI)))]
infiles[, W_FROM:= as.integer(gsub('.*InWindow_([0-9]+)_.*','\\1',basename(FI)))]
# check which (if any) trees have already been processed, and remove from TODO list
tmp <- data.table(FT=list.files(out.dir, pattern='^ptyr.*tree$', full.names=TRUE, recursive=TRUE))
tmp[, PTY_RUN:= as.integer(gsub('^ptyr([0-9]+)_.*','\\1',basename(FT)))]
tmp[, W_FROM:= as.integer(gsub('.*InWindow_([0-9]+)_.*','\\1',basename(FT)))]
infiles <- merge(infiles, tmp, by=c('PTY_RUN','W_FROM'), all.x=1)
infiles <- subset(infiles, is.na(FT))
setkey(infiles, PTY_RUN, W_FROM)
# determine number of taxa per fasta file, and sort alignments by size
# so that trees with smallest size are generated first
#tmp <- infiles[, list(N_TAXA=as.integer(system(paste0('grep -c "^>" ', FI), intern=TRUE))), by=c('FI')]
#infiles <- merge(infiles, tmp, by='FI')
#infiles <- infiles[order(N_TAXA),]
#
# define pbs header, max 10,000 trees (re-start a few times)
# create header first because RAXML call depends on single-core / multi-core
# TODO:
# 30 runs are roughly 10,000 trees
# every time you set up a new job array, select the next 30 runs.
# For example, the job array will be 151:180
#infiles <- subset(infiles, PTY_RUN%in%c(1101:1400))
#infiles <- subset(infiles, PTY_RUN%in%c(1001:1400))
infiles[, CASE_ID2:= seq_len(nrow(infiles))]
infiles[, CASE_ID:= ceiling(CASE_ID2/1)]
print(infiles)
#infiles[, CASE_ID:= ceiling(CASE_ID2/1)]
stop()
hpc.load <- "module load intel-suite/2015.1 mpi raxml/8.2.9" # make third party requirements available
hpc.select <- 1 # number of nodes
hpc.nproc <- 1 # number of processors on node
hpc.walltime <- 123 # walltime
# TODO:
# run either this block to submit a job array to college machines
# the choice depends on whether the previous job array on college machines is done,
# or on whether the previous job array on Oliver's machines is done.
if(0)
{
hpc.q <- NA # PBS queue
hpc.mem <- "2gb" # RAM
raxml.pr <- ifelse(hpc.nproc==1, 'raxmlHPC-SSE3', 'raxmlHPC-PTHREADS-SSE3') #on older machines without AVX instructions
}
# or run this block to submit a job array to Oliver's machines
if(1)
{
hpc.q <- "pqeelab" # PBS queue
hpc.mem <- "6gb" # RAM
raxml.pr <- ifelse(hpc.nproc==1, 'raxmlHPC-AVX','raxmlHPC-PTHREADS-AVX') #on newer machines with AVX instructions
}
#
#
hpc.array <- infiles[, max(CASE_ID)] # number of runs for job array
# define PBS header for job scheduler. this will depend on your job scheduler.
pbshead <- "#!/bin/sh"
tmp <- paste("#PBS -l walltime=", hpc.walltime, ":59:00,pcput=", hpc.walltime, ":45:00", sep = "")
pbshead <- paste(pbshead, tmp, sep = "\n")
tmp <- paste("#PBS -l select=", hpc.select, ":ncpus=", hpc.nproc,":mem=", hpc.mem, sep = "")
pbshead <- paste(pbshead, tmp, sep = "\n")
pbshead <- paste(pbshead, "#PBS -j oe", sep = "\n")
if(!is.na(hpc.array))
pbshead <- paste(pbshead, "\n#PBS -J 1-", hpc.array, sep='')
if(!is.na(hpc.q))
pbshead <- paste(pbshead, paste("#PBS -q", hpc.q), sep = "\n")
pbshead <- paste(pbshead, hpc.load, sep = "\n")
cat(pbshead)
#
# create UNIX bash script to generate trees with RAxML
#
raxml.args <- ifelse(hpc.nproc==1, paste0(tree.args,' -p 42 -o ', ali.root), paste0(tree.args,' -T ',hpc.nproc,' -p 42 -o ',ali.root))
pty.c <- infiles[, list(CMD=raxml.cmd(FI, outfile=FO, pr=raxml.pr, pr.args=raxml.args)), by=c('CASE_ID','CASE_ID2')]
pty.c <- pty.c[, list(CMD=paste(CMD, collapse='\n')), by='CASE_ID']
pty.c[1,cat(CMD)]
#
# make array job
cmd <- pty.c[, list(CASE=paste0(CASE_ID,')\n',CMD,';;\n')), by='CASE_ID']
cmd <- cmd[, paste0('case $PBS_ARRAY_INDEX in\n',paste0(CASE, collapse=''),'esac')]
cmd <- paste(pbshead,cmd,sep='\n')
# submit job
outfile <- gsub(':','',paste("trs",paste(strsplit(date(),split=' ')[[1]],collapse='_',sep=''),'sh',sep='.'))
outfile <- file.path(work.dir, outfile)
cat(cmd, file=outfile)
cmd <- paste("qsub", outfile)
cat(cmd)
cat(system(cmd, intern= TRUE))
}
pty.2.infer.phylo.relationships.on.stage2.trees<- function()
{
require(Phyloscanner.R.utilities)
#
# INPUT ARGS PLATFORM
if(1)
{
HOME <<- '~/sandbox/DeepSeqProjects'
# unzip files in Data Set S1 to directory with full path name as in 'in.dir'
# Data Set S1 contains batches of deep sequence trees that are to be processed with phyloscanner
in.dir <- file.path(HOME,'RakaiPopSample_deepseqtrees')
# define output directory
out.dir <- file.path(HOME,"RakaiPopSample_phyloscanner_out")
# define directory for temporary files
work.dir <- file.path(HOME,"RakaiPopSample_phyloscanner_work")
# define path to 'phyloscanner_make_trees.py'
prog.pty <- '/Users/Oliver/git/phylotypes/phyloscanner_make_trees.py'
}
# create directories if they dont exist
dir.create(out.dir, showWarnings=FALSE)
dir.create(work.dir, showWarnings=FALSE)
#
# phyloscanner default input arguments to process deep-sequence trees
pty.args <- list( prog.pty=prog.pty,
prog.mafft=NA,
prog.raxml=NA,
data.dir=NA,
work.dir=work.dir,
out.dir=out.dir,
alignments.file=system.file(package="Phyloscanner.R.utilities", "HIV1_compendium_AD_B_CPX_v2.fasta"),
alignments.root='REF_CPX_AF460972',
alignments.pairwise.to='REF_B_K03455',
bl.normalising.reference.file=system.file(package="Phyloscanner.R.utilities", "data", "hiv.hxb2.norm.constants.rda"),
bl.normalising.reference.var='MEDIAN_PWD',
window.automatic= '',
merge.threshold=0,
min.read.count=1,
quality.trim.ends=23,
min.internal.quality=23,
merge.paired.reads=TRUE,
no.trees=FALSE,
dont.check.duplicates=FALSE,
dont.check.recombination=TRUE,
num.bootstraps=1,
all.bootstrap.trees=TRUE,
strip.max.len=350,
min.ureads.individual=NA,
win=c(800,9400,25,250),
keep.overhangs=FALSE,
use.blacklisters=c('ParsimonyBasedBlacklister','DownsampleReads'),
tip.regex='^(.*)_fq[0-9]+_read_([0-9]+)_count_([0-9]+)$',
roguesubtree.kParam=20,
roguesubtree.prop.threshold=0,
roguesubtree.read.threshold=20,
dwns.maxReadsPerPatient=50,
multifurcation.threshold=1e-5,
split.rule='s',
split.kParam=20,
split.proximityThreshold=0,
split.readCountsMatterOnZeroBranches=TRUE,
split.pruneBlacklist=FALSE,
trms.allowMultiTrans=TRUE,
pw.trmw.min.reads=30,
pw.trmw.min.tips=1,
pw.trmw.close.brl=0.025,
pw.trmw.distant.brl=0.05,
pw.prior.keff=2,
pw.prior.neff=3,
pw.prior.keff.dir=2,
pw.prior.neff.dir=3,
pw.prior.calibrated.prob=0.66,
mem.save=0,
verbose=TRUE,
select=NA
)
save(pty.args, file=file.path(out.dir, 'pty.args.rda'))
#
# phyloscanner runs on batches of deep-sequence trees
if(1)
{
# for each batch of trees to be processed:
# find list of patients in input directory
pty.c <- data.table(FILE_BAM=list.files(in.dir, pattern='_patients.txt', full.names=TRUE))
pty.c[, PTY_RUN:= as.integer(gsub('ptyr','',gsub('_patients.txt','',basename(FILE_BAM))))]
# check which (if any) batches have already been processed, and remove from TODO list
tmp <- data.table(FILE_TRMW=list.files(out.dir, pattern='_pairwise_relationships.rda', full.names=TRUE))
tmp[, PTY_RUN:= as.integer(gsub('ptyr','',gsub('_pairwise_relationships.rda','',basename(FILE_TRMW))))]
pty.c <- merge(pty.c, tmp, by='PTY_RUN', all.x=1)
pty.c <- subset(pty.c, is.na(FILE_TRMW))
# for each batch of deep-sequence trees: create bash script to run phyloscanner
setkey(pty.c, PTY_RUN)
pty.c <- pty.c[, {
#FILE_BAM<- '/work/or105/Gates_2014/2015_PANGEA_DualPairsFromFastQIVA/Rakai_ptoutput_160915_couples_w270/ptyr1_bam.txt'
#FILE_BAM<- '/Users/Oliver/Dropbox (SPH Imperial College)/2015_PANGEA_DualPairsFromFastQIVA/Rakai_ptoutput_160915_couples_w270/ptyr1_bam.txt'
#cat('\n',FILE_BAM)
prefix.infiles <- gsub('patients.txt','',FILE_BAM)
cmd <- phsc.cmd.phyloscanner.one.resume(prefix.infiles, pty.args)
list(CMD=cmd)
}, by='PTY_RUN']
pty.c[1,cat(CMD)]
}
#
# run phyloscanner on each batch
# option1: process each batch on a standard UNIX desktop
if(1)
{
# write bash scripts to file
invisible(pty.c[, {
outfile <- gsub(':','',paste("phsc",paste(strsplit(date(),split=' ')[[1]],collapse='_',sep=''),'sh',sep='.'))
outfile <- file.path(pty.args[['work.dir']], outfile)
cat(CMD, file=outfile)
Sys.chmod(outfile, mode="777")
Sys.sleep(1)
}, by='PTY_RUN'])
# now run each bash script manually on the command prompt
}
#
# run phyloscanner on each batch
# option2: process each batch on a high performance environment with a job scheduling system
# write bash scripts to file and submit to job scheduling system on a high performance environment
if(0)
{
# define variables for PBS header. this will depend on your job scheduler.
hpc.load <- "module load R/3.3.3" # make R available
hpc.select <- 1 # number of nodes
hpc.nproc <- 1 # number of processors on node
hpc.walltime <- 15 # walltime
hpc.q <- "pqeelab" # PBS queue
hpc.mem <- "6gb" # RAM
# define PBS header
pbshead <- "#!/bin/sh"
tmp <- paste("#PBS -l walltime=", hpc.walltime, ":59:59,pcput=", hpc.walltime, ":45:00", sep = "")
pbshead <- paste(pbshead, tmp, sep = "\n")
tmp <- paste("#PBS -l select=", hpc.select, ":ncpus=", hpc.nproc,":mem=", hpc.mem, sep = "")
pbshead <- paste(pbshead, tmp, sep = "\n")
pbshead <- paste(pbshead, "#PBS -j oe", sep = "\n")
if (!is.na(hpc.q))
pbshead <- paste(pbshead, paste("#PBS -q", hpc.q), sep = "\n")
pbshead <- paste(pbshead, hpc.load, sep = "\n")
# for each batch of deep-sequence trees: submit PBS job
invisible(pty.c[, {
cmd <- paste(pbshead,'cd $TMPDIR',sep='\n')
cmd <- paste(cmd,CMD,sep='\n')
outfile <- gsub(':','',paste("phsc",paste(strsplit(date(),split=' ')[[1]],collapse='_',sep=''),'sh',sep='.'))
outfile <- file.path(pty.args[['work.dir']], outfile)
cat(CMD, file=outfile)
cmd <- paste("qsub", outfile)
cat(cmd)
cat(system(cmd, intern= TRUE))
Sys.sleep(1)
}, by='PTY_RUN'])
}
}
pty.3.infer.transmission.networks.from.phylo.relationships<- function()
{
require(Phyloscanner.R.utilities)
# setting up workspace
HOME <- '~/sandbox/DeepSeqProjects'
indir <- file.path(HOME, 'RakaiPopSample_phyloscanner_analysis')
outdir <- indir
outfile.base <- file.path(outdir,'phsc_analysis_of_dataset_S1')
neff.cut <- 3
conf.cut <- 0.6
# get meta-data for individuals in general pop cohort
infile <- "~/Dropbox (SPH Imperial College)/2017_phyloscanner_validation/Supp_Data/Data_Set_S2.csv"
dmeta <- as.data.table(read.csv(infile, stringsAsFactors=FALSE))
# get pairs between whom linkage is not excluded
tmp <- phsc.find.linked.pairs(indir, batch.regex='^ptyr([0-9]+)_.*', conf.cut=conf.cut, neff.cut=neff.cut, verbose=TRUE, dmeta=dmeta)
rtp <- copy(tmp$linked.pairs)
rplkl <- copy(tmp$relationship.counts)
rpw <- copy(tmp$windows)
save(rtp, rplkl, rpw, file=paste0(outfile.base,'_allpairs.rda'))
# plot phyloscan
rpw2 <- subset(rpw, ID1=='RkA04565F' & ID2=='RkA05315F')
p <- phsc.plot.phyloscan(rpw2)
p
ggsave(file=paste0(outfile.base,'_phyloscan_RkA04565F_RkA05315F.png'), width=6, height=2.8, units='in', dpi=400)
# reconstruct transmission networks
tmp <- phsc.find.transmission.networks.from.linked.pairs(rtp, rplkl, conf.cut=conf.cut, neff.cut=neff.cut, verbose=TRUE)
rtn <- copy(tmp$transmission.networks)
rtnn <- copy(tmp$most.likely.transmission.chains)
save(rtn, rtnn, file=paste0(outfile.base,'_allnetworks.rda'))
# plot transmission networks: one example
idclus <- sort(unique(rtn$IDCLU))
di <- copy(dmeta)
df <- subset(rtn, IDCLU==idclus[34])
set(df, NULL, c('ID1_SEX','ID2_SEX'), NULL)
p <- phsc.plot.transmission.network(df, di, point.size=10,
edge.gap=0.04,
edge.size=0.4,
curvature= -0.2,
arrow=arrow(length=unit(0.04, "npc"), type="open"),
curv.shift=0.06,
label.size=3,
node.label='ID',
node.fill='SEX',
node.shape.values=c('NA'=16),
node.fill.values=c('F'='hotpink2', 'M'='steelblue2'),
threshold.linked=0.6)
png(file=paste0(outfile.base,'_trmnetwork_34.png'), width=6, height=6, units='in', res=400)
print(p)
dev.off()
# plot the corresponding most likely transmission chain
layout <- p$layout
di <- copy(dmeta)
df <- subset(rtnn, IDCLU==idclus[34])
p2 <- phsc.plot.most.likely.transmission.chain(df, di, point.size=10,
edge.gap=0.04,
edge.size=0.4,
curvature= -0.2,
arrow=arrow(length=unit(0.04, "npc"), type="open"),
curv.shift=0.06,
label.size=3,
node.label='ID',
node.fill='SEX',
node.shape.values=c('NA'=16),
node.fill.values=c('F'='hotpink2', 'M'='steelblue2'),
threshold.linked=0.6,
layout=layout)
png(file=paste0(outfile.base,'_trmchain_34.png'), width=6, height=6, units='in', res=400)
print(p2)
dev.off()
}
pty.4.highlysupported.links<- function()
{
require(Phyloscanner.R.utilities)
# setting up workspace
HOME <- '~/sandbox/DeepSeqProjects'
indir <- file.path(HOME, 'RakaiPopSample_phyloscanner_analysis')
# load pairs and networks
infile.pairs <- file.path(indir,'phsc_analysis_of_dataset_S1_allpairs.rda')
infile.networks <- file.path(indir,'phsc_analysis_of_dataset_S1_allnetworks.rda')
load( infile.pairs ) # loads rtp, rplkl, rpw
load( infile.networks ) # loads rtn, rtnn
# classify linkages
conf.cut <- 0.6
rtnn[, SELECT:= NA_character_]
set(rtnn, rtnn[, which(is.na(PTY_RUN))], 'SELECT', 'insufficient deep sequence data for at least one individual')
set(rtnn, rtnn[, which(!is.na(PTY_RUN) & is.na(LINK_12) & is.na(LINK_21))], 'SELECT', 'ph unlinked pair')
set(rtnn, rtnn[, which(!is.na(POSTERIOR_SCORE_LINKED) & POSTERIOR_SCORE_LINKED<=conf.cut)], 'SELECT', 'unclear if pair ph linked or unlinked')
set(rtnn, rtnn[, which(!is.na(POSTERIOR_SCORE_LINKED) & POSTERIOR_SCORE_LINKED>conf.cut)], 'SELECT', 'ph linked pair direction not resolved')
set(rtnn, rtnn[, which(!is.na(POSTERIOR_SCORE_LINKED) & POSTERIOR_SCORE_LINKED>conf.cut & POSTERIOR_SCORE_12>conf.cut)], 'SELECT', 'ph linked pair direction 12')
set(rtnn, rtnn[, which(!is.na(POSTERIOR_SCORE_LINKED) & POSTERIOR_SCORE_LINKED>conf.cut & POSTERIOR_SCORE_21>conf.cut)], 'SELECT', 'ph linked pair direction 21')
# add gender
tmp <- subset(rtp, select=c(ID1, ID2, ID1_SEX, ID2_SEX))
rtnn <- merge(rtnn, tmp, by=c('ID1','ID2'))
# re-order so male is ID1
tmp <- subset(rtnn, ID1_SEX=='F' & ID2_SEX=='M')
setnames(tmp, colnames(tmp), gsub('xx','ID2',gsub('ID2','ID1',gsub('ID1','xx',gsub('xx','12',gsub('12','21',gsub('21','xx',colnames(tmp))))))))
set(tmp, NULL, 'SELECT', tmp[, gsub('xx','12',gsub('12','21',gsub('21','xx',SELECT)))])
rtnn <- rbind(subset(rtnn, !(ID1_SEX=='F' & ID2_SEX=='M')), tmp)
# define gender pair
rtnn[, PAIR_SEX:= paste0(ID1_SEX,ID2_SEX)]
# select pairs with highly supported linkages
rtp <- subset(rtnn, !grepl('unlinked|insufficient',SELECT))
rtp[, table(PAIR_SEX)]
#FF MF MM
#80 376 81
# conclusion: high proportion of missed intermediates
# select male-female pairs with highly supported direction
rtpd <- subset(rtnn, ID1_SEX=='M' & ID2_SEX=='F' & grepl('direction 12|direction 21',SELECT))
set(rtpd, NULL, 'SELECT', rtpd[, gsub('ph linked pair direction 21','fm',gsub('ph linked pair direction 12','mf',SELECT))])
setnames(rtpd, 'SELECT', 'PHYSCANNER_DIR')
rtpd <- subset(rtpd, select=c(ID1, ID2, PHYSCANNER_DIR))
# 293/376
# read pairs with clinical data on direction (assuming these are linked)
infile <- '~/sandbox/DeepSeqProjects/RakaiPopSample_data/Dataset_S3.csv'
red <- as.data.table(read.csv(infile))
setnames(red, c('MALE_ID','FEMALE_ID'), c('ID1','ID2'))
rtpd <- merge(rtpd, red, by=c('ID1','ID2'))
rtpd[, PHYSCANNER_DIR_CONSISTENT:= as.integer(PHYSCANNER_DIR==EPID_EVIDENCE_DIR)]
rtpd[, table(PHYSCANNER_DIR_CONSISTENT)]
}
vignetter.make<- function()
{
require(rmarkdown)
require(markdowntemplates)
setwd('/Users/Oliver/git/phyloscanner/phyloscannerR/vignettes')
infile <- 'phyloscannerR_reconstruct_transmission_networks.Rmd'
rmarkdown::render(infile, output_format='pdf_document')
}
pty.MRC.stage3.evaluate.distances <- function()
{
require(tidyverse)
file.nets <- '/Users/or105/Box/OR_Work/2019/2019_PANGEA_BBosa/MRCPopSample_phsc_stage2_output_newali_250_HKC_analysis/MRCUVRI_phscnetworks_190827.rda'
load(file.nets)
# calculate mean subgraph distance between any pair
tmp <- dw %>% group_by(PTY_RUN, H1, H2) %>%
summarize(MPD:= mean(PATRISTIC_DISTANCE),
MPDA := mean(PATRISTIC_DISTANCE[ADJACENT])) %>%
ungroup()
# select pairs in networks between whom a link is not rejected
tmp <- dnet %>% filter(TYPE=='not.close.or.nonadjacent' & SCORE<0.6) %>%
select(PTY_RUN, H1, H2) %>%
left_join(tmp, by=c('PTY_RUN','H1','H2'))
dnet %>% select(PTY_RUN, H1, H2, IDCLU, CLU_SIZE) %>%
distinct() %>%
left_join(tmp, by=c('PTY_RUN','H1','H2')) %>%
mutate(CLU_CAT:= cut(CLU_SIZE, breaks=c(1,2,5,10,20))) %>%
group_by(CLU_CAT) %>%
summarise(MPDA_MEDIAN:= quantile(MPDA, p=0.5),
MPDA_CL:= quantile(MPDA, p=0.25),
MPDA_CU:= quantile(MPDA, p=0.75)) %>%
ungroup()
}
pty.MRC.stage2.get.networks<- function()
{
require(tidyverse)
require(data.table)
require(RBGL)
require(igraph)
require(network)
require(ggnet)
require(phyloscannerR)
if(0)
{
indir <- '/Users/or105/Box/OR_Work/2019/2019_PANGEA_BBosa/MRCPopSample_phsc_stage2_output_newali_300_HKC_phsc'
file.pairs <- '/Users/or105/Box/OR_Work/2019/2019_PANGEA_BBosa/MRCPopSample_phsc_stage2_output_newali_300_HKC_analysis/MRCUVRI_phscallpairs_190827.rda'
file.nets <- '/Users/or105/Box/OR_Work/2019/2019_PANGEA_BBosa/MRCPopSample_phsc_stage2_output_newali_300_HKC_analysis/MRCUVRI_phscnetworks_190827.rda'
}
if(1)
{
indir <- '/Users/or105/Box/OR_Work/2019/2019_PANGEA_BBosa/MRCPopSample_phsc_stage2_output_newali_250_HKC_phsc'
file.pairs <- '/Users/or105/Box/OR_Work/2019/2019_PANGEA_BBosa/MRCPopSample_phsc_stage2_output_newali_250_HKC_analysis/MRCUVRI_phscallpairs_190827.rda'
file.nets <- '/Users/or105/Box/OR_Work/2019/2019_PANGEA_BBosa/MRCPopSample_phsc_stage2_output_newali_250_HKC_analysis/MRCUVRI_phscnetworks_190827.rda'
}
# find all pairs between whom transmission is not excluded
control <- list(linked.group='close.and.adjacent.cat',linked.no='not.close.or.nonadjacent',linked.yes='close.and.adjacent', conf.cut=0.6, neff.cut=3)
tmp <- find.pairs.in.networks(indir, batch.regex='^ptyr([0-9]+)_.*', control=control, verbose=TRUE)
dpl <- copy(tmp$network.pairs)
dc <- copy(tmp$relationship.counts)
dw <- copy(tmp$windows)
save(dpl, dc, dw, file=file.pairs)
# construct networks between pairs
tmp <- find.networks(dc, control=control, verbose=TRUE)
dnet <- copy(tmp$transmission.networks)
dchain <- copy(tmp$most.likely.transmission.chains)
# construct highly supported pairs
conf.cut <- 0.6
dconfpairs <- dchain %>%
filter( SCORE_LINKED>conf.cut & pmax(SCORE_DIR_12, SCORE_DIR_21)>conf.cut) %>%
select(-c(SCORE_NW_12, SCORE_NW_12, SCORE_NW_AMB))
save(dpl, dc, dw, dnet, dchain, dconfpairs, file=file.nets)
load(file.nets)
setdiff(unique(sort( dnet%>%pull(IDCLU) )), unique(sort( dchain%>%pull(IDCLU) )) )
# 34 80 96
# plot all networks
idclus <- sort(unique(dnet$IDCLU))
control<- list()
control$point.size = 10
control$edge.gap = 0.04
control$edge.size = 2
control$curvature = -0.2
control$arrow = arrow(length = unit(0.04, "npc"), type = "open")
control$curv.shift = 0.06
control$label.size = 3
control$node.label = "H"
control$node.fill = NA_character_
control$node.shape = NA_character_
control$node.shape.values = c(`NA` = 16)
control$node.fill.values = c(`NA` = "steelblue2")
control$threshold.linked = 0.6
pns <- lapply(seq_along(idclus), function(i)
{
idclu <- idclus[i]
df <- dnet %>%
filter(IDCLU == idclu)
di <- df %>%
select(H1, H2) %>%
gather('HOST_TYPE','H') %>%
select(-HOST_TYPE) %>%
distinct()
p <- phyloscannerR:::plot.network(df, di, control)
p
})
pdf(file= gsub('\\.rda','_trmnetwork_all.pdf',file.nets), w=8, h=8)
for(i in seq_along(pns))
print(pns[[i]])
dev.off()
# plot corresponding most likely chains
pcs <- lapply(seq_along(idclus), function(i)
{
idclu <- idclus[i]
control$layout <- pns[[i]][['layout']]
df <- dchain %>%
filter(IDCLU == idclu)
di <- df %>%
select(H1, H2) %>%
gather('HOST_TYPE','H') %>%
select(-HOST_TYPE) %>%
distinct()
p <- phyloscannerR:::plot.chain(df, di, control)
p
})
pdf(file= gsub('\\.rda','_mostlikelychains_all.pdf',file.nets), w=8, h=8)
for(i in seq_along(pns))
print(pns[[i]])
dev.off()
# plot phyloscans of all likely pairs
hosts <- dw %>% select(H1, H2) %>%
gather('HOST_TYPE','H') %>%
select(-HOST_TYPE) %>%
distinct() %>%
arrange(H) %>%
pull(H)
tmp <- copy(dw)
tmp <- produce.pairwise.graphs2(NULL, hosts=hosts, dwin=tmp, inclusion = "both")
pdf(file= gsub('\\.rda','_phyloscans_all.pdf',file.nets), w=10, h=250)
print( tmp$graph )
dev.off()
}
pty.MRC.stage2.get.phylo.relationships<- function()
{
require(tidyverse)
require(data.table)
require(phyloscannerR)
if(0)
{
tree.dir <- '/rds/general/project/ratmann_pangea_analyses_mrc_uvri/live/MRCPopSample_phsc_stage2_output_newali_300_HKC'
tmpdir <- '/rds/general/project/ratmann_pangea_analyses_mrc_uvri/live/MRCPopSample_phsc_tmp'
outdir <- '/rds/general/project/ratmann_pangea_analyses_mrc_uvri/live/MRCPopSample_phsc_stage2_output_newali_300_HKC_phsc'
prog.phyloscanner_analyse_trees <- '/rds/general/user/or105/home/libs_sandbox/phyloscanner/phyloscanner_analyse_trees.R'
}
if(1)
{
tree.dir <- '/rds/general/project/ratmann_pangea_analyses_mrc_uvri/live/MRCPopSample_phsc_stage2_output_newali_250_HKC'
tmpdir <- '/rds/general/project/ratmann_pangea_analyses_mrc_uvri/live/MRCPopSample_phsc_tmp'
outdir <- '/rds/general/project/ratmann_pangea_analyses_mrc_uvri/live/MRCPopSample_phsc_stage2_output_newali_250_HKC_phsc'
prog.phyloscanner_analyse_trees <- '/rds/general/user/or105/home/libs_sandbox/phyloscanner/phyloscanner_analyse_trees.R'
}
# set phyloscanner variables
# arguments as used for RCCS analysis
control <- list()
control$allow.mt <- TRUE
control$alignment.file.directory = NULL
control$alignment.file.regex = NULL
control$blacklist.underrepresented = FALSE
control$count.reads.in.parsimony = TRUE
control$distance.threshold <- '0.025 0.05'
control$do.dual.blacklisting = FALSE
control$duplicate.file.directory = NULL
control$duplicate.file.regex = NULL
control$file.name.regex = "^\\D*([0-9]+)_to_([0-9]+)\\D*$"
control$guess.multifurcation.threshold = FALSE
control$max.reads.per.host = 50
control$min.reads.per.host <- 30
control$min.tips.per.host <- 1
control$multifurcation.threshold = 1e-5
control$multinomial= TRUE
control$norm.constants = NULL
control$norm.ref.file.name = system.file('HIV_DistanceNormalisationOverGenome.csv',package='phyloscannerR')
control$norm.standardise.gag.pol = TRUE
control$no.progress.bars = TRUE
control$outgroup.name = "REF_consensus"
control$output.dir = outdir
control$parsimony.blacklist.k = 20
control$prune.blacklist = FALSE
control$post.hoc.count.blacklisting <- TRUE
control$ratio.blacklist.threshold = 0
control$raw.blacklist.threshold = 10
control$recombination.file.directory = NULL
control$recombination.file.regex = NULL
control$relaxed.ancestry = TRUE
control$sankoff.k = 20
control$sankoff.unassigned.switch.threshold = 0
control$seed = 42
control$splits.rule = 's'
control$tip.regex = "^(.*)-fq[0-9]+_read_([0-9]+)_count_([0-9]+)$"
control$tree.file.regex = "^ptyr[0-9]+_InWindow_([0-9]+_to_[0-9]+)\\.tree$"
control$use.ff = FALSE
control$user.blacklist.directory = NULL
control$user.blacklist.file.regex = NULL
control$verbosity = 1
# make bash for many files
df <- tibble(F=list.files(tree.dir))
df <- df %>%
mutate(TYPE:= gsub('ptyr([0-9]+)_(.*)','\\2', F),
RUN:= as.integer(gsub('ptyr([0-9]+)_(.*)','\\1', F))) %>%
mutate(TYPE:= gsub('^([^\\.]+)\\.[a-z]+$','\\1',TYPE)) %>%
spread(TYPE, F) %>%
set_names(~ str_to_upper(.))
tmp <- sort(as.integer(gsub('ptyr([0-9]+)_(.*)','\\1',list.files(outdir, pattern='_workspace.rda$'))))
df <- df %>% filter(!RUN%in%tmp)
valid.input.args <- cmd.phyloscanner.analyse.trees.valid.args(prog.phyloscanner_analyse_trees)
cmds <- vector('list',nrow(df))
for(i in seq_len(nrow(df)))
{
# set input args
control$output.string <- paste0('ptyr',df$RUN[i])
# make script
tree.input <- file.path(tree.dir, df$TREES_NEWICK[i])
cmd <- cmd.phyloscanner.analyse.trees(prog.phyloscanner_analyse_trees,
tree.input,
control,
valid.input.args=valid.input.args)
cmds[[i]] <- cmd
}
cat(cmds[[100]])
#
# submit array job to HPC
#
# make header
hpc.load <- "module load anaconda3/personal" # make third party requirements available
hpc.select <- 1 # number of nodes
hpc.nproc <- 1 # number of processors on node
hpc.walltime <- 23 # walltime
if(0)
{
hpc.q <- NA # PBS queue
hpc.mem <- "36gb" # RAM
}
# or run this block to submit a job array to Oliver's machines
if(1)
{
hpc.q <- "pqeelab" # PBS queue
hpc.mem <- "6gb" # RAM
}
hpc.array <- length(cmds) # number of runs for job array
pbshead <- "#!/bin/sh"
tmp <- paste("#PBS -l walltime=", hpc.walltime, ":59:00,pcput=", hpc.walltime, ":45:00", sep = "")
pbshead <- paste(pbshead, tmp, sep = "\n")
tmp <- paste("#PBS -l select=", hpc.select, ":ncpus=", hpc.nproc,":mem=", hpc.mem, sep = "")
pbshead <- paste(pbshead, tmp, sep = "\n")
pbshead <- paste(pbshead, "#PBS -j oe", sep = "\n")
if(!is.na(hpc.array))
pbshead <- paste(pbshead, "\n#PBS -J 1-", hpc.array, sep='')
if(!is.na(hpc.q))
pbshead <- paste(pbshead, paste("#PBS -q", hpc.q), sep = "\n")
pbshead <- paste(pbshead, hpc.load, sep = "\n")
cat(pbshead)
# make array job
for(i in 1:length(cmds))
cmds[[i]]<- paste0(i,')\n',cmds[[i]],';;\n')
cmd <- paste0('case $PBS_ARRAY_INDEX in\n',paste0(cmds, collapse=''),'esac')
cmd <- paste(pbshead,cmd,sep='\n')
# submit job
outfile <- gsub(':','',paste("phsc",paste(strsplit(date(),split=' ')[[1]],collapse='_',sep=''),'sh',sep='.'))
outfile <- file.path(tmpdir, outfile)
cat(cmd, file=outfile)
cmd <- paste("qsub", outfile)
cat(cmd)
cat(system(cmd, intern= TRUE))
}
pty.MRC.stage1.get.phylo.relationships<- function()
{
require(tidyverse)
require(data.table)
require(phyloscannerR)
if(1)
{
#indir <- '/Users/Oliver/sandbox/DeepSeqProjects/RakaiPopSample_deepseqtrees'
tmpdir <- '/rds/general/project/ratmann_pangea_analyses_mrc_uvri/live/MRCPopSample_phsc_tmp'
outdir <- '/rds/general/project/ratmann_pangea_analyses_mrc_uvri/live/MRCPopSample_phsc_stage1_190715'
prog.phyloscanner_analyse_trees <- '/rds/general/user/or105/home/libs_sandbox/phyloscanner/phyloscanner_analyse_trees.R'
tree.dir <- '/rds/general/project/ratmann_pangea_analyses_mrc_uvri/live/MRCPopSample_phsc_stage1_output'
}
# set phyloscanner variables
# arguments as used for RCCS analysis
control <- list()
control$allow.mt <- TRUE
control$alignment.file.directory = NULL
control$alignment.file.regex = NULL
control$blacklist.underrepresented = FALSE
control$count.reads.in.parsimony = TRUE
control$distance.threshold <- '0.025 0.05'
control$do.dual.blacklisting = FALSE
control$duplicate.file.directory = NULL
control$duplicate.file.regex = NULL
control$file.name.regex = "^\\D*([0-9]+)_to_([0-9]+)\\D*$"
control$guess.multifurcation.threshold = FALSE
control$max.reads.per.host = 50
control$min.reads.per.host <- 30
control$min.tips.per.host <- 1
control$multifurcation.threshold = 1e-5
control$multinomial= TRUE
control$norm.constants = NULL
control$norm.ref.file.name = system.file('HIV_DistanceNormalisationOverGenome.csv',package='phyloscannerR')
control$norm.standardise.gag.pol = TRUE
control$no.progress.bars = TRUE
control$outgroup.name = "REF_CPX_AF460972"
control$output.dir = outdir
control$parsimony.blacklist.k = 20
control$prune.blacklist = FALSE
control$post.hoc.count.blacklisting <- TRUE
control$ratio.blacklist.threshold = 0
control$raw.blacklist.threshold = 20
control$recombination.file.directory = NULL
control$recombination.file.regex = NULL
control$relaxed.ancestry = TRUE
control$sankoff.k = 20
control$sankoff.unassigned.switch.threshold = 0
control$seed = 42
control$splits.rule = 's'
control$tip.regex = "^(.*)-fq[0-9]+_read_([0-9]+)_count_([0-9]+)$"
control$tree.file.regex = "^ptyr[0-9]+_InWindow_([0-9]+_to_[0-9]+)\\.tree$"
control$use.ff = FALSE
control$user.blacklist.directory = NULL
control$user.blacklist.file.regex = NULL
control$verbosity = 1
# make bash for many files
df <- tibble(F=list.files(tree.dir))
df <- df %>%
mutate(TYPE:= gsub('ptyr([0-9]+)_(.*)','\\2', F),
RUN:= as.integer(gsub('ptyr([0-9]+)_(.*)','\\1', F))) %>%
mutate(TYPE:= gsub('^([^\\.]+)\\.[a-z]+$','\\1',TYPE)) %>%
spread(TYPE, F) %>%
set_names(~ str_to_upper(.))
tmp <- sort(as.integer(gsub('ptyr([0-9]+)_(.*)','\\1',list.files(outdir, pattern='_workspace.rda$'))))
df <- df %>% filter(!RUN%in%tmp)
valid.input.args <- cmd.phyloscanner.analyse.trees.valid.args(prog.phyloscanner_analyse_trees)
cmds <- vector('list',nrow(df))
for(i in seq_len(nrow(df)))
{
# set input args
control$output.string <- paste0('ptyr',df$RUN[i])
# make script
tree.input <- file.path(tree.dir, df$TREES_NEWICK[i])
cmd <- cmd.phyloscanner.analyse.trees(prog.phyloscanner_analyse_trees,
tree.input,
control,
valid.input.args=valid.input.args)
cmds[[i]] <- cmd
}
cat(cmds[[100]])
#
# submit array job to HPC
#
# make header
hpc.load <- "module load anaconda3/personal" # make third party requirements available
hpc.select <- 1 # number of nodes
hpc.nproc <- 1 # number of processors on node
hpc.walltime <- 23 # walltime
if(0)
{
hpc.q <- NA # PBS queue
hpc.mem <- "36gb" # RAM
}
# or run this block to submit a job array to Oliver's machines
if(1)
{
hpc.q <- "pqeelab" # PBS queue
hpc.mem <- "6gb" # RAM
}
hpc.array <- length(cmds) # number of runs for job array
pbshead <- "#!/bin/sh"
tmp <- paste("#PBS -l walltime=", hpc.walltime, ":59:00,pcput=", hpc.walltime, ":45:00", sep = "")
pbshead <- paste(pbshead, tmp, sep = "\n")
tmp <- paste("#PBS -l select=", hpc.select, ":ncpus=", hpc.nproc,":mem=", hpc.mem, sep = "")
pbshead <- paste(pbshead, tmp, sep = "\n")
pbshead <- paste(pbshead, "#PBS -j oe", sep = "\n")
if(!is.na(hpc.array))
pbshead <- paste(pbshead, "\n#PBS -J 1-", hpc.array, sep='')
if(!is.na(hpc.q))
pbshead <- paste(pbshead, paste("#PBS -q", hpc.q), sep = "\n")
pbshead <- paste(pbshead, hpc.load, sep = "\n")
cat(pbshead)
# make array job
for(i in 1:length(cmds))
cmds[[i]]<- paste0(i,')\n',cmds[[i]],';;\n')
cmd <- paste0('case $PBS_ARRAY_INDEX in\n',paste0(cmds, collapse=''),'esac')
cmd <- paste(pbshead,cmd,sep='\n')
# submit job
outfile <- gsub(':','',paste("phsc",paste(strsplit(date(),split=' ')[[1]],collapse='_',sep=''),'sh',sep='.'))
outfile <- file.path(tmpdir, outfile)
cat(cmd, file=outfile)
cmd <- paste("qsub", outfile)
cat(cmd)
cat(system(cmd, intern= TRUE))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.