#' Make list of RCCS couples with phyloscanner output
couples.find <- function()
{
require(data.table)
if(0)
{
linked.group <- 'TYPE_PAIR_TODI2'
linked.type.yes <- 'linked'
dir.group <- 'TYPE_DIR_TODI2'
}
if(1)
{
linked.group <- 'TYPE_CHAIN_TODI'
linked.type.yes <- 'chain'
linked.type.no <- 'distant'
dir.group <- 'TYPE_ADJ_DIR_TODI2'
}
indir <- '~/Box Sync/OR_Work/MSCs/2019_Melodie'
outdir <- '~/Box Sync/OR_Work/MSCs/2019_Melodie'
# load phyloscanner results for couples
infile.trmpairs.todi <- file.path(indir, "data", "todi_pairs_171122_cl25_d50_prior23_min30_withmetadata.rda")
load(infile.trmpairs.todi)
rca <- subset(rtp, COUPLE!='no couple')
# add anonymised labels (need this to interface with published data)
infile.anonymised <- file.path(indir, "data", "todi_pairs_171122_cl25_d50_prior23_min30_anonymised_RIDs.csv")
dan <- as.data.table(read.csv(infile.anonymised))
setnames(dan, c('ID','AID'), c('MALE_RID','MALE_ARID'))
rca <- merge(rca, subset(dan, select=c(MALE_RID, MALE_ARID)), by='MALE_RID')
setnames(dan, c('MALE_RID','MALE_ARID'), c('FEMALE_RID','FEMALE_ARID'))
rca <- merge(rca, subset(dan, select=c(FEMALE_RID, FEMALE_ARID)), by='FEMALE_RID')
# add sampling dates
tmp <- rs[, list(SEQ_DATE=min(SEQ_DATE)), by='RID']
setnames(tmp, c('RID','SEQ_DATE'), c('MALE_RID','MALE_SEQ_DATE'))
rca <- merge(rca, tmp, by='MALE_RID')
setnames(tmp, c('MALE_RID','MALE_SEQ_DATE'), c('FEMALE_RID','FEMALE_SEQ_DATE'))
rca <- merge(rca, tmp, by='FEMALE_RID')
#
# define meta-variables on epidemiologic evidence for direction
# of transmission
rca[, EXT_TYPE:=NA_character_]
set(rca, rca[, which(FEMALE_LASTNEGDATE>=MALE_FIRSTPOSDATE)], 'EXT_TYPE', 'serodisc-mf')
set(rca, rca[, which(MALE_LASTNEGDATE>=FEMALE_FIRSTPOSDATE)], 'EXT_TYPE', 'serodisc-fm')
# add extra couples in who one has CD4<400 and the other has CD4>800
# for male potential recipient - evaluate CD4 around time male first positive
tmp <- rca[, which(is.na(EXT_TYPE) &
MALE_RECENTCD4>800 & abs(MALE_RECENTCD4DATE-MALE_FIRSTPOSDATE)<2 &
FEMALE_RECENTCD4<400 & abs(FEMALE_RECENTCD4DATE-MALE_FIRSTPOSDATE)<2)]
set(rca, tmp, 'EXT_TYPE', 'CD4disc-fm')
# for female potential recipient - evaluate CD4 around time female first positive
tmp <- rca[, which(is.na(EXT_TYPE) &
FEMALE_RECENTCD4>800 & abs(FEMALE_RECENTCD4DATE-FEMALE_FIRSTPOSDATE)<2 &
MALE_RECENTCD4<400 & abs(MALE_RECENTCD4DATE-FEMALE_FIRSTPOSDATE)<2)]
set(rca, tmp, 'EXT_TYPE', 'CD4disc-mf')
# separate into EXT_TYPE and EXT_DIR
set(rca, NULL, 'EXT_DIR', rca[, gsub('^([a-zA-Z0-9]+)-([a-zA-Z]+)$','\\2',EXT_TYPE)])
set(rca, NULL, 'EXT_TYPE', rca[, gsub('^([a-zA-Z0-9]+)-([a-zA-Z]+)$','\\1',EXT_TYPE)])
#
# define phylogenetically inferred direction of transmission
rca[, PHSC_DIR:= NA_character_]
set(rca, rca[, which(grepl('direction resolved',SELECT) & POSTERIOR_SCORE_MF>POSTERIOR_SCORE_FM)], 'PHSC_DIR', 'mf')
set(rca, rca[, which(grepl('direction resolved',SELECT) & POSTERIOR_SCORE_MF<POSTERIOR_SCORE_FM)], 'PHSC_DIR', 'fm')
# save
save(rca, file=file.path(outdir,'analyses','190611_phsccouples.rda'))
}
#' Ask Tanya to provide infection time estimates for RCCS couples
couples.csv.for.Tanya<- function()
{
indir <- '~/Box Sync/OR_Work/MSCs/2019_Melodie'
infile.couples <- file.path(indir,'analyses','190611_phsccouples.rda')
load(infile.couples)
write.csv( data.table(RID=sort( unique(c( rca[, MALE_RID], rca[, FEMALE_RID] )) )), row.names=FALSE, file=file.path(indir,'analyses','190611_phsccouples_NeedInfTimes.csv'))
}
#' Find relationship statistics for RCCS couples
couples.relationship.stats <- function()
{
require(data.table)
require(tidyverse)
require(ape)
require(phytools)
# locally
if(1)
{
indir <- '~/Box Sync/OR_Work/MSCs/2019_Melodie'
infile.couples <- file.path(indir,'analyses','190611_phsccouples.rda')
rel.dir <- "~/sandbox/DeepSeqProjects/RakaiPopSample_phyloscanner_analysis"
}
# list relationship files
df <- tibble(F=list.files(rel.dir)) %>%
filter(grepl('pairwise_relationships.rda',F)) %>%
mutate(PTY_RUN:= as.integer(gsub('ptyr([0-9]+)_(.*)','\\1', F)))
# load couples and select those between whom we want to compute tip-to-tip distances
load(infile.couples)
set(rca, NULL, 'MALE_ARID', as.character(rca$MALE_ARID))
set(rca, NULL, 'FEMALE_ARID', as.character(rca$FEMALE_ARID))
rcl <- subset(rca, grepl('most likely',SELECT))
# select phyloscanner runs with linked couples
tmp <- unique(subset(rcl, select=PTY_RUN))
df <- merge(tmp, as.data.table(df), by='PTY_RUN')
# for each run, unzip and calculate tip-to-tip distances among linked couples
ans <- vector('list', 0)
for(run in df$PTY_RUN)
{
#run <- 5 #for devel
# load relationship file and extract basic statistics for all windows
tmpfile <- subset(df,PTY_RUN==run)[,file.path(rel.dir, F)]
load(tmpfile)
dwin <- subset(dwin, select=c(SUFFIX, W_FROM, W_TO, ID1, ID2, ADJACENT, CONTIGUOUS, PATHS_12, PATHS_21))
dwin[, ID1_GENDER:= gsub('.*([MF])$','\\1',ID1)]
dwin[, ID2_GENDER:= gsub('.*([MF])$','\\1',ID2)]
dwin <- subset(dwin, ID1_GENDER=='F' & ID2_GENDER=='M' | ID1_GENDER=='M' & ID2_GENDER=='F')
tmp <- subset(dwin, ID1_GENDER=='F' & ID2_GENDER=='M')
setnames(tmp, c('ID1','ID2','PATHS_12','PATHS_21','ID1_GENDER','ID2_GENDER'), c('ID2','ID1','PATHS_21','PATHS_12','ID2_GENDER','ID1_GENDER'))
dwin <- rbind( subset(dwin, ID1_GENDER=='M' & ID2_GENDER=='F'), tmp)
# select couples
tmp <- data.table(ID1= rcl$MALE_ARID, ID2= rcl$FEMALE_ARID)
dwin <- merge(tmp, dwin, by=c('ID1','ID2'))
# return
dwin[, PTY_RUN:= run]
ans[[length(ans)+1L]] <- copy(dwin)
}
ans <- do.call('rbind', ans)
setkey(ans, PTY_RUN, ID1, ID2)
write.csv(ans, row.names=FALSE, file=file.path(indir,'analyses','190611_phsccouples_basic_topology_stats.csv'))
}
#' Find relationship statistics for RCCS couples based on the latest version of phyloscanner, version 1.8.0
couples.stats.phyloscanner.1.8.0 <- function()
{
require(data.table)
require(tidyverse)
require(ape)
require(phytools)
# locally
if(1)
{
indir <- '~/Box Sync/OR_Work/MSCs/2019_Melodie'
infile.couples <- file.path(indir,'analyses','190611_phsccouples.rda')
rel.dir <- "~/sandbox/DeepSeqProjects/RakaiPopSample_phsc_out190626"
}
# list relationship files
df <- tibble(F=list.files(rel.dir)) %>%
filter(grepl('workspace.rda',F)) %>%
mutate(PTY_RUN:= as.integer(gsub('ptyr([0-9]+)_(.*)','\\1', F)))
# load couples and select those between whom we want to compute tip-to-tip distances and relationships
load(infile.couples)
set(rca, NULL, 'MALE_ARID', as.character(rca$MALE_ARID))
set(rca, NULL, 'FEMALE_ARID', as.character(rca$FEMALE_ARID))
rcl <- subset(rca, grepl('most likely',SELECT))
# select phyloscanner runs with linked couples
tmp <- unique(subset(rcl, select=PTY_RUN))
df <- merge(tmp, as.data.table(df), by='PTY_RUN')
# for each run, unzip and calculate tip-to-tip distances among linked couples
ans <- vector('list', 0)
for(run in df$PTY_RUN)
{
print(run)
#run <- 5 #for devel
# load phyloscanner output file and extract mean tip to tip distances
tmpfile <- subset(df,PTY_RUN==run)[,file.path(rel.dir, F)]
load(tmpfile)
#
rclr <- subset(rcl, PTY_RUN==run)
for(j in seq_len(nrow(rclr)))
for(k in seq_along(phyloscanner.trees))
{
dc <- phyloscanner.trees[[k]][['classification.results']][['classification']]
# extract topology statistics
dc <- dc %>%
filter( (host.1==rclr[j, FEMALE_ARID] & host.2==rclr[j, MALE_ARID]) |
(host.2==rclr[j, FEMALE_ARID] & host.1==rclr[j, MALE_ARID]) )
# extract tip to tip distances
tmp <- c( phyloscanner.trees[[k]][['tips.for.hosts']][[rclr[j, FEMALE_ARID]]], phyloscanner.trees[[k]][['tips.for.hosts']][[rclr[j, MALE_ARID]]])
if(length(tmp)>0 && nrow(dc)>0)
{
tmp <- ape:::keep.tip(phyloscanner.trees[[k]][['tree']], tmp)
tmp <- adephylo:::distTips( tmp )
tmp <- as.matrix(tmp)
tmp <- as_tibble(melt(tmp, as.is=TRUE, varnames=c('MALE_TAXA','FEMALE_TAXA'), value.name = "PD"))
# calculate mean tip to tip distance between taxa from male and female
tmp <- tmp %>%
mutate( MALE_ARID= gsub('^([^_]+).*','\\1', MALE_TAXA),
FEMALE_ARID= gsub('^([^_]+).*','\\1', FEMALE_TAXA) ) %>%
filter( MALE_ARID!=FEMALE_ARID) %>%
summarise(MPD=mean(PD)) %>%
as.double()
dc <- dc %>%
mutate( MPD:= tmp,
MPD_STD:= tmp/phyloscanner.trees[[k]][['normalisation.constant']],
W_FROM:= phyloscanner.trees[[k]][['window.coords']][['start']],
W_TO:= phyloscanner.trees[[k]][['window.coords']][['end']],
PTY_RUN:= run)
# return
ans[[length(ans)+1L]] <- dc
}
}
}
ans <- do.call('rbind', ans)
save(ans, file=file.path(indir,'analyses','190627_phsccouples_topology_and_distance.rda'))
}
#' Collect initial data set for Melodie
networks.make.data.190726<- function()
{
require(tidyverse)
infile <- '~/sandbox/DeepSeqProjects/RakaiPopSample_phyloscanner_analysis_190706/Rakai_phscnetworks_190706.rda'
load(infile)
# select transmission networks
idclus <- c(5, 24, 26, 31, 78, 87, 106, 458, 459, 463)
# get pairs of individuals in transmission networks
tmp <- dnet %>%
filter(IDCLU %in% idclus) %>%
select(-c(CATEGORISATION, TYPE, SCORE, H1_SEX, H1_SAMPLING_YEAR, H2_SEX, H2_SAMPLING_YEAR)) %>%
distinct() %>%
arrange(IDCLU, H1, H2)
# make all pairwise combinations of individuals
dnet2 <- tmp %>% select( -c(CLU_SIZE,PTY_RUN) ) %>%
gather('variable','host',H1,H2) %>%
select( -variable ) %>%
arrange(IDCLU,host) %>%
distinct() %>%
group_by(IDCLU)
dnet2 <- as.data.table(dnet2)[, {
tmp <- as.data.table(t(combn(host, 2)))
setnames(tmp, c('V1','V2'), c('H1','H2'))
tmp
}, by='IDCLU']
dnet2 <- as_tibble(dnet2) %>%
filter(H1<H2)
dnet2 <- dnet2 %>%
full_join( tmp %>% select(PTY_RUN,IDCLU) %>% distinct(), by='IDCLU')
# go back to PTY_RUN data to extract window information
idruns <- sort( unique( dnet2$PTY_RUN ) )
indir <- '~/sandbox/DeepSeqProjects/RakaiPopSample_phyloscanner_analysis_190706'
infiles <- tibble(F=list.files(indir, pattern='workspace.rda$',full.names=TRUE)) %>%
mutate(PTY_RUN:= as.integer(gsub('^ptyr([0-9]+)_.*$','\\1',basename(F)))) %>%
filter(PTY_RUN %in% idruns)
dwin <- lapply( seq_len(nrow(infiles)), function(i){
cat('\nprocess', infiles$F[i])
load( infiles$F[i] )
dprs <- dnet2 %>% filter(PTY_RUN==infiles$PTY_RUN[i])
ans <- vector('list',0)
for(j in seq_len(nrow(dprs)))
for(k in seq_along(phyloscanner.trees))
{
dc <- phyloscanner.trees[[k]][['classification.results']][['classification']]
# extract topology statistics
dc <- dc %>%
filter( (host.1==dprs$H1[j] & host.2==dprs$H2[j]) |
(host.2==dprs$H1[j] & host.1==dprs$H2[j]) ) %>%
rename( H1:= host.1, H2:=host.2)
tmp <- dc %>%
filter(H1>H2) %>%
rename(H1:=H2, H2:=H1, paths12:=paths21, paths21:=paths12, nodes1:=nodes2, nodes2:=nodes1) %>%
mutate(ancestry:= gsub('tmp$','anc',gsub('Tmp$','Anc',gsub('anc$','desc',gsub('Anc$','Desc',gsub('desc$','tmp',gsub('Desc$','Tmp',ancestry)))))))
dc <- dc %>%
filter(H1<H2) %>%
bind_rows(tmp) %>%
inner_join(dprs[j,], by=c('H1','H2'))
# extract tip to tip distances
tmp <- c( phyloscanner.trees[[k]][['tips.for.hosts']][[dprs$H1[j]]], phyloscanner.trees[[k]][['tips.for.hosts']][[dprs$H2[j]]])
if(length(tmp)>0 && nrow(dc)>0)
{
tmp <- ape:::keep.tip(phyloscanner.trees[[k]][['tree']], tmp)
tmp <- adephylo:::distTips( tmp )
tmp <- as.matrix(tmp)
tmp <- as_tibble(melt(tmp, as.is=TRUE, varnames=c('H1_TAXA','H2_TAXA'), value.name = "PD"))
# calculate mean tip to tip distance between taxa from male and female
tmp <- tmp %>%
mutate( H1= gsub('^([^_]+).*','\\1', H1_TAXA),
H2= gsub('^([^_]+).*','\\1', H2_TAXA) ) %>%
filter( H1<H2) %>%
summarise(MPD=mean(PD)) %>%
as.double()
tmp <- dc %>%
mutate( MPD:= tmp,
MPD_STD:= tmp/phyloscanner.trees[[k]][['normalisation.constant']],
W_FROM:= phyloscanner.trees[[k]][['window.coords']][['start']],
W_TO:= phyloscanner.trees[[k]][['window.coords']][['end']])
# return
ans[[length(ans)+1L]] <- tmp
}
}
ans <- do.call('rbind',ans)
ans
})
dwin <- do.call('rbind',dwin)
# for each pair only keep results from run with most windows
tmp <- dwin %>%
group_by(IDCLU, PTY_RUN, H1, H2) %>%
summarise(N= length(W_FROM)) %>%
group_by(IDCLU, H1, H2) %>%
summarise(PTY_RUN:= PTY_RUN[which.max(N)[1]])
dwin <- tmp %>% inner_join(dwin, by=c('IDCLU','H1','H2','PTY_RUN'))
# add pairwise combinations without phyloscanner evaluation as
# missing
dwin <- dnet2 %>%
select(-PTY_RUN) %>%
distinct() %>%
full_join(dwin, by=c('IDCLU','H1','H2'))
# get most likely transmission chain for selected transmission networks
dchain <- dchain %>% filter( IDCLU %in% idclus)
# get most likely transmission networks
dnet <- dnet %>% filter( IDCLU %in% idclus)
# get meta-data for individuals
hivc.db.Date2numeric<- function( x )
{
if(!class(x)%in%c('Date','character')) return( x )
x <- as.POSIXlt(x)
tmp <- x$year + 1900
x <- tmp + round( x$yday / ifelse((tmp%%4==0 & tmp%%100!=0) | tmp%%400==0,366,365), d=3 )
x
}
dmeta <- dnet2 %>% select(H1,H2) %>% gather() %>% distinct() %>% rename(AID=value) %>% select(-key)
dmeta <- as_tibble(read.csv("~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/full_run/todi_pairs_171122_cl25_d50_prior23_min30_anonymised_RIDs.csv", stringsAsFactors=FALSE)) %>%
select(ID,AID) %>%
distinct() %>%
right_join(dmeta, by='AID') %>%
rename(RID:=ID)
stopifnot( 0==dmeta %>% filter(is.na(RID)) %>% nrow() )
infile <- '~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/180322_sampling_by_gender_age.rda'
load(infile)
dmeta <- dmeta %>%
left_join(as_tibble(rd), by='RID') %>%
select(RID, AID, PID, SEX, BIRTHDATE, LASTNEGDATE, FIRSTPOSDATE, VISIT, DATE, COMM_NUM_A, ARVSTARTDATE, FIRSTSELFREPORTART, EST_DATEDIED) %>%
rename(VISIT_ROUND:= VISIT, VISIT_DATE:= DATE)
tmp <- as_tibble(read.csv('~/Dropbox (SPH Imperial College)/Rakai Pangea Meta Data/Data shared with consortium/Pangea_Rakai_RCCS_Metadata_11Dec2015.csv', stringsAsFactors=FALSE)) %>%
select(studyid, Pangea.id, sampleDate) %>%
rename(RID:= studyid, PID:= Pangea.id, SEQ_DATE:=sampleDate) %>%
mutate(SEQ_DATE=hivc.db.Date2numeric(SEQ_DATE))
dmeta <- dmeta %>% left_join(tmp, by=c('RID','PID'))
dmeta <- dmeta %>% select(-c(RID,PID))
# save
outfile <- '~/Box Sync/OR_Work/MSCs/2019_Melodie/data/Rakai_phscnetworks_190706_selected_for_networkinference.rda'
save(dnet, dchain, dwin, dmeta, file=outfile)
}
#' Extract the phyloscanner summary stats for each individual to estimate infection times
participants.phyloscanner.summarystats<- function()
{
require(data.table)
require(tidyverse)
require(ape)
require(phytools)
# locally
if(1)
{
indir <- '~/Box Sync/OR_Work/MSCs/2019_Melodie'
#infile.couples <- file.path(indir,'analyses','190611_phsccouples.rda')
rel.dir <- "~/Box Sync/OR_Work/MSCs/2019_Melodie/data/RakaiPopSample_phsc_out190626"
}
# list relationship files
df <- tibble(F=list.files(rel.dir)) %>%
filter(grepl('workspace.rda',F)) %>%
mutate(PTY_RUN:= as.integer(gsub('ptyr([0-9]+)_(.*)','\\1', F)))
ans <- vector('list', 0)
for(run in df$PTY_RUN)
{
print(run)
#run <- 5 #for devel
tmpfile <- df %>% filter(PTY_RUN==run) %>% transmute(F:= file.path(rel.dir, F)) %>% as.character()
# load phyloscanner output file and extract mean tip to tip distances
load(tmpfile)
ans[[length(ans)+1L]] <- summary.stats
}
for(run in seq_along(ans))
ans[[run]]<- dplyr:::select(ans[[run]],setdiff(1:ncol(ans[[run]]),contains('prop.gp')))
ans <- do.call('rbind', ans)
save(ans, file=file.path(indir,'analyses','190627_phsccouples_summary_stats.rda'))
}
#' Find mean patristic tip to tip distances for RCCS couples
couples.distances.from.trees <- function()
{
require(data.table)
require(tidyverse)
require(ape)
require(phytools)
# locally
if(0)
{
indir <- '~/Box Sync/OR_Work/MSCs/2019_Melodie'
infile.couples <- file.path(indir,'analyses','190611_phsccouples.rda')
tree.dir <- "~/sandbox/DeepSeqProjects/RakaiPopSample_deepseqtrees"
}
# on HPC
if(1)
{
indir <- '/rds/general/user/or105/home/WORK/Gates_2014/networks'
infile.couples <- file.path(indir,'analyses','190611_phsccouples.rda')
tree.dir <- "/rds/general/project/ratmann_pangea_deepsequencedata/live/PANGEA2_RCCS_trees"
}
# download the phyloscanner tree of the Rakai population-based analysis
if(0)
{
tmp <- "https://datadryad.org/bitstream/handle/10255/dryad.208473/Dataset_S1.tar?sequence=1"
# specify directory to untar public data
# download and untar
download.file(tmp, destfile="Dataset_S1.tar", method="curl")
untar("Dataset_S1.tar", exdir=tree.dir, extras='-xvf')
}
# list zipped tree files. One zip file contains the viral trees of individuals in one putative transmission network.
df <- tibble(F=list.files(tree.dir))
df <- df %>%
mutate(TYPE:= gsub('ptyr([0-9]+)_(.*)','\\2', F),
PTY_RUN:= as.integer(gsub('ptyr([0-9]+)_(.*)','\\1', F))) %>%
mutate(TYPE:= gsub('^([^\\.]+)\\.[a-z]+$','\\1',TYPE)) %>%
spread(TYPE, F) %>%
set_names(~ str_to_upper(.))
# load couples and select those between whom we want to compute tip-to-tip distances
load(infile.couples)
set(rca, NULL, 'MALE_ARID', as.character(rca$MALE_ARID))
set(rca, NULL, 'FEMALE_ARID', as.character(rca$FEMALE_ARID))
rcl <- subset(rca, grepl('most likely',SELECT))
# select phyloscanner runs with linked couples
df <- unique(subset(rcl, select=PTY_RUN)) %>%
inner_join(df, by='PTY_RUN')
df <- tibble(FO=list.files(file.path(indir,'analyses'), pattern='mpd_[0-9]+')) %>%
mutate(PTY_RUN:= as.integer(gsub('^.*_mpd_([0-9]+).*$','\\1', FO))) %>%
full_join(df, by='PTY_RUN')
df <- df %>%
filter(is.na(FO)) %>%
arrange(PTY_RUN)
df <- df %>%
mutate(CASE_ID:= seq_len(nrow(df))) %>%
mutate(CASE_ID2:= floor(CASE_ID/10))
df <- df <- df %>% filter(CASE_ID2==0)
df <- as.data.table(df)
# for each run, unzip and calculate tip-to-tip distances among linked couples
for(run in df$PTY_RUN)
{
#run <- 5 #for devel
# unzip
tmpfile <- subset(df,PTY_RUN==run)[,file.path(tree.dir, TREES_NEWICK)]
tmpfile2<- file.path(indir,'analyses',basename(gsub('\\.zip','',tmpfile)))
unzip(tmpfile, exdir=tmpfile2)
# read all trees across genome
tmp <- list.files(tmpfile2, pattern='tree$', full.names=TRUE)
trees <- lapply(tmp, ape:::read.tree)
names(trees) <- basename(tmp)
# get taxa names between whom we want to compute tip-to-tip calculate distances
tmp <- lapply(seq_along(trees), function(j) {
data.table(TREE= names(trees)[j], TAXA= trees[[j]][['tip.label']])
})
dt <- do.call('rbind',tmp)
dt <- subset(dt, !grepl('^REF', TAXA))
dt[, ARID:= gsub('^([A-Za-z0-9]+)_.*','\\1',TAXA)]
rclr<- subset(rcl, PTY_RUN==run)
dt <- subset(dt, ARID%in%unique(c(rclr$FEMALE_ARID, rclr$MALE_ARID)))
ans <- vector('list', 0)
for(j in seq_len(nrow(rclr)))
for(t in unique(dt$TREE))
{
#t <- "ptyr5_InWindow_1000_to_1249.tree" #for devel
# get data.table of male tips vs female tips
zm <- subset(dt, TREE==t & ARID==rclr$MALE_ARID[j])
zf <- subset(dt, TREE==t & ARID==rclr$FEMALE_ARID[j])
setnames(zm, c('TAXA','ARID'), c('MALE_TAXA','MALE_ARID'))
setnames(zf, c('TAXA','ARID'), c('FEMALE_TAXA','FEMALE_ARID'))
z <- merge(zm, zf, by='TREE', allow.cartesian=TRUE)
# drop tips to calculate distances as fast as possible, calculate distances
tmp <- ape:::keep.tip(trees[[t]], subset(dt, TREE==t)[, TAXA])
tmp <- adephylo:::distTips( tmp )
# merge with z to extract those distances that we want
tmp <- as.matrix(tmp)
tmp <- as.data.table(melt(tmp, as.is=TRUE, varnames=c('MALE_TAXA','FEMALE_TAXA'), value.name = "PD"))
z <- merge(z, tmp, by=c('MALE_TAXA','FEMALE_TAXA'))
# return mean tip-to-tip distance
ans[[length(ans)+1L]] <- data.table(TREE=t, MALE_ARID=z$MALE_ARID[1], FEMALE_ARID=z$FEMALE_ARID[1], MPD=z[, mean(PD)])
}
ans <- do.call('rbind', ans)
ans <- subset(ans, !is.na(MALE_ARID) & !is.na(FEMALE_ARID))
write.csv(ans, row.names=FALSE, file=file.path(indir,'analyses',paste0('190611_phsccouples_mpd_',run,'.csv')))
unlink(tmpfile2, recursive=TRUE)
}
}
networks.with.high.prop.women<- function()
{
infile <- "~/sandbox/DeepSeqProjects/RakaiPopSample_data/Dataset_S2.csv"
dmeta <- as_tibble(read.csv(infile, stringsAsFactors=FALSE))
infile <- '~/sandbox/DeepSeqProjects/RakaiPopSample_phyloscanner_analysis_190706/Rakai_phscnetworks_190706.rda'
load(infile)
# select networks of size > 4 with at most one man
# or size 4 all women
# extract females
dfem <- dnet %>%
filter(IDCLU%in%c(4, 8, 27, 29, 30, 33, 36, 40, 68)) %>%
select(H1,H2) %>%
gather('key','AID') %>%
select(AID) %>%
distinct() %>%
filter(grepl('F$',AID))
# de-anonymise
infile <- '~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/full_run/todi_pairs_171122_cl25_d50_prior23_min30_anonymised_RIDs.csv'
da <- as_tibble(read.csv(infile, stringsAsFactors=FALSE))
dfem <- dfem %>% left_join(da, by='AID')
#
outfile <- '~/Box Sync/OR_Work/2019/2019_RCCS_high_risk_women/191011_femaleonly_primary_selection.csv'
write.csv(dfem, file=outfile)
# select networks of size > 3 with at most one man
# extract females
dfem <- dnet %>%
filter(IDCLU%in%c(50, 54, 60, 61, 64, 65, 69, 72 )) %>%
select(H1,H2) %>%
gather('key','AID') %>%
select(AID) %>%
distinct() %>%
filter(grepl('F$',AID))
# de-anonymise
infile <- '~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/full_run/todi_pairs_171122_cl25_d50_prior23_min30_anonymised_RIDs.csv'
da <- as_tibble(read.csv(infile, stringsAsFactors=FALSE))
dfem <- dfem %>% left_join(da, by='AID')
#
outfile <- '~/Box Sync/OR_Work/2019/2019_RCCS_high_risk_women/191011_femaleonly_backup_selection.csv'
write.csv(dfem, file=outfile)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.