R/haircut.dev.R

Defines functions dev.AC.data dev.align.haircutcontigs dev.haircut

dev.AC.data<- function()
{
	require(data.table)
	indir		<- "/work/or105/PANGEA_AC/150921/"
	infiles		<- data.table(FASTQ=list.files(indir, pattern='fastq\\.gz$|fastq$', recursive=1))
	infiles[, RUN:= sapply(strsplit(FASTQ,'/',fixed=1),'[[',1)]
	infiles[, ID:= gsub('_S[0-9]+_.*','',basename(FASTQ))]
	infiles[, IDS:= gsub('_S|_','',regmatches(basename(FASTQ),regexpr('_S[0-9]+_',basename(FASTQ))))]
	infiles[, IDL:= gsub('_L|_','',regmatches(basename(FASTQ),regexpr('_L[0-9]+_',basename(FASTQ))))]
	infiles[, IDR:= gsub('_R|_','',regmatches(basename(FASTQ),regexpr('_R[0-9]+_',basename(FASTQ))))]
	
	ctfiles		<- data.table(IVA=list.files(indir, recursive=1))
	ctfiles		<- subset(ctfiles, grepl('contigs_hit_ref',IVA)) 
	ctfiles[, RUN:= sapply(strsplit(IVA,'/',fixed=1),'[[',1)]
	ctfiles[, ID:= gsub('.fasta','',gsub('.assembly_contigs_hit_ref','',sapply(strsplit(IVA,'/',fixed=1),'[[',2)))]	
	infiles		<- merge(infiles, ctfiles, by=c('RUN','ID'), all=1)
	
	ctfiles		<- data.table(KRAKEN=list.files(indir, pattern='kraken.report$', recursive=1))	
	ctfiles[, ID:= gsub('.kraken.report','',sapply(strsplit(KRAKEN,'/',fixed=1),'[[',2))]	
	infiles		<- merge(infiles, ctfiles, by=c('ID'), all=1)
	
	write.csv( infiles, row.names=FALSE, file= paste(indir, '/infiles_151020.csv',sep='') )
	save(infiles, file=paste(indir, '/infiles_151020.R',sep=''))
	if(1)
	{
		#	only consider files with RESXxx, PRESxxx and TASPxxx
		pngfiles	<- subset(infiles, !grepl('^UID|^Undetermined',ID))		
		pngfiles[, FASTQ_NEW:= paste(RUN,'_',basename(FASTQ),sep='')]
		tmp			<- pngfiles[, which(!is.na(IVA))]
		pngfiles[, IVA_NEW:= NA_character_]		
		set(pngfiles, tmp, 'IVA_NEW', pngfiles[tmp, paste(RUN,'_',gsub('\\.assembly_contigs_hit_ref','',basename(IVA)),sep='')])
		#	move files
		tmp			<- subset(pngfiles, !is.na(IVA) & IDR==1)
		setkey(tmp, IVA)
		tmp			<- unique(tmp)
		tmp[, file.rename(paste(indir,'/',IVA,sep=''), paste(indir,'/IVA/',IVA_NEW,sep='')), by='IVA']		
		pngfiles[, file.rename(paste(indir,'/',FASTQ,sep=''), paste(indir,'/FASTQ/',FASTQ_NEW,sep='')), by='FASTQ']
		
		write.csv( pngfiles, row.names=FALSE, file= paste(indir, '/pngfiles_151020.csv',sep='') )
		#	check pngfiles
		stopifnot(nrow(subset(pngfiles, is.na(FASTQ)))==0)	
		#	no IVA contigs --> 124
		write.csv( subset(pngfiles, IDR==1 & is.na(IVA)), row.names=FALSE, file= paste(indir, '/infiles_151020_noIVA.csv',sep='') )
		#	check: same S for each ID? -->  Run2_96_600V3_20140814 RES320
		tmp	<- subset(pngfiles, !is.na(IVA))[, list(IDS_N=length(unique(IDS))), by=c('RUN','ID')]
		subset(tmp, IDS_N>1)
		#	check: more than two R files ?
		tmp	<- subset(pngfiles, !is.na(IVA))[, list(IDR_N=length(IDR)), by=c('RUN','ID')]
		subset(tmp, IDR_N>2)
		#	total fastq 853
		nrow(subset(pngfiles, !is.na(FASTQ) & IDR==1))	
		#	total contigs 729
		write.table( subset(pngfiles, !is.na(IVA) & IDR==1, select=IVA), quote=FALSE, col.names=FALSE, row.names=FALSE, file= paste(indir, '/infiles_151020_yesIVA.csv',sep='') )
		nrow(subset(pngfiles, !is.na(IVA) & IDR==1))
		#	number contigs per RUN
		subset(pngfiles, !is.na(IVA) & IDR==1)[, table(RUN)]
		#	number fastq per RUN
		subset(pngfiles, !is.na(FASTQ) & IDR==1)[, table(RUN)]
		#	no kraken report
		write.csv( subset(pngfiles, IDR==1 & is.na(KRAKEN)), row.names=FALSE, file= paste(indir, '/infiles_151020_noKraken.csv',sep='') )
	}
	if(0)
	{
		#	no FASTQ 
		stopifnot(nrow(subset(infiles, is.na(FASTQ)))==0)	
		#	no IVA contigs
		subset(infiles, IDR==1 & is.na(IVA))
		#	check: same S for each ID?
		tmp	<- subset(infiles, !is.na(IVA))[, list(IDS_N=length(unique(IDS))), by=c('RUN','ID')]
		subset(tmp, IDS_N>1)
		#	check: more than two R files ?
		tmp	<- subset(infiles, !is.na(IVA))[, list(IDR_N=length(IDR)), by=c('RUN','ID')]
		subset(tmp, IDR_N>2)
		#	total fastq
		nrow(subset(infiles, !is.na(FASTQ) & IDR==1))	
		#	total contigs
		nrow(subset(infiles, !is.na(IVA) & IDR==1))
		#	number contigs per RUN
		subset(infiles, !is.na(IVA) & IDR==1)[, table(RUN)]
		#	number fastq per RUN
		subset(infiles, !is.na(FASTQ) & IDR==1)[, table(RUN)]
	}
	
}

dev.align.haircutcontigs<- function()
{
	if(0)
	{
		#	extract the haircut contigs and prepare alignment files
		indir1		<- '/Users/Oliver/Dropbox (Infectious Disease)/OR_Work/2015/2015_PANGEA_haircut/contigs_150902_model150816b'
		indir2		<- '/Users/Oliver/Dropbox (Infectious Disease)/OR_Work/2015/2015_PANGEA_haircut/contigs_150902_model150816b_manual'
		outdir		<- '/Users/Oliver/Dropbox (Infectious Disease)/OR_Work/2015/2015_PANGEA_haircut/contigs_150903_model150816b'
		infiles 	<- rbind( 	data.table(FILE=list.files(indir2, pattern='\\.fasta$', recursive=T, full.name=T)),
								data.table(FILE=list.files(indir1, pattern='\\.fasta$', recursive=T, full.name=T))	)		
		infiles[, PNG_ID:= gsub('_nohair','',gsub('_wref','',gsub('\\.fasta','',basename(FILE))))]
		infiles[, MNL:= factor(grepl('manual',FILE), levels=c(TRUE,FALSE), labels=c('Y','N'))]
		infiles		<- dcast.data.table(infiles, PNG_ID~MNL, value.var='FILE')
		tmp			<- infiles[, which(is.na(Y))]
		set(infiles, tmp, 'Y', infiles[tmp, N])
		infiles		<- melt(infiles, id.vars='PNG_ID', measure.vars='Y', value.name='FILE')
		#	extract just the curated contigs from file
		cs			<- sapply(seq_len(nrow(infiles)), function(i)
				{
					cat('\nprocess',infiles[i,FILE])
					#FILE	<- infiles[i,FILE]
					#OFILE	<- infiles[1,OFILE]
					#PNG_ID	<- infiles[1,PNG_ID] 
					cr				<- read.dna(infiles[i,FILE], format='fasta')
					cr				<- cr[grepl(infiles[i,PNG_ID],rownames(cr)),]	
					as.list(cr)
				})
		#	reformat to list of sequences
		cs			<- unlist(cs, recursive=F)
		names(cs)	<- NULL
		csi			<- data.table(TAXON= sapply(cs, rownames), LEN= sapply(cs, ncol))
		csi[, PNG_ID:= sapply(strsplit(gsub('^\\.','',TAXON),'.',fixed=T),'[[',1)]
		cs			<- do.call(c,lapply(cs, as.list))	
		#	extract all contigs with min length into single alignment and save
		tmp			<- merge(infiles, subset(csi, LEN==min(LEN), PNG_ID)[1,], by='PNG_ID')
		cr			<- read.dna(tmp[1,FILE], format='fasta')
		cr			<- as.list(cr[!grepl(tmp[1,PNG_ID],rownames(cr)),])
		cr			<- c(cr, cs[ subset(csi, LEN==min(LEN))[, TAXON] ])	
		write.dna(cr, file= paste(outdir, '/', 'contigs_minlen.fasta', sep=''), format='fasta', colsep='', nbcol=-1)
		write.dna(cr, file= paste(outdir, '/', 'contigs_stub.fasta', sep=''), format='fasta', colsep='', nbcol=-1)
		save(cr, csi, cs, file= paste(outdir, '/', 'contigs.R', sep=''))
		#	extract all others 
		tmp			<- subset(csi, LEN>min(LEN))
		setkey(tmp, LEN)
		tmp[, BATCH_ID:= ceiling(seq_len(nrow(tmp))/200) ]
		invisible(sapply(tmp[, unique(BATCH_ID)], function(b)
						{
							write.dna(cs[ subset(tmp, BATCH_ID==b)[, TAXON] ], file= paste(outdir, '/', 'contigs_batch',b,'.fasta', sep=''), format='fasta', colsep='', nbcol=-1)
						}))				
	}
	if(1)	#check if we missed any contigs
	{
		indir		<- '/Users/Oliver/Dropbox (Infectious Disease)/OR_Work/2015/2015_PANGEA_haircut/contigs_150903_model150816b'
		infile		<- 'contigs_0_20.fasta'
		seq			<- read.dna(paste(indir,infile,sep='/'), format='fasta')
		cpr			<- data.table(TAXON=rownames(seq))
		cmissed		<- merge(data.table(TAXON=setdiff( csi[,TAXON], cpr[,TAXON] )), tmp, by='TAXON')
		cmissed		<- cs[ cmissed[,TAXON] ]
		
		cr			<- read.dna( '/Users/Oliver/Dropbox (Infectious Disease)/OR_Work/2015/2015_PANGEA_haircut/contigs_150902_model150816b/12559_1_1_wref_nohair.fasta', format='fasta')
		cr			<- cr[1:200,] 
		write.dna(c(cmissed, as.list(cr)), file= paste(indir, '/', 'contigs_missed.fasta', sep=''), format='fasta', colsep='', nbcol=-1)		
	}
	if(1)	#clean up alignment
	{
		indir		<- '/Users/Oliver/Dropbox (Infectious Disease)/OR_Work/2015/2015_PANGEA_haircut/contigs_150903_model150816b'
		infile		<- 'contigs_0_20b.fasta'
		seq			<- read.dna(paste(indir,infile,sep='/'), format='fasta')
		tmp			<- which( duplicated(rownames(seq)) )
		rownames(seq)[tmp] 
		seq			<- seq[ setdiff( seq_len(nrow(seq)), tmp ), ]
		tmp			<- which(!grepl('^\\.[0-9]+_', rownames(seq)))		
		seq			<- rbind(seq[tmp,], seq[ setdiff( seq_len(nrow(seq)), tmp ), ])
		write.dna(seq, file= paste(indir, '/', 'contigs_cnsalign_PNGIDn3366_CNTGSn6120.fasta', sep=''), format='fasta', colsep='', nbcol=-1)
		#	even more missed?
		tmp			<- setdiff(csi[,TAXON],rownames(seq)[201:nrow(seq)])
		cr			<- read.dna( '/Users/Oliver/Dropbox (Infectious Disease)/OR_Work/2015/2015_PANGEA_haircut/contigs_150902_model150816b/12559_1_1_wref_nohair.fasta', format='fasta')
		cr			<- cr[1:200,] 		
		write.dna(c(cs[tmp], as.list(cr)), file= paste(indir, '/', 'contigs_missed2.fasta', sep=''), format='fasta', colsep='', nbcol=-1)
		#	--> turns out these are just crap
	
		indir	 	<- '/Users/Oliver/Dropbox (Infectious Disease)/OR_Work/2015/2015_PANGEA_haircut/contigs_150903_model150816b'
		infiles		<- list.files(indir, pattern='*stripped*',full.names=TRUE)		
		invisible(lapply(seq_along(infiles), function(i)
						{
							infile			<- infiles[i]
							seq				<- read.dna(infile, format='fasta')
							tmp				<- which( duplicated(rownames(seq)) )
							rownames(seq)[tmp] 
							seq				<- seq[ setdiff( seq_len(nrow(seq)), tmp ), ]
							rownames(seq)	<- gsub(' (stripped)','',rownames(seq), fixed=TRUE)							
							tmp				<- which(!grepl('^\\.[0-9]+_', rownames(seq)))		
							seq				<- rbind(seq[tmp,], seq[ setdiff( seq_len(nrow(seq)), tmp ), ])
							tmp				<- tail(strsplit(basename(infile),'_')[[1]],1)
							write.dna(seq, file= paste(indir, '/', 'contigs_cnsalign_PNGIDn3366_CNTGSn6120_',tmp, sep=''), format='fasta', colsep='', nbcol=-1)							
						}))
	}
	if(1)
	{
		indir		<- '/Users/Oliver/Dropbox (Infectious Disease)/OR_Work/2015/2015_PANGEA_haircut/contigs_150903_model150816b'
		load( paste(indir, '/', 'contigs.R', sep='') )
		#	
		#	add all others to alignment in batches of 200
		batch.id	<- ".14683_1_18.2.2_cut"
		write.dna( cs[ subset(tmp, TAXON==".14683_1_18.2.2_cut")[, TAXON] ], file= paste(outdir, '/', 'contigs_BATCH',batch.id,'.fasta', sep=''), format='fasta', colsep='', nbcol=-1)
		cmd			<- cmd.align.contigs.with.ref(paste(outdir, '/', 'contigs_BATCH',batch.id,'.fasta', sep=''), paste(outdir, '/', 'contigs_minlen.fasta', sep=''), paste(outdir, '/', 'contigs_minlen_BATCH',batch.id,'.fasta', sep=''), options='')		
		
		batch.id	<- "14683_1_18"
		write.dna( cs[ subset(tmp, PNG_ID=="14683_1_18")[, TAXON] ], file= paste(outdir, '/', 'contigs_BATCH',batch.id,'.fasta', sep=''), format='fasta', colsep='', nbcol=-1)
		cmd			<- cmd.align.contigs.with.ref(paste(outdir, '/', 'contigs_BATCH',batch.id,'.fasta', sep=''), paste(outdir, '/', 'contigs_minlen.fasta', sep=''), paste(outdir, '/', 'contigs_minlen_BATCH',batch.id,'.fasta', sep=''), options='')	
		
		batch.id	<- 20
		write.dna( cs[ subset(tmp, BATCH_ID==batch.id)[, TAXON] ], file= paste(outdir, '/', 'contigs_BATCH',batch.id,'.fasta', sep=''), format='fasta', colsep='', nbcol=-1)	
		cmd			<- cmd.align.contigs.with.ref(paste(outdir, '/', 'contigs_BATCH',batch.id,'.fasta', sep=''), paste(outdir, '/', 'contigs_minlen.fasta', sep=''), paste(outdir, '/', 'contigs_minlen_BATCH.fasta', sep=''), options='')
		
	}
		
}


dev.haircut<- function()	
{
	if(0)	#check which files not aligned 
	{
		dscp	<- as.data.table(read.csv("/Users/Oliver/Dropbox (Infectious Disease)/OR_Work/2015/2015_PANGEA_haircut/pngfiles_151020.csv", stringsAsFactors=FALSE))
		dscp[, ID:= paste(RUN,'_',ID,sep='')]
		
		dr		<- data.table(BLAST=list.files("~/Dropbox (Infectious Disease)/OR_Work/2015/2015_PANGEA_haircut/contigs_151026_unaligned_raw"))
		dr[, ID:= gsub('_hiv','',sapply(strsplit(BLAST,'.',fixed=1),'[[',1))]
		dscp	<- merge(dscp, dr, by='ID', all.x=1)
		subset(dscp, is.na(BLAST) & IDR==1 & !is.na(IVA))	
		
		subset(dscp, IDR==1)
	}
	if(0)	#check alignment: code
	{
		indir		<- paste(DATA, 'contigs_150408_wref', sep='/' )
		indir		<- paste(DATA, 'contigs_150902_wref', sep='/' )				
		infiles		<- data.table(FILE=list.files(indir, pattern='fasta$', recursive=T))
		infiles[, PNG_ID:= gsub('\\.fasta','',gsub('_frclen|_refc|_refr|_wRefs','',FILE))]
		infiles[, AL_TYPE:= gsub('_*','',gsub('\\.fasta','',gsub('[0-9]*','',FILE)))]				
		tmp			<- infiles[, {
					#FILE		<- infiles[1,FILE]
					tmp			<- paste(indir, FILE, sep='/')
					tmp			<- gsub(' ','\\ ',gsub('(','\\(',gsub(')','\\)',tmp,fixed=T),fixed=T),fixed=T)					
					cmd			<- paste("awk '/^>/ {if (seqlen) print seqlen;print;seqlen=0;next} {seqlen+=length($0)}END{print seqlen}' ", tmp, sep='')
					tmp			<- system(cmd, intern=TRUE)					
					list(LEN=as.numeric(tmp[2]))
				}, by='FILE']
		infiles		<- merge(infiles, tmp, by='FILE')		
		tmp			<- dcast.data.table(infiles, PNG_ID~AL_TYPE, value.var='LEN')
		#	suspect that raw is in reverse if alignment length is larger than 12k		
		stopifnot( nrow(subset(tmp, wRefs>12e3 & refc>12e3))==0 )
		#	mv refc to wRefs in case wRef length > 12e3
		alfixup		<- merge(subset(tmp, wRefs>12e3 & refc<=12e3, PNG_ID), infiles, by='PNG_ID')
		alfixup		<- dcast.data.table(alfixup, PNG_ID~AL_TYPE, value.var='FILE')
		alfixup[, {
						file.rename( paste(indir,wRefs,sep='/'), paste(indir,gsub('wRefs\\.fasta','fndrvs\\.fasta',wRefs),sep='/'))
						file.copy( paste(indir,refc,sep='/'), paste(indir,gsub('refc\\.fasta','wRefs\\.fasta',wRefs),sep='/'), overwrite=TRUE)
						NULL
				}, by='PNG_ID']		
		
		cat(cmd.haircut.check.alignment(indir, indir))
	}
	if(0)	#check alignment: cmd
	{
		indir		<- paste(DATA, 'contigs_151026_wref', sep='/' )		
		argv		<<- cmd.haircut.check.alignment(indir, indir)
		argv		<<- paste('-',unlist(strsplit(argv,' -|\n')),sep='')
	}
	if(0)
	{
		indir		<- paste(DATA, 'contigs_151026_wref', sep='/' )
		outdir		<- paste(DATA, 'contigs_151026_wref_cutstat', sep='/' )				
		argv		<<- cmd.haircut.cutstat(indir, outdir, batch.n=200, batch.id=1)
		argv		<<- paste('-',unlist(strsplit(argv,' -|\n')),sep='')
	}
	if(1)
	{
		indir.st		<- paste(DATA,'contigs_150408_wref_cutstat',sep='/')
		indir.al		<- paste(DATA,'contigs_150408_wref',sep='/')
		outdir			<- paste(DATA,'contigs_150408_model150816a',sep='/')
		indir.st		<- paste(DATA,'contigs_151026_wref_cutstat',sep='/')
		indir.al		<- paste(DATA,'contigs_151026_wref',sep='/')
		outdir			<- paste(DATA,'contigs_151026_model150816b',sep='/')
		CNF.contig.idx	<- 2
		argv			<<- cmd.haircut.call(indir.st, indir.al, outdir, CNF.contig.idx=CNF.contig.idx)
		argv			<<- paste('-',unlist(strsplit(argv,' -|\n')),sep='')
		
	}
	if(0)
	{
		#fixup mafft
		indir		<- paste(DATA, 'contigs_150408_merged_unaligned', sep='/' )
		outdir		<- paste(DATA, 'contigs_150408_wref2', sep='/' )
		batch.n		<- 200
		
		tmp			<- data.table(INFILE=list.files(indir, pattern='fasta$', recursive=T))
		tmp[, PNG_ID:= gsub('\\.fasta','',gsub('_cut|_raw','',INFILE))]
		tmp[, OUTFILE:= gsub('\\.fasta','_wRefs\\.fasta', gsub('_hiv|_HIV','',basename(INFILE)))]
		reffile		<- system.file(package="PANGEAhaircut", "HIV1_COM_2012_genome_DNA_WithExtraA1UG.fasta")
		
		i			<- tmp[, which(PNG_ID=='12559_1_13')]
		INFILE		<- tmp[i, INFILE]
		OUTFILE		<- tmp[i, OUTFILE]
		reffile		<- system.file(package="PANGEAhaircut", "HIV1_COM_2012_genome_DNA_WithExtraA1UG.fasta")
		cat(cmd.align.contigs.with.ref(paste(indir,'/',INFILE,sep=''), reffile, paste(outdir,'/',OUTFILE,sep='')))
		#
		indir.cut	<- '/Users/Oliver/Dropbox (Infectious Disease)/PANGEA_data/contigs_150408/HIVcontigs_cut'
		indir.raw	<- '/Users/Oliver/Dropbox (Infectious Disease)/PANGEA_data/contigs_150408/HIVcontigs_Raw'
		outdir		<- paste(DATA, 'contigs_150408_wref2', sep='/' )
		infiles		<- data.table(INFILECUT=list.files(indir.cut, pattern='fasta$', recursive=T))
		infiles[, PNG_ID:= gsub('_hiv','',gsub('\\.fasta','',gsub('_cut|_raw','',INFILECUT)))]
		tmp			<- data.table(INFILECUT=list.files(indir.raw, pattern='fasta$', recursive=T))
		tmp			<- data.table(INFILERAW=list.files(indir.raw, pattern='fasta$', recursive=T))
		tmp[, PNG_ID:= gsub('_hiv','',gsub('\\.fasta','',gsub('_cut|_raw','',INFILERAW)))]
		infiles		<- merge(infiles, tmp, all=TRUE, by='PNG_ID')
		infiles[, OUTFILE1:= paste(PNG_ID,'_c.fasta',sep='')]
		infiles[, OUTFILE2:= paste(PNG_ID,'_refc.fasta',sep='')]
		infiles[, OUTFILE3:= paste(PNG_ID,'_wRefs.fasta',sep='')]
		infiles[, OUTFILE4:= paste(PNG_ID,'_refr.fasta',sep='')]
		infiles[, OUTFILE5:= paste(PNG_ID,'_frclen.fasta',sep='')]
		
		i			<- infiles[, which(PNG_ID=='13554_1_14')]
		i			<- infiles[, which(PNG_ID=='13554_1_17')]
		#i			<- infiles[, which(PNG_ID=='12559_1_13')]
		
		INFILECUT	<- infiles[i, INFILECUT]
		INFILERAW	<- infiles[i, INFILERAW]
		OUTFILE1	<- infiles[i, OUTFILE1]
		OUTFILE2	<- infiles[i, OUTFILE2]
		OUTFILE3	<- infiles[i, OUTFILE3]
		OUTFILE4	<- infiles[i, OUTFILE4]
		OUTFILE5	<- infiles[i, OUTFILE5]
		reffile		<- system.file(package="PANGEAhaircut", "HIV1_COM_2012_genome_DNA_WithExtraA1UG.fasta")
		#reffilen7	<- '/Users/Oliver/Dropbox (Infectious\ Disease)/OR_Work/2015/2015_PANGEA_haircut/contigs_150408_wref2/HIV1_COM_n7.fasta'
		cmd			<- cmd.add.tag.to.fasta.names( paste(indir.cut,'/',INFILECUT,sep=''), paste(outdir,'/',OUTFILE1,sep=''), tag='_cut')
		cmd			<- paste(cmd, cmd.align.contigs.with.ref(paste(outdir,'/',OUTFILE1,sep=''), reffile, paste(outdir,'/',OUTFILE2,sep='')), sep='\n')
		cmd			<- paste(cmd, cmd.align.contigs.with.ref(paste(indir.raw,'/',INFILERAW,sep=''), paste(outdir,'/',OUTFILE2,sep=''), paste(outdir,'/',OUTFILE3,sep='')), sep='\n')
		cmd			<- paste(cmd, cmd.align.contigs.with.ref(paste(indir.raw,'/',INFILERAW,sep=''), paste(outdir,'/',OUTFILE2,sep=''), paste(outdir,'/',OUTFILE5,sep=''), options='--keeplength --op 0.1'), sep='\n')
		cmd			<- paste(cmd, cmd.align.contigs.with.ref(paste(indir.raw,'/',INFILERAW,sep=''), reffile, paste(outdir,'/',OUTFILE4,sep='')), sep='\n')		
		tmp			<- paste(outdir,'/',OUTFILE1,sep='')
		tmp			<- gsub(' ','\\ ',gsub('(','\\(',gsub(')','\\)',tmp,fixed=T),fixed=T),fixed=T)
		cmd			<- paste(cmd, '\n','rm ',tmp,sep='')		
		cat(cmd)		
	}
	if(0)
	{
		#	read just one file
		#	determine statistics for each contig after LTR		
		indir	<- '/Users/Oliver/Dropbox\ (Infectious Disease)/PANGEA_data/InterestingContigAlignments'
		outdir	<- '/Users/Oliver/Dropbox\ (Infectious Disease)/OR_Work/2015/2015_PANGEA_haircut'
		infiles	<- data.table(FILE=list.files(indir, pattern='fasta$', recursive=T))
		infiles[, PNG_ID:= gsub('_wRefs\\.fasta','',gsub('_cut|_raw','',FILE))]
		infiles[, BLASTnCUT:= regmatches(FILE,regexpr('cut|raw',FILE))]
		set(infiles, NULL, 'BLASTnCUT', infiles[, factor(BLASTnCUT, levels=c('cut','raw'), labels=c('Y','N'))])
		par		<- c('FRQx.quantile'=0.05, 'CNS_AGR.window'=200, 'GPS.window'=200)
		
		
		file	<- paste(indir, infiles[1, FILE], sep='/')
		#	read Contigs+Rrefs: cr
		cr		<- read.dna(file, format='fasta')			
		#	determine start of non-LTR position and cut 
		tmp		<- haircut.find.nonLTRstart(cr)
		cr		<- cr[, seq.int(tmp, ncol(cr))]
		#	determine reference sequences. 
		#	non-refs have the first part of the file name in their contig name and are at the top of the alignment
		tmp		<- strsplit(basename(file), '_')[[1]][1]
		tx		<- data.table(TAXON= rownames(cr), CONTIG=as.integer(grepl(tmp, rownames(cr))) )
		stopifnot( all( tx[, which(CONTIG==1)] == seq.int(1, tx[, length(which(CONTIG==1))]) ) )
		cat(paste('\nFound contigs, n=', tx[, length(which(CONTIG==1))]))
		#	determine base frequencies at each site amongst references.
		tmp		<- cr[subset(tx, CONTIG==0)[, TAXON],]
		cnsr	<- haircut.getconsensus(tmp, par, bases=c('a','c','g','t','-') )	#	CoNSensus of References: cnsr
		#	for each contig, determine %agreement with consensus on rolling window
		cnsc	<- rbind(cnsr, cr[subset(tx, CONTIG==1)[, TAXON],])
		#	determine first and last non-gap sites
		tx		<- data.table(	TAXON= rownames(cnsc), 
				FIRST= apply( as.character(cnsc), 1, function(x) which(x!='-')[1] ),
				LAST= ncol(cnsc)-apply( as.character(cnsc), 1, function(x) which(rev(x)!='-')[1] )		)
		#	get cut statistics
		cnsc.df	<- haircut.get.cut.statistics(cnsc, tx, par, outdir=NA, file=NA, mode='rolling')
		cnsc.df	<- haircut.get.cut.statistics(cnsc, tx, par, outdir=outdir, file=file, mode='rolling')		
	}
	if(0)	#extract just the curated contigs from file
	{
		indir		<- '/Users/Oliver/Dropbox (Infectious Disease)/PANGEA_data/contigs_150408/CuratedAlignmentsToRefs'
		outdir		<- '/Users/Oliver/Dropbox (Infectious Disease)/OR_Work/2015/2015_PANGEA_haircut/contigs_150408_model150816acur'
		infiles 	<- data.table(FILE=list.files(indir, pattern='\\.fasta$', recursive=T, include.dirs=T))
		infiles[, PNG_ID:= gsub('\\.fasta','',basename(FILE))]
		infiles[, CFILE:= paste(PNG_ID,'_curated.fasta',sep='')]
		#	extract just the curated contigs from file
		invisible(infiles[,{
					cat('\nprocess',FILE)
					#FILE	<- infiles[1,FILE]
					#CFILE	<- infiles[1,CFILE]
					#PNG_ID	<- infiles[1,PNG_ID] 
					cr				<- read.dna(paste(indir,'/',FILE,sep=''), format='fasta')
					cr				<- cr[grepl(PNG_ID,rownames(cr)),]
					rownames(cr)	<- paste(rownames(cr),'_cur',sep='')
					cr				<- seq.rmgaps(cr, rm.only.col.gaps=0, rm.char='-', verbose=0)		
					write.dna(cr, file=paste(outdir,CFILE,sep='/'), format='fasta', colsep='', nbcol=-1)
					NULL
				}, by='FILE'])
		#	align against automatically created contigs
		indir		<- '/Users/Oliver/Dropbox (Infectious Disease)/OR_Work/2015/2015_PANGEA_haircut/contigs_150408_model150816a'
		tmp		 	<- data.table(AFILE=list.files(indir, pattern='\\.fasta$', recursive=T, include.dirs=T))
		tmp[, PNG_ID:= gsub('_wref_nohair\\.fasta','',basename(AFILE))]		
		cat('\nNo Automated Contigs?\n', paste(setdiff(infiles[, PNG_ID], tmp[,PNG_ID]), collapse='\n'))
		cat('\nNo Curated Contigs?\n', paste(setdiff(tmp[, PNG_ID], tmp[,PNG_ID]), collapse='\n'))
		infiles		<- merge(infiles, tmp, by='PNG_ID')
		infiles[, OUTFILE:= paste(PNG_ID,'_nohair_wref_wcu.fasta',sep='')]
		
		tmp			<- infiles[, {
					#AFILE		<- infiles[1,AFILE]
					#CFILE		<- infiles[1,CFILE]
					#OUTFILE		<- infiles[1,OUTFILE]		
					list(CMD=cmd.align.contigs.with.ref(paste(outdir,'/',CFILE,sep=''), paste(indir,'/',AFILE,sep=''), paste(outdir,'/',OUTFILE,sep='')))			
				}, by='PNG_ID']
		cmd			<- paste(tmp$CMD, collapse='\n')
		cmd			<- cmd.hpcwrapper(cmd, hpc.nproc= 1, hpc.q='pqeelab', hpc.walltime=1, hpc.mem="5000mb")
		cat(cmd)		
		outfile		<- paste("hrct",paste(strsplit(date(),split=' ')[[1]],collapse='_',sep=''),sep='.')
		cmd.hpccaller(paste(DATA,"tmp",sep='/'), outfile, cmd)
		#PNG_ID	<- infiles[1,PNG_ID] 
		
	}
	if(0)
	{
		#	process several files
		indir	<- '/Users/Oliver/Dropbox\ (Infectious Disease)/PANGEA_data/InterestingContigAlignments'
		outdir	<- '/Users/Oliver/Dropbox\ (Infectious Disease)/OR_Work/2015/2015_PANGEA_haircut/interesting_150408'		
		par		<- c('FRQx.quantile'=NA, 'FRQx.thr'=NA, 'CNS_FRQ.window'=200, 'CNS_AGR.window'=200, 'GPS.window'=200)
		haircutwrap.get.cut.statistics(indir, par, outdir=outdir)
	}
	if(0)
	{
		#	run mafft --add to get contigs+ref
		indir		<- '/Users/Oliver/Dropbox\ (Infectious Disease)/OR_Work/2015/2015_PANGEA_haircut/contigs_150408_merged_unaligned'
		outdir		<- '/Users/Oliver/Dropbox\ (Infectious Disease)/OR_Work/2015/2015_PANGEA_haircut/contigs_150408_wref'
		reffile		<- '/Users/Oliver/Dropbox\ (Infectious Disease)/PANGEA_data/HIV1_COM_2012_genome_DNA_WithExtraA1UG.fasta'
		cmd			<- cmdwrap.align.contigs.with.ref(indir, reffile, outdir, batch.n=200, batch.id=2)
		cmd			<- cmdwrap.align.contigs.with.ref(indir, reffile, outdir)
		cmd			<- cmd.hpcwrapper(cmd, hpc.nproc= 1, hpc.q='pqeelab', hpc.walltime=4, hpc.mem="5000mb")
		cat(cmd)		
		outdir		<- paste(DATA,"tmp",sep='/')
		outfile		<- paste("hrct",paste(strsplit(date(),split=' ')[[1]],collapse='_',sep=''),sep='.')
		cmd.hpccaller(outdir, outfile, cmd)	
		#haircutwrap.align.contigs.with.ref(indir, reffile, outdir)
	}	
	if(0)
	{
		indir	<- '/Users/Oliver/Dropbox\ (Infectious Disease)/OR_Work/2015/2015_PANGEA_haircut/contigs_150408_wref'
		outfile	<- '/Users/Oliver/Dropbox\ (Infectious Disease)/OR_Work/2015/2015_PANGEA_haircut/contigs_150408_cutsubraw.R'
		txe		<- haircutwrap.get.subset.among.raw.and.cut.contigs(indir, outfile)
	}
	if(0)	# get training data
	{	
		#	get contigs that are to be used for training: this determines the 1's and excludes some 0's
		#	read curated contigs 
		indir	<- '/Users/Oliver/Dropbox\ (Infectious Disease)/PANGEA_data/contigs_150408/CuratedAlignmentsToRefs'
		outfile	<- '/Users/Oliver/Dropbox\ (Infectious Disease)/OR_Work/2015/2015_PANGEA_haircut/contigs_150408_curated.R'
		ctrain	<- haircut.get.curated.contigs(indir, outfile)		
		setnames(ctrain, c('FILE','LEN','FIRST','LAST'),c('ANS_FILE','ANS_LEN','ANS_FIRST','ANS_LAST'))
		#	remove cut & raw contigs that are identical as double 1's and as 0's
		indir	<- '/Users/Oliver/Dropbox\ (Infectious Disease)/OR_Work/2015/2015_PANGEA_haircut/contigs_150408_wref'
		outfile	<- '/Users/Oliver/Dropbox\ (Infectious Disease)/OR_Work/2015/2015_PANGEA_haircut/contigs_150408_trainingset_subsets.R'
		ctrain	<- haircut.get.training.contigs(indir, outfile, ctrain)
		set(ctrain, NULL, 'CUT', ctrain[, factor(CUT, levels=c('cut','raw'), labels=c('Y','N'))])
		setnames(ctrain, 'CUT', 'BLASTnCUT')		
		#	now create training data set: 
		#	expand the curated contigs in the training data to 1's per site, and
		#	expand all non-existing curated contigs to 0's, provided they are not excluded because identical to a 1
		indir	<- '/Users/Oliver/Dropbox\ (Infectious Disease)/OR_Work/2015/2015_PANGEA_haircut/contigs_150408_wref_cutstat'
		outdir	<- '/Users/Oliver/Dropbox\ (Infectious Disease)/OR_Work/2015/2015_PANGEA_haircut/contigs_150408_train'
		outfile	<- 'contigs_150408_train'
		par		<- c(	'FRQx.quantile'=NA, 'FRQx.thr'=NA, 'CNS_FRQ.window'=200, 'CNS_AGR.window'=200, 'GPS.window'=200, 
						'PRCALL.thrmax'=0.8, 'PRCALL.thrstd'=10, 'PRCALL.cutprdcthair'=150, 'PRCALL.cutprdctcntg'=50, 'PRCALL.cutrawgrace'=100, 'PRCALL.rmintrnlgpsblw'=100 ,'PRCALL.rmintrnlgpsend'=9700)		
		haircut.get.training.data(indir, ctrain, par, outdir, outfile)
	}
	if(0)	#	fit model
	{
		indir	<- '/Users/Oliver/Dropbox\ (Infectious Disease)/OR_Work/2015/2015_PANGEA_haircut/contigs_150408_train'
		outfile	<- '/Users/Oliver/Dropbox\ (Infectious Disease)/OR_Work/2015/2015_PANGEA_haircut/model_150816a.R'
		tmp		<- haircut.get.fitted.model.150816a(indir, outfile)
	}
	if(0)	#	call contigs on training data and plot
	{		
		#mfile	<- '/Users/Oliver/Dropbox\ (Infectious Disease)/OR_Work/2015/2015_PANGEA_haircut/model_150816a.R'
		#	get model coefficients across the chunks
		tmp						<- haircut.get.fitted.model.150816a()
		ctrmc					<- tmp$coef		
		predict.fun				<- tmp$predict
		#	get contigs that were used for training		
		outfile	<- '/Users/Oliver/Dropbox\ (Infectious Disease)/OR_Work/2015/2015_PANGEA_haircut/contigs_150408_trainingset_subsets.R'
		ctrain	<- haircut.get.training.contigs(NULL, outfile, NULL)
		set(ctrain, NULL, 'CUT', ctrain[, factor(CUT, levels=c('cut','raw'), labels=c('Y','N'))])
		setnames(ctrain, 'CUT', 'BLASTnCUT')		
		#	get covariates for all contigs
		indir.st<- '/Users/Oliver/Dropbox\ (Infectious Disease)/OR_Work/2015/2015_PANGEA_haircut/contigs_150408_wref_cutstat'
		indir.al<- '/Users/Oliver/Dropbox\ (Infectious Disease)/OR_Work/2015/2015_PANGEA_haircut/contigs_150408_wref'
		outdir	<- '/Users/Oliver/Dropbox\ (Infectious Disease)/OR_Work/2015/2015_PANGEA_haircut/contigs_150408_model150816a'
		par		<- c(	'FRQx.quantile'=NA, 'FRQx.thr'=NA, 'CNS_FRQ.window'=200, 'CNS_AGR.window'=200, 'GPS.window'=200, 
						'PRCALL.thrmax'=0.8, 'PRCALL.thrstd'=10, 'PRCALL.cutprdcthair'=100, 'PRCALL.cutrawgrace'=100, 'PRCALL.rmintrnlgpsblw'=100 ,'PRCALL.rmintrnlgpsend'=9700)
		haircutwrap.get.call.for.PNG_ID(indir.st,indir.al,outdir,ctrmc,ctrev,predict.fun,par,ctrain=ctrain)
	}
	if(0)	# evaluate training data
	{	
		#	
		indir	<- '/Users/Oliver/Dropbox\ (Infectious Disease)/OR_Work/2015/2015_PANGEA_haircut/contigs_150408_train'
		outfile	<- 'model_150811a.R'
		#
		#	find 20 worst false pos
		#
		ctrfp	<- do.call('rbind', lapply(seq(1,10000,200), function(site)
				{
					ctr		<- haircut.calculate.training.data(indir, site)	
					tmp		<- seq(ctr[, min(SITE)-1],ctr[, max(SITE)],10)
					ctr[, CHUNK:=cut(SITE, breaks=tmp, labels=tmp[-length(tmp)])]	
					do.call('rbind',lapply( ctr[, unique(CHUNK)], function(chunk){
								ctrch	<- subset(ctr, CHUNK==chunk)
								ctrchfp	<- subset( ctrch, ANS_CALL==0 & AGRpc>subset( ctrch, ANS_CALL==1 )[, min(AGRpc)])
								ctrchfp	<- ctrchfp[, list(ANS_CALL=mean(ANS_CALL), AGRpc=mean(AGRpc), GPS=mean(GPS), CNS_FRQr=mean(CNS_FRQr)), by=c('PNG_ID','TAXON')]
								setkey(ctrchfp, AGRpc)
								ctrchfp[ seq.int(max(1,nrow(ctrchfp)-20), nrow(ctrchfp)), ]	
							}))										
				}))		
		save(ctrfp, file= paste(indir,'/fp.R',sep=''))
		setkey(ctrfp, PNG_ID, TAXON)
		ctrfp[, length(unique(TAXON))]
		#
		#	show all three types of information across sites
		#
		invisible(do.call('rbind', lapply(seq(1,10000,200), function(site)
					{
						cat('\nProcess',site)
						ctr		<- haircut.calculate.training.data(indir, site)	
						tmp		<- seq(ctr[, min(SITE)-1],ctr[, max(SITE)],10)
						ctr[, CHUNK:=cut(SITE, breaks=tmp, labels=tmp[-length(tmp)])]
						ggplot(ctr, aes(x=GPS, y=FRQ, colour=factor(ANS_CALL, levels=c(0,1), labels=c('N','Y')))) + 
								geom_point(alpha=0.6, size=1) + 
								facet_wrap(~CHUNK, ncol=5) +
								theme_bw() + labs(x='%gappiness', y='%agreement with consensus\nline: %agreement of consensus with references', colour='In curated contigs') + theme(legend.position='bottom')
						ggsave(file=paste(outdir,'/',outfile,'_ANSCALLBYFRQGPS_SITE',site,'.pdf',sep=''), w=9, h=9)						
					})))
		#
		#	calculate model coefficients across the chunks
		#
		tmp						<- haircut.get.fitted.model.150811a(indir, outfile)
		ctrmc					<- tmp$coef
		ctrev					<- tmp$ev
		model.150811a.predict	<- tmp$predict
	
		
		
		
		#	select threshold
		ctrt		<- subset(ctrev, THR==0.8)
		ggplot(melt(ctrt, id.vars=c('CHUNK'), measure.vars=c('THR','SENS','SPEC','FDR','FOR')), aes(x=CHUNK, y=value, colour=variable)) + geom_line() +
					scale_x_continuous(breaks=seq(0,20e3,1e3), expand=c(0,0)) +
					scale_y_continuous(breaks=seq(0,1,0.2), minor_breaks=seq(0,1,0.05)) +
					theme_bw() + labs(x='base relative to HXB2') + theme(panel.grid.major=element_line(colour="grey50", size=0.2), panel.grid.minor=element_line(colour="grey70", size=0.1))
		ggsave(file=paste(outdir,'/',outfile,'_model.150811a_THR_const80.pdf',sep=''), w=10, h=4)
		#
		setkey(ctrev, CHUNK, FDR)
		ctrt		<- ctrev[, {
					z		<- which(FDR<= max(0.01, min(FDR, na.rm=T)))[1]
					if(!is.finite(z) | THR[z]<0.8)
						z	<- which(THR==0.8)
					list(THR=THR[z], SENS=SENS[z], SPEC=SPEC[z], FDR=FDR[z], FOR=FOR[z])
				}, by='CHUNK']
		ggplot(melt(ctrt, id.vars=c('CHUNK'), measure.vars=c('THR','SENS','SPEC','FDR','FOR')), aes(x=CHUNK, y=value, colour=variable)) + geom_line() +
				scale_x_continuous(breaks=seq(0,20e3,1e3), expand=c(0,0)) +
				scale_y_continuous(breaks=seq(0,1,0.2), minor_breaks=seq(0,1,0.05)) +
				theme_bw() + labs(x='base relative to HXB2') + theme(panel.grid.major=element_line(colour="grey50", size=0.2), panel.grid.minor=element_line(colour="grey70", size=0.1))
		ggsave(file=paste(outdir,'/',outfile,'_model.150811a_THR_BYFDR01.pdf',sep=''), w=10, h=4)
		#	just const threshold??



		tmp		<- ctrev[, list(FDR=min(FDR)), by='CHUNK']
		ggplot(tmp, aes(x=FDR)) + geom_histogram()
		subset(tmp, FDR>0.05)
		
		ctr		<- haircut.calculate.training.data(indir, 1)	
		tmp		<- seq(ctr[, min(SITE)-1],ctr[, max(SITE)],10)
		ctr[, CHUNK:=cut(SITE, breaks=tmp, labels=tmp[-length(tmp)])]
		
		ctrp	<- merge(ctr, ctrmc, by='CHUNK')
		ctrp[, PR_CALL:=model.150811a.predict(AGRpc, GPS, BETA0, BETA1, BETA2)]
		setkey(ctrp, CHUNK)
		ctrev	<- as.data.table(expand.grid(THR= seq(0.05,0.95,0.01), CHUNK= as.character(ctrp[, unique(CHUNK)]), stringsAsFactors=FALSE))
		ctrev	<- ctrev[, {
					tmp	<- ctrp[CHUNK,][,table(ANS_CALL, factor(PR_CALL>=THR, levels=c('TRUE','FALSE'), labels=c('TRUE','FALSE')))]
					list(TP= tmp['1','TRUE'], FP= tmp['0','TRUE'], FN= tmp['1','FALSE'], TN= tmp['0','FALSE'] )					
				}, by=c('CHUNK','THR')]
		ctrev	<- merge(ctrev, ctrev[, list(SENS= TP/(TP+FN), SPEC=TN/(TN+FP), FDR=FP/(FP+TP), FOR=FN/(FN+TN)), by=c('CHUNK','THR')], by=c('CHUNK','THR'))
		#	plot Sensitivity & Specificity
		ggplot(melt(ctrev, id.vars=c('CHUNK','THR'), measure.vars=c('SENS','SPEC','FDR','FOR')), aes(x=THR, y=100*value, colour=CHUNK)) + 
				geom_line() + labs(x='threshold on predicted call probability',y='%', colour='site\n(base relative to HXB2)') +
				theme(legend.position='bottom') + facet_wrap(~variable, ncol=2, scales='free') +
				guides(col = guide_legend(ncol=5, byrow=TRUE))
		ggsave(file=paste(outdir,'/',outfile,'_model.150811a_SensSpec_SITE',site,'.pdf',sep=''), w=9, h=9)
		#
		ctrev
		
		ctrp[CHUNK, table(ANS_CALL, PR_CALL>=THR)]
		

		#	calculate model coefficients across the chunks		
		ctrmc	<- do.call('rbind',lapply(ctr[, unique(CHUNK)], function(chunk)
				{
					ctrch	<- subset(ctr, CHUNK==chunk)
					ctrchm	<- gamlss(ANS_CALL~AGRpc+GPS, data=ctrch, family=BI())				#as good as 'AGRpc+GPS+CNS_FRQr' in terms of FN, FP
					ctrchmc	<- data.table(CHUNK=chunk, BETA0=coef(ctrchm)[1], BETA1=coef(ctrchm)[2], BETA2=coef(ctrchm)[3])					
				}))
		#	evaluate FN FP
		#	return the whole lot
		#	determine threshold based on FN FP, and make call
	
		
		
		#	add rolling mean by AGRpc
		site	<- 1610; chunk<- '1610'
		site	<- 9350; chunk<- '9350'
		site	<- 3650; chunk<- '3650'
		ctr		<- haircut.load.training.data(indir, site)		
		tmp		<- seq(ctr[, min(SITE)-1],ctr[, max(SITE)],10)
		ctr[, CHUNK:=cut(SITE, breaks=tmp, labels=tmp[-length(tmp)])]
		ctrch	<- subset(ctr, CHUNK==chunk)
		setkey(ctrch, FRQ)
		ctrch[, ANS_CALLrm:=ctrch[, rollapply(ANS_CALL, width=100, FUN=mean, align="center", partial=TRUE)]]	
		
		#setkey(ctrch, GPSc, AGRpc)
		#ctrch	<- merge(ctrch, ctrch[, list(TAXON=TAXON, SITE=SITE, BLASTnCUT=BLASTnCUT, ANS_CALLrm2=rollapply(ANS_CALL, width=100, FUN=mean, align="center", partial=TRUE)), by='GPSc'] , by=c('TAXON','SITE','BLASTnCUT','GPSc'))		
		#tmp		<- seq(0,1.01,0.01)
		#ctr[, GPSC:= cut(GPS, breaks=tmp, labels=tmp[-length(tmp)])]
		#ctr[, AGRpcC:= cut(AGRpc, breaks=tmp, labels=tmp[-length(tmp)])]		
		
		ctrchm	<- gamlss(ANS_CALL~FRQ, data=ctrch, family=BI())
		ctrch[, PR_CALL:= predict(ctrchm, type='response', what='mu')]
		ggplot(melt(ctrch, id.vars=c('FRQ','GPS'), measure.vars=c('ANS_CALLrm','PR_CALL')), aes(x=FRQ, y=value, colour=variable)) + geom_line()
		ctrchm2	<- gamlss(ANS_CALL~FRQ, sigma.formula=~FRQ, data=ctrch, family=BB(), i.control=glim.control(cyc=100,cc=1e-4))
		ctrch[, PR_CALL2:= predict(ctrchm2, type='response', what='mu')]
		ggplot(melt(ctrch, id.vars=c('FRQ','GPS'), measure.vars=c('ANS_CALLrm','PR_CALL','PR_CALL2')), aes(x=FRQ, y=value, colour=variable)) + geom_line()
		
		ctrchs	<- ctrch[, {
					z	<- sample(length(AGRpc), min(length(AGRpc),50))
					list(AGRpc=AGRpc[z], GPS=GPS[z], ANS_CALL=ANS_CALL[z], CHUNK=CHUNK[z])	
				}, by=c('AGRpcC','GPSC')]
		
		ctrchms	<- gamlss(ANS_CALL~AGRpc+GPS, data=ctrchs, family=BI())
		ctrchm2	<- gamlss(ANS_CALL~AGRpc+GPS, data=ctrchs, family=BB())
		ctrchm3	<- gamlss(ANS_CALL~AGRpc+GPS, sigma.formula=~AGRpc+GPS, data=ctrchs, family=BB())
		#	predict sigma: how does it look?
		ctrchsp	<- as.data.table(expand.grid(AGRpc=seq(0.01,0.99,0.01), GPS=seq(0.1,0.9,0.1)))
		mu		<- predict(ctrchm2, newdata=ctrchsp, what='mu', type='response', se.fit=TRUE)
		
		predict(ctrchm2, what='mu', type='response', se.fit=TRUE)
		max(predict(ctrchm2, what='mu', type='response', se.fit=TRUE)$se.fit)
		
		ctrchsp[, mu:=mu]
		ctrchsp[, TYPE:='s']
		ctrchp	<- as.data.table(expand.grid(AGRpc=seq(0.01,0.99,0.01), GPS=seq(0.1,0.9,0.1)))
		mu		<- predict(ctrchm, newdata=ctrchp, what='mu', type='response')
		ctrchp[, mu:=mu]
		ctrchp[, TYPE:='a']
		ctrchp	<- rbind(ctrchp, ctrchsp)
		ggplot(ctrchp, aes(x=AGRpc, y=mu, group=TYPE, colour=TYPE)) + geom_line() + facet_grid(GPS~.)
		sigma	<- predict(ctrchm3, newdata=ctrchp, what='sigma', type='response')				
		ctrchp[, mu:=mu]
		ctrchp[, sigma:=sigma]
		ctrchp[, varn1:=mu*(1-mu)]
		
		
		
		#ctrchm	<- gamlss(ANS_CALL~AGRpc+bs(GPSc,df=3), data=ctrch, family=BI())
		
		
		
		
		ctrchm	<- gamlss(ANS_CALL~AGRpc, data=ctrch, family=BI())				#as good as 'AGRpc+GPS+CNS_FRQr' in terms of FN, FP
		ctrchmc	<- data.table(CHUNK=chunk,BETA0=coef(ctrchm)[1], BETA1=coef(ctrchm)[2])
		#ctrch[, PR_CALL:= predict(ctrchm, type='response', what='mu')]
		ctrch	<- merge(ctrch, ctrchmc, by='CHUNK')
		ctrch[, PR_CALL2:= exp(BETA0+BETA1*AGRpc)/(exp(BETA0+BETA1*AGRpc)+1)]
		
		
		ggplot(melt(ctrch, id.vars='AGRpc', measure.vars=c('ANS_CALLrm','PR_CALL')), aes(x=AGRpc, y=value, colour=variable)) + geom_line()
		ggsave(file=paste(outdir,'/',outfile,'_SITE',chunk,'_ANS_CALLrmBYAGRpc.pdf',sep=''), w=6, h=4)
		
		
		plot(ctrchm)	#looks pretty good
		Rsq(ctrchm)		#60%
		AIC(ctrchm)
		
		
		ggplot(ctrch, aes(x=PR_CALL, y=ANS_CALL)) + geom_point()
		#	get sens and spec
		ctrev	<- data.table(THR= seq(0.05,0.95,0.01))
		ctrev	<- ctrev[, {
					tmp	<- ctrch[, table(ANS_CALL, PR_CALL>=THR)]
					list(TP= tmp['1','TRUE'], FP= tmp['0','TRUE'], FN= tmp['1','FALSE'], TN= tmp['0','FALSE'] )					
				}, by='THR']
		ctrev	<- merge(ctrev, ctrev[, list(SENS= TP/(TP+FN), SPEC=TN/(TN+FP), FDR=FP/(FP+TP), FOR=FN/(FN+TN)), by='THR'], by='THR')
		ggplot(melt(ctrev, id.vars='THR', measure.vars=c('SENS','SPEC','FDR','FOR')), aes(x=THR, y=value, colour=variable)) + geom_line() + theme(legend.position='bottom')
		ggsave(file=paste(outdir,'/',outfile,'_SITE',chunk,'_SensSpec.pdf',sep=''), w=4, h=4)
		ggplot(ctrev, aes(x=1-SPEC, y=SENS)) + geom_line() + scale_x_continuous(limit=c(0,1),expand=c(0,0)) + scale_y_continuous(limit=c(0,1),expand=c(0,0))
		ggsave(file=paste(outdir,'/',outfile,'_SITE',chunk,'_ROC.pdf',sep=''), w=4, h=4)
		#ctrchm2	<- gamlss(ANS_CALL~AGRpc+GPS+CNS_FRQr, data=ctrch, family=BB())	#BB does not always converge
	}
}
##--------------------------------------------------------------------------------------------------------
pipeline.various<- function()
{
	if(0)
	{		
		indir.cut	<- paste(DATA, 'contigs_150902_unaligned_cut', sep='/' )
		indir.raw	<- paste(DATA, 'contigs_150902_unaligned_raw', sep='/' )
		outdir		<- paste(DATA,'contigs_150902_model150816b',sep='/')
		batch.n		<- 200
		tmp			<- data.table(FILE=list.files(indir.raw, pattern='fasta$', recursive=T))
		tmp[, BATCH:= ceiling(seq_len(nrow(tmp))/batch.n)]
		tmp			<- tmp[, max(BATCH)]
		for(batch.id in seq.int(1,tmp))
		{	
			cmd			<- cmd.haircut.pipeline(indir.cut, indir.raw, outdir, batch.n=batch.n, batch.id=batch.id)
			cmd			<- cmd.hpcwrapper(cmd, hpc.nproc= 1, hpc.q='pqeelab', hpc.walltime=1, hpc.mem="5000mb")
			#cat(cmd)		
			outfile		<- paste("hrct",paste(strsplit(date(),split=' ')[[1]],collapse='_',sep=''),sep='.')
			cmd.hpccaller(paste(DATA,"tmp",sep='/'), outfile, cmd)	
		}
	}
	if(0)
	{		
		indir.cut	<- paste(DATA, 'contigs_150408_unaligned_cut', sep='/' )
		indir.raw	<- paste(DATA, 'contigs_150408_unaligned_raw', sep='/' )
		outdir		<- paste(DATA,'contigs_150408_model150816b',sep='/')
		batch.n		<- 200
		tmp			<- data.table(FILE=list.files(indir.raw, pattern='fasta$', recursive=T))
		tmp[, BATCH:= ceiling(seq_len(nrow(tmp))/batch.n)]
		tmp			<- tmp[, max(BATCH)]
		for(batch.id in seq.int(1,tmp))
		{	
			cmd			<- cmd.haircut.pipeline(indir.cut, indir.raw, outdir, batch.n=batch.n, batch.id=batch.id)
			cmd			<- cmd.hpcwrapper(cmd, hpc.nproc= 1, hpc.q='pqeelab', hpc.walltime=1, hpc.mem="5000mb")
			#cat(cmd)		
			outfile		<- paste("hrct",paste(strsplit(date(),split=' ')[[1]],collapse='_',sep=''),sep='.')
			cmd.hpccaller(paste(DATA,"tmp",sep='/'), outfile, cmd)	
		}
	}
	if(0)
	{		
		indir.cut	<- paste(DATA, 'contigs_150408_unaligned_cut', sep='/' )
		indir.raw	<- paste(DATA, 'contigs_150408_unaligned_raw', sep='/' )
		outdir		<- paste(DATA, 'contigs_150408_wref', sep='/' )
		indir.cut	<- paste(DATA, 'contigs_150902_unaligned_cut', sep='/' )
		indir.raw	<- paste(DATA, 'contigs_150902_unaligned_raw', sep='/' )
		outdir		<- paste(DATA, 'contigs_150902_wref', sep='/' )		
		indir.cut	<- paste(DATA, 'contigs_151026_unaligned_cut', sep='/' )
		indir.raw	<- paste(DATA, 'contigs_151026_unaligned_raw', sep='/' )
		outdir		<- paste(DATA, 'contigs_151026_wref', sep='/' )		
		
		batch.n		<- 100
		tmp			<- data.table(FILE=list.files(indir.raw, pattern='fasta$', recursive=T))
		tmp[, BATCH:= ceiling(seq_len(nrow(tmp))/batch.n)]
		tmp			<- tmp[, max(BATCH)]
		for(batch.id in seq.int(1,tmp))
		{	
			cmd			<- cmdwrap.align.contigs.with.ref(indir.cut, indir.raw, outdir, batch.n=batch.n, batch.id=batch.id)
			cmd			<- cmd.hpcwrapper(cmd, hpc.nproc= 1, hpc.q='pqeph', hpc.walltime=1, hpc.mem="1000mb")
			cat(cmd)		
			outfile		<- paste("hrct",paste(strsplit(date(),split=' ')[[1]],collapse='_',sep=''),sep='.')
			cmd.hpccaller(DATA, outfile, cmd)	
		}
	}
	if(0)
	{
		indir		<- paste(DATA, 'contigs_150408_wref', sep='/' )
		outdir		<- paste(DATA, 'contigs_150408_wref_cutstat', sep='/' )
		indir		<- paste(DATA, 'contigs_150902_wref', sep='/' )
		outdir		<- paste(DATA, 'contigs_150902_wref_cutstat', sep='/' )		
		indir		<- paste(DATA, 'contigs_151026_wref', sep='/' )
		outdir		<- paste(DATA, 'contigs_151026_wref_cutstat', sep='/' )		
		
		batch.n		<- 100
		tmp			<- data.table(FILE=list.files(indir, pattern='wRefs\\.fasta$', recursive=T))
		tmp[, BATCH:= ceiling(seq_len(nrow(tmp))/batch.n)]
		tmp			<- tmp[, max(BATCH)]
		for(batch.id in seq.int(1,tmp))
		{			
			cmd			<- cmd.haircut.cutstat(indir, outdir, batch.n=batch.n, batch.id=batch.id)
			cmd			<- cmd.hpcwrapper(cmd, hpc.nproc= 1, hpc.q='pqeelab', hpc.walltime=4, hpc.mem="5000mb")
			cat(cmd)		
			cmd.hpccaller(DATA, paste("hrct",paste(strsplit(date(),split=' ')[[1]],collapse='_',sep=''),sep='.'), cmd)	
		}	
	}
	if(1)
	{
		indir.st		<- paste(DATA,'contigs_150408_wref_cutstat',sep='/')
		indir.al		<- paste(DATA,'contigs_150408_wref',sep='/')
		outdir			<- paste(DATA,'contigs_150408_model150816a',sep='/')
		trainfile		<- paste(DATA,'contigs_150408_trainingset_subsets.R',sep='/')
		indir.st		<- paste(DATA,'contigs_150902_wref_cutstat',sep='/')
		indir.al		<- paste(DATA,'contigs_150902_wref',sep='/')
		outdir			<- paste(DATA,'contigs_150902_model150816b',sep='/')
		indir.st		<- paste(DATA,'contigs_151026_wref_cutstat',sep='/')
		indir.al		<- paste(DATA,'contigs_151026_wref',sep='/')
		outdir			<- paste(DATA,'contigs_151026_model150816b',sep='/')		
		trainfile		<- NA
		CNF.contig.idx	<- 2		
		batch.n			<- 50
		
		tmp			<- data.table(INFILE=list.files(indir.al, pattern='wRefs\\.fasta$', recursive=T))
		tmp[, BATCH:= ceiling(seq_len(nrow(tmp))/batch.n)]
		tmp			<- tmp[, max(BATCH)]
		for(batch.id in seq.int(1,tmp))
		{			
			cmd			<- cmd.haircut.call(indir.st, indir.al, outdir, trainfile=trainfile, batch.n=batch.n, batch.id=batch.id, CNF.contig.idx=CNF.contig.idx)
			cmd			<- cmd.hpcwrapper(cmd, hpc.nproc= 1, hpc.q='pqeelab', hpc.walltime=4, hpc.mem="5000mb")
			cat(cmd)		
			cmd.hpccaller(DATA, paste("hrct",paste(strsplit(date(),split=' ')[[1]],collapse='_',sep=''),sep='.'), cmd)	
		}	
	}
	if(0)
	{
		#haircut.QC.flatten.curated()
		haircut.QC.flatten.automated()
		haircut.QC.align.curated()
	}
}
##--------------------------------------------------------------------------------------------------------
##	HAIRCUT program, version 15086 to: 
##	- align contigs to references
##	- calculate and save haircut statistics
##--------------------------------------------------------------------------------------------------------
prog.haircut.150806<- function()
{	
	if(0)
	{
		indir		<- paste(DATA, 'contigs_150408_wref', sep='/' )
		outdir		<- paste(DATA, 'contigs_150408_wref_cutstat', sep='/' )		
		par			<- c('FRQx.quantile'=NA, 'FRQx.thr'=NA, 'CNS_FRQ.window'=200, 'CNS_AGR.window'=200, 'GPS.window'=200)
		batch.n		<- NA
		batch.id	<- NA
		haircutwrap.get.cut.statistics(indir, par, outdir=outdir, batch.n=batch.n, batch.id=batch.id)
	}
	if(0)
	{
		indir		<- paste(DATA, 'contigs_150408_wref', sep='/' )
		outdir		<- paste(DATA, 'contigs_150408_wref_cutstat', sep='/' )		
		batch.n		<- 200
		tmp			<- data.table(FILE=list.files(indir, pattern='fasta$', recursive=T))
		tmp[, BATCH:= ceiling(seq_len(nrow(tmp))/batch.n)]
		tmp			<- tmp[, max(BATCH)]
		for(batch.id in seq.int(1,tmp))
		{	
			
			cmd			<- cmd.haircut.pipeline(indir, outdir, batch.n=batch.n, batch.id=batch.id)
			cmd			<- cmd.hpcwrapper(cmd, hpc.nproc= 1, hpc.q='pqeelab', hpc.walltime=4, hpc.mem="5000mb")
			cat(cmd)		
			cmd.hpccaller(paste(DATA,"tmp",sep='/'), paste("hrct",paste(strsplit(date(),split=' ')[[1]],collapse='_',sep=''),sep='.'), cmd)	
		}	
	}
	if(0)
	{
		indir		<- paste(DATA, 'contigs_151026_wref', sep='/' )
		outdir		<- paste(DATA, 'contigs_151026_wref_cutstat', sep='/' )				
		cat(cmd.haircut.cutstat(indir, outdir, batch.n=200, batch.id=1))		
	}
	if(0)
	{
		#	get model coefficients across the chunks
		mfile					<- paste(DATA,'model_150816a.R',sep='/')		
		tmp						<- haircut.get.fitted.model.150816a(NA, mfile)
		ctrmc					<- tmp$coef		
		predict.fun				<- tmp$predict
		#	get contigs that were used for training
		outfile	<- paste(DATA,'contigs_150408_trainingset_subsets.R',sep='/')
		ctrain	<- haircut.get.training.contigs(NA, outfile, NA)
		set(ctrain, NULL, 'CUT', ctrain[, factor(CUT, levels=c('cut','raw'), labels=c('Y','N'))])
		setnames(ctrain, 'CUT', 'BLASTnCUT')		
		#	get covariates for all contigs
		indir.st<- paste(DATA,'contigs_150408_wref_cutstat',sep='/')
		indir.al<- paste(DATA,'contigs_150408_wref',sep='/')
		outdir	<- paste(DATA,'contigs_150408_model150816a',sep='/')
		par		<- c(	'FRQx.quantile'=NA, 'FRQx.thr'=NA, 'CNS_FRQ.window'=200, 'CNS_AGR.window'=200, 'GPS.window'=200, 
				'PRCALL.thrmax'=0.8, 'PRCALL.thrstd'=10, 'PRCALL.cutprdcthair'=150, 'PRCALL.cutprdctcntg'=50, 'PRCALL.cutrawgrace'=100, 'PRCALL.rmintrnlgpsblw'=100 ,'PRCALL.rmintrnlgpsend'=9700,
				'PRCALL.mxgpinref'=100)
		haircutwrap.get.call.for.PNG_ID(indir.st,indir.al,outdir,ctrmc,predict.fun,par,ctrain=ctrain)
	}
}
##--------------------------------------------------------------------------------------------------------
##	HAIRCUT program to call parts of contigs
##--------------------------------------------------------------------------------------------------------
haircutprog.get.call.for.PNG_ID<- function()
{
	verbose			<- 1
	mfile			<- paste(DATA,'model_150816a.R',sep='/')
	trainfile		<- paste(DATA,'contigs_150408_trainingset_subsets.R',sep='/')
	indir.st		<- paste(DATA,'contigs_150408_wref_cutstat',sep='/')
	indir.al		<- paste(DATA,'contigs_150408_wref',sep='/')
	outdir			<- paste(DATA,'contigs_150408_model150816a',sep='/')
	batch.n			<- NA
	batch.id		<- NA
	
	if(exists("argv"))
	{
		#	args input
		tmp<- na.omit(sapply(argv,function(arg)
						{	switch(substr(arg,2,6),
									mfile= return(substr(arg,8,nchar(arg))),NA)	}))
		if(length(tmp)>0) mfile<- tmp[1]		
		tmp<- na.omit(sapply(argv,function(arg)
						{	switch(substr(arg,2,10),
									trainfile= return(substr(arg,12,nchar(arg))),NA)	}))
		if(length(tmp)>0) trainfile<- tmp[1]		
		tmp<- na.omit(sapply(argv,function(arg)
						{	switch(substr(arg,2,9),
									indir.st= return(substr(arg,11,nchar(arg))),NA)	}))
		if(length(tmp)>0) indir.st<- tmp[1]		
		tmp<- na.omit(sapply(argv,function(arg)
						{	switch(substr(arg,2,9),
									indir.al= return(substr(arg,11,nchar(arg))),NA)	}))
		if(length(tmp)>0) indir.al<- tmp[1]
		tmp<- na.omit(sapply(argv,function(arg)
						{	switch(substr(arg,2,7),
									outdir= return(substr(arg,9,nchar(arg))),NA)	}))
		if(length(tmp)>0) outdir<- tmp[1]			
		tmp<- na.omit(sapply(argv,function(arg)
						{	switch(substr(arg,2,8),
									batch.n= return(as.numeric(substr(arg,10,nchar(arg)))),NA)	}))
		if(length(tmp)>0) batch.n<- tmp[1]
		tmp<- na.omit(sapply(argv,function(arg)
						{	switch(substr(arg,2,9),
									batch.id= return(as.numeric(substr(arg,11,nchar(arg)))),NA)	}))
		if(length(tmp)>0) batch.id<- tmp[1]
	}
	if(verbose)
	{
		cat('\ninput args\n',paste(mfile, trainfile, indir.st, indir.al, outdir, batch.n, batch.id, sep='\n'))
	}	
	tmp						<- haircut.get.fitted.model.150816a(NULL, mfile)
	ctrmc					<- tmp$coef		
	predict.fun				<- tmp$predict
	#	get contigs that were used for training
	ctrain	<- NULL
	if(!is.na(trainfile))
	{
		ctrain	<- haircut.get.training.contigs(NULL, trainfile, NULL)
		set(ctrain, NULL, 'CUT', ctrain[, factor(CUT, levels=c('cut','raw'), labels=c('Y','N'))])
		setnames(ctrain, 'CUT', 'BLASTnCUT')				
	}
	#	get covariates for all contigs
	par		<- c(	'FRQx.quantile'=NA, 'FRQx.thr'=NA, 'CNS_FRQ.window'=200, 'CNS_AGR.window'=200, 'GPS.window'=200, 
			'PRCALL.thrmax'=0.8, 'PRCALL.thrstd'=10, 'PRCALL.cutprdcthair'=150, 'PRCALL.cutprdctcntg'=50, 'PRCALL.cutrawgrace'=100, 'PRCALL.rmintrnlgpsblw'=100 ,'PRCALL.rmintrnlgpsend'=9700)
	haircutwrap.get.call.for.PNG_ID(indir.st,indir.al,outdir,ctrmc,predict.fun,par,ctrain=ctrain, batch.n=batch.n, batch.id=batch.id)
}
olli0601/PANGEAhaircut documentation built on May 24, 2019, 12:52 p.m.