##--------------------------------------------------------------------------------------------------------
## get contigs to be used for training. this excludes duplicate raw/cut contigs for which a curated answer is available
##--------------------------------------------------------------------------------------------------------
#' @title Generate curated contigs that are used for training
#' @export
haircut.get.training.contigs<- function(indir=NA, outfile=NA, ctrain=NULL)
{
if(!is.na(outfile))
{
options(show.error.messages = FALSE)
readAttempt <-try(suppressWarnings(load(outfile)))
options(show.error.messages = TRUE)
}
if( inherits(readAttempt, "try-error") )
{
stopifnot(!is.na(indir),!is.null(ctrain))
infiles <- data.table(FILE=list.files(indir, pattern='fasta$', recursive=T))
infiles[, PNG_ID:= gsub('_wRefs\\.fasta','',gsub('_cut|_raw','',basename(FILE)))]
infiles[, BLASTnCUT:= regmatches(FILE,regexpr('cut|raw',FILE))]
# identify identical raw and cut contigs, and identify cut contigs that are part of the raw contig
txe <- infiles[,{
#png_id <- '12559_1_11'
#files <- subset(infiles, PNG_ID==png_id)[, FILE]
#blastncut<- subset(infiles, PNG_ID==png_id)[, BLASTnCUT]
cat('\nProcess', PNG_ID)
tx <- haircut.get.subset.among.raw.and.cut.contigs(indir, FILE, PNG_ID, BLASTnCUT)
tx
}, by='PNG_ID']
# consider identical contigs. merge cut contigs into equal contigs, so we can look at all equal pairs
txec <- merge(subset(txe, EQ), ctrain, all.x=TRUE, by=c('PNG_ID','TAXON'))
tmp <- subset(txe, EQ & CUT=='cut', select=c(PNG_ID, OCNTG, CCNTG, TAXON, EQ_FIRST, EQ_LAST))
setnames(tmp, c('TAXON','CCNTG','EQ_FIRST','EQ_LAST'),c('TAXON_CUT','CCNTG_CUT','EQ_FIRST_CUT','EQ_LAST_CUT'))
txec <- merge(txec, tmp, all.x=TRUE, allow.cartesian=TRUE,by=c('PNG_ID','OCNTG'))
tmp <- subset(ctrain, select=c(PNG_ID, TAXON, ANS_LEN))
setnames(tmp, c('TAXON','ANS_LEN'), c('TAXON_CUT','ANS_LEN_CUT'))
txec <- merge(txec, tmp, all.x=1, by=c('PNG_ID','TAXON_CUT'))
# work out which contigs to use for training
txec[, USE_IN_TRAIN:='Y']
# if raw and cut contigs in curated and identical: don t use raw (double counting).
tmp <- txec[, which(CUT=='raw' & is.na(CCNTG_CUT) & !is.na(ANS_LEN) & !is.na(ANS_LEN_CUT))]
cat(paste('\nFound raw and cut contigs in curated that are identical: don t use raw for training to avoid double counting, n=', length(tmp)))
set(txec, tmp, 'ANS_FILE', NA_character_)
set(txec, tmp, c('ANS_FIRST','ANS_LAST'), NA_integer_) #need to set ANS_LEN to NA later
set(txec, tmp, 'USE_IN_TRAIN', 'N')
#
# if raw and concatenated cut contigs identical and only raw in curated: don t use cut in training as 0
tmp <- subset(txec, USE_IN_TRAIN=='Y' & CUT=='raw' & is.na(CCNTG_CUT) & !is.na(ANS_LEN) & is.na(ANS_LEN_CUT))[, TAXON_CUT]
cat(paste('\nFound raw and cut contigs identical and only raw in curated: don t use cut in training as 0, n=', length(tmp)))
set(txec, txec[, which(TAXON%in%tmp & CUT=='cut')], 'USE_IN_TRAIN', 'N')
# if raw and concatenated cut contigs identical and only cut in curated: don t use raw in training as 0
tmp <- txec[, which(USE_IN_TRAIN=='Y' & CUT=='raw' & is.na(CCNTG_CUT) & is.na(ANS_LEN) & !is.na(ANS_LEN_CUT))]
cat(paste('\nFound raw and cut contigs identical and only cut in curated: don t use raw in training as 0, n=', length(tmp)))
set(txec, tmp, 'USE_IN_TRAIN', 'N')
#
# if cut contig subset of raw contig and both in curated: should not happen
tmp <- nrow(subset(txec, USE_IN_TRAIN=='Y' & CUT=='raw' & !is.na(CCNTG_CUT) & !is.na(ANS_LEN) & !is.na(ANS_LEN_CUT)))
stopifnot(tmp==0)
# if cut contig subset of raw contig and only cut in curated: use cut only partial
tmp <- txec[, which(USE_IN_TRAIN=='Y' & CUT=='cut' & !is.na(CCNTG_CUT) & !is.na(EQ_FIRST) & !is.na(ANS_LEN) & !is.na(ANS_LEN_CUT))]
cat(paste('\nFound cut contig subset of raw contig and only cut in curated: use cut only partial, n=', length(tmp)))
set(txec, tmp, 'USE_IN_TRAIN', 'P')
# if cut contig subset of raw contig and only cut in curated: dont use raw as 0's
tmp <- txec[, which(USE_IN_TRAIN=='Y' & CUT=='raw' & !is.na(CCNTG_CUT) & !is.na(EQ_FIRST_CUT) & is.na(ANS_LEN) & !is.na(ANS_LEN_CUT))]
cat(paste('\nFound cut contig subset of raw contig and only cut in curated: dont use raw as 0s, n=', length(tmp)))
set(txec, tmp, 'USE_IN_TRAIN', 'N')
# if cut contig subset of raw contig and only cut in curated: use cut only partial
tmp <- subset(txec, CUT=='raw' & !is.na(CCNTG_CUT) & !is.na(EQ_FIRST_CUT) & is.na(ANS_LEN) & !is.na(ANS_LEN_CUT))[, TAXON_CUT]
tmp <- txec[, which(TAXON%in%tmp & CUT=='cut')]
cat(paste('\nFound cut contig subset of raw contig and only cut in curated: dont use raw as 0s, n=', length(tmp)))
set(txec, tmp, 'USE_IN_TRAIN', 'P')
# can now set ANS_LEN to NA from above case
set(txec, txec[, which(is.na(ANS_FIRST) & !is.na(ANS_LEN))], 'ANS_LEN', NA_integer_)
# add contigs that are in curated and unique amongst cut and raw
tmp <- merge(subset(txe, !EQ), ctrain, by=c('PNG_ID','TAXON'))
tmp[, USE_IN_TRAIN:='Y']
stopifnot( length(intersect( tmp[, TAXON], txec[,TAXON] ))==0 )
txec <- rbind(txec, tmp, use.names=TRUE, fill=TRUE)
# if cut contig subset of raw contig and only raw in curated: dont use cut as 0's
tmp <- subset(txec, CUT=='raw' & !is.na(CCNTG_CUT) & !is.na(EQ_FIRST_CUT) & !is.na(ANS_LEN) & is.na(ANS_LEN_CUT), c(PNG_ID, OCNTG))
tmp <- subset( merge(tmp, txe, by=c('PNG_ID','OCNTG')), CUT=='cut' )[, unique(TAXON)]
cat(paste('\nFound cut contig subset of raw contig and only raw in curated: dont use cut as 0, n=', length(tmp)))
tmp <- txec[, which(TAXON%in%tmp & CUT=='cut')]
set(txec, tmp, 'USE_IN_TRAIN', 'N')
# delete tmp cols
set(txec, NULL, c('TAXON_CUT','CCNTG_CUT','ANS_LEN_CUT','EQ_FIRST_CUT','EQ_LAST_CUT'), NULL)
setkey(txec, PNG_ID, TAXON, CUT)
txec <- unique(txec)
save(txec, file=outfile)
#print( txec[, table(USE_IN_TRAIN)] )
}
txec
}
##--------------------------------------------------------------------------------------------------------
## get training data set: contig statistics aligned to sites where curated contigs were not cut
##--------------------------------------------------------------------------------------------------------
haircut.get.training.data<- function(indir, ctrain, par, outdir, outfile)
{
require(plyr)
infiles <- data.table(INFILE=list.files(indir, pattern='\\.R$', recursive=T))
infiles[, BATCH:= ceiling(seq_len(nrow(infiles))/100)]
for(b in infiles[, unique(BATCH)])
{
# add curated answers to data by batch
tmp <- subset(infiles, BATCH==b)[, {
cat(paste('\nProcess', INFILE ))
# INFILE <- infiles[12,INFILE]
load( paste(indir,INFILE,sep='/'))
# select contig+refs for which curated answer available
#cm <- merge( unique(subset(cnsc.df, select=c(PNG_ID, TAXON, BLASTnCUT))), subset( ctrain, select=c(PNG_ID, TAXON, BLASTnCUT, USE_IN_TRAIN, EQ_FIRST, EQ_LAST, ANS_FILE, ANS_LEN, ANS_FIRST, ANS_LAST) ), all.x=1, by=c('PNG_ID','TAXON','BLASTnCUT') )
cm <- merge( unique(subset(cnsc.df, TAXON!='consensus',select=c(PNG_ID, TAXON, BLASTnCUT))), subset( ctrain, select=c(PNG_ID, TAXON, BLASTnCUT, USE_IN_TRAIN, EQ_FIRST, EQ_LAST, ANS_FILE, ANS_LEN, ANS_FIRST, ANS_LAST) ), all.x=1, by=c('PNG_ID','TAXON','BLASTnCUT') )
# code below identifies ANS_CALL=1
# we also need ANS_CALL=0; these are all those that don t have a curated file + all those that have USE_IN_TRAIN=='Y'
cm <- subset(cm, is.na(USE_IN_TRAIN) | USE_IN_TRAIN=='Y' | USE_IN_TRAIN=='P')
if( cm[, all(is.na(ANS_LEN))] )
{
cat(paste('\nNo data contigs matched in training data set', INFILE ))
ca <- merge(subset(cnsc.df, select=c(TAXON, SITE, FRQ, FRQ_STD, AGRpc, GPS, PNG_ID, BLASTnCUT)), subset(cm, select=TAXON), by='TAXON')
ca[, ANS_CALL:=0L]
}
if( cm[, !all(is.na(ANS_LEN))] && ncol(cnsc)!=subset(cm, !is.na(ANS_LEN))[1, ANS_LEN] )
{
cat(paste('\nMatching contigs exist in training data set, but are not necessarily aligned', INFILE ))
cr <- read.dna(subset(cm, !is.na(ANS_LEN))[1, ANS_FILE], format='fasta')
# determine start of non-LTR position and cut
cr <- cr[, seq.int(haircut.find.nonLTRstart(cr), ncol(cr))]
# determine consensus to calculate offset
tmp <- strsplit(basename(subset(cm, !is.na(ANS_LEN))[1, ANS_FILE]), '_')[[1]][1]
tx <- data.table(TAXON= rownames(cr), CONTIG=as.integer(grepl(tmp, rownames(cr))) )
tmp <- cr[subset(tx, CONTIG==0)[, TAXON],]
rp <- haircut.get.frequencies(tmp, bases=c('a','c','g','t','-') )
crc <- haircut.get.consensus.from.frequencies(rp, par)$DNAbin
cdc <- cnsc['consensus', ]
# calculate site offset in the curated sequence (crc)
offset <- haircut.calculate.offset(crc,cdc)
# update ANS_FIRST and ANS_LAST according to offset of curated contig in data contigs
cm <- cm[, list(ANS_LEN=ANS_LEN+offset[ANS_LEN], ANS_FIRST=ANS_FIRST+offset[ANS_FIRST], ANS_LAST=ANS_LAST+offset[ANS_LAST]), by=c('PNG_ID','TAXON','BLASTnCUT','USE_IN_TRAIN','EQ_FIRST','EQ_LAST')]
if(any(offset!=0) && ncol(cnsc)!=subset(cm, !is.na(ANS_LEN))[1, ANS_LEN])
warning(paste('\nPerhaps check: Found unequal lengths for', INFILE))
}
if( cm[, !all(is.na(ANS_LEN))] )
{
# expand to SITEs that were called manually
ca <- subset(cm, !is.na(ANS_LEN))[, list(SITE=seq.int(ANS_FIRST,ANS_LAST), ANS_CALL=1L), by='TAXON']
# keep only those contigs that are not in USE_IN_TRAIN=='N'
tmp <- merge(subset(cnsc.df, select=c(TAXON, SITE, FRQ, FRQ_STD, AGRpc, GPS, PNG_ID, BLASTnCUT)), subset(cm, select=c(TAXON, USE_IN_TRAIN, EQ_FIRST, EQ_LAST)), by='TAXON')
ca <- merge(tmp, ca, all.x=1, by=c('TAXON','SITE'))
ca <- subset(ca, USE_IN_TRAIN=='Y' | is.na(USE_IN_TRAIN) | (USE_IN_TRAIN=='P' & SITE>=EQ_FIRST & SITE<=EQ_LAST))
set(ca, ca[,which(is.na(ANS_CALL))],'ANS_CALL',0L)
set(ca, NULL, c('USE_IN_TRAIN','EQ_FIRST','EQ_LAST'), NULL)
}
tmp <- subset(cnsc.df, TAXON=='consensus')
setnames(tmp, c('FRQ','FRQ_STD','AGRpc','GPS'), c('CNS_FRQ','CNS_FRQ_STD','CNS_AGRpc','CNS_GPS'))
ca <- merge(ca, subset(tmp, select=c(SITE, CNS_FRQ, CNS_FRQ_STD, CNS_AGRpc, CNS_GPS)), by='SITE')
ca
}, by='INFILE']
# save batches in chunks of sites
cat(paste('\nSave batch to file', b ))
tmp[, CHUNK:= cut(SITE, breaks=c(-1,seq.int(200, 10000, 200),Inf), labels=c(seq.int(200, 10200, 200) ))]
for(c in tmp[, unique(CHUNK)])
{
ctr <- subset(tmp, CHUNK==c)
save(ctr, file=paste(outdir, '/', outfile,'_sites',c,'_batch',b,'.R',sep=''))
}
cat(paste('\nSaved batch to file', b ))
}
# load batches and save chunks
ofiles <- data.table(FILE=list.files(outdir, pattern='\\.R$', recursive=T))
ofiles <- subset(ofiles, !grepl('SITES', FILE))
set(ofiles, NULL, 'SITES', ofiles[, substring(regmatches(FILE, regexpr('sites[0-9]+',FILE)), 6)])
ofiles[, {
ctr <- do.call('rbind',lapply(FILE, function(x)
{
load(paste(outdir,x,sep='/'))
ctr
}))
save(ctr, file=paste(outdir, '/', outfile,'_SITES',SITES,'.R',sep=''))
}, by='SITES']
# rm intermediates files
invisible( file.remove(ofiles[, paste(outdir,FILE,sep='/')]) )
#Perhaps check: Found unequal lengths for 15065_1_10_cut_wRefs_HAIRCUTSTAT_thr5_aw200_fw100_gw200.R 1367
#Perhaps check: Found unequal lengths for 15070_1_9_raw_wRefs_HAIRCUTSTAT_thr5_aw200_fw100_gw200.R 1522
#Perhaps check: Found unequal lengths for 15099_1_87_raw_wRefs_HAIRCUTSTAT_thr5_aw200_fw100_gw200.R 1924
#Perhaps check: Found unequal lengths for 15172_1_43_cut_wRefs_HAIRCUTSTAT_thr5_aw200_fw100_gw200.R 2325
NULL
}
##--------------------------------------------------------------------------------------------------------
## determine if cut contigs are identical to a stretch or possibly the whole raw contig with the same contig ID
##--------------------------------------------------------------------------------------------------------
haircutwrap.get.subset.among.raw.and.cut.contigs<- function(indir, outfile)
{
options(show.error.messages = FALSE)
readAttempt <-try(suppressWarnings(load(outfile)))
options(show.error.messages = TRUE)
if( inherits(readAttempt, "try-error") )
{
infiles <- data.table(FILE=list.files(indir, pattern='fasta$', recursive=T))
infiles[, PNG_ID:= gsub('_wRefs\\.fasta','',gsub('_cut|_raw','',basename(FILE)))]
infiles[, BLASTnCUT:= regmatches(FILE,regexpr('cut|raw',FILE))]
# identify identical raw and cut contigs, and identify cut contigs that are part of the raw contig
txe <- infiles[,{
#png_id <- '12559_1_11'
#files <- subset(infiles, PNG_ID==png_id)[, FILE]
#blastncut<- subset(infiles, PNG_ID==png_id)[, BLASTnCUT]
cat('\nProcess', PNG_ID)
tx <- haircut.get.subset.among.raw.and.cut.contigs(indir, FILE, PNG_ID, BLASTnCUT)
tx
}, by='PNG_ID']
save(txe, file=outfile)
}
txe
}
##--------------------------------------------------------------------------------------------------------
## return EQ= 1 and EQ_FIRST EQ_LAST positions, if cut contig are subsets of the raw contig in that:
## - there is only once cut contig which disagrees on up to x positions, where x is the difference in alignment lengths in the cut and raw files
## - the cut contigs matches the raw contig without gaps, where the raw contig is set to start from the start position of the cut contig
##--------------------------------------------------------------------------------------------------------
haircut.get.subset.among.raw.and.cut.contigs<- function(indir, files, png_id, blastncut)
{
crs <- lapply(files, function(x)
{
cr <- read.dna(file=paste(indir,'/',x,sep=''), format='fasta')
if(is.matrix(cr))
{
cr <- cr[, seq.int(haircut.find.nonLTRstart(cr), ncol(cr))]
tmp <- strsplit(basename(x), '_')[[1]][1]
tx <- data.table(TAXON= rownames(cr), CONTIG=as.integer(grepl(tmp, rownames(cr))) )
cr <- cr[ subset(tx, CONTIG==1)[, TAXON], ]
}
if(!is.matrix(cr))
cr <- as.DNAbin(matrix(vector('character',0), nrow=2, byrow=T, dimnames=list(c('DUMMY','DUMMY'),c())))
cr
})
names(crs) <- blastncut
# get contig table
tx <- do.call('rbind',lapply(seq_along(crs), function(i) data.table( TAXON=rownames(crs[[i]]),
CUT= blastncut[i],
FIRST= apply( as.character(crs[[i]]), 1, function(x) which(x!='-')[1] ),
LAST= ncol(crs[[i]])-apply( as.character(crs[[i]]), 1, function(x) which(rev(x)!='-')[1] ) + 1L,
CRS_ID=i) ))
tx <- subset(tx, !is.na(FIRST) & !is.na(LAST)) #some contigs may just be in LTR
tx[, CNTG:=tx[, gsub(paste(png_id,'.',sep=''),'',substring(TAXON, regexpr(png_id, TAXON)))]]
tx[, OCNTG:= tx[, sapply(strsplit(CNTG,'.',fixed=T),'[[',1)]]
tx[, CCNTG:= NA_character_]
tx[, EQ:=FALSE]
tx[, EQ_FIRST:=NA_integer_]
tx[, EQ_LAST:=NA_integer_]
tmp <- tx[, which(grepl('.',CNTG,fixed=T))]
if(length(tmp))
set(tx, tmp, 'CCNTG', tx[tmp, sapply(strsplit(CNTG,'.',fixed=T),'[[',2)])
tmp <- subset(tx, CUT=='cut' & !is.na(CCNTG))[, list(CCNTGn=length(CCNTG)), by='OCNTG']
tmp <- subset(tmp, CCNTGn==1)[, OCNTG] #check for cut contigs that should be present in multiple cuts by naming scheme, but after LTR removal there is only one cut
if(length(tmp))
{
cat('\nFound lone cuts for which multiple cuts are expected by naming scheme, n=', length(tmp))
tmp <- tx[, which(CUT=='cut' & OCNTG%in%tmp)]
set(tx, tmp, 'CCNTG', NA_character_)
set(tx, tmp, 'CNTG', tx[tmp, OCNTG])
}# check if there are contigs with corresponding name in 'cut' and that are identical / subset
# differences in gaps are allowed: these will come from different alignment
txe <- dcast.data.table(tx, CNTG~CUT, value.var='TAXON')
txe <- subset( txe, !is.na(cut) & !is.na(raw) )
if(nrow(txe) && c('cut','raw')%in%colnames(txe))
{
txe <- txe[, {
x <- sub('^-*','',sub('-*$','',paste(as.vector(as.character(crs[['cut']][cut,])),collapse='')))
y <- sub('^-*','',sub('-*$','',paste(as.vector(as.character(crs[['raw']][raw,])),collapse='')))
#print(CNTG)
#print(x)
#print(y)
z <- nchar(x)
if(nchar(x)==nchar(y))
{
z2 <- x==y
z3 <- 1L
attr(z3, 'match.length')<- nchar(y)
}
if(nchar(x)!=nchar(y))
{
z <- gsub('-','',x)
z2 <- z==gsub('-','',y)
z3 <- haircut.large.regexpr(z,gsub('-','',y))
z <- nchar(x)
}
list(EQtmp=z2, EQ_FIRSTtmp=as.integer(z3), EQ_LASTtmp=as.integer(z3+attr(z3, 'match.length')-1L+nchar(x)-z))
}, by='CNTG']
tx <- merge(tx, txe, all.x=TRUE, by='CNTG')
tmp <- tx[, which(!is.na(EQ_FIRSTtmp) & EQ_FIRSTtmp>0)]
set(tx, tmp, 'EQ_FIRST', tx[tmp,FIRST]+tx[tmp,EQ_FIRSTtmp]-1L)
set(tx, tmp, 'EQ_LAST', tx[tmp,FIRST]+tx[tmp,EQ_LASTtmp]-tx[tmp,EQ_FIRSTtmp])
set(tx, tmp, 'EQ', tx[tmp,EQtmp])
set(tx, NULL, c('EQ_FIRSTtmp','EQ_LASTtmp','EQtmp'),NULL)
}
# see if cut contigs are subset of the raw contig
txe <- subset(tx, !is.na(CCNTG))
if(nrow(txe))
{
tmp <- subset(tx, CUT=='raw')
setnames(tmp, colnames(tmp)[ !grepl('OCNTG',colnames(tmp))], paste( colnames(tmp)[ !grepl('OCNTG',colnames(tmp))], '_raw',sep='' ))
txe <- merge(txe, tmp, by='OCNTG')
txe <- txe[, {
z <- sub('-*$','',paste(as.vector(as.character(crs[['cut']][TAXON,])),collapse=''))
x <- sub('^-*','',z)
tmp <- nchar(z)-nchar(x) #number '-' clipped up to start of cut contig
tmp <- max(1L, 1L+tmp-abs(diff(c(ncol(crs[['cut']]), ncol(crs[['raw']]))))) #read pos in raw contig
y <- sub('-*$','',paste(as.vector(as.character(crs[['raw']][TAXON_raw, seq.int(tmp, ncol(crs[['raw']])) ])),collapse=''))
# y starts just a few sites before x should fit, need to determine exact position
#print(CNTG)
#print(x)
#print(y)
#print(tmp)
z <- gsub('-','',x) #rm internal '-' because cut and raw are not in same alignment
z2 <- gsub('-','',y) #rm internal '-' because cut and raw are not in same alignment
z3 <- regexpr( substr(z,1,min(9,nchar(z))), z2 )
#print(z3)
if(z3>0)
{
z2 <- substring(z2, z3)
z2 <- haircut.large.regexpr(z, z2)
}
if( z3<0 || z3> (abs(diff(c(ncol(crs[['cut']]), ncol(crs[['raw']]))))+1) ) #positioned raw to fit cut within alignment accuracy - reject match if that s not the case
{
z3 <- -1L
}
#print(z2)
list(EQ_FIRSTtmp=as.integer(z3), EQ_LASTtmp=as.integer(z3+attr(z2, 'match.length')-1L+nchar(x)-nchar(z)) )
}, by=c('CNTG','OCNTG')]
txe <- subset(txe, EQ_FIRSTtmp>0)
tx <- merge(tx,txe,all.x=TRUE,by=c('CNTG','OCNTG'))
tmp <- tx[, which(!is.na(EQ_FIRSTtmp) & EQ_FIRSTtmp>0)]
set(tx, tmp, 'EQ_FIRST', tx[tmp,FIRST]+tx[tmp,EQ_FIRSTtmp]-1L)
set(tx, tmp, 'EQ_LAST', tx[tmp,FIRST]+tx[tmp,EQ_LASTtmp]-tx[tmp,EQ_FIRSTtmp])
set(tx, NULL, c('EQ_FIRSTtmp','EQ_LASTtmp'),NULL)
set(tx, tx[, which( CUT=='raw' & OCNTG%in%txe[, unique(OCNTG)] )], 'EQ', TRUE)
set(tx, tx[, which( !is.na(EQ_FIRST))], 'EQ', TRUE)
}
tx <- merge(tx, data.table(INFILE=files, CUT=blastncut), by='CUT')
subset(tx, select=c(INFILE, TAXON, CUT, CNTG, OCNTG, CCNTG, FIRST, LAST, EQ, EQ_FIRST, EQ_LAST))
}
##--------------------------------------------------------------------------------------------------------
## get curated contigs: names of curated contigs and first/last sites where they were not cut
##--------------------------------------------------------------------------------------------------------
haircut.get.curated.contigs<- function(indir, outfile)
{
options(show.error.messages = FALSE)
readAttempt <-try(suppressWarnings(load(outfile)))
options(show.error.messages = TRUE)
tmp <- !inherits(readAttempt, "try-error")
if(!tmp)
{
infiles <- data.table(FILE=list.files(indir, recursive=T, pattern='fasta$'))
infiles[, PNG_ID:= gsub('\\.fasta','',basename(FILE))]
stopifnot( nrow(infiles)==infiles[, length(unique(PNG_ID))] )
ctrain <- infiles[, {
file <- paste(indir, FILE, sep='/')
cat(paste('\nProcess', file))
# read Contigs+Rrefs: cr
cr <- read.dna(file, format='fasta')
# determine start of non-LTR position and cut
tmp <- haircut.find.nonLTRstart(cr)
cat(paste('\nFound end of LTR at=', tmp-1))
cr <- cr[, seq.int(tmp, ncol(cr))]
# determine reference sequences.
# non-refs have the first part of the file name in their contig name and are at the top of the alignment
tmp <- strsplit(basename(file), '_')[[1]][1]
tx <- data.table(TAXON= rownames(cr), CONTIG=as.integer(grepl(tmp, rownames(cr))) )
stopifnot( all( tx[, which(CONTIG==1)] == seq.int(1, tx[, length(which(CONTIG==1))]) ) )
cat(paste('\nFound contigs, n=', tx[, length(which(CONTIG==1))]))
cns <- cr[subset(tx, CONTIG==1)[, TAXON],]
# determine first and last non-gap sites
tx <- data.table( TAXON= rownames(cns), PNG_ID=PNG_ID, LEN= ncol(cr),
FIRST= apply( as.character(cns), 1, function(x) which(x!='-')[1] ),
LAST= ncol(cns)-apply( as.character(cns), 1, function(x) which(rev(x)!='-')[1] )+1L )
subset(tx, !is.na(FIRST) & !is.na(LAST)) # some contigs only map into LTR
}, by='FILE']
set(ctrain, NULL, 'FILE', ctrain[, paste(indir, FILE, sep='/')])
save(ctrain, file=outfile)
}
ctrain
}
##--------------------------------------------------------------------------------------------------------
## Binomial model using FRQ and GAP
##--------------------------------------------------------------------------------------------------------
#' @import data.table zoo plyr ape reshape2 ggplot2
#' @export
haircut.get.fitted.model.150816a<- function(indir=NA, outfile=NA)
{
if(is.na(outfile) | is.na(indir)) # load from R package
{
options(show.error.messages = FALSE)
readAttempt <-try(suppressWarnings(load(system.file(package='PANGEAhaircut', "model_150816a.R"))))
options(show.error.messages = TRUE)
if( inherits(readAttempt, "try-error") )
stop('Cannot find model file on system location')
}
if(!is.na(outfile) & !is.na(indir)) # load from outfile, and if not there create
{
options(show.error.messages = FALSE)
readAttempt <-try(suppressWarnings(load(outfile)))
options(show.error.messages = TRUE)
if( inherits(readAttempt, "try-error") )
{
ctrmc <- do.call('rbind', lapply(seq(1,10001,200), function(site)
{
cat('\nProcess',site,'\n')
ctr <- haircut.load.training.data(indir, site)
tmp <- seq(ctr[, min(SITE)-1],ctr[, max(SITE)+10],10)
tmp <- tmp[tmp<=10000 | tmp==floor(ctr[, max(SITE)+10]/10)*10]
ctr[, CHUNK:=cut(SITE, breaks=tmp, labels=tmp[-length(tmp)])]
ctrmc <- do.call('rbind',lapply(ctr[, unique(CHUNK)], function(chunk)
{
ctrch <- subset(ctr, CHUNK==chunk)
ctrchm <- gamlss(ANS_CALL~FRQ+GPS, data=ctrch, family=BI()) #as good as 'AGRpc+GPS+CNS_FRQr' in terms of FN, FP
ctrchmc <- data.table(CHUNK=chunk, BETA0=coef(ctrchm)[1], BETA1=coef(ctrchm)[2], BETA2=coef(ctrchm)[3])
}))
}))
# deal with end of genome where little data is available: we estimated coefs for all sites > 1e4, now split up using CHUNK notation
ctr <- haircut.load.training.data(indir, 10001)
tmp <- seq(ctr[, min(SITE)-1],ctr[, max(SITE)+10],10)
tmp <- as.data.table(expand.grid(CHUNK=as.character(tmp[tmp>10000]), BETA0=subset(ctrmc, CHUNK==10000)[, BETA0], BETA1=subset(ctrmc, CHUNK==10000)[, BETA1], BETA2=subset(ctrmc, CHUNK==10000)[, BETA2], stringsAsFactors=F))
ctrmc <- rbind(ctrmc, tmp)
# model predict function, so we save mem by not having to call 'predict'
model.150816a.predict<- function(frq, gps, b0, b1, b2)
{
stopifnot(all(!is.na(frq)), all(!is.na(gps)))
b0[which(is.na(b0))] <- 0
b1[which(is.na(b1))] <- 0
b2[which(is.na(b2))] <- 0
exp(b0+b1*frq+b2*gps)/(exp(b0+b1*frq+b2*gps)+1)
}
# calculate Sensitivity & Specificity on training data
ctrev <- do.call('rbind', lapply(seq(1,10200,200), function(site)
{
cat('\nProcess',site,'\n')
ctr <- haircut.load.training.data(indir, site)
tmp <- seq(ctr[, min(SITE)-1],ctr[, max(SITE)+10],10)
ctr[, CHUNK:=cut(SITE, breaks=tmp, labels=tmp[-length(tmp)])]
ctrp <- merge(ctr, ctrmc, by='CHUNK')
ctrp[, PR_CALL:=model.150816a.predict(FRQ, GPS, BETA0, BETA1, BETA2)]
setkey(ctrp, CHUNK)
if(0)
{
ctrev <- as.data.table(expand.grid(THR= seq(0.05,0.95,0.01), CHUNK= as.character(ctrp[, unique(CHUNK)]), stringsAsFactors=FALSE))
ctrev <- ctrev[, {
tmp <- ctrp[CHUNK,][,table(ANS_CALL, factor(PR_CALL>=THR, levels=c('TRUE','FALSE'), labels=c('TRUE','FALSE')))]
list(TP= tmp['1','TRUE'], FP= tmp['0','TRUE'], FN= tmp['1','FALSE'], TN= tmp['0','FALSE'] )
}, by=c('CHUNK','THR')]
ctrev <- merge(ctrev, ctrev[, list(SENS= TP/(TP+FN), SPEC=TN/(TN+FP), FDR=FP/(FP+TP), FOR=FN/(FN+TN)), by=c('CHUNK','THR')], by=c('CHUNK','THR'))
}
if(1)
{
ctrev <- as.data.table(expand.grid(THR= c(seq(2,10,1), seq(15,30,5)), CHUNK= as.character(ctrp[, unique(CHUNK)]), stringsAsFactors=FALSE))
ctrev <- ctrev[, {
ctrp[, CNS_PR_CALL:=CNS_FRQ-THR*CNS_FRQ_STD]
set(ctrp, NULL, 'CNS_PR_CALL', ctrp[,model.150816a.predict(CNS_PR_CALL, CNS_GPS, BETA0, BETA1, BETA2)])
tmp <- ctrp[CHUNK,][,table(ANS_CALL, factor(PR_CALL>=CNS_PR_CALL, levels=c('TRUE','FALSE'), labels=c('TRUE','FALSE')))]
list(TP= tmp['1','TRUE'], FP= tmp['0','TRUE'], FN= tmp['1','FALSE'], TN= tmp['0','FALSE'] )
}, by=c('CHUNK','THR')]
ctrev <- merge(ctrev, ctrev[, list(SENS= TP/(TP+FN), SPEC=TN/(TN+FP), FDR=FP/(FP+TP), FOR=FN/(FN+TN)), by=c('CHUNK','THR')], by=c('CHUNK','THR'))
}
# plot Sensitivity & Specificity
ggplot(melt(ctrev, id.vars=c('CHUNK','THR'), measure.vars=c('SENS','SPEC','FDR','FOR')), aes(x=THR, y=100*value, colour=CHUNK)) +
scale_x_reverse() +
geom_line() + labs(x='standard deviations of consensus call prob\nPR_CALL_CNT >= PR_CALL_CNS-x*std(PR_CALL_CNS)',y='%', colour='site\n(base relative to HXB2)') +
theme_bw() + theme(legend.position='bottom') + facet_wrap(~variable, ncol=2, scales='free') +
guides(col = guide_legend(ncol=5, byrow=TRUE))
ggsave(file=paste(outdir,'/model.150816a_SensSpec_SITE',site,'.pdf',sep=''), w=9, h=9)
#
ctrev
}))
set(ctrev, NULL, 'CHUNK', ctrev[, as.numeric(CHUNK)])
save(ctrmc, ctrev, model.150816a.predict, file=outfile)
}
}
list(coef=ctrmc, predict=model.150816a.predict)
}
##--------------------------------------------------------------------------------------------------------
## deal with large expressions by performing them subsequently
##--------------------------------------------------------------------------------------------------------
haircut.large.regexpr<- function(pattern, x, n=500, ...)
{
tmp <- seq_len(ceiling( nchar(pattern)/500 ))*500
tmp <- matrix( c(1, tmp[-length(tmp)]+1, tmp[-length(tmp)], nchar(pattern), rep(NA, 2*length(tmp))), byrow=T, nrow=4 )
for(j in seq_len(ncol(tmp)))
{
z <- regexpr(substr(pattern, tmp[1,j], tmp[2,j]), x, ...)
tmp[4,j] <- attr(z,'match.length')
tmp[3,j] <- as.numeric(z)
}
tmp <- tmp[, as.logical(cummin( tmp[4,]>0 )), drop=FALSE]
z <- all( tmp[3, ]+tmp[4, ]-1==tmp[2, ] ) #check if complete match
if(ncol(tmp) & z)
{
ans <- tmp[3,1]
attr(ans,'match.length') <- sum(tmp[4,])
}
if(!ncol(tmp) | !z)
{
ans <- -1
attr(ans,'match.length') <- -1
}
ans
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.