
# fastqTools.R


`readFastq` <- function( filein, maxReads=NULL, verbose=TRUE) {

	fileToUse <- allowCompressedFileName( filein)

	if ( ! file.readable( fileToUse)) {
		warning( paste( "readFastq:  unable to read FASTQ file", fileToUse))
		return( data.frame())
	}

	# catch compressed files
	conIn <- openCompressedFile( fileToUse, open="r")
	if (verbose) cat( "\nReading file: ", fileToUse)

	# read in the raw text
	chunkSize <- 1000000
	if ( ! is.null(maxReads) && (maxReads*4) < chunkSize) chunkSize <- maxReads * 4

	readIDs <- readSeqs <- scores <- vector()

	repeat {

		fastqText <- readLines( con=conIn, n=chunkSize, warn=FALSE)
		if ( length( fastqText) < 1) break

		# get the delimited lines
		idLines <- grep( "^@", fastqText)
		scoreMarks <- grep( "^\\+", fastqText)
		if ( length( idLines) < 1 || length(scoreMarks) < 1 ) {
			warning( paste( "readFastq:  not a FASTQ format file: ", fileToUse))
			return( data.frame())
		}

		idLines <- base::sort( intersect( idLines, seq( 1, length(fastqText), by=4)))
		scoreMarks <- base::sort( intersect( scoreMarks, seq( 3, length(fastqText), by=4)))
		if ( (length( idLines) != length(scoreMarks)) || any( idLines >= scoreMarks)) {
			warning( paste( "readFastq:  bad FASTQ file: ", fileToUse, 
					"\tMismatched readIDs and/or quality scores..."))
			return( data.frame())
		}

		# now get the parts
		readIDs <- base::append( readIDs, sub( "^@", "", fastqText[ idLines]))
		readSeqs <- base::append( readSeqs, fastqText[ (idLines+1)])
		scores <- base::append( scores, fastqText[ (scoreMarks+1)])
	
		if (verbose) cat(".")
		if ( ! is.null( maxReads)) {
			if ( length(readIDs) >= maxReads) break
		}
	} # end of 'repeat' loop...
	close( conIn)
	
	if ( any( base::nchar( readSeqs) != base::nchar( scores))) 
			warning( "readFastq:  some Read & Score lengths disagree")

	out <- data.frame( readIDs, readSeqs, scores, stringsAsFactors=FALSE)
	colnames( out) <- FASTQ_COLUMNS
	rownames( out) <- 1:nrow(out)

	if ( ! is.null( maxReads)) {
		if ( nrow(out) >= maxReads) out <- out[ 1:maxReads, ]
	}

	if (verbose) cat( "\nN_Reads: ", prettyNum( nrow(out), big.mark=","), "\n")
	return( out)
}
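
# A minimal usage sketch (hypothetical file name; wrapped in 'if (FALSE)' so it
# never runs when this file is sourced):
if (FALSE) {
	# read at most 5000 reads from a plain or gzip'd FASTQ file
	reads <- readFastq( "sample.fastq.gz", maxReads=5000)
	head( reads$READ_SEQ)
}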


`test.readFastq` <- function() {

	tmpFile <- build.testFastq()
	zz <- readFastq( tmpFile, verbose=FALSE)
	checkEquals( dim(zz), c(1000,3))
	checkEquals( colnames(zz), FASTQ_COLUMNS)
	remove.testFile( tmpFile)
}


# efficient writing of .fastq to a file

`writeFastq` <- function( x, fileout, compress=FALSE, verbose=TRUE) {

	if ( ! all( colnames( x) == FASTQ_COLUMNS)) {
		warning( paste( "writeFastq:  unexpected column names: ", colnames(x)))
		warning( "No file written...")
		return( NULL)
	}

	fileToUse <- fileout

	# catch compressed file
	hasGZ <- (regexpr( "\\.gz$", fileout) > 0)
	if ( compress || hasGZ) {
		if ( ! hasGZ) fileToUse <- paste( fileout, "gz", sep=".")
		conOut <- gzfile( fileToUse, open="w")
	} else {
		conOut <- file( fileToUse, open="w")
	}

	# prepend the .fastq markers to the needed spots
	idText <- base::paste( "@", x$READ_ID, sep="")
	out <- base::paste( idText, x$READ_SEQ, "+", x$SCORE, sep="\n")

	# write it out
	writeLines( out, con=conOut, sep="\n")
	if( verbose) cat( "\nWrote file: \t", fileToUse, "\nN_Reads: \t", nrow( x),"\n")
	close( conOut)
	return( fileToUse)
}
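
# A small round-trip sketch (hypothetical file names; not run at source time):
if (FALSE) {
	reads <- readFastq( "sample.fastq.gz", verbose=FALSE)
	# 'compress=TRUE' appends '.gz' to the name if it is not already there
	newFile <- writeFastq( reads, fileout="subset.fastq", compress=TRUE)
	identical( reads, readFastq( newFile, verbose=FALSE))
}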


`test.writeFastq` <- function() {

	tmpFile <- build.testFastq()
	zz <- readFastq( tmpFile, verbose=FALSE)
	tmpFile2 <- writeFastq( zz, fileout="DuffyTools.test2.fastq", verbose=FALSE)
	zz2 <- readFastq( tmpFile2, verbose=FALSE)
	checkEquals( zz, zz2)
	remove.testFile( tmpFile)
	remove.testFile( tmpFile2)
}


# make shorter reads from the ends for splice discovery

`fastqToOverlapReadlets` <- function( filein, fileout, segmentLength=12, verbose=TRUE) {

	fileToUse <- allowCompressedFileName( filein)
	if ( ! file.exists( fileToUse)) stop( paste("Can't find input file: ", fileToUse))
	conIn <- openCompressedFile( fileToUse, open="r")

	if ( regexpr( ".gz$", fileout) > 0) {
		conOut <- gzfile( fileout, open="w")
	} else {
		conOut <- file( fileout, open="w")
	}

	chunkSize <- 400000
	nread <- 0
	if (verbose) cat( "\nReading file: ", fileToUse)

	repeat {
		chunk <- readLines( conIn, n=chunkSize)
		if ( length( chunk) < 1) break

		# get the lines we want
		ids <- chunk[ seq( 1, length(chunk), by=4)]
		seqs <- chunk[ seq( 2, length(chunk), by=4)]
		scores <- chunk[ seq( 4, length(chunk), by=4)]
		N <- length( ids)

		# get the actual read lengths
		if ( nread == 0) {
			alllens <- base::nchar( seqs)
			maxlen <- max( alllens)
			segmentLength <- ceiling(segmentLength/2) * 2
			overlap <- segmentLength / 2
			nOverlaps <- ceiling( (maxlen - segmentLength) / overlap) + 1
			if (verbose) cat( "\n readLen, segLength, overlap, nSegs: ", maxlen, segmentLength, overlap, nOverlaps)
		}

		myStarts <- seq( 1, (maxlen-overlap), by=overlap)
		myStops <- seq( segmentLength, (maxlen+overlap-1), by=overlap)
		if ( length( myStarts) != nOverlaps || length( myStops) != nOverlaps) {
			cat( "bad overlap sizes...?")
			stop("")
		}
		myStarts[nOverlaps] <- maxlen - segmentLength + 1
		myStops[nOverlaps] <- maxlen

		# get the substring chunks
		allSeqs <- allScores <- allIDs <- matrix( nrow=N, ncol=nOverlaps)
		for ( k in 1:nOverlaps) {
			allSeqs[ ,k] <- base::substr( seqs, myStarts[k], myStops[k])
			allScores[ ,k] <- base::substr( scores, myStarts[k], myStops[k])
			allIDs[ ,k] <- base::paste( ids, "/seg", k, sep="")
		}

		# interleave the output to keep the pairs together
		outIDs <- as.vector( t( allIDs))
		outSeqs <- as.vector( t( allSeqs))
		outScores <- as.vector( t( allScores))

		# format as Fastq (the ID still has the '@'...)
		outTxt <- base::paste( outIDs, "\n", outSeqs, "\n+\n", outScores, sep="")
		writeLines( outTxt, con=conOut)

		nread <- nread + length( chunk)

		if (verbose) cat( ".")
	}

	close( conIn)
	close( conOut)
	if (verbose) cat( "\nN_reads:            ", round( as.integer(nread)/4))
	if (verbose) cat( "\nWrote file:         ", fileout, "\n")

	return( list( "Segments"=nOverlaps, "Overlap"=overlap))
}
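
# Usage sketch (hypothetical file names; not run at source time):  break each
# read into short overlapping segments for splice junction discovery.
if (FALSE) {
	ans <- fastqToOverlapReadlets( "sample.fastq.gz", "sample.readlets.fastq.gz",
				segmentLength=20)
	ans$Segments    # number of readlets made per full-length read
}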


`fastqToChunks` <- function( filein, fileroot=sub( "(fq|fastq)(.gz)?$","",filein), 
				filetail=sub( fileroot, "", filein, fixed=T), 
				chunk.size=1000000) {

	filein <- allowCompressedFileName( filein)
	if ( ! file.exists( filein)) stop( paste("Can't find input file: ", filein))
	conIn <- openCompressedFile( filein, open="r")


	blockSize <- min( chunk.size, 500000) * 4
	nread <- 0
	nchunk <- 0
	cat( "\nBreaking: ", basename(filein), " into chunks of ", chunk.size, "reads.\n")

	nThisChunk <- 0
	makeNewFile <- TRUE

	repeat {

		if ( makeNewFile) {
			nchunk <- nchunk + 1
			fileout <- paste( fileroot, "chunk", nchunk, ".", filetail, sep="")
			if ( regexpr( ".gz$", fileout) > 0) {
				conOut <- gzfile( fileout, open="w")
			} else {
				conOut <- file( fileout, open="w")
			}
			makeNewFile <- FALSE
		}

		thisBlockSize <- min( blockSize, (chunk.size-nThisChunk)*4)
		chunk <- readLines( conIn, n=thisBlockSize)
		if ( length( chunk) < 1) break

		nread <- nread + length(chunk)/4
		nThisChunk <- nThisChunk + length(chunk)/4

		# get the lines we want
		who <- seq( 1, length(chunk), by=4)
		ids <- chunk[ who]
		seqs <- chunk[ who + 1]
		scores <- chunk[ who + 3]
		writeLines( base::paste( ids, seqs, "+", scores, sep="\n"), con=conOut)
		cat( ".")

		if ( nThisChunk >= chunk.size) {
			close( conOut)
			cat( " ", nread, basename(fileout), "\n")
			makeNewFile <- TRUE
			nThisChunk <- 0
		}
	}
	
	# close the last one...
	close( conOut)
	cat( "\n", nread, basename(fileout))

	close( conIn)
	cat( "\nN_reads:            ", round( as.integer(nread)/4))
	return()
}
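
# Usage sketch (hypothetical file name; not run at source time):  split one
# large FASTQ file into numbered chunk files of 1M reads each.
if (FALSE) {
	fastqToChunks( "big.fastq.gz", chunk.size=1000000)
	# with the default 'fileroot' this writes big.chunk1.fastq.gz, big.chunk2.fastq.gz, ...
}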


`fastqMerge` <- function( filesin, fileout, verbose=TRUE) {

	# use system level zcat to do the merge
	cmdText <- "zcat -f "
	infileText <- paste( filesin, collapse=" ")
	
	doZip <- (regexpr( "gz$", fileout) > 0)
	if ( doZip) {
		cmdText <- paste( cmdText, infileText, " | gzip > ", fileout, sep=" ")
	} else {
		cmdText <- paste( cmdText, infileText, " > ", fileout, sep=" ")
	}

	if (verbose) cat( "\nMerging: \n  ", cmdText)
	catch.system( cmdText)
	if (verbose) cat( "\nDone.\n")
}
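
# Usage sketch (hypothetical file names; needs 'zcat' and 'gzip' on the system path):
if (FALSE) {
	fastqMerge( c( "lane1.fastq.gz", "lane2.fastq.gz"), fileout="merged.fastq.gz")
}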


`fastqToFasta` <- function( filein, fileout, Qscores=TRUE) {

	filein <- allowCompressedFileName( filein)
	if ( ! file.exists( filein)) stop( paste("Can't find input file: ", filein))
	conIn <- openCompressedFile( filein, open="r")

	if ( regexpr( ".gz$", fileout) > 0) {
		conOut <- gzfile( fileout, open="w")
	} else {
		conOut <- file( fileout, open="w")
	}

	chunkSize <- 800000
	nread <- 0

	repeat {
		chunk <- readLines( conIn, n=chunkSize)
		if ( length( chunk) < 1) break

		nread <- nread + length( chunk)

		# get the lines we want
		ids <- chunk[ seq( 1, length(chunk), by=4)]
		seqs <- chunk[ seq( 2, length(chunk), by=4)]

		if ( Qscores) {
			scores <- chunk[ seq( 4, length(chunk), by=4)]
			avgScores <- apply( phredScoreStringToInt( scores), MARGIN=1, FUN=mean)
			newIds <- base::paste( sub( "@", ">", ids, fixed=TRUE), ":Q=", formatC( avgScores, digits=2, format="f"), sep="")
		} else {
			newIds <- sub( "@", ">", ids, fixed=TRUE)
		}
		writeLines( base::paste( newIds, seqs, sep="\n"), con=conOut)
		cat( ".")
	}

	close( conIn)
	close( conOut)
	cat( "\nN_reads:            ", round( as.integer(nread)/4))

	return()
}
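
# Usage sketch (hypothetical file names; not run at source time).  With the
# default Qscores=TRUE, each read's average Phred score is appended to its FASTA ID.
if (FALSE) {
	fastqToFasta( "sample.fastq.gz", fileout="sample.fasta", Qscores=FALSE)
}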


`clipBases` <- function( mySEQ, myScores, laneID, clip5prime, clip3prime, scoresAsIntegers=TRUE) {

	# parse the clip instructions...
	if ( is.null( clip5prime)) clip5prime <- 0
	if ( is.null( clip3prime)) clip3prime <- 0
	clipLanes <- vector()
	if ( ! is.null( names(clip5prime))) clipLanes <- names( clip5prime)
	if ( ! is.null( names(clip3prime))) clipLanes <- unique.default( base::append( clipLanes, names( clip3prime)))
	nClipLanes <- length( clipLanes)
	doByLane <- (nClipLanes > 0)
	if ( doByLane) {
		# there were some named lanes, so expand to a complete set
		trims <- matrix( 0, nrow=max(as.numeric(clipLanes)), ncol=2)
		trims[ as.numeric( names( clip5prime)), 1] <- base::unlist( clip5prime)
		trims[ as.numeric( names( clip3prime)), 2] <- base::unlist( clip3prime)
	} 

	# get the lengths
	nlines <- length( mySEQ)
	nbases <- base::nchar( mySEQ)
	nscoreCh <- base::nchar( myScores)

	# and remove the leading spaces before the first base score
	if ( scoresAsIntegers) {
	  repeat {
		hasBlank <- which( base::substr( myScores,1,1) == " ")
		if ( length( hasBlank) < 1) break
		myScores[ hasBlank] <- sub( " ", "", myScores[ hasBlank], fixed=TRUE)
		nscoreCh[ hasBlank] <- nscoreCh[ hasBlank] - 1
	  }
	  scoreList <- strsplit( myScores, split=" ", fixed=TRUE)
	  lens <- sapply( scoreList, length)
	  if ( any( lens != nbases)) warning("clipBases:  Read -- Quality score length mis-match")
	}

	# ready to do the clipping...
	if ( ! doByLane) {
		cat( " clipping all lanes(5',3')=", clip5prime, clip3prime)
		new5 <- 1 + clip5prime
		for ( i in 1:nlines) {
			new3 <- nbases[i] - clip3prime
			mySEQ[i] <- base::substr( mySEQ[i], new5, new3)
			if ( scoresAsIntegers) {
				scoretmp <- scoreList[[i]][ new5:new3]
				myScores[i] <- base::paste( scoretmp, collapse=" ")
			} else {
				myScores[i] <- base::substr( myScores[i], new5, new3)
			}
		}

	} else {
		for ( ilane in 1:nClipLanes) {
			lane <- clipLanes[ ilane]
			this5 <- trims[ as.integer( lane),1]
			this3 <- trims[ as.integer( lane),2]
			if ( all( c( this5, this3) == 0)) next
			who <- which( laneID == lane)
			if ( length( who) < 1) next
			new5 <- 1 + this5
			cat( " clipping lane",lane,"(5',3')=", this5, this3)
			for ( i in who) {
				new3 <- nbases[i] - this3
				mySEQ[i] <- base::substr( mySEQ[i], new5, new3)
				if ( scoresAsIntegers) {
					scoretmp <- scoreList[[i]][ new5:new3]
					myScores[i] <- base::paste( scoretmp, collapse=" ")
				} else {
					myScores[i] <- base::substr( myScores[i], new5, new3)
				}
			}
		}
	}

	# ready.
	out <- list( "seqs"=mySEQ, "scores"=myScores)
	return( out)
}


`clipFastq` <- function( filein, fileout, clip5prime=0, clip3prime=0) {

	# clip bases off an existing fastq file

	filein <- allowCompressedFileName( filein)
	if ( ! file.exists( filein)) stop( paste("Can't find input file: ", filein))
	conIn <- openCompressedFile( filein, open="r")

	if ( regexpr( ".gz$", fileout) > 0) {
		conOut <- gzfile( fileout, open="w")
	} else {
		conOut <- file( fileout, open="w")
	}

	chunkSize <- 800000
	nread <- 0

	cat( "\nClipping  ( 5', 3') = ", clip5prime, clip3prime, "\n")
	repeat {
		chunk <- readLines( conIn, n=chunkSize)
		if ( length( chunk) < 1) break

		nread <- nread + length( chunk)

		# get the lines we want
		ids <- chunk[ seq( 1, length(chunk), by=4)]
		seqs <- chunk[ seq( 2, length(chunk), by=4)]
		scores <- chunk[ seq( 4, length(chunk), by=4)]

		ans <- clipBases( seqs, scores, ids, clip5prime=clip5prime, clip3prime=clip3prime, scoresAsIntegers=FALSE)

		newSeqs <- ans$seqs
		newScores <- ans$scores

		newLen <- base::nchar( newSeqs[1])
		newIds <- sub( "=[0-9]+$", base::paste( "=",newLen, sep=""), ids) 
		newId2 <- sub( "@","+", newIds, fixed=TRUE)
		writeLines( base::paste( newIds, newSeqs, newId2, newScores, sep="\n"), con=conOut)
		cat( ".")
	}

	close( conIn)
	close( conOut)
	cat( "\nN_reads trimmed:            ", round( as.integer(nread)/4))
	cat( "\nNew Read Length:  ", base::nchar( newSeqs[1]), "\n")

	return()
}
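
# Usage sketch (hypothetical file names; not run at source time):  trim a fixed
# number of bases from each end of every read.
if (FALSE) {
	clipFastq( "sample.fastq.gz", fileout="sample.clipped.fastq.gz",
			clip5prime=4, clip3prime=10)
}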


`cropBases` <- function( mySEQ, myScores, laneID, crop5primePattern=NULL, crop3primePattern=NULL) {

	# get the lengths
	nlines <- length( mySEQ)
	nbases <- base::nchar( mySEQ)
	nscoreCh <- base::nchar( myScores)

	# ready to do the pattern search
	newSEQ <- mySEQ
	newScores <- myScores
	n3cropped <- n5cropped <- 0
	if ( ! is.null( crop3primePattern)) {
		hitPt <- regexpr( crop3primePattern, newSEQ)
		goodHit <- which( hitPt > 0)
		if ( nNow <- length(goodHit)) {
			newSEQ[goodHit] <- substr( newSEQ[goodHit], 1, hitPt[goodHit]-1)
			newScores[goodHit] <- substr( newScores[goodHit], 1, hitPt[goodHit]-1)
			n3cropped <- nNow
		}
	}
	if ( ! is.null( crop5primePattern)) {
		hitPt <- regexpr( crop5primePattern, newSEQ)
		hitLen <- attributes(hitPt)$match.length
		goodHit <- which( hitPt > 0)
		if ( nNow <- length(goodHit)) {
			newSEQ[goodHit] <- substring( newSEQ[goodHit], hitPt[goodHit] + hitLen[goodHit])
			newScores[goodHit] <- substring( newScores[goodHit], hitPt[goodHit] + hitLen[goodHit])
			n5cropped <- nNow
		}
	}

	# ready.
	out <- list( "seqs"=newSEQ, "scores"=newScores, "n5cropped"=n5cropped, "n3cropped"=n3cropped)
	return( out)
}


`cropFastq` <- function( filein, fileout, crop5primePattern=NULL, crop3primePattern=NULL) {

	# crop bases off the reads of an existing fastq file, using a DNA regular expression pattern

	filein <- allowCompressedFileName( filein)
	if ( ! file.exists( filein)) stop( paste("Can't find input file: ", filein))
	conIn <- openCompressedFile( filein, open="r")

	if ( regexpr( ".gz$", fileout) > 0) {
		conOut <- gzfile( fileout, open="w")
	} else {
		conOut <- file( fileout, open="w")
	}

	chunkSize <- 800000
	nread <- 0
	n5cropped <- n3cropped <- 0

	cat( "\nCropping reads by sequence pattern (5' | 3') = ", crop5primePattern, "|", crop3primePattern, "\n")
	repeat {
		chunk <- readLines( conIn, n=chunkSize)
		if ( length( chunk) < 1) break

		nread <- nread + length( chunk)

		# get the lines we want
		ids <- chunk[ seq( 1, length(chunk), by=4)]
		seqs <- chunk[ seq( 2, length(chunk), by=4)]
		scores <- chunk[ seq( 4, length(chunk), by=4)]

		ans <- cropBases( seqs, scores, ids, crop5primePattern=crop5primePattern, crop3primePattern=crop3primePattern)

		newSeqs <- ans$seqs
		newScores <- ans$scores
		n5cropped <- n5cropped + ans$n5cropped
		n3cropped <- n3cropped + ans$n3cropped

		newLen <- base::nchar( newSeqs[1])
		newIds <- sub( "=[0-9]+$", base::paste( "=",newLen, sep=""), ids) 
		newId2 <- sub( "@","+", newIds, fixed=TRUE)
		writeLines( base::paste( newIds, newSeqs, newId2, newScores, sep="\n"), con=conOut)
		cat( ".")
	}

	close( conIn)
	close( conOut)
	cat( "\nN_Reads scanned:              ", round( as.integer(nread)/4))
	cat( "\nN_Reads cropped from 5' end:  ", n5cropped)
	cat( "\nN_Reads cropped from 3' end:  ", n3cropped)
	cat( "\nWrote new FASTQ file:         ", fileout)

	return()
}
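
# Usage sketch (hypothetical file names and adapter pattern; not run at source
# time):  crop reads at a regular expression match, e.g. a 3' adapter.
if (FALSE) {
	cropFastq( "sample.fastq.gz", fileout="sample.cropped.fastq.gz",
			crop3primePattern="AGATCGGAAGAGC")
}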


`fastqPatternSearch` <- function( filein, patterns, max.mismatch=0, chunkSize=1000000,
				mode=c("count","fastq")) {

	require( Biostrings)
	mode <- match.arg( mode)

	fileToUse <- allowCompressedFileName( filein)
	if ( ! file.exists( fileToUse)) stop( paste("Can't find input file: ", fileToUse))
	conIn <- openCompressedFile( fileToUse, open="r")

	nread <- 0
	cat( "\nReading file: ", fileToUse, "\n")

	nPatt <- length( patterns)
	findCounts <- rep( 0, times=nPatt)
	findIDs <- findSeqs <- findQuals <- vector()
	nFound <- 0

	repeat {
		chunk <- readLines( conIn, n=chunkSize)
		if ( length( chunk) < 1) break

		# get the lines we want
		ids <- chunk[ seq( 1, length(chunk), by=4)]
		seqs <- chunk[ seq( 2, length(chunk), by=4)]
		N <- length( ids)

		# turn these into a searchable string set
		subjects <- DNAStringSet( seqs)
		for ( k in 1:nPatt) {
			v <- vcountPattern( patterns[k], subjects, max.mismatch=max.mismatch)
			nHits <- sum(v)
			if ( ! nHits) next
			# we got at least one hit
			findCounts[k] <- findCounts[k] + nHits
			if (mode == "fastq") {
				# we need to know which read(s) got hit; a read with more
				# than one match is still just one FASTQ record
				isHit <- which( v > 0)
				nReadsHit <- length( isHit)
				now <- (nFound + 1) : (nFound + nReadsHit)
				findIDs[now] <- ids[ isHit]
				findSeqs[now] <- seqs[ isHit]
				quals <- chunk[ seq( 4, length(chunk), by=4)]
				findQuals[now] <- quals[ isHit]
				nFound <- nFound + nReadsHit
			}
		}
		nread <- nread + N
		cat( "\rReads: ", formatC( as.integer(nread), big.mark=","), "\tHits: ", sum( findCounts))
	}
	cat( "\n")

	close( conIn)

	if ( mode == "count") {
		out <- findCounts
		names(out) <- patterns
	} else {
		out <- ""
		if (nFound > 0) out <- paste( "@", findIDs, "\n", findSeqs, "\n+\n", findQuals, sep="")
	}
	return( out)
}
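
# Usage sketch (hypothetical file name and patterns; requires the Biostrings package):
if (FALSE) {
	cnts <- fastqPatternSearch( "sample.fastq.gz",
				patterns=c( "ACGTACGT", "TTTTTTTT"), max.mismatch=1)
	cnts    # named vector of match counts, one entry per pattern
}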


`bam2fastq` <- function( bamfile, outfile=sub( "\\.bam$", "", bamfile), verbose=TRUE) {


	if (verbose) cat( "\nConverting BAM file: ", bamfile)
	if (verbose) cat( "\nCreating FASTQ(s):   ", outfile, "\n")

	#cmdline <- paste( "bam2fastq.pl ", " --filter '-F 0x000'   --prefix ", outfile, "  ", bamfile)

	# now using samtools fastq utility...
	samtools <- Sys.which( "samtools")
	if ( samtools == "") stop( "Executable not found on search path:  'samtools'")

	# the input must be sorted on read name field
	cmdLine1 <- paste( samtools, " sort -n ", bamfile)

	# then we can convert, with or without compression
	compressFlag <- if (grepl( "\\.gz$", outfile)) " -c 6 " else ""
	cmdLine2 <- paste( samtools, " fastq -n ", compressFlag, " -0 ", outfile, " - ")

	# make a pipe
	trapStdErr <- if (verbose) "" else " 2>/dev/null"
	cmdLine <- paste( cmdLine1, "|", cmdLine2, trapStdErr)
	if (verbose) cat( "Command Line: ", cmdLine, "\n")
	catch.system( cmdLine)

	if (verbose) cat( "  Done.\n")
	return( )
}
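
# Usage sketch (hypothetical file names; requires 'samtools' on the search path):
if (FALSE) {
	bam2fastq( "aligned.bam", outfile="aligned.fastq.gz")
}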


`fastqToPeptides` <- function( filein, fileout, chunkSize=100000, maxReads=NULL, maxPeptides=NULL,
				lowComplexityFilter=FALSE, trim5=0, trim3=0, clipAtStop=TRUE, ...) {

	filein <- allowCompressedFileName( filein)
	if ( ! file.exists( filein)) stop( paste("Can't find input file: ", filein))
	conIn <- openCompressedFile( filein, open="r")

	chunkLines <- chunkSize * 4

	GREP <- base::grep
	LAPPLY <- base::lapply
	NCHAR <- base::nchar
	SETDIFF <- base::setdiff
	WHICH <- base::which

	# local function to parallelize
	myDNAtoAAfunc <- function( x, peps, cnts) {

		peptides <- counts <- vector()
		nout <- 0

		LAPPLY( x, function(i) {
				dna <- peps[ i]
				# they have to be full length with no N's or non-AA calls
				# 'full length' is a bit too restrictive...  we lose many peptides
				# at the true stop codon of the protein...  Relax a bit.

				# If Clipping at Stop Codon:
				# keep any where the STOP was in the last 25% of the peptide and a hard minimum length
				lenFullAA <- floor( NCHAR(dna)/3 * 0.75)
				if (lenFullAA < 5) lenFullAA <- 5

				# If not clipping at Stop Codon, let's reject those with 2+ stops??

				AAs <- DNAtoAA( dna, clipAtStop=clipAtStop)
				if (clipAtStop) {
					good <- SETDIFF( WHICH( NCHAR(AAs) >= lenFullAA), GREP( "?", AAs, fixed=T))
				} else {
					good <- GREP( "^[A-Z]+\\*?[A-Z]*$", AAs)
					hasSTOP <- GREP( "*", AAs[good])
					# do some trimming before/past Stops
					if ( length(hasSTOP)) {
						AAs <- AAs[good]
						good <- 1:length(AAs)
						whereStop <- regexpr( "*", AAs, fixed=T)
						isFront <- which( whereStop <= lenFullAA/3)
						if ( length(isFront)) AAs[isFront] <- substr( AAs[isFront], whereStop[isFront], nchar(AAs[isFront]))
						isBack <- which( whereStop >= lenFullAA)
						if ( length(isBack)) AAs[isBack] <- substr( AAs[isBack], 1, whereStop[isBack])
					}
				}
				if ( ngood <- length(good)) {
					nnow <- (nout+1) : (nout+ngood)
					peptides[ nnow] <<- AAs[ good]
					counts[ nnow] <<- cnts[i]
					nout <<- nout + ngood
				}
				return(NULL)
			})

		# the mapping from DNA to AA may have generated duplicates
		if ( length( peptides) < 1) return( data.frame())
		tapply( 1:length(peptides), factor( peptides), FUN=function(x) {
				if ( length(x) < 2) return()
				totalCnt <- sum( counts[x])
				counts[ x[1]] <<- totalCnt
				counts[ x[2:length(x)]] <<- 0
				return()
			}, simplify=FALSE)
		keep <- which( counts > 0)

		# final ans is a table of peptides with counts
		ans <- list( "Peptide"=peptides[keep], "Count"=counts[keep])
		return(ans)
	}  # end of local function


	nread <- ntotal <- nUtotal <- 0
	repeat {
		chunk <- readLines( conIn, n=chunkLines)
		if ( length( chunk) < 1) break

		nReadsNow <- length(chunk)/4
		nread <- nread + nReadsNow
		cat( "\nReads: ", prettyNum( as.integer( nread), big.mark=","))

		# get the raw read lines we want
		seqs <- chunk[ seq.int( 2, length(chunk), 4)]

		# allow trimming
		if ( trim5 > 0) {
			cat( " trim5:", trim5)
			N <- max( nchar(seqs))
			seqs <- substr( seqs, (trim5+1), N)
		}
		if ( trim3 > 0) {
			cat( " trim3:", trim3)
			N <- max( nchar(seqs))
			seqs <- substr( seqs, 1, (N-trim3))
		}

		# but only do one of each duplicate
		seqCntsTbl <- table( seqs)
		uniqSeqs <- names( seqCntsTbl)
		uniqCnts <- as.vector( seqCntsTbl)
		nReadsNow <- N <- length(uniqCnts)
		rm( seqCntsTbl)
		nUtotal <- nUtotal + N
		cat( "  Unique: ", prettyNum( as.integer( nUtotal)))

		coreOpt <- getOption("cores")
		nCores <- if ( is.null(coreOpt)) 1 else as.integer( coreOpt)
		if ( nCores < 2 || N < 100) {
			doMULTICORE <- FALSE
			ans <- myDNAtoAAfunc( 1:N, uniqSeqs, uniqCnts)
		} else {
			doMULTICORE <- TRUE
			starts <- seq.int( 1, nReadsNow, round(nReadsNow/nCores))
			stops <- c( starts[2:length(starts)] - 1, nReadsNow)
		        cat( "  use", nCores, "cores..")
			locs <- vector( mode="list")
			for ( i in 1:length(starts)) locs[[i]] <- starts[i] : stops[i]

			ans <- multicore.lapply( locs, FUN=myDNAtoAAfunc, peps=uniqSeqs, cnts=uniqCnts, 
						preschedule=TRUE)
		}

		# extract all the little data frame answers
		bigp <- bigc <- vector()
		if ( ! doMULTICORE) {
			bigp <- ans$Peptide
			bigc <- ans$Count
		} else {
		    cat( "  merge..")
		    for ( k in 1:length(ans)) {
			obj <- ans[[k]]
			lenNow <- length(obj$Peptide)
			if ( lenNow < 1) next
			newlocs <- (length(bigp) + 1) : (length(bigp) + lenNow)
			bigp[ newlocs] <- obj$Peptide
			bigc[ newlocs] <- obj$Count
		    }
		}
		N <- length(bigp)
		if ( ! N) next

		# the mapping from DNA to AA may have generated duplicates
		if ( N > 1) {
			cat( "  dups..")
			tapply( 1:N,  factor( bigp), FUN=function(x) {
					if ( length(x) < 2) return()
					totalCnt <- sum( bigc[x])
					bigc[ x[1]] <<- totalCnt
					bigc[ x[2:length(x)]] <<- 0
					return()
				}, simplify=FALSE)
		}
		keep <- which( bigc > 0)
		ansDF <- data.frame( "Peptide"=bigp[keep], "Count"=bigc[keep], stringsAsFactors=F)
		nout <- nrow(ansDF)

		# allow dropping of unwanted peptides
		if ( lowComplexityFilter) {
			cat( "  lowComplexity..")
			toDrop <- lowComplexityPeptides( ansDF$Peptide, ...)
			if ( length( toDrop)) {
				ansDF <- ansDF[ -toDrop, ]
				nout <- nrow(ansDF)
			}
		}
		if ( ! nout) next

		ntotal <- ntotal + nout
		cat( "  Peptides: ", prettyNum( as.integer( ntotal)), "  ", as.percent( ntotal, big.value=nUtotal*100, 
				percentSign=FALSE), "per Read")

		#write.table( ansDF, file=conOut, col.names=(ntotal == nout), sep="\t", quote=F, row.names=F)
		if ( ntotal == nout) {
			write.table( ansDF, file=fileout, append=FALSE, col.names=TRUE, sep="\t", quote=F, row.names=F)
		} else {
			write.table( ansDF, file=fileout, append=TRUE, col.names=FALSE, sep="\t", quote=F, row.names=F)
		}

		# perhaps stop early...
		if ( ! is.null( maxReads)) {
			if ( nread >= maxReads) break
		}
		if ( ! is.null( maxPeptides)) {
			if ( ntotal >= maxPeptides) break
		}
	}

	close( conIn)
	#close( conOut)
	cat( "\nTotal_Reads:            ", prettyNum( as.integer(nread), big.mark=","))
	cat( "\nTotal_Peptides:         ", prettyNum( as.integer(ntotal), big.mark=","))
	cat( "\n")
	return( ntotal)
}
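
# Usage sketch (hypothetical file names; not run at source time):  translate raw
# reads into peptides, writing a 2-column 'Peptide'/'Count' table.  Extra '...'
# arguments pass through to the low complexity filter.
if (FALSE) {
	nPeptides <- fastqToPeptides( "sample.fastq.gz", fileout="sample.peptides.txt",
				trim5=2, lowComplexityFilter=TRUE)
}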


`fastqReader` <- function() {

	filename <- ""
	con <- NULL
	N <- 0

	initialize <- function( file) {

		cat( "\nInitializing FASTQ reader for: ", file)
		if ( ! file.exists( file)) {
			cat( "\nFile not found: ", file)
			return( "Error")
		}
		con <<- openCompressedFile( file, open="rb")
		filename <<- file
		return( file)
	}

	read <- function( n=1) {
	
		nlines <- n * 4
		txt <- readLines( con, n=nlines, skipNul=TRUE)
		nread <- length( txt)
		if ( nread < 4) return(NULL)
		isRID <- seq( 1, nread, by=4)
		N <<- N + length(isRID)
		# strip off the "@" from the IDs
		ridStrs <- txt[isRID]
		ridStrs <- base::substr( ridStrs, 2, nchar(ridStrs))
		return( list( "rid"=ridStrs, "seq"=txt[isRID+1], "score"=txt[isRID+3]))
	}

	finalize <- function() {
		if ( !is.null(con)) base::close(con)
		cat( "\nRead", N, "records from: ", filename, "\n")
		return()
	}

	return( environment())
}


`fastqWriter` <- function() {

	filename <- ""
	con <- NULL
	N <- 0

	initialize <- function( file) {

		cat( "\nInitializing FASTQ writer for: ", file)
		# only compress when the file name asks for it:  a gzfile() connection
		# writes gzip'd output regardless of its open mode
		if (grepl( "\\.gz$", file)) {
			con <<- gzfile( file, open="wb")
		} else {
			con <<- base::file( file, open="w")
		}
		filename <<- file
		return( file)
	}

	write <- function( rid, seq, score) {
	
		#txt <- base::paste( "@", rid, "\n", seq, "\n+\n", score, sep="")
		#writeLines( txt, con=con)
		writeLines( base::paste( "@", rid, "\n", seq, "\n+\n", score, sep=""), con=con)
		N <<- N + length(rid)
		return()
	}

	finalize <- function() {
		if ( !is.null(con)) base::close(con)
		cat( "\nWrote", N, "records to: ", filename, "\n")
		return()
	}

	return( environment())
}
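
# The reader and writer closures pair up for streaming work.  A minimal
# copy-loop sketch (hypothetical file names; not run at source time):
if (FALSE) {
	fin <- fastqReader()
	fin$initialize( "in.fastq.gz")
	fout <- fastqWriter()
	fout$initialize( "out.fastq.gz")
	repeat {
		item <- fin$read( n=10000)
		if ( is.null( item)) break
		fout$write( item$rid, item$seq, item$score)
	}
	fin$finalize()
	fout$finalize()
}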


`fastqToPairedFiles` <- function( file, secondFile=NULL, max.buf=20000) {

	options( warn=-1)
	on.exit( options( warn=0))

	fin <- fastqReader()
	if ( fin$initialize(file) != file) return()

	f1name <- sub( "fastq.gz", "1.fastq.gz", file)
	fout1 <- fastqWriter()
	fout1$initialize(f1name)
	on.exit( fout1$finalize(), add=TRUE)

	f2name <- sub( "fastq.gz", "2.fastq.gz", file)
	fout2 <- fastqWriter()
	fout2$initialize(f2name)
	on.exit( fout2$finalize(), add=TRUE)

	rids1 <- rids2 <- seqs1 <- seqs2 <- scores1 <- scores2 <- rep.int("", 100)
	n1 <- n2 <- nout <- 0

	find1 <- function( rid) {
			if ( n1 < 1) return(0)
			who <- (1:n1)[ rid == rids1[1:n1]][1]
			return( if (is.na(who)) 0 else who)
	}

	find2 <- function( rid) {
			if ( n2 < 1) return(0)
			who <- (1:n2)[ rid == rids2[1:n2]][1]
			return( if (is.na(who)) 0 else who)
	}

	store1 <- function( rid, seq, score) {
			where <- if ( n1 > 0) (1:n1)[ rids1[1:n1] == ""][1] else NA
			#where <- if ( n1 > 0) which( rids1[1:n1] == "")[1] else NA
			if ( is.na( where)) {
				n1 <<- n1 + 1
				where <- n1
			}
			rids1[where] <<- rid
			seqs1[where] <<- seq
			scores1[where] <<- score
			return()
	}

	store2 <- function( rid, seq, score) {
			where <- if ( n2 > 0) (1:n2)[ rids2[1:n2] == ""][1] else NA
			#where <- if ( n2 > 0) which( rids2[1:n2] == "")[1] else NA
			if ( is.na( where)) {
				n2 <<- n2 + 1
				where <- n2
			}
			rids2[where] <<- rid
			seqs2[where] <<- seq
			scores2[where] <<- score
			return()
	}

	squeeze1 <- function() {
			isBlank <- which( rids1 == "")
			if ( length( isBlank) > 0) {
				rids1 <<- rids1[ -isBlank]
				seqs1 <<- seqs1[ -isBlank]
				scores1 <<- scores1[ -isBlank]
				n1 <<- length(rids1)
				if ( n1 > max.buf) {
					drops <- 1 : (n1-max.buf)
					rids1 <<- rids1[ -drops]
					seqs1 <<- seqs1[ -drops]
					scores1 <<- scores1[ -drops]
					n1 <<- length(rids1)
				}
			}
	}

	squeeze2 <- function() {
			isBlank <- which( rids2 == "")
			if ( length( isBlank) > 0) {
				rids2 <<- rids2[ -isBlank]
				seqs2 <<- seqs2[ -isBlank]
				scores2 <<- scores2[ -isBlank]
				n2 <<- length(rids2)
				if ( n2 > max.buf) {
					drops <- 1 : (n2-max.buf)
					rids2 <<- rids2[ -drops]
					seqs2 <<- seqs2[ -drops]
					scores2 <<- scores2[ -drops]
					n2 <<- length(rids2)
				}
			}
	}



	# the main loop is to look for mate pairs within the file, buffering up as we go
	# so read one at a time
	cat("\n")
	repeat {
		item <- fin$read(1)
		if (is.null(item)) break
		
		rid1 <- rid2 <- item[[1]]
		nc <- nchar( rid1)
		mate <- substr( rid1, nc, nc)
		rid <- substr( rid1, 1, nc-1)
		seq <- item[[2]]
		score <- item[[3]]
		is1 <- (mate == '1')
		hit <- FALSE

		if (is1) {
			where2 <- find2( rid)
			if ( where2 == 0) {
				store1( rid, seq, score)
			} else {
				fout1$write( rid1, seq, score)
				rid2 <- paste( rid, "2", sep="")
				fout2$write( rid2, seqs2[where2], scores2[where2])
				rids2[where2] <- ""
				nout <- nout + 1
				hit <- TRUE
			}
		} else {
			where1 <- find1( rid)
			if ( where1 == 0) {
				store2( rid, seq, score)
			} else {
				fout2$write( rid2, seq, score)
				rid1 <- paste( rid, "1", sep="")
				fout1$write( rid1, seqs1[where1], scores1[where1])
				rids1[where1] <- ""
				nout <- nout + 1
				hit <- TRUE
			}
		}
		if ( hit && nout %% 100 == 0) {
			squeeze1()
			squeeze2()
			cat( "\rPairs:", nout, "  Buf1:", n1, "  Buf2:", n2)
		}
	}

	# done reading the file...
	fin$finalize()
	cat( "\rPairs:", nout, "  Buf1:", n1, "  Buf2:", n2)

	# if a second file was given, (the 'No Hits'), use it as a read only source of possible second mates
	if ( ! is.null( secondFile)) {
	    if ( fin$initialize(secondFile) == secondFile) {
		cat("\n")
		repeat {
			item <- fin$read( n=10000)
			if (is.null(item)) break
			
			ridset <- item[[1]]
			seqset <- item[[2]]
			scoreset <- item[[3]]
			hit <- FALSE

			# see if any of whats in our buffers match these
			try1 <- paste( rids1, "2", sep="")
			where <- match( ridset, try1, nomatch=0)
			if ( any( where > 0)) {
				set2 <- which( where > 0)
				set1 <- where[ set2]
				for (j in 1:length(set2)) {
					j1 <- set1[j]
					j2 <- set2[j]
					fout1$write( rids1[j1], seqs1[j1], scores1[j1])
					fout2$write( ridset[j2], seqset[j2], scoreset[j2])
					rids1[j1] <- ""
					nout <- nout + 1
				}
				hit <- TRUE
			}

			try2 <- paste( rids2, "1", sep="")
			where <- match( ridset, try2, nomatch=0)
			if ( any( where > 0)) {
				set1 <- which( where > 0)
				set2 <- where[ set1]
				for (j in 1:length(set1)) {
					j1 <- set1[j]
					j2 <- set2[j]
					fout1$write( ridset[j1], seqset[j1], scoreset[j1])
					fout2$write( rids2[j2], seqs2[j2], scores2[j2])
					rids2[j2] <- ""
					nout <- nout + 1
				}
				hit <- TRUE
			}
			if (hit) {
				squeeze1()
				squeeze2()
				cat( "\rPairs:", nout, "  Buf1:", n1, "  Buf2:", n2, "   ")
				if ( n1 == 0 && n2 == 0) break
			}
		}
		fin$finalize()
	    }
	}

	# all done now
}
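
# Usage sketch (hypothetical file name; not run at source time):  split an
# interleaved FASTQ file into separate mate 1 / mate 2 files by read ID suffix.
if (FALSE) {
	fastqToPairedFiles( "sample.fastq.gz")
	# writes sample.1.fastq.gz and sample.2.fastq.gz
}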


`fastqCompressFolder` <- function( folder, recursive=FALSE) {

	fastqPattern <- "(fq|fastq)$"
	fastqFiles <- dir( folder, recursive=recursive, pattern=fastqPattern, full.name=T)
	N <- length( fastqFiles)
	if ( ! N) return( NULL)

	out <- data.frame()
	cat( "\nNumber of .FASTQ files to compress:  ", length(fastqFiles), "\n")
	for ( i in 1:N) {
		f <- fastqFiles[i]
		cat( "\r",i, " Compressing ", basename(f))
		ans <- fastqCompressFile( f)
		if ( is.null(ans)) next
		out <- rbind( out, as.data.frame(ans))
		cat( "  done.  Pct_Reduction: ", ans$PctReduced)
	}

	rownames(out) <- 1:nrow(out)
	ttlReduction <- sum( out$Reduction)
	cat( "\nFile Size Reduction: ", prettyNum( ttlReduction, big.mark=","), "bytes.\n")
	return( out)
}


`fastqCompressFile` <- function( f) {

	info1 <- file.info( f)
	if ( info1$isdir) {
		cat( "\nSkipping folder: ", f, "\n")
		return( NULL)
	}
	f2 <- paste( f, "gz", sep=".")
	cmdline <- paste( "gzip", f, sep=" ")
	catch.system( cmdline)
	info2 <- file.info( f2)

	size1 <- info1$size
	size2 <- info2$size
	reduction <- size1 - size2
	pct <- round( reduction * 100 / size1, digits=2)
	out <- data.frame( "File"=basename(f), "Original"=size1, "Compressed"=size2, 
			"Reduction"=reduction, "PctReduced"=pct, stringsAsFactors=F)
	return(out)
}
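
# Usage sketch (hypothetical folder name; requires system 'gzip'):
if (FALSE) {
	sizeTbl <- fastqCompressFolder( "FASTQ_folder", recursive=TRUE)
}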


`fastqDemultiplex` <- function( fastqFileSet, indexFile, sep=",", sampleColumn="SampleID", 
				index1Column="i7", index2Column="i5", plexStripPattern="^.+0:", 
				buffer.size=100000, max.mismatch=1) {

	# set up to demultiplex a set of FASTQ files into many separate new FASTQ files
	# by extracting the multiplex adaptor sequences from the read ID line of each raw read
	indexTbl <- read.delim( indexFile, sep=sep, as.is=T, header=T)
	if ( ! (sampleColumn %in% colnames(indexTbl))) {
		cat( "\nFailed to find the sample ID column in Index file.  Tried: ", sampleColumn)
		cat( "\nCheck Index file contents, and file separator.  Used:      ", sep, "\n")
		return(NULL)
	}
	index1 <- indexTbl[[ index1Column]]
	if ( is.null(index1)) {
		cat( "\nFailed to find Index 1 column in Index file.  Tried: ", index1Column)
		return(NULL)
	}
	hasIndex2 <- FALSE
	if ( !is.null(index2Column) && (index2Column != "")) {
		index2 <- indexTbl[[ index2Column]]
		if ( is.null(index2)) {
			cat( "\nFailed to find Index 2 column in Index file.  Tried: ", index2Column)
			return(NULL)
		}
		hasIndex2 <- TRUE
	}

	# OK, the Index table seems valid, so make the file names we will create and the multiplex adaptors we will search for
	outIDs <- indexTbl[[ sampleColumn]]
	outFiles <- paste( gsub( " ", ".", outIDs), "fastq.gz", sep=".")
	NS <- length( outIDs)

	# make sure all input files are found
	if ( ! all( file.exists( fastqFileSet))) {
		cat( "\nError:  Some input FASTQ files not found.")
		return(NULL)
	}
	NFIN <- length(fastqFileSet)

	# we may allow the set of plex patterns to grow if we condone mismatches, so build the table 
	# of pointers from plex patterns to output files
	if (hasIndex2) {
		plexPatterns <- paste( index1, sapply( index2, myReverseComplement), sep="+")
	} else {
		plexPatterns <- index1
	}
	plexFilePtrs <- 1:NS
	nNewPlex <- 0

	# create the FQ writers for each output file
	# this makes a list of closures, each one a function for writing reads out to that one file
	fqList <- vector( mode="list", length=NS)
	for ( i in 1:NS) {
		myFile <- outFiles[i]
		fqList[[i]] <- fastqWriter()
		if ( fqList[[i]]$initialize(myFile) != myFile) {
			cat( "\nFailed to initialize new FQ writer for file: ", myFile)
			return( NULL)
		}
	}
		
	UNIQUE <- base::unique.default
	MATCH <- base::match
	WHICH <- base::which


	# local function to extract the multiplex part from the read ID lines
	extractPlexString <- function( rids, plexStripPattern) {
					return( sub( plexStripPattern, "", rids))
				}


	# we are now ready to start reading in from the set of input files
	options( warn=-1)
	on.exit( options( warn=0))

	totalIn <- totalOut <- totalDrop <- totalPerfect <- totalMismatch <- 0

	# tally up what never did map to output files
	worstPlexNotFound <- vector()

	for ( iIN in 1:NFIN) {
		cat( "\n")
		infile <- fastqFileSet[iIN]
		fin <- fastqReader()
		if ( fin$initialize(infile) != infile) {
			cat( "\nError initializing FQ reader for file: ", infile)
			cat( "\nSkipping this file..")
			next
		}

		nin <- nout <- ndrop <- nperfect <- nmismatch <- 0

		# try to only test for mismatches if we keep finding them
		nTryMissNoneFound <- 0

		# the main loop is to look at the indexing info in the ID line, and pass it off to one output file
		cat("\nReading: ", iIN, "of", NFIN, " \t", basename(infile), "\n")
		repeat {
			item <- fin$read( buffer.size)
			if (is.null(item)) break
			
			rids <- item[[1]]
			seqs <- item[[2]]
			scores <- item[[3]]
			nReads <- length(rids)
			if (nReads < 1) break
			nin <- nin + nReads
	
			# get just the multiplex info
			myPlexs <- extractPlexString( rids, plexStripPattern)

			# all of these should exist in our table
			# write reads to each output
			wh <- MATCH( myPlexs, plexPatterns, nomatch=0)
			for ( iOut in UNIQUE(wh)) {
				if ( iOut == 0) next
				now <- WHICH( wh == iOut)
				myFQ <- plexFilePtrs[ iOut]
				fqList[[myFQ]]$write( rids[now], seqs[now], scores[now])
			}
			nout <- nout + sum( wh > 0)
			nperfect <- nperfect + sum( wh > 0 & wh <= NS)
			nmismatch <- nmismatch + sum( wh > NS)

			# see if any of the unmatched plex strings are 'close enough'
			# to consider extending our set 
			if ( max.mismatch > 0 && nTryMissNoneFound < 10) {
				foundNew <- FALSE
				badPlex <- UNIQUE( myPlexs[ wh == 0])
				if ( length( badPlex)) {
					plexDist <- adist( badPlex, plexPatterns[1:NS])
					# given this small matrix of edit distances, see if any are good enough
					# and unique enough, to warrant being added
					for (k in 1:nrow(plexDist)) {
						bestDist <- min( plexDist[ k, ])
						if ( bestDist > max.mismatch) next
						bestPlex <- WHICH( plexDist[ k, ] == bestDist)
						if ( length(bestPlex) > 1) next
						# OK, we have one new plex string that is close enough to one and
						# only one original index string.  Add it to the list, and point 
						# at the same file
						plexPatterns <- c( plexPatterns, badPlex[k])
						plexFilePtrs <- c( plexFilePtrs, plexFilePtrs[bestPlex])
						nNewPlex <- nNewPlex + 1
						cat( "\nAdd Mmismatch: ", badPlex[k], " \tAs: ", plexPatterns[bestPlex],
								" \tSample: ", bestPlex, " \tEditDist: ", bestDist)
						foundNew <- TRUE
						nTryMissNoneFound <- 0
					}
				}
				if ( ! foundNew) nTryMissNoneFound <- nTryMissNoneFound + 1

				# if we just added some new plex patterns, see if we can write some reads out
				if (foundNew) {
					cat( "\n")
					whonew <- WHICH( wh == 0)
					rids2 <- rids[ whonew]
					seqs2 <- seqs[ whonew]
					scores2 <- scores[ whonew]
					myPlexs <- myPlexs[ whonew]
					wh <- MATCH( myPlexs, plexPatterns, nomatch=0)
					for ( iOut in unique(wh)) {
						if ( iOut == 0) next
						now <- WHICH( wh == iOut)
						myFQ <- plexFilePtrs[ iOut]
						fqList[[myFQ]]$write( rids2[now], seqs2[now], scores2[now])
					}
					nout <- nout + sum( wh > 0)
					nperfect <- nperfect + sum( wh > 0 & wh <= NS)
					nmismatch <- nmismatch + sum( wh > NS)
				}
			}
			ndrop <- ndrop + sum( wh == 0)
			plexNotFound <- table( myPlexs[ wh == 0])
			worstPlexNotFound <- if ( length( worstPlexNotFound)) mergeTables( worstPlexNotFound, plexNotFound) else plexNotFound
			# heartbeat diagnostics
			cat( "\rN_In, N_Out, N_Drop, %Perfect, %Mismatch, %Drop: ", nin, nout, ndrop, 
					round(nperfect*100/nin,digits=1), round(nmismatch*100/nin,digits=1), 
					round(ndrop*100/nin,digits=1))
			if ( nin %% 10000000 == 0) {
				cat( "\nMost seen unexpected multiplex strings:\n")
				print( sort( worstPlexNotFound, decreasing=T)[1:10])
			}
		}

		# done reading this input file...
		fin$finalize()
		cat( "\rN_In, N_Out, N_Drop, %Perfect, %Mismatch, %Drop: ", nin, nout, ndrop, 
					round(nperfect*100/nin,digits=1), round(nmismatch*100/nin,digits=1), 
					round(ndrop*100/nin,digits=1))
		totalIn <- totalIn + nin
		totalOut <- totalOut + nout
		totalDrop <- totalDrop + ndrop
		totalPerfect <- totalPerfect + nperfect
		totalMismatch <- totalMismatch + nmismatch

	}  # looping over all input files..

	# lastly, close all these new files
	for ( i in 1:NS) {
		fqList[[i]]$finalize()
	}

	# all done now
	cat( "\n\nDone.")
	cat( "\rN_In, N_Out, N_Drop, %Perfect, %Mismatch, %Drop: ", totalIn, totalOut, totalDrop, 
					round(totalPerfect*100/totalIn,digits=1), round(totalMismatch*100/totalIn,digits=1), 
					round(totalDrop*100/totalIn,digits=1), "\n")
}
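
# Usage sketch (hypothetical file and column names; not run at source time):
if (FALSE) {
	fastqDemultiplex( "run_R1.fastq.gz", indexFile="indexes.csv",
			sampleColumn="SampleID", index1Column="i7", index2Column="i5")
	# writes one '<SampleID>.fastq.gz' file per row of the index table
}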