R/fastaTools.R

Defines functions `gene2Fasta` `getFastaSeqFromFilePath` wrap.text `as.text.Fasta` `writeLongFasta` `writeFasta` `is.Fasta` `as.Fasta` `as.Fasta.data.frame` `readFasta` `loadFasta`

# fastaTools.R  --  collection of FASTA file manipulation routines


`loadFasta` <- function( file="file.fasta", mode=c("character","BStrings"), verbose=TRUE, short.desc=TRUE) {

	require( Biostrings)

	file <- allowCompressedFileName( file)
	if (verbose) cat( "\nLoading Fasta file: ", file, "...")

	fa <- readBStringSet( file)

	# do we send back character strings or BStrings?
	mode <- match.arg( mode)

	nams <- names(fa)
	if ( mode == "character") {
		seqs <- as.character(fa, use.names=FALSE)
	} else {
		seqs <- fa
	}

	# for consistency with other tools, trim the descriptor after the first blank
	if ( short.desc) {
		nams <- sub( " .+", "", nams)
	}

	return( list( "desc"=nams, "seq"=seqs))
}


`readFasta` <- function( file="file.fasta", verbose=TRUE, short.desc=TRUE) {

	return( loadFasta( file, verbose=verbose, short.desc=short.desc))
}


`as.Fasta.data.frame` <- function( fasta) {

	return( data.frame( "desc"=fasta$desc, "seq"=fasta$seq, stringsAsFactors=FALSE))
}


`as.Fasta` <- function( desc, seq) {

	if ( length( desc) != length( seq)) stop( "as.Fasta:  unequal length arguments")

	return( list( "desc"=desc, "seq"=seq))
}


`is.Fasta` <- function( x) return( is.list(x) && all( c("desc","seq") %in% names(x)))


`writeFasta` <- function( fasta, file=NULL, line.width=80) {

	if ( is.null( file)) stop( "writeFasta:  required 'file' argument is missing")
	require( Biostrings)
	# convert to a Biostrings object
	bstring <- BStringSet( fasta$seq)
	names( bstring) <- fasta$desc
	writeXStringSet( bstring, filepath=file, width=line.width)
	#writeLines( as.text.Fasta( fasta, line.width=line.width), con=file, sep="\n")
}


`writeLongFasta` <- function( desc, seq, file=NULL) {

	if ( is.null( file)) stop( "writeLongFasta:  required 'file' argument is missing")
	writeLines( base::paste( ">", desc, "\n", seq, sep=""), con=file, sep="\n")
}


`as.text.Fasta` <- function( fasta, line.width=80) {

	if ( ! is.list( fasta)) return("")
	N <- length(fasta$desc)
	if ( is.null(N) || N < 1) return("")

	out <- base::paste( ">", fasta$desc, "\n", wrap.text( fasta$seq, line.width=line.width), sep="")
	return( out)
}


wrap.text <- function( txt, line.width=60) {

	# re-format fasta text to be line wrapped to a fixed width
	N <- length(txt)
	if ( is.null(txt) || N < 1) return("")
	txtlen <- base::nchar( txt)
	SUBSTR <- base::substr
	SAPPLY <- base::sapply
	PASTE <- base::paste

	out <- SAPPLY( 1:N, function(i) {
			if ( (nch <- txtlen[i]) < 1) return("")
			oldtxt <- txt[i]
			smltxt <- SAPPLY( seq.default( 1, nch, by=line.width), function(j) {
					last <- min( (j+line.width-1), nch)
					SUBSTR( oldtxt, j, last)
				})
			PASTE( smltxt, collapse="\n")
		})
	return( out)
}


# smart fasta file lookup...
FastaFilePathEnv <- new.env( parent=emptyenv())
assign( "currentFastaFile", "", envir=FastaFilePathEnv)
assign( "currentFastaObject", NULL, envir=FastaFilePathEnv)


# get one FASTA sequence, by filename and seqID.  returns a Biostrings object.
`getFastaSeqFromFilePath` <- function( filePath, seqID, verbose=FALSE) {

	# see if we need to read a different file
	alreadyLoaded <- ( filePath == get( "currentFastaFile", envir=FastaFilePathEnv))

	if ( verbose) cat( "\nGetting FASTA seq for: ",seqID)
	if ( ! alreadyLoaded) {
		# we could be given an explicit filename OR a directory
		info <- file.info( filePath)
		if ( any( is.na( info$size))) stop( paste( "getFastaSeqFromFilePath:  file not found:  ", filePath))
		isDirectory <- info$isdir
		if( isDirectory) {
			pathRelative <- TRUE
			files <- dir( filePath)
			# try to find a file that has that seqID as part of its name
			if (verbose) cat( "   trying", length(files), "files in folder.")
			curSpecies <- getCurrentSpecies()
			tryFileName <- paste( seqID, ".fa", sep="")
			hit <- pmatch( tryFileName, files, nomatch=0)
			if ( hit == 0) {
				if (verbose) cat( "   trying prepend of speciesID.")
				tryFileName <- sub( paste( curSpecies,"_",sep=""), "", tryFileName, fixed=TRUE)
				hit <- pmatch( tryFileName, files, nomatch=0)
			}

			if ( hit == 0) {
			    files <- dir( filePath, full.name=T)
			    # last chance:  see if any subfolders have that file
			    myfolders <- files[ file.info( files)$isdir]
			    if ( length( myfolders) > 0) {
				pathRelative <- FALSE
			    	if (verbose) cat( "   trying", length( myfolders), "subfolders.")
			  	morefiles <- vector()
			  	for( f in myfolders) morefiles <- append( morefiles, dir( f, full.name=T))
			  	tryFileName <- paste( seqID, ".fa", sep="")
				#hit <- pmatch( tryFileName, morefiles, nomatch=0)
				hit <- grep( tryFileName, morefiles)
				hit <- if ( length(hit) > 0) hit[1] else 0
				if ( hit == 0) {
				    if (verbose) cat( "   trying prepend of speciesID.")
				    tryFileName <- sub( paste( curSpecies,"_",sep=""), "", tryFileName, fixed=TRUE)
				    #hit <- pmatch( tryFileName, morefiles, nomatch=0)
				    hit <- grep( tryFileName, morefiles)
				    hit <- if ( length(hit) > 0) hit[1] else 0
				}
				files <- morefiles
			    }
			}

			if ( hit == 0) stop( paste( "\nUnable to find FASTA file for:  ", seqID, "  in folder:  ", filePath))

			if (pathRelative) {
				useFile <- file.path( filePath, files[ hit])
			} else {
				useFile <- files[ hit]
			}
		} else {
			useFile <- filePath
		}

		# ok, we have a file, so load it
		assign( "currentFastaObject", loadFasta( useFile), envir=FastaFilePathEnv)
		assign( "currentFastaFile", useFile, envir=FastaFilePathEnv)
	}

	# now look for that seqID
	fasta <- get( "currentFastaObject", envir=FastaFilePathEnv)
	where <- match( seqID, fasta$desc, nomatch=0)
	if ( where > 0) return( fasta$seq[ where])

	# complain and quit if not found
	warning( paste( "Fasta descriptor:  ", seqID, "   not found in Fasta file/path:  ", filePath))
	return( "")
}


`gene2Fasta` <- function( genes, genomicDNAfilePath, mode=c("gdna","cdna","aa"), utr.tail.length=0, verbose=FALSE) {

	mode <- match.arg( mode)
	if ( mode != "gdna" && utr.tail.length > 0) stop( "UTR tails only compatible with 'gdna' mode.")

	gmap <- getCurrentGeneMap()
	smap <- getCurrentSeqMap()
	who <- match( genes, gmap$GENE_ID, nomatch=0)
	if ( any( who == 0)) {
		who <- match( genes, shortGeneName(gmap$GENE_ID,keep=1), nomatch=0)
		if ( all( who == 0)) stop( "No matching genes found in current gene map.")
		if ( any( who == 0)) {
			cat( "\nSome genes not found in current gene map.")
			cat( "\nN=", sum( who == 0), genes[who == 0])
		}
	}
	gmap <- gmap[ who, ]

	outDesc <- outSeq <- vector()
	visitOrder <- order( gmap$SEQ_ID)

	for ( i in visitOrder) {
		thisGene <- gmap$GENE_ID[i]
		thisSeqID <- gmap$SEQ_ID[i]

		gdna <- getFastaSeqFromFilePath( filePath=genomicDNAfilePath, seqID=thisSeqID )

		if ( mode == "gdna") {
			str <- substr( gdna, gmap$POSITION[i] , gmap$END[i])
			if ( utr.tail.length > 0) {
				gPos <- max( 1, gmap$POSITION[i] - utr.tail.length)
				gEnd <- min( smap$LENGTH[match(thisSeqID,smap$SEQ_ID)], gmap$END[i] + utr.tail.length)
				str <- substr( gdna, gPos, gEnd)
			}
			if ( gmap$STRAND[i] == "-") str <- myReverseComplement(str)
		} else {
			str <- convertGenomicDNAtoCodingDNA( geneID=thisGene, genomicDNA=gdna)
			if (mode == "aa") str <- DNAtoAA( str, readingFrame=1, clipAtStop=F)
		} 

		outDesc[i] <- thisGene
		outSeq[i] <- str

		if (verbose) cat( "\r", i, "  ", thisGene, "   N_Ch=", nchar(str))
	}

	return( as.Fasta( outDesc, outSeq))
}
robertdouglasmorrison/DuffyTools documentation built on April 16, 2024, 6:31 a.m.