# pipe.BCR.TCR.Profile.R
# turn RNA-seq data into BCR & TCR sequences, to search for V-D-J and CDR3 motifs
`pipe.BCR.TCR.Profile` <- function( sampleID, bcrtcrFastaFile=NULL, annotationFile="Annotation.txt", optionsFile="Options.txt",
results.path=NULL, gatherGeneAlignments.mode=c("all","best.one"),
doCutadapt=TRUE, cutadaptProgram=Sys.which("cutadapt"),
doSpades=FALSE, spadesProgram=Sys.which("spades.py"), spades.path=dirname(spadesProgram),
spades.mode=c("isolate","rna","meta"), kmerSizes=NULL, spades.args="",
min.aa.length=60, max.aa.length=210, min.score.per.aa=2, min.v.score=100, verbose=TRUE) {
if ( is.null( results.path)) results.path <- getOptionValue( optionsFile, "results.path",
notfound=".", verbose=F)
# make sure the reference file of var genes is readable
if ( ! is.null( bcrtcrFastaFile)) {
if ( ! file.exists( bcrtcrFastaFile)) {
cat( "\nError: 'bcrtcrFastaFile' of example BCR.TCR proteins not found: ", bcrtcrFastaFile)
return(NULL)
}
}
# path for all results
keyword <- "BCR.TCR"
spades.output.path <- file.path( results.path, "SpadesContigs", sampleID, keyword)
contigsFastaFile <- file.path( spades.output.path, "contigs.fasta")
peptidesFastaFile <- file.path( spades.output.path, paste( sampleID, keyword, "Peptides.fasta", sep="."))
proteinsFastaFile <- sub( "Peptides.fasta$", "Proteins.fasta", peptidesFastaFile)
proteinsTextFile <- sub( "Proteins.fasta$", "BestProteinHits.txt", proteinsFastaFile)
# remove any files we know we will remake
if (doSpades) file.delete( c( contigsFastaFile, peptidesFastaFile, proteinsFastaFile, proteinsTextFile))
# get the universe of BCR & TCR genes from the annotation
setCurrentSpecies( "Hs_grc")
gmap <- getCurrentGeneMap()
tcrMap <- gmap[ grep( "^TR[ABDG][VDJC]", gmap$GENE_ID), ]
bcrMap <- gmap[ grep( "^IG[HKL][VDJCG]", gmap$GENE_ID), ]
tcrgenes <- sort( unique( c( bcrMap$GENE_ID, tcrMap$GENE_ID)))
alignedReadsFile <- file.path( results.path, "fastq", paste( sampleID, "BCR.TCR.Hits.fastq.gz", sep="."))
nohitReadsFile <- file.path( results.path, "fastq", paste( sampleID, "noHits.fastq.gz", sep="."))
if ( doSpades || !file.exists(contigsFastaFile)) {
# verify we see the denovo tool executable location
if ( spades.path == "") {
cat( "\nError: path to SPAdes executable program not found..")
return(NULL)
}
# step 1: gather all aligned reads that land in/near the TCR genes loci
if ( ! file.exists( alignedReadsFile)) {
cat( "\nGathering BCR/TCR Gene aligned reads..\n")
gatherGeneAlignments.mode <- match.arg( gatherGeneAlignments.mode)
pipe.GatherGeneAlignments( sampleID, genes=tcrgenes, asFASTQ=TRUE, mode=gatherGeneAlignments.mode,
fastq.keyword="BCR.TCR.Hits")
}
# in most cases call cutadapt to trim off primer ends
# if we are doing the Cutadapt pass, check for those files, and call it if we need.
fastqFiles <- c( "BCR.TCR.Hits", "noHits")
if (doCutadapt) {
inputFastqFiles <- c( alignedReadsFile, nohitReadsFile)
filesToDo <- checkOrCallCutadapt( inputFastqFiles, asMatePairs=FALSE, forceMatePairs=FALSE,
cutadaptProgram=cutadaptProgram, kmer.size=45, verbose=verbose)
fastqFiles <- paste( fastqFiles, "trimmed", sep=".")
}
# step 2: create contigs of anything that may be TCR reads
spades.mode <- match.arg( spades.mode)
pipe.SpadesContigs( sampleID, fastqSource=fastqFiles, folderName="BCR.TCR",
spades.mode=spades.mode, kmerSizes=kmerSizes, spades.args=spades.args,
spades.path=spades.path, keyword=keyword, makePep=T)
# cleanup SPAdes temp files
cleanupSpadesFiles( path=spades.output.path, verbose=F)
} else {
cat( "\n\nUsing previously calculated Spades contigs..\n")
}
# because of reading frame and stop codon trimming, and that SPAdes has no length cutoff,
# we may have peptides that are now shorter than the minimun we want, or too long to be a VDJ construct.
pepFA <- loadFasta( peptidesFastaFile, short=T, verbose=F)
len <- nchar( pepFA$seq)
drops1 <- which( len < min.aa.length)
drops2 <- which( len > max.aa.length)
drops <- sort( union( drops1, drops2))
if ( length( drops)) {
pepFA <- as.Fasta( pepFA$desc[-drops], pepFA$seq[-drops])
writeFasta( pepFA, peptidesFastaFile, line=100)
}
NPEP <- length( pepFA$desc)
# step 3: Throw those contigs as peptides against the given set of full length TCR gene proteins
if ( ! is.null( bcrtcrFastaFile) && ! file.exists( proteinsTextFile)) {
cat( "\n\nSearching Contigs for BCR/TCR constructs..")
proteinAns <- bestSpadesProteins( sampleID, outpath=spades.output.path, proteinFastaFile=bcrtcrFastaFile,
keyword=keyword, verbose=verbose)
# visit each peptide's best match explicitly, and either drop, keep, and maybe trim
protDesc <- protSeq <- vector()
NPROT <- 0
for ( i in 1:nrow(proteinAns)) {
thisScore <- proteinAns$ScorePerAA[i]
if ( thisScore < min.score.per.aa) next
thisID <- proteinAns$ContigID[i]
where <- match( thisID, pepFA$desc, nomatch=0)
if ( ! where) next
NPROT <- NPROT + 1
protDesc[NPROT] <- thisID
protSeq[NPROT] <- pepFA$seq[where]
# see if we should trim any untranslated UTR from the peptide
thisProtStart <- proteinAns$ProteinStart[i]
if ( thisProtStart < 1) protSeq[NPROT] <- substring( protSeq[NPROT], abs(thisProtStart)+2)
}
if (NPROT) {
protFA <- as.Fasta( protDesc, protSeq)
writeFasta( protFA, proteinsFastaFile, line=100)
cat( "\nWrote", NPROT, "BCR/TCR proteins as FASTA to: ", basename(proteinsFastaFile))
}
# at this point, we have a small chance that none look like BCR/TCR genes
if ( ! NPROT) {
cat( "\nNo peptides look like BCR/TCR proteins..")
write.table( proteinAns, proteinsTextFile, sep="\t", quote=F, row.names=F)
return( invisible( proteinAns))
}
} else {
if ( file.exists( proteinsTextFile)) {
proteinAns <- read.delim( proteinsTextFile, as.is=T)
NPROT <- nrow( proteinAns)
protFA <- loadFasta( proteinsFastaFile)
}
}
# step 4: call IgBlast
# at this point, we know which contigs are at least the correct length range, and perhap which look like example BCR/TCR,
# so we can narrow down the raw contigs to those most likely to be true BCR/TCR constructs
igblastOutputFile1 <- file.path( spades.output.path, "IgBlast.BCR.Output.txt")
igblastOutputFile2 <- file.path( spades.output.path, "IgBlast.TCR.Output.txt")
resultFile1 <- file.path( spades.output.path, paste( sampleID, "IgBlast.BCR.Final.Results.txt", sep="."))
resultFile2 <- file.path( spades.output.path, paste( sampleID, "IgBlast.TCR.Final.Results.txt", sep="."))
usableContigIDs <- pepFA$desc
if ( exists( "proteinAns") && NPROT) usableContigIDs <- protFA$desc
usableContigIDs <- sub( "_FR[1:6]$", "", usableContigIDs)
contigsFA <- loadFasta( contigsFastaFile)
keep <- which( contigsFA$desc %in% usableContigIDs)
if ( ! length(keep)) {
# small chance that the descriptor lines have different suffixes, strip down and retest
contigDesc <- sub( "(NODE_[0-9]+)(_.+)", "\\1", contigsFA$desc)
usableContigIDs <- sub( "(.+_)(NODE_[0-9]+)(_.+)", "\\2", usableContigIDs)
keep <- which( contigDesc %in% usableContigIDs)
}
if ( ! length(keep)) {
cat( "\n Warning: no contigs look like BCR/TCR sequences. Not calling IgBlast.")
file.delete( c( resultFile1, resultFile2))
return(NULL)
}
igblastFA <- as.Fasta( contigsFA$desc[keep], contigsFA$seq[keep])
igblastContigsFile <- file.path( spades.output.path, "IgBlast.Contigs.fasta")
writeFasta( igblastFA, igblastContigsFile, line=100)
# look for both TCR and BCR by calling the IgBlast tool
callIgBlast( igblastContigsFile, outfile=igblastOutputFile1, db="IG")
ansIG <- readIgBlastOutput( igblastOutputFile1)
ansIG <- subset( ansIG, V_SCORE >= min.v.score)
cat( "\n N_IG Constructs: ", nrow(ansIG))
callIgBlast( igblastContigsFile, outfile=igblastOutputFile2, db="TR")
ansTCR <- readIgBlastOutput( igblastOutputFile2)
ansTCR <- subset( ansTCR, V_SCORE >= min.v.score)
cat( "\n N_TCR Constructs: ", nrow(ansTCR))
# write out final results
write.table( ansIG, resultFile1, sep="\t", quote=F, row.names=F)
write.table( ansTCR, resultFile2, sep="\t", quote=F, row.names=F)
out <- c( "N_BCR"=nrow(ansIG), "N_TCR"=nrow(ansTCR))
return(out)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.