CHNOSZ: R/util.blast.R

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
# CHNOSZ/blast.R
# functions to analyze BLAST output files
# 20100320 jmd

## read a BLAST tabular output file, and filter by similarity,
## E-value and max hits per query
read.blast <- function(file, similarity=30, evalue=1e-5, max.hits=1, min.length=NA, quiet=FALSE) {
  # display some information about the file
  if("connection" %in% class(file)) fname <- summary(file)$description
  else fname <- basename(file)
  cat(paste("read.blast: reading", fname, "\n"))
  # read the blast tabular file
  blast <- read.csv(file,header=FALSE,sep="\t",stringsAsFactors=FALSE)
  if(!quiet) cat(paste("  read",nrow(blast),"lines with",length(unique(blast$V1)),"query sequences\n"))
  # take out rows that don't meet our desired similarity
  if(!is.na(similarity)) {
    is <- which(blast$V3 >= similarity)
    blast <- blast[is,]
    if(!quiet) cat(paste("  similarity filtering leaves",length(is),"lines and",length(unique(blast$V1)),"query sequences\n"))
  }
  # take out rows that don't meet our desired e-value
  if(!is.na(evalue)) {
    ie <- which(blast$V11 <= evalue)
    blast <- blast[ie,]
    if(!quiet) cat(paste("  evalue filtering leaves",length(ie),"lines and",length(unique(blast$V1)),"query sequences\n"))
  }
  # take out rows that don't have the minimum alignment length
  if(!is.na(min.length)) {
    ie <- which(blast$V4 >= min.length)
    blast <- blast[ie,]
    if(!quiet) cat(paste("  alignment length filtering leaves",length(ie),"lines and",length(unique(blast$V1)),"query sequences\n"))
  }
  # now take only max hits for each query sequence
  if(!is.na(max.hits)) {
    query.shift <- query <- blast$V1
    lq <- length(query)
    # for short (i.e., 1 query sequence) files, make sure that the hits get counted
    query.shift[max((lq-max.hits+1),1):lq] <- -1
    #query.shift <- query.shift[c((max.hits+1):lq,1:max.hits)]
    query.shift <- query.shift[c((lq-max.hits+1):lq,1:(lq-max.hits))]
    ib <- which(query!=query.shift)
    blast <- blast[ib,]
    if(!quiet) cat(paste("  max hits filtering leaves",length(ib),"lines and",length(unique(blast$V1)),"query sequences\n"))
  }
  # assign meaningful column names
  # http://bergmanlab.smith.man.ac.uk/?p=41, accessed 20111223
  colnames(blast) <- c("queryId", "subjectId", "percIdentity", "alnLength", "mismatchCount", 
    "gapOpenCount", "queryStart", "queryEnd", "subjectStart", "subjectEnd", "eVal", "bitScore")
  return(blast)
}

## process a blast table, identify the taxon
## for each hit
id.blast <- function(blast, gi.taxid, taxid.names, min.taxon=0, 
  min.query=0, min.phylum=0, take.first=TRUE) {
  # what are gi numbers of the hits
  # we use def2gi to extract just the gi numbers
  gi <- def2gi(blast[,2])
  # what taxid do they hit
  cat("id.blast: getting taxids ... ")
  imatch <- match(gi,gi.taxid[[1]])
  taxid <- gi.taxid[[2]][imatch]
  # what phyla are these
  cat("getting taxid.names ... ")
  iphy <- match(taxid,taxid.names$taxid)
  phylum <- taxid.names$phylum[iphy]
  species <- taxid.names$species[iphy]
  cat(paste(length(unique(phylum)),"unique phyla,",length(unique(taxid)),"unique taxa\n"))
  # now expand phyla into our blast table
  # we really don't want our stringsAsFactors (to use NAphylum, below)
  tax.out <- data.frame(taxid=taxid, phylum=as.character(phylum), species=species, 
    stringsAsFactors=FALSE)
  blast.out <- blast[,c(1,2,3,11)]
  colnames(blast.out) <- c("queryId","subjectId","percIdentity","eVal")
  blast <- cbind(blast.out,tax.out)
  # drop taxa that do not appear a certain number of times
  blast$taxid <- as.character(blast$taxid)
  nt <- table(blast$taxid)
  it <- which(nt/sum(nt) >= min.taxon)
  itt <- which(blast$taxid %in% names(nt)[it])
  blast <- blast[itt,]
  cat(paste("  min taxon abundance filtering leaves",length(unique(blast$queryId)),
    "query sequences,",length(unique(blast$phylum)),"phyla,",length(unique(blast$taxid)),"taxa\n"))
  # only take phylum assignments that make up at least a certain 
  # fraction ('amin') of hits to the query sequence
  uquery <- unique(blast$queryId)
  iquery <- match(uquery,blast$queryId)
  # function to select the (highest) represented phylum for each query
  iqfun <- function(i) {
    if((i-1)%%1000==0) cat(paste(i,""))
    myiq <- which(blast$query==uquery[i])
    # we don't have to do the calculation if there's only one hit
    if(length(myiq)==1) {
      iq <- iquery[i]
    } else {
      # take those hits, count each phyla, take the highest abundance,
      # check if it's above the minimum proportion of hits
      p <- as.character(blast$phylum[myiq])
      np <- table(p)
      pp <- np/sum(np)
      ip <- which.max(pp)
      if(pp[ip] < min.query) return(numeric())
      # use the first hit to that phylum
      myphy <- blast$phylum[myiq]
      iphy <- which(names(pp[ip])==myphy)
      iq <- myiq[iphy[1]]
    }
    return(iq)
  }
  # using lapply is tempting, but haven't got it working
  #iq <- as.numeric(lapply(1:length(uquery),iqfun))
  #iq <- iq[!is.na(iq)]
  if(min.query!=0) {
    cat("  filtering by phylum representation per query... ")
    iq <- numeric()
    for(i in 1:length(uquery)) iq <- c(iq,iqfun(i))
    cat("done!\n")
    blast <- blast[iq,]
    cat(paste("  min query representation filtering leaves",length(iq),"query sequences,",length(unique(blast$phylum)),
      "phyla,",length(unique(blast$taxid)),"taxa\n"))
  } else {
    # we'll just take the first hit for each query sequence
    if(take.first) blast <- blast[iquery,]
  }
  # now on to drop those phyla that are below a certain relative abundance
  # we count NA as a real phylum
  iNA <- which(is.na(blast$phylum))
  blast$phylum[iNA] <- "NAphylum"
  np <- table(blast$phylum)
  ip <- which(np/sum(np) >= min.phylum)
  ipp <- which(blast$phylum %in% names(np)[ip])
  blast <- blast[ipp,]
  cat(paste("  min phylum abundance filtering leaves",length(ipp),"query sequences,",length(unique(blast$phylum)),
    "phyla,",length(unique(blast$taxid)),"taxa\n"))
  return(blast)
} 

write.blast <- function(blast,outfile) {
  # using the output of read.blast
  # create a trimmed "blast format" output file with data only
  # in the following columns
  # 1 - query sequence ID
  # 2 - hit sequence ID (i.e., FASTA defline .. extract gi number)
  # 3 - similarity
  # 11 - evalue
  not <- rep(NA,length(blast[[1]]))
  not7 <- data.frame(not,not,not,not,not,not,not)
  c2 <- paste("gi",def2gi(blast[[2]]),sep="|")
  out <- data.frame(blast[[1]],c2,blast[[3]],not7,blast[[11]],not)
  write.table(out,outfile,sep="\t",col.names=FALSE,row.names=FALSE,quote=FALSE,na="")
  return(invisible(out))
}

def2gi <- function(def) {
  # extract gi numbers from FASTA deflines 20110131
  stuff <- strsplit(def,"\\|")
  gi <- sapply(1:length(stuff),function(x) {
    # the gi number should be in the 2nd position (after "gi")
    if(length(stuff[[x]])==1) return(stuff[[x]][1])
    else return(stuff[[x]][2])
  })
  return(gi)
}

Questions? Problems? Suggestions? or email at ian@mutexlabs.com.

Please suggest features or report bugs with the GitHub issue tracker.

All documentation is copyright its authors; we didn't write any of that.