R/read.gff.R

Defines functions read.gff

#' @export
read.gff <- function(file,  locus.tags=TRUE, nrows = -1  ){
  # code from Chris Stubben's genomes2 package modified by Jean Elbers
  # https://github.com/cstubben/genomes2/blob/master/R/read.gff.R
  # columns 1 and 2 are saved in metadata (full seqid with version# and source )
  #  columns 4 = start , 5 = end and 7 = strand are saved in GRange
  
  x <- read.delim(file, stringsAsFactors=FALSE, comment.char="#", header=FALSE, nrows=nrows)
  # use tags instead of attributes() function
  colnames(x) <- c("seqid", "source", "feature", "start", "end", "score", "strand", "phase", "tags")
  
  # sep 4 , 2014 - temp fix for new FTP site with "." in strand...
  n<- x$strand=="."
  if(any(n)){
    x$strand[n] <- "+"
    print("WARNING: changing '.' to '+' in strand column")
  }
  
  # SAVE OR remove version number?   - note strsplit2 in limma
    seqid  <- unique(x$seqid)
  # if( all(grepl("\\.[0-9]$", seqid )) )  x$seqid<- strsplit2(x$seqid, ".", fixed=TRUE)
  
  
  ## extra ID tag added for plotting 
  x$id <-  gsub("ID=([^;]*).*", "\\1", x$tags)   #  -always at start of tags?
  
  ## FULL table (except score and phase)
  if(!locus.tags){
    gff <- GRanges(seqnames=x$seqid, ranges=IRanges( x$start, x$end), strand=x$strand, data.frame(x[, c(10, 3, 6,8, 9 ) ])  )
    
  }else{
    
    # SPLIT - using GENE or locus_tag key?  
    n <- x$tags %like% '*locus_tag=*'
    genes <- subset(x, n)
    y <-  subset(x, !n)
    
    ##FIX: gffs from IMG with exit with error since no rows in y (all locus tags)
    
    # get Parent ids for matching
    y$parent<-NA
    n <- y$tags %like% '*Parent=*'
    y$parent[n] <- gsub(".*Parent=([^;]*).*", "\\1", y$tags[n])
    
    # get protein_ids (for matching to fasta and other files)   ## ADDED Dec 23, 2013
    y$pid <- ""
    n<- y$tags %like% '*protein_id=*'
    if(sum(n)>0) y$pid[n]  <-  gsub(".*protein_id=([^;.]*).*", "\\1", y$tags[n])
    
    # get Products 
    y$product <- ""
    n<- y$tags %like% '*product=*'
    if(sum(n)>0) y$product[n]  <-  gsub(".*product=([^;]*).*", "\\1", y$tags[n])
    
    ## add notes if missing product (for ncRNAs and  transposase fragments)
    n<- is.na(y$product) & y$tags %like% '*Note=*'
    y$product[n]  <-  gsub(".*Note=([^;]*).*", "\\1", y$tags[n])
    
    n   <- grep("%", y$product)
    if(length(n)>0) y$product[n] <- as.vector(sapply(y$product[n] , URLdecode))
    
    ## ADD locus tags  
    # Mar 2013 Mycoplasma has old_locus_tag, add semi-colon    ;locus_tag=MG_515;old_locus_tag=MG323.1
    genes$locus <- gsub(".*;locus_tag=([^;]*).*", "\\1", genes$tags)
    
    ## FIND protein coding and other genes types USING parent key
    n <- match(  genes$id, y$parent )  
    n2 <- !is.na(n) 
    genes$feature[n2] <-  y$feature[n[n2]]   # overwrite pseudo tags
    
    genes$description <- ""
    genes$description[n2] <-  y$product[n[n2]]
    genes$pid[n2]         <-  y$pid[n[n2]]
    
    ###   FIX pseudogenes
    genes$feature[ genes$tags %like% '*pseudo=true*'] <- "pseudo"
    
    ## GENES
    n <- which(genes$feature =="gene" )
    
    ## genes can match to region (other) and tRNAs
    if(length(n) > 0){
      n2 <- match(  paste(genes$start[n], genes$end[n]), paste(y$start, y$end)  )
      # check for matches. - and set type = other below
      n3 <- !is.na(n2) 
      
      genes$feature[n[n3]] <-  y$feature[n2[n3]]
      genes$description[n[n3]] <-  y$product[n2[n3]] 
    }
    
    ## Y pestis pCD1 has 'transcript'  = miscRNA and region = other AT NCBI  
    genes$feature [genes$feature == "transcript" ] <-  "miscRNA"
    genes$feature [genes$feature == "region" ] <-  "other"
    genes$feature [genes$feature == "gene" ] <-  "other"   
    
    
    ## get gene AND gene_synonym
    genes$gene <- ""
    n<-grep("gene=", genes$tag)
    if(length(n)>0)  genes$gene[n] <- gsub(".*gene=([^;]*).*", "\\1", genes$tags[n])
    
    n<-grep("gene_synonym=", genes$tag)
    if(length(n)>0)  genes$gene[n] <- paste( genes$gene[n] ,  gsub(".*gene_synonym=([^;]*).*", "\\1", genes$tags[n]) , sep=",")
    
    n   <- grep("%", genes$gene)
    # if no matches, avoid changing genes to list() 
    if(length(n)>0)  genes$gene[n] <- as.vector(sapply(genes$gene[n] , URLdecode))
    
    ## check duplicate tags (combined using JOINs)
    
    if( any(table(genes$locus)>1) ){
      # or create GRange and then reduce()
      print("Warning: grouping coordinates for duplicate locus tags") 
      genes2 <- by(genes, genes$locus, function(x){
        data.frame(locus=x$locus[1], seqid=x$seqid[1], start=min(x$start), end = max(x$end), strand= x$strand[1], 
                   pid=x$pid[1], feature=x$feature[1], description=x$description[1], gene=x$gene[1] ) 
      })
      genes <- do.call('rbind', genes2) 
      
    } 
    
    # re-order ?
    if( any(diff(order(genes$start))!=1) ){
      genes <-genes[order(genes$start),] 
    }
    gff <- GRanges(seqnames=genes$seqid, ranges=IRanges( genes$start, genes$end), strand=genes$strand, genes[, c("locus", "pid", "feature", "description", "gene") ]  )
  }
  ##  sequence length should be in first row (and max end) 
  ## gff may have multiple seq ids (then find if "source" tag present
  if(length(seqid)==1)  seqlengths(gff) <- max(x$end)
  # add defline for read.gff - parse from source tag???  - use full acc with version number
  metadata(gff) <- list(source=unique(x$source) , defline=seqid)
  gff
}
jelber2/rGO2TR documentation built on Feb. 22, 2024, 12:50 p.m.