R/gene_position.R

Defines functions gene_position

## Requirement: "tibble" & "stringr"

gene_position = function(position_gb){
  
  get_gene_position = function(data, search_pattern, type, name){
    
    orientation = list()
    
    position = lapply(data, function(x) grep(search_pattern, x$V1))
    
    index_gene = lapply(data, function(x) grep(type, x$V1))
    
    for(i in 1:length(index_gene)){ index_gene[[i]] = tail(index_gene[[i]][index_gene[[i]] < tail(position[[i]], 1)], 1)}
    
    for(i in 1:length(data)){ position[[i]] = data[[i]][index_gene[[i]],] }
    
    for(j in 1:length(data)){ 
      
      if(length(grep("complement", position[[j]])) != 0) orientation[j] = "complement"
      
      else orientation[j] = "normal" 
      
    }
    
    position = lapply(position, function(x) gsub("join\\(|,|complement\\(|\\)|>|<", "", x))
    
    position = unlist(lapply(position, function(x) ifelse(length(x) == 0, NA, x)))
    
    start_position = as.numeric(word(word(position, -1), 1, sep = fixed("..")))
    
    end_position = as.numeric(word(word(position, -1), -1, sep = fixed("..")))
    
    output = tibble(data.frame(start_position, end_position, unlist(orientation)))
    
    colnames(output) = paste(c("START", "END", "ORIENTATION"), name, sep = "_")
    
    return(output)
    
  }
  
  colnames(position_gb) = "V1"
  
  position_splited = split(position_gb, findInterval(1:nrow(position_gb), which(grepl("LOCUS", position_gb$V1)) + 1))[-1]
  
  position_12S = suppressWarnings(get_gene_position(position_splited, '12S ribosomal RNA"|12S ribosomal RNA subunit"|12 ribosomal RNA"|12S rivbosomal RNA"|small subunit ribosomal RNA"|s-RNA"|s-rRNA"|rrnS"|12S rRNA"|12SrRNA"|12S"|small ribosomal RNA|12 rRNA"', "rRNA |gene |mRNA ", "12S"))
  
  position_16S = suppressWarnings(get_gene_position(position_splited, '16S ribosomal RNA"|16S ribosomal RNA subunit"|16S rivbosomal RNA"|16 ribosomal RNA"|16S ribosamal RNA"|large subunit ribosomal RNA"|l-RNA"|l-rRNA"|rrnL"|16S rRNA"|16S"|large ribosomal RNA|l6S ribosomal RNA"|16 rRNA"', "rRNA |gene |mRNA ", "16S"))
  
  position_COI = suppressWarnings(get_gene_position(position_splited, 'oxidase subunit I"|ocidase subunit I"|oxidase subunit 1"|oxidase subunit-I"|oxidase subunit-1"|cox1"|CoxI"|oxidase I"|oxidase 1"|oxidase I;|COX1"|COI"|oxidase subunit idase subunit I"|oxidase subunits I"|oxidase subunits 1"|oxydase subunit 1"|oxydase subunit I"', "CDS |gene ", "COI"))
  
  position_CytB = suppressWarnings(get_gene_position(position_splited, 'ytochrome b"|ytochorome b"|ytochrome-b"|ytochrome-B"|ytohrome b"|ytchorome b"|CYTB"|ytochome b"|cytB"|cytochrome b;|Cyt b"|cytb"|Cytb"|Cyt b"', "CDS |gene ", "CytB"))
  
  accession = unlist(lapply(position_splited, function(x) x[which(grepl("VERSION", x$V1)), ]))
  
  accession = tibble(data.frame(ACCESSION = word(word(accession, -2, sep = fixed(".")), -1)))
  
  final_df = tibble(cbind(accession, position_12S, position_16S, position_COI, position_CytB))
  
  return(list(COMPLETE_POSITIONS = final_df[rowSums(is.na(final_df)) == 0, ],
              MISSING_POSITIONS = final_df[rowSums(is.na(final_df)) %in% c(1:7), ]))
  
}
Eliot-RUIZ/eDNAevaluation documentation built on Dec. 17, 2021, 6:25 p.m.