R/fct_geneIDs.R

Defines functions check_IDs get_locus aggregate_splice_variants are_splice_variants

Documented in aggregate_splice_variants are_splice_variants check_IDs get_locus

#' Are splice variants
#' 
#' @description Detects whether or not the data contains splice variants
#' instead of unique loci. Returns TRUE as soon as at least one gene ID
#' ends with a splice variant suffix (a dot followed by digits, such as
#' ".1" or ".2"), and FALSE otherwise.
#'
#' @param gene_ids gene vector to be tested
#'
#' @return boolean, TRUE if at least one ID matches the regex for splice
#' variants. Missing IDs are ignored, and an empty input gives FALSE.
are_splice_variants <- function(gene_ids) {
  has_variant_suffix <- stringr::str_detect(gene_ids,
                                            pattern = "\\.[[:digit:]]+$")
  # na.rm = TRUE so that a missing ID cannot turn the answer into NA,
  # which would make `if (are_splice_variants(...))` fail in callers.
  any(has_variant_suffix, na.rm = TRUE)
}

#' Aggregate splice variants 
#' 
#' @description Merge all splice variants of an expression dataset into 
#' unique loci, unaware of alternative splicing, by summing
#' all variants of the same gene. The splice variant suffix is a dot
#' followed by digits at the end of the row name (".1", ".2", ...).
#'
#' @param data expression dataframe with splice variants as rownames
#' and samples as columns
#' @return dataframe with one row per locus (rows sorted by locus ID, as
#' produced by \code{aggregate}) and the same sample columns as the input.
#' If no row name carries a splice variant suffix, a message is emitted and
#' the input is returned unchanged.
#' @export
#' @examples 
#' data("abiotic_stresses")
#' head(aggregate_splice_variants(abiotic_stresses$normalized_counts))
aggregate_splice_variants <- function(data) {
  if (!are_splice_variants(rownames(data))) {
    # Nothing to merge: hand the data back instead of a printed string so
    # that `x <- aggregate_splice_variants(x)` never clobbers the dataset.
    message("The input data did not contain splice variants. ",
            "Returning it unchanged.")
    return(data)
  }

  data <- data.frame(data, check.names = FALSE)
  data$locus <- stringr::str_replace_all(rownames(data),
                                         pattern = "\\.[[:digit:]]+$",
                                         replacement = "")

  data_locus <- stats::aggregate(. ~ locus, data, sum)

  # aggregate() returns the groups sorted, not in order of first appearance,
  # so the row names must come from the aggregated grouping column itself.
  rownames(data_locus) <- as.character(data_locus$locus)

  # drop = FALSE keeps a dataframe (and its row names) even when there is
  # a single sample column.
  data_locus[, colnames(data_locus) != "locus", drop = FALSE]
}


#' Get gene IDs loci, without splicing information
#' 
#' @description Strips the splice variant suffix (a trailing dot followed
#' by digits, e.g. ".1" or ".2") from each gene ID, leaving the locus ID.
#' IDs without such a suffix are returned as they are.
#' @param gene_ids vector of gene ids with splice variants information
#' @param unique boolean, whether or not to remove duplicated loci from
#' the result. When FALSE, the result has one element per input ID.
#' @return character vector of gene IDs without alternative transcripts 
#' information
#' @export
#' @examples 
#' splice_variants <- rownames(abiotic_stresses$normalized_counts)[1:20]
#' get_locus(splice_variants)
get_locus <- function(gene_ids, unique = TRUE) {
  # The suffix is anchored at the end of the ID, so dots elsewhere in an
  # identifier are left alone.
  loci <- stringr::str_replace_all(
    gene_ids,
    pattern = "\\.[[:digit:]]+$",
    replacement = ""
  )

  # `unique` the argument shadows the function name here, hence base::.
  if (unique) base::unique(loci) else loci
}



#' Check compatibility between gene IDs and an organism
#'
#' @param ids character vector of gene identifiers to be tested
#' @param organism organism, should be one of "Arabidopsis thaliana",
#' "Homo sapiens", "Mus musculus", "Caenorhabditis elegans",
#' "Escherichia coli", "Drosophilia melanogaster" (the correct spelling
#' "Drosophila melanogaster" is accepted as well), "Lupinus albus",
#' "Oryza sativa (rapdb)", "Oryza sativa (msu)", "Oryza glaberrima".
#' Any other value raises an error.
#'
#' @return boolean, TRUE if all of the gene IDs match the expected regex for
#' the specified organism. Otherwise FALSE, with a warning telling whether
#' some or none of the IDs matched. Missing IDs count as not matching.
#' @export
#'
#' @examples 
#' data("abiotic_stresses")
#' check_IDs(rownames(abiotic_stresses$raw_counts),
#' organism = "Arabidopsis thaliana")
#' check_IDs(rownames(abiotic_stresses$raw_counts),
#' organism = "Homo sapiens")
check_IDs <- function(ids, organism) {
  organism <- as.character(organism)
  if (length(organism) != 1L || is.na(organism)) {
    stop("`organism` must be a single, non-missing organism name.",
         call. = FALSE)
  }

  pattern <- switch(
    organism,
    # to deny transposable elements :
    # "^AT[[:alnum:]]G[[:digit:]]{5}"
    "Arabidopsis thaliana" = "^AT[[:alnum:]][GTE]{1,2}[[:digit:]]{5}",
    "Homo sapiens" = "^ENSG[[:digit:]]{11}",
    "Mus musculus" = "^ENSMUSG[[:digit:]]{11}",
    "Lupinus albus" =
      "^Lalb_Chr[[:digit:]]{2}(c[[:digit:]]{2})?g[[:digit:]]{7}",
    # "Drosophilia" is the historical spelling callers pass; the empty
    # entry falls through so the correct spelling shares the same regex.
    "Drosophila melanogaster" = ,
    "Drosophilia melanogaster" = "^FBgn[[:digit:]]{7}",
    "Caenorhabditis elegans" = "^WBGene[[:digit:]]{8}",
    "Escherichia coli" = "^[a-z]{3}",
    "Oryza sativa (rapdb)" = "^Os[[:digit:]]{2}g[[:digit:]]{7}",
    "Oryza sativa (msu)" =
      "^LOC_Os[[:digit:]]{2}g[[:digit:]]{5}|fgenesh.gene",
    "Oryza glaberrima" = "^ORGLA[[:digit:]]{2}G[[:digit:]]{7}",
    # Default branch: without it `pattern` would be undefined and the
    # failure would surface as an obscure "object not found" error.
    stop("Unsupported organism: \"", organism, "\".", call. = FALSE)
  )

  # na.rm = TRUE: a missing ID is counted as a non-match rather than
  # propagating NA into the comparisons below.
  matched <- sum(stringr::str_detect(ids, pattern = pattern), na.rm = TRUE)

  if (matched == length(ids)) {
    return(TRUE)
  }

  if (matched > 0) {
    warning("Some of the gene IDs do not match the expected regex")
  } else {
    warning("None of the gene IDs match the expected regex")
  }
  FALSE
}
OceaneCsn/DIANE documentation built on Jan. 10, 2024, 6:43 p.m.