data-raw/df_Q16658_WithOutOutliers.R

## Code to prepare the `df_Q16658_WithOutOutliers` dataset.
library(magrittr)
# Negated `%in%`: TRUE for left-hand elements that are absent from the
# right-hand set.
`%!in%` <- Negate(`%in%`)

# FASTA headers flagged as outliers, read as character tokens.
# NOTE(review): `scan()` splits on any whitespace by default, so a header that
# contains spaces would be read as several separate tokens and never match
# `fasta_header` -- confirm the file holds whitespace-free headers.
outliers <- scan("data-raw/df_Q16658/outliers_fasta_headers.txt",
                 what = "character")

# Lower-case keywords; a row is discarded as soon as one of the inspected
# annotation columns contains any of them.
avoid_words <- c("delete", "fragment", "ubiquitin")
# Build the alternation regex once rather than once per inspected column.
avoid_pattern <- paste(avoid_words, collapse = "|")

df_Q16658_WithOutOutliers <- FascinRSCA::df_Q16658 %>%
  dplyr::filter(
    dplyr::if_all(
      c("Protein names", "Gene names", "Entry name"),
      ~ stringr::str_to_lower(.) %>%
        # Treat a missing annotation as empty text so that entries with an NA
        # in one of these columns are not dropped by the filter.
        tidyr::replace_na("") %>%
        stringr::str_detect(avoid_pattern) %>%
        `!`
    )
  ) %>%
  # One row per (taxid, fasta_seq) combination, keeping all other columns.
  dplyr::distinct(taxid, fasta_seq, .keep_all = TRUE) %>%
  # Finally remove every entry whose header is listed as an outlier.
  dplyr::filter(fasta_header %!in% outliers)

# Derive a coarse `annotation` label for every row. Matching is done on an
# upper-cased, NA-filled copy inside the pipeline; only the resulting label
# vector is written back, so the stored `Gene names` / `Protein names` columns
# keep their original values.
df_Q16658_WithOutOutliers$annotation <- df_Q16658_WithOutOutliers %>%
  # No row filtering in this pipeline: the pulled vector must have exactly one
  # value per row of the data frame it is assigned to, and the outliers were
  # already removed when the data frame was built.
  dplyr::mutate(
    `Gene names` = toupper(`Gene names`),
    `Protein names` = toupper(`Protein names`)
  ) %>%
  # Missing values would make `str_detect()` return NA; substitute text that
  # matches none of the gene patterns / falls through to "unknown".
  tidyr::replace_na(list(`Gene names` = "", `Protein names` = "unknown")) %>%
  dplyr::mutate(
    # Branches are evaluated top to bottom and the first match wins, so the
    # isoform-specific FSCN2 rules must stay ahead of the generic ones.
    annotation = dplyr::case_when(
      stringr::str_detect(`Gene names`, "FSCN2") &
        stringr::str_detect(`Protein names`, "FASCIN.+2.+ISOFORM.+1") ~
        "fascin2a",
      stringr::str_detect(`Gene names`, "FSCN2") &
        stringr::str_detect(`Protein names`, "FASCIN.+2.+ISOFORM.+2") ~
        "fascin2b",
      stringr::str_detect(`Gene names`, "FSCN1") &
        stringr::str_detect(`Protein names`, "FASCIN") ~
        "fascin1",
      stringr::str_detect(`Gene names`, "FSCN2") &
        stringr::str_detect(`Protein names`, "FASCIN") ~
        "fascin2",
      # Fascin protein name without a numbered FSCN gene.
      !stringr::str_detect(`Gene names`, "FSCN\\d") &
        stringr::str_detect(`Protein names`, "FASCIN") ~
        "fascin",
      stringr::str_detect(`Gene names`, "UNCHARACTERIZED") |
        stringr::str_detect(`Protein names`, "UNCHARACTERIZED") ~
        "uncharacterized",
      # "SINGED" not followed later by "LIKE", or a protein name containing
      # "SN". NOTE(review): "SN" matches any protein name with those two
      # letters anywhere -- confirm this breadth is intended.
      !stringr::str_detect(`Gene names`, "FSCN\\d") &
        (stringr::str_detect(`Protein names`, "SINGED(?!.+LIKE)") |
          stringr::str_detect(`Protein names`, "SN")) ~
        "singed",
      TRUE ~ "unknown"
    )
  ) %>%
  dplyr::pull(annotation)

# Store the prepared data frame as package data via usethis; `overwrite = TRUE`
# replaces any previously saved copy.
usethis::use_data(df_Q16658_WithOutOutliers, overwrite = TRUE)
currocam/FascinRSCA documentation built on March 21, 2022, 6:29 a.m.