knitr::opts_chunk$set(echo = TRUE,
  dpi=150)

`%!in%` = Negate(`%in%`)
library(ggplot2)
library(magrittr)
library(stringr)
library(FascinRSCA)
df <- FascinRSCA::df_Q16658

Search for outliers

First, we looked at the statistical significance values:

summary(df$pvalue)

A continuación, eliminaremos las secuencias que sean fragmentos, las que han sido borradas y aquellas anotadas como Ubiquitin ligasa.

print("Before filtering using annotation")
dim(df)
avoid_words <- c('delete','fragment','ubiquitin')

df %<>%  dplyr::filter(dplyr::if_all(c('Protein names','Gene names',
                                       'Entry name'),
  ~str_to_lower(.) %>%
  tidyr::replace_na('') %>% # Para no eliminar las entradas con un NA en la anotación
  stringr::str_detect(paste(avoid_words,collapse =  '|'))%>% `!`))
print("After filtering using annotation")
dim(df)

Vamos a eliminar las secuencias redundates al 100% para la misma especie (secuencias repetidas).

df %<>% dplyr::distinct(Status, taxid, fasta_seq, .keep_all = TRUE)
print("After filtering duplicated sequences")
dim(df)
def_outliers <- vector()
df %<>% dplyr::mutate(ann = paste(fasta_header, species, sep = '\n')) 

All sequences

En primer lugar, observamos la distribución de las secuencias en base a su carga y longitud de secuencia.

df %>% tidyr::drop_na(phylum, charge)%>%
  dplyr::mutate(phylum = forcats::fct_lump_min(phylum, 10))%>%
  {outliers_boxplot_with_plotly(.$phylum, .$charge, .$ann, "Charge distribution across phyla in Fascin homologous sequences")}
df %>% tidyr::drop_na(phylum, residue_n)%>%
  dplyr::mutate(phylum = forcats::fct_lump_min(phylum, 10))%>%
  {outliers_boxplot_with_plotly(.$phylum, .$residue_n,
                    .$ann,"Length distribution across phyla in Fascin homologous sequences")}
outliers <- setNames(c("ENSNMEG00000009168.1|ENSNMEP00000009468.1",
              "A0A226PTP4_COLVI|A0A226PTP4",
              "ENSFALG00000012651.1|ENSFALP00000013204.1",
              "ENSVVUG00000019375.1|ENSVVUP00000026397.1",
              "ENSPTEG00000031612.1|ENSPTEP00000033001.1",
              "A0A2U3W8G7_ODORO|A0A2U3W8G7",
              "A0A401S4A7_CHIPU|A0A401S4A7",
              "A0A1W4WIZ5_AGRPL|A0A1W4WIZ5", 
              "A0A1W4WV15_AGRPL|A0A1W4WV15"),
              c("Helmeted guineafowl","Colinus virginianus", "Ficedula albicollis",
                "Red fox", "Ugandan red Colobus", "Odobenus rosmarus divergens", 
                "Chiloscyllium punctatum", "Agrilus planipennis", "Agrilus planipennis")) 

df %<>% subset(fasta_header %!in% outliers)
def_outliers %<>% append(outliers)

Chordata

Vamos a excluir todos los outliers por carga.

df %>% dplyr::filter(phylum == 'Chordata')%>%
  tidyr::drop_na(class, charge) %>%
  dplyr::mutate(class = forcats::fct_lump_min(class, 10))%>%
  {outliers_boxplot_with_plotly(.$class, .$charge, .$ann,
    "Charge distribution across Chordata in Fascin homologous sequences")}
df %>% dplyr::filter(phylum == 'Chordata')%>%
  tidyr::drop_na(class, residue_n) %>%
  dplyr::mutate(class = forcats::fct_lump_min(class, 10))%>%
  {outliers_boxplot_with_plotly(.$class, .$residue_n, .$ann,
    "Length distribution across Chordata in Fascin homologous sequences")}
outliers <- setNames(c("ENSPMJG00000017038.1|ENSPMJP00000023042.1", 
              "ENSTGUG00000007668.1|ENSTGUP00000007888.1",
              "ENSPVIG00000000772.1|ENSPVIP00000000835.1",
              "ENSECAG00000008056.2|ENSECAP00000030944.1",
              "ENSECAG00000008056.2|ENSECAP00000042721.1",
              "ENSECAG00000008056.2|ENSECAP00000039899.1",
              "G5BI79_HETGA|G5BI79",#
              "ENSSSCG00070015918.1|ENSSSCP00070026109.1",
              "L5L203_PTEAL|L5L203",
              "ENSMLEG00000031848.1|ENSMLEP00000015343.1",
              "ENSEBUG00000014215.1|ENSEBUP00000023069.1",
              "ENSCMIG00000018465.1|ENSCMIP00000044679.1"),
              c("Great Tit", "Taeniopygia guttata","Central bearded dragon",
                "Equus caballus", "Equus caballus", "Equus caballus", 
                "Heterocephalus glaber","Pig USMARC", "Pteropus alecto",
                "Mandrillus leucophaeus", "Hagfish", "Elephant shark")) 

df %<>% subset(fasta_header %!in% outliers) 
df %<>% dplyr::filter(class != "Actinopteri")
dim(df)
def_outliers %<>% append(outliers)

Mammalia

df %>% dplyr::filter(class == 'Mammalia')%>%
  tidyr::drop_na(order, charge) %>%
  dplyr::mutate(order = forcats::fct_lump_min(order, 5))%>%
  {outliers_boxplot_with_plotly(.$order, .$charge, .$ann,
    "Charge distribution across Mammalia in Fascin homologous sequences")}
df %>% dplyr::filter(class == 'Mammalia')%>%
  tidyr::drop_na(order, residue_n) %>%
  dplyr::mutate(order = forcats::fct_lump_min(order, 5))%>%
  {outliers_boxplot_with_plotly(.$order, .$residue_n, .$ann,
    "Length distribution across Chordata in Fascin homologous sequences")}
outliers <- setNames(c("ENSGGOG00000027785.2|ENSGGOP00000021898.2",
              "ENSSDAG00000013746.1|ENSSDAP00000015277.1", 
              "ENSSDAG00000013746.1|ENSSDAP00000015287.1", 
              "ENSOARG00000018489.1|ENSOARP00000019847.1", 
              "ENSSSCG00000026155.2|ENSSSCP00000057954.1", 
              "ENSPCAG00000000879.1|ENSPCAP00000000808.1", 
              "ENSSSCG00070011455.1|ENSSSCP00070018470.1", 
              "ENSPTRG00000041962.2|ENSPTRP00000061120.2"),
              c("Gorilla gorilla gorilla", "Daurian ground squirrel",
                "Daurian ground squirrel","Ovis aries", "Sus scrofa",
                "Hyrax", "Pig USMARC", "Pan troglodytes")) 
df %<>% subset(fasta_header %!in% outliers) 
dim(df)
def_outliers %<>% append(outliers)

Primates

df %>% dplyr::filter(order == 'Primates')%>%
  tidyr::drop_na(family, charge) %>%
  {outliers_boxplot_with_plotly(.$family, .$charge, .$ann)}
df %>% dplyr::filter(order == 'Primates')%>%
  tidyr::drop_na(family, residue_n) %>%
  {outliers_boxplot_with_plotly(.$family, .$residue_n, .$ann)}

Non chordata

df %>% dplyr::filter(phylum != 'Chordata')%>%
  tidyr::drop_na(class, charge) %>%
  dplyr::mutate(class = forcats::fct_lump_min(class, 10))%>%
  {outliers_boxplot_with_plotly(.$class, .$charge, .$ann,
    "Charge distribution across other phyla in Fascin homologous sequences")}
df %>% dplyr::filter(phylum != 'Chordata')%>%
  tidyr::drop_na(class, residue_n) %>%
  dplyr::mutate(class = forcats::fct_lump_min(class, 10))%>%
  {outliers_boxplot_with_plotly(.$class, .$residue_n, .$ann,
    "Length distribution across other phyla in Fascin homologous sequences")}
outliers <- setNames(c("A0A484AZ75_DRONA|A0A484AZ75",
              "A0A1B0FA94_GLOMM|A0A1B0FA94"),
              c("Drosophila navojoa",
                "Glossina morsitans morsitans")) 
df %<>% subset(fasta_header %!in% outliers) 
dim(df)
def_outliers %<>% append(outliers)

Disribución de carge y longitud de secuencia

Creamos una nueva variable con la anotación:

library(dplyr)
library(stringr)
df$annotation <- df %>%
  dplyr::filter(fasta_header %!in% def_outliers)%>%
  dplyr::mutate(`Gene names` = toupper(`Gene names`),
                `Protein names` = toupper(`Protein names`))%>%
  tidyr::replace_na(list(`Gene names` = "", `Protein names` = "unknown"))%>%
  dplyr::mutate(
    annotation = dplyr::case_when(
      str_detect(`Gene names`, 'FSCN2') & str_detect(`Protein names`, 'FASCIN.+2.+ISOFORM.+1') ~ "fascin2a",
      str_detect(`Gene names`, 'FSCN2') & str_detect(`Protein names`, 'FASCIN.+2.+ISOFORM.+2') ~ "fascin2b",      
      str_detect(`Gene names`, 'FSCN1') & str_detect(`Protein names`, 'FASCIN') ~ "fascin1",
      str_detect(`Gene names`, 'FSCN2') & str_detect(`Protein names`, 'FASCIN') ~ "fascin2",
      !str_detect(`Gene names`, 'FSCN\\d') & str_detect(`Protein names`, 'FASCIN') ~ "fascin",
      str_detect(`Gene names`, 'UNCHARACTERIZED') | str_detect(`Protein names`, 'UNCHARACTERIZED') ~ "uncharacterized",
      !str_detect(`Gene names`, 'FSCN\\d') & (str_detect(`Protein names`, "SINGED(?!.+LIKE)")| str_detect(`Protein names`, 'SN')) ~ "singed",
      TRUE                      ~  "unknown"
    )) %>%
  dplyr::pull(annotation)

Realizamos el gráfico:

df %>%
dplyr::group_by(phylum, order, annotation)%>%
  dplyr::summarise(charge = mean(charge), 
                   residue_n = mean(residue_n),
                   size = dplyr::n())%>%
 plotly::plot_ly(x = ~charge, y = ~residue_n, size = ~size,
                 type = 'scatter', mode = 'markers',color = ~phylum,
                 colors = 'Paired',#sizes = c(10, 50),
                 marker = list(opacity = 0.5, sizemode = 'diameter'),
                 hoverinfo = 'text',
                 text = ~paste('Annotation:', annotation, 'Order: <i>', order))%>%
plotly::layout(title = "Distribution of charge and sequence's length")
df %>% dplyr::mutate(Proteins_names_collapsed = forcats::fct_collapse(df$`Protein names`,
  fascin1 =  c("Fascin actin-bundling protein 1"),
  fascin2 = c("Fascin-2 (Retinal fascin)", "fascin-2 isoform X1",
              "Fascin actin-bundling protein 2, retinal",
              "Fascin-2", "fascin-2 isoform X2"),
  fascin = c("Fascin (55 kDa actin-bundling protein) (Singed-like protein) (p55)",
             "Fascin", "Fascin (Singed-like protein)",
             "Protein singed-like Protein", "GH24440","GM11990"),
  singed  = c("Protein singed","Singed","protein singed isoform X3", "sn", "protein singed", "Singed, isoform B (Singed, isoform E) (Singed, isoform G)", "Blast:Protein singed"),
  other = c("GH24440", "GM11990"),
  uncharacterized = c("Uncharacterized protein", "Uncharacterized protein, isoform A (Uncharacterized protein, isoform B)")))%>%
  dplyr::group_by(phylum, order, Proteins_names_collapsed)%>%
  dplyr::summarise(charge = mean(charge), 
                   residue_n = mean(residue_n),
                   size = dplyr::n())%>%
 plotly::plot_ly(x = ~charge, y = ~residue_n, size = ~size,
                 type = 'scatter', mode = 'markers',color = ~phylum,
                 colors = 'Paired',#sizes = c(10, 50),
                 marker = list(opacity = 0.5, sizemode = 'diameter'),
                 hoverinfo = 'text',
                 text = ~paste('Annotation:', Proteins_names_collapsed, 'Order: <i>', order))%>%
plotly::layout(title = "Distribution of charge and sequence's length")
print(def_outliers)
readr::write_lines(def_outliers, paste0("../data-raw/df_Q16658/",
                                        "outliers_fasta_headers.txt"))


currocam/FascinRSCA documentation built on March 21, 2022, 6:29 a.m.