knitr::opts_chunk$set(echo = TRUE,
  dpi=200)

The purpose of this notebook is to discriminate the homologous sequences to fascin that will be used for the SCA analysis and their reannotation.

# This is code to replicate the analysis and figures made to distinguish between
#FSCN1 and FSCN2 sequences. Code developed by currocam.
#load libraries
library(FascinRSCA)
library(tidyverse)
library(magrittr)
`%!in%` = Negate(`%in%`)
# Input file 
outliers <- scan("../data-raw/df_Q16658/outliers_fasta_headers.txt",
                 what = "character")
outliers %<>% append("ENSPTRG00000041962.2|ENSPTRP00000061120.2")

df <- FascinRSCA::df_Q16658_WithOutOutliers %>%
  filter(fasta_header%!in% outliers)%>%
  mutate(charge_seq_length=charge/residue_n,
         annotation_original = annotation,
         annotation_cll = fct_collapse(annotation,
          fascin1 = c("fascin1"),
          fascin2 = c("fascin2", "fascin2a", "fascin2b"),
          fascin = c("fascin"),
          others = c("uncharacterized", "singed", "unknown")),
         Status = replace_na(Status, 'unknown') %>% factor())
colour_values <- setNames(c('#999999','#E69F00', '#56B4E9', '#111111'),
                          c('Fascin','Fascin1', 'Fascin2', 'Others'))
status_shape <- set_names(c(3, 16, 17), c("NaN", "reviewed", "unreviewed"))

Taxonomic distribution

This is how our sequences are taxonomically distributed:

table_tax <- df %>%
      mutate(phylum= fct_lump(phylum, 15, other_level= "Others")) %>%
      mutate(class= fct_lump(class, 10, other_level= "Others")) %>%
      drop_na(phylum, class)   %>% 
      group_by(phylum, class,annotation) %>%
      summarise(n = n()) %>%
      arrange(phylum, class,annotation)

ggplot(table_tax, aes(fill=annotation, y=n, x=phylum)) + 
    geom_bar(position="stack", stat="identity")+
    theme(axis.text.x = element_text(face = "italic", size=13),
        axis.ticks.x=element_blank(),legend.text = element_text(size = 15),
        strip.text = element_text(face = "italic", size=13))+
  ggtitle("Taxonomically distributed sequences")+
  ylab("Number of sequences")+
    scale_x_discrete(guide = guide_axis(angle = 60))
ggsave(filename = "figures/FSCN1_FSCN2/taxonomic_distribution.pdf")
knitr::kable(table_tax,
 caption = "Taxonomically distributed sequences")
base <- df %>%
  filter(class %in% c('Mammalia' ))%>%
  arrange(desc(phylum, class, order, family, genus, species)) %>%
  mutate(order= fct_lump(order, 3, other_level= "Others")) %>%
  ggplot(aes(y=charge_seq_length, x=species, color=annotation_cll)) +
  geom_point(alpha=1, aes(shape=Status), size=3)+
  scale_shape_discrete(drop=TRUE,
        limits = levels(df$Status))+
  scale_colour_discrete(drop=TRUE,
        limits = levels(df$annotation_cll))+
  theme(axis.text.x = element_blank(),
        axis.ticks.x=element_blank(),legend.text = element_text(size = 15),
        strip.text = element_text(face = "italic", size=13))+
  ggtitle("Differences between sequences in  Mammalia's species")+
  ylab("Charge divided by sequence length")+
  xlab("Species")

base + facet_wrap(~order, ncol = 2, scales ="free_x")

Carnivora

There are no annotated sequences reviewed in Carnivora. The sequences are distributed according to the pattern discussed above. There are 2 FSCN1 sequences with an index higher than 0.01, belonging to Vulpes vulpes and Felis catus.

base <- df %>%
  filter(order %in% c('Carnivora' ))%>%
  arrange(desc(phylum, class, order, family, genus, species)) %>%
  ggplot(aes(y=charge_seq_length, x=species, color=annotation_cll)) +
  geom_point(alpha=1, aes(shape=Status), size=5)+
  theme(axis.text.x = element_text(face = "italic", size=13),
        axis.ticks.x=element_blank(),legend.text = element_text(size = 15),
        strip.text = element_text(face = "italic", size=13))+
  scale_shape_discrete(drop=TRUE,
        limits = levels(df$Status))+
  scale_colour_discrete(drop=TRUE,
        limits = levels(df$annotation_cll))+
  ggtitle("Differences between sequences in  Carnivora's species")+
  ylab("Charge divided by sequence length")+
  xlab("Species")+
  geom_hline(yintercept=0.010, linetype="dashed", color = "blue")+
    scale_x_discrete(guide = guide_axis(angle = 60))


base + facet_wrap(~suborder, ncol = 2, scales ="free_x")
library(tidyverse)
library(magrittr)
df %<>% mutate(annotation = case_when(
  fasta_header == "ENSNVIG00000005656.1|ENSNVIP00000007116.1" ~ "fascin1", #American mink
  fasta_header == "A0A2Y9JHA1_ENHLU|A0A2Y9JHA1" ~ "fascin1", #Enhydra lutris kenyoni
  fasta_header == "A0A2Y9L644_ENHLU|A0A2Y9L644" ~ "fascin2", #Enhydra lutris kenyoni
  fasta_header == "ENSCAFG00020017175.1|ENSCAFP00020021724.1" ~ "fascin1", #Dingo
  fasta_header == "ENSCAFG00020009227.1|ENSCAFP00020011452.1" ~ "fascin2", #Dingo
  fasta_header == "ENSVVUG00000006354.1|ENSVVUP00000008455.1" ~ "fascin1", #Red fox
  fasta_header == "ENSPPRG00000003969.1|ENSPPRP00000009547.1" ~ "fascin1", #Leopard
  fasta_header == "ENSPPRG00000024836.1|ENSPPRP00000000416.1" ~ "fascin2", #Leopard
  fasta_header == "A0A3Q7TUG0_URSAR|A0A3Q7TUG0" ~ "fascin2a", #Ursus arctos horribilis, anotado como fscn2b para mantener cohesión
  TRUE ~ as.character(annotation)
))

Primates

In primates it works better to set a slightly higher limit, 0.0115, to include sequences from the Aotidae and Cebidae families. There is one automatically annotated FSCN2 sequence in Callithrix jacchus with an index lower than FSCN1, it is an exception. In Otolemur garnettii sequences the pattern is followed but shifted upwards. There are only reviewed annotated sequences in Homo sapiens.

base <- df %>%
  filter(order %in% c('Primates'))%>%
  arrange(desc(phylum, class, order, family, genus, species)) %>%
  ggplot(aes(y=charge_seq_length, x=species, color=annotation_cll)) +
  geom_point(alpha=1, aes(shape=factor(Status)), size=4 )+
  theme(axis.text.x = element_text(face = "italic", size=13),
        axis.ticks.x=element_blank(),legend.text = element_text(size = 15),
        strip.text = element_text(face = "italic", size=13))+
  scale_shape_discrete(drop=TRUE,
        limits = levels(df$Status))+
  scale_colour_discrete(drop=TRUE,
        limits = levels(df$annotation_cll))+
  ggtitle("Differences between sequences in Primates")+
  ylab("Charge divided by sequence length")+
  xlab("Species")+
  geom_hline(yintercept=0.0115, linetype="dashed", color = "red")+
      scale_x_discrete(guide = guide_axis(angle = 60))
base + facet_wrap(~suborder, ncol = 2, scales ="free_x")
df %<>% mutate(annotation = case_when(
  fasta_header == "ENSCANG00000013485.1|ENSCANP00000003877.1" ~ "fascin1", #Colobus angolensis palliatus
  fasta_header == "ENSTGEG00000001910.1|ENSTGEP00000002162.1" ~ "fascin1", #Gelada
  fasta_header == "ENSTGEG00000009852.1|ENSTGEP00000012087.1" ~ "fascin2", #Gelada
  fasta_header == "ENSMFAG00000037869.1|ENSMFAP00000037244.1" ~ "fascin2", #Macaca fascicularis
  fasta_header == "ENSMMUG00000001583.3|ENSMMUP00000002112.3" ~ "fascin2", #Macaca mulatta
  fasta_header == "ENSRBIG00000007902.1|ENSRBIP00000001562.1" ~ "fascin2", #Rhinopithecus bieti
  fasta_header == "ENSRROG00000034748.1|ENSRROP00000021684.1" ~ "fascin2", #Rhinopithecus roxellana
  fasta_header == "ENSPTEG00000031612.1|ENSPTEP00000033004.1" ~ "fascin2", #Ugandan red Colobus
  fasta_header == "ENSPSMG00000010879.1|ENSPSMP00000015242.1" ~ "fascin1", #Greater bamboo lemur
  fasta_header == "ENSMICG00000010211.3|ENSMICP00000009297.2" ~ "fascin1", #Mouse lemur
  fasta_header == "ENSMICG00000031762.2|ENSMICP00000018701.2" ~ "fascin2", #Mouse lemur
  TRUE ~ as.character(annotation)
))

Rodentia

Most of the Rodentia sequences follow the pattern, however there are more sequences that deviate from it. All annotated sequences reviewed follow the pattern.

base <- df %>%
  filter(order %in% c('Rodentia'))%>%
  arrange(desc(phylum, class, order, family, genus, species)) %>%
  ggplot(aes(y=charge_seq_length, x=species, color=annotation_cll)) +
  geom_point(alpha=1, aes(shape=factor(Status)), size=4 )+
  theme(axis.text.x = element_text(face = "italic", size=13),
        axis.ticks.x=element_blank(),legend.text = element_text(size = 15),
        strip.text = element_text(face = "italic", size=13))+
  scale_shape_discrete(drop=TRUE,
        limits = levels(df$Status))+
  scale_colour_discrete(drop=TRUE,
        limits = levels(df$annotation_cll))+
  ggtitle("Differences between sequences in Rodentia")+
  ylab("Charge divided by sequence length")+
  xlab("Species")+
  geom_hline(yintercept=0.01, linetype="dashed", color = "blue")+
      scale_x_discrete(guide = guide_axis(angle = 60))
base + facet_wrap(~suborder+family, ncol = 6, scales ="free_x")
df %<>% mutate(annotation = case_when(
  fasta_header == "ENSCCNG00000009421.1|ENSCCNP00000008921.1" ~ "fascin2", #American beaver
  fasta_header == "ENSFDAG00000016895.1|ENSFDAP00000008686.1" ~ "fascin", #Damara mole rat
  fasta_header == "ENSHGLG00000012271.1|ENSHGLP00000017138.1" ~ "fascin2", #Naked mole-rat male Como está anotado
  fasta_header == "ENSHGLG00100005772.1|ENSHGLP00100007730.1" ~ "fascin2", #Naked mole-rat male Como está anotado
  fasta_header == "ENSCLAG00000013884.1|ENSCLAP00000020263.1" ~ "fascin1", #Long-tailed chinchilla
  fasta_header == "ENSCLAG00000014531.1|ENSCLAP00000021213.1" ~ "fascin2", #Long-tailed chinchilla
  fasta_header == "ENSODEG00000012533.1|ENSODEP00000017010.1" ~ "fascin1", #Degu
  fasta_header == "ENSODEG00000008403.1|ENSODEP00000011308.1" ~ "fascin2", #Degu
  fasta_header == "ENSCGRG00015000357.1|ENSCGRP00015000341.1" ~ "fascin2", #Chinese hamster CHOK1GS
  fasta_header == "ENSCGRG00000018194.1|ENSCGRP00000025123.1" ~ "fascin2", #Chinese hamster CriGri
  fasta_header == "ENSCGRG00015013911.1|ENSCGRP00015018116.1" ~ "fascin1", #Cricetulus griseus
  fasta_header == "A0A1A6HKX6_NEOLE|A0A1A6HKX6" ~ "fascin1", #Neotoma lepida
  fasta_header == "A0A1A6HXR6_NEOLE|A0A1A6HXR6" ~ "fascin2", #Neotoma lepida
  fasta_header == "ENSPEMG00000024060.2|ENSPEMP00000028558.1" ~ "fascin1", #Northern American deer mouse
  fasta_header == "ENSPEMG00000028380.1|ENSPEMP00000031506.1" ~ "fascin2", #Northern American deer mouse
  fasta_header == "ENSMOCG00000001115.1|ENSMOCP00000001465.1" ~ "fascin1", #Prairie vole
  fasta_header == "ENSMOCG00000018078.1|ENSMOCP00000020377.1" ~ "fascin2", #Prairie vole
  fasta_header == "ENSJJAG00000019494.1|ENSJJAP00000018145.1" ~ "fascin1", #Lesser Egyptian jerboa
  fasta_header == "ENSJJAG00000015122.1|ENSJJAP00000011928.1" ~ "fascin2", #Lesser Egyptian jerboa
  fasta_header == "MGP_SPRETEiJ_G0028973.1|MGP_SPRETEiJ_P0074768" ~ "fascin1", #Algerian mouse
  fasta_header == "MGP_SPRETEiJ_G0018357.1|MGP_SPRETEiJ_P0032954" ~ "fascin2", #Algerian mouse
  fasta_header == "ENSMUGG00000013467.1|ENSMUGP00000015650.1" ~ "fascin1", #Mongolian gerbil
  fasta_header == "ENSMUGG00000001646.1|ENSMUGP00000001760.1" ~ "fascin2", #Mongolian gerbil
  fasta_header == "MGP_CASTEiJ_G0029422.1|MGP_CASTEiJ_P0075825" ~ "fascin1", #Mouse CAST/EiJ
  fasta_header == "MGP_CASTEiJ_G0018799.1|MGP_CASTEiJ_P0033632" ~ "fascin2", #Mouse CAST/EiJ
  fasta_header == "MGP_DBA2J_G0030137.1|MGP_DBA2J_P0075380" ~ "fascin1", #Mouse DBA/2J
  fasta_header == "MGP_DBA2J_G0019328.1|MGP_DBA2J_P0033800" ~ "fascin2", #Mouse DBA/2J
  fasta_header == "MGP_FVBNJ_G0019320.1|MGP_FVBNJ_P0033798" ~ "fascin2", #Mouse FVB/NJ
  fasta_header == "MGP_PWKPhJ_G0029137.1|MGP_PWKPhJ_P0075276" ~ "fascin1", #Mouse PWK/PhJ
  fasta_header == "MGP_PWKPhJ_G0018565.1|MGP_PWKPhJ_P0033324" ~ "fascin2", #Mouse PWK/PhJ
  fasta_header == "MGP_WSBEiJ_G0029496.1|MGP_WSBEiJ_P0074413" ~ "fascin1", #Mouse WSB/EiJ
  fasta_header == "MGP_WSBEiJ_G0018851.1|MGP_WSBEiJ_P0033219" ~ "fascin2", #Mouse WSB/EiJ
  fasta_header == "MGP_CAROLIEiJ_G0027969.1|MGP_CAROLIEiJ_P0071948" ~ "fascin1", #Ryukyu mouse
  fasta_header == "MGP_CAROLIEiJ_G0017513.1|MGP_CAROLIEiJ_P0031506" ~ "fascin2", #Ryukyu mouse
  fasta_header == "MGP_PahariEiJ_G0024541.1|MGP_PahariEiJ_P0060161" ~ "fascin1", #Shrew mouse
  fasta_header == "MGP_PahariEiJ_G0018639.1|MGP_PahariEiJ_P0038222" ~ "fascin2", #Shrew mouse
  fasta_header == "ENSMSIG00000029517.1|ENSMSIP00000035482.1" ~ "fascin1", #Steppe mouse
  fasta_header == "ENSMSIG00000014526.1|ENSMSIP00000016977.1" ~ "fascin2", #Steppe mouse
  fasta_header == "ENSNGAG00000003408.1|ENSNGAP00000003153.1" ~ "fascin1", #Upper Galilee mountains blind mole rat
  fasta_header == "ENSMMMG00000006216.1|ENSMMMP00000006939.1" ~ "fascin2", #Alpine marmot
  fasta_header == "ENSUPAG00010011357.1|ENSUPAP00010014127.1" ~ "fascin1", #Arctic ground squirrel
  fasta_header == "ENSUPAG00010019602.1|ENSUPAP00010024748.1" ~ "fascin2", #Arctic ground squirrel
  fasta_header == "ENSSTOG00000020923.2|ENSSTOP00000021848.2" ~ "fascin1", #Ictidomys tridecemlineatus
  fasta_header == "ENSSTOG00000028223.2|ENSSTOP00000023251.1" ~ "fascin2", #Ictidomys tridecemlineatus
  fasta_header == "ENSCAPG00000016296.1|ENSCAPP00000016394.1" ~ "fascin2", #Brazilian guinea pig
  TRUE ~ as.character(annotation)
))

Artiodactyla

base <- df %>%
  filter(order %in% c('Artiodactyla'))%>%
  arrange(desc(phylum, class, order, family, genus, species)) %>%
  ggplot(aes(y=charge_seq_length, x=species, color=annotation_cll)) +
  geom_point(alpha=1, aes(shape=factor(Status)), size=4 )+
  theme(axis.text.x = element_text(face = "italic", size=13),
        axis.ticks.x=element_blank(),legend.text = element_text(size = 15),
        strip.text = element_text(face = "italic", size=13))+
  scale_shape_discrete(drop=TRUE,
        limits = levels(df$Status))+
  scale_colour_discrete(drop=TRUE,
        limits = levels(df$annotation_cll))+
  ggtitle("Differences between sequences in Artiodactyla order")+
  ylab("Charge divided by sequence length")+
  xlab("Species")+
  geom_hline(yintercept=0.009, linetype="dashed", color = "green")+
  geom_hline(yintercept=0.01, linetype="dashed", color = "blue")+
      scale_x_discrete(guide = guide_axis(angle = 60))
base + facet_wrap(~order+suborder, ncol = 2, scales ="free_x")
df %<>% mutate(annotation = case_when(
  fasta_header == "ENSBBBG00000009842.1|ENSBBBP00000017441.1" ~ "fascin1", #American bison
  fasta_header == "ENSBMUG00000021206.1|ENSBMUP00000028298.1" ~ "fascin1", #Wild yak
  fasta_header == "ENSTTRG00000006435.1|ENSTTRP00000006091.1" ~ "fascin2", #Dolphin
  fasta_header == "ENSMFAG00000037869.1|ENSMFAP00000037244.1" ~ "fascin2", #Macaca fascicularis
  fasta_header == "ENSMMUG00000001583.3|ENSMMUP00000002112.3" ~ "fascin2", #Macaca mulatta
  fasta_header == "ENSRBIG00000007902.1|ENSRBIP00000001562.1" ~ "fascin2", #Rhinopithecus bieti
  fasta_header == "ENSRROG00000034748.1|ENSRROP00000021684.1" ~ "fascin2", #Rhinopithecus roxellana
  fasta_header == "ENSPTEG00000031612.1|ENSPTEP00000033004.1" ~ "fascin2", #Ugandan red Colobus
  fasta_header == "ENSPSMG00000010879.1|ENSPSMP00000015242.1" ~ "fascin1", #Greater bamboo lemur
  fasta_header == "ENSMICG00000010211.3|ENSMICP00000009297.2" ~ "fascin1", #Mouse lemur
  fasta_header == "ENSMICG00000031762.2|ENSMICP00000018701.2" ~ "fascin2", #Mouse lemur
  TRUE ~ as.character(annotation)
))

Other orders in Mammalia

base <- df %>%
  filter(class %in% c('Mammalia'))%>%
  filter(order %!in% c('Primates', 'Rodentia', 'Carnivora', 'Artiodactyla'))%>%
  arrange(desc(phylum, class, order, family, genus, species)) %>%
  ggplot(aes(y=charge_seq_length, x=species, color=annotation_cll)) +
  geom_point(alpha=1, aes(shape=factor(Status)), size=4 )+
  theme(axis.text.x = element_text(face = "italic", size=13),
        axis.ticks.x=element_blank(),legend.text = element_text(size = 15),
        strip.text = element_text(face = "italic", size=13))+
  scale_shape_discrete(drop=TRUE,
        limits = levels(df$Status))+
  scale_colour_discrete(drop=TRUE,
        limits = levels(df$annotation_cll))+
  ggtitle("Differences between sequences in other orders in Mammalia")+
  ylab("Charge divided by sequence length")+
  xlab("Species")+
#  geom_hline(yintercept=0.009, linetype="dashed", color = "green")+
  geom_hline(yintercept=0.01, linetype="dashed", color = "blue")+
      scale_x_discrete(guide = guide_axis(angle = 60))
base + facet_wrap(~order+suborder, ncol = 4, scales ="free_x")
df %<>% mutate(annotation = case_when(
  fasta_header == "ENSPVAG00000017295.1|ENSPVAP00000016319.1" ~ "fascin1", #Megabat 
  fasta_header == "ENSPCIG00000015239.2|ENSPCIP00000019991.1" ~ "fascin1", #Koala
  fasta_header == "ENSPCIG00000029609.2|ENSPCIP00000040449.1" ~ "fascin2", #Koala
  fasta_header == "ENSMEUG00000013398.1|ENSMEUP00000012221.1" ~ "fascin2", #Wallaby
  fasta_header == "ENSEEUG00000002694.1|ENSEEUP00000002460.1" ~ "fascin2", #Hedgehog
  fasta_header == "ENSEASG00005000015.1|ENSEASP00005000018.1" ~ "fascin1", #Donkey
  fasta_header == "ENSEASG00005005735.1|ENSEASP00005007879.1" ~ "fascin2", #Donkey
  fasta_header == "A0A2Y9DGX1_TRIMA|A0A2Y9DGX1" ~ "fascin2", #Trichechus manatus latirostris
  TRUE ~ as.character(annotation)))

Other classes in Chordata

Aves

In Aves the pattern just works in Galliformes.

base <- df %>%
  filter(class %in% c('Aves'))%>%
  arrange(desc(phylum, class, order, family, genus, species)) %>%
  mutate(order= fct_lump(order, 2, other_level= "Other orders")) %>%
  ggplot(aes(y=charge_seq_length, x=species, color=annotation_cll)) +
  geom_point(alpha=1, aes(shape=Status), size=4 )+
  theme(axis.text.x = element_text(face = "italic", size=13),
        axis.ticks.x=element_blank(),legend.text = element_text(size = 15),
        strip.text = element_text(face = "italic", size=13))+
  scale_shape_discrete(drop=TRUE,
        limits = levels(df$Status))+
  scale_colour_discrete(drop=TRUE,
        limits = levels(df$annotation_cll))+
  ggtitle("Differences between sequences in Aves")+
  ylab("Charge divided by sequence length")+
  xlab("Species")+
  geom_hline(yintercept=0.007, linetype="dashed", color = "green")+
      scale_x_discrete(guide = guide_axis(angle = 60))
base + facet_wrap(~order, ncol = 1, scales ="free_x")

En el caso de los Passeriformes nos encontramos que hay un patrón similar, pero la anotación está al revés. Entrando en la url, encontramos que "The sequence shown here is derived from an EMBL/GenBank/DDBJ whole genome shotgun (WGS) entry which is preliminary data". En ese sentido, comparando la interacción por string, nos parece razonable que el patrón sea al revés.

Amazona aestiva tiene muy poca diferencia, por lo que lo dejaremos como fascina.

df %<>% mutate(annotation = case_when(
  fasta_header == "A0A226NJP1_CALSU|A0A226NJP1" ~ "fascin1", #Callipepla squamata
  fasta_header == "A0A226MR24_CALSU|A0A226MR24" ~ "fascin2", #Callipepla squamata
  fasta_header == "A0A226PLS9_COLVI|A0A226PLS9" ~ "fascin2", #Colinus virginianus
  fasta_header == "ENSNMEG00000003221.1|ENSNMEP00000002566.1" ~ "fascin2", #Helmeted guineafowl
  fasta_header == "ENSCJPG00005000395.1|ENSCJPP00005000313.1" ~ "fascin1", #Japanese quail
  fasta_header == "ENSCJPG00005021245.1|ENSCJPP00005027399.1" ~ "fascin2", #Japanese quail
  # Passeriformes
  fasta_header == "ENSLCOG00000010881.1|ENSLCOP00000015232.1" ~ "fascin2", #Blue-crowned manakin
  fasta_header == "ENSLCOG00000000385.1|ENSLCOP00000000419.1" ~ "fascin1", #Blue-crowned manakin
#  fasta_header == "A0A3L8S262_CHLGU|A0A3L8S262" ~ "fascin2", #Chloebia gouldiae
  fasta_header == "A0A3L8SG84_CHLGU|A0A3L8SG84" ~ "fascin1", #Chloebia gouldiae
  fasta_header == "ENSJHYG00000006117.1|ENSJHYP00000007676.1" ~ "fascin1", #Dark-eyed junco
  fasta_header == "ENSPMJG00000017038.1|ENSPMJP00000023043.1" ~ "fascin2", #Great Tit
  fasta_header == "ENSPMJG00000017401.1|ENSPMJP00000023569.1" ~ "fascin1", #Great Tit
#  fasta_header == "A0A3M0J8U2_HIRRU|A0A3M0J8U2" ~ "fascin2", #Hirundo rustica rustica
  fasta_header == "A0A3M0KY54_HIRRU|A0A3M0KY54" ~ "fascin1", #Hirundo rustica rustica
  ##
  fasta_header == "A0A0Q3XBK7_AMAAE|A0A0Q3XBK7" ~ "fascin1", #Amazona aestiva
  fasta_header == "A0A0Q3XBK7_AMAAE|A0A0Q3XBK7" ~ "fascin2", #Amazona aestiva
  fasta_header == "ENSNPEG00000000307.1|ENSNPEP00000000338.1" ~ "fascin1", #Chilean tinamou
  fasta_header == "ENSNPEG00000010488.1|ENSNPEP00000014026.1" ~ "fascin2", #Chilean tinamou
  fasta_header == "ENSDNVG00000009693.1|ENSDNVP00000013742.1" ~ "fascin2", #Emu
  fasta_header == "ENSARWG00000006036.1|ENSARWP00000008623.1" ~ "fascin2", #Okarito brown kiwi
  fasta_header == "ENSABRG00000010882.1|ENSABRP00000012102.1" ~ "fascin2", #Pink-footed goose
  fasta_header == "ENSCPUG00000010544.1|ENSCPUP00000015119.1" ~ "fascin1", #Ruff
  fasta_header == "ENSCPGG00000016204.1|ENSCPGP00000023396.1" ~ "fascin2", #Spoon-billed sandpiper
  TRUE ~ as.character(annotation)))

Rest of Chordata

base <- df %>%
  filter(phylum %in% c('Chordata'))%>%
  filter(class %!in% c('Mammalia', 'Actinopteri', 'Aves'))%>%
  arrange(desc(phylum, class, order, family, genus, species)) %>%
#  mutate(family= fct_lump(family, 3, other_level= "Others families")) %>%
  ggplot(aes(y=charge_seq_length, x=species, color=annotation_cll)) +
  geom_point(alpha=1, aes(shape=Status), size=4 )+
  theme(axis.text.x = element_text(face = "italic", size=13),
        axis.ticks.x=element_blank(),legend.text = element_text(size = 15),
        strip.text = element_text(face = "italic", size=13))+
  scale_shape_discrete(drop=TRUE,
        limits = levels(df$Status))+
  scale_colour_discrete(drop=TRUE,
        limits = levels(df$annotation_cll))+
  ggtitle("Differences between sequences in other Chordata's clasess")+
  ylab("Charge divided by sequence length")+
  xlab("Species")+
      scale_x_discrete(guide = guide_axis(angle = 60))
base + facet_wrap(~class, ncol = 1, scales ="free_x")
df %<>% mutate(annotation = case_when(
  #fasta_header == "A0A1L8ETI6_XENLA|A0A1L8ETI6" ~ "fascin2", #Xenopus laevis
  #fasta_header == "FASC_XENLA|Q91837" ~ "fascin2", #Xenopus laevis
  fasta_header == "A0A401P713_SCYTO|A0A401P713" ~ "fascin1", #Scyliorhinus torazame
  fasta_header == "A0A401PG70_SCYTO|A0A401PG70" ~ "fascin2", #Scyliorhinus torazame
  fasta_header == "ENSSMRG00000020250.1|ENSSMRP00000026211.1" ~ "fascin1", #Argentine black and white tegu
  fasta_header == "ENSSMRG00000004642.1|ENSSMRP00000005740.1" ~ "fascin2", #Argentine black and white tegu
  fasta_header == "ENSPVIG00000008238.1|ENSPVIP00000010398.1" ~ "fascin1", #Central bearded dragon
  fasta_header == "ENSPVIG00000000772.1|ENSPVIP00000000848.1" ~ "fascin", #Central bearded dragon anotado como fascin-like
  #Poca diferencia, lo pondría en duda, pero viene anotado así
  fasta_header == "ENSNSUG00000014196.1|ENSNSUP00000019414.1" ~ "fascin", #Mainland tiger snake 
  fasta_header == "ENSNSUG00000009748.1|ENSNSUP00000013015.1" ~ "fascin", #Mainland tiger snake
  fasta_header == "ENSSPUG00000002887.1|ENSSPUP00000003759.1" ~ "fascin1", #Tuatara
  fasta_header == "ENSSPUG00000014373.1|ENSSPUP00000018627.1" ~ "fascin2", #Tuatara
  fasta_header == "ENSCABG00000005872.1|ENSCABP00000007792.1" ~ "fascin1", #Abingdon island giant tortoise
  fasta_header == "ENSCABG00000012992.1|ENSCABP00000017491.1" ~ "fascin2", #Abingdon island giant tortoise
  fasta_header == "A0A1U7SFR2_ALLSI|A0A1U7SFR2" ~ "fascin2", #Alligator sinensis
  species == "Australian saltwater crocodile" ~ "fascin", # Diferencia de dos aminoácidos
  fasta_header == "ENSCPBG00000020810.1|ENSCPBP00000029528.1" ~ "fascin1", #Painted turtle
  fasta_header == "ENSCPBG00000010404.1|ENSCPBP00000013973.1" ~ "fascin2", #Painted turtle
  TRUE ~ as.character(annotation)))

Arthropoda, Mollusca and others

base <- df %>%
  filter(phylum %!in% c('Chordata'))%>%
  drop_na(class)%>%
  arrange(desc(phylum, class, order, family, genus, species)) %>%
  ggplot(aes(y=charge_seq_length, x=species, color=annotation_cll)) +
  geom_point(alpha=1, aes(shape=Status), size=4 )+
  theme(axis.text.x = element_text(face = "italic", size=13),
        axis.ticks.x=element_blank(),legend.text = element_text(size = 15),
        strip.text = element_text(face = "italic", size=13))+
  scale_shape_discrete(drop=TRUE,
        limits = levels(df$Status))+
  scale_colour_discrete(drop=TRUE,
        limits = levels(df$annotation_cll))+
  ggtitle("Differences between sequences in other phyla")+
  ylab("Charge divided by sequence length")+
  xlab("Species")+
      scale_x_discrete(guide = guide_axis(angle = 60))
base + facet_wrap(~phylum, ncol = 2, scales ="free_x") 
df %<>% mutate(annotation = case_when(
  fasta_header == "D6WYE3_TRICA|D6WYE3" ~ "fascin", #Tribolium castaneum
  fasta_header == "B4JLX1_DROGR|B4JLX1" ~ "fascin", #Drosophila grimshawi
  fasta_header == "B4IL65_DROSE|B4IL65" ~ "fascin", #Drosophila grimshawi 
  TRUE ~ as.character(annotation)))

Final dataset

freezing <- df %>% filter(class %!in% c('Actinopteri', 'Chondrichthyes')) %>%
  select(-c(B, Z, charge_seq_length, annotation_cll)) #Problem with B & Z in EMBOSS Pepstats
freezing$reannotated <- factor(ifelse(freezing$annotation != freezing$annotation_original, "Yes", "No"))
library(hrbrthemes)
library(viridis)
library(FascinRSCA)
freezing %>%
  group_by(class, annotation)%>%
  summarise(n = n(),
            charge = mean (charge),
            residue_n = mean(residue_n), .groups = "drop")%>%
  ggplot(aes(x=charge, y=residue_n, size=n, fill=class)) +
    geom_point(alpha=0.5, shape=21, color="black") +
    scale_size(range = c(.1, 24), name="Number of sequences") +
    #scale_fill_viridis(discrete=TRUE, guide="none", option="A") +
    theme_ipsum() +
    ylab("Charge") +
    xlab("Number of residues") 

ggsave(filename = "figures/FSCN1_FSCN2/bubble_plot.tiff", dpi = 400)
df %>%
  mutate(class= fct_lump(class, 7, other_level= "Others")) %>%
  filter(class != "Others")%>%
  ggplot(aes(x=class, y=charge/residue_n)) +
  geom_violin(show.legend = FALSE, adjust = 0.5)+
  geom_dotplot(aes(fill = as.factor(annotation)), binaxis = "y",
               binwidth = 0.0005,
               stackdir = "center") +
  theme(axis.text.x = element_text(face = "italic", size=13),
      axis.ticks.x=element_blank(),legend.text = element_text(size = 15),
      strip.text = element_text(face = "italic", size=13))+
  scale_x_discrete(guide = guide_axis(angle = 60))
ggsave(filename = "figures/FSCN1_FSCN2/violin_plot.pdf", dpi = 300)
write.csv(freezing, paste("../data", 'freezing_fascin.csv', sep="/"), row.names = TRUE)
usethis::use_data(freezing, overwrite = TRUE)
freezing %>% count(reannotated)
human <- subset(freezing, species == 'Homo sapiens')
freezingNonRedundant <- rbind(human, freezing) %>% distinct(fasta_seq, .keep_all = TRUE)
write.csv(freezingNonRedundant, paste("../data", 'freezingNonRedundant_fascin.csv', sep="/"), row.names = TRUE)
usethis::use_data(freezingNonRedundant, overwrite = TRUE)

write.csv(df %>% filter(class %in% c('Actinopteri', 'Chondrichthyes')), 
  paste("../data", 'peces_fascin.csv', sep="/"), row.names = TRUE)


currocam/FascinRSCA documentation built on March 21, 2022, 6:29 a.m.