knitr::opts_chunk$set(echo = TRUE, dpi=200)
The purpose of this notebook is to discriminate the homologous sequences to fascin that will be used for the SCA analysis and their reannotation.
# This is code to replicate the analysis and figures made to distinguish between #FSCN1 and FSCN2 sequences. Code developed by currocam. #load libraries library(FascinRSCA) library(tidyverse) library(magrittr) `%!in%` = Negate(`%in%`) # Input file outliers <- scan("../data-raw/df_Q16658/outliers_fasta_headers.txt", what = "character") outliers %<>% append("ENSPTRG00000041962.2|ENSPTRP00000061120.2") df <- FascinRSCA::df_Q16658_WithOutOutliers %>% filter(fasta_header%!in% outliers)%>% mutate(charge_seq_length=charge/residue_n, annotation_original = annotation, annotation_cll = fct_collapse(annotation, fascin1 = c("fascin1"), fascin2 = c("fascin2", "fascin2a", "fascin2b"), fascin = c("fascin"), others = c("uncharacterized", "singed", "unknown")), Status = replace_na(Status, 'unknown') %>% factor()) colour_values <- setNames(c('#999999','#E69F00', '#56B4E9', '#111111'), c('Fascin','Fascin1', 'Fascin2', 'Others')) status_shape <- set_names(c(3, 16, 17), c("NaN", "reviewed", "unreviewed"))
This is how our sequences are taxonomically distributed:
table_tax <- df %>% mutate(phylum= fct_lump(phylum, 15, other_level= "Others")) %>% mutate(class= fct_lump(class, 10, other_level= "Others")) %>% drop_na(phylum, class) %>% group_by(phylum, class,annotation) %>% summarise(n = n()) %>% arrange(phylum, class,annotation) ggplot(table_tax, aes(fill=annotation, y=n, x=phylum)) + geom_bar(position="stack", stat="identity")+ theme(axis.text.x = element_text(face = "italic", size=13), axis.ticks.x=element_blank(),legend.text = element_text(size = 15), strip.text = element_text(face = "italic", size=13))+ ggtitle("Taxonomically distributed sequences")+ ylab("Number of sequences")+ scale_x_discrete(guide = guide_axis(angle = 60)) ggsave(filename = "figures/FSCN1_FSCN2/taxonomic_distribution.pdf")
knitr::kable(table_tax, caption = "Taxonomically distributed sequences")
base <- df %>% filter(class %in% c('Mammalia' ))%>% arrange(desc(phylum, class, order, family, genus, species)) %>% mutate(order= fct_lump(order, 3, other_level= "Others")) %>% ggplot(aes(y=charge_seq_length, x=species, color=annotation_cll)) + geom_point(alpha=1, aes(shape=Status), size=3)+ scale_shape_discrete(drop=TRUE, limits = levels(df$Status))+ scale_colour_discrete(drop=TRUE, limits = levels(df$annotation_cll))+ theme(axis.text.x = element_blank(), axis.ticks.x=element_blank(),legend.text = element_text(size = 15), strip.text = element_text(face = "italic", size=13))+ ggtitle("Differences between sequences in Mammalia's species")+ ylab("Charge divided by sequence length")+ xlab("Species") base + facet_wrap(~order, ncol = 2, scales ="free_x")
There are no annotated sequences reviewed in Carnivora. The sequences are distributed according to the pattern discussed above. There are 2 FSCN1 sequences with an index higher than 0.01, belonging to Vulpes vulpes and Felis catus.
base <- df %>% filter(order %in% c('Carnivora' ))%>% arrange(desc(phylum, class, order, family, genus, species)) %>% ggplot(aes(y=charge_seq_length, x=species, color=annotation_cll)) + geom_point(alpha=1, aes(shape=Status), size=5)+ theme(axis.text.x = element_text(face = "italic", size=13), axis.ticks.x=element_blank(),legend.text = element_text(size = 15), strip.text = element_text(face = "italic", size=13))+ scale_shape_discrete(drop=TRUE, limits = levels(df$Status))+ scale_colour_discrete(drop=TRUE, limits = levels(df$annotation_cll))+ ggtitle("Differences between sequences in Carnivora's species")+ ylab("Charge divided by sequence length")+ xlab("Species")+ geom_hline(yintercept=0.010, linetype="dashed", color = "blue")+ scale_x_discrete(guide = guide_axis(angle = 60)) base + facet_wrap(~suborder, ncol = 2, scales ="free_x")
library(tidyverse) library(magrittr) df %<>% mutate(annotation = case_when( fasta_header == "ENSNVIG00000005656.1|ENSNVIP00000007116.1" ~ "fascin1", #American mink fasta_header == "A0A2Y9JHA1_ENHLU|A0A2Y9JHA1" ~ "fascin1", #Enhydra lutris kenyoni fasta_header == "A0A2Y9L644_ENHLU|A0A2Y9L644" ~ "fascin2", #Enhydra lutris kenyoni fasta_header == "ENSCAFG00020017175.1|ENSCAFP00020021724.1" ~ "fascin1", #Dingo fasta_header == "ENSCAFG00020009227.1|ENSCAFP00020011452.1" ~ "fascin2", #Dingo fasta_header == "ENSVVUG00000006354.1|ENSVVUP00000008455.1" ~ "fascin1", #Red fox fasta_header == "ENSPPRG00000003969.1|ENSPPRP00000009547.1" ~ "fascin1", #Leopard fasta_header == "ENSPPRG00000024836.1|ENSPPRP00000000416.1" ~ "fascin2", #Leopard fasta_header == "A0A3Q7TUG0_URSAR|A0A3Q7TUG0" ~ "fascin2a", #Ursus arctos horribilis, anotado como fscn2b para mantener cohesión TRUE ~ as.character(annotation) ))
In primates it works better to set a slightly higher limit, 0.0115, to include sequences from the Aotidae and Cebidae families. There is one automatically annotated FSCN2 sequence in Callithrix jacchus with an index lower than FSCN1, it is an exception. In Otolemur garnettii sequences the pattern is followed but shifted upwards. There are only reviewed annotated sequences in Homo sapiens.
base <- df %>% filter(order %in% c('Primates'))%>% arrange(desc(phylum, class, order, family, genus, species)) %>% ggplot(aes(y=charge_seq_length, x=species, color=annotation_cll)) + geom_point(alpha=1, aes(shape=factor(Status)), size=4 )+ theme(axis.text.x = element_text(face = "italic", size=13), axis.ticks.x=element_blank(),legend.text = element_text(size = 15), strip.text = element_text(face = "italic", size=13))+ scale_shape_discrete(drop=TRUE, limits = levels(df$Status))+ scale_colour_discrete(drop=TRUE, limits = levels(df$annotation_cll))+ ggtitle("Differences between sequences in Primates")+ ylab("Charge divided by sequence length")+ xlab("Species")+ geom_hline(yintercept=0.0115, linetype="dashed", color = "red")+ scale_x_discrete(guide = guide_axis(angle = 60)) base + facet_wrap(~suborder, ncol = 2, scales ="free_x")
df %<>% mutate(annotation = case_when( fasta_header == "ENSCANG00000013485.1|ENSCANP00000003877.1" ~ "fascin1", #Colobus angolensis palliatus fasta_header == "ENSTGEG00000001910.1|ENSTGEP00000002162.1" ~ "fascin1", #Gelada fasta_header == "ENSTGEG00000009852.1|ENSTGEP00000012087.1" ~ "fascin2", #Gelada fasta_header == "ENSMFAG00000037869.1|ENSMFAP00000037244.1" ~ "fascin2", #Macaca fascicularis fasta_header == "ENSMMUG00000001583.3|ENSMMUP00000002112.3" ~ "fascin2", #Macaca mulatta fasta_header == "ENSRBIG00000007902.1|ENSRBIP00000001562.1" ~ "fascin2", #Rhinopithecus bieti fasta_header == "ENSRROG00000034748.1|ENSRROP00000021684.1" ~ "fascin2", #Rhinopithecus roxellana fasta_header == "ENSPTEG00000031612.1|ENSPTEP00000033004.1" ~ "fascin2", #Ugandan red Colobus fasta_header == "ENSPSMG00000010879.1|ENSPSMP00000015242.1" ~ "fascin1", #Greater bamboo lemur fasta_header == "ENSMICG00000010211.3|ENSMICP00000009297.2" ~ "fascin1", #Mouse lemur fasta_header == "ENSMICG00000031762.2|ENSMICP00000018701.2" ~ "fascin2", #Mouse lemur TRUE ~ as.character(annotation) ))
Most of the Rodentia sequences follow the pattern, however there are more sequences that deviate from it. All annotated sequences reviewed follow the pattern.
base <- df %>% filter(order %in% c('Rodentia'))%>% arrange(desc(phylum, class, order, family, genus, species)) %>% ggplot(aes(y=charge_seq_length, x=species, color=annotation_cll)) + geom_point(alpha=1, aes(shape=factor(Status)), size=4 )+ theme(axis.text.x = element_text(face = "italic", size=13), axis.ticks.x=element_blank(),legend.text = element_text(size = 15), strip.text = element_text(face = "italic", size=13))+ scale_shape_discrete(drop=TRUE, limits = levels(df$Status))+ scale_colour_discrete(drop=TRUE, limits = levels(df$annotation_cll))+ ggtitle("Differences between sequences in Rodentia")+ ylab("Charge divided by sequence length")+ xlab("Species")+ geom_hline(yintercept=0.01, linetype="dashed", color = "blue")+ scale_x_discrete(guide = guide_axis(angle = 60)) base + facet_wrap(~suborder+family, ncol = 6, scales ="free_x")
df %<>% mutate(annotation = case_when( fasta_header == "ENSCCNG00000009421.1|ENSCCNP00000008921.1" ~ "fascin2", #American beaver fasta_header == "ENSFDAG00000016895.1|ENSFDAP00000008686.1" ~ "fascin", #Damara mole rat fasta_header == "ENSHGLG00000012271.1|ENSHGLP00000017138.1" ~ "fascin2", #Naked mole-rat male Como está anotado fasta_header == "ENSHGLG00100005772.1|ENSHGLP00100007730.1" ~ "fascin2", #Naked mole-rat male Como está anotado fasta_header == "ENSCLAG00000013884.1|ENSCLAP00000020263.1" ~ "fascin1", #Long-tailed chinchilla fasta_header == "ENSCLAG00000014531.1|ENSCLAP00000021213.1" ~ "fascin2", #Long-tailed chinchilla fasta_header == "ENSODEG00000012533.1|ENSODEP00000017010.1" ~ "fascin1", #Degu fasta_header == "ENSODEG00000008403.1|ENSODEP00000011308.1" ~ "fascin2", #Degu fasta_header == "ENSCGRG00015000357.1|ENSCGRP00015000341.1" ~ "fascin2", #Chinese hamster CHOK1GS fasta_header == "ENSCGRG00000018194.1|ENSCGRP00000025123.1" ~ "fascin2", #Chinese hamster CriGri fasta_header == "ENSCGRG00015013911.1|ENSCGRP00015018116.1" ~ "fascin1", #Cricetulus griseus fasta_header == "A0A1A6HKX6_NEOLE|A0A1A6HKX6" ~ "fascin1", #Neotoma lepida fasta_header == "A0A1A6HXR6_NEOLE|A0A1A6HXR6" ~ "fascin2", #Neotoma lepida fasta_header == "ENSPEMG00000024060.2|ENSPEMP00000028558.1" ~ "fascin1", #Northern American deer mouse fasta_header == "ENSPEMG00000028380.1|ENSPEMP00000031506.1" ~ "fascin2", #Northern American deer mouse fasta_header == "ENSMOCG00000001115.1|ENSMOCP00000001465.1" ~ "fascin1", #Prairie vole fasta_header == "ENSMOCG00000018078.1|ENSMOCP00000020377.1" ~ "fascin2", #Prairie vole fasta_header == "ENSJJAG00000019494.1|ENSJJAP00000018145.1" ~ "fascin1", #Lesser Egyptian jerboa fasta_header == "ENSJJAG00000015122.1|ENSJJAP00000011928.1" ~ "fascin2", #Lesser Egyptian jerboa fasta_header == "MGP_SPRETEiJ_G0028973.1|MGP_SPRETEiJ_P0074768" ~ "fascin1", #Algerian mouse fasta_header == "MGP_SPRETEiJ_G0018357.1|MGP_SPRETEiJ_P0032954" ~ "fascin2", #Algerian mouse fasta_header == "ENSMUGG00000013467.1|ENSMUGP00000015650.1" ~ "fascin1", #Mongolian gerbil fasta_header == "ENSMUGG00000001646.1|ENSMUGP00000001760.1" ~ "fascin2", #Mongolian gerbil fasta_header == "MGP_CASTEiJ_G0029422.1|MGP_CASTEiJ_P0075825" ~ "fascin1", #Mouse CAST/EiJ fasta_header == "MGP_CASTEiJ_G0018799.1|MGP_CASTEiJ_P0033632" ~ "fascin2", #Mouse CAST/EiJ fasta_header == "MGP_DBA2J_G0030137.1|MGP_DBA2J_P0075380" ~ "fascin1", #Mouse DBA/2J fasta_header == "MGP_DBA2J_G0019328.1|MGP_DBA2J_P0033800" ~ "fascin2", #Mouse DBA/2J fasta_header == "MGP_FVBNJ_G0019320.1|MGP_FVBNJ_P0033798" ~ "fascin2", #Mouse FVB/NJ fasta_header == "MGP_PWKPhJ_G0029137.1|MGP_PWKPhJ_P0075276" ~ "fascin1", #Mouse PWK/PhJ fasta_header == "MGP_PWKPhJ_G0018565.1|MGP_PWKPhJ_P0033324" ~ "fascin2", #Mouse PWK/PhJ fasta_header == "MGP_WSBEiJ_G0029496.1|MGP_WSBEiJ_P0074413" ~ "fascin1", #Mouse WSB/EiJ fasta_header == "MGP_WSBEiJ_G0018851.1|MGP_WSBEiJ_P0033219" ~ "fascin2", #Mouse WSB/EiJ fasta_header == "MGP_CAROLIEiJ_G0027969.1|MGP_CAROLIEiJ_P0071948" ~ "fascin1", #Ryukyu mouse fasta_header == "MGP_CAROLIEiJ_G0017513.1|MGP_CAROLIEiJ_P0031506" ~ "fascin2", #Ryukyu mouse fasta_header == "MGP_PahariEiJ_G0024541.1|MGP_PahariEiJ_P0060161" ~ "fascin1", #Shrew mouse fasta_header == "MGP_PahariEiJ_G0018639.1|MGP_PahariEiJ_P0038222" ~ "fascin2", #Shrew mouse fasta_header == "ENSMSIG00000029517.1|ENSMSIP00000035482.1" ~ "fascin1", #Steppe mouse fasta_header == "ENSMSIG00000014526.1|ENSMSIP00000016977.1" ~ "fascin2", #Steppe mouse fasta_header == "ENSNGAG00000003408.1|ENSNGAP00000003153.1" ~ "fascin1", #Upper Galilee mountains blind mole rat fasta_header == "ENSMMMG00000006216.1|ENSMMMP00000006939.1" ~ "fascin2", #Alpine marmot fasta_header == "ENSUPAG00010011357.1|ENSUPAP00010014127.1" ~ "fascin1", #Arctic ground squirrel fasta_header == "ENSUPAG00010019602.1|ENSUPAP00010024748.1" ~ "fascin2", #Arctic ground squirrel fasta_header == "ENSSTOG00000020923.2|ENSSTOP00000021848.2" ~ "fascin1", #Ictidomys tridecemlineatus fasta_header == "ENSSTOG00000028223.2|ENSSTOP00000023251.1" ~ "fascin2", #Ictidomys tridecemlineatus fasta_header == "ENSCAPG00000016296.1|ENSCAPP00000016394.1" ~ "fascin2", #Brazilian guinea pig TRUE ~ as.character(annotation) ))
base <- df %>% filter(order %in% c('Artiodactyla'))%>% arrange(desc(phylum, class, order, family, genus, species)) %>% ggplot(aes(y=charge_seq_length, x=species, color=annotation_cll)) + geom_point(alpha=1, aes(shape=factor(Status)), size=4 )+ theme(axis.text.x = element_text(face = "italic", size=13), axis.ticks.x=element_blank(),legend.text = element_text(size = 15), strip.text = element_text(face = "italic", size=13))+ scale_shape_discrete(drop=TRUE, limits = levels(df$Status))+ scale_colour_discrete(drop=TRUE, limits = levels(df$annotation_cll))+ ggtitle("Differences between sequences in Artiodactyla order")+ ylab("Charge divided by sequence length")+ xlab("Species")+ geom_hline(yintercept=0.009, linetype="dashed", color = "green")+ geom_hline(yintercept=0.01, linetype="dashed", color = "blue")+ scale_x_discrete(guide = guide_axis(angle = 60)) base + facet_wrap(~order+suborder, ncol = 2, scales ="free_x")
df %<>% mutate(annotation = case_when( fasta_header == "ENSBBBG00000009842.1|ENSBBBP00000017441.1" ~ "fascin1", #American bison fasta_header == "ENSBMUG00000021206.1|ENSBMUP00000028298.1" ~ "fascin1", #Wild yak fasta_header == "ENSTTRG00000006435.1|ENSTTRP00000006091.1" ~ "fascin2", #Dolphin fasta_header == "ENSMFAG00000037869.1|ENSMFAP00000037244.1" ~ "fascin2", #Macaca fascicularis fasta_header == "ENSMMUG00000001583.3|ENSMMUP00000002112.3" ~ "fascin2", #Macaca mulatta fasta_header == "ENSRBIG00000007902.1|ENSRBIP00000001562.1" ~ "fascin2", #Rhinopithecus bieti fasta_header == "ENSRROG00000034748.1|ENSRROP00000021684.1" ~ "fascin2", #Rhinopithecus roxellana fasta_header == "ENSPTEG00000031612.1|ENSPTEP00000033004.1" ~ "fascin2", #Ugandan red Colobus fasta_header == "ENSPSMG00000010879.1|ENSPSMP00000015242.1" ~ "fascin1", #Greater bamboo lemur fasta_header == "ENSMICG00000010211.3|ENSMICP00000009297.2" ~ "fascin1", #Mouse lemur fasta_header == "ENSMICG00000031762.2|ENSMICP00000018701.2" ~ "fascin2", #Mouse lemur TRUE ~ as.character(annotation) ))
base <- df %>% filter(class %in% c('Mammalia'))%>% filter(order %!in% c('Primates', 'Rodentia', 'Carnivora', 'Artiodactyla'))%>% arrange(desc(phylum, class, order, family, genus, species)) %>% ggplot(aes(y=charge_seq_length, x=species, color=annotation_cll)) + geom_point(alpha=1, aes(shape=factor(Status)), size=4 )+ theme(axis.text.x = element_text(face = "italic", size=13), axis.ticks.x=element_blank(),legend.text = element_text(size = 15), strip.text = element_text(face = "italic", size=13))+ scale_shape_discrete(drop=TRUE, limits = levels(df$Status))+ scale_colour_discrete(drop=TRUE, limits = levels(df$annotation_cll))+ ggtitle("Differences between sequences in other orders in Mammalia")+ ylab("Charge divided by sequence length")+ xlab("Species")+ # geom_hline(yintercept=0.009, linetype="dashed", color = "green")+ geom_hline(yintercept=0.01, linetype="dashed", color = "blue")+ scale_x_discrete(guide = guide_axis(angle = 60)) base + facet_wrap(~order+suborder, ncol = 4, scales ="free_x")
df %<>% mutate(annotation = case_when( fasta_header == "ENSPVAG00000017295.1|ENSPVAP00000016319.1" ~ "fascin1", #Megabat fasta_header == "ENSPCIG00000015239.2|ENSPCIP00000019991.1" ~ "fascin1", #Koala fasta_header == "ENSPCIG00000029609.2|ENSPCIP00000040449.1" ~ "fascin2", #Koala fasta_header == "ENSMEUG00000013398.1|ENSMEUP00000012221.1" ~ "fascin2", #Wallaby fasta_header == "ENSEEUG00000002694.1|ENSEEUP00000002460.1" ~ "fascin2", #Hedgehog fasta_header == "ENSEASG00005000015.1|ENSEASP00005000018.1" ~ "fascin1", #Donkey fasta_header == "ENSEASG00005005735.1|ENSEASP00005007879.1" ~ "fascin2", #Donkey fasta_header == "A0A2Y9DGX1_TRIMA|A0A2Y9DGX1" ~ "fascin2", #Trichechus manatus latirostris TRUE ~ as.character(annotation)))
In Aves the pattern just works in Galliformes.
base <- df %>% filter(class %in% c('Aves'))%>% arrange(desc(phylum, class, order, family, genus, species)) %>% mutate(order= fct_lump(order, 2, other_level= "Other orders")) %>% ggplot(aes(y=charge_seq_length, x=species, color=annotation_cll)) + geom_point(alpha=1, aes(shape=Status), size=4 )+ theme(axis.text.x = element_text(face = "italic", size=13), axis.ticks.x=element_blank(),legend.text = element_text(size = 15), strip.text = element_text(face = "italic", size=13))+ scale_shape_discrete(drop=TRUE, limits = levels(df$Status))+ scale_colour_discrete(drop=TRUE, limits = levels(df$annotation_cll))+ ggtitle("Differences between sequences in Aves")+ ylab("Charge divided by sequence length")+ xlab("Species")+ geom_hline(yintercept=0.007, linetype="dashed", color = "green")+ scale_x_discrete(guide = guide_axis(angle = 60)) base + facet_wrap(~order, ncol = 1, scales ="free_x")
En el caso de los Passeriformes nos encontramos que hay un patrón similar, pero la anotación está al revés. Entrando en la url, encontramos que "The sequence shown here is derived from an EMBL/GenBank/DDBJ whole genome shotgun (WGS) entry which is preliminary data". En ese sentido, comparando la interacción por string, nos parece razonable que el patrón sea al revés.
Amazona aestiva tiene muy poca diferencia, por lo que lo dejaremos como fascina.
df %<>% mutate(annotation = case_when( fasta_header == "A0A226NJP1_CALSU|A0A226NJP1" ~ "fascin1", #Callipepla squamata fasta_header == "A0A226MR24_CALSU|A0A226MR24" ~ "fascin2", #Callipepla squamata fasta_header == "A0A226PLS9_COLVI|A0A226PLS9" ~ "fascin2", #Colinus virginianus fasta_header == "ENSNMEG00000003221.1|ENSNMEP00000002566.1" ~ "fascin2", #Helmeted guineafowl fasta_header == "ENSCJPG00005000395.1|ENSCJPP00005000313.1" ~ "fascin1", #Japanese quail fasta_header == "ENSCJPG00005021245.1|ENSCJPP00005027399.1" ~ "fascin2", #Japanese quail # Passeriformes fasta_header == "ENSLCOG00000010881.1|ENSLCOP00000015232.1" ~ "fascin2", #Blue-crowned manakin fasta_header == "ENSLCOG00000000385.1|ENSLCOP00000000419.1" ~ "fascin1", #Blue-crowned manakin # fasta_header == "A0A3L8S262_CHLGU|A0A3L8S262" ~ "fascin2", #Chloebia gouldiae fasta_header == "A0A3L8SG84_CHLGU|A0A3L8SG84" ~ "fascin1", #Chloebia gouldiae fasta_header == "ENSJHYG00000006117.1|ENSJHYP00000007676.1" ~ "fascin1", #Dark-eyed junco fasta_header == "ENSPMJG00000017038.1|ENSPMJP00000023043.1" ~ "fascin2", #Great Tit fasta_header == "ENSPMJG00000017401.1|ENSPMJP00000023569.1" ~ "fascin1", #Great Tit # fasta_header == "A0A3M0J8U2_HIRRU|A0A3M0J8U2" ~ "fascin2", #Hirundo rustica rustica fasta_header == "A0A3M0KY54_HIRRU|A0A3M0KY54" ~ "fascin1", #Hirundo rustica rustica ## fasta_header == "A0A0Q3XBK7_AMAAE|A0A0Q3XBK7" ~ "fascin1", #Amazona aestiva fasta_header == "A0A0Q3XBK7_AMAAE|A0A0Q3XBK7" ~ "fascin2", #Amazona aestiva fasta_header == "ENSNPEG00000000307.1|ENSNPEP00000000338.1" ~ "fascin1", #Chilean tinamou fasta_header == "ENSNPEG00000010488.1|ENSNPEP00000014026.1" ~ "fascin2", #Chilean tinamou fasta_header == "ENSDNVG00000009693.1|ENSDNVP00000013742.1" ~ "fascin2", #Emu fasta_header == "ENSARWG00000006036.1|ENSARWP00000008623.1" ~ "fascin2", #Okarito brown kiwi fasta_header == "ENSABRG00000010882.1|ENSABRP00000012102.1" ~ "fascin2", #Pink-footed goose fasta_header == "ENSCPUG00000010544.1|ENSCPUP00000015119.1" ~ "fascin1", #Ruff fasta_header == "ENSCPGG00000016204.1|ENSCPGP00000023396.1" ~ "fascin2", #Spoon-billed sandpiper TRUE ~ as.character(annotation)))
base <- df %>% filter(phylum %in% c('Chordata'))%>% filter(class %!in% c('Mammalia', 'Actinopteri', 'Aves'))%>% arrange(desc(phylum, class, order, family, genus, species)) %>% # mutate(family= fct_lump(family, 3, other_level= "Others families")) %>% ggplot(aes(y=charge_seq_length, x=species, color=annotation_cll)) + geom_point(alpha=1, aes(shape=Status), size=4 )+ theme(axis.text.x = element_text(face = "italic", size=13), axis.ticks.x=element_blank(),legend.text = element_text(size = 15), strip.text = element_text(face = "italic", size=13))+ scale_shape_discrete(drop=TRUE, limits = levels(df$Status))+ scale_colour_discrete(drop=TRUE, limits = levels(df$annotation_cll))+ ggtitle("Differences between sequences in other Chordata's clasess")+ ylab("Charge divided by sequence length")+ xlab("Species")+ scale_x_discrete(guide = guide_axis(angle = 60)) base + facet_wrap(~class, ncol = 1, scales ="free_x")
df %<>% mutate(annotation = case_when( #fasta_header == "A0A1L8ETI6_XENLA|A0A1L8ETI6" ~ "fascin2", #Xenopus laevis #fasta_header == "FASC_XENLA|Q91837" ~ "fascin2", #Xenopus laevis fasta_header == "A0A401P713_SCYTO|A0A401P713" ~ "fascin1", #Scyliorhinus torazame fasta_header == "A0A401PG70_SCYTO|A0A401PG70" ~ "fascin2", #Scyliorhinus torazame fasta_header == "ENSSMRG00000020250.1|ENSSMRP00000026211.1" ~ "fascin1", #Argentine black and white tegu fasta_header == "ENSSMRG00000004642.1|ENSSMRP00000005740.1" ~ "fascin2", #Argentine black and white tegu fasta_header == "ENSPVIG00000008238.1|ENSPVIP00000010398.1" ~ "fascin1", #Central bearded dragon fasta_header == "ENSPVIG00000000772.1|ENSPVIP00000000848.1" ~ "fascin", #Central bearded dragon anotado como fascin-like #Poca diferencia, lo pondría en duda, pero viene anotado así fasta_header == "ENSNSUG00000014196.1|ENSNSUP00000019414.1" ~ "fascin", #Mainland tiger snake fasta_header == "ENSNSUG00000009748.1|ENSNSUP00000013015.1" ~ "fascin", #Mainland tiger snake fasta_header == "ENSSPUG00000002887.1|ENSSPUP00000003759.1" ~ "fascin1", #Tuatara fasta_header == "ENSSPUG00000014373.1|ENSSPUP00000018627.1" ~ "fascin2", #Tuatara fasta_header == "ENSCABG00000005872.1|ENSCABP00000007792.1" ~ "fascin1", #Abingdon island giant tortoise fasta_header == "ENSCABG00000012992.1|ENSCABP00000017491.1" ~ "fascin2", #Abingdon island giant tortoise fasta_header == "A0A1U7SFR2_ALLSI|A0A1U7SFR2" ~ "fascin2", #Alligator sinensis species == "Australian saltwater crocodile" ~ "fascin", # Diferencia de dos aminoácidos fasta_header == "ENSCPBG00000020810.1|ENSCPBP00000029528.1" ~ "fascin1", #Painted turtle fasta_header == "ENSCPBG00000010404.1|ENSCPBP00000013973.1" ~ "fascin2", #Painted turtle TRUE ~ as.character(annotation)))
base <- df %>% filter(phylum %!in% c('Chordata'))%>% drop_na(class)%>% arrange(desc(phylum, class, order, family, genus, species)) %>% ggplot(aes(y=charge_seq_length, x=species, color=annotation_cll)) + geom_point(alpha=1, aes(shape=Status), size=4 )+ theme(axis.text.x = element_text(face = "italic", size=13), axis.ticks.x=element_blank(),legend.text = element_text(size = 15), strip.text = element_text(face = "italic", size=13))+ scale_shape_discrete(drop=TRUE, limits = levels(df$Status))+ scale_colour_discrete(drop=TRUE, limits = levels(df$annotation_cll))+ ggtitle("Differences between sequences in other phyla")+ ylab("Charge divided by sequence length")+ xlab("Species")+ scale_x_discrete(guide = guide_axis(angle = 60)) base + facet_wrap(~phylum, ncol = 2, scales ="free_x")
df %<>% mutate(annotation = case_when( fasta_header == "D6WYE3_TRICA|D6WYE3" ~ "fascin", #Tribolium castaneum fasta_header == "B4JLX1_DROGR|B4JLX1" ~ "fascin", #Drosophila grimshawi fasta_header == "B4IL65_DROSE|B4IL65" ~ "fascin", #Drosophila grimshawi TRUE ~ as.character(annotation)))
freezing <- df %>% filter(class %!in% c('Actinopteri', 'Chondrichthyes')) %>% select(-c(B, Z, charge_seq_length, annotation_cll)) #Problem with B & Z in EMBOSS Pepstats freezing$reannotated <- factor(ifelse(freezing$annotation != freezing$annotation_original, "Yes", "No"))
library(hrbrthemes) library(viridis) library(FascinRSCA) freezing %>% group_by(class, annotation)%>% summarise(n = n(), charge = mean (charge), residue_n = mean(residue_n), .groups = "drop")%>% ggplot(aes(x=charge, y=residue_n, size=n, fill=class)) + geom_point(alpha=0.5, shape=21, color="black") + scale_size(range = c(.1, 24), name="Number of sequences") + #scale_fill_viridis(discrete=TRUE, guide="none", option="A") + theme_ipsum() + ylab("Charge") + xlab("Number of residues") ggsave(filename = "figures/FSCN1_FSCN2/bubble_plot.tiff", dpi = 400)
df %>% mutate(class= fct_lump(class, 7, other_level= "Others")) %>% filter(class != "Others")%>% ggplot(aes(x=class, y=charge/residue_n)) + geom_violin(show.legend = FALSE, adjust = 0.5)+ geom_dotplot(aes(fill = as.factor(annotation)), binaxis = "y", binwidth = 0.0005, stackdir = "center") + theme(axis.text.x = element_text(face = "italic", size=13), axis.ticks.x=element_blank(),legend.text = element_text(size = 15), strip.text = element_text(face = "italic", size=13))+ scale_x_discrete(guide = guide_axis(angle = 60)) ggsave(filename = "figures/FSCN1_FSCN2/violin_plot.pdf", dpi = 300)
write.csv(freezing, paste("../data", 'freezing_fascin.csv', sep="/"), row.names = TRUE) usethis::use_data(freezing, overwrite = TRUE) freezing %>% count(reannotated)
human <- subset(freezing, species == 'Homo sapiens') freezingNonRedundant <- rbind(human, freezing) %>% distinct(fasta_seq, .keep_all = TRUE) write.csv(freezingNonRedundant, paste("../data", 'freezingNonRedundant_fascin.csv', sep="/"), row.names = TRUE) usethis::use_data(freezingNonRedundant, overwrite = TRUE) write.csv(df %>% filter(class %in% c('Actinopteri', 'Chondrichthyes')), paste("../data", 'peces_fascin.csv', sep="/"), row.names = TRUE)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.