knitr::opts_chunk$set(echo = TRUE, dpi=150) `%!in%` = Negate(`%in%`) library(ggplot2) library(magrittr) library(stringr) library(FascinRSCA) df <- FascinRSCA::df_Q16658
First, we looked at the statistical significance values:
summary(df$pvalue)
A continuación, eliminaremos las secuencias que sean fragmentos, las que han sido borradas y aquellas anotadas como Ubiquitin ligasa.
print("Before filtering using annotation") dim(df) avoid_words <- c('delete','fragment','ubiquitin') df %<>% dplyr::filter(dplyr::if_all(c('Protein names','Gene names', 'Entry name'), ~str_to_lower(.) %>% tidyr::replace_na('') %>% # Para no eliminar las entradas con un NA en la anotación stringr::str_detect(paste(avoid_words,collapse = '|'))%>% `!`)) print("After filtering using annotation") dim(df)
Vamos a eliminar las secuencias redundates al 100% para la misma especie (secuencias repetidas).
df %<>% dplyr::distinct(Status, taxid, fasta_seq, .keep_all = TRUE) print("After filtering duplicated sequences") dim(df)
def_outliers <- vector() df %<>% dplyr::mutate(ann = paste(fasta_header, species, sep = '\n'))
En primer lugar, observamos la distribución de las secuencias en base a su carga y longitud de secuencia.
df %>% tidyr::drop_na(phylum, charge)%>% dplyr::mutate(phylum = forcats::fct_lump_min(phylum, 10))%>% {outliers_boxplot_with_plotly(.$phylum, .$charge, .$ann, "Charge distribution across phyla in Fascin homologous sequences")}
df %>% tidyr::drop_na(phylum, residue_n)%>% dplyr::mutate(phylum = forcats::fct_lump_min(phylum, 10))%>% {outliers_boxplot_with_plotly(.$phylum, .$residue_n, .$ann,"Length distribution across phyla in Fascin homologous sequences")}
outliers <- setNames(c("ENSNMEG00000009168.1|ENSNMEP00000009468.1", "A0A226PTP4_COLVI|A0A226PTP4", "ENSFALG00000012651.1|ENSFALP00000013204.1", "ENSVVUG00000019375.1|ENSVVUP00000026397.1", "ENSPTEG00000031612.1|ENSPTEP00000033001.1", "A0A2U3W8G7_ODORO|A0A2U3W8G7", "A0A401S4A7_CHIPU|A0A401S4A7", "A0A1W4WIZ5_AGRPL|A0A1W4WIZ5", "A0A1W4WV15_AGRPL|A0A1W4WV15"), c("Helmeted guineafowl","Colinus virginianus", "Ficedula albicollis", "Red fox", "Ugandan red Colobus", "Odobenus rosmarus divergens", "Chiloscyllium punctatum", "Agrilus planipennis", "Agrilus planipennis")) df %<>% subset(fasta_header %!in% outliers) def_outliers %<>% append(outliers)
Vamos a excluir todos los outliers por carga.
df %>% dplyr::filter(phylum == 'Chordata')%>% tidyr::drop_na(class, charge) %>% dplyr::mutate(class = forcats::fct_lump_min(class, 10))%>% {outliers_boxplot_with_plotly(.$class, .$charge, .$ann, "Charge distribution across Chordata in Fascin homologous sequences")}
df %>% dplyr::filter(phylum == 'Chordata')%>% tidyr::drop_na(class, residue_n) %>% dplyr::mutate(class = forcats::fct_lump_min(class, 10))%>% {outliers_boxplot_with_plotly(.$class, .$residue_n, .$ann, "Length distribution across Chordata in Fascin homologous sequences")}
outliers <- setNames(c("ENSPMJG00000017038.1|ENSPMJP00000023042.1", "ENSTGUG00000007668.1|ENSTGUP00000007888.1", "ENSPVIG00000000772.1|ENSPVIP00000000835.1", "ENSECAG00000008056.2|ENSECAP00000030944.1", "ENSECAG00000008056.2|ENSECAP00000042721.1", "ENSECAG00000008056.2|ENSECAP00000039899.1", "G5BI79_HETGA|G5BI79",# "ENSSSCG00070015918.1|ENSSSCP00070026109.1", "L5L203_PTEAL|L5L203", "ENSMLEG00000031848.1|ENSMLEP00000015343.1", "ENSEBUG00000014215.1|ENSEBUP00000023069.1", "ENSCMIG00000018465.1|ENSCMIP00000044679.1"), c("Great Tit", "Taeniopygia guttata","Central bearded dragon", "Equus caballus", "Equus caballus", "Equus caballus", "Heterocephalus glaber","Pig USMARC", "Pteropus alecto", "Mandrillus leucophaeus", "Hagfish", "Elephant shark")) df %<>% subset(fasta_header %!in% outliers) df %<>% dplyr::filter(class != "Actinopteri") dim(df) def_outliers %<>% append(outliers)
df %>% dplyr::filter(class == 'Mammalia')%>% tidyr::drop_na(order, charge) %>% dplyr::mutate(order = forcats::fct_lump_min(order, 5))%>% {outliers_boxplot_with_plotly(.$order, .$charge, .$ann, "Charge distribution across Mammalia in Fascin homologous sequences")}
df %>% dplyr::filter(class == 'Mammalia')%>% tidyr::drop_na(order, residue_n) %>% dplyr::mutate(order = forcats::fct_lump_min(order, 5))%>% {outliers_boxplot_with_plotly(.$order, .$residue_n, .$ann, "Length distribution across Chordata in Fascin homologous sequences")}
outliers <- setNames(c("ENSGGOG00000027785.2|ENSGGOP00000021898.2", "ENSSDAG00000013746.1|ENSSDAP00000015277.1", "ENSSDAG00000013746.1|ENSSDAP00000015287.1", "ENSOARG00000018489.1|ENSOARP00000019847.1", "ENSSSCG00000026155.2|ENSSSCP00000057954.1", "ENSPCAG00000000879.1|ENSPCAP00000000808.1", "ENSSSCG00070011455.1|ENSSSCP00070018470.1", "ENSPTRG00000041962.2|ENSPTRP00000061120.2"), c("Gorilla gorilla gorilla", "Daurian ground squirrel", "Daurian ground squirrel","Ovis aries", "Sus scrofa", "Hyrax", "Pig USMARC", "Pan troglodytes")) df %<>% subset(fasta_header %!in% outliers) dim(df) def_outliers %<>% append(outliers)
df %>% dplyr::filter(order == 'Primates')%>% tidyr::drop_na(family, charge) %>% {outliers_boxplot_with_plotly(.$family, .$charge, .$ann)}
df %>% dplyr::filter(order == 'Primates')%>% tidyr::drop_na(family, residue_n) %>% {outliers_boxplot_with_plotly(.$family, .$residue_n, .$ann)}
df %>% dplyr::filter(phylum != 'Chordata')%>% tidyr::drop_na(class, charge) %>% dplyr::mutate(class = forcats::fct_lump_min(class, 10))%>% {outliers_boxplot_with_plotly(.$class, .$charge, .$ann, "Charge distribution across other phyla in Fascin homologous sequences")}
df %>% dplyr::filter(phylum != 'Chordata')%>% tidyr::drop_na(class, residue_n) %>% dplyr::mutate(class = forcats::fct_lump_min(class, 10))%>% {outliers_boxplot_with_plotly(.$class, .$residue_n, .$ann, "Length distribution across other phyla in Fascin homologous sequences")}
outliers <- setNames(c("A0A484AZ75_DRONA|A0A484AZ75", "A0A1B0FA94_GLOMM|A0A1B0FA94"), c("Drosophila navojoa", "Glossina morsitans morsitans")) df %<>% subset(fasta_header %!in% outliers) dim(df) def_outliers %<>% append(outliers)
Creamos una nueva variable con la anotación:
library(dplyr) library(stringr) df$annotation <- df %>% dplyr::filter(fasta_header %!in% def_outliers)%>% dplyr::mutate(`Gene names` = toupper(`Gene names`), `Protein names` = toupper(`Protein names`))%>% tidyr::replace_na(list(`Gene names` = "", `Protein names` = "unknown"))%>% dplyr::mutate( annotation = dplyr::case_when( str_detect(`Gene names`, 'FSCN2') & str_detect(`Protein names`, 'FASCIN.+2.+ISOFORM.+1') ~ "fascin2a", str_detect(`Gene names`, 'FSCN2') & str_detect(`Protein names`, 'FASCIN.+2.+ISOFORM.+2') ~ "fascin2b", str_detect(`Gene names`, 'FSCN1') & str_detect(`Protein names`, 'FASCIN') ~ "fascin1", str_detect(`Gene names`, 'FSCN2') & str_detect(`Protein names`, 'FASCIN') ~ "fascin2", !str_detect(`Gene names`, 'FSCN\\d') & str_detect(`Protein names`, 'FASCIN') ~ "fascin", str_detect(`Gene names`, 'UNCHARACTERIZED') | str_detect(`Protein names`, 'UNCHARACTERIZED') ~ "uncharacterized", !str_detect(`Gene names`, 'FSCN\\d') & (str_detect(`Protein names`, "SINGED(?!.+LIKE)")| str_detect(`Protein names`, 'SN')) ~ "singed", TRUE ~ "unknown" )) %>% dplyr::pull(annotation)
Realizamos el gráfico:
df %>% dplyr::group_by(phylum, order, annotation)%>% dplyr::summarise(charge = mean(charge), residue_n = mean(residue_n), size = dplyr::n())%>% plotly::plot_ly(x = ~charge, y = ~residue_n, size = ~size, type = 'scatter', mode = 'markers',color = ~phylum, colors = 'Paired',#sizes = c(10, 50), marker = list(opacity = 0.5, sizemode = 'diameter'), hoverinfo = 'text', text = ~paste('Annotation:', annotation, 'Order: <i>', order))%>% plotly::layout(title = "Distribution of charge and sequence's length")
df %>% dplyr::mutate(Proteins_names_collapsed = forcats::fct_collapse(df$`Protein names`, fascin1 = c("Fascin actin-bundling protein 1"), fascin2 = c("Fascin-2 (Retinal fascin)", "fascin-2 isoform X1", "Fascin actin-bundling protein 2, retinal", "Fascin-2", "fascin-2 isoform X2"), fascin = c("Fascin (55 kDa actin-bundling protein) (Singed-like protein) (p55)", "Fascin", "Fascin (Singed-like protein)", "Protein singed-like Protein", "GH24440","GM11990"), singed = c("Protein singed","Singed","protein singed isoform X3", "sn", "protein singed", "Singed, isoform B (Singed, isoform E) (Singed, isoform G)", "Blast:Protein singed"), other = c("GH24440", "GM11990"), uncharacterized = c("Uncharacterized protein", "Uncharacterized protein, isoform A (Uncharacterized protein, isoform B)")))%>% dplyr::group_by(phylum, order, Proteins_names_collapsed)%>% dplyr::summarise(charge = mean(charge), residue_n = mean(residue_n), size = dplyr::n())%>% plotly::plot_ly(x = ~charge, y = ~residue_n, size = ~size, type = 'scatter', mode = 'markers',color = ~phylum, colors = 'Paired',#sizes = c(10, 50), marker = list(opacity = 0.5, sizemode = 'diameter'), hoverinfo = 'text', text = ~paste('Annotation:', Proteins_names_collapsed, 'Order: <i>', order))%>% plotly::layout(title = "Distribution of charge and sequence's length")
print(def_outliers) readr::write_lines(def_outliers, paste0("../data-raw/df_Q16658/", "outliers_fasta_headers.txt"))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.