Este cuaderno muestra de forma ordenada el código empleado para cargar el dataset en los scripts hmmer_Q16658.R
, df_Q16658.R
y df_Q16658_WithOutliers.R
. La lista de outliers utilizada en eset último script se obtiene en el siguiente cuaderno.
knitr::opts_chunk$set(echo = TRUE)
Leemos todos los archivos .json y .fa que nos descargamos de HMMER. Para ello hacemos uso de las funciones Biostrings::readAAStringSet
y de la función propia create_hmmer_json_dataframe
.
library(FascinRSCA) path <- "data-raw/hmmer_Q16658/" files_json <- purrr::map_chr(dir(path, pattern = "*.json"), ~paste0(path, .x)) dataset_names <- c("Ensembl", "RefProteomes", "SwissProt") files_fasta <- purrr::map_chr(dir(path, pattern = "*fullseq.fa"), ~paste0(path, .x)) df <- FascinRSCA::create_hmmer_json_dataframe(files_json, dataset_names, desc_null = TRUE) %>% dplyr::select(-c(domains, seqs, pdbs)) %>% dplyr::distinct() %>% dplyr::arrange(file_label)
We load the fasta file
fasta_seq <- Biostrings::readAAStringSet(files_fasta)
Changue order manually some entries and we save the data.
purrr::walk2(c(38, 112, 313, 636), c(39, 113, 314, 637), function(x, y){ temp <- df[y,] df[y,] <<- df[x,] df[x,] <<- temp }) if (all(df$acc == names(fasta_seq))){ df <- df %>% dplyr::mutate(fasta_header = paste(df$acc,df$acc2, sep = "|") %>% make.unique(sep = '_'), fasta_seq = as.character(fasta_seq)) hmmer_Q16658 <- df usethis::use_data(hmmer_Q16658, overwrite = TRUE) }
df <- FascinRSCA::hmmer_Q16658 hmmer_Q16658_tax <- FascinRSCA::create_ncbi_basic_tax_dataframe(as.character(df$taxid))%>% dplyr::right_join(df, by = "taxid")%>% dplyr::select(-species.x)%>% dplyr::mutate(species = species.y)%>% dplyr::select(-species.y)
Escribimos las secuencias en dos archivos porque la herramienta online tiene un máximo de 500 secuencias.
path_pepstats <- "../data-raw/pepstats_Q16658/" fasta_seq <- hmmer_Q16658_tax %>% dplyr::pull(fasta_seq) %>% Biostrings::AAStringSet() names(fasta_seq) <- df$fasta_header fasta_seq[1:500] %>% Biostrings::writeXStringSet(paste0(path_pepstats, 'fascin_homologous_1_500.fasta')) fasta_seq[501:length(fasta_seq)] %>% Biostrings::writeXStringSet(paste0(path_pepstats, 'fascin_homologous_501_860.fasta')) write(hmmer_Q16658_tax$fasta_header, paste0(path_pepstats, "id_pepstats.txt"))
Use pepstats EMBOSS in web. Concatenate files:
```{bash, concatenate_pepstats, eval=FALSE} cat fascin_homologous_* > fascin_homologous_1_860.pepstats
Call python script: ```{bash parser_pepstats, eval=FALSE} python3 parser_pepstats.py -i fascin_homologous_1_860.pepstats -n ../id_pepstats.txt -o fascin_homologous_1_860 --csv
Guardamos las estadísticas calculadas por pepstats a nuestro dataframe.
pepstats_Q16658 <- hmmer_Q16658_tax %>% dplyr::left_join(by = c("fasta_header" = "names"), readr::read_csv(paste0(path_pepstats, "EMBOSS/fascin_homologous_1_860.csv"))) if (!all(nchar(pepstats_Q16658$fasta_seq) == pepstats_Q16658$residue_n)){ print("¡No concuerdan las longitudes de secuencia calculadas con las de las secuencias!")}
UniProt_results <- FascinRSCA::annotation_uniprot_query(pepstats_Q16658$acc, pepstats_Q16658$acc2) %>% dplyr::select("Entry", "Entry name", "Status", "Protein names", "Gene names", "Organism","Length", "acc", "acc2") df_Q16658 <- UniProt_results %>% dplyr::distinct(acc, acc2, .keep_all = TRUE) %>% dplyr::right_join(pepstats_Q16658, by = c("acc" = "acc","acc2" = "acc2")) usethis::use_data(df_Q16658, overwrite = TRUE)
print(str(df_Q16658))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.