# data-raw/df_Q16658.R

## Code to prepare the `df_Q16658` dataset.
# NOTE(review): `%>%` is never attached in this script; presumably it is made
# available by loading the package (e.g. devtools::load_all()) -- confirm.

# NCBI taxonomic info ----
# Start from the HMMER hits shipped with the package and attach the taxonomy
# table built from their taxids.
df <- FascinRSCA::hmmer_Q16658
hmmer_Q16658_tax <- dplyr::right_join(
  FascinRSCA::create_ncbi_basic_tax_dataframe(as.character(df$taxid)),
  df,
  by = "taxid"
) %>%
  # Both tables carry a `species` column, so the join yields `species.x`
  # (taxonomy side) and `species.y` (HMMER side). Keep the HMMER one under the
  # plain name `species` and drop both suffixed columns.
  dplyr::mutate(species = species.y) %>%
  dplyr::select(-species.x, -species.y)

# EMBOSS Pepstats ----
# Export the homologous sequences as FASTA files (plus a plain list of their
# headers) so that they can be processed with EMBOSS pepstats outside of R.
path_pepstats <- "data-raw/pepstats_Q16658/"

fasta_seq <- hmmer_Q16658_tax %>%
  dplyr::pull(fasta_seq) %>%
  Biostrings::AAStringSet()

# Name the sequences from the SAME table they were pulled from. Using
# `df$fasta_header` here would mislabel sequences whenever the join above
# changes the row order or row count relative to `df`; it would also disagree
# with the ID file written below, which uses `hmmer_Q16658_tax`.
names(fasta_seq) <- hmmer_Q16658_tax$fasta_header

# The sequences are written as two files: the first 500 and the remainder.
# Fail early with a clear message instead of an out-of-bounds subscript error
# when there are not enough sequences for that split.
n_seqs <- length(fasta_seq)
if (n_seqs <= 500L) {
  stop(
    "Expected more than 500 sequences to split into two FASTA files, got ",
    n_seqs, ".",
    call. = FALSE
  )
}

fasta_seq[seq_len(500L)] %>%
  Biostrings::writeXStringSet(
    paste0(path_pepstats, "fascin_homologous_1_500.fasta")
  )
# NOTE(review): the file name hard-codes 860 as the last index regardless of
# the actual number of sequences -- confirm this is still accurate.
fasta_seq[501:n_seqs] %>%
  Biostrings::writeXStringSet(
    paste0(path_pepstats, "fascin_homologous_501_860.fasta")
  )

# One header per line; passed to the pepstats parser script via its `-n` flag.
write(hmmer_Q16658_tax$fasta_header, paste0(path_pepstats, "id_pepstats.txt"))
# Manual steps performed outside of R before continuing:
# 1. Run EMBOSS pepstats on the exported FASTA files.
# 2. Concatenate the outputs:
#    cat fascin_homologous_* > fascin_homologous_1_860.pepstats
# 3. Parse them into a CSV with the Python script:
#    python3 parser_pepstats.py -i fascin_homologous_1_860.pepstats -n ../id_pepstats.txt -o fascin_homologous_1_860 --csv

# Attach the parsed pepstats results to every sequence, matching the FASTA
# header against the `names` column of the CSV.
pepstats_Q16658 <- hmmer_Q16658_tax %>%
  dplyr::left_join(
    readr::read_csv(
      paste0(path_pepstats, "EMBOSS/fascin_homologous_1_860.csv")
    ),
    by = c("fasta_header" = "names")
  )

# Sanity check: the residue count reported in the CSV must equal the length of
# the sequence we exported. Rows without a pepstats match have `residue_n` set
# to NA by the left join; they are treated as failures here rather than
# surfacing as "missing value where TRUE/FALSE needed".
length_matches <-
  nchar(pepstats_Q16658$fasta_seq) == pepstats_Q16658$residue_n
if (!isTRUE(all(length_matches))) {
  stop(
    "pepstats `residue_n` does not match the sequence length for ",
    sum(is.na(length_matches) | !length_matches), " of ",
    length(length_matches), " rows (missing values count as mismatches).",
    call. = FALSE
  )
}
# UniProt query ----
# Look up the annotation for both accession columns and keep only the fields
# that go into the final dataset.
UniProt_results <- FascinRSCA::annotation_uniprot_query(
  pepstats_Q16658$acc,
  pepstats_Q16658$acc2
) %>%
  dplyr::select(
    "Entry", "Entry name", "Status",
    "Protein names", "Gene names",
    "Organism", "Length", "acc", "acc2"
  )

# Keep one annotation row per (acc, acc2) pair so the join below cannot
# duplicate sequences, then attach the annotation to every pepstats row
# (right join: all rows of `pepstats_Q16658` are kept).
df_Q16658 <- UniProt_results %>%
  dplyr::distinct(acc, acc2, .keep_all = TRUE) %>%
  dplyr::right_join(pepstats_Q16658, by = c("acc", "acc2"))

# `str()` prints the structure itself and returns NULL invisibly; wrapping it
# in `print()` only added a stray "NULL" line to the output.
str(df_Q16658)
usethis::use_data(df_Q16658, overwrite = TRUE)
# currocam/FascinRSCA documentation built on March 21, 2022, 6:29 a.m.