tfhunter: Identification and classification of plant transcription factors

Data creation

Data in data/

This object contains the protein sequences of the algae Galdieria sulphuraria, and it was downloaded from PLAZA Diatoms. Only genes that have domains of interest (used for classification) were kept for package size issues.

gsu <- Biostrings::readAAStringSet(
  "ftp://ftp.psb.ugent.be/pub/plaza/plaza_diatoms_01/Fasta/proteome.selected_transcript.gsu.fasta.gz"
)
names(gsu) <- gsub("\ .*", "", names(gsu))

# The code below must be run after identifying TF domains with annotate_pfam()
data("gsu_annotation")
gsu <- gsu[names(gsu) %in% gsu_annotation$Gene]

outfile <- paste0(tempdir(), "/gsu.fasta")
Biostrings::writeXStringSet(gsu, filepath = outfile)
gsu <- Biostrings::readAAStringSet(outfile)

usethis::use_data(
  gsu, compress = "xz", overwrite = TRUE
)

data(gsu)
gsu_annotation <- annotate_pfam(gsu, mode = "local", evalue = 1e-05)

usethis::use_data(
  gsu_annotation, compress = "xz", overwrite = TRUE
)

The classification scheme is the same as implemented in PlantTFDB.

library(tidyverse)
table <- rvest::read_html("http://planttfdb.gao-lab.org/help_famschema.php") %>%
  rvest::html_table()
table <- as.data.frame(table[[3]])

table <- table[-1, -1]
names(table) <- c("Family", "Subfamily", "DBD", "Auxiliary", "Forbidden")
table[table == ""] <- NA

table$DBD <- gsub("\\r\\n\\t\\t\\r\\n\\t\\t", " or ", table$DBD)
table$DBD <- gsub("\ \\(self-build\\)", "", table$DBD)
table$Auxiliary <- gsub("\ \\(self-build\\)", "", table$Auxiliary)
table$Auxiliary <- gsub("BELLor", "BELL or", table$Auxiliary)
table$Forbidden <- gsub("\\r\\n\\t\\tor", " or ", table$Forbidden)

classification_scheme <- table
classification_scheme$DBD[3] <- gsub("or", "and", classification_scheme$DBD[3])

usethis::use_data(classification_scheme,
                  compress = "xz", overwrite = TRUE)

gsu_families <- classify_tfs(gsu_annotation)

usethis::use_data(gsu_families, compress = "xz", overwrite = TRUE)

This data set contains a SummarizedExperiment of motif counts for each family in simulated species.

data(gsu)
set.seed(123) # for reproducibility

# Simulate 5 different species by sampling 100 random genes from `gsu`
proteomes <- list(
    Gsu1 = gsu[sample(names(gsu), 50, replace = FALSE)],
    Gsu2 = gsu[sample(names(gsu), 50, replace = FALSE)],
    Gsu3 = gsu[sample(names(gsu), 50, replace = FALSE)],
    Gsu4 = gsu[sample(names(gsu), 50, replace = FALSE)]
)

# Create simulated species metadata
species_metadata <- data.frame(
    row.names = names(proteomes),
    Division = "Rhodophyta",
    Origin = c("US", "Belgium", "China", "Brazil")
)

tf_counts <- get_tf_counts(proteomes, species_metadata)

usethis::use_data(tf_counts, compress = "xz")

Data in inst/extdata

The seed alignments for all TF families were downloaded from PFAM and converted to profile HMMs using the command hmmbuild from HMMER. Domains that were self-built by PlantTFDB were kindly provided by the database maintainers by email.

library(tidyverse)
domains <- c(
  AP2 = "PF00847", B3 = "PF02362", auxin_resp = "PF06507", 
  GAGA_bind = "PF06217", DUF822 = "PF05687", HLH = "PF00010",
  bZIP_1 = "PF00170", zf_B_box = "PF00643", CCT = "PF06203", 
  Zf_Dof = "PF02701", GATA_zf = "PF00320", Zf_LSD1 = "PF06943",
  Peptidase_C14 = "PF00656", YABBY = "PF04690", Zf_C2H2 =  "PF00096",
  RNase_T = "PF00929", Zf_CCCH =  "PF00642", RRM_1 = "PF00076",
  Helicase_C = "PF00271", CG1 = "PF03859", TCR = "PF03638",
  E2F_TDP = "PF02319", EIN3 = "PF04873", FAR1 = "PF03101",
  STAT = "PF02864", Response_reg = "PF00072", DUF573 = "PF04504",
  GRAS = "PF03514", WRC = "PF08879", QLQ = "PF08880",
  Homeobox = "PF00046", SMART = "PF01852", ELK = "PF03789",
  PHD = "PF00628", HSF = "PF00447", DUF260 = "PF03195",
  FLO_LFY = "PF01698", SRF_TF = "PF00319", K_box = "PF01486",
  MYB_DNAbind = "PF00249", SWIRM = "PF04433", NAM = "PF02365",
  Zf_NF_X1 = "PF01422", CBFB_NFYA = "PF02045", RWP_RK = "PF02042",
  NOZZLE = "PF08744", S1FA = "PF04689", SBP = "PF03110",
  DUF702 = "PF05142", TCP = "PF03634", WRKY = "PF03106",
  Whirly = "PF08536", ZF_HD_dimer = "PF04770"
)

# Define function to download seed alignments for PFAM domains
download_pfam_seeds <- function(domains = NULL) {

  d <- lapply(domains, function(x) {
    url <- paste0('http://pfam.xfam.org/family/', x,
                  '/alignment/seed/format?format=fasta')
    aln_path <- paste0(tempdir(), "/", x, ".fasta")
    if(!file.exists(aln_path)) {
      options(timeout = 5)
      result <- tryCatch({
        downloader::download(url = url, destfile = aln_path)
        res <- TRUE
      },
      error = function(e) return(FALSE),
      warning = function(w) return(FALSE)
      )

      if (!result) {
        file.remove(aln_path)
      }
    }
    return(NULL)
  })
  files <- list.files(tempdir(), pattern = ".fasta")
  ds <- sapply(domains, function(x) return(TRUE %in% 
                                             stringr::str_detect(files, x)))
  status <- data.frame(
    domain = domains,
    download_status = ds
  )
  return(status)
}

download_aln <- download_pfam_seeds(domains)


# Define function to build profile HMMs from alignments
build_profile_HMM <- function(domains = NULL) {

  f <- lapply(domains, function(x) {

    aln_path <- paste0(tempdir(), "/", x, ".fasta")
    hmm_path <- paste0(tempdir(), "/", x, ".hmm")
    build <- system2("hmmbuild", args = c(hmm_path, aln_path))
    sedargs <- paste0("s/aln$/", x, "/g")
    rename_aln <- system2("sed", args = c("-i", sedargs, hmm_path))
    return(NULL)
  })
  files <- list.files(tempdir(), pattern = ".hmm")
  build <- sapply(domains, function(x) return(TRUE %in% 
                                             stringr::str_detect(files, x)))
  status_df <- data.frame(
    domain = domains,
    build_status = build
  )
  return(status_df)
}

build_profile_HMM(domains)

# Combine .hmm files in a single one
combine_domains <- function() {
  files <- list.files(tempdir(), pattern = ".hmm", full.names = TRUE)
  files <- paste0(files, collapse = " ")
  out_hmm <- file.path(tempdir(), "PFAM.hmm")
  system2("cat", args = c(files, " > ", out_hmm))
  return(NULL)
}

combine_domains()
fs::file_move(path = file.path(tempdir(), "PFAM.hmm"),
              new_path = here::here("inst", "extdata", "PFAM.hmm"))

Internal data (sysdata.rda)

This data frame contains the sequence and domain cutoff for each family. The values were retrieved from the Supplementary Materials of the PlantTFDB v2 paper.

library(here)

# Create a list of PFAM domains
pfam_domains <- readLines(here("inst", "extdata", "PFAM.hmm"))
pfam_domains <- pfam_domains[grep("^NAME", pfam_domains)]
pfam_domains <- gsub(".*PF", "PF", pfam_domains)

# Create a list of self-built domains
selfbuilt <- readLines(here("inst", "extdata", "self_built.hmm"))
selfbuilt <- selfbuilt[grep("^NAME", selfbuilt)]
selfbuilt <- gsub("NAME  ", "", selfbuilt)


score_cutoff <- data.frame(domain = c(pfam_domains, selfbuilt))

score_cutoff$domaincutoff <- c(
    11, 15, 20, NA, 12.2, 19.7, 21.5, NA, 21, 22, # 1:10
    22.5, NA, 17, 16, NA, NA, NA, 16, NA, 20.7, # 11:20
    NA, 20.8, 22, 23, 30.2, 21.5, 26, 150, 21.3, 22, # 21:30
    23, 22, 20.5, 21, 20.9, NA, 25, NA, 29.5, 22, # 31:40
    21, 20.5, 22, 21.3, 25, NA, 23, NA, 20.2, 20.2, # 41:50
    20.2, 21.3, NA, 20, 24, 27, 24, 50, 150, 100, # 51:60
    NA, NA, NA, 16
)

usethis::use_data(score_cutoff, compress = "xz", internal = TRUE)

almeidasilvaf/tfhunter documentation built on March 19, 2023, 8:53 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

almeidasilvaf/tfhunter
Identification and classification of plant transcription factors

inst/script/data_creation.md
In almeidasilvaf/tfhunter: Identification and classification of plant transcription factors

Data creation

Data in data/

gsu.rda

gsu_annotation.rda

classification_scheme.rda

gsu_families.rda

tf_counts.rda

Data in inst/extdata

*.hmm files (profile HMMs)

Internal data (sysdata.rda)

R Package Documentation

Browse R Packages

We want your feedback!

almeidasilvaf/tfhunter Identification and classification of plant transcription factors

inst/script/data_creation.md In almeidasilvaf/tfhunter: Identification and classification of plant transcription factors

Data creation

Data in data/

gsu.rda

gsu_annotation.rda

classification_scheme.rda

gsu_families.rda

tf_counts.rda

Data in inst/extdata

*.hmm files (profile HMMs)

Internal data (sysdata.rda)

R Package Documentation

Browse R Packages

We want your feedback!

almeidasilvaf/tfhunter
Identification and classification of plant transcription factors

inst/script/data_creation.md
In almeidasilvaf/tfhunter: Identification and classification of plant transcription factors