# TMHMM: Transmembrane Helix Prediction

# Read one labelled partition file into a data frame.
#
# x: path to a file readable by Biostrings::readAAStringSet(). Each record's
#    text is split at "#" into a sequence part and a topology part.
# i: one-based position of the file among the partition files.
#
# Returns a data frame with the columns PROTEIN (the former row names),
# SEQUENCE, TOPOLOGY and PARTITION (the zero-based integer i - 1).
read_partition <- function(x, i) {
    records <-
        base::as.data.frame(Biostrings::readAAStringSet(x))

    # The converted data frame keeps the record text in a column named `x`;
    # inside separate() that column takes precedence over the path argument.
    labelled <-
        tidyr::separate(
            tibble::rownames_to_column(records, var = "PROTEIN"),
            x,
            c("SEQUENCE", "TOPOLOGY"),
            sep = "#"
        )

    dplyr::mutate(labelled, PARTITION = base::as.integer(i - 1))
}

# Labelled proteins of all partition files found in the package's extdata
# directory, stacked into one data frame with PARTITION as a factor and all
# character columns stripped of surrounding whitespace.
PROTEINS <-
    base::system.file("extdata", package = "TMHMM") %>%
    # The dots are escaped so that they match a literal "." only; unescaped
    # they would match any character and could pick up unintended files.
    base::dir(pattern = "set160\\.[0-9]\\.labels", full.names = TRUE) %>%
    # imap() hands read_partition() the position of each file in the listing,
    # which is turned into a zero-based partition number there.
    # NOTE(review): this presumably relies on dir() returning the files in
    # ascending order with none missing -- confirm against extdata.
    purrr::imap(read_partition) %>%
    dplyr::bind_rows() %>%
    dplyr::mutate(PARTITION = forcats::as_factor(PARTITION)) %>%
    dplyr::mutate_if(base::is.character, stringr::str_trim)

# Reference predictions read from extdata/160.predictions, reduced to a
# single whitespace-trimmed PREDICTION column. The steps run inside local()
# so that the intermediate objects do not remain in the workspace.
PREDICTIONS <- base::local({
    prediction_file <-
        base::system.file("extdata/160.predictions", package = "TMHMM")

    raw_predictions <-
        tibble::rownames_to_column(
            base::as.data.frame(Biostrings::readAAStringSet(prediction_file)),
            var = "PROTEIN"
        )

    # transmute() keeps only PREDICTION, so the PROTEIN column is dropped
    # again; every match of the pattern is removed from the record text.
    cleaned <-
        dplyr::transmute(
            raw_predictions,
            PREDICTION = stringr::str_remove_all(x, "\\s+\\S+#\\s+\\S+0\\s")
        )

    dplyr::mutate_if(cleaned, base::is.character, stringr::str_trim)
})

# Cross-validation of a three-state hidden Markov model over partitions 0-9.
# For each partition i, transition and emission probabilities are estimated
# by counting over the proteins of all other partitions; the sequences of
# partition i are then decoded with the Viterbi algorithm. The decoded state
# paths are collected in ascending partition order into the character vector
# VALIDATION.

# Splits every element of a character vector into its single characters and
# returns a list holding one character vector per element.
split_characters <- function(x) {
    stringr::str_split(x, "")
}

# HMM alphabets, defined once because they do not change between folds. Their
# order fixes the row and column order of the probability matrices below.
state_names <-
    base::c("i", "M", "o")

symbol_names <-
    base::c("A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P",
            "Q", "R", "S", "T", "V", "W", "Y")

# One slot per partition; filled in the loop and flattened afterwards instead
# of growing a vector with c() on every iteration.
validation_paths <-
    base::vector("list", 10)

for (i in 0:9) {
    training <-
        dplyr::filter(PROTEINS, PARTITION != i)

    topologies <-
        split_characters(training$TOPOLOGY)

    states <-
        base::unlist(topologies, use.names = FALSE)

    symbols <-
        base::unlist(split_characters(training$SEQUENCE), use.names = FALSE)

    # Fail early on labels outside the alphabets: with explicit factor levels
    # they would otherwise be dropped from the counts without any notice.
    unexpected <-
        base::c(base::setdiff(states, state_names),
                base::setdiff(symbols, symbol_names))

    if (base::length(unexpected) > 0) {
        base::stop(
            "Unexpected topology labels or residues in training data: ",
            base::paste(unexpected, collapse = ", "),
            call. = FALSE
        )
    }

    # Consecutive state pairs within each protein: `from` drops the last
    # state of every topology, `to` drops the first one.
    from <-
        purrr::map(topologies, utils::head, n = -1) %>%
        base::unlist(use.names = FALSE)

    to <-
        purrr::map(topologies, utils::tail, n = -1) %>%
        base::unlist(use.names = FALSE)

    # Explicit factor levels pin the table layout to state_names and
    # symbol_names. Without them table() sorts the labels by the current
    # locale (e.g. "M" before "i" in the C locale) and leaves out labels that
    # never occur, while the matrices are handed to initHMM() by position.
    transition_probabilities <-
        base::table(
            from = base::factor(from, levels = state_names),
            to = base::factor(to, levels = state_names)
        ) %>%
        base::prop.table(1)

    emission_probabilities <-
        base::table(
            states = base::factor(states, levels = state_names),
            symbols = base::factor(symbols, levels = symbol_names)
        ) %>%
        base::prop.table(1)

    hmm_model <-
        HMM::initHMM(state_names, symbol_names,
                     transProbs = transition_probabilities,
                     emissionProbs = emission_probabilities)

    # Decode every held-out sequence and collapse each state path into a
    # single string.
    state_path <-
        dplyr::filter(PROTEINS, PARTITION == i) %>%
        magrittr::use_series(SEQUENCE) %>%
        split_characters() %>%
        purrr::map(~ HMM::viterbi(hmm_model, .x)) %>%
        purrr::map(base::paste, collapse = "") %>%
        base::as.character()

    validation_paths[[i + 1]] <-
        state_path
}

VALIDATION <-
    base::as.character(base::unlist(validation_paths, use.names = FALSE))

# Wrap the decoded state paths in a one-column data frame; the column is
# named VALIDATION and its strings are kept as character values.
VALIDATION <-
    base::data.frame(VALIDATION = VALIDATION, stringsAsFactors = FALSE)

# Assemble the final dataset column-wise from the labelled proteins, the
# reference predictions and the cross-validated state paths.
# NOTE(review): bind_cols() pairs rows by position only, so the three inputs
# are presumably expected to list the proteins in the same order -- confirm.
TMHMM <-
    dplyr::bind_cols(PROTEINS, PREDICTIONS, VALIDATION)

TMHMM <-
    dplyr::mutate(TMHMM, PROTEIN = forcats::as_factor(PROTEIN))

# Upper-case every remaining character column.
TMHMM <-
    dplyr::mutate_if(TMHMM, base::is.character, stringr::str_to_upper)

# Store the dataset with usethis under the object's own name.
usethis::use_data(TMHMM)