# R/pp_matrix_tm.R

# library(tm)
#
# ids <- dataset
#
# library(dplyr)
# ids <- ids %>%
#     filter(`percolator q-value` < 0.01) %>%
#     select(sequence, `protein id`) %>%
#     group_by(sequence) %>%
#     summarise(all_matched_proteins = paste(`protein id`, sep = ",", collapse = ","))
# proteins <- ids[['all_matched_proteins']]
# names(proteins) <- ids[['sequence']]
#
# comma_tokens <- Regexp_Tokenizer(',')
# pp_as_corpus <- Corpus(VectorSource(ids[['all_matched_proteins']]))
# meta(pp_as_corpus)
# meta(pp_as_corpus, 'labels', 'indexed') <- names(proteins)
# pp_as_corpus
# pp_matrix <- DocumentTermMatrix(pp_as_corpus, control = list(tokenize = comma_tokens))
# nonsparse_pp_matrix <- as.matrix(pp_matrix)
# ?DocumentTermMatrix
# ?Corpus
# mstaniak/SharedPeptides documentation built on Jan. 21, 2020, 7:29 p.m.