#install.packages("devtools") library(devtools) library(dplyr) install_git("https://github.com/MaudeDavidLab/GMEmbeddings", ref = "master") library(GMEmbeddings)
data_dir <- "inst/extdata/test_dataset_1/"
seqtab <- read.csv(paste0(data_dir, "asv_table.csv"), row.names = 1) seqtab <- t(seqtab)
fasta <- read.table(paste0(data_dir,"repseqs.fasta"), quote="\"", comment.char="") seqs <- fasta$V1[seq(2, nrow(fasta), 2)] ids <- gsub(">", "", fasta$V1[seq(1, nrow(fasta), 2)]) colnames(seqtab) <- ids
blast_hits <- read.delim(paste0(data_dir,"best_hits.tsv"), header = FALSE, sep = " ") colnames(blast_hits) <- c("qseqid", "sseqid", "qseq", "sseq", "evalue", "bitscore", "length", "pident") print(dim(blast_hits))
id_thresh = 99 #consider hits where the percent identity to the query sequence is 99% or higher best_hits = getBestHits(blast_hits = blast_hits, id_thresh = id_thresh)
aligned = best_hits %>% group_by(qseq) %>% summarize(num_aligned = n())
threshold <- 50 drop_seqs <- aligned$qseq[aligned$num_aligned > threshold] best_hits <- best_hits[!(best_hits$qseq %in% drop_seqs), ] aligned = best_hits %>% group_by(qseq) %>% summarize(num_aligned = n()) cat("Median embedding sequences hit: ", median(aligned$num_aligned), "\n") cat("Mean embedding sequences hit: ", mean(aligned$num_aligned), "\n") cat("Max embedding sequences hit: ", max(aligned$num_aligned), "\n")
svg(file= paste0(data_dir, "embedding_sequences_hit.svg")) hist(aligned$num_aligned, breaks = 40, main = "Number of embedding sequences hit by Pilot sequences", xlab = "# embedding seqs hit") dev.off()
Make sure you use a file called 'glove_emb_id' and NOT 'glove_emb_fullseq'. The package uses ids to match alignment within the blast_hits file, and will not work if provided an embedding matrix with nucleotide sequences instead of id numbers
embedding_filepath <- system.file("extdata/glove_transformation_matrices/", "glove_emb_id_50.txt", package = "GMEmbeddings") embedding_matrix <- read.delim(embedding_filepath, row.names = 1, sep = "\t") embedding_matrix <- embedding_matrix[rownames(embedding_matrix) != "<unk>", ]
result <- EmbedAsvTable(seqtab, best_hits, embedding_matrix, id_thresh = id_thresh) embedded <- result$embedded num_seqs_aligned <- result$num_seqs_aligned percent_sequences_aligned <- result$percent_sequences_aligned colnames(embedded) <- seq(1, ncol(embedded))
out_filepath <- paste0(data_dir, "embedded_50.csv") write.csv(embedded, out_filepath, quote = F)
library(KEGGREST) embed <- read.table("inst/extdata/glove_transformation_matrices/glove_emb_fullseq_50.txt", quote="\"", comment.char="", row.names = 1) corrmat <- getPathwayCorrelationMatrix(embed) fig <- getHeatmap(corrmat)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.