knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(Biostrings)
FMR1_kmer <- readRDS("data/FMR1dat/kmer_FMR1_compare.txt")

d <- FMR1_kmer %>% dplyr::select(kmer, log2FC) %>% filter(!is.infinite(log2FC)) %>% dist(method = "euclidean")
fit <- hclust(d, method="ward")
plot(fit)
groups <- cutree(fit, k=5)
rect.hclust(fit, k=5, border="red")

d <- FMR1_kmer %>% dplyr::select(kmer, log2FC) %>% filter(!is.infinite(log2FC), log2FC > 1) %>% as.data.frame()
rownames(d) <- d$kmer
d <- d[-1] %>% dist(method = "euclidean")
fit <- hclust(d, method="ward")
groups <- cutree(fit, k=4)
rect.hclust(fit, k=4, border="red")

d <- FMR1_kmer %>% dplyr::select(kmer, log2FC) %>% filter(!is.infinite(log2FC), log2FC > 1.5 | log2FC < -1.5) %>% as.data.frame()
rownames(d) <- d$kmer
d <- d[-1] %>% dist(method = "euclidean")
fit <- hclust(d, method="ward")
groups <- cutree(fit, k=4)
rect.hclust(fit, k=4, border="red")
#BCrank creates motifs from ranked fasta files. importantly, ours are not ranked at all. usually BCRank is given all the sequences from chipseq peaks ranked on magnitude. we gave a much smaller fasta

library(BCRANK)


#FMR1_kmer %>% arrange(p_adj) %>% pull(., kmer) %>% as.character() %>% DNAStringSet() %>% writeXStringSet(., "data/FMR1dat/rankedKmer.fa")
#didnt work with ranked kmers...

bcrank_out <- bcrank("data/FMR1dat/longest_3UTR_FMR1.fa")
topMotif <- toptable(bcrank_out, 1)
weightMatrix <- pwm(topMotif, normalize=FALSE)
weightMatrixNormalized <- pwm(topMotif, normalize=TRUE)
library(seqLogo)
seqLogo(weightMatrixNormalized)
#motifRG uses MEME-like algorithms to identify motifs enriched in a forground sequence set over a background set.
#I think this is most usefull for us

library(motifRG)
#this is empty
#FMR1_out <- findMotifFasta("data/FMR1dat/longest_3UTR_FMR1.fa", "data/FMR1dat/longest_3UTR_FMR1_ctrl.fa", max.motif = 3, enriched = TRUE)

#ZBP1_out <- findMotifFasta("data/ZBP1dat/longest_whole_FL.fa","data/ZBP1dat/longest_whole_FL_ctrl.fa",max.motif=3,enriched=T)
#saveRDS(ZBP1_out, "data/ZBP1dat/motifRGouput.txt")
ZBP1_out < readRDS("data/ZBP1dat/motifRGoutput.txt")

pwm.match <- refinePWMMotif(ZBP1_out$motifs[[1]]@match$pattern, readDNAStringSet("data/ZBP1dat/longest_whole_FL.fa"))
seqLogo::seqLogo(pwm.match$model$prob)

pwm.match <- refinePWMMotif(ZBP1_out$motifs[[2]]@match$pattern, readDNAStringSet("data/ZBP1dat/longest_whole_FL.fa"))
seqLogo::seqLogo(pwm.match$model$prob)

pwm.match <- refinePWMMotif(ZBP1_out$motifs[[3]]@match$pattern, readDNAStringSet("data/ZBP1dat/longest_whole_FL.fa"))
seqLogo::seqLogo(pwm.match$model$prob)
#Gadem also identifies motifs enriched in a sequence set however there is no background set comparison

library(rGADEM)
library("BSgenome.Mmusculus.UCSC.mm10")

#gadem <- GADEM(readDNAStringSet("data/FMR1dat/longest_3UTR_FMR1.fa"), verbose=1,genome=Mmusculus)
#saveRDS(gadem, "FMR1gademoutput")
gadem <- readRDS("FMR1gademoutput")

getPWM(gadem)[[1]] %>% seqLogo::seqLogo()
getPWM(gadem)[[2]] %>% seqLogo::seqLogo()
getPWM(gadem)[[3]] %>% seqLogo::seqLogo()
library(universalmotif)
#doesnt run on windows...
run_meme(target.sequences = readDNAStringSet("data/FMR1dat/longest_3UTR_FMR1.fa"), 
        ouput = "data/FMR1dat/MEME", 
        control.sequences = readDNAStringSet("data/FMR1dat/longest_3UTR_FMR1_ctrl.fa"))


TaliaferroLab/FeatureReachR documentation built on Aug. 15, 2021, 2:21 p.m.