CpG Count as a motif

tss_seqs <- seqinr::read.fasta("../data/motifs/promoter_sequences/all_genes_tss_1000.fasta", forceDNAtolower = FALSE, as.string = TRUE)
tss_seqs_s <- toupper(as.character(tss_seqs))
names(tss_seqs_s) <- gsub("\\(-\\)|\\(\\+\\)","",names(tss_seqs))

tss_seqs_s <- substr(tss_seqs_s,500,2000)

#CG_count <- sapply(tss_seqs_s,function(x) stringr::str_count(x,"CG"))
CG_locations <- sapply(tss_seqs_s,function(x) stringr::str_locate_all(x,"CG")[[1]][,1])

CG_count <- sapply(CG_locations, function(x) sum(x>450 & x<1000))

hist(CG_count, breaks=1000)
abline(v=20,col="blue", lwd=3)
abline(v=45,col="blue", lwd=3)

hist(unlist(CG_locations), breaks=100)
abline(v=500, col="blue", lwd=3)

For best matches, find probability of given gene regulated

best_denovo <- readRDS("../data/motifs/best_denovo_V3.rds")
motif_matches <- readRDS("../data/motifs/motif_matches_v3.rds")

# Load Regprobs
HM11_regprobs <- readRDS("../data/motifs/HocomocoV11_regprobs_matrix_core_and_extra.rds")
str(HM11_regprobs)

# load motif categories
motif_groups <- read.table("../data/motifs/motif_results_groups.csv", fill=T, sep = ",", stringsAsFactors=F)
motif_groups$group <- c(1:(nrow(motif_groups))) #c(NA,1:(nrow(motif_groups)-1))
motif_groups <- data.table(melt(motif_groups, id.vars="group"))[value!=""]


regprobs_matrix <- HM11_regprobs[,best_denovo$Target_ID]
colnames(regprobs_matrix) <- sapply(strsplit(colnames(regprobs_matrix), "_"),function(x) x[1])

regprobs_matrix_all <- HM11_regprobs[,motif_matches[Target.Name %in% motif_groups[!is.na(group)]$value]$Target_ID]
colnames(regprobs_matrix_all) <- sapply(strsplit(colnames(regprobs_matrix_all), "_"),function(x) x[1])

str(regprobs_matrix)
str(regprobs_matrix_all)

add custom (new) motifs to matrix

results_custom <- readRDS("../data/motifs/MF_results_custom_V3b.rds")
regprobs_matrix_custom <- sapply(results_custom, function(x) x$dt$regprob)
rownames(regprobs_matrix_custom) <- results_custom[[1]]$dt$sequence

regprobs_matrix_custom <- regprobs_matrix_custom[which(rownames(regprobs_matrix_custom) %in% names(tss_seqs_s)), ]

# add CpG Motif to matrix
regprobs_matrix_custom <- cbind("CpG"=sqrt(CG_count)[rownames(regprobs_matrix_custom)]/max(CG_count),
                                regprobs_matrix_custom)
SDAresults$loadings_split <- split_loadings()#[rownames(HM11_regprobs),]

loadings_subset <- t(SDAresults$loadings[[1]][,rownames(regprobs_matrix_custom)])
qplot(1:50,tapply(CG_count[names(sort(loadings_subset[,"V5"],T))],
                  cut(seq_along(CG_count[names(loadings_subset[,"V5"])]), 50),
                  function(x) mean(x,na.rm=T)),
      ylab="Mean CpG count",
      xlab="Binned Component 5 loadings")

summary(lm(loadings_subset[,"V5"] ~ CG_count[rownames(loadings_subset)]*(loadings_subset[,"V5"]>0)))

Ranked Component Loading vs Motif Probability

# plot raw associations
# regrank_matrix <- apply(regprobs_matrix_custom, 2, rank)

plot(rank(SDAresults$loadings[[1]][5,rownames(regprobs_matrix)]),rank(regprobs_matrix[,"NRF1"]))
plot(rank(SDAresults$loadings[[1]][5,rownames(regprobs_matrix)]),rank(regprobs_matrix[,"SP2"]))
plot(rank(SDAresults$loadings[[1]][42,rownames(regprobs_matrix)]),rank(regprobs_matrix[,"MYBA"]))
plot(rank(SDAresults$loadings[[1]][11,rownames(regprobs_matrix)]),rank(regprobs_matrix[,"SPI1"]))

plot(rank(SDAresults$loadings[[1]][17,rownames(regprobs_matrix)]),rank(regprobs_matrix_custom[,"ATF1"]))

Binned Component Loading vs Mean Motif Probability

# For a give component, display binned associations

library(testisAtlas)

motif_order <- c("CpG","SP2","TAF1","GABPA","NRF1","ETV5","TYY1","NFYA","ZN143","MYBA","STRA8","RFX6","ATF1","RARA","TBP","SPI1","CREMT")

binned_regprobs_custom <- binned_mean_regprob(regprobs = regprobs_matrix_custom[,motif_order])

plot_component_regprobs(11, binned_regprobs_custom)
plot_component_regprobs(5, binned_regprobs_custom)
plot_component_regprobs(42, binned_regprobs_custom)
plot_component_regprobs(28, binned_regprobs_custom)
plot_component_regprobs(30, binned_regprobs_custom)
plot_component_regprobs(17, binned_regprobs_custom)

compare to the mean loadings per bin of loading

bin_factor <- cut(seq_along(regprobs_matrix[,1]), 50)

tmp <- sort(SDAresults$loadings[[1]][30,])
tmp <- tmp[names(tmp) %in% rownames(regprobs_matrix)]
plot(tapply(tmp, bin_factor, mean), main="30")

tmp <- sort(SDAresults$loadings[[1]][5,])
tmp <- tmp[names(tmp) %in% rownames(regprobs_matrix)]
plot(tapply(tmp, bin_factor, mean), main="5")

tmp <- sort(SDAresults$loadings[[1]][17,])
tmp <- tmp[names(tmp) %in% rownames(regprobs_matrix)]
plot(tapply(tmp, bin_factor, mean), main="17")

tmp <- sort(SDAresults$loadings[[1]][42,])
tmp <- tmp[names(tmp) %in% rownames(regprobs_matrix)]
plot(tapply(tmp, bin_factor, mean), main="42")

Mean regprob by component

library(ComplexHeatmap)
library(RColorBrewer)

plot_meanprob_bycomp(binned_regprobs_custom)

Heatmap(t(binned_regprobs_custom["SP2",,component_order]),
        col=rev(colorRampPalette(brewer.pal(11, "RdYlBu"))(100)),
        cluster_rows = FALSE,
        cluster_columns = FALSE,
        column_title = "SP2")

Heatmap(t(binned_regprobs_custom["RFX6",,component_order]),
        col=rev(colorRampPalette(brewer.pal(11, "RdYlBu"))(100)),
        cluster_rows = FALSE,
        cluster_columns = FALSE,
        column_title = "RFX2")

Heatmap(t(binned_regprobs_custom["GABPA",,component_order]),
        col=rev(colorRampPalette(brewer.pal(11, "RdYlBu"))(100)),
        cluster_rows = FALSE,
        cluster_columns = FALSE,
        column_title = "GABPA")

Heatmap(t(binned_regprobs_custom["MYBA",,component_order]),
        col=rev(colorRampPalette(brewer.pal(11, "RdYlBu"))(100)),
        cluster_rows = FALSE,
        cluster_columns = FALSE,
        column_title = "MYBA")

Heatmap(t(binned_regprobs_custom["ATF1",,component_order]),
        col=rev(colorRampPalette(brewer.pal(11, "RdYlBu"))(100)),
        cluster_rows = FALSE,
        cluster_columns = FALSE,
        column_title = "ATF1*")

mean_prob_matrix <- cbind(binned_regprobs_custom[,50,],binned_regprobs_custom[,1,])
colnames(mean_prob_matrix) <- colnames(SDAresults$loadings_split)

mean_prob_matrix <- t(mean_prob_matrix[,ordering[-half_exclusions]])
rownames(mean_prob_matrix) <- paste(rownames(mean_prob_matrix), component_names[-half_exclusions])

NMF::aheatmap(scale(mean_prob_matrix), Rowv = NA, hclustfun = "ward.D2")
regprob_enrich <- function(i,motif,n=500){
  a=regprobs_matrix_custom[names(sort(SDAresults$loadings_split[rownames(regprobs_matrix_custom),i],T))[1:n],motif]
  b=regprobs_matrix_custom[names(sort(SDAresults$loadings_split[rownames(regprobs_matrix_custom),i],T))[(n+1):4512],motif]
  return(wilcox.test(a,b, alternative = "greater")$p.value)
}


comps <- 1:100
names(comps) <- colnames(SDAresults$loadings_split)
combined <- sapply(comps, function(x) regprob_enrich(x, "STRA8", n=500))
#plot(-log10(combined))

  tmp <- data.table(
    component = factor(names(combined), levels=names(combined)[c(rbind(component_order_all,component_order_all+50))]),
    p.value = combined
  )

  tmp$name <- rep(component_order_dt$name,2)

  tmp[order(component), rank := 1:100]
  tmp[rank >= 41, Type := "Meiotic"]
  tmp[rank < 41, Type := "Somatic"]
  tmp$OR = 1

  manhatten_plot(tmp[!component %in% half_exclusion_comps], topn = 4)

Correlation of Motif Probability and Gene Loadings

For selected motifs

motif_order <- c("CpG","SP2","TAF1","GABPA","NRF1","ETV5","TYY1","NFYA","ZN143","STRA8","MYBA","RFX6","ATF1","RARA","TBP","SPI1","CREMT")

vvv3 <- correlate_regprobs(regprobs = regprobs_matrix_custom[,motif_order])

vvv3 <- vvv3[ordering[-half_exclusions],]
rownames(vvv3) <- paste(rownames(vvv3), component_names[-half_exclusions])

#motif_order <- c("SP2","ETV5","CpG","TAF1","ZN143","NRF1","GABPA","NFYA","ZFP42","MYBA","MLX","CREM","RFX2","TBP","STAT1","SPIB")
#vvv3 <- vvv3[,motif_order]
#c("Sp2","Etv5","CpG","Taf1","Zfp143","Gabpa","Nrf1","Nfya","Yy1/Zfp42","Mybl1","Mlx","Crem","Rfx2","Tbp","Stat1","Spi1","Crem-t")

motif_gene_names <- c("CpG","Sp2","Taf1","Gabpa","Nrf1","Etv5","Zfp42\n (Yy1)","Nfya","Zfp143","Stra8*","Mybl1","Rfx2\n (Rfx6)","Crem\n (Atf1)","Rara","Tbp","Spi1","Crem-t")
colnames(vvv3) <- motif_gene_names

Create annotations

library(ComplexHeatmap)

results_custom_motifs <- sapply(results_custom, function(x) x$scoremat)
hascpg <- c("CpG"=NA,sapply(results_custom_motifs, function(x) MotifFinder::submotif(exp(x))))

#meth_sens = c("SP2"="No","ETV5"="Yes","CpG"="NA","TAF1"="No","ZN143"="No","NRF1"="Yes","GABPA"="Yes","NFYA"="Yes",
#                      "ZFP42"="Yes","MYBA"="No","MLX"="Yes","CREM"="Yes","RFX2"="No","TBP"="No","STAT1"="No","SPI1"="Yes","ATF1"="NA")

hascpg["CpG"] <- max(hascpg, na.rm = T)
#hascpg2["CpG"] <- min(hascpg2, na.rm = T)

dt1 <- data.table("Max(P(CpG)):" = hascpg[motif_order],
                  "Max(P(CpG))>0.3:" = hascpg[motif_order] > 0.3)

#"P(CpG2):" = 1-hascpg2[motif_order],
#,"Meth Sensitive?:" = meth_sens[motif_order]

ha1 = HeatmapAnnotation(df = dt1,
                        simple_anno_size = unit(1.5, "mm"),
                        show_annotation_name = FALSE,
    col = list("Has CpG?:"=c("Yes"="purple","No"="grey"),
               "Meth Sensitive?:"=c("Yes"="dark green","No"="grey","NA"="grey28"),
               "Max(P(CpG))>0.3:"=c("TRUE"="purple","FALSE"="grey"),
               "Max(P(CpG)):"=circlize::colorRamp2(c(min(hascpg), max(hascpg)), c("white", "red"))), #colorRamp2(seq(0, 1, by = 0.25), viridisLite::viridis(5, direction = -1))
               annotation_legend_param = list(legend_direction="horizontal",
                                              ncol=2,
                                              labels_gp = gpar(fontsize = 7),
                                              title_position = "leftcenter")
)

plot heatmap

pdf(height=6, width=8.5, file="../results/motifs/motifs_fig_DN.pdf")
draw(
Heatmap(vvv3,
        col = log_colour_scale(vvv3),
        cluster_rows = FALSE,
        cluster_columns = FALSE,
        column_order = motif_gene_names,
        row_order = 1:nrow(vvv3),
        top_annotation = ha1,
        heatmap_legend_param = list(legend_direction = "horizontal", title = "t-value"),
        row_names_gp = gpar(fontsize = 4),
        show_heatmap_legend = T),
split = rep(c("  Somatic","", " Diploid", "Haploid"), c(18,4,32,18)),
row_dend_gp = gpar(fontsize = 3),
annotation_legend_side = "bottom",
heatmap_legend_side = "bottom")
dev.off()
# generic version of above (without annotation and custom ordering)

save_regprobs_heatmap <- function(regprobs_matrix = regprobs_matrix_C2N, file="../results/motifs/motifs_fig_DN_C2N.pdf", width=30){

  vvv3 <- correlate_regprobs(regprobs = regprobs_matrix)

vvv3 <- vvv3[ordering[-half_exclusions],]
rownames(vvv3) <- paste(rownames(vvv3), component_names[-half_exclusions])

  pdf(height=5, width=width, file=file)
  draw(Heatmap(vvv3,
        col = log_colour_scale(vvv3),
        cluster_rows = FALSE,
        row_order = 1:nrow(vvv3),
        heatmap_legend_param = list(legend_direction = "horizontal", title = "t-value"),
        row_names_gp = gpar(fontsize = 3),
        column_names_gp = gpar(fontsize = 3),
        show_heatmap_legend = T),
    split = rep(c("  Somatic","", " Diploid", "Haploid"), c(18,4,32,18)),
    heatmap_legend_side = "bottom")
  dev.off()

}

More motifs

# select motifs, HCMC regprobs
save_regprobs_heatmap(regprobs_matrix, "../results/motifs/motifs_fig_DN_HCMCregprobs_subset.pdf", width=5)

# all denovo motifs, but using hocomoco regprobs
save_regprobs_heatmap(regprobs_matrix_all, "../results/motifs/motifs_fig_DN_HCMCregprobs.pdf", width=10)

# All denovo motifs, denovo regprobbs
save_regprobs_heatmap(regprobs_matrix_custom, "../results/motifs/motifs_fig_DN_All.pdf", width=10)

All Hocomoco Motifs

regprobs_matrix_core <- sapply(results_core, function(x) x$dt$regprob)
rownames(regprobs_matrix_core) <- results_core[[1]]$dt$sequence

save_regprobs_heatmap(regprobs_matrix_core, "../results/motifs/motifs_fig_HM_core.pdf", width=100)

save_regprobs_heatmap(HM11_regprobs[,1:300], "../results/motifs/motifs_fig_HM11_regprobsA.pdf", width=100)
save_regprobs_heatmap(HM11_regprobs[,301:600], "../results/motifs/motifs_fig_HM11_regprobsB.pdf", width=100)
save_regprobs_heatmap(HM11_regprobs[,601:900], "../results/motifs/motifs_fig_HM11_regprobsC.pdf", width=100)
save_regprobs_heatmap(HM11_regprobs[,901:1276], "../results/motifs/motifs_fig_HM11_regprobsD.pdf", width=100)

All Denovo motifs from single components

results_C2N <- readRDS("../data/motifs/MF_results_C2N_V3.rds")
regprobs_matrix_C2N <- sapply(results_C2N, function(x) x$dt$regprob)
rownames(regprobs_matrix_C2N) <- results_C2N[[1]]$dt$sequence

regprobs_matrix_C2N <- regprobs_matrix_C2N[which(rownames(regprobs_matrix_C2N) %in% names(tss_seqs_s)), ]

save_regprobs_heatmap(regprobs_matrix_C2N, "../results/motifs/motifs_fig_DN_C2N.pdf", width = 7)

#####


results_C5N <- readRDS("../data/motifs/MF_results_C5N_V3.rds")
regprobs_matrix_C5N <- sapply(results_C5N, function(x) x$dt$regprob)
rownames(regprobs_matrix_C5N) <- results_C5N[[1]]$dt$sequence

regprobs_matrix_C5N <- regprobs_matrix_C5N[which(rownames(regprobs_matrix_C5N) %in% names(tss_seqs_s)), ]

save_regprobs_heatmap(regprobs_matrix_C5N, "../results/motifs/motifs_fig_DN_C5N.pdf", width = 7)

######

results_C11P <- readRDS("../data/motifs/MF_results_C11P_V3.rds")
regprobs_matrix_C11P <- sapply(results_C11P, function(x) x$dt$regprob)
rownames(regprobs_matrix_C11P) <- results_C11P[[1]]$dt$sequence

regprobs_matrix_C11P <- regprobs_matrix_C11P[which(rownames(regprobs_matrix_C11P) %in% names(tss_seqs_s)), ]

save_regprobs_heatmap(regprobs_matrix_C11P, "../results/motifs/motifs_fig_DN_C11P.pdf", width = 7)

######

results_C3P <- readRDS("../data/motifs/MF_results_C3P_V3.rds")
regprobs_matrix_C3P <- sapply(results_C3P, function(x) x$dt$regprob)
rownames(regprobs_matrix_C3P) <- results_C3P[[1]]$dt$sequence

regprobs_matrix_C3P <- regprobs_matrix_C3P[which(rownames(regprobs_matrix_C3P) %in% names(tss_seqs_s)), ]

save_regprobs_heatmap(regprobs_matrix_C3P, "../results/motifs/motifs_fig_DN_C3P.pdf", width = 7)


######

results_novel_A <- readRDS("../data/motifs/MF_results_novel_A.rds")
regprobs_matrix_novel_A <- sapply(results_novel_A, function(x) x$dt$regprob)
rownames(regprobs_matrix_novel_A) <- results_novel_A[[1]]$dt$sequence

save_regprobs_heatmap(regprobs_matrix_novel_A, "../results/motifs/motifs_fig_DN_novelA.pdf", width=15)



results_novel_B <- readRDS("../data/motifs/MF_results_novel_B.rds")
regprobs_matrix_novel_B <- sapply(results_novel_B, function(x) x$dt$regprob)
rownames(regprobs_matrix_novel_B) <- results_novel_B[[1]]$dt$sequence

save_regprobs_heatmap(regprobs_matrix_novel_B, "../results/motifs/motifs_fig_DN_novelB.pdf", width=15)

Distribution of Positions relative to TSS

whichpos_matrix_custom <- sapply(results_custom, function(x) x$dt[order(-regprob)][1:3000]$whichpos)

whichpos_dt <- melt(data.table(t(whichpos_matrix_custom[,motif_order[c(-1)]]), keep.rownames = T), id.vars = "rn")

whichpos_dt[,rn := factor(rn, levels=motif_order[c(-1)])]

ggplot(whichpos_dt, aes(value-500)) +
  geom_density() +
  facet_wrap(~rn, scales="free_y") +
  geom_vline(xintercept = 0, col='red') + 
  xlab("Distance from TSS") + 
  ylab("Relative Density")

Attempt: expression due to single TF

common_genes <- rownames(regprobs_matrix_custom)[rownames(regprobs_matrix_custom) %in% colnames(data)]

tmp <- data[,common_genes] * matrix(rep(t(regprobs_matrix_custom[common_genes,"CREMT",drop=F]),nrow(data)),nrow=nrow(data),byrow = T)
tmp2 <- Matrix::rowMeans(tmp)
ggplot(data.table(Stra8=tmp2, cell=names(tmp2))[cell_data,on="cell"], aes(Tsne1_QC1, Tsne2_QC1, colour=Stra8)) + geom_point(stroke=0, size=1) + scale_colour_viridis(direction = -1)


MyersGroup/testisAtlas documentation built on Nov. 29, 2020, 9:21 p.m.