# start off with merged, normalised data - chip + rna
# taken from test_masking.r
# start_data = max reg region over gene body
# LOOP THROUGH RELEVANT LUNG GO TERMS -------------------------------------
require(qusage)
msig_go_bp = read.gmt("c5.all.v6.1.symbols.gmt")
lung_go = data.frame(
ids = c(
"GO:0006855","GO:0042908","GO:0006281","GO:0004033","GO:0006805","GO:0009494","GO:0017144",
"GO:0002314","GO:0002377","GO:0002467","GO:0016064","GO:0019882","GO:0042571","GO:0006915",
"GO:0008219","GO:0000302","GO:0072593","GO:0006802","GO:0006979","GO:0034599","GO:0010193",
"GO:0070994","GO:0000303","GO:0006801","GO:0006749","GO:0033355","GO:0034635","GO:0036347",
"GO:1901370","GO:0006809","GO:0071731","GO:0007263","GO:0035713","GO:0046210","GO:0042744",
"GO:0050665","GO:0045071","GO:0051607","GO:0050691","GO:0045087","GO:0045088","GO:0045089",
"GO:0002218","GO:0002227","GO:0008063","GO:0034121","GO:0002224","GO:0001816","GO:0001817",
"GO:0001819","GO:0002775","GO:0002794","GO:0002777","GO:0032602","GO:0032642","GO:0001875",
"GO:0071222","GO:0038187","GO:1904019"
),
names=NA,
genes=NA
)
# add names
ensembl = useMart("ensembl", dataset="hsapiens_gene_ensembl")
get_names = getBM(attributes=c("name_1006","go_id"), filters="go", values=lung_go, mart=ensembl)
lung_go$names = get_names$name_1006[match(lung_go$ids, get_names$go_id)]
for(i in 1:length(lung_go$ids)) {
print(paste("Query", i))
go_res = getBM(attributes=c("hgnc_symbol","go_id"), filters="go", values=lung_go$ids[i], mart=ensembl)
if(!dim(go_res)[1]) next
lung_go$genes[i] = list(unique(go_res$hgnc_symbol))
}
lung_go = tbl_df(lung_go)
# EXAMPLE -----------------------------------------------------------------
load("z:/sandbox/epiChoose/r_data/column_annotation/gene_list_all.RData")
# pick out the lung and blood samples
sample_ix = c(179:182,196,202,187,193:195,222,251)
# sample labels
single_labels = rownames(start_data[[1]]$res)[sample_ix]
group_labels = c(rep("GSK",10), rep("ENCODE",2))
diff_test <- function(dat, parametric=TRUE, log_s=FALSE) {
if(log_s) {
dat$Score = log(dat$Score+1)
}
if(parametric) {
y = pairlist(pairwise.t.test(dat$Score, dat$`Cell Line`, p.adjust.method="BH", paired=TRUE))
y_df = tidy(y[[1]])
} else {
y = conover.test(dat$Score, dat$`Cell Line`, method="bh", table=TRUE)
y_df = data.frame(comps=y$comparisons, P=y$P.adjusted, test_stat=y$T)
}
return(y_df)
}
all_res = data.frame()
all_res_means = data.frame()
go_list = lung_go$genes
# go_list = msig_go_bp
go_names = lung_go$names
# go_names = names(msig_go_bp)
for(i in 1:length(go_list)) {
if(i %in% c(5598)) next
genes = go_list[[i]]
if(is.null(genes)) next
col_ix = which(gene_list_all$hgnc_symbol %in% genes)
if(length(col_ix)<2) next
end_data = start_data
# slice matrices
for(j in 1:length(end_data)) { # each data type
end_data[[j]]$res = end_data[[j]]$res[sample_ix,col_ix]
}
res = dist_mat(end_data, comp_ix=list(c(1:7,11:12), c(8:10)), labels=single_labels)
comp_ix = c(1,3,15)
y_all = data.frame()
for(j in 1:length(end_data)) {
x = end_data[[j]]$res[comp_ix,]
if(all(is.na(x))) next
y = melt(as.matrix(x))
y = cbind(y, names(end_data[j]))
names(y) = c("Cell Line", "Gene", "Score", "Assay")
if(length(which((filter(y, !is.na(Score)) %>% dplyr::select(Gene) %>% table()) == length(unique(y$`Cell Line`)))) < 2) next # if there are < 2 genes with observations from all cell lines, skip ...
y_all = rbind(y_all, y)
}
y_all = tbl_df(y_all)
# y_all = filter(y_all, !is.na(Score))
ggplot(y_all, aes(x=Gene, y=Score)) + geom_bar(aes(fill=`Cell Line`), position="dodge", stat="identity") + theme_thesis(10) + theme(axis.text.x=element_text(angle=45, hjust=1)) + facet_wrap(~Assay, nrow=2, scales="free")
ggplot(y_all, aes(x=`Cell Line`, y=log(Score+1))) + geom_boxplot() + theme_thesis(10) + facet_wrap(~Assay, nrow=2, scales="free") + theme(axis.text.x=element_text(angle=45, hjust=1))
# ggplot(y_all, aes(x=`Cell Line`, y=Score)) + geom_point() + geom_line(aes(group=Gene)) + theme_thesis(10) + facet_wrap(~Assay, nrow=2, scales="free")
all_res_means = rbind(all_res_means, data.frame(y_all %>% group_by(Assay, `Cell Line`) %>% summarise(mean=mean(Score, na.rm=TRUE)), go=go_names[i]))
all_res = rbind(all_res, data.frame(y_all %>% group_by(Assay) %>% do(diff_test(.,log_s=TRUE)), go=go_names[i], n=length(go_list[[i]])))
}
all_res = tbl_df(all_res)
# ADD LFC -----------------------------------------------------------------
calc_fc <- function(x, all_res_means) {
g1_score = all_res_means$mean[all_res_means$Assay==x[1] & all_res_means$go==x[5] & all_res_means$Cell.Line==x[2]]
g2_score = all_res_means$mean[all_res_means$Assay==x[1] & all_res_means$go==x[5] & all_res_means$Cell.Line==x[3]]
return(log(g1_score, base=2)-log(g2_score, base=2))
}
# add log fold change
all_res$LFC = apply(all_res, 1, calc_fc, all_res_means)
# ANALYSE -----------------------------------------------------------------
# check significant rna + h3k27ac results with a bonferroni cut-off
View(filter(all_res, Assay=="RNA" | Assay=="H3K27ac", p.value<0.05) %>% arrange(p.value))
# GLOBAL ------------------------------------------------------------------
# DISTANCE BY REGULATORY TYPE ---------------------------------------------
# look at overall pc plots splitting by segmentation categories
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.