########################################
## Check expression data
########################################
rm(list=ls())
#library('biomaRt')
library(gplots)
setwd("~/rosalind_lustre/mourikisa/data/OAC/")
## Get the data for the new samples
fns = list.files("rnaseq_data/FPKM_Juliane_03_12_17/counts/", pattern = "txt")
mappings_16 = read.table("rnaseq_data/old/16_samples/wgs_rnaseq_mapping.txt", header = F, sep="\t")
colnames(mappings_16) = c("id1", "sample", "id")
mappings_16 = mappings_16 %>% dplyr::select(sample, id)
mappings_19 = read.table("rnaseq_data/old/19_samples/rna_seq_mapping.txt", header = F, sep="\t")
colnames(mappings_19) = c("id1", "sample", "id")
mappings_19 = mappings_19 %>% mutate(id2=gsub("(\\.[^\\.]*)$", "", id)) %>%
mutate(id2=gsub("(\\.[^\\.]*)$", "", id2)) %>%
dplyr::select(-id) %>% rename(id=id2) %>%
dplyr::select(sample, id)
mappings = rbind(mappings_16, mappings_19)
fpkms = NULL
for(fn in fns){
d = read.table(paste0("rnaseq_data/FPKM_Juliane_03_12_17/counts/", fn), header = T, sep="\t")
d = d %>% tibble::rownames_to_column() %>% mutate(fn=fn) %>% rename(gene=rowname)
s = unlist(strsplit(fn, "_primaryReads"))[1]
d = d %>% mutate(id=s)
d = d %>% left_join(mappings)
fpkms = rbind(fpkms, d)
}
## Exclude sample LP6005690-DNA_F02 because this was sequenced twice - the one excluded sequenced in the pilot of ICGC
fpkms = fpkms %>% subset(id!="SLX-10113.D705_D502")
load("~/rosalind_lustre/mourikisa/data/geneInfoNCG5_2.Rdata")
geneInfo = geneInfo %>% subset(duplicability!=-1) %>% dplyr::select(entrez, symbol, refdna) %>%
mutate(refdna=strsplit(refdna, ";")) %>% unnest()
fpkms = fpkms %>% rename(refdna=gene) %>% left_join(geneInfo)
save(fpkms, file="rnaseq_data/FPKM_Juliane_03_12_17/fpkms.Rdata")
# ## To check if I can reduce the lost NMs via biomart
# ## convert NMs ids to entrez ids
# mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl"))
# genes <- unique(fpkms$refdna)
# glist <- getBM(filters= "refseq_mrna", attributes= c("refseq_mrna", "entrezgene", "hgnc_symbol"),values=genes,mart= mart)
# glist = glist %>% rename(refdna=refseq_mrna, entrez_biomart=entrezgene, hgnc_symbol_biomart=hgnc_symbol)
# detach("package:biomaRt", unload=TRUE)
# ## Sometime I run the script from this bit that's why the next couple of lines are repeated
# ## I will intersect the list from biomart with NCG
# load("~/rosalind_lustre/mourikisa/data/geneInfoNCG5_2.Rdata")
# geneInfo = geneInfo %>% subset(duplicability!=-1) %>% dplyr::select(entrez, symbol, refdna) %>%
# mutate(refdna=strsplit(refdna, ";")) %>% unnest()
# glist = glist %>% subset(entrez_biomart%in%geneInfo$entrez)
#
# fpkms = fpkms %>% left_join(glist)
# save(fpkms, file="~/rosalind_lustre/mourikisa/data/OAC/rnaseq_data/fpkms_plus_biomart_annotation.Rdata")
## I collapse different mRNAs and take the max fpkm
expression = fpkms %>% subset(!is.na(entrez)) %>% group_by(sample, entrez, symbol) %>% summarise(R.FPKM_union=max(R.FPKM_union), R.FPKM_intersectionNotEmpt=max(R.FPKM_intersectionNotEmpt)) %>% ungroup()
load("~/rosalind_lustre/mourikisa/data/geneInfoNCG5_2.Rdata")
load("~/rosalind_lustre/mourikisa/data/OAC/Combined_ICGC_cohort_129_66_71/sysSVM/OAC/training_set_noScale.Rdata")
load("~/rosalind_lustre/mourikisa/data/OAC/Combined_ICGC_cohort_129_66_71/sysSVM/OAC/validation_set_noScale.Rdata")
cohort = rbind(training_ns%>% tibble::rownames_to_column(), validation_ns%>% tibble::rownames_to_column()%>%mutate(type="P"))
cohort = cohort %>% separate(rowname, into=c("ct", "sample", "entrez"), sep="\\.") %>%
separate(sample, into=c("sample", "normal"), sep="_vs_") %>%
mutate(entrez=as.numeric(entrez))
cohort = cohort %>% left_join(geneInfo%>%subset(duplicability!=-1)%>%dplyr::select(entrez, symbol, cancer_type)%>%rename(gene_type=cancer_type))
## Check how many samples in our cohort
venn(list(rnaseq=unique(fpkms$sample), samples_161=unique(cohort$sample)))
## Select only samples in our cohort
expression = expression %>% subset(sample%in%cohort$sample)
## Amplified helpers
load("~/rosalind_lustre/mourikisa/data/OAC/Combined_ICGC_cohort_129_66_71/sysSVM/OAC/patient_stratification_onlyTop10.Rdata")
dt = d
dt = dt %>% separate(sample, into=c("sample", "normal"), sep="_vs_")
a = dt %>% subset(CNVGain==1) %>% dplyr::select(sample, entrez, symbol) %>% unique %>% mutate(type="Amplification") %>% mutate(key=paste0(sample, ".", entrez))
## Plots for amplifications (I decided to go with entrez ids) & get rid of some duplicated entrez due to the conversion
toPlot = expression %>% subset(entrez%in%a$entrez) %>% mutate(key=paste0(sample, ".", entrez))
genesWithAtLeastOneAmp = toPlot %>% subset(key%in%a$key) %>% .$entrez %>% unique ## we do that because we want only those that are amplified in at least one sample
toPlot = toPlot %>% subset(entrez%in%genesWithAtLeastOneAmp) %>% mutate(type=ifelse(key %in% a$key, "Amplification", "WT"))
toPlot = toPlot %>% left_join(toPlot %>% group_by(type) %>% summarise(n=n())) %>% mutate(label=paste0(type, "(n=", n, ")"))
ggplot(toPlot, aes(x=label, y=R.FPKM_union)) + geom_violin() + geom_boxplot(width=0.1) +
scale_y_log10(breaks=c(-10, 0, 1, 3, 5, 10, 25, 50, seq(min(toPlot$R.FPKM_union), max(toPlot$R.FPKM_union), 100)))+
#theme_boss() +
theme(
axis.title.x = element_blank(),
axis.line.x = element_line(colour = "black"),
axis.line.y = element_line(colour = "black")
) +
ylab("log10(FPKM)") + xlab("") + ggtitle("Expression comparison for 202 amplified helper genes in 32 samples")
write.table(toPlot, file="~/Desktop/amplified_vs_WT_helpers.tsv", row.names = F, quote = F, sep = "\t")
print(wilcox.test(toPlot$FPKM[toPlot$type=="Amplification"], toPlot$FPKM[toPlot$type=="WT"]))
print(summary(toPlot$FPKM[toPlot$type=="Amplification"]))
print(summary(toPlot$FPKM[toPlot$type=="WT"]))
print(wilcox.test(toPlot$R.FPKM_union[toPlot$type=="Amplification"], toPlot$R.FPKM_union[toPlot$type=="WT"]))
print(summary(toPlot$R.FPKM_union[toPlot$type=="Amplification"]))
print(summary(toPlot$R.FPKM_union[toPlot$type=="WT"]))
print(wilcox.test(toPlot$R.FPKM_intersectionNotEmpt[toPlot$type=="Amplification"], toPlot$R.FPKM_intersectionNotEmpt[toPlot$type=="WT"]))
print(summary(toPlot$R.FPKM_intersectionNotEmpt[toPlot$type=="Amplification"]))
print(summary(toPlot$R.FPKM_intersectionNotEmpt[toPlot$type=="WT"]))
## Amplified drivers
load("~/rosalind_lustre/mourikisa/data/OAC/Combined_ICGC_cohort_129_66_71/sysSVM/OAC/patient_stratification_onlyCGC.Rdata")
dc = d
dc = dc %>% separate(sample, into=c("sample", "normal"), sep="_vs_")
a = dc %>% subset(CNVGain==1) %>% dplyr::select(sample, entrez, symbol) %>% unique %>% mutate(type="Amplification") %>% mutate(key=paste0(sample, ".", entrez))
toPlot = expression %>% subset(entrez%in%a$entrez) %>% mutate(key=paste0(sample, ".", entrez))
genesWithAtLeastOneAmp = toPlot %>% subset(key%in%a$key) %>% .$entrez %>% unique ## we do that because we want only those that are amplified in at least one sample
toPlot = toPlot %>% subset(entrez%in%genesWithAtLeastOneAmp) %>% mutate(type=ifelse(key %in% a$key, "Amplification", "WT"))
toPlot = toPlot %>% left_join(toPlot %>% group_by(type) %>% summarise(n=n())) %>% mutate(label=paste0(type, "(n=", n, ")"))
print(wilcox.test(toPlot$R.FPKM_union[toPlot$type=="Amplification"], toPlot$R.FPKM_union[toPlot$type=="WT"]))
print(summary(toPlot$R.FPKM_union[toPlot$type=="Amplification"]))
print(summary(toPlot$R.FPKM_union[toPlot$type=="WT"]))
print(wilcox.test(toPlot$R.FPKM_intersectionNotEmpt[toPlot$type=="Amplification"], toPlot$R.FPKM_intersectionNotEmpt[toPlot$type=="WT"]))
print(summary(toPlot$R.FPKM_intersectionNotEmpt[toPlot$type=="Amplification"]))
print(summary(toPlot$R.FPKM_intersectionNotEmpt[toPlot$type=="WT"]))
## Amplified genes overall
t = cohort
a = t %>% subset(CNVGain==1) %>% dplyr::select(sample, entrez, symbol) %>% unique %>% mutate(type="Amplification") %>% mutate(key=paste0(sample, ".", entrez))
toPlot = expression %>% subset(entrez%in%a$entrez) %>% mutate(key=paste0(sample, ".", entrez))
genesWithAtLeastOneAmp = toPlot %>% subset(key%in%a$key) %>% .$entrez %>% unique ## we do that because we want only those that are amplified in at least one sample
toPlot = toPlot %>% subset(entrez%in%genesWithAtLeastOneAmp) %>% mutate(type=ifelse(key %in% a$key, "Amplification", "WT"))
toPlot = toPlot %>% left_join(toPlot %>% group_by(type) %>% summarise(n=n())) %>% mutate(label=paste0(type, "(n=", n, ")"))
print(wilcox.test(toPlot$R.FPKM_union[toPlot$type=="Amplification"], toPlot$R.FPKM_union[toPlot$type=="WT"]))
print(summary(toPlot$R.FPKM_union[toPlot$type=="Amplification"]))
print(summary(toPlot$R.FPKM_union[toPlot$type=="WT"]))
print(wilcox.test(toPlot$R.FPKM_intersectionNotEmpt[toPlot$type=="Amplification"], toPlot$R.FPKM_intersectionNotEmpt[toPlot$type=="WT"]))
print(summary(toPlot$R.FPKM_intersectionNotEmpt[toPlot$type=="Amplification"]))
print(summary(toPlot$R.FPKM_intersectionNotEmpt[toPlot$type=="WT"]))
##############
## Old code
##############
fns = list.files("rnaseq_data/16_samples/")
fns = fns[!grepl("mapping", fns)]
rpkms_16 = data.frame()
for(fn in fns){
d = read.table(paste0("rnaseq_data/16_samples/", fn), header = T, sep="\t")
d = d %>% tibble::rownames_to_column() %>% mutate(id=gsub("_rpkm.txt", "", fn)) %>% rename(gene=rowname)
rpkms_16 = rbind(rpkms_16, d)
}
## Get the mappings
mappings_16 = read.table("rnaseq_data/16_samples/wgs_rnaseq_mapping.txt", header = F, sep="\t")
colnames(mappings_16) = c("id1", "sample", "id")
mappings_16 = mappings_16 %>% dplyr::select(sample, id)
rpkms_16 = rpkms_16 %>% left_join(mappings_16)
###########################################################
## C = Number of reads mapped to a gene
## N = Total mapped reads in the experiment
## L = exon length in base-pairs for a gene
## Equation: RPKM = (10^9 * C)/(N * L)
###########################################################
## Get RPKM
samples2N = rpkms_16 %>% group_by(id) %>% summarise(N=sum(readcounts))
rpkms_16 = rpkms_16 %>% left_join(samples2N)
rpkms_16 = rpkms_16 %>% mutate(rpkm=exp( log(readcounts) + log(1e9) - log(genelength) - log(N) ))
## Get the data for the old samples
fns = list.files("rnaseq_data/19_samples/")
fns = fns[!grepl("mapping", fns)]
rpkms_19 = data.frame()
for(fn in fns){
d = read.table(paste0("rnaseq_data/19_samples/", fn), header = T, sep="\t")
d = d %>% tibble::rownames_to_column() %>% mutate(id=gsub("_rpkm.txt", "", fn)) %>% rename(gene=rowname)
rpkms_19 = rbind(rpkms_19, d)
}
## Get the mappings
mappings_19 = read.table("rnaseq_data/19_samples/rna_seq_mapping.txt", header = F, sep="\t")
colnames(mappings_19) = c("id1", "sample", "id")
mappings_19 = mappings_19 %>% mutate(id2=gsub("(\\.[^\\.]*)$", "", id)) %>%
mutate(id2=gsub("(\\.[^\\.]*)$", "", id2)) %>%
dplyr::select(-id) %>% rename(id=id2) %>%
dplyr::select(sample, id)
rpkms_19 = rpkms_19 %>% left_join(mappings_19)
## Exclude sample LP6005690-DNA_F02 because this was sequenced twice
rpkms_19 = rpkms_19 %>% subset(sample!="LP6005690-DNA_F02")
## Get RPKM also for this one
samples2N = rpkms_19 %>% group_by(id) %>% summarise(N=sum(readcounts))
rpkms_19 = rpkms_19 %>% left_join(samples2N)
rpkms_19 = rpkms_19 %>% mutate(rpkm=exp( log(readcounts) + log(1e9) - log(genelength) - log(N) ))
## Combine the two tables
rpkms = rbind(rpkms_16, rpkms_19 %>% dplyr::select(-R.FPKM))
## convert ensembl ids to entrez ids
mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl"))
genes <- rpkms$gene
genes = gsub("\\..*","",genes)
genes = unique(genes)
G_list <- getBM(filters= "ensembl_gene_id", attributes= c("ensembl_gene_id", "entrezgene", "hgnc_symbol"),values=genes,mart= mart)
G_list = G_list %>% rename(gene=ensembl_gene_id)
G_list = G_list %>% subset(!duplicated(gene)) ## some ensembl ids return more than one symbols (they are only 10 anyways)
rpkms$gene = gsub("\\..*","",rpkms$gene)
rpkms = rpkms %>% left_join(G_list)
save(rpkms, file="~/athena/data/OAC/rnaseq_data/rpkm.Rdata")
load("~/rosalind_lustre/mourikisa/data/OAC/Combined_ICGC_cohort_129_66_71/sysSVM/OAC/training_set_noScale.Rdata")
load("~/rosalind_lustre/mourikisa/data/OAC/Combined_ICGC_cohort_129_66_71/sysSVM/OAC/validation_set_noScale.Rdata")
cohort = rbind(training_ns%>% tibble::rownames_to_column(), validation_ns%>% tibble::rownames_to_column()%>%mutate(type="P"))
cohort = cohort %>% separate(rowname, into=c("ct", "sample", "entrez"), sep="\\.") %>%
separate(sample, into=c("sample", "normal"), sep="_vs_") %>%
mutate(entrez=as.numeric(entrez))
load("~/rosalind_lustre/mourikisa/data/geneInfoNCG5_2.Rdata")
cohort = cohort %>% left_join(geneInfo%>%subset(duplicability!=-1)%>%dplyr::select(entrez, symbol, cancer_type)%>%rename(gene_type=cancer_type))
## Check how many samples in our cohort
venn(list(rnaseq=unique(rpkms$sample), samples_161=unique(cohort$sample)))
rpkms = rpkms %>% subset(sample%in%cohort$sample)
## Exclude genes that cannot be mapped to entrez/symbol
rpkms = rpkms %>% subset(!(is.na(entrezgene) & is.na(hgnc_symbol)))
## Amplified helpers
load("~/rosalind_lustre/mourikisa/data/OAC/Combined_ICGC_cohort_129_66_71/sysSVM/OAC/gsea.withAmp.top10.plusCGC.containers.corrected.Rdata")
t = genes2paths %>% subset(gene_type!="cgc")
t = t %>% separate(sample, into=c("sample", "normal"), sep="_vs_")
a = t %>% subset(CNVGain==1) %>% dplyr::select(sample, entrez, symbol) %>% unique %>% mutate(type="Amplification") %>% mutate(key=paste0(sample, ".", entrez))
## Plots for amplifications (I decided to go with entrez ids) & get rid of some duplicated entrez due to the conversion
toPlot = rpkms %>% subset(entrezgene%in%a$entrez) %>% mutate(key=paste0(sample, ".", entrezgene)) %>% arrange(sample, desc(rpkm)) %>% subset(!duplicated(key))
genesWithAtLeastOneAmp = toPlot %>% subset(key%in%a$key) %>% .$entrezgene %>% unique ## we do that because we want only those that are amplified in at least one sample
toPlot = toPlot %>% subset(entrezgene%in%genesWithAtLeastOneAmp) %>% mutate(type=ifelse(key %in% a$key, "Amplification", "WT"))
toPlot = toPlot %>% left_join(toPlot %>% group_by(type) %>% summarise(n=n())) %>% mutate(label=paste0(type, "(n=", n, ")"))
ggplot(toPlot, aes(x=label, y=rpkm)) + geom_violin() + geom_boxplot(width=0.1) +
scale_y_log10(breaks=c(0, 1, 3, 5, 10, 25, 50, seq(min(toPlot$rpkm), max(toPlot$rpkm), 100)))+
theme_boss() +
theme(
axis.title.x = element_blank(),
axis.line.x = element_line(colour = "black"),
axis.line.y = element_line(colour = "black")
) +
ylab("Expression (RPKM)") + xlab("") + ggtitle("Expression comparison for 209 amplified helper genes in 32 samples \n (top 10 plus CGC)")
write.table(toPlot, file="~/Desktop/amplified_helpers.tsv", row.names = F, quote = F, sep = "\t")
print(wilcox.test(toPlot$rpkm[toPlot$type=="Amplification"], toPlot$rpkm[toPlot$type=="WT"]))
print(wilcox.test(toPlot$rpkm[toPlot$type=="Amplification"], toPlot$rpkm[toPlot$type=="WT"], alternative = "greater"))
print(summary(toPlot$rpkm[toPlot$type=="Amplification"]))
print(summary(toPlot$rpkm[toPlot$type=="WT"]))
## Amplified drivers
load("~/rosalind_lustre/mourikisa/data/OAC/Combined_ICGC_cohort_129_66_71/sysSVM/OAC/gsea.withAmp.top10.plusCGC.containers.corrected.Rdata")
t = genes2paths %>% subset(gene_type=="cgc")
t = t %>% separate(sample, into=c("sample", "normal"), sep="_vs_")
a = t %>% subset(CNVGain==1) %>% dplyr::select(sample, entrez, symbol) %>% unique %>% mutate(type="Amplification") %>% mutate(key=paste0(sample, ".", entrez))
toPlot = rpkms %>% subset(entrezgene%in%a$entrez) %>% mutate(key=paste0(sample, ".", entrezgene)) %>% arrange(sample, desc(rpkm)) %>% subset(!duplicated(key))
genesWithAtLeastOneAmp = toPlot %>% subset(key%in%a$key) %>% .$entrezgene %>% unique ## we do that because we want only those that are amplified in at least one sample
toPlot = toPlot %>% subset(entrezgene%in%genesWithAtLeastOneAmp) %>% mutate(type=ifelse(key %in% a$key, "Amplification", "WT"))
toPlot = toPlot %>% left_join(toPlot %>% group_by(type) %>% summarise(n=n())) %>% mutate(label=paste0(type, "(n=", n, ")"))
ggplot(toPlot, aes(x=label, y=rpkm)) + geom_violin() + geom_boxplot(width=0.1) +
scale_y_log10(breaks=c(0, 1, 3, 5, 10, 25, 50, seq(min(toPlot$rpkm), max(toPlot$rpkm), 100)))+
theme_boss() +
theme(
axis.title.x = element_blank(),
axis.line.x = element_line(colour = "black"),
axis.line.y = element_line(colour = "black")
) +
ylab("Expression (RPKM)") + xlab("") + ggtitle("Expression comparison for 52 amplified driver genes (top 10 plus CGC)")
write.table(toPlot, file="~/Desktop/amplified_drivers.tsv", row.names = F, quote = F, sep = "\t")
print(wilcox.test(toPlot$rpkm[toPlot$type=="Amplification"], toPlot$rpkm[toPlot$type=="WT"]))
print(wilcox.test(toPlot$rpkm[toPlot$type=="Amplification"], toPlot$rpkm[toPlot$type=="WT"], alternative = "greater"))
print(summary(toPlot$rpkm[toPlot$type=="Amplification"]))
print(summary(toPlot$rpkm[toPlot$type=="WT"]))
## Amplified genes in prediction set
t = cohort %>% subset(gene_type!="cgc")
a = t %>% subset(CNVGain==1) %>% dplyr::select(sample, entrez, symbol) %>% unique %>% mutate(type="Amplification") %>% mutate(key=paste0(sample, ".", entrez))
toPlot = rpkms %>% subset(entrezgene%in%a$entrez) %>% mutate(key=paste0(sample, ".", entrezgene)) %>% arrange(sample, desc(rpkm)) %>% subset(!duplicated(key))
genesWithAtLeastOneAmp = toPlot %>% subset(key%in%a$key) %>% .$entrezgene %>% unique ## we do that because we want only those that are amplified in at least one sample
toPlot = toPlot %>% subset(entrezgene%in%genesWithAtLeastOneAmp) %>% mutate(type=ifelse(key %in% a$key, "Amplification", "WT"))
toPlot = toPlot %>% left_join(toPlot %>% group_by(type) %>% summarise(n=n())) %>% mutate(label=paste0(type, "(n=", n, ")"))
ggplot(toPlot, aes(x=label, y=rpkm)) + geom_violin() + geom_boxplot(width=0.1) +
scale_y_log10(breaks=c(0, 1, 3, 5, 10, 25, 50, 100, 500, 1000, 2000, seq(min(toPlot$rpkm), max(toPlot$rpkm), 3000)))+
theme_boss() +
theme(
axis.title.x = element_blank(),
axis.line.x = element_line(colour = "black"),
axis.line.y = element_line(colour = "black")
) +
ylab("Expression (RPKM)") + xlab("") + ggtitle("Expression comparison for 5,039 amplified genes in prediction set")
write.table(toPlot, file="~/Desktop/amplified_prediction_set.tsv", row.names = F, quote = F, sep = "\t")
print(wilcox.test(toPlot$rpkm[toPlot$type=="Amplification"], toPlot$rpkm[toPlot$type=="WT"]))
print(wilcox.test(toPlot$rpkm[toPlot$type=="Amplification"], toPlot$rpkm[toPlot$type=="WT"], alternative = "greater"))
print(summary(toPlot$rpkm[toPlot$type=="Amplification"]))
print(summary(toPlot$rpkm[toPlot$type=="WT"]))
## Check individual genes
## E2F
load("~/rosalind_lustre/mourikisa/data/OAC/Combined_ICGC_cohort_129_66_71/sysSVM/OAC/gsea.withAmp.top10.plusCGC.containers.corrected.Rdata")
t = genes2paths %>% subset(grepl("E2F", symbol))
t = t %>% separate(sample, into=c("sample", "normal"), sep="_vs_")
rpkms %>% subset(sample%in%t$sample) %>% select(sample) %>% unique
rpkms %>% subset(entrezgene%in%t$entrez) %>% select(entrezgene) %>% unique %>% nrow
genes = c("E2F1", "E2F2", "E2F3", "E2F4", "E2F5" )
plots = list()
for(g in genes){
s_amp = t %>% subset(symbol==g) %>% .$sample %>% unique
length(s_amp)
expr = rpkms %>% subset(entrezgene==unique(t$entrez[t$symbol==g]))
expr = expr %>% mutate(type=ifelse(sample%in%s_amp, "Amplification", "WT"))
expr = expr %>% left_join(expr %>% count(type)) %>% mutate(label=paste0(type, "(n=", n, ")"))
p = ggplot(expr, aes(x=type, y=rpkm)) + geom_boxplot(width=0.1) +
geom_point() +
theme_boss() +
theme(
axis.title.x = element_blank(),
axis.line.x = element_line(colour = "black"),
axis.line.y = element_line(colour = "black")
) +
ylab(paste0(g, " expression (RPKM)")) + xlab("")
plots[[g]] = p
}
grid.arrange(plots[[1]], plots[[2]], plots[[3]], plots[[4]], plots[[5]], ncol=3)
## Bring Copy number in to select the gene
toPlot = toPlot %>% left_join(syscans%>%dplyr::select(sample, symbol, CNVGain, Copy_number)%>%unique)
## Bring also whether they have
sigma = read_xlsx("~/Desktop/amplified_genes_no_cgc_sigma.xlsx", 1) %>% rename(symbol=Gene)
toPlot = toPlot %>% left_join(sigma)
write.table(toPlot, file="~/Desktop/amplifications_vs_WT_syscans.tsv", quote = F, row.names = F, sep="\t")
## Plots for Homozygous deletions
homs = syscans %>% subset(Copy_number==0 & symbol%in%h) %>% dplyr::select(sample, symbol) %>% unique %>% mutate(type="HomDel")
toPlot = rpkms %>% subset(hgnc_symbol%in%h) %>% rename(symbol=hgnc_symbol) %>% left_join(homs) %>% mutate(type=ifelse(is.na(type), "WT", type))
ggplot(toPlot, aes(x=type, y=rpkm)) + geom_boxplot() +
scale_y_continuous(limits = c(0,10)) +
theme_boss() +
theme(
axis.title.x = element_blank(),
axis.line.x = element_line(colour = "black"),
axis.line.y = element_line(colour = "black")
)
print(wilcox.test(toPlot$rpkm[toPlot$type=="HomDel"], toPlot$rpkm[toPlot$type=="WT"]))
print(summary(toPlot$rpkm[toPlot$type=="HomDel"]))
print(summary(toPlot$rpkm[toPlot$type=="WT"]))
print(length(toPlot$rpkm[toPlot$type=="HomDel"]))
print(length(toPlot$rpkm[toPlot$type=="WT"]))
print(length(unique(toPlot$entrez[toPlot$type=="HomDel"])))
print(length(unique(toPlot$sample[toPlot$type=="HomDel"])))
print(length(unique(toPlot$entrez[toPlot$type=="WT"])))
print(length(unique(toPlot$sample[toPlot$type=="WT"])))
write.table(toPlot, file="~/Desktop/homdels_vs_WT_syscans.tsv", quote = F, row.names = F, sep="\t")
## Get the genes we selected
cans = c("KAT2A", "KAT2B", "ROCK1", "ABI2", "PTK2", "IQGAP1", "COL4A1", "EIF4G1", "NBEA")
plots = list()
for (c in cans){
print(c)
amplifs = syscans %>% subset(CNVGain==1 & symbol==c) %>% dplyr::select(sample, symbol) %>% unique %>% mutate(type="A")
if(sum(amplifs$sample%in%unique(rpkms$sample))==0){
stop("None of the samples has expression")
}
toPlot = rpkms %>% subset(hgnc_symbol==c) %>% rename(symbol=hgnc_symbol) %>% left_join(amplifs) %>% mutate(type=ifelse(is.na(type), "WT", type))
p = ggplot(toPlot, aes(x=type, y=rpkm)) + geom_boxplot() +
scale_y_continuous(limits = c(0,100)) +
theme_boss() +
theme(
axis.title.x = element_blank(),
axis.line.x = element_line(colour = "black"),
axis.line.y = element_line(colour = "black")
)
plots[[c]] = p
print(wilcox.test(toPlot$rpkm[toPlot$type=="A"], toPlot$rpkm[toPlot$type=="WT"]))
print(summary(toPlot$rpkm[toPlot$type=="A"]))
print(summary(toPlot$rpkm[toPlot$type=="WT"]))
print(length(toPlot$rpkm[toPlot$type=="A"]))
print(length(toPlot$rpkm[toPlot$type=="WT"]))
print(length(unique(toPlot$entrez[toPlot$type=="A"])))
print(length(unique(toPlot$sample[toPlot$type=="A"])))
print(length(unique(toPlot$entrez[toPlot$type=="WT"])))
print(length(unique(toPlot$sample[toPlot$type=="WT"])))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.