if(!exists("VERBOSE")) VERBOSE = F
.pardefault <- par(no.readonly = T)
.parpin <-par()$pin
# .pardefault <- par()
stackedGeneBar<-function(tcga_som,
title="Mutation types for the top 20 most mutated genes",
colorMatchesFile="./reference_data/MAFcolorMatches.txt"){
ufilt = tcga_som
gs = summarize_by(col=tcga_som[,"Hugo_Symbol"], display=F)
top20 = head(gs[order(gs[,2],decreasing=T),],20)
gsnames = top20$types
toprows = ufilt[ufilt$Hugo_Symbol%in%gsnames,]
slimrows = toprows[,c("Hugo_Symbol","Variant_Classification")]
slimrows = merge(x=slimrows, y=top20, by.x="Hugo_Symbol", by.y="types")
slimrows = slimrows[order(slimrows$counts),]
slimrows$Hugo_Symbol<-factor(slimrows$Hugo_Symbol, levels=top20$types)
colcols = getColorSequence(cnames=unique(slimrows$Variant_Classification),
fname=colorMatchesFile)
colnames = colors()[colcols]
names(colnames)<-names(colcols)
p1 = ggplot(data=slimrows, aes(x=Hugo_Symbol,
fill=factor(Variant_Classification)))+
geom_bar(color="black")+
coord_flip()+
scale_fill_manual(values=colnames)+
theme_bw()+
theme(legend.title=element_blank())+
ggtitle(title)
print(p1)
}
simpleGGHist<-function(dataSet, xlab, ylab, mainTitle, showPlot=T){
require(ggplot2)
p = qplot(x=as.vector(dataSet), geom="histogram")+
theme_bw()+
xlab(xlab)+
ylab(ylab)+
ggtitle(mainTitle)
if(showPlot){
print(p)
} else {
return(p)
}
}
promptNumeric <- function (prompt) {
while(T){
line=readline(prompt)
if(line=="") break
line = suppressWarnings(expr=as.numeric(line))
if(!is.na(line)) break
print("Sorry, your input could not be understood, please try again")
}
return(line)
}
longTextBarPlot<-function(data, lab, main=""){
bpdata = barplot(data, horiz=T, main=main)
xmin = par("usr")[1]
xrange = par("usr")[2] - par("usr")[1]
xcoord = xmin + (xrange/40)
nlev = length(data)
text(x=xcoord, y=bpdata, labels=lab, pos=4)
}
###############################################################################
#
# helperFunctions.R: This file contains the all helper functions not directly related to any other source file.
# author: Frank Kramer <dev@frankkramer.de>
#
# This is released under GPL-2.
#
# Documentation was created using roxygen
#
###############################################################################
#' Replace factors/levels in a data.frame and use plain strings instead
#'
#' This function takes a data.frame as argument and returns it with strings instead of factors.
#'
#' @param df any data.frame with factor levels in at least one column
#' @return The data.frame is returned using strings instead of factors.
#' @author Frank Kramer
unfactorize <- function (df) {
if(!("data.frame" %in% class(df))) { stop("Error: unfactorize: data.frame as argument expected") }
for (i in 1:ncol(df)) {
if (class(df[,i]) == "factor") df[,i] <- as.character(df[,i])
}
df
}
save.plot<-function(pname){
return(AutoSavePlot(pname))
}
AutoSavePlot<-function(pname){
if(is.null(pname)){
pname = "tmp_image_file"
}
if(!grepl(pattern=".png$", x=pname)) pname = paste(pname,".png",sep="")
pfname = plotFileName(pname)
dev.copy(png,pfname)
dev.off()
return(pfname)
}
plotFileName<-function(pname, addTime=F){
#takes a name, makes sure there's a directory there, adds time stamp to make it unique, adds a postfix to make sure
autoincrament = F
root="./output/imageTemp/"
if(grepl(pattern="[/]",x=pname)){#the a file path was passed, find the root
root = paste(paste(strsplit(x=pname, split="[/]")[[1]][1:(length(strsplit(x=pname, split="[/]")[[1]])-1)],collapse="/"),
"/",
sep="")
pname = strsplit(x=pname, split="[/]")[[1]][length(strsplit(x=pname, split="[/]")[[1]])]#the last part of the path is the actual file name
}
dir.create(path=root, recursive=T, showWarnings=F)
ctime = ""
if(addTime) ctime = gsub(pattern=" ", replacement=".", x=paste0(as.character(Sys.time()),"."))
fullPath = paste(root, "graphic", gsub(pattern="[-:]", replacement=".", x=ctime), pname, sep="")
#check if it exists, if it does, append a number
while(file.exists(fullPath)&autoincrament){
pname = paste("1", pname, sep=".")
fullPath = paste(root, "graphic",gsub(pattern="[-:]", replacement=".", x=as.character(Sys.time())),".", pname, sep="")
}
return(fullPath)
}
#pgvmFromStacked makes a patient gene value matrix from data in stacked format
#rows = genes, columns = patients, values = value for gene, ie, for a gene which has a missense mutation,
# the value would be "missense"
#if more than one value are given for a particular patient-gene combination
#takes: stackedData: data frame with patient gene data in stacked format
# valueCol: the name of the column with the gene values
# patientIdCol: the name of the column with patient ids
# geneCol: the name of the column with the gene identifiers
# blankSymbol: what is placed in the pvm when the stacked file does not give a value
# for a given patient-gene combination
#returns: patient gene value matrix
pgvmFromStacked<-function(stackedData, valueCol, patientIdCol, geneCol, blankSymbol=NA){
#1: find list of unique pids
pids = unique(stackedData[,patientIdCol])
#2: find lis of unique genes
ugenes = unique(stackedData[,geneCol])
#case to handle: if patient does not have a value for a gene,
#it will be set to NULL
pvm = matrix(data = blankSymbol, ncol=length(pids), nrow=length(ugenes), dimnames=list(ugenes, pids))
#3: put values
for(p in pids){
prows = stackedData[stackedData[,patientIdCol] == p,]
for(g in ugenes){
pgrows = prows[prows[,geneCol]==g,]
if(nrow(pgrows) == 1){
pvm[g,p] = pgrows[1,valueCol]
}else if(nrow(pgrows) > 1){
#case to handle: if there are more than one value for a gene..
cat("\nNote: multiple values found for this patient-gene combo:", p, g, "\n")
pvm[g,p] = paste(pgrows[,valueCol], collapse=" ")
}
}
}
return(pvm)
}
PGMFromVector<-function(genevector){
if(is.data.frame(genevector)) genevector = as.matrix(genevector)
if(is.matrix(genevector)) genevector = as.vector(genevector)
pgmout = matrix(data=T,
nrow=length(genevector),
ncol=1,
dimnames=list(genevector, "Genes_in_set"))
return(pgmout)
}
#selectionList()
#prompts the user with a numbered list of selection options
#takes a vector of options to choose from (does not have to be unique set)
#returns logic vector indicating which items in the input vector match the/those
# item(s) chosen by user
selectionList<-function(valcol, verbose=T){
usel = unique(valcol)
useltmp = gsub(pattern="_",replacement=" ",x=usel)
uselmat = matrix(data=1:length(usel), ncol=1, dimnames=list(useltmp, "selection number"))
print(uselmat)
uin = readline("Please enter the number(s) corresponding to your selection(s)\n")
uin = as.integer(strsplit(x=uin, split=" ")[[1]])
lvout = valcol%in%usel[uin]
return(lvout)
}
filePrompt<-function(defaultfile){
#prompts user to select file for data input
#provides default file option
#returns file name
fsel = readline(paste("\nTo select a data file, Enter s\n",
"To load the default data from \n",defaultfile,",\njust press enter \n",sep=""))
if(fsel=="s"){
pfile = file.choose()
}else{
pfile = defaultfile
}
cat("\nLoading data from:\n",pfile,"\n")
return(pfile)
}
#'@title openPGM
#'@description Opens a patient gene matrix file.
#'@param fname The file path to the patient gene matrix file.
#'@return The patient gene matrix (logic matrix with gene identifiers given as row names and patient identifiers gien as column names)
#'@export
openPGM<-function(fname = NULL){
#opens a patient gene matrix file
#takes: 1) file name
# or 2) no arg (in this case, user will be prompted to select a file)
#returns: patient gene matrix
if(is.null(fname)){
fname = file.choose()
}
pgmdat = read.table(file=fname,check.names=T,
header=T,
row.names=1,
sep="\t",
stringsAsFactors=F)
pgmdat = as.matrix(pgmdat)
return(pgmdat)
}
#extract_pid_w_matchnormal
#like extract_pid but leaves the sample type code at the end (match/normal)
#takes: n: a string that might contain a TCGA id, ex: TCGA-IQ-7632-01 in unc.edu__IlluminaHiSeq_RNASeqV2__TCGA-IQ-7632-01A-11R-2081-07__expression_rsem_isoforms_normalized.txt
#returns: the TCGA id (with -'s replaced by .'s) if an id is found; NULL if there is not a TCGA id in the input string
extract_pid_w_matchnormal<-function(n=NULL)
{
if(is.null(n)){n="unc.edu__IlluminaHiSeq_RNASeqV2__TCGA-IQ-7632-01A-11R-2081-07__expression_rsem_isoforms_normalized.txt"}
#first replace all special characters with spaces
#en = gsub(pattern="[[:punct:]]", replacement=" ", x=n)
sen = strsplit(n, split="[[:punct:]]")[[1]]
#find where "TCGA" is
i = grep(pattern="TCGA", x=sen)
if(length(i)<1){
return(NULL)
}
#take out the TCGA part, plus the next two parts
sub = sen[c(i, i+1, i+2, i+3)]
sub[4] = gsub(pattern="[a-zA-Z]*", "", x=sub[4])
out = paste(sub, sep=".", collapse=".")
return(out)
}
toPGMWithCoverage<-function(sds, coverageSet){
pgm = toPGM(sds)
additionalRowNames = setdiff(coverageSet, rownames(pgm))
newChunk = matrix(data=F,
nrow=length(additionalRowNames),
ncol=ncol(pgm),
dimnames=list(additionalRowNames, colnames(pgm)))
}
##toPGM
# converts a stacked format to a patient gene matrix
#
#takes: a data frame with columns "Symbol" and "Ids"
#returns: patient gene matrix: rownames = gene names,
# colnames = patientIDs
# cell values: logical, T if gene is active in that patient
toPGM<-function(sds){
#find unique patient ids
pids = unique(sds$Ids)
pids = make.names(pids)
sds$Ids = make.names(sds$Ids)
#find unique genes affected
syms = unique(sds$Symbol)
#make the matrix
out = matrix(data=F, nrow=length(syms), ncol=length(pids), dimnames=list(syms,pids))
#for each patient, find the set of genes
for(p in pids){
#get the current patient's active genes
cursyms = sds$Symbol[sds$Ids == p]
#assign the cells for that patient, for those genes
out[cursyms, p] = T
}
return(out)
}
# toBipartateGraph
# takes a data frame with two columns
# makes a bipartate graph from the two columns
toBipartateGraph<-function(dfin){
colnames(dfin)<-c("Ids", "Symbol")
#find unique patient ids
pids = unique(dfin$Ids)
pids = make.names(pids)
dfin$Ids = make.names(dfin$Ids)
#find unique genes affected
syms = unique(dfin$Symbol)
#make the matrix
out = matrix(data=F, nrow=length(syms), ncol=length(pids), dimnames=list(syms,pids))
#for each patient, find the set of genes
for(p in pids){
#get the current patient's active genes
cursyms = dfin$Symbol[dfin$Ids == p]
#assign the cells for that patient, for those genes
out[cursyms, p] = T
}
return(out)
}
paths<-function(smd){
return(smd@paths)
}
#rounds columns in table to "figs" number of significant digits
#and replaces underscores in col names with spaces
cleanTables<-function(tab,figs=3, verbose=F){
cat("\nCleaning table..")
if(!(is.data.frame(tab)|is.matrix(tab))){
cat("WAIT, that's not a table..\n")
return(tab)
}
if(nrow(tab)){
for(i in 1:ncol(tab)){
ccol = tab[,i]
cname = colnames(tab)[i]
if(verbose){
print(cname)
print(tab[1,i])
print(typeof(ccol))
print(mode(ccol))
}
if(typeof(tab[,i])%in%c("integer","double")){
tab[,i] = round(tab[,i],digits=figs)
}
}
}else{
print("table has no rows")
}
if(ncol(tab)){
#replace underscores:
colnames(tab)<-gsub(pattern="[_]", replacement=" ", x=colnames(tab))
}else{
print("table has no columns")
}
cat("cleaned\n")
return(tab)
}
#qinfo: gives you quick info about a data structure
#takes: ds: vector, dataframe or matrix
qinfo<-function(ds,mx=25){
print(mode(ds))
print(typeof(ds))
print(class(ds))
if(is.vector(ds)){
print(head(ds))
print(summary(ds))
}else if(is.matrix(ds)|is.data.frame(ds)){
print(dim(ds))
if(ncol(ds)<5){
print(head(ds))
print(summary(ds))
}else{
print(ds[1:min(nrow(ds),5),1:4])
print(summary(ds[,1:4]))
print(colnames(ds)[1:min(ncol(ds),mx)])
print(rownames(ds)[1:min(nrow(ds),mx)])
}
}else if(!is.list(ds)){
print(ds)
}else if(is.list(ds)){
print(names(ds)[1:min(mx,length(ds))])
for(i in 1:length(ds)){
print(names(ds)[i])
qinfo(ds[[i]])
}
}
}
#extract_pid
#takes: n: a string that might contain a TCGA id, ex: TCGA-IQ-7632-01 in unc.edu__IlluminaHiSeq_RNASeqV2__TCGA-IQ-7632-01A-11R-2081-07__expression_rsem_isoforms_normalized.txt
#returns: the TCGA id (with -'s replaced by .'s) if an id is found; NULL if there is not a TCGA id in the input string
extract_pid<-function(n=NULL)
{
if(is.null(n)){
n="unc.edu__IlluminaHiSeq_RNASeqV2__TCGA-IQ-7632-01A-11R-2081-07__expression_rsem_isoforms_normalized.txt"
cat("\nNote: extracting example pid from\n", n, "\n")
}
#first replace all special characters with spaces
#en = gsub(pattern="[[:punct:]]", replacement=" ", x=n)
sen = strsplit(n, split="[[:punct:]]")[[1]]
#find where "TCGA" is
i = grep(pattern="TCGA", x=sen)
if(length(i)<1){
print("** Note: TCGA id not found, returning barcodes as pids.")
return(n)
}
#take out the TCGA part, plus the next two parts
sub = sen[c(i, i+1, i+2)]
out = paste(sub, sep=".", collapse=".")
return(out)
}
test.summarize_by<-function(){
tset1 = c(rep("thirty", times=30), rep("twenty", times=20), rep("not ten", times=11))
tset1res = summarize_by(col=tset1, display=T, barPlotTitle="test summarize by", left_margin_factor=1.1)
checkEquals(checkNames=F,
current=tset1res,
target=cbind.data.frame(stringsAsFactors=F,
types=c("not ten", "thirty","twenty"), counts=c(11,30,20)))
}
###summarize_by
#summarize table by categories in a particular column
#used for making a quick summary of different types of somatic mutations in TCGA somatic mutation tables
#takes: the table
# the column to be summarized
# whether or not you want to display the summary to screen and bar graph
#returns: a summary table: category column, count column
#'@title summarize_by
#'@description summarizes the data levels in a column of categorical data and optionally provides a barplot output
#'@param col Vector, the categorical values to be summarized
#'@param display A flag indicating if the data should be displayed in a bar plot.
#'@param barPlotTitle The title that should be given to the bar plot
#'@param left_margin_factor A numerical factor expanding the left margin of the bar plot. Used to adjust in situations where there is insufficient room to display the text of each bar's label
#'@return A two column data.frame with columns "types", the different levels and "counts", the different numer of times each level occurs
#'@import ggplot2
summarize_by<-function(col, display=F, barPlotTitle="Counts across types", left_margin_factor=1)
{
ctab = table(col)
out = cbind.data.frame(rownames(ctab), as.vector(ctab), stringsAsFactors=F)
colnames(out)<-c("types", "counts")
out = out[order(out$types),]
out$counts = as.numeric(out$counts)
types = out$types
counts = out$counts
if(display){
p = ggplot(out, aes(x=types, y=counts))+
geom_bar(stat='identity')+
coord_flip()+
ggtitle(barPlotTitle)
print(p)
# oldmar <- par()$mar
# while(T){
# res = try({
# par(mar=c(5.1, max(4.1,max(left_margin_factor*nchar(types))/2.5) ,4.1 ,2.1))
# # try(displayGraph(w), silent=T)
# barplot(counts, horiz=T, las=2, main = barPlotTitle, xlab="Number found in data set", names.arg=types)
# par(oldmar)
# }, silent=T)
# # if(!grepl(pattern="Error", x=res)) break
# if(is.null(res)) break
# par(oldmar)
# readline(prompt="There seems to have been an error with plotting the bar graph.\nPlease increase the size of the plot window, the press enter")
# }
# par(oldmar)
# print(out)
}
return(out)
}
#join_tables
#takes: orig: a table to which columns from new_t are to be joined (if this is null t2 will be returned with adjusted names)
# new_t: joins columns from new_t to orig, filling 0s in where new_t doesn't have a value
# name_prefix: prefix to be appended to columns from new_t
# fill: value to be placed where new_t contains no value (typically 0 or NA)
#returns: t1 columns joined to t2 columns, new rows filled with the value in "fill"
joinTables<-function(t1,t2, name_prefix=NULL, fill=0){
print("inside joinTables()")
if(is.null(name_prefix)){#invent a name prefix
name_prefix = as.character(sum(ncol(t1), ncol(t2)))
}
#change t2 column names
t2n = colnames(t2)
newt2n = paste(name_prefix, t2n, sep=".")
print(newt2n)
colnames(t2)<-newt2n
if(is.null(t1)){
return(t2)
}
allrownames=union(rownames(t1), rownames(t2))
#make new chunk
newchunkrows = setdiff(allrownames, rownames(t2))
tmpnew = data.frame(matrix(ncol=ncol(t2), nrow=length(newchunkrows), data=fill), row.names=newchunkrows)
colnames(tmpnew)<-colnames(t2)
newchunk = rbind(t2, tmpnew)
rowstoold = setdiff(allrownames, rownames(t1))
tmpold = data.frame(matrix(ncol=ncol(t1), nrow=length(rowstoold), data=fill), row.names=rowstoold)
colnames(tmpold)<-colnames(t1)
updatedold = rbind(t1, tmpold)
#join the old and the new
out = cbind(updatedold, newchunk[rownames(updatedold),])
cat("\nTables Joined.. \n")
return(out)
}
#getEmpP
#get empirical p-value
getEmpP<-function(path,pgm,reps=1000,paths){
#some number of genes
#draw random genes for pathway
#for each gene: see if it's in the pgm
# if it is, set it to the score from the pgm
# if it's not, set it to zero
#score = sum for all genes in pathway
#add score to set of scores for ECDF
flatpgm = pgm%*%rep(T, ncol(pgm))
ngenes = length(path) #the number of genes in the path
vals = rep(0,times=reps)
for(i in 1:reps){
res = rep(x=0,times=ngenes) # results from this iteration
#select random genes for pathway
curPath = sample(colnames(paths),size=ngenes,replace=F)
curInSomatic = rownames(pgm)%in%curPath
#now get the scores for the genes that are actually in the pathway
curScores = flatpgm[curInSomatic] #extract the scores for all the genes in the current randomized pathway
curScore = sum(curScores)
vals[i] = curScore
}
genesInPath = rownames(flatpgm)%in%path
trueScore = sum(flatpgm[genesInPath])
curFunction <- ecdf(vals)
pout = 1-curFunction(trueScore)
return(pout)
}#getEmpP()
#goal: provided this set of data for each piece of the project:
# Panel used: Described in Tyner et al Oct. 2012*
# Total number of unique genes targeted by drug or mutation : 290
# Unique genes from targeted genes found in the provided set of pathways : 180
# genes targeted, but not found in the current set of pathways : 110
# Pathways used : KEGG PATHWAYS / 2011-Mar-14**
# Number of unique genes found in all pathways : 5894
# Total number of unique pathways examined : 229
# Number of unique pathways targeted by aberrations or drugs : 98
# List of genes targeted, but not found in current set of pathways
# AAK1 ACVRL1 ADCK3 ADCK4 ALK ANKK1 NUAK1 AURKB AURKC AXL BMP2K BLK BMX BRSK1 BRSK2 CAMK1 CAMK1D CAMK1G CDK11B CDK11A CDK19 CDK3 CDK8 CDK9 CIT CLK1 CLK2 CLK3 CLK4 DCLK1 DCLK2 DCLK3 DDR1 DDR2 DMPK CDC42BPG STK17A STK17B DYRK1B MAPK6 MAPK4 MAPK15 FRK GAK LATS1 LATS2 STK10 LTK MAP4K5 MARK1 MARK2 MARK3 MARK4 MELK MERTK MAP3K9 MAP3K10 CDC42BPA CDC42BPB MST1 STK24 MST4 MUSK MYO3A MYO3B STK38L NEK1 NEK2 NEK5 NEK6 NEK7 NEK9 CDK16 CDK17 CDK18 CDK14 PIM3 PKN1 PKN2 PLK3 PLK4 PRKD1 PRKD2 PRKD3 PTK6 RIOK3 ROS1 MYLK4 NUAK2 SIK1 SIK2 SRMS SRPK1 SRPK2 STK16 STK33 TESK1 TIE1 TLK1 TLK2 TNIK TNK1 TNK2 TNNI3K TSSK1B TYRO3 STK32B STK32C STK25 ZAK
#
startHTMLPlugIns<-function(){
if(!require("hwriterPlus")){
print("Trying to install hwriterPlus so that HTML output can be generated")
install.packages("hwriterPlus")
if(require("hwriterPlus")){
print("hwriterPlus installed and loaded")
} else {
stop("could not install hwriterPlus")
}
}
if(!require("hwriter")){
print("Trying to install hwriterPlus so that HTML output can be generated")
install.packages("hwriter")
if(require("hwriter")){
print("hwriter installed and loaded")
} else {
stop("could not install hwriter")
}
}
if(!require("xtable")){
print("Trying to install hwriterPlus so that HTML output can be generated")
install.packages("hwriterPlus")
if(require("xtable")){
print("xtable installed and loaded")
} else {
stop("could not install xtable")
}
}
}
WriteSettings<-function(sumset, p){
dwu = sumset[["settings"]]
if(is.null(dwu)) return()
if(is.list(dwu)&!is.data.frame(dwu)) dwu = saveSettings(set=dwu)
print("Inside WriteSettings()... ")
#WriteDataWorkUp()
#builds the data work up table and outputs it to the HTML page
#takes: sumset: the summary set
# p: the handle for the html page being created
#returns: nothing
hwrite(paste("<A name=\"section",".002","\">","Settings used for this data set", "</A>", sep=""),p,heading=2)
hwrite("<A href=\"#section0.1\">Back to top</A><BR>", p, heading=4)
cat("\nWriting settings to HTML..\n")
hwrite(dwu, p, table.style= 'border-collapse:collapse', table.cellpadding='5px', col.names=F)
}
WriteDataWorkUp<-function(sumset, p, fpath){
dwu = sumset[["Data_work_up_notes"]]
if(is.null(dwu)|!length(dwu)) return()
if(!is.data.frame(dwu)&is.list(dwu)){
dwu = saveDataWorkUpNotes(set=dwu, fpath=fpath)
}
print("Inside WriteDataWorkUp()... ")
#WriteDataWorkUp()
#builds the data work up table and outputs it to the HTML page
#takes: sumset: the summary set
# p: the handle for the html page being created
#returns: nothing
hwrite(paste("<A name=\"section",".001","\">","Data work-up notes", "</A>", sep=""),p,heading=2)
hwrite("<A href=\"#section0.1\">Back to top</A><BR>", p, heading=4)
cat("\nOutputting data work up notes.\n")
print(class(dwu))
print(dim(dwu))
if(is.data.frame(dwu)|is.matrix(dwu)){
dwu_tab = dwu
}else{
print(dwu)
dwu_names = names(dwu)
dwu_vals =c()
for(i in 1:length(dwu)){
if(length(dwu[[i]])){
if(length(dwu[[i]][[1]])==1&!is.matrix(dwu[[i]])){
# dwu_names = c(dwu_names,names(dwu)[i])
dwu_vals = c(dwu_vals, dwu[[i]][[1]])
}
}
}
print("bout to bind the data fram")
dwu_tab = cbind.data.frame(dwu_names, dwu_vals,stringsAsFactors=F)
print("bound")
}
cat("\nwriting work up notes to HTML..\n")
hwrite(dwu_tab, p, table.style= 'border-collapse:collapse', table.cellpadding='5px', col.names=F)
}
#htmlSummary outputs summary information from a summaryTable output list to an HTML file
#the input sumset must have, at the very least:
# $summarystats: list of summary stats (path coverage, etc. .)
# $pathsummary: the generalized output table
# $genesummary: counts of times each gene is targeted across cohort
htmlSummary<-function(sumset,
fname="./test.html",
plimit=.05,
sortColumn=5,
pagetitle=NULL,
path_detail=NULL){
cat("\nCreating HTML summary. . . \n")
cat("\nFile name of HTML summary:\n")
print(fname)
#startHTMLPlugIns()
targetType=sumset$summarystats[2,2]
headings = c()
#check if there's a coverage summary
baseDir = paste(strsplit(fname, "/")[[1]][1:(length(strsplit(fname, "/")[[1]])-1)], collapse="/")
if("coverage_summary"%in%names(sumset)){
print("Found coverage summary")
htmlSummary(path_detail=path_detail, sumset=sumset$coverage_summary, fname=paste(baseDir, "coverage_summary.html", sep="/"), path_detail)
if(!"summarystats"%in%names(sumset)){#if a coverage summary is all there is, then exit
return()
}
cat("Back to writing",fname,"\n" )
}
if(!file.exists(fname)){
cat("\nCreating file folder..\n")
dir.create(path=baseDir, recursive=T,showWarnings=F)
}
cat("names in sumset:\n")
print(names(sumset))
print(ncol(sumset$patientGeneMatrix))
if(is.null(names(sumset$summarystats))) return() #if it's blank, get outta there
print("just about to look at the image slots...")
multiPatients = F
if(!is.null(sumset$patientsums)){
multiPatients = nrow(as.matrix(sumset$patientsums))>1
}
cat("\nOpening HTML page..\n")
print(fname)
p = openPage(fname)
if(!is.null(pagetitle)) hwrite(gsub(pattern="_",replacement=" ",x=pagetitle), p, heading=1)
headings = c(paste("Summary of",sumset$summarystats[1,2],"used in enrichment testing"),
paste("Distribution of",targetType,"genes across patients"),
paste("Top 20 most commonly",targetType,"genes, across cohort"),
"Analysis of pathways",
"Individual patient enrichments for the patients with the top 5 highest mutation rates ",
headings)
print("headings:")
print(headings)
print(multiPatients)
if(("imageSlots"%in%names(sumset)&!is.null(sumset$imageSlots$imageSlots))|multiPatients){
#establish image directory: any images will go here
imageDir = paste(baseDir, "/imageSlots/", sep="")
cat("\nCreating image directory..\n")
dir.create(imageDir,recursive=T,showWarnings=F)
headings = c(headings, "Path images")
}
#for all the headings, turn the heading into an anchor
links = headings
hwrite("<A name=\"section0.1\">Table of contents</A>", p, heading=2)
if("Data_work_up_notes"%in%names(sumset)) hwrite(paste("<A href=\"#section",".001","\">","Data work-up notes", "</A>", sep=""),p,heading=3)
if("settings"%in%names(sumset)) hwrite(paste("<A href=\"#section",".002","\">","Settings", "</A>", sep=""),p,heading=3)
for(i in 1:length(headings)){
links[i] = paste("<A href=\"#section",i,"\">",headings[i], "</A>", sep="")
headings[i] = paste("<A name=\"section",i,"\">",headings[i], "</A>", sep="")
hwrite(links[i],p,heading=3)
}
WriteDataWorkUp(sumset, p, baseDir)
WriteSettings(sumset, p=p)
cat("\nTarget type:",targetType,"\n")
hwrite(headings[1],p,heading=2)
hwrite("<A href=\"#section0.1\">Back to top</A><BR>", p, heading=4)
print("just wrote heading 1")
#output the summary stats
sum_stats=sumset$summarystats
print(names(sumset))
print(sum_stats)
hwrite(sum_stats, p, table.style= 'border-collapse:collapse', table.cellpadding='5px')
print("Just wrote sum_stats")
#output most targeted genes
if(multiPatients){#if more than one patient's worth of gene data is being summarized here,
cat("\nOutputting distribution of",targetType,"genes across all patients\n")
#output information about the counts for each gene across the cohort
hwrite(headings[2],p,heading=2)
hwrite("<A href=\"#section0.1\">Back to top</A><BR>", p, heading=4)
#put a histogram into the summary
imageName = paste("dist_",targetType,"_genes_per_patient.jpg",sep="")
imgloc = paste(baseDir,"/imageSlots/",imageName,sep="")
jpeg(imgloc)
hist(sumset$patientsums[,1],breaks=20,main=paste("Distribution of",targetType,"gene counts for all patients:"),xlab=paste("Number of",targetType,"genes in patient"))
dev.off()
hwriteImage(paste("./imageSlots/", imageName, sep=""), p, br=TRUE)
genelist = sumset$genesummary
geneindex = order(genelist[,1],decreasing=T)
# ordgenelist = as.matrix(genelist[geneindex[ 1:min( 20,length(geneindex) ) ],],ncol=1)
ordgenelist = genelist[geneindex[ 1:min( 20,length(geneindex) ) ],,drop=F]
if(!is.null(sumset$preFilteringGeneSummary)){
print("!is.null(sumset$preFilteringGeneSummary)")
genelistpre = sumset$preFilteringGeneSummary
geneindexpre = order(genelistpre[,2],decreasing=T)
ordgenelistpre = as.matrix(genelistpre[geneindexpre[1:20],],ncol=1)
ordgenelist = cbind(rownames(ordgenelist),ordgenelist[,1])
ordgenelistpre = GenesInPaths(ordgenelistpre, path_detail)
ordgenelist = GenesInPaths(ordgenelist, path_detail)
ordgenelist = cbind(ordgenelistpre, ordgenelist)
print(head(ordgenelist))
colnames(ordgenelist)<-c("Top mutated genes, before filtering",
"Number of patients with mutated gene, pre filtering",
"In the current set of pathways?",
"Top mutated genes, after filtering",
"Number of patients with mutated gene, after filtering",
"In the current set of pathways?")
}else{
ordgenelist = GenesInPaths(ordgenelist, path_detail)
}
print("Bout to print heading 3")
hwrite(headings[3],p,heading=2)
hwrite("<A href=\"#section0.1\">Back to top</A><BR>", p, heading=4)
hwrite(ordgenelist, p, table.style= 'border-collapse:collapse', table.cellpadding='1px', row.names=is.null(sumset$preFilteringGeneSummary),col.names=T)
}
hwrite("<br><br>",p)
#pathway analysis/coverage/enrichment
hwrite(headings[4],p,heading=2)
hwrite("<A href=\"#section0.1\">Back to top</A><BR>", p, heading=4)
#output the significant paths
ptable = sumset$pathsummary
tmpTest = F
if(is.null(ptable)){
tmpTest = T
}else if(!ncol(ptable)|!nrow(ptable)){
tmpTest=T
}
if(tmpTest){
print(sumset$summarystats[1,2])
# readline("empty table")
hwrite(paste("There are no",targetType,"genes in this patient.",p))
}else{
ptable = cleanTables(ptable)
colnames(ptable) = gsub(pattern="_",replacement=" ",x=colnames(ptable))
if(ncol(ptable)<6){
print("Coverage summary.")
sortColumn = 4
}
hwrite(paste("Pathways sorted by",colnames(ptable)[sortColumn],"."),p,heading=2)
torder = order(ptable[,sortColumn],decreasing=T)
hwrite(ptable[torder,], p, table.style= 'border-collapse:collapse', table.cellpadding='1px', row.names=F)
}
#determine, for the patients with the most mutation, in the paths that are most significantly
#mutated, which genes are mutated the most
#find genes and pathways mutated in patients with highest overall mutational load
if(length(sumset$path_summary_each_patient)){
hwrite(headings[5],p,heading=2)
hwrite("<A href=\"#section0.1\">Back to top</A><BR>", p, heading=4)
print("About to write info about top 5 most affected patients.. . ")
if(length(sumset$patientsums)>1){
#first: select to 5 patients
ordmurates = sumset$patientsums[order(sumset$patientsums,decreasing=T),]
print(head(ordmurates))
top5pat = names(ordmurates[1:5])#patients with the top 5 highest mutation rates
#second: select pathways most enriched paths in those patients
for(ci in 1:length(top5pat)){
cp = top5pat[ci]
cat("\nPatient: ",cp,"\n")
#Error in if (nrow(sigpaths)) { : argument is of length zero
cpat = sumset$path_summary_each_patient[[cp]]
if(!is.null(cpat$pathsummary)){
hwrite(paste("<br><br>In patient",cp,"(",ci,"of 5 patients with the largest number of",targetType,"genes):"),p,heading=2)
#third: find genes in those pathways which are significantly aberrational
sigpaths = cpat$pathsummary[cpat$pathsummary$hyperg_p_w_FDR<0.05,,drop=F]
if(nrow(sigpaths)){
hwrite(paste(targetType,"genes in pathways significantly enriched (hypergeometric, with BD FDR correction) for",targetType,"genes"),p,heading=2)
}else{
hwrite(paste("No paths were found to be significantly enriched for",targetType,"genes as to the hypergeometric test with BH FDR"),p,heading=2)
sigpaths = cpat$pathsummary[cpat$pathsummary$hyperg_p_w_FDR<0.05,]
if(nrow(sigpaths)){
hwrite(paste(targetType,"genes in pathways significantly enriched (hypergeometric test) for",targetType,"genes"),p,heading=2)
}else{
hwrite(paste("No paths were found to be significantly enriched for",targetType,"genes as to BH FDR"),p,heading=2)
hwrite(paste(targetType,"genes in top 5 pathways most enriched (proportion enrichment) for ",targetType,"genes"),p,heading=2)
sigpaths = cpat$pathsummary[order(cpat$pathsummary[,5],decreasing=T),][1:5,]
}
}
sigpaths2 = cbind(sigpaths,cpat$active_genes_ea_path[rownames(sigpaths),])
colnames(sigpaths2) = gsub(pattern="_",replacement=" ",x=colnames(sigpaths2))
colnames(sigpaths2)[ncol(sigpaths2)] = paste(targetType,"genes in this pathway")
hwrite(sigpaths2, p, table.style= 'border-collapse:collapse', table.cellpadding='1px', row.names=F)
hwrite(paste(nrow(sigpaths2),"pathways total."),p,heading=2)
hwrite("<br>",p)
#find commonality of genes in top paths
tmp = strsplit(cpat$active_genes_ea_path[rownames(sigpaths),],split=" ")
names(tmp) = rownames(sigpaths)
tmpmat = list_to_table(tmp)
gcounts = matrix(rep(T,nrow(tmpmat))%*%tmpmat,ncol=1)
rownames(gcounts) = colnames(tmpmat)
hwrite(paste("Number of the above",nrow(tmpmat),"pathways each gene is found",targetType,"in."),p,heading=2)
colnames(gcounts) = paste("Number of the above pathways the gene is found in")
hwrite(gcounts, p, table.style= 'border-collapse:collapse', table.cellpadding='1px', row.names=T)
hwrite(paste(ncol(tmpmat),"genes total."),p,heading=2)
}
}
}
}
print("About to write image tags... ")
print(names(sumset))
#this is defined above: imageDir = paste(baseDir, "/imageSlots/")
#search the summary for imageSlots
if("imageSlots"%in%names(sumset)){
if(length(sumset$imageSlots)){
print("looking at image slots")
print(fname)
hwrite(x=headings[7], page=p, heading=2)
hwrite("<A href=\"#section0.1\">Back to top</A><BR>", p, heading=4)
print(sumset$imageSlots)
images = as.matrix(sumset$imageSlots$imageSlots, ncol=1)
print(images)
if(nrow(images)){
for(i in 1:nrow(images)){
iname = rownames(images)[i]
print(iname)
ifile = images[i,1]
hwrite(iname,p,heading=2)
tmpImageLoc = paste("./imageSlots/",ifile, sep="")
hwriteImage(tmpImageLoc, p, br=TRUE)
}
}
}else{
hwrite(x="Sorry, no images were found", page=p, heading=2)
}
}
closePage(p)
cat("\nSaved summary to HTML file: ",fname,"\n")
return()
}# htmlSummary()
# htmlSummary(sumset=somatic_summary,
# fname="./quick_check_aml_somatic_nci.html")
#htmlSummary(sumset=somatic_summary, fname="./output/testSavesomaticsummary.html")
#toHTML outputs an HTML file
#the input table_list cat have:
# tables
#
# image file names
#
toHTML<-function(table_list,
fname="./test.html",
plimit=.05,
maxrows=100,
reorderTables=NULL,
pagetitle=NULL,
limit_col=NULL,
path_detail=NULL){
cat("____________________________Saving HTML page to :",fname,"\n")
#startHTMLPlugIns()
fnameroot = paste(strsplit(fname, split="/")[[1]][1:(length(strsplit(fname, split="/")[[1]])-1)],sep="/",collapse="/")
print(fnameroot)
print(fname)
p = openPage(fname)
if(!is.null(pagetitle)) hwrite(pagetitle, p, heading=1)
sl1 = "<A href=\"#section"
sl2 = "\">"
sl3 = "</A><BR>"
hwrite("<A name=\"section0.1\">Table of contents</A>", p, heading=2)
#first, print the list of elements
hwrite("These are the sections of this HTML page:", p, heading=3)
for(i in 1:length(table_list)){
curlink = paste(sl1,i,sl2, names(table_list)[i], sl3, sep="")
hwrite(curlink, p, heading=3)
}
trg1 = "<A name=\"section"
trg2 = "\">"
trg3 = "</A>"
if(length(table_list)){
if("settings"%in%names(table_list)){
#if it's from the overlap analysis
if(class(table_list$settings)=="list"){
if(sum( names(table_list$settings)%in%c("basic_overlap", "defaultSummaryTable", "combinedAberrations") ) ){
nsnames = paste0("setting_for_",names(table_list$settings))
names(table_list$settings)<-nsnames
table_list[nsnames] = table_list$settings
table_list$settings=NULL
}
}
}
for(i in 1:length(table_list)){
if(!is.list(table_list)) print(table_list)
curname = names(table_list)[i]
if(VERBOSE) print(curname)
if(is.null(curname)) print(table_list)
if(VERBOSE) print("names(table_list)")
if(VERBOSE) print(names(table_list))
if(VERBOSE) print(i)
if(VERBOSE) print("stops here boo hoo")
postfix = strsplit(curname,split="\\." )[[1]][ length(strsplit(curname,split="\\.")[[1]]) ]
if( grepl(pattern="setting", x=curname) & class(table_list[[i]])!="data.frame" ){
table_list[[i]] = listToDf(lst=table_list[[i]])
}
if( postfix %in% c("png","jpeg","bmp")){#if it's an image
curnameanchor = paste(trg1, i, trg2, curname, trg3, sep="")
if(VERBOSE) print(curnameanchor)
hwrite(curnameanchor, p, heading=2)
hwrite("<A href=\"#section0.1\">Back to top</A><BR>", p, heading=4)
cat("\nImage found")
hwriteImage(curname, p, br=TRUE)
}else if(curname=="imageSlots"){
#add anchor
curnameAnchor = paste(trg1, i, trg2, "Network diagrams of pathways which are significantly enriched in aberration and contain drug targets", trg3, sep="")
if(VERBOSE) print(curnameAnchor)
hwrite(curnameAnchor, p, heading=2)
hwrite("<A href=\"#section0.1\">Back to top</A><BR>", p, heading=4)
cat("Image list found...\n")
cat("(file:",fname,")\n")
cat("\nCurrent item being output:",curname,"\n")
#if the curname is imageSlots,
if(VERBOSE) print(names(table_list))
#get the matrix out and print the images
imageSet = table_list[["imageSlots"]][["imageSlots"]]
#check if there are images in there
if(is.null(imageSet)){
if(VERBOSE) print("no images found in image folder")
hwrite("Sorry, images not available", p, heading=2)
}else{
if(VERBOSE) print("images found..")
if(VERBOSE) print(imageSet)
imageSetNames = names(imageSet)
if(is.matrix(imageSet)) imageSetNames = rownames(imageSet)
for(im in 1:length(imageSet)){
imageName = imageSetNames[im]
if(VERBOSE) print(imageName)
imageFile = paste("./imageSlots/",imageSet[im], sep="")
if(VERBOSE) print(imageFile)
hwrite(imageName, p, heading=3)
hwriteImage(imageFile, p, br=TRUE)
}
}
}else if(grepl("_each_patient",x=curname, ignore.case=T)){#it's the ... for each patient
print("OLA eap")
eaPatSet = table_list[[i]]
for(patsetname in names(eaPatSet)){
print("**********************************************************")
ePatRoot = paste(fnameroot,curname,patsetname,paste("overlap_analysis/"),sep="/")
SaveToHTML_inner(results=eaPatSet[[patsetname]],
study_name=patsetname,
fileRoot=ePatRoot,
selNames=names(eaPatSet[[patsetname]]))
}
}else if(grepl("_summary",x=curname,ignore.case=T)){
print("Found a summary:::::::::::::::::::::::::::+:::::::::::::::::::::::::")
sumfname =paste(fnameroot,curname,paste(curname,"_.html",sep=""),sep="/")
ctab = table_list[[curname]]
htmlSummary(path_detail=path_detail,
sumset=ctab,
fname=sumfname,
pagetitle=curname)
}else if(is.list(table_list[[i]])&!is.data.frame(table_list[[i]])){
print("is.list(table_list[[i]])&!is.data.frame(table_list[[i]])")
#if it's a list, then call toHTML again
listfname = paste(fnameroot,curname,sep="/")
toHTML(path_detail=path_detail,
table_list=table_list[[i]],
limit_col=limit_col,
pagetitle=paste(pagetitle, curname,sep=":"),
reorderTables=reorderTables,
fname=listfname,
plimit=plimit,
maxrows=maxrows)
}else{ # it's not an image, summary table, folder of images or 'analysis each patient' set
curname = paste(trg1, i, trg2, curname, trg3, sep="")
hwrite(curname, p, heading=2)
hwrite("<A href=\"#section0.1\">Back to top</A><BR>", p, heading=4)
initial_table = table_list[[i]]#if it's turned into a vector clean table will not work; needs to be data frame
if(is.vector(initial_table)&!is.list(initial_table)){
initial_table = as.matrix(initial_table,ncol=1)
}
print(dim(initial_table))
if(nrow(initial_table)){
curtab = cleanTables(tab=initial_table)#makes sure there are only 3 sig figs instead of 9
print(dim(initial_table))
if(!is.null(limit_col)&sum(limit_col%in%colnames(curtab))){#if there are columns to limit the output by
cat("There are columns by which to limit the number of table rows printed.\n")
for(c in limit_col){#for each type of limit that is to be set, scan the table for the limit's column,
if(c%in%colnames(curtab)){#if the column is found, limit and sort the table based on that column
curindex = as.double(curtab[,c])<plimit
tmptab=curtab[curindex,]
curorder = 1:length(tmptab)
if(!is.null(reorderTables)){
print("reorderTables")
curorder = order(tmptab[,c],decreasing=reorderTables)
}
colnames(tmptab)<-gsub(pattern="_",replacement=" ",x=colnames(tmptab))
hwrite(paste(curname,"limited to ",c,"<",as.character(plimit),"(", as.character(nrow(tmptab)), "records exist, showing top", as.character(min(maxrows,nrow(tmptab))) ," )"), p, heading=2)
hwrite(x=tmptab[curorder[1:min(maxrows,nrow(tmptab))],], page=p,
row.names=!sum(rownames(tmptab)==tmptab[,1]),
table.style= 'border-collapse:collapse',
table.cellpadding='1px')
print("header and table written")
}
}
}else{ #if there are no columns to limit the output by
curtab = curtab[1:min(maxrows,nrow(curtab)),,drop=F]#limit the number of rows
if(!is.null(reorderTables)){#order the rows if wanted
curtab = curtab[order(curtab[,5],decreasing=T),,drop=F]
}
title = paste("(", as.character(nrow(initial_table)), "records, showing top", as.character(nrow(curtab)) ,")")
hwrite(title, p, heading=4)
colnames(curtab)<-gsub(pattern="_",replacement=" ",x=colnames(curtab))
hasRowNames = !sum(rownames(curtab)==curtab[,1])
hwrite(curtab,row.names=hasRowNames,
p,
table.style= 'border-collapse:collapse',
table.cellpadding='1px')
#readline("press enter to continue")
cat("~")
}
}else{
hwrite(paste("Zero",curname,"were found."), p, heading=4)
}#if/else there are rows in curtab
}#if/else is image
}#for each element in the table_list
}
cat("\nHTML summary written to",fname,"\n")
closePage(p)
}
#'@title Save results to an HTML page.
#'@description Writes data from the \code{results} slot of a \code{Study} object to an HTML document.
#'@param study A \code{Study} object with results sets from one or more study arm.
#'@import xtable
#'@import hwriter
#'@export
SaveToHTML<-function(study){
# if(!require("hwriterPlus")){
# print("Trying to install hwriterPlus so that HTML output can be generated")
# install.packages("hwriterPlus")
# if(require("hwriterPlus")){
# print("hwriterPlus installed and loaded")
# } else {
# stop("could not install hwriterPlus")
# }
# }
# if(!require("xtable")){
# print("Trying to install hwriterPlus so that HTML output can be generated")
# install.packages("hwriterPlus")
# if(require("xtable")){
# print("xtable installed and loaded")
# } else {
# stop("could not install xtable")
# }
# }
SaveToHTML_inner(study_name=study@studyMetaData@studyName,
results=study@results,
path_detail=study@studyMetaData@paths)
}
SaveToHTML_inner<-function(results,
study_name,
selNames=NULL,
fileRoot=NULL,
path_detail=NULL){
print("Saving summary to HTML..")
if(!grepl(pattern="^test", ignore.case=T, x=study_name)){
study_name = paste0("study_", study_name)
}
if(is.null(fileRoot)) fileRoot = paste("./output/",study_name,"/results",sep="")
print("file root:")
print(fileRoot)
isels = names(results)
badi = grep(pattern="file", x=names(results), ignore.case=T)
if(length(badi)){
isels = names(results)[-badi]
}
#pick out all the indexes except the path_file_name
if(is.null(selNames)){
#let the user decide
selvector.li = selectionList(valcol=isels)
#pullOutTheCorrectNames
selNames = isels[selvector.li]
}
for(n in selNames){
print("selnames:")
print(selNames)
print(n)
#save em
ssum = results[[n]]
if(grepl(pattern="summary", x=n, ignore.case=T)){
print("chose summary")
root2 = paste(fileRoot,"/",n,"/",sep="")
sumfname = paste(root2,n,"_ViewableSummary.html",sep="")
# readline("enter to continue")
htmlSummary(path_detail=path_detail, sumset=ssum, fname=sumfname, pagetitle=n)
htmlPerPatient(results=results, summ=n, fileroot=root2, overlap=F, path_detail=path_detail)
}else if(grepl(pattern="overlap_analysis$", x=n, ignore.case=T)){
print("overlap analysis selected")
root2 = paste(fileRoot,"/",n,"/",sep="")
dir.create(root2,recursive=T,showWarnings=F)
outHTMLname = paste(root2, "overlap_analysis.html",sep="")
print(outHTMLname)
print(n)
# readline("enter to continue")
toHTML(path_detail=path_detail,
table_list=results$overlap_analysis,
fname=outHTMLname,
maxrows=10000,
pagetitle=n)
}
}
}
htmlPerPatient<-function(results, summ, fileroot, overlap=T, path_detail=NULL){
cat("\nWriting an HTML summary for each patient...........\n")
fileroot = paste(fileroot,"path_summary_each_patient/",sep="")
psep = results[[summ]]$path_summary_each_patient
for(patient in names(psep)){
cat("\n***************************",patient,"*******************************\n")
#for each patient, establish a summary
perPatRoot = paste(fileroot,patient,"/",summ,"_ViewableSummary.html",sep="")
print(summ)
ss = psep[[patient]]
if(overlap) ss = psep[[patient]][[summ]]
htmlSummary(path_detail=path_detail, sumset=ss, fname=perPatRoot)
}
}
test.twoHistOnePlot<-function(){
prefilt = data.frame(muts = sample(x=100:200, replace=T, size=100), stringsAsFactors=F)
postfilt = data.frame(muts=sample(x=50:150, replace=T, size=100), stringsAsFactors=F)
twoHistOnePlot(dataset1=prefilt,
dataset2=postfilt,
x_label="Number of mutations",
y_label="Number of patients",
legend_titles=c("Before filtering", "After filtering"),
main_title="Distributions of mutations in patients before and after filtering")
}
#twoHistOnePlot
#
#'@title Puts two histograms on the same plot.
#'@description Puts two histograms on the same plot, shows each with a different color, and overlap with a third color.
#'@param dataset1 A numeric data set; \code{data.frame or matrix} with one column.
#'@param dataset2 A numeric data set; \code{data.frame or matrix} with one column.
#'@param frequency_not_density Depricated.
#'@param main_title The main title for the plot.
#'@param x_label The label to be put along the x axis.
#'@param y_label The label to be put along the y axis.
#'@param legend_titles Character vector, length 2, giving the names of the data in dataset1 and dataset2.
#'@param breaks Depricated.
#'@import ggplot2
twoHistOnePlot<-function(dataset1, dataset2,
frequency_not_density=T,
main_title="main_title not set",
x_label="x_label not set",
y_label="Frequency",
legend_titles=c("dataset 1","dataset 2"),
breaks=c(30,30)){
cat("\nReporting distributions of ", legend_titles[1],"and",legend_titles[2],"...")
#first combine the two data sets
colnames(dataset1)<-"dat"
colnames(dataset2)<-"dat"
stackedSet = rbind.data.frame(dataset1, dataset2)
idCol = c(rep(legend_titles[1],times=nrow(dataset1)), rep(legend_titles[2], times=nrow(dataset2)))
stackedSet = cbind.data.frame(stackedSet, idCol)
colnames(stackedSet)<-c("value","idCol")
# pmin = min(c(dataset1, dataset2))
# pmax = max(c(dataset1, dataset2))
#
# p1 <- hist(dataset1, freq=frequency_not_density,breaks=breaks[1],)
# p2 <- hist(dataset2, freq=frequency_not_density,breaks=breaks[2])
#
# hist = plot( p2, col=rgb(0,0,1,1/4), xlim=c(pmin,pmax),
# freq=frequency_not_density,
# main=main_title,
# xlab=x_label,
# ylab = y_label) # first histogram
# hist2 = plot( p1, col=rgb(1,0,0,1/4), xlim=c(pmin,pmax),
# add=T,
# freq=frequency_not_density) # second
#
# if(!is.null(legend_titles)){
# legend("topright", inset=.05,
# legend_titles,
# fill=c(rgb(1,0,0,1/4),rgb(0,0,1,1/4)),
# horiz=F)
# }
bwidth = ifelse(test=length(unique(stackedSet$value))>10, yes=3, no=1)
p1 = ggplot(stackedSet, aes(x=value, fill=idCol)) +
geom_histogram(alpha=0.5, position="identity", binwidth=bwidth)+
ggtitle(main_title)+
theme_bw()+
theme(legend.title=element_blank())+
xlab(x_label)
# if(min(stackedSet[,1])<=0){
# rng = (range(stackedSet$value)[2]-range(stackedSet$value)[1])*.1
# p1 = p1+scale_x_continuous(limits=c( (min(stackedSet[,"value"])-rng), (max(stackedSet[,"value"])+rng ) ))
# }
print(p1)
cat("...done\n")
return(p1)
}
scatterhist <- function(x, y, xlab="", ylab="", main=""){
zones=matrix(c(2,0,1,3), ncol=2, byrow=TRUE)
layout(zones, widths=c(4/5,1/5), heights=c(1/5,4/5))
xhist = hist(x, plot=FALSE)
yhist = hist(y, plot=FALSE)
top = max(c(xhist$counts, yhist$counts))
par(mar=c(3,3,1,1))
plot(x,y, main="")
par(mar=c(0,3,1,1))
barplot(xhist$counts, axes=FALSE, ylim=c(0, top), space=0)
par(mar=c(3,0,1,1))
barplot(yhist$counts, axes=FALSE, xlim=c(0, top), space=0, horiz=TRUE)
par(oma=c(3,3,0,0))
mtext(text=main,side=3,outer=T,padj=2)
mtext(xlab, side=1, line=1, outer=TRUE, adj=0,
at=.8 * (mean(x) - min(x))/(max(x)-min(x)))
mtext(ylab, side=2, line=1, outer=TRUE, adj=0,
at=(.8 * (mean(y) - min(y))/(max(y) - min(y))))
par(mfrow = c(1,1))
par(mar=c(5, 4, 4, 2) + 0.1 )
}
scatterhist2 <- function(x1, y1, x2, y2, xlab="", ylab="", main="", legendTxt=NULL){
zones=matrix(c(2,0,1,3), ncol=2, byrow=TRUE)
layout(zones, widths=c(4/5,1/5), heights=c(1/5,4/5))
xhist = twoHistOnePlot(dataset1=x1, dataset2=x2,
legend_titles=NULL,
main_title="",
y_label="",
x_label="")#hist(x, plot=FALSE)
yhist = twoHistOnePlot(dataset1=y1, dataset2=y2,
legend_titles=NULL,
main_title="",
y_label="",
x_label="")#hist(y, plot=FALSE)
top = max(c(xhist$p1$density, yhist$p1$density, xhist$p2$density, yhist$p2$density))
par(mar=c(3,3,1,1))
plot(x1,y1, main=main, col=rgb(0,0,1,1/4), pch=46)
points(x2,y2,col=rgb(1,0,0,1/4), pch=46)
if(!is.null(legendTxt)){
legend("topright", c(legendTxt[1], legendTxt[2]), fill=c(rgb(1,0,0,1/4),rgb(0,0,1,1/4)))
}
par(mar=c(0,3,1,1))
barplot(xhist$p1$density, axes=FALSE, ylim=c(0, top), space=0, col=rgb(0,0,1,1/4))
barplot(xhist$p2$density, axes=FALSE, ylim=c(0, top), space=0, add=T, col=rgb(1,0,0,1/4))
par(mar=c(3,0,1,1))
barplot(yhist$p1$density, axes=FALSE, xlim=c(0, top), space=0, horiz=TRUE, col=rgb(0,0,1,1/4))
barplot(yhist$p2$density, axes=FALSE, xlim=c(0, top), space=0, horiz=TRUE, add=T, col=rgb(1,0,0,1/4))
par(oma=c(3,3,0,0))
mtext(xlab, side=1, line=1, outer=TRUE, adj=0,
at=.8 * (mean(x1) - min(x1))/(max(x1)-min(x1)))
mtext(ylab, side=2, line=1, outer=TRUE, adj=0,
at=(.8 * (mean(y1) - min(y1))/(max(y1) - min(y1))))
par(mfrow = c(1,1))
par(mar=c(5, 4, 4, 2) + 0.1 )
}
addGeneInfo<-function(genelist, path_detail){
#adds additional gene info to a list of genes
#indicates genes which did not have hugo records
#initilize variables
symtab = path_detail$symtable
# #assure there are no symbols given as "\\N"
# genelist = genelist[genelist!="\\N"]
#attempt to make sure the symbols match
genelist = corsym(genelist, symref=symtab, verbose=T)
#extract the needed columns
xsymtab = symtab[,c("Approved.Symbol", "Approved.Name", "Locus.Type", "Chromosome", "Previous.Names")]
rownames(xsymtab) = xsymtab$Approved.Symbol
#figure out which gene symbols we actually have info on:
have = genelist%in%xsymtab$Approved.Symbol
#reduce the gene list to those we have info on and report those we dont have info on
goodset = genelist[have]
badset = genelist[!have]
out=list()
out$'Genes and their records' = xsymtab[goodset,]
out$'Gene symbols for which records could not be found' = badset
return(out)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.