R/ROKETworkflow.R

Defines functions new_ANA_AGG ANA_TEST_final RDA_REG_final get_fNULL_MOD new_ANA_COV RDA_getEUC_DIST RDA_getOT_DIST RDA_makeCOVAR RDA_getCYT RDA_calcTMB RDA_prepCLIN calc_full_DIST comb_geneSim run_full_geneSim make_tb1 get_full_geneSim_ME get_geneSim_ME get_full_geneSim_path get_geneSim_path make_SPM_mat prep_SPMs remove_lowImpact keep_varClass keep_GDCFILTER get_GTF get_PATH down_SPMs down_PanCan get_REF name_TUMORTYPES make_DP_VAF smart_DTR make_sampID setdirs

Documented in ANA_TEST_final calc_full_DIST new_ANA_AGG run_full_geneSim setdirs

# ----------
# ROKET workflow functions
# ----------

# ----------
# Minor functions
# ----------

#' @title setdirs
#' @param git_dir A string containing the full
#'	path to the parent directory containing
#'	the cloned ROKETworkflow repository
#' @param work_dir A string containing the full
#'	path to the working directory for downloading,
#'	processing, and analyzing real data
#' @param dataset A string for the abbreviated
#'	cancer type such as BRCA, LGG, STAD, etc.
#' @param verbose Boolean set to TRUE to display
#'	verbose output
#' @export
setdirs = function(git_dir,work_dir,dataset,verbose = TRUE){
	
	# Make directories
	if( verbose ) cat(sprintf("%s: Make directories ...\n",date()))
	if( !dir.exists(work_dir) ) stop(sprintf("Make working directory = %s",work_dir))
	proj_dir 	= file.path(work_dir,"kSMASH"); smart_mkdir(proj_dir)
	ref_dir 	= file.path(proj_dir,"REF"); 		smart_mkdir(ref_dir)
	panc_dir 	= file.path(proj_dir,"PanCan"); smart_mkdir(panc_dir)
	type_dir 	= file.path(proj_dir,dataset); 	smart_mkdir(type_dir)
	spms_dir 	= file.path(type_dir,"SPMs"); 	smart_mkdir(spms_dir)
	out_dir 	= file.path(type_dir,"output"); smart_mkdir(out_dir)
	ithOT_dir	= file.path(type_dir,"OT"); smart_mkdir(ithOT_dir)
	reg_dir		= file.path(out_dir,"regress"); smart_mkdir(reg_dir)
	fin_dir		= file.path(proj_dir,"final");	smart_mkdir(fin_dir)
	
	# Output
	if( verbose ) cat(sprintf("%s: Output ...\n",date()))
	list(git_dir = git_dir,proj_dir = proj_dir,ref_dir = ref_dir,
		panc_dir = panc_dir,type_dir = type_dir,spms_dir = spms_dir,
		dataset = dataset,ithOT_dir = ithOT_dir,out_dir = out_dir,
		reg_dir = reg_dir,fin_dir = fin_dir)
	
}
make_sampID = function(origIDs){
	cat(sprintf("%s: Making universal ID ...\n",date()))
	sapply(origIDs,function(xx) 
		paste(strsplit(xx,"-")[[1]][1:3],collapse="-"),USE.NAMES=FALSE)
}
smart_DTR = function(...){
	data.table::fread(...,
		data.table = FALSE)
}
make_DP_VAF = function(DATA){
	req_names = c("t_alt_count","t_ref_count")
	smart_reqNames(DATA = DATA,REQ = req_names)
	
	DATA$tDP = DATA$t_alt_count + DATA$t_ref_count
	DATA$tVAF = DATA$t_alt_count / DATA$tDP
	DATA$tVAF[DATA$tDP == 0] = 0
	DATA
}
name_TUMORTYPES = function(){
	out = c()
	out = rbind(out,smart_df(V1 = "BLCA",V2 = "Bladder Urothelial Carcinoma"))
	out = rbind(out,smart_df(V1 = "BRCA",V2 = "Breast invasive carcinoma"))
	out = rbind(out,smart_df(V1 = "CESC",V2 = "Cervical squamous cell carcinoma and endocervical adenocarcinoma"))
	out = rbind(out,smart_df(V1 = "COAD",V2 = "Colon and Rectum adenocarcinoma"))
	out = rbind(out,smart_df(V1 = "ESCA",V2 = "Esophageal carcinoma"))
	out = rbind(out,smart_df(V1 = "GBM",V2 = "Glioblastoma multiforme"))
	out = rbind(out,smart_df(V1 = "HNSC",V2 = "Head and Neck squamous cell carcinoma"))
	out = rbind(out,smart_df(V1 = "KIRC",V2 = "Kidney renal clear cell carcinoma"))
	out = rbind(out,smart_df(V1 = "LGG",V2 = "Brain Lower Grade Glioma"))
	out = rbind(out,smart_df(V1 = "LIHC",V2 = "Liver hepatocellular carcinoma"))
	out = rbind(out,smart_df(V1 = "LUAD",V2 = "Lung adenocarcinoma"))
	out = rbind(out,smart_df(V1 = "LUSC",V2 = "Lung squamous cell carcinoma"))
	out = rbind(out,smart_df(V1 = "PAAD",V2 = "Pancreatic adenocarcinoma"))
	out = rbind(out,smart_df(V1 = "SARC",V2 = "Sarcoma"))
	out = rbind(out,smart_df(V1 = "SKCM",V2 = "Skin Cutaneous Melanoma"))
	out = rbind(out,smart_df(V1 = "STAD",V2 = "Stomach adenocarcinoma"))
	out = rbind(out,smart_df(V1 = "UCEC",V2 = "Uterine Corpus Endometrial Carcinoma"))
	
	return(out)
}


# ----------
# Real Data: Check/Download files
# ----------
get_REF = function(my_dirs){
	
	# GDC's hg38 GTF file
	gtf_fn = file.path(my_dirs$ref_dir,"gencode.gene.info.v22.tsv")
	if( !file.exists(gtf_fn) ){
		url_link = "https://api.gdc.cancer.gov/data/b011ee3e-14d8-4a97-aed4-e0b10f6bbe82"
		stop(sprintf("Download %s and rename %s",url_link,gtf_fn))
	}
	
	# Cytolytic activity
	url_link = "https://www.cell.com/cms/10.1016"
	url_link = sprintf("%s/j.cell.2014.12.033/attachment",url_link)
	url_link = sprintf("%s/fdbc71ba-1786-469f-9a7b-6c05e2ab76c4/mmc1.xlsx")
	cyt_fn = file.path(my_dirs$panc_dir,"mmc1 CYT.xlsx")
	if( !file.exists(cyt_fn) ){
		stop(sprintf("Download %s and rename %s",url_link,cyt_fn))
	}
	
}
down_PanCan = function(my_dirs,getCNA = FALSE,
	samp = NULL,override = FALSE,show = TRUE){
	
	get_REF(my_dirs = my_dirs)
	dataset = my_dirs$dataset
	
	# Source: https://gdc.cancer.gov/about-data/publications/pancanatlas
	
	# Download absolute purity/ploidy from TCGA PanCan
	if( show ) cat(sprintf("%s: Get Absolute purity/ploidy ...\n",date()))
	absPP_fn = file.path(my_dirs$panc_dir,"TCGA_mastercalls.abs_tables_JSedit.fixed.txt")
	if( !file.exists(absPP_fn) ){
		url_link = "http://api.gdc.cancer.gov/data/4f277128-f793-4354-a13d-30cc7fe9f6b5"
		stop(sprintf("Download %s and rename %s",url_link,absPP_fn))
	}
	
	# Download absolute copy number from TCGA PanCan
	if( show ) cat(sprintf("%s: Get Absolute copy number ...\n",date()))
	absCN_fn = file.path(my_dirs$panc_dir,"TCGA_mastercalls.abs_segtabs.fixed.txt")
	absCN_fn2 = sprintf("%s.gz",absCN_fn)
	if( !file.exists(absCN_fn2) ){
		url_link = "http://api.gdc.cancer.gov/data/0f4f5701-7b61-41ae-bda9-2805d1ca9781"
		stop(sprintf("Download %s, unzip and rename %s",url_link,absCN_fn))
	}
	absCN_fn = absCN_fn2
	
	# Download TCGA PanCan clinical outcomes
	if( show ) cat(sprintf("%s: Get TCGA PanCan clinical outcomes ...\n",date()))
	clin_fn = file.path(my_dirs$panc_dir,"TCGA-CDR-SupplementalTableS1.xlsx")
	if( !file.exists(clin_fn) ){
		url_link = "https://api.gdc.cancer.gov/data/1b5f413e-a8d1-4d10-92eb-7c4ae739ed81"
		stop(sprintf("Download %s and rename %s",url_link,clin_fn))
	}
	
	# Import/Polish clin data
	if( show ) cat(sprintf("%s: Import/Prep clinical data ...\n",date()))
	clin = suppressWarnings(readxl::read_excel(clin_fn,sheet = "TCGA-CDR"))
	clin = as.data.frame(clin,stringsAsFactors = FALSE)
	clin = clin[,names(clin) != "...1"]
	clin = name_change(clin,"bcr_patient_barcode","ID")
	for(vv in names(clin)){
		idx = which(clin[,vv] %in% c("[Not Available]","[Not Applicable]"))
		if( length(idx) > 0 ) clin[idx,vv] = NA
	}
	# dim(clin); clin[1:3,]
	
	# Subset tumor type (clin, copy number, and SPMs)
	if( show ) cat(sprintf("%s: Subset tumortype = %s ...\n",date(),dataset))
	smart_table(clin$type)
	if( dataset == "COAD" ){
		clin = clin[which(clin$type %in% c("COAD","READ")),]
	} else if( dataset %in% c("BLCA","BRCA","CESC","ESCA","GBM","HNSC","KIRC",
			"LAML","LGG","LIHC","LUAD","LUSC","OV","PAAD","SARC","SKCM",
			"STAD","UCEC") ){
		clin = clin[which(clin$type == dataset),]
	} else {
		stop("Add code in down_PanCan()")
	}
	# dim(clin); clin[1:3,]
	
	if( show ) cat(sprintf("%s: Import ABSOLUTE purity/ploidy data ...\n",date()))
	absPP = smart_DTR(absPP_fn,sep = "\t",header = TRUE)
	absPP$ID = make_sampID(origIDs = absPP$array)
	absPP = absPP[which(absPP$ID %in% clin$ID),]
	
	# remove samples with multiple CN
	if( show ) cat(sprintf("%s: Remove sample duplicates with lower purity ...\n",date()))
	tab = table(absPP$ID)
	tab = tab[tab > 1]
	for(ID in names(tab)){
		idx = which(absPP$ID == ID)
		# remove duplicate with lower purity
		tmp_df = absPP[idx,]
		tmp_df = tmp_df[order(tmp_df$purity),]
		rm_samp = tmp_df$sample[1]
		absPP = absPP[which(absPP$sample != rm_samp),]
	}
	# dim(absPP); absPP[1:3,]
	dat = smart_merge(clin,absPP,all.x = TRUE); rm(absPP)
	
	# Get follow up CLIN pancancer data
	fu_fn = file.path(my_dirs$panc_dir,"clinical_PANCAN_patient_with_followup.tsv")
	fu = smart_DTR(fu_fn,header = TRUE,sep = "\t",data.table = FALSE)
	fu = name_change(fu,"bcr_patient_barcode","ID")
	int_subj = intersect(fu$ID,dat$ID); length(int_subj)
	fu = fu[fu$ID %in% int_subj,] # get tumor-type specific subjects
	fu = apply(fu,2,function(xx){
		xx[xx %in% c("","[Not Applicable]","[Not Available]",
			"[Unknown]","[Not Reported]","[Not Evaluated]","[Discrepancy]")] = NA
		xx
		})
	fu = smart_df(fu)
	num_uniq_value = apply(fu,2,function(xx) length(unique(xx))) # remove columns with no variation
	table(num_uniq_value)
	fu = fu[,num_uniq_value > 1]
	int_vars = intersect(names(fu),names(dat)); length(int_vars)
	int_vars = int_vars[int_vars != "ID"]
	fu = fu[,!(names(fu) %in% int_vars)]
	if( "height" %in% names(fu) ) fu$height = as.numeric(fu$height)
	if( "weight" %in% names(fu) ) fu$weight = as.numeric(fu$weight)
	if( all(c("height","weight") %in% names(fu)) ){
		fu$BMI = fu$weight / (fu$height / 100)^2
	}
	dim(fu); fu[1:3,]
	dim(dat)
	dat = smart_merge(dat,fu,all = TRUE)
	dim(dat)
	
	if( getCNA ){
		if( show ) cat(sprintf("%s: Import ABSOLUTE CN data ...\n",date()))
		absCN = smart_DTR(absCN_fn,sep = "\t",header = TRUE)
		absCN = absCN[which(absCN$Sample %in% dat$array),]
		absCN$ID = make_sampID(origIDs = absCN$Sample)
		absCN$Chr = paste0("chr",absCN$Chromosome)
		absCN = absCN[which(!is.na(absCN$Chromosome)),]
		absCN$tCN = absCN$Modal_HSCN_1 + absCN$Modal_HSCN_2
		# dim(absCN); absCN[1:3,]
	} else {
		absCN = NULL
	}
	
	# Other PanCan info like sequencing center and WGA status
	if( show ) cat(sprintf("%s: Get WGA and seq center data ...\n",date()))
	clin2_fn = file.path(my_dirs$panc_dir,"12864_2017_3770_MOESM2_ESM.xlsx")
	if( !file.exists(clin2_fn) ){
		url_link = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5467262/bin/12864_2017_3770_MOESM2_ESM.xlsx"
		stop(sprintf("Download %s and rename %s",url_link,clin2_fn))
	}
	
	clin_ex_fn = file.path(my_dirs$panc_dir,"clin_ex.rds")
	if( !file.exists(clin_ex_fn) ){
		clin2 = read_excel(clin2_fn,sheet = 1)
		clin2 = as.data.frame(clin2,stringsAsFactors = FALSE)
		saveRDS(clin2,clin_ex_fn)
	}
	clin2 = readRDS(clin_ex_fn)
	
	if( dataset == "COAD" ){
		clin2 = clin2[which(clin2$CANCER %in% c("COAD","READ")),]
	} else if( dataset %in% c("BLCA","BRCA","CESC","ESCA","GBM","HNSC","KIRC",
			"LAML","LGG","LIHC","LUAD","LUSC","OV","PAAD","SARC","SKCM",
			"STAD","UCEC") ){
		clin2 = clin2[which(clin2$CANCER == dataset),]
	} else {
		stop("Add code for this dataset")
	}
	
	clin2$ID = make_sampID(origIDs = clin2$TCGA_ID)
	clin2 = smart_rmcols(OBJ = clin2,rm_names = c("TCGA_ID","VCF_ID"))
	dim(clin2); clin2[1:3,]
	
	if( show ) cat(sprintf("%s: Merge clinical and sequencing data ...\n",date()))
	dat = smart_merge(dat,clin2[,c("ID","SAMPLE_TYPE","WGA_STATUS",
		"CENTER","SEQUENCER","KIT","BWA","X20","YEAR_PUBLISHED")],all = TRUE)
	
	if( dataset == "BRCA" ){
		url_link = "https://www.cell.com/cms/10.1016/j.ccell.2018.03.014"
		url_link = sprintf("%s/attachment/dd9a9dca-90c0-42f4-8d9d-304ca0cffd7e/mmc4.xlsx",url_link)
		pam50_fn = file.path(my_dirs$panc_dir,
			"1-s2.0-S1535610818301193-mmc4_Berger2018_gynec_breast.xlsx")
		if( !file.exists(pam50_fn) ){
			stop(sprintf("Download %s and rename %s",url_link,pam50_fn))
		}
		pam50 = suppressWarnings(read_excel(pam50_fn,skip = 1))
		pam50 = as.data.frame(pam50,stringsAsFactors = FALSE)
		pam50 = name_change(pam50,"Sample.ID","ID")
		smart_table(pam50$Tumor.Type,pam50$vital_status)
		pam50 = pam50[which(pam50$Tumor.Type == "BRCA"),]
		pam50 = name_change(pam50,"BRCA_Subtype_PAM50","PAM50")
		pam50$PAM50[pam50$PAM50 == "NA"] = NA
		smart_table(pam50$PAM50,pam50$vital_status)
		pam50 = pam50[,c("ID","PAM50")]
		dim(pam50); pam50[1:3,]
		dat = smart_merge(dat,pam50,all = TRUE)
	}
	
	if( !is.null(samp) ){ # Subset sample's data
		cat(sprintf("%s: Subset sample data ...\n",date()))
		absCN = absCN[which(absCN$ID == samp),]
		dat = dat[which(dat$ID == samp),]
	}
	
	list(absCN = absCN,dat = dat)
}
down_SPMs = function(my_dirs){
	dataset = my_dirs$dataset
	
	if( dataset == "BLCA" ){
		spms_fn = file.path(my_dirs$spms_dir,
			"TCGA.BLCA.mutect.0e239d8f-47b0-4e47-9716-e9ecc87605b9.DR-10.0.somatic.maf.gz")
	} else if( dataset == "BRCA" ){
		spms_fn = file.path(my_dirs$spms_dir,
			"TCGA.BRCA.mutect.995c0111-d90b-4140-bee7-3845436c3b42.DR-10.0.somatic.maf.gz")
	} else if( dataset == "CESC" ){
		spms_fn = file.path(my_dirs$spms_dir,
			"TCGA.CESC.mutect.5ffa70b1-61b4-43d1-b10a-eda412187c17.DR-10.0.somatic.maf.gz")
	} else if( dataset == "ESCA" ){
		spms_fn = file.path(my_dirs$spms_dir,
			"TCGA.ESCA.mutect.7f8e1e7c-621c-4dfd-8fad-af07c739dbfc.DR-10.0.somatic.maf.gz")
	} else if( dataset == "GBM" ){
		spms_fn = file.path(my_dirs$spms_dir,
			"TCGA.GBM.mutect.da904cd3-79d7-4ae3-b6c0-e7127998b3e6.DR-10.0.somatic.maf.gz")
	} else if( dataset == "HNSC" ){
		spms_fn = file.path(my_dirs$spms_dir,
			"TCGA.HNSC.mutect.1aa33f25-3893-4f37-a6a4-361c9785d07e.DR-10.0.somatic.maf.gz")
	} else if( dataset == "KIRC" ){
		spms_fn = file.path(my_dirs$spms_dir,
			"TCGA.KIRC.mutect.2a8f2c83-8b5e-4987-8dbf-01f7ee24dc26.DR-10.0.somatic.maf.gz")
	} else if( dataset == "LAML" ){
		spms_fn = file.path(my_dirs$spms_dir,
			"TCGA.LAML.mutect.27f42413-6d8f-401f-9d07-d019def8939e.DR-10.0.somatic.maf.gz")
	} else if( dataset == "LGG" ){
		spms_fn = file.path(my_dirs$spms_dir,
			"TCGA.LGG.mutect.1e0694ca-fcde-41d3-9ae3-47cfaf527f25.DR-10.0.somatic.maf.gz")
	} else if( dataset == "LIHC" ){
		spms_fn = file.path(my_dirs$spms_dir,
			"TCGA.LIHC.mutect.a630f0a0-39b3-4aab-8181-89c1dde8d3e2.DR-10.0.somatic.maf.gz")
	} else if( dataset == "LUAD" ){
		spms_fn = file.path(my_dirs$spms_dir,
			"TCGA.LUAD.mutect.0458c57f-316c-4a7c-9294-ccd11c97c2f9.DR-10.0.somatic.maf.gz")
	} else if( dataset == "LUSC" ){
		spms_fn = file.path(my_dirs$spms_dir,
			"TCGA.LUSC.mutect.95258183-63ea-4c97-ae29-1bae9ed06334.DR-10.0.somatic.maf.gz")
	} else if( dataset == "OV" ){
		spms_fn = file.path(my_dirs$spms_dir,
			"TCGA.OV.mutect.b22b85eb-2ca8-4c9f-a1cd-b77caab999bd.DR-10.0.somatic.maf.gz")
	} else if( dataset == "PAAD" ){
		spms_fn = file.path(my_dirs$spms_dir,
			"TCGA.PAAD.mutect.fea333b5-78e0-43c8-bf76-4c78dd3fac92.DR-10.0.somatic.maf.gz")
	} else if( dataset == "SARC" ){
		spms_fn = file.path(my_dirs$spms_dir,
			"TCGA.SARC.mutect.cc207fe8-ee0a-4b65-82cb-c8197d264126.DR-10.0.somatic.maf.gz")
	} else if( dataset == "SKCM" ){
		spms_fn = file.path(my_dirs$spms_dir,
			"TCGA.SKCM.mutect.4b7a5729-b83e-4837-9b61-a6002dce1c0a.DR-10.0.somatic.maf.gz")
	} else if( dataset == "STAD" ){
		spms_fn = file.path(my_dirs$spms_dir,
			"TCGA.STAD.mutect.c06465a3-50e7-46f7-b2dd-7bd654ca206b.DR-10.0.somatic.maf.gz")
	} else if( dataset == "THCA" ){
		spms_fn = file.path(my_dirs$spms_dir,
			"TCGA.THCA.mutect.13999735-2e70-439f-a6d9-45d831ba1a1a.DR-10.0.somatic.maf.gz")
	} else if( dataset == "UCEC" ){
		spms_fn = file.path(my_dirs$spms_dir,
			"TCGA.UCEC.mutect.d3fa70be-520a-420e-bb6d-651aeee5cb50.DR-10.0.somatic.maf.gz")
	} else {
		stop("No code for that tumor type")
	}
	
	if( dataset != "COAD" && !file.exists(spms_fn) ){
		stop(sprintf("Download %s SPMs and rename %s",
			dataset,spms_fn))
	} else if( dataset == "COAD" ){
		coad_spms_fn = file.path(my_dirs$spms_dir,
			"TCGA.COAD.mutect.03652df4-6090-4f5a-a2ff-ee28a37f9301.DR-10.0.somatic.maf.gz")
		if( !file.exists(coad_spms_fn) )
			stop(sprintf("Download COAD SPMs and rename %s",coad_spms_fn))
		
		read_spms_fn = file.path(my_dirs$spms_dir,
			"TCGA.READ.mutect.faa5f62a-2731-4867-a264-0e85b7074e87.DR-10.0.somatic.maf.gz")
		if( !file.exists(read_spms_fn) )
			stop(sprintf("Download READ SPMs and rename %s",read_spms_fn))
		spms_fn = c(coad_spms_fn,read_spms_fn)
	
	}
	
	spms_fn
}
get_PATH = function(my_dirs,gmt_fn = NULL){
	
	if( is.null(gmt_fn) ){
		gmt_fn = file.path(my_dirs$git_dir,
			"ROKETworkflow/inst/extdata",
			"c2.cp.reactome.v7.2.symbols.gmt")
	}
	
	gmt = list()
	tmp_gmt = readLines(gmt_fn)
	tmp_gmt = t(sapply(tmp_gmt,function(xx){
		yy = strsplit(xx,"\t")[[1]][-2]
		c(yy[1],paste(yy[-1],collapse=" "))
		},USE.NAMES = FALSE))
	dim(tmp_gmt); tmp_gmt[1:4,]
	
	# make gene by pathway matrix
	nPATH = nrow(tmp_gmt)
	uGENE = unique(strsplit(paste(tmp_gmt[,2],collapse = " ")," ")[[1]])
	nGENE = length(uGENE); nGENE
	pp = matrix(0,nGENE,nPATH)
	pp = smart_names(pp,ROW = uGENE,COL = tmp_gmt[,1])
	pp[1:4,1:2]
	for(ii in seq(nPATH)){
		# ii = 1
		smart_progress(ii = ii,nn = nPATH,iter = 2e1,iter2 = 1e3)
		tmp_genes = strsplit(tmp_gmt[ii,2]," ")[[1]]
		pp[tmp_genes,ii] = 1
		rm(tmp_genes)
	}
	pp = pp[rowSums(pp) > 0,]
	
	return(pp)
}
get_GTF = function(my_dirs){
	
	gtf_fn = file.path(my_dirs$ref_dir,"gencode.gene.info.v22.tsv")
	gtf = data.table::fread(gtf_fn,sep = "\t",header = TRUE)
	table(gtf$gene_type)
	gtf = gtf[gtf$gene_type == "protein_coding" 
		& gtf$seqname %in% paste0("chr",1:22),]
	gtf = name_change(gtf,"gene_id","ENSG")
	gtf = name_change(gtf,"gene_name","GENE")
	gtf = name_change(gtf,"seqname","Chr")
	gtf$ENSG0 = sapply(gtf$ENSG,function(xx) 
		strsplit(xx,"[.]")[[1]][1],USE.NAMES = FALSE)
	dim(gtf); gtf[1:3,]
	
	return(gtf)
	
}


# ----------
# Real Data: Pre-Processing functions
# ----------
keep_GDCFILTER = function(DATA,VAR = "GDC_FILTER",rmWGA = TRUE){
	cat(sprintf("%s: Filter ndp ...\n",date()))
	DATA = DATA[!grepl("ndp",DATA[[VAR]]),,drop=FALSE]
	cat(sprintf("%s: Filter NonExonic ...\n",date()))
	DATA = DATA[!grepl("NonExonic",DATA[[VAR]]),,drop=FALSE]
	cat(sprintf("%s: Filter exac ...\n",date()))
	DATA = DATA[!grepl("common_in_exac",DATA[[VAR]]),,drop=FALSE]
	cat(sprintf("%s: Filter bitgt ...\n",date()))
	DATA = DATA[!grepl("bitgt",DATA[[VAR]]),,drop=FALSE]
	cat(sprintf("%s: Filter gdc_pon ...\n",date()))
	DATA = DATA[!grepl("gdc_pon",DATA[[VAR]]),,drop=FALSE]
	if( rmWGA ){
		cat(sprintf("%s: Filter wga ...\n",date()))
		DATA = DATA[!grepl("wga",DATA[[VAR]]),,drop=FALSE]
	}
	DATA
}
keep_varClass = function(DATA,VAR = "Variant_Classification"){
	cat(sprintf("%s: Filter specific variant classes ...\n",date()))
	keep_classes = c("Frame_Shift_Del","Frame_Shift_Ins",
		"In_Frame_Del","In_Frame_Ins","Missense_Mutation","Nonsense_Mutation","Nonstop_Mutation",
		"Splice_Region","Splice_Site","Translation_Start_Site")
	DATA[which(DATA[[VAR]] %in% keep_classes),,drop=FALSE]
}
remove_lowImpact = function(DATA,VAR = "Consequence"){
	cat(sprintf("%s: Filter synonymous ...\n",date()))
	DATA = DATA[!grepl("synonymous",DATA[[VAR]]),,drop=FALSE]
	cat(sprintf("%s: Filter 3_prime_UTR ...\n",date()))
	DATA = DATA[!grepl("3_prime_UTR",DATA[[VAR]]),,drop=FALSE]
	cat(sprintf("%s: Filter 5_prime_UTR ...\n",date()))
	DATA = DATA[!grepl("5_prime_UTR",DATA[[VAR]]),,drop=FALSE]
	cat(sprintf("%s: Filter NMD ...\n",date()))
	DATA = DATA[!grepl("NMD_",DATA[[VAR]]),,drop=FALSE]
	cat(sprintf("%s: Filter intron ...\n",date()))
	DATA = DATA[!grepl("intron_",DATA[[VAR]]),,drop=FALSE]
	cat(sprintf("%s: Filter non_coding_transcript ...\n",date()))
	DATA = DATA[!grepl("non_coding_transcript",DATA[[VAR]]),,drop=FALSE]
	cat(sprintf("%s: Filter stop_retain ...\n",date()))
	DATA = DATA[!grepl("stop_retain",DATA[[VAR]]),,drop=FALSE]
	DATA
}
prep_SPMs = function(my_dirs,samp = NULL,rmWGA = TRUE){
	rm_names = c("all_effects","src_vcf_id","tumor_bam_uuid",
		"normal_bam_uuid","case_id","Matched_Norm_Sample_Barcode",
		"Tumor_Sample_UUID","Matched_Norm_Sample_UUID")
	
	# Get SPMs
	dataset = my_dirs$dataset
	raw_spms_fn = file.path(my_dirs$spms_dir,"raw_spms.rds")
	if( !file.exists(raw_spms_fn) ){
		spms_fn = down_SPMs(my_dirs = my_dirs)
		spms = c()
		for(fn in spms_fn){
			tmp_spms = smart_DTR(fn,header = TRUE,sep = "\t")
			tmp_spms = smart_rmcols(OBJ = tmp_spms,rm_names = rm_names)
			tmp_spms = make_DP_VAF(DATA = tmp_spms)
			spms = rbind(spms,tmp_spms)
		}
		spms$ID = make_sampID(origIDs = spms$Tumor_Sample_Barcode)
		saveRDS(spms,raw_spms_fn)
	}
	spms = readRDS(raw_spms_fn)
	
	# Filter VCs
	spms = keep_GDCFILTER(DATA = spms,rmWGA = rmWGA)
	spms = keep_varClass(DATA = spms)
	spms = remove_lowImpact(DATA = spms)
	
	if( !is.null(samp) ){ # Subset sample data
		cat(sprintf("%s: Subset sample's SPMs ...\n",date()))
		spms = spms[which(spms$ID == samp),,drop=FALSE]
	}
	
	spms
}
make_SPM_mat = function(my_dirs,rmWGA = TRUE){
	
	dataset = my_dirs$dataset
	mat_fn = file.path(my_dirs$spms_dir,sprintf("%s_mat.rds",dataset))
	if( file.exists(mat_fn) ){
		mut_spm = readRDS(mat_fn)
		return(mut_spm)
	}
	
	# Process gtf: Convert all genes to Hugo symbols
	gtf = get_GTF(my_dirs = my_dirs)
	dim(gtf); gtf[1:3,]
	
	# Get SPMs
	spms = prep_SPMs(my_dirs = my_dirs,rmWGA = rmWGA)
	spms = spms[which(spms$t_alt_count > 0),]
	length(unique(spms$Gene))
	length(unique(spms$Hugo_Symbol))
	
	# Make SPMs MUT matrix
	cat(sprintf("%s: Make SPM mutation matrix ...\n",date()))
	spms_ids = unique(spms$ID)
	nsamp_spms = length(spms_ids); nsamp_spms
	mut_spm = matrix(0,nrow(gtf),nsamp_spms)
	mut_spm = smart_names(mut_spm,ROW = gtf$ENSG0,COL = spms_ids)
	# mut_spm[1:5,1:4]
	cnt = 1
	for(samp in spms_ids){
		# samp = spms_ids[1]; samp
		smart_progress(ii = cnt,nn = nsamp_spms,iter = 5,iter2 = 1e2)
		idx = which(spms$ID == samp)
		ensgs = unique(spms$Gene[idx])
		ensgs = intersect(ensgs,rownames(mut_spm))
		# ensgs[1:10]
		mut_spm[ensgs,samp] = 1
		cnt = cnt + 1
		rm(idx,ensgs)
	}
	
	mut_spm = mut_spm[rowSums(mut_spm) >= 1,]
	gtf = gtf[which(gtf$ENSG0 %in% rownames(mut_spm)),]
	gtf = gtf[match(rownames(mut_spm),gtf$ENSG0),]
	rownames(mut_spm) = gtf$GENE
	dim(mut_spm); mut_spm[1:10,1:5]
	saveRDS(mut_spm,mat_fn)
	
	return(mut_spm)
}


# ----------
# Real Data: Gene Similarity functions
# ----------
get_geneSim_path = function(gene1,gene2,path,NN){
	
	if( gene1 == gene2 ){
		return(0)
	}
	
	# NN = num balls = num genes
	idx1 = path[gene1,] == 1
	idx2 = path[gene2,] == 1
	
	# KK = num white balls = num genes with same pathways as gene1 and gene2
	idx_path = which(idx1 & idx2); # idx_path
	num_path = length(idx_path); num_path
	
	if( num_path > 0 ){
		idx_gene = rowSums(path[,idx_path,drop = FALSE]) == num_path
		KK = sum(idx_gene); KK
	} else {
		KK = 0
	}
	
	if( KK < 2 ){
		# stop(sprintf("check %s and %s",gene1,gene2))
		return(1)
	}
	
	# Calculate prob of drawing two genes that are on the same pathway
	prob = dhyper(
		x = 2,       # num white balls drawn
		m = KK,      # num white balls in urn
		n = NN - KK, # num black balls in urn
		k = 2        # num balls drawn
		)
	prob
	
}
get_full_geneSim_path = function(use_genes,path){
	
	use_genes2 = intersect(use_genes,rownames(path))
	ngenes = length(use_genes2); ngenes
	mat = matrix(NA,ngenes,ngenes)
	mat = smart_names(mat,ROW = use_genes2,COL = use_genes2)
	NN = nrow(path)
	
	cnt = 1; tot = ngenes^2
	for(ii in seq(ngenes)){
	for(jj in seq(ngenes)){
		smart_progress(ii = cnt,nn = tot,iter = 1e2,iter2 = 4e3)
		
		if( ii <= jj ){
			# ii = 5; jj = ii
			tmp_out = tryCatch(get_geneSim_path(
				gene1 = use_genes2[ii],
				gene2 = use_genes2[jj],
				path = path,NN = NN),
				warning = function(ww){
					cmd = sprintf("check %s and %s:",
						use_genes2[ii],use_genes2[jj])
					stop(cmd)
					})
			
			mat[ii,jj] = tmp_out
			mat[jj,ii] = mat[ii,jj]
			
		}
		cnt = cnt + 1
	}}
	
	return(mat)
}
get_geneSim_ME = function(gene1,gene2,mSPM,log10_TMB){
	if(FALSE){
		gene1 = "DMD"
		gene2 = "ABCA13"
		# mSPM
		
	}
	
	tab = table(mSPM[gene1,],mSPM[gene2,])
	names(attr(tab,"dimnames")) = c(gene1,gene2); tab
	marg_out = fisher.test(tab,alternative = "less")
	MARG = list(OR = as.numeric(marg_out$estimate),
		PVAL = marg_out$p.value)
	
	log10_TMB2 = (log10_TMB)^2
	glm_out1 = tryCatch(glm(mSPM[gene2,] ~ mSPM[gene1,]
		+ log10_TMB + log10_TMB2,
		family = "binomial"),warning = function(ww){NULL})
	# summary(glm_out)
	
	glm_out2 = tryCatch(glm(mSPM[gene1,] ~ mSPM[gene2,]
		+ log10_TMB + log10_TMB2,
		family = "binomial"),warning = function(ww){NULL})
	# summary(glm_out2)
	
	if( is.null(glm_out1) | is.null(glm_out2) | MARG$OR == 0 ){
		COND = MARG
	} else {
		PVAL_1 = pnorm(summary(glm_out1)$coefficients[2,3])
		PVAL_2 = pnorm(summary(glm_out2)$coefficients[2,3])
		PVAL = sqrt(PVAL_1 * PVAL_2)
		COND = list(PVAL = PVAL)
	}
	
	list(MARG = MARG,COND = COND)
}
get_full_geneSim_ME = function(use_genes,mSPM,
	strat = "all",TYPE = "marg"){
	
	if(FALSE){
		min_gene_mut = 45
		use_genes = names(tb1)[which(tb1 >= min_gene_mut)]
		length(use_genes)
		
		use_genes = use_genes
		mSPM = matSPM
		strat = c("all","nHM","HM")[1]
		TYPE = c("marg","cond")[2]
		
	}
	
	DAT = smart_df(ID = colnames(mSPM),
		TMB = colSums(mSPM))
	DAT$TMB_bin = ifelse(log10(DAT$TMB) > 2.5,1,0)
	
	if( strat %in% c("HM","nHM") ){
		if( strat == "nHM" ) 	DAT = DAT[which(DAT$TMB_bin == 0),]
		if( strat == "HM" ) 	DAT = DAT[which(DAT$TMB_bin == 1),]
	}
	mSPM = mSPM[,DAT$ID]
	TMB = DAT$TMB
	log10_TMB = log10(TMB)
	
	dim(mSPM)
	int_genes = intersect(use_genes,rownames(mSPM))
	ngenes = length(int_genes); ngenes
	mat = matrix(NA,ngenes,ngenes)
	mat = smart_names(mat,int_genes,int_genes)
	
	cnt = 1; tot = ngenes^2
	for(ii in seq(ngenes)){
	for(jj in seq(ngenes)){
		# ii = 1; jj = 2
		smart_progress(ii = cnt,nn = tot,iter = 5e2,iter2 = 1e4)
		
		if( ii < jj ){
			
			# int_genes[ii]; int_genes[jj]
			
			ana_out = tryCatch(get_geneSim_ME(
				gene1 = int_genes[ii],
				gene2 = int_genes[jj],
				mSPM = mSPM,log10_TMB = log10_TMB),
				warning = function(ww){NULL})
			
			if( is.null(ana_out) ){
				stop("Check gene pair")
			}
			
			mat[ii,jj] = ifelse(TYPE == "marg",
				ana_out$MARG$PVAL,ana_out$COND$PVAL)
			mat[jj,ii] = mat[ii,jj]
			
		}
		
		cnt = cnt + 1
		
	}}
	diag(mat) = 0
	# dim(mat); mat[1:5,1:5]
	# summary(diag(mat))
	
	sim_mat = mat
	if(FALSE){ # Make PVAL matrix symmetric
		sim_mat[sim_mat > -1] = NA
		for(ii in seq(ngenes)){
		for(jj in seq(ngenes)){
			if( ii == jj ){
				sim_mat[ii,ii] = mat[ii,ii]
			} else if( ii < jj ){
				# Average the logs() <=> tilde(p_ij) = sqrt(p_ij * p_ji)
				sim_mat[ii,jj] = sqrt(mat[ii,jj] * mat[jj,ii])
				sim_mat[jj,ii] = sim_mat[ii,jj]
			}
		}}
	}
	
	sgenes = intersect(rownames(mat),
		c("TP53","KRAS","BRAF","PIK3CA","TTN"))
	
	# Show top gene pairs
  tmp_mat = sim_mat
  tmp_mat[lower.tri(tmp_mat,diag = TRUE)] = 1
  xx = tmp_mat
  xx = xx[upper.tri(xx)]
	top_num = min(c(20,length(xx))); top_num
  xx2 = unique(sort(xx))[1:top_num]; # xx2
  tmp_idx = which(tmp_mat < xx2[top_num],arr.ind = TRUE); # tmp_idx
	res = c()
	for(jj in seq(nrow(tmp_idx))){
		# jj = 1
		tmp_mat[tmp_idx[jj,1],tmp_idx[jj,2],drop = FALSE]
		res = rbind(res,smart_df(G1 = rownames(tmp_mat)[tmp_idx[jj,1]],
			G2 = rownames(tmp_mat)[tmp_idx[jj,2]],
			ME = tmp_mat[tmp_idx[jj,1],tmp_idx[jj,2]]))
	}
	rm(tmp_mat)
	res = res[order(res$ME),]
	rownames(res) = NULL
	# print(res)
	
	# Calculate geneSim
	gsim = -log(sim_mat)
	upnorm = quantile(gsim[upper.tri(gsim)],0.99); upnorm
	# upnorm = max(gsim[upper.tri(gsim)])
	gsim = gsim / upnorm
	# gsim[sgenes,sgenes]
	# any(gsim > 1)
	gsim[gsim > 1] = 1
	# diag(gsim) = 1 
	
	list(PVAL = mat,SIM = sim_mat,RES = res,GS = gsim)
}
make_tb1 = function(my_dirs,rmWGA = TRUE,show = TRUE){
	
	dataset = my_dirs$dataset
	
	# Save gene mutation frequency
	tb1_fn = file.path(my_dirs$spms_dir,sprintf("%s_tb1.rds",dataset))
	if( !file.exists(tb1_fn) ){
		
		# Get SPM data
		print(dataset)
		print(rmWGA)
		spms = prep_SPMs(my_dirs = my_dirs,rmWGA = rmWGA)
		print(spms[1:10,c("Tumor_Sample_Barcode","Hugo_Symbol")])
		u_spms = unique(spms[,c("Hugo_Symbol","Tumor_Sample_Barcode")])
		print(dim(u_spms))
		
		tb1 = table(u_spms$Hugo_Symbol)
		print(sort(tb1,decreasing = TRUE)[1:30])
		rm(spms,u_spms)
		saveRDS(tb1,tb1_fn)
		
	}
	tb1 = readRDS(tb1_fn)
	
	for(freq in c(1,2,5,10,15,20,30,40,50,60)){
		# freq = 1
		if( !show ) next
		cat(sprintf("\nGenes mutated at least %s times ...\n",freq))
		print(table(tb1 >= freq))
	}
	
	return(tb1)
}

#' @title run_full_geneSim
#' @param my_dirs A list generated by \code{setdirs()}
#'	of directories specific to a cancer type
#' @param min_gene_mut A non-negative integer to filter
#'	genes, based on mutation frequency, to include in distance 
#'	calculations
#' @param rmWGA Boolean set to TRUE to exclude WGA samples
#' @export
run_full_geneSim = function(my_dirs,min_gene_mut,rmWGA = TRUE){
	if(FALSE){
		dataset = "BLCA"
		min_gene_mut = 5
		rmWGA = TRUE
		
	}
	
	dataset = my_dirs$dataset
	tb1 = make_tb1(my_dirs = my_dirs,rmWGA = rmWGA)
	
	# Make SPM matrix
	matSPM = make_SPM_mat(my_dirs = my_dirs,rmWGA = rmWGA)
	
	# Set reference genes
	fav_genes = c("TP53","ATM","KRAS","BRAF",
		"NRAS","ERBB2","APC","SOX9","TTN")
	
	# Output GO-based of all genes
	all_GOsim_fn = file.path(my_dirs$spms_dir,"gsim_GO_all.rds")
	if( !file.exists(all_GOsim_fn) ){
		
		# Get hsGO
		cat(sprintf("%s: Get hsGO ...\n",date()))
		hsGO = godata("org.Hs.eg.db",
			keytype = "SYMBOL",ont = "BP",computeIC = FALSE)
		
		cat(sprintf("%s: Get GO-based gene_sim ...\n",date()))
		gene_sim_GO = mgeneSim(genes = names(tb1),
			semData = hsGO,measure = "Wang",verbose = TRUE,
			combine = "BMA")
		saveRDS(gene_sim_GO,all_GOsim_fn)
	}
	gene_sim_GO = readRDS(all_GOsim_fn)
	
	# Output PATH-based of all genes
	all_PATHsim_fn = file.path(my_dirs$spms_dir,"gsim_PATH_all.rds")
	if( !file.exists(all_PATHsim_fn) ){
		
		# Import Pathways
		cat(sprintf("%s: Import REACTOME ...\n",date()))
		react = get_PATH(my_dirs = my_dirs)
		dim(react); react[1:5,1:3]
		
		cat(sprintf("%s: Get PATH-based gene_sim ...\n",date()))
		tmp_out = get_full_geneSim_path(
			use_genes = names(tb1),path = react)
		gene_sim_PATH = 1 - tmp_out; rm(tmp_out)
		saveRDS(gene_sim_PATH,all_PATHsim_fn)
	}
	gene_sim_PATH = readRDS(all_PATHsim_fn)
	
	# Output ME-based of all genes
	all_MEsim_fn = file.path(my_dirs$spms_dir,"gsim_ME_all.rds")
	if( !file.exists(all_MEsim_fn) ){
		cat(sprintf("%s: Get ME-based gene_sim ...\n",date()))
		strat = "all"
		TYPE = "cond"
		tmp_gene_mut = 0
		while(TRUE){
			use_genes2 = names(tb1)[which(tb1 >= tmp_gene_mut)]
			cat(sprintf("minGM_thres = %s, nGenes = %s ...\n",
				tmp_gene_mut,length(use_genes2)))
			if( length(use_genes2) <= 300 ){ # need to set an upper limit
				break
			}
			tmp_gene_mut = tmp_gene_mut + 1
		}
		tmp_out = get_full_geneSim_ME(use_genes = use_genes2,
			mSPM = matSPM,strat = strat,TYPE = TYPE)
		gene_sim_ME = tmp_out$GS; rm(tmp_out)
		saveRDS(gene_sim_ME,all_MEsim_fn)
	}
	gene_sim_ME = readRDS(all_MEsim_fn)
	
	# If output file exists, we're done
	gsim_fn = file.path(my_dirs$spms_dir,
		sprintf("gsim_%s.rds",min_gene_mut))
	if( file.exists(gsim_fn) ){
		gsim = readRDS(gsim_fn)
		return(gsim)
	}
	
	# Decide on gene set
	use_genes = names(tb1)[which(tb1 >= min_gene_mut)]
	print(sprintf("Num genes = %s",length(use_genes)))
	
	if( length(use_genes) <= 3 ) return(NULL)
	
	# GO-based geneSim
	cat(sprintf("%s: Get GO-based geneSim ...\n",date()))
	int_GO_genes = intersect(use_genes,rownames(gene_sim_GO))
	gene_sim_GO = gene_sim_GO[int_GO_genes,int_GO_genes]
	
	# PATH-based geneSim
	cat(sprintf("%s: Get PATH-based geneSim ...\n",date()))
	int_PATH_genes = intersect(use_genes,rownames(gene_sim_PATH))
	gene_sim_PATH = gene_sim_PATH[int_PATH_genes,int_PATH_genes]
	# out = get_full_geneSim_path(use_genes = use_genes,
		# path = react)
	# gene_sim_PATH = 1 - out; rm(out)
	
	# ME-based geneSim
	cat(sprintf("%s: Get ME-based geneSim ...\n",date()))
	ME_genes = intersect(use_genes,rownames(gene_sim_ME))
	gene_sim_ME = gene_sim_ME[ME_genes,ME_genes]
	
	# Combine geneSims into array
	all_genes = unique(c(rownames(gene_sim_GO),
		rownames(gene_sim_PATH),
		rownames(gene_sim_ME)))
	num_genes = length(all_genes); num_genes

	fin = array(data = NA,dim = c(num_genes,num_genes,3),
		dimnames = list(all_genes,all_genes,c("GO","PATH","ME")))
	# str(fin)
	
	# GO
	genes_go = intersect(rownames(gene_sim_GO),all_genes)
	diag(fin[,,"GO"]) = 1
	fin[genes_go,genes_go,"GO"] = gene_sim_GO[genes_go,genes_go]
	rm(genes_go)
	any(is.na(fin[,,"GO"]))
	
	# PATH
	genes_path = intersect(rownames(gene_sim_PATH),all_genes)
	diag(fin[,,"PATH"]) = 1
	fin[genes_path,genes_path,"PATH"] = gene_sim_PATH[genes_path,genes_path]
	rm(genes_path)
	any(is.na(fin[,,"PATH"]))
	
	# ME
	genes_me = intersect(rownames(gene_sim_ME),all_genes)
	diag(fin[,,"ME"]) = 1
	fin[genes_me,genes_me,"ME"] = gene_sim_ME[genes_me,genes_me]
	rm(genes_me)
	any(is.na(fin[,,"ME"]))
	
	# Output
	saveRDS(fin,gsim_fn)
	
	return(fin)
	
}


# ----------
# Real Data: Optimal Transport Analysis
# ----------
comb_geneSim = function(WTS,geneSims){
	if(FALSE){
		WTS = rep(1,3)
		WTS = c(1,0,1)
		WTS = c(0,1,0)
		geneSims = gsim
	}
	
	# geneSims[1:5,1:5,]
	# any(geneSims[,,"PATH"] == 0)
	
	genes = rownames(geneSims); # genes
	ngenes = length(genes); ngenes
	fin = matrix(0,ngenes,ngenes)
	diag(fin) = 1
	fin = smart_names(fin,ROW = genes,COL = genes)
	
	for(ii in seq(ngenes)){
	for(jj in seq(ngenes)){
		if( ii < jj ){
			# ii = 1; jj = 5
			geneSims[ii,jj,]
			nWTS = WTS * (1 * !is.na(geneSims[ii,jj,]))
			if( sum(nWTS) == 0 ){
				nWTS = rep(0,3)
			} else {
				nWTS = nWTS / sum(nWTS)			
			}
			nWTS
			vec_gs = geneSims[ii,jj,]
			vec_gs[is.na(vec_gs)] = 0
			tmp_genesim = sum(vec_gs * nWTS)
			fin[ii,jj] = tmp_genesim
		}
	}}
	fin[lower.tri(fin)] = t(fin)[lower.tri(fin)]
	# fin[1:5,1:5]
	
	return(fin)
}

#' @title calc_full_DIST
#' @param my_dirs A list generated by \code{setdirs()}
#'	of directories specific to a cancer type
#' @param rmWGA Boolean set to TRUE to exclude WGA samples
#' @param min_gene_mut A non-negative integer to filter
#'	genes, based on mutation frequency, to include in distance 
#'	calculations
#' @param LAMBDA A positive numeric value or \code{Inf} to
#'	indicate unbalanced or balanced optimal transport distance
#'	calculation, respectively.
#' @param ncores A positive integer to specify the number of
#'	threads to decrease computational runtime.
#' @export
calc_full_DIST = function(my_dirs,rmWGA = TRUE,
	min_gene_mut,LAMBDA,ncores = 1){
	
	if(FALSE){
		rmWGA = TRUE
		min_gene_mut = 15
		# RULE = c("highLAM-lowTMB","highLAM-highTMB")[1]
		BAL = c(TRUE,FALSE)[1]
		LAMBDAs = c(1,1)
		
	}
	
	RULE = "highLAM-lowTMB"
	dataset = my_dirs$dataset
	
	# Set reference genes
	fav_genes = c("TP53","ATM","KRAS","BRAF","NRAS",
		"ERBB2","APC","SOX9","TTN","FRAS1")
	
	# Get geneSims
	gsim = run_full_geneSim(my_dirs = my_dirs,
		min_gene_mut = min_gene_mut,rmWGA = TRUE)
	str(gsim)
	sgenes = intersect(fav_genes,dimnames(gsim)[[1]]); sgenes
	round(gsim[sgenes,sgenes,],3)
	
	# Get SPM matrix
	mat_fn = file.path(my_dirs$spms_dir,sprintf("%s_mat.rds",dataset))
	matSPM = readRDS(mat_fn)
	# summary(colSums(matSPM))
	
	# Get weights
	wts = matrix(c(
		1,1,1, # equal
		1,0,0, # one approach
		0,1,0,
		0,0,1,
		1,1,0, # two approaches
		1,0,1,
		0,1,1),ncol = 3,byrow = TRUE)
	wts = t(apply(wts,1,function(xx) xx / sum(xx)))
	wts
	
	for(ww in seq(nrow(wts))){
		# ww = 4
		if( ww %in% c(1,5,6,7) ) next
		
		ww_str = paste(round(wts[ww,],3),collapse="-"); ww_str
		cat(sprintf("%s: Weights = (%s) ...\n",date(),ww_str))
		
		# Out filename
		LAMBDA_2 = LAMBDA
		BAL = ifelse(is.infinite(LAMBDA),TRUE,FALSE)
		if( BAL ) LAMBDA_2 = 1
		rds_fn = file.path(my_dirs$ithOT_dir,
			sprintf("OT_GM.%s_%s_BAL.%s_L1.%s_L2.%s_ww%s.rds",
			min_gene_mut,RULE,BAL,LAMBDA_2,LAMBDA_2,ww))
		if( file.exists(rds_fn) ) next
		
		# Define current geneSim
		tmp_gsim = comb_geneSim(WTS = wts[ww,],geneSims = gsim)
		# dim(tmp_gsim)
		# gsim[sgenes,sgenes,]
		# tmp_gsim[sgenes,sgenes]
		
		# Gene intersection
		int_genes = intersect(rownames(tmp_gsim),rownames(matSPM))
		length(int_genes)
		
		# Get gene COST
		COST = 1 - tmp_gsim[int_genes,int_genes]; rm(tmp_gsim)
		round(COST[sgenes,sgenes],5)
		
		# Define current sMUT
		sMUT = matSPM[int_genes,]
		sMUT = sMUT[,colSums(sMUT) > 0]
		dim(sMUT)
		
		if( any(is.na(COST)) ) stop("NA in COST")
		if( !all(diag(COST) == 0) ) stop("diag(COST) != 0")
		
		# Run OT
		if( !all(rownames(COST) == rownames(sMUT)) ) stop("genes not ordered")
		if( !all(colnames(COST) == rownames(sMUT)) ) stop("genes not ordered")
		EPS = 1e-3
		conv = 1e-3
		highLAM_lowMU = ifelse(RULE == "highLAM-lowTMB",TRUE,FALSE)
		subj_names = colnames(sMUT)#[1:40]
		ot_out = run_myOTs(ZZ = sMUT[,subj_names],COST = COST,
			EPS = EPS,LAMBDA1 = LAMBDA_2,LAMBDA2 = LAMBDA_2,
			balance = BAL,conv = conv,max_iter = 3e3,ncores = ncores,
			verbose = FALSE,show_iter = 50)
		
		# Save output
		saveRDS(ot_out,rds_fn)
		
	}
	
	return(NULL)
	
}


# ----------
# Real Data: Regression Analysis
# ----------
RDA_prepCLIN = function(my_dirs){
	
	dataset = my_dirs$dataset
	panc = down_PanCan(my_dirs = my_dirs)
	names(panc)
	DAT = panc$dat; # rm(panc)
	dim(DAT); DAT[1:3,]
	
	if( TRUE ){ # Get hg38 SPMs for batch variables
		uu = readRDS(file.path(my_dirs$spms_dir,"raw_spms.rds"))
		uu = unique(uu[,c("ID","Tumor_Sample_Barcode")])
		rownames(uu) = NULL
		uu$barcode_TSS = sapply(uu$Tumor_Sample_Barcode,
			function(xx) strsplit(xx,"-")[[1]][2],USE.NAMES = FALSE)
			table(uu$barcode_TSS)
		uu$barcode_plate = sapply(uu$Tumor_Sample_Barcode,
			function(xx) strsplit(xx,"-")[[1]][6],USE.NAMES = FALSE)
			table(uu$barcode_plate)
		uu$barcode_analyte = sapply(uu$Tumor_Sample_Barcode,
			function(xx) strsplit(strsplit(xx,"-")[[1]][5],"")[[1]][3],
			USE.NAMES = FALSE)
			table(uu$barcode_analyte)
		uu$barcode_SEQCENTER = sapply(uu$Tumor_Sample_Barcode,
			function(xx) strsplit(xx,"-")[[1]][7],USE.NAMES = FALSE)
			table(uu$barcode_SEQCENTER)
		# uu[1:3,]
		DAT = smart_merge(DAT,uu,all.x = TRUE)
		rm(uu)
	}
	if( TRUE ){ # Polish variables
		DAT = name_change(DAT,"ajcc_pathologic_tumor_stage","tumor_stage")
		DAT$tumor_stage[DAT$tumor_stage %in% paste0("Stage 0",c("","A","B","C"))] = "0"
		DAT$tumor_stage[DAT$tumor_stage %in% paste0("Stage I",c("","A","B","C"))] = "I"
		DAT$tumor_stage[DAT$tumor_stage %in% paste0("Stage II",c("","A","B","C"))] = "II"
		DAT$tumor_stage[DAT$tumor_stage %in% paste0("Stage III",c("","A","B","C"))] = "III"
		DAT$tumor_stage[DAT$tumor_stage %in% paste0("Stage IV",c("","A","B","C"))] = "IV"
		DAT$tumor_stage[DAT$tumor_stage %in% c("[Unknown]","[Discrepancy]")] = NA
		smart_table(DAT$tumor_stage)
		
		DAT = name_change(DAT,"histological_type","hist_type")
		DAT$hist_type[DAT$hist_type == "[Discrepancy]"] = NA
		
		DAT$hist_grade = DAT$histological_grade
		DAT$hist_grade[DAT$hist_grade == "[Discrepancy]"] = NA
		
		DAT$clin_stage = DAT$clinical_stage
		DAT$clin_stage = gsub("Stage ","",DAT$clin_stage)
		DAT$clin_stage[DAT$clin_stage %in% c("I","IA","IA1","IA2","IB",
			"IB1","IB2","IC")] = "I"
		DAT$clin_stage[DAT$clin_stage %in% c("II","IIA",
			"IIA1","IIA2","IIB","IIC")] = "II"
		DAT$clin_stage[DAT$clin_stage %in% c("III","IIIA",
			"IIIB","IIIC","IIIC1","IIIC2")] = "III"
		DAT$clin_stage[DAT$clin_stage %in% c("IV","IVA","IVB")] = "IV"
		
		DAT$BWA[DAT$BWA == "NA"] = NA
		smart_table(DAT$BWA)
		
		DAT$X20[DAT$X20 == "NA"] = NA
		DAT$X20 = as.numeric(DAT$X20)
		
		# Age
		DAT = name_change(DAT,"age_at_initial_pathologic_diagnosis","age")
		DAT$age_2 = DAT$age^2
		
		# Race
		DAT$race[DAT$race %in% c("AMERICAN INDIAN OR ALASKA NATIVE","ASIAN",
			"BLACK OR AFRICAN AMERICAN")] = "OTHER"
		DAT$race[grepl("NATIVE HAWA",DAT$race)] = "OTHER"
		DAT$race[is.na(DAT$race) | DAT$race %in% c("[Not Evaluated]","[Unknown]")] = NA
		smart_table(DAT$race)
		
		# Initial pathologic dx
		DAT$ipath_year = DAT$initial_pathologic_dx_year
		
	}
	
	if( dataset == "BLCA" ){
		DAT$tumor_stage[DAT$tumor_stage %in% c("I","II")] = "I/II"
		
		DAT$hist_grade = DAT$histological_grade
		DAT$hist_grade[DAT$hist_grade %in% c("[Unknown]","Low Grade")] = "Unknown/LowGrade"
		
		DAT$BWA[DAT$BWA %in% c("VN:0.5.7-6","VN:0.5.9-r16")] = "VN:0.5.REF"
		
		DAT$ipath_year[DAT$ipath_year %in% c("1999","2000","2001",
			"2002","2003","2004","2005","2006")] = "1999-2006"
		DAT$ipath_year[DAT$ipath_year %in% c("2007","2008","2009")] = "2007-2009"
		
		return(DAT)
		
	} else if( dataset == "BRCA" ){
		DAT$tumor_stage[DAT$tumor_stage %in% c("Stage X","IV")] = "IV/X"
		
		DAT$ipath_year[DAT$ipath_year %in% c("1988","1989",
			"1990","1991","1992","1993","1994","1995","1996")] = "1988-1996"
		DAT$ipath_year[DAT$ipath_year %in% c("1997","1998",
			"1999","2000")] = "1997-2000"
		DAT$ipath_year[DAT$ipath_year %in% c("2001","2002",
			"2003","2004","2005")] = "2001-2005"
		DAT$ipath_year[DAT$ipath_year %in% c("2006","2007",
			"2008","2009")] = "2006-2009"
		DAT$ipath_year[DAT$ipath_year %in% c("2010","2011",
			"2012","2013")] = "2010-2013"
		
		DAT$hist_type[!grepl("Infiltrating Ductal",DAT$hist_type)
			& !is.na(DAT$hist_type)] = "OtherCarc"
		
		DAT$SEQUENCER[grepl("Illumina HiSeq",DAT$SEQUENCER)] = "Illumina HiSeq"
		DAT$PAM50[DAT$PAM50 %in% c("Her2","Normal")] = "Her2,Normal"
		
		return(DAT)
		
	} else if( dataset == "CESC" ){
		DAT$ipath_year[DAT$ipath_year %in% c("1994","1995",
			"1996","1997","1998","1999","2000","2001","2002",
			"2003")] = "1994-2003"
		DAT$ipath_year[DAT$ipath_year %in% c("2004","2005",
			"2006","2007","2008","2009")] = "2004-2009"
		DAT$ipath_year[DAT$ipath_year %in% c("2010","2011",
			"2012","2013")] = "2010-2013"
		
		DAT$hist_grade[DAT$hist_grade %in% c("G1","G2",
			"GX")] = "G1/G2/GX"
		DAT$hist_grade[DAT$hist_grade %in% c("G3","G4")] = "G3/G4"
		
		DAT$hist_type[!grepl("Cervical Squa",DAT$hist_type)] = "NonCervSqua"
		
		DAT$clin_stage[DAT$clin_stage %in% c("II","III",
			"IV")] = "II/III/IV"
		
		DAT$BWA[DAT$BWA %in% c("VN:0.5.7-6","VN:0.5.9-r16")] = "nonVN:0.5.9"
		
		return(DAT)
		
	} else if( dataset == "COAD" ){
		DAT$hist_type[DAT$hist_type == "Colon Adenocarcinoma"] = "CA"
		# DAT$hist_type[DAT$hist_type == "Colon Mucinous Adenocarcinoma"] = "CMA"
		DAT$hist_type[grepl("Mucinous",DAT$hist_type)] = "MucAdeno"
		
		DAT$tumor_status2 = DAT$tumor_status
		DAT$tumor_status2[DAT$tumor_status2 == "[Discrepancy]"] = "WITH TUMOR"
		
		# Double check meaning of this variable: related to treatment? why CYT?
		DAT$ipath_year[DAT$ipath_year %in% c("1998","1999","2000","2001","2002")] = "1998-2002"
		DAT$ipath_year[DAT$ipath_year %in% c("2003","2004","2005","2006","2007","2008")] = "2003-2008"
		DAT$ipath_year[DAT$ipath_year %in% c("2009","2010")] = "2009-2010"
		DAT$ipath_year[DAT$ipath_year %in% c("2011","2012","2013")] = "2011-2013"
		
		return(DAT)
	
	} else if( dataset == "ESCA" ){
		DAT$tumor_stage[DAT$tumor_stage %in% c("I","II")] = "I/II"
		DAT$tumor_stage[DAT$tumor_stage %in% c("III","IV")
			| is.na(DAT$tumor_stage)] = "III/IV/NA"
		
		DAT$ipath_year[DAT$ipath_year %in% c("1998","1999",
			"2000","2001","2003")] = "1998-2003"
		DAT$ipath_year[DAT$ipath_year %in% c("2004","2005",
			"2006","2007","2008","2009")] = "2004-2009"
		DAT$ipath_year[DAT$ipath_year %in% c("2010","2011")] = "2010-2011"
		DAT$ipath_year[DAT$ipath_year %in% c("2012","2013")] = "2012-2013"
		
		DAT$hist_grade[DAT$hist_grade %in% c("G1","G2")] = "G1/G2"
		
		DAT$hist_type[grepl("Adeno",DAT$hist_type)] = "EsoAdeno"
		DAT$hist_type[grepl("Squam",DAT$hist_type)] = "EsoSquam"
		
		DAT$clin_stage[DAT$clin_stage %in% c("I","II")] = "I/II"
		
		return(DAT)
		
	} else if( dataset == "GBM" ){
		DAT$ipath_year[DAT$ipath_year %in% c("1989","1990",
			"1991","1992","1993","1994","1995",
			"1996","1997","1998","1999","2000","2001",
			"2002","2003","2004","2006","2007","2008")] = "1989-2008"
		DAT$ipath_year[DAT$ipath_year %in% c("2011","2012","2013")] = "2011-2013"
		
		DAT$BWA[DAT$BWA %in% c("VN:0.5.7-6","VN:0.5.9-r16")] = "VN:0.5.7-0.5.9"
		
		return(DAT)
	} else if( dataset == "HNSC" ){
		DAT$tumor_stage[DAT$tumor_stage %in% c("I","II")] = "I/II"
		
		DAT$ipath_year[DAT$ipath_year %in% c("1992","1993","1994",
			"1995","1996","1997","1998","1999","2000","2001","2002")] = "1992-2002"
		DAT$ipath_year[DAT$ipath_year %in% c("2003","2004","2005","2006")] = "2003-2006"
		DAT$ipath_year[DAT$ipath_year %in% c("2007","2008","2009")] = "2007-2009"
		DAT$ipath_year[DAT$ipath_year %in% c("2012","2013")] = "2012-2013"
		
		
		DAT$clin_stage[DAT$clin_stage %in% c("I","II")] = "I/II"
		DAT$clin_stage[DAT$clin_stage %in% c("IVA","IVB","IVC")] = "IV"
		
		DAT$hist_grade[DAT$hist_grade %in% c("G1","GX")] = "G1/GX"
		DAT$hist_grade[DAT$hist_grade %in% c("G3","G4")] = "G3/G4"
		
		return(DAT)
		
	} else if( dataset == "KIRC" ){
		DAT$tumor_stage[DAT$tumor_stage %in% c("I","II")] = "I/II"
		
		DAT$ipath_year[DAT$ipath_year %in% c("1998","1999","2000",
			"2001","2002")] = "1998-2002"
		DAT$ipath_year[DAT$ipath_year %in% c("2003","2004","2005")] = "2003-2005"
		DAT$ipath_year[DAT$ipath_year %in% c("2006","2007","2008",
			"2009","2010","2011","2012","2013")] = "2006-2013"
		
		DAT$hist_grade[DAT$hist_grade %in% c("G1","G2","GX")] = "G1/G2/GX"
		
		DAT$BWA[DAT$BWA %in% c("VN:0.5.9-r16","VN:0.5.9-tpx",
			"VN:0.6.2-r126")] = "VN:0.5.9-0.6.2"
		
		DAT$KIT[grepl("Nimblegen",DAT$KIT)] = "Nimblegen"
		
		return(DAT)
		
	} else if( dataset == "LGG" ){
		DAT$ipath_year[DAT$ipath_year %in% c("1992","1993",
			"1994","1995","1996","1997","1998")] = "1992-1998"
		DAT$ipath_year[DAT$ipath_year %in% c("1999","2000",
			"2001","2002","2003","2004","2005")] = "1999-2005"
		DAT$ipath_year[DAT$ipath_year %in% c("2006","2007",
			"2008","2009")] = "2006-2009"
		DAT$ipath_year[DAT$ipath_year %in% c("2010","2011",
			"2012","2013")] = "2010-2013"
		
		DAT$BWA[DAT$BWA %in% c("VN:0.5.7-6","VN:0.5.9-r16")] = "VN:0.5.7-0.5.9"
		
		return(DAT)
		
	} else if( dataset == "LIHC" ){
		DAT$tumor_stage[DAT$tumor_stage %in% c("III","IV")] = "III/IV"
		
		DAT$ipath_year[DAT$ipath_year %in% c("1995","1996",
			"1997","1998","1999","2000","2001","2002")] = "1995-2002"
		DAT$ipath_year[DAT$ipath_year %in% c("2003","2004",
			"2005","2006","2007")] = "2003-2007"
		DAT$ipath_year[DAT$ipath_year %in% c("2008","2009",
			"2010")] = "2008-2010"
		DAT$ipath_year[DAT$ipath_year %in% c("2011","2012",
			"2013")] = "2011-2013"
		
		DAT$hist_grade[DAT$hist_grade %in% c("G3","G4")] = "G3/G4"
		
		DAT$hist_type[grepl("Fibrola",DAT$hist_type)
			| grepl("Mixed",DAT$hist_type)] = "nonHepatocell"
		
		return(DAT)
		
	} else if( dataset == "LUAD" ){
		DAT$tumor_stage[DAT$tumor_stage %in% c("III","IV")] = "III/IV"
		
		DAT$ipath_year[DAT$ipath_year %in% c("1991","1992",
			"1993","1994","1995","1996","1997","1998","1999",
			"2000","2001")] = "1991-2001"
		DAT$ipath_year[DAT$ipath_year %in% c("2002","2003",
			"2004","2005")] = "2002-2005"
		DAT$ipath_year[DAT$ipath_year %in% c("2006","2007")] = "2006-2007"
		DAT$ipath_year[DAT$ipath_year %in% c("2008","2009")] = "2008-2009"
		DAT$ipath_year[DAT$ipath_year %in% c("2010","2011")] = "2010-2011"
		DAT$ipath_year[DAT$ipath_year %in% c("2012","2013")] = "2012-2013"
		
		return(DAT)
		
	} else if( dataset == "LUSC" ){
		DAT$tumor_stage[DAT$tumor_stage %in% c("III","IV")] = "III/IV"
		
		DAT$ipath_year[DAT$ipath_year %in% c("1992","1993",
			"1995","1996","1997","1998","1999","2000")] = "1992-2000"
		DAT$ipath_year[DAT$ipath_year %in% c("2001","2002")] = "2001-2002"
		DAT$ipath_year[DAT$ipath_year %in% c("2003","2004")] = "2003-2004"
		DAT$ipath_year[DAT$ipath_year %in% c("2005","2006")] = "2005-2006"
		DAT$ipath_year[DAT$ipath_year %in% c("2007","2008")] = "2007-2008"
		DAT$ipath_year[DAT$ipath_year %in% c("2009","2010")] = "2009-2010"
		DAT$ipath_year[DAT$ipath_year %in% c("2012","2013")] = "2012-2013"
		
		DAT$BWA[DAT$BWA %in% c("VN:0.5.7","VN:0.5.7-6")] = "VN:0.5.7-"
		
		return(DAT)
		
	} else if( dataset == "OV" ){
		DAT$ipath_year[DAT$ipath_year %in% c("1992","1993",
			"1994","1995","1996")] = "1992-1996"
		DAT$ipath_year[DAT$ipath_year %in% c("1997","1998")] = "1997-1998"
		DAT$ipath_year[DAT$ipath_year %in% c("2007","2008",
			"2009","2010","2011","2012","2013")] = "2007-2013"
		
		DAT$hist_grade[DAT$hist_grade %in% c("G1","G2","GB","GX")] = "G1/G2/GB/GX"
		DAT$hist_grade[DAT$hist_grade %in% c("G3","G4")] = "G3/G4"
		
		DAT$clin_stage[DAT$clin_stage %in% c("I","II")] = "I/II"
		
		return(DAT)
		
	} else if( dataset == "PAAD" ){
		DAT$tumor_stage[DAT$tumor_stage %in% c("III","IV")] = "III/IV"
		
		DAT$ipath_year[DAT$ipath_year %in% c("2001","2007",
			"2008")] = "2001-2008"
		DAT$ipath_year[DAT$ipath_year %in% c("2009","2010")] = "2009-2010"
		
		DAT$hist_grade[DAT$hist_grade %in% c("G3","G4","GX")] = "G3/G4/GX"
		
		DAT$hist_type[!grepl("Ductal Type",DAT$hist_type)
			& !is.na(DAT$hist_type)] = "Non-Ductal"
		
		return(DAT)
		
	} else if( dataset == "SARC" ){
		DAT$ipath_year[DAT$ipath_year %in% c("1994","1995",
			"1996","1997","1998","2000","2002","2003","2004",
			"2005")] = "1994-2005"
		DAT$ipath_year[DAT$ipath_year %in% c("2006","2007",
			"2008")] = "2006-2008"
		DAT$ipath_year[DAT$ipath_year %in% c("2009","2010")] = "2009-2010"
		DAT$ipath_year[DAT$ipath_year %in% c("2011","2012",
			"2013")] = "2011-2013"
		
		DAT$hist_type[grepl("Desmoid",DAT$hist_type)
			| grepl("Giant cell",DAT$hist_type)
			| grepl("Sheath Tumor",DAT$hist_type)
			| grepl("synovial; poor",DAT$hist_type)
			| grepl("Biphasic",DAT$hist_type)
			| grepl("Monophasic",DAT$hist_type)
			| grepl("Myxofib",DAT$hist_type)
			] = "DGSsBM"
		DAT$hist_type[grepl("Undifferentiated",DAT$hist_type)] = "Undiff"
		
		return(DAT)
		
	} else if( dataset == "SKCM" ){
		DAT$tumor_stage[DAT$tumor_stage %in% c("0","I")] = "I new"
		DAT$tumor_stage[DAT$tumor_stage %in% c("I/II NOS","II")] = "II new"
		DAT$tumor_stage[DAT$tumor_stage %in% c("III","IV")] = "III/IV"
		
		DAT$ipath_year[DAT$ipath_year %in% c("1978","1979",
			"1982","1984","1985","1986","1987","1989","1990",
			"1991","1992","1993","1994","1995","1996")] = "1978-1996"
		DAT$ipath_year[DAT$ipath_year %in% c("1997","1998",
			"1999","2000","2001")] = "1997-2001"
		DAT$ipath_year[DAT$ipath_year %in% c("2002","2003",
			"2004")] = "2002-2004"
		DAT$ipath_year[DAT$ipath_year %in% c("2005","2006")] = "2005-2006"
		DAT$ipath_year[DAT$ipath_year %in% c("2007","2008",
			"2009")] = "2007-2009"
		DAT$ipath_year[DAT$ipath_year %in% c("2010","2011",
			"2012","2013")] = "2010-2013"
		
		return(DAT)
		
	} else if( dataset == "STAD" ){
		DAT$tumor_stage[DAT$tumor_stage %in% c("I","II")] = "I/II"
		
		DAT$ipath_year[DAT$ipath_year %in% c("1996","2000",
			"2001","2002","2003","2004","2005","2006","2007")] = "1996-2007"
		DAT$ipath_year[DAT$ipath_year %in% c("2008","2009")] = "2008-2009"
		DAT$ipath_year[DAT$ipath_year %in% c("2012","2013")] = "2012-2013"
		
		DAT$hist_grade[DAT$hist_grade %in% c("G1","G2","GX")] = "G1/G2/GX"
		
		DAT$hist_type[grepl("Not Otherwise Specified",DAT$hist_type)] = "NotOtherSpec"
		DAT$hist_type[grepl("Diffuse Type",DAT$hist_type)
			| grepl("Papillary Type",DAT$hist_type)] = "Diff/Pap"
		DAT$hist_type[grepl("Signet Ring",DAT$hist_type)
			| grepl("Mucinous Type",DAT$hist_type)] = "Signet.Mucinous"
		DAT$hist_type[grepl("Intestinal",DAT$hist_type)] = "Intest,Tubular"
		
		DAT$BWA[DAT$BWA %in% c("VN:0.5.7","VN:0.5.7-6")] = "VN:0.5.7_type"
		DAT$BWA[DAT$BWA %in% c("VN:0.5.7_type","VN:0.5.9-r16")] = "non_VN:0.5.9-tpx"
		
		return(DAT)
		
	} else if( dataset == "UCEC" ){
		DAT$ipath_year[DAT$ipath_year %in% c("1995","1996",
			"1997","1999","2000","2001","2002","2003","2004",
			"2005","2006")] = "1995-2006"
		DAT$ipath_year[DAT$ipath_year %in% c("2007","2008",
			"2009")] = "2007-2009"
		DAT$ipath_year[DAT$ipath_year %in% c("2010","2011",
			"2012","2013")] = "2010-2013"
		
		DAT$hist_grade[DAT$hist_grade %in% c("G1","G2",
			"High Grade")] = "G1/G2/HighG"
		
		DAT$hist_type[grepl("Endometrioid endo",DAT$hist_type)] = "Endometrioid"
		DAT$hist_type[grepl("Mixed serous",DAT$hist_type)
			| grepl("Serous endo",DAT$hist_type)] = "Serous.Endo"
		
		DAT$clin_stage[DAT$clin_stage %in% c("I","II")] = "I/II"
		
		DAT$KIT[grepl("Nimblegen.SQEZ",DAT$KIT)] = "Nimblegen.SQEZ"
		
		return(DAT)
		
	}
	
	stop(sprintf("Write code for %s dataset",dataset))
	
}
RDA_calcTMB = function(DAT,mSPM){
	if(FALSE){
		DAT = dat; mSPM = matSPM
		
	}
	
	cat(sprintf("%s: Get TMB/log10_TMB variable ...\n",date()))
	int_subj = intersect(DAT$ID,colnames(mSPM))
	length(int_subj)
	DAT = DAT[which(DAT$ID %in% int_subj),]
	DAT = DAT[match(int_subj,DAT$ID),]
	mSPM = mSPM[,int_subj]
	DAT$TMB = colSums(mSPM)
	DAT$log10_TMB = log10(1 + DAT$TMB)
	DAT$log10_TMB_2 = DAT$log10_TMB^2
	DAT$TMB_bin = ifelse(DAT$log10_TMB > 2.5,1,0)
	DAT$int_TMB = DAT$log10_TMB * DAT$TMB_bin
	
	# Append top mutated genes
	tab = rowSums(mSPM)
	tab = tab[names(sort(tab,decreasing = TRUE)[1:10])]
	DAT = cbind(DAT,t(mSPM[names(tab),]))
	
	return(DAT)
}
RDA_getCYT = function(my_dirs,DAT){
	if(FALSE){
		dataset = "COAD"
		DAT = dat
	}
	
	cat(sprintf("%s: Get CYT variable ...\n",date()))
	cyt_fn = file.path(my_dirs$panc_dir,"mmc1 CYT.xlsx")
	cyt = readxl::read_excel(cyt_fn,sheet = 1)
	cyt = smart_df(cyt)
	cyt = name_change(cyt,"PatientID","ID")
	cyt = name_change(cyt,"Cytolytic.Activity","CYT")
	smart_table(cyt$HistoSubtype)
	smart_table(cyt$cancer)
	dim(cyt); cyt[1:3,]
	
	int_subj = intersect(cyt$ID,DAT$ID)
	length(int_subj)
	
	cyt = cyt[which(cyt$ID %in% int_subj),]
	smart_table(cyt$HistoSubtype)
	dim(cyt); cyt[1:3,]
	# dim(cyt2)
	
	DAT = smart_rmcols(DAT,"CYT")
	DAT = smart_merge(DAT,cyt[,c("ID","CYT")],all = TRUE)
	DAT$log10_CYT = log10(DAT$CYT)
	dim(DAT); DAT[1:3,]
	
	return(DAT)
}
RDA_makeCOVAR = function(DAT,MODEL){
	if(FALSE){
		DAT = dat
		MODEL = c("age","gender","tumor_stage","log10_TMB")
	}
	
	if( length(MODEL) == 1 && MODEL == "" ){
		XX = matrix(1,nrow = nrow(DAT))
		rownames(XX) = DAT$ID
		return(XX)
	}
	
	# Check all variables included
	if( !all(MODEL %in% names(DAT)) ){
		stop("Some variables missing from DAT")
	}
	
	# Create XX matrix
	MODEL_2 = c(MODEL,"IDnum")
	DAT$IDnum = seq(nrow(DAT))
	tmp_formula = as.formula(sprintf("~ %s",paste(MODEL_2,collapse=" + ")))
	COVARS = model.matrix(tmp_formula,data = DAT)
	XX = apply(COVARS,2,function(xx) as.numeric(xx))
	rownames(XX) = DAT$ID[XX[,"IDnum"]]
	XX = XX[,colnames(XX) != "IDnum"]
	XX[1:3,]
	any(is.na(XX))
	
	return(XX)
}
RDA_getOT_DIST = function(my_dirs,KWS = NULL,SUBJS){
	
	all_ot_fns = list.files(my_dirs$ithOT_dir,pattern = "rds")
	if( !is.null(KWS) ){
		for(KW in KWS){
			# KW = KWS[1]
			all_ot_fns = all_ot_fns[grepl(KW,all_ot_fns)]
			if( length(all_ot_fns) == 0 )
				stop("vector empty")
		}
	}
	
	# cat(sprintf("%s: Test all 7 gene_sim approaches!\n",date()))
	all_ot_fns = all_ot_fns[!grepl("ww1.rds",all_ot_fns)]
	all_ot_fns = all_ot_fns[!grepl("ww5.rds",all_ot_fns)]
	all_ot_fns = all_ot_fns[!grepl("ww6.rds",all_ot_fns)]
	all_ot_fns = all_ot_fns[!grepl("ww7.rds",all_ot_fns)]
	
	length(all_ot_fns); all_ot_fns # [1:5]
	if( length(all_ot_fns) == 0 ) stop("No OT files")
	
	# Get dimnames
	all_subjs = SUBJS
	num_subjs = length(all_subjs); num_subjs
	ARR = array(data = NA,
		dim = c(num_subjs,num_subjs,length(all_ot_fns)),
		dimnames = list(all_subjs,all_subjs,
			all_ot_fns))
	dim(ARR)
	
	for(fn in all_ot_fns){
		# fn = all_ot_fns[1]; fn
		tmp_rds = readRDS(file.path(my_dirs$ithOT_dir,fn))
		tmp_names = rownames(tmp_rds$DIST)
		setdiff(all_subjs,tmp_names)
		setdiff(tmp_names,all_subjs)
		tmp_names = intersect(tmp_names,all_subjs)
		length(tmp_names)
		ARR[tmp_names,tmp_names,fn] = tmp_rds$DIST[tmp_names,tmp_names]
		
		rm(tmp_rds)
	}
	# ARR[1:3,1:3,]
	
	return(ARR)
}
RDA_getEUC_DIST = function(mSPM,THRES = c(1,2,5,10,15,20,30,40,50,60)){
	if(FALSE){
		mSPM = matSPM
		THRES = tmp_GM
	}
	
	vnames = paste0("EUC.GM.",THRES)
	cat(sprintf("%s: Calculate euclidean distances ...\n",date()))
	# Calculate at different gene_freq thresholds (20,30,40)
	cnt = 1
	for(thres in THRES){
		# thres = THRES[1]; thres
		
		cat(sprintf("%s: GM = %s ...\n",date(),thres))
		mSPM_2 = mSPM
		
		# Subset genes mutated X times
		mSPM_2 = mSPM_2[rowSums(mSPM_2) >= thres,,drop = FALSE]
		
		# Remove subjects with no mutated genes
		mSPM_2 = mSPM_2[,colSums(mSPM_2) > 0,drop = FALSE]
		
		dim(mSPM); dim(mSPM_2)
		# tmp_dist2 = Rcpp_calc_EUC(ZZ = t(mSPM_2))
		tmp_dist = as.matrix(dist(t(mSPM_2),diag = TRUE,upper = TRUE))
		dim(tmp_dist); tmp_dist[1:3,1:3]
		
		if( cnt == 1 ){
			subjs = colnames(mSPM)
			nsubj = length(subjs)
			n_thres = length(THRES)
			
			EUC = array(data = NA,
				dim = c(nsubj,nsubj,n_thres),
				dimnames = list(subjs,subjs,vnames))
		}
		
		int_subj = intersect(colnames(tmp_dist),subjs)
		EUC[int_subj,int_subj,vnames[cnt]] = tmp_dist[int_subj,int_subj]
		cnt = cnt + 1
	}
	
	return(EUC)
	
}
new_ANA_COV = function(dataset,rm_subtype = FALSE,rm_CNA = FALSE){
	model_0 = c("age","BMI","BWA","clin_stage",
		"gender","height","hist_grade",
		"hist_type","ipath_year","KIT",
		"log10_TMB","ploidy","purity",
		"race","tumor_stage","weight")
	
	# Baseline covariates
	if( dataset == "BLCA" ){
		model = model_0
	} else if( dataset == "BRCA" ){
		extra_vars = c("PAM50")
		model = c(model_0,extra_vars)
		# "TP53"
	} else if( dataset == "CESC" ){
		model = model_0
		model = model[!(model %in% c("BMI","weight"))]
	} else if( dataset == "COAD" ){
		model = model_0
		model = model[!(model %in% c("weight","height","BMI","BWA"))]
	} else if( dataset == "ESCA" ){
		model = model_0
		model = model[!(model %in% c("weight","race"))]
	} else if( dataset == "GBM" ){
		model = model_0
	} else if( dataset == "HNSC" ){
		model = model_0
		# "TP53"
	} else if( dataset == "KIRC" ){
		extra_vars = c("barcode_SEQCENTER")
		model = c(model_0,extra_vars)
		model = model[!(model %in% c("BWA"))]
	} else if( dataset == "LGG" ){
		model = model_0
		# "IDH1","TP53","ATRX"
	} else if( dataset == "LIHC" ){
		model = model_0
		model = model[!(model %in% c("weight","height"))]
	} else if( dataset == "LUAD" ){
		model = model_0
		# "TP53"
	} else if( dataset == "LUSC" ){
		model = model_0
	} else if( dataset == "PAAD" ){
		model = model_0
		# "KRAS"
	} else if( dataset == "SARC" ){
		model = model_0
	} else if( dataset == "SKCM" ){
		model = model_0
	} else if( dataset == "STAD" ){
		model = model_0
		# "TP53"
	} else if( dataset == "UCEC" ){
		model = model_0
	} else {
		stop(sprintf("No covariates set for %s dataset",dataset))
		model = c("age","gender","tumor_stage","clin_stage",
			"hist_type","hist_grade","ipath_year","BWA",
			"SEQUENCER","KIT","CENTER","WGA_STATUS",
			"barcode_TSS","barcode_plate","barcode_analyte",
			"barcode_SEQCENTER","purity","ploidy",
			"log10_TMB","log10_TMB_2")
		
	}
	
	# Screen and remove tumor subtype info
	#		like tumor_grade, clin_stage, hist_type, hist_grade
	covars = list(model = model)
	if( rm_subtype ){
		for(OUT in names(covars)){
			tmp_vec = covars[[OUT]]
			tmp_vec = tmp_vec[!grepl("tumor_stage",tmp_vec)]
			tmp_vec = tmp_vec[!grepl("clin_stage",tmp_vec)]
			tmp_vec = tmp_vec[!grepl("hist_type",tmp_vec)]
			tmp_vec = tmp_vec[!grepl("hist_grade",tmp_vec)]
			tmp_vec = tmp_vec[!grepl("PAM50",tmp_vec)]
			covars[[OUT]] = tmp_vec
		}
	}
	if( rm_CNA ){
		for(OUT in names(covars)){
			tmp_vec = covars[[OUT]]
			tmp_vec = tmp_vec[!grepl("purity",tmp_vec)]
			tmp_vec = tmp_vec[!grepl("ploidy",tmp_vec)]
			covars[[OUT]] = tmp_vec
		}
	}
	
	return(covars)
}
get_fNULL_MOD = function(my_dirs,YY,keep_vars = NULL,
	rm_subtype = FALSE,rm_CNA = FALSE,add_genes = FALSE,
	strata = "all",verbose = TRUE){
	
	dataset = my_dirs$dataset
	dat_clin_fn = file.path(my_dirs$out_dir,"dat_clin.rds")
	if( !file.exists(dat_clin_fn) ){
		dat = RDA_prepCLIN(my_dirs = my_dirs)
		
		# Import matSPM
		mat_fn = file.path(my_dirs$spms_dir,
			sprintf("%s_mat.rds",dataset))
		matSPM = readRDS(mat_fn)
		tab = rowSums(matSPM)
		sort(tab,decreasing = TRUE)[1:10]
		
		# Calc TMB/MUT
		dat = RDA_calcTMB(DAT = dat,mSPM = matSPM); rm(matSPM)
		dat = RDA_getCYT(my_dirs = my_dirs,DAT = dat)
		rownames(dat) = dat$ID
		saveRDS(dat,dat_clin_fn)
	}
	DAT = readRDS(dat_clin_fn)
	dim(DAT)
	
	TYPE = "CONT"
	if( YY %in% c("OS","PFI","DSS","DFI") ){
		YY2 = sprintf("%s.time",YY)
		TYPE = "SURV"
		DAT = DAT[!is.na(DAT[,YY]),]
	} else if( YY %in% c("CYT") ){
		YY2 = "log10_CYT"
	}
	DAT = DAT[!is.na(DAT[,YY2]),]
	if( nrow(DAT) == 0 ) return(NULL)
	
	if( strata == "nHM" ){
		DAT = DAT[DAT$TMB_bin == 0,]
	}
	
	covars = new_ANA_COV(dataset = dataset,
		rm_subtype = rm_subtype,rm_CNA = rm_CNA); # covars
	start_model = covars$model; # start_model
	
	curr_model = start_model
	curr_model = unique(curr_model)
	curr_model = intersect(curr_model,names(DAT))
	cnt_uniq = apply(DAT[,curr_model],2,function(xx) length(unique(xx[!is.na(xx)])))
	cnt_uniq = cnt_uniq[cnt_uniq > 1]; # cnt_uniq
	curr_model = curr_model[curr_model %in% names(cnt_uniq)]
	curr_model = unique(c(curr_model,keep_vars))
	prop_nonNA = apply(DAT[,curr_model],2,function(xx) mean(!is.na(xx))); prop_nonNA
	prop_nonNA_thres = 0.7
	if( any(prop_nonNA < prop_nonNA_thres) ){
		rm_vars = names(prop_nonNA)[prop_nonNA < prop_nonNA_thres]; rm_vars
		curr_model = curr_model[!(curr_model %in% rm_vars)]
	}
	if( dataset %in% c("CESC","KIRC") ){
		curr_model = curr_model[curr_model != "KIT"]
	}
	
	if( add_genes ){
		mut_thres = 0.20
		tmp_genes = names(DAT)
		tmp_genes = tmp_genes[!(tmp_genes %in% c("CYT","log10_CYT"))]
		tmp_genes = tail(tmp_genes,n = 10)
		tmp_genes
		prop_mut = apply(DAT[,tmp_genes],2,function(xx) mean(xx)); prop_mut
		prop_mut = prop_mut[prop_mut >= mut_thres]; prop_mut
		if( length(prop_mut) > 0 ) curr_model = c(curr_model,names(prop_mut))
	}
	
	curr_model
	PVAL_thres = 1.5e-1; PVAL_thres
	
	while(TRUE){
		# curr_model = c(curr_model,"height")
		# curr_model = c("age","height","hist_grade","log10_TMB","ploidy","tumor_stage")
		if( !is.null(keep_vars) && all(curr_model %in% keep_vars) ) break
		if( length(curr_model) == 0 ) break
		
		if( TYPE == "CONT" ){
			glm_out = tryCatch(lm(as.formula(sprintf("%s ~ %s",
				YY2,paste(curr_model,collapse=" + "))),data = DAT),
				error = function(ee){NULL})
			if( is.null(glm_out) ) print("OLS is null")
			
			if( is.null(glm_out) ){
				DAT2 = DAT[,curr_model]
				for(vv in curr_model){
					# vv = curr_model[11]; vv
					nonNA_idx = !is.na(DAT2[,vv])
					DAT2 = DAT2[nonNA_idx,,drop = FALSE]
				}
				cnt_uniq = apply(DAT2,2,function(xx) length(unique(xx))); cnt_uniq
				if( any(cnt_uniq == 1) ){
					rm_var = names(cnt_uniq)[cnt_uniq == 1][1]; rm_var
					curr_model = curr_model[curr_model != rm_var]
					cat(sprintf("OUT = %s; Rm var = %s ...\n",YY,rm_var))
					next
				}
				stop("Check variables")
			}
			# summary(glm_out)
			
			cov_test = drop1(glm_out,.~.,test = "F")
			cov_test = as.matrix(cov_test)
			cov_test = cov_test[!is.na(cov_test[,ncol(cov_test)]),,drop = FALSE]
			# cov_test
		}
		if( TYPE == "SURV" ){
			glm_out = tryCatch(coxph(formula(sprintf("Surv(%s.time,%s) ~ %s",
				YY,YY,paste(curr_model,collapse = " + "))),data = DAT),
				error = function(ee){NULL},warning = function(ww){NULL})
			glm_out
			
			if( is.null(glm_out) ){
				DAT2 = DAT[,c(YY2,YY,curr_model)]
				for(vv in names(DAT2)){
					# vv = curr_model[11]; vv
					nonNA_idx = !is.na(DAT2[,vv])
					DAT2 = DAT2[nonNA_idx,,drop = FALSE]
				}
				cnt_uniq = apply(DAT2,2,function(xx) length(unique(xx))); cnt_uniq
				if( any(cnt_uniq == 1) ){
					rm_var = names(cnt_uniq)[cnt_uniq == 1][1]; rm_var
					curr_model = curr_model[curr_model != rm_var]
					cat(sprintf("OUT = %s; Rm var = %s ...\n",YY,rm_var))
					next
				} else {
					glm_out = suppressWarnings(coxph(formula(sprintf("Surv(%s.time,%s) ~ %s",
						YY,YY,paste(curr_model,collapse = " + "))),data = DAT))
					# Rm variable with largest pvalue
					tmp_mat = summary(glm_out)$coefficients; tmp_mat
					if( any(tmp_mat[,ncol(tmp_mat)] > 0.95) ){
						tmp_PVAL = tmp_mat[,ncol(tmp_mat)]; tmp_PVAL
						rm_var0 = names(tmp_PVAL)[which.max(tmp_PVAL)]; rm_var0
						tmp_vec = sapply(curr_model,function(xx) grepl(sprintf("^%s",xx),rm_var0)); tmp_vec
						rm_var = names(tmp_vec)[tmp_vec]; rm_var
						if( length(rm_var) > 1 ) stop("inspect this")
						curr_model = curr_model[curr_model != rm_var]
						cat(sprintf("OUT = %s; Rm var = %s ...\n",YY,rm_var))
						next
					}
					
				}
				stop("Check variables")
			}
			
			cov_test = tryCatch(car::Anova(glm_out,type = "III"),
				warning = function(ww){NULL}); cov_test
			if( is.null(cov_test) ){
				# drop variable with largest p-value
				cov_test = suppressWarnings(car::Anova(glm_out,type = "III"))
				rm_var = rownames(cov_test)[which.max(cov_test[,ncol(cov_test)])]
				curr_model = curr_model[curr_model != rm_var]
				next
			}
			
			cov_test = as.matrix(cov_test)
		}
		
		# Remove vars to keep
		cov_test2 = cov_test[!(rownames(cov_test) %in% keep_vars),,drop = FALSE]
		# rm(cov_test)
		# print(cov_test2)
		if( verbose ) cat(sprintf("%s; ",paste(curr_model,collapse = " + ")))
		if( nrow(cov_test2) == 0 ) break
		
		# Find var to remove
		if( any(cov_test2[,ncol(cov_test2)] > PVAL_thres) ){
			tmp_PVAL = cov_test2[,ncol(cov_test2)]
			names(tmp_PVAL) = rownames(cov_test2); tmp_PVAL
			# rm(cov_test2)
			rm_var = names(tmp_PVAL)[which.max(tmp_PVAL)]; rm_var
			# print(cov_test)
			cat(sprintf("OUT = %s; Rm var = %s ...\n",YY,rm_var))
			curr_model = curr_model[curr_model != rm_var]
		} else {
			# End if all p-values less than threshold
			break
		}
		
	}
	if( verbose ){
		cat("\n")
		print(cov_test)
		if( TYPE == "SURV" ){
			DAT2 = DAT[,c(YY2,YY,curr_model)]
			for(vv in names(DAT2)){
				# vv = curr_model[11]; vv
				nonNA_idx = !is.na(DAT2[,vv])
				DAT2 = DAT2[nonNA_idx,,drop = FALSE]
			}
			print(smart_table(DAT2[[YY]]))
		}
		# print(smart_table(!is.na(DAT[[YY2]])))
	}
	
	return(list(model = curr_model,DAT = DAT))
	
}
RDA_REG_final = function(my_dirs,DAT,YY,OT,EUC,
	MODEL,nPERM = 5e3){
	
	if(FALSE){
		DAT = mod_out$DAT; YY = YY_2; OT = ot;
		EUC = euc; MODEL = mod_out$model; nPERM = nPERM
		
	}
	
	all_LABS = c("EUC","OT")
	
	# Subset to GMs >= 2 and GMs <= 30
	nms = dimnames(OT)[[3]]
	nms = nms[!grepl("GM.60",nms)]
	nms = nms[!grepl("GM.50",nms)]
	nms = nms[!grepl("GM.40",nms)]
	OT = OT[,,nms,drop = FALSE]
	
	nms = dimnames(EUC)[[3]]
	nms = nms[!grepl("GM.60",nms)]
	nms = nms[!grepl("GM.50",nms)]
	nms = nms[!grepl("GM.40",nms)]
	EUC = EUC[,,nms,drop = FALSE]
	
	# Check missingness
	cat(sprintf("%s: Check missingness ...\n",date()))
	int_subjs = intersect(DAT$ID,dimnames(OT)[[1]])
	length(int_subjs)
	OT = OT[int_subjs,int_subjs,,drop = FALSE]
	DAT = DAT[which(DAT$ID %in% int_subjs),]
	for(nm in dimnames(OT)[[3]]){
		# nm = dimnames(OT)[[3]][1]; nm
		if( !any(is.na(OT)) ) break
		keep_subjs = dimnames(OT)[[1]]
		miss_subjs = keep_subjs[is.na(diag(OT[,,nm]))]
		keep_subjs = keep_subjs[!(keep_subjs %in% miss_subjs)]
		length(keep_subjs)
		OT = OT[keep_subjs,keep_subjs,,drop = FALSE]
	}
	
	int_subjs = intersect(DAT$ID,dimnames(OT)[[1]])
	int_subjs = intersect(int_subjs,dimnames(EUC)[[1]])
	length(int_subjs)
	EUC = EUC[int_subjs,int_subjs,,drop = FALSE]
	DAT = DAT[which(DAT$ID %in% int_subjs),]
	for(nm in dimnames(EUC)[[3]]){
		# nm = dimnames(EUC)[[3]][1]; nm
		if( !any(is.na(EUC)) ) break
		keep_subjs = dimnames(EUC)[[1]]
		miss_subjs = keep_subjs[is.na(diag(EUC[,,nm]))]
		keep_subjs = keep_subjs[!(keep_subjs %in% miss_subjs)]
		length(keep_subjs)
		EUC = EUC[keep_subjs,keep_subjs,,drop = FALSE]
	}
	
	# Create XX matrix
	TYPE = "CONT"
	DAT = DAT[!is.na(DAT[,YY]),]
	if( YY %in% c("OS","PFI","DSS","DFI") ){
		YY2 = sprintf("%s.time",YY)
		DAT = DAT[!is.na(DAT[,YY2]),]
		TYPE = "SURV"
	}
	XX = RDA_makeCOVAR(DAT = DAT,MODEL = MODEL)
	dim(XX)
	
	# Intersect/Align subjects
	int_subjs = intersect(rownames(XX),dimnames(OT)[[1]])
	int_subjs = intersect(int_subjs,dimnames(EUC)[[1]])
	length(int_subjs)
	OT 	= OT[int_subjs,int_subjs,,drop = FALSE]
	XX 	= XX[int_subjs,,drop = FALSE]
	DAT = DAT[which(DAT$ID %in% int_subjs),]
	DAT = DAT[match(int_subjs,DAT$ID),]
	EUC = EUC[int_subjs,int_subjs,,drop = FALSE]
	
	cnt = 1; tot = length(all_LABS)
	cat(sprintf("%s: Total Distances = %s ...\n",date(),tot))
	res = c()
	
	for(LAB in all_LABS){
		# LAB = all_LABS[2]
		
		# Get kernels
		KK = list()
		if( LAB == "EUC" ){
			for(nam in dimnames(EUC)[[3]]){
				tmp_dist = EUC[,,nam]
				KK[[nam]] = MiRKAT::D2K(D = tmp_dist)
				rm(tmp_dist)
			}
		} else if( LAB == "OT" ){
			for(nam in dimnames(OT)[[3]]){
				tmp_dist = OT[,,nam]
				KK[[nam]] = MiRKAT::D2K(D = tmp_dist)
				rm(tmp_dist)
			}
			
			tmp_LAB = names(KK)
			tmp_LAB = gsub("highLAM-lowTMB_","",tmp_LAB)
			tmp_LAB = gsub("BAL.FALSE_L1.0.5_L2.0.5","LAM.0.5",tmp_LAB)
			tmp_LAB = gsub("BAL.FALSE_L1.1_L2.1","LAM.1",tmp_LAB)
			tmp_LAB = gsub("BAL.FALSE_L1.5_L2.5","LAM.5",tmp_LAB)
			tmp_LAB = gsub("BAL.TRUE_L1.1_L2.1","LAM.Inf",tmp_LAB)
			tmp_LAB = gsub("_ww2.rds","_GO",tmp_LAB)
			tmp_LAB = gsub("_ww3.rds","_PATH",tmp_LAB)
			tmp_LAB = gsub("_ww4.rds","_ME",tmp_LAB)
			tmp_LAB
			names(KK) = tmp_LAB
		}
		
		# Count subjects
		Ns = nrow(DAT); Ns
		Ne = ifelse(YY %in% c("OS","PFI","DSS","DFI"),
			sum(DAT[,YY]),Ns); Ne
		if( Ne < 50 ){
			print(table(DAT[,YY]))
			VARS = c(MODEL,YY)
			if( YY %in% c("OS","PFI","DSS","DFI") ) VARS = c(VARS,sprintf("%s.time",YY))
			print(apply(DAT[,VARS],2,function(xx) mean(!is.na(xx))))
			stop("Check model variable missingness")
		}
		
		XX_2 = NULL
		if( ncol(XX) > 1 ){
			XX_2 = XX[,-1,drop = FALSE]
		}
		
		if( TYPE == "SURV" ){ # Run survival CoxPH regression
			cout = coxph(Surv(DAT[,YY2],DAT[,YY]) ~ .,
				data = smart_df(XX_2))
			RESI = as.numeric(cout$residuals)
			names(RESI) = int_subjs
			fit_perm = kernTEST(RESI = RESI,KK = KK,
				nPERMS = nPERM,ncores = 1)
			
		} else { # Run continuous regression
			
			# Fit null model and get residuals
			lm_out = lm(DAT[,YY] ~ .,data = smart_df(XX_2))
			RESI = as.numeric(lm_out$residuals)
			names(RESI) = int_subjs
			fit_perm = kernTEST(RESI = RESI,KK = KK,
				nPERMS = nPERM,ncores = 1)
			
		}
		
		if( LAB == "EUC" ){
			
			# Omnibus result
			res = rbind(res,smart_df(TYPE = TYPE,
				OUT = YY,MODEL = paste(MODEL,collapse = ","),
				LAB = sprintf("%s_omnibus",LAB),
				PVAL_perm = fit_perm$omni_PVAL,Ns = Ns,Ne = Ne))
			
			# Per kernel results
			res = rbind(res,smart_df(TYPE = TYPE,
				OUT = YY,MODEL = paste(MODEL,collapse = ","),
				LAB = names(KK),PVAL_perm = as.numeric(fit_perm$PVALs),
				Ns = Ns,Ne = Ne))
			
		} else if( LAB == "OT" ){
			
			# Omnibus result
			res = rbind(res,smart_df(TYPE = TYPE,
				OUT = YY,MODEL = paste(MODEL,collapse = ","),
				LAB = sprintf("%s_omnibus",LAB),
				PVAL_perm = fit_perm$omni_PVAL,Ns = Ns,Ne = Ne))
			
			# Per kernel results
			res = rbind(res,smart_df(TYPE = TYPE,
				OUT = YY,MODEL = paste(MODEL,collapse = ","),
				LAB = tmp_LAB,PVAL_perm = as.numeric(fit_perm$PVALs),
				Ns = Ns,Ne = Ne))
			
		}
		
		rm(fit_perm,KK)
	}
	
	return(res)
	
}

#' @title ANA_TEST_final
#' @param my_dirs A list generated by \code{setdirs()}
#'	of directories specific to a cancer type
#' @param nPERM A positive integer specifying
#'	the number of permutations to calculate
#' 	permutation-based individual kernel and omnibus
#'	p-values
#' @export
ANA_TEST_final = function(my_dirs,nPERM = 1e5){
	
	dataset = my_dirs$dataset
	setwd(my_dirs$ithOT_dir)
	
	# Get all completed OT thresholds
	OT_fns = list.files(my_dirs$ithOT_dir)
	OT_fns = OT_fns[!grepl("ww1.rds",OT_fns)];
	OT_fns = OT_fns[!grepl("ww5.rds",OT_fns)]
	OT_fns = OT_fns[!grepl("ww6.rds",OT_fns)]
	OT_fns = OT_fns[!grepl("ww7.rds",OT_fns)]
	# OT_fns
	GMs = sapply(OT_fns,function(xx){
		# xx = OT_fns[1]
		xx2 = strsplit(xx,"_")[[1]][2]
		xx2 = gsub("GM.","",xx2)
		xx2
	},USE.NAMES = FALSE) # we expect 3 gene sims, 4 lambdas
	tab = table(GMs); tab = tab[tab == 12]; tab
	GMs = sort(as.integer(names(tab))); GMs
	
	# Import SPMs, calculate EUC dist for various thresholds
	mat_fn = file.path(my_dirs$spms_dir,
		sprintf("%s_mat.rds",dataset))
	matSPM = readRDS(mat_fn)
	
	# Calculate euclidean distances
	euc = RDA_getEUC_DIST(mSPM = matSPM,THRES = GMs)
	rm(matSPM)
	
	# Get all OT
	ot = RDA_getOT_DIST(my_dirs = my_dirs,
		SUBJS = dimnames(euc)[[1]])
	GMs = sapply(dimnames(ot)[[3]],function(xx){
		xx2 = strsplit(xx,"_")[[1]][2]
		xx2 = gsub("GM.","",xx2)
		xx2
	},USE.NAMES = FALSE) # we expect 3 gene sims, 4 lambdas
	tab = table(GMs); tab
	
	# Regression constants
	all_strat = "all"
	if( dataset %in% c("COAD") ) all_strat = c("all","nHM")
	all_YY = c("CYT","OS","PFI","DSS")
	
	## Create dat_clin_KWS.rds image file
	dat_clin_fn = file.path(my_dirs$out_dir,"dat_clin.rds")
	if( file.exists(dat_clin_fn) ) unlink(dat_clin_fn)
	
	reg_fn = file.path(my_dirs$reg_dir,"reg.rds")
	if( file.exists(reg_fn) ) unlink(reg_fn)
	fin_reg = c()
	for(YY in all_YY){
		# YY = all_YY[1]; YY
		YY_2 = YY
		if( YY == "CYT" ){
			YY_2 = "log10_CYT"
		}
		if( YY == "OS" && dataset %in% c("CESC","SARC") ) next
		if( YY == "PFI" && dataset %in% c("CESC") ) next
		if( YY == "DSS" && dataset %in% c("BRCA","CESC","COAD","ESCA",
			"KIRC","LIHC","SARC","UCEC") ) next
	for(strat in all_strat){
		# strat = all_strat[1]
		cat(sprintf("%s: YY = %s; STRAT = %s ...\n",date(),YY,strat))
		rm_subtype = FALSE; rm_CNA = FALSE
		
		# Set null model
		mod_out = get_fNULL_MOD(my_dirs = my_dirs,YY = YY,
			keep_vars = "log10_TMB",rm_subtype = rm_subtype,
			rm_CNA = rm_CNA,add_genes = FALSE,strata = strat,
			verbose = TRUE)
		if( is.null(mod_out) ) next
		
		# Run kernel regression
		tmp_reg = RDA_REG_final(my_dirs = my_dirs,DAT = mod_out$DAT,
			YY = YY_2,OT = ot,EUC = euc,MODEL = mod_out$model,
			nPERM = nPERM)
		
		# Append
		tmp_df = smart_df(STRAT = strat,tmp_reg)
		print(tmp_df)
		fin_reg = rbind(fin_reg,tmp_df)
		rm(tmp_df)
		
	}}
	print(fin_reg[which(fin_reg$LAB %in% c("EUC","OT_omnibus")),])
	saveRDS(fin_reg,reg_fn)
	unlink(dat_clin_fn)
	
	return(NULL)
	
}

#' @title new_ANA_AGG
#' @param my_dirs A list generated by \code{setdirs()}
#'	of directories specific to a cancer type
#' @export
new_ANA_AGG = function(my_dirs){
	
	res_fn = file.path(my_dirs$fin_dir,"pan_kern.rds")
	
	# Gather all tumor types
	datasets = c("BLCA","BRCA","CESC","COAD",
							"ESCA","GBM","HNSC","KIRC",
							"LGG","LIHC","LUAD","LUSC",
							"PAAD","SARC","SKCM","STAD",
							"UCEC")
	
	if( !file.exists(res_fn) ){
		res = c()
		for(dataset in datasets){
			# dataset = datasets[1]; dataset
			
			cat(sprintf("%s ",dataset))
			my_dirs1 = setdirs(git_dir = my_dirs$git_dir,
				work_dir = my_dirs$work_dir,dataset = dataset,
				verbose = FALSE)
			
			tum_fns = list.files(my_dirs1$reg_dir)
			tum_fns = tum_fns[grepl(".rds$",tum_fns)]
			# tum_fns = tum_fns[grepl("^reg_",tum_fns)]
			if( length(tum_fns) == 0 ) next
			tum_res = c()
			
			for(fn in tum_fns){
				# fn = tum_fns[1]
				tmp_df = readRDS(file.path(my_dirs1$reg_dir,fn))
				tum_res = rbind(tum_res,tmp_df); rm(tmp_df)
			}
			
			res = rbind(res,smart_df(dataset = dataset,tum_res))
			rm(tum_res)
		}
		cat("\n")
		saveRDS(res,res_fn)
	}
	
	# Import
	res = readRDS(res_fn)
	res$YY = -log10(res$PVAL_perm)
	res$GM = NA
	idx = grep("EUC.GM.",res$LAB)
	res$GM[idx] = as.integer(gsub("EUC.GM.","",res$LAB[idx]))
	idx = grep("OT_GM.",res$LAB)
	res$GM[idx] = as.integer(sapply(res$LAB[idx],function(xx){
		# xx = res$LAB[idx][1]; xx
		gsub("GM.","",strsplit(xx,"_")[[1]][2])
	},USE.NAMES = FALSE))
	
	# Plotting
	dim(res); res[1:3,]
	
	# Get covariates selected per null model
	if( TRUE ){
		res2 = res[grepl("log10_TMB",res$MODEL),]
		res2 = res2[which(res2$GM <= 30 & res2$OUT != "DFI"),]
		ures = unique(res2[,c("dataset","STRAT","OUT","MODEL")]); # ures
		all_vars = sapply(ures$MODEL,function(xx)
			strsplit(xx,",")[[1]],USE.NAMES = FALSE)
		all_vars = sort(unique(unlist(all_vars)))
		all_vars = all_vars[all_vars != "log10_TMB"]
		all_vars
		for(vv in all_vars){
			ures[[vv]] = ifelse(grepl(vv,ures$MODEL),1,0)
		}
		dim(ures)
		ures[1:5,]
		ures$OUT2 = ures$OUT
		ures$OUT2[grepl("CYT",ures$OUT2)] = "CYT"
		MAT = as.matrix(ures[,all_vars])
		colnames(MAT)[grepl("SEQCENTER",colnames(MAT))] = "CENTER"
		MAT = MAT[,sort(colnames(MAT))]
		rownames(MAT) = sprintf("%s + %s",ures$dataset,ures$OUT2)
		idx = which(ures$dataset == "COAD")
		rownames(MAT)[idx] = sprintf("%s + %s:%s",ures$dataset,ures$OUT2,ures$STRAT)[idx]
		dim(MAT); MAT[1:4,]
		png(file.path(my_dirs$fin_dir,"null_models.png"),units = "px",
			height = 2000,width = 1600,res = 250,type = "cairo",pointsize = 20)
		bb = smart_heatmap(MAT = MAT,main = "Null Models",
			width = c(0.7,3,1),height = c(0.15,0.3,3,1),
			GRID = list(GRID = TRUE,lwd = 0.5),clustRC = rep(FALSE,2),
			nodePar_row = list(xx_shift = 0,xx = 0,adj = 0,lab.cex = 0.75),
			nodePar_col = list(xx_shift = -0.02,srt = 45,adj = 0,lab.cex = 0.65),
			make_key = FALSE,vec_cols = c("white","red")); rm(bb)
		dev.off()
		
	}
	
	# Get TMB distribution and num genes per cutoff
	if( TRUE ){
		
		# Num genes per cutoff
		cnts = c(2,5,10,15,20,30)
		mat = matrix(NA,length(datasets),length(cnts),
			dimnames = list(datasets,sprintf("GM.%s",cnts)))
		for(dataset in datasets){
			# dataset = datasets[1]
			my_dirs = setdirs(dataset = dataset,
				verbose = FALSE)
			tab = make_tb1(my_dirs = my_dirs,
				rmWGA = TRUE,show = FALSE)
			# tab[1:10]
			mat[dataset,] = sapply(cnts,function(xx) sum(tab >= xx))
		}
		mat = formatC(x = mat,format = "d",big.mark = ",")
		mat = smart_df(mat)
		names(mat) = gsub("GM.","$\\\\geq$",names(mat))
		mat[["Abbreviation"]] = rownames(mat)
		tmpd = name_TUMORTYPES()
		tmpd = tmpd[match(mat[["Abbreviation"]],tmpd$V1),]
		mat[["Tumor Type"]] = tmpd$V2
		rownames(mat) = NULL
		mat = mat[,c(ncol(mat),ncol(mat)-1,seq(ncol(mat)-2))]
		mat
		cap = "The number of genes at various"
		cap = sprintf("%s mutation frequency cutoffs",cap)
		cap = sprintf("%s for each tumor type.",cap)
		print_latex_table(DATA = mat,add_table = TRUE,
			my_align = "llrrrrrr",label = "gene_mut_freq",
			fontsize = "footnotesize",caption = cap)
		
		
		# TMB distribution
		dat = c()
		for(dataset in datasets){
			# dataset = datasets[1]
			my_dirs = setdirs(dataset = dataset,
				verbose = FALSE)
			mat_fn = file.path(my_dirs$spms_dir,
				sprintf("%s_mat.rds",my_dirs$dataset))
			matSPM = readRDS(mat_fn)
			dim(matSPM); matSPM[1:5,1:4]
			dat = rbind(dat,smart_df(dataset = dataset,
				TMB = colSums(matSPM)))
		}
		
		my_theme = theme(legend.position = c("none","bottom")[1],
			panel.background = element_blank(),
			panel.border = element_rect(color = "black",fill = NA,size = 1),
			# panel.spacing = unit(0.75,"lines"),
			panel.grid.major = element_line(colour = "grey50",
				size = 0.5,linetype = "dotted"),
			panel.spacing.x = unit(1.0,"lines"),
			# panel.spacing.y = unit(1.00,"lines"),
			# legend.text = element_text(size = 40),
			# axis.text.x = element_text(size = 35),
			# strip.text = element_text(size = 40),
			# strip.text.y = element_text(angle = 0,hjust = 0)
			text = element_text(size = 45)
			)
		
		TMB = NULL
		gg = ggplot(data = dat,aes(x = log10(TMB))) +
			geom_histogram(bins = 50) + facet_wrap(~ dataset) +
			xlab("log10(Tumor Mutation Burden)") + 
			ylab("Frequency") + my_theme
		
		png_fn = file.path(my_dirs$fin_dir,"log10_TMB.png")
		ggsave(png_fn,plot = gg,device = "png",width = 35,height = 25,
			units = "in")
		rm(gg)
		
		
	}
	
	# Aggregate scatter plot
	if( TRUE ){
		res2 = res[grepl("log10_TMB",res$MODEL),]
		table(res$GM)
		# res2 = res2[which(res2$GM <= 30),]
		# res2 = res2[which(res2$GM %in% sprintf(">= %s",c(5,10,15,20,30,40))),]
		# res2 = res2[which(!(res2$dataset == "COAD" & res2$STRAT == "all")),]
		res2 = res2[which(res2$LAB %in% c("EUC_omnibus","OT_omnibus")),]
		dim(res); dim(res2); res2[1:10,]
		
		dat = res2[which(grepl("EUC",res2$LAB)),]
		dat = name_change(dat,"YY","YY_EUC")
		dat = smart_rmcols(dat,c("PVAL_perm","GM","LAB"))
		dat = smart_merge(dat,smart_rmcols(res2[which(grepl("OT_",res2$LAB)),],
			c("PVAL_perm","GM","LAB")))
		dat = name_change(dat,"YY","YY_OT")
		dim(dat); dat[1:10,]
		
		my_theme = theme(legend.position = c("none","right","bottom")[2],
			text = element_text(size = 30),
			axis.text = element_text(size = 20),
			panel.background = element_blank(),
			# panel.spacing = unit(0.75,"lines"),
			panel.grid.major = element_line(colour = "grey50",
				size = 0.5,linetype = "dotted"))
		
		# table(dat$dataset)
		dat$dataset2 = dat$dataset
		idx = which(dat$dataset == "COAD" & dat$STRAT == "nHM")
		dat$dataset2[idx] = sprintf("%s.%s",dat$dataset,dat$STRAT)[idx]
		tmp_lev = sort(unique(dat$OUT)); # tmp_lev
		tmp_lev = tmp_lev[c(2,3,4,1)]; # tmp_lev
		dat$OUT = factor(dat$OUT,levels = tmp_lev,
			labels = c("Cytolytic Activity","Overall Survival",
				"Progression-Free Interval","Disease-Specific Survival"))
		
		thres = -log10(0.05)
		YY_EUC = YY_OT = OUT = dataset2 = NULL
		gg = ggplot(data = dat,mapping = aes(x = YY_EUC,y = YY_OT,group = dataset2)) +
			geom_point(alpha = 0.85,size = 5,aes(color = dataset2,shape = dataset2)) +
			geom_abline(intercept = 0,slope = 1) + # xlim(rr) + ylim(rr) +
			geom_segment(size = 2,linetype = 2,color = "red",aes(x = 0,y = thres,
				xend = thres,yend = thres)) +
			geom_segment(size = 2,linetype = 2,color = "red",aes(x = thres,y = thres,
				xend = thres,yend = 0)) +
			scale_color_manual(values = rep(smart_colors(9),2)) +
			scale_shape_manual(values = sort(rep(c(16,17),9))) +
			# facet_wrap(~ OUT,scales = "free") + 
			facet_wrap(~ OUT) + 
			xlab("-log10(Euclidean Omnibus)") + 
			ylab("-log10(Optimal Transport Omnibus)") + labs(color = "Dataset",shape = "Dataset") +
			guides(colour = guide_legend(override.aes = list(size = 5))) +
			my_theme
		png_fn = file.path(my_dirs$fin_dir,"overall_omnibus_scatter.png")
		ggsave(png_fn,plot = gg,device = "png",width = 13,height = 8,
			units = "in"); # rm(gg)
		
	}
	
	# KIRC/LGG/LUAD plot
	if( TRUE ){
		res[1:3,]
		res2 = res[which( ( (res$dataset == "KIRC" & res$OUT == "PFI")
			| (res$dataset == "LGG" & res$OUT %in% c("log10_CYT","OS"))
			| (res$dataset == "LUAD" & res$OUT == "DSS") )
			& !grepl("omnibus",res$LAB)
			& res$GM <= 30 & res$GM > 1),]
		dim(res2); res2[1:5,]
		
		my_theme = theme(legend.position = "bottom",
			text = element_text(size = 40),
			plot.title = element_text(hjust = 0.5),
			panel.grid.major = element_line(colour = "grey50",
				size = 0.5,linetype = "dotted"),
			axis.title.x = element_text(vjust = -0.5),
			axis.text.x = element_text(size = 32),
			panel.background = element_blank(),
			panel.spacing.x = unit(0.75,"lines"),
			strip.text.y = element_text(angle = 0,hjust = 0))
		
		res2 = res2[order(res2$GM),]
		res2$OUT = ifelse(res2$OUT == "log10_CYT","Cytolytic~Activity",
			ifelse(res2$OUT == "OS","Overall~Survival",
			ifelse(res2$OUT == "PFI","Progression-Free~Interval",
				"Disease-Specific~Survival")))
		uOUT = sort(unique(res2$OUT)); uOUT
		mOUT = c("Cytolytic~Activity","Overall~Survival",
			"Progression-Free~Interval","Disease-Specific~Survival")
		mOUT = mOUT[mOUT %in% uOUT]; mOUT
		res2$OUT = factor(res2$OUT,levels = mOUT)
		
		res2$LAM = sapply(res2$LAB,function(xx){
			if( grepl("EUC",xx) ){
				"EUC"
			} else {
				# xx = res2$LAB[2]; xx
				LAM = gsub("LAM.","",strsplit(xx,"_")[[1]][3])
				sprintf("OT (lambda = %s)",LAM)
			}
		},USE.NAMES = FALSE)
		res2$GS = sapply(res2$LAB,function(xx){
			if( grepl("EUC",xx) ){
				"EUC"
			} else if( grepl("GO",xx) ){
				"GO"
			} else if( grepl("PATH",xx) ){
				"PATH"
			} else if( grepl("ME",xx) ){
				"ME"
			} else {
				xx2 = strsplit(xx,"_")[[1]]
				xx2 = gsub(".rds","",xx2[length(xx2)])
				xx2 = gsub("ww","",xx2)
				ifelse(grepl("2",xx2),"GO",
					ifelse(grepl("3",xx2),"PATH","ME"))
			}
		},USE.NAMES = FALSE)
		
		max_yy = max(c(-log10(0.05),res2$YY)) + 0.5
		if( any(is.infinite(max_yy)) ){
			cat(sprintf("Skip %s b/c perm_pval = 0\n",dataset))
		}
		max_yy
		
		lev_LAM = sort(unique(res2$LAM)); lev_LAM
		res2$LAM = factor(res2$LAM,levels = lev_LAM[c(1,2,3,4,5)],
			labels = c("Euclidean",expression(paste("OT (",lambda," = 0.5)")),
				expression(paste("OT (",lambda," = 1.0)")),
				expression(paste("OT (",lambda," = 5.0)")),
				# expression(paste("OT (",lambda," = ",infinity,")"))
				"OT (Balanced)"))
		lev_GS = sort(unique(res2$GS)); lev_GS
		res2$GS = factor(res2$GS,levels = lev_GS[c(1,2,4,3)],
			labels = c("Euclidean","GO","PATH","ME"))
		res2[1:3,]
		
		res2$OUT2 = sprintf("%s + %s",res2$dataset,res2$OUT); table(res2$OUT2)
		GM = YY = GS = OUT2 = LAM = NULL
		gg = ggplot(data = res2,mapping = aes(x = factor(GM),y = YY,fill = GS)) +
			geom_bar(stat = "identity",position = position_dodge(),width = 0.8) +
			geom_hline(yintercept = -log10(0.05),size = 1.5,linetype = 2) +
			#geom_hline(data = tmp_df,size = 1.2,linetype = 2,
			#	mapping = aes(yintercept = YY,color = factor(GM_0))) +
			xlab("Gene Mutation Frequency Cutoff") + # ggtitle(dataset) +
			ylab("-log10(Permutation P-value)") + labs(fill = "Gene Similarity Method") + 
			facet_grid(OUT2 ~ LAM,labeller = label_parsed) +
			ylim(c(0,max_yy)) + guides(color = "none") + my_theme
		# gg
		png_fn = file.path(my_dirs$fin_dir,"top_tumor_regs.png")
		ggsave(png_fn,plot = gg,device = "png",width = 30,height = 20,units = "in")
		
		
	}
	
	if( TRUE ){ # Table of sample sizes
		ures = unique(res[,c("dataset","STRAT","OUT","Ns","Ne")])
		ures2 = unique(res[,c("dataset","STRAT")])
		rownames(ures2) = NULL
		ures2$CYT = ""; ures2$OS = ""; ures2$PFI = ""; ures2$DSS = ""
		for(ii in seq(nrow(ures2))){
			# ii = 2
			dd = ures2$dataset[ii]
			ss = ures2$STRAT[ii]
			idx1 = (ures$dataset == dd & ures$STRAT == ss)
			idx2 = (ures2$dataset == dd & ures2$STRAT == ss)
			ures[idx1,]
			ures2[idx2,]
			
			if( length(grep("CYT",ures$OUT[idx1])) > 0 ){
				idx3 = idx1 & grepl("CYT",ures$OUT)
				ures2$CYT[idx2] = sprintf("%s",
					ures$Ns[idx3])
			}
			
			for(tt in c("OS","PFI","DSS")){
				# tt = c("OS","PFI","DSS")[1]; tt
				if( length(grep(tt,ures$OUT[idx1])) > 0 ){
					idx3 = idx1 & grepl(tt,ures$OUT)
					ures2[[tt]][idx2] = sprintf("%s (%s)",
						ures$Ns[idx3],ures$Ne[idx3])
				}
			}
		}
		ures2 = name_change(ures2,"dataset","Tumor")
		ures2 = name_change(ures2,"STRAT","Strata")
		ures2$Strata = ifelse(ures2$Strata == "all","ALL","nHM")
		ures2$ABBV = apply(ures2[,c("Tumor","Strata")],1,function(xx){
			ifelse(xx[2] == "nHM",sprintf("%s-%s",xx[1],xx[2]),xx[1])
		})
		ures2
		
		tmp_df = name_TUMORTYPES()
		names(tmp_df) = c("Tumor","Name")
		ures2 = smart_merge(ures2,tmp_df)
		ures2
		
		print_latex_table(DATA = ures2[,c("Name","ABBV","CYT","OS","PFI","DSS")],
			add_table = TRUE,my_align = "llllll",fontsize = "tiny",repeat_VARS = "Name",
			caption = sprintf("Number of subjects and number of observed events"))
		
	}
	
	# Check on regression sample sizes
	sapply(unique(res$dataset),function(xx) sort(unique(res$Ne[res$dataset == xx])))
	
	if( TRUE ){ # Check top significant tumor types
		
		my_theme = theme(legend.position = "bottom",
			text = element_text(size = 40),
			plot.title = element_text(hjust = 0.5),
			panel.grid.major = element_line(colour = "grey50",
				size = 0.5,linetype = "dotted"),
			axis.title.x = element_text(vjust = -0.5),
			axis.text.x = element_text(size = 32),
			panel.background = element_blank(),
			panel.spacing.x = unit(0.75,"lines"),
			strip.text.y = element_text(angle = 0,hjust = 0))
		
		# In-depth euclidean vs optimal transport
		for(dataset in datasets){
			# dataset = c("BLCA","COAD","KIRC","LGG","LIHC","LUAD","PAAD")[2]
			cat(sprintf("%s: dataset = %s ...\n",date(),dataset))
			# if( dataset == "COAD" ) next
			res2 = res[which(res$dataset == dataset
				& res$OUT != "DFI" & res$GM <= 30 & res$GM > 1
				),]
			res2 = res2[order(res2$GM),]
			res2$OUT = ifelse(res2$OUT == "log10_CYT","Cytolytic~Activity",
				ifelse(res2$OUT == "OS","Overall~Survival",
				ifelse(res2$OUT == "PFI","Progression-Free~Interval",
					"Disease-Specific~Survival")))
			uOUT = sort(unique(res2$OUT)); uOUT
			mOUT = c("Cytolytic~Activity","Overall~Survival",
				"Progression-Free~Interval","Disease-Specific~Survival")
			mOUT = mOUT[mOUT %in% uOUT]
			res2$OUT = factor(res2$OUT,levels = mOUT)
			#res2$LAB = factor(res2$LAB,levels = c("EUC","OT_omnibus"),
			#	labels = c("Euclidean","Optimal Transport (Omnibus)"))
			# dim(res2); res2[1:5,]
			
			res2$LAM = sapply(res2$LAB,function(xx){
				if( grepl("EUC",xx) ){
					"EUC"
				} else {
					# xx = res2$LAB[2]; xx
					LAM = gsub("LAM.","",strsplit(xx,"_")[[1]][3])
					sprintf("OT (lambda = %s)",LAM)
				}
			},USE.NAMES = FALSE)
			res2$GS = sapply(res2$LAB,function(xx){
				if( grepl("EUC",xx) ){
					"EUC"
				} else if( grepl("GO",xx) ){
					"GO"
				} else if( grepl("PATH",xx) ){
					"PATH"
				} else if( grepl("ME",xx) ){
					"ME"
				} else {
					xx2 = strsplit(xx,"_")[[1]]
					xx2 = gsub(".rds","",xx2[length(xx2)])
					xx2 = gsub("ww","",xx2)
					ifelse(grepl("2",xx2),"GO",
						ifelse(grepl("3",xx2),"PATH","ME"))
				}
			},USE.NAMES = FALSE)
			
			max_yy = max(c(-log10(0.05),res2$YY)) + 0.5
			if( any(is.infinite(max_yy)) ){
				cat(sprintf("Skip %s b/c perm_pval = 0\n",dataset))
				next
			}
			
			lev_LAM = sort(unique(res2$LAM)); # lev_LAM
			res2$LAM = factor(res2$LAM,levels = lev_LAM[c(1,2,3,4,5)],
				labels = c("Euclidean",expression(paste("OT (",lambda," = 0.5)")),
					expression(paste("OT (",lambda," = 1.0)")),
					expression(paste("OT (",lambda," = 5.0)")),
					# expression(paste("OT (",lambda," = ",infinity,")"))
					"OT (Balanced)"))
			lev_GS = sort(unique(res2$GS)); # lev_GS
			res2$GS = factor(res2$GS,levels = lev_GS[c(1,2,4,3)],
				labels = c("Euclidean","GO","PATH","ME"))
			GM = YY = OUT = LAM = STRAT = NULL
			gg = ggplot(data = res2,mapping = aes(x = factor(GM),y = YY,fill = GS)) +
				geom_bar(stat = "identity",position = position_dodge(),width = 0.8) +
				geom_hline(yintercept = -log10(0.05),size = 1.5,linetype = 2) +
				#geom_hline(data = tmp_df,size = 1.2,linetype = 2,
				#	mapping = aes(yintercept = YY,color = factor(GM_0))) +
				xlab("Gene Mutation Frequency Cutoff") + 
				ggtitle(dataset) +
				ylab("-log10 Permutation P-value") + labs(fill = "Gene Similarity Method") + 
				ylim(c(0,max_yy)) + guides(color = "none") + my_theme
			# dataset = unique(res2$dataset)
			
			if( dataset == "COAD" ){
				gg = gg + facet_nested(OUT ~ LAM + STRAT,labeller = label_parsed)
				width = 45
			} else {
				# gg = gg + facet_grid(out2 ~ LAM,labeller = label_parsed)
				gg = gg + facet_grid(OUT ~ LAM,labeller = label_parsed)
				width = 30
			}
			# gg
			
			png_fn = file.path(my_dirs$fin_dir,sprintf("indepth_%s.png",dataset))
			ggsave(png_fn,plot = gg,device = "png",width = width,height = 20,units = "in")
			rm(gg)
			
		}
		
	}
	
	return(NULL)
	
}



###
pllittle/ROKETworkflow documentation built on Sept. 27, 2022, 2:15 p.m.