docs/playground/script/hemdag-call.R

#!/usr/bin/Rscript

## load library
library(HEMDAG);
suppressPackageStartupMessages(library(graph)); ## silence biocgenerics mask messages...
library(optparse);

## command line arguments
## for a detailed description, please see the manual: https://cran.r-project.org/web/packages/HEMDAG/HEMDAG.pdf
optionList <- list(
    make_option(c("-o", "--organism"), type="character", default="7227_drome",
        help="organism name in the form <taxon>_<name> (def. 7227_drome)"),
    make_option(c("-d", "--domain"), type="character", default="mf",
        help="go domain. It can be: bp, mf or cc (def. mf)"),
    make_option(c("-e", "--exptype"), type="character", default="ho",
        help="type of dataset on which run HEMDAG. It can be: ho (hold-out) or cv (cross-validated) -- def. ho"),
    make_option(c("-f", "--flat"), type="character", default="svmlinear",
        help="flat classifier"),
    make_option(c("-p", "--positive"), type="character", default="descendants",
        help="positive nodes selection. It can be: children or descendants.
                Skip this parameter if only topdown strategy is applied (def. descendants)"),
    make_option(c("-b", "--bottomup"), type="character", default="tau",
        help="bottomup strategy. It can be: none, threshold.free, threshold, weighted.threshold.free, weighted.threshold or tau.
                If none only topdown strategy is applied (def. tau)"),
    make_option(c("-t", "--topdown"), type="character", default="gpav",
        help="topdown strategy. It can be: htd or gpav (def. gpav)"),
    make_option(c("-c", "--threshold"), type="character", default="seq(from=0.1, to=0.9, by=0.1)",
        help="threshold for the choice of positive nodes.
                It can be a fixed value or an array of values (def. seq(from=0.1, to=0.9, by=0.1))"),
    make_option(c("-w", "--weight"), type="character", default="0",
        help="weight for the choice of positive nodes. It can be a fixed value or an array of values (def. 0)"),
    make_option(c("-m", "--metric"), type="character", default="auprc",
        help="performance metric on which maximize the parametric ensemble algorithms. It can be: auprc or fmax (def. auprc)"),
    make_option(c("-r", "--round"), type="integer", default="3",
        help="number of rounding digits to be applied for choosing the best Fmax. To be used only if metric is set to fmax (def. 3)"),
    make_option(c("-s", "--seed"), type="integer", default="23",
        help="seed for the random generator to create folds (def. 23)"),
    make_option(c("-k", "--fold"), type="integer", default="5",
        help="number of folds for the cross validation (def. 5)"),
    make_option(c("-l", "--parallel"), type="logical", default=FALSE, action="store_true",
        help="should the sequential or parallel version of gpav be run?
                If flag -p is 'on' the gpav parallel version is run. NB: only gpav can be run in parallel (def. FALSE)"),
    make_option(c("-n", "--cores"), type="integer", default="1",
        help="number of cores to use for the parallel execution of gpav (def. 1)"),
    make_option(c("-z", "--norm"), type="logical", default=FALSE, action="store_true",
        help="should the flat score matrix be normalized? If flag -p is 'on' the input flat scores is normalized (def. FALSE)"),
    make_option(c("-y", "--normtype"), type="character", default="none",
        help="type of normalization. It can be maxnorm or qnorm (def. none)")
);

optParser <- OptionParser(option_list=optionList);
opt <- parse_args(optParser);

prefix    <- opt$organism;
organism  <- strsplit(prefix,"_")[[1]][2];
exptype   <- opt$exptype;
flat      <- opt$flat;
positive  <- opt$positive;
bottomup  <- opt$bottomup;
topdown   <- opt$topdown;
domain    <- opt$domain;
threshold <- eval(parse(text=opt$threshold));
weight    <- eval(parse(text=opt$weight));
metric    <- opt$metric;
round     <- opt$round;
seed      <- opt$seed;
kk        <- opt$fold;
parallel  <- opt$parallel;
cores     <- opt$cores;
norm      <- opt$norm;
normtype  <- opt$normtype;
if(normtype == "none")
    normtype <- NULL;

## hemdag algorithm to be displayed in output file name -> 18 iso/tpr-dag ensemble combinations + gpav + htd (tot 20 hemdag family)
if(positive=="children" && bottomup=="threshold.free" && topdown=="htd")
    hemdag.name <- "tprTF";
if(positive=="children" && bottomup=="threshold" && topdown=="htd")
    hemdag.name <- "tprT";
if(positive=="children" && bottomup=="weighted.threshold.free" && topdown=="htd")
    hemdag.name <- "tprW";
if(positive=="children" && bottomup=="weighted.threshold" && topdown=="htd")
    hemdag.name <- "tprwt";
if(positive=="descendants" && bottomup=="threshold.free" && topdown=="htd")
    hemdag.name <- "descensTF";
if(positive=="descendants" && bottomup=="threshold" && topdown=="htd")
    hemdag.name <- "descensT";
if(positive=="descendants" && bottomup=="weighted.threshold.free" && topdown=="htd")
    hemdag.name <- "descensW";
if(positive=="descendants" && bottomup=="weighted.threshold" && topdown=="htd")
    hemdag.name <- "descensWT";
if(positive=="descendants" && bottomup=="tau" && topdown=="htd")
    hemdag.name <- "descensTAU";
if(positive=="children" && bottomup=="threshold.free" && topdown=="gpav")
    hemdag.name <- "isotprTF";
if(positive=="children" && bottomup=="threshold" && topdown=="gpav")
    hemdag.name <- "isotprT";
if(positive=="children" && bottomup=="weighted.threshold.free" && topdown=="gpav")
    hemdag.name <- "isotprW";
if(positive=="children" && bottomup=="weighted.threshold" && topdown=="gpav")
    hemdag.name <- "isotprWT";
if(positive=="descendants" && bottomup=="threshold.free" && topdown=="gpav")
    hemdag.name <- "isodescensTF";
if(positive=="descendants" && bottomup=="threshold" && topdown=="gpav")
    hemdag.name <- "isodescensT";
if(positive=="descendants" && bottomup=="weighted.threshold.free" && topdown=="gpav")
    hemdag.name <- "isodescensW";
if(positive=="descendants" && bottomup=="weighted.threshold" && topdown=="gpav")
    hemdag.name <- "isodescensWT";
if(positive=="descendants" && bottomup=="tau" && topdown=="gpav")
    hemdag.name <- "isodescensTAU";
if(bottomup=="none" && topdown=="gpav")
    hemdag.name <- "gpav";
if(bottomup=="none" && topdown=="htd")
    hemdag.name <- "htd";

## I/O directories
data.dir <- paste0("../data/", exptype, "/");
res.dir  <- paste0("../res/",  exptype, "/");
if(!dir.exists(res.dir)){dir.create(res.dir, recursive=TRUE);}

## flat/ann/dag/testIndex files
files <- list.files(data.dir);
flat.file <- files[grep(paste0(organism, ".*", domain, ".scores.*", flat), files)];
ann.file  <- files[grep(paste0(domain,".ann"), files)];
dag.file  <- files[grep(paste0(domain,".dag"), files)];
if(exptype == "ho"){
    idx.file  <- files[grep(paste0(domain,".testindex"), files)];
    if(length(idx.file)==0)
        stop("no index file found\n");
}

## check if flat|ann|dag exists
if(length(flat.file)==0 || length(ann.file)==0 || length(dag.file)==0)
    stop("no flat|ann|dag file found\n");

## load data
S <- get(load(paste0(data.dir, flat.file)));
g <- get(load(paste0(data.dir, dag.file)));
ann <- get(load(paste0(data.dir, ann.file)));
if(exptype == "ho")
    testIndex <- get(load(paste0(data.dir, idx.file)));

## shrink graph g to terms of matrix S -- if number of nodes between g and S mismatch
root <- root.node(g);
nd <- colnames(S);
class.check <- ncol(S) != graph::numNodes(g);
if(class.check){
    root.check <- root %in% colnames(S);
    if(!root.check)
        nd <- c(root, nd);
    g <- build.subgraph(nd, g, edgemode="directed");
    ann <- ann[, colnames(S)];
}

## address case when (iso)descensW is called with a fixed value of w to enter in the right branch
if(bottomup=="weighted.threshold.free" && length(weight)==1)
    threshold <- 0;

## elapsed time
start.elapsed <- proc.time();

## HEMDAG calling
if(exptype == "ho"){
    if(bottomup=="none"){
        if(topdown=="gpav"){
            S.hier <- gpav.holdout(S=S, g=g, testIndex=testIndex, W=NULL, parallel=parallel,
                ncores=cores, norm=norm, norm.type=normtype);
        }else{
            S.hier <- htd.holdout(S=S, g=g, testIndex=testIndex, norm=norm, norm.type=normtype);
        }
    }else{ ## branch to call HEMDAG by tuning the parameters
        if(length(threshold)>1 || length(weight)>1){
            S.hier <- tpr.dag.holdout(S, g, ann=ann, testIndex=testIndex, norm=norm, norm.type=normtype,
                positive=positive, bottomup=bottomup, topdown=topdown, W=NULL, parallel=parallel, ncores=cores,
                threshold=threshold, weight=weight, kk=kk, seed=seed, metric=metric, n.round=round);
        }else{ ## branch to call HEMDAG with fixed the parameters
            ## add root node if it does not exist
            if(!(root %in% colnames(S))){
                max.score <- max(S);
                z <- rep(max.score,nrow(S));
                S <- cbind(z,S);
                colnames(S)[1] <- root;
            }
            ## normalization
            if(norm){
                S <- scores.normalization(norm.type=normtype, S);
                cat(normtype, "normalization done\n");
            }
            ## shrink S to test indexes
            S.test <- S[testIndex,];
            ## degenerate case when test set has just one row/example
            if(!is.matrix(S.test)){
                test.sample <- rownames(S)[testIndex];
                S.test <- matrix(S.test, ncol=length(S.test), dimnames=list(test.sample, names(S.test)));
            }
            ## tpr-dag correction
            S.hier <- tpr.dag(S.test, g, root=root, positive=positive, bottomup=bottomup, topdown=topdown,
                t=threshold, w=weight, W=NULL, parallel=parallel, ncores=cores);
            ## print chosen parameters
            if(bottomup=="weighted.threshold.free"){
                cat("fixed weight:", weight, "\n");
            }else if(bottomup=="weighted.threshold"){
                cat("fixed weight:", weight, "fixed threshold:", threshold, "\n");
            }else{
                cat("fixed threshold:", threshold, "\n");
            }
            cat("tpr-dag correction done\n");
        }
    }
}else{
    if(bottomup=="none"){
        if(topdown=="gpav"){
            S.hier <- gpav.vanilla(S=S, g=g, W=NULL, parallel=parallel, ncores=cores, norm=norm, norm.type=normtype);
        }else{
            S.hier <- htd.vanilla(S=S, g=g, norm=norm, norm.type=normtype);
        }
    }else{  ## branch to call HEMDAG by tuning the parameters
        if(length(threshold)>1 || length(weight)>1){
            S.hier <- tpr.dag.cv(S, g, ann=ann, norm=norm, norm.type=normtype, positive=positive, bottomup=bottomup,
                topdown=topdown, W=NULL, parallel=parallel, ncores=cores, threshold=threshold, weight=weight,
                kk=kk, seed=seed, metric=metric, n.round=round);
        }else{  ## branch to call HEMDAG with fixed parameters
            ## add root node if it does not exist
            if(!(root %in% colnames(S))){
                max.score <- max(S);
                z <- rep(max.score,nrow(S));
                S <- cbind(z,S);
                colnames(S)[1] <- root;
            }
            ## normalization
            if(norm){
                S <- scores.normalization(norm.type=normtype, S);
                cat(normtype, "normalization done\n");
            }
            S.hier <- tpr.dag(S, g, root=root, positive=positive, bottomup=bottomup, topdown=topdown,
                t=threshold, w=weight, W=NULL, parallel=parallel, ncores=cores);
            ## print chosen parameters
            if(bottomup=="weighted.threshold.free"){
                cat("weight: ", weight, "\n");
            }else if(bottomup=="weighted.threshold"){
                cat("weight: ", weight, "threshold: ", threshold, "\n");
            }else{
                cat("threshold: ", threshold, "\n");
            }
            cat("tpr-dag correction done\n");
        }
    }
}

stop.elapsed <- proc.time() - start.elapsed;
timing.s <- stop.elapsed["elapsed"];
timing.m <- round(timing.s/(60),4);
timing.h <- round(timing.m/(60),4);
cat(hemdag.name, "running time:", timing.s["elapsed"], "(seconds)", "|", timing.m["elapsed"], "(minutes)", "|" , timing.h["elapsed"], "(hours)", "\n\n");

## store results
## outname
fname <- unlist(strsplit(flat.file, split="[.,_]"));
outname <- paste0(fname[-((length(fname)-1):length(fname))], collapse="_");
if(norm==TRUE && !(is.null(normtype)))
    outname <- paste0(outname,"_",normtype);
if(exptype == "ho"){
    save(S.hier, file=paste0(res.dir, outname, "_", hemdag.name, "_holdout.rda"), compress=TRUE);
}else{
    save(S.hier, file=paste0(res.dir, outname, "_", hemdag.name, ".rda"), compress=TRUE);
}
AnacletoLAB/HEMDAG documentation built on Oct. 14, 2022, 9:18 p.m.