R/get_option_parser_training.R
In REPTILE: Regulatory DNA Element Prediction

#' Build the command-line option parser for REPTILE model training
#'
#' Assembles the `optparse` option list, description and usage text for the
#' training script ("REPTILE_train.R") and wraps them in an `OptionParser`.
#' The function takes no arguments and does not parse anything itself; pass
#' the returned object to `optparse::parse_args()`.
#'
#' Options defined (dest names in parentheses):
#'   -i (data_info_file), -a (annotated_region_epimark_file),
#'   -d (DMR_epimark_file), -l (label_file), -s (samples_for_training),
#'   -r (ref_sample, default NULL), -o (output_prefix),
#'   -c (classifier_family, default "RandomForest"),
#'   -x (incl_dev, default TRUE; the flag stores FALSE),
#'   -t (num_trees, default 2000).
#'
#' @return An `optparse` `OptionParser` object.
get_option_parser_training <- function() {
  # Fail early with an actionable message: requireNamespace() only returns
  # FALSE when the package is missing, it does not raise an error by itself.
  optparse_available <- suppressPackageStartupMessages(
    requireNamespace("optparse", quietly = TRUE)
  )
  if (!optparse_available) {
    stop(
      "Package 'optparse' is required to build the REPTILE training option ",
      "parser. Install it with install.packages(\"optparse\").",
      call. = FALSE
    )
  }

  # Help fragments shared verbatim by the two epimark-file options (-a, -d).
  epimark_format_help <- paste0(
    "\t\tformat:\n",
    "\t\t  First line is a header indicating the content of each\n",
    "\t\t  column. First four columns are chromosome, start, end and\n",
    "\t\t  region id. Following columns are the scores/enrichment of\n",
    "\t\t  epigenetic marks for annotated regions.\n",
    "\t\texample:\n",
    "\t\t  chr start end id H3K4me1_H1 H3K4me1_H9 H3K4me1_IMR90 ...\n"
  )
  preprocess_note_help <- paste0(
    "\t    **Highly recommend to use \"REPTILE_preprocess.py\" script to generate\n",
    "\t      this file. Run \"./REPTILE_preprocess.py -h\" for more information.\n",
    "\t      Make sure the same data info file is used.\n"
  )

  option_list <- list(
    optparse::make_option(
      c("-i", "--data-info-file"),
      type = "character",
      metavar = "data_info_file",
      dest = "data_info_file",
      help = paste0(
        "Tab-separated file providing information about samples,\n",
        "\t\tepigenetic marks and paths to corresponding bigwig files.\n",
        "\t\tNo duplicated mark name is allowed for each sample and\n",
        "\t\tno duplicated sample name is allowed for each mark. \"samples\"\n",
        "\t\tcan be different cell/tissue types or different conditions.\n",
        "\t\tformat:\n",
        "\t\t  <sample name><\\t><mark name><\\t><path to bigwig file>\n",
        "\t\texample (header is required):\n",
        "\t\t  sample mark bw_file\n",
        "\t\t  heart H3K4me1 bigwig/heart_H3K4me1.bw\n",
        "\t\t  ...\n",
        "\t\t  brain H3K27ac bigwig/brain_H3K27ac.bw\n",
        "\t    **Please use the same data info file as the one used in\n",
        "\t      data preprocessing (using \"REPTILE_preprocess.py\").\n"
      )
    ),
    optparse::make_option(
      c("-a", "--annotated-region-epimark-file"),
      type = "character",
      metavar = "annotated_region_epimark_file",
      dest = "annotated_region_epimark_file",
      help = paste0(
        "Tab-separated file of epigenetic profiles of annotated regions\n",
        epimark_format_help,
        "\t\t  chr1 3172000 3173000 reg_0 1.43 1.50 0.03 ...\n",
        "\t\t  ...\n",
        "\t\t  chr9 124412000 124413000 reg_302031 0.34 0.44 2.42 ...\n",
        preprocess_note_help
      )
    ),
    optparse::make_option(
      c("-d", "--DMR-epimark-file"),
      type = "character",
      metavar = "DMR_epimark_file",
      dest = "DMR_epimark_file",
      help = paste0(
        "Tab-separated file of epigenetic profiles of differentially\n",
        "\t\tmethylated regions (DMRs). Format is same as that of \n",
        "\t\tannotated_region_epimark_file. DMRs are used as high-resolution\n",
        "\t\tenhancer candidates to increase the resolution of training and\n",
        "\t\tprediction. The candidate loci can also come from assays like\n",
        "\t\tDNase-seq or ATAC-seq or analysis on motifs.\n",
        epimark_format_help,
        "\t\t  chr1 3172266 3172488 dmr_0 1.43 1.50 0.03 ...\n",
        "\t\t  ...\n",
        "\t\t  chr19 61316546 61316778 dmr_513260 0.34 0.44 2.42 ...\n",
        preprocess_note_help
      )
    ),
    optparse::make_option(
      c("-l", "--label-file"),
      type = "character",
      metavar = "label_file",
      dest = "label_file",
      help = paste0(
        "Tab-separated file labeling what annotated regions are active in\n",
        "\t\twhich sample(s).\n",
        "\t\tformat:\n",
        "\t\t  The file has multiple columns. First column is region id.\n",
        "\t\t  Each following column corresponds to one sample and the values\n",
        "\t\t  indicate whether the region is active in the sample:\n",
        "\t\t    - 1: active,\n",
        "\t\t    - 0: no activity\n",
        "\t\t    - NA: unknown\n",
        "\t\t  First line is a header indicating the content of each column.\n",
        "\t\t  Name of first column is \"id\" and others are sample names.\n",
        "\t\texample:\n",
        "\t\t  id H1 H9 IMR90 ...\n",
        "\t\t  reg_0 1 1 0 ...\n",
        "\t\t  ...\n",
        "\t\t  reg_302031 0 0 1 ....\n"
      )
    ),
    optparse::make_option(
      c("-s", "--samples-for-training"),
      type = "character",
      metavar = "samples_for_training",
      dest = "samples_for_training",
      help = paste0(
        "Samples in which the activities of annotated regions are used for\n",
        "\t\ttraining model.\n",
        "\t\tformat:\n",
        "\t\t  Sample names separated by comma.\n",
        "\t\texample:\n",
        "\t\t  H1,H9,IMR90\n"
      )
    ),
    optparse::make_option(
      c("-r", "--reference-samples"),
      type = "character",
      metavar = "ref_sample",
      dest = "ref_sample",
      default = NULL,
      help = paste0(
        "Samples used as reference to calculate intensity deviation\n",
        "\t\tformat:\n",
        "\t\t  Sample names separated by comma.\n",
        "\t\texample:\n",
        "\t\t  E11_5_FB,E11_5_HT,E11_5_MB\n"
      )
    ),
    optparse::make_option(
      c("-o", "--output-prefix"),
      type = "character",
      metavar = "output_prefix",
      dest = "output_prefix",
      help = paste0(
        "Prefix of the output file storing the model obtained from\n",
        "\t\ttraining. The output file is \"<OUTPUT_PREFIX>.reptile\".\n",
        "\t\texample: enhancer_model\n",
        "\t\t  The output file is enhancer_model.reptile\n"
      )
    ),
    optparse::make_option(
      c("-c", "--classifier-family"),
      type = "character",
      default = "RandomForest",
      metavar = "classifier_family",
      dest = "classifier_family",
      help = paste0(
        "Classifier family to use in the prediction model\n",
        "\t\t  default: RandomForest\n",
        "\t\tClassifiers available:\n",
        "\t\t - RandomForest: random forest\n",
        "\t\t - Logistic: logistic regression\n"
      )
    ),
    # Negative flag: incl_dev is TRUE unless -x is given on the command line.
    optparse::make_option(
      c("-x", "--no-intensity-deviation"),
      type = "logical",
      action = "store_false",
      default = TRUE,
      metavar = "incl_dev",
      dest = "incl_dev",
      help = paste0(
        "If this option is used, REPTILE will not compute the intensity\n",
        "\t\tdeviation feature, which captures the tissue-specificity of\n",
        "\t\tenhancers.\n"
      )
    ),
    optparse::make_option(
      c("-t", "--number-of-trees"),
      type = "double",
      default = 2000,
      metavar = "num_trees",
      dest = "num_trees",
      help = paste0(
        "Number of trees to be constructed in random forest\n",
        "\t\t  classifier. Ignored when other classifiers are\n",
        "\t\t  used.\n",
        "\t\t  default: 2000\n"
      )
    )
  )

  description <- paste0(
    "\tTraining enhancer model from annotated regions (known enhancers and known negative\n",
    "\tregions). \"REPTILE_preprocess.py\" can be used to prepare the input files.\n",
    "\tPlease email Yupeng He (yupeng.he.bioinfo at gmail) for feedbacks, questions or bugs."
  )

  # Placeholder names in the usage line mirror the metavars of the options
  # defined above so the synopsis and the per-option help agree.
  usage <- paste0(
    "Usage: ./REPTILE_train.R \\\n",
    "\t\t -i data_info_file \\\n",
    "\t\t -a annotated_region_epimark_file \\\n",
    "\t\t -d DMR_epimark_file \\\n",
    "\t\t -l label_file \\\n",
    "\t\t -s samples_for_training \\\n",
    "\t\t -o output_prefix\n"
  )

  optparse::OptionParser(
    usage = usage,
    description = description,
    option_list = option_list
  )
}