R/api_dataframe_loader.R

Defines functions match_features find_matching_featureset feature_matching_level1 formulate_feature_source_from_measuremenset createDataLoader

Documented in feature_matching_level1 find_matching_featureset formulate_feature_source_from_measuremenset match_features

#' convenience function for matching features between database and file
match_features = function(features_in_file, df_features_db, feature_type, column_in_db) {
  df_features_db = df_features_db[df_features_db$feature_type == feature_type | 
                                    df_features_db$feature_type == 'controls', ]
  
  print(head(as.character(features_in_file)))
  matchL = find_matches_and_return_indices(features_in_file, df_features_db[, column_in_db])
  
  # Assume that all features are already registered
  if (length(matchL$source_unmatched_idx) != 0) {
    cat("Out of", length(unique(features_in_file)), "features, ")
    cat("unmatched features:", 
        pretty_print(vec = unique(features_in_file[matchL$source_unmatched_idx])), "\n")
    stop("...")
  }
  df_features_db$feature_id[matchL$target_matched_idx]
}

#' find featureset from Pipeline sheet info
#' 
#' Row or row(s) of Pipeline sheet are used to load data. Unique choice
#' of featureset name is used to subselect a specific featureset registered
#' in the database. 
find_matching_featureset = function(
  pipeline_df, # row(s) of Pipeline sheets that are being loaded
  featureset_link_col, # column linking selection in Pipelines sheet to featureset definition
  fsets_scidb # dataframe containing all featuresets registered with system
) {
  matchTarget = unique(pipeline_df[, featureset_link_col])
  stopifnot(length(matchTarget) == 1)
  fset = drop_na_columns(fsets_scidb[match(matchTarget, 
                                           fsets_scidb[,
                                                       featureset_link_col]), ])
  stopifnot(nrow(fset) == 1)
  fset
}

#' Level 1 feature matching
#' 
#' Match column in file with feature names
#' Return result is a structure with matched and unmatched indices
feature_matching_level1 = function(data_df, # data-frame containing feature and measurement values
                                   col_match_ftr_name, # column in data to match with scidb features
                                   fset, # specific featureset to be used for matching
                                   feature_df, # features data frame at specific featureset_id
                                   feature_df_col = 'name' # column in features data.frame with which to match
) {
  cat("Matching features in file by feature-names in DB at featureset_id", 
      fset$featureset_id, "\n")
  find_matches_and_return_indices(data_df[, col_match_ftr_name], 
                                  feature_df[, feature_df_col])
}       

#' Formulate feature source string for use in feature registration
formulate_feature_source_from_measuremenset = function(measurementset) {
  stopifnot(nrow(measurementset) == 1)
  paste0(
    "measurementset_id: ", 
    measurementset$measurementset_id, 
    "; pipeline_name: ", 
    measurementset$name, 
    "; data_type: ", 
    measurementset$entity)
}

##### DataLoader #####
DataLoader = R6::R6Class(classname = "DataLoader", 
        public = list(
          print_level = function() {cat("----(Level: DataLoader)\n")},
          assign_biosample_ids = function(){
            cat("assign_biosample_ids()"); self$print_level()
            bios_ref = private$.reference_object$biosample
            entity = unique(private$.reference_object$measurement_set$entity)
            if (entity %in% c(.ghEnv$meta$arrRnaquantification,
                              .ghEnv$meta$arrVariant,
                              .ghEnv$meta$arrCytometry_cytof)) {
              suffix = template_helper_suffix_by_entity(entity = private$.reference_object$measurement_set$entity)
              bios_ref = bios_ref[grep(suffix, bios_ref$name), ]
              cat("Chose suffix:", suffix, "for entity:", private$.reference_object$measurement_set$entity, 
                  "\nRetained:", nrow(bios_ref), "of total:", nrow(private$.reference_object$biosample), "in manifest\n")
            }
            if (nrow(private$.reference_object$pipeline_df) > 1) { 
              # multiple Measurements combined into one file
              # ==> data must contain a column called `biosample_name`
              cat("Multiple Measurements combined into one file\n")
              cat("Replacing `biosample_name` with `biosample_id`\n")
              
              cat("Summary file has", length(unique(private$.data_df$biosample_name)), "biosamples\n")
              cat("Current pipeline record expects", length(private$.reference_object$pipeline_df$original_sample_name), 
                  "entries\n")
              
              m1 = find_matches_and_return_indices(
                source = private$.reference_object$pipeline_df$sample_name,
                target = bios_ref$name
              )
              if (nrow(bios_ref) != length(m1$target_matched_idx)) {
                cat("Filtering available biosamples by pipeline sheet entries\n",
                    "Originally", nrow(bios_ref), "samples. Filtered down to",
                    length(m1$target_matched_idx), "samples\n")
                bios_ref = bios_ref[m1$target_matched_idx, ]
              }
              
              if (private$.match_biosample_name_exactly) {
                cat("Doing exact matching\n")
                if (!all(private$.data_df$biosample_name %in% private$.reference_object$pipeline_df$original_sample_name)) {
                  cat("Dropping extra entries\n")
                  private$.data_df = private$.data_df[private$.data_df$biosample_name %in% private$.reference_object$pipeline_df$original_sample_name, ]
                }
                
                mL = find_matches_and_return_indices(private$.data_df$biosample_name, 
                                                     bios_ref$original_sample_name)
                
                private$.data_df$biosample_id = -1
                private$.data_df$biosample_id[mL$source_matched_idx] = bios_ref$biosample_id[mL$target_matched_idx]
                
                unmatched = unique(private$.data_df[private$.data_df$biosample_id == -1, ]$biosample_name)
                if (length(unmatched) > 0) {
                  cat("dropping", length(unmatched), "samples:", pretty_print(unmatched, prettify_after = length(unmatched)), "\n")
                  private$.data_df = private$.data_df[private$.data_df$biosample_id != -1, ]
                }
              } else {
                cat("Doing inexact name matching\n") # Tested on CyTOF data
                candidate_sample_names = bios_ref$original_sample_name
                file_sample_names = unique(private$.data_df$biosample_name)
                match_file_to_candidate = sapply(
                  file_sample_names, function(nm) {
                    # cat("Sample name: ", nm, "\n")
                    res = unlist(sapply(candidate_sample_names, 
                           function(bios_ref_nm) grepl(bios_ref_nm, nm)))
                    res = which(res)
                    stopifnot(length(res) == 1)
                    as.integer(res)
                  })
                private$.data_df$biosample_id = 
                  bios_ref[match_file_to_candidate[private$.data_df$biosample_name], ]$biosample_id
              }
              
              private$.data_df$biosample_name = NULL
            } else if (nrow(private$.reference_object$pipeline_df) == 1) {
              # One Measurement per file 
              # ==> information about biosample will exist in `pipeline_df` reference object
              cat("One Measurement per file\n")
              
              sample_name_in_file = private$.reference_object$pipeline_df$sample_name
              
              mL = find_matches_and_return_indices(sample_name_in_file,
                                                   bios_ref$name)
              
              if (length(mL$source_unmatched_idx) != 0) {
                stop("did not expect to have to match original_sample_names while handling
                     one row of Pipeline sheet (i.e. per sample file)")
              }
              
              if (length(grep(sample_name_in_file, 
                              bios_ref$name)) > 1) {
                if (entity %in% c(.ghEnv$meta$arrRnaquantification,
                                  .ghEnv$meta$arrVariant)) {
                  stop("Did not expect this error for RNA-seq/GXP or variant entities")
                }
                stop("sample name in pipeline sheet was matched to more than one sample in biosample dataframe.
                    Must add logic to handle this e.g.
                    - FUSION data is typically derived from RNA sample, and rarely from DNA sample
                    - COPYNUMBER data is typically derived from DNA sample, and rarely from RNA sample")
              }
              
              sample_id_db = bios_ref$biosample_id[mL$target_matched_idx]
              cat("sample-name:", sample_name_in_file, "==> biosample_id:", 
                  sample_id_db, "\n")
              private$.data_df$biosample_id = sample_id_db
            }
            invisible(self)
          },
          
          #' download features for current featureset
          #' 
          #' compare with `featureset_id` of the current measurementSet
          #' and update the feature stored in `reference_object` if there has been a change in `featureset_id`
          download_features_for_featureset = function() {
            cat("download_features_for_featureset()"); self$print_level()
            fsets_scidb = private$.reference_object$featureset
            # print(dim(fsets_scidb))
            
            featureset_name = unique(private$.reference_object$pipeline_df[, 
                                                      template_linker$featureset$choices_col])
            stopifnot(length(featureset_name) == 1)
            # print(featureset_name)
            
            fset = fsets_scidb[match(featureset_name, 
                                     fsets_scidb[, template_linker$featureset$choices_col]), ]
            stopifnot(nrow(fset) == 1)
            # print(fset)
            
            if (is.null(private$.reference_object$feature) ||
                nrow(private$.reference_object$feature) == 0) {
              cat("feature_ref = NULL; downloading features for featureset", fset$featureset_id, "into loaderObject\n")
              private$.reference_object$feature = search_features(featureset_id = fset$featureset_id)
            } else {
              if (unique(private$.reference_object$feature$featureset_id) != fset$featureset_id) {
                cat("featureset has changed; downloading features for featureset", fset$featureset_id, "into loaderObject\n")
                private$.reference_object$feature = search_features(featureset_id = fset$featureset_id)
              }
            }
          },
          
          #' register new features (if any)
          #' 
          register_new_features = function() {
            cat("register_new_features()"); self$print_level()
            return(FALSE)
          },
          
          #' return reference_object 
          #' 
          #'  For DEBUG only
          # get_reference_object = function(){
          #   private$.reference_object
          # },
          
          # return features in reference object
          retrieve_features = function() {
            cat("retrieve_features()"); self$print_level()
            private$.reference_object$feature
          },
          
          retrieve_feature_synonyms = function() {
            cat("retrieve_feature_synonyms()"); self$print_level()
            private$.reference_object$feature_synonym
          },
          
          #' update feature and feature synonym in reference object
          update_reference_object = function() {
            # update the selected features
            fsets_scidb = private$.reference_object$featureset
            # print(dim(fsets_scidb))
            
            featureset_name = unique(private$.reference_object$pipeline_df[, 
                                                                           template_linker$featureset$choices_col])
            stopifnot(length(featureset_name) == 1)
            # print(featureset_name)
            
            fset = fsets_scidb[match(featureset_name, 
                                     fsets_scidb[, template_linker$featureset$choices_col]), ]

            cat("updating feature in reference object\n")
            
            private$.reference_object$feature = search_features(featureset_id = 
                                fset$featureset_id)
            cat("updating feature-synonym in reference object\n")
            fsyn = get_feature_synonym()
            private$.reference_object$feature_synonym = fsyn[fsyn$featureset_id == fset$featureset_id, ]
          },
          
          #' assign feature ids
          #' 
          #' assume that new features have been registered by this time
          assign_feature_ids = function(){
          },
          
          #' assign dataset_id and measurementset_id
          assign_other_ids = function(){
            private$.data_df$dataset_id = private$.reference_object$record$dataset_id
            private$.data_df$measurementset_id = private$.reference_object$measurement_set$measurementset_id
          },
          load_data = function(){
            
          },
          returnReferenceObject = function(){
            
          },
          initialize = function(data_df, reference_object, feature_annotation_df = NULL){
            private$.data_df = data_df
            private$.reference_object = reference_object
            private$.feature_annotation_df = feature_annotation_df
          }
        ), private = list(
          get_selected_featureset = function() {
            # retrieve the featureset for selected set of entries from Pipeline sheet 
            # (must have downselected to a unique featureset at the time of calling)
            fset_choice = unique(
              private$.reference_object$pipeline_df[,
                                                    template_linker$featureset$choices_col])
            stopifnot(length(fset_choice) == 1)
            fsets_scidb = private$.reference_object$featureset
            fset = drop_na_columns(
              fsets_scidb[match(fset_choice,
                                fsets_scidb[,
                                            template_linker$featureset$choices_col]), ])
            stopifnot(nrow(fset) == 1)
            fset
          },
          get_feature_synonym_df_for_selected_featureset = function() {
            # get feature synonym dataframe for use in matching
            # (assumes user must have downselected to a unique featureset at the time of calling)
            fset = private$get_selected_featureset()
            cat("Matching features in file by feature-synonyms in DB at featureset_id", 
                fset$featureset_id, "\n")
            fsyn_sel = private$.reference_object$feature_synonym[
              private$.reference_object$feature_synonym$featureset_id == 
                fset$featureset_id, ]
            fsyn_sel
          },
          .data_df = NULL,
          .feature_annotation_df = NULL, 
          .reference_object = NULL,
          .match_biosample_name_exactly = TRUE
        ))


##### DataLoaderBlankSlots #####
# Class where all slots are blank on purpose i.e. do no action
# To be used as a dummy
DataLoaderBlankSlots = R6::R6Class(
  classname = "DataLoaderBlankSlots",
  inherit = DataLoader,
  public = list(
    assign_biosample_ids = function() {}, 
    download_features_for_featureset = function() {}, 
    register_new_features = function() {return(FALSE)}, 
    retrieve_features = function() {}, 
    retrieve_feature_synonyms = function() {}, 
    update_reference_object = function() {}, 
    assign_feature_ids = function() {}, 
    assign_other_ids = function() {}, 
    load_data = function() {}
  )
)
##### DataLoaderExpression #####
# class to be shared between loaders for
# - GeneExpression (RNAQuantRNASeq), 
# - ProteinExpression
# - etc.
DataLoaderExpression = R6::R6Class(classname = "DataLoaderExpression",
                                inherit = DataLoader,
                                public = list(
                                  print_level = function() {cat("----(Level: DataLoaderExpression)\n")},
                                  load_data = function(){
                                    cat("load_data()"); self$print_level()
                                    df_size_mb = as.integer(
                                      as.double(object.size(
                                        private$.data_df)
                                        )/1024/1024)
                                    upload_chunk_max = 200 # 200 MB
                                    if (df_size_mb < upload_chunk_max) {
                                      register_expression_dataframe(df1 = private$.data_df, 
                                                                    dataset_version = private$.reference_object$record$dataset_version)
                                    } else{
                                      factorLarger = round(df_size_mb/upload_chunk_max)
                                      cat("Dataframe is of size:", df_size_mb, "Mb. Uploading in", factorLarger, "pieces\n")
                                      stepSize = round(nrow(private$.data_df)/factorLarger)
                                      starts = seq(1, nrow(private$.data_df), stepSize)
                                      ends = starts + stepSize - 1
                                      ends[length(ends)] = nrow(private$.data_df)
                                      for (idx in 1:factorLarger) {
                                        cat("Uploading data.frame sub-chunk #", idx, "\n")
                                        register_expression_dataframe(df1 = private$.data_df[c(starts[idx]:ends[idx]), ], 
                                                                      dataset_version = private$.reference_object$record$dataset_version)
                                      }
                                    }
                                  },
                                  
                                  assign_feature_ids = function(feature_type, column_in_file) {
                                    super$assign_feature_ids()
                                    cat("assign_feature_ids()"); self$print_level()
                                    
                                    if (identical(colnames(private$.feature_annotation_df), 
                                                  'gene_symbol')) {
                                      column_in_db = 'gene_symbol' # special handling for RSEM file with Hugo gene symbols 
                                                                   # https://github.com/Paradigm4/revealgenomics/blob/master/inst/extdata/data_study_XX_ROW_DNAnexus_gene_RSEM_eset_TPM_w_clin_1samplePerPatient.tsv
                                                                   # see commit https://github.com/Paradigm4/revealgenomics/commit/39e96f18a5b1fd0fbc0418f3e0597fa4706ccc8b#diff-680b17553e29f834dcf09bd13c28ccac
                                    } else {
                                      column_in_db = 'name'        # in all other cases, match with feature `name` 
                                    }
                                    private$.data_df$feature_id = match_features(
                                      features_in_file = private$.data_df[, column_in_file],
                                      df_features_db = private$.reference_object$feature,
                                      feature_type = feature_type,
                                      column_in_db = column_in_db)
                                    private$.data_df[, column_in_file] = NULL
                                  }
                                ))

##### DataLoaderRNAQuantRNASeq #####
DataLoaderRNAQuantRNASeq = R6::R6Class(classname = "DataLoaderRNAQuantRNASeq",
                                inherit = DataLoaderExpression, 
                                public = list(
                                  print_level = function() {cat("----(Level: DataLoaderRNAQuantRNASeq)\n")},
                                  register_new_features = function() {
                                    cat("register_new_features()"); self$print_level()
                                    if ('gene_short_name' %in% colnames(private$.data_df)) {
                                      stopifnot(nrow(private$.reference_object$pipeline_df) == 1)
                                      fsets_scidb = private$.reference_object$featureset
                                      fset = drop_na_columns(fsets_scidb[match(private$.reference_object$pipeline_df[, 
                                                                                                                     template_linker$featureset$choices_col], 
                                                                               fsets_scidb[,
                                                                                           template_linker$featureset$choices_col]), ])
                                      stopifnot(nrow(fset) == 1)
                                      # cat("Reading annotation info from RNA-seq data file\n")
                                      feature_df = data.frame(name = private$.data_df$tracking_id,
                                                           gene_symbol = private$.data_df$gene_short_name,
                                                           featureset_id = fset$featureset_id,
                                                           chromosome = 'unknown',
                                                           start = '...',
                                                           end = '...',
                                                           feature_type = 'gene',
                                                           source = 'RNA-seq file',
                                                           stringsAsFactors = FALSE)
                                      
                                      cat("Matching features in file by feature-names in DB at featureset_id", 
                                          fset$featureset_id, "\n")
                                      features_sel = private$.reference_object$feature
                                      m1 = find_matches_and_return_indices(private$.data_df$tracking_id, features_sel$name)
                                      
                                      if (length(m1$source_unmatched_idx) > 0) {
                                        new_ftr_ids = register_feature(df = feature_df)
                                      
                                        return(TRUE)
                                      } else {
                                        cat("No new features to register\n")
                                        return(FALSE)
                                      }
                                    } else if ('tracking_id' %in% colnames(private$.data_df)) {
                                      col_match_ftr_name = 'tracking_id'
                                      fset = find_matching_featureset(pipeline_df = private$.reference_object$pipeline_df,
                                                                      featureset_link_col = template_linker$featureset$choices_col,
                                                                      fsets_scidb = private$.reference_object$featureset
                                      )
                                      ftr_ann_df = private$.feature_annotation_df
                                      if (identical(colnames(ftr_ann_df), 'gene_symbol')) {
                                        # special handling for RSEM file with Hugo gene symbols 
                                        # https://github.com/Paradigm4/revealgenomics/blob/master/inst/extdata/data_study_XX_ROW_DNAnexus_gene_RSEM_eset_TPM_w_clin_1samplePerPatient.tsv
                                        # see commit https://github.com/Paradigm4/revealgenomics/commit/39e96f18a5b1fd0fbc0418f3e0597fa4706ccc8b#diff-680b17553e29f834dcf09bd13c28ccac
                                        feature_df_col = 'gene_symbol'
                                      } else if (identical(colnames(ftr_ann_df), 'feature_name')) {
                                        feature_df_col = 'name'
                                      } else {
                                        stop("Matrix files loaded by this loader are expected to have ", 
                                             "one feature column with feature name (e.g. Ensembl ID) ",
                                             "or gene_symbol (e.g. Hugo gene symbol). ", 
                                             "Need to implement handling of case where matrix file has multiple rows. ",
                                             "Follow template for `DataLoaderRNAQuantMicroarray::register_new_features()`")
                                      }
                                      m1 = feature_matching_level1(data_df = private$.data_df,
                                                                   col_match_ftr_name = col_match_ftr_name,
                                                                   fset = fset,
                                                                   feature_df = private$.reference_object$feature,
                                                                   feature_df_col = feature_df_col)
                                      
                                      if (length(m1$source_unmatched_idx) > 0) {
                                        if ("GTF_URL" %in% colnames(fset)) stop("Did not expect to find unmatched features in ", 
                                                                                "featureset populated using GTF file. Please check") 
                                        unmatched_ftrs = unique(private$.data_df[m1$source_unmatched_idx,
                                                                                 col_match_ftr_name])
                                        
                                        if (ncol(ftr_ann_df) == 1) {
                                          ftr_ann_df_unmatched = data.frame(name = ftr_ann_df[
                                            match(unmatched_ftrs, ftr_ann_df[, 1]), ],  # matches for RSEM type
                                            stringsAsFactors = FALSE)
                                        } else {
                                          stop("Need to implement this code-path. Follow template for `DataLoaderRNAQuantMicroarray::register_new_features()`")
                                        }
                                        
                                        ftr_ann_df_unmatched$featureset_id = fset$featureset_id
                                        if (feature_df_col == 'name') {
                                          ftr_ann_df_unmatched$gene_symbol = 'NA'
                                        } else if (feature_df_col == 'gene_symbol') {
                                          ftr_ann_df_unmatched$gene_symbol = ftr_ann_df_unmatched$name
                                        } else {
                                          stop("Expect `feature_df_col` to be `name` or `gene_symbol")
                                        }
                                        ftr_ann_df_unmatched$chromosome = 'NA'
                                        ftr_ann_df_unmatched$start = 'NA'
                                        ftr_ann_df_unmatched$end = 'NA'
                                        if (length(grep("gene", 
                                                        private$.reference_object$measurement_set$filter_name)) == 1) {
                                          ftr_ann_df_unmatched$feature_type = 'gene'
                                        } else if (length(grep("transcript", 
                                                               private$.reference_object$measurement_set$filter_name)) == 1) {
                                          ftr_ann_df_unmatched$feature_type = 'transcript'
                                        }
                                        ftr_ann_df_unmatched$source = formulate_feature_source_from_measuremenset(
                                          measurementset = private$.reference_object$measurement_set)
                                        
                                        stopifnot(nrow(ftr_ann_df_unmatched) == length(unmatched_ftrs))
                                        register_feature(df = ftr_ann_df_unmatched, 
                                                         register_gene_synonyms = TRUE)
                                        return(TRUE)
                                      } else {
                                        cat("No new features to register\n")
                                        return(FALSE)
                                      }
                                    }
                                    
                                      
                                  }))
##### DataLoaderRNASeqGeneFormat #####
# loader corresponding to output of all DataReaderRNASeqGene* types 
# e.g. DataReaderRNASeqGeneFormatA, DataReaderRNAQuantRNASeqCufflinks
DataLoaderRNASeqGeneFormat = R6::R6Class(classname = "DataLoaderRNASeqGeneFormat",
                                             inherit = DataLoaderRNAQuantRNASeq,
                                             public = list(
                                               print_level = function() {cat("----(Level: DataLoaderRNASeqGeneFormat)\n")},
                                               assign_feature_ids = function(){
                                                 cat("assign_feature_ids()"); self$print_level()
                                                 super$assign_feature_ids(feature_type = 'gene',
                                                                          column_in_file = 'tracking_id')
                                               }))

##### DataLoaderRNASeqTranscriptFormat #####
DataLoaderRNASeqTranscriptFormat = R6::R6Class(classname = "DataLoaderRNASeqTranscriptFormat",
                                                inherit = DataLoaderRNAQuantRNASeq,
                                                public = list(
                                                  print_level = function() {cat("----(Level: DataLoaderRNASeqTranscriptFormat)\n")},
                                                  assign_feature_ids = function(){
                                                    cat("assign_feature_ids()"); self$print_level()
                                                    super$assign_feature_ids(feature_type = 'transcript',
                                                                             column_in_file = 'tracking_id')
                                                  }))

##### DataLoaderRNASeqGenesetFormat #####
DataLoaderRNASeqGenesetFormat = R6::R6Class(classname = "DataLoaderRNASeqGenesetFormat",
                                         inherit = DataLoaderRNAQuantRNASeq,
                                         public = list(
                                           print_level = function() {cat("----(Level: DataLoaderRNASeqGenesetFormat)\n")},
                                           assign_feature_ids = function(){
                                             cat("assign_feature_ids()"); self$print_level()
                                             super$assign_feature_ids(feature_type = 'geneset',
                                                                      column_in_file = 'tracking_id')
                                           }))


##### DataLoaderRNAQuantMicroarray #####
DataLoaderRNAQuantMicroarray = R6::R6Class(classname = "DataLoaderRNAQuantMicroarray",
                                       inherit = DataLoaderExpression, 
                                       public = list(
                                         print_level = function() {cat("----(Level: DataLoaderRNAQuantMicroarray)\n")},
                                         register_new_features = function() {
                                           cat("register_new_features()"); self$print_level()
                                           col_match_ftr_name = 'probeset_id'
                                           if (col_match_ftr_name %in% colnames(private$.data_df)) {
                                             matchTarget = unique(private$.reference_object$pipeline_df[, 
                                                                                                        template_linker$featureset$choices_col])
                                             stopifnot(length(matchTarget) == 1)
                                             fsets_scidb = private$.reference_object$featureset
                                             fset = drop_na_columns(fsets_scidb[match(matchTarget, 
                                                                                      fsets_scidb[,
                                                                                                  template_linker$featureset$choices_col]), ])
                                             stopifnot(nrow(fset) == 1)
                                             
                                             cat("Matching features in file by feature-names in DB at featureset_id", 
                                                 fset$featureset_id, "\n")
                                             features_sel = private$.reference_object$feature
                                             m1 = find_matches_and_return_indices(private$.data_df[, col_match_ftr_name], 
                                                                                  features_sel$name)
                                             
                                             if (length(m1$source_unmatched_idx) > 0) {
                                               ftr_ann_df = private$.feature_annotation_df
                                               
                                               ftr_ann_df$name = rownames(ftr_ann_df)
                                               ftr_ann_df = plyr::rename(ftr_ann_df, 
                                                                         c('ENTREZID' = 'entrez_gene_id',
                                                                           'ENSEMBL' = 'ensembl_gene_id',
                                                                           'SYMBOL' = 'gene_symbol',
                                                                           'GENENAME' = 'gene_full_name', 
                                                                           'GENEFAMILY' = 'gene_family',
                                                                           'INICalls' = 'INI_calls',
                                                                           'PROBENUM' = 'probe_num'))
                                               
                                               unmatched_features = unique(private$.data_df[, 
                                                                        col_match_ftr_name][m1$source_unmatched_idx])
                                               
                                               m2 = find_matches_and_return_indices(unmatched_features, 
                                                                                    ftr_ann_df$name)
                                               if (length(m2$source_unmatched_idx) > 0) {
                                                 stop("Feature annotation data does not contain annotation for",
                                                      pretty_print(unmatched_features[m2$source_unmatched_idx]))
                                               }
                                               
                                               ftr_ann_df = ftr_ann_df[m2$target_matched_idx, ]
                                               rownames(ftr_ann_df) = 1:nrow(ftr_ann_df)
                                               
                                               ftr_ann_df$featureset_id = fset$featureset_id
                                               ftr_ann_df$chromosome = 'NA'
                                               ftr_ann_df$start = 'NA'
                                               ftr_ann_df$end = 'NA'
                                               ftr_ann_df$strand_term = 'strand_term_unspecified'
                                               ftr_ann_df$feature_type = 'probeset'
                                               ftr_ann_df$source = 'expression_set_object'
                                               register_feature(df = ftr_ann_df, 
                                                                register_gene_synonyms = FALSE)
                                               return(TRUE)
                                             } else {
                                               cat("No new features to register\n")
                                               return(FALSE)
                                             }
                                           }
                                         },
                                         assign_feature_ids = function(){
                                           cat("assign_feature_ids()"); self$print_level()
                                           # If featureSet has both multiple feature types, then prioritize `probeset`-s over `gene`-s
                                           uniq_ftr_types = unique(private$.reference_object$feature$feature_type)
                                           priority_order = c('probeset', 'gene')
                                           feature_type = priority_order[priority_order %in% uniq_ftr_types][1]
                                           super$assign_feature_ids(feature_type = feature_type,
                                                                    column_in_file = 'probeset_id')
                                         }))

##### DataLoaderProteomicsMaxQuant #####
DataLoaderProteomicsMaxQuant = R6::R6Class(
  classname = "DataLoaderProteomicsMaxQuant",
  inherit = DataLoaderExpression, 
  public = list(
    print_level = function() {cat("----(Level: DataLoaderProteomicsMaxQuant)\n")},
    register_new_features = function() {
      col_match_ftr_name = 'tracking_id'
      fset = find_matching_featureset(pipeline_df = private$.reference_object$pipeline_df,
                                      featureset_link_col = template_linker$featureset$choices_col,
                                      fsets_scidb = private$.reference_object$featureset
      )
      m1 = feature_matching_level1(data_df = private$.data_df, 
                                   col_match_ftr_name = col_match_ftr_name,
                                   fset = fset,
                                   feature_df = private$.reference_object$feature
      )
      
      if (length(m1$source_unmatched_idx) > 0) {
        stop("Need to implement level 2 matching -- see DataLoaderRNAQuantRNASeq")
        return(FALSE)
      } else {
        return(FALSE)
      }
    },
    assign_feature_ids = function(){
      cat("assign_feature_ids()"); self$print_level()
      super$assign_feature_ids(feature_type = 'protein_probe',
                               column_in_file = 'tracking_id')
    }
  )
)

##### DataLoaderVariant #####
DataLoaderVariant = R6::R6Class(classname = 'DataLoaderVariant',
                                     inherit = DataLoader,
                                     public = list(
                                       print_level = function() {cat("----(Level: DataLoaderVariant)\n")},
                                       load_data = function() {
                                         cat("load_data()"); self$print_level()
                                         register_variant(df1 = private$.data_df, 
                                                          dataset_version = private$.reference_object$record$dataset_version)
                                       }
                                     ))

##### DataLoaderVariantGemini #####
#' loader corresponding to GEMINI files
#' - May need to use feature-synonym for feature matching, but current implementation is inefficient
#' - also has hard-coded links to column names used for feature matching (e.g. `gene`)
DataLoaderVariantGemini = R6::R6Class(classname = 'DataLoaderVariantGemini',
                                     inherit = DataLoaderVariant,
                                     public = list(
                                       print_level = function() {cat("----(Level: DataLoaderVariant)\n")},
                                       assign_feature_ids = function(){
                                         cat("assign_feature_ids()"); self$print_level()
                                         super$assign_feature_ids()
                                         
                                         fset_id = private$.reference_object$measurement_set$featureset_id
                                         cat("Match features in file to feature synonyms at featureset_id =", fset_id, "\n")
                                         fsyn = private$.reference_object$feature_synonym
                                         # print(dim(fsyn))
                                         fsyn_sel = fsyn[fsyn$featureset_id == fset_id, ]
                                         # print(dim(fsyn_sel))
                                         
                                         m2 = find_matches_and_return_indices(private$.data_df$gene, 
                                                                              fsyn_sel$synonym)
                                         stopifnot(length(m2$source_unmatched_idx) == 0)
                                         
                                         private$.data_df$feature_id = fsyn_sel$feature_id[m2$target_matched_idx]
                                         if ('scidb_feature_col' %in% colnames(private$.data_df)) {
                                           cat("Dropping extra column `scidb_feature_col` \n")
                                           private$.data_df$scidb_feature_col = NULL
                                         }
                                       },
                                       
                                       register_new_features = function() {
                                         cat("Function: Register new features (Level: DataLoaderVariantGemini)\n")
                                         super$register_new_features()
                                         
                                         fset_choice = unique(private$.reference_object$pipeline_df[,
                                                                template_linker$featureset$choices_col])
                                         stopifnot(length(fset_choice) == 1)
                                         fsets_scidb = private$.reference_object$featureset
                                         fset = drop_na_columns(fsets_scidb[match(fset_choice, 
                                                                                  fsets_scidb[,
                                                                                              template_linker$featureset$choices_col]), ])
                                         stopifnot(nrow(fset) == 1)
                                         cat("Matching features in file by feature-synonyms in DB at featureset_id", 
                                             fset$featureset_id, "\n")
                                         fsyn_sel = private$.reference_object$feature_synonym[
                                           private$.reference_object$feature_synonym$featureset_id == 
                                             fset$featureset_id, ]
                                         m1 = find_matches_and_return_indices(private$.data_df$gene, 
                                                                              fsyn_sel[fsyn_sel$source == 'gene_symbol', ]$synonym)
                                         
                                         private$.data_df$feature_id = -1
                                         if (length(m1$source_unmatched_idx) > 0) {
                                           private$.data_df$feature_id[m1$source_matched_idx] = fsyn_sel$feature_id[m1$target_matched_idx]
                                           
                                           unmatched = private$.data_df[private$.data_df$feature_id == -1, ]
                                           unmatched_genes = as.character(unique(unmatched$gene))
                                           cat("Manually registering", length(unmatched_genes), "new features\n")
                                           ftr_new = data.frame(name = unmatched_genes,
                                                                featureset_id = fset$featureset_id,
                                                                chromosome = 'unknown',
                                                                gene_symbol = unmatched_genes,
                                                                start = '...',
                                                                end = '...',
                                                                feature_type = 'gene',
                                                                source = 'GEMINI mutation file',
                                                                stringsAsFactors = FALSE)
                                           new_ftr_ids = register_feature(df = ftr_new, register_gene_synonyms = TRUE)
                                           
                                           return(TRUE)
                                         } else {
                                           cat("No new features to register\n")
                                           return(FALSE)
                                         }
                                         invisible(self)
                                       }
                                     ))

##### DataLoaderVariantFormatA #####
#' loader to try and handle various variant formats
#' - currently assumes that features in the data-file are previously registered from a GTF file
#' - may use the `feature_annotation_df` data.frame to figure out action related to feature matching
DataLoaderVariantFormatA = R6::R6Class(classname = 'DataLoaderVariantFormatA',
                                      inherit = DataLoaderVariant,
                                      public = list(
                                        print_level = function() {cat("----(Level: DataLoaderVariant)\n")},
                                        assign_feature_ids = function(){
                                          cat("assign_feature_ids()"); self$print_level()
                                          super$assign_feature_ids()
                                          
                                          column_in_file = 'scidb_feature_col'
                                          feature_type = 'gene' # so far, all variant data files link to gene features only
                                          column_names = colnames(private$.feature_annotation_df)
                                          cat("Feature annotation dataframe has columns:",
                                              paste0(column_names,
                                                     collapse = ", "), 
                                              "\n\tWill match with first column\n")
                                          private$.data_df$feature_id = match_features(
                                            features_in_file = private$.data_df[, column_in_file],
                                            df_features_db = private$.reference_object$feature,
                                            feature_type = feature_type,
                                            column_in_db = column_names[1])
                                          private$.data_df[, column_in_file] = NULL
                                        },
                                        
                                        register_new_features = function() {
                                          cat("Function: Register new features (Level: DataLoaderVariantFormatA)\n")
                                          super$register_new_features()
                                          
                                          fset_id = private$.reference_object$measurement_set$featureset_id
                                          cat("Match features in file to features at featureset_id =", fset_id, "\n")
                                          
                                          ftrs = private$.reference_object$feature
                                          stopifnot(unique(ftrs$featureset_id) == fset_id)
                                          
                                          column_names = colnames(private$.feature_annotation_df)
                                          cat("Feature annotation dataframe has columns:",
                                              paste0(column_names,
                                                     collapse = ", "), 
                                              "\n\tWill match with first column\n")
                                          # =========== Level 1 matching: match with gene symbols ===========
                                          m1 = find_matches_and_return_indices(
                                            private$.data_df$scidb_feature_col,
                                            ftrs[, column_names[1]]
                                          )
                                          if (length(m1$source_unmatched_idx) > 0) {
                                            cat("Level 1 matching with gene symbols is insufficient;\n\t
                                                Proceeding to match with synonyms\n")
                                            ftrs_unmatched_v1 = private$.data_df$scidb_feature_col[
                                              m1$source_unmatched_idx]
                                            fsyn = private$.reference_object$feature_synonym
                                            fsyn = fsyn[fsyn$featureset_id == fset_id, ]
                                            
                                            # =========== Level 2 matching: match with gene synonyms ===========
                                            m2 = find_matches_and_return_indices(
                                              ftrs_unmatched_v1, 
                                              fsyn$synonym
                                            )
                                            if (length(m2$source_unmatched_idx) > 0) {
                                              stop("currently assumes that features in the data-file are 
                                                 previously registered from a GTF file,
                                                   match with standard hugo gene symbol list, 
                                                   or match with gene synonyms")
                                              return(TRUE)
                                            }
                                            matched_syn_feature_id = fsyn[m2$target_matched_idx, ]$feature_id
                                            syn_ftrs = get_features(feature_id = fsyn[m2$target_matched_idx, ]$feature_id)
                                            syn_ftrs = syn_ftrs[match(matched_syn_feature_id, 
                                                           syn_ftrs$feature_id), ]
                                            stopifnot(nrow(syn_ftrs) == length(m1$source_unmatched_idx))
                                            
                                            cat("Now overwriting synonym in data with standard hugo symbol:\n\t",
                                                pretty_print(unique(ftrs_unmatched_v1)),
                                                "==>", 
                                                pretty_print(unique(syn_ftrs$gene_symbol)), 
                                                "\n")
                                            private$.data_df[m1$source_unmatched_idx, ]$scidb_feature_col = 
                                              syn_ftrs$gene_symbol
                                            return(FALSE)
                                          } else {
                                            cat("No new features to register\n")
                                            return(FALSE)
                                          }
                                          invisible(self)
                                        }
                                      ))

##### DataLoaderFusionFormatA #####
DataLoaderFusionFormatA = R6::R6Class(
  classname = 'DataLoaderFusionFormatA',
  inherit = DataLoader,
  public = list(
    print_level = function() {cat("----(Level: DataLoaderFusionFormatA)\n")},
    register_new_features = function() {
      fset = private$get_selected_featureset()
      fsyn_sel = private$get_feature_synonym_df_for_selected_featureset()
      
      list_of_features = unique(c(as.character(private$.data_df$gene_left), 
                                  as.character(private$.data_df$gene_right)))
      
      matches_synonym = find_matches_and_return_indices(list_of_features, 
                                                        fsyn_sel$synonym)
      
      unmatched = matches_synonym$source_unmatched_idx
      cat("Number of unmatched gene symbols:", length(unmatched), "\n e.g.", 
          pretty_print(list_of_features[unmatched]), "\n")
      
      if (length(list_of_features[unmatched]) > 0) {
        stop("Have turned off new feature registration in Fusion data for now", 
             "-- till we figure out why some deFuse features were loaded with duplicate",
             " feature.name-s. Contact scidbadmin to enable browser() at this code point", 
             " and debug current case")
        ftr_source = formulate_feature_source_from_measuremenset(
          measurementset = private$.reference_object$measurement_set
        )
        
        newfeatures = data.frame(
          name = list_of_features[unmatched],
          gene_symbol = "NA",
          featureset_id = fset$featureset_id,
          chromosome = "unknown",
          start = '...', 
          end = '...',
          feature_type = "gene",
          source = ftr_source)
        
        feature_record = register_feature(df = newfeatures)
        
        return(TRUE)
      } else {
        cat("No new features to register\n")
        return(FALSE)
      }                                         
    },
    assign_feature_ids = function(){
      cat("assign_feature_ids()"); self$print_level()
      super$assign_feature_ids()
      
      syn = private$get_feature_synonym_df_for_selected_featureset()
      
      # Now register the left and right genes with system feature_id-s
      private$.data_df$feature_id_left = syn[match(private$.data_df$gene_left, syn$synonym), ]$feature_id
      private$.data_df$feature_id_right = syn[match(private$.data_df$gene_right, syn$synonym), ]$feature_id
      stopifnot(!any(is.na(private$.data_df$feature_id_left)))
      stopifnot(!any(is.na(private$.data_df$feature_id_right)))
    },
    load_data = function() {
      cat("load_data()"); self$print_level()
      private$.data_df = plyr::rename(private$.data_df,
                                      c('biosample_name' = 
                                          'sample_name_unabbreviated'))
      register_fusion(df1 = private$.data_df,
                      measurementset = private$.reference_object$measurement_set)
    }
  ))

##### DataLoaderCopyNumberMatrix #####
DataLoaderCopyNumberMatrix = R6::R6Class(
  classname = "DataLoaderCopyNumberMatrix",
  inherit = DataLoaderExpression, 
  public = list(
    print_level = function() {cat("----(Level: DataLoaderCopyNumberMatrix)\n")},
    register_new_features = function() {
      col_match_ftr_name = 'tracking_id'
      fset = find_matching_featureset(pipeline_df = private$.reference_object$pipeline_df,
                                      featureset_link_col = template_linker$featureset$choices_col,
                                      fsets_scidb = private$.reference_object$featureset
      )
      m1 = feature_matching_level1(data_df = private$.data_df, 
                                   col_match_ftr_name = col_match_ftr_name,
                                   fset = fset,
                                   feature_df = private$.reference_object$feature
      )
      if (length(m1$source_unmatched_idx) > 0) {
        stop("Need to implement level 2 matching -- see DataLoaderRNAQuantRNASeq")
        return(FALSE)
      } else {
        return(FALSE)
      }
    },
    assign_feature_ids = function(){
      cat("assign_feature_ids()"); self$print_level()
      super$assign_feature_ids(feature_type = 'gene',
                               column_in_file = 'tracking_id')
    }
  )
)

##### DataLoaderCopyNumberVariantFormatA #####
#' loader to try and handle various copy number variant subformats that have variable number of columns
#' - currently assumes that features in the data-file are previously registered from a GTF file
#' - may use the `feature_annotation_df` data.frame to figure out action related to feature matching
DataLoaderCopyNumberVariantFormatA = R6::R6Class(
  classname = 'DataLoaderCopyNumberVariantFormatA',
  inherit = DataLoader,
  public = list(
   print_level = function() {cat("----(Level: DataLoaderVariant)\n")},
   assign_feature_ids = function(){
     cat("assign_feature_ids()"); self$print_level()
     super$assign_feature_ids()
     
     syn = private$get_feature_synonym_df_for_selected_featureset()
     
     # Now register the left and right genes with system feature_id-s
     column_in_file = 'scidb_feature_col'
     private$.data_df$feature_id = syn[match(private$.data_df[, column_in_file], syn$synonym), ]$feature_id
     stopifnot(!any(is.na(private$.data_df$feature_id)))
     private$.data_df[, column_in_file] = NULL
   },
   
   register_new_features = function() {
     fset = private$get_selected_featureset()
     fsyn_sel = private$get_feature_synonym_df_for_selected_featureset()
     
     list_of_features = unique(as.character(private$.data_df[, 'scidb_feature_col']))
     
     matches_synonym = find_matches_and_return_indices(list_of_features, 
                                                       fsyn_sel$synonym)
     
     unmatched = matches_synonym$source_unmatched_idx
     cat("Number of unmatched gene symbols:", length(unmatched), "\n e.g.", 
         pretty_print(list_of_features[unmatched]), "\n")
     
     if (length(list_of_features[unmatched]) > 0) {
       ftr_source = formulate_feature_source_from_measuremenset(
         measurementset = private$.reference_object$measurement_set)
       
       newfeatures = data.frame(
         name = list_of_features[unmatched],
         gene_symbol = "NA",
         featureset_id = fset$featureset_id,
         chromosome = "unknown",
         start = '...', 
         end = '...',
         feature_type = "gene",
         source = ftr_source, 
         stringsAsFactors = FALSE)
       
       feature_record = register_feature(df = newfeatures)
       
       return(TRUE)
     } else {
       cat("No new features to register\n")
       return(FALSE)
     }                                         
   },
   load_data = function() {
     cat("load_data()"); self$print_level()
     register_copynumbervariant_variable_columns(df1 = private$.data_df,
                     measurementset = private$.reference_object$measurement_set)
   }
  ))


DataLoaderCyTOF = R6::R6Class(
  classname = 'DataLoaderCyTOF',
  inherit = DataLoaderExpression,
  public = list(
    print_level = function() {cat("----(Level: DataLoaderCyTOF)\n")},
    assign_feature_ids = function(){
      cat("assign_feature_ids()"); self$print_level()
      # super$assign_feature_ids() # override the definition in parent class
      
      ftr_df = private$.reference_object$feature
      
      # Now register the left and right genes with system feature_id-s
      column_in_file = 'scidb_feature_col'
      private$.data_df$feature_id = ftr_df[match(private$.data_df[, column_in_file], ftr_df$name), ]$feature_id
      stopifnot(!any(is.na(private$.data_df$feature_id)))
      private$.data_df[, column_in_file] = NULL
    },
    
    register_new_features = function() {
      fset = private$get_selected_featureset()
      ftr_sel = private$.reference_object$feature
      
      list_of_features = unique(as.character(private$.data_df[, 'scidb_feature_col']))
      
      matches_synonym = find_matches_and_return_indices(list_of_features, 
                                                        ftr_sel$name)
      
      unmatched = matches_synonym$source_unmatched_idx
      cat("Number of unmatched gene symbols:", length(unmatched), "\n e.g.", 
          pretty_print(list_of_features[unmatched]), "\n")
      
      if (length(list_of_features[unmatched]) > 0) {
        stop("Expected all features to be preregistered")
        
        return(TRUE)
      } else {
        cat("No new features to register\n")
        return(FALSE)
      }                                         
    }
  ),
  private = list(
    .match_biosample_name_exactly = FALSE
  )
)

DataLoaderVariantExomic = R6::R6Class(
  classname = 'DataLoaderVariantExomic',
  inherit = DataLoader,
  public = list(
   print_level = function() {cat("----(Level: DataLoaderVariantExomic)\n")},
   assign_feature_ids = function(){
     # Empty function because this data class does not deal with features
     cat("assign_feature_ids()"); self$print_level()
   },
   
   register_new_features = function() {
     # Empty function because this data class does not deal with features
     cat("Function: Register new features"); self$print_level()
     return(FALSE)
   },
   
   assign_biosample_ids = function() {
     cat("assign_biosample_ids()"); self$print_level()
     bios_ref = private$.reference_object$biosample
     suffix = template_helper_suffix_by_entity(entity = private$.reference_object$measurement_set$entity)
     bios_ref = bios_ref[grep(suffix, bios_ref$name), ]
     cat("Chose suffix:", suffix, "for entity:", private$.reference_object$measurement_set$entity, 
         "\nRetained:", nrow(bios_ref), "of total:", nrow(private$.reference_object$biosample), "in manifest\n")
     if (nrow(private$.reference_object$pipeline_df) > 1) { 
       # multiple Measurements combined into one file
       # ==> data must contain a column called `biosample_name`
       stop("Path for multiple exomic variant measurements combined into one file 
            not implemented yet\n")
     } else if (nrow(private$.reference_object$pipeline_df) == 1) {
       # One Measurement per file 
       # ==> information about biosample will exist in `pipeline_df` reference object
       cat("One Measurement per file\n")
       
       sample_name_in_file = private$.reference_object$pipeline_df$sample_name
       
       mL = find_matches_and_return_indices(sample_name_in_file,
                                            bios_ref$name)
       
       if (length(mL$source_unmatched_idx) != 0) {
         print(sample_name_in_file)
         print(head(bios_ref$name))
         stop("Excel sheet must provide link between sample in Pipelines sheet and Sample sheet under column `sample_name`")
       }
       
       sample_id_db = bios_ref$biosample_id[mL$target_matched_idx]
       cat("sample-name:", sample_name_in_file, "==> biosample_id:", 
           sample_id_db, "\n")
       attr(private$.data_df, 'biosample_id') = sample_id_db
     }
     invisible(self)   
   },
   download_features_for_featureset = function() {
     # Empty function because this data class does not deal with features
   },
   #' assign dataset_id and measurementset_id
   assign_other_ids = function(){
     attr(private$.data_df, "dataset_id") = private$.reference_object$record$dataset_id
     attr(private$.data_df, "dataset_version") = private$.reference_object$record$dataset_version
     attr(private$.data_df, "measurementset_id") = private$.reference_object$measurement_set$measurementset_id
   },
   load_data = function() {
     # ##### FORMAT #####
     # format_str = unique(private$.data_df@gt[, 'FORMAT'])
     # if (length(format_str) != 1) {
     #   stop("Expected unique format string per sample, but got", 
     #        pretty_print(format_str))
     # }
     # format_df = data.frame(
     #   measurementset_id = private$.data_df@measurementset_id,
     #   format = format_str,
     #   stringsAsFactors = FALSE
     # )
     # 
     
     xx = private$.data_df
     exomic_var_df = as.data.frame(xx@fix[, c("CHROM", "POS", "ID", "REF", "ALT")], 
                                   stringsAsFactors = FALSE)
     exomic_var_df = plyr::rename(
       exomic_var_df,
       c('CHROM' = 'chromosome',
         'POS' = 'start',
         'ID' = 'id',
         'REF' = 'reference',
         'ALT' = 'alternate')
     )
     chromosomes = naturalsort::naturalsort(unique(exomic_var_df$chromosome))
     chromosome_key_id = register_chromosome_key(
       df1 = data.frame(
         chromosome = chromosomes, 
         stringsAsFactors = FALSE
       ))
     chrom_key_df = get_chromosome_key()
     m1 = find_matches_and_return_indices(exomic_var_df$chromosome, 
                                          chrom_key_df$chromosome)
     stopifnot(length(m1$source_unmatched_idx) == 0)
     exomic_var_df$chromosome_key_id = chrom_key_df[m1$target_matched_idx, ]$chromosome_key_id
     exomic_var_df$chromosome = NULL
     
     exomic_var_df$end = exomic_var_df$start
     
     ms_id = attr(xx, "measurementset_id")
     ms_df = get_measurementsets(measurementset_id = ms_id)
     fset_df = get_featuresets(featureset_id = ms_df$featureset_id)
     exomic_var_df$referenceset_id = fset_df$referenceset_id
     
     head(exomic_var_df)
     register_res = revealgenomics:::register_exomic_variant(df1 = exomic_var_df)
     
     var_call_df = data.frame(
       quality = xx@fix[, "QUAL"],
       filter = xx@fix[, "FILTER"],
       info = xx@fix[, "INFO"],
       stringsAsFactors = FALSE
     )
     var_call_tibble = vcfR::extract_gt_tidy(xx)
     var_call_tibble = as.data.frame(var_call_tibble)
     var_call_tibble$Key = NULL
     if (length(unique(var_call_tibble$Indiv)) == 1)  {# Germline variants 
       cat("Loading germline variants\n")
       variantType = 'Germline'
       var_call_tibble$Indiv = NULL
     } else if (length(unique(var_call_tibble$Indiv)) == 2)  {# Somatic variants (tumor/normal)
       cat("Loading somatic variants\n")
       variantType = 'Somatic'
       stopifnot(nrow(var_call_tibble) %% 2 == 0)
       if (identical(
         unique(var_call_tibble$Indiv), 
         c("NORMAL", "TUMOR" ))) {
         suffixes = c("NORMAL", "TUMOR" )
       } else if (length(grep("_", var_call_tibble$Indiv)) == 
                  nrow(var_call_tibble)) { # Sample_name + "_" + norm / on / pre
         splitstr_list = stringi::stri_split(var_call_tibble$Indiv, fixed="_")
         suffixes = unique(unlist(lapply(splitstr_list, function(elem) elem[2])))
         # stopifnot(identical( # confirm that suffixes are "on" and "norm" only
         #   sort(suffixes),
         #   c("norm", "on")))
       }
       var_call_tibble_s1 = var_call_tibble[grep(suffixes[1], var_call_tibble$Indiv), ]
       var_call_tibble_s2 = var_call_tibble[grep(suffixes[2], var_call_tibble$Indiv), ]
       var_call_tibble_s1$Indiv = NULL
       var_call_tibble_s2$Indiv = NULL
       colnames(var_call_tibble_s1) = paste0(colnames(var_call_tibble_s1), "_", suffixes[1])
       colnames(var_call_tibble_s2) = paste0(colnames(var_call_tibble_s2), "_", suffixes[2])
       var_call_tibble = cbind(var_call_tibble_s1,
                               var_call_tibble_s2)
     } else {
       stop("Expect number of unique values for Indiv to be 1 or 2")
     }
     
     stopifnot(nrow(var_call_df) == nrow(var_call_tibble))
     var_call_df2 = cbind(
       var_call_df,
       var_call_tibble
     )
     assign_ids = function(df1,
                           measurementset_id,
                           dataset_id,
                           dataset_version) {
       df1$measurementset_id = measurementset_id
       df1$dataset_id = dataset_id
       df1$dataset_version = dataset_version
       df1
     }
     var_call_df3 = assign_ids(
       df1 = var_call_df2,
       measurementset_id = attr(private$.data_df, "measurementset_id"),
       dataset_id = attr(private$.data_df, "dataset_id"),
       dataset_version = attr(private$.data_df, "dataset_version"))
     var_call_df3$biosample_id = attr(private$.data_df, "biosample_id")
     
     stopifnot(nrow(register_res) == nrow(var_call_df3))
     var_call_df3$exomic_variant_id = register_res$exomic_variant_id
     
     df1 = var_call_df3
     # Step 1
     # Identify three groups of column-names
     # - `dimensions`: indices of the multi-dimensional array
     # - `attr_mandatory`: VCF attribute fields that are mandatory
     # - `attr_flex`: VCF attrubute fields that are not mandatory 
     cols_dimensions = get_idname(.ghEnv$meta$arrExomicVariantCall)[!(
       get_idname(.ghEnv$meta$arrExomicVariantCall) %in% 'key_id')]
     if (variantType == 'Germline') {
       cols_attr_mandatory = c('quality', 'filter', 'info', 'gt_AD', 'gt_DP', 'gt_GT')
     } else if (variantType == 'Somatic') {
       cols_attr_mandatory = c('quality', 'filter', 'info', 
                               unlist(
                                 lapply(
                                   suffixes, 
                                   function(elem) paste0(c('gt_AD', 'gt_DP', 'gt_GT'), "_", elem)
                                   )
                                 )
                               )
       not_found_mandatory = cols_attr_mandatory[!(cols_attr_mandatory %in% colnames(df1))]
       if (length(not_found_mandatory) > 0) {
         cat("Dropping mandatory restriction on columns:", 
             pretty_print(not_found_mandatory), "\n")
         cols_attr_mandatory = cols_attr_mandatory[(cols_attr_mandatory %in% colnames(df1))]
       }
     }
     cols_attr_flex = colnames(df1)[!(colnames(df1) %in% 
                                        c(cols_dimensions, cols_attr_mandatory))]
     if (!('per_gene_variant_number' %in% colnames(df1))) {
       # specify dplyr mutate as per https://stackoverflow.com/a/33593868
       df1x = df1 %>% group_by(exomic_variant_id, biosample_id) %>% dplyr::mutate(per_gene_variant_number = row_number())
       if (unique(df1x$per_gene_variant_number) != 1) stop("Not coded yet!")
     }
     df1 = as.data.frame(df1)
     
     # Step 5A
     # Introduce `key_id` and `val` columns i.e. handle VariantKeys 
     # -- First register any new keys
     cat("Step 5A -- Register the variant attribute columns as variant keys\n")
     variant_key_id = register_variant_key(
       df1 = data.frame(
         key = c(cols_attr_mandatory, cols_attr_flex), 
         stringsAsFactors = FALSE))
     if (!identical(
       get_variant_key(variant_key_id = variant_key_id)$key,
       c(cols_attr_mandatory, cols_attr_flex))) {
       stop("Faced issue registering variant keys")
     }
     
     cat("Step 5B -- Converting wide data.frame to tall data.frame\n")
     VAR_KEY = get_variant_key()
     var_gather = tidyr::gather(data = df1, key = "key", value = "val", 
                                c(cols_attr_mandatory, cols_attr_flex))
     
     cat("Step 5B1 -- Dropping NA columns except for when the key belongs to", 
         pretty_print(cols_attr_mandatory), "\n")
     var_gather1 = var_gather[var_gather$key %in% cols_attr_mandatory, ] 
     var_gather2 = var_gather[!(var_gather$key %in% cols_attr_mandatory), ]
     stopifnot(nrow(var_gather1) + nrow(var_gather2) == nrow(var_gather))
     non_null_indices = which(!is.na(var_gather2$val))
     if (length(non_null_indices) != nrow(var_gather2)) {
       cat(paste0("From total: ", nrow(var_gather), " key-value pairs, retaining: ", 
                  (length(non_null_indices) + nrow(var_gather1)), " non-null pairs.\n\tSavings(A) = ", 
                  round(
                    (nrow(var_gather2) - length(non_null_indices)) / nrow(var_gather) * 100, 
                    digits = 1), "%\n"))
       var_gather2 = var_gather2[non_null_indices, ]
       if (nrow(var_gather2) > 0) {
         var_gather = rbind(var_gather1, var_gather2)
       } else {
         var_gather = var_gather1
       }
     }
     
     M = find_matches_and_return_indices(var_gather$key, VAR_KEY$key)
     stopifnot(length(M$source_unmatched_idx) == 0)
     var_gather$key_id = VAR_KEY$key_id[M$target_matched_idx]
     var_gather$key = NULL # drop the key column
     var_gather = var_gather[, c(cols_dimensions, 'key_id', 'val')]
     
     # Step 6
     # Remove rows that are effectively empty
     cat("Step 6 -- Calculating empty markers\n")
     empty_markers = c('.', 'None')
     non_null_indices = which(!(var_gather$val %in% empty_markers))
     if (length(non_null_indices) != nrow(var_gather)) {
       cat(paste0("From total: ", nrow(var_gather), " key-value pairs, retaining: ", 
                  length(non_null_indices), " non-null pairs.\n\tSavings(B) = ", 
                  round(
                    (nrow(var_gather) - length(non_null_indices)) / nrow(var_gather) * 100, 
                    digits = 1), "%\n"))
       var_gather = var_gather[non_null_indices, ] 
     }
     
     if (variantType == 'Germline') { # Extra check for germline data
                                      # only `filter` column was found to have NA
       if (!identical(
         get_variant_key(variant_key_id = unique(var_gather[is.na(var_gather$val), ]$key_id))$key,
         "filter")) {
         stop("Result not handled yet")
       }
     }
     
     # Step 7
     # Upload and insert the data
     cat("Step 7 -- Upload and insert the data\n")
     UPLOAD_N = 5000000
     return_sub_indices = function(bigN, fac) {
       starts = seq(1, bigN, fac)
       ends   = c(tail(seq(0, bigN-1, fac), -1), bigN)
       stopifnot(length(starts) == length(ends))
       lapply(1:length(starts), function(idx) {c(starts[idx]: ends[idx])})
     }
     steps = return_sub_indices(bigN = nrow(var_gather), fac = UPLOAD_N)
     
     arrayname = full_arrayname(.ghEnv$meta$arrExomicVariantCall)
     for (upidx in 1:length(steps)) {
       step = steps[[upidx]]
       cat(paste0("Uploading variants. Sub-segment ", 
                  upidx, " of ", length(steps), " segments\n\t", 
                  "Rows: ", step[1], "-", tail(step, 1), "\n"))
       con = use_ghEnv_if_null(con = NULL)
       var_sc = as.scidb_int64_cols(db = con$db, 
                                    df1 = var_gather[c(step[1]:tail(step, 1)), ],
                                    int64_cols = colnames(var_gather)[!(colnames(var_gather) %in% 'val')])
       cat("Redimension and insert\n")
       iquery(con$db, paste0("insert(redimension(",
                             var_sc@name,
                             ", ", arrayname, "), ", arrayname, ")"))
       remove_old_versions_for_entity(entitynm = .ghEnv$meta$arrExomicVariantCall, con = con)
     }
     
     head(var_call_df)
     # ##### PER PIPELINE data #####
     # if (!any(is.na(as.integer(vcfR::getPOS(private$.data_df))))) {
     #   vcf_fix_start = as.integer(vcfR::getPOS(private$.data_df))
     #   vcf_fix_end   = vcf_fix_start
     # }
     # if (!any(is.na(as.numeric(vcfR::getQUAL(private$.data_df))))) {
     #   vcf_fix_qual = as.numeric(vcfR::getQUAL(private$.data_df))
     # }
     # per_pipeline_df = data.frame(
     #   chromosome = vcfR::getCHROM(private$.data_df),
     #   start      = vcf_fix_start, 
     #   end        = vcf_fix_end,
     #   id         = vcfR::getID(private$.data_df),
     #   reference  = vcfR::getREF(private$.data_df),
     #   alternate  = vcfR::getALT(private$.data_df),
     #   stringsAsFactors = FALSE
     # )
     # if (ncol(private$.data_df@gt) == 2) {
     #   per_sample_df = data.frame(
     #     chromosome = vcfR::getCHROM(private$.data_df),
     #     start      = vcf_fix_start, 
     #     end        = vcf_fix_end,
     #     id         = vcfR::getID(private$.data_df),
     #     reference  = vcfR::getREF(private$.data_df),
     #     alternate  = vcfR::getALT(private$.data_df),
     #     quality    = vcf_fix_qual,
     #     info       = vcfR::getINFO(private$.data_df),
     #     call       = private$.data_df@gt[, 2],
     #     stringsAsFactors = FALSE
     #   )
     # } else if (ncol(private$.data_df@gt) == 3) {
     #   stop("Not implemented tumor normal case yet")
     # } else {
     #   stop("Unexpected number of columns in vcf GT dataframe: ", 
     #        ncol(private$.data_df@gt))
     # }
     # per_pipeline_df = assign_ids(
     #   df1 = per_pipeline_df,
     #   measurementset_id = attr(private$.data_df, "measurementset_id"),
     #   dataset_id = attr(private$.data_df, "dataset_id"),
     #   dataset_version = attr(private$.data_df, "dataset_version"))
     # per_sample_df = assign_ids(
     #   df1 = per_sample_df,
     #   measurementset_id = attr(private$.data_df, "measurementset_id"),
     #   dataset_id = attr(private$.data_df, "dataset_id"),
     #   dataset_version = attr(private$.data_df, "dataset_version"))
     # per_sample_df$biosample_id = attr(private$.data_df, "biosample_id")
   }
   
  )
)

##### createDataLoader #####
#' @export      
createDataLoader = function(data_df, reference_object, feature_annotation_df = NULL,
                            loader_config){
  # Special formulation for entries that need to be disambuiguated by filter_choices
  if (grepl("COPY|CNV|CyTOF", reference_object$measurement_set$pipeline_scidb) |
      grepl("file link", reference_object$measurement_set$filter_name) | 
      grepl("geneset", reference_object$measurement_set$filter_name)) { 
    temp_string = paste0("{", reference_object$measurement_set$pipeline_scidb, "}{", 
                         reference_object$measurement_set$filter_name, "}")
  } else {
    temp_string = paste0("{", reference_object$measurement_set$pipeline_scidb, "}{", 
                         reference_object$measurement_set$quantification_level, "}")
  }
  
  switch_case_call = paste0(
    "switch(temp_string,",
    paste0(
      sapply (names(loader_config), function(name) {
        selectors = loader_config[[name]]
        text1 = paste0("'", selectors, "'", collapse = ' = , ')
        paste0(text1, 
          ' = ', name, 
          '$new(data_df = data_df, reference_object = reference_object, feature_annotation_df = feature_annotation_df)'
          )
      }), 
      collapse = ', '),
    ',stop("Need to add loader for choice:\n", temp_string))'
  )
  loaderObj = eval(parse(text = switch_case_call))
  return(loaderObj)
}
Paradigm4/revealgenomics documentation built on April 7, 2020, 2:01 a.m.