R/api_dataframe_reader.R

Defines functions createDataReader

##### DataReader #####
DataReader = R6::R6Class(classname = 'DataReader', 
                         public = list(
                           initialize = function(pipeline_df,
                                                 measurement_set){
                             private$.measurement_set = measurement_set
                             private$.pipeline_df = pipeline_df
                           },
                           print_level = function() {cat("----(Level: DataReader)\n")},
                           file_format_handler = function() {
                             private$.file_format_handler
                           },
                           load_data_from_file = function() {
                             cat("load_data_from_file()"); self$print_level()
                             file_path = unique(private$.pipeline_df$file_path)
                             stopifnot(length(file_path) == 1)
                             if (private$.is_excel) {
                               cat(paste0("Reading Excel file:\n\t", file_path, "\n"))
                               sheets = readxl::excel_sheets(path = file_path)
                               if (length(sheets) != 1) {
                                 stop("Expected 1 sheet in Excel workbook. Found following:\n\t", 
                                      pretty_print(sheets))
                               }
                               private$.data_df = as.data.frame(
                                 readxl::read_excel(path = file_path)
                               )
                             } else {
                               cat(paste0("Reading *", switch(private$.separator,
                                                              '\t' = 'tab',
                                                              ',' = 'comma'), 
                                          "* delimited file:\n\t", file_path, "\n"))
                               if (!private$.header) {
                                 cat("--- No header in file\n")
                               }
                               private$.data_df = read.delim(file = file_path,
                                                             sep = private$.separator,
                                                             check.names = FALSE,
                                                             stringsAsFactors = FALSE,
                                                             header = private$.header)
                             }
                             cat("Dimensions:", dim(private$.data_df), "\n")
                             invisible(self)
                           },
                           get_feature_annotation = function() {
                             private$.feature_annotation_df
                           },
                           get_data = function(){
                             private$.data_df
                           },
                           # get_pipeline_df = function() { private$.pipeline_df }
                           verify_read = function() {
                             if( ! ('biosample_name' %in% colnames(private$.data_df)) ) {
                               stop("Column `biosample_name` must be introduced (if not already present) by DataReader")
                             }
                             if ( any(
                                    sapply(
                                      c('__DNA', '__RNA', '__Protein'), 
                                      function(suffix) {length(grep(suffix, private$.data_df$biosample_name)) > 1}
                                      )
                                    )
                                  ) {
                               stop("`sample_molecule_type` should not be present in column `biosample_name`")
                             }
                             
                             
                           },
                           
                           # Handle case when sample column in data file is different from name in sample manifest
                           # 
                           # Replace sample column in file with `original_sample_name` (name from sample manifest) using 
                           # mapping (from `data_file_sample_name` to `original_sample_name` provided in Excel file 
                           enforce_data_file_sample_name_column = function(column_in_file = 'biosample_name') {
                             # Whether one needs to enforce the `data_file_sample_name` column
                             need_to_enforce_data_file_sample_name_column = function() {
                               if ('data_file_sample_name' %in% colnames(private$.pipeline_df)) {
                                 !identical(
                                   private$.pipeline_df$data_file_sample_name, 
                                   private$.pipeline_df$original_sample_name
                                 ) # enforce only if `data_file_sample_name` is different from `original_sample_name`
                               } else { # Cannot enforce if `data_file_sample_name` column is not present
                                 FALSE
                               } 
                             }
                             if (need_to_enforce_data_file_sample_name_column()) {
                               cat("Pipelines sheet column `data_file_sample_name` is different from column `original_sample_name`\n")
                               if (column_in_file %in% colnames(private$.data_df)) {
                                 m1 = find_matches_and_return_indices(
                                   private$.data_df[, column_in_file], 
                                   private$.pipeline_df$data_file_sample_name
                                 )
                                 if (length(m1$source_unmatched_idx) != 0) { # Being very restrictive right now
                                   stop("Did not find matches for following sample names in data file:\n\t", 
                                        pretty_print(unique(private$.data_df[, column_in_file][m1$source_unmatched_idx])))
                                 }
                                 private$.data_df[, column_in_file] = 
                                   private$.pipeline_df[m1$target_matched_idx, ]$original_sample_name
                               } else { # if `column_in_file` not in columns of private$.data_df, assume that data is in matrix format
                                 m0 = find_matches_and_return_indices(
                                   source = colnames(private$.data_df),
                                   target = private$.pipeline_df$data_file_sample_name
                                 )
                                 if (length(m0$source_unmatched_idx) > 0) {
                                   message("Unmatched columns: \n\t", 
                                           pretty_print(
                                             colnames(private$.data_df)[m0$source_unmatched_idx], 
                                             prettify_after = length(m0$source_unmatched_idx)
                                           ))
                                 }
                                 colnames(private$.data_df)[m0$source_matched_idx] = 
                                   private$.pipeline_df[m0$target_matched_idx, ]$original_sample_name
                               }
                             } # end of if (need_to_enforce_data_file_sample_name_column())
                           }, 
                           
                           # transpose matrix file if column 1 contains sample names
                           # convert to matrix with rows = features, and columns = samples
                           transpose_matrix_if_required = function() {
                             needToTransposeMatrix = FALSE
                             col1_names = private$.data_df[, 1] # first column contains sample names
                             if (any(duplicated(col1_names))) {
                               stop("Expected to call this function only on matrix file ", 
                                    "with first column containing unique identifiers")
                             }
                             pipeline_col_for_matching = ifelse('data_file_sample_name' %in% colnames(private$.pipeline_df),
                                    'data_file_sample_name', 'sample_name')
                             
                             m1 = find_matches_and_return_indices(
                               col1_names, private$.pipeline_df[, pipeline_col_for_matching]
                             )
                             
                             if (length(m1$source_matched_idx) == length(unique(private$.pipeline_df$data_file_sample_name))) {
                               message("All identifiers in column 1 of data file match with sample names", 
                                       "; Definitely need to transpose matrix")
                               needToTransposeMatrix = TRUE
                             } else if (length(m1$source_matched_idx) > 0) {
                               pctMatch = length(m1$source_matched_idx)/length(col1_names)*100
                               if (pctMatch > 0) {
                                 message("Of ", length(col1_names), " identifiers in column 1 of data file, ", 
                                         length(m1$source_matched_idx), " (", 
                                         pctMatch, 
                                         " %) matched sample names. Seems like transpose is required")
                                 needToTransposeMatrix = TRUE
                               }
                             }
                             if (needToTransposeMatrix) {
                               if (length(m1$source_unmatched_idx) > 0) {
                                 message("Dropping rows where sample identifier were not matched in pipeline: \n\t", 
                                         pretty_print(col1_names[m1$source_unmatched_idx]))
                                 private$.data_df = private$.data_df[m1$source_matched_idx, ]
                                 col1_names = col1_names[m1$source_matched_idx]
                               }
                               message("Transposing matrix data")
                               matrix_data = private$.data_df[, 2:ncol(private$.data_df)]
                               dataT = as.data.frame(t(as.matrix(matrix_data)))
                               if (ncol(dataT) != length(col1_names)) {
                                 stop("Expected number of columns in transposed data matrix", 
                                      " to be equal to number of identifiers in column 1 of original data.frame")
                               }
                               dataT_df = cbind(
                                 data.frame('tracking_id' = rownames(dataT), 
                                            stringsAsFactors = FALSE), 
                                 dataT
                               )
                               colnames(dataT_df)[2:ncol(dataT_df)] = col1_names
                               rownames(dataT_df) = 1:nrow(dataT_df)
                               private$.data_df = dataT_df
                             }
                             
                           }
                         ),
                         private = list(
                           .measurement_set = NULL,
                           .pipeline_df = NULL,
                           .feature_annotation_df = NULL,
                           .header = TRUE, # whether to read first row of data-file as header
                           .separator = '\t',
                           .is_excel = FALSE, 
                           .data_df = NULL, 
                           .file_format_handler = yaml::yaml.load_file(
                             system.file("extdata", "datareader_file_format_handler.yaml", package="revealgenomics"))
                         ))

## DataReaderAuto (Automatically infer separator -- CSV or TSV) ####
DataReaderAuto = R6::R6Class(
  classname = 'DataReaderAuto',
  inherit = DataReader,
  public = list(
   print_level = function() {cat("----(Level: DataReaderAuto)\n")},
   load_data_from_file = function() {
     cat("load_data_from_file()"); self$print_level()
     
     file_path = unique(private$.pipeline_df$file_path)
     stopifnot(length(file_path) == 1)
     if (tools::file_ext(file_path) %in% c('xls', 'xlsx')) {
       private$.is_excel = TRUE
     } else {
       tsvReadSuccess = tryCatch({
         cat("Trying to read as TSV file\n")
         temp = read.delim(file = file_path,
                           sep = private$.separator,
                           nrows = 2, # file should at least have two lines
                           check.names = FALSE)
         if (ncol(temp) == 1) {
           cat("Unlikely that data file has 1 column. Try CSV next\n")
           FALSE
         } else {
           cat("Read attempt as TSV succeeded\n")
           TRUE
         }
       }, error = function(e) {
         cat("Read attempt as TSV failed. Try CSV next\n")
         FALSE
       })
       
       if (!tsvReadSuccess) {
         private$.separator = ','
       }
     }
     
     super$load_data_from_file()
   }
  ),
  private = list(
    convert_wide_to_tall_skinny = function(keyname = 'biosample_name') {
      cat("convert_wide_to_tall_skinny()"); self$print_level()
      
      cat("-----\n\tConverting wide matrix form to tall-skinny table\n")
      private$.data_df = tidyr::gather(data = private$.data_df, 
                                       key = keyname, 
                                       value='value', 
                                       colnames(private$.data_df)[2:length(colnames(private$.data_df))])
      colnames(private$.data_df)[2] = keyname
    }
  )
)
                                           
##### DataReaderVariant #####
DataReaderVariant = R6::R6Class(classname = 'DataReaderVariant',
                                inherit = DataReader,
                                public = list(
                                  print_level = function() {cat("----(Level: DataReaderVariant)\n")}
                                ))

##### DataReaderVariantFormatA #####
#' generic variant loader that aims to work with multiple formats
#' - makes sure the mandatory fields for registering variants are named accordingly
#' - for biosample matching, preps a column called `biosample_name`
#' - for feature matching
#'     + creates a column called `scidb_feature_column`
#'     + stores annotation information in `feature_annotation_df` slot
#'       (column name of this data.frame describes whether feature `name` or `gene_symbol` 
#'       should be used for matching)
DataReaderVariantFormatA = R6::R6Class(classname = 'DataReaderVariantFormatA',
                                      inherit = DataReaderVariant,
                                      public = list(
                                        print_level = function() {cat("----(Level: DataReaderVariantFormatA)\n")},
                                        load_data_from_file = function() {
                                          cat("load_data_from_file()"); self$print_level()
                                          
                                          tsvReadSuccess = tryCatch({
                                            cat("Trying to read as TSV file\n")
                                            temp = read.delim(file = unique(private$.pipeline_df$file_path),
                                                            sep = private$.separator,
                                                            nrows = 2, # file should at least have two lines
                                                            check.names = FALSE)
                                            if (ncol(temp) == 1) {
                                              cat("Unlikely that variant file has 1 column. Try CSV next\n")
                                              FALSE
                                            } else {
                                              cat("Read attempt as TSV succeeded\n")
                                              TRUE
                                            }
                                          }, error = function(e) {
                                            cat("Read attempt as TSV failed. Try CSV next\n")
                                            FALSE
                                          })
                                          
                                          if (!tsvReadSuccess) {
                                            private$.separator = ','
                                          }
                                            
                                          super$load_data_from_file()
                                          
                                          cols_library = list(
                                            option1 = list(
                                              annotation_col = c('ref', 'alt', 'chrom', 'vcf_id'),
                                              ftr_col    = 'gene',
                                              ftr_compare_col = 'gene_symbol',
                                              sample_col = 'Analysis_Barcode',
                                              start_col = 'start',
                                              end_col = 'end'
                                            ), 
                                            option2 = list(
                                              annotation_col = c('REF', 'ALT', '#CHROM', 'ID'),
                                              ftr_col    = 'ANN[*].GENEID',
                                              ftr_compare_col = 'name',
                                              sample_col = 'Sample',
                                              start_col = 'POS',
                                              end_col = 'POS'
                                            ),
                                            option3 = list( # Personalis panel
                                              annotation_col = c('REF', 'ALT', 'Sequence', 'Variant ID'),
                                              ftr_col    = 'Gene Symbol',
                                              ftr_compare_col = 'gene_symbol',
                                              sample_col = 'biosample_name', # this will be manually introduced below
                                              start_col = 'POS',
                                              end_col = 'POS'
                                            ), 
                                            option4 = list( # Covance panel
                                              annotation_col = c('ref', 'alt', 'chrom', 'rs_ids'),
                                              ftr_col    = 'gene',
                                              ftr_compare_col = 'gene_symbol',
                                              sample_col = 'PFREFID',
                                              start_col = 'start',
                                              end_col = 'end'
                                            )
                                          )
                                          
                                          include_sample_col_in_match = TRUE
                                          potential_sample_cols = sapply(cols_library, function(item) item$sample_col)
                                          skip_enforcing_data_file_sample_name = FALSE
                                          if (!(any(potential_sample_cols %in% colnames(private$.data_df)))) {
                                            if (nrow(private$.pipeline_df) == 1) {
                                              cat("One sample per file. Manually attaching sample information\n")
                                              private$.data_df$biosample_name = private$.pipeline_df$original_sample_name
                                              skip_enforcing_data_file_sample_name = TRUE # Do not need to enforce since manually copied above
                                              include_sample_col_in_match = FALSE
                                            }
                                          }
                                          
                                          # Find which column library current file matches with
                                          matchWithLibrary = sapply(cols_library, function(item) {
                                            cols_to_match = c(item$annotation_col, item$ftr_col,
                                                              item$start_col, item$end_col)
                                            if (include_sample_col_in_match) {
                                              cols_to_match = c(cols_to_match, 
                                                                item$sample_col)
                                            }
                                            all(cols_to_match %in% colnames(private$.data_df))}
                                          )
                                          if (!any(matchWithLibrary)) {
                                            stop("Expected match with at least one of the library options")
                                          }
                                          if (sum(matchWithLibrary) > 1) {
                                            message("Found match at more than one library options; Using first")
                                          }
                                          colsMatched = cols_library[[names(which(matchWithLibrary)[1])]]
                                          
                                          # Enforce data file sample name column if required
                                          if (!skip_enforcing_data_file_sample_name) {
                                            super$enforce_data_file_sample_name_column(column_in_file = colsMatched$sample_col)
                                          }
                                          
                                          cat("Rule 1:\n")
                                          mandatory_columns = colsMatched$annotation_col
                                          cat("======\n\t test presence of columns:\n\t\t", 
                                              pretty_print(mandatory_columns), "\n")
                                          stopifnot(mandatory_columns %in% colnames(private$.data_df))
                                          
                                          cat("Rule 2:\n")
                                          column_rename = c('reference',
                                                            'alternate',
                                                            'chromosome',
                                                            'id')
                                          names(column_rename) = mandatory_columns
                                          cat("======\n\t rename columns:\n\t\t", 
                                              pretty_print(names(column_rename)), "==>", 
                                              pretty_print(column_rename), "\n")
                                          private$.data_df = plyr::rename(private$.data_df, 
                                                                          column_rename)
                                          
                                          cat("Rule 3:\n")
                                          cat("======\nFeature column in data is at column:", colsMatched$ftr_col,
                                              "\n\tMatch with scidb feature column:", colsMatched$ftr_compare_col, "\n")
                                          private$.data_df[, 'scidb_feature_col'] = 
                                            private$.data_df[, colsMatched$ftr_col]
                                          private$.feature_annotation_df = data.frame(
                                            col1 = private$.data_df[, colsMatched$ftr_col],
                                            stringsAsFactors = FALSE
                                          )
                                          colnames(private$.feature_annotation_df) = colsMatched$ftr_compare_col
                                          
                                          cat("Rule 3:\n")
                                          cat("======\nstart and end columns\n")
                                          cat("start col:", colsMatched$start_col, "\n")
                                          cat("end col:",   colsMatched$end_col, "\n")
                                          private$.data_df[, 'start'] = private$.data_df[, 
                                                                                         colsMatched$start_col]
                                          private$.data_df[, 'end'] = private$.data_df[, 
                                                                                         colsMatched$end_col]
                                          
                                          cat("Rule 5:\n======\nHandling sample name (if present)\n")
                                          if (nrow(private$.pipeline_df) > 1) {
                                            cat("This is a summarized variant file\n")
                                            cat(paste0("Expecting '", colsMatched$sample_col, 
                                                       "' column to hold the sample name\n"))
                                            stopifnot(colsMatched$sample_col %in% colnames(private$.data_df) )
                                            if ('biosample_name' %in% colnames(private$.data_df)) {
                                              stop("Did not expect `biosample_name` column in variant file\n")
                                            }
                                            private$.data_df$biosample_name = private$.data_df[, 
                                                                                               colsMatched$sample_col]
                                          }
                                        }
                                      ))

##### DataReaderCopyNumberVariantVariableColumns #####
#' generic variant loader that aims to work with multiple formats
#' - makes sure the mandatory fields for registering variants are named accordingly
#' - for biosample matching, preps a column called `biosample_name`
#' - for feature matching
#'     + creates a column called `scidb_feature_column`
#'     + stores annotation information in `feature_annotation_df` slot
#'       (column name of this data.frame describes whether feature `name` or `gene_symbol` 
#'       should be used for matching)
DataReaderCopyNumberVariantVariableColumns = R6::R6Class(
  classname = 'DataReaderCopyNumberVariantVariableColumns',
  inherit = DataReaderAuto,
  public = list(
   print_level = function() {cat("----(Level: DataReaderCopyNumberVariantVariableColumns)\n")},
   load_data_from_file = function() {
     cat("load_data_from_file()"); self$print_level()
     
     super$load_data_from_file()
     
     cols_library = list(
       option1 = list(
         colnames_in_file = c('NCBI Gene ID', 
                            'Gene Symbol', 
                            'Sequence', 
                            'Segment Start', 
                            'Segment End', 
                            'CNA Type', 
                            'AbsoluteCN', 
                            'Probability CNA Call', 
                            'P-value CNA Call', 
                            'Provided Tumor purity'),
         ftr_col    = 'Gene Symbol',
         ftr_compare_col = 'gene_symbol',
         sample_col = NA,
         chromosome_col = 'Sequence',
         start_col = 'Segment Start',
         end_col = 'Segment End',
         type_col = 'CNA Type',
         probablity_cna_call_fill = FALSE
       ), 
       option2 = list(
         colnames_in_file = c('NCBI Gene ID', 
                              'Gene Symbol', 
                              'Sequence', 
                              'Segment Start', 
                              'Segment End', 
                              'CNA Type', 
                              'AbsoluteCN', 
                              'P-value CNA Call', 
                              'Provided Tumor purity'),
         ftr_col    = 'Gene Symbol',
         ftr_compare_col = 'gene_symbol',
         sample_col = NA,
         chromosome_col = 'Sequence',
         start_col = 'Segment Start',
         end_col = 'Segment End',
         type_col = 'CNA Type',
         probablity_cna_call_fill = TRUE
       )
     )
     matchWithLibrary = sapply(cols_library, function(item) {
       cols_to_match = item$colnames_in_file
       identical(sort(cols_to_match), sort(colnames(private$.data_df)))}
     )
     if (!any(matchWithLibrary)) {
       stop("Expected match with at least one of the library options")
     }
     if (sum(matchWithLibrary) > 1) {
       message("Found match at more than one library options; Using first")
     }
     colsMatched = cols_library[[names(which(matchWithLibrary)[1])]]
     
     # Sample matching 
     potential_sample_cols = sapply(cols_library, function(item) item$sample_col)
     skip_enforcing_data_file_sample_name = FALSE
     if (!(any(potential_sample_cols %in% colnames(private$.data_df)))) {
       if (nrow(private$.pipeline_df) == 1) {
         cat("One sample per file. Manually attaching sample information\n")
         skip_enforcing_data_file_sample_name = TRUE # Do not need to enforce since manually copied above
         if (nrow(private$.data_df) == 0) {
           private$.data_df$biosample_name = character()
         } else {
           private$.data_df$biosample_name = private$.pipeline_df$original_sample_name
         }
       }
     }
     
     if (!skip_enforcing_data_file_sample_name) {
       super$enforce_data_file_sample_name_column()
     }
     
     cat("Rule 1:\n")
     if (colsMatched$probablity_cna_call_fill) {
       if (nrow(private$.data_df) == 0) {
         private$.data_df$`Probability CNA Call` = numeric()
       } else {
         private$.data_df$`Probability CNA Call` = as.numeric(NA)
       }
     } 

     cat("Rule 2:\n")

     cat("Rule 3:\n")
     cat("======\nFeature column in data is at column:", colsMatched$ftr_col,
         "\n\tMatch with scidb feature column:", colsMatched$ftr_compare_col, "\n")
     private$.data_df[, 'scidb_feature_col'] = 
       private$.data_df[, colsMatched$ftr_col]
     private$.feature_annotation_df = data.frame(
       col1 = private$.data_df[, colsMatched$ftr_col],
       stringsAsFactors = FALSE
     )
     colnames(private$.feature_annotation_df) = colsMatched$ftr_compare_col
     
     cat("Rule 3:\n")
     cat("======\nCNV type, chromosome, start and end columns\n")
     cat("type col:", colsMatched$type_col, "\n")
     cat("chromosome col:", colsMatched$chromosome_col, "\n")
     cat("start col:", colsMatched$start_col, "\n")
     cat("end col:",   colsMatched$end_col, "\n")
     private$.data_df[, 'type'] = private$.data_df[, 
                                                            colsMatched$type_col]
     private$.data_df[, 'segment_start'] = private$.data_df[, 
                                                            colsMatched$start_col]
     private$.data_df[, 'segment_end'] = private$.data_df[, 
                                                  colsMatched$end_col]
     
     cat("Rule 5:\n======\nHandling sample name (if present)\n")
     if (nrow(private$.pipeline_df) > 1) {
       stop("Not implemented yet")
       cat("This is a summarized variant file\n")
       cat(paste0("Expecting '", colsMatched$sample_col, 
                  "' column to hold the sample name\n"))
       stopifnot(colsMatched$sample_col %in% colnames(private$.data_df) )
       if ('biosample_name' %in% colnames(private$.data_df)) {
         stop("Did not expect `biosample_name` column in variant file\n")
       }
       private$.data_df$biosample_name = private$.data_df[, 
                                                          colsMatched$sample_col]
                                           }
                                         }
                                       ))
##### DataReaderFMI #####
DataReaderFMI = R6::R6Class(
  classname = 'DataReaderFMI',
  inherit = DataReader,
  public = list(
    print_level = function() {cat("----(Level: DataReaderFMI)\n")},
    load_data_from_file = function() {
      cat("load_data_from_file()"); self$print_level()
      
      super$load_data_from_file()
      
      # Infer biosample name column
      biosample_name_col_options = unlist(private$.file_format_handler$DataReaderFMI$mapped_columns$biosample_name_sample_col)
      bios_col_in_data = biosample_name_col_options[biosample_name_col_options %in% colnames(private$.data_df)]
      if (length(bios_col_in_data) == 0) {
        stop("Did not find matching biosample column in file. Possible options: ", 
             pretty_print(biosample_name_col_options))
      } else if (length(bios_col_in_data) > 1) {
        message("Found multiple matches for biosample column names: ", 
                pretty_print(bios_col_in_data), 
                ". Using first")
        bios_col_in_data = bios_col_in_data[1]
      }
      biosample_name_col = bios_col_in_data
      cat("Assigning values for biosample name using column:", biosample_name_col, "\n")
      private$.data_df[, 'biosample_name'] = private$.data_df[, biosample_name_col]
      
      cols_to_discard = private$.file_format_handler$DataReaderFMI$mapped_columns$discarded_columns
      
      cat("Discarding columns:", pretty_print(cols_to_discard), "\n")
      private$.data_df = private$.data_df[, 
                  !(colnames(private$.data_df) %in% cols_to_discard)]
      discard_na_columns = function(data_df, 
                                    na_marker = '-') {
        # name of columns where all values are equal to `na_marker`
        cols_to_keep = names(
          which(
            apply(data_df == na_marker, 
                  FUN = function(elem) !all(elem), 
                  MARGIN = 2)
            )
          )
        data_df[, cols_to_keep]
      }
      cat("Splitting file into separate data frames for
          Variant, Fusion and CNV\n")
      data_df_list = list(
        variant_data = discard_na_columns(private$.data_df[
          private$.data_df$`VARIANT-TYPE` == 'short-variant', 
        ]),
        cnv_data = discard_na_columns(private$.data_df[
          private$.data_df$`VARIANT-TYPE` == 'copy-number-alteration', 
          ]),
        fusion_data = discard_na_columns(private$.data_df[
          (private$.data_df$`VARIANT-TYPE` == 'rearrangement' |
           private$.data_df$`VARIANT-TYPE` == 'fusion'), 
          ])
      )
      data_df_list$variant_data$`VARIANT-TYPE` = NULL
      data_df_list$cnv_data$`VARIANT-TYPE` = NULL
      data_df_list$fusion_data$`VARIANT-TYPE` = NULL
      
      # Some more manual handling of columns
      data_df_list$fusion_data$GENE = NULL
      
      private$.data_df = data_df_list
    }
  )
)

##### DataReaderFMIVariant #####
DataReaderFMIVariant = R6::R6Class(
  classname = 'DataReaderFMIVariant',
  inherit = DataReaderFMI,
  public = list(
    print_level = function() {cat("----(Level: DataReaderFMIVariant)\n")},
    load_data_from_file = function() {
      cat("load_data_from_file()"); self$print_level()
      
      super$load_data_from_file()
      
      cat("Extracting variant data\n")
      private$.data_df = private$.data_df$variant_data
      super$enforce_data_file_sample_name_column()
      
      cat("(Extracted) Dimensions:", dim(private$.data_df), "\n")
      
      cat("Applying rules specific to FMI data\n")
      chr_pos_split = stringi::stri_split(
        str = private$.data_df$`SV-GENOME-POSITION`,
        fixed = ":")
      if (
        unique(sapply(chr_pos_split, FUN = function(elem) length(elem))) != 2) {
        stop("Expect each row to contain chromosome and position (no more no less)\n
             Data has entries that do not satisfy this rule:\n\t", 
             pretty_print(
               private$.data_df$`SV-GENOME-POSITION`[
                 sapply(chr_pos_split, FUN = function(elem) length(elem)) != 2]
             ),
             "\nNeed to write handler for this case")
      }
      
      cat("Extracting chromosome, start, end information\n")
      private$.data_df$chromosome = sapply(chr_pos_split, 
                                           function(elem) elem[1])
      private$.data_df$start = sapply(chr_pos_split, 
                                           function(elem) elem[2])
      private$.data_df$end = sapply(chr_pos_split, 
                                      function(elem) elem[2])
      
      cat("Extracting reference, alternate information\n")
      
      stopifnot(!('reference' %in% colnames(private$.data_df)))
      stopifnot(!('alternate' %in% colnames(private$.data_df)))
      
      private$.data_df$reference = NA
      private$.data_df$alternate = NA
      
      ref_alt_split = stringi::stri_split(str = private$.data_df$`SV-CDS-CHANGE`, fixed = ">")
      split_len = sapply(ref_alt_split, function(elem) length(elem))
      
      valid_pos = which(split_len == 2)
      private$.data_df$alternate[valid_pos] = sapply(
        ref_alt_split[valid_pos], 
        function(elem) elem[2])
      private$.data_df$reference[valid_pos] = sapply(
        ref_alt_split[valid_pos], 
        function(elem) {
          xx = elem[1]
          substr(xx, stringr::str_length(xx), stringr::str_length(xx))
        })
      
      cat("Supplying NA for id\n")
      private$.data_df$id = NA
      
      feature_col = 'GENE'
      cat("Assigning values for feature name using column:", feature_col, "\n")
      private$.data_df[, 'scidb_feature_col'] = private$.data_df[, feature_col]
      
      cat("Storing the feature annotation information\n")
      private$.feature_annotation_df = data.frame(gene_symbol = private$.data_df[, feature_col], 
                                                  stringsAsFactors = FALSE)
    }
  )
)

##### DataReaderFMIFusion #####
DataReaderFMIFusion = R6::R6Class(
  classname = 'DataReaderFMIFusion',
  inherit = DataReaderFMI,
  public = list(
    print_level = function() {cat("----(Level: DataReaderFMIFusion)\n")},
    load_data_from_file = function() {
      cat("load_data_from_file()"); self$print_level()
      
      super$load_data_from_file()
      
      cat("Extracting fusion data\n")
      private$.data_df = private$.data_df$fusion_data
      super$enforce_data_file_sample_name_column()
      
      cat("(Extracted) Dimensions:", dim(private$.data_df), "\n")
      
      cat("Extracting chromosome, start, end information\n")
      
      # attribute: gene_left  (Column REARR-GENE1)
      private$.data_df$gene_left = private$.data_df$`REARR-GENE1`
      
      # attributes: chromosome_left, pos_left  (Column REARR-POS1)
      # REARR-POS1 column will have string like "chr12:103439097-103439635"
      rearr_pos1 = stringi::stri_split(
        str = private$.data_df$`REARR-POS1`,
        fixed = ":")
      private$.data_df$chromosome_left = sapply(rearr_pos1, function(elem) {elem[1]})
      rearr_pos1_start_end = sapply(rearr_pos1, function(elem) {elem[2]})
      rearr_pos1_start_end = sapply(rearr_pos1_start_end, function(elem) {stringi::stri_split(str = elem, fixed='-')})
      rearr_pos1_start = sapply(rearr_pos1_start_end, function(elem) {elem[1]})
      rearr_pos1_end = sapply(rearr_pos1_start_end, function(elem) {elem[2]})
      private$.data_df$start_left = rearr_pos1_start
      private$.data_df$end_left   = rearr_pos1_end

      # attribute: gene_right  (Column REARR-GENE2)
      private$.data_df$gene_right = private$.data_df$`REARR-GENE2`
      
      # attributes: chromosome_left, pos_left  (Column REARR-POS2)
      # REARR-POS1 column will have string like "chr12:103439097-103439635"
      rearr_pos2 = stringi::stri_split(
        str = private$.data_df$`REARR-POS2`,
        fixed = ":")
      private$.data_df$chromosome_right = sapply(rearr_pos2, function(elem) {elem[1]})
      rearr_pos2_start_end = sapply(rearr_pos2, function(elem) {elem[2]})
      rearr_pos2_start_end = sapply(rearr_pos2_start_end, function(elem) {stringi::stri_split(str = elem, fixed='-')})
      rearr_pos2_start = sapply(rearr_pos2_start_end, function(elem) {elem[1]})
      rearr_pos2_end = sapply(rearr_pos2_start_end, function(elem) {elem[2]})
      private$.data_df$start_right = rearr_pos2_start
      private$.data_df$end_right   = rearr_pos2_end
      
      # attribute: num_spanning_reads (Column REARR-NUMBER-OF-READS)
      private$.data_df$num_spanning_reads = private$.data_df$`REARR-NUMBER-OF-READS`
      
      # these values are not found in the sheet.
      private$.data_df$num_mate_pairs = NA
      private$.data_df$num_mate_pairs_fusion = NA
      
      feature_col = c('REARR-GENE1', 'REARR-GENE2')
      # cat("Assigning values for feature name using column:", pretty_print(feature_col), "\n")
      # private$.data_df[, 'scidb_feature_col1'] = private$.data_df[, feature_col[1]]
      # private$.data_df[, 'scidb_feature_col2'] = private$.data_df[, feature_col[2]]
      
      cat("Storing the feature annotation information\n")
      private$.feature_annotation_df = private$.data_df[, feature_col]
      cat("DataReaderFMIFusion::load_data_from_file() complete")
    }
  )
)

##### DataReaderFMICopyNumberVariant #####
DataReaderFMICopyNumberVariant = R6::R6Class(
  classname = 'DataReaderFMICopyNumberVariant',
  inherit = DataReaderFMI,
  public = list(
    print_level = function() {cat("----(Level: DataReaderFMICopyNumberVariant)\n")},
    load_data_from_file = function() {
      cat("load_data_from_file()"); self$print_level()
      
      super$load_data_from_file()
      
      cat("Extracting copy number variant data\n")
      private$.data_df = private$.data_df$cnv_data
      super$enforce_data_file_sample_name_column()
      
      cat("(Extracted) Dimensions:", dim(private$.data_df), "\n")
      
      cat("Applying rules specific to FMI data\n")

      cat("Removing suffix `CNA-`")
      colnames(private$.data_df)[
        grep("CNA-", colnames(private$.data_df))
      ] = gsub(
        "CNA-", "", colnames(private$.data_df)[
          grep("CNA-", colnames(private$.data_df))
          ]
      )
      
      cat("Renaming column TYPE as type\n")
      private$.data_df = plyr::rename(private$.data_df, c('TYPE' = 'type'))
      
      feature_col = 'GENE'
      cat("Assigning values for feature name using column:", feature_col, "\n")
      private$.data_df[, 'scidb_feature_col'] = private$.data_df[, feature_col]
      
      cat("Storing the feature annotation information\n")
      private$.feature_annotation_df = data.frame(gene_symbol = private$.data_df[, feature_col], 
                                                  stringsAsFactors = FALSE)
    }
  )
)

DataReaderVariantExomic = R6::R6Class(
  classname = 'DataReaderVariantExomic',
  inherit = DataReader,
  public = list(
    print_level = function() {cat("----(Level: DataReaderVariantExomic)\n")},
    load_data_from_file = function() {
      cat("load_data_from_file()"); self$print_level()
      file_path = unique(private$.pipeline_df$file_path)
      stopifnot(length(file_path) == 1)
      cat(paste0("Reading *", switch(private$.separator,
                                     '\t' = 'tab',
                                     ',' = 'comma'), 
                 "* VCF file:\n\t", file_path, "\n"))
      private$.data_df = vcfR::read.vcfR(file = file_path)
      # cat("Preview:", vcfR::head(private$.data_df), "\n")
      invisible(self)
    }
    ),
  private = list(
  )
)
                                           
##### DataReaderExpressionMatrix #####
DataReaderExpressionMatrix = R6::R6Class(classname = 'DataReaderExpressionMatrix',
                               inherit = DataReaderAuto,
                               public = list(
                                 print_level = function() {cat("----(Level: DataReaderExpressionMatrix)\n")}
                               ), 
                               private = list(
                               ))

##### DataReaderRNAQuantRNASeqCufflinks #####
DataReaderRNAQuantRNASeqCufflinks = R6::R6Class(classname = 'DataReaderRNAQuantRNASeqCufflinks',
                                inherit = DataReaderExpressionMatrix,
                                public = list(
                                  print_level = function() {cat("----(Level: DataReaderRNAQuantRNASeqCufflinks)\n")},
                                  load_data_from_file = function() {
                                    super$load_data_from_file()
                                    cat("load_data_from_file()"); self$print_level()
                                    
                                    # - if 1 pipeline info row for a file, try per-sample reader. 
                                    #     + if that fails, try aggregate reader
                                    # - if multiple pipeline info rows for a file, try aggregate reader
                                    if (nrow(private$.pipeline_df) == 1) {
                                      cat("One pipeline information row for file. Attempting to use per-sample file loader\n")
                                      tryAggregateLoader = tryCatch({
                                        # code for per-sample loader
                                        stopifnot(length(private$.pipeline_df$original_sample_name) == 1)
                                        private$.data_df$biosample_name = private$.pipeline_df$original_sample_name
                                        # super$enforce_data_file_sample_name_column() # Do not need to enforce since manually copied above
                                        
                                        columns_to_drop = c('class_code', 
                                                            'nearest_ref_id', 
                                                            'tss_id', 'locus', 
                                                            'length', 'coverage', 
                                                            'FPKM_conf_lo', 'FPKM_conf_hi', 'FPKM_status')
                                        private$.data_df[, columns_to_drop] = NULL
                                        private$.data_df = plyr::rename(private$.data_df, 
                                                                        c('FPKM' = 'value'))
                                        
                                        if (identical(private$.data_df$tracking_id, private$.data_df$gene_id)) {
                                          cat("Identified gene file-type\n")
                                          private$.data_df$gene_id = NULL
                                        } else {
                                          cat("Identified isoform file-type\n")
                                          stop("Need to code for this. Refer function: register_expression_file_cufflinks()")
                                        }
                                        
                                        return(FALSE) # do not need to try aggregate loader 
                                      }, error = function(e) {
                                        cat("Tried per-sample file loader and failed. Trying aggregateLoader\n")
                                        return(TRUE)
                                      })
                                    } else { # 
                                      cat("multiple pipeline information row for file. Attempting to use aggregate file loader\n")
                                      tryAggregateLoader = TRUE
                                    }
                                    
                                    if (tryAggregateLoader) {
                                      # code for aggregate file loader

                                      # Cufflinks file has feature annotation as well as expression data
                                      # Step 1 of 2 -- Extract annotation data
                                      ftr_col = 'gene'
                                      stopifnot(length(unique(private$.data_df[, ftr_col])) == 
                                                  nrow(private$.data_df[, ftr_col]))
                                      ftr_ann_columns = c(ftr_col, 'mrna', 'refseq', 'ucscid', 
                                                          'description', 'entrez', 
                                                          'chr', 'beg', 'end')
                                      private$.feature_annotation_df = private$.data_df[, ftr_ann_columns]
                                      colnames(private$.feature_annotation_df) = c('gene_symbol', 
                                                                                   'mrna', 
                                                                                   'refseq', 
                                                                                   'ucscid', 
                                                                                   'description', 
                                                                                   'entrez', 
                                                                                   'chromosome', 
                                                                                   'start', 
                                                                                   'end')
                                      
                                      # Step 2 of 2 -- Extract expression data
                                      private$.data_df = private$.data_df[, 
                                                                          c(ftr_col, 
                                                                            colnames(private$.data_df)[!(colnames(private$.data_df) %in% 
                                                                                                           ftr_ann_columns)])]
                                      column_names = colnames(private$.data_df)
                                      
                                      # There is a "_0" suffix in all column names -- remove that
                                      cat("Local rule 1\n")
                                      cat("============\n")
                                      cat("\tThere is a \"_0\" suffix in all column names -- remove that\n")
                                      stopifnot(length(grep("_0$", column_names)) == (length(column_names) -1))
                                      colnames(private$.data_df) = gsub("_[0-9]$", "", column_names)
                                      
                                      cat("Local rule 2\n")
                                      cat("============\n")
                                      cat("\tColumn name for feature; renaming as tracking id\n")
                                      colnames(private$.data_df)[1] = 'tracking_id'

                                      super$convert_wide_to_tall_skinny()
                                      super$enforce_data_file_sample_name_column()
                                    }
                                  }
                                ))

##### DataReaderRNASeqGeneFormatA #####
# Reader that is being used to cover gene matrix files for
# - Salmon
# - HTSeq
# - Sailfish
DataReaderRNASeqGeneFormatA = R6::R6Class(classname = 'DataReaderRNASeqGeneFormatA',
                                    inherit = DataReaderExpressionMatrix,
                                    public = list(
                                      print_level = function() {cat("----(Level: DataReaderRNASeqGeneFormatA)\n")},
                                      load_data_from_file = function() {
                                        super$load_data_from_file()
                                        cat("load_data_from_file()"); self$print_level()
                                        
                                        super$transpose_matrix_if_required()
                                        
                                        data_file_sample_name_column_present = (
                                          'data_file_sample_name' %in% colnames(private$.pipeline_df)
                                        )
                                        super$enforce_data_file_sample_name_column()
                                        
                                        bios = search_biosamples(dataset_id = private$.measurement_set$dataset_id, 
                                                                 dataset_version = private$.measurement_set$dataset_version)
                                        
                                        sample_from_manifest = bios[grep("__RNA", bios$name), ]$original_sample_name
                                        if (!data_file_sample_name_column_present) { # if data_file_sample_name column is not present in Excel file
                                          cat("Automatically interpreting specific format of data by matching with biosample names\n")
                                          m1 = lapply(colnames(private$.data_df), 
                                                      function(colnm) {
                                                        x2 = sapply(1:length(sample_from_manifest), 
                                                                    function(pos) {
                                                                      subelem = sample_from_manifest[pos]; 
                                                                      xx = grep(subelem, colnm, fixed = TRUE); 
                                                                      ifelse(length(xx) == 0, NA, pos)})
                                                        x2[!is.na(x2)]
                                                      })
                                          m1_len = sapply(m1, function(elem) {length(elem)})
                                          if (!all(unique(m1_len) %in% c(0,1))) {
                                            stop("Expected columns to be either feature annotation or unique biosample names.
                                               Received data that has some non-unique biosample names:\n\t",
                                                 pretty_print(colnames(private$.data_df), prettify_after = 15))
                                          }
                                          sample_manifest_matched_pos = unlist(m1)
                                          potential_sample_col_pos = which(m1_len == 1)
                                          first_sample_col_pos = min(potential_sample_col_pos)
                                          stopifnot(first_sample_col_pos >= 2)
                                          
                                          potential_sample_cols = colnames(private$.data_df)[potential_sample_col_pos]
                                          potential_sample_names_manifest = sample_from_manifest[sample_manifest_matched_pos]
                                        } else { # if data_file_sample_name column was enforced
                                          cat("Excel sheet provided `data_file_sample_name`", 
                                              "will be used to match with name in sample manifest",
                                              "\n\tExact matching will be employed\n")
                                          
                                          m1 = find_matches_and_return_indices(
                                            source = colnames(private$.data_df),
                                            target = sample_from_manifest
                                          )
                                          
                                          potential_sample_cols = colnames(private$.data_df)[m1$source_matched_idx]
                                          potential_sample_names_manifest = potential_sample_cols # Since exact matching in this case
                                          first_sample_col_pos = min(which(colnames(private$.data_df) %in% potential_sample_cols))
                                        }
                                        
                                        ftr_ann_columns       = colnames(private$.data_df)[1:(first_sample_col_pos - 1)]
                                        # Match with library of feature annotation choices (most restrictive first)
                                        feature_library = private$.file_format_handler[[class(self)[1]]]$mapped_columns$feature_library
                                        if (all(c('GENE_ID', 'Location') %in% colnames(private$.data_df))) {
                                          cat("Manual handling of case where matrix annotation file has c('GENE_ID', 'Location') columns\n",
                                              "\tFor entries with duplicated GENE_ID, concatenating Location information...",
                                              "\n\tThe concatenated entries must have been registered previously by a one-time script\n")
                                          duplicated_loc = which(duplicated(private$.data_df$GENE_ID))
                                          private$.data_df$GENE_ID[duplicated_loc] = paste0(private$.data_df$GENE_ID[duplicated_loc], 
                                                                                            "; ", private$.data_df$Location[duplicated_loc])
                                          private$.data_df$Location = NULL
                                        }
                                        matches = which(
                                          sapply(names(feature_library), 
                                                 function(idx) {
                                                   all(feature_library[[idx]]$required_colnames_in_file %in% ftr_ann_columns)
                                                   })
                                          )
                                        if (length(matches) > 0) {
                                          matching_key = names(matches[1])
                                        } else {
                                          stop("Case not covered. Feature annotation columns:\n\t", 
                                               pretty_print(ftr_ann_columns))
                                        }
                                        ftr_ann_columns =         feature_library[[matching_key]]$required_colnames_in_file
                                        ftr_col =                 feature_library[[matching_key]]$feature_col
                                        ftr_ann_columns_replace = feature_library[[matching_key]]$replace_colnames_by
                                        # HTSeq file has feature annotation as well as expression data
                                        # Step 1 of 2 -- Extract annotation data
                                        stopifnot(length(unique(private$.data_df[, ftr_col])) == 
                                                    nrow(private$.data_df[, ftr_col]))
                                        if (length(ftr_ann_columns) == 1) {
                                          private$.feature_annotation_df = data.frame(ftr_ann_columns = 
                                                                                        private$.data_df[, ftr_ann_columns],
                                                                                      stringsAsFactors = FALSE)
                                        } else {
                                          private$.feature_annotation_df = private$.data_df[, ftr_ann_columns]
                                        }
                                        colnames(private$.feature_annotation_df) = ftr_ann_columns_replace
                                        
                                        # Step 2 of 2 -- Extract expression data
                                        private$.data_df = private$.data_df[, 
                                                            c(ftr_col, 
                                                              potential_sample_cols)]

                                        # column renames 
                                        cat("Local rule(s)\n")
                                        cat("============\n")
                                        cat("\tColumn name for feature; renaming as tracking id\n")
                                        cat("============\n")
                                        cat("\tRename the sample columns in the format that matched with the sample manifest\n")

                                        colnames(private$.data_df) = c('tracking_id', potential_sample_names_manifest)
                                        
                                        super$convert_wide_to_tall_skinny()
                                      }
                                      
                                    ))


##### DataReaderRNAQuantMicroarray #####
DataReaderRNAQuantMicroarray = R6::R6Class(classname = 'DataReaderRNAQuantMicroarray',
                                 inherit = DataReaderExpressionMatrix,
                                 public = list(
                                   print_level = function() {cat("----(Level: DataReaderRNAQuantMicroarray)\n")},
                                   load_data_from_file = function() {
                                     cat("load_data_from_file()"); self$print_level()
                                     file_path = unique(private$.pipeline_df$file_path)
                                     stopifnot(length(file_path) == 1)
                                     # restore bioconductor expressionSet object from file path
                                     cat(paste0("restore bioconductor expressionSet object from file path:\n\t", file_path, "\n"))
                                     exprSetObj = readRDS(file_path)
                                     
                                     cat("Extracting expression data\n")
                                     private$.data_df = as.data.frame(
                                       Biobase::exprs(exprSetObj) # to distinguish from plyr::exprs
                                       )
                                     private$.data_df = cbind(
                                       data.frame('probeset_id' = rownames(private$.data_df),
                                                  stringsAsFactors = FALSE),
                                       private$.data_df)
                                     rownames(private$.data_df) = 1:nrow(private$.data_df)
                                     
                                     super$convert_wide_to_tall_skinny()
                                     super$enforce_data_file_sample_name_column()
                                     
                                     cat("Dimensions:", dim(private$.data_df), "\n")
                                     
                                     cat("Extracting feature data\n")
                                     private$.feature_annotation_df = exprSetObj@featureData@data
                                     
                                     invisible(self)
                                     
                                   }
                                 ), 
                                 private = list(
                                 ))
##### DataReaderProteomicsMaxQuant #####
DataReaderProteomicsMaxQuant = R6::R6Class(
  classname = 'DataReaderProteomicsMaxQuant',
  inherit = DataReaderExpressionMatrix,
  public = list(
    print_level = function() {cat("----(Level: DataReaderProteomicsMaxQuant)\n")},
    load_data_from_file = function() {
      super$load_data_from_file()
      cat("load_data_from_file()"); self$print_level()
      
      private$.data_df = as.data.frame(t(private$.data_df))
      private$.data_df = cbind(
        data.frame(tracking_id = rownames(private$.data_df), 
                   stringsAsFactors = FALSE),
        private$.data_df)
      super$convert_wide_to_tall_skinny()
      super$enforce_data_file_sample_name_column()
      cat("Dimensions:", dim(private$.data_df), "\n")
    }
  ), 
  private = list()
)
                                     
                                     
##### DataReaderFusionTophat #####
DataReaderFusionTophat = R6::R6Class(classname = 'DataReaderFusionTophat',
                                     inherit = DataReaderAuto,
                                     public = list(
                                       print_level = function() {cat("----(Level: DataReaderFusionTophat)\n")},
                                       load_data_from_file = function() {
                                         cat("Tophat Fusion text files may or may not have header info. Reading without header and will adjust as required.")
                                         private$.header = FALSE
                                         super$load_data_from_file()
                                         if (private$.is_excel) {
                                           private$.data_df$biosample_name = private$.data_df$SeqID
                                           private$.data_df = plyr::rename(
                                             private$.data_df, 
                                             c('LeftGene' = 'gene_left',
                                               'LeftChrID' = 'chromosome_left',
                                               'LeftCoord' = 'start_left',
                                               'RightGene' = 'gene_right',
                                               'RightChrID' = 'chromosome_right',
                                               'RightCoord' = 'start_right',
                                               'Num.ofReadsSpanningFusion' = 'num_spanning_reads',
                                               'Num.ofSpanningMatePairs' = 'num_mate_pairs')
                                           )
                                           private$.data_df$num_mate_pairs_fusion = NA # https://janssenwiki.na.jnj.com:3000/inforMe-SciDB/scidb_loader_templates/issues/76
                                         } else {
                                           first_10_cols = c('biosample_name', 
                                                             'gene_left', 'chromosome_left', 'start_left',
                                                             'gene_right', 'chromosome_right', 'start_right',
                                                             'num_spanning_reads', 'num_mate_pairs', 
                                                             'num_mate_pairs_fusion')
                                           if (ncol(private$.data_df) == 10) {
                                             colnames(private$.data_df) = first_10_cols
                                           } else if (ncol(private$.data_df) == 11) {
                                             colnames(private$.data_df) = c(
                                               first_10_cols,
                                               'quality_score')
                                           } else {
                                             stop("Expect 10 or 11 columns here")
                                           }
                                         }
                                         private$.data_df$end_left = NA
                                         private$.data_df$end_right = NA
                                         
                                         super$enforce_data_file_sample_name_column()
                                       }
                                     ))

##### DataReaderFusionDeFuse #####
DataReaderFusionDeFuse = R6::R6Class(
  classname = 'DataReaderFusionDeFuse',
  inherit = DataReader,
  public = list(
   print_level = function() {cat("----(Level: DataReaderFusionDeFuse)\n")},
   load_data_from_file = function() {
     private$.header = TRUE
     super$load_data_from_file()
     # Translate columns from the spreadsheet into the normalized columns
     # expected by the fusion data array.
     # I don't think that these mappings are correct, but I tried other mappings from the same 
     # spreadsheet and was unable to use 'search_fusion' to query the data with features.  I think
     # that either some new features need to be registered or feature synonyms need to be created
     # because I found matching features in the FEATURE table.  I will need some direction as to
     # the design here because I want to time-box the amount of guess-and-check effort required to
     # complete this.
     private$.data_df$chromosome_left = private$.data_df$gene_chromosome1
     private$.data_df$gene_left = private$.data_df$gene_name1
     private$.data_df$start_left = NA
     private$.data_df$end_left = NA
     
     private$.data_df$chromosome_right = private$.data_df$gene_chromosome2
     private$.data_df$gene_right = private$.data_df$gene_name2
     private$.data_df$start_right = NA
     private$.data_df$end_right = NA
     
     private$.data_df$num_spanning_reads = NA
     private$.data_df$num_mate_pairs = NA
     private$.data_df$num_mate_pairs_fusion = NA
     
     if (nrow(private$.data_df) == 0) {
       private$.data_df$biosample_name = character()
     } else {
       if (length(unique(private$.pipeline_df$original_sample_name)) > 1) {
         stop("Expected per sample file for this reader")
       }
       if (!('biosample_name' %in% colnames(private$.data_df))) {
        private$.data_df$biosample_name = private$.pipeline_df$original_sample_name
        # No need to enforce `data_file_sample_name` in this case 
        # (since was just assigned to `original_sample_name)
       } else { # if `biosample_name` was present in original file
         super$enforce_data_file_sample_name_column()
       }
     }
   }
  ))

##### DataReaderFileLink #####
DataReaderFileLink = R6::R6Class(
  classname = 'DataReaderFileLink',
  inherit = DataReader,
  public = list(
    load_data_from_file = function() {
      private$.data_df = data.frame(
        biosample_name = private$.pipeline_df$original_sample_name, 
        stringsAsFactors = FALSE
      )
      # super$enforce_data_file_sample_name_column() # No need to enforce as just manually enforced above
      private$.feature_annotation_df = NULL
    }
  ))

##### DataReaderCopyNumberMatrix #####
DataReaderCopyNumberMatrix = R6::R6Class(
  classname = 'DataReaderCopyNumberMatrix',
  inherit = DataReaderExpressionMatrix,
  public = list(
    print_level = function() {cat("----(Level: DataReaderCopyNumberMatrix)\n")},
    load_data_from_file = function() {
      super$load_data_from_file()
      cat("load_data_from_file()"); self$print_level()
      
      colnames(private$.data_df)[1] = 'tracking_id'
      super$convert_wide_to_tall_skinny()
      super$enforce_data_file_sample_name_column()
      cat("Dimensions:", dim(private$.data_df), "\n")
    }
  ), 
  private = list()
)


##### DataReaderCyTOF #####
DataReaderCyTOF = R6::R6Class(
  classname = 'DataReaderCyTOF',
  inherit = DataReaderAuto,
  public = list(
    print_level = function() {cat("----(Level: DataReaderCyTOF)\n")},
    load_data_from_file = function() {
      cat("load_data_from_file()"); self$print_level()
      file_path = unique(private$.pipeline_df$file_path)
      stopifnot(length(file_path) == 1)
      cat(paste0("Reading *comma* delimited file:\n\t", file_path, "\n"))
      private$.data_df = read.delim(file = file_path, 
                 sep = ',', skip = 6, check.names = F, stringsAsFactors = F)
      
      # Skip columns marked 'Sample Type', 'Plate', 'Panel'
      private$.data_df = private$.data_df[,
                                          !(colnames(private$.data_df) %in% c('Sample Type', 'Plate', 'Panel')) ]
      
      private$.data_df = plyr::rename(private$.data_df, c('FCS Filename' = 'biosample_name'))
      super$enforce_data_file_sample_name_column()
      
      super$convert_wide_to_tall_skinny(keyname = 'scidb_feature_col')
      
      # Skip features that have % marked in them
      private$.data_df = private$.data_df[
        grep("%", private$.data_df$scidb_feature_col, fixed = T, invert = T), ]
      
      # Skip features that have 'Ungated' marked in them
      private$.data_df = private$.data_df[
        grep("Ungated", private$.data_df$scidb_feature_col, fixed = T, invert = T), ]

      # One occurence of the phrase " of " in each line
      verify_that_there_is_one_occurence_of__of__in_each_feature = function(vec) {
        count_number_of_expr_matches = function(vec, expr) {
          stringi::stri_count(vec, fixed = expr)
        }
        stopifnot(
          all(count_number_of_expr_matches(vec, expr = " of ") == 1)
        )
      }
      
      split_by_expr = function(vec, expr, side = c('left', 'right')) {
        side = match.arg(side)
        sapply(
          stringi::stri_split(str = vec, fixed = expr),
          function(elem) ifelse(side == 'left', elem[1], elem[2])
        )
      }
      
      
      verify_that_there_is_one_occurence_of__of__in_each_feature(private$.data_df$scidb_feature_col)
      cat("Dimensions:", dim(private$.data_df), "\n")
      
      # Retain stuff only the right of the phrase " of "
      private$.data_df$scidb_feature_col = split_by_expr(
        private$.data_df$scidb_feature_col, expr = " of ", side = 'right')
     

    }
  ), 
  private = list()
)

##### createDataReader #####
#' @export
createDataReader = function(pipeline_df, measurement_set, reader_config){
  # Special formulation for entries that need to be disambuiguated by filter_choices
  if (grepl("COPY|CNV|CyTOF", measurement_set$pipeline_scidb) |
      grepl("file link", measurement_set$filter_name) | 
      grepl("geneset", measurement_set$filter_name)) { 
    temp_string = paste0("{",measurement_set$pipeline_scidb, "}{", 
                         measurement_set$filter_name, "}")
  } else {
    temp_string = paste0("{",measurement_set$pipeline_scidb, "}{", 
                         measurement_set$quantification_level, "}")
  }
  
  switch_case_call = paste0(
    "switch(temp_string,",
    paste0(
      sapply (names(reader_config), function(name) {
        selectors = reader_config[[name]]
        text1 = paste0("'", selectors, "'", collapse = ' = , ')
        paste0(text1, 
               ' = ', name, 
               '$new(pipeline_df = pipeline_df,measurement_set = measurement_set)')
      }), 
      collapse = ', '),
    ',stop("Need to add reader for choice:\n", temp_string))'
  )
  readerObj = eval(parse(text = switch_case_call))
  return(readerObj)
}
Paradigm4/insight documentation built on April 5, 2020, 1:12 p.m.