# R/api2_load-api.R

#
# BEGIN_COPYRIGHT
#
# PARADIGM4 INC.
# This file is part of the Paradigm4 Enterprise SciDB distribution kit
# and may only be used with a valid Paradigm4 contract and in accord
# with the terms and conditions specified by that contract.
#
# Copyright (C) 2011 - 2018 Paradigm4 Inc.
# All Rights Reserved.
#
# END_COPYRIGHT
#
#####################################

# File structure is as follows
# - <load-api.R> is a collection of high-level functions called in a loader script
# - <load-helper.R, load-helper-filetypes.R> contain lower-level functions for 
#                   interpreting data in worksheets or specific file types 
#                   (e.g. Cufflinks RNASeq files, Gemine DNASeq files) before ingesting into SciDB
# - <template-helper.R> contains functions specifically for interpreting / parsing the Excel template sheet
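#
# A typical loader script (a sketch; object names such as `wb`, `ids`, `rec`,
# `def` and `my_permissions_config` are illustrative, not part of the API)
# calls the high-level functions roughly in this order:
#
#   wb  = XLConnect::loadWorkbook('loader_template.xlsx')
#   ids = api_register_project_datasets(workbook = wb,
#                                       permissions_config = my_permissions_config)
#   def = myExcelReader(wb, sheet_name = 'Definitions')
#   api_register_ontology_from_definition_sheet(workbook = wb, def = def)
#   for (idx in seq_len(nrow(ids))) {
#     rec = ids[idx, ]
#     api_register_definitions(df_definitions = def, record = rec)
#     api_register_individuals(workbook = wb, record = rec, def = def)
#     api_register_biosamples(workbook = wb, record = rec, def = def)
#     api_register_featuresets_experimentsets_measurementsets(
#       workbook = wb, record = rec, def = def)
#   }
#   # (api_register_measurements() is then called once per pipeline)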

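#' Register projects and datasets from the 'Studies' sheet
#'
#' Works on one project at a time: registers the project, then its dataset
#' (study), optionally updates the entity, and (when the
#' `revealgenomics.use_scidb_ee` option is set) grants permissions to the core
#' team and previously permitted users. Returns a data.frame of `project_id`,
#' `dataset_id`, `dataset_version`.
#' @examples
#' \dontrun{
#' # A sketch; the file name and permissions config are illustrative
#' ids = api_register_project_datasets(
#'   workbook_path = 'loader_template.xlsx',
#'   permissions_config = my_permissions_config)
#' }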
#' @export
api_register_project_datasets = function(workbook_path = NULL, workbook = NULL, only_test = FALSE, 
                                         entity_to_update = NULL, 
                                         permissions_config, 
                                         con = NULL) {
  con = use_ghEnv_if_null(con)
  if (is.null(workbook_path) && is.null(workbook)) {
    stop("must supply path to workbook, or workbook opened by XLConnect::loadWorkbook")
  }
  if (is.null(workbook) && !is.null(workbook_path)) {
    workbook = loadWorkbook(workbook_path)
  }
  df0 = myExcelReader(workbook, sheet_name = 'Studies')

  proj_study_ids_summ = data.frame(project_id = numeric(0),
                                   dataset_id = numeric(0),
                                   dataset_version = numeric(0))
  # Work on one project at a time
  for (proj_idx in unique(df0$project_id)) {
    cat("====Working on project idx", proj_idx, "of worksheet\n")
    dfi = df0[df0$project_id == proj_idx, ]

    # --------------------------    
    # PROJECT
    # Register the project first
    wksht_fields_proj = c('project_name', 'project_description')
    dfi_proj = dfi[, wksht_fields_proj]
    dfi_proj = dfi_proj[!duplicated(dfi_proj), ]
    
    if (nrow(dfi_proj) != 1) {
      stop("Expected exactly one project row here. ",
           "Project name and description should be consistent across rows of the Studies sheet")
    }
    
    dfi_proj = load_helper_column_rename(dfx = dfi_proj,
                                 revealgenomics_fields = mandatory_fields()[[.ghEnv$meta$arrProject]],
                                 worksheet_fields = wksht_fields_proj
                                )
    
    project_id = register_project(df = dfi_proj, 
                                  only_test = only_test, 
                                  con = con)
    
    # IMPORTANT: remove the columns that will no longer be needed
    dfi[, wksht_fields_proj] = NULL
    
    # --------------------------    
    # DATASET  
    dfi$project_id = project_id
    revealgenomics_fields = mandatory_fields()[[.ghEnv$meta$arrDataset]]
    wksht_fields_study = c('study_name', 'study_description', 'project_id', 'is_study_public')
    stopifnot(all(c(wksht_fields_study, 'study_version') %in% colnames(df0)))
    
    dfi_st = load_helper_column_rename(dfx = dfi, 
                               revealgenomics_fields = mandatory_fields()[[.ghEnv$meta$arrDataset]], 
                               worksheet_fields = wksht_fields_study)
    
    # Replicate the column `is_study_public`: studies loaded via the Excel sheet
    # carry both `public` (required by the API) and `is_study_public` (required
    # by the Excel sheet)
    dfi_st$is_study_public = dfi_st$public
    
    # IMPORTANT: remove the columns that will no longer be needed
    dfi_st$study_id = NULL
    
    if (length(unique(dfi_st$public)) != 1) {
      stop("All rows of a study must agree on whether the study is public")
    }
    
    dataset_version = unique(dfi_st$study_version)
    if (length(dataset_version) != 1) {
      stop("Ingesting multiple study_version-s in one call is not supported yet")
    }
    # IMPORTANT: remove the columns that will no longer be needed
    dfi_st$study_version = NULL
    dataset_record = register_dataset(df = dfi_st, public = unique(dfi_st$public), 
                                      dataset_version = dataset_version,
                                      only_test = only_test, 
                                      con = con)
    # Update entity if suggested by user
    update_entity_via_excel_loader(
      data_df = dfi_st, 
      data_df_record = dataset_record, 
      data_df_entity = .ghEnv$meta$arrDataset, 
      entity_to_update = entity_to_update,
      con = con)
    
    if (as.logical(getOption("revealgenomics.use_scidb_ee"))) { # run auto permission grants on SciDB EE
      message("#### Automatically granting permissions to:\n", 
              "\t1. core team (approved users, privileged_service_accounts accounts, project owner(s))\n", 
              "\t2. people with prior permission to study with identical project + dataset (study) name")
      
      # Extract project owner user(s) from the Excel sheet entry
      project_owner_email_col = grep("project_owner_email", colnames(dfi), value = TRUE)
      if (length(project_owner_email_col) != 1) {
        stop("Expected to find one column matching with text: project_owner_email", 
             ". Found: ", length(project_owner_email_col))
      }
      if (length(unique(dfi[, project_owner_email_col])) != 1) {
        stop("Need to account for case where Excel sheet has entries with differing project owner emails")
      }
      project_owner_email_entry = unique(dfi[, project_owner_email_col])
      # An entry may list multiple owner emails separated by ';' or ','
      if (grepl("[;,]", project_owner_email_entry)) {
        project_owner_email_entry = gsub(" ", "", 
                                         unlist(stringi::stri_split(str = project_owner_email_entry, 
                                                                    regex = "[;,]")))
      }
      # Project owner usernames are the local part of the email (text before '@')
      project_owners = unique(
        sapply(
          stringi::stri_split(str = project_owner_email_entry, 
                              fixed = '@'), function(elem) elem[1]))
      
      core_team = c(
        permissions_config$privileged_users,
        permissions_config$privileged_service_accounts,
        project_owners
      )
      
      assign_permission_to_studies_for_group_of_users = function(study_names, userlist) {
        for (usernm in userlist) {
          tryCatch({
            set_permissions(con = con, user_name = usernm, 
                            dataset_names = study_names, 
                            allowed = TRUE)
          }, error = function(e) {
            if (grepl("(db:)o", e$message, fixed = T)) {
              simple_msg = "***Failed to assign permissions. Need to login as operator***"
            } else {
              simple_msg = e$message
            }
            message("Failed to grant permissions for user: ", usernm, 
                    ".\n\tError: ", simple_msg)
          })
        } # end of for user in userlist
      } # end of assign_permissions function
      
      if (length(core_team) > 0) {
        message("-------------------------\nAssigning permissions to ", 
                ifelse(length(dfi_st$name) == 1, "study: ", "studies: "), 
                paste0(dfi_st$name, collapse = ", "),
                "\n\tfor core team: ", 
                paste0(core_team, collapse = ", "))
        assign_permission_to_studies_for_group_of_users(study_names = dfi_st$name,
                                                        userlist = core_team)
      }
      
      ##### Previously permitted users ####
      for (study_name in dfi_st$name) {
        prev_permitted_users = search_dataset_permissions_archive(
          dataset_name = study_name,
          project_name = dfi_proj$name, 
          con = con
        )$user_name
        if (length(prev_permitted_users) > 0) {
          prev_permitted_users = prev_permitted_users[!prev_permitted_users %in% core_team]
        }
        if (length(prev_permitted_users) > 0) {
          message("-------------------------\nRestoring dataset permissions for previously permitted users:\n ", 
                  paste0(prev_permitted_users, collapse = ", "), 
                  " -- at : \n", 
                  "project_name: ", dfi_proj$name, " / dataset_name: ", study_name)
          assign_permission_to_studies_for_group_of_users(study_names = study_name,
                                                          userlist = prev_permitted_users)
        }
        message("Deleting dataset permissions archive at: \n", 
                "project_name: ", dfi_proj$name, " / dataset_name: ", study_name)
        iquery(con$db, 
               paste0("delete(", full_arrayname(.ghEnv$meta$arrDatasetPermissionsArchive), 
                      ", project_name = '", dfi_proj$name, 
                      "' AND dataset_name = '", study_name, "')"))
      }
    }

    proj_study_ids = dataset_record
    proj_study_ids$project_id = project_id
    proj_study_ids = proj_study_ids[, 
                                    c('project_id', 'dataset_id', 'dataset_version')]
    
    proj_study_ids_summ = rbind(proj_study_ids_summ, proj_study_ids)
    cat("----\n")
  } # <end of> for (proj_idx in unique(df0$project_id))

  proj_study_ids_summ
}

#' Register definitions for a dataset
#' 
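#' @examples
#' \dontrun{
#' # Sketch: `rec` is one row of the data.frame returned by
#' # api_register_project_datasets(); `def` is the Definitions sheet as read
#' # by myExcelReader()
#' api_register_definitions(df_definitions = def, record = rec, con = con)
#' }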
#' @export
api_register_definitions = function(df_definitions, record, con = NULL) {
  stopifnot(nrow(record) == 1)
  
  df_definitions$dataset_id = record$dataset_id
  invisible(register_definitions(df = as.data.frame(df_definitions), 
                                 con = con))
}

#' register individuals
#' 
#' @param workbook workbook object returned by XLConnect::loadWorkbook
#' @param record record of SciDB project_id, dataset_id and dataset_version at which to register individuals
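#' @examples
#' \dontrun{
#' # Sketch: `wb`, `rec`, `def` as in the loader-script sketch at the top of
#' # this file
#' api_register_individuals(workbook = wb, record = rec, def = def, con = con)
#' }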
#' @export
api_register_individuals = function(workbook, record, def, 
                                    entity_to_update = NULL, con = NULL) {
  stopifnot(nrow(record) == 1)
  
  data_df = load_helper_prepare_dataframe(workbook = workbook,
                                         record = record, 
                                         def = def, 
                                         sheetName = 'Subjects',
                                         entityName = .ghEnv$meta$arrIndividuals,
                                         worksheet_fields = 
                                           c('dataset_id', 'subject_id', 'description'),
                                         con = con)
  
  
  data_df_record = register_individual(df = data_df, dataset_version = record$dataset_version, 
                                       con = con)
  
  # Update entity if suggested by user
  update_entity_via_excel_loader(
    data_df = data_df, 
    data_df_record = data_df_record, 
    data_df_entity = .ghEnv$meta$arrIndividuals, 
    entity_to_update = entity_to_update,
    con = con
  )
}

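# Internal helper: when `entity_to_update` matches the entity just registered
# (or equals 'all'), read the freshly registered rows back from the DB, verify
# that the unique fields are unchanged, and push the combined dataframe through
# update_entity()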
update_entity_via_excel_loader = function(
  data_df, data_df_record, data_df_entity, entity_to_update = NULL, con = NULL) {
  if (!is.null(entity_to_update)) {
    if (entity_to_update == data_df_entity || 
        entity_to_update == 'all') {
      dataset_version = unique(data_df_record$dataset_version)
      if (length(dataset_version) != 1) {
        stop("This code currently handles only one dataset version at a time")
      }
      idname = get_base_idname(data_df_entity)
      stopifnot(length(idname) == 1)
      data_df_db = get_entity(
        entity = data_df_entity, 
        id = data_df_record[, idname],
        dataset_version = dataset_version, 
        all_versions = FALSE, 
        con = con)
      data_df_db = data_df_db[
        match(data_df_record[, idname], 
              data_df_db[, idname]), ]
      # Unique fields must match; otherwise a new entry would have been registered
      # (new IDs are assigned automatically by the `register_ENTITY` class of functions)
      stopifnot(
        all_equal(
          data_df[, unique_fields()[[data_df_entity]]], 
          data_df_db[, unique_fields()[[data_df_entity]]]
        )
      )
      if (all(c('created', 'updated') %in% 
              names(.ghEnv$meta$L$array[[data_df_entity]]$attributes))) {
        dfx = cbind(data_df, data_df_record, data_df_db[, c('created', 'updated')])
      } else {
        dfx = cbind(data_df, data_df_record)
      }
      update_entity(entity = data_df_entity, 
                    df = dfx, 
                    con = con)
    }
  }
}

#' Register biosamples (Samples sheet)
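#' @examples
#' \dontrun{
#' # Sketch: `wb`, `rec`, `def` as in the loader-script sketch at the top of
#' # this file
#' api_register_biosamples(workbook = wb, record = rec, def = def, con = con)
#' }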
#' @export
api_register_biosamples = function(workbook, record, def, 
                                   entity_to_update = NULL, 
                                   con = NULL) {
  stopifnot(nrow(record) == 1)
  
  data_df = load_helper_prepare_dataframe(workbook = workbook,
                                         record = record, 
                                         def = def, 
                                         sheetName = 'Samples',
                                         entityName = .ghEnv$meta$arrBiosample,
                                         worksheet_fields = 
                                           c('dataset_id', 'sample_name', 
                                             'description', 'individual_id'),
                                         con = con)
  
  data_df_record = register_biosample(df = data_df, 
                                      dataset_version = record$dataset_version, 
                                      con = con)
  
  # Update entity if suggested by user
  update_entity_via_excel_loader(
    data_df = data_df, 
    data_df_record = data_df_record, 
    data_df_entity = .ghEnv$meta$arrBiosample, 
    entity_to_update = entity_to_update,
    con = con)
}

#' Register FeatureSets, ExperimentSets and MeasurementSets
#' 
#' This function differs from \code{api_register_individuals}, \code{api_register_biosamples}
#' and \code{api_register_measurements}, which work row by row on the
#' Subjects/Samples/Pipelines sheets respectively.
#' This function finds the unique set of Experiments from the unique values of the `concat`
#' column in the Pipelines sheet, and then uses info in the `pipeline_choices` sheet to fill in
#' the other necessary information
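#' @examples
#' \dontrun{
#' # Sketch: `wb`, `rec`, `def` as in the loader-script sketch at the top of
#' # this file
#' records = api_register_featuresets_experimentsets_measurementsets(
#'   workbook = wb, record = rec, def = def, con = con)
#' records$ExperimentSetRecord
#' records$MeasurementSetRecord
#' }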
#' @export
api_register_featuresets_experimentsets_measurementsets = function(
  workbook, record, def, 
  entity_to_update = NULL, 
  con = NULL) {
  stopifnot(nrow(record) == 1)
  
  # Create choices objects from metadata sheet
  choicesObj = list(
    pipelineChoicesObj = PipelineChoices$new(
      pipeline_choices_df = myExcelReader(workbook = workbook, 
                                          sheet_name = template_linker$pipeline$choices_sheet)),
    filterChoicesObj = FilterChoices$new(
      filter_choices_df = myExcelReader(workbook = workbook, 
                                        sheet_name = template_linker$filter$choices_sheet)),
    featuresetChoicesObj = FeaturesetChoices$new(
      featureset_choices_df = myExcelReader(workbook = workbook, 
                                            sheet_name = template_linker$featureset$choices_sheet))
  )
  
  pipelines_df = template_helper_extract_record_related_rows(workbook = workbook,
                                                             sheetName = 'Pipelines', 
                                                             record = record)

  ######################################  
  # EXTRACT ALL RELEVANT INFORMATION
  msmtset_df = template_helper_extract_pipeline_meta_info(pipelines_df = pipelines_df, 
                                                          choicesObj = choicesObj,
                                                          record = record)
  if (nrow(msmtset_df) == 0) {
    cat("No ExperimentSets that match pipeline_choices description\n")
    return(NULL)
  }

  # ====================================
  # some parsing on the data
  # Extract relevant definitions 
  defi = rbind(template_helper_extract_definitions(sheetName = 'pipeline_choices', def = def), 
               template_helper_extract_definitions(sheetName = 'filter_choices', def = def),
               template_helper_extract_definitions(sheetName = 'featureset_choices', def = def))
  defi = defi[!(defi$attribute_name %in% c('filter_id')), ]
  
  # # Enforce that columns in data are defined in Definitions sheet
  # cat("Suppressing this check as dataset_id is present in data, but
  #      not in Definitions sheet\n")
  # try({template_helper_enforce_columns_defined(data_df = msmtset_df, 
  #                                              definitions = defi)})
  
  
  # Enforce that mandatory columns listed in the Definitions sheet are present in the data
  cat("Running mandatory-column check within try(): failures are reported but do not abort (unique case here)\n")
  try({template_helper_enforce_mandatory_columns_present(data_df = msmtset_df,
                                                         definitions = defi)})
  
  
  ######################################  
  # EXPERIMENTSET
  # Formulate ExperimentSet
  columns_experimentSet = c('dataset_id', 
                            'data_subtype')
  expset_df = unique(msmtset_df[, columns_experimentSet])
  expset_df = plyr::rename(expset_df, c('data_subtype' = 'name'))
  expset_df$description = paste0(expset_df$name, " experiments")
  expset_df$molecule = '...'
  expset_df$experiment_type_API = template_helper_assign_experiment_entity(expset_df$name)
  
  cat("==== Registering EXPERIMENTSET =====\n")
  expset_record = register_experimentset(df = expset_df, dataset_version = record$dataset_version, 
                                         con = con)
  
  # Update entity if suggested by user
  update_entity_via_excel_loader(
    data_df = expset_df, 
    data_df_record = expset_record, 
    data_df_entity = .ghEnv$meta$arrExperimentSet, 
    entity_to_update = entity_to_update,
    con = con)
  ######################################  
  # FEATURESET
  # Formulate FeatureSet
  columns_featureSet = c('featureset_name', 
                         'featureset_scidb',
                         'featureset_source', 
                         'featureset_source_version',
                         'featureset_species')
  ftrset_df = unique(msmtset_df[, columns_featureSet])
  
  new_colnames_featureset = c('featureset_name', 'name', 'source', 'source_version', 'species')
  names(new_colnames_featureset) = columns_featureSet
  ftrset_df = plyr::rename(ftrset_df, new_colnames_featureset)
  ftrset_df$description = "..."
  ftrset_df$source_uri = "..."
  
  # Pass `con` explicitly, consistent with the other getters in this function
  refSets = get_referenceset(con = con)
  refset37_id = refSets[grep("grch37", refSets$name, ignore.case = TRUE), ]$referenceset_id
  refset38_id = refSets[grep("grch38", refSets$name, ignore.case = TRUE), ]$referenceset_id
  if (length(refset37_id) != 1 ||
      length(refset38_id) != 1) {
    stop("Expected exactly one referenceset each for grch37 and grch38. ", 
         "Found: (grch37) -- ", length(refset37_id), 
         "; (grch38) -- ", length(refset38_id))
  }
  
  # Assign reference genome by substring match on featureset source
  # ("37" => GRCh37, "38" => GRCh38); -1 is the sentinel for "not assigned"
  ftrset_df$referenceset_id = -1
  if (length(grep("37", ftrset_df$source)) > 0) {
    ftrset_df[grep("37", ftrset_df$source), ]$referenceset_id = refset37_id
  }
  if (length(grep("38", ftrset_df$source)) > 0) {
    ftrset_df[grep("38", ftrset_df$source), ]$referenceset_id = refset38_id
  }
  cat("==== Registering FEATURESET ====\n")
  ftrset_record = register_featureset(df = ftrset_df, 
                                      con = con)
  
  # ====================================
  # MEASUREMENTSET
  
  # experimentset_id
  expset_df = get_experimentset(experimentset_id = expset_record$experimentset_id, 
                                dataset_version = record$dataset_version,
                                con = con)
  expset_df = plyr::rename(expset_df, c('name' = 'experiment_name'))
  columns_experimentSet2 = c('dataset_id', 
                             'experiment_name')
  
  msmtset_df = merge(msmtset_df, 
                     expset_df[, c(get_base_idname(.ghEnv$meta$arrExperimentSet),
                                   columns_experimentSet2)],
                     by.x = columns_experimentSet,
                     by.y = columns_experimentSet2)
  
  # featureset_id
  fsets = get_featuresets(con = con)
  matchL = find_matches_and_return_indices(msmtset_df$featureset_name, 
                                           fsets[, template_linker$featureset$choices_col])
  if (length(matchL$source_unmatched_idx) > 0) {
    cat("The following pipelines do not have featuresets defined yet:\n")
    print(msmtset_df[matchL$source_unmatched_idx, 1:5])
    stop("Should not have occurred: all pipelines must have a registered featureset at this point")
  }
  msmtset_df$featureset_id = fsets$featureset_id[matchL$target_matched_idx]
  
  # Rename columns from external custom fields to revealgenomics fields
  msmtset_df = plyr::rename(msmtset_df, 
                            c('measurement_entity' = 'entity'))
  msmtset_df$name = paste0(msmtset_df$pipeline_source_title, ": ", msmtset_df$filter_name)
  msmtset_df = drop_na_columns(msmtset_df)
  
  # description
  if ('pipeline_source_title' %in% colnames(msmtset_df)) {
    msmtset_df$description = msmtset_df$pipeline_source_title
  } else {
    msmtset_df$description = '...'
  }
  
  cat("==== Registering MEASUREMENTSET ====\n")
  msmtset_record = register_measurementset(df = msmtset_df, 
                                           dataset_version = record$dataset_version, 
                                           con = con)
  # Update entity if suggested by user
  update_entity_via_excel_loader(
    data_df = msmtset_df, 
    data_df_record = msmtset_record, 
    data_df_entity = .ghEnv$meta$arrMeasurementSet, 
    entity_to_update = entity_to_update,
    con = con)
  return(list(ExperimentSetRecord = expset_record,
              MeasurementSetRecord = msmtset_record))
}

#' Register measurements
#' 
#' Excel sheet equivalent of \code{\link{populate_measurements}}
#' 
#' @param pipeline_df subselection of the Pipelines sheet corresponding to the current pipeline
#' @param bios_df_ref dataframe containing the API entry for biosample data at the current \code{dataset_id} (retrieved from reference object)
#' @param msmtset_df_ref dataframe containing the API entry for the current measurementset (retrieved from reference object)
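#' @examples
#' \dontrun{
#' # Sketch (hypothetical objects): `pipeline_rows` is the subselection of the
#' # Pipelines sheet for one pipeline; the `_ref` dataframes come from the
#' # reference object described above
#' api_register_measurements(pipeline_df = pipeline_rows,
#'                           bios_df_ref = biosamples_at_dataset,
#'                           msmtset_df_ref = current_msmtset)
#' }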
api_register_measurements = function(
  pipeline_df, 
  bios_df_ref,
  msmtset_df_ref
) {
  biosample_names = pipeline_df$sample_name
  
  m1 = find_matches_and_return_indices(
    biosample_names, 
    bios_df_ref$name
  )
  if (length(m1$source_unmatched_idx) != 0) {
    # stop("Excel file must provide direct match between `sample_name` column in `Pipelines` sheet, 
    #      and `sample_name` column of `Samples` sheet")
    cat("Dropping", length(m1$source_unmatched_idx), "entries:",
        pretty_print(biosample_names[m1$source_unmatched_idx]),
        "as corresponding sample information was not provided for these rows in the `Samples` sheet\n")
  }
  msmt_df = pipeline_df[m1$source_matched_idx, ]
  # Formulate measurement dataframes with uniquely matched sample id-s
  # Reasons for non-uniqueness: 
  # - Some entries might have more than one row with the same filter + pipeline combination
  # - CNV data generated by the Personalis panel has two files associated with the same
  #   filter + pipeline combination:
  #   `*somatic_dna_gene_cna_report_DEL.tsv` and `*somatic_dna_gene_cna_report_AMP.tsv`

  # Drop unnecessary columns
  for (column in c('project_id', 'study_id', 'study_version',  # tracked by `dataset_id`, `dataset_version`
                   'sample_name', 'original_sample_name',      # tracked by `biosample_id`
                   'pipeline_choice', 'filter_choice', 
                   'featureset_name', 'featureset_scidb'       # tracked by `featureset_id`
                   )) {
    msmt_df[, column] = NULL
  }
  
  # Introduce essential columns
  msmt_df$dataset_id          = msmtset_df_ref$dataset_id
  msmt_df$experimentset_id    = msmtset_df_ref$experimentset_id
  msmt_df$measurementset_id   = msmtset_df_ref$measurementset_id
  msmt_df$measurement_entity  = msmtset_df_ref$entity
  msmt_df$measurementset_name = msmtset_df_ref$name
  msmt_df$biosample_id        = bios_df_ref[m1$target_matched_idx, ]$biosample_id
  cat("use S3 path instead of local path while storing file path in DB\n")
  msmt_df$file_path = template_helper_formulate_file_path(pipeline_df = msmt_df, local_path = FALSE)
  register_measurement(df = msmt_df)
}

#' Automatically register ontology terms
#' 
#' Gather the terms from the controlled_vocabulary column, 
#' categorize by attribute_name, and
#' assign them ontology_id-s
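#' @examples
#' \dontrun{
#' # Sketch: `wb` is a loader workbook opened with XLConnect::loadWorkbook()
#' api_register_ontology_from_definition_sheet(workbook = wb, con = con)
#' }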
#' @export
api_register_ontology_from_definition_sheet = function(workbook = NULL, 
                                                       def = NULL,
                                                       con = NULL) {
  con = use_ghEnv_if_null(con = con)
  
  if (is.null(workbook) && is.null(def)) {
    stop("must supply at least the master workbook or the Definitions worksheet")
  }
  
  if (is.null(def)) def = myExcelReader(workbook = workbook, 
                                        sheet_name = 'Definitions')
  
  defx = def[!is.na(def$controlled_vocabulary), ]
  defx = defx[, c('attribute_name', 'controlled_vocabulary')]
  
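  # Each controlled_vocabulary cell holds '//'-separated terms; e.g. a
  # (hypothetical) cell "Male // Female" under attribute_name 'sex' yields
  # terms c("Male", "Female", "NA") in category 'sex' ("NA" is always appended)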
  L1 = lapply(1:nrow(defx),
              FUN = function(idx) {
                term_list = defx$controlled_vocabulary[idx]
                vec = trimws(
                  unlist(stringi::stri_split(str = term_list, 
                                             regex = "//")), 
                  which = 'both')
                data.frame(
                  category = defx$attribute_name[idx],
                  term = c(vec, "NA"), stringsAsFactors = FALSE)
              })
  
  ont_df = do.call(what = "rbind", 
                   args = L1)
  
  ont_df$source_name = "..."
  ont_df$source_version = "..."
  
  ont_NA = data.frame(term = "NA", 
                      category = 'uncategorized_ontology_terms',
                      source_name = "...", source_version = "...", 
                      stringsAsFactors = FALSE)
  ont_df = rbind(ont_df, ont_NA)
  register_ontology_term(df = ont_df, con = con)
}