R/FileQueries.R.R

Defines functions cavatica_download_link_to_filenames filepaths_fix cavatica_file_id_from_files cavatica_file_from_filepath cavatica_file_from_file_id recursive_file_list get_files_from_filelist_by_name cavatica_file_path_from_file get_path_from_splitpath_with_explicit_parent_folder get_path_from_splitpath cavatica_file_id_from_filepath cavatica_file_from_filepath2

Documented in cavatica_download_link_to_filenames cavatica_file_from_file_id cavatica_file_from_filepath cavatica_file_from_filepath2 cavatica_file_id_from_filepath cavatica_file_id_from_files cavatica_file_path_from_file get_files_from_filelist_by_name get_path_from_splitpath get_path_from_splitpath_with_explicit_parent_folder

# list_files_recursively <- function(folder){
# }


#' File and FileList Operations
#'
#' Resolves each filepath independently (no caching of shared parent folders)
#' by normalising the path, splitting it on "/" and walking the pieces with
#' \code{get_path_from_splitpath}.
#'
#' @param project a cavatica project object
#' @param filepaths the filepaths to files you want retrieve (character)
#'
#' @return A Files object (single filepath) or a list of Files Objects (one per filepath). Entries are NA where a path could not be found.
cavatica_file_from_filepath2 <- function(project, filepaths){
  assertthat::assert_that(is.character(filepaths))

  # Collapse runs of slashes BEFORE trimming the ends, otherwise an input such
  # as "a/b//" would keep a trailing slash after normalisation.
  filepaths <- gsub(pattern = "(\\/)+", replacement = "/", x = filepaths)
  filepaths <- sub(pattern = "^\\/", replacement = "", x = filepaths)
  filepaths <- sub(pattern = "\\/$", replacement = "", x = filepaths)

  # Split path into each element
  splitpath <- strsplit(x = filepaths, split = "/", fixed = TRUE)

  # lapply (not sapply) so the return type never depends on the results, and
  # file_object_only = TRUE so each entry is the file object itself rather
  # than the list(FileObject, Record) pair.
  files <- lapply(splitpath, FUN = function(path){
    get_path_from_splitpath(project = project, splitpath = path, file_object_only = TRUE)
  })

  if(length(files) == 1) { files <- files[[1]]}
  return(files)
}

#' Cavatica File ID From Filepath
#'
#' returns file_id from filepaths
#'
#' @inheritParams  cavatica_file_from_filepath
#'
#' @return character vector of file ids, one per filepath (NA where no file object was found)
#' @export
#'
cavatica_file_id_from_filepath <- function(project, filepaths){
  files <- cavatica_file_from_filepath(project, filepaths)

  # cavatica_file_from_filepath returns a list column even for a single
  # filepath. Only wrap when we were handed a bare (non-list) object;
  # wrapping a length-1 list would hide the file object one level down and
  # make the isS4() check below fail.
  if (!is.list(files)) files <- list(files)

  vapply(
      X = files,
      FUN.VALUE = "",
      FUN = function(x){
        # Non-S4 entries (e.g. NULL for a path that was not found) have no id
        if(!isS4(x)) return(NA_character_)
        return(x[["id"]])
      })
}

#' File and FileList Operations
#'
#' Walks a split filepath from the top of a project: the first element is
#' looked up with \code{project$file(name = ..., exact = TRUE)}, every later
#' element is searched for inside the contents of the previous one.
#'
#' @inheritParams standard_description_function
#' @param splitpath a single vector where each element in a folder/file in a filepath
#' @param file_object_only return only the file object - or a list where element 1 is file object and element 2 is a dataframe of all cached filepaths stepped through on the way to finding the file (boolean)
#' @return File at the end of the filepath (sevenbridges Files class object) OR NA if file doesnt exist. When \code{file_object_only = FALSE} the file is returned inside a list with elements "FileObject" and "Record".
#'
get_path_from_splitpath <- function(project, splitpath, file_object_only = FALSE){
  current_folder <- project$file(name = splitpath[1], exact = TRUE, complete = TRUE)

  if (length(current_folder) == 0) {
    message("Could not find: ", splitpath[1])
    return(NA)
  }

  # Every path visited so far and the object found at that path
  path_record <- c(splitpath[1])
  folder_record <- list(current_folder)

  # Shapes the return value; reads path_record/folder_record at call time
  build_result <- function(file_object){
    if (file_object_only) return(file_object)
    list(
      "FileObject" = file_object,
      "Record" = dplyr::tibble(path = path_record, file = folder_record)
    )
  }

  # Scalar condition, so use the short-circuiting `||`
  if (current_folder$type == "file" || length(splitpath) == 1){
    return(build_result(current_folder))
  }

  # Step through the remaining path elements (everything after the first)
  for (i in seq_along(splitpath)[-1]){
    current_folder_contents <- current_folder$list_folder_contents(complete = TRUE)

    next_file_or_folder <- get_files_from_filelist_by_name(
      filelist = current_folder_contents,
      filenames = splitpath[i]
    )

    # The lookup returns NA (not an S4 object) when the name is absent
    if (!isS4(next_file_or_folder) && is.na(next_file_or_folder)){
      message(paste0("Could not find the file: ", paste0(splitpath, collapse = "/")))
      return(NA)
    }

    current_folder <- next_file_or_folder
    path_record <- c(path_record, paste0(splitpath[seq_len(i)], collapse = "/"))
    folder_record[[length(folder_record) + 1]] <- current_folder

    # A file cannot be descended into, so stop here even if path elements remain
    if (current_folder$type == "file") {
      return(build_result(current_folder))
    }
  }

  # Reached only when the last element visited was a folder
  build_result(current_folder)
}

#' File and FileList Operations
#'
#' Same walk as \code{get_path_from_splitpath}, but starting from an already
#' resolved folder instead of the top of the project. Recorded paths are
#' prefixed with the path of \code{parent_folder}.
#'
#' @inheritParams standard_description_function
#' @param splitpath a single vector where each element in a folder/file in a filepath
#' @param parent_folder folder to act as 'root' folder from which splitpath is relative to (Sevenbridges File Object)
#' @param file_object_only return only the file object - or a list where element 1 is file object and element 2 is a dataframe of all cached filepaths stepped through on the way to finding the file (boolean)
#' @return File at the end of the filepath (sevenbridges Files class object) OR NA if file doesnt exist. When \code{file_object_only = FALSE} the file is returned inside a list with elements "FileObject" and "Record".
#'
get_path_from_splitpath_with_explicit_parent_folder <- function(project, splitpath, parent_folder, file_object_only = FALSE){

  current_folder <- parent_folder

  if (length(current_folder) == 0) {
    message("Could not find: ", splitpath[1])
    return(NA)
  }

  path_prefix <- cavatica_file_path_from_file(project = project, file = parent_folder)

  # Every path visited so far and the object found at that path
  path_record <- paste0(path_prefix)
  folder_record <- list(current_folder)

  # Shapes the return value; reads path_record/folder_record at call time
  build_result <- function(file_object){
    if (file_object_only) return(file_object)
    list(
      "FileObject" = file_object,
      "Record" = dplyr::tibble(path = path_record, file = folder_record)
    )
  }

  # Nothing to descend into if the starting point is itself a file
  if (current_folder$type == "file"){
    return(build_result(current_folder))
  }

  for (i in seq_along(splitpath)){
    current_folder_contents <- current_folder$list_folder_contents(complete = TRUE)

    next_file_or_folder <- get_files_from_filelist_by_name(
      filelist = current_folder_contents,
      filenames = splitpath[i]
    )

    # The lookup returns NA (not an S4 object) when the name is absent
    if (!isS4(next_file_or_folder) && is.na(next_file_or_folder)){
      message(paste0("Could not find the file: ", paste0(splitpath, collapse = "/")))
      return(NA)
    }

    current_folder <- next_file_or_folder
    path_record <- c(
      path_record,
      paste0(path_prefix, "/", paste0(splitpath[seq_len(i)], collapse = "/"))
    )
    folder_record[[length(folder_record) + 1]] <- current_folder

    # A file cannot be descended into, so stop here even if path elements remain
    if (current_folder$type == "file") {
      return(build_result(current_folder))
    }
  }

  # Reached when the last element visited was a folder, or when splitpath was
  # empty (in which case the parent folder itself is the answer)
  build_result(current_folder)
}

#' cavatica_file_path_from_file
#'
#' Builds the "/"-separated path of a file by following
#' \code{get_parent_folder()} upwards until the object whose id equals
#' \code{project$root_folder} is reached. The root itself is not part of the
#' returned path.
#'
#' @inheritParams standard_description_function
#'
#' @return path of file (string)
#' @export
#'
cavatica_file_path_from_file <- function(project, file){
  assert_is_file(file)

  path_parts <- character(0)
  node <- file

  repeat {
    # Stop once we have climbed up to the project root
    if (node$id == project$root_folder) break

    # Prepend, so the outermost folder ends up first
    path_parts <- c(node$name, path_parts)
    node <- node$get_parent_folder()
  }

  paste0(path_parts, collapse = "/")
}

#' File and FileList Operations
#'
#' Picks the entries of a file list whose \code{name} is one of
#' \code{filenames}.
#'
#' @param filelist a sevenbridges FilesList class object
#' @param filenames a vector of names
#'
#' @return the single matching element (\code{filelist[[i]]}) when exactly one entry matches, the matching subset (\code{filelist[i]}) when several match, OR NA if none of the names exist in the filelist
#' @export
#'
get_files_from_filelist_by_name <- function(filelist, filenames){
  assertthat::assert_that(is.character(filenames), msg = paste0("Filename must be a character, not a ", class(filenames)))

  # One TRUE/FALSE per entry: does its name appear in the requested names?
  name_matches <- vapply(
    X = as.list(filelist),
    FUN = function(entry) entry$name %in% filenames,
    FUN.VALUE = TRUE
  )
  hit_indices <- which(name_matches)
  n_hits <- length(hit_indices)

  if (n_hits == 0) return(NA)
  if (n_hits == 1) return(filelist[[hit_indices]])
  return(filelist[hit_indices])
}


# Recursively descends into every entry of `folder` whose type is "folder".
# Returns a list with one element per entry of the folder: the recursive
# result for sub-folders, NULL for anything else.
# NOTE(review): file entries are not collected, only NULL placeholders are
# returned for them - confirm whether this helper is still a work in progress.
recursive_file_list <- function(project, folder){
  folder_entries <- as.list(folder$list_folder_contents(complete = TRUE))

  purrr::map(folder_entries, function(entry) {
    if (entry$type == "folder") {
      recursive_file_list(project, entry)
    }
  })
}

#' Get File from ID
#'
#' Validates the project with \code{assert_is_project} and then looks the
#' id(s) up via \code{project$file(id = ...)}.
#'
#' @inheritParams standard_description_function
#'
#' @return sevenbridges File Object (or FileList if multiple IDs are supplied)
#' @export
cavatica_file_from_file_id <- function(project, file_id){
  assert_is_project(project)

  looked_up <- project$file(id = file_id)
  looked_up
}




#' File and FileList Operations
#'
#' Finds files from filepaths: designed to be vectorised and efficient with respect to api calls.
#' If you pass multiple filepaths that share parent folders,
#' this function remembers the file objects of folders it has already stepped through,
#' so the same folders are not looked up over and over.
#'
#' This is usually more efficient than spending forever building a cache which has to be updated whenever new files are added to the volume.
#'
#'
#' @param project a cavatica project object
#' @param filepaths the filepaths to files you want retrieve (character)
#' @param return_ids return file ids instead of file objects (boolean)
#' @return A list with one entry per filepath holding the Files object (NULL where the file was not found), or a character vector of file ids (NA where not found) if \code{return_ids = TRUE}.
#' @export
#'
cavatica_file_from_filepath <- function(project, filepaths, return_ids = FALSE){
  assertthat::assert_that(is.character(filepaths))
  filepaths <- filepaths_fix(filepaths)

  # Cache of every path resolved so far and the file object found there
  known_paths_df <- dplyr::tibble(path = character(0), file = list())

  # Split path into each element
  splitpath <- strsplit(x = filepaths, split = "/", fixed = TRUE)

  for (i in seq_along(splitpath)){
    current_splitpath <- splitpath[[i]]

    # All cumulative prefixes of the path: "a", "a/b", "a/b/c", ...
    paths <- purrr::map_chr(seq_along(current_splitpath), .f = function(n) {paste0(current_splitpath[seq_len(n)], collapse = "/")})

    message(rep("=", times=40))
    message("Finding File: [", filepaths[i], "]")
    message(rep("=", times=40))

    already_known <- paths %in% known_paths_df$path

    # If some part of the path has already been recorded
    if (any(already_known)){
      longest_known_index <- max(which(already_known))
      longest_known_path <- paths[longest_known_index]
      longest_known_file_object <- known_paths_df$file[[match(longest_known_path, known_paths_df$path)]]

      message("Utilising previously described path: ", longest_known_path)

      if (longest_known_index >= length(current_splitpath)){
        # Nothing to do here: the file is already cached and gets picked up
        # by the join at the end
        message("Full path described. Pulling file from known db.")
      }
      else{
        remaining_path_steps <- current_splitpath[(longest_known_index + 1):length(current_splitpath)]

        res <- get_path_from_splitpath_with_explicit_parent_folder(project = project, splitpath = remaining_path_steps, parent_folder = longest_known_file_object)

        # The helper returns NA on failure and a list(FileObject, Record) on success
        if (is.list(res)) known_paths_df <- rbind(known_paths_df, res[[2]])
      }
    }

    # If no part of the path has been previously recorded
    else{
      message("No part of this path has been previously cached ... finding from scratch")
      res <- get_path_from_splitpath(project = project, splitpath = current_splitpath)
      if (is.list(res)) known_paths_df <- rbind(known_paths_df, res[[2]])
    }
    message("\n")
  }

  known_paths_df[["id"]] <- cavatica_file_id_from_files(files = known_paths_df$file)

  # Keep only the requested filepaths, in the order they were requested
  filepaths_of_interest_only <- dplyr::left_join(
    x = dplyr::tibble(filepaths = filepaths),
    y = dplyr::distinct(known_paths_df, path, .keep_all = TRUE),
    by = c("filepaths" = "path")
  )

  # Unmatched rows carry NULL in the list column. Test each row separately:
  # a single is.null(unlist(...)) is only TRUE when *every* file is missing.
  file_is_missing <- vapply(filepaths_of_interest_only[["file"]], is.null, logical(1))
  filepaths_not_found <- filepaths_of_interest_only[["filepaths"]][file_is_missing]

  if(length(filepaths_not_found) == 0){
    message("======================\nSuccess!\n======================")
    message("[\u2714] all files were found on cavatica!")
    message("======================")
  }
  else{
    message("======================\nMissing Files\n======================")
    message("Failed to find the following files: \n")
    message(paste0("[", seq_along(filepaths_not_found), "] ", filepaths_not_found, collapse = "\n"))
    message("======================")
  }

  if(return_ids){
    return(filepaths_of_interest_only[["id"]])
  }

  return(filepaths_of_interest_only[["file"]])
}

#' Files to file IDs
#'
#' Applies \code{assert_is_file} to every element and extracts its
#' \code{id} field.
#'
#' @param files sevenbridges file objects
#'
#' @return character vector describing IDs
#' @export
#'
cavatica_file_id_from_files <- function(files){
  # ID of one file object; NA for anything that is not an S4 object
  extract_id <- function(file_object){
    assert_is_file(file_object)
    if (isS4(file_object)) file_object[["id"]] else NA_character_
  }

  vapply(X = files, FUN = extract_id, FUN.VALUE = "")
}


# Normalises filepaths: collapses runs of "/" into a single "/" and strips
# any leading and trailing slash. Vectorised over `filepaths`.
filepaths_fix <- function(filepaths){
  # Collapse repeated slashes FIRST. Trimming the ends before collapsing
  # leaves a trailing slash behind for input such as "a/b//".
  filepaths <- gsub(pattern = "(\\/)+", replacement = "/", x = filepaths)
  filepaths <- sub(pattern = "^\\/", replacement = "", x = filepaths)
  filepaths <- sub(pattern = "\\/$", replacement = "", x = filepaths)
  return(filepaths)
}

#' Convert download link to filename
#'
#' Drops everything up to and including the first \code{filename\%3D\%22}
#' marker, then drops everything from the next \code{\%} onwards.
#'
#' @param link cavatica file download link
#'
#' @return file names (character)
#' @export
#'
cavatica_download_link_to_filenames <- function(link){
    # Text following the url-encoded `filename="` marker
    after_marker <- sub(pattern = ".*?filename%3D%22", replacement = "", x = link)

    # Cut at the first remaining percent sign
    sub(pattern = "%.*$", replacement = "", x = after_marker)
}
selkamand/utilitybeltcavatica documentation built on April 26, 2023, 4:18 p.m.