R/query_snaptron.R

#' Query an junction using snaptron
#'
#' \code{query_snaptron_junc} takes as input the junction co-ordinates and
#' returns the details of matching junctions found in the snapton database
#'
#' @param database Character scalar. Database of junctions - "srav1", "srav2",
#'   "gtex"
#' @param regions Character scalar. Genomic co-ordinates of the junctions you
#'   want to query in format - "chr1:1-100"
#' @param rfilter Character vector. Filters to use for returned junctions -
#'   "length", "strand", "annotated" etc
#' @param rfilter_values Character vector. Corresponding to rfilter parameters,
#'   need to be prefaced by the type of test or ":" - e.g. "<:5000" or ":1"
#' @param method Character scalar. Do you want all junctions within the region
#'   or only those precisely matching the start and stop?
#'
#' @return df with details of junctions that match criteria from snaptron
#' @export
#'
#' @examples
#' query_snaptron_junc(database = "gtex", regions = "chr4:89729278-89771396",
#'   rfilter = c("length", "annotated"), rfilter_values = c("<:10000", ":0"))
#' query_snaptron_junc(database = "srav2",
#'   regions = c("chr4:89729278-89771396", "chr4:89729278-89729407"),
#'   method = "exact", tmp_dir = "/home/dzhang/")
#'
#' @seealso For more details on input and output parameters see:
#'   \url{http://snaptron.cs.jhu.edu/reftables.html}
query_snaptron_junc <- function(database, regions, rfilter = NULL, rfilter_values = NULL, method = "contains", tmp_dir = "."){

  # build base of snaptron query
  snaptron_curl_query <- stringr::str_c("curl 'http://snaptron.cs.jhu.edu/", database, "/snaptron?")

  # add regions to look up
  regions_query <-
    stringr::str_c("regions=",
                   regions,
                   collapse = "&")
  snaptron_curl_query <- stringr::str_c(snaptron_curl_query, regions_query)

  # add method to extract matching junctions
  if(!(method %in% c("contains", "exact"))) stop("Method must be one of: contains, exact")
  snaptron_curl_query <- stringr::str_c(snaptron_curl_query, "&", method, "=1")

  # filtering for junction details
  if(!is.null(rfilter)){

    if(!all(rfilter %in% c("length", "annotated", "left_annotated", "right_annotated", "strand", "samples_count", "coverage_sum", "coverage_avg", "coverage_median"))){

      stop("Not all filters match those of snaptron - check http://snaptron.cs.jhu.edu/reftables.html")

    }

    if(!all(stringr::str_detect(rfilter_values, ":"))) stop("No ':' found in at least 1 rfilter_values - check http://snaptron.cs.jhu.edu/reftables.html")

    rfilter_query <-
      stringr::str_c("rfilter=",
                     rfilter,
                     rfilter_values,
                     collapse = "&")

    snaptron_curl_query <-
      snaptron_curl_query %>%
      stringr::str_c("&", rfilter_query)

  }

  # add output path to tmp save results
  snaptron_curl_query_w_output_path <-
    snaptron_curl_query %>%
    stringr::str_c("' > ", tmp_dir, "/tmp_snaptron.txt")

  print(stringr::str_c("Searching junction in ", database , " using snaptron"))

  system(command = snaptron_curl_query_w_output_path)

  suppressMessages(
    snaptron_result <-
      readr::read_delim(stringr::str_c(tmp_dir, "/tmp_snaptron.txt"), delim = "\t")
  )

  # delete the tmp
  if(file.exists(stringr::str_c(tmp_dir, "/tmp_snaptron.txt"))){

    file.remove(stringr::str_c(tmp_dir, "/tmp_snaptron.txt"))

  }

  return(snaptron_result)

}

#' Query samples using snaptron
#'
#' \code{query_snaptron_samples} find the metadata for the samples of interest
#'
#' @param sample_info either df outputed from \code{\link{query_snaptron_junc}}
#'   or integer vector of rail ids or list of integer vectors of rail ids
#' @inheritParams query_snaptron_junc
#'
#' @return list containing df, with each element correponding to sample metadata
#'   for 1 junction
#' @export
#'
#' @examples
#' query_snaptron_samples(database = "srav2", sample_info = c(1,2))
#' query_snaptron_samples(database = "srav2", sample_info = list(c(1, 2), c(3, 4)))
#' query_snaptron_samples(database = "gtex", sample_info = query_snaptron_junc_output)
#'
#' @seealso For more details on input and output parameters see:
#'   \url{http://snaptron.cs.jhu.edu/reftables.html}
#' @seealso Sample metadata downloaded from
#'   \url{http://snaptron.cs.jhu.edu/data/}
query_snaptron_samples <- function(database, sample_info){

  if(database == "srav2"){

    suppressMessages(
      suppressWarnings(

        sample_metadata <-
          readr::read_delim("/data/snaptron/sample_meta/samples.tsv", delim = "\t")

      ))

  }else{

    suppressMessages(
      suppressWarnings(

        sample_metadata <-
          readr::read_delim("/data/recount/GTEx_SRP012682/gtex_recount_meta_data_tidy.txt", delim = "\t") %>%
          dplyr::rename(rail_id = recount_id)

      ))

  }

  # convert any input format into list of rail ids as int vectors
  if(is.data.frame(sample_info)){

    if(!("samples" %in% colnames(sample_info))) stop("sample column not found in input dataframe")

    sample_info_ids_list <-
      sample_info$samples %>%
      stringr::str_replace("^,", "") %>% # remove the "," that's always at the start
      stringr::str_split(",") %>%
      lapply(FUN = stringr::str_replace, pattern = ":.*", replacement = "") %>%
      lapply(FUN = as.integer)

  }else if(is.list(sample_info)){

    sample_info_ids_list <-
      lapply(sample_info, FUN = as.integer)

  }else if(is.vector(sample_info)){

    sample_info_ids_list <-
      sample_info %>%
      as.integer() %>%
      list()

  }

  filter_sample_metadata <- function(sample_info_ids, sample_metadata){

    sample_metadata_filtered <-
      sample_metadata %>%
      dplyr::filter(rail_id %in% sample_info_ids)

    print(stringr::str_c("Found metadata for ", nrow(sample_metadata_filtered), "/", length(sample_info_ids), " samples"))

    return(sample_metadata_filtered)

  }

  sample_metadata_list <-
    sample_info_ids_list %>%
    lapply(X = ., FUN = filter_sample_metadata, sample_metadata = sample_metadata)

  return(sample_metadata_list)

}
dzhang32/snaptronr documentation built on May 16, 2019, 4:07 a.m.