# R/main.R

# Home-made essential tools for bioinformatics

#' @import stringr
#' @importFrom purrr map map_int map_dbl map_lgl map_chr walk modify_at
#' @importFrom dplyr %>%
#' @importFrom tibble as_tibble
#' @importFrom readr read_tsv write_tsv
#' @importFrom data.table setnames setkey fread fwrite shift setDT
#' @importFrom utils download.file tail
NULL


#' Check whether a URL is remote or local
#'
#' @param url Character vector of URLs or local file paths.
#' @return Logical vector: TRUE for remote schemes (ftp, sftp, http, https),
#'   FALSE for local paths and file:// URLs. Any other scheme raises an error.
is_remote_url <- function(url) {
  schema <- str_match(url, "([a-z0-9]+)://.+")[, 2]
  
  sapply(schema, function(s) {
    if (is.na(s))
      FALSE
    else {
      if (!is.na(match(s, c("ftp", "sftp", "http", "https"))))
        TRUE
      else if (s == "file")
        FALSE
      else
        stop(paste0("Unknown schema: ", s))
    }
  })
}
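
# Illustration (hypothetical paths/URLs) of how the scheme check resolves:
# remote schemes return TRUE; plain paths and file:// URLs return FALSE.
#   is_remote_url("https://example.org/tracks/cov.bed.gz")  # TRUE
#   is_remote_url("/data/local/cov.bed")                    # FALSE
#   is_remote_url("file:///data/local/cov.bed")             # FALSE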


#' Check whether a file name has a gzip (.gz) extension
#'
#' @param file_path Character vector of file paths.
#' @return Logical vector: TRUE where the final extension is "gz".
is_gzip_file <- function(file_path) {
  str_split(file_path, pattern = "\\.") %>%
    map_chr(~ tail(., n = 1)) == "gz"
}
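
# Illustration (hypothetical file names): only a trailing ".gz" extension
# counts as gzip, and the check is vectorised over file_path.
#   is_gzip_file(c("cov.bedGraph.gz", "cov.bed"))  # TRUE FALSE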


#' Load GENBED using tabix
#'
#' @param file_path Path to the data file. Must be compressed by BGZIP,
#'   locally present, and accompanied by a paired tabix index (.tbi) file.
#' @param region Genomic region string passed to tabix, e.g. "chr1:1000-2000".
#' @param col_names Optional character vector of column names.
#' @param na Character vector of strings to interpret as missing values.
#' @param col_types Accepted for signature compatibility with
#'   \code{load_genbed}; not used by the tabix reader.
#' @param is_bedgraph Logical; if TRUE, try to convert the 4th column to numeric.
load_genbed_tabix <- function(file_path,
                              region,
                              col_names = NULL,
                              na = c("", "NA"),
                              col_types = NULL,
                              is_bedgraph = FALSE,
                              ...) {
  genbed <-
    bedr::tabix(
      region = region,
      file.name = file_path,
      check.chr = FALSE,
      verbose = FALSE
    ) %>% as_tibble()
  
  if (!is.null(col_names)) {
    colnames(genbed) <- col_names
  } else if (all(colnames(genbed) == paste0("V", seq_along(colnames(genbed))))) {
    # Columns are default values: V1, V2, ...
    colnames(genbed) <- paste0("X", seq_along(colnames(genbed)))
    colnames(genbed)[1:3] <- c("chrom", "start", "end")
  }
  
  if (is_bedgraph) {
    # Try to convert the 4th column (score) to numeric: treat values listed in
    # `na` as missing, convert the remaining strings, and keep the numeric
    # column only if every non-missing value converted cleanly (no new NAs)
    na_idx <- genbed[[4]] %in% na
    score_dbl <- suppressWarnings(as.numeric(genbed[[4]][!na_idx]))
    if (all(!is.na(score_dbl))) {
      genbed[[4]] <- NA_real_
      genbed[[4]][!na_idx] <- score_dbl
    }
  }
  
  genbed %>%
    modify_at(1, as.character) %>%
    modify_at(2:3, as.integer)
}
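
# Minimal usage sketch (hypothetical file). The input must be BGZIP-compressed
# and indexed beforehand, e.g. "bgzip cov.bedGraph && tabix -p bed cov.bedGraph.gz".
#   load_genbed_tabix("cov.bedGraph.gz",
#                     region = "chr1:10000-20000",
#                     is_bedgraph = TRUE)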


#' Load a data file in GENBED format
#'
#' A GENBED file is a BED-like, tab-separated file with an optional header
#' line containing the column names. The types of the first three columns are
#' guaranteed to be "cii" (character, integer, integer).
#'
#' @param file_path Path or URL to the data file. Can be a local path, or a
#'   file://, http://, https://, ftp:// or sftp:// URL.
#' @param region Optional genomic region string, e.g. "chr1:1000-2000". If
#'   supplied, the file is queried through tabix via \code{load_genbed_tabix}.
#' @param col_names Either NULL (infer from the file) or a character vector of
#'   user-provided column names. When inferring, a header line supplies the
#'   names; if the header is missing, the columns are named chrom, start, end,
#'   X4, X5, ...
#' @param na Character vector of strings to interpret as missing values.
#' @param col_types A character string of column type abbreviations; refer to
#'   \code{col_types} in readr. If NULL, the first three columns are read as
#'   character, integer, integer, and the rest are guessed from the data.
#' @param is_bedgraph A logical value indicating whether the input is BED or
#'   BEDGRAPH: the 4th column is character for BED and numeric for BEDGRAPH.
#' @export
load_genbed <-
  function(file_path,
           region = NULL,
           col_names = NULL,
           na = c("", "NA"),
           col_types = NULL,
           is_bedgraph = FALSE,
           ...) {
    stopifnot(length(file_path) == 1)
    
    get_col_names <- function(file_path) {
      con <- file(file_path, "r")
      last_comment_line <- NULL
      first_data_line <- NULL
      n_fields <- 0
      while (TRUE) {
        line <- str_trim(readLines(con, n = 1))
        
        # readLines() returns character(0) at end of file
        if (length(line) == 0)
          break
        
        if (startsWith(line, "#")) {
          last_comment_line <- line
        } else {
          first_data_line <- line
          break
        }
      }
      close(con)
      
      # Number of fields
      if (is.null(first_data_line)) {
        # Empty data frame
        return(NULL)
      } else {
        n_fields <-
          length(str_split(string = first_data_line, pattern = "\t")[[1]])
      }
      
      stopifnot(n_fields >= 3)
      
      # Try extracting column names
      if (is.null(last_comment_line)) {
        col_names <- paste("X", 1:n_fields, sep = "")
        col_names[1:3] <- c("chrom", "start", "end")
      } else {
        col_names <- str_replace(string = last_comment_line,
                                 pattern = "^[# \t]+",
                                 replacement = "") %>%
          str_split(pattern = "\t") %>% .[[1]]
      }
      
      return(col_names)
    }
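    # For illustration (hypothetical file contents): a last comment line of
    # "#chrom\tstart\tend\tscore" yields c("chrom", "start", "end", "score"),
    # while a headerless file yields c("chrom", "start", "end", "X4", ...).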
    
    tryCatch({
      temp_dir <- tempdir(check = TRUE)
      temp_path <- NULL
      temp_index_path <- NULL
      is_remote <- is_remote_url(file_path)
      
      # For remote files (either the data or the index), download them to 
      # the temporary dir 
      if (is_remote) {
        url <- file_path
        temp_path <- file.path(temp_dir, basename(url))
        download.file(url = url, destfile = temp_path)
        file_path <- temp_path
        
        # Attempt to download the tabix tbi file
        if (endsWith(url, ".gz") && !is.null(region)) {
          index_url <- paste0(url, ".tbi")
          if (RCurl::url.exists(index_url)) {
            temp_index_path <- paste0(temp_path, ".tbi")
            download.file(url = index_url, destfile = temp_index_path)
          }
        }
      }
      
      # Try to identify column names from the header
      if (is.null(col_names)) {
        col_names <- get_col_names(file_path = file_path)
        # An empty data file yields no column names; return an empty tibble
        if (is.null(col_names))
          return(tibble::tibble())
      }
      
      if (is.null(col_types)) {
        col_types <-
          paste(c("cii", rep_len("?", length(col_names) - 3)), collapse = "")
      }
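      # e.g. a five-column file with col_types left NULL ends up with
      # col_types == "cii??": chrom/start/end fixed, remaining columns guessed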
      
      if (is.null(region)) {
        genbed <- read_tsv(
          file = file_path,
          comment = "#",
          col_names = col_names,
          col_types = col_types,
          na = na,
          ...
        )
      } else {
        genbed <-
          load_genbed_tabix(
            file_path = file_path,
            region = region,
            na = na,
            col_names = col_names,
            is_bedgraph = is_bedgraph,
            ...
          )
      }
      
      return(genbed)
    }, finally = {
      # Clean up downloaded files; the session-wide tempdir() itself is kept
      unlink(c(temp_path, temp_index_path))
    })
  }
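
# Usage sketch (hypothetical paths/URLs):
#   # Local file, read in full
#   cov <- load_genbed("cov.bedGraph.gz", is_bedgraph = TRUE)
#   # Remote bgzipped file, restricted to a region via its paired .tbi index
#   cov <- load_genbed("https://example.org/tracks/cov.bedGraph.gz",
#                      region = "chr1:10000-20000", is_bedgraph = TRUE)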


#' Dump a data frame as a GENBED file
#'
#' @param df Data frame to write; the first three columns are expected to be
#'   chrom, start, end.
#' @param file_path Output path ending in ".gz", ".bed" or ".bedGraph". A
#'   ".gz" path is compressed with \code{compressor} after writing.
#' @param header Optional character vector of extra header lines; each is
#'   prefixed with "#" and written above the column-name line.
#' @param append Logical; if TRUE, append to an existing file.
#' @param compressor Command used to compress ".gz" output (default "bgzip").
#' @param col_names Logical; whether to write the "#"-prefixed header
#'   (including column names). Defaults to \code{!append}.
#' @export
write_genbed <-
  function(df,
           file_path,
           header = NULL,
           append = FALSE,
           compressor = "bgzip",
           col_names = !append,
           ...) {
    stopifnot(is.character(file_path) &&
                length(file_path) == 1 &&
                any(endsWith(
                  file_path, c(".gz", ".bed", ".bedGraph")
                )))
    
    args <- list(...)
    
    # User-provided header information
    if (!is.null(header)) {
      header <- paste0("#", header)
    }
    header <- paste(
      c(header, paste0("#", paste(colnames(df), collapse = "\t"))),
      collapse = "\n"
    )
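    # e.g. header = "genome=hg38" (hypothetical) and a data frame with columns
    # chrom/start/end/score produce "#genome=hg38\n#chrom\tstart\tend\tscore"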
    
    if (endsWith(file_path, ".gz")) {
      destfile <- tempfile(fileext = ".bed")
    } else {
      destfile <- file_path
    }
    
    if (col_names)
      cat(paste0(header, "\n"), file = destfile, append = append)
    df %>% write_tsv(
      file = destfile,
      col_names = FALSE,
      # If the header was just written via cat() (col_names is TRUE), write_tsv()
      # must append so that it does not overwrite the header line
      append = append || col_names,
      ...
    )
    
    if (endsWith(file_path, ".gz")) {
      system(str_interp("${compressor} ${destfile}"))
      system(str_interp("mv ${destfile}.gz ${file_path}"))
    }
  }
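
# Usage sketch (hypothetical data frame and paths). Writing a ".gz" output
# requires the compressor (bgzip by default) to be available on PATH.
#   write_genbed(cov, "cov.bedGraph.gz", header = "genome=hg38")
#   write_genbed(cov, "cov.bed")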