R/l_tools.R

Defines functions print_authors_affiliation

common_list <- list(
  taxaclass = c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")
)

some_packages <- c(
  "ggsankey" = "davidsjoberg/ggsankey",
  "sankeyD3" = "fbreitwieser/sankeyD3",
  "pctax" = "Asa12138/pctax",
  "MetaNet" = "Asa12138/MetaNet",
  "plot4fun" = "Asa12138/plot4fun",
  "iCRISPR" = "Asa12138/iCRISPR",
  "ReporterScore" = "Asa12138/ReporterScore",
  "ggcor" = "Github-Yilei/ggcor",
  "chorddiag" = "mattflor/chorddiag",
  "ggradar" = "ricardo-bion/ggradar",
  "pairwiseAdonis" = "pmartinezarbizu/pairwiseAdonis/pairwiseAdonis",
  "linkET" = "Hy4m/linkET",
  "ggchicklet" = "hrbrmstr/ggchicklet",
  "ggkegg" = "noriakis/ggkegg",
  "SpiecEasi" = "zdk123/SpiecEasi"
)

print_authors_affiliation <- function(authors = c("jc", "pc")) {
  affiliations <- c(
    "1" = "MOE Key Laboratory of Biosystems Homeostasis & Protection, and Zhejiang Provincial Key Laboratory of Cancer Molecular Cell Biology, Life Sciences Institute, Zhejiang University, Hangzhou, Zhejiang 310030, China",
    "2" = "State Key Laboratory for Diagnosis and Treatment of Infectious Diseases, First Affiliated Hospital, Zhejiang University School of Medicine, Hangzhou, Zhejiang 310009, China",
    "3" = "Center for Life Sciences, Shaoxing Institute, Zhejiang University, Shaoxing, Zhejiang 321000, China",
    "4" = "BGI Research, Wuhan, Hubei 430074, China",
    "5" = "BGI Research, Shenzhen, Guangdong 518083, China",
    "6" = "Department of Genetics, Stanford University School of Medicine, Stanford, CA, USA"
  )
  author_list <- list(
    jc = 1:3,
    pc = 1:2,
    lye = 1:2,
    lz = 1:2,
    jlyq = 1:2,
    hzn = 1:2,
    cq = 1:2,
    tsj = 4:5,
    sxt = 6
  )
  pa <- affiliations[author_list[authors] %>% Reduce(union, .)]
  for (i in seq_along(pa)) {
    pa[i] <- paste0("^", i, "^", pa[i])
  }
  paste0(pa, collapse = "\n\n") %>% clipr::write_clip()
  message(paste0(pa, collapse = "\n\n"))
}

# =========Little tools=========
#' Print some message with =
#'
#' @param str output strings
#' @param ... strings will be paste together
#' @param char side chars default:=
#' @param n the number of output length
#' @param print print or message?
#' @param mode "middle", "left" or "right"
#'
#' @examples
#' dabiao("Start running!")
#'
#' @export
#' @return No return value
dabiao <- function(str = "", ..., n = 80, char = "=", mode = c("middle", "left", "right"), print = FALSE) {
  str <- paste0(c(str, ...), collapse = "")
  mode <- match.arg(mode, c("middle", "left", "right"))
  if (n < nchar(str)) n <- nchar(str)
  x <- (n - nchar(str)) %/% 2
  x2 <- n - nchar(str) - x
  switch(mode,
    "left" = {
      xx <- paste0(str, strrep(char, x + x2))
    },
    "middle" = {
      xx <- paste0(strrep(char, x), str, strrep(char, x2))
    },
    "right" = {
      xx <- paste0(strrep(char, x + x2), str)
    }
  )
  if (print) {
    cat(xx, "\n")
  } else {
    message(xx, "\n")
  }
}

#' Copy a vector
#'
#' @param vec a R vector object
#'
#' @export
#' @return No return value
copy_vector <- function(vec) {
  if (!interactive()) {
    stop("This function is not allowed in non-interactive mode.")
  }
  lib_ps("clipr", library = FALSE)
  if (is.numeric(vec)) {
    res <- paste0("c(", paste0(vec, collapse = ","), ")")
  } else {
    res <- paste0('c("', paste0(vec, collapse = '","'), '")')
  }
  if (!is.null(names(vec))) {
    res <- paste0("setNames(", res, ",c(", paste0('"', names(vec), '"', collapse = ","), "))")
  }

  clipr::write_clip(res)
  message("copy done, just Ctrl+V")
}

#' Copy a data.frame
#'
#' @param df a R data.frame object
#'
#' @export
#' @return No return value
copy_df <- function(df) {
  if (!interactive()) {
    stop("This function is not allowed in non-interactive mode.")
  }
  lib_ps("rio", library = FALSE)
  rio::export(df, file = "clipboard")
  message("copy done, just Ctrl+V")
}


#' Change factor levels
#'
#' @param x vector
#' @param levels custom levels
#' @param last put the custom levels to the last
#'
#' @return factor
#' @export
#'
#' @examples
#' change_fac_lev(letters[1:5], levels = c("c", "a"))
change_fac_lev <- function(x, levels = NULL, last = FALSE) {
  ordervec <- factor(x)
  if (!is.null(levels)) {
    levels <- intersect(levels, levels(ordervec))
    if (last) {
      shunxu <- c(setdiff(levels(ordervec), levels), levels)
    } else {
      shunxu <- c(levels, setdiff(levels(ordervec), levels))
    }
    ordervec <- factor(ordervec, levels = shunxu)
  }
  ordervec
}

#' Replace a vector by named vector
#'
#' @param x a vector need to be replaced
#' @param y named vector
#' @param fac consider the factor?
#' @param keep_origin keep_origin?
#'
#' @return vector
#' @export
#' @examples
#' tidai(c("a", "a", "b", "d"), c("a" = "red", b = "blue"))
#' tidai(c("a", "a", "b", "c"), c("red", "blue"))
#' tidai(c("A" = "a", "B" = "b"), c("a" = "red", b = "blue"))
#' tidai(factor(c("A" = "a", "B" = "b", "C" = "c")), c("a" = "red", b = "blue", c = "green"))
tidai <- function(x, y, fac = FALSE, keep_origin = FALSE) {
  if (is.null(y)) {
    return(x)
  }
  tmp <- y
  if (is.null(names(tmp))) {
    tmp <- rep(unique(tmp), len = length(unique(x)))
    if (fac) {
      names(tmp) <- levels(factor(x))
    } else {
      names(tmp) <- unique(x)
    }
  }
  if (keep_origin) {
    add <- setdiff(x, names(tmp))
    tmp <- c(tmp, setNames(add, add))
  }
  if (is.null(names(x))) {
    return(unname(tmp[as.character(x)]))
  }
  return(setNames(unname(tmp[as.character(x)]), names(x)))
}

#' Update the parameters
#'
#' @description
#' Keep the different parameters while use the same name in update first.
#'
#' @param default default (data.frame, list, vector)
#' @param update update (data.frame, list, vector)
#'
#' @export
#' @return same class of your input (data.frame, list or vector)
#' @examples
#' update_param(list(a = 1, b = 2), list(b = 5, c = 5))
#'
update_param <- function(default, update) {
  if (missing(default) || length(default) == 0) default <- NULL
  if (missing(update) || length(update) == 0) update <- NULL
  if (is.null(default)) {
    return(update)
  }
  if (is.null(update)) {
    return(default)
  }

  if (!identical(class(default), class(update))) stop("Two different class object is not allowed to update")
  if (is.data.frame(default)) {
    inter <- intersect(colnames(update), colnames(default))
    la <- setdiff(colnames(default), inter)
    return(cbind(default[, la, drop = FALSE], update))
  }
  if (is.list(default)) {
    if (is.null(names(update)) | is.null(names(default))) stop("No name")
    inter <- intersect(names(update), names(default))
    la <- setdiff(names(default), inter)
    return(append(default[la, drop = FALSE], update))
  }
  if (is.vector(default)) {
    if (is.null(names(update)) | is.null(names(default))) stop("No name")
    inter <- intersect(names(update), names(default))
    la <- setdiff(names(default), inter)
    return(c(default[la, drop = FALSE], update))
  }
}

#' Attach packages or install packages have not benn installed
#'
#' @param p_list a vector of packages list
#' @param all_yes all install try set to yes?
#' @param library should library the package or just get Namespace ?
#' @param ... packages
#'
#' @return No return value
#' @export
#'
lib_ps <- function(p_list, ..., all_yes = FALSE, library = TRUE) {
  no_p <- p_list[!vapply(p_list, requireNamespace, quietly = TRUE, FUN.VALUE = logical(1))]

  if (!interactive()) {
    if (length(no_p) > 0) {
      no_p <- paste0(seq_along(no_p), ". ", no_p)
      stop(paste0("exit, because some packages need to be installed:\n", paste(no_p, collapse = "\n")))
    }
  }

  p_list <- c(p_list, ...)
  for (p in p_list) {
    if (!requireNamespace(p, quietly = TRUE)) {
      if (!all_yes) {
        message(paste0(p, ": this package has not been installed yet, should it be installed?"))
        flag <- readline("yes/no(y/n)?")
      } else {
        flag <- "y"
      }

      if (tolower(flag) %in% c("yes", "y")) {
        if (p %in% names(some_packages)) {
          if (!requireNamespace("devtools", quietly = TRUE)) utils::install.packages("devtools")
          message("Install the ", p, "from github: ", some_packages[p])
          devtools::install_github(some_packages[p])
        } else {
          utils::install.packages(p)
        }
      } else {
        stop(paste0("exit, because '", p, "' needs to be installed"))
      }

      if (!requireNamespace(p, quietly = TRUE)) {
        if (!all_yes) {
          message(paste0(p, " is not available at CRAN, try Bioconductor?"))
          flag <- readline("yes/no(y/n)?")
        }

        if (tolower(flag) %in% c("yes", "y")) {
          if (!requireNamespace("BiocManager", quietly = TRUE)) utils::install.packages("BiocManager")
          BiocManager::install(p, update = FALSE)
        } else {
          stop(paste0("exit, because '", p, "' needs to be installed"))
        }
      }

      if (!requireNamespace(p, quietly = TRUE)) {
        stop("\nplease try other way (e.g. github...) to install ", p)
      }
    }
  }

  if (library) {
    for (p in p_list) {
      suppressPackageStartupMessages(library(p, character.only = TRUE, quietly = TRUE, warn.conflicts = FALSE))
    }
  }
}


#' Detach packages
#'
#' @param p_list a vector of packages list
#' @param origin keep the original Namespace
#' @param ... packages
#' @return No return value
#' @export
del_ps <- function(p_list, ..., origin = NULL) {
  p_list <- c(p_list, ...)
  p_list <- paste0("package:", p_list)
  all <- search()
  p_list <- p_list[p_list %in% all]
  if (!is.null(origin)) p_list <- setdiff(p_list, origin)
  for (p in p_list) {
    detach(p, character.only = TRUE)
  }
}

#' Three-line table
#'
#' @param df a data.frame
#' @param digits how many digits should remain
#' @param nrow show how many rows
#' @param ncol show how many columns
#' @param fig output as a figure
#' @param ... additional arguments e.g.(rows=NULL)
#' @param mode 1~2
#' @param background background color
#'
#' @return a ggplot
#' @export
#'
#' @examples
#' \donttest{
#' if (require("kableExtra")) {
#'   data(otutab)
#'   sanxian(otutab)
#' }
#' }
sanxian <- function(df, digits = 3, nrow = 10, ncol = 10, fig = FALSE, mode = 1, background = "#D7261E", ...) {
  if (nrow(df) > nrow) df <- df[1:nrow, , drop = FALSE]
  if (ncol(df) > ncol) df <- df[, 1:ncol, drop = FALSE]

  if (fig) {
    lib_ps("ggpubr", library = FALSE)
    df %>%
      dplyr::mutate_if(is.numeric, \(x)round(x, digits = digits)) %>%
      ggpubr::ggtexttable(..., theme = ggpubr::ttheme("blank")) %>%
      ggpubr::tab_add_hline(at.row = 1:2, row.side = "top", linewidth = 3) %>%
      ggpubr::tab_add_hline(at.row = nrow(df) + 1, row.side = "bottom", linewidth = 3) -> p
    return(p)
  } else {
    lib_ps("kableExtra", library = FALSE)
    if (mode == 1) {
      p <- kableExtra::kbl(df, digits = digits, ...) %>% kableExtra::kable_classic(full_width = FALSE, html_font = "Cambria")
    } else if (mode == 2) {
      p <- kableExtra::kbl(df, digits = digits, ...) %>%
        kableExtra::kable_classic(full_width = FALSE, html_font = "Cambria") %>%
        kableExtra::row_spec(0, bold = TRUE, color = "white", background = background) %>%
        kableExtra::row_spec(seq(2, nrow(df), 2), background = add_alpha(background))
    } else {
      p <- NULL
    }
    return(p)
  }
}

#' Grepl applied on a data.frame
#'
#' @param pattern search pattern
#' @param x your data.frame
#' @param ... addtitional arguments for gerpl()
#'
#' @return a logical matrix
#' @export
#' @examples
#' matrix(letters[1:6], 2, 3) |> as.data.frame() -> a
#' grepl.data.frame("c", a)
#' grepl.data.frame("\\w", a)
grepl.data.frame <- function(pattern, x, ...) {
  y <- if (length(x)) {
    do.call("cbind", lapply(x, "grepl", pattern = pattern, ...))
  } else {
    matrix(FALSE, length(row.names(x)), 0)
  }
  if (.row_names_info(x) > 0L) {
    rownames(y) <- row.names(x)
  }
  y
}

#' Gsub applied on a data.frame
#'
#' @param pattern search pattern
#' @param replacement a replacement for matched pattern
#' @param x your data.frame
#' @param ... additional arguments for gerpl()
#'
#' @return a data.frame
#' @export
#' @examples
#' matrix(letters[1:6], 2, 3) |> as.data.frame() -> a
#' gsub.data.frame("c", "a", a)
gsub.data.frame <- function(pattern, replacement, x, ...) {
  y <- if (length(x)) {
    do.call("cbind", lapply(x, "gsub", pattern = pattern, replacement = replacement, ...))
  } else {
    matrix(FALSE, length(row.names(x)), 0)
  }
  if (.row_names_info(x) > 0L) {
    rownames(y) <- row.names(x)
  }
  data.frame(y, check.names = FALSE)
}

# =======Read file========

#' Read some special format file
#'
#' @param file file path
#' @param format "blast", "diamond", "fa", "fasta", "fna", "gff", "gtf","jpg", "png", "pdf", "svg"...
#' @param just_print just print the file
#' @param all_yes all_yes?
#' @param ... additional arguments
#' @param density the resolution for reading pdf or svg
#'
#' @return data.frame
#' @export
#'
read.file <- function(file, format = NULL, just_print = FALSE, all_yes = FALSE, density = 120, ...) {
  if (!file.exists(file)) {
    stop(paste0(file, " does not exist!"))
  }
  if (!all_yes & !interactive()) {
    stop("This function is not allowed in non-interactive mode when all_yes is FALSE.")
  }

  if ((file.size(file) > 1e6) & !all_yes) {
    message(paste0(file, ": this file is a little big, still open?"))
    flag <- readline("yes/no(y/n)?")
    if (!tolower(flag) %in% c("yes", "y")) {
      return(NULL)
    }
  }
  if (just_print) {
    lib_ps("readr", library = FALSE)
    cat(readr::read_file(file))
  } else {
    if (is.null(format)) format <- tools::file_ext(file)
    format <- match.arg(format, c(
      "blast", "diamond", "fa", "fasta", "fna", "gff", "gtf",
      "jpg", "png", "pdf", "svg", "gif", "biom"
    ))

    if (format %in% c("gff", "gtf")) {
      # 读取文件内容
      lines <- readLines(file)
      # 过滤掉注释行
      data_lines <- lines[!grepl("^#", lines)]

      df <- utils::read.table(
        text = data_lines, sep = "\t",
        header = FALSE, stringsAsFactors = FALSE, comment.char = "",
        col.names = c("seqid", "source", "feature", "start", "end", "score", "strand", "phase", "attributes")
      )
      return(df)
    }

    if (format %in% c("fa", "fasta", "fna")) {
      df <- read_fasta(file)
      return(df)
    }

    if (format %in% c("blast", "diamond")) {
      df <- utils::read.table(file,
        sep = "\t", header = FALSE, stringsAsFactors = FALSE, comment.char = "",
        col.names = c(
          "Qseqid", "Sseqid", "Pident", "Length", "Mismatch", "Gapopen",
          "Qstart", "Qend", "Sstart", "Send", "E_value", "Bitscore"
        )
      )
      return(df)
    }

    if (format %in% c("jpg", "png", "svg", "pdf")) {
      oldpar <- graphics::par(no.readonly = TRUE)
      on.exit(graphics::par(oldpar))
      graphics::par(mar = rep(0, 4))

      lib_ps("magick", library = FALSE)
      image <- magick::image_read(file, density = density, ...)
      if (length(image) > 1) message("Your file has more than one page! Print the first one page.")
      plot(image[1])
    }
    if (format %in% c("gif")) {
      lib_ps("magick", library = FALSE)
      image <- magick::image_read(file, ...)
      print(image)
    }
  }
}

#' Read fasta file
#' @param fasta_file file path
#' @return data.frame
#' @export
read_fasta <- function(fasta_file) {
  fasta_data <- readLines(fasta_file)
  # create a null data.frame
  df <- data.frame(stringsAsFactors = FALSE)

  # initialize
  current_id <- ""
  current_seq <- ""

  # read fasta row by row
  for (line in fasta_data) {
    if (startsWith(line, ">")) {
      # start with > indicate name
      # add name and sequence of last one
      if (current_id != "") {
        df <- rbind(df, c(current_id, current_seq))
      }
      # update the new sequence
      current_id <- gsub(">", "", line)
      current_seq <- ""
    } else {
      # add sequence
      current_seq <- paste(current_seq, line, sep = "")
    }
  }

  # add the last sequence
  df <- rbind(df, c(current_id, current_seq))

  colnames(df) <- c("Sequence_ID", "Sequence")
  df
}

#' Write a data.frame to fasta
#'
#' @param df data.frame
#' @param file_path output file path
#' @param str_per_line how many base or animo acid in one line, if NULL, one sequence in one line.
#' @return No return value
#' @export
write_fasta <- function(df, file_path, str_per_line = 70) {
  file_conn <- file(file_path, "w")
  df <- as.data.frame(df)
  for (i in 1:nrow(df)) {
    sequence_id <- df[i, 1]
    sequence <- df[i, 2]

    writeLines(paste0(">", sequence_id), file_conn)
    if (is.null(str_per_line)) {
      writeLines(sequence, file_conn)
    } else if (str_per_line > 0) {
      split_sequence <- strsplit(sequence, split = "")
      split_sequence <- unlist(split_sequence)
      num_chunks <- ceiling(length(split_sequence) / str_per_line)
      for (j in 1:num_chunks) {
        start_index <- (j - 1) * str_per_line + 1
        end_index <- min(j * str_per_line, length(split_sequence))
        chunk <- paste(split_sequence[start_index:end_index], collapse = "")
        writeLines(chunk, file_conn)
      }
    } else {
      close(file_conn)
      stop("str_per_line should be NULL or number bigger than 1.")
    }
  }
  close(file_conn)
}

#' Transfer the format of file
#'
#' @param file input file
#' @param to_format transfer to
#' @param format input file format
#' @param ... additional argument
#' @param browser the path of Google Chrome, Microsoft Edge or Chromium in your computer.
#'
#' @return file at work directory
#' @export
#'
trans_format <- function(file, to_format, format = NULL, ..., browser = "/Applications/Microsoft\ Edge.app/Contents/MacOS/Microsoft\ Edge") {
  if (is.null(format)) format <- tools::file_ext(file)
  name <- tools::file_path_sans_ext(file)
  out <- paste0(name, ".", to_format)

  if (to_format == "jpeg") to_format <- "jpg"
  if (format == to_format) {
    message("The format is the same! No need to transfer.")
    return(invisible())
  }

  if (format == "svg") {
    if (to_format == "html") {
      file.copy(file, out)
    } else {
      lib_ps("rsvg", "grImport2", library = FALSE)
      rsvg::rsvg_svg(file, file)
      x <- grImport2::readPicture(file)
      g <- grImport2::pictureGrob(x)
      ggplot2::ggsave(g, filename = out, device = to_format, ...)
      invisible(g)
    }
  }

  if (format %in% c("pdf", "png", "jpg")) {
    lib_ps("magick", library = FALSE)
    img <- magick::image_read(file, density = 200, ...)
    g <- grid::rasterGrob(img, interpolate = TRUE)
    p <- ggplot2::ggplot() +
      ggplot2::annotation_custom(g, xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf) +
      ggplot2::theme_void()
    ggplot2::ggsave(p, filename = out, device = to_format, ...)
    invisible(g)
  }

  if (format == "html") {
    if (to_format %in% c("pdf", "png", "jpeg")) {
      pagedown::chrome_print(file, out,
        wait = 0, browser = browser, format = to_format,
        options = list(
          # paperWidth=width,
          # pageRanges="1",
          # paperHeight=height,
          ...
        )
      )
    }
    if (to_format == "svg") {
      file.copy(file, out)
    }
  }
  message(paste0("Sucessfully transfered to ", to_format, " format!", "\nThe file is at ", out))
}

# ======= Web ========

#' Download File
#'
#' This function downloads a file from the provided URL and saves it to the specified location.
#'
#' @param file_path The full path to the file.
#' @param url The URL from which to download the file.
#' @param timeout timeout, 300s
#' @param force FALSE, if TRUE, overwrite existed file
#' @param ... add
#'
#' @return No value
#' @export
download2 <- function(url, file_path, timeout = 300, force = FALSE, ...) {
  if (file.exists(file_path) & !force) {
    return(invisible())
  } else {
    ori_time <- getOption("timeout")
    on.exit(options(timeout = ori_time))

    if (!dir.exists(dirname(file_path))) dir.create(dirname(file_path), recursive = TRUE)
    options(timeout = timeout)
    # Download the file
    tryCatch(
      expr = {
        utils::download.file(url, destfile = file_path, ...)
      },
      error = function(e) {
        stop("Try downloading yourself from ", url)
      }
    )
  }
}


#' Search and browse the web for specified terms
#'
#' This function takes a vector of search terms, an optional search engine (default is Google),
#' and an optional base URL to perform web searches. It opens the default web browser
#' with search results for each term.
#'
#' @param search_terms A character vector of search terms to be searched.
#' @param engine A character string specifying the search engine to use (default is "google").
#'               Supported engines: "google", "bing".
#' @param base_url A character string specifying the base URL for web searches. If not provided,
#'                the function will use a default URL based on the chosen search engine.
#' @return No return value
#' @examples
#' \dontrun{
#' search_terms <- c(
#'   "s__Pandoraea_pnomenusa",
#'   "s__Alicycliphilus_sp._B1"
#' )
#'
#' # Using Google search engine
#' search_browse(search_terms, engine = "google")
#'
#' # Using Bing search engine
#' search_browse(search_terms, engine = "bing")
#' }
#'
#' @export
search_browse <- function(search_terms, engine = "google", base_url = NULL) {
  if (length(search_terms) > 30) stop("too many search_terms, please cut down to 30.")
  # 如果未提供基础URL,则根据搜索引擎设置默认URL
  if (is.null(base_url)) {
    if (engine == "google") {
      base_url <- "https://www.google.com/search?q="
    } else if (engine == "bing") {
      base_url <- "https://www.bing.com/search?q="
    } else {
      stop("Unsupported search engine. Supported engines: 'google', 'bing'")
    }
  }

  search_terms <- gsub("_", " ", search_terms)
  # 循环遍历搜索每个元素
  for (term in search_terms) {
    # 构建搜索 URL
    search_url <- paste0(base_url, utils::URLencode(term))

    # 在默认浏览器中打开搜索页面
    utils::browseURL(search_url)
  }
}

#' Translator
#'
#' language: en, zh, jp, fra, th..., see \code{https://www.cnblogs.com/pieguan/p/10338255.html}
#'
#' @param words words
#' @param from source language, default "en"
#' @param to target language, default "zh"
#' @param split split to blocks when your words are too much
#' @param verbose verbose
#'
#' @export
#' @return vector
#' @examples
#' \dontrun{
#' translator(c("love", "if"), from = "en", to = "zh")
#' }
translator <- function(words, from = "en", to = "zh", split = TRUE, verbose = TRUE) {
  pcutils_config <- show_pcutils_config()
  if (is.null(pcutils_config$baidu_appid) | is.null(pcutils_config$baidu_key)) {
    message("Please set the baidu_appid and baidu_key using set_pcutils_config:")
    message("first, get the appid and key from baidu: https://zhuanlan.zhihu.com/p/375789804 ,")
    message("then, set_pcutils_config('baidu_appid',your_appid),")
    message("and set_pcutils_config('baidu_key',your_key).")
    return(invisible())
  }

  if (identical(from, to)) {
    to <- setdiff(c("en", "zh"), from)[1]
    if (verbose) message("Same `from` and `to` language, change `to` to ", to)
  }
  words <- as.character(words)
  words[words == ""] <- " "
  orginal_words <- setNames(words, words)
  idx <- grepl("^\\s+$", words)

  if (sum(idx, na.rm = TRUE) > 0) {
    if (verbose) message("Some of your words are invalid")
    # words[idx]="NULL"
    words <- words[!idx]
  }
  words <- unique(words)
  if (length(words) == 0) {
    return(orginal_words)
  }

  if (length(orginal_words) > 1) {
    if (any(grepl("\n", words, fixed = TRUE))) {
      if (verbose) message("'\\n' was found in your words, change to ';'.")
    }
    words <- gsub("\n", ";", words, fixed = TRUE)
  } else {
    words <- strsplit(words, "\n+")[[1]]
  }
  input_words <- paste0(words, collapse = "\n")

  if (split) {
    split_words <- split_text(input_words, nchr_each = 5000)
  } else {
    split_words <- input_words
  }

  if (length(split_words) > 1) {
    res_ls <- lapply(split_words, translator, from = from, to = to, split = FALSE, verbose = FALSE)
    return(do.call(c, res_ls))
  }

  res <- baidu_translate(input_words, from = from, to = to)

  if (length(res) == length(words)) {
    names(res) <- words
    res1 <- tidai(orginal_words, res, keep_origin = TRUE)
  } else {
    warning("Some thing wrong with your words, make the output length not equal to the input length")
    res1 <- res
  }
  return(res1)
}

baidu_translate <- function(x, from = "en", to = "zh", pcutils_config = show_pcutils_config()) {
  lib_ps("openssl", "httr", "jsonlite", library = FALSE)
  water <- sample.int(4711, 1)
  sign <- sprintf("%s%s%s%s", pcutils_config$baidu_appid, x, water, pcutils_config$baidu_key)
  sign2 <- openssl::md5(sign)

  .query <- list(
    q = x, from = from, to = to,
    appid = pcutils_config$baidu_appid,
    salt = water, sign = sign2
  )

  url <- httr::modify_url("http://api.fanyi.baidu.com/api/trans/vip/translate",
    query = .query
  )
  url <- url(url, encoding = "utf-8")
  res <- jsonlite::fromJSON(url)

  return(res$trans_result$dst)
}

#' Split text into parts, each not exceeding a specified character count
#'
#' @param text Original text
#' @param nchr_each Maximum character count for each part
#' @return List of divided parts
#' @export
#'
#' @examples
#' \donttest{
#' original_text <- paste0(sample(c(letters, "\n"), 400, replace = TRUE), collapse = "")
#' parts <- split_text(original_text, nchr_each = 200)
#' lapply(parts, nchar)
#' }
split_text <- function(text, nchr_each = 200) {
  # Split the text by newline characters
  parts <- strsplit(text, "\n+")[[1]]

  # Initialize the result list
  result <- list()

  # Loop through each part to ensure each does not exceed the specified character count
  current_part <- parts[1]

  for (part in parts[-1]) {
    if (nchar(current_part) > nchr_each) message("Characters number of this paragraph is more than ", nchr_each)
    if (nchar(current_part) + nchar(part) <= nchr_each) {
      # If adding the current part to the new part does not exceed the specified character count,
      # merge them into the current part
      current_part <- paste(current_part, part, sep = "\n")
    } else {
      # If it exceeds the specified character count, add the current part to the result list
      # and start a new part
      result <- c(result, current_part)
      current_part <- part
    }
  }

  # Add the last part to the result list
  result <- c(result, current_part)

  # Return the result
  return(result)
}


#' Download genome files from NCBI based on accession number
#'
#' This function downloads specific genomic files from NCBI's FTP server
#' based on the provided accession number. It supports downloading
#' different types of files, or the entire directory containing the files.
#'
#' @param accession A character string representing the NCBI accession number
#'        (e.g., "GCF_001036115.1_ASM103611v1" or "GCF_001036115.1"). The accession can start with
#'        "GCF" or "GCA".
#' @param out_dir A character string representing the directory where the
#'        downloaded files will be saved. Defaults to the current working directory (".").
#' @param type A character string representing the type of file to download.
#'        Supported types are "all", "gff", "fna". If "all" is specified,
#'        the function will prompt the user to use command line tools to download
#'        the entire directory. Defaults to "gff".
#' @param file_suffix A character string representing the specific file suffix to download.
#'        If specified, this will override the `type` parameter. Defaults to NULL.
#' @param timeout A numeric value representing the maximum time in seconds to wait for the download. Defaults to 300.
#'
#' @details
#' If the provided `accession` does not contain the version suffix (e.g., "GCF_001036115.1"),
#' the function will query the NCBI FTP server to determine the full accession name.
#'
#' When `type` is set to "all", the function cannot download the entire directory
#' directly but provides a command line example for the user to download the directory
#' using tools like `wget`.
#' @return No value
#' @examples
#' \dontrun{
#' download_ncbi_genome_file("GCF_001036115.1", out_dir = "downloads", type = "gff")
#' download_ncbi_genome_file("GCF_001036115.1", out_dir = "downloads", file_suffix = "_genomic.fna.gz")
#' }
#'
#' @export
download_ncbi_genome_file <- function(accession, out_dir = ".", type = "gff", file_suffix = NULL, timeout = 300) {
  lib_ps("httr", library = FALSE)
  # 基础URL
  base_url <- "https://ftp.ncbi.nlm.nih.gov/genomes/all/"

  # 确定是GCA还是GCF开头
  prefix <- substr(accession, 1, 3)
  if (!(prefix %in% c("GCF", "GCA"))) {
    stop("Accession must start with 'GCF' or 'GCA'")
  }
  # 确定有无版本号
  if (substr(accession, 14, 14) != ".") {
    stop("Accession must have version number, such as GCF_001036115.1")
  }

  # 提取数字部分并格式化路径
  numbers <- gsub("GCF_|GCA_|\\..*", "", accession)
  formatted_path <- paste(substr(numbers, 1, 3), substr(numbers, 4, 6), substr(numbers, 7, 9), sep = "/")

  # 基础路径
  base_path <- paste0(base_url, prefix, "/", formatted_path, "/")

  # 如果accession不包含详细版本后缀,则需要查找完整路径
  if (!grepl("_", gsub("GCF_|GCA_", "", accession))) {
    res <- httr::GET(base_path)
    if (httr::status_code(res) != 200) {
      stop("Unable to access the directory: ", base_path)
    }
    # 提取文件夹列表
    content <- httr::content(res, "text")
    folders <- regmatches(content, gregexpr(paste0(accession, "_[^/]+"), content))
    if (length(folders[[1]]) == 0) {
      stop("No matching folder found for accession: ", accession)
    }
    # 取第一个匹配的文件夹名
    full_accession <- folders[[1]][1]
  } else {
    full_accession <- accession
  }

  # 如果指定了file_suffix,则直接使用file_suffix
  if (!is.null(file_suffix)) {
    file_name <- paste0(full_accession, file_suffix)
  } else {
    # 根据type确定文件后缀
    file_suffix <- switch(type,
      "all" = "",
      "gff" = "_genomic.gff.gz",
      "fna" = "_genomic.fna.gz",
      "gbff" = "_genomic.gbff.gz",
      "gtf" = "_genomic.gtf.gz",
      "faa" = "_protein.faa.gz",
      "gpff" = "_genomic.gpff.gz",
      stop("Unsupported type: ", type)
    )
    file_name <- if (type == "all") "" else paste0(full_accession, file_suffix)
  }

  # 生成最终的URL
  full_url <- paste0(base_path, full_accession, "/", file_name)

  # 创建输出目录(如果不存在)
  if (!dir.exists(out_dir)) {
    dir.create(out_dir, recursive = TRUE)
  }

  if (type == "all") {
    # 下载整个文件夹
    message("Downloading the entire folder is not directly supported in R. You can use command line tools like wget or rsync for this purpose.")
    message("Example command: wget -r -np -nH --cut-dirs=5 ", full_url, "\n")
    return(full_url)
  } else {
    # 下载单个文件
    dest_file <- file.path(out_dir, basename(file_name))
    tryCatch(
      {
        download2(full_url, dest_file, timeout = timeout, mode = "wb")
        message("File downloaded successfully: ", dest_file)
      },
      error = function(e) {
        message("Error in downloading file: ", e)
      }
    )
  }
}

Try the pcutils package in your browser

Any scripts or data that you put into this service are public.

pcutils documentation built on June 26, 2024, 1:06 a.m.