R/para_read_data.R
In parasiteR: A Theorical-Practical Approach to Parasitological Data Analysis

Documented in para_read_data

#' Read parasite data
#'
#' Load data from a .CSV file
#'
#' This package includes a specific function to import tables (.CSV files) into the R environment. Each row in the table should correspond to an individual host that was analyzed, while the columns may contain both quantitative and qualitative variables.
#' Columns may represent two principal categories of variables:
#' \itemize{
#'   \item "Host-related variables": Encompassing metadata such as the site of specimen collection, host species, morphophysiological traits, applied experimental treatments, and other relevant descriptors.
#'   \item "Parasite-related variables": Denoting parasite abundance per host, typically structured across multiple columns corresponding to the finest available taxonomic resolution (e.g., species, genus, family, order).
#' }
#' Parasite abundance values must be encoded as non-negative integers. It is critical to distinguish between the following:
#' \itemize{
#'   \item 0: Represents a confirmed absence of the parasite in the host specimen.
#'   \item NA: Indicates that parasite detection or quantification was not feasible due to methodological or technical limitations.
#' }
#'
#' @usage
#' para_read_data(file_name, verbose = FALSE)
#'
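#' @param file_name Path to the .CSV table file to import. Columns may be separated by either ";" or ","; the separator is detected from the header line.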
#' @param verbose A logical value indicating if progress messages should be given.
#'
#' @return A list with the following elements:
#' \item{dataset}{The imported table, with character columns converted to factors, that can be used as input for other parasiteR functions.}
#' \item{factors_v}{A character vector with the names of the factor columns.}
#' \item{num_v}{A character vector with the names of the numeric columns.}
#' \item{summ}{A summary of the loaded data, as returned by \code{summary()}.}
#' @author Juan Manuel Cabrera, Exequiel Furlan and Elisa Helman
#'
#' @export

para_read_data <- function(file_name, verbose = FALSE)
{
  # Import a delimited text table and describe its columns.
  #
  # The column separator is detected from the header line only: ";" is tried
  # first (the file is then read with readr::read_csv2()), "," second (the
  # file is then read with readr::read_csv()). Character columns of the
  # imported table are converted to factors.
  #
  # Arguments:
  #   file_name  Path to the table file (a single character string).
  #   verbose    If TRUE, progress messages are emitted and readr is asked
  #              to show the column types.
  #
  # Returns a list with elements `dataset` (the imported table), `factors_v`
  # and `num_v` (names of the factor / numeric columns) and `summ`
  # (summary() of the table).
  #
  # Stops when `file_name` is not a single string, when the file does not
  # exist or is empty, or when neither ";" nor "," splits the header line
  # into more than one field.

  if (!is.character(file_name) || length(file_name) != 1L || is.na(file_name)) {
    stop("'file_name' must be a single character string.")
  }

  if (!file.exists(file_name)) {
    stop("File not found. Please check file name.")
  }

  # Only the header line is needed to detect the separator.
  header_line <- readLines(file_name, n = 1L)

  if (length(header_line) == 0L) {
    stop("File is empty. Please check file contents.")
  }

  if (verbose) {
    message("Checking table format...")
  }

  # Number of fields (columns) found in the header line when it is split
  # with `sep`. The text connection is always closed, even on error, so no
  # connection is leaked. Quote and comment settings mirror the readr
  # defaults (double quote only, no comment character) so that apostrophes
  # or "#" in column names do not distort the count.
  count_header_fields <- function(sep) {
    con <- textConnection(header_line)
    on.exit(close(con), add = TRUE)
    n_fields <- utils::count.fields(con, sep = sep, quote = "\"",
                                    comment.char = "")
    # A blank or unparsable header yields no usable count; treat it as
    # "separator not found" instead of failing inside `if`.
    if (length(n_fields) != 1L || is.na(n_fields)) {
      return(1L)
    }
    n_fields
  }

  # TODO (from the original authors): review the decimal-separator check;
  # it is currently implied by the column separator (read_csv2 vs read_csv).

  # A single field means the header could not be split with that separator:
  # try ";" first and fall back to ",".
  if (count_header_fields(";") > 1L) {
    dataset <- readr::read_csv2(file = file_name, col_names = TRUE,
                                trim_ws = TRUE, show_col_types = verbose)
  } else if (count_header_fields(",") > 1L) {
    dataset <- readr::read_csv(file = file_name, col_names = TRUE,
                               trim_ws = TRUE, show_col_types = verbose)
  } else {
    stop("Could not identify column separator character.")
  }

  # TODO (from the original authors): detect host columns that contain
  # only NA values.

  if (verbose) {
    message("Data was succesfully read!")
  }

  # Convert every character column to a factor.
  is_chr <- vapply(dataset, is.character, logical(1))
  dataset[is_chr] <- lapply(dataset[is_chr], as.factor)

  is_fct <- vapply(dataset, is.factor, logical(1))
  is_num <- vapply(dataset, is.numeric, logical(1))

  list(dataset = dataset,
       factors_v = colnames(dataset[is_fct]),
       num_v = colnames(dataset[is_num]),
       summ = summary(dataset))
}