R/genlight.R

Defines functions tidy_genlight

Documented in tidy_genlight

# tidy_genlight ----------------------------------------------------------------
#' @name tidy_genlight
#' @title Tidy a genlight object to a tidy dataframe and/or GDS object/file
#' @description Tidy genlight object to a tidy dataframe and/or GDS object/file.
#' Used internally in \href{https://github.com/thierrygosselin/radiator}{radiator}
#' and might be of interest for users.

#' @param data (path or object) A genlight object in the global environment or
#' path to a genlight file that will be open with \code{readRDS}.

#' @inheritParams radiator_common_arguments

#' @param tidy (logical) Generate a tidy dataset.
#' Default: \code{tidy = TRUE}.

#' @param gds (optional, logical) To write a radiator gds object.
#' Default: \code{gds = TRUE}.

#' @param write (optional, logical) To write in the working directory the tidy
#' data. The file is written with \code{radiator_genlight_DATE@TIME.rad}.
#' Default: \code{write = FALSE}.

#' @export
#' @rdname tidy_genlight

#' @references Jombart T (2008) adegenet: a R package for the multivariate
#' analysis of genetic markers. Bioinformatics, 24, 1403-1405.
#' @references Jombart T, Ahmed I (2011) adegenet 1.3-1:
#' new tools for the analysis of genome-wide SNP data.
#' Bioinformatics, 27, 3070-3071.

#' @details
#' A string of the same dimension is generated when genlight:
#' \enumerate{
#' \item \code{is.null(genlight@pop)}: pop will be integrated
#' in the tidy dataset.
#' \item \code{is.null(data@chromosome)}: CHROM1 will be integrated
#' in the tidy dataset.
#' \item \code{is.null(data@loc.names)}: LOCUS1 to dim(genlight)[2]
#' will be integrated in the tidy dataset.
#' \item \code{is.null(data@position)}: an integer string of
#' length = dim(genlight)[2] will be integrated in the tidy dataset.
#' }
#'
#' \strong{Note: that if all CHROM, LOCUS and POS is missing the function will be terminated}


#' @author Thierry Gosselin \email{thierrygosselin@@icloud.com}


tidy_genlight <- function(
    data,
    tidy = TRUE,
    gds = TRUE,
    write = FALSE,
    verbose = FALSE,
    parallel.core = parallel::detectCores() - 1
) {
  # Test
  # data = "radiator_genlight_20191211@1836.RData"
  # tidy = TRUE
  # gds = TRUE
  # write = FALSE
  # verbose = TRUE
  # parallel.core = 12L


  # Package requirement --------------------------------------------------------
  radiator_packages_dep(package = "adegenet")

  # Checking for missing and/or default arguments ------------------------------
  if (missing(data)) rlang::abort("data argument required")

  # Import data ---------------------------------------------------------------
  if (is.vector(data)) data <- readRDS(data)
  if (class(data)[1] != "genlight") rlang::abort("Input is not a genlight object")

  if (verbose) message("genlight info:")
  # strata ?
  strata <- tibble::tibble(INDIVIDUALS = data@ind.names)
  if (is.null(data@pop)) {
    if (verbose) message("    strata: no")
    if (verbose) message("    'pop' will be added")
    strata %<>% dplyr::mutate(STRATA = "pop")
  } else {
    if (verbose) message("    strata: yes")
    strata$STRATA = data@pop
  }

  n.markers <- dim(data)[2]
  n.ind <- nrow(strata)

  # Chromosome ?
  if (is.null(data@chromosome)) {
    if (verbose) message("    Chromosome/contig/scaffold: no")
    data@chromosome <- factor(rep("CHROM1", n.markers))
    chrom.info <- FALSE
  } else {
    if (verbose) message("    Chromosome/contig/scaffold: yes")
    chrom.info <- TRUE
  }

  # Locus ?
  if (is.null(data@loc.names)) {
    if (verbose) message("    Locus: no")
    locus.info <- FALSE
    data@loc.names <- stringi::stri_join("LOCUS", seq(from = 1, to = n.markers, by = 1))
  } else {
    if (verbose) message("    Locus: yes")
    locus.info <- TRUE
  }

  # POS ?
  if (is.null(data@position)) {
    if (verbose) message("    POS: no")
    pos.info <- FALSE
    # data@position <- rlang::as_integer(seq(from = 1, to = n.markers, by = 1))
    data@position <- vctrs::vec_cast(x = seq(from = 1, to = n.markers, by = 1), to = integer())
  } else {
    if (verbose) message("    POS: yes")
    pos.info <- TRUE
  }


  if (!chrom.info && !locus.info && !pos.info) {
    rlang::abort("Tidying the genlight requires at least one of these 3 markers metadata:
       CHROM (genlight@chromosome), LOCUS (genlight@loc.names) or POS (genlight@position)")
  }


  # markers
  markers <- tibble::tibble(
    CHROM = data@chromosome,#adegenet::chromosome(data),
    LOCUS = data@loc.names,#adegenet::locNames(data),
    POS = data@position#adegenet::position(data)
  ) %>%
    dplyr::mutate(dplyr::across(tidyselect::everything(), .fns = as.character)) %>%
    dplyr::mutate(dplyr::across(tidyselect::everything(), .fns = radiator::clean_markers_names)) %>%
    tidyr::unite(data = ., col = MARKERS, CHROM, LOCUS, POS, sep = "__", remove = FALSE) %>%
    dplyr::select(MARKERS, CHROM, LOCUS, POS)

  # Nuc info
  nuc.data <- data@loc.all
  if (!is.null(nuc.data)) {
    markers %<>%
      dplyr::mutate(
        REF = stringi::stri_sub(str = nuc.data, from = 1, to = 1),
        ALT = stringi::stri_sub(str = nuc.data, from = 3, to = 3)
      )
  }

  if (gds) tidy <- TRUE

  if (tidy) {
    if (write) {
      filename.temp <- generate_filename(extension = "rad")
      filename.short <- filename.temp$filename.short
      filename.genlight <- filename.temp$filename
    }

    want <- c("MARKERS", "CHROM", "LOCUS", "POS", "REF", "ALT","STRATA", "INDIVIDUALS",
              "GT_VCF", "GT_BIN", "GT")

    if (verbose) message("Generating tidy data...")
    tidy.data <- data.frame(data) %>%
      magrittr::set_colnames(x = ., value = markers$MARKERS) %>%
      tibble::add_column(.data = ., INDIVIDUALS = rownames(.), .before = 1) %>%
      radiator::rad_long(
        x = .,
        cols = "INDIVIDUALS",
        names_to = "MARKERS",
        values_to = "GT_BIN"
      ) %>%
      dplyr::full_join(markers, by = "MARKERS") %>%
      dplyr::full_join(strata, by =  "INDIVIDUALS") %>%
      dplyr::mutate(
        INDIVIDUALS = radiator::clean_ind_names(INDIVIDUALS),
        STRATA = radiator::clean_pop_names(STRATA)
      ) %>%
      dplyr::arrange(MARKERS, STRATA, INDIVIDUALS) %>%
      dplyr::select(tidyselect::any_of(want)) %>%
      radiator::calibrate_alleles(
        data = .,
        biallelic = TRUE,
        verbose = verbose
      ) %$%
      input

    if (write) radiator::write_rad(data = tidy.data, path = filename.genlight, verbose = verbose)

  }#End tidy genlight

  if (gds) {
    # generate the GDS --------------------------------------------------------------
    # markers %<>% dplyr::mutate(VARIANT_ID = as.integer(factor(MARKERS)))
    gds.filename <- radiator_gds(
      data.source = "genlight",
      geno.coding = "alt.dos",
      genotypes = gt2array(
        gt.bin = tidy.data$GT_BIN,
        n.ind = n.ind,
        n.snp = n.markers
      ),
      # genotypes = tibble::as_tibble(data.frame(data) %>% t) %>%
      #   tibble::add_column(.data = ., MARKERS = markers$MARKERS, .before = 1) %>%
      #   dplyr::arrange(MARKERS) %>%
      #   tibble::column_to_rownames(.data = ., var = "MARKERS"),
      strata = strata,
      biallelic = TRUE,
      markers.meta = markers,
      filename = NULL,
      verbose = verbose
    )
    # if (verbose) message("Written: GDS filename: ", gds.filename)
  }# End gds genlight

  if (tidy) {
    return(tidy.data)
  } else {
    message("returning GDS filename")
    return(gds.filename)
  }
} # End tidy_genlight

# write_genlight ----------------------------------------------------------------
#' @name write_genlight
#' @title Write a genlight object from a tidy data frame or GDS file or object.
#' @description Write a genlight object from a tidy data frame or GDS file or object.
#' Used internally in \href{https://github.com/thierrygosselin/radiator}{radiator}
#' and might be of interest for users.

#' @inheritParams radiator_common_arguments

#' @param biallelic (logical, optional) If you already know that the data is
#' biallelic use this argument to speed up the function.
#' Default: \code{biallelic = TRUE}.

#' @param write (logical, optional) To write in the working directory the genlight
#' object. The file is written with \code{radiator_genlight_DATE@TIME.RData} and
#' can be open with load or readRDS.
#' Default: \code{write = FALSE}.

#' @param dartr (logical, optional) For non-dartR users who wants to have a genlight
#' object ready for the package. This option transfer or generates:
#' \code{CALL_RATE, AVG_COUNT_REF, AVG_COUNT_SNP, REP_AVG,
#' ONE_RATIO_REF, ONE_RATIO_SNP}. These markers metadata are stored into
#' the genlight slot: \code{genlight.obj@other$loc.metrics}.
#' \strong{Use the radiator generated GDS data for best result}.
#' Default: \code{dartr = FALSE}.

#' @export
#' @rdname write_genlight

#' @references Jombart T (2008) adegenet: a R package for the multivariate
#' analysis of genetic markers. Bioinformatics, 24, 1403-1405.
#' @references Jombart T, Ahmed I (2011) adegenet 1.3-1:
#' new tools for the analysis of genome-wide SNP data.
#' Bioinformatics, 27, 3070-3071.

#' @examples
#' \dontrun{
#' # With defaults:
#' turtle <- radiator::write_genlight(data = "my.radiator.gds.rad")
#'
#' # Write gl object in directory:
#' turtle <- radiator::write_genlight(data = "my.radiator.gds.rad", write = TRUE)
#'
#' # Generate a dartR ready genlight and verbose = TRUE:
#' turtle <- radiator::write_genlight(
#'     data = "my.radiator.gds.rad",
#'     write = TRUE,
#'     dartr = TRUE,
#'     verbose = TRUE
#'  )
#' }

#' @author Thierry Gosselin \email{thierrygosselin@@icloud.com}


write_genlight <- function(
    data,
    biallelic = TRUE,
    write = FALSE,
    dartr = FALSE,
    verbose = FALSE,
    parallel.core = parallel::detectCores() - 2
) {


  # NOTE: Make sure it's the same levels.... id, markers etc...
  # TEST
  # biallelic = TRUE
  # write = TRUE
  # dartr = TRUE
  # verbose = TRUE
  # parallel.core = parallel::detectCores() - 1

  # Checking for missing and/or default arguments ------------------------------
  radiator_packages_dep(package = "adegenet")
  if (missing(data)) rlang::abort("Input file missing")
  if (verbose) message("Generating genlight...")

  # File type detection---------------------------------------------------------
  data.type <- radiator::detect_genomic_format(data)

  # Import data ---------------------------------------------------------------
  if (data.type %in% c("SeqVarGDSClass", "gds.file")) {
    # Check that SeqArray is installed (it requires automatically: SeqArray and gdsfmt)
    radiator_packages_dep(package = "SeqArray", cran = FALSE, bioc = TRUE)

    if (data.type == "gds.file") {
      data <- radiator::read_rad(data, verbose = verbose)
    }
    biallelic <- radiator::detect_biallelic_markers(data)# faster with GDS
    if (!biallelic) rlang::abort("genlight object requires biallelic genotypes")

    data.source <- radiator::extract_data_source(data)
    # markers.meta <- extract_markers_metadata(gds = data, whitelist = TRUE)
    data.bk <- data
    # data.bk -> data
    if (dartr && !any(unique(c("1row", "2rows") %in% data.source))) {
      # counts data and data with read depth alleles depth info...
      if (!"counts" %in% data.source) {
        genotypes.meta <- radiator::extract_genotypes_metadata(gds = data, whitelist = TRUE)
      } else {
        genotypes.meta <- NULL
      }
      data <- gds2tidy(gds = data.bk,
                       wide = FALSE,
                       parallel.core = parallel.core,
                       calibrate.alleles = FALSE)
      markers.levels <- unique(data$MARKERS)
      ind.levels <- unique(data$INDIVIDUALS)

      if (!"counts" %in% data.source) {
        # genotypes.meta <- extract_coverage(# Note to my self... need to adapt the code here for the new extract function when you have the time
        #   gds = data.bk,
        #   individuals = FALSE,
        #   depth.tibble = TRUE,
        #   parallel.core = parallel.core
        # )
        # suppressWarnings(
        #   data %<>%
        #     dplyr::left_join(genotypes.meta, by = c("INDIVIDUALS", "MARKERS"))
        # )
      }

      genotypes.meta <- NULL
      want <- c("MARKERS", "CHROM", "LOCUS", "POS",
                "REF", "ALT",
                "CALL_RATE", "AVG_COUNT_REF", "AVG_COUNT_SNP", "REP_AVG",
                "ONE_RATIO_REF", "ONE_RATIO_SNP")
      markers.meta <- data %>%
        dplyr::select(tidyselect::any_of(want)) %>%
        dplyr::distinct(MARKERS, .keep_all = TRUE)
    } else {
      data <- radiator::extract_individuals_metadata(
        gds = data, ind.field.select = c("INDIVIDUALS", "STRATA"), whitelist = TRUE) %>%
        dplyr::bind_cols(
          SeqArray::seqGetData(
            gdsfile = data, var.name = "$dosage_alt") %>%
            magrittr::set_colnames(x = ., value = markers.meta$MARKERS) %>%
            tibble::as_tibble(.)
        ) %>%
        dplyr::rename(POP_ID = STRATA)
    }

    # dartR-----------------------------------------------------------------------
    if (dartr) {
      # That bits of code below generate what's nenessary for dartR
      if (verbose) message("Calculating read depth for each SNP\n")
      if ("dart" %in% data.source) {
        # if (any(unique(c("1row", "2rows") %in% data.source))) {
        markers.meta %<>%
          dplyr::mutate(
            N_IND = SeqArray::seqApply(
              gdsfile = data.bk,
              var.name = "$dosage_alt",
              FUN = function(g) length(g[!is.na(g)]),
              margin = "by.variant", as.is = "integer",
              parallel = parallel.core),
            rdepth = round(
              ((N_IND * ONE_RATIO_REF * AVG_COUNT_REF) +
                 (N_IND * ONE_RATIO_SNP * AVG_COUNT_SNP)) / N_IND
            )
          ) %>%
          dplyr::ungroup(.)
        data.bk <- NULL
      } else {
        if (rlang::has_name(data, "READ_DEPTH")) {
          # dart 2 rows counts...
          if (rlang::has_name(markers.meta, "AVG_COUNT_REF")) {

            #NOTE TO MYSELF: will have to keep REP_AVG from count somehow... to do
            # for now, I expect people will use this with filtered data so not important
            # to fill with 1
            if ("counts" %in% data.source) rep.avg <- markers.meta$REP_AVG
            not.wanted <- c("CALL_RATE", "AVG_COUNT_REF", "AVG_COUNT_SNP",
                            "REP_AVG", "ONE_RATIO_REF", "ONE_RATIO_SNP")
            markers.meta %<>% dplyr::select(-tidyselect::any_of(not.wanted))
          }

          suppressWarnings(
            markers.meta %<>%
              dplyr::left_join(
                data %>%
                  dplyr::group_by(MARKERS) %>%
                  dplyr::summarise(
                    rdepth = mean(READ_DEPTH, na.rm = TRUE),
                    AVG_COUNT_REF = mean(ALLELE_REF_DEPTH, na.rm = TRUE),
                    AVG_COUNT_SNP = mean(ALLELE_ALT_DEPTH, na.rm = TRUE),
                    CALL_RATE = length(INDIVIDUALS[!is.na(GT_BIN)]) / length(INDIVIDUALS),
                    ONE_RATIO_REF = length(INDIVIDUALS[GT_BIN == 0]) + length(INDIVIDUALS[GT_BIN == 1]) / length(INDIVIDUALS),
                    ONE_RATIO_SNP = length(INDIVIDUALS[GT_BIN == 2]) + length(INDIVIDUALS[GT_BIN == 1]) / length(INDIVIDUALS)
                  ) %>%
                  dplyr::mutate(REP_AVG = 1L)
                , by = "MARKERS"
              ) %>%
              dplyr::ungroup(.)
          )
          genotypes.meta <- NULL
          #Note to myself: will have to remove duplicated info in code between genotypes.meta and data...
          if ("counts" %in% data.source) {
            markers.meta$REP_AVG <- rep.avg
          }
        }
      }

      # ~25 times slower
      # Calculate Read Depth (from Arthur Georges)
      # gl.obj@other$loc.metrics$rdepth <- array(NA, adegenet::nLoc(gl.obj))
      # for (i in 1:adegenet::nLoc(gl.obj)){
      #   called.ind <- round(adegenet::nInd(gl.obj) * gl.obj@other$loc.metrics$CallRate[i],0)
      #   ref.count <- called.ind * gl.obj@other$loc.metrics$OneRatioRef[i]
      #   alt.count <- called.ind * gl.obj@other$loc.metrics$OneRatioSnp[i]
      #   sum.count.ref <- ref.count * gl.obj@other$loc.metrics$AvgCountRef[i]
      #   sum.count.alt <- ref.count * gl.obj@other$loc.metrics$AvgCountSnp[i]
      #   gl.obj@other$loc.metrics$rdepth[i] <- round((sum.count.alt + sum.count.ref) / called.ind, 1)
      # }

      # gl.obj@other$loc.metrics$rdepth
      data %<>%
        dplyr::select(MARKERS, STRATA, INDIVIDUALS, GT_BIN) %>%
        dplyr::mutate(GT_BIN = as.integer(GT_BIN)) %>%
        radiator::rad_wide(
          x = .,
          formula = "STRATA + INDIVIDUALS ~ MARKERS",
          values_from = "GT_BIN"
        )
    }#End dartr

    data.type <- "tbl_df"
  } else {
    # Tidy data
    # if (rlang::has_name(data, "STRATA")) data %<>% dplyr::rename(POP_ID = STRATA)
    want <- c("MARKERS", "CHROM", "LOCUS", "POS", "STRATA", "INDIVIDUALS",
              "REF", "ALT", "GT_VCF", "GT_BIN",
              "CALL_RATE", "AVG_COUNT_REF", "AVG_COUNT_SNP", "REP_AVG",
              "ONE_RATIO_REF", "ONE_RATIO_SNP")
    data %<>%
      radiator::tidy_wide(data = ., import.metadata = TRUE) %>%
      dplyr::select(tidyselect::any_of(want)) %>%
      dplyr::arrange(STRATA, INDIVIDUALS)

    # Detect if biallelic data ---------------------------------------------------
    if (is.null(biallelic)) biallelic <- radiator::detect_biallelic_markers(data)
    if (!biallelic) rlang::abort("genlight object requires biallelic genotypes")
    want <- c("MARKERS", "CHROM", "LOCUS", "POS", "REF", "ALT",
              "CALL_RATE", "AVG_COUNT_REF", "AVG_COUNT_SNP", "REP_AVG",
              "ONE_RATIO_REF", "ONE_RATIO_SNP")
    data %<>% dplyr::arrange(MARKERS, STRATA, INDIVIDUALS)
    markers.meta <- dplyr::select(data, tidyselect::any_of(want)) %>%
      dplyr::distinct(MARKERS, .keep_all = TRUE) %>%
      separate_markers(
        data = .,
        sep = "__",
        markers.meta.all.only = TRUE,
        biallelic = TRUE,
        verbose = verbose)

    if (!rlang::has_name(data, "GT_BIN") && rlang::has_name(data, "GT_VCF")) {
      data$GT_BIN <- stringi::stri_replace_all_fixed(
        str = data$GT_VCF,
        pattern = c("0/0", "1/1", "0/1", "1/0", "./."),
        replacement = c("0", "2", "1", "1", NA),
        vectorize_all = FALSE
      )
    }

    data %<>%
      dplyr::select(MARKERS, STRATA, INDIVIDUALS, GT_BIN) %>%
      dplyr::mutate(GT_BIN = as.integer(GT_BIN)) %>%
      radiator::rad_wide(
        x = .,
        formula = "STRATA + INDIVIDUALS ~ MARKERS",
        values_from = "GT_BIN"
      )
  }# End tidy data

  # Generate genlight
  parallel.core.temp <- FALSE
  if (length(markers.meta$MARKERS) > 10000) parallel.core.temp <- parallel.core

  # longer
  # gl.obj2 <- methods::new(
  #   "genlight",
  #   gen = data[,-(1:2)],
  #   ploidy = 2,
  #   ind.names = data$INDIVIDUALS,
  #   chromosome = markers.meta$CHROM,
  #   loc.names = markers.meta$LOCUS,
  #   position = markers.meta$POS,
  #   pop = data$POP_ID,
  #   parallel = parallel.core.temp)
  # tictoc::toc()


  gl.obj <- methods::new(
    "genlight",
    data[,-(1:2)],
    parallel = parallel.core.temp
  )
  adegenet::indNames(gl.obj)   <- data$INDIVIDUALS
  adegenet::pop(gl.obj)        <- data$STRATA
  adegenet::chromosome(gl.obj) <- markers.meta$CHROM
  adegenet::locNames(gl.obj)   <- markers.meta$LOCUS
  adegenet::position(gl.obj)   <- markers.meta$POS

  if (dartr) {
    gl.obj@other$loc.metrics <- markers.meta %>%
      dplyr::select(
        OneRatioRef = ONE_RATIO_REF,
        OneRatioSnp = ONE_RATIO_SNP,
        AvgCountRef = AVG_COUNT_REF,
        AvgCountSnp = AVG_COUNT_SNP,
        CallRate = CALL_RATE,
        RepAvg = REP_AVG,
        rdepth
      )
  }#End dartr


  # Check
  # gl.obj@n.loc
  # gl.obj@ind.names
  # gl.obj@chromosome
  # gl.obj@position
  # length(gl.obj@position)
  # gl.obj@loc.names
  # length(gl.obj@loc.names)
  # gl.obj@pop
  # gl.obj@strata
  # adegenet::nLoc(gl.obj)
  # adegenet::popNames(gl.obj)
  # adegenet::indNames(gl.obj)
  # adegenet::nPop(gl.obj)
  # adegenet::NA.posi(gl.obj)
  # names(gl.obj@other$loc.metrics)


  if (write) {
    filename.temp <- generate_filename(extension = "genlight")
    filename.short <- filename.temp$filename.short
    filename.genlight <- filename.temp$filename
    saveRDS(object = gl.obj, file = filename.genlight)
    if (verbose) message("File written: ", filename.short)
  }
  return(gl.obj)
} # End write_genlight
thierrygosselin/radiator documentation built on May 5, 2024, 5:12 a.m.