# Read PLINK -------------------------------------------------------------------
#' @name read_plink
#' @title Reads PLINK tped and bed files
#' @description The function reads PLINK tped and bed files.
#' radiator prefers the use of BED file. These files are converted to
#' a connection SeqArray \href{https://github.com/zhengxwen/SeqArray}{SeqArray}
#' GDS object/file of class \code{SeqVarGDSClass} (Zheng et al. 2017).
#' The Genomic Data Structure (GDS) file format is detailed in
#' \href{https://github.com/zhengxwen/gdsfmt}{gdsfmt}.
#'
#' Used internally in \href{https://github.com/thierrygosselin/radiator}{radiator}
#' and might be of interest for users.
#' @param data The PLINK file.
#' \itemize{
#' \item {bi-allelic data only}. For haplotypes use VCF.
#' \item \code{tped} file format: the corresponding \code{tfam} file must be in the directory.
#' \item \code{bed} file format: IS THE PREFERRED format, the corresponding
#' \code{fam} and \code{bim} files must be in the directory.
#' }
#' @param filename (optional) The file name of the Genomic Data Structure (GDS) file.
#' radiator will append \code{.gds.rad} to the filename.
#' If the filename chosen exists in the working directory,
#' the default \code{radiator_datetime.gds} is chosen.
#' Default: \code{filename = NULL}.
#' @inheritParams tidy_genomic_data
#' @inheritParams radiator_common_arguments
#' @details
#' Large PLINK files will require the use of BED plink format. Look below
#' in the example for conversion with PLINK.
#'
#' Large PLINK bed files will take longer to import and transform in GDS, but
#' after the file is generated, you can close your computer and
#' come back to it a month later and it's now a matter of sec to open a connection!
#### To do ....
# @section Advance mode:
#
# \emph{dots-dots-dots ...} allows to pass several arguments for fine-tuning the function:
# \enumerate{
# \item \code{path.folder}: to write ouput in a specific path
# (used internally in radiator). Default: \code{path.folder = getwd()}.
# If the supplied directory doesn't exist, it's created.
# \item \code{random.seed}: (integer, optional) For reproducibility, set an integer
# that will be used inside codes that uses randomness. With default,
# a random number is generated, printed and written in the directory.
# Default: \code{random.seed = NULL}.
# \item \code{subsample.markers.stats}: By default, when no filters are
# requested and that the number of markers is > 200K,
# 0.2 of markers are randomly selected to generate the
# statistics (individuals and markers). This is an all-around and
# reliable number.
# In doubt, overwrite this value by using 1 (all markers selected) and
# expect a small computational cost.
# }
#' @return
#' For \code{tped} the function returns a list object with the non-modified \code{tped} and
#' the strata corresponding to the \code{tfam}.
#' With \code{bed}, the function returns a GDS object.
#' @export
#' @rdname read_plink
#' @author Thierry Gosselin \email{thierrygosselin@@icloud.com}
#' @references Zheng X, Gogarten S, Lawrence M, Stilp A, Conomos M, Weir BS,
#' Laurie C, Levine D (2017). SeqArray -- A storage-efficient high-performance
#' data format for WGS variant calls.
#' Bioinformatics.
#'
#' @references
#' PLINK: a tool set for whole-genome association and population-based linkage
#' analyses.
#' American Journal of Human Genetics. 2007: 81: 559–575. doi:10.1086/519795
#' @examples
#' \dontrun{
#' data <- radiator::read_plink(data = "my_plink_file.bed")
#' # when conversion is required from TPED to BED, in Terminal:
#' # plink --tfile my_plink_file --make-bed --allow-no-sex --allow-extra-chr --chr-set 95
#' }
#' @seealso
#' \href{https://www.cog-genomics.org/plink/1.9/}{PLINK}
read_plink <- function(
data,
filename = NULL,
parallel.core = parallel::detectCores() - 1,
verbose = TRUE,
...
) {
# #Test
# filename <- NULL
# parallel.core <- parallel::detectCores() - 1
# verbose <- TRUE
# Cleanup---------------------------------------------------------------------
radiator_function_header(f.name = "read_plink", verbose = verbose)
file.date <- format(Sys.time(), "%Y%m%d@%H%M")
if (verbose) message("Execution date@time: ", file.date)
old.dir <- getwd()
opt.change <- getOption("width")
options(future.globals.maxSize = Inf)
options(width = 70)
timing <- radiator_tic()
res <- list()
#back to the original directory and options
on.exit(setwd(old.dir), add = TRUE)
on.exit(options(width = opt.change), add = TRUE)
on.exit(if (verbose) radiator_toc(timing), add = TRUE)
on.exit(radiator_function_header(f.name = "read_plink", start = FALSE, verbose = verbose), add = TRUE)
# Checking for missing and/or default arguments ------------------------------
if (missing(data)) rlang::abort("PLINK file missing")
# Warning concerning plink files----------------------------------------------
plink.format <- radiator::detect_genomic_format(data = data)
# PLINK TPED -----------------------------------------------------------------
if (plink.format == "plink.tped.file") {
message("Reading PLINK tped...")
# Get file size
plink.size <- file.size(data)
if (!plink.size <= 100000000) {
message("\n\nNote: huge PLINK tped files are no longer recommended inside radiator")
message("Convert to PLINK BED with: --tfile [] --make-bed --allow-no-sex --allow-extra-chr --chr-set [] with PLINK command")
rlang::abort("Read the documentation of radiator::read_plink")
}
# Importing file -----------------------------------------------------------
fam.file <- stringi::stri_replace_all_fixed(
str = data,
pattern = ".tped",
replacement = ".tfam",
vectorize_all = FALSE
)
res$strata <- readr::read_delim(
file = fam.file,
delim = " ",
col_names = c("STRATA", "INDIVIDUALS"),
col_types = "cc____"
) %>%
radiator::read_strata(strata = .) %$%
strata
fam.file <- NULL
# preparing header for tped file
tped.header.prep <- res$strata %>%
dplyr::select(INDIVIDUALS) %>%
dplyr::mutate(
NUMBER = seq(1, n()),
ALLELE1 = rep("A1", n()), ALLELE2 = rep("A2", n())
) %>%
radiator::rad_long(
x = .,
cols = c("INDIVIDUALS", "NUMBER"),
names_to = "ALLELES_GROUP",
values_to = "ALLELES"
) %>%
dplyr::arrange(NUMBER) %>%
dplyr::select(-ALLELES_GROUP) %>%
tidyr::unite(INDIVIDUALS_ALLELES, c(INDIVIDUALS, ALLELES), sep = "_", remove = FALSE) %>%
dplyr::arrange(NUMBER) %>%
dplyr::mutate(NUMBER = seq(from = (1 + 4), to = n() + 4)) %>%
dplyr::select(-ALLELES)
tped.header.names <- c("CHROM", "LOCUS", "POS", tped.header.prep$INDIVIDUALS_ALLELES)
tped.header.integer <- c(1, 2, 4, tped.header.prep$NUMBER)
tped.header.prep <- NULL
# import PLINK
res$data <- data.table::fread(
input = data,
sep = " ",
header = FALSE,
stringsAsFactors = FALSE,
verbose = FALSE,
select = tped.header.integer,
col.names = tped.header.names,
showProgress = TRUE,
data.table = FALSE) %>%
tibble::as_tibble(.) %>%
dplyr::mutate(
CHROM = as.character(CHROM),
LOCUS = as.character(LOCUS),
POS = as.character(POS)
) %>%
tidyr::unite(data = ., col = "MARKERS", c("CHROM", "LOCUS", "POS"), remove = FALSE, sep = "__")
# Unused objects
tped.header.integer <- tped.header.names <- NULL
return(res)
}
# PLINK BED ------------------------------------------------------------------
if (plink.format == "plink.bed.file") {
message("Reading PLINK bed file...")
# Required package -----------------------------------------------------------
radiator_packages_dep(package = "SeqArray", cran = FALSE, bioc = TRUE)
# Function call and dotslist -------------------------------------------------
rad.dots <- radiator_dots(
func.name = as.list(sys.call())[[1]],
fd = rlang::fn_fmls_names(),
args.list = as.list(environment()),
dotslist = rlang::dots_list(..., .homonyms = "error", .check_assign = TRUE),
keepers = c("internal", "path.folder", "parameters"),
verbose = FALSE
)
# Folders---------------------------------------------------------------------
wf <- path.folder <- generate_folder(
f = path.folder,
rad.folder = "read_plink",
prefix_int = FALSE,
internal = internal,
file.date = file.date,
verbose = verbose)
radiator.folder <- generate_folder(
f = path.folder,
rad.folder = "import_gds",
prefix_int = TRUE,
internal = internal,
file.date = file.date,
verbose = verbose)
# write the dots file
write_rad(
data = rad.dots,
path = radiator.folder,
filename = stringi::stri_join("radiator_read_plink_args_", file.date, ".tsv"),
tsv = TRUE,
internal = internal,
write.message = "Function call and arguments stored in: ",
verbose = verbose
)
filename <- generate_filename(
name.shortcut = filename,
path.folder = radiator.folder,
extension = "gds")
filename.short <- filename$filename.short
filename <- filename$filename
timing.plink <- proc.time()
# read_plink -----------------------------------------------------------------
bim.file <- stringi::stri_replace_all_fixed(
str = data,
pattern = ".bed",
replacement = ".bim",
vectorize_all = FALSE
)
fam.file <- stringi::stri_replace_all_fixed(
str = data,
pattern = ".bed",
replacement = ".fam",
vectorize_all = FALSE
)
if (verbose) message("Generating GDS...")
gds <- SeqArray::seqBED2GDS(
bed.fn = data,
fam.fn = fam.file,
bim.fn = bim.file,
out.gdsfn = filename,
compress.geno = "ZIP_RA",
compress.annotation = "ZIP_RA",
verbose = verbose
) %>%
SeqArray::seqOpen(gds.fn = ., readonly = FALSE)
if (verbose) message("done! timing: ", round((proc.time() - timing.plink)[[3]]), " sec\n")
# PLINK: Summary ----------------------------------------------------------------
summary_gds(gds = gds, verbose = TRUE)
if (verbose) message("\nFile written: ", filename.short)
# Generate radiator skeleton -------------------------------------------------
radiator.gds <- radiator_gds_skeleton(gds)
# data.source ----------------------------------------------------------------
update_radiator_gds(gds = gds, node.name = "data.source", value = "plink")
# bi- or multi-alllelic--------------------------------------------------
biallelic <- detect_biallelic_markers(data = gds, verbose = verbose)
# Clean sample id---------------------------------------------------------
individuals.plink <- tibble::tibble(
INDIVIDUALS_PLINK = SeqArray::seqGetData(gds, "sample.id")) %>%
dplyr::mutate(INDIVIDUALS_CLEAN = radiator::clean_ind_names(INDIVIDUALS_PLINK))
if (!identical(individuals.plink$INDIVIDUALS_PLINK, individuals.plink$INDIVIDUALS_CLEAN)) {
if (verbose) message("Cleaning PLINK's sample names")
clean.id.filename <- stringi::stri_join("cleaned.plink.id.info_", file.date, ".tsv")
readr::write_tsv(x = individuals.plink,
file = file.path(radiator.folder, clean.id.filename))
update_radiator_gds(gds = gds, node.name = "id.clean", value = individuals.plink)
}
# replace id
update_radiator_gds(
gds = gds,
radiator.gds = FALSE,
node.name = "sample.id",
value = individuals.plink$INDIVIDUALS_CLEAN,
replace = TRUE)
individuals <- dplyr::select(individuals.plink, INDIVIDUALS = INDIVIDUALS_CLEAN)
individuals.plink <- NULL
# sync id with STRATA---------------------------------------------------------
if (verbose) message("Using .fam file for strata...")
strata <- readr::read_delim(
file = fam.file,
delim = " ",
col_names = c("STRATA", "INDIVIDUALS"),
col_types = "cc____"
) %>%
radiator::read_strata(strata = .) %$%
strata
id.levels <- individuals$INDIVIDUALS
individuals %<>%
dplyr::left_join(
join_strata(individuals, strata, verbose = verbose) %>%
dplyr::mutate(FILTERS = "whitelist")
, by = "INDIVIDUALS"
) %>%
dplyr::mutate(FILTERS = tidyr::replace_na(data = FILTERS, replace = "filter.stata"))
strata <- generate_strata(data = dplyr::filter(individuals, FILTERS == "whitelist"), pop.id = FALSE)
#Update GDS node
update_radiator_gds(gds = gds, node.name = "individuals.meta", value = individuals, sync = TRUE)
# PLINK: Markers metadata ------------------------------------------------------
markers.meta <- extract_markers_metadata(gds = gds)
# PLINK: reference genome or de novo -------------------------------------------
ref.genome <- detect_ref_genome(data = gds, verbose = verbose)
# Generate MARKERS column and fix types --------------------------------------
markers.meta %<>%
dplyr::mutate(
dplyr::across(
.cols = c(CHROM, LOCUS, POS),
.fns = radiator::clean_markers_names
)
) %>%
dplyr::mutate(
MARKERS = stringi::stri_join(CHROM, LOCUS, POS, sep = "__"),
REF = SeqArray::seqGetData(gdsfile = gds, var.name = "$ref"),
ALT = SeqArray::seqGetData(gdsfile = gds, var.name = "$alt")
)
# PLINK file with duplicate markers... sometimes tagged ISOFORMS...
dup.markers <- length(markers.meta$MARKERS) - length(unique(markers.meta$MARKERS))
if (dup.markers > 0) {
message("\nNumber of duplicate MARKERS id: ", dup.markers)
message("Adding integer to differentiate...")
markers.meta %<>%
dplyr::arrange(MARKERS) %>%
dplyr::mutate(MARKERS_NEW = MARKERS) %>%
dplyr::group_by(MARKERS_NEW) %>%
dplyr::mutate(
MARKERS = stringi::stri_join(MARKERS, seq(1, n(), by = 1), sep = "_")
) %>%
dplyr::ungroup(.) %>%
dplyr::select(-MARKERS_NEW)
}
# ADD MARKERS META to GDS
update_radiator_gds(gds = gds, node.name = "markers.meta", value = markers.meta)
# replace chromosome info in GDS
# Why ? well snp ld e.g. will otherwise be performed by chromosome and with de novo data = by locus...
update_radiator_gds(
gds = gds,
radiator.gds = FALSE,
node.name = "chromosome",
value = markers.meta$CHROM,
replace = TRUE
)
# # radiator_parameters: generate --------------------------------------------
filters.parameters <- radiator_parameters(
generate = TRUE,
initiate = FALSE,
update = FALSE,
parameter.obj = parameters,
path.folder = radiator.folder,
file.date = file.date,
verbose = verbose,
internal = internal)
# radiator_parameters: initiate --------------------------------------------
# with original PLINK values
filters.parameters <- radiator_parameters(
generate = FALSE,
initiate = TRUE,
update = TRUE,
parameter.obj = filters.parameters,
data = gds,
filter.name = "plink",
param.name = "original values in PLINK + strata",
values = "",
path.folder = path.folder,
file.date = file.date,
internal = internal,
verbose = verbose
)
return(gds)
} # end plink's BED format
} # End read_plink
# tidy_plink -------------------------------------------------------------------
#' @name tidy_plink
#' @title Tidy PLINK tped and bed files
#' @description Transform bi-allelic PLINK files in \code{.tped} or {.bed} formats
#' into a tidy dataset.
#'
#' Used internally in \href{https://github.com/thierrygosselin/radiator}{radiator}
#' and \href{https://github.com/thierrygosselin/assigner}{assigner}
#' and might be of interest for users.
#' @param data The PLINK file.
#' \itemize{
#' \item {bi-allelic data only}. For haplotypes use VCF.
#' \item \code{tped} file format: the corresponding \code{tfam} file must be in the directory.
#' \item \code{bed} file format: IS THE PREFERRED format, the corresponding
#' \code{fam} and \code{bim} files must be in the directory.
#' }
#' @inheritParams radiator_common_arguments
#' @section Advance mode:
#'
#' \emph{dots-dots-dots ...} allows to pass several arguments for fine-tuning the function:
#' \enumerate{
#' \item \code{calibrate.alleles}: logical. For \code{tped} files, if
#' \code{calibrate.alleles = FALSE} the function runs faster
#' but REF/ALT alleles may not be calibrated. The default assumes the users or
#' sotware producing the PLINK file calibrated the alleles.
#' Default: \code{calibrate.alleles = FALSE}.
#' }
#' @references Zheng X, Gogarten S, Lawrence M, Stilp A, Conomos M, Weir BS,
#' Laurie C, Levine D (2017). SeqArray -- A storage-efficient high-performance
#' data format for WGS variant calls.
#' Bioinformatics.
#'
#' @references
#' PLINK: a tool set for whole-genome association and population-based linkage
#' analyses.
#' American Journal of Human Genetics. 2007: 81: 559–575. doi:10.1086/519795
#' @examples
#' \dontrun{
#' data <- radiator::tidy_plink(data = "my_plink_file.bed", verbose = TRUE)
#'
#'
#' # when conversion is required from TPED to BED, in Terminal:
#' # plink --tfile my_plink_file --make-bed --allow-no-sex --allow-extra-chr --chr-set 95
#' }
#' @seealso
#' \href{https://www.cog-genomics.org/plink/1.9/}{PLINK}
#'
#' \code{\link[radiator]{read_plink}}
#' @return
#' A tidy tibble of the PLINK file.
#'
#' @export
#' @rdname tidy_plink
#' @author Thierry Gosselin \email{thierrygosselin@@icloud.com}
tidy_plink <- function(
data,
parallel.core = parallel::detectCores() - 1,
verbose = FALSE,
...
) {
# Test
# verbose = TRUE
# strata = NULL
# calibrate.alleles = TRUE
# parallel.core = parallel::detectCores() - 1
# parameters <- NULL
# filename=NULL
# internal = FALSE
# path.folder = getwd()
# Cleanup---------------------------------------------------------------------
radiator_function_header(f.name = "tidy_plink", verbose = verbose)
file.date <- format(Sys.time(), "%Y%m%d@%H%M")
if (verbose) message("Execution date@time: ", file.date)
old.dir <- getwd()
opt.change <- getOption("width")
options(future.globals.maxSize = Inf)
options(width = 70)
timing <- radiator_tic()
#back to the original directory and options
on.exit(setwd(old.dir), add = TRUE)
on.exit(options(width = opt.change), add = TRUE)
on.exit(radiator_toc(timing), add = TRUE)
on.exit(radiator_function_header(f.name = "tidy_plink", start = FALSE, verbose = verbose), add = TRUE)
# Checking for missing and/or default arguments ------------------------------
if (missing(data)) rlang::abort("PLINK file missing")
# Function call and dotslist -------------------------------------------------
path.folder <- filename <- calibrate.alleles <- NULL
rad.dots <- radiator_dots(
func.name = as.list(sys.call())[[1]],
fd = rlang::fn_fmls_names(),
args.list = as.list(environment()),
dotslist = rlang::dots_list(..., .homonyms = "error", .check_assign = TRUE),
deprecated = c("blacklist.id", "pop.select", "pop.levels", "pop.labels"),
keepers = c("calibrate.alleles", "filename", "internal", "path.folder", "parameters"),
verbose = FALSE
)
# Warning concerning plink files----------------------------------------------
plink.format <- radiator::detect_genomic_format(data = data)
data <- radiator::read_plink(
data = data,
filename = filename,
parallel.core = parallel.core,
verbose = FALSE,
internal = TRUE,
path.folder = path.folder,
parameters = parameters
)
if (plink.format == "plink.tped.file") {
# Make tidy ------------------------------------------------------------------
if (verbose) message("Tidying the PLINK tped file ...")
# Filling GT and new separating INDIVIDUALS from ALLELES
# combining alleles
strata <- data$strata
data <- radiator::rad_long(
x = data$data,
cols = c("MARKERS", "CHROM", "LOCUS", "POS"),
names_to = "INDIVIDUALS_ALLELES",
values_to = "GT"
)
# detect GT coding
detect.gt.coding <- unique(sample(x = data$GT, size = 100, replace = FALSE))
gt.letters <- c("A", "C", "G", "T")
if (TRUE %in% unique(gt.letters %in% detect.gt.coding)) {
if (verbose) message("Genotypes coded with letters")
gt.letters.df <- tibble::tibble(
GT = c("A", "C", "G", "T", "0"),
NEW_GT = c("001", "002", "003", "004", "000")
)
data %<>%
dplyr::left_join(
gt.letters.df, by = "GT") %>%
dplyr::select(-GT) %>%
dplyr::rename(GT = NEW_GT)
gt.letters.df <- NULL
} else {
if (verbose) message("Genotypes coded with integers")
data %<>%
dplyr::mutate(GT = stringi::stri_pad_left(str = GT, width = 3, pad = "0"))
}
detect.gt.coding <- gt.letters <- NULL
data %<>%
tidyr::separate(
data = .,
col = INDIVIDUALS_ALLELES,
into = c("INDIVIDUALS", "ALLELES"),
sep = "_") %>%
radiator::rad_wide(
x = .,
formula = "MARKERS + CHROM + LOCUS + POS + INDIVIDUALS ~ ALLELES",
values_from = "GT"
) %>%
dplyr::ungroup(.) %>%
tidyr::unite(data = ., col = GT, A1, A2, sep = "") %>%
dplyr::select(MARKERS, CHROM, LOCUS, POS, INDIVIDUALS, GT)
# population levels and strata
if (verbose) message("Integrating the tfam/strata file...")
data %<>% dplyr::left_join(strata, by = "INDIVIDUALS")
strata <- NULL
# removing untyped markers across all-pop
remove.missing.gt <- data %>%
dplyr::select(LOCUS, GT) %>%
dplyr::filter(GT != "000000")
untyped.markers <- dplyr::n_distinct(data$LOCUS) - dplyr::n_distinct(remove.missing.gt$LOCUS)
if (untyped.markers > 0) {
if (verbose) message("Number of marker with 100 % missing genotypes: ", untyped.markers)
data <- suppressWarnings(
dplyr::semi_join(data,
remove.missing.gt %>%
dplyr::distinct(LOCUS, .keep_all = TRUE),
by = "LOCUS")
)
}
# Unused objects
remove.missing.gt <- NULL
# detect if biallelic give vcf style genotypes
# biallelic <- radiator::detect_biallelic_markers(input)
# filename <- internal <- parameters <- path.folder <- calibrate.alleles <- NULL
# rm(filename, internal, parameters, path.folder, calibrate.alleles)
if (calibrate.alleles) {
data %<>% radiator::calibrate_alleles(data = ., verbose = verbose)
return(res = list(input = data$input, biallelic = data$biallelic))
} else {
return(res = list(input = data, biallelic = radiator::detect_biallelic_markers(data)))
}
} #End tidy tped
if (plink.format == "plink.bed.file") {
# Get info markers and individuals -----------------------------------------
gds.info <- radiator::summary_gds(gds = data, verbose = FALSE)
n.markers <- gds.info$n.markers
n.individuals <- gds.info$n.ind
cat("\n\n################################## IMPORTANT ###################################\n")
message("Tidying PLINK file with ", n.markers, " SNPs is not optimal:")
message(" 1. a computer with lots of RAM is required")
message(" 2. it's very slow to generate")
message(" 3. it's very slow to run codes after")
message(" 4. for most non model species this number of markers is not realistic...")
message("\nRecommendation:")
message(" 1. filter your dataset. e.g. with filter_rad")
message("\nIdeally target a maximum of ~ 10 000 - 20 0000 unlinked SNPs\n")
message("\nGenotypes formats generated with ", n.markers, " SNPs: ")
message(" GT_BIN (the dosage of ALT allele: 0, 1, 2 NA)")
biallelic <- radiator::detect_biallelic_markers(data)
tidy.data <- gds2tidy(
gds = data,
markers.meta = NULL,
pop.id = FALSE,
calibrate.alleles = calibrate.alleles,
close.gds = TRUE
)
return(res = list(input = tidy.data, biallelic = biallelic))
}#End tidy bed
} # End tidy_plink
# write_plink ------------------------------------------------------------------
#' @name write_plink
#' @title Write a plink tped/tfam file from a tidy data frame
#' @description Write a plink file from a tidy data frame.
#' Used internally in \href{https://github.com/thierrygosselin/radiator}{radiator}
#' and \href{https://github.com/thierrygosselin/assigner}{assigner}
#' and might be of interest for users.
#' @param data A tidy data frame object in the global environment or
#' a tidy data frame in wide or long format in the working directory.
#' \emph{How to get a tidy data frame ?}
#' Look into \pkg{radiator} \code{\link{tidy_genomic_data}}.
#' @param filename (optional) The file name prefix for tped/tfam files
#' written to the working directory. With default: \code{filename = NULL},
#' the date and time is appended to \code{radiator_plink_}.
#' @export
#' @rdname write_plink
#' @references Purcell S, Neale B, Todd-Brown K, Thomas L, Ferreira MAR,
#' Bender D, et al.
#' PLINK: a tool set for whole-genome association and population-based linkage
#' analyses.
#' American Journal of Human Genetics. 2007: 81: 559–575. doi:10.1086/519795
#' @author Thierry Gosselin \email{thierrygosselin@@icloud.com}
write_plink <- function(data, filename = NULL) {
# Import data ---------------------------------------------------------------
if (is.vector(data)) data %<>% radiator::tidy_wide(data = ., import.metadata = TRUE)
tped <- data %>%
dplyr::arrange(INDIVIDUALS) %>%
dplyr::mutate(
COL1 = rep("0", n()),
COL3 = rep("0", n()),
COL4 = rep("0", n())
)
if (rlang::has_name(data, "GT")) {
tped %<>%
dplyr::select(COL1, MARKERS, COL3, COL4, INDIVIDUALS, GT) %>%
dplyr::mutate(
A1 = stringi::stri_sub(str = GT, from = 1, to = 3),
A2 = stringi::stri_sub(str = GT, from = 4, to = 6)
) %>%
dplyr::select(-GT) %>%
radiator::rad_long(
x = .,
cols = c("COL1", "MARKERS", "COL3", "COL4", "INDIVIDUALS"),
names_to = "ALLELES",
values_to = "GENOTYPE"
) %>%
dplyr::mutate(
GENOTYPE = as.character(as.numeric(GENOTYPE)),
GENOTYPE = stringi::stri_pad_left(GENOTYPE, width = 2, pad = "0")
) %>%
dplyr::arrange(INDIVIDUALS, ALLELES)
}
if (rlang::has_name(data, "GT_BIN")) {
tped %<>%
dplyr::select(COL1, MARKERS, COL3, COL4, INDIVIDUALS, GT_BIN) %>%
dplyr::mutate(
A1 = dplyr::case_when(
GT_BIN == 0 ~ "01",
GT_BIN == 1 ~ "01",
GT_BIN == 2 ~ "00",
is.na(GT_BIN) ~ "00"
),
A2 = dplyr::case_when(
GT_BIN == 0 ~ "00",
GT_BIN == 1 ~ "01",
GT_BIN == 2 ~ "01",
is.na(GT_BIN) ~ "00"
)
) %>%
dplyr::select(-GT_BIN) %>%
radiator::rad_long(
x = .,
cols = c("COL1", "MARKERS", "COL3", "COL4", "INDIVIDUALS"),
names_to = "ALLELES",
values_to = "GENOTYPE"
)
}
tped %<>%
tidyr::unite(INDIVIDUALS_ALLELES, INDIVIDUALS, ALLELES, sep = "_") %>%
radiator::rad_wide(x = ., formula = "COL1 + MARKERS +COL3 + COL4 ~ INDIVIDUALS_ALLELES", values_from = "GENOTYPE") %>%
dplyr::arrange(MARKERS)
tfam <- dplyr::distinct(.data = data, STRATA, INDIVIDUALS) %>%
dplyr::arrange(INDIVIDUALS) %>%
dplyr::mutate(
COL3 = rep("0",n()),
COL4 = rep("0",n()),
COL5 = rep("0",n()),
COL6 = rep("-9",n())
)
# Create a filename to save the output files ********************************
if (is.null(filename)) {
# Get date and time to have unique filenaming
file.date <- stringi::stri_replace_all_fixed(Sys.time(), pattern = " EDT", replacement = "", vectorize_all = FALSE)
file.date <- stringi::stri_replace_all_fixed(file.date, pattern = c("-", " ", ":"), replacement = c("", "@", ""), vectorize_all = FALSE)
file.date <- stringi::stri_sub(file.date, from = 1, to = 13)
filename.tped <- stringi::stri_join("radiator_plink_", file.date, ".tped")
filename.tfam <- stringi::stri_join("radiator_plink_", file.date, ".tfam")
} else {
filename.tped <- stringi::stri_join(filename, ".tped")
filename.tfam <- stringi::stri_join(filename, ".tfam")
}
readr::write_delim(x = tped, file = filename.tped, col_names = FALSE, delim = " ")
readr::write_delim(x = tfam, file = filename.tfam, col_names = FALSE, delim = " ")
} # end write_plink
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.