#' Check PLINK software access
#'
#' checkPlink checks that the PLINK software
#' (\url{https://www.cog-genomics.org/plink/1.9/})
#' can be found from system call.
#'
#' @param path2plink [character] Absolute path to PLINK executable
#' (\url{https://www.cog-genomics.org/plink/1.9/}) i.e.
#' plink should be accessible as path2plink -h. The full name of the executable
#' should be specified: for windows OS, this means path/plink.exe, for unix
#' platforms this is path/plink. If not provided, assumed that PATH set-up works
#' and PLINK will be found by \code{\link[sys]{exec}}('plink').
#' @return Path to PLINK executable.
#' @export
checkPlink <- function(path2plink=NULL) {
if (is.null(path2plink)) {
path2plink <- 'plink'
preset <- FALSE
} else {
preset <- TRUE
}
if (grepl("~", path2plink)) {
stop("Path to plink (", path2plink,
") contains ~: please supply full path, not relying",
" on tilde extension.")
}
if (.Platform$OS.type == 'windows') {
path2plink <- paste(gsub('\\.exe', '', path2plink), '.exe', sep="")
}
findPlink <- tryCatch(sys::exec_wait(path2plink, args="-h", std_out=FALSE,
std_err=TRUE),
warning=function(w) w, error = function(e) e)
if("simpleError" %in% is(findPlink)) {
if (!preset) {
stop("PLINK software required for running this function cannot be ",
"found in current PATH setting. Error message:", findPlink,
". Try to set path2plink.")
} else {
stop("PLINK software required for running this function cannot be ",
"found in path2plink (", path2plink, ") . Error message: ",
findPlink, ".")
}
}
return(path2plink)
}
#' Test lists for different properties of numerics
#'
#' Test all elements of a list if they are numeric, positive numbers, integers
#' or proportions (range 0-1).
#'
#' @param numbers [list] whose elements are tested for being numeric.
#' @param positives [list] whose elements are tested for being positive numbers.
#' @param integers [list] whose elements are tested for being integers.
#' @param proportions [list] whose elements are tested for being proportions.
#' between 0 and 1.
#' @return NULL
#' @export
testNumerics <- function(numbers, positives=NULL, integers=NULL,
proportions=NULL) {
notNullnotNumeric <- function(x) !is.null(x) && !is.numeric(x)
notNullnotPositive <- function(x) !is.null(x) && x <= 0
notNullnotInteger <- function(x) !is.null(x) && x%%1!=0
notNullNotInRange <- function(x) !is.null(x) && (x < 0 || x > 1)
if (any(sapply(numbers, notNullnotNumeric))) {
notNumbers <- which(sapply(numbers, notNullnotNumeric))
stop(paste(names(numbers)[notNumbers], collapse=","),
" is/are not numeric (",
paste(numbers[notNumbers], collapse=","), ")", sep="")
}
if (!is.null(positives)) {
if (any(sapply(positives, notNullnotPositive))) {
notPositives <- which(sapply(positives, notNullnotPositive))
stop(paste(names(positives)[notPositives], collapse=","),
" has/have to be greater than zero (",
paste(positives[notPositives], collapse=","), ")",
sep="")
}
}
if (!is.null(integers)) {
if (any(sapply(integers, notNullnotInteger))) {
notIntegers <- which(sapply(integers, notNullnotInteger))
stop(paste(names(integers)[notIntegers], collapse=","),
" has/have to be integers, given ",
paste(integers[notIntegers], collapse=","),
sep="")
}
}
if (!is.null(proportions)) {
if (any(sapply(proportions, notNullNotInRange))) {
outOfRange <- which(sapply(proportions, notNullNotInRange))
stop("Proportions have to be specified between 0 and 1: ",
paste(names(proportions)[outOfRange], collapse=","),
" are outside of this range (",
paste(proportions[outOfRange], collapse=","), ")", sep="")
}
}
}
#' Remove related individuals while keeping maximum number of individuals
#'
#' \code{relatednessFilter} takes a data.frame with pair-wise relatedness
#' measures of samples and returns pairs of individual IDs that are related as
#' well as a list of suggested individual IDs to remove.
#' \code{relatednessFilter} finds pairs of samples whose relatedness estimate
#' is larger than the specified relatednessTh. Subsequently, for pairs of
#' individual that do not have additional relatives in the dataset, the
#' individual with the worse otherCriterionMeasure (if provided) or arbitrarily
#' individual 1 of that pair is selected and returned as the individual failing
#' the relatedness check. For more complex family structures, the unrelated
#' individuals per family are selected (e.g. in a simple case of a
#' parents-offspring trio, the offspring will be marked as fail, while the
#' parents will be kept in the analysis). Selection is achieved by constructing
#' subgraphs of clusters of individuals that are related.
#' \code{relatednessFilter} then finds the maximum independent set of vertices
#' in the subgraphs of related individuals. If all individuals are related (i.e.
#' all maximum independent sets are 0), one individual of that cluster will be
#' kept and all others listed as failIDs.
#' @param relatedness [data.frame] containing pair-wise relatedness estimates
#' (in column [relatednessRelatedness]) for individual 1 (in column
#' [relatednessIID1] and individual 2 (in column [relatednessIID1]). Columns
#' relatednessIID1, relatednessIID2 and relatednessRelatedness have to present,
#' while additional columns such as family IDs can be present. Default column
#' names correspond to column names in output of plink --genome
#' (\url{https://www.cog-genomics.org/plink/1.9/ibd}). All original
#' columns for pair-wise highIBDTh fails will be returned in fail_IBD.
#' @param relatednessTh [double] Threshold for filtering related individuals.
#' Individuals, whose pair-wise relatedness estimates are greater than this
#' threshold are considered related.
#' @param otherCriterion [data.frame] containing a QC measure (in column
#' [otherCriterionMeasure]) per individual (in column [otherCriterionIID]).
#' otherCriterionMeasure and otherCriterionIID have to present, while additional
#' columns such as family IDs can be present. IIDs in relatednessIID1 have to
#' be present in otherCriterionIID.
#' @param otherCriterionTh [double] Threshold for filtering individuals based on
#' otherCriterionMeasure. If related individuals fail this threshold they will
#' automatically be excluded.
#' @param otherCriterionThDirection [character] Used to determine the direction
#' for failing the otherCriterionTh. If 'gt', individuals whose
#' otherCriterionMeasure > otherCriterionTh will automatically be excluded.
#' For pairs of individuals that have no other related samples in the cohort:
#' if both otherCriterionMeasure < otherCriterionTh, the individual with the
#' larger otherCriterionMeasure will be excluded.
#' @param relatednessIID1 [character] Column name of column containing the IDs
#' of the first individual.
#' @param relatednessIID2 [character] Column name of column containing the IDs
#' of the second individual.
#' @param relatednessFID1 [character, optional] Column name of column containing
#' the family IDs of the first individual; if only relatednessFID1 but not
#' relatednessFID2 provided, or none provided even though present in
#' relatedness, FIDs will not be returned.
#' @param relatednessFID2 [character, optional] Column name of column containing
#' the family IDs of the second individual; if only relatednessFID2 but not
#' relatednessFID1 provided, or none provided even though present in
#' relatedness, FIDs will not be returned.
#' @param relatednessRelatedness [character] Column name of column containing
#' the relatedness estimate.
#' @param otherCriterionIID [character] Column name of column containing the
#' individual IDs.
#' @param otherCriterionMeasure [character] Column name of the column containing
#' the measure of the otherCriterion (for instance SNP missingness rate).
#' @param verbose [logical] If TRUE, progress info is printed to standard out.
#' @return named [list] with i) relatednessFails, a [data.frame] containing the
#' data.frame relatedness after filtering for pairs of individuals in
#' relatednessIID1 and relatednessIID2, that fail the relatedness
#' QC; the data.frame is reordered with the fail individuals in column 1 and
#' their related individuals in column 2 and ii) failIDs, a [data.frame] with
#' the [IID]s (and [FID]s if provided) of the individuals that fail the
#' relatednessTh.
#' @export
relatednessFilter <- function(relatedness, otherCriterion=NULL,
relatednessTh, otherCriterionTh=NULL,
otherCriterionThDirection=
c("gt", "ge", "lt", "le", "eq"),
relatednessIID1="IID1", relatednessIID2="IID2",
relatednessFID1=NULL, relatednessFID2=NULL,
relatednessRelatedness="PI_HAT",
otherCriterionIID="IID",
otherCriterionMeasure=NULL,
verbose=FALSE) {
if (!(relatednessIID1 %in% names(relatedness))) {
stop(paste("Column", relatednessIID1, "for relatedness not found!"))
}
if (!(relatednessIID2 %in% names(relatedness))) {
stop(paste("Column", relatednessIID1, "for relatedness not found!"))
}
if (!(relatednessRelatedness %in% names(relatedness))) {
stop(paste("Column", relatednessRelatedness,
"for relatedness not found!"))
}
iid1_index <- which(colnames(relatedness) == relatednessIID1)
iid2_index <- which(colnames(relatedness) == relatednessIID2)
relatedness[,iid1_index] <- as.character(relatedness[,iid1_index])
relatedness[,iid2_index] <- as.character(relatedness[,iid2_index])
relatedness_names <- names(relatedness)
names(relatedness)[iid1_index] <- "IID1"
names(relatedness)[iid2_index] <- "IID2"
names(relatedness)[names(relatedness) == relatednessRelatedness] <- "M"
relatedness_original <- relatedness
if (!is.null(relatednessFID1) && is.null(relatednessFID2) ||
is.null(relatednessFID1) && !is.null(relatednessFID2) ) {
stop("Either none or both, relatednessFID1 and relatednessFID2 have to
be provided")
}
if (!is.null(relatednessFID1) && !is.null(relatednessFID2)) {
if (!(relatednessFID1 %in% names(relatedness))) {
stop(paste("Column", relatednessFID1, "for relatedness not found!"))
}
if (!(relatednessFID2 %in% names(relatedness))) {
stop(paste("Column", relatednessFID2, "for relatedness not found!"))
}
fid1_index <- which(colnames(relatedness) == relatednessFID1)
fid2_index <- which(colnames(relatedness) == relatednessFID2)
names(relatedness)[fid1_index] <- "FID1"
names(relatedness)[fid2_index] <- "FID2"
relatedness$FID1 <- as.character(relatedness$FID1)
relatedness$FID2 <- as.character(relatedness$FID2)
}
# Remove symmetric IID rows
relatedness <- dplyr::select(relatedness, .data$IID1, .data$IID2, .data$M)
sortedIDs <- data.frame(t(apply(relatedness, 1, function(pair) {
c(sort(c(pair[1], pair[2])))
})), stringsAsFactors=FALSE)
keepIndex <- which(!duplicated(sortedIDs))
relatedness_original <- relatedness_original[keepIndex,]
relatedness <- relatedness[keepIndex,]
# individuals with at least one pair-wise comparison > relatednessTh
highRelated <- dplyr::filter(relatedness, .data$M > relatednessTh)
if (nrow(highRelated) == 0) {
return(list(relatednessFails=NULL, failIDs=NULL))
}
uniqueIIDs <- unique(c(highRelated$IID1, highRelated$IID2))
failIDs_other <- NULL
if (!is.null(otherCriterion)) {
otherCriterionThDirection <- match.arg(otherCriterionThDirection)
if (!(otherCriterionMeasure %in% names(otherCriterion))) {
stop(paste("Column", otherCriterionMeasure,
"for otherCriterion not found!"))
}
if (!(otherCriterionIID %in% names(otherCriterion))) {
stop(paste("Column", otherCriterionIID,
"for otherCriterion not found!"))
}
names(otherCriterion)[names(otherCriterion) ==
otherCriterionMeasure] <- "M"
names(otherCriterion)[names(otherCriterion) ==
otherCriterionIID] <- "IID"
if (any(!uniqueIIDs %in% otherCriterion$IID)) {
stop("Not all IIDs provided in relatedness are contained in" ,
"otherCriterion")
}
fail_other <- apply(highRelated, 1, function(pair) {
failID1 <- NULL
failID2 <- NULL
one <- evaluateDirection(
otherCriterion$M[otherCriterion$IID == pair[1]],
otherCriterionTh,
direction=otherCriterionThDirection)
two <- evaluateDirection(
otherCriterion$M[otherCriterion$IID == pair[2]],
otherCriterionTh,
direction=otherCriterionThDirection)
if (one) failID1 <- pair[1]
if (two) failID2 <- pair[2]
return(c(failID1, failID2))
})
failIDs_other <- unique(unlist(fail_other))
# remove samples that fail other criterion
highRelated <- highRelated[!(highRelated$IID1 %in% failIDs_other |
highRelated$IID2 %in% failIDs_other), ]
if (nrow(highRelated) == 0) {
if (verbose) {
message("Relatedness cannot be evaluated as all individuals ",
"involved fail due to otherCriterion")
}
return(list(relatednessFails=NULL, failIDs=NULL))
}
uniqueIIDs <- unique(c(highRelated$IID1, highRelated$IID2))
}
# all samples with related individuals
allRelated <- c(highRelated$IID1, highRelated$IID2)
# Further selection of samples with relatives in cohort
multipleRelative <- unique(allRelated[duplicated(allRelated)])
singleRelative <- uniqueIIDs[!uniqueIIDs %in% multipleRelative]
highRelatedMultiple <- highRelated[highRelated$IID1 %in% multipleRelative |
highRelated$IID2 %in% multipleRelative,]
highRelatedSingle <- highRelated[highRelated$IID1 %in% singleRelative &
highRelated$IID2 %in% singleRelative,]
# Only one related samples per individual
if(length(singleRelative) != 0) {
if (!is.null(otherCriterion)) {
# choose one with best otherCriterion, if equal arbitrarily return
# the first one as fail
failIDs_single <- apply(highRelatedSingle, 1, function(pair) {
one_two <- evaluateDirection(
otherCriterion$M[otherCriterion$IID == pair[1]],
otherCriterion$M[otherCriterion$IID == pair[2]],
direction=otherCriterionThDirection)
two_one <- evaluateDirection(
otherCriterion$M[otherCriterion$IID == pair[2]],
otherCriterion$M[otherCriterion$IID == pair[1]],
direction=otherCriterionThDirection)
if (one_two) failID <- pair[1]
else if (two_one) failID <- pair[2]
else failID <- pair[1]
return(failID)
})
} else {
# randomly choose one to exclude
failIDs_single <- highRelatedSingle[,1]
}
} else {
failIDs_single <- NULL
}
if(length(multipleRelative) != 0) {
relatedPerID <- lapply(multipleRelative, function(x) {
tmp <- highRelatedMultiple[rowSums(
cbind(highRelatedMultiple$IID1 %in% x,
highRelatedMultiple$IID2 %in% x)) != 0,1:2]
rel <- unique(unlist(tmp))
return(rel)
})
names(relatedPerID) <- multipleRelative
keepIDs_multiple <- lapply(relatedPerID, function(x) {
pairwise <- t(combn(x, 2))
index <- (highRelatedMultiple$IID1 %in% pairwise[,1] &
highRelatedMultiple$IID2 %in% pairwise[,2]) |
(highRelatedMultiple$IID1 %in% pairwise[,2] &
highRelatedMultiple$IID2 %in% pairwise[,1])
combination <- highRelatedMultiple[index,]
combination_graph <- igraph::graph_from_data_frame(combination,
directed=FALSE)
all_iv_set <- igraph::ivs(combination_graph)
length_iv_set <- sapply(all_iv_set, function(x) length(x))
if (all(length_iv_set == 1)) {
# check how often they occurr elsewhere
occurrence <- sapply(x, function(id) {
sum(sapply(relatedPerID, function(idlist) id %in% idlist))
})
# if occurrence the same everywhere, pick the first, else keep
# the one with minimum occurrence elsewhere
if (length(unique(occurrence)) == 1) {
nonRelated <- sort(x)[1]
} else {
nonRelated <- names(occurrence)[which.min(occurrence)]
}
} else {
nonRelated <- all_iv_set[which.max(length_iv_set)]
}
return(nonRelated)
})
keepIDs_multiple <- unique(unlist(keepIDs_multiple))
failIDs_multiple <- c(multipleRelative[!multipleRelative %in%
keepIDs_multiple])
} else {
failIDs_multiple <- NULL
}
allFailIIDs <- c(failIDs_single, failIDs_multiple, failIDs_other)
relatednessFails <- lapply(allFailIIDs, function(id) {
fail_inorder <- relatedness_original$IID1 == id &
relatedness_original$M > relatednessTh
fail_inreverse <- relatedness_original$IID2 == id &
relatedness_original$M > relatednessTh
if (any(fail_inreverse)) {
inreverse <- relatedness_original[fail_inreverse, ]
if (!is.null(relatednessFID2)) {
id1 <- c(iid1_index,fid1_index)
id2 <- c(iid2_index,fid2_index)
} else {
id1 <- iid1_index
id2 <- iid2_index
}
inreverse[,c(id1, id2)] <- inreverse[,c(id2, id1)]
names(inreverse) <- relatedness_names
} else {
inreverse <- NULL
}
inorder <- relatedness_original[fail_inorder, ]
names(inorder) <- relatedness_names
return(rbind(inorder, inreverse))
})
relatednessFails <- do.call(rbind, relatednessFails)
if (nrow(relatednessFails) == 0) {
relatednessFails <- NULL
failIDs <- NULL
} else {
names(relatednessFails) <- relatedness_names
rownames(relatednessFails) <- 1:nrow(relatednessFails)
uniqueFails <- relatednessFails[!duplicated(relatednessFails[,iid1_index]),]
if (!is.null(relatednessFID2)) {
failIDs <- data.frame(FID=uniqueFails[,fid1_index],
IID=uniqueFails[,iid1_index],
stringsAsFactors=FALSE)
} else {
failIDs <- data.frame(IID=uniqueFails[,iid1_index],
stringsAsFactors=FALSE)
}
}
return(list(relatednessFails=relatednessFails, failIDs=failIDs))
}
evaluateDirection <- function(x, y, direction) {
if (direction == 'ge') x >= y
else if (direction == 'le') x <= y
else if (direction == 'gt') x > y
else if (direction == 'lt') x < y
else if (direction == 'eq') x == y
else stop(direction, " as direction in evaluateDirection not known.")
}
makepath <- function(directory, name) {
path <- file.path(directory, name)
path <- gsub('\\\\', '/', path)
return(path)
}
checkFormat <- function(prefix) {
if (!file.exists(paste(prefix, ".fam", sep=""))){
stop("plink family file: ", prefix, ".fam does not exist.")
}
if (!file.exists(paste(prefix, ".bim", sep=""))){
stop("plink snp file: ", prefix, ".bim does not exist.")
}
if (!file.exists(paste(prefix, ".bed", sep=""))){
stop("plink binary file: ", prefix, ".bed does not exist.")
}
}
#' Check and construct PLINK sample and marker filters
#'
#' checkFiltering checks that the file names with the individuals and markers to
#' be filtered can be found. If so, it constructs the command for filtering
#'
#' @param keep_individuals [character] Path to file with individuals to be
#' retained in the analysis. The file has to be a space/tab-delimited text file
#' with family IDs in the first column and within-family IDs in the second
#' column. All samples not listed in this file will be removed from the current
#' analysis. See \url{https://www.cog-genomics.org/plink/1.9/filter#indiv}.
#' Default: NULL, i.e. no filtering on individuals.
#' @param remove_individuals [character] Path to file with individuals to be
#' removed from the analysis. The file has to be a space/tab-delimited text file
#' with family IDs in the first column and within-family IDs in the second
#' column. All samples listed in this file will be removed from the current
#' analysis. See \url{https://www.cog-genomics.org/plink/1.9/filter#indiv}.
#' Default: NULL, i.e. no filtering on individuals.
#' @param extract_markers [character] Path to file with makers to be
#' included in the analysis. The file has to be a text file with a list of
#' variant IDs (usually one per line, but it's okay for them to just be
#' separated by spaces). All unlisted variants will be removed from the current
#' analysis. See \url{https://www.cog-genomics.org/plink/1.9/filter#snp}.
#' Default: NULL, i.e. no filtering on markers.
#' @param exclude_markers [character] Path to file with makers to be
#' removed from the analysis. The file has to be a text file with a list of
#' variant IDs (usually one per line, but it's okay for them to just be
#' separated by spaces). All listed variants will be removed from the current
#' analysis. See \url{https://www.cog-genomics.org/plink/1.9/filter#snp}.
#' Default: NULL, i.e. no filtering on markers.
#' @return Vector containing args in sys::exec_wait format to enable filtering
#' on individuals and/or markers.
#' @export
checkFiltering <- function(keep_individuals=NULL,
remove_individuals=NULL,
extract_markers=NULL,
exclude_markers=NULL
) {
args <- NULL
if (!is.null(keep_individuals)) {
if (!file.exists(keep_individuals)) {
stop("File with individuals to keep in analysis does not exist: ",
keep_individuals)
}
args = c(args, "--keep", keep_individuals)
}
if (!is.null(remove_individuals)) {
if (!file.exists(remove_individuals)) {
stop("File with individuals to remove from analysis does not exist: ",
remove_individuals)
}
args = c(args, "--remove", remove_individuals)
}
if (!is.null(extract_markers)) {
if (!file.exists(extract_markers)) {
stop("File with markers to extract for analysis does not exist: ",
extract_markers)
}
args = c(args, "--extract", extract_markers)
}
if (!is.null(exclude_markers)) {
if (!file.exists(exclude_markers)) {
stop("File with markers to exclude from analysis does not exist: ",
exclude_markers)
}
args = c(args, "--exclude", exclude_markers)
}
return(args)
}
#' Check and construct individual IDs to be removed
#'
#' checkRemoveIDs checks that the file names with the individuals to be filtered
#' can be found. It reads the corresponding files, combines the selected
#' individuals into one data.frame and compares these to all individuals in
#' the analysis.
#' @param prefix [character] Prefix of PLINK files, i.e. path/2/name.bed,
#' path/2/name.bim and path/2/name.fam.
#' @param keep_individuals [character] Path to file with individuals to be
#' retained in the analysis. The file has to be a space/tab-delimited text file
#' with family IDs in the first column and within-family IDs in the second
#' column. All samples not listed in this file will be removed from the current
#' analysis. See \url{https://www.cog-genomics.org/plink/1.9/filter#indiv}.
#' Default: NULL, i.e. no filtering on individuals.
#' @param remove_individuals [character] Path to file with individuals to be
#' removed from the analysis. The file has to be a space/tab-delimited text file
#' with family IDs in the first column and within-family IDs in the second
#' column. All samples listed in this file will be removed from the current
#' analysis. See \url{https://www.cog-genomics.org/plink/1.9/filter#indiv}.
#' Default: NULL, i.e. no filtering on individuals.
#' @return data.frame containing family (FID) and individual (IID) IDs of
#' individuals to be removed from analysis.
checkRemoveIDs <- function(prefix, remove_individuals=NULL, keep_individuals) {
removeIDs <- NULL
allIDs <- data.table::fread(paste(prefix, ".fam", sep=""),
data.table=FALSE, stringsAsFactors=FALSE,
header=FALSE)
allIDs <- allIDs[,1:2]
if (!is.null(remove_individuals)) {
if (!file.exists(remove_individuals)) {
stop("File with individuals to remove from analysis does not exist: ",
remove_individuals)
}
removeIDs <- data.table::fread(remove_individuals, data.table=FALSE,
stringsAsFactors=FALSE,
header=FALSE)
}
if (!is.null(keep_individuals)) {
if (!file.exists(keep_individuals)) {
stop("File with individuals to keep in analysis does not exist: ",
keep_individuals)
}
pre_keepIDs <- data.table::fread(keep_individuals, data.table=FALSE,
stringsAsFactors=FALSE,
header=FALSE)
if(ncol(pre_keepIDs) != 2) {
stop("File keep_individual is not in the right format; should be ",
"two columns separated by space/tab.")
}
removeIDs <- rbind(removeIDs,
allIDs[!allIDs[,2] %in% pre_keepIDs[,2],])
}
return(removeIDs)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.