# Get top loci
#' @title Creates a panel of the top n unlinked loci, and exports the list of loci
#' @description Extract the genotypes of individuals at the top n (by Fst) unlinked loci. The default threshold of r2>0.2 is employed for assigning 'linked' loci (default for plink).
#' @param genepop A file path to the genepop format file you wish to create your panel from
#' @param where.plink A file path to the PLINK installation folder.
#' @param where.pgdspider A file path to the PGDspider installation folder.
#' @param r2.threshold The minimum r^2 threshold to consider a pair of loci to be in LD
#' @param fst.threshold The minimum FST threshold required to retain a locus
#' @param ld.window Number of adjacent SNPs to compare each SNP against for LD - default is NULL, which translates to a window size of 99999, which essentially asks to compare each SNP against all others
#' @param ldpop A string which populations (default: "All") will be used to calculate linkage disequilibrium. Names must match names returned by genepop_detective().
#' @param allocate.pgd.ram An integer value in GB to specify the maximum amount of RAM to allocate to PGDspider. The default is 1 GB, which should be sufficient for most analyses.
#' @param return.workspace (default: TRUE) Logical query to return the output to the workspace
#' @param save.output Logical query (default: FALSE) to save the output to the same location as the file being analyzed. Each of the outputs of the function will be saved as a separate file with the file name of the orginal data appended with "_Linkages", "Loci_FST", and "Unlinked_Loci_FST" for the pairwise linked loci along with their r^2, all loci with their global Fst, and only the top unlinked loci with their Fst respectively.
#' @rdname genepop_toploci
#' @export
#' @importFrom diveRsity diffCalc
#' @importFrom stringr str_split str_detect
#' @importFrom plyr rbind.fill
#' @importFrom utils write.table
#' @importFrom utils read.table
genepop_toploci <- function(genepop, where.plink, where.pgdspider, r2.threshold = 0.2, fst.threshold = 0.05, ld.window = NULL, ldpop = "All", allocate.pgd.ram = 1, return.workspace = TRUE, save.output = FALSE){
writeLines("Note that this function works with Plink 1.9 for now. If you have Plink 2.0, the function may not perform as intended.")
path.start <- getwd() ### where to write the files created by genepopedit to
## check for retrun options
if(!save.output & !return.workspace){
stop("No output option specified. At least one output parameter (save.output or return.workspace) must be TRUE.")
}
## find the populations in the file
pops.exist <- genepop_detective(genepop) ## see what populations are in the file
if(allocate.pgd.ram%%1 != 0){
stop("Please specify an integer GB value to allocate to PGDspider.")
}
#Set up ram allocation. Note that only 1024 gb of ram will work with Windows
allocate.pgd.ram <- allocate.pgd.ram*1024
if(Sys.info()["sysname"] == "Windows" & allocate.pgd.ram>1024){
allocate.pgd.ram=1024
writeLines("Note that currently PGDspider can only utilize ~1 GB of RAM on windows based operating systems. Periodically check back to https://github.com/rystanley/genepopedit for any updates to this limitation.
")}
#Variable checks
if(r2.threshold < 0 | r2.threshold > 1){
stop("r^2 threshold must be a value between 0 and 1")
}
if(r2.threshold<0.2){
writeLines("Linkage detection threshold is low (<0.2). Linkage will be classified at a higher frequency than default PLINK selection parameters.
")
}
if(is.null(ld.window)){
ld.window = 999999 ### sets the LD window to essentially check every SNP pairwise
}
if(ld.window < 0){
stop("LD window must be positive")
}
if(length(which(ldpop %in% c("All",pops.exist)))==0){
stop(paste0("Parameter 'ldpop' must be a string of population names in the dataset, or ", "'All'. ", "Function stopped."),call. = FALSE)
}
if(fst.threshold < 0 | fst.threshold > 1){
stop("FST threshold must be a value between 0 and 1")
}
# modify the path to play nice with spaces
path.start.PGD <- gsub(x = path.start, pattern = " ", replacement = "\\")
where.pgdspider.PGD <- gsub(x = where.pgdspider, pattern = " ", replacement = "\\ ", fixed = TRUE)
#Calculate FST
writeLines("Calculating locus-specific Fst\n\n")
FST.dat <- diveRsity::diffCalc(infile = genepop,outfile = NULL,fst = T,bs_locus = F)
FST.dat <- FST.dat$std_stats[1:length(FST.dat$std_stats$loci)-1,]
FSTs <- FST.dat$Fst
FST.df <- data.frame(genepopedit::genepop_detective(genepop,variable="Loci"), FSTs)
names(FST.df)[1] <- "loci"
FST.df <- FST.df[base::order(FST.df$FSTs, decreasing = TRUE),
]
FST.Filter.Vec <- as.character(FST.df[which(FST.df$FSTs >=
fst.threshold), 1])
#Console message
writeLines("Converting GENEPOP to MAP-PED format.")
writeLines("
")
writeLines("Warning messages are expected as part of conversion process using PGDspider.
")
### Will LD be calculated for All or specified populations.
if(ldpop != "All"){
subPOP <- as.character(ldpop)
## subset out the population in which LD is to be calculated - this will make a file, which will be deleted after
subset_genepop(genepop = genepop, spop = subPOP, subs = FST.Filter.Vec, keep = TRUE, path = paste0(path.start, "/", "subset_for_LD.txt"))
## remember path to the file created by subset_genepop
sub_data_path <- paste0(path.start, "/", "subset_for_LD.txt")
}
if(ldpop == "All"){
popLDsubsetDF <- data.frame(op=pops.exist, paste0("PopA", rep(1:length(pops.exist)))) ## make a both the same
subset_genepop_aggregate(genepop = genepop, agpopframe = popLDsubsetDF, path = paste0(path.start, "/", "subset_for_LD.txt"))
sub_data_path <- paste0(path.start, "/", "subset_for_LD.txt")
subset_genepop(genepop = sub_data_path, subs = FST.Filter.Vec, keep = TRUE, path = paste0(path.start, "/", "subset_for_LD.txt"))
## now rename
subset_genepop_rename(genepop = sub_data_path, path = sub_data_path, nameframe = popLDsubsetDF,renumber=TRUE)
}
### convert file to .ped and .map using PGD spider
## create spid file
spidTop <- "# spid-file generated: Thu Mar 10 09:40:22 AST 2016
# GENEPOP Parser questions
PARSER_FORMAT=GENEPOP
# Enter the size of the repeated motif (same for all loci: one number; different: comma separated list (e.g.: 2,2,3,2):
GENEPOP_PARSER_REPEAT_SIZE_QUESTION=
# Select the type of the data:
GENEPOP_PARSER_DATA_TYPE_QUESTION=SNP
# How are Microsat alleles coded?
GENEPOP_PARSER_MICROSAT_CODING_QUESTION=REPEATS
# PED Writer questions
WRITER_FORMAT=PED
# Save MAP file"
map.loc <- paste0("PED_WRITER_MAP_FILE_QUESTION= ", "PGDtest")
spidBottom <- "# Replacement character for allele encoded as 0 (0 encodes for missing data in PED):
PED_WRITER_ZERO_CHAR_QUESTION=
# Specify the locus/locus combination you want to write to the PED file:
PED_WRITER_LOCUS_COMBINATION_QUESTION=
# Do you want to save an additional MAP file with loci information?
PED_WRITER_MAP_QUESTION=true"
spid.file <- c(spidTop, map.loc, spidBottom)
## write spid file
write(x = spid.file, file = paste0(path.start, "/", "hyb.spid"))
### move spid file to the PGDspider folder
file.copy(from = paste0(path.start, "/hyb.spid"), to = where.pgdspider, overwrite = TRUE)
remember.spidpath <- paste0(path.start, "/", "hyb.spid")
## move the input file as well to the same location as PGDspider - this makes this step so much easier
file.copy(from <- sub_data_path, to = where.pgdspider, overwrite = TRUE)
#### OS X LINUX call
if(Sys.info()["sysname"] != "Windows"){
### create a string to call PGDspider
input.file.call <- paste0("-inputfile subset_for_LD.txt")
execute.SPIDER <- paste0("java -Xmx", allocate.pgd.ram, "m -Xms512m -jar PGDSpider2-cli.jar")
spid.call <- "-spid hyb.spid"
input.format <- "-inputformat GENEPOP"
output.format <- "-outputformat PED"
goto.spider <- paste0("cd ", where.pgdspider.PGD, "; ", execute.SPIDER)
output.file.path <- paste0("-outputfile PGDtest.ped")
## string to run
run.PGDspider <- paste0(goto.spider, " ", input.file.call, " ", input.format, " ", output.file.path, " ", output.format, " ", spid.call)
### run PGDspider through system
system(run.PGDspider)
} # END OSX LINUX if
#### Windows call
if(Sys.info()["sysname"] == "Windows"){
### create a string to call PGDspider
input.file.call <- paste0("-inputfile subset_for_LD.txt")
execute.SPIDER <- paste0("java -Xmx", allocate.pgd.ram, "m -Xms512m -jar PGDSpider2-cli.jar")
spid.call <- "-spid hyb.spid"
input.format <- "-inputformat GENEPOP"
output.format <- "-outputformat PED"
goto.spider <- paste0("cd ", where.pgdspider.PGD, " && ", execute.SPIDER)
output.file.path <- paste0("-outputfile PGDtest.ped")
## string to run
run.PGDspider <- paste0(goto.spider, " ", input.file.call, " ", input.format, " ", output.file.path, " ", output.format, " ", spid.call)
### run PGDspider through system
shell(run.PGDspider)
}
## move the created ped and map files to the PLINK folder
ped.path <- paste0(where.pgdspider, "/", "PGDtest.ped")
map.path <- paste0(where.pgdspider, "/", "PGDtest.map")
file.copy(from = ped.path, to = where.plink, overwrite = TRUE)
file.copy(from = map.path, to = where.plink, overwrite = TRUE)
plink_ped_path <- paste0(where.plink, "/", "PGDtest.ped")
plink_map_path <- paste0(where.plink, "/", "PGDtest.map")
#Console message
writeLines("
")
writeLines("Calculating Linkage.
")
### prepare a string to call PLINK
### modify PLINK path so it plays nice with system
where.plink.go <- gsub(x = where.plink, pattern = " ", replacement = "\\ ", fixed = TRUE)
go.to.PLINK <- paste0("cd ", where.plink.go)
### OSX LINUX PLINK call
if(Sys.info()["sysname"] != "Windows"){
execute.PLINK <- paste0(go.to.PLINK, "; ", "./plink --file PGDtest --r2 --ld-window-r2 ", r2.threshold, " --ld-window ", ld.window, " --noweb")
### run PLINK through system
system(execute.PLINK)
}
### Windows PLINK CALL
if(Sys.info()["sysname"] == "Windows"){
execute.PLINK <- paste0(go.to.PLINK, " && ", "plink --file PGDtest --r2 --ld-window-r2 ", r2.threshold, " --ld-window ", ld.window, " --noweb")
### run PLINK through system
shell(execute.PLINK)
}
#Console message
writeLines("
")
writeLines("Creating Fst optimized unlinked panel.
")
## copy the LD file created by PLINK to the working directory
file.copy(from = paste0(where.plink, "plink.ld"), to = path.start)
## the format of the LD file is a bit messed up - it is not a regular matrix - have to modify the file a bit to get it to read in properly
## rename the file and change it to a txt
file.rename(from = paste0(path.start, "/", "plink.ld"), to = paste0(path.start, "/", "plink.txt"))
## read the file in as characters - there are an uneven number of spaces between data, so have to remove those before can read as a table
LDs <- readChar(con = "plink.txt", file.info("plink.txt")$size) ## read in as a long character string
LDs_reform <- gsub(x = LDs, pattern = "\\ +", replacement = " ") ### remove more than 1 space
## save as a new .txt file, which is now formatted to be read into R easily
write(x = LDs_reform, file = "LDsReform.txt")
## read back in
Linked <- utils::read.table("LDsReform.txt", header = TRUE)
## keep only the needed columns
Linked <- Linked[c("SNP_A", "SNP_B", "R2")]
### turn both columns into a single vector of unduplicated loci names
ld.unique <- c(as.character(Linked$SNP_A), as.character(Linked$SNP_B))
ld.unique <- as.character(ld.unique[which(duplicated(ld.unique)==FALSE)])
if(length(ld.unique) > 1){
FST.df2 <- FST.df[which(FST.df$loci %in% ld.unique),]
FST.ld.ordered <- as.character(FST.df2[order(FST.df2$FSTs,decreasing=T),"loci"])
holdlist <- list()
for(i in FST.ld.ordered){
hold <- c(i,as.character(Linked[which(Linked[,1]%in%i),2]),
as.character(Linked[which(Linked[,2]%in%i),1]))
hold2 <- NULL
for(k in 1:length(hold)){hold2 <- c(hold2,which(FST.ld.ordered == hold[k]))}
holdlist[[i]] <- hold2
}
linked.ranks.df2 <- plyr::rbind.fill(lapply(holdlist,function(y){as.data.frame(t(y),stringsAsFactors=FALSE)}))
LRD2 <- as.matrix(linked.ranks.df2)
highest1 <- LRD2[1,which.min(LRD2[1,])]
otherdat1 <- LRD2[1,-which.min(LRD2[1,])]
otherdat1 <- otherdat1[!is.na(otherdat1)]
to.keep <- FST.ld.ordered[Optimfunc(linked.ranks.df2)]
to.drop <- setdiff(as.character(FST.df2$loci),to.keep)
FST.df2$loci <- as.character(FST.df2$loci)
FST.df$loci <- as.character(FST.df$loci)
to.keep <- FST.df[-which(FST.df$loci %in% to.drop),]
#Console message
writeLines("
")
writeLines("Writing output.
")
Unlinked.panel <- FST.df[which(FST.df$loci %in% to.keep$loci),]
your.panel <- FST.df
your.panel_un <- Unlinked.panel[which(Unlinked.panel$FSTs>=fst.threshold),]
your.panel <- your.panel[order(your.panel$FSTs,decreasing=TRUE),]
your.panel_unlinked <- your.panel_un[order(your.panel_un$FSTs,decreasing = TRUE),]
Linked.df <- Linked
} ### END IF There are loci in LD
if(length(ld.unique) < 1){ # if not linked loci
writeLines("Writing output")
writeLines("No linked loci detected, all loci returned.")
your.panel <- FST.df
your.panel <- your.panel[order(your.panel$FSTs,decreasing=TRUE),]
your.panel_unlinked <- your.panel
Linked.df <- NA
}
#clean up temporary files used in the analysis.
file.remove(remember.spidpath)
file.remove(sub_data_path)
file.remove(ped.path)
file.remove(map.path)
file.remove(paste0(where.pgdspider, "/hyb.spid"))
file.remove(plink_map_path)
file.remove(plink_ped_path)
file.remove(paste0(path.start, "/plink.txt"))
file.remove(paste0(path.start, "/LDsReform.txt"))
file.remove(paste0(where.pgdspider,"/subset_for_LD.txt"))
#Console message
writeLines("
")
writeLines("Process Completed.")
## return output
Output <- list(Linkages=Linked.df,
Fst=your.panel,
Fst_Unlinked=your.panel_unlinked
)
if(save.output == TRUE){
write_path <- gsub(x = genepop, pattern = ".txt", replacement = "")
utils::write.table(x = Output$Linkages, file = paste0(write_path, "_linkages.txt"),row.names = FALSE,quote=FALSE)
utils::write.table(x = Output$Fst, file = paste0(write_path, "_Loci_FST.txt"),row.names=FALSE,quote = FALSE)
utils::write.table(x = Output$Fst_Unlinked, file = paste0(write_path, "_Unlinked_Loci_FST.txt"),row.names=FALSE,quote = FALSE)
}
if(return.workspace == TRUE){
return(Output)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.