# Nothing
pre8.split.train.test.batch <- function(dir.file, dir.out, prefix.file,
                                        key.file="", ending.file=".txt",
                                        train.percent=80, separ="\t",
                                        index.prefix="index",
                                        file.has.ext=TRUE, resample=FALSE) {
  # Batch driver: splits every matching file in dir.file into TRAIN and TEST
  # parts by calling pre8.split.train.test() once per file, then copies the
  # ".dat" files from dir.file to dir.out via get.file.copy().
  #
  # Files are selected with get.file.name() using prefix.file, key.file and
  # ending.file. Only the FIRST selected file is processed with the caller's
  # `resample` value; every later file is processed with resample=FALSE, so
  # that (per the original design notes) the sampling created or reused for
  # the first file is reused for all the others -- e.g. one file per
  # chromosome, with the same individuals landing in TRAIN everywhere.
  #
  # NOTE(review): the original header states that the last column of each file
  # holds disease status and that the split is stratified by CASE/CONTROL.
  # That logic lives in pre8.split.train.test(), which is not visible here.
  #
  # Arguments:
  #   dir.file:      directory in which the input files are looked up.
  #   dir.out:       directory that receives the output files.
  #   prefix.file:   passed to get.file.name() as `prefix` (start of file name).
  #   key.file:      passed to get.file.name() as `key` (text the name contains).
  #   ending.file:   passed to get.file.name() as `ending` (end of file name).
  #   train.percent: percentage (0 to 100) of the rows that should go into the
  #                  TRAIN file; the rest go into the TEST file. Example: for
  #                  1000 entries and train.percent=80, 800 entries go to TRAIN
  #                  and 200 entries go to TEST.
  #   separ:         field separator used inside the data files.
  #   index.prefix:  name prefix of the index file that records which entries
  #                  belong to TRAIN; it may already exist from an earlier run.
  #   file.has.ext:  whether the file names carry an extension
  #                  (ex. ".txt", ".ped", ".mlgeno").
  #   resample:      forwarded for the first file only. Per the original notes:
  #                  FALSE reuses a saved index file when one exists, TRUE draws
  #                  a new sample and writes a new index file (previous splits
  #                  made with the old index are then no longer consistent).
  #
  # Value:
  #   NULL when no file matches; otherwise the value of the final
  #   get.file.copy() call.
  #
  # Outputs (as described by the original header; written by
  # pre8.split.train.test(), TODO confirm against that function):
  #   - <file.name>.train.<train.percent>.<ext> in dir.out
  #   - <file.name>.test.<train.percent>.<ext> in dir.out
  #   - <index.prefix>.<train.percent>.txt in dir.out

  # ---- Required arguments ----
  if (missing(dir.file)) {
    stop("Name of input directory with pedegree files must be provided.")
  }
  if (missing(dir.out)) {
    stop("Name of output directory must be provided")
  }
  if (missing(prefix.file)) {
    stop("Prefix of the pedegree file name must be provided.")
  }

  # ---- 1. Collect the files to split ----
  matched <- get.file.name(dir=dir.file, prefix=prefix.file, key=key.file, ending=ending.file)
  n.matched <- length(matched)
  if (n.matched == 0) {
    return()
  }

  # ---- 2. Split each file ----
  # Local wrapper so the shared arguments are spelled out only once.
  split.one <- function(one.file, resample.flag) {
    pre8.split.train.test(file.name=one.file, dir.file=dir.file, dir.out=dir.out, train.percent=train.percent, separ=separ, index.prefix=index.prefix, file.has.ext=file.has.ext, resample=resample.flag)
  }

  # The first file honours the caller's resampling choice ...
  split.one(matched[1], resample)
  # ... all remaining files are always run with resample=FALSE.
  for (k in seq_len(n.matched)[-1]) {
    split.one(matched[k], FALSE)
  }

  # ---- 3. Copy over all .dat files ----
  get.file.copy(dir.in=dir.file, dir.out=dir.out, ending=".dat", verbal=FALSE)
}
# Any scripts or data that you put into this service are public.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.