# R/pre8.split.train.test.batch.R

pre8.split.train.test.batch <- function(dir.file, dir.out, prefix.file,
                                        key.file="", ending.file=".txt",
                                        train.percent=80, separ="\t",
                                        index.prefix="index",
                                        file.has.ext=TRUE, resample=FALSE) {
  # Batch driver around pre8.split.train.test(): splits every matching file in
  # dir.file into a TRAIN and a TEST file using one shared sampling.
  #
  # Files are selected via get.file.name(): those in dir.file whose names begin
  # with prefix.file, contain key.file, and end with ending.file.
  # The files are expected to have the disease status in the last column, so
  # that train.percent of the CASE entries and train.percent of the CONTROL
  # entries end up in the TRAIN file (an even sample of both types).
  # When the data is spread over many files (for example one per chromosome),
  # the sampling is drawn for the first file only and then reused for all the
  # remaining files, so that the individuals in the TRAIN files (and likewise
  # in the TEST files) correspond to one another across all files.
  # Finally, all .dat files are copied from dir.file to dir.out.
  #
  # Sample run (illustrative):
  #   pre8.split.train.test.batch(dir.file="in", dir.out="out",
  #                               prefix.file="chr", ending.file=".ped")
  #
  # Arguments:
  # dir.file: the directory where the input files can be found (required).
  # dir.out: the directory into which the output files should go (required).
  # prefix.file: the beginning of the names of the files to process (required).
  # key.file: a string that the names of the files to process must contain.
  # ending.file: the ending of the names of the files to process.
  # train.percent: the percentage (0 to 100) of the data (rows) that should go
  #       into the TRAIN file; the rest will be in the TEST file. Ex: for 1000
  #       entries, if train.percent=80, then 800 entries go into the TRAIN
  #       file and 200 entries go into the TEST file.
  # separ: the separator used in the files to separate entries.
  # index.prefix: the name of the index file to use for the separation of train
  #       from test entries. This file may already exist (created by previous
  #       runs of this program).
  # file.has.ext: whether or not the file names have a filename extension
  #       (ex. ".txt", ".ped", ".mlgeno").
  # resample: passed as given for the FIRST file only; every other file is
  #       always processed with resample=FALSE.
  #       If resample=FALSE, an index file already saved in dir.out for the
  #       given train.percent is reused (if it exists), which keeps the same
  #       individuals in the TRAIN file across runs.
  #       If resample=TRUE, a new random sampling takes place and a new index
  #       file is generated and saved to dir.out; individuals selected this way
  #       will not correspond to those of files split in earlier runs.
  #
  # Returns: NULL when no file matches; otherwise the value of the final
  #       get.file.copy() call.
  #       NOTE(review): the original header promised "the list of names of the
  #       resultant TRAIN and TEST files", but no such list is collected here -
  #       confirm the intended contract.
  #
  # **************************************************************
  # OUTPUTS (written by pre8.split.train.test(), which is not shown here):
  #
  # - <file.name>.train.<train.percent>.<ext> - the TRAIN file with
  #       train.percent percent of the original data; appears in dir.out.
  #       <file.name> is the name without extension, <ext> is the part that
  #       follows the last "." symbol, and <train.percent> is the percentage
  #       that was used to generate the file.
  # - <file.name>.test.<train.percent>.<ext> - the TEST file with the remaining
  #       (100 - train.percent) percent of the data; named like the TRAIN file.
  # - <index.prefix>.<train.percent>.txt - the indices of the entries that went
  #       into the TRAIN file; generated if it does not already exist in
  #       dir.out, or if resample=TRUE.
  # **************************************************************

  # Required arguments have no defaults; fail early when one is omitted.
  if (missing(dir.file)) {
    stop("Name of input directory with pedegree files must be provided.")
  }
  if (missing(dir.out)) {
    stop("Name of output directory must be provided")
  }
  if (missing(prefix.file)) {
    stop("Prefix of the pedegree file name must be provided.")
  }

  # *******************************************
  # 1. Collect the names of all files to split.
  ped.files <- get.file.name(dir=dir.file, prefix=prefix.file, key=key.file,
                             ending=ending.file)

  # Nothing matched: nothing to split and nothing is copied.
  if (length(ped.files) == 0) {
    return(NULL)
  }

  # *******************************************
  # 2. Split the first file with the caller's resample setting; this is the
  # only call that may (re)generate the index file.
  pre8.split.train.test(file.name=ped.files[1], dir.file=dir.file,
                        dir.out=dir.out, train.percent=train.percent,
                        separ=separ, index.prefix=index.prefix,
                        file.has.ext=file.has.ext, resample=resample)

  # 3. Split every remaining file with resample=FALSE so that the sampling
  # established above is reused. Dropping index 1 leaves an empty sequence
  # when there is a single file, so the loop is then skipped.
  for (k in seq_along(ped.files)[-1]) {
    pre8.split.train.test(file.name=ped.files[k], dir.file=dir.file,
                          dir.out=dir.out, train.percent=train.percent,
                          separ=separ, index.prefix=index.prefix,
                          file.has.ext=file.has.ext, resample=FALSE)
  }

  # *******************************************
  # 4. Copy over all .dat files; this call's value is the function's result.
  get.file.copy(dir.in=dir.file, dir.out=dir.out, ending=".dat", verbal=FALSE)
}

# Try the genMOSSplus package in your browser

# Any scripts or data that you put into this service are public.

# genMOSSplus documentation built on May 1, 2019, 10:31 p.m.