R/pre8.split.train.test.R

pre8.split.train.test <- function(file.name, dir.file, dir.out, train.percent=80, separ="\t", index.prefix="index", file.has.ext=TRUE, resample=FALSE) {
#
# Splits the data file named file.name in dir.file, into TRAIN and TEST files, based on
# the percentage train.percent - how many percent of the data should go into TRAIN file.
# The file is expected to have last column represent CASE and CONTROL; this is necessary
# to make sure that train.percent of CASE and train.percent of CONTROL entries go into
# TRAIN file, to have even sample of both types of entries. 
# If the data is saved in many files (for example one file per chromosome), this function
# is designed to first randomly sample the individuals for the TRAIN file for the first
# file it is run on. Then it uses this sampling for all other chromosomes on subsequent
# runs (if resample=FALSE), such that individuals in TRAIN file correspond to one another
# across all chromosome files (same holds for TEST files).  
#
# Sample run:
#
# split.train.test(file.name="CGEM_Breast_complete.txt", dir.file="/home/briollaislab/olga/curr/data/f08_step2in", dir.out="/home/briollaislab/olga/curr/data/f09_step2in_traintest", train.percent=50)
# 
# file.name: the  name of the file. This file is expected to have the disease status
#       in its last column (1 for CASE and 0 for CONTROL). 
# dir.file: the directory where file.name can be found
# dir.out: the directory into which the two output files should go.
# train.percent: the pecentage (0 to 100) of what portion of data (rows) should go
#       into the TRAIN file; the rest will be in TEST file. Ex: for 1000 entries,
#       if train.percent=80, then 800 entries will appear in <file.name>.test, and
#       200 entries will go into <file.name>.train.
# separ: the separator used in the file.name to separate entries.
# index.prefix: the name of the index file to use for the separation of train from test entries.
#       This file may already exist (has been created by previous runs of this program)
# file.has.ext: Whether or not <file.name> has a filename extension (ex. ".txt", ".ped", ".mlgeno")
# resample: additional file with indices that correspond to entries taken into the TRAIN file
#       will be saved in the dir.out directory for the given train.percent.
#       If resample=FALSE, all subsequent runs of this function on other files (for example for
#       different chromosomes on the same dataset) with the same train.percent will use that saved file
#       (if it exists).
#       This is to make sure that the same individuals go into TRAIN file, across all chromosomes.
#       If resample=TRUE, then new random resampling will take place and new index file will be generated
#       and saved to the dir.out directory; Note, in this case the individuals generated by this file
#       will not correspond to individuals generated for previous files; so for consistency, re-run
#       all chromosomes with resample flag set to FALSE.   
#
# Returns: the names of the resultant TRAIN and TEST files.
#    - out$train: the name of the output train file
#    - out$test: the name of the output test file
#
# ************************************************************** 
# OUTPUTS:
# 
# - <file.name>.train.<train.percent>.<ext> - the output TRAIN file containing train.percent percent 
#       of the original data; will appear in dir.out directory.
#       <file.name> here is the name without extension;
#       <ext> is the extension part of <file.name> (i.e. the section that follows the last "." symbol)  
#       <train.percent> is specifying the percentage that was used to generate the file. 
# - <file.name>.test.<train.percent>.<ext> - the entries for TEST file, containing the remaining
#       (100 - train.percent) data. Similar to the TRAIN file above.
# - <index.prefix>.<train.percent>.txt - the file containing indicies of the entries corresponding to TRAIN file,
#        this file will be generated if it does not already exist in dir.out, or if resample=TRUE.
# 
#
# **************************************************************

# TODO: remove later:
#source("rand.ints.R")
#source("get.ext.R")
#source("get.file.copy.R")

if(missing(file.name)) stop("Name of input file must be provided.")
if(missing(dir.file)) stop("Name of file directory must be provided.")
if(missing(dir.out)) stop("Name of output file directory must be provided.")

full.name <- paste(dir.file, file.name, sep="/")
D <- read.table(full.name, header=F, sep=separ, stringsAsFactors=FALSE)
rowD <- nrow(D)
colD <- ncol(D)

temp.name <- paste(dir.out, "/", index.prefix,".", train.percent, ".txt", sep="")

mask.train <- NULL

# If file exists and no need to resample it, then just read it from file
if(file.exists(temp.name) && resample == FALSE) {
	print("Using existing index file")
	mask.train <- read.table(temp.name, header=F, stringsAsFactors=FALSE)
}

# If file does not exist, or we're required to resample it, or
# the file just read does not correspond to our data D, then resample it.
if (!file.exists(temp.name) || resample==TRUE || nrow(mask.train) != rowD) {

	mask.case <- rep(1, times=rowD)
	mask.control <- rep(0, times=rowD)

	u.last.col <- unique(D[,colD])
	u.len <- length(u.last.col)

	if(u.len != 2)
	        print(paste("Note: last column of the file ", file.name, " has last column containing ", u.len, " distinct values (which does not seem to correspond to disease status).", sep=""))


	if(u.len == 2) {
		zero.index <- which(D[,colD] == 0)
		mask.case[zero.index] <- rep(0, times=length(zero.index))
		mask.control[zero.index] <- rep(1, times=length(zero.index))
	}

	mask.case <- rand.mask(mask.case, train.percent)
	mask.control <- rand.mask(mask.control, train.percent)

	mask.train <- mask.case + mask.control
	write.table(mask.train, file=temp.name, quote=FALSE, row.names=FALSE, col.names=FALSE)
	print(paste("Wrote out ", temp.name, " index file. Do not remove it until all chromosomes are processed.", sep=""))
}

mask.test <- (mask.train + 1) %% 2	# the opposite of mask.train

Dtrain <- D[mask.train==1, ]
Dtest <- D[mask.test==1, ]

# Obtain the filename extension, if it's said to exist; otherwise extension is ""
ending.name <- get.ext(file.name, file.has.ext)
name.part1 <- ending.name$part1
name.ext <- ending.name$ext

train.name <- paste(dir.out, "/", name.part1, ".train.", train.percent, name.ext, sep="")
test.name <- paste(dir.out, "/", name.part1, ".test.", train.percent, name.ext, sep="")

print(train.name)
print(test.name)
write.table(Dtrain, file=train.name, col.names=FALSE, row.names=FALSE, quote=FALSE, sep="\t")
write.table(Dtest, file=test.name, col.names=FALSE, row.names=FALSE, quote=FALSE, sep="\t")

return(list(train=train.name, test=test.name))

}


# Given a mask: array of 0s and 1s, sample p percent of values 
# among the entries that correspond to '1' in the mask.
# For example in mask that has 15 entries (10 '1's and 5 '0's, randomly scattered),
#   and p=60%, out of the 10 '1's, a random sample of 6 of them will remain a '1'
#   whereas 4 of the '1's will be reset to '0'. 
rand.mask <- function(mask, p) {
	n.ones <- sum(mask)
	if(n.ones > 0) {
		n.p <- round(n.ones * p / 100)

		vals <- rand.ints(n.p, 1, n.ones)

		new.mask <- rep(0, times=n.ones)
		new.mask[vals] <- rep(1, times=length(vals))
	
		# Pass-by-value parameters, we can modify the mask locally
		mask[mask==1] <- new.mask
	}
	return(mask)
}

Try the genMOSSplus package in your browser

Any scripts or data that you put into this service are public.

genMOSSplus documentation built on May 1, 2019, 10:31 p.m.