Nothing
pre8.split.train.test <- function(file.name, dir.file, dir.out, train.percent=80, separ="\t", index.prefix="index", file.has.ext=TRUE, resample=FALSE) {
#
# Splits the data file named file.name in dir.file, into TRAIN and TEST files, based on
# the percentage train.percent - how many percent of the data should go into TRAIN file.
# The file is expected to have last column represent CASE and CONTROL; this is necessary
# to make sure that train.percent of CASE and train.percent of CONTROL entries go into
# TRAIN file, to have even sample of both types of entries.
# If the data is saved in many files (for example one file per chromosome), this function
# is designed to first randomly sample the individuals for the TRAIN file for the first
# file it is run on. Then it uses this sampling for all other chromosomes on subsequent
# runs (if resample=FALSE), such that individuals in TRAIN file correspond to one another
# across all chromosome files (same holds for TEST files).
#
# Sample run:
#
# split.train.test(file.name="CGEM_Breast_complete.txt", dir.file="/home/briollaislab/olga/curr/data/f08_step2in", dir.out="/home/briollaislab/olga/curr/data/f09_step2in_traintest", train.percent=50)
#
# file.name: the name of the file. This file is expected to have the disease status
# in its last column (1 for CASE and 0 for CONTROL).
# dir.file: the directory where file.name can be found
# dir.out: the directory into which the two output files should go.
# train.percent: the pecentage (0 to 100) of what portion of data (rows) should go
# into the TRAIN file; the rest will be in TEST file. Ex: for 1000 entries,
# if train.percent=80, then 800 entries will appear in <file.name>.test, and
# 200 entries will go into <file.name>.train.
# separ: the separator used in the file.name to separate entries.
# index.prefix: the name of the index file to use for the separation of train from test entries.
# This file may already exist (has been created by previous runs of this program)
# file.has.ext: Whether or not <file.name> has a filename extension (ex. ".txt", ".ped", ".mlgeno")
# resample: additional file with indices that correspond to entries taken into the TRAIN file
# will be saved in the dir.out directory for the given train.percent.
# If resample=FALSE, all subsequent runs of this function on other files (for example for
# different chromosomes on the same dataset) with the same train.percent will use that saved file
# (if it exists).
# This is to make sure that the same individuals go into TRAIN file, across all chromosomes.
# If resample=TRUE, then new random resampling will take place and new index file will be generated
# and saved to the dir.out directory; Note, in this case the individuals generated by this file
# will not correspond to individuals generated for previous files; so for consistency, re-run
# all chromosomes with resample flag set to FALSE.
#
# Returns: the names of the resultant TRAIN and TEST files.
# - out$train: the name of the output train file
# - out$test: the name of the output test file
#
# **************************************************************
# OUTPUTS:
#
# - <file.name>.train.<train.percent>.<ext> - the output TRAIN file containing train.percent percent
# of the original data; will appear in dir.out directory.
# <file.name> here is the name without extension;
# <ext> is the extension part of <file.name> (i.e. the section that follows the last "." symbol)
# <train.percent> is specifying the percentage that was used to generate the file.
# - <file.name>.test.<train.percent>.<ext> - the entries for TEST file, containing the remaining
# (100 - train.percent) data. Similar to the TRAIN file above.
# - <index.prefix>.<train.percent>.txt - the file containing indicies of the entries corresponding to TRAIN file,
# this file will be generated if it does not already exist in dir.out, or if resample=TRUE.
#
#
# **************************************************************
# TODO: remove later:
#source("rand.ints.R")
#source("get.ext.R")
#source("get.file.copy.R")
if(missing(file.name)) stop("Name of input file must be provided.")
if(missing(dir.file)) stop("Name of file directory must be provided.")
if(missing(dir.out)) stop("Name of output file directory must be provided.")
full.name <- paste(dir.file, file.name, sep="/")
D <- read.table(full.name, header=F, sep=separ, stringsAsFactors=FALSE)
rowD <- nrow(D)
colD <- ncol(D)
temp.name <- paste(dir.out, "/", index.prefix,".", train.percent, ".txt", sep="")
mask.train <- NULL
# If file exists and no need to resample it, then just read it from file
if(file.exists(temp.name) && resample == FALSE) {
print("Using existing index file")
mask.train <- read.table(temp.name, header=F, stringsAsFactors=FALSE)
}
# If file does not exist, or we're required to resample it, or
# the file just read does not correspond to our data D, then resample it.
if (!file.exists(temp.name) || resample==TRUE || nrow(mask.train) != rowD) {
mask.case <- rep(1, times=rowD)
mask.control <- rep(0, times=rowD)
u.last.col <- unique(D[,colD])
u.len <- length(u.last.col)
if(u.len != 2)
print(paste("Note: last column of the file ", file.name, " has last column containing ", u.len, " distinct values (which does not seem to correspond to disease status).", sep=""))
if(u.len == 2) {
zero.index <- which(D[,colD] == 0)
mask.case[zero.index] <- rep(0, times=length(zero.index))
mask.control[zero.index] <- rep(1, times=length(zero.index))
}
mask.case <- rand.mask(mask.case, train.percent)
mask.control <- rand.mask(mask.control, train.percent)
mask.train <- mask.case + mask.control
write.table(mask.train, file=temp.name, quote=FALSE, row.names=FALSE, col.names=FALSE)
print(paste("Wrote out ", temp.name, " index file. Do not remove it until all chromosomes are processed.", sep=""))
}
mask.test <- (mask.train + 1) %% 2 # the opposite of mask.train
Dtrain <- D[mask.train==1, ]
Dtest <- D[mask.test==1, ]
# Obtain the filename extension, if it's said to exist; otherwise extension is ""
ending.name <- get.ext(file.name, file.has.ext)
name.part1 <- ending.name$part1
name.ext <- ending.name$ext
train.name <- paste(dir.out, "/", name.part1, ".train.", train.percent, name.ext, sep="")
test.name <- paste(dir.out, "/", name.part1, ".test.", train.percent, name.ext, sep="")
print(train.name)
print(test.name)
write.table(Dtrain, file=train.name, col.names=FALSE, row.names=FALSE, quote=FALSE, sep="\t")
write.table(Dtest, file=test.name, col.names=FALSE, row.names=FALSE, quote=FALSE, sep="\t")
return(list(train=train.name, test=test.name))
}
# Given a mask: array of 0s and 1s, sample p percent of values
# among the entries that correspond to '1' in the mask.
# For example in mask that has 15 entries (10 '1's and 5 '0's, randomly scattered),
# and p=60%, out of the 10 '1's, a random sample of 6 of them will remain a '1'
# whereas 4 of the '1's will be reset to '0'.
rand.mask <- function(mask, p) {
n.ones <- sum(mask)
if(n.ones > 0) {
n.p <- round(n.ones * p / 100)
vals <- rand.ints(n.p, 1, n.ones)
new.mask <- rep(0, times=n.ones)
new.mask[vals] <- rep(1, times=length(vals))
# Pass-by-value parameters, we can modify the mask locally
mask[mask==1] <- new.mask
}
return(mask)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.