demo/gendemo.R

"?" <- function(...) {
        a<-readline("press ENTER to continue")
}

mach.path <- readline("Please enter the path to where MaCH1 is installed (ex. /software or /software/mach/executables, a 'mach1' file must be inside of it):\n")
if (!file.exists(paste(mach.path, "mach1", sep="/")))
	mach.path <- readline("That path was not correct. Please try again:\n")
if (!file.exists(paste(mach.path, "mach1", sep="/")))
	stop("Please install MaCH software: (http://www.sph.umich.edu/csg/abecasis/MACH/download/)")

?"Thank you. We will now remove directories that were generated by previous runs of this demo."
system("rm -r ./demo.genos")

?"Create the directory structure: will return names of folders: d0 to d11"
dirs <- pre0.dir.create(".", "demo.genos", "dir")

?"Find directory where original data files are and copy them into d0 folder. Also copy reference hapmap files into d4 folder for future use. Also, to speed up computation of last step, save MaCH with Hapmap output. Unzip all."
dir.orig.data <- paste(.find.package("genMOSSplus"), "/exdata/", sep="")
file.name.data <- "genotypes_10_90.txt"
file.name.csv <- "Identifiers_comma.csv"
file.name.anno <- "chr20.annotation.txt"
file.name.hap <- "chr20.hap.gz"
file.name.snps <- "chr20.snps"
file.name.conf <- "conf.txt"
file.name.ind.ann <- "indices.ann.txt"
get.file.copy(dir.in=dir.orig.data, dir.out=dirs$d0, fname=c(paste(file.name.data, ".tar.bz2", sep=""), paste(file.name.csv, ".tar.bz2", sep=""), paste(file.name.ind.ann, ".tar.bz2", sep=""), paste(file.name.conf, ".tar.bz2", sep="")), untarbz=TRUE)
get.file.copy(dir.in=dir.orig.data, dir.out=dirs$d4, fname=c(paste(file.name.anno, ".tar.bz2", sep=""), paste(file.name.hap, ".tar.bz2", sep=""), paste(file.name.snps, ".tar.bz2", sep="")), untarbz=TRUE)

file.name.mach.hap <- c("mach20CASE.mlgeno.tar.bz2", "mach20CASE.mlinfo.tar.bz2", "mach20CONTROL.mlgeno.tar.bz2", "mach20CONTROL.mlinfo.tar.bz2")
dir.hap <- paste(dirs$d11, "seg1/working/s02_machout", sep="/")
dir.create(path=dir.hap, recursive=TRUE)
get.file.copy(dir.in=dir.orig.data, dir.out=dir.hap, fname=file.name.mach.hap, untarbz=TRUE)


?"Convert your dataset into PLINK format, similar to the way ex2plink() function does, and save result into d1 folder"
prefix.ped <- "genotypes_" 	# this prefix will be used frequently
prefix.dat <- "genos_chr"	# this prefix will be used frequently
ex2plink(dir.file=dirs$d0, dir.out=dirs$d1, file.name=file.name.data, annotation.name=file.name.csv, out.prefix.ped=prefix.ped, out.prefix.dat=prefix.dat)

?"Convert all files in folder d1 from PLINK format to MaCH format (for imputation of missing SNPs), and save into d2 folder"
pre1.plink2mach.batch(dir.in=dirs$d1, dir.out=dirs$d2)
 
?"Clean up the dataset by removing all SNPs that have too many missing values. These missings in our dataset are represented as 'N/N'. Suppose we allow up to 10 percent of data to be missing for each SNP. Save clean result into d3 folder."
pre2.remove.genos.batch(dir.dat=dirs$d2, dir.out=dirs$d3, perc.snp=10, empty="N/N", prefix.dat=prefix.dat, prefix.case=prefix.ped, prefix.control=prefix.ped)


?"Run MaCH1 without hapmap for all the chromosomes, saving result to d5 folder. Since this dataset is small, we do not need to split it for processing, so set all numbers of subjects to 0 (meaning use everyone, 25 subjects). Also this is the only batch function that does not process all chromosomes (since it's best to do this in parallel). Note: this initial imputation of the database should be done WITHOUT Hapmap option."
prefix.ped.case <- paste(prefix.ped, "CASE", sep="")
prefix.ped.control <- paste(prefix.ped, "CONTROL", sep="")
i <- 1
while(i<=22) {
	pre3.call.mach.batch(dir.file=dirs$d3, dir.out=dirs$d5, prefix.dat=prefix.dat, prefix.ped=prefix.ped, prefix.out=prefix.ped.case, key.ped="CASE", num.iters=1, num.subjects=0, step2.subjects=0, empty="N/N", chrom.num=i, mach.loc=mach.path)
	pre3.call.mach.batch(dir.file=dirs$d3, dir.out=dirs$d5, prefix.dat=prefix.dat, prefix.ped=prefix.ped, prefix.out=prefix.ped.control, key.ped="CONTROL", num.iters=1, num.subjects=0, step2.subjects=0, empty="N/N", chrom.num=i, mach.loc=mach.path)
	i <- i+1
}

?"Combine CASE and CONTROL files together, saving to d6 folder. The output name will begin with initial prefix we've had before. Note in folder d5, the pedegree file names end with a filename extention '.mlgeno'; also note that the files have space separated entries."
pre4.combine.case.control.batch(dir.file=dirs$d5, dir.out=dirs$d6, prefix.case=prefix.ped.case, prefix.control=prefix.ped.control, prefix.out=prefix.ped, ending.case=".mlgeno", separ=" ")

?"Convert Alleles into three codes: 1, 2, 3, saving to d7 folder. Note in folder d6, the pedegree files have first two columns containing non-SNP information; also last column is the disease status: 0 or 1. Values for SNPs are expected to be A, C, T, G - so we enforce that check by 'letter.encoding', and if a SNP contains some other values (due to occasional bad prediction by MaCH), the SNP will be removed. Note that output file will not have any leading non-SNP columns; so separate file .fam will be written into d7 folder with patient IDs."
name.fam <- "patients.fam"  # This is the name of file that will contain patient IDs
pre5.genos2numeric.batch(dir.ped=dirs$d6, dir.out=dirs$d7, prefix.ped=prefix.ped, prefix.dat=prefix.dat, num.nonsnp.col=2, num.nonsnp.last.col=1, letter.encoding=TRUE, remove.bad.genos=TRUE, save.ids.name=name.fam)

?"Now we merge all chromosomes into one file, save to d8 folder."
name.ped <- "genotypes.txt"
name.dat <- "genos.dat"
pre6.merge.genos(dir.file=dirs$d7, dir.dat=dirs$d7, dir.out=dirs$d8, file.out=name.ped, dat.out=name.dat, prefix.file=prefix.ped, prefix.dat=prefix.dat)

?"Optionally, we may add confounding variables to the end of the data file. There are two variations of the function - use either R or Unix commands. Unix is faster for large datasets, since it does not require R to load up the entire file into memory. We save results into d9 folder."

pre7.add.conf.var.unix(file.name=name.ped, dir.file=dirs$d8, file.fam=name.fam, dir.fam=dirs$d7, file.conf=file.name.conf, dir.conf=dirs$d0, file.out="genos_plus_conf.txt", dir.out=dirs$d9)

?"Split the dataset into train and test sets, 50% each, save result to d10 folder. Note, since we merged all chromosomes into one file, we do not need to run batch version. Full names of train and test files will be returned."
genos <- pre8.split.train.test(file.name=name.ped, dir.file=dirs$d8, dir.out=dirs$d10, train.percent=50)

?"Now apply the same train-test split to the patients' ID file for future reference. Recall, when converting Alleles to 1, 2, and 3, the patient IDs have been saved separately in d7 folder.  Note: the first time we ran the pre8() function, an index file was generated with train indicies in d10 folder. Since we intend to save result of patients' IDs into same d10 folder, that same index file will be used for consecutive runs of pre8() function, as long as train.percent remains the same and resample=FALSE."
pre8.split.train.test(file.name=name.fam, dir.file=dirs$d7, dir.out=dirs$d10, train.percent=50, resample=FALSE)

?"Run MOSS algorithm on the merged file in folder d8 or d9 or d10. MaxVars should be 3-6. However, since for demo purposes this run takes too long, we'll demonstrate run1.moss functionality on a smaller file, like chromosome 22 from directory d7:"
#models <- run1.moss(filename=paste(dirs$d8, name.ped, sep="/"), replicates = 1, maxVars = 3, k = 2)
models <- run1.moss(filename=paste(dirs$d7, "genotypes_22_num.txt", sep="/"), replicates = 1, maxVars = 3, k = 2)
models


?"To explore a small region of a chromosome, re-run MaCH1 with hapmap and prepare it for MOSS analysis."
subdata <- tune1.subsets(dir.dat=dirs$d3, dir.ped=dirs$d3, dir.ann=dirs$d1, dir.pos.snp=dirs$d4, dir.pos.ann=dirs$d4, dir.pos.hap=dirs$d4, dir.out=dirs$d11, prefix.dat=prefix.dat, prefix.ped=prefix.ped, prefix.ann=prefix.dat, prefix.pos.snp="chr", prefix.pos.ann="chr", prefix.pos.hap="chr", pos.list.triple=c(20, 13034962, 13221577, 20, 58174374, 58765836), out.name.subdir="seg1", out.prefix="subdata", mach.loc=mach.path)

?"Now we can run MOSS algorithm on this subset data file. If desired, this file is ready to be processed by steps pre7 and pre8."
models.subset <- run1.moss(filename=subdata, replicates = 1, maxVars = 3, k = 2)
models.subset

Try the genMOSSplus package in your browser

Any scripts or data that you put into this service are public.

genMOSSplus documentation built on May 1, 2019, 10:31 p.m.