script/make_yeast_10k.R

require(ezRun)
require(ShortRead)

setwdNew("/srv/GT/analysis/hubert/yeast_10k")


FLUX <<- "export FLUX_MEM=\"20G\"; /usr/local/ngseq/src/flux-simulator-1.2.1/bin/flux-simulator"
my.system(paste(FLUX, "--printParameters"))
my.system(paste(FLUX, "--help"))
parLines = c("GEN_DIR"="/srv/GT/reference/Saccharomyces_cerevisiae/Ensembl/EF4/Sequence/Chromosomes",
             "PAIRED_END"="TRUE",
             "READ_LENGTH"="36",
             "UNIQUE_IDS"="true",
             "READ_NUMBER"="10000",
             #"REF_FILE_NAME"="/srv/GT/reference/Saccharomyces_cerevisiae/Ensembl/EF4/Annotation/Genes/genes.gtf",
             "REF_FILE_NAME"="/srv/GT/analysis/hubert/yeast_10k/clean.gtf")
# grep -v _codon genes_sorted.gtf > clean.gtf
my.write.table(parLines, file="wt.par", col.names = FALSE, append = FALSE)
my.system(paste(FLUX, "-x", "-p", "wt.par"))

parLines = c("GEN_DIR"="/srv/GT/reference/Saccharomyces_cerevisiae/Ensembl/EF4/Sequence/Chromosomes",
             "PAIRED_END"="TRUE",
             "READ_LENGTH"="36",
             "UNIQUE_IDS"="true",
             "READ_NUMBER"="10000",
             #"REF_FILE_NAME"="/srv/GT/reference/Saccharomyces_cerevisiae/Ensembl/EF4/Annotation/Genes/genes.gtf",
             "REF_FILE_NAME"="/srv/GT/analysis/hubert/yeast_10k/clean.gtf",
             "PRO_FILE_NAME"="wt.pro",
             "FASTA"="true")
my.write.table(parLines, file="wt_1.par", col.names = FALSE, append = FALSE)
my.system(paste(FLUX, "-s -l", "-p", "wt_1.par"))



pro = my.read.table("wt.pro", header=FALSE, row.names=NULL)
sum(pro$V5)
pro[1:50, "V6"] = pro[1:50, "V6"] *2
pro[51:100, "V6"] = round(pro[51:100, "V6"] /2)
pro$V5 = pro$V6 / sum(pro$V6)
sum(pro$V6)
my.write.table(pro, file="mut.pro", col.names=FALSE, row.names=FALSE)


parLines = c("GEN_DIR"="/srv/GT/reference/Saccharomyces_cerevisiae/Ensembl/EF4/Sequence/Chromosomes",
             "PAIRED_END"="true",
             "READ_LENGTH"="36",
             "ERR_FILE"="35",
             "UNIQUE_IDS"="true",
             "READ_NUMBER"="20000",
             #"REF_FILE_NAME"="/srv/GT/reference/Saccharomyces_cerevisiae/Ensembl/EF4/Annotation/Genes/genes.gtf",
             "REF_FILE_NAME"="/srv/GT/analysis/hubert/yeast_10k/clean.gtf",
             "PRO_FILE_NAME"="mut.pro",
             "FASTA"="true")
my.write.table(parLines, file="mut_1.par", col.names = FALSE, append = FALSE)
my.system(paste(FLUX, "--threads 8", "-s -l", "-p", "mut_1.par"))


parLines = c("GEN_DIR"="/srv/GT/reference/Saccharomyces_cerevisiae/Ensembl/EF4/Sequence/Chromosomes",
             "PAIRED_END"="true",
             "READ_LENGTH"="36",
             "ERR_FILE"="35",
             "UNIQUE_IDS"="true",
             "READ_NUMBER"="20000",
             #"REF_FILE_NAME"="/srv/GT/reference/Saccharomyces_cerevisiae/Ensembl/EF4/Annotation/Genes/genes.gtf",
             "REF_FILE_NAME"="/srv/GT/analysis/hubert/yeast_10k/clean.gtf",
             "PRO_FILE_NAME"="mut.pro",
             "FASTA"="true")
my.write.table(parLines, file="mut_2.par", col.names = FALSE, append = FALSE)
my.system(paste(FLUX, "--threads 8", "-s -l", "-p", "mut_2.par"))



# parLines = c("GEN_DIR"="/srv/GT/reference/Saccharomyces_cerevisiae/Ensembl/EF4/Sequence/Chromosomes",
#              "PAIRED_END"="true",
#              "READ_LENGTH"="36",
#              "ERR_FILE"="35",
#              "UNIQUE_IDS"="true",
#              "READ_NUMBER"="20000",
#              #"REF_FILE_NAME"="/srv/GT/reference/Saccharomyces_cerevisiae/Ensembl/EF4/Annotation/Genes/genes.gtf",
#              "REF_FILE_NAME"="/srv/GT/analysis/hubert/yeast_10k/clean.gtf",
#              "PRO_FILE_NAME"="wt.pro",
#              "FASTA"="true")
# my.write.table(parLines, file="wt_1.par", col.names = FALSE, append = FALSE)
# my.system(paste(FLUX, "--threads 8", "-s -l", "-p", "wt_1.par"))


parLines = c("GEN_DIR"="/srv/GT/reference/Saccharomyces_cerevisiae/Ensembl/EF4/Sequence/Chromosomes",
             "PAIRED_END"="true",
             "READ_LENGTH"="36",
             "ERR_FILE"="35",
             "UNIQUE_IDS"="true",
             "READ_NUMBER"="20000",
             #"REF_FILE_NAME"="/srv/GT/reference/Saccharomyces_cerevisiae/Ensembl/EF4/Annotation/Genes/genes.gtf",
             "REF_FILE_NAME"="/srv/GT/analysis/hubert/yeast_10k/clean.gtf",
             "PRO_FILE_NAME"="wt.pro",
             "FASTA"="true")
my.write.table(parLines, file="wt_2.par", col.names = FALSE, append = FALSE)
my.system(paste(FLUX, "--threads 8", "-s -l", "-p", "wt_2.par"))




## build the dataset
dir.create("yeast_10k")
require(ShortRead)
ds = data.frame(row.names=c("wt_1", "wt_2", "mut_1", "mut_2"))
ds$"Genotype [Factor]" = c("wt", "wt", "mut", "mut")
ds$"Read1 [File]" = ""
ds$"Read2 [File]" = ""
ds$"Read Count" = 0
sm = "mut_2"
for (sm in rownames(ds)){
  reads = readFastq(paste0(sm, ".fastq"))
  r1 = reads[grepl("/1$", id(reads))]
  r2 = reads[grepl("/2$", id(reads))]
  use = width(r1) == 36 & width(r2) == 36
  r1@id = BStringSet(sub("/1", "", id(r1)))
  outFile = paste0("yeast_10k/", sm, "_R1.fastq")
  writeFastq(r1[use], file = outFile, compress=FALSE)
  my.system(paste("pigz -p 4 --best", outFile))
  ds[sm, "Read1 [File]"] = paste0("extdata", "/", outFile, ".gz")
  r2@id = BStringSet(sub("/2", "", id(r2)))
  outFile = paste0("yeast_10k/", sm, "_R2.fastq")
  writeFastq(r2[use], file = outFile, compress=FALSE)
  my.system(paste("pigz -p 4 --best", outFile))
  ds[sm, "Read2 [File]"] = paste0("extdata", "/", outFile, ".gz")
  ds[sm, "Read Count"] = sum(use)
}
ds$"Adapter1" = "GATCGGAAGAGCACACGTCTGAACTCCAGTCAC"
ds$"Adapter2" = "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT"
ds$strandMode = "sense"
ds$Species = "S. cerevisiae"
ds$"Enrichment Kit" = "poly-A"
my.write.table(ds, file="yeast_10k/dataset.tsv", head="Name")



ds = my.read.table("inst/extdata/yeast_10k/dataset.tsv")
ds$"Adapter1" = "GATCGGAAGAGCACACGTCTGAACTCCAGTCAC"
ds$"Adapter2" = "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT"
my.write.table(ds, file="inst/extdata/yeast_10k/dataset.tsv", head="Name")
uzh/ezRun documentation built on Sept. 16, 2024, 11:21 p.m.