readData <- function(path,populations=FALSE,outgroup=FALSE,include.unknown=FALSE,gffpath=FALSE,format="fasta",parallized=FALSE,progress_bar_switch=TRUE,
FAST=FALSE,big.data=FALSE,SNP.DATA=FALSE){
## CHECK INPUT
if(!file.exists(path)){stop("Cannot find path !")}
if(!file.info(path)[2]){stop("Put your file/files in a folder !")}
if(gffpath[1]!=FALSE){
if(!file.exists(gffpath)){stop("Cannot find gff path !")}
if(!file.info(gffpath)[2]){stop("Put your file/files in a folder ! (GFF)")}
}
if(format=="HapMap"){SNP.DATA=TRUE;FAST=TRUE}
if(format=="VCF") {SNP.DATA=TRUE;FAST=TRUE}
#if(format=="VCFhap"){SNP.DATA=TRUE;FAST=TRUE}
if(SNP.DATA){big.data=TRUE}
# Parallized version of readData
if(parallized){
#library(parallel)
n.cores <- parallel::detectCores() #multicore:::detectCores()
#n.cores <- n.cores - 1
#options(cores=n.cores)
#getOption('cores')
files <- list.files(path)
split_files <- split(files,sort(rep(1:n.cores,ceiling(length(files)/n.cores))))
if(gffpath[1]!=FALSE){
gff_files <- list.files(gffpath)
split_files_gff <- split(gff_files,sort(rep(1:n.cores,ceiling(length(gff_files)/n.cores))))
}
xxx <- NULL
yyy <- NULL
for (i in 1:n.cores){
command <- paste("mkdir split",i,sep="")
system(command)
filename <- paste("split",i,sep="")
filename_path <- file.path(getwd(),filename)
sapply(split_files[[i]],function(x){
command <- file.path(path,x)
#command <- paste("mv",command,filename_path,sep=" ")
command <- paste("cp",command,filename_path,sep=" ")
system(command)
})
xxx <- c(xxx,filename_path)
if(gffpath[1]!=FALSE){
command <- paste("mkdir GFFRObjects_split",i,sep="")
system(command)
filename <- paste("GFFRObjects_split",i,sep="")
filename_path <- file.path(getwd(),filename)
sapply(split_files_gff[[i]],function(x){
command <- file.path(gffpath,x)
#command <- paste("mv",command,filename_path,sep=" ")
command <- paste("cp",command,filename_path,sep=" ")
system(command)
})
yyy <- c(yyy,filename_path)
}
}
if(gffpath[1]==FALSE){
#print(xxx)
#print(format)
#print(progress_bar_switch)
#stop("Jo")
rueck <- parallel::mclapply(xxx,readData,
format=format,parallized=FALSE,progress_bar_switch=FALSE,
FAST=FAST,big.data=big.data,SNP.DATA=SNP.DATA,
include.unknown=include.unknown, mc.cores = n.cores, mc.silent = TRUE)
}else{
cat("Calculation ... \n")
INPUT <- vector("list",n.cores)
for(iii in 1:n.cores){
INPUT[[iii]] <- c(xxx[iii],yyy[iii])
}
rueck <- parallel::mclapply(INPUT,function(x){
daten <- x[1]
gffdaten <- x[2]
TT <- readData(path=daten,
gffpath=gffdaten,format=format,parallized=FALSE,progress_bar_switch=FALSE,
FAST=FAST,big.data=big.data,SNP.DATA=SNP.DATA,include.unknown=include.unknown)
return(TT)
},mc.cores = n.cores, mc.silent = TRUE, mc.preschedule = TRUE)
}
genome <- concatenate(rueck,n.cores)
genome@basepath <- file.path(path)
genome@project <- file.path(path)
liste <- list.files(path,full.names=TRUE)
genome@genelength <- length(liste)
if(FAST){
genome@Pop_Slide$calculated <- TRUE # nur wegen Einschraenkungen, die auch im Slid Modus gelten
}else{
genome@Pop_Slide$calculated <- FALSE
}
if(!is.list(populations)){genome@populations <- list(NULL)}else{genome@populations <- populations}
genome@outgroup <- outgroup
## Move splits back to original folder
## and Delete splits afterwards #TODO
for (i in 1:n.cores){
#command <- paste("mv split",i,"/*"," ",path,sep="")
#system(command)
command <- paste("rm -r split",i,sep="")
system(command)
if(gffpath[1]!=FALSE){
#command <- paste("mv split",i,"/*"," ",gffpath,sep="")
#system(command)
command <- paste("rm -r GFFRObjects_split",i,sep="")
system(command)
}
}# end of delete
cat("\n")
return(genome)
# SNOWFALL
#def.cpus <- 3
#sfInit(parallel=TRUE,cpus=def.cpus,type="SOCK")
#sfLibrary(ape)
#sfExportAll() # notwendig !
#sfSource("C:/Users/NULLEINS/Desktop/POPGEN/GENOME.R")
#sfSource("C:/Users/NULLEINS/Desktop/POPGEN/GEN.R")
#sfSource("C:/Users/NULLEINS/Desktop/POPGEN/DATA.R")
# split the Data
#files <- list.files(path)
#split_files <- split(files,1:def.cpus)
#xxx <- NULL
#for (i in 1:def.cpus){
# command <- paste("md split",i,sep="")
# shell(command)
# filename <- paste("split",i,sep="")
# filename_path <- file.path(getwd(),filename)
# sapply(split_files[[i]],function(x){
# command <- file.path(path,x)
# command <- paste("cp",command,filename_path,sep=" ")
# shell(command)
# })
#
# xxx <- c(xxx,filename_path)
#}
#genome <- sfLapply(xxx,readData,parallized=FALSE)
#sfStop()
#return(genome)
#genome <- sfLapply()
}
#### End of parallization
methods <- "DATA"
########################
# if (.Platform$OS.type == "unix") {
# path_C_code <- file.path(.path.package("PopGenome"),"libs","PopGenome.so")
# }else{
# ## muss je nach 32 64 Win geaendert werden !
# path_C_code <- file.path(.path.package("PopGenome"),"libs","PopGenome.dll")
# }
# dyn.load(path_C_code,PACKAGE="PopGenome")
########################
npops <- length(populations)
popnames <- paste("pop",1:npops)
# Get the alignments
liste <- list.files(path,full.names=TRUE)
liste2 <- list.files(path)
liste3 <- gsub("\\.[a-zA-Z]*","",liste2)
# sort the list -------------
ordered <- as.numeric(gsub("\\D", "", liste))
if(!any(is.na(ordered))){
names(ordered) <- 1:length(ordered)
ordered <- sort(ordered)
ids <- as.numeric(names(ordered))
liste <- liste [ids]
liste2 <- liste2[ids]
liste3 <- liste3[ids]
}
# ---------------------------
gff_objects <- vector("list",length(liste))
SNP.GFF <- FALSE
GFF.BOOL <- FALSE
## GET THE GFF FILES -----------------------------
if(gffpath[1]!=FALSE){
GFF.BOOL <- TRUE
gff_liste <- list.files(gffpath,full.names=TRUE)
gff_liste2 <- list.files(gffpath)
gff_liste3 <- gsub("\\.[a-zA-Z]*","",gff_liste2)
# print(gff_liste3)
# print("-------------")
# print(liste3)
#treffer <- match(gff_liste3,liste3)
treffer <- match(liste3,gff_liste3)
### Print a warning when GFF files are missing
if(any(is.na(treffer))){
cat("WARNING:: Could not find GFF files for:\n")
cat(liste3[is.na(treffer)],"\n",sep=",")
}
#print(liste3)
#print(gff_liste3)
#print("------------")
#print(treffer)
gff_liste <- gff_liste[treffer]
#print(liste)
#print(gff_liste)
#stop("")
}
# ---------------------------------------------
################# Get the poppairs names ##################################
if(npops>1){
if(outgroup[1]!=FALSE){
poppairs <- choose(npops+1,2) # Outgroup is included !!
pairs <- combn(1:(npops+1),2)
}else{
poppairs <- choose(npops,2) # Outgroup is not included !!
pairs <- combn(1:(npops),2)
}
###########################################################################
#### --- Names of population pairs --- #######################################
nn <- paste("pop",pairs[1,1],"/pop",pairs[2,1],sep="")
if(dim(pairs)[2]>1){ # more than 2 Populations
for(xx in 2:dim(pairs)[2]){
m <- paste("pop",pairs[1,xx],"/pop",pairs[2,xx],sep="")
nn <- c(nn,m)
}
}#END if
}# End npops > 1
else{poppairs <- 1;nn <- "pop1"}
##### ------------------------------ #########################################
### --- ------------------------------ ####
sizeliste <- length(liste)
genome <- new("GENOME")
genome@basepath <- file.path(path)
genome@project <- file.path(path)
genome@genelength <- sizeliste
#populationsX <- as.matrix(populations)
#rownames(populationsX) <- popnames
#colnames(populationsX) <- "Number of Samples"
if(!is.list(populations)){genome@populations <- list(NULL)}else{genome@populations <- populations}
genome@poppairs <- nn
genome@outgroup <- outgroup
genome@region.names <- liste2
DATABOOL <- is.element("DATA",methods)
nsites <- integer(sizeliste)
nsites2 <- integer(sizeliste)
# if(ALL_METHODS){
# genelist <- vector("list",length(liste)) # list of objects (class GEN )
# datalist <- vector("list",length(liste)) # list of objects (class DATA)
# }
# INIT
# do this calculation every time
region.data <- new("region.data")
region.stats <- new("region.stats")
init <- vector("list",sizeliste)
init2 <- numeric(sizeliste)
init3 <- rep(NaN,sizeliste)
# region.data init
populationsX <- init
populations2 <- init
popmissing <- init
outgroupX <- init
outgroup2 <- init
CodingSNPS <- init
UTRSNPS <- init
IntronSNPS <- init
ExonSNPS <- init
GeneSNPS <- init
reading.frame <- init
rev.strand <- init
Coding.matrix <- init
Coding.matrix2 <- init
UTR.matrix <- init
Intron.matrix <- init
Exon.matrix <- init
Gene.matrix <- init
#Coding.info <- init
#UTR.info <- init
#Intron.info <- init
#Exon.info <- init
#Gene.info <- init
Coding.region <- init2
UTR.region <- init2
Intron.region <- init2
Exon.region <- init2
Gene.region <- init2
transitions <- init # matrix_sv transition war eine 1
biallelic.matrix <- init # matrix_pol
biallelic.sites <- init # matrix_pos
biallelic.sites2 <- init
reference <- init
matrix_codonpos <- init # codonpos. of biallelics
synonymous <- init # synnonsyn
matrix_freq <- init
n.singletons <- init # unic
polyallelic.sites <- init # mhitbp
n.nucleotides <- init # sum_sam
biallelic.compositions <- init # TCGA
biallelic.substitutions <- init # subst
minor.alleles <- init # mutations
codons <- init
sites.with.gaps <- init # gaps
sites.with.unknowns <- init
# GENOME data init
n.valid.sites <- init2
n.gaps <- init2
n.unknowns <- init2
n.polyallelic.sites <- init2
n.biallelic.sites <- init2
trans.transv.ratio <- init3
## PROGRESS #########################
if(progress_bar_switch){ ### because of parallized
progr <- progressBar()
}
#####################################
# -----------------------------#
#if(!is.list(populations)){
#allseq <- T}else{allseq <- F}
# -----------------------------#
for(xx in 1:sizeliste){ #
if(!progress_bar_switch){
print(liste[xx])
}
#print(liste[xx])
CCC <- try(PopGenread(liste[xx],format),silent=TRUE)
if(is.na(CCC$matrix[1])){next}
gen <- CCC$matrix
pos <- CCC$positions
ref <- CCC$reference
# GFF stuff ----------------------------------------------------------------------
gff.object.exists <- FALSE
FIT <- FALSE
if(GFF.BOOL){
gff.object.exists <- TRUE
# if(length(grep("GFFRObjects",gffpath))!=0){
if(SNP.DATA){
if(!is.na(gff_liste[xx])){
if(length(grep("GFFRObjects",gffpath))!=0){
Robj <- load(gff_liste[xx])
gff_object <- get(Robj[1])
}else{
gff_object <- gffRead(gff_liste[xx])
}
gff_object_fit <- fitting_gff_fast(pos,gff_object)
FIT <- TRUE
}else{gff.object.exists <- FALSE}
}else{
if(!is.na(gff_liste[xx])){
gff_object <- gffRead(gff_liste[xx])
}else{gff.object.exists <- FALSE}
}
# Parsing GFF file
if(gff.object.exists){
gff_object <- parse_gff(gff_object,SNP.DATA=SNP.DATA)
if(FIT){
gff_object_fit <- parse_gff(gff_object_fit,SNP.DATA=SNP.DATA)
GLOBAL.GFF$GFF <- NULL
}else{
gff_object_fit <- gff_object # in case of non SNP.DATA
}
}else{
gff_object <- FALSE
gff_object_fit <- FALSE
}
}else{gff_object <- FALSE;gff_object_fit <- FALSE}
# GFF stuff END -------------------------------------------------
if(is.matrix(gen)){
nsites[xx] <- dim(gen)[2]
nsites2[xx] <- dim(gen)[2] # important for SNPDATA
#------------------------------------------------------#
result <- popgen(gen,Populations=populations,outgroup=outgroup,methods=methods,include.unknown=include.unknown,gff=gff_object_fit,FAST,SNP.DATA)
#------------------------------------------------------#
rm(gen) # delete gen
}else{result <- NA;next}
# PROGRESS #######################################################
if(progress_bar_switch){ # wegen parallized
progr <- progressBar(xx,sizeliste, progr)
}
if(xx==ceiling(sizeliste/2)){gc()}
###################################################################
if(is.list(result)){ # biallelic.sites exist
# fill region.data
populationsX[[xx]] <- result$populations
populations2[[xx]] <- result$populations2
popmissing[[xx]] <- result$popmissing
outgroupX[[xx]] <- result$outgroup
outgroup2[[xx]] <- result$outgroup2
datt <- result$data.sum
CodingSNPS[[xx]] <- datt$CodingSNPS
UTRSNPS[[xx]] <- datt$UTRSNPS
IntronSNPS[[xx]] <- datt$IntronSNPS
ExonSNPS[[xx]] <- datt$ExonSNPS
GeneSNPS[[xx]] <- datt$GeneSNPS
if(GFF.BOOL & !gff.object.exists){
fillinit <- vector(,length(datt$biallelic.sites))
CodingSNPS[[xx]] <- fillinit
UTRSNPS[[xx]] <- fillinit
IntronSNPS[[xx]] <- fillinit
ExonSNPS[[xx]] <- fillinit
GeneSNPS[[xx]] <- fillinit
}
transitions[[xx]] <- datt$transitions # matrix_sv transition war eine 1
if(is.na(pos[1])){
biallelic.sites[[xx]] <- datt$biallelic.sites # matrix_pos
polyallelic.sites[[xx]] <- datt$polyallelic.sites
sites.with.gaps[[xx]] <- datt$sites.with.gaps
sites.with.unknowns[[xx]] <- datt$sites.with.unknowns
}else{
biallelic.sites2[[xx]] <- datt$biallelic.sites
biallelic.sites[[xx]] <- pos[datt$biallelic.sites]
nsites[xx] <- biallelic.sites[[xx]][length(biallelic.sites[[xx]])]
polyallelic.sites[[xx]] <- pos[datt$polyallelic.sites] # mhitbp
sites.with.gaps[[xx]] <- pos[datt$sites.with.gaps]
sites.with.unknowns[[xx]] <- pos[datt$sites.with.unknowns]
}
if(!big.data){
biallelic.matrix[[xx]] <- datt$biallelic.matrix # matrix_pol
colnames(biallelic.matrix[[xx]]) <- biallelic.sites[[xx]]
if(GFF.BOOL & gff.object.exists){
reading.frame[[xx]] <- gff_object$reading.frame
rev.strand[[xx]] <- gff_object$rev.strand
Coding.matrix[[xx]] <- gff_object$Coding
UTR.matrix[[xx]] <- gff_object$UTR
Intron.matrix[[xx]] <- gff_object$Intron
Exon.matrix[[xx]] <- gff_object$Exon
Gene.matrix[[xx]] <- gff_object$Gene
#
#Coding.info[[xx]] <- gff_object$Coding.info
#UTR.info[[xx]] <- gff_object$UTR.info
#Intron.info[[xx]] <- gff_object$Intron.info
#Exon.info[[xx]] <- gff_object$Exon.info
#Gene.info[[xx]] <- gff_object$Gene.info
}
}else{ # BIG DATA FF
biallelic.matrix[[xx]] <- ff(datt$biallelic.matrix,dim=dim(datt$biallelic.matrix)) # matrix_pol
close(biallelic.matrix[[xx]])
dimnames(biallelic.matrix[[xx]]) <- list(rownames(datt$biallelic.matrix),biallelic.sites[[xx]])
if(GFF.BOOL & gff.object.exists){
if(dim(gff_object$Coding)[1]>0){
fill <- as.matrix(gff_object$Coding)
Coding.matrix[[xx]] <- ff(fill,dim=dim(fill))
close(Coding.matrix[[xx]])
if(dim(gff_object_fit$Coding)[1]>0){
reading.frame[[xx]] <- gff_object_fit$reading.frame
rev.strand[[xx]] <- gff_object_fit$rev.strand
fill <- as.matrix(gff_object_fit$Coding)
Coding.matrix2[[xx]] <- ff(fill,dim=dim(fill)) # wegen set.synnonsyn
close(Coding.matrix2[[xx]])
}
# Coding.info[[xx]] <- ff(as.factor(gff_object$Coding.info))
}
if(dim(gff_object$UTR)[1]>0){
fill <- as.matrix(gff_object$UTR)
UTR.matrix[[xx]] <- ff(fill,dim=dim(fill))
close(UTR.matrix[[xx]])
# UTR.info[[xx]] <- ff(as.factor(gff_object$UTR.info))
}
if(dim(gff_object$Intron)[1]>0){
fill <- as.matrix(gff_object$Intron)
Intron.matrix[[xx]] <- ff(fill,dim=dim(fill))
close(Intron.matrix[[xx]])
# Intron.info[[xx]] <- ff(as.factor(gff_object$Intron.info))
}
if(dim(gff_object$Exon)[1]>0){
fill <- as.matrix(gff_object$Exon)
Exon.matrix[[xx]] <- ff(fill,dim=dim(fill))
close(Exon.matrix[[xx]])
# Exon.info[[xx]] <- ff(as.factor(gff_object$Exon.info))
}
if(dim(gff_object$Gene)[1]>0){
fill <- as.matrix(gff_object$Gene)
Gene.matrix[[xx]] <- ff(fill,dim=dim(fill))
close(Gene.matrix[[xx]])
# Gene.info[[xx]] <- ff(as.factor(gff_object$Gene.info))
}
}
}
if(!is.na(ref[1])){
reference[[xx]] <- ref[datt$biallelic.sites]
}
matrix_codonpos[[xx]] <- datt$matrix_codonpos # codonpos. of biallelics
synonymous[[xx]] <- datt$synonymous # synnonsyn
matrix_freq[[xx]] <- datt$matrix_freq
n.singletons[[xx]] <- datt$n.singletons # unic
n.nucleotides[[xx]] <- datt$n.nucleotides # sum_sam
biallelic.compositions[[xx]] <- datt$biallelic.compositions # TCGA
biallelic.substitutions[[xx]] <- datt$biallelic.substitutions# subst
minor.alleles[[xx]] <- datt$minor.alleles # mutations
codons[[xx]] <- datt$codons
# fill Genome data
n.valid.sites[xx] <- datt$n.valid.sites
n.gaps[xx] <- length(datt$sites.with.gaps)
n.unknowns[xx] <- length(datt$sites.with.unknowns)
n.polyallelic.sites[xx] <- length(datt$polyallelic.sites)
n.biallelic.sites[xx] <- length(datt$biallelic.sites)
trans.transv.ratio[xx] <- datt$trans.transv.ratio
Coding.region[xx] <- datt$Coding_region_length
UTR.region[xx] <- datt$UTR_region_length
Intron.region[xx] <- datt$Intron_region_length
Exon.region[xx] <- datt$Exon_region_length
Gene.region[xx] <- datt$Gene_region_length
}# else{warnings("No biallelic position !")}
}# End of For
# region.data
region.data@populations <- populationsX
region.data@populations2 <- populations2
region.data@popmissing <- popmissing
region.data@outgroup <- outgroupX
region.data@outgroup2 <- outgroup2
region.data@CodingSNPS <- CodingSNPS
region.data@UTRSNPS <- UTRSNPS
region.data@IntronSNPS <- IntronSNPS
region.data@ExonSNPS <- ExonSNPS
region.data@GeneSNPS <- GeneSNPS
region.data@reading.frame <- reading.frame
region.data@rev.strand <- rev.strand
region.data@Coding.matrix <- Coding.matrix
region.data@Coding.matrix2 <- Coding.matrix2
region.data@UTR.matrix <- UTR.matrix
region.data@Intron.matrix <- Intron.matrix
region.data@Exon.matrix <- Exon.matrix
region.data@Gene.matrix <- Gene.matrix
#region.data@Coding.info <- Coding.info
#region.data@UTR.info <- UTR.info
#region.data@Intron.info <- Intron.info
#region.data@Exon.info <- Exon.info
#region.data@Gene.info <- Gene.info
region.data@transitions <- transitions # matrix_sv transition war eine 1
region.data@biallelic.matrix <- biallelic.matrix# matrix_pol
region.data@biallelic.sites <- biallelic.sites # matrix_pos
region.data@biallelic.sites2 <- biallelic.sites2
region.data@reference <- reference
region.data@matrix_codonpos <- matrix_codonpos # codonpos. of biallelics
region.data@synonymous <- synonymous# synnonsyn
region.data@matrix_freq <- matrix_freq
region.data@n.singletons <- n.singletons # unic
region.data@polyallelic.sites <- polyallelic.sites # mhitbp
region.data@n.nucleotides <- n.nucleotides # sum_sam
region.data@biallelic.compositions <- biallelic.compositions # TCGA
region.data@biallelic.substitutions <- biallelic.substitutions # subst
region.data@minor.alleles <- minor.alleles # mutations
region.data@codons <- codons
region.data@sites.with.gaps <- sites.with.gaps # gaps
region.data@sites.with.unknowns <- sites.with.unknowns
region.stats@nucleotide.diversity <- init
region.stats@haplotype.diversity <- init
region.stats@haplotype.counts <- init # sfreqh
region.stats@minor.allele.freqs <- init # JFD
region.stats@biallelic.structure <- init # SXX
region.stats@linkage.disequilibrium <- init
# GENOME data
genome@big.data <- big.data
names(nsites) <- liste2
genome@n.sites <- nsites
genome@n.sites2 <- nsites2
genome@n.valid.sites <- n.valid.sites
genome@n.gaps <- n.gaps
genome@n.unknowns <- n.unknowns
genome@n.polyallelic.sites <- n.polyallelic.sites
genome@n.biallelic.sites <- n.biallelic.sites
genome@trans.transv.ratio <- trans.transv.ratio
genome@Coding.region <- Coding.region
genome@UTR.region <- UTR.region
genome@Intron.region <- Intron.region
genome@Exon.region <- Exon.region
genome@Gene.region <- Gene.region
genome@region.data <- region.data
genome@region.stats <- region.stats
genome@Pop_Neutrality$calculated <- FALSE
genome@Pop_FSTN$calculated <- FALSE
genome@Pop_FSTH$calculated <- FALSE
genome@Pop_MK$calculated <- FALSE
genome@Pop_Linkage$calculated <- FALSE
genome@Pop_Recomb$calculated <- FALSE
genome@Pop_Slide$calculated <- FALSE
genome@Pop_Detail$calculated <- FALSE
if(FAST){genome@Pop_Slide$calculated <- TRUE}
if(GFF.BOOL){genome@gff.info<-TRUE}else{genome@gff.info <- FALSE}
genome@snp.data <- SNP.DATA
cat("\n")
return(genome)
}# End of Function
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.