Nothing
# Secretome Lysat Peptide Descriptor
#
# Collection of Uniprot download functions
# download a proeteom fasta file from Uniprot-FTP-Server and save it to BasicData
# This function first checks the actual release version of Uniprot FTP Server files.
# If there is a new version, a new Release Folder will be created in BasicData.
# Than the FASTA files of the required Proteom and its additions are downloaded.
# Both are unzipped and loaded again with read.fasta from "seqinr".
# These results are stored as rds files in the BasicData folder
# Input: Domain (vector),Proteom_ID (vector),Tax_ID (vector),speciesname (vector),forcedl (logical)
# Output: char vector result contains path to directory with FASTA-rds-files
#Proteom_ID,Tax_ID,Species
#UP000005640 9606 HUMAN
#UP000000589 10090 MOUSE
#UP000002494 10116 RAT
#UP000008227 9823 PIG
#Domain: Archaea,Bacteria,Eukaryota,Viruses#
#forcedl: if old databasic/existing files should be replaced
#fasta2rds <- function(species=c("HUMAN", "MOUSE", "RAT", "PIG")){
fasta2rds <- function(species=c("HUMAN", "MOUSE", "RAT", "PIG"),
proteomeid=c("UP000005640","UP000000589","UP000002494","UP000008227"),
taxid=c("9606","10090","10116","9823"),
domain=rep("Eukaryota",4), forcedl = FALSE)
{
#wd <- paste0("bin/")
wd <- c("BasicData/")
FASTAEndingGZ <- ".fasta.gz"
FASTAEnding <- ".fasta"
#FASTAPath <- "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/proteomes/"
FASTAPath <- "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/"
#erstelle eine aktuelle version der Datenbank
#ACHTUNG: hier auch ueberpruefen ob Datei schon vorhanden
#save Uniprot Readme
#check version/ release of Uniprot Data on uniprot and in BasicData folder
cat("Check version \n")
bdown("ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/relnotes.txt",
paste0(wd, "relnotes.txt"))
version <- gsub(" ", "", readLines(paste0(wd, "relnotes.txt"), 1))
cat("\n")
if (!dir.exists(paste0(wd, version)))
{
dir.create(paste0(wd, version))
}
if (!file.exists(paste0(wd, version, "/relnotes.txt")) | forcedl)
{
file.copy(paste0(wd, "relnotes.txt"), paste0(wd, version, "/relnotes.txt"), overwrite = FALSE)
}
file.remove(paste0(wd, "relnotes.txt"))
result <- paste0(wd, version)
cat("Starting FASTA file download, extraction and RDS creation...\n")
uniprot <- data.frame(species, proteomeid, taxid, domain, stringsAsFactors = FALSE)
for (i in seq_along(uniprot$species))
{
s <- uniprot[i,"species"]
pid <- uniprot[i,"proteomeid"]
tid <- uniprot[i,"taxid"]
d <- uniprot[i,"domain"]
################################################
## Download FASTA files
################################################
FASTAsource <- paste0(FASTAPath, d, "/", pid, "_", tid, FASTAEndingGZ)
FASTAdest <- paste0(wd, version,"/", s,"_",pid, "_", tid, FASTAEndingGZ)
FASTAfile <- paste0(wd, version,"/", s,"_",pid, "_", tid, FASTAEnding)
FASTAsourceAdd <- paste0(FASTAPath, d, "/", pid, "_", tid, "_additional", FASTAEndingGZ)
FASTAdestAdd <- paste0(wd, version,"/", s,"_",pid, "_", tid, "_additional", FASTAEndingGZ)
FASTAfileAdd <- paste0(wd, version,"/", s,"_",pid, "_", tid, "_additional", FASTAEnding)
FASTArds <- paste0(wd, version, "/", "FASTA", tolower(s), ".rds")
#print(bla)
cat(paste("Start download check:", s, "\n"))
if(!file.exists(FASTAfile) | forcedl )
{
ret <- bdown(FASTAsource, FASTAdest)
cat("\n")
cat("Extracting:", s, "\n")
gunzip(FASTAdest, remove=TRUE, overwrite=TRUE)
}
else
{
cat(FASTAfile, " is up to date. \n")
}
if(!file.exists(FASTAfileAdd) | forcedl )
{
retadd <- bdown(FASTAsourceAdd, FASTAdestAdd)
cat("\n")
cat("Extracting:", s, "\n")
gunzip(FASTAdestAdd, remove=TRUE, overwrite=TRUE)
}
else
{
cat(FASTAfileAdd, " is up to date. \n")
}
if (!file.exists(FASTArds) | forcedl )
{
cat("Reading FASTA file for", s, "\n")
###and save as RDS files ##
FASTAopen <- read.fasta(FASTAfile, as.string=TRUE, seqtype="AA", )
FASTAopenAdd <- read.fasta(FASTAfileAdd, as.string=TRUE, seqtype="AA", )
#zusammenfuegen
FASTAopenTg <- c(FASTAopen,FASTAopenAdd)
cat("Saving RDS file for", s, "\n")
saveRDS(FASTAopenTg, file=FASTArds)
}
else
{
cat(FASTArds, " is up to date. \n")
}
######################################################
## Create and save index, accessions, protein names ##
######################################################
if (!file.exists(paste0(wd,version,"/", s,".ACC.RDS"))| forcedl)
{
actProteinAcc <- {}
actProtein <- {}
actFASTAindex <- {}
actFASTA <- FASTAopenTg
index <- seq(1:length(actFASTA))
lenFASTA <- length(actFASTA)
cat("Creating Accession table for ", s, " #", length(actFASTA), "\n")
barWidth <- (getOption("width")-20)
pb <- txtProgressBar(style=3, , width=barWidth)
m <- 0
actProtein <- character(lenFASTA)
actProteinAcc <- character(lenFASTA)
for (y in 1:lenFASTA){
actProtein[y] <- getAnnot(actFASTA[[y]], as.string = TRUE)
actProtein[y] <- as.character(actProtein[y])
actProteinAcc[y] <- strsplit(actProtein[y], "|", fixed=TRUE)[[1]][2]
progress <- (y / lenFASTA)
setTxtProgressBar(pb, progress)
}
cat("\nSaving result...")
actFASTAindex <- data.table(index, actProteinAcc, actProtein)
saveRDS(actFASTAindex, paste0(wd,version,"/", s,".ACC.RDS"))
close(pb)
cat(paste0("Finished Accession table for ", s, "\n"))
cat("______________________________________________\n\n")
}
else
{
cat(paste0(wd,version,"/", s,".ACC.RDS"), " is up to date. \n")
}
}
cat("Finished process for\n")
cat(species, "\n")
cat("______________________________________________\n\n")
return(result)
}
#localfasta2rds allows to use a local fasta file.
# Input: character vector pathlf (path to local fasta file)
# character vector version
# Output: char vector result contains path to directory with FASTA-rds-files
localfasta2rds <- function(pathlf, version, org)
{
if ( !file.exists(pathlf) )
{
cat("localfasta2rds ERROR: There is no file called:\n",pathlf,"\n")
result <- "ERROR"
return(result)
}
pathsplit <- unlist(strsplit(pathlf,"/"))
fastaname <- pathsplit[length(pathsplit)]
if(version=="actual")
{
cat("Check version \n")
bdown("ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/relnotes.txt",
paste0("BasicData/", "relnotes.txt"))
actversion <- gsub(" ", "", readLines(paste0("BasicData/", "relnotes.txt"), 1))
cat("\n")
newdir <- paste0("BasicData/", actversion, "_", fastaname, "_", org)
if (dir.exists(newdir))
{
cat("localfasta2rds ERROR: there is already a directory:\n", newdir,"\n")
result <- "ERROR"
return(result)
}
dir.create(newdir)
file.copy(pathlf, paste0(newdir, "/"))
} else if (dir.exists(paste0("BasicData/", version))) {
newdir <- paste0("BasicData/",version,"_",fastaname,"_",org)
if (dir.exists(newdir))
{
cat("localfasta2rds ERROR: there is already a directory:\n", newdir,"\n")
result <- "ERROR"
return(result)
}
dir.create(newdir)
file.copy(pathlf, paste0( newdir, "/"))
#version was not correct
} else {
cat("localfasta2rds ERROR: no directory matches to the given version \n")
result <- "ERROR"
return(result)
}
# save fasta in RDS
FASTAopen <- read.fasta(pathlf, as.string = TRUE, seqtype = "AA", )
cat("localfasta2rds: Saving FASTA-RDS file for",fastaname , "\n")
FASTArds <- paste0(newdir, "/FASTA", tolower(org), ".rds")
saveRDS(FASTAopen, file = FASTArds)
#create index
if (!file.exists(paste0(newdir, "/", toupper(org),".ACC.RDS")))
{
actProteinAcc <- {}
actProtein <- {}
actFASTAindex <- {}
actFASTA <- FASTAopen
index <- seq(1:length(actFASTA))
lenFASTA <- length(actFASTA)
cat("Creating Accession table for ", fastaname, " #", length(actFASTA), "\n")
barWidth <- (getOption("width")-20)
pb <- txtProgressBar(style=3, , width=barWidth)
m <- 0
actProtein <- character(lenFASTA)
actProteinAcc <- character(lenFASTA)
for (y in 1:lenFASTA){
actProtein[y] <- getAnnot(actFASTA[[y]], as.string = TRUE)
actProtein[y] <- as.character(actProtein[y])
actProteinAcc[y] <- strsplit(actProtein[y], "|", fixed=TRUE)[[1]][2]
progress <- (y / lenFASTA)
setTxtProgressBar(pb, progress)
}
cat("\nSaving result...")
actFASTAindex <- data.table(index, actProteinAcc, actProtein)
saveRDS(actFASTAindex, paste0(newdir,"/", toupper(org),".ACC.RDS"))
close(pb)
cat(paste0("Finished Accession table for ", fastaname, "\n"))
cat("______________________________________________\n\n")
}
else
{
cat(paste0(newdir,"/", toupper(org),".ACC.RDS"), " is up to date. \n")
}
result <- newdir
return(result)
}
#########################################
## Download function with progress bar ##
#########################################
bdown <- function(url, file)
{
h <- basicTextGatherer()
curlPerform(url=url, customrequest='HEAD', header=0L, nobody=1L, headerfunction=h$update)
if(grepl('Content-Length: ', h$value()))
{
#print(paste0("bdown Ergebnis Abfrgae Contenlength", grepl("Content-Length:",h$value())))
size <- as.numeric(strsplit(strsplit(h$value(),'Content-Length: ')[[1]][2],"\r")[[1]][1])
}
else
{
size <- 1
}
#size <-2000
f <- CFILE(file, mode="wb")
#print(paste0("bdown size: ", size))
if (size > 1000){
barWidth <- (getOption("width")-20)
bar <- txtProgressBar(0, size, style=3, , width=barWidth)
a <- curlPerform(url=url, noprogress=0L,
writedata = f@ref,
progressfunction=function(down,up)
setTxtProgressBar(bar, down[2]))
}else{
a <- curlPerform(url=url, noprogress=0L,
writedata = f@ref,
progressfunction=function(down,up)
cat("Downloading ", format(down[2], big.mark=",")," bytes\r"))
}
close(f)
return(a)
}
#-----------------------------------------------------------------------------------------------------------
# gff download
#------------------
## Saving imported GFF-files to RDS
# this function downloads species specific gff files from uniprot and saves them as rds in
# the BasicData folder for this release.
# Input: char vector speciesName, logical vector forcedl
# forcedl indicates if the files should be downloaded even if there is already a file with this release
# Output: char vector result contains path to directory with gff-rds-files
gff2rds <- function(speciesName = c("HUMAN", "MOUSE", "RAT", "PIG"), forcedl)
{
GFFEnding <- ".gff"
wd <- "BasicData/"
GFFEndingGZ <- ".gff.gz"
GFFPath <- "https://www.uniprot.org/uniprot/?format=gff&compress=yes&query=organism:"
#erstelle eine aktuelle Version der Datenbank
#ACHTUNG: hier auch ueberpruefen ob Datei schon vorhanden
#save Uniprot Readme
# Idee speichere die gffs passend zu den Aktualisierungen der Proteome
# check version/ release of Uniprot Data on uniprot and in BasicData folder
cat("Check version \n")
bdown("ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/relnotes.txt",
paste0(wd, "relnotes.txt"))
version <- gsub(" ", "", readLines(paste0(wd, "relnotes.txt"), 1))
cat("\n")
if (!dir.exists(paste0(wd, version)))
{
dir.create(paste0(wd, version))
}
if (!file.exists(paste0(wd, version, "/relnotes.txt")) | forcedl)
{
file.copy(paste0(wd, "relnotes.txt"), paste0(wd, version, "/relnotes.txt"), overwrite = FALSE)
}
file.remove(paste0(wd, "relnotes.txt"))
result <- paste0(wd, version)
cat("Starting GFF file download, extraction and RDS creation...\n")
for (s in speciesName){
##############################################
## Download GFF files and save as RDS files ##
##############################################
GFFsource <- paste0(GFFPath, s)
GFFdest <- paste0(wd, version, "/", s, GFFEndingGZ)
GFFfile <- paste0(wd, version, "/", s, GFFEnding)
GFFrds <- paste0(wd, version, "/GFF", tolower(s), ".rds")
if(!file.exists(GFFfile) | forcedl )
{
cat("Downloading:", s, "\n")
ret <- bdown(GFFsource, GFFdest)
cat("\n")
cat("- Finished downloading ", s, "\n")
cat("Extracting:", s, "\n")
gunzip(GFFdest, remove=TRUE, overwrite=TRUE)
}
else
{
cat(GFFfile, " already exists. \n")
}
if(!file.exists(GFFrds) | forcedl )
{
cat("- Using GFF-file for species: ", s, "\n")
gfffile <- paste0(wd, version, "/", s,".gff")
gff <- gffRead(gfffile)
#names(gff)
#[1] "seqname" "source" "feature" "start" "end" "score" "strand" "frame" "attributes" NA
cat("- Extracting attributes from GFF-file (this will take a while):\n")
cat(" Notes...\n")
gff$Note <- getAttributeField(gff$attributes, "Note")
cat(" IDs...\n")
gff$ID <- getAttributeField(gff$attributes, "ID")
gff <- (as.data.table(gff))
cat("- GFF-parsing finished", "\n")
cat("- Saving GFF data table as RDS file for", s, "\n")
#GFFrds <- paste0(wd, version, "/GFF", tolower(s), ".rds")
saveRDS(gff, file=GFFrds)
cat("- Finished GFF.RDS generation for ", s, "\n")
cat("___________________________________________\n")
}
else
{
cat(GFFrds, " already exists. \n")
}
}
cat("- Finished GFF.RDS generation for ", "\n")
cat(speciesName, "\n")
cat("___________________________________________\n")
return(result)
}
#-----------------------------------------------------------
## Saving imported GFF-files to RDS
# this function downloads species specific gff files from uniprot and saves them as rds in
# the BasicData folder for this release.
# Input: char vector pathdir
# char vector version
# char vector org
#
# Output: char vector result contains path to directory with gff-rds-files
localgff2rds <- function(pathdir, version, org)
{
# check uniprot version/ release of Uniprot Data on uniprot and in BasicData folder
cat("localgff2rds: Check version \n")
bdown("ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/relnotes.txt",
paste0("BasicData/", "relnotes.txt"))
actversion <- gsub(" ", "", readLines(paste0("BasicData/", "relnotes.txt"), 1))
cat("\n")
if (version == "actual")
{
# there is already a directory
if (dir.exists(paste0("BasicData/", actversion)))
{
file.copy(paste0("BasicData/", actversion,"/GFF",tolower(org),".rds"), paste0(pathdir,"/"))
file.copy(paste0("BasicData/", actversion,"/",toupper(org),".gff"), paste0(pathdir,"/"))
#there is no directory but there should be a newly made one from localfasta2rds
} else if (dir.exists(pathdir)){
# run download
##############################################
## Download GFF files and save as RDS files ##
##############################################
GFFEnding <- ".gff"
GFFEndingGZ <- ".gff.gz"
GFFPath <- "http://www.uniprot.org/uniprot/?format=gff&compress=yes&query=organism:"
GFFsource <- paste0(GFFPath, toupper(org))
GFFdest <- paste0(pathdir, "/", org, GFFEndingGZ)
GFFfile <- paste0(pathdir, "/", toupper(org), GFFEnding)
GFFrds <- paste0(pathdir, "/GFF", tolower(org), ".rds")
if(!file.exists(GFFfile) )
{
cat("localgff2rds: Downloading:", org, "\n")
ret <- bdown(GFFsource, GFFdest)
cat("\n")
cat("localgff2rds: Finished downloading ", org, "\n")
cat("localgff2rds: Extracting:", org, "\n")
gunzip(GFFdest, remove = TRUE, overwrite = TRUE)
}
else
{
cat("localgff2rds: ",GFFfile, " already exists. \n")
}
if(!file.exists(GFFrds))
{
cat("localgff2rds: Using GFF-file for species: ", org, "\n")
#gfffile <- paste0(wd, version, "/", s,".gff")
#gff <- gffRead(gfffile)
gff <- gffRead(GFFfile)
#names(gff)
#[1] "seqname" "source" "feature" "start" "end" "score" "strand" "frame" "attributes" NA
cat("- Extracting attributes from GFF-file (this will take a while):\n")
cat(" Notes...\n")
gff$Note <- getAttributeField(gff$attributes, "Note")
cat(" IDs...\n")
gff$ID <- getAttributeField(gff$attributes, "ID")
gff <- (as.data.table(gff))
cat("- GFF-parsing finished", "\n")
cat("- Saving GFF data table as RDS file for", org, "\n")
#GFFrds <- paste0(wd, version, "/GFF", tolower(s), ".rds")
saveRDS(gff, file=GFFrds)
cat("- Finished GFF.RDS generation for ", org, "\n")
cat("___________________________________________\n")
}
else
{
cat(GFFrds, " already exists. \n")
}
} else {
#none of the given directories exist
cat("localgff2rds ERROR: there is no pathdir with the given name\n")
return("ERROR")
}
# there is an archive file for this version
} else if (dir.exists(paste0("BasicData/", version))){
file.copy(paste0("BasicData/", version,"/GFF",tolower(org),".rds"), paste0(pathdir,"/"))
file.copy(paste0("BasicData/", version,"/",toupper(org),".gff"), paste0(pathdir,"/"))
} else {
cat("localgff2rds ERROR: there is no archive with the given name\n")
return("ERROR")
}
cat("- Finished GFF.RDS generation for ", org, "\n")
result <- pathdir
return(result)
}
############################################################
# get values from a choosen attribute from gff
getAttributeField <- function (x, field, attrsep = ";")
{
s = strsplit(x, split = attrsep, fixed = TRUE)
sapply(s, function(atts)
{
a = strsplit(atts, split = "=", fixed = TRUE)
m = match(field, sapply(a, "[", 1))
if (!is.na(m))
{
rv = a[[m]][2]
}
else
{
rv = as.character(NA)
}
return(rv)
}
)
}
###############################################
gffRead <- function(gffFile, nrows = -1)
{
cat("Reading ", gffFile, ": ", sep="")
gff = read.table(gffFile, sep="\t", as.is=TRUE, quote="",
header=FALSE, comment.char="#", nrows = nrows,
colClasses=c("character", "character", "character", "integer",
"integer",
"character", "character", "character", "character"))
# es entsteht eine leere Spalte am Ende der Tabelle, da im gff ein \t am Ende steht
# falls diese vorhanden ist loesche diese
if (sum(gff[, length(gff)] %in% "") == length(gff[, length(gff)]))
{
gff[, length(gff)] <- NULL
}
colnames(gff) = c("seqname", "source", "feature", "start", "end",
"score", "strand", "frame", "attributes")
cat("found", nrow(gff), "rows with classes:",
paste(sapply(gff, class), collapse=", "), "\n")
stopifnot(!any(is.na(gff$start)), !any(is.na(gff$end)))
return(gff)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.