Rowley et al., 2011; Bray et al., 2013; Kissopoulou et al., 2013; Londin et al., 2014; Lefrançais et al., 2017)

Folder organizations

fetch GEO/SRA accession

# Example invocation: each argument is passed by name from a variable of the
# same name, so all of these variables must already exist when this runs.
quantify_salmon(
  rtype = rtype,
  files = files,
  salmondir = salmondir,
  smp = smp,
  salmonbin = salmonbin,
  libtype = libtype,
  salmonindex = salmonindex,
  bias = bias
)
#' Quantify one sample with Salmon, skipping samples that were already run
#'
#' Builds a `salmon quant` command line and executes it through `bash -c`
#' via `system()`. Read files are streamed with process substitution
#' (`<(cat ...)`), so several files for one sample (or one mate) are
#' concatenated on the fly. Nothing is executed when
#' `<salmondir>/<smp>/aux_info/meta_info.json` or
#' `<salmondir>/<smp>/aux/meta_info.json` already exists.
#'
#' @param rtype Read layout, either `"single"` or `"paired"`.
#' @param files For `"single"`: character vector of read files. For
#'   `"paired"`: a list with elements `f1` and `f2` (character vectors of
#'   first/second mate files).
#' @param salmondir Parent output directory; results go to
#'   `<salmondir>/<smp>`.
#' @param smp Sample name, used as output sub-directory and in messages.
#' @param salmonbin Salmon executable (name or path).
#' @param libtype Value passed to `salmon quant -l`.
#' @param salmonindex Value passed to `salmon quant -i`.
#' @param bias If `TRUE`, `--seqBias` is appended to the command.
#' @param threads Value passed to `salmon quant -p`; defaults to 10, the
#'   value that used to be hard-coded.
#' @return Invisibly, the value returned by `system()`, or `NULL` when the
#'   sample was skipped.
quantify_salmon <- function(rtype, files, salmondir, smp, salmonbin,
                            libtype, salmonindex, bias = FALSE,
                            threads = 10) {
  # Fail loudly on an unknown layout instead of silently doing nothing.
  if (!is.character(rtype) || length(rtype) != 1L ||
      !rtype %in% c("single", "paired")) {
    stop("'rtype' must be either \"single\" or \"paired\"", call. = FALSE)
  }

  outdir <- paste0(salmondir, "/", smp)
  done_markers <- c(paste0(outdir, "/aux_info/meta_info.json"),
                    paste0(outdir, "/aux/meta_info.json"))
  if (any(file.exists(done_markers))) {
    message("Salmon has already been run for ", smp)
    return(invisible(NULL))
  }

  # `bias` is a scalar flag: plain if/else instead of vectorised ifelse().
  bias_flag <- if (isTRUE(bias)) "--seqBias" else ""
  # sprintf() is vectorised; joining the files keeps the result a single
  # command string, with all files handed to one `cat`.
  join_files <- function(x) paste(x, collapse = " ")
  n_threads <- as.integer(threads)

  # NOTE(review): paths are interpolated unquoted into a single-quoted
  # `bash -c` string, so paths containing spaces or quotes will break.
  if (rtype == "single") {
    salmon <- sprintf(
      "bash -c '%s quant -p %d -l %s -i %s -r <(cat %s) -o %s %s'",
      salmonbin,
      n_threads,
      libtype,
      salmonindex,
      join_files(files),
      outdir,
      bias_flag
    )
  } else {
    salmon <- sprintf(
      "bash -c '%s quant -p %d -l %s -i %s -1 <(cat %s) -2 <(cat %s) -o %s %s'",
      salmonbin,
      n_threads,
      libtype,
      salmonindex,
      join_files(files$f1),
      join_files(files$f2),
      outdir,
      bias_flag
    )
  }
  invisible(system(salmon))
}

SRA535016 GSE94384

# Load the runs-only accession snapshot (must expose an `Alias` column,
# which is searched below).
mydf <- data.table::fread(
  "_db/SRA_Accessions_RUNSonly_SNAPSHOT_2018-01-31.tab"
)

# Fetch the GEO series, then every entry named by the row names of its
# phenoData, and collect the `geo_accession` stored in each entry's header.
mygeo_gse <- getGEO("GSE94384")[[1]]
mygeo_runs <- rownames(phenoData(mygeo_gse))
mygeo_runslist <- lapply(mygeo_runs, getGEO)
mygeo_allGSM <- unlist(
  lapply(mygeo_runslist, function(gsm) gsm@header$geo_accession)
)

# For each accession keep the table rows whose Alias matches it (regex
# match via grep), then stack the per-accession tables into one overview.
mygeo_allSRR <- lapply(
  mygeo_allGSM,
  function(gsm_id) mydf[grep(gsm_id, mydf$Alias), ]
)
mygeo_SRAoverview <- do.call(rbind, mygeo_allSRR)

# Reverse checks via SRAdb: query the local SRAdb SQLite file for the
# accessions collected in `mygeo_SRAoverview`.
library(SRAdb)
sqlfile <- "_db/SRAmetadb.sqlite"
sra_con <- dbConnect(SQLite(), sqlfile)

myohmy <- getFASTQinfo(
  mygeo_SRAoverview$Accession,
  sra_con,
  srcType = "ftp"
)
myohmy2 <- getSRAinfo(mygeo_SRAoverview$Accession, sra_con)

# Fetch the fastq files of the runs reported above into `_fastq`.
getSRAfile(
  myohmy$run,
  sra_con,
  fileType = "fastq",
  destDir = "_fastq"
)

# Spot check: rows of the accession table whose Alias matches one sample id.
mydf[grep("GSM2474455", mydf$Alias), ]
wget -c ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/SRR522/SRR5223045/SRR5223045.sra
wget -c ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/SRR522/SRR5223046/SRR5223046.sra
wget -c ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/SRR522/SRR5223047/SRR5223047.sra
wget -c ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/SRR522/SRR5223048/SRR5223048.sra
wget -c ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/SRR522/SRR5223049/SRR5223049.sra
wget -c ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/SRR522/SRR5223050/SRR5223050.sra
wget -c ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/SRR522/SRR5223051/SRR5223051.sra
wget -c ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/SRR522/SRR5223052/SRR5223052.sra
wget -c ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/SRR522/SRR5223053/SRR5223053.sra
wget -c ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/SRR522/SRR5223054/SRR5223054.sra
wget -c ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/SRR522/SRR5223055/SRR5223055.sra
wget -c ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/SRR522/SRR5223056/SRR5223056.sra
wget -c ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/SRR522/SRR5223057/SRR5223057.sra
wget -c ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/SRR522/SRR5223058/SRR5223058.sra

wget -c ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR522/006/SRR5223046/SRR5223046.fastq.gz
wget -c ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR522/007/SRR5223047/SRR5223047.fastq.gz
wget -c ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR522/008/SRR5223048/SRR5223048.fastq.gz
wget -c ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR522/009/SRR5223049/SRR5223049.fastq.gz
wget -c ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR522/000/SRR5223050/SRR5223050.fastq.gz
wget -c ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR522/001/SRR5223051/SRR5223051.fastq.gz
wget -c ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR522/002/SRR5223052/SRR5223052.fastq.gz
wget -c ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR522/003/SRR5223053/SRR5223053.fastq.gz
wget -c ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR522/004/SRR5223054/SRR5223054.fastq.gz
wget -c ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR522/005/SRR5223055/SRR5223055.fastq.gz
wget -c ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR522/006/SRR5223056/SRR5223056.fastq.gz
wget -c ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR522/007/SRR5223057/SRR5223057.fastq.gz
wget -c ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR522/008/SRR5223058/SRR5223058.fastq.gz
GSE74246 - https://www.ncbi.nlm.nih.gov//geo/query/acc.cgi?acc=GSE74246

Corces MR, Buenrostro JD, Wu B, Greenside PG et al. Lineage-specific and single-cell chromatin accessibility charts human hematopoiesis and leukemia evolution. Nat Genet 2016 Oct;48(10):1193-203. PMID: 27526324








SRA062032
Bray et al., 2013 - The complex transcriptional landscape of the anucleate human platelet

# Look up submission SRA062032 in SRAdb (sra and fastq records), then fetch
# the fastq files of the reported runs into `_fastq`.
rs <- getSRAinfo("SRA062032", sra_con, sraType = "sra")
rs2 <- getFASTQinfo("SRA062032", sra_con)
getSRAfile(
  rs2$run,
  sra_con,
  fileType = "fastq",
  destDir = "_fastq"
)

wget -c ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/SRR627/SRR627520/SRR627520.sra
wget -c ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/SRR628/SRR628222/SRR628222.sra
wget -c ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/SRR628/SRR628233/SRR628233.sra
wget -c ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/SRR628/SRR628234/SRR628234.sra
wget -c ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/SRR628/SRR628235/SRR628235.sra
wget -c ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/SRR628/SRR628237/SRR628237.sra
wget -c ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/SRR628/SRR628238/SRR628238.sra
wget -c ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/SRR628/SRR628239/SRR628239.sra
wget -c ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/SRR628/SRR628240/SRR628240.sra
wget -c ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/SRR628/SRR628241/SRR628241.sra
wget -c ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/SRR628/SRR628243/SRR628243.sra
wget -c ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/SRR628/SRR628244/SRR628244.sra


1  ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR627/SRR627520/SRR627520.fastq.gz
2  ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR628/SRR628222/SRR628222.fastq.gz
3  ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR628/SRR628233/SRR628233.fastq.gz
4  ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR628/SRR628234/SRR628234.fastq.gz
5  ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR628/SRR628235/SRR628235.fastq.gz
6  ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR628/SRR628237/SRR628237.fastq.gz
7  ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR628/SRR628238/SRR628238.fastq.gz
8  ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR628/SRR628239/SRR628239.fastq.gz
9  ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR628/SRR628240/SRR628240.fastq.gz
10 ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR628/SRR628241/SRR628241.fastq.gz
11 ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR628/SRR628243/SRR628243.fastq.gz
12 ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR628/SRR628244/SRR628244.fastq.gz

-> add this too?
  https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE50858









# Best et al., 2015
RNA-seq of tumor-educated platelets enables blood-based pan-cancer, multiclass and molecular pathway cancer diagnostics
Identifiers: SRA: SRP057500
BioProject: PRJNA281708
GEO: GSE68086
SRA260904






# Best et al., 2017
GEO: GSE89843
SRP093349
Study Type: Transcriptome Analysis
Submission: SRA493919

# Query SRAdb for study SRP093349 (needs the open `sra_con` connection).
getSRAinfo("SRP093349", sra_con)

















https://www.thieme-connect.com/products/ejournals/pdf/10.1160/TH16-11-0873.pdf

# data not provided!














http://heart.bmj.com/content/101/Suppl_4/A102.2









# probably to REDO

?????
https://trace.ncbi.nlm.nih.gov/Traces/sra?study=SRP092389
We here systematically studied the interaction network of bone marrow cells. To this end, we micro-dissected many small interacting structures (cell doublets, triplets etc.) into single cells, and sequenced their mRNAs, to infer cell identity. After grouping the cells into cell types (based on the single-cell transcriptomes), we identified actual physical interactions that occurred more, or less, than what would be expected by chance. Overall design: After mild dissociation of the bone marrow or fetal liver, we micro-dissected many small interacting structures (cell doublets, triplets etc.) into single cells, and sequenced their mRNAs. We also sorted single bone marrow cells. Finally we sorted 2000 or single neutrophils (Gr1+ CD115-) from control or megakaryocyte depleted mice.

# Query SRAdb for study SRP092389: fastq records (ftp source), sra records,
# and fastq records with the default source.
myohmy <- getFASTQinfo("SRP092389", sra_con, srcType = "ftp")
getSRAinfo("SRP092389", sra_con, sraType = "sra")
getFASTQinfo("SRP092389", sra_con)











# chipseq?

https://www.ncbi.nlm.nih.gov//geo/query/acc.cgi?acc=GSE42249

https://www.ncbi.nlm.nih.gov//geo/query/acc.cgi?acc=GSE24674





https://platelets.group.cam.ac.uk/publications ?





http://www.bloodjournal.org/content/121/19/3908.long?sso-checked=true


http://www.bloodjournal.org/content/118/7/1903.long


https://www.sciencedirect.com/science/article/pii/S1873506115000665?via%3Dihub
https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE60941



http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0183042
Data Availability: Data have been deposited at Gene Expression Omnibus (GEO) under accession number GSE94292. -> microarray!
# as in https://www.biostars.org/p/244150/
# Download the SRA accession table into `_db/`, stamping the file name with
# today's date.
accessions_SRA_file <-
  "ftp://ftp.ncbi.nlm.nih.gov/sra/reports/Metadata/SRA_Accessions.tab"

download.file(
  url = accessions_SRA_file,
  destfile = paste0("_db/SRA_Accessions_SNAPSHOT_", Sys.Date(), ".tab")
)

# accessions_SRA <- readr::read_tsv(accessions_SRA_file)


# (shell, not R) keep only the lines of the 2018-01-31 snapshot that start
# with SRR and also contain GSM, and write them to the runs-only table
grep '^SRR' _db/SRA_Accessions_SNAPSHOT_2018-01-31.tab | grep 'GSM' > _db/SRA_Accessions_RUNSonly_SNAPSHOT_2018-01-31.tab

# insert header as first line
# Accession       Submission      Status  Updated Published       Received        Type    Center  Visibility      Alias   Experiment      Sample  Study   Loaded  Spots   Bases   Md5sum  BioSample     BioProject      ReplacedBy
# the header is taken from the first line of the full snapshot; `sed -i`
# edits the runs-only file in place and `1i\` inserts before its line 1
sed -i "1i\
$(head -n 1 _db/SRA_Accessions_SNAPSHOT_2018-01-31.tab)" _db/SRA_Accessions_RUNSonly_SNAPSHOT_2018-01-31.tab



# Read the runs-only accession table (the file written by the grep/sed shell
# commands that reference the same path) into `mydf`.
mydf <- data.table::fread("_db/SRA_Accessions_RUNSonly_SNAPSHOT_2018-01-31.tab")
# Left disabled: would overwrite the column names with the field names of the
# "SRAaccs" table; `sraacc_con` must be an open connection for this to work.
# colnames(mydf) <- dbListFields(sraacc_con,name = "SRAaccs")



# to be done once
# Convert the full accession snapshot into an SQLite database with a single
# table, "SRAaccs". The snapshot lives under `_db/` like every other file in
# this workflow, and the database name is derived from the snapshot name
# (not from today's date) so the "use it" step below opens the same file no
# matter when it is run.
accession_tab_file <- "_db/SRA_Accessions_SNAPSHOT_2018-01-31.tab"
accession_db_file <- sub("\\.tab$", ".sqlite", accession_tab_file)

mydf <- data.table::fread(accession_tab_file)
library(DBI)
mySRAaccessiondb <- dbConnect(RSQLite::SQLite(), accession_db_file)
dbWriteTable(mySRAaccessiondb, "SRAaccs", mydf)
# Close the writer connection once the table has been written.
dbDisconnect(mySRAaccessiondb)

# to use it:
# RSQLite::SQLite() is namespaced so this works without RSQLite attached.
sraacc_con <- dbConnect(RSQLite::SQLite(), accession_db_file)
dbListTables(sraacc_con)

dbListFields(sraacc_con, name = "SRAaccs")

# Parameterised query: rows whose Alias is exactly the given sample id.
dbGetQuery(sraacc_con, 'SELECT * FROM SRAaccs WHERE "Alias" == :x',
  params = list(x = "GSM2474455"))
# Open the SRAdb metadata database, downloading it on first use.
library(SRAdb)
sqlfile <- "_db/SRAmetadb.sqlite"
if (!file.exists(sqlfile)) {
  # Download into `_db/` (not the working directory) so the existence check
  # above succeeds on later runs; plain `<-` because this is top level.
  dir.create("_db", showWarnings = FALSE)
  sqlfile <- getSRAdbFile(destdir = "_db")
}
sra_con <- dbConnect(SQLite(), sqlfile)

# Example queries: fastq records (ftp source) for runs and for an experiment.
getFASTQinfo(c("SRR000648", "SRR000657"), sra_con, srcType = "ftp")
getFASTQinfo(c("SRX2532222"), sra_con, srcType = "ftp")
getFASTQinfo(c("SRR343051"), sra_con, srcType = "ftp")

# Example downloads: fastq with the default destination, sra into `_sra`.
getSRAfile(c("SRR343051"), sra_con, fileType = "fastq")
getSRAfile(c("SRR343051"), sra_con, fileType = "sra", destDir = "_sra")

sraFiles <- getFASTQinfo(c("SRR343051"), sra_con, srcType = "ftp")

# Download both mates of every fastq URL listed in `sraFiles$ftp`, deriving
# the `_1` / `_2` file names from the single `.fastq.gz` URL.
method <- "curl"
fnames <- sraFiles$ftp

fileinfo <- NULL

# `destDir` was never defined for this loop; fall back to the `_fastq`
# folder used by the getSRAfile() calls unless the caller already set one.
if (!exists("destDir")) {
  destDir <- "_fastq"
}
dir.create(destDir, showWarnings = FALSE, recursive = TRUE)

for (i in fnames) {
  # fixed = TRUE: match the literal suffix, not "." as a regex wildcard.
  f_reads <- gsub(".fastq.gz", "_1.fastq.gz", i, fixed = TRUE)
  r_reads <- gsub(".fastq.gz", "_2.fastq.gz", i, fixed = TRUE)
  download.file(f_reads, destfile = file.path(destDir, basename(f_reads)),
                method = method)
  download.file(r_reads, destfile = file.path(destDir, basename(r_reads)),
                method = method)
}


#' Build ENA FTP fastq URLs from run accessions
#'
#' For every accession in `df$run` the URL is
#' `ftp://ftp.sra.ebi.ac.uk/vol1/fastq/<first 6 chars>/<run>/<run>.fastq.gz`
#' when the accession has fewer than 10 characters. Otherwise an extra
#' directory level is inserted before `<run>`: the characters from position
#' 10 onwards, left-padded with zeros (e.g. `006` for `SRR5223046`).
#' Only the single-file name `<run>.fastq.gz` is generated; `_1` / `_2`
#' mate files are not handled here.
#'
#' @param df A data frame (or list) with a `run` element holding run
#'   accessions.
#' @return A character vector of URLs, one per run, in input order;
#'   `character(0)` for empty input and `NA` for missing accessions.
get.fastq.urls <- function(df) {
  runs <- as.character(df$run)
  # Empty input yields an empty result (the old `1:length()` loop ran once
  # on a non-existent element).
  if (length(runs) == 0L) {
    return(character(0))
  }

  base_url <- "ftp://ftp.sra.ebi.ac.uk/vol1/fastq"
  filenames <- paste0(runs, ".fastq.gz")
  prefix <- substring(runs, 1, 6)
  n_chars <- nchar(runs)

  # Extra directory level for long accessions: trailing characters,
  # zero-padded. pmax() avoids a negative repeat count for very long ids.
  subdir <- paste0(strrep("0", pmax(0, 12 - n_chars)),
                   substring(runs, 10, n_chars))

  urls <- file.path(base_url, prefix, subdir, runs, filenames)
  short_idx <- which(n_chars < 10)
  urls[short_idx] <- file.path(base_url, prefix, runs, filenames)[short_idx]
  urls[is.na(runs)] <- NA_character_
  urls
}
# Search SRAdb for "platelets", requesting run and study output types.
rs <- getSRA(
  search_terms = "platelets",
  out_types = c("run", "study"),
  sra_con
)

retrieve info programmatically



federicomarini/plateletopedia documentation built on Dec. 20, 2021, 7:49 a.m.