dev/step_0_ref_building.R

library(tidyverse)
library(magrittr)
library(foreach)
library(doParallel)
registerDoParallel()
source("https://gist.githubusercontent.com/stemangiola/90a528038b8c52b21f9cfa6bb186d583/raw/cdf04a5988ab44c10d8a05fa46f1e175d526b2de/tidyTranscriptionTools.R")
library(ttBulk)

forget_FANTOM5 = function(){

  onto = ontologyIndex::get_ontology(
    "~/PhD/deconvolution/FANTOM5/ff-phase2-140729.obo.txt",
    propagate_relationships = "is_a",
    extract_tags = "minimal"
  )

  reads = read_delim(
    "~/PhD/deconvolution/FANTOM5/hg19.cage_peak_phase1and2combined_counts_ann.osc.txt.uncommented",
    "\t",
    escape_double = FALSE,
    trim_ws = TRUE
  ) %>%
    filter(grepl("p1@", short_description)) %>%
    gather(sample, `count`, -c(1:7)) %>%
    separate(sample,  c("dummy", "info", "cnhs", "onto_link"), sep="\\.", remove = F) %>%
    separate(info, sprintf("info_%s", 1:20), sep="%20|%2c", remove = F) %>%

    # filter for non induced
    filter(
      !grepl("day[0-9]+|[0-9]+hr", info) |
        grepl("00hr00min|day00", info)
    ) %>%

    # Get gene symbol
    separate(short_description, c("dummy", "symbol"), sep="@", remove = F)

  # reads  %>%
  #   left_join(
  #
  #     (.) %>%
  #       distinct(onto_link) %>%
  #       rowwise() %>%
  #       do(
  #         ontologyIndex::get_term_property(
  #           ontology=onto,
  #           property="ancestors",
  #           term=sprintf("FF:%s", .$onto_link),
  #           as_names=TRUE
  #         ) %>%
  #           as.data.frame() %>%
  #           t() %>%
  #           as_tibble() %>%
  #           mutate(onto_link =  names(.)[length(names(.))] %>% gsub("FF:", "", .) ) %>%
  #           setNames(
  #             c(
  #               names(.)[-c(length(names(.)), length(names(.))-1)],
  #               c("FF:main_info", "onto_link")
  #             )
  #           ) %>%
  #           gather(onto_category, onto_value, -onto_link)
  #       )  %>%
  #
  #       # Filter onyl human samples
  #       right_join((.) %>% filter(onto_value == "human sample") %>% distinct(onto_link)  ) %>%
  #
  #       # Filter only main info
  #       filter(onto_category == "FF:main_info") %>%
  #
  #       # Establish cell types
  #       mutate(`Cell type` = onto_value)
  #   ) %>%
  #   distinct(onto_link ,  sample, `Cell type`) %>%
  #   write_csv("big_data/tibble_cellType_files/FANTOM5_annotation_cell_types.csv")

  reads %>%
    # Attach ontology
    left_join(
      read_csv("~/PostDoc/RNAseq-noise-model/big_data/tibble_cellType_files/FANTOM5_annotation_cell_types.csv")
    ) %>%
    dplyr::distinct(sample, symbol, `Cell type`, `Cell type formatted`, `count`, entrezgene_id) %>%

    mutate(`Data base` = "FANTOM5")

}

get_BLUEPRINT = function(){
  # Parse BLUEPRINT data base


  # read_delim(
  #   "~/PhD/deconvolution/BLUEPRINT_db/blueprint_files.tsv",
  #   "\t",
  #   escape_double = FALSE,
  #   trim_ws = TRUE
  # ) %>%
  #   filter(`File type` == "Transcription quantification (Genes)") %>%
  #   filter(!is.na(`Cell type`)) %>%
  #
  #   # Get data from URL
  #   separate(URL, sep="/|\\.", sprintf("URL_%s", 1:30), remove = F) %>%
  #   mutate(`Cell type` = gsub("_", " ", URL_15)) %>%
  #   mutate(sample = URL_18) %>%
  #
  #   distinct(Group, `Sub-group`, `Cell type`, Tissue, sample) %>%
  #   write_csv("big_data/tibble_cellType_files/BLUEPRINT__annotation_cell_types.csv")

  # # Chenage cell type names
  # mutate(`Cell type formatted` = NA) %>%
  # mutate(`Cell type formatted` = ifelse(grepl("plasma cell", `Cell type`, ignore.case=T), "plasma_cell", `Cell type formatted`)) %>%
  # mutate(`Cell type formatted` = ifelse(grepl("plasma cell", `Cell type`, ignore.case=T), "plasma_cell", `Cell type formatted`)) %>%
  # mutate(`Cell type formatted` = ifelse(grepl("band form neutrophil", `Cell type`, ignore.case=T), "neutrophil", `Cell type formatted`)) %>%
  # mutate(`Cell type formatted` = ifelse(grepl("mature neutrophil", `Cell type`, ignore.case=T), "neutrophil", `Cell type formatted`)) %>%
  # mutate(`Cell type formatted` = ifelse(grepl("segmented neutrophil of bone marrow", `Cell type`, ignore.case=T), "neutrophil", `Cell type formatted`)) %>%
  # mutate(`Cell type formatted` = ifelse(grepl("myeloid cell", `Cell type`, ignore.case=T), "myeloid", `Cell type formatted`)) %>%
  # mutate(`Cell type formatted` = ifelse(grepl("lymphocyte of B lineage", `Cell type`, ignore.case=T), "b_cell", `Cell type formatted`)) %>%
  # mutate(`Cell type formatted` = ifelse(grepl("CD14-positive, CD16-negative classical monocyte", `Cell type`, ignore.case=T), "monocyte", `Cell type formatted`)) %>%
  # mutate(`Cell type formatted` = ifelse(grepl("common lymphoid progenitor", `Cell type`, ignore.case=T), "lymphoid", `Cell type formatted`)) %>%
  # mutate(`Cell type formatted` = ifelse(grepl("hematopoietic multipotent progenitor cell", `Cell type`, ignore.case=T), "stem_cell", `Cell type formatted`)) %>%
  # mutate(`Cell type formatted` = ifelse(grepl("hematopoietic stem cell", `Cell type`, ignore.case=T), "stem_cell", `Cell type formatted`)) %>%
  # mutate(`Cell type formatted` = ifelse(grepl("CD38-negative naive B cell", `Cell type`, ignore.case=T), "b_cell_naive", `Cell type formatted`)) %>%
  # mutate(`Cell type formatted` = ifelse(grepl("cytotoxic CD56-dim natural killer cell", `Cell type`, ignore.case=T), "natural_killer", `Cell type formatted`)) %>%
  # mutate(`Cell type formatted` = ifelse(grepl("common myeloid progenitor", `Cell type`, ignore.case=T), "myeloid", `Cell type formatted`)) %>%
  # mutate(`Cell type formatted` = ifelse(grepl("inflammatory macrophage", `Cell type`, ignore.case=T), "macrophage_M1", `Cell type formatted`)) %>%
  # mutate(`Cell type formatted` = ifelse(grepl("macrophage", `Cell type`, ignore.case=T), "macrophage", `Cell type formatted`)) %>%
  # mutate(`Cell type formatted` = ifelse(grepl("endothelial", `Cell type`, ignore.case=T), "endothelial", `Cell type formatted`)) %>%
  # mutate(`Cell type formatted` = ifelse(grepl("alternatively activated macrophage", `Cell type`, ignore.case=T), "macrophage_M2", `Cell type formatted`)) %>%
  # mutate(`Cell type formatted` = ifelse(grepl("conventional dendritic cell", `Cell type`, ignore.case=T), "dendritic", `Cell type formatted`)) %>%
  # mutate(`Cell type formatted` = ifelse(grepl("germinal center B cell", `Cell type`, ignore.case=T), "b_cell_germinal_center", `Cell type formatted`)) %>%
  # mutate(`Cell type formatted` = ifelse(grepl("naive B cell", `Cell type`, ignore.case=T), "b_cell_naive", `Cell type formatted`)) %>%
  # mutate(`Cell type formatted` = ifelse(grepl("immature conventional dendritic cell", `Cell type`, ignore.case=T), "dendritic_immature", `Cell type formatted`)) %>%
  # mutate(`Cell type formatted` = ifelse(grepl("mature conventional dendritic cell", `Cell type`, ignore.case=T), "dendritic", `Cell type formatted`)) %>%
  # mutate(`Cell type formatted` = ifelse(grepl("osteoclast", `Cell type`, ignore.case=T), "osteoclast", `Cell type formatted`)) %>%
  # mutate(`Cell type formatted` = ifelse(grepl("class switched memory B cell", `Cell type`, ignore.case=T), "b_cell_memory", `Cell type formatted`)) %>%
  # mutate(`Cell type formatted` = ifelse(grepl("memory B cell", `Cell type`, ignore.case=T), "b_cell_memory", `Cell type formatted`)) %>%
  # mutate(`Cell type formatted` = ifelse(grepl("monocyte", `Cell type`, ignore.case=T), "monocyte", `Cell type formatted`)) %>%
  # mutate(`Cell type formatted` = ifelse(grepl("peripheral blood mononuclear cell", `Cell type`, ignore.case=T), "mono_derived", `Cell type formatted`)) %>%
  #
  # mutate(`Cell type formatted` = ifelse(`Cell type`==("CD8-positive alpha-beta T cell"), "t_CD8", `Cell type formatted`)) %>%
  # mutate(`Cell type formatted` = ifelse(`Cell type`==("CD4-positive alpha-beta T cell"), "t_CD4", `Cell type formatted`)) %>%
  # mutate(`Cell type formatted` = ifelse(grepl("central memory CD8-positive", `Cell type`, ignore.case=T), "t_CD8_memory_central", `Cell type formatted`)) %>%
  # mutate(`Cell type formatted` = ifelse(grepl("effector memory CD8-positive", `Cell type`, ignore.case=T), "t_CD8_memory_effector", `Cell type formatted`)) %>%
  # mutate(`Cell type formatted` = ifelse(grepl("central memory CD4-positive", `Cell type`, ignore.case=T), "t_memory_central", `Cell type formatted`)) %>%
  # mutate(`Cell type formatted` = ifelse(grepl("effector memory CD4-positive", `Cell type`, ignore.case=T), "t_memory_effector", `Cell type formatted`)) %>%
  # mutate(`Cell type formatted` = ifelse(grepl("regulatory T cell", `Cell type`, ignore.case=T), "t_reg", `Cell type formatted`)) %>%

  # Select info
  read_csv("~/PostDoc/RNAseq-noise-model/big_data/tibble_cellType_files/BLUEPRINT__annotation_cell_types.csv") %>%

    # Add expression
    left_join(

      foreach(
        my_file = dir(
          path="~/PhD/deconvolution/BLUEPRINT_db/",
          pattern="results",
          full.names=T
        ),
        .combine = bind_rows
      ) %dopar% {
        read_delim(
          my_file,
          "\t",
          escape_double = FALSE,
          trim_ws = TRUE
        ) %>%
          mutate(sample = my_file %>% basename())
      } %>%
        separate(sample, sep="\\.", sprintf("sample_%s", 1:6), remove=F) %>%
        mutate(sample = sample_1) %>%
        select(gene_id, expected_count, sample)

    ) %>%

    rename(`count` =  expected_count) %>%

    # Attach symbol names
    separate(gene_id, sep="\\.", c("ensembl_gene_id", "isoform"), remove=F) %>%
    add_symbol_from_ensembl("ensembl_gene_id") %>%
    rename(symbol = hgnc_symbol) %>%
    distinct %>%
    mutate(`Data base` = "BLUEPRINT") %>%

    # NO NK
    filter(`Cell type formatted` != "natural_killer")


  #AnnotationDbi::mapIds(org.Hs.eg.db::org.Hs.eg.db, "ENSG00000069712", 'SYMBOL', 'ENSEMBL')
}

get_bloodRNA = function(){

  load("~/PhD/deconvolution/test_ARMET/myTest_TME_first_run_pure_populations_TME/humanRNASeqCnts_CdG160630.rda")

  counts$counts %>%
    as_tibble(rownames="ensembl_gene_id") %>%
    gather(file, `count`, -ensembl_gene_id) %>%
    mutate(`Cell type` = NA) %>%

    # Format cell types
    mutate(`Cell type` = ifelse(grepl("_mDC_", file), "dendritic_myeloid", `Cell type`)) %>%

    mutate(`Cell type` = ifelse(grepl("_CD4_Tcells_", file), "t_CD4", `Cell type`)) %>%
    mutate(`Cell type` = ifelse(grepl("_Eosinophils_", file), "eosinophil", `Cell type`)) %>%
    mutate(`Cell type` = ifelse(grepl("_Memory_Bcells_", file), "b_memory", `Cell type`)) %>%
    mutate(`Cell type` = ifelse(grepl("_Monocytes_", file), "monocyte", `Cell type`)) %>%
    mutate(`Cell type` = ifelse(grepl("_Naive_Bcells_", file), "b_naive", `Cell type`)) %>%
    mutate(`Cell type` = ifelse(grepl("_Neutrophils_", file), "neutrophil", `Cell type`)) %>%
    #mutate(`Cell type` = ifelse(grepl("_Nkcells_", file), "natural_killer", `Cell type`)) %>%
    mutate(`Cell type` = ifelse(grepl("_CD8_Tcells_", file), "t_CD8", `Cell type`)) %>%
    mutate(`Cell type` = ifelse(grepl("_Mem_Bcell_", file), "b_memory", `Cell type`)) %>%
    filter(`Cell type` %>% is.na %>% `!`) %>%

    # Merge same samples
    mutate(sample = gsub("_C1B73ACXX.+", "", file)) %>%
    group_by(sample, ensembl_gene_id, `Cell type`) %>%
    summarise(`count` = `count` %>% median(na.rm=T)) %>%
    ungroup() %>%


    # Cell type formatted
    mutate(`Cell type formatted` = `Cell type`) %>%

    # Attach symbol names
    add_symbol_from_ensembl("ensembl_gene_id") %>%
    rename(symbol = hgnc_symbol) %>%
    distinct %>%

    mutate(`Data base` = "bloodRNA")  %>%

    # NO NK
    filter(`Cell type formatted` != "natural_killer")

}

get_ENCODE = function(){


  # read_delim("~/PhD/deconvolution/ENCODE/metadata.tsv",  "\t", escape_double = FALSE, trim_ws = TRUE) %>%
  #   filter(`Output type` == "gene quantifications") %>%
  #   dplyr::mutate(sample = `File accession`) %>%
  #   mutate(`Cell type` = `Biosample term name`) %>%
  #   distinct(sample, `Cell type`, `Biosample type`) %>%
  #   write_csv("big_data/tibble_cellType_files/ENCODE__annotation_cell_types.csv")
  #
  #
  # metadata$`Cell type formatted` = NA
  # metadata$`Cell type formatted`[grep("epithelial", metadata$`Biosample term name`, ignore.case=T)] = "epithelial"
  # metadata$`Cell type formatted`[grep("stem cell", metadata$`Biosample term name`, fixed=T) ] = "stem_cell"
  # metadata$`Cell type formatted`[grep("fibroblast", metadata$`Biosample term name`, fixed=T) ] = "fibroblast"
  # metadata$`Cell type formatted`[grep("endothelial", metadata$`Biosample term name`, fixed=T) ] = "endothelial"
  # metadata$`Cell type formatted`[grep("smooth muscle", metadata$`Biosample term name`, fixed=T) ] = "smooth_muscle"
  # metadata$`Cell type formatted`[grep("neuron", metadata$`Biosample term name`, fixed=T) ] = "neural"
  # metadata$`Cell type formatted`[grep("keratinocyte", metadata$`Biosample term name`, fixed=T) ] = "keratinocyte"
  # metadata$`Cell type formatted`[grep("astrocyte", metadata$`Biosample term name`, fixed=T) ] = "astrocyte"
  # metadata$`Cell type formatted`[grep("dendritic cell", metadata$`Biosample term name`, fixed=T) ] = "dendritic"
  # metadata$`Cell type formatted`[grep("myocyte", metadata$`Biosample term name`, fixed=T) ] = "myocyte"
  # metadata$`Cell type formatted`[grep("natural killer", metadata$`Biosample term name`, fixed=T) ] = "natural_killer"
  # metadata$`Cell type formatted`[grep("CD4-positive helper T cell", metadata$`Biosample term name`, fixed=T) ] = "t_helper"
  # metadata$`Cell type formatted`[grep("neural", metadata$`Biosample term name`, fixed=T) ] = "neural"
  # metadata$`Cell type formatted`[grep("CD14-positive monocyte", metadata$`Biosample term name`, fixed=T) ] = "mono_derived"
  # metadata$`Cell type formatted`[grep("hepatocyte", metadata$`Biosample term name`, fixed=T) ] = "hepatocyte"
  # metadata$`Cell type formatted`[grep("hematopoietic multipotent progenitor cell", metadata$`Biosample term name`, fixed=T) ] = "stem_cell"
  # metadata$`Cell type formatted`[grep("chondrocyte", metadata$`Biosample term name`)] = "chondrocyte"
  # metadata$`Cell type formatted`[grep("CD8-positive, alpha-beta T cell", metadata$`Biosample term name`) ] = "t_CD8"
  # metadata$`Cell type formatted`[grep("melanocyte", metadata$`Biosample term name`, fixed=T) ] = "melanocyte"
  # metadata$`Cell type formatted`[grep("B cell", metadata$`Biosample term name`, fixed=T) ] = "b_cell"
  # metadata$`Cell type formatted`[grep("osteoblast", metadata$`Biosample term name`, fixed=T) ] = "osteoblast"
  # metadata$`Cell type formatted`[grep("naive B cell", metadata$`Biosample term name`, fixed=T) ] = "b_cell_naive"
  # metadata$`Cell type formatted`[grep("T-cell", metadata$`Biosample term name`, fixed=T) ] = "t_cell"
  # metadata = metadata[!is.na(metadata$`Cell type formatted`),]

  read_csv("~/PostDoc/RNAseq-noise-model/big_data/tibble_cellType_files/ENCODE__annotation_cell_types.csv") %>%

    left_join(
      # Get counts from files
      foreach(f = .$sample, .combine = bind_rows) %dopar% {
        read_delim(
          sprintf("~/PhD/deconvolution/ENCODE/%s.tsv", f),
          "\t",
          escape_double = FALSE,
          trim_ws = TRUE
        ) %>%
          dplyr:::select(gene_id, expected_count) %>%
          mutate(sample = f)
      }
    ) %>%
    {
      samples_with_symbol = c("ENCFF060YNO", "ENCFF677SZA", "ENCFF708ZUJ", "ENCFF255ULI", "ENCFF717WSQ", "ENCFF118GPH", "ENCFF083PYO" ,"ENCFF712VOY" ,"ENCFF094ADI",
                              "ENCFF841AKS", "ENCFF331CDB", "ENCFF263OIE", "ENCFF798GKH" ,"ENCFF491YKJ" ,"ENCFF867RFN", "ENCFF461BKM", "ENCFF929RZY", "ENCFF440CJU",
                              "ENCFF246ZOR", "ENCFF680YEW")
      bind_rows(

        (.) %>%
          filter(sample %in% samples_with_symbol) %>%
          rename(symbol = gene_id),

        (.) %>% filter(!sample %in% samples_with_symbol) %>%
          separate(gene_id, sep="\\.", c("ensembl_gene_id", "isoform"), remove=F) %>%
          add_symbol_from_ensembl("ensembl_gene_id") %>%
          rename(symbol = hgnc_symbol)
      )
    } %>%
    rename(`count` = expected_count) %>%
    dplyr::select(sample, `Cell type`, `Cell type formatted`, `count`, symbol, ensembl_gene_id) %>%
    distinct %>%

    mutate(`Data base` = "ENCODE") %>%

    # NO NK
    filter(`Cell type formatted` != "natural_killer")

}

get_N52_TME = function(){
  load("~/PhD/TME_prostate_N52/N52_plus_12_run/fastq/run_all/code_repository/input_parallel_TABI.RData")
  ex %>% gather(sample, `count`, -symbol) %>%
    rename(count = `count`) %>%
    left_join(
      annot %>%
        left_join(
          tibble(
            `Cell type` = c("EpCAM", "CD90+ CD31-", "CD45+CD3+", "CD45+CD16+"),
            `Cell type formatted` = c("epithelial", "fibroblast", "t_cell", "myeloid"),
            cell_type_formatted = c("E", "F", "T", "M")
          )
        ) %>%
        distinct(file, `Cell type`, `Cell type formatted`) %>%
        rename(sample = file)
    ) %>%
    mutate(`Data base` = "N52 prostate")
}

get_immune_singapoor = function(){

  # Emailed the author  GSE107011
  load("~/PhD/deconvolution/immune_singapoor/Genes_TPM_counts.RData")

  kallisto_gene %$% counts %>%
    as_tibble(rownames = "gene_id") %>%
    gather(sample, `count`, -gene_id) %>%
    mutate(`count` = `count` %>% as.integer) %>%

    # Add cell type
    left_join(
      kallisto_gene %$% samples %>% as_tibble %>%
        distinct(sample, group10) %>%
        rename(`Cell type` = group10) %>%
        mutate(`Cell type` = gsub('[\"]', '', `Cell type`) %>% trimws  ) %>%
        left_join(
          read_csv("~/PhD/deconvolution/immune_singapoor/cell_type_conversion.csv")
        )
    ) %>%

    # Add symbol
    separate(gene_id, sep="\\.", c("ensembl_gene_id", "isoform"), remove=F) %>%
    ttBulk::annotate_symbol(ensembl_gene_id) %>%
    rename(symbol = transcript) %>%
    distinct() %>%

    # Add data base label
    mutate(`Data base` = "Immune Singapoor") %>%

    # NO NK
    filter(`Cell type formatted` != "natural_killer")

}

get_dendritic_STIMULATED_NOT_GOOD = function(){
  read_csv("~/PhD/deconvolution/mDC_database/GSE89442_Mathan_et_al_RNAseq_data_raw_values.csv") %>%
    gather(sample, `count`, -Symbol,-GeneID) %>%
    rename(symbol = Symbol) %>%
    filter(grepl("unstimulated", sample)) %>%
    mutate(`Cell type` = sample) %>%
    mutate(sample = paste("GSE89442", sample)) %>%
    mutate(`Cell type formatted` = ifelse(grepl("mDC", sample), "dendritic_myeloid", "dendritic_plasmacytoid")) %>%
    mutate(`Data base` = "dendritic GSE89442")
}

get_influenza_immune = function(){

  # PRJNA271578
  # Download fastq pair enda
  # cat SRR_Acc_List.txt | parallel --eta -j 10  '~/third_party_sofware/sratoolkit.2.9.6-1-centos_linux64/bin/fastq-dump --outdir fastq --gzip --skip-technical  --readids --read-filter pass --dumpbase --split-3 --clip {}'
  # module load cutadapt; module load trimgalore; ls fastq/ | awk '{split($0,a,"_"); print a[1]}' | uniq | parallel --eta -j1 trim_galore --paired {}_pass_1.fastq.gz {}_pass_2.fastq.gz --fastqc > trim_galore.log
  # module load STAR; for i in $(ls *_val_*fq.gz | rev | cut -c 15- | rev | uniq); do mkdir -p alignment_hg38/$i; cd alignment_hg38/$i; STAR --genomeDir ~/third_party_sofware/hg38/karyotypic/ --readFilesIn ../../$i'_1_val_1.fq.gz' ../../$i'_2_val_2.fq.gz' --readFilesCommand zcat --genomeLoad LoadAndKeep --runThreadN 42 --outSAMtype BAM SortedByCoordinate --limitBAMsortRAM 40000000000 --outReadsUnmapped Fastx; cd ../../; done

  read_csv("~/PhD/deconvolution/immune_influenza_RNAseq_PBMC/counts_paired_end.csv") %>%
    rename(count = `read count`) %>%
    separate(sample, c("dummy", "sample"), sep=c("hg38.")) %>%
    separate(sample, c("sample", "dummy"), sep=c("_pass")) %>%
    select(-dummy) %>%
    inner_join(
      read_csv("~/PhD/deconvolution/immune_influenza_RNAseq_PBMC/annot.csv") %>%
        rename(sample = Run, `Cell type` = `source name`) %>%
        filter(`Cell type` != "PBMC") %>%
        filter(time == "0 d") %>%
        select(`Cell type`, sample)
    ) %>%
    left_join(
      tibble(
        `Cell type` = c("T cells", "NK cells",  "Neutrophils", "Monocytes", "myeloid DC", "B cells"),
        `Cell type formatted` = c("t_cell", "natural_killer_NO_HERE",  "neutrophil", "monocyte", "dendritic_myeloid", "b_cell")
      )
    ) %>%
    mutate(`Data base` = "influenza immune") %>%

    # NO NK
    filter(`Cell type formatted` != "natural_killer")

}

get_immune_skin = function(){

  # PRJNA476386
  # Download fastq pair enda
  # cat SRR_Acc_List.txt | parallel --eta -j 10  '~/third_party_sofware/sratoolkit.2.9.6-1-centos_linux64/bin/fastq-dump --outdir fastq --gzip --skip-technical  --readids --read-filter pass --dumpbase --split-3 --clip {}'
  # module load cutadapt; module load trimgalore; module load parallel;  ls fastq/ | awk '{split($0,a,"_"); print a[1]}' | uniq | parallel --eta -j1 trim_galore --paired {}_pass_1.fastq.gz {}_pass_2.fastq.gz --fastqc > trim_galore.log
  # for i in $(ls *_val_*fq.gz | rev | cut -c 15- | rev | uniq); do qsub -l nodes=1:ppn=24,mem=41gb,walltime=120:00:00 STAR_HPC.sh -F "$i"; done


  read_csv("~/PhD/deconvolution/immune_skin/counts_paired_end.csv") %>%
    separate(sample, c("sample", "dummy"), sep=c("\\.pass")) %>%
    mutate(sample = gsub("\\.", "", sample)) %>%
    inner_join(
      read_csv("~/PhD/deconvolution/immune_skin/annot.csv") %>%
        rename(sample = Run, `Cell type` = `cell type`) %>%
        select(`Cell type`, sample)
    ) %>%
    left_join(
      tibble(
        `Cell type` = c("CD8+ T cell", "Keratinocyte",  "Dendritic cell", "CD4+ T effector"),
        `Cell type formatted` = c("t_CD8", "keratinocyte",  "dendritic_myeloid", "t_CD4_effector")
      )
    ) %>%
    mutate(`Data base` = "skin immune") %>%

    rename(symbol = transcript)

}

get_NK_curated_yuhan = function(){

  read_csv("~/PhD/deconvolution/NK_yuhan/Ste_Alex_GEO_RNA_seq.csv") %>%
    rename(`Cell type formatted` = state, `Data base` = database, symbol = transcript) %>%
    mutate(`Cell type formatted` = ifelse(`Cell type formatted` != "nk_resting", "nk_primed", `Cell type formatted`)) %>%
		filter(`Data base` != "FANTOM5") %>%

		# Add priming state
		left_join( read_csv("~/PhD/deconvolution/NK_yuhan/priming_type.csv")) %>%
		mutate(`Cell type formatted` = ifelse(state %>% is.na, `Cell type formatted`, state))

}

get_macro = function(){

  # PRJNA339309
  # Download fastq pair enda
  # cat SRR_Acc_List.txt | parallel --eta -j 10  '~/third_party_sofware/sratoolkit.2.9.6-1-centos_linux64/bin/fastq-dump --outdir fastq --gzip --skip-technical  --readids --read-filter pass --dumpbase --split-3 --clip {}'
  # module load parallel; module load trimgalore; module load cutadapt; ls ./ | awk '{split($0,a,"_"); print a[1]}' | uniq | parallel --eta -j1 trim_galore --paired {}_pass_1.fastq.gz {}_pass_2.fastq.gz --fastqc > trim_galore.log
  # for i in $(ls *_val_*fq.gz | rev | cut -c 15- | rev | uniq); do qsub -l nodes=1:ppn=24,mem=41gb,walltime=120:00:00 STAR_HPC.sh -F "$i"; done


  read_csv("~/PhD/deconvolution/macrophages_db/macrophages_DB_PRJNA339309.csv") %>%
    select(-contains("Unassigned")) %>%
    rename(symbol = transcript) %>%
    mutate(`Data base` = "PRJNA339309_macrophages")


}

get_mast_cell = function(){
  methods = "Mature human MCs were generated from human peripheral blood-derived CD34+ progenitor cells collected from healthy volunteers"

  dir(
    path = "/stornext/Home/data/allstaff/m/mangiola.s/PhD/deconvolution/mast_cell_GSE125887/",
    pattern="tsv",
    full.names = T
  ) %>%
    map_dfr(
      ~ .x %>%
        read_delim("\t",escape_double = FALSE, col_names = FALSE, trim_ws = TRUE) %>%
        mutate(file = .x)
    ) %>%
    select(X1, X2, file) %>%
    setNames(c("symbol", "count", "file")) %>%
    filter(!grepl("^N_", symbol)) %>%
    separate(file, c("dummy", "sample"), sep="GSE125887//") %>%
    separate(sample, c("sample", "dummy"), sep="_star_hg19") %>%
    select(-dummy) %>%
    mutate(methods = !!methods %>% as.factor) %>%
    mutate(`Data base` = "GSE125887") %>%
    mutate(`Cell type formatted` = "mast_cell")


}

get_melanocytes = function(){

	dir(
		path = "../melanocyte_GSE71747_cultured",
		pattern="txt$",
		full.names = T
	) %>%
		map_dfr(
			~ .x %>%
				read_delim("\t", escape_double = FALSE, trim_ws = TRUE) %>%
				setNames(c("symbol", "count")) %>%
				mutate_if(is.double, as.integer) %>%
				mutate(file = .x)
		) %>%
		filter(!grepl("^N_", symbol)) %>%
		separate(file, c("dummy", "sample"), sep="_cultured/") %>%
		separate(sample, c("sample", "dummy"), sep="_L005") %>%
		select(-dummy) %>%
		mutate(methods = "cultured from tissue" %>% as.factor) %>%
		mutate(`Data base` = "GSE71747") %>%
		mutate(`Cell type formatted` = "melanocyte") %>%

		# bind_rows(
		# 	# GSE109245
		# 	read_delim("../melanocyte_GSE109245_RAW//GSM2935823_mcf_quant.sf.txt", "\t", escape_double = FALSE, trim_ws = TRUE) %>%
		# 		select(Name, NumReads) %>%
		# 		mutate(sample ="GSM2935823") %>%
		# 		annotate_symbol(Name) %>%
		# 		rename(count = NumReads, symbol = transcript) %>%
		# 		ttBulk(sample, symbol, count) %>%
		# 		aggregate_duplicates() %>%
		# 		select(-`merged transcripts`) %>%
		# 		filter(!grepl("^N_", symbol)) %>%
		# 		mutate(methods = "cultured from tissue" %>% as.factor) %>%
		# 		mutate(`Data base` = "GSE109245") %>%
		# 		mutate(`Cell type formatted` = "melanocyte")
		#
		# ) %>%

		bind_rows(
			# Other dataset

			# GSE138412
			# Download fastq pair enda
			# cat SRR_Acc_List.txt | parallel --eta -j 10  '~/third_party_sofware/sratoolkit.2.9.6-1-centos_linux64/bin/fastq-dump --outdir fastq --gzip --skip-technical  --readids --read-filter pass --dumpbase --split-3 --clip {}'
			# module load parallel; module load trimgalore; module load cutadapt; ls ./ | awk '{split($0,a,"_"); print a[1]}' | uniq | parallel --eta -j1 trim_galore --paired {}_pass_1.fastq.gz {}_pass_2.fastq.gz --fastqc > trim_galore.log
			# mv fastq/*_val_* ./
			# for i in $(ls *_val_*fq.gz | rev | cut -c 15- | rev | uniq); do qsub -l nodes=1:ppn=24,mem=41gb,walltime=120:00:00 STAR_HPC.sh -F "$i"; done


			read_csv("../melanocyte_GSE138412/melanocyte_GSE138412.csv") %>%
				rename(symbol = transcript) %>%
				select(entrez ,sample , count , symbol, group) %>%
				mutate(methods = "cultured from tissue" %>% as.factor) %>%
				mutate(`Data base` = "GSE138412") %>%
				mutate(`Cell type formatted` = "melanocyte") %>%
				mutate(	sample = gsub("alignment.hg38.", "", sample, fixed = T)) %>%
				mutate(	sample = gsub(".pass.Aligned.sortedByCoord.out.bam", "", sample, fixed = T))

		)


}

# from here

get_myeloid_differentiation = function(){

  # Emailed the author GSM2084371
  0

}

get_monocytes_brain = function(){

  # Emailed authors
  # Data set GSE88888 from  doi: 10.1038/s41598-018-28986-7

  GSE88888 <-
    read_delim("~/PhD/deconvolution/immune_monocyte_2/raw.txt",  "\t", escape_double = FALSE, trim_ws = TRUE) %>%
    separate(`Annotation/Divergence`, c("transcript"), extra = "drop") %>%
    gather(sample, count, -(1:8)) %>%
    mutate(sample = gsub("/gpfs/data01/glasslab/home/jtao/analysis/pd_analysis//tag_directories/", "", sample)) %>%
    filter(grepl("RNA_Control", sample))

  GSE88888 %>%
    ttBulk(sample, transcript, count) %>%
    aggregate_duplicates() %>%
    scale_abundance() %>%
    reduce_dimensions(method="MDS") %>%
    select(contains("Dim"), sample) %>%
  distinct %>%
  ggplot(aes(x = `Dim 1`, y = `Dim 2`)) + geom_point()


}

# Produce data sets

ENCODE = get_ENCODE()
save(ENCODE, file="~/PostDoc/RNAseq-noise-model/big_data/tibble_cellType_files/ENCODE.RData", compress = "gzip")

BLUEPRINT = get_BLUEPRINT()
save(BLUEPRINT, file="~/PostDoc/RNAseq-noise-model/big_data/tibble_cellType_files/BLUEPRINT.RData", compress = "gzip")

# FANTOM5 = get_FANTOM5()
# save(FANTOM5, file="~/PostDoc/RNAseq-noise-model/big_data/tibble_cellType_files/FANTOM5.RData", compress = "gzip")

bloodRNA = get_bloodRNA()
save(bloodRNA, file="~/PostDoc/RNAseq-noise-model/big_data/tibble_cellType_files/bloodRNA.RData", compress = "gzip")

immune_singapoor = get_immune_singapoor()
save(immune_singapoor, file="~/PostDoc/RNAseq-noise-model/big_data/tibble_cellType_files/immune_singapoor.RData", compress = "gzip")

N52_TME = get_N52_TME()
save(N52_TME, file="~/PostDoc/RNAseq-noise-model/big_data/tibble_cellType_files/N52_TME.RData", compress = "gzip")

# dendritic = get_dendritic()
# save(dendritic, file="big_data/tibble_cellType_files/dendritic.RData")

influenza_immune = get_influenza_immune()
save(influenza_immune, file="~/PostDoc/RNAseq-noise-model/big_data/tibble_cellType_files/influenza_immune.RData", compress = "gzip")

immune_skin = get_immune_skin()
save(immune_skin, file="~/PostDoc/RNAseq-noise-model/big_data/tibble_cellType_files/immune_skin.RData", compress = "gzip")

nk_yuhan = get_NK_curated_yuhan()
save(nk_yuhan, file="~/PostDoc/RNAseq-noise-model/big_data/tibble_cellType_files/nk_yuhan.RData", compress = "gzip")

macro = get_macro()
save(macro, file="~/PostDoc/RNAseq-noise-model/big_data/tibble_cellType_files/macro.RData", compress = "gzip")

mast = get_mast_cell()
save(mast, file="~/PostDoc/RNAseq-noise-model/big_data/tibble_cellType_files/mast.RData", compress = "gzip")

melanocyte = get_melanocytes()
save(melanocyte, file="~/PostDoc/RNAseq-noise-model/big_data/tibble_cellType_files/melanocyte.RData", compress = "gzip")


load("~/PostDoc/RNAseq-noise-model/big_data/tibble_cellType_files/ENCODE.RData")
load("~/PostDoc/RNAseq-noise-model/big_data/tibble_cellType_files/BLUEPRINT.RData")
load("~/PostDoc/RNAseq-noise-model/big_data/tibble_cellType_files/bloodRNA.RData")
load("~/PostDoc/RNAseq-noise-model/big_data/tibble_cellType_files/immune_singapoor.RData")
load("~/PostDoc/RNAseq-noise-model/big_data/tibble_cellType_files/N52_TME.RData")
load("~/PostDoc/RNAseq-noise-model/big_data/tibble_cellType_files/influenza_immune.RData")
load("~/PostDoc/RNAseq-noise-model/big_data/tibble_cellType_files/immune_skin.RData")
load("~/PostDoc/RNAseq-noise-model/big_data/tibble_cellType_files/nk_yuhan.RData")
load("~/PostDoc/RNAseq-noise-model/big_data/tibble_cellType_files/macro.RData")
load("~/PostDoc/RNAseq-noise-model/big_data/tibble_cellType_files/mast.RData")
load("~/PostDoc/RNAseq-noise-model/big_data/tibble_cellType_files/melanocyte.RData")

all =
  ENCODE %>%
  bind_rows(BLUEPRINT) %>%
  bind_rows(bloodRNA) %>%
  bind_rows(immune_singapoor) %>%
  bind_rows(N52_TME) %>%
  bind_rows(influenza_immune) %>%
  bind_rows(immune_skin) %>%
  bind_rows(nk_yuhan) %>%
  bind_rows(macro) %>%
  bind_rows(mast) %>%
	bind_rows(melanocyte) %>%

  #print statistics
  {
    (.) %>% distinct(`Cell type formatted`, `Data base`, sample) %>% count(`Cell type formatted`, `Data base`)%>% drop_na %>% arrange(n %>% desc) %>% spread(`Data base`, n) %>% print(n=99)
    (.)
  } %>%

  filter(symbol %>% is.na %>% `!`) %>%
  filter(`Cell type formatted` %>% is.na %>% `!`) %>%
  mutate(`count` = `count` %>% as.integer) %>%
  mutate_if(is.character, as.factor)


all %>%
  group_by(`Cell type formatted`) %>%
  do({

    (.) %>%
      write_csv(
        sprintf(
          "~/PostDoc/RNAseq-noise-model/big_data/tibble_cellType_files/tibble_cellTypes_%s.csv",
          (.) %>%
            pull(`Cell type formatted`) %>%
            unique %>%
            { gsub("/| |,", "_", (.)) }
        )
      )

    tibble(`Cell type` = (.) %>% pull(`Cell type formatted`) %>% unique)

  })

# Create harmonised dataset based on myhierarchy

# Setup table of name conversion
tree =
  yaml:: yaml.load_file("~/PhD/deconvolution/ARMET/data/tree.yaml") %>%
  data.tree::as.Node()

sample_blacklist = c(
	"666CRI",
	"972UYG",
	"344KCP",
	"555QVG",
	"370KKZ",
	"511TST",
	"13816.11933",
	"13819.11936",
	"13817.11934",
	"13818.11935",
	"096DQV",
	"711SNV",
	"counts.Ciliary%20Epithelial%20Cells%2c%20donor3.CNhs12009.11399-118D4"   ,
	"counts.Iris%20Pigment%20Epithelial%20Cells%2c%20donor1.CNhs12596.11530-119I9",
	"ENCFF890DJO"
)

ct_to_correlation_threshold =

	tree %>%
	data.tree::ToDataFrameTree("name", "level") %>%
	mutate(level = level -1 ) %>%
	rename(`Cell type category` =  name) %>%
	as_tibble %>%
	left_join(
		tibble(level=1:6, threshold=c(0.9, 0.975, 0.99, 0.99, 0.99, 0.99))
	)

library(data.tree)
library(multidplyr)
cluster <- new_cluster(30)

# Get data
counts =
  foreach(
    f = dir(
      path = "/wehisan/bioinf/bioinf-data/Papenfuss_lab/projects/mangiola.s/PostDoc/RNAseq-noise-model/big_data/tibble_cellType_files",
      pattern="cellTypes_",
      full.names = T
    ),
    .combine = bind_rows
  ) %dopar% {
    read_csv(f) %>%
      mutate_at(vars(one_of('sample')), as.character) %>%
      mutate_at(vars(one_of('isoform')), as.character) %>%
      select(sample, `Cell type`, `Cell type formatted`, count, symbol, `Data base`)
  } %>%

  filter((symbol %>% is.na %>% `!`) & (symbol != "")) %>%
  filter(`Cell type formatted` %>% is.na %>% `!`) %>%

  # Filter out black list
  filter(!grepl(
    sample_blacklist %>% paste(collapse = "|"), sample
  )) %>%

  # Filter dendritic that look too much like monocytes
  filter(!(
    (`Cell type formatted` == "dendritic_myeloid" & `Data base` == "Immune Singapoor") |
      (`Cell type formatted` == "dendritic_myeloid" & `Data base` == "bloodRNA")
  )) %>%

  # Setup Cell type category names
  left_join(
    tree %>%
    Clone %>%
    ARMET::ToDataFrameTypeColFull(F, "name") %>%
    pivot_longer(cols = -name, names_to = "level", values_to = "Cell type category", names_prefix = "level_") %>%
    drop_na() %>%
    mutate(level = as.integer(level) -1) %>%
    filter(level > 0) %>%
    dplyr::rename(`Cell type formatted` = name)
  ) %>%

  filter(`Cell type category` %>% is.na %>% `!`) %>%

  # Eliminate genes that are not in all cell types
  inner_join( (.) %>% distinct(symbol, `Cell type category`) %>% count(symbol) %>% filter(n == max(n)) ) %>%

  # Setup house keeping genes
  mutate(`Cell type category` = ifelse(symbol %in% (read_csv("~/PhD/deconvolution/ARMET/dev/hk_600.txt", col_names = FALSE) %>% pull(1)), "house_keeping", `Cell type category`)) %>%

  # Select just needed columns
  #select(sample, `Cell type`, `Cell type formatted`, `Cell type category`, level, count, symbol, ensembl_gene_id, `Data base`,  `Sample name`) %>%

  # Median redundant
  group_by(level, `Data base`) %>%
  multidplyr::partition(cluster) %>%
  do(
    ttBulk::aggregate_duplicates((.), .sample = sample, .transcript = symbol, .abundance = count)
  ) %>%
  collect() %>%
  ungroup %>%

  # do_parallel_start(n_cores, "symbol") %>%
  # do({
  #   `%>%` = magrittr::`%>%`
  #   library(tidyverse)
  #   library(magrittr)
  #
  #   (.) %>%
  #     group_by(sample, symbol, `Cell type`, `Cell type category`, `Cell type formatted`,  `Data base`) %>%
  #     summarise(`count` = `count` %>% median(na.rm = T)) %>%
  #     ungroup()
  # }) %>%
  # do_parallel_end() %>%

  # Normalise
  group_by(level) %>%
  multidplyr::partition(cluster) %>%
  do(
    ttBulk::scale_abundance((.), sample, symbol, `count`)
  ) %>%
  collect() %>%
  ungroup %>%

  mutate(`count scaled` = `count scaled` %>% as.integer) %>%
  mutate(`count scaled log` = `count scaled` %>% `+` (1) %>% log) %>%

  # mutate symbol
  mutate(`symbol original` = symbol) %>%
  unite(symbol, c("Cell type category", "symbol"), remove = F) %>%

# Remove redundant samples
group_by(level) %>%
  do({

  nr =
    (.) %>%
    filter(`Cell type category` != "house_keeping") %>%
    group_by( `Cell type category`) %>%
    do({

      threshold = (.) %>% distinct(`Cell type category`) %>% left_join( ct_to_correlation_threshold ) %>% pull(threshold)


      # Remove redundant samples
      (.) %>%
        anti_join(
          (.) %>%
            distinct(symbol, sample, `count`) %>%
            ttBulk::filter_variable(sample, symbol, count, top = 300) %>%
            spread(sample, `count`) %>%
            drop_na %>%
            gather(sample, `count`, -symbol) %>%
            rename(rc = `count`) %>%
            mutate_if(is.factor, as.character) %>%
            widyr::pairwise_cor(sample, symbol, rc, sort=T, diag = FALSE, upper = F) %>%
            filter(correlation > threshold) %>%
            distinct(item1) %>%
            rename(sample = item1)
        ) %>%

        # Sample homogeneous population
        mutate( `threshold contribution` = (.) %>%  distinct(sample, `Cell type formatted`) %>% count(`Cell type formatted`) %>% pull(n) %>% quantile(0.8) %>% floor) %>%
        group_by(`Cell type formatted`) %>%
        inner_join( (.) %>% distinct(sample, `threshold contribution`) %>%  filter(row_number(sample) <= `threshold contribution`)) %>%
        ungroup() %>%
        select(- `threshold contribution`)

    }) %>%
    ungroup

  (.) %>%
    filter(`Cell type category` == "house_keeping") %>%
    inner_join( nr %>% distinct(sample)) %>%
    bind_rows( nr )

} )%>%
  ungroup %>%

  mutate(symbol = symbol %>% as.factor) %>%
  mutate(sample = sample %>% as.factor) %>%
  saveRDS(file="~/PhD/deconvolution/ARMET/dev/counts_infer_NB.rds")


# Plots and Study
library(ttBulk)
(
  readRDS(file="counts_infer_NB.rds") %>%
    dplyr::distinct(sample, symbol, `count`, `Cell type formatted`, `Data base`) %>%
    ttBulk::ttBulk(sample, symbol, `count`) %>%
    distinct() %>%
    aggregate_duplicates() %>%
    ttBulk::scale_abundance() %>%
    distinct(`Data base`, sample, symbol, `Cell type formatted`, `count scaled`)  %>%
    reduce_dimensions(sample, symbol, `count scaled`, method = "tSNE", .dims=2) %>%
    select(contains("tSNE"), `Data base`, `Cell type formatted`) %>%
    distinct %>%
    ggplot(aes(x = `tSNE1`, y = `tSNE2`, color=`Cell type formatted`)) + geom_point(size=2) +
    theme_bw() +
    theme(
      panel.border = element_blank(),
      axis.line = element_line(),
      panel.grid.major = element_line(size = 0.2),
      panel.grid.minor = element_line(size = 0.1),
      text = element_text(size=12),
      legend.position="bottom",
      aspect.ratio=1,
      #axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
      strip.background = element_blank(),
      axis.title.x  = element_text(margin = margin(t = 10, r = 10, b = 10, l = 10)),
      axis.title.y  = element_text(margin = margin(t = 10, r = 10, b = 10, l = 10))
    )
) %>% plotly::ggplotly()
#
# (
#   ARMET::ARMET_ref %>%
#     filter(level==3) %>%
#     distinct(`Data base`, sample, symbol, `Cell type category`, `read count scaled bayes`)  %>%
#      inner_join(rr$signatures[[3]] %>% distinct(symbol)) %>%
#     reduce_dimensions(sample, symbol, `read count scaled bayes`, method = "tSNE", .dims=2) %>%
#     select(contains("tSNE"), `Data base`, `Cell type category`) %>%
#     distinct %>%
#     ggplot(aes(x = `tSNE 1`, y = `tSNE 2`, color=`Cell type category`, shape=`Data base`)) + geom_point(size=2) +
#     theme_bw() +
#     theme(
#       panel.border = element_blank(),
#       axis.line = element_line(),
#       panel.grid.major = element_line(size = 0.2),
#       panel.grid.minor = element_line(size = 0.1),
#       text = element_text(size=12),
#       legend.position="bottom",
#       aspect.ratio=1,
#       #axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
#       strip.background = element_blank(),
#       axis.title.x  = element_text(margin = margin(t = 10, r = 10, b = 10, l = 10)),
#       axis.title.y  = element_text(margin = margin(t = 10, r = 10, b = 10, l = 10))
#     )
# ) %>% plotly::ggplotly()


#
# (
#   ARMET::ARMET_ref %>%
#     distinct(`Data base`, sample, symbol, `Cell type formatted`, `read count`)  %>%
#     ttBulk(sample, symbol, `read count`) %>%
#     aggregate_duplicates() %>%
#     scale_abundance() %>%
#     #inner_join(rr$signatures[[3]] %>% distinct(symbol)) %>%
#     reduce_dimensions(sample, symbol, `read count scaled`, method = "tSNE", .dims=2) %>%
#     select(contains("tSNE"), `Data base`, `Cell type formatted`) %>%
#     distinct %>%
#     ggplot(aes(x = `tSNE 1`, y = `tSNE 2`, color=`Cell type formatted`, shape=`Data base`)) + geom_point(size=2) +
#     theme_bw() +
#     theme(
#       panel.border = element_blank(),
#       axis.line = element_line(),
#       panel.grid.major = element_line(size = 0.2),
#       panel.grid.minor = element_line(size = 0.1),
#       text = element_text(size=12),
#       legend.position="bottom",
#       aspect.ratio=1,
#       #axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
#       strip.background = element_blank(),
#       axis.title.x  = element_text(margin = margin(t = 10, r = 10, b = 10, l = 10)),
#       axis.title.y  = element_text(margin = margin(t = 10, r = 10, b = 10, l = 10))
#     )
# ) %>% plotly::ggplotly()
#
# counts_annot =
#   counts %>%
#   filter(`read count` %>% is.na %>% `!`) %>%
#   filter(symbol %>% is.na %>% `!`) %>%
#   left_join(
#     ARMET::tree %>%
#       ARMET::ToDataFrameTypeColFull(fill = F) %>%
#       rename(`Cell type formatted` = level_5)
#   ) %>%
#   ttBulk(sample, symbol, `read count`) %>%
#   aggregate_duplicates() %>%
#   scale_abundance() %>%
#   reduce_dimensions(method = "MDS" ) %>%
#   reduce_dimensions(method = "tSNE")
stemangiola/ARMET documentation built on July 9, 2022, 1:25 a.m.