R/functions.R

Defines functions create_tt_from_bam_sam_bulk create_tt_from_tibble_bulk

Documented in create_tt_from_bam_sam_bulk create_tt_from_tibble_bulk

#' Create tt object from tibble
#'
#' @keywords internal
#'
#' @importFrom rlang enquo
#' @importFrom magrittr %>%
#'
#' @param .data A tibble
#' @param .sample The name of the sample column
#' @param .transcript The name of the transcript/gene column
#' @param .abundance The name of the transcript/gene abundance column
#' @param .abundance_scaled The name of the transcript/gene scaled abundance column
#'
#' @return A tibble with an additional column
#'
#'
create_tt_from_tibble_bulk = function(.data,
																			.sample,
																			.transcript,
																			.abundance,
																			.abundance_scaled = NULL) {
	# Make col names
	.sample = enquo(.sample)
	.transcript = enquo(.transcript)
	.abundance = enquo(.abundance)
	.abundance_scaled = enquo(.abundance_scaled)

	.data %>%

		# Add tt_columns attribute
		add_tt_columns(!!.sample,!!.transcript,!!.abundance,!!.abundance_scaled) %>%
		memorise_methods_used("tidyverse") %>%

		# Add class
		add_class("tt") %>%
		add_class("tidybulk")
}



#' Convert bam/sam files to a tidy gene transcript counts data frame
#'
#' @keywords internal
#'
#' @importFrom purrr reduce
#'
#' @param file_names A character vector
#' @param genome A character string
#' @param ... Further parameters passed to the function Rsubread::featureCounts
#'
#' @return A tibble of gene counts
#'
create_tt_from_bam_sam_bulk <-
	function(file_names, genome = "hg38", ...) {
		# This function uses Subread to count the gene features,
		# annotate gene features with symbols, and
		# convert the data frame to tibble format

		n_cores <- system("nproc", intern = TRUE) %>%
			as.integer() %>%
			`-`(2)

		file_names %>%

			# Run subread
			Rsubread::featureCounts(annot.inbuilt = genome,
															nthreads = n_cores,
															...) %>%

			# Anonymous function
			# input: Subread results
			# output edgeR::DGEList object
			{
				edgeR::DGEList(
					counts = (.)$counts,
					genes = (.)$annotation[, c("GeneID", "Length")],
					samples = (.)$stat %>% as_tibble() %>% gather(sample, temp,-Status) %>% spread(Status, temp)
				)
			} %>%

			# Anonymous function
			# input: edgeR::DGEList object
			# output: edgeR::DGEList object with added transcript symbol
			{
				dge <- (.)
				dge$genes$transcript <-
					AnnotationDbi::mapIds(
						org.Hs.eg.db::org.Hs.eg.db,
						keys = as.character(dge$genes$GeneID),
						column = "SYMBOL",
						keytype = "ENTREZID",
						multiVals = "first"
					)

				dge
			} %>%

			# Anonymous function
			# input: annotated edgeR::DGEList object
			# output: tibble
			{
				reduce(
					list(
						(.) %$% counts %>% as_tibble(rownames = "GeneID") %>% mutate(GeneID = GeneID %>% as.integer()) %>% gather(sample, count,-GeneID),
						(.) %$% genes %>% select(GeneID, transcript) %>% as_tibble(),
						(.) %$% samples %>% as_tibble()
					),
					dplyr::left_join
				) %>%
					rename(entrez = GeneID) %>%
					mutate(entrez = entrez %>% as.character())
			} %>%

			# Add tt_columns attribute
			add_tt_columns(sample,transcript,count) %>%
			memorise_methods_used("featurecounts") %>%

			# Add class
			add_class("tt") %>%
			add_class("tidybulk")
	}

#' Get a tibble with scaled counts using TMM
#'
#' @keywords internal
#'
#' @import dplyr
#' @import tidyr
#' @import tibble
#' @importFrom magrittr equals
#' @importFrom rlang :=
#' @importFrom stats median
#' @importFrom utils install.packages
#'
#' @param .data A tibble
#' @param .sample The name of the sample column
#' @param .transcript The name of the transcript/gene column
#' @param .abundance The name of the transcript/gene abundance column
#' @param method A character string. The scaling method passed to the backend function (i.e., edgeR::calcNormFactors; "TMM","TMMwsp","RLE","upperquartile")
#' @param reference_sample A character string. The name of the reference sample. If NULL the sample with highest total read count will be selected as reference. 
#'
#' @return A tibble including additional columns
#'
#'
get_scaled_counts_bulk <- function(.data,
																	 .sample = NULL,
																	 .transcript = NULL,
																	 .abundance = NULL,
																	 method = "TMM",
																	 reference_sample = NULL) {
	# Get column names
	.sample = enquo(.sample)
	.transcript = enquo(.transcript)
	.abundance = enquo(.abundance)

	# Check if package is installed, otherwise install
	if (find.package("edgeR", quiet = TRUE) %>% length %>% equals(0)) {
		message("Installing edgeR needed for analyses")
		if (!requireNamespace("BiocManager", quietly = TRUE))
			install.packages("BiocManager", repos = "https://cloud.r-project.org")
		BiocManager::install("edgeR", ask = FALSE)
	}

	# Reformat input data set
	df <-
		.data %>%

		# Rename
		dplyr::select(!!.sample,!!.transcript,!!.abundance) %>%

		# Set samples and genes as factors
		dplyr::mutate(!!.sample := factor(!!.sample),!!.transcript := factor(!!.transcript))


	# Get reference
	reference <-
		reference_sample %>%
		when(
			!is.null(.) ~ (.),
			
			# If not specified take most abundance sample
			df %>%
				group_by(!!.sample) %>%
				summarise(sum = sum(!!.abundance)) %>%
				mutate(med = max(sum)) %>%
				mutate(diff = abs(sum - med)) %>%
				arrange(diff) %>%
				head(n = 1) %>%
				pull(!!.sample) %>%
				as.character()
		)
		

	nf_obj <-
		add_scaled_counts_bulk.calcNormFactor(
			df,
			reference,
			.sample = !!.sample,
			.transcript = !!.transcript,
			.abundance = !!.abundance,
			method
		)
 
	# Calculate normalization factors
	nf_obj$nf %>%
		dplyr::left_join(
			df %>%
				group_by(!!.sample) %>%
				summarise(tot = sum(!!.abundance, na.rm = TRUE)) %>%
				ungroup() %>%
				dplyr::mutate(!!.sample := as.factor(as.character(!!.sample))),
			by = quo_name(.sample)
		) %>%
		mutate(multiplier =
					 	1 /
					 	(tot_filt * nf) *
					 	
					 	# Put everything to the reference sample scale
					 	((.) %>% filter(!!.sample == reference) %>% pull(tot))) %>%

		# I have correct the strange behaviour of edgeR of reference
		# sample not being 1
		ifelse_pipe(
			"reference" %in% ((.) %>% pull(!!.sample)),
			~ .x %>%
				mutate(
					multiplier =
						multiplier /
						(.) %>%
						filter(!!.sample == "reference") %>%
						pull(multiplier)
				)
		) %>%
		
		dplyr::select(-tot,-tot_filt) %>%
		dplyr::rename(TMM = nf) %>%
		
		# # Attach internals
		# add_tt_columns(!!.sample,!!.transcript,!!.abundance) %>%
		
		# Add methods
		memorise_methods_used(c("edger", "tmm"))
		
	# Return
	
	# df_norm =
	# 	df %>%
	# 
	# 	# drop factor of interest
	# 	select(!!.sample, !!.transcript, !!.abundance) %>%
	# 
	# 	# Manipulate
	# 	dplyr::mutate(!!.sample := as.factor(as.character(!!.sample))) %>%
	# 	dplyr::left_join(nf, by = quo_name(.sample)) %>%
	# 
	# 	# Calculate scaled values
	# 	dplyr::mutate(!!value_scaled := !!.abundance * multiplier) %>%
	# 
	# 	# Format df for join
	# 	dplyr::select(!!.sample, !!.transcript, !!value_scaled,	everything()) %>%
	# 	# dplyr::mutate(lowly_abundant = !!.transcript %in% nf_obj$gene_to_exclude) %>%
	# 	dplyr::select(-!!.abundance,-tot,-tot_filt) %>%
	# 	dplyr::rename(TMM = nf) %>%
	# 	arrange(!!.sample,!!.transcript)
	# #dplyr::select(-!!.sample,-!!.transcript)

	# # Attach attributes
	# df_norm 

}



#' Get differential transcription information to a tibble using edgeR.
#'
#' @keywords internal
#'
#' @import dplyr
#' @import tidyr
#' @import tibble
#' @importFrom magrittr set_colnames
#' @importFrom stats model.matrix
#' @importFrom utils install.packages
#' @importFrom purrr when
#'
#'
#' @param .data A tibble
#' @param .formula a formula with no response variable, referring only to numeric variables
#' @param .sample The name of the sample column
#' @param .transcript The name of the transcript/gene column
#' @param .abundance The name of the transcript/gene abundance column
#' @param .contrasts A character vector. See edgeR makeContrasts specification for the parameter `contrasts`. If contrasts are not present the first covariate is the one the model is tested against (e.g., ~ factor_of_interest)
#' @param method A string character. Either "edgeR_quasi_likelihood" (i.e., QLF), "edgeR_likelihood_ratio" (i.e., LRT)
#' @param scaling_method A character string. The scaling method passed to the backend function (i.e., edgeR::calcNormFactors; "TMM","TMMwsp","RLE","upperquartile")
#' @param omit_contrast_in_colnames If just one contrast is specified you can choose to omit the contrast label in the colnames.
#'
#' @return A tibble with edgeR results
#'
get_differential_transcript_abundance_bulk <- function(.data,
																											 .formula,
																											 .sample = NULL,
																											 .transcript = NULL,
																											 .abundance = NULL,
																											 .contrasts = NULL,
																											 method = "edgeR_quasi_likelihood",
																											 scaling_method = "TMM",
																											 omit_contrast_in_colnames = FALSE,
																											 prefix = "") {
	# Get column names
	.sample = enquo(.sample)
	.transcript = enquo(.transcript)
	.abundance = enquo(.abundance)

	# Check if omit_contrast_in_colnames is correctly setup
	if(omit_contrast_in_colnames & length(.contrasts) > 1){
		warning("tidybulk says: you can omit contrasts in column names only when maximum one contrast is present")
		omit_contrast_in_colnames = FALSE
	}

	# distinct_at is not released yet for dplyr, thus we have to use this trick
	df_for_edgeR <- .data %>%

		# Prepare the data frame
		select(!!.transcript,
					 !!.sample,
					 !!.abundance,
					 one_of(parse_formula(.formula))) %>%
		distinct() %>%

		# drop factors as it can affect design matrix
		droplevels() 
	#%>%

		# # Check if data rectangular
		# ifelse2_pipe(
		# 	(.) %>% check_if_data_rectangular(!!.sample,!!.transcript,!!.abundance) %>% not() & fill_missing_values,
		# 	(.) %>% check_if_data_rectangular(!!.sample,!!.transcript,!!.abundance) %>% not() & !fill_missing_values,
		# 	~ .x %>% fill_NA_using_formula(.formula,!!.sample, !!.transcript, !!.abundance),
		# 	~ .x %>% eliminate_sparse_transcripts(!!.transcript)
		# ) 
		

	# # Check if at least two samples for each group
	# if (
	# 	# If I have some discrete covariates
	# 	df_for_edgeR %>%
	# 	select(one_of(parse_formula(.formula))) %>%
	# 	select_if(function(col)
	# 		is.character(col) | is.factor(col) | is.logical(col)) %>%
	# 	ncol %>% gt(0) &
	#
	# 	# If I have at least 2 samples per group
	# 	df_for_edgeR %>%
	# 	select(!!.sample, one_of(parse_formula(.formula))) %>%
	# 	select_if(function(col) !is.numeric(col) & !is.integer(col) & !is.double(col) ) %>%
	# 	distinct %>%
	# 	group_by_at(vars(-!!.sample)) %>%
	# 	count() %>%
	# 	ungroup() %>%
	# 	{
	# 		(.) %>% nrow %>% st(2) |
	# 		(.) %>% distinct(n) %>%	pull(n) %>%	min %>% st(2)
	# 	}
	# )
	# message("tidybulk says: Just so you know. You have less than two replicates for each factorial combination")

	# Create design matrix
	design =
		model.matrix(
			object = .formula,
			data = df_for_edgeR %>% select(!!.sample, one_of(parse_formula(.formula))) %>% distinct %>% arrange(!!.sample)
		)

	# Print the design column names in case I want contrasts
	message(
		sprintf(
			"tidybulk says: The design column names are \"%s\"",
			design %>% colnames %>% paste(collapse = ", ")
		)
	)

	my_contrasts =
		.contrasts %>%
		ifelse_pipe(length(.) > 0,
								~ limma::makeContrasts(contrasts = .x, levels = design),
								~ NULL)

	# Check if package is installed, otherwise install
	if (find.package("edgeR", quiet = TRUE) %>% length %>% equals(0)) {
		message("Installing edgeR needed for differential transcript abundance analyses")
		if (!requireNamespace("BiocManager", quietly = TRUE))
			install.packages("BiocManager", repos = "https://cloud.r-project.org")
		BiocManager::install("edgeR", ask = FALSE)
	}



	edgeR_object =
		df_for_edgeR %>%
		select(!!.transcript,!!.sample,!!.abundance) %>%
		spread(!!.sample,!!.abundance) %>%
		as_matrix(rownames = !!.transcript) %>%

		edgeR::DGEList(counts = .) %>%
		edgeR::calcNormFactors(method = scaling_method) %>%
		edgeR::estimateDisp(design) %>%

		# select method
		when(
			tolower(method) ==  "edger_likelihood_ratio" ~ (.) %>% edgeR::glmFit(design),
			tolower(method) ==  "edger_quasi_likelihood" ~ (.) %>% edgeR::glmQLFit(design)
		)


	edgeR_object %>%

		# If I have multiple .contrasts merge the results
		ifelse_pipe(
			my_contrasts %>% is.null | omit_contrast_in_colnames,

			# Simple comparison
			~ .x %>%

				# select method
				when(
					tolower(method) ==  "edger_likelihood_ratio" ~ (.) %>% edgeR::glmLRT(coef = 2, contrast = my_contrasts) ,
					tolower(method) ==  "edger_quasi_likelihood" ~ (.) %>% edgeR::glmQLFTest(coef = 2, contrast = my_contrasts)
				)	%>%

				# Convert to tibble
				edgeR::topTags(n = 999999) %$%
				table %>%
				as_tibble(rownames = quo_name(.transcript)) %>%

				# # Mark DE genes
				# mutate(significant = FDR < significance_threshold) 	%>%

				# Arrange
				arrange(FDR),

			# Multiple comparisons
			~ {
				edgeR_obj = .x

				1:ncol(my_contrasts) %>%
					map_dfr(
						~ edgeR_obj %>%

							# select method
							when(
								tolower(method) ==  "edger_likelihood_ratio" ~ (.) %>% edgeR::glmLRT(coef = 2, contrast = my_contrasts[, .x]) ,
								tolower(method) ==  "edger_quasi_likelihood" ~ (.) %>% edgeR::glmQLFTest(coef = 2, contrast = my_contrasts[, .x])
							)	%>%

							# Convert to tibble
							edgeR::topTags(n = 999999) %$%
							table %>%
							as_tibble(rownames = quo_name(.transcript)) %>%
							mutate(constrast = colnames(my_contrasts)[.x]) 
							# %>%
							# 
							# # Mark DE genes
							# mutate(significant = FDR < significance_threshold)
					) %>%
					pivot_wider(values_from = -c(!!.transcript, constrast),
											names_from = constrast, names_sep = "___")
			}
		)	 %>%

		# Attach prefix
		setNames(c(
			colnames(.)[1], 
			sprintf("%s%s", prefix, colnames(.)[2:ncol(.)])
		)) %>%
		
		# Attach attributes
		reattach_internals(.data) %>%
		memorise_methods_used(c("edger", "limma")) %>%

		# Add raw object
		attach_to_internals(edgeR_object, "edgeR") %>%
		# Communicate the attribute added
		{
			message(
				"tidybulk says: to access the raw results (fitted GLM) do `attr(..., \"internals\")$edgeR`"
			)
			(.)
		}
}


#' Get differential transcription information to a tibble using voom.
#'
#' @keywords internal
#'
#' @import dplyr
#' @import tidyr
#' @import tibble
#' @importFrom magrittr set_colnames
#' @importFrom stats model.matrix
#' @importFrom utils install.packages
#' @importFrom purrr when
#'
#'
#' @param .data A tibble
#' @param .formula a formula with no response variable, referring only to numeric variables
#' @param .sample The name of the sample column
#' @param .transcript The name of the transcript/gene column
#' @param .abundance The name of the transcript/gene abundance column
#' @param .contrasts A character vector. See voom makeContrasts specification for the parameter `contrasts`. If contrasts are not present the first covariate is the one the model is tested against (e.g., ~ factor_of_interest)
#' @param scaling_method A character string. The scaling method passed to the backend function (i.e., edgeR::calcNormFactors; "TMM","TMMwsp","RLE","upperquartile")
#' @param omit_contrast_in_colnames If just one contrast is specified you can choose to omit the contrast label in the colnames.
#'
#' @return A tibble with voom results
#'
get_differential_transcript_abundance_bulk_voom <- function(.data,
																											 .formula,
																											 .sample = NULL,
																											 .transcript = NULL,
																											 .abundance = NULL,
																											 .contrasts = NULL,
																											 scaling_method = "TMM",
																											 omit_contrast_in_colnames = FALSE,
																											 prefix = "") {
	# Get column names
	.sample = enquo(.sample)
	.transcript = enquo(.transcript)
	.abundance = enquo(.abundance)

	# Check if omit_contrast_in_colnames is correctly setup
	if(omit_contrast_in_colnames & length(.contrasts) > 1){
		warning("tidybulk says: you can omit contrasts in column names only when maximum one contrast is present")
		omit_contrast_in_colnames = FALSE
	}

	# distinct_at is not released yet for dplyr, thus we have to use this trick
	df_for_voom <- .data %>%

		# Prepare the data frame
		select(!!.transcript,
					 !!.sample,
					 !!.abundance,
					 one_of(parse_formula(.formula))) %>%
		distinct() %>%

		# drop factors as it can affect design matrix
		droplevels() 
	# %>%

		# # Check if data rectangular
		# ifelse2_pipe(
		# 	(.) %>% check_if_data_rectangular(!!.sample,!!.transcript,!!.abundance) %>% not() & fill_missing_values,
		# 	(.) %>% check_if_data_rectangular(!!.sample,!!.transcript,!!.abundance) %>% not() & !fill_missing_values,
		# 	~ .x %>% fill_NA_using_formula(.formula,!!.sample, !!.transcript, !!.abundance),
		# 	~ .x %>% eliminate_sparse_transcripts(!!.transcript)
		# ) 
		

	# Create design matrix
	design =
		model.matrix(
			object = .formula,
			data = df_for_voom %>% select(!!.sample, one_of(parse_formula(.formula))) %>% distinct %>% arrange(!!.sample)
		)

	# Print the design column names in case I want contrasts
	message(
		sprintf(
			"tidybulk says: The design column names are \"%s\"",
			design %>% colnames %>% paste(collapse = ", ")
		)
	)

	my_contrasts =
		.contrasts %>%
		ifelse_pipe(length(.) > 0,
								~ limma::makeContrasts(contrasts = .x, levels = design),
								~ NULL)

	# Check if package is installed, otherwise install
	if (find.package("limma", quiet = TRUE) %>% length %>% equals(0)) {
		message("Installing limma needed for differential transcript abundance analyses")
		if (!requireNamespace("BiocManager", quietly = TRUE))
			install.packages("BiocManager", repos = "https://cloud.r-project.org")
		BiocManager::install("limma", ask = FALSE)
	}

	voom_object =
		df_for_voom %>%
		select(!!.transcript,!!.sample,!!.abundance) %>%
		spread(!!.sample,!!.abundance) %>%
		as_matrix(rownames = !!.transcript) %>%

		edgeR::DGEList() %>%
		edgeR::calcNormFactors(method = scaling_method) %>%

		limma::voom(design, plot=FALSE) %>%
		limma::lmFit(design)

	voom_object %>%

		# If I have multiple .contrasts merge the results
		ifelse_pipe(
			my_contrasts %>% is.null | omit_contrast_in_colnames,

			# Simple comparison
			~ .x %>%

				# Contrasts
				limma::contrasts.fit(contrasts=my_contrasts, coefficients =  when(my_contrasts, is.null(.) ~ 2)) %>%
				limma::eBayes() %>%

			# Convert to tibble
				limma::topTable(n = 999999) %>%
				as_tibble(rownames = quo_name(.transcript)) %>%

				# # Mark DE genes
				# mutate(significant = adj.P.Val < significance_threshold) 	%>%

				# Arrange
				arrange(adj.P.Val),

			# Multiple comparisons
			~ {
				voom_obj = .x

				1:ncol(my_contrasts) %>%
					map_dfr(
						~ voom_obj %>%

							# Contrasts
							limma::contrasts.fit(contrasts=my_contrasts[, .x]) %>%
							limma::eBayes() %>%

							# Convert to tibble
							limma::topTable(n = 999999) %>%
							as_tibble(rownames = quo_name(.transcript)) %>%
							mutate(constrast = colnames(my_contrasts)[.x]) 
							# %>%
							# 
							# # Mark DE genes
							# mutate(significant = adj.P.Val < significance_threshold)
					) %>%
					pivot_wider(values_from = -c(!!.transcript, constrast),
											names_from = constrast, names_sep = "___")
			}
		)	 %>%

		
		# Attach prefix
		setNames(c(
			colnames(.)[1], 
			sprintf("%s%s", prefix, colnames(.)[2:ncol(.)])
		)) %>%
		
		# Attach attributes
		reattach_internals(.data) %>%

		# Add raw object
		attach_to_internals(voom_object, "voom") %>%
		# Communicate the attribute added
		{
			message(
				"tidybulk says: to access the raw results (fitted GLM) do `attr(..., \"internals\")$voom`"
			)
			(.)
		}
}


#' Get differential transcription information to a tibble using DESeq2
#'
#' @keywords internal
#'
#' @import dplyr
#' @import tidyr
#' @import tibble
#' @importFrom magrittr set_colnames
#' @importFrom stats model.matrix
#' @importFrom utils install.packages
#' @importFrom purrr when
#'
#'
#' @param .data A tibble
#' @param .formula a formula with no response variable, referring only to numeric variables
#' @param .sample The name of the sample column
#' @param .transcript The name of the transcript/gene column
#' @param .abundance The name of the transcript/gene abundance column
#' @param .contrasts A character vector. See edgeR makeContrasts specification for the parameter `contrasts`. If contrasts are not present the first covariate is the one the model is tested against (e.g., ~ factor_of_interest)
#' @param method A string character. Either "edgeR_quasi_likelihood" (i.e., QLF), "edgeR_likelihood_ratio" (i.e., LRT)
#' @param scaling_method A character string. The scaling method passed to the backend function (i.e., edgeR::calcNormFactors; "TMM","TMMwsp","RLE","upperquartile")
#' @param omit_contrast_in_colnames If just one contrast is specified you can choose to omit the contrast label in the colnames.
#'
#' @return A tibble with edgeR results
#'
get_differential_transcript_abundance_deseq2 <- function(.data,
																											 .formula,
																											 .sample = NULL,
																											 .transcript = NULL,
																											 .abundance = NULL,
																											 .contrasts = NULL,
																											 method = "edgeR_quasi_likelihood",
																											 scaling_method = "TMM",
																											 omit_contrast_in_colnames = FALSE,
																											 prefix = "") {
	
	# Check if contrasts are of the same form
	if(
		.contrasts %>% is.null %>% not() &
		.contrasts %>% class %>% equals("list") %>% not()
	)
		stop("tidybulk says: for DESeq2 the list of constrasts should be given in the form list(c(\"condition_column\",\"condition1\",\"condition2\")) i.e. list(c(\"genotype\",\"knockout\",\"wildtype\"))")
	
	# Get column names
	.sample = enquo(.sample)
	.transcript = enquo(.transcript)
	.abundance = enquo(.abundance)

	# Check if omit_contrast_in_colnames is correctly setup
	if(omit_contrast_in_colnames & length(.contrasts) > 1){
		warning("tidybulk says: you can omit contrasts in column names only when maximum one contrast is present")
		omit_contrast_in_colnames = FALSE
	}

	if (find.package("acepack", quiet = TRUE) %>% length %>% equals(0)) {
		message("Installing acepack needed for analyses")
		install.packages("acepack", repos = "https://cloud.r-project.org")
	}

	# Check if package is installed, otherwise install
	if (find.package("DESeq2", quiet = TRUE) %>% length %>% equals(0)) {
		message("Installing DESeq2 needed for differential transcript abundance analyses")
		if (!requireNamespace("BiocManager", quietly = TRUE))
			install.packages("BiocManager", repos = "https://cloud.r-project.org")
		BiocManager::install("DESeq2", ask = FALSE)
	}

	# # Print the design column names in case I want contrasts
	# message(
	# 	sprintf(
	# 		"tidybulk says: The design column names are \"%s\"",
	# 		design %>% colnames %>% paste(collapse = ", ")
	# 	)
	# )

	my_contrasts = .contrasts


	deseq2_object =
		.data %>%

		# Prepare the data frame
		select(!!.transcript,
					 !!.sample,
					 !!.abundance,
					 one_of(parse_formula(.formula))) %>%
		distinct() %>%

		# drop factors as it can affect design matrix
		droplevels() %>%

		# Needed for DESeq2
		mutate(!!.abundance := as.integer(!!.abundance)) %>%
		rename(counts = !!.abundance) %>%
		mutate_if(is.logical , as.factor) %>%
		mutate_if(is.character , as.factor) %>%

		# Filter
		tidybulk_to_SummarizedExperiment(!!.sample, !!.transcript, counts) %>%

		# DESeq2
		DESeq2::DESeqDataSet( design = .formula) %>%
		DESeq2::DESeq()

	# Read ft object
	deseq2_object %>%

		# If I have multiple .contrasts merge the results
		when(
			
			# Simple comparison continuous
			(my_contrasts %>% is.null ) & 
				(deseq2_object@colData[,parse_formula(.formula)[1]] %>% 
				 	class %in% c("numeric", "integer", "double")) 	~ 
				(.) %>%
				DESeq2::results() %>%
				as_tibble(rownames = quo_name(.transcript)), 
			
			# Simple comparison discrete
			my_contrasts %>% is.null 	~ 
				(.) %>%
				DESeq2::results(contrast = c(
					parse_formula(.formula)[1],
					deseq2_object@colData[,parse_formula(.formula)[1]] %>% levels %>% .[2],
					deseq2_object@colData[,parse_formula(.formula)[1]] %>% levels %>% .[1]
				)) %>%
				as_tibble(rownames = quo_name(.transcript)), 

			# Simple comparison discrete
			my_contrasts %>% is.null %>% not() & omit_contrast_in_colnames	~ 
				(.) %>%
				DESeq2::results(contrast = my_contrasts[[1]])%>%
				as_tibble(rownames = quo_name(.transcript)), 
			
			# Multiple comparisons NOT USED AT THE MOMENT
			~ {
				deseq2_obj = (.)

				1:length(my_contrasts) %>%
					map_dfr(
						~ 	deseq2_obj %>%

							# select method
							DESeq2::results(contrast = my_contrasts[[.x]])	%>%

							# Convert to tibble
							as_tibble(rownames = quo_name(.transcript)) %>%
							mutate(constrast = sprintf("%s %s-%s", my_contrasts[[.x]][1], my_contrasts[[.x]][2], my_contrasts[[.x]][3]) ) 

					) %>%
					pivot_wider(values_from = -c(!!.transcript, constrast),
											names_from = constrast, names_sep = "___")
			}
		)	 %>%

		# Attach prefix
		setNames(c(
			colnames(.)[1], 
			sprintf("%s%s", prefix, colnames(.)[2:ncol(.)])
		)) %>%
		
		# Attach attributes
		reattach_internals(.data) %>%
		memorise_methods_used("deseq2") %>%

		# Add raw object
		attach_to_internals(deseq2_object, "DESeq2") %>%
		
		# Communicate the attribute added
		{
			message(
				"tidybulk says: to access the raw results (fitted GLM) do `attr(..., \"internals\")$DESeq2`"
			)
			(.)
		}
}

#' Get differential composition information to a tibble using edgeR.
#'
#' @keywords internal
#'
#' @import dplyr
#' @import tidyr
#' @import tibble
#' @importFrom magrittr set_colnames
#' @importFrom stats model.matrix
#' @importFrom utils install.packages
#' @importFrom purrr when
#' @importFrom stringr str_replace
#'
#'
#' @param .data A tibble
#' @param .formula a formula with no response variable, referring only to numeric variables
#' @param .sample The name of the sample column
#' @param .transcript The name of the transcript/gene column
#' @param .abundance The name of the transcript/gene abundance column
#' @param method A string character. Either "edgeR_quasi_likelihood" (i.e., QLF), "edgeR_likelihood_ratio" (i.e., LRT)
#' @param reference A data frame. The transcript/cell_type data frame of integer transcript abundance
#' @param significance_threshold A real between 0 and 1
#'
#' @return A tibble with edgeR results
#'
test_differential_cellularity_ <- function(.data,
																					 .formula,
																					 .sample = NULL,
																					 .transcript = NULL,
																					 .abundance = NULL,
																					 method = "cibersort",
																					 reference = X_cibersort,
																					 significance_threshold = 0.05,
																					 ...
) {

	# Get column names
	.sample = enquo(.sample)
	.transcript = enquo(.transcript)
	.abundance = enquo(.abundance)


	if (find.package("broom", quiet = TRUE) %>% length %>% equals(0)) {
		message("Installing broom needed for analyses")
		install.packages("broom", repos = "https://cloud.r-project.org")
	}

	# Parse formula
	.my_formula =
		.formula %>%
		when(

			# If I have the dot, needed definitely for censored
			format(.) %>% grepl("\\.", .) %>% any ~ format(.formula) %>% str_replace("([-\\+\\*~ ])(\\.)", "\\1.proportion_0_corrected"),

			# If normal formula
			~ sprintf(".proportion_0_corrected%s", format(.))
		) %>%

		as.formula
 
	.data %>%

		# Deconvolution
		deconvolve_cellularity(
			!!.sample, !!.transcript, !!.abundance,
			method=method,
			reference = reference,
			action="get",
			...
		)  %>%

		# Test
		pivot_longer(
			names_prefix = sprintf("%s: ", method),
			cols = starts_with(method),
			names_to = ".cell_type",
			values_to = ".proportion"
		) %>%

		# Replace 0s
		mutate(min_proportion = min(.proportion[.proportion!=0])) %>%
		mutate(.proportion_0_corrected = if_else(.proportion==0, min_proportion, .proportion)) %>%

		# Test survival
		tidyr::nest(cell_type_proportions = -.cell_type) %>%
		mutate(surv_test = map(cell_type_proportions, ~ {
			if(pull(., .proportion_0_corrected) %>% unique %>% length %>%  `<=` (3)) return(NULL)

			# See if regression if censored or not
			.x %>%
				when(
					grepl("Surv", .my_formula) %>% any ~ {
						# Check if package is installed, otherwise install
						if (find.package("survival", quiet = TRUE) %>% length %>% equals(0)) {
							message("Installing betareg needed for analyses")
							install.packages("survival", repos = "https://cloud.r-project.org")
						}

						if (find.package("boot", quiet = TRUE) %>% length %>% equals(0)) {
							message("Installing boot needed for analyses")
							install.packages("boot", repos = "https://cloud.r-project.org")
						}

						(.) %>%
							mutate(.proportion_0_corrected = .proportion_0_corrected  %>% boot::logit()) %>%
							survival::coxph(.my_formula, .)	%>%
							broom::tidy() %>%
							select(-term)
					} ,
					~ {
						# Check if package is installed, otherwise install
						if (find.package("betareg", quiet = TRUE) %>% length %>% equals(0)) {
							message("Installing betareg needed for analyses")
							install.packages("betareg", repos = "https://cloud.r-project.org")
						}
						(.) %>%
							betareg::betareg(.my_formula, .) %>%
							broom::tidy() %>%
							filter(component != "precision") %>%
							pivot_wider(names_from = term, values_from = c(estimate, std.error, statistic,   p.value)) %>%
							select(-c(`std.error_(Intercept)`, `statistic_(Intercept)`, `p.value_(Intercept)`)) %>%
							select(-component)
					}
				)
		}


		)) %>%


		unnest(surv_test, keep_empty = TRUE) %>%

		# Attach attributes
		reattach_internals(.data) %>%
		
		# Add methods used
		when(
			grepl("Surv", .my_formula) ~ (.) %>% memorise_methods_used(c("survival", "boot")),
			~ (.) %>% memorise_methods_used("betareg")
		)





}



#' Get gene enrichment analyses using EGSEA
#'
#' @keywords internal
#'
#' @import dplyr
#' @import tidyr
#' @import tibble
#' @importFrom magrittr set_colnames
#' @importFrom purrr map2_dfr
#' @importFrom stats model.matrix
#' @importFrom utils install.packages
#'
#'
#' @param .data A `tbl` formatted as | <SAMPLE> | <TRANSCRIPT> | <COUNT> | <...> |
#' @param .formula A formula with no response variable, representing the desired linear model
#' @param .sample The name of the sample column
#' @param .entrez The ENTREZ code of the transcripts/genes
#' @param .abundance The name of the transcript/gene abundance column
#' @param .contrasts A character vector. See edgeR makeContrasts specification for the parameter `contrasts`. If contrasts are not present the first covariate is the one the model is tested against (e.g., ~ factor_of_interest)
#' @param method A character vector. The methods to be included in the ensembl. Type EGSEA::egsea.base() to see the supported GSE methods.
#' @param species A character. For example, human or mouse
#' @param cores An integer. The number of cores available
#'
#' @return A tibble with edgeR results
#'
test_gene_enrichment_bulk_EGSEA <- function(.data,
																							 .formula,
																							 .sample = NULL,
																							 .entrez,
																							 .abundance = NULL,
																							 .contrasts = NULL,
																						method,
																							 species,
																							 cores = 10) {
	# Comply with CRAN NOTES
	. = NULL

	# Get column names
	.sample = enquo(.sample)
	.abundance = enquo(.abundance)

	.entrez = enquo(.entrez)

	# distinct_at is not released yet for dplyr, thus we have to use this trick
	df_for_edgeR <- .data %>%

		# Prepare the data frame
		select(!!.entrez, !!.sample, !!.abundance,
					 one_of(parse_formula(.formula))) %>%
		distinct() %>%

		# Add entrez from symbol
		filter(!!.entrez %>% is.na %>% not())

	# Check if at least two samples for each group
	if (df_for_edgeR %>%
			select(!!.sample, one_of(parse_formula(.formula))) %>%
			distinct %>%
			count(!!as.symbol(parse_formula(.formula))) %>%
			distinct(n) %>%
			pull(n) %>%
			min %>%
			st(2))
		stop("tidybulk says: You need at least two replicated for each condition for EGSEA to work")


	# Create design matrix
	design =
		model.matrix(
			object = .formula,
			data = df_for_edgeR %>% select(!!.sample, one_of(parse_formula(.formula))) %>% distinct %>% arrange(!!.sample)
		)

	# Print the design column names in case I want contrasts
	message(
		sprintf(
			"tidybulk says: The design column names are \"%s\"",
			design %>% colnames %>% paste(collapse = ", ")
		)
	)

	my_contrasts =
		.contrasts %>%
		ifelse_pipe(length(.) > 0,
								~ limma::makeContrasts(contrasts = .x, levels = design),
								~ NULL)

	# Check if package is installed, otherwise install
	if (find.package("EGSEA", quiet = TRUE) %>% length %>% equals(0)) {
		stop("
				 EGSEA not installed. Please install it. EGSEA require manual installation for not overwelming the user in case it is not needed.
				 BiocManager::install(\"EGSEA\", ask = FALSE)
				 ")
	}
	if (!"EGSEA" %in% (.packages())) {
		stop("EGSEA package not loaded. Please run library(\"EGSEA\"). With this setup, EGSEA require manual loading, for technical reasons.")
	}

	dge =
		df_for_edgeR %>%

		# Make sure transcript names are adjacent
		arrange(!!.entrez) %>%

		select(!!.sample, !!.entrez, !!.abundance) %>%
		spread(!!.sample,!!.abundance) %>%
		as_matrix(rownames = !!.entrez) %>%
		edgeR::DGEList(counts = .)

	idx =  buildIdx(entrezIDs = rownames(dge), species = species)

	# Due to a bug in kegg, this data set is run without report
	# http://supportupgrade.bioconductor.org/p/122172/#122218
	message("tidybulk says: due to a bug in the call to KEGG database (http://supportupgrade.bioconductor.org/p/122172/#122218), the analysis for this database is run without report production.")

	res_kegg =
		dge %>%

		# Calculate weights
		limma::voom(design, plot = FALSE) %>%

		# Execute EGSEA
		egsea(
			contrasts = my_contrasts,
			gs.annots = idx %>% .["kegg"],
			baseGSEAs = method,
			sort.by = "med.rank",
			num.threads = cores,
			report = FALSE
		)

	res_formatted_kegg =
		res_kegg@results %>%
		map2_dfr(
			(.) %>% names,
			~ .x[[1]][[1]] %>%
				as_tibble(rownames = "pathway") %>%
				mutate(data_base = .y)
		) %>%
		arrange(med.rank) %>%
		select(data_base, pathway, everything())

	res =
		dge %>%

		# Calculate weights
		limma::voom(design, plot = FALSE) %>%

		# Execute EGSEA
		egsea(
			contrasts = my_contrasts,
			gs.annots = idx[which(names(idx)!="kegg")],
			baseGSEAs = method,
			sort.by = "med.rank",
			num.threads = cores,
		)

	res_formatted_all =
		res@results %>%
		map2_dfr(
			(.) %>% names,
			~ .x[[1]][[1]] %>%
				as_tibble(rownames = "pathway") %>%
				mutate(data_base = .y)
		) %>%
		arrange(med.rank) %>%
		select(data_base, pathway, everything())


	bind_rows(res_formatted_all, res_formatted_kegg)

}

#' Get K-mean clusters to a tibble
#'
#' @keywords internal
#'
#' @import dplyr
#' @import tidyr
#' @import tibble
#' @importFrom stats kmeans
#' @importFrom rlang :=
#'
#' @param .data A tibble
#' @param .abundance A column symbol with the value the clustering is based on (e.g., `count`)
#' @param .feature A column symbol. The column that is represents entities to cluster (i.e., normally samples)
#' @param .element A column symbol. The column that is used to calculate distance (i.e., normally genes)
#' @param of_samples A boolean
#' @param log_transform A boolean, whether the value should be log-transformed (e.g., TRUE for RNA sequencing data)
#' @param ... Further parameters passed to the function kmeans
#'
#' @return A tibble with additional columns
#'
#'
get_clusters_kmeans_bulk <-
	function(.data,
					 .element = NULL,
					 .feature = NULL,
					 .abundance = NULL,
					 of_samples = TRUE,
					 log_transform = TRUE,
					 ...) {
		# Check if centers is in dots
		dots_args = rlang::dots_list(...)
		if ("centers" %in% names(dots_args) %>% not())
			stop("tidybulk says: for kmeans you need to provide the \"centers\" integer argument")

		# Get column names
		.element = enquo(.element)
		.feature = enquo(.feature)
		.abundance = enquo(.abundance)

		.data %>%

			# Prepare data frame
			distinct(!!.feature,!!.element,!!.abundance) %>%

			# Check if log transform is needed
			ifelse_pipe(log_transform,
									~ .x %>% dplyr::mutate(!!.abundance := !!.abundance %>%  `+`(1) %>%  log())) %>%

			# Prepare data frame for return
			spread(!!.feature,!!.abundance) %>%
			as_matrix(rownames = !!.element) %>%

			# Wrap the do.call because of the centrers check
			{
				do.call(kmeans, list(x = (.), iter.max = 1000) %>% c(dots_args))
			}	 %$%
			cluster %>%
			as.list() %>%
			as_tibble() %>%
			gather(!!.element, `cluster kmeans`) %>%
			mutate(`cluster kmeans` = `cluster kmeans` %>% as.factor()) %>%

			# Attach attributes
			reattach_internals(.data) %>%
			memorise_methods_used("stats")
	}

#' Get SNN shared nearest neighbour clusters to a tibble
#'
#' @keywords internal
#'
#' @import dplyr
#' @import tidyr
#' @import tibble
#' @importFrom rlang :=
#' @importFrom utils install.packages
#'
#' @param .data A tibble
#' @param .abundance A column symbol with the value the clustering is based on (e.g., `count`)
#' @param .feature A column symbol. The column that is represents entities to cluster (i.e., normally samples)
#' @param .element A column symbol. The column that is used to calculate distance (i.e., normally genes)
#' @param of_samples A boolean
#' @param log_transform A boolean, whether the value should be log-transformed (e.g., TRUE for RNA sequencing data)
#' @param ... Further parameters passed to the function kmeans
#'
#' @return A tibble with additional columns
#'
get_clusters_SNN_bulk <-
	function(.data,
					 .element = NULL,
					 .feature = NULL,
					 .abundance,
					 of_samples = TRUE,
					 log_transform = TRUE,
					 ...) {
		# Get column names
		.element = enquo(.element)
		.feature = enquo(.feature)
		.abundance = enquo(.abundance)

		# Check if package is installed, otherwise install
		if (find.package("cluster", quiet = TRUE) %>% length %>% equals(0)) {
			message("Installing cluster")
			install.packages("cluster", repos = "https://cloud.r-project.org")
		}
		if (find.package("Seurat", quiet = TRUE) %>% length %>% equals(0)) {
			message("Installing Seurat")
			install.packages("Seurat", repos = "https://cloud.r-project.org")
		}
		if (find.package("KernSmooth", quiet = TRUE) %>% length %>% equals(0)) {
			message("Installing KernSmooth")
			install.packages("KernSmooth", repos = "https://cloud.r-project.org")
		}

		my_df =
			.data %>%

			# Prepare data frame
			distinct(!!.element,!!.feature,!!.abundance) %>%

			# Check if log tranfrom is needed
			#ifelse_pipe(log_transform, ~ .x %>% dplyr::mutate(!!.abundance := !!.abundance %>%  `+`(1) %>%  log())) %>%

			# Prepare data frame for return
			spread(!!.element,!!.abundance)

		my_df %>%
			data.frame(row.names = quo_name(.feature)) %>%
			Seurat::CreateSeuratObject() %>%
			Seurat::ScaleData(display.progress = TRUE,
												num.cores = 4,
												do.par = TRUE) %>%
			Seurat::FindVariableFeatures(selection.method = "vst") %>%
			Seurat::RunPCA(npcs = 30) %>%
			Seurat::FindNeighbors() %>%
			Seurat::FindClusters(method = "igraph", ...) %>%
			`[[` ("seurat_clusters") %>%
			as_tibble(rownames = quo_name(.element)) %>%
			rename(`cluster SNN` = seurat_clusters) %>%
			dplyr::mutate(!!.element := gsub("\\.", "-",!!.element)) %>%

			# Attach attributes
			reattach_internals(.data) %>%
			memorise_methods_used("seurat")
	}

#' Get dimensionality information to a tibble using MDS
#'
#' @keywords internal
#'
#' @import dplyr
#' @import tidyr
#' @import tibble
#' @importFrom purrr map_dfr
#' @importFrom rlang :=
#' @importFrom stats setNames
#'
#' @param .data A tibble
#' @param .abundance A column symbol with the value the clustering is based on (e.g., `count`)
#' @param .dims A integer vector corresponding to principal components of interest (e.g., 1:6)
#' @param .feature A column symbol. The column that is represents entities to cluster (i.e., normally genes)
#' @param .element A column symbol. The column that is used to calculate distance (i.e., normally samples)
#' @param top An integer. How many top genes to select
#' @param of_samples A boolean
#' @param log_transform A boolean, whether the value should be log-transformed (e.g., TRUE for RNA sequencing data)
#'
#' @return A tibble with additional columns
#'
#'
get_reduced_dimensions_MDS_bulk <-
	function(.data,
					 .element = NULL,
					 .feature = NULL,
					 .abundance = NULL,
					 .dims = 2,
					 top = 500,
					 of_samples = TRUE,
					 log_transform = TRUE) {
		# Comply with CRAN NOTES
		. = NULL

		# Get column names
		.element = enquo(.element)
		.feature = enquo(.feature)
		.abundance = enquo(.abundance)

		# Get components from dims
		components = 1:.dims

		mds_object =
			.data %>%

			distinct(!!.feature,!!.element,!!.abundance) %>%

			# Check if log transform is needed
			ifelse_pipe(log_transform,
									~ .x %>% dplyr::mutate(!!.abundance := !!.abundance %>% log1p())) %>%

			# Stop any column is not if not numeric or integer
			ifelse_pipe(
				(.) %>% select(!!.abundance) %>% summarise_all(class) %>% `%in%`(c("numeric", "integer")) %>% not() %>% any(),
				~ stop("tidybulk says: .abundance must be numerical or integer")
			) %>%
			spread(!!.element,!!.abundance) %>%
			as_matrix(rownames = !!.feature, do_check = FALSE) %>%
			limma::plotMDS(ndim = .dims, plot = FALSE, top = top)

		# Parse results
		mds_object %$%	cmdscale.out %>%
			as.data.frame %>%
			as_tibble(rownames = quo_name(.element)) %>%
			setNames(c(quo_name(.element), sprintf("Dim%s", 1:.dims))) %>%


			# Attach attributes
			reattach_internals(.data) %>%
			memorise_methods_used("limma") %>%

			# Add raw object
			attach_to_internals(mds_object, "MDS") %>%
			# Communicate the attribute added
			{
				message("tidybulk says: to access the raw results do `attr(..., \"internals\")$MDS`")
				(.)
			}

	}

#' Get principal component information to a tibble using PCA
#'
#' @keywords internal
#'
#' @import dplyr
#' @import tidyr
#' @import tibble
#' @importFrom rlang :=
#' @importFrom stats prcomp
#' @importFrom utils capture.output
#' @importFrom magrittr divide_by
#'
#' @param .data A tibble
#' @param .abundance A column symbol with the value the clustering is based on (e.g., `count`)
#' @param .dims A integer vector corresponding to principal components of interest (e.g., 1:6)
#' @param .feature A column symbol. The column that is represents entities to cluster (i.e., normally genes)
#' @param .element A column symbol. The column that is used to calculate distance (i.e., normally samples)
#' @param top An integer. How many top genes to select
#' @param of_samples A boolean
#' @param log_transform A boolean, whether the value should be log-transformed (e.g., TRUE for RNA sequencing data)
#' @param scale A boolean
#' @param ... Further parameters passed to the function prcomp
#'
#' @return A tibble with additional columns
#'
#'
get_reduced_dimensions_PCA_bulk <-
	function(.data,
					 .element = NULL,
					 .feature = NULL,

					 .abundance  = NULL,
					 .dims = 2,
					 top = 500,
					 of_samples = TRUE,
					 log_transform = TRUE,
					 scale = FALSE,
					 ...) {
		# Comply with CRAN NOTES
		. = NULL

		# Get column names
		.element = enquo(.element)
		.feature = enquo(.feature)
		.abundance = enquo(.abundance)

		# Get components from dims
		components = 1:.dims

		prcomp_obj =
			.data %>%

			# Filter most variable genes
			keep_variable_transcripts(!!.element,!!.feature,!!.abundance, top) %>%

			# Prepare data frame
			distinct(!!.feature,!!.element,!!.abundance) %>%

			# Check if log transform is needed
			ifelse_pipe(log_transform,
									~ .x %>% dplyr::mutate(!!.abundance := !!.abundance %>% log1p())) %>%

			# Stop any column is not if not numeric or integer
			ifelse_pipe(
				(.) %>% select(!!.abundance) %>% summarise_all(class) %>% `%in%`(c("numeric", "integer", "bouble")) %>% not() %>% any(),
				~ stop("tidybulk says: .abundance must be numerical or integer")
			) %>%

			spread(!!.element,!!.abundance) %>%

			drop_na %>% # Is this necessary?

			# check that there are non-NA genes for enough samples
			ifelse2_pipe(# First condition
				(.) %>% nrow == 0,

				# Second condition
				(.) %>% nrow < 100,

				# First function
				~ stop(
					"tidybulk says: In calculating PCA there is no gene that have non NA values is all samples"
				),

				# Second function
				~ {
					warning(
						"
						tidybulk says: In PCA correlation there is < 100 genes that have non NA values is all samples.
The correlation calculation would not be reliable,
we suggest to partition the dataset for sample clusters.
						"
					)
					.x
				}) %>%

			# Transform to matrix
			as_matrix(rownames = !!.feature, do_check = FALSE) %>%
			t() %>%

			# Calculate principal components
			prcomp(scale = scale, ...)

		prcomp_obj %>%

			# Anonymous function - Prints fraction of variance
			# input: PCA object
			# output: PCA object
			{
				message("Fraction of variance explained by the selected principal components")

				(.) %$% sdev %>% pow(2) %>% # Eigen value
					divide_by(sum(.)) %>%
					`[` (components) %>%
					enframe() %>%
					select(-name) %>%
					rename(`Fraction of variance` = value) %>%
					mutate(PC = components) %>%
					capture.output() %>% paste0(collapse = "\n") %>% message()

				(.)

			} %$%

			# Parse the PCA results to a tibble
			x %>%
			as_tibble(rownames = quo_name(.element)) %>%
			select(!!.element, sprintf("PC%s", components)) %>%

			# Attach attributes
			reattach_internals(.data) %>%
			memorise_methods_used("stats") %>%

			# Add raw object
			attach_to_internals(prcomp_obj, "PCA") %>%
			# Communicate the attribute added
			{
				message("tidybulk says: to access the raw results do `attr(..., \"internals\")$PCA`")
				(.)
			}

	}

#' Get principal component information to a tibble using tSNE
#'
#' @keywords internal
#'
#' @import dplyr
#' @import tidyr
#' @import tibble
#' @importFrom rlang :=
#' @importFrom stats setNames
#' @importFrom utils install.packages
#'
#' @param .data A tibble
#' @param .abundance A column symbol with the value the clustering is based on (e.g., `count`)
#' @param .dims A integer vector corresponding to principal components of interest (e.g., 1:6)
#' @param .feature A column symbol. The column that is represents entities to cluster (i.e., normally genes)
#' @param .element A column symbol. The column that is used to calculate distance (i.e., normally samples)
#' @param top An integer. How many top genes to select
#' @param of_samples A boolean
#' @param log_transform A boolean, whether the value should be log-transformed (e.g., TRUE for RNA sequencing data)
#' @param ... Further parameters passed to the function Rtsne
#'
#' @return A tibble with additional columns
#'
get_reduced_dimensions_TSNE_bulk <-
	function(.data,
					 .element = NULL,
					 .feature = NULL,

					 .abundance = NULL,
					 .dims = 2,
					 top = 500,
					 of_samples = TRUE,
					 log_transform = TRUE,
					 ...) {
		# Comply with CRAN NOTES
		. = NULL

		# Get column names
		.element = enquo(.element)
		.feature = enquo(.feature)
		.abundance = enquo(.abundance)

		# Evaluate ...
		arguments <- list(...)
		if (!"check_duplicates" %in% names(arguments))
			arguments = arguments %>% c(check_duplicates = TRUE)
		if (!"verbose" %in% names(arguments))
			arguments = arguments %>% c(verbose = TRUE)
		if (!"dims" %in% names(arguments))
			arguments = arguments %>% c(dims = .dims)


		# Check if package is installed, otherwise install
		if (find.package("Rtsne", quiet = TRUE) %>% length %>% equals(0)) {
			message("Installing Rtsne")
			install.packages("Rtsne", repos = "https://cloud.r-project.org")
		}

		# Set perprexity to not be too high
		if (!"perplexity" %in% names(arguments))
			arguments = arguments %>% c(perplexity = ((
				.data %>% distinct(!!.element) %>% nrow %>% sum(-1)
			) / 3 / 2) %>% floor() %>% min(30))

		# If not enough samples stop
		if (arguments$perplexity <= 2)
			stop("tidybulk says: You don't have enough samples to run tSNE")

		# Calculate the most variable genes, from plotMDS Limma


		df_tsne =
			.data %>%

			# Filter NA symbol
			filter(!!.feature %>% is.na %>% not()) %>%

			# Prepare data frame
			distinct(!!.feature,!!.element,!!.abundance) %>%

			# Check if data rectangular
			ifelse_pipe(
				(.) %>% check_if_data_rectangular(!!.element,!!.feature,!!.abundance),
				~ .x %>% eliminate_sparse_transcripts(!!.feature)
			) %>%

			# Check if log transform is needed
			ifelse_pipe(log_transform,
									~ .x %>% dplyr::mutate(!!.abundance := !!.abundance %>% log1p)) %>%

			# Filter most variable genes
			keep_variable_transcripts(!!.element,!!.feature,!!.abundance, top) %>%

			spread(!!.feature,!!.abundance) %>%
			# select(-sample) %>%
			# distinct %>%
			as_matrix(rownames = quo_name(.element))

		do.call(Rtsne::Rtsne, c(list(df_tsne), arguments)) %$%
			Y %>%
			as_tibble(.name_repair = "minimal") %>%
			setNames(c("tSNE1", "tSNE2")) %>%

			# add element name
			dplyr::mutate(!!.element := df_tsne %>% rownames) %>%
			select(!!.element, everything()) %>%

			# Attach attributes
			reattach_internals(.data) %>%
			memorise_methods_used("rtsne")

	}

#' Get rotated dimensions of two principal components or MDS dimension of choice, of an angle
#'
#' @keywords internal
#'
#' @import dplyr
#' @import tidyr
#' @import tibble
#' @importFrom rlang quo_is_null
#'
#'
#' @param .data A tibble
#' @param dimension_1_column A column symbol. The column of the dimension 1
#' @param dimension_2_column   A column symbol. The column of the dimension 2
#' @param rotation_degrees A real number between 0 and 360
#' @param .element A column symbol. The column that is used to calculate distance (i.e., normally samples)
#' @param of_samples A boolean
#' @param dimension_1_column_rotated A column symbol. The column of the dimension 1 rotated
#' @param dimension_2_column_rotated   A column symbol. The column of the dimension 2 rotated
#'
#' @return A tibble with additional rotated columns
#'
#'
get_rotated_dimensions =
	function(.data,
					 dimension_1_column,
					 dimension_2_column,
					 rotation_degrees,
					 .element = NULL,
					 of_samples = TRUE,
					 dimension_1_column_rotated = NULL,
					 dimension_2_column_rotated = NULL) {
		# Get column names
		.element = enquo(.element)
		dimension_1_column = enquo(dimension_1_column)
		dimension_2_column = enquo(dimension_2_column)
		dimension_1_column_rotated = enquo(dimension_1_column_rotated)
		dimension_2_column_rotated = enquo(dimension_2_column_rotated)

		if (.data %>%
				distinct(!!.element, !!dimension_1_column, !!dimension_2_column) %>%
				count(!!.element, !!dimension_1_column, !!dimension_2_column) %>%
				pull(n) %>%
				max %>%
				gt(1))
			stop(sprintf(
				"tidybulk says: %s must be unique for each row for the calculation of rotation",
				quo_name(.element)
			))

		# Function that rotates a 2D space of a arbitrary angle
		rotation = function(m, d) {
			r = d * pi / 180
			((dplyr::bind_rows(
				c(`1` = cos(r), `2` = -sin(r)),
				c(`1` = sin(r), `2` = cos(r))
			) %>% as_matrix) %*% m)
		}

		# Sanity check of the angle selected
		if (rotation_degrees %>% between(-360, 360) %>% not())
			stop("tidybulk says: rotation_degrees must be between -360 and 360")

		# Return
		.data %>%
			distinct(!!.element, !!dimension_1_column, !!dimension_2_column) %>%
			as_matrix(rownames = !!.element) %>% t %>%
			rotation(rotation_degrees) %>%
			as_tibble() %>%
			mutate(`rotated dimensions` =
						 	c(
						 		quo_name(dimension_1_column_rotated),
						 		quo_name(dimension_2_column_rotated)
						 	)) %>%
			gather(!!.element, value,-`rotated dimensions`) %>%
			spread(`rotated dimensions`, value) %>%

			# Attach attributes
			reattach_internals(.data)

	}

#' Aggregates multiple counts from the same samples (e.g., from isoforms)
#' This function aggregates counts over samples, concatenates other character columns, and averages other numeric columns
#'
#' @keywords internal
#'
#' @importFrom dplyr summarise_all
#' @importFrom dplyr bind_rows
#' @importFrom magrittr %$%
#' @importFrom rlang :=
#'
#' @param .data A tibble
#' @param .sample The name of the sample column
#' @param .transcript The name of the transcript/gene column
#' @param .abundance The name of the transcript/gene abundance column
#' @param aggregation_function A function for counts aggregation (e.g., sum)
#' @param keep_integer A boolean
#'
#' @return A tibble with aggregated genes and annotation
#'
#'
aggregate_duplicated_transcripts_bulk =
	function(.data,
					 .sample = NULL,
					 .transcript = NULL,
					 .abundance = NULL,
					 aggregation_function = sum,

					 keep_integer = TRUE) {
		# Comply with CRAN NOTES
		. = NULL

		# Get column names
		.sample = enquo(.sample)
		.transcript = enquo(.transcript)
		.abundance = enquo(.abundance)
		col_names = get_sample_transcript_counts(.data, .sample, .transcript, .abundance)
		.sample = col_names$.sample
		.transcript = col_names$.transcript
		.abundance = col_names$.abundance

		# Robust paste function that preserves NAs
		paste3 <- function(..., sep = ", ") {
			L <- list(...)
			L <- lapply(L, function(x) {
				x[is.na(x)] <- ""
				x
			})
			ret <- gsub(paste0("(^", sep, "|", sep, "$)"),
									"",
									gsub(paste0(sep, sep), sep,
											 do.call(paste, c(
											 	L, list(sep = sep)
											 ))))
			is.na(ret) <- ret == ""
			ret
		}

		# Through warning if there are logicals of factor in the data frame
		# because they cannot be merged if they are not unique
		if ((lapply(.data, class) %>% unlist %in% c("logical", "factor")) %>% any) {
			warning("tidybulk says: for aggregation, factors and logical columns were converted to character")
			message("Converted to characters")
			message(lapply(.data, class) %>% unlist %>% `[` (. %in% c("logical", "factor") %>% which))
		}

		# Select which are the numerical columns
		numerical_columns =
			.data %>%
			ungroup() %>%
			select_if(is.numeric) %>%
			select(-!!.abundance) %>%

			# If scaled add the column to the exclusion
			ifelse_pipe((
				".abundance_scaled" %in% (.data %>% get_tt_columns() %>% names) &&
					# .data %>% get_tt_columns() %$% .abundance_scaled %>% is.null %>% not() &&
					quo_name(.data %>% get_tt_columns() %$% .abundance_scaled) %in% (.data %>% colnames)
			),
			~ .x %>% select(-!!(
				.data %>% get_tt_columns() %$% .abundance_scaled
			)))	%>%
			colnames() %>%
			c("n_aggr")

		# aggregates read .data over samples, concatenates other character columns, and averages other numeric columns
		.data %>%

			# transform logicals and factors
			mutate_if(is.factor, as.character) %>%
			mutate_if(is.logical, as.character) %>%

			# Add the number of duplicates for each gene
			dplyr::left_join((.) %>% count(!!.sample,!!.transcript, name = "n_aggr"),
											 by = c(quo_name(.sample), quo_name(.transcript))) %>%

			# Anonymous function - binds the unique and the reduced genes,
			# in the way we have to reduce redundancy just for the duplicated genes
			# input: tibble
			# output tibble distinct
			{
				dplyr::bind_rows(
					# Unique symbols
					(.) %>%
						filter(n_aggr == 1),

					# Duplicated symbols
					(.) %>%
						filter(n_aggr > 1) %>%
						group_by(!!.sample,!!.transcript) %>%
						dplyr::mutate(!!.abundance := !!.abundance %>% aggregation_function()) %>%

						# If scaled abundance exists aggregate that as well
						ifelse_pipe((
							".abundance_scaled" %in% (.data %>% get_tt_columns() %>% names) &&
								# .data %>% get_tt_columns() %$% .abundance_scaled %>% is.null %>% not() &&
								quo_name(.data %>% get_tt_columns() %$% .abundance_scaled) %in% (.data %>% colnames)
						),
						~ {
							.abundance_scaled = .data %>% get_tt_columns() %$% .abundance_scaled
							.x %>% dplyr::mutate(!!.abundance_scaled := !!.abundance_scaled %>% aggregation_function())
						}) %>%

						mutate_at(vars(numerical_columns), mean) %>%
						mutate_at(
							vars(
								-group_cols(),
								-contains(quo_name(.abundance)),
								-!!numerical_columns
							),
							list( ~ paste3(unique(.), collapse = ", "))
						) %>%
						distinct()
				)
			} %>%

			# Rename column of number of duplicates for each gene
			rename(`merged transcripts` = n_aggr) %>%

			# Attach attributes
			reattach_internals(.data)

	}

#' Drop redundant elements (e.g., samples) for which feature (e.g., genes) aboundances are correlated
#'
#' @keywords internal
#'
#' @import dplyr
#' @import tidyr
#' @import tibble
#' @importFrom rlang :=
#'
#' @param .data A tibble
#' @param .abundance A column symbol with the value the clustering is based on (e.g., `count`)
#' @param correlation_threshold A real number between 0 and 1
#' @param top An integer. How many top genes to select
#' @param .feature A column symbol. The column that is represents entities to cluster (i.e., normally genes)
#' @param .element A column symbol. The column that is used to calculate distance (i.e., normally samples)
#' @param of_samples A boolean
#' @param log_transform A boolean, whether the value should be log-transformed (e.g., TRUE for RNA sequencing data)
#'
#' @return A tibble with redundant elements removed
#'
#'
remove_redundancy_elements_through_correlation <- function(.data,
																													 .element = NULL,
																													 .feature = NULL,
																													 .abundance = NULL,
																													 correlation_threshold = 0.9,
																													 top = Inf,
																													 of_samples = TRUE,
																													 log_transform = FALSE) {
	# Comply with CRAN NOTES
	. = NULL

	# Get column names
	.element = enquo(.element)
	.feature = enquo(.feature)
	.abundance = enquo(.abundance)
	col_names = get_elements_features_abundance(.data, .element, .feature, .abundance, of_samples)
	.element = col_names$.element
	.feature = col_names$.feature
	.abundance = col_names$.abundance

	# Check if package is installed, otherwise install
	if (find.package("widyr", quiet = TRUE) %>% length %>% equals(0)) {
		message("Installing widyr needed for correlation analyses")
		install.packages("widyr", repos = "https://cloud.r-project.org")
	}

	# Get the redundant data frame
	.data.correlated =
		.data %>%

		# Prepare the data frame
		select(!!.feature,!!.element,!!.abundance) %>%

		# Filter variable genes
		keep_variable_transcripts(!!.element,!!.feature,!!.abundance, top = top) %>%

		# Check if log transform is needed
		ifelse_pipe(log_transform,
								~ .x %>% dplyr::mutate(!!.abundance := !!.abundance %>% `+`(1) %>%  log())) %>%
		distinct() %>%
		spread(!!.element,!!.abundance) %>%
		drop_na() %>%

		# check that there are non-NA genes for enough samples
		ifelse2_pipe(# First condition
			(.) %>% nrow == 0,

			# Second condition
			(.) %>% nrow < 100,

			# First function
			~ stop(
				"tidybulk says: In calculating correlation there is no gene that have non NA values is all samples"
			),

			# Second function
			~ {
				message(
					"tidybulk says: In calculating correlation there is < 100 genes (that have non NA values) is all samples.
The correlation calculation might not be reliable"
				)
				.x
			}) %>%

		# Prepare the data frame
		gather(!!.element,!!.abundance,-!!.feature) %>%
		dplyr::rename(rc := !!.abundance,
									sample := !!.element,
									transcript := !!.feature) %>% # Is rename necessary?
		mutate_if(is.factor, as.character) %>%

		# Run pairwise correlation and return a tibble
		widyr::pairwise_cor(
			sample,
			transcript,
			rc,
			sort = TRUE,
			diag = FALSE,
			upper = FALSE
		) %>%
		filter(correlation > correlation_threshold) %>%
		distinct(item1) %>%
		dplyr::rename(!!.element := item1)

	# Return non redundant data frame
	.data %>% anti_join(.data.correlated, by = quo_name(.element)) %>%

		# Attach attributes
		reattach_internals(.data) %>%
		memorise_methods_used("widyr")
}

#' Identifies the closest pairs in a MDS context and return one of them
#'
#' @keywords internal
#'
#' @importFrom stats setNames
#' @importFrom stats dist
#'
#' @param .data A tibble
#' @param Dim_a_column A column symbol. The column of one principal component
#' @param Dim_b_column A column symbol. The column of another principal component
#' @param .element A column symbol. The column that is represents entities to cluster (i.e., normally samples)
#' @param of_samples A boolean
#'
#' @return A tibble with pairs dropped
#'
#'
remove_redundancy_elements_though_reduced_dimensions <-
	function(.data,
					 Dim_a_column,
					 Dim_b_column,
					 .element = NULL,
					 of_samples = TRUE) {
		# This function identifies the closest pairs and return one of them

		# Get column names
		.element = enquo(.element)
		col_names = get_elements(.data, .element)
		.element = col_names$.element

		Dim_a_column = enquo(Dim_a_column)
		Dim_b_column = enquo(Dim_b_column)

		# Find redundant samples
		.data.redundant =

			# Calculate distances
			.data %>%
			select(!!.element,!!Dim_a_column,!!Dim_b_column) %>%
			distinct() %>%
			as_matrix(rownames = !!.element) %>%
			dist() %>%

			# Prepare matrix
			as.matrix() %>% as_tibble(rownames = "sample a") %>%
			gather(`sample b`, dist,-`sample a`) %>%
			filter(`sample a` != `sample b`) %>%

			# Sort the elements of the two columns to avoid eliminating all samples
			rowwise() %>%
			mutate(
				`sample 1` = c(`sample a`, `sample b`) %>% sort() %>% `[`(1),
				`sample 2` = c(`sample a`, `sample b`) %>% sort() %>% `[`(2)
			) %>%
			ungroup() %>%
			select(`sample 1`, `sample 2`, dist) %>%
			distinct() %>%

			# Select closestpairs
			select_closest_pairs %>%

			# Select pair to keep
			select(1) %>%

			# Set the column names
			setNames(quo_name(.element))

		# Drop samples that are correlated with others and return
		.data %>% anti_join(.data.redundant, by = quo_name(.element)) %>%

			# Attach attributes
			reattach_internals(.data)
	}

##' after wget, this function merges hg37 and hg38 mapping data bases - Do not execute!
##'
##' @return A tibble with ensembl-transcript mapping
##'
# get_ensembl_symbol_mapping <- function() {
# 	# wget -O mapping_38.txt 'http://www.ensembl.org/biomart/martservice?query=  <Query virtualSchemaName="default" formatter="TSV" header="0" uniqueRows="1" count="" datasetConfigVersion="0.6">  <Dataset name="hsapiens_gene_ensembl" interface="default"> <Attribute name="ensembl_transcript_id"/> <Attribute name="ensembl_gene_id"/><Attribute name="transcript"/> </Dataset> </Query>'
# 	# wget -O mapping_37.txt 'http://grch37.ensembl.org/biomart/martservice?query=<Query virtualSchemaName="default" formatter="TSV" header="0" uniqueRows="1" count="" datasetConfigVersion="0.6"><Dataset name="hsapiens_gene_ensembl" interface="default"><Attribute name="ensembl_transcript_id"/><Attribute name="ensembl_gene_id"/><Attribute name="transcript"/></Dataset></Query>'
# 	all =
# 		read_table2("~/third_party_sofware/ensembl_mapping/mapping_37.txt",
# 								col_names = FALSE) %>%
# 		setNames(c("ensembl_transcript_id", "ensembl_gene_id", "transcript")) %>%
# 		mutate(ref_genome = "hg37") %>%
# 		bind_rows(
# 			read_table2(
# 				"~/third_party_sofware/ensembl_mapping/mapping_38.txt",
# 				col_names = FALSE
# 			) %>%
# 				setNames(
# 					c("ensembl_transcript_id", "ensembl_gene_id", "transcript")
# 				) %>%
# 				mutate(ref_genome = "hg38")
# 		) %>%
# 		drop_na()
#
#
# 	bind_rows(
# 		# Gene annotation
# 		all %>%
# 			select(-ensembl_transcript_id) %>%
# 			group_by(ensembl_gene_id) %>%
# 			arrange(ref_genome %>% desc()) %>%
# 			slice(1) %>%
# 			ungroup() %>%
# 			rename(ensembl_id = ensembl_gene_id),
#
# 		# Transcript annotation
# 		all %>%
# 			select(-ensembl_gene_id) %>%
# 			group_by(ensembl_transcript_id) %>%
# 			arrange(ref_genome %>% desc()) %>%
# 			slice(1) %>%
# 			ungroup() %>%
# 			rename(ensembl_id = ensembl_transcript_id)
# 	) %>%
#
# 		# Write to file and return
# 		{
# 			(.) %>% write_csv("~/third_party_sofware/ensembl_mapping/ensembl_symbol_mapping.csv")
# 			(.)
# 		}
# }

##' after wget, this function merges hg37 and hg38 mapping data bases - Do not execute!
##'
##' @return A tibble with ensembl-transcript mapping
##'
# get_ensembl_symbol_mapping_mouse <- function() {
#
#
# 	left_join(
# 		AnnotationDbi::toTable(org.Mm.eg.db::org.Mm.egENSEMBL),
# 		AnnotationDbi::toTable(org.Mm.eg.db::org.Mm.egSYMBOL)
# 	) %>%
# 		as_tibble %>%
# 		select(-gene_id) %>%
# 		rename(ensembl_id = ensembl_id, transcript = symbol) %>%
# 		drop_na() %>%
#
# 		mutate(ref_genome = "mm10") %>%
#
# 		# Write to file and return
# 		{
# 			(.) %>% write_csv("~/third_party_sofware/ensembl_mapping/ensembl_symbol_mapping_mouse.csv")
# 			(.)
# 		}
# }

#' get_symbol_from_ensembl
#'
#' @keywords internal
#'
#' @description Get transcript column from ensembl gene id
#'
#' @param .data A tibble
#' @param .ensembl A column symbol. The column that is represents ensembl gene id
#'
#' @return A tibble with added annotation
#'
#'
get_symbol_from_ensembl <-
	function(.data, .ensembl) {

		# # Creating file
		# ensembl_symbol_mapping =
		# 	bind_rows(
		# 		read_csv("~/third_party_sofware/ensembl_mapping/ensembl_symbol_mapping.csv"),
		# 		read_csv("~/third_party_sofware/ensembl_mapping/ensembl_symbol_mapping_mouse.csv")
		# 	)
		#
		# save(ensembl_symbol_mapping, file="data/ensembl_symbol_mapping.rda", compress = "xz")


		.ensembl = enquo(.ensembl)

		.data %>%
			select(!!.ensembl) %>%
			distinct() %>%

			# Add name information
			dplyr::left_join(
				tidybulk::ensembl_symbol_mapping %>%
					distinct(ensembl_id, transcript, ref_genome) %>%
					dplyr::rename(!!.ensembl := ensembl_id) %>%
					rename(transcript = transcript),
				by = quo_name(.ensembl)
			)

	}

#' Perform linear equation system analysis through llsr
#'
#' @keywords internal
#'
#' @importFrom stats lsfit
#'
#' @param mix A data frame
#' @param reference A data frame
#'
#' @return A data frame
#'
#'
run_llsr = function(mix, reference) {
	# Get common markers
	markers = intersect(rownames(mix), rownames(reference))

	X <- (reference[markers, , drop = FALSE])
	Y <- (mix[markers, , drop = FALSE])

	results <- t(data.frame(lsfit(X, Y)$coefficients)[-1, , drop = FALSE])
	results[results < 0] <- 0
	results <- results / apply(results, 1, sum)
	rownames(results) = colnames(Y)

	results
}

#' Perform linear equation system analysis through llsr
#'
#' @keywords internal
#'
#' @importFrom stats lsfit
#'
#' @param mix A data frame
#' @param reference A data frame
#'
#' @return A data frame
#'
#'
run_epic = function(mix, reference = NULL) {
	
	# Check if package is installed, otherwise install
	if (find.package("devtools", quiet = TRUE) %>% length %>% equals(0)) {
		message("Installing class needed for EPIC")
		install.packages("devtools", repos = "https://cloud.r-project.org", dependencies = c("Depends", "Imports"))
	}
	
	# Check if package is installed, otherwise install
	if (find.package("EPIC", quiet = TRUE) %>% length %>% equals(0)) {
		message("Installing class needed for EPIC")
		devtools::install_github("GfellerLab/EPIC")
	}
	
	if("EPIC" %in% .packages() %>% not) stop("tidybulk says: Please attach the apckage EPIC manually (i.e. library(EPIC)). This is because EPIC is only available on GitHub and it is not possible to seemlessy make EPIC part of the dependencies.")
		
	# Get common markers
	markers = intersect(rownames(mix), rownames(reference))
	
	X <- (reference[markers, , drop = FALSE])
	Y <- (mix[markers, , drop = FALSE])
	
	if(!is.null(reference))
		reference = list(
			refProfiles = X,
			sigGenes = rownames(X)
		)
	
	
	
	results <- EPIC(Y, reference = reference)$cellFractions %>% data.frame()
	#results[results < 0] <- 0
	#results <- results / apply(results, 1, sum)
	rownames(results) = colnames(Y)
	
	results
}


#' Get cell type proportions from cibersort
#'
#' @keywords internal
#'
#' @import parallel
#' @import preprocessCore
#' @importFrom stats setNames
#' @importFrom rlang dots_list
#' @importFrom magrittr equals
#' @importFrom utils install.packages
#'
#' @param .data A tibble
#' @param .sample The name of the sample column
#' @param .transcript The name of the transcript/gene column
#' @param .abundance The name of the transcript/gene abundance column
#' @param reference A data frame. The transcript/cell_type data frame of integer transcript abundance
#' @param method A character string. The method to be used. At the moment Cibersort (default) and llsr (linear least squares regression) are available.
#' @param ... Further parameters passed to the function Cibersort
#'
#' @return A tibble including additional columns
#'
#'
get_cell_type_proportions = function(.data,
																		 .sample = NULL,
																		 .transcript = NULL,
																		 .abundance = NULL,
																		 reference = X_cibersort,
																		 method = "cibersort",
																		 ...) {
	# Get column names
	.sample = enquo(.sample)
	.transcript = enquo(.transcript)
	.abundance = enquo(.abundance)
 


	# Load library which is optional for the whole package
	#library(preprocessCore)

	# Check if there are enough genes for the signature
	if ((.data %>%
			 pull(!!.transcript) %in% (reference %>% rownames)) %>%
			which %>%
			length %>%
			st(50))
		stop(
			"tidybulk says: You have less than 50 genes that overlap the Cibersort signature. Please check again your input dataframe"
		)

	# Check if rownames exist
	if (reference %>% sapply(class) %in% c("numeric", "double", "integer") %>% not() %>% any)
		stop("tidybulk says: your reference has non-numeric/integer columns.")

	# Get the dots arguments
	dots_args = rlang::dots_list(...)

	.data %>%

		# Prepare data frame
		distinct(!!.sample,!!.transcript,!!.abundance) %>%
		spread(!!.sample,!!.abundance) %>%

		# Eliminate NA transcripts
		filter(!!.transcript %>% is.na %>% not()) %>%

		# Convert
		data.frame(row.names = 1, check.names = FALSE) %>%

		# Run Cibersort or llsr through custom function, depending on method choice
		when(
			
			# Execute do.call because I have to deal with ...
			method %>% tolower %>% equals("cibersort") 	~ {
				
				# Check if package is installed, otherwise install
				if (find.package("class", quiet = TRUE) %>% length %>% equals(0)) {
					message("Installing class needed for Cibersort")
					install.packages("class", repos = "https://cloud.r-project.org", dependencies = c("Depends", "Imports"))
				}
				
				# Check if package is installed, otherwise install
				if (find.package("e1071", quiet = TRUE) %>% length %>% equals(0)) {
					message("Installing e1071 needed for Cibersort")
					install.packages("e1071", repos = "https://cloud.r-project.org", dependencies = c("Depends", "Imports"))
				}
				
				# Check if package is installed, otherwise install
				if (find.package("preprocessCore", quiet = TRUE) %>% length %>% equals(0)) {
					message("Installing preprocessCore needed for Cibersort")
					if (!requireNamespace("BiocManager", quietly = TRUE))
						install.packages("BiocManager", repos = "https://cloud.r-project.org")
					BiocManager::install("preprocessCore", ask = FALSE)
					
				}
				
				do.call(my_CIBERSORT, list(Y = ., X = reference) %>% c(dots_args)) %$%
				proportions %>%
				as_tibble(rownames = quo_name(.sample)) %>%
				select(-`P-value`,-Correlation,-RMSE)
			},
			
			# Don't need to execute do.call
			method %>% tolower %>% equals("llsr") ~ (.) %>%
				run_llsr(reference) %>%
				as_tibble(rownames = quo_name(.sample)),

			# Don't need to execute do.call
			method %>% tolower %>% equals("epic") ~ {
				
				(.) %>%
					run_epic(reference) %>%
					as_tibble(rownames = quo_name(.sample))
			},
			
			~ stop(
				"tidybulk says: please choose between cibersort, llsr and epic methods"
			)
		)	 %>%

		# Parse results and return
		setNames(c(
			quo_name(.sample),
			(.) %>% select(-1) %>% colnames() %>% sprintf("%s: %s", method, .)
		)) %>%
		#%>%
		#gather(`Cell type`, proportion,-!!.sample) %>%

		# Attach attributes
		reattach_internals(.data) %>%
		memorise_methods_used("cibersort")


}

#' Get adjusted count for some batch effect
#'
#' @keywords internal
#'
#' @import dplyr
#' @import tidyr
#' @import tibble
#' @importFrom magrittr set_colnames
#' @importFrom stats model.matrix
#' @importFrom stats as.formula
#' @importFrom utils install.packages
#' @importFrom stats rnorm
#'
#' @param .data A tibble
#' @param .formula a formula with no response variable, of the kind ~ factor_of_interest + batch
#' @param .sample The name of the sample column
#' @param .transcript The name of the transcript/gene column
#' @param .abundance The name of the transcript/gene abundance column
#' @param log_transform A boolean, whether the value should be log-transformed (e.g., TRUE for RNA sequencing data)
#' @param ... Further parameters passed to the function sva::ComBat
#'
#' @return A tibble with adjusted counts
#'
#'
get_adjusted_counts_for_unwanted_variation_bulk <- function(.data,
																														.formula,
																														.sample = NULL,
																														.transcript = NULL,
																														.abundance = NULL,
																														log_transform = TRUE,
																														...) {
	# Get column names
	.sample = enquo(.sample)
	.transcript = enquo(.transcript)
	.abundance = enquo(.abundance)

	# Check that .formula includes at least two covariates
	if (parse_formula(.formula) %>% length %>% st(2))
		stop(
			"The .formula must contain two covariates, the first being the factor of interest, the second being the factor of unwanted variation"
		)

	# Check that .formula includes no more than two covariates at the moment
	if (parse_formula(.formula) %>% length %>% gt(3))
		warning("tidybulk says: Only the second covariate in the .formula is adjusted for, at the moment")

	# New column name
	value_adjusted = as.symbol(sprintf("%s%s",  quo_name(.abundance), adjusted_string))

	df_for_combat <-
		.data %>%

		select(!!.transcript,
					 !!.sample,
					 !!.abundance,
					 one_of(parse_formula(.formula))) %>%
		distinct() %>%

		# Check if log transform is needed
		ifelse_pipe(log_transform,
								~ .x %>% dplyr::mutate(!!.abundance := !!.abundance %>% log1p()))


	# Create design matrix
	design =
		model.matrix(
			object = as.formula("~" %>% paste0(parse_formula(.formula)[1])),
			# get first argument of the .formula
			data = df_for_combat %>% select(!!.sample, one_of(parse_formula(.formula))) %>% distinct %>% arrange(!!.sample)
		) %>%
		set_colnames(c("(Intercept)", parse_formula(.formula)[1]))

	# Check if package is installed, otherwise install
	if (find.package("sva", quiet = TRUE) %>% length %>% equals(0)) {
		message("Installing sva - Combat needed for adjustment for unwanted variation")
		if (!requireNamespace("BiocManager", quietly = TRUE))
			install.packages("BiocManager", repos = "https://cloud.r-project.org")
		BiocManager::install("sva", ask = FALSE)
	}

	my_batch =
		df_for_combat %>%
		distinct(!!.sample,!!as.symbol(parse_formula(.formula)[2])) %>%
		arrange(!!.sample) %>%
		pull(2)

	mat = df_for_combat %>%
		# Select relevant info
		distinct(!!.transcript,!!.sample,!!.abundance) %>%

		# Stop any column is not if not numeric or integer
		ifelse_pipe(
			(.) %>% select(!!.abundance) %>% summarise_all(class) %>% `%in%`(c("numeric", "integer")) %>% not() %>% any(),
			~ stop(".abundance must be numerical or integer")
		) %>%

		spread(!!.sample,!!.abundance) %>%
		as_matrix(rownames = !!.transcript,
							do_check = FALSE)
	mat %>%
		
		# Add little noise to avoid all 0s for a covariate that would error combat code (not statistics that would be fine) 
		`+` (rnorm(length(mat), 0, 0.000001)) %>%
		
		# Run combat
		sva::ComBat(batch = my_batch,
								mod = design,
								prior.plots = FALSE,
								...) %>%

		as_tibble(rownames = quo_name(.transcript)) %>%
		gather(!!.sample,!!.abundance,-!!.transcript) %>%

		# Reverse-Log transform if transformed in the first place
		ifelse_pipe(
			log_transform,
			~ .x %>%
				dplyr::mutate(!!.abundance := !!.abundance %>% exp %>% `-`(1)) %>%
				dplyr::mutate(!!.abundance := ifelse(!!.abundance < 0, 0,!!.abundance)) %>%
				dplyr::mutate(!!.abundance := !!.abundance %>% as.integer)
		) %>%

		# Reset column names
		dplyr::rename(!!value_adjusted := !!.abundance)  %>%

		# # Add filtering info
		# right_join(
		# 	df_for_combat %>%
		# 		distinct(!!.transcript,!!.sample,
		# 						 lowly_abundant),
		# 	by = c(quo_name(.transcript), quo_name(.sample))
		# )%>%

		# Attach attributes
		reattach_internals(.data) %>%
		memorise_methods_used("sva")
}

#' Identify variable genes for dimensionality reduction
#'
#' @keywords internal
#'
#' @param .data A tibble
#' @param .sample A character name of the sample column
#' @param .transcript A character name of the transcript/gene column
#' @param .abundance A character name of the read count column
#' @param top An integer. How many top genes to select
#' @param log_transform A boolean, whether the value should be log-transformed (e.g., TRUE for RNA sequencing data)
#'
#' @return A tibble filtered genes
#'
keep_variable_transcripts = function(.data,
																			 .sample = NULL,
																			 .transcript = NULL,
																			 .abundance = NULL,
																			 top = 500,
																			 log_transform = TRUE) {
	# Get column names
	.sample = enquo(.sample)
	.transcript = enquo(.transcript)
	col_names = get_sample_transcript(.data, .sample, .transcript)
	.sample = col_names$.sample
	.transcript = col_names$.transcript

	# Get scaled abundance if present, otherwise get abundance
	.abundance = enquo(.abundance)
	col_names = get_abundance_norm_if_exists(.data, .abundance)
	.abundance = col_names$.abundance

	# Manage Inf
	top = min(top, .data %>% distinct(!!.transcript) %>% nrow)

	message(sprintf("Getting the %s most variable genes", top))

	x =
		.data %>%
		distinct(!!.sample,!!.transcript,!!.abundance) %>%

		# Check if logtansform is needed
		ifelse_pipe(log_transform,
								~ .x %>% dplyr::mutate(!!.abundance := !!.abundance %>% `+`(1) %>%  log())) %>%

		spread(!!.sample,!!.abundance) %>%
		as_matrix(rownames = quo_name(.transcript))

	s <- rowMeans((x - rowMeans(x)) ^ 2)
	o <- order(s, decreasing = TRUE)
	x <- x[o[1L:top], , drop = FALSE]
	variable_trancripts = rownames(x)

	.data %>%
		filter(!!.transcript %in% variable_trancripts) %>%

		# Add methods used
		memorise_methods_used(c("edger"))
}

#' tidybulk_to_SummarizedExperiment
#'
#' @keywords internal
#'
#' @importFrom utils data
#' @importFrom tidyr pivot_longer
#'
#' @param .data A tibble
#' @param .sample The name of the sample column
#' @param .transcript The name of the transcript/gene column
#' @param .abundance The name of the transcript/gene abundance column
#'
#' @return A SummarizedExperiment
#'
tidybulk_to_SummarizedExperiment = function(.data,
																					.sample = NULL,
																					.transcript = NULL,
																					.abundance = NULL) {

	# Get column names
	.sample = enquo(.sample)
	.transcript = enquo(.transcript)
	.abundance = enquo(.abundance)
	col_names = get_sample_transcript_counts(.data, .sample, .transcript, .abundance)
	.sample = col_names$.sample
	.transcript = col_names$.transcript
	.abundance = col_names$.abundance

	# Check if package is installed, otherwise install
	if (find.package("SummarizedExperiment", quiet = TRUE) %>% length %>% equals(0)) {
		message("Installing SummarizedExperiment")
		if (!requireNamespace("BiocManager", quietly = TRUE))
			install.packages("BiocManager", repos = "https://cloud.r-project.org")
		BiocManager::install("SummarizedExperiment", ask = FALSE)
	}
	if (find.package("S4Vectors", quiet = TRUE) %>% length %>% equals(0)) {
		message("Installing S4Vectors")
		if (!requireNamespace("BiocManager", quietly = TRUE))
			install.packages("BiocManager", repos = "https://cloud.r-project.org")
		BiocManager::install("S4Vectors", ask = FALSE)
	}
	# If present get the scaled abundance
	.abundance_scaled =
		.data %>%
		ifelse_pipe(
			".abundance_scaled" %in% ((.) %>% get_tt_columns() %>% names) &&
				# .data %>% get_tt_columns() %$% .abundance_scaled %>% is.null %>% not() &&
				quo_name((.) %>% get_tt_columns() %$% .abundance_scaled) %in% ((.) %>% colnames),
			~ .x %>% get_tt_columns() %$% .abundance_scaled,
			~ NULL
		)

	# Get which columns are sample wise and which are feature wise
	col_direction = get_x_y_annotation_columns(.data,
																						 !!.sample,
																						 !!.transcript,
																						 !!.abundance,
																						 !!.abundance_scaled)
	sample_cols = col_direction$horizontal_cols
	feature_cols = col_direction$vertical_cols
	counts_cols = col_direction$counts_cols

	colData = .data %>% select(!!.sample, sample_cols) %>% distinct %>% arrange(!!.sample) %>% {
		S4Vectors::DataFrame((.) %>% select(-!!.sample),
												 row.names = (.) %>% pull(!!.sample))
	}

	rowData = .data %>% select(!!.transcript, feature_cols) %>% distinct %>% arrange(!!.transcript) %>% {
		S4Vectors::DataFrame((.) %>% select(-!!.transcript),
												 row.names = (.) %>% pull(!!.transcript))
	}

	my_assays =
		.data %>%
		select(!!.sample,
					 !!.transcript,
					 !!.abundance,
					 !!.abundance_scaled,
					 counts_cols) %>%
		distinct() %>%
		pivot_longer( cols=-c(!!.transcript,!!.sample), names_to="assay", values_to= ".a") %>%
		nest(`data` = -`assay`) %>%
		mutate(`data` = `data` %>%  map(
			~ .x %>% spread(!!.sample, .a) %>% as_matrix(rownames = quo_name(.transcript))
		))

	# Build the object
	SummarizedExperiment::SummarizedExperiment(
		assays = my_assays %>% pull(`data`) %>% setNames(my_assays$assay),
		rowData = rowData,
		colData = colData
	)

}

#' Get matrix from tibble
#'
#'
#' @import dplyr
#' @import tidyr
#' @importFrom magrittr set_rownames
#' @importFrom rlang quo_is_null
#'
#' @param tbl A tibble
#' @param rownames A character string of the rownames
#' @param do_check A boolean
#'
#' @return A matrix
#'
#' @examples
#'
#' as_matrix(head(dplyr::select(tidybulk::counts_mini, transcript, count)), rownames=transcript)
#'
#' @export
as_matrix <- function(tbl,
											rownames = NULL,
											do_check = TRUE) {
	rownames = enquo(rownames)
	tbl %>%

		# Through warning if data frame is not numerical beside the rownames column (if present)
		ifelse_pipe(
			do_check &&
				tbl %>%
				# If rownames defined eliminate it from the data frame
				ifelse_pipe(!quo_is_null(rownames), ~ .x[,-1], ~ .x) %>%
				dplyr::summarise_all(class) %>%
				tidyr::gather(variable, class) %>%
				pull(class) %>%
				unique() %>%
				`%in%`(c("numeric", "integer")) %>% not() %>% any(),
			~ {
				warning("tidybulk says: there are NON-numerical columns, the matrix will NOT be numerical")
				.x
			}
		) %>%
		as.data.frame() %>%

		# Deal with rownames column if present
		ifelse_pipe(
			!quo_is_null(rownames),
			~ .x %>%
				magrittr::set_rownames(tbl %>% pull(!!rownames)) %>%
				select(-1)
		) %>%

		# Convert to matrix
		as.matrix()
}

#' This function is needed for DE in case the matrix is not rectangular, but includes NA
#'
#' @keywords internal
#'
#' @import dplyr
#' @import tidyr
#' @import tibble
#' @importFrom magrittr set_colnames
#' @importFrom stats model.matrix
#' @importFrom stats as.formula
#' @importFrom utils install.packages
#' @importFrom tidyr complete
#'
#' @param .data A tibble
#' @param .formula a formula with no response variable, of the kind ~ factor_of_intrest + batch
#' @param .sample The name of the sample column
#' @param .transcript The name of the transcript/gene column
#' @param .abundance The name of the transcript/gene abundance column
#' @param .abundance_scaled The name of the transcript/gene scaled abundance column
#'
#'
#' @return A tibble with adjusted counts
#'
#'
fill_NA_using_formula = function(.data,
																 .formula,
																 .sample = NULL,
																 .transcript = NULL,
																 .abundance = NULL,
																 .abundance_scaled = NULL){

	# Get column names
	.sample = enquo(.sample)
	.transcript = enquo(.transcript)
	.abundance = enquo(.abundance)
	.abundance_scaled = enquo(.abundance_scaled)

	col_formula =
		.data %>%
		select(parse_formula(.formula)) %>%
		distinct() %>%
		select_if(function(x) is.character(x) | is.logical(x) | is.factor(x)) %>%
		colnames

	# Sample-wise columns
	sample_col = .data %>% get_specific_annotation_columns(!!.sample) %>% outersect(col_formula)
	

	# Create NAs for missing sample/transcript pair

 .data_completed = 
		.data %>%

		# Add missing pairs
 		nest(ct_data = -c(col_formula)) %>%
 		mutate(ct_data = map(ct_data, ~ .x %>% complete(!!as.symbol(quo_name(.sample)), !!.transcript) )) %>%
 		unnest(ct_data)
		
 .data_OK = 
 	.data %>%
 	anti_join(.data_completed %>% filter(!!.abundance %>% is.na) %>% select( !!.transcript, col_formula) %>% distinct(), by = c(quo_name(.transcript), col_formula))
 
 .data_FIXED = 
 .data %>%
 	inner_join(.data_completed %>% filter(!!.abundance %>% is.na) %>% select( !!.transcript, col_formula) %>% distinct(), by = c(quo_name(.transcript), col_formula)) %>%

 	# attach NAs
 	bind_rows(
	.data_completed %>% 
		filter(!!.abundance %>% is.na) %>%
		select(!!.sample, !!.transcript) %>%
		left_join(.data %>% pivot_sample(!!.sample))
	) %>%
 	
	# Group by covariate
	nest(cov_data = -c(col_formula, !!.transcript)) %>%
	mutate(cov_data = map(cov_data, ~
											.x %>%
											mutate(
												!!.abundance := ifelse(
													!!.abundance %>% is.na,
													median(!!.abundance, na.rm = TRUE),!!.abundance
												)
											) %>%

											# Impute scaled if exist
											ifelse_pipe(
												quo_is_symbol(.abundance_scaled),
												~ .x %>% mutate(
													!!.abundance_scaled := ifelse(
														!!.abundance_scaled %>% is.na,
														median(!!.abundance_scaled, na.rm = TRUE),!!.abundance_scaled
													)
												)
											) %>%

											# Through warning if group of size 1
											ifelse_pipe((.) %>% nrow %>% `<` (2), warning("tidybulk says: According to your design matrix, u have sample groups of size < 2, so you your dataset could still be sparse."))
	)) %>%
	unnest(cov_data) 
 
	.data_OK %>%
		bind_rows(.data_FIXED) %>%

		# Reattach internals
		reattach_internals(.data)

}

#' This function is needed for DE in case the matrix is not rectangular, but includes NA
#'
#' @keywords internal
#'
#' @import dplyr
#' @import tidyr
#' @import tibble
#' @importFrom magrittr set_colnames
#' @importFrom stats model.matrix
#' @importFrom stats as.formula
#'
#' @param .data A `tbl` formatted as | <element> | <feature> | <value> | <...> |
#' @param .sample The name of the element column
#' @param .transcript The name of the feature/gene column
#' @param .abundance The name of the feature/gene value column
#' @param fill_with A numerical value with which fill the missing data points
#'
#'
#' @return A tibble with adjusted counts
#'
#'
fill_NA_using_value = function(.data,
															 .sample,
															 .transcript,
															 .abundance,
															 fill_with){

	# Comply with CRAN NOTES
	. = NULL

	# Get column names
	.element = enquo(.sample)
	.feature = enquo(.transcript)
	.value = enquo(.abundance)

	# Create NAs for missing element/feature pair
	df_to_impute =
		.data %>%
		select(!!.element, !!.feature, !!.value) %>%
		distinct %>%
		pivot_wider(
			names_from = !!.feature,
			values_from = !!.value,
			names_sep = "___",
			names_prefix = "fill_miss_"
		) %>%
		pivot_longer(
			names_to = .data %>% select(!!.feature) %>% names,
			values_to = quo_names(.value),
			names_sep = purrr::when(quo_names(.feature), length(.) > 1 ~ "___", ~ NULL),
			names_prefix = "fill_miss_",
			cols = contains("fill_miss_")
		)

	# Select just features/covariates that have missing
	combo_to_impute = df_to_impute %>% anti_join(.data, by=c(quo_names(.element), quo_names(.feature))) %>% select(!!.feature, !!.element) %>% distinct()

	# Impute using median
	df_to_impute %>%
		inner_join(combo_to_impute, by = c(quo_names(.element), quo_names(.feature))) %>%

		# Fill
		mutate(!!.value := if_else(!!.value %>% is.na, fill_with, !!.value)) %>%
		# when(
		# 	quo_is_symbol(.value_scaled) ~ mutate(., !!.value_scaled := !!.value)) ,
		# 	~ (.)
		# ) %>%

		# In next command avoid error if no data to impute
		ifelse_pipe(
			nrow(.) > 0,
			~ .x %>% left_join(.data %>% pivot_sample(!!.element), by=quo_names(.element))
		) %>%

		# Add original dataset
		bind_rows(.data %>% anti_join(combo_to_impute, by=c(quo_names(.feature), quo_names(.element)))) %>%
		select(.data %>% colnames) %>%

		# Reattach internals
		reattach_internals(.data)

}


# # Iterative version of Siberg function because fails
# siberg_iterative = function(x) {
# 	if (x %>% unique %>% length %>% st(5))
# 		return(c(NA, NA))
#
#
#
# 	mu = NA
# 	max_i = ceiling(length(x) / 10)
# 	#max_i = 10
# 	i = 0
# 	while (mu %>% is.na | i <= max_i) {
# 		res = SIBERG::SIBER(x, model = 'NB')
#
# 		BI = res[7]
# 		mu = res[1]
# 		x = x[-1]
# 		i = i + 1
#
# 	}
#
#
# 	if (mu %>% is.na & x %>% length %>% st(5))
# 		return(c(NA, NA))
#
# 	return(c(max(res[1], res[2]) / (min(res[1], res[2]) + 1),
# 					 res[7]))
# }


# # Calculate bimodality
# bimodality =
#
# 	counts_non_red %>%
# 	#keep_variable(top = 5000) %>%
# 	tidybulk:::drop_class(c("tidybulk", "tt")) %>%
# 	tidybulk:::drop_internals() %>%
# 	nest(data = -c(`Cell type formatted`, symbol)) %>%
#
# 	#slice(1:10) %>%
# 	mutate(	bimodality_NB =
# 		map(
# 			data,
# 			~ tryCatch(
# 							.x %>% pull(count_scaled) %>% as.integer %>%
# 								siberg_iterative() %>%
# 								`[` (1:2) , error=function(e) c(NA, NA))		%>%
# 							setNames(c("bimodality_NB_diff", "bimodality_NB")) %>%
# 							enframe() %>% spread(name, value)
#
# 		)
# 	) %>%
# 	select(-data) %>%
# 	unnest(bimodality_NB)
#
# bimodality %>% saveRDS("dev/bimodality.rds")
#
# bimodality = readRDS("dev/bimodality.rds")
#
# non_bimodal =
# 	bimodality %>%
# 	add_count(symbol) %>%
# 	filter(n==max(n)) %>%
# 	mutate(bimodal = ((bimodality_NB > 0.8 & bimodality_NB_diff > 20) | bimodality_NB_diff > 100) ) %>%
# 	nest(data = -symbol) %>%
# 	mutate(how_many_bimod = map_int(data, ~ .x %>% pull(bimodal) %>% sum(na.rm=TRUE))) %>%
# 	filter(how_many_bimod == 0)


#' @importFrom stats p.adjust
entrez_rank_to_gsea = function(my_entrez_rank, species, gene_set = NULL){

	# From the page
	# https://yulab-smu.github.io/clusterProfiler-book/chapter5.html

	# Check if package is installed, otherwise install
	if (find.package("fastmatch", quiet = TRUE) %>% length %>% equals(0)) {
		message("Installing fastmatch needed for analyses")
		install.packages("fastmatch", repos = "https://cloud.r-project.org")
	}

	if (find.package("clusterProfiler", quiet = TRUE) %>% length %>% equals(0)) {
		message("clusterProfiler not installed. Installing.")
		BiocManager::install("clusterProfiler", ask = FALSE)
	}




	# Get gene sets signatures
	msigdbr::msigdbr(species = species) %>%

		# Filter specific gene_set if specified. This was introduced to speed up examples executionS
		when(
			!is.null(gene_set) ~ filter(., gs_cat %in% gene_set),
			~ (.)
		) %>%
		
		# Execute calculation
		nest(data = -gs_cat) %>%
		mutate(test =
					 	map(
					 		data,
					 		~ clusterProfiler::enricher(my_entrez_rank, TERM2GENE=.x %>% select(gs_name, entrez_gene), pvalueCutoff = 1) %>%
					 			as_tibble
					 	)) %>%
		select(-data) %>%
		unnest(test) %>%
		
		# Order
		arrange(`p.adjust`) %>%

		# format transcripts
		mutate(entrez = strsplit(geneID, "/")) %>% 
		select(-geneID) %>% 
		
		# Add methods used
		memorise_methods_used(c("clusterProfiler", "msigdbr"))

}



# gsea_de = function(.data,
# 								 .formula,
# 								 .sample = NULL,
# 								 .entrez,
# 								 .abundance = NULL,
# 								 .contrasts = NULL,
# 								 species){
#
# 	# Comply with CRAN NOTES
# 	. = NULL
#
# 	# Check packages
# 	# Check if package is installed, otherwise install
# 	if (find.package("EGSEA", quiet = TRUE) %>% length %>% equals(0)) {
# 		message("EGSEA not installed. Please install it with.")
# 		message("BiocManager::install(\"EGSEA\", ask = FALSE)")
# 	}
# 	if (!"EGSEA" %in% (.packages())) {
# 		message("EGSEA package not loaded. Please run library(\"EGSEA\")")
# 	}
#
# 	# Check column type
# 	if (reference %>% sapply(class) %in% c("numeric", "double", "integer") %>% not() %>% any)
# 		stop("tidybulk says: your reference has non-numeric/integer columns.")
#
# 	# Get column names
# 	.sample = enquo(.sample)
# 	.abundance = enquo(.abundance)
# 	col_names = get_sample_counts(.data, .sample, .abundance)
# 	.sample = col_names$.sample
# 	.abundance = col_names$.abundance
#
#
# 	df_for_edgeR %>%
# 		select(!!.sample, !!.entrez,!!.abundance)
#
#
# 	library(msigdbr)
# 	library(clusterProfiler)
#
# 	counts %>% tidybulk(sample, transcript, count) %>% test_differential_abundance(~ condition) %>% arrange(logFC %>% desc) %>% pull(transcript)
#
#
# 	em <- enricher(gene, TERM2GENE=m_df %>% select(gs_name, entrez_gene))
#
# 	emGSEA(geneList, TERM2GENE = m_t2g)
#
#
# 	m_t2g <- msigdbr(species = "Homo sapiens", category = "C6")
#
#
#
# }

Try the tidybulk package in your browser

Any scripts or data that you put into this service are public.

tidybulk documentation built on April 7, 2021, 6 p.m.