##### overview for data ##########
#' JSD distance
#' to compute the jsd distance
#' @param inMatrix
#' @param pseudocount
#' @param ...
#'
#' @return distance
#' @export
#'
#' @examples
distJSD <- function(inMatrix, pseudocount=0.00000001, ...) {
# if have negtive number, transform them
if(any(inMatrix < 0)){
mixV <- min(inMatrix)
inMatrix <- inMatrix+abs(mixV)+pseudocount
}
inMatrix <- t(inMatrix)
KLD <- function(x,y) sum(x *log(x/y))
JSD<- function(x,y) sqrt(0.5 * KLD(x, (x+y)/2) + 0.5 * KLD(y, (x+y)/2))
matrixColSize <- length(colnames(inMatrix))
matrixRowSize <- length(rownames(inMatrix))
colnames <- colnames(inMatrix)
resultsMatrix <- matrix(0, matrixColSize, matrixColSize)
min <- min(inMatrix)
inMatrix <- inMatrix+min
inMatrix = apply(inMatrix,1:2,function(x) ifelse (x==0,pseudocount,x))
for(i in 1:matrixColSize) {
for(j in 1:matrixColSize) {
resultsMatrix[i,j]=JSD(as.vector(inMatrix[,i]),
as.vector(inMatrix[,j]))
}
}
colnames -> colnames(resultsMatrix) -> rownames(resultsMatrix)
as.dist(resultsMatrix)->resultsMatrix
attr(resultsMatrix, "method") <- "dist"
return(resultsMatrix)
}
#' kBest
#' to get the best k in non-supervision method
#' @param data
#' @param dist
#' @param method
#'
#' @return list , include the best cluster number k & original figure & dataframe
#' @export
#'
#' @examples
kBest <- function(data, dist , method = "kmeans"){
nclusters=NULL
sil = NULL
out <- list()
res <- matrix(NA, 19, ncol(data))
for (k in 2:20) {
#print(k)
switch (method,
kmeans = { data.cluster_temp <-kmeans(dist, k)$cluster},
pam = { data.cluster_temp <-pam(dist, k)$clustering},
fanny = { data.cluster_temp <- fanny(dist, k)$clustering}
)
res[k-1,] <- data.cluster_temp
nclusters[k-1] <- index.G1(t(data) , data.cluster_temp, d = dist,
centrotypes = "medoids")
sil[k-1] <- mean(silhouette(data.cluster_temp, dist = dist)[,3])
}
best <- which.max(nclusters)+1
kCluster <- c(2:20)
CH_index <- nclusters
Silhouette <- sil
cluster <- data.frame(kCluster, CH_index, Silhouette)
cluster <- melt(cluster, id = "kCluster")
colnames(cluster) <- c("kCluster", "Index", "value")
figure <- ggplot(cluster, aes(x=value, y=kCluster))+
geom_segment(aes(yend=kCluster),xend=0,colour="grey")+
geom_point(size=3,aes(colour=Index))+
scale_colour_brewer(palette="Set1",limits=c("CH_index","Silhouette"))+
theme_bw()+finalTheme+xlab("")+ylab("Number of cluster")+facet_grid(.~Index, scales = "free")
out <- list(res[best-1,], best, figure)
return(out)
}
#' kBest2
#'
#' @param data
#' @param dist
#' @param method
#' @param clustNum
#'
#' @return
#' @export
#'
#' @examples
kBest2 <- function(data, dist , method = "kmeans", clustNum){
nclusters=NULL
sil = NULL
out <- list()
res <- matrix(NA, clustNum, ncol(data))
for (k in 1:clustNum) {
switch (method,
kmeans = { data.cluster_temp <-kmeans(dist, k)$cluster},
pam = { data.cluster_temp <- pam(dist, k)$clustering},
fanny = { data.cluster_temp <- fanny(dist, k)$clustering}
)
res[k,] <- data.cluster_temp
nclusters[k] <- index.G1(t(data) , data.cluster_temp, d = dist,
centrotypes = "medoids")
}
best <- which.max(nclusters)
print("Best cluster number is ", best)
CH_index <- nclusters
df <- data.frame(clusters = as.factor(1:clustNum), y = CH_index)
df[1,2] <- min(df[,2], na.rm=T)-0.1*min(df[,2], na.rm=T)
ylab <- "Calinski-Harabasz index"
p <- ggpubr::ggline(df, x = "clusters", y = "y", group = 1,
color = "steelblue", ylab = ylab,
xlab = "Number of clusters k",
main = "Optimal number of clusters\nCH-index method"
)
p <- p + geom_vline(xintercept = which.max(CH_index), linetype=2, color = "steelblue")
return(p)
}
#' multiAdonis
#' for pair compare for multigroup use the adonis
#' @param dat
#' @param config
#' @param perm
#' @param method
#'
#' @return data.frame
#' @export
#'
#' @examples
multiAdonis <- function(dat, config, perm=999, method="bray"){
# reorder the sample id of dat and config
# dat : row is sample
# perm : permutation number ,default 999
# method : distance method
id <- intersect(rownames(dat), rownames(config))
dat <- dat[id, ]
config <- config[id,]
combng <- combn(unique(config), 2)
res <- matrix(NA, nrow=ncol(combng), ncol = 6)
for(i in 1:ncol(combng)){
group <- c(config[which(config==combng[1,i])],
config[which(config==combng[2,i])])
pro <- dat[c(which(config==combng[1,i]),
which(config==combng[2,i])), ]
# here use the adonise function
library(vegan)
res[i, 1:6] <- as.numeric(adonis(pro~group, permutations = perm, method = method)$aov[1,])
}
rownames(res) <- apply(combng, 2, function(x){paste0(x[1], " vs ", x[2])})
colnames(res) <- c("Df", "SumsOfSqs", "MeanSqs", "F.Model", "R2", "Pr(>F)")
return(res)
}
############# outlier ################
#' checkOutlier
#' output the outlier value on a vector by quantile
#' @param x
#' @param coef
#' @param sname
#' @param plot
#' @param vname
#'
#' @return
#' @export
#'
#' @examples
checkOutlier <- function(x, coef = 2, sname ,plot = F, vname = "test"){
# whether is a conituous data
if(is.numeric(x) & length(levels(as.factor(x)))>=5){
# get the outlier index
x[is.na(x)] <- median(x, na.rm = T)
quantiles <- quantile(x, probs=c(0.25,0.75), na.rm = T)
IQR <- quantiles[2]-quantiles[1]
index <- x < (quantiles[1]-coef*IQR)|x > (quantiles[2]+coef*IQR)
if(any(index)){
# print the outlier
name <- paste0(sname[index], "_", round(x[index], 2))
res <- paste(name, collapse = " ")
#return(res)
# print the outlier sample on boxplot
if(plot){
dat <- data.frame(value = x)
dat$label <- ifelse(index, sname, "")
dat$var <- rep(vname, nrow(dat))
p <- ggplot(dat,aes(var,value))+
geom_boxplot()+geom_text_repel(aes(label=label))+
theme(panel.grid.major=element_line(colour=NA))+labs(x="")+
theme_bw()
}
return(list(p,res))
}else{
return(NULL)
}
}else{
stop("the value is not conituous data\n")
}
}
#' mutivarOutlier
#' on a dataframe to select the outlier
#' @param dat
#' @param scale
#' @param k
#' @param ...
#'
#' @return
#' @export
#'
#' @examples
mutivarOutlier <- function(dat, scale=F, naaction = F, k, ...){
if(scale){
dat <- scale(dat)
}
sam_num <- nrow(dat)
row_name <- rownames(dat)
# to handle the NA
if(naaction){
dat <- mice::complete(mice::mice(dat))
}
# library(DDoutlier)
outlier.scores <- LOF(dat, k = 10 )
checkOutlier(outlier.scores, ...)
}
#' filterPer
#'
#' @param x , data.frame
#' @param row , row or col
#' @param percent , ratio
#' @param include , belong or exclude
#'
#' @return dataframe
#' @export
#'
#' @examples
filterPer <- function(x, row, percent, include=T){
if(include){
index <- apply(x, row, function(x){(sum(x!=0)/length(x))>percent})
}else{
index <- apply(x, row, function(x){(sum(x!=0)/length(x))<percent})
}
if(row==1){
out <- x[index, ]
}else{
out <- x[, index]
}
return(out)
}
#' summarySE
#'
#' @param data
#' @param measurevar
#' @param groupvars
#' @param na.rm
#' @param conf.interval
#' @param .drop
#'
#' @return
#' @export
#'
#' @examples
summarySE <- function(data=NULL, measurevar, groupvars=NULL, na.rm=FALSE,
conf.interval=.95, .drop=TRUE) {
library(plyr)
# New version of length which can handle NA's: if na.rm==T, don't count them
length2 <- function (x, na.rm=FALSE) {
if (na.rm) sum(!is.na(x))
else length(x)
}
# This does the summary. For each group's data frame, return a vector with
# N, mean, and sd
datac <- ddply(data, groupvars, .drop=.drop,
.fun = function(xx, col) {
c(N = length2(xx[[col]], na.rm=na.rm),
mean = mean (xx[[col]], na.rm=na.rm),
sd = sd (xx[[col]], na.rm=na.rm)
)
},
measurevar
)
# Rename the "mean" column
datac <- rename(datac, c("mean" = measurevar))
datac$se <- datac$sd / sqrt(datac$N) # Calculate standard error of the mean
# Confidence interval multiplier for standard error
# Calculate t-statistic for confidence interval:
# e.g., if conf.interval is .95, use .975 (above/below), and use df=N-1
ciMult <- qt(conf.interval/2 + .5, datac$N-1)
datac$ci <- datac$se * ciMult
return(datac)
}
########## WGCNA ################
wgcnaAna <- function(data ){
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.