# 20190830: PostHoc, add an ANOVA p-value to the return
# find_p_location, changed the failure message
# Starting_install_packages -----------------------------------------------
# this is the first file to source when the project is loaded
#________________________________________________________________________________________
# install.packages.auto
#________________________________________________________________________________________
# ___Description___:
# 1: checks whether a package is installed; if not, installs it from CRAN or Bioconductor
# 2: by default it does not ask before updating dependent packages; set ask = TRUE, or comment that line out, to change this
# 3: after installation, loads the package
# ___Arguments___:
# package name, without quotes
#
#____Usage____;
# example:
# install.packages.auto(qvalue) # from Bioconductor
# install.packages.auto(rNMF) # from CRAN
install.packages.auto <- function(x) {
# set up the CRAN mirror
local({
r <- getOption("repos")
r["CRAN"] <- "https://cloud.r-project.org/"
options(repos = r)
})
x <- as.character(substitute(x))
if(isTRUE(x %in% .packages(all.available=TRUE))) {
eval(parse(text = sprintf("require(\"%s\")", x)))
} else {
update.packages(ask= FALSE) #update installed packages.
eval(parse(text = sprintf("install.packages(\"%s\", dependencies = TRUE)", x)))
}
if(isTRUE(x %in% .packages(all.available=TRUE))) {
eval(parse(text = sprintf("require(\"%s\")", x)))
} else {
source("http://bioconductor.org/biocLite.R")
biocLite(character(), ask=FALSE) #update installed packages.
eval(parse(text = sprintf("biocLite(\"%s\")", x)))
eval(parse(text = sprintf("require(\"%s\")", x)))
}
}
# Load all basic necessary libraries ---------------------------------------------------------
suppressMessages(install.packages.auto(rmarkdown))
suppressMessages(install.packages.auto(htmlwidgets))
suppressMessages(install.packages.auto(plotly))
suppressMessages(install.packages.auto(visNetwork)) # for network
# Prepare dataset ---------------------------------------------------------
# data() # show all built-in datasets
# Most used R built-in data sets
# mtcars: Motor Trend Car Road Tests
# iris
# ToothGrowth
# PlantGrowth
# USArrests
set.seed(1981)
# example:
# data preparation
matrix_test1 <- matrix(runif(2000), nrow = 200)
matrix_test2 <- matrix(runif(2000)+ runif(200), nrow = 200)
matrix_test3 <- matrix(runif(2000)+ 2*runif(200), nrow = 200)
matrix_test <- cbind(matrix_test1,matrix_test2, matrix_test3)
rownames(matrix_test) <- paste("COG", 1:200, sep = "_")
colnames(matrix_test) <- paste("sample", 1:30, sep = "_")
factor_test <- c(rep("A",10),rep("B",10),rep("C",10))
meta_test <- data.frame(samplename = colnames(matrix_test), grouping = factor_test)
# Randomize the order
meta_test <- meta_test[sample(1:nrow(meta_test)),]
rownames(meta_test) <- NULL
# basic_plotting ----------------------------------------------------------
# ternary_plot_matrix plots a ternary plot from three rows of the matrix;
# the matrix columns are samples, rows are functions
# grouping information helps to visualize more information
#
# example:
# data preparation
# matrix_test <- matrix(runif(2000), nrow = 20)
# rownames(matrix_test) <- paste("COG", 1:20, sep = "_")
# colnames(matrix_test) <- paste("sample", 1:100, sep = "_")
# factor_test <- sample(LETTERS[1:3], 100, replace = TRUE)
# meta_test <- data.frame(samplename = colnames(matrix_test), grouping = factor_test)
# run three cases
#t <- ternary_plot_matrix(data_matrix = matrix_test, data_meta = meta_test, three_points = c("COG_1", "COG_2", "COG_3"))
# t$ternary.plot
#t <- ternary_plot_matrix(data_matrix = matrix_test, data_meta = meta_test, three_points = "top3")
# t$ternary.plot
#t <- ternary_plot_matrix(data_matrix = matrix_test, data_meta = meta_test, three_points = c(2,4,5))
# t$ternary.plot
ternary_plot_matrix <- function(data_matrix, data_meta, three_points){
suppressMessages(install.packages.auto(ggtern))
mode(data_matrix) <-"numeric" # in case mode is character
if(length(three_points) ==1 && three_points == "top3"){
three_points = rownames(data_matrix)[1:3]
} else if(is.numeric(three_points)){
# check if some of the location out of boundary(nrow)
if(any(unlist(lapply(three_points, function(x) x > nrow(data_matrix))))){
stop("row index out of boundary (> number of rows of the matrix)")
}else{
three_points = rownames(data_matrix)[three_points]
}
}else if(is.character(three_points)){
index <- match(three_points, rownames(data_matrix))
if( any(unlist(lapply(index, is.na))) ) {
stop("item_name Not found, please input the right one")
}
}
select_3 <- as.data.frame(t(data_matrix[which(rownames(data_matrix) %in% three_points),]))
# select_3 <- as.data.frame(scale(select_3), scale = FALSE) # rescale
# reorder the data_meta, in case there is mis-match of the column names
data_meta_Reorder <- data_meta[match(rownames(select_3), data_meta[,1]),]
select_3$Group <- data_meta_Reorder[,2]
colnames(select_3)<-c("x","y","z", "Group")
p <- ggtern(data=select_3,aes(x,y,z,color=Group)) + geom_point() + labs(x=three_points[1],y=three_points[2],z=three_points[3],title="Ternary Plot")
return(list(ternary.plot = p,
matrix_selected = select_3
))
}
# do a simple row plot according to the grouping information
# plot_type = "violin" or "box"
# example:
# matrix_test <- matrix(rnorm(2000), nrow = 20)
# factor_test <- sample(LETTERS[1:3], 100, replace = TRUE)
# meta_test <- data.frame(samplename = colnames(matrix_test), grouping = factor_test)
# run: plot single row
# matrix_boxplot_byrow_bygroup(row = 1, data_matrix = matrix_test, data_meta = meta_test, plot_type = "violin")
# run: all rows, result into a list
# plot_all_rows <- lapply(1:nrow(matrix_test), matrix_boxplot_byrow_bygroup, data_matrix = matrix_test, data_meta = meta_test)
matrix_boxplot_byrow_bygroup <- function(row = 1,data_matrix, data_meta, plot_type = "violin"){
# reorder the meta
data_meta <- data_meta[match(colnames(data_matrix), data_meta[,1]),]
# check the sequence
if(!all(colnames(data_matrix) == data_meta[,1])){stop("unmatched")}
if(is.numeric(row)){
index <- row
if(index > nrow(data_matrix) | index <= 0) stop("row number is out of range (1..nrow of the matrix), please input the right one")
}else if(is.character(row)){
index <- match(row, rownames(data_matrix))
if(is.na(index)) stop("item_name Not found, please input the right one")
}
# plot single here
# generate a simple database
plot_title <- rownames(data_matrix)[index]
plot <- simple_ggboxplot_vector(vector = data_matrix[index,], factor = data_meta[,2], plot_type = plot_type, title = plot_title)
return(plot)
}
# this function does a very simple box or violin plot using given vector values and grouping information
# vector and factor must correspond element-wise
# example:
# vector_test <- rnorm(200)
# factor_test <- sample(LETTERS[1:3], 200, replace = TRUE)
# simple_ggboxplot_vector(vector_test, factor_test)
# simple_ggboxplot_vector(vector = vector_test, factor = factor_test, plot_type = "box", title = "test")
simple_ggboxplot_vector <- function(vector = NULL, factor = NULL, plot_type = "violin", title = ""){
install.packages.auto(ggplot2)
if(is.null(vector) | is.null(factor)){
stop("Neither \"vector\" or \"factor\" has default values, please define")
}else{
data_df <- data.frame(groups = as.factor(factor), Values = vector)
p <- ggplot(data_df,aes_string("groups","Values",fill="groups"))
if (plot_type == "violin"){
p1 <- p+geom_violin() + ggtitle(title) + theme(plot.title = element_text(hjust = 0.5))
} else if (plot_type == "box"){
p1 <- p+geom_boxplot(alpha=0.3, width=0.9) + geom_jitter(shape=21) + ggtitle(title) + theme(plot.title = element_text(hjust = 0.5))
}
return(plot = p1)
}
}
# this function is a convenient way to plot column by column and combine the plot output into a list,
# which makes it very easy to store a good number of plots at a time
# usage
# generate a test data frame
# data_test <- as.data.frame(matrix(rnorm(200), ncol = 10))
# using a list apply to iterate
# myplots <- lapply(colnames(data_test), plot_df_bycolumn_into_list, data = data_test)
plot_df_bycolumn_into_list = function (data, column){
ggplot(data = data, aes_string(x = column)) +
geom_density(fill = "lightgreen") + xlab(column)
}
# the plot function could be changed into any type
# http://peterhaschke.com/Code/multiplot.R
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
require(grid)
plots <- c(list(...), plotlist)
numPlots = length(plots)
if (is.null(layout)) {
layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
ncol = cols, nrow = ceiling(numPlots/cols))
}
if (numPlots==1) {
print(plots[[1]])
} else {
grid.newpage()
pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
for (i in 1:numPlots) {
matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
layout.pos.col = matchidx$col))
}
}
}
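# example (a minimal sketch; any ggplot2 objects work here, using simple_ggboxplot_vector from above):
# p_a <- simple_ggboxplot_vector(rnorm(90), rep(LETTERS[1:3], 30), plot_type = "box", title = "box")
# p_b <- simple_ggboxplot_vector(rnorm(90), rep(LETTERS[1:3], 30), title = "violin")
# multiplot(p_a, p_b, cols = 2)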
# accepts a data.frame with the following columns:
# "FunctionName" "Number_in_group" "Number_matched" "pvalue" "pvalue_adjusted"
# plots a graph out directly
#p1 <- barplot_enrichemt(df.match = matched_categories, top20 = "FALSE")
# for a plot of the top 20 categories only
#p2 <- barplot_enrichemt(df.match = matched_categories, top20 = "TRUE")
barplot_enrichemt <- function(df.match, title.tag = NULL, top20 = "TRUE"){
# adjusted p-values of exactly 0 would give -log10 = Inf; treat them as missing,
# then replace all missing values with the smallest observed adjusted p-value
df.match[which(df.match$pvalue_adjusted == 0),5] <- NA
df.match[which(is.na(df.match$pvalue_adjusted)),5] <- min(df.match$pvalue_adjusted, na.rm = TRUE)
postscript("temp")
dev.control('enable')
if( top20 == "TRUE") {
if(nrow(df.match) > 20){
df.match <- df.match[1:20,]
}
par(mar = c(5,20,8,5),mgp=c(2,0.3,0.5),tck=-.01,cex.axis = 0.8)
m <- df.match$Number_matched
names(m) <-df.match$FunctionName
barplot(rev(m),cex.names = 0.8,font.axis = 3,axes=F,border = NA,las =1,
horiz = T,xlim = c(0, 1.2*max(m)), main = paste(title.tag, " Enrichment Analysis", sep = "") )
z<-seq(0, ceiling(max(m)), by = ceiling(max(m))/4)
axis(side = 1,at = z,col="grey",line = -0.5)
par(new = T)
p_for_plot <- -log10(df.match$pvalue_adjusted)
plot(rev(p_for_plot),1:length(p_for_plot),type="l",lwd = 2,col="red",axes=F,xlab=NA,ylab=NA,xlim = c(0, ceiling(max(p_for_plot))))
abline(v = -log10(0.01),lwd = 1, col="red" ,lty = "dotted" )
z<-seq(0, ceiling(max(p_for_plot)), by = ceiling(max(p_for_plot))/4)
axis(side = 3,at = z,col="red",line = -0.5, col.axis = 'red')
mtext(side = 3, line = 1.2,cex= 0.9, col="red",expression(-log[10](italic(p.adjusted))))
}else if (top20 == "FALSE"){
par(mar = c(5,5,8,5),mgp=c(2,0.3,0.5),tck=-.01,cex.axis = 0.8)
m <- df.match$Number_matched
names(m) <-df.match$FunctionName
barplot(rev(m),yaxt = "n",cex.names = 0.8,font.axis = 3,axes=F,border = NA,las =1,
horiz = T,xlim = c(0, 1.2*max(m)), main = paste(title.tag, " Enrichment Analysis", sep = "") )
z<-seq(0, ceiling(max(m)), by = ceiling(max(m))/4)
axis(side = 1,at = z,col="grey",line = -0.5)
par(new = T)
p_for_plot <- -log10(df.match$pvalue_adjusted)
plot(rev(p_for_plot),1:length(p_for_plot),type="l",lwd = 2,col="red",axes=F,xlab=NA,ylab=NA,xlim = c(0, ceiling(max(p_for_plot))))
abline(v = -log10(0.01),lwd = 1, col="red" ,lty = "dotted" )
z<-seq(0, ceiling(max(p_for_plot)), by = ceiling(max(p_for_plot))/4)
axis(side = 3,at = z,col="red",line = -0.5, col.axis = 'red')
mtext(side = 3, line = 1.2,cex= 0.9, col="red",expression(-log[10](italic(p.adjusted))))
}
EA.plot <- recordPlot()
dev.off()
return(EA.plot)
}
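# example with synthetic data (a sketch; matched_categories above is assumed to come from an
# enrichment analysis, so here a small data.frame with the required columns is built by hand):
# df_demo <- data.frame(FunctionName = paste0("Category_", 1:25),
#                       Number_in_group = sample(20:100, 25),
#                       Number_matched = sample(5:20, 25),
#                       pvalue = runif(25, 0, 0.05))
# df_demo$pvalue_adjusted <- p.adjust(df_demo$pvalue, method = "fdr")
# barplot_enrichemt(df.match = df_demo, title.tag = "Demo", top20 = "TRUE")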
ternary_plot <- function(data_matrix, data_meta, three_points){
install.packages.auto(ggtern)
select_3 <- as.data.frame(t(data_matrix[which(rownames(data_matrix) %in% three_points),]))
#select_3 <- as.data.frame(scale(select_3), scale = FALSE) # rescale
index <- match(rownames(select_3), data_meta[,1])
data_meta_Reorder <- data_meta[index,]
select_3[,4] <- data_meta_Reorder[,2]
#select_3_hits[,5] <- data_meta_Reorder[,1]
colnames(select_3)<-c("x","y","z", "Group")
p<- ggtern(data=select_3,aes(x,y,z,color=Group)) +geom_point() +labs(x=three_points[1],y=three_points[2],z=three_points[3],title="Ternary Plot")
return(list(ternary.plot = p,
matrix_selected = select_3
))
}
#________________________________________________________________________________________
# color brewer section
#________________________________________________________________________________________
install.packages.auto("RColorBrewer")
# display.brewer.all() # to display the color panels
# display.brewer.pal(7,"BrBG")
# display.brewer.pal(7,"Accent") # a good color panel for qualitative data
# mycolor_gradient <- brewer.pal(7,"Greens")
# mycolor_discret <- brewer.pal(7,"Accent")
# brewer.pal
# mycol <- colorRampPalette(c("blue", "white", "red"))(100)
#________________________________________________________________________________________
# output_SVG_png
#________________________________________________________________________________________
# ___Description___:
# 1: This function writes out any recorded or ggplot2 plot in svg and png format
# 2: for figures not supported by grDevices recording, use output_SVG_png2 for plotting
# ___Arguments___:
# path: must end with a trailing "/"; default is "./plots/"
# filename_tag: name tag for the output files
# width and height: the usual device dimensions
#____Usage____;
# filename_tag and graph are mandatory
# ___Values___:
# this function has no meaningful return value; it only writes figures to the hard drive
# if writing succeeds, it returns "File saved!"
output_SVG_png <- function(path = "./plots/", filename_tag, graph, width = 10, height = 8){
install.packages.auto(svglite) # the svg driver, letting R output figures in svg format
install.packages.auto(rsvg) # converts svg into other formats
svglite(paste(path, filename_tag, ".svg", sep=""), width = width, height = height)
replayPlot(graph) # replot the graph
dev.off()
rsvg_png(paste(path, filename_tag, ".svg", sep=""), paste(path, filename_tag, ".png", sep=""))
return("File saved!")
}
output_SVG_png2 <- function(path = "./plots/",filename_tag, graph, width = 10, height = 8){
install.packages.auto(rsvg) # converts svg into other formats
svg(paste(path,filename_tag, ".svg", sep=""),onefile = TRUE, width = width, height = height)
plot(graph) # replot the graph
dev.off()
rsvg_png(paste(path,filename_tag, ".svg", sep=""), paste(path,filename_tag, ".png", sep=""))
return("File saved!")
}
output_SVG_png_plotlist <- function(path = "./plots/", plotlist, prefix = NULL){
file_NamesTags <- names(plotlist)
for(i in 1:length(plotlist)){
try(output_SVG_png(path = path, filename_tag = paste(prefix, file_NamesTags[i], sep = "_"), graph = plotlist[[i]]), silent = TRUE)
}
}
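# example (a sketch; assumes the "./plots/" directory exists or is created first):
# dir.create("./plots/", showWarnings = FALSE)
# output_SVG_png2(filename_tag = "pca_demo", graph = PCA_plot(matrix_test, meta_test)$pca.plot)
# note: output_SVG_png expects a plot recorded with recordPlot(); output_SVG_png2 replots ggplot2 objects via plot()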
#________________________________________________________________________________________
# PCA_plot
#________________________________________________________________________________________
# ___Description___:
# 1: classical 2D plot of the first two principal components
# ___Arguments___:
# data matrix in tidy format
# grouping information is used just for plotting; if not given, there will be no grouping on the figure
# grouping information is required to be a data.frame, with one column of sample names and one column of groups
#____Usage____;
# PCA_plot(data_matrix, grouping)$pca.plot
# ___Values___:
# pca.plot as a ggplot2 object, to plot out the PCA plot
#
#p <- PCA_plot(data_matrix = matrix_test, grouping = meta_test)
PCA_plot<-function(data_matrix, grouping){
suppressMessages(install.packages.auto(ggplot2))
suppressMessages(install.packages.auto(ggfortify))# for autoplot
data_matrix_t<-t(data_matrix)
if(missing(grouping)){
p1<-autoplot(prcomp(data_matrix_t), label = TRUE )
}else{
# reorder the meta (grouping information); this must only happen when grouping is supplied
grouping <- grouping[match(colnames(data_matrix), grouping[,1]),]
# check the order
if(any(colnames(data_matrix) != grouping[,1])){stop("unmatched sample names/column names")}
# combine the grouping with the transposed matrix so autoplot can colour by group
data_matrix_t_merge <- merge(grouping, data_matrix_t, by.y=0, by.x = colnames(grouping)[1])
row.names(data_matrix_t_merge)<-data_matrix_t_merge[,1]
p1<-autoplot(prcomp(data_matrix_t), data = data_matrix_t_merge, colour = colnames(data_matrix_t_merge)[2],label = TRUE )
}
p1<-p1+labs(title = "PCA Clustering")
p1<-p1+theme_bw()+theme(plot.title = element_text(hjust = 0.5))
return(list(pca.plot = p1))
}
#________________________________________________________________________________________
# PCA_plot_with_ellipse_kmeans
# PCA_plot_with_ellipse_kmeans_2
#________________________________________________________________________________________
# ___Description___:
# 1: do PCA on columns,
# 2: plot a PCA plot, with ellipses on the groups
# 3: note this is an indirect strategy: the prcomp result is used to plot the points, and the kmeans grouping result is used to draw the groups
# # ellipses are drawn at the 0.95 confidence level
# ___Argument___:
# a data matrix, with samples as columns and features as rows;
# grouping is a two-column data.frame (sample name, group)
#___Usage___:
# p <- PCA_plot_with_ellipse_kmeans(t(iris[,1:4]), data.frame(sample = rownames(iris), group = iris$Species))
# p
# ___Values___:
# a ggplot2 plot, with points and ellipses
#
PCA_plot_with_ellipse_kmeans <- function(data_matrix, grouping){
suppressMessages(install.packages.auto(ggplot2))
pca <- prcomp(t(data_matrix), retx=T, scale.=T) # do pca
scores <- pca$x # all PC scores
#scores <- pca$x[,1:3]
# k-means clustering, with as many centers as there are groups
number_of_groups <- length(levels(as.factor(grouping[,2])))
km <- kmeans(scores, centers=number_of_groups, nstart=5)
#ggdata <- data.frame(scores, Cluster=km$cluster, Groups=grouping)
ggdata <- data.frame(scores, Cluster=km$cluster)
p<- ggplot(ggdata) +
geom_point(aes(x=PC1, y=PC2, colour=factor(Cluster)), size=3, shape=20) +
geom_text(aes(x=PC1, y=PC2, color=factor(Cluster),label=colnames(data_matrix)))+
stat_ellipse(aes(x=PC1,y=PC2,fill=factor(Cluster)), geom="polygon", level=0.95, alpha=0.2) +
guides(color=guide_legend("Cluster"),fill=guide_legend("Cluster"))+
theme_bw()+theme(plot.title = element_text(hjust = 0.5))
return(p)
}
PCA_plot_with_ellipse_kmeans_2 <- function(prcomp_out, grouping){
suppressMessages(install.packages.auto(ggplot2))
scores <- prcomp_out$x # all PC scores
number_of_groups <- length(levels(as.factor(grouping[,2])))
km <- kmeans(scores, centers=number_of_groups, nstart=5)
#ggdata <- data.frame(scores, Cluster=km$cluster, Groups=grouping)
ggdata <- data.frame(scores, Cluster=km$cluster)
p<- ggplot(ggdata) +
geom_point(aes(x=PC1, y=PC2, colour=factor(Cluster)), size=3, shape=20) +
geom_text(aes(x=PC1, y=PC2, color=factor(Cluster),label=rownames(ggdata)))+
stat_ellipse(aes(x=PC1,y=PC2,fill=factor(Cluster)), geom="polygon", level=0.95, alpha=0.2) +
guides(color=guide_legend("Cluster"),fill=guide_legend("Cluster"))+
theme_bw()+theme(plot.title = element_text(hjust = 0.5))
p<-p+labs(title = "PCA Clustering")+ theme_bw()+theme(plot.title = element_text(hjust = 0.5))
return(p)
}
PCA_plot_with_confidence_2 <- function(prcomp_out, data_meta){
suppressMessages(install.packages.auto(ggplot2))
scores <- as.data.frame(prcomp_out$x) # PC scores
# reorder the meta (grouping information)
data_meta <- data_meta[match(rownames(scores), data_meta[,1]),]
scores$groups <- data_meta[,2]
p<- ggplot(scores, aes(PC1, PC2, color = groups)) +
geom_point() +
stat_ellipse(type = "norm", linetype = 2, level = 0.95)
# confidence level =0.95 for normal distribution
# see for more details: https://ropensci.github.io/plotly/ggplot2/stat_ellipse.html
p<-p+labs(title = "PCA Clustering")+ theme_bw()+theme(plot.title = element_text(hjust = 0.5))
return(p)
}
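# example, using the test data generated above:
# p <- PCA_plot_with_confidence_2(prcomp_out = prcomp(t(matrix_test)), data_meta = meta_test)
# p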
#________________________________________________________________________________________
# PCA_plot_3d_interactive
#________________________________________________________________________________________
# ___Description___:
# interactive 3D plot of the PCA result
# using the 3D functions of plotly
# ___Arguments___:
#
# grouping information is used just for plotting; if not given, there will be no grouping on the figure
# grouping information is required to be a data.frame, with one column named Sample.Name and one column named Groups
#____Usage____;
# PCA_plot_3d_interactive(data_matrix, grouping)$pca.plot
# ___Values___:
# a plotly object
#
#colnames(meta_test) <- c("Sample.Name", "Groups")
#PCA_plot_3d_interactive(data_matrix = matrix_test, grouping = meta_test )
PCA_plot_3d_interactive_1<-function(data_matrix, grouping){
suppressMessages(install.packages.auto(plotly))
data_matrix_t<-t(data_matrix)
loading <- prcomp(data_matrix_t)$x
colnames(grouping) <- c("Sample.Name", "Groups")
loading_merge <- as.data.frame(merge(grouping, loading, by.y=0, by.x = "Sample.Name"))
row.names(loading_merge)<-loading_merge$Sample.Name
p1 <- plot_ly(loading_merge, x = ~PC1, y = ~PC2, z = ~PC3, color = ~Groups, colors = c('#BF382A', '#0C4B8E', "#1ABC9C")) %>%
add_markers() %>%
#add_text(loading_merge, x = ~PC1, y = ~PC2, z = ~PC3,text = ~Sample.Name) %>%
add_text(text = ~Sample.Name) %>%
layout(scene = list(xaxis = list(title = 'PC1'),
yaxis = list(title = 'PC2'),
zaxis = list(title = 'PC3')))
return(p1)
}
PCA_plot_3d_interactive_2<-function(data_matrix, grouping){
suppressMessages(install.packages.auto(plotly))
data_matrix_t<-t(data_matrix)
loading <- as.data.frame(prcomp(data_matrix_t)$x)
grouping <- grouping[match(rownames(loading), grouping[,1]),]
p1 <- plot_ly(loading, x = ~PC1, y = ~PC2, z = ~PC3, color = grouping[,2], colors = c('#BF382A', '#0C4B8E', "#1ABC9C")) %>%
add_markers() %>%
#add_text(loading_merge, x = ~PC1, y = ~PC2, z = ~PC3,text = ~Sample.Name) %>%
add_text(text = grouping[,1]) %>%
layout(scene = list(xaxis = list(title = 'PC1'),
yaxis = list(title = 'PC2'),
zaxis = list(title = 'PC3')))
return(p1)
}
PCA_plot_3d_interactive_3<-function(prcomp_out, grouping){
suppressMessages(install.packages.auto(plotly))
loading <- as.data.frame(prcomp_out$x)
# reorder the meta (grouping information)
grouping <- grouping[match(rownames(loading), grouping[,1]),]
loading$Sample.Name <- rownames(loading)
loading$Groups <- grouping[,2]
#loading_merge <- as.data.frame(merge(grouping, loading, by.y=0, by.x = "Sample.Name"))
#row.names(loading_merge)<-loading_merge$Sample.Name
p1 <- plot_ly(loading, x = ~PC1, y = ~PC2, z = ~PC3, color = grouping[,2], colors = c('#BF382A', '#0C4B8E', "#1ABC9C")) %>%
add_markers() %>%
#add_text(loading_merge, x = ~PC1, y = ~PC2, z = ~PC3,text = ~Sample.Name) %>%
add_text(text = grouping[,1]) %>%
layout(scene = list(xaxis = list(title = 'PC1'),
yaxis = list(title = 'PC2'),
zaxis = list(title = 'PC3')))
return(p1)
}
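# example for the *_2 and *_3 variants, using the test data generated above:
# PCA_plot_3d_interactive_2(data_matrix = matrix_test, grouping = meta_test)
# PCA_plot_3d_interactive_3(prcomp_out = prcomp(t(matrix_test)), grouping = meta_test)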
#________________________________________________________________________________________
# PCA_Screeplot
# PCA_Screeplot_2
#
#________________________________________________________________________________________
# ___Description___:
# plot a PCA scree plot, which shows how much of the variance each principal component explains
# a good PCA needs the first 2-3 components to explain most of the variance; otherwise the separation is not good
# the PCA analysis was done by prcomp
# an alternative is to use recordPlot to record the last plot as an object, which can be replayed later where it was invoked; a list of plots can be returned this way
# a better alternative is to use ggplot2, where all plots are objects
# ___Arguments___:
# PCA_Screeplot: data matrix as input
# PCA_Screeplot_2: the output of prcomp as input, in case you have already finished the PCA analysis
#____Usage____;
# PCA_Screeplot(data_matrix)
# PCA_Screeplot_2(prcomp.out)
# ___Values___:
# plots a scree plot
#
PCA_Screeplot<-function(data_matrix){
suppressMessages(install.packages.auto(ggplot2))
pca.output <- prcomp(t(data_matrix), scale.=TRUE, center = TRUE)
sd <- pca.output$sdev
scores <- pca.output$x
var <- sd^2
var.percent <- var/sum(var) * 100
#barplot(var.percent, xlab="Principal Component", ylab="Percent of Variance", names.arg=1:length(var.percent), las=1, ylim=c(0,max(var.percent)), col="gray", main="Percent of Variance")
#abline(h=1/nrow(pca.output$rotation)*100, col="red")
#p1 <- recordPlot()
p1<-ggplot()+geom_bar(aes(x=c(1:length(var.percent)),y=var.percent), stat="identity")
p1<-p1+geom_hline(yintercept = 1/nrow(pca.output$rotation)*100, colour = "red")
p1<-p1+labs(x = "Princaple Component Number",y="Percent of Variance",title = "Screeplot of Variance")
p1<-p1+theme_bw()+theme(plot.title = element_text(hjust = 0.5))
return(list(Scree.plot = p1))
}
# accept pca result from prcomp
PCA_Screeplot_2<-function(prcomp_out){
suppressMessages(install.packages.auto(ggplot2))
# get the values
sd <- prcomp_out$sdev
scores <- prcomp_out$x
var <- sd^2
var.percent <- var/sum(var) * 100
#barplot(var.percent, xlab="Principal Component", ylab="Percent of Variance", names.arg=1:length(var.percent), las=1, ylim=c(0,max(var.percent)), col="gray", main="Percent of Variance")
#abline(h=1/nrow(pca.output$rotation)*100, col="red")
#p1 <- recordPlot()
p1<-ggplot()+geom_bar(aes(x=c(1:length(var.percent)),y=var.percent), stat="identity")
p1<-p1+geom_hline(yintercept = 1/nrow(prcomp_out$rotation)*100, colour = "red")
p1<-p1+labs(x = "Princaple Component Number",y="Percent of Variance",title = "Screeplot of Variance")
p1<-p1+theme_bw()+theme(plot.title = element_text(hjust = 0.5))
return(list(Scree.plot = p1))
}
# sub function for visualization
#________________________________________________________________________________________
# simple_venn_plot_from_list
#________________________________________________________________________________________
# ___Description___:
# 1: a simple function that transforms a list of vectors into a presence matrix and plots an Euler/Venn diagram
# 2: the shiny version can be found: http://jolars.co/shiny/eulerr/
# another venn tool: http://www.biovenn.nl/index.php
# ___Arguments___:
# 1: the function accepts a list of vectors, with names, see the usage
#
#____Usage____;
# p <-simple_venn_plot_from_list(vector_list = list( "list_A"= A, "list_B" = B, "list_C" = C))
# ___Values___:
# a plot
#
# ___Note____;
# this object is grid-based, so there is no need to record it
# test:
# A <- sample (LETTERS, 10)
# B <- sample (LETTERS, 15)
# C <- sample (LETTERS, 7)
# D <- sample (LETTERS, 20)
# simple_venn_plot_from_list(vector_list = list( "list_A"= A, "list_B" = B))
# simple_venn_plot_from_list(vector_list = list( "list_A"= A, "list_B" = B, "list_C" = C))
# simple_venn_plot_from_list(vector_list = list( "list_A"= A, "list_B" = B, "list_C" = C, "list_D" = D))
simple_venn_plot_from_list <-function(vector_list = list( "list_A"= A, "list_B" = B, "list_C" = C)){
install.packages.auto(eulerr) #euler
install.packages.auto("RColorBrewer") # color brewer
all <- unique(unlist(vector_list))
match_list <- lapply(vector_list,function(x) all %in% x)
presence_matrix <- matrix(unlist(match_list), ncol =length(match_list) , byrow = FALSE)
colnames(presence_matrix) <- names(vector_list)
fit <- euler(presence_matrix)
mycolor_discret <- brewer.pal(4,"Set1") [1:length(vector_list)]
p <- plot(fit, counts = TRUE, fill_opacity = 0.7, lty = 0,fontface = "italic", fill = mycolor_discret)
return(p) # this object is grid based one, so no need to record
}
#________________________________________________________________________________________
# correlation_matrix_plot
#________________________________________________________________________________________
# ___Description___:
# 1: do correlation analysis of columns,
# 2: visualize the resulting correlation matrix, with p-values
# 3: a simple wrap of two functions: Hmisc::rcorr and corrplot::corrplot
# ___Arguments___:
# data matrix:
# a data matrix for the column correlation,
# method_type: character, the visualization method of the correlation matrix to be used. Currently,
# seven methods are supported, named "circle" (default), "square", "ellipse", "number", "pie", "shade" and "color".
# See examples for details.
# The areas of circles or squares show the absolute value of the corresponding correlation coefficients. Methods "pie" and "shade" come from Michael Friendly's work (with some adjustment to the shading), and "ellipse" comes from D.J. Murdoch and E.D. Chow's work; see the corrplot references.
# order_type:
# corresponds the order or corrplot::corrplot
# "original" for original order (default).
# "AOE" for the angular order of the eigenvectors.
# "FPC" for the first principal component order.
# "hclust" for the hierarchical clustering order.
# "alphabet" for alphabetical order.
#____Usage____;
# t <- correlation_matrix_plot(mtcars, order_type = "hclust", plot_type ="color")
# t$corrplot
# t$corrmatrix
# ___Values___:
# a corrplot and a very detailed matrix plot, see PerformanceAnalytics::chart.Correlation for more details
# very cool plot
correlation_matrix_plot <- function(data_matrix, order_type = "hclust", plot_type ="circle" ){
suppressMessages(install.packages.auto(Hmisc))
suppressMessages(install.packages.auto(corrplot))
suppressMessages(install.packages.auto(PerformanceAnalytics))
correlation_matrix <- rcorr(as.matrix(data_matrix))
postscript("temp") # postscript works cross-platform (win.metafile is Windows-only)
dev.control('enable')
corrplot(correlation_matrix$r, type="lower", order=order_type, p.mat = correlation_matrix$P, sig.level = 0.01, insig = "pch", method =plot_type)
p1 <- recordPlot()
dev.off()
postscript("temp")
dev.control('enable')
suppressWarnings(chart.Correlation(data_matrix, histogram=TRUE, pch= "+"))
p2 <- recordPlot()
dev.off()
return(list(corrplot = p1, corrmatrix = p2))
}
#________________________________________________________________________________________
# matrix_ggboxplot
#________________________________________________________________________________________
# ___Description___:
# ggplot2 is powerful for boxplots and violin plots, but needs some preprocessing of the data matrix, which can be a bit tricky
# here the steps are wrapped up to provide an easy interface
# ___Arguments___:
# data_matrix: data matrix
# xlabel, ylabel, maintitle
#____Usage____;
# boxplot_result <- matrix_ggboxplot(data_matrix, xlabel="Sample", ylabel = "Value", maintitle = "Distribution")
# plot by: boxplot_result$boxplot, boxplot_result$violinplot
# ___Values___:
# a list of plot, the first object boxplot, second one is violinplot
matrix_ggboxplot<-function(data_matrix, xlabel="Samples", ylabel = "Value", maintitle = "Distribution"){
suppressMessages(install.packages.auto(ggplot2))
data_matrix_melt<-reshape2::melt(as.matrix(data_matrix))
# in data_matrix_melt, Var1 holds the original row names, Var2 the original column names, and value the original values
p1<-ggplot(data_matrix_melt, aes(x = Var2, y = value, fill=Var2))+geom_boxplot()
p1<-p1+labs(x = xlabel,y=ylabel,title = maintitle, fill = xlabel)
p1<-p1+theme_bw()+theme(plot.title = element_text(hjust = 0.5))
p2<-ggplot(data_matrix_melt, aes(x = Var2, y = value, fill=Var2)) +geom_jitter(shape=21,alpha=0.3) +geom_violin()
p2<-p2+labs(x = xlabel,y=ylabel,title = maintitle)
p2<-p2+theme_bw()+theme(plot.title = element_text(hjust = 0.5)) + guides(fill=guide_legend(title=xlabel))
return(list(boxplot = p1, violinplot = p2))
}
#________________________________________________________________________________________
# matrix_quick_heatmap
#________________________________________________________________________________________
# bug fixed
# 20191002 add a line to replace the infinite values; heatmap.2 can deal with NA, but not infinite values
matrix_quick_heatmap <- function(matrix,
scale = "row",
col_groupcolor_factor = NULL ,
row_groupcolor_factor = NULL,
Col_tree = TRUE){
install.packages.auto(gplots)
matrix[is.infinite(matrix)] <- NA # in case there is infinite values
svg(tempfile(),onefile = TRUE)
dev.control('enable')
if(is.null(row_groupcolor_factor) & is.null(col_groupcolor_factor)){
# if no grouping information provided
if(Col_tree){
heatmap.2(matrix, col=bluered, trace = "none",
scale = scale,
keysize = 1.5,
density.info = "none",
key.title = "",
key.xlab = "",
key.ylab = "")
}else{
heatmap.2(matrix, col=bluered, trace = "none",
scale = scale,
keysize = 1.5,
density.info = "none",
Colv = FALSE,
dendrogram = "row",
key.title = "",
key.xlab = "",
key.ylab = "")
}
}else if((length(col_groupcolor_factor) > 0) & is.null(row_groupcolor_factor)){
# if only colum grouping information provided
col_groupcolor_factor <- col_groupcolor_factor[order(col_groupcolor_factor[,2]),]
matrix<- matrix[,match(col_groupcolor_factor[,1],colnames(matrix))]
color_labeling <- factor(col_groupcolor_factor[,2]) # removing unused factors
levels(color_labeling) <- rainbow(length(levels(color_labeling))) # rename the
color_labeling <- as.vector(color_labeling)
if(Col_tree){
heatmap.2(matrix,
col=bluered,
ColSideColors = color_labeling,
trace = "none",
scale = scale,
keysize = 1.5,
density.info = "none",
key.title = "",
key.xlab = "",
key.ylab = "")
}else{
heatmap.2(matrix,
col=bluered,
ColSideColors = color_labeling,
trace = "none",
scale = scale,
Colv = FALSE,
dendrogram = "row",
keysize = 1.5,
density.info = "none",
key.title = "",
key.xlab = "",
key.ylab = ""
)
}
}else if((length(row_groupcolor_factor) > 0) & is.null(col_groupcolor_factor)){
# if only row grouping information provided
row_groupcolor_factor <- row_groupcolor_factor[match(rownames(matrix), row_groupcolor_factor[,1]),]
row_groupcolor_factor <- row_groupcolor_factor[order(row_groupcolor_factor[,2]),]
matrix<- matrix[match(row_groupcolor_factor[,1],rownames(matrix)),]
color_labeling <- factor(row_groupcolor_factor[,2]) # removing unused factors
levels(color_labeling) <- rainbow(length(levels(color_labeling))) # rename the
color_labeling <- as.vector(color_labeling)
if(Col_tree){
heatmap.2(matrix,
col=bluered,
RowSideColors = color_labeling, # row grouping colors belong on RowSideColors
trace = "none",
scale = scale,
keysize = 1.5,
density.info = "none",
key.title = "",
key.xlab = "",
key.ylab = "" )
}else{
heatmap.2(matrix,
col=bluered,
RowSideColors = color_labeling,
trace = "none",
scale = scale,
Colv = FALSE,
dendrogram = "row",
keysize = 1.5,
density.info = "none",
key.title = "",
key.xlab = "",
key.ylab = ""
)
}
}
p1 <- recordPlot()
dev.off()
return(p1)
}
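# example (a sketch, using the test data generated above; meta_test provides the two required columns):
# matrix_quick_heatmap(matrix_test, scale = "row", col_groupcolor_factor = meta_test)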
# visualize the data matrix intuitively
matrix_display <- function(matrix,method="square",title.tag = "Display", ...){
suppressMessages(install.packages.auto(corrplot))
postscript("temp")
dev.control('enable')
# corrplot is powerful to visualize
corrplot(matrix, type = "full",is.corr = FALSE, order = "original",
tl.col = "black",tl.cex = 0.5, tl.srt = 45,title = title.tag, method=method,
addgrid.col =NA, outline = FALSE)
p1 <- recordPlot()
dev.off()
return(plot = p1)
}
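# example, displaying a small block of the test matrix:
# matrix_display(matrix_test[1:20, 1:10], method = "square", title.tag = "Demo")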
# data_matrix process and plot --------------------------------------------------------
#________________________________________________________________________________________
# subfunction
# matrix_ANOVA
#________________________________________________________________________________________
# do ANOVA test on matrix, according to given grouping information
matrix_ANOVA<-function(data,groups) {
p_value<-apply(data,1,function(x) anova(lm(as.numeric(x)~as.factor(groups)))$"Pr(>F)"[1]) # the "Pr(>F)" column of the ANOVA table
return(p_value)
}
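# example, using the test data generated above (factor_test matches the column order of matrix_test):
# p_anova <- matrix_ANOVA(matrix_test, factor_test)
# head(sort(p_anova))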
#________________________________________________________________________________________
# subfunction
# matrix_ttest
#________________________________________________________________________________________
# do a preset t test between two groups
# for this function, you need to input the data, the grouping information, and the two group names for comparison
matrix_ttest<-function(data,group_info,group_name1,group_name2) {
group1<-which(group_info==group_name1)
group2<-which(group_info==group_name2)
p_value<-apply(data,1,function(x) t.test(x[group1],x[group2])$p.value)
return(p_value)
}
#similarly
matrix_fold_change<-function(data,group_info,group_name1,group_name2) {
group1<-which(group_info==group_name1)
group2<-which(group_info==group_name2)
fold_change<-apply(data,1,function(x) sum(x[group1])/sum(x[group2]))
return(fold_change)
}
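# example, comparing groups "A" and "B" of the test data generated above:
# p_t <- matrix_ttest(matrix_test, factor_test, "A", "B")
# fc  <- matrix_fold_change(matrix_test, factor_test, "A", "B")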
#________________________________________________________________________________________
# subfunction
# PostHoc
#________________________________________________________________________________________
# do post hoc analysis on a vector with group information
# This function only applies to vector data (one dimension)
# do ANOVA first; if the p-value is significant, do the post hoc analysis, then return the names of the pairs and the p-values
# input: a vector and grouping/factor information
# output: significantly different pairs and corresponding p-values; returns NA if no significance found
PostHoc<-function(vector, factor, p_threshold = 0.05){
p.value_anova <- anova(lm(as.numeric(vector)~as.factor(factor)))$Pr[1]
if(p.value_anova < p_threshold){ # gate the post hoc analysis on the same threshold used for the pairwise picking
p_PostHoc_matrix <- pairwise.t.test(as.numeric(vector),as.factor(factor),p.adj = "fdr")$p.value
p_PostHoc_pairs <- find_p_location(p_PostHoc_matrix, p_threshold = p_threshold) # find_p_location is a self-defined function, see below
#return(p_PostHoc_pairs)
return(paste0("ANOVA p=",p.value_anova,"; ",p_PostHoc_pairs))
}else{
return(NA)
}
}
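# example, on a single row of the test matrix generated above:
# PostHoc(matrix_test[1,], factor_test)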
#________________________________________________________________________________________
# subfunction
# matrix_PostHoc
#________________________________________________________________________________________
# this function uses the homemade function "PostHoc" to run the post hoc analysis on a matrix
# input: data matrix and grouping information
# output: p-values and pairs with significance
matrix_PostHoc<-function(data,groups) {
p_PostHoc_pairs<-apply(data,1,PostHoc,factor=groups) # a good example of passing an extra argument through apply
return(as.data.frame(p_PostHoc_pairs)) # PostHoc is a self-defined function
}
matrix_PostHoc2<-function(data,groups) {
p_PostHoc_pairs<-apply(data,1,PostHoc,factor=groups) # a good example of passing an extra argument through apply
# PostHoc is a self-defined function
return(p_PostHoc_pairs)
}
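# example, on the first ten rows of the test matrix generated above:
# posthoc_table <- matrix_PostHoc(matrix_test[1:10,], factor_test)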
#________________________________________________________________________________________
# subfunction
# find_p_location
#________________________________________________________________________________________
# Description: pick out the significant pairs
# Arguments: p-value matrix from pairwise.t.test-like functions
# Values: list of the pairs
# Details:
# see the data format in the following example
# attach(airquality)
# Month <- factor(Month, labels = month.abb[5:9])
# result<-pairwise.t.test(Ozone, Month)
# the input data is result$p.value
# threshold is 0.05 by default, can be changed to e.g. 0.01
find_p_location<-function(p_matrix, p_threshold=0.05){
xx <- which(p_matrix < p_threshold,arr.ind=TRUE)
if(length(xx) > 0){ # only perform the picking up if there is any significant one
ALL_significant_pairs <- NULL
for(i in 1:nrow(xx)){
#group1 name/dimension1 name: rownames(xx)[i]
#group2 name/dimension2 name: colnames(p_matrix)[xx[i,2]]
#p value: p_matrix[xx[i,1],xx[i,2]]
significant_pair <- paste0(p_matrix[xx[i,1],xx[i,2]],"(",rownames(xx)[i],"~",colnames(p_matrix)[xx[i,2]],"); ")
ALL_significant_pairs <- paste0(ALL_significant_pairs,significant_pair)
}
return(ALL_significant_pairs)
}else{
return("Failed in locating pairs by t.test")
# picks out the significant pairs of the post hoc analysis that follows ANOVA
# sometimes different p.adjust methods are used for the ANOVA and the t tests,
# therefore the failure is flagged here in case of any inconsistency
}
}
#________________________________________________________________________________________
# wrapper
# PCA_wrapper
#________________________________________________________________________________________
# this function is just a wrapper of the PCA analysis
# input is a general data matrix (rows as features, columns as samples)
# will do sequential robust imputation using rrcovNA::impSeqRob
# data_meta is a data.frame with two columns: sample and grouping
# ncomp: number of components to keep; it cannot be larger than the smaller dimension
# Q: the missing value filtering cutoff applied before imputation (see tidy_IntensityMarix_process)
PCA_wrapper_mixOmics <- function(data_matrix = NULL, data_meta = NULL, ncomp = 10, inputation = TRUE, Q = 0.75, ...){
suppressMessages(install.packages.auto(mixOmics))
# reorder the meta (grouping information)
data_meta <- data_meta[match(colnames(data_matrix), as.data.frame(data_meta)[,1]),]
# check the order
if(any(colnames(data_matrix) != data_meta[,1])){stop("unmatched sample names/column names")}
# take the values from the input
X <- as.matrix(t(data_matrix))
Y <- as.data.frame(data_meta)[,2]
# filter and impute the missing values
if(inputation){
X <- tidy_IntensityMarix_process(X, Q = Q, Imputation = TRUE)
}
if(ncomp > ncol(data_matrix)){
ncomp <- ncol(data_matrix)
}
# this part could be done by any method
my.pca <- mixOmics::pca(X, ncomp = ncomp, center = TRUE, scale = TRUE)
PCA_analysis <- PCA_post_analysis(my.pca, Y)
return(list(
PCA_result = my.pca,
plot_PCA_Scree = PCA_analysis$plot_Scree,
plot_PCA_component = PCA_analysis$plot_component
))
}
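# example (a sketch; with inputation = FALSE the raw test matrix is used directly):
# res <- PCA_wrapper_mixOmics(data_matrix = matrix_test, data_meta = meta_test, ncomp = 5, inputation = FALSE)
# res$plot_PCA_component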
# using prcomp as the method to do the PCA, then plot
# test: matrix_test and meta_test are the test data generated above
# p <- PCA_wrapper_prcomp(data_matrix = matrix_test, data_meta = meta_test, inputation = TRUE, Q = 0.75)
# the 3D interactive plot might not work on some terminals
PCA_wrapper_prcomp <- function(data_matrix, data_meta, inputation = TRUE, Q = 0.75){
suppressMessages(install.packages.auto(ggplot2))
suppressMessages(install.packages.auto(ggfortify))# for autoplot
data_matrix_t<-t(data_matrix)
# filter and impute the missing values (including 0)
if(inputation){
data_matrix_t <- tidy_IntensityMarix_process(data_matrix_t, Q = Q, Imputation = TRUE)
}
PCA_result <- prcomp(data_matrix_t)
# scree plot
pca_Scree <- PCA_Screeplot_2(prcomp_out = PCA_result)$Scree.plot
if(missing(data_meta)){ # if there is no data_meta
pca_component <- autoplot(PCA_result, label = TRUE )
pca_component <- pca_component+labs(title = "PCA Clustering")+ theme_bw()+theme(plot.title = element_text(hjust = 0.5))
return(list(pca_result = PCA_result,
pca_component_plot = pca_component,
pca_scree_plot = pca_Scree
))
}else{
# reorder the meta (grouping information)
data_meta <- data_meta[match(colnames(data_matrix), as.data.frame(data_meta)[,1]),]
row.names(data_meta) <- data_meta[,1] # this is only for proper labeling
colnames(data_meta) <- c("Sample.name", "grouping")
# check the order
if(any(colnames(data_matrix) != data_meta[,1])){stop("unmatched sample names/column names")}
pca_component <- autoplot(PCA_result, data = data_meta, colour = "grouping", label = TRUE)
pca_component <- pca_component+labs(title = "PCA Clustering")+ theme_bw()+theme(plot.title = element_text(hjust = 0.5))
pca_kmeans <- PCA_plot_with_ellipse_kmeans_2(prcomp_out = PCA_result, grouping = data_meta)
pca_3d <- PCA_plot_3d_interactive_3(prcomp_out = PCA_result, grouping = data_meta)
pca_confidence <- PCA_plot_with_confidence_2(prcomp_out = PCA_result, data_meta = data_meta)
return(list(pca_result = PCA_result,
pca_component_plot = pca_component,
pca_component_plot_kmeans = pca_kmeans,
pca_component_plot_3d_interactive = pca_3d,
pca_confidence = pca_confidence,
pca_scree_plot = pca_Scree
))
}
}
#________________________________________________________________________________________
# wrapper
# PCA_PLSDA_wrapper
#________________________________________________________________________________________
# this function is just a wrapper of the PCA and PLSDA analysis
# input is a general data matrix (rows as features, columns as samples)
# will do sequential robust imputation using rrcovNA::impSeqRob
PCA_PLSDA_wrapper <- function(data_matrix = NULL, data_meta = NULL, ncomp = 10, inputation = TRUE, Q = 0.75, VIP_threshold = 1){
suppressMessages(install.packages.auto(mixOmics))
rownames(data_matrix) <- 1:nrow(data_matrix)
# reorder the meta (grouping information)
data_meta <- data_meta[match(colnames(data_matrix), as.data.frame(data_meta)[,1]),] # revised @201906013
# check the order
if(any(colnames(data_matrix) != data_meta[,1])){stop("unmatched sample names/column names")}
# take the values from the input
X <- as.matrix(t(data_matrix))
Y <- as.data.frame(data_meta)[,2]# revised @201906013
# filter and impute the missing values
if(inputation){
X <- tidy_IntensityMarix_process(X, Q = Q, Imputation = TRUE)
}
if(ncomp > ncol(data_matrix)){
ncomp <- ncol(data_matrix)
}
# this part could be done by any method
my.pca <- mixOmics::pca(X, ncomp = ncomp, center = TRUE, scale = TRUE)
PCA_analysis<-PCA_post_analysis(my.pca, Y)
# this is the plsda part, start a model, then test, how many component is good
my.plsda <- mixOmics::plsda(X, Y, ncomp = 3)
PLSDA_analysis <- PLSDA_post_analysis(X, my.plsda, VIP_threshold = VIP_threshold)
return( list(
plot_PCA_Scree = PCA_analysis$plot_Scree,
plot_PCA_component = PCA_analysis$plot_component,
plot_PLSDA_component = PLSDA_analysis$plot_PLSDA_component,
plot_PLSDA_heatmap = PLSDA_analysis$plot_PLSDA_heatmap,
plot_PLSDA_AUC = PLSDA_analysis$plot_auc,
plot_PLSDA_VIP = PLSDA_analysis$plot_vip_distribution,
plot_PLSDA_VIP_heatmap = PLSDA_analysis$plot_vip_filtered_variables,
data_PLSDA_VIP_filtered_features = t(PLSDA_analysis$vip.filtered.variables),
data_PLSDA_VIP = PLSDA_analysis$vip.all
))
}
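# example (a sketch; PLSDA can be slow on large matrices):
# res <- PCA_PLSDA_wrapper(data_matrix = matrix_test, data_meta = meta_test, ncomp = 5, inputation = FALSE, VIP_threshold = 1)
# res$plot_PLSDA_component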
PCA_post_analysis <- function(result.pca, Y){
if(missing (result.pca)){
print("No Input!")
}else{
install.packages.auto(mixOmics)
my.pca <-result.pca
postscript("temp")
dev.control('enable')
plot(my.pca)
p1 <- recordPlot()
dev.off()
postscript("temp")
dev.control('enable')
plotIndiv(my.pca, comp = c(1, 2), ind.names = TRUE, group = Y, ellipse = TRUE,legend = TRUE, title = 'PCA plot, PCA comp 1 - 2')
p2 <- recordPlot()
dev.off()
return(list(plot_Scree = p1,
plot_component = p2
))
}
}
#________________________________________________________________________________________
# PLSDA_CrossValication
#________________________________________________________________________________________
# ___Description___:
# 1: do cross-validation of a fitted plsda model
# 2: the purpose is twofold: to evaluate the stability, and to choose the best number of components for a final model
# ___Arguments___:
# result_plsda: a mixOmics plsda result object
# validation: validation method, see ?perf for more options; could be "loo" if the sample size is not big, say less than 50
# folds: see ?perf for more details, usually set to 10
# # NOTE that nrepeat is usually 50~100 times
#____Usage____;
# ___Values___:
#
#
PLSDA_CrossValication <-function(result_plsda, validation = "Mfold", folds =5, nrepeat =5 ){
suppressMessages(install.packages.auto(mixOmics))
print ("Performing cross-validation, this can be very slow depending on the settings")
perf.result_plsda <- mixOmics::perf(result_plsda, validation = validation, folds = folds, progressBar = TRUE, auc = TRUE, nrepeat = nrepeat)
# here add as many plots as needed and return them in the list
#plotIndiv(result_plsda, comp = c(1, 2), ind.names = TRUE, group = Y, ellipse = TRUE,legend = TRUE, title = 'PLSDA plot, Comp 1 - 2')
plot.distance<-plot(perf.result_plsda, overlay = 'measure', sd = TRUE)
return(list(plot.distance = plot.distance
))
}
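# example (a sketch; my.plsda is assumed to be a fitted mixOmics::plsda model as in PCA_PLSDA_wrapper):
# cv <- PLSDA_CrossValication(result_plsda = my.plsda, validation = "Mfold", folds = 5, nrepeat = 10)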
#________________________________________________________________________________________
# PLSDA_post_analysis
#________________________________________________________________________________________
# ___Description___:
# 1: X is the original data matrix, for extracting variables
# 2: the main target is to extract the VIPs and the variables with VIP higher than the VIP_threshold set
# 3: some other figures are also output
# ___Arguments___:
# 1: my.plsda, a mixOmics plsda result object
# 2: VIP_threshold
#____Usage____;
# ___Values___:
# a list of figures and data matrices
# use summary to view
# Postprocess of PLSDA
PLSDA_post_analysis<-function(X, my.plsda, VIP_threshold = 1){
suppressMessages(install.packages.auto(mixOmics))
# ploting and output
Y<-my.plsda$Y
postscript("temp")
dev.control('enable')
plotIndiv(my.plsda, comp = c(1, 2), ind.names = TRUE, group = Y, ellipse = TRUE,legend = TRUE, title = 'PLSDA plot, Comp 1 - 2')
p1 <- recordPlot()
dev.off()
my.side.color <- color.mixo(as.numeric(Y))
postscript("temp")
dev.control('enable')
cim(my.plsda, row.sideColors = my.side.color, row.names = Y)
p2 <- recordPlot()
dev.off()
#plot an auc
postscript("temp")
dev.control('enable')
my.plsda.auroc = mixOmics::auroc(my.plsda, roc.comp = 1)
p3 <- recordPlot()
dev.off()
# extract all the VIPs
my.vip<-vip(my.plsda)
#write.table(my.vip,"Out_VIP_table.txt",sep="\t",row.names = TRUE,col.names = NA)
# overall distribution of all the VIPs
my.vip.plot <- matrix_ggboxplot(my.vip, xlabel="Component", ylabel = "VIP Score", maintitle = "VIP Score Across Component")
p4 <- my.vip.plot$violinplot
# filter VIPs, keeping those above VIP_threshold, and output all the corresponding features/variables
my.vip.filtered <- my.vip[my.vip[,1]> VIP_threshold,]
my.vip.filtered <- my.vip.filtered[order(my.vip.filtered[,1], decreasing = TRUE),]
my.vip.filtered.variables <- X[,as.numeric(row.names(my.vip.filtered))]
#write.table(my.vip.filtered.variables,"Out_ProteinGroups_filtered_VIP.txt",sep="\t",row.names = TRUE,col.names = NA)
# plot the heatmap of the original features
my.vip.filtered.variables.scaled<-scale(my.vip.filtered.variables)
postscript("temp")
dev.control('enable')
cim(my.vip.filtered.variables.scaled,
row.sideColors = my.side.color,
row.names = Y,
col.names = FALSE,
row.cex = 0.5,
scale = TRUE,
center =TRUE)
p5 <- recordPlot()
dev.off()
return(list( plot_PLSDA_component = p1,
plot_PLSDA_heatmap = p2,
plot_auc = p3,
plot_vip_distribution =p4,
plot_vip_filtered_variables = p5,
vip.all = my.vip,
vip.filtered.variables = my.vip.filtered.variables
))
}
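# example (a sketch; PLSDA_post_analysis indexes X by the numeric row names of the VIP table,
# so the feature names are replaced by indices first, as PCA_PLSDA_wrapper does):
# m <- matrix_test; rownames(m) <- 1:nrow(m)
# my.plsda <- mixOmics::plsda(t(m), factor_test, ncomp = 3)
# post <- PLSDA_post_analysis(X = t(m), my.plsda = my.plsda, VIP_threshold = 1)
# post$plot_vip_distribution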
#________________________________________________________________________________________
# flattenCorrMatrix
#________________________________________________________________________________________
# ___Description___:
# 1: flatten a matrix from a correlation analysis, into a data.frame with 5 columns
# ___Arguments___:
# result_rcorr has to be the result object of Hmisc::rcorr function
#____Usage____;
# flattenCorrMatrix(result_rcorr)
# Column 1 : A
# Column 2 : B
# Column 3 : correlation
# Column 4 : the p-values of the correlations
# Column 5 : N of value pairs used after removing missing values
# example
# x <- c(-2, -1, 0, 1, 2)
# y <- c(4, 1, 0, 1, 4)
# z <- c(1, 2, 3, 4, NA)
# v <- c(1, 2, 3, 4, 5)
# result <- rcorr(cbind(x,y,z,v))
# t <-flattenCorrMatrix(result)
flattenCorrMatrix <- function(result_rcorr) {
cormat <- result_rcorr$r
nmat <-result_rcorr$n
pmat <- result_rcorr$P
ut <- upper.tri(cormat)
data.frame(
from = rownames(cormat)[row(cormat)[ut]],
to = rownames(cormat)[col(cormat)[ut]],
cor =(cormat)[ut],
p = pmat[ut],
N_used = nmat[ut]
)
}
# keep only the values passing the threshold (set the rest to NA) and drop rows that are entirely filtered out
matrix_filtering<-function(data.matrix, threshold = 0.01, keep_sign = "<"){
t <- data.matrix
if(keep_sign == "<"){
t[data.matrix >= threshold]<-NA
}else if(keep_sign == "<="){
t[data.matrix > threshold]<-NA
}else if(keep_sign == ">"){
t[data.matrix <= threshold]<-NA
}else if(keep_sign == ">="){
t[data.matrix < threshold]<-NA
}
# drop rows in which every value was filtered out
data.matrix_filtered<-t[!apply(is.na(t),1,all),]
return(data.matrix_filtered)
}
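# example: keep p-values below 0.05 and drop rows with no value left
# p_mat <- matrix(runif(50), nrow = 10)
# matrix_filtering(p_mat, threshold = 0.05, keep_sign = "<")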
# data_preprocess ---------------------------------------------------------
#________________________________________________________________________________________
# readin_tidied_proteingroups
#________________________________________________________________________________________
# ___Description___:
# 1: read in tidied proteinGroups
# 2: read in tidied experimental design
# ___Arguments___:
# file_progeinGroups: full path of the tidied proteinGroups output
# file_experimentDesgin: full path of the tidied experimental design output
#____Usage____;
# ___Values___:
# a list of three values
# data_matrix: a tidy data matrix
# groups: a tidy and matched sample grouping file
# alignment_check: a quality-checking data frame to see whether the sample order is the same between the data matrix and the grouping file
readin_tidied_proteingroups <- function(file_progeinGroups="",
file_experimentDesgin= ""){
# read in proteinGroups
if(nchar(file_progeinGroups) > 0) {
proteinGroups<-read.delim(file_progeinGroups,header=TRUE, row.names = 1, check.names = FALSE)
# make sure that check.names = FALSE, otherwise an X will be prepended to numeric column names
}else{
print("no proteinGroups file given, check your input!")
}
if(nchar(file_experimentDesgin) > 0){
# read in the experimental design
grouping<-read.delim(file_experimentDesgin,header=TRUE, row.names = 1)
# build a side-by-side check of the sample order between the grouping file and the data matrix
alignment_check<-cbind(as.character(grouping[,1]),rownames(proteinGroups))
return(list(data_matrix = proteinGroups,
groups = grouping,
alignment_check = alignment_check
))
}else{
return(list(data_matrix = proteinGroups ))
}
}
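# example (hypothetical file names; adjust to your own tidied exports):
# readin <- readin_tidied_proteingroups(file_progeinGroups = "proteinGroups_tidied.txt",
#                                       file_experimentDesgin = "experimentDesign_tidied.txt")
# readin$alignment_check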
#________________________________________________________________________________________
# readin_raw_proteingroups_filterrows
#________________________________________________________________________________________
# ___Description___:
# 1: read in raw proteinGroups
# 2: only filter out rows marked as contaminant, reversed, or Only.identified.by.site
# ___Arguments___:
# file_progeinGroups: full path of the raw proteinGroups output
# rows_marked_with: usually "Only.identified.by.site", "Reverse", "Potential.contaminant";
# note that the column names may differ slightly between MaxQuant versions
#____Usage____;
# ___Values___:
# a list with 1 value
# data_matrix: a row-filtered data matrix
readin_raw_proteingroups_filterrows <- function(file_progeinGroups="proteinGroups.txt",
rows_marked_with = c("Only.identified.by.site", "Reverse","Potential.contaminant"))
{
# read in proteinGroups
print("Reading in ProteinGroups ....")
proteinGroups<-read.delim(file_progeinGroups,header=TRUE, blank.lines.skip = TRUE)
print("ProteinGroups read in")
# filtering proteinGroups, only keeping expression data
proteinGroups_filtered<-PG_filterout_rows(proteinGroups,rows_marked_with = rows_marked_with)
print("ProteinGroups filtered")
return(list(data_matrix = proteinGroups_filtered))
}
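# example (assumes a MaxQuant proteinGroups.txt in the working directory,
# and the helper PG_filterout_rows defined elsewhere in this project):
# pg <- readin_raw_proteingroups_filterrows("proteinGroups.txt")$data_matrix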
#________________________________________________________________________________________
# IntensityMarix_process
#________________________________________________________________________________________
# ___Description___:
# 1: very basic data processing: log10 transformation, normalization on columns or rows, missing value imputation, etc.
# 2: this function uses the homemade subfunction "missingvalue_filtering", be sure to load it first
# ___Argument___:
# IntensityMatrix: the data input, an intensity matrix of columns selected from the proteinGroups file; it could be the LFQ intensity, intensity, or intensity for a specific labeling state
# Normalize_columns: if set TRUE, normalize column (experiment) means to the same level (to zero) first. Some methods, like PCA, have scaling built in, therefore no normalization is needed beforehand
# Normalize_rows: if set TRUE, normalize row (proteingroups) means to the same level (to zero) afterwards. Some methods, like PCA, have scaling built in, therefore no normalization is needed beforehand
#
# threshold: refer to the function "missingvalue_filtering";
# briefly, 1 means Q100, 0.5 means Q50
# Imputation: if set TRUE, do missing value imputation. Only works when there are missing values. todo: choose different methods
#___Usage___:
#IntensityMarix_process(IntensityMarix, threshold = 1, Imputation = TRUE, Normalize_columns = TRUE, Normalize_rows = TRUE)
# ___Values___:
#
IntensityMarix_process<-function(IntensityMarix, threshold = 1, Imputation = TRUE, Normalize_columns = TRUE, Normalize_rows = TRUE){
IntensityMarix[IntensityMarix==0]<-NaN # replace the 0 with NaN
IntensityMarix_log10<-log10(IntensityMarix) # take log10
# missing value filtering
tempt_filter_result <- missingvalue_filtering(IntensityMarix_log10, threshold)
IntensityMarix_log10_NAfiltered <- tempt_filter_result$data_qualified # home made function [missingvalue_filtering]
IntensityMarix_log10_NAfiltered_filtering_summary <-tempt_filter_result$filtering_summary
p1<-matrix_ggboxplot(IntensityMarix_log10_NAfiltered, maintitle = "Distribution of NA-filtered")$violinplot
p2<-matrix_ggboxplot(IntensityMarix_log10_NAfiltered, maintitle = "Distribution of NA-filtered")$boxplot
if(Imputation == "TRUE"){ # missing value imputation
IntensityMarix<-rrcovNA::impSeqRob(IntensityMarix_log10_NAfiltered)$x
}
if (Normalize_columns == "TRUE"){ # do column scaling, keep in mind that the scale function in R is scaling by column
IntensityMarix<-scale(IntensityMarix)
}
if (Normalize_rows == "TRUE"){ # scaling of each protein
IntensityMarix<-t(scale(t(IntensityMarix)))
}
p3<-matrix_ggboxplot(IntensityMarix, maintitle = "Distribution of NA-filtered & Processed")$violinplot
p4<-matrix_ggboxplot(IntensityMarix, maintitle = "Distribution of NA-filtered & Processed")$boxplot
write.table(IntensityMarix,paste("Out_ProteinGroups_NAfiltered_Scaled",".txt",sep=""),sep="\t",row.names = TRUE,col.names = NA)
return(list(IntensityMarix_processed = IntensityMarix,
IntensityMarix_filtering_summary = IntensityMarix_log10_NAfiltered_filtering_summary,
violinplot.before = p1,
boxplot.before = p2,
violinplot.after = p3,
boxplot.after = p4
))
}
# 20181002 fix a bug for filtering out rows, originally it was applied on columns
# IntensityMarix is a data.frame with raw intensities (no log transformation), columns as samples, rows as items
# Q is the cutoff for missing value filtering: Q = 0.75 keeps rows with at least 75% valid (non-zero, non-NA) values
tidy_IntensityMarix_process<-function(IntensityMarix, Q = 0.75, Imputation = TRUE){
suppressMessages(install.packages.auto(rrcovNA))
# filter out rows with too many missing values (zeros and NAs both count as missing in a raw intensity matrix)
IntensityMarix <- IntensityMarix[rowSums(is.na(IntensityMarix) | IntensityMarix == 0) <= ncol(IntensityMarix)*(1-Q) , ]
IntensityMarix<-log10(IntensityMarix)
IntensityMarix[IntensityMarix==-Inf]<-NA
if(isTRUE(Imputation)){
IntensityMarix<-t(rrcovNA::impSeqRob(t(IntensityMarix))$x)
}
IntensityMarix # returns the imputed log10(intensity); items with too many missing values were filtered out
}
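# example (a hedged sketch; the toy matrix below is made up for illustration):
# toy <- matrix(10^runif(40, 5, 9), nrow = 8,
#               dimnames = list(paste0("item_", 1:8), paste0("s_", 1:5)))
# toy[1, 1:3] <- 0 # item_1 has 60% missing values and is filtered out at Q = 0.75
# tidy_IntensityMarix_process(toy, Q = 0.75, Imputation = TRUE)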
#________________________________________________________________________________________
# missingvalue_filtering
#________________________________________________________________________________________
# ___Description___:
# filter out of a matrix the rows with more NA/infinite values than a preset threshold (inf values can be generated by log transformation or division),
# the threshold is the number of valid values; rows with at least that many valid values are kept as qualified
# only does row-wise filtering; transpose first for column-wise filtering
#__Usage__:
# missingvalue_filtering(data, threshold=3)
# ___Arguments___:
# data: data matrix with missing values, NA/inf
# threshold: how many non-missing values are required per row
# can be given in two ways: either the number of required valid values, or a so-called Q value, a fraction (0 <= Q <= 1) of the number of columns
# Q value == 1: no missing values allowed
# Q value == 0: no filtering
# a Q value is converted to a number of valid values (1 <= number <= ncol(data))
# an error is reported if the threshold is outside this range
# in this function, ceiling is used to convert the fraction; if that is not what you expect, try floor/round etc.
# ___Values___:
# qualified data matrix, not-qualified data matrix, number of rows qualified / not qualified
#
missingvalue_filtering<-function(data.matrix, threshold = 1){
data.matrix[is.infinite(as.matrix(data.matrix))]<-NA # is.infinite does not work on a data.frame; clean up in case there are infinite values
if(threshold < 0|threshold > ncol(data.matrix) ){
stop("Threshold cannot be smaller than 0 or bigger than the number of columns, please check your threshold setting and rerun!!!")
}else{
if(threshold<=1){ # convert the Q value to the required number of valid values
threshold <- ceiling(ncol(data.matrix)*threshold)
}
data_qualified<-data.matrix[which(apply(data.matrix,1,function(x)(sum(is.na(x)))<=(ncol(data.matrix)-threshold))),]
data_not.qualified<-data.matrix[which(apply(data.matrix,1,function(x)(sum(is.na(x)))>(ncol(data.matrix)-threshold))),]
return(list(data_qualified=data_qualified,
filtering_summary = list(data_not.qualified=data_not.qualified,
number.qualified=nrow(data_qualified),
number.not.qualified=nrow(data_not.qualified))
))
}
}
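# worked example (hedged; the toy matrix is made up): with 6 columns and threshold = 0.5,
# ceiling(6 * 0.5) = 3 valid values are required per row
# m <- matrix(rnorm(30), nrow = 5)
# m[1, 1:4] <- NA # row 1 keeps only 2 valid values, so it fails
# missingvalue_filtering(m, threshold = 0.5)$filtering_summary$number.not.qualified # 1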
#________________________________________________________________________________________
# extract_proteinlist_ineachsample
#________________________________________________________________________________________
# ___Description___:
# 1: extract the protein list from a given clean proteinGroups matrix
extract_proteinlist_ineachsample <- function(proteinGroups_tidied_input){
proteinGroups_tidied <- as.data.frame(proteinGroups_tidied_input) # as.list on a matrix would flatten it elementwise, so coerce to a data.frame first
protein_names <- as.vector(rownames(proteinGroups_tidied)) # these protein names are the primary proteinIDs from the proteinGroups file
proteins_listoflist_eachsample <- lapply(as.list(proteinGroups_tidied),function(x){protein_names[which((x != 0))]})
return(list(protein_names_list = proteins_listoflist_eachsample)
)
}
# more general form for processing
extract_idlist_each_column <- function(data_matrix){
data_matrix <- as.data.frame(data_matrix) # coerce so that as.list splits by column
ids <- as.vector(rownames(data_matrix)) # the row names are the IDs, e.g. the primary proteinIDs from a proteinGroups file
ids_list_each_column <- lapply(as.list(data_matrix),function(x){ids[which((x != 0))]})
return(ids_list_each_column)
}
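# example (hedged; the toy frame is made up for illustration):
# m <- data.frame(s1 = c(3, 0, 5), s2 = c(0, 0, 2), row.names = paste0("id_", 1:3))
# extract_idlist_each_column(m) # $s1: "id_1" "id_3"; $s2: "id_3"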
# this works
readin_tsv<- function(file="proteinGroups.txt"){
# read in a tab-separated table, e.g. proteinGroups.txt
print(paste("Reading in", file, "...."))
tsv<-read.delim(file,header=TRUE, blank.lines.skip = TRUE)
print(paste(file, "read in"))
number_of_row = nrow(tsv)
number_of_columns = ncol(tsv)
head_of_table = head(tsv)
return(list(data_matrix = tsv,
number_of_row = number_of_row,
number_of_columns= number_of_columns,
head_of_table = head_of_table
))
}
# this works
readin_experiment_desgin <- function(file_experimentDesgin = "experimentalDesign.txt"){
# read in the experimental design
if(nchar(file_experimentDesgin) > 0){
grouping<-read.delim(file_experimentDesgin,header=TRUE, na.strings = "NA")
# clean the experimental design, in case some raw files are not used in the grouping
grouping <- replace(grouping, grouping == "", NA)
temp<-apply(grouping,1,function(x) length(which(is.na(x))))
grouping<-grouping[which(temp==0),]
print(paste(file_experimentDesgin, " read in and tidied up!"))
return(grouping)
}else{
stop("wrong path")
}
}
# this works
# same logic as readin_experiment_desgin, kept under a more general name for meta tables
readin_meta_data <- function(meta_table = "experimentalDesign.txt"){
# read in the meta table
if(nchar(meta_table) > 0){
grouping<-read.delim(meta_table,header=TRUE, na.strings = "NA")
# clean the meta table, in case some raw files are not used in the grouping
grouping <- replace(grouping, grouping == "", NA)
temp<-apply(grouping,1,function(x) length(which(is.na(x))))
grouping<-grouping[which(temp==0),]
print(paste(meta_table, " read in and tidied up!"))
return(grouping)
}else{
stop("wrong path")
}
}
#________________________________________________________________________________________
# PG_filterout_rows
#________________________________________________________________________________________
# ___Application___:
# 1: parse the proteinGroups data frame, filtering out rows marked as contaminant, reverse, or identified only by site
# ___input___:
# proteinGroups is the data frame read in directly from proteinGroups.txt (MaxQuant default output)
# do not open and edit it in Excel
# remove_rows_marked_with: rows marked with "+" in these columns will be deleted
# ___output___:
# a filtered data matrix
PG_filterout_rows<-function(proteinGroups, remove_rows_marked_with = c("Only.identified.by.site", "Reverse","Potential.contaminant")){
suppressMessages(install.packages.auto(dplyr))
proteinGroups_filtered <- proteinGroups
proteinGroups_filtered$temp.rownames <- rownames(proteinGroups_filtered)
for(filter_name in remove_rows_marked_with) {
print(paste("filtering by",filter_name))
proteinGroups_filtered<-dplyr::filter(proteinGroups_filtered, .data[[filter_name]] != "+") # the .data pronoun passes the column name held in filter_name; this replaces the old lazyeval::interp approach, which no longer works with plain filter()
}
# notice that the row names are silently dropped during filter, even when they exist,
# so restore them from the temporary column
rownames(proteinGroups_filtered) <- proteinGroups_filtered$temp.rownames
proteinGroups_filtered <- proteinGroups_filtered[,-ncol(proteinGroups_filtered)]
return(proteinGroups_filtered)
}
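# example (hedged; the toy proteinGroups-like frame is made up for illustration):
# pg <- data.frame(Only.identified.by.site = c("", "", ""),
#                  Reverse = c("", "+", ""),
#                  Potential.contaminant = c("", "", "+"),
#                  LFQ.intensity.A = 1:3)
# PG_filterout_rows(pg) # keeps only the first row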
#________________________________________________________________________________________
# PG_filtering
#________________________________________________________________________________________
# ___Application___:
# 1: parse the proteinGroups data frame, filtering out rows marked as contaminant, reverse, or identified only by site
# 2: only keep the user-defined expression columns, and return the data matrix
#____input____
# proteinGroups is the data frame read in directly from proteinGroups.txt (MaxQuant default output)
# do not open and edit it in Excel
# rows_marked_with: rows marked with "+" in these columns will be deleted
# cols_starts_with: prefix of the column names to keep; use the maximum length of the common characters
# ___output___:
# a data matrix
PG_filtering<-function(proteinGroups,rows_marked_with = c("Only.identified.by.site", "Reverse","Potential.contaminant"), cols_starts_with = "LFQ.intensity."){
# removing rows marked with
proteinGroups_filtered<-PG_filterout_rows(proteinGroups, rows_marked_with)
# keeping columns starting with
proteinGroups_filtered <- select(proteinGroups_filtered, starts_with(cols_starts_with))
#write.table(proteinGroups_filtered,"Out_ProteinGroups_filtered.txt",sep="\t",row.names = TRUE,col.names = NA)
return(proteinGroups_filtered)
}
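# example (a hedged sketch; "pg" stands for a proteinGroups data frame as read in by readin_tsv):
# lfq <- PG_filtering(pg, cols_starts_with = "LFQ.intensity.")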
#________________________________________________________________________________________
# tidy_proteingroups
#________________________________________________________________________________________
# ___Description___:
# 1: take a proteinGroups data frame as input
# 2: optionally take an experimental design as input
# 3: filter proteingroups by rows
# 4: filter proteingroups by columns
# 5: remove rows with all 0
# 6: filter the experimental design, only keeping samples with grouping information
# 7: filter proteingroups, only keeping samples with grouping information
# 8: reorder the grouping information to match the order of the samples in proteingroups
# 9: double check that the sample names match
# 10: return both the data matrix and the experimental design/grouping
# ___Arguments___:
# proteinGroups: the data frame read in directly from proteinGroups.txt (MaxQuant default output)
# do not open and edit it in Excel; the first ID in the Protein.IDs column becomes the (non-redundant) row name
# experimentDesgin: a data frame whose first column holds the sample names
# remove_rows_marked_with: rows marked with "+" in these columns will be deleted
# keep_cols_starts_with: prefix of the column names to keep; use the maximum length of the common characters
#____Usage____;
# ___Values___:
# a list of three values
# data_matrix: a tidy data matrix
# groups: a tidy and matched sample grouping file
# alignment_check: a quality-checking data frame to verify the sample order is the same between the data matrix and the grouping file
tidy_proteingroups <- function(proteinGroups = NULL,
                               experimentDesgin = NULL,
                               remove_rows_marked_with = c("Only.identified.by.site", "Reverse","Potential.contaminant"),
                               keep_cols_starts_with = "LFQ.intensity.", # the columns to be selected start with this; use the maximum length
                               filter_all_zero = TRUE){
suppressMessages(install.packages.auto(dplyr)) # select command
# rownames to be the primary protein ID
protein.ids_split <- strsplit(as.vector(proteinGroups$Protein.IDs), ";") # this is a list of split name vectors
protein_primary_ids <- unlist(lapply(protein.ids_split, function(x) x[1])) # only keep the first one
rownames(proteinGroups) <- protein_primary_ids # rename the rownames of the matrix
# removing rows marked with
if (length(remove_rows_marked_with) >=1 ){
proteinGroups_filtered<-PG_filterout_rows(proteinGroups, remove_rows_marked_with)
}
# keeping columns starting with
if (length(keep_cols_starts_with) >=1 ){
proteinGroups_filtered <- select(proteinGroups_filtered, starts_with(keep_cols_starts_with)) # select command in dplyr package
}
# shorten column names by removing the "starts with" prefix
colnames(proteinGroups_filtered)<-gsub(keep_cols_starts_with, "", colnames(proteinGroups_filtered))
# remove rows with all 0; this is only for a data value matrix
if (filter_all_zero){
all_zero_rows <- apply(proteinGroups_filtered,1,function(x)all(x == 0))
if (any(all_zero_rows)){ # guard: subsetting with -which() on an empty match would drop every row
proteinGroups_filtered<-proteinGroups_filtered[!all_zero_rows,]
}
print("ProteinGroups filtered")
}
# further select columns present in the experiment design, if not all are used there
if(is.data.frame(experimentDesgin)){
proteinGroups_filtered<- proteinGroups_filtered[,which(colnames(proteinGroups_filtered) %in% experimentDesgin[,1])]
experimentDesgin<-experimentDesgin[match(colnames(proteinGroups_filtered),as.character(experimentDesgin[,1])),] # just in case the order is not the same
# check if the experiment design is aligned properly
alignment_check<-cbind(as.character(experimentDesgin[,1]),colnames(proteinGroups_filtered))
#write.table(proteinGroups_filtered,"Out_ProteinGroups_tidied_up.txt",sep="\t",row.names = TRUE,col.names = NA)
#write.table(experimentDesgin,"Out_ProteinGroups_expdesign.txt",sep="\t",row.names = TRUE,col.names = NA)
return(list(data_matrix = proteinGroups_filtered,
groups = experimentDesgin,
alignment_check = alignment_check
))
}else{
#write.table(proteinGroups_filtered,"Out_ProteinGroups_tidied_up.txt",sep="\t",row.names = TRUE,col.names = NA)
return(list(data_matrix = proteinGroups_filtered,
experimentDesgin = experimentDesgin))
}
}
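# example (a hedged sketch; assumes the MaxQuant output files exist in the working directory):
# pg <- readin_tsv("proteinGroups.txt")$data_matrix
# ed <- readin_experiment_desgin("experimentalDesign.txt")
# tidied <- tidy_proteingroups(pg, experimentDesgin = ed)
# tidied$alignment_check # two columns that should be identical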
#________________________________________________________________________________________
# filter_PSM
#________________________________________________________________________________________
# ___Description___:
# 1: this function is for MSFragger result parsing and filtering; it does not work for other search engines
# ___Arguments___:
# file input can be pepXML or tsv
# the others can be left as default
#____Usage____;
#filter_PSM(file = "MS_QC_60min.pepXML")
#t<- filter_PSM(file = "MS_QC_60min.tsv")
# ___Values___:
# will write a table out,
# and return a data.frame of the filtered table
filter_PSM <- function(file = "file", pepFDR=0.01, score_for_filtering = "hyperscore", decoyprefix="REVERSED_" ){
suppressMessages(install.packages.auto(pepXMLTab))
suppressMessages(install.packages.auto(tools))
filetype = file_ext(file) #library(tools)
if (filetype == "pepXML"){ # if this is a pepXML file, parse it first
tsv<-pepXML2tab(file)
} else if(filetype == "tsv"){# if tsv, read in
tsv <- read.delim(file, header =FALSE)
header= c(
"scanID", "precursor_neutral_mass","retention_time_sec","assumed_charge", "hit_rank","peptide",
"peptide_prev_aa","peptide_next_aa","protein","num_matched_ions","tot_num_ions of matched theoretical fragment ions",
"calc_neutral_pep_mass","massdiff","num_tol_term","num_missed_cleavages","modifications",
"hyperscore","next_score","intercept_of_ep_model","slope_of_pe_mode")
colnames(tsv) <- header
}else{
stop("Wrong input file type!")
}
passed <- PSMfilter(tsv, pepFDR=pepFDR, scorecolumn=score_for_filtering, hitrank=1, minpeplen=6, decoyprefix=decoyprefix) # pass the user settings through instead of hardcoding them
filename_base <-file_path_sans_ext(file) #library(tools)
write.table(passed,paste(filename_base,"_FDR_filtered.txt", sep=""),sep="\t",row.names = TRUE,col.names = NA ,quote = FALSE)
return(list(filtered = passed))
}
#________________________________________________________________________________________
# PSMfilter
#________________________________________________________________________________________
# ___Description___:
# 1: this function overrides the one with the same name from pepXMLTab, because the original has a bug
# 2: the default setting for reading in tables converts characters into factors, so nchar can no longer count their length;
# use as.character to change the target column back
PSMfilter <- function (PSMtab, pepFDR = 0.01, scorecolumn = "mvh", hitrank = 1,
minpeplen = 6, decoyprefix = "rev_", ...)
{
if (dim(PSMtab)[1] == 0) {
message("No input data")
}
else {
PSMtab <- PSMtab[PSMtab$hit_rank <= hitrank, ]
PSMtab[, "peptide"] <-as.character(PSMtab[, "peptide"]) # this line is added to fix the bug,
peptide <- PSMtab[, "peptide"]
idx <- which(nchar(peptide) < minpeplen)
if (length(idx) > 0)
PSMtab <- PSMtab[-idx, ]
NTT <- rep(0, dim(PSMtab)[1])
cut1 <- grep("[KR-]", PSMtab[, "peptide_prev_aa"], fixed = FALSE)
NTT[cut1] <- 1
cut2 <- union(grep("[KR]", substring(PSMtab[, "peptide"],
nchar(PSMtab[, "peptide"])), fixed = FALSE), grep("-",
PSMtab[, "peptide_next_aa"], fixed = TRUE))
NTT[cut2] <- 1
NTT[intersect(cut1, cut2)] <- 2
PSMtab <- cbind(PSMtab, NTT)
PSMtabssubs <- split(PSMtab, paste(PSMtab$assumed_charge,
PSMtab$NTT))
PSMsubpass <- lapply(PSMtabssubs, function(y) {
protein <- y[, "protein"]
index <- grep(decoyprefix, protein, fixed = TRUE)
if (dim(y)[1] == 1) {
if (length(index) == 0)
res <- y
}
else {
proORrev <- rep(0, dim(y)[1])
index <- grep(decoyprefix, protein, fixed = TRUE)
proORrev[index] <- 1
score <- y[, scorecolumn]
score <- as.numeric(score)
tmp <- cbind(score, proORrev)
tmp <- tmp[order(score, decreasing = TRUE), ]
FP <- cumsum(tmp[, 2])
tmp <- cbind(tmp, FP)
calFDR <- unlist(lapply(1:dim(y)[1], function(x) (2 * tmp[x, "FP"])/x))
if (TRUE %in% (calFDR <= pepFDR)) {
cutoff <- tmp[max(which(calFDR <= pepFDR)),
"score"]
res <- y[which(as.numeric(y[, scorecolumn]) >=
cutoff), ]
res
}
}
})
PSMpass <- do.call(rbind.data.frame, PSMsubpass)
rownames(PSMpass) <- NULL
PSMpass
}
}
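# how the FDR estimate above works: within each charge/NTT subgroup, PSMs are sorted by score
# in decreasing order; at rank x with FP decoy hits accumulated so far, the running FDR is
# estimated as 2*FP/x (the target-decoy assumption that decoys mimic false target hits 1:1),
# and the score cutoff is the lowest score whose running FDR still satisfies <= pepFDR.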
# General_functions -------------------------------------------------------
#________________________________________________________________________________________
# do_foreach_file_in_folder
#________________________________________________________________________________________
# ___Description___:
# 1: this function can be used as a template to iterate over all files of a specific file type,
# for automation
# ___Arguments___:
# dir name, and file type(s) given as extension names (without the dot)
# and the function/command name
do_foreach_file_in_folder <- function(function_name, file_extention = c("tsv", "pepXML"), dir){
suppressMessages(install.packages.auto(tools))
if(missing(dir)){
dir <- getwd()
}else{
setwd(dir)
}
inputfiles <- list_files_with_exts(dir, file_extention, full.names = FALSE) # use the argument instead of hardcoded extensions
for(each_inputfile in inputfiles){
print(paste("processing", each_inputfile, "...."))
function_name(each_inputfile)
}
}
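# example: run the PSM filter over every tsv/pepXML file in the working directory
# do_foreach_file_in_folder(filter_PSM, file_extention = c("tsv", "pepXML"))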
#test
# savetable2html(iris, file = "./html/table_network.html")
savetable2html <- function(data_table = NULL, file = NULL){
# prepare the package
suppressMessages(install.packages.auto(DT))
dv <- datatable(data_table, filter = 'top', extensions = c('Buttons','ColReorder', 'Scroller'),
options = list(
autoWidth = TRUE,
dom = 'Bfrtip',
buttons = c('colvis','copy', 'csv', 'excel', 'pdf', 'print'),
colReorder = TRUE,
deferRender = TRUE,
scrollY = 200,
scroller = TRUE)
)
saveWidget2html(widget = dv, file = file )
}
# Interactive plotting by js ----------------------------------------------
# the plot_input has to be a string, with quotes
# test:
# g<-qplot(rnorm(100),rnorm(100))
# i_plot<-ggplotly(g)
# single_plot_to_html(plot_input = "i_plot", header = "interactive plot to html test",HTML_filename = "scatter_plot_Test")
# the temporary Rmd will be in ./RmarkDown/
# and the html will be in ./html/
# all paths are configurable
# knit a complete rmarkdown file
# paths can be customized for both source and output, including the filename of the output file
# the knit_root_dir setting is very important: it sets the knit root to the parent dir of the markdown file,
# which means that render will pretend the input Rmd file is in that parent dir. Another advantage of setting this is that
# when you write the Rmd file, you can assume that the file is in the root dir (which is set here)
single_plot_to_html <- function(plot_input = NULL, header = "", HTML_filename = "myplot" ){
cat("---","\n", sep = "", file = "./RmarkDown/temp.Rmd")
cat("title: \"",header,"\"", "\n", sep = "",file = "./RmarkDown/temp.Rmd", append=TRUE)
cat("output: html_document","\n",sep = "",file = "./RmarkDown/temp.Rmd", append=TRUE)
cat("---","\n",sep = "",file = "./RmarkDown/temp.Rmd", append=TRUE)
cat("```{r, echo=FALSE, message=FALSE, warning=FALSE, fig.width= 10}","\n",sep = "",file = "./RmarkDown/temp.Rmd", append=TRUE)
cat(plot_input,"\n",sep = "",file = "./RmarkDown/temp.Rmd", append=TRUE)
cat("```","\n",sep = "",file = "./RmarkDown/temp.Rmd", append=TRUE)
rmarkdown::render(input= "./RmarkDown/temp.Rmd", output_format = "html_document", knit_root_dir = "../",
output_file = paste(HTML_filename, ".html", sep = ""), output_dir = "./html/")
}
# saveWidget in htmlwidgets does not support relative paths,
#
# here is the modification: convert the relative path into an absolute path
# by doing so, saveWidget2html can save any htmlwidgets-based object into html, which can then be embedded into any webpage
# note that, when not saving into the present dir, saveWidget will not be able to delete the associated folder,
# therefore one extra line is used to delete the folder
# test
# g<-qplot(rnorm(100),rnorm(100))
# i_plot<-ggplotly(g)
# saveWidget2html(widget = i_plot, file ="./html/plotly_test.html" )
saveWidget2html <- function(widget, file, selfcontained = TRUE, libdir = NULL,
background = "white", knitrOptions = list()){
suppressMessages(install.packages.auto(htmlwidgets))
# see more http://www.htmlwidgets.org/
file <- file.path(normalizePath(dirname(file)),basename(file))
saveWidget(widget = widget, file = file, selfcontained =selfcontained, libdir= libdir,
background = background, knitrOptions = knitrOptions)
libdir <- paste(dirname(file),"/", tools::file_path_sans_ext(basename(file)),
"_files", sep = "")
feedback <- unlink(libdir, recursive = TRUE, force = TRUE)
return(feedback)
}
# co_occurence_visNetwork starts from a data matrix (rows as entries, columns as samples)
# and a meta file with two columns
# it computes the correlation matrix first, then filters out interactions with low correlation or high p-value (thresholds set by the user),
# then does a second round of correlation, only keeping the nodes with at least one qualified interaction
# it draws a correlation matrix plot
# and outputs dynamic or static plots, with interactivity
# data_meta format: two columns, names and grouping
# layout options:
# "layout_as_star" "layout_components" "layout_in_circle" "layout_nicely"
# "layout_on_grid" "layout_on_sphere" "layout_randomly" "layout_with_dh"
# "layout_with_drl" "layout_with_fr" "layout_with_gem" "layout_with_graphopt"
# "layout_with_kk" "layout_with_lgl" "layout_with_mds"
co_occurence_visNetwork <-function(data_matrix = NULL,
data_meta = NULL,
correlation_type = "spearman",
correlation_threshold = 0.7,
p_threshold = 0.05,
edge_width_cex = 5,
node_size_cex = 10,
node_label_cex = 0.5,
igraph_laylout = "layout_components",
dynamic = FALSE,
advanced = FALSE
){
# prepare the packages
suppressMessages(install.packages.auto(Hmisc))
suppressMessages(install.packages.auto(corrplot))
suppressMessages(install.packages.auto(igraph))
suppressMessages(install.packages.auto(visNetwork))
# see online document http://datastorm-open.github.io/visNetwork
if(is.null(data_matrix)){ stop("data_matrix has to be defined!")}else{
# calculate the correlation matrix
data_matrix <- as.matrix(data_matrix) # just in case
correlation_matrix <- rcorr(t(data_matrix), type = correlation_type)
# filter the interactions/correlations, using the p-value and the correlation
df_edge <- flattenCorrMatrix(correlation_matrix) # homemade helper (must be loaded) that flattens the rcorr output into an edge list
df_edge_filtered <- df_edge[df_edge$p < p_threshold,]
df_edge_filtered <- df_edge_filtered[abs(df_edge_filtered$cor) > correlation_threshold,]
#df_edge_filtered <- df_edge_filtered[df_edge_filtered$cor > correlation_threshold,]
# only keep the nodes with qualified edges for plotting the correlation plot
data_matrix_filtered <- data_matrix[match(unique(c(df_edge_filtered[,1],
df_edge_filtered[,2])),rownames(data_matrix)),]
# recalculate the correlation matrix and plot
correlation_matrix_filtered <- rcorr(as.matrix(t(data_matrix_filtered)))
mycol <- colorRampPalette(c("blue", "white", "red"))(100)
# plot the correlation matrix
pdf("temp")
dev.control('enable')
corrplot(correlation_matrix_filtered$r, type="upper", order="original", col = mycol, p.mat = correlation_matrix_filtered$P,
sig.level = 0.01, insig = "blank", tl.col = "black",tl.cex = 0.5,tl.srt = 30,
method ="square", addgrid.col =NA, outline = FALSE, diag=FALSE)
p_correlation <- recordPlot()
dev.off()
# only keep the filtered nodes/vertices
data_meta_vertix <- data_meta[match(rownames(data_matrix_filtered),data_meta[,1]),]
# reorganize the node/vertex data for the network plot
colnames(data_meta_vertix) <- c("id","group")
data_meta_vertix$label <- data_meta_vertix$id
# for the purpose of using degree and layout
my_net <- graph_from_data_frame(d=df_edge_filtered, vertices=data_meta_vertix, directed=F)
# Compute node degree (# of links) and use it to set node size:
deg <- degree(my_net, mode="all")
node_size <- ((deg-range(deg)[1])/(range(deg)[2]-range(deg)[1])+0.5)*node_size_cex
data_meta_vertix$value <- node_size
V(my_net)$size <- node_size
# Set edge width based on correlation
w <- abs(df_edge_filtered$cor)
edge_width <- ((w - range(w)[1])/(range(w)[2]-range(w)[1])+0.1)*edge_width_cex
df_edge_filtered$value <- edge_width
E(my_net)$width <- edge_width
# set the edge labeling and the tooltip shown on hover
#df_edge_filtered$label <- df_edge_filtered$cor
df_edge_filtered$title <- paste("correlation=", df_edge_filtered$cor, "; p=", df_edge_filtered$p, "; N=",df_edge_filtered$N_used, sep = "")
### igraph-specific preparation
# set the colors
# choose the color scheme here
colors <- rainbow(length(levels(as.factor(V(my_net)$group))))
names(colors) <- levels(as.factor(V(my_net)$group))
V(my_net)$color <- colors[V(my_net)$group]
# set the outline color of the node
V(my_net)$frame.color <- "white"
#change arrow size and edge color:
#E(my_net)$arrow.size <- .2
E(my_net)$edge.color <- "gray80"
# cluster the nodes
pdf("temp")
dev.control('enable')
my_clp <- cluster_label_prop(my_net)
plot(my_clp, my_net, edge.curved=.1, vertex.label.cex = node_label_cex)
p_igraphnetwork_cluster <- recordPlot()
dev.off()
if(dynamic){
# interactive plot using visNetwork
p_visnetwork <- visNetwork(data_meta_vertix, df_edge_filtered, main="",height = "1000px",width = "100%") %>%
visOptions(highlightNearest = TRUE,nodesIdSelection = TRUE,
manipulation = TRUE, selectedBy = "group") %>%
visLegend( position = "right") %>%
visConfigure(enabled = advanced) %>%
visEdges(smooth = TRUE) %>%
visExport(type = "png", name = "export-network",
float = "right", label = "Save Network", style= "")
}else{
# using igraph layout native
p_visnetwork <-visNetwork(data_meta_vertix, df_edge_filtered, main="", height = "1000px",width = "100%") %>%
visOptions(highlightNearest = TRUE,nodesIdSelection = TRUE,
manipulation = TRUE, selectedBy = "group") %>%
visIgraphLayout(layout = igraph_laylout) %>%
visLegend( position = "right") %>%
visConfigure(enabled = advanced) %>%
visExport(type = "png", name = "export-network",
float = "right", label = "Save Network", style= "")
}
# using igraph layout native with smooth
#visNetwork(data_meta_vertix, df_edge_filtered, main="") %>%
# visIgraphLayout() %>%
# visEdges(smooth = TRUE)
# export a network file
return(list(plot_correlation_filtered = p_correlation,
plot_network_static_cluster = p_igraphnetwork_cluster,
plot_network_interactive = p_visnetwork,
data_correlation_orignial = df_edge,
data_for_network = df_edge_filtered,
data_for_network_vertics = data_meta_vertix
))
}
}
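# example (a hedged sketch; assumes flattenCorrMatrix() is already sourced, and the toy data below are made up):
# dm <- matrix(rnorm(200), nrow = 10,
#              dimnames = list(paste0("item_", 1:10), paste0("s_", 1:20)))
# dm[2, ] <- dm[1, ] + rnorm(20, sd = 0.1) # force at least one strong correlation
# meta <- data.frame(name = rownames(dm), grouping = rep(c("A", "B"), each = 5))
# net <- co_occurence_visNetwork(dm, meta, correlation_threshold = 0.6, p_threshold = 0.05)
# net$plot_network_interactive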
# layout_style: fruchtermanreingold, kamadakawai, circle
co_occurence_ggnetwork <-function(data_matrix = NULL, data_meta = NULL, correlation_type = "spearman",
correlation_threshold = 0.7, p_threshold = 0.01, node_size = 10,
node_label_size = 2, layout_style = "kamadakawai"){
# prepare the packages
#install.packages.auto(sna)
suppressMessages(install.packages.auto(Hmisc)) # rcorr is used below
suppressMessages(install.packages.auto(corrplot))
suppressMessages(install.packages.auto(network))
suppressMessages(install.packages.auto(geomnet))
suppressMessages(install.packages.auto(ggnetwork)) # ggnetwork()/geom_edges/geom_nodes used below
if(is.null(data_matrix)){ stop("data_matrix has to be defined!")}else{
# calculate the correlation matrix
data_matrix <- as.matrix(data_matrix) # just in case
correlation_matrix <- rcorr(t(data_matrix), type = correlation_type)
# filter the interactions/correlations, using the p-value and the correlation
df_edge <- flattenCorrMatrix(correlation_matrix)
df_edge_filtered <- df_edge[df_edge$p < p_threshold,]
df_edge_filtered <- df_edge_filtered[abs(df_edge_filtered$cor) > correlation_threshold,]
#df_edge_filtered <- df_edge_filtered[df_edge_filtered$cor > correlation_threshold,]
# only keep the nodes with qualified edges for plotting the correlation plot
data_matrix_filtered <- data_matrix[match(unique(c(df_edge_filtered[,1],
df_edge_filtered[,2])),rownames(data_matrix)),]
# recalculate the correlation matrix
correlation_matrix_filtered <- rcorr(as.matrix(t(data_matrix_filtered)))
mycol <- colorRampPalette(c("blue", "white", "red"))(100)
postscript("temp")
dev.control('enable')
corrplot(correlation_matrix_filtered$r, type="upper", order="original", col = mycol, p.mat = correlation_matrix_filtered$P,
sig.level = 0.01, insig = "blank", tl.col = "black",tl.cex = 0.5,tl.srt = 30,
method ="square", addgrid.col =NA, outline = FALSE, diag=FALSE)
p_correlation <- recordPlot()
dev.off()
# prepare the network data
# using ggnetwork for network
# data preparation for ggnet2 and ggnetwork
co.net <- network::network(df_edge_filtered[, 1:2], directed = FALSE)
# create edge attributes (correlation, p-value, N)
co.net %e% "cor"<- df_edge_filtered$cor
co.net %e% "p_value"<- df_edge_filtered$p
co.net %e% "N_used"<- df_edge_filtered$N_used
data_meta_vertix <- NULL # stays NULL in the returned list when no meta information is given
if(!is.null(data_meta)){
data_meta_vertix <- data_meta[match(rownames(data_matrix_filtered),data_meta[,1]),]
# create a vertex attribute if you have type/source information to be marked on the map
# note: the plot below maps color to this "type" attribute, so it assumes data_meta is provided
co.net %v% "type" <- data_meta_vertix[,2]
}
set.seed(1981)
data_network <- ggnetwork(co.net, layout.alg = layout_style)
# scale the cor for easy plotting
#data_network$cor <- data_network$cor * edge_width_cex
p_network <- ggplot(data = data_network,
aes(x, y, xend = xend, yend = yend)) +
geom_edges(aes(size = abs(cor)), color = "grey50") +
geom_nodes(aes(color = type), size = node_size) +
geom_nodetext(aes(label = vertex.names), size = node_label_size) +
scale_size_continuous("Correlation",range = range(abs(data_network$cor), na.rm = TRUE))+
scale_colour_manual("Source", values = rainbow(length(levels(as.factor(data_network$type)))))+ # type must be read from the plot data, not the calling environment
theme_blank() +
theme(legend.position = "bottom", legend.box = "horizontal")
return(list(plot_correlation_filtered = p_correlation,
plot.network = p_network,
data_for_network = df_edge_filtered,
data_for_network_vertics = data_meta_vertix,
data_for_plot = data_network
))
}
}
# layout options:
# "layout_as_star" "layout_components" "layout_in_circle" "layout_nicely"
# "layout_on_grid" "layout_on_sphere" "layout_randomly" "layout_with_dh"
# "layout_with_drl" "layout_with_fr" "layout_with_gem" "layout_with_graphopt"
# "layout_with_kk" "layout_with_lgl" "layout_with_mds"
co_occurence_igraph <-function(data_matrix = NULL, data_meta = NULL, correlation_type = "spearman",
correlation_threshold = 0.6, p_threshold = 0.01, node_size_cex = 10,
edge_width_cex = 5, node_label_cex = 0.5, layout_style = "kamadakawai"){
# prepare the packages
suppressMessages(install.packages.auto(igraph))
suppressMessages(install.packages.auto(Hmisc))
suppressMessages(install.packages.auto(corrplot))
if(is.null(data_matrix)){ stop("data_matrix has to be defined!")}else{
# calculate the correlation matrix
data_matrix <- as.matrix(data_matrix) # just in case
correlation_matrix <- rcorr(t(data_matrix), type = correlation_type)
# working section
#matrix_Adjacency <- correlation_matrix$r
#matrix_Adjacency[correlation_matrix$r >= 0.7 | correlation_matrix$r <= -0.7] <- 1
#matrix_Adjacency[matrix_Adjacency != 1] <- 0
#matrix_Adjacency[correlation_matrix$p < 0.01 ] <-0
#matrix_Adjacency[correlation_matrix$n < ncol(data_matrix)/2 ] <-0
#my_net2 <- graph_from_adjacency_matrix(matrix_Adjacency, mode = "undirected", diag = FALSE)
#plot(my_net2)
# filter the interactions/correlations, using the p-value and the correlation
df_edge <- flattenCorrMatrix(correlation_matrix)
df_edge_filtered <- df_edge[df_edge$p < p_threshold,]
df_edge_filtered <- df_edge_filtered[abs(df_edge_filtered$cor) > correlation_threshold,]
#df_edge_filtered <- df_edge_filtered[df_edge_filtered$cor > correlation_threshold,]
# only keep the nodes with qualified edges for plotting the correlation plot
data_matrix_filtered <- data_matrix[match(unique(c(df_edge_filtered[,1],
df_edge_filtered[,2])),rownames(data_matrix)),]
# recalculate the correlation matrix
correlation_matrix_filtered <- rcorr(as.matrix(t(data_matrix_filtered)))
mycol <- colorRampPalette(c("blue", "white", "red"))(100)
postscript("temp")
dev.control('enable')
corrplot(correlation_matrix_filtered$r, type="upper", order="original", col = mycol, p.mat = correlation_matrix_filtered$P,
sig.level = 0.01, insig = "blank", tl.col = "black",tl.cex = 0.5,tl.srt = 30,
method ="square", addgrid.col =NA, outline = FALSE, diag=FALSE)
p_correlation <- recordPlot()
dev.off()
data_meta_vertix <- data_meta[match(rownames(data_matrix_filtered),data_meta[,1]),]
#working section
my_net <- graph_from_data_frame(d=df_edge_filtered, vertices=data_meta_vertix, directed=F)
# Compute node degree (#links) and use it to set node size:
deg <- degree(my_net, mode="all")
V(my_net)$size <- ((deg-range(deg)[1])/(range(deg)[2]-range(deg)[1])+0.5)*10
# set the colors
# choose the color scheme here
# note: this assumes the second column of data_meta is named "type", so that it becomes the vertex attribute "type"
colors <- rainbow(length(levels(as.factor(V(my_net)$type))))
names(colors) <- levels(as.factor(V(my_net)$type))
V(my_net)$color <- colors[V(my_net)$type]
# Set edge width based on correlation
w <- E(my_net)$cor
E(my_net)$width <- ((w - range(w)[1])/(range(w)[2]-range(w)[1])+0.1)*edge_width_cex
# set the outline color of the node
V(my_net)$frame.color <- "white"
#change arrow size and edge color:
#E(my_net)$arrow.size <- .2
E(my_net)$edge.color <- "gray80"
# plot all layout
#layouts <- grep("^layout_", ls("package:igraph"), value=TRUE)[-1]
## Remove layouts that do not apply to our graph.
#layouts <- layouts[!grepl("bipartite|merge|norm|sugiyama|tree", layouts)]
#par(mfrow=c(3,3), mar=c(1,1,1,1))
#for (layout in layouts) {
# print(layout)
# l <- do.call(layout, list(my_net))
# plot(my_net, edge.curved=.1,vertex.label.cex = node_label_cex,layout=l, main=layout) }
postscript("temp")
dev.control('enable')
#plot(my_net, edge.curved=.1, vertex.label.cex = node_label_cex)
plot(my_net, edge.curved=.1, vertex.label.cex = node_label_cex, layout=layout_with_fr)
legend(x=-1.1, y=-1.1, levels(as.factor(V(my_net)$type)), pch=21,
col="#777777", pt.bg=colors, pt.cex=2.5, bty="n", ncol=3)
p_network <- recordPlot()
dev.off()
# sparsify the network: delete edges with cor < 0.9
my_net_sp <- delete_edges(my_net, E(my_net)[cor<0.9])
plot(my_net_sp, edge.curved=.1, vertex.label.cex = node_label_cex, layout=layout_with_fr)
# cluster the nodes
my_clp <- cluster_label_prop(my_net)
plot(my_clp, my_net, edge.curved=.1, vertex.label.cex = node_label_cex)
# mark specific nodes
# V(my_net) # check the node index, then locate the index, and mark them
# plot(my_net, mark.groups=c(5:6), mark.col="#C5E5E7", mark.border=NA)
# interactive plot using visNetwork
suppressMessages(install.packages.auto(visNetwork))
visNetwork(data_meta_vertix, df_edge_filtered, width="100%", height="400px", main="Network!")
# export a network file
#working section
return(list(plot_correlation_filtered = p_correlation,
plot.network = p_network,
data_for_network = df_edge_filtered,
data_for_network_vertics = data_meta_vertix # data_for_plot was removed: no data_network object is created in this function
))
}
}
# subfunctions, as the function name says
# this function converts a list of vectors into a presence/absence matrix, with presence as 1 and absence as 0
# test:
# test_list <- list(A = letters[1:10], B = letters[5:15])
# combine_list_to_matrix(test_list)
combine_list_to_matrix <-function(vector_list){
all <- unique(unlist(vector_list))
match_list <- lapply(vector_list,function(x) all %in% x)
presence_matrix <- matrix(unlist(match_list), ncol =length(match_list) , byrow = FALSE)
colnames(presence_matrix) <- names(vector_list)
rownames(presence_matrix) <- all
presence_matrix[which(presence_matrix == FALSE)] <- 0
presence_matrix[which(presence_matrix == TRUE)] <- 1
return(presence_matrix)
}