functions_for_package.R

# set working directory
setwd("C:/Users/user/Desktop/elucidata")
# check for correct working directory
getwd()

# function to obtain the PCA plot from intensity and metadata file with scaling
plotpca_withscaling <- function(genedata, metadata){
  # data loading and error handling
  genedata1 <- if(is.character(genedata) && file.exists(genedata)){
    read.csv(genedata, sep=",", header=T)
  } else {
    stop ('intensity file does not exist')
  }
  metadata1 <- if(is.character(metadata) && file.exists(metadata)){
    read.csv(metadata, sep=",", header=T)
  } else {
    stop ('metadata file does not exist')
  }
  if (ncol(genedata1)==1) stop ('intensity file is not comma separated')
  if (ncol(metadata1)==1) stop ('metadata file is not comma separated')
  if (length(unique(metadata1['Unit']))!=1) stop ('time-point units are different in metadata file')
  if (tolower(colnames(genedata1[2])) != 'symbol') stop ('gene symbol column is missing in intensity file')
  # data preprocessing for intensity file
  # remove the identifier column
  genedata2 = genedata1[,-1]
  # check for missing values; 1 means column with missing value
  print(table(apply(genedata2, 2, function(x) {sum(is.na(x))})))
  # drop observations with missing values
  genedata2 = genedata2[complete.cases(genedata2), ]
  # transpose data so that samples become observations and gene intensities become variables
  genedata3 = t(genedata2)
  genedata3 = as.data.frame(genedata3)
  genedata3 <- genedata3[-1, ]
  genedata3$sIdx = rownames(genedata3)
  # define new column names from gene_1 to gene_22410
  name_col =   as.character(genedata2[1:22410,1])
  name_col = c(name_col, 'sIdx')
  colnames(genedata3) = name_col
  # metadata file is clean so no preprocessing required
  # merge data from intensity and metadata file
  df = merge(x = genedata3, y = metadata1, all.x=T, by = "sIdx")
  # now sIdx (sample ID) is the first column followed by the gene intensity values
  # and time data for the samples is also added to df
  # A total of 22413 columns and 30 rows
  # convert gene intensity values to the numeric data format
  for (i in 2:22411){
    df[,i] = as.numeric(as.character(df[,i]))
  }
  # identify and remove constant or not varying variables
  df1 = df[,2:22411]
  if (!require("caret")) install.packages("caret")
  require(caret)
  if (!require("foreach")) install.packages("foreach")
  require(foreach)
  nzv <- nearZeroVar(df1, freqCut=95/5, uniqueCut=10, allowParallel=T)
  df2 <- df1[,-nzv]
  df3 = cbind(df[,1], df2, df[,22412:22413])
  new_var_count = ncol(df2) + 1
  # construct the pca plot
  gene_pca <- prcomp(df3[,2:new_var_count], scale = TRUE)
  if (!require("factoextra")) install.packages("factoextra")
  require(factoextra)
  png(filename = "pca_plot_withscaling.png", width = 10, height = 5, units='in', res=300, bg = "white")
  print({p1 = fviz_pca_biplot(gene_pca, habillage=df$Time,pointsize=3,pointshape = 19,
                         label = "var", legend.title = "Time", invisible="quali", 
                         title="PCA Plot With Scaling", col.var="black", select.var=list(contrib = 4), repel=T)})
  dev.off()
  if (!require("plotly")) install.packages("plotly")
  require(plotly)
  p1 = fviz_pca_biplot(gene_pca, habillage=df$Time,pointsize=3,pointshape = 19,
                         label = "var", legend.title = "Time", invisible="quali", 
                         title="PCA Plot With Scaling", col.var="black", select.var=list(contrib = 4), repel=T)
  return(ggplotly(p1, tooltip = c("x", "y", "colour")))
}


# function to obtain the PCA plot from intensity and metadata file without scaling
plotpca_withoutscaling <- function(genedata, metadata){
  # data loading and error handling
  genedata1 <- if(is.character(genedata) && file.exists(genedata)){
    read.csv(genedata, sep=",", header=T)
  } else {
    stop ('intensity file does not exist')
  }
  metadata1 <- if(is.character(metadata) && file.exists(metadata)){
    read.csv(metadata, sep=",", header=T)
  } else {
    stop ('metadata file does not exist')
  }
  if (ncol(genedata1)==1) stop ('intensity file is not comma separated')
  if (ncol(metadata1)==1) stop ('metadata file is not comma separated')
  if (length(unique(metadata1['Unit']))!=1) stop ('time-point units are different in metadata file')
  if (tolower(colnames(genedata1[2])) != 'symbol') stop ('gene symbol column is missing in intensity file')
  # data preprocessing for intensity file
  # remove the identifier column
  genedata2 = genedata1[,-1]
  # check for missing values; 1 means column with missing value
  print(table(apply(genedata2, 2, function(x) {sum(is.na(x))})))
  # drop observations with missing values
  genedata2 = genedata2[complete.cases(genedata2), ]
  # transpose data so that samples become observations and gene intensities become variables
  genedata3 = t(genedata2)
  genedata3 = as.data.frame(genedata3)
  genedata3 <- genedata3[-1, ]
  genedata3$sIdx = rownames(genedata3)
  # define new column names from gene_1 to gene_22410
  name_col =   as.character(genedata2[1:22410,1])
  name_col = c(name_col, 'sIdx')
  colnames(genedata3) = name_col
  # metadata file is clean so no preprocessing required
  # merge data from intensity and metadata file
  df = merge(x = genedata3, y = metadata1, all.x=T, by = "sIdx")
  # now sIdx (sample ID) is the first column followed by the gene intensity values
  # and time data for the samples is also added to df
  # A total of 22413 columns and 30 rows
  # convert gene intensity values to the numeric data format
  for (i in 2:22411){
    df[,i] = as.numeric(as.character(df[,i]))
  }
  # construct the pca plot
  gene_pca <- prcomp(df[,2:22410], scale = FALSE)
  if (!require("factoextra")) install.packages("factoextra")
  require(factoextra)
  png(filename = "pca_plot_withoutscaling.png", width = 10, height = 5, units='in', res=300, bg = "white")
  print({p2 = fviz_pca_biplot(gene_pca, habillage=df$Time,pointsize=3,pointshape = 19,
                         label = "var", legend.title = "Time", invisible="quali", 
                         title="PCA Plot Without Scaling", col.var="black", select.var=list(contrib = 4), repel=T)})
  dev.off()
  if (!require("plotly")) install.packages("plotly")
  require(plotly)
  p2 = fviz_pca_biplot(gene_pca, habillage=df$Time,pointsize=3,pointshape = 19,
                         label = "var", legend.title = "Time", invisible="quali", 
                         title="PCA Plot Without Scaling", col.var="black", select.var=list(contrib = 4), repel=T)
  
  return(ggplotly(p2, tooltip = c("x", "y", "colour"))) 
}
shubhamj1510112/PCA documentation built on May 3, 2019, 8:07 p.m.