R/CalcDissim.R
In DisimForMixed: Calculate Dissimilarity Matrix for Dataset with Mixed Attributes

#' Descretize Quantitative Variables.
#'
#' Takes in a data frame which contains only Quantitative varables in columns. Standadize the variables.
#' Discretize quantitative variables and returns discretized quantitative variables. Discretization was performed by
#' equal width bining algorithm.
#' @param myDataQuant A data frame which includes quantitative variables in columns.
#' @param noice Noice indicator. If noice = TRUE data standerdization is done by
#' deviding the difference between data point and median of the variable by the range of the variable.
#' If noice = FALSE data standerdization is done by deviding the difference between data point and
#' mean of the variable by the standard deviation of the variable.
#' @return A data frame consists of discretized quantitative variables.
#' @examples
#' QuantVars <- data.frame(Qnvar1 = c(1.5,3.2,4.9,5), Qnvar2 = c(4.8,2,1.1,5.8))
#' Discretized <- discretizeQuant(QuantVars)
#' @import stats
#' @export




discretizeQuant <- function(myDataQuant, noice = TRUE){

  i = 1
  noOfObj <-nrow(myDataQuant)
  noOfVar <- ncol(myDataQuant)
  NormData <- data.frame(data = NA)

  if(noice == TRUE){
    for(i in 1:ncol(myDataQuant)){
      NormData_i <- (myDataQuant[,i] - stats::median(myDataQuant[,i]))/(max(myDataQuant[,i]) - min(myDataQuant[,i]))
      NormData <- cbind(NormData, NormData_i)
    }
  }

  if(noice == FALSE){
    for(i in 1:ncol(myDataQuant)){
      NormData_i <- (myDataQuant[,i] - mean(myDataQuant[,i]))/ stats::sd(myDataQuant[,i])
      NormData <- cbind(NormData, NormData_i)
    }
  }

  NormData <- data.frame(NormData[,-1])
  colnames(NormData) <- colnames(myDataQuant)

  #decretize by equal no of Intervals
  dectrTzed <- data.frame()
  dectrTzedFac <- matrix(data = NA, ncol = noOfVar, nrow = noOfObj)
  noOfInt <- ceiling(sqrt(noOfObj))
  IntWidth <- (max(NormData[,i]) - min(NormData[,i])) / noOfInt

  for(i in 1:ncol(NormData)){
    for (j in 1:noOfObj){
      k = 1
      while(k <= noOfInt){
        if(NormData[j,i] <= min(NormData[,i]) + k*IntWidth){
          dectrTzed[j,i] <- k
          break
        }
        if(k == noOfInt)
          dectrTzed[j,i] <- k
        k = k + 1
      }
    }
  }
  i = 1

  #Find Significance
  for(i in 1:ncol(myDataQuant)){
    dectrTzedFac[,i] <- as.factor(dectrTzed[,i])
  }
  dectrTzedFac <- data.frame(dectrTzedFac)
  colnames(dectrTzedFac) <- colnames(myDataQuant)

  return(dectrTzedFac)

}


#' Calculate Conditional Probabilities.
#'
#' Takes in a data frame which contains only qualitative variables. Discretized quantitative variables
#' , a mixture of qualitative variables and discretized quantitative variables are also accepted.
#' Calculates conditional probabilities for each pair of attribute values in  the data frame.
#' Returns a data frame consists of J, A, B and C in columns where Pr(A|B) = C and J is the column number in the input
#' data frame corresponding to the values in A.
#' @param myDataAll A data frame which includes qualitative variables OR discretized quantitative variables
#' OR a mixture of qualitative variables and discretized quantitative variables in columns.
#' @return A data frame with four columns J, A, B and C in columns where Pr(A|B) = C and J is the column number in the input
#' data frame corresponding to the values in A.
#' @examples
#' QualiVars <- data.frame(Qlvar1 = c("A","B","A","C"), Qlvar2 = c("Q","Q","R","Q"))
#' CalcForQuali <- calcCondProb(QualiVars)
#' QuantVars <- data.frame(Qnvar1 = c(1.5,3.2,4.9,5), Qnvar2 = c(4.8,2,1.1,5.8))
#' Discretized <- discretizeQuant(QuantVars)
#' CalcForQuant <- calcCondProb(Discretized)
#' AllQualQuant <- data.frame(QualiVars, Discretized)
#' CalcForAll <- calcCondProb(AllQualQuant)
#' @export




calcCondProb <- function(myDataAll){

  ProbData <- data.frame()
  NoOfVars <- ncol(myDataAll)

  for(i in 1:NoOfVars){
    levelsOfDataFirst <- data.frame(levels(as.factor(myDataAll[,i])))
    NofFirst <- nrow(levelsOfDataFirst)

    j = 1
    while(j <= NoOfVars){
      if(j == i)
        j = j + 1
      if(j > NoOfVars)
        break

      levelsOfDataSec <- data.frame(levels(as.factor(myDataAll[,j])))
      NofSec <- nrow(levelsOfDataSec)

      for(k in 1:NofFirst){
        filteredfRST = myDataAll[myDataAll[,i] == levelsOfDataFirst[k,1], ]
        denomVal <- nrow(filteredfRST)
        for(a in 1:NofSec){
          filteredSec <- filteredfRST[filteredfRST[,j] == levelsOfDataSec[a,1], ]
          if (is.na(filteredSec[1,1]) == FALSE){
            numeVal <- nrow(filteredSec)
          }else
            numeVal <- 0
          jVal = j
          occur <- levelsOfDataSec[a,1]
          given <- levelsOfDataFirst[k,1]
          condProbVal <- numeVal/denomVal
          newRowToBind <- data.frame(jVal, occur, given, condProbVal)
          ProbData <- rbind(ProbData,newRowToBind)
        }
      }
      j = j + 1
    }
  }
  return(ProbData)
}




#' Calculate Distance  Between given Attribute Values by considering only a pair of attributes.
#'
#' Takes in two lists Ai and Aj, representing values of two attributes, two values x and y from Ai.
#' Quantitative attributes are accepted only after descretization. Calculates distance between x and y
#' for Aj with respect to Ai.
#' @param Ai A list consisting values of a selected attribute
#' @param Aj A list consisting values of another selected attribute
#' @param x Value from Ai
#' @param y Another value from Ai
#' @details findMax is the implementation of find_max() function proposed by Ahmad & Dey (2007).
#' See Ahmad & Dey (2007) for more datails.
#' @return distance between x and y for Aj with respect to Ai.
#' @references Ahmad, A., & Dey, L. (2007). A k-mean clustering algorithm for mixed numeric and categorical data. Data & Knowledge Engineering, 63(2), 503-527.
#' @examples
#' Attrib_i <- c("A","B","A","C")
#' Attrib_j <- c("Q","Q","R","Q")
#' xVal <- "A"
#' yVal <- "B"
#' QualiVars <- data.frame(Qlvar1 = c("A","B","A","C"), Qlvar2 = c("Q","Q","R","Q"))
#' library(dplyr)
#' distBetXY <- findMax(Attrib_i,Attrib_j,xVal,yVal)
#' @export



findMax <- function(Ai, Aj, x, y){

  dist_xy = 0
  levelsOfAj <- data.frame(levels(as.factor(Aj)))
  NofAj <- nrow(levelsOfAj)
  ConditioanalProbs <- calcCondProb(data.frame(Ai,Aj))
  filteredCP = data.frame()
  for (i in 1:NofAj){
    filteredCPi <-  dplyr::filter(ConditioanalProbs, ConditioanalProbs$occur == as.character(levelsOfAj[i,1]) & (ConditioanalProbs$given == x | ConditioanalProbs$given == y) & ConditioanalProbs$jVal == 2)
    filteredCP <- rbind(filteredCP,filteredCPi)
  }
  for(t in 1:NofAj){
    jGivx = 100
    jGivy = 100
    for (k in 1:nrow(filteredCP)){
      if(as.character(filteredCP$occur[k]) == as.character(levelsOfAj[t,1]) & as.character(filteredCP$given[k]) == x){
        jGivx <- filteredCP$condProbVal[k]
      }
      if(as.character(filteredCP$occur[k]) == as.character(levelsOfAj[t,1]) & as.character(filteredCP$given[k]) == y){
        jGivy <- filteredCP$condProbVal[k]
      }
      if(jGivx != 100 & jGivy != 100){
        break
      }
    }

    if(jGivx >= jGivy){
      dist_xy = dist_xy + jGivx
    }else{
      dist_xy = dist_xy + jGivy
    }
  }
  dist_xy <- dist_xy - 1

  return(dist_xy)

}



#' Calculate Distance  Between Attribute Values.
#'
#' Takes in a data frame which contains only qualitative variables. Discretized quantitative variables
#' , a mixture of qualitative variables and discretized quantitative variables are also accepted.
#' Calculates distance between each pair of attribute values for a given attribute. This calculation
#' is done according to the method proposed by Ahmad & Dey (2007).
#' @param myDataAll A data frame which includes qualitative variables OR discretized quantitative variables
#' OR a mixture of qualitative variables and discretized quantitative variables in columns.
#' @details distBetPairs is an implementtion of the method proposed by Ahmad & Dey (2007)
#' to find the distance between two catogorical values corresponding to a qualitative
#' variable. This distance measure considers distribution of values in the data set.
#' This function is also used to find the distance between discretized values
#' corresponding to quantitative variables which are used in calculating the significance
#' of quantitative attributes. See Ahmad & Dey (2007) for more datails.
#' @return A data frame with four columns J, A, B and C in columns where Distance(A, B) = C
#' and J is the column number in the input data frame corresponding to the values in A.
#' @references Ahmad, A., & Dey, L. (2007). A k-mean clustering algorithm for mixed numeric and categorical data. Data & Knowledge Engineering, 63(2), 503-527.
#' @examples
#' QualiVars <- data.frame(Qlvar1 = c("A","B","A","C"), Qlvar2 = c("Q","Q","R","Q"))
#' library(dplyr)
#' distForQuali <- distBetPairs(QualiVars)
#' QuantVars <- data.frame(Qnvar1 = c(1.5,3.2,4.9,5), Qnvar2 = c(4.8,2,1.1,5.8))
#' Discretized <- discretizeQuant(QuantVars)
#' distForQuant <- distBetPairs(Discretized)
#' AllQualQuant <- data.frame(QualiVars, Discretized)
#' distForAll <- distBetPairs(AllQualQuant)
#' @export





distBetPairs <- function(myDataAll){

  DistData <- data.frame()
  NoOfVars <- ncol(myDataAll)
  for(i in 1:NoOfVars){
    levelsOfDataFirst <- data.frame(levels(as.factor(myDataAll[,i])))
    NofFirst <- nrow(levelsOfDataFirst)
    jStop <- NofFirst-1
    for (j in 1:jStop){
      aStart <- j + 1
      for (a in aStart:NofFirst){
        SumProb <- 0
        k=1
        while(k <= NoOfVars){
          if(k != i){
            distXY <- findMax(as.character(myDataAll[,i]), as.character(myDataAll[,k]), as.character(levelsOfDataFirst[j,1]), as.character(levelsOfDataFirst[a,1]))
            SumProb <- SumProb + distXY
          }
          k = k + 1
        }
        VarIndex <- i
        xVal <- levelsOfDataFirst[j,1]
        yVal <- levelsOfDataFirst[a,1]
        DistBetPair <- SumProb/(NoOfVars-1)
        newRowToBind <- data.frame(VarIndex, xVal, yVal)
        newRowToBind <- data.frame(VarIndex, xVal, yVal, DistBetPair)
        DistData <- rbind(DistData, newRowToBind)
      }
    }
  }
  return(DistData)
}



#' Calculate Significance of Quantitative Attributes.
#'
#' Takes in two data frames where first contains only qualitative attributes and the other
#' contains only quantitative attributes. Function calculates significance of quantitative
#' attributes based on the method proposed by Ahmad & Dey (2007).
#' @param myDataQuali A data frame which includes only qualitative variables in columns.
#' @param myDataQuant A data frame which includes only quantitative variables in columns.
#' @details signifOfQuantVars is an implementtion of the method proposed by Ahmad & Dey (2007)
#' to calculate the significance of quantitative attributes. Signinficance of an attribute is an
#' important fact to consider in the process of clustering. To calculate the significance
#' quantitative attributes are discreized first. These  significace values are used in calculating
#' distance between any two numeric values of aquantitative attribute. See Ahmad & Dey (2007) for
#' more datails.
#' @return A data frame with two columns A and B where A represents variable number and B
#' represents significane of corresponding variable.
#' @references Ahmad, A., & Dey, L. (2007). A k-mean clustering algorithm for mixed numeric and categorical data. Data & Knowledge Engineering, 63(2), 503-527.
#' @examples
#' QualiVars <- data.frame(Qlvar1 = c("A","B","A","C"), Qlvar2 = c("Q","Q","R","Q"))
#' QuantVars <- data.frame(Qnvar1 = c(1.5,3.2,4.9,5), Qnvar2 = c(4.8,2,1.1,5.8))
#' SigOfQuant <- signifOfQuantVars(QualiVars, QuantVars)
#' @export


signifOfQuantVars <- function(myDataQuali, myDataQuant){

  DiscretQuant <- discretizeQuant(myDataQuant)
  allData <- data.frame(myDataQuali, DiscretQuant)
  DistForAll <- distBetPairs(allData)

  QuantVarStart <- ncol(myDataQuali) + 1
  LoopStopVal <- ncol(allData)
  SignfData = data.frame()

  ct = 1
  for(i in QuantVarStart:LoopStopVal){
    VarIndex = DistForAll[i,1]
    SigDataForQuant <- dplyr::filter(DistForAll, VarIndex == i)
    SignfData[ct,1] <- ct
    SignfData[ct,2] <- mean(SigDataForQuant$DistBetPair)
    ct = ct + 1
  }
  return(SignfData)
}




#' Calculate Dissimilarity Matrix for Mixed Attributes.
#'
#' Takes in two data frames where first contains only qualitative attributes and the other
#' contains only quantitative attributes. Function calculates the dissimilarity matrix
#' based on the method proposed by Ahmad & Dey (2007).
#' @param myDataQuali A data frame which includes only qualitative variables in columns.
#' @param myDataQuant A data frame which includes only quantitative variables in columns.
#' @details calcDissimMat is an implementtion of the method proposed by Ahmad & Dey (2007)
#' to calculate the dissimilarity matrix at the presence of both qualitative and quantitative
#' attributes. This approach finds dissimilarity of qualitative and quantitative attributes seperately
#' and the final dissimilarity matrix is formed by combining both. See Ahmad & Dey (2007) for
#' more datails.
#' @return A dissimilarity matrix. This can be used as an input to pam, fanny, agnes and diana functions.
#' @references Ahmad, A., & Dey, L. (2007). A k-mean clustering algorithm for mixed numeric and categorical data. Data & Knowledge Engineering, 63(2), 503-527.
#' @examples
#' QualiVars <- data.frame(Qlvar1 = c("A","B","A","C","C","A"), Qlvar2 = c("Q","Q","R","Q","R","Q"))
#' QuantVars <- data.frame(Qnvar1 = c(1.5,3.2,4.9,5,2.8,3.1), Qnvar2 = c(4.8,2,1.1,5.8,3.1,2.2))
#' DisSimMatCalcd <- calcDissimMat(QualiVars, QuantVars)
#'
#' agnesClustering <- cluster::agnes(DisSimMatCalcd, diss = TRUE, method = "ward")
#' silWidths <- cluster::silhouette(cutree(agnesClustering, k = 2), DisSimMatCalcd)
#' mean(silWidths[,3])
#' plot(agnesClustering)
#'
#' PAMClustering <- cluster::pam(DisSimMatCalcd, k=2, diss = TRUE)
#' silWidths <- cluster::silhouette(PAMClustering, DisSimMatCalcd)
#' plot(silWidths)
#' @export




calcDissimMat <- function(myDataQuali, myDataQuant){

  #library(dplyr)

  dissimForQuant <- data.frame()
  dissimForQuali <- data.frame()
  dissimForAll <- data.frame()
  disSimForAttr <- numeric(0)
  NoOfObj <- nrow(myDataQuali)
  NoClolQuant <- ncol(myDataQuant)
  NoClolQuali <- ncol(myDataQuali)

  SignfDataFound <- signifOfQuantVars(myDataQuali, myDataQuant)

  for (a in 1:NoOfObj){
    for (b in 1:NoOfObj){
      for (i in 1:NoClolQuant){
        disSimForAttr[i] <- (SignfDataFound[i,2]*(myDataQuant[a,i] - myDataQuant[b,i]))^2
      }
      dissimForQuant[a,b] <- sum(disSimForAttr)
    }
  }

  disSimForAttr <- numeric(0)
  DistanceData <- distBetPairs(myDataQuali)

  for (a in 1:NoOfObj){
    for (b in 1:NoOfObj){
      for (i in 1:NoClolQuali){
        DistanceFound <- dplyr::filter(DistanceData, (DistanceData$xVal == as.character(myDataQuali[a,i]) & DistanceData$yVal == as.character(myDataQuali[b,i])) | (DistanceData$xVal == as.character(myDataQuali[b,i]) & DistanceData$yVal == as.character(myDataQuali[a,i])))

        if(nrow(DistanceFound) == 0){
          DistValUniq = 0
        }else
          DistValUniq <- DistanceFound$DistBetPair

        disSimForAttr[i] <- DistValUniq ^ 2
      }
      dissimForQuali[a,b] <- sum(disSimForAttr)
    }
  }
  dissimForAll <- dissimForQuant[,] + dissimForQuali[,]

  agnesClustering <- cluster::agnes(dissimForAll, diss = TRUE, method = "ward")

  return(dissimForAll)
}
Any scripts or data that you put into this service are public.
DisimForMixed documentation built on May 1, 2019, 9:25 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
DisimForMixed
Calculate Dissimilarity Matrix for Dataset with Mixed Attributes

R/CalcDissim.R
In DisimForMixed: Calculate Dissimilarity Matrix for Dataset with Mixed Attributes

Try the DisimForMixed package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

DisimForMixed Calculate Dissimilarity Matrix for Dataset with Mixed Attributes

R/CalcDissim.R In DisimForMixed: Calculate Dissimilarity Matrix for Dataset with Mixed Attributes

Try the DisimForMixed package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

DisimForMixed
Calculate Dissimilarity Matrix for Dataset with Mixed Attributes

R/CalcDissim.R
In DisimForMixed: Calculate Dissimilarity Matrix for Dataset with Mixed Attributes