R/Phenotypic_Correlation.R
In TBA: Collection of 'shiny' Apps for Tree Breeding Analysis

Phenotypic_Correlation <- function(data, low_color = "blue", high_color = "green", correlation_values_color = "black") {

  old_options <- options(scipen = 999)  # Save current options
  on.exit(options(old_options))         # Restore options when function exits

  # Convert the first two columns to factor type
  data[, 1:2] <- lapply(data[, 1:2], as.factor)

  # Convert the remaining columns to numeric
  data[, -c(1, 2)] <- lapply(data[, -c(1, 2)], as.numeric)

  # Extract trait names (excluding the first two columns)
  traits <- names(data)[-c(1, 2)][sapply(data[-c(1, 2)], is.numeric)]

  # Prepare a matrix to store correlations
  correlation_matrix1 <- matrix("", nrow = length(traits), ncol = length(traits),
                                dimnames = list(traits, traits))

  # Calculate correlations for each pair of traits
  for (i in 1:length(traits)) {
    for (j in 1:length(traits)) {
      trait1 <- traits[i]
      trait2 <- traits[j]

      if (i == j) {
        correlation_matrix1[i, j] <- "1"  # Set correlation to 1 if it's the same trait
      } else {
        # Perform linear regression for trait1
        formula1 <- as.formula(paste0("`", trait1, "` ~ `", names(data)[1], "` + `", names(data)[2], "`"))
        model1 <- lm(formula1, data = data)
        anova_result1 <- anova(model1)

        # Perform linear regression for trait2
        formula2 <- as.formula(paste0("`", trait2, "` ~ `", names(data)[1], "` + `", names(data)[2], "`"))
        model2 <- lm(formula2, data = data)
        anova_result2 <- anova(model2)

        # Calculate phenotypic variance for trait1 and trait2
        replication_levels <- nlevels(data[[1]])
        genotypic_variance1 <- round((anova_result1$`Mean Sq`[2] - anova_result1$`Mean Sq`[3]) / replication_levels,6)

        phenotypic_variance1<-genotypic_variance1+anova_result1$`Mean Sq`[3]
        genotypic_variance2 <- round((anova_result2$`Mean Sq`[2] - anova_result2$`Mean Sq`[3]) / replication_levels,6)
        phenotypic_variance2<-genotypic_variance2+anova_result2$`Mean Sq`[3]

        # Calculate covariance sums
        total_of_genotypes_trait1 <- tapply(data[[trait1]], data[[2]], sum)
        total_of_genotypes_trait2 <- tapply(data[[trait2]], data[[2]], sum)
        total_of_replication_trait1 <- tapply(data[[trait1]], data[[1]], sum)
        total_of_replication_trait2 <- tapply(data[[trait2]], data[[1]], sum)
        number_of_replication <- nlevels(data[[1]])
        number_of_genotype <- nlevels(data[[2]])
        Grand_total_trait1 <- sum(data[[trait1]])
        Grand_total_trait2 <- sum(data[[trait2]])
        CF <- ( Grand_total_trait1 * Grand_total_trait2 ) / ( number_of_replication * number_of_genotype )
        Total_SP <- round(sum(data[[trait1]] * data[[trait2]]) - CF,6)
        Genotypic_SP <- round((sum(total_of_genotypes_trait1 * total_of_genotypes_trait2) / number_of_replication) - CF,6)

        Replication_SP <- round((sum(total_of_replication_trait1 * total_of_replication_trait2) / number_of_genotype) - CF,6)
        Error_SP <- Total_SP - Genotypic_SP - Replication_SP
        DF_Replication <- number_of_replication - 1
        DF_Genotypes <- number_of_genotype - 1
        DF_Error <- DF_Replication * DF_Genotypes
        Replication_MP <- round(Replication_SP / DF_Replication,6)
        Genotypic_MP <- round(Genotypic_SP / DF_Genotypes,6)
        Error_MP <- round(Error_SP / DF_Error,6)
        Genotypic_Covariance <- round((Genotypic_MP - Error_MP) / number_of_replication,6)
        Phenotypic_Covariance <- Error_MP + Genotypic_Covariance

        # Calculate correlation
        correlation1 <- round(Phenotypic_Covariance / sqrt(phenotypic_variance1 * phenotypic_variance2), 4)

        # Calculate t-statistic and p-value
        n <- number_of_replication * number_of_genotype  # Number of observations
        df <- n - 2  # Degrees of freedom for correlation

        # Check for valid correlation
        if (!is.nan(correlation1)&& !is.na(correlation1)) {
          t_stat <- (correlation1) * (sqrt(df / (1 - (correlation1)^2)))  # Calculate t-statistic
          p_value <- 2 * pt(abs(t_stat), df = df, lower.tail = FALSE)  # Calculate two-tailed p-value
        } else {
          t_stat <- NA
          p_value <- NA
        }

        # Determine significance level symbol
        if (!is.nan(t_stat) && !is.na(t_stat)) {
          if (p_value < 0.05) {
            significance_symbol <- "*"   # Significant at 5%
          } else {
            significance_symbol <- "NS"  # Non-significant
          }
        } else {
          significance_symbol <- ""  # No significance symbol if t_stat is NA and NaN
        }

        # Format correlation value without scientific notation
        formatted_correlation1 <- format(correlation1, scientific = FALSE)

        # Fill in the correlation matrix with correlation value and significance symbol
        correlation_matrix1[i, j] <- paste(formatted_correlation1, significance_symbol, sep = "")
      }
    }
  }

  # Prepare a matrix to store formatted correlation values without significance symbols
  # Extract numeric correlations for plotting
  numeric_correlation_matrix1 <- matrix(as.numeric(gsub("[^0-9.-]", "", correlation_matrix1)),
                                        nrow = length(traits), ncol = length(traits),
                                        dimnames = list(traits, traits))

  # Melt the numeric correlation matrix for ggplot
  melted_corr_mat1 <- melt(numeric_correlation_matrix1)
  colnames(melted_corr_mat1) <- c("Var1", "Var2", "value")

  # Replace NaNs with NA for ggplot handling
  melted_corr_mat1$value[is.nan(melted_corr_mat1$value)] <- NA

  # Determine bar height based on the number of traits
  number_of_traits1 <- length(traits)
  bar_height1 <- ifelse(number_of_traits1 < 8, 9, 14)  # Adjust height for fewer than 8 traits

  # Plotting the correlation heatmap using ggplot
  plot1 <- ggplot(data = melted_corr_mat1, aes(x = Var1, y = Var2, fill = as.numeric(value))) +
    geom_tile(color = "white", linewidth = 0.5) +
    geom_text(aes(Var2, Var1, label = ifelse(
      value == 1, "1",
      ifelse(value == -1, "-1",ifelse(value == 0, "0", sub("\\.?0+$", "", format(value, scientific = FALSE))))), fontface = "bold"), color = correlation_values_color, size = 6.5) + # Reduced text size for correlation values
    scale_fill_gradient(low = low_color, high = high_color, limits = c(-1, 1), name = "Correlation Range") +
    guides(fill = guide_colorbar(barwidth = 3, barheight = bar_height1, title.position = "top", title.hjust = 0.5,title.vjust = 2,
                                 title.theme = element_text(size = 18,face = "bold"), label.theme = element_text(size = 16, face = "bold"))) +  # Increase size of color bar
    theme(panel.border = element_rect(color = "black", fill = NA, linewidth = 2)) +
    theme(axis.title.x = element_blank(), axis.title.y = element_blank()) +
    theme(axis.text.x = element_text(size = 15, angle = 45, hjust = 1, color = "black", face = "bold")) +
    theme(axis.text.y = element_text(size = 15, color = "black", face = "bold")) +
    theme(axis.ticks.length = unit(0.3, "cm")) +
    theme(axis.ticks = element_line(linewidth = 1)) +
    scale_x_discrete(expand = c(0, 0)) +  # Remove gap on x-axis
    scale_y_discrete(expand = c(0, 0))    # Remove gap on y-axis

  # Return the correlation matrix and plot
  return(list(correlation_matrix1 = correlation_matrix1, plot1 = plot1))
}