R/load_data_files.R

#' @title Loading data file
#' @description Loading all data
#' @param contig_lengths_file_path Path to contig lengths file
#' @param input_file_path Path to file generated by postpsass
#' @param common Whether to load common category data (default: TURE)
#' @param unique_1 Whether to load unique_2 category data (default: TURE)
#' @param unique_2 Whether to load unique_2 category data (default: TURE)
#' @param divergence_minor_allele Whether to load divergence_minor_allele category data (default: TURE)
#' @param divergence_major_allele Whether to load divergence_major_allele category data (default: TURE)
#' @param chromosomes_names_file_path Path to chromosomes names file
#' @param plot.unplaced Whether to plot unplaced scaffolds
#' @return Output all data in to a list
#' @export

load_data_files <- function(contig_lengths_file_path, input_file_path,
                            common = TRUE, unique_1 = TRUE,
                            unique_2 = TRUE, divergence_minor_allele = TRUE,
                            divergence_major_allele = TRUE,
                            chromosomes_names_file_path = NULL,
                            plot.unplaced = TRUE) {

  output <- list()

  print(" - Loading chromosomes names file")
  output$names <- load_chromosomes_names(chromosomes_names_file_path)

  print(" - Loading contig lengths file")
  output$lengths <- load_contig_lengths(contig_lengths_file_path, chromosomes_names = output$names)
  all_data <- load_single_data_file(input_file_path, output$lengths, plot.unplaced = plot.unplaced)
  output$original_data <- all_data

  if (common) {

    print(" - Loading sliding window common SNPs data")
    output$common <- all_data[,c(-4, -5, -6, -7)]
    # output$common$Color = 1

  }

  if (unique_1) {

    print(" - Loading sliding window unique_1 data")
    output$unique_1 <- all_data[,c(-3, -5, -6, -7)]

  }

  if (unique_2) {

    print(" - Loading sliding window unique_2 data")
    output$unique_2 <- all_data[,c(-3, -4, -6, -7)]
    # output$unique_2$Color = 3

  }

  if (divergence_minor_allele) {

    print(" - Loading sliding window divergence_minor_allele data")
    output$divergence_minor_allele <- all_data[,c(-3, -4, -5, -7)]
    # output$divergence_minor_allele$Color = 4

  }

  if (divergence_major_allele) {

    print(" - Loading sliding window divergence_major_allele data")
    output$divergence_major_allele <- all_data[,c(-3, -4, -5, -6)]
    # output$divergence_major_allele$Color = 5

  }

  return(output)
}


#' @title Loading a single data file generated by postpsass
#' @description Loading a single file generated by postpsass
#' @param input_file_path file generated by postpsass
#' @param chromosomes_names Generated by load_chromosomes_names_file
#' @param plot.unplaced Whether to plot unplaced scaffolds
#' @return Output data frame

load_contig_lengths <- function(input_file_path, chromosomes_names = NULL, plot.unplaced = TRUE) {

    raw_data <- suppressMessages(read.delim(input_file_path, header=FALSE))
    data <- raw_data$V2
    names(data) <- raw_data$V1

    output <- list()

    if (!is.null(chromosomes_names)) {  # If a chromosomes names file was provided, it is used to determine chromosomes

        output$lg <- subset(data, names(data) %in% names(chromosomes_names) & chromosomes_names[names(data)] != "MT")
        output$lg <- output$lg[gtools::mixedorder(chromosomes_names[names(output$lg)])]
        output$unplaced <- subset(data, !(names(data) %in% names(chromosomes_names)))

    } else {  # Otherwise, try to determine chromosomes automatically (condition: start with LG / Ch / NC)

        output$lg <- subset(data, substr(names(data), 1, 2) %in% c("LG", "lg", "Lg", "Ch", "ch", "CH", "NC"))

        if (is.vector(output$lg) && length(output$lg) > 1) {

            # Usually mitochondria is also called NC_xxx. If one chromosome is > 50 times smaller than the average of all ohter chromosomes,
            # or it is smaller than 50000 bp, it is considered to be the mitochondria and is removed
            putative_mt <- min(output$lg)
            if (50 * putative_mt < median(output$lg) | putative_mt < 50000) output$lg <- output$lg[output$lg != putative_mt]

            output$lg <- output$lg[gtools::mixedorder(names(output$lg))]  # Order chromosomes based on their ID

        } else if (length(output$lg) == 1) {

            output$lg <- output$lg[-1]  # If there is only one LG, it is assumed it is a mitochondria
        }

        output$unplaced <- subset(data, !(substr(names(data), 1, 2) %in% c("LG", "lg", "Ch", "ch", "MT", "mt", "NC")))

    }

    output$unplaced <- sort(output$unplaced, decreasing = TRUE)  # Sort unplaced scaffolds by size

    if (plot.unplaced) {

        output$plot <- c(output$lg, "Unplaced" = sum(output$unplaced))

    } else {

        output$plot <- output$lg

    }

    return(output)
}


#' @title Loading a single data file generated by postpsass
#' @description Loading a single file generated by postpsass
#' @param input_file_path file generated by postpsass
#' @param contig_lengths Generated by load_contig_lengths function
#' @param plot.unplaced Whether to plot unplaced scaffolds
#' @return Output data frame
load_single_data_file <- function(input_file_path, contig_lengths, plot.unplaced = TRUE, snp_pos = FALSE) {

    data <- suppressMessages(read.delim(input_file_path))

    data_lg <- subset(data, data$Contig %in% names(contig_lengths$lg))
    data_unplaced <- subset(data, data$Contig %in% names(contig_lengths$unplaced))

    # If lgs were found, sort them and set their color index to 2 (for the plotting later)
    if (dim(data_lg)[1] > 0) {

        data_lg$Color <- rep(2, dim(data_lg)[1])

    }

    if (plot.unplaced & dim(data_unplaced)[1] > 0) {  # If unplaced scaffolds should be grouped and there is at least one unplaced scaffold

        # Order unplaced contigs data by contig length and then by position on the contig
        data_unplaced <- data_unplaced[order(match(data_unplaced$Contig, names(contig_lengths$unplaced)), data_unplaced$Position), ]

        # Attribute a color index to each unplaced contig, alternating between 0 and 1
        order <- seq(1, length(unique(data_unplaced$Contig)))
        names(order) <- unique(data_unplaced$Contig)
        data_unplaced$Color <- order[data_unplaced$Contig] %% 2

        # Transform position on each contig into position on cumulated contig
        temp <- cumsum(contig_lengths$unplaced) - contig_lengths$unplaced[1]
        data_unplaced$Original_position <- data_unplaced$Position
        data_unplaced$Position <- data_unplaced$Position + temp[data_unplaced$Contig]
        data_unplaced$Contig_id <- data_unplaced$Contig
        data_unplaced$Contig <- "Unplaced"

        # Regroup data into one data frame
        data_lg$Contig_id <- data_lg$Contig
        data_lg$Original_position <- data_lg$Position
        data <- rbind(data_lg, data_unplaced)
        data$Contig <- factor(data$Contig, levels = c(names(contig_lengths$lg), "Unplaced"))

    } else {

        data <- data_lg
        data$Original_position <- data$Position
        data$Contig <- factor(data$Contig, levels = names(contig_lengths$lg))

    }

    return(data)
}


#' @title Loading chromosomes name file
#' @description Two column should provide in this file: original name and corresponding names for sector name
#' @param input_file_path Path to chromosomes name file
#' @return Data.frame of chromosomes name file

load_chromosomes_names <- function(input_file_path) {

    if (!is.null(input_file_path)) {

        raw_data <- suppressMessages(read.delim(input_file_path, header=FALSE))
        data <- as.character(raw_data$V2)
        names(data) <- raw_data$V1
        data <- gtools::mixedsort(data)
        return(data)

    } else {

        return(NULL)

    }
}
tankbuild/postpsassR documentation built on May 11, 2019, 3:07 p.m.