R/api_timeline.R

Defines functions .timeline_check .timeline_format .timeline_during .timeline_match_indexes .timeline_match .timeline_valid_date .timeline_class_info

#' @title Define the information required for classifying time series
#'
#' @name .timeline_class_info
#'
#' @keywords internal
#' @noRd
#' @author Gilberto Camara, \email{gilberto.camara@@inpe.br}
#'
#' @description Time series classification requires a series of steps:
#' (a) Provide labelled samples that will be used as training data.
#' (b) Provide information on how the classification will be performed,
#'     including data timeline,and start and end dates per interval.
#' (c) Clean the training data to ensure it meets the specifications
#'     of the classification info.
#' (d) Use the clean data to train a machine learning classifier.
#' (e) Classify non-labelled data sets.
#'
#' In this set of steps, this function provides support for step (b).
#' It requires the user to provide a timeline, the classification interval,
#' and the start and end dates of the reference period. T
#' he results is a tibble with information that allows the user
#' to perform steps (c) to (e).
#'
#' @param  data            Description on the data being classified.
#' @param  samples         Samples used for training the classification model.
#'
#' @return A tibble with the classification information.
#'
.timeline_class_info <- function(data, samples) {
    # find the timeline
    timeline <- sits_timeline(data)
    # precondition is the timeline correct?
    .check_length(
        x = timeline,
        len_min = 1,
        msg = "sits_timeline_class_info: invalid timeline"
    )
    # find the labels
    labels <- sits_labels(samples)
    # find the bands
    bands <- sits_bands(samples)
    # what is the reference start date?
    ref_start_date <- lubridate::as_date(samples[1, ]$start_date)
    # what is the reference end date?
    ref_end_date <- lubridate::as_date(samples[1, ]$end_date)
    # number of samples
    num_samples <- nrow(samples[1, ]$time_series[[1]])
    # obtain the reference dates that match the patterns in the full timeline
    ref_dates <- .timeline_match(
        timeline,
        ref_start_date,
        ref_end_date,
        num_samples
    )
    # obtain the indexes of the timeline that match the reference dates
    dates_index <- .timeline_match_indexes(timeline, ref_dates)
    # find the number of the samples
    nsamples <- dates_index[[1]][2] - dates_index[[1]][1] + 1
    # create a class_info tibble to be used in the classification
    class_info <- tibble::tibble(
        bands = list(bands),
        labels = list(labels),
        timeline = list(timeline),
        num_samples = nsamples,
        ref_dates = list(ref_dates),
        dates_index = list(dates_index)
    )
    return(class_info)
}
#' @title Test if date fits with the timeline
#'
#' @name .timeline_valid_date
#' @keywords internal
#' @noRd
#' @author Gilberto Camara, \email{gilberto.camara@@inpe.br}
#'
#' @description A timeline is a list of dates where observations are available.
#' This function estimates if a date is valid by comparing it to the timeline.
#' If the date's estimate is not inside the timeline and the difference between
#' the date and the first date of timeline is greater than the acquisition
#' interval of the timeline, the date is not valid.
#'
#' @param date        A date.
#' @param timeline    A vector of reference dates.
#'
#' @return Is this is valid starting date?
#'
.timeline_valid_date <- function(date, timeline) {
    # is the date inside the timeline?
    if (date %within% lubridate::interval(
        timeline[1],
        timeline[length(timeline)]
    )) {
        return(TRUE)
    }

    # what is the difference in days between the last two days of the timeline?
    timeline_diff <- as.integer(timeline[2] - timeline[1])
    # if the difference in days in the timeline is smaller than the difference
    # between the reference date and the first date of the timeline, then
    # we assume the date is valid
    if (abs(as.integer(date - timeline[1])) <= timeline_diff) {
        return(TRUE)
    }
    # what is the difference in days between the last two days of the timeline?
    timeline_diff <- as.integer(timeline[length(timeline)] -
        timeline[length(timeline) - 1])

    # if the difference in days in the timeline is smaller than the difference
    # between the reference date and the last date of the timeline, then
    # we assume the date is valid
    if (abs(as.integer(date - timeline[length(timeline)])) <= timeline_diff) {
        return(TRUE)
    }
    return(FALSE)
}

#' @title Find dates in the input data cube that match those of the patterns
#' @name .timeline_match
#' @keywords internal
#' @noRd
#' @author Gilberto Camara, \email{gilberto.camara@@inpe.br}
#'
#' @description For correct classification, the input data set
#'              should be aligned to that of the reference data set.
#'              This function aligns these data sets.
#'
#' @param timeline_data         Timeline of input to be classified.
#' @param model_start_date      Model start date
#' @param model_end_date        Model end date.
#' @param num_samples           Number of samples.
#'
#' @return A list of breaks that will be applied to the input data set.
#'
.timeline_match <- function(timeline_data,
                            model_start_date,
                            model_end_date,
                            num_samples) {
    # set caller to show in errors
    .check_set_caller(".timeline_match")

    # make sure the timeline is a valid set of dates
    timeline_data <- lubridate::as_date(timeline_data)
    # define the input start date
    input_start_date <- timeline_data[1]

    # create a list  the subset dates to break the input data set
    subset_dates <- list()
    # consider two cases:
    # (1) start date of data is before start date model
    # (2) start date of data is the same or after start date of model
    if (timeline_data[1] < model_start_date) {
        # what is the expected start and end dates based on the patterns?
        ref_st_mday <- as.character(lubridate::mday(model_start_date))
        ref_st_month <- as.character(lubridate::month(model_start_date))
        year_st_date <- as.character(lubridate::year(input_start_date))
        est_start_date <- lubridate::as_date(paste0(
            year_st_date, "-",
            ref_st_month, "-",
            ref_st_mday
        ))
        # find the actual starting date by searching the timeline
        idx_start_date <- which.min(abs(est_start_date - timeline_data))
    } else {
        # find the actual starting date by searching the timeline
        idx_start_date <- which.min(abs(model_start_date - timeline_data))
    }

    # take the start date of the input to be classified
    start_date <- timeline_data[idx_start_date]
    # is the start date a valid one?
    .check_that(
        x = .timeline_valid_date(start_date, timeline_data),
        msg = "start date in not inside timeline"
    )
    # what is the expected end date of the classification?
    idx_end_date <- idx_start_date + (num_samples - 1)
    end_date <- timeline_data[idx_end_date]
    # is the start date a valid one?
    .check_that(
        x = !(is.na(end_date)),
        msg = paste(
            "start and end date do not match timeline/n",
            "Please compare your timeline with your samples"
        )
    )

    # go through the timeline of the data
    # find the reference dates for the classification
    while (!is.na(end_date)) {
        # add the start and end date
        subset_dates[[length(subset_dates) + 1]] <- c(start_date, end_date)

        # estimate the next start and end dates
        idx_start_date <- idx_end_date + 1
        start_date <- timeline_data[idx_start_date]
        idx_end_date <- idx_start_date + num_samples - 1
        # estimate
        end_date <- timeline_data[idx_end_date]
    }
    # is the end date a valid one?
    end_date <- subset_dates[[length(subset_dates)]][2]
    .check_that(
        x = .timeline_valid_date(end_date, timeline_data),
        msg = "end_date not inside timeline"
    )
    return(subset_dates)
}

#' @title Find indexes in a timeline that match the reference dates
#' @name .timeline_match_indexes
#' @keywords internal
#' @noRd
#' @author Gilberto Camara, \email{gilberto.camara@@inpe.br}
#'
#' @description For correct classification, the time series of the input data
#'              should be aligned to that of the reference data set
#'              (usually a set of patterns).
#'              This function aligns these data sets so that shape
#'              matching works correctly
#'
#' @param timeline      Timeline of input observations (vector).
#' @param ref_dates     List of breaks to be applied to the input data set.
#'
#' @return              A list of indexes that match the reference dates
#'                      to the timelines.
#'
.timeline_match_indexes <- function(timeline, ref_dates) {
    dates_index <- ref_dates |>
        purrr::map(function(date_pair) {
            start_index <- which(timeline == date_pair[1])
            end_index <- which(timeline == date_pair[2])

            dates_index <- c(start_index, end_index)
            return(dates_index)
        })

    return(dates_index)
}
#' @title Find the subset of a timeline that is contained
#'        in an interval defined by start_date and end_date
#' @name  .timeline_during
#' @noRd
#' @keywords internal
#'
#' @param timeline      A valid timeline
#' @param start_date    A date which encloses the start of timeline
#' @param end_date      A date which encloses the end of timeline
#'
#' @return              A timeline
#'
.timeline_during <- function(timeline,
                             start_date = NULL,
                             end_date = NULL) {
    # set caller to show in errors
    .check_set_caller(".sits_timeline_during")
    # obtain the start and end indexes
    if (purrr::is_null(start_date)) {
        start_date <- timeline[1]
    }
    if (purrr::is_null(end_date)) {
        end_date <- timeline[length(timeline)]
    }
    valid <- timeline >= lubridate::as_date(start_date) &
        timeline <= lubridate::as_date(end_date)
    .check_that(
        x = any(valid),
        msg = paste("no valid data between", start_date, "and", end_date)
    )
    return(timeline[valid])
}

#' @title Find if the date information is correct
#' @name  .timeline_format
#' @keywords internal
#' @noRd
#' @description Given a information about dates, check if all dates can be
#'              interpreted by lubridate
#' @param dates  character vector representing dates
#' @return date class vector
#'
.timeline_format <- function(dates) {
    # set caller to show in errors
    .check_set_caller(".timeline_format")
    .check_length(
        x = dates,
        len_min = 1,
        msg = "invalid date parameter"
    )
    # convert to character (strsplit does not deal with dates)
    dates <- as.character(dates)
    # check type of date interval
    converted_dates <- purrr::map_dbl(dates, function(dt) {
        if (length(strsplit(dt, "-")[[1]]) == 1) {
            converted_dates <- lubridate::fast_strptime(dt, c("%Y%m%d", "%Y"))
        } else if (length(strsplit(dt, "-")[[1]]) == 2) {
            converted_dates <- lubridate::fast_strptime(dt, "%Y-%m")
        } else if (length(strsplit(dt, "-")[[1]]) == 3) {
            converted_dates <- lubridate::fast_strptime(dt, "%Y-%m-%d")
        } else
            stop("cannot convert date to YYYY-MM-DD format")
        # transform to date object
        converted_dates <- lubridate::as_date(converted_dates)
        # check if there are NAs values
        .check_that(
            x = all(!is.na(converted_dates)),
            msg = paste0("invalid date format '", dt, "' in file name")
        )
        return(converted_dates)
    })

    # convert to a vector of dates
    converted_dates <- lubridate::as_date(converted_dates)
    # post-condition
    .check_length(
        x = converted_dates,
        len_min = length(dates),
        len_max = length(dates),
        msg = "invalid date values"
    )
    return(converted_dates)
}

#' @title Checks that the timeline of all time series of a data set are equal
#' @name .timeline_check
#' @keywords internal
#' @noRd
#' @author Gilberto Camara, \email{gilberto.camara@@inpe.br}
#'
#' @description This function tests if all time series in a sits tibble
#' have the same number of samples
#'
#' @param  data  Either a sits tibble
#' @return       TRUE if the length of time series is unique
#'
.timeline_check <- function(data) {
    if (length(unique(lapply(data$time_series, nrow))) == 1) {
        return(TRUE)
    } else {
        return(FALSE)
    }
}
e-sensing/sits documentation built on Jan. 28, 2024, 6:05 a.m.