R/quality_functions.R In STRMPS: Analysis of Short Tandem Repeat (STR) Massively Parallel Sequencing (MPS) Data

```#' @name phredQualityScore
#' @aliases solexaQualityScore
#'
#' @title Convert probability to quality score
#'
#' @description Calculates the quality score (Phred or Solexa) given a probability of error.
#'
#' @param p Probability of error.
#'
#' @rdname qualityScore
#' @return \code{phredQualityScore(p)} returns a Phred quality score.
#' @example inst/examples/qualityScores.R
phredQualityScore <- function(p) {
q_p <- -10*log10(p)
return(q_p)
}

#' @rdname qualityScore
#' @return \code{solexaQualityScore(p)} returns a Solexa quality score.
solexaQualityScore <- function(p) {
q_s <- -10 * log10( p / (1 - p))
return(q_s)
}

#' @name phredQualityProbability
#' @aliases solexaQualityProbability
#'
#' @title Quality score to probability
#'
#' @description Converts a quality score (Phred or Solexa) to a probability of error.
#'
#' @param q Quality score.
#'
#' @rdname probabilityOfError
#' @return \code{phredQualityScore(q_phred)} and \code{solexaQualityScore(q_solexa)} returns a probability of error.
#' @example inst/examples/probabilityOfError.R
phredQualityProbability <- function(q) {
p <- 10^(-q/10)
return(p)
}

#' @rdname probabilityOfError
solexaQualityProbability <- function(q) {
p <- 1 / (10^(q / 10) + 1)
return(p)
}

.geometricMean <- function(x) {
return(exp(1/length(x) * sum(log(x))))
}

.aggregateQuality <- function(q) {
qS <- as.matrix(PhredQuality(as.character(q)))
qAvg <- PhredQuality(apply(qS, 2, function(x) .geometricMean(phredQualityProbability(x))))
cAvg <- as.character(qAvg)

if (length(cAvg) == 0)
cAvg = ""

return(cAvg)
}

.createRanking <- function(x) {
unique_x <- unique(x)
rank = c()
coverage = c()
for (i in seq_along(unique_x)) {
rank = c(rank, rep(i, length(which(x == unique_x[i]))))
coverage = c(coverage, rep(length(which(x == unique_x[i])), length(which(x == unique_x[i]))))
}

final <- data.frame(Rank = rank, Coverage = coverage)
return(final)
}

.baseErrorProbabilityConditinal <- function(quality_variant, matching_bases, windowSize = 5) {
probability_error_variant <- phredQualityProbability(c(as(PhredQuality(quality_variant), "matrix")))

if (length(probability_error_variant) == 0)
return(0)

leftmost_neighbour <- ifelse(seq(1, length(probability_error_variant)) - windowSize < 1, 1, seq(1, length(probability_error_variant)) - windowSize)
rightmost_neighbour <- ifelse(seq(1, length(probability_error_variant)) + windowSize > length(probability_error_variant), length(probability_error_variant), seq(1, length(probability_error_variant)) + windowSize)
base_probability_error_variant <- rep(NA, length(probability_error_variant))

for (i in 1:length(base_probability_error_variant)) {
weighted_base_probability_variant <- probability_error_variant / (abs(1:length(probability_error_variant) - i) + 1)

base_probability_error_variant[i] <- max(weighted_base_probability_variant[leftmost_neighbour[i]:rightmost_neighbour[i]]) #/ sum(weighted_base_probability_variant)
}

conditional_probability <- exp(sum(log(base_probability_error_variant[-matching_bases])) + sum(log(1 - base_probability_error_variant[matching_bases])))
return(conditional_probability)
}

.baseErrorProbabilityMarginal <- function(quality_variant, windowSize = 5) {
probability_error_variant <- phredQualityProbability(c(as(PhredQuality(quality_variant), "matrix")))

if (length(probability_error_variant) == 0)
return(0)

leftmost_neighbour <- ifelse(seq(1, length(probability_error_variant)) - windowSize < 1, 1, seq(1, length(probability_error_variant)) - windowSize)
rightmost_neighbour <- ifelse(seq(1, length(probability_error_variant)) + windowSize > length(probability_error_variant), length(probability_error_variant), seq(1, length(probability_error_variant)) + windowSize)

base_probability_error_variant <- rep(NA, length(probability_error_variant))
for (i in 1:length(base_probability_error_variant)) {
weighted_base_probability_variant <- probability_error_variant / (abs(1:length(probability_error_variant) - i) + 1)

base_probability_error_variant[i] <- max(weighted_base_probability_variant[leftmost_neighbour[i]:rightmost_neighbour[i]]) #/ sum(weighted_base_probability_variant)
}

marginal_probability <- exp(sum(log(1 - base_probability_error_variant)))
return(marginal_probability)
}

.sampleQualityCleaning <- function(stringCoverageList, variantDatabase, trace = FALSE) {
class(stringCoverageList) <- "list"

sampleTibbleBind <- bind_rows(stringCoverageList)
sampleTibble <- sampleTibbleBind %>%
left_join(variantDatabase %>% ungroup() %>% mutate(Observed = 1) %>% select(Region, Observed),
by = c("Region")) %>%
mutate(Observed = ifelse(is.na(Observed), 0, Observed))
markers <- unique(sampleTibble\$Marker)

sampleTibbleCleaned_m <- vector("list", length(markers))
for (mm in seq_along(markers)) {
if (trace)
cat("\tMarker:", mm, "/", length(markers), "\n")

sampleTibble_m <- sampleTibble %>% filter(Marker == markers[mm]) %>%
mutate(BasePairs = nchar(Region))

sampleTibble_m_split <- split(sampleTibble_m, sampleTibble_m\$Allele)
sampleTibble_m_cleaned <- bind_rows(lapply(seq_along(sampleTibble_m_split), function(a) {
sampleTibble_ma <- sampleTibble_m_split[[a]]
observed <- sampleTibble_ma\$Observed
regions <- sampleTibble_ma\$Region
true_regions_index <- which(observed == 1)

true_regions <- regions[observed == 1]
varient_regions <- regions[observed == 0]

qualities <- sampleTibble_ma\$AggregateQuality
na_regions <- qualities == ""

probability_error_variant <- rep(1, length(qualities))

if (sum(!na_regions) > 0) {
probability_error_variant[!na_regions] <- unname(sapply(qualities[!na_regions], .baseErrorProbabilityMarginal)) *
sampleTibble_ma\$Coverage[!na_regions] / sum(sampleTibble_m\$Coverage)
}

if (length(true_regions) > 0) {
all_combinations <- expand.grid(seq(1, length(true_regions)), seq(1, length(regions)))

weight_matrix <- matrix(0, ncol = length(regions), nrow = length(true_regions))
for (j in 1:dim(all_combinations)[1]) {
s_1 <- true_regions[all_combinations[j, 1]]
s_2 <- regions[all_combinations[j, 2]]

if (s_1 == s_2) {
if (is.na(qualities[all_combinations[j, 1]]) & is.na(qualities[all_combinations[j, 2]])) {
weight_matrix[all_combinations[j, 1], all_combinations[j, 2]] <- 1
}
else if (is.na(qualities[all_combinations[j, 1]]) | is.na(qualities[all_combinations[j, 2]])) {
weight_matrix[all_combinations[j, 1], all_combinations[j, 2]] <- 0
}
else {
weight_matrix[all_combinations[j, 1], all_combinations[j, 2]] <- 1
}
}
else {
if (is.na(qualities[all_combinations[j, 1]]) | is.na(qualities[all_combinations[j, 2]])) {
weight_matrix[all_combinations[j, 1], all_combinations[j, 2]] <- 0
}
else {
strings_split <- str_split(c(s_1, s_2), "")

positions_equal <- which(strings_split[[1]] == strings_split[[2]])

probability_of_error <- .baseErrorProbabilityConditinal(qualities[all_combinations[j, 2]], positions_equal)
weight_matrix[all_combinations[j, 1], all_combinations[j, 2]] <- probability_of_error ##* sampleTibble_ma\$Coverage[true_regions_index[all_combinations[j, 1]]] / sum(sampleTibble_m\$Coverage)
}
}
}

weight_matrix_extended <- rbind(weight_matrix, diag(probability_error_variant, ncol = length(probability_error_variant), nrow = length(probability_error_variant))[observed == 0, ])
which_max <- apply(weight_matrix_extended, 2, which.max)
alternative_region <- c(true_regions, varient_regions)[which_max]
}
else {
weight_matrix_extended <- diag(probability_error_variant, nrow = length(probability_error_variant), ncol = length(probability_error_variant))
which_max <- apply(weight_matrix_extended, 2, which.max)
alternative_region <- varient_regions[which_max]

# if (length(probability_error_variant == 1)) {
#     if (probability_error_variant < 0.8^(nchar(regions)))
#         alternative_region = NA
# }
}

res <- sampleTibble_ma %>% mutate(AlternativeRegion = alternative_region)
return(res)
})) %>%
group_by(Marker, Type, Allele, MotifLength, AlternativeRegion) %>%
summarise(Coverage = sum(Coverage)) %>% ungroup() %>%
rename("Region" = "AlternativeRegion") %>%
filter(!is.na(Region))

sampleTibbleCleaned_m[[mm]] <- sampleTibble_m_cleaned
}

sampleTibbleCleaned <- bind_rows(sampleTibbleCleaned_m)
return(sampleTibbleCleaned)
}
```

Try the STRMPS package in your browser

Any scripts or data that you put into this service are public.

STRMPS documentation built on May 2, 2019, 3:35 a.m.