Nothing
#' Standardize the chromosome name annotations for a data frame.
#'
#' @param df A data frame whose first column contains the Chromosome name
#'
#' @return A data frame whose Chromosome names are only in the form of 1:22, "X"
#' and "Y".
#'
#' @keywords internal
StandardChromName <- function(df) {
# Is there any row in df whose Chromosome names have "GL"?
if (sum(grepl("GL", df[[1]])) > 0) {
df <- df[-grep("GL", df[[1]]), ]
}
# Is there any row in df whose Chromosome names have "KI"?
if (sum(grepl("KI", df[[1]])) > 0) {
df <- df[-grep("KI", df[[1]]), ]
}
# Is there any row in df whose Chromosome names have "random"?
if (sum(grepl("random", df[[1]])) > 0) {
df <- df[-grep("random", df[[1]]), ]
}
# Is there any row in df whose Chromosome names are "Hs37D5"?
if (sum(grepl("^Hs", df[[1]])) > 0) {
df <- df[-grep("^Hs", df[[1]]), ]
}
# Is there any row in df whose Chromosome names contain "M"?
if (sum(grepl("M", df[[1]])) > 0) {
df <- df[-grep("M", df[[1]]), ]
}
# Remove the "chr" character in the Chromosome's name
df[, 1] <- sub(pattern = "chr", replacement = "", df[[1]])
return(df)
}
#' Standardize the chromosome name annotations for a data frame.
#'
#' @param df An in-memory data.frame representing a VCF.
#'
#' @param name.of.VCF Name of the VCF file.
#'
#' @return A \strong{list} with the elements
#' * \code{df} a data frame with variants that had "legal" chromosome
#' names (see below for illegal chromosome names).
#' Leading "chr" strings are removed.
#' * \code{discarded.variants}: \strong{Non-NULL only if} there
#' variants with illegal chromosome names; these are
#' names that contain the strings "GL", "Hs", "KI", "M", "random".
#' @md
#'
#' @keywords internal
StandardChromNameNew <- function(df, name.of.VCF = NULL) {
# Create an empty data frame for discarded variants
discarded.variants <- df[0, ]
# Is there any row in df whose Chromosome names have "GL"?
if (sum(grepl("GL", df$CHROM)) > 0) {
warning("In VCF ", ifelse(is.null(name.of.VCF), "", dQuote(name.of.VCF)),
" ", sum(grepl("GL", df$CHROM)), " row out of ",
nrow(df), " had chromosome names that contain 'GL' and ",
"were removed. ",
"See discarded.variants in the return value for more details")
df1 <- df[-grep("GL", df$CHROM), ]
df1.to.remove <- df[grep("GL", df$CHROM), ]
df1.to.remove$discarded.reason <- 'Chromosome name contains "GL"'
discarded.variants <-
dplyr::bind_rows(discarded.variants, df1.to.remove)
} else {
df1 <- df
}
# Is there any row in df whose Chromosome names have "KI"?
if (sum(grepl("KI", df1$CHROM)) > 0) {
warning("In VCF ", ifelse(is.null(name.of.VCF), "", dQuote(name.of.VCF)),
" ", sum(grepl("KI", df1$CHROM)), " row out of ",
nrow(df), " had chromosome names that contain 'KI' and ",
"were removed. ",
"See discarded.variants in the return value for more details")
df2 <- df1[-grep("KI", df1$CHROM), ]
df2.to.remove <- df1[grep("KI", df1$CHROM), ]
df2.to.remove$discarded.reason <- 'Chromosome name contains "KI"'
discarded.variants <-
dplyr::bind_rows(discarded.variants, df2.to.remove)
} else {
df2 <- df1
}
# Is there any row in df whose Chromosome names have "random"?
if (sum(grepl("random", df2$CHROM)) > 0) {
warning("In VCF ", ifelse(is.null(name.of.VCF), "", dQuote(name.of.VCF)),
" ", sum(grepl("random", df2$CHROM)), " row out of ",
nrow(df), " had chromosome names that contain 'random' and ",
"were removed. ",
"See discarded.variants in the return value for more details")
df3 <- df2[-grep("random", df2$CHROM), ]
df3.to.remove <- df2[grep("random", df2$CHROM), ]
df3.to.remove$discarded.reason <- 'Chromosome name contains "random"'
discarded.variants <-
dplyr::bind_rows(discarded.variants, df3.to.remove)
} else {
df3 <- df2
}
# Is there any row in df whose Chromosome names are "Hs37D5"?
if (sum(grepl("^Hs", df3$CHROM)) > 0) {
warning("In VCF ", ifelse(is.null(name.of.VCF), "", dQuote(name.of.VCF)),
" ", sum(grepl("^Hs", df3$CHROM)), " row out of ",
nrow(df), " had chromosome names that contain 'Hs' and ",
"were removed. ",
"See discarded.variants in the return value for more details")
df4 <- df3[-grep("^Hs", df3$CHROM), ]
df4.to.remove <- df3[grep("^Hs", df3$CHROM), ]
df4.to.remove$discarded.reason <- 'Chromosome name contains "Hs"'
discarded.variants <-
dplyr::bind_rows(discarded.variants, df4.to.remove)
} else {
df4 <- df3
}
# Is there any row in df whose Chromosome names contain "M"?
if (sum(grepl("M", df4$CHROM)) > 0) {
warning("In VCF ", ifelse(is.null(name.of.VCF), "", dQuote(name.of.VCF)),
" ", sum(grepl("M", df4$CHROM)), " row out of ",
nrow(df), " had chromosome names that contain 'M' and ",
"were removed. ",
"See discarded.variants in the return value for more details")
df5 <- df4[-grep("M", df4$CHROM), ]
df5.to.remove <- df4[grep("M", df4$CHROM), ]
df5.to.remove$discarded.reason <- 'Chromosome name contains "M"'
discarded.variants <-
dplyr::bind_rows(discarded.variants, df5.to.remove)
} else {
df5 <- df4
}
if (nrow(discarded.variants) == 0) {
return(list(df = df5))
} else {
return(list(df = df5, discarded.variants = discarded.variants))
}
}
#' Check and, if possible, correct the chromosome names in a VCF \code{data.frame}.
#'
#' @param vcf.df A VCF as a \code{data.frame}. Check the names in column
#' \code{CHROM}.
#'
#' @param name.of.VCF Name of the VCF file.
#'
#' @param ref.genome The reference genome with the chromosome names to check
#' \code{vcf.df$CHROM} against; must be a Bioconductor
#' \code{BSgenome}, e.g.
#' \code{BSgenome.Hsapiens.UCSC.hg38}.
#'
#' @return If the \code{vcf.df$CHROM} values are correct or
#' can be corrected, then a vector of chromosome names
#' that can be used as a replacement for \code{vcf.df$CHROM}.
#' If the names in \code{vcf.df$CHROM} cannot be made to
#' be consistent with the chromosome names in \code{ref.genome},
#' then \code{stop}.
#'
#' @keywords internal
CheckAndFixChrNames <- function(vcf.df, ref.genome, name.of.VCF = NULL) {
names.to.check <- unique(vcf.df$CHROM)
# Check whether the naming of chromosomes in vcf.df is consistent
if(!sum(grepl("^chr", names.to.check)) %in% c(0, length(names.to.check))) {
stop("\nNaming of chromosomes in VCF ", dQuote(name.of.VCF),
" is not consistent: ",
paste(names.to.check, collapse = " "))
}
ref.genome.names <- seqnames(ref.genome)
not.matched <- setdiff(names.to.check, ref.genome.names)
# The names match -- we leave well-enough alone
if (length(not.matched) == 0) return(vcf.df$CHROM)
vcf.has.chr.prefix <- any(grepl(pattern = "^chr", names.to.check))
ref.has.chr.prefix <- any(grepl(pattern = "^chr", ref.genome.names))
new.chr.names <- vcf.df$CHROM
if (ref.has.chr.prefix && !vcf.has.chr.prefix) {
names.to.check <- paste0("chr", names.to.check)
new.chr.names <- paste0("chr", new.chr.names)
not.matched1 <- setdiff(names.to.check, ref.genome.names)
if (length(not.matched1) == 0) return(new.chr.names)
}
if (!ref.has.chr.prefix && vcf.has.chr.prefix) {
names.to.check <- gsub("chr", "", names.to.check)
new.chr.names <- gsub("chr", "", new.chr.names)
not.matched2 <- setdiff(names.to.check, ref.genome.names)
if (length(not.matched2) == 0) return(new.chr.names)
}
organism <- BSgenome::organism(ref.genome)
CheckForPossibleMatchedChrName <- function(chr1, chr2) {
if (chr1 %in% names.to.check) {
# If chr2 is already in names.to.check, then give a warning
if (chr2 %in% names.to.check) {
warningmessage <- function(x, y) {
warning("\n", x, " and ", y, " both are chromosome names in VCF ",
dQuote(name.of.VCF),
"for ", organism, ". ", x, " has been changed to ", y,
" internally for downstream processing")
}
if (vcf.has.chr.prefix) {
if (grepl(pattern = "^chr", chr1)) {
warningmessage(chr1, chr2)
} else {
x <- paste0("chr", chr1)
y <- paste0("chr", chr2)
warningmessage(x, y)
}
} else {
if (!grepl(pattern = "^chr", chr1)) {
warningmessage(chr1, chr2)
} else {
x <- gsub("chr", "", chr1)
y <- gsub("chr", "", chr2)
warningmessage(x, y)
}
}
}
new.chr.names[new.chr.names == chr1] <<- chr2
names.to.check <- setdiff(names.to.check, chr1)
names.to.check <<- unique(c(names.to.check, chr2))
}
}
if (organism == "Homo sapiens") {
# Maybe the problem is that X and Y are encoded as chr23 and chr24
CheckForPossibleMatchedChrName("chr23", "chrX")
CheckForPossibleMatchedChrName("chr24", "chrY")
# Maybe the problem is that X and Y are encoded as 23 and 24
CheckForPossibleMatchedChrName("23", "X")
CheckForPossibleMatchedChrName("24", "Y")
}
if (organism == "Mus musculus") {
# Maybe the problem is that X and Y are encoded as chr20 and chr21
CheckForPossibleMatchedChrName("chr20", "chrX")
CheckForPossibleMatchedChrName("chr21", "chrY")
# Maybe the problem is that X and Y are encoded as 20 and 21
CheckForPossibleMatchedChrName("20", "X")
CheckForPossibleMatchedChrName("21", "Y")
}
not.matched3 <- setdiff(names.to.check, ref.genome.names)
if (length(not.matched3) == 0) return(new.chr.names)
stop("\nChromosome names in VCF ", dQuote(name.of.VCF),
" not in ref.genome for ", organism, ": ",
# We report the _original_ list of not matched names
paste(not.matched, collapse = " "))
}
#' Check and, if possible, correct the chromosome names in a trans.ranges \code{data.table}
#'
#' @param trans.ranges A \code{\link[data.table]{data.table}} which contains
#' transcript range and strand information. Please refer to
#' \code{\link{TranscriptRanges}} for more details.
#'
#' @param vcf.df A VCF as a \code{data.frame}. Check the names in column
#' \code{CHROM}.
#'
#' @param name.of.VCF Name of the VCF file.
#'
#' @param ref.genome The reference genome with the chromosome names to check
#' \code{vcf.df$CHROM} against; must be a Bioconductor
#' \code{BSgenome}, e.g.
#' \code{BSgenome.Hsapiens.UCSC.hg38}.
#'
#' @return If the \code{vcf.df$CHROM} values are correct or can be corrected,
#' then a vector of chromosome names that can be used as a replacement for
#' \code{trans.ranges$chrom}. If the names in \code{vcf.df$CHROM} cannot be
#' made to be consistent with the chromosome names in
#' \code{trans.ranges$chrom}, then \code{stop}.
#'
#' @keywords internal
CheckAndFixChrNamesForTransRanges <-
function(trans.ranges, vcf.df, ref.genome, name.of.VCF = NULL) {
names.to.check <- as.character(unique(trans.ranges$chrom))
vcf.chr.names <- as.character(unique(vcf.df$CHROM))
# Check whether the naming of chromosomes in vcf.df is consistent
if(!sum(grepl("^chr", vcf.chr.names)) %in% c(0, length(vcf.chr.names))) {
stop("\nNaming of chromosomes in VCF ", dQuote(name.of.VCF),
" is not consistent: ",
paste(vcf.chr.names, collapse = " "))
}
not.matched <- setdiff(vcf.chr.names, names.to.check)
# The names match -- we leave well-enough alone
if (length(not.matched) == 0) return(trans.ranges$chrom)
vcf.has.chr.prefix <- any(grepl(pattern = "^chr", vcf.chr.names))
trans.has.chr.prefix <- any(grepl(pattern = "^chr", names.to.check))
new.chr.names <- as.character(trans.ranges$chrom)
if (trans.has.chr.prefix && !vcf.has.chr.prefix) {
names.to.check <- gsub("chr", "", names.to.check)
new.chr.names <- gsub("chr", "", names.to.check)
names.to.check <- paste0("chr", names.to.check)
new.chr.names <- paste0("chr", new.chr.names)
not.matched1 <- setdiff(vcf.chr.names, names.to.check)
if (length(not.matched1) == 0) return(new.chr.names)
}
if (!trans.has.chr.prefix && vcf.has.chr.prefix) {
names.to.check <- paste0("chr", names.to.check)
new.chr.names <- paste0("chr", new.chr.names)
not.matched2 <- setdiff(vcf.chr.names, names.to.check)
if (length(not.matched2) == 0) return(new.chr.names)
}
organism <- BSgenome::organism(ref.genome)
CheckForPossibleMatchedChrName <- function(chr1, chr2) {
if (chr2 %in% vcf.chr.names) {
# If chr1 is already in vcf.chr.names, then stop
if (chr1 %in% vcf.chr.names) {
stopmessage <- function() {
stop("\n", chr2, " and ", chr1, " both are chromosome names in VCF ",
dQuote(name.of.VCF),
", which should not be the case for ", organism, ". Please check ",
"your data or specify the correct ref.genome argument")
}
if (vcf.has.chr.prefix) {
if (grepl(pattern = "^chr", chr2)) {
stopmessage()
} else {
chr1 <- paste0("chr", chr1)
chr2 <- paste0("chr", chr2)
stopmessage()
}
} else {
if (!grepl(pattern = "^chr", chr1)) {
stopmessage()
} else {
chr1 <- gsub("chr", "", chr1)
chr2 <- gsub("chr", "", chr2)
stopmessage()
}
}
}
# Update the trans.ranges chromosome names to be consistent with
# those in vcf.df
new.chr.names[new.chr.names == chr1] <<- chr2
names.to.check <- setdiff(names.to.check, chr1)
names.to.check <<- unique(c(names.to.check, chr2))
}
}
if (organism == "Homo sapiens") {
# Maybe the problem is that X and Y are encoded as chr23 and chr24
CheckForPossibleMatchedChrName("chrX", "chr23")
CheckForPossibleMatchedChrName("chrY", "chr24")
# Maybe the problem is that X and Y are encoded as 23 and 24
CheckForPossibleMatchedChrName("X", "23")
CheckForPossibleMatchedChrName("Y", "24")
}
if (organism == "Mus musculus") {
# Maybe the problem is that X and Y are encoded as chr20 and chr21
CheckForPossibleMatchedChrName("chrX", "chr20")
CheckForPossibleMatchedChrName("chrY", "chr21")
# Maybe the problem is that X and Y are encoded as 20 and 21
CheckForPossibleMatchedChrName("X", "20")
CheckForPossibleMatchedChrName("Y", "21")
}
not.matched3 <- setdiff(vcf.chr.names, names.to.check)
if (length(not.matched3) == 0) return(new.chr.names)
stop("\nChromosome names in VCF ", dQuote(name.of.VCF),
" not in trans ranges for ", organism, ": ",
# We report the _original_ list of not matched names
paste(not.matched, collapse = " "))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.