R/read.manifest.R

Defines functions read.manifest.27k read.manifest.450k read.manifest.EPIC read.manifest.Mammal

# Internal functions -----------------------------------------------------------

read.manifest.Mammal <- function(file, type = 2) {
    # NOTE: As is, requires grep
    control.line <- system(
        sprintf("grep -n \\\\[Controls\\\\] %s", file), intern = TRUE)
    control.line <- as.integer(sub(":.*", "", control.line))
    stopifnot(length(control.line) == 1 &&
                  is.integer(control.line) &&
                  !is.na(control.line))
    if(type == 1) {
        assay.line <- system(
            sprintf("grep -n \\\\[Assay\\\\] %s", file), intern = TRUE)
        assay.line <- as.integer(sub(":.*", "", assay.line))
        stopifnot(length(assay.line) == 1 &&
                  is.integer(assay.line) &&
                  !is.na(assay.line))
    } else {
        assay.line  <- 0
    }
    colNames <- readLines(file, n = assay.line + 1L)[assay.line + 1L]
    colNames <- strsplit(colNames, ",")[[1]]
    colClasses <- rep("character", length(colNames))
    names(colClasses) <- colNames
    names(colClasses) <- make.names(names(colClasses))
    colClasses[c("MAPINFO")] <- "integer"
    if(type == 1) {
        colClasses <- c(colClasses, drop = "logical")
    }
    manifest <- read.table(
        file = file,
        header = FALSE,
        col.names = names(colClasses),
        sep = ",",
        comment.char = "",
        quote = "",
        skip = assay.line + 1L,
        colClasses = colClasses,
        nrows = control.line - assay.line - 2L)
    manifest$drop <- NULL
    if(type == 2) {
        names(manifest)[c(1,2,30)] <- c("Name", "Internal_Name", "Internal_Name2")
    }
    
    manifest$AddressA_ID <- gsub("^0*", "", manifest$AddressA_ID)
    manifest$AddressB_ID <- gsub("^0*", "", manifest$AddressB_ID)
    
    TypeI <- manifest[
        manifest$Infinium_Design_Type == "I",
        c("Name", "AddressA_ID", "AddressB_ID", "Color_Channel", "Next_Base",
          "AlleleA_ProbeSeq", "AlleleB_ProbeSeq")]
    names(TypeI)[c(2, 3, 4, 5, 6, 7)] <- c(
        "AddressA", "AddressB", "Color", "NextBase", "ProbeSeqA", "ProbeSeqB")
    
    TypeI <- as(TypeI, "DataFrame")
    TypeI$ProbeSeqA <- DNAStringSet(TypeI$ProbeSeqA)
    TypeI$ProbeSeqB <- DNAStringSet(TypeI$ProbeSeqB)
    TypeI$NextBase <- DNAStringSet(TypeI$NextBase)
    TypeI$nCpG <- as.integer(
        oligonucleotideFrequency(TypeI$ProbeSeqB, width = 2)[, "CG"] - 1L)
    TypeI$nCpG[TypeI$nCpG < 0] <- 0L
    TypeSnpI <- TypeI[grep("^rs", TypeI$Name), ]
    TypeI <- TypeI[-grep("^rs", TypeI$Name), ]

    TypeII <- manifest[
        manifest$Infinium_Design_Type == "II",
        c("Name", "AddressA_ID", "AlleleA_ProbeSeq")]
    names(TypeII)[c(2,3)] <- c("AddressA", "ProbeSeqA")
    TypeII <- as(TypeII, "DataFrame")
    TypeII$ProbeSeqA <- DNAStringSet(TypeII$ProbeSeqA)
    TypeII$nCpG <- as.integer(letterFrequency(TypeII$ProbeSeqA, letters = "R"))
    TypeII$nCpG[TypeII$nCpG < 0] <- 0L
    TypeSnpII <- TypeII[grep("^rs", TypeII$Name), ]
    TypeII <- TypeII[-grep("^rs", TypeII$Name), ]

    if(type == 1) {
        controls <- read.table(
            file = file,
            skip = control.line,
            sep = ",",
            comment.char = "",
            quote = "",
            colClasses = c(rep("character", 5)))[, 1:5]
        TypeControl <- controls[, 1:4]
        names(TypeControl) <- c("Address", "Type", "Color", "ExtendedType")
        TypeControl <- as(TypeControl, "DataFrame")
    } else {
        controls <- read.table(
            file = file,
            skip = control.line,
            sep = ",",
            comment.char = "",
            quote = "",
            colClasses = c(rep("character", 4)))[, 1:4]
        TypeControl <- controls[, 1:4]
        names(TypeControl) <- c("Address", "Type", "Color", "ExtendedType")
        TypeControl$Type  <- toupper(TypeControl$Type)
        TypeControl <- as(TypeControl, "DataFrame")
    }
        
    
    list(
        manifestList = list(
            TypeI = TypeI,
            TypeII = TypeII,
            TypeControl = TypeControl,
            TypeSnpI = TypeSnpI,
            TypeSnpII = TypeSnpII),
        manifest = manifest,
        controls = controls)
}



read.manifest.EPIC <- function(file) {
    # NOTE: As is, requires grep
    control.line <- system(
        sprintf("grep -n \\\\[Controls\\\\] %s", file), intern = TRUE)
    control.line <- as.integer(sub(":.*", "", control.line))
    stopifnot(length(control.line) == 1 &&
                  is.integer(control.line) &&
                  !is.na(control.line))
    assay.line <- system(
        sprintf("grep -n \\\\[Assay\\\\] %s", file), intern = TRUE)
    assay.line <- as.integer(sub(":.*", "", assay.line))
    stopifnot(length(assay.line) == 1 &&
                  is.integer(assay.line) &&
                  !is.na(assay.line))
    colNames <- readLines(file, n = assay.line + 1L)[assay.line + 1L]
    colNames <- strsplit(colNames, ",")[[1]]
    colClasses <- rep("character", length(colNames))
    names(colClasses) <- colNames
    names(colClasses) <- make.names(names(colClasses))
    colClasses[c("MAPINFO")] <- "integer"
    manifest <- read.table(
        file = file,
        header = FALSE,
        col.names = names(colClasses),
        sep = ",",
        comment.char = "",
        quote = "",
        skip = assay.line + 1L,
        colClasses = colClasses,
        nrows = control.line - assay.line - 2L)
    manifest$AddressA_ID <- gsub("^0*", "", manifest$AddressA_ID)
    manifest$AddressB_ID <- gsub("^0*", "", manifest$AddressB_ID)
    TypeI <- manifest[
        manifest$Infinium_Design_Type == "I",
        c("Name", "AddressA_ID", "AddressB_ID", "Color_Channel", "Next_Base",
          "AlleleA_ProbeSeq", "AlleleB_ProbeSeq")]
    names(TypeI)[c(2, 3, 4, 5, 6, 7)] <- c(
        "AddressA", "AddressB", "Color", "NextBase", "ProbeSeqA", "ProbeSeqB")
    TypeI <- as(TypeI, "DataFrame")
    TypeI$ProbeSeqA <- DNAStringSet(TypeI$ProbeSeqA)
    TypeI$ProbeSeqB <- DNAStringSet(TypeI$ProbeSeqB)
    TypeI$NextBase <- DNAStringSet(TypeI$NextBase)
    TypeI$nCpG <- as.integer(
        oligonucleotideFrequency(TypeI$ProbeSeqB, width = 2)[, "CG"] - 1L)
    TypeI$nCpG[TypeI$nCpG < 0] <- 0L
    TypeSnpI <- TypeI[grep("^rs", TypeI$Name), ]
    TypeI <- TypeI[-grep("^rs", TypeI$Name), ]

    TypeII <- manifest[
        manifest$Infinium_Design_Type == "II",
        c("Name", "AddressA_ID", "AlleleA_ProbeSeq")]
    names(TypeII)[c(2,3)] <- c("AddressA", "ProbeSeqA")
    TypeII <- as(TypeII, "DataFrame")
    TypeII$ProbeSeqA <- DNAStringSet(TypeII$ProbeSeqA)
    TypeII$nCpG <- as.integer(letterFrequency(TypeII$ProbeSeqA, letters = "R"))
    TypeII$nCpG[TypeII$nCpG < 0] <- 0L
    TypeSnpII <- TypeII[grep("^rs", TypeII$Name), ]
    TypeII <- TypeII[-grep("^rs", TypeII$Name), ]

    controls <- read.table(
        file = file,
        skip = control.line,
        sep = ",",
        comment.char = "",
        quote = "",
        colClasses = c(rep("character", 5)))[, 1:5]
    TypeControl <- controls[, 1:4]
    names(TypeControl) <- c("Address", "Type", "Color", "ExtendedType")
    TypeControl <- as(TypeControl, "DataFrame")

    list(
        manifestList = list(
            TypeI = TypeI,
            TypeII = TypeII,
            TypeControl = TypeControl,
            TypeSnpI = TypeSnpI,
            TypeSnpII = TypeSnpII),
        manifest = manifest,
        controls = controls)
}

read.manifest.450k <- function(file) {
    # NOTE: As is, requires grep
    control.line <- system(
        sprintf("grep -n \\\\[Controls\\\\] %s", file), intern = TRUE)
    control.line <- as.integer(sub(":.*", "", control.line))
    stopifnot(length(control.line) == 1 &&
                  is.integer(control.line) &&
                  !is.na(control.line))
    assay.line <- system(
        sprintf("grep -n \\\\[Assay\\\\] %s", file), intern = TRUE)
    assay.line <- as.integer(sub(":.*", "", assay.line))
    stopifnot(length(assay.line) == 1 &&
                  is.integer(assay.line) &&
                  !is.na(assay.line))

    # NOTE: Column headers is in line 8, hardcoded
    colNames <- readLines(file, n = assay.line + 1L)[assay.line + 1L]
    colNames <- strsplit(colNames, ",")[[1]]
    colClasses <- rep("character", length(colNames))
    names(colClasses) <- colNames
    colClasses[c("MAPINFO")] <- "integer"
    manifest <- read.table(
        file = file,
        header = TRUE,
        sep = ",",
        comment.char = "",
        quote = "",
        skip = 7,
        colClasses = colClasses,
        nrows = control.line - 9)
    TypeI <- manifest[
        manifest$Infinium_Design_Type == "I",
        c("Name", "AddressA_ID", "AddressB_ID", "Color_Channel", "Next_Base",
          "AlleleA_ProbeSeq", "AlleleB_ProbeSeq")]
    names(TypeI)[c(2, 3, 4, 5, 6 , 7)] <-
        c("AddressA", "AddressB", "Color", "NextBase", "ProbeSeqA", "ProbeSeqB")
    TypeI <- as(TypeI, "DataFrame")
    TypeI$ProbeSeqA <- DNAStringSet(TypeI$ProbeSeqA)
    TypeI$ProbeSeqB <- DNAStringSet(TypeI$ProbeSeqB)
    TypeI$NextBase <- DNAStringSet(TypeI$NextBase)
    TypeI$nCpG <- as.integer(
        oligonucleotideFrequency(TypeI$ProbeSeqB, width = 2)[, "CG"] - 1L)
    TypeI$nCpG[TypeI$nCpG < 0] <- 0L
    TypeSnpI <- TypeI[grep("^rs", TypeI$Name), ]
    TypeI <- TypeI[-grep("^rs", TypeI$Name), ]

    TypeII <- manifest[
        manifest$Infinium_Design_Type == "II",
        c("Name", "AddressA_ID", "AlleleA_ProbeSeq")]
    names(TypeII)[c(2, 3)] <- c("AddressA", "ProbeSeqA")
    TypeII <- as(TypeII, "DataFrame")
    TypeII$ProbeSeqA <- DNAStringSet(TypeII$ProbeSeqA)
    TypeII$nCpG <- as.integer(letterFrequency(TypeII$ProbeSeqA, letters = "R"))
    TypeII$nCpG[TypeII$nCpG < 0] <- 0L
    TypeSnpII <- TypeII[grep("^rs", TypeII$Name), ]
    TypeII <- TypeII[-grep("^rs", TypeII$Name), ]

    controls <- read.table(
        file = file,
        skip = control.line,
        sep = ",",
        comment.char = "",
        quote = "",
        colClasses = c(rep("character", 5)))[, 1:5]
    TypeControl <- controls[, 1:4]
    names(TypeControl) <- c("Address", "Type", "Color", "ExtendedType")
    TypeControl <- as(TypeControl, "DataFrame")

    list(
        manifestList = list(
            TypeI = TypeI,
            TypeII = TypeII,
            TypeControl = TypeControl,
            TypeSnpI = TypeSnpI,
            TypeSnpII = TypeSnpII),
        manifest = manifest,
        controls = controls)
}


read.manifest.27k <- function(file) {
    # NOTE: As is, requires grep
    control.line <- system(
        sprintf("grep -a -n \\\\[Controls\\\\] %s", file), intern = TRUE)
    control.line <- as.integer(sub(":.*", "", control.line))
    assay.line <- system(
        sprintf("grep -a -n \\\\[Assay\\\\] %s", file), intern = TRUE)
    assay.line <- as.integer(sub(":.*", "", assay.line))

    # NOTE: Column headers is in line 8, hardcoded
    colNames <- tail(readLines(file, n = assay.line + 1), n = 1)
    colNames <- strsplit(colNames, ",")[[1]]
    colClasses <- rep("character", length(colNames))
    names(colClasses) = colNames
    colClasses[c("MAPINFO")] <- "integer"
    manifest <- read.table(
        file = file,
        header = TRUE,
        sep = ",",
        comment.char = "",
        quote = "",
        skip = assay.line,
        colClasses = colClasses,
        nrows = control.line - (assay.line + 1),
        fill = TRUE)
    TypeI <- manifest[
        c("Name", "AddressA_ID", "AddressB_ID", "Color_Channel", "Next_Base",
          "AlleleA_ProbeSeq", "AlleleB_ProbeSeq")]
    TypeI <- TypeI[TypeI$Name != "", ]
    names(TypeI)[c(2, 3, 4, 5, 6, 7)] <- c(
        "AddressA", "AddressB", "Color", "NextBase", "ProbeSeqA", "ProbeSeqB")
    TypeI <- as(TypeI, "DataFrame")
    TypeI$ProbeSeqA <- DNAStringSet(TypeI$ProbeSeqA)
    TypeI$ProbeSeqB <- DNAStringSet(TypeI$ProbeSeqB)
    TypeI$NextBase <- DNAStringSet(TypeI$NextBase)
    TypeI$nCpG <- as.integer(
        oligonucleotideFrequency(TypeI$ProbeSeqB, width = 2)[, "CG"] - 1L)
    TypeI$nCpG[TypeI$nCpG < 0] <- 0L

    TypeII <- manifest[
        manifest$Infinium_Design_Type == "II",
        c("Name", "AddressA_ID", "AlleleA_ProbeSeq")]
    names(TypeII)[c(2, 3)] <- c("AddressA", "ProbeSeqA")
    TypeII <- as(TypeII, "DataFrame")
    TypeII$ProbeSeqA <- BStringSet(TypeII$ProbeSeqA)
    TypeII$nCpG <- as.integer(letterFrequency(TypeII$ProbeSeq, letters = "R"))

    controls <- read.table(
        file = file,
        skip = control.line,
        sep = ",",
        comment.char = "",
        quote = "",
        colClasses = c(rep("character", 5)))

    TypeControl <- controls[, 1:4]
    names(TypeControl) <- c("Address", "Type", "Color", "ExtendedType")
    TypeControl <- as(TypeControl, "DataFrame")

    snps <- TypeControl[TypeControl$Type == "Genotyping",]
    TypeControl <- TypeControl[TypeControl$Type != "Genotyping",]
    rsname <- sub("_[AB]", "", snps$ExtendedType)
    snps.sp <- split(snps, rsname)
    snps.sp <- lapply(names(snps.sp), function(rs) {
        snp <- snps.sp[[rs]]
        DataFrame(
            Name = rs,
            AddressA = snp[grep("_A", snp$ExtendedType), "Address"],
            AddressB = snp[grep("_B", snp$ExtendedType), "Address"],
            Color = "Unknown")
    })
    TypeSnpI <- do.call(rbind, snps.sp)
    TypeSnpII <- TypeSnpI[0, ]

    list(manifestList =
             list(TypeI = TypeI,
                  TypeII = TypeII,
                  TypeControl = TypeControl,
                  TypeSnpI = TypeSnpI,
                  TypeSnpII = TypeSnpII),
         manifest = manifest,
         controls = controls)
}

# TODOs ------------------------------------------------------------------------

# TODO: Lots of duplicated code; DRY

Try the minfi package in your browser

Any scripts or data that you put into this service are public.

minfi documentation built on Nov. 8, 2020, 4:53 p.m.