R/read_file.R

Defines functions getMutationFeatureVector hildaReadMPFile

Documented in getMutationFeatureVector hildaReadMPFile

#' Read the raw mutation data of Mutation Position Format.
#'
#' @description
#' The mutation position format is tab-delimited text file, where
#' the 1st-5th columns shows sample names, chromosome names,
#' coordinates, reference bases (A, C, G, or T) and
#' the alternate bases (A, C, G, or T), respectively. An example is as follows;
#'
#' ---
#'
#' sample1 chr1 100 A C
#'
#' sample1 chr1 200 A T
#'
#' sample1 chr2 100 G T
#'
#' sample2 chr1 300 T C
#'
#' sample3 chr3 400 T C
#'
#' ---
#'
#' Also, this function usually can accept compressed files (e.g., by gzip, bzip2
#' and so on) when using recent version of R.
#'
#' @param infile the path for the input file for the mutation data of Mutation
#' Position Format.
#' @param numBases the number of upstream and downstream flanking bases
#' (including the mutated base) to take into account.
#' @param trDir the index representing whether transcription direction is
#' considered or not.
#' The gene annotation information is given by UCSC knownGene
#' (TxDb.Hsapiens.UCSC.hg19.knownGene object) When trDir is TRUE, the mutations
#' located in intergenic region are excluded from the analysis.
#' @param bs_genome this argument specifies the reference genome (e.g., B
#' Sgenome.Mmusculus.UCSC.mm10 can be used for the mouse genome).
#' See https://bioconductor.org/packages/release/bioc/html/BSgenome.html for the
#' available genome list
#' @param txdb_transcript this argument specified the transcript database
#' (e.g., TxDb.Mmusculus.UCSC.mm10.knownGene can be used for the mouse genome).
#' See https://bioconductor.org/packages/release/bioc/html/AnnotationDbi.html
#' for details.
#' @return The output is an instance of MutationFeatureData S4 class (which
#' stores summarized information on mutation data). This will be typically used
#' as the initial values for the global test and the local test.
#'
#' @importFrom utils read.table
#' @importFrom methods new
#' @examples
#' inputFile <- system.file("extdata/esophageal.mp.txt.gz", package="HiLDA")
#' G <- hildaReadMPFile(inputFile, numBases=5, trDir=TRUE)
#'
#'
#' @export
hildaReadMPFile <- function(infile, numBases = 3, trDir = FALSE,
                            bs_genome = NULL, txdb_transcript = NULL) {

    if (is.null(bs_genome) == TRUE) {
      bs_genome <- BSgenome.Hsapiens.UCSC.hg19::BSgenome.Hsapiens.UCSC.hg19
    }

    if (is.null(txdb_transcript) == TRUE) {
      txdb_transcript <-
        TxDb.Hsapiens.UCSC.hg19.knownGene::TxDb.Hsapiens.UCSC.hg19.knownGene
    }

    fdim <- c(6, rep(4, numBases - 1), rep(2, as.integer(trDir)))
    type <- "independent"

    if (numBases %% 2 != 1) {
      stop("numBases should be odd numbers")
    }
    centerInd <- (numBases + 1) / 2

    mutFile <- read.table(infile, sep="\t", header=FALSE,
                          stringsAsFactors = FALSE)

    chrInfo <- mutFile[,2]
    posInfo <- mutFile[,3]
    ref_base <- Biostrings::DNAStringSet(mutFile[,4])
    alt_base <- Biostrings::DNAStringSet(mutFile[,5])
    sampleName_str <- as.character(mutFile[,1])

    gr <- GenomicRanges::makeGRangesFromDataFrame(data.frame(chr = chrInfo,
                                                             start = posInfo,
                                                             end = posInfo),
                                                  ignore.strand = TRUE)

    ranges <- GenomicRanges::resize(gr, numBases, fix = "center")
    context <- Biostrings::getSeq(bs_genome, ranges)


    ## check the consistency between the input reference base and the obtained
    ## base using hg19 reference genome.
    removeInd <- which(XVector::subseq(context, start = centerInd,
                                       end = centerInd) != ref_base)
    if (length(removeInd) > 0) {
      warning("The central bases are inconsistent in ", length(removeInd), 
              " mutations. We have removed them.")
      
      
      for(varname in c("context", "ref_base", "alt_base", 
                      "sampleName_str", "chrInfo", "posInfo")) {
        assign(varname, get(varname)[-removeInd])
      }
      
    }

    # check the characters on alternative base
    alphabetFreq <- Biostrings::alphabetFrequency(alt_base)
    removeInd <- which(rowSums(alphabetFreq[,seq_len(4)]) != 1)
    if (length(removeInd) > 0) {
      warning("The characters other than (A, C, G, T) are included in
               alternate bases of ", length(removeInd),
               " mutations. We have removed them.")
      
      for(varname in c("context", "ref_base", "alt_base", 
                       "sampleName_str", "chrInfo", "posInfo")) {
        assign(varname, get(varname)[-removeInd])
      }
    }

    # check the characters on flanking bases
    alphabetFreq <- Biostrings::alphabetFrequency(context)
    removeInd <- which(alphabetFreq[,"A"] + alphabetFreq[,"C"] +
                         alphabetFreq[,"G"] + alphabetFreq[,"T"] != numBases)
    if (length(removeInd) > 0) {
      lapply(list(context, ref_base, alt_base, 
                  sampleName_str, chrInfo, posInfo), 
             function(obj) obj <- obj[-removeInd])
      warning("The characters other than (A, C, G, T) are included in
                flanking bases of ", length(removeInd),
               " mutations. We have removed them.")
    }

    # check the characters on alternative base
    alphabetFreq <- Biostrings::alphabetFrequency(alt_base)
    removeInd <- which(ref_base == alt_base)
    if (length(removeInd) > 0) {
      warning("The reference base and alternative bases are equal for ",
              length(removeInd), " mutations. We have removed them.")
      
      for(varname in c("context", "ref_base", "alt_base", 
                       "sampleName_str", "chrInfo", "posInfo")) {
        assign(varname, get(varname)[-removeInd])
      }
    }


    revCompInd <- which(as.character(XVector::subseq(context,
                                                     start = centerInd,
                                                     end = centerInd)) %in%
                          c("A", "G"))
    context[revCompInd] <- Biostrings::reverseComplement(context[revCompInd])
    ref_base[revCompInd] <- Biostrings::reverseComplement(ref_base[revCompInd])
    alt_base[revCompInd] <- Biostrings::reverseComplement(alt_base[revCompInd])

    # Obtaining transcription strand information using GenomicRanges packages
    if (trDir == TRUE) {
      gr <- GenomicRanges::makeGRangesFromDataFrame(data.frame(chr = chrInfo,
                                                               start = posInfo,
                                                               end = posInfo),
                                                    ignore.strand = TRUE)
      txdb <- txdb_transcript

      txdb_bed <- GenomicFeatures::transcripts(txdb)
      gr_txdb <- GenomicRanges::findOverlaps(gr, txdb_bed,
                                             ignore.strand = FALSE)
      gr_strand <- cbind(
        S4Vectors::queryHits(gr_txdb),
        as.character(S4Vectors::as.factor(
          BiocGenerics::strand(txdb_bed[S4Vectors::subjectHits(gr_txdb)]))))
      ugr_strand <- unique(gr_strand[gr_strand[, 2] != "*" ,], MARGIN=1)

      rmdup_ugr_strand <- ugr_strand[!duplicated(ugr_strand[, 1]), ]
      txdb_plus_gr_ind <- as.integer(
        rmdup_ugr_strand[rmdup_ugr_strand[, 2] == "+", 1])
      txdb_minus_gr_ind <-
        as.integer(rmdup_ugr_strand[rmdup_ugr_strand[, 2] == "-", 1])

      strandInfo <- rep("*", length(gr))
      strandInfo[setdiff(txdb_plus_gr_ind, revCompInd)] <- "+"
      strandInfo[intersect(txdb_plus_gr_ind, revCompInd)] <- "-"
      strandInfo[setdiff(txdb_minus_gr_ind, revCompInd)] <- "-"
      strandInfo[intersect(txdb_minus_gr_ind, revCompInd)] <- "+"

      warning("Out of ", length(context), " mutations, we could obtain",
              "transcription direction information for ",
              length(txdb_plus_gr_ind) + length(txdb_minus_gr_ind),
              " mutation. Other mutations are removed.")
      
      context <- context[strandInfo != "*"]
      ref_base <- ref_base[strandInfo != "*"]
      alt_base <- alt_base[strandInfo != "*"]
      sampleName_str <- sampleName_str[strandInfo != "*"]
      chrInfo <- chrInfo[strandInfo != "*"]
      posInfo <- posInfo[strandInfo != "*"]
      strandInfo <- strandInfo[strandInfo != "*"]
    }

    if (trDir == FALSE) {
      strandInfo <- NULL
    }

    mutFeatures <- getMutationFeatureVector(context, ref_base, alt_base,
                                            strandInfo, numBases, type)

    suSampleStr <- sort(unique(sampleName_str))
    lookupSampleInd <- seq_len(length(suSampleStr))
    names(lookupSampleInd) <- suSampleStr
    sampleIDs <- lookupSampleInd[sampleName_str]


    featStr <- apply(mutFeatures, 1, paste0, collapse=",")

    suFeatStr <- sort(unique(featStr))
    lookupFeatInd <- seq_len(length(suFeatStr))
    names(lookupFeatInd) <- suFeatStr

    rawCount <- data.frame(sample = sampleIDs, mutInds = lookupFeatInd[featStr])

    tableCount <- table(rawCount)
    w <- which(tableCount > 0, arr.ind=TRUE)
    procCount <- cbind(w[,2], w[,1], tableCount[w])


    if (length(fdim) == 1) {
      mutFeatList <- matrix(as.integer(suFeatStr), length(suFeatStr), 1)
    } else {
      mutFeatList <- t(vapply(suFeatStr,
                              function(x) as.numeric(unlist(strsplit(x, ","))),
                              numeric(numBases + as.integer(trDir))))
    }

    rownames(mutFeatList) <- NULL
    rownames(procCount) <- NULL

    if (trDir == FALSE) {
      strandInfo_for_class <- rep(NA, length(chrInfo))
    } else {
      strandInfo_for_class <- strandInfo
    }

    return(methods::new(Class = "MutationFeatureData",
               type = type,
               flankingBasesNum = as.integer(numBases),
               transcriptionDirection = trDir,
               possibleFeatures = as.integer(fdim),
               featureVectorList = t(mutFeatList),
               sampleList = suSampleStr,
               countData = t(procCount),
               mutationPosition =
                 data.frame(chr = chrInfo, pos = posInfo,
                            ref = ref_base, alt = alt_base,
                            strand = strandInfo_for_class,
                            context = context,
                            sampleID = unname(sampleIDs),
                            mutID = unname(lookupFeatInd[featStr]),
                            stringsAsFactors = FALSE)
    )
    )

  }






#' Get mutation feature vector from context sequence data and reference and
#' alternate allele information
#'
#' @param context the context sequence data around the mutated position. This
#' shoud be Biostrings::DNAStringSet class
#' @param ref_base the reference bases at the mutated position.
#' @param alt_base the alternate bases at the mutated position.
#' @param strandInfo transcribed strand information at the mutated position.
#'   (this is optional)
#' @param numBases the number of flanking bases around the mutated position.
#' @param type the type of mutation feature vecotr (should be "independent" or
#' "full").
#' @return a mutation featuer vector
#'
#'
#'
getMutationFeatureVector <- function(context, ref_base, alt_base,
                                     strandInfo = NULL, numBases, type) {

    trDir <- !is.null(strandInfo)
    if (type == "independent") {
      fdim <- c(6, rep(4, numBases - 1), rep(2, as.integer(trDir)))
    } else if (type == "full") {
      fdim <- c(6 * 4^(numBases - 1) * 2^(as.integer(trDir)))
    } else {
      stop('the type argument has to be "independent" or "full"')
    }

    if (numBases %% 2 != 1) {
      stop("numBases should be odd numbers")
    }
    centerInd <- (numBases + 1) / 2

    if (type == "independent") {

      mutFeatures <- matrix(0, length(ref_base), length(fdim))

      centralBaseList <- data.frame(ref_base=rep(c("C", "T"), each=3),
                                    alt_base=c("A", "G", "T", "A", "C", "G"),
                                    stringsAsFactors = FALSE)

    ## rewrite the part from pmsignature

    for (i in seq_len(6)) {
      mutFeatures[which(ref_base == centralBaseList$ref_base[i] &
                        alt_base == centralBaseList$alt_base[i]), 1] <- i
    }

    columnInd <- 2
    for (baseInd in seq_len(numBases)) {
      if (baseInd == centerInd) {
        next
      }

      baseList <- c("A", "C", "G", "T")

      ## rewrite the part from pmsignature
      for (base in seq_along(baseList)) {
        mutFeatures[which(XVector::subseq(context,
                                          start=baseInd,
                                          end=baseInd) == baseList[base]),
                    columnInd] <- base
      }

      columnInd <- columnInd + 1
    }

    if (trDir ==TRUE) {
      mutFeatures[which(strandInfo == "+"), length(fdim)] <- 1
      mutFeatures[which(strandInfo == "-"), length(fdim)] <- 2
    }

    } else {

    mutFeatures <- matrix(1, length(ref_base), length(fdim))

    tempDigits <- 1
    for (i in seq_len((numBases - 1) / 2)) {

      baseInd <- numBases + 1 - i

      mutFeaturesList <- c("C", "G", "T")
      
      for(i in mutFeaturesList) {
        mutFeatures[which(XVector::subseq(context, start=baseInd, end=baseInd)
                          == i), 1] <-
          mutFeatures[which(XVector::subseq(context, start=baseInd, end=baseInd)
                            == i), 1] + tempDigits * which(mutFeaturesList == i)
      }
      

      tempDigits <- tempDigits * 4
      baseInd <- i;
      
      for(i in mutFeaturesList) {
        mutFeatures[which(XVector::subseq(context, start=baseInd, end=baseInd)
                          == i), 1] <-
          mutFeatures[which(XVector::subseq(context, tart=baseInd, end=baseInd)
                            == i), 1] + tempDigits * which(mutFeaturesList == i)
      }

      tempDigits <- tempDigits * 4
    }

    
    mutFeaturesMat <- cbind(c("C", "C", "T", "T", "T"),
                            c("G", "T", "A", "C", "G"))
    
    
    for(i in seq_len(nrow(mutFeaturesMat))) {
      
      mutFea1 <- mutFeaturesMat[i, 1]
      mutFea2 <- mutFeaturesMat[i, 2]
      
      
      mutFeatures[which(ref_base == mutFea1 & alt_base == mutFea2), 1] <-
        mutFeatures[which(ref_base == mutFea1 & alt_base == mutFea2), 1] +
        tempDigits * i
      
    }


    if (trDir ==TRUE) {
      tempDigits <- tempDigits * 6
      mutFeatures[which(strandInfo == "-"), 1] <-
        mutFeatures[which(strandInfo == "-"), 1] + tempDigits * 1
    }

  }

  return(mutFeatures)

}
USCbiostats/HiLDA documentation built on Oct. 15, 2021, 10:22 a.m.