R/utilities.R

##' convert 454 summary file for generateGapFile function
##'
##'
##' @title convert_summary_file
##' @param summary_file summary_file
##' @param out_file out_file
##' @return NULL
##' @importFrom utils read.csv
##' @export
##' @author Guangchuang Yu
convert_summary_file <- function(summary_file, out_file="summary2.csv") {
    ## in summary file, sequence name contain something like CK_GY_4994_2014,
    ## which is use to match read file, fa.txt.
    ## but in 454 runs, read file only contain information like RL18.
    ## this function is to convert the filed,  CK_GY_4994_2014, to RL18.
    sf <- read.csv(summary_file)
    sf[, "name"] <- gsub("(RL\\d+)", "_\\1", sub("_.*", "", sf[, "name"]))
    write.csv(sf, file=out_file)
}


##' @importFrom Biostrings toString
##' @importFrom Biostrings DNAStringSet
identityRatio <- function(aln) {
    fa <- DNAStringSet(aln)
    seqs <- sapply(fa, toString)
    seqs <- sapply(seqs, strsplit, split="")
    sum(seqs[[1]] == seqs[[2]])/ length(seqs[[1]])
}

getIdx <- function(queryID, sourceID) {
    unlist(sapply(queryID, function(x) which(x == sourceID)))
}

pause <- function() {
    ## print("press ENTER to continue...")
    readline()
    invisible()
}

aln2seqDF <- function(aln) {
    ## seqs <- aln$seqs[,2]
    seqs <- sapply(aln, toString)
    seq.df <- do.call("rbind", strsplit(seqs, ""))
    return(seq.df)
}

printFormatSeq <- function(seq, window=80) {
    jj <- seq(1, nchar(seq), window)
    if (jj[length(jj)] < nchar(seq)) {
        jj <- c(jj, nchar(seq))
    }
    for (p in 1:(length(jj)-1)) {
        cat(substring(seq, jj[p], jj[p+1]), "\n")
    }
}


addFooter <- function(file) {
    sink(file, append=TRUE)
    cat("\n")
    cat("## Info\n")
    cat("\nThis Report was generated by `skleid` version", as.character(packageVersion("skleid")),
        ".\nContact [Guangchuang](mailto:gcyu@connect.hku.hk) if you need helps.\n")
    sink()
}


getFiles <- function(path) {
    ff <- list.files(path=path)
    ff <- paste(path, ff, sep="/")
    return(ff)
}

moveUnknownFile <- function(contig.folder) {
    contig <- getFiles(contig.folder)
    sc <- getSampleID(contig)
    idx <- which(sc == contig)
    ufolder <- "uknown"
    if (length(idx) >= 1) {
        moveFile(contig[idx], contig.folder, ufolder)
    }
}

moveFile <- function(files, from, to) {
    if (!file.exists(to))
        dir.create(to)
    for (ff in files) {
        file.rename(ff, to=sub(from, to, ff))
    }
}

moveEmptyFile <- function(folder) {
    ff <- getFiles(folder)
    ii <- which(file.info(ff)$size == 0)
    if (length(ii) > 0) {
        moveFile(ff[ii], folder, "empty")
    }
}

getMixedFileIndex <- function(files) {
    which(sapply(files, isMixed))
}


isMixed <- function(file) {
    sg <- get_sid_gn(file)
    prot <- gsub("[SRL]+\\d+([HNMP][APSB]\\d*)", "\\1", sg)

    prot.cutoff <- c(rep(2000, 4), 1100, rep(3000, 3))
    names(prot.cutoff) <- c("HA", "NA", "MP", "NP", "NS", "PA", "PB1", "PB2")

    res <- file.info(file)$size > prot.cutoff[prot]
    ## NDV that is not exists
    if (is.na(res))
        return(TRUE)
    return(res)
}

getSampleID <- function(contig) {
    sg <- get_sid_gn(contig)
    sc <- gsub("([SRL]+\\d+)[HNMP][APSB]\\d*.*", "\\1", sg)
    return(sc)
}

get_sid_gn <- function(files) {
    sc <- files
    files <- sub(".*/", "", files)
    idx <- grep("[SRL]+\\d+[HNMP][APSB]*\\d*_", files)
    if (length(idx) > 0) {
        sc[idx] <- gsub("([SRL]+\\d+[HNMP][APSB]*\\d*)_.*", replacement="\\1", files[idx])
    }
    sc <- gsub(".*_([SRL]+\\d+[HNMP][APSB]*\\d*).*", replacement="\\1", sc)
    return(sc)
}

moveMixedFile <- function(contig.folder) {
    contig <- getFiles(contig.folder)
    f454 <- contig[grep("_454M.fa[sta]$", contig)]
    sc <- getSampleID(contig)
    idx <- getMixedFileIndex(f454)

    mix <- unique(getSampleID(f454[idx]))

    if (length(mix) >= 1) {
        if (!file.exists("mixed"))
            dir.create("mixed")

        ii <- which(sc %in% mix)
        moveFile(contig[ii], contig.folder, "mixed")
    }

}

getMixedStrain <- function() {
    mix.sc <- NULL
    if (file.exists("mixed")) {
        mix <- getFiles("mixed")
        if (length(mix) > 0)
            mix.sc <- getSampleID(mix)
        if (length(mix.sc) == 0)
            mix.sc <- NULL
    }
    return(mix.sc)
}

moveNDVFile <- function(contig.folder) {
    ## S38-Yellow-CK-JX-10226-2014_S38NDV_454.fas
    ## S38-Yellow-CK-JX-10226-2014_S38NDV_Contigs.fna
    ## S38-Yellow-CK-JX-10226-2014_S38NDV_mira_delX.fasta
    ## S38-Yellow-CK-JX-10226-2014_S38NDV_mira.fasta
    contig <- getFiles(contig.folder)

    idx <- grep(".*/.*[SRL]+\\d+NDV_.*", contig)

    if (length(idx) >= 1) {
        if (!file.exists("NDV"))
            dir.create("NDV")
        moveFile(contig[idx], contig.folder, "NDV")
    }
}


## `%>%` <- function(x, FUN) FUN(x)
utils::globalVariables(".")
GuangchuangYu/skleid documentation built on May 6, 2019, 9:03 p.m.