R/Methods-ViralPop.R
In SMITIDstruct: Data Structure and Manipulations Tool for Host and Viral Population

# Part of the SMITIDstruct R package.
# Copyright (C) 2018 Jean-François Rey <jean-francois.rey@inra.fr>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software Foundation, Inc.,i
# 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
#


#' @title loadViralPop
#' @description Load all ViralPop observated in the file.obs
#' @param directory path where is data
#' @param listFiles a dataframe with host ID, time observation and file name (filename.fasta)
#' @param listCol a list of listFiles colomns names ("id", "timeObs", "filename")
#' @param file.extension genotype file extension
#' @return a vector of VirlaPop object
#' @include Class-ViralPop.R
#' @examples
#' \donttest{
#' path = system.file("extdata", "data-simul/", package="SMITIDstruct")
#' files <- list.files(path, pattern = ".*.fasta" ,full.names=FALSE)
#' lfileinfo <- sapply(files,function(x){return(substr(x,1,nchar(x)-6))})
#' splitFiles <- strsplit(lfileinfo, "_");
#' listF <- cbind(data.frame(matrix(unlist(splitFiles),nrow=length(splitFiles), byrow=TRUE),
#'                stringsAsFactors = FALSE), names(splitFiles))
#' colnames(listF) <- c("id", "time", "filename")
#' lvpop <- loadViralPop(path,listF)
#' }
#' @export
loadViralPop <- function(directory, listFiles, listCol = list("id"="id", "timeObs"= "time", "filename" = "filename"), file.extension = "fasta") {
  if(missing(directory)) {
    stop("Directory is missing")
  }
  
  #obs.file <- paste(directory, "/", file.obs,sep="")
  
  #if( !file.exists(obs.file) ) {
  #  stop("can not find ", obs.file)
  #}
    
  #listObs <- utils::read.table(obs.file)
  listObs <- listFiles
  listObs[,listCol$filename] <- paste(directory,"/",listObs[,listCol$filename],sep="")
  
  #viralPopList <- vector()
  
  viralPopList <- as.list(apply(listObs,1, function(x){ loadViralObs(x[listCol$id], x[listCol$timeObs], x[listCol$filename])} ))
  #setNames(viralPopList, as.character(seq(1:length(viralPopList))))
  
  class(viralPopList) <- "ViralPopSet"
  
  return(unname(viralPopList))
  
  # 
  # files <- list.files(directory, file.extension, full.names = TRUE)
  # 
  # if( identical(files,character(0)) ) {
  #   stop(paste("no files",file.extension,"in",directory))
  # }
  # else {
  #   dnaset <- DNAStringSetList()
  #   for(file in files) {
  #     dnaset[[length(dnaset)+1]] <- readDNAStringSet(file)
  #   }
  # }
  # vp <- new("ViralPop", size = length(dnaset), genotypes = dnaset)
  # 
  # return(vp)
}

#' @title loadViralObs
#' @description load a ViralPop object 
#' @param id host pathogen ID
#' @param time time of the observation (numeric or Date)
#' @param file a fasta file
#' @return a new ViralPop object
#' 
#' @export
loadViralObs <- function(id, time, file) {
  if( is.numeric(as.numeric(time))) {time <- as.numeric(time)}
  else{
    if(is.StringDate(time)) time <- getTimestamp(time)
    else warning("loadViralObs can't determine time format")
  }

  obss <- readDNAStringSet(file)
  variants.nb <- length(obss)
  variants.genotypes <- unique(obss)
  variants.names <- sapply(unique(obss), FUN = function(p){ names(obss[p == obss])})
  variants.prop <- as.vector(sapply(unique(obss), FUN = function(p){ sum(p == obss) / length(obss)}))
  
  orderByProp <- order(variants.prop, decreasing=TRUE)
  variants.genotypesOrder <- variants.genotypes[orderByProp]
  variants.namesOrder <- as.list(variants.names[orderByProp])
  variants.propOrder <- variants.prop[orderByProp]
  rm(obss)
  
  return(new("ViralPop", ID = as.character(id), time = time, size = variants.nb, variants.namesOrder, genotypes = variants.genotypesOrder, proportions = variants.propOrder))
  
}

#' @title concatViralPop
#' @description concat several Viral population in one ViralPop object
#' @param lvpop a ViralPop Set
#' @param lid vector of viralpop id to concat
#' @return a ViralPop object with ID concatenation from all IDs and time at 0.
#' @export
concatViralPop <- function(lvpop, lid) {
    
    set=DNAStringSet(c())
    sizeT = 0
    for(v in lid) {
        set <- union(lvpop[[as.numeric(v)]]@genotypes,set)
        sizeT = sizeT + lvpop[[as.numeric(v)]]@size
    }
    
    res.names <- list()
    for( g in 1:length(set)) {
        for(v in lid) {
            index = which(lvpop[[as.numeric(v)]]@genotypes == set[[g]])
            if(length(index)) {
                if( is.null(res.names[g][[1]]) ){ res.names[g] <- list(lvpop[[as.numeric(v)]]@names[[index]]) }
                else{ res.names[g] <- list(c(unlist(res.names[[g]]), lvpop[[as.numeric(v)]]@names[[index]])) }
            }
        }
    }
    
    res.prop <- as.vector(sapply(1:length(set), FUN=function(x){length(res.names[[x]])/sizeT}))
    orderbyProp <- order(res.prop, decreasing = TRUE)
    all.names <- res.names[orderbyProp]
    all.prop <- res.prop[orderbyProp]
    all.set <- set[orderbyProp]
    names(all.set)<-lapply(all.names,'[[',1)
    
    # concat time
    #paste(sapply(lid, FUN=function(x){ return( lvpop[[x]]@time)}),collapse="_")
    
    return(new("ViralPop", ID = paste(lid,collapse="_"), time = 0, size = sizeT, all.names, genotypes = all.set, proportions = all.prop))
}

#' createAViralPop
#' @description Create a new ViralPop object
#' @param host_id host ID which viral pop is observed
#' @param obs_time time of the observation (numeric or date)
#' @param seq a data.frame of sequences ID, sequences and counts
#' @param id_seq column name containing the sequences ID
#' @param seq_value column name containing the sequences
#' @param prop column name containing the count of each sequences
#' @param compact boolean, default FALSE, if TRUE will try group identicals sequences (not implemented yet)
#' @export
createAViralPop <- function(host_id, obs_time, seq, id_seq="seq_ID", seq_value="seq", prop = "prop", compact=FALSE) {
    
    host_id <- as.character(host_id)
    if( !is.na(as.numeric(obs_time)) && is.numeric(as.numeric(obs_time))) {obs_time <- as.numeric(obs_time)}
    else { obs_time <- getTimestamp(obs_time) }
    proportions <- seq[[prop]]
    nb_variants <- sum(proportions)
    proportions <- seq[[prop]] / nb_variants
    names <- lapply(seq[[id_seq]], function(ids){ ids })
    names(names) <- seq[[id_seq]]
    genotypes <- DNAStringSet(seq[[seq_value]])
    names(genotypes) <- seq[[id_seq]]
    
    return(new("ViralPop", ID = host_id, time = obs_time, size = nb_variants, names, genotypes = genotypes, proportions = proportions))
}

#' loadViralPopSet
#' @description load a list of viral populations
#' @param lvpop a viralPopSet (default new one)
#' @param list a list (see details)
#' @details The list have to be on this format:  
#' list$HOST_ID$TIME$list$seq_id
#'                       $seq
#'                       $prop
#' A list indexed by host ID, follow by a list indexed by time (of observation).
#' The last list contains an array of seq_ID (sequence ID), an array of seq (sequence as characters), and an array of the count of seq.
#' example : 
#' $'HOST_42'$'2014-01-01T00:00:00'$seq_ID ["SEQ_1","SEQ_2"]
#' $'HOST_42'$'2014-01-01T00:00:00'$seq ["ACGT","TGCA"]
#' $'HOST_42'$'2014-01-01T00:00:00'$seq_ID ["46","6"]
#' @export
loadViralPopSet <- function(lvpop=list(), list) {
    
    viralPopList <- list()
    viralPop <- as.list(lapply(names(list),
                               function(host){ 
                                    viralPopList <<- append(viralPopList, as.list(lapply(names(list[[host]]), function(time){
                                        createAViralPop(host, time, data.frame(list[[host]][[time]], stringsAsFactors = FALSE ))
                                    })))
    }))
    
    class(viralPopList) <- "ViralPopSet"
    
    return(unname(viralPopList))
    
}


#' @title diversity.pDistance
#' @description diversity calculation using Mean Pairwise Distance
#' @param vpop a ViralPop object
#' @return result
#' 
#' @export
diversity.pDistance <- function(vpop) {
    matDist <- as.matrix(stringDist(vpop@genotypes, method="levenshtein"))
    
    som <- 0
    
    maxSeq <- length(vpop@genotypes)
    
    #for(i in 1:maxSeq) {
    #    for(j in i:maxSeq) {
            #if( i == j ) {
            #    som <- som + ((length(vpop@names[[i]]) * (length(vpop@names[[i]])-1)) /2) * matDist[i,j] #/length(vpop@genotypes[[i]])
            #} else {
    #            som <- som + (length(vpop@names[[i]]) * length(vpop@names[[j]])) * matDist[i,j]#/length(vpop@genotypes[[i]])
            #}
    #    }
    #}
    
    # for all i
    som <- sum(sapply(1:maxSeq, FUN = function(i) {
                                    # for all j from i to max
                                    sum(sapply(i:maxSeq, FUN = function(ind2, ind1){
                                                            length(vpop@names[[ind1]]) * length(vpop@names[[ind2]]) * matDist[ind1,ind2] }, ind1=i ))
                                }))
    
    
    p_distance <- 1/((vpop@size*(vpop@size-1))/2) * (2*som)/length(vpop@genotypes[[1]])
    
    return(p_distance)
}

#' diversity.sfs
#' @description Allele frequency spectrum or Site frequency spectra : the distribution of alternative allele frequencies across all sites of genetic sequences
#' @param vpop a viralPop class
#' @return the site frequency spectra 
#' @export
diversity.sfs <- function(vpop) {
    
    
    ### Test avec Z
    # library(adegenet)
    # library(ape)
    #load("~/Desktop/Jeff.RData")
    #Mat = Z[[3]][[1]]
    #Mat[Mat==1]="A"
    #Mat[Mat==2]="G"
    #Mat[Mat==3]="T"
    #Mat[Mat==4]="C"
    #Matbis=as.DNAbin(Mat)
    #Mat2=DNAbin2genind(Matbis,polyThres=1/1000)
    #Mat3=as.genlight(Mat2@tab)
    #Graph_Mat=glSum(Mat3,alleleAsUnit=TRUE, useC=FALSE)
    #Graph_Mat
    #ac_org <- alleleCount(Matbis, seq.char=c('a','c','g','t'))
    #sum(ac_org == Mat2@tab) == length(ac_org)
    #Graph_Mat_org <- glsum(ac_org)
    #sum(Graph_Mat == Graph_Mat_org) == length(Graph_Mat)
    
    alCount <- alleleCount(as.matrix(vpop@genotypes[])[1:length(vpop@genotypes),])
    
    # sum allele count of all sequence
    glsum <- function(x) {
        res <- integer(ncol(x))
        for(e in 1:nrow(x)){
            temp <- as.integer(x[e,])
            temp[is.na(temp)] <- 0L
            res <- res + temp
        }
    
        names(res) <- colnames(x)
        return(res)
    }
    
    globalAlleleCount <- glsum(alCount)
    
    
    return(globalAlleleCount)
}

#' alleleCount
#' @description count allele at each position
#' @param mat a genomique seq list as matrix by row
#' @param seq.char allele alphabet
#' @return a matrix, each row as a unique seq and col as allele count by position
alleleCount <- function(mat, seq.char =  c("A", "T", "G", "C")) {
    polyThres = 1/1000
    
    if (is.list(mat)) {
        mat <- as.matrix(mat)
    }
    if (is.null(colnames(mat))) {
        colnames(mat) <- 1:ncol(mat)
    }
    
    processLocus <- function(locus, posi) {
        vec <- as.character(locus)
        vec[!vec %in% seq.char] <- NA
        N <- sum(!is.na(vec))
        if (N == 0 || sum(table(vec)/N >= polyThres) < 2) 
            return(NULL)
        alleles <- unique(stats::na.omit(vec))
        out <- sapply(alleles, function(e) 1 * (vec == e))
        colnames(out) <- paste(posi, alleles, sep = ".")
        return(out)
    }
    
    temp <- lapply(1:ncol(mat), function(i) processLocus(mat[, i], 
                                                       i))
    col.names <- unlist(sapply(temp, colnames))
    temp <- as.matrix(data.frame(temp[!sapply(temp, is.null)]))
    if (is.null(temp) || ncol(temp) == 0) {
        cat("\nNo polymorphism detected - returning NULL.\n")
        return(NULL)
    }
    if (class(col.names) == "matrix") 
        col.names <- as.vector(col.names)
    
    colnames(temp) <- col.names
    rownames(temp) <- rownames(mat)
    return(temp)
}
Any scripts or data that you put into this service are public.
SMITIDstruct documentation built on June 14, 2019, 5:04 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
SMITIDstruct
Data Structure and Manipulations Tool for Host and Viral Population

R/Methods-ViralPop.R
In SMITIDstruct: Data Structure and Manipulations Tool for Host and Viral Population

Try the SMITIDstruct package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

SMITIDstruct Data Structure and Manipulations Tool for Host and Viral Population

R/Methods-ViralPop.R In SMITIDstruct: Data Structure and Manipulations Tool for Host and Viral Population

Try the SMITIDstruct package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

SMITIDstruct
Data Structure and Manipulations Tool for Host and Viral Population

R/Methods-ViralPop.R
In SMITIDstruct: Data Structure and Manipulations Tool for Host and Viral Population