R/indelSNP.R

Defines functions indelSNP

Documented in indelSNP

#' Provide effects of each single nucleotide polymorphism (SNP), insertion and deletion in virus genome
#'
#' @description This function is to annotate the mutational events and indicate their potential effects on the proteins.
#' Mutational events include SNP, insertion and deletion.
#'
#' @param nucmer An object called "nucmer", mutation information derived
#' from "nucmer.snp" variant file by "seqkit" software and "nucmer
#' SNP-calling" scripts. To be processed by "indelSNP" function, The nucmer
#' object should be first transformed by "mergeEvents" function.
#' @param saveRda Whether to save the results as ".rda" file.
#' @param refseq SARS-Cov-2 genomic reference sequence.
#' @param gff3 "GFF3" format annotation data for SARS-Cov-2.
#' @param annot Annotation of genes(corresponding proteins) list from "GFF3" file by "setNames(gff3[,10],gff3[,9])".
#' @param outdir The output directory.
#'
#' @return Write the result as ".csv" file to the specified directory.
#' @export
#' @importFrom seqinr read.fasta translate
#' @importFrom utils txtProgressBar setTxtProgressBar write.csv
#' @examples
#' data("nucmer")
#' # Fix IUPAC codes
#' nucmer<-nucmer[!nucmer$qvar%in%c("B","D","H","K","M","N","R","S","V","W","Y"),]
#' nucmer<- mergeEvents(nucmer = nucmer)## This will update the nucmer object
#' data("refseq")
#' data("gff3")
#' annot <- setNames(gff3[,10],gff3[,9])
#' #outdir <- tempdir()
#' nucmer<- indelSNP(nucmer = nucmer,
#'                   saveRda = FALSE,
#'                   refseq = refseq,
#'                   gff3 = gff3,
#'                   annot = annot,
#'                   outdir = NULL)
indelSNP <- function(nucmer = nucmer, saveRda = FALSE, refseq = refseq, gff3 = gff3, annot = annot, outdir = NULL){
  if(is.null(outdir) == FALSE){
    header<-c("sample","refpos","refvar","qvar","qpos","qlength","protein","variant","varclass","annotation")
    results<-matrix(NA,ncol=length(header),nrow=0)
    colnames(results)<-header
    samples<-unique(nucmer$qname)
    pb<-txtProgressBar(0,length(samples),style=3)
    for (pbi in 1:length(samples)){ # This will update the nucmer object
      #add condon list, the stop codon is for "*".

      sample<-samples[pbi]
      allvars<-nucmer[nucmer$qname==sample,]
      # Check changes in query protein sequence according to variants
      for(i in 1:nrow(allvars)){ # Assuming they are sorted numerically
        nucline<-allvars[i,]
        rpos<-nucline[1,"rpos"]
        rvar<-nucline[1,"rvar"]
        qvar<-nucline[1,"qvar"]
        qpos<-nucline[1,"qpos"]
        qlength<-nucline[1,"qlength"]

        # Match over GFF3 annotation   for each var in a sample: match the var with the GFF3
        a<-rpos-gff3[,4]
        b<-rpos-gff3[,5]
        signs<-sign(a)*sign(b)
        w<-which(signs==-1)#if signs == -1 then  rpos in GFF anno

        # Outside genes scenarios
        if(length(w)==0){
          if(rpos<gff3[1,4]){    #rpos in the upstream of all gene
            protein<-"5'UTR";output<-c(rpos,"extragenic")
          } else if(rpos>gff3[nrow(gff3),5]){  #?rpos in the downstream of all gene rpos>gff3[nrow(gff3),5]
            protein<-"3'UTR";output<-c(rpos,"extragenic")
          } else {
            protein<-"intergenic";output<-c(rpos,"extragenic")
          }

        } else{ # Inside genes scenario
          start<-gff3[w,4]
          end<-gff3[w,5]
          protein<-gff3[w,9]


          #refdnaseq<-DNAString(paste0(refseq[start:end],collapse=""))
          refdnaseq <- paste0(refseq[start:end],collapse="")
          #seqm<- toupper(refdnaseq)

          refpepseq<- translate(strsplit(refdnaseq, split= "")[[1]])
          #refpepseq<-Biostrings::translate(refdnaseq) #obtain protein seq
          #refpepseq<-strsplit(as.character(refpepseq),"")[[1]]
          if(qvar=="."){ # Deletion scenario
            if((nchar(rvar)%%3)!=0){ # Deletion frameshift scenario
              mutpos<-round((rpos-start+1)/3)
              output<-c(paste0(refpepseq[mutpos],mutpos),"deletion_frameshift")
            } else { # In-frame deletion
              varseq<-refseq
              varseq<-varseq[-(rpos:(rpos+nchar(rvar)-1))]
              varseq<-varseq[start:(end-nchar(rvar))]
              #vardnaseq<-DNAString(paste0(varseq,collapse=""))
              vardnaseq <- paste0(varseq,collapse="")

              varpepseq<- translate(strsplit(vardnaseq, split= "")[[1]])
              #varpepseq<-Biostrings::translate(vardnaseq)
              #varpepseq<-strsplit(as.character(varpepseq),"")[[1]]

              for(j in 1:length(refpepseq)){
                refj<-refpepseq[j]#
                varj<-varpepseq[j]
                if(refj!=varj){
                  if(varj=="*"){
                    output<-c(paste0(refj,j),"deletion_stop")
                  } else {
                    output<-c(paste0(refj,j),"deletion")
                  }
                  break()
                }
              }
            }
          } else if(rvar=="."){ # Insertion scenario
            if((nchar(qvar)%%3)!=0){ # Insertion frameshift scenario
              mutpos<-round((rpos-start+1)/3)
              output<-c(paste0(refpepseq[mutpos],mutpos),"insertion_frameshift")
            } else { # In-frame insertion
              varseq<-c(refseq[1:rpos],strsplit(qvar,"")[[1]],refseq[(rpos+1):length(refseq)])
              varseq<-varseq[start:(end+nchar(qvar))]
              #vardnaseq<-DNAString(paste0(varseq,collapse=""))

              vardnaseq <- paste0(varseq,collapse="")

              varpepseq<- translate(strsplit(vardnaseq, split= "")[[1]])
              #varpepseq<-Biostrings::translate(vardnaseq)
              #varpepseq<-strsplit(as.character(varpepseq),"")[[1]]

              for(j in 1:length(refpepseq)){
                refj<-refpepseq[j]
                varj<-varpepseq[j]
                if(refj!=varj){
                  nr_aa_inserted<-nchar(qvar)/3
                  multivarj<-varpepseq[j:(j+nr_aa_inserted-1)]
                  if(any(multivarj=="*")){
                    multivarj<-paste0(multivarj,collapse="")
                    output<-c(paste0(multivarj,j),"insertion_stop")
                  } else{
                    multivarj<-paste0(multivarj,collapse="")
                    output<-c(paste0(multivarj,j),"insertion")
                  }
                  break()
                }
              }
            }
          } else { # SNP scenario
            if(nchar(qvar)==1){ # ?Single nucleotide scenario
              varseq<-refseq
              varseq[rpos]<-qvar
              varseq<-varseq[start:end]
              #vardnaseq<-DNAString(paste0(varseq,collapse=""))

              vardnaseq <- paste0(varseq,collapse="")

              varpepseq<- translate(strsplit(vardnaseq, split= "")[[1]])
              #varpepseq<-Biostrings::translate(vardnaseq)
              #varpepseq<-strsplit(as.character(varpepseq),"")[[1]]
              mutpos<-which(varpepseq!=refpepseq)  #split
              if(length(mutpos)==0){ # Silent SNP scenario
                mutpos<-round((rpos-start+1)/3)
                refaa<-refpepseq[mutpos]
                varaa<-varpepseq[mutpos]
                output<-c(paste0(refaa,mutpos,varaa),"SNP_silent")
              } else { # Changed aa scenario
                refaa<-refpepseq[mutpos]
                varaa<-varpepseq[mutpos]
                if(varaa=="*"){
                  output<-c(paste0(refaa,mutpos,varaa),"SNP_stop")
                } else {
                  output<-c(paste0(refaa,mutpos,varaa),"SNP")
                }
              }
            } else { # Multiple neighboring nucleotides
              varlength<-nchar(qvar)
              varseq<-refseq
              varseq[rpos:(rpos+varlength-1)]<-strsplit(qvar,"")[[1]]
              varseq<-varseq[start:end]
              #vardnaseq<-DNAString(paste0(varseq,collapse=""))

              vardnaseq <- paste0(varseq,collapse="")

              varpepseq<- translate(strsplit(vardnaseq, split= "")[[1]])
              #varpepseq<-Biostrings::translate(vardnaseq)
              #varpepseq<-strsplit(as.character(varpepseq),"")[[1]]
              mutpos<-which(varpepseq!=refpepseq)
              if(length(mutpos)==0){ # Silent SNP scenario
                mutpos<-round((rpos-start+1)/3)
                refaa<-refpepseq[mutpos]
                varaa<-varpepseq[mutpos]
                output<-c(paste0(refaa,mutpos,varaa),"SNP_silent")
              } else { # Changed aa scenario
                refaa<-paste0(refpepseq[mutpos],collapse="")
                varaa<-paste0(varpepseq[mutpos],collapse="")
                if(any(varaa=="*")){
                  output<-c(paste0(refaa,mutpos[1],varaa),"SNP_stop")
                } else {
                  output<-c(paste0(refaa,mutpos[1],varaa),"SNP")
                }
              }
            }
          }
        }
        results<-rbind(results,c(sample,rpos,rvar,qvar,qpos,qlength,protein,output,annot[protein]))
      }
      setTxtProgressBar(pb,pbi)
    }
    results<-as.data.frame(results,stringsAsFactors=FALSE)
    return(results)
    csvPath<- file.path(outdir,"covid_annot.csv",sep = "")
    write.csv(results,file=csvPath,row.names = FALSE)
    if(saveRda == TRUE){
      rdaPath <- file.path(outdir,"covid_annot.rda",sep = "")
      save(results,file=rdaPath)
    }
  }

}

Try the CovidMutations package in your browser

Any scripts or data that you put into this service are public.

CovidMutations documentation built on Sept. 18, 2020, 5:06 p.m.