R/import.msf.R

import.msf <- function (file, aa.to.upper = TRUE, gap.to.dash = TRUE,log.file=NULL) {
  
  if(missing(file)) {
    stop("file is missing")
  }

  # Read as a vector of lines
  lines <- readLines(file)

  # Check msf format
  check1 <- grep("MSF:.*Type:.*Check:.*", lines)
  check2 <- grep("Name:.*Len:.*Check:.*Weight:.*", lines)
  limit <- grep("^//", lines)
  if (length(check1) == 0 || length(check2) == 0 || length(limit) == 0) {
	if(!is.null(log.file))
		write("file is not in msf format",log.file)  
        	
	stop("file is not in msf format")
	}

  # Get sequence identifiers from header
  id.head <- sub("^\\s*Name:\\s+(\\S+).*$","\\1", lines[check2])
  nb.seq <- length(id.head)

  # Check duplicated identifiers in header
  if(any(duplicated(id.head))){
	if(!is.null(log.file))
		write("duplicated identifiers in header",log.file)  
   
    stop("duplicated identifiers in header")
  }

  # Get sequence identifiers from alignment
  align <- grep("^\\s*\\S+\\s+[^1-9]+$", lines[limit:length(lines)], value = TRUE)
  id.align <- sub("^\\s*(\\S+)\\s+[^1-9]+$", "\\1", align)

  # Localize sequence pieces for each identifier
  loc <- lapply(seq_len(nb.seq), function(i) {which(id.align == id.head[i])})

  # Paste and clean sequences
  seq <- sapply(loc, function(i) {paste(sub("^\\s*\\S+\\s+([^1-9]+)$", "\\1", align[i]), collapse = "")})
  seq <- gsub("\\s", "", seq)
  
  # Turn aa into upper case
  if (aa.to.upper)
    seq <- toupper(seq)

  # Give a list of split sequences
  seq <- strsplit(seq, split = "")
  names(seq) <- id.head

  # Turn gap into dash character
  if (gap.to.dash)
    seq <- lapply(seq, function (i) {i[is.gap(i)] <- "-"; return(i)})
  class (seq) <- c("align")
  return(seq)
}

Try the Bios2cor package in your browser

Any scripts or data that you put into this service are public.

Bios2cor documentation built on July 8, 2022, 5:05 p.m.