R/convert_polyorigin.R

Defines functions convert_polyorigin

Documented in convert_polyorigin

#' Create diaQTL input files from PolyOrigin output
#' 
#' Create diaQTL input files from PolyOrigin output
#' 
#' Creates the pedigree (diaQTL_pedfile.csv) and genotype (diaQTL_genofile.csv) input files needed for \code{\link{read_data}} from the polyancestry output file generated by the PolyOrigin software. PolyOrigin outputs a genetic map in cM. To add a physical map in bp, use the option \code{mapfile}. The input file needed for \code{\link{phased_parents}} (diaQTL_parents.csv) is also created.
#'
#' @param filename Name of polyancestry file 
#' @param mapfile Optional name of CSV file containing the physical map (marker, chrom, bp)
#' @param remove.outliers Should offspring flagged as outliers be removed (default is TRUE)
#' @param outstem prefix for output filenames
#' 
#' @return NULL
#'  
#' @export
#' @importFrom utils read.csv write.csv

convert_polyorigin <- function(filename,mapfile=NULL,remove.outliers=TRUE,outstem="") {
  
  if (!is.null(mapfile)) {
    map <- read.csv(mapfile,as.is=T)[,1:3]
    colnames(map) <- c("marker","chrom","bp")
  } else {
    print("diaQTL_parents not generated because no physical mapfile.")
  }
  con <- file(filename,"r")
  temp <- readLines(con)
  ix <- grep("PolyOrigin",temp)
  
  k <- grep("offspringinfo",temp[ix])
  x <- strsplit(temp[c((ix[k]+2):(ix[k+1]-1))],split=",")
  ped <- data.frame(id=sapply(x,function(x){x[1]}),pop=sapply(x,function(x){x[2]}),stringsAsFactors = F)
  outliers <- as.logical(toupper(sapply(x,function(x){x[4]})))
  if (remove.outliers) {
    outlier.id <- ped$id[which(outliers)]
    ped <- ped[!outliers,]
  }
  
  k <- grep("ancestralgenotype",temp[ix])
  x <- strsplit(temp[c((ix[k]+2):(ix[k+1]-1))],split=",")
  parents <- apply(cbind(sapply(x,function(x){x[1]}),sapply(x,function(x){x[3]})),1,paste,collapse="|")
  parents <- unique(parents)
  y <- strsplit(parents,split="|",fixed=T)
  parents <- data.frame(pop=sapply(y,function(y){y[1]}),parent1=sapply(y,function(y){y[2]}),parent2=sapply(y,function(y){y[length(y)]}),stringsAsFactors = F)
  ped <- merge(ped,parents)
  write.csv(ped[,2:4],file=paste(outstem,"diaQTL_pedfile.csv",sep=""),row.names=F)
  
  k <- grep("parentgeno",temp[ix])
  header <- setdiff(strsplit(temp[ix[k]+1],split=",",fixed=T)[[1]],"")
  parents <- header[-(1:3)]
  header[1:3] <- c("marker","chrom","cM")
  x <- strsplit(temp[c((ix[k]+2):(ix[k+1]-1))],split=",")
  m <- length(x)
  pp <- matrix("",nrow=m,ncol=length(header))
  for (i in 1:3) {
    pp[,i] <- sapply(x,function(z){z[i]})
  }
  for (i in 4:length(header)) {
    v <- sapply(x,function(z){z[i]})
    v <- gsub("1","0",v)
    pp[,i] <- gsub("2","1",v)
  }
  colnames(pp) <- header
  if (!is.null(mapfile)) {
    pp2 <- merge(data.frame(pp,check.names = FALSE),map)
    pp <- pp2[match(pp[,"marker"],pp2$marker),]
    pp <- pp[,c(header[1:3],"bp",parents)]
    write.csv(pp,paste(outstem,"diaQTL_parents.csv",sep=""),row.names=F)
  }
  
  k <- grep("genoprob",temp[ix])
  id <- temp[ix[k]+1]
  id <- strsplit(id,split=",",fixed=T)[[1]][-(1:3)]
  keep <- 1:length(id)
  if (remove.outliers & any(outliers)) {
    keep <- setdiff(keep,match(outlier.id,id))
  } 
  n <- length(keep)
  genoprob <- matrix("",nrow=m,ncol=n+1)
  colnames(genoprob) <- c("marker",id[keep])
  for (j in 1:m) {
    i <- ix[k]+1+j
    x <- strsplit(temp[i],split=",",fixed=T)[[1]]
    genoprob[j,] <- x[c(1,keep+3)]
  }
  if (!is.null(mapfile)) {
    write.csv(cbind(pp[,1:4],genoprob[match(pp[,1],genoprob[,1]),-1]),paste(outstem,"diaQTL_genofile.csv",sep=""),row.names=F)
  } else {
    write.csv(cbind(pp[,1:3],genoprob[match(pp[,1],genoprob[,1]),-1]),paste(outstem,"diaQTL_genofile.csv",sep=""),row.names=F)
  }
  close(con)
  return(NULL)
}
jendelman/diaQTL documentation built on Jan. 27, 2024, 6:39 a.m.