R/reader.R

Defines functions find.id.col force.frame vec.extract.mat force.vec column.salvage reader classify.ext shift.rownames assess.dat.type get.ext rmv.ext is.ch as.df cat.path dir.force.slash parse.args file.nrow wc.windows file.ncol n.readLines get.delim find.file is.file add.dir.if.not conv.fixed.width

Documented in add.dir.if.not as.df assess.dat.type cat.path classify.ext column.salvage conv.fixed.width dir.force.slash file.ncol file.nrow find.file find.id.col force.frame force.vec get.delim get.ext is.ch is.file n.readLines parse.args reader rmv.ext shift.rownames vec.extract.mat wc.windows

###NAMESPACE ADDITIONS###
# Depends: R (>= 2.10), utils, NCmisc
# Imports: grDevices, graphics, stats, methods
# Suggests:
# importFrom(stats, rnorm)
# importFrom(graphics, hist, plot)
# importFrom(grDevices, colors) 
# import(utils, NCmisc)
# importFrom(methods, is)
# importFrom(stats, qnorm, sd)
###END NAMESPACE###

#' Find which column in a dataframe contains a specified set of values.
#' 
#' Starting with a list of ids, each column is searched. The column with
#'  the highest non-zero percentage matching is assumed to correspond
#'  to the id list. The search terminates early if a perfect match is 
#'  found. Useful for assembling annotation from multiple sources.
#'
#' @param frame a data.frame, or similarly 2 dimensional object which 
#'  might contain ids
#' @param ids a vector of IDs/value that might be found in at least 
#'  1 column of frame
#' @param ret specify what should be returned, see values
#' @return ret can specify a list returning, 'col': the column number 
#'  (col=0 for rownames) with the best match; 'maxpc': the percentage
#'  of ids found in the best matching column; 'index': the matching vector
#'  that maps the frame rows onto ids; 'results': the (sub)set of ids
#'  found in frame. NAs given for ids not found
#' @export 
#' @author Nicholas Cooper \email{nick.cooper@@cimr.cam.ac.uk}
#' @examples
#' new.frame <- data.frame(day=c("M","T","W"),time=c(9,12,3),staff=c("Mary","Jane","John"))
#' staff.ids <- c("Mark","Jane","John","Andrew","Sally","Mary")
#' new.frame; staff.ids; find.id.col(new.frame,staff.ids)
find.id.col <- function(frame,ids,ret=c("col","maxpc","index","result"))
{
  # looks across each column of a dataframe 'frame' and selects the
  # column that has the most matches to the vector 'ids'
  # can return any/all of:
  #   the number of best match column, the match %, index numbers, matches
  opts <- c("col","maxpc","index","result")
  if(is.null(dim(frame))) { warning("object 'frame' has no dimensions, not suitable for find.id.col()"); return(NULL) }
  targ.cols <- ncol(frame)+1 # +1 if for looking at 'rownames'
  if(targ.cols==1) { warning("data.frame had no columns"); return(NULL) }
  pcs <- numeric(targ.cols)
  num.ids <- length(ids); if(num.ids <1) { warning("at least 1 ID must be entered") ; return(NULL) }
  ids <- paste(ids)
  if(!is.character(ids)) { stop("ids must be coercible to character type") }
  coln <- 1; best <- NA
  for (cc in 1:targ.cols)
  {
    if(cc==targ.cols)
    {
      # last 'cc' is actually to test the rownames of the frame
      if(!is.null(rownames(frame))) { 
        if(all(rownames(frame)==paste(1:nrow(frame)))) {
          posit <- rep(NA,length(ids)) # force NAs if rownames are just column number
          #  warning("rownames were just column numbers so NA match returned")
        } else {
          posit <- match(ids,rownames(frame)) 
        }
      } else { break }
    } else {
      posit <- match(ids,frame[,cc])
    }
    pcs[cc] <- length(which(!is.na(posit)))/num.ids
    if(pcs[cc]>max(pcs[-cc])) { best <- posit ; coln <- cc }
    if(pcs[cc]==1) { break } # exit if found a perfect match
  }
  maxpc <- max(pcs)
  if(coln==targ.cols) {
    result <- rownames(frame)[best]
    coln <- 0
  } else {
    result <- frame[best,coln]
  }
  out <- list(coln,maxpc,best,result)
  names(out) <- opts
  for (cc in length(opts):1)
  {
    if (!(opts[cc] %in% ret)) { out[[cc]] <- NULL }
  }
  return(out)
}


#' returns a dataframe if 'unknown.data' can in anyway relate to such:
#'
#' it can be:
#' - dataframe, matrix, big.matrix, sub.big.matrix, big.matrix.descriptor,
#' a bigmatrix description file, an RData file containing one of these 
#' objects, the name of a text or RData file, a named vector (names 
#' become rownames), or a list containing a matrix or dataframe. 
#' Using this within functions allows flexibility in 
#' specification of a datasource
#'
#' @param unknown.data something that is or can refer to a 2d dataset
#' @param too.big max size in GB, to prevent unintended conversion to 
#'  matrix of a very large big.matrix object.
#' @return returns a data.frame regardless of the original object type
#' @export 
#' @author Nicholas Cooper \email{nick.cooper@@cimr.cam.ac.uk}
#' @seealso \code{\link{force.vec}}
#' @examples
#' # create a matrix, binary file, text file, big.matrix.descriptor
#' orig.dir <- getwd(); setwd(tempdir()); # move to temporary dir
#' test.files <- c("temp.rda","temp.txt")
#' mymat <- matrix(rnorm(100),nrow=10)
#' # not run yet # require(bigmemory)
#' save(mymat,file=test.files[1])
#' write.table(mymat,file=test.files[2],col.names=FALSE,row.names=FALSE)
#' test.frames <- list(mymat = mymat,
#'  myrda = test.files[1], mytxt = test.files[2] )
#'  # not run yet #: ,mybig = describe(as.big.matrix(mymat)) )
#' sapply(sapply(test.frames,is),"[",1)
#' # run the function on each, reporting specs of the object returned
#' for (cc in 1:length(test.frames)) {
#'   the.frame <- force.frame(test.frames[[cc]])
#'   cat(names(test.frames)[cc],": dim() => ",
#'       paste(dim(the.frame),collapse=","),
#'       "; is() => ",is(the.frame)[1],"\n",sep="")
#' }
#' unlink(test.files)
#' setwd(orig.dir) # reset working dir to original
force.frame <- function(unknown.data,too.big=10^7)
{
  # returns a dataframe if 'unknown.data' can in anyway relate to such:
  # it can be:
  # - dataframe, matrix, big.matrix, sub.big.matrix, big.matrix.descriptor,
  # a bigmatrix description file, an RData file containing one of these objects,
  # the name of a text or RData file, a named vector (names become rownames),
  # or a list containing a matrix or dataframe. Using this in functions allows
  # flexibility in specification of a datasource
  uis <- is(unknown.data)
  if(uis[1]=="character" & length(unknown.data)==1)
  {
    # filename?
    if(file.exists(unknown.data)) { 
      out.data <- reader(unknown.data)
    } else {
      stop("Error: argument seemed to be a filename (char, length 1) but file did not exist")
    }
  } else {
    if(uis[1] %in% c("matrix","data.frame")) {
      out.data <- unknown.data
    } else {
      if(uis[1]=="list") {
        types <- sapply(unknown.data,is)
        wty <- which(types %in% c("matrix","data.frame"))[1]
        if(length(wty)==1) {
          out.data <- unknown.data[[wty]]
        } else {
          stop("Error: object was list and no elements were a matrix or data.frame")
        }
      } else {
        if(is.null(dim(unknown.data)) & !is.null(names(unknown.data)))
        {
          cat(" converting named vector into dataframe\n")
          out.data <- as.matrix(unknown.data)
        } else {
          if(length(grep("big.matrix",uis))>0) {
            if(!exists("get.big.matrix",mode="function")) { 
              warning("'bigpc' package missing can't read big.matrix"); return(NULL) }
            unknown.data <- do.call("get.big.matrix",list(bigMat=unknown.data)) # once bigpc  is submitted replace with actual fn
            if(estimate.memory(unknown.data) <= too.big) {
              cat(" converting big.matrix object into a dataframe\n")
              out.data <- as.matrix(unknown.data) 
            } else {
              stop(paste("Error: big matrix object was too big to convert to a dataframe [>",
                         too.big,"cells]\n"))
            }
          } else {
            warning(paste("trying to convert object type",uis[1],"into a dataframe"),
                    " - result may be unpredictable")
            out.data <- as.df(unknown.data)
            #print(head(out.data))
            return(out.data)
          }
        }
      }
    }
  }
  return(as.df(out.data))
}


#' Internal function to extract the best guess at an ID vector from
#' a dataframe, without knowing any ids (set most.unique=FALSE to select col 1)
#' default to select the most unique column from the first 100 cols
vec.extract.mat <- function(X,most.unique=TRUE,max.col=100) {
  num.unique <- function(X) { length(unique(X)) }
  if(!is.null(rownames(X))) {
    if(!all(rownames(X)==paste(1:nrow(X)))) {
      return(rownames(X))
    }
  } 
  if(most.unique) {
    unz <- sapply(X[1:min(max.col,ncol(X))],num.unique)
    index <- which(unz==max(unz,na.rm=TRUE))[1]
    if(length(index)==0) { index <- 1 }
  } else {
    index <- 1
  }
  return(X[,index])
}



#' returns a vector if 'unknown.data' can in anyway relate to such:
#'
#' if the name of a file with a vector or vector, then reads the file,
#' if a matrix or dataframe, then preferentially return rownames, 
#'  otherwise return first column - designed to search for IDs.
#' Using this within functions allows flexibility in 
#' the specification of a datasource for vectors
#'
#' @param unknown.data something that is or can refer to a 2d dataset
#' @param most.unique if TRUE, select most unique column if a unknown.data
#'  is a matrix, else select the first column
#' @param dir if unknown.data is a file name, specifies directory(s) to
#'  look for the file
#' @param warn whether to display a warning if unknown.data is a matrix
#' @return returns a vector regardless of the original object type
#' @export 
#' @author Nicholas Cooper \email{nick.cooper@@cimr.cam.ac.uk}
#' @seealso \code{\link{force.frame}}
#' @examples
#' # create a matrix, binary file, and simple vector
#' my.ids <- paste("ID",1:4,sep="")
#' my.dat <- sample(2,4,replace=TRUE)
#' test.files <- c("temp.rda")
#' mymat <- cbind(my.ids,my.dat)
#' save(mymat,file=test.files[1])
#' test.vecs <- list(myvec = my.ids,
#'  myrda = test.files[1],mymat=mymat)
#' # show dimensions of each test object
#' sapply(test.vecs,function(x) {  if(is.null(dim(x))){ length(x)} else {dim(x)}})
#' # run the function on each, reporting specs of the object returned
#' for (cc in 1:3) {
#'   the.vec <- force.vec(test.vecs[[cc]])
#'   cat(names(test.vecs)[cc],": length() => ",
#'       length(the.vec),"; is() => ",is(the.vec)[1],"\n",sep="")
#' }
#' unlink(test.files)
force.vec <- function(unknown.data,most.unique=TRUE,dir=NULL, warn=FALSE)
{
  # returns a vector if 'unknown.data' can in anyway relate to such:
  # if the name of a file with a vector or vector, then returns that
  # if a matrix or dataframe, then preferentially return rownames, otherwise
  # return first column
  # designed to search for IDs basically
  # 
  uis <- is(unknown.data)
  if(is.null(dim(unknown.data))) {
    out.data <- unknown.data
    if(!"vector" %in% uis) {
      warning("unknown dimensionless non-vector datatype!")
    } else {
      if(uis[1]=="character" & length(unknown.data)==1) {
        if(is.file(unknown.data,dir)) { 
          out.data <- reader(find.file(unknown.data,dir))
          #print(head(out.data))
          out.data <- force.vec(out.data)
        } else {
          warning("argument length 1 might be a filename (char, length 1) but file did not exist")
          cat("arg: \n"); print(unknown.data)
        }
      } 
    }
    if("list" %in% uis) { out.data <- unlist(out.data) }
  } else {
    out.data <- force.frame(unknown.data)
    if(warn) {
      warning("input seems to be dim>1, rather than a vector, using 'vec.extract.mat' to find best vector")
    }
    out.data <- vec.extract.mat(out.data,most.unique)
  }
  return(as.vector(out.data))
}



#' Change column name in different form to desired form.
#' 
#' Searches for possible equivalents for a desired column in a dataframe
#' and replaces first name match with desired name. Useful when parsing
#' different annotation files which may have standard columns with slightly
#' different names, e.g, Gender=SEX=sex=M/F, or ID=id=ids=samples=subjectID
#'
#' @param frame a dataframe or matrix with column names
#' @param desired the column name wanted
#' @param testfor possible alternate forms of the desired column name
#' @param ignore.case whether to ignore the upper/lower case of the column names
#' @return returns the original dataframe with the target column renamed
#' @export 
#' @author Nicholas Cooper \email{nick.cooper@@cimr.cam.ac.uk}
#' @examples
#' df <- data.frame(Sex=c("M","F","F"),time=c(9,12,3),ID=c("ID3121","ID3122","ID2124"))
#' # standard example
#' new.df <- column.salvage(df,"sex",c("gender","sex","M/F")); df; new.df
#' # exact column already present so no change
#' new.df <- column.salvage(df,"ID",c("ID","id","ids","samples","subjectID")); df; new.df
#' # ignore case==TRUE potentially results in not finding desired column:
#' new.df <- column.salvage(df,"sex",c("gender","sex","M/F"),ignore.case=FALSE); df; new.df
column.salvage <- function(frame,desired,testfor, ignore.case=TRUE) 
{
  ## attempt to find predictable misnaming of column (contents of 'testfor')
  # and change name of the first detected misname in the 'testfor' list to 'desired'
  # e.g, want desired column 'GRP' and look for misnamings: 'group', 'Grp', 'grp', etc
  if(is.null(colnames(frame))) { warning("frame had no column names"); return(frame) }
  if(!all(desired %in% colnames(frame))) {
    # ^ ie, if 'desired' already present, do nothing
    tf <- testfor; cf <- colnames(frame)
    if(ignore.case) { tf <- c(tolower(desired),tolower(tf)); cf <- tolower(cf) } 
    if(any(tf %in% cf)) {
      colnames(frame)[(narm(match(tf,cf)))[1]] <- desired
    } else {
      warning("couldn't find any columns: ",paste(testfor,collapse=", ")," to change to '",desired,"' in frame")
    }
  }
  return(frame)
}






#' Flexibly load from a text or binary file, accepts multiple file formats.
#' 
#' Uses file extension to distinguish between binary, csv or other 
#' text formats. Then tries to automatically determine other parameters
#' necessary to read the file. Will attempt to detect the delimiter,
#' and detect whether there is a heading/column names, and whether 
#' the first column should be rownames, or left as a data column.
#' Internal calls to standard file reading functions use 
#' 'stringsAsFactors=FALSE'.
#'
#' @param fn filename (with or without path if dir is specified)
#' @param dir optional directory if separate path/filename is preferred
#' @param want.type if loading a binary file with multiple objects, specify
#'  here the is() type of object you are trying to load
#' @param def the default delimiter to try first
#' @param force.read attempt to read the file even if the file type looks unsupported
#' @param header presence of a header should be autodetected, but can specify header status 
#' if you don't trust the autodetection
#' @param h.test.p p value to discriminate between number of characters in a column name versus
#'  a column value (sensitivity parameter for automatic header detection)
#' @param quiet run without messages and warnings
#' @param treatas a standard file extension, e.g, 'txt', to treat file as
#' @param override assume first col is rownames, regardless of heuristic
#' @param more.types optionally add more file types which are read as text
#' @param auto.vec if the file seems to only have a single column, automatically
#'  return the result as a vector rather than a dataframe with 1 column
#' @param one.byte logical parameter, passed to 'get.delim', whether to look for only 1-byte
#'  delimiters, to also search for 'whitespace' which is a multibyte (wildcard) delimiter type. 
#'  Use one.byte = FALSE, to read fixed width files, e.g, many plink files.
#' @param ... further arguments to the function used by 'reader' to parse the file,
#'  e.g, depending on file.type, can be read.table(), read.delim(), read.csv().
#' @return returns the most appropriate object depending on the file type,
#'  which is usually a data.frame except for binary files
#' @export 
#' @author Nicholas Cooper \email{nick.cooper@@cimr.cam.ac.uk}
#' @examples
#' orig.dir <- getwd(); setwd(tempdir()); # move to temporary dir
#' # create some datasets
#' df <- data.frame(ID=paste("ID",101:110,sep=""),
#'   scores=sample(70,10,TRUE)+30,age=sample(7,10,TRUE)+11)
#' DNA <- apply(matrix(c("A","C","G","T")[sample(4,100,TRUE)],nrow=10),
#'                                                 1,paste,collapse="")
#' fix.wid <- c("    MyVal    Results        Check",
#'   "    0.234      42344          yes",
#'   "    0.334        351          yes","    0.224         46           no",
#'   "    0.214     445391          yes")
#' # save data to various file formats
#' test.files <- c("temp.txt","temp2.txt","temp3.csv",
#'                               "temp4.rda","temp5.fasta","temp6.txt")
#' write.table(df,file=test.files[1],col.names=FALSE,row.names=FALSE,sep="|",quote=TRUE)
#' write.table(df,file=test.files[2],col.names=TRUE,row.names=TRUE,sep="\t",quote=FALSE)
#' write.csv(df,file=test.files[3])
#' save(df,file=test.files[4])
#' writeLines(DNA,con=test.files[5])
#' writeLines(fix.wid,con=test.files[6])
#' # use the same reader() function call to read in each file
#' for(cc in 1:length(test.files)) {
#'   cat(test.files[cc],"\n")
#'   myobj <- reader(test.files[cc])  # add 'quiet=FALSE' to see some working
#'   print(myobj); cat("\n\n")
#' }
#' # inspect files before deleting if desired
#' unlink(test.files) 
#' # myobj <- reader(file.choose()); myobj # run this to attempt opening a file
#' setwd(orig.dir) # reset working directory to original
reader <- function(fn,dir="",want.type=NULL,def="\t",force.read=TRUE,header=NA,h.test.p=0.05,
                   quiet=TRUE,treatas=NULL,override=FALSE,more.types=NULL,
                   auto.vec=TRUE,one.byte=TRUE,...)
{
  # try to read in data from many types of datafile
  typ <- classify.ext(fn,more.txt=more.types)
  #print(typ)
  types <- c("BIN","CSV","TXT","OTH")
  treatas <- classify.ext(treatas,more.txt=more.types)
  # get file extension
  fsuf <- get.ext(fn)
  # construct the full path
  full.path <- cat.path(dir,fn)
  if(!file.exists(full.path)) { stop("file 'fn' did not exist") }
  # select file type to open as
  # typ <- types[types %in% fsuf][1]
  if(!is.null(treatas) )
  {
    if(treatas[1] %in% types) {
      cat(paste(" will assume file is",treatas,"\n"))
      typ <- treatas[1]
    } 
  }
  if(typ=="OTH") {
    if(force.read) {
      if(!quiet) { warning("looks like a non-supported file type") }
      typ <- types[3]
    } else {
      warning("looks like a non-supported file type.",
              "\nadd suffix to 'more.types' if it's a text file",
              "\nchange file suffix to .rda or .csv if it's binary or csv")
      cat("reader knows the following types:\n"); print(classify.ext(print.all=TRUE))
      return(NULL)
    }
  }
  if(typ==types[1])
  {
    ## .RData/.rda binary file
    file.out <- list()
    fns <- load(full.path)
    if(!quiet) { cat(paste(" found",paste(fns,collapse=","),"in",full.path,"\n")) }
    if (!is.null(want.type)) {
      for (ii in 1:length(fns)) {
        if(want.type %in% is(get(fns[ii])))
        { fns <- fns[ii] ; break }
      } }
    for (cc in 1:length(fns))
    {  file.out[[cc]] <- get(fns[cc]) }
    names(file.out) <- fns
    if(length(file.out)==1) { file.out <- file.out[[1]] }
  }
  if(typ==types[2])
  {
    # csv file
    file.out <- read.csv(full.path,stringsAsFactors=FALSE,...)
    file.out <- shift.rownames(file.out,override,warn=!quiet)
  }
  if(typ==types[3])
  {
    # other text .txt file
    detect <- suppressWarnings(get.delim(full.path,n=50,comment="#",large=10,one.byte=one.byte))
    if(length(detect)!=0) { def <- detect }
    first.10 <- readLines(full.path,n=10); hope10 <- length(first.10)
    if(hope10<3) { lown <- hope10 } else { lown <- 3 }
    if(hope10<2) { h2 <- lown <- 1 } else { h2 <- 2 }
    mini.parse <- strsplit(first.10,def)
    if(length(mini.parse)==0) { return(character(0)) }  # empty file
    splitto <- sapply(mini.parse,length)
    if(all(splitto[lown:hope10]==splitto[h2]) & (splitto[1]==(splitto[h2]-1))) {
      # was first row 1 delimiter shorter than all other rows? (i.e, header row)
      if(nchar(def)>1) {
        #read table only supports 1 byte delimiters (not regular expressions) 
        cat(" reading file with unspecified whitespace demiliting, check result imported correctly\n")
        cat(" header ignored!\n")
        ## START: SAME CHUNK AS BELOW FOR NON-HEADER VERSION ##
        raw.txt <- readLines(full.path); raw.txt <- raw.txt[raw.txt!=""]
        split.by.ws <- lapply(raw.txt[-1],strsplit,split=def) # hoping all same length
        lns <- sapply(split.by.ws,length)
        if(all(lns==1)) { split.by.ws <- Unlist(split.by.ws,1) } # if list got nested
        lns <- sapply(split.by.ws,length)
        if(length(unique(lns))!=1) {
          # some are different lengths, try removing blanks
          split.by.ws <- lapply(split.by.ws,function(X) { X[X!=""] })
          if(length(unique(sapply(split.by.ws,length)))!=1) {
            warning("attempt to read as white space implied different row lengths, returning list")
            return(split.by.ws)
          }
        }
        file.out <- t(as.df(split.by.ws))
        ## END: SAME CHUNK AS BELOW FOR NON-HEADER VERSION ##
        if(all(file.out[,1]=="") & ncol(file.out)>1) { file.out <- file.out[,-1] }
        rownames(file.out) <- NULL
      } else {
        file.out <- read.table(full.path, ..., sep=def,header=TRUE,row.names=1,stringsAsFactors=FALSE)
      }
    } else {
      ## probably has a header too!
      if(all(splitto==1)) {
        # probably just a vector file
        file.out <- readLines(full.path)
      } else {
        # some kind of delimited file
        if (all(splitto[1]==splitto[h2]))
        {
          # test whether there is a significant nchar difference between first and other rows
          # as a means of detecting whether the first row is a header row
          char.cnts <- sapply(mini.parse,nchar)
          #prv(char.cnts,mini.parse);
          lns <- sapply(char.cnts,length); 
          if(!sum(abs(diff(lns)))==0) {
            culprits <- lns!=Mode(lns)
            warning("Line(s): ",paste(which(culprits),collapse=","),"; seemed to have a different number of columns than the majority, will attempt to read anyway")
            mini.parse <- mini.parse[which(!culprits)]
            char.cnts <- sapply(mini.parse,nchar)
          }
          z.test <- function(X) { (X[1] - mean(X[-1],na.rm=TRUE))/max(1,sd(X[-1],na.rm=TRUE)) }
          Zs <- abs(apply(as.df(char.cnts),1,z.test))
          if(length(Zs)>0 & hope10>2) {
            #print(mini.parse); print(char.cnts); print(Zs)
            critZ <- abs(qnorm((h.test.p*(min(round(sqrt(length(Zs))),9)))/2))
            #cat("Zs mean",mean(Zs,na.rm=TRUE),"critZ",critZ,"\n")
            mZ <- mean(Zs,na.rm=TRUE)
           # prv(mZ,critZ,Zs,z.test,splitto,h.test.p,char.cnts,hope10)
            if(length(mZ)>0) {
             if(mZ<critZ) { hdr <- FALSE } else { hdr <- TRUE }
            } else { hdr <- TRUE }
            if(!is.na(header)) { if(is.logical(header)) { hdr <- header }}
            if(nchar(def)>1) {
              #read table only supports 1 byte delimiters (not regular expressions) 
              if(!quiet) {
                cat(" reading file with unspecified whitespace demiliting, check result imported correctly\n")
              }
              ## START: SAME CHUNK AS ABOVE FOR NON-HEADER VERSION ##
              raw.txt <- readLines(full.path); raw.txt <- raw.txt[raw.txt!=""]
              split.by.ws <- lapply(raw.txt,strsplit,split=def) # hoping all same length
              lns <- sapply(split.by.ws,length)
              if(all(lns==1)) { split.by.ws <- Unlist(split.by.ws,1) } # if list got nested
              lns <- sapply(split.by.ws,length)
              if(length(unique(lns))!=1) {
                # some are different lengths, try removing blanks
                split.by.ws <- lapply(split.by.ws,function(X) { X[X!=""] })
                if(length(unique(sapply(split.by.ws,length)))!=1) {
                  warning("attempt to read as white space implied different row lengths, returning list")
                  return(split.by.ws)
                }
              }
              file.out <- t(as.df(split.by.ws))
              ## END: SAME CHUNK AS ABOVE FOR NON-HEADER VERSION ##
              if(all(file.out[,1]=="") & ncol(file.out)>1) { file.out <- file.out[,-1] }
              if(nrow(file.out)>1 & hdr) {
                # assuming header row
                colnames(file.out) <- file.out[1,]; file.out <- file.out[-1,]
              }
              rownames(file.out) <- NULL
            } else {
              file.out <- read.delim(full.path, ..., sep=def,header=hdr,stringsAsFactors=FALSE)
              file.out <- shift.rownames(file.out,override,warn=!quiet)
            }
          } else {
            warning("file too small to determine structure, or other error in reader function, reverting to readLines")
            file.out <- readLines(full.path)
          }
        } else {
          if(!quiet) { 
            warning("*.txt file not delimited by",def)
            cat(" will just read as text\n")
          }
          file.out <- readLines(full.path)
        }
      }
    }
  }  
  if(exists("file.out")) {
    if(is.null(want.type)) {
      if(is.data.frame(file.out)) {
        if(is.null(rownames(file.out)) | (all(rownames(file.out)==paste(1:nrow(file.out)))) ) {
          if(dim(file.out)[2]==1) { 
            # if only 1 column, return a vector
            if(auto.vec) { file.out <- file.out[,1] }
          } else {
            # if first column has picked up default 1:nrow() rownames, remove it
            if(all(file.out[,1]==rownames(file.out))) { file.out[[1]] <- NULL }
          } } }
    }
  }
  return(file.out)
}


#' Classify file types readable by standard R I/O functions.
#' 
#' Look for known file extensions and classify as binary, comma-separated,
#' text format, or OTH=other; other files are assumed to be unreadable.
#' To read other files, need to specify more types manually.
#'
#' @param ext filenames or extensions to classify
#' @param more.txt more extensions that should be treated as txt
#' @param more.bin more extensions that should be treated as binary
#' @param more.csv more extensions that should be treated as csv
#' @param print.all setting to T, simply prints the list of supported ext
#' @return returns the 4 way classification for each file/extension
#' @export 
#' @author Nicholas Cooper \email{nick.cooper@@cimr.cam.ac.uk}
#' @seealso \code{\link{get.delim}}
#' @examples
#' classify.ext(c("test.txt","*.csv","tot","other","rda","test.RDatA"))
classify.ext <- function(ext=NULL,more.txt=NULL,more.bin=NULL,more.csv=NULL,print.all=FALSE) {
  bin <- c("RData","rda","bin")
  csv <- c("csv")
  txt <- c("txt","tab","map","vcf","bim","dat","fam","reg","cnv")
  if(is.character(more.txt)) { txt <- c(txt,more.txt) }
  if(is.character(more.bin)) { bin <- c(bin,more.bin) }
  if(is.character(more.csv)) { csv <- c(csv,more.csv) }
  bin <- tolower(bin); csv <- tolower(csv); Ext <- ext
  txt <- tolower(txt); ext <- tolower(ext);
  types <- c(bin,csv,txt)
  if(print.all) {  return(types) }
  if(length(grep(".",ext))>0) { 
    # maybe entered some filenames instead of extensions, try conversion
    ge <- get.ext(ext)
    ext[ge!=""] <- ge[ge!=""]
  }
  out <- rep("OTH",times=length(ext))
  out[ext %in% bin] <- "BIN"
  out[ext %in% txt] <- "TXT"
  out[ext %in% csv] <- "CSV"
  names(out) <- Ext
  return(out)
}


#' Shift the first column of a dataframe to rownames() if appropriate.
#' 
#' Checks whether the first column looks like IDs, and if so will.
#' remove the column, and move these values to rownames.
#'
#' @param dataf data.frame to run the conversion on
#' @param override assume col 1 is rownames, regardless of numeric() test
#' @param warn whether to display warnings if assumptions aren't met
#' @return returns vectors of strings of char, lengths X
#' @export 
#' @author Nicholas Cooper \email{nick.cooper@@cimr.cam.ac.uk}
#' @seealso \code{\link{reader}}
#' @examples
#' df1 <- data.frame(ID=paste("ID",101:110,sep=""),
#'                    scores=sample(70,10,TRUE)+30,age=sample(7,10,TRUE)+11)
#' shift.rownames(df1)
#' df2 <- data.frame(ID=paste(101:110),
#'                    scores=sample(70,10,TRUE)+30,age=sample(7,10,TRUE)+11)
#' shift.rownames(df2) # first col are all numbers, so no convert
#' shift.rownames(df2,override=TRUE) # override forces conversion
shift.rownames <- function(dataf,override=FALSE,warn=FALSE)
{
  if(is.data.frame(dataf))
  {
    if(ncol(dataf)<2) { return(dataf) } # only had 1 column, so can't convert it to rownames
    if(nrow(dataf)<1) { return(dataf) } # had no rows, so can't convert it to rownames
    typz <- sapply(sapply(dataf,is),"[[",1)
    rn <- paste(dataf[,1])
    dataout <- as.df(dataf[,-1])
    if (typz[1]=="character" & all(typz[-1] %in% c("numeric","integer")))
    {
      numerify <- TRUE
    } else {
      # test how many non-numeric in the first column (i.e. guess whether rownames)
      suppressWarnings(tst <- length(which(is.na(as.numeric(rn))))/nrow(dataout) )
      if (tst>=.75 | override)
      { 
        # test how many non-numeric in the new first column once rowname first col is removed
        suppressWarnings(tst <- length(which(is.na(as.numeric(paste(dataout[,1])))))/nrow(dataout) )
        if(tst<=.75) {
          numerify <- TRUE # convert rest of dataset to numeric (assume only 1st col was txt)
        } else {
          numerify <- FALSE }
      } else {
        if(warn) { warning("proposed rownames were mostly numbers") }
        return(dataf)
      }
    }    
    if(numerify) { for (dd in ncol(dataout))
    { suppressWarnings(dataout[,dd] <- as.numeric(as.character(dataout[,dd]))) }
    } 
    if(anyDuplicated(rn)) { if(warn) { warning("rownames not unique, so leaving as NULL") }; return (dataf) }
    rownames(dataout) <- rn
    return(dataout)
  } else {
    if(length(dim(dataf))==2)
    {
      if("character" %in% is(dataf[,1]))
      {
        if(length(unique(dataf[,1])==nrow(dataf)))
        {
          if(ncol(dataf)>1) {
            sup <- assess.dat.type(dataf)
            if(sup<2) { 
              if(warn){ warning("not sure if rownames should be added from col 1") }
              return(dataf)
            } else {
              rn <- dataf[,1]
              if(ncol(dataf)<3) { dataout <- as.df(dataf[,-1]) } else {
                dataout <- dataf[,-1] }
              if(anyDuplicated(rn)) { if(warn) { warning("rownames not unique, so leaving as NULL") }; return (dataf) }
              rownames(dataout) <- paste(rn)
              if(sup>=10) { for (dd in ncol(dataout))
              { suppressWarnings(dataout[,dd] <- as.numeric(as.character(dataout[,dd]))) }
              }    
              return(dataout)
            }            
          } else {
            if(warn) { warning("only a single column") }
            return(dataf)
          }
        } else {
          if(warn) { warning("duplicate row names") }
          return(dataf)
        }
      }
    } else {
      if(warn) { warning("must be formatted in rows and columns") }
      return(dataf)
    }
    if(warn) { warning("couldn't change first col, not a dataframe") }
    return(dataf)
  }
}


#' Internal function used by shift.rownames()
#' try to work out what format a file is in, whether IDs in column 1
assess.dat.type <- function(dat)
{
  support <- 0
  nchar.mn <- function(vec) { mean(nchar(paste(vec)),na.rm=TRUE) }
  if(length(unique(dat[,1])==nrow(dat)))
  { support <- support+1 }
  suppressWarnings(tst <- length(which(is.na(as.numeric(paste(dat[,1])))))/nrow(dat) )
  if (tst>=.75)
  { 
    suppressWarnings(tst <- length(which(is.na(as.numeric(paste(dat[,2])))))/nrow(dat) )
    if(tst<=.75) {
      support <- support+10
    } else {
      support <- support+1 }
  }
  if (ncol(dat)>2 & support <2) {
    # slow and potentially inaccurate so only run if in doubt
    rc <- dim(dat)
    if (nrow(dat)>100) { r.sel <- 1:100 } else { r.sel <- 1:rc[1] } 
    if (ncol(dat)>50) { c.sel <- 1:50 } else { c.sel <- 1:rc[2] } 
    col.chr <- apply(dat[r.sel,c.sel],2,nchar.mn)
    dif.to.1 <- mean(abs(col.chr[-1]-col.chr[1]))
    dif.to.0 <- mean(abs(col.chr[-1]-rev(col.chr[-1])))
    if (dif.to.0 < dif.to.1)
    { support <- support+1 } else { support <- support - 1 }
  }
  return(support)
}




#' Get the file extension from a file-name.
#'
#' @param fn filename(s) (with full path is ok too)
#' @return returns the (usually) 3 character file extension of a filename
#' @export 
#' @author Nicholas Cooper \email{nick.cooper@@cimr.cam.ac.uk}
#' @seealso \code{\link{rmv.ext}}
#' @examples
#' get.ext("/documents/nick/mydoc.xlsx")
#' get.ext(c("temp.cnv","temp.txt"))
get.ext <- function(fn) {
  # get file extension from a filename character string
  if(length(fn)<1) { warning("fn had length of zero"); return(fn) }
  if(all(is.na(fn)) | !is.character(fn)) { stop("fn should not be NA and should be of type character()") }
  strip.file.frags <- function(X) {
    file.segs <- strsplit(X,".",fixed=TRUE)[[1]]
    lss <- length(file.segs)
    if (lss>1) { out <- paste(file.segs[lss]) } else { out <- "" }
    return(out)
  }
  return(sapply(fn,strip.file.frags))
}

#' Remove the file extension from a file-name.
#'
#' Default is to only remove from a known list of file types,
#' this is to protect files with '.' which may not have an extension
#' This option can be changed, and more types can be specified too.
#'
#' @param fn filename(s) (with full path is ok too)
#' @param only.known logical, only remove extension if in the 'known' list
#' @param more.known character vector, add to the list of known extensions
#' @param print.known return the list of 'known' file extensions
#' @return returns the file name/path without the file extension
#' @export 
#' @author Nicholas Cooper \email{nick.cooper@@cimr.cam.ac.uk}
#' @seealso \code{\link{get.ext}}
#' @examples
#' rmv.ext(print.known=TRUE)
#' rmv.ext("/documents/nick/mydoc.xlsx")
#' rmv.ext(c("temp.cnv","temp.txt","temp.epi"))
#' # remove anything that looks like an extension
#' rmv.ext(c("temp.cnv","temp.txt","temp.epi"),only.known=FALSE) 
#' # add to list of known extensions
#' rmv.ext(c("temp.cnv","temp.txt","temp.epi"),more.known="epi") 
rmv.ext <- function(fn=NULL,only.known=TRUE,more.known=NULL,print.known=FALSE) {
  # remove file extension from a filename character string
  #if updating this function, also update internal copy in NCmisc
  known.ext <- c("TXT","RDATA","TAB","DAT","CSV","VCF","GCM","BIM","MAP","FAM",
                 "PFB","SH","R","CPP","H","DOC","DOCX","XLS","XLSX","PDF","JPG",
                 "BMP","PNG","TAR","GZ","CNV","PL","PY","ZIP","ORG","RDA","DSC","BCK",
                 "ABW","HTM","HTML",toupper(more.known))
  if(is.null(fn)) { 
    if(print.known) {
      return(known.ext)
    } else {
      warning("couldn't remove extension, not a character()"); return(fn) 
    }
  } else {
    if (all(is.na(fn))) { warning("couldn't remove extension, all values were NA"); return(fn) }
  }
  if(print.known) { cat("known file extensions:\n"); print(known.ext) }
  if(!is.character(fn)) { warning("couldn't remove extension, not a character()"); return(fn) }
  rmv.one <- function(X,known.ext) {
    file.segs <- strsplit(paste(X),".",fixed=TRUE)[[1]]
    lss <- length(file.segs)
    if (lss>1) { 
      if(only.known){
        if(toupper(file.segs[lss]) %in% known.ext) {
          out <- paste(file.segs[-lss],collapse=".") 
        } else { 
          out <- X
        }
      } else {
        out <- paste(file.segs[-lss],collapse=".") 
      }
    } else {
      out <- X 
    }
  }
  return(sapply(fn,rmv.one,known.ext=known.ext))
}


#' Internal function to assess whether data is a character or list of characters
is.ch <- function(x) { 
  # is function for character() or list of characters
  if(is.null(x)) { return(FALSE) }
  pt1 <- is.character(x)
  if(!pt1 & is.list(x)) { pt2 <- all(sapply(x,is.ch)) } else { pt2 <- pt1 }
  return(as.logical(pt1 | pt2))
}

#' Internal function 
as.df <- function(...) {
  ## unless 'stringsAsFactors' is called explicitly, wrap
  # as.data.frame so that stringsAsFactors is always FALSE
  test <- list(...)
  if(length(names(test))<1) { doit <- FALSE } else {
    if(names(test) %in% "stringsAsFactors") { doit <- TRUE } else { doit <- FALSE }
  }
  if(doit) {
    return(as.data.frame(...))
  } else {
    return(as.data.frame(...,stringsAsFactors=FALSE))
  }
}

#' Simple and robust way to create full-path file names.
#' 
#' Create a path with a file name, plus optional directory, prefix,
#' suffix, and file extension. dir/ext are robust, so that if they
#' already exist, the path produced will still make sense. Prefix is
#' applied after the directory, and suffix before the file extension.
#'
#' @param dir directory for the full path, if 'fn' already has a dir,
#'  then dir will be overridden. Auto add file separator if not present
#' @param fn compulsory vector of file names/paths
#' @param pref prefix to add in front of the file name
#' @param suf suffix to add after the file name, before the extension
#' @param ext file extension, will override an existing extension
#' @param must.exist the specified file must already exist, else error
#' @return returns vector of file names with the full paths
#' @export 
#' @author Nicholas Cooper \email{nick.cooper@@cimr.cam.ac.uk}
#' @examples
#' mydir <- "/Documents"
#' cat.path(mydir,"temp.doc")
#' # dir not added if one already present
#' cat.path(mydir,"/Downloads/me/temp.doc")
#' # using prefix and suffix
#' cat.path(mydir,"temp.doc","NEW",suf=5)
#' # changing the extension from .docx to .doc
#' cat.path(mydir,"temp.docx",ext="doc")
cat.path <- function(dir="",fn,pref="",suf="",ext="",must.exist=FALSE) 
{
  #if updating this function, also update internal copy in NCmisc
  dir.ch <- .Platform$file.sep
  if(is.list(fn) & is.ch(fn)) { fn <- unlist(fn) } #; 
  if(length(dir)>1) { dir <- dir[1]; cat("only first dir was used\n") }
  if(length(ext)>1) { ext <- ext[1]; cat("only first extension was used\n") }
  if(length(grep(dir.ch,fn))>0) {
    dir <- dirname(fn)  #split into dir and fn if fn has /'s
    fn <- basename(fn)
  }
  dir <- dir.force.slash(dir)
  if(ext!="") {
    #make sure ext includes the dot
    if(substr(ext,1,1)!=".")   { ext <- paste(".",ext,sep="") }
    #if ext is already built into suffix or filename, remove it from there
    fn <- rmv.ext(paste(fn))
    suf <- rmv.ext(paste(suf))
  }
  location <- paste(dir,pref,fn,suf,ext,sep="")
  if(any(!file.exists(location)) & must.exist) {
    warn <- paste("required file",location,"not found!")
    stop(warn)
  }
  return(location)
}



#' Internal function used by cat.path
dir.force.slash <- function(dir) {
  # make sure 'dir' directory specification ends in a / character
  if(!is.null(dim(dir))) { stop("dir should be a vector") }
  dir <- paste(dir)
  dir.ch <- .Platform$file.sep
  the.test <- (dir!="" & substr(dir,nchar(dir),nchar(dir))!=dir.ch)
  dir[the.test] <- paste(dir[the.test],dir.ch,sep="")
  return(dir)
}



#' Function to collect arguments when running R from the command line
#' 
#' Allows parameter specification by A=..., B=... in the command line
#' e.g, R < myScript.R M=1 NAME=John X=10.5, using commandArgs()
#'
#' @param arg.list the result of a commandArgs() call, or else NULL to
#'  initiate this call within the function
#' @param coms list of valid commands to look for, not case sensitive
#' @param def list of default values for each parameter (in same order)
#' @param verbose logical, whether to print to the console which assignments are made and warning messages
#' @param list.out logical, whether to return output as a list or data.frame 
#' @return returns dataframe showing the resulting values [column 1, "value"] for each 'coms' (rownames); or, if
#'  list.out=TRUE, then returns a list with names corresponding to 'coms' and values equivalent to 'value' column of 
#'  the data.frame that would be returned if list.out=FALSE
#' @export 
#' @author Nicholas Cooper \email{nick.cooper@@cimr.cam.ac.uk}
#' @examples
#' parse.args(c("M=1","NAME=John","X=10.5"),coms=c("M","X","NAME"))
#' parse.args(c("N=1")) # invalid command entered, ignored with warning
#' temp.fn <- "tempScript1234.R"
#' # make a temporary R Script file to call using the command line
#' # not run # writeLines(c("require(reader)","parse.args(coms=c('M','X','NAME'))"),con=temp.fn)
#' bash.cmd <- "R --no-save < tempScript1234.R M=1 NAME=John X=10.5"
#' # run above command in the terminal, or using 'system' below:
#' # not run # arg <- system(bash.cmd)
#' # not run # unlink(temp.fn) # delete temporary file
parse.args <- function(arg.list=NULL,coms=c("X"),def=0, list.out=F, verbose=TRUE)
{
  # parse arguments entered running R from the command line
  # using NAME=VALUE
  if(is.null(arg.list)) { arg.list <- commandArgs() }
  if(length(coms)>1 & length(def)==1) { def <- rep(def,length(coms)) }
  coms.original.case <- coms
  coms <- toupper(coms)
  outframe <- data.frame(value=paste(def),stringsAsFactors=FALSE)
  rownames(outframe) <- coms
  assign.cmds <- grep("=",arg.list,fixed=TRUE)
  if(length(assign.cmds)>0)
  {
    vars.lst <- strsplit(arg.list[assign.cmds],"=",fixed=TRUE)
    vars <- sapply(vars.lst,"[",1)
    vals <- sapply(vars.lst,tail,1)
    vals <- paste(vals)
    #vals <- as.integer(vals)
    if(any(toupper(vars) %in% coms))
    {
      which.coms <- match(toupper(vars),coms)
      for (cc in 1:length(which.coms))
      {
        if(!is.na(vals[cc]) & !is.na(which.coms[cc]))
        {
          if(verbose) { cat(paste("set",coms[which.coms[cc]],"=",vals[cc]),"\n") }
          outframe[coms[which.coms[cc]],1] <- paste(vals[cc])
        } else {
          if(verbose) {
            cat(paste(" skipping invalid variable",vars[cc],"or invalid value",vals[cc],"\n"))
          }
        }
      }
    } else {
      warning("command line arguments entered but none are valid")
    }
  } else {
    outframe <- NULL
  } 
  rownames(outframe) <- coms.original.case
  if(list.out) {
    outframe <- as.list(as.data.frame(t(outframe),stringsAsFactors=FALSE))
  }
  return (outframe)
}


#' Find the number of rows (lines) in a file.
#' 
#' Returns the number of lines in a file, which in the case of a datafile
#' will often correspond to the number of rows, or rows+1. Can also
#' do this for all files in the directory. File equivalent of nrow()
#'
#' @param fn name of the file(s) to get the length of
#' @param dir optional path for fn location, or specify all files in dir
#' @param all.in.dir select whether to extract length for all files in dir
#' @return returns length of file (or all files)
#' @export 
#' @author Nicholas Cooper \email{nick.cooper@@cimr.cam.ac.uk}
#' @seealso \code{\link{file.ncol}}
#' @examples
#' orig.dir <- getwd(); setwd(tempdir()); # move to temporary dir
#' write.table(matrix(rnorm(100),nrow=10),"temp.txt",col.names=FALSE)
#' file.nrow("temp.txt")
#' # use with caution, will be slow if dir contains large files
#' # not run # file.nrow(all.in.dir=TRUE) 
#' unlink("temp.txt")
#' setwd(orig.dir) # reset working directory to original
file.nrow <- function(fn="",dir="",all.in.dir=FALSE) {
  zip <- FALSE
  if(all(fn=="")) { 
    all.in.dir <- TRUE; fn <- paste(dir,"*",sep="") 
  } else {
    if(length(fn)>1) { warning("only first file used, to do multiple use all.in.dir") }
    if(!is.file(fn[1],dir)) { stop("Error: file did not exist") }
  }
  if(!check.linux.install("wc")) { return(suppressWarnings(wc.windows(fn))) }
  if(tolower(get.ext(fn))!="gz") { cmd <- paste("wc -l ",fn,sep="") } else {
    if(!all.in.dir) { cmd <- paste("zcat ",fn[1]," | wc -l ",sep="") ; zip <- TRUE } }
  linez <- system(cmd,intern=TRUE)
  dir.linez <- unique(c(grep("Is a directory",linez),grep("total",linez)))
  if(length(dir.linez)>0) { linez <- linez[-dir.linez] }
  splits <- strsplit(linez,"\t| +")
  splits <- lapply(splits,function(x) { x <- x[x!=""] })
  lens <- as.numeric(sapply(splits,"[",1))
  nms <- sapply(splits,"[",2)
  if(zip & all(is.na(nms))) { nms <- (fn[1]) }
  names(lens) <- nms
  return(lens)
}


# internal alternative for wc -l for windows
wc.windows <- function(fn) {
  dat.file <- file(fn)
  open(con=dat.file,open="r")
  eof <- FALSE; cc <- 0
  while(!eof) {
    eof <- length(readLines(dat.file,n=1))==0
    cc <- cc + 1
  }
  close(con=dat.file)
  return(cc-1)
}


#' Find the number of columns (lines) in a file.
#' 
#' Returns the number of columns in a datafile. File equivalent of ncol()
#'
#' @param fn name of the file(s) to get the length of
#' @param reader try to read the entire file to get a result, else
#'  looks at the top few lines (ignoring comments)
#' @param del specify a delimiter (else this will be auto-detected)
#' @param comment a comment symbol to ignore lines in files
#' @param skip number of lines to skip at top of file before processing
#' @param force try to read the file regardless of whether it looks
#'  like an invalid file type. Only use when you know the files are valid
#' @param excl.rn exclude rownames from column count (essentially subtract 1)
#' @return returns number of columns in file(s). If no delimiter, then =1
#' @export 
#' @author Nicholas Cooper \email{nick.cooper@@cimr.cam.ac.uk}
#' @seealso \code{\link{file.nrow}}
#' @examples
#' orig.dir <- getwd(); setwd(tempdir()); # move to temporary dir
#' write.table(matrix(rnorm(100),nrow=10),"temp.txt",col.names=FALSE,row.names=FALSE)
#' file.ncol("temp.txt",excl.rn=TRUE)
#' unlink("temp.txt")
#' # find ncol for all files in current directory:
#' # [NB: use with caution, will be slow if dir contains large files]
#' # not run # lf <- list.files(); if(length(lf)==0) { print("no files in dir") }
#' # lf <- lf[classify.ext(lf)=="TXT"]
#' # not run (only works if length(lf)>0) # file.ncol(lf) 
#' setwd(orig.dir) # reset working directory to original
file.ncol <- function(fn,reader=FALSE,del=NULL,comment="#",skip=0,force=FALSE,excl.rn=FALSE) {
  ## ncol function but for a file
  use.reader <- get("reader",mode="logical")
  if(!is.ch(fn)) { stop("Invalid format entered for filenames") }
  if(is.list(fn) | length(fn)>1 ) { return(sapply(fn,file.ncol,
      reader=reader,del=del,comment=comment,skip=skip,force=force)) }
  if(!is.file(fn,"")) { warning("Error: file did not exist"); return(NULL) }
  if(classify.ext(fn)=="BIN") { use.reader <- TRUE }
  if(classify.ext(fn)=="OTH") { 
    warn <- (paste(fn,"looks like an unsupported file type, may crash") )
    if(!force) { stop(warn) } else { warning(warn); return(NA) }
  }
  if(use.reader){
    fl <- reader(fn)
    if(is.null(dim(fl))) {
      if(is.vector(fl)) {
        fl <- force.frame(fl)
      } else {
        return(NA)
      }
    }
    ncols <- ncol(fl); rm(fl)
  } else {
    if(is.null(del)) {
      # suppress in case it's a vector file, e.g, ncol will be 1
      del <- suppressWarnings(get.delim(fn,n=6,comment=comment,skip=skip))
    }
    test.bit <- n.readLines(fn=fn,n=5,comment=comment,skip=skip)
    lens <- sapply(strsplit(test.bit,del),length)
    ncols <- ceiling(mean(lens,na.rm=TRUE))
  }
  if(excl.rn) { ncols <- ncols-1 }
  return(ncols)
}



#' Read 'n' lines (ignoring comments and header) from a file.
#' 
#' Useful when you don't know the length/structure of a file
#' and want a useful sample to look at. Can skip ahead in the file too.
#' Copes well when there are less than 'n' lines in the file.
#'
#' @param fn name of the file(s) to get the length of
#' @param n number of valid lines to attempt to read
#'  looks at the top few lines (ignoring comments)
#' @param comment a comment symbol to ignore lines in files
#' @param skip number of lines to skip at top of file before processing
#' @param header whether to allow for, and skip, a header row
#' @return returns the first n lines of the file meeting the criteria,
#'  or if 'skip' implies lines beyond the length of the file, the 
#'  result,will be truncated - although in this case, the last 
#'  line will always be read.
#' @export 
#' @author Nicholas Cooper \email{nick.cooper@@cimr.cam.ac.uk}
#' @examples
#' orig.dir <- getwd(); setwd(tempdir()); # move to temporary dir
#' dat <- matrix(sample(100),nrow=10)
#' write.table(dat,"temp.txt",col.names=FALSE,row.names=FALSE)
#' n.readLines("temp.txt",n=2,skip=2,header=FALSE)
#' dat[3:4,]
#' unlink("temp.txt")
#' setwd(orig.dir) # reset working directory to original
n.readLines <- function(fn,n,comment="#",skip=0,header=TRUE)
{
  # read at least 'n' lines of a file, skipping lines and ignoring any starting with comment
  if(!file.exists(fn)) { warning("file doesn't exist"); return(NULL) }
  if(!is.character(comment)) { warning("illegal comment char, reverting to #"); comment <- "#" }
  rl <- 0; cc <- 0 + {if(is.numeric(skip)) skip else 0 }
  while(rl<n) { 
    test.bit <- readLines(fn,n+cc)
    if(skip>0 & length(test.bit>1)) { test.bit <- test.bit[-(1:(min((length(test.bit)-1),skip)))] }
    cmnt <- which(substr(test.bit,1,1)==comment)
    rl <- n+cc-length(cmnt)
    cc <- cc + length(cmnt)
  }
  if(length(cmnt)>0) { test.bit <- test.bit[-cmnt] } 
  if(length(test.bit)>1 & header) { test.bit <- test.bit[-1] }
  return(test.bit)
}


#' Determine the delimiter for a text data file.
#' 
#' Reads the first few lines of data in a text file and attempts to
#' infer what delimiter is in use, based on the 'delims' argument
#' that would result in the most consistent number of columns in the
#' first 'n' lines of data. Searches preferentially for delimiters
#' implying between 2 and 'large' columns, then for >large, and lastly
#' for 1 column if nothing else gives a match.
#'
#' @param fn name of the file to parse
#' @param n the number of lines to read to make the inference
#' @param comment a comment symbol to ignore lines in files
#' @param skip number of lines to skip at top of file before processing
#' @param delims the set of delimiters to test for
#' @param one.byte only check for one-byte delimiters, [e.g, whitespace regular expr is >1 byte]
#' @param large search initially for delimiters that imply more than 1, 
#'  and less than this 'large' columns; if none in this range, look next
#'  at >large.
#' @return returns character of the most likely delimiter
#' @export 
#' @author Nicholas Cooper \email{nick.cooper@@cimr.cam.ac.uk}
#' @seealso \code{\link{reader}}
#' @examples
#' orig.dir <- getwd(); setwd(tempdir()); # move to temporary dir
#' df <- data.frame(ID=paste("ID",101:110,sep=""),
#'   scores=sample(70,10,TRUE)+30,age=sample(7,10,TRUE)+11)
#' # save data to various file formats
#' test.files <- c("temp.txt","temp2.txt","temp3.csv")
#' write.table(df,file=test.files[1],col.names=FALSE,row.names=FALSE,sep="|",quote=TRUE)
#' write.table(df,file=test.files[2],col.names=TRUE,row.names=TRUE,sep="\t",quote=FALSE)
#' write.csv(df,file=test.files[3])
#' # report the delimiters
#' for (cc in 1:length(test.files)) { 
#'   cat("\n",test.files[cc],": ")
#'   print(get.delim(test.files[cc])) }
#' unlink(test.files)
#' setwd(orig.dir) # reset working dir to original
get.delim <- function(fn,n=10,comment="#",skip=0,
                        delims=c("\t","\t| +"," ",";",","),large=10,one.byte=TRUE)  
{
  # test top 'n' lines to determine what delimeter the file uses
  if(!file.exists(fn)) { stop(paste("cannot derive delimiter as file",fn,"was not found"))}
  test.bit <- n.readLines(fn=fn,n=n,comment=comment,skip=skip)
  #print(test.bit)
  num.del <- list()
  if(any(nchar(delims)>1) & one.byte) { delims <- delims[-which(nchar(delims)>1)] }
  for (cc in 1:length(delims)) {
    fff <- nchar(delims[[cc]])==1
    num.del[[cc]] <- sapply(strsplit(test.bit,delims[[cc]],fixed=fff),length)
  }
  #prv(num.del)
  if(all(unlist(num.del)==1)) { 
    warning("not a delimited file, probably a vector file")
    return(NA)
  }
  # are there some delimiters that produce consistent ncol between rows?
  need.0 <- sapply(num.del,function(X) { sum(diff(X)) })
  num.del <- sapply(num.del,"[",1)
  if(any(!need.0)) {
    #rng <- range(num.del)
    candidates <- which(num.del>1 & num.del<=large & !need.0)
    #print(candidates)
    if(length(candidates)>0) { out <- candidates[1] 
    } else {
      candidates <- which(num.del>large & !need.0)
      if(length(candidates)>0) { out <- candidates[1]
      } else {
        candidates <- which(num.del==1 & !need.0)
        if(length(candidates)>0) { out <- candidates[1]
        } else {
          warning("no delimiters tried were able to produce a valid file spec")
          out <- NULL
        }
      }
    }
  } else {
    warning("no delimiters tried were able to produce a valid file spec")
    out <- NULL
  }
  #print(delims)
  return(delims[out])
}




#' Search for a directory to add to the path so that a file exists.
#' 
#' Looks for a file named 'fn' in 'dir', and if not found there, 
#' broadens the search to the list or vector of directorys, 'dirs'.
#' Returns the full path of the first match that exists.
#'
#' @param fn name of the file to search for
#' @param dir the first directory to look in (expected location)
#' @param dirs vector/list, a set of directories to look in should
#'  the file not be found in 'dir'.
#' @return if the file is found, returns the full path of the file,
#'  else returns an empty string ""
#' @export 
#' @author Nicholas Cooper \email{nick.cooper@@cimr.cam.ac.uk}
#' @seealso \code{\link{is.file}}
#' @examples
#' orig.dir <- getwd(); setwd(tempdir()); # move to temporary dir
#' l.fn <- "temp.txt"
#' writeLines("test",con=l.fn)
#' find.file(l.fn)
#' find.file(l.fn,dir=getwd())
#' unlink(l.fn)
#' # not run # common.places <- ## <<add local folder here>> ##
#' # not run # d.fn <- cat.path(common.places[1],l.fn)
#' # write this example file to the first of the folders #
#' # not run # if(!file.exists(d.fn)) {  writeLines("test2",con=d.fn) }
#' # search the local folders for a
#' # a file named 'temp.txt'
#' # not run # find.file(l.fn,dir=getwd(),dirs=common.places)
#' # unlink(d.fn) # run only if test file produced
#' setwd(orig.dir) # reset working dir to original
find.file <- function(fn,dir="",dirs=NULL) { 
  if(!is.ch(fn)) { return("") }
  for (cc in 1:length(fn)) {
    if(is.character(fn)) {
      fn[cc] <- add.dir.if.not(fn[cc],dir,dirs,TRUE) # e.g, add 'dir$ano' if needed
    } else {
      fn[[cc]] <- find.file(fn[[cc]],dir,dirs)
    }
  }
  return(fn)
}



#' Test whether a file exists in a target directory, or alternative
#' list of directories.
#' 
#' Looks for a file named 'fn' in 'dir', and if not found there, 
#' broadens the search to the list or vector of directorys, 'dirs'.
#' Returns TRUE or FALSE as to whether the file exists.
#'
#' @param fn name of the file to search for
#' @param dir the first directory to look in (expected location)
#' @param dirs vector/list, a set of directories to look in should
#'  the file not be found in 'dir'.
#' @param combine if a list is given, test whether ALL files valid
#' @return logical vector of whether each file was found,  or  if
#'  combine is true, then a single value whether ALL valid or not.
#' @export 
#' @author Nicholas Cooper \email{nick.cooper@@cimr.cam.ac.uk}
#' @seealso \code{\link{find.file}}
#' @examples
#' orig.dir <- getwd(); setwd(tempdir()); # move to temporary dir
#' l.fn <- "temp.txt"
#' writeLines("test",con=l.fn)
#' some.local.files <- narm(list.files()[1:10])
#' print(some.local.files)
#' is.file(l.fn)
#' is.file(l.fn,dir=getwd())
#' is.file(some.local.files)
#' # add a non-valid file to the list to see what happens
#' is.file(c(some.local.files,"fakefile.unreal"))
#' is.file(c(some.local.files,"fakefile.unreal"),combine=FALSE)
#' unlink(l.fn)
#' setwd(orig.dir) # reset working dir to original
is.file <- function(fn,dir="",dirs=NULL,combine=TRUE) {
  # if the path or raw filename 'fn' can refer to a file in dir/dirs
  # return TRUE, else FALSE
  if(combine) { FUN <- sapply } else { FUN <- lapply }
  X <- find.file(fn,dir,dirs)
  if(is.list(X)) {
    out <- FUN(X,is.file,dir,dirs,combine=combine)
  } else {
    if(is.character(X)) {
      out <- (X!="")
    } else {
      out <- rep(FALSE,times=length(X))  # was 'fn'??
    }
  }
  if(combine) {
    return(all(out))
  } else {
    return(out)
  }  
}


#' Internal function used by find.file
add.dir.if.not <- function(locs,dir="",dirs="",blank.if.not=TRUE,warn=FALSE) {
  # for any number of file names or paths, adds a directory if that is needed
  # to point to an existing file. can cycle through multiple directory possibilities
  # if desired; look first in 'dir', then try dirs if that fails
  if(is.list(dir)) { dir <- paste(unlist(dir)) }
  if(is.list(dirs)) { dirs <- paste(unlist(dirs)) }
  dir <- c(dir,dirs,"") # this makes sure 'dir' directory(s) are checked first, others, then current as last resort
  if(is.null(locs)) { 
    locs <- ""
    warning("filename was blank, might return nonsense location")
  } else {
    if(all(is.na(locs))) {
      locs <- ""
      warning("filename was NA, might return nonsense location")
    }
  }
  if(locs=="") { return("") } # otherwise will return first dir in list
  if(is.null(dir)) { dir <- "" ; if(warn) { warning("directory was blank") } }
  if(!is.character(locs)) { stop("Error: expecting filenames") }
  if(!is.character(dir)) { stop("Error: expecting directory names") }
  found <- FALSE
  for (cc in 1:length(locs)) {
    #if(!file.exists(locs[cc])) {
    dd <- 1
    while(!found & dd<=length(dir)) {
      test.next.dir <- cat.path(dir[dd],locs[cc])
      if(file.exists(test.next.dir))
      { 
        locs[cc] <- test.next.dir; found <- TRUE
      } else {
        found <- FALSE; dd <- dd + 1
      }
    }
    #}
    if(!file.exists(locs[cc]) & blank.if.not) { locs[cc] <- "" }
  }
  return(locs)
}




#' Convert a matrix or dataframe to fixed-width for nice file output
#' 
#' Pads each column to a common size so write.table() produces a
#' fixed width format that looks nice
#'
#' @param dat data.frame or matrix
#' @return returns dat with space padding as character
#' @export 
#' @author Nicholas Cooper \email{nick.cooper@@cimr.cam.ac.uk} #' @author Nicholas Cooper \email{nick.cooper@@cimr.cam.ac.uk} 
#' @examples
#' orig.dir <- getwd(); setwd(tempdir()); # move to temporary dir
#' df <- data.frame(ID=paste("ID",99:108,sep=""),
#'   scores=sample(150,10,TRUE)+30,age=sample(16,10,TRUE))
#' dff <- conv.fixed.width(df)
#' write.table(df,file="notFW.txt",row.names=FALSE,col.names=FALSE,quote=FALSE)
#' write.table(dff,file="isFW.txt",row.names=FALSE,col.names=FALSE,quote=FALSE)
#' cat("Fixed-width:\n",paste(readLines("isFW.txt"),"\n"),sep="")
#' cat("standard-format:\n",paste(readLines("notFW.txt"),"\n"),sep="")
#' unlink(c("isFW.txt","notFW.txt"))
#' setwd(orig.dir) # reset working dir to original
conv.fixed.width <- function(dat) {
  # for a dataframe convert to fixed width format prior to writing to file
  padw <- function(X,L) { paste(spc(L-nchar(paste(X))),X,sep="") }
  if(is.null(dim(dat))) { stop("Error: need a matrix/dataframe as input") }
  for (cc in 1:ncol(dat)) {
    max.ch <- max(nchar(paste(dat[,cc])))
    dat[,cc] <- padw(dat[,cc],max.ch)
  }
  return(dat)
}

Try the reader package in your browser

Any scripts or data that you put into this service are public.

reader documentation built on May 2, 2019, 9:27 a.m.