R/wfm.R
In austin: A package for doing things with words

Documented in as.docword as.wfm as.worddoc docs getdocs is.wfm trim wfm wfm2bmr wfm2lda wordmargin words

wfm <- function(mat, word.margin=1){
    if (is(mat, 'character')){
        if (!file.exists(mat))
            stop(paste("The file", mat, "does not exist"))
        mat <- read.csv(file=mat, row.names=1)
        #if (word.margin==2)
        #    mat <- t(mat)
    }

    if (is.null(rownames(mat)) || is.null(colnames(mat)))
        stop("Cannot convert this object to a wfm: It must have row and column names")

    if (!is(mat, 'matrix'))
        mat <- as.matrix(mat)

    if (word.margin==1)
        dimnames(mat) <- list(words=rownames(mat), docs=colnames(mat))
    else if (word.margin==2)
        dimnames(mat) <- list(docs=rownames(mat), words=colnames(mat))
    else
        stop("word.margin must be 1 (rows) or 2 (columns)")
    mat
}

is.wfm <- function(x){
    nms <- names(dimnames(x))
    !is.null(nms) && identical(sort(nms), c('docs', 'words'))
}

wordmargin <- function(x){
    ifelse(names(dimnames(x))[1]=='words', 1, 2)
}

'wordmargin<-' <- function(x, value){
    if (wordmargin(x) != value){
        ## relabel the dimensions but don't change their contents
        if (value==1)
            dimnames(x) <- list(words=rownames(x), docs=colnames(x))
        else
            dimnames(x) <- list(docs=rownames(x), words=colnames(x))
    }
    x
}

as.wfm <- function(mat, word.margin=1){
    ## rather speculative conversion from tm format here
    if (is(mat, 'TermDocumentMatrix'))
        return(wfm(as.matrix(mat), word.margin=1))

    if (is(mat, 'DocumentTermMatrix'))
        return(wfm(as.matrix(mat), word.margin=2))

    ## if it's a data.frame or a matrix this should work
    ## provided there are row and column names
    if (is.null(rownames(mat)) || is.null(colnames(mat)))
        stop("Cannot convert this object to a wfm.  It must have row and column names")

    wfm(mat, word.margin=word.margin)
}

as.docword <- function(wfm){
    if (wordmargin(wfm)==1)
        t(wfm)
    else
        wfm
}

as.worddoc <- function(wfm){
    if (wordmargin(wfm)==1)
        wfm
    else
        t(wfm)
}

words <- function(wfm){
    if (wordmargin(wfm)==1)
        rownames(wfm)
    else
	colnames(wfm)
}

'words<-' <- function(x, value){
    if (length(words(x)) != length(value))
        stop("Replacement values are not the same length as the originals")

    if (wordmargin(x)==1)
        rownames(x) <- value
    else
        colnames(x) <- value
    return(x)
}

docs <- function(wfm){
    if (wordmargin(wfm)==1)
        colnames(wfm)
    else
        rownames(wfm)
}

'docs<-' <- function(x, value){
    if (length(docs(x)) != length(value))
        stop("Replacement value is not the same length as original")

    if (wordmargin(x)==1)
        colnames(x) <- value
    else
        rownames(x) <- value
    return(x)
}

getdocs <- function(wfm, which){
    if (wordmargin(wfm)==1)
        wfm[, which, drop=FALSE]
    else {
        wfm[which, , drop=FALSE]
    }
}

trim <- function(wfm, min.count=5, min.doc=5, sample=NULL, verbose=TRUE){
    ## eject words occurring less than min.count times and
    ## in fewer than min.doc documents
    ## and optionally sample a subset of words

    if (!is.wfm(wfm))
        stop("Function not applicable to this object")

    mY <- as.worddoc(wfm)

    rs1 <- which(rowSums(mY) >= min.count)
    if (verbose)
        cat("Words appearing less than", min.count, "times:", (nrow(mY) - length(rs1)), "\n")

    rs2 <- which(apply(mY, 1, function(x){ sum(x>0) >= min.doc } ))
    if (verbose)
        cat("Words appearing in fewer than", min.doc, "documents:", (nrow(mY) - length(rs2)), "\n")

    tokeep <- intersect(rs1, rs2)
    if (length(tokeep)==0)
        stop("No words left after trimming")

    if (!is.null(sample)){
        if (sample > length(tokeep))
            warning(paste('Sample size', sample, 'larger than',
                          length(tokeep), "already filtered from", nrow(mY), "so ignoring sampling request"))
        tokeep <- sample(tokeep, min(length(tokeep), sample))
        if (verbose)
            cat("Retaining a random sample of", sample, "words\n")
    }
    return(mY[sort(tokeep),])
}

wfm2lda <- function(wfm, dir=NULL, names=c("mult.dat", "vocab.dat")){
    m <- as.worddoc(wfm)
    v <- words(m)

    d <- list()
    for (i in 1:length(docs(m))){
        nzero <- which(m[,i]>0)
        d[[i]] <- t(matrix(as.integer(c(nzero-1, m[nzero, i])), ncol=2))
    }
    if (!is.null(dir))
        return(list(vocab=v, data=d))

    if (!file.exists(dir))
        stop(paste("Folder", dir, "does not exist"))
    lines <- rep("", length(d))
    for (i in 1:length(d)){
        lines[i] <- paste(NCOL(d[[i]]),
                          paste(d[[i]][1,], ":", d[[i]][2,], sep="", collapse=" "))
    }

    writeLines(lines, file.path(dir, names[1]))
    writeLines(v, file.path(dir, names[2]))
}

wfm2bmr <- function(y, wfm, filename){
    if (!is.null(y)){
        if (!(0 %in% y))
            stop("Dependent variable must index from 0")
        y <- as.numeric(y)+1 # to make 1 and 2 not 0 and 1
    }
    x <- as.worddoc(wfm)

    lines <- rep("", NCOL(x))
    for (i in 1:NCOL(x)){
        nonz <- which(x[,i] != 0.0)
        nonz.vals <- as.numeric(x[nonz,i])
        yy <- ifelse(!is.null(y), y[i], NULL)

        ldat <- paste(nonz, ":", nonz.vals, collapse=" ", sep="")
        lines[i] <- ifelse(is.null(y), ldat, paste(y[i], ldat))
    }

    writeLines(lines, filename)
}

Any scripts or data that you put into this service are public.

austin documentation built on May 2, 2019, 5:48 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

austin
A package for doing things with words

R/wfm.R
In austin: A package for doing things with words

Defines functions wfm is.wfm wordmargin as.wfm as.docword as.worddoc words docs getdocs trim wfm2lda wfm2bmr

Documented in as.docword as.wfm as.worddoc docs getdocs is.wfm trim wfm wfm2bmr wfm2lda wordmargin words

Try the austin package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

austin A package for doing things with words

R/wfm.R In austin: A package for doing things with words

Defines functions wfm is.wfm wordmargin as.wfm as.docword as.worddoc words docs getdocs trim wfm2lda wfm2bmr

Documented in as.docword as.wfm as.worddoc docs getdocs is.wfm trim wfm wfm2bmr wfm2lda wordmargin words

Try the austin package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

austin
A package for doing things with words

R/wfm.R
In austin: A package for doing things with words