Nothing
# tm-adtFR.R
# J-P Mจนller, SSP/ UNIL, jean-pierre.mueller@unil.ch
# version 1.1 du 13 mars 2009
# distributed under the terms of the GNU General Public License Version 2, June 1991.
# http://wwwpeople.unil.ch/jean-pierre.mueller/ATO_avec_R_files/tm-adtFR.R
setGeneric("FRtreetager", function(object, keep = c("word", "pos", "lemma") , reduce = TRUE, sep = "/", bef = "", aft = "", ... ) standardGeneric("FRtreetager"))
setMethod("FRtreetager",
signature(object = "PlainTextDocument"),
function(object, keep = c("word", "pos", "lemma") , reduce = TRUE, sep = "/", bef = "", aft = "", ... ) {
zz <- file("~/treein.txt", "w", encoding="latin1")
cat(Content(object), file=zz)
close(zz)
system("~/cmd/tree-tagger-french ~/treein.txt >treeout.txt ", ignore.stderr=TRUE)
con <- file("~/treeout.txt", encoding="latin1")
toout <- read.csv(con, header = FALSE, sep="\t", stringsAsFactors = FALSE)
u <- NULL
if("word" %in% keep)
{
u <- cbind(u, toout[,1])
}
if("pos" %in% keep)
{
u <- cbind(u, toout[,2])
}
if("lemma" %in% keep)
{
u <- cbind(u, toout[,3])
}
vv <- apply(u, 1 , paste, collapse = sep)
vv <- paste( bef, vv, aft)
if (reduce)
{
Content(object) <- paste(vv, collapse = " ")
}
else
{
Content(object) <- vv
}
return(object)
}
)
splitDoc <- function(corpus, words= 30, keep.sent=FALSE, keep.par.bound=TRUE) {
require("tm", quietly = TRUE)
spl.docs <- NULL
new.DMD <- NULL
DMD <- DMetaData(corpus)
DMD <- data.frame(DMD, ID.ori = row.names(DMD))
for (k in seq_along(corpus)) {
c.k <- corpus[[k]]
if ((keep.par.bound)|(length(c.k)==1)) {
c.k <- unlist(strsplit(c.k, "\r"))
c.k <- c.k[nchar(c.k)!=0]
}
if (!keep.par.bound) {
c.k <- paste(c.k,collapse=" ")
}
c.k <- c(c.k)
s <- 1: length(c.k)
for (i in s) {
zz <- file("~/treein.txt", "w", encoding="latin1")
cat( c.k[i], file=zz)
close(zz)
system("~/cmd/tree-tagger-french ~/treein.txt >treeout.txt ", ignore.stderr=TRUE)
con <- file("~/treeout.txt", encoding="latin1")
tab.w.pos <- read.csv(con, header = FALSE, sep="\t", stringsAsFactors = FALSE)
tab.w.pos$wc <- 0
tab.w.pos[ (tab.w.pos[,2]!="PUN") & (tab.w.pos[,2]!="PUN:cit") & (tab.w.pos[,2]!="SENT"), "wc"] <- 1
tab.w.pos$wc <- cumsum(tab.w.pos$wc)
ss <- c( seq(1, tab.w.pos$wc[length(tab.w.pos$wc)], words) , tab.w.pos$wc[length(tab.w.pos$wc)] +1 )
if (keep.sent) {
st.li <- tab.w.pos$wc[tab.w.pos[,2]=="PUN:cit"] +1
st.li <- c(1, (tab.w.pos$wc[tab.w.pos[,2]=="SENT"] +1 ), st.li , tab.w.pos$wc[length(tab.w.pos$wc)] +1 )
st.li <- sort(unique(st.li))
vs <- NULL
for (j in 1:(length(ss))) {
vs <- c(vs, st.li[ which.min( abs(st.li- ss[j]))])
}
ss <- vs
ss <- unique(ss)
}
zl <- NULL
for (j in 1:(length(ss)-1)) {
start.v <- min( (1:nrow(tab.w.pos)) [tab.w.pos$wc == (ss[j])] )
end.v <- min( c( nrow(tab.w.pos)+1 , (1:nrow(tab.w.pos)) [tab.w.pos$wc == (ss[j+1])] ) -1)
zt <- tab.w.pos[ start.v : end.v , 1]
zt <- paste(zt, collapse=" ")
zl <- c(zl, zt)
}
spl.docs <- c(spl.docs, zl)
k.rep <- rep( k, length(zl) )
r.DMD <- DMD[k.rep, ]
new.DMD <- rbind(new.DMD, r.DMD)
}
}
spl.docs <- Corpus( VectorSource(spl.docs), readerControl = list( language = "french"))
row.names(new.DMD) <- NULL
meta(spl.docs, names(new.DMD)) <- new.DMD
return(spl.docs)
}
speci.calc <- function (tle) {
rstle <- rowSums(tle)
cstle <- colSums(tle)
ttle <- sum(tle)
nb <- matrix( cstle, dim(tle)[1], dim(tle)[2], byrow=TRUE )
nech <- matrix( rstle, dim(tle)[1], dim(tle)[2], byrow=FALSE )
spos <- phyper(as.matrix(tle) - 1, nb, ttle - nb, nech, lower.tail = FALSE)
sneg <- phyper(as.matrix(tle), nb, ttle-nb, nech, lower.tail = TRUE)
specificite <- list( pos = spos, neg = sneg)
return(specificite)
}
speci.extract <- function (speci, level)
{
speci.cells <- which(speci <= (level) , arr.ind = TRUE)
z <- NULL
if (nrow(speci.cells) > 0) {
o <- order(speci.cells[, 1])
speci.cells <- speci.cells[o, ]
z <- data.frame(Doc = rownames(speci)[speci.cells[, 1]],
Terms = colnames(speci)[speci.cells[, 2]],
p.value = speci[ speci.cells] )
}
return(z)
}
speci.indic <- function (tle, tab.ind) {
rstle <- rowSums(tle)
cstle <- colSums(tle)
ttle <- sum(tle)
sum.ind <- matrix(0, nrow = ncol(tab.ind), ncol = ncol(tle))
for (i in 1:ncol(tab.ind)) {
sum.ind[i, ] <- colSums(tle[tab.ind[,i]==1, ])
}
nb <- matrix( cstle, dim(sum.ind)[1], dim(sum.ind)[2], byrow=TRUE )
nech <- matrix( rowSums(sum.ind), dim(sum.ind)[1], dim(sum.ind)[2], byrow=FALSE )
spos <- phyper(sum.ind - 1, nb, ttle - nb, nech, lower.tail = FALSE)
colnames(spos) <- colnames(tle)
rownames(spos) <- colnames(tab.ind)
sneg <- phyper(sum.ind, nb, ttle-nb, nech, lower.tail = TRUE)
colnames(sneg) <- colnames(tle)
rownames(sneg) <- colnames(tab.ind)
specificite <- list( pos = spos, neg = sneg)
return(specificite)
}
rep.mod <- function(tle, speci.col, tab.ind.col, n=5)
{
u <- as.matrix(tle) %*% as.vector(speci.col)
u[tab.ind.col== 0] <- NA
z <- order(u, na.last=TRUE, decreasing=FALSE)
z <- z[1:n]
return(z)
}
codings2tm <- function(Code,language="eng",byFile = FALSE){
retrieval <- NULL
currentCode <- Code
if (length(currentCode)!=0)
{
currentCode <- iconv(currentCode,to="UTF-8")
currentCid <- RQDAQuery(sprintf("select id from freecode where name== '%s' ",currentCode))[1,1]
## reliable is more important
if(!is.null(currentCid))
{
## retrieval <- RQDAQuery(sprintf("select cid,fid, selfirst, selend,seltext from coding where status==1 and cid=%i",as.numeric(currentCid)))
retrieval <- RQDAQuery(sprintf("select cid,fid, selfirst, selend, seltext from coding, source where coding.status==1 and coding.cid= %i and source.id == coding.fid",as.numeric(currentCid)))
## the new sql syntax is provided by Benson Ye 2010-7-25
if (nrow(retrieval)!=0)
{
retrieval <- retrieval[order( retrieval$fid),]
fid <- unique(retrieval$fid)
retrieval$fname <-""
for (i in fid)
{
FileName <- RQDAQuery(sprintf("select name from source where status==1 and id==%i",i))[['name']]
tryCatch(Encoding(FileName) <- "UTF-8",error=function(e){})
retrieval$fname[retrieval$fid==i] <- FileName
}
Encoding(retrieval$seltext) <- Encoding(retrieval$fname) <- "UTF-8"
}
}
if (byFile == TRUE){
retrieval_f <- data.frame(NULL)
for (j in 1 : nrow(retrieval)){
retrieval_f[paste(retrieval[j,"fid"]),"seltext"] <- paste(retrieval_f[paste(retrieval[j,"fid"]),"seltext"]
,retrieval[j,"seltext"])
retrieval_f[paste(retrieval[j,"fid"]),"fname"] <- retrieval[j,"fname"]
}
retrieval_f[,"fid"] <- as.numeric(rownames(retrieval_f))
retrieval <- retrieval_f
}
retrived <- tm:::Corpus(tm::VectorSource(retrieval$seltext), readerControl = list( language = language))
retrieval$seltext <- NULL
meta(retrived,tag=names(retrieval)) <- retrieval
return(retrived)
}
}
files2tm <- function(Code,language="eng"){
fcat <- RQDAQuery("select treefile.fid, filecat.name from treefile, filecat on filecat.catid==treefile.catid and treefile.status==1 and filecat.status==1")
Encoding(fcat$name) <- "UTF-8"
names(fcat) <- c("id","filecat")
txt <-RQDAQuery("select name, id, file, owner, date from source where status==1")
txt <- merge(txt,fcat,by="id",all.x=TRUE,all.y=FALSE)
Encoding(txt$file) <- "UTF-8"
Encoding(txt$name) <- "UTF-8"
fcorpus <- tm:::Corpus(tm::VectorSource(txt$file),
readerControl = list(language = language))
meta(fcorpus, tag = c("fname","id","owner","date","filecat")) <- txt[,c("name","id","owner","date","filecat")]
fcorpus
}
setGeneric("tmcollapse", function(object, collapse=" ") standardGeneric("tmcollapse"))
setMethod("tmcollapse",
signature(object = "PlainTextDocument"),
function(object, collapse=" ") {
object <- paste(object, sep = "", collapse = collapse)
return(object)
}
)
setGeneric("tm2codings", function(object) standardGeneric("tm2codings"))
setMethod("tm2codings", signature(object = "Corpus"),
function(object) {
require("RQDA", quietly = TRUE)
u <- lapply(object,tmcollapse, collapse="\r")
fname <- tryCatch(meta(object,'fname')[,'fname'],error=function(e) 1:length(u))
names(u) <- fname
write.FileList(u)
}
)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.