R/misc.R

Defines functions dec_to_binary rank.inv.norm predict.pcc getFormattedMCSummary MCsummary get.sim.res sample.for.cv multi.outer make_args_mtx summ get.st.mean weighted.ave write.svm.input covariability pr.1 N.G.E N.L N.L.E pava empty2na

Documented in dec_to_binary empty2na getFormattedMCSummary get.sim.res MCsummary multi.outer pava predict.pcc rank.inv.norm sample.for.cv summ

# replace empty strings with NAs
empty2na=function(x) {x[x==""]=NA; x = as.factor(as.character(x))}

##  From the R-help e-mail by Ted Harding: http://tolstoy.newcastle.edu.au/R/e2/help/07/03/12853.html
##  See also http://tolstoy.newcastle.edu.au/R/help/05/05/4254.html
pava <- function(x, wt = rep(1, length(x)))
{
    n <- length(x)
    if (n <= 1) return(x)
    lvlsets <- 1:n
    repeat 
    {
        viol <- (as.vector(diff(x)) < 0)
        if (!(any(viol))) break
        i <- min((1:(n-1))[viol])
    
        lvl1 <- lvlsets[i]
        lvl2 <- lvlsets[i+1]
        ilvl <- ( (lvlsets == lvl1) | (lvlsets == lvl2) )
    
        x[ilvl] <- sum(x[ilvl] * wt[ilvl]) / sum(wt[ilvl])     
        lvlsets[ilvl] <- lvl1
    }
    x
} 


# from Cai Tianxi
## count how many YY's are smaller or equal to yy
N.L.E <- function(yy, YY)  ## sum I(YY <= yy[i])
{
   rank(c(yy+1e-8,YY))[1:length(yy)] - rank(yy)  ### add a small pertubation to avoid ties when calculating rank
}
N.L <- function(yy, YY)  ## sum I(YY < yy[i])
{
   rank(c(yy-1e-8,YY))[1:length(yy)] - rank(yy)  ### add a small pertubation to avoid ties when calculating rank
}
N.G.E <- function(yy, YY)  ## sum I(YY >= yy[i])
{
   length(YY)-(rank(c(yy-1e-8,YY))[1:length(yy)] - rank(yy))
}

# get first prinipal component
pr.1=function(x){
    x.s=scale(x)
    pr.s = prcomp(x.s)
    out=c(x.s %*% pr.s$rotation[,1])
    attr(out, "rotation")=pr.s$rotation[,1]
    out
}

# compute covariability as defined in Clarke 1995 page 2271, second formula
# x is a vector of characters, so is y
covariability=function(x,y){
    tmp=table(x)
    p.x=data.frame(tmp/(sum(tmp)))
    tmp=table(y)
    p.y=data.frame(tmp/(sum(tmp)))
    
    # tabulate all pairwise combination of aa
    pair = paste(x,y,sep="")
    tmp=table(pair)
    dat=data.frame(tmp/(sum(tmp)))
    
    ps=numeric(nrow(dat))
    for (i in 1:nrow(dat)) {
        row.=dat[i,]
        a2=as.character(row.[1,1])
        p.i=p.x$Freq[p.x$x==substr(a2,1,1)]
        p.j=p.y$Freq[p.y$y==substr(a2,2,2)]
        p.ij=row.[1,2]
        ps[i]=p.ij**2*log(p.ij/p.i/p.j)        
    }
    sum(ps)
}
## test
#dat=readFastaFile ("D:/1CHMM/domains/SET/SETpfamseed/seq/SETpfamseed_aligned.fasta")
#seqs.mat.a=mysapply(dat, s2c)
#covar=matrix(0,ncol(seqs.mat.a),ncol(seqs.mat.a))
#for (i in 1:(ncol(seqs.mat.a)-1)){
#    for (j in (i+1):ncol(seqs.mat.a)){
#        covar[i,j]<-covar[j,i]<-covariability(seqs.mat.a[,i], seqs.mat.a[,j])
#    }
#}
#sort(round(covar,3), decre=TRUE)[1:100]

# output format:
#+1 1:0.708333 2:1 3:1 4:-0.320755 5:-0.105023 6:-1 7:1 8:-0.419847 9:-1 10:-0.225806 12:1 13:-1 
# y: vector of 1 and non-1
# z: matrix or data.frame of covariates
# file.name: e.g. test. There should be no file extension
# ws: vector of weights
write.svm.input=function(y, z, file.name="svm_input", ws=NULL){
    rows=sapply(1:length(y), function(i){
        paste(c(ifelse(y[i]==1,"+1","-1"),(1:ncol(z))%.%":"%.%z[i,]), collapse=" ")
    })
    write(paste(rows,collapse="\n"), file = file.name%.%".dat", append = FALSE, sep = " ")        
    # write weight file
    if (!is.null(ws)) write(paste(ws,collapse="\n"), file = file.name%.%".wgt", append = FALSE, sep = " ")    
}

# m is data frame or matrix, each row is an observation
# w is a named vector, the names may not match m, may not have same number even
# return a vector
weighted.ave=function(m, w){
    if(is.data.frame(m)){
        nam=names(m)
        m=as.matrix(m)
    } else {
        nam=colnames(m)
    }
    ret=w[match(nam, names(w))]
    ret[is.na(ret)]=0
    names(ret)=nam
    c(m %*% ret )
}

# st.fit is an object returned by sn::st.mle, skew normal
get.st.mean=function(st.fit) {
    p=st.fit$dp
    
    xi=p["location"];
    alpha=p["shape"];
    df=p["df"]
    #omega=sd(X);
    omega=p["scale"]
    delta=alpha/sqrt(1+alpha^2);
    mu=delta*sqrt(df/pi)*gamma(0.5*(df-1))/gamma(0.5*df);
    
    # the 4 first moments
    moment1=xi+omega*mu;
    moment2=xi^2 + 2*omega*mu*xi + omega^2 * df/(df-2);
    moment3=xi^3 + 3*omega*mu*xi^2 + 3*omega^2*df/(df-2)*xi +
    omega^3*mu*(3-delta^2)*df/(df-3);
    moment4=xi^4 + 4*omega*mu*xi^3 + 6*omega^2*df/(df-2)*xi^2 +
    4*omega^3*mu*(3-delta^2)*df/(df-3)*xi+omega^4*3*df^2/((df-2)*(df-4));
    
    # the 4 useful measures
    mean=moment1;
    var=moment2;
    skew=moment3/var^(3/2);
    kurt=moment4/var^2 - 3;
    
    c(mean)
}


#iterated sum, like diff
summ=function(x) {
    x[-1]+x[-length(x)]
}

list_args <- Vectorize( function(a,b) c( as.list(a), as.list(b) ), SIMPLIFY = FALSE)
make_args_mtx <- function( alist ) Reduce(function(x, y) outer(x, y, list_args), alist)
multi.outer <- function(f, ... ) {
  args <- make_args_mtx(list(...))
  apply(args, 1:length(dim(args)), function(a) do.call(f, a[[1]] ) )
}


last = function (x, n=1, ...) {
    if (length(x)==1 & is.character(x)) tail (readLines(x), n=n, ...) # read file, length(x)==1 is needed b/c otherwise last(c("a","b")) won't behave properly
    else if (is.vector(x)) x[length(x)]
    else if (is.matrix(x)) x[nrow(x),]
    else if (is.array(x)) x[length(x)]
    else if (is.list(x)) x[[length(x)]]
    else stop ("last(): x not supported")
}


sample.for.cv=function(dat, v, seed){
    save.seed <- try(get(".Random.seed", .GlobalEnv), silent=TRUE) 
    if (inherits(save.seed,"try-error")) {set.seed(1); save.seed <- get(".Random.seed", .GlobalEnv) }      
    set.seed(seed)
    n1=sum(dat$y==1)
    n2=sum(!dat$y==1)
    test.set = c(sample(which (dat$y==1), round(n1/v)), sample(which (dat$y==0), round(n2/v)))
    train.set = setdiff(1:nrow(dat), test.set)
    assign(".Random.seed", save.seed, .GlobalEnv)     
    list("train"=train.set, "test"=test.set)
}


get.sim.res = function(dir, res.name="res", verbose=TRUE) {
    
    if (!requireNamespace("abind")) {print("abind does not load successfully"); return (NULL) }
    
    ## get file names
    tmp=list.files(path=dir, pattern="batch[0-9]+.*.Rdata")
    if (length(tmp)==0) {
        # no files yet
        stop (paste0("no files in ",dir,"\n"))
    } else {
        fileNames = dir %.%"/"%.%tmp
    }
    
    # remove empty files, which sometimes appear, e.g. when disk is full
    empty.files=NULL
    for (x in fileNames) {
        if(file.info(x)$size==0) empty.files=c(empty.files, x)
    }
    fileNames=setdiff(fileNames, empty.files)
    
    ## read files
    res=lapply(fileNames, function(x) {
        if(verbose>=2) print(x); 
        load(file=x); 
        get(res.name)
    })
    # check dimension
    dims=lapply(res, dim)
    if (!is.null(dims[[1]])) {
        dim.same=sapply(dims, function(x) x==dims[[1]])
        if(!all(dim.same)) {
            print(dims)
            stop("not all files have the same dimensional results")
        }
    }
    
    res.all = do.call(abind::abind, res)
    names(dimnames(res.all))=names(dimnames(res[[1]]))
    
    if(verbose) str(res.all)
    
    if(verbose>=2) {
        cat("number of files: "%.%length(fileNames),"\n")
        print(dimnames(res.all)[[1]])
    }
    
    res.all    
}

MCsummary=function(dir, res.name="res", exclude.some=TRUE, exclude.col=1, verbose=TRUE){
    
    if(verbose) cat("\nsummarizing simulation results from ",dir,"\n",sep="")
    res.all=get.sim.res(dir,res.name,verbose=verbose)
    
    # exclude extreme values
    if(exclude.some) {
        q.1=0.05
        subset=res.all[exclude.col,1,]>quantile(res.all[exclude.col,1,],1-q.1,na.rm=T) 
        myprint(mean(subset)) # percent to be excluded
        res.all=res.all[,,!subset]
    }    
        
    mean.=      apply(res.all, 1:(length(dim(res.all))-1), function(x) mean(x, trim=0, na.rm=T))
    median.=    apply(res.all, 1:(length(dim(res.all))-1), function(x) median(x, na.rm=T)) 
    sd.=        apply(res.all, 1:(length(dim(res.all))-1), function(x) sd(x, na.rm=T)) 
    skew.=      apply(res.all, 1:(length(dim(res.all))-1), function(x) skew(x, na.rm=T))    
    width.mc=   apply(res.all, 1:(length(dim(res.all))-1), function(x) diff(quantile(x, c(.025,.975), na.rm=T))) # MC 95% interval
    # if res.all is two-dimensional, mean. etc is only one-dimensional, needs fix
    if(is.null(dim(mean.))) {dim(mean.)<-dim(median.)<-dim(sd.)<-dim(skew.)<-dim(width.mc)<-dim(bias)<-dim(rel.bias)<-dim(medbias)<-dim(rel.medbias)<-c(length(mean.),1)} # 
    
#    # coverage of MC.sd, not implemented for now
#    mc.sd=apply(res.all, 1:(length(dim(res.all))-1), function(x) sd(x, na.rm=T))[1:0]
#    cvg.mc.sd=apply(res.all[1:p,]>coef.0-1.96*mc.sd & res.all[1:p,]<coef.0+1.96*mc.sd, 1, mean, na.rm=T)
    
    list(mean=mean., median=median., sd=sd., skew=skew., width.mc=width.mc) 
        
}

# even if some of the n in nn have not results, it will result a row with NA for that n
# sum.sd determines whether we take mean of sd or median of sd, e.g. when computing summary of CI width
#exclude.some=T; verbose=T; coef.0=NULL; digit1=2; sum.est="mean"; sum.sd="median"; style=1; keep.intercept=FALSE
getFormattedMCSummary=function(path, sim, nn, fit.method, exclude.some=TRUE, exclude.col=1, verbose=TRUE, coef.0=NULL, digit1=2, sum.est=c("mean","median"), sum.sd=c("median","mean"),style=1, keep.intercept=FALSE) {
  
    sum.est<-match.arg(sum.est)    
    sum.sd<-match.arg(sum.sd)    
    names(nn)=nn
    stats=lapply(nn, function(n) {
        dir=path%.%"/"%.% sim%.%"_"%.%n%.%"_"%.%fit.method    
        try(MCsummary(dir, exclude.some=exclude.some, exclude.col=exclude.col, verbose=verbose))
    })
    subset.=!sapply(stats, function(x) inherits(x,"try-error"))
    stats=stats[subset.]
    nn=nn[subset.]    
    
    stat.names=last(dimnames(stats[[1]]$mean))[[1]] 
    sd.methods=sub("sd.","",stat.names[startsWith(stat.names,"sd.")  ])
    if (verbose) myprint(sd.methods)        
    
    # in the following objects, stats from different sample sizes are stacked one upon another
    # c() allows us to assign names 
    tab.1=cbind(
         mean.est=   c(do.call(rbind, lapply(stats, function (x) x$mean       [,"est",drop=F])))
        ,median.est= c(do.call(rbind, lapply(stats, function (x) x$median     [,"est",drop=F]))) 
        ,sd.est=     c(do.call(rbind, lapply(stats, function (x) x$sd         [,"est",drop=F]))) 
        ,skew.est=   c(do.call(rbind, lapply(stats, function (x) x$skew       [,"est",drop=F]))) 
        ,mcwdth.est= c(do.call(rbind, lapply(stats, function (x) x$width.mc   [,"est",drop=F]))) 
        ,              do.call(rbind, lapply(stats, function (x) x[[sum.sd]]  [,startsWith(stat.names,"sd."),drop=F]))
        ,              do.call(rbind, lapply(stats, function (x) x$mean       [,startsWith(stat.names,"covered."),drop=F]))
    )    
    p=nrow(tab.1)/length(nn)
    
    # add width of CI based on sd
    tmp=               do.call(rbind, lapply(stats, function (x) x[[sum.sd]]  [,startsWith(stat.names,"sd."),drop=F])) * 2 * 1.96
    colnames(tmp)=sub("sd.","wdth.",colnames(tmp))
    tab.1=cbind(tab.1, tmp)
    
    # compute bias based on coef.0
    if(!is.null(coef.0)) {
        tab.1=cbind(tab.1
            ,meanbias=        tab.1[,"mean.est"]-coef.0
            ,rel.meanbias=   (tab.1[,"mean.est"]-coef.0)/coef.0
            ,medianbias=      tab.1[,"median.est"]-coef.0
            ,rel.medianbias= (tab.1[,"median.est"]-coef.0)/coef.0
        )
    }
    
    # ro-oder the rows such that stats from different parameters are stacked one upon another    
    tab.2=apply(tab.1, 2, function(x) t(matrix(x,nrow=p)))
    
    #if (verbose) print(tab.2)
    
    if(style==1) {
        out=cbind(
              "$n$"=          c(rep(nn,p))
            , "est"=          c(formatDouble(tab.2[,sum.est%.%".est"] ,digit1))
            , "est(\\%bias)"= c(formatDouble(tab.2[,sum.est%.%".est"] ,digit1)%.%"("%.%round( tab.2[,"rel."%.%sum.est%.%"bias"] *100)%.%")")
            , "range"=        c(formatDouble(tab.2[,"mcwdth.est"],digit1))
            # put width and coverage prob together, e.g. 0.17(93)
            ,                 matrix(formatDouble(tab.2[,startsWith(colnames(tab.2),"wdth.")],digit1) %.% ifelse (is.nan(tab.2[,startsWith(colnames(tab.2),"covered.")]),"", "("%.%round( tab.2[,startsWith(colnames(tab.2),"covered.")] *100)%.%")"), 
                                  nrow=p*length(nn), dimnames=list(NULL,sd.methods))
        ) 
        rownames(out)=outer(nn, names(coef.0), paste)
        if(!keep.intercept) out=out[-(1:length(nn)),] 
    
    } else stop("style not supported")
    
    out
        
}


#Need to re-implement this more efficiently!!!
#    library(maptools)
nearestPointOnSegment=function (s, p) {
    ap = c(p[1] - s[1, 1], p[2] - s[1, 2])
    ab = c(s[2, 1] - s[1, 1], s[2, 2] - s[1, 2])
    t = sum(ap * ab)/sum(ab * ab)
    t = ifelse(t < 0, 0, ifelse(t > 1, 1, t))
    x = s[1, 1] + ab[1] * t
    y = s[1, 2] + ab[2] * t
    result = c(x, y, sqrt((x - p[1])^2 + (y - p[2])^2))
    names(result) = c("X", "Y", "distance")
    result
}


# neither project_to_curve or earlier version of get.lam works correctly to project,  nor pcurve package works
predict.pcc=function(object, newdat, ...) {
    s=object$s[object[["ord"]],]
    res=sapply (1:nrow(newdat), function(i) {
        tmp=sapply (2:nrow(s), function(j) {
            fit=nearestPointOnSegment(s[(j-1):j,], newdat[i,])
        })
        fit=tmp[,which.min(tmp[3,])]
        #segments(newdat[i,1], newdat[i,2], fit[1], fit[2])
    })
    #sum(res["distance",])
    res
}


# rank-based inverse normal transformation
rank.inv.norm = function(x) {
    qnorm(rank(x)/(1+length(x)))
}

INT=rank.inv.norm
    
# covert a decimal number to a binary representation with d digits
dec_to_binary <- function(x,d) sapply(x, function(xx) ifelse(is.na(xx), NA, paste(rev(as.integer(intToBits(xx))[1:d]), collapse="")))

Try the kyotil package in your browser

Any scripts or data that you put into this service are public.

kyotil documentation built on Nov. 28, 2023, 1:09 a.m.