R/AnnoMass.R

Defines functions .onLoad trim.trailing.spaces merge.MSfiles construct.AM cluster.groups run.annotation order.AM write.cdt get.data get.clusters set.components get.components get.presence roc.AM1 pr_tables roc.AM plot.prAM analyze.MSfile addreplace.column categorize_cluster

Documented in analyze.MSfile get.clusters get.data merge.MSfiles plot.prAM pr_tables

.onLoad<-function(...){
    packageStartupMessage("This package contains detailed walkthrough for the current version.\nUse command:\nvignette(\"MetaMass\")")
}
trim.trailing.spaces<-function(x){
    .trim<-function(xx){
        xx<-gsub(" $","",xx)
        xx<-gsub("^ ","",xx)
        return(xx)
    }
    char.cols<-which(sapply(x,is.character))
    message("Trimming trailing spaces")
    for (i in char.cols) x[[i]]<-.trim(x[[i]])

    return(x)
}
merge.MSfiles<-function(MSfiles,by=1,all=TRUE,sep="\t",output=NULL){

    res<-read.table(MSfiles[1],header=TRUE,stringsAsFactors=FALSE,sep=sep,comment.char="")

    if (length(MSfiles)>1){
          if (any(duplicated(res[,by]))){
              warning("Duplicated IDs are not alowed for multiple files. Duplicated IDs are removed.")
              res<-res[-which(duplicated(res[,by])),]
          }
          for (i in 2:length(MSfiles)){
              resloc<-read.table(MSfiles[i],header=TRUE,stringsAsFactors=FALSE,sep=sep,comment.char="")
              if (any(duplicated(resloc[,by]))){
                  warning("Duplicated IDs are not alowed for multiple files. Duplicated IDs are removed.")
                  resloc<-resloc[-which(duplicated(resloc[,by])),]
              }
              res<-merge(res,data.frame(non_NUM_space="-",resloc),by.x=by,by.y=by+1,all.x=all,all.y=all)
          }
    }

    cols<-which(sapply(res,is.numeric))
    resloc<-res[,cols]
    resloc[is.na(resloc)]<-1
    res[,cols]<-resloc
    jumps<-which(diff(cols)>1)
    ngroups<-length(jumps)+1

    groups<-list()[1:ngroups]
    start<-1
    names(jumps)<-NULL

    if (length(jumps)>0){
        for (i in 1:length(jumps)){
            groups[[i]]<-c(start,jumps[i])
            start<-jumps[i]+1

        }
    }

    groups[[ngroups]]<-c(start,length(cols))
    present<-matrix(1,ncol=ngroups,nrow=nrow(res))

    for (i in 1:ngroups){
        cls<-c(groups[[i]][1]:groups[[i]][2])
        ss<-rowSums(res[,cols[cls]])
        selNA<-which(ss<=length(cls))
        present[selNA,i]<-0
     }

    overlap<-rowSums(present)
    nonnum<-sapply(res,function(x) if (is.factor(x)) return(levels(x)=="-") else return(FALSE))
    res[,nonnum]<-"-"
    res<-data.frame(res,overlap=paste("overlaps",overlap))
    if (!is.null(output)) write.table(res,file=output,col.names=TRUE,row.names=FALSE,sep="\t")
    return(invisible(res))
}

construct.AM<-function(annotation,data,annotation.ID=1,data.ID=1,annotation.component=3,group_names=NULL,meta_gr=1){
    annotation<-trim.trailing.spaces(annotation)
    data<-trim.trailing.spaces(data)
    colnames(data)<-paste("data",gsub("\\s","_",gsub("\\s+"," ",colnames(data))),sep="_")
    data.ID<-colnames(data)[data.ID]
    if(any(duplicated(data[,data.ID]))) warning("Protein IDs duplicated in MS data.frame")
    if (any(duplicated(data[,data.ID]))) {
        warning("Duplicated IDs are not alowed. Duplicated IDs are removed.")
        data<-data[-which(duplicated(data[,data.ID[1]])),]
    }
    colnames(annotation)<-paste("Annot",gsub("\\s","_",gsub("\\s+"," ",colnames(annotation))),sep="_")
    annotation.ID<-colnames(annotation)[annotation.ID]
    annotation.component<-colnames(annotation)[annotation.component]
    if(any(duplicated(annotation[,annotation.ID]))) warning("Protein IDs duplicated in Annotation data.frame")
    if (any(duplicated((annotation[,annotation.ID])))) {
        warning("Duplicated IDs are not alowed. Duplicated IDs are removed.")
        annotation<-annotation[-which(duplicated(annotation[,annotation.ID])),]
    }
    empty.lines<-which(is.na(annotation[,annotation.ID]) | annotation[,annotation.ID]=="")
    if (length(empty.lines)>0) annotation<-annotation[-empty.lines,]

    empty.lines<-which(is.na(data[,data.ID]) | data[,data.ID]=="")
    if (length(empty.lines)>0) data<-data[-empty.lines,]

    for (i in annotation.component) annotation[,i]<-toupper(annotation[,i])
    if (length(annotation.component)==1){
        components<-unique(annotation[,annotation.component])
        empty.lines<-which(components=="" | is.na(components))
        if (length(empty.lines)>0) components<-components[-empty.lines]
        components<-list(components)
    } else {
        components<-lapply(annotation.component,FUN=function(i,annotation){fr<-annotation[,i];ss<-which(fr=="" | is.na(fr)); if(length(ss)>0) fr<-fr[-ss];return(unique(fr))},annotation=annotation)
    }


    for (i in 1:length(annotation.component)) message(paste("Cellular components:",paste(components[[i]],collapse=" ")))
    data.cols<-which(sapply(data,is.numeric))

    message(paste("MS data columns:", paste(data.cols,collapse=" ")))
    jumps<-which(diff(data.cols)>1)
    ngroups<-length(jumps)+1
    groups<-list()[1:ngroups]
    start<-1
    names(jumps)<-NULL
    if (length(jumps)>0){
        for (i in 1:length(jumps)){
            groups[[i]]<-c(start,jumps[i])
            start<-jumps[i]+1

        }
    }
    groups[[ngroups]]<-c(start,length(data.cols))

    present<-matrix(1,ncol=ngroups,nrow=nrow(data))
    if (meta_gr>1) present[,c(1:(meta_gr-1))]<-0
    if (!is.null(group_names)){
        if (length(groups)!=length(group_names)) stop("Nb of groups and group names differ")
        colnames(present)<-names(groups)<-group_names
    }

    for (i in meta_gr:ngroups){
        cls<-c(groups[[i]][1]:groups[[i]][2])
        ## print(cls)
        ss<-rowSums(data[,data.cols[cls]])
        selNA<-which(ss<=length(cls))

        present[selNA,i]<-0


    }
    data<-data.frame(data,overlap=paste("overlaps", rowSums(present)))

    res<-list(annotation=list(annotation=annotation,ID=annotation.ID,components=components,components.col=annotation.component),data=list(data=data,ID=data.ID,data.cols=data.cols,groups=groups,present=present))
    class(res)<-"AnnoMass"

    return(res)


}

cluster.groups<-function(AM,group=NULL,subset=NULL){
    sel<-1:nrow(AM$data$data)
    if (!is.null(subset)) sel<-subset
    grS<-NULL
    if (!is.null(group)){
        grS<-NULL
        for (i in group){
            grS<-c(grS,c(AM$data$groups[[i]][1]:AM$data$groups[[i]][2]))
        }
    }
    if (is.null(grS)) grS<-1:length(AM$data$data.cols)
    data<-AM$data$data[sel,]
    if (is.null(group)) group<-1:length(AM$data$groups)
    selNA<-NULL
    for (i in group){
        cls<-c(AM$data$groups[[i]][1]:AM$data$groups[[i]][2])
        ss<-rowSums(data[,AM$data$data.cols[cls]])
        selNA<-unique(c(selNA,which(ss==length(cls))))


    }

    if (length(selNA)>0){

        message(paste("Only", length(sel)-length(selNA), "proteins used in analysis"))
        data<-data[-selNA,AM$data$data.cols[grS]]
    }
    if (nrow(data)==0) stop()
    studies_correlation<-as.dist(1-cor(data))
    hcl<-hclust(studies_correlation)
    return(hcl)




}

run.annotation<-function(AM,method="kmeans",metric="correlation",clusters=60,iter.max=100,nstart=1,group=NULL,subset=NULL){
    sel<-1:nrow(AM$data$data)
    grS<-NULL

    if (!is.null(group)){

        grS<-NULL
        for (i in group){
            grS<-c(grS,c(AM$data$groups[[i]][1]:AM$data$groups[[i]][2]))
        }
    }
    ## print(colnames(AM$data$data[sel,AM$data$data.cols[grS]]))
    if (is.null(grS)) grS<-1:length(AM$data$data.cols)

    if (!is.null(subset)) sel<-subset

    if (length(sel)==0) stop("No annotated protein ID in data")

    message("Clustering...")
    if (method=="kmeans") {
        if (metric=="euclidean") res.cluster<-kmeans(AM$data$data[sel,AM$data$data.cols[grS]],centers=clusters,nstart=nstart,iter.max=iter.max) else res.cluster<-Kmeans(x=AM$data$data[sel,AM$data$data.cols[grS]],method=metric,centers=clusters,iter.max=iter.max,nstart=nstart)
    }
    if (method=="pam"){
        if (metric %in% c("euclidean","manhattan")) res.cluster<-pam(x=AM$data$data[sel,AM$data$data.cols[grS]],metric=metric,k=clusters)
        if (metric=="correlation"){
            message("building distance matrix")
            dd<-as.dist(1-cor(t(AM$data$data[sel,AM$data$data.cols[grS]])))
            message("done")
            res.cluster<-pam(x=dd,k=clusters)
        }


    }
    if (method=="crude"){
        data_loc<-AM$data$data[sel,AM$data$data.cols[grS]]
        if (ncol(data_loc)>1) for (ii in 2:ncol(data_loc)) data_loc[,ii]<--data_loc[,ii]
        ord<-do.call(order,data_loc)

        res.cluster<-list(cluster=rep(NA,length(sel)))

        cls<-data.frame(ord=ord,cls=rep(1:clusters,each=ceiling(length(sel)/clusters))[1:length(sel)])

        res.cluster$cluster[cls$ord]<-cls$cls

    }

    message("Running cluster annotations...")

    AM$data$data$clusters<-NA

    AM$data$data$clusters[sel]=res.cluster$cluster
    AM$results$clusters<-list()[1:length(AM$annotation$components)]
    for (i in 1:length(AM$annotation$components)){
        incMat<-matrix(NA,nrow=clusters,ncol=length(AM$annotation$components[[i]]))

        colnames(incMat)<-AM$annotation$components[[i]]

        resData<-merge(AM$annotation$annotation,AM$data$data,by.x=AM$annotation$ID,by.y=AM$data$ID)

        TP.FP<-rep(NA,clusters)
        for (fr in AM$annotation$components[[i]]){
            for (cl in 1:clusters){

                incMat[cl,fr]<-length(grep(fr,resData[resData$clusters==cl,AM$annotation$components.col[i]]))
            }
        }
        incMat<-data.frame(cluster=1:clusters,incMat,Nb_of_annotations=NA,precision_main_component=NA,main_component=NA,stringsAsFactors=FALSE)

        incMat$Nb_of_annotations=rowSums(incMat[,AM$annotation$components[[i]]])

        loc<-incMat[,AM$annotation$components[[i]]]/incMat$Nb_of_annotations

        whichMain<-apply(loc,MARGIN=1,FUN=function(x) which.max(x)[1])
        precision_main_component<-sapply(1:length(whichMain),FUN=function(i,whichMain,loc) loc[i,whichMain[i]],loc=loc,whichMain=whichMain)
        ss<-which(sapply(precision_main_component,is.null))

        if (length(ss)>0) precision_main_component[ss]<-NA
        incMat$precision_main_component<-unlist(precision_main_component)

        incMat$main_component<-AM$annotation$components[[i]][whichMain]
        colnames(loc)<-paste(colnames(loc),"ratio",sep="_")
        AM$results$clusters[[i]]<-data.frame(incMat,loc,stringsAsFactors=FALSE)
        for (cl in 1:clusters){

            fr<-incMat$main_component[cl]


            NbFr<-length(grep(fr,resData[,AM$annotation$components.col[i]]))

            TP<-length(grep(fr,resData[resData$clusters==cl,AM$annotation$components.col[i]]))

            FP<-length(which(!(resData[resData$clusters==cl,AM$annotation$components.col[i]] %in% c("",fr)) & !is.na(resData[resData$clusters==cl,AM$annotation$components.col[i]])))

            NbOther<-length(which(!(resData[,AM$annotation$components.col[i]] %in% c("",fr)) & !is.na(resData[,AM$annotation$components.col[i]])))

            TP.FP[cl]<-(TP/NbFr)/(FP/NbOther)



        }
        ##AM$results$clusters[[i]]$TP.FP<-TP.FP
        AM$data$data[,paste("main_component",i,sep="_")]<-AM$results$clusters[[i]]$main_component[AM$data$data$clusters]
        AM$data$data[,paste("precision_main_component",i,sep="_")]<-AM$results$clusters[[i]]$precision_main_component[AM$data$data$clusters]

    }
    return(AM)



}

order.AM<-function(AM,ordering=NULL,rID=1){
    if (is.null(ordering)) ord<-order(AM$results$clusters[[rID]]$main_component) else ord<-ordering
    cls<-1:nrow(AM$results$clusters[[rID]])
    names(cls)<-ord

    AM$results$clusters[[rID]]$updated_order<-cls[as.character(1:nrow(AM$results$clusters[[rID]]))]
    AM$data$data$updated_order<-AM$results$clusters[[rID]]$updated_order[AM$data$data$clusters]
    ord<-order(AM$data$data$updated_order)
    AM$data$data<-AM$data$data[ord,]

    return(AM)

}
write.cdt<-function(AM,file="results.cdt",annotation=FALSE,rsID=1){
    AM$data$data[,1]<-as.character(AM$data$data[,1])
    dd<-AM$data$data
    dd<-data.frame(dd,ord=1:nrow(dd))
    dd<-merge(AM$annotation$annotation,dd,by.x=AM$annotation$ID,by.y=AM$data$ID,all.y=TRUE)
    ord<-order(dd$ord)
    Annot<-dd[ord,AM$annotation$components.col[rsID]]
    ##AM$data$data<-dd
    if (length(grep("assigned_location",colnames(AM$data$data)))>0) AM$data$data[,1]<-paste(AM$data$data[,1],AM$data$data$clusters,"Marker=",Annot,"Assign=",AM$data$data[,paste("assigned_location",rsID,sep="_")]) else AM$data$data[,1]<-paste(AM$data$data[,1],AM$data$data$clusters,"Marker=",Annot,"Assign=",AM$data$data[,paste("main_component",rsID,sep="_")])
    data<-AM$data$data
    data.cols<-AM$data$data.cols
    endC<-max(data.cols)
    data[,data.cols]<-t(scale(t(data[,data.cols]),center=FALSE))
    data<-data[,1:endC]
    data<-data.frame(Gene_cluster=data[,1],NAME=data[,1],GWEIGHT=1,data[,-1],stringsAsFactors=FALSE)
    data2<-data[1,]
    for (i in 1:ncol(data2)) data2[,i]<-as.character(data2[,i])
    data2[1,]<-c("EWEIGHT","",rep(1,ncol(data)-2))
    data<-rbind(data2,data)

    write.table(data,file=file,row.names=FALSE,col.names=TRUE,sep="\t",quote=FALSE)
    return(invisible(data))
}

get.data<-function(AM,annotation=TRUE,data.only=FALSE,out=FALSE,fulltext=FALSE){
    if(out){
        ss<-which(colnames(AM$data$data)=="updated_order")
        AM$data$data<-AM$data$data[,-ss]
        ss<-(AM$data$data.cols[length(AM$data$data.cols)]+1):ncol(AM$data$data)
        s1<-grep("^main_component_([0-9]+)$",colnames(AM$data$data))
        s1p<-grep("^precision_main_component_([0-9]+)$|^main_component_([0-9]+)$",colnames(AM$data$data)[ss])
        ss2<-(AM$data$data.cols[length(AM$data$data.cols)]+2):ncol(AM$data$data)
        anots<-as.numeric(gsub("^main_component_([0-9]+)$","\\1",colnames(AM$data$data)[s1]))
        cc<-colnames(AM$data$data)[ss2]
        for (i in anots) cc<-gsub(paste("_",i,"$",sep=""),sub("^Annot","",AM$annotation$components.col[i]),cc)
        colnames(AM$data$data)[ss2]<-cc

        if (fulltext){
            Annot_out<-AM$annotation$annotation[,c(colnames(AM$annotation$annotation)[1],AM$annotation$ID,AM$annotation$components.col)]
            data(full_text_annotation,envir =  environment())
            Annot_out<-merge(Annot_out,full_text_annotation[,-2],by.x=1,by.y=1,all.x=TRUE)[,-1]

        } else {
            Annot_out<-AM$annotation$annotation[,c(AM$annotation$ID,AM$annotation$components.col)]
        }
        return(merge(Annot_out,cbind(AM$data$data[,ss[-s1p]],AM$data$data[,-ss]),by.x=AM$annotation$ID,by.y=AM$data$ID,all.y=TRUE))
    }
    if (!annotation) {
        if (data.only){
            out<-cbind(AM$data$data[,AM$data$ID],cluster=AM$data$data$updated_order,AM$data$data[,AM$data$data.cols])
            colnames(out)[1]<-AM$data$ID
            return(out)
        } else {return(AM$data$data)}
    }  else {
        if (data.only){
            out<-cbind(AM$data$data[,AM$data$ID],cluster=AM$data$data$clusters,AM$data$data[,AM$data$data.cols])
            colnames(out)[1]<-AM$data$ID
            return(out)}
        else {
            return(merge(AM$annotation$annotation,AM$data$data,by.x=AM$annotation$ID,by.y=AM$data$ID,all.y=TRUE))
        }

    }
}

get.clusters<-function(AM,rID=1) return(AM$results$clusters[[rID]])

set.components<-function(AM,clusterRes=NULL,components=NULL,rID=1,col="main_component"){
    if (!is.null(clusterRes)){
        AM$results$clusters[[rID]]$main_component<-clusterRes$main_components
        AM$data$data$main_component<-AM$results$clusters[[rID]]$main_component[AM$data$data$clusters]
    }
    if(!is.null(components)){

        uu<-unique(AM$results$clusters[[rID]][,col])

        if (length(which(is.na(uu)))>0) uu<-uu[-which(is.na(uu))]

        if (length(intersect(components,uu))!=length(uu)) stop("incorrect components!")

        AM$results$clusters[[rID]][,col]<-factor(AM$results$clusters[[rID]][,col],levels=components)
    }
    return(AM)
}

get.components<- function(AM) AM$annotation$components

get.presence<-function(AM) AM$data$present

roc.AM1<-function(AM,rID=NULL,component=NULL){
    if (is.null(rID)) rID<-1:length(AM$annotation$components)
    dd<-get.data(AM)

    if (is.null(component)){
        component<-unique(unlist(AM$annotation$components))
    }
    res<-list()[1:length(component)]
    names(res)<-component

    for (i in component){
        comList<-list()[1:length(rID)]
        res[[i]]<-comList
        for (j in 1:length(rID)){
            cls<-get.clusters(AM,rID=j)
            ##sel<-which(dd[,paste("main_component",j,sep="_")]==i)
            sel<-which(dd[,AM$annotation$components.col[j]]==i)
            ## pur<-cls$precision_main_component[dd$clusters[sel]]
            if (is.null(cls[dd$clusters[sel],paste(i,"ratio",sep="_")])) next else pur<-cls[dd$clusters[sel],paste(i,"ratio",sep="_")]
            precision<-sort(unique(pur),decreasing=TRUE)

            if (length(precision)>0) loc<-data.frame(precision=precision,ratio=NA) else {
                                                                              loc<-NULL
                                                                              next
                                                                          }
            N<-length(precision)
            for (a in 1:N){
                loc[a,2]<-length(which(pur>=precision[a])) /length(sel)

            }
            loc<-rbind(c(precision=1,ratio=0),loc,c(precision=0,ratio=1))

            res[[i]][[j]]<-loc
        }
    }
    reso<-list()[1:2]
    names(reso)<-c("Annotation","rocAM")
    reso[[1]]<-sub("^Annot_","",AM$annotation$components.col)
    reso[[2]]<-res
    class(reso)<-"rocAM"
    return(reso)
}
pr_tables<-function(AM,output=NULL){
    l1<-roc.AM(AM)
    annotation<-l1[[1]]
    l1<-l1[[2]]
    l2<-roc.AM(AM,abs=TRUE)[[2]]
    res1<-res2<-NULL
    for (i in 1:length(l1)){
        for (j in 1:length(annotation)){
            tabloc<-NULL
            if (!is.null(l1[[i]][[j]])){
                tabloc<-l1[[i]][[j]]
                colnames(tabloc)<-c("precission","recall")
                tabloc<-data.frame(annotation=annotation[j],localisation=names(l1)[i],tabloc)
            }
            res1<-rbind(res1,tabloc)

            tabloc<-NULL
            if (!is.null(l1[[i]][[j]])){
                tabloc<-l2[[i]][[j]]
                colnames(tabloc)<-c("precission","number_of_assigned_proteins")
                tabloc<-data.frame(annotation=annotation[j],localisation=names(l2)[i],tabloc)
            }
            res2<-rbind(res2,tabloc)


        }
    }
    if(!is.null(output)){
        write.table(res1,file=paste("recall",output,sep="_"),col.names=TRUE,row.names=FALSE,sep="\t")
        write.table(res2,file=paste("assigned_proteins",output,sep="_"),col.names=TRUE,row.names=FALSE,sep="\t")

    }
    return(invisible(list(res1,res2)))

}
roc.AM<-function(AM,rID=NULL,component=NULL,abs=FALSE){
    if (is.null(rID)) rID<-1:length(AM$annotation$components)
    dd<-get.data(AM)

    if (is.null(component)){
        for (i in 1:length(AM$annotation$components)) {
            sel<-which(dd[,AM$annotation$components.col[i]] %in% c("LYSOSOME","ENDOSOME"))
            dd[sel,AM$annotation$components.col[i]]<-"LYSOSOME&ENDOSOME"
            sel<-which(dd[,AM$annotation$components.col[i]] %in% c("NUCLEOLUS"))
            dd[sel,AM$annotation$components.col[i]]<-"NUCLEUS"
        }
        component<-NULL
        for (i in 1:length(AM$annotation$components)) component<-unique(c(component,unique(as.character(get.clusters(AM,rID=i)$assigned_location))))
        component<-as.character(na.omit(component))

    }
    res<-list()[1:length(component)]
    names(res)<-component

    for (i in component){
        comList<-list()[1:length(rID)]
        res[[i]]<-comList
        for (j in 1:length(rID)){
            cls<-get.clusters(AM,rID=j)
            ##sel<-which(dd[,paste("main_component",j,sep="_")]==i)

            if (!abs) sel<-which(dd[,AM$annotation$components.col[j]]==i)
            if (abs) sel<-which(dd[,paste("assigned_location",j,sep="_")]==i)
            ## pur<-cls$precision_main_component[dd$clusters[sel]]
            if (is.null(cls[dd$clusters[sel],"precision_assigned_location"])) next else pur<-cls[dd$clusters[sel],"precision_assigned_location"]
            precision<-sort(unique(pur),decreasing=TRUE)

            if (length(precision)>0) loc<-data.frame(precision=precision,ratio=NA) else {
                                                                              loc<-NULL
                                                                              next
                                                                          }
            N<-length(precision)

            for (a in 1:N){
                loc[a,2]<-ifelse(abs,length(which(pur>=precision[a])),length(which(pur>=precision[a]))/length(sel))

            }
            loc<-rbind(c(precision=1,ratio=0),loc,c(precision=0,ratio=max(loc[a,2])))

            res[[i]][[j]]<-loc
        }
    }
    reso<-list()[1:2]
    names(reso)<-c("Annotation","rocAM")
    reso[[1]]<-sub("^Annot_","",AM$annotation$components.col)
    reso[[2]]<-res
    class(reso)<-"rocAM"
    return(reso)
}



plot.prAM<-function(rocAM,abs=FALSE,legend.position="bottomleft"){  ##,mfrow=c(1,1),mar=c(1, 4, 2.2, 1) + 0.1)
    ## par(mfrow=mfrow,mar=mar)
    Recall<-ifelse(abs,"Number of classified proteins","Recall")
    if (class(rocAM)!="rocAM") {if (class(rocAM)=="AnnoMass") rocAM<-roc.AM(rocAM,abs=abs) else stop()}
    annotation<-rocAM[[1]]
    rocAM<-rocAM[[2]]
    ord<-order(names(rocAM))
    rocAM<-rocAM[ord]
    for(i in 1:length(rocAM)){

        for (jj in 1:(length(rocAM[[i]]))) if (!is.null(rocAM[[i]][[jj]])) break
        if (is.null(rocAM[[i]][[jj]])) next
        xlm1<-max(as.numeric(unlist(rocAM[[i]])),na.rm=TRUE)
        plot(rocAM[[i]][[jj]],type="l",col=jj,lty=jj,ylim=c(0,xlm1),xlim=c(0,1),main=names(rocAM)[i],ylab=Recall,xlab="Precision",lwd=1.8)
        K<-length(rocAM[[i]])
        legend(legend.position,legend=paste(annotation),lty=1:K,col=1:K,cex=1)
        if (length(rocAM[[i]])<=jj) next

        for (j in (jj+1):length(rocAM[[i]])){

            lines(rocAM[[i]][[j]],col=j,lty=j,lwd=1.8)

        }
    }

}

analyze.MSfile<-function(MSfile,Annotation=NULL,Metadata="Christoforou",annotation.ID=2,data.ID=1,markers=3,group_names=NULL,clusters=NULL,output="results",sep="\t",method="kmeans",metric="euclidean",iter.max=100,nstart=1,group=NULL,subset=NULL,sort.by=1,cluster.metadata=FALSE,overlap=NULL){
    annotation.component<-markers
    all_ov<-ifelse(is.null(overlap),FALSE,TRUE)
    if (!is.null(output)){
        output.data<-paste(output,"_table.txt",sep="")
        output.roc<-paste(output,"_pr.pdf",sep="")
        output.roc_abs<-paste(output,"_pr_abs.pdf",sep="")
        output.cdt<-paste(output,"_javatree.cdt",sep="")
    } else output.data<-output.roc<-output.cdt<-NULL
    if (is.null(Annotation)){
        data(AnnotationAM,envir =  environment())
        Annotation<-AnnotationAM
    } else {
        if (!is.data.frame(Annotation)) Annotation<-read.table(Annotation,header=TRUE,stringsAsFactors=FALSE,sep=sep,comment.char="")
    }

    if (!is.null(Metadata)){
        metaD<-list()[1:length(Metadata)]
        names(metaD)<-Metadata
        for (i in Metadata){
            data(list=i,envir =  environment())
            metaD[[i]]<-eval(parse(text=i))

        }
    }


    if (!is.data.frame(MSfile)) if (length(MSfile)>1){
                                    if (length(data.ID)==1) data.ID<-rep(data.ID,length(MSfile))
                                    if (!all(is.character(MSfile))) stop("multiple MSfiles must be filenames")
                                    res<-read.table(MSfile[1],header=TRUE,stringsAsFactors=FALSE,sep=sep,comment.char="")
                                    if (any(duplicated(res[,data.ID[1]]))){
                                        warning("Duplicated IDs are not alowed for multiple files. Duplicated IDs are removed.")
                                        res<-res[-which(duplicated(res[,data.ID[1]])),]
                                    }
                                    for (i in 2:length(MSfile)){

                                        resloc<-read.table(MSfile[i],header=TRUE,stringsAsFactors=FALSE,sep=sep,comment.char="")
                                        if (any(duplicated(resloc[,data.ID[i]]))){
                                            warning("Duplicated IDs are not alowed for multiple files. Duplicated IDs are removed.")
                                            resloc<-resloc[-which(duplicated(resloc[,data.ID[i]])),]
                                        }
                                        res<-merge(res,data.frame(non_NUM_space="-",resloc),by.x=data.ID[1],by.y=data.ID[i]+1,all.x=all_ov,all.y=all_ov)
                                    }
                                    MSfile<-res
                                    data.ID<-1



                                } else  if (!is.data.frame(MSfile)) MSfile<-read.table(MSfile,header=TRUE,stringsAsFactors=FALSE,sep=sep,comment.char="")

    if (!is.null(group) & length(group)==1) if (group==0 & (is.null(Metadata) | !cluster.metadata)) stop("Nothing to cluster!")
    meta_gr<-1
    if (!is.null(Metadata)){
        Metadata<-metaD[[1]]

        if (length(metaD)>1) for (i in 2:length(metaD)){
                                 Metadata<-merge(data.frame(Metadata,non_NUM_space="-"),metaD[[i]],by=1)

                             }
        Metadata<-merge(Annotation[,c(2,annotation.ID)],data.frame(non_NUM_space="-",Metadata),by.x=1,by.y=2)
        Mdata.cols<-which(sapply(Metadata,is.numeric))
        jumps<-which(diff(Mdata.cols)>1)
        ngroups<-length(jumps)+1
        MSdata.cols<-which(sapply(MSfile,is.numeric))
        jumps<-which(diff(MSdata.cols)>1)
        ngroupsMS<-length(jumps)+1
        meta_gr<-length(which(diff(which(sapply(Metadata,is.numeric)))>1))+2


        if(is.null(group)) group<-1:ngroupsMS

        MSfile<-merge(data.frame(Metadata,non_NUM_space="-"),MSfile,by.x=2,by.y=1,all.x=all_ov,all.y=all_ov)

        if (!is.null(group) & length(group)==1){
            if (group==0) group<-1:ngroups else if (cluster.metadata)  group<-c(1:ngroups,group+ngroups) else group<-group+ngroups
        } else {
            if (cluster.metadata)  group<-c(1:ngroups,group+ngroups) else group<-group+ngroups
        }
        if (cluster.metadata) meta_gr<-1
        ## print(group)

    }

    if (all_ov){
        cc<-which(sapply(MSfile,is.numeric))
        ss<-which(is.na(MSfile[,cc]),arr.ind=TRUE)
        MSFc<-MSfile[,cc]
        MSFc[is.na(MSFc)]<-1
        MSfile[,cc]<-MSFc
        nonnum<-sapply(MSfile,function(x) if (is.factor(x)) return(levels(x)=="-") else return(FALSE))
        MSfile[,nonnum]<-"-"
    }


    AM<-construct.AM(Annotation,MSfile,annotation.ID=annotation.ID,data.ID=data.ID,annotation.component=annotation.component,group_names=group_names,meta_gr=meta_gr)

    if (all_ov){
        if (ncol(get.presence(AM))>=overlap){
        pres<-rowSums(get.presence(AM))
        subset<-which(pres>=overlap)
        } else {
             subset<-1:nrow(AM$data$data)
        }
    } else {
        subset<-1:nrow(AM$data$data)
    }

    if(is.null(clusters)){
        clusters<-length(subset)%/%5
        message("Setting number of clusters to ", clusters)
    }
    res<-run.annotation(AM,clusters=clusters,method=method,metric=metric,iter.max=iter.max,nstart=nstart,group=group,subset=subset)

    data(AnnotationAM,envir =  environment())
    ctexist<-FALSE


    for (i in 1:length(annotation.component)){

        if (TRUE){
            data(levelsC)
            presentCOMP<-unique(res$results$clusters[[i]][,"main_component"])
            ss<-which(!(presentCOMP %in% levelsC) & !is.na(presentCOMP) & presentCOMP!="")
            ss<-which(!(presentCOMP %in% levelsC))
            presentCOMP<-presentCOMP[ss]
            ss<-which(!is.na(presentCOMP))
            presentCOMP<-presentCOMP[ss]
            ss<-which(levelsC %in% res$results$clusters[[i]][,"main_component"])
            try(res<-set.components(res,rID=i,components=c(levelsC[ss],sort(presentCOMP)),col="main_component"))
            cls<-get.clusters(res,rID=i)
            try(categ<-categorize_cluster(cls))
            try(nbct<-categ[[2]])
            try(categ<-categ[[1]])
            score<-2*nbct-cls$Nb_of_annotations
            try(res<-addreplace.column(res,rID=i,assigned_location=categ))
            levelsCo<-levelsC
            levelsC<-levelsC[-4]
            levelsC[4]<-"LYSOSOME&ENDOSOME"
            presentCOMP<-unique(res$results$clusters[[i]][,"assigned_location"])

            ss<-which(!(presentCOMP %in% levelsC) & !is.na(presentCOMP) & presentCOMP!="")
            ss<-which(!(presentCOMP %in% levelsC))
            presentCOMP<-presentCOMP[ss]
            ss<-which(!is.na(presentCOMP))
            presentCOMP<-presentCOMP[ss]
            ss<-which(levelsC %in% res$results$clusters[[i]][,"assigned_location"])
            try(res<-set.components(res,rID=i,components=c(levelsC[ss],presentCOMP),col="assigned_location"))
            levelsC<-levelsCo
            ##try(res<-addreplace.column(res,rID=i,score=score,add2data=FALSE))
            try(res<-addreplace.column(res,rID=i,Nb_main_component=cls$Nb_of_annotations*cls$precision_main_component,Nb_assigned_location=nbct,add2data=FALSE))
            try(res<-addreplace.column(res,rID=i,precision_assigned_location=nbct/cls$Nb_of_annotations,add2data=TRUE))
            ctexist<-TRUE
        }
    }


    cls<-get.clusters(res,rID=sort.by)



    if (ctexist) ord<-order(cls$assigned_location,-cls$Nb_assigned_location, -cls$precision_assigned_location) else ord<-order(cls$main_component,-cls$Nb_main_component, -cls$precision_main_component)
    res<-order.AM(res,ordering=ord)
    data<-get.data(res)
    ord<-order(data$updated_order)
    if (!is.null(output.data)) write.table(get.data(res,out=TRUE,fulltext=TRUE)[ord,],file=output.data,col.names=TRUE,row.names=FALSE,sep=sep)

    if (!is.null(output.cdt)) write.cdt(res,file=output.cdt)

    if (!is.null(output.roc_abs)){

        if (ctexist) Categ<-"assigned_location" else Categ<-"main_component"
        dd<-get.data(res)


        pdf(output.roc)
        par(mfrow=c(2,2),cex=0.5)
        plot.prAM(res)
        dev.off()
        pdf(output.roc_abs)
        par(mfrow=c(2,2),cex=0.5)
        plot.prAM(res,abs=TRUE)
        dev.off()


    }
    return(invisible(res))
}

addreplace.column<-function(AM,rID=1,add2data=TRUE,...){
    x<-list(...)
    if (any(lapply(x,length)!=nrow(AM$results$clusters[[rID]]))) stop("column must be the same length as number of clusters!")
    namesD<-paste(names(x),rID,sep="_")
    namesCls<-names(x)
    AM$results$clusters[[rID]][,namesCls]<-x
    if (add2data) AM$data$data[,namesD]<-AM$results$clusters[[rID]][AM$data$data$clusters,namesCls]
    return(AM)

}

categorize_cluster<-function(cls){

    res<-rep(NA,nrow(cls))
    counts<-rep(NA,nrow(cls))
    if (is.null(cls$ENDOSOME_ratio)) cls$ENDOSOME_ratio<-0
    if (is.null(cls$PROTEASOME_ratio)) cls$PROTEASOME_ratio<-0
    if (is.null(cls$LYSOSOME_ratio)) cls$LYSOSOME_ratio<-0
    if (is.null(cls$CS_ratio)) cls$CS_ratio<-0
    if (is.null(cls$GOLGI_ratio)) cls$GOLGI_ratio<-0
    if (is.null(cls$NUCLEOLUS_ratio))  cls$NUCLEOLUS_ratio<-0
     if (is.null(cls$NUCLEUS_ratio))  cls$NUCLEUS_ratio<-0
    if (is.null(cls$PM_ratio)) cls$PM_ratio<-0
    if (is.null(cls$ER_ratio)) cls$ER_ratio<-0
    if (is.null(cls$CYTOSOL_ratio)) cls$CYTOSOL_ratio<-0
    ##if (is.null(cls$CYTOSOL_ratio)) print(cls)
    if(is.null(cls$MITOCHONDRION_ratio)) cls$MITOCHONDRION_ratio<-0

    for (i in 1:nrow(cls)){

        Nb<-cls$Nb_of_annotations[i]
        if (cls$Nb_of_annotations[i]==0) next
        if (!is.null(cls$RIBOSOME_ratio)) if (cls$RIBOSOME_ratio[i]>0.51){
                                              res[i]<-"RIBOSOME"
                                              counts[i]<-Nb*cls$RIBOSOME_ratio[i]
                                              next
                                          }
        if ((cls$CYTOSOL_ratio[i]+cls$CS_ratio[i])>=0.51){
            if (cls$CS_ratio[i]>0.3) res[i]<-"CS" else res[i]<-"CYTOSOL"
            counts[i]<-Nb*(cls$CYTOSOL_ratio[i]+cls$CS_ratio[i])
            next
        }
        if ((cls$PM_ratio[i]+cls$ER_ratio[i]+cls$GOLGI_ratio[i]+cls$MITOCHONDRION_ratio[i]+cls$LYSOSOME_ratio[i]+cls$ENDOSOME_ratio[i])>=0.51){
            if ((cls$LYSOSOME_ratio[i]+cls$ENDOSOME_ratio[i])>0.51){
                res[i]<-"LYSOSOME&ENDOSOME"

                next
            }
            rr<-c(PM=cls$PM_ratio[i],ER=cls$ER_ratio[i],GOLGI=cls$GOLGI_ratio[i],MITOCHONDRION=cls$MITOCHONDRION_ratio[i],LYSOSOME=cls$LYSOSOME_ratio[i],ENDOSOME=cls$ENDOSOME_ratio[i])

            res[i]<-names(rr)[which.max(rr)[1]]
            if (res[i] %in% c("LYSOSOME","ENDOSOME")){
                res[i]<-"LYSOSOME&ENDOSOME"
                counts[i]<-Nb*(cls$LYSOSOME_ratio[i]+cls$ENDOSOME_ratio[i])
            } else {
                counts[i]<-Nb*max(rr)
            }
            next
        }

        if ((cls$NUCLEOLUS_ratio[i]+cls$NUCLEUS_ratio[i])>=0.51){
            if (cls$NUCLEOLUS_ratio[i]>0.25) res[i]<-"NUCLEOLUS" else res[i]<-"NUCLEUS"
            counts[i]<-Nb*(cls$NUCLEOLUS_ratio[i]+cls$NUCLEUS_ratio[i])
            next
        }
        i
    }
    ss<-which(is.na(res))

    res[ss]<-as.character(cls$main_component[ss])
    ss<-which(res %in% c("LYSOSOME","ENDOSOME"))
    if (length(ss)>0) res[ss]<-"LYSOSOME&ENDOSOME"
    ss<-which(is.na(counts))
    counts[ss]<-cls$Nb_of_annotations[ss]*cls$precision_main_component[ss]
    return(list(res,counts))

}
stuchly/MetaMass documentation built on Nov. 14, 2019, 10:58 p.m.