R/makeCNOlist.R
In CellNOptR: Training of boolean logic models of signalling networks using prior knowledge networks and perturbation data

Documented in makeCNOlist

#
#  This file is part of the CNO software
#
#  Copyright (c) 2011-2012 - EMBL - European Bioinformatics Institute
#
#  File author(s): CNO developers (cno-dev@ebi.ac.uk)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      http://www.gnu.org/licenses/gpl-3.0.html
#
#  CNO website: http://www.cellnopt.org
#
##############################################################################
makeCNOlist<-function(dataset,subfield, verbose=TRUE){

    #check that all the needed elements are present
    if(!is.list(dataset)){
        stop("The input to this function should be a list with elements 'dataMatrix', 'TRcol','DAcol',and 'DVcol'")
        }

    if(length(dataset) != 4){
        stop("The input to this function should be a list with elements 'dataMatrix', 'TRcol','DAcol',and 'DVcol'")
        }

    if(sum(c("dataMatrix","TRcol","DAcol","DVcol") %in% names(dataset))!=4){
        stop("The input to this function should be a list with elements 'dataMatrix', 'TRcol','DAcol',and 'DVcol'")
        }

    # first, we summarise the replicates in the dataMatrix: replicates are rows in the dataset$dataMatrix
    # that have the exact same values in all TR: and DA: columns
    duplCond<-as.matrix(dataset$dataMatrix[,c(dataset$TRcol,dataset$DAcol)])
    duplRows<-which(duplicated(duplCond) == TRUE)
    if (verbose == TRUE){
        if (length(duplRows)>0){
            print("Cleaning duplicated rows")
        }
    }

    # creates a variance matrix
    variances = dataset$dataMatrix * 0


    while(length(duplRows) != 0){
        # the all(x == ) is buggy in the case of NA hence the compareNA function.
        #dupIndex<-apply(duplCond,MARGIN=1,function(x) all(x == duplCond[duplRows[1],]))
        dupIndex<-apply(duplCond,MARGIN=1,function(x)all(compareNA(x,duplCond[duplRows[1],])))

        dupIndex<-which(dupIndex == TRUE)
        dupMatrix<-dataset$dataMatrix[dupIndex,]
        #compute the new row as the average across duplicate rows
        newRow<-colMeans(dupMatrix, na.rm=TRUE)
        # variance for these rows
        newVariance = apply(dupMatrix, MARGIN=2, FUN=var, na.rm=TRUE)

        #replace the first duplicated row by the summarised one
        dataset$dataMatrix[dupIndex[1],]<-newRow

        # same for the variance
        variances[dupIndex[1],]<-newVariance

        #remove the other summarised rows
        dataset$dataMatrix<-dataset$dataMatrix[-dupIndex[2:length(dupIndex)],]
        variances<-variances[-dupIndex[2:length(dupIndex)],]

        duplCond<-as.matrix(dataset$dataMatrix[,c(dataset$TRcol,dataset$DAcol)])
        duplRows<-which(duplicated(duplCond) == TRUE)
    }

    if (any(is.nan(as.matrix(dataset$dataMatrix)))){
        dataset$dataMatrix[is.nan(as.matrix(dataset$dataMatrix))] <- NA
    }

    #now extract the names of the cues, and the inhibitors/stimuli
    namesCues<-colnames(dataset$dataMatrix)[dataset$TRcol]

    if(subfield == TRUE){
        namesCues<-sub(pattern="(TR:)",x=namesCues, replacement="",perl=TRUE)
        tagInhib<-grep(pattern=":Inhibitor", x=namesCues)
        # tagInhibL is a logical version of the previous statement.
        # must be set now before namesCues is changed. See JIRA bug 27
        tagInhibL<-grepl(pattern=":Inhibitor", x=namesCues)

        # remove the trailing :Inhibitors and :Stimuli
        namesCues<-sub(pattern="(:\\w*$)",x=namesCues, replacement="",perl=TRUE)

        # remove trailing i
        namesCues[tagInhib]<-sub(pattern="(i$)", x=namesCues[tagInhib], replacement="", perl=TRUE)

        # if no inhibitors, grep returns integer(0), so we now need to use grepl
        # (logical version of grep)
        namesStimuli<-namesCues[tagInhibL==FALSE]
        namesInhibitors<-namesCues[tagInhibL==TRUE]
        }

    if(subfield == FALSE){
        namesCues<-sub(pattern="(TR:)",x=namesCues, replacement="",perl=TRUE)
        tagInhib<-grep(pattern="(i$)", x=namesCues, perl=TRUE,ignore.case=FALSE)
        # tagInhibL must be set now before namesCues is changed See JIRA bug 27
        tagInhibL<-grepl(pattern="(i$)", x=namesCues, perl=TRUE,ignore.case=FALSE)
        namesCues[tagInhib]<-sub(pattern="(i$)", x=namesCues[tagInhib], replacement="", perl=TRUE)
        # if no inhibitors, grep returns integer(0), so we now need to use grepl
        # (logical version of grep)
        namesStimuli<-namesCues[tagInhibL==FALSE]
        namesInhibitors<-namesCues[tagInhibL==TRUE]

    }

    if(sum("NOCYTO" %in% namesCues) != 0){
        namesCues<-namesCues[-grep(pattern="NOCYTO", namesCues)]
        namesStimuli<-namesStimuli[-grep(pattern="NOCYTO", namesStimuli)]
        }

    if(sum("NOINHIB" %in% namesCues) != 0){
        namesCues<-namesCues[-grep(pattern="NOINHIB", namesCues)]
        namesStimuli<-namesStimuli[-grep(pattern="NOINHIB", namesStimuli)]
        }

    if(sum("NO-INHIB" %in% namesCues) != 0){
        stop("Found a column with NO-INHIB tag. MIDAS files must use NOINHIB instead. Fix your MIDAS file please")
    }
    if(sum("NO-LIG" %in% namesCues) != 0){
        stop("Found a column with NO-LIG tag. MIDAS files do not accept NO-LIG. use NOINHIB or NOCYTO instead. Fix your MIDAS file please")
    }
    if(sum("NOLIG" %in% namesCues) != 0){
        stop("Found a column with NO-LIG tag. MIDAS files do not accept NO-LIG. use NOINHIB or NOCYTO instead. Fix your MIDAS file please")
    }
    if(sum("NO-CYTO" %in% namesCues) != 0){
        stop("Found a column with NO-CYTO tag. MIDAS file must use NOCYTO instead. Fix your MIDAS file please")
    }

    #now extract the names of the signals
    namesSignals<-colnames(dataset$dataMatrix)[dataset$DAcol]
    namesSignals<-sub(
        pattern="(DA:p-)",
        x=namesSignals,
        replacement="",
        perl=TRUE)
    namesSignals<-sub(
        pattern="(DA:)",
        x=namesSignals,
        replacement="",
        perl=TRUE)

    #now extract the time signals
    times<-as.factor(as.vector(as.character(as.matrix(dataset$dataMatrix[,dataset$DAcol]))))
    timeSignals<-sort(as.double(levels(times)))

    #Build the valueCues matrix (i.e. a matrix with nrows=nrows in dataMatrix and ncol=number of cues,
    #filled with 0/1 if the particular cue is present or not)

#1.I create a matrix that is a subset of the data, and only contains the TR columns
#(the cellLine TR column was removed previously)

#2.I remove the columns with NOCYTO or NOINHIB (if they exist), they don't bring any info

    if(length(grep(pattern="NOCYTO",colnames(dataset$dataMatrix)[dataset$TRcol])) != 0){
        nocyto<-grep(pattern="NOCYTO",colnames(dataset$dataMatrix)[dataset$TRcol])
        TRcol<-dataset$TRcol[-nocyto]
        cues<-dataset$dataMatrix[,TRcol]

        }else{
            # use as.matrix and then set colnames to cope for particular case of
            # only one cue
            cues<-as.matrix(dataset$dataMatrix[,dataset$TRcol])
            colnames(cues) = colnames(dataset$dataMatrix)[dataset$TRcol]
            }

    if(length(grep(pattern="NOINHIB",colnames(cues))) != 0){
        noinhib<-grep(pattern="NOINHIB",colnames(cues))
        cues<-cues[,-noinhib]
        }

#3. The cues sub-data frame now contains 1 if the cue is present and 0/NA otherwise,
#so I just need to transform the data frame into a numerical matrix
#and replace the NA in there by zeros
    cues<-as.matrix(cues,nrow=dim(cues)[1],ncol=dim(cues)[2],byrow=TRUE)
    cues[is.na(cues)]<-0

#Build the valueSignals matrices. I am going to build one big matrix
#that includes all the time points, and then I will split it into one matrix for each time point
#And then I will arrange the valueCues matrix accordingly
    valueSignals<-as.matrix(dataset$dataMatrix[,dataset$DVcol])
    valueVariance<-as.matrix(dataset$dataMatrix[,dataset$DVcol])

#This bit will create an index that contains all rows with timept1, 2, 3,...

#1.First I create a matrix that holds the time information for each row
    times<-as.matrix(dataset$dataMatrix[,dataset$DAcol])

#2. Now I check that all the columns are tha same, i.e. that each row
#will contain data on the same time point

    if (length(dataset$DAcol)>1){
        check<-rep(FALSE,(length(dataset$DAcol)-1))
        for(i in 1:length(check)){
            check[i]<-all.equal(times[,i],times[,(i+1)])
            }
        if(sum(check) != length(check))    {
           warning("Each row of your data file should contain measurements at the same time point.
               The times for the first DA column will be considered as the times for all measurements")
           }
    }
#3.Now I will only use the first column of times

#First, I create a vector timeRows that contains the indexes of the rows that contain data
#about each time point (in increasing order of time), and the vector whereTimes that contain`
#the info about how many rows I have for each time (which will allow me to extract the right
#entries from timesRows)

    times<-times[,1]
    ntimes<-length(timeSignals)
    if (ntimes <2){
        stop("Error while parsing the data. Only one time was found.")
    }
    whereTimes<-rep(0,ntimes)
    timesRows<-0  # Melody uses timesRows <- rep(0,1)
    for(i in 1:ntimes){
        timesRows<-c(timesRows,which(times == timeSignals[i]))
        whereTimes[i]<-length(which(times == timeSignals[i]))
        }

    timesRows<-timesRows[2:length(timesRows)]

#Check that we have data across all conditions for all time points except zero
    if(length(unique(whereTimes[2:length(whereTimes)])) != 1){
        warning("This program expects data across all conditions at all time points (except t=0) ")
        }
    if (verbose){
        print("Please be aware that if you only have some conditions at time zero (e.g.only inhibitor/no inhibitor), the measurements for these conditions will be copied across matching conditions at t=0")
    }
#Do the t=0 matrix, and produce a new cues matrix, that does not contain duplicates
#(1 row per condition and different matrices will be build for the different times)
    valueSignals<-list(matrix(data=0,nrow=whereTimes[2],ncol=length(dataset$DVcol)))
    valueVariance<-list(matrix(data=0,nrow=whereTimes[2],ncol=length(dataset$DVcol)))

#This vector tells me which columns of the cues matrix I should pay attention to when
#copying data across for time=0

    # bug report 31 and
    if (dim(cues)[2] >1){
        # bug report 44
        if (whereTimes[1] == 1){
            zerosCond <- 0
        }
        else{
            zerosCond<-apply(cues[timesRows[1:whereTimes[1]],],1,function(x) which(x > 0))
        }
    }
    else{
        warning("unusual case with 1 dimension in cues")
        zerosCond<-which(cues[timesRows[1:whereTimes[1]],] > 0)
    }
    zerosCond<-unique(unlist(zerosCond))
    count=1
    newcues<-matrix(data=0,nrow=whereTimes[2],ncol=dim(cues)[2])

    #fix bug report 38 to be able to have mixed times in a MIDAS file
    #for(i in timesRows[1]:timesRows[whereTimes[1]]){
    for(i in timesRows[1:whereTimes[1]]){
        # 15.12.2017: is this a bug? "i" is already going through some element of timesRow variable.
    	# is this "timesRows[i]" make sense?! or we can just use i instead
    	#present<-zerosCond[which(cues[timesRows[i],zerosCond] > 0)]
    	present<-zerosCond[which(cues[i ,zerosCond] > 0)]

        if(length(present) == 0){
            for(n in timesRows[(whereTimes[1]+1):(whereTimes[1]+whereTimes[2])]){
                    if(sum(cues[n,zerosCond]) == 0){

                        valueSignals[[1]][count,]<-as.numeric(dataset$dataMatrix[i,dataset$DVcol])
                        valueVariance[[1]][count,]<-as.numeric(variances[i,dataset$DVcol])
                        newcues[count,]<-cues[n,]
                        count=count+1
                        }
                    }
        }else{

            for(n in timesRows[(whereTimes[1]+1):(whereTimes[1]+whereTimes[2])]){
                if(length(zerosCond[which(cues[n,zerosCond] > 0)]) == length(present)){
                    if(all(zerosCond[which(cues[n,zerosCond] > 0)] == present) &&   # same cues are there
                        length(which(cues[n,zerosCond] > 0)) != 0 &&  # there is at least one non-zero cue
                       # same principle as above: use i instead of timesRows[i]
                       #all(cues[timesRows[i],zerosCond] == cues[n,zerosCond]) # cues have the same level
                       all(cues[i,zerosCond] == cues[n,zerosCond]) # cues have the same level
                       ){
                    	valueSignals[[1]][count,]<-as.numeric(dataset$dataMatrix[i,dataset$DVcol])
                        valueVariance[[1]][count,]<-as.numeric(variances[i,dataset$DVcol])
                        newcues[count,]<-cues[n,]
                        count=count+1
                    }
                }
            }
        }
    }

    #Now build the matrices for the other time points
    for(i in 2:length(timeSignals)){
        valuesTi<-matrix(data=0,nrow=whereTimes[2],ncol=length(dataset$DVcol))
        valuesVarianceTi<-matrix(data=0,nrow=whereTimes[2],ncol=length(dataset$DVcol))
        for(n in 1:dim(newcues)[1]){
            rowsMatchCues<-apply(cues,1,function(x) all(x == newcues[n,]))
            rowsmatchTime<-times == timeSignals[i]
            rowsMatch<-which((rowsMatchCues + rowsmatchTime) == 2)
            valuesTi[n,]<-as.numeric(dataset$dataMatrix[rowsMatch,dataset$DVcol])
            valuesVarianceTi[n,]<-as.numeric(variances[rowsMatch,dataset$DVcol])
            }

        valueSignals[[i]]<-valuesTi
        valueVariance[[i]]<-valuesVarianceTi

        }

#Build the valueInhibitors and valueStimuli  matrices, which are a subset of the cues one

    if(subfield == TRUE){
        valueInhibitors<-newcues[,grep(
            pattern="Inhibitor",x=colnames(cues),ignore.case=TRUE)]
        valueStimuli<-newcues[,grepl(pattern="Inhibitor",x=colnames(cues),ignore.case=TRUE)==FALSE]

        }else{

            valueInhibitors<-newcues[,grep(pattern="(i$)",x=colnames(cues),ignore.case=FALSE,perl=TRUE)]
            valueStimuli <- newcues[,grepl(pattern="(i$)",x=colnames(cues),ignore.case=FALSE,perl=TRUE)==FALSE]

            }
    if(is.null(dim(valueInhibitors))){
        valueInhibitors<-matrix(valueInhibitors,nrow=dim(newcues)[1])
        }

    if(is.null(dim(valueStimuli))){
        valueStimuli<-matrix(valueStimuli,nrow=dim(newcues)[1])
        }


    return(list(
        namesCues=namesCues,
        namesStimuli=namesStimuli,
        namesInhibitors=namesInhibitors,
        namesSignals=namesSignals,
        timeSignals=timeSignals,
        valueCues=newcues,
        valueInhibitors=valueInhibitors,
        valueStimuli=valueStimuli,
        valueSignals=valueSignals,
        valueVariances=valueVariance
    ))

}


compareNA <- function(v1,v2) {
    # This function returns TRUE wherever elements are the same, including NA's,
    # and false everywhere else.
    same <- (v1 == v2)  |  (is.na(v1) & is.na(v2))
    same[is.na(same)] <- FALSE
    return(same)
}

Any scripts or data that you put into this service are public.

CellNOptR documentation built on Nov. 8, 2020, 6:58 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

CellNOptR
Training of boolean logic models of signalling networks using prior knowledge networks and perturbation data

R/makeCNOlist.R
In CellNOptR: Training of boolean logic models of signalling networks using prior knowledge networks and perturbation data

Defines functions compareNA makeCNOlist

Documented in makeCNOlist

Try the CellNOptR package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

CellNOptR Training of boolean logic models of signalling networks using prior knowledge networks and perturbation data

R/makeCNOlist.R In CellNOptR: Training of boolean logic models of signalling networks using prior knowledge networks and perturbation data

Defines functions compareNA makeCNOlist

Documented in makeCNOlist

Try the CellNOptR package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

CellNOptR
Training of boolean logic models of signalling networks using prior knowledge networks and perturbation data

R/makeCNOlist.R
In CellNOptR: Training of boolean logic models of signalling networks using prior knowledge networks and perturbation data