R/functions.R

Defines functions .readSeqAnalysisMethods .readArrayAnalysisMethods createSummarizedExperiment .createExpressionSet createAnalyticsSDRF .createBiocObject .mergeUnits .addUnitCols parseSDRF check_file_exists summarizeAtlasExperiment createArrayExpressSiteConfig createAtlasSiteConfig deduceSiteConfigPath parseAtlasConfig

Documented in createArrayExpressSiteConfig createAtlasSiteConfig parseAtlasConfig summarizeAtlasExperiment

# parseAtlasConfig
#     - Read Atlas XML config file and return a list of Analytics objects, as
#     well as the experiment type.
#     - The function returns a list with two elements: the list of Analytics
#     objects, and the experiment type from the XML.
parseAtlasConfig <- function( atlasXMLconfigFile ) {
    
  # Read the XML file.
    xmlTree <- xmlInternalTreeParse( atlasXMLconfigFile )

    # Get the configuration node -- the root of the tree.
    configNode <- xmlRoot( xmlTree )

    # Get the config node attributes.
    configNodeAttrs <- xmlAttrs( configNode )

    # Get the Atlas experiment type.
    atlasExperimentType <- configNodeAttrs[[ "experimentType" ]]

    # Go through the analytics node(s).
    # Get them from the configuration node.
    allAnalyticsNodes <- xmlElementsByTagName( configNode, "analytics" )

    # Create a list of Analytics objects.
    allAnalytics <- lapply( allAnalyticsNodes, function( analyticsNode ) {
      analyticsObject <- new( "Analytics", atlasExperimentType, analyticsNode )
    })
    
    names( allAnalytics ) <- sapply( allAnalytics,
        function( analytics ) {
            platform( analytics )
        }
    )

    # Since we can't return more than one thing, make a list to put the
    # analytics list in as well as the experiment type.
    parsedXMLlist <- list()
    
    parsedXMLlist$allAnalytics <- allAnalytics
    parsedXMLlist$experimentType <- atlasExperimentType

    return( parsedXMLlist )
}


deduceSiteConfigPath <- function( dbName ) {

    # We need the path of the YAML config file.
    # We can deduce this using the output of the R.home() function, which is
    # the directory where R is installed.
    rHome <- R.home()

    # Split the path on "/" characters so that we can remove everything after
    # the atlasinstall directory more easily. Use the platform's file separator
    # (in UNIX a "/").
    splitPath <- strsplit( rHome, .Platform$file.sep )[[ 1 ]]

    # Find the index of the atlasinstall_<instance> directory.
    atlasInstallIndex <- grep( "atlasinstall", splitPath )

    # Check that we got an index. If not, we must not be using an atlasinstall R.
    if( length( atlasInstallIndex ) == 0 ) {
        stop( "Could not find atlasinstall directory in RHOME path. Unable to deduce path to site config file. Please ensure you are using an atlasinstall R installation." )
    }

    # Remove everything after the atlasinstall directory from the path.
    atlasInstallSplitPath <- head( splitPath, n = atlasInstallIndex )

    # Stick the path back together again.
    atlasInstallPath <- paste( atlasInstallSplitPath, collapse=.Platform$file.sep )

    siteConfigBasename <- paste( dbName, "SiteConfig", ".yml", sep="" )
    
    # Add the path to the site config file.
    siteConfigFile <- file.path( atlasInstallPath, "atlasprod", "supporting_files", siteConfigBasename )

    return( siteConfigFile )
}


# createAtlasSiteConfig
#   - Return a list containing the Atlas site config.
createAtlasSiteConfig <- function( ) {
    
    # Get the path to the Atlas site config file.
    atlasSiteConfigFile <- deduceSiteConfigPath( "Atlas" )

    # Read in the site config file.
    atlasSiteConfig <- yaml.load_file( atlasSiteConfigFile )

    return( atlasSiteConfig )
}


# createArrayExpressSiteConfig
#    - Return a list containing the ArrayExpress site config.
createArrayExpressSiteConfig <- function( ) {
    
    # Get the path to the ArrayExpress site config file.
    arrayExpressSiteConfigFile <- deduceSiteConfigPath( "ArrayExpress" )

    # Read in the site config file
    arrayExpressSiteConfig <- yaml.load_file( arrayExpressSiteConfigFile )

    return( arrayExpressSiteConfig )
}


# summarizeAtlasExperiment
#
#   Block of comments below is not following compliant R documentation format
#
#     - Main function for the package. Takes an experiment accession and a
#     directory path where it can find Atlas XML config and expressions matrices.
#     - Returns a list of ExpressionSet and/or MAList and/or SummarizedExperiment objects.
summarizeAtlasExperiment <- function( experimentAccession, atlasExperimentDirectory ) {
    
    # Quit if the accession does not look like an ArrayExpress one.
    if( !grepl( "^E-\\w{4}-\\d+$", experimentAccession ) ) {
      stop( paste( "Accession \"", experimentAccession, "\" does not look like an ArrayExpress accession. Cannot continue", sep="" ) )
    }

    # Atlas XML config file name.
    atlasExperimentXMLfile <- paste( experimentAccession, "-configuration.xml", sep="" )
    atlasExperimentXMLfile <- file.path( atlasExperimentDirectory, atlasExperimentXMLfile )

    # Die if we can't find the XML config file.
    if( !file.exists( atlasExperimentXMLfile ) ) {
      stop( paste( "XML config file", atlasExperimentXMLfile, "does not exist or is not readable." ) )
    }
    
    cat( paste( "Reading experiment config from", atlasExperimentXMLfile, "...\n" ) )

    # Parse the XML file.
    experimentXMLlist <- parseAtlasConfig( atlasExperimentXMLfile )

    cat( "Finished reading experiment config\n" )

    #read SDRF file from a shell environment variable if it exists (otherwise it returns "")
    sdrfPath <- Sys.getenv( 'SDRF_PATH', unset = NA, names = NA )
    
    # if SDRF path not found
    if (is.na(sdrfPath)){
      
        # Get the pipeline code from the experiment accession e.g. MTAB, MEXP
        pipeline <- unlist(strsplit(experimentAccession, '-'))[2] 
      
        # Filename for SDRF.
        sdrfBasename <- paste( experimentAccession, ".sdrf.txt", sep="" )
    
        # Get ArrayExpress site config to get path to experiment load directories.
        aeSiteConfig <- createArrayExpressSiteConfig( )
    

        # Complete path to SDRF file.
        if ( pipeline == "GEOD" ) {
            sdrfPath <- file.path( aeSiteConfig$GEO_SUBMISSIONS_TARGET, pipeline, experimentAccession, sdrfBasename )

            # If it doesnt exist in GEO_SUBMISSIONS TARGET then retrieve from AE2_LOAD_DIR.
            if( !file.exists( sdrfPath ) ) {    
                sdrfPath <- file.path( aeSiteConfig$AE2_LOAD_DIR, "EXPERIMENT", pipeline, experimentAccession, sdrfBasename )
            }
        
        } else if (pipeline == "ENAD") {
            sdrfPath <- file.path( aeSiteConfig$ENA_SUBMISSIONS_TARGET, pipeline, experimentAccession, sdrfBasename )

        } else {
            sdrfPath <- file.path( aeSiteConfig$AE2_LOAD_DIR, "EXPERIMENT", pipeline, experimentAccession, sdrfBasename )
        }
      
    }

    # Check SDRF exists, die if not.
    if( !file.exists( sdrfPath ) ) {
      stop( paste( "SDRF", sdrfPath, "does not exist or is not readable." ) )
    }

    # Get the experiment type from the parsed XML.
    atlasExperimentType <- experimentXMLlist$experimentType
    
    cat( paste( "Reading SDRF from", sdrfPath, "...\n" ) )

    # Parse the SDRF.
    atlasSDRF <- parseSDRF( sdrfPath, atlasExperimentType )

    cat( "Finished reading SDRF\n" )

    # Get the list of Analytics objects.
    allAnalytics <- experimentXMLlist$allAnalytics

    cat( "Creating experiment summary...\n" )

    # Next step is to go through the analytics objects created from the XML,
    # pull out the right rows form the SDRF, get the right expressions matrix,
    # the gene annotations, and make the Bioconductor object (ExpressionSet,
    # MAList, or SummarizedExperiment).
    atlasExperimentSummary <- lapply( allAnalytics, function( analytics ) {
        
    # Get the SDRF rows for this analytics object's assays.
    analyticsSDRF <- createAnalyticsSDRF( analytics, atlasSDRF )

    # Create Bioconductor object, either ExpressionSet, SummarizedExperiment, or MAList.
    biocObject <- .createBiocObject( 
            analytics, 
            analyticsSDRF, 
            atlasExperimentType, 
            experimentAccession, 
            atlasExperimentDirectory 
        )
    })

    atlasExperimentSummary <- SimpleList( atlasExperimentSummary )
    
    cat( "Experiment summary created.\n" )

    return( atlasExperimentSummary )
}

check_file_exists <- function( filename ) {

    if( !file.exists( filename ) ) {
        stop( paste( 
            "Cannot find:",
            filename
        ) )
    }
}


##########################
# Non-exported functions #
##########################

# parseSDRF
#     - Take an SDRF filename and the experiment type from the XML config, and
#     return a subset of the SDRF containing only the assay names (or ENA runs),
#     the Characteristics, and FactorValue columns. 
#     - It tries to include unit columns as well.
#     - It renames the columns to remove e.g. Characteristics[].
#     - It removes duplicated columns, e.g. if genotype is a Characteristic and a
#     Factor.
#     - It returns the new "SDRF" as a data frame.
parseSDRF <- function( filename, atlasExperimentType ) {

    # Read in the SDRF file. Set header=FALSE because we don't want the column
    # headings to be made "R-safe" -- this confuses things when we're trying to
    # find the Charactersitic and Factor names. Set stringsAsFactors=FALSE so
    # that we can use grep.
    completeSDRF <- read.delim( filename, header = FALSE, stringsAsFactors = FALSE )
    
    cat( "Getting characteristic and factor column indices\n" )

    # Get the Characteristics column indices, and any unit columns next to them.
    charColIndices <- grep( "^Characteristics", ignore.case = FALSE, completeSDRF[ 1, ] )
    charColIndices <- .addUnitCols( charColIndices, completeSDRF )
    
    # Get the Factor column indices, an any unit columns next to them.
    factorColIndices <- grep( "^Factor\\s?Value", ignore.case = FALSE, completeSDRF[ 1, ] )
    factorColIndices <- .addUnitCols( factorColIndices, completeSDRF )

    cat( "Checking for technical replicates\n" )

    # Get the index of the Comment[technical replicate group] column, if there
    # is one.
    techRepGroupColIndex <- grep( "Comment\\s?\\[\\s?technical[ _]replicate[ _]group\\s?\\]", ignore.case = TRUE, completeSDRF[ 1, ] )
    
    cat( "Locating assay names...\n" )

    # Get the column index for assay names. For microarray data, this is "Assay
    # Name" or "Hybridization Name". For RNA-seq data, this is "Comment[ENA_RUN]"
    if( grepl( "rnaseq", atlasExperimentType ) ) {
        
        assayNameColIndex <- grep( "Comment\\s?\\[\\s?ENA_RUN\\s?\\]", ignore.case = TRUE, completeSDRF[ 1, ] )

        if( length( assayNameColIndex ) != 1 ) {
            assayNameColIndex <- grep( "Comment\\s?\\[\\s?RUN_NAME\\s?\\]", ignore.case = TRUE, completeSDRF[ 1, ] )
        }

        if( length( assayNameColIndex ) != 1 ) {
            assayNameColIndex <- grep( "Scan\\s?Name", ignore.case = TRUE, completeSDRF[ 1, ] )
        }
        
        if( length( assayNameColIndex ) != 1 ) {
            stop( "Did not find Comment[ ENA_RUN ] or Comment[ RUN_NAME ] or Scan Name column in SDRF." )
        }
    }
    else {
        
        assayNameColIndex <- grep( "Assay\\s?Name", ignore.case = TRUE, completeSDRF[ 1, ] )

        if( length( assayNameColIndex ) != 1 ) {

            assayNameColIndex <- grep( "Hybridi[sz]ation\\s?Name", ignore.case = TRUE, completeSDRF[ 1, ] )

            # Check that we got something.
            if( length( assayNameColIndex ) != 1 ) {
                stop( "Did not find an Assay Name column or a Hybridization Name column, cannot continue." )
            }
        }

        # For two-colour array data, also want to get the label column.
        if( grepl( "2colour", atlasExperimentType ) ) {
          labelColIndex <- which( completeSDRF[ 1, ] == "Label" )
        }
    }
    
    cat( "Found assay names. Subsetting SDRF...\n" )

    # Now we should have everything we need to get the right columns and make a
    # more friendly SDRF.
    if( grepl( "2colour", atlasExperimentType ) ) {

        subsetSDRF <- completeSDRF[ , c( assayNameColIndex, labelColIndex, charColIndices, factorColIndices ) ]
    }
    else {
        
        subsetSDRF <- completeSDRF[ , c( assayNameColIndex, charColIndices, factorColIndices ) ]
    }

    cat( "Finished subsetting SDRF.\n" )

    # If we got a technical replicate group column, add this at the end of the
    # subsetSDRF.
    if( length( techRepGroupColIndex ) > 0 ) {
      subsetSDRF <- cbind( subsetSDRF, completeSDRF[ , techRepGroupColIndex ] )
    }

    # Next, merge the contents of unit columns with the column before.
    subsetSDRF <- .mergeUnits( subsetSDRF )
    
    cat( "Fixing column headings...\n" )

    # Next thing is to name the columns so they have nice names.
    newColNames <- gsub( "Characteristics\\s?\\[", "", subsetSDRF[1,] )
    newColNames <- gsub( "Factor\\s?Value\\s?\\[", "", newColNames )
    newColNames <- gsub( "\\s?\\]", "", newColNames )
    newColNames[ 1 ] <- "AssayName"

    # Replace the last column name with one for the technical replicate group
    # column, if there is one.
    if( length( techRepGroupColIndex ) > 0 ) {
      newColNames[ length( newColNames ) ] <- "technical_replicate_group"
    }
    
    # Replace spaces with underscores.
    newColNames <- gsub( " ", "_", newColNames )
    
    cat( "Finished fixing column headings.\n" )

    cat( "Removing duplicated columns...\n" )

    # Now we've got the new names for the columns, check if any are the same
    # (use "tolower" function to convert all to lower case).
    duplicateColIndices <- which( duplicated( tolower( newColNames ) ) )

    # Remove the duplicated columns from the new column names and the subset
    # SDRF.
    if( length( duplicateColIndices ) > 0 ) {
        subsetSDRF <- subsetSDRF[ , -duplicateColIndices ]
        newColNames <- newColNames[ -duplicateColIndices ]
    }

    cat( "Finished removing duplicated columns.\n" )

    # Remove the first row of the SDRF (this is the old column headings)
    subsetSDRF <- subsetSDRF[ -1, ]
    
    cat( "Applying new column headings...\n" )

    # Add the new column names as the column headings.
    colnames( subsetSDRF ) <- newColNames

    cat( "Finished applying new column headings.\n" )
    
    cat( "Removing duplicated rows...\n" )

    # Remove duplicated rows, which occur e.g. if an assay has more than one file.
    duplicateRowIndices <- which( duplicated( subsetSDRF ) )
    if( length( duplicateRowIndices ) > 0 ) {
        subsetSDRF <- subsetSDRF[ -duplicateRowIndices, ]
    }

    cat( "Finished removing duplicated rows.\n" )

    # Make assay names "R-safe".
    subsetSDRF$AssayName <- make.names( subsetSDRF$AssayName )

    # Return the subset SDRF.
    return( subsetSDRF )
}


# .addUnitCols
#     - Given a vector of column indices and a data frame with the complete SDRF,
#     return a vector of column indices containing the original ones plus any
#     Unit[] columns that are next to them.
.addUnitCols <- function( colIndices, SDRF ) {
    
    # Get the indices of unit columns. 
    unitCols <- unlist(
        
        # Go through each column index provided.
        sapply(
            colIndices,
            function( colNumber ) {
                    
                # Look at the column to the right of it.
                nextCol = colNumber + 1

                # Only try if this is not the very last column.
                if( nextCol <= ncol( SDRF ) ) {

                    # If it's a unit column, return the index.
                    if( grepl( "Unit", SDRF[ 1, nextCol ] ) ) {
                        nextCol
                    }
                }
            }
        )
    )

    # Combine the vector of unit column indices with the original one, and then
    # sort them so the unit column indices are next to the correct non-unit
    # column indices. E.g. if "Characteristics[ age ]" was the 2nd column out
    # of 5, that means its "Unit[ time unit ]" column is the 3rd column.
    # Combining the unit and non-unit column indices gives:
    #     1, 2, 4, 5, 3
    # So then we sort it so it becomes:
    #     1, 2, 3, 4, 5
    # Now when we use this vector of indices to select the Characteristics
    # columns we'll get them in the right order.
    allColIndices <- sort( c( colIndices, unitCols ) )

    return( allColIndices )
}


.mergeUnits <- function( subsetSDRF ) {

    # Find the unit columns.
    unitCols <- grep( "Unit", subsetSDRF[ 1, ] )
    
    if( length( unitCols ) > 0 ) {
        
        cat( "Merging unit columns...\n" )
    
        # Create some new merged columns.
        mergedCols <- data.frame( 
            sapply( 
                unitCols,
                function( unitCol ) {
                    paste( subsetSDRF[ , unitCol - 1 ], subsetSDRF[ , unitCol ] )
                }
            ),
            stringsAsFactors = FALSE
        )

        # Get the indices of the columns for which these unit columns apply.
        valueCols <- unitCols - 1
        
        # Replace the first row (header) with the one from the original SDRF.
        mergedCols[ 1 , ] <- subsetSDRF[ 1, valueCols ]
        
        # Replace the original value columns with the new merged columns.
        subsetSDRF[ , valueCols ] <- mergedCols

        # Delete the Unit columns.
        subsetSDRF <- subsetSDRF[ , -unitCols ]

        cat( "Finished merging unit columns.\n" )
    
        return( subsetSDRF )

    } else {

        # If there aren't any Unit columns, just return without doing anything.
        cat( "No Unit columns found.\n" )
        return( subsetSDRF )
    }
}


# .createBiocObject
#     - Takes an Analytics object, experiment type, experiment accession, and
#     path to directory containing expressions file.
#     - Returns either an ExpressionSet, MAList, or SummarizedExperiment.
.createBiocObject <- function( analytics, analyticsSDRF, atlasExperimentType, experimentAccession, atlasExperimentDirectory ) {
    
    # Create path to analysis methods file.
    analysisMethodsFile <- paste( experimentAccession, "-analysis-methods.tsv", sep="" )
    analysisMethodsFile <- file.path( atlasExperimentDirectory, analysisMethodsFile )
    
    # Is this a 1-colour microarray experiment?
    if( grepl( "microarray_1colour", atlasExperimentType ) ) {
        
        # Need the array design as it makes up the file name.
        arrayDesign <- platform( analytics )
        
        # Create the file name of the normalized expressions.
        expressionsFile <- paste( experimentAccession, "_", arrayDesign, "-normalized-expressions.tsv.undecorated", sep="" )
        
        # Add the full path to the file.
        expressionsFile <- file.path( atlasExperimentDirectory, expressionsFile )

        # Check that the expressions file exists, die if not.
        if( !file.exists( expressionsFile ) ) {
            stop( paste( "Expressions file \"", expressionsFile, "\" does not exist. Cannot continue.", sep="" ) )
        }
        
        # Read the expressions file.
        expressions <- read.delim( expressionsFile, header=TRUE, stringsAsFactors=FALSE, row.names = 1 )
        
        # Create an ExpressionSet.
        biocObject <- .createExpressionSet( expressions, analyticsSDRF, analysisMethodsFile )
        
        # Return it.
        return( biocObject )
    }
    # Is this a 2-colour array experiment?
    else if( grepl( "microarray_2colour", atlasExperimentType ) ) {

        # Get the array design.
        arrayDesign <- platform( analytics )
        
        # Create the name of the file with log2 fold-changes.
        mValuesFile <- paste( experimentAccession, "_", arrayDesign, "-log-fold-changes.tsv.undecorated", sep="" )
        mValuesFile <- file.path( atlasExperimentDirectory, mValuesFile )

        # Create the name of the file with average intensities.
        aValuesFile <- paste( experimentAccession, "_", arrayDesign, "-average-intensities.tsv.undecorated", sep="" )
        aValuesFile <- file.path( atlasExperimentDirectory, aValuesFile )
        
        # Read files.
        mValues <- read.delim( mValuesFile, header = TRUE, stringsAsFactors = FALSE, row.names = 1 )
        aValues <- read.delim( aValuesFile, header = TRUE, stringsAsFactors = FALSE, row.names = 1 )

        # Create an MAList
        maList <- list(
            genes = rownames( mValues ),
            M = mValues,
            A = aValues
        )

        biocObject <- new( "MAList", maList )

        return( biocObject )
    }
    # Is this an RNA-seq experiment?
    else if( grepl( "rnaseq", atlasExperimentType ) ) {
        
        # Add the full path to the file.
        expressionsFile <- file.path( atlasExperimentDirectory, paste( experimentAccession, "-raw-counts.tsv.undecorated", sep = "" ) )
        
        # Check that the expressions file exists, die if not.
        if( !file.exists( expressionsFile ) ) {
            stop( paste( "Expressions file \"", expressionsFile, "\" does not exist. Cannot continue.", sep="" ) )
        }
        
        # Read the expressions file.
        expressions <- read.delim( expressionsFile, header=TRUE, stringsAsFactors=FALSE, row.names = 1 )
        
        # Create a SummarizedExperiment.
        biocObject <- createSummarizedExperiment( expressions, analyticsSDRF, analysisMethodsFile )
    }
    # Otherwise, don't recognise this experiment type so die.
    else {
        stop( paste( "Don't know how handle experiment type \"", atlasExperimentType, "\". Cannot continue.", sep="" ) )
    }
    
    # Return the data frame with expressions.
    return( biocObject )
}


# createAnalyticsSDRF
#     - Take an Analytics object and a parsed SDRF.
#     - Return a new SDRF data frame with just the assay_names from the Analytics
#     object, as well as a column denoting which assay group each assay belongs
#     to.
createAnalyticsSDRF <- function( analytics, atlasSDRF ) {

    # Get the assay groups.
    assayGroups <- assay_groups( analytics )

    # Get the SDRF rows for these assay groups.
    assayGroupSDRFs <- lapply( assayGroups, function( assayGroup ) {
        
        # Get the assay names.
        assayNames <- assay_names( assayGroup )

        # Sanity checking.
        if( length( assayNames ) == 0 ) { 
            stop( "ERROR - Did not find any assay names for an assay group. Cannot continue." )
        }

        # TODO: strip .Cy* from 2-colour assay names? Check diffAtlas_DE_limma.R

        # Get the SDRF rows for these assays.
        atlasSDRF[ which( atlasSDRF$AssayName %in% assayNames ), ]
    })

    # Make a new data frame from the SDRF chunks list.
    analyticsSDRF <- ldply( assayGroupSDRFs, data.frame )

    # Change the name of the first column.
    colnames( analyticsSDRF )[1] <- "AtlasAssayGroup"

    # Make the assay names the row names.
    rownames( analyticsSDRF ) <- analyticsSDRF$AssayName
    analyticsSDRF$AssayName <- NULL

    # Sort the rows by assay name.
    analyticsSDRF <- analyticsSDRF[ sort( rownames( analyticsSDRF ) ) , ]
    
    return( analyticsSDRF )
}


# .createExpressionSet
#     - Take a data frame of normalized expressions and a parsed SDRF.
#     - Return an ExpressionSet object.
.createExpressionSet <- function( expressions, analyticsSDRF, analysisMethodsFile ) {

    # Only select columns with assay names in our SDRF -- these are the
    # ones that passed QC and are still in the XML.
    expressions <- expressions[ , rownames( analyticsSDRF ) ]

    # Turn data frame into matrix.
    expressionsMatrix <- as.matrix( expressions )
    
    # Create a new AssayData object
    expressionData <- assayDataNew( storage.mode = "lockedEnvironment", exprs = expressionsMatrix )
    
    # Add feature names (probe set names).
    featureNames( expressionData ) <- rownames( expressionsMatrix )
    
    # Add sample names (assay names).
    sampleNames( expressionData ) <- colnames( expressionsMatrix )

    # Add the SDRF data.
    phenoData <- new( "AnnotatedDataFrame", data = analyticsSDRF )
    
    # Add featureData -- this is just the probe set names for now.
    featureData <- new( "AnnotatedDataFrame", data = data.frame( probeSets = rownames( expressionsMatrix ) ) )
    featureNames( featureData ) <- rownames( expressionsMatrix )
    
    # Add analysis methods.
    analysisMethodsList <- .readArrayAnalysisMethods( analysisMethodsFile )
    # Add this to a MIAME object.
    exptData <- new( "MIAME", preprocessing = analysisMethodsList )

    # Create new ExpressionSet.
    return( new( "ExpressionSet", 
        assayData = expressionData, 
        phenoData = phenoData, 
        featureData = featureData, 
        experimentData = exptData 
    ) )
}


createSummarizedExperiment <- function( expressions, analyticsSDRF, analysisMethodsFile ) {

    # Make sure all the assays we want are present in the expressions matrix.
    wantedAssays <- rownames( analyticsSDRF )
    matrixAssays <- colnames( expressions )

    # Die if any assays we wanted are missing.
    if( !all( wantedAssays %in% matrixAssays ) ) {

        missingAssays <- wantedAssays[ which( !wantedAssays %in% matrixAssays ) ]

        missingAssayString <- paste( missingAssays, collapse = "\n" )

        stop( paste( 
              "The following assays are missing from the expression matrix:",
              missingAssayString,
              "Cannot continue.",
              sep = "\n"
        ) )
    }
    
    # Only select columns with assay names in our SDRF.
    expressions <- expressions[ , rownames( analyticsSDRF ) ]

    # Turn data frame into matrix.
    expressionsMatrix <- as.matrix( expressions )

    # Turn SDRF into a DataFrame.
    analyticsSDRF <- DataFrame( analyticsSDRF )

    # Get analysis methods
    analysisMethodsList <- .readSeqAnalysisMethods( analysisMethodsFile )
    
    # Create dummy row ranges (copied from SummarizedExperiment class code)
    partitioning <- PartitioningByEnd( integer( nrow( expressionsMatrix ) ), names = rownames( expressionsMatrix ) )
    dummyRanges <- BiocGenerics::relist( GRanges(), partitioning )

    # Create SummarizedExperiment
    summarizedExperiment <- SummarizedExperiment( 
                                    assays = SimpleList( counts = expressionsMatrix ), 
                                    colData = analyticsSDRF, 
                                    metadata = analysisMethodsList,
                                    rowRanges = dummyRanges
                                )

    # Return it.
    return( summarizedExperiment )
}


.readArrayAnalysisMethods <- function( analysisMethodsFile ) {
    
    # Read the analysis methods file.
    analysisMethodsDF <- read.delim( analysisMethodsFile, header=FALSE, stringsAsFactors=FALSE )
    
    # Find the row that has the normalization method.
    normalizationRow <- which( analysisMethodsDF[ , 1 ] == "Normalization" )
    
    # Get the text on this row in the second column.
    normalizationText <- analysisMethodsDF[ normalizationRow, 2 ]
    
    # Reorganise text.
    normalizationText <- sub( "^(.*) <a href=(.*)>(.*)</a> (version.*). <a.*", "\\1 \\3 (\\2) \\4", normalizationText)
    
    # Make a list containing the normalization text.
    analysisMethodsList <- list( normalization = normalizationText )
    
    # Return it.
    return( analysisMethodsList )
}


.readSeqAnalysisMethods <- function( analysisMethodsFile ) {
    
    # Read the analysis methods file.
    analysisMethodsDF <- read.delim( analysisMethodsFile, header=FALSE, stringsAsFactors=FALSE )
    
    # Get the row with iRAP information.
    irapRow <- grep( "Pipeline version", analysisMethodsDF[ , 1 ] )

    # Get the text for iRAP.
    irapInfo <- analysisMethodsDF[ irapRow, 2 ]

    # Reorganise iRAP text.
    irapInfo <- sub( "^<a href=(.*)>(.*)</a> (.*) ", "\\2 version \\3 (\\1)", irapInfo, )

    # Get the rows with filtering info.
    filteringRows <- grep( "Filtering Step", analysisMethodsDF[ , 1 ] )

    # Get the text for the filtering steps.
    filteringInfo <- analysisMethodsDF[ filteringRows, 2 ]

    # Get the row woth mapping info.
    mappingRow <- grep( "Read Mapping", analysisMethodsDF[ , 1 ] )

    # Get the text for the mapping info.
    mappingInfo <- analysisMethodsDF[ mappingRow, 2 ]

    # Get the row with quantification info.
    quantRow <- grep( "Quantification", analysisMethodsDF[ , 1 ] )
    
    # Get the quantification info.
    quantInfo <- analysisMethodsDF[ quantRow, 2 ]

    analysisMethodsList <- list( 
        pipeline = irapInfo,
        filtering = filteringInfo,
        mapping = mappingInfo,
        quantification = quantInfo
    )
    
    return( analysisMethodsList )
}
ebi-gene-expression-group/internal-ExpressionAtlas documentation built on Aug. 7, 2021, 9:28 p.m.