inst/scripts/import/uniprobe/import.R

# uniprobe/import.R
#------------------------------------------------------------------------------------------------------------------------
#library (RMySQL)
library (RSQLite)
library (org.Hs.eg.db)
library (org.Mm.eg.db)
library (org.Sc.sgd.db)
library (org.Ce.eg.db)
library(tools)   # for md5sum
#------------------------------------------------------------------------------------------------------------------------
printf <- function(...) print(noquote(sprintf(...)))
#------------------------------------------------------------------------------------------------------------------------
run = function (dataDir)
{
  dataDir <- file.path(dataDir, "uniprobe")
  stopifnot(file.exists(dataDir))

  all.files = identifyFiles (file.path(dataDir, 'All_PWMs'))
  matrices = readAndParse (all.files)
  tbl.pubRef = createPublicationRefTable ()
  tbl.geneRef = createGeneRefTable (dataDir)
  tbl.md = createMetadata (matrices, tbl.pubRef, tbl.geneRef)
  stopifnot (length (matrices) == nrow (tbl.md))
  matrices = renameMatrices (matrices, tbl.md)

  serializedFile <- "uniprobe.RData"
  save (matrices, tbl.md, file=serializedFile)
  printf("saved %d matrices to %s", length(matrices), serializedFile)
  printf("next step: copy %s to <packageRoot>/MotifDb/inst/extdata, rebuild package", serializedFile)

} # run
#------------------------------------------------------------------------------------------------------------------------
createMatrixNameUniqifier = function (matrix)
{
  temporary.file <- tempfile ()
  write (as.character (matrix), file=temporary.file)
  md5sum.string <- as.character (md5sum (temporary.file))
  stopifnot (nchar (md5sum.string) == 32)
  md5.6chars = substr (md5sum.string, 29, 32)
  #unlink (temporary.file)

} # createMatrixNameUniqifier
#------------------------------------------------------------------------------------------------------------------------
parsePWMfromText = function (lines.of.text)
{
  z = lines.of.text
  stopifnot (sum (sapply (c ('A', 'C', 'T', 'G'), function (token) length (grep (token, lines.of.text)))) == 4)

    # determine the number of columns
  token.count.first.line = length (strsplit (lines.of.text [1], '\t')[[1]])
  column.count = token.count.first.line - 1  # subtract off the 'A:\t' token
  result = matrix (nrow=4, ncol=column.count, byrow=TRUE, dimnames=list (c ('A', 'C', 'G', 'T'), 1:column.count))

  for (line in lines.of.text) {
    tokens = strsplit (line, '\\s*[:\\|]') [[1]]
    nucleotide = tokens [1]
    numbers.raw = tokens [2]
    number.tokens = strsplit (numbers.raw, '\\s+', perl=T)[[1]]
    while (nchar (number.tokens [1]) == 0)
      number.tokens = number.tokens [-1]
    numbers = as.numeric (number.tokens)
    #printf ('adding %s: %s', nucleotide, list.to.string (numbers))
    result [nucleotide,] = numbers
    }

  return (result)

} # parsePWMfromText
#------------------------------------------------------------------------------------------------------------------------
extractPWMfromFile = function (filename)
{
  text = scan (filename, sep='\n', what=character (0), quiet=TRUE)
  matrix.start.lines = grep ('A\\s*[:\\|]', text)
  stopifnot (length (matrix.start.lines) == 1)
  #printf ('%50s: %s', filename, list.to.string (matrix.start.lines))
  start.line = matrix.start.lines [1]
  end.line = start.line + 3
  lines = text [start.line:end.line]
  pwm.matrix = parsePWMfromText (lines)
  return (pwm.matrix)

} # extractPWMfromFile
#------------------------------------------------------------------------------------------------------------------------
createPublicationRefTable = function ()
{
  options (stringsAsFactors=FALSE)
  tbl.ref = data.frame (folder=c('CR09', 'Cell08', 'Cell09', 'EMBO10', 'GD09', 'GR09', 'MMB08', 'NBT06', 'PNAS08', 'SCI09'))
  tbl.ref = cbind (tbl.ref, author=c('Scharer', 'Berger', 'Grove', 'Wei', 'Lesch', 'Zhu', 'Pompeani', 'Berger', 'De Silva', 'Badis'))
  tbl.ref = cbind (tbl.ref, pmid=c('19147588','18585359','19632181','20517297','19204119','19158363','18681939','16998473','18541913','19443739'))
  tbl.ref = cbind (tbl.ref, organism=c('Hsapiens', 'Mmusculus', 'Celegans', 'Mmusculus', 'Celegans', 'Scerevisiae', 'Vharveyi',
                                       'Scerevisiae;Hsapiens;Mmusculus', 'Apicomplexa', 'Mmusculus'))
  tbl.ref = cbind (tbl.ref, count=c(1, 168, 34, 25, 1, 89, 1, 5, 3, 104))
  titles = c ('Genome-wide promoter analysis of the SOX4 transcriptional network in prostate cancer cells',
              'Variation in homeodomain DNA binding revealed by high-resolution analysis of sequence preferences',
              'A Multiparameter Network Reveals Extensive Divergence between C. elegans bHLH Transcription Factors',
              'Genome-wide analysis of ETS-family DNA-binding in vitro and in vivo',
              'Transcriptional regulation and stabilization of left right neuronal identity in C. elegans',
              'High-resolution DNA-binding specificity analysis of yeast transcription factors',
              'The Vibrio harveyi master quorum-sensing regulator, LuxR...',
              'Compact, universal DNA microarrays...',
              'Specific DNA-binding by Apicomplexan AP2 transcription factors',
              'Diversity and complexity in DNA recognition by transcription factors')
  tbl.ref = cbind (tbl.ref, title=titles)

  tbl.ref

} # createPublicationRefTable
#------------------------------------------------------------------------------------------------------------------------
identifyFiles = function (filePath)
{
  stopifnot(file.exists(filePath))
  
  cmd = sprintf ('find %s -name "*pwm*"', filePath)

  files.raw = system (cmd, intern=TRUE)

      # all legit files end in ".pwm" or ".txt"
  files = c (grep (".pwm$", files.raw, value=T, ignore.case=T), 
             grep (".txt$", files.raw, value=T, ignore.case=T))

  reverseComplementMatrices = grep ('RC', files)
  if (length (reverseComplementMatrices) > 0)
    files = files [-reverseComplementMatrices]

      # don't know why these were excluded.  leave them in for now
  embo10.excluders = grep ('EMBO', files)
  if (length (embo10.excluders) > 0)
    files = files [-embo10.excluders]

  cell09.excluders = grep ('Cell09', files)   # include these once the sql tables are updated
  if (length (cell09.excluders) > 0)
    files = files [-cell09.excluders]
  secondary.excluders = grep ('secondary', files)
  if (length (secondary.excluders) > 0)
    files = files [-secondary.excluders]

  invisible (files)

} # identifyFiles
#------------------------------------------------------------------------------------------------------------------------
readAndParse = function (file.list)
{
  matrices = list ()

  for (file in file.list) {
    #printf ('read and parse %s', file)
    text = scan (file, sep='\n', what=character (0), quiet=TRUE)
    matrix.start.lines = grep ('A:', text)
    stopifnot (length (matrix.start.lines) == 1)
    start.line = matrix.start.lines [1]
    end.line = start.line + 3
    lines.of.text = text [start.line:end.line]
    pwm.matrix = parsePWMfromText (lines.of.text)
    name.tokens <- strsplit(file,"/")[[1]]
    token.count <- length(name.tokens)

    matrix.name <- paste(name.tokens[(token.count-1):token.count], collapse="/")
    matrices [[matrix.name]] = pwm.matrix
    }

  invisible (matrices)

} # readAndParse
#------------------------------------------------------------------------------------------------------------------------
# eg, ./All_PWMs/GD09/Nsy-7.pwm to simply 'Nsy-7'
#
translateFileNameToGeneName = function (long.name)
{
  if (length (grep ('/', long.name)) > 0) {
    tokens = strsplit (long.name, '/')[[1]]
    count = length (tokens)
    gene.name.raw = tokens [count]  # get the last one
    }
  else {
    gene.name.raw = long.name
    }

  gene.name = strsplit (gene.name.raw, '\\.')[[1]][1]   # remove any file suffix

  gene.name

} # test.translateFileNameToGeneName
#------------------------------------------------------------------------------------------------------------------------
# and update matrix names
createMetadata = function (matrices, tbl.pubRef, tbl.geneRef)
{
  options (stringsAsFactors=FALSE)
  dataSource = 'UniPROBE'
  trim = function (s) {sub (' +$', '', sub ('^ +', '', s))}
  tbl.md = data.frame ()
  
  removers = list (Cart1=110, Cutl1=115, Hoxa7=156, Irx3=185)
 
  for (m in 1: length (matrices)) {
    matrix.name = names (matrices) [m]
    #printf ('%d: %s', m, matrix.name)
    native.name.raw = gsub ('All_PWMs/', '', matrix.name)
    native.name = extractNativeNames (native.name.raw)

       # extractNativeNames needs to be rethought.  but for now, just hack in some fixes
    if (native.name == 'Cgd2') native.name='Cgd2_3490'   # inconsistency at uniprobe, accomodated here
    if (native.name == 'PF14') native.name='PF14_0633'
    if (native.name == 'Uncx4') native.name='Uncx4.1'

    if (!native.name %in% tbl.geneRef$name) {
      browser (text='native.name not in tbl.geneRef')
      }
    
    experiment.folder = strsplit (native.name.raw, '/')[[1]][1]
    up.id.number = subset (tbl.geneRef, name==native.name & folder_name==experiment.folder)$id
    # todo BUG here!
    full.uniprobe.id = sprintf ('UP%05d', up.id.number)
    if (length (full.uniprobe.id) != 1) {
      browser (text='full.uniprobe.id has wrong length')
      }

    geneId = subset (tbl.geneRef, name==native.name & folder_name==experiment.folder)$stdID
    if (is.na (geneId) | geneId == '')
       geneId = NA_character_

    bindingDomain = trim (subset (tbl.geneRef, name==native.name & folder_name==experiment.folder)$domain)
    organism = subset (tbl.geneRef, name==native.name & folder_name==experiment.folder)$species

      # a native name (a gene symbol) may have a dash, but for the long name, we want dashes to separate
      # the organism-dataSource-geneIdentifier matrix name.
      # so covert this here, but do not eclipse any dash-including native.names

    native.name.no.dashes = gsub ('-', '_', native.name)
    native.name.uniq = sprintf ('%s-%s-%s.%s', organism, dataSource, native.name.no.dashes, full.uniprobe.id)
    if (native.name.uniq %in% rownames (tbl.md)) {
      matrix = matrices [[m]]
      uniqifier = createMatrixNameUniqifier (matrix)
      #printf ('before, not unique: %s', native.name.uniq)
      native.name.uniq = paste (native.name.uniq, uniqifier, sep='.')
      #printf ('after, not unique: %s', native.name.uniq)
      }
      
    sequenceCount = NA_integer_
    bindingSequence = NA_character_
    tfFamily = NA_character_

    experimentType = 'protein binding microarray'
    pubmedID = subset (tbl.pubRef, folder==experiment.folder)$pmid
    #printf ('%12s: %20s', native.name, organism)

    if (is.na (geneId))
      geneIdType = NA
    else if (organism == 'Scerevisiae')
      geneIdType = 'SGD'
    else if (organism %in% c ('Mmusculus', 'Hsapiens', 'Celegans'))
      geneIdType = 'ENTREZ'
    else
      geneIdType = 'todo'

    proteinId = NA_character_
    proteinIdType = NA_character_

    proteinId.tmp = subset (tbl.geneRef, name==native.name & folder_name==experiment.folder)$uniprot
    if (!is.na (proteinId.tmp) & nchar (proteinId.tmp) > 1) {
      #printf ('getting uniprot id for %s: %s', native.name, proteinId.tmp)
      proteinId = proteinId.tmp
      proteinIdType = 'UNIPROT'
      }  # found good id in the uniprot column

    if (is.na (proteinId.tmp) | nchar (proteinId.tmp) == 0) {
      proteinId.tmp = subset (tbl.geneRef, name==native.name & folder_name==experiment.folder)$refseq_id
      if (!is.na (proteinId.tmp) & nchar (proteinId.tmp) > 1) {
        #printf ('getting refseq id for %s: %s', native.name, proteinId.tmp)
        proteinId = proteinId.tmp
        proteinIdType = 'REFSEQ'
        }
      } # need to look in the refseq column


    bindingSequence =  subset (tbl.geneRef, name==native.name & folder_name==experiment.folder)$bindingSequence
    #printf ('%12s: %40s', native.name, bindingSequence)

    new.row = list (providerName=native.name.raw,
                    providerId=full.uniprobe.id,
                    dataSource=dataSource,
                    geneSymbol=native.name,
                    geneId=geneId,
                    geneIdType=geneIdType,
                    proteinId=proteinId,
                    proteinIdType=proteinIdType,
                    organism=organism,
                    sequenceCount=NA,
                    bindingSequence=bindingSequence,
                    bindingDomain=bindingDomain,
                    tfFamily=tfFamily,
                    experimentType=experimentType,
                    pubmedID=pubmedID)
    if (native.name.uniq %in% rownames (tbl.md)) browser (text='dup row name')
    tbl.md = rbind (tbl.md, data.frame (new.row, stringsAsFactors=FALSE))
    if (length (native.name.uniq) != 1) browser (text='native.name.unique length != 1')
    rownames (tbl.md) [m] = native.name.uniq
    }

  invisible (tbl.md)

} # createMetadata
#------------------------------------------------------------------------------------------------------------------------
extractNativeNames = function (native.names.raw)
{
  name.count = length (native.names.raw)
  result = vector ('character', name.count)

  for (i in 1:name.count) {
    native.name.raw = native.names.raw [i]
    tokens = strsplit (native.name.raw, '/') [[1]]
    count = length (tokens)
    cooked.1 = native.name.raw
    if (count > 1)
      cooked.1 = tokens [length (tokens)]
    tokens = strsplit (cooked.1, '[_\\.]')[[1]]
    cooked.2 = tokens [1]
    result [i] = cooked.2
    }

  invisible (result)

} # extractNativeNames
#------------------------------------------------------------------------------------------------------------------------
# 'standard' is usually entrez gene ID.  for yeast it is orf. for fly ...
uniprotToStandardID = function (organism, uniprot.ids)
{
#  uniprot.ids = unique (uniprot.ids)

  if (!exists ('lst.yeast.uniprot')) {
    tbl.tmp = toTable (org.Sc.sgdUNIPROT)
    yeast.uniprot <<- tbl.tmp$systematic_name
    names (yeast.uniprot) <<- tbl.tmp$uniprot_id
    }

  if (!exists ('mouse.uniprot')) {
    tbl.tmp = toTable (org.Mm.egUNIPROT)
    mouse.uniprot <<- tbl.tmp$gene_id
    names (mouse.uniprot) <<- tbl.tmp$uniprot_id
    }

  if (!exists ('human.uniprot')) {
    tbl.tmp = toTable (org.Hs.egUNIPROT)
    human.uniprot <<- tbl.tmp$gene_id
    names (human.uniprot) <<- tbl.tmp$uniprot_id
    }

  if (!exists ('worm.uniprot')) {
    tbl.tmp = toTable (org.Ce.egUNIPROT)
    worm.uniprot <<- tbl.tmp$gene_id
    names (worm.uniprot) <<- tbl.tmp$uniprot_id
    }

  organism = unique (organism)
  stopifnot (length (unique (organism)) == 1)
  stopifnot (organism %in% (c ('human', 'mouse', 'yeast', 'worm')))
 
  if (organism == 'human') {
    ids = human.uniprot [uniprot.ids]
    }
  else if (organism == 'mouse') {
    ids = mouse.uniprot [uniprot.ids]
    }
  else if (organism == 'yeast') {
    ids = yeast.uniprot [uniprot.ids]
    }
  else if (organism == 'worm') {
    ids = worm.uniprot [uniprot.ids]
    }


    # embarrassing but true:  could not get this to work by creating a list directly
    # round-about solution:  make a 1-column data.frame, then construct a named list from that
  df = data.frame (uniprot=uniprot.ids, std=as.character (ids), stringsAsFactors=FALSE)
  #rownames (df) = uniprot.ids
  #result = df$stdID
  #names (result) = uniprot.ids
  return (df)

} # uniprotToStandardID
#------------------------------------------------------------------------------------------------------------------------
# see ~/v/snotes/log "* load uniprobe sql dump file (11 may 2012)" for info on the uniprobe mysql database 
# used here.  
createGeneRefTable = function (dataDir)
{
  if (!exists ('db')){
      dbFile <- file.path(dataDir, "uniprobe.sqlite")
      stopifnot(file.exists(dbFile))
      db <<- dbConnect (dbDriver("SQLite"), dbFile)
      }

  tbl.pubmed = data.frame (folder=c('CR09', 'Cell08', 'Cell09', 'EMBO10', 'GD09', 'GR09', 'MMB08', 'NBT06', 'PNAS08', 'SCI09'),
                 pmid=c('19147588', '18585359', '19632181', '20517297', '19204119', '19158363', '18681939', '16998473', '18541913', '19443739'),
                 stringsAsFactors=FALSE)
       #CR09	1	Scharer 	19147588	human	Genome-wide promoter analysis of the SOX4 transcriptional 
       #                                                        network in prostate cancer cells
       #Cell08	168	Berger  	18585359	mouse	Variation in homeodomain DNA binding revealed by high-resolution 
       #                                                        analysis of sequence preferences
       #Cell09	34	Grove   	19632181	worm	A Multiparameter Network Reveals Extensive Divergence between 
       #                                                        C. elegans bHLH Transcription Factors
       #EMBO10	25	Wei     	20517297	mouse	Genome-wide analysis of ETS-family DNA-binding in vitro and in vivo
       #GD09	1	Lesch   	19204119	worm	Transcriptional regulation and stabilization of left right neuronal 
       #                                                         identity in C. elegans
       #GR09	89	Zhu     	19158363	yeast	High-resolution DNA-binding specificity analysis of yeast transcription factors
       #MMB08	1	Pompeani	18681939	Vibrio	The Vibrio harveyi master quorum-sensing regulator, LuxR...
       #NBT06	5	Berger  	16998473	yeast;human;mouse	Compact, universal DNA microarrays...
       #PNAS08	3	De Silva	18541913	apicomplexa	Specific DNA-binding by Apicomplexan AP2 transcription factors
       #SCI09	104	Badis   	19443739	mouse	Diversity and complexity in DNA recognition by transcription factors


  tbl.gene = dbGetQuery (db, 'select * from gene_ids_public')
  colnames (tbl.gene) = c ('id', 'name', 'species', 'pub', 'type')
  tbl.genomic = dbGetQuery (db, 'select * from genomic_info') [, -c(2,3,8:11)]  # remove the longish 'description' field
  tbl.pub = dbGetQuery (db, 'select * from publication_ids') [,-3]  # remove 'full_ref' for legibility
  tbl = merge (tbl.gene, tbl.pub [, c (1,4)], by.x='pub', by.y='publication_id', all.x=TRUE)
  tbl = merge (tbl, tbl.pubmed, by.x='folder_name', by.y='folder', all.x=TRUE)
  tbl = merge (tbl, tbl.genomic, all.x=TRUE, by.x='name', by.y='gene_name')
  redundant.column = grep ('species.y', colnames (tbl))
  stopifnot (length (redundant.column) == 1)
  tbl = tbl [, -redundant.column]
  column.to.rename = grep ('species.x', colnames (tbl))
  stopifnot (length (column.to.rename) == 1)
  colnames (tbl) [column.to.rename] = 'species'
  

   # now add 'standard IDs' -- orf names for yeast, entrez geneIDs for everything else

  stdID = getAllStandardIDs (tbl)
  tbl = cbind (tbl, stdID)
  duplicates = which (duplicated (tbl [, c (1,2)]))
  if (length (duplicates) > 0)
    tbl = tbl [-duplicates,]
  
    # fix the organism (species) name, from (e.g.) "Homo sapiens" to "Hsapiens"
  tbl$species = standardizeSpeciesNames (tbl$species)
  invisible (tbl)

    # now add bindingSequence, from DBDs:  gene_name + seq, apparently the binding sequence when known, of 171 of 372 genes
  tbl.dbds = dbGetQuery (db, 'select * from DBDs')  
  dbds.list = tbl.dbds$seq
  names (dbds.list) = tbl.dbds$gene_name
  bindingSequence = rep (NA_character_, nrow (tbl))
  names (bindingSequence) = tbl$name
  shared.names = unique (intersect (tbl.dbds$gene_name, tbl$name))
  bindingSequence [shared.names] = dbds.list [shared.names]
  tbl = cbind (tbl, bindingSequence)
  invisible (tbl)
  
} # createGeneRefTable
#------------------------------------------------------------------------------------------------------------------------
# make successive species-specific calls to 'uniprotToStandardID', assembling a new column to be added to tbl.geneRef
getAllStandardIDs = function (tbl.geneRef)
{
  mouse.rows  = grep ('Mus musculus', tbl.geneRef$species)
  yeast.rows = grep ('Saccharomyces cerevisiae', tbl.geneRef$species)
  human.rows = grep ('Homo sapiens', tbl.geneRef$species)
  worm.rows = grep ('Caenorhabditis elegans', tbl.geneRef$species)

  stdID = rep (NA_character_, nrow (tbl.geneRef))

  mouse.uids = tbl.geneRef$uniprot [mouse.rows]
  yeast.uids = tbl.geneRef$uniprot [yeast.rows]
  human.uids = tbl.geneRef$uniprot [human.rows]
  worm.uids = tbl.geneRef$uniprot [worm.rows]

    # add mouse geneIDs
  tbl.mouse =  uniprotToStandardID ('mouse', mouse.uids)
  stopifnot (length (mouse.rows) == nrow (tbl.mouse))
  stdID [mouse.rows] = tbl.mouse$std
  
    # add yeast orfs
  tbl.yeast =  uniprotToStandardID ('yeast', yeast.uids)
  stopifnot (length (yeast.rows) == nrow (tbl.yeast))
  stdID [yeast.rows] = tbl.yeast$std
  
    # add human geneIDs
  tbl.human =  uniprotToStandardID ('human', human.uids)
  stopifnot (length (human.rows) == nrow (tbl.human))
  stdID [human.rows] = tbl.human$std

    # add worm geneIDs
  tbl.worm =  uniprotToStandardID ('worm', worm.uids)
  stopifnot (length (worm.rows) == nrow (tbl.worm))
  stdID [worm.rows] = tbl.worm$std

  invisible (stdID)

} # getAllStandardIDs
#------------------------------------------------------------------------------------------------------------------------
# change, e.g., "Homo sapiens" to "Hsapiens"
standardizeSpeciesNames = function (names)
{
   fix = function (name) {
     tokens = strsplit (name, ' ')[[1]]
     stopifnot (length (tokens) == 2)
     genus = tokens [1]
     species = tokens [2]
     return (paste (substr (genus, 1, 1), species, sep=''))
     } 

   fixed.names = as.character (sapply (names, fix))
   invisible (fixed.names)

} # standardizeSpeciesNames
#------------------------------------------------------------------------------------------------------------------------
renameMatrices = function (matrices, tbl.md)
{
  stopifnot (length (matrices) == nrow (tbl.md))
  names (matrices) = rownames (tbl.md)
  invisible (matrices)

} # renameMatrices
#------------------------------------------------------------------------------------------------------------------------

Try the MotifDb package in your browser

Any scripts or data that you put into this service are public.

MotifDb documentation built on Nov. 8, 2020, 6:28 p.m.