Nothing
# Saved random-number-generator state, written out together with the other
# objects in this file. R keeps its RNG state in a variable of this name, so
# evaluating this assignment at top level replaces the current state and
# makes subsequent random draws follow the saved stream.
# NOTE(review): the leading 403L is presumably the encoded RNG kind and the
# second element the position within the state vector -- confirm against the
# R `Random` help page before editing any value by hand.
.Random.seed <-
c(403L, 1L, -754469287L, 1611343336L, 409493526L, -487257199L,
-1171770301L, 1912392530L, -551743516L, 172658215L, -1911126403L,
-2105337996L, 1780845402L, 312237477L, -1208624705L, -1974316474L,
1432961232L, -882173197L, -1181066223L, 554658368L, 2113547806L,
51538153L, 288959003L, 235297322L, -1119154868L, -383201521L,
-1908977083L, 1447736316L, 2029890130L, 78436301L, 461046151L,
-1059679122L, 1435307080L, 1176483403L, 50711465L, -655801608L,
-597383642L, 458866497L, 810976467L, 1481803906L, 1205053108L,
867844663L, -2103047891L, -677926332L, -1134618774L, 698909653L,
1773016431L, -1126723466L, -1364018272L, -1496474525L, -1267422975L,
817276017L, -681869298L, 1040064633L, 1635557387L, -707858118L,
307916860L, 1885206399L, 1271565525L, -207851284L, 1889544770L,
1944464797L, 478042135L, 1250057598L, -1127414984L, -189621221L,
940074361L, -1188326456L, 1529946102L, 488153649L, 151442531L,
1733499954L, -1123616380L, -1975353401L, -104074595L, -825282028L,
-1338146630L, 1884515589L, -238545185L, -1119823002L, 487111408L,
-1044772333L, 79602609L, -1317269920L, 980728830L, 1664959369L,
-785980229L, 50479242L, 545588076L, -1286240017L, 722993957L,
-731410980L, 1419762610L, 1845789357L, -318571929L, -1318188658L,
1870077544L, -1192318357L, 770819081L, 114989720L, -624796218L,
-605553375L, -1728418381L, -1335400158L, -1331723500L, 275028887L,
-1529281651L, -1375879516L, -2002161526L, -1421445387L, 855771471L,
-813242794L, -1203102464L, -630495165L, -1927984159L, -2049338544L,
-739015250L, 1433553753L, 904126827L, -381400870L, -291954276L,
-1338438241L, -1253810187L, 1894010508L, 113535842L, -456998083L,
257192887L, -1741752226L, -1516227176L, 932069627L, -1897907047L,
871193896L, 1413721046L, -843452335L, 1851376387L, -1744031086L,
150192164L, 1334996711L, 909574973L, 385825716L, 1885202458L,
1929468773L, 454411775L, -660223866L, -930483184L, 1044948659L,
-2039147183L, -952021760L, 52615134L, -721726423L, -1367231525L,
1212393706L, 412919436L, 656185807L, 144805253L, -259901892L,
1297898258L, -1019430899L, -1339339961L, 1134027054L, -809662328L,
2033388939L, 1276884329L, -336259272L, -1321068058L, -798057087L,
-1603759853L, -1537478974L, 1043007860L, 1506439543L, 668410477L,
-45273852L, 1301596202L, -397507179L, 1490086447L, 1002813238L,
-495507616L, 994307491L, 1012500161L, 1906327216L, 1057765838L,
-1045885639L, -137223861L, 19129082L, -879627268L, 298103871L,
609520917L, 1241085484L, -1939924222L, 1716520925L, 2070229719L,
-957362754L, -1817188360L, 690938843L, -1407115719L, 394962824L,
959805238L, 476603761L, -1435672669L, -42236174L, -2072318396L,
1792192775L, -1290172195L, 40492500L, -1401556742L, 763709509L,
677236895L, -1612686810L, 2098024112L, 1913205587L, -210690447L,
-1975950432L, 85467966L, 1438370377L, -2108875013L, -1321877814L,
-1893490644L, 324474927L, 2037868773L, -2137833060L, -1512731662L,
-381151635L, 1907909543L, 1977957838L, 252377896L, -1670852309L,
-2017195191L, -1655173032L, 2056799494L, -1789116222L, -1060713432L,
969283500L, 2004478652L, -365339662L, -1733880272L, -2113794140L,
-1235989064L, -1006397606L, 779030672L, 1452657532L, -954228460L,
-1658222270L, 1088180480L, 2030001596L, 1816371536L, -1843308254L,
-247588280L, 790360076L, -282237348L, 562055634L, -2059274752L,
-142400172L, 1145646312L, -352497414L, -1243289904L, 800762668L,
-1647709964L, 1529141650L, 454049120L, 94821388L, -470139680L,
1398927970L, 824009896L, -132252596L, -1149253764L, 64641330L,
-1918295440L, 1459512132L, -266976296L, -1497902214L, 707156016L,
-281571396L, -1006931372L, 1785573442L, 1038397088L, -1790836036L,
1300466640L, -1258963614L, -1602793752L, -46011188L, -544841220L,
1718746610L, 1035248768L, 1180117492L, -10663288L, 1368169274L,
1540590928L, 1389014124L, -1801103980L, 415719506L, -67600928L,
-106998580L, 1692532384L, 1116959106L, 51947752L, 335252204L,
-1393187524L, 1796911218L, -1682288016L, -209544668L, 804792376L,
-504641254L, 1738212560L, -2092971716L, -1454541932L, -694548606L,
1556007872L, 1494652476L, 1502511504L, 52762658L, 1404704008L,
1956374668L, -1973326052L, -891448174L, -2026302080L, 194338836L,
1593411368L, 1872873018L, 1904955856L, 500873580L, -494976076L,
-1761785326L, 273516512L, 1157016012L, 1160870112L, 2134723618L,
-1492380888L, -1769178612L, 948589372L, -1665765390L, 643023600L,
-351184316L, -1563345320L, -1856686278L, 6262768L, -1644158276L,
1656182036L, 1687264450L, 1675476064L, -633313924L, 1100947408L,
-859590366L, 1114937000L, -100722804L, -303807300L, 1324120882L,
14870080L, -1616616268L, 646632520L, -214608454L, -1716078064L,
348650732L, 1324015956L, 611005202L, 431123872L, -1140466996L,
-925033888L, -2111524926L, -492473560L, 1458813356L, 2128198972L,
1727953778L, 1780084016L, 2085283620L, -794434248L, 1201063642L,
-1266106480L, 2144623356L, -2009719916L, 1199699010L, 457315584L,
-1356306372L, -356229808L, -745814878L, 1105431368L, 2034040588L,
1996013276L, -1688965294L, -977889664L, 1701009492L, -623107096L,
1863740154L, 621696336L, 361594668L, 951788404L, -307715182L,
-1529474464L, -1827178612L, -1025583264L, 1314161634L, 1732451240L,
-1528935220L, 449542652L, -1009576014L, 351446512L, 843335108L,
-394815144L, 1081855354L, 418594224L, 1230742460L, 2137256020L,
479137730L, -408412128L, -21450564L, -1769629104L, 747030498L,
-225411480L, 752627276L, 519557372L, -1811408910L, -1726651136L,
-715824268L, -410345464L, 1921962426L, -1867391536L, -2088430612L,
-207385708L, 415013842L, 1946281824L, -455933620L, -1656114912L,
-1625723006L, -583629080L, 1032558828L, -1416694212L, -92392590L,
1821832816L, -815291356L, -1639878856L, -337626982L, -2058506544L,
-539784004L, -127210860L, 190800514L, 414838976L, 42804540L,
-505825136L, -1774889438L, 77394440L, 618036492L, -1484225892L,
-1595558382L, 313596288L, -640329196L, -487809496L, 1364394298L,
1911811024L, -1700802964L, 1239442228L, 1693770386L, -555375264L,
1812815180L, 1402418144L, -1031254110L, 640597928L, 526100364L,
1588828831L, -1895979223L, -1245045826L, 945212076L, 1583126525L,
-1267523769L, 1756831792L, -647059122L, 699724155L, 2095548125L,
-757973238L, -2067810776L, 465411729L, 1797633667L, -1365973116L,
1216830898L, 1683886855L, -453826223L, -1575980842L, -819044940L,
-268355131L, -794560241L, 559105928L, 17102502L, 1045309011L,
-83060811L, -1711540974L, -1812017440L, -329526455L, 1598459259L,
-858191796L, -2018700838L, -1103336497L, 1436738969L, 1225768366L,
-1771107652L, -462066899L, -47261481L, -1330388704L, -1867585922L,
106131531L, 417057357L, 834967674L, -790501192L, -1300428191L,
505982995L, 721165876L, 2128377922L, 1530624087L, -2036032607L,
183950438L, -1227672476L, -1153495147L, 1038870719L, -1338910632L,
-380793738L, -2079583101L, 1751839941L, -611978782L, -917309360L,
-1054684807L, -639906133L, 963347996L, 1724199242L, -26697409L,
1664288649L, -524851298L, -1008994740L, -2015920099L, -1931751001L,
1511347408L, 682858094L, 1447342363L, 234267261L, 1808087146L,
-459897592L, 886362545L, 758654499L, -249945308L, 600163282L,
-1731280857L, 384998001L, -733065034L, -768071596L, 481401061L,
-1429130001L, -1130776536L, 1699122310L, -1578416461L, 490784789L,
-884200846L, -817121600L, 380013353L, 1714604443L, -191426964L,
334082682L, -1102318673L, -200808775L, 1578748110L, 762962460L,
553494925L, 486500343L, 1471245056L, -621735586L, 1265857067L,
616258861L, 2076672410L, 1801944920L, -1514498367L, 1009510771L,
1827652244L, 1034811170L, 641847863L, 616822529L, 310406662L,
-1516284860L, -863640203L, 170551519L, 651926584L, 1879997462L,
1192786979L, -258184219L, -131604990L, 1110570096L, -1111145959L,
646441995L, 840232700L, -1732114646L, 27076959L, -146202391L,
-306456578L, 689265644L, -1570001091L, 1940697863L, -813798672L,
-826010994L, -618402629L, -521688931L, -17632182L, -1697304984L,
-2128264623L, 77873475L, 668678212L, -83445006L, 1291567815L,
1196408977L, -1012953578L, 1037962356L, 1441482501L, -1392900785L,
-171362488L, -967721242L, -1653563373L, 2028828533L, -1790511278L,
997011360L, 155392649L, -2067505477L, 594963724L, -1283649382L,
-1911233009L, 1612436057L, 1402801902L, 2079231356L, 2006248941L,
1295321751L, -746961568L, -750091970L, -1141061109L, -394766781L
)
# Collect the leaf descendants of `class` in a parent/child table and return
# their structure records.
#
# Arguments:
#   class           value looked up in parentTable$parent (the root to expand).
#   chemical_table  data frame with one row per compound.
#   parentTable     data frame with columns 'parent' and 'child'.
#   id_col, inchi_col, smiles_col
#                   names of the id / InChI / SMILES columns of chemical_table.
#
# Returns the rows of chemical_table (columns id_col, inchi_col, smiles_col)
# whose id is a descendant of `class` that is not itself listed as a parent.
# The InChI column is reduced to the slash-delimited segment starting with
# 'c' (presumably the connectivity layer) when the string has that shape, and
# rows whose SMILES is empty or contains '*' are dropped.
.create_df <-
function(class, chemical_table, parentTable, id_col, inchi_col, smiles_col) {
  regex = 'InChI.+/.+/(c.+?)/.+'
  child_list = parentTable[parentTable$parent == class, 'child']

  # Remember every node that has already been replaced by its children. A
  # node reached a second time (shared sub-tree, or a cycle in parentTable)
  # is dropped instead of expanded again; without this a cycle would keep
  # the loop running forever.
  expanded = class
  repeat {
    is_parent = child_list %in% parentTable$parent
    leaves = child_list[!is_parent]
    middle_parents = child_list[is_parent & !(child_list %in% expanded)]
    if(length(middle_parents) == 0) {
      child_list = unique(leaves)
      break
    }
    expanded = c(expanded, middle_parents)
    next_child = parentTable[parentTable$parent %in% middle_parents, 'child']
    child_list = unique(c(leaves, next_child))
  }

  child_table = chemical_table[chemical_table[[id_col]] %in% child_list, c(id_col, inchi_col, smiles_col)]
  child_table[[inchi_col]] = sub(regex, '\\1', child_table[[inchi_col]])

  # Rows without a usable structure: empty SMILES or a '*' wildcard atom.
  ind_empty_smiles = grep('^$', child_table[[smiles_col]])
  ind_start_smiles = grep('\\*', child_table[[smiles_col]])
  ind_remove = unique(c(ind_empty_smiles, ind_start_smiles))
  # Guard needed: x[-integer(0), ] would select zero rows, not all rows.
  if(length(ind_remove) > 0) {
    child_table = child_table[-ind_remove,]
  }
  return(child_table)
}
# Sum the atom counts of one or more molecular formulas.
#
# Arguments:
#   formula_vector  character vector of formulas such as "C6H12O6".
#
# Returns a one-dimensional array (as produced by tapply) named by element
# symbol, holding the total count of each element over all formulas given.
# An element symbol without a number counts as 1.
#
# Uses only base R (regmatches/gregexpr), so it does not depend on a string
# package being attached.
.formula2matrix <-
function(formula_vector) {
  # One token = an element symbol (capital + optional lowercase) and digits.
  pattern = "([A-Z]{1}[a-z]?)([0-9]*)"
  formula_vector = as.character(formula_vector)
  tokenized_formula = unlist(regmatches(formula_vector, gregexpr(pattern, formula_vector)))

  atoms = sub(pattern, '\\1', tokenized_formula)
  numbers = sub(pattern, '\\2', tokenized_formula)
  # Add 1 if atom doesn't have number
  numbers[numbers == ''] = '1'
  numbers = as.numeric(numbers)

  atom_df = tapply(numbers, INDEX = list(atoms), FUN = sum)
  return(atom_df)
}
# Number of carbon atoms in the formula of compound `id`.
#
# Arguments:
#   id              identifier compared against chemical_table[[id_col]].
#   chemical_table  data frame of compounds.
#   id_col          name of the identifier column.
#   formula_col     name of the molecular-formula column.
#
# Returns a numeric vector with one count per matching row whose formula
# contains carbon (taken from the first "C<digits>" token of that formula),
# or 0 when no matching formula contains carbon.
.get.c.num <-
function(id, chemical_table, id_col, formula_col) {
  # The look-ahead rejects a 'C' that is the first letter of a two-letter
  # symbol (Cl, Ca, Co, Cu, ...); a bare 'C[0-9]*' counted those as carbon.
  pattern_c = 'C(?![a-z])[0-9]*'
  formula = as.character(chemical_table[chemical_table[[id_col]] == id, formula_col])
  # regmatches() drops formulas without a match (and NA formulas).
  carbon = regmatches(formula, regexpr(pattern_c, formula, perl = TRUE))
  carbon = sub('^C', '', carbon)
  carbon[carbon == ''] = '1'   # a symbol without a number means one atom
  carbon = as.numeric(carbon)
  if(length(carbon) == 0) {
    carbon = 0
  }
  return(carbon)
}
# Look up the formula entries of the compound(s) whose id equals `id`.
#
# Arguments:
#   id              identifier compared against chemical_table[[id_col]].
#   chemical_table  table of compounds.
#   id_col          name of the identifier column.
#   formula_col     name of the column to return.
#
# Returns whatever chemical_table[rows, formula_col] yields for the matching
# rows (for a plain data frame: a vector, empty when nothing matches).
.get.formula <-
function(id, chemical_table, id_col, formula_col) {
  matching_rows <- chemical_table[[id_col]] == id
  chemical_table[matching_rows, formula_col]
}
# Number of hydrogen atoms in the formula of compound `id`.
#
# Arguments:
#   id              identifier compared against chemical_table[[id_col]].
#   chemical_table  data frame of compounds.
#   id_col          name of the identifier column.
#   formula_col     name of the molecular-formula column.
#
# Returns a numeric vector with one count per matching row whose formula
# contains hydrogen (taken from the first "H<digits>" token of that formula),
# or 0 when no matching formula contains hydrogen.
.get.h.num <-
function(id, chemical_table, id_col, formula_col) {
  # The look-ahead rejects an 'H' that starts a two-letter symbol
  # (He, Hg, Hf, Ho, Hs); a bare 'H[0-9]*' counted those as hydrogen.
  pattern_h = 'H(?![a-z])[0-9]*'
  formula = as.character(chemical_table[chemical_table[[id_col]] == id, formula_col])
  # regmatches() drops formulas without a match (and NA formulas).
  proton = regmatches(formula, regexpr(pattern_h, formula, perl = TRUE))
  proton = sub('^H', '', proton)
  proton[proton == ''] = '1'   # a symbol without a number means one atom
  proton = as.numeric(proton)
  if(length(proton) == 0) {
    proton = 0
  }
  return(proton)
}
# Number of oxygen atoms in the formula of compound `id`.
#
# Arguments:
#   id              identifier compared against chemical_table[[id_col]].
#   chemical_table  data frame of compounds.
#   id_col          name of the identifier column.
#   formula_col     name of the molecular-formula column.
#
# Returns a numeric vector with one count per matching row whose formula
# contains oxygen (taken from the first "O<digits>" token of that formula),
# or 0 when no matching formula contains oxygen.
.get.o.num <-
function(id, chemical_table, id_col, formula_col) {
  # The look-ahead rejects an 'O' that starts a two-letter symbol (Os, Og);
  # a bare 'O[0-9]*' read "OsO4" as a single oxygen.
  pattern_o = 'O(?![a-z])[0-9]*'
  formula = as.character(chemical_table[chemical_table[[id_col]] == id, formula_col])
  # regmatches() drops formulas without a match (and NA formulas).
  oxygen = regmatches(formula, regexpr(pattern_o, formula, perl = TRUE))
  oxygen = sub('^O', '', oxygen)
  oxygen[oxygen == ''] = '1'   # a symbol without a number means one atom
  oxygen = as.numeric(oxygen)
  if(length(oxygen) == 0) {
    oxygen = 0
  }
  return(oxygen)
}
# List the distinct participants named in one or more reaction equations.
#
# Arguments:
#   equation        character vector of equations, e.g. "2 A + B <=> C(in)".
#   direction_type  regular expressions for the arrows separating the two
#                   sides; each is applied in turn as a split pattern.
#
# Each side is split on " + ", a parenthesised part is removed, everything up
# to the first blank (a leading coefficient) is dropped, and the remainder is
# passed through trim(). trim() is not defined in this block and must be
# supplied by the surrounding code.
.get.participant <-
function(equation, direction_type = c(' <=> ', ' => ', ' <\\?> ')) {
  split_on <- function(pieces, arrow) unlist(strsplit(pieces, arrow))
  sides <- Reduce(split_on, direction_type, equation)

  terms <- unlist(strsplit(sides, ' \\+ '))
  without_parens <- sub('\\(.+\\)', '', terms)
  without_coefficient <- sub('.+? (.+)', '\\1', without_parens)

  unique(trim(without_coefficient))
}
# Look up the formula entries for an identifier that may be wrapped in '|'.
#
# Arguments:
#   id              identifier; every '|' character is removed before lookup.
#   chemical_table  table of compounds.
#   id_col          name of the identifier column.
#   formula_col     name of the column to return.
#
# Returns chemical_table[rows, formula_col] for the rows whose id equals the
# cleaned identifier.
.id2formula <-
function(id, chemical_table, id_col, formula_col) {
  bare_id <- gsub('\\|', '', id)
  matching_rows <- chemical_table[[id_col]] == bare_id
  chemical_table[matching_rows, formula_col]
}
# Parse the lines of a ChEBI OWL file into a compound table and attach the
# molecular formulas found in the ChEBI flat tables.
#
# Arguments:
#   owl                  character vector, one element per line of the OWL file.
#   chebi_compound       data frame with columns ID and PARENT_ID.
#   chebi_chemical_data  data frame with columns TYPE, COMPOUND_ID and
#                        CHEMICAL_DATA.
#
# Returns a data frame with columns chebi, name, synonyms, smiles, inchi,
# kegg, parent and formula; multiple synonyms / parents / formulas are
# collapsed with "///".
#
# Relies on trim(), join() and data.table(), none of which is defined in this
# block (presumably supplied by packages attached elsewhere).
.parse.chebi <-
function(owl,chebi_compound, chebi_chemical_data) {
  # Classify every line once, so the per-line loop only does vector lookups.
  con1 = grepl(' <owl:Class rdf:about="http://purl.obolibrary.org/obo/CHEBI_',owl)
  con2 = grepl(' <rdfs:label rdf:datatype="http://www.w3.org/2001/XMLSchema#string">', owl)
  con3 = grepl(' <obo2:Synonym rdf:datatype="http://www.w3.org/2001/XMLSchema#string">', owl)
  con4 = grepl(' <obo2:SMILES rdf:datatype="http://www.w3.org/2001/XMLSchema#string">', owl)
  con5 = grepl(' <obo2:InChI rdf:datatype="http://www.w3.org/2001/XMLSchema#string">', owl)
  con6 = grepl(' <obo2:xref rdf:datatype="http://www.w3.org/2001/XMLSchema#string">kegg COMPOUND:C', owl)
  con7 = grepl(' <rdfs:subClassOf rdf:resource="http://purl.obolibrary.org/obo/CHEBI_', owl)
  con8 = grepl("</owl:Class>",owl)

  # Fields of the entry currently being read.
  id = ""
  name = ""
  synonym = c()
  smiles = ""
  inchi = ""
  kegg = ""
  parent = c()

  # Parsing starts at the comment line announcing the CHEBI_100 entry.
  n_lines = length(owl)
  entry_start = grep('<\\!-- http://purl.obolibrary.org/obo/CHEBI_100 -->', owl)
  if(length(entry_start) == 0) {
    stop("start marker for CHEBI_100 not found in 'owl'", call. = FALSE)
  }
  entry_start = entry_start[1]

  cat("parsing", n_lines, 'lines\n')

  # One slot per closing tag: filled in place, bound into a matrix once at
  # the end instead of growing a matrix with rbind() on every entry.
  records = vector("list", sum(con8[entry_start:n_lines]))
  n_records = 0

  for (i in entry_start:n_lines) {
    if(i == floor(n_lines/10)) {
      cat('10% finished\n')
    } else if(i == floor(n_lines/5)) {
      cat('20% finished\n')
    } else if(i == floor(n_lines/2)) {
      cat('50% finished\n')
    }

    if(con1[i]) { # ID
      regexp = '( <owl:Class rdf:about=\"http://purl.obolibrary.org/obo/CHEBI_)(.*)(">)'
      id = sub(pattern = regexp, replacement = "\\2", x = owl[i])
      id = trim(id)
    } else if(con2[i]) { # Name
      regexp = '( <rdfs:label rdf:datatype=\"http://www.w3.org/2001/XMLSchema#string\">)(.*)(</rdfs:label>)'
      name = sub(pattern = regexp, replacement = "\\2", x = owl[i])
      name = trim(name)
    } else if(con3[i]) { # synonym
      regexp = '( <obo2:Synonym rdf:datatype="http://www.w3.org/2001/XMLSchema#string">)(.*)(</obo2:Synonym>)'
      synonym = c(synonym, sub(pattern = regexp, replacement = '\\2', x = owl[i]))
      synonym = trim(synonym)
    } else if(con4[i]) { # smiles
      regexp = '( <obo2:SMILES rdf:datatype=\"http://www.w3.org/2001/XMLSchema#string\">)(.*)(</obo2:SMILES>)'
      smiles = sub(pattern = regexp, replacement = '\\2', x = owl[i])
      smiles = trim(smiles)
    } else if(con5[i]) { # inchi
      regexp = '( <obo2:InChI rdf:datatype=\"http://www.w3.org/2001/XMLSchema#string\">)(.*)(</obo2:InChI>)'
      inchi = sub(pattern = regexp, replacement = '\\2', x = owl[i])
      inchi = trim(inchi)
    } else if(con6[i]) { # kegg
      regexp = '( <obo2:xref rdf:datatype=\"http://www.w3.org/2001/XMLSchema#string\">kegg COMPOUND:)(.*)(</obo2:xref>)'
      kegg = sub(pattern = regexp, replacement = '\\2', x = owl[i])
      kegg = trim(kegg)
    } else if(con7[i]) { # subClassOf
      regexp = '( <rdfs:subClassOf rdf:resource="http://purl.obolibrary.org/obo/CHEBI_)(.*)("/>)'
      parent = c(parent, sub(pattern = regexp, replacement = '\\2', x = owl[i]))
      parent = trim(parent)
    } else if(con8[i]) { # end of entry: store it and reset the fields
      n_records = n_records + 1
      records[[n_records]] = c(id, name, paste(synonym, collapse="///"), smiles, inchi, kegg, paste(parent, collapse="///"))
      id = ""
      name = ""
      synonym = c()
      parent = c()
      smiles = ""
      inchi = ""
      kegg = ""
    }
  }
  chebi = do.call(rbind, records[seq_len(n_records)])

  # Chemical formula
  # Map every compound id to the ChEBI id it belongs to: its own id, or its
  # PARENT_ID where one is given. Selecting 'ID' twice yields 'ID' and 'ID.1'.
  chebi_compound2 = chebi_compound[,c('ID', 'ID')]
  ind_parent = grep('[0-9]', chebi_compound$PARENT_ID)
  chebi_compound2[ind_parent, 'ID.1'] = chebi_compound$PARENT_ID[ind_parent]
  colnames(chebi_compound2) = c('id', 'chebi')
  chebi_formula = chebi_chemical_data[chebi_chemical_data$TYPE == 'FORMULA', c('COMPOUND_ID', 'CHEMICAL_DATA')]
  colnames(chebi_formula) = c('id', 'formula')
  chebi_formula2 = join(chebi_compound2, chebi_formula, by='id')
  chebi_formula2 = chebi_formula2[!is.na(chebi_formula2$formula),]
  chebi_formula2 = unique(chebi_formula2[,c('chebi', 'formula')])
  chebi_formula3 = data.table(chebi_formula2)
  chebi_formula3 = chebi_formula3[,lapply(.SD, paste, collapse='///'), by="chebi", .SDcols=c("formula")]

  # convert HTML code: the XML source escapes apostrophes as &apos;
  chebi = gsub("&apos;", "'", chebi)

  # Join chebi + formula
  colnames(chebi) = c("chebi", "name", "synonyms", "smiles", "inchi", "kegg", "parent")
  rownames(chebi) = 1:nrow(chebi)
  chebi = as.data.frame(chebi, stringsAsFactors=FALSE)
  chebi_formula3 = as.data.frame(chebi_formula3, stringsAsFactors=FALSE)
  chebi2 = join(chebi, chebi_formula3, by='chebi')
  return(chebi2)
}
# Parse the lines of a Rhea BioPAX (OWL) file into a reaction table.
#
# Arguments:
#   owl  character vector, one element per line of the file.
#
# Returns a data frame with one row per reaction: rheaId, reactionType, the
# fields listed in `entry` that were present, plus equationWithChebi (the
# equation with compound names replaced by ChEBI ids) and equationParticipant
# (comma-separated ChEBI ids). NA values are replaced by ''.
#
# Steps: (1) read reaction records, (2) read the compound name -> ChEBI id
# list, (3) rewrite each equation with that list.
#
# Relies on trim() and rbind.fill(), neither of which is defined in this
# block (presumably supplied by packages attached elsewhere).
.parse.rhea <-
function(owl) {
  # The file is XML, so '>', '<' and "'" arrive escaped. They have to be
  # decoded before the equations are split on ' => ', ' <=> ' or ' <?> '.
  decodeEntities = function(x) {
    x = gsub(pattern = "&gt;", replacement = ">", x = x)
    x = gsub(pattern = "&lt;", replacement = "<", x = x)
    x = gsub(pattern = "&apos;", replacement = "'", x = x)
    x
  }

  entry = c('biochemicalReaction', 'transportReaction', 'equationWithCommonName', 'ecNumber', 'metacyc', 'kegg', 'sameParticipant', 'mapped', 'formuled', 'polymerization', 'chemicallyBalanced', 'iubmb', 'status', 'transport', 'direction', 'classOfReactions', 'closeBiochemicalReaction', 'closeTransport')

  # define regular expression
  regexp = list()
  regexp[[entry[1]]] = '(<bp:biochemicalReaction rdf:about="http://identifiers\\.org/rhea/)(.*)(">)' # biochemical reaction
  regexp[[entry[2]]] = '(<bp:transport.* rdf:about=")(.*)(">)' # transport reaction
  regexp[[entry[3]]] = '(<bp:NAME .*>)(.*)(</bp:NAME>)' # reaction equation expressed by chemical name
  regexp[[entry[4]]] = '(<bp:EC-NUMBER .*>)(.*)(</bp:EC-NUMBER>)' # EC number
  regexp[[entry[5]]] = '(<bp:XREF rdf:resource="#METACYC:)(.*)(" />)' # cross-reference to MetaCyc
  regexp[[entry[6]]] = '(<bp:XREF rdf:resource="#KEGG_REACTION:)(.*)(" />)' # cross-reference to KEGG
  regexp[[entry[7]]] = '(<bp:XREF rdf:resource="#rel/../RHEA:)(.*)(" />)' # same participants, different direction
  regexp[[entry[8]]] = '(<bp:COMMENT rdf:datatype = .+>RHEA:Mapped=)(.*)(</bp:COMMENT>)'
  regexp[[entry[9]]] = '(<bp:COMMENT rdf:datatype = .+>RHEA:Formuled=)(.*)(</bp:COMMENT>)'
  regexp[[entry[10]]] = '(<bp:COMMENT rdf:datatype = .+>RHEA:Polymerization=)(.*)(</bp:COMMENT>)'
  regexp[[entry[11]]] = '(<bp:COMMENT rdf:datatype = .+>RHEA:Chemically balanced=)(.*)(</bp:COMMENT>)'
  regexp[[entry[12]]] = '(<bp:COMMENT rdf:datatype = .+>RHEA:IUBMB=)(.*)(</bp:COMMENT>)'
  regexp[[entry[13]]] = '(<bp:COMMENT rdf:datatype = .+>RHEA:Status=)(.*)(</bp:COMMENT>)'
  regexp[[entry[14]]] = '(<bp:COMMENT rdf:datatype = .+>RHEA:Transport=)(.*)(</bp:COMMENT>)'
  regexp[[entry[15]]] = '(<bp:COMMENT rdf:datatype = .+>RHEA:Direction=)(.*)(</bp:COMMENT>)'
  regexp[[entry[16]]] = '(<bp:COMMENT rdf:datatype = .+>RHEA:Class of reactions=)(.*)(</bp:COMMENT>)'
  regexp[[entry[17]]] = '</bp:biochemicalReaction>' # close tag
  regexp[[entry[18]]] = '</bp:transport' # close tag

  # pre-calculate conditions (TRUE/FALSE) for fast 'for' loop operation
  condition = list()
  for(i in entry) {
    condition[[i]] = grepl(regexp[[i]], owl)
  }

  # parsing using regular expressions
  nLines = length(owl)
  cat("parsing", nLines, 'lines\n')
  # Finished records are collected in a list and bound once after the loop;
  # binding inside the loop copies the whole table for every reaction.
  reactionRows = list()
  rheaReactionRow = list()
  for(i in seq_len(nLines)) {
    if(i == floor(nLines/10)) {
      cat('10% finished\n')
    } else if(i == floor(nLines/5)) {
      cat('20% finished\n')
    } else if(i == floor(nLines/2)) {
      cat('50% finished\n')
    }
    for(j in seq_along(entry)) {
      # tag start
      if(j %in% 1:2 && condition[[entry[j]]][i]) {
        rheaReactionRow = list()
        value = unlist(strsplit(owl[i], split = '"'))[2]
        value = tail(unlist(strsplit(value, split = '/')), 1)
        rheaReactionRow[['rheaId']] = c(rheaReactionRow[[entry[1]]], value)
        rheaReactionRow[['reactionType']] = entry[j]
      }
      else if(j %in% 3:16 && condition[[entry[j]]][i]) {
        # Repeated fields are joined with ','. The first value is pasted onto
        # a missing entry, which leaves a leading ',' removed further down.
        value = sub(regexp[[entry[j]]], '\\2', owl[i])
        rheaReactionRow[[entry[j]]] = paste(rheaReactionRow[[entry[j]]], value, sep=',')
      }
      # tag close
      else if(j %in% 17:18 && condition[[entry[j]]][i]) {
        reactionRows[[length(reactionRows) + 1]] = as.data.frame(rheaReactionRow, stringsAsFactors = FALSE)
        rheaReactionRow = list()
      }
    }
  }
  # The leading empty data frame keeps the result a data frame even when no
  # reaction was found.
  rheaReaction = rbind.fill(c(list(data.frame()), reactionRows))

  # post process
  for(i in names(rheaReaction)) {
    rheaReaction[[i]] = sub('^,', '', rheaReaction[[i]])
    rheaReaction[[i]] = gsub('%2b', '+', rheaReaction[[i]])
  }
  rheaReaction = trim(rheaReaction)
  # 'left to right' -> 'left-to-right', matching the names of directionList
  rheaReaction$direction = gsub(' ', '-', rheaReaction$direction)
  numberOfBiochemicalReaction = length(unique(rheaReaction[rheaReaction$reactionType == 'biochemicalReaction', 'rheaId']))
  numberOfTransportReaction = length(unique(rheaReaction[rheaReaction$reactionType == 'transportReaction', 'rheaId']))
  cat('# of biochemical reaction: ', numberOfBiochemicalReaction, '\n')
  cat('# of transport reaction: ', numberOfTransportReaction, '\n')

  # replace html code
  rheaReaction[,'equationWithCommonName'] = decodeEntities(rheaReaction[,'equationWithCommonName'])

  # Sorting
  rheaReaction = rheaReaction[order(rheaReaction[,'rheaId']),]
  rownames(rheaReaction) = seq_len(nrow(rheaReaction))

  ##### Parsing equation expressed with ChEBI ID #####
  cat("Parsing equation expressed with ChEBI ID\n")
  entry = c('compoundId', 'name', 'chebiId', 'close')

  # define regular expression
  regexp = list()
  regexp[[entry[1]]] = '(<bp:physicalEntity rdf:about="#compound:)(.+)(">)' # ChEBI ID
  regexp[[entry[2]]] = '(<bp:NAME rdf:datatype .+>)(.*)(</bp:NAME>)' # compound common name used in equation
  regexp[[entry[3]]] = '(<bp:COMMENT rdf:datatype = "http://www.w3.org/2001/XMLSchema#string">.* CHEBI:)(.*)(</bp:COMMENT>)'
  regexp[[entry[4]]] = '</bp:physicalEntity>' # close

  # pre-calculate conditions (TRUE/FALSE) for fast 'for' loop operation
  condition = list()
  for(i in entry) {
    condition[[i]] = grepl(regexp[[i]], owl)
  }

  # parsing using regular expressions
  cat("Parsing owl to create list mapping chemical common name into ChEBI ID\n")
  chemicalRows = list()
  rheaChemicalRow = list()
  for(i in seq_len(nLines)) {
    for(j in seq_along(entry)) {
      # Tag start. Please change the number in if statement depending on length of entry
      if(j == 1 && condition[[entry[j]]][i]) {
        rheaChemicalRow = list()
        value = unlist(strsplit(owl[i], split = '"'))[2]
        value = sub('#', '', value)
        rheaChemicalRow[[entry[j]]] = c(rheaChemicalRow[[entry[1]]], value)
      }
      else if(j %in% 2:3 && condition[[entry[j]]][i]) {
        value = sub(regexp[[entry[j]]], '\\2', owl[i])
        rheaChemicalRow[[entry[j]]] = value
      }
      # Tag close
      else if(j == 4 && condition[[entry[j]]][i]) {
        chemicalRows[[length(chemicalRows) + 1]] = as.data.frame(rheaChemicalRow, stringsAsFactors = FALSE)
        rheaChemicalRow = list()
      }
    }
  }
  rheaChemical = rbind.fill(c(list(data.frame()), chemicalRows))
  rheaChemical = trim(rheaChemical)
  rheaChemical[,'name'] = decodeEntities(rheaChemical[,'name'])
  numberOfChemicals = nrow(rheaChemical)
  cat("# of chemicals used in Rhea: ", numberOfChemicals, '\n')

  ##### Building equation with the list #####
  cat("Building equation with the list\n")

  # Define direction
  directionList = list()
  directionList[['undefined']] = ' <?> '
  directionList[['bidirectional']] = ' <=> '
  directionList[['right-to-left']] = ' => '
  directionList[['left-to-right']] = ' => '

  # Define regular expressions
  regexp_coefficient = "(^\\(?[0-9]*n?\\+?[0-9]*\\)? )(.+)"
  regexp_localization = "(.+)(\\([inout]+\\))"

  # Translate one side of an equation. Returns the rewritten terms
  # (coefficient + ChEBI id + localization, or "Unknown" when a name maps to
  # several ids) and the ChEBI ids that were found.
  translateSide = function(side) {
    terms = unlist(strsplit(side, split = " \\+ "))
    termsWOcoefficient = sub(regexp_coefficient, "\\2", terms)
    chebiIds = c()
    for(j in seq_along(terms)) {
      if(grepl(regexp_coefficient, terms[j])) { # process coefficient
        coefficient = sub(regexp_coefficient, "\\1", terms[j])
      } else {coefficient = ""}
      if(grepl(regexp_localization, terms[j])) { # process (in), (out)
        localization = sub(regexp_localization, "\\2", x = terms[j])
      } else {localization = ""}
      termsWOcoefficient[j] = gsub("\\(in\\)", "", termsWOcoefficient[j])
      termsWOcoefficient[j] = gsub("\\(out\\)", "", termsWOcoefficient[j])
      chebi = rheaChemical[rheaChemical$name == termsWOcoefficient[j], 'chebiId']
      if(length(chebi) > 1) {
        terms[j] = "Unknown"
      } else {
        terms[j] = paste(coefficient, chebi, localization, sep="")
      }
      chebiIds = c(chebiIds, chebi)
    }
    list(terms = terms, chebi = chebiIds)
  }

  # Conversion (split - conversion - paste)
  equationWithChebi = character(nrow(rheaReaction))
  equationParticipant = character(nrow(rheaReaction))
  for(i in seq_len(nrow(rheaReaction))) {
    arrow = directionList[[rheaReaction[i, 'direction']]]
    # '?' is a regex metacharacter and must be escaped before splitting
    arrow2 = sub('\\?', '\\\\?', arrow)
    participants = unlist(strsplit(rheaReaction[i,'equationWithCommonName'], split=arrow2))
    left = translateSide(participants[1])
    right = translateSide(participants[2])
    equationWithChebi[i] = paste(paste(left$terms, collapse = " + "), paste(right$terms, collapse = " + "), sep = arrow)
    equationParticipant[i] = paste(c(left$chebi, right$chebi), collapse=',')
  }
  result = cbind(rheaReaction, I(equationWithChebi), I(equationParticipant))
  result[is.na(result)] = ''
  return(result)
}
# Return the first element of a vector that has more than one element.
#
# Arguments:
#   vector  any vector.
#
# Returns vector[1] when length(vector) > 1; otherwise NULL, invisibly.
# NOTE(review): a length-1 input therefore yields NULL rather than its single
# element -- confirm with the callers whether that is intended.
.unique_column <-
function(vector) {
  has_several <- length(vector) > 1
  if (!has_several) {
    return(invisible(NULL))
  }
  vector[1]
}
# Strip HTML markup and decode HTML entities in every column of a data frame.
#
# Arguments:
#   dataFrame  data frame whose columns hold text.
#
# Returns a copy of dataFrame in which every column has been passed through
# the substitutions below, applied in the listed order. Each column comes
# back as character, because gsub() returns character.
.convert.html = function(dataFrame) {
  # c(text to find, replacement). Order matters: the arrow entities swallow
  # their surrounding blanks before the ' = ' rule runs, so an arrow is never
  # rewritten a second time.
  replacements = list(
    c("<i>", ""),
    c("</i>", ""),
    c("&beta;", "beta"),
    c("<SUP>", ""),
    c("</SUP>", ""),
    c("<sup>", ""),
    c("</sup>", ""),
    c("<sub>", ""),
    c("</sub>", ""),
    c(" &rarr; ", "=>"),
    c(" &larr; ", "<="),
    c(" &harr; ", "<=>"),
    c(" = ", " <=> "),
    c("&pi;", "pi"),
    c("&alpha;", "alpha"),
    c("<I>", ""),
    c("</I>", ""),
    c("<SUB>", ""),
    c("</SUB>", ""),
    # Apostrophe, in both its named and numeric spelling.
    c("&apos;", "'"),
    c("&#39;", "'"),
    c("&gt;", ">"),
    c("&gamma;", "gamma")
  )

  dataFrame_result = dataFrame
  for(i in colnames(dataFrame)) {
    column = dataFrame_result[[i]]
    for(rule in replacements) {
      # All search texts are literal, so no regular expression is needed.
      column = gsub(rule[1], rule[2], column, fixed = TRUE)
    }
    dataFrame_result[[i]] = column
  }
  return(dataFrame_result)
}
.parse.biopax <-
function(biopax) {
### Define regula expression
## BiochemicalReaction set
regex_biochemicalReaction_tag = '<bp:BiochemicalReaction rdf:ID=".+">'
regex_biochemicalReaction_close = '</bp:BiochemicalReaction>'
regex_standardName_tag = '<bp:standardName rdf:datatype="http://www.w3.org/2001/XMLSchema#string">(.+)</bp:standardName>'
regex_eCNumber_tag = '<bp:eCNumber rdf:datatype="http://www.w3.org/2001/XMLSchema#string\">(.+)</bp:eCNumber>'
regex_xref_tag = '<bp:xref rdf:resource=\"#(.+)"/>'
regex_left_tag = '<bp:left rdf:resource=\"#(.+)"/>'
regex_right_tag = '<bp:right rdf:resource=\"#(.+)"/>'
regex_participantStoichiometry_tag = '<bp:participantStoichiometry rdf:resource=\"#(.+)"/>'
index_biochemicalReaction_tag = grepl(regex_biochemicalReaction_tag, biopax)
index_biochemicalReaction_close = grepl(regex_biochemicalReaction_close, biopax)
index_standardName_tag = grepl(regex_standardName_tag, biopax)
index_eCNumber_tag = grepl(regex_eCNumber_tag, biopax)
index_xref_tag = grepl(regex_xref_tag, biopax)
index_left_tag = grepl(regex_left_tag, biopax)
index_right_tag = grepl(regex_right_tag, biopax)
index_participantStoichiometry_tag = grepl(regex_participantStoichiometry_tag, biopax)
## Transport set
regex_Transport_tag = '<bp:Transport rdf:ID=".+>'
regex_Transport_close = '</bp:Transport>'
index_Transport_tag = grepl(regex_Transport_tag, biopax)
index_Transport_close = grepl(regex_Transport_close, biopax)
## TransportWithBiochemicalReaction set
regex_TransportWithBiochemicalReaction_tag = '<bp:TransportWithBiochemicalReaction rdf:ID=".+">'
regex_TransportWithBiochemicalReaction_close = '</bp:TransportWithBiochemicalReaction>'
index_TransportWithBiochemicalReaction_tag = grepl(regex_TransportWithBiochemicalReaction_tag, biopax)
index_TransportWithBiochemicalReaction_close = grepl(regex_TransportWithBiochemicalReaction_close, biopax)
## ComplexAssembly set
regex_ComplexAssembly_tag = '<bp:ComplexAssembly rdf:ID=\".+">'
regex_ComplexAssembly_close = '</bp:ComplexAssembly>'
index_ComplexAssembly_tag = grepl(regex_ComplexAssembly_tag, biopax)
index_ComplexAssembly_close = grepl(regex_ComplexAssembly_close, biopax)
## UnificationXref & RelationshipXref set
regex_UnificationXref_tag = '<bp:UnificationXref rdf:ID="(.+)">'
regex_UnificationXref_close = '</bp:UnificationXref>'
regex_RelationshipXref_tag = '<bp:RelationshipXref rdf:ID="(.+)">'
regex_RelationshipXref_close = '</bp:RelationshipXref>'
regex_xref_integrated_tag = "<bp:.+ rdf:ID=\"(.+)\">"
regex_id_tag = '<bp:id rdf:datatype=\"http://www.w3.org/2001/XMLSchema#string\">(.+)</bp:id>'
regex_db_tag = '<bp:db rdf:datatype=\"http://www.w3.org/2001/XMLSchema#string\">(.+)</bp:db>'
index_UnificationXref_tag = grepl(regex_UnificationXref_tag, biopax)
index_UnificationXref_close = grepl(regex_UnificationXref_close, biopax)
index_RelationshipXref_tag = grepl(regex_RelationshipXref_tag, biopax)
index_RelationshipXref_close = grepl(regex_RelationshipXref_close, biopax)
index_id_tag = grepl(regex_id_tag, biopax)
index_db_tag = grepl(regex_db_tag, biopax)
## Small Molecule set
regex_SmallMolecule_tag = '<bp:SmallMolecule rdf:ID=\"(.+)">'
regex_SmallMolecule_close = '</bp:SmallMolecule>'
regex_cellularLocation_tag = '<bp:cellularLocation rdf:resource="#(.+)"/>'
index_SmallMolecule_tag = grepl(regex_SmallMolecule_tag, biopax)
index_SmallMolecule_close = grepl(regex_SmallMolecule_close, biopax)
index_cellularLocation_tag = grepl(regex_cellularLocation_tag, biopax)
## Protein set
regex_Protein_tag = '<bp:Protein rdf:ID="(.+)">'
regex_Protein_close = '</bp:Protein>'
index_Protein_tag = grepl(regex_Protein_tag, biopax)
index_Protein_close = grepl(regex_Protein_close, biopax)
## Complext set
regex_Complex_tag = '<bp:Complex rdf:ID="(.+)">'
regex_Complex_close = '</bp:Complex>'
index_Complex_tag = grepl(regex_Complex_tag, biopax)
index_Complex_close = grepl(regex_Complex_close, biopax)
## Rna set
regex_Rna_tag = '<bp:Rna rdf:ID="(.+)">'
regex_Rna_close = '</bp:Rna>'
index_Rna_tag = grepl(regex_Rna_tag, biopax)
index_Rna_close = grepl(regex_Rna_close, biopax)
## CellularLocationVocabulary set
regex_CellularLocationVocabulary_tag = '<bp:CellularLocationVocabulary rdf:ID=\"(.+)">'
regex_term_tag = '<bp:term rdf:datatype=\"http://www.w3.org/2001/XMLSchema#string\">(.+)</bp:term>'
regex_CellularLocationVocabulary_close = '</bp:CellularLocationVocabulary>'
index_CellularLocationVocabulary_tag = grepl(regex_CellularLocationVocabulary_tag, biopax)
index_term_tag = grepl(regex_term_tag, biopax)
index_CellularLocationVocabulary_close = grepl(regex_CellularLocationVocabulary_close, biopax)
## Flags
biochemicalReaction_flag = FALSE
Transport_flag = FALSE
TransportWithBiochemicalReaction_flag = FALSE
UnificationXref_flag = FALSE
SmallMolecule_flag = FALSE
CellularLocationVocabulary_flag = FALSE
Protein_flag = FALSE
Complex_flag = FALSE
Rna_flag = FALSE
ComplexAssembly_flag = FALSE
## Read each line and parsing
ec_numbers = c()
xrefs_biochemicalReaction = c()
xrefs_Transport = c()
xrefs_TransportWithBiochemicalReaction = c()
xrefs_ComplexAssembly = c()
df_biochemicalReaction = c()
df_Transport = c()
df_TransportWithBiochemicalReaction = c()
df_ComplexAssembly = c()
df_smallMolecule = c()
df_Protein = c()
df_Complex = c()
df_Rna = c()
df_CellularLocationVocabulary = c()
tmp_xrefs_smallMolecule = c()
tmp_xrefs_Protein = c()
tmp_xrefs_Complex = c()
tmp_xrefs_Rna = c()
lefts = c()
rights = c()
df_UnificationXref = c()
participantStoichiometries = c()
cellularLocation = ''
for(i in 1:length(biopax)) {
# Set flag
if(index_biochemicalReaction_tag[i]) {
biochemicalReaction_flag = TRUE
} else if(index_Transport_tag[i]) {
Transport_flag = TRUE
} else if(index_TransportWithBiochemicalReaction_tag[i]) {
TransportWithBiochemicalReaction_flag = TRUE
} else if(index_ComplexAssembly_tag[i]) {
ComplexAssembly_flag = TRUE
} else if(index_UnificationXref_tag[i] || index_RelationshipXref_tag[i]) {
UnificationXref_flag = TRUE
} else if(index_SmallMolecule_tag[i]) {
SmallMolecule_flag = TRUE
} else if(index_CellularLocationVocabulary_tag[i]) {
CellularLocationVocabulary_flag = TRUE
} else if(index_Protein_tag[i]) {
Protein_flag = TRUE
} else if(index_Complex_tag[i]) {
Complex_flag = TRUE
} else if(index_Rna_tag[i]) {
Rna_flag = TRUE
}
# Parsing
if(biochemicalReaction_flag) {
if(index_standardName_tag[i]) {
reaction_equation_name = trim(sub(regex_standardName_tag, '\\1', biopax[i]))
} else if (index_eCNumber_tag[i]) {
ec_numbers = c(ec_numbers, trim(sub(regex_eCNumber_tag, '\\1', biopax[i])))
} else if(index_xref_tag[i]) {
xrefs_biochemicalReaction = c(xrefs_biochemicalReaction, trim(sub(regex_xref_tag, '\\1', biopax[i])))
} else if(index_left_tag[i]) {
lefts = c(lefts, trim(sub(regex_left_tag, '\\1', biopax[i])))
} else if(index_right_tag[i]) {
rights = c(rights, trim(sub(regex_right_tag, '\\1', biopax[i])))
} else if(index_participantStoichiometry_tag[i]) {
participantStoichiometries = c(participantStoichiometries, trim(sub(regex_participantStoichiometry_tag, '\\1', biopax[i])))
} else if(index_biochemicalReaction_close[i]) {
paste_xrefs_biochemicalReaction = paste(xrefs_biochemicalReaction, collapse='///')
paste_lefts = paste(lefts, collapse='///')
paste_rights = paste(rights, collapse='///')
paste_ec_numbers = paste(ec_numbers, collapse='///')
paste_participantStoichiometries = paste(participantStoichiometries, collapse='///')
df_biochemicalReaction = cbind(reaction_equation_name, paste_ec_numbers, paste_xrefs_biochemicalReaction, paste_lefts, paste_rights, paste_participantStoichiometries)
df_biochemicalReaction = data.frame(df_biochemicalReaction, stringsAsFactors=F)
biochemicalReaction_flag = FALSE
}
}
else if(Transport_flag) {
if(index_standardName_tag[i]) {
transport_equation_name = trim(sub(regex_standardName_tag, '\\1', biopax[i]))
} else if (index_eCNumber_tag[i]) {
ec_numbers = c(ec_numbers, trim(sub(regex_eCNumber_tag, '\\1', biopax[i])))
} else if(index_xref_tag[i]) {
xrefs_Transport = c(xrefs_Transport, trim(sub(regex_xref_tag, '\\1', biopax[i])))
} else if(index_left_tag[i]) {
lefts = c(lefts, trim(sub(regex_left_tag, '\\1', biopax[i])))
} else if(index_right_tag[i]) {
rights = c(rights, trim(sub(regex_right_tag, '\\1', biopax[i])))
} else if(index_participantStoichiometry_tag[i]) {
participantStoichiometries = c(participantStoichiometries, trim(sub(regex_participantStoichiometry_tag, '\\1', biopax[i])))
} else if(index_Transport_close[i]) {
paste_xrefs_Transport = paste(xrefs_Transport, collapse='///')
paste_lefts = paste(lefts, collapse='///')
paste_rights = paste(rights, collapse='///')
paste_ec_numbers = paste(ec_numbers, collapse='///')
paste_participantStoichiometries = paste(participantStoichiometries, collapse='///')
df_Transport = cbind(transport_equation_name, paste_ec_numbers, paste_xrefs_Transport, paste_lefts, paste_rights, paste_participantStoichiometries)
df_Transport = data.frame(df_Transport, stringsAsFactors=F)
Transport_flag = FALSE
}
}
else if(TransportWithBiochemicalReaction_flag) {
if(index_standardName_tag[i]) {
TransportWithBiochemicalReaction_equation_name = trim(sub(regex_standardName_tag, '\\1', biopax[i]))
} else if (index_eCNumber_tag[i]) {
ec_numbers = c(ec_numbers, trim(sub(regex_eCNumber_tag, '\\1', biopax[i])))
} else if(index_xref_tag[i]) {
xrefs_TransportWithBiochemicalReaction = c(xrefs_TransportWithBiochemicalReaction, trim(sub(regex_xref_tag, '\\1', biopax[i])))
} else if(index_left_tag[i]) {
lefts = c(lefts, trim(sub(regex_left_tag, '\\1', biopax[i])))
} else if(index_right_tag[i]) {
rights = c(rights, trim(sub(regex_right_tag, '\\1', biopax[i])))
} else if(index_participantStoichiometry_tag[i]) {
participantStoichiometries = c(participantStoichiometries, trim(sub(regex_participantStoichiometry_tag, '\\1', biopax[i])))
} else if(index_TransportWithBiochemicalReaction_close[i]) {
paste_xrefs_TransportWithBiochemicalReaction = paste(xrefs_TransportWithBiochemicalReaction, collapse='///')
paste_lefts = paste(lefts, collapse='///')
paste_rights = paste(rights, collapse='///')
paste_ec_numbers = paste(ec_numbers, collapse='///')
paste_participantStoichiometries = paste(participantStoichiometries, collapse='///')
df_TransportWithBiochemicalReaction = cbind(TransportWithBiochemicalReaction_equation_name, paste_ec_numbers, paste_xrefs_TransportWithBiochemicalReaction, paste_lefts, paste_rights, paste_participantStoichiometries)
df_TransportWithBiochemicalReaction = data.frame(df_TransportWithBiochemicalReaction, stringsAsFactors=F)
TransportWithBiochemicalReaction_flag = FALSE
}
}
else if(ComplexAssembly_flag) {
if(index_standardName_tag[i]) {
ComplexAssembly_equation_name = trim(sub(regex_standardName_tag, '\\1', biopax[i]))
} else if (index_eCNumber_tag[i]) {
ec_numbers = c(ec_numbers, trim(sub(regex_eCNumber_tag, '\\1', biopax[i])))
} else if(index_xref_tag[i]) {
xrefs_ComplexAssembly = c(xrefs_ComplexAssembly, trim(sub(regex_xref_tag, '\\1', biopax[i])))
} else if(index_left_tag[i]) {
lefts = c(lefts, trim(sub(regex_left_tag, '\\1', biopax[i])))
} else if(index_right_tag[i]) {
rights = c(rights, trim(sub(regex_right_tag, '\\1', biopax[i])))
} else if(index_participantStoichiometry_tag[i]) {
participantStoichiometries = c(participantStoichiometries, trim(sub(regex_participantStoichiometry_tag, '\\1', biopax[i])))
} else if(index_ComplexAssembly_close[i]) {
paste_xrefs_ComplexAssembly = paste(xrefs_ComplexAssembly, collapse='///')
paste_lefts = paste(lefts, collapse='///')
paste_rights = paste(rights, collapse='///')
paste_ec_numbers = paste(ec_numbers, collapse='///')
paste_participantStoichiometries = paste(participantStoichiometries, collapse='///')
df_ComplexAssembly = cbind(ComplexAssembly_equation_name, paste_ec_numbers, paste_xrefs_ComplexAssembly, paste_lefts, paste_rights, paste_participantStoichiometries)
df_ComplexAssembly = data.frame(df_ComplexAssembly, stringsAsFactors=F)
TransportWithBiochemicalReaction_flag = FALSE
}
}
else if(UnificationXref_flag) {
if(index_UnificationXref_tag[i] || index_RelationshipXref_tag[i]) {
UnificationXref_id = trim(sub(regex_xref_integrated_tag, '\\1', biopax[i]))
} else if(index_id_tag[i]) {
id = trim(sub(regex_id_tag, '\\1', biopax[i]))
} else if(index_db_tag[i]) {
db = trim(sub(regex_db_tag, '\\1', biopax[i]))
} else if(index_UnificationXref_close[i] || index_RelationshipXref_close[i]) {
tmp = c(UnificationXref_id, id, db)
df_UnificationXref = rbind(df_UnificationXref, tmp)
df_UnificationXref = data.frame(df_UnificationXref, stringsAsFactors=F)
UnificationXref_flag = FALSE
}
}
else if(SmallMolecule_flag) {
if(index_SmallMolecule_tag[i]) {
master_id = trim(sub(regex_SmallMolecule_tag, '\\1', biopax[i]))
} else if(index_standardName_tag[i]) {
chemial_name = trim(sub(regex_standardName_tag, '\\1', biopax[i]))
} else if(index_xref_tag[i]) {
tmp_xrefs_smallMolecule = c(tmp_xrefs_smallMolecule, trim(sub(regex_xref_tag, '\\1', biopax[i])))
} else if(index_cellularLocation_tag[i]) {
cellularLocation = trim(sub(regex_cellularLocation_tag, '\\1', biopax[i]))
} else if(index_SmallMolecule_close[i]) {
tmp = c(master_id, chemial_name, paste(tmp_xrefs_smallMolecule, collapse='///'), cellularLocation)
df_smallMolecule = rbind(df_smallMolecule, tmp)
SmallMolecule_flag = FALSE
tmp_xrefs_smallMolecule = c()
cellularLocation = ''
}
}
else if(Protein_flag) {
if(index_Protein_tag[i]) {
master_id = trim(sub(regex_Protein_tag, '\\1', biopax[i]))
} else if(index_standardName_tag[i]) {
protein_name = trim(sub(regex_standardName_tag, '\\1', biopax[i]))
} else if(index_xref_tag[i]) {
tmp_xrefs_Protein = c(tmp_xrefs_Protein, trim(sub(regex_xref_tag, '\\1', biopax[i])))
} else if(index_cellularLocation_tag[i]) {
cellularLocation = trim(sub(regex_cellularLocation_tag, '\\1', biopax[i]))
} else if(index_Protein_close[i]) {
tmp = c(master_id, protein_name, paste(tmp_xrefs_Protein, collapse='///'), cellularLocation)
df_Protein = rbind(df_Protein, tmp)
Protein_flag = FALSE
tmp_xrefs_Protein = c()
cellularLocation = ''
}
}
else if(Complex_flag) {
if(index_Complex_tag[i]) {
master_id = trim(sub(regex_Complex_tag, '\\1', biopax[i]))
} else if(index_standardName_tag[i]) {
complex_name = trim(sub(regex_standardName_tag, '\\1', biopax[i]))
} else if(index_xref_tag[i]) {
tmp_xrefs_Complex = c(tmp_xrefs_Complex, trim(sub(regex_xref_tag, '\\1', biopax[i])))
} else if(index_cellularLocation_tag[i]) {
cellularLocation = trim(sub(regex_cellularLocation_tag, '\\1', biopax[i]))
} else if(index_Complex_close[i]) {
tmp = c(master_id, complex_name, paste(tmp_xrefs_Complex, collapse='///'), cellularLocation)
df_Complex = rbind(df_Complex, tmp)
Complex_flag = FALSE
tmp_xrefs_Complex = c()
cellularLocation = ''
}
}
else if(Rna_flag) {
if(index_Rna_tag[i]) {
master_id = trim(sub(regex_Rna_tag, '\\1', biopax[i]))
} else if(index_standardName_tag[i]) {
rna_name = trim(sub(regex_standardName_tag, '\\1', biopax[i]))
} else if(index_xref_tag[i]) {
tmp_xrefs_Rna = c(tmp_xrefs_Rna, trim(sub(regex_xref_tag, '\\1', biopax[i])))
} else if(index_cellularLocation_tag[i]) {
cellularLocation = trim(sub(regex_cellularLocation_tag, '\\1', biopax[i]))
} else if(index_Rna_close[i]) {
tmp = c(master_id, rna_name, paste(tmp_xrefs_Rna, collapse='///'), cellularLocation)
df_Rna = rbind(df_Rna, tmp)
Rna_flag = FALSE
tmp_xrefs_Rna = c()
cellularLocation = ''
}
}
if(CellularLocationVocabulary_flag) {
if(index_CellularLocationVocabulary_tag[i]) {
CellularLocationVocabulary_id = trim(sub(regex_CellularLocationVocabulary_tag, '\\1', biopax[i]))
} else if(index_term_tag[i]) {
CellularLocationVocabulary_term = trim(sub(regex_term_tag, '\\1', biopax[i]))
} else if(index_CellularLocationVocabulary_close[i]) {
tmp = c(CellularLocationVocabulary_id, CellularLocationVocabulary_term)
df_CellularLocationVocabulary = rbind(df_CellularLocationVocabulary, tmp)
CellularLocationVocabulary_flag = FALSE
}
}
}
## Post-processing data frame
if(length(df_smallMolecule)>0) {
rownames(df_smallMolecule) = 1:nrow(df_smallMolecule)
df_smallMolecule = data.frame(df_smallMolecule, stringsAsFactors=F)
}
if(length(df_Protein)>0) {
rownames(df_Protein) = 1:nrow(df_Protein)
df_Protein = data.frame(df_Protein, stringsAsFactors=F)
}
if(length(df_Complex)>0) {
rownames(df_Complex) = 1:nrow(df_Complex)
df_Complex = data.frame(df_Complex, stringsAsFactors=F)
}
if(length(df_Rna)>0) {
rownames(df_Rna) = 1:nrow(df_Rna)
df_Rna = data.frame(df_Rna, stringsAsFactors=F)
}
rownames(df_CellularLocationVocabulary) = 1:nrow(df_CellularLocationVocabulary)
df_CellularLocationVocabulary = data.frame(df_CellularLocationVocabulary, stringsAsFactors=F)
## Concatenate data.frames
if(length(df_biochemicalReaction)>0) {
df_reaction = df_biochemicalReaction
} else if(length(df_Transport)>0) {
df_reaction = df_Transport
} else if(length(df_TransportWithBiochemicalReaction)>0) {
df_reaction = df_TransportWithBiochemicalReaction
} else if(length(df_ComplexAssembly)>0) {
df_reaction = df_ComplexAssembly
}
df_compound = rbind.fill(df_smallMolecule, df_Protein, df_Complex, df_Rna)
## Handle Xref
colnames(df_UnificationXref) = c('xref','id','db')
colnames(df_reaction) = c('name','ec_number','xref','lefts','rights','participantStoichiometries')
colnames(df_compound) = c('name', 'standard_name','xref', 'cellularLocation')
colnames(df_CellularLocationVocabulary) = c('master_id', 'term')
df_reaction = .handle.xref(df_reaction, df_UnificationXref)
df_compound = .handle.xref(df_compound, df_UnificationXref)
## Convert HTML code to character
df_reaction = .convert.html(df_reaction)
df_compound = .convert.html(df_compound)
## Parse stoichiometric coefficient
regex_Stoichiometry_tag = '<bp:Stoichiometry rdf:ID="(.+)">'
regex_Stoichiometry_close = '</bp:Stoichiometry>'
regex_stoichiometricCoefficient_tag = '<bp:stoichiometricCoefficient rdf:datatype="http://www.w3.org/2001/XMLSchema#.+">(.+)</bp:stoichiometricCoefficient>'
regex_id = '<bp:.+ rdf:ID="(.+)">'
regex_id2 = '<bp:PHYSICAL-ENTITY rdf:resource="#(.+)"/>'
regex_location = '<bp:cellularLocation rdf:resource="#(.+)"/>'
regex_location2 = '<bp:cellularLocation>'
regex_location3 = '<bp:CellularLocationVocabulary rdf:ID="(.+)">'
index_Stoichiometry_tag = grep(regex_Stoichiometry_tag, biopax)
index_Stoichiometry_close = grep(regex_Stoichiometry_close, biopax)
master_id = trim(sub(regex_Stoichiometry_tag, '\\1', biopax[index_Stoichiometry_tag]))
stoichiometric_coefficient = trim(sub(regex_stoichiometricCoefficient_tag, '\\1', biopax[index_Stoichiometry_tag+1]))
id = trim(sub(regex_id, '\\1', biopax[index_Stoichiometry_tag+3]))
id2 = trim(sub(regex_id2, '\\1', biopax[index_Stoichiometry_tag+2]))
id_new = character(length(id))
id_new[grepl('>', id)==FALSE] = id[grepl('>', id)==FALSE]
id_new[grepl('>', id2)==FALSE] = id2[grepl('>', id2)==FALSE]
location = trim(sub(regex_location, '\\1', biopax[index_Stoichiometry_close-3]))
index_location = grep(regex_location2, biopax)
if(length(location) != length(index_location)) {
location[grep('comment', location)] = ''
location[grep('dataSource', location)] = ''
}
if(length(index_location)>0) {
location2 = trim(sub(regex_location3, '\\1', biopax[index_location+1]))
location[grep('<', location)] = location2[grepl('>', location2)==F]
} else {
location[grep('>', location)] = df_compound[df_compound$name == id_new, 'cellularLocation']
}
participants = data.frame(cbind(master_id, stoichiometric_coefficient, id_new, location), stringsAsFactors=F)
left_sub =build.subtable(df_reaction, 'MetaCyc', 'lefts', '///')
right_sub = build.subtable(df_reaction, 'MetaCyc', 'rights', '///')
## Sometimes there is no 'Stoichiometry' tag
if(nrow(participants) < (nrow(left_sub) + nrow(right_sub))) {
tmp_df_CellularLocationVocabulary = df_CellularLocationVocabulary
colnames(tmp_df_CellularLocationVocabulary)[grep('master_id', names(tmp_df_CellularLocationVocabulary))] = 'cellularLocation'
participants = merge(df_compound, tmp_df_CellularLocationVocabulary, by = 'cellularLocation', all.x=T)
colnames(participants)[grep('term', names(participants))] = 'location2'
colnames(participants)[grep('^name$', names(participants))] = 'id_new'
participants[is.na(participants)] = 'cytosol'
} else {
## Generate equation with MetaCyc ID (not name)
stoichio_sub = build.subtable(df_reaction, 'MetaCyc', 'participantStoichiometries', '///')
if(nrow(df_compound) > 0) {
compound_tmp = df_compound
colnames(compound_tmp)[1] = 'id_new'
participants = merge(participants, compound_tmp[,c('id_new','MetaCyc')], by='id_new')
}
CellularLocationVocabulary_tmp = df_CellularLocationVocabulary
colnames(CellularLocationVocabulary_tmp)[1] = 'location'
colnames(CellularLocationVocabulary_tmp)[2] = 'location2'
if(any(participants$location %in% CellularLocationVocabulary_tmp$location)) {
participants = merge(participants, CellularLocationVocabulary_tmp[,c('location','location2')], by='location')
} else {
participants[['location2']] = ''
}
}
participants$location2 = sub('cytosol', '(in)', participants$location2)
participants$location2 = sub('chloroplast stroma', '(out)', participants$location2)
participants$location2 = sub('extracellular region', '(out)', participants$location2)
participants$location2 = sub('outer membrane-bounded periplasmic space', '(out)', participants$location2)
participants$location2 = sub('plasma membrane', '(out)', participants$location2)
participants[['assemble']] = paste(participants$stoichiometric_coefficient, participants$MetaCyc, participants$location2)
participants$assemble = sub('^1 ', '', participants$assemble)
participants$assemble = sub('^ ', '', participants$assemble)
participants$assemble = sub(' \\(in)', '(in)', participants$assemble)
participants$assemble = sub(' \\(out)', '(out)', participants$assemble)
participants$assemble = sub(' $', '', participants$assemble)
participants_tmp = participants
colnames(participants)[grep('id_new', names(participants))] = 'lefts'
left_sub = merge(participants[,c('lefts', 'assemble')], left_sub, by='lefts')
left_sub = data.table(left_sub)
left_sub2 = left_sub[, lapply(.SD, paste, collapse=' + '), by="MetaCyc", .SDcols=c("assemble")]
left_sub2 = data.frame(left_sub2, stringsAsFactors=F)
colnames(participants)[grep('lefts', names(participants))] = 'rights'
right_sub = merge(right_sub, participants[,c('rights', 'assemble')], by='rights')
right_sub = data.table(right_sub)
right_sub2 = right_sub[, lapply(.SD, paste, collapse = ' + '), by="MetaCyc", .SDcols=c("assemble")]
right_sub2 = data.frame(right_sub2, stringsAsFactors=F)
ind_bidirection = grep(' <=> ', df_reaction$name)
ind_right = grep(' => ', df_reaction$name)
ind_left = grep(' <= ', df_reaction$name)
direction = character(nrow(df_reaction))
direction[ind_bidirection] = " <=> "
direction[ind_right] = ' => '
direction[ind_left] = ' <= '
equation = paste(left_sub2$assemble, direction, right_sub2$assemble, sep='')
ind_without_out = grepl('\\(out\\)', equation) == F
equation[ind_without_out] = gsub('\\(in\\)', '', equation[ind_without_out])
## Remove unnecessary columns
df_reaction[['xref']] = NULL
df_reaction[['lefts']] = NULL
df_reaction[['rights']] = NULL
df_reaction[['participantStoichiometries']] = NULL
## Add equation and reorder the data.frame
df_reaction[['name_id']] = equation
colnames = names(df_reaction)
colnames = colnames[-(grep('MetaCyc', colnames))]
colnames = c('MetaCyc', colnames)
df_reaction = df_reaction[,colnames]
return(df_reaction)
}
# Spread the '///'-separated cross-reference ids of `df` into one column per
# external database.
#
# Args:
#   df: data.frame with at least the columns 'name' (row key) and 'xref'
#     ('///'-separated xref ids).
#   df_UnificationXref: data.frame with the columns 'xref', 'id' and 'db';
#     it is joined to the xref ids of `df` on 'xref'.
#
# Returns:
#   `df` with one additional column per `db` value that was matched. Ids that
#   land in the same cell are joined with '///' (newest first); for the
#   'MetaCyc' column only the text before the first '///' is kept. When no
#   xref id matches, `df` is returned without new columns.
.handle.xref = function(df, df_UnificationXref) {
  # NOTE(review): build.subtable is defined elsewhere; presumably it returns
  # one row per (name, single xref id) pair -- confirm against its definition.
  xref = build.subtable(df, 'name', 'xref', '///')
  # Keep only ids starting with 'U' or 'R', 'U' rows first
  xref.1 = xref[grep('^U', xref$xref),]
  xref.2 = xref[grep('^R', xref$xref),]
  xref = rbind(xref.1, xref.2)
  xref_merged = merge(xref, df_UnificationXref, by='xref')
  # seq_len() instead of 1:nrow(): with zero matches 1:0 would iterate over
  # c(1, 0) and try to create a column from an NA database name.
  for(i in seq_len(nrow(xref_merged))) {
    db = xref_merged[i,'db']
    if(!(db %in% colnames(df))) {
      df[db] = ''
    }
    rows = df$name == xref_merged$name[i]
    # Prepend the new id; the first id written to a cell leaves a trailing
    # '///' that is stripped below.
    df[rows, db] = paste(xref_merged[i,'id'], df[rows, db], sep='///')
  }
  for(column in colnames(df)) {
    if(column == 'MetaCyc') {
      # Keep a single MetaCyc id per row
      df[[column]] = gsub('///.*', '', df[[column]])
    }
    df[[column]] = gsub('///$', '', df[[column]])
  }
  return(df)
}
# Look up the given reaction ids with get.metacyc.reaction.byId() and return
# its result with every NA cell replaced by an empty string.
#
# Args:
#   reaction_ids: passed unchanged to get.metacyc.reaction.byId().
#
# Returns:
#   The lookup result, with NA entries set to ''.
.parse.metacyc.biopax <- function(reaction_ids) {
  parsed <- get.metacyc.reaction.byId(reaction_ids)
  missing_cells <- is.na(parsed)
  parsed[missing_cells] <- ''
  return(parsed)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.