R/RbioRXN-internal.R

Defines functions parse.metacyc.biopax handle.xref parse.biopax convert.html unique_column parse.rhea parse.chebi id2formula get.participant get.o.num get.h.num get.formula get.c.num formula2matrix create_df

# Hard-coded integer vector assigned to the name `.Random.seed`.
# NOTE(review): `.Random.seed` is the name R itself uses for the saved RNG
# state, so this looks like a leftover from dumping a workspace into the
# package source rather than intentional code. Nothing in the visible part of
# this file reads it -- confirm whether it can be removed.
.Random.seed <-
c(403L, 1L, -754469287L, 1611343336L, 409493526L, -487257199L, 
-1171770301L, 1912392530L, -551743516L, 172658215L, -1911126403L, 
-2105337996L, 1780845402L, 312237477L, -1208624705L, -1974316474L, 
1432961232L, -882173197L, -1181066223L, 554658368L, 2113547806L, 
51538153L, 288959003L, 235297322L, -1119154868L, -383201521L, 
-1908977083L, 1447736316L, 2029890130L, 78436301L, 461046151L, 
-1059679122L, 1435307080L, 1176483403L, 50711465L, -655801608L, 
-597383642L, 458866497L, 810976467L, 1481803906L, 1205053108L, 
867844663L, -2103047891L, -677926332L, -1134618774L, 698909653L, 
1773016431L, -1126723466L, -1364018272L, -1496474525L, -1267422975L, 
817276017L, -681869298L, 1040064633L, 1635557387L, -707858118L, 
307916860L, 1885206399L, 1271565525L, -207851284L, 1889544770L, 
1944464797L, 478042135L, 1250057598L, -1127414984L, -189621221L, 
940074361L, -1188326456L, 1529946102L, 488153649L, 151442531L, 
1733499954L, -1123616380L, -1975353401L, -104074595L, -825282028L, 
-1338146630L, 1884515589L, -238545185L, -1119823002L, 487111408L, 
-1044772333L, 79602609L, -1317269920L, 980728830L, 1664959369L, 
-785980229L, 50479242L, 545588076L, -1286240017L, 722993957L, 
-731410980L, 1419762610L, 1845789357L, -318571929L, -1318188658L, 
1870077544L, -1192318357L, 770819081L, 114989720L, -624796218L, 
-605553375L, -1728418381L, -1335400158L, -1331723500L, 275028887L, 
-1529281651L, -1375879516L, -2002161526L, -1421445387L, 855771471L, 
-813242794L, -1203102464L, -630495165L, -1927984159L, -2049338544L, 
-739015250L, 1433553753L, 904126827L, -381400870L, -291954276L, 
-1338438241L, -1253810187L, 1894010508L, 113535842L, -456998083L, 
257192887L, -1741752226L, -1516227176L, 932069627L, -1897907047L, 
871193896L, 1413721046L, -843452335L, 1851376387L, -1744031086L, 
150192164L, 1334996711L, 909574973L, 385825716L, 1885202458L, 
1929468773L, 454411775L, -660223866L, -930483184L, 1044948659L, 
-2039147183L, -952021760L, 52615134L, -721726423L, -1367231525L, 
1212393706L, 412919436L, 656185807L, 144805253L, -259901892L, 
1297898258L, -1019430899L, -1339339961L, 1134027054L, -809662328L, 
2033388939L, 1276884329L, -336259272L, -1321068058L, -798057087L, 
-1603759853L, -1537478974L, 1043007860L, 1506439543L, 668410477L, 
-45273852L, 1301596202L, -397507179L, 1490086447L, 1002813238L, 
-495507616L, 994307491L, 1012500161L, 1906327216L, 1057765838L, 
-1045885639L, -137223861L, 19129082L, -879627268L, 298103871L, 
609520917L, 1241085484L, -1939924222L, 1716520925L, 2070229719L, 
-957362754L, -1817188360L, 690938843L, -1407115719L, 394962824L, 
959805238L, 476603761L, -1435672669L, -42236174L, -2072318396L, 
1792192775L, -1290172195L, 40492500L, -1401556742L, 763709509L, 
677236895L, -1612686810L, 2098024112L, 1913205587L, -210690447L, 
-1975950432L, 85467966L, 1438370377L, -2108875013L, -1321877814L, 
-1893490644L, 324474927L, 2037868773L, -2137833060L, -1512731662L, 
-381151635L, 1907909543L, 1977957838L, 252377896L, -1670852309L, 
-2017195191L, -1655173032L, 2056799494L, -1789116222L, -1060713432L, 
969283500L, 2004478652L, -365339662L, -1733880272L, -2113794140L, 
-1235989064L, -1006397606L, 779030672L, 1452657532L, -954228460L, 
-1658222270L, 1088180480L, 2030001596L, 1816371536L, -1843308254L, 
-247588280L, 790360076L, -282237348L, 562055634L, -2059274752L, 
-142400172L, 1145646312L, -352497414L, -1243289904L, 800762668L, 
-1647709964L, 1529141650L, 454049120L, 94821388L, -470139680L, 
1398927970L, 824009896L, -132252596L, -1149253764L, 64641330L, 
-1918295440L, 1459512132L, -266976296L, -1497902214L, 707156016L, 
-281571396L, -1006931372L, 1785573442L, 1038397088L, -1790836036L, 
1300466640L, -1258963614L, -1602793752L, -46011188L, -544841220L, 
1718746610L, 1035248768L, 1180117492L, -10663288L, 1368169274L, 
1540590928L, 1389014124L, -1801103980L, 415719506L, -67600928L, 
-106998580L, 1692532384L, 1116959106L, 51947752L, 335252204L, 
-1393187524L, 1796911218L, -1682288016L, -209544668L, 804792376L, 
-504641254L, 1738212560L, -2092971716L, -1454541932L, -694548606L, 
1556007872L, 1494652476L, 1502511504L, 52762658L, 1404704008L, 
1956374668L, -1973326052L, -891448174L, -2026302080L, 194338836L, 
1593411368L, 1872873018L, 1904955856L, 500873580L, -494976076L, 
-1761785326L, 273516512L, 1157016012L, 1160870112L, 2134723618L, 
-1492380888L, -1769178612L, 948589372L, -1665765390L, 643023600L, 
-351184316L, -1563345320L, -1856686278L, 6262768L, -1644158276L, 
1656182036L, 1687264450L, 1675476064L, -633313924L, 1100947408L, 
-859590366L, 1114937000L, -100722804L, -303807300L, 1324120882L, 
14870080L, -1616616268L, 646632520L, -214608454L, -1716078064L, 
348650732L, 1324015956L, 611005202L, 431123872L, -1140466996L, 
-925033888L, -2111524926L, -492473560L, 1458813356L, 2128198972L, 
1727953778L, 1780084016L, 2085283620L, -794434248L, 1201063642L, 
-1266106480L, 2144623356L, -2009719916L, 1199699010L, 457315584L, 
-1356306372L, -356229808L, -745814878L, 1105431368L, 2034040588L, 
1996013276L, -1688965294L, -977889664L, 1701009492L, -623107096L, 
1863740154L, 621696336L, 361594668L, 951788404L, -307715182L, 
-1529474464L, -1827178612L, -1025583264L, 1314161634L, 1732451240L, 
-1528935220L, 449542652L, -1009576014L, 351446512L, 843335108L, 
-394815144L, 1081855354L, 418594224L, 1230742460L, 2137256020L, 
479137730L, -408412128L, -21450564L, -1769629104L, 747030498L, 
-225411480L, 752627276L, 519557372L, -1811408910L, -1726651136L, 
-715824268L, -410345464L, 1921962426L, -1867391536L, -2088430612L, 
-207385708L, 415013842L, 1946281824L, -455933620L, -1656114912L, 
-1625723006L, -583629080L, 1032558828L, -1416694212L, -92392590L, 
1821832816L, -815291356L, -1639878856L, -337626982L, -2058506544L, 
-539784004L, -127210860L, 190800514L, 414838976L, 42804540L, 
-505825136L, -1774889438L, 77394440L, 618036492L, -1484225892L, 
-1595558382L, 313596288L, -640329196L, -487809496L, 1364394298L, 
1911811024L, -1700802964L, 1239442228L, 1693770386L, -555375264L, 
1812815180L, 1402418144L, -1031254110L, 640597928L, 526100364L, 
1588828831L, -1895979223L, -1245045826L, 945212076L, 1583126525L, 
-1267523769L, 1756831792L, -647059122L, 699724155L, 2095548125L, 
-757973238L, -2067810776L, 465411729L, 1797633667L, -1365973116L, 
1216830898L, 1683886855L, -453826223L, -1575980842L, -819044940L, 
-268355131L, -794560241L, 559105928L, 17102502L, 1045309011L, 
-83060811L, -1711540974L, -1812017440L, -329526455L, 1598459259L, 
-858191796L, -2018700838L, -1103336497L, 1436738969L, 1225768366L, 
-1771107652L, -462066899L, -47261481L, -1330388704L, -1867585922L, 
106131531L, 417057357L, 834967674L, -790501192L, -1300428191L, 
505982995L, 721165876L, 2128377922L, 1530624087L, -2036032607L, 
183950438L, -1227672476L, -1153495147L, 1038870719L, -1338910632L, 
-380793738L, -2079583101L, 1751839941L, -611978782L, -917309360L, 
-1054684807L, -639906133L, 963347996L, 1724199242L, -26697409L, 
1664288649L, -524851298L, -1008994740L, -2015920099L, -1931751001L, 
1511347408L, 682858094L, 1447342363L, 234267261L, 1808087146L, 
-459897592L, 886362545L, 758654499L, -249945308L, 600163282L, 
-1731280857L, 384998001L, -733065034L, -768071596L, 481401061L, 
-1429130001L, -1130776536L, 1699122310L, -1578416461L, 490784789L, 
-884200846L, -817121600L, 380013353L, 1714604443L, -191426964L, 
334082682L, -1102318673L, -200808775L, 1578748110L, 762962460L, 
553494925L, 486500343L, 1471245056L, -621735586L, 1265857067L, 
616258861L, 2076672410L, 1801944920L, -1514498367L, 1009510771L, 
1827652244L, 1034811170L, 641847863L, 616822529L, 310406662L, 
-1516284860L, -863640203L, 170551519L, 651926584L, 1879997462L, 
1192786979L, -258184219L, -131604990L, 1110570096L, -1111145959L, 
646441995L, 840232700L, -1732114646L, 27076959L, -146202391L, 
-306456578L, 689265644L, -1570001091L, 1940697863L, -813798672L, 
-826010994L, -618402629L, -521688931L, -17632182L, -1697304984L, 
-2128264623L, 77873475L, 668678212L, -83445006L, 1291567815L, 
1196408977L, -1012953578L, 1037962356L, 1441482501L, -1392900785L, 
-171362488L, -967721242L, -1653563373L, 2028828533L, -1790511278L, 
997011360L, 155392649L, -2067505477L, 594963724L, -1283649382L, 
-1911233009L, 1612436057L, 1402801902L, 2079231356L, 2006248941L, 
1295321751L, -746961568L, -750091970L, -1141061109L, -394766781L
)
# Collect the structure records of every leaf descendant of a class.
#
# Starting from the direct children of `class` in `parentTable`, every child
# that is itself listed as a parent is replaced by its own children until only
# leaves (ids that never occur in `parentTable$parent`) remain. Each parent is
# expanded at most once, so a cyclic hierarchy terminates instead of looping
# forever.
#
# Args:
#   class:          id of the root class whose descendants are wanted.
#   chemical_table: data frame holding one row per chemical.
#   parentTable:    data frame with `parent` and `child` columns.
#   id_col, inchi_col, smiles_col: column names in `chemical_table`.
#
# Returns:
#   The id / InChI / SMILES columns of `chemical_table` for the leaf
#   descendants. The InChI column is reduced to its '/c...' layer where the
#   pattern matches, and rows whose SMILES is empty or contains '*' are dropped.
.create_df <-
function(class, chemical_table, parentTable, id_col, inchi_col, smiles_col) {
  # Captures the '/c...' layer of an InChI string
  regex <- 'InChI.+/.+/(c.+?)/.+'

  child_list <- parentTable[parentTable$parent == class, 'child']
  expanded <- character(0)  # parents whose children were already pulled in
  repeat {
    is_intermediate <- child_list %in% parentTable$parent
    candidates <- unique(child_list[is_intermediate])
    to_expand <- candidates[!(candidates %in% expanded)]
    # Intermediate nodes never stay in the result, whether or not they still
    # need expanding.
    child_list <- child_list[!is_intermediate]
    if (length(to_expand) == 0L) {
      break
    }
    expanded <- c(expanded, as.character(to_expand))
    next_child <- parentTable[parentTable$parent %in% to_expand, 'child']
    child_list <- unique(c(child_list, next_child))
  }

  child_table <- chemical_table[chemical_table[[id_col]] %in% child_list,
                                c(id_col, inchi_col, smiles_col)]
  child_table[[inchi_col]] <- sub(regex, '\\1', child_table[[inchi_col]])

  # Drop records without a usable structure: empty SMILES or a '*' wildcard.
  # grepl() is FALSE for NA, so rows with a missing SMILES are kept.
  smiles <- child_table[[smiles_col]]
  unusable <- grepl('^$', smiles) | grepl('\\*', smiles)
  if (any(unusable)) {
    child_table <- child_table[!unusable, ]
  }
  child_table
}
# Sum the atom counts found in one or more chemical formula strings.
#
# Every formula in `formula_vector` is split into element tokens (an uppercase
# letter, an optional lowercase letter and an optional count); a missing count
# is read as 1. Counts are then summed per element symbol over all tokens.
#
# Returns the result of tapply(): a one-dimensional array of totals named by
# element symbol.
.formula2matrix <-
function(formula_vector) {
  element_pattern <- "([A-Z]{1}[a-z]?)([0-9]*)"
  tokens <- unlist(str_extract_all(string = formula_vector, pattern = element_pattern))

  # Tokens without digits get an explicit count of 1
  lacks_count <- !grepl('[0-9]', tokens)
  tokens[lacks_count] <- sub('$', '1', tokens[lacks_count])

  symbols <- sub(element_pattern, '\\1', tokens)
  counts <- as.numeric(sub(element_pattern, '\\2', tokens))

  tapply(counts, INDEX = list(symbols), FUN = sum)
}
# Number of carbon atoms in the formula(s) stored for `id`.
#
# Args:
#   id:             identifier to look up.
#   chemical_table: data frame with an id column and a formula column.
#   id_col, formula_col: names of those two columns.
#
# Returns:
#   Numeric vector with the count taken from the first carbon token of each
#   matching formula (a bare 'C' counts as 1). Formulas without carbon
#   contribute nothing; if nothing is left the result is 0.
.get.c.num <-
function(id, chemical_table, id_col, formula_col) {
  # 'C' must not be followed by a lowercase letter, otherwise the first letter
  # of Cl, Ca, Co, Cu, ... would be counted as carbon.
  pattern_c <- 'C(?![a-z])[0-9]*'
  formula <- as.character(chemical_table[chemical_table[[id_col]] == id, formula_col])

  # regmatches() keeps only the formulas that actually matched (NA and
  # non-matching entries are dropped).
  carbon <- regmatches(formula, regexpr(pattern_c, formula, perl = TRUE))
  carbon <- sub('C', '', carbon, fixed = TRUE)
  carbon[carbon == ''] <- '1'
  carbon <- as.numeric(carbon)
  if (length(carbon) == 0) {
    carbon <- 0
  }
  carbon
}
# Look up the formula column value(s) of the rows whose id column equals `id`.
#
# Args:
#   id:             identifier to look up.
#   chemical_table: data frame to search.
#   id_col, formula_col: names of the id column and of the column to return.
.get.formula <-
function(id, chemical_table, id_col, formula_col) {
  matches_id <- chemical_table[[id_col]] == id
  chemical_table[matches_id, formula_col]
}
# Number of hydrogen atoms in the formula(s) stored for `id`.
#
# Args:
#   id:             identifier to look up.
#   chemical_table: data frame with an id column and a formula column.
#   id_col, formula_col: names of those two columns.
#
# Returns:
#   Numeric vector with the count taken from the first hydrogen token of each
#   matching formula (a bare 'H' counts as 1). Formulas without hydrogen
#   contribute nothing; if nothing is left the result is 0.
.get.h.num <-
function(id, chemical_table, id_col, formula_col) {
  # 'H' must not be followed by a lowercase letter, otherwise the first letter
  # of Hg, He, Hf, Ho, ... would be counted as hydrogen.
  pattern_h <- 'H(?![a-z])[0-9]*'
  formula <- as.character(chemical_table[chemical_table[[id_col]] == id, formula_col])

  # regmatches() keeps only the formulas that actually matched (NA and
  # non-matching entries are dropped).
  proton <- regmatches(formula, regexpr(pattern_h, formula, perl = TRUE))
  proton <- sub('H', '', proton, fixed = TRUE)
  proton[proton == ''] <- '1'
  proton <- as.numeric(proton)
  if (length(proton) == 0) {
    proton <- 0
  }
  proton
}
# Number of oxygen atoms in the formula(s) stored for `id`.
#
# Args:
#   id:             identifier to look up.
#   chemical_table: data frame with an id column and a formula column.
#   id_col, formula_col: names of those two columns.
#
# Returns:
#   Numeric vector with the count taken from the first oxygen token of each
#   matching formula (a bare 'O' counts as 1). Formulas without oxygen
#   contribute nothing; if nothing is left the result is 0.
.get.o.num <-
function(id, chemical_table, id_col, formula_col) {
  # 'O' must not be followed by a lowercase letter, otherwise the first letter
  # of Os would be counted as oxygen.
  pattern_o <- 'O(?![a-z])[0-9]*'
  formula <- as.character(chemical_table[chemical_table[[id_col]] == id, formula_col])

  # regmatches() keeps only the formulas that actually matched (NA and
  # non-matching entries are dropped).
  oxygen <- regmatches(formula, regexpr(pattern_o, formula, perl = TRUE))
  oxygen <- sub('O', '', oxygen, fixed = TRUE)
  oxygen[oxygen == ''] <- '1'
  oxygen <- as.numeric(oxygen)
  if (length(oxygen) == 0) {
    oxygen <- 0
  }
  oxygen
}
# Extract the distinct participants of a reaction equation string.
#
# The equation is split on each direction arrow in `direction_type` (regular
# expressions) and then on ' + '. From every term the first parenthesised part
# is removed, as is everything up to and including the first space (when a
# space is present). The remaining text is passed through trim() and
# de-duplicated.
# NOTE(review): trim() is not defined in this file; presumably gdata::trim.
.get.participant <-
function(equation, direction_type = c(' <=> ', ' => ', ' <\\?> ')) {
  split_on <- function(pieces, separator) {
    unlist(strsplit(pieces, separator))
  }

  # Apply the arrow separators one after another, then split the sides
  sides <- Reduce(split_on, direction_type, equation)
  terms <- split_on(sides, ' \\+ ')

  terms <- sub('\\(.+\\)', '', terms)
  terms <- sub('.+? (.+)', '\\1', terms)
  unique(trim(terms))
}
# Look up the formula of an id after stripping every '|' character from it.
#
# Args:
#   id:             identifier, possibly wrapped in '|' characters.
#   chemical_table: data frame to search.
#   id_col, formula_col: names of the id column and of the column to return.
.id2formula <-
function(id, chemical_table, id_col, formula_col) {
  bare_id <- gsub('\\|', '', id)
  is_hit <- chemical_table[[id_col]] == bare_id
  chemical_table[is_hit, formula_col]
}
# Parse the lines of a ChEBI OWL file into one row per compound and attach
# chemical formulas.
#
# Args:
#   owl:                 character vector, one element per line of the OWL file.
#   chebi_compound:      data frame with `ID` and `PARENT_ID` columns.
#   chebi_chemical_data: data frame with `TYPE`, `COMPOUND_ID` and
#                        `CHEMICAL_DATA` columns.
#
# Returns:
#   Data frame with columns chebi, name, synonyms, smiles, inchi, kegg and
#   parent (multiple synonyms / parents / formulas are joined with '///'),
#   joined with the per-compound formulas on `chebi`.
#
# Parsing starts at the line holding the CHEBI_100 comment marker and stops
# with an error if that marker is absent.
# NOTE(review): trim(), join() and data.table() are not defined in this file;
# presumably gdata::trim, plyr::join and data.table::data.table.
.parse.chebi <-
function(owl,chebi_compound, chebi_chemical_data) {

  # Classify every line once so the main loop only does logical lookups
  is_id      <- grepl('    <owl:Class rdf:about="http://purl.obolibrary.org/obo/CHEBI_',owl)
  is_name    <- grepl('        <rdfs:label rdf:datatype="http://www.w3.org/2001/XMLSchema#string">', owl)
  is_synonym <- grepl('        <obo2:Synonym rdf:datatype="http://www.w3.org/2001/XMLSchema#string">', owl)
  is_smiles  <- grepl('        <obo2:SMILES rdf:datatype="http://www.w3.org/2001/XMLSchema#string">', owl)
  is_inchi   <- grepl('        <obo2:InChI rdf:datatype="http://www.w3.org/2001/XMLSchema#string">', owl)
  is_kegg    <- grepl('        <obo2:xref rdf:datatype="http://www.w3.org/2001/XMLSchema#string">kegg COMPOUND:C', owl)
  is_parent  <- grepl('        <rdfs:subClassOf rdf:resource="http://purl.obolibrary.org/obo/CHEBI_', owl)
  is_close   <- grepl("</owl:Class>",owl)

  # Extraction patterns; the wanted value is always capture group 2
  re_id      <- '(    <owl:Class rdf:about=\"http://purl.obolibrary.org/obo/CHEBI_)(.*)(">)'
  re_name    <- '(        <rdfs:label rdf:datatype=\"http://www.w3.org/2001/XMLSchema#string\">)(.*)(</rdfs:label>)'
  re_synonym <- '(        <obo2:Synonym rdf:datatype="http://www.w3.org/2001/XMLSchema#string">)(.*)(</obo2:Synonym>)'
  re_smiles  <- '(        <obo2:SMILES rdf:datatype=\"http://www.w3.org/2001/XMLSchema#string\">)(.*)(</obo2:SMILES>)'
  re_inchi   <- '(        <obo2:InChI rdf:datatype=\"http://www.w3.org/2001/XMLSchema#string\">)(.*)(</obo2:InChI>)'
  re_kegg    <- '(        <obo2:xref rdf:datatype=\"http://www.w3.org/2001/XMLSchema#string\">kegg COMPOUND:)(.*)(</obo2:xref>)'
  re_parent  <- '(        <rdfs:subClassOf rdf:resource="http://purl.obolibrary.org/obo/CHEBI_)(.*)("/>)'

  capture <- function(regexp, line) {
    trim(sub(pattern = regexp, replacement = "\\2", x = line))
  }

  entry_start <- grep('<\\!-- http://purl.obolibrary.org/obo/CHEBI_100 -->', owl)
  if (length(entry_start) == 0L) {
    stop("ChEBI start marker (CHEBI_100 comment) not found in 'owl'", call. = FALSE)
  }
  entry_start <- entry_start[1L]

  n_lines <- length(owl)
  cat("parsing", n_lines, 'lines\n')

  # One slot per closing tag in the scanned range is an upper bound on the
  # number of compounds, so rows never have to be grown inside the loop.
  scan_range <- entry_start:n_lines
  rows <- vector("list", sum(is_close[scan_range]))
  n_rows <- 0L

  id <- ""
  name <- ""
  synonym <- c()
  smiles <- ""
  inchi <- ""
  kegg <- ""
  parent <- c()

  for (i in scan_range) {
    if (i == floor(n_lines/10)) {
      cat('10% finished\n')
    } else if (i == floor(n_lines/5)) {
      cat('20% finished\n')
    } else if (i == floor(n_lines/2)) {
      cat('50% finished\n')
    }

    if (is_id[i]) {
      id <- capture(re_id, owl[i])
    } else if (is_name[i]) {
      name <- capture(re_name, owl[i])
    } else if (is_synonym[i]) {
      synonym <- c(synonym, capture(re_synonym, owl[i]))
    } else if (is_smiles[i]) {
      smiles <- capture(re_smiles, owl[i])
    } else if (is_inchi[i]) {
      inchi <- capture(re_inchi, owl[i])
    } else if (is_kegg[i]) {
      kegg <- capture(re_kegg, owl[i])
    } else if (is_parent[i]) {
      parent <- c(parent, capture(re_parent, owl[i]))
    } else if (is_close[i]) {
      # End of a class element: store the record and reset the accumulators
      n_rows <- n_rows + 1L
      rows[[n_rows]] <- c(id, name, paste(synonym, collapse="///"), smiles, inchi, kegg, paste(parent, collapse="///"))
      id <- ""
      name <- ""
      synonym <- c()
      parent <- c()
      smiles <- ""
      inchi <- ""
      kegg <- ""
    }
  }

  if (n_rows == 0L) {
    stop("no ChEBI class entries were parsed from 'owl'", call. = FALSE)
  }
  chebi <- do.call(rbind, rows[seq_len(n_rows)])

  # Chemical formula: map every compound row to its parent id when it has one
  # (otherwise to its own id), then collect the formulas per mapped id.
  chebi_compound2 <- chebi_compound[,c('ID', 'ID')]
  ind_parent <- grep('[0-9]', chebi_compound$PARENT_ID)
  chebi_compound2[ind_parent, 'ID.1'] <- chebi_compound$PARENT_ID[ind_parent]
  colnames(chebi_compound2) <- c('id', 'chebi')

  chebi_formula <- chebi_chemical_data[chebi_chemical_data$TYPE == 'FORMULA', c('COMPOUND_ID', 'CHEMICAL_DATA')]
  colnames(chebi_formula) <- c('id', 'formula')
  chebi_formula2 <- join(chebi_compound2, chebi_formula, by='id')
  chebi_formula2 <- chebi_formula2[!is.na(chebi_formula2$formula),]
  chebi_formula2 <- unique(chebi_formula2[,c('chebi', 'formula')])

  chebi_formula3 <- data.table(chebi_formula2)
  chebi_formula3 <- chebi_formula3[,lapply(.SD, paste, collapse='///'), by="chebi", .SDcols=c("formula")]

  chebi <- gsub("&#39;", "'", chebi) # convert HTML code
  # Join chebi + formula
  colnames(chebi) <- c("chebi", "name", "synonyms", "smiles", "inchi", "kegg", "parent")
  rownames(chebi) <- seq_len(nrow(chebi))
  chebi <- as.data.frame(chebi, stringsAsFactors=FALSE)
  chebi_formula3 <- as.data.frame(chebi_formula3, stringsAsFactors=FALSE)

  chebi2 <- join(chebi, chebi_formula3, by='chebi')
  chebi2
}
# Parse the lines of a Rhea BioPAX/OWL file into a reaction table.
#
# Args:
#   owl: character vector, one element per line of the file.
#
# Works in three passes:
#   1. Scan for biochemicalReaction / transport elements and collect their
#      attributes into `rheaReaction` (one row per element).
#   2. Scan for physicalEntity elements and collect compound id, common name
#      and ChEBI id into `rheaChemical`.
#   3. Rewrite every common-name equation by replacing each participant name
#      with the ChEBI id found in `rheaChemical`.
#
# Returns:
#   `rheaReaction` with two extra columns, `equationWithChebi` and
#   `equationParticipant` (comma-separated ChEBI ids); NA values are replaced
#   by ''.
#
# NOTE(review): trim() and rbind.fill() are not defined in this file;
# presumably gdata::trim and plyr::rbind.fill.
.parse.rhea <-
function(owl) {
	entry = c('biochemicalReaction', 'transportReaction', 'equationWithCommonName', 'ecNumber', 'metacyc', 'kegg', 'sameParticipant', 'mapped', 'formuled', 'polymerization', 'chemicallyBalanced', 'iubmb', 'status', 'transport', 'direction', 'classOfReactions', 'closeBiochemicalReaction', 'closeTransport')

	# define regular expression
	# (one per name in `entry`; entries 1-2 open an element, 3-16 are
	# attributes whose value is capture group 2, 17-18 close an element)

	regexp = list()	

	regexp[[entry[1]]] = '(<bp:biochemicalReaction rdf:about="http://identifiers\\.org/rhea/)(.*)(">)' # biochemical reaction
	regexp[[entry[2]]] = '(<bp:transport.* rdf:about=")(.*)(">)' # transport reaction
	regexp[[entry[3]]] = '(<bp:NAME .*>)(.*)(</bp:NAME>)' # reaction equation expressed by chemical name
	regexp[[entry[4]]] = '(<bp:EC-NUMBER .*>)(.*)(</bp:EC-NUMBER>)' # EC number
	regexp[[entry[5]]] = '(<bp:XREF rdf:resource="#METACYC:)(.*)(" />)' # cross-reference to MetaCyc
	regexp[[entry[6]]] = '(<bp:XREF rdf:resource="#KEGG_REACTION:)(.*)(" />)' # cross-reference to KEGG
	regexp[[entry[7]]] = '(<bp:XREF rdf:resource="#rel/../RHEA:)(.*)(" />)' # same participants, different direction
	regexp[[entry[8]]] = '(<bp:COMMENT rdf:datatype = .+>RHEA:Mapped=)(.*)(</bp:COMMENT>)'
	regexp[[entry[9]]] = '(<bp:COMMENT rdf:datatype = .+>RHEA:Formuled=)(.*)(</bp:COMMENT>)'
	regexp[[entry[10]]] = '(<bp:COMMENT rdf:datatype = .+>RHEA:Polymerization=)(.*)(</bp:COMMENT>)'
	regexp[[entry[11]]] = '(<bp:COMMENT rdf:datatype = .+>RHEA:Chemically balanced=)(.*)(</bp:COMMENT>)'
	regexp[[entry[12]]] = '(<bp:COMMENT rdf:datatype = .+>RHEA:IUBMB=)(.*)(</bp:COMMENT>)'
	regexp[[entry[13]]] = '(<bp:COMMENT rdf:datatype = .+>RHEA:Status=)(.*)(</bp:COMMENT>)'
	regexp[[entry[14]]] = '(<bp:COMMENT rdf:datatype = .+>RHEA:Transport=)(.*)(</bp:COMMENT>)'
	regexp[[entry[15]]] = '(<bp:COMMENT rdf:datatype = .+>RHEA:Direction=)(.*)(</bp:COMMENT>)'
	regexp[[entry[16]]] = '(<bp:COMMENT rdf:datatype = .+>RHEA:Class of reactions=)(.*)(</bp:COMMENT>)'
	regexp[[entry[17]]] = '</bp:biochemicalReaction>' # close tag
	regexp[[entry[18]]] = '</bp:transport' # close tag

	
	# pre-calculate conditions (TRUE/FALSE) for fast 'for' loop operation
	# condition[[name]][i] is TRUE when line i matches the pattern for `name`

	condition = list()

	for(i in entry) {
		condition[[i]] = grepl(regexp[[i]], owl)
	}
	
	# parsing using regular expressions

	cat("parsing", length(owl), 'lines\n')

	rheaReaction = data.frame()
	rheaReactionRow = list()

	for(i in 1:length(owl)) {
		# progress is reported only at the 10%, 20% and 50% marks
		if(i == floor(length(owl)/10)) {
			cat('10% finished\n')
		} else if(i == floor(length(owl)/5)) {
			cat('20% finished\n')
		} else if(i == floor(length(owl)/2)) {
			cat('50% finished\n')
		}
		
    for(j in 1:length(entry)) {
			# tag start
			# the id is the last '/'-separated piece of the first quoted
			# attribute value on the line; the row was just reset, so the
			# c() below yields `value` alone
			if(j %in% 1:2 && condition[[entry[j]]][i]) {
				rheaReactionRow = list()
				value = unlist(strsplit(owl[i], split = '"'))[2]
        value = tail(unlist(strsplit(value, split = '/')), 1)
				rheaReactionRow[['rheaId']] = c(rheaReactionRow[[entry[1]]], value)
				rheaReactionRow[['reactionType']] = entry[j]
			}
      
			# attribute line: values of the same kind are accumulated with ','
			# as separator (a leading ',' is stripped in post-processing)
			else if(j %in% 3:16 && condition[[entry[j]]][i]) {
				value = sub(regexp[[entry[j]]], '\\2', owl[i])
				rheaReactionRow[[entry[j]]] = paste(rheaReactionRow[[entry[j]]], value, sep=',')
			}

			# tag close
			# append the finished row; rbind.fill is used because rows can
			# have different sets of columns
			else if(j %in% 17:18 && condition[[entry[j]]][i]) {
    		rheaReactionRow = as.data.frame(rheaReactionRow, stringsAsFactors = FALSE)
    		rheaReaction = rbind.fill(rheaReaction, rheaReactionRow)
				rheaReactionRow = list()
			}
		}
	}
	
	# post process
	# strip the leading ',' left by the accumulation above and decode the
	# URL-encoded plus sign
	for(i in names(rheaReaction)) {
		rheaReaction[[i]] = sub('^,', '', rheaReaction[[i]])
		rheaReaction[[i]] = gsub('%2b', '+', rheaReaction[[i]])
	}
	rheaReaction = trim(rheaReaction)
	# direction values are turned into the '-'-separated keys used by
	# directionList further down
	rheaReaction$direction = gsub(' ', '-', rheaReaction$direction)
  
	numberOfBiochecmialReaction = length(unique(rheaReaction[rheaReaction$reactionType == 'biochemicalReaction', 'rheaId']))
	numberOfTransportReaction = length(unique(rheaReaction[rheaReaction$reactionType == 'transportReaction', 'rheaId']))

	cat('# of biochemical reaction: ', numberOfBiochecmialReaction, '\n')
	cat('# of transport reaction: ', numberOfTransportReaction, '\n')

	# replace html code
	rheaReaction[,'equationWithCommonName'] = gsub(pattern = "&gt;", replacement = ">", x = rheaReaction[,'equationWithCommonName'])
	rheaReaction[,'equationWithCommonName'] = gsub(pattern = "&lt;", replacement = "<", x = rheaReaction[,'equationWithCommonName'])
	rheaReaction[,'equationWithCommonName'] = gsub(pattern = "&apos;", replacement = "'", x = rheaReaction[,'equationWithCommonName'])
  
	# Sorting
	rheaReaction = rheaReaction[order(rheaReaction[,'rheaId']),]
	rownames(rheaReaction) = 1:nrow(rheaReaction)

	##### Parsing equation expressed with ChEBI ID ##### 
	
	cat("Parsing equation expressed with ChEBI ID\n")

	# `entry`, `regexp` and `condition` are reused from here on for the
	# physicalEntity pass
	entry = c('compoundId', 'name', 'chebiId', 'close')

	# define regular expression

	regexp = list()

	regexp[[entry[1]]] = '(<bp:physicalEntity rdf:about="#compound:)(.+)(">)' # ChEBI ID
  regexp[[entry[2]]] = '(<bp:NAME rdf:datatype .+>)(.*)(</bp:NAME>)' # compound common name used in equation
	regexp[[entry[3]]] = '(<bp:COMMENT rdf:datatype = "http://www.w3.org/2001/XMLSchema#string">.* CHEBI:)(.*)(</bp:COMMENT>)'
	regexp[[entry[4]]] = '</bp:physicalEntity>' # close

	# pre-calculate conditions (TRUE/FALSE) for fast 'for' loop operation

	condition = list()

	for(i in entry) {
		condition[[i]] = grepl(regexp[[i]], owl)
	}

	# parsing using regular expressions

	cat("Parsing owl to create list mapping chemical common name into ChEBI ID\n")

	rheaChemical = data.frame()
	rheaChemicalRow = list()

	for(i in 1:length(owl)) {
		for(j in 1:length(entry)) {
			# Tag start. Please change the number in if statement depending on length of entry
			# compoundId is the first quoted attribute value with its first
			# '#' removed
			if(j == 1 && condition[[entry[j]]][i]) {
				rheaChemicalRow = list()
				value = unlist(strsplit(owl[i], split = '"'))[2]
				value = sub('#', '', value)
				rheaChemicalRow[[entry[j]]] = c(rheaChemicalRow[[entry[1]]], value)
			}
			# name / chebiId: a later matching line overwrites an earlier one
			else if(j %in% 2:3 && condition[[entry[j]]][i]) {
				value = sub(regexp[[entry[j]]], '\\2', owl[i])
				rheaChemicalRow[[entry[j]]] = value
			}

			# Tag close
			else if(j == 4 && condition[[entry[j]]][i]) {
				rheaChemicalRow = as.data.frame(rheaChemicalRow, stringsAsFactors = FALSE)
				rheaChemical = rbind.fill(rheaChemical, rheaChemicalRow)
				rheaChemicalRow = list()
			}
		}
	}

	rheaChemical = trim(rheaChemical)
	rheaChemical[,'name'] = gsub(pattern = "&gt;", replacement = ">", x = rheaChemical[,'name'])
	rheaChemical[,'name'] = gsub(pattern = "&lt;", replacement = "<", x = rheaChemical[,'name'])
	rheaChemical[,'name'] = gsub(pattern = "&apos;", replacement = "'", x = rheaChemical[,'name'])

	numberOfChemicals = nrow(rheaChemical)

	cat("# of chemicals used in Rhea: ", numberOfChemicals, '\n') 


	##### Building equation with the list #####
	
	cat("Building equation with the list\n")

	# Define direction
	# NOTE(review): 'right-to-left' and 'left-to-right' both map to ' => ' --
	# confirm this is intended

	directionList = list()
	directionList[['undefined']] = ' <?> '
	directionList[['bidirectional']] = ' <=> '
	directionList[['right-to-left']] = ' => '
	directionList[['left-to-right']] = ' => '
  
	# Define regular expressions
	# coefficient: optional leading stoichiometry such as '2 ' or '(n+1) '
	# localization: trailing '(in)' / '(out)' style suffix
  
	regexp_coefficient = "(^\\(?[0-9]*n?\\+?[0-9]*\\)? )(.+)"
	regexp_localization = "(.+)(\\([inout]+\\))"

	# Conversion (split - conversion - paste)
  
	equationWithChebi = character(nrow(rheaReaction))
	equationParticipant = character(nrow(rheaReaction))
	for(i in 1:nrow(rheaReaction)) {
		reactantsChebi = c()
		productsChebi= c()
		arrow = directionList[[rheaReaction[i, 'direction']]]
		# escape the '?' of ' <?> ' so the arrow can be used as a regex split
		arrow2 = sub('\\?', '\\\\?', arrow)
		participants = unlist(strsplit(rheaReaction[i,'equationWithCommonName'], split=arrow2))
   	reactants = unlist(strsplit(participants[1], split = " \\+ "))
   	reactantsWOcoefficient = sub(regexp_coefficient, "\\2", reactants)
   	for(j in 1:length(reactants)) { 
			if(grepl(regexp_coefficient, reactants[j])) { # process coefficient
 				coefficient = sub(regexp_coefficient, "\\1", reactants[j])
   		} else {coefficient = ""}
			if(grepl(regexp_localization, reactants[j])) { # process (in), (out)
      	localization = sub(regexp_localization, "\\2", x = reactants[j])
      } else {localization = ""}
			reactantsWOcoefficient[j] = gsub("\\(in\\)", "", reactantsWOcoefficient[j])
      reactantsWOcoefficient[j] = gsub("\\(out\\)", "", reactantsWOcoefficient[j])
			# look the bare name up in the name -> ChEBI table; more than one
			# hit is treated as ambiguous and written as "Unknown"
			chebi = rheaChemical[rheaChemical$name == reactantsWOcoefficient[j], 'chebiId']
      if(length(chebi) > 1) {
        reactants[j] = "Unknown"
      } else {
        reactants[j] = paste(coefficient, chebi, localization, sep="")
      }
			reactantsChebi = c(reactantsChebi, chebi)
    }
		# same procedure for the right-hand side of the equation
		products = unlist(strsplit(participants[2], split = " \\+ "))
    productsWOcoefficient = sub(regexp_coefficient, "\\2", products)
		
		for(k in 1:length(products)) {
     	if(grepl(regexp_coefficient, products[k])) { # process coefficient
       	coefficient = sub(regexp_coefficient, "\\1", products[k])
     	} else {coefficient = ""}
     	if(grepl(regexp_localization, products[k])) { # process (in), (out)
       	localization = sub(regexp_localization, "\\2", x = products[k])
     	} else {localization = ""}
     	productsWOcoefficient[k] = gsub("\\(in\\)", "", productsWOcoefficient[k])
     	productsWOcoefficient[k] = gsub("\\(out\\)", "", productsWOcoefficient[k])
     	chebi = rheaChemical[rheaChemical$name == productsWOcoefficient[k], 'chebiId']
     	if(length(chebi) > 1) {
        products[k] = "Unknown"  
     	} else {
     	  products[k] = paste(coefficient, chebi, localization, sep='')   
     	}
      productsChebi = c(productsChebi, chebi)
    }
   	equationWithChebi[i] = paste(paste(reactants, collapse = " + "), paste(products, collapse = " + "), sep = arrow)
		equationParticipant[i] = paste(c(reactantsChebi, productsChebi), collapse=',')
 	}
	# I() keeps the two new columns as-is when they are bound to the table
	result = cbind(rheaReaction, I(equationWithChebi), I(equationParticipant))
	result[is.na(result)] = ''
	return(result)
}
# Return the first element of a vector that has more than one element.
#
# NOTE(review): for vectors of length 0 or 1 the result is an invisible NULL,
# not the single element -- confirm callers rely on this before changing it.
.unique_column <-
function(vector) {
  if (length(vector) <= 1) {
    return(invisible(NULL))
  }
  vector[1]
}
# Strip or decode a fixed set of escaped HTML fragments in every column of a
# data frame.
#
# The rules below are applied to each column in the listed order. Every
# column comes back as character, because gsub() returns character vectors.
# Only the listed fragments are touched; a bare '&lt;' is not decoded.
.convert.html <- function(dataFrame) {
  # pattern, replacement -- order matters and must not be changed
  rules <- rbind(
    c('&lt;i&gt;', ''),
    c('&lt;/i&gt;', ''),
    c('&amp;beta;', 'beta'),
    c('&lt;SUP&gt;', ''),
    c('&lt;/SUP&gt;', ''),
    c('&lt;sup&gt;', ''),
    c('&lt;/sup&gt;', ''),
    c('&lt;sub&gt;', ''),
    c('&lt;/sub&gt;', ''),
    c(' &amp;rarr; ', '=>'),
    c(' &amp;larr; ', '<='),
    c(' &amp;harr; ', '<=>'),
    c('  =  ', ' <=> '),
    c('&amp;pi;', 'pi'),
    c('&amp;alpha;', 'alpha'),
    c('&lt;I&gt;', ''),
    c('&lt;/I&gt;', ''),
    c('&lt;SUB&gt;', ''),
    c('&lt;/SUB&gt;', ''),
    c('&apos;', "'"),
    c('&gt;', ">"),
    c('&amp;gamma;', "gamma")
  )

  cleaned <- dataFrame
  for (column in colnames(dataFrame)) {
    values <- cleaned[[column]]
    for (r in seq_len(nrow(rules))) {
      values <- gsub(rules[r, 1], rules[r, 2], values)
    }
    cleaned[[column]] <- values
  }
  cleaned
}

.parse.biopax <-
function(biopax) {
  ### Define regula expression
  ## BiochemicalReaction set 
  regex_biochemicalReaction_tag = '<bp:BiochemicalReaction rdf:ID=".+">'
  regex_biochemicalReaction_close = '</bp:BiochemicalReaction>'
  regex_standardName_tag = '<bp:standardName rdf:datatype="http://www.w3.org/2001/XMLSchema#string">(.+)</bp:standardName>'
  regex_eCNumber_tag = '<bp:eCNumber rdf:datatype="http://www.w3.org/2001/XMLSchema#string\">(.+)</bp:eCNumber>'
  regex_xref_tag = '<bp:xref rdf:resource=\"#(.+)"/>'
  regex_left_tag = '<bp:left rdf:resource=\"#(.+)"/>'
  regex_right_tag = '<bp:right rdf:resource=\"#(.+)"/>'
  regex_participantStoichiometry_tag = '<bp:participantStoichiometry rdf:resource=\"#(.+)"/>'
  
  index_biochemicalReaction_tag = grepl(regex_biochemicalReaction_tag, biopax)
  index_biochemicalReaction_close = grepl(regex_biochemicalReaction_close, biopax)
  index_standardName_tag = grepl(regex_standardName_tag, biopax)
  index_eCNumber_tag = grepl(regex_eCNumber_tag, biopax)
  index_xref_tag = grepl(regex_xref_tag, biopax)
  index_left_tag = grepl(regex_left_tag, biopax)
  index_right_tag = grepl(regex_right_tag, biopax)
  index_participantStoichiometry_tag = grepl(regex_participantStoichiometry_tag, biopax)
  
  ## Transport set
  regex_Transport_tag = '<bp:Transport rdf:ID=".+>'
  regex_Transport_close = '</bp:Transport>'
  
  index_Transport_tag = grepl(regex_Transport_tag, biopax)
  index_Transport_close = grepl(regex_Transport_close, biopax)
  
  ## TransportWithBiochemicalReaction set
  regex_TransportWithBiochemicalReaction_tag = '<bp:TransportWithBiochemicalReaction rdf:ID=".+">'
  regex_TransportWithBiochemicalReaction_close = '</bp:TransportWithBiochemicalReaction>'
  
  index_TransportWithBiochemicalReaction_tag = grepl(regex_TransportWithBiochemicalReaction_tag, biopax)
  index_TransportWithBiochemicalReaction_close = grepl(regex_TransportWithBiochemicalReaction_close, biopax)
  
  ## ComplexAssembly set
  regex_ComplexAssembly_tag = '<bp:ComplexAssembly rdf:ID=\".+">'
  regex_ComplexAssembly_close = '</bp:ComplexAssembly>'
  
  index_ComplexAssembly_tag = grepl(regex_ComplexAssembly_tag, biopax)
  index_ComplexAssembly_close = grepl(regex_ComplexAssembly_close, biopax)
  
  ## UnificationXref & RelationshipXref set
  regex_UnificationXref_tag = '<bp:UnificationXref rdf:ID="(.+)">'
  regex_UnificationXref_close = '</bp:UnificationXref>'
  regex_RelationshipXref_tag = '<bp:RelationshipXref rdf:ID="(.+)">'
  regex_RelationshipXref_close = '</bp:RelationshipXref>'
  regex_xref_integrated_tag = "<bp:.+ rdf:ID=\"(.+)\">"
  regex_id_tag = '<bp:id rdf:datatype=\"http://www.w3.org/2001/XMLSchema#string\">(.+)</bp:id>'
  regex_db_tag = '<bp:db rdf:datatype=\"http://www.w3.org/2001/XMLSchema#string\">(.+)</bp:db>'
  
  index_UnificationXref_tag = grepl(regex_UnificationXref_tag, biopax)
  index_UnificationXref_close = grepl(regex_UnificationXref_close, biopax)
  index_RelationshipXref_tag = grepl(regex_RelationshipXref_tag, biopax)
  index_RelationshipXref_close = grepl(regex_RelationshipXref_close, biopax)
  index_id_tag = grepl(regex_id_tag, biopax)
  index_db_tag = grepl(regex_db_tag, biopax)
  
  ## Small Molecule set
  regex_SmallMolecule_tag = '<bp:SmallMolecule rdf:ID=\"(.+)">'
  regex_SmallMolecule_close = '</bp:SmallMolecule>'
  regex_cellularLocation_tag = '<bp:cellularLocation rdf:resource="#(.+)"/>'
  
  index_SmallMolecule_tag = grepl(regex_SmallMolecule_tag, biopax)
  index_SmallMolecule_close = grepl(regex_SmallMolecule_close, biopax)
  index_cellularLocation_tag = grepl(regex_cellularLocation_tag, biopax)
  
  ## Protein set
  regex_Protein_tag = '<bp:Protein rdf:ID="(.+)">'
  regex_Protein_close = '</bp:Protein>'
  
  index_Protein_tag = grepl(regex_Protein_tag, biopax)
  index_Protein_close = grepl(regex_Protein_close, biopax)
  
  ## Complex set
  regex_Complex_tag = '<bp:Complex rdf:ID="(.+)">'
  regex_Complex_close = '</bp:Complex>'
  
  index_Complex_tag = grepl(regex_Complex_tag, biopax)
  index_Complex_close = grepl(regex_Complex_close, biopax)
  
  ## Rna set
  regex_Rna_tag = '<bp:Rna rdf:ID="(.+)">'
  regex_Rna_close = '</bp:Rna>'
  
  index_Rna_tag = grepl(regex_Rna_tag, biopax)
  index_Rna_close = grepl(regex_Rna_close, biopax)
  
  ## CellularLocationVocabulary set
  regex_CellularLocationVocabulary_tag = '<bp:CellularLocationVocabulary rdf:ID=\"(.+)">'
  regex_term_tag = '<bp:term rdf:datatype=\"http://www.w3.org/2001/XMLSchema#string\">(.+)</bp:term>'
  regex_CellularLocationVocabulary_close = '</bp:CellularLocationVocabulary>'
  
  index_CellularLocationVocabulary_tag = grepl(regex_CellularLocationVocabulary_tag, biopax)
  index_term_tag = grepl(regex_term_tag, biopax)
  index_CellularLocationVocabulary_close = grepl(regex_CellularLocationVocabulary_close, biopax)
  
  ## Flags
  biochemicalReaction_flag = FALSE
  Transport_flag = FALSE
  TransportWithBiochemicalReaction_flag = FALSE
  UnificationXref_flag = FALSE
  SmallMolecule_flag = FALSE
  CellularLocationVocabulary_flag = FALSE
  Protein_flag = FALSE
  Complex_flag = FALSE
  Rna_flag = FALSE
  ComplexAssembly_flag = FALSE
  
  ## Read each line and parsing
  ec_numbers = c()
  xrefs_biochemicalReaction = c()
  xrefs_Transport = c()
  xrefs_TransportWithBiochemicalReaction = c()
  xrefs_ComplexAssembly = c()
  df_biochemicalReaction = c()
  df_Transport = c()
  df_TransportWithBiochemicalReaction = c()
  df_ComplexAssembly = c()
  df_smallMolecule = c()
  df_Protein = c()
  df_Complex = c()
  df_Rna = c()
  df_CellularLocationVocabulary = c()
  tmp_xrefs_smallMolecule = c()
  tmp_xrefs_Protein = c()
  tmp_xrefs_Complex = c()
  tmp_xrefs_Rna = c()
  lefts = c()
  rights = c()
  df_UnificationXref = c()
  participantStoichiometries = c()
  cellularLocation = ''
  for(i in 1:length(biopax)) {
    # Set flag
    if(index_biochemicalReaction_tag[i]) {
      biochemicalReaction_flag = TRUE
    } else if(index_Transport_tag[i]) {
      Transport_flag = TRUE
    } else if(index_TransportWithBiochemicalReaction_tag[i]) {
      TransportWithBiochemicalReaction_flag = TRUE
    } else if(index_ComplexAssembly_tag[i]) {
      ComplexAssembly_flag = TRUE
    } else if(index_UnificationXref_tag[i] || index_RelationshipXref_tag[i]) {
      UnificationXref_flag = TRUE
    } else if(index_SmallMolecule_tag[i]) {
      SmallMolecule_flag = TRUE
    } else if(index_CellularLocationVocabulary_tag[i]) {
      CellularLocationVocabulary_flag = TRUE
    } else if(index_Protein_tag[i]) {
      Protein_flag = TRUE
    } else if(index_Complex_tag[i]) {
      Complex_flag = TRUE
    } else if(index_Rna_tag[i]) {
      Rna_flag = TRUE
    }
    
    # Parsing
    if(biochemicalReaction_flag) {
      if(index_standardName_tag[i]) {
        reaction_equation_name = trim(sub(regex_standardName_tag, '\\1', biopax[i]))
      } else if (index_eCNumber_tag[i]) {
        ec_numbers = c(ec_numbers, trim(sub(regex_eCNumber_tag, '\\1', biopax[i])))
      } else if(index_xref_tag[i]) {
        xrefs_biochemicalReaction = c(xrefs_biochemicalReaction, trim(sub(regex_xref_tag, '\\1', biopax[i])))
      } else if(index_left_tag[i]) {
        lefts = c(lefts, trim(sub(regex_left_tag, '\\1', biopax[i])))
      } else if(index_right_tag[i]) {
        rights = c(rights, trim(sub(regex_right_tag, '\\1', biopax[i])))
      } else if(index_participantStoichiometry_tag[i]) {
        participantStoichiometries = c(participantStoichiometries, trim(sub(regex_participantStoichiometry_tag, '\\1', biopax[i])))
      } else if(index_biochemicalReaction_close[i]) {
        paste_xrefs_biochemicalReaction = paste(xrefs_biochemicalReaction, collapse='///')
        paste_lefts = paste(lefts, collapse='///')
        paste_rights = paste(rights, collapse='///')
        paste_ec_numbers = paste(ec_numbers, collapse='///')
        paste_participantStoichiometries = paste(participantStoichiometries, collapse='///')
        df_biochemicalReaction = cbind(reaction_equation_name, paste_ec_numbers, paste_xrefs_biochemicalReaction, paste_lefts, paste_rights, paste_participantStoichiometries)
        df_biochemicalReaction = data.frame(df_biochemicalReaction, stringsAsFactors=F)
        biochemicalReaction_flag = FALSE
      }
    }
    
    else if(Transport_flag) {
      if(index_standardName_tag[i]) {
        transport_equation_name = trim(sub(regex_standardName_tag, '\\1', biopax[i]))
      } else if (index_eCNumber_tag[i]) {
        ec_numbers = c(ec_numbers, trim(sub(regex_eCNumber_tag, '\\1', biopax[i])))
      } else if(index_xref_tag[i]) {
        xrefs_Transport = c(xrefs_Transport, trim(sub(regex_xref_tag, '\\1', biopax[i])))
      } else if(index_left_tag[i]) {
        lefts = c(lefts, trim(sub(regex_left_tag, '\\1', biopax[i])))
      } else if(index_right_tag[i]) {
        rights = c(rights, trim(sub(regex_right_tag, '\\1', biopax[i])))
      } else if(index_participantStoichiometry_tag[i]) {
        participantStoichiometries = c(participantStoichiometries, trim(sub(regex_participantStoichiometry_tag, '\\1', biopax[i])))
      } else if(index_Transport_close[i]) {
        paste_xrefs_Transport = paste(xrefs_Transport, collapse='///')
        paste_lefts = paste(lefts, collapse='///')
        paste_rights = paste(rights, collapse='///')
        paste_ec_numbers = paste(ec_numbers, collapse='///')
        paste_participantStoichiometries = paste(participantStoichiometries, collapse='///')
        df_Transport = cbind(transport_equation_name, paste_ec_numbers, paste_xrefs_Transport, paste_lefts, paste_rights, paste_participantStoichiometries)
        df_Transport = data.frame(df_Transport, stringsAsFactors=F)
        Transport_flag = FALSE
      }
    }
    
    else if(TransportWithBiochemicalReaction_flag) {
      if(index_standardName_tag[i]) {
        TransportWithBiochemicalReaction_equation_name = trim(sub(regex_standardName_tag, '\\1', biopax[i]))
      } else if (index_eCNumber_tag[i]) {
        ec_numbers = c(ec_numbers, trim(sub(regex_eCNumber_tag, '\\1', biopax[i])))
      } else if(index_xref_tag[i]) {
        xrefs_TransportWithBiochemicalReaction = c(xrefs_TransportWithBiochemicalReaction, trim(sub(regex_xref_tag, '\\1', biopax[i])))
      } else if(index_left_tag[i]) {
        lefts = c(lefts, trim(sub(regex_left_tag, '\\1', biopax[i])))
      } else if(index_right_tag[i]) {
        rights = c(rights, trim(sub(regex_right_tag, '\\1', biopax[i])))
      } else if(index_participantStoichiometry_tag[i]) {
        participantStoichiometries = c(participantStoichiometries, trim(sub(regex_participantStoichiometry_tag, '\\1', biopax[i])))
      } else if(index_TransportWithBiochemicalReaction_close[i]) {
        paste_xrefs_TransportWithBiochemicalReaction = paste(xrefs_TransportWithBiochemicalReaction, collapse='///')
        paste_lefts = paste(lefts, collapse='///')
        paste_rights = paste(rights, collapse='///')
        paste_ec_numbers = paste(ec_numbers, collapse='///')
        paste_participantStoichiometries = paste(participantStoichiometries, collapse='///')
        df_TransportWithBiochemicalReaction = cbind(TransportWithBiochemicalReaction_equation_name, paste_ec_numbers, paste_xrefs_TransportWithBiochemicalReaction, paste_lefts, paste_rights, paste_participantStoichiometries)
        df_TransportWithBiochemicalReaction = data.frame(df_TransportWithBiochemicalReaction, stringsAsFactors=F)
        TransportWithBiochemicalReaction_flag = FALSE
      }
    }
    
    else if(ComplexAssembly_flag) {
      if(index_standardName_tag[i]) {
        ComplexAssembly_equation_name = trim(sub(regex_standardName_tag, '\\1', biopax[i]))
      } else if (index_eCNumber_tag[i]) {
        ec_numbers = c(ec_numbers, trim(sub(regex_eCNumber_tag, '\\1', biopax[i])))
      } else if(index_xref_tag[i]) {
        xrefs_ComplexAssembly = c(xrefs_ComplexAssembly, trim(sub(regex_xref_tag, '\\1', biopax[i])))
      } else if(index_left_tag[i]) {
        lefts = c(lefts, trim(sub(regex_left_tag, '\\1', biopax[i])))
      } else if(index_right_tag[i]) {
        rights = c(rights, trim(sub(regex_right_tag, '\\1', biopax[i])))
      } else if(index_participantStoichiometry_tag[i]) {
        participantStoichiometries = c(participantStoichiometries, trim(sub(regex_participantStoichiometry_tag, '\\1', biopax[i])))
      } else if(index_ComplexAssembly_close[i]) {
        paste_xrefs_ComplexAssembly = paste(xrefs_ComplexAssembly, collapse='///')
        paste_lefts = paste(lefts, collapse='///')
        paste_rights = paste(rights, collapse='///')
        paste_ec_numbers = paste(ec_numbers, collapse='///')
        paste_participantStoichiometries = paste(participantStoichiometries, collapse='///')
        df_ComplexAssembly = cbind(ComplexAssembly_equation_name, paste_ec_numbers, paste_xrefs_ComplexAssembly, paste_lefts, paste_rights, paste_participantStoichiometries)
        df_ComplexAssembly = data.frame(df_ComplexAssembly, stringsAsFactors=F)
        TransportWithBiochemicalReaction_flag = FALSE
      }
    }
    
    else if(UnificationXref_flag) {
      if(index_UnificationXref_tag[i] || index_RelationshipXref_tag[i]) {
        UnificationXref_id = trim(sub(regex_xref_integrated_tag, '\\1', biopax[i]))
      } else if(index_id_tag[i]) {
        id = trim(sub(regex_id_tag, '\\1', biopax[i]))
      } else if(index_db_tag[i]) {
        db = trim(sub(regex_db_tag, '\\1', biopax[i]))
      } else if(index_UnificationXref_close[i] || index_RelationshipXref_close[i]) {
        tmp = c(UnificationXref_id, id, db)
        df_UnificationXref = rbind(df_UnificationXref, tmp)
        df_UnificationXref = data.frame(df_UnificationXref, stringsAsFactors=F)
        UnificationXref_flag = FALSE
      }
    }
    
    else if(SmallMolecule_flag) {
      if(index_SmallMolecule_tag[i]) {
        master_id = trim(sub(regex_SmallMolecule_tag, '\\1', biopax[i]))
      } else if(index_standardName_tag[i]) {
        chemial_name = trim(sub(regex_standardName_tag, '\\1', biopax[i]))
      } else if(index_xref_tag[i]) {
        tmp_xrefs_smallMolecule = c(tmp_xrefs_smallMolecule, trim(sub(regex_xref_tag, '\\1', biopax[i])))
      } else if(index_cellularLocation_tag[i]) {
        cellularLocation = trim(sub(regex_cellularLocation_tag, '\\1', biopax[i]))
      } else if(index_SmallMolecule_close[i]) {
        tmp = c(master_id, chemial_name, paste(tmp_xrefs_smallMolecule, collapse='///'), cellularLocation)
        df_smallMolecule = rbind(df_smallMolecule, tmp)
        SmallMolecule_flag = FALSE
        tmp_xrefs_smallMolecule = c()
        cellularLocation = ''
      }
    }
    
    else if(Protein_flag) {
      if(index_Protein_tag[i]) {
        master_id = trim(sub(regex_Protein_tag, '\\1', biopax[i]))
      } else if(index_standardName_tag[i]) {
        protein_name = trim(sub(regex_standardName_tag, '\\1', biopax[i]))
      } else if(index_xref_tag[i]) {
        tmp_xrefs_Protein = c(tmp_xrefs_Protein, trim(sub(regex_xref_tag, '\\1', biopax[i])))
      } else if(index_cellularLocation_tag[i]) {
        cellularLocation = trim(sub(regex_cellularLocation_tag, '\\1', biopax[i]))
      } else if(index_Protein_close[i]) {
        tmp = c(master_id, protein_name, paste(tmp_xrefs_Protein, collapse='///'), cellularLocation)
        df_Protein = rbind(df_Protein, tmp)
        Protein_flag = FALSE
        tmp_xrefs_Protein = c()
        cellularLocation = ''
      }
    }
    
    else if(Complex_flag) {
      if(index_Complex_tag[i]) {
        master_id = trim(sub(regex_Complex_tag, '\\1', biopax[i]))
      } else if(index_standardName_tag[i]) {
        complex_name = trim(sub(regex_standardName_tag, '\\1', biopax[i]))
      } else if(index_xref_tag[i]) {
        tmp_xrefs_Complex = c(tmp_xrefs_Complex, trim(sub(regex_xref_tag, '\\1', biopax[i])))
      } else if(index_cellularLocation_tag[i]) {
        cellularLocation = trim(sub(regex_cellularLocation_tag, '\\1', biopax[i]))
      } else if(index_Complex_close[i]) {
        tmp = c(master_id, complex_name, paste(tmp_xrefs_Complex, collapse='///'), cellularLocation)
        df_Complex = rbind(df_Complex, tmp)
        Complex_flag = FALSE
        tmp_xrefs_Complex = c()
        cellularLocation = ''
      }
    }
    
    else if(Rna_flag) {
      if(index_Rna_tag[i]) {
        master_id = trim(sub(regex_Rna_tag, '\\1', biopax[i]))
      } else if(index_standardName_tag[i]) {
        rna_name = trim(sub(regex_standardName_tag, '\\1', biopax[i]))
      } else if(index_xref_tag[i]) {
        tmp_xrefs_Rna = c(tmp_xrefs_Rna, trim(sub(regex_xref_tag, '\\1', biopax[i])))
      } else if(index_cellularLocation_tag[i]) {
        cellularLocation = trim(sub(regex_cellularLocation_tag, '\\1', biopax[i]))
      } else if(index_Rna_close[i]) {
        tmp = c(master_id, rna_name, paste(tmp_xrefs_Rna, collapse='///'), cellularLocation)
        df_Rna = rbind(df_Rna, tmp)
        Rna_flag = FALSE
        tmp_xrefs_Rna = c()
        cellularLocation = ''
      }
    }
    
    if(CellularLocationVocabulary_flag) {
      if(index_CellularLocationVocabulary_tag[i]) {
        CellularLocationVocabulary_id = trim(sub(regex_CellularLocationVocabulary_tag, '\\1', biopax[i]))
      } else if(index_term_tag[i]) {
        CellularLocationVocabulary_term = trim(sub(regex_term_tag, '\\1', biopax[i]))
      } else if(index_CellularLocationVocabulary_close[i]) {
        tmp = c(CellularLocationVocabulary_id, CellularLocationVocabulary_term)
        df_CellularLocationVocabulary = rbind(df_CellularLocationVocabulary, tmp)
        CellularLocationVocabulary_flag = FALSE
      }
    }
  }
  
  ## Post-processing data frame
  if(length(df_smallMolecule)>0) {
    rownames(df_smallMolecule) = 1:nrow(df_smallMolecule)
    df_smallMolecule = data.frame(df_smallMolecule, stringsAsFactors=F)
  }
  if(length(df_Protein)>0) {
    rownames(df_Protein) = 1:nrow(df_Protein)
    df_Protein = data.frame(df_Protein, stringsAsFactors=F)
  }
  if(length(df_Complex)>0) {
    rownames(df_Complex) = 1:nrow(df_Complex)
    df_Complex = data.frame(df_Complex, stringsAsFactors=F)
  }
  if(length(df_Rna)>0) {
    rownames(df_Rna) = 1:nrow(df_Rna)
    df_Rna = data.frame(df_Rna, stringsAsFactors=F)
  }
  
  rownames(df_CellularLocationVocabulary) = 1:nrow(df_CellularLocationVocabulary)
  df_CellularLocationVocabulary = data.frame(df_CellularLocationVocabulary, stringsAsFactors=F)
  
  ## Concatenate data.frames
  if(length(df_biochemicalReaction)>0) {
    df_reaction = df_biochemicalReaction
  } else if(length(df_Transport)>0) {
    df_reaction = df_Transport
  } else if(length(df_TransportWithBiochemicalReaction)>0) {
    df_reaction = df_TransportWithBiochemicalReaction
  } else if(length(df_ComplexAssembly)>0) {
    df_reaction = df_ComplexAssembly
  }
  
  df_compound = rbind.fill(df_smallMolecule, df_Protein, df_Complex, df_Rna)
  
  ## Handle Xref
  colnames(df_UnificationXref) = c('xref','id','db')
  colnames(df_reaction) = c('name','ec_number','xref','lefts','rights','participantStoichiometries')
  colnames(df_compound) = c('name', 'standard_name','xref', 'cellularLocation')
  colnames(df_CellularLocationVocabulary) = c('master_id', 'term')
  
  df_reaction = .handle.xref(df_reaction, df_UnificationXref)
  df_compound = .handle.xref(df_compound, df_UnificationXref)
  
  ## Convert HTML code to character
  df_reaction = .convert.html(df_reaction)
  df_compound = .convert.html(df_compound)
  
  ## Parse stoichiometric coefficient
  regex_Stoichiometry_tag = '<bp:Stoichiometry rdf:ID="(.+)">'
  regex_Stoichiometry_close = '</bp:Stoichiometry>'
  regex_stoichiometricCoefficient_tag = '<bp:stoichiometricCoefficient rdf:datatype="http://www.w3.org/2001/XMLSchema#.+">(.+)</bp:stoichiometricCoefficient>'
  
  regex_id = '<bp:.+ rdf:ID="(.+)">'
  regex_id2 = '<bp:PHYSICAL-ENTITY rdf:resource="#(.+)"/>'
  
  regex_location = '<bp:cellularLocation rdf:resource="#(.+)"/>'
  regex_location2 = '<bp:cellularLocation>'
  regex_location3 = '<bp:CellularLocationVocabulary rdf:ID="(.+)">'
  
  index_Stoichiometry_tag = grep(regex_Stoichiometry_tag, biopax)
  index_Stoichiometry_close = grep(regex_Stoichiometry_close, biopax)
  
  master_id = trim(sub(regex_Stoichiometry_tag, '\\1', biopax[index_Stoichiometry_tag]))
  stoichiometric_coefficient = trim(sub(regex_stoichiometricCoefficient_tag, '\\1', biopax[index_Stoichiometry_tag+1]))
  
  id = trim(sub(regex_id, '\\1', biopax[index_Stoichiometry_tag+3]))
  id2 = trim(sub(regex_id2, '\\1', biopax[index_Stoichiometry_tag+2]))
  
  id_new = character(length(id))
  id_new[grepl('>', id)==FALSE] = id[grepl('>', id)==FALSE]
  id_new[grepl('>', id2)==FALSE] = id2[grepl('>', id2)==FALSE]
  
  location = trim(sub(regex_location, '\\1', biopax[index_Stoichiometry_close-3]))
  index_location = grep(regex_location2, biopax)
  if(length(location) != length(index_location)) {
    location[grep('comment', location)] = ''
    location[grep('dataSource', location)] = ''
  }
  if(length(index_location)>0) {
    location2 = trim(sub(regex_location3, '\\1', biopax[index_location+1]))
    location[grep('<', location)] = location2[grepl('>', location2)==F]
  } else {
    location[grep('>', location)] = df_compound[df_compound$name == id_new, 'cellularLocation']
  }
  
  participants = data.frame(cbind(master_id, stoichiometric_coefficient, id_new, location), stringsAsFactors=F)
  
  left_sub =build.subtable(df_reaction, 'MetaCyc', 'lefts', '///')
  right_sub = build.subtable(df_reaction, 'MetaCyc', 'rights', '///')
  
  ## Sometimes there is no 'Stoichiometry' tag
  if(nrow(participants) < (nrow(left_sub) + nrow(right_sub))) {
    tmp_df_CellularLocationVocabulary = df_CellularLocationVocabulary
    colnames(tmp_df_CellularLocationVocabulary)[grep('master_id', names(tmp_df_CellularLocationVocabulary))] = 'cellularLocation'
    participants = merge(df_compound, tmp_df_CellularLocationVocabulary, by = 'cellularLocation', all.x=T)
    colnames(participants)[grep('term', names(participants))] = 'location2'
    colnames(participants)[grep('^name$', names(participants))] = 'id_new'
    participants[is.na(participants)] = 'cytosol'
  } else {
    ## Generate equation with MetaCyc ID (not name)
    stoichio_sub = build.subtable(df_reaction, 'MetaCyc', 'participantStoichiometries', '///')
    
    if(nrow(df_compound) > 0) {
      compound_tmp = df_compound
      colnames(compound_tmp)[1] = 'id_new'
      participants = merge(participants, compound_tmp[,c('id_new','MetaCyc')], by='id_new')
    }
    
    CellularLocationVocabulary_tmp = df_CellularLocationVocabulary
    colnames(CellularLocationVocabulary_tmp)[1] = 'location'
    colnames(CellularLocationVocabulary_tmp)[2] = 'location2'
    if(any(participants$location %in% CellularLocationVocabulary_tmp$location)) {
      participants = merge(participants, CellularLocationVocabulary_tmp[,c('location','location2')], by='location')
    } else {
      participants[['location2']] = ''
    }  
  }
  
  participants$location2 = sub('cytosol', '(in)', participants$location2)
  participants$location2 = sub('chloroplast stroma', '(out)', participants$location2)
  participants$location2 = sub('extracellular region', '(out)', participants$location2)
  participants$location2 = sub('outer membrane-bounded periplasmic space', '(out)', participants$location2)
  participants$location2 = sub('plasma membrane', '(out)', participants$location2)
  
  participants[['assemble']] = paste(participants$stoichiometric_coefficient, participants$MetaCyc, participants$location2)
  participants$assemble = sub('^1 ', '', participants$assemble)
  participants$assemble = sub('^ ', '', participants$assemble)
  participants$assemble = sub(' \\(in)', '(in)', participants$assemble)
  participants$assemble = sub(' \\(out)', '(out)', participants$assemble)
  participants$assemble = sub(' $', '', participants$assemble)
  
  participants_tmp = participants
  colnames(participants)[grep('id_new', names(participants))] = 'lefts'
  left_sub = merge(participants[,c('lefts', 'assemble')], left_sub, by='lefts')
  left_sub = data.table(left_sub)
  left_sub2 = left_sub[, lapply(.SD, paste, collapse=' + '), by="MetaCyc", .SDcols=c("assemble")]
  left_sub2 = data.frame(left_sub2, stringsAsFactors=F)
  
  colnames(participants)[grep('lefts', names(participants))] = 'rights'
  right_sub = merge(right_sub, participants[,c('rights', 'assemble')], by='rights')
  right_sub = data.table(right_sub)
  right_sub2 = right_sub[, lapply(.SD, paste, collapse = ' + '), by="MetaCyc", .SDcols=c("assemble")]
  right_sub2 = data.frame(right_sub2, stringsAsFactors=F)
  
  ind_bidirection = grep(' <=> ', df_reaction$name)
  ind_right = grep(' => ', df_reaction$name)
  ind_left = grep(' <= ', df_reaction$name)
  
  direction = character(nrow(df_reaction))
  
  direction[ind_bidirection] = " <=> "
  direction[ind_right] = ' => '
  direction[ind_left] = ' <= '
  
  equation = paste(left_sub2$assemble, direction, right_sub2$assemble, sep='')
  
  ind_without_out = grepl('\\(out\\)', equation) == F
  equation[ind_without_out] = gsub('\\(in\\)', '', equation[ind_without_out])
  
  ## Remove unnecessary columns
  df_reaction[['xref']] = NULL
  df_reaction[['lefts']] = NULL
  df_reaction[['rights']] = NULL
  df_reaction[['participantStoichiometries']] = NULL
  
  ## Add equation and reorder the data.frame
  df_reaction[['name_id']] = equation
  
  colnames = names(df_reaction)
  colnames = colnames[-(grep('MetaCyc', colnames))]
  colnames = c('MetaCyc', colnames)
  
  df_reaction = df_reaction[,colnames]
  
  return(df_reaction)
}

# Spread the '///'-separated cross references held in df$xref into one column
# per external database.
#
# Args:
#   df: data.frame with a 'name' column (used as the row key) and an 'xref'
#     column of '///'-separated xref ids.
#   df_UnificationXref: data.frame with columns 'xref' (xref id), 'id' (the
#     identifier in the external database) and 'db' (the database name).
#
# Returns:
#   df with one additional column per database name encountered. Each cell
#   holds the matching ids joined by '///' ('' when there are none); the
#   'MetaCyc' column keeps only the first id. When no xref matches, df is
#   returned without additional columns.
.handle.xref = function(df, df_UnificationXref) {
  # Convert xref into column 'db' and value 'id'.
  # build.subtable is defined elsewhere; it is used here as returning one row
  # per (name, xref) pair with columns 'name' and 'xref'.
  xref = build.subtable(df, 'name', 'xref', '///')

  # Keep only ids starting with 'U' or 'R' (presumably UnificationXref and
  # RelationshipXref ids -- confirm against the BioPAX input).
  xref.1 = xref[grep('^U', xref$xref),]
  xref.2 = xref[grep('^R', xref$xref),]
  xref = rbind(xref.1, xref.2)
  xref_merged = merge(xref, df_UnificationXref, by='xref')

  # seq_len() gives an empty sequence when nothing merged; 1:nrow() would run
  # over c(1, 0) and index rows that do not exist.
  for(i in seq_len(nrow(xref_merged))) {
    # as.character() guards against a factor being used as a column position.
    db = as.character(xref_merged[i,'db'])
    if(!(db %in% colnames(df))) {
      df[db] = ''
    }
    rows = df$name == xref_merged$name[i]
    # Prepend the new id; the trailing separator is stripped below.
    df[rows, db] = paste(xref_merged[i,'id'], df[rows, db], sep='///')
  }

  for(col in colnames(df)) {
    if(col == 'MetaCyc') {
      # Only the first MetaCyc id is kept.
      df[[col]] = gsub('///.*', '', df[[col]])
    }
    df[[col]] = gsub('///$', '', df[[col]])
  }
  return(df)
}

# Fetch the MetaCyc reaction table for the given reaction ids and blank out
# missing values.
#
# Args:
#   reaction_ids: passed unchanged to get.metacyc.reaction.byId().
#
# Returns:
#   The object returned by get.metacyc.reaction.byId(), with every NA entry
#   replaced by the empty string ''.
.parse.metacyc.biopax <- function(reaction_ids) {
  reactions <- get.metacyc.reaction.byId(reaction_ids)
  replace(reactions, is.na(reactions), '')
}

# Try the RbioRXN package in your browser
#
# Any scripts or data that you put into this service are public.
#
# RbioRXN documentation built on May 29, 2017, 10:56 a.m.