R/RbioRXN-internal.R
In RbioRXN: Process Rhea, KEGG, MetaCyc, Unipathway Biochemical Reaction Data

.Random.seed <-
c(403L, 1L, -754469287L, 1611343336L, 409493526L, -487257199L, 
-1171770301L, 1912392530L, -551743516L, 172658215L, -1911126403L, 
-2105337996L, 1780845402L, 312237477L, -1208624705L, -1974316474L, 
1432961232L, -882173197L, -1181066223L, 554658368L, 2113547806L, 
51538153L, 288959003L, 235297322L, -1119154868L, -383201521L, 
-1908977083L, 1447736316L, 2029890130L, 78436301L, 461046151L, 
-1059679122L, 1435307080L, 1176483403L, 50711465L, -655801608L, 
-597383642L, 458866497L, 810976467L, 1481803906L, 1205053108L, 
867844663L, -2103047891L, -677926332L, -1134618774L, 698909653L, 
1773016431L, -1126723466L, -1364018272L, -1496474525L, -1267422975L, 
817276017L, -681869298L, 1040064633L, 1635557387L, -707858118L, 
307916860L, 1885206399L, 1271565525L, -207851284L, 1889544770L, 
1944464797L, 478042135L, 1250057598L, -1127414984L, -189621221L, 
940074361L, -1188326456L, 1529946102L, 488153649L, 151442531L, 
1733499954L, -1123616380L, -1975353401L, -104074595L, -825282028L, 
-1338146630L, 1884515589L, -238545185L, -1119823002L, 487111408L, 
-1044772333L, 79602609L, -1317269920L, 980728830L, 1664959369L, 
-785980229L, 50479242L, 545588076L, -1286240017L, 722993957L, 
-731410980L, 1419762610L, 1845789357L, -318571929L, -1318188658L, 
1870077544L, -1192318357L, 770819081L, 114989720L, -624796218L, 
-605553375L, -1728418381L, -1335400158L, -1331723500L, 275028887L, 
-1529281651L, -1375879516L, -2002161526L, -1421445387L, 855771471L, 
-813242794L, -1203102464L, -630495165L, -1927984159L, -2049338544L, 
-739015250L, 1433553753L, 904126827L, -381400870L, -291954276L, 
-1338438241L, -1253810187L, 1894010508L, 113535842L, -456998083L, 
257192887L, -1741752226L, -1516227176L, 932069627L, -1897907047L, 
871193896L, 1413721046L, -843452335L, 1851376387L, -1744031086L, 
150192164L, 1334996711L, 909574973L, 385825716L, 1885202458L, 
1929468773L, 454411775L, -660223866L, -930483184L, 1044948659L, 
-2039147183L, -952021760L, 52615134L, -721726423L, -1367231525L, 
1212393706L, 412919436L, 656185807L, 144805253L, -259901892L, 
1297898258L, -1019430899L, -1339339961L, 1134027054L, -809662328L, 
2033388939L, 1276884329L, -336259272L, -1321068058L, -798057087L, 
-1603759853L, -1537478974L, 1043007860L, 1506439543L, 668410477L, 
-45273852L, 1301596202L, -397507179L, 1490086447L, 1002813238L, 
-495507616L, 994307491L, 1012500161L, 1906327216L, 1057765838L, 
-1045885639L, -137223861L, 19129082L, -879627268L, 298103871L, 
609520917L, 1241085484L, -1939924222L, 1716520925L, 2070229719L, 
-957362754L, -1817188360L, 690938843L, -1407115719L, 394962824L, 
959805238L, 476603761L, -1435672669L, -42236174L, -2072318396L, 
1792192775L, -1290172195L, 40492500L, -1401556742L, 763709509L, 
677236895L, -1612686810L, 2098024112L, 1913205587L, -210690447L, 
-1975950432L, 85467966L, 1438370377L, -2108875013L, -1321877814L, 
-1893490644L, 324474927L, 2037868773L, -2137833060L, -1512731662L, 
-381151635L, 1907909543L, 1977957838L, 252377896L, -1670852309L, 
-2017195191L, -1655173032L, 2056799494L, -1789116222L, -1060713432L, 
969283500L, 2004478652L, -365339662L, -1733880272L, -2113794140L, 
-1235989064L, -1006397606L, 779030672L, 1452657532L, -954228460L, 
-1658222270L, 1088180480L, 2030001596L, 1816371536L, -1843308254L, 
-247588280L, 790360076L, -282237348L, 562055634L, -2059274752L, 
-142400172L, 1145646312L, -352497414L, -1243289904L, 800762668L, 
-1647709964L, 1529141650L, 454049120L, 94821388L, -470139680L, 
1398927970L, 824009896L, -132252596L, -1149253764L, 64641330L, 
-1918295440L, 1459512132L, -266976296L, -1497902214L, 707156016L, 
-281571396L, -1006931372L, 1785573442L, 1038397088L, -1790836036L, 
1300466640L, -1258963614L, -1602793752L, -46011188L, -544841220L, 
1718746610L, 1035248768L, 1180117492L, -10663288L, 1368169274L, 
1540590928L, 1389014124L, -1801103980L, 415719506L, -67600928L, 
-106998580L, 1692532384L, 1116959106L, 51947752L, 335252204L, 
-1393187524L, 1796911218L, -1682288016L, -209544668L, 804792376L, 
-504641254L, 1738212560L, -2092971716L, -1454541932L, -694548606L, 
1556007872L, 1494652476L, 1502511504L, 52762658L, 1404704008L, 
1956374668L, -1973326052L, -891448174L, -2026302080L, 194338836L, 
1593411368L, 1872873018L, 1904955856L, 500873580L, -494976076L, 
-1761785326L, 273516512L, 1157016012L, 1160870112L, 2134723618L, 
-1492380888L, -1769178612L, 948589372L, -1665765390L, 643023600L, 
-351184316L, -1563345320L, -1856686278L, 6262768L, -1644158276L, 
1656182036L, 1687264450L, 1675476064L, -633313924L, 1100947408L, 
-859590366L, 1114937000L, -100722804L, -303807300L, 1324120882L, 
14870080L, -1616616268L, 646632520L, -214608454L, -1716078064L, 
348650732L, 1324015956L, 611005202L, 431123872L, -1140466996L, 
-925033888L, -2111524926L, -492473560L, 1458813356L, 2128198972L, 
1727953778L, 1780084016L, 2085283620L, -794434248L, 1201063642L, 
-1266106480L, 2144623356L, -2009719916L, 1199699010L, 457315584L, 
-1356306372L, -356229808L, -745814878L, 1105431368L, 2034040588L, 
1996013276L, -1688965294L, -977889664L, 1701009492L, -623107096L, 
1863740154L, 621696336L, 361594668L, 951788404L, -307715182L, 
-1529474464L, -1827178612L, -1025583264L, 1314161634L, 1732451240L, 
-1528935220L, 449542652L, -1009576014L, 351446512L, 843335108L, 
-394815144L, 1081855354L, 418594224L, 1230742460L, 2137256020L, 
479137730L, -408412128L, -21450564L, -1769629104L, 747030498L, 
-225411480L, 752627276L, 519557372L, -1811408910L, -1726651136L, 
-715824268L, -410345464L, 1921962426L, -1867391536L, -2088430612L, 
-207385708L, 415013842L, 1946281824L, -455933620L, -1656114912L, 
-1625723006L, -583629080L, 1032558828L, -1416694212L, -92392590L, 
1821832816L, -815291356L, -1639878856L, -337626982L, -2058506544L, 
-539784004L, -127210860L, 190800514L, 414838976L, 42804540L, 
-505825136L, -1774889438L, 77394440L, 618036492L, -1484225892L, 
-1595558382L, 313596288L, -640329196L, -487809496L, 1364394298L, 
1911811024L, -1700802964L, 1239442228L, 1693770386L, -555375264L, 
1812815180L, 1402418144L, -1031254110L, 640597928L, 526100364L, 
1588828831L, -1895979223L, -1245045826L, 945212076L, 1583126525L, 
-1267523769L, 1756831792L, -647059122L, 699724155L, 2095548125L, 
-757973238L, -2067810776L, 465411729L, 1797633667L, -1365973116L, 
1216830898L, 1683886855L, -453826223L, -1575980842L, -819044940L, 
-268355131L, -794560241L, 559105928L, 17102502L, 1045309011L, 
-83060811L, -1711540974L, -1812017440L, -329526455L, 1598459259L, 
-858191796L, -2018700838L, -1103336497L, 1436738969L, 1225768366L, 
-1771107652L, -462066899L, -47261481L, -1330388704L, -1867585922L, 
106131531L, 417057357L, 834967674L, -790501192L, -1300428191L, 
505982995L, 721165876L, 2128377922L, 1530624087L, -2036032607L, 
183950438L, -1227672476L, -1153495147L, 1038870719L, -1338910632L, 
-380793738L, -2079583101L, 1751839941L, -611978782L, -917309360L, 
-1054684807L, -639906133L, 963347996L, 1724199242L, -26697409L, 
1664288649L, -524851298L, -1008994740L, -2015920099L, -1931751001L, 
1511347408L, 682858094L, 1447342363L, 234267261L, 1808087146L, 
-459897592L, 886362545L, 758654499L, -249945308L, 600163282L, 
-1731280857L, 384998001L, -733065034L, -768071596L, 481401061L, 
-1429130001L, -1130776536L, 1699122310L, -1578416461L, 490784789L, 
-884200846L, -817121600L, 380013353L, 1714604443L, -191426964L, 
334082682L, -1102318673L, -200808775L, 1578748110L, 762962460L, 
553494925L, 486500343L, 1471245056L, -621735586L, 1265857067L, 
616258861L, 2076672410L, 1801944920L, -1514498367L, 1009510771L, 
1827652244L, 1034811170L, 641847863L, 616822529L, 310406662L, 
-1516284860L, -863640203L, 170551519L, 651926584L, 1879997462L, 
1192786979L, -258184219L, -131604990L, 1110570096L, -1111145959L, 
646441995L, 840232700L, -1732114646L, 27076959L, -146202391L, 
-306456578L, 689265644L, -1570001091L, 1940697863L, -813798672L, 
-826010994L, -618402629L, -521688931L, -17632182L, -1697304984L, 
-2128264623L, 77873475L, 668678212L, -83445006L, 1291567815L, 
1196408977L, -1012953578L, 1037962356L, 1441482501L, -1392900785L, 
-171362488L, -967721242L, -1653563373L, 2028828533L, -1790511278L, 
997011360L, 155392649L, -2067505477L, 594963724L, -1283649382L, 
-1911233009L, 1612436057L, 1402801902L, 2079231356L, 2006248941L, 
1295321751L, -746961568L, -750091970L, -1141061109L, -394766781L
)
.create_df <-
function(class, chemical_table, parentTable, id_col, inchi_col, smiles_col) {  
	test=1
	regex = 'InChI.+/.+/(c.+?)/.+'
  child_list = parentTable[parentTable$parent == class, 'child']
  while(test > 0) {
    middle_parents = child_list[child_list %in% parentTable$parent]
    test = length(middle_parents)
    next_child = parentTable[parentTable$parent %in% middle_parents, 'child']
    child_list = unique(c(child_list[child_list %in% middle_parents == F], next_child))
  }
  child_table = chemical_table[chemical_table[[id_col]] %in% child_list, c(id_col, inchi_col, smiles_col)]
	child_table[[inchi_col]] = sub(regex, '\\1', child_table[[inchi_col]])
	ind_empty_smiles = grep('^$', child_table[[smiles_col]])
	ind_start_smiles = grep('\\*', child_table[[smiles_col]])
	ind_remove = c(ind_empty_smiles, ind_start_smiles)
	if(length(ind_remove) > 0) {
		child_table = child_table[-ind_remove,]
	}
  return(child_table)
}
.formula2matrix <-
function(formula_vector) {
	pattern = "([A-Z]{1}[a-z]?)([0-9]*)"
	tokenized_formula = unlist(str_extract_all(string=formula_vector, pattern = pattern))
	# Add 1 if atom doesn't have number
	ind_noNum = which(grepl('[0-9]', tokenized_formula) == F)
	tokenized_formula[ind_noNum] = sub('$', '1', tokenized_formula[ind_noNum])
	atoms = sub(pattern, '\\1', tokenized_formula)
	numbers = as.numeric(sub(pattern, '\\2', tokenized_formula))
	atom_df = data.frame(cbind(atoms, numbers), stringsAsFactors=F)
	atom_df$numbers = as.numeric(atom_df$numbers)
	atom_df = tapply(atom_df$numbers, INDEX=list(atom_df$atoms),FUN=sum)
	return(atom_df)
}
.get.c.num <-
function(id, chemical_table, id_col, formula_col) {
	pattern_c = 'C{1}[0-9]*'
	formula = chemical_table[chemical_table[[id_col]] == id, formula_col]
	carbon = str_extract(formula, pattern_c)
	carbon = sub('^C$', 'C1', carbon)
	carbon = grep('.+', carbon, value=T)
	carbon = sub('C', '', carbon)
	carbon = as.numeric(carbon)
	if(length(carbon) == 0) {
		carbon = 0
	}
	return(carbon)
}
.get.formula <-
function(id, chemical_table, id_col, formula_col) {
	formula = chemical_table[chemical_table[[id_col]] == id, formula_col]
	return(formula)
}
.get.h.num <-
function(id, chemical_table, id_col, formula_col) {
  pattern_h = 'H{1}[0-9]*'
  formula = chemical_table[chemical_table[[id_col]] == id, formula_col]
  proton = str_extract(formula, pattern_h)
  proton = sub('^H$', 'H1', proton)
  proton = grep('.+', proton, value=T)
  proton = sub('H', '', proton)
  proton = as.numeric(proton)
  if(length(proton) == 0) {
    proton = 0
  }
  return(proton)
}
.get.o.num <-
function(id, chemical_table, id_col, formula_col) {
  pattern_o = 'O{1}[0-9]*'
  formula = chemical_table[chemical_table[[id_col]] == id, formula_col]
  oxygen = str_extract(formula, pattern_o)
  oxygen = sub('^O$', 'O1', oxygen)
  oxygen = grep('.+', oxygen, value=T)
  oxygen = sub('O', '', oxygen)
  oxygen = as.numeric(oxygen)
  if(length(oxygen) == 0) {
    oxygen = 0
  }
  return(oxygen)
}
.get.participant <-
function(equation, direction_type = c(' <=> ', ' => ', ' <\\?> ')) {
	pattern = '.+? (.+)'
	participants = equation
	for(i in direction_type) {
		participants = unlist(strsplit(participants, i))
	}
	participants = unlist(strsplit(participants, ' \\+ '))
	participants = sub('\\(.+\\)', '', participants)
	participants = sub(pattern, '\\1', participants)
	participants = trim(participants)
	return(unique(participants))
}
.id2formula <-
function(id, chemical_table, id_col, formula_col) {
  id = gsub('\\|', '', id)
	formula = chemical_table[chemical_table[[id_col]] == id, formula_col]
	return(formula)
}
.parse.chebi <-
function(owl,chebi_compound, chebi_chemical_data) {
    
  con1 = grepl('    <owl:Class rdf:about="http://purl.obolibrary.org/obo/CHEBI_',owl)
  con2 = grepl('        <rdfs:label rdf:datatype="http://www.w3.org/2001/XMLSchema#string">', owl)
  con3 = grepl('        <obo2:Synonym rdf:datatype="http://www.w3.org/2001/XMLSchema#string">', owl)
  con4 = grepl('        <obo2:SMILES rdf:datatype="http://www.w3.org/2001/XMLSchema#string">', owl)
  con5 = grepl('        <obo2:InChI rdf:datatype="http://www.w3.org/2001/XMLSchema#string">', owl)
  con6 = grepl('        <obo2:xref rdf:datatype="http://www.w3.org/2001/XMLSchema#string">kegg COMPOUND:C', owl)
  con7 = grepl('        <rdfs:subClassOf rdf:resource="http://purl.obolibrary.org/obo/CHEBI_', owl)
  con8 = grepl("</owl:Class>",owl)
  
  id = ""
  name = ""
  synonym = c()
  smiles = ""
  inchi = ""
  kegg = ""
  chebi = c()
  parent = c()
	
	entry_start = grep('<\\!-- http://purl.obolibrary.org/obo/CHEBI_100 -->', owl)
	cat("parsing", length(owl), 'lines\n')
  for (i in entry_start:length(owl)) {
		if(i == floor(length(owl)/10)) {
      cat('10% finished\n')
    } else if(i == floor(length(owl)/5)) {
      cat('20% finished\n')
    } else if(i == floor(length(owl)/2)) {
      cat('50% finished\n')
    }
    if(con1[i]) { # ID
      regexp = '(    <owl:Class rdf:about=\"http://purl.obolibrary.org/obo/CHEBI_)(.*)(">)'
      id = sub(pattern = regexp, replacement = "\\2", x = owl[i])
      id = trim(id)
    }
    else if(con2[i]) { # Name
      regexp = '(        <rdfs:label rdf:datatype=\"http://www.w3.org/2001/XMLSchema#string\">)(.*)(</rdfs:label>)'
      name = sub(pattern = regexp, replacement = "\\2", x = owl[i])
      name = trim(name)
    }
    else if(con3[i]) { # synonym
      regexp = '(        <obo2:Synonym rdf:datatype="http://www.w3.org/2001/XMLSchema#string">)(.*)(</obo2:Synonym>)'
      synonym = c(synonym, sub(pattern = regexp, replacement = '\\2', x = owl[i]))
      synonym = trim(synonym)
    }
    else if(con4[i]) { # smiles
      regexp = '(        <obo2:SMILES rdf:datatype=\"http://www.w3.org/2001/XMLSchema#string\">)(.*)(</obo2:SMILES>)'
      smiles = sub(pattern = regexp, replacement = '\\2', x = owl[i])
      smiles = trim(smiles)
    }
    else if(con5[i]) { # inchi
      regexp = '(        <obo2:InChI rdf:datatype=\"http://www.w3.org/2001/XMLSchema#string\">)(.*)(</obo2:InChI>)'
      inchi = sub(pattern = regexp, replacement = '\\2', x = owl[i])
      inchi = trim(inchi)
    }
    else if(con6[i]) { # kegg
      regexp = '(        <obo2:xref rdf:datatype=\"http://www.w3.org/2001/XMLSchema#string\">kegg COMPOUND:)(.*)(</obo2:xref>)'
      kegg = sub(pattern = regexp, replacement = '\\2', x = owl[i])
      kegg = trim(kegg)
    }
    else if(con7[i]) { # subClassOf
      regexp = '(        <rdfs:subClassOf rdf:resource="http://purl.obolibrary.org/obo/CHEBI_)(.*)("/>)'
      parent = c(parent, sub(pattern = regexp, replacement = '\\2', x = owl[i]))
      parent = trim(parent)
    }
    else if(con8[i]) {
      data = c(id, name, paste(synonym, collapse="///"), smiles, inchi, kegg, paste(parent, collapse="///"))
      chebi = rbind(chebi, data)
      id = ""
      name = ""
      synonym = c()
      parent = c()
      smiles = ""
      inchi = ""
      kegg = ""
    }
  }

	# Chemical formula
	chebi_compound2 = chebi_compound[,c('ID', 'ID')]
	ind_parent = grep('[0-9]', chebi_compound$PARENT_ID)
	chebi_compound2[ind_parent, 'ID.1'] = chebi_compound$PARENT_ID[ind_parent]
	colnames(chebi_compound2) = c('id', 'chebi')

	chebi_formula = chebi_chemical_data[chebi_chemical_data$TYPE == 'FORMULA', c('COMPOUND_ID', 'CHEMICAL_DATA')]
	colnames(chebi_formula) = c('id', 'formula')
	chebi_formula2 = join(chebi_compound2, chebi_formula, by='id')
	chebi_formula2 = chebi_formula2[is.na(chebi_formula2$formula) == F,]
	chebi_formula2 = unique(chebi_formula2[,c('chebi', 'formula')])

	chebi_formula3 = data.table(chebi_formula2)
	chebi_formula3 = chebi_formula3[,lapply(.SD, paste, collapse='///'), by="chebi", .SDcols=c("formula")]
	
	chebi = gsub("&#39;", "'", chebi) # convert HTML code
	# Join chebi + formula
	colnames(chebi) = c("chebi", "name", "synonyms", "smiles", "inchi", "kegg", "parent")
  rownames(chebi) = 1:nrow(chebi)
  chebi = as.data.frame(chebi, stringsAsFactors=FALSE)
	chebi_formula3 = as.data.frame(chebi_formula3, stringsAsFactors=F)

	chebi2 = join(chebi, chebi_formula3, by='chebi')
  return(chebi2)
}
.parse.rhea <-
function(owl) {
	entry = c('biochemicalReaction', 'transportReaction', 'equationWithCommonName', 'ecNumber', 'metacyc', 'kegg', 'sameParticipant', 'mapped', 'formuled', 'polymerization', 'chemicallyBalanced', 'iubmb', 'status', 'transport', 'direction', 'classOfReactions', 'closeBiochemicalReaction', 'closeTransport')

	# define regular expression

	regexp = list()	

	regexp[[entry[1]]] = '(<bp:biochemicalReaction rdf:about="http://identifiers\\.org/rhea/)(.*)(">)' # biochemical reaction
	regexp[[entry[2]]] = '(<bp:transport.* rdf:about=")(.*)(">)' # transport reaction
	regexp[[entry[3]]] = '(<bp:NAME .*>)(.*)(</bp:NAME>)' # reaction equation expressed by chemical name
	regexp[[entry[4]]] = '(<bp:EC-NUMBER .*>)(.*)(</bp:EC-NUMBER>)' # EC number
	regexp[[entry[5]]] = '(<bp:XREF rdf:resource="#METACYC:)(.*)(" />)' # cross-reference to MetaCyc
	regexp[[entry[6]]] = '(<bp:XREF rdf:resource="#KEGG_REACTION:)(.*)(" />)' # cross-reference to KEGG
	regexp[[entry[7]]] = '(<bp:XREF rdf:resource="#rel/../RHEA:)(.*)(" />)' # same participants, different direction
	regexp[[entry[8]]] = '(<bp:COMMENT rdf:datatype = .+>RHEA:Mapped=)(.*)(</bp:COMMENT>)'
	regexp[[entry[9]]] = '(<bp:COMMENT rdf:datatype = .+>RHEA:Formuled=)(.*)(</bp:COMMENT>)'
	regexp[[entry[10]]] = '(<bp:COMMENT rdf:datatype = .+>RHEA:Polymerization=)(.*)(</bp:COMMENT>)'
	regexp[[entry[11]]] = '(<bp:COMMENT rdf:datatype = .+>RHEA:Chemically balanced=)(.*)(</bp:COMMENT>)'
	regexp[[entry[12]]] = '(<bp:COMMENT rdf:datatype = .+>RHEA:IUBMB=)(.*)(</bp:COMMENT>)'
	regexp[[entry[13]]] = '(<bp:COMMENT rdf:datatype = .+>RHEA:Status=)(.*)(</bp:COMMENT>)'
	regexp[[entry[14]]] = '(<bp:COMMENT rdf:datatype = .+>RHEA:Transport=)(.*)(</bp:COMMENT>)'
	regexp[[entry[15]]] = '(<bp:COMMENT rdf:datatype = .+>RHEA:Direction=)(.*)(</bp:COMMENT>)'
	regexp[[entry[16]]] = '(<bp:COMMENT rdf:datatype = .+>RHEA:Class of reactions=)(.*)(</bp:COMMENT>)'
	regexp[[entry[17]]] = '</bp:biochemicalReaction>' # close tag
	regexp[[entry[18]]] = '</bp:transport' # close tag

	
	# pre-calculate condtions (TRUE/FALSE) for fast 'for' loop operation

	condition = list()

	for(i in entry) {
		condition[[i]] = grepl(regexp[[i]], owl)
	}
	
	# parsing using regular expressions

	cat("parsing", length(owl), 'lines\n')

	rheaReaction = data.frame()
	rheaReactionRow = list()

	for(i in 1:length(owl)) {
		if(i == floor(length(owl)/10)) {
			cat('10% finished\n')
		} else if(i == floor(length(owl)/5)) {
			cat('20% finished\n')
		} else if(i == floor(length(owl)/2)) {
			cat('50% finished\n')
		}
		
    for(j in 1:length(entry)) {
			# tag start
			if(j %in% 1:2 && condition[[entry[j]]][i]) {
				rheaReactionRow = list()
				value = unlist(strsplit(owl[i], split = '"'))[2]
        value = tail(unlist(strsplit(value, split = '/')), 1)
				rheaReactionRow[['rheaId']] = c(rheaReactionRow[[entry[1]]], value)
				rheaReactionRow[['reactionType']] = entry[j]
			}
      
			else if(j %in% 3:16 && condition[[entry[j]]][i]) {
				value = sub(regexp[[entry[j]]], '\\2', owl[i])
				rheaReactionRow[[entry[j]]] = paste(rheaReactionRow[[entry[j]]], value, sep=',')
			}

			# tag close
			else if(j %in% 17:18 && condition[[entry[j]]][i]) {
    		rheaReactionRow = as.data.frame(rheaReactionRow, stringsAsFactors = FALSE)
    		rheaReaction = rbind.fill(rheaReaction, rheaReactionRow)
				rheaReactionRow = list()
			}
		}
	}
	
	# post process
	for(i in names(rheaReaction)) {
		rheaReaction[[i]] = sub('^,', '', rheaReaction[[i]])
		rheaReaction[[i]] = gsub('%2b', '+', rheaReaction[[i]])
	}
	rheaReaction = trim(rheaReaction)
	rheaReaction$direction = gsub(' ', '-', rheaReaction$direction)
  
	numberOfBiochecmialReaction = length(unique(rheaReaction[rheaReaction$reactionType == 'biochemicalReaction', 'rheaId']))
	numberOfTransportReaction = length(unique(rheaReaction[rheaReaction$reactionType == 'transportReaction', 'rheaId']))

	cat('# of biochemical reaction: ', numberOfBiochecmialReaction, '\n')
	cat('# of transport reaction: ', numberOfTransportReaction, '\n')

	# replace html code
	rheaReaction[,'equationWithCommonName'] = gsub(pattern = "&gt;", replacement = ">", x = rheaReaction[,'equationWithCommonName'])
	rheaReaction[,'equationWithCommonName'] = gsub(pattern = "&lt;", replacement = "<", x = rheaReaction[,'equationWithCommonName'])
	rheaReaction[,'equationWithCommonName'] = gsub(pattern = "&apos;", replacement = "'", x = rheaReaction[,'equationWithCommonName'])
  
	# Sorting
	rheaReaction = rheaReaction[order(rheaReaction[,'rheaId']),]
	rownames(rheaReaction) = 1:nrow(rheaReaction)

	##### Parsing equation expressed with ChEBI ID ##### 
	
	cat("Parsing equation expressed with ChEBI ID\n")

	entry = c('compoundId', 'name', 'chebiId', 'close')

	# define regular expression

	regexp = list()

	regexp[[entry[1]]] = '(<bp:physicalEntity rdf:about="#compound:)(.+)(">)' # ChEBI ID
  regexp[[entry[2]]] = '(<bp:NAME rdf:datatype .+>)(.*)(</bp:NAME>)' # compound common name used in equation
	regexp[[entry[3]]] = '(<bp:COMMENT rdf:datatype = "http://www.w3.org/2001/XMLSchema#string">.* CHEBI:)(.*)(</bp:COMMENT>)'
	regexp[[entry[4]]] = '</bp:physicalEntity>' # close

	# pre-calculate condtions (TRUE/FALSE) for fast 'for' loop operation

	condition = list()

	for(i in entry) {
		condition[[i]] = grepl(regexp[[i]], owl)
	}

	# parsing using regular expressions

	cat("Parsing owl to create list mapping chemical common name into ChEBI ID\n")

	rheaChemical = data.frame()
	rheaChemicalRow = list()

	for(i in 1:length(owl)) {
		for(j in 1:length(entry)) {
			# Tag start. Please change the number in if statement depending on length of entry
			if(j == 1 && condition[[entry[j]]][i]) {
				rheaChemicalRow = list()
				value = unlist(strsplit(owl[i], split = '"'))[2]
				value = sub('#', '', value)
				rheaChemicalRow[[entry[j]]] = c(rheaChemicalRow[[entry[1]]], value)
			}
			else if(j %in% 2:3 && condition[[entry[j]]][i]) {
				value = sub(regexp[[entry[j]]], '\\2', owl[i])
				rheaChemicalRow[[entry[j]]] = value
			}

			# Tag close
			else if(j == 4 && condition[[entry[j]]][i]) {
				rheaChemicalRow = as.data.frame(rheaChemicalRow, stringsAsFactors = FALSE)
				rheaChemical = rbind.fill(rheaChemical, rheaChemicalRow)
				rheaChemicalRow = list()
			}
		}
	}

	rheaChemical = trim(rheaChemical)
	rheaChemical[,'name'] = gsub(pattern = "&gt;", replacement = ">", x = rheaChemical[,'name'])
	rheaChemical[,'name'] = gsub(pattern = "&lt;", replacement = "<", x = rheaChemical[,'name'])
	rheaChemical[,'name'] = gsub(pattern = "&apos;", replacement = "'", x = rheaChemical[,'name'])

	numberOfChemicals = nrow(rheaChemical)

	cat("# of chemicals used in Rhea: ", numberOfChemicals, '\n') 


	##### Building equation with the list #####
	
	cat("Building equation with the list\n")

	# Define direction

	directionList = list()
	directionList[['undefined']] = ' <?> '
	directionList[['bidirectional']] = ' <=> '
	directionList[['right-to-left']] = ' => '
	directionList[['left-to-right']] = ' => '
  
	# Define regular expressions
  
	regexp_coefficient = "(^\\(?[0-9]*n?\\+?[0-9]*\\)? )(.+)"
	regexp_localization = "(.+)(\\([inout]+\\))"

	# Conversion (split - conversion - paste)
  
	equationWithChebi = character(nrow(rheaReaction))
	equationParticipant = character(nrow(rheaReaction))
	for(i in 1:nrow(rheaReaction)) {
		reactantsChebi = c()
		productsChebi= c()
		arrow = directionList[[rheaReaction[i, 'direction']]]
		arrow2 = sub('\\?', '\\\\?', arrow)
		participants = unlist(strsplit(rheaReaction[i,'equationWithCommonName'], split=arrow2))
   	reactants = unlist(strsplit(participants[1], split = " \\+ "))
   	reactantsWOcoefficient = sub(regexp_coefficient, "\\2", reactants)
   	for(j in 1:length(reactants)) { 
			if(grepl(regexp_coefficient, reactants[j])) { # process coefficeint
 				coefficient = sub(regexp_coefficient, "\\1", reactants[j])
   		} else {coefficient = ""}
			if(grepl(regexp_localization, reactants[j])) { # process (in), (out)
      	localization = sub(regexp_localization, "\\2", x = reactants[j])
      } else {localization = ""}
			reactantsWOcoefficient[j] = gsub("\\(in\\)", "", reactantsWOcoefficient[j])
      reactantsWOcoefficient[j] = gsub("\\(out\\)", "", reactantsWOcoefficient[j])
			chebi = rheaChemical[rheaChemical$name == reactantsWOcoefficient[j], 'chebiId']
      if(length(chebi) > 1) {
        reactants[j] = "Unknown"
      } else {
        reactants[j] = paste(coefficient, chebi, localization, sep="")
      }
			reactantsChebi = c(reactantsChebi, chebi)
    }
		products = unlist(strsplit(participants[2], split = " \\+ "))
    productsWOcoefficient = sub(regexp_coefficient, "\\2", products)
		
		for(k in 1:length(products)) {
     	if(grepl(regexp_coefficient, products[k])) { # process coefficeint
       	coefficient = sub(regexp_coefficient, "\\1", products[k])
     	} else {coefficient = ""}
     	if(grepl(regexp_localization, products[k])) { # process (in), (out)
       	localization = sub(regexp_localization, "\\2", x = products[k])
     	} else {localization = ""}
     	productsWOcoefficient[k] = gsub("\\(in\\)", "", productsWOcoefficient[k])
     	productsWOcoefficient[k] = gsub("\\(out\\)", "", productsWOcoefficient[k])
     	chebi = rheaChemical[rheaChemical$name == productsWOcoefficient[k], 'chebiId']
     	if(length(chebi) > 1) {
        products[k] = "Unknown"  
     	} else {
     	  products[k] = paste(coefficient, chebi, localization, sep='')   
     	}
      productsChebi = c(productsChebi, chebi)
    }
   	equationWithChebi[i] = paste(paste(reactants, collapse = " + "), paste(products, collapse = " + "), sep = arrow)
		equationParticipant[i] = paste(c(reactantsChebi, productsChebi), collapse=',')
 	}
	result = cbind(rheaReaction, I(equationWithChebi), I(equationParticipant))
	result[is.na(result)] = ''
	return(result)
}
.unique_column <-
function(vector) {
  if(length(vector)>1) {
    return(vector[1])
  }
}
.convert.html = function(dataFrame) {
  col = colnames(dataFrame)
  dataFrame_result = dataFrame
  for(i in col) {
    dataFrame_result[[i]] = gsub('&lt;i&gt;', '', dataFrame_result[[i]])
    dataFrame_result[[i]] = gsub('&lt;/i&gt;', '', dataFrame_result[[i]])
    dataFrame_result[[i]] = gsub('&amp;beta;', 'beta', dataFrame_result[[i]])
    dataFrame_result[[i]] = gsub('&lt;SUP&gt;', '', dataFrame_result[[i]])
    dataFrame_result[[i]] = gsub('&lt;/SUP&gt;', '', dataFrame_result[[i]])
    dataFrame_result[[i]] = gsub('&lt;sup&gt;', '', dataFrame_result[[i]])
    dataFrame_result[[i]] = gsub('&lt;/sup&gt;', '', dataFrame_result[[i]])
    dataFrame_result[[i]] = gsub('&lt;sub&gt;', '', dataFrame_result[[i]])
    dataFrame_result[[i]] = gsub('&lt;/sub&gt;', '', dataFrame_result[[i]])
    dataFrame_result[[i]] = gsub(' &amp;rarr; ', '=>', dataFrame_result[[i]])
    dataFrame_result[[i]] = gsub(' &amp;larr; ', '<=', dataFrame_result[[i]])
    dataFrame_result[[i]] = gsub(' &amp;harr; ', '<=>', dataFrame_result[[i]])
    dataFrame_result[[i]] = gsub('  =  ', ' <=> ', dataFrame_result[[i]])
    dataFrame_result[[i]] = gsub('&amp;pi;', 'pi', dataFrame_result[[i]])
    dataFrame_result[[i]] = gsub('&amp;alpha;', 'alpha', dataFrame_result[[i]])
    dataFrame_result[[i]] = gsub('&lt;I&gt;', '', dataFrame_result[[i]])
    dataFrame_result[[i]] = gsub('&lt;/I&gt;', '', dataFrame_result[[i]])
    dataFrame_result[[i]] = gsub('&lt;SUB&gt;', '', dataFrame_result[[i]])
    dataFrame_result[[i]] = gsub('&lt;/SUB&gt;', '', dataFrame_result[[i]])
    dataFrame_result[[i]] = gsub('&apos;', "'", dataFrame_result[[i]])
    dataFrame_result[[i]] = gsub('&gt;', ">", dataFrame_result[[i]])
    dataFrame_result[[i]] = gsub('&amp;gamma;', "gamma", dataFrame_result[[i]])
  }
  return(dataFrame_result)
}

.parse.biopax <-
function(biopax) {
  ### Define regula expression
  ## BiochemicalReaction set 
  regex_biochemicalReaction_tag = '<bp:BiochemicalReaction rdf:ID=".+">'
  regex_biochemicalReaction_close = '</bp:BiochemicalReaction>'
  regex_standardName_tag = '<bp:standardName rdf:datatype="http://www.w3.org/2001/XMLSchema#string">(.+)</bp:standardName>'
  regex_eCNumber_tag = '<bp:eCNumber rdf:datatype="http://www.w3.org/2001/XMLSchema#string\">(.+)</bp:eCNumber>'
  regex_xref_tag = '<bp:xref rdf:resource=\"#(.+)"/>'
  regex_left_tag = '<bp:left rdf:resource=\"#(.+)"/>'
  regex_right_tag = '<bp:right rdf:resource=\"#(.+)"/>'
  regex_participantStoichiometry_tag = '<bp:participantStoichiometry rdf:resource=\"#(.+)"/>'
  
  index_biochemicalReaction_tag = grepl(regex_biochemicalReaction_tag, biopax)
  index_biochemicalReaction_close = grepl(regex_biochemicalReaction_close, biopax)
  index_standardName_tag = grepl(regex_standardName_tag, biopax)
  index_eCNumber_tag = grepl(regex_eCNumber_tag, biopax)
  index_xref_tag = grepl(regex_xref_tag, biopax)
  index_left_tag = grepl(regex_left_tag, biopax)
  index_right_tag = grepl(regex_right_tag, biopax)
  index_participantStoichiometry_tag = grepl(regex_participantStoichiometry_tag, biopax)
  
  ## Transport set
  regex_Transport_tag = '<bp:Transport rdf:ID=".+>'
  regex_Transport_close = '</bp:Transport>'
  
  index_Transport_tag = grepl(regex_Transport_tag, biopax)
  index_Transport_close = grepl(regex_Transport_close, biopax)
  
  ## TransportWithBiochemicalReaction set
  regex_TransportWithBiochemicalReaction_tag = '<bp:TransportWithBiochemicalReaction rdf:ID=".+">'
  regex_TransportWithBiochemicalReaction_close = '</bp:TransportWithBiochemicalReaction>'
  
  index_TransportWithBiochemicalReaction_tag = grepl(regex_TransportWithBiochemicalReaction_tag, biopax)
  index_TransportWithBiochemicalReaction_close = grepl(regex_TransportWithBiochemicalReaction_close, biopax)
  
  ## ComplexAssembly set
  regex_ComplexAssembly_tag = '<bp:ComplexAssembly rdf:ID=\".+">'
  regex_ComplexAssembly_close = '</bp:ComplexAssembly>'
  
  index_ComplexAssembly_tag = grepl(regex_ComplexAssembly_tag, biopax)
  index_ComplexAssembly_close = grepl(regex_ComplexAssembly_close, biopax)
  
  ## UnificationXref & RelationshipXref set
  regex_UnificationXref_tag = '<bp:UnificationXref rdf:ID="(.+)">'
  regex_UnificationXref_close = '</bp:UnificationXref>'
  regex_RelationshipXref_tag = '<bp:RelationshipXref rdf:ID="(.+)">'
  regex_RelationshipXref_close = '</bp:RelationshipXref>'
  regex_xref_integrated_tag = "<bp:.+ rdf:ID=\"(.+)\">"
  regex_id_tag = '<bp:id rdf:datatype=\"http://www.w3.org/2001/XMLSchema#string\">(.+)</bp:id>'
  regex_db_tag = '<bp:db rdf:datatype=\"http://www.w3.org/2001/XMLSchema#string\">(.+)</bp:db>'
  
  index_UnificationXref_tag = grepl(regex_UnificationXref_tag, biopax)
  index_UnificationXref_close = grepl(regex_UnificationXref_close, biopax)
  index_RelationshipXref_tag = grepl(regex_RelationshipXref_tag, biopax)
  index_RelationshipXref_close = grepl(regex_RelationshipXref_close, biopax)
  index_id_tag = grepl(regex_id_tag, biopax)
  index_db_tag = grepl(regex_db_tag, biopax)
  
  ## Small Molecule set
  regex_SmallMolecule_tag = '<bp:SmallMolecule rdf:ID=\"(.+)">'
  regex_SmallMolecule_close = '</bp:SmallMolecule>'
  regex_cellularLocation_tag = '<bp:cellularLocation rdf:resource="#(.+)"/>'
  
  index_SmallMolecule_tag = grepl(regex_SmallMolecule_tag, biopax)
  index_SmallMolecule_close = grepl(regex_SmallMolecule_close, biopax)
  index_cellularLocation_tag = grepl(regex_cellularLocation_tag, biopax)
  
  ## Protein set
  regex_Protein_tag = '<bp:Protein rdf:ID="(.+)">'
  regex_Protein_close = '</bp:Protein>'
  
  index_Protein_tag = grepl(regex_Protein_tag, biopax)
  index_Protein_close = grepl(regex_Protein_close, biopax)
  
  ## Complext set
  regex_Complex_tag = '<bp:Complex rdf:ID="(.+)">'
  regex_Complex_close = '</bp:Complex>'
  
  index_Complex_tag = grepl(regex_Complex_tag, biopax)
  index_Complex_close = grepl(regex_Complex_close, biopax)
  
  ## Rna set
  regex_Rna_tag = '<bp:Rna rdf:ID="(.+)">'
  regex_Rna_close = '</bp:Rna>'
  
  index_Rna_tag = grepl(regex_Rna_tag, biopax)
  index_Rna_close = grepl(regex_Rna_close, biopax)
  
  ## CellularLocationVocabulary set
  regex_CellularLocationVocabulary_tag = '<bp:CellularLocationVocabulary rdf:ID=\"(.+)">'
  regex_term_tag = '<bp:term rdf:datatype=\"http://www.w3.org/2001/XMLSchema#string\">(.+)</bp:term>'
  regex_CellularLocationVocabulary_close = '</bp:CellularLocationVocabulary>'
  
  index_CellularLocationVocabulary_tag = grepl(regex_CellularLocationVocabulary_tag, biopax)
  index_term_tag = grepl(regex_term_tag, biopax)
  index_CellularLocationVocabulary_close = grepl(regex_CellularLocationVocabulary_close, biopax)
  
  ## Flags
  biochemicalReaction_flag = FALSE
  Transport_flag = FALSE
  TransportWithBiochemicalReaction_flag = FALSE
  UnificationXref_flag = FALSE
  SmallMolecule_flag = FALSE
  CellularLocationVocabulary_flag = FALSE
  Protein_flag = FALSE
  Complex_flag = FALSE
  Rna_flag = FALSE
  ComplexAssembly_flag = FALSE
  
  ## Read each line and parsing
  ec_numbers = c()
  xrefs_biochemicalReaction = c()
  xrefs_Transport = c()
  xrefs_TransportWithBiochemicalReaction = c()
  xrefs_ComplexAssembly = c()
  df_biochemicalReaction = c()
  df_Transport = c()
  df_TransportWithBiochemicalReaction = c()
  df_ComplexAssembly = c()
  df_smallMolecule = c()
  df_Protein = c()
  df_Complex = c()
  df_Rna = c()
  df_CellularLocationVocabulary = c()
  tmp_xrefs_smallMolecule = c()
  tmp_xrefs_Protein = c()
  tmp_xrefs_Complex = c()
  tmp_xrefs_Rna = c()
  lefts = c()
  rights = c()
  df_UnificationXref = c()
  participantStoichiometries = c()
  cellularLocation = ''
  for(i in 1:length(biopax)) {
    # Set flag
    if(index_biochemicalReaction_tag[i]) {
      biochemicalReaction_flag = TRUE
    } else if(index_Transport_tag[i]) {
      Transport_flag = TRUE
    } else if(index_TransportWithBiochemicalReaction_tag[i]) {
      TransportWithBiochemicalReaction_flag = TRUE
    } else if(index_ComplexAssembly_tag[i]) {
      ComplexAssembly_flag = TRUE
    } else if(index_UnificationXref_tag[i] || index_RelationshipXref_tag[i]) {
      UnificationXref_flag = TRUE
    } else if(index_SmallMolecule_tag[i]) {
      SmallMolecule_flag = TRUE
    } else if(index_CellularLocationVocabulary_tag[i]) {
      CellularLocationVocabulary_flag = TRUE
    } else if(index_Protein_tag[i]) {
      Protein_flag = TRUE
    } else if(index_Complex_tag[i]) {
      Complex_flag = TRUE
    } else if(index_Rna_tag[i]) {
      Rna_flag = TRUE
    }
    
    # Parsing
    if(biochemicalReaction_flag) {
      if(index_standardName_tag[i]) {
        reaction_equation_name = trim(sub(regex_standardName_tag, '\\1', biopax[i]))
      } else if (index_eCNumber_tag[i]) {
        ec_numbers = c(ec_numbers, trim(sub(regex_eCNumber_tag, '\\1', biopax[i])))
      } else if(index_xref_tag[i]) {
        xrefs_biochemicalReaction = c(xrefs_biochemicalReaction, trim(sub(regex_xref_tag, '\\1', biopax[i])))
      } else if(index_left_tag[i]) {
        lefts = c(lefts, trim(sub(regex_left_tag, '\\1', biopax[i])))
      } else if(index_right_tag[i]) {
        rights = c(rights, trim(sub(regex_right_tag, '\\1', biopax[i])))
      } else if(index_participantStoichiometry_tag[i]) {
        participantStoichiometries = c(participantStoichiometries, trim(sub(regex_participantStoichiometry_tag, '\\1', biopax[i])))
      } else if(index_biochemicalReaction_close[i]) {
        paste_xrefs_biochemicalReaction = paste(xrefs_biochemicalReaction, collapse='///')
        paste_lefts = paste(lefts, collapse='///')
        paste_rights = paste(rights, collapse='///')
        paste_ec_numbers = paste(ec_numbers, collapse='///')
        paste_participantStoichiometries = paste(participantStoichiometries, collapse='///')
        df_biochemicalReaction = cbind(reaction_equation_name, paste_ec_numbers, paste_xrefs_biochemicalReaction, paste_lefts, paste_rights, paste_participantStoichiometries)
        df_biochemicalReaction = data.frame(df_biochemicalReaction, stringsAsFactors=F)
        biochemicalReaction_flag = FALSE
      }
    }
    
    else if(Transport_flag) {
      if(index_standardName_tag[i]) {
        transport_equation_name = trim(sub(regex_standardName_tag, '\\1', biopax[i]))
      } else if (index_eCNumber_tag[i]) {
        ec_numbers = c(ec_numbers, trim(sub(regex_eCNumber_tag, '\\1', biopax[i])))
      } else if(index_xref_tag[i]) {
        xrefs_Transport = c(xrefs_Transport, trim(sub(regex_xref_tag, '\\1', biopax[i])))
      } else if(index_left_tag[i]) {
        lefts = c(lefts, trim(sub(regex_left_tag, '\\1', biopax[i])))
      } else if(index_right_tag[i]) {
        rights = c(rights, trim(sub(regex_right_tag, '\\1', biopax[i])))
      } else if(index_participantStoichiometry_tag[i]) {
        participantStoichiometries = c(participantStoichiometries, trim(sub(regex_participantStoichiometry_tag, '\\1', biopax[i])))
      } else if(index_Transport_close[i]) {
        paste_xrefs_Transport = paste(xrefs_Transport, collapse='///')
        paste_lefts = paste(lefts, collapse='///')
        paste_rights = paste(rights, collapse='///')
        paste_ec_numbers = paste(ec_numbers, collapse='///')
        paste_participantStoichiometries = paste(participantStoichiometries, collapse='///')
        df_Transport = cbind(transport_equation_name, paste_ec_numbers, paste_xrefs_Transport, paste_lefts, paste_rights, paste_participantStoichiometries)
        df_Transport = data.frame(df_Transport, stringsAsFactors=F)
        Transport_flag = FALSE
      }
    }
    
    else if(TransportWithBiochemicalReaction_flag) {
      if(index_standardName_tag[i]) {
        TransportWithBiochemicalReaction_equation_name = trim(sub(regex_standardName_tag, '\\1', biopax[i]))
      } else if (index_eCNumber_tag[i]) {
        ec_numbers = c(ec_numbers, trim(sub(regex_eCNumber_tag, '\\1', biopax[i])))
      } else if(index_xref_tag[i]) {
        xrefs_TransportWithBiochemicalReaction = c(xrefs_TransportWithBiochemicalReaction, trim(sub(regex_xref_tag, '\\1', biopax[i])))
      } else if(index_left_tag[i]) {
        lefts = c(lefts, trim(sub(regex_left_tag, '\\1', biopax[i])))
      } else if(index_right_tag[i]) {
        rights = c(rights, trim(sub(regex_right_tag, '\\1', biopax[i])))
      } else if(index_participantStoichiometry_tag[i]) {
        participantStoichiometries = c(participantStoichiometries, trim(sub(regex_participantStoichiometry_tag, '\\1', biopax[i])))
      } else if(index_TransportWithBiochemicalReaction_close[i]) {
        paste_xrefs_TransportWithBiochemicalReaction = paste(xrefs_TransportWithBiochemicalReaction, collapse='///')
        paste_lefts = paste(lefts, collapse='///')
        paste_rights = paste(rights, collapse='///')
        paste_ec_numbers = paste(ec_numbers, collapse='///')
        paste_participantStoichiometries = paste(participantStoichiometries, collapse='///')
        df_TransportWithBiochemicalReaction = cbind(TransportWithBiochemicalReaction_equation_name, paste_ec_numbers, paste_xrefs_TransportWithBiochemicalReaction, paste_lefts, paste_rights, paste_participantStoichiometries)
        df_TransportWithBiochemicalReaction = data.frame(df_TransportWithBiochemicalReaction, stringsAsFactors=F)
        TransportWithBiochemicalReaction_flag = FALSE
      }
    }
    
    else if(ComplexAssembly_flag) {
      if(index_standardName_tag[i]) {
        ComplexAssembly_equation_name = trim(sub(regex_standardName_tag, '\\1', biopax[i]))
      } else if (index_eCNumber_tag[i]) {
        ec_numbers = c(ec_numbers, trim(sub(regex_eCNumber_tag, '\\1', biopax[i])))
      } else if(index_xref_tag[i]) {
        xrefs_ComplexAssembly = c(xrefs_ComplexAssembly, trim(sub(regex_xref_tag, '\\1', biopax[i])))
      } else if(index_left_tag[i]) {
        lefts = c(lefts, trim(sub(regex_left_tag, '\\1', biopax[i])))
      } else if(index_right_tag[i]) {
        rights = c(rights, trim(sub(regex_right_tag, '\\1', biopax[i])))
      } else if(index_participantStoichiometry_tag[i]) {
        participantStoichiometries = c(participantStoichiometries, trim(sub(regex_participantStoichiometry_tag, '\\1', biopax[i])))
      } else if(index_ComplexAssembly_close[i]) {
        paste_xrefs_ComplexAssembly = paste(xrefs_ComplexAssembly, collapse='///')
        paste_lefts = paste(lefts, collapse='///')
        paste_rights = paste(rights, collapse='///')
        paste_ec_numbers = paste(ec_numbers, collapse='///')
        paste_participantStoichiometries = paste(participantStoichiometries, collapse='///')
        df_ComplexAssembly = cbind(ComplexAssembly_equation_name, paste_ec_numbers, paste_xrefs_ComplexAssembly, paste_lefts, paste_rights, paste_participantStoichiometries)
        df_ComplexAssembly = data.frame(df_ComplexAssembly, stringsAsFactors=F)
        TransportWithBiochemicalReaction_flag = FALSE
      }
    }
    
    else if(UnificationXref_flag) {
      if(index_UnificationXref_tag[i] || index_RelationshipXref_tag[i]) {
        UnificationXref_id = trim(sub(regex_xref_integrated_tag, '\\1', biopax[i]))
      } else if(index_id_tag[i]) {
        id = trim(sub(regex_id_tag, '\\1', biopax[i]))
      } else if(index_db_tag[i]) {
        db = trim(sub(regex_db_tag, '\\1', biopax[i]))
      } else if(index_UnificationXref_close[i] || index_RelationshipXref_close[i]) {
        tmp = c(UnificationXref_id, id, db)
        df_UnificationXref = rbind(df_UnificationXref, tmp)
        df_UnificationXref = data.frame(df_UnificationXref, stringsAsFactors=F)
        UnificationXref_flag = FALSE
      }
    }
    
    else if(SmallMolecule_flag) {
      if(index_SmallMolecule_tag[i]) {
        master_id = trim(sub(regex_SmallMolecule_tag, '\\1', biopax[i]))
      } else if(index_standardName_tag[i]) {
        chemial_name = trim(sub(regex_standardName_tag, '\\1', biopax[i]))
      } else if(index_xref_tag[i]) {
        tmp_xrefs_smallMolecule = c(tmp_xrefs_smallMolecule, trim(sub(regex_xref_tag, '\\1', biopax[i])))
      } else if(index_cellularLocation_tag[i]) {
        cellularLocation = trim(sub(regex_cellularLocation_tag, '\\1', biopax[i]))
      } else if(index_SmallMolecule_close[i]) {
        tmp = c(master_id, chemial_name, paste(tmp_xrefs_smallMolecule, collapse='///'), cellularLocation)
        df_smallMolecule = rbind(df_smallMolecule, tmp)
        SmallMolecule_flag = FALSE
        tmp_xrefs_smallMolecule = c()
        cellularLocation = ''
      }
    }
    
    else if(Protein_flag) {
      if(index_Protein_tag[i]) {
        master_id = trim(sub(regex_Protein_tag, '\\1', biopax[i]))
      } else if(index_standardName_tag[i]) {
        protein_name = trim(sub(regex_standardName_tag, '\\1', biopax[i]))
      } else if(index_xref_tag[i]) {
        tmp_xrefs_Protein = c(tmp_xrefs_Protein, trim(sub(regex_xref_tag, '\\1', biopax[i])))
      } else if(index_cellularLocation_tag[i]) {
        cellularLocation = trim(sub(regex_cellularLocation_tag, '\\1', biopax[i]))
      } else if(index_Protein_close[i]) {
        tmp = c(master_id, protein_name, paste(tmp_xrefs_Protein, collapse='///'), cellularLocation)
        df_Protein = rbind(df_Protein, tmp)
        Protein_flag = FALSE
        tmp_xrefs_Protein = c()
        cellularLocation = ''
      }
    }
    
    else if(Complex_flag) {
      if(index_Complex_tag[i]) {
        master_id = trim(sub(regex_Complex_tag, '\\1', biopax[i]))
      } else if(index_standardName_tag[i]) {
        complex_name = trim(sub(regex_standardName_tag, '\\1', biopax[i]))
      } else if(index_xref_tag[i]) {
        tmp_xrefs_Complex = c(tmp_xrefs_Complex, trim(sub(regex_xref_tag, '\\1', biopax[i])))
      } else if(index_cellularLocation_tag[i]) {
        cellularLocation = trim(sub(regex_cellularLocation_tag, '\\1', biopax[i]))
      } else if(index_Complex_close[i]) {
        tmp = c(master_id, complex_name, paste(tmp_xrefs_Complex, collapse='///'), cellularLocation)
        df_Complex = rbind(df_Complex, tmp)
        Complex_flag = FALSE
        tmp_xrefs_Complex = c()
        cellularLocation = ''
      }
    }
    
    else if(Rna_flag) {
      if(index_Rna_tag[i]) {
        master_id = trim(sub(regex_Rna_tag, '\\1', biopax[i]))
      } else if(index_standardName_tag[i]) {
        rna_name = trim(sub(regex_standardName_tag, '\\1', biopax[i]))
      } else if(index_xref_tag[i]) {
        tmp_xrefs_Rna = c(tmp_xrefs_Rna, trim(sub(regex_xref_tag, '\\1', biopax[i])))
      } else if(index_cellularLocation_tag[i]) {
        cellularLocation = trim(sub(regex_cellularLocation_tag, '\\1', biopax[i]))
      } else if(index_Rna_close[i]) {
        tmp = c(master_id, rna_name, paste(tmp_xrefs_Rna, collapse='///'), cellularLocation)
        df_Rna = rbind(df_Rna, tmp)
        Rna_flag = FALSE
        tmp_xrefs_Rna = c()
        cellularLocation = ''
      }
    }
    
    if(CellularLocationVocabulary_flag) {
      if(index_CellularLocationVocabulary_tag[i]) {
        CellularLocationVocabulary_id = trim(sub(regex_CellularLocationVocabulary_tag, '\\1', biopax[i]))
      } else if(index_term_tag[i]) {
        CellularLocationVocabulary_term = trim(sub(regex_term_tag, '\\1', biopax[i]))
      } else if(index_CellularLocationVocabulary_close[i]) {
        tmp = c(CellularLocationVocabulary_id, CellularLocationVocabulary_term)
        df_CellularLocationVocabulary = rbind(df_CellularLocationVocabulary, tmp)
        CellularLocationVocabulary_flag = FALSE
      }
    }
  }
  
  ## Post-processing data frame
  if(length(df_smallMolecule)>0) {
    rownames(df_smallMolecule) = 1:nrow(df_smallMolecule)
    df_smallMolecule = data.frame(df_smallMolecule, stringsAsFactors=F)
  }
  if(length(df_Protein)>0) {
    rownames(df_Protein) = 1:nrow(df_Protein)
    df_Protein = data.frame(df_Protein, stringsAsFactors=F)
  }
  if(length(df_Complex)>0) {
    rownames(df_Complex) = 1:nrow(df_Complex)
    df_Complex = data.frame(df_Complex, stringsAsFactors=F)
  }
  if(length(df_Rna)>0) {
    rownames(df_Rna) = 1:nrow(df_Rna)
    df_Rna = data.frame(df_Rna, stringsAsFactors=F)
  }
  
  rownames(df_CellularLocationVocabulary) = 1:nrow(df_CellularLocationVocabulary)
  df_CellularLocationVocabulary = data.frame(df_CellularLocationVocabulary, stringsAsFactors=F)
  
  ## Concatenate data.frames
  if(length(df_biochemicalReaction)>0) {
    df_reaction = df_biochemicalReaction
  } else if(length(df_Transport)>0) {
    df_reaction = df_Transport
  } else if(length(df_TransportWithBiochemicalReaction)>0) {
    df_reaction = df_TransportWithBiochemicalReaction
  } else if(length(df_ComplexAssembly)>0) {
    df_reaction = df_ComplexAssembly
  }
  
  df_compound = rbind.fill(df_smallMolecule, df_Protein, df_Complex, df_Rna)
  
  ## Handle Xref
  colnames(df_UnificationXref) = c('xref','id','db')
  colnames(df_reaction) = c('name','ec_number','xref','lefts','rights','participantStoichiometries')
  colnames(df_compound) = c('name', 'standard_name','xref', 'cellularLocation')
  colnames(df_CellularLocationVocabulary) = c('master_id', 'term')
  
  df_reaction = .handle.xref(df_reaction, df_UnificationXref)
  df_compound = .handle.xref(df_compound, df_UnificationXref)
  
  ## Convert HTML code to character
  df_reaction = .convert.html(df_reaction)
  df_compound = .convert.html(df_compound)
  
  ## Parse stoichimetric coefficient
  regex_Stoichiometry_tag = '<bp:Stoichiometry rdf:ID="(.+)">'
  regex_Stoichiometry_close = '</bp:Stoichiometry>'
  regex_stoichiometricCoefficient_tag = '<bp:stoichiometricCoefficient rdf:datatype="http://www.w3.org/2001/XMLSchema#.+">(.+)</bp:stoichiometricCoefficient>'
  
  regex_id = '<bp:.+ rdf:ID="(.+)">'
  regex_id2 = '<bp:PHYSICAL-ENTITY rdf:resource="#(.+)"/>'
  
  regex_location = '<bp:cellularLocation rdf:resource="#(.+)"/>'
  regex_location2 = '<bp:cellularLocation>'
  regex_location3 = '<bp:CellularLocationVocabulary rdf:ID="(.+)">'
  
  index_Stoichiometry_tag = grep(regex_Stoichiometry_tag, biopax)
  index_Stoichiometry_close = grep(regex_Stoichiometry_close, biopax)
  
  master_id = trim(sub(regex_Stoichiometry_tag, '\\1', biopax[index_Stoichiometry_tag]))
  stoichiometric_coefficient = trim(sub(regex_stoichiometricCoefficient_tag, '\\1', biopax[index_Stoichiometry_tag+1]))
  
  id = trim(sub(regex_id, '\\1', biopax[index_Stoichiometry_tag+3]))
  id2 = trim(sub(regex_id2, '\\1', biopax[index_Stoichiometry_tag+2]))
  
  id_new = character(length(id))
  id_new[grepl('>', id)==FALSE] = id[grepl('>', id)==FALSE]
  id_new[grepl('>', id2)==FALSE] = id2[grepl('>', id2)==FALSE]
  
  location = trim(sub(regex_location, '\\1', biopax[index_Stoichiometry_close-3]))
  index_location = grep(regex_location2, biopax)
  if(length(location) != length(index_location)) {
    location[grep('comment', location)] = ''
    location[grep('dataSource', location)] = ''
  }
  if(length(index_location)>0) {
    location2 = trim(sub(regex_location3, '\\1', biopax[index_location+1]))
    location[grep('<', location)] = location2[grepl('>', location2)==F]
  } else {
    location[grep('>', location)] = df_compound[df_compound$name == id_new, 'cellularLocation']
  }
  
  participants = data.frame(cbind(master_id, stoichiometric_coefficient, id_new, location), stringsAsFactors=F)
  
  left_sub =build.subtable(df_reaction, 'MetaCyc', 'lefts', '///')
  right_sub = build.subtable(df_reaction, 'MetaCyc', 'rights', '///')
  
  ## Sometimes there is no 'Stoichiometry' tag
  if(nrow(participants) < (nrow(left_sub) + nrow(right_sub))) {
    tmp_df_CellularLocationVocabulary = df_CellularLocationVocabulary
    colnames(tmp_df_CellularLocationVocabulary)[grep('master_id', names(tmp_df_CellularLocationVocabulary))] = 'cellularLocation'
    participants = merge(df_compound, tmp_df_CellularLocationVocabulary, by = 'cellularLocation', all.x=T)
    colnames(participants)[grep('term', names(participants))] = 'location2'
    colnames(participants)[grep('^name$', names(participants))] = 'id_new'
    participants[is.na(participants)] = 'cytosol'
  } else {
    ## Generate equation with MetaCyc ID (not name)
    stoichio_sub = build.subtable(df_reaction, 'MetaCyc', 'participantStoichiometries', '///')
    
    if(nrow(df_compound) > 0) {
      compound_tmp = df_compound
      colnames(compound_tmp)[1] = 'id_new'
      participants = merge(participants, compound_tmp[,c('id_new','MetaCyc')], by='id_new')
    }
    
    CellularLocationVocabulary_tmp = df_CellularLocationVocabulary
    colnames(CellularLocationVocabulary_tmp)[1] = 'location'
    colnames(CellularLocationVocabulary_tmp)[2] = 'location2'
    if(any(participants$location %in% CellularLocationVocabulary_tmp$location)) {
      participants = merge(participants, CellularLocationVocabulary_tmp[,c('location','location2')], by='location')
    } else {
      participants[['location2']] = ''
    }  
  }
  
  participants$location2 = sub('cytosol', '(in)', participants$location2)
  participants$location2 = sub('chloroplast stroma', '(out)', participants$location2)
  participants$location2 = sub('extracellular region', '(out)', participants$location2)
  participants$location2 = sub('outer membrane-bounded periplasmic space', '(out)', participants$location2)
  participants$location2 = sub('plasma membrane', '(out)', participants$location2)
  
  participants[['assemble']] = paste(participants$stoichiometric_coefficient, participants$MetaCyc, participants$location2)
  participants$assemble = sub('^1 ', '', participants$assemble)
  participants$assemble = sub('^ ', '', participants$assemble)
  participants$assemble = sub(' \\(in)', '(in)', participants$assemble)
  participants$assemble = sub(' \\(out)', '(out)', participants$assemble)
  participants$assemble = sub(' $', '', participants$assemble)
  
  participants_tmp = participants
  colnames(participants)[grep('id_new', names(participants))] = 'lefts'
  left_sub = merge(participants[,c('lefts', 'assemble')], left_sub, by='lefts')
  left_sub = data.table(left_sub)
  left_sub2 = left_sub[, lapply(.SD, paste, collapse=' + '), by="MetaCyc", .SDcols=c("assemble")]
  left_sub2 = data.frame(left_sub2, stringsAsFactors=F)
  
  colnames(participants)[grep('lefts', names(participants))] = 'rights'
  right_sub = merge(right_sub, participants[,c('rights', 'assemble')], by='rights')
  right_sub = data.table(right_sub)
  right_sub2 = right_sub[, lapply(.SD, paste, collapse = ' + '), by="MetaCyc", .SDcols=c("assemble")]
  right_sub2 = data.frame(right_sub2, stringsAsFactors=F)
  
  ind_bidirection = grep(' <=> ', df_reaction$name)
  ind_right = grep(' => ', df_reaction$name)
  ind_left = grep(' <= ', df_reaction$name)
  
  direction = character(nrow(df_reaction))
  
  direction[ind_bidirection] = " <=> "
  direction[ind_right] = ' => '
  direction[ind_left] = ' <= '
  
  equation = paste(left_sub2$assemble, direction, right_sub2$assemble, sep='')
  
  ind_without_out = grepl('\\(out\\)', equation) == F
  equation[ind_without_out] = gsub('\\(in\\)', '', equation[ind_without_out])
  
  ## Remove unnasserary columns
  df_reaction[['xref']] = NULL
  df_reaction[['lefts']] = NULL
  df_reaction[['rights']] = NULL
  df_reaction[['participantStoichiometries']] = NULL
  
  ## Add equation and reorder the data.frame
  df_reaction[['name_id']] = equation
  
  colnames = names(df_reaction)
  colnames = colnames[-(grep('MetaCyc', colnames))]
  colnames = c('MetaCyc', colnames)
  
  df_reaction = df_reaction[,colnames]
  
  return(df_reaction)
}

.handle.xref = function(df, df_UnificationXref) {
  # Convert xref into column 'db' and value 'id'
  xref = build.subtable(df, 'name', 'xref', '///')
  xref.1 = xref[grep('^U', xref$xref),]
  xref.2 = xref[grep('^R', xref$xref),]
  xref = rbind(xref.1, xref.2)
  xref_merged = merge(xref, df_UnificationXref, by='xref')
  
  for(i in 1:nrow(xref_merged)) {
    if(xref_merged[i,'db'] %in% colnames(df) == FALSE) {
      df[xref_merged[i,'db']] = ''
    }
    df[df$name == xref_merged$name[i], xref_merged[i,'db']] = paste(xref_merged[i,'id'], df[df$name == xref_merged$name[i], xref_merged[i,'db']], sep='///')
  }
  col = colnames(df)
  for(i in col) {
    if(i == 'MetaCyc') {
      df[[i]] = gsub('///.*', '', df[[i]])
    }
    df[[i]] = gsub('///$', '', df[[i]])
  }
  return(df)
}

.parse.metacyc.biopax <- function(reaction_ids) {
  result_df = get.metacyc.reaction.byId(reaction_ids)
  result_df[is.na(result_df)] = ''
  return(result_df)
}
Any scripts or data that you put into this service are public.
RbioRXN documentation built on May 29, 2017, 10:56 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
RbioRXN
Process Rhea, KEGG, MetaCyc, Unipathway Biochemical Reaction Data

R/RbioRXN-internal.R
In RbioRXN: Process Rhea, KEGG, MetaCyc, Unipathway Biochemical Reaction Data

Defines functions create_df formula2matrix get.c.num get.formula get.h.num get.o.num get.participant d2formula parse.chebi parse.rhea unique_column convert.html parse.biopax handle.xref parse.metacyc.biopax

Try the RbioRXN package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

RbioRXN Process Rhea, KEGG, MetaCyc, Unipathway Biochemical Reaction Data

R/RbioRXN-internal.R In RbioRXN: Process Rhea, KEGG, MetaCyc, Unipathway Biochemical Reaction Data

Defines functions create_df formula2matrix get.c.num get.formula get.h.num get.o.num get.participant d2formula parse.chebi parse.rhea unique_column convert.html parse.biopax handle.xref parse.metacyc.biopax

Try the RbioRXN package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

RbioRXN
Process Rhea, KEGG, MetaCyc, Unipathway Biochemical Reaction Data

R/RbioRXN-internal.R
In RbioRXN: Process Rhea, KEGG, MetaCyc, Unipathway Biochemical Reaction Data