#!/usr/bin/python import fluxor2.reader, sets class ReactionMatcher: def __init__( self ): self.rp = fluxor2.reader.FFParser() def fill_empty(self, rxn, headers ): """ensure a value exists for every key """ for header in headers: if header not in rxn: rxn[header] = '' return rxn def filter_file(self, superclass_file, filter_p, args, subclass_file ): """ """ superclass = self.rp.parse( superclass_file ) headers = superclass[0] header_template = self.get_header_template( headers ) out = open( subclass_file, 'w' ) out.write('\t'.join( headers ) + '\n') for rxn in superclass[1:]: if apply( filter_p, (rxn, args) ): out.write( '\t'.join( header_template ) % self.fill_empty( rxn, headers ) + '\n') out.close() return self.rp.parse( subclass_file ) def unmapped_wo_genes_p(self, rxn, mapped_rxns ): return rxn['abbreviation'] not in mapped_rxns and ('geneAssociation' not in rxn or rxn['geneAssociation'] == '') def make_hash_table(self, name, dictionary, filename, write_or_append='w', test='fequal' ): out = open( filename, write_or_append ) out.write("(setf %s (make-hash-table :test '%s)\n" % (name, test) ) for key in dictionary: out.write( "(setf (gethash '%s %s) '%s)\n" % (key, name, dictionary[key])) out.close() def make_genes_of_ucsd_rxn( self, ucsd_rxns_file ): rp = fluxor2.reader.FFParser() gp = fluxor2.reader.GeneParser() gene_predicate = {} ucsd_rxns = rp.parse( ucsd_rxns_file ) for rxn in ucsd_rxns[1:]: if rxn.has_key( 'geneAssociation' ): print "%s: %s" % (rxn['abbreviation'], rxn['geneAssociation'] ) gene_predicate[rxn['abbreviation']] = gp.parse( rxn['geneAssociation'] ) return gene_predicate def print_flatfile_to_lisp( self, flatfile, lispdir, lisp_name ): rp = fluxor2.reader.FFParser() ff = rp.parse( flatfile ) headers = ff[0] header_template = self.get_header_template( headers ) out = open( lispdir + lisp_name + '.lisp', 'w' ) out.write("(setq *%s* '(" % lisp_name ) out.write( '( "' + '" "'.join( headers ) + '" )\n' ) def print_ucsd_rxns_of_gene_to_lisp( self, rxns_of_gene, lispdir, lisp_name ): lisp_file = '%s%s.lisp' % ( lispdir, lisp_name ) out = open( lisp_file, 'w' ) out.write( "(setq *%s* '(" % lisp_name ) for gene in rxns_of_gene: out.write( '("%s" ( "%s" ) )\n' % (gene, '" "'.join( rxns_of_gene[gene] ) ) ) out.write( "))\n") out.close() return lisp_file def get_ucsd_rxns_of_gene( self, ucsd_rxns_file ): rp = fluxor2.reader.FFParser() rxns_of_gene = {} genes_of_rxn = {} ucsd_rxns = rp.parse( ucsd_rxns_file ) gp = fluxor2.reader.GeneParser() for rxn in ucsd_rxns[1:]: if 'geneAssociation' in rxn: genes_of_rxn[rxn['abbreviation']] = gp.parse( rxn['geneAssociation'] ).get_literals() for rxn in genes_of_rxn: for gene in genes_of_rxn[rxn]: if gene in rxns_of_gene: rxns_of_gene[gene].append( rxn ) else: rxns_of_gene[gene] = [rxn] return rxns_of_gene def print_genes_of_ucsd_rxn_to_lisp(self, ucsd_rxns_file, lispdir, lisp_name ): gene_predicate = self.make_genes_of_ucsd_rxn( ucsd_rxns_file ) lisp_file = '%s%s.lisp' % (lispdir, lisp_name) out = open( lisp_file , 'w' ) out.write( "(setq *%s* '(" % lisp_name ) gene_rxns = [] for rxn in gene_predicate: out.write( '("%s" ( "%s" ) )' % (rxn, '" "'.join( gene_predicate[rxn].get_literals() ) ) ) out.write( "))\n" ) out.close() return lisp_file def print_comment_and_rxns_of_gene(self, comment_of_rxn_gene, comment_and_rxns_of_gene_file ): fp = fluxor2.reader.FFParser() comment_and_rxns_of_gene = {} for gene, rxn in comment_of_rxn_gene: if gene in comment_and_rxns_of_gene: comment_and_rxns_of_gene[gene].append( rxn ) else: comment_and_rxns_of_gene[gene] = [comment_of_rxn_gene[(gene, rxn)].replace('\n',' '), rxn] genes = comment_and_rxns_of_gene.keys() genes.sort() out = open( comment_and_rxns_of_gene_file , 'w' ) out.write('gene\treactions\tcomment\n') for gene in genes: out.write( '%s\t%s\t%s\n' % ( gene, ','.join( comment_and_rxns_of_gene[gene][1:] ), comment_and_rxns_of_gene[gene][0] ) ) out.close() return fp.parse( comment_and_rxns_of_gene_file ) def print_unmapped_rxns_w_genes_comments(self, genes_wo_ecocyc_rxns_file, unmapped_rxns_w_genes_file, unmapped_rxns_w_genes_comments_file ): unmapped_rxns_w_genes_comments = {} rp = fluxor2.reader.FFParser() genes_wo_ecocyc_rxns = rp.parse( genes_wo_ecocyc_rxns_file ) unmapped_rxns_w_genes = rp.parse( unmapped_rxns_w_genes_file ) for gene in genes_wo_ecocyc_rxns[1:]: for rxn in gene['reactions'].split(',' ): if rxn in unmapped_rxns_w_genes_comments: unmapped_rxns_w_genes_comments[rxn].append( gene ) else: unmapped_rxns_w_genes_comments[rxn] = [gene ] headers = unmapped_rxns_w_genes[0] header_template = self.get_header_template( headers ) out = open( unmapped_rxns_w_genes_comments_file, 'w' ) out.write( '\t'.join( headers ) + '\n' ) for rxn in unmapped_rxns_w_genes[1:]: if rxn['abbreviation'] in unmapped_rxns_w_genes_comments: notes = [] for gene in unmapped_rxns_w_genes_comments[rxn['abbreviation']]: notes.append( '%s : %s' % ( gene['gene'], gene['comment'] ) ) rxn['notes'] = '\t'.join( notes ) out.write( '\t'.join( header_template ) % self.fill_empty( rxn, headers ) + '\n' ) out.close() return rp.parse( unmapped_rxns_w_genes_comments_file ) def get_header_template( self, headers ): return ['%%(%s)s' % header for header in headers] def print_unmapped_rxns_wo_cpd_mappings(self, rxns_wo_cpds_file, mapped_rxns_wo_cpds_file, unmapped_rxns_wo_cpd_mappings_file ): rp = fluxor2.reader.FFParser() cat3 = rp.parse( rxns_wo_cpds_file ) cat3[0] = headers mapped_cat3 = rp.parse( mapped_rxns_wo_cpds_file ) mapped_cat3_rxns = {} for rxn in mapped_cat3[1:]: mapped_cat3_rxns[rxn['abbreviation']] = rxn out = open( unmapped_rxns_wo_cpd_mappings_file, 'w' ) header_template = self.get_header_template( headers ) out.write( '\t'.join( headers ) + '\n' ) for rxn in cat3[1:]: if rxn['abbreviation'] not in mapped_cat3_rxns: out.write( '\t'.join( header_template ) % self.fill_empty( rxn, headers ) + '\n' ) out.close() return rp.parse( unmapped_rxns_wo_cpd_mappings_file ) def match_rxns_by_gene(self, gene_ecocyc_rxns_file, unmatched_ucsd_rxns_file, ecocyc_ucsd_map, matched_ucsd_rxns_file, all_p=True ): rp = fluxor2.reader.FFParser() gene_ecocyc_rxns = rp.parse( gene_ecocyc_rxns_file ) unmatched_ucsd_rxns = rp.parse( unmatched_ucsd_rxns_file ) headers = unmatched_ucsd_rxns[0] header_template = self.get_header_template( headers ) matched_ucsd_rxns = open( matched_ucsd_rxns_file, 'w' ) matched_ucsd_rxns.write( '\t'.join( headers ) + '\n' ) for rxn in unmatched_ucsd_rxns[1:]: if rxn.has_key( 'equation' ): ucsd_substrates = self.parseEqn( rxn['equation'] ) print "UCSD substrates: %s" % ', '.join( ucsd_substrates ) best_score = 0 best_gene = {} difference = sets.Set() ucsd_minus_ecocyc = sets.Set() ecocyc_minus_ucsd = sets.Set() score = 0 ecocyc_substrates = sets.Set() for gene in gene_ecocyc_rxns[1:]: if rxn['abbreviation'].lower() == gene['ucsd-rxn'].lower() and gene.has_key( 'ecocyc-substrates' ): ecocyc_substrates = sets.Set( gene['ecocyc-substrates'][:-1].split( ',' ) ) print "EcoCyc substrates: %s" % ', '.join( ecocyc_substrates ) score = len( ucsd_substrates.intersection( ecocyc_substrates ) ) print "Score %d" % score ecocyc_minus_ucsd = ecocyc_substrates.difference( ucsd_substrates ) ucsd_minus_ecocyc = ucsd_substrates.difference( ecocyc_substrates ) if score >= best_score: best_score = score best_gene = gene best_emu = ecocyc_minus_ucsd best_ume = ucsd_minus_ecocyc if all_p: matched_ucsd_rxns.write( '\t'.join( header_template ) % self.fill_empty( rxn, headers ) ) matched_ucsd_rxns.write('%(ecocyc-rxn)s\t:gene-match\t%(ecocyc-eqn)s' % gene ) matched_ucsd_rxns.write('\t%s\t%s\t%d\n' % (', '.join( ecocyc_minus_ucsd ), ', '.join( ucsd_minus_ecocyc ), score ) ) if not all_p and best_gene != {}: matched_ucsd_rxns.write( '\t'.join( header_template ) % self.fill_empty( rxn, headers) ) matched_ucsd_rxns.write('%(ecocyc-rxn)s\t:gene-match\t%(ecocyc-eqn)s' % best_gene ) matched_ucsd_rxns.write('\t%s\t%s\t%d\n' % (', '.join( best_emu ), ', '.join( best_ume ), best_score ) ) matched_ucsd_rxns.close() return rp.parse( matched_ucsd_rxns_file ) def print_complement_file(self, superclass_file, subclass_file, complement_file ): rp = fluxor2.reader.FFParser() superclass = rp.parse( superclass_file ) subclass = rp.parse( subclass_file ) headers = superclass[0] header_template = self.get_header_template( headers ) subclass_rxns = {} for rxn in subclass[1:]: subclass_rxns[rxn['abbreviation']] = rxn out = open( complement_file, 'w' ) out.write( '\t'.join( headers ) + '\n' ) for rxn in superclass[1:]: if rxn['abbreviation'] not in subclass_rxns: out.write( '\t'.join( header_template ) % self.fill_empty( rxn, headers ) + '\n' ) out.close() return rp.parse( complement_file ) def print_ecocyc_ucsd_map_to_lisp(self, cpd_filename, lispdir, lisp_name ): ecocyc_ucsd_map = self.make_ecocyc_ucsd_map( cpd_filename ) self.make_hash_table( lisp_name, ecocyc_ucsd_map, lispdir + lisp_name + ".lisp" ) return ecocyc_ucsd_map def make_ecocyc_ucsd_map(self, cpd_filename ): rp = fluxor2.reader.FFParser() cpds = rp.parse( cpd_filename ) ecocyc_ucsd_map = {} for cpd in cpds[1:]: if 'ecocyc-id' in cpd: if cpd['ecocyc-id'] in ecocyc_ucsd_map: ecocyc_ucsd_map[cpd['ecocyc-id']].append( cpd['abbreviation'] ) else: ecocyc_ucsd_map[cpd['ecocyc-id']] = [cpd['abbreviation']] return ecocyc_ucsd_map def parseEqn(self, eqn ): rp = fluxor2.reader.ReactionParser() (left, right, direction, conversion_type) = rp.parseEqn( eqn, '') substrates = sets.Set() for physEnt, coefficient, compartment in left: substrates.add( physEnt ) for physEnt, coefficient, compartment in right: substrates.add( physEnt ) return substrates def fgrep(self, rxn, args ): column = args[0] match = args[1] return rxn[column].find( match ) != -1 def transport_p(self, rxn, args ): return not self.fgrep( rxn, ('equation', ' : ' ) ) def extracellular_p(self, rxn, args ): return self.fgrep( rxn, ('equation', '[e] : ') ) def periplasmic_p(self, rxn, args ): return self.fgrep( rxn, ('equation', '[p] : ') ) def cytoplasmic_p(self, rxn, args ): return self.fgrep( rxn, ('equation', '[c] : ') ) def filter_categories(self, category, iafdir ): rxns = {} rxns['w_genes'] = [] rxns['unmatched_rxns_w_genes'] = [] rxns['wo_genes'] = [] rxns['wo_genes_extracellular'] = [] rxns['wo_genes_periplasmic'] = [] rxns['wo_genes_cytoplasmic'] = [] rxns['wo_genes_transport'] = [] for i in range( len( category ) ): rxns['w_genes'].append( self.filter_file( iafdir + category[i], has_genes, (), iafdir + 'category%d-rxns-w-genes.txt' % i ) ) rxns['unmatched_rxns_w_genes'] = self.print_complement_file( iafdir + 'category%d-rxns-w-genes.txt' % i, iafdir + 'category%d-matched-rxns-by-gene-best.txt' % i, iafdir + 'category%d-unmatched-rxns-w-gene.txt' % i ) rxns['wo_genes'].append( self.print_complement_file( iafdir + category[i], iafdir + 'category%d-rxns-w-genes.txt' % i , iafdir + 'category%d-rxns-wo-genes.txt' % i ) ) rxns['wo_genes_extracellular'].append( self.filter_file( iafdir + 'category%d-rxns-wo-genes.txt' % i, extracellular_p, (), iafdir + 'category%d-rxns-wo-genes-extracellular.txt' % i ) ) rxns['wo_genes_periplasmic'].append( self.filter_file( iafdir + 'category%d-rxns-wo-genes.txt' % i, periplasmic_p, (), iafdir + 'category%d-rxns-wo-genes-periplasmic.txt' % i ) ) rxns['wo_genes_cytoplasmic'].append( self.filter_file( iafdir + 'category%d-rxns-wo-genes.txt' % i , cytoplasmic_p, (), iafdir + 'category%d-rxns-wo-genes-cytoplasmic.txt' %i ) ) rxns['wo_genes_transport'].append( self.filter_file( iafdir + 'category%d-rxns-wo-genes.txt' % i , transport_p, (), iafdir + 'category%d-rxns-wo-genes-transport.txt' %i ) ) return rxns def get_genes_of_ecocyc_rxn( self, ecocyc_rxns_and_comments_of_gene ): genes_of_ecocyc_rxn = {} ercg = ecocyc_rxns_and_comments_of_gene for gene in ercg: for rxn in ercg[gene]['ecocyc-rxns']: if rxn in genes_of_ecocyc_rxn: genes_of_ecocyc_rxn[rxn].add( gene ) else: genes_of_ecocyc_rxn[rxn] = sets.Set( [gene] ) return genes_of_ecocyc_rxn def get_ecocyc_rxns_of_gene_set( self, genes_of_ecocyc_rxn ): ecocyc_rxns_of_gene_set = {} for rxn in genes_of_ecocyc_rxn: if sets.ImmutableSet( genes_of_ecocyc_rxn[rxn] ) in ecocyc_rxns_of_gene_set: ecocyc_rxns_of_gene_set[sets.ImmutableSet( genes_of_ecocyc_rxn[rxn] ) ].add( rxn ) else: ecocyc_rxns_of_gene_set[sets.ImmutableSet( genes_of_ecocyc_rxn[rxn] ) ] = sets.Set( [rxn] ) return ecocyc_rxns_of_gene_set def get_genes_of_ucsd_rxn( self, ercg ): genes_of_ucsd_rxn = {} for gene in ercg: for rxn in ercg[gene]['ecocyc-rxns']: if rxn in genes_of_ucsd_rxn: genes_of_ucsd_rxn[rxn].add( gene ) else: genes_of_ucsd_rxn[rxn] = sets.Set( [gene] ) return genes_of_ucsd_rxn def get_ucsd_rxns_of_gene_set( self, genes_of_ucsd_rxn ): ucsd_rxns_of_gene_set = {} for rxn in genes_of_ucsd_rxn: if sets.ImmutableSet( genes_of_ucsd_rxn[rxn] ) in ucsd_rxns_of_gene_set: ucsd_rxns_of_gene_set[sets.ImmutableSet( genes_of_ucsd_rxn[rxn] ) ].add( rxn ) else: ucsd_rxns_of_gene_set[sets.ImmutableSet( genes_of_ucsd_rxn[rxn] ) ] = sets.Set( [rxn] ) return ucsd_rxns_of_gene_set def get_rxn_matches( self, ucsd_rxns_of_gene_set, ecocyc_rxns_of_gene_set ): rxn_matches = {} for gene_set in ucsd_rxns_of_gene_set: if gene_set in ecocyc_rxns_of_gene_set: rxn_matches[gene_set] = {'ucsd-rxns': ucsd_rxns_of_gene_set[gene_set], 'ecocyc-rxns': ecocyc_rxns_of_gene_set[gene_set] } return rxn_matches def get_gene_predicate( self, ucsd_rxns ): gp = fluxor2.reader.GeneParser() gene_predicate = {} for rxn in ucsd_rxns[1:]: if 'geneAssociation' in rxn: gene_predicate[rxn['abbreviation']] = gp.parse( rxn['geneAssociation'] ) return gene_predicate def get_rxn_match_by_gene_predicate( self, ecocyc_rxns_of_gene_set, ercg, gene_predicate ): rxn_match_by_gene_predicate = {} for gene_set in ecocyc_rxns_of_gene_set: genes = self.initialize_assignment( ercg ) genes = self.set_assignment( genes, gene_set ) for rxn in gene_predicate: if gene_predicate[rxn]( genes ): if rxn in rxn_match_by_gene_predicate: rxn_match_by_gene_predicate[rxn].add( ecocyc_rxns_of_gene_set[gene_set] ) else: rxn_match_by_gene_predicate[rxn] = sets.Set( [ecocyc_rxns_of_gene_set[gene_set]] ) return rxn_match_by_gene_predicate def initialize_assignment(self, list_of_literals ): assignment = {} for literal in list_of_literals: assignment[literal] = False return assignment def set_assignment( self, assignment, list_of_literals ): for literal in list_of_literals: assignment[literal] = True return assignment def count_mapped_rxns( self, ucsd_rxns ): mapped_rxns = {} for rxn in ucsd_rxns[1:]: if 'ecocyc-rxn-ids' in rxn: mapped_rxns[rxn['abbreviation']] = rxn['ecocyc-rxn-ids'].strip('( )').split(' ') return mapped_rxns def print_test_rxn_match_by_gene_predicate( self, ucsd_rxns, ecocyc_rxns_and_comments_of_gene, rxn_match_by_gene_predicate_file ): ercg = ecocyc_rxns_and_comments_of_gene genes_of_ecocyc_rxn = self.get_genes_of_ecocyc_rxn( ercg ) ecocyc_rxns_of_gene_set = self.get_ecocyc_rxns_of_gene_set( genes_of_ecocyc_rxn ) genes_of_ucsd_rxn = self.get_genes_of_ucsd_rxn( ercg ) ucsd_rxns_of_gene_set = self.get_ucsd_rxns_of_gene_set( genes_of_ucsd_rxn ) rxn_matches = self.get_rxn_matches( ucsd_rxns_of_gene_set, ecocyc_rxns_of_gene_set ) gene_predicate = self.get_gene_predicate( ucsd_rxns ) rxn_match_by_gene_predicate = self.get_rxn_match_by_gene_predicate( ecocyc_rxns_of_gene_set, ercg, gene_predicate ) out = open( rxn_match_by_gene_predicate_file,'w') out.write('ucsd-rxn\tecocyc-rxns\tgene-predicate\n') for rxn in rxn_match_by_gene_predicate: out.write( rxn + '\t' ) for ecocyc_rxn_set in rxn_match_by_gene_predicate[rxn]: for ecocyc_rxn in ecocyc_rxn_set: out.write(ecocyc_rxn + ',' ) out.write( '\t' ) out.write( str( gene_predicate[rxn] ) + '\n' ) out.close() def print_rxn_match_by_gene_predicate( self, ucsd_rxns, ecocyc_rxns_and_comments_of_gene, rxn_match_by_gene_predicate_file ): ercg = ecocyc_rxns_and_comments_of_gene genes_of_ecocyc_rxn = self.get_genes_of_ecocyc_rxn( ercg ) ecocyc_rxns_of_gene_set = self.get_ecocyc_rxns_of_gene_set( genes_of_ecocyc_rxn ) genes_of_ucsd_rxn = self.get_genes_of_ucsd_rxn( ercg ) ucsd_rxns_of_gene_set = self.get_ucsd_rxns_of_gene_set( genes_of_ucsd_rxn ) rxn_matches = self.get_rxn_matches( ucsd_rxns_of_gene_set, ecocyc_rxns_of_gene_set ) gene_predicate = self.get_gene_predicate( ucsd_rxns ) rxn_match_by_gene_predicate = self.get_rxn_match_by_gene_predicate( ecocyc_rxns_of_gene_set, ercg, gene_predicate ) headers = ucsd_rxns[0] header_template = self.get_header_template( headers ) out = open( rxn_match_by_gene_predicate_file,'w') out.write( '\t'.join( headers ) + '\n' ) for rxn in ucsd_rxns[1:]: if 'ecocyc-rxn-ids' in rxn: ecocyc_rxns = rxn['ecocyc-rxn-ids'].strip('( )').split(' ') else: ecocyc_rxns = [] if rxn['abbreviation'] in rxn_match_by_gene_predicate: for ecocyc_rxn_set in rxn_match_by_gene_predicate[rxn['abbreviation']]: for ecocyc_rxn in ecocyc_rxn_set: if ecocyc_rxn not in ecocyc_rxns: ecocyc_rxns.append( ecocyc_rxn ) rxn['ecocyc-rxn-ids'] = ','.join( ecocyc_rxns ) if 'ecocyc-rxn-ids' in rxn: print '%s\t%s' % (rxn['abbreviation'], rxn['ecocyc-rxn-ids'] ) out.write( '\t'.join( header_template ) % self.fill_empty( rxn, headers ) + '\n' ) out.close() return ucsd_rxns if __name__ == "__main__": iafdir = '/home/zucker/src/lsw/trunk/bug/iAF1261/' lispdir = iafdir category = ['iAF1261-ecocyc-rxn-mappings.txt', 'iAF1261-ecocyc-rxns-unmapped-exchange.txt', 'iAF1261-ecocyc-rxns-unmapped-diffusion.txt', 'iAF1261-ecocyc-rxns-with-unmapped-cpds.txt', 'iAF1261-ecocyc-rxn-unmapped-otherwise.txt'] fp = fluxor2.reader.FFParser() rm = ReactionMatcher() ucsd_rxns_of_gene = rm.get_ucsd_rxns_of_gene( iafdir + category[0] ) lispfile = rm.print_ucsd_rxns_of_gene_to_lisp( ucsd_rxns_of_gene, lispdir, 'ucsd-rxns-of-gene' ) import ecocyc_rxns_and_comments_of_gene ercg = ecocyc_rxns_and_comments_of_gene.ecocyc_rxns_and_comments_of_gene ucsd_rxns = fp.parse( iafdir + category[0] ) #rm.print_test_rxn_match_by_gene_predicate( ucsd_rxns, ercg, iafdir + 'test-rxn-match-by-gene-predicate.txt' ) #rm.print_rxn_match_by_gene_predicate( ucsd_rxns, ercg, iafdir + 'rxn-match-by-gene-predicate3.txt' )