#!/usr/bin/python import getopt, string, sets import re from fluxor2.util.predicate import * import fluxor2.reader def delimited_file2array_of_dictionary( filename, delimiter='\t', comment='#' ): delimited_file = open( filename ) headers = delimited_file.readline().strip().split(delimiter) array = [] for line in delimited_file: if line.strip()[0] != comment: columns = line.strip().split(delimiter) dictionary = {} for i in range( len( columns ) ): dictionary[headers[i]] = columns[i] array.append( dictionary ) delimited_file.close() return array def get_similarity_match( ucsd, ecocyc, eu_map ): score = 0 for e in ecocyc: for u in ucsd: if u in eu_map[e]: score += 1 return score def match_rxns_by_gene( gene_ecocyc_rxns_file, unmatched_ucsd_rxns_file, ecocyc_ucsd_map, matched_ucsd_rxns_file, all_p=True ): rp = fluxor2.reader.FFParser() gene_ecocyc_rxns = rp.parse( gene_ecocyc_rxn_file ) unmatched_ucsd_rxns = rp.parse( unmatched_ucsd_rxns_file ) headers = unmatched_ucsd_rxns[0] header_template = get_header_template( headers ) matched_ucsd_rxns = open( matched_ucsd_rxns_file, 'w' ) matched_ucsd_rxns.write( '\t'.join( headers ) + '\n' ) for rxn in unmatched_ucsd_rxns[1:]: if rxn.has_key( 'equation' ): ucsd_substrates = parseEqn( rxn['equation'] ) print "UCSD substrates: %s" % ', '.join( ucsd_substrates ) best_score = 0 best_gene = {} difference = sets.Set() ucsd_minus_ecocyc = sets.Set() ecocyc_minus_ucsd = sets.Set() for gene in gene_ecocyc_rxns[1:]: if rxn['abbreviation'].lower() == gene['ucsd-rxn'].lower() and gene.has_key( 'ecocyc-substrates' ): ecocyc_substrates = sets.Set( gene['ecocyc-substrates'][:-1].split( ',' ) ) print "EcoCyc substrates: %s" % ', '.join( ecocyc_substrates ) score = len( ucsd_substrates.intersection( ecocyc_substrates ) ) print "Score %d" % score ecocyc_minus_ucsd = ecocyc_substrates.difference( ucsd_substrates ) ucsd_minus_ecocyc = ucsd_substrates.difference( ecocyc_substrates ) if score >= best_score: best_score = score best_gene = gene best_emu = ecocyc_minus_ucsd best_ume = ucsd_minus_ecocyc if all_p: matched_ucsd_rxns.write( '\t'.join( header_template ) % rxn ) matched_ucsd_rxns.write('%(ecocyc-rxn)s\t:gene-match\t%(ecocyc-eqn)s' % gene ) matched_ucsd_rxns.write('\t%s\t%s\t%d\n' % (', '.join( ecocyc_minus_ucsd ), ', '.join( ucsd_minus_ecocyc ), score ) ) if not all_p and best_gene != {}: matched_ucsd_rxns.write( '\t'.join( header_template ) % rxn ) matched_ucsd_rxns.write('%(ecocyc-rxn)s\t:gene-match\t%(ecocyc-eqn)s' % best_gene ) matched_ucsd_rxns.write('\t%s\t%s\t%d\n' % (', '.join( best_emu ), ', '.join( best_ume ), best_score ) ) matched_ucsd_rxns.close() def merge_rxns_by_gene( map_by_gene, rxns, ecocyc_ucsd_map, filename, all_p=True ): mapped_others = open( filename, 'w' ) mapped_others.write('abbreviation\tofficialName\tequation\tgeneAssociation\tecocyc-rxn\tanalysis\tecocyc-equation\tunmatched-ecocyc-cpds\tunmatched-ucsd-cpds\tscore\n' ) for line in rxns: if line.has_key( 'equation' ): ucsd_substrates = parseEqn( line['equation'] ) print "UCSD substrates: %s" % ', '.join( ucsd_substrates ) best_score = 0 best_gene = {} difference = sets.Set() ucsd_minus_ecocyc = sets.Set() ecocyc_minus_ucsd = sets.Set() for gene in map_by_gene: if line['abbreviation'].lower() == gene['ucsd-rxn'].lower() and gene.has_key( 'ecocyc-substrates' ): ecocyc_substrates = sets.Set( gene['ecocyc-substrates'][:-1].split(',') ) print "EcoCyc substrates: %s" % ', '.join( ecocyc_substrates ) #score = get_similarity_match( ucsd_substrates, ecocyc_substrates, ecocyc_ucsd_map ) score = len( ucsd_substrates.intersection( ecocyc_substrates ) ) print "Score %d" % score ecocyc_minus_ucsd = ecocyc_substrates.difference( ucsd_substrates ) ucsd_minus_ecocyc = ucsd_substrates.difference( ecocyc_substrates ) if score >= best_score: best_score = score best_gene = gene best_emu = ecocyc_minus_ucsd best_ume = ucsd_minus_ecocyc if all_p: mapped_others.write('%(abbreviation)s\t%(officialName)s\t%(equation)s\t%(geneAssociation)s\t' % line) mapped_others.write('%(ecocyc-rxn)s\t:gene-match\t%(ecocyc-eqn)s' % gene ) mapped_others.write('\t%s\t%s\t%d\n' % (', '.join( ecocyc_minus_ucsd ), ', '.join( ucsd_minus_ecocyc ), score ) ) if not all_p and best_gene != {}: mapped_others.write('%(abbreviation)s\t%(officialName)s\t%(equation)s\t%(geneAssociation)s\t' % line) mapped_others.write('%(ecocyc-rxn)s\t:gene-match\t%(ecocyc-eqn)s' % best_gene ) mapped_others.write('\t%s\t%s\t%d\n' % (', '.join( ecocyc_minus_ucsd ), ', '.join( ucsd_minus_ecocyc ), best_score ) ) mapped_others.close() def parseEqn( eqn ): rp = fluxor2.reader.ReactionParser() (left, right, direction, conversion_type) = rp.parseEqn( eqn, '') substrates = sets.Set() for physEnt, coefficient, compartment in left: substrates.add( physEnt ) for physEnt, coefficient, compartment in right: substrates.add( physEnt ) return substrates def gene_rxns( rxns ): gp = GeneParser() gene_rxns = [] for rxn in rxns: if rxn.has_key( 'geneAssociation' ): print rxn['geneAssociation'] rxn['genepredicate'] = gp.parse( rxn['geneAssociation' ] ) gene_rxns.append( rxn ) return gene_rxns def make_gene_list( name, rxns, filename ): out = open( filename, 'w' ) out.write( '(setq %s (list ' % name ) for gene in rxns: out.write( '(list "%s" ' % gene['abbreviation'] + make_list( gene['genepredicate'].get_literals() ) + ' ) ' ) out.write( ' ) )\n' ) out.close() def make_list( list ): return '(list "' + '" "'.join( list ) + '" )\t' def query_select( db, query ): result_set = [] for row in db: if query_match( row, query ): result_set.append( row ) return result_set class GeneParser: """ predicate ::= operand | conjunction | disjunction conjunction :: = operand | operand 'and' conjunction disjunction :: = operand | operand 'or' disjunction operand ::= literal | '(' predicate ')' """ def __init__( self ): pass def parse(self , gpr, regexp='[A-Za-z0-9_]+' ): self.tokens = gpr.split(' ') for i in range( self.tokens.count( '' ) ): self.tokens.remove( '' ) #print self.tokens self.bnumber = re.compile(regexp) if len( self.tokens ) > 0: return self.parse_predicate() def consume_token( self ): token = self.tokens[0] del self.tokens[0] return token def peek_token( self ): if len( self.tokens ) > 0: return self.tokens[0] else: print "Token: %s" % self.tokens return '' def has_token( self ): return len( self.tokens ) def parse_conjunction( self, operand ): conjunction = [ operand ] while self.has_token() and self.peek_token().lower() == 'and': self.consume_token() conjunction.append( self.parse_operand() ) return And( conjunction ) def parse_disjunction( self, operand ): disjunction = [ operand ] while self.has_token() and self.peek_token().lower() == 'or': self.consume_token() disjunction.append( self.parse_operand() ) return Or( disjunction ) def parse_predicate( self ): predicate = self.parse_operand() while self.has_token(): if self.peek_token().lower() == 'or': predicate = self.parse_disjunction( predicate ) elif self.peek_token().lower() == 'and': predicate = self.parse_conjunction( predicate ) else: break return predicate # def parse_predicate( self ): # predicate = self.parse_operand() # while self.has_token(): # if self.peek_token().lower() in ('and', 'or'): # operator = self.consume_token() # operand = self.parse_predicate() # if operator.lower() == 'and': # predicate = And( [ predicate, operand ] ) # else: # predicate = Or( [ predicate, operand ] ) # else: # break # return predicate def parse_operand( self ): if self.bnumber.match( self.peek_token() ): return Literal( self.consume_token() ) else: if self.peek_token() == '(': self.consume_token() predicate = self.parse_predicate() if self.peek_token() == ')': self.consume_token() return predicate def __call__( self, operands ): return self.predicate( operands ) def __repr__( self ): return str( self.predicate ) def make_hash_table( name, dictionary, filename, test='fequal'): out = open( filename, 'w' ) out.write("(setf %s (make-hash-table :test '%s)\n" % (name, test) ) for key in dictionary: out.write( "(setf (gethash '%s %s) '%s)\n" % (key, name, dictionary[key])) out.close() def unmatched_genes_to_lisp( superclass_file, lispdir, lisp_name ): rp = fluxor2.reader.FFParser() gp = fluxor2.reader.GeneParser() superclass = rp.parse( superclass_file ) lisp_file = '%s%s.lisp' % (lispdir, lisp_name) out = open( lisp_file , 'w' ) out.write( "(setq *%s* '(" % lisp_name ) gene_rxns = [] for rxn in superclass: if rxn.has_key( 'geneAssociation' ): genepredicate = gp.parse( rxn['geneAssociation'] ) out.write( '(%s ( "%s" ) )' % (rxn['abbreviation'], '" "'.join( genepredicate.get_literals() ) ) ) out.write( "))\n" ) out.close() return lisp_file def unmapped_genes_to_lisp( genes, filename ): out = open( filename, 'w' ) out.write( "(setq *unmatched-genes* '(" ) for gene in genes: if gene.has_key( 'genepredicate' ) and gene['genepredicate']: out.write( '(%s ( "%s" ) )' % ( gene['abbreviation'], '" "'.join( gene['genepredicate'].get_literals() ) ) ) out.write( "))\n" ) out.close() #if __name__ == '__main__': # rxns = delimited_file2array_of_dictionary( 'iAF1261-ecocyc-rxn-unmapped-otherwise.txt') # genes = gene_rxns( rxns ) # unmapped_genes_to_lisp( genes, 'unmapped-otherwise.lisp' ) # map_by_gene = delimited_file2array_of_dictionary('other-rxn-mappings-by-gene.txt') # ecocyc_ucsd_map = make_ecocyc_ucsd_map( 'iAF1261-ecocyc-cpd-mappings.txt' ) # merge_by_gene( map_by_gene, rxns, ecocyc_ucsd_map, 'mapped-other-rxns.txt' ) # rxns_unmapped_cpds = delimited_file2array_of_dictionary( 'iAF1261-ecocyc-rxns-with-unmapped-cpds.txt' ) # rxns_unmapped_cpds_w_genes = gene_rxns( rxns_umapped_cpds ) # unmapped_genes_to_lisp( rxns_unmapped_cpds_w_genes, 'rxns-with-unmapped-cpds.lisp' ) # rxns_with_unmapped_cpds_by_gene = delimited_file2array_of_dictionary( 'rxns-with-unmapped-cpds-by-gene.txt' ) # merge_by_gene( rxns_with_unmapped_cpds_by_gene, rxns_unmapped_cpds, ecocyc_ucsd_map, 'mapped-rxns-with-unmapped-cpds-by-gene.txt' )