(in-package :cl-user) #| Prefixes on id in *FIELD* TI first line define what kind of record this is: * gene with known sequence + gene with known sequence and phenotype # phenotype description, molecular basis known % mendelian phenotype or locus, molecular basis unknown no prefix other, mainly phenotypes with suspected mendelian basis Example record from omim.txt. We want the names listed below *FIELD* TI *RECORD* *FIELD* NO 150292 *FIELD* TI *150292 LAMININ, GAMMA-2; LAMC2 ;;LAMININ B2 POLYPEPTIDE, TRUNCATED;; LAMB2, TRUNCATED; LAMB2T;; KALININ;; LAMININ, NICEIN, BETA-2; LAMNB2;; LAMININ 5, GAMMA-2 SUBUNIT *FIELD* TX |# (defun untangle-omim-titles (titles) (let ((main (car titles)) (rest (rest titles))) (let ((titles (remove "" (split-at-regex (caar (all-matches main "[*+]\\d+ (.*)" 1)) ";\\s+") :test 'string=)) (synonyms (mapcan (lambda(entry) (remove "" (split-at-regex entry ";\\s*") :test 'string=)) rest))) (remove-if (lambda(s) (or (#"matches" s "^\\d+$") (search ", INCLUDED" s))) (append titles synonyms))))) ;; funcall function with two arguments: id, and names. If you want the first can be considered the name ;; and the rest the synonyms. (defvar *omim-to-ll* nil) (defun omim-to-ll () (or *omim-to-ll* (setq *omim-to-ll* (let ((table (make-hash-table :test 'equal))) (each-entrez-gene-summary (lambda(id omim) (when (gethash id (locuslink-to-info)) ;; what's this for?? - Mon July 17, 2006 (push id (gethash omim table)))) :boa '(:id :omim)) table)))) (defun each-omim-synonyms (function) (when (config :omim) (let ((regex (compile-regex "(^[*][A-Z]*[*]( [A-Z]+){0,1})"))) (with-open-file (f (config :omim)) (loop with collecting = :nothing with id and titles and is-a-gene and token with map = (omim-to-ll) for line = (read-line f nil :eof) until (eq line :eof) do (when (and (not (equal line "")) (char= (char line 0) #\*)) (setq token (caar (all-matches line regex 1))) (when token (cond ((equal token "*RECORD*") (and id is-a-gene (let ((ll (gethash id map))) (funcall function id (untangle-omim-titles (reverse titles)) (car ll)))) (setq id nil titles nil collecting :nothing is-a-gene nil)) ((equal token "*FIELD* NO") (setq collecting :id)) ((equal token "*FIELD* TI") (setq collecting :title)) (t (setq collecting :nothing))))) (if token (setq token nil) (progn (ecase collecting (:nothing nil) (:id (if id (error "Already got id ~a" id) (setq id line))) (:title (when (member (char line 0) '(#\* #\+) :test 'char=) (setq is-a-gene t)) (when (and id is-a-gene) (push line titles)))))))))))