;; each name and synonyns (keep track of which is name, which synonym) ;; get ll ;; foreach transformation ;; used upcase transformed as hash ;; value is list of (actual source transform) ;; Given name ;; foreach transform ;; collect (actual preference source transform) ;; apply preference rules (defvar *all-name-transforms* nil) (defclass name-transform () ((name :initarg :name :initform nil :accessor name) (more-general-than :initarg :more-general-than :initform nil :accessor more-general-than) (all-more-general-than :initarg :all-more-general-than :initform nil :accessor all-more-general-than-slot) (transformer :initarg :transformer :initform nil :accessor transformer) )) (defmethod all-more-general-than ((nt name-transform)) (or (all-more-general-than-slot nt) (setf (all-more-general-than-slot nt) (remove-duplicates (apply 'append (more-general-than nt) (mapcar (lambda(g) (more-general-than (name-transform-named g))) (more-general-than nt))))))) (defmethod initialize-instance ((nt name-transform) &key) (call-next-method) (pushnew nt *all-name-transforms*)) (defun name-transform-named (name) (find name *all-name-transforms* :key 'name)) (defmacro define-name-transform (name genls args &body body) (let ((it (gensym))) `(let ((,it (or (name-transform-named ',name) (make-instance 'name-transform :name ',name)))) (setf (transformer ,it) (lambda (,@args) ,@body)) (setf (more-general-than ,it) ',genls)))) (defmethod transform ((nt name-transform) name) (funcall (transformer nt) name)) (define-name-transform :as-is () (name) name) (define-name-transform :uppercase (as-is) (name) (string-upcase name)) ;; really should be a bit more careful. Don't merge two separated numbers into a larger number ;; don't make a short acronym by removing punctuation. Fixed - suggestion of prothesaurus - collapse ;; noise into a special character (null) (define-name-transform :denoise (uppercase) (name) (string-upcase (regex-replace-all "[^A-Za-z0-9]+" name (load-time-value (string #\null))))) (define-name-transform :parenthesis (uppercase) (name) (and (#"matches" name ".+\\(.*\\)$") (mapcar 'string-upcase (car (all-matches name "^(.*)\\s*\\(\(.*)\\)$" 1 2))))) ;; some notes on transforms: merging numbers is usually not a good ;; idea P1-2 shouldn't become P12. Not all noise is equal. spaces, ;; commas, semicolons parentheses seem to be added at will. square ;; brackets are more unusual and maybe signficant. (defclass synonym-registry () ((table :initarg :table :initform nil :accessor table) (transforms :initarg :transforms :initform (mapcar 'name-transform-named '(:as-is :uppercase :denoise)) :accessor transforms))) (defclass priority-synonym-registry (synonym-registry) ((next-synonym-registry :initarg :next-synonym-registry :initform nil :accessor next-synonym-registry) (transforms :initform (mapcar 'name-transform-named '(:as-is :uppercase)) ) )) (defmethod initialize-instance ((sn synonym-registry) &key entries) (call-next-method) (setf (table sn) (make-hash-table :test 'equal :size (* (length (transforms sn)) entries)))) (defmethod each-name-transform ((r synonym-registry) name f) (loop for trans in (transforms r) for transformed = (transform trans name) with already do (loop for atransformed in (if (listp transformed) transformed (list transformed)) when (and atransformed (not (or (and (not (eq (name trans) :as-is)) (string= name atransformed)) (member atransformed already :test 'string=)))) do (push atransformed already ) and do (funcall f (name trans) name atransformed)))) (defmethod register-names ((r synonym-registry) name synonyms source &rest identifiers) (let ((all nil) (syntable (table r))) (each-name-transform r name (lambda (method name transformed) (push (list name transformed method) all))) (loop for syn in (remove name synonyms :test 'string=) do (each-name-transform r syn (lambda (method name transformed) (push (list name transformed method t) all)))) (loop for (name transformed method synonym) in all do (pushnew (list* name :method method :source source :synonym synonym identifiers) (gethash transformed syntable) :test 'equal)))) (defmethod lookup-name ((r synonym-registry) name) (let ((all nil)) (each-name-transform r name (lambda (method name transformed) (declare (ignore name method)) (push (gethash transformed (table r)) all))) (apply 'append all))) (defmethod lookup-name ((r priority-synonym-registry) name) (or (call-next-method) (lookup-name (next-synonym-registry r) name))) (defmethod abolish-synonym ((r synonym-registry) what entrez-id) (setf (gethash what (table r)) (remove-if (lambda(e) (equal (getf (cdr e) :gene) entrez-id)) (gethash what (table r))))) ; (each-entrez-gene-summary (lambda(synonyms) (incf count (1+ (length synonyms)))) '(:synonyms)) (defvar *synonym-registry* (make-instance 'synonym-registry :entries 2000000)) (defvar *human-locuslink* (make-hash-table :test 'equal)) (defparameter *dont-use-these-synonyms* '(("FAS" "2194") ("Claudin-1" "1366") ("2" "55240") ("II" "2651"))) (defparameter *added-synonyms* '(("c-jun" "jun") ("c-fos" "fos") ("ap2a" "AP2-ALPHA") ("ca15-3" "muc1") ("nfkb" "nfkb1") ("von Hippel-Lindau Protein" "vhl") ("p16INK4a" "CDKN2a") ("Stat1 p84/p91" "STAT1") ("BRN-3" "POU4F3") ("CEF1" "TNNC1") ("CEF2" "TNNC2") ("CGPR" "CALCR") )) (defvar *gene-id-to-current-gene-id* (make-hash-table :test 'equal)) (defvar *omim-to-ll* nil) (defun register-synonyms (&key (registry *synonym-registry*) (force nil) entrez omim enzyme (all t)) (when (or force (zerop (hash-table-count (table registry)))) (unless *omim-to-ll* (setf *omim-to-ll* (make-hash-table :test 'equal))) (when (or all entrez) (format *debug-io* "entrez") (each-entrez-gene-summary (lambda(id name type locuslink refseq-mrna refseq-protein omim synonyms species current-gene status) (unless (or (null name) (eq status :discontinued)) (when (or (eq species :human) (eq species :|HOMO SAPIENS|)) (setf (gethash id *human-locuslink*) t)) (apply 'register-names registry name synonyms :gene `(:gene ,id ,@(when current-gene (setf (gethash id *gene-id-to-current-gene-id*) current-gene) (list :current-gene current-gene)) ,@(if refseq-mrna (list :refseq-mrna refseq-mrna)) :type ,type ,@(if refseq-protein (list :refseq-protein refseq-protein)) ,@(when omim (pushnew id (gethash omim *omim-to-ll*) :test 'equal) (list :omim omim)) ,@(if locuslink (list :locuslink locuslink))))) ) :boa '(:id :name :type :locuslink :refseq-mrna :refseq-protein :omim :synonyms :species :current-id :status))) (when (or all enzyme) (format *debug-io* "enzyme") (each-enzyme-synonyms (lambda(lls ec names) (loop for ll in lls do (register-names registry (car names) (cdr names) :enzyme ec :locuslink ll))))) (when (or all omim) (format *debug-io* "omim") (each-omim-synonyms (lambda (id names ll) (apply 'register-names registry (car names) (cdr names) :omim :omim id (when ll `(:locuslink ,ll)))))) (loop for (name id) in *dont-use-these-synonyms* do (abolish-synonym *synonym-registry* name id)) (loop for (new old) in *added-synonyms* do (add-synonym registry new old)) )) (defmethod add-new-synonyms ((sn synonym-registry) ) (loop for (new old) in *added-synonyms* do (add-synonym sn new old))) (defmethod add-synonym ((s synonym-registry) new old) (if (lookup-name s new) (let ((entry (copy-list (nthcdr 7 (car (lookup-name s old)))))) (setf (getf entry :synonym) nil) (apply 'register-names s new nil :gene entry)))) (defmethod add-synonym ((s priority-synonym-registry) new old) (let ((entry (copy-list (nthcdr 7 (car (lookup-name s old)))))) (unless entry (warn "Couldn't find entry for ~a" old)) (setf (getf entry :synonym) nil) (apply 'register-names s new nil :gene entry))) ;; Pick up the locuslink references in the hits. ;; Either there is only one, or there is only one human or there is more than one human ;; Report this. First value is non-nil if consistent. Second value is list of locuslinks (defun unique-human-gene-id? (name-or-hits) (let* ((hits (if (stringp name-or-hits) (lookup-name *synonym-registry* name-or-hits) name-or-hits)) (them (remove-duplicates (loop for id in (loop for hit in hits for locuslink = (or (getf (cdr hit) :locuslink) (getf (cdr hit) :gene)) if (listp locuslink) append locuslink else collect locuslink) collect (or (gethash id *gene-id-to-current-gene-id*) id)) :test 'equal))) (if (= (length them) 1) (values them them hits) (let ((onlyhuman (remove-if-not (lambda(l) (gethash l *human-locuslink*)) them))) (if (= (length onlyhuman) 1) (values onlyhuman onlyhuman (remove-if-not (lambda(hit) (equal (or (getf (cdr hit) :locuslink) (getf (cdr hit) :gene)) (car onlyhuman ))) hits)) (if (some (lambda(hit) (and (find :type hit) (not (eq (getf (cdr hit) :type) :protein-coding)))) hits) (funcall 'unique-human-gene-id? (remove-if (lambda(hit) (and (find :type hit) (not (eq (getf (cdr hit) :type) :protein-coding)))) hits)) ;; logic here is a bit unsafe - if there is a main name and a synonym then use the main name. (if (and (some (lambda(hit) (getf (cdr hit) :synonym)) hits) (not (every (lambda(hit) (getf (cdr hit) :synonym)) hits))) (progn ; (print-db hits (remove-if (lambda(hit) (getf (cdr hit) :synonym)) hits)) (funcall 'unique-human-gene-id? (remove-if (lambda(hit) (getf (cdr hit) :synonym)) hits))) (values nil onlyhuman (remove-if-not (lambda(hit) (member (or (getf (cdr hit) :locuslink) (getf (cdr hit) :gene)) onlyhuman :test 'equal)) hits)))))))))