(eval-when (:compile-toplevel :load-toplevel :execute) (defvar *gene-name-rewrite-rules* (make-hash-table))) (eval-when (:load-toplevel) (defvar *dont-use-these-synonyms* nil) (pushnew '(("FAS" "2194")) *dont-use-these-synonyms* :test 'equal) ) (eval-when (:compile-toplevel :load-toplevel :execute) (defstruct gene-name-rewrite-rule name options rewrites)) (defvar *rewrite-rule-sets-by-name* (make-hash-table)) (defclass rewrite-rule-set () ((name :initarg :name :initform nil :accessor name) (rules :initarg :rules :initform nil :accessor rules))) (defmethod initialize-instance ((rs rewrite-rule-set) &key) (call-next-method) (assert (name rs) () "No name!") (setf (gethash (name rs) *rewrite-rule-sets-by-name*) rs)) (defmethod add-rule ((rs rewrite-rule-set) rule) (setf (getf (rules rs) (gene-name-rewrite-rule-name rule)) rule)) ;; (defclass gene-name-matcher () ((unprocessed-name-set :initarg :unprocessed-name-set :initform nil :accessor unprocessed-name-set) (synonym-table :initarg :synonym-table :initform nil :accessor synonym-table))) (eval-when (:compile-toplevel :load-toplevel :execute) (defstruct gene-name-rewrite-rule name options rewrites set)) (defvar *debug-rule-rewrites* nil) (defmacro define-gene-name-rewrite-rule (name set options &rest rewrites) `(progn (unless (gethash ',set *rewrite-rule-sets-by-name*) (format t "Adding rule set ~a~%" ',set) (make-instance 'rewrite-rule-set :name ',set)) #+abcl (cond (*load-truename* (setf (get ',name 'system::%source) (list *load-truename*)))) (add-rule (gethash ',set *rewrite-rule-sets-by-name*) (make-gene-name-rewrite-rule :name ',name :options ',options :rewrites ',rewrites)) )) (defun apply-gene-name-rewrite-rule (rule input how-so-far test) (let ((rule-name (gene-name-rewrite-rule-name rule))) (unless (find-if (lambda(how) (and (eq (car how) rule-name) (equal input (second how)))) how-so-far) (loop for rewrite in (gene-name-rewrite-rule-rewrites rule) for rewritten = (if (functionp rewrite) (or (funcall rewrite input) input) (if (and (consp rewrite) (eq (car rewrite) 'lambda)) (let ((compiled (compile nil rewrite))) (setf (gene-name-rewrite-rule-rewrites rule) (substitute compiled rewrite (gene-name-rewrite-rule-rewrites rule))) (or (funcall compiled input) input)) (progn ;(print-db input (first rewrite) (second rewrite)) (#"replaceFirst" input (first rewrite) (second rewrite))))) for result = (and (not (equal rewritten input)) (progn (and *debug-rule-rewrites* (format t "~a:~a->~a~%" rule-name input rewritten)) t) (funcall test rewritten (cons (list rule-name input rewritten) how-so-far))) when result do (return result))))) (defun apply-gene-name-rewrite-rules (rules input how-so-far test) (loop for rule in rules for result = (apply-gene-name-rewrite-rule rule input how-so-far test) when result do (return result))) (defvar *last-gene-name-rewrite-rules* nil) (defun all-gene-name-rewrite-rules (&rest names) (if (and *last-gene-name-rewrite-rules* (equal (car *last-gene-name-rewrite-rules*) names)) (cdr *last-gene-name-rewrite-rules*) (progn (unless names (setq names (loop for k being the hash-keys of *rewrite-rule-sets-by-name* collect k))) (setf *last-gene-name-rewrite-rules* (cons names (loop for name in names for set = (gethash name *rewrite-rule-sets-by-name*) append (loop for (rule nil) on (reverse (rules set)) by #'cddr collect rule)))) (cdr *last-gene-name-rewrite-rules*)))) (eval-when (:load-toplevel :execute) (define-gene-name-rewrite-rule remove-phosphorylation reagent-context () ("(?i)diphos\\S*" "") ("(?i)dual phos\\S*" "") ("(?i)\\S+.*" "") ; ("(?i)\\s*p{0,1}(ser|tyr|thr|lys).*" "") ; lysine acetylation? ("(?i)\\s*p{0,1}\\({0,1}(ser|tyr|thr|lys)\\d+\\){0,1}" "") ; replace above with more conservative. Might want to move above to alzforum specific ("pY\\d+" "") ("(?i)^p-" "") ("(?i)\\s+(H|N)-{0,1}\\d+" "") ("(?i)^\\s*phospho-" "") ; ("(?i)^\\s*phospho.*" "") ("(?i)^\\s*phospho" "") ; ("(?i)\\s*phos.*" "") ("(?i)\\s*p(T|S){0,1}\\d*\\s*$" "") ("(?i)\\s*p\\d*\\s*$" "")) (define-gene-name-rewrite-rule remove-cleaved reagent-context() ("(?i)^\\s*cleaved\\s*" "") ("(?i)\\s*cleaved.*$" "") ("(?i)\\s*mature.*$" "")) (define-gene-name-rewrite-rule greek-full-to-letter tweaks () ("(?i)beta" "b") ("(?i)alpha" "a") ("(?i)gamma" "g") ("(?i)delta" "d") ("(?i)lambda" "l") ("(?i)epsilon" "e") ("(?i)theta" "t") ("(?i)iota" "i") ("(?i)zeta" "z")) (define-gene-name-rewrite-rule dash-greek-no-dash tweaks () ("(?i)-(alpha|beta|gamma|delta|lambda|epsilon|theta|iota|zeta)" "$1") ) (define-gene-name-rewrite-rule kinase-kinase tweaks() ("(?i)kkk" "3k") ("(?i)kk" "2k")) (define-gene-name-rewrite-rule remove-some-attributes reagent-context () ("(?i)total" "") ("(?i)concentrated" "") ("(?i)activ(ated|e)" "")) (define-gene-name-rewrite-rule amyloid-beta tweaks () ("(?i)amyloid[- ]*beta" "APP") ("(?i)beta[- ]*amyloid" "APP") ("(?i)app\\s*\\d+" "APP")) ; http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=PubMed&list_uids=2116540&dopt=Abstract (define-gene-name-rewrite-rule catenin-flip tweaks () ("(?i)catenin([- ])(\\S+)" "$2$1catenin")) (define-gene-name-rewrite-rule beta-catenin tweaks () ("(?i)beta[- ]catenin" "CTNNB1")) (define-gene-name-rewrite-rule tgf-beta tweaks () ("(?i)tgf-{0,1}\\s*beta" "TGFB")) (define-gene-name-rewrite-rule subunit tweaks () ("(?i)subunit\\s*$" "")) (define-gene-name-rewrite-rule remove-kit reagent-context () ("(?i)(ifa|staining|signal|peroxidase|elisa){0,1}\\s*kit\\s*" "")) (define-gene-name-rewrite-rule roman-to-arabic-numbers tweaks () ("(?i)iv" "4") ("(?i)iii" "3") ("(?i)ii" "2") ("(?i)\\bi\\b" "1")) (define-gene-name-rewrite-rule strip-viral tweaks () ("^v-" "") ) (define-gene-name-rewrite-rule remove-pro tweaks () ("(?i)^pro(?!tein)" "") ("(?i)pro$" "")) (define-gene-name-rewrite-rule remove-variations reagent-context () ("(?i)(variant|basic|fragment|flg|fluorescein|block|all isoforms|Frameshift mutant).*" "") ("(?i)(light|heavy){0,1}\\s*chain.*" "")) (define-gene-name-rewrite-rule remove-trailing-range reagent-context () ("\\d+-\\d+\\s*$" "")) (define-gene-name-rewrite-rule abbreviate-kinase reagent-context() ("(?i)kinase" "k") ("(?i)-(poly|mono)" "")) (define-gene-name-rewrite-rule remove-braced reagent-context () ("(?i)\\[.*?\\]" "") ("(?i)\\(.*?\\)" "")) (define-gene-name-rewrite-rule maybe-abbreviation-braced reagent-context () ("(?i)\\[.*?\\]" "") (".*\\(([A-Z]{3,})\\)" "$1")) (define-gene-name-rewrite-rule receptor-abbreviation reagent-context () ("(?i)rece$" "receptor")) (define-gene-name-rewrite-rule remove-pan reagent-context () ("(?i)pan\\s*" "")) (define-gene-name-rewrite-rule acetylcholine-abbreviation reagent-context () ("(?i)ach" " acetylcholine ") ("(?i)mach" " muscarinic acetylcholine ")) (define-gene-name-rewrite-rule notch reagent-context () (lambda(k) (and (#"matches" k "(?i)^notch.*") (caar (all-matches k "(?i)notch\\S*" 0))))) (define-gene-name-rewrite-rule nuclear-receptor-is-receptor tweaks () ("(?i)\\sNR" " R")) (define-gene-name-rewrite-rule nmdar-redundant-receptor reagent-context () ("(?i)NMDAR\\s*R" "NMDA R")) (define-gene-name-rewrite-rule remove-immunoglobulin reagent-context () ("(?i)\\s+ig.*" "")) (define-gene-name-rewrite-rule fas reagent-context () ("(?i)^sfas" "fas") ("(?i)ligasnd" "ligand") ("(?i)soluble" "") ("(?i)intercel.*" "") ("(?i)intracel.*" "")) (define-gene-name-rewrite-rule remove-leading-species tweaks() ("(?i)human\\s*" "") ("(?i)mouse\\s*" "") ("(?i)rat\\s*" "") ) (define-gene-name-rewrite-rule remove-splice-variant reagent-context () ("(?i)\\S+\\s*splice variant" "")) (define-gene-name-rewrite-rule remove-trailing-protein reagent-context () ("protein(\\s*\\d*\\s*)" "")) (define-gene-name-rewrite-rule nfkb reagent-context() (lambda(k) (when (#"matches" k "(?i).*nf-{0,1}k(appa){0,1}\s*b.*") (if (#"matches" k "(?i).*p65.*") "RELA" (if (#"matches" k "(?i).*p50.*") "NFKB1" (if (#"matches" k "(?i).*(p52|p100).*") "NFKB2")))))) (defvar *trying-receptor-expansion* nil) (define-gene-name-rewrite-rule receptor-expansion tweaks () (lambda(k) (unless *trying-receptor-expansion* (let ((*trying-receptor-expansion* t)) (unless (#"matches" k ".*receptor$") (#"replaceFirst" k "(?i)r$" " receptor" )))))) (define-gene-name-rewrite-rule slash-synonyms reagent-context () (lambda(k) (when (= (count #\/ k) 1) (let ((parts (split-at-regex k "\\s*/\\s*"))) (and (unique-human-gene-id? (first parts)) (equal (unique-human-gene-id? (first parts)) (unique-human-gene-id? (second parts))) (first parts)))))) (define-gene-name-rewrite-rule ampersand-synonyms reagent-context () (lambda(k) (when (= (count #\& k) 1) (let ((parts (split-at-regex k "\\s*&\\s*"))) (and (unique-human-gene-id? (first parts)) (equal (unique-human-gene-id? (first parts)) (unique-human-gene-id? (second parts))) (first parts))))))) (defun maybe-its-a-protein-family (name) (flet ((more-than-one (list) (and (> (length list) 1) list))) (unless (#"matches" name "CD\\d+") (or (more-than-one (loop for letter in '("alpha" "beta" "gamma" "delta" "sigma" "lambda" "zeta" "epsilion" "tau" "theta" "eta" "psi" "omega" "phi" "chi" "psi" "rho" "nu" "mu" "kappa" "iota") for gene = (unique-human-gene-id? (format nil "~a ~a" name letter)) when gene collect (list (format nil "~a~a" name letter) gene))) (more-than-one (loop for numeral in '("i" "ii" "iii" "iv" "v" "vi" "vii" "vii" "viii" "ix" "x") for gene = (unique-human-gene-id? (format nil "~a ~a" name numeral)) when gene collect (list (format nil "~a~a" name numeral) gene))) (more-than-one (loop for digit from 1 to 25 for gene = (unique-human-gene-id? (format nil "~a~a" name digit)) when gene collect (list (format nil "~a~a" name digit) gene))) (more-than-one (loop for letter in '("A" "B" "C" "D" "E" "F" "G" "H" "I" "J" "K" "L" "M" "N" "O" "P") for gene = (unique-human-gene-id? (format nil "~a~a" name letter)) when gene collect (list (format nil "~a~a" name letter) gene))) )))) (defun resolve-name (name &key (tried (make-hash-table :test 'equal)) (rules (all-gene-name-rewrite-rules)) (test 'unique-human-gene-id?) (trace nil)) (let ((*debug-rule-rewrites* (or trace *debug-rule-rewrites*))) (when (every 'symbolp rules) (setq rules (apply 'all-gene-name-rewrite-rules rules))) (let ((ambiguous-hits nil)) (labels ((resolve-internal (name &optional how-so-far) (unless (> (length name) 125) (if (eq (gethash name tried :not-tried) :not-tried) (setf (gethash name tried) (multiple-value-bind (unique multiple entries) (funcall test name) (if unique (progn (setf (gethash name tried) (list unique how-so-far))) (progn (pushnew multiple ambiguous-hits :test 'equalp) (or (apply-gene-name-rewrite-rules rules name how-so-far #'resolve-internal) (setf (gethash name tried) nil)))))) (gethash name tried))))) (let* ((resolved (resolve-internal name)) (unambiguous (caar resolved)) (used-methods nil)) (when unambiguous (maphash (lambda(k v) (when (equal (caar v) unambiguous) (loop for (meth) in (cadr v) do (pushnew meth used-methods)))) tried)) (values (car resolved) (car (last ambiguous-hits)) used-methods (third (first (second resolved)))) ;; first ambiguous hit is most specific ))))) (defun resolve-name-multiple-method (name tests &rest args) (loop for test in tests for results = (apply 'resolve-name name :test test args) when results do (return results))) ;; really want to protect slashes in paretheses (defun extract-slash-names (name) (let ((no-parenthesis (regex-replace-all "\\s*\\(.*?\\)" name ""))) (caar (all-matches no-parenthesis "(?i)[A-Za-z0-9&;-]+( [A-Za-z0-9-&;]+){0,1}(\\s*/\\s*[A-Za-z0-9-&;]+)+( [A-Za-z0-9-&;]+){0,1}" 0)))) (defun slashed-names-resolved (name) (let* ((names (split-at-regex name "\\s*/\\s*" )) (results (loop for namepart in names collect (unless (#"matches" namepart "^\\s*\\d+\\s*$") (car (unique-human-gene-id? namepart)))))) (and (every 'stringp results) (remove-duplicates results :test 'equal)))) (defun induction-names-resolved (name) (let* ((names (split-at-regex (regex-replace-all "^\\s*(.*?)\\s*$" name "$1") "\\s*/\\s*" )) (greek "(?i)(alpha|beta|gamma|delta|lambda|eta|tau|zeta)") (something-greek "(?i)(.*?)(alpha|beta|gamma|delta|lambda|eta|tau|zeta)") (greek-something "(?i)(alpha|beta|gamma|delta|lambda|eta|tau|zeta)(.*)")) (when (plusp (length names)) (let ((names (cond ((and (every (lambda(name) (#"matches" name "\\d+")) (cdr names)) (#"matches" (car names) ".*?\\d+")) (destructuring-bind (root number) (car (all-matches (car names) "(.*?)(\\d+)" 1 2)) (mapcar (lambda(number) (format nil "~a~a" root number)) (cons number (rest names))))) ((and (every (lambda(name) (#"matches" name greek)) (cdr names)) (#"matches" (car names) something-greek)) (destructuring-bind (root number) (car (all-matches (car names) something-greek 1 2)) (mapcar (lambda(number) (format nil "~a~a" root number)) (cons number (rest names))))) ((and (every (lambda(name) (#"matches" name greek)) (butlast names)) (#"matches" (car (last names)) greek-something)) (destructuring-bind (root number) (car (all-matches (car (last names)) greek-something 2 1)) (mapcar (lambda(number) (format nil "~a~a" number root)) (cons number (butlast names))))) ))) (let ((resolved (mapcar 'resolve-name names))) (or (and (not (position nil resolved)) (apply 'append resolved)) (ambiguous-alone-intersection-resolved names))))))) (defun ambiguous-alone-intersection-resolved (name-or-names) (let* ((names (if (stringp name-or-names) (split-at-regex (regex-replace-all "^\\s*(.*?)\\s*$" name-or-names "$1") "\\s*/\\s*" ) name-or-names))) (when (= (length names) 2) (multiple-value-bind (nil multihits1) (unique-human-gene-id? (first names)) (multiple-value-bind (nil multihits2) (unique-human-gene-id? (second names)) (intersection multihits2 multihits1)))))) #| cases: a/b each ambiguous, but there is a single intersection e.g. p35/25 xxx1/2/3 = xxx1/xxx2/xxx3 e.g. smad2/3 xxxalpha/beta yyy = xxxalpha xxxbeta yyy Phospho-PKCalpha/beta II , Phospho-PKCdelta/theta Antibody xxx/yyy = two different proteins Ezrin/Radixin/Moesin xxxI/II = xxxI xxxII Dynamin I/II Annoying stuff: cea/cd66 = cd66 (cea is part of name of protein) but there is cea ambiguous protein Mst1 (Thr183)/Mst2 (Thr180) |# (defun resolve-to-gene (&key test-name usewild (show :matches) list-of-names (:include-ambiguous nil)) (register-synonyms) (let ((gene-names (unless test-name (or list-of-names (alz-gene-names)))) (count 0) (missed 0) (numbers nil) (tried (make-hash-table :test 'equal)) (all-rules (all-gene-name-rewrite-rules)) (which-won? (make-hash-table :test 'equalp))) (labels ((resolve-one (k &optional how-so-far) (if (eq (gethash k tried :not-tried) :not-tried) (prog1 (setf (gethash k tried) (multiple-value-bind (unique multiple entries) (unique-human-gene-id? k) (if unique (progn (incf (gethash (mapcar 'car how-so-far) which-won? 0)) (setf (gethash k tried) (list (unique-human-gene-id? k) how-so-far))) (progn (pushnew multiple ambiguous-hits :test 'equalp) (or (apply-gene-name-rewrite-rules all-rules k how-so-far #'resolve-one) (setf (gethash k tried) nil)))))) (when (gethash k tried) (incf (gethash (mapcar 'car (second (gethash k tried))) which-won? 0)))) (gethash k tried)))) (if test-name (let ((*debug-rule-rewrites* t)) (let ((ambiguous-hits nil)) (declare (special ambiguous-hits)) (resolve-one test-name nil))) (progn (maphash (lambda(k v) (let ((ambiguous-hits nil)) (declare (special ambiguous-hits)) (let ((found (resolve-one k nil))) (if found (progn (when (eq show :matches) (format t "~a ~a~%" k (caar found))) (incf count)) (progn (incf missed) (when (gethash (string-downcase k) (infobiogen-synonyms)) (format t "Found ~a in infobiogen ~a" k (gethash (string-downcase k) (infobiogen-synonyms)))) (when (gethash (string-downcase k) *response-elements*) (format t "Found ~a in response-elements ~a" k (gethash (string-downcase k) *response-elements*))) (let ((family? (maybe-its-a-protein-family k))) (when family? (format t "Maybe it's a protein family :~a~%" family?))) (if (#"matches" k "(?i).*neurofilament.*") ;; about 90 hits '(progn (print k) (setq numbers (union numbers (all-matches k "\\d+" 0) :test 'equal ))) (when (eq show :misses) (format t "~a ~a~%" k ambiguous-hits)))))) )) gene-names) (values count missed numbers (setq *last-which-won* which-won?) (setq *last-tried* tried))))))) (defun show-rule-successes (rule-name) (loop for v being the hash-values of *last-tried* when (member rule-name (second v) :key 'car) do (print v))) (defun print-antibody-names-matching (regex) (loop for name being the hash-keys of (alz-gene-names) when (#"matches" name "(?i).*(&).*") do (format t "~a~%" name))) (defun print-antibodies-matching (searchfield regex &optional (printfields '(:datasheetlinktext)) (separator (string #\tab))) (let ((formatstr (format nil "~~{~~a~~^~a~~}~~%" separator) )) (foreach-antibody-entry (lambda(e) (let ((tosearch (getf searchfield))) (when (#"matches" name regex) (format t formatstr (mapcar (lambda(f) (getf e f)) printfields))))) 1 t))) (defun row-to-resolved () (let ((resolved2id (make-hash-table :test 'equal))) (maphash (lambda(k v)a (when v (destructuring-bind ((id) (&rest transforms)) v (setf (gethash k resolved2id) id) (loop for (nil from to) in transforms do (setf (gethash from resolved2id) id) (setf (gethash to resolved2id) id))))) *last-tried*) (foreach-antibody-entry (lambda(e) (let ((resolved (gethash (cdr (assoc :datasheetlinktext e)) resolved2id))) (when resolved (format t "~a ~a~%" (cdr (assoc :antibodyid e)) resolved))))))) (defparameter *xml-character-replacements* (loop with table = (make-hash-table :test 'equalp) for (ent replacement) in '(("Δ" " delta") ("ζ" " zeta") ("®" "(R)") ("⓰" " alpha") ("≤" "<=") ("™" "(TM)") ("–" "-") ("ʇ" " beta") ("’" "'") ("’" "'") ("λ" " lambda") ("Λ" " lambda") ("δ" " delta") ("ε" " epsilon") ("γ" " gamma") ("α" " alpha") ("κ" " kappa") ("β" " beta") ("γ" " gamma") ("α" " alpha")) do (setf (gethash ent table) replacement) finally (return table))) (defun replace-xml-characers (string) (replace-all string "&#\\S+?;" (lambda (s) (gethash s *xml-character-replacements* s)) 0)) ; (gethash (cdr (assoc :datasheetlinktext e)) *last-tried*)))) 100)