(in-package :cl-user) ;http://www.cellsignal.com/ddt/elisa.asp ;http://www.cellsignal.com/product.asp?product%5fid=2395 (defclass cell-signal-technology-products () ((cookie :initarg :cookie :initform nil :accessor cookie) (entries :initarg :entries :initform (make-hash-table :test 'equal) :accessor entries) (no-such-page :initarg :no-such-page :initform (make-array 10000 :element-type 'bit :initial-element 0) :accessor no-such-page) (indices :initarg :indices :initform nil :accessor indices))) (defmethod dump-entries ((c cell-signal-technology-products) path) (with-open-file (f path :if-does-not-exist :create :if-exists :supersede :direction :output) (maphash (lambda(k v) (pprint v f)) (entries c)))) (defmethod retrieve-entries ((c cell-signal-technology-products) path) (with-open-file (f path :direction :input) (loop for entry = (read f nil :eof) until (eq entry :eof) do (setf (gethash (second (assoc :id entry)) (entries c)) entry)))) ;(dump-entries *cst* "~/lsw/hcls/biordf/reagents/cell-signal-technologies-products2.txt") ;(retrieve-entries *cst* "~/lsw/hcls/biordf/reagents/cell-signal-technologies-products2.txt") (defmethod common-title-words ((c cell-signal-technology-products)) (let ((wordcounts (make-hash-table :test 'equalp))) (maphash (lambda(k v) (declare (ignore k)) (let ((title (second (assoc :title v)))) (loop for word in (split-at-regex title "(\\s+|-|\\(|\\)|( ))") do (incf (gethash word wordcounts 0))))) (entries c)) (let ((counts (make-hash-table :test 'eql))) (maphash (lambda(k v) (declare (ignore k)) (incf (gethash v counts 0))) wordcounts) (values wordcounts (sort (loop for k being the hash-keys of counts using (hash-value v) collect (cons k v)) '< :key 'car) (maphash (lambda(word count) (when (> count 15) (print (cons word count)))) wordcounts ))))) (defmethod classify-product-type ((c cell-signal-technology-products) entry) (let ((title (second (assoc :title entry)))) (cond ((#"matches" title "(?i).*substrate antibody.*") :kinase-target-antibody) ((#"matches" title "(?i).*antibody.*") :antibody) ((#"matches" title "(?i)^htscan.*") :htscan) ((#"matches" title "(?i)^pathscan.*elisa.*") :elisa) ((#"matches" title ".*\\b.mab\\b.*") :antibody) ((#"matches" title "(?i)^signalsilence.*") :signalsilence) ((#"matches" title "(?i).*kinase.*") :kinase) ((#"matches" title "(?i).*peptide.*") :kinase) ((#"matches" title "(?i).*\\bkit\\b.*") :kit) ((#"matches" title "(?i).inhibitor.*") :inhibitor) ((#"matches" title "(?i).*cell extracts.*") :cell-extracts) ((#"matches" title "(?i).*pack.*") :pack) ))) (define-gene-name-rewrite-rule prefixes cst-context() ("(?i)^\\s*(signalsilence|htscan|phosphoplus|immobilized)\\s*\\s*" "") ("(?i)^\\s*(non-){0,1}phospho-" "") ("(?i)^\\s*acetyl-" "") ("(?i)^\\s*immobilized\\s*" "") ("(?i)^\\s*apoptosis marker:" "")) (define-gene-name-rewrite-rule abbreviate-receptor cst-context () ("(?i)\\s*receptor\\s*(\\s*)" "r$1")) (define-gene-name-rewrite-rule abbreviate-kinase cst-context () ("(?i) kinase" "k")) (define-gene-name-rewrite-rule pi3k cst-context () ("(?i)pi3k" "pik3")) (define-gene-name-rewrite-rule greek-entities cst-context() ("(?i)&(alpha|beta|gamma|delta|tau|zeta|lambda);" "$1")) (defvar *cst-synonym-registry* (make-instance 'priority-synonym-registry :next-synonym-registry *synonym-registry* :entries 100)) (loop for (cst-name name) in '(("cbp" "creb-binding protein") ("pi3 kinase p85" "pik3r1") ("egf receptor" "egfr") ("pdk1" "pdpk1") ("myosin" "mylpf") ("yap" "yap1") ("aim-1" "aurora b") ("mad-1" "mxd1") ("eif2alpha" "eif2s1") ("a1/bfl-1" "bfl-1") ("myt1" "pkmyt1") ("pak5" "pak7") ("p21 waf1" "cdkn1a") ("cip1" "cdkn1a") ("dff35" "dff45") ;; synonym on mouse homolog - fix later ("p38 mapk" "mapk14") ("smad8" "smad9") ("p95/nbs1" "nbs1") ("hr6a" "hhr6a") ("pp2a a" "pp2a alpha") ("pp2a b" "pp2a beta") ("troponin i (cardiac)" "tnni3") ("p73" "tp73") ; ("c-jun ii and c-jun" "c-jun") ("tak1" "map3k7") ("tpl2" "map3k8") ("p63" "tp73l") ("s6 ribosomal protein" "ribosomal protein s6") ("mkk3b" "mkk3") ; 3 isoforms. we aren't that granular yet. ("nik" "map3k14") ; diambiguated from extra info page ("braf" "c-raf") ; diambiguated from phosphosite ("rbp1 ctd" "rbp1") ("trf1" "terf1") ("trf2" "terf2") ("neurofilament-h" "nefh") ("neurofilament-l" "nefh") ("neurofilament-m" "nef3") ("14-3-3 tau" "14-3-3 protein tau") ("basic fgf" "fgf2") ; extra info disamb. ("acidic fgf" "fgf1") ("beta gal" "glb1") ("akt" "akt1") ; looked at the s308 phosphorylation site and saw that it wasn't present in akt2, akt3. assume consistent use. ("ilk1" "ilk") ("delta-opioid receptor" "opioid receptor delta 1") ("igf-i receptor beta" "igf1r") ("mst1" "stk4") ("gck" "map4k2") ("myosin" "mylpf") ("hnrnp a2/b1" "hnrpa1") ; splice variants ("pkm1" "pkm2") ; former is apparently isoform of latter ) do (add-synonym *cst-synonym-registry* cst-name name)) (define-gene-name-rewrite-rule specific-names cst-context () ("(?i)cbp" "creb-binding protein") ("(?i)topoisomerase ii" "top2") ; prefix of other names ("(?i)pi3 kinase p85" "pik3r1") ; this is here because it runs too late for synonym - remove-phosphorylation should run later... ("(?i)egf receptor" "egfr") ("(?i)pdk1" "pdpk1") ; needs to run early ("(?i)myosin" "mylpf") ; needs to run early ("(?i)aim-1" "aurora b") ; needs to run early ("(?i)mad-1" "mxd1") ("(?i)eif2alpha" "eif2s1") ; not eif2a! ("(?i)p44/42\\s*map(k| kinase)\\s*" "erk1/erk2") ("(?i)a1/bfl-1" "bfl-1") ("(?i)prk(\\d)" "pkn$1") ("(?i)myt1" "pkmyt1") ("(?i)pak5" "pak7") ("(?i)p21 waf1" "cdkn1a") ("(?i)cip1" "cdkn1a") ("(?i)dff35" "dff45") ;; synonym on mouse homolog - fix later ("(?i)p38 mapk" "mapk14") ; ("(?i)pkc" "protein kinase c") ("(?i)pkc" "prkc") ; ("(?i)smad8" "smad9") ; ("(?i)p95/nbs1" "nbs1") ; ("(?i)\\bhr6a" "hhr6a") ; ("(?i)pp2a a\\b" "pp2a alpha") ; ("(?i)pp2a b\\b" "pp2a beta") ("(?i)sapk/jnk" "jnk1/jnk2/jnk3") ; ("(?i)troponin i (cardiac)" "tnni3") ("(?i)(?>t)p73" "tp73") ; ("(?i)c-jun ii and c-jun" "c-jun") ; ("(?i)tak1" "map3k7") ; ("(?i)tpl2" "map3k8") ; ("(?i)p63" "tp73l") ("(?i)s6 ribosomal protein" "ribosomal protein s6") ("(?i)mkk3b" "mkk3") ; 3 isoforms. we aren't that granular yet. ("(?i)nik" "map3k14") ; diambiguated from extra info page ("(?i)(^|[^-])\\braf" "c-raf") ; diambiguated from phosphosite ("(?i)rbp1 ctd" "rbp1") ("(?i)pkc" "protein kinase c ") ; needed because of pkcdelta (no space). ("(?i)trf(1|2)" "terf$1") ("(?i)thioredoxin 1.*human" "txn") ; only one of them in human it seems. went to cited paper. in mouse rat, it appears to be called trx1 trx2 ("(?i)neurofilament-(h|l)" "nef$1") ("(?i)neurofilament-m" "nef3") ("(?i)m-csf" "csf1") ;no longer needed ("(?i)14-3-3 tau" "14-3-3 protein tau") ("(?i)(alpha|beta).*?fodrin" "fodrin $1") ("(?i)basic fgf" "fgf2"); extra info disamb. ("(?i)acidic fgf" "fgf1") ("(?i)beta gal" "glb1") ; ("(?i)c-kit" "kit") no longer needed ("(?i)akt\\b" "akt1") ; looked at the s308 phosphorylation site and saw that it wasn't present in akt2, akt3. assume consistent use. ("(?i)ilk1" "ilk") ("(?i)delta-opioid receptor" "opioid receptor delta 1") ("(?i)igf-i receptor beta" "igf1r") ; ("(?i)thioredoxin (\\d)\\s*(?!.*human).*" "trx$1") nope. there is an unrelated trx1 human ; ("p38 map kinase delta" "mapk13") ) #| ampkalpha -> prkaa1,prkaa2 ampkbeta -> prkab1,prkab2 ephrin b is efbn1,2,3 pp2a to sort out: gene search pp2a and "homo sapiens"[orgn] and ppp*[gene/protein name] p44/42 mapk -> erk1, erk2 p90rsk -> rsk1, rsk2, rsk3 = rps6ka1 rps6ka2 rps6ka3 rip -> rip1,2,3 rpb1 ctd (ser2/5) 2,5 relative to c terminal domain repeat! ug. deal with histones. p46 p54 sapk p54 might be mapk9,10 maybe mapk8 is p46. yes. pretty sure this resolves to jnk1,jnk2,jnk3 (("sapk1" ("5599")) ("sapk2" ("5600")) ("sapk3" ("6300")) ("sapk4" ("5603"))) cl-user> (maybe-its-a-protein-family "jnk") (("jnk1" ("5599")) ("jnk2" ("5601")) ("jnk3" ("5602"))) pkr is eif2ak1-4 don't know which they mean... |# (define-gene-name-rewrite-rule antibody-suffix cst-context () ("(?i)\\s*(antibody).*|(rat|mouse|rabbit) mab.*" "") ("(?i)monoclonal|polyclonal" "")) (defmethod resolve-antibodies ((c cell-signal-technology-products) &key show) (let ((*synonym-registry* *cst-synonym-registry*)) (loop for v being the hash-values of (entries *cst*) for title = (second (assoc :title v)) for type = (classify-product-type *cst* v) for name = title for resolved = (and (eq type :antibody) (resolve-name-multiple-method name '(unique-human-gene-id? slashed-names-resolved induction-names-resolved ambiguous-alone-intersection-resolved))) ; (multiple-value-list (resolve-name name))) ; append how into hows when (and (not resolved) (eq type :antibody)) sum 1 into sum when (or (and (not resolved) (eq type :antibody) (eq show :misses)) (and resolved (eq type :antibody) (eq show :hits))) do (format t "~a | ~a ~%" name resolved) finally (return (values #|(remove-duplicates hows)|# sum))))) (defmethod test-slash ((c cell-signal-technology-products) &key show) (let ((*synonym-registry* *cst-synonym-registry*)) (loop for v being the hash-values of (entries *cst*) for title = (second (assoc :title v)) for type = (classify-product-type *cst* v) for name = title for can-resolved-by-name = nil for resolved = (and (eq type :antibody) (prog1 (extract-slash-names name) (setq can-resolved-by-name (resolve-name name)))) when (and (not resolved) (eq type :antibody)) sum 1 ; do (print-db name can-resolved-by-name resolved) do (sleep .001) (when (and resolved (eq type :antibody) (not can-resolved-by-name)) (let ((slash-resolved (or (resolve-name (extract-slash-names name) :test 'slashed-names-resolved) (resolve-name name :test 'slashed-names-resolved)))) (if slash-resolved (when (eq show :hits) (format t "~a | ~a | ~a~%" name resolved (setq slash-resolved (or (resolve-name (extract-slash-names name) :test 'slashed-names-resolved) (resolve-name name :test 'slashed-names-resolved))))) (if (eq show :missed) (format t "missed ~a~%" name)))))))) (defmethod attach-entrez-ids ((c cell-signal-technology-products) &key just) (let ((*synonym-registry* *cst-synonym-registry*) (rewrite-rules ;; boost our specific rules to the front (cons (find 'specific-names (all-gene-name-rewrite-rules) :key 'gene-name-rewrite-rule-name) (remove 'specific-names (all-gene-name-rewrite-rules) :key 'gene-name-rewrite-rule-name)))) (loop for v being the hash-values of (entries *cst*) for name = (second (assoc :title v)) for type = (classify-product-type *cst* v) if (and (eq type :antibody) (not (assoc :entrez-id v)) (or (not just) (some (lambda(s) (search s name)) just))) do (sleep .001) (let ((resolved (resolve-name-multiple-method name '(unique-human-gene-id? slashed-names-resolved induction-names-resolved ambiguous-alone-intersection-resolved) :rules rewrite-rules))) (if resolved (progn (if (assoc :entrez-id v) (setf (cdr (assoc :entrez-id v)) resolved) (nconc v (list (cons :entrez-id resolved) ))) (princ "+")) (princ "-") )) else do (princ "0")))) (defmethod attach-modification ((c cell-signal-technology-products) &key) (maphash (lambda(k v) (let ((matches (all-matches (second (assoc :title v)) "(?s)\\((ser|thr|tyr)([0-9/]+)\\)" 1 2))) (when matches (let ((residue (caar matches)) (sites (loop for (nil site) in matches if (search "/" site) append (split-at-char site #\/) collect site))) (if (assoc :ptm v) (setf (cdr (assoc :ptm v)) (list (list* residue sites))) (setf (cdr (last v)) (list (list :ptm (list* residue sites))))) )))) (entries c))) ;; need to get a cookie manually for now. (defparameter *cst* (make-instance 'cell-signal-technology-products :cookie "mscs2000testcookie=1; aspsessionidasrctcsr=mnkjcepbcaanhedloeobadgg; mscsprofile=95385a1f52dea1a229d5b375420544642f66bf74b44b9be0370205f6ef544d2727287e56ede3023eb9a76583825b050390fb8151e112f128c3770e578d79d6356e78e2da59846b1f39f3bbe7eb0f5b760d7bfc6b9a2044905fb3aee74d684dab6c67c1027f37c216ca7005a0663371d9690540e32f86a1b13eaa2eb8d0cccd3db8aebf3c3715bfc2")) (defmethod get-url-cst ((c cell-signal-technology-products) url &key (canfetch nil) (persist t)) (let ((value (get-url url :cookiestring (cookie c) :persist persist :nofetch (not canfetch)))) value)) (defmethod product-page-products ((c cell-signal-technology-products) url) (let ((index (get-url-cst c url))) (let ((results (union (mapcar 'car (all-matches index "(?s)(?i)(\\d+).*?" 1)) (mapcar 'car (all-matches index "(?s)(?i)http://www.cellsignal\\s+?product%5fid=(\\d+)" 1)) ))) (unless results (format t "no products found on ~a~%" url)) results))) (defmethod retrieve-product-page ((c cell-signal-technology-products) id) (if (probe-file (url-cached-file-name (format nil "http://www.cellsignal.com/product.asp?productid=~a" id))) (get-url-from-cache (format nil "http://www.cellsignal.com/product.asp?productid=~a" id)) (if (probe-file (url-cached-file-name (format nil "http://www.cellsignal.com/iproduct.asp?productid=~a" id))) (get-url-from-cache (format nil "http://www.cellsignal.com/iproduct.asp?productid=~a" id)) (error "oops ~a" id)))) (defmethod pull-product-page ((c cell-signal-technology-products) id) (let ((page (if (probe-file (url-cached-file-name (format nil "http://www.cellsignal.com/product.asp?productid=~a" id))) (get-url-from-cache (format nil "http://www.cellsignal.com/product.asp?productid=~a" id)) (if (probe-file (url-cached-file-name (format nil "http://www.cellsignal.com/iproduct.asp?productid=~a" id))) (get-url-from-cache (format nil "http://www.cellsignal.com/iproduct.asp?productid=~a" id)) (error "oops ~a" id) ;(get-url-cst c (format nil "http://www.cellsignal.com/product.asp?productid=~a" id)) )))) (let ((sections (all-matches page "(?i)(?s)(){0,1}\\s*([^>]+?)\\s*(){0,1}.*?(" 2 4)) (citations (pull-citations c id))) (list* `(:id ,id) `(:title ,(caar (all-matches page "(?i)(.*?)" 1))) `(:page ,(format nil "http://www.cellsignal.com/product.asp?productid=~a" id)) `(:specification ,(format nil "http://www.cellsignal.com/pdf/~a.pdf" id)) `(:citations ,@citations) (process-product-page c sections))))) (defmethod process-product-page ((c cell-signal-technology-products) sections) (loop for (head body) in sections append (cond ((#"matches" head "(?i).*species.*") (let ((parsed-species (parse-cst-species c body)) (parsed-methods (parse-cst-methods c body))) (append (and parsed-species `((:species ,@parsed-species))) (and parsed-methods `((:methods ,@parsed-methods)))))) ((#"matches" head "(?i)(?s).*introduction.*") (let ((parsed (parse-introduction c body))) (and parsed `((:description ,parsed) (:references ,(mapcar 'car (all-matches parsed "(?s)[^&](#\\d+)" 1))))))) ((#"matches" head "(?i)(?s).*specificity.*") (let ((parsed (parse-introduction c body))) (and parsed `((:specificity ,@parsed))))) ((#"matches" head "(?i)(?s).*purification.*") (let ((parsed (parse-introduction c body))) (and parsed `((:source ,@parsed))))) (t nil)))) (defmethod parse-cst-species ((c cell-signal-technology-products) text) (remove-if (lambda(s) (#"matches" s "(?i).*(cyto|blot|immuno).*")) (mapcar 'car (all-matches text "\\b(.{1,2})=(.*?\\s+?)<" 2)))) (defmethod parse-cst-methods ((c cell-signal-technology-products) text) (remove-if-not (lambda(s) (#"matches" s "(?i).*(cyto|blot|immuno).*")) (mapcar 'car (all-matches text "\\b(.{1,2})=(.*?\\s+?)<" 2)))) (defmethod parse-introduction ((c cell-signal-technology-products) text) (regex-replace-all "’" (regex-replace-all "^\\s*;\\s*" (regex-replace-all "(\\s* \\s*)|\\s+" (regex-replace-all "<.*?>" text "") " ") "") "'")) (defmethod extract-named-ul-links ((c cell-signal-technology-products) text name) (mapcar (lambda(el) (if (char= (char (car el) 0) #\/) (concatenate 'string "http://www.cellsignal.com" (car el)) (car el))) (all-matches (caar (all-matches text (format nil "(?s)(?i)