(defpackage :scrape-panomics (:use "COMMON-LISP" "CL-USER")) (in-package :scrape-panomics) (shadowing-import '(cl-user::all-matches cl-user::get-url cl-user::define-gene-name-rewrite-rule) 'scrape-panomics) (defparameter *panomics-root-pages* '((:antibody "http://www.panomics.com/antibodies1.cfm") (:tf-elisa "http://www.panomics.com/TransBindingELISAkits.cfm"))) (define-gene-name-rewrite-rule panomics-reporter panomics () ("(?i)reporter (\\S+\\s*){0,1}Stable Cell Line" "") ("(?i)(\\S+\\s*){0,1}reporter Stable Cell Line" "") ("(?i)\\s*Gene Promoter.*" "")) (defun get-antibodies () (let ((root-url (second (assoc :antibody *panomics-root-pages*)))) (let ((pages (mapcar 'car (butlast (all-matches (cl-user::get-url root-url) "antibodies1\\.cfm\\?StartRow=\\d+&oneachpage=\\d+&PageNum=\\d+&searchterm=" 0))))) (loop for page in pages for url = (format nil "http://www.panomics.com/~a" page) append (let ((body (caar (all-matches (cl-user::get-url url) "(?s)(Product Type.*)" 1)))) (let ((entries (all-matches body "(?s)\\s*(.*?)\\s*" 1))) (loop for (entry) in entries for (description href dilution quantity methods species price) = (mapcar 'car (all-matches entry "(?s)\\s*(.*?)\\s*" 1)) for ((specpage id)) = (and href (all-matches href "(.*?)" 1 2)) when (and href id) collect (list :type :antibody :id id :name description :specification (format nil "http://www.panomics.com/~a" specpage) :dilution dilution :quantity quantity :methods methods :species species :price price)))))))) (defun get-transcription-factor-elisas () (let ((root-url (second (assoc :tf-elisa *panomics-root-pages*)))) (let* ((body (cl-user::get-url root-url)) (pages (all-matches body "" 1 2))) (loop for (relative-url title) in pages for url = (format nil "http://www.panomics.com/~a" relative-url) for body = (cl-user::get-url url) for captions = (all-matches body "(?s)
\\s*\\s*(.*?)\\s*(.*?)\\s*
" 1 2) for table = (caar (all-matches body "(?s)]*?border=\"1\".*?>(.*?)" 1)) for items = (mapcar 'car (all-matches table "(?s)\\s*(.*?)\\s*" 1)) for descriptions = (loop for item in items collect (mapcar 'car (all-matches item "(?s)\\s*( ){0,1}(.*?)\\s*" 2))) for manuals = (union (all-matches body "
  • ([^<]*?)([^<]*?)Manual" 1 2) (all-matches body "
  • ([^<]*?Manual)[^<]*?\\s*" 1 2)) do (setq captions (mapcar (lambda(c) (cons (first c) (cl-user::regex-replace-all "(?s)(
    \\s*)*(<[^>]*?>\\s*)*$" (second c) ""))) captions)) append (loop for desc in descriptions for manual in manuals collect (list :type :elisa :id (third desc) :name (first desc) :specification (format nil "http://www.panomics.com/~a" (second manual)) :price (fourth desc) :description captions)))))) (defun get-stable-cell-lines () (let ((root-url "http://www.panomics.com/StableCellLines.cfm")) (let* ((body (cl-user::get-url root-url)) (pages (all-matches body "
    .*?(.*?)" 1 2))) (loop for (relative-url title) in pages for url = (format nil "http://www.panomics.com/~a" relative-url) for body = (cl-user::get-url url) for table = (or (caar (all-matches (setq *b* body) "(?s)]*border=\"1\"[^>]*style[^>]*>\\s*(.*?)\\s*" 1)) (caar (all-matches (setq *b* body) "(?s)]*border=\"1\" class=\"tblborder\"*>\\s*(.*?)\\s*" 1))) for items = (mapcar 'car (all-matches table "(?s)\\s*(.*?)\\s*" 1)) for descriptions = (loop for item in items collect (mapcar 'car (all-matches item "(?s)\\s*( ){0,1}(.*?)\\s*" 2))) append (loop for desc in descriptions collect (list :type :stable-cell :id (third desc) :name (first desc) :quantity (second desc) :page url )))))) (defun pull-out-panomics-table (url &optional table-match) (let* ((root-url url) (body (cl-user::get-url root-url)) (table (or (and table-match (caar (all-matches (setq *b* body) (format nil "(?s)~a\\s*(.*?)\\s*" table-match) 1))) (caar (all-matches (setq *b* body) "(?s)]*border=\"1\" class=\"tblborder\"*>\\s*(.*?)\\s*" 1)) (caar (all-matches (setq *b* body) "(?s)]*border=\"1\"[^>]*style[^>]*>\\s*(.*?)\\s*" 1)))) (items (mapcar 'car (all-matches table "(?s)\\s*(.*?)\\s*" 1)))) (loop for item in items collect (mapcar (lambda(e) (or (car (all-matches (car e) "\\s*([^<]*?)\\s*<" 1 2)) (#"replaceAll" (car e) "\\s*<[^>]*>\\s*" ""))) (all-matches item "(?s)\\s*( ){0,1}(.*?)\\s*" 2)) ))) (defun get-tad-reporters () (loop for (name region upstream-kinase id price) in (pull-out-panomics-table "http://www.panomics.com/TADinvivo.cfm") append (loop for name in (cl-user::split-at-regex upstream-kinase ",|/") collect (list :type :kinase-assay-kit :name (cl-user::regex-replace-all "^\\s*" name "") :id id :price price :region (car (all-matches region "(\\d+)-(\\d+)" 1 2)))))) (defun get-gene-reporter-vectors () (loop for (name vector-name (spec id) price) in (pull-out-panomics-table "http://www.panomics.com/TransLucent_Gene_Promoter_Reporter_Vectors.cfm") collect (list :type :gene-promotor-reporter-vector :id id :name name :price price :specification spec :description vector-name))) (defun get-translucent-reporter-vectors () (loop for (name id description) in (pull-out-panomics-table "http://www.panomics.com/LR1000a.cfm" "") when (not (equal id "")) collect (list :type :reporter-vector :id id :name (if (consp name) (second name) name) :specification (if (consp name) (format nil "http://www.panomics.com/~a" (first name))) :description description :page "http://www.panomics.com/LR1000.cfm"))) ;; http://www.panomics.com/MA4010.cfm (defvar *tfarray1* '("NFKB" "CRE" "GAS" "c-myc" "ISRE" "E2F" "p53" "RXRE" "Ets" "YY1" "AP1(1)" "SRE" "VDR" "AP1(2)" "EGR" "C/EBP" "ERE" "TR" "NFAT" "GATA" "Stat 3")) (defvar *tfarray2* '("ADR" "AP1(1)" "AP1(2)" "AP2" "AP3" "AP4" "AR" "Brn-3" "CBF" "CDP" "CdxA/NKX2" "CEF1" "CEF2" "CRE" "CRE(2)" "E2F" "E2F(1)" "E2F(2)" "E2F1(3)" "E47" "EGR" "EGR(2)" "EKLF(1)" "ELK1" "ELK1(2)" "ER" "ER(2)" "ETS" "ETS(2)" "FAST1" "FKHR" "FREAC2" "GAG" "GAS" "GAS/ISRE" "GATA" "GATA-1" "GATA-1/2" "GATA2" "GATA-3" "GATA4" "GR" "HNF-1A" "HNF-1A(2)" "HNF-3 (a, b, g)" "HNF-3 b" "HNF-4" "HSE" "IRF-1" "ISRE" "KTP1" "LEF1" "MEF-1" "MEF2" "MEF2(2)" "MEF-3")) (defvar *tfarray3* '("MR" "Myc" "myc" "MZF1" "NF-1" "NFAT" "NF-E2" "NFkB" "NFkB(1)" "NFkB(2)" "NRF-1" "NRF-2" "OCT-1(1)" "p53" "PAX5" "Pax8" "PBX1" "PIT1" "PPARa" "PPARg" "PR " "RARE(DR5)" "RB" "RXR(2)" "RXRE(DR1)" "Smad 3/4" "Sp1" "SRE" "SRF" "SRY" "STAT1 p84/p91" "STAT1(GAS)" "STAT3" "Stat3" "STAT3(1)" "STAT3(2)" "STAT4" "STAT4(2)" "STAT5" "STAT5/6(1)" "STAT5/6(2)" "TA-luc" "Tax/CREB" "TCF/LEF" "TFE3" "TR(2)" "TR(3)" "TR(DR4)" "USF1" "VDR(2)" "VDR(3)" "VDR(DR3)" "v-MAF" "Xbp-1" "YY1")) (defun write-panomics () (with-open-file (f "~/repos/lsw/trunk/hcls/biordf/reagents/panomics.txt" :direction :output :if-exists :supersede) (let ((*print-case* :downcase)) (loop for entry in (append (get-antibodies) (get-transcription-factor-elisas) (get-stable-cell-lines) (get-tad-reporters) (get-translucent-reporter-vectors)) do (pprint entry f)) ))) (defparameter *protein-families* '(("NFAT" "NFATC1" "NFATC2" "NFATC3" "NFATC4" "NFAT5") ("STAT5" "STAT5B" "STAT5A") ("v-maf" "MAF" "MAFA" "MAFB" "MAFF" "MAFK" "MAFG") ("E2F" "E2F1" "E2F2" "E2F3" "E2F4" "E2F5" "E2F6" "E2F7") ("Smad" "Smad1" "Smad2" "Smad3" "Smad4" "Smad5" "Smad6" "Smad7" "Smad9") ("RAR" "RARA" "RARB" "RARC" "RARF" "RARG") ("RXR" "RXRA" "RXRB" "RXRC" "RXRG") ("EGR" "EGR2" "EGR1" "EGR3" "EGR4") ("CBF" "CBF1" "CBF2") ("ETS" "ETS1" "ETS2") ("AP2" "TFAP2A" "TFAP2B" "TFAP2C" "TFAP2D") ("NRF-2" "GABPA" "GABPB1" "GABPB2") ("COUP-TF" "NR2F1" "NR2F2") ("MREBP" "MTF1" "MTF2") ("MEF-2" "MEF2A" "MEF2B" "MEF2C" "MEF2D") ("ISRE" "STAT1" "STAT2") ("HNF-3" "FOXA1" "FOXA2" "FOXA3") ("Catenin alpha" "CTNNA1" "CTNNA2" "CTNNA3") ("Notch" "Notch1" "Notch2" "Notch3" "Notch4") ("S100" "S100A" "S100B" "S100C" "S100D" "S100E" "S100F" "S100G") ("Calponin" "Calponin1" "Calponin2" "Calponin3") ("Keratin 6" "Keratin 6A" "Keratin 6B" "Keratin 6C" "Keratin 6D" "Keratin 6E") ("GATA-1/2" "GATA1" "GATA2") ("CdxA/NKX2" "NKX2-1" "NKX2-2" "NKX2-3" "NKX2-4" "NKX2-5" "NKX2-8" "NKX2-9") ("NKX2" "NKX2-1" "NKX2-2" "NKX2-3" "NKX2-4" "NKX2-5" "NKX2-8" "NKX2-9") ("GSK3" "GSK3A" "GSK3B") ("GSK" "GSK3A" "GSK3B") ("RSK" "RSK1" "RSK2" "RSK3" "RSK4" ) ("ERK" "ERK1" "ERK2" "ERK3" "ERK4" "ERK5" "ERK8") ("MAPK" "MAPK1" "MAPK2" "MAPK3" "MAPK4" "MAPK5" "MAPK6" "MAPK7" "MAPK8" "MAPK9" "MAPK10" "MAPK11" "MAPK12" "MAPK13" "MAPK14") ("AKT" "AKT1" "AKT2" "AKT3") )) ;;http://www.panomics.com/LR1000B.cfm ; List of TF-Specific Inducer Reagents (defparameter *panomics-replacements* '("ER" "ESR1" "HIF" "HIF1A" "SAP-1" "Elk4" "PIT1" "POU1F1" "Pur-1" "MAZ" "PR" "PGR" "oct-{0,1}1" "POU2F1" "HNF-1x" "HNF1" "MZF1" "ZNF42" "MEF-1" "MYOD" "GAS" "STAT1" "E47" "TCF3" "AP4" "TFAP4" "Claudin 1" "CLDN1" ; bogus claudin-1 synonym for cld7 "Villin" "Vil1" ; there are 2, but the spec sounds like the first one "CRE" "CREB1" "TCF/LEF" "TCF" "GSK" "GSK-3")) ;; http://www.panomics.com/TF_TF2.cfm has more info on tfs (defun possibly-patch-panomics-entry (entry continue) (setf (getf entry :name) (cl-user::regex-replace-all "^(\\S+)\\(\\d+\\)" (getf entry :name) "$1")) (loop for (family . members) in *protein-families* when (#"matches" (getf entry :name) (format nil "(?i)~a\\b.*" family)) return (loop for name in members do (setf (getf entry :usename) name) (funcall continue entry))) (loop for (old new) on *panomics-replacements* by #'cddr when (#"matches" (getf entry :name) (format nil "(?i)~a(\\(\\d+\\)){0,1}\\b.*" old)) return (progn (setf (getf entry :usename) "EGFR") (funcall continue entry))) (when (equalp (getf entry :id) "LK0016") (setf (getf entry :usename) "Elk4")) (funcall continue entry)) (defun debug-panomics (type) '(with-open-file (f "~/repos/lsw/trunk/hcls/biordf/reagents/panomics.txt") (let ((names (make-hash-table :test 'equal))) (loop for entry = (read f nil :eof) until (eq entry :eof) do (when (eq (getf entry :type) type) (when (stringp (getf entry :description)) (let ((alternate (caar (all-matches (getf entry :description) "is designed to measure transcriptional activity of (.*?)\\." 1)))) (when alternate (setf (gethash alternate names) t)))))) (resolve-to-gene :list-of-names names :show :matches))) (with-open-file (f "~/repos/lsw/trunk/hcls/biordf/reagents/panomics.txt") (let ((names (make-hash-table :test 'equal))) (loop for entry = (read f nil :eof) until (eq entry :eof) do (when (eq (getf entry :type) type) (possibly-patch-panomics-entry entry (lambda(entry) (setf (gethash (cl-user::replace-xml-characers (or (getf entry :usename) (getf entry :name))) names) t))))) (cl-user::resolve-to-gene :list-of-names names :show :misses))) ) (defparameter *product-registry* (make-hash-table :test 'equalp)) (defun found (entry id) (pushnew (cons (getf (cl-user::info cl-user::*entrez-gene* id) :name) (cons (getf entry :type) entry)) (gethash id *product-registry*) :test 'equalp)) (defun ambiguous (entry id) nil) (defun process-panomics (type) (let ((hits 0) (ambiguous 0) (no-idea 0)) (with-open-file (f "~/repos/lsw/trunk/hcls/biordf/reagents/panomics.txt") (loop for entry = (read f nil :eof) for hit? = nil for amb? = nil until (eq entry :eof) do (when (eq (getf entry :type) type) (possibly-patch-panomics-entry entry (lambda(entry) (multiple-value-bind (id amb) (cl-user::resolve-name (cl-user::replace-xml-characers (or (getf entry :usename) (getf entry :name)))) (if id (found entry (setq hit? id)) (if amb (ambiguous entry (setq amb? id))))))) (cond (hit? (incf hits)) (amb? (incf ambiguous)) (t (incf no-idea)))) )) (list type :hits hits :ambiguous ambiguous :no-idea no-idea))) ;(cl-user::info cl-user::*entrez-gene* "7157") (defun process-all-panomics () (list (process-panomics :kinase-assay-kit) (process-panomics :elisa) (process-panomics :stable-cell) (process-panomics :reporter-vector) (process-panomics :antibody) (process-panomics :gene-promotor-reporter-vector))) ;;(scrape-panomics::process-all-panomics) (defun dump-panomics (&optional (stream t)) (maphash (lambda(id entries) (loop for entry in entries for (name type . plist) = entry do (unless (getf plist :specification) (setf (getf plist :specification) (format nil "http://www.google.com/search?q=panomics+~a" (getf plist :id)))) (apply 'cl-user::print-tabbed stream id name (append (loop for key in '(:type :id :name :species :specification :dilution :quantity :region :methods :price) collect (getf plist key)) (let ((description (getf plist :description))) (when (consp description) (list (with-output-to-string (s) (loop for (caption . text) in description do (format s "~a ~a " caption text)))))))))) *product-registry*)) ;; what works : ;; (process-panomics :kinase-assay-kit) ;; (process-panomics :elisa) ;; (process-panomics :stable-cell) ;; close: (process-panomics :reporter-vector) ;; ambiguous (process-panomics :kinase-assay-kit) ;;SAP-1 ((5660 2005 1673 5794)) prob 2005 elk4 based on p38 mapk upstream and interaction on locuslink page. ;;OCT1 ((6580 5451)) 5451 - transcription factor ;; need some way to disambiguate in a context. ;; ambiguous (process-panomics :elisa) ;; HIF HIF1A (spec says so) ;; ca-19-9 carbohyhdrate antigen 19-9 prob not a protein ;; efgr/pi3k and Phosphotyrosine/PI3K are the same. Should be indexed under both. Really an EFGR assay. ;; ER ELISA Kit ((2099 2069) NIL) 2099 estrogen receptor according to the spec. ;; ;;erg. CEF-2 is a transcription binding site. ;;DNase I footprint and electrophoretic mobility shift assay analyses demonstrated that this cardiac-specific promoter/enhancer contains five nuclear protein binding sites (designated CEF1, CEF-2, and CPF1-3), four of which bind novel cardiac-specific nuclear protein complexes ;; ;;keratinocyte transcriptional protein-1" = KPT1 ;; p38 is erk2 = mapk1 ;; p38 alpha beta delta ;; paralogs of p53 : p63 p73 (knockout either of these and it is lethal, but ko of p53 is ok) ;; dna damage velcade synergy