;http://www.abcam.com/index.html?pageconfig=datasheet&intAbID=24668

;; Scraper state for the Abcam antibody catalog.
;;   antibody-roots - list of characters used as the strStartChar value when
;;                    walking the catalog index (see *abcam* below).
;;   antibody-ids   - list of antibody id strings; filled lazily by the
;;                    :around method on ANTIBODY-IDS further down.
;;   entries        - EQUAL hash table; PARSE-ALL-REMAINING stores the result
;;                    of PARSE-DATASHEET-PAGE here, keyed by antibody id.
(defclass abcam-products ()
  ((antibody-roots :initarg :antibody-roots :initform nil :accessor antibody-roots)
   (antibody-ids :initarg :antibody-ids :initform nil :accessor antibody-ids)
   (entries :initarg :entries :initform (make-hash-table :test 'equal) :accessor entries)
   ))

;; Default instance. Its roots are the characters from #\A to #\Z followed by
;; #\0 to #\9, generated by char-code range.
(defparameter *abcam*
  (make-instance 'abcam-products
                 :antibody-roots
                 (append (loop for charcode from (char-code #\A) to (char-code #\Z) collect (code-char charcode))
                         (loop for charcode from (char-code #\0) to (char-code #\9) collect (code-char charcode)))))

;; Walk the catalog index for every root character and return an EQUAL hash
;; table whose keys are the strings matched by the pattern ab\d+ (values are T).
;; For each root: fetch the first index page, record its ids, then fetch every
;; page number that appears as intResultsPage=N on that first page and record
;; the ids found there too.
;; NOTE(review): GET-URL and ALL-MATCHES are defined elsewhere. From the way
;; the results are used (CAR of each element), ALL-MATCHES presumably returns
;; a list of per-match group lists - confirm. The same page number may occur
;; several times in OTHERPAGES; nothing here removes duplicates.
(defmethod cache-index-pages ((a abcam-products))
  (let ((ids (make-hash-table :test 'equal)))
    (loop for char in (antibody-roots a)
          for url = (format nil "http://www.abcam.com/index.html?pageconfig=catalog_byproducttype&intProductTypeID=1&strStartChar=~a" char)
          do
          (let* ((firstpage (get-url url :persist t :verbose t))
                 (otherpages (mapcar 'car (all-matches firstpage "(?s)(?i)intResultsPage=(\\d+)" 1))))
            (dolist (id (all-matches firstpage "(?s)(?i)ab\\d+" 0))
              (setf (gethash (car id) ids) t))
            (loop for pagenum in otherpages
                  for url = (format nil "http://www.abcam.com/index.html?pageconfig=catalog_byproducttype&intProductTypeID=1&strStartChar=~a&intResultsPage=~a&tr=59" char pagenum)
                  for page = (get-url url :persist t :verbose t)
                  do
                  (dolist (id (all-matches page "(?s)(?i)ab\\d+" 0))
                    (setf (gethash (car id) ids) t))
                  )))
    ids))

;; Lazy reader for the ANTIBODY-IDS slot. If the slot already holds a non-NIL
;; list it is returned as is. Otherwise the ids are collected from
;; CACHE-INDEX-PAGES, the leading "ab" is stripped from each key, and the
;; resulting list is stored in the slot and returned.
;; NOTE(review): (setq @ ...) assigns a free variable named @ that is not
;; declared anywhere in the visible text - it looks like a REPL debugging aid;
;; confirm before relying on it.
(defmethod antibody-ids :around ((a abcam-products))
  (or (call-next-method)
      (let ((them nil))
        (maphash (lambda(id _)
                   (declare (ignore _))
                   (assert (stringp id) () "oops ~a" id)
                   (push (regex-replace-all "^ab" id "") them))
                 (setq @ (cache-index-pages a)))
        (setf (antibody-ids a) them)
        them)))

;; Not referenced anywhere in the visible text.
(defvar *stopit* nil)

;; ssh -L 8080:www.abcam.com:80 alanr@mumble.net

;; Fetch the datasheet page of every antibody id, skipping any url for which
;; (url-cached-file-name url) already names an existing file. Sleeps .001
;; seconds after each id, whether or not a fetch happened.
;; NOTE(review): the trailing ;:tunnel comment is a disabled GET-URL option that
;; goes with the ssh port-forward noted above.
(defmethod cache-datasheet-pages ((a abcam-products))
  (loop for id in (antibody-ids a)
        for url = (format nil "http://www.abcam.com/index.html?datasheet=~a" id)
        do
        (unless (probe-file (url-cached-file-name
                             url))
          (get-url url :dont-cache t :persist t :verbose t ));:tunnel "localhost:8080"))
        (sleep .001)))

;; Call F with two arguments, the antibody id and the result of GET-URL on its
;; datasheet url, once per id, sleeping .001 seconds between calls.
;; NOTE(review): :nofetch t presumably makes GET-URL read only from the local
;; cache - confirm against GET-URL's definition.
(defmethod each-datasheet-page ((a abcam-products) f)
  (loop for id in (antibody-ids a)
        for url = (format nil "http://www.abcam.com/index.html?datasheet=~a" id)
        do
        (funcall f id (get-url url :dont-cache t :nofetch t))
        (sleep .001)))

;; Turn the text of one datasheet page into a list of (key value) lists.
;; The "Research areas" value is split into a list of strings, the
;; "Database links" entry gets DATABASE-LINKS as its CDR when that is non-NIL,
;; and a ("pmids" . pmids) entry is appended when PMIDS is non-NIL.
;;
;; NOTE(review): the text of this method appears to be damaged and it will not
;; read as written:
;;   - NOISE-REMOVED and PMIDS are used but never bound in the visible text;
;;   - the parentheses of the DATABASE-LINKS binding do not balance, and the
;;     arguments  inner-table2 ""  look like the tail of a lost
;;     REGEX-REPLACE-ALL call that bound NOISE-REMOVED;
;;   - several patterns are reduced to "(.*)" and some string literals contain
;;     raw line breaks, as if HTML tags had been stripped from them.
;; The tokens below are kept exactly as found; recover the original patterns
;; from version control before changing anything here.
;; NOTE(review): (setf (cdr (assoc ...))) below has no guard for the case where
;; ASSOC finds no "Database links" key.
(defmethod parse-datasheet-page ((a abcam-products) page)
  (let* ((datasheet (caar (all-matches page "(?s)(.*)" 1)))
         (inner-table (caar (all-matches datasheet "(?s)(?i)(.*)" 1)))
         (inner-table2 (caar (all-matches inner-table "(?s)(?i)(.*)" 1)))
         (database-links (and (#"matches" inner-table2 "(?s)(?i).*Database links.*")
                              (mapcar 'car (all-matches (caar (all-matches inner-table2 "(?s)(?i)Database links(.*)" 1)) "(?s)(?i).*?" inner-table2 ""))
         (table-rows (mapcar 'car (all-matches noise-removed "(?s)(?i)\\s*(.*?)\\s*" 1)))
         ;; The line breaks in the next pattern are inside the string literal.
         (key-values (loop for row in table-rows append (all-matches row "(?s)(?i)([^<].*?).*?\\s*(

){0,1}\\s*(.*?)\\s*(

){0,1}\\s*" 1 3))))
    ;; The pattern passed to REGEX-REPLACE-ALL starts with a line break that is
    ;; part of the string literal.
    (setq key-values (loop for (key value) in key-values do (setq value (regex-replace-all "
(.*?)" value "$1")) collect (cond ((equalp key "Research areas") (list key (remove-if (lambda(el) (#"matches" el "(^&.*$)|(^\\s*$)")) (split-at-regex value "\\s*<.*?>\\s*")))) (t (list key value)))))
    (when database-links
      (setf (cdr (assoc "Database links" key-values :test 'equal)) database-links))
    ;; (print-db
    ;; (length page)
    ;; (length datasheet)
    ;; (length inner-table)
    ;; (length inner-table2)
    ;; (length noise-removed)
    ;; (length table-rows)
    ;; )
    (if pmids
        (append key-values (list (list* "pmids" pmids)) )
        key-values)))

;; Parse the datasheet page of every antibody id that has no entry yet and
;; store the result in (entries a). Prints "[id]" before each attempt.
;; IGNORE-ERRORS returns NIL and the condition object as two values when an
;; error is signalled, so a non-NIL ERRORP means the parse of that id failed;
;; the failure is reported and the loop moves on to the next id.
;; NOTE(review): VALUE is bound but never used. A failed id gets no entry, so
;; it will be attempted again on the next call.
(defmethod parse-all-remaining ((a abcam-products))
  (loop for id in (antibody-ids a)
        do
        (unless (gethash id (entries a))
          (sleep .001)
          (format t "[~a]" id)
          (multiple-value-bind (value errorp)
              (ignore-errors
                (setf (gethash id (entries a))
                      (parse-datasheet-page a (get-url (format nil "http://www.abcam.com/index.html?datasheet=~a" id) :dont-cache t :nofetch t))))
            (when errorp
              (format t "~%Error in ~a~%" id ))))))

;; Write every entry of (entries c) to PATH with PPRINT, creating the file or
;; superseding an existing one. An entry that has no :id element gets
;; (:id key) prepended in the written form; the stored entry is not modified.
(defmethod dump-entries ((c abcam-products) path)
  (with-open-file (f path :if-does-not-exist :create :if-exists :supersede :direction :output)
    (maphash (lambda(k v)
               (unless (assoc :id v)
                 (setq v (cons (list :id k) v)))
               (pprint v f))
             (entries c))))

;; Read forms from PATH until end of file and store each one in (entries c)
;; under the second element of its :id element.
;; NOTE(review): READ is called without binding *READ-EVAL*, so only files
;; from a trusted source should be loaded this way. A form without an :id
;; element would be stored under the key NIL.
(defmethod retrieve-entries ((c abcam-products) path)
  (with-open-file (f path :direction :input)
    (loop for entry = (read f nil :eof)
          until (eq entry :eof)
          do (setf (gethash (second (assoc :id entry)) (entries c)) entry))))

;(dump-entries *abcam* "~/lsw/hcls/biordf/reagents/abcam-products.txt")
;(retrieve-entries *abcam* "~/lsw/hcls/biordf/reagents/abcam-products.txt")

;; Scraping notes:
;; get rid of
;; grab title
;; get rid of comments <-- .* -->
;; two top level tables. inner is content.
;; then small inner tables to ignore
;; ignore stuff below

;; Plasmodium falciparum antibody [MPFG-55P] (HRP) - related products:

;(cached-url-safari "http://www.abcam.com/index.html?datasheet=30384")