;http://www.abcam.com/index.html?pageconfig=datasheet&intAbID=24668
(defclass abcam-products ()
  ((antibody-roots
    :accessor antibody-roots
    :initarg :antibody-roots
    :initform nil
    :documentation "List of start characters; each one is substituted into
the strStartChar parameter of the catalogue index URL.")
   (antibody-ids
    :accessor antibody-ids
    :initarg :antibody-ids
    :initform nil
    :documentation "List of antibody id strings, or NIL when none have been
collected yet.")
   (entries
    :accessor entries
    :initarg :entries
    :initform (make-hash-table :test 'equal)
    :documentation "EQUAL hash table of parsed datasheet entries keyed by
antibody id."))
  (:documentation "State for scraping the Abcam antibody catalogue: the index
start characters, the collected antibody ids and the parsed entries."))
;; Default scraper instance. Its index start characters are the uppercase
;; letters A-Z followed by the digits 0-9, in that order.
(defparameter *abcam*
  (make-instance 'abcam-products
                 :antibody-roots
                 (coerce "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789" 'list)))
(defmethod cache-index-pages ((a abcam-products))
  "Fetch the catalogue index pages for every start character in
(antibody-roots A) and collect the product ids found on them.

For each start character the first results page is fetched, every
intResultsPage=N number found on it is followed, and every match of
ab<digits> (case-insensitive) on any of those pages is recorded.

Returns an EQUAL hash table whose keys are the matched id strings (still
carrying their ab prefix) and whose values are all T."
  (let ((ids (make-hash-table :test 'equal)))
    (flet ((harvest-ids (page)
             ;; Record every ab<digits> match on PAGE as a key of IDS.
             (dolist (id (all-matches page "(?s)(?i)ab\\d+" 0))
               (setf (gethash (car id) ids) t))))
      (loop for char in (antibody-roots a)
            for url = (format nil "http://www.abcam.com/index.html?pageconfig=catalog_byproducttype&intProductTypeID=1&strStartChar=~a" char)
            do
               (let* ((firstpage (get-url url :persist t :verbose t))
                      ;; The same page number can be matched more than once on
                      ;; the first page; de-duplicate so each results page is
                      ;; requested only once. :from-end keeps first-seen order.
                      (otherpages (remove-duplicates
                                   (mapcar 'car (all-matches firstpage "(?s)(?i)intResultsPage=(\\d+)" 1))
                                   :test 'equal
                                   :from-end t)))
                 (harvest-ids firstpage)
                 (loop for pagenum in otherpages
                       for page-url = (format nil "http://www.abcam.com/index.html?pageconfig=catalog_byproducttype&intProductTypeID=1&strStartChar=~a&intResultsPage=~a&tr=59"
                                              char pagenum)
                       do (harvest-ids (get-url page-url :persist t :verbose t))))))
    ids))
(defmethod antibody-ids :around ((a abcam-products))
  "Return the antibody ids of A, scraping them on first use.

When the slot already holds a non-NIL list it is returned unchanged.
Otherwise the index pages are scraped with CACHE-INDEX-PAGES, a leading
ab is stripped from every id found, and the resulting list is stored back
into the slot and returned."
  (or (call-next-method)
      (let ((them nil))
        ;; The scraped table is passed straight to MAPHASH. It used to be
        ;; stashed in the undeclared global @ first, which leaked state out of
        ;; the method and assigned a variable that was never defined.
        (maphash (lambda (id _)
                   (declare (ignore _))
                   (assert (stringp id) () "oops ~a" id)
                   (push (regex-replace-all "^ab" id "") them))
                 (cache-index-pages a))
        (setf (antibody-ids a) them)
        them)))
(defvar *stopit* nil
  "Flag initialised to NIL.
NOTE(review): nothing visible in this file reads it; presumably a manual
stop switch for the long-running loops -- confirm before relying on it.")
;; ssh -L 8080:www.abcam.com:80 alanr@mumble.net
(defmethod cache-datasheet-pages ((a abcam-products))
  "Walk every id in (antibody-ids A) and request its datasheet URL with
GET-URL unless a file already exists at the URL's cached file name.
Pauses briefly after each id, whether or not a request was made.
Returns NIL."
  (dolist (id (antibody-ids a))
    (let ((url (format nil "http://www.abcam.com/index.html?datasheet=~a" id)))
      (unless (probe-file (url-cached-file-name url))
        ;; An extra argument was once passed here: :tunnel "localhost:8080"
        (get-url url :dont-cache t :persist t :verbose t))
      (sleep .001))))
(defmethod each-datasheet-page ((a abcam-products) f)
  "Call F with two arguments, the antibody id and the result of GET-URL on
that id's datasheet URL (requested with :dont-cache t :nofetch t), for every
id in (antibody-ids A). Pauses briefly after each call. Returns NIL."
  (dolist (id (antibody-ids a))
    (let ((url (format nil "http://www.abcam.com/index.html?datasheet=~a" id)))
      (funcall f id (get-url url :dont-cache t :nofetch t))
      (sleep .001))))
;; NOTE(review): the text of PARSE-DATASHEET-PAGE is damaged in this copy of
;; the file. The regular-expression literals look as if their HTML tags were
;; stripped, and the LET* bindings that should establish KEY-VALUES,
;; DATABASE-LINKS and PMIDS are absent, so those names are free variables
;; here. The literals and forms below are kept exactly as found; restore them
;; from an intact copy before relying on this method.
(defmethod parse-datasheet-page ((a abcam-products) page)
  "Parse the datasheet PAGE into a list of (key value) entries.
The entry for \"Research areas\" has its value split into a list of strings,
the cdr of the \"Database links\" entry is replaced when DATABASE-LINKS is
non-NIL, and a (\"pmids\" . PMIDS) entry is appended when PMIDS is non-NIL."
  (let* ((datasheet (caar (all-matches page "(?s)
){0,1}\\s*(.*?)\\s*(
){0,1}\\s*" 1 3))))
    (setq key-values
          (loop for (key value) in key-values
                do
                   ;; NOTE(review): pattern looks truncated -- confirm.
                   (setq value (regex-replace-all "(.*?)" value "$1"))
                collect
                (cond ((equalp key "Research areas")
                       ;; Split on markup and drop pieces that start with &
                       ;; or are blank.
                       (list key (remove-if (lambda (el) (#"matches" el "(^&.*$)|(^\\s*$)"))
                                            (split-at-regex value "\\s*<.*?>\\s*"))))
                      (t (list key value)))))
    (when database-links
      (setf (cdr (assoc "Database links" key-values :test 'equal)) database-links))
    ;;     (print-db
    ;;      (length page)
    ;;      (length datasheet)
    ;;      (length inner-table)
    ;;      (length inner-table2)
    ;;      (length noise-removed)
    ;;      (length table-rows)
    ;;      )
    (if pmids
        (append key-values (list (list* "pmids" pmids)) )
        key-values)))

(defmethod parse-all-remaining ((a abcam-products))
  "Parse the datasheet of every id in (antibody-ids A) that has no entry yet
in (entries A), storing each result under its id. Progress is printed as [id];
an id whose parse signals an error is reported and left without an entry."
  (loop for id in (antibody-ids a)
        do
           (unless (gethash id (entries a))
             (sleep .001)
             (format t "[~a]" id)
             (multiple-value-bind (value errorp)
                 (ignore-errors
                  (setf (gethash id (entries a))
                        (parse-datasheet-page a (get-url (format nil "http://www.abcam.com/index.html?datasheet=~a" id) :dont-cache t :nofetch t))))
               ;; Only the second value (the condition, if any) matters here.
               (declare (ignore value))
               (when errorp
                 (format t "~%Error in ~a~%" id ))))))

(defmethod dump-entries ((c abcam-products) path)
  "Pretty-print every entry of (entries C) to the file PATH, creating it or
superseding an existing one. An entry without an :id element is written with
(:id <key>) consed onto its front."
  (with-open-file (f path :if-does-not-exist :create :if-exists :supersede :direction :output)
    (maphash (lambda (k v)
               (unless (assoc :id v)
                 (setq v (cons (list :id k) v)))
               (pprint v f))
             (entries c))))

(defmethod retrieve-entries ((c abcam-products) path)
  "Read entries from the file PATH until end of file and store each one in
(entries C) under the second element of its :id element."
  (with-open-file (f path :direction :input)
    (loop for entry = (read f nil :eof)
          until (eq entry :eof)
          do (setf (gethash (second (assoc :id entry)) (entries c)) entry))))

;(dump-entries *abcam* "~/lsw/hcls/biordf/reagents/abcam-products.txt")
;(retrieve-entries *abcam* "~/lsw/hcls/biordf/reagents/abcam-products.txt")

;; Author's notes, kept as found (some words appear to be missing):
;;   get rid of
;;   grab title
;;   get rid of comments <-- .* -->