(in-package :cl-user) (defvar *page-cache* (make-hash-table :test 'equal)) (defvar *cookies* nil) (defun head-url (url) (get-url url :head t)) (defun get-url (url &key force-refetch dont-cache persist cookiestring nofetch verbose tunnel referer (follow-redirects t) post (ignore-errors nil) head &aux headers) "Get the contents of a page, saving it for this session in *page-cache*, so when debugging we don't keep fetching" (sleep 0.0001) ; give time for control-c (and head (setq force-refetch t dont-cache t persist nil follow-redirects nil)) (or (and (not force-refetch) (gethash url *page-cache*)) (and (not force-refetch) (probe-file (url-cached-file-name url)) (get-url-from-cache url)) (and nofetch (error "Didn't find cached version of ~a" url)) (labels ((stream->string (stream) (let ((buffer (jnew-array "byte" 4096))) (apply 'concatenate 'string (loop for count = (#"read" stream buffer) while (plusp count) do (sleep 0.0001) collect (#"toString" (new 'lang.string buffer 0 count)) )))) (doit() (when verbose (format t "~&;Fetching ~s~%" url)) (let ((connection (#"openConnection" (new 'java.net.url (maybe-rewrite-for-tunnel url tunnel)))) ) (if follow-redirects (#"setInstanceFollowRedirects" connection t) (#"setInstanceFollowRedirects" connection nil)) (dolist (c *cookies*) (#"setRequestProperty" connection "Cookie" c)) (when cookiestring (#"setRequestProperty" connection "Cookie" cookiestring)) (when referer (#"setRequestProperty" connection "Referer" referer)) (#"setRequestProperty" connection "User-Agent" "Mozilla/4.0 (compatible)") (when post (#"setRequestMethod" connection "POST") (#"setDoOutput" connection t) (with-output-to-string (s) (loop for (prop value) in post collect (format s "~a=~a&" prop (#"encode" 'java.net.URLEncoder value "UTF-8"))) (let ((out (new 'PrintWriter (#"getOutputStream" connection))) (params (get-output-stream-string s))) (#"println" out (subseq params 0 (- (length params) 1))) (#"close" out)))) (when head ;(#"setRequestMethod" connection "HEAD") (return-from get-url (unpack-headers (prog1 (#"getHeaderFields" connection) (#"disconnect" connection))))) (setq headers (#"getHeaderFields" connection)) (let ((responsecode (#"getResponseCode" connection))) (if (not (member responsecode '(200 301 302) :test 'eql)) (let ((errstream (#"getErrorStream" connection))) (assert errstream (errstream) "There is no errorstream! Fixme") (error "Bad HTTP response ~A: ~A" responsecode (stream->string errstream))) (let ((stream (#"getInputStream" connection))) (if (and (member responsecode '(301 302)) follow-redirects) (progn (setq url (second (assoc "Location" (unpack-headers headers) :test 'equal))) (doit)) (stream->string stream)) )))))) (if ignore-errors (multiple-value-bind (value errorp) (ignore-errors (doit)) (if errorp (progn (when verbose (format t "~a" (java-exception-message errorp))) (values (list :error errorp (java-exception-message errorp)) (unpack-headers headers))) (progn (if persist (save-url-contents-in-cache url value) value) (if dont-cache (values value (unpack-headers headers)) (values (setf (gethash url *page-cache*) value) (unpack-headers headers)))))) (progn (let ((value (doit))) (if persist (save-url-contents-in-cache url value) value) (if dont-cache (values value (unpack-headers headers)) (values (setf (gethash url *page-cache*) value) (unpack-headers headers))))) )))) (defun persist-page-cache () (maphash (lambda(k v) (unless (probe-file (url-cached-file-name v)) (save-url-contents-in-cache k v))) *page-cache*)) (defun header-value (header headers) (cadr (assoc header headers :test 'equal))) (defun maybe-rewrite-for-tunnel (url tunnel) (if tunnel (destructuring-bind (protocol path) (car (all-matches url "^([a-z]*)://[^/]*(.*)" 1 2)) (concatenate 'string protocol "://" tunnel path)) url)) (defun unpack-headers (headers) (and headers (loop for key in (set-to-list (#"keySet" headers)) for value = (#"get" headers key) when value collect (cons key (loop for i below (#"size" value) collect (#"get" value i))) ))) #|(defun get-url (url &key force-refetch dont-cache persist) "Get the contents of a page, saving it for this session in *page-cache*, so when debugging we don't keep fetching" (or (and (not force-refetch) (gethash url *page-cache*)) (and persist (probe-file (url-cached-file-name url)) (get-url-from-cache url)) (multiple-value-bind (value errorp) (ignore-errors (let ((stream (#"openStream" (new 'net.url url))) (buffer (jnew-array "byte" 4096))) (apply 'concatenate 'string (loop for count = (#"read" stream buffer) while (plusp count) collect (#"toString" (new 'lang.string buffer 0 count)) )))) (if errorp (progn (list :error (java-exception-message errorp))) (if dont-cache value (setf (gethash url *page-cache*) (save-url-contents-in-cache url value)))))))|# (defun cache-url ()) (defun url-cached-file-name (url) (let ((it (new 'com.hp.hpl.jena.shared.uuid.MD5 url))) (#"processString" it) (let* ((digest (#"getStringDigest" it)) (subdirs (coerce (subseq digest 0 4) 'list))) (merge-pathnames (make-pathname :directory (cons :relative (mapcar 'string subdirs)) :name digest :type "urlcache") (config :web-cache))))) (defun save-url-contents-in-cache (url content) (let ((fname (url-cached-file-name url))) (ensure-directories-exist fname) (with-open-file (f fname :direction :output :if-does-not-exist :create) (format f "~s" url) (write-string content f)))) (defun get-url-from-cache (url) (let ((fname (url-cached-file-name url))) (with-open-file (f fname :direction :input) (let ((url-saved (read f))) (assert (equalp url url-saved) () "md5 collision(!) ~s, ~s" url url-saved) (let ((result (make-string (- (file-length f) (file-position f))))) (read-sequence result f) result))))) (defun forget-cached-url (url) (delete-file (url-cached-file-name url)) (remhash url *page-cache*)) (defun java-exception-message (exception) (ignore-errors (caar (all-matches (#"toString" (slot-value exception 'system::cause)) "(?s)=+\\s*(.*?)\\n" 1)))) (defun wikipedia (term) "Lookup a wikipedia page by name and return it's url. If ambiguous, return :ambiguous. If missing return :missing" (let ((page (get-url (format nil "http://en.wikipedia.org/wiki/~a" (#"replaceAll" term " " "_"))))) (if (consp page) page (let ((pagename (caar (all-matches page "