;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Create a mapping from issn to journal title and abbreviation. ;; ;; Journal titles are at: ftp.ncbi.nih.gov/pubmed/J_Entrez.gz ;; Entry looks like this: ;; -------------------------------------------------------- ;; JrId: 1 ;; JournalTitle: AADE editors' journal. ;; MedAbbr: AADE Ed J ;; ISSN: 0160-6999 ;; ESSN: ;; IsoAbbr: ;; NlmId: 7708172 ;; -------------------------------------------------------- (defvar *issn-to-journal* (make-hash-table :test 'equal)) ; returns a list: journal full name, journal name abbreviation (defun issn-to-journal (&optional (path (config :j_entrez))) (let* ((re (compile-regex "(.+?): (.+)")) (table *issn-to-journal*)) (if (> (hash-table-count table) 0) table (with-open-file (f path) (loop for line = (read-line f nil :eof) with issn and abbrev and title until (eq line :eof) do (if (char= (char line 0) #\-) (when issn (setf (gethash issn table) (list title abbrev)) (setq issn nil abbrev nil title nil)) (let* ((match (car (all-matches line re 1 2))) (field (car match)) (value (second match))) (cond ((string-equal field "ISSN") (setq issn value)) ((string-equal field "JournalTitle") (setq title value)) ((string-equal field "MedAbbr") (setq abbrev value)))))) table)))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; ;; Pubmed id to title map ;; Map id (as string) to list: title citation id date (to match what I get from Ingenuity) ;; Call (get-pmid-titles-etc pmids) and get back a list of lists, each is (title citation id date) ;; ;; Implementation: ;; ;; Titles are retrieved from ncbi using their ESummary web service ;; http://eutils.ncbi.nlm.nih.gov/entrez/query/static/esummary_help.html ;; ;; Give it a url such as for the following pmids: 12429098,10995770 ;; http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id=12429098,10995770&retmode=text&email=alanr@mpi.com&tool=paris ;; Use portable aserve's do-http-request to do the query ;; Returns some xml with information. ;; Use xmls to parse that. ;; ;; Batch them up into requests of at most 50 at a time (to not make too large a url). ;; Wait 3 seconds between requests (per ncbi instructions) ;; As you retrieve them save them in *pmid-title-etc-cache* and on disk ;; ;; Exception handling is primitive. It's your responsibility to supply valid pmids. ;; There is an issue that has come up with hprd where they supply what appears to be ;; a medline id which isn't the same (sometimes) as a pubmed id. NCBI returns information ;; with the pubmed id so matchup is confusing. ;; Currently this is handled by warning that you are not getting back all the ids you asked for. ;; You then need to edit shifted-ids in get-pmid-titles-etc adding (list pmid medlineid) to the ;; alist you see there. ;; ;; When this happens there is some notation in the xml - the "ArticleIds" field will note a ;; pubmed and medline id. I decided not to go there yet - there were very few of these oddities in ;; a run of 25k journal articles and I don't want to see what kind of further oddities show up ;; if I look at this field all the time. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (defun read-pmid-titles-etc (&optional (file (config :pubmed-titles))) (let ((table (make-hash-table :test 'equal))) (when (probe-file file) (with-open-file (f file) (read-line f) ; skip headers (loop for line = (read-line f nil :eof) until (eq line :eof) for (id title citation date) = (split-at-char line #\tab) do (setf (gethash id table) (list title citation id (if (or (null date) (equal date "")) (progn (print id) 0) (parse-integer date))))))) table)) (defvar *pmid-title-etc-cache* nil) (defun cached-pmid-titles-etc (pmids) (unless *pmid-title-etc-cache* (setq *pmid-title-etc-cache* (read-pmid-titles-etc))) (loop for pmid in pmids for already = (gethash pmid *pmid-title-etc-cache*) if already collect already into got else collect pmid into dont-got finally (return (values got dont-got)))) (defun cache-pmid-titles-etc (entries) (unless *pmid-title-etc-cache* (setq *pmid-title-etc-cache* (read-pmid-titles-etc))) (loop for entry in entries for id = (third entry) do (setf (gethash id *pmid-title-etc-cache*) entry)) (write-pmid-titles-etc entries) entries) (defun write-pmid-titles-etc (entries &optional (file (config :pubmed-titles))) (with-open-file (f file :if-exists :append :direction :output :if-does-not-exist :create) (when (zerop (file-length f)) (print-tabbed f "id" "title" "citation" "date")) (if (hash-table-p entries) (loop for (title citation id date) being the hash-values of entries do (print-tabbed f id title citation date)) (loop for (title citation id date) in entries do (print-tabbed f id title citation date))))) (defun remove-xml-headers (string) (#"replaceFirst" string "(?s)<.*?>\\s*<.*?>\\s*" "")) (defun get-pmid-titles-etc (pmids) (let ((shifted-ids '(("11806965" "21665855") ("8961069" "97120398") ("8425219" "93145324") ("11114888" "20566643"))) ; (a b) query b and you get back a instead. Use this list to fix. ; Upon inspection one finds that there is an ArticleIds items that lists ; a as the pubmed id and b as the medline id (error-pmids '("112114015" "764255" "153029355" "101851270" "12517743" "12101255" "14970315" "176260"))) ; should have a better soln. But for now these pmids don't retrieve (setq pmids (set-difference pmids error-pmids)) (let ((per-hit 50)) (multiple-value-bind (already rest-pmids) (cached-pmid-titles-etc pmids) (append already (loop for some-pmids on rest-pmids by (lambda(l) (nthcdr per-hit l)) for bucket = (subseq some-pmids 0 (min (length some-pmids) per-hit)) for retrieved = (cache-pmid-titles-etc (let* ((result (xmls::parse (remove-xml-headers (get-url (format nil "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id=~{~a~^,~}~a" bucket "&retmode=xml&email=alanr@mumble.net&tool=hcls"))))) (date-re (compile-regex "(\\d+).*?([A-Za-z]*)")) (issn-to-journal (issn-to-journal))) (loop with (summary nil . docs) = result for (docsum nil idpart . items) in docs for (id nil pmid) = (if (listp idpart) idpart nil) unless (null id) ; null if there was an error getting information for this id do (assert (and (equal docsum "DocSum") (equal summary "eSummaryResult") (equal id "Id")) () "Error in pubmed xml document") and collect (let* ((plist (loop for (nil (nil (nil field)) value) in items collect (list field value))) (location (second (assoc "So" plist :test 'string-equal)))) (when (not (scan date-re location)) (print "Couldn't get date in ~a for pmid~&" location)) (list (second (assoc "Title" plist :test 'string-equal)) (format nil "~a, ~a" (second (gethash (second (assoc "issn" plist :test 'string-equal)) issn-to-journal)) location) (or (second (assoc pmid shifted-ids :test 'string=)) pmid) (destructuring-bind (year month) (car (all-matches location date-re 1 2)) (encode-universal-time 0 0 0 1 (let ((monthnum (position month '("Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep" "Oct" "Nov" "Dec") :test 'string-equal))) (if monthnum (1+ monthnum) 1)) ; sometimes there is no month (parse-integer year))))) when (null id) do (warn "Error looking up pubmed id ~a" idpart) ))) append retrieved do (when (set-difference bucket (mapcar 'third retrieved) :test 'string=) (warn "got pubmed info for these ids that I didn't ask for: ~a and didn't get these ~a that I did" (set-difference (mapcar 'third retrieved) bucket :test 'string=) (set-difference bucket (mapcar 'third retrieved) :test 'string=))) when (> (length some-pmids) per-hit) do (sleep 3) )))))) ;; same for mesh terms ;; Except that the service seems to be a superset of the one above so I should probably merge code ;; And it also returns abstract and other information that might be worth caching. ;; And the duplication of code is ugly ;; And.. (defun read-pmid-mesh-terms (&optional (file (config :mesh-terms))) (let ((table (make-hash-table :test 'equal))) (when (probe-file file) (with-open-file (f file) (read-line f) ; skip headers (loop for line = (read-line f nil :eof) until (eq line :eof) for (id major minor) = (split-at-char line #\tab) do (setf (gethash id table) (list (split-at-char major #\|) (split-at-char minor #\|)))))) table)) (defvar *pmid-mesh-terms-cache* nil) (defun cached-pmid-mesh-terms (pmids) (unless *pmid-mesh-terms-cache* (setq *pmid-mesh-terms-cache* (read-pmid-mesh-terms))) (loop for pmid in pmids for already = (gethash pmid *pmid-mesh-terms-cache*) if already collect (cons pmid already) into got else collect pmid into dont-got finally (return (values got dont-got)))) (defun cache-pmid-mesh-terms (entries) (unless *pmid-mesh-terms-cache* (setq *pmid-mesh-terms-cache* (read-pmid-mesh-terms))) (loop for (id major minor) in entries do (setf (gethash id *pmid-mesh-terms-cache*) (list major minor)) ) (write-pmid-mesh-terms entries) entries) (defun write-pmid-mesh-terms (entries &optional (file (config :mesh-terms))) (with-open-file (f file :if-exists :append :direction :output :if-does-not-exist :create) (when (zerop (file-length f)) (print-tabbed f "id" "major" "minor")) (if (hash-table-p entries) (loop for (major minor) being the hash-values of entries using (hash-key id) do (print-tabbed f id (join-with-char major #\|) (join-with-char minor #\|))) (loop for (id major minor) in entries do (print-tabbed f id (join-with-char major #\|) (join-with-char minor #\|)))) entries)) (defun pmid-mesh-terms (pmids &optional (which '(:major :minor))) (let ((per-hit 50)) (multiple-value-bind (already rest-pmids) (cached-pmid-mesh-terms pmids) (loop for (id major minor) in (append already (loop for some-pmids on rest-pmids by (lambda(l) (nthcdr per-hit l)) for bucket = (subseq some-pmids 0 (min (length some-pmids) per-hit)) append (cache-pmid-mesh-terms (loop with (nil nil . docs) = (xmls::parse (remove-xml-headers (get-url (format nil "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=~{~a~^,~}&retmode=xml&rettype=medline~a" bucket "&tool=paris&email=alanr@mpi.com") ))) for doc in docs for pmid = (third (find-element-with-tag doc "PMID")) collect (loop for (nil attr term) on (find-elements-with-tag doc "MeshHeading" "DescriptorName") by #'cdddr if (equal (second (assoc "MajorTopicYN" attr :test 'string=)) "N") collect term into minor else collect term into major finally (return (list pmid major minor))))) when (> (length some-pmids) per-hit) do (sleep 3) )) collect (list id (append (if (member :major which) major) (if (member :minor which) minor))))))) (defun fetch-pubmed-abstracts (pmids) (let ((entry (xmls::parse (remove-xml-headers (get-url (format nil "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=~{~a~^,~}&retmode=xml&rettype=medline~a" pmids "&tool=paris&email=alanr@mpi.com") ))))) (loop for pmid in pmids for title in (mapcar 'third (find-elements-with-tag entry "ArticleTitle")) for abstract in (mapcar 'third (find-elements-with-tag entry "AbstractText")) collect (list pmid title abstract))))