;; Cache for the synonym table. NIL until INFOBIOGEN-SYNONYMS installs an
;; EQUALP hash table here; PARSE-INFOBIOGEN-SYNONYMS then fills it, mapping a
;; cleaned gene name or alias string to the list (name . aliases).
(defparameter *infobiogen-synonyms* nil)
;; (defun parse-infobiogen-synonyms ()
;; (let ((page (get-url "http://www.infobiogen.fr/services/chromcancer/Genes/Geneliste.html")))
;; (let ((bit (subseq page 4000 10000)))
;; (let ((entries (all-matches bit "(?s)[^<]*[^<]*
;; " 0)))
;; (length entries)))))
;; Fetch the Infobiogen gene list page, split it into per-gene entries and
;; record names/aliases in *INFOBIOGEN-SYNONYMS*.
;;
;; For every entry a name and a list of aliases are extracted.  Each name or
;; alias for which UNIQUE-HUMAN-GENE-ID? returns false is stored in
;; *INFOBIOGEN-SYNONYMS* under its cleaned string, with (name . aliases) as
;; the value.
;;
;; Returns two values: the number of names/aliases for which
;; UNIQUE-HUMAN-GENE-ID? returned true, and the number that were stored.
;;
;; This function writes with (SETF GETHASH) and never creates the table, so
;; *INFOBIOGEN-SYNONYMS* must already hold a hash table when it is called
;; (INFOBIOGEN-SYNONYMS installs one before calling it).
;;
;; NOTE(review): several pattern literals below contain bare line breaks and
;; look as if markup text has been lost from them -- confirm against the
;; actual page format before relying on them.
(defun parse-infobiogen-synonyms ()
(let ((page (get-url "http://www.infobiogen.fr/services/chromcancer/Genes/Geneliste.html" :persist t)))
;; Keep only the tail of the page, starting where SEARCH finds the marker.
;; NOTE(review): SEARCH does a literal subsequence match and returns NIL when
;; the marker is absent -- confirm the marker text is meant literally.
(let ((bit (subseq page (search "Annotated genes
-- \\S+A --
" page))))
;; Each element of ENTRIES is destructured as a one-element list below.
;; Presumably the last argument to ALL-MATCHES selects the regex group(s)
;; to return -- TODO confirm against its definition.
(let ((entries (all-matches bit "(?si)\\s*
" 0)))
(loop for (entry) in entries
;; HAS counts ids accepted by UNIQUE-HUMAN-GENE-ID?; HASNT counts table stores.
with has = 0 and hasnt = 0
;; Collapse the matched separator pattern to a single space.
for clean = (#"replaceAll" entry "(?si)( \\s*)|
" " ")
;; NAME is the first group of the first match only.
for ((name)) = (all-matches clean ">(.*)<" 1)
;; One alias string per "Alias ..." match in the entry.
for aliases = (mapcar 'car (all-matches clean "(?si).*?Alias\\s*(.*?)(\\n|
)" 1))
do
;; Normalise whitespace runs in the name.
;; NOTE(review): assumes NAME is a non-empty string; the CHAR call below
;; indexes its last character -- confirm an entry can never lack a name.
(setq name (#"replaceAll" name "\\s+" " "))
;; If the name ends with ")", drop everything from the first
;; whitespace-then-"(" onwards.
(when (char= (char name (1- (length name))) #\))
(setq name (#"replaceAll" name "\\s\\(.*" "")))
(if (unique-human-gene-id? name)
(incf has)
(progn (setf (gethash name *infobiogen-synonyms*) (cons name aliases)) (incf hasnt)))
;; Same cleaning and bookkeeping for every alias.  Only the hash key is
;; cleaned; the stored ALIASES list keeps the strings as extracted.
(dolist (alias aliases)
(setq alias (#"replaceAll" alias "\\s+" " "))
(when (char= (char alias (1- (length alias))) #\))
(setq alias (#"replaceAll" alias "\\s\\(.*" "")))
(if (unique-human-gene-id? alias)
(incf has)
(progn (setf (gethash alias *infobiogen-synonyms*) (cons name aliases)) (incf hasnt))))
finally (return (values has hasnt)))))))
;; Return the synonym table, building it on the first call.
;;
;; If *INFOBIOGEN-SYNONYMS* is already non-NIL it is returned as is.
;; Otherwise a fresh EQUALP hash table is filled by PARSE-INFOBIOGEN-SYNONYMS
;; and then cached in *INFOBIOGEN-SYNONYMS*.
;;
;; The table is populated under a dynamic rebinding of the special variable
;; and only assigned to the global once parsing has returned normally.  If
;; PARSE-INFOBIOGEN-SYNONYMS signals an error (e.g. the page cannot be
;; fetched), the global stays NIL, so the next call retries instead of
;; returning an empty or half-filled table forever.
(defun infobiogen-synonyms ()
  (or *infobiogen-synonyms*
      (setq *infobiogen-synonyms*
            ;; PARSE-INFOBIOGEN-SYNONYMS writes through the special variable,
            ;; so it sees this temporary binding while it runs.
            (let ((*infobiogen-synonyms* (make-hash-table :test 'equalp)))
              (parse-infobiogen-synonyms)
              *infobiogen-synonyms*))))
;\\s*
#|
CTIP2 (Ctip-2) chicken ovalbumin upstream promoter transcription factor (COUP-TF)-interacting protein
Alias BCL11B (B-cell lymphoma/leukaemia 11B)
Alias Rit1 zinc finger protein hRit1 alpha (not to be confused with RIT1 on chr. 1q22)
|#
#|
(.*?)( \\s*)+(.*)\\n(
( )+Alias (.*?)( \\s*)*(.*)(\\n)*