;; Example use of opennlp http://opennlp.sourceforge.net/
;; http://opennlp.sourceforge.net/api/index.html
;;
;; This file does not define the following names; they must already be
;; available when it is loaded: the `new' function and the #"method" reader
;; syntax, get-java-field, set-to-list, make-immediate-object, jnew-array,
;; jarray-set, *open-nlp-path* and tagdoc.

(defvar *sentence-detector*
  (new 'english.SentenceDetector
       (format nil "~a/models/sentdetect/EnglishSD.bin.gz" *open-nlp-path*))
  "Sentence detector built from the EnglishSD model under *open-nlp-path*.")

(defun split-into-sentences (string)
  "Run *sentence-detector*'s sentDetect on STRING and return a list holding
the toString of each element of the result."
  (map 'list #"toString" (#"sentDetect" *sentence-detector* string)))

(defvar *tokenizer*
  (new 'english.Tokenizer
       (format nil "~a/models/tokenize/EnglishTok.bin.gz" *open-nlp-path*))
  "Tokenizer built from the EnglishTok model under *open-nlp-path*.")

(defun tokenize-sentence (string)
  "Run *tokenizer*'s tokenize on STRING and return a list holding the
toString of each element of the result."
  (map 'list #"toString" (#"tokenize" *tokenizer* string)))

;; NOTE(review): the two T arguments are presumably the useTagDictionary /
;; useCaseInsensitiveTagDictionary flags of TreebankParser.getParser --
;; confirm against the OpenNLP API documentation for the version in use.
(defvar *parser*
  (#"getParser" 'TreebankParser
                (format nil "~a/models/parser/" *open-nlp-path*)
                t
                t
                (get-java-field 'ParserME "defaultBeamSize")
                (get-java-field 'ParserME "defaultAdvancePercentage"))
  "Parser obtained from TreebankParser.getParser using the models under
*open-nlp-path* and ParserME's default beam size and advance percentage.")

(defvar *tagger*
  (new 'opennlp.tools.lang.english.ParserTagger
       (format nil "~a/models/postag/EnglishPOS.bin.gz" *open-nlp-path*)
       (make-immediate-object nil :ref))
  "ParserTagger built from the EnglishPOS model under *open-nlp-path*; the
second constructor argument is a null reference.")

(defun treebank-parse (string &optional (howmany 5))
  "Call TreebankParser.parseLine on STRING with *parser* and HOWMANY
(default 5) and return a list with each resulting parse converted by
parse2sexp."
  (map 'list 'parse2sexp
       (#"parseLine" 'TreebankParser string *parser* howmany)))

;; Sample text used by test-parse.
;; http://www.med.harvard.edu/AANLIB/cases/case26/case.html
(defvar *something-clinical*
  "This 62 year old man came to the emergency room after suffering a first seizure, a tonic-clonic convulsion with focal onset, witnessed by his wife. He suddenly became quiet, had eye deviation to the left, and began to twitch in the left face. He became unresponsive to verbal commands, and had generalized jerking movements of arms and legs, lasting for a few minutes. There was a history of carcinoma of the colon, with recent metastasis to the liver and lung. MR images show a lesion involving the right second frontal convolution and another in the cerebellum, near the fourth ventricle, also visible on the sagittal imagemap. There is contrast enhancement of the rim of both lesions. Metastatic brain lesions are typically but not always multiple. 
The low signal on T2-weighted images of the frontal lesion is remarkable, since metastases are often associated with high signal. There is very little surrounding edema or distortion of the frontal cortical architecture. In contrast, the cerebellar lesion is quite swollen and displaces the underlying brainstem, a potentially dangerous situation because of the proximity of vital brainstem centers involved in regulation of basic functions such as ventilation. This lesion required prompt attention and careful monitoring.")

;; from opennlp.tools.parser.Parser.show()
(defun parse2sexp (parse)
  "Convert the OpenNLP parse object PARSE into a nested list.

Each child of PARSE is converted recursively.  Stretches of the parse text
that no child covers are included as strings.  When PARSE's type equals
ParserME.TOK_NODE the bare list of pieces is returned; otherwise the list
is headed by the type interned as a keyword, and a warning is signalled if
tagdoc returns NIL for that keyword."
  (let* ((tok-node (get-java-field 'ParserME "TOK_NODE"))
         (span (#"getSpan" parse))
         (end (#"getEnd" span))
         ;; START is advanced past each child as the loop below runs.
         (start (#"getStart" span))
         (type (#"getType" parse))
         (text (#"getText" parse))
         (inner
           (loop for inner-parse across (#"getChildren" parse)
                 for inner-span = (#"getSpan" inner-parse)
                 for inner-start = (#"getStart" inner-span)
                 ;; Text between children is kept only when the gap is
                 ;; longer than one character, so a lone separator
                 ;; character between siblings is not emitted.
                 when (< start (1- inner-start))
                   collect (#"substring" text start inner-start)
                 collect (parse2sexp inner-parse)
                 do (setq start (#"getEnd" inner-span)))))
    ;; Text after the last child.  For a node without children this is the
    ;; node's whole text, so any non-empty remainder must be kept: the
    ;; previous test (> end (+ start 1)) discarded one-character spans.
    (when (> end start)
      (setq inner (append inner (list (#"substring" text start end)))))
    (if (equal type tok-node)
        inner
        (let ((label (intern type 'keyword)))
          (unless (tagdoc label)
            (warn "Unknown tag ~a" type))
          (cons label inner)))))

(defun test-parse ()
  "For each sentence of *something-clinical*: print the sentence, tokenize
it, join the tokens with spaces, and pretty-print the first result of
treebank-parse called with HOWMANY = 1."
  (loop for sentence in (split-into-sentences *something-clinical*)
        do (format t "~%~%~a~%" sentence)
           (pprint
            (car (treebank-parse
                  (format nil "~{~a ~}" (tokenize-sentence sentence))
                  1)))))

;; training: https://sourceforge.net/forum/forum.php?thread_id=1277438&forum_id=9943
;; sentence detector: one sentence per line
;; pos-tagger: one sentence per line, word_tag
;; chunker: as you say one word per line: word tag chunk_tag
;;   blank line between sentences
;; name: one sentence per line sgml tags: Name
;; parser: one sentence per line, treebank style parses
;;
;; http://lieke.artiintel.com/index.php?option=com_content&task=view&id=5&Itemid=26
;; https://sourceforge.net/forum/forum.php?thread_id=1402402&forum_id=9943

(defun tag (list)
  "Copy the elements of LIST into a java.lang.String array, pass it to
*tagger*'s topKSequences, and return a list with, for each resulting
sequence, its getOutcomes converted by set-to-list."
  (let ((array (jnew-array "java.lang.String" (length list))))
    (loop for el in list
          for i from 0
          do (jarray-set array el i))
    (loop for res across (#"topKSequences" *tagger* array)
          collect (set-to-list (#"getOutcomes" res)))))

;; http://acl.ldc.upenn.edu/W/W05/W05-0603.pdf looking for biological noun phrases
;; http://acl.ldc.upenn.edu/ archive of comp linguistics papers.