(in-package :cl-user) (defclass lnc-parse () ((raw :accessor raw :initarg :raw) (path :accessor path :initarg :path) (lexicon-entries :initarg :lexicon-entries :initform (make-hash-table :test 'equal) :accessor lexicon-entries) (sentences :initarg :sentences :initform nil :accessor sentences) (tokens :initarg :tokens :initform (make-hash-table :test 'eql) :accessor tokens) )) (defmethod initialize-instance ((p lnc-parse) &key path) (call-next-method) (setf (raw p) nil) (with-open-file (f path) (setf (raw p) (xmls::parse f)))) (defmethod print-object ((p lnc-parse) stream) (print-unreadable-object (p stream) (format stream "~a, ~a paragraphs" (pathname-name (path p)) (length (find-elements-with-tag (raw p) "paragraph"))))) (defmethod as-string ((p lnc-parse)) (with-output-to-string (s) (loop for par in (find-elements-with-tag (raw p) "paragraph") do (format s "~{~a~^ ~}~%~%" (mapcar 'third (find-elements-with-tag par "string"))) ))) (defmethod as-words ((p lnc-parse)) (loop for par in (find-elements-with-tag (raw p) "paragraph") append (mapcar 'third (find-elements-with-tag par "string")))) (defmethod word-snomeds ((p lnc-parse)) (loop for token in (find-elements-with-tag (raw p) "token") for string = (third (find-element-with-tag token "string")) for codes = (find-elements-with-tag token "code") with table = (make-hash-table :test 'equal) do (loop for code in codes when (equal (attribute-named code "coding-system") "SNCT CATEGORY") do (pushnew (attribute-named code "kn") (gethash string table) :test 'equal)) finally (return (loop for k being the hash-keys of table using (hash-value v) collect (cons k v))))) (defmethod extract-lexicon-entries ((p lnc-parse)) (let ((count 0) (total (length (find-elements-with-tag (raw p) "grammar-id")))) (loop for token in (find-elements-with-tag (raw p) "token") for string = (third (find-element-with-tag token "string")) for lexicon-entries = (find-immediate-children-with-tag token "grammar-id") do (incf count (length lexicon-entries)) (loop for entry in lexicon-entries do (pushnew entry (gethash string (lexicon-entries p)) :test 'equal))) (unless (= count total) (break)))) (defmethod extract-tokens ((p lnc-parse)) (loop for token in (find-elements-with-tag (raw p) "token") for string = (third (find-element-with-tag token "string")) with table = (make-hash-table :test 'equal) for tokeni = (make-instance 'token :type (attribute-named token "type") :number(attribute-named token "number") :id (attribute-named token "id") :string string :start (attribute-named token "begin") :end (attribute-named token "end")) do (setf (gethash (attribute-named token "id") table) tokeni) finally (setf (tokens p) table))) (defmethod extract-sentences ((p lnc-parse)) (flet ((true/false (val) (if (equal val "true") t (if (equal val "false") nil (error "Expected true/false but got: ~a" val))))) (loop for sentence in (find-elements-with-tag (raw p) "sentence") with table = (make-hash-table :test 'equal) collect (setf (gethash (attribute-named sentence "sentence_id") table ) (make-instance 'sentence :id (attribute-named sentence "sentence_id") :begin (attribute-named sentence "begin") :end (attribute-named sentence "end") :clause-or-phrases (loop for phrase in (find-elements-with-tag sentence "clause_or_phrase") for tokens = (mapcar (lambda(tok) (gethash (attribute-named tok "id") (tokens p))) (find-immediate-children-with-tag phrase "token")) collect (make-instance 'clause-or-phrase :tokens tokens :future (true/false (attribute-named phrase "future")) :negation (true/false (attribute-named phrase "negation")) :modality (true/false (attribute-named phrase "modality")))))) into sentences finally (setf (sentences p) sentences) ))) ;; with table = (make-hash-table :test 'equal) ;; do ;; (loop for code in codes ;; when (equal (attribute-named code "coding-system") "SNCT CATEGORY") ;; do (pushnew (attribute-named code "kn") (gethash string table) :test 'equal)) ;; finally ;; (return (loop for k being the hash-keys of table using (hash-value v) ;; collect (cons k v))))) (defclass lexicon-entry () ()) (defclass token () ((token-type :initarg :type :initform nil :accessor token-type) (token-number :initarg :number :initform nil :accessor token-number) (token-id :initarg :id :initform nil :accessor token-id) (token-start :initarg :start :initform nil :accessor token-start) (token-end :initarg :end :initform nil :accessor token-end) (token-string :initarg :string :initform nil :accessor token-string) )) (defclass sentence () ((clause-or-phrases :initarg :clause-or-phrases :initform nil :accessor clause-or-phrases) (dont-parse :initarg :dont-parse :initform nil :accessor dont-parse) (sentence-begin :initarg :begin :initform nil :accessor sentence-begin) (sentence-end :initarg :end :initform nil :accessor sentence-end) (sentence-id :initarg :id :initform nil :accessor sentence-id) )) (defclass clause-or-phrase () ((tokens :initarg :tokens :initform nil :accessor tokens) (future :initarg :future :initform nil :accessor future) (negation :initarg :negation :initform nil :accessor negation) (modality :initarg :modality :initform nil :accessor modality) )) (defmethod print-object ((o token) stream) (print-unreadable-object (o stream :type t) (format stream "~a(~a) ~a:\"~a\" ~a-~a" (token-id o)(token-number o) (token-type o) (token-string o) (token-start o) (token-end o)))) ; (pprint (word-snomeds p)) (defun hello-lnc (path) (let ((p (make-instance 'lnc-parse :path path))) (print (as-words p)) (pprint (word-snomeds p)) p)) #| "target" (of a link - semantic rdf) "inferredconcepts" (semantic rdf) "origin" (part-of-link) "token" :id "id" "tag-sequence" looks like probability of each tag assignment "component" part of header - java class "clause_or_phrase" (captures modality, future tense, etc) "text" (main tag) "flow" (main) "parse" "id" (sort of like grammar-id) "edge" (part of parse) "dependent" "code" "link" "linktype" "paragraph" "source" "reftoken" "string" "param" "sentence" "infl_form" "grammar-id" Tag nesting structure. (map nil 'print (loop for tag in (what-tags (raw p)) collect (list tag (remove tag (what-tags (find-element-with-tag (raw p) tag)) :test 'equal)))) ("text" ("target" "inferredconcepts" "origin" "token" "tag-sequence" "component" "clause_or_phrase" "flow" "parse" "id" "edge" "dependent" "code" "link" "linktype" "paragraph" "source" "reftoken" "string" "param" "sentence" "infl_form" "grammar-id")) ("target" ("origin" "code")) ("inferredconcepts" ("reftoken" "string" "token" "code")) ("token" ("string" "id" "grammar-id")) ("component" ("param")) ("clause_or_phrase" ("string" "token" "id" "grammar-id")) ("flow" ("param" "component")) ("parse" ("edge" "dependent")) ("edge" ("dependent")) ("dependent" ("edge")) ("link" ("linktype" "target" "source" "origin" "code")) ("linktype" ("origin" "code")) ("paragraph" ("string" "token" "tag-sequence" "sentence" "clause_or_phrase" "parse" "id" "edge" "grammar-id" "dependent")) ("source" ("origin" "code")) ("sentence" ("string" "token" "tag-sequence" "clause_or_phrase" "parse" "id" "edge" "grammar-id" "dependent")) ("infl_form" NIL) ("grammar-id" NIL) ("reftoken" NIL) ("string" NIL) ("param" NIL) ("tag-sequence" NIL) ("origin" NIL) ("id" NIL) ("code" NIL) (map nil 'print (loop for tag in (what-tags (raw p)) collect (list tag (remove tag (what-immediate-attributes (find-element-with-tag (raw p) tag)) :test 'equal)))) ("target" ("begin" "end")) ("inferredconcepts" NIL) ("origin" ("fromId")) ("token" ("number" "begin" "end" "type" "id")) ("tag-sequence" ("probs" "sequence" "prob")) ("component" ("timestamp" "name" "class" "duration")) ("clause_or_phrase" ("future" "negation" "modality")) ("text" NIL) ("flow" NIL) ("parse" ("probs" "validator" "type")) ("id" ("number" "features" "root")) ("edge" ("end_node" "head" "start_node" "word_class" "head_node" "grammar-id-number")) ("dependent" ("slot" "function")) ("code" ("kn" "coding-system" "id")) ("link" ("id")) ("linktype" ("begin" "end")) ("paragraph" ("begin" "end")) ("source" ("begin" "end")) ("reftoken" ("deptoken" "refid")) ("string" ("begin" "end")) ("param" ("key" "value")) ("sentence" ("dont_parse" "sentence_id" "begin" "end")) ("infl_form" ("features")) ("grammar-id" ("number" "features" "root")) (map nil 'print (loop for tag in (what-tags (raw p)) collect (list tag (what-immediate-tags (find-element-with-tag (raw p) tag))))) ("target" ("origin" "code")) ("inferredconcepts" ("token")) ("origin" NIL) ("token" ("string" "id" "grammar-id")) ("tag-sequence" NIL) ("component" ("param")) ("clause_or_phrase" ("token")) ("text" ("paragraph" "flow")) ("flow" ("component")) ("parse" ("edge")) ("id" NIL) ("edge" ("dependent")) ("dependent" ("edge")) ("code" NIL) ("link" ("linktype" "target" "source")) ("linktype" ("origin" "code")) ("paragraph" ("sentence")) ("source" ("origin" "code")) ("reftoken" NIL) ("string" NIL) ("param" NIL) ("sentence" ("tag-sequence" "clause_or_phrase" "parse")) ("infl_form" NIL) ("grammar-id" NIL) |#