(defun foreach-antibody-entry (function &optional limit cleanup)
  "Call FUNCTION once for each data row of the Alzforum antibody dump.

The dump is the file named by (config :alzforum-antibody-db-dump).  Its
first line is split on tabs and each column name is upcased and interned
as a keyword.  Every following line is split on tabs and handed to
FUNCTION as an alist of (header-keyword . field-string).

LIMIT, when non-NIL, stops the iteration after LIMIT rows.
CLEANUP, when non-NIL, passes the fields through MAYBE-CLEAN-CHARACTERS
before they are paired with the headers.

Always returns NIL."
  (let ((count 0))
    (with-open-file (f (config :alzforum-antibody-db-dump))
      (let ((headers (mapcar (lambda (header)
                               (intern (string-upcase header) 'keyword))
                             (split-at-regex (read-line f) "\\t"))))
        ;; Echo the column names that were found in the header row.
        (print headers)
        (loop for line = (read-line f nil :eof)
              until (eq line :eof)
              for fields = (if cleanup
                               (maybe-clean-characters (split-at-regex line "\\t"))
                               (split-at-regex line "\\t"))
              do (when (and limit (> (incf count) limit))
                   (return-from foreach-antibody-entry nil))
                 ;; MAPCAR stops at the shorter list, so short rows simply
                 ;; yield fewer pairs.
                 (funcall function (mapcar 'cons headers fields)))))))

;; Gate used by MAYBE-CLEAN-CHARACTERS: matches any string that contains one
;; of the listed high-bit character codes (emitted as Java octal escapes),
;; a numeric character entity, a simple opening tag, an &nbsp; entity, or a
;; leading/trailing double quote.
;; The &nbsp; alternative had been reduced to a bare space by entity decoding
;; of this source file; it is restored here to match the cleanup table.
(defparameter *funny-character-regex*
  (load-time-value
   (#"compile" 'java.util.regex.pattern
               (format nil ".*(~{(\\0~o)|~}(&#\\d+;)|(<[a-zA-Z]*>)|(&nbsp;)|(^\")|(\"$)).*"
                       '(129 143 149 154 170 181 191 201 202 208 210 211 213)))))

(defun maybe-clean-characters (strings)
  "Return a fresh list in which every element of STRINGS that matches
*FUNNY-CHARACTER-REGEX* has been replaced by (CLEANIT element); all other
elements are passed through unchanged."
  (loop for string in strings
        if (#"matches" (#"matcher" *funny-character-regex* string))
          collect (cleanit string)
        else
          collect string))

(defparameter *character-cleanups*
  '(;; High-bit characters, written as Java regex octal escapes.  The codes
    ;; are the same ones listed in *funny-character-regex*.
    ("\\0201" "")
    ("\\0217" "e")
    ("\\0225" "i")
    ("\\0232" "o")
    ("\\0252" "(TM)")
    ;; 181 decimal is octal 265.  This entry used to read \\0262 (178), so
    ;; the micro sign was flagged by the gate regex but never replaced.
    ("\\0265" "micro")
    ("\\0277" "o")
    ("\\0311" "...")
    ("\\0312" " ")
    ("\\0320" "-")
    ("\\0322" "\"")
    ("\\0323" "\"")
    ("\\0325" "'")
    ;; Numeric character entities as they appear in the dump (the gate regex
    ;; looks for &#NNN;).  Several are typos in the data itself.
    ("&#916;" " delta")
    ("&#950;" " zeta")
    ("&#174;" "(R)")
    ("&#9456;" " alpha")                ; typo in the data for 945
    ("&#8804;" "<=")
    ("&#8482;" "(TM)")
    ("&#150;" "-")
    ("&#647;" " beta")                  ; typo in the data for 946
    ("&#146;" "'")
    ("&#8217;" "'")
    ("&#955;" " lambda")
    ("&#923;" " lambda")
    ("&#948;" " delta")
    ("&#949;" " epsilon")
    ("&#947;" " gamma")
    ("&#945;" " alpha")
    ("&#954;" " kappa")
    ("&#946;" " beta")
    ;; Markup and stray quoting.
    ("(?i)&nbsp;" " ")
    ("(?i)<[/]{0,1}(NO){0,1}BR>" " ")
    ("(?i)<[/]{0,1}SUB>" "")
    ("^\"" "")
    ("\"$" ""))
  "List of (PATTERN REPLACEMENT) pairs applied in order by CLEANIT.
PATTERN is a Java regular expression, REPLACEMENT the text substituted for
every match.")

(defun cleanit (string)
  "Apply every (pattern replacement) pair of *CHARACTER-CLEANUPS* to STRING,
in table order, with Java's String.replaceAll, and return the result.
Returns STRING itself when the table is empty."
  (let ((result string))
    (loop for (pattern replacement) in *character-cleanups*
          do (setq result (#"replaceAll" result pattern replacement)))
    result))

(defvar *alz-gene-names* nil
  "Cache for ALZ-GENE-NAMES: NIL until first computed, then a hash table.")

(defun alz-gene-names (&optional force)
  "Return an EQUAL hash table whose keys are the :DATASHEETLINKTEXT field of
every row in the antibody dump (read with character cleanup enabled); every
value is T.  The table is cached in *ALZ-GENE-NAMES*; a non-NIL FORCE
rebuilds it."
  (or (and (not force) *alz-gene-names*)
      (let ((table (make-hash-table :test 'equal)))
        (foreach-antibody-entry
         (lambda (entry)
           (setf (gethash (cdr (assoc :datasheetlinktext entry)) table) t))
         nil t)
        (setq *alz-gene-names* table))))

;; (class !Supplier :partial)
;; (class !Manufacturer :partial !supplier)
;; (class !Supplier :partial !supplier)
;; (class !epitopeDescription :partial)
;; (class !SequenceRange :partial !epitopeDescription)
;; (datatype-property !fromSequencePosition range(xsd:int))
;; (datatype-property !toSequencePosition range(xsd:int))
;; (class !ProteinSequenceRange :partial !SequenceRange)
;; (object-property !ofProtein domain(!ProteinSequenceRange) range(!Protein))
;; (class !method :partial)
;; (("antibodyID" "626")
;; ("antigenID" "Bcl") ; class. (does this include families?)
;; ("datasheet" "trans610746.asp")
;; ("datasheetLinkText" "Bcl-X")
;; ("externalLink" "1")
;; ("clonalityID" "monoclonal")
;; ("cloneNum" "44")
;; ("antigenInfo" " IgG1")
;; ("companyName" "BD Biosciences PHARMINGEN")
;; http://www.bdbiosciences.com/index.shtml
;; ("distributorInfo" "Formerly Transduction")
;; entrez 598 http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=gene&cmd=Retrieve&dopt=full_report&list_uids=598
;; ("epitope" "aa. 18-233 of human Bcl-XL")
;; ("host" "mouse")
;; ("hostFastOutput" "monoclonal mouse")
;; ("hostExtraInfo" "BSA, azite")
;; ("methodFastOutput" "IF, IH, WB")
;; http://www.alzforum.org/res/com/ant/glossary.asp
;; ("methodExtraInfo" "")
;; ("specificity" "Bcl-X, 26kDa")
;; ("reactivityFastOutput" "human, rat, mouse")
;; ("reactivityExtrainfo" "NULL")
;; ("fromWhoID" "Manufacturers"))
;; (individual !monoclonal (type !MonoclonalAntibody))
;; (individual !polyclonal (type !PolyClonalAntibody))
;; (individual !alzant:626 !Antibody
;; (value !hasClonality !monoclonal)
;; (value ! 
;; ) ;; (loop for a in ;; (map 'list #"toString" (#"split" "antibodyID antigenID datasheet datasheetLinkText externalLink clonalityID cloneNum antigenInfo companyName distributorInfo epitope host hostFastOutput hostExtraInfo methodFastOutput methodExtraInfo specificity reactivityFastOutput reactivityExtrainfo fromWhoID" ;; "\\t")) ;; for b in ;; (map 'list #"toString" (#"split" "626 Bcl trans610746.asp Bcl-X 1 monoclonal 44 IgG1 BD Biosciences PHARMINGEN Formerly Transduction aa. 18-233 of human Bcl-XL mouse monoclonal mouse BSA, azite IF, IH, WB Bcl-X, 26kDa human, rat, mouse NULL Manufacturers")) ;; collect (list a b)) ;; (("antibodyID" "626") ;; ("antigenID" "Bcl") ; class. (does this include families?) ;; ("datasheet" "trans610746.asp") ;; ("datasheetLinkText" "Bcl-X") ;; ("externalLink" "1") ;; ("clonalityID" "monoclonal") ;; ("cloneNum" "44") ;; ("antigenInfo" " IgG1") ;; ("companyName" "BD Biosciences PHARMINGEN") ;; http://www.bdbiosciences.com/index.shtml ;; ("distributorInfo" "Formerly Transduction") ;; entrez 598 http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=gene&cmd=Retrieve&dopt=full_report&list_uids=598 ;; ("epitope" "aa. 
;; 18-233 of human Bcl-XL")
;; ("host" "mouse")
;; ("hostFastOutput" "monoclonal mouse")
;; ("hostExtraInfo" "BSA, azite")
;; ("methodFastOutput" "IF, IH, WB")
;; http://www.alzforum.org/res/com/ant/glossary.asp
;; ("methodExtraInfo" "")
;; ("specificity" "Bcl-X, 26kDa")
;; ("reactivityFastOutput" "human, rat, mouse")
;; ("reactivityExtrainfo" "NULL")
;; ("fromWhoID" "Manufacturers"))

;; Disabled sketch of an ontology built from the first rows of the dump.
;; NOTE(review): kept verbatim.  Before re-enabling, check the quote
;; stripping (SUBSEQ ends at length-2, which also drops the last real
;; character) and the inner replaceAll of "&" by "&", which is a no-op as
;; written -- presumably it once escaped the ampersand; confirm.
#|(define-ontology alzantibodies (:base "http://www.w3.org/2001/sw/hcls/ontologies/reagent.owl" :includes '(reagent))
  (let ((individuals nil)
        (count 0))
    (block limit
      (foreach-antibody-entry
       (lambda(entry)
         (let ((lastfield nil)
               (lastvalue nil))
           (flet ((field (field)
                    (if (eq field lastfield)
                        lastvalue
                        (setq lastvalue
                              (let ((value (cdr (assoc field entry))))
                                (if (and (char= (char value 0) #\")
                                         (char= (char value (1- (length value))) #\"))
                                    (setq value (subseq value 1 (- (length value) 2))))
                                (if (or (equal value "NULL") (equal value " "))
                                    nil
                                    (progn
                                      (#"replaceAll" (#"replaceAll" value "&" "&") "\"" "%22"))))))))
             (when (> (incf count) 2) (return-from limit nil))
             (let* ((name (make-uri-base-relative (format nil "antibody-~a" (field :antibodyid)) "alz:"))
                    (individual
                     (individual name
                       (annotation !alz:hasAntibodyId (field :antibodyid))
                       (and (field :epitope) (annotation !hasEpitopeDescription (field :epitope)))
                       (and (field :specificity) (annotation !hasSpecificityDescription (field :specificity)))
                       (and (field :hostfastoutput)
                            (annotation !hasHostDescription
                                        (format nil "~a~a" (field :hostfastoutput)
                                                (if (field :hostExtraInfo) (format nil " (~a)" (field :hostExtraInfo)) ""))))
                       (and (field :reactivityfastoutput)
                            (annotation !hasReactivityDescription
                                        (format nil "~a~a" (field :reactivityfastoutput)
                                                (if (field :reactivityextrainfo) (format nil " (~a)" (field :reactivityextrainfo)) ""))))
                       (and (field :companyname) (annotation !hasCompanyDescription (field :companyname)))
                       (annotation !alz:hasAntigenClass (field :antigenId))
                       (type !Antibody))))
               (push individual individuals)))))))
    individuals))|#

(defun ant (&optional limit)
  "Exploration helper: collect the distinct (EQUALP) results of
PARSE-SPECIFICITY over the antibody dump and return them as a list.
When LIMIT is non-NIL, stop after LIMIT entries and return what has been
collected so far."
  (let ((count 0)
        (unknowns nil))
    (foreach-antibody-entry
     (lambda (entry)
       (when (and limit (> (incf count) limit))
         (return-from ant unknowns))
       ;;(setq unknowns (union unknowns (second (multiple-value-list (parse-methods entry))) :test 'equal))
       ;;(pushnew (parse-antigen-info entry) unknowns :test 'equalp)
       (pushnew (parse-specificity entry) unknowns :test 'equalp)))
    unknowns))

(defun parse-methods (entry)
  "Parse the :METHODFASTOUTPUT field of ENTRY into a list of method names.

<nobr> tags and double quotes are removed, the field is split on commas,
and \"NULL\" items are dropped.  Composite items of the form Ix(a, b) are
expanded into one item per inner element: \"Ix(a)\", \"Ix(b)\".

Returns two values: the list of method strings, and the subset of them
that is not found (EQUALP) in the list returned by (TEST)."
  ;; NOTE(review): TEST is defined outside this view; it is called with no
  ;; arguments and its result is searched as the list of known method
  ;; names -- confirm that this is the intended source.
  (let* ((known (test))
         (unknowns nil)
         (methods (cdr (assoc :methodfastoutput entry)))
         (cleaned (#"replaceAll" methods "<.{0,1}nobr>" "")))
    (setq cleaned (#"replace" cleaned "\"" ""))
    ;; Each composite is (whole-match prefix inner-list).
    (let ((composites (all-matches cleaned "(I.)\\((.*?)\\)" 0 1 2)))
      ;; Cut the composites out of the text before splitting on commas,
      ;; because their inner lists contain commas too.
      (loop for (got) in composites
            do (setq cleaned (#"replace" cleaned got "")))
      (setq cleaned (split-at-regex cleaned "\\s*,\\s*"))
      (setq cleaned (remove "NULL" cleaned :test 'equalp))
      (loop for (got prefix which) in composites
            for specifics = (split-at-regex which "\\s*,\\s*")
            do (loop for specific in specifics
                     do (push (format nil "~a(~a)" prefix specific) cleaned)))
      (loop for one in cleaned
            when (not (find one known :test 'equalp))
              do (pushnew one unknowns :test 'equal)))
    ;; This form used to sit outside the binding of CLEANED, where the
    ;; variable is not in scope.
    (values cleaned unknowns)))

(defun all-alz-methods (&aux all)
  "Return the union (EQUAL) of every method name that PARSE-METHODS reports
for any entry of the antibody dump, known and unknown alike."
  (foreach-antibody-entry
   ;; The parameter was named E while the body read ENTRY, and the first
   ;; UNION was given the symbol PARSE-METHODS instead of the parsed list.
   (lambda (entry)
     (multiple-value-bind (cleaned unknowns) (parse-methods entry)
       (setq all (union all cleaned :test 'equal))
       (setq all (union all unknowns :test 'equal)))))
  all)

;; http://www.antibodyresource.com/educational.html
;; http://nhpreagents.bidmc.harvard.edu/NHP/default.aspx
;; http://www.keithbahjat.com/abcxr/
;; http://www.hiv.lanl.gov/content/immunology/search_help.html
;; http://www.google.com/search?client=safari&rls=en&q=antibody+database&ie=UTF-8&oe=UTF-8
;; http://www.cellnucleus.com/antibody.htm
;; http://www.antibodyresource.com/findantibody.html
;; http://www.linscottsdirectory.com/
;; http://researchlink.labvelocity.com/products/index.jhtml;$sessionid$UGVHAQYAAAQXCQBICNVBNWQ?nodeId=1017&path=0
;;
;; http://en.wikipedia.org/wiki/Immunoglobulin
;; http://en.wikipedia.org/wiki/Immunoglobulin#IgG
;; IgG: 1,2a,2b,3 - can be annotated kappa
;; http://en.wikipedia.org/wiki/Immunoglobulin#IgM
;; IgM
;; IgY (chicken)
;; 954 is kappa
;; 955 is lambda
;; RMAB recombinant monoclonal antibody (rmAb) http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=PubMed&list_uids=1751781&dopt=Abstract (only one example in altz)

;; Compiled Java regex of the "noise" that PARSE-ANTIGEN-INFO deletes from
;; the antigenInfo field before looking for an isotype.
;; NOTE(review): the first three alternatives are a literal newline, a bare
;; "&" and a single space.  They look like HTML entities that were decoded
;; when this file was extracted (possibly a numeric line-break entity, an
;; escaped ampersand and a non-breaking space) -- confirm against the
;; original source.  The line break inside the string is part of the pattern.
(defparameter *isotype-noise-pattern*
  (load-time-value
   (#"compile" 'java.util.regex.pattern "(
)|(&)|( )|([lL]igand)|(^[ \"]*)|([ \"]*$)|(<\\/>)|((?i)<.{0,1}sub[>.])")))

;; Normalizes the :antigenInfo field of ENTRY step by step: strip the noise
;; pattern, strip leading whitespace, spell out kappa/lambda markers, and
;; drop everything before the first "Ig" (any case).
;; NOTE(review): the literal kappa character in the third replaceAll looks
;; like a decoded numeric entity for code 954 -- confirm.
(defun parse-antigen-info (entry)
  (let ((antigenInfo (cdr (assoc :antigenInfo entry))))
    (setq antigenInfo (#"replaceAll" (#"matcher" *isotype-noise-pattern* antigenInfo) "")) ; noise
    (setq antigenInfo (#"replaceAll" antigenInfo "^\\s*" ""))
    (setq antigenInfo (#"replaceAll" antigenInfo "[ ,]{0,1}κ" " kappa"))
    (setq antigenInfo (#"replaceAll" antigenInfo "954" " kappa"))
    (setq antigenInfo (#"replaceAll" antigenInfo "[ ,]{0,1}&#(955|923);" " lambda"))
    (setq antigenInfo (#"replaceFirst" antigenInfo "^.*?(?=[Ii][Gg].*)" ""))
    ;; NOTE(review): SOURCE TEXT IS TRUNCATED ON THE NEXT LINE.  Everything
    ;; between the opening of the "[;..." pattern and "> (incf count) 1)" is
    ;; missing: the rest of PARSE-ANTIGEN-INFO and the head of a function
    ;; that is referred to below as COLLECT-CHARSET.  The remainder is kept
    ;; exactly as found and must be restored from the original file.  In the
    ;; surviving tail, (value byfield xmls) is presumably meant to be VALUES
    ;; -- verify once the head is recovered.
    (setq antigenInfo (#"replaceAll" antigenInfo "[;> (incf count) 1) ; (return-from collect-charset byfield)) ; (setq *entry* entry) (loop for (field . value) in entry do ; (format t "~a: ~s~%" field value) (loop for char across value do (pushnew (char-code char) (gethash field byfield) :test #'eql)) (loop for (match) in (all-matches value "&#\\d+;" 0 ) do (pushnew match xmls :test 'equal)) ))) (value byfield xmls))))

#| funny characters
129 #x81 #o201 '' (never on its own - utf bridge character?)
143 #x8F #o217 '' e accent egu
149 #x95 #o225 '' typo should be "i"
154 #x9A #o232 '' o umlaut
170 #xAA #o252 '' TM= trademark
181 #xB5 #o265 '' mu (as in micromole - 1 use)
191 #xBF #o277 '' bimeda "o" as in biomeda
201 #xC9 #o311 '' = "..."
202 #xCA #o312 '' looks like nbsp;
208 #xD0 #o320 '' looks like a dash "-"
210 #xD2 #o322 '' TARGET=_new used as quotation inside a href
211 #xD3 #o323 '' TARGET=_new used as quotation inside a href
213 #xD5 #o325 '' apostrophe
|#

#|
XML Character entities used in the database
916 394 capital delta
950 3B6 small zeta
174 AE registered sign
9456 24F0 (typo for 945 = alpha)
8804 2264 less-than-or-equal
8482 2122 trademark
150 96 (typo for "-" dash)
647 287 (typo for 946 = beta)
146 92 (typo for apostrophe ')
8217 2019 single quotation mark, right
955 3BB small lambda
923 39B capital Lambda
948 3B4 small delta
949 3B5 small epsilon
947 3B3 small gamma
945 3B1 small alpha
954 3BA small kappa
946 3B2 small beta
|#

;; GO:0019814 immunoglobulin complex
;; !alz:antigenClass
;; !alz:Antibody
;; !alz:hasIsotype
;; !alz:hasHeavyChain
;; !alz:targetsEpitope
;; !alz:doesNotTargetEpitope
;; !alz:targetsAntigen
;; !alz:derivedFromHostSpecies
;; !alz:hasClonality
;; !alz:usedForMethod
;; !alz:isReactiveInSpecies
;; !alz:isAvailableFromManufacterer
;; !alz:hasSpecificationSheet

;; "IgG2a, IgG2a, IgG1"
;; "IgM + IgG1"
;; "IgG2b, IgG1"
;; "IgGb"
;; "IgG1c"
;; "IgG2aIgG2a"
;; "IgG2a, IgG2b"
;; "IgG1 IgM"
;; "IgM & IgG1"
;; "Ig1"
;; "IgG1 and IgG2a"
;; "IgG3* kappa"
;; "IgG2 lambda2"
;; "IgG1and IgG2a"
;; "IgM kappa"
;; "IgG3 kappa"
;; "IgY"
;; "IgG)"
;; "IgG1 lambda"
;; "IgG1 kappa, lambda"
;; "IgG2c kappa"
;; "IgG1+IgG2a"
;; "IgG2a, IgG2a, IgG1"
;; "IgG1 kappa,IgG2a kappa, IgG2a kappa,"
;; "IgG1, kappa"
;; "IgM"
;; "IgG2b kappa"
;; "IgG2a kappa"
;; "Ig"
;; "IgG2b"
;; "IgG2 kappa"
;; "IgG3"
;; "IgG2a"
;; "IgG1"
;; "IgG"
;; NIL
;; "IgG1 kappa"

;(loop for species in *reactivity-species* do (format t "~a: ~a~%" species (ncbi-taxid species)))
;; FIXME: Handle: (wikipedia "d. 
;; elegans")
;; avian Bird
;; corn_(zea_mays) Maize
;; cynomolgus Cynomolgus_monkey
;; fetal_calf -> http://en.wikipedia.org/wiki/Calf (modifier fetal)
;; http://en.wikipedia.org/wiki/guinea_dog -> http://en.wikipedia.org/wiki/New_Guinea_Singing_Dog ?
;; http://en.wikipedia.org/wiki/myxamoeba -> http://en.wikipedia.org/wiki/slime_molds
;; http://en.wikipedia.org/wiki/myxo-_mycetes -> http://en.wikipedia.org/wiki/slime_molds
;; http://en.wikipedia.org/wiki/ox -> http://en.wikipedia.org/wiki/Cattle#Ox
;; http://en.wikipedia.org/wiki/rhesus -> http://en.wikipedia.org/wiki/Rhesus_Macaque
;; http://en.wikipedia.org/wiki/seal -> http://en.wikipedia.org/wiki/Pinniped
;; http://en.wikipedia.org/wiki/torpedo -> http://en.wikipedia.org/wiki/Electric_ray

(defparameter *alz-species-wikipedia-exceptions*
  '(("avian"           "Bird")
    ("corn (zea mays)" "Maize")
    ("cynomolgus"      "Cynomolgus_monkey")
    ("fetal calf"      "Calf")          ; what to do about modifier fetal?
    ("guinea dog"      "New_Guinea_Singing_Dog")
    ("myxamoeba"       "slime_molds")
    ("myxo- mycetes"   "slime_molds")
    ("ox"              "Cattle#Ox")
    ("rhesus"          "Rhesus_Macaque")
    ("seal"            "Pinniped")
    ("torpedo"         "Electric_ray"))
  "List of (SPECIES-NAME WIKIPEDIA-PAGE) pairs: the species name as written
in the data, followed by the Wikipedia page title to use for it.")