;; Done:
;;   ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/HUMAN/gene_association.goa_human.gz
;;   ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/HUMAN/human.xrefs.gz
;; Worth considering:
;;   ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/MOUSE/gene_association.goa_mouse.gz
;;   ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/RAT/gene_association.goa_rat.gz
;;   ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/MOUSE/mouse.xrefs.gz
;;   ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/RAT/rat.xrefs.gz
;;   http://www.geneontology.org/cgi-bin/downloadGOGA.pl/gene_association.rgd.gz (rat)
;;   http://www.geneontology.org/cgi-bin/downloadGOGA.pl/gene_association.mgi.gz (mouse)
;;   http://www.geneontology.org/cgi-bin/downloadGOGA.pl/gene_association.Compugen_GenBank.gz
;;   http://www.geneontology.org/cgi-bin/downloadGOGA.pl/gene_association.Compugen_UniProt.gz

(in-package :cl-user)

;; Locations of the input files.  read-go-associations and read-xrefs skip
;; their work when :goa-xref is set to :ignore.
(config-set :goa-xref "~/Desktop/Data/GOA/HUMAN/human.xrefs")
(config-set :goa-human "~/Desktop/Data/GOA/HUMAN/gene_association.goa_human")
(config-set :go-obo "~/Desktop/Data/GOA/gene-ontology.obo.txt")

;; In-memory gene ontology: an index of go-term records plus lazily built
;; secondary indexes (by term name, by gene).
(defclass gene-ontology ()
  (;; "GO:nnnnnnn" string -> go-term
   (id2record :initarg :id2record
              :initform (make-hash-table :test 'equal)
              :accessor id2record)
   ;; term name -> go-term; built on first use by get-record-by-name
   (name2record :initarg :name2record :initform nil :accessor name2record)
   ;; gene id -> list of go-terms; built on first use by gene-direct-associations
   (gene2association :initarg :gene2association :initform nil
                     :accessor gene2association)
   (hierarchy :initarg :hierarchy :initform nil :accessor hierarchy)
   ;; plist of the OBO file header, filled in by read-obo
   (header :initarg :header :initform nil :accessor header)))

;; One GO term.  go-parents/go-children mirror is_a links, part-of/has-parts
;; mirror part_of relationships.  direct-associations holds lists of the form
;; (qualifier db id ll evidence citation with), as pushed by
;; read-go-associations.
(defclass go-term ()
  ((id :initarg :id :initform nil :accessor id)
   (name :initarg :name :initform nil :accessor name)
   (namespace :initarg :namespace :initform nil :accessor namespace)
   (definition :initarg :definition :initform nil :accessor definition)
   (go-parents :initarg :go-parents :initform nil :accessor go-parents)
   (part-of :initarg :part-of :initform nil :accessor part-of)
   (has-parts :initarg :has-parts :initform nil :accessor has-parts)
   (go-children :initarg :go-children :initform nil :accessor go-children)
   (direct-associations :initarg :direct-associations :initform nil
                        :accessor direct-associations)
   (is-obsolete :initarg :is-obsolete :initform nil :accessor is-obsolete)))

(defmethod all-associations ((term go-term) &optional (include-parts t))
  "Return the associations of TERM and of everything below it.
With INCLUDE-PARTS true (the default) has-parts links are followed as well as
is_a children; with INCLUDE-PARTS nil only is_a children are followed."
  (if include-parts
      (all-associations-include-parts term)
      (append (direct-associations term)
              ;; Pass NIL down explicitly: relying on the default here would
              ;; silently re-enable part traversal below the first level.
              (loop for child in (go-children term)
                    append (all-associations child nil)))))

(defmethod all-associations-include-parts ((term go-term))
  "Return the associations of TERM, its is_a descendants and its parts.
Terms reachable along several paths contribute their associations once per
path."
  (append (direct-associations term)
          (loop for child in (go-children term)
                append (all-associations-include-parts child))
          (loop for part in (has-parts term)
                append (all-associations-include-parts part))))

(defmethod gene-direct-associations ((g gene-ontology) ll)
  "Return the list of go-terms directly associated with gene id LL.
The gene -> terms index is built on the first call and cached in
gene2association."
  (unless (gene2association g)
    (let ((table (make-hash-table :test 'equal)))
      (maphash (lambda (id record)
                 (declare (ignore id))
                 ;; An association is (qualifier db id ll evidence citation
                 ;; with); the gene id is the fourth element, not the car.
                 (loop for association in (direct-associations record)
                       for gene = (fourth association)
                       when gene
                         do (pushnew record (gethash gene table))))
               (id2record g))
      (setf (gene2association g) table)))
  (gethash ll (gene2association g)))

(defmethod print-object ((term go-term) stream)
  ;; Prints id(namespace) name (number of distinct associated gene ids).
  ;; The count walks everything below TERM; errors while printing are
  ;; swallowed so a half-initialized term can still be displayed.
  (print-unreadable-object (term stream)
    (ignore-errors
      (format stream "~a(~a) ~a (~a)"
              (id term)
              (namespace term) ; (regex-replace "^.*_" (namespace term) "")
              (name term)
              (length (remove-duplicates
                       (delete nil (mapcar 'fourth (all-associations term)))
                       :test 'equal))))))

(defmethod get-record ((g gene-ontology) id)
  "Return the go-term for ID, creating an empty placeholder if necessary.
ID may already be a go-term, in which case it is returned unchanged.
Returns NIL when ID is not a string starting with GO: followed by at least
one more character."
  (cond ((typep id 'go-term) id)   ; must be tested before LENGTH/SUBSEQ
        ((and (stringp id)
              (> (length id) 3)
              (string-equal (subseq id 0 3) "GO:"))
         (or (gethash id (id2record g))
             ;; Record the id on the placeholder so it is identifiable even
             ;; if no [Term] stanza ever fills it in.
             (setf (gethash id (id2record g))
                   (make-instance 'go-term :id id))))
        (t nil)))

(defmethod get-record-by-name ((g gene-ontology) name)
  "Return the go-term whose name is NAME, or NIL.
The name index is built from id2record on the first call and then reused; it
is not refreshed if records are added afterwards."
  (unless (name2record g)
    (let ((index (make-hash-table :test 'equal)))
      (maphash (lambda (id record)
                 (declare (ignore id))
                 (setf (gethash (name record) index) record))
               (id2record g))
      (setf (name2record g) index)))
  (gethash name (name2record g)))

(defmethod initialize-instance :after ((ontology gene-ontology) &key)
  ;; Creating a gene-ontology immediately loads the OBO file and the
  ;; association files named in the configuration.
  ;; `@' is a global used as a scratch handle on the instance being loaded.
  (setq @ ontology)
  (read-obo ontology)
  (read-go-associations ontology))

;; Example OBO stanza:
;; [Term]
;; id: 
;; GO:0000001    <- value of the `id:' tag that ends the previous line
;; name: mitochondrion inheritance
;; namespace: biological_process
;; def: "The distribution of mitochondria\, including the mitochondrial genome\, into daughter cells after mitosis or meiosis\, mediated by interactions between mitochondria and the cytoskeleton." [PMID:11389764, PMID:10873824, SGD:mcc]
;; is_a: GO:0048308 ! organelle inheritance
;; is_a: GO:0048311 ! mitochondrion distribution

(defmethod read-obo ((g gene-ontology))
  "Load the OBO file named by config :go-obo into G.  Returns NIL.
The leading tag/value lines become the header plist; every following
[Type] stanza is handed to read-go-obo-record.  Does nothing when :go-obo is
not configured."
  (when (config-maybe :go-obo)
    (with-open-file (f (config :go-obo))
      (setf (header g) (read-obo-key-values g f))
      (pprint (header g))
      (loop for line = (read-line f nil :eof)
            until (eq line :eof)
            ;; read-obo-key-values consumes the single line that ends a
            ;; stanza; tolerate any additional blank lines between stanzas
            ;; or at the end of the file.
            do (unless (every (lambda (c) (member c '(#\Space #\Tab #\Return)))
                              line)
                 (destructuring-bind (&optional match part)
                     (car (all-matches line "^\\[(.*?)\\]\\s*" 0 1))
                   (assert match () "Didn't find a record! '~a" line)
                   (read-go-obo-record g part f))))
      nil)))

(defmethod read-obo-key-values ((g gene-ontology) stream)
  "Read consecutive `tag: value' lines from STREAM into a plist.
Tags are upcased and interned as keywords; repeated tags appear repeatedly.
Reading stops at (and consumes) the first line that is not of that form, or
at end of file."
  (loop for line = (read-line stream nil nil)
        ;; LINE is NIL at end of file, which simply ends the stanza.
        for (tag value) = (and line
                               (car (all-matches line "^(\\S+): (.*)$" 1 2)))
        until (null tag)
        append (list (intern (string-upcase tag) 'keyword) value)))

(defmethod read-go-obo-record ((g gene-ontology) type stream)
  "Read the body of one OBO stanza of kind TYPE from STREAM.
A Term stanza fills in (or creates) the go-term for its id and links it to
its is_a parents and part_of containers.  Any other stanza is read and
returned as (TYPE . plist) without being stored.  Signals an error on tags
that are not listed in the ECASE below and on relationships other than
part_of."
  (if (equal type "Term")
      (let ((plist (read-obo-key-values g stream))
            (id nil)
            (name nil)
            (def nil)
            (isa nil)
            (namespace nil)
            (partof nil)
            (obsolete nil))
        (princ ".")                     ; progress indicator, one dot per term
        (macrolet ((grab-once (var)
                     ;; Assign VALUE (the loop variable below) to VAR,
                     ;; insisting that the tag occurs at most once.
                     `(progn
                        (assert (null ,var) ()
                                "~a is already ~a, trying to reassing to ~a"
                                ',var ,var value)
                        (setq ,var value))))
          (flet ((grab-isa (value)
                   ;; "GO:0048308 ! organelle inheritance" -> "GO:0048308"
                   (register-groups-bind (thisone) ("^(GO:\\d+)" value)
                     (push thisone isa)))
                 (grab-def (value)
                   ;; Strip double quotes and backslashes from the text.
                   (setq def (regex-replace-all "\"|\\\\" value "")))
                 (grab-relationship (value)
                   (register-groups-bind (relation id)
                       ("^(\\S+) (GO:\\d+)" value)
                     (assert (and relation id) ()
                             "Bad relation parsing: ~a" value)
                     (assert (equal relation "part_of") ()
                             "Only know how to deal with part_of relations, not ~a"
                             relation)
                     (push id partof))))
            (loop for (key value) on plist by #'cddr
                  do (ecase key
                       (:id (grab-once id))
                       (:name (grab-once name))
                       (:subset nil)
                       (:namespace (grab-once namespace))
                       (:def (grab-def value))
                       (:is_a (grab-isa value))
                       (:alt_id)
                       (:comment)
                       ;; Honour the tag's value instead of treating its
                       ;; mere presence as true.
                       (:is_obsolete
                        (setq obsolete
                              (and (string-equal (string-trim " " value) "true")
                                   t)))
                       ((:exact_synonym :synonym :narrow_synonym
                         :broad_synonym :related_synonym))
                       (:relationship (grab-relationship value))
                       ((:xref_analog :xref_unknown)))))
          (let ((record (get-record g id)))
            ;; get-record returns NIL for a missing or malformed id; fail
            ;; with a clear message instead of calling accessors on NIL.
            (assert record () "Term stanza has no usable GO id: ~s" id)
            (setf (id record) id
                  (name record) name
                  (definition record) def
                  (namespace record) namespace
                  (is-obsolete record) obsolete
                  (go-parents record)
                  (loop for parent-id in isa
                        for parent = (get-record g parent-id)
                        do (pushnew record (go-children parent))
                        collect parent)
                  (part-of record)
                  (loop for container-id in partof
                        for container = (get-record g container-id)
                        do (pushnew record (has-parts container))
                        collect container)))))
      (cons type (read-obo-key-values g stream))))

#|
Column descriptions
human.xrefs from goa
http://www.ebi.ac.uk/GOA/goaHelp.html

1. Database from which master entry of this IPI entry has been taken.
   One of either SP (UniProtKB/Swiss-Prot), TR (UniProtKB/TrEMBL),
   ENSEMBL (Ensembl), ENSEMBL_HAVANA (Ensembl Havana subset),
   REFSEQ_STATUS (where STATUS corresponds to the RefSeq entry revision
   status), VEGA (Vega), TAIR (TAIR Protein data set) or HINV
   (H-Invitational Database).
2. UniProtKB accession number or Vega ID or Ensembl ID or RefSeq ID or
   TAIR Protein ID or H-InvDB ID.
3. International Protein Index identifier.
4. Supplementary UniProtKB/Swiss-Prot entries associated with this IPI entry.
5. Supplementary UniProtKB/TrEMBL entries associated with this IPI entry.
6. Supplementary Ensembl entries associated with this IPI entry. Havana
   curated transcripts preceded by the key HAVANA:
   (e.g. HAVANA:ENSP00000237305;ENSP00000356824;).
7. Supplementary list of RefSeq STATUS:ID couples (separated by a
   semi-colon ';') associated with this IPI entry (RefSeq entry revision
   status details).
8. Supplementary TAIR Protein entries associated with this IPI entry.
9. Supplementary H-Inv Protein entries associated with this IPI entry.
10. 
Protein identifiers (cross reference to EMBL/Genbank/DDBJ nucleotide databases). 11. List of HGNC number, HGNC official gene symbol couples (separated by a semi-colon ';') associated with this IPI entry. 12. List of NCBI Entrez Gene gene number, Entrez Gene Default Gene Symbol couples (separated by a semi-colon ';') associated with this IPI entry. 13. UNIPARC identifier associated with the sequence of this IPI entry. 14. UniGene identifiers associated with this IPI entry. 15. CCDS identifiers associated with this IPI entry. 16. RefSeq GI protein identifiers associated with this IPI entry. 17. Supplementary Vega entries associated with this IPI entry. The mouse, rat, zebrafish and arabidopsis xref files have the following differences: * Column 11 in the mouse file contains the MGI (Mouse Genome Informatics) identifier and symbol for the genes * Column 11 in the rat file contains the RGD (Rat Genome Database) identifier and symbol for the genes. * Column 11 in the zebrafish file contains the ZFIN (Zebrafish information network) identifier and symbol for the genes. * Column 11 in the arabidopsis file contains the TAIR Gene (The Arabidopsis Information Resource) symbol and locus identifier for the genes. * Column 11 does not contain any data for chicken and cow. **************************************************************** For the annotations file: 1. DB Database from which annotated entry has been taken. One of either UniProt (UniProt:Swiss-Prot/TrEMBL) or ENSEMBL (Ensembl). Example: UniProt 2. DB_Object_ID A unique identifier in the DB for the item being annotated. Here: Accession number or identifier of the annotated protein. Either UniProt accession number or an Ensembl peptide ID. Example: O00165 3. DB_Object_Symbol A (unique and valid) symbol to which DB_Object_ID is matched. Here: UniProt entry name or Ensembl peptide ID. Example: HAX1_HUMAN 4. Qualifier This column is used for flags that modify the interpretation of an annotation. 
This field may be equal to: NOT, colocalizes_with, contributes_to. Example: NOT (calmodulin binding) 5. GOid The GO identifier for the term attributed to the DB_Object_ID. Example: GO:0005625 6. DB:Reference Reference cited to support the attribution. See section 7 for an explanation of the reference types used. Examples: PUBMED:9058808, GOA:interpro, GOA:hamap, GOA:spkw, GOA:spec. 7. Evidence One of either IMP,IGI,IPI,ISS,IDA,IEP,IEA,TAS,NAS,NR or ND. Example: TAS 8. With Example: UniProt:O00341 9. Aspect One of the three ontologies: P (biological process), F (molecular function) or C (cellular component). Example: P 10. DB_Object_Name Name of gene or gene product Here: Either empty (for Ensembl peptides) or the gene name and abbreviated description line (for UniProt entries). Example: HS1-associated protein X-1 11. Synonym Gene_symbol [or other text] Here: International Protein Index identifier (section 4). Example: IPI00010440 12. DB_Object_Type What kind of entity is being annotated. Here: always 'protein' Example: protein 13. Taxon_ID Identifier for the species being annotated. Here: always 'taxon:9606' for human proteins. Example: taxon:9606 14. Date The date of last annotation update in the format 'YYYYMMDD' eg: 20050101 15. Assigned_By Attribute describing the source of the annotation. One of either UniProt, MGI, SGD, FB, HGNC, RGD, GeneDB, PINC (Proteome Inc.), TAIR or ZFIN. 
|#

(defmethod read-go-associations ((g gene-ontology))
  "Read the annotation file named by config :goa-human into G.
Each annotation is stored on its GO term as
(qualifier db id ll evidence citation with), once per gene id LL that the
xref tables give for the annotated object, or once with LL = NIL when there
is none.  Does nothing when config :goa-xref is :ignore."
  (unless (eq (config :goa-xref) :ignore)
    (destructuring-bind (swiss hinv refseq ensembl vega) (read-xrefs g)
      (with-open-file (f (config :goa-human))
        (loop for line = (read-line f nil :eof)
              until (eq line :eof)
              ;; Skip blank lines and `!' comment/header lines.
              do (unless (or (zerop (length line))
                             (char= (char line 0) #\!))
                   (destructuring-bind (&optional db id symbol qualifier goid
                                          citation evidence with
                                        &rest more)
                       (split-at-char line #\tab)
                     (declare (ignore symbol more))
                     (let ((lls (cond ((or (equal db "HINV")
                                           (equal db "H-invDB"))
                                       (gethash id hinv))
                                      ((equal db "ENSEMBL")
                                       (gethash id ensembl))
                                      ((equal db "VEGA") (gethash id vega))
                                      ((equal db "RefSeq")
                                       (gethash id refseq))
                                      ((equal db "UniProt")
                                       (gethash id swiss))
                                      (t (error "Found and db I didn't know about: ~a"
                                                db))))
                           (record (get-record g goid)))
                       (assert record ()
                               "Bad GO id ~s in association line: ~a"
                               goid line)
                       ;; :none marks an xref row without gene ids; treat it
                       ;; like a missing row and store one gene-less entry.
                       (dolist (ll (if (or (null lls) (eq lls :none))
                                       (list nil)
                                       lls))
                         (pushnew (list qualifier db id ll evidence citation
                                        with)
                                  (direct-associations record)
                                  :test 'equal)))))))))))

(defmethod read-xrefs ((g gene-ontology))
  "Read the xref file named by config :goa-xref.
Returns a list of five EQUAL hash tables (swiss hinv refseq ensembl vega),
each mapping a database id to the list of gene ids taken from column 12 of
its row, or to :none when that column is empty.  The tables are empty when
:goa-xref is :ignore."
  (let ((swiss (make-hash-table :test 'equal))
        (hinv (make-hash-table :test 'equal))
        (refseq (make-hash-table :test 'equal))
        (ensembl (make-hash-table :test 'equal))
        (vega (make-hash-table :test 'equal)))
    (unless (eq (config :goa-xref) :ignore)
      (with-open-file (f (config :goa-xref))
        (loop for line = (read-line f nil :eof)
              until (eq line :eof)
              ;; Skip `#' comment lines; also skip empty lines, on which
              ;; (char line 0) would signal an error.
              do (unless (or (zerop (length line))
                             (char= (char line 0) #\#))
                   ;; Columns: db id ipi swiss trembl ensembl refseq tair
                   ;; h-inv genbank hgnc GENE-IDS uniparc unigene ccds gi vega
                   (let* ((fields (split-at-char line #\tab))
                          (db (first fields))
                          (id (second fields))
                          (raw (nth 11 fields))
                          ;; Column 12 is a `;'-separated list of
                          ;; "number,symbol" couples; keep the number.
                          (lls (if (or (null raw) (equal raw ""))
                                   :none
                                   (mapcar (lambda (ll)
                                             (car (split-at-char ll #\,)))
                                           (split-at-char raw #\;)))))
                     (cond ((or (equal db "SP") (equal db "TR"))
                            (setf (gethash id swiss) lls))
                           ((or (equal db "ENSEMBL")
                                (equal db "ENSEMBL_HAVANA"))
                            (setf (gethash id ensembl) lls))
                           ((equal db "VEGA")
                            (setf (gethash id vega) lls))
                           ((or (equal db "HINV") (equal db "H-invDB"))
                            (setf (gethash id hinv) lls))
                           ((and (>= (length db) 6)
                                 (equal (subseq db 0 6) "REFSEQ"))
                            (setf (gethash id refseq) lls))
                           ;; The format directive needs its argument.
                           (t (error "what's this database: ~a?" db))))))))
    (list swiss hinv refseq ensembl vega)))

;; Forward declaration: *go* is given its value further down, but functions
;; defined before that point refer to it as a special variable.
(defvar *go*)

(defun go-apropos (re &optional ids-only)
  "Find terms of *go* whose name or definition matches regex RE, ignoring case.
With IDS-ONLY true return the list of matching ids; otherwise print one line
per match and return NIL."
  (let ((scanner (create-scanner (concatenate 'string "(?i)" re)))
        (them nil))
    (maphash (lambda (id record)
               ;; Placeholder records have no name or definition yet.
               (when (or (and (name record)
                              (scan scanner (name record)))
                         (and (definition record)
                              (scan scanner (definition record))))
                 (if ids-only
                     (push id them)
                     (format t "~a(~a) ~a ~a~&"
                             id (namespace record) (name record)
                             (definition record)))))
             (id2record *go*))
    them))

; ("HINV" "ENSEMBL" "RefSeq" "UniProt")

(defmethod add-go-children-links ((g gene-ontology))
  "Make sure every term is listed among the go-children of each of its
go-parents."
  (maphash (lambda (key term)
             (declare (ignore key))
             (dolist (parent (go-parents term))
               (pushnew term (go-children parent))))
           (id2record g)))

;; ID may be "GO:0000001", a bare number string such as "1", or an integer.
(defun normalize-go-id (id)
  "Return ID in the canonical GO:nnnnnnn string form."
  (cond ((integerp id) (format nil "GO:~7,'0d" id))
        ((and (stringp id)
              (plusp (length id))
              (char= (char id 0) #\G))
         id)
        (t (format nil "GO:~7,'0d" (parse-integer id)))))

(defmethod category-name ((g gene-ontology) id)
  "Return the name of the term ID; signals an error for an unknown id."
  (let* ((id (normalize-go-id id))
         (result (gethash id (id2record g))))
    (assert result () "~a is not a valid GO id" id)
    (name result)))

(defmethod category-ontology ((g gene-ontology) id)
  "Return :process, :function or :component according to the namespace of
term ID, or NIL for any other namespace.  Signals an error for an unknown id."
  (let* ((id (normalize-go-id id))
         (result (gethash id (id2record g))))
    (assert result () "~a is not a valid GO id" id)
    (let ((name (namespace result)))
      (cond ((equal name "biological_process") :process)
            ((equal name "molecular_function") :function)
            ((equal name "cellular_component") :component)))))

(defmethod url-for-category ((g gene-ontology) id)
  "Return the AmiGO term-details URL for ID."
  (format nil "http://www.godatabase.org/cgi-bin/amigo/go.cgi?view=details&search_constraint=terms&depth=0&query=~a&taxid=9606&show_associations=list"
          (normalize-go-id id)))

;; The four traversals below return every term reachable along one kind of
;; link.  A term reachable along several paths appears once per path.

(defmethod all-go-parents ((g go-term))
  (append (go-parents g)
          (loop for parent in (go-parents g)
                append (all-go-parents parent))))

(defmethod all-go-children ((g go-term))
  (append (go-children g)
          (loop for child in (go-children g)
                append (all-go-children child))))

(defmethod all-containers ((g go-term))
  (append (part-of g)
          (loop for container in (part-of g)
                append (all-containers container))))

(defmethod all-parts ((g go-term))
  (append (has-parts g)
          (loop for part in (has-parts g)
                append (all-parts part))))

(defmethod all-ancestors ((g go-term))
  (union (all-containers g) (all-go-parents g)))

(defmethod all-descendents ((g go-term))
  (union (all-parts g) (all-go-children g)))

(defmethod remove-go-subsumed ((g gene-ontology) ids &optional (relation :parent))
  "Split the terms named by IDS into those that have no ancestor among IDS
and those that do.  RELATION selects which links count as ancestry: :parent
(is_a), :container (part_of) or :both.  Returns two values: the terms that
are kept and the subsumed terms."
  (let ((records (mapcar (lambda (id) (get-record g id)) ids))
        (subsumed '()))
    (dolist (record records)
      (when (some (lambda (ancestor) (member ancestor records))
                  (ecase relation
                    (:parent (all-go-parents record))
                    (:container (all-containers record))
                    (:both (union (all-containers record)
                                  (all-go-parents record)))))
        (push record subsumed)))
    (values (set-difference records subsumed) subsumed)))

(defmethod genes-and-direct-associations-below ((g gene-ontology) ids
                                                &optional (include-parts t) bygene)
  "Return a hash table mapping each gene id to the terms at or below IDS that
are directly annotated with it.  IDS may contain GO id strings or go-term
objects.  Parts are followed as well as is_a children unless INCLUDE-PARTS is
NIL.  BYGENE, when supplied, is the table to add to."
  (let ((bygene (or bygene (make-hash-table :test 'equal))))
    (dolist (id ids bygene)
      ;; The recursive calls pass go-term objects, so accept them directly.
      (let ((record (if (typep id 'go-term) id (get-record g id))))
        (when record
          ;; An association is (qualifier db id ll evidence citation with);
          ;; the gene id is its fourth element.
          (dolist (association (direct-associations record))
            (let ((ll (fourth association)))
              (when ll
                (pushnew record (gethash ll bygene)))))
          (genes-and-direct-associations-below
           g (go-children record) include-parts bygene)
          (when include-parts
            (genes-and-direct-associations-below
             g (has-parts record) include-parts bygene)))))))

(defun remove-empty-go-categories (cats)
  "Return CATS without the terms that have no associations at or below them."
  (remove-if (lambda (term) (null (all-associations term))) cats))

(defmethod dump ((g gene-ontology) file)
  "Write one line per named term of G to FILE, replacing any existing file.
Columns: id, name, status, gene ids annotated directly, gene ids annotated
at or below the term (parts included), child ids, part ids, definition."
  ;; name2record is built lazily by get-record-by-name; force it so that the
  ;; iteration below never sees NIL instead of a hash table.
  (get-record-by-name g nil)
  ;; NOTE(review): the column separator in both format strings is a single
  ;; space here; it may have been a tab originally -- confirm.
  (with-open-file (f file :if-exists :supersede :if-does-not-exist :create
                          :direction :output)
    (format f "~{~a~^ ~}~%"
            '(id name status direct all go-children parts definition))
    (loop for record being the hash-values of (name2record g)
          do (format f "~{~a~^ ~}~%"
                     (list (id record)
                           (name record)
                           (if (is-obsolete record) "Obsolete" "Current")
                           (format nil "~{~a~^;~}"
                                   (remove nil
                                           (mapcar 'fourth
                                                   (direct-associations record))))
                           (format nil "~{~a~^;~}"
                                   (remove nil
                                           (mapcar 'fourth
                                                   (all-associations-include-parts
                                                    record))))
                           (format nil "~{~a~^;~}"
                                   (mapcar 'id (go-children record)))
                           (format nil "~{~a~^;~}"
                                   (mapcar 'id (has-parts record)))
                           (definition record))))))

;; The ontology itself; loaded at this point when :go-obo is configured,
;; otherwise NIL.
(defvar *go*
  (if (config-maybe :go-obo nil)
      (make-instance 'gene-ontology)
      (progn
        (warn "Not loading gene ontology. Need to define config :go-obo, :goa-human, :goa-xref")
        nil)))

(defun count-all-associations ()
  "Sum, over the three root terms, the number of distinct association
objects found at or below the root."
  (loop for top in '("GO:0003674" "GO:0008150" "GO:0005575")
        sum (let ((seen (make-hash-table)))
              ;; EQL table: the same association object reached along
              ;; several paths is counted once.
              (dolist (a (all-associations (get-record *go* top)))
                (setf (gethash a seen) t))
              (hash-table-count seen))))

(defun foreach-go-record (go f)
  "Call F with the id and the go-term of every record of the ontology GO."
  ;; This used to contain a debugging shortcut that called F on GO:0006898
  ;; only and closed the DEFUN early, leaving the MAPHASH at top level.
  (maphash (lambda (id record) (funcall f id record))
           (id2record go)))

;; Evidence, WITH
;; What annotations are annotated to:
;;   h-invdb  genes
;;   ensembl  genes
;;   refseq   XP_001129443 protein
;;   uniprot  protein
;;   vega     gene
;; evidence ontology http://www.berkeleybop.org/ontologies/owl/ECO

(defun create-owl-proteins (&optional ids)
  "Return a list of OWL axiom forms describing the annotated gene products.
Three forms are produced per direct association of each non-obsolete term: a
protein class, a record individual, and a class of that protein restricted by
the GO term.  With IDS (a list of GO id strings) only those terms are used;
otherwise every record of *go* is used."
  ;; NOTE(review): some !prefix:name terms below appear after a comma inside
  ;; backquote and some without; both forms are kept exactly as they were.
  (flet ((axioms-for (goid record)
           (unless (is-obsolete record)
             (loop for (qualifier db id) in (direct-associations record)
                   ;; "hinv" is listed because read-go-associations accepts
                   ;; both the HINV and the H-invDB spelling.
                   for dbstring = (or (second (assoc db
                                                     '(("h-invdb" "h-invdb")
                                                       ("hinv" "h-invdb")
                                                       ("ensembl" "ensembl")
                                                       ("refseq" "ncbi_np")
                                                       ("uniprot" "uniprotkb")
                                                       ("vega" "vega"))
                                                     :test 'equalp))
                                      (error "unknown db: ~a" db))
                   for recordtype = (second (assoc db
                                                   '(("h-invdb" "gene")
                                                     ("hinv" "gene")
                                                     ("ensembl" "gene")
                                                     ("refseq" "protein")
                                                     ("uniprot" "protein")
                                                     ("vega" "gene"))
                                                   :test 'equalp))
                   for protein-uri = (make-uri
                                      (format nil "http://purl.org/commons/hcls/protein/bysequence/~a.~a"
                                              dbstring id))
                   for record-uri = (make-uri
                                     (format nil "http://purl.org/commons/record/~a/~a"
                                             dbstring id))
                   for goed-uri = (make-uri
                                   (format nil "http://purl.org/commons/hcls/protein/subjects/~a.~a.~a"
                                           dbstring id
                                           (substitute #\_ #\: goid)))
                   for go-id = (make-uri nil (string-downcase goid))
                   collect `(class ,protein-uri :partial ,!hclsdef:protein
                              (restriction
                               ,(if (equal recordtype "gene")
                                    !hclsdef:geneProductOfDNADescribedBy
                                    !hclsdef:peptideSequenceDescribedBy)
                               (has-value ,record-uri)))
                   collect `(individual ,record-uri
                              (type ,(if (equal recordtype "gene")
                                         !hclsdef:geneRecord
                                         !hclsdef:proteinRecord)))
                   collect `(class ,goed-uri :partial
                              (intersection-of
                               ,protein-uri
                               ,(cond ((equal (namespace record) "cellular_component")
                                       (list 'restriction !oborel:located_in
                                             (if (equal qualifier "NOT")
                                                 `(all-values-from (complement-of ,go-id))
                                                 `(some-values-from ,go-id))))
                                      ((equal (namespace record) "molecular_function")
                                       (list 'restriction !oborel:has_function
                                             (if (equal qualifier "NOT")
                                                 `(all-values-from (complement-of ,go-id))
                                                 `(some-values-from ,go-id))))
                                      ((equal (namespace record) "biological_process")
                                       (if (equal qualifier "NOT")
                                           `(restriction !oborel:participates_in
                                                         (all-values-from (complement-of ,go-id)))
                                           (list 'restriction !oborel:has_function
                                                 `(some-values-from
                                                   (restriction ,!oborel:realized_as
                                                                (some-values-from ,go-id)))))))))))))
    (if ids
        (loop for goid in ids
              for record = (get-record *go* goid)
              when record
                append (axioms-for goid record))
        ;; foreach-go-record discards the values returned by its callback,
        ;; so gather the per-term lists here.
        (let ((chunks '()))
          (foreach-go-record *go*
                             (lambda (goid record)
                               (push (axioms-for goid record) chunks)))
          (loop for chunk in (nreverse chunks)
                append chunk)))))

;; Proof-of-concept ontology for the single term GO:0006898 (the only GO
;; class declared below).  Skipped when the gene ontology was not loaded.
(when *go*
  (eval `(define-ontology foo ()
           ;; NOTE(review): the URIs of the two imported ontologies are
           ;; missing from this file -- only a bare `!' is left, which cannot
           ;; be read.  Restore them and re-enable these two forms:
           ;; (ontology-annotation !owl:imports !)
           ;; (ontology-annotation !owl:imports !)
           (object-property !oborel:has_function)
           (object-property !oborel:participates_in)
           (object-property !oborel:realized_as)
           (object-property !oborel:located_in)
           (object-property !hclsdef:peptideSequenceDescribedBy)
           (class !hclsdef:protein :partial !snap:object)
           (class !go:0006898 :partial !span:Process)
           (class !hclsdef:proteinRecord :partial !snap:object)
           ,@(create-owl-proteins '("GO:0006898")))))

#|
URLs to use: http://purl.org/obo/owl/GO#GO_0003720

Sample header plist (presumably output of (pprint (header g)) -- confirm):

(:FORMAT-VERSION "1.0"
 :DATE "16:04:2007 19:30"
 :SAVED-BY "gocvs"
 :AUTO-GENERATED-BY "OBO-Edit 1.002"
 :SUBSETDEF "goslim_generic \"Generic GO slim\""
 :SUBSETDEF "goslim_goa \"GOA and proteome slim\""
 :SUBSETDEF "goslim_plant \"Plant GO slim\""
 :SUBSETDEF "goslim_yeast \"Yeast GO slim\""
 :SUBSETDEF "gosubset_prok \"Prokaryotic GO subset\""
 :DEFAULT-NAMESPACE "gene_ontology"
 :REMARK "geneontology.org version: Revision: 4.225")

Evidence codes

IEA - inferred from electronic annotation
IC  - inferred by curator
IDA - inferred from direct assay
      - Enzyme assays
      - In vitro reconstitution (e.g. transcription)
      - Immunofluorescence (for cellular component)
      - Cell fractionation (for cellular component)
      - Physical interaction/binding
IEP - inferred from expression pattern
      - Transcript levels (e.g. Northerns, microarray data)
      - Protein levels (e.g. Western blots)
IGI - inferred from genetic interaction
      - "Traditional" genetic interactions such as suppressors,
        synthetic lethals, etc.
      - Functional complementation
      - Rescue experiments
      - Inference about one gene drawn from the phenotype of a mutation
        in a different gene.
IMP - inferred from mutant phenotype
      - Any gene mutation/knockout
      - Overexpression/ectopic expression of wild-type or mutant genes
      - Anti-sense experiments
      - RNAi experiments
      - Specific protein inhibitors
IPI - inferred from physical interaction
      - 2-hybrid interactions
      - Co-purification
      - Co-immunoprecipitation
      - Ion/protein binding experiments
ISS - inferred from sequence or structural similarity
      - Sequence similarity (homologue of/most closely related to)
      - Recognized domains
      - Structural similarity
      - Southern blotting
RCA - inferred from reviewed computational analysis
      - Large-scale protein-protein interaction experiments
      - Microarray experiments
      - Integration of large-scale datasets of several types
      - Text-based computation
NAS - non-traceable author statement
ND  - no biological data available
TAS - traceable author statement
NR  - not recorded
|#