o;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; ;; Mon October 24, 2005: INCOMPATIBLE CHANGE. In the versions after binary asn, there is an extra { in the input ;; causing the old reader to try to read the whole 1 gig file into a single list. The workaround is to read-line the ;; first line away. ;; ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; ;; Converting binary to text asn ;; To convert binary asn to text asn. ;; Get asn tools from ncbi. ;; assume it is in $ncbi ;; open $ncbi/make/xCode/NCBI.xcode (so xcode can compile it. Darwinports doesn't do it) ;; select target all command line tools (or something like that). Build. ;; Find asntool. (on my machine I seem to have set the build directory to be ~/safari/build) ;; $ncbi = "~/Desktop/ncbi/"; ;; $in = "~/Desktop/BigStuff/Entrez\ Gene/Homo_sapiens.ags" ;; $out = "~/Desktop/BigStuff/Entrez\ Gene/Homo_sapiens.asn" ;; ./asntool -m $ncbi/access/entrezgene.asn -d $in -t Entrezgene-Set -p $out -M `find $ncbi -name \*\.asn | perl -e '@a=<>;print join(",",map {chop;$_} @a)'` ;; The find finds all the asn specification files that ncbi has nested. ;; Actually, when converting entrez, it logs only the following as necessary. ;; access/entrez2.asn,access/entrezgene.asn,access/mim.asn,access/taxon3.asn,api/fastadl.asn,asn/access.asn,asn/biblio.asn,asn/featdef.asn,asn/gbseq.asn,asn/general.asn, ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; ;; some documentation about the file format: ;; ;; The schema: ;; http://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/src/objects/entrezgene/entrezgene.asn ;; ;; Makes reference to other scheme, described here: ;; http://sdm.lbl.gov/OPM/DM_TOOLS/OPM/MBD/DIR_LIB/NCBI.html ;; to find NCBI-gene, use "exports gene" inurl:IEB inurl:ToolBox inurl:source inurl:src -inurl:cpp -inurl:lib ;; -inurl:in site:www.ncbi.nlm.nih.gov ;; ;; Shows how to retrieve things if you are used to locuslink: ;; http://www.ncbi.nlm.nih.gov/entrez/query/static/help/genehelp.html#program ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; ;; Generic code to read asn. See asn-each for description of parsed record. ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (defparameter *asn-readtable* (copy-readtable)) (defun braces-reader (stream char ) (declare (ignore char)) (read-delimited-list #\} stream t)) (defun return-comma (stream char) (declare (ignore stream char)) #\,) ; { starts a list (set-macro-character #\{ 'braces-reader nil *asn-readtable*) ; } ends a list (set-macro-character #\} (get-macro-character #\) *asn-readtable*) nil *asn-readtable*) ; returns a #\, separator (set-macro-character #\, 'return-comma nil *asn-readtable*) ; #\: is not package separator (set-syntax-from-char #\: #\A *asn-readtable*) ; #\\ is not quote char (it is junk) (set-syntax-from-char #\\ #\A *asn-readtable*) (defun asn-read (stream &optional (eof-marker :eof)) (let ((*readtable* *asn-readtable*) (*package* (load-time-value (find-package :keyword)))) (asn-uncomma (read stream nil eof-marker)))) ; each list is of the form (entry[, entry]*), where entry is key [type] value ; This removes the type and the commas, effectively making it a plist (defun asn-uncomma (asn) (if (null asn) nil (if (listp asn) (loop with head = asn for (a b c d) = head if (eq c #\,) collect (asn-uncomma a) and collect (asn-uncomma b) and do (setq head (cdddr head)) else if (eq d #\,) collect (asn-uncomma a) and collect (asn-uncomma c) and do (setq head (cddddr head)) else if (null (cdr head)) collect (asn-uncomma a) and do (setq head nil) ; change - use cdr since can't tell head = (1 nil) from head = (1) before else if (null (cddr head)) collect (asn-uncomma a) and collect (asn-uncomma b) and do (setq head nil) else if (null (cdddr head)) collect (asn-uncomma a) and collect (asn-uncomma c) and do (setq head nil) else collect (asn-uncomma a) and collect (asn-uncomma b) and do (setq head (cddr head)) while head) asn))) ; Iterate over a specified set of fields in a parsed asn record. ; ; At this point the asn record looks like a plist at top level. ; The values of each entry are either a plist, a list of plists, or some value (possibly list valued) ; ; The tags tells you how to get the part you want. ; ; Each tag is either a keyword or a list of key value pairs (possibly nil) ; ; If the tag is a keyword then you retrieve the value of the record specified by the tag ; If the tag is a list of (key value) pairs then the you are looking through a list of plists ; and operating on any that have that list of key values in them. As a special case, if the tag ; is nil then you operate on all of them ; This process is repeated until you process all the tags. If you have anything left, then fn is called on the value ; (possibly multiple times) ; ; There are two special values of fn that are special cased. ; If fn is :collect then the values are collected into a list ; If fn is :collect-distinct then the values are collected into a list and duplicates are removed using 'equal (defun asn-each (asn fn &rest tags) (labels ((asn-each-1 (asn tags) (cond ((null tags) (funcall fn asn)) ((listp (car tags)) ;expect a list of plists and find the one with bindings specificed as pairs (loop for plist in asn when (loop for (key value) on (car tags) by #'cddr always (equal (getf plist key) value)) do (asn-each-1 plist (cdr tags)))) (t (asn-each-1 (getf asn (car tags)) (cdr tags)))))) (if (member fn '(:collect :collect-distinct)) (let ((them nil)) (apply 'asn-each asn (lambda(item) (push item them)) tags) (if (eq fn :collect-distinct) (remove-duplicates them :test 'equal) them)) (asn-each-1 asn tags)))) ; When you know there is only one value then asn-get returns it. Same method as asn-each. (defun asn-get (asn &rest tags) (apply #'asn-each asn (lambda(item) (return-from asn-get item)) tags)) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; ;; Code to deal with entrez gene asn file specifically ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (defun get-gene-dbrefs (e) (flet ((normalize-gene-db (db) (cond ((string-equal db "locusid") :locuslink) ((string-equal db "mim") :omim) ((string-equal db "geneid") :gene) (t db)))) (loop for (nil db nil id) in e collect (normalize-gene-db db) collect id))) ;; writes a tab delimited file with the bits of gene that we want. Fields with multiple values have the values separated by ;; "|". A header line names the fields. ;; The fields are: ;; :id - The entrez gene id ;; :name - The name of the protein. Sometimes long, sometimes short. ;; :type - One of: :snrna :snorna :rrna :trna :miscrna :other :unknown :pseudo :protein-coding ;; :refseq-mrna - refseq identifiers (list) (possibly nil) ;; :refseq-protein - refseq identifiers (list) (possibly nil) ;; :synonyms - list of synonyms (includes name as first element) ;; :go - list of go categories (numerical id as string) (possibly nil) ;; :unigene - unigene ids (list) (possibly nil) ;; :summary - the longer description of the protein ;; :current-id, :current-locusid - if this record is superseded, where id for the current record. (possibly nil) ;; :omim - omim id (possibly nil) ;; :locuslink - locuslink id (possibly nil) ;; :species - :human, :mouse, or :rat (defun get-gene-info (e) (flet ((clean-whitespace (s) (if (and s #+abcl (scan "(?s).*(\\s+|\\s*\\n\\s*).*" s) #-abcl (scan "\\s+|\\s*\\n\\s*" s)) (regex-replace-all "\\s+|\\s*\\n\\s*" s " ") s))) (let* ((locus (asn-get e :gene :locus)) (name (clean-whitespace (asn-get e :gene :desc))) (synonyms `(,@(and locus (list locus)) ; may or may not be one ,@(and name (list name)) ; should always be one (NOT!, see e.g. id 44) ,@(mapcar #'clean-whitespace (asn-get e :gene :syn)) ,@(mapcar #'clean-whitespace (asn-get e :prot :name)))) (type (asn-get e :type)) (dbs (get-gene-dbrefs (asn-get e :gene :db))) (summary (clean-whitespace (asn-get e :summary))) (id (asn-get e :track-info :geneid)) (current-ids (get-gene-dbrefs (asn-get e :track-info :current-id))) (status (asn-get e :track-info :status)) (refseq-mrna (union (asn-each e :collect-distinct :locus '(:type :genomic) :products '(:type :mrna) :accession) (asn-each e :collect :comments '(:heading "NCBI Reference Sequences (RefSeq)") :products '(:type :mrna) :accession) :test 'equal)) (refseq-protein (union (asn-each e :collect-distinct :locus '(:type :genomic) :products '(:type :mrna) :products '(:type :peptide) :accession) (asn-each e :collect :comments '(:heading "NCBI Reference Sequences (RefSeq)") :products '(:type :peptide) :accession) :test 'equal)) (go-categories (asn-each e :collect-distinct :properties '(:type :comment :heading "GeneOntology") :comment nil :comment '(:type :comment) :source nil :src :tag)) (unigene (asn-each e :collect-distinct ':comments '(:heading "Additional Links") :comment '(:text "UniGene") :source nil :src :tag)) (chromosome (asn-get e :locus '(:type :genomic) :accession)) (range (let ((interval (asn-get e :locus '(:type :genomic) :seqs :int))) (list (case (getf interval :strand) (:minus :-) (:plus :+) (otherwise nil)) (getf interval :from) (getf interval :to)))) (species (asn-get e :source :org :taxname))) (assert species) (setq chromosome (if chromosome (regex-replace "^NC_0*" chromosome "") "")) (setq synonyms (remove-duplicates synonyms :test 'equal)) (nsubstitute :current-locuslink :locuslink current-ids) (nsubstitute :current-id :gene current-ids) `(:id ,id :name ,(or locus name) :type ,type :species ,species :status ,status :refseq-mrna ,refseq-mrna :refseq-protein ,refseq-protein :synonyms ,synonyms :go ,go-categories :unigene ,unigene ,@(if chromosome (list :chromosome chromosome :strand (first range) :start (second range) :end (third range))) :summary ,summary ,@current-ids ,@dbs)))) (defclass entrez-gene-summary () ((table :initarg :table :initform nil :accessor table-slot) (equivalent-ids :initarg :equivalent-ids :initform nil :accessor equivalent-ids-slot) )) (defvar *entrez-gene* (make-instance 'entrez-gene-summary)) (defmethod fields ((s entrez-gene-summary)) '(:id :status :name :type :species :locuslink :current-id :current-locuslink :refseq-mrna :refseq-protein :omim :unigene :go :chromosome :strand :start :end :synonyms :summary)) (defmethod keyword-fields ((s entrez-gene-summary)) '(:status :type :species :strand)) (defmethod list-fields ((s entrez-gene-summary)) '(:go :synonyms :refseq-mrna :refseq-protein :unigene)) (defmethod table ((s entrez-gene-summary)) (or (table-slot s) (progn (read-summary s) (table-slot s)))) (defmethod read-summary ((s entrez-gene-summary)) (declare (optimize (speed 3) (safety 0))) (let ((table (make-hash-table :test 'equal)) (keywords (keyword-fields s)) (lists (list-fields s))) (with-open-file (f (config :entrez-gene-summary)) (loop for line = (read-line f nil :eof) until (eq line :eof) with headers = (mapcar (lambda(s) (intern (string-upcase s) 'keyword)) (split-at-char (read-line f) #\tab)) for fields = (split-at-char line #\tab) for plist = (loop for header in headers for field in fields for field-parsed = (cond ((member header keywords :test 'eq) (intern (string-upcase field) 'keyword)) ((member header lists :test 'eq) (if (equal field "") nil (split-at-char field #\|))) ((equal "" field) nil) (t field)) collect header collect field-parsed) do (setf (gethash (getf plist :id) table) plist)) (setf (table-slot s) table)))) (defmethod equivalent-ids ((s entrez-gene-summary) ll) (or (equivalent-ids-slot s) (let ((table (make-hash-table :test 'equal))) (maphash (lambda(id info) (when (eq (getf info :status) :secondary) (when (getf info :current-id) (pushnew (getf info :current-id) (gethash id table) :test 'equal) (pushnew id (gethash (getf info :current-id) table) :test 'equal)) (when (getf info :current-locuslink) (pushnew (getf info :current-locuslink) (gethash id table) :test 'equal) (pushnew id (gethash (getf info :current-locuslink) table) :test 'equal)) )) (table s)) (setf (equivalent-ids-slot s) table))) (gethash ll (equivalent-ids-slot s))) (defmethod info ((s entrez-gene-summary) locuslink) (unless (table s) (read-summary s)) (gethash locuslink (table s))) (defmethod gene-field ((s entrez-gene-summary) locuslink field) (unless (table s) (read-summary s)) (let ((res (getf (gethash locuslink (table s)) field))) (and (not (equal res "")) res))) ;; Read asn, write out the tab delimited file (defun create-entrez-gene-summary (&key (source (merge-pathnames "homo_sapiens" (config :entrez-gene-asn))) (dest (config :entrez-gene-summary))) (with-open-file (in source) (read-line in) ; skip first line (see comment regarding INCOMPATIBLE CHANGE at top) (with-open-file (out dest :if-does-not-exist :create :direction :output :if-exists :supersede) (let ((fields '(:id :status :name :type :species :locuslink :current-id :current-locuslink :refseq-mrna :refseq-protein :omim :unigene :go :chromosome :strand :start :end :synonyms :summary))) (apply 'print-tabbed out (mapcar (lambda(f) (string-upcase (symbol-name f))) fields)) (loop for form = (if (equal (peek-char t in) #\}) :eof ; extra trailing brace (see comment regarding INCOMPATIBLE CHANGE at top) (asn-read in)) when (listp form) do #+abcl (sleep .001) ; interruptability (let ((info (get-gene-info form))) (apply 'print-tabbed out (mapcar (lambda(f) (let ((it (or (getf info f) ""))) (cond ((listp it) (format nil "~{~a~^|~}" it)) ((stringp it) it) ((symbolp it) (string-downcase (string it))) (t (princ-to-string it))))) fields))) until (eq form :eof)))))) ;; What jar uses in locuslink digest ;; locusid ;; product (description field in paris) (look and see what this is) ;; organism (human mouse rat) but jar only uses human now. (like "Homo Sapiens") ;; searchs for a name official symbol then preferred symbol then alias symbol. ;; current locusid ;; iterate over gene summary entries ;; returns value as either plist of specific list of fields. ;; If you want a specific list of fields passed as argument to function then put the list of them in boa ;; Turns empty fields into nil. ;; Keywordifies species and type and status fields ;; Splits apart list valued fields (defun each-entrez-gene-summary (function &key (path (config :entrez-gene-summary)) boa) (flet ((parse-field (header value) (if (= (length value) 0) nil (if (member header (list-fields *entrez-gene*) :test 'eq) (split-at-char value #\|) (if (member header (keyword-fields *entrez-gene*) :test 'eq) (intern (string-upcase value) 'keyword) value))))) (if (slot-value *entrez-gene* 'table) (maphash (lambda(id info) (let ((info `(:id ,id ,@info))) (if boa (apply function (loop for boa in boa collect (getf info boa))) (apply function info)))) (table *entrez-gene*)) (with-open-file (f path) (let ((headers (mapcar (lambda(s)(intern (string-upcase s) 'keyword)) (split-at-char (read-line f) #\tab)))) (let ((boapos (mapcar (lambda(key) (position key headers)) boa))) (loop for line = (read-line f nil :eof) until (eq line :eof) for fields = (split-at-char line #\tab) do (apply function (if boa (loop for pos in boapos for boa in boa collect (parse-field boa (nth pos fields))) (loop for header in headers for field in fields collect header collect (parse-field header field))))))))))) ;; (defvar *locuslink-to-info* nil) ;; (defun locuslink-to-info () ;; (or *locuslink-to-info* ;; (setq *locuslink-to-info* ;; (let ((table (make-hash-table :test 'equal :size 150000))) ;; (each-entrez-gene-summary ;; (lambda (&rest info) ;; (when (member (getf info :species) '(:human :|HOMO SAPIENS|) :test 'eq) ;; (setf (gethash (getf info :id) table) info)))) ;; table)) ))