summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--LICENSE26
-rw-r--r--README.md46
-rw-r--r--doc.go4
-rw-r--r--obo.go170
-rw-r--r--obo_test.go94
5 files changed, 340 insertions, 0 deletions
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..3f92569
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,26 @@
+Copyright (c) 2013, Silvan Jegen
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+The views and conclusions contained in the software and documentation are those
+of the authors and should not be interpreted as representing official policies,
+either expressed or implied, of the FreeBSD Project.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..ee8a2e9
--- /dev/null
+++ b/README.md
@@ -0,0 +1,46 @@
+# Introduction to the obo package
+
+This is a work-in-progress Go library for parsing [.obo files](http://www.geneontology.org/GO.format.obo-1_2.shtml). The library currently parses .obo files and returns the [Term] stanzas they contain, either as a slice or one by one through a channel.
+
+Parsing of the other stanza types is not implemented yet, and several of the tag-value pairs that can appear in a [Term] stanza are not handled yet either. Please see the TODO section further below.
+
+
+
+# Examples
+
+Reading a .obo file into a slice of OboTermEntry pointers:
+
+
+```Go
+var obolist []*OboTermEntry
+parentchildren := make(map[string][]*OboTermEntry)
+
+f, err := os.Open("/path/to/file/file.obo")
+if err != nil {
+	log.Fatal(err)
+}
+defer f.Close()
+
+reader := bufio.NewReader(f)
+obolist, parentchildren = ParseToSlice(*reader, parentchildren, obolist)
+
+```
+
+To parse the file and have the OboTermEntries sent on a channel:
+
+```Go
+obochan := make(chan *OboTermEntry)
+var obolist []*OboTermEntry
+
+f, err := os.Open("/path/to/file/file.obo")
+if err != nil {
+	log.Fatal(err)
+}
+defer f.Close()
+
+reader := bufio.NewReader(f)
+obochan = ParseToChannel(*reader, obochan)
+
+for ent := range obochan {
+ obolist = append(obolist, ent)
+}
+
+```
+
+
+# TODO
+
+* Parsing of header data
+* Writing of .obo files
+* Parsing of [Typedef] stanzas
+* Parsing of [Instance] stanzas
+* Handling of several tags is still missing
+
diff --git a/doc.go b/doc.go
new file mode 100644
index 0000000..24d56ca
--- /dev/null
+++ b/doc.go
@@ -0,0 +1,4 @@
+/* See LICENSE file for copyright and license details. */
+
+// Package obo implements a simple parser for the .obo file format.
+package obo
diff --git a/obo.go b/obo.go
new file mode 100644
index 0000000..7ae6e00
--- /dev/null
+++ b/obo.go
@@ -0,0 +1,170 @@
+/* See LICENSE file for copyright and license details. */
+
+package obo
+
+import (
+ "bufio"
+ "fmt"
+ "io"
+ "os"
+ "strings"
+)
+
+// OboTermEntry holds the tags of a single [Term] stanza that the parser
+// in this package recognises. Tag values are stored with surrounding
+// spaces removed.
+type OboTermEntry struct {
+ // Id is the value of the "id" tag.
+ Id string
+ // Altids collects the values of all "alt_id" tags.
+ Altids []string
+ // Name is the value of the "name" tag.
+ Name string
+ // Def is the value of the "def" tag, stored as it appears in the file.
+ Def string
+ // Xrefs collects the values of all "xref" tags.
+ Xrefs []string
+ // Synonyms collects, for every "synonym" tag, the part of the value
+ // that precedes the first `" ` sequence, with double quotes removed.
+ Synonyms []string
+ // IsA collects, for every "is_a" tag, the part of the value that
+ // precedes the first "!".
+ IsA []string
+ // Obsolete is set from the "is_obsolete" tag.
+ Obsolete bool
+}
+
+// Filter returns the entries of s for which fn reports true, in their
+// original order. The input slice is not modified. The result is nil
+// when no entry is accepted.
+func Filter(s []*OboTermEntry, fn func(*OboTermEntry) bool) []*OboTermEntry {
+	var kept []*OboTermEntry
+	for idx := 0; idx < len(s); idx++ {
+		if candidate := s[idx]; fn(candidate) {
+			kept = append(kept, candidate)
+		}
+	}
+	return kept
+}
+
+// Dump prints a simple representation of the parsed Obo data. It is
+// meant for debugging and may be removed at a later stage.
+//
+// For every entry in oboent it writes to standard output the id, a "PT"
+// line with the name, one "SYN" line per synonym and one "NT" line per
+// child listed for that id in parentchildrenmap. Afterwards it writes a
+// synthetic "root" record with one "NT" line for every entry that has
+// no is_a parent. Three summary counters are written to standard error.
+// Nil entries are skipped.
+func Dump(oboent []*OboTermEntry, parentchildrenmap map[string][]*OboTermEntry) {
+	var potentialroots []string
+	for _, entry := range oboent {
+		if entry == nil {
+			continue
+		}
+		if len(entry.IsA) == 0 {
+			potentialroots = append(potentialroots, entry.Id)
+		}
+
+		fmt.Printf("%s\n\tPT %s\n", entry.Id, entry.Name)
+		for _, syn := range entry.Synonyms {
+			fmt.Printf("\tSYN %s\n", syn)
+		}
+		// One complete line per child, so that no empty "NT" line is
+		// left dangling after the last child.
+		for _, child := range parentchildrenmap[entry.Id] {
+			if child == nil {
+				continue
+			}
+			fmt.Printf("\tNT %s\n", child.Id)
+		}
+		fmt.Print("\n")
+	}
+	fmt.Fprintf(os.Stderr, "Number of entries in the list: %d\n", len(oboent))
+	fmt.Fprintf(os.Stderr, "Number of entries with children: %d\n", len(parentchildrenmap))
+	fmt.Fprintf(os.Stderr, "Number of orphan nodes: %d\n", len(potentialroots))
+
+	fmt.Print("root\n\tPT YourOntologyNameHere\n")
+	for _, potroot := range potentialroots {
+		fmt.Printf("\tNT %s\n", potroot)
+	}
+	fmt.Print("\n")
+}
+
+// parseObo reads .obo data from oboinput line by line and sends one
+// *OboTermEntry per [Term] stanza on obochan, in file order. The channel
+// is closed when the input is exhausted.
+//
+// Blank lines and lines starting with '!' are skipped. Every line that
+// starts with '[' begins a new stanza: the term collected so far (if
+// any) is sent, and only a "[Term]" header starts a new entry, so the
+// tags of other stanza types are ignored. If parentchildren is non-nil,
+// every is_a tag appends the current entry to the list stored under the
+// parent id.
+//
+// A read error other than io.EOF is reported on standard error and
+// terminates the process, because the signature offers no way to return
+// it to the caller.
+func parseObo(oboinput bufio.Reader, obochan chan *OboTermEntry, parentchildren map[string][]*OboTermEntry) {
+	defer close(obochan)
+
+	var entry *OboTermEntry // term being filled in; nil outside of a [Term] stanza
+	lineno := 0
+
+	for {
+		line, err := oboinput.ReadString('\n')
+		if err != nil && err != io.EOF {
+			fmt.Fprintf(os.Stderr, "Error while reading obo file at line nr. %d: %v\n", lineno+1, err)
+			os.Exit(1)
+		}
+
+		// ReadString hands back the data read so far together with
+		// io.EOF, so a final line without a trailing newline still
+		// has to be processed before leaving the loop.
+		if line != "" {
+			lineno++
+			if lineno%1000000 == 0 {
+				fmt.Fprintf(os.Stderr, "Chopped line number: %d\n", lineno)
+			}
+
+			// Drops the line terminator ("\n" or "\r\n") and any
+			// other surrounding white space.
+			line = strings.TrimSpace(line)
+
+			switch {
+			case line == "" || line[0] == '!':
+				// Blank line or comment: nothing to do.
+			case line[0] == '[':
+				if entry != nil {
+					obochan <- entry
+				}
+				entry = nil
+				if line == "[Term]" {
+					entry = new(OboTermEntry)
+				}
+			case entry != nil:
+				parseTermTag(entry, line, parentchildren)
+			}
+		}
+
+		if err == io.EOF {
+			break
+		}
+	}
+
+	// Never send nil: the input may contain no [Term] stanza at all, or
+	// end inside a stanza of another type.
+	if entry != nil {
+		obochan <- entry
+	}
+}
+
+// parseTermTag stores the "tag: value" pair found in line in entry.
+// Lines without a ':' and unknown tags are ignored.
+func parseTermTag(entry *OboTermEntry, line string, parentchildren map[string][]*OboTermEntry) {
+	splitline := strings.SplitN(line, ":", 2)
+	if len(splitline) != 2 {
+		return
+	}
+	field := strings.Trim(splitline[0], " ")
+	trimmedvalue := strings.Trim(splitline[1], " ")
+
+	switch field {
+	case "id":
+		entry.Id = trimmedvalue
+	case "name":
+		entry.Name = trimmedvalue
+	case "def":
+		entry.Def = trimmedvalue
+	case "alt_id":
+		entry.Altids = append(entry.Altids, trimmedvalue)
+	case "xref":
+		entry.Xrefs = append(entry.Xrefs, trimmedvalue)
+	case "synonym":
+		// Keep the text up to the first `" ` and strip the quotes.
+		syn := strings.SplitN(trimmedvalue, "\" ", 2)
+		entry.Synonyms = append(entry.Synonyms, strings.Replace(syn[0], "\"", "", -1))
+	case "is_a":
+		// Everything from the first '!' on is a comment.
+		isa := strings.SplitN(trimmedvalue, "!", 2)
+		trimmedisa := strings.Trim(isa[0], " ")
+		entry.IsA = append(entry.IsA, trimmedisa)
+		if parentchildren != nil {
+			parentchildren[trimmedisa] = append(parentchildren[trimmedisa], entry)
+		}
+	case "is_obsolete":
+		// Only an explicit "true" marks the term as obsolete.
+		entry.Obsolete = strings.HasPrefix(trimmedvalue, "true")
+	}
+}
+
+// ParseToSlice parses the .obo data read from oboinput and appends the
+// parsed [Term] entries to obolist. Hierarchical information is
+// recorded in parentchildren, which maps the id named by an is_a tag to
+// the entries carrying that tag. If parentchildren is nil a new map is
+// allocated, so the hierarchy is always collected.
+//
+// It returns the extended slice and the map that was filled.
+func ParseToSlice(oboinput bufio.Reader, parentchildren map[string][]*OboTermEntry, obolist []*OboTermEntry) ([]*OboTermEntry, map[string][]*OboTermEntry) {
+	if parentchildren == nil {
+		parentchildren = make(map[string][]*OboTermEntry)
+	}
+
+	obochan := make(chan *OboTermEntry, 100)
+	go parseObo(oboinput, obochan, parentchildren)
+
+	// The parser closes obochan when it is done, which ends this loop.
+	for ent := range obochan {
+		if ent == nil {
+			// Guard against a nil entry so that callers never have
+			// to check the elements of the returned slice.
+			continue
+		}
+		obolist = append(obolist, ent)
+	}
+	return obolist, parentchildren
+}
+
+// ParseToChannel starts parsing the .obo data read from oboinput in a
+// separate goroutine and returns the channel on which pointers to the
+// parsed OboTermEntry structs are sent. The channel is closed once
+// parsing has finished, so it can be consumed with a range loop. If
+// obochan is nil a new unbuffered channel is created, otherwise obochan
+// itself is used and returned.
+//
+// Please note that this function does not build the hierarchy map. If
+// you want to parse the .obo file asynchronously while still having
+// access to the hierarchical information you will have to build that
+// structure yourself from the IsA fields of the received entries.
+func ParseToChannel(oboinput bufio.Reader, obochan chan *OboTermEntry) chan *OboTermEntry {
+	// Sending on a nil channel blocks forever and closing one panics,
+	// so make sure the parser goroutine gets a usable channel.
+	if obochan == nil {
+		obochan = make(chan *OboTermEntry)
+	}
+
+	go parseObo(oboinput, obochan, nil)
+
+	return obochan
+}
diff --git a/obo_test.go b/obo_test.go
new file mode 100644
index 0000000..140f948
--- /dev/null
+++ b/obo_test.go
@@ -0,0 +1,94 @@
+/* See LICENSE file for copyright and license details. */
+
+package obo
+
+import (
+ "bufio"
+ "strings"
+ "testing"
+)
+
+// TestOboParsing parses a small .obo document through both ParseToSlice
+// and ParseToChannel and checks the returned term entries and the
+// parent/children map. The document contains header tags and a
+// [Typedef] stanza before the first term, and two consecutive [Term]
+// stanzas that are not separated by a blank line.
+func TestOboParsing(t *testing.T) {
+	s := `
+
+format-version: 1.2
+date: 17:11:2011 13:07
+saved-by: lschriml
+auto-generated-by: OBO-Edit 2.1-beta6
+default-namespace: symptoms
+
+[Typedef]
+id: part_of
+name: part_of
+
+[Term]
+id: SYMP:0000000
+name: cellulitis
+def: "Cellulitis is a musculoskeletal system symptom characterized as a diffuse and especially subcutaneous inflammation of connective tissue." [URL:http\://www2.merriam-webster.com/cgi-bin/mwmednlm?book=Medical&va=cellulitis]
+is_a: SYMP:0000891 ! musculoskeletal system symptom
+
+[Term]
+id: SYMP:0000001
+name: abdominal cramp
+is_a: SYMP:0000461 ! abdominal symptom
+
+[Term]
+id: SYMP:0000002
+name: abdominal distention
+is_a: SYMP:0000461 ! abdominal symptom
+
+[Term]
+id: SYMP:0000003
+name: acute enteritis in newborns
+is_obsolete: true
+
+[Term]
+id: SYMP:0000004
+name: arrested moulting
+is_obsolete: true
+[Term]
+id: SYMP:0000005
+name: ataxia
+def: "Ataxia is a neurological and physiological symptom characterized by an inability to coordinate voluntary muscular movements that is symptomatic of some nervous disorders." [URL:http\://www2.merriam-webster.com/cgi-bin/mwmednlm?book=Medical&va=ataxia]
+synonym: "uncoordination" EXACT []
+is_a: SYMP:0000410 ! neurological and physiological symptom
+
+[Term]
+id: SYMP:0000006
+name: backache
+def: "Backache is a pain occurring in the lower back." [URL:http\://www2.merriam-webster.com/cgi-bin/mwmednlm?book=Medical&va=backache]
+synonym: "back pain" EXACT []
+is_a: SYMP:0000099 ! pain
+
+[Term]
+id: SYMP:0000007
+name: bleeding
+def: "A general symptom that is characterized as an act, instance, or result of being bled or the process by which something is bled: as a the escape of blood from vessels." [url:http\://www.merriam-webster.com/medlineplus/bleeding]
+is_a: SYMP:0000567 ! general symptom
+
+[Term]
+id: SYMP:0000008
+name: blindness
+is_a: SYMP:0000320 ! vision symptom
+
+[Term]
+id: SYMP:0000009
+name: blister
+def: "Blister is a skin and integumentary tissue symptom characterized as a fluid-filled elevation of the epidermis." [url:http\://www2.merriam-webster.com/cgi-bin/mwmednlm?book=Medical&va=blister]`
+
+	wantids := []string{
+		"SYMP:0000000", "SYMP:0000001", "SYMP:0000002", "SYMP:0000003", "SYMP:0000004",
+		"SYMP:0000005", "SYMP:0000006", "SYMP:0000007", "SYMP:0000008", "SYMP:0000009",
+	}
+
+	// Synchronous API: slice plus hierarchy map.
+	stringreader1 := bufio.NewReader(strings.NewReader(s))
+	obolist, parentchildren := ParseToSlice(*stringreader1, make(map[string][]*OboTermEntry), nil)
+
+	if len(obolist) != len(wantids) {
+		t.Fatalf("ParseToSlice returned %d entries, want %d", len(obolist), len(wantids))
+	}
+	for i, id := range wantids {
+		if obolist[i] == nil {
+			t.Fatalf("entry %d is nil", i)
+		}
+		if obolist[i].Id != id {
+			t.Errorf("entry %d has id %q, want %q", i, obolist[i].Id, id)
+		}
+	}
+
+	first := obolist[0]
+	if first.Name != "cellulitis" {
+		t.Errorf("name of first entry is %q, want %q", first.Name, "cellulitis")
+	}
+	if !strings.HasPrefix(first.Def, "\"Cellulitis is a musculoskeletal system symptom") {
+		t.Errorf("unexpected def of first entry: %q", first.Def)
+	}
+	if len(first.IsA) != 1 || first.IsA[0] != "SYMP:0000891" {
+		t.Errorf("is_a of first entry is %v, want [SYMP:0000891]", first.IsA)
+	}
+	if first.Obsolete {
+		t.Errorf("first entry must not be obsolete")
+	}
+	if !obolist[3].Obsolete || !obolist[4].Obsolete {
+		t.Errorf("entries 3 and 4 must be obsolete")
+	}
+	if syn := obolist[5].Synonyms; len(syn) != 1 || syn[0] != "uncoordination" {
+		t.Errorf("synonyms of entry 5 are %v, want [uncoordination]", syn)
+	}
+	if syn := obolist[6].Synonyms; len(syn) != 1 || syn[0] != "back pain" {
+		t.Errorf("synonyms of entry 6 are %v, want [back pain]", syn)
+	}
+
+	children := parentchildren["SYMP:0000461"]
+	if len(children) != 2 || children[0] != obolist[1] || children[1] != obolist[2] {
+		t.Errorf("unexpected children of SYMP:0000461: %v", children)
+	}
+
+	// Asynchronous API: the same entries must arrive in the same order.
+	stringreader2 := bufio.NewReader(strings.NewReader(s))
+	var fromchan []*OboTermEntry
+	for ent := range ParseToChannel(*stringreader2, make(chan *OboTermEntry)) {
+		fromchan = append(fromchan, ent)
+	}
+	if len(fromchan) != len(wantids) {
+		t.Fatalf("ParseToChannel delivered %d entries, want %d", len(fromchan), len(wantids))
+	}
+	for i, id := range wantids {
+		if fromchan[i] == nil || fromchan[i].Id != id {
+			t.Errorf("channel entry %d is %v, want id %q", i, fromchan[i], id)
+		}
+	}
+
+	// Smoke test: Dump must cope with the parsed data.
+	Dump(obolist, parentchildren)
+}