From 389e98a4fa0b9d1fde5343a62b64df3ca5ccbda9 Mon Sep 17 00:00:00 2001 From: Silvan Jegen Date: Thu, 17 Oct 2013 20:36:39 +0200 Subject: Initial commit --- LICENSE | 26 ++++++++++ README.md | 46 ++++++++++++++++ doc.go | 4 ++ obo.go | 170 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ obo_test.go | 94 +++++++++++++++++++++++++++++++++ 5 files changed, 340 insertions(+) create mode 100644 LICENSE create mode 100644 README.md create mode 100644 doc.go create mode 100644 obo.go create mode 100644 obo_test.go diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..3f92569 --- /dev/null +++ b/LICENSE @@ -0,0 +1,26 @@ +Copyright (c) 2013, Silvan Jegen +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +The views and conclusions contained in the software and documentation are those +of the authors and should not be interpreted as representing official policies, +either expressed or implied, of the FreeBSD Project. diff --git a/README.md b/README.md new file mode 100644 index 0000000..ee8a2e9 --- /dev/null +++ b/README.md @@ -0,0 +1,46 @@ +# Introduction to the obo package + +This is a work-in-progress Go library for parsing [.obo files](http://www.geneontology.org/GO.format.obo-1_2.shtml). The library currently parses .obo files and returns either a channel of, or a slice of, the [Term] stanzas contained in the files. + +The parsing of other stanza types is still under construction, as are several of the key: value pairs associated with the [Term] stanza. Please see the TODO section further below. + + + +# Examples + +Reading a .obo file into a slice of OboTermEntries: + + +```Go +var obolist []*OboTermEntry +parentchildren := make(map[string][]*OboTermEntry) + +reader := bufio.NewReader(file) // file is the *os.File returned by os.Open("/path/to/file/file.obo") +obolist, parentchildren = ParseToSlice(*reader, parentchildren, obolist) + +``` + +To parse the OboTermEntries and have them fed to a channel: + +```Go +obochan := make(chan *OboTermEntry) +var obolist []*OboTermEntry + +reader := bufio.NewReader(file) // file is the *os.File returned by os.Open("/path/to/file/file.obo") +obochan = ParseToChannel(*reader, obochan) + +for ent := range obochan { + obolist = append(obolist, ent) +} + +``` + + +# TODO + +* Parsing of header data +* Writing of .obo files +* Parsing of [Typedef] stanzas +* Parsing of [Instance] stanzas +* Handling of several tags is still missing + diff --git a/doc.go b/doc.go new file mode 100644 index 0000000..24d56ca --- /dev/null +++ b/doc.go @@ -0,0 +1,4 @@ +/* See LICENSE file for copyright and license details. 
/* See LICENSE file for copyright and license details. */

// A simple parsing library for the .obo file format.

// OboTermEntry holds the tags of a single [Term] stanza that the parser
// currently understands. Tags that are not listed here are ignored.
type OboTermEntry struct {
	Id       string   // value of the "id" tag
	Altids   []string // values of the "alt_id" tags
	Name     string   // value of the "name" tag
	Def      string   // raw value of the "def" tag
	Xrefs    []string // values of the "xref" tags
	Synonyms []string // synonym texts, quotes removed
	IsA      []string // parent ids from the "is_a" tags
	Obsolete bool     // set by "is_obsolete: true"
}

// Filter returns the entries of s for which fn reports true, keeping their
// original order. The result is nil when no entry matches.
func Filter(s []*OboTermEntry, fn func(*OboTermEntry) bool) []*OboTermEntry {
	var p []*OboTermEntry
	for _, e := range s {
		if fn(e) {
			p = append(p, e)
		}
	}
	return p
}

// Dump prints a simple representation of the parsed data to standard output
// and a few statistics to standard error. It is used for debugging purposes
// and may be removed at a later stage.
func Dump(oboent []*OboTermEntry, parentchildrenmap map[string][]*OboTermEntry) {
	dumpTo(os.Stdout, os.Stderr, oboent, parentchildrenmap)
}

// dumpTo writes the listing produced by Dump to out and the statistics to
// stats, so that the output can be captured.
func dumpTo(out, stats io.Writer, oboent []*OboTermEntry, parentchildrenmap map[string][]*OboTermEntry) {
	var potentialroots []string
	for _, entry := range oboent {
		// An entry without any is_a tag has no parent and is listed
		// below the synthetic root node.
		if len(entry.IsA) == 0 {
			potentialroots = append(potentialroots, entry.Id)
		}

		fmt.Fprintf(out, "%s\n\tPT %s\n", entry.Id, entry.Name)
		for _, syn := range entry.Synonyms {
			fmt.Fprintf(out, "\tSYN %s\n", syn)
		}
		// One complete NT line per child; no dangling "NT" line is left
		// behind after the last one.
		for _, child := range parentchildrenmap[entry.Id] {
			fmt.Fprintf(out, "\tNT %s\n", child.Id)
		}
		fmt.Fprint(out, "\n")
	}
	fmt.Fprintf(stats, "Number of entries in the list: %d\n", len(oboent))
	fmt.Fprintf(stats, "Number of entries with children: %d\n", len(parentchildrenmap))
	fmt.Fprintf(stats, "Number of orphan nodes: %d\n", len(potentialroots))

	fmt.Fprint(out, "root\n\tPT YourOntologyNameHere\n")
	for _, potroot := range potentialroots {
		fmt.Fprintf(out, "\tNT %s\n", potroot)
	}
	fmt.Fprint(out, "\n")
}

// parseObo reads .obo data from oboinput line by line and sends one
// *OboTermEntry per [Term] stanza on obochan. The channel is closed once the
// input is exhausted. When parentchildren is non-nil, every is_a tag adds the
// current entry to the list stored under its parent id.
//
// A read error other than io.EOF is reported on standard error and ends the
// process, because the callers have no way of receiving an error value.
func parseObo(oboinput bufio.Reader, obochan chan *OboTermEntry, parentchildren map[string][]*OboTermEntry) {
	defer close(obochan)

	var entry *OboTermEntry // term being filled in; nil outside of [Term] stanzas
	lineno := 0

	for {
		line, err := oboinput.ReadString('\n')
		if err != nil && err != io.EOF {
			fmt.Fprintf(os.Stderr, "Error while reading obo file at line nr. %d: %v\n", lineno, err)
			os.Exit(1)
		}

		// ReadString hands back an unterminated final line together with
		// io.EOF, so the data is processed before the error is acted on.
		if line != "" {
			lineno++
			if lineno%1000000 == 0 {
				fmt.Fprintf(os.Stderr, "Chopped line number: %d\n", lineno)
			}
			// TrimSpace removes "\n" as well as the "\r" of CRLF files.
			entry = consumeLine(strings.TrimSpace(line), entry, obochan, parentchildren)
		}

		if err == io.EOF {
			break
		}
	}

	// Input without any [Term] stanza must not produce a nil entry.
	if entry != nil {
		obochan <- entry
	}
}

// consumeLine interprets one whitespace-trimmed line. It returns the entry
// that the following lines belong to: a fresh one after a [Term] header, nil
// after any other stanza header, and cur otherwise. A finished entry is sent
// on obochan as soon as the next stanza header is seen.
func consumeLine(line string, cur *OboTermEntry, obochan chan<- *OboTermEntry, parentchildren map[string][]*OboTermEntry) *OboTermEntry {
	switch {
	case line == "" || line[0] == '!':
		// Blank line or comment line.
		return cur
	case line[0] == '[' && line[len(line)-1] == ']':
		if cur != nil {
			obochan <- cur
		}
		if line == "[Term]" {
			return new(OboTermEntry)
		}
		// [Typedef], [Instance], ...: their tags must not leak into the
		// term that preceded them.
		return nil
	case cur == nil:
		// Header tag, or tag of a stanza type that is not parsed.
		return nil
	}
	applyTag(cur, line, parentchildren)
	return cur
}

// applyTag stores the "tag: value" pair found in line in entry. Lines
// without a colon and unknown tags are ignored.
func applyTag(entry *OboTermEntry, line string, parentchildren map[string][]*OboTermEntry) {
	sep := strings.Index(line, ":")
	if sep < 0 {
		return
	}
	field := strings.TrimSpace(line[:sep])
	value := strings.TrimSpace(line[sep+1:])

	switch field {
	case "id":
		entry.Id = value
	case "name":
		entry.Name = value
	case "def":
		entry.Def = value
	case "alt_id":
		entry.Altids = append(entry.Altids, value)
	case "xref":
		entry.Xrefs = append(entry.Xrefs, value)
	case "synonym":
		// Keep the quoted text only; scope and dbxref list are dropped.
		text := strings.SplitN(value, "\" ", 2)[0]
		entry.Synonyms = append(entry.Synonyms, strings.Replace(text, "\"", "", -1))
	case "is_a":
		// Everything after "!" is a human-readable comment.
		parent := strings.TrimSpace(strings.SplitN(value, "!", 2)[0])
		entry.IsA = append(entry.IsA, parent)
		if parentchildren != nil {
			parentchildren[parent] = append(parentchildren[parent], entry)
		}
	case "is_obsolete":
		// Honour the value instead of treating any occurrence as true.
		entry.Obsolete = strings.HasPrefix(value, "true")
	}
}

// ParseToSlice parses the .obo data read from oboinput and appends one
// OboTermEntry per [Term] stanza to obolist. Hierarchical information is
// collected in parentchildren, which maps a parent id to the entries naming
// it in an is_a tag; a nil map is replaced by a newly allocated one. The
// extended slice and the map are returned.
func ParseToSlice(oboinput bufio.Reader, parentchildren map[string][]*OboTermEntry, obolist []*OboTermEntry) ([]*OboTermEntry, map[string][]*OboTermEntry) {
	if parentchildren == nil {
		parentchildren = make(map[string][]*OboTermEntry)
	}
	obochan := make(chan *OboTermEntry, 100)

	go parseObo(oboinput, obochan, parentchildren)

	// The parser closes the channel when it is done, which also makes its
	// writes to parentchildren visible here.
	for ent := range obochan {
		obolist = append(obolist, ent)
	}
	return obolist, parentchildren
}

// ParseToChannel starts parsing the .obo data read from oboinput in the
// background and returns the channel on which pointers to the parsed
// OboTermEntry structs are sent. The channel is closed after the last entry.
// A nil obochan is replaced by a newly created unbuffered channel.
//
// This function does not return the hierarchy map. If you want to parse the
// .obo file asynchronously while still having access to the hierarchical
// information, you will have to build that structure yourself from the IsA
// fields of the received entries.
func ParseToChannel(oboinput bufio.Reader, obochan chan *OboTermEntry) chan *OboTermEntry {
	if obochan == nil {
		obochan = make(chan *OboTermEntry)
	}

	go parseObo(oboinput, obochan, nil)

	return obochan
}

// TestOboParsing parses a small excerpt of the symptom ontology through both
// entry points and checks the results.
func TestOboParsing(t *testing.T) {
	s := `

format-version: 1.2
date: 17:11:2011 13:07
saved-by: lschriml
auto-generated-by: OBO-Edit 2.1-beta6
default-namespace: symptoms

[Typedef]
id: part_of
name: part_of

[Term]
id: SYMP:0000000
name: cellulitis
def: "Cellulitis is a musculoskeletal system symptom characterized as a diffuse and especially subcutaneous inflammation of connective tissue." [URL:http\://www2.merriam-webster.com/cgi-bin/mwmednlm?book=Medical&va=cellulitis]
is_a: SYMP:0000891 ! musculoskeletal system symptom

[Term]
id: SYMP:0000001
name: abdominal cramp
is_a: SYMP:0000461 ! abdominal symptom

[Term]
id: SYMP:0000002
name: abdominal distention
is_a: SYMP:0000461 ! abdominal symptom

[Term]
id: SYMP:0000003
name: acute enteritis in newborns
is_obsolete: true

[Term]
id: SYMP:0000004
name: arrested moulting
is_obsolete: true
[Term]
id: SYMP:0000005
name: ataxia
def: "Ataxia is a neurological and physiological symptom characterized by an inability to coordinate voluntary muscular movements that is symptomatic of some nervous disorders." [URL:http\://www2.merriam-webster.com/cgi-bin/mwmednlm?book=Medical&va=ataxia]
synonym: "uncoordination" EXACT []
is_a: SYMP:0000410 ! neurological and physiological symptom

[Term]
id: SYMP:0000006
name: backache
def: "Backache is a pain occurring in the lower back." [URL:http\://www2.merriam-webster.com/cgi-bin/mwmednlm?book=Medical&va=backache]
synonym: "back pain" EXACT []
is_a: SYMP:0000099 ! pain

[Term]
id: SYMP:0000007
name: bleeding
def: "A general symptom that is characterized as an act, instance, or result of being bled or the process by which something is bled: as a the escape of blood from vessels." [url:http\://www.merriam-webster.com/medlineplus/bleeding]
is_a: SYMP:0000567 ! general symptom

[Term]
id: SYMP:0000008
name: blindness
is_a: SYMP:0000320 ! vision symptom

[Term]
id: SYMP:0000009
name: blister
def: "Blister is a skin and integumentary tissue symptom characterized as a fluid-filled elevation of the epidermis." [url:http\://www2.merriam-webster.com/cgi-bin/mwmednlm?book=Medical&va=blister]`

	stringreader1 := bufio.NewReader(strings.NewReader(s))
	stringreader2 := bufio.NewReader(strings.NewReader(s))

	obolist, parentchildren := ParseToSlice(*stringreader1, make(map[string][]*OboTermEntry), nil)
	if len(obolist) != 10 {
		t.Fatalf("ParseToSlice returned %d entries, want 10", len(obolist))
	}
	if obolist[0].Id != "SYMP:0000000" || obolist[0].Name != "cellulitis" {
		t.Errorf("first entry is %q / %q", obolist[0].Id, obolist[0].Name)
	}
	if !obolist[3].Obsolete || obolist[5].Obsolete {
		t.Errorf("obsolete flags: entry 3 = %v, entry 5 = %v", obolist[3].Obsolete, obolist[5].Obsolete)
	}
	if len(obolist[6].Synonyms) != 1 || obolist[6].Synonyms[0] != "back pain" {
		t.Errorf("synonyms of entry 6: %q", obolist[6].Synonyms)
	}
	// The input does not end in a newline; its last line must not be lost.
	if obolist[9].Def == "" {
		t.Errorf("def of the last entry was dropped")
	}
	if got := len(parentchildren["SYMP:0000461"]); got != 2 {
		t.Errorf("SYMP:0000461 has %d children, want 2", got)
	}

	obochan := ParseToChannel(*stringreader2, make(chan *OboTermEntry))
	for ent := range obochan {
		obolist = append(obolist, ent)
	}
	if len(obolist) != 20 {
		t.Fatalf("got %d entries after ParseToChannel, want 20", len(obolist))
	}

	var out, stats bytes.Buffer
	dumpTo(&out, &stats, obolist, parentchildren)
	if !strings.HasPrefix(out.String(), "SYMP:0000000\n\tPT cellulitis\n") {
		t.Errorf("unexpected start of dump: %q", out.String())
	}
	if !strings.Contains(stats.String(), "Number of entries in the list: 20\n") {
		t.Errorf("unexpected statistics: %q", stats.String())
	}
}