diff options
| -rw-r--r-- | LICENSE | 26 | ||||
| -rw-r--r-- | README.md | 46 | ||||
| -rw-r--r-- | doc.go | 4 | ||||
| -rw-r--r-- | obo.go | 170 | ||||
| -rw-r--r-- | obo_test.go | 94 | 
5 files changed, 340 insertions, 0 deletions
| @@ -0,0 +1,26 @@ +Copyright (c) 2013, Silvan Jegen +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met:  + +1. Redistributions of source code must retain the above copyright notice, this +   list of conditions and the following disclaimer.  +2. Redistributions in binary form must reproduce the above copyright notice, +   this list of conditions and the following disclaimer in the documentation +   and/or other materials provided with the distribution.  + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +The views and conclusions contained in the software and documentation are those +of the authors and should not be interpreted as representing official policies,  +either expressed or implied, of the FreeBSD Project. diff --git a/README.md b/README.md new file mode 100644 index 0000000..ee8a2e9 --- /dev/null +++ b/README.md @@ -0,0 +1,46 @@ +# Introduction to the obo package + +This is a WIP for a go library for parsing [.obo files](http://www.geneontology.org/GO.format.obo-1_2.shtml). The library currently parses .obo files and returns either a channel to, or a slice of the [Term] stanzas contained in the files. 
+ +The parsing of other stanza types is still under construction as are several of the key: value-pairs associated with the [Term] stanza. Please see the TODO section further below. + + + +# Examples + +Reading a .obo file into a slice of OboTermEntries + + +```Go +var obolist []*OboTermEntry +var parentchildren map[string][]*OboTermEntry + +reader := bufio.NewReader(os.File("/path/to/file/file.obo")) +obolist, parentchildren = ParseToSlice(*reader, parentchildren, obolist) + +``` + +To parse the OboTermEntries and having them fed to a channel: + +```Go +obochan := make(chan *OboTermEntry) +var obolist []*OboTermEntry + +reader := bufio.NewReader(os.File("/path/to/file/file.obo")) +obochan = ParseToChannel(*reader, obochan) + +for ent := range obochan { +	obolist = append(obolist, ent) +} + +``` + + +# TODO + +* Parsing of header data +* Writing of .obo files +* Parsing of [Typedef] stanzas +* Parsing of [Instance] stanzas +* Handling of several tags is still missing + @@ -0,0 +1,4 @@ +/* See LICENSE file for copyright and license details. */ + +// Package obo is a simple parsing library for the .obo file format. +package obo @@ -0,0 +1,170 @@ +/* See LICENSE file for copyright and license details. */ + +package obo + +import ( +	"bufio" +	"fmt" +	"io" +	"os" +	"strings" +) + +// OboTermEntry collects the tag values read from a single [Term] stanza. +// Tags that can occur several times within a stanza (alt_id, xref, synonym, +// is_a) are accumulated in slices. +type OboTermEntry struct { +	Id       string // value of the "id" tag +	Altids   []string // values of the "alt_id" tags +	Name     string // value of the "name" tag +	Def      string // value of the "def" tag, surrounding spaces trimmed +	Xrefs    []string // values of the "xref" tags +	Synonyms []string // synonym texts with the double quotes removed +	IsA      []string // parent ids from "is_a" tags, without the "! ..." comment +	Obsolete bool // set from the "is_obsolete" tag +} + +// Filter returns the entries of s for which fn reports true, keeping their +// relative order. The result is nil when no entry matches. +func Filter(s []*OboTermEntry, fn func(*OboTermEntry) bool) []*OboTermEntry { +	var p []*OboTermEntry +	for _, i := range s { +		if fn(i) { +			p = append(p, i) +		} +	} +	return p +} + +// This function prints a simple representation of the parsed Obo +// data. It is used for debugging purposes and may be removed at a later +// stage. 
+func Dump(oboent []*OboTermEntry, parentchildrenmap map[string][]*OboTermEntry) { +	var potentialroots []string +	for _, entry := range oboent { +		if len(entry.IsA) == 0 { +			potentialroots = append(potentialroots, entry.Id) +		} + +		fmt.Printf("%s\n\tPT %s\n", entry.Id, entry.Name) +		if len(entry.Synonyms) > 0 { +			fmt.Print("\tSYN ") +			fmt.Print(strings.Join(entry.Synonyms, "\n\tSYN ")) +			fmt.Print("\n") +		} +		if children, ok := parentchildrenmap[entry.Id]; ok { +			fmt.Print("\tNT ") +			for _, child := range children { +				fmt.Print(child.Id, "\n\tNT ") +			} +			fmt.Print("\n") +		} +		fmt.Print("\n") +	} +	fmt.Fprintf(os.Stderr, "Number of entries in the list: %d\n", len(oboent)) +	fmt.Fprintf(os.Stderr, "Number of entries with children: %d\n", len(parentchildrenmap)) +	fmt.Fprintf(os.Stderr, "Number of orphan nodes: %d\n", len(potentialroots)) + +	fmt.Print("root\n\tPT YourOntologyNameHere\n") +	for _, potroot := range potentialroots { +		fmt.Printf("\tNT %s\n", potroot) +	} +	fmt.Print("\n") +} + +func parseObo(oboinput bufio.Reader, obochan chan *OboTermEntry, parentchildren map[string][]*OboTermEntry) { +	lineno := 0 +	var entry *OboTermEntry +	var termsstarted bool +	defer close(obochan) + +	for { +		line, err := oboinput.ReadString('\n') +		if err != nil { +			if err == io.EOF { +				break +			} +			fmt.Printf("Error while reading obo file at line nr. %d: %v\n", lineno, err) +			os.Exit(1) +		} +		lineno++ +		line = line[:len(line)-1] // chop \n +		if lineno%1000000 == 0 { +			fmt.Fprintf(os.Stderr, "Chopped line number: %d\n", lineno) +		} + +		if line == "[Term]" { +			termsstarted = true +			if entry != nil { +				obochan <- entry +			} + +			entry = new(OboTermEntry) +			continue +		} else if line == "\n" { +			continue +		} else if line == "[Typedef]" { +			continue +		} else if line == "" { +			continue +		} else if line[0] == '!' 
{ +			continue +		} + +		if termsstarted { +			splitline := strings.SplitN(line, ":", 2) +			trimmedvalue := strings.Trim(splitline[1], " ") +			field := strings.Trim(splitline[0], " ") +			switch field { +			case "id": +				entry.Id = trimmedvalue +			case "name": +				entry.Name = trimmedvalue +			case "def": +				entry.Def = trimmedvalue +			case "alt_id": +				entry.Altids = append(entry.Altids, trimmedvalue) +			case "xref": +				entry.Xrefs = append(entry.Xrefs, trimmedvalue) +			case "synonym": +				syn := strings.SplitN(trimmedvalue, "\" ", 2) +				r := strings.NewReplacer("\"", "") +				entry.Synonyms = append(entry.Synonyms, r.Replace(syn[0])) +			case "is_a": +				isa := strings.SplitN(trimmedvalue, "!", 2) +				trimmedisa := strings.Trim(isa[0], " ") +				entry.IsA = append(entry.IsA, trimmedisa) +				if parentchildren != nil { +					parentchildren[trimmedisa] = append(parentchildren[trimmedisa], entry) +				} +			case "is_obsolete": +				entry.Obsolete = true +			} +		} +	} +	obochan <- entry +} + +// Parses a .obo file given as a bufio.Reader into a slice of +// OboTermEntry's. Hierarchical information is saved in a map and +// returned together with the slice. +func ParseToSlice(oboinput bufio.Reader, parentchildren map[string][]*OboTermEntry, obolist []*OboTermEntry) ([]*OboTermEntry, map[string][]*OboTermEntry) { +	var ent *OboTermEntry +	obochan := make(chan *OboTermEntry, 100) + +	go parseObo(oboinput, obochan, parentchildren) + +	for ent = range obochan { +		obolist = append(obolist, ent) +	} +	return obolist, parentchildren +} + +// This function returns a channel on which pointers to the parsed +// OboTermEntry structs will be sent. Please note that this function +// does not return the hierarchy map. If you want to parse the .obo +// file asynchronously while still having access to the hierarchical +// information you will have to build the structure containing the +// hierarchical information yourself. 
+func ParseToChannel(oboinput bufio.Reader, obochan chan *OboTermEntry) chan *OboTermEntry { + +	// Parsing runs in its own goroutine; parseObo closes obochan when the +	// input is exhausted. Passing nil means no parent/child map is filled. +	go parseObo(oboinput, obochan, nil) + +	return obochan +} diff --git a/obo_test.go b/obo_test.go new file mode 100644 index 0000000..140f948 --- /dev/null +++ b/obo_test.go @@ -0,0 +1,94 @@ +/* See LICENSE file for copyright and license details. */ + +package obo + +import ( +	"bufio" +	"strings" +	"testing" +) + +// TestOboParsing runs one sample document through both ParseToSlice and +// ParseToChannel and prints the result with Dump. It contains no assertions, +// so it only detects panics. +func TestOboParsing(*testing.T) { +	parentchildren := make(map[string][]*OboTermEntry) +	obochan := make(chan *OboTermEntry) +	var obolist []*OboTermEntry + +	s := ` + +format-version: 1.2 +date: 17:11:2011 13:07 +saved-by: lschriml +auto-generated-by: OBO-Edit 2.1-beta6 +default-namespace: symptoms + +[Typedef] +id: part_of +name: part_of + +[Term] +id: SYMP:0000000 +name: cellulitis +def: "Cellulitis is a musculoskeletal system symptom characterized as a diffuse and especially subcutaneous inflammation of connective tissue." [URL:http\://www2.merriam-webster.com/cgi-bin/mwmednlm?book=Medical&va=cellulitis] +is_a: SYMP:0000891 ! musculoskeletal system symptom + +[Term] +id: SYMP:0000001 +name: abdominal cramp +is_a: SYMP:0000461 ! abdominal symptom + +[Term] +id: SYMP:0000002 +name: abdominal distention +is_a: SYMP:0000461 ! abdominal symptom + +[Term] +id: SYMP:0000003 +name: acute enteritis in newborns +is_obsolete: true + +[Term] +id: SYMP:0000004 +name: arrested moulting +is_obsolete: true +[Term] +id: SYMP:0000005 +name: ataxia +def: "Ataxia is a neurological and physiological symptom characterized by an inability to coordinate voluntary muscular movements that is symptomatic of some nervous disorders." [URL:http\://www2.merriam-webster.com/cgi-bin/mwmednlm?book=Medical&va=ataxia] +synonym: "uncoordination" EXACT [] +is_a: SYMP:0000410 ! neurological and physiological symptom + +[Term] +id: SYMP:0000006 +name: backache +def: "Backache is a pain occurring in the lower back." 
[URL:http\://www2.merriam-webster.com/cgi-bin/mwmednlm?book=Medical&va=backache] +synonym: "back pain" EXACT [] +is_a: SYMP:0000099 ! pain + +[Term] +id: SYMP:0000007 +name: bleeding +def: "A general symptom that is characterized as an act, instance, or result of being bled or the process by which something is bled: as a the escape of blood from vessels." [url:http\://www.merriam-webster.com/medlineplus/bleeding] +is_a: SYMP:0000567 ! general symptom + +[Term] +id: SYMP:0000008 +name: blindness +is_a: SYMP:0000320 ! vision symptom + +[Term] +id: SYMP:0000009 +name: blister +def: "Blister is a skin and integumentary tissue symptom characterized as a fluid-filled elevation of the epidermis." [url:http\://www2.merriam-webster.com/cgi-bin/mwmednlm?book=Medical&va=blister]` + +	stringreader1 := bufio.NewReader(strings.NewReader(s)) +	stringreader2 := bufio.NewReader(strings.NewReader(s)) + +	obolist, parentchildren = ParseToSlice(*stringreader1, parentchildren, obolist) +	obochan = ParseToChannel(*stringreader2, obochan) + +	// obolist already holds the ParseToSlice results, so after this loop +	// every term is present twice. +	for ent := range obochan { +		obolist = append(obolist, ent) +	} + +	Dump(obolist, parentchildren) +} | 
