From e0d2ede90d34fc35440c414228a536c1c869dbfc Mon Sep 17 00:00:00 2001
From: Silvan Jegen
Date: Thu, 1 Dec 2016 22:27:15 +0100
Subject: Make sure to get text in sub-elements

We use the simpler API offered by the Go standard library to extract the
article titles. I could not find another way to make sure I get the
nested sub-elements.
---
 goencxml.go | 51 ++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 42 insertions(+), 9 deletions(-)

diff --git a/goencxml.go b/goencxml.go
index b4fb7cf..35ddf5c 100644
--- a/goencxml.go
+++ b/goencxml.go
@@ -2,23 +2,56 @@ package main
 
 import (
 	"bufio"
+	"bytes"
 	"encoding/xml"
 	"fmt"
+	"io"
 	"os"
 )
 
-type article struct {
-	Title string `xml:"front>article-meta>title-group>article-title"`
-}
-
 func process(r *bufio.Reader) {
-	var a article
+	var (
+		intitle       bool
+		inarticlemeta bool
+		buffer        bytes.Buffer
+	)
 
-	err := xml.NewDecoder(r).Decode(&a)
-	if err != nil {
-		fmt.Fprintf(os.Stderr, "Error when decoding XML file %q\n", err)
+	dec := xml.NewDecoder(r)
+	for {
+		token, err := dec.Token()
+		if err == io.EOF {
+			break
+		}
+		if err != nil {
+			fmt.Fprintf(os.Stderr, "Error when decoding XML file %q\n", err)
+			os.Exit(1)
+		}
+		switch t := token.(type) {
+		case xml.StartElement:
+			if t.Name.Local == "article-meta" {
+				inarticlemeta = true
+				continue
+			}
+			if t.Name.Local == "article-title" && inarticlemeta {
+				intitle = true
+			}
+		case xml.CharData:
+			if !intitle || !inarticlemeta {
+				continue
+			}
+			buffer.Write(t)
+		case xml.EndElement:
+			if t.Name.Local == "article-meta" {
+				inarticlemeta = false
+				continue
+			}
+			if t.Name.Local == "article-title" && inarticlemeta {
+				intitle = false
+				fmt.Printf("article-title: %s\n", buffer.String())
+				buffer.Reset()
+			}
+		}
 	}
-	fmt.Printf("article-title: %s\n", a.Title)
 }
 
 func main() {
-- 
cgit v1.2.3