From d8683f6bb8032ceb3b73c79603bc185202b129a4 Mon Sep 17 00:00:00 2001 From: Silvan Jegen Date: Sat, 3 Jan 2015 15:56:24 +0100 Subject: Initial commit --- unxml.go | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 unxml.go diff --git a/unxml.go b/unxml.go new file mode 100644 index 0000000..d8d588e --- /dev/null +++ b/unxml.go @@ -0,0 +1,70 @@ +/* See LICENSE file for copyright and license details. */ + +package unxml + +import ( + "bytes" + "fmt" + "io" + "os" + + "golang.org/x/net/html" +) + +type Reader struct { + reader io.Reader + tagmap map[string]bool +} + +func NewReaderKeepTags(r io.Reader, tagstokeep []string) *Reader { + var tagmap map[string]bool + + if len(tagstokeep) > 0 { + tagmap = make(map[string]bool, 10) + for _, tag := range tagstokeep { + tagmap[tag] = true + } + } else { + tagmap = nil + } + + return &Reader{reader: r, + tagmap: tagmap, + } +} + +func (r *Reader) Read(out []byte) (n int, err error) { + hr := html.NewTokenizer(r.reader) + buf := bytes.NewBuffer(make([]byte, len(out))) + depth := 0 + + for { + tt := hr.Next() + + switch tt { + case html.ErrorToken: + fmt.Fprintf(os.Stderr, "There was an error when reading from the underlying os.Reader: %s", tt) + return 0, hr.Err() + + case html.TextToken: + if depth > 0 { + // emitBytes should copy the []byte it receives, + // if it doesn't process it immediately. + n, err = buf.Write(hr.Text()) + } + + case html.StartTagToken, html.EndTagToken: + tn, _ := hr.TagName() + + if len(tn) == 1 && tn[0] == 'a' { + if tt == html.StartTagToken { + depth++ + } else { + depth-- + } + } + } + } + + return +} -- cgit v1.2.1-18-gbd029