diff options
author | Silvan Jegen <s.jegen@gmail.com> | 2015-01-03 15:56:24 +0100 |
---|---|---|
committer | Silvan Jegen <s.jegen@gmail.com> | 2015-01-03 15:56:24 +0100 |
commit | d8683f6bb8032ceb3b73c79603bc185202b129a4 (patch) | |
tree | d3bfe452d36e5d28cf76c8855b9abcf4586d7336 /unxml.go |
Initial commit
Diffstat (limited to 'unxml.go')
-rw-r--r-- | unxml.go | 70 |
1 files changed, 70 insertions, 0 deletions
diff --git a/unxml.go b/unxml.go new file mode 100644 index 0000000..d8d588e --- /dev/null +++ b/unxml.go @@ -0,0 +1,70 @@ +/* See LICENSE file for copyright and license details. */ + +package unxml + +import ( + "bytes" + "fmt" + "io" + "os" + + "golang.org/x/net/html" +) + +type Reader struct { + reader io.Reader + tagmap map[string]bool +} + +func NewReaderKeepTags(r io.Reader, tagstokeep []string) *Reader { + var tagmap map[string]bool + + if len(tagstokeep) > 0 { + tagmap = make(map[string]bool, 10) + for _, tag := range tagstokeep { + tagmap[tag] = true + } + } else { + tagmap = nil + } + + return &Reader{reader: r, + tagmap: tagmap, + } +} + +func (r *Reader) Read(out []byte) (n int, err error) { + hr := html.NewTokenizer(r.reader) + buf := bytes.NewBuffer(make([]byte, len(out))) + depth := 0 + + for { + tt := hr.Next() + + switch tt { + case html.ErrorToken: + fmt.Fprintf(os.Stderr, "There was an error when reading from the underlying os.Reader: %s", tt) + return 0, hr.Err() + + case html.TextToken: + if depth > 0 { + // emitBytes should copy the []byte it receives, + // if it doesn't process it immediately. + n, err = buf.Write(hr.Text()) + } + + case html.StartTagToken, html.EndTagToken: + tn, _ := hr.TagName() + + if len(tn) == 1 && tn[0] == 'a' { + if tt == html.StartTagToken { + depth++ + } else { + depth-- + } + } + } + } + + return +} |