summaryrefslogtreecommitdiff
path: root/unxml.go
diff options
context:
space:
mode:
author Silvan Jegen <s.jegen@gmail.com> 2015-01-03 15:56:24 +0100
committer Silvan Jegen <s.jegen@gmail.com> 2015-01-03 15:56:24 +0100
commit d8683f6bb8032ceb3b73c79603bc185202b129a4 (patch)
tree d3bfe452d36e5d28cf76c8855b9abcf4586d7336 /unxml.go
Initial commit
Diffstat (limited to 'unxml.go')
-rw-r--r-- unxml.go 70
1 files changed, 70 insertions, 0 deletions
diff --git a/unxml.go b/unxml.go
new file mode 100644
index 0000000..d8d588e
--- /dev/null
+++ b/unxml.go
@@ -0,0 +1,70 @@
+/* See LICENSE file for copyright and license details. */
+
+package unxml
+
+import (
+ "bytes"
+ "fmt"
+ "io"
+ "os"
+
+ "golang.org/x/net/html"
+)
+
+// Reader is an io.Reader that tokenizes the markup read from an underlying
+// io.Reader and yields only the text found inside <a> elements.
+//
+// A Reader keeps tokenizer state between calls to Read, so it must not be
+// copied after its first use.
+type Reader struct {
+	reader io.Reader       // source of the raw markup
+	tagmap map[string]bool // set built by NewReaderKeepTags; nil when no tags were given
+
+	tokenizer *html.Tokenizer // created lazily on the first Read
+	pending   bytes.Buffer    // extracted text not yet handed to the caller
+	depth     int             // number of currently open <a> elements
+	err       error           // sticky error reported by the tokenizer
+}
+
+// Compile-time check that *Reader satisfies io.Reader.
+var _ io.Reader = (*Reader)(nil)
+
+// NewReaderKeepTags returns a Reader that extracts text from the markup read
+// from r. The names in tagstokeep are recorded as a set on the Reader; when
+// tagstokeep is empty the set is left nil.
+//
+// NOTE(review): the recorded set is stored but not consulted by Read, which
+// is hard-wired to <a> elements. Confirm the intended semantics of "keep"
+// before wiring it in.
+func NewReaderKeepTags(r io.Reader, tagstokeep []string) *Reader {
+	var tagmap map[string]bool
+
+	if len(tagstokeep) > 0 {
+		tagmap = make(map[string]bool, len(tagstokeep))
+		for _, tag := range tagstokeep {
+			tagmap[tag] = true
+		}
+	}
+
+	return &Reader{
+		reader: r,
+		tagmap: tagmap,
+	}
+}
+
+// Read implements io.Reader. It copies into out the text that appears inside
+// <a> elements of the underlying stream and returns the number of bytes
+// copied.
+//
+// Text that does not fit into out is retained and returned by subsequent
+// calls. Once all extracted text has been delivered, Read returns 0 together
+// with the tokenizer's error, which is io.EOF at the normal end of input.
+// A zero-length out yields 0, nil without consuming any input.
+func (r *Reader) Read(out []byte) (n int, err error) {
+	if len(out) == 0 {
+		return 0, nil
+	}
+
+	// The tokenizer buffers input internally, so it has to live across calls;
+	// creating it lazily also covers Readers built without the constructor.
+	if r.tokenizer == nil {
+		r.tokenizer = html.NewTokenizer(r.reader)
+	}
+
+	// Consume tokens until there is text to deliver or the stream has ended.
+	for r.pending.Len() == 0 && r.err == nil {
+		r.advance()
+	}
+
+	// Drain buffered text before surfacing the error, so that no data which
+	// preceded the error (or EOF) is lost.
+	if r.pending.Len() > 0 {
+		return r.pending.Read(out)
+	}
+
+	return 0, r.err
+}
+
+// advance consumes one token from the tokenizer and updates the anchor depth,
+// the pending text buffer, or the sticky error accordingly.
+func (r *Reader) advance() {
+	switch tt := r.tokenizer.Next(); tt {
+	case html.ErrorToken:
+		r.err = r.tokenizer.Err()
+		// io.EOF is the regular end of input; only real failures are
+		// reported on stderr, as the original implementation did.
+		if r.err != io.EOF {
+			fmt.Fprintf(os.Stderr, "There was an error when reading from the underlying io.Reader: %v\n", r.err)
+		}
+
+	case html.TextToken:
+		if r.depth > 0 {
+			// Text() is only valid until the next call to Next; Write copies
+			// the bytes. bytes.Buffer.Write always returns a nil error.
+			r.pending.Write(r.tokenizer.Text())
+		}
+
+	case html.StartTagToken, html.EndTagToken:
+		tn, _ := r.tokenizer.TagName()
+
+		if len(tn) == 1 && tn[0] == 'a' {
+			if tt == html.StartTagToken {
+				r.depth++
+			} else if r.depth > 0 {
+				// A stray </a> must not drive the depth negative, or the
+				// text of the next well-formed anchor would be dropped.
+				r.depth--
+			}
+		}
+	}
+}