unxml.go


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70

/* See LICENSE file for copyright and license details. */

package unxml

import (
	"bytes"
	"fmt"
	"io"
	"os"

	"golang.org/x/net/html"
)

// Reader is an io.Reader that tokenizes markup read from an underlying
// io.Reader and yields only the text it extracts from it.
//
// Text is currently emitted only while inside an <a> element.
// NOTE(review): tagmap is populated by NewReaderKeepTags but Read does not
// consult it yet; the <a> filter is hard-coded. Confirm the intended
// semantics of "tags to keep" before wiring it in.
type Reader struct {
	reader io.Reader
	tagmap map[string]bool

	// tokenizer is created lazily on the first Read and kept for the
	// lifetime of the Reader: it buffers input internally, so building a
	// new one per call would drop whatever it had already read ahead.
	tokenizer *html.Tokenizer
	// pending holds extracted text that has not been handed to the caller yet.
	pending bytes.Buffer
	// depth is the current nesting level of <a> elements.
	depth int
	// err is the terminal tokenizer error; once set it is returned by every
	// Read after pending has been drained.
	err error
}

// Compile-time check that *Reader satisfies io.Reader.
var _ io.Reader = (*Reader)(nil)

// NewReaderKeepTags returns a Reader that reads markup from r.
//
// The names in tagstokeep are recorded as a set on the Reader; when
// tagstokeep is empty the set is left nil.
func NewReaderKeepTags(r io.Reader, tagstokeep []string) *Reader {
	var tagmap map[string]bool

	if len(tagstokeep) > 0 {
		tagmap = make(map[string]bool, len(tagstokeep))
		for _, tag := range tagstokeep {
			tagmap[tag] = true
		}
	}

	return &Reader{
		reader: r,
		tagmap: tagmap,
	}
}

// Read implements io.Reader. It copies up to len(out) bytes of extracted text
// into out and returns the number of bytes copied.
//
// Tokenizing continues until some text is available or the tokenizer reports
// an error. Buffered text is always delivered before the error is returned;
// at the end of the input that error is io.EOF. Text that does not fit into
// out is kept and returned by subsequent calls.
func (r *Reader) Read(out []byte) (n int, err error) {
	if len(out) == 0 {
		return 0, nil
	}
	if r.tokenizer == nil {
		r.tokenizer = html.NewTokenizer(r.reader)
	}

	for r.pending.Len() == 0 && r.err == nil {
		switch tt := r.tokenizer.Next(); tt {
		case html.ErrorToken:
			r.err = r.tokenizer.Err()
			// Reaching the end of the input is the normal way to finish,
			// so only genuine failures are reported on stderr.
			if r.err != io.EOF {
				fmt.Fprintf(os.Stderr, "There was an error when reading from the underlying io.Reader: %s\n", r.err)
			}

		case html.TextToken:
			if r.depth > 0 {
				// Write copies the bytes, which is required because the
				// slice returned by Text may change on the next call to Next.
				r.pending.Write(r.tokenizer.Text())
			}

		case html.StartTagToken, html.EndTagToken:
			tn, _ := r.tokenizer.TagName()

			if len(tn) == 1 && tn[0] == 'a' {
				if tt == html.StartTagToken {
					r.depth++
				} else if r.depth > 0 {
					// Ignore a stray </a> so it cannot push depth below
					// zero and hide the text of a later <a> element.
					r.depth--
				}
			}
		}
	}

	if r.pending.Len() > 0 {
		return r.pending.Read(out)
	}
	return 0, r.err
}