1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
|
/* See LICENSE file for copyright and license details. */
package unxml
import (
"bytes"
"fmt"
"io"
"os"
"golang.org/x/net/html"
)
// Reader tokenizes markup read from an underlying io.Reader and implements
// io.Reader itself, yielding only the text found inside <a> elements with the
// tags themselves removed.
//
// A Reader keeps tokenizer state between calls to Read and performs no
// locking, so it must not be copied after first use or shared between
// goroutines without external synchronization.
type Reader struct {
	reader io.Reader
	// tagmap is the set built by NewReaderKeepTags (nil when no tags were given).
	// NOTE(review): Read does not consult tagmap yet; the element filter is
	// still hard-coded to "a". Confirm the intended "keep tags" semantics
	// before wiring it in.
	tagmap map[string]bool

	tokenizer *html.Tokenizer // created lazily on the first Read
	pending   bytes.Buffer    // extracted text not yet handed to the caller
	depth     int             // number of currently open <a> elements
	err       error           // sticky terminal error reported by the tokenizer
}

// Compile-time check that *Reader satisfies io.Reader.
var _ io.Reader = (*Reader)(nil)

// NewReaderKeepTags returns a Reader that consumes markup from r.
//
// tagstokeep is recorded as a set on the returned Reader; when it is empty
// the set is left nil.
func NewReaderKeepTags(r io.Reader, tagstokeep []string) *Reader {
	var tagmap map[string]bool
	if len(tagstokeep) > 0 {
		tagmap = make(map[string]bool, len(tagstokeep))
		for _, tag := range tagstokeep {
			tagmap[tag] = true
		}
	}
	return &Reader{
		reader: r,
		tagmap: tagmap,
	}
}

// Read implements io.Reader. It copies up to len(out) bytes of extracted text
// into out and returns the number of bytes copied.
//
// Tokenizing only proceeds as far as needed to produce some output, and text
// that does not fit into out is kept for the next call. Once the tokenizer
// reports an error (io.EOF at the end of the input) and all buffered text has
// been delivered, that error is returned on every subsequent call.
func (r *Reader) Read(out []byte) (n int, err error) {
	if len(out) == 0 {
		return 0, nil
	}
	if r.tokenizer == nil {
		// One tokenizer for the lifetime of the Reader: it buffers input
		// internally, so recreating it per call would drop data.
		r.tokenizer = html.NewTokenizer(r.reader)
	}
	for r.pending.Len() == 0 && r.err == nil {
		r.advance()
	}
	if r.pending.Len() > 0 {
		// Deliver buffered text first; a terminal error is reported only
		// after everything extracted before it has been returned.
		return r.pending.Read(out)
	}
	return 0, r.err
}

// advance consumes one token from the tokenizer and updates the Reader's
// pending text, <a> nesting depth and sticky error accordingly.
func (r *Reader) advance() {
	switch tt := r.tokenizer.Next(); tt {
	case html.ErrorToken:
		r.err = r.tokenizer.Err()
		if r.err != io.EOF {
			// io.EOF is the normal end of input; only real failures are
			// reported on stderr.
			fmt.Fprintf(os.Stderr, "There was an error when reading from the underlying io.Reader: %s\n", r.err)
		}
	case html.TextToken:
		if r.depth > 0 {
			// Write copies the bytes, which is required because the slice
			// returned by Text may be invalidated by the next call to Next.
			r.pending.Write(r.tokenizer.Text())
		}
	case html.StartTagToken, html.EndTagToken:
		name, _ := r.tokenizer.TagName()
		if string(name) != "a" {
			return
		}
		if tt == html.StartTagToken {
			r.depth++
		} else if r.depth > 0 {
			// Ignore unmatched end tags so a stray </a> cannot push the
			// depth negative and hide the text of later elements.
			r.depth--
		}
	}
}
|