/* See LICENSE file for copyright and license details. */
// Package unxml exposes the text content of an XML/HTML stream as an
// io.Reader, stripping the markup.
package unxml

import (
	"fmt"
	"io"
	"os"

	"golang.org/x/net/html"
)

// Reader reads an XML/HTML stream and yields only its text content.
type Reader struct {
	reader    io.Reader
	tagmap    map[string]bool // tags the caller asked to keep; nil keeps none
	lastread  []byte          // text that did not fit into out on the previous Read
	count     int             // bytes written into out during the current Read
	tokenizer *html.Tokenizer
}

// NewReaderKeepTags returns a Reader over r. The names in tagstokeep record
// which tags the caller wants preserved; an empty list leaves tagmap nil.
func NewReaderKeepTags(r io.Reader, tagstokeep []string) *Reader {
	var tagmap map[string]bool
	if len(tagstokeep) > 0 {
		tagmap = make(map[string]bool, len(tagstokeep))
		for _, tag := range tagstokeep {
			tagmap[tag] = true
		}
	}
	return &Reader{
		reader:    r,
		tagmap:    tagmap,
		tokenizer: html.NewTokenizer(r),
	}
}

// Read implements io.Reader: it fills out with the text content of the
// stream, carrying over in lastread whatever does not fit for the next call.
func (r *Reader) Read(out []byte) (int, error) {
	fmt.Fprintf(os.Stderr, "Read has been called.\n")
	r.count = 0
	lenout := len(out)
	if lenout == 0 {
		return r.count, nil
	}
	// Drain text left over from the previous call before tokenizing further.
	if len(r.lastread) > 0 {
		n := copy(out, r.lastread)
		r.count += n
		r.lastread = r.lastread[n:]
		lenout -= n
		if lenout == 0 {
			return r.count, nil
		}
	}
	for {
		tt := r.tokenizer.Next()
		switch tt {
		case html.ErrorToken:
			// Err returns io.EOF at the end of the input; anything else is
			// a genuine parse error.
			err := r.tokenizer.Err()
			if err != io.EOF {
				fmt.Fprintf(os.Stderr, "There was an error when parsing the html: %s\n", err)
			}
			return r.count, err
		case html.TextToken:
			text := r.tokenizer.Text()
			if len(text) <= lenout {
				n := copy(out[r.count:], text)
				r.count += n
				lenout -= n
			} else {
				// Not enough room: copy what fits and save the remainder.
				// The tokenizer may reuse text's backing array on the next
				// call to Next, so keep our own copy.
				n := copy(out[r.count:], text[:lenout])
				r.count += n
				r.lastread = append([]byte(nil), text[lenout:]...)
				return r.count, nil
			}
		case html.StartTagToken, html.EndTagToken:
			// Tag names are consumed and discarded; tagmap is where a
			// keep-tags filter would be consulted.
			_, _ = r.tokenizer.TagName()
		}
	}
}