1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
|
/* See LICENSE file for copyright and license details. */
package unxml
import (
"fmt"
"io"
"os"
"golang.org/x/net/html"
)
// Reader bundles an input stream with the HTML tokenizer that scans it and
// the bookkeeping carried between reads.
type Reader struct {
	// reader is the source stream given to NewReaderKeepTags.
	reader io.Reader
	// tagmap has one true entry per tag name given to NewReaderKeepTags;
	// it is nil when no tag names were given.
	tagmap map[string]bool
	// lastread is where ElementReader.Read parks text that did not fit in
	// the caller's buffer, to be handed out on the following call.
	lastread []byte
	// count is reset at the start of every ElementReader.Read call and
	// tracks the number of bytes copied into the caller's buffer.
	count int
	// tokenizer splits the input into HTML tokens.
	tokenizer *html.Tokenizer
}
// ElementReader wraps a Reader and serves the text of its token stream
// through the Read method.
type ElementReader struct {
	// r is the wrapped Reader, stored by value.
	r Reader
	// tagsinstack maps a tag name to a counter. Read decrements the counter
	// when it meets a matching end tag and deletes the entry once the
	// counter is 1.
	tagsinstack map[string]int
}
//type stack []string
//
//func (s stack) Empty() bool { return len(s) == 0 }
//func (s stack) Peek() string { return s[len(s)-1] }
//func (s *stack) Pop() string {
// d := (*s)[len(*s)-1]
// (*s) = (*s)[:len(*s)-1]
// return d
//}
// NewReaderKeepTags returns a Reader whose tokenizer scans the markup read
// from r.
//
// Every name in tagstokeep is stored as a key of the Reader's tag map. When
// tagstokeep is empty the map is left nil, which is still safe to look up.
func NewReaderKeepTags(r io.Reader, tagstokeep []string) *Reader {
	// The zero value (nil) is exactly what the no-tags case needs.
	var tagmap map[string]bool
	if len(tagstokeep) > 0 {
		// Size the map from the actual input instead of a fixed guess.
		tagmap = make(map[string]bool, len(tagstokeep))
		for _, tag := range tagstokeep {
			tagmap[tag] = true
		}
	}
	return &Reader{
		reader:    r,
		tagmap:    tagmap,
		tokenizer: html.NewTokenizer(r),
	}
}
// Read implements io.Reader. It fills out with the text tokens of the
// document being tokenized; every other token kind contributes no bytes.
//
// Text that does not fit in out is kept in the wrapped Reader's lastread
// buffer and handed out first on the next call, so no bytes are dropped or
// invented between calls. Read returns as soon as out is full. End tags only
// update the tagsinstack counters. When the tokenizer yields an error token,
// Read returns the bytes gathered so far together with the tokenizer's error,
// which is io.EOF at the end of the input.
//
// NOTE(review): the fmt calls below look like leftover debug tracing (the tag
// names are written to stdout). They are preserved unchanged; confirm whether
// they can be removed.
func (r *ElementReader) Read(out []byte) (int, error) {
	fmt.Fprintf(os.Stderr, "Read has been called.\n")
	rd := &r.r
	rd.count = 0
	if len(out) == 0 {
		return 0, nil
	}

	// Serve text carried over from the previous call before pulling new
	// tokens. Only the part that was actually copied is discarded.
	if len(rd.lastread) > 0 {
		n := copy(out, rd.lastread)
		rd.count += n
		rd.lastread = rd.lastread[n:]
		if len(rd.lastread) == 0 {
			rd.lastread = nil
		}
	}

	// Stop once out is full; the remaining tokens wait for the next call.
	for rd.count < len(out) {
		switch rd.tokenizer.Next() {
		case html.ErrorToken:
			// Report the real cause (io.EOF at end of input) rather than
			// masking every failure as EOF.
			return rd.count, rd.tokenizer.Err()
		case html.TextToken:
			text := rd.tokenizer.Text()
			n := copy(out[rd.count:], text)
			rd.count += n
			if n < len(text) {
				// The slice returned by Text may be overwritten by the
				// next call to Next, so keep a private copy of the tail.
				rd.lastread = append([]byte(nil), text[n:]...)
				return rd.count, nil
			}
		case html.StartTagToken:
			// NOTE(review): nothing here adds to tagsinstack, so the
			// end-tag bookkeeping below only acts on entries populated
			// elsewhere. Presumably tags present in tagmap were meant to
			// be counted here - confirm intent.
			tn, _ := rd.tokenizer.TagName()
			fmt.Printf("TagName: %s\n", tn)
		case html.EndTagToken:
			tn, _ := rd.tokenizer.TagName()
			fmt.Printf("TagEndName: %s\n", tn)
			name := string(tn)
			if count, ok := r.tagsinstack[name]; ok {
				if count == 1 {
					delete(r.tagsinstack, name)
				} else {
					r.tagsinstack[name]--
				}
			}
		}
	}
	return rd.count, nil
}
|