path: root/unxml.go
author    Silvan Jegen <s.jegen@gmail.com>    2015-06-07 14:54:11 +0200
committer Silvan Jegen <s.jegen@gmail.com>    2015-06-07 14:54:38 +0200
commit    184819b5888665844defa43324edcea09cab4429 (patch)
tree      808f5268beaeb565afdf5df3774435973d1a8fd1 /unxml.go
parent    6c5a12885f101882148d91effa9a290dfe15d639 (diff)
Make the KeepElements functionality mostly work
Diffstat (limited to 'unxml.go')
-rw-r--r--  unxml.go  29
1 file changed, 25 insertions, 4 deletions
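
The change below is meant to make the reader returned by NewReaderKeepElements emit only the elements named in tagstokeep, together with their text. A minimal usage sketch follows; it assumes the package is called unxml (matching the file name), that ElementReader satisfies io.Reader through the Read method shown in the diff, and the file name example_test.go is made up for illustration:

// example_test.go (hypothetical), same package as unxml.go
package unxml

import (
	"io"
	"os"
	"strings"
)

func ExampleNewReaderKeepElements() {
	in := strings.NewReader("<html><body><p>keep me</p><div>drop me</div></body></html>")
	// Keep only <p> elements; markup and text outside them should be skipped.
	r := NewReaderKeepElements(in, []string{"p"})
	io.Copy(os.Stdout, r)
}
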
diff --git a/unxml.go b/unxml.go
index e38af31..2bdb354 100644
--- a/unxml.go
+++ b/unxml.go
@@ -68,6 +68,7 @@ func NewReaderKeepElements(r io.Reader, tagstokeep []string) *ElementReader {
 			tagmap: tagmap,
 			tokenizer: html.NewTokenizer(r),
 		},
+		tagsinstack: make(map[string]int, 5),
 	}
 }
@@ -124,8 +125,9 @@ func (r *Reader) Read(out []byte) (int, error) {
 }
 
 func (r *ElementReader) Read(out []byte) (int, error) {
-	fmt.Fprintf(os.Stderr, "Read has been called.\n")
+	//fmt.Fprintf(os.Stderr, "Read has been called.\n")
 	var err error
+	intagtokeep := true
 	r.xr.count = 0
 	n := 0
@@ -147,10 +149,13 @@ func (r *ElementReader) Read(out []byte) (int, error) {
 		switch tt {
 		case html.ErrorToken:
-			//fmt.Fprintf(os.Stderr, "There was an error when parsing the html: %s, %s\n", tt, r.xr.tokenizer.Err())
+			fmt.Fprintf(os.Stderr, "There was an error when parsing the html: %s, %s\n", tt, r.xr.tokenizer.Err())
 			return r.xr.count, io.EOF
 		case html.TextToken:
+			if !intagtokeep {
+				continue
+			}
 			text := r.xr.tokenizer.Text()
 			lentext := len(text)
 			if lentext <= lenout {
@@ -167,20 +172,36 @@ func (r *ElementReader) Read(out []byte) (int, error) {
 			}
 		case html.StartTagToken:
+			if !intagtokeep {
+				continue
+			}
 			tn, _ := r.xr.tokenizer.TagName()
+			//fmt.Printf("TagNameStart: %s\n", tn)
 			if _, ok := r.xr.tagmap[string(tn)]; ok {
+				r.tagsinstack[string(tn)]++
+				intagtokeep = true
+				raw := r.xr.tokenizer.Raw()
+				//fmt.Printf("TokenRaw: %s\n", raw)
+				n := copy(out[r.xr.count:], raw)
+				r.xr.count += n
+				lenout -= n
 			}
-			fmt.Printf("TagName: %s\n", tn)
 		case html.EndTagToken:
 			tn, _ := r.xr.tokenizer.TagName()
-			fmt.Printf("TagEndName: %s\n", tn)
+			//fmt.Printf("TagEndName: %s\n", tn)
 			if count, ok := r.tagsinstack[string(tn)]; ok {
+				//fmt.Printf("TagEndNameInStack: %s, %d\n", tn, count)
 				if count == 1 {
					delete(r.tagsinstack, string(tn))
+					intagtokeep = false
 				} else {
 					r.tagsinstack[string(tn)]--
 				}
+				raw := r.xr.tokenizer.Raw()
+				n := copy(out[r.xr.count:], raw)
+				r.xr.count += n
+				lenout -= n
 			}
 		}
 	}
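
For reference, the tag-tracking approach in this commit can be written as a standalone sketch: count how many kept elements of each name are currently open, and copy tokens to the output only while at least one kept element is open. This is an illustration of the technique with golang.org/x/net/html, not the repository's code; keepElements and its signature are invented for the example:

package main

import (
	"fmt"
	"strings"

	"golang.org/x/net/html"
)

// keepElements returns only the markup and text of the elements named in keep.
func keepElements(input string, keep map[string]bool) string {
	z := html.NewTokenizer(strings.NewReader(input))
	open := make(map[string]int) // open kept elements, counted by tag name
	depth := 0                   // total number of kept elements currently open
	var out strings.Builder

	for {
		switch z.Next() {
		case html.ErrorToken:
			// io.EOF when the input is exhausted, otherwise a parse error.
			return out.String()
		case html.StartTagToken:
			tn, _ := z.TagName()
			if keep[string(tn)] {
				open[string(tn)]++
				depth++
			}
			if depth > 0 {
				out.Write(z.Raw())
			}
		case html.EndTagToken:
			tn, _ := z.TagName()
			if depth > 0 {
				out.Write(z.Raw())
			}
			if open[string(tn)] > 0 {
				open[string(tn)]--
				depth--
			}
		case html.TextToken:
			if depth > 0 {
				out.Write(z.Text())
			}
		}
	}
}

func main() {
	doc := "<html><body><p>keep me</p><div>drop me</div></body></html>"
	fmt.Println(keepElements(doc, map[string]bool{"p": true})) // prints: <p>keep me</p>
}

The per-name counter plays the role that tagsinstack appears to play in the commit: the end tag of a kept element only stops the copying once every open element of that name has been closed.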