summaryrefslogtreecommitdiff
path: root/unxml.go
diff options
context:
space:
mode:
author: Silvan Jegen <s.jegen@gmail.com> 2015-01-03 18:11:33 +0100
committer: Silvan Jegen <s.jegen@gmail.com> 2015-01-03 18:11:33 +0100
commit: af7a90d550c98b3c6610c121ae49d9abf943fdaa (patch)
tree: acde6ff185dfb1cce7688f49f8d1fe65c330154f /unxml.go
parent: d8683f6bb8032ceb3b73c79603bc185202b129a4 (diff)
Putting more state into unxml.Reader
Diffstat (limited to 'unxml.go')
-rw-r--r-- unxml.go | 71
1 file changed, 47 insertions(+), 24 deletions(-)
diff --git a/unxml.go b/unxml.go
index d8d588e..f5b7c62 100644
--- a/unxml.go
+++ b/unxml.go
@@ -12,8 +12,11 @@ import (
)
type Reader struct {
- reader io.Reader
- tagmap map[string]bool
+ reader io.Reader
+ tagmap map[string]bool
+ lastread []byte
+ count int
+ tokenizer *html.Tokenizer
}
func NewReaderKeepTags(r io.Reader, tagstokeep []string) *Reader {
@@ -29,42 +32,62 @@ func NewReaderKeepTags(r io.Reader, tagstokeep []string) *Reader {
}
return &Reader{reader: r,
- tagmap: tagmap,
+ tagmap: tagmap,
+ tokenizer: html.NewTokenizer(r),
}
}
-func (r *Reader) Read(out []byte) (n int, err error) {
- hr := html.NewTokenizer(r.reader)
- buf := bytes.NewBuffer(make([]byte, len(out)))
- depth := 0
+func (r *Reader) Read(out []byte) (int, error) {
+ fmt.Fprintf(os.Stderr, "Read has been called.\n")
+ var err error
+
+ r.count = 0
+ n := 0
+ lenout := len(out)
+ if lenout == 0 {
+ return r.count, nil
+ }
+
+ lenlr := len(r.lastread)
+ if lenlr > 0 {
+ n = copy(out[0:], r.lastread)
+ r.count += n
+ r.lastread = make([]byte, len(out))
+ lenout -= n
+ }
+
+ buf := bytes.NewBuffer(out)
for {
- tt := hr.Next()
+ tt := r.tokenizer.Next()
switch tt {
case html.ErrorToken:
- fmt.Fprintf(os.Stderr, "There was an error when reading from the underlying os.Reader: %s", tt)
- return 0, hr.Err()
+ fmt.Fprintf(os.Stderr, "There was an error when parsing the html: %s, %s\n", tt, r.tokenizer.Err())
+ return r.count, io.EOF
case html.TextToken:
- if depth > 0 {
- // emitBytes should copy the []byte it receives,
- // if it doesn't process it immediately.
- n, err = buf.Write(hr.Text())
+ text := r.tokenizer.Text()
+ lentext := len(text)
+ if lentext <= lenout {
+ //n, err = buf.Write(r.tokenizer.Text())
+ n = copy(out[r.count:], text)
+ r.count += n
+ lenout -= n
+ fmt.Printf("HAD SPACE: n, err: %d, %s; out: %s; buf: %s\n", n, err, out, buf)
+ } else {
+ n = copy(out[r.count:], text[:lenout-1])
+ r.count += n
+ r.lastread = text[lenout-1:]
+ fmt.Printf("HAD NO SPACE: n, err: %d, %s; out: %s; buf: %s\n", n, err, out, buf)
+ return r.count, err
}
case html.StartTagToken, html.EndTagToken:
- tn, _ := hr.TagName()
-
- if len(tn) == 1 && tn[0] == 'a' {
- if tt == html.StartTagToken {
- depth++
- } else {
- depth--
- }
- }
+ _, _ = r.tokenizer.TagName()
+ //fmt.Printf("TagName: %s\n", tn)
}
}
- return
+ return r.count, err
}