From af7a90d550c98b3c6610c121ae49d9abf943fdaa Mon Sep 17 00:00:00 2001
From: Silvan Jegen
Date: Sat, 3 Jan 2015 18:11:33 +0100
Subject: Putting more state into unxml.Reader

---
 unxml.go | 71 ++++++++++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 47 insertions(+), 24 deletions(-)

diff --git a/unxml.go b/unxml.go
index d8d588e..f5b7c62 100644
--- a/unxml.go
+++ b/unxml.go
@@ -12,8 +12,11 @@ import (
 )
 
 type Reader struct {
-	reader io.Reader
-	tagmap map[string]bool
+	reader    io.Reader
+	tagmap    map[string]bool
+	lastread  []byte
+	count     int
+	tokenizer *html.Tokenizer
 }
 
 func NewReaderKeepTags(r io.Reader, tagstokeep []string) *Reader {
@@ -29,42 +32,62 @@ func NewReaderKeepTags(r io.Reader, tagstokeep []string) *Reader {
 	}
 
 	return &Reader{reader: r,
-		tagmap: tagmap,
+		tagmap:    tagmap,
+		tokenizer: html.NewTokenizer(r),
 	}
 }
 
-func (r *Reader) Read(out []byte) (n int, err error) {
-	hr := html.NewTokenizer(r.reader)
-	buf := bytes.NewBuffer(make([]byte, len(out)))
-	depth := 0
+func (r *Reader) Read(out []byte) (int, error) {
+	fmt.Fprintf(os.Stderr, "Read has been called.\n")
+	var err error
+
+	r.count = 0
+	n := 0
+	lenout := len(out)
+	if lenout == 0 {
+		return r.count, nil
+	}
+
+	lenlr := len(r.lastread)
+	if lenlr > 0 {
+		n = copy(out[0:], r.lastread)
+		r.count += n
+		r.lastread = make([]byte, len(out))
+		lenout -= n
+	}
+
+	buf := bytes.NewBuffer(out)
 
 	for {
-		tt := hr.Next()
+		tt := r.tokenizer.Next()
 		switch tt {
 		case html.ErrorToken:
-			fmt.Fprintf(os.Stderr, "There was an error when reading from the underlying os.Reader: %s", tt)
-			return 0, hr.Err()
+			fmt.Fprintf(os.Stderr, "There was an error when parsing the html: %s, %s\n", tt, r.tokenizer.Err())
+			return r.count, io.EOF
 		case html.TextToken:
-			if depth > 0 {
-				// emitBytes should copy the []byte it receives,
-				// if it doesn't process it immediately.
-				n, err = buf.Write(hr.Text())
+			text := r.tokenizer.Text()
+			lentext := len(text)
+			if lentext <= lenout {
+				//n, err = buf.Write(r.tokenizer.Text())
+				n = copy(out[r.count:], text)
+				r.count += n
+				lenout -= n
+				fmt.Printf("HAD SPACE: n, err: %d, %s; out: %s; buf: %s\n", n, err, out, buf)
+			} else {
+				n = copy(out[r.count:], text[:lenout-1])
+				r.count += n
+				r.lastread = text[lenout-1:]
+				fmt.Printf("HAD NO SPACE: n, err: %d, %s; out: %s; buf: %s\n", n, err, out, buf)
+				return r.count, err
 			}
 		case html.StartTagToken, html.EndTagToken:
-			tn, _ := hr.TagName()
-
-			if len(tn) == 1 && tn[0] == 'a' {
-				if tt == html.StartTagToken {
-					depth++
-				} else {
-					depth--
-				}
-			}
+			_, _ = r.tokenizer.TagName()
+			//fmt.Printf("TagName: %s\n", tn)
 		}
 	}
-	return
+	return r.count, err
 }
-- 
cgit v1.2.1-18-gbd029
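
The heart of this change is the carry-over logic: Read may only fill len(out) bytes per call, so when a decoded text token is larger than the space left, the surplus is parked in the Reader (lastread) and handed out at the start of the next call. Below is a self-contained sketch of that pattern; the names (chunkReader, pending) are illustrative and not part of unxml, and the chunks simply stand in for the tokenizer's text tokens.

```go
package main

import (
	"fmt"
	"io"
)

// chunkReader is an illustrative stand-in for unxml.Reader: it emits a series
// of byte chunks (think: text tokens) while honoring the size of the caller's
// buffer, parking any surplus in pending until the next Read call.
type chunkReader struct {
	chunks  [][]byte // chunks still waiting to be emitted
	pending []byte   // leftover from a chunk that did not fit into out
}

func (c *chunkReader) Read(out []byte) (int, error) {
	if len(out) == 0 {
		return 0, nil
	}
	n := 0
	// Drain leftover bytes from the previous call first.
	if len(c.pending) > 0 {
		n = copy(out, c.pending)
		c.pending = c.pending[n:]
		if n == len(out) {
			return n, nil
		}
	}
	// Then emit whole chunks, stashing whatever does not fit.
	for len(c.chunks) > 0 && n < len(out) {
		m := copy(out[n:], c.chunks[0])
		if m < len(c.chunks[0]) {
			c.pending = c.chunks[0][m:]
		}
		c.chunks = c.chunks[1:]
		n += m
	}
	if n == 0 {
		return 0, io.EOF
	}
	return n, nil
}

func main() {
	r := &chunkReader{chunks: [][]byte{
		[]byte("hello "), []byte("tokenized "), []byte("world"),
	}}
	buf := make([]byte, 8) // deliberately small to force carry-over
	for {
		n, err := r.Read(buf)
		fmt.Printf("%d bytes: %q\n", n, buf[:n])
		if err == io.EOF {
			break
		}
	}
}
```

Keeping the leftover (and the tokenizer) as fields is what makes Read stateful across calls, which the io.Reader contract effectively demands once token boundaries stop lining up with buffer boundaries.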
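
For context, here is how a Reader carrying this extra state is meant to be driven: the caller passes a fixed-size buffer to Read over and over, and anything that did not fit on the previous call is served from lastread first. A minimal usage sketch, assuming the package is importable as `unxml` (the import path below is a placeholder) and relying only on the NewReaderKeepTags and Read signatures visible in the diff:

```go
package main

import (
	"io"
	"log"
	"os"
	"strings"

	"unxml" // placeholder import path; point this at wherever the unxml package actually lives
)

func main() {
	page := strings.NewReader(`<html><body><a href="/x">some link text</a><p>other text</p></body></html>`)

	// Keep only <a> tags; the Reader extracts text via the html tokenizer
	// it now owns, instead of creating a new tokenizer on every Read call.
	r := unxml.NewReaderKeepTags(page, []string{"a"})

	// io.Copy calls Read repeatedly with its own buffer, which is exactly
	// the situation the new lastread/count fields are there to handle.
	if _, err := io.Copy(os.Stdout, r); err != nil {
		log.Fatal(err)
	}
}
```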