From be5eb42e23d5625f42147275a9f3c279979567bb Mon Sep 17 00:00:00 2001 From: Silvan Jegen Date: Tue, 2 Jun 2015 21:00:13 +0200 Subject: Use composition --- unxml.go | 72 ++++++++++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 50 insertions(+), 22 deletions(-) diff --git a/unxml.go b/unxml.go index c34f29f..cf3f238 100644 --- a/unxml.go +++ b/unxml.go @@ -18,6 +18,21 @@ type Reader struct { tokenizer *html.Tokenizer } +type ElementReader struct { + r Reader + tagsinstack map[string]int +} + +//type stack []string +// +//func (s stack) Empty() bool { return len(s) == 0 } +//func (s stack) Peek() string { return s[len(s)-1] } +//func (s *stack) Pop() string { +// d := (*s)[len(*s)-1] +// (*s) = (*s)[:len(*s)-1] +// return d +//} + func NewReaderKeepTags(r io.Reader, tagstokeep []string) *Reader { var tagmap map[string]bool @@ -36,52 +51,65 @@ func NewReaderKeepTags(r io.Reader, tagstokeep []string) *Reader { } } -func (r *Reader) Read(out []byte) (int, error) { +func (r *ElementReader) Read(out []byte) (int, error) { fmt.Fprintf(os.Stderr, "Read has been called.\n") var err error - r.count = 0 + r.r.count = 0 n := 0 lenout := len(out) if lenout == 0 { - return r.count, nil + return r.r.count, nil } - lenlr := len(r.lastread) + lenlr := len(r.r.lastread) if lenlr > 0 { - n = copy(out[0:], r.lastread) - r.count += n - r.lastread = make([]byte, len(out)) + n = copy(out[0:], r.r.lastread) + r.r.count += n + r.r.lastread = make([]byte, len(out)) lenout -= n } for { - tt := r.tokenizer.Next() + tt := r.r.tokenizer.Next() switch tt { case html.ErrorToken: - fmt.Fprintf(os.Stderr, "There was an error when parsing the html: %s, %s\n", tt, r.tokenizer.Err()) - return r.count, io.EOF + //fmt.Fprintf(os.Stderr, "There was an error when parsing the html: %s, %s\n", tt, r.r.tokenizer.Err()) + return r.r.count, io.EOF case html.TextToken: - text := r.tokenizer.Text() + text := r.r.tokenizer.Text() lentext := len(text) if lentext <= lenout { - n = copy(out[r.count:], text) - r.count += n + n = copy(out[r.r.count:], text) + r.r.count += n lenout -= n - //fmt.Printf("HAD SPACE: %q, count: %d, err: %s\n", text, r.count, err) + //fmt.Printf("HAD SPACE: %q, count: %d, err: %s\n", text, r.r.count, err) } else { - n = copy(out[r.count:], text[:lenout-1]) - r.count += n - r.lastread = text[lenout-1:] - //fmt.Printf("HAD NO SPACE: count: %d, err: %s\n", r.count, err) - return r.count, err + n = copy(out[r.r.count:], text[:lenout-1]) + r.r.count += n + r.r.lastread = text[lenout-1:] + //fmt.Printf("HAD NO SPACE: count: %d, err: %s\n", r.r.count, err) + return r.r.count, err } - case html.StartTagToken, html.EndTagToken: - _, _ = r.tokenizer.TagName() - //fmt.Printf("TagName: %s\n", tn) + case html.StartTagToken: + tn, _ := r.r.tokenizer.TagName() + if _, ok := r.r.tagmap[string(tn)]; ok { + } + fmt.Printf("TagName: %s\n", tn) + + case html.EndTagToken: + tn, _ := r.r.tokenizer.TagName() + fmt.Printf("TagEndName: %s\n", tn) + if count, ok := r.tagsinstack[string(tn)]; ok { + if count == 1 { + delete(r.tagsinstack, string(tn)) + } else { + r.tagsinstack[string(tn)]-- + } + } } } } -- cgit v1.2.1-18-gbd029