From 184819b5888665844defa43324edcea09cab4429 Mon Sep 17 00:00:00 2001 From: Silvan Jegen Date: Sun, 7 Jun 2015 14:54:11 +0200 Subject: Make the KeepElements functionality mostly work --- unxml.go | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) (limited to 'unxml.go') diff --git a/unxml.go b/unxml.go index e38af31..2bdb354 100644 --- a/unxml.go +++ b/unxml.go @@ -68,6 +68,7 @@ func NewReaderKeepElements(r io.Reader, tagstokeep []string) *ElementReader { tagmap: tagmap, tokenizer: html.NewTokenizer(r), }, + tagsinstack: make(map[string]int, 5), } } @@ -124,8 +125,9 @@ func (r *Reader) Read(out []byte) (int, error) { } func (r *ElementReader) Read(out []byte) (int, error) { - fmt.Fprintf(os.Stderr, "Read has been called.\n") + //fmt.Fprintf(os.Stderr, "Read has been called.\n") var err error + intagtokeep := true r.xr.count = 0 n := 0 @@ -147,10 +149,13 @@ func (r *ElementReader) Read(out []byte) (int, error) { switch tt { case html.ErrorToken: - //fmt.Fprintf(os.Stderr, "There was an error when parsing the html: %s, %s\n", tt, r.xr.tokenizer.Err()) + fmt.Fprintf(os.Stderr, "There was an error when parsing the html: %s, %s\n", tt, r.xr.tokenizer.Err()) return r.xr.count, io.EOF case html.TextToken: + if !intagtokeep { + continue + } text := r.xr.tokenizer.Text() lentext := len(text) if lentext <= lenout { @@ -167,20 +172,36 @@ func (r *ElementReader) Read(out []byte) (int, error) { } case html.StartTagToken: + if !intagtokeep { + continue + } tn, _ := r.xr.tokenizer.TagName() + //fmt.Printf("TagNameStart: %s\n", tn) if _, ok := r.xr.tagmap[string(tn)]; ok { + r.tagsinstack[string(tn)]++ + intagtokeep = true + raw := r.xr.tokenizer.Raw() + //fmt.Printf("TokenRaw: %s\n", raw) + n := copy(out[r.xr.count:], raw) + r.xr.count += n + lenout -= n } - fmt.Printf("TagName: %s\n", tn) case html.EndTagToken: tn, _ := r.xr.tokenizer.TagName() - fmt.Printf("TagEndName: %s\n", tn) + //fmt.Printf("TagEndName: %s\n", tn) if count, ok := r.tagsinstack[string(tn)]; ok { + //fmt.Printf("TagEndNameInStack: %s, %d\n", tn, count) if count == 1 { delete(r.tagsinstack, string(tn)) + intagtokeep = false } else { r.tagsinstack[string(tn)]-- } + raw := r.xr.tokenizer.Raw() + n := copy(out[r.xr.count:], raw) + r.xr.count += n + lenout -= n } } } -- cgit v1.2.1-18-gbd029