diff options
-rw-r--r-- | unxml.go | 29 |
1 files changed, 25 insertions, 4 deletions
@@ -68,6 +68,7 @@ func NewReaderKeepElements(r io.Reader, tagstokeep []string) *ElementReader { tagmap: tagmap, tokenizer: html.NewTokenizer(r), }, + tagsinstack: make(map[string]int, 5), } } @@ -124,8 +125,9 @@ func (r *Reader) Read(out []byte) (int, error) { } func (r *ElementReader) Read(out []byte) (int, error) { - fmt.Fprintf(os.Stderr, "Read has been called.\n") + //fmt.Fprintf(os.Stderr, "Read has been called.\n") var err error + intagtokeep := true r.xr.count = 0 n := 0 @@ -147,10 +149,13 @@ func (r *ElementReader) Read(out []byte) (int, error) { switch tt { case html.ErrorToken: - //fmt.Fprintf(os.Stderr, "There was an error when parsing the html: %s, %s\n", tt, r.xr.tokenizer.Err()) + fmt.Fprintf(os.Stderr, "There was an error when parsing the html: %s, %s\n", tt, r.xr.tokenizer.Err()) return r.xr.count, io.EOF case html.TextToken: + if !intagtokeep { + continue + } text := r.xr.tokenizer.Text() lentext := len(text) if lentext <= lenout { @@ -167,20 +172,36 @@ func (r *ElementReader) Read(out []byte) (int, error) { } case html.StartTagToken: + if !intagtokeep { + continue + } tn, _ := r.xr.tokenizer.TagName() + //fmt.Printf("TagNameStart: %s\n", tn) if _, ok := r.xr.tagmap[string(tn)]; ok { + r.tagsinstack[string(tn)]++ + intagtokeep = true + raw := r.xr.tokenizer.Raw() + //fmt.Printf("TokenRaw: %s\n", raw) + n := copy(out[r.xr.count:], raw) + r.xr.count += n + lenout -= n } - fmt.Printf("TagName: %s\n", tn) case html.EndTagToken: tn, _ := r.xr.tokenizer.TagName() - fmt.Printf("TagEndName: %s\n", tn) + //fmt.Printf("TagEndName: %s\n", tn) if count, ok := r.tagsinstack[string(tn)]; ok { + //fmt.Printf("TagEndNameInStack: %s, %d\n", tn, count) if count == 1 { delete(r.tagsinstack, string(tn)) + intagtokeep = false } else { r.tagsinstack[string(tn)]-- } + raw := r.xr.tokenizer.Raw() + n := copy(out[r.xr.count:], raw) + r.xr.count += n + lenout -= n } } } |