/* See LICENSE file for copyright and license details. */

// Package unxml extracts the text content from a stream of HTML,
// discarding the markup itself.
package unxml

import (
	"io"

	"golang.org/x/net/html"
)

// Reader wraps an io.Reader of HTML and, via Read, yields only the
// text nodes of the input. tagmap holds the tag names passed to
// NewReaderKeepTags; lastread buffers text that did not fit into the
// caller's slice on the previous Read call.
type Reader struct {
	reader    io.Reader
	tagmap    map[string]bool
	lastread  []byte
	count     int
	tokenizer *html.Tokenizer
}

// ElementReader is a Reader that additionally tracks, per tag name,
// how many matching elements are currently open.
type ElementReader struct {
	xr          Reader
	tagsinstack map[string]int
}

// NewReaderKeepTags returns a Reader over r. tagstokeep lists the tag
// names whose content is of interest; an empty slice leaves the tag
// filter disabled (tagmap stays nil).
func NewReaderKeepTags(r io.Reader, tagstokeep []string) *Reader {
	var tagmap map[string]bool
	if len(tagstokeep) > 0 {
		tagmap = make(map[string]bool, len(tagstokeep))
		for _, tag := range tagstokeep {
			tagmap[tag] = true
		}
	}
	return &Reader{
		reader:    r,
		tagmap:    tagmap,
		tokenizer: html.NewTokenizer(r),
	}
}

// Read implements io.Reader: it fills out with the text content of the
// underlying HTML and, once the stream is exhausted, reports the
// tokenizer's error (io.EOF at a clean end of input).
func (r *Reader) Read(out []byte) (int, error) {
	r.count = 0
	lenout := len(out)
	if lenout == 0 {
		return 0, nil
	}
	// Flush text left over from the previous call before pulling new
	// tokens. Only the bytes that fit are consumed; the remainder
	// stays in lastread for the next call.
	if len(r.lastread) > 0 {
		n := copy(out, r.lastread)
		r.count += n
		r.lastread = r.lastread[n:]
		lenout -= n
		if lenout == 0 {
			return r.count, nil
		}
	}
	for {
		switch r.tokenizer.Next() {
		case html.ErrorToken:
			// Err returns io.EOF at the end of the input and the
			// underlying error otherwise; don't mask parse errors.
			return r.count, r.tokenizer.Err()
		case html.TextToken:
			text := r.tokenizer.Text()
			n := copy(out[r.count:], text)
			r.count += n
			lenout -= n
			if n < len(text) {
				// Text reuses its buffer on the next call to Next,
				// so keep a private copy of the overflow.
				r.lastread = append([]byte(nil), text[n:]...)
				return r.count, nil
			}
		case html.StartTagToken, html.EndTagToken:
			// Markup is dropped; only text nodes are emitted.
			// TODO: honor tagmap so that only text inside the tags
			// passed to NewReaderKeepTags is kept.
		}
	}
}

// Read implements io.Reader like Reader.Read, but also maintains
// tagsinstack: for each tag name in the tag filter, the number of
// matching elements currently open.
func (r *ElementReader) Read(out []byte) (int, error) {
	if r.tagsinstack == nil {
		r.tagsinstack = make(map[string]int)
	}
	r.xr.count = 0
	lenout := len(out)
	if lenout == 0 {
		return 0, nil
	}
	if len(r.xr.lastread) > 0 {
		n := copy(out, r.xr.lastread)
		r.xr.count += n
		r.xr.lastread = r.xr.lastread[n:]
		lenout -= n
		if lenout == 0 {
			return r.xr.count, nil
		}
	}
	for {
		switch r.xr.tokenizer.Next() {
		case html.ErrorToken:
			return r.xr.count, r.xr.tokenizer.Err()
		case html.TextToken:
			text := r.xr.tokenizer.Text()
			n := copy(out[r.xr.count:], text)
			r.xr.count += n
			lenout -= n
			if n < len(text) {
				r.xr.lastread = append([]byte(nil), text[n:]...)
				return r.xr.count, nil
			}
		case html.StartTagToken:
			tn, _ := r.xr.tokenizer.TagName()
			if _, ok := r.xr.tagmap[string(tn)]; ok {
				// One more instance of this kept tag is now open.
				r.tagsinstack[string(tn)]++
			}
		case html.EndTagToken:
			tn, _ := r.xr.tokenizer.TagName()
			if count, ok := r.tagsinstack[string(tn)]; ok {
				if count == 1 {
					delete(r.tagsinstack, string(tn))
				} else {
					r.tagsinstack[string(tn)]--
				}
			}
		}
	}
}
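
// Usage sketch. The input string, tag list, and buffer size below are
// illustrative assumptions, not part of this package; only
// NewReaderKeepTags and Read come from the code above.
//
//	r := unxml.NewReaderKeepTags(strings.NewReader("<p>hello <b>world</b></p>"), []string{"p"})
//	buf := make([]byte, 32)
//	for {
//		n, err := r.Read(buf)
//		os.Stdout.Write(buf[:n])
//		if err != nil {
//			break
//		}
//	}
//
// While the tagmap TODO in Reader.Read is unimplemented, this prints
// every text node ("hello world"), not only the text inside <p>.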