/* See LICENSE file for copyright and license details. */ package unxml import ( "fmt" "io" "os" "golang.org/x/net/html" ) type Reader struct { reader io.Reader tagmap map[string]bool lastread []byte count int tokenizer *html.Tokenizer } func NewReaderKeepTags(r io.Reader, tagstokeep []string) *Reader { var tagmap map[string]bool if len(tagstokeep) > 0 { tagmap = make(map[string]bool, 10) for _, tag := range tagstokeep { tagmap[tag] = true } } else { tagmap = nil } return &Reader{reader: r, tagmap: tagmap, tokenizer: html.NewTokenizer(r), } } func (r *Reader) Read(out []byte) (int, error) { fmt.Fprintf(os.Stderr, "Read has been called.\n") var err error r.count = 0 n := 0 lenout := len(out) if lenout == 0 { return r.count, nil } lenlr := len(r.lastread) if lenlr > 0 { n = copy(out[0:], r.lastread) r.count += n r.lastread = make([]byte, len(out)) lenout -= n } for { tt := r.tokenizer.Next() switch tt { case html.ErrorToken: fmt.Fprintf(os.Stderr, "There was an error when parsing the html: %s, %s\n", tt, r.tokenizer.Err()) return r.count, io.EOF case html.TextToken: text := r.tokenizer.Text() lentext := len(text) if lentext <= lenout { n = copy(out[r.count:], text) r.count += n lenout -= n //fmt.Printf("HAD SPACE: %q, count: %d, err: %s\n", text, r.count, err) } else { n = copy(out[r.count:], text[:lenout-1]) r.count += n r.lastread = text[lenout-1:] //fmt.Printf("HAD NO SPACE: count: %d, err: %s\n", r.count, err) return r.count, err } case html.StartTagToken, html.EndTagToken: _, _ = r.tokenizer.TagName() //fmt.Printf("TagName: %s\n", tn) } } }