From 0ad0f3a85145f2b64cf5cbffa57cdb779180b724 Mon Sep 17 00:00:00 2001 From: Silvan Jegen Date: Sun, 25 Dec 2016 16:41:56 +0100 Subject: Implement our own scanner This allows us to record byte offset and line number in the token. --- conf/parser.go | 8 +++--- conf/scanner.go | 82 ++++++++++++++++++++++++++++++++++++++++++--------------- 2 files changed, 65 insertions(+), 25 deletions(-) (limited to 'conf') diff --git a/conf/parser.go b/conf/parser.go index 21bc604..85d9b11 100644 --- a/conf/parser.go +++ b/conf/parser.go @@ -17,12 +17,12 @@ type parser struct { func NewConfig(r io.Reader) *Config { s := newScanner(r) - tt, tok, err := s.Next() + tok, err := s.Scan() for err == nil { - fmt.Fprintf(os.Stderr, "tokentype: %d, token: %q, err: %v\n", tt, tok, err) - tt, tok, err = s.Next() + fmt.Fprintf(os.Stderr, "tokentype: %d, token: %q offset: %d, line: %d\n", tok.Type, tok.Lit, tok.Offset, tok.LineNr) + tok, err = s.Scan() } - fmt.Fprintf(os.Stderr, "Error: tokentype: %d, token: %q, err: %v\n", tt, tok, err) + fmt.Fprintf(os.Stderr, "Error: tokentype: %d, token: %q, err: %v\n", tok.Type, tok.Lit, err) return &Config{} } diff --git a/conf/scanner.go b/conf/scanner.go index 172103f..3d83534 100644 --- a/conf/scanner.go +++ b/conf/scanner.go @@ -1,8 +1,12 @@ package conf import ( - "bufio" + "fmt" "io" + "io/ioutil" + "os" + "unicode" + "unicode/utf8" ) type tokentype int @@ -16,45 +20,81 @@ const ( ) type token struct { - Type tokentype + Type tokentype + Offset int + LineNr int + Lit string } type scanner struct { - r io.Reader - bs bufio.Scanner - prev token - cur token + r io.Reader + data []byte + offset int + curline int } func newScanner(r io.Reader) *scanner { - ns := bufio.NewScanner(r) - ns.Split(bufio.ScanWords) + // TODO: don't be lazy + data, err := ioutil.ReadAll(r) + if err != nil { + fmt.Fprintf(os.Stderr, "could not read data from Reader: %v\n", err) + os.Exit(1) + } sc := scanner{ - r: r, - bs: *ns, + data: data, + curline: 1, } return &sc } -func getTokenType(s string) tokentype { +func getTokenType(s []byte) tokentype { return Nothing } -func (s *scanner) Next() (tokentype, string, error) { - more := s.bs.Scan() - if !more { - err := s.bs.Err() - if err != nil { - return Nothing, "", err +func (s *scanner) Scan() (token, error) { + processed := 0 + tokenstarted := false + oldline := s.curline + for { + r, rlen := utf8.DecodeRune(s.data[s.offset+processed:]) + if r == utf8.RuneError { + if rlen == 1 { + return token{}, fmt.Errorf("found invalid UTF8 at offset %d, before: %s", s.offset, s.data[s.offset]) + } else if rlen == 0 { + return token{}, io.EOF + } + } + + processed += rlen + + if unicode.IsSpace(r) { + if r == '\n' { + s.curline++ + } + if tokenstarted { + break + } + s.offset += rlen + processed = 0 + continue } - return Nothing, "", io.EOF + tokenstarted = true } - tokstr := s.bs.Text() - token := getTokenType(tokstr) - return token, tokstr, nil + tokbytes := s.data[s.offset : s.offset+processed-1] + tokent := getTokenType(tokbytes) + + s.offset += processed + + ret := token{ + Type: tokent, + Offset: s.offset, + LineNr: oldline, + Lit: string(tokbytes), + } + return ret, nil } func (s *scanner) peek() (tokentype, string, error) { -- cgit v1.2.1-18-gbd029