| package json |
| |
| import ( |
| "fmt" |
| |
| "github.com/apparentlymart/go-textseg/v13/textseg" |
| "github.com/hashicorp/hcl/v2" |
| ) |
| |
// tokenType classifies a token produced by the scanner. It is a rune so that
// each kind can be written as a single character constant.
//
//go:generate stringer -type tokenType scanner.go
type tokenType rune
| |
| const ( |
| tokenBraceO tokenType = '{' |
| tokenBraceC tokenType = '}' |
| tokenBrackO tokenType = '[' |
| tokenBrackC tokenType = ']' |
| tokenComma tokenType = ',' |
| tokenColon tokenType = ':' |
| tokenKeyword tokenType = 'K' |
| tokenString tokenType = 'S' |
| tokenNumber tokenType = 'N' |
| tokenEOF tokenType = '␄' |
| tokenInvalid tokenType = 0 |
| tokenEquals tokenType = '=' // used only for reminding the user of JSON syntax |
| ) |
| |
| type token struct { |
| Type tokenType |
| Bytes []byte |
| Range hcl.Range |
| } |
| |
| // scan returns the primary tokens for the given JSON buffer in sequence. |
| // |
| // The responsibility of this pass is to just mark the slices of the buffer |
| // as being of various types. It is lax in how it interprets the multi-byte |
| // token types keyword, string and number, preferring to capture erroneous |
| // extra bytes that we presume the user intended to be part of the token |
| // so that we can generate more helpful diagnostics in the parser. |
| func scan(buf []byte, start pos) []token { |
| var tokens []token |
| p := start |
| for { |
| if len(buf) == 0 { |
| tokens = append(tokens, token{ |
| Type: tokenEOF, |
| Bytes: nil, |
| Range: posRange(p, p), |
| }) |
| return tokens |
| } |
| |
| buf, p = skipWhitespace(buf, p) |
| |
| if len(buf) == 0 { |
| tokens = append(tokens, token{ |
| Type: tokenEOF, |
| Bytes: nil, |
| Range: posRange(p, p), |
| }) |
| return tokens |
| } |
| |
| start = p |
| |
| first := buf[0] |
| switch { |
| case first == '{' || first == '}' || first == '[' || first == ']' || first == ',' || first == ':' || first == '=': |
| p.Pos.Column++ |
| p.Pos.Byte++ |
| tokens = append(tokens, token{ |
| Type: tokenType(first), |
| Bytes: buf[0:1], |
| Range: posRange(start, p), |
| }) |
| buf = buf[1:] |
| case first == '"': |
| var tokBuf []byte |
| tokBuf, buf, p = scanString(buf, p) |
| tokens = append(tokens, token{ |
| Type: tokenString, |
| Bytes: tokBuf, |
| Range: posRange(start, p), |
| }) |
| case byteCanStartNumber(first): |
| var tokBuf []byte |
| tokBuf, buf, p = scanNumber(buf, p) |
| tokens = append(tokens, token{ |
| Type: tokenNumber, |
| Bytes: tokBuf, |
| Range: posRange(start, p), |
| }) |
| case byteCanStartKeyword(first): |
| var tokBuf []byte |
| tokBuf, buf, p = scanKeyword(buf, p) |
| tokens = append(tokens, token{ |
| Type: tokenKeyword, |
| Bytes: tokBuf, |
| Range: posRange(start, p), |
| }) |
| default: |
| tokens = append(tokens, token{ |
| Type: tokenInvalid, |
| Bytes: buf[:1], |
| Range: start.Range(1, 1), |
| }) |
| // If we've encountered an invalid then we might as well stop |
| // scanning since the parser won't proceed beyond this point. |
| // We insert a synthetic EOF marker here to match the expectations |
| // of consumers of this data structure. |
| p.Pos.Column++ |
| p.Pos.Byte++ |
| tokens = append(tokens, token{ |
| Type: tokenEOF, |
| Bytes: nil, |
| Range: posRange(p, p), |
| }) |
| return tokens |
| } |
| } |
| } |
| |
// byteCanStartNumber reports whether b may begin a number token: a decimal
// digit, a sign ('-' or '+'), or a decimal point.
//
// This is slightly more tolerant than JSON requires, on the expectation that
// the parser interprets the number bytes strictly. 'e' and 'E' are excluded
// on purpose so that the scanner treats them as the start of an invalid
// keyword instead, which produces more intelligible error messages.
func byteCanStartNumber(b byte) bool {
	if b >= '0' && b <= '9' {
		return true
	}
	return b == '-' || b == '+' || b == '.'
}
| |
| func scanNumber(buf []byte, start pos) ([]byte, []byte, pos) { |
| // The scanner doesn't check that the sequence of digit-ish bytes is |
| // in a valid order. The parser must do this when decoding a number |
| // token. |
| var i int |
| p := start |
| Byte: |
| for i = 0; i < len(buf); i++ { |
| switch buf[i] { |
| case '-', '+', '.', 'e', 'E', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': |
| p.Pos.Byte++ |
| p.Pos.Column++ |
| default: |
| break Byte |
| } |
| } |
| return buf[:i], buf[i:], p |
| } |
| |
| func byteCanStartKeyword(b byte) bool { |
| switch { |
| // We allow any sequence of alphabetical characters here, even though |
| // JSON is more constrained, so that we can collect what we presume |
| // the user intended to be a single keyword and then check its validity |
| // in the parser, where we can generate better diagnostics. |
| // So e.g. we want to be able to say: |
| // unrecognized keyword "True". Did you mean "true"? |
| case isAlphabetical(b): |
| return true |
| default: |
| return false |
| } |
| } |
| |
| func scanKeyword(buf []byte, start pos) ([]byte, []byte, pos) { |
| var i int |
| p := start |
| Byte: |
| for i = 0; i < len(buf); i++ { |
| b := buf[i] |
| switch { |
| case isAlphabetical(b) || b == '_': |
| p.Pos.Byte++ |
| p.Pos.Column++ |
| default: |
| break Byte |
| } |
| } |
| return buf[:i], buf[i:], p |
| } |
| |
| func scanString(buf []byte, start pos) ([]byte, []byte, pos) { |
| // The scanner doesn't validate correct use of escapes, etc. It pays |
| // attention to escapes only for the purpose of identifying the closing |
| // quote character. It's the parser's responsibility to do proper |
| // validation. |
| // |
| // The scanner also doesn't specifically detect unterminated string |
| // literals, though they can be identified in the parser by checking if |
| // the final byte in a string token is the double-quote character. |
| |
| // Skip the opening quote symbol |
| i := 1 |
| p := start |
| p.Pos.Byte++ |
| p.Pos.Column++ |
| escaping := false |
| Byte: |
| for i < len(buf) { |
| b := buf[i] |
| |
| switch { |
| case b == '\\': |
| escaping = !escaping |
| p.Pos.Byte++ |
| p.Pos.Column++ |
| i++ |
| case b == '"': |
| p.Pos.Byte++ |
| p.Pos.Column++ |
| i++ |
| if !escaping { |
| break Byte |
| } |
| escaping = false |
| case b < 32: |
| break Byte |
| default: |
| // Advance by one grapheme cluster, so that we consider each |
| // grapheme to be a "column". |
| // Ignoring error because this scanner cannot produce errors. |
| advance, _, _ := textseg.ScanGraphemeClusters(buf[i:], true) |
| |
| p.Pos.Byte += advance |
| p.Pos.Column++ |
| i += advance |
| |
| escaping = false |
| } |
| } |
| return buf[:i], buf[i:], p |
| } |
| |
| func skipWhitespace(buf []byte, start pos) ([]byte, pos) { |
| var i int |
| p := start |
| Byte: |
| for i = 0; i < len(buf); i++ { |
| switch buf[i] { |
| case ' ': |
| p.Pos.Byte++ |
| p.Pos.Column++ |
| case '\n': |
| p.Pos.Byte++ |
| p.Pos.Column = 1 |
| p.Pos.Line++ |
| case '\r': |
| // For the purpose of line/column counting we consider a |
| // carriage return to take up no space, assuming that it will |
| // be paired up with a newline (on Windows, for example) that |
| // will account for both of them. |
| p.Pos.Byte++ |
| case '\t': |
| // We arbitrarily count a tab as if it were two spaces, because |
| // we need to choose _some_ number here. This means any system |
| // that renders code on-screen with markers must itself treat |
| // tabs as a pair of spaces for rendering purposes, or instead |
| // use the byte offset and back into its own column position. |
| p.Pos.Byte++ |
| p.Pos.Column += 2 |
| default: |
| break Byte |
| } |
| } |
| return buf[i:], p |
| } |
| |
| type pos struct { |
| Filename string |
| Pos hcl.Pos |
| } |
| |
| func (p *pos) Range(byteLen, charLen int) hcl.Range { |
| start := p.Pos |
| end := p.Pos |
| end.Byte += byteLen |
| end.Column += charLen |
| return hcl.Range{ |
| Filename: p.Filename, |
| Start: start, |
| End: end, |
| } |
| } |
| |
| func posRange(start, end pos) hcl.Range { |
| return hcl.Range{ |
| Filename: start.Filename, |
| Start: start.Pos, |
| End: end.Pos, |
| } |
| } |
| |
| func (t token) GoString() string { |
| return fmt.Sprintf("json.token{json.%s, []byte(%q), %#v}", t.Type, t.Bytes, t.Range) |
| } |
| |
// isAlphabetical reports whether b is an ASCII letter, 'a'-'z' or 'A'-'Z'.
func isAlphabetical(b byte) bool {
	switch {
	case 'a' <= b && b <= 'z':
		return true
	case 'A' <= b && b <= 'Z':
		return true
	default:
		return false
	}
}