xplshn
·
2025-09-10
lexer.go
Go
1package lexer
2
3import (
4 "strconv"
5 "strings"
6 "unicode"
7
8 "github.com/xplshn/gbc/pkg/config"
9 "github.com/xplshn/gbc/pkg/token"
10 "github.com/xplshn/gbc/pkg/util"
11)
12
13type Lexer struct {
14 source []rune
15 fileIndex int
16 pos int
17 line int
18 column int
19 cfg *config.Config
20}
21
22func NewLexer(source []rune, fileIndex int, cfg *config.Config) *Lexer {
23 return &Lexer{
24 source: source, fileIndex: fileIndex, line: 1, column: 1, cfg: cfg,
25 }
26}
27
28func (l *Lexer) Next() token.Token {
29 for {
30 l.skipWhitespaceAndComments()
31 startPos, startCol, startLine := l.pos, l.column, l.line
32
33 if l.isAtEnd() {
34 return l.makeToken(token.EOF, "", startPos, startCol, startLine)
35 }
36
37 if l.peek() == '/' && l.peekNext() == '/' {
38 if !l.cfg.IsFeatureEnabled(config.FeatNoDirectives) {
39 if tok, isDirective := l.lineCommentOrDirective(startPos, startCol, startLine); isDirective {
40 return tok
41 }
42 }
43 if l.cfg.IsFeatureEnabled(config.FeatCComments) {
44 l.lineComment()
45 continue
46 }
47 }
48
49 ch := l.peek()
50 if unicode.IsLetter(ch) || ch == '_' {
51 l.advance()
52 return l.identifierOrKeyword(startPos, startCol, startLine)
53 }
54 if unicode.IsDigit(ch) || (ch == '.' && unicode.IsDigit(l.peekNext())) {
55 return l.numberLiteral(startPos, startCol, startLine)
56 }
57
58 l.advance()
59 switch ch {
60 case '(': return l.makeToken(token.LParen, "", startPos, startCol, startLine)
61 case ')': return l.makeToken(token.RParen, "", startPos, startCol, startLine)
62 case '{': return l.makeToken(token.LBrace, "", startPos, startCol, startLine)
63 case '}': return l.makeToken(token.RBrace, "", startPos, startCol, startLine)
64 case '[': return l.makeToken(token.LBracket, "", startPos, startCol, startLine)
65 case ']': return l.makeToken(token.RBracket, "", startPos, startCol, startLine)
66 case ';': return l.makeToken(token.Semi, "", startPos, startCol, startLine)
67 case ',': return l.makeToken(token.Comma, "", startPos, startCol, startLine)
68 case '?': return l.makeToken(token.Question, "", startPos, startCol, startLine)
69 case '~': return l.makeToken(token.Complement, "", startPos, startCol, startLine)
70 case ':': return l.matchThen('=', token.Define, token.Colon, startPos, startCol, startLine)
71 case '!': return l.matchThen('=', token.Neq, token.Not, startPos, startCol, startLine)
72 case '^': return l.matchThen('=', token.XorEq, token.Xor, startPos, startCol, startLine)
73 case '%': return l.matchThen('=', token.RemEq, token.Rem, startPos, startCol, startLine)
74 case '+':
75 return l.plus(startPos, startCol, startLine)
76 case '-':
77 return l.minus(startPos, startCol, startLine)
78 case '*':
79 return l.star(startPos, startCol, startLine)
80 case '/':
81 return l.slash(startPos, startCol, startLine)
82 case '&':
83 return l.ampersand(startPos, startCol, startLine)
84 case '|':
85 return l.pipe(startPos, startCol, startLine)
86 case '<':
87 return l.less(startPos, startCol, startLine)
88 case '>':
89 return l.greater(startPos, startCol, startLine)
90 case '=':
91 return l.equal(startPos, startCol, startLine)
92 case '.':
93 if l.match('.') && l.match('.') {
94 return l.makeToken(token.Dots, "", startPos, startCol, startLine)
95 }
96 return l.makeToken(token.Dot, "", startPos, startCol, startLine)
97 case '"':
98 return l.stringLiteral(startPos, startCol, startLine)
99 case '\'':
100 return l.charLiteral(startPos, startCol, startLine)
101 }
102
103 tok := l.makeToken(token.EOF, "", startPos, startCol, startLine)
104 util.Error(tok, "Unexpected character: '%c'", ch)
105 return tok
106 }
107}
108
109func (l *Lexer) peek() rune {
110 if l.isAtEnd() {
111 return 0
112 }
113 return l.source[l.pos]
114}
115
116func (l *Lexer) peekNext() rune {
117 if l.pos+1 >= len(l.source) {
118 return 0
119 }
120 return l.source[l.pos+1]
121}
122
123func (l *Lexer) advance() rune {
124 if l.isAtEnd() {
125 return 0
126 }
127 ch := l.source[l.pos]
128 if ch == '\n' {
129 l.line++
130 l.column = 1
131 } else {
132 l.column++
133 }
134 l.pos++
135 return ch
136}
137
138func (l *Lexer) match(expected rune) bool {
139 if l.isAtEnd() || l.source[l.pos] != expected {
140 return false
141 }
142 l.advance()
143 return true
144}
145
146func (l *Lexer) isAtEnd() bool { return l.pos >= len(l.source) }
147
148func (l *Lexer) makeToken(tokType token.Type, value string, startPos, startCol, startLine int) token.Token {
149 return token.Token{
150 Type: tokType, Value: value, FileIndex: l.fileIndex,
151 Line: startLine, Column: startCol, Len: l.pos - startPos,
152 }
153}
154
155func (l *Lexer) skipWhitespaceAndComments() {
156 for {
157 switch l.peek() {
158 case ' ', '\t', '\n', '\r':
159 l.advance()
160 case '/':
161 if l.peekNext() == '*' {
162 l.blockComment()
163 } else {
164 return
165 }
166 default:
167 return
168 }
169 }
170}
171
172func (l *Lexer) blockComment() {
173 startTok := l.makeToken(token.Comment, "", l.pos, l.column, l.line)
174 l.advance()
175 l.advance()
176 for !l.isAtEnd() {
177 if l.peek() == '*' && l.peekNext() == '/' {
178 l.advance()
179 l.advance()
180 return
181 }
182 l.advance()
183 }
184 util.Error(startTok, "Unterminated block comment")
185}
186
187func (l *Lexer) lineComment() {
188 for !l.isAtEnd() && l.peek() != '\n' {
189 l.advance()
190 }
191}
192
193func (l *Lexer) lineCommentOrDirective(startPos, startCol, startLine int) (token.Token, bool) {
194 preCommentPos, preCommentCol, preCommentLine := l.pos, l.column, l.line
195 l.advance()
196 l.advance()
197 commentStartPos := l.pos
198 for !l.isAtEnd() && l.peek() != '\n' {
199 l.advance()
200 }
201 commentContent := string(l.source[commentStartPos:l.pos])
202 trimmedContent := strings.TrimSpace(commentContent)
203
204 if strings.HasPrefix(trimmedContent, "[b]:") {
205 directiveContent := strings.TrimSpace(strings.TrimPrefix(trimmedContent, "[b]:"))
206 return l.makeToken(token.Directive, directiveContent, startPos, startCol, startLine), true
207 }
208
209 l.pos, l.column, l.line = preCommentPos, preCommentCol, preCommentLine
210 return token.Token{}, false
211}
212
213func (l *Lexer) identifierOrKeyword(startPos, startCol, startLine int) token.Token {
214 for unicode.IsLetter(l.peek()) || unicode.IsDigit(l.peek()) || l.peek() == '_' {
215 l.advance()
216 }
217 value := string(l.source[startPos:l.pos])
218 tok := l.makeToken(token.Ident, value, startPos, startCol, startLine)
219
220 if tokType, isKeyword := token.KeywordMap[value]; isKeyword {
221 isTypedKeyword := tokType >= token.Void && tokType <= token.Any
222 if !isTypedKeyword || l.cfg.IsFeatureEnabled(config.FeatTyped) {
223 tok.Type = tokType
224 tok.Value = ""
225 }
226 }
227 return tok
228}
229
230func (l *Lexer) numberLiteral(startPos, startCol, startLine int) token.Token {
231 isFloat := false
232 if l.peek() == '.' {
233 isFloat = true
234 l.advance()
235 }
236
237 if l.peek() == '0' && (l.peekNext() == 'x' || l.peekNext() == 'X') {
238 l.advance()
239 l.advance()
240 for unicode.IsDigit(l.peek()) || (l.peek() >= 'a' && l.peek() <= 'f') || (l.peek() >= 'A' && l.peek() <= 'F') {
241 l.advance()
242 }
243 } else {
244 for unicode.IsDigit(l.peek()) {
245 l.advance()
246 }
247 }
248
249 if l.peek() == '.' {
250 if unicode.IsDigit(l.peekNext()) {
251 isFloat = true
252 l.advance()
253 for unicode.IsDigit(l.peek()) {
254 l.advance()
255 }
256 }
257 }
258
259 valueStr := string(l.source[startPos:l.pos])
260 if (l.peek() == 'e' || l.peek() == 'E') && !strings.HasPrefix(valueStr, "0x") && !strings.HasPrefix(valueStr, "0X") {
261 isFloat = true
262 l.advance()
263 if l.peek() == '+' || l.peek() == '-' {
264 l.advance()
265 }
266 if !unicode.IsDigit(l.peek()) {
267 util.Error(l.makeToken(token.FloatNumber, "", startPos, startCol, startLine), "Malformed floating-point literal: exponent has no digits")
268 }
269 for unicode.IsDigit(l.peek()) {
270 l.advance()
271 }
272 }
273
274 valueStr = string(l.source[startPos:l.pos])
275
276 if isFloat {
277 if !l.cfg.IsFeatureEnabled(config.FeatFloat) {
278 tok := l.makeToken(token.FloatNumber, valueStr, startPos, startCol, startLine)
279 util.Error(tok, "Floating-point numbers are not enabled (use -Ffloat)")
280 return tok
281 }
282 if l.cfg.IsWarningEnabled(config.WarnFloat) {
283 tok := l.makeToken(token.FloatNumber, valueStr, startPos, startCol, startLine)
284 util.Warn(l.cfg, config.WarnFloat, tok, "Use of floating-point constant")
285 }
286 return l.makeToken(token.FloatNumber, valueStr, startPos, startCol, startLine)
287 }
288
289 tok := l.makeToken(token.Number, "", startPos, startCol, startLine)
290 val, err := strconv.ParseUint(valueStr, 0, 64)
291 if err != nil {
292 if e, ok := err.(*strconv.NumError); ok && e.Err == strconv.ErrRange {
293 util.Warn(l.cfg, config.WarnOverflow, tok, "Integer constant overflow: %s", valueStr)
294 tok.Value = valueStr
295 return tok
296 }
297 util.Error(tok, "Invalid number literal: %s", valueStr)
298 tok.Value = "0"
299 } else {
300 tok.Value = strconv.FormatUint(val, 10)
301 }
302 return tok
303}
304
305func (l *Lexer) stringLiteral(startPos, startCol, startLine int) token.Token {
306 var sb strings.Builder
307 for !l.isAtEnd() {
308 c := l.peek()
309 if c == '"' {
310 l.advance()
311 return l.makeToken(token.String, sb.String(), startPos, startCol, startLine)
312 }
313 if (c == '\\' && l.cfg.IsFeatureEnabled(config.FeatCEsc)) || (c == '*' && l.cfg.IsFeatureEnabled(config.FeatBEsc)) {
314 l.advance()
315 val := l.decodeEscape(c, startPos, startCol, startLine)
316 // For values 0-127, write as regular rune. For 128-255, write as raw byte
317 if val <= 127 {
318 sb.WriteRune(rune(val))
319 } else {
320 // Build the final string using a byte slice to avoid UTF-8 encoding
321 existing := sb.String()
322 newBuf := []byte(existing)
323 newBuf = append(newBuf, byte(val))
324 sb.Reset()
325 sb.WriteString(string(newBuf))
326 }
327 } else {
328 l.advance()
329 sb.WriteRune(c)
330 }
331 }
332 util.Error(l.makeToken(token.String, "", startPos, startCol, startLine), "Unterminated string literal")
333 return l.makeToken(token.EOF, "", l.pos, l.column, l.line)
334}
335
336func (l *Lexer) charLiteral(startPos, startCol, startLine int) token.Token {
337 var word int64
338 for l.peek() != '\'' && !l.isAtEnd() {
339 var val int64
340 c := l.peek()
341 if (c == '\\' && l.cfg.IsFeatureEnabled(config.FeatCEsc)) || (c == '*' && l.cfg.IsFeatureEnabled(config.FeatBEsc)) {
342 l.advance()
343 val = l.decodeEscape(c, startPos, startCol, startLine)
344 } else {
345 l.advance()
346 val = int64(c)
347 }
348 word = (word << 8) | (val & 0xFF)
349 }
350
351 tok := l.makeToken(token.Number, "", startPos, startCol, startLine)
352 if !l.match('\'') {
353 util.Error(tok, "Unterminated character literal")
354 }
355 tok.Value = strconv.FormatInt(word, 10)
356 return tok
357}
358
359func (l *Lexer) decodeEscape(escapeChar rune, startPos, startCol, startLine int) int64 {
360 if l.isAtEnd() {
361 util.Error(l.makeToken(token.EOF, "", l.pos, l.column, l.line), "Unterminated escape sequence")
362 return 0
363 }
364 c := l.advance()
365
366 // Handle hex escape sequences (\x followed by exactly 2 hex digits)
367 if c == 'x' {
368 return l.parseHexEscape(2, escapeChar, startPos, startCol, startLine)
369 }
370
371 // Handle unicode escape sequences (\u followed by exactly 4 hex digits)
372 if c == 'u' {
373 return l.parseHexEscape(4, escapeChar, startPos, startCol, startLine)
374 }
375
376 // Handle unicode escape sequences (\U followed by exactly 8 hex digits)
377 if c == 'U' {
378 return l.parseHexEscape(8, escapeChar, startPos, startCol, startLine)
379 }
380
381 // Handle octal escape sequences (\000-\377) - Go-style: exactly 3 digits required
382 if c >= '0' && c <= '7' {
383 val := int64(c - '0')
384 digitsRead := 1
385
386 // Read exactly 2 more digits (for 3 total - Go behavior)
387 for i := 0; i < 2; i++ {
388 if l.isAtEnd() {
389 util.Warn(l.cfg, config.WarnUnrecognizedEscape, l.makeToken(token.String, "", startPos, startCol, startLine),
390 "Octal escape sequence '%c%c...' requires exactly 3 digits, got %d (use \\%03o for Go-style)", escapeChar, c, digitsRead, val)
391 return val
392 }
393 next := l.peek()
394 if next >= '0' && next <= '7' {
395 val = val*8 + int64(next-'0')
396 l.advance()
397 digitsRead++
398 } else {
399 util.Warn(l.cfg, config.WarnUnrecognizedEscape, l.makeToken(token.String, "", startPos, startCol, startLine),
400 "Octal escape sequence '%c%c...' requires exactly 3 digits, got %d (use \\%03o for Go-style)", escapeChar, c, digitsRead, val)
401 return val
402 }
403 }
404 return val
405 }
406
407 escapes := map[rune]int64{
408 'n': '\n', 't': '\t', 'e': 4, 'b': '\b', 'r': '\r',
409 '(': '{', ')': '}', '\\': '\\', '\'': '\'', '"': '"', '*': '*',
410 'a': '\a', 'f': '\f', 'v': '\v', '0': 0,
411 }
412 if val, ok := escapes[c]; ok {
413 return val
414 }
415 util.Warn(l.cfg, config.WarnUnrecognizedEscape, l.makeToken(token.String, "", startPos, startCol, startLine), "Unrecognized escape sequence '%c%c'", escapeChar, c)
416 return int64(c)
417}
418
419func (l *Lexer) parseHexEscape(numDigits int, escapeChar rune, startPos, startCol, startLine int) int64 {
420 var val int64
421 for i := 0; i < numDigits; i++ {
422 if l.isAtEnd() {
423 util.Error(l.makeToken(token.String, "", startPos, startCol, startLine), "Incomplete hex escape sequence '%c%c' - expected %d hex digits", escapeChar, 'x', numDigits)
424 return 0
425 }
426 c := l.peek()
427 var digit int64
428 switch {
429 case c >= '0' && c <= '9': digit = int64(c - '0')
430 case c >= 'a' && c <= 'f': digit = int64(c - 'a' + 10)
431 case c >= 'A' && c <= 'F': digit = int64(c - 'A' + 10)
432 default:
433 util.Error(l.makeToken(token.String, "", startPos, startCol, startLine), "Invalid hex digit '%c' in escape sequence", c)
434 return 0
435 }
436 val = val*16 + digit
437 l.advance()
438 }
439 return val
440}
441
442func (l *Lexer) matchThen(expected rune, thenType, elseType token.Type, sPos, sCol, sLine int) token.Token {
443 if l.match(expected) {
444 return l.makeToken(thenType, "", sPos, sCol, sLine)
445 }
446 return l.makeToken(elseType, "", sPos, sCol, sLine)
447}
448
449func (l *Lexer) plus(sPos, sCol, sLine int) token.Token {
450 if l.match('+') {
451 return l.makeToken(token.Inc, "", sPos, sCol, sLine)
452 }
453 if l.cfg.IsFeatureEnabled(config.FeatCOps) && l.match('=') {
454 return l.makeToken(token.PlusEq, "", sPos, sCol, sLine)
455 }
456 return l.makeToken(token.Plus, "", sPos, sCol, sLine)
457}
458
459func (l *Lexer) minus(sPos, sCol, sLine int) token.Token {
460 if l.match('-') {
461 return l.makeToken(token.Dec, "", sPos, sCol, sLine)
462 }
463 if l.cfg.IsFeatureEnabled(config.FeatCOps) && l.match('=') {
464 return l.makeToken(token.MinusEq, "", sPos, sCol, sLine)
465 }
466 return l.makeToken(token.Minus, "", sPos, sCol, sLine)
467}
468
469func (l *Lexer) star(sPos, sCol, sLine int) token.Token {
470 if l.cfg.IsFeatureEnabled(config.FeatCOps) && l.match('=') {
471 return l.makeToken(token.StarEq, "", sPos, sCol, sLine)
472 }
473 return l.makeToken(token.Star, "", sPos, sCol, sLine)
474}
475
476func (l *Lexer) slash(sPos, sCol, sLine int) token.Token {
477 if l.cfg.IsFeatureEnabled(config.FeatCOps) && l.match('=') {
478 return l.makeToken(token.SlashEq, "", sPos, sCol, sLine)
479 }
480 return l.makeToken(token.Slash, "", sPos, sCol, sLine)
481}
482
483func (l *Lexer) ampersand(sPos, sCol, sLine int) token.Token {
484 if l.match('&') {
485 return l.makeToken(token.AndAnd, "", sPos, sCol, sLine)
486 }
487 if l.cfg.IsFeatureEnabled(config.FeatCOps) && l.match('=') {
488 return l.makeToken(token.AndEq, "", sPos, sCol, sLine)
489 }
490 return l.makeToken(token.And, "", sPos, sCol, sLine)
491}
492
493func (l *Lexer) pipe(sPos, sCol, sLine int) token.Token {
494 if l.match('|') {
495 return l.makeToken(token.OrOr, "", sPos, sCol, sLine)
496 }
497 if l.cfg.IsFeatureEnabled(config.FeatCOps) && l.match('=') {
498 return l.makeToken(token.OrEq, "", sPos, sCol, sLine)
499 }
500 return l.makeToken(token.Or, "", sPos, sCol, sLine)
501}
502
503func (l *Lexer) less(sPos, sCol, sLine int) token.Token {
504 if l.match('<') {
505 return l.matchThen('=', token.ShlEq, token.Shl, sPos, sCol, sLine)
506 }
507 return l.matchThen('=', token.Lte, token.Lt, sPos, sCol, sLine)
508}
509
510func (l *Lexer) greater(sPos, sCol, sLine int) token.Token {
511 if l.match('>') {
512 return l.matchThen('=', token.ShrEq, token.Shr, sPos, sCol, sLine)
513 }
514 return l.matchThen('=', token.Gte, token.Gt, sPos, sCol, sLine)
515}
516
517func (l *Lexer) equal(sPos, sCol, sLine int) token.Token {
518 if l.match('=') {
519 return l.makeToken(token.EqEq, "", sPos, sCol, sLine)
520 }
521 if l.cfg.IsFeatureEnabled(config.FeatBOps) {
522 switch {
523 case l.match('+'): return l.makeToken(token.EqPlus, "", sPos, sCol, sLine)
524 case l.match('-'): return l.makeToken(token.EqMinus, "", sPos, sCol, sLine)
525 case l.match('*'): return l.makeToken(token.EqStar, "", sPos, sCol, sLine)
526 case l.match('/'): return l.makeToken(token.EqSlash, "", sPos, sCol, sLine)
527 case l.match('%'): return l.makeToken(token.EqRem, "", sPos, sCol, sLine)
528 case l.match('&'): return l.makeToken(token.EqAnd, "", sPos, sCol, sLine)
529 case l.match('|'): return l.makeToken(token.EqOr, "", sPos, sCol, sLine)
530 case l.match('^'): return l.makeToken(token.EqXor, "", sPos, sCol, sLine)
531 case l.match('<') && l.match('<'): return l.makeToken(token.EqShl, "", sPos, sCol, sLine)
532 case l.match('>') && l.match('>'): return l.makeToken(token.EqShr, "", sPos, sCol, sLine)
533 }
534 }
535 return l.makeToken(token.Eq, "", sPos, sCol, sLine)
536}