xplshn
·
2025-08-16
lexer.go
Go
1package lexer
2
3import (
4 "strconv"
5 "strings"
6 "unicode"
7
8 "github.com/xplshn/gbc/pkg/config"
9 "github.com/xplshn/gbc/pkg/token"
10 "github.com/xplshn/gbc/pkg/util"
11)
12
// Lexer turns a source file into a stream of tokens.
type Lexer struct {
	source    []rune         // full source text, decoded to runes
	fileIndex int            // index of this file within the compilation
	pos       int            // offset of the next unread rune in source
	line      int            // 1-based line number of pos
	column    int            // 1-based column number of pos
	cfg       *config.Config // feature flags selecting dialect behavior
}
21
22func NewLexer(source []rune, fileIndex int, cfg *config.Config) *Lexer {
23 return &Lexer{
24 source: source, fileIndex: fileIndex, line: 1, column: 1, cfg: cfg,
25 }
26}
27
28func (l *Lexer) Next() token.Token {
29 for {
30 l.skipWhitespaceAndComments()
31 startPos, startCol, startLine := l.pos, l.column, l.line
32
33 if l.isAtEnd() {
34 return l.makeToken(token.EOF, "", startPos, startCol, startLine)
35 }
36
37 if !l.cfg.IsFeatureEnabled(config.FeatNoDirectives) && l.peek() == '/' && l.peekNext() == '/' {
38 if tok, isDirective := l.lineCommentOrDirective(startPos, startCol, startLine); isDirective {
39 return tok
40 }
41 }
42
43 ch := l.advance()
44 if unicode.IsLetter(ch) || ch == '_' {
45 return l.identifierOrKeyword(startPos, startCol, startLine)
46 }
47 if unicode.IsDigit(ch) {
48 return l.numberLiteral(startPos, startCol, startLine)
49 }
50
51 switch ch {
52 case '(':
53 return l.makeToken(token.LParen, "", startPos, startCol, startLine)
54 case ')':
55 return l.makeToken(token.RParen, "", startPos, startCol, startLine)
56 case '{':
57 return l.makeToken(token.LBrace, "", startPos, startCol, startLine)
58 case '}':
59 return l.makeToken(token.RBrace, "", startPos, startCol, startLine)
60 case '[':
61 return l.makeToken(token.LBracket, "", startPos, startCol, startLine)
62 case ']':
63 return l.makeToken(token.RBracket, "", startPos, startCol, startLine)
64 case ';':
65 return l.makeToken(token.Semi, "", startPos, startCol, startLine)
66 case ',':
67 return l.makeToken(token.Comma, "", startPos, startCol, startLine)
68 case '?':
69 return l.makeToken(token.Question, "", startPos, startCol, startLine)
70 case '~':
71 return l.makeToken(token.Complement, "", startPos, startCol, startLine)
72 case ':':
73 return l.matchThen('=', token.Define, token.Colon, startPos, startCol, startLine)
74 case '!':
75 return l.matchThen('=', token.Neq, token.Not, startPos, startCol, startLine)
76 case '^':
77 return l.matchThen('=', token.XorEq, token.Xor, startPos, startCol, startLine)
78 case '%':
79 return l.matchThen('=', token.RemEq, token.Rem, startPos, startCol, startLine)
80 case '+':
81 return l.plus(startPos, startCol, startLine)
82 case '-':
83 return l.minus(startPos, startCol, startLine)
84 case '*':
85 return l.star(startPos, startCol, startLine)
86 case '/':
87 return l.slash(startPos, startCol, startLine)
88 case '&':
89 return l.ampersand(startPos, startCol, startLine)
90 case '|':
91 return l.pipe(startPos, startCol, startLine)
92 case '<':
93 return l.less(startPos, startCol, startLine)
94 case '>':
95 return l.greater(startPos, startCol, startLine)
96 case '=':
97 return l.equal(startPos, startCol, startLine)
98 case '.':
99 if l.match('.') && l.match('.') {
100 return l.makeToken(token.Dots, "", startPos, startCol, startLine)
101 }
102 return l.makeToken(token.Dot, "", startPos, startCol, startLine)
103 case '"':
104 return l.stringLiteral(startPos, startCol, startLine)
105 case '\'':
106 return l.charLiteral(startPos, startCol, startLine)
107 }
108
109 tok := l.makeToken(token.EOF, "", startPos, startCol, startLine)
110 util.Error(tok, "Unexpected character: '%c'", ch)
111 return tok
112 }
113}
114
115func (l *Lexer) peek() rune {
116 if l.isAtEnd() {
117 return 0
118 }
119 return l.source[l.pos]
120}
121
122func (l *Lexer) peekNext() rune {
123 if l.pos+1 >= len(l.source) {
124 return 0
125 }
126 return l.source[l.pos+1]
127}
128
129func (l *Lexer) advance() rune {
130 if l.isAtEnd() {
131 return 0
132 }
133 ch := l.source[l.pos]
134 if ch == '\n' {
135 l.line++
136 l.column = 1
137 } else {
138 l.column++
139 }
140 l.pos++
141 return ch
142}
143
144func (l *Lexer) match(expected rune) bool {
145 if l.isAtEnd() || l.source[l.pos] != expected {
146 return false
147 }
148 l.advance()
149 return true
150}
151
152func (l *Lexer) isAtEnd() bool {
153 return l.pos >= len(l.source)
154}
155
156func (l *Lexer) makeToken(tokType token.Type, value string, startPos, startCol, startLine int) token.Token {
157 return token.Token{
158 Type: tokType, Value: value, FileIndex: l.fileIndex,
159 Line: startLine, Column: startCol, Len: l.pos - startPos,
160 }
161}
162
163func (l *Lexer) skipWhitespaceAndComments() {
164 for {
165 c := l.peek()
166 switch c {
167 case ' ', '\t', '\n', '\r':
168 l.advance()
169 case '/':
170 if l.peekNext() == '*' {
171 l.blockComment()
172 } else if l.peekNext() == '/' && l.cfg.IsFeatureEnabled(config.FeatCComments) {
173 l.lineComment()
174 } else {
175 return
176 }
177 default:
178 return
179 }
180 }
181}
182
183func (l *Lexer) blockComment() {
184 startTok := l.makeToken(token.Comment, "", l.pos, l.column, l.line)
185 l.advance() // Consume '/'
186 l.advance() // Consume '*'
187 for !l.isAtEnd() {
188 if l.peek() == '*' && l.peekNext() == '/' {
189 l.advance()
190 l.advance()
191 return
192 }
193 l.advance()
194 }
195 util.Error(startTok, "Unterminated block comment")
196}
197
198func (l *Lexer) lineComment() {
199 for !l.isAtEnd() && l.peek() != '\n' {
200 l.advance()
201 }
202}
203
// lineCommentOrDirective scans a "//" comment to end of line and, if
// its trimmed body begins with the "[b]:" marker, returns a Directive
// token whose value is the trimmed text after the marker. Otherwise it
// rewinds the lexer to just before the "//" and returns ok=false so
// the comment can be re-lexed through the normal path.
func (l *Lexer) lineCommentOrDirective(startPos, startCol, startLine int) (token.Token, bool) {
	// Remember the pre-comment position so we can backtrack below.
	preCommentPos, preCommentCol, preCommentLine := l.pos, l.column, l.line
	l.advance() // Consume '/'
	l.advance() // Consume '/'
	commentStartPos := l.pos
	for !l.isAtEnd() && l.peek() != '\n' {
		l.advance()
	}
	commentContent := string(l.source[commentStartPos:l.pos])
	trimmedContent := strings.TrimSpace(commentContent)

	if strings.HasPrefix(trimmedContent, "[b]:") {
		// Slice off the 4-byte "[b]:" marker; the remainder is the
		// directive payload.
		directiveContent := strings.TrimSpace(trimmedContent[4:])
		return l.makeToken(token.Directive, directiveContent, startPos, startCol, startLine), true
	}

	// Not a directive: restore position/line/column as if nothing was read.
	l.pos, l.column, l.line = preCommentPos, preCommentCol, preCommentLine
	return token.Token{}, false
}
223
// identifierOrKeyword scans the rest of an identifier (letters, digits,
// and '_'; the first rune was already consumed by the caller) and
// promotes it to a keyword token if it appears in token.KeywordMap.
func (l *Lexer) identifierOrKeyword(startPos, startCol, startLine int) token.Token {
	for unicode.IsLetter(l.peek()) || unicode.IsDigit(l.peek()) || l.peek() == '_' {
		l.advance()
	}
	value := string(l.source[startPos:l.pos])
	tok := l.makeToken(token.Ident, value, startPos, startCol, startLine)

	if tokType, isKeyword := token.KeywordMap[value]; isKeyword {
		// NOTE(review): assumes the type-name keywords occupy the
		// contiguous constant range [token.Void, token.Any] — confirm
		// if that enum is ever reordered.
		isTypedKeyword := tokType >= token.Void && tokType <= token.Any
		// Type names only act as keywords in the typed dialect;
		// otherwise they stay ordinary identifiers.
		if !isTypedKeyword || l.cfg.IsFeatureEnabled(config.FeatTyped) {
			tok.Type = tokType
			tok.Value = ""
		}
	}
	return tok
}
240
// numberLiteral scans a numeric literal (first digit already consumed)
// and stores its value, rendered as a signed decimal string, in a
// Number token. Base selection (decimal or 0x/0X hex) is delegated to
// strconv.ParseUint with base 0.
func (l *Lexer) numberLiteral(startPos, startCol, startLine int) token.Token {
	// The scan is deliberately loose: 'x'/'X' and hex letters are
	// accepted anywhere, so malformed spellings such as "12ff" are
	// consumed whole here and rejected by ParseUint below rather than
	// splitting into two tokens.
	for unicode.IsDigit(l.peek()) || (l.peek() == 'x' || l.peek() == 'X') || (l.peek() >= 'a' && l.peek() <= 'f') || (l.peek() >= 'A' && l.peek() <= 'F') {
		l.advance()
	}
	valueStr := string(l.source[startPos:l.pos])
	tok := l.makeToken(token.Number, "", startPos, startCol, startLine)
	val, err := strconv.ParseUint(valueStr, 0, 64)
	if err != nil {
		util.Error(tok, "Invalid number literal: %s", valueStr)
	}
	// Reinterpret the 64-bit pattern as signed, so literals above
	// 2^63-1 render negative — presumably intended two's-complement
	// word semantics; confirm against the parser's expectations.
	tok.Value = strconv.FormatInt(int64(val), 10)
	return tok
}
254
255func (l *Lexer) stringLiteral(startPos, startCol, startLine int) token.Token {
256 var sb strings.Builder
257 for !l.isAtEnd() {
258 c := l.peek()
259 if c == '"' {
260 l.advance()
261 return l.makeToken(token.String, sb.String(), startPos, startCol, startLine)
262 }
263 if (c == '\\' && l.cfg.IsFeatureEnabled(config.FeatCEsc)) || (c == '*' && l.cfg.IsFeatureEnabled(config.FeatBEsc)) {
264 l.advance()
265 sb.WriteRune(rune(l.decodeEscape(c, startPos, startCol, startLine)))
266 } else {
267 l.advance()
268 sb.WriteRune(c)
269 }
270 }
271 util.Error(l.makeToken(token.String, "", startPos, startCol, startLine), "Unterminated string literal")
272 return l.makeToken(token.EOF, "", l.pos, l.column, l.line)
273}
274
// charLiteral scans a single-quoted literal (opening quote already
// consumed) and emits it as a Number token. Multiple characters are
// packed into one 64-bit word, earlier characters in higher bytes,
// each contributing only its low 8 bits.
func (l *Lexer) charLiteral(startPos, startCol, startLine int) token.Token {
	var word int64
	for l.peek() != '\'' && !l.isAtEnd() {
		var val int64
		c := l.peek()
		// '\' escapes when C escapes are enabled, '*' escapes when B
		// escapes are enabled.
		if (c == '\\' && l.cfg.IsFeatureEnabled(config.FeatCEsc)) || (c == '*' && l.cfg.IsFeatureEnabled(config.FeatBEsc)) {
			l.advance()
			val = l.decodeEscape(c, startPos, startCol, startLine)
		} else {
			l.advance()
			val = int64(c)
		}
		// Shift previous characters up one byte and append this one;
		// runes above 0xFF are truncated to their low byte.
		word = (word << 8) | (val & 0xFF)
	}

	tok := l.makeToken(token.Number, "", startPos, startCol, startLine)
	if !l.match('\'') {
		util.Error(tok, "Unterminated character literal")
	}
	tok.Value = strconv.FormatInt(word, 10)
	return tok
}
297
298func (l *Lexer) decodeEscape(escapeChar rune, startPos, startCol, startLine int) int64 {
299 if l.isAtEnd() {
300 util.Error(l.makeToken(token.EOF, "", l.pos, l.column, l.line), "Unterminated escape sequence")
301 return 0
302 }
303 c := l.advance()
304 escapes := map[rune]int64{'n': '\n', 't': '\t', 'e': 4, 'b': '\b', 'r': '\r', '0': 0, '(': '{', ')': '}', '\\': '\\', '\'': '\'', '"': '"', '*': '*'}
305 if val, ok := escapes[c]; ok {
306 return val
307 }
308 util.Warn(l.cfg, config.WarnUnrecognizedEscape, l.makeToken(token.String, "", startPos, startCol, startLine), "Unrecognized escape sequence '%c%c'", escapeChar, c)
309 return int64(c)
310}
311
312func (l *Lexer) matchThen(expected rune, thenType, elseType token.Type, sPos, sCol, sLine int) token.Token {
313 if l.match(expected) {
314 return l.makeToken(thenType, "", sPos, sCol, sLine)
315 }
316 return l.makeToken(elseType, "", sPos, sCol, sLine)
317}
318
319func (l *Lexer) plus(sPos, sCol, sLine int) token.Token {
320 if l.match('+') {
321 return l.makeToken(token.Inc, "", sPos, sCol, sLine)
322 }
323 if l.cfg.IsFeatureEnabled(config.FeatCOps) && l.match('=') {
324 return l.makeToken(token.PlusEq, "", sPos, sCol, sLine)
325 }
326 return l.makeToken(token.Plus, "", sPos, sCol, sLine)
327}
328
329func (l *Lexer) minus(sPos, sCol, sLine int) token.Token {
330 if l.match('-') {
331 return l.makeToken(token.Dec, "", sPos, sCol, sLine)
332 }
333 if l.cfg.IsFeatureEnabled(config.FeatCOps) && l.match('=') {
334 return l.makeToken(token.MinusEq, "", sPos, sCol, sLine)
335 }
336 return l.makeToken(token.Minus, "", sPos, sCol, sLine)
337}
338
339func (l *Lexer) star(sPos, sCol, sLine int) token.Token {
340 if l.cfg.IsFeatureEnabled(config.FeatCOps) && l.match('=') {
341 return l.makeToken(token.StarEq, "", sPos, sCol, sLine)
342 }
343 return l.makeToken(token.Star, "", sPos, sCol, sLine)
344}
345
346func (l *Lexer) slash(sPos, sCol, sLine int) token.Token {
347 if l.cfg.IsFeatureEnabled(config.FeatCOps) && l.match('=') {
348 return l.makeToken(token.SlashEq, "", sPos, sCol, sLine)
349 }
350 return l.makeToken(token.Slash, "", sPos, sCol, sLine)
351}
352
353func (l *Lexer) ampersand(sPos, sCol, sLine int) token.Token {
354 if l.match('&') {
355 return l.makeToken(token.AndAnd, "", sPos, sCol, sLine)
356 }
357 if l.cfg.IsFeatureEnabled(config.FeatCOps) && l.match('=') {
358 return l.makeToken(token.AndEq, "", sPos, sCol, sLine)
359 }
360 return l.makeToken(token.And, "", sPos, sCol, sLine)
361}
362
363func (l *Lexer) pipe(sPos, sCol, sLine int) token.Token {
364 if l.match('|') {
365 return l.makeToken(token.OrOr, "", sPos, sCol, sLine)
366 }
367 if l.cfg.IsFeatureEnabled(config.FeatCOps) && l.match('=') {
368 return l.makeToken(token.OrEq, "", sPos, sCol, sLine)
369 }
370 return l.makeToken(token.Or, "", sPos, sCol, sLine)
371}
372
373func (l *Lexer) less(sPos, sCol, sLine int) token.Token {
374 if l.match('<') {
375 return l.matchThen('=', token.ShlEq, token.Shl, sPos, sCol, sLine)
376 }
377 return l.matchThen('=', token.Lte, token.Lt, sPos, sCol, sLine)
378}
379
380func (l *Lexer) greater(sPos, sCol, sLine int) token.Token {
381 if l.match('>') {
382 return l.matchThen('=', token.ShrEq, token.Shr, sPos, sCol, sLine)
383 }
384 return l.matchThen('=', token.Gte, token.Gt, sPos, sCol, sLine)
385}
386
387func (l *Lexer) equal(sPos, sCol, sLine int) token.Token {
388 if l.match('=') {
389 return l.makeToken(token.EqEq, "", sPos, sCol, sLine)
390 }
391 if l.cfg.IsFeatureEnabled(config.FeatBOps) {
392 switch {
393 case l.match('+'):
394 return l.makeToken(token.EqPlus, "", sPos, sCol, sLine)
395 case l.match('-'):
396 return l.makeToken(token.EqMinus, "", sPos, sCol, sLine)
397 case l.match('*'):
398 return l.makeToken(token.EqStar, "", sPos, sCol, sLine)
399 case l.match('/'):
400 return l.makeToken(token.EqSlash, "", sPos, sCol, sLine)
401 case l.match('%'):
402 return l.makeToken(token.EqRem, "", sPos, sCol, sLine)
403 case l.match('&'):
404 return l.makeToken(token.EqAnd, "", sPos, sCol, sLine)
405 case l.match('|'):
406 return l.makeToken(token.EqOr, "", sPos, sCol, sLine)
407 case l.match('^'):
408 return l.makeToken(token.EqXor, "", sPos, sCol, sLine)
409 case l.match('<') && l.match('<'):
410 return l.makeToken(token.EqShl, "", sPos, sCol, sLine)
411 case l.match('>') && l.match('>'):
412 return l.makeToken(token.EqShr, "", sPos, sCol, sLine)
413 }
414 }
415 return l.makeToken(token.Eq, "", sPos, sCol, sLine)
416}