repos / gbc

GBC - Go B Compiler
git clone https://github.com/xplshn/gbc.git

gbc / pkg / lexer
xplshn  ·  2025-09-10

lexer.go

Go
  1package lexer
  2
  3import (
  4	"strconv"
  5	"strings"
  6	"unicode"
  7
  8	"github.com/xplshn/gbc/pkg/config"
  9	"github.com/xplshn/gbc/pkg/token"
 10	"github.com/xplshn/gbc/pkg/util"
 11)
 12
 13type Lexer struct {
 14	source    []rune
 15	fileIndex int
 16	pos       int
 17	line      int
 18	column    int
 19	cfg       *config.Config
 20}
 21
 22func NewLexer(source []rune, fileIndex int, cfg *config.Config) *Lexer {
 23	return &Lexer{
 24		source: source, fileIndex: fileIndex, line: 1, column: 1, cfg: cfg,
 25	}
 26}
 27
 28func (l *Lexer) Next() token.Token {
 29	for {
 30		l.skipWhitespaceAndComments()
 31		startPos, startCol, startLine := l.pos, l.column, l.line
 32
 33		if l.isAtEnd() {
 34			return l.makeToken(token.EOF, "", startPos, startCol, startLine)
 35		}
 36
 37		if l.peek() == '/' && l.peekNext() == '/' {
 38			if !l.cfg.IsFeatureEnabled(config.FeatNoDirectives) {
 39				if tok, isDirective := l.lineCommentOrDirective(startPos, startCol, startLine); isDirective {
 40					return tok
 41				}
 42			}
 43			if l.cfg.IsFeatureEnabled(config.FeatCComments) {
 44				l.lineComment()
 45				continue
 46			}
 47		}
 48
 49		ch := l.peek()
 50		if unicode.IsLetter(ch) || ch == '_' {
 51			l.advance()
 52			return l.identifierOrKeyword(startPos, startCol, startLine)
 53		}
 54		if unicode.IsDigit(ch) || (ch == '.' && unicode.IsDigit(l.peekNext())) {
 55			return l.numberLiteral(startPos, startCol, startLine)
 56		}
 57
 58		l.advance()
 59		switch ch {
 60		case '(': return l.makeToken(token.LParen, "", startPos, startCol, startLine)
 61		case ')': return l.makeToken(token.RParen, "", startPos, startCol, startLine)
 62		case '{': return l.makeToken(token.LBrace, "", startPos, startCol, startLine)
 63		case '}': return l.makeToken(token.RBrace, "", startPos, startCol, startLine)
 64		case '[': return l.makeToken(token.LBracket, "", startPos, startCol, startLine)
 65		case ']': return l.makeToken(token.RBracket, "", startPos, startCol, startLine)
 66		case ';': return l.makeToken(token.Semi, "", startPos, startCol, startLine)
 67		case ',': return l.makeToken(token.Comma, "", startPos, startCol, startLine)
 68		case '?': return l.makeToken(token.Question, "", startPos, startCol, startLine)
 69		case '~': return l.makeToken(token.Complement, "", startPos, startCol, startLine)
 70		case ':': return l.matchThen('=', token.Define, token.Colon, startPos, startCol, startLine)
 71		case '!': return l.matchThen('=', token.Neq, token.Not, startPos, startCol, startLine)
 72		case '^': return l.matchThen('=', token.XorEq, token.Xor, startPos, startCol, startLine)
 73		case '%': return l.matchThen('=', token.RemEq, token.Rem, startPos, startCol, startLine)
 74		case '+':
 75			return l.plus(startPos, startCol, startLine)
 76		case '-':
 77			return l.minus(startPos, startCol, startLine)
 78		case '*':
 79			return l.star(startPos, startCol, startLine)
 80		case '/':
 81			return l.slash(startPos, startCol, startLine)
 82		case '&':
 83			return l.ampersand(startPos, startCol, startLine)
 84		case '|':
 85			return l.pipe(startPos, startCol, startLine)
 86		case '<':
 87			return l.less(startPos, startCol, startLine)
 88		case '>':
 89			return l.greater(startPos, startCol, startLine)
 90		case '=':
 91			return l.equal(startPos, startCol, startLine)
 92		case '.':
 93			if l.match('.') && l.match('.') {
 94				return l.makeToken(token.Dots, "", startPos, startCol, startLine)
 95			}
 96			return l.makeToken(token.Dot, "", startPos, startCol, startLine)
 97		case '"':
 98			return l.stringLiteral(startPos, startCol, startLine)
 99		case '\'':
100			return l.charLiteral(startPos, startCol, startLine)
101		}
102
103		tok := l.makeToken(token.EOF, "", startPos, startCol, startLine)
104		util.Error(tok, "Unexpected character: '%c'", ch)
105		return tok
106	}
107}
108
109func (l *Lexer) peek() rune {
110	if l.isAtEnd() {
111		return 0
112	}
113	return l.source[l.pos]
114}
115
116func (l *Lexer) peekNext() rune {
117	if l.pos+1 >= len(l.source) {
118		return 0
119	}
120	return l.source[l.pos+1]
121}
122
123func (l *Lexer) advance() rune {
124	if l.isAtEnd() {
125		return 0
126	}
127	ch := l.source[l.pos]
128	if ch == '\n' {
129		l.line++
130		l.column = 1
131	} else {
132		l.column++
133	}
134	l.pos++
135	return ch
136}
137
138func (l *Lexer) match(expected rune) bool {
139	if l.isAtEnd() || l.source[l.pos] != expected {
140		return false
141	}
142	l.advance()
143	return true
144}
145
146func (l *Lexer) isAtEnd() bool { return l.pos >= len(l.source) }
147
148func (l *Lexer) makeToken(tokType token.Type, value string, startPos, startCol, startLine int) token.Token {
149	return token.Token{
150		Type: tokType, Value: value, FileIndex: l.fileIndex,
151		Line: startLine, Column: startCol, Len: l.pos - startPos,
152	}
153}
154
155func (l *Lexer) skipWhitespaceAndComments() {
156	for {
157		switch l.peek() {
158		case ' ', '\t', '\n', '\r':
159			l.advance()
160		case '/':
161			if l.peekNext() == '*' {
162				l.blockComment()
163			} else {
164				return
165			}
166		default:
167			return
168		}
169	}
170}
171
172func (l *Lexer) blockComment() {
173	startTok := l.makeToken(token.Comment, "", l.pos, l.column, l.line)
174	l.advance()
175	l.advance()
176	for !l.isAtEnd() {
177		if l.peek() == '*' && l.peekNext() == '/' {
178			l.advance()
179			l.advance()
180			return
181		}
182		l.advance()
183	}
184	util.Error(startTok, "Unterminated block comment")
185}
186
187func (l *Lexer) lineComment() {
188	for !l.isAtEnd() && l.peek() != '\n' {
189		l.advance()
190	}
191}
192
193func (l *Lexer) lineCommentOrDirective(startPos, startCol, startLine int) (token.Token, bool) {
194	preCommentPos, preCommentCol, preCommentLine := l.pos, l.column, l.line
195	l.advance()
196	l.advance()
197	commentStartPos := l.pos
198	for !l.isAtEnd() && l.peek() != '\n' {
199		l.advance()
200	}
201	commentContent := string(l.source[commentStartPos:l.pos])
202	trimmedContent := strings.TrimSpace(commentContent)
203
204	if strings.HasPrefix(trimmedContent, "[b]:") {
205		directiveContent := strings.TrimSpace(strings.TrimPrefix(trimmedContent, "[b]:"))
206		return l.makeToken(token.Directive, directiveContent, startPos, startCol, startLine), true
207	}
208
209	l.pos, l.column, l.line = preCommentPos, preCommentCol, preCommentLine
210	return token.Token{}, false
211}
212
213func (l *Lexer) identifierOrKeyword(startPos, startCol, startLine int) token.Token {
214	for unicode.IsLetter(l.peek()) || unicode.IsDigit(l.peek()) || l.peek() == '_' {
215		l.advance()
216	}
217	value := string(l.source[startPos:l.pos])
218	tok := l.makeToken(token.Ident, value, startPos, startCol, startLine)
219
220	if tokType, isKeyword := token.KeywordMap[value]; isKeyword {
221		isTypedKeyword := tokType >= token.Void && tokType <= token.Any
222		if !isTypedKeyword || l.cfg.IsFeatureEnabled(config.FeatTyped) {
223			tok.Type = tokType
224			tok.Value = ""
225		}
226	}
227	return tok
228}
229
230func (l *Lexer) numberLiteral(startPos, startCol, startLine int) token.Token {
231	isFloat := false
232	if l.peek() == '.' {
233		isFloat = true
234		l.advance()
235	}
236
237	if l.peek() == '0' && (l.peekNext() == 'x' || l.peekNext() == 'X') {
238		l.advance()
239		l.advance()
240		for unicode.IsDigit(l.peek()) || (l.peek() >= 'a' && l.peek() <= 'f') || (l.peek() >= 'A' && l.peek() <= 'F') {
241			l.advance()
242		}
243	} else {
244		for unicode.IsDigit(l.peek()) {
245			l.advance()
246		}
247	}
248
249	if l.peek() == '.' {
250		if unicode.IsDigit(l.peekNext()) {
251			isFloat = true
252			l.advance()
253			for unicode.IsDigit(l.peek()) {
254				l.advance()
255			}
256		}
257	}
258
259	valueStr := string(l.source[startPos:l.pos])
260	if (l.peek() == 'e' || l.peek() == 'E') && !strings.HasPrefix(valueStr, "0x") && !strings.HasPrefix(valueStr, "0X") {
261		isFloat = true
262		l.advance()
263		if l.peek() == '+' || l.peek() == '-' {
264			l.advance()
265		}
266		if !unicode.IsDigit(l.peek()) {
267			util.Error(l.makeToken(token.FloatNumber, "", startPos, startCol, startLine), "Malformed floating-point literal: exponent has no digits")
268		}
269		for unicode.IsDigit(l.peek()) {
270			l.advance()
271		}
272	}
273
274	valueStr = string(l.source[startPos:l.pos])
275
276	if isFloat {
277		if !l.cfg.IsFeatureEnabled(config.FeatFloat) {
278			tok := l.makeToken(token.FloatNumber, valueStr, startPos, startCol, startLine)
279			util.Error(tok, "Floating-point numbers are not enabled (use -Ffloat)")
280			return tok
281		}
282		if l.cfg.IsWarningEnabled(config.WarnFloat) {
283			tok := l.makeToken(token.FloatNumber, valueStr, startPos, startCol, startLine)
284			util.Warn(l.cfg, config.WarnFloat, tok, "Use of floating-point constant")
285		}
286		return l.makeToken(token.FloatNumber, valueStr, startPos, startCol, startLine)
287	}
288
289	tok := l.makeToken(token.Number, "", startPos, startCol, startLine)
290	val, err := strconv.ParseUint(valueStr, 0, 64)
291	if err != nil {
292		if e, ok := err.(*strconv.NumError); ok && e.Err == strconv.ErrRange {
293			util.Warn(l.cfg, config.WarnOverflow, tok, "Integer constant overflow: %s", valueStr)
294			tok.Value = valueStr
295			return tok
296		}
297		util.Error(tok, "Invalid number literal: %s", valueStr)
298		tok.Value = "0"
299	} else {
300		tok.Value = strconv.FormatUint(val, 10)
301	}
302	return tok
303}
304
305func (l *Lexer) stringLiteral(startPos, startCol, startLine int) token.Token {
306	var sb strings.Builder
307	for !l.isAtEnd() {
308		c := l.peek()
309		if c == '"' {
310			l.advance()
311			return l.makeToken(token.String, sb.String(), startPos, startCol, startLine)
312		}
313		if (c == '\\' && l.cfg.IsFeatureEnabled(config.FeatCEsc)) || (c == '*' && l.cfg.IsFeatureEnabled(config.FeatBEsc)) {
314			l.advance()
315			val := l.decodeEscape(c, startPos, startCol, startLine)
316			// For values 0-127, write as regular rune. For 128-255, write as raw byte
317			if val <= 127 {
318				sb.WriteRune(rune(val))
319			} else {
320				// Build the final string using a byte slice to avoid UTF-8 encoding
321				existing := sb.String()
322				newBuf := []byte(existing)
323				newBuf = append(newBuf, byte(val))
324				sb.Reset()
325				sb.WriteString(string(newBuf))
326			}
327		} else {
328			l.advance()
329			sb.WriteRune(c)
330		}
331	}
332	util.Error(l.makeToken(token.String, "", startPos, startCol, startLine), "Unterminated string literal")
333	return l.makeToken(token.EOF, "", l.pos, l.column, l.line)
334}
335
336func (l *Lexer) charLiteral(startPos, startCol, startLine int) token.Token {
337	var word int64
338	for l.peek() != '\'' && !l.isAtEnd() {
339		var val int64
340		c := l.peek()
341		if (c == '\\' && l.cfg.IsFeatureEnabled(config.FeatCEsc)) || (c == '*' && l.cfg.IsFeatureEnabled(config.FeatBEsc)) {
342			l.advance()
343			val = l.decodeEscape(c, startPos, startCol, startLine)
344		} else {
345			l.advance()
346			val = int64(c)
347		}
348		word = (word << 8) | (val & 0xFF)
349	}
350
351	tok := l.makeToken(token.Number, "", startPos, startCol, startLine)
352	if !l.match('\'') {
353		util.Error(tok, "Unterminated character literal")
354	}
355	tok.Value = strconv.FormatInt(word, 10)
356	return tok
357}
358
359func (l *Lexer) decodeEscape(escapeChar rune, startPos, startCol, startLine int) int64 {
360	if l.isAtEnd() {
361		util.Error(l.makeToken(token.EOF, "", l.pos, l.column, l.line), "Unterminated escape sequence")
362		return 0
363	}
364	c := l.advance()
365
366	// Handle hex escape sequences (\x followed by exactly 2 hex digits)
367	if c == 'x' {
368		return l.parseHexEscape(2, escapeChar, startPos, startCol, startLine)
369	}
370
371	// Handle unicode escape sequences (\u followed by exactly 4 hex digits)
372	if c == 'u' {
373		return l.parseHexEscape(4, escapeChar, startPos, startCol, startLine)
374	}
375
376	// Handle unicode escape sequences (\U followed by exactly 8 hex digits)
377	if c == 'U' {
378		return l.parseHexEscape(8, escapeChar, startPos, startCol, startLine)
379	}
380
381	// Handle octal escape sequences (\000-\377) - Go-style: exactly 3 digits required
382	if c >= '0' && c <= '7' {
383		val := int64(c - '0')
384		digitsRead := 1
385
386		// Read exactly 2 more digits (for 3 total - Go behavior)
387		for i := 0; i < 2; i++ {
388			if l.isAtEnd() {
389				util.Warn(l.cfg, config.WarnUnrecognizedEscape, l.makeToken(token.String, "", startPos, startCol, startLine),
390					"Octal escape sequence '%c%c...' requires exactly 3 digits, got %d (use \\%03o for Go-style)", escapeChar, c, digitsRead, val)
391				return val
392			}
393			next := l.peek()
394			if next >= '0' && next <= '7' {
395				val = val*8 + int64(next-'0')
396				l.advance()
397				digitsRead++
398			} else {
399				util.Warn(l.cfg, config.WarnUnrecognizedEscape, l.makeToken(token.String, "", startPos, startCol, startLine),
400					"Octal escape sequence '%c%c...' requires exactly 3 digits, got %d (use \\%03o for Go-style)", escapeChar, c, digitsRead, val)
401				return val
402			}
403		}
404		return val
405	}
406
407	escapes := map[rune]int64{
408		'n': '\n', 't': '\t', 'e': 4, 'b': '\b', 'r': '\r',
409		'(': '{', ')': '}', '\\': '\\', '\'': '\'', '"': '"', '*': '*',
410		'a': '\a', 'f': '\f', 'v': '\v', '0': 0,
411	}
412	if val, ok := escapes[c]; ok {
413		return val
414	}
415	util.Warn(l.cfg, config.WarnUnrecognizedEscape, l.makeToken(token.String, "", startPos, startCol, startLine), "Unrecognized escape sequence '%c%c'", escapeChar, c)
416	return int64(c)
417}
418
419func (l *Lexer) parseHexEscape(numDigits int, escapeChar rune, startPos, startCol, startLine int) int64 {
420	var val int64
421	for i := 0; i < numDigits; i++ {
422		if l.isAtEnd() {
423			util.Error(l.makeToken(token.String, "", startPos, startCol, startLine), "Incomplete hex escape sequence '%c%c' - expected %d hex digits", escapeChar, 'x', numDigits)
424			return 0
425		}
426		c := l.peek()
427		var digit int64
428		switch {
429		case c >= '0' && c <= '9': digit = int64(c - '0')
430		case c >= 'a' && c <= 'f': digit = int64(c - 'a' + 10)
431		case c >= 'A' && c <= 'F': digit = int64(c - 'A' + 10)
432		default:
433			util.Error(l.makeToken(token.String, "", startPos, startCol, startLine), "Invalid hex digit '%c' in escape sequence", c)
434			return 0
435		}
436		val = val*16 + digit
437		l.advance()
438	}
439	return val
440}
441
442func (l *Lexer) matchThen(expected rune, thenType, elseType token.Type, sPos, sCol, sLine int) token.Token {
443	if l.match(expected) {
444		return l.makeToken(thenType, "", sPos, sCol, sLine)
445	}
446	return l.makeToken(elseType, "", sPos, sCol, sLine)
447}
448
449func (l *Lexer) plus(sPos, sCol, sLine int) token.Token {
450	if l.match('+') {
451		return l.makeToken(token.Inc, "", sPos, sCol, sLine)
452	}
453	if l.cfg.IsFeatureEnabled(config.FeatCOps) && l.match('=') {
454		return l.makeToken(token.PlusEq, "", sPos, sCol, sLine)
455	}
456	return l.makeToken(token.Plus, "", sPos, sCol, sLine)
457}
458
459func (l *Lexer) minus(sPos, sCol, sLine int) token.Token {
460	if l.match('-') {
461		return l.makeToken(token.Dec, "", sPos, sCol, sLine)
462	}
463	if l.cfg.IsFeatureEnabled(config.FeatCOps) && l.match('=') {
464		return l.makeToken(token.MinusEq, "", sPos, sCol, sLine)
465	}
466	return l.makeToken(token.Minus, "", sPos, sCol, sLine)
467}
468
469func (l *Lexer) star(sPos, sCol, sLine int) token.Token {
470	if l.cfg.IsFeatureEnabled(config.FeatCOps) && l.match('=') {
471		return l.makeToken(token.StarEq, "", sPos, sCol, sLine)
472	}
473	return l.makeToken(token.Star, "", sPos, sCol, sLine)
474}
475
476func (l *Lexer) slash(sPos, sCol, sLine int) token.Token {
477	if l.cfg.IsFeatureEnabled(config.FeatCOps) && l.match('=') {
478		return l.makeToken(token.SlashEq, "", sPos, sCol, sLine)
479	}
480	return l.makeToken(token.Slash, "", sPos, sCol, sLine)
481}
482
483func (l *Lexer) ampersand(sPos, sCol, sLine int) token.Token {
484	if l.match('&') {
485		return l.makeToken(token.AndAnd, "", sPos, sCol, sLine)
486	}
487	if l.cfg.IsFeatureEnabled(config.FeatCOps) && l.match('=') {
488		return l.makeToken(token.AndEq, "", sPos, sCol, sLine)
489	}
490	return l.makeToken(token.And, "", sPos, sCol, sLine)
491}
492
493func (l *Lexer) pipe(sPos, sCol, sLine int) token.Token {
494	if l.match('|') {
495		return l.makeToken(token.OrOr, "", sPos, sCol, sLine)
496	}
497	if l.cfg.IsFeatureEnabled(config.FeatCOps) && l.match('=') {
498		return l.makeToken(token.OrEq, "", sPos, sCol, sLine)
499	}
500	return l.makeToken(token.Or, "", sPos, sCol, sLine)
501}
502
503func (l *Lexer) less(sPos, sCol, sLine int) token.Token {
504	if l.match('<') {
505		return l.matchThen('=', token.ShlEq, token.Shl, sPos, sCol, sLine)
506	}
507	return l.matchThen('=', token.Lte, token.Lt, sPos, sCol, sLine)
508}
509
510func (l *Lexer) greater(sPos, sCol, sLine int) token.Token {
511	if l.match('>') {
512		return l.matchThen('=', token.ShrEq, token.Shr, sPos, sCol, sLine)
513	}
514	return l.matchThen('=', token.Gte, token.Gt, sPos, sCol, sLine)
515}
516
517func (l *Lexer) equal(sPos, sCol, sLine int) token.Token {
518	if l.match('=') {
519		return l.makeToken(token.EqEq, "", sPos, sCol, sLine)
520	}
521	if l.cfg.IsFeatureEnabled(config.FeatBOps) {
522		switch {
523		case l.match('+'): return l.makeToken(token.EqPlus, "", sPos, sCol, sLine)
524		case l.match('-'): return l.makeToken(token.EqMinus, "", sPos, sCol, sLine)
525		case l.match('*'): return l.makeToken(token.EqStar, "", sPos, sCol, sLine)
526		case l.match('/'): return l.makeToken(token.EqSlash, "", sPos, sCol, sLine)
527		case l.match('%'): return l.makeToken(token.EqRem, "", sPos, sCol, sLine)
528		case l.match('&'): return l.makeToken(token.EqAnd, "", sPos, sCol, sLine)
529		case l.match('|'): return l.makeToken(token.EqOr, "", sPos, sCol, sLine)
530		case l.match('^'): return l.makeToken(token.EqXor, "", sPos, sCol, sLine)
531		case l.match('<') && l.match('<'): return l.makeToken(token.EqShl, "", sPos, sCol, sLine)
532		case l.match('>') && l.match('>'): return l.makeToken(token.EqShr, "", sPos, sCol, sLine)
533		}
534	}
535	return l.makeToken(token.Eq, "", sPos, sCol, sLine)
536}