repos / gbc

GBC - Go B Compiler
git clone https://github.com/xplshn/gbc.git

gbc / pkg / lexer
xplshn  ·  2025-08-16

lexer.go

Go
  1package lexer
  2
  3import (
  4	"strconv"
  5	"strings"
  6	"unicode"
  7
  8	"github.com/xplshn/gbc/pkg/config"
  9	"github.com/xplshn/gbc/pkg/token"
 10	"github.com/xplshn/gbc/pkg/util"
 11)
 12
// Lexer performs lexical analysis of a single source file, producing
// token.Token values one at a time via Next.
type Lexer struct {
	source    []rune         // full decoded contents of the file being scanned
	fileIndex int            // index of this file in the compilation unit; stamped on every token
	pos       int            // current rune offset into source
	line      int            // 1-based line number of the rune at pos
	column    int            // 1-based column number of the rune at pos
	cfg       *config.Config // feature flags (dialect options) and warning settings
}
 21
 22func NewLexer(source []rune, fileIndex int, cfg *config.Config) *Lexer {
 23	return &Lexer{
 24		source: source, fileIndex: fileIndex, line: 1, column: 1, cfg: cfg,
 25	}
 26}
 27
 28func (l *Lexer) Next() token.Token {
 29	for {
 30		l.skipWhitespaceAndComments()
 31		startPos, startCol, startLine := l.pos, l.column, l.line
 32
 33		if l.isAtEnd() {
 34			return l.makeToken(token.EOF, "", startPos, startCol, startLine)
 35		}
 36
 37		if !l.cfg.IsFeatureEnabled(config.FeatNoDirectives) && l.peek() == '/' && l.peekNext() == '/' {
 38			if tok, isDirective := l.lineCommentOrDirective(startPos, startCol, startLine); isDirective {
 39				return tok
 40			}
 41		}
 42
 43		ch := l.advance()
 44		if unicode.IsLetter(ch) || ch == '_' {
 45			return l.identifierOrKeyword(startPos, startCol, startLine)
 46		}
 47		if unicode.IsDigit(ch) {
 48			return l.numberLiteral(startPos, startCol, startLine)
 49		}
 50
 51		switch ch {
 52		case '(':
 53			return l.makeToken(token.LParen, "", startPos, startCol, startLine)
 54		case ')':
 55			return l.makeToken(token.RParen, "", startPos, startCol, startLine)
 56		case '{':
 57			return l.makeToken(token.LBrace, "", startPos, startCol, startLine)
 58		case '}':
 59			return l.makeToken(token.RBrace, "", startPos, startCol, startLine)
 60		case '[':
 61			return l.makeToken(token.LBracket, "", startPos, startCol, startLine)
 62		case ']':
 63			return l.makeToken(token.RBracket, "", startPos, startCol, startLine)
 64		case ';':
 65			return l.makeToken(token.Semi, "", startPos, startCol, startLine)
 66		case ',':
 67			return l.makeToken(token.Comma, "", startPos, startCol, startLine)
 68		case '?':
 69			return l.makeToken(token.Question, "", startPos, startCol, startLine)
 70		case '~':
 71			return l.makeToken(token.Complement, "", startPos, startCol, startLine)
 72		case ':':
 73			return l.matchThen('=', token.Define, token.Colon, startPos, startCol, startLine)
 74		case '!':
 75			return l.matchThen('=', token.Neq, token.Not, startPos, startCol, startLine)
 76		case '^':
 77			return l.matchThen('=', token.XorEq, token.Xor, startPos, startCol, startLine)
 78		case '%':
 79			return l.matchThen('=', token.RemEq, token.Rem, startPos, startCol, startLine)
 80		case '+':
 81			return l.plus(startPos, startCol, startLine)
 82		case '-':
 83			return l.minus(startPos, startCol, startLine)
 84		case '*':
 85			return l.star(startPos, startCol, startLine)
 86		case '/':
 87			return l.slash(startPos, startCol, startLine)
 88		case '&':
 89			return l.ampersand(startPos, startCol, startLine)
 90		case '|':
 91			return l.pipe(startPos, startCol, startLine)
 92		case '<':
 93			return l.less(startPos, startCol, startLine)
 94		case '>':
 95			return l.greater(startPos, startCol, startLine)
 96		case '=':
 97			return l.equal(startPos, startCol, startLine)
 98		case '.':
 99			if l.match('.') && l.match('.') {
100				return l.makeToken(token.Dots, "", startPos, startCol, startLine)
101			}
102			return l.makeToken(token.Dot, "", startPos, startCol, startLine)
103		case '"':
104			return l.stringLiteral(startPos, startCol, startLine)
105		case '\'':
106			return l.charLiteral(startPos, startCol, startLine)
107		}
108
109		tok := l.makeToken(token.EOF, "", startPos, startCol, startLine)
110		util.Error(tok, "Unexpected character: '%c'", ch)
111		return tok
112	}
113}
114
115func (l *Lexer) peek() rune {
116	if l.isAtEnd() {
117		return 0
118	}
119	return l.source[l.pos]
120}
121
122func (l *Lexer) peekNext() rune {
123	if l.pos+1 >= len(l.source) {
124		return 0
125	}
126	return l.source[l.pos+1]
127}
128
129func (l *Lexer) advance() rune {
130	if l.isAtEnd() {
131		return 0
132	}
133	ch := l.source[l.pos]
134	if ch == '\n' {
135		l.line++
136		l.column = 1
137	} else {
138		l.column++
139	}
140	l.pos++
141	return ch
142}
143
144func (l *Lexer) match(expected rune) bool {
145	if l.isAtEnd() || l.source[l.pos] != expected {
146		return false
147	}
148	l.advance()
149	return true
150}
151
152func (l *Lexer) isAtEnd() bool {
153	return l.pos >= len(l.source)
154}
155
156func (l *Lexer) makeToken(tokType token.Type, value string, startPos, startCol, startLine int) token.Token {
157	return token.Token{
158		Type: tokType, Value: value, FileIndex: l.fileIndex,
159		Line: startLine, Column: startCol, Len: l.pos - startPos,
160	}
161}
162
163func (l *Lexer) skipWhitespaceAndComments() {
164	for {
165		c := l.peek()
166		switch c {
167		case ' ', '\t', '\n', '\r':
168			l.advance()
169		case '/':
170			if l.peekNext() == '*' {
171				l.blockComment()
172			} else if l.peekNext() == '/' && l.cfg.IsFeatureEnabled(config.FeatCComments) {
173				l.lineComment()
174			} else {
175				return
176			}
177		default:
178			return
179		}
180	}
181}
182
183func (l *Lexer) blockComment() {
184	startTok := l.makeToken(token.Comment, "", l.pos, l.column, l.line)
185	l.advance() // Consume '/'
186	l.advance() // Consume '*'
187	for !l.isAtEnd() {
188		if l.peek() == '*' && l.peekNext() == '/' {
189			l.advance()
190			l.advance()
191			return
192		}
193		l.advance()
194	}
195	util.Error(startTok, "Unterminated block comment")
196}
197
198func (l *Lexer) lineComment() {
199	for !l.isAtEnd() && l.peek() != '\n' {
200		l.advance()
201	}
202}
203
204func (l *Lexer) lineCommentOrDirective(startPos, startCol, startLine int) (token.Token, bool) {
205	preCommentPos, preCommentCol, preCommentLine := l.pos, l.column, l.line
206	l.advance() // Consume '/'
207	l.advance() // Consume '/'
208	commentStartPos := l.pos
209	for !l.isAtEnd() && l.peek() != '\n' {
210		l.advance()
211	}
212	commentContent := string(l.source[commentStartPos:l.pos])
213	trimmedContent := strings.TrimSpace(commentContent)
214
215	if strings.HasPrefix(trimmedContent, "[b]:") {
216		directiveContent := strings.TrimSpace(trimmedContent[4:])
217		return l.makeToken(token.Directive, directiveContent, startPos, startCol, startLine), true
218	}
219
220	l.pos, l.column, l.line = preCommentPos, preCommentCol, preCommentLine
221	return token.Token{}, false
222}
223
224func (l *Lexer) identifierOrKeyword(startPos, startCol, startLine int) token.Token {
225	for unicode.IsLetter(l.peek()) || unicode.IsDigit(l.peek()) || l.peek() == '_' {
226		l.advance()
227	}
228	value := string(l.source[startPos:l.pos])
229	tok := l.makeToken(token.Ident, value, startPos, startCol, startLine)
230
231	if tokType, isKeyword := token.KeywordMap[value]; isKeyword {
232		isTypedKeyword := tokType >= token.Void && tokType <= token.Any
233		if !isTypedKeyword || l.cfg.IsFeatureEnabled(config.FeatTyped) {
234			tok.Type = tokType
235			tok.Value = ""
236		}
237	}
238	return tok
239}
240
// numberLiteral scans the remainder of a numeric literal whose first digit
// has already been consumed and returns a Number token whose Value is the
// literal rendered as a signed decimal string.
//
// NOTE(review): the scan accepts 'x'/'X' and the hex letters a-f/A-F at ANY
// position, so malformed sequences such as "1x2" or "12ab" are consumed as
// one lexeme and rejected by ParseUint below rather than split into
// number + identifier — presumably intentional, to diagnose the whole run
// as one bad literal; confirm against the parser's expectations.
func (l *Lexer) numberLiteral(startPos, startCol, startLine int) token.Token {
	for unicode.IsDigit(l.peek()) || (l.peek() == 'x' || l.peek() == 'X') || (l.peek() >= 'a' && l.peek() <= 'f') || (l.peek() >= 'A' && l.peek() <= 'F') {
		l.advance()
	}
	valueStr := string(l.source[startPos:l.pos])
	tok := l.makeToken(token.Number, "", startPos, startCol, startLine)
	// Base 0 lets ParseUint infer decimal, octal ("0..."), or hex ("0x...").
	val, err := strconv.ParseUint(valueStr, 0, 64)
	if err != nil {
		util.Error(tok, "Invalid number literal: %s", valueStr)
	}
	// Reinterpret the 64-bit word as signed: literals above MaxInt64 render
	// as their two's-complement (negative) decimal form.
	tok.Value = strconv.FormatInt(int64(val), 10)
	return tok
}
254
255func (l *Lexer) stringLiteral(startPos, startCol, startLine int) token.Token {
256	var sb strings.Builder
257	for !l.isAtEnd() {
258		c := l.peek()
259		if c == '"' {
260			l.advance()
261			return l.makeToken(token.String, sb.String(), startPos, startCol, startLine)
262		}
263		if (c == '\\' && l.cfg.IsFeatureEnabled(config.FeatCEsc)) || (c == '*' && l.cfg.IsFeatureEnabled(config.FeatBEsc)) {
264			l.advance()
265			sb.WriteRune(rune(l.decodeEscape(c, startPos, startCol, startLine)))
266		} else {
267			l.advance()
268			sb.WriteRune(c)
269		}
270	}
271	util.Error(l.makeToken(token.String, "", startPos, startCol, startLine), "Unterminated string literal")
272	return l.makeToken(token.EOF, "", l.pos, l.column, l.line)
273}
274
275func (l *Lexer) charLiteral(startPos, startCol, startLine int) token.Token {
276	var word int64
277	for l.peek() != '\'' && !l.isAtEnd() {
278		var val int64
279		c := l.peek()
280		if (c == '\\' && l.cfg.IsFeatureEnabled(config.FeatCEsc)) || (c == '*' && l.cfg.IsFeatureEnabled(config.FeatBEsc)) {
281			l.advance()
282			val = l.decodeEscape(c, startPos, startCol, startLine)
283		} else {
284			l.advance()
285			val = int64(c)
286		}
287		word = (word << 8) | (val & 0xFF)
288	}
289
290	tok := l.makeToken(token.Number, "", startPos, startCol, startLine)
291	if !l.match('\'') {
292		util.Error(tok, "Unterminated character literal")
293	}
294	tok.Value = strconv.FormatInt(word, 10)
295	return tok
296}
297
298func (l *Lexer) decodeEscape(escapeChar rune, startPos, startCol, startLine int) int64 {
299	if l.isAtEnd() {
300		util.Error(l.makeToken(token.EOF, "", l.pos, l.column, l.line), "Unterminated escape sequence")
301		return 0
302	}
303	c := l.advance()
304	escapes := map[rune]int64{'n': '\n', 't': '\t', 'e': 4, 'b': '\b', 'r': '\r', '0': 0, '(': '{', ')': '}', '\\': '\\', '\'': '\'', '"': '"', '*': '*'}
305	if val, ok := escapes[c]; ok {
306		return val
307	}
308	util.Warn(l.cfg, config.WarnUnrecognizedEscape, l.makeToken(token.String, "", startPos, startCol, startLine), "Unrecognized escape sequence '%c%c'", escapeChar, c)
309	return int64(c)
310}
311
312func (l *Lexer) matchThen(expected rune, thenType, elseType token.Type, sPos, sCol, sLine int) token.Token {
313	if l.match(expected) {
314		return l.makeToken(thenType, "", sPos, sCol, sLine)
315	}
316	return l.makeToken(elseType, "", sPos, sCol, sLine)
317}
318
319func (l *Lexer) plus(sPos, sCol, sLine int) token.Token {
320	if l.match('+') {
321		return l.makeToken(token.Inc, "", sPos, sCol, sLine)
322	}
323	if l.cfg.IsFeatureEnabled(config.FeatCOps) && l.match('=') {
324		return l.makeToken(token.PlusEq, "", sPos, sCol, sLine)
325	}
326	return l.makeToken(token.Plus, "", sPos, sCol, sLine)
327}
328
329func (l *Lexer) minus(sPos, sCol, sLine int) token.Token {
330	if l.match('-') {
331		return l.makeToken(token.Dec, "", sPos, sCol, sLine)
332	}
333	if l.cfg.IsFeatureEnabled(config.FeatCOps) && l.match('=') {
334		return l.makeToken(token.MinusEq, "", sPos, sCol, sLine)
335	}
336	return l.makeToken(token.Minus, "", sPos, sCol, sLine)
337}
338
339func (l *Lexer) star(sPos, sCol, sLine int) token.Token {
340	if l.cfg.IsFeatureEnabled(config.FeatCOps) && l.match('=') {
341		return l.makeToken(token.StarEq, "", sPos, sCol, sLine)
342	}
343	return l.makeToken(token.Star, "", sPos, sCol, sLine)
344}
345
346func (l *Lexer) slash(sPos, sCol, sLine int) token.Token {
347	if l.cfg.IsFeatureEnabled(config.FeatCOps) && l.match('=') {
348		return l.makeToken(token.SlashEq, "", sPos, sCol, sLine)
349	}
350	return l.makeToken(token.Slash, "", sPos, sCol, sLine)
351}
352
353func (l *Lexer) ampersand(sPos, sCol, sLine int) token.Token {
354	if l.match('&') {
355		return l.makeToken(token.AndAnd, "", sPos, sCol, sLine)
356	}
357	if l.cfg.IsFeatureEnabled(config.FeatCOps) && l.match('=') {
358		return l.makeToken(token.AndEq, "", sPos, sCol, sLine)
359	}
360	return l.makeToken(token.And, "", sPos, sCol, sLine)
361}
362
363func (l *Lexer) pipe(sPos, sCol, sLine int) token.Token {
364	if l.match('|') {
365		return l.makeToken(token.OrOr, "", sPos, sCol, sLine)
366	}
367	if l.cfg.IsFeatureEnabled(config.FeatCOps) && l.match('=') {
368		return l.makeToken(token.OrEq, "", sPos, sCol, sLine)
369	}
370	return l.makeToken(token.Or, "", sPos, sCol, sLine)
371}
372
373func (l *Lexer) less(sPos, sCol, sLine int) token.Token {
374	if l.match('<') {
375		return l.matchThen('=', token.ShlEq, token.Shl, sPos, sCol, sLine)
376	}
377	return l.matchThen('=', token.Lte, token.Lt, sPos, sCol, sLine)
378}
379
380func (l *Lexer) greater(sPos, sCol, sLine int) token.Token {
381	if l.match('>') {
382		return l.matchThen('=', token.ShrEq, token.Shr, sPos, sCol, sLine)
383	}
384	return l.matchThen('=', token.Gte, token.Gt, sPos, sCol, sLine)
385}
386
387func (l *Lexer) equal(sPos, sCol, sLine int) token.Token {
388	if l.match('=') {
389		return l.makeToken(token.EqEq, "", sPos, sCol, sLine)
390	}
391	if l.cfg.IsFeatureEnabled(config.FeatBOps) {
392		switch {
393		case l.match('+'):
394			return l.makeToken(token.EqPlus, "", sPos, sCol, sLine)
395		case l.match('-'):
396			return l.makeToken(token.EqMinus, "", sPos, sCol, sLine)
397		case l.match('*'):
398			return l.makeToken(token.EqStar, "", sPos, sCol, sLine)
399		case l.match('/'):
400			return l.makeToken(token.EqSlash, "", sPos, sCol, sLine)
401		case l.match('%'):
402			return l.makeToken(token.EqRem, "", sPos, sCol, sLine)
403		case l.match('&'):
404			return l.makeToken(token.EqAnd, "", sPos, sCol, sLine)
405		case l.match('|'):
406			return l.makeToken(token.EqOr, "", sPos, sCol, sLine)
407		case l.match('^'):
408			return l.makeToken(token.EqXor, "", sPos, sCol, sLine)
409		case l.match('<') && l.match('<'):
410			return l.makeToken(token.EqShl, "", sPos, sCol, sLine)
411		case l.match('>') && l.match('>'):
412			return l.makeToken(token.EqShr, "", sPos, sCol, sLine)
413		}
414	}
415	return l.makeToken(token.Eq, "", sPos, sCol, sLine)
416}