Skip to content

Commit

Permalink
Parser refers to "symbol"s, "label"s, and "local label"s, not "identifier"s (gbdev#1652)
Browse files Browse the repository at this point in the history

This better matches how the lexed tokens are discussed in rgbasm(5)
  • Loading branch information
Rangi42 authored Feb 6, 2025
1 parent d9d381c commit 4c916b8
Show file tree
Hide file tree
Showing 8 changed files with 72 additions and 92 deletions.
46 changes: 22 additions & 24 deletions src/asm/lexer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -134,11 +134,9 @@ struct CaseInsensitive {
}
};

// Identifiers that are also keywords are listed here. This ONLY applies to ones
// that would normally be matched as identifiers! Check out `yylex_NORMAL` to
// see how this is used.
// Tokens / keywords not handled here are handled in `yylex_NORMAL`'s switch.
// This assumes that no two keywords have the same name.
// This map lists all RGBASM keywords which `yylex_NORMAL` lexes as identifiers
// (see `startsIdentifier` and `continuesIdentifier` below). All non-identifier
// tokens are lexed separately.
static std::unordered_map<std::string, int, CaseInsensitive, CaseInsensitive> keywordDict = {
{"ADC", T_(SM83_ADC) },
{"ADD", T_(SM83_ADD) },
Expand Down Expand Up @@ -1179,7 +1177,7 @@ static uint32_t readGfxConstant() {
return bitPlaneUpper << 8 | bitPlaneLower;
}

// Functions to read identifiers & keywords
// Functions to read identifiers and keywords

static bool startsIdentifier(int c) {
// Anonymous labels internally start with '!'
Expand All @@ -1192,18 +1190,18 @@ static bool continuesIdentifier(int c) {

static Token readIdentifier(char firstChar, bool raw) {
std::string identifier(1, firstChar);
int tokenType = firstChar == '.' ? T_(LOCAL_ID) : T_(ID);
int tokenType = firstChar == '.' ? T_(LOCAL) : T_(SYMBOL);

// Continue reading while the char is in the symbol charset
// Continue reading while the char is in the identifier charset
for (int c = peek(); continuesIdentifier(c); c = peek()) {
shiftChar();

// Write the char to the identifier's name
identifier += c;

// If the char was a dot, mark the identifier as local
// If the char was a dot, the identifier is a local label
if (c == '.') {
tokenType = T_(LOCAL_ID);
tokenType = T_(LOCAL);
}
}

Expand All @@ -1219,7 +1217,7 @@ static Token readIdentifier(char firstChar, bool raw) {

// Label scopes `.` and `..` are the only nonlocal identifiers that start with a dot
if (identifier.find_first_not_of('.') == identifier.npos) {
tokenType = T_(ID);
tokenType = T_(SYMBOL);
}

return Token(tokenType, identifier);
Expand Down Expand Up @@ -1276,7 +1274,7 @@ static std::shared_ptr<std::string> readInterpolation(size_t depth) {
lexerState->disableInterpolation = disableInterpolation;

if (fmtBuf.starts_with('#')) {
// Skip a '#' raw identifier prefix, but after expanding any nested interpolations.
// Skip a '#' raw symbol prefix, but after expanding any nested interpolations.
fmtBuf.erase(0, 1);
} else if (keywordDict.find(fmtBuf) != keywordDict.end()) {
// Don't allow symbols that alias keywords without a '#' prefix.
Expand Down Expand Up @@ -1641,7 +1639,7 @@ static Token yylex_NORMAL() {

case '@': {
std::string symName("@");
return Token(T_(ID), symName);
return Token(T_(SYMBOL), symName);
}

case '[':
Expand Down Expand Up @@ -1903,15 +1901,15 @@ static Token yylex_NORMAL() {
}

// If a keyword, don't try to expand
if (token.type != T_(ID) && token.type != T_(LOCAL_ID)) {
if (token.type != T_(SYMBOL) && token.type != T_(LOCAL)) {
return token;
}

// `token` is either an `ID` or a `LOCAL_ID`, and both have a `std::string` value.
// `token` is either a `SYMBOL` or a `LOCAL`, and both have a `std::string` value.
assume(token.value.holds<std::string>());

// Local symbols cannot be string expansions
if (token.type == T_(ID) && lexerState->expandStrings) {
if (token.type == T_(SYMBOL) && lexerState->expandStrings) {
// Attempt string expansion
Symbol const *sym = sym_FindExactSymbol(token.value.get<std::string>());

Expand All @@ -1925,18 +1923,18 @@ static Token yylex_NORMAL() {
}

// This is a "lexer hack"! We need it to distinguish between label definitions
// (which start with `LABEL`) and macro invocations (which start with `ID`).
// (which start with `LABEL`) and macro invocations (which start with `SYMBOL`).
//
// If we had one `IDENTIFIER` token, the parser would need to perform "lookahead"
// to determine which rule applies. But since macros need to enter "raw" mode to
// parse their arguments, which may not even be valid tokens in "normal" mode, we
// cannot use lookahead to check for the presence of a `COLON`.
//
// Instead, we have separate `ID` and `LABEL` tokens, lexing as a `LABEL` if a ':'
// character *immediately* follows the identifier. Thus, at the beginning of a line,
// "Label:" and "mac:" are treated as label definitions, but "Label :" and "mac :"
// are treated as macro invocations.
if (token.type == T_(ID) && peek() == ':') {
// Instead, we have separate `SYMBOL` and `LABEL` tokens, lexing as a `LABEL` if a
// ':' character *immediately* follows the identifier. Thus, at the beginning of a
// line, "Label:" and "mac:" are treated as label definitions, but "Label :" and
// "mac :" are treated as macro invocations.
if (token.type == T_(SYMBOL) && peek() == ':') {
token.type = T_(LABEL);
}

Expand Down Expand Up @@ -2390,7 +2388,7 @@ Capture lexer_CaptureRept() {
do { // Discard initial whitespace
c = nextChar();
} while (isWhitespace(c));
// Now, try to match `REPT`, `FOR` or `ENDR` as a **whole** identifier
// Now, try to match `REPT`, `FOR` or `ENDR` as a **whole** keyword
if (startsIdentifier(c)) {
switch (readIdentifier(c, false).type) {
case T_(POP_REPT):
Expand Down Expand Up @@ -2443,7 +2441,7 @@ Capture lexer_CaptureMacro() {
do { // Discard initial whitespace
c = nextChar();
} while (isWhitespace(c));
// Now, try to match `ENDM` as a **whole** identifier
// Now, try to match `ENDM` as a **whole** keyword
if (startsIdentifier(c)) {
switch (readIdentifier(c, false).type) {
case T_(POP_ENDM):
Expand Down
94 changes: 38 additions & 56 deletions src/asm/parser.y
Original file line number Diff line number Diff line change
Expand Up @@ -313,18 +313,17 @@
// Literals
%token <int32_t> NUMBER "number"
%token <std::string> STRING "string"
%token <std::string> SYMBOL "symbol"
%token <std::string> LABEL "label"
%token <std::string> ID "identifier"
%token <std::string> LOCAL_ID "local identifier"
%token <std::string> LOCAL "local label"
%token <std::string> ANON "anonymous label"

/******************** Data types ********************/

// The "no_str" types below are to distinguish numeric and string expressions, since many
// contexts treat strings differently than numbers, e.g. `db "string"` or `print "string"`.

// RPN expressions
%type <Expression> relocexpr
// `relocexpr_no_str` exists because strings usually count as numeric expressions, but some
// contexts treat numbers and strings differently, e.g. `db "string"` or `print "string"`.
%type <Expression> relocexpr_no_str
%type <Expression> reloc_8bit
%type <Expression> reloc_8bit_offset
Expand Down Expand Up @@ -355,8 +354,10 @@
%type <std::string> def_rl
%type <std::string> def_equs
%type <std::string> redef_equs
%type <std::string> scoped_id
%type <std::string> scoped_anon_id
%type <std::string> scoped_sym
// `scoped_sym_no_anon` exists because anonymous labels usually count as "scoped symbols", but some
// contexts treat anonymous labels and other labels/symbols differently, e.g. `purge` or `export`.
%type <std::string> scoped_sym_no_anon

// SM83 instruction parameters
%type <int32_t> reg_r
Expand Down Expand Up @@ -513,7 +514,7 @@ endc:
def_id:
OP_DEF {
lexer_ToggleStringExpansion(false);
} ID {
} SYMBOL {
lexer_ToggleStringExpansion(true);
$$ = std::move($3);
}
Expand All @@ -522,61 +523,42 @@ def_id:
redef_id:
POP_REDEF {
lexer_ToggleStringExpansion(false);
} ID {
} SYMBOL {
lexer_ToggleStringExpansion(true);
$$ = std::move($3);
}
;
// LABEL covers identifiers followed by a double colon (e.g. `call Function::ret`,
// to be read as `call Function :: ret`). This should not conflict with anything.
scoped_id:
ID {
$$ = std::move($1);
}
| LOCAL_ID {
$$ = std::move($1);
}
| LABEL {
$$ = std::move($1);
}
;
scoped_sym_no_anon: SYMBOL | LABEL | LOCAL;
scoped_anon_id:
scoped_id {
$$ = std::move($1);
}
| ANON {
$$ = std::move($1);
}
;
scoped_sym: scoped_sym_no_anon | ANON;
label:
%empty
| COLON {
sym_AddAnonLabel();
| LABEL COLON {
sym_AddLabel($1);
}
| LOCAL_ID {
sym_AddLocalLabel($1);
| LABEL DOUBLE_COLON {
sym_AddLabel($1);
sym_Export($1);
}
| LOCAL_ID COLON {
| LOCAL {
sym_AddLocalLabel($1);
}
| LABEL COLON {
sym_AddLabel($1);
| LOCAL COLON {
sym_AddLocalLabel($1);
}
| LOCAL_ID DOUBLE_COLON {
| LOCAL DOUBLE_COLON {
sym_AddLocalLabel($1);
sym_Export($1);
}
| LABEL DOUBLE_COLON {
sym_AddLabel($1);
sym_Export($1);
| COLON {
sym_AddAnonLabel();
}
;
macro:
ID {
SYMBOL {
// Parsing 'macro_args' will restore the lexer's normal mode
lexer_SetMode(LEXER_RAW);
} macro_args {
Expand Down Expand Up @@ -862,7 +844,7 @@ rept:
for:
POP_FOR {
lexer_ToggleStringExpansion(false);
} ID {
} SYMBOL {
lexer_ToggleStringExpansion(true);
} COMMA for_args NEWLINE capture_rept endofline {
if ($8.span.ptr) {
Expand Down Expand Up @@ -906,7 +888,7 @@ break:
def_macro:
POP_MACRO {
lexer_ToggleStringExpansion(false);
} ID {
} SYMBOL {
lexer_ToggleStringExpansion(true);
} NEWLINE capture_macro endofline {
if ($6.span.ptr) {
Expand Down Expand Up @@ -1096,10 +1078,10 @@ purge:
;

purge_args:
scoped_id {
scoped_sym_no_anon {
$$.push_back($1);
}
| purge_args COMMA scoped_id {
| purge_args COMMA scoped_sym_no_anon {
$$ = std::move($1);
$$.push_back($3);
}
Expand All @@ -1113,7 +1095,7 @@ export_list:
;

export_list_entry:
scoped_id {
scoped_sym_no_anon {
sym_Export($1);
}
;
Expand Down Expand Up @@ -1171,16 +1153,16 @@ charmap_args:
;

newcharmap:
POP_NEWCHARMAP ID {
POP_NEWCHARMAP SYMBOL {
charmap_New($2, nullptr);
}
| POP_NEWCHARMAP ID COMMA ID {
| POP_NEWCHARMAP SYMBOL COMMA SYMBOL {
charmap_New($2, &$4);
}
;

setcharmap:
POP_SETCHARMAP ID {
POP_SETCHARMAP SYMBOL {
charmap_Set($2);
}
;
Expand All @@ -1192,7 +1174,7 @@ pushc:
;

pushc_setcharmap:
POP_PUSHC ID {
POP_PUSHC SYMBOL {
charmap_Push();
charmap_Set($2);
}
Expand Down Expand Up @@ -1325,7 +1307,7 @@ relocexpr:
;

relocexpr_no_str:
scoped_anon_id {
scoped_sym {
$$.makeSymbol($1);
}
| NUMBER {
Expand Down Expand Up @@ -1418,8 +1400,8 @@ relocexpr_no_str:
| OP_ISCONST LPAREN relocexpr RPAREN {
$$.makeNumber($3.isKnown());
}
| OP_BANK LPAREN scoped_anon_id RPAREN {
// '@' is also an ID; it is handled here
| OP_BANK LPAREN scoped_sym RPAREN {
// '@' is also a SYMBOL; it is handled here
$$.makeBankSymbol($3);
}
| OP_BANK LPAREN string RPAREN {
Expand All @@ -1439,7 +1421,7 @@ relocexpr_no_str:
}
| OP_DEF {
lexer_ToggleStringExpansion(false);
} LPAREN scoped_anon_id RPAREN {
} LPAREN scoped_sym RPAREN {
$$.makeNumber(sym_FindScopedValidSymbol($4) != nullptr);
lexer_ToggleStringExpansion(true);
}
Expand Down Expand Up @@ -1585,7 +1567,7 @@ string:
| OP_STRFMT LPAREN strfmt_args RPAREN {
$$ = strfmt($3.format, $3.args);
}
| POP_SECTION LPAREN scoped_anon_id RPAREN {
| POP_SECTION LPAREN scoped_sym RPAREN {
Symbol *sym = sym_FindScopedValidSymbol($3);

if (!sym) {
Expand Down
2 changes: 1 addition & 1 deletion test/asm/anon-label-bad.err
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ error: anon-label-bad.asm(6):
error: anon-label-bad.asm(9):
syntax error, unexpected anonymous label
error: anon-label-bad.asm(10):
syntax error, unexpected anonymous label, expecting label or identifier or local identifier
syntax error, unexpected anonymous label, expecting symbol or label or local label
error: anon-label-bad.asm(22):
syntax error, unexpected ::
error: Assembly aborted (5 errors)!
6 changes: 3 additions & 3 deletions test/asm/def-scoped.err
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
error: def-scoped.asm(10):
syntax error, unexpected local identifier, expecting identifier
syntax error, unexpected local label, expecting symbol
error: def-scoped.asm(13):
syntax error, unexpected local identifier, expecting identifier
syntax error, unexpected local label, expecting symbol
error: def-scoped.asm(16):
syntax error, unexpected local identifier, expecting identifier
syntax error, unexpected local label, expecting symbol
error: Assembly aborted (3 errors)!
2 changes: 1 addition & 1 deletion test/asm/error-recovery.err
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
error: error-recovery.asm(3):
syntax error, unexpected number
error: error-recovery.asm(5) -> error-recovery.asm::REPT~1(7):
syntax error, unexpected identifier
syntax error, unexpected symbol
error: Assembly aborted (2 errors)!
Loading

0 comments on commit 4c916b8

Please sign in to comment.