1
0
Fork 0
mirror of https://github.com/ganelson/inform.git synced 2024-07-03 07:24:58 +03:00
inform7/inter/building-module/Chapter 2/Tokenisation.w
2022-01-23 10:34:04 +00:00

799 lines
27 KiB
OpenEdge ABL

[Tokenisation::] Tokenisation.
Turning textual code written in Inform 6 syntax into a linked list of tokens.
@ The following code was sketched out on a long night flight to Hong Kong, but
there is otherwise nothing exotic about it. In as simple a way as possible, we
take a text |from| and break it into Inform 6 tokens. What we return is not
literally a linked list, but it amounts to the same thing: a single node
holding an unstructured run of tokens --
= (text)
EXPRESSION_ISNT
T1
T2
T3
...
=
We follow the syntax of Inform 6, except that we have to look for three extra
syntaxes: |{-braced-commands}|, |(+ Inform 7 interpolation +)|, and, if the
abbreviated syntax is allowed, also some cryptic notations such as |*1|.
The following scanner is basically a finite state machine, and these are the
states:
@e NO_TOKSTATE from 1
@e COMMENT_TOKSTATE /* currently scanning... an I6 comment |! ...| */
@e DQUOTED_TOKSTATE /* ...double-quoted text */
@e SQUOTED_TOKSTATE /* ...single-quoted text */
@e WHITE_TOKSTATE /* ...whitespace */
@e TOK_TOKSTATE /* ...an actual token */
=
/* Scan the text |from|, beginning at character position |pos|, and add the
tokens found to the schema |sch|. If |abbreviated| is |TRUE| then the |*|
notations are recognised; if it is |FALSE| then braced commands and
|(+ ... +)| interpolations are recognised instead. |quoted_inames| is read
when a token is broken off (under |CORE_MODULE| only); |no_quoted_inames| is
not read anywhere in the code of this function as shown here. */
void Tokenisation::go(inter_schema *sch, text_stream *from, int pos, int abbreviated,
int no_quoted_inames, void **quoted_inames) {
inter_schema_token *preceding_token = NULL; /* last non-whitespace token made, if any */
int definition_length = Str::len(from);
text_stream *current_raw = Str::new(); /* characters accumulated but not yet tokenised */
int tokeniser_state = NO_TOKSTATE;
for (; pos<definition_length; pos++) {
int c = Str::get_at(from, pos);
/* Whitespace is only significant outside of quoted text and comments: a run
of it is recorded as a single space in |current_raw| */
if (Characters::is_whitespace(c)) {
if ((tokeniser_state == TOK_TOKSTATE) || (tokeniser_state == NO_TOKSTATE)) {
@<Absorb raw material, if any@>;
tokeniser_state = WHITE_TOKSTATE;
PUT_TO(current_raw, ' ');
}
} else {
/* The first non-whitespace character ends a run of whitespace */
if (tokeniser_state == WHITE_TOKSTATE) {
@<Absorb raw material, if any@>;
tokeniser_state = NO_TOKSTATE;
}
}
switch (tokeniser_state) {
case DQUOTED_TOKSTATE:
/* The closing quotation mark is not itself stored */
if (c == '"') {
@<Absorb raw material, if any@>;
tokeniser_state = NO_TOKSTATE;
} else {
PUT_TO(current_raw, c);
}
break;
case SQUOTED_TOKSTATE:
if (c == '\'') {
@<Absorb raw material, if any@>;
tokeniser_state = NO_TOKSTATE;
} else {
PUT_TO(current_raw, c);
}
break;
case COMMENT_TOKSTATE:
/* Comment text is discarded; a newline ends the comment */
if (c == '\n') tokeniser_state = NO_TOKSTATE;
break;
case WHITE_TOKSTATE: break;
default:
/* Here the state is either |NO_TOKSTATE| or |TOK_TOKSTATE|. The opening
character of a comment or quotation is not stored. */
if (c == '!') {
@<Absorb raw material, if any@>;
tokeniser_state = COMMENT_TOKSTATE; break;
}
if (c == '"') {
@<Absorb raw material, if any@>;
tokeniser_state = DQUOTED_TOKSTATE; break;
}
if (c == '\'') {
@<Absorb raw material, if any@>;
tokeniser_state = SQUOTED_TOKSTATE; break;
}
/* Each of the following may advance |pos| past the material it consumes */
if ((c == '{') && (abbreviated == FALSE))
@<Look for a possible bracing@>
else if ((c == '*') && (abbreviated == TRUE))
@<Look for a possible abbreviated command@>
else if ((c == '(') && (Str::get_at(from, pos+1) == '+') && (abbreviated == FALSE))
@<Look for a possible Inform 7 fragment@>
else @<Absorb a raw character@>;
break;
}
}
/* Tokenise whatever is still pending when the text runs out */
@<Absorb raw material, if any@>;
}
@<Absorb a raw character@> =
tokeniser_state = TOK_TOKSTATE;
PUT_TO(current_raw, c);
@<Absorb raw material, if any@> =
	/* Turn whatever has accumulated in |current_raw| into tokens, according to
	the state in which it was gathered, then empty it. The state is reset to
	|NO_TOKSTATE| whether or not there was anything to absorb. */
	if (Str::len(current_raw) > 0) {
		if (tokeniser_state == WHITE_TOKSTATE) {
			/* any amount of whitespace becomes a single-space token */
			InterSchemas::add_token(sch,
				InterSchemas::new_token(WHITE_SPACE_ISTT, I" ", 0, 0, -1));
		} else if (tokeniser_state == DQUOTED_TOKSTATE) {
			/* only double-quoted text has its escapes processed */
			Tokenisation::de_escape_text(current_raw);
			InterSchemas::add_token(sch,
				InterSchemas::new_token(DQUOTED_ISTT, current_raw, 0, 0, -1));
		} else if (tokeniser_state == SQUOTED_TOKSTATE) {
			InterSchemas::add_token(sch,
				InterSchemas::new_token(SQUOTED_ISTT, current_raw, 0, 0, -1));
		} else {
			@<Look for individual tokens@>;
		}
		Str::clear(current_raw);
	}
	tokeniser_state = NO_TOKSTATE;
@ Material in |(+ ... +)| notation is an interpolation of I7 source text.
@<Look for a possible Inform 7 fragment@> =
int save_pos = pos, accept = FALSE;
TEMPORARY_TEXT(source_text_fragment)
pos += 2;
while (Str::get_at(from, pos)) {
if ((Str::get_at(from, pos-1) == '+') && (Str::get_at(from, pos) == ')')) {
Str::delete_last_character(source_text_fragment);
accept = TRUE; break;
}
PUT_TO(source_text_fragment, Str::get_at(from, pos++));
}
if (accept) {
@<Absorb raw material, if any@>;
@<Expand a fragment of Inform 7 text@>;
} else { int c = '('; @<Absorb a raw character@>; pos = save_pos; }
DISCARD_TEXT(source_text_fragment)
@ Note that the empty I7 interpolation is legal, but produces no token.
@<Expand a fragment of Inform 7 text@> =
	/* A non-empty interpolation becomes a single |I7_ISTT| token */
	if (Str::len(source_text_fragment) > 0) {
		inter_schema_token *fragment_token =
			InterSchemas::new_token(I7_ISTT, source_text_fragment, 0, 0, -1);
		InterSchemas::add_token(sch, fragment_token);
	}
@ Material in braces sometimes indicates an inline command, but not always,
because braces often occur innocently in I6 code. So we require the first
character after the open-brace not to be white-space, and also not to be
a pipe (though I've forgotten why). The text inside the braces is called
a "bracing".
@<Look for a possible bracing@> =
int save_pos = pos++, accept = FALSE;
TEMPORARY_TEXT(bracing)
while (TRUE) {
int c = Str::get_at(from, pos);
if (c == 0) break;
if (c == '}') { accept = TRUE; break; }
PUT_TO(bracing, c);
pos++;
}
int first = Str::get_first_char(bracing);
if ((accept) && (first != ' ') && (first != '\t') && (first != '\n') && (first != '|')) {
@<Absorb raw material, if any@>;
@<Parse a bracing into an inline command@>;
} else { int c = '{'; @<Absorb a raw character@>; pos = save_pos; }
DISCARD_TEXT(bracing)
@ That's everything, then, except the one thing that counts: how to expand
a bracing.
@<Parse a bracing into an inline command@> =
	/* Make an |INLINE_ISTT| token for the bracing, split the bracing into its
	parts, and then look the command (and, for two commands, its operand) up
	in tables of recognised notations. */
	inter_schema_token *t = InterSchemas::new_token(INLINE_ISTT, bracing, 0, 0, -1);
	t->bracing = Str::duplicate(bracing);
	t->command = Str::new();
	t->operand = Str::new();
	t->operand2 = Str::new();
	@<Decompose the bracing@>;
	if (Str::len(t->command) > 0) {
		struct inline_notation { wchar_t *text; int value; };
		/* each table is terminated by an entry with a null |text| */
		static struct inline_notation command_notations[] = {
			{ L"primitive-definition", primitive_definition_ISINC },
			{ L"new", new_ISINC },
			{ L"new-list-of", new_list_of_ISINC },
			{ L"printing-routine", printing_routine_ISINC },
			{ L"ranger-routine", ranger_routine_ISINC },
			{ L"next-routine", next_routine_ISINC },
			{ L"previous-routine", previous_routine_ISINC },
			{ L"strong-kind", strong_kind_ISINC },
			{ L"weak-kind", weak_kind_ISINC },
			{ L"backspace", backspace_ISINC },
			{ L"erase", erase_ISINC },
			{ L"open-brace", open_brace_ISINC },
			{ L"close-brace", close_brace_ISINC },
			{ L"label", label_ISINC },
			{ L"counter", counter_ISINC },
			{ L"counter-storage", counter_storage_ISINC },
			{ L"counter-up", counter_up_ISINC },
			{ L"counter-down", counter_down_ISINC },
			{ L"counter-makes-array", counter_makes_array_ISINC },
			{ L"by-reference", by_reference_ISINC },
			{ L"by-reference-blank-out", by_reference_blank_out_ISINC },
			{ L"reference-exists", reference_exists_ISINC },
			{ L"lvalue-by-reference", lvalue_by_reference_ISINC },
			{ L"by-value", by_value_ISINC },
			{ L"box-quotation-text", box_quotation_text_ISINC },
			{ L"try-action", try_action_ISINC },
			{ L"try-action-silently", try_action_silently_ISINC },
			{ L"return-value", return_value_ISINC },
			{ L"return-value-from-rule", return_value_from_rule_ISINC },
			{ L"property-holds-block-value", property_holds_block_value_ISINC },
			{ L"mark-event-used", mark_event_used_ISINC },
			{ L"my", my_ISINC },
			{ L"unprotect", unprotect_ISINC },
			{ L"copy", copy_ISINC },
			{ L"initialise", initialise_ISINC },
			{ L"matches-description", matches_description_ISINC },
			{ L"now-matches-description", now_matches_description_ISINC },
			{ L"arithmetic-operation", arithmetic_operation_ISINC },
			{ L"say", say_ISINC },
			{ L"show-me", show_me_ISINC },
			{ L"segment-count", segment_count_ISINC },
			{ L"final-segment-marker", final_segment_marker_ISINC },
			{ L"list-together", list_together_ISINC },
			{ L"rescale", rescale_ISINC },
			{ NULL, 0 }
		};
		/* operands giving a subcommand of |primitive-definition| */
		static struct inline_notation definition_notations[] = {
			{ L"repeat-through", repeat_through_ISINSC },
			{ L"repeat-through-list", repeat_through_list_ISINSC },
			{ L"number-of", number_of_ISINSC },
			{ L"random-of", random_of_ISINSC },
			{ L"total-of", total_of_ISINSC },
			{ L"extremal", extremal_ISINSC },
			{ L"function-application", function_application_ISINSC },
			{ L"description-application", description_application_ISINSC },
			{ L"solve-equation", solve_equation_ISINSC },
			{ L"switch", switch_ISINSC },
			{ L"break", break_ISINSC },
			{ L"verbose-checking", verbose_checking_ISINSC },
			{ NULL, 0 }
		};
		/* operands giving a subcommand of |list-together| */
		static struct inline_notation listing_notations[] = {
			{ L"unarticled", unarticled_ISINSC },
			{ L"articled", articled_ISINSC },
			{ NULL, 0 }
		};
		int cmd = unknown_ISINC, subcmd = no_ISINSC;
		for (struct inline_notation *entry = command_notations; entry->text; entry++)
			if (Str::eq_wide_string(t->command, entry->text)) {
				cmd = entry->value;
				break;
			}
		/* only these two commands read a subcommand from the operand */
		struct inline_notation *sub_table = NULL;
		if (cmd == primitive_definition_ISINC) sub_table = definition_notations;
		if (cmd == list_together_ISINC) sub_table = listing_notations;
		if (sub_table)
			for (struct inline_notation *entry = sub_table; entry->text; entry++)
				if (Str::eq_wide_string(t->operand, entry->text)) {
					subcmd = entry->value;
					break;
				}
		t->inline_command = cmd;
		t->inline_subcommand = subcmd;
	}
	InterSchemas::add_token(sch, t);
	preceding_token = t;
@ A bracing can take any of the following forms:
= (text)
{-command}
{-command:operand}
{-command:operand:operand2}
{-command:operand<property name}
{-command:operand>property name}
{some text}
{-annotation:some text}
=
We parse this with the command or annotation in |command|, the "some text"
or operand in |bracing|, the property name (if given) in |extremal_property|,
the direction of the |<| or |>| in |extremal_property_sign|, and the second,
optional, operand in |operand2|.
@<Decompose the bracing@> =
	/* Only a bracing beginning with a hyphen is decomposed. The characters
	after the hyphen are dealt out to one of four fields: 1 is the command,
	2 the operand, 3 the second operand, 4 the property name. */
	TEMPORARY_TEXT(pname)
	if (Str::get_first_char(t->bracing) == '-') {
		int field = 1, extent = Str::len(t->bracing);
		for (int at = 1; at < extent; at++) {
			int ch = Str::get_at(t->bracing, at);
			if (field == 1) {
				if (ch == ':') { field = 2; }
				else { PUT_TO(t->command, ch); }
			} else if (field == 2) {
				if (ch == ':') field = 3;
#ifdef CORE_MODULE
				else if (ch == '<') {
					t->extremal_property_sign = MEASURE_T_OR_LESS;
					field = 4;
				}
				else if (ch == '>') {
					t->extremal_property_sign = MEASURE_T_OR_MORE;
					field = 4;
				}
#endif
				else PUT_TO(t->operand, ch);
			} else if (field == 3) {
				/* further colons are part of the second operand */
				PUT_TO(t->operand2, ch);
			} else if (field == 4) {
				PUT_TO(pname, ch);
			}
		}
#ifdef CORE_MODULE
		if (t->extremal_property_sign != MEASURE_T_EXACTLY) {
			wording W = Feeds::feed_text(pname);
			if (<property-name>(W)) t->extremal_property = <<rp>>;
		}
#endif
		/* the bracing text is now replaced by just the operand */
		Str::copy(t->bracing, t->operand);
	}
	DISCARD_TEXT(pname)
@ In abbreviated prototypes, |*1| and |*2| are placeholders, but a number
of modifiers are allowed. See //calculus: Compilation Schemas//.
@d GIVE_KIND_ID_ISSBM 1
@d GIVE_COMPARISON_ROUTINE_ISSBM 2
@d DEREFERENCE_PROPERTY_ISSBM 4
@d ADOPT_LOCAL_STACK_FRAME_ISSBM 8
@d CAST_TO_KIND_OF_OTHER_TERM_ISSBM 16
@d BY_REFERENCE_ISSBM 32
@d LVALUE_CONTEXT_ISSBM 64
@d STORAGE_AS_FUNCTION_ISSBM 128
@<Look for a possible abbreviated command@> =
	/* |pos| is at an asterisk. |at| runs ahead of it: first to the character
	after the asterisk and then, if that is a modifier, to the one after that.
	|pos| is only moved up to |at| once a notation has been recognised. */
	int at = pos;
	wchar_t c = Str::get_at(from, ++at);
	int iss_bitmap = 0;
	switch (c) {
		case '!': InterSchemas::throw_error(sch->node_tree,
			I"the '*!' schema notation has been abolished"); break;
		case '%': iss_bitmap = iss_bitmap | LVALUE_CONTEXT_ISSBM;
			c = Str::get_at(from, ++at); break;
		case '$': iss_bitmap = iss_bitmap | STORAGE_AS_FUNCTION_ISSBM;
			c = Str::get_at(from, ++at); break;
		case '#': iss_bitmap = iss_bitmap | GIVE_KIND_ID_ISSBM;
			c = Str::get_at(from, ++at); break;
		case '_': iss_bitmap = iss_bitmap | GIVE_COMPARISON_ROUTINE_ISSBM;
			c = Str::get_at(from, ++at); break;
		case '+': iss_bitmap = iss_bitmap | DEREFERENCE_PROPERTY_ISSBM;
			c = Str::get_at(from, ++at); break;
		case '|': iss_bitmap = iss_bitmap | (DEREFERENCE_PROPERTY_ISSBM + LVALUE_CONTEXT_ISSBM);
			c = Str::get_at(from, ++at); break;
		case '?': iss_bitmap = iss_bitmap | ADOPT_LOCAL_STACK_FRAME_ISSBM;
			c = Str::get_at(from, ++at); break;
		case '<': iss_bitmap = iss_bitmap | CAST_TO_KIND_OF_OTHER_TERM_ISSBM;
			c = Str::get_at(from, ++at); break;
		case '^': iss_bitmap = iss_bitmap | (ADOPT_LOCAL_STACK_FRAME_ISSBM + BY_REFERENCE_ISSBM);
			c = Str::get_at(from, ++at); break;
		case '>': iss_bitmap = iss_bitmap | BY_REFERENCE_ISSBM;
			c = Str::get_at(from, ++at); break;
	}
	if (Characters::isdigit(c)) {
		/* a placeholder such as |*1|: the token text runs from the asterisk to
		the digit inclusive, and the digit |1| gives |constant_number| 0 */
		@<Absorb raw material, if any@>;
		TEMPORARY_TEXT(T)
		for (int i=pos; i<=at; i++) PUT_TO(T, Str::get_at(from, i));
		inter_schema_token *t = InterSchemas::new_token(INLINE_ISTT, T, 0, 0, -1);
		t->bracing = Str::duplicate(T);
		t->inline_command = substitute_ISINC;
		t->inline_modifiers = iss_bitmap;
		t->constant_number = (int) c - (int) '1';
		InterSchemas::add_token(sch, t);
		preceding_token = t;
		DISCARD_TEXT(T)
		pos = at;
	} else if (c == '&') {
		/* as with the other notations, any raw text already gathered has to be
		tokenised first, or it would be added after this token and so appear
		out of order */
		@<Absorb raw material, if any@>;
		inter_schema_token *t = InterSchemas::new_token(INLINE_ISTT, I"*&", 0, 0, -1);
		t->bracing = I"*&";
		t->inline_command = combine_ISINC;
		t->inline_modifiers = iss_bitmap;
		InterSchemas::add_token(sch, t);
		preceding_token = t;
		pos = at;
	} else if (c == '-') {
		InterSchemas::throw_error(sch->node_tree,
			I"the '*-' schema notation has been abolished");
	} else if (c == '*') {
		/* a doubled asterisk stands for a single literal one */
		int c = '*'; @<Absorb a raw character@>;
		pos = at;
	} else {
		/* not a recognised notation: the character at |pos| is an asterisk, so
		that is what is absorbed as raw text, and since |pos| is not moved,
		whatever follows it is then scanned in the ordinary way */
		int c = '*'; @<Absorb a raw character@>;
	}
@ That leaves us with just the main case to handle: raw I6 code which is
outside of quotation marks and commentary, and which doesn't include
bracings or I7 interpolations. That might look like, for instance,
= (text as Inform 6)
Frog + 2*Toad(
=
(there is no reason to suppose that this stretch of code is complete or
matches parentheses); we must tokenise it into
= (text)
Frog
WHITE SPACE
+
WHITE SPACE
2
*
Toad
(
=
We scan through the text until we reach the start of a new token, and then break
off what we scanned through since the last time.
@<Look for individual tokens@> =
/* Scan |current_raw| from left to right. |c_start| is the position of the
first character not yet broken off into a token, and |p| is the position
being examined. */
int L = Str::len(current_raw);
int c_start = 0, escaped = FALSE;
for (int p = 0; p < L; p++) {
/* |c1| is the character at |p|, and |c2|, |c3| are the two after it, or 0
where those would lie beyond the end of the text */
wchar_t c1 = Str::get_at(current_raw, p), c2 = 0, c3 = 0;
if (p < L-1) c2 = Str::get_at(current_raw, p+1);
if (p < L-2) c3 = Str::get_at(current_raw, p+2);
/* No breaking is attempted while |escaped| is set. Each of the paragraphs
below, if it does break off tokens, updates |p| and |c_start| and then
continues this loop, skipping what follows it here. */
if (escaped == FALSE) {
/* a dollar counts only at the start or when not directly after a letter */
if ((c1 == '$') &&
((p == 0) ||
(Characters::isalpha(Str::get_at(current_raw, p-1)) == FALSE)))
@<Break off here for real, binary or hexadecimal notation@>;
if (c1 == '-') @<Break off here for negative number@>;
@<Break off here for operators@>;
}
/* the character 0x00A7 switches the escaped state on or off */
if (c1 == 0x00A7) escaped = escaped?FALSE:TRUE;
}
/* Whatever remains after the last break is one final token */
if (c_start < L) {
int x = c_start, y = L-1;
@<Break off a token@>;
}
@ Recall that in I6 notation, a dollar introduces a number which is not written
as a plain decimal integer -- a real, binary or hexadecimal literal -- and
the character after the initial dollar determines which:
= (text)
$+3.14159E2
$$1001001
$1FE6
=
@<Break off here for real, binary or hexadecimal notation@> =
int x = c_start, y = p-1;
@<Break off a token@>;
switch (c2) {
case '+': case '-':
x = p; y = p+1;
while ((Str::get_at(current_raw, y+1) == '.') ||
(Str::get_at(current_raw, y+1) == 'E') ||
(Str::get_at(current_raw, y+1) == 'e') ||
(Characters::isdigit(Str::get_at(current_raw, y+1))))
y++;
@<Break off a token@>;
p = y;
c_start = p+1;
continue;
case '$':
x = p; y = p+1;
while ((Str::get_at(current_raw, y+1) == '0') ||
(Str::get_at(current_raw, y+1) == '1'))
y++;
@<Break off a token@>;
p = y;
c_start = p+1;
continue;
default:
x = p; y = p;
while (Characters::isalnum(Str::get_at(current_raw, y+1)))
y++;
@<Break off a token@>;
p = y;
c_start = p+1;
continue;
}
@ A token beginning with a minus sign and continuing with digits may still
not be a negative number: it may be the binary subtraction operator.
For example, we need to tokenise |x-1| as
= (text)
x
-
1
=
and not as
= (text)
x
-1
=
This requires context, that is, remembering what the previous token was.
@<Break off here for negative number@> =
	/* The minus sign at |p| begins a negative number only if it starts a new
	token (|c_start == p|), is followed by at least one digit, and comes either
	first of all or directly after an open bracket, operator or divider.
	|preceding_token| is |NULL| until a first token has been made, so it must be
	checked before its type is read in the final clause as well as in the first. */
	if (((preceding_token == NULL) ||
		(preceding_token->ist_type == OPEN_ROUND_ISTT) ||
		(preceding_token->ist_type == OPERATOR_ISTT) ||
		(preceding_token->ist_type == DIVIDER_ISTT)) &&
		(c_start == p) &&
		(!((abbreviated) && (preceding_token) &&
			(preceding_token->ist_type == INLINE_ISTT)))) {
		/* |dc| ends up just past the last digit following the minus sign */
		int dc = p+1;
		while (Characters::isdigit(Str::get_at(current_raw, dc))) dc++;
		if (dc > p+1) {
			int x = c_start, y = p-1;
			@<Break off a token@>;
			x = p; y = dc - 1;
			@<Break off a token@>;
			p = y;
			c_start = p+1;
			continue;
		}
	}
@ In I6, operators made of non-alphanumeric characters can be up to three
characters long, and we take the longest match: thus |-->| is a trigraph,
not the monograph |-| followed by the digraph |->|.
We treat the |@| sign as if it were alphanumeric for the sake of assembly
language opcodes such as |@pull|.
@<Break off here for operators@> =
int monograph = TRUE, digraph = FALSE, trigraph = FALSE;
if ((Characters::isalnum(c1)) || (c1 == '_') || (c1 == '$')) monograph = FALSE;
if (c1 == 0x00A7) monograph = FALSE;
if ((c1 == '#') && (Characters::isalpha(c2))) monograph = FALSE;
if ((c1 == '_') && (Characters::isalpha(c2))) monograph = FALSE;
if ((c1 == '#') && (c2 == '#') && (Characters::isalpha(c3))) monograph = FALSE;
if ((c1 == '@') && (Characters::isalpha(c2))) monograph = FALSE;
if ((c1 == '+') && (c2 == '+')) digraph = TRUE;
if ((c1 == '-') && (c2 == '-')) digraph = TRUE;
if ((c1 == '>') && (c2 == '=')) digraph = TRUE;
if ((c1 == '<') && (c2 == '=')) digraph = TRUE;
if ((c1 == '=') && (c2 == '=')) digraph = TRUE;
if ((c1 == '-') && (c2 == '>')) digraph = TRUE;
if ((c1 == '.') && (c2 == '&')) digraph = TRUE;
if ((c1 == '.') && (c2 == '#')) digraph = TRUE;
if ((c1 == '~') && (c2 == '~')) digraph = TRUE;
if ((c1 == '~') && (c2 == '=')) digraph = TRUE;
if ((c1 == '&') && (c2 == '&')) digraph = TRUE;
if ((c1 == '|') && (c2 == '|')) digraph = TRUE;
if ((c1 == '>') && (c2 == '>')) digraph = TRUE;
if ((c1 == '-') && (c2 == '-') && (c3 == '>')) trigraph = TRUE;
if (trigraph) {
int x = c_start, y = p-1;
@<Break off a token@>;
x = p; y = p+2;
@<Break off a token@>;
p += 2;
c_start = p+1;
continue;
}
if (digraph) {
int x = c_start, y = p-1;
@<Break off a token@>;
x = p; y = p+1;
@<Break off a token@>;
p++;
c_start = p+1;
continue;
}
if (monograph) {
int x = c_start, y = p-1;
@<Break off a token@>;
x = p; y = p;
@<Break off a token@>;
c_start = p+1;
continue;
}
@ In this code, the new token is between character positions |x| and |y|
inclusive; we ignore an empty token.
@<Break off a token@> =
	/* Copy characters |x| to |y| of |current_raw| into |T|, classify that text,
	and add a token for it to the schema. Nothing happens if the range is empty. */
	if (x <= y) {
		TEMPORARY_TEXT(T)
		int k = x;
		while (k <= y) {
			PUT_TO(T, Str::get_at(current_raw, k));
			k++;
		}
		/* defaults, which the identification below may change */
		int is = RAW_ISTT;
		inter_ti which = 0;
		int which_rw = 0, which_number = -1, which_quote = -1;
		@<Identify this new token@>;
		inter_schema_token *made = InterSchemas::new_token(is, T, which, which_rw, which_number);
#ifdef CORE_MODULE
		if (which_quote >= 0) made->as_quoted = quoted_inames[which_quote];
#endif
		InterSchemas::add_token(sch, made);
		/* whitespace is never remembered as the preceding token */
		if (made->ist_type != WHITE_SPACE_ISTT) preceding_token = made;
		DISCARD_TEXT(T)
	}
@ Finally, we identify what sort of token we're looking at. It would be elegant
to reimplement this with a trie (e.g. using //foundation: Tries and Avinues//),
but speed is not quite important enough to make it worthwhile.
@d LOWEST_XBIP_VALUE HAS_XBIP
@e HAS_XBIP from 10000
@e HASNT_XBIP
@e READ_XBIP
@e OWNERKIND_XBIP
@d HIGHEST_XBIP_VALUE OWNERKIND_XBIP
@<Identify this new token@> =
/* Classify the text |T| by setting |is| and, where relevant, |which|,
|which_rw|, |which_number| and |which_quote|. The tests below are applied in
sequence and a later one overrides the result of an earlier one, so their
order matters. */
if (Str::get_at(T, 0) == '@') is = OPCODE_ISTT;
if (Str::get_at(T, 0) == 0x00A7)
is = IDENTIFIER_ISTT;
/* |##name|: an identifier provided that every character is alphanumeric, an
underscore or a hash */
if ((Str::get_at(T, 0) == '#') && (Str::get_at(T, 1) == '#') &&
(Characters::isalpha(Str::get_at(T, 2)))) {
is = IDENTIFIER_ISTT;
LOOP_THROUGH_TEXT(P, T) {
wchar_t c = Str::get(P);
if ((c != '_') && (c != '#') && (!Characters::isalnum(c)))
is = RAW_ISTT;
}
}
/* |#name|: as above, except that dollars are permitted too */
if ((Str::get_at(T, 0) == '#') && (Characters::isalpha(Str::get_at(T, 1)))) {
is = IDENTIFIER_ISTT;
LOOP_THROUGH_TEXT(P, T) {
wchar_t c = Str::get(P);
if ((c != '_') && (c != '#') && (c != '$') && (!Characters::isalnum(c)))
is = RAW_ISTT;
}
}
/* |_name| */
if ((Str::get_at(T, 0) == '_') && (Characters::isalpha(Str::get_at(T, 1)))) {
is = IDENTIFIER_ISTT;
LOOP_THROUGH_TEXT(P, T) {
wchar_t c = Str::get(P);
if ((c != '_') && (c != '#') && (!Characters::isalnum(c)))
is = RAW_ISTT;
}
}
/* A name beginning with a letter: only alphanumerics and underscores allowed */
if (Characters::isalpha(Str::get_at(T, 0))) {
is = IDENTIFIER_ISTT;
LOOP_THROUGH_TEXT(P, T) {
wchar_t c = Str::get(P);
if ((c != '_') && (!Characters::isalnum(c)))
is = RAW_ISTT;
}
/* |which_quote| is used after identification to index |quoted_inames| */
if (Str::begins_with_wide_string(T, L"QUOTED_INAME_0_")) which_quote = 0;
else if (Str::begins_with_wide_string(T, L"QUOTED_INAME_1_")) which_quote = 1;
/* these two names are respelled: note that this rewrites |T| itself */
if (Str::eq(T, I"I7_string")) { Str::clear(T); WRITE_TO(T, "I7_String"); }
if (Str::eq(T, I"COMMA_WORD")) { Str::clear(T); WRITE_TO(T, "comma_word"); }
}
/* A decimal number must consist of digits throughout */
if (Characters::isdigit(Str::get_at(T, 0))) {
is = NUMBER_ISTT;
LOOP_THROUGH_TEXT(P, T) {
wchar_t c = Str::get(P);
if (!Characters::isdigit(c))
is = RAW_ISTT;
}
}
/* Dollar notation: hexadecimal unless the second character says otherwise */
if (Str::get_at(T, 0) == '$') {
is = HEX_NUMBER_ISTT;
wchar_t c = Str::get_at(T, 1);
if (c == '$') is = BIN_NUMBER_ISTT;
if (c == '+') is = REAL_NUMBER_ISTT;
if (c == '-') is = REAL_NUMBER_ISTT;
}
/* Text beginning with a minus sign is taken as a number at this point, though
the operator test at the end of this paragraph may override that */
if (Str::get_at(T, 0) == '-') is = NUMBER_ISTT;
/* Names standing for fixed numerical values */
if (Str::eq(T, I"false")) { is = NUMBER_ISTT; which_number = 0; }
if (Str::eq(T, I"true")) { is = NUMBER_ISTT; which_number = 1; }
if (Str::eq(T, I"nothing")) { is = NUMBER_ISTT; which_number = 0; }
/* Reserved words, which are matched case-sensitively */
if (Str::eq(T, I"if")) { is = RESERVED_ISTT; which_rw = IF_I6RW; }
if (Str::eq(T, I"else")) { is = RESERVED_ISTT; which_rw = ELSE_I6RW; }
if (Str::eq(T, I"style")) { is = RESERVED_ISTT; which_rw = STYLE_I6RW; }
if (Str::eq(T, I"return")) { is = RESERVED_ISTT; which_rw = RETURN_I6RW; }
if (Str::eq(T, I"rtrue")) { is = RESERVED_ISTT; which_rw = RTRUE_I6RW; }
if (Str::eq(T, I"rfalse")) { is = RESERVED_ISTT; which_rw = RFALSE_I6RW; }
if (Str::eq(T, I"for")) { is = RESERVED_ISTT; which_rw = FOR_I6RW; }
if (Str::eq(T, I"objectloop")) { is = RESERVED_ISTT; which_rw = OBJECTLOOP_I6RW; }
if (Str::eq(T, I"while")) { is = RESERVED_ISTT; which_rw = WHILE_I6RW; }
if (Str::eq(T, I"do")) { is = RESERVED_ISTT; which_rw = DO_I6RW; }
if (Str::eq(T, I"until")) { is = RESERVED_ISTT; which_rw = UNTIL_I6RW; }
if (Str::eq(T, I"print")) { is = RESERVED_ISTT; which_rw = PRINT_I6RW; }
if (Str::eq(T, I"print_ret")) { is = RESERVED_ISTT; which_rw = PRINTRET_I6RW; }
if (Str::eq(T, I"new_line")) { is = RESERVED_ISTT; which_rw = NEWLINE_I6RW; }
if (Str::eq(T, I"give")) { is = RESERVED_ISTT; which_rw = GIVE_I6RW; }
if (Str::eq(T, I"move")) { is = RESERVED_ISTT; which_rw = MOVE_I6RW; }
if (Str::eq(T, I"remove")) { is = RESERVED_ISTT; which_rw = REMOVE_I6RW; }
if (Str::eq(T, I"jump")) { is = RESERVED_ISTT; which_rw = JUMP_I6RW; }
if (Str::eq(T, I"switch")) { is = RESERVED_ISTT; which_rw = SWITCH_I6RW; }
if (Str::eq(T, I"default")) { is = RESERVED_ISTT; which_rw = DEFAULT_I6RW; }
if (Str::eq(T, I"font")) { is = RESERVED_ISTT; which_rw = FONT_I6RW; }
if (Str::eq(T, I"continue")) { is = RESERVED_ISTT; which_rw = CONTINUE_I6RW; }
if (Str::eq(T, I"break")) { is = RESERVED_ISTT; which_rw = BREAK_I6RW; }
if (Str::eq(T, I"quit")) { is = RESERVED_ISTT; which_rw = QUIT_I6RW; }
if (Str::eq(T, I"restore")) { is = RESERVED_ISTT; which_rw = RESTORE_I6RW; }
if (Str::eq(T, I"spaces")) { is = RESERVED_ISTT; which_rw = SPACES_I6RW; }
if (Str::eq(T, I"read")) { is = RESERVED_ISTT; which_rw = READ_I6RW; }
if (Str::eq(T, I"inversion")) { is = RESERVED_ISTT; which_rw = INVERSION_I6RW; }
/* Conditional compilation directives, which are matched in any casing */
if (Str::eq_insensitive(T, I"#IFDEF")) { is = DIRECTIVE_ISTT; which_rw = IFDEF_I6RW; }
if (Str::eq_insensitive(T, I"#IFNDEF")) { is = DIRECTIVE_ISTT; which_rw = IFNDEF_I6RW; }
if (Str::eq_insensitive(T, I"#IFTRUE")) { is = DIRECTIVE_ISTT; which_rw = IFTRUE_I6RW; }
if (Str::eq_insensitive(T, I"#IFFALSE")) { is = DIRECTIVE_ISTT; which_rw = IFFALSE_I6RW; }
if (Str::eq_insensitive(T, I"#IFNOT")) { is = DIRECTIVE_ISTT; which_rw = IFNOT_I6RW; }
if (Str::eq_insensitive(T, I"#ENDIF")) { is = DIRECTIVE_ISTT; which_rw = ENDIF_I6RW; }
/* Punctuation */
if (Str::eq(T, I",")) is = COMMA_ISTT;
if (Str::eq(T, I":")) is = COLON_ISTT;
if (Str::eq(T, I"(")) is = OPEN_ROUND_ISTT;
if (Str::eq(T, I")")) is = CLOSE_ROUND_ISTT;
if (Str::eq(T, I"{")) is = OPEN_BRACE_ISTT;
if (Str::eq(T, I"}")) is = CLOSE_BRACE_ISTT;
if (Str::eq(T, I";")) is = DIVIDER_ISTT;
/* Lastly, a positive result from the operator lookup overrides all of the
above. (This |x| is a new variable, local to the block being tangled into.) */
inter_ti x = I6Operators::notation_to_BIP(T);
if (x > 0) { is = OPERATOR_ISTT; which = x; }
@ Anticlimactically: a function to deal with escape characters in Inform 6
double-quoted text notation.
=
void Tokenisation::de_escape_text(text_stream *m) {
	/* First pass: any run of spaces, tabs and newlines which contains at least
	one newline, and which is followed by some other character, is reduced to
	a single space. Runs without a newline are left alone. */
	int ws_from = -1, ws_count = 0, ws_has_newline = FALSE;
	int i = 0;
	while (i < Str::len(m)) {
		wchar_t ch = Str::get_at(m, i);
		if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
			if (ws_from < 0) {
				/* a new run begins here */
				ws_from = i;
				ws_count = 0;
				ws_has_newline = FALSE;
			}
			ws_count++;
			if (ch == '\n') ws_has_newline = TRUE;
		} else {
			if ((ws_from >= 0) && (ws_has_newline)) {
				Str::put_at(m, ws_from, ' ');
				for (int surplus = ws_count - 1; surplus > 0; surplus--)
					Str::delete_nth_character(m, ws_from + 1);
				/* the text has shrunk: carry on from just after the space */
				i = ws_from;
			}
			ws_from = -1;
		}
		i++;
	}
	/* Second pass: a caret becomes a newline, and a tilde a double-quote */
	LOOP_THROUGH_TEXT(P, m) {
		wchar_t ch = Str::get(P);
		if (ch == '^') Str::put(P, '\n');
		else if (ch == '~') Str::put(P, '\"');
	}
}