Tokenisation

Turning textual code written in Inform 6 syntax into a linked list of tokens.

§1. The following code was sketched out on a long night flight to Hong Kong, but there is otherwise nothing exotic about it. In as simple a way as possible, we take the text held in from and break it into Inform 6 tokens. What we return is not literally a linked list, but it amounts to the same thing: a single node holding an unstructured run of tokens —

    EXPRESSION_ISNT
        T1
        T2
        T3
        ...

We follow the syntax of Inform 6, except that we have to look for three extra syntaxes: {-braced-commands}, (+ Inform 7 interpolation +), and, if the abbreviated syntax is allowed, also some cryptic notations such as *1.

The following scanner is basically a finite state machine, and these are the states:

enum NO_TOKSTATE from 1
enum COMMENT_TOKSTATE   currently scanning... an I6 comment ! ...
enum DQUOTED_TOKSTATE   ...double-quoted text
enum SQUOTED_TOKSTATE   ...single-quoted text
enum WHITE_TOKSTATE     ...whitespace
enum TOK_TOKSTATE       ...an actual token

/* Tokenise the text |from|, starting at character position |pos| and running
   to the end of the text, adding each token found to the schema |sch|.
   |abbreviated| selects which extra notations are looked for: when FALSE,
   {bracings} and (+ ... +) interpolations; when TRUE, asterisk notations such
   as *1. |quoted_inames| is consulted only when a token is broken off (and
   only under CORE_MODULE); |no_quoted_inames| is not referred to directly in
   this paragraph. */
void Tokenisation::go(inter_schema *sch, text_stream *from, int pos, int abbreviated,
    int no_quoted_inames, void **quoted_inames) {
    /* The last non-whitespace token made so far: the negative-number test
       needs this as context. */
    inter_schema_token *preceding_token = NULL;

    int definition_length = Str::len(from);
    /* Characters accumulate in |current_raw| until the state changes, at which
       point they are flushed into one or more tokens. */
    text_stream *current_raw = Str::new();
    int tokeniser_state = NO_TOKSTATE;
    for (; pos<definition_length; pos++) {
        int c = Str::get_at(from, pos);
        /* First deal with transitions into and out of a run of whitespace.
           Whitespace inside quoted text or a comment does not count: those
           states are left alone here and handled by the switch below. */
        if (Characters::is_whitespace(c)) {
            if ((tokeniser_state == TOK_TOKSTATE) || (tokeniser_state == NO_TOKSTATE)) {
                Absorb raw material, if any1.2;
                tokeniser_state = WHITE_TOKSTATE;
                /* a run of whitespace, however long, is recorded as one space */
                PUT_TO(current_raw, ' ');
            }
        } else {
            if (tokeniser_state == WHITE_TOKSTATE) {
                Absorb raw material, if any1.2;
                tokeniser_state = NO_TOKSTATE;
            }
        }

        switch (tokeniser_state) {
             /* Inside quoted text everything up to the closing quotation mark
                is copied verbatim. Since flushing does nothing when
                |current_raw| is empty, empty quoted text produces no token.
                NOTE(review): confirm that is intended for "". */
             case DQUOTED_TOKSTATE:
                if (c == '"') {
                    Absorb raw material, if any1.2;
                    tokeniser_state = NO_TOKSTATE;
                } else {
                    PUT_TO(current_raw, c);
                }
                break;
             case SQUOTED_TOKSTATE:
                if (c == '\'') {
                    Absorb raw material, if any1.2;
                    tokeniser_state = NO_TOKSTATE;
                } else {
                    PUT_TO(current_raw, c);
                }
                break;
             /* A comment is discarded; it ends at the next newline. */
             case COMMENT_TOKSTATE:
                if (c == '\n') tokeniser_state = NO_TOKSTATE;
                break;
             case WHITE_TOKSTATE: break;
             /* Otherwise we are in ordinary code: either between tokens or
                part-way through one. */
             default:
                if (c == '!') {
                    Absorb raw material, if any1.2;
                    tokeniser_state = COMMENT_TOKSTATE; break;
                }
                if (c == '"') {
                    Absorb raw material, if any1.2;
                    tokeniser_state = DQUOTED_TOKSTATE; break;
                }
                if (c == '\'') {
                    Absorb raw material, if any1.2;
                    tokeniser_state = SQUOTED_TOKSTATE; break;
                }
                /* The three extra notations; each may advance |pos| past the
                   material it recognises, or else fall back to treating the
                   opening character as raw. */
                if ((c == '{') && (abbreviated == FALSE))
                    Look for a possible bracing1.4
                else if ((c == '*') && (abbreviated == TRUE))
                    Look for a possible abbreviated command1.5
                else if ((c == '(') && (Str::get_at(from, pos+1) == '+') && (abbreviated == FALSE))
                    Look for a possible Inform 7 fragment1.3
                else Absorb a raw character1.1;
                break;
        }
    }
    /* Flush whatever was still accumulating when the text ran out. */
    Absorb raw material, if any1.2;
}

§1.1. Absorb a raw character1.1 =

    /* Append |c| to the raw material being gathered, and note that we are
       now in the middle of an actual token. */
    PUT_TO(current_raw, c);
    tokeniser_state = TOK_TOKSTATE;

This code is used in §1, §1.3, §1.4, §1.5 (twice).

§1.2. Absorb raw material, if any1.2 =

    /* Turn whatever has gathered in |current_raw| into tokens, according to
       the state in which it was gathered, then empty it. Whether or not there
       was anything to flush, the scanner ends up in the neutral state. */
    if (Str::len(current_raw) > 0) {
        if (tokeniser_state == WHITE_TOKSTATE) {
            /* whitespace always becomes a single-space token */
            inter_schema_token *white_token =
                InterSchemas::new_token(WHITE_SPACE_ISTT, I" ", 0, 0, -1);
            InterSchemas::add_token(sch, white_token);
        } else if (tokeniser_state == DQUOTED_TOKSTATE) {
            /* double-quoted text has its escapes resolved first */
            Tokenisation::de_escape_text(current_raw);
            inter_schema_token *dq_token =
                InterSchemas::new_token(DQUOTED_ISTT, current_raw, 0, 0, -1);
            InterSchemas::add_token(sch, dq_token);
        } else if (tokeniser_state == SQUOTED_TOKSTATE) {
            /* single-quoted text is taken exactly as written */
            inter_schema_token *sq_token =
                InterSchemas::new_token(SQUOTED_ISTT, current_raw, 0, 0, -1);
            InterSchemas::add_token(sch, sq_token);
        } else {
            /* anything else is raw code, which may hold several tokens */
            Look for individual tokens1.2.1;
        }
        Str::clear(current_raw);
    }
    tokeniser_state = NO_TOKSTATE;

This code is used in §1 (8 times), §1.3, §1.4, §1.5.

§1.3. Material in (+ ... +) notation is an interpolation of I7 source text.

Look for a possible Inform 7 fragment1.3 =

    int save_pos = pos, accept = FALSE;
    TEMPORARY_TEXT(source_text_fragment)
    pos += 2;
    while (Str::get_at(from, pos)) {
        if ((Str::get_at(from, pos-1) == '+') && (Str::get_at(from, pos) == ')')) {
            Str::delete_last_character(source_text_fragment);
            accept = TRUE; break;
        }
        PUT_TO(source_text_fragment, Str::get_at(from, pos++));
    }
    if (accept) {
        Absorb raw material, if any1.2;
        Expand a fragment of Inform 7 text1.3.1;
    } else { int c = '('; Absorb a raw character1.1; pos = save_pos; }
    DISCARD_TEXT(source_text_fragment)

This code is used in §1.

§1.3.1. Note that the empty I7 interpolation is legal, but produces no token.

Expand a fragment of Inform 7 text1.3.1 =

    /* A non-empty interpolation becomes a single I7 token holding its text;
       an empty one is silently dropped. */
    if (Str::len(source_text_fragment) > 0) {
        inter_schema_token *i7_token =
            InterSchemas::new_token(I7_ISTT, source_text_fragment, 0, 0, -1);
        InterSchemas::add_token(sch, i7_token);
    }

This code is used in §1.3.

§1.4. Material in braces sometimes indicates an inline command, but not always, because braces often occur innocently in I6 code. So we require the first character after the open-brace not to be white-space, and also not to be a pipe (though I've forgotten why). The text inside the braces is called a "bracing".

Look for a possible bracing1.4 =

    int save_pos = pos++, accept = FALSE;
    TEMPORARY_TEXT(bracing)
    while (TRUE) {
        int c = Str::get_at(from, pos);
        if (c == 0) break;
        if (c == '}') { accept = TRUE; break; }
        PUT_TO(bracing, c);
        pos++;
    }
    int first = Str::get_first_char(bracing);
    if ((accept) && (first != ' ') && (first != '\t') && (first != '\n') && (first != '|')) {
        Absorb raw material, if any1.2;
        Parse a bracing into an inline command1.4.1;
    } else { int c = '{'; Absorb a raw character1.1; pos = save_pos; }
    DISCARD_TEXT(bracing)

This code is used in §1.

§1.4.1. That's everything, then, except the one thing that counts: how to expand a bracing.

Parse a bracing into an inline command1.4.1 =

    /* Make an inline token from the text in |bracing|, split that text into
       command and operands, and then translate the command name (and, for
       two of the commands, the first operand) into enumerated codes stored
       in the token. The token is added to |sch| and becomes the preceding
       token. */
    inter_schema_token *t = InterSchemas::new_token(INLINE_ISTT, bracing, 0, 0, -1);
    t->bracing = Str::duplicate(bracing);
    t->command = Str::new();
    t->operand = Str::new();
    t->operand2 = Str::new();
    Decompose the bracing1.4.1.1;
    /* Only a bracing with a command part is given command codes here; in any
       other case the token's |inline_command| and |inline_subcommand| are
       left as |InterSchemas::new_token| set them. */
    if (Str::len(t->command) > 0) {
        /* a command name matching nothing below is recorded as unknown */
        int c = unknown_ISINC, sc = no_ISINSC;
        if (Str::eq_wide_string(t->command, L"primitive-definition")) {
            c = primitive_definition_ISINC;
            /* here the first operand selects a subcommand */
            if (Str::eq_wide_string(t->operand, L"repeat-through")) {
                sc = repeat_through_ISINSC;
            } else if (Str::eq_wide_string(t->operand, L"repeat-through-list")) {
                sc = repeat_through_list_ISINSC;
            } else if (Str::eq_wide_string(t->operand, L"number-of")) {
                sc = number_of_ISINSC;
            } else if (Str::eq_wide_string(t->operand, L"random-of")) {
                sc = random_of_ISINSC;
            } else if (Str::eq_wide_string(t->operand, L"total-of")) {
                sc = total_of_ISINSC;
            } else if (Str::eq_wide_string(t->operand, L"extremal")) {
                sc = extremal_ISINSC;
            } else if (Str::eq_wide_string(t->operand, L"function-application")) {
                sc = function_application_ISINSC;
            } else if (Str::eq_wide_string(t->operand, L"description-application")) {
                sc = description_application_ISINSC;
            } else if (Str::eq_wide_string(t->operand, L"solve-equation")) {
                sc = solve_equation_ISINSC;
            } else if (Str::eq_wide_string(t->operand, L"switch")) {
                sc = switch_ISINSC;
            } else if (Str::eq_wide_string(t->operand, L"break")) {
                sc = break_ISINSC;
            } else if (Str::eq_wide_string(t->operand, L"verbose-checking")) {
                sc = verbose_checking_ISINSC;
            }
        } else if (Str::eq_wide_string(t->command, L"new")) {
            c = new_ISINC;
        } else if (Str::eq_wide_string(t->command, L"new-list-of")) {
            c = new_list_of_ISINC;
        } else if (Str::eq_wide_string(t->command, L"printing-routine")) {
            c = printing_routine_ISINC;
        } else if (Str::eq_wide_string(t->command, L"ranger-routine")) {
            c = ranger_routine_ISINC;
        } else if (Str::eq_wide_string(t->command, L"next-routine")) {
            c = next_routine_ISINC;
        } else if (Str::eq_wide_string(t->command, L"previous-routine")) {
            c = previous_routine_ISINC;
        } else if (Str::eq_wide_string(t->command, L"strong-kind")) {
            c = strong_kind_ISINC;
        } else if (Str::eq_wide_string(t->command, L"weak-kind")) {
            c = weak_kind_ISINC;
        } else if (Str::eq_wide_string(t->command, L"backspace")) {
            c = backspace_ISINC;
        } else if (Str::eq_wide_string(t->command, L"erase")) {
            c = erase_ISINC;
        } else if (Str::eq_wide_string(t->command, L"open-brace")) {
            c = open_brace_ISINC;
        } else if (Str::eq_wide_string(t->command, L"close-brace")) {
            c = close_brace_ISINC;
        } else if (Str::eq_wide_string(t->command, L"label")) {
            c = label_ISINC;
        } else if (Str::eq_wide_string(t->command, L"counter")) {
            c = counter_ISINC;
        } else if (Str::eq_wide_string(t->command, L"counter-storage")) {
            c = counter_storage_ISINC;
        } else if (Str::eq_wide_string(t->command, L"counter-up")) {
            c = counter_up_ISINC;
        } else if (Str::eq_wide_string(t->command, L"counter-down")) {
            c = counter_down_ISINC;
        } else if (Str::eq_wide_string(t->command, L"counter-makes-array")) {
            c = counter_makes_array_ISINC;
        } else if (Str::eq_wide_string(t->command, L"by-reference")) {
            c = by_reference_ISINC;
        } else if (Str::eq_wide_string(t->command, L"by-reference-blank-out")) {
            c = by_reference_blank_out_ISINC;
        } else if (Str::eq_wide_string(t->command, L"reference-exists")) {
            c = reference_exists_ISINC;
        } else if (Str::eq_wide_string(t->command, L"lvalue-by-reference")) {
            c = lvalue_by_reference_ISINC;
        } else if (Str::eq_wide_string(t->command, L"by-value")) {
            c = by_value_ISINC;
        } else if (Str::eq_wide_string(t->command, L"box-quotation-text")) {
            c = box_quotation_text_ISINC;
        } else if (Str::eq_wide_string(t->command, L"try-action")) {
            c = try_action_ISINC;
        } else if (Str::eq_wide_string(t->command, L"try-action-silently")) {
            c = try_action_silently_ISINC;
        } else if (Str::eq_wide_string(t->command, L"return-value")) {
            c = return_value_ISINC;
        } else if (Str::eq_wide_string(t->command, L"return-value-from-rule")) {
            c = return_value_from_rule_ISINC;
        } else if (Str::eq_wide_string(t->command, L"property-holds-block-value")) {
            c = property_holds_block_value_ISINC;
        } else if (Str::eq_wide_string(t->command, L"mark-event-used")) {
            c = mark_event_used_ISINC;
        } else if (Str::eq_wide_string(t->command, L"my")) {
            c = my_ISINC;
        } else if (Str::eq_wide_string(t->command, L"unprotect")) {
            c = unprotect_ISINC;
        } else if (Str::eq_wide_string(t->command, L"copy")) {
            c = copy_ISINC;
        } else if (Str::eq_wide_string(t->command, L"initialise")) {
            c = initialise_ISINC;
        } else if (Str::eq_wide_string(t->command, L"matches-description")) {
            c = matches_description_ISINC;
        } else if (Str::eq_wide_string(t->command, L"now-matches-description")) {
            c = now_matches_description_ISINC;
        } else if (Str::eq_wide_string(t->command, L"arithmetic-operation")) {
            c = arithmetic_operation_ISINC;
        } else if (Str::eq_wide_string(t->command, L"say")) {
            c = say_ISINC;
        } else if (Str::eq_wide_string(t->command, L"show-me")) {
            c = show_me_ISINC;
        } else if (Str::eq_wide_string(t->command, L"segment-count")) {
            c = segment_count_ISINC;
        } else if (Str::eq_wide_string(t->command, L"final-segment-marker")) {
            c = final_segment_marker_ISINC;
        } else if (Str::eq_wide_string(t->command, L"list-together")) {
            c = list_together_ISINC;
            /* the only other command whose operand selects a subcommand */
            if (Str::eq_wide_string(t->operand, L"unarticled")) {
                sc = unarticled_ISINSC;
            } else if (Str::eq_wide_string(t->operand, L"articled")) {
                sc = articled_ISINSC;
            }
        } else if (Str::eq_wide_string(t->command, L"rescale")) {
            c = rescale_ISINC;
        }
        t->inline_command = c;
        t->inline_subcommand = sc;
    }

    InterSchemas::add_token(sch, t);
    preceding_token = t;

This code is used in §1.4.

§1.4.1.1. A bracing can take any of the following forms:

    {-command}
    {-command:operand}
    {-command:operand:operand2}
    {-command:operand<property name}
    {-command:operand>property name}
    {some text}
    {-annotation:some text}

We parse this with the command or annotation in command, the "some text" or operand in bracing, the property name (if given) in extremal_property, the direction of the < or > in extremal_property_sign, and the second, optional, operand in operand2.

Decompose the bracing1.4.1.1 =

    /* Split the bracing text of |t| into its parts. Only a bracing beginning
       with a hyphen is decomposed; any other is left exactly as it is. */
    TEMPORARY_TEXT(pname)
    if (Str::get_first_char(t->bracing) == '-') {
        /* |portion| says which part is being read: 1 for the command, 2 for
           the operand, 3 for the second operand, 4 for the property name. */
        int portion = 1;
        /* start from 1 to skip the hyphen itself */
        for (int i=1, L = Str::len(t->bracing); i<L; i++) {
            int c = Str::get_at(t->bracing, i);
            switch(portion) {
                case 1:
                    if (c == ':') portion = 2;
                    else PUT_TO(t->command, c);
                    break;
                case 2:
                    if (c == ':') portion = 3;
                    /* only within the core module do < and > end the operand
                       and begin a property name; elsewhere they are ordinary
                       operand characters */
                    #ifdef CORE_MODULE
                    else if (c == '<') {
                        t->extremal_property_sign = MEASURE_T_OR_LESS; portion = 4;
                    }
                    else if (c == '>') {
                        t->extremal_property_sign = MEASURE_T_OR_MORE; portion = 4;
                    }
                    #endif
                    else PUT_TO(t->operand, c);
                    break;
                /* portions 3 and 4 each run to the end of the bracing */
                case 3:
                    PUT_TO(t->operand2, c); break;
                case 4:
                    PUT_TO(pname, c); break;
            }
        }
        #ifdef CORE_MODULE
        /* NOTE(review): this relies on a new token's sign being
           MEASURE_T_EXACTLY unless set above - confirm against
           InterSchemas::new_token. If the text does not parse as a property
           name, |extremal_property| is simply not set here. */
        if (t->extremal_property_sign != MEASURE_T_EXACTLY) {
            wording W = Feeds::feed_text(pname);
            if (<property-name>(W)) t->extremal_property = <<rp>>;
        }
        #endif
        /* as described above, the bracing text ends up holding the operand */
        Str::copy(t->bracing, t->operand);
    }
    DISCARD_TEXT(pname)

This code is used in §1.4.1.

§1.5. In abbreviated prototypes, *1 and *2 are placeholders, but a number of modifiers are allowed. See Compilation Schemas (in calculus).

define GIVE_KIND_ID_ISSBM                  1
define GIVE_COMPARISON_ROUTINE_ISSBM       2
define DEREFERENCE_PROPERTY_ISSBM          4
define ADOPT_LOCAL_STACK_FRAME_ISSBM       8
define CAST_TO_KIND_OF_OTHER_TERM_ISSBM        16
define BY_REFERENCE_ISSBM                  32
define LVALUE_CONTEXT_ISSBM                    64
define STORAGE_AS_FUNCTION_ISSBM            128

Look for a possible abbreviated command1.5 =

    /* We are at an asterisk in an abbreviated prototype. The forms recognised
       are an asterisk, then at most one modifier character, then either a
       digit (a placeholder such as *1) or an ampersand (the combine
       notation); and a doubled asterisk, which stands for a literal one.
       |at| runs ahead of |pos|, which is only advanced once something has
       actually been recognised. */
    int at = pos;
    wchar_t c = Str::get_at(from, ++at);
    int iss_bitmap = 0;
    /* Each modifier sets one or more bits and moves on to the next character;
       the abolished '*!' produces an error and moves nowhere. */
    switch (c) {
        case '!': InterSchemas::throw_error(sch->node_tree,
            I"the '*!' schema notation has been abolished"); break;
        case '%': iss_bitmap = iss_bitmap | LVALUE_CONTEXT_ISSBM;
                  c = Str::get_at(from, ++at); break;
        case '$': iss_bitmap = iss_bitmap | STORAGE_AS_FUNCTION_ISSBM;
                  c = Str::get_at(from, ++at); break;
        case '#': iss_bitmap = iss_bitmap | GIVE_KIND_ID_ISSBM;
                  c = Str::get_at(from, ++at); break;
        case '_': iss_bitmap = iss_bitmap | GIVE_COMPARISON_ROUTINE_ISSBM;
                  c = Str::get_at(from, ++at); break;
        case '+': iss_bitmap = iss_bitmap | DEREFERENCE_PROPERTY_ISSBM;
                  c = Str::get_at(from, ++at); break;
        case '|': iss_bitmap = iss_bitmap | (DEREFERENCE_PROPERTY_ISSBM + LVALUE_CONTEXT_ISSBM);
                  c = Str::get_at(from, ++at); break;
        case '?': iss_bitmap = iss_bitmap | ADOPT_LOCAL_STACK_FRAME_ISSBM;
                  c = Str::get_at(from, ++at); break;
        case '<': iss_bitmap = iss_bitmap | CAST_TO_KIND_OF_OTHER_TERM_ISSBM;
                  c = Str::get_at(from, ++at); break;
        case '^': iss_bitmap = iss_bitmap | (ADOPT_LOCAL_STACK_FRAME_ISSBM + BY_REFERENCE_ISSBM);
                  c = Str::get_at(from, ++at); break;
        case '>': iss_bitmap = iss_bitmap | BY_REFERENCE_ISSBM;
                  c = Str::get_at(from, ++at); break;
    }
    if (Characters::isdigit(c)) {
        Absorb raw material, if any1.2;
        /* the token text is the whole notation, asterisk and modifier included */
        TEMPORARY_TEXT(T)
        for (int i=pos; i<=at; i++) PUT_TO(T, Str::get_at(from, i));
        inter_schema_token *t = InterSchemas::new_token(INLINE_ISTT, T, 0, 0, -1);
        t->bracing = Str::duplicate(T);
        t->inline_command = substitute_ISINC;
        t->inline_modifiers = iss_bitmap;
        /* placeholders count from 1 in the notation but from 0 in the token */
        t->constant_number = (int) c - (int) '1';
        InterSchemas::add_token(sch, t);
        preceding_token = t;
        DISCARD_TEXT(T)
        pos = at;
    } else if (c == '&') {
        /* as with the digit case, flush any raw material first, so that it
           is tokenised ahead of this token rather than after it */
        Absorb raw material, if any1.2;
        inter_schema_token *t = InterSchemas::new_token(INLINE_ISTT, I"*&", 0, 0, -1);
        t->bracing = I"*&";
        t->inline_command = combine_ISINC;
        t->inline_modifiers = iss_bitmap;
        InterSchemas::add_token(sch, t);
        preceding_token = t;
        pos = at;
    } else if (c == '-') {
        InterSchemas::throw_error(sch->node_tree,
            I"the '*-' schema notation has been abolished");
    } else if (c == '*') {
        /* a doubled asterisk contributes one literal asterisk */
        int c = '*'; Absorb a raw character1.1;
        pos = at;
    } else {
        /* Not a recognised notation after all: keep the asterisk itself as
           raw material. |pos| is not advanced, so whatever followed it is
           scanned again in the ordinary way. */
        int c = '*'; Absorb a raw character1.1;
    }

This code is used in §1.

§1.2.1. That leaves us with just the main case to handle: raw I6 code which is outside of quotation marks and commentary, and which doesn't include bracings or I7 interpolations. That might look like, for instance,

    Frog + 2*Toad(

(there is no reason to suppose that this stretch of code is complete or matches parentheses); we must tokenise it into

    Frog
    WHITE SPACE
    +
    WHITE SPACE
    2
    *
    Toad
    (

We scan through the text until we reach the start of a new token, and then break off what we scanned through since the last time.

Look for individual tokens1.2.1 =

    /* Divide the raw material in |current_raw| into tokens. |p| scans forward
       and |c_start| marks where the token currently being gathered began.
       Each of the three break-off paragraphs below, when it applies, emits
       the material from |c_start| up to just before |p|, emits the new token
       found at |p|, moves |p| and |c_start| past it, and continues the loop. */
    int L = Str::len(current_raw);
    int c_start = 0, escaped = FALSE;
    for (int p = 0; p < L; p++) {
        /* the character at |p| and the two after it, or 0 where past the end */
        wchar_t c1 = Str::get_at(current_raw, p), c2 = 0, c3 = 0;
        if (p < L-1) c2 = Str::get_at(current_raw, p+1);
        if (p < L-2) c3 = Str::get_at(current_raw, p+2);

        if (escaped == FALSE) {
            /* a dollar begins a number only if not directly after a letter */
            if ((c1 == '$') &&
                ((p == 0) ||
                    (Characters::isalpha(Str::get_at(current_raw, p-1)) == FALSE)))
                Break off here for real, binary or hexadecimal notation1.2.1.1;
            if (c1 == '-') Break off here for negative number1.2.1.2;
            Break off here for operators1.2.1.3;
        }
        /* The character 0x00A7 toggles an escaped mode, in which no breaks
           are made. This line is reached only when none of the paragraphs
           above continued the loop. */
        if (c1 == 0x00A7) escaped = escaped?FALSE:TRUE;
    }
    /* whatever remains after the last break is one final token */
    if (c_start < L) {
        int x = c_start, y = L-1;
        Break off a token1.2.1.4;
    }

This code is used in §1.2.

§1.2.1.1. Recall that in I6 notation, a dollar introduces a non-decimal number, and the character after the initial dollar determines which:

    $+3.14159E2
    $$1001001
    $1FE6

Break off here for real, binary or hexadecimal notation1.2.1.1 =

    int x = c_start, y = p-1;
    Break off a token1.2.1.4;
    switch (c2) {
        case '+': case '-':
            x = p; y = p+1;
            while ((Str::get_at(current_raw, y+1) == '.') ||
                    (Str::get_at(current_raw, y+1) == 'E') ||
                    (Str::get_at(current_raw, y+1) == 'e') ||
                    (Characters::isdigit(Str::get_at(current_raw, y+1))))
                y++;
            Break off a token1.2.1.4;
            p = y;
            c_start = p+1;
            continue;
        case '$':
            x = p; y = p+1;
            while ((Str::get_at(current_raw, y+1) == '0') ||
                    (Str::get_at(current_raw, y+1) == '1'))
                y++;
            Break off a token1.2.1.4;
            p = y;
            c_start = p+1;
            continue;
        default:
            x = p; y = p;
            while (Characters::isalnum(Str::get_at(current_raw, y+1)))
                y++;
            Break off a token1.2.1.4;
            p = y;
            c_start = p+1;
            continue;
    }

This code is used in §1.2.1.

§1.2.1.2. A token beginning with a minus sign and continuing with digits may still not be a negative number: it may be the binary subtraction operator. For example, we need to tokenise x-1 as

    x
    -
    1

and not as

    x
    -1

This requires context, that is, remembering what the previous token was.

Break off here for negative number1.2.1.2 =

    /* A minus sign begins a negative number only if (a) there is no previous
       token, or the previous token is an open bracket, an operator or a
       divider; (b) the minus sign is at the very start of the material being
       gathered; and (c) at least one digit follows. In an abbreviated
       prototype it never does so directly after an inline token. The test
       for that must check that |preceding_token| exists before looking at
       its type, since the first clause allows it to be NULL. */
    if (((preceding_token == NULL) ||
        (preceding_token->ist_type == OPEN_ROUND_ISTT) ||
        (preceding_token->ist_type == OPERATOR_ISTT) ||
        (preceding_token->ist_type == DIVIDER_ISTT)) &&
        (c_start == p) &&
        (!((abbreviated) && (preceding_token != NULL) &&
            (preceding_token->ist_type == INLINE_ISTT)))) {
        /* |dc| ends up just past the last digit following the minus sign */
        int dc = p+1;
        while (Characters::isdigit(Str::get_at(current_raw, dc))) dc++;
        if (dc > p+1) {
            int x = c_start, y = p-1;
            Break off a token1.2.1.4;
            x = p; y = dc - 1;
            Break off a token1.2.1.4;
            p = y;
            c_start = p+1;
            continue;
        }
    }

This code is used in §1.2.1.

§1.2.1.3. In I6, operators made of non-alphanumeric characters can be up to three characters long, and we take the longest match: thus --> is a trigraph, not the monograph - followed by the digraph ->.

We treat the @ sign as if it were alphanumeric for the sake of assembly language opcodes such as @pull.

Break off here for operators1.2.1.3 =

    int monograph = TRUE, digraph = FALSE, trigraph = FALSE;
    if ((Characters::isalnum(c1)) || (c1 == '_') || (c1 == '$')) monograph = FALSE;
    if (c1 == 0x00A7) monograph = FALSE;
    if ((c1 == '#') && (Characters::isalpha(c2))) monograph = FALSE;
    if ((c1 == '_') && (Characters::isalpha(c2))) monograph = FALSE;
    if ((c1 == '#') && (c2 == '#') && (Characters::isalpha(c3))) monograph = FALSE;
    if ((c1 == '@') && (Characters::isalpha(c2))) monograph = FALSE;

    if ((c1 == '+') && (c2 == '+')) digraph = TRUE;
    if ((c1 == '-') && (c2 == '-')) digraph = TRUE;
    if ((c1 == '>') && (c2 == '=')) digraph = TRUE;
    if ((c1 == '<') && (c2 == '=')) digraph = TRUE;
    if ((c1 == '=') && (c2 == '=')) digraph = TRUE;
    if ((c1 == '-') && (c2 == '>')) digraph = TRUE;
    if ((c1 == '.') && (c2 == '&')) digraph = TRUE;
    if ((c1 == '.') && (c2 == '#')) digraph = TRUE;
    if ((c1 == '~') && (c2 == '~')) digraph = TRUE;
    if ((c1 == '~') && (c2 == '=')) digraph = TRUE;
    if ((c1 == '&') && (c2 == '&')) digraph = TRUE;
    if ((c1 == '|') && (c2 == '|')) digraph = TRUE;
    if ((c1 == '>') && (c2 == '>')) digraph = TRUE;

    if ((c1 == '-') && (c2 == '-') && (c3 == '>')) trigraph = TRUE;

    if (trigraph) {
        int x = c_start, y = p-1;
        Break off a token1.2.1.4;
        x = p; y = p+2;
        Break off a token1.2.1.4;
        p += 2;
        c_start = p+1;
        continue;
    }

    if (digraph) {
        int x = c_start, y = p-1;
        Break off a token1.2.1.4;
        x = p; y = p+1;
        Break off a token1.2.1.4;
        p++;
        c_start = p+1;
        continue;
    }

    if (monograph) {
        int x = c_start, y = p-1;
        Break off a token1.2.1.4;
        x = p; y = p;
        Break off a token1.2.1.4;
        c_start = p+1;
        continue;
    }

This code is used in §1.2.1.

§1.2.1.4. In this code, the new token is between character positions x and y inclusive; we ignore an empty token.

Break off a token1.2.1.4 =

    /* Make a token from characters |x| to |y| inclusive of |current_raw|,
       classify it, and add it to |sch|. Nothing happens if the range is
       empty. The new token becomes the preceding token unless it is
       whitespace. */
    if (y >= x) {
        TEMPORARY_TEXT(T)
        for (int i = x; i <= y; i++) PUT_TO(T, Str::get_at(current_raw, i));

        /* defaults, which the identification below may change */
        int is = RAW_ISTT;
        inter_ti which = 0;
        int which_rw = 0, which_number = -1, which_quote = -1;
        Identify this new token1.2.1.4.1;

        inter_schema_token *n = InterSchemas::new_token(is, T, which, which_rw, which_number);
        #ifdef CORE_MODULE
        /* An identifier can name quoted iname 0 or 1 whether or not the
           caller supplied that many, so look one up only if it exists. */
        if ((which_quote >= 0) && (which_quote < no_quoted_inames) &&
            (quoted_inames != NULL))
            n->as_quoted = quoted_inames[which_quote];
        #endif
        InterSchemas::add_token(sch, n);
        if (n->ist_type != WHITE_SPACE_ISTT) preceding_token = n;
        DISCARD_TEXT(T)
    }

This code is used in §1.2.1, §1.2.1.1 (four times), §1.2.1.2 (twice), §1.2.1.3 (6 times).

§1.2.1.4.1. Finally, we identify what sort of token we're looking at. It would be elegant to reimplement this with a trie (e.g. using Tries and Avinues (in foundation)), but speed is not quite important enough to make it worthwhile.

define LOWEST_XBIP_VALUE HAS_XBIP
enum HAS_XBIP from 10000
enum HASNT_XBIP
enum READ_XBIP
enum OWNERKIND_XBIP
define HIGHEST_XBIP_VALUE OWNERKIND_XBIP

Identify this new token1.2.1.4.1 =

    /* Classify the token text |T|, setting |is| and, where relevant, |which|,
       |which_rw|, |which_number| and |which_quote|. The tests are made in
       sequence and a later one overrides an earlier one, so their order
       matters. */

    /* Opcodes begin with @; a token beginning with 0x00A7 is an identifier. */
    if (Str::get_at(T, 0) == '@') is = OPCODE_ISTT;
    if (Str::get_at(T, 0) == 0x00A7)
        is = IDENTIFIER_ISTT;
    /* Identifiers may begin ##, # or _ provided a letter follows, or with a
       letter. In each case an unexpected character anywhere in the token
       demotes it back to raw; the set of characters allowed differs slightly
       from case to case. */
    if ((Str::get_at(T, 0) == '#') && (Str::get_at(T, 1) == '#') &&
        (Characters::isalpha(Str::get_at(T, 2)))) {
        is = IDENTIFIER_ISTT;
        LOOP_THROUGH_TEXT(P, T) {
            wchar_t c = Str::get(P);
            if ((c != '_') && (c != '#') && (!Characters::isalnum(c)))
                is = RAW_ISTT;
        }
    }
    if ((Str::get_at(T, 0) == '#') && (Characters::isalpha(Str::get_at(T, 1)))) {
        is = IDENTIFIER_ISTT;
        LOOP_THROUGH_TEXT(P, T) {
            wchar_t c = Str::get(P);
            if ((c != '_') && (c != '#') && (c != '$') && (!Characters::isalnum(c)))
                is = RAW_ISTT;
        }
    }
    if ((Str::get_at(T, 0) == '_') && (Characters::isalpha(Str::get_at(T, 1)))) {
        is = IDENTIFIER_ISTT;
        LOOP_THROUGH_TEXT(P, T) {
            wchar_t c = Str::get(P);
            if ((c != '_') && (c != '#') && (!Characters::isalnum(c)))
                is = RAW_ISTT;
        }
    }
    if (Characters::isalpha(Str::get_at(T, 0))) {
        is = IDENTIFIER_ISTT;
        LOOP_THROUGH_TEXT(P, T) {
            wchar_t c = Str::get(P);
            if ((c != '_') && (!Characters::isalnum(c)))
                is = RAW_ISTT;
        }
        /* names with these prefixes refer to quoted iname 0 or 1 */
        if (Str::begins_with_wide_string(T, L"QUOTED_INAME_0_")) which_quote = 0;
        else if (Str::begins_with_wide_string(T, L"QUOTED_INAME_1_")) which_quote = 1;
        /* two identifiers are respelled */
        if (Str::eq(T, I"I7_string")) { Str::clear(T); WRITE_TO(T, "I7_String"); }
        if (Str::eq(T, I"COMMA_WORD")) { Str::clear(T); WRITE_TO(T, "comma_word"); }
    }
    /* A decimal number is digits throughout. */
    if (Characters::isdigit(Str::get_at(T, 0))) {
        is = NUMBER_ISTT;
        LOOP_THROUGH_TEXT(P, T) {
            wchar_t c = Str::get(P);
            if (!Characters::isdigit(c))
                is = RAW_ISTT;
        }
    }
    /* After a dollar, the next character distinguishes binary and real
       numbers from hexadecimal ones. */
    if (Str::get_at(T, 0) == '$') {
        is = HEX_NUMBER_ISTT;
        wchar_t c = Str::get_at(T, 1);
        if (c == '$') is = BIN_NUMBER_ISTT;
        if (c == '+') is = REAL_NUMBER_ISTT;
        if (c == '-') is = REAL_NUMBER_ISTT;
    }
    /* Anything beginning with a minus sign is provisionally a number; the
       operator lookup at the end can override this. */
    if (Str::get_at(T, 0) == '-') is = NUMBER_ISTT;

    /* Three named constants count as numbers with known values. */
    if (Str::eq(T, I"false")) { is = NUMBER_ISTT; which_number = 0; }
    if (Str::eq(T, I"true")) { is = NUMBER_ISTT; which_number = 1; }
    if (Str::eq(T, I"nothing")) { is = NUMBER_ISTT; which_number = 0; }

    /* Reserved words, matched case-sensitively. */
    if (Str::eq(T, I"if")) { is = RESERVED_ISTT; which_rw = IF_I6RW; }
    if (Str::eq(T, I"else")) { is = RESERVED_ISTT; which_rw = ELSE_I6RW; }
    if (Str::eq(T, I"style")) { is = RESERVED_ISTT; which_rw = STYLE_I6RW; }
    if (Str::eq(T, I"return")) { is = RESERVED_ISTT; which_rw = RETURN_I6RW; }
    if (Str::eq(T, I"rtrue")) { is = RESERVED_ISTT; which_rw = RTRUE_I6RW; }
    if (Str::eq(T, I"rfalse")) { is = RESERVED_ISTT; which_rw = RFALSE_I6RW; }
    if (Str::eq(T, I"for")) { is = RESERVED_ISTT; which_rw = FOR_I6RW; }
    if (Str::eq(T, I"objectloop")) { is = RESERVED_ISTT; which_rw = OBJECTLOOP_I6RW; }
    if (Str::eq(T, I"while")) { is = RESERVED_ISTT; which_rw = WHILE_I6RW; }
    if (Str::eq(T, I"do")) { is = RESERVED_ISTT; which_rw = DO_I6RW; }
    if (Str::eq(T, I"until")) { is = RESERVED_ISTT; which_rw = UNTIL_I6RW; }
    if (Str::eq(T, I"print")) { is = RESERVED_ISTT; which_rw = PRINT_I6RW; }
    if (Str::eq(T, I"print_ret")) { is = RESERVED_ISTT; which_rw = PRINTRET_I6RW; }
    if (Str::eq(T, I"new_line")) { is = RESERVED_ISTT; which_rw = NEWLINE_I6RW; }
    if (Str::eq(T, I"give")) { is = RESERVED_ISTT; which_rw = GIVE_I6RW; }
    if (Str::eq(T, I"move")) { is = RESERVED_ISTT; which_rw = MOVE_I6RW; }
    if (Str::eq(T, I"remove")) { is = RESERVED_ISTT; which_rw = REMOVE_I6RW; }
    if (Str::eq(T, I"jump")) { is = RESERVED_ISTT; which_rw = JUMP_I6RW; }
    if (Str::eq(T, I"switch")) { is = RESERVED_ISTT; which_rw = SWITCH_I6RW; }
    if (Str::eq(T, I"default")) { is = RESERVED_ISTT; which_rw = DEFAULT_I6RW; }
    if (Str::eq(T, I"font")) { is = RESERVED_ISTT; which_rw = FONT_I6RW; }
    if (Str::eq(T, I"continue")) { is = RESERVED_ISTT; which_rw = CONTINUE_I6RW; }
    if (Str::eq(T, I"break")) { is = RESERVED_ISTT; which_rw = BREAK_I6RW; }
    if (Str::eq(T, I"quit")) { is = RESERVED_ISTT; which_rw = QUIT_I6RW; }
    if (Str::eq(T, I"restore")) { is = RESERVED_ISTT; which_rw = RESTORE_I6RW; }
    if (Str::eq(T, I"spaces")) { is = RESERVED_ISTT; which_rw = SPACES_I6RW; }
    if (Str::eq(T, I"read")) { is = RESERVED_ISTT; which_rw = READ_I6RW; }
    if (Str::eq(T, I"inversion")) { is = RESERVED_ISTT; which_rw = INVERSION_I6RW; }

    /* Conditional compilation directives, matched in any casing. */
    if (Str::eq_insensitive(T, I"#IFDEF")) { is = DIRECTIVE_ISTT; which_rw = IFDEF_I6RW; }
    if (Str::eq_insensitive(T, I"#IFNDEF")) { is = DIRECTIVE_ISTT; which_rw = IFNDEF_I6RW; }
    if (Str::eq_insensitive(T, I"#IFTRUE")) { is = DIRECTIVE_ISTT; which_rw = IFTRUE_I6RW; }
    if (Str::eq_insensitive(T, I"#IFFALSE")) { is = DIRECTIVE_ISTT; which_rw = IFFALSE_I6RW; }
    if (Str::eq_insensitive(T, I"#IFNOT")) { is = DIRECTIVE_ISTT; which_rw = IFNOT_I6RW; }
    if (Str::eq_insensitive(T, I"#ENDIF")) { is = DIRECTIVE_ISTT; which_rw = ENDIF_I6RW; }

    /* Punctuation marks with token types of their own. */
    if (Str::eq(T, I",")) is = COMMA_ISTT;
    if (Str::eq(T, I":")) is = COLON_ISTT;
    if (Str::eq(T, I"(")) is = OPEN_ROUND_ISTT;
    if (Str::eq(T, I")")) is = CLOSE_ROUND_ISTT;
    if (Str::eq(T, I"{")) is = OPEN_BRACE_ISTT;
    if (Str::eq(T, I"}")) is = CLOSE_BRACE_ISTT;
    if (Str::eq(T, I";")) is = DIVIDER_ISTT;

    /* Last of all, and so taking precedence over everything above: if the
       text is operator notation (a positive result from the lookup), the
       token is an operator and |which| records which one. */
    inter_ti x = I6Operators::notation_to_BIP(T);
    if (x > 0) { is = OPERATOR_ISTT; which = x; }

This code is used in §1.2.1.4.

§2. Anticlimactically: a function to deal with escape characters in Inform 6 double-quoted text notation.

/* Rewrite the text |m| in place. First, any run of spaces, tabs and newlines
   which contains at least one newline, and which is followed by some other
   character, is reduced to a single space. Then each |^| becomes a newline
   and each |~| a double-quotation mark. */
void Tokenisation::de_escape_text(text_stream *m) {
    /* the whitespace run currently being tracked, if |ws_from| is not -1 */
    int ws_from = -1, ws_count = 0, ws_has_newline = FALSE;
    int i = 0;
    while (i < Str::len(m)) {
        wchar_t c = Str::get_at(m, i);
        if ((c != ' ') && (c != '\t') && (c != '\n')) {
            /* a run has just ended, if there was one */
            if ((ws_from >= 0) && (ws_has_newline)) {
                Str::put_at(m, ws_from, ' ');
                int surplus = ws_count - 1;
                while (surplus-- > 0) Str::delete_nth_character(m, ws_from+1);
                /* the text has shrunk: carry on from just after the space */
                i = ws_from;
            }
            ws_from = -1;
        } else {
            if (ws_from < 0) {
                ws_from = i;
                ws_count = 0;
                ws_has_newline = FALSE;
            }
            ws_count++;
            if (c == '\n') ws_has_newline = TRUE;
        }
        i++;
    }
    LOOP_THROUGH_TEXT(P, m) {
        switch (Str::get(P)) {
            case '^': Str::put(P, '\n'); break;
            case '~': Str::put(P, '\"'); break;
        }
    }
}