Turning textual code written in Inform 6 syntax into a linked list of tokens.

§1. The following code was sketched out on a long night flight to Hong Kong, but there is otherwise nothing exotic about it. In as simple a way as possible, we take the text held in from and break it into Inform 6 tokens. What we return is not literally a linked list, but it amounts to the same thing: a single node holding an unstructured run of tokens —

    EXPRESSION_ISNT
        T1
        T2
        T3
        ...

We follow the syntax of Inform 6, except that we have to look for three extra syntaxes: {-braced-commands}, (+ Inform 7 interpolation +), and, if the abbreviated syntax is allowed, also some cryptic notations such as *1.

The following scanner is basically a finite state machine, and these are the states:

enum NO_TOKSTATE from 1
enum COMMENT_TOKSTATE   currently scanning... an I6 comment ! ...
enum DQUOTED_TOKSTATE   ...double-quoted text
enum SQUOTED_TOKSTATE   ...single-quoted text
enum WHITE_TOKSTATE     ...whitespace
enum TOK_TOKSTATE       ...an actual token
/* Scan the text in from, beginning at character position pos, and add the tokens
   found to the schema sch. When abbreviated is FALSE, {-bracings} and (+ ... +)
   interpolations are looked for; when it is TRUE, the starred notations such as
   *1 are looked for instead. quoted_inames is consulted (under CORE_MODULE) when
   a token is broken off, for names beginning QUOTED_INAME_0_ or QUOTED_INAME_1_. */
void Tokenisation::go(inter_schema *sch, text_stream *from, int pos, int abbreviated,
    int no_quoted_inames, void **quoted_inames) {
    /* Updated by the paragraphs below; consulted when deciding whether a minus
       sign begins a negative number. */
    inter_schema_token *preceding_token = NULL;

    /* Text gathered since the last flush, and the state it was gathered in. */
    text_stream *current_raw = Str::new();
    int tokeniser_state = NO_TOKSTATE;
    int extent = Str::len(from);

    for (; pos < extent; pos++) {
        int c = Str::get_at(from, pos);

        /* Entering a run of white space flushes any pending token; leaving one
           flushes the run itself. Inside quoted text or a comment, white space
           is not treated specially here. */
        if (Characters::is_whitespace(c) == FALSE) {
            if (tokeniser_state == WHITE_TOKSTATE) {
                Absorb raw material, if any1.2;
                tokeniser_state = NO_TOKSTATE;
            }
        } else if ((tokeniser_state == NO_TOKSTATE) || (tokeniser_state == TOK_TOKSTATE)) {
            Absorb raw material, if any1.2;
            tokeniser_state = WHITE_TOKSTATE;
            PUT_TO(current_raw, ' ');
        }

        switch (tokeniser_state) {
            case COMMENT_TOKSTATE:
                /* A comment runs to the end of its line */
                if (c == '\n') tokeniser_state = NO_TOKSTATE;
                break;
            case WHITE_TOKSTATE:
                break;
            case DQUOTED_TOKSTATE:
                if (c != '"') {
                    PUT_TO(current_raw, c);
                } else {
                    Absorb raw material, if any1.2;
                    tokeniser_state = NO_TOKSTATE;
                }
                break;
            case SQUOTED_TOKSTATE:
                if (c != '\'') {
                    PUT_TO(current_raw, c);
                } else {
                    Absorb raw material, if any1.2;
                    tokeniser_state = NO_TOKSTATE;
                }
                break;
            default:
                /* Not inside anything: the character either opens a comment or a
                   quotation, begins one of the special notations, or is raw code */
                if (c == '!') {
                    Absorb raw material, if any1.2;
                    tokeniser_state = COMMENT_TOKSTATE;
                } else if (c == '"') {
                    Absorb raw material, if any1.2;
                    tokeniser_state = DQUOTED_TOKSTATE;
                } else if (c == '\'') {
                    Absorb raw material, if any1.2;
                    tokeniser_state = SQUOTED_TOKSTATE;
                } else if ((c == '{') && (abbreviated == FALSE))
                    Look for a possible bracing1.4
                else if ((c == '*') && (abbreviated == TRUE))
                    Look for a possible abbreviated command1.5
                else if ((c == '(') && (Str::get_at(from, pos+1) == '+') && (abbreviated == FALSE))
                    Look for a possible Inform 7 fragment1.3
                else Absorb a raw character1.1;
                break;
        }
    }
    /* Whatever is still pending at the end of the text becomes the final token(s) */
    Absorb raw material, if any1.2;
}

§1.1. Absorb a raw character1.1 =

    /* Add c to the pending raw text, and record that an ordinary token is under way */
    PUT_TO(current_raw, c);
    tokeniser_state = TOK_TOKSTATE;

§1.2. Absorb raw material, if any1.2 =

    /* Turn the pending text, if there is any, into token(s) appropriate to the state
       it was gathered in; then empty it. The state is reset whether or not there
       was anything to flush. */
    if (Str::len(current_raw) > 0) {
        if (tokeniser_state == WHITE_TOKSTATE) {
            /* any run of white space becomes a single-space token */
            inter_schema_token *white_token =
                InterSchemas::new_token(WHITE_SPACE_ISTT, I" ", 0, 0, -1);
            InterSchemas::add_token(sch, white_token);
        } else if (tokeniser_state == DQUOTED_TOKSTATE) {
            Tokenisation::de_escape_text(current_raw);
            inter_schema_token *dquoted_token =
                InterSchemas::new_token(DQUOTED_ISTT, current_raw, 0, 0, -1);
            InterSchemas::add_token(sch, dquoted_token);
        } else if (tokeniser_state == SQUOTED_TOKSTATE) {
            inter_schema_token *squoted_token =
                InterSchemas::new_token(SQUOTED_ISTT, current_raw, 0, 0, -1);
            InterSchemas::add_token(sch, squoted_token);
        } else {
            Look for individual tokens1.2.1;
        }
        Str::clear(current_raw);
    }
    tokeniser_state = NO_TOKSTATE;

§1.3. Material in (+ ... +) notation is an interpolation of I7 source text.

Look for a possible Inform 7 fragment1.3 =

    int save_pos = pos, accept = FALSE;
    TEMPORARY_TEXT(source_text_fragment)
    pos += 2;
    while (Str::get_at(from, pos)) {
        if ((Str::get_at(from, pos-1) == '+') && (Str::get_at(from, pos) == ')')) {
            Str::delete_last_character(source_text_fragment);
            accept = TRUE; break;
        }
        PUT_TO(source_text_fragment, Str::get_at(from, pos++));
    }
    if (accept) {
        Absorb raw material, if any1.2;
        Expand a fragment of Inform 7 text1.3.1;
    } else { int c = '('; Absorb a raw character1.1; pos = save_pos; }
    DISCARD_TEXT(source_text_fragment)

§1.3.1. Note that the empty I7 interpolation is legal, but produces no token.

Expand a fragment of Inform 7 text1.3.1 =

    /* Only a non-empty interpolation produces a token */
    if (Str::len(source_text_fragment) > 0) {
        inter_schema_token *i7_token =
            InterSchemas::new_token(I7_ISTT, source_text_fragment, 0, 0, -1);
        InterSchemas::add_token(sch, i7_token);
    }

§1.4. Material in braces sometimes indicates an inline command, but not always, because braces often occur innocently in I6 code. So we require the first character after the open-brace not to be white-space, and also not to be a pipe (though I've forgotten why). The text inside the braces is called a "bracing".

Look for a possible bracing1.4 =

    int save_pos = pos++, accept = FALSE;
    TEMPORARY_TEXT(bracing)
    while (TRUE) {
        int c = Str::get_at(from, pos);
        if (c == 0) break;
        if (c == '}') { accept = TRUE; break; }
        PUT_TO(bracing, c);
        pos++;
    }
    int first = Str::get_first_char(bracing);
    if ((accept) && (first != ' ') && (first != '\t') && (first != '\n') && (first != '|')) {
        Absorb raw material, if any1.2;
        Parse a bracing into an inline command1.4.1;
    } else { int c = '{'; Absorb a raw character1.1; pos = save_pos; }
    DISCARD_TEXT(bracing)

§1.4.1. That's everything, then, except the one thing that counts: how to expand a bracing.

Parse a bracing into an inline command1.4.1 =

    /* Make an inline token from the bracing, split it into command and operands,
       and then look the command (and, for two commands, the operand) up by name. */
    inter_schema_token *t = InterSchemas::new_token(INLINE_ISTT, bracing, 0, 0, -1);
    t->bracing = Str::duplicate(bracing);
    t->command = Str::new();
    t->operand = Str::new();
    t->operand2 = Str::new();
    Decompose the bracing1.4.1.1;
    if (Str::len(t->command) > 0) {
        /* Command names and the codes they stand for; the names are all distinct */
        static struct { wchar_t *name; int code; } known_commands[] = {
            { L"primitive-definition", primitive_definition_ISINC },
            { L"new", new_ISINC },
            { L"new-list-of", new_list_of_ISINC },
            { L"printing-routine", printing_routine_ISINC },
            { L"ranger-routine", ranger_routine_ISINC },
            { L"next-routine", next_routine_ISINC },
            { L"previous-routine", previous_routine_ISINC },
            { L"strong-kind", strong_kind_ISINC },
            { L"weak-kind", weak_kind_ISINC },
            { L"backspace", backspace_ISINC },
            { L"erase", erase_ISINC },
            { L"open-brace", open_brace_ISINC },
            { L"close-brace", close_brace_ISINC },
            { L"label", label_ISINC },
            { L"counter", counter_ISINC },
            { L"counter-storage", counter_storage_ISINC },
            { L"counter-up", counter_up_ISINC },
            { L"counter-down", counter_down_ISINC },
            { L"counter-makes-array", counter_makes_array_ISINC },
            { L"by-reference", by_reference_ISINC },
            { L"by-reference-blank-out", by_reference_blank_out_ISINC },
            { L"reference-exists", reference_exists_ISINC },
            { L"lvalue-by-reference", lvalue_by_reference_ISINC },
            { L"by-value", by_value_ISINC },
            { L"box-quotation-text", box_quotation_text_ISINC },
            { L"try-action", try_action_ISINC },
            { L"try-action-silently", try_action_silently_ISINC },
            { L"return-value", return_value_ISINC },
            { L"return-value-from-rule", return_value_from_rule_ISINC },
            { L"property-holds-block-value", property_holds_block_value_ISINC },
            { L"mark-event-used", mark_event_used_ISINC },
            { L"my", my_ISINC },
            { L"unprotect", unprotect_ISINC },
            { L"copy", copy_ISINC },
            { L"initialise", initialise_ISINC },
            { L"matches-description", matches_description_ISINC },
            { L"now-matches-description", now_matches_description_ISINC },
            { L"arithmetic-operation", arithmetic_operation_ISINC },
            { L"say", say_ISINC },
            { L"show-me", show_me_ISINC },
            { L"segment-count", segment_count_ISINC },
            { L"final-segment-marker", final_segment_marker_ISINC },
            { L"list-together", list_together_ISINC },
            { L"rescale", rescale_ISINC }
        };
        /* Subcommands are read from the operand, and only for the owning command */
        static struct { int owner; wchar_t *name; int code; } known_subcommands[] = {
            { primitive_definition_ISINC, L"repeat-through", repeat_through_ISINSC },
            { primitive_definition_ISINC, L"repeat-through-list", repeat_through_list_ISINSC },
            { primitive_definition_ISINC, L"number-of", number_of_ISINSC },
            { primitive_definition_ISINC, L"random-of", random_of_ISINSC },
            { primitive_definition_ISINC, L"total-of", total_of_ISINSC },
            { primitive_definition_ISINC, L"extremal", extremal_ISINSC },
            { primitive_definition_ISINC, L"function-application", function_application_ISINSC },
            { primitive_definition_ISINC, L"description-application", description_application_ISINSC },
            { primitive_definition_ISINC, L"solve-equation", solve_equation_ISINSC },
            { primitive_definition_ISINC, L"switch", switch_ISINSC },
            { primitive_definition_ISINC, L"break", break_ISINSC },
            { primitive_definition_ISINC, L"verbose-checking", verbose_checking_ISINSC },
            { list_together_ISINC, L"unarticled", unarticled_ISINSC },
            { list_together_ISINC, L"articled", articled_ISINSC }
        };
        int no_commands = (int) (sizeof(known_commands)/sizeof(known_commands[0]));
        int no_subcommands = (int) (sizeof(known_subcommands)/sizeof(known_subcommands[0]));

        int command_code = unknown_ISINC, subcommand_code = no_ISINSC;
        for (int i=0; i<no_commands; i++)
            if (Str::eq_wide_string(t->command, known_commands[i].name)) {
                command_code = known_commands[i].code;
                break;
            }
        for (int i=0; i<no_subcommands; i++)
            if ((known_subcommands[i].owner == command_code) &&
                (Str::eq_wide_string(t->operand, known_subcommands[i].name))) {
                subcommand_code = known_subcommands[i].code;
                break;
            }
        t->inline_command = command_code;
        t->inline_subcommand = subcommand_code;
    }

    InterSchemas::add_token(sch, t);
    preceding_token = t;

§1.4.1.1. A bracing can take any of the following forms:

    {-command}
    {-command:operand}
    {-command:operand:operand2}
    {-command:operand<property name}
    {-command:operand>property name}
    {some text}
    {-annotation:some text}

We parse this with the command or annotation in command, the "some text" or operand in bracing, the property name (if given) in extremal_property, the direction of the < or > in extremal_property_sign, and the second, optional, operand in operand2.

Decompose the bracing1.4.1.1 =

    /* Split a bracing which begins with a hyphen into its parts. The text is read
       one character at a time, with portion recording which part is being filled:
       1 for the command, 2 for the operand, 3 for the second operand, and 4 for
       the property name following a < or > sign. A bracing not beginning with a
       hyphen is left exactly as it is. */
    TEMPORARY_TEXT(pname)
    if (Str::get_first_char(t->bracing) == '-') {
        int portion = 1;
        /* i starts at 1 in order to skip the initial hyphen */
        for (int i=1, L = Str::len(t->bracing); i<L; i++) {
            int c = Str::get_at(t->bracing, i);
            switch(portion) {
                case 1:
                    /* the command runs up to the first colon */
                    if (c == ':') portion = 2;
                    else PUT_TO(t->command, c);
                    break;
                case 2:
                    /* the operand runs up to a second colon or, in the core
                       module only, up to a < or > sign */
                    if (c == ':') portion = 3;
                    #ifdef CORE_MODULE
                    else if (c == '<') {
                        t->extremal_property_sign = MEASURE_T_OR_LESS; portion = 4;
                    }
                    else if (c == '>') {
                        t->extremal_property_sign = MEASURE_T_OR_MORE; portion = 4;
                    }
                    #endif
                    else PUT_TO(t->operand, c);
                    break;
                case 3:
                    /* everything after the second colon, further colons included */
                    PUT_TO(t->operand2, c); break;
                case 4:
                    PUT_TO(pname, c); break;
            }
        }
        #ifdef CORE_MODULE
        /* NOTE(review): this relies on extremal_property_sign starting out as
           MEASURE_T_EXACTLY for a new token, which is not visible here */
        if (t->extremal_property_sign != MEASURE_T_EXACTLY) {
            wording W = Feeds::feed_text(pname);
            if (<property-name>(W)) t->extremal_property = <<rp>>;
        }
        #endif
        /* for a command, the bracing text is replaced by the operand alone */
        Str::copy(t->bracing, t->operand);
    }
    DISCARD_TEXT(pname)

§1.5. In abbreviated prototypes, *1 and *2 are placeholders, but a number of modifiers are allowed. See Compilation Schemas (in calculus).

define GIVE_KIND_ID_ISSBM                  1
define GIVE_COMPARISON_ROUTINE_ISSBM       2
define DEREFERENCE_PROPERTY_ISSBM          4
define ADOPT_LOCAL_STACK_FRAME_ISSBM       8
define CAST_TO_KIND_OF_OTHER_TERM_ISSBM        16
define BY_REFERENCE_ISSBM                  32
define LVALUE_CONTEXT_ISSBM                    64
define STORAGE_AS_FUNCTION_ISSBM            128

Look for a possible abbreviated command1.5 =

    /* On entry pos is at a "*". The character after it may be a single modifier,
       which sets bits in iss_bitmap; what follows that decides the outcome:
       a digit makes a placeholder token, "&" makes a combine token, a second "*"
       stands for one literal asterisk, and anything else leaves the asterisk
       as raw text, with scanning resuming at the character after it. */
    int at = pos;
    wchar_t c = Str::get_at(from, ++at);
    int iss_bitmap = 0;
    switch (c) {
        case '!': InterSchemas::throw_error(sch->node_tree,
            I"the '*!' schema notation has been abolished"); break;
        case '%': iss_bitmap = iss_bitmap | LVALUE_CONTEXT_ISSBM;
                  c = Str::get_at(from, ++at); break;
        case '$': iss_bitmap = iss_bitmap | STORAGE_AS_FUNCTION_ISSBM;
                  c = Str::get_at(from, ++at); break;
        case '#': iss_bitmap = iss_bitmap | GIVE_KIND_ID_ISSBM;
                  c = Str::get_at(from, ++at); break;
        case '_': iss_bitmap = iss_bitmap | GIVE_COMPARISON_ROUTINE_ISSBM;
                  c = Str::get_at(from, ++at); break;
        case '+': iss_bitmap = iss_bitmap | DEREFERENCE_PROPERTY_ISSBM;
                  c = Str::get_at(from, ++at); break;
        case '|': iss_bitmap = iss_bitmap | (DEREFERENCE_PROPERTY_ISSBM + LVALUE_CONTEXT_ISSBM);
                  c = Str::get_at(from, ++at); break;
        case '?': iss_bitmap = iss_bitmap | ADOPT_LOCAL_STACK_FRAME_ISSBM;
                  c = Str::get_at(from, ++at); break;
        case '<': iss_bitmap = iss_bitmap | CAST_TO_KIND_OF_OTHER_TERM_ISSBM;
                  c = Str::get_at(from, ++at); break;
        case '^': iss_bitmap = iss_bitmap | (ADOPT_LOCAL_STACK_FRAME_ISSBM + BY_REFERENCE_ISSBM);
                  c = Str::get_at(from, ++at); break;
        case '>': iss_bitmap = iss_bitmap | BY_REFERENCE_ISSBM;
                  c = Str::get_at(from, ++at); break;
    }
    if (Characters::isdigit(c)) {
        Absorb raw material, if any1.2;
        /* the token text is the whole notation, from the asterisk to the digit */
        TEMPORARY_TEXT(T)
        for (int i=pos; i<=at; i++) PUT_TO(T, Str::get_at(from, i));
        inter_schema_token *t = InterSchemas::new_token(INLINE_ISTT, T, 0, 0, -1);
        t->bracing = Str::duplicate(T);
        t->inline_command = substitute_ISINC;
        t->inline_modifiers = iss_bitmap;
        /* placeholders count from 1 in the notation, from 0 in the token */
        t->constant_number = (int) c - (int) '1';
        InterSchemas::add_token(sch, t);
        preceding_token = t;
        DISCARD_TEXT(T)
        pos = at;
    } else if (c == '&') {
        /* As in the digit case, pending raw text must be flushed first: otherwise
           it would be tokenised after this token, and run together with whatever
           raw text follows. */
        Absorb raw material, if any1.2;
        inter_schema_token *t = InterSchemas::new_token(INLINE_ISTT, I"*&", 0, 0, -1);
        t->bracing = I"*&";
        t->inline_command = combine_ISINC;
        t->inline_modifiers = iss_bitmap;
        InterSchemas::add_token(sch, t);
        preceding_token = t;
        pos = at;
    } else if (c == '-') {
        InterSchemas::throw_error(sch->node_tree,
            I"the '*-' schema notation has been abolished");
    } else if (c == '*') {
        /* a doubled asterisk contributes a single raw asterisk */
        int c = '*'; Absorb a raw character1.1;
        pos = at;
    } else {
        /* Not a recognised notation: keep the asterisk itself as raw text. (This
           previously absorbed an open brace in place of the asterisk.) */
        int c = '*'; Absorb a raw character1.1;
    }

§1.2.1. That leaves us with just the main case to handle: raw I6 code which is outside of quotation marks and commentary, and which doesn't include bracings or I7 interpolations. That might look like, for instance,

    Frog + 2*Toad(

(there is no reason to suppose that this stretch of code is complete or matches parentheses); we must tokenise it into

    Frog
    WHITE SPACE
    +
    WHITE SPACE
    2
    *
    Toad
    (

We scan through the text until we reach the start of a new token, and then break off what we scanned through since the last time.

Look for individual tokens1.2.1 =

    /* Walk through the pending raw text. c_start is where the token currently being
       gathered began; the paragraphs used below break tokens off, move p and c_start
       along, and continue the loop when they do so. Breaks are not looked for while
       escaped is set, which each occurrence of the character 0x00A7 toggles. */
    int L = Str::len(current_raw);
    int c_start = 0, escaped = FALSE;
    for (int p = 0; p < L; p++) {
        /* the current character and the two after it, 0 standing for "past the end" */
        wchar_t c1 = Str::get_at(current_raw, p);
        wchar_t c2 = (p+1 < L)?(Str::get_at(current_raw, p+1)):0;
        wchar_t c3 = (p+2 < L)?(Str::get_at(current_raw, p+2)):0;

        if (escaped == FALSE) {
            /* a dollar begins a number unless a letter comes directly before it */
            int dollar_opens_number = ((c1 == '$') &&
                ((p == 0) ||
                    (Characters::isalpha(Str::get_at(current_raw, p-1)) == FALSE)))?TRUE:FALSE;
            if (dollar_opens_number)
                Break off here for real, binary or hexadecimal notation1.2.1.1;
            if (c1 == '-') Break off here for negative number1.2.1.2;
            Break off here for operators1.2.1.3;
        }
        if (c1 == 0x00A7) escaped = (escaped)?FALSE:TRUE;
    }
    /* anything left over at the end is one last token */
    if (c_start < L) {
        int x = c_start, y = L-1;
        Break off a token1.2.1.4;
    }

§1.2.1.1. Recall that in I6 notation, a dollar introduces a non-decimal number, and the character after the initial dollar determines which:

    $+3.14159E2
    $$1001001
    $1FE6

Break off here for real, binary or hexadecimal notation1.2.1.1 =

    int x = c_start, y = p-1;
    Break off a token1.2.1.4;
    switch (c2) {
        case '+': case '-':
            x = p; y = p+1;
            while ((Str::get_at(current_raw, y+1) == '.') ||
                    (Str::get_at(current_raw, y+1) == 'E') ||
                    (Str::get_at(current_raw, y+1) == 'e') ||
                    (Characters::isdigit(Str::get_at(current_raw, y+1))))
                y++;
            Break off a token1.2.1.4;
            p = y;
            c_start = p+1;
            continue;
        case '$':
            x = p; y = p+1;
            while ((Str::get_at(current_raw, y+1) == '0') ||
                    (Str::get_at(current_raw, y+1) == '1'))
                y++;
            Break off a token1.2.1.4;
            p = y;
            c_start = p+1;
            continue;
        default:
            x = p; y = p;
            while (Characters::isalnum(Str::get_at(current_raw, y+1)))
                y++;
            Break off a token1.2.1.4;
            p = y;
            c_start = p+1;
            continue;
    }

§1.2.1.2. A token beginning with a minus sign and continuing with digits may still not be a negative number: it may be the binary subtraction operator. For example, we need to tokenise x-1 as

    x
    -
    1

and not as

    x
    -1

This requires context, that is, remembering what the previous token was.

Break off here for negative number1.2.1.2 =

    /* A minus sign is read as the start of a negative number only when it begins
       the token being gathered, is followed by at least one digit, and comes after
       nothing at all, an open bracket, an operator or a divider. The final clause
       tests preceding_token before dereferencing it: without that test, a minus
       sign at the very start of an abbreviated schema read through a null pointer. */
    if (((preceding_token == NULL) ||
        (preceding_token->ist_type == OPEN_ROUND_ISTT) ||
        (preceding_token->ist_type == OPERATOR_ISTT) ||
        (preceding_token->ist_type == DIVIDER_ISTT)) &&
        (c_start == p) &&
        (!((abbreviated) && (preceding_token) &&
            (preceding_token->ist_type == INLINE_ISTT)))) {
        /* dc runs past the digits which follow the minus sign */
        int dc = p+1;
        while (Characters::isdigit(Str::get_at(current_raw, dc))) dc++;
        if (dc > p+1) {
            int x = c_start, y = p-1;
            Break off a token1.2.1.4;
            x = p; y = dc - 1;
            Break off a token1.2.1.4;
            p = y;
            c_start = p+1;
            continue;
        }
    }

§1.2.1.3. In I6, operators made of non-alphanumeric characters can be up to three characters long, and we take the longest match: thus --> is a trigraph, not the monograph - followed by the digraph ->.

We treat the @ sign as if it were alphanumeric for the sake of assembly language opcodes such as @pull.

Break off here for operators1.2.1.3 =

    int monograph = TRUE, digraph = FALSE, trigraph = FALSE;
    if ((Characters::isalnum(c1)) || (c1 == '_') || (c1 == '$')) monograph = FALSE;
    if (c1 == 0x00A7) monograph = FALSE;
    if ((c1 == '#') && (Characters::isalpha(c2))) monograph = FALSE;
    if ((c1 == '_') && (Characters::isalpha(c2))) monograph = FALSE;
    if ((c1 == '#') && (c2 == '#') && (Characters::isalpha(c3))) monograph = FALSE;
    if ((c1 == '@') && (Characters::isalpha(c2))) monograph = FALSE;

    if ((c1 == '+') && (c2 == '+')) digraph = TRUE;
    if ((c1 == '-') && (c2 == '-')) digraph = TRUE;
    if ((c1 == '>') && (c2 == '=')) digraph = TRUE;
    if ((c1 == '<') && (c2 == '=')) digraph = TRUE;
    if ((c1 == '=') && (c2 == '=')) digraph = TRUE;
    if ((c1 == '-') && (c2 == '>')) digraph = TRUE;
    if ((c1 == '.') && (c2 == '&')) digraph = TRUE;
    if ((c1 == '.') && (c2 == '#')) digraph = TRUE;
    if ((c1 == '~') && (c2 == '~')) digraph = TRUE;
    if ((c1 == '~') && (c2 == '=')) digraph = TRUE;
    if ((c1 == '&') && (c2 == '&')) digraph = TRUE;
    if ((c1 == '|') && (c2 == '|')) digraph = TRUE;
    if ((c1 == '>') && (c2 == '>')) digraph = TRUE;

    if ((c1 == '-') && (c2 == '-') && (c3 == '>')) trigraph = TRUE;

    if (trigraph) {
        int x = c_start, y = p-1;
        Break off a token1.2.1.4;
        x = p; y = p+2;
        Break off a token1.2.1.4;
        p += 2;
        c_start = p+1;
        continue;
    }

    if (digraph) {
        int x = c_start, y = p-1;
        Break off a token1.2.1.4;
        x = p; y = p+1;
        Break off a token1.2.1.4;
        p++;
        c_start = p+1;
        continue;
    }

    if (monograph) {
        int x = c_start, y = p-1;
        Break off a token1.2.1.4;
        x = p; y = p;
        Break off a token1.2.1.4;
        c_start = p+1;
        continue;
    }

§1.2.1.4. In this code, the new token is between character positions x and y inclusive; we ignore an empty token.

Break off a token1.2.1.4 =

    /* Make a token from characters x to y inclusive of the pending raw text, unless
       that range is empty. */
    if (y >= x) {
        TEMPORARY_TEXT(T)
        for (int i = x; i <= y; i++) PUT_TO(T, Str::get_at(current_raw, i));

        /* these are the defaults, which the identification below may change */
        int is = RAW_ISTT;
        inter_ti which = 0;
        int which_rw = 0, which_number = -1, which_quote = -1;
        Identify this new token1.2.1.4.1;

        inter_schema_token *n = InterSchemas::new_token(is, T, which, which_rw, which_number);
        #ifdef CORE_MODULE
        /* Use the quoted name only if the caller actually supplied that many: the
           array was previously indexed without regard to no_quoted_inames. */
        if ((which_quote >= 0) && (which_quote < no_quoted_inames) && (quoted_inames))
            n->as_quoted = quoted_inames[which_quote];
        #endif
        InterSchemas::add_token(sch, n);
        /* white space is never remembered as the preceding token */
        if (n->ist_type != WHITE_SPACE_ISTT) preceding_token = n;
        DISCARD_TEXT(T)
    }

§1.2.1.4.1. Finally, we identify what sort of token we're looking at. It would be elegant to reimplement this with a trie (e.g. using Tries and Avinues (in foundation)), but speed is not quite important enough to make it worthwhile.

define LOWEST_XBIP_VALUE HAS_XBIP
enum HAS_XBIP from 10000
enum HASNT_XBIP
enum READ_XBIP
enum OWNERKIND_XBIP
define HIGHEST_XBIP_VALUE OWNERKIND_XBIP

Identify this new token1.2.1.4.1 =

    /* Classify the token text T by setting is and, where relevant, which, which_rw,
       which_number and which_quote. The tests are applied in sequence and a later
       test overrides an earlier one, so their order matters. */

    /* An initial @ marks an opcode; an initial 0x00A7 marks an identifier */
    if (Str::get_at(T, 0) == '@') is = OPCODE_ISTT;
    if (Str::get_at(T, 0) == 0x00A7)
        is = IDENTIFIER_ISTT;
    /* ## followed by a letter: an identifier if made only of alphanumerics, _ and # */
    if ((Str::get_at(T, 0) == '#') && (Str::get_at(T, 1) == '#') &&
        (Characters::isalpha(Str::get_at(T, 2)))) {
        is = IDENTIFIER_ISTT;
        LOOP_THROUGH_TEXT(P, T) {
            wchar_t c = Str::get(P);
            if ((c != '_') && (c != '#') && (!Characters::isalnum(c)))
                is = RAW_ISTT;
        }
    }
    /* # followed by a letter: as above, except that $ is also allowed */
    if ((Str::get_at(T, 0) == '#') && (Characters::isalpha(Str::get_at(T, 1)))) {
        is = IDENTIFIER_ISTT;
        LOOP_THROUGH_TEXT(P, T) {
            wchar_t c = Str::get(P);
            if ((c != '_') && (c != '#') && (c != '$') && (!Characters::isalnum(c)))
                is = RAW_ISTT;
        }
    }
    /* _ followed by a letter */
    if ((Str::get_at(T, 0) == '_') && (Characters::isalpha(Str::get_at(T, 1)))) {
        is = IDENTIFIER_ISTT;
        LOOP_THROUGH_TEXT(P, T) {
            wchar_t c = Str::get(P);
            if ((c != '_') && (c != '#') && (!Characters::isalnum(c)))
                is = RAW_ISTT;
        }
    }
    /* An initial letter: an identifier if made only of alphanumerics and _ */
    if (Characters::isalpha(Str::get_at(T, 0))) {
        is = IDENTIFIER_ISTT;
        LOOP_THROUGH_TEXT(P, T) {
            wchar_t c = Str::get(P);
            if ((c != '_') && (!Characters::isalnum(c)))
                is = RAW_ISTT;
        }
        /* names with these prefixes refer to entries 0 and 1 of quoted_inames */
        if (Str::begins_with_wide_string(T, L"QUOTED_INAME_0_")) which_quote = 0;
        else if (Str::begins_with_wide_string(T, L"QUOTED_INAME_1_")) which_quote = 1;
        /* two names are respelled with different casing */
        if (Str::eq(T, I"I7_string")) { Str::clear(T); WRITE_TO(T, "I7_String"); }
        if (Str::eq(T, I"COMMA_WORD")) { Str::clear(T); WRITE_TO(T, "comma_word"); }
    }
    /* An initial digit: a number only if every character is a digit */
    if (Characters::isdigit(Str::get_at(T, 0))) {
        is = NUMBER_ISTT;
        LOOP_THROUGH_TEXT(P, T) {
            wchar_t c = Str::get(P);
            if (!Characters::isdigit(c))
                is = RAW_ISTT;
        }
    }
    /* An initial dollar: hexadecimal, unless the next character says otherwise */
    if (Str::get_at(T, 0) == '$') {
        is = HEX_NUMBER_ISTT;
        wchar_t c = Str::get_at(T, 1);
        if (c == '$') is = BIN_NUMBER_ISTT;
        if (c == '+') is = REAL_NUMBER_ISTT;
        if (c == '-') is = REAL_NUMBER_ISTT;
    }
    /* An initial minus sign means a number, though the operator test at the end
       overrides this when T is an operator notation */
    if (Str::get_at(T, 0) == '-') is = NUMBER_ISTT;

    /* Three names are read as numbers with known values */
    if (Str::eq(T, I"false")) { is = NUMBER_ISTT; which_number = 0; }
    if (Str::eq(T, I"true")) { is = NUMBER_ISTT; which_number = 1; }
    if (Str::eq(T, I"nothing")) { is = NUMBER_ISTT; which_number = 0; }

    /* Reserved words, which are matched case-sensitively */
    if (Str::eq(T, I"if")) { is = RESERVED_ISTT; which_rw = IF_I6RW; }
    if (Str::eq(T, I"else")) { is = RESERVED_ISTT; which_rw = ELSE_I6RW; }
    if (Str::eq(T, I"style")) { is = RESERVED_ISTT; which_rw = STYLE_I6RW; }
    if (Str::eq(T, I"return")) { is = RESERVED_ISTT; which_rw = RETURN_I6RW; }
    if (Str::eq(T, I"rtrue")) { is = RESERVED_ISTT; which_rw = RTRUE_I6RW; }
    if (Str::eq(T, I"rfalse")) { is = RESERVED_ISTT; which_rw = RFALSE_I6RW; }
    if (Str::eq(T, I"for")) { is = RESERVED_ISTT; which_rw = FOR_I6RW; }
    if (Str::eq(T, I"objectloop")) { is = RESERVED_ISTT; which_rw = OBJECTLOOP_I6RW; }
    if (Str::eq(T, I"while")) { is = RESERVED_ISTT; which_rw = WHILE_I6RW; }
    if (Str::eq(T, I"do")) { is = RESERVED_ISTT; which_rw = DO_I6RW; }
    if (Str::eq(T, I"until")) { is = RESERVED_ISTT; which_rw = UNTIL_I6RW; }
    if (Str::eq(T, I"print")) { is = RESERVED_ISTT; which_rw = PRINT_I6RW; }
    if (Str::eq(T, I"print_ret")) { is = RESERVED_ISTT; which_rw = PRINTRET_I6RW; }
    if (Str::eq(T, I"new_line")) { is = RESERVED_ISTT; which_rw = NEWLINE_I6RW; }
    if (Str::eq(T, I"give")) { is = RESERVED_ISTT; which_rw = GIVE_I6RW; }
    if (Str::eq(T, I"move")) { is = RESERVED_ISTT; which_rw = MOVE_I6RW; }
    if (Str::eq(T, I"remove")) { is = RESERVED_ISTT; which_rw = REMOVE_I6RW; }
    if (Str::eq(T, I"jump")) { is = RESERVED_ISTT; which_rw = JUMP_I6RW; }
    if (Str::eq(T, I"switch")) { is = RESERVED_ISTT; which_rw = SWITCH_I6RW; }
    if (Str::eq(T, I"default")) { is = RESERVED_ISTT; which_rw = DEFAULT_I6RW; }
    if (Str::eq(T, I"font")) { is = RESERVED_ISTT; which_rw = FONT_I6RW; }
    if (Str::eq(T, I"continue")) { is = RESERVED_ISTT; which_rw = CONTINUE_I6RW; }
    if (Str::eq(T, I"break")) { is = RESERVED_ISTT; which_rw = BREAK_I6RW; }
    if (Str::eq(T, I"quit")) { is = RESERVED_ISTT; which_rw = QUIT_I6RW; }
    if (Str::eq(T, I"restore")) { is = RESERVED_ISTT; which_rw = RESTORE_I6RW; }
    if (Str::eq(T, I"spaces")) { is = RESERVED_ISTT; which_rw = SPACES_I6RW; }
    if (Str::eq(T, I"read")) { is = RESERVED_ISTT; which_rw = READ_I6RW; }
    if (Str::eq(T, I"inversion")) { is = RESERVED_ISTT; which_rw = INVERSION_I6RW; }

    /* Conditional compilation directives, which are matched in any casing */
    if (Str::eq_insensitive(T, I"#IFDEF")) { is = DIRECTIVE_ISTT; which_rw = IFDEF_I6RW; }
    if (Str::eq_insensitive(T, I"#IFNDEF")) { is = DIRECTIVE_ISTT; which_rw = IFNDEF_I6RW; }
    if (Str::eq_insensitive(T, I"#IFTRUE")) { is = DIRECTIVE_ISTT; which_rw = IFTRUE_I6RW; }
    if (Str::eq_insensitive(T, I"#IFFALSE")) { is = DIRECTIVE_ISTT; which_rw = IFFALSE_I6RW; }
    if (Str::eq_insensitive(T, I"#IFNOT")) { is = DIRECTIVE_ISTT; which_rw = IFNOT_I6RW; }
    if (Str::eq_insensitive(T, I"#ENDIF")) { is = DIRECTIVE_ISTT; which_rw = ENDIF_I6RW; }

    /* Punctuation marks which have token types of their own */
    if (Str::eq(T, I",")) is = COMMA_ISTT;
    if (Str::eq(T, I":")) is = COLON_ISTT;
    if (Str::eq(T, I"(")) is = OPEN_ROUND_ISTT;
    if (Str::eq(T, I")")) is = CLOSE_ROUND_ISTT;
    if (Str::eq(T, I"{")) is = OPEN_BRACE_ISTT;
    if (Str::eq(T, I"}")) is = CLOSE_BRACE_ISTT;
    if (Str::eq(T, I";")) is = DIVIDER_ISTT;

    /* Lastly, a positive result from the operator lookup overrides all of the above */
    inter_ti x = I6Operators::notation_to_BIP(T);
    if (x > 0) { is = OPERATOR_ISTT; which = x; }

§2. Anticlimactically: a function to deal with escape characters in Inform 6 double-quoted text notation.

/* Rewrite the text m in place, in two passes. First, each run of spaces, tabs and
   newlines which includes at least one newline, and which is followed by some other
   character, is reduced to a single space. Then each ^ becomes a newline and each
   ~ becomes a double-quotation mark. */
void Tokenisation::de_escape_text(text_stream *m) {
    /* run_start is -1 when not inside a run of white space */
    int run_start = -1, run_len = 0, run_includes = FALSE;
    int i = 0;
    while (i < Str::len(m)) {
        wchar_t c = Str::get_at(m, i);
        int is_white = ((c == ' ') || (c == '\t') || (c == '\n'))?TRUE:FALSE;
        if (is_white) {
            if (run_start < 0) {
                run_start = i;
                run_len = 0;
                run_includes = FALSE;
            }
            run_len++;
            if (c == '\n') run_includes = TRUE;
        } else {
            if ((run_includes) && (run_start >= 0)) {
                /* keep one space at the start of the run and delete the rest of it */
                Str::put_at(m, run_start, ' ');
                int surplus = run_len - 1;
                while (surplus-- > 0) Str::delete_nth_character(m, run_start+1);
                /* the text has shrunk: resume just after the space which remains */
                i = run_start;
            }
            run_start = -1;
        }
        i++;
    }
    LOOP_THROUGH_TEXT(P, m) {
        switch (Str::get(P)) {
            case '^': Str::put(P, '\n'); break;
            case '~': Str::put(P, '\"'); break;
        }
    }
}