inform7/inform6/Inform6/lexer.c

/* ------------------------------------------------------------------------- */
/*   "lexer" : Lexical analyser                                              */
/*                                                                           */
/*   Part of Inform 6.34                                                     */
/*   copyright (c) Graham Nelson 1993 - 2020                                 */
/*                                                                           */
/* ------------------------------------------------------------------------- */

#include "header.h"

int total_source_line_count,            /* Number of source lines so far     */

    no_hash_printed_yet,                /* Have not yet printed the first #  */
    hash_printed_since_newline,         /* A hash has been printed since the
                                           most recent new-line was printed
                                           (generally as a result of an error
                                           message or the start of pass)     */
    dont_enter_into_symbol_table,       /* Return names as text (with
                                           token type DQ_TT, i.e., as if
                                           they had double-quotes around)
                                           and not as entries in the symbol
                                           table, when TRUE. If -2, only the
                                           keyword table is searched.        */
    return_sp_as_variable;              /* When TRUE, the word "sp" denotes
                                           the stack pointer variable
                                           (used in assembly language only)  */
int next_token_begins_syntax_line;      /* When TRUE, start a new syntax
                                           line (for error reporting, etc.)
                                           on the source code line where
                                           the next token appears            */

int32 last_mapped_line;  /* Last syntax line reported to debugging file      */

/* ------------------------------------------------------------------------- */
/*   The lexer's output is a sequence of triples, each called a "token",     */
/*   representing one lexical unit (or "lexeme") each.  Instead of providing */
/*   "lookahead" (that is, always having available the next token after the  */
/*   current one, so that syntax analysers higher up in Inform can have      */
/*   advance knowledge of what is coming), the lexer instead has a system    */
/*   where tokens can be read in and then "put back again".                  */
/*   The meaning of the number (and to some extent the text) supplied with   */
/*   a token depends on its type: see "header.h" for the list of types.      */
/*   For example, the lexeme "$1e3" is understood by Inform as a hexadecimal */
/*   number, and translated to the token:                                    */
/*     type NUMBER_TT, value 483, text "$1e3"                                */
/* ------------------------------------------------------------------------- */
/*   These three variables are set to the current token on a call to         */
/*   get_next_token() (but are not changed by a call to put_token_back()).   */
/* ------------------------------------------------------------------------- */

int token_type;
int32 token_value;
char *token_text;

/* ------------------------------------------------------------------------- */
/*   The next two variables are the head and tail of a singly linked list.   */
/*   The tail stores the portion most recently read from the current         */
/*   lexical block; its end values therefore describe the location of the    */
/*   current token, and are updated whenever the three variables above are   */
/*   via set_token_location(...).  Earlier vertices, if any, represent the   */
/*   regions of lexical blocks read beforehand, where new vertices are       */
/*   only introduced by interruptions like a file inclusion or an EOF.       */
/*   Vertices are deleted off of the front of the list once they are no      */
/*   longer referenced by pending debug information records.                 */
/* ------------------------------------------------------------------------- */

static debug_locations *first_token_locations;
static debug_locations *last_token_location;

extern debug_location get_token_location(void)
{   debug_location result;
    debug_location *location = &(last_token_location->location);
    result.file_index = location->file_index;
    result.beginning_byte_index = location->end_byte_index;
    result.end_byte_index = location->end_byte_index;
    result.beginning_line_number = location->end_line_number;
    result.end_line_number = location->end_line_number;
    result.beginning_character_number = location->end_character_number;
    result.end_character_number = location->end_character_number;
    result.orig_file_index = location->orig_file_index;
    result.orig_beg_line_number = location->orig_beg_line_number;
    result.orig_beg_char_number = location->orig_beg_char_number;
    return result;
}

extern debug_locations get_token_locations(void)
{   debug_locations result;
    result.location = get_token_location();
    result.next = NULL;
    result.reference_count = 0;
    return result;
}

static void set_token_location(debug_location location)
{   if (location.file_index == last_token_location->location.file_index)
    {   last_token_location->location.end_byte_index =
            location.end_byte_index;
        last_token_location->location.end_line_number =
            location.end_line_number;
        last_token_location->location.end_character_number =
            location.end_character_number;
        last_token_location->location.orig_file_index =
            location.orig_file_index;
        last_token_location->location.orig_beg_line_number =
            location.orig_beg_line_number;
        last_token_location->location.orig_beg_char_number =
            location.orig_beg_char_number;
    } else
    {   debug_locations*successor =
            my_malloc
                (sizeof(debug_locations),
                 "debug locations of recent tokens");
        successor->location = location;
        successor->next = NULL;
        successor->reference_count = 0;
        last_token_location->next = successor;
        last_token_location = successor;
    }
}

extern debug_location_beginning get_token_location_beginning(void)
{   debug_location_beginning result;
    ++(last_token_location->reference_count);
    result.head = last_token_location;
    result.beginning_byte_index =
        last_token_location->location.end_byte_index;
    result.beginning_line_number =
        last_token_location->location.end_line_number;
    result.beginning_character_number =
        last_token_location->location.end_character_number;
    result.orig_file_index = last_token_location->location.orig_file_index;
    result.orig_beg_line_number = last_token_location->location.orig_beg_line_number;
    result.orig_beg_char_number = last_token_location->location.orig_beg_char_number;

    return result;
}

static void cleanup_token_locations(debug_location_beginning*beginning)
{   if (first_token_locations)
    {   while (first_token_locations &&
               !first_token_locations->reference_count)
        {   debug_locations*moribund = first_token_locations;
            first_token_locations = moribund->next;
            my_free(&moribund, "debug locations of recent tokens");
            if (beginning &&
                (beginning->head == moribund || !first_token_locations))
            {   compiler_error
                    ("Records needed by a debug_location_beginning are no "
                     "longer allocated, perhaps because of an invalid reuse "
                     "of this or an earlier beginning");
            }
        }
    } else
    {   if (beginning)
        {   compiler_error
                ("Attempt to use a debug_location_beginning when no token "
                 "locations are defined");
        } else
        {   compiler_error
                ("Attempt to clean up token locations when no token locations "
                 "are defined");
        }
    }
}

extern void discard_token_location(debug_location_beginning beginning)
{   --(beginning.head->reference_count);
}

extern debug_locations get_token_location_end
    (debug_location_beginning beginning)
{   debug_locations result;
    cleanup_token_locations(&beginning);
    --(beginning.head->reference_count);
    /* Sometimes we know what we'll read before we switch to the lexical block
       where we'll read it.  In such cases the beginning will be placed in the
       prior block and last exactly zero bytes there.  It's misleading to
       include such ranges, so we gobble them. */
    if (beginning.head->location.end_byte_index ==
          beginning.beginning_byte_index &&
        beginning.head->next)
    {   beginning.head = beginning.head->next;
        result.location = beginning.head->location;
        result.location.beginning_byte_index = 0;
        result.location.beginning_line_number = 1;
        result.location.beginning_character_number = 1;
    } else
    {   result.location = beginning.head->location;
        result.location.beginning_byte_index =
            beginning.beginning_byte_index;
        result.location.beginning_line_number =
            beginning.beginning_line_number;
        result.location.beginning_character_number =
            beginning.beginning_character_number;
    }

    result.location.orig_file_index =
        beginning.orig_file_index;
    result.location.orig_beg_line_number =
        beginning.orig_beg_line_number;
    result.location.orig_beg_char_number =
        beginning.orig_beg_char_number;

    result.next = beginning.head->next;
    result.reference_count = 0;
    return result;
}

/* ------------------------------------------------------------------------- */
/*   In order to be able to put tokens back efficiently, the lexer stores    */
/*   tokens in a "circle": the variable circle_position ranges between       */
/*   0 and CIRCLE_SIZE-1.  We only need a circle size as large as the        */
/*   maximum number of tokens ever put back at once, plus 1 (in effect, the  */
/*   maximum token lookahead ever needed in syntax analysis, plus 1).        */
/*                                                                           */
/*   Unlike some compilers, Inform does not have a context-free lexer: in    */
/*   fact it has 12288 different possible states.  However, the context only */
/*   affects the interpretation of "identifiers": lexemes beginning with a   */
/*   letter and containing up to 32 chars of alphanumeric and underscore     */
/*   chars.  (For example, "default" may refer to the directive or statement */
/*   of that name, and which token values are returned depends on the        */
/*   current lexical context.)                                               */
/*                                                                           */
/*   Along with each token, we also store the lexical context it was         */
/*   translated under; because if it is called for again, there may need     */
/*   to be a fresh interpretation of it if the context has changed.          */
/* ------------------------------------------------------------------------- */

#define CIRCLE_SIZE 6

/*   (The worst case for token lookahead is distinguishing between an
     old-style "objectloop (a in b)" and a new "objectloop (a in b ...)".)   */

static int circle_position;
static token_data circle[CIRCLE_SIZE];

static int token_contexts[CIRCLE_SIZE];

/* ------------------------------------------------------------------------- */
/*   A complication, however, is that the text of some lexemes needs to be   */
/*   held in Inform's memory for much longer periods: for example, a         */
/*   dictionary word lexeme (like "'south'") must have its text preserved    */
/*   until the code generation time for the expression it occurs in, when    */
/*   the dictionary reference is actually made.  Code generation in general  */
/*   occurs as early as possible in Inform: pending some better method of    */
/*   garbage collection, we simply use a buffer so large that unless         */
/*   expressions spread across 10K of source code are found, there can be    */
/*   no problem.                                                             */
/* ------------------------------------------------------------------------- */

static char *lexeme_memory;
static char *lex_p;                     /* Current write position            */

/* ------------------------------------------------------------------------- */
/*   The lexer itself needs up to 3 characters of lookahead (it uses an      */
/*   LR(3) grammar to translate characters into tokens).                     */
/* ------------------------------------------------------------------------- */

#define LOOKAHEAD_SIZE 3

static int current, lookahead,          /* The latest character read, and    */
    lookahead2, lookahead3;             /* the three characters following it */

static int pipeline_made;               /* Whether or not the pipeline of
                                           characters has been constructed
                                           yet (this pass)                   */

static int (* get_next_char)(void);     /* Routine for reading the stream of
                                           characters: the lexer does not
                                           need any "ungetc" routine for
                                           putting them back again.  End of
                                           stream is signalled by returning
                                           zero.                             */

static char *source_to_analyse;         /* The current lexical source:
                                           NULL for "load from source files",
                                           otherwise this points to a string
                                           containing Inform code            */

static int tokens_put_back;             /* Count of the number of backward
                                           moves made from the last-read
                                           token                             */

extern void describe_token(token_data t)
{
    /*  Many of the token types are not set in this file, but later on in
        Inform's higher stages (for example, in the expression evaluator);
        but this routine describes them all.                                 */

    printf("{ ");

    switch(t.type)
    {
        /*  The following token types occur in lexer output:                 */

        case SYMBOL_TT:          printf("symbol ");
                                 describe_symbol(t.value);
                                 break;
        case NUMBER_TT:          printf("literal number %d", t.value);
                                 break;
        case DQ_TT:              printf("string \"%s\"", t.text);
                                 break;
        case SQ_TT:              printf("string '%s'", t.text);
                                 break;
        case SEP_TT:             printf("separator '%s'", t.text);
                                 break;
        case EOF_TT:             printf("end of file");
                                 break;

        case STATEMENT_TT:       printf("statement name '%s'", t.text);
                                 break;
        case SEGMENT_MARKER_TT:  printf("object segment marker '%s'", t.text);
                                 break;
        case DIRECTIVE_TT:       printf("directive name '%s'", t.text);
                                 break;
        case CND_TT:             printf("textual conditional '%s'", t.text);
                                 break;
        case OPCODE_NAME_TT:     printf("opcode name '%s'", t.text);
                                 break;
        case SYSFUN_TT:          printf("built-in function name '%s'", t.text);
                                 break;
        case LOCAL_VARIABLE_TT:  printf("local variable name '%s'", t.text);
                                 break;
        case MISC_KEYWORD_TT:    printf("statement keyword '%s'", t.text);
                                 break;
        case DIR_KEYWORD_TT:     printf("directive keyword '%s'", t.text);
                                 break;
        case TRACE_KEYWORD_TT:   printf("'trace' keyword '%s'", t.text);
                                 break;
        case SYSTEM_CONSTANT_TT: printf("system constant name '%s'", t.text);
                                 break;

        /*  The remaining are etoken types, not set by the lexer             */

        case OP_TT:              printf("operator '%s'",
                                     operators[t.value].description);
                                 break;
        case ENDEXP_TT:          printf("end of expression");
                                 break;
        case SUBOPEN_TT:         printf("open bracket");
                                 break;
        case SUBCLOSE_TT:        printf("close bracket");
                                 break;
        case LARGE_NUMBER_TT:    printf("large number: '%s'=%d",t.text,t.value);
                                 break;
        case SMALL_NUMBER_TT:    printf("small number: '%s'=%d",t.text,t.value);
                                 break;
        case VARIABLE_TT:        printf("variable '%s'=%d", t.text, t.value);
                                 break;
        case DICTWORD_TT:        printf("dictionary word '%s'", t.text);
                                 break;
        case ACTION_TT:          printf("action name '%s'", t.text);
                                 break;

        default:
            printf("** unknown token type %d, text='%s', value=%d **",
            t.type, t.text, t.value);
    }
    printf(" }");
}

/* ------------------------------------------------------------------------- */
/*   All but one of the 280 Inform keywords (118 of them opcode names used   */
/*   only by the assembler).  (The one left over is "sp", a keyword used in  */
/*   assembly language only.)                                                */
/*                                                                           */
/*   A "keyword group" is a set of keywords to be searched for.  If a match  */
/*   is made on an identifier, the token type becomes that given in the KG   */
/*   and the token value is its index in the KG.                             */
/*                                                                           */
/*   The keyword ordering must correspond with the appropriate #define's in  */
/*   "header.h" but is otherwise not significant.                            */
/* ------------------------------------------------------------------------- */

#define MAX_KEYWORDS 350

/* The values will be filled in at compile time, when we know
   which opcode set to use. */
keyword_group opcode_names =
{ { "" },
    OPCODE_NAME_TT, FALSE, TRUE
};

static char *opcode_list_z[] = {
    "je", "jl", "jg", "dec_chk", "inc_chk", "jin", "test", "or", "and",
    "test_attr", "set_attr", "clear_attr", "store", "insert_obj", "loadw",
    "loadb", "get_prop", "get_prop_addr", "get_next_prop", "add", "sub",
    "mul", "div", "mod", "call", "storew", "storeb", "put_prop", "sread",
    "print_char", "print_num", "random", "push", "pull", "split_window",
    "set_window", "output_stream", "input_stream", "sound_effect", "jz",
    "get_sibling", "get_child", "get_parent", "get_prop_len", "inc", "dec",
    "print_addr", "remove_obj", "print_obj", "ret", "jump", "print_paddr",
    "load", "not", "rtrue", "rfalse", "print", "print_ret", "nop", "save",
    "restore", "restart", "ret_popped", "pop", "quit", "new_line",
    "show_status", "verify", "call_2s", "call_vs", "aread", "call_vs2",
    "erase_window", "erase_line", "set_cursor", "get_cursor",
    "set_text_style", "buffer_mode", "read_char", "scan_table", "call_1s",
    "call_2n", "set_colour", "throw", "call_vn", "call_vn2", "tokenise",
    "encode_text", "copy_table", "print_table", "check_arg_count", "call_1n",
    "catch", "piracy", "log_shift", "art_shift", "set_font", "save_undo",
    "restore_undo", "draw_picture", "picture_data", "erase_picture",
    "set_margins", "move_window", "window_size", "window_style",
    "get_wind_prop", "scroll_window", "pop_stack", "read_mouse",
    "mouse_window", "push_stack", "put_wind_prop", "print_form",
    "make_menu", "picture_table", "print_unicode", "check_unicode",
    ""
};

static char *opcode_list_g[] = {
    "nop", "add", "sub", "mul", "div", "mod", "neg", "bitand", "bitor",
    "bitxor", "bitnot", "shiftl", "sshiftr", "ushiftr", "jump", "jz",
    "jnz", "jeq", "jne", "jlt", "jge", "jgt", "jle",
    "jltu", "jgeu", "jgtu", "jleu",
    "call", "return",
    "catch", "throw", "tailcall",
    "copy", "copys", "copyb", "sexs", "sexb", "aload",
    "aloads", "aloadb", "aloadbit", "astore", "astores", "astoreb",
    "astorebit", "stkcount", "stkpeek", "stkswap", "stkroll", "stkcopy",
    "streamchar", "streamnum", "streamstr",
    "gestalt", "debugtrap", "getmemsize", "setmemsize", "jumpabs",
    "random", "setrandom", "quit", "verify",
    "restart", "save", "restore", "saveundo", "restoreundo", "protect",
    "glk", "getstringtbl", "setstringtbl", "getiosys", "setiosys",
    "linearsearch", "binarysearch", "linkedsearch",
    "callf", "callfi", "callfii", "callfiii",
    "streamunichar",
    "mzero", "mcopy", "malloc", "mfree",
    "accelfunc", "accelparam",
    "numtof", "ftonumz", "ftonumn", "ceil", "floor",
    "fadd", "fsub", "fmul", "fdiv", "fmod",
    "sqrt", "exp", "log", "pow",
    "sin", "cos", "tan", "asin", "acos", "atan", "atan2",
    "jfeq", "jfne", "jflt", "jfle", "jfgt", "jfge", "jisnan", "jisinf",
    ""
};

keyword_group opcode_macros =
{ { "" },
  OPCODE_MACRO_TT, FALSE, TRUE
};

static char *opmacro_list_z[] = { "" };

static char *opmacro_list_g[] = {
    "pull", "push",
    ""
};

keyword_group directives =
{ { "abbreviate", "array", "attribute", "class", "constant",
    "default", "dictionary", "end", "endif", "extend", "fake_action",
    "global", "ifdef", "ifndef", "ifnot", "ifv3", "ifv5", "iftrue",
    "iffalse", "import", "include", "link", "lowstring", "message",
    "nearby", "object", "origsource", "property", "release", "replace",
    "serial", "switches", "statusline", "stub", "system_file", "trace",
    "undef", "verb", "version", "zcharacter",
    "" },
    DIRECTIVE_TT, FALSE, FALSE
};

keyword_group trace_keywords =
{ { "dictionary", "symbols", "objects", "verbs",
    "assembly", "expressions", "lines", "tokens", "linker",
    "on", "off", "" },
    TRACE_KEYWORD_TT, FALSE, TRUE
};

keyword_group segment_markers =
{ { "class", "has", "private", "with", "" },
    SEGMENT_MARKER_TT, FALSE, TRUE
};

keyword_group directive_keywords =
{ { "alias", "long", "additive",
    "score", "time",
    "noun", "held", "multi", "multiheld", "multiexcept",
    "multiinside", "creature", "special", "number", "scope", "topic",
    "reverse", "meta", "only", "replace", "first", "last",
    "string", "table", "buffer", "data", "initial", "initstr",
    "with", "private", "has", "class",
    "error", "fatalerror", "warning",
    "terminating", "static",
    "" },
    DIR_KEYWORD_TT, FALSE, TRUE
};

keyword_group misc_keywords =
{ { "char", "name", "the", "a", "an", "The", "number",
    "roman", "reverse", "bold", "underline", "fixed", "on", "off",
    "to", "address", "string", "object", "near", "from", "property", "A", "" },
    MISC_KEYWORD_TT, FALSE, TRUE
};

keyword_group statements =
{ { "box", "break", "continue", "default", "do", "else", "font", "for",
    "give", "if", "inversion", "jump", "move", "new_line", "objectloop",
    "print", "print_ret", "quit", "read", "remove", "restore", "return",
    "rfalse", "rtrue", "save", "spaces", "string", "style", "switch",
    "until", "while", "" },
    STATEMENT_TT, FALSE, TRUE
};

keyword_group conditions =
{ { "has", "hasnt", "in", "notin", "ofclass", "or", "provides", "" },
    CND_TT, FALSE, TRUE
};

keyword_group system_functions =
{ { "child", "children", "elder", "eldest", "indirect", "parent", "random",
    "sibling", "younger", "youngest", "metaclass", "glk", "" },
    SYSFUN_TT, FALSE, TRUE
};

keyword_group system_constants =
{ { "adjectives_table", "actions_table", "classes_table",
    "identifiers_table", "preactions_table", "version_number",
    "largest_object", "strings_offset", "code_offset",
    "dict_par1", "dict_par2", "dict_par3", "actual_largest_object",
    "static_memory_offset", "array_names_offset", "readable_memory_offset",
    "cpv__start", "cpv__end", "ipv__start", "ipv__end",
    "array__start", "array__end",
    "lowest_attribute_number", "highest_attribute_number",
    "attribute_names_array",
    "lowest_property_number", "highest_property_number",
    "property_names_array",
    "lowest_action_number", "highest_action_number",
    "action_names_array",
    "lowest_fake_action_number", "highest_fake_action_number",
    "fake_action_names_array",
    "lowest_routine_number", "highest_routine_number", "routines_array",
    "routine_names_array", "routine_flags_array",
    "lowest_global_number", "highest_global_number", "globals_array",
    "global_names_array", "global_flags_array",
    "lowest_array_number", "highest_array_number", "arrays_array",
    "array_names_array", "array_flags_array",
    "lowest_constant_number", "highest_constant_number", "constants_array",
    "constant_names_array",
    "lowest_class_number", "highest_class_number", "class_objects_array",
    "lowest_object_number", "highest_object_number",
    "oddeven_packing",
    "grammar_table", "dictionary_table", "dynam_string_table",
    "" },
    SYSTEM_CONSTANT_TT, FALSE, TRUE
};

keyword_group *keyword_groups[12]
= { NULL, &opcode_names, &directives, &trace_keywords, &segment_markers,
    &directive_keywords, &misc_keywords, &statements, &conditions,
    &system_functions, &system_constants, &opcode_macros};

keyword_group local_variables =
{ { "" },                                 /* Filled in when routine declared */
    LOCAL_VARIABLE_TT, FALSE, FALSE
};

static int lexical_context(void)
{
    /*  The lexical context is a number representing all of the context
        information in the lexical analyser: the same input text will
        always translate to the same output tokens whenever the context
        is the same.

        In fact, for efficiency reasons this number omits the bit of
        information held in the variable "dont_enter_into_symbol_table".
        Inform never needs to backtrack through tokens parsed in that
        way (thankfully, as it would be expensive indeed to check
        the tokens).                                                         */

    int c = 0;
    if (opcode_names.enabled)         c |= 1;
    if (directives.enabled)           c |= 2;
    if (trace_keywords.enabled)       c |= 4;
    if (segment_markers.enabled)      c |= 8;
    if (directive_keywords.enabled)   c |= 16;
    if (misc_keywords.enabled)        c |= 32;
    if (statements.enabled)           c |= 64;
    if (conditions.enabled)           c |= 128;
    if (system_functions.enabled)     c |= 256;
    if (system_constants.enabled)     c |= 512;
    if (local_variables.enabled)      c |= 1024;

    if (return_sp_as_variable)        c |= 2048;
    return(c);
}

static void print_context(int c)
{
    if ((c & 1) != 0) printf("OPC ");
    if ((c & 2) != 0) printf("DIR ");
    if ((c & 4) != 0) printf("TK ");
    if ((c & 8) != 0) printf("SEG ");
    if ((c & 16) != 0) printf("DK ");
    if ((c & 32) != 0) printf("MK ");
    if ((c & 64) != 0) printf("STA ");
    if ((c & 128) != 0) printf("CND ");
    if ((c & 256) != 0) printf("SFUN ");
    if ((c & 512) != 0) printf("SCON ");
    if ((c & 1024) != 0) printf("LV ");
    if ((c & 2048) != 0) printf("sp ");
}

static int *keywords_hash_table;
static int *keywords_hash_ends_table;
static int *keywords_data_table;

static int *local_variable_hash_table;
static int *local_variable_hash_codes;
char **local_variable_texts;
static char *local_variable_text_table;

static char one_letter_locals[128];

static void make_keywords_tables(void)
{   int i, j, h, tp=0;
    char **oplist, **maclist;

    if (!glulx_mode) {
        oplist = opcode_list_z;
        maclist = opmacro_list_z;
    }
    else {
        oplist = opcode_list_g;
        maclist = opmacro_list_g;
    }

    for (j=0; *(oplist[j]); j++) {
        opcode_names.keywords[j] = oplist[j];
    }
    opcode_names.keywords[j] = "";

    for (j=0; *(maclist[j]); j++) {
        opcode_macros.keywords[j] = maclist[j];
    }
    opcode_macros.keywords[j] = "";

    for (i=0; i<HASH_TAB_SIZE; i++)
    {   keywords_hash_table[i] = -1;
        keywords_hash_ends_table[i] = -1;
    }

    for (i=1; i<=11; i++)
    {   keyword_group *kg = keyword_groups[i];
        for (j=0; *(kg->keywords[j]) != 0; j++)
        {   h = hash_code_from_string(kg->keywords[j]);
            if (keywords_hash_table[h] == -1)
                keywords_hash_table[h] = tp;
            else
              *(keywords_data_table + 3*(keywords_hash_ends_table[h]) + 2) = tp;
            keywords_hash_ends_table[h] = tp;
            *(keywords_data_table + 3*tp) = i;
            *(keywords_data_table + 3*tp+1) = j;
            *(keywords_data_table + 3*tp+2) = -1;
            tp++;
        }
    }
}

extern void construct_local_variable_tables(void)
{   int i, h; char *p = local_variable_text_table;
    for (i=0; i<HASH_TAB_SIZE; i++) local_variable_hash_table[i] = -1;
    for (i=0; i<128; i++) one_letter_locals[i] = MAX_LOCAL_VARIABLES;

    for (i=0; i<no_locals; i++)
    {   char *q = local_variables.keywords[i];
        if (q[1] == 0)
        {   one_letter_locals[(uchar)q[0]] = i;
            if (isupper(q[0])) one_letter_locals[tolower(q[0])] = i;
            if (islower(q[0])) one_letter_locals[toupper(q[0])] = i;
        }
        h = hash_code_from_string(q);
        if (local_variable_hash_table[h] == -1)
            local_variable_hash_table[h] = i;
        local_variable_hash_codes[i] = h;
        local_variable_texts[i] = p;
        strcpy(p, q);
        p += strlen(p)+1;
    }
    for (;i<MAX_LOCAL_VARIABLES-1;i++)
      local_variable_texts[i] = "<no such local variable>";
}

static void interpret_identifier(int pos, int dirs_only_flag)
{   int index, hashcode; char *p = circle[pos].text;

    /*  An identifier is either a keyword or a "symbol", a name which the
        lexical analyser leaves to higher levels of Inform to understand.    */

    hashcode = hash_code_from_string(p);

    if (dirs_only_flag) goto KeywordSearch;

    /*  If this is assembly language, perhaps it is "sp"?                    */

    if (return_sp_as_variable && (p[0]=='s') && (p[1]=='p') && (p[2]==0))
    {   circle[pos].value = 0; circle[pos].type = LOCAL_VARIABLE_TT;
        return;
    }

    /*  Test for local variables first, quite quickly.                       */

    if (local_variables.enabled)
    {   if (p[1]==0)
        {   index = one_letter_locals[(uchar)p[0]];
            if (index<MAX_LOCAL_VARIABLES)
            {   circle[pos].type = LOCAL_VARIABLE_TT;
                circle[pos].value = index+1;
                return;
            }
        }
        index = local_variable_hash_table[hashcode];
        if (index >= 0)
        {   for (;index<no_locals;index++)
            {   if (hashcode == local_variable_hash_codes[index])
                {   if (strcmpcis(p, local_variable_texts[index])==0)
                    {   circle[pos].type = LOCAL_VARIABLE_TT;
                        circle[pos].value = index+1;
                        return;
                    }
                }
            }
        }
    }

    /*  Now the bulk of the keywords.  Note that the lexer doesn't recognise
        the name of a system function which has been Replaced.               */

    KeywordSearch:
    index = keywords_hash_table[hashcode];
    while (index >= 0)
    {   int *i = keywords_data_table + 3*index;
        keyword_group *kg = keyword_groups[*i];
        if (((!dirs_only_flag) && (kg->enabled))
            || (dirs_only_flag && (kg == &directives)))
        {   char *q = kg->keywords[*(i+1)];
            if (((kg->case_sensitive) && (strcmp(p, q)==0))
                || ((!(kg->case_sensitive)) && (strcmpcis(p, q)==0)))
            {   if ((kg != &system_functions)
                    || (system_function_usage[*(i+1)]!=2))
                {   circle[pos].type = kg->change_token_type;
                    circle[pos].value = *(i+1);
                    return;
                }
            }
        }
        index = *(i+2);
    }

    if (dirs_only_flag) return;

    /*  Search for the name; create it if necessary.                         */

    circle[pos].value = symbol_index(p, hashcode);
    circle[pos].type = SYMBOL_TT;
}


/* ------------------------------------------------------------------------- */
/*   The tokeniser grid aids a rapid decision about the consequences of a    */
/*   character reached in the buffer.  In effect it is an efficiently stored */
/*   transition table using an algorithm similar to that of S. C. Johnson's  */
/*   "yacc" lexical analyser (see Aho, Sethi and Ullman, section 3.9).       */
/*   My thanks to Dilip Sequeira for suggesting this.                        */
/*                                                                           */
/*       tokeniser_grid[c]   is (16*n + m) if c is the first character of    */
/*                               separator numbers n, n+1, ..., n+m-1        */
/*                           or certain special values (QUOTE_CODE, etc)     */
/*                           or 0 otherwise                                  */
/*                                                                           */
/*   Since 1000/16 = 62, the code numbers below will need increasing if the  */
/*   number of separators supported exceeds 61.                              */
/* ------------------------------------------------------------------------- */

static int tokeniser_grid[256];

#define QUOTE_CODE      1000
#define DQUOTE_CODE     1001
#define NULL_CODE       1002
#define SPACE_CODE      1003
#define NEGATIVE_CODE   1004
#define DIGIT_CODE      1005
#define RADIX_CODE      1006
#define KEYWORD_CODE    1007
#define EOF_CODE        1008
#define WHITESPACE_CODE 1009
#define COMMENT_CODE    1010
#define IDENTIFIER_CODE 1011

/*  This list cannot safely be changed without also changing the header
    separator #defines.  The ordering is significant in that (i) all entries
    beginning with the same character must be adjacent and (ii) that if
    X is a an initial substring of Y then X must come before Y.

    E.g. --> must occur before -- to prevent "-->0" being tokenised
    wrongly as "--", ">", "0" rather than "-->", "0".                        */

static const char separators[NUMBER_SEPARATORS][4] =
{   "->", "-->", "--", "-", "++", "+", "*", "/", "%",
    "||", "|", "&&", "&", "~~",
    "~=", "~", "==", "=", ">=", ">",
    "<=", "<", "(", ")", ",",
    ".&", ".#", "..&", "..#", "..", ".",
    "::", ":", "@", ";", "[", "]", "{", "}",
    "$", "?~", "?",
    "#a$", "#g$", "#n$", "#r$", "#w$", "##", "#"
};

static void make_tokeniser_grid(void)
{
    /*  Construct the grid to the specification above.                       */

    int i, j;

    for (i=0; i<256; i++) tokeniser_grid[i]=0;

    for (i=0; i<NUMBER_SEPARATORS; i++)
    {   j=separators[i][0];
        if (tokeniser_grid[j]==0)
            tokeniser_grid[j]=i*16+1; else tokeniser_grid[j]++;
    }
    tokeniser_grid['\''] = QUOTE_CODE;
    tokeniser_grid['\"'] = DQUOTE_CODE;
    tokeniser_grid[0]    = EOF_CODE;
    tokeniser_grid[' ']  = WHITESPACE_CODE;
    tokeniser_grid['\n'] = WHITESPACE_CODE;
    tokeniser_grid['$']  = RADIX_CODE;
    tokeniser_grid['!']  = COMMENT_CODE;

    tokeniser_grid['0']  = DIGIT_CODE;
    tokeniser_grid['1']  = DIGIT_CODE;
    tokeniser_grid['2']  = DIGIT_CODE;
    tokeniser_grid['3']  = DIGIT_CODE;
    tokeniser_grid['4']  = DIGIT_CODE;
    tokeniser_grid['5']  = DIGIT_CODE;
    tokeniser_grid['6']  = DIGIT_CODE;
    tokeniser_grid['7']  = DIGIT_CODE;
    tokeniser_grid['8']  = DIGIT_CODE;
    tokeniser_grid['9']  = DIGIT_CODE;

    tokeniser_grid['a']  = IDENTIFIER_CODE;
    tokeniser_grid['b']  = IDENTIFIER_CODE;
    tokeniser_grid['c']  = IDENTIFIER_CODE;
    tokeniser_grid['d']  = IDENTIFIER_CODE;
    tokeniser_grid['e']  = IDENTIFIER_CODE;
    tokeniser_grid['f']  = IDENTIFIER_CODE;
    tokeniser_grid['g']  = IDENTIFIER_CODE;
    tokeniser_grid['h']  = IDENTIFIER_CODE;
    tokeniser_grid['i']  = IDENTIFIER_CODE;
    tokeniser_grid['j']  = IDENTIFIER_CODE;
    tokeniser_grid['k']  = IDENTIFIER_CODE;
    tokeniser_grid['l']  = IDENTIFIER_CODE;
    tokeniser_grid['m']  = IDENTIFIER_CODE;
    tokeniser_grid['n']  = IDENTIFIER_CODE;
    tokeniser_grid['o']  = IDENTIFIER_CODE;
    tokeniser_grid['p']  = IDENTIFIER_CODE;
    tokeniser_grid['q']  = IDENTIFIER_CODE;
    tokeniser_grid['r']  = IDENTIFIER_CODE;
    tokeniser_grid['s']  = IDENTIFIER_CODE;
    tokeniser_grid['t']  = IDENTIFIER_CODE;
    tokeniser_grid['u']  = IDENTIFIER_CODE;
    tokeniser_grid['v']  = IDENTIFIER_CODE;
    tokeniser_grid['w']  = IDENTIFIER_CODE;
    tokeniser_grid['x']  = IDENTIFIER_CODE;
    tokeniser_grid['y']  = IDENTIFIER_CODE;
    tokeniser_grid['z']  = IDENTIFIER_CODE;

    tokeniser_grid['A']  = IDENTIFIER_CODE;
    tokeniser_grid['B']  = IDENTIFIER_CODE;
    tokeniser_grid['C']  = IDENTIFIER_CODE;
    tokeniser_grid['D']  = IDENTIFIER_CODE;
    tokeniser_grid['E']  = IDENTIFIER_CODE;
    tokeniser_grid['F']  = IDENTIFIER_CODE;
    tokeniser_grid['G']  = IDENTIFIER_CODE;
    tokeniser_grid['H']  = IDENTIFIER_CODE;
    tokeniser_grid['I']  = IDENTIFIER_CODE;
    tokeniser_grid['J']  = IDENTIFIER_CODE;
    tokeniser_grid['K']  = IDENTIFIER_CODE;
    tokeniser_grid['L']  = IDENTIFIER_CODE;
    tokeniser_grid['M']  = IDENTIFIER_CODE;
    tokeniser_grid['N']  = IDENTIFIER_CODE;
    tokeniser_grid['O']  = IDENTIFIER_CODE;
    tokeniser_grid['P']  = IDENTIFIER_CODE;
    tokeniser_grid['Q']  = IDENTIFIER_CODE;
    tokeniser_grid['R']  = IDENTIFIER_CODE;
    tokeniser_grid['S']  = IDENTIFIER_CODE;
    tokeniser_grid['T']  = IDENTIFIER_CODE;
    tokeniser_grid['U']  = IDENTIFIER_CODE;
    tokeniser_grid['V']  = IDENTIFIER_CODE;
    tokeniser_grid['W']  = IDENTIFIER_CODE;
    tokeniser_grid['X']  = IDENTIFIER_CODE;
    tokeniser_grid['Y']  = IDENTIFIER_CODE;
    tokeniser_grid['Z']  = IDENTIFIER_CODE;

    tokeniser_grid['_']  = IDENTIFIER_CODE;
}

/* ------------------------------------------------------------------------- */
/*   Definition of a lexical block: a source file or a string containing     */
/*   text for lexical analysis; an independent source from the point of      */
/*   view of issuing error reports.                                          */
/* ------------------------------------------------------------------------- */

typedef struct LexicalBlock_s
{   char *filename;                              /*  Full translated name    */
    int   main_flag;                             /*  TRUE if the main file
                                                     (the first one opened)  */
    int   sys_flag;                              /*  TRUE if a System_File   */
    int   source_line;                           /*  Line number count       */
    int   line_start;                            /*  Char number within file
                                                     where the current line
                                                     starts                  */
    int   chars_read;                            /*  Char number of read pos */
    int   file_no;                               /*  Or 255 if not from a
                                                     file; used for debug
                                                     information             */
    char *orig_source;                           /* From #origsource direct  */
    int orig_file;
    int32 orig_line;
    int32 orig_char;
} LexicalBlock;

static LexicalBlock NoFileOpen =
    {   "<before compilation>",  FALSE, FALSE, 0, 0, 0, 255, NULL, 0, 0, 0 };

static LexicalBlock MakingOutput =
    {   "<constructing output>", FALSE, FALSE, 0, 0, 0, 255, NULL, 0, 0, 0 };

static LexicalBlock StringLB =
    {   "<veneer routine>",      FALSE, TRUE,  0, 0, 0, 255, NULL, 0, 0, 0 };

static LexicalBlock *CurrentLB;                  /*  The current lexical
                                                     block of input text     */

extern void declare_systemfile(void)
{   CurrentLB->sys_flag = TRUE;
}

extern int is_systemfile(void)
{   return ((CurrentLB->sys_flag)?1:0);
}

extern void set_origsource_location(char *source, int32 line, int32 charnum)
{
    if (!source) {
        /* Clear the Origsource declaration. */
        CurrentLB->orig_file = 0;
        CurrentLB->orig_source = NULL;
        CurrentLB->orig_line = 0;
        CurrentLB->orig_char = 0;
        return;
    }

    /* Get the file number for a new or existing InputFiles entry. */
    int file_no = register_orig_sourcefile(source);

    CurrentLB->orig_file = file_no;
    CurrentLB->orig_source = InputFiles[file_no-1].filename;
    CurrentLB->orig_line = line;
    CurrentLB->orig_char = charnum;
}

/* Error locations. */

extern debug_location get_current_debug_location(void)
{   debug_location result;
    /* Assume that all input characters are one byte. */
    result.file_index = CurrentLB->file_no;
    result.beginning_byte_index = CurrentLB->chars_read - LOOKAHEAD_SIZE;
    result.end_byte_index = result.beginning_byte_index;
    result.beginning_line_number = CurrentLB->source_line;
    result.end_line_number = result.beginning_line_number;
    result.beginning_character_number =
        CurrentLB->chars_read - CurrentLB->line_start;
    result.end_character_number = result.beginning_character_number;
    result.orig_file_index = CurrentLB->orig_file;
    result.orig_beg_line_number = CurrentLB->orig_line;
    result.orig_beg_char_number = CurrentLB->orig_char;
    return result;
}

static debug_location ErrorReport_debug_location;

extern void report_errors_at_current_line(void)
{   ErrorReport.line_number = CurrentLB->source_line;
    ErrorReport.file_number = CurrentLB->file_no;
    if (ErrorReport.file_number == 255)
        ErrorReport.file_number = -1;
    ErrorReport.source      = CurrentLB->filename;
    ErrorReport.main_flag   = CurrentLB->main_flag;
    if (debugfile_switch)
        ErrorReport_debug_location = get_current_debug_location();
    ErrorReport.orig_file = CurrentLB->orig_file;
    ErrorReport.orig_source = CurrentLB->orig_source;
    ErrorReport.orig_line = CurrentLB->orig_line;
    ErrorReport.orig_char = CurrentLB->orig_char;
}

extern debug_location get_error_report_debug_location(void)
{   return ErrorReport_debug_location;
}

extern int32 get_current_line_start(void)
{   return CurrentLB->line_start;
}

brief_location blank_brief_location;

extern brief_location get_brief_location(ErrorPosition *errpos)
{
    brief_location loc;
    loc.file_index = errpos->file_number;
    loc.line_number = errpos->line_number;
    loc.orig_file_index = errpos->orig_file;
    loc.orig_line_number = errpos->orig_line;
    return loc;
}

extern void export_brief_location(brief_location loc, ErrorPosition *errpos)
{
    if (loc.file_index != -1)
    {   errpos->file_number = loc.file_index;
        errpos->line_number = loc.line_number;
        errpos->main_flag = (errpos->file_number == 1);
        errpos->orig_source = NULL;
        errpos->orig_file = loc.orig_file_index;
        errpos->orig_line = loc.orig_line_number;
        errpos->orig_char = 0;
    }
}

/* ------------------------------------------------------------------------- */
/*   Hash printing and line counting                                         */
/* ------------------------------------------------------------------------- */

static void print_hash(void)
{
    /*  Hash-printing is the practice of printing a # character every 100
        lines of source code (the -x switch), reassuring the user that
        progress is being made                                               */

    if (no_hash_printed_yet)
    {   printf("::"); no_hash_printed_yet = FALSE;
    }
    printf("#"); hash_printed_since_newline = TRUE;

#ifndef MAC_FACE
    /*  On some systems, text output is buffered to a line at a time, and
        this would frustrate the point of hash-printing, so:                 */

    fflush(stdout);
#endif
}

static void reached_new_line(void)
{
    /*  Called to signal that a new line has been reached in the source code */

    forerrors_pointer = 0;

    CurrentLB->source_line++;
    CurrentLB->line_start = CurrentLB->chars_read;

    total_source_line_count++;

    if (total_source_line_count%100==0)
    {   if (hash_switch) print_hash();
#ifdef MAC_MPW
        SpinCursor(32);                    /* I.e., allow other tasks to run */
#endif
    }

#ifdef MAC_FACE
    if (total_source_line_count%((**g_pm_hndl).linespercheck) == 0)
    {   ProcessEvents (&g_proc);
        if (g_proc != true)
        {   free_arrays();
            close_all_source();
            if (temporary_files_switch)
                remove_temp_files();
            if (store_the_text)
                my_free(&all_text,"transcription text");
            abort_transcript_file();
            longjmp (g_fallback, 1);
        }
    }
#endif
}

static void new_syntax_line(void)
{   if (source_to_analyse != NULL) forerrors_pointer = 0;
    report_errors_at_current_line();
}

/* Return 10 raised to the expo power.
 *
 * I'm avoiding the standard pow() function for a rather lame reason:
 * it's in the libmath (-lm) library, and I don't want to change the
 * build model for the compiler. So, this is implemented with a stupid
 * lookup table. It's faster than pow() for small values of expo.
 * Probably not as fast if expo is 200, but "$+1e200" is an overflow
 * anyway, so I don't expect that to be a problem.
 *
 * (For some reason, frexp() and ldexp(), which are used later on, do
 * not require libmath to be linked in.)
 */
static double pow10_cheap(int expo)
{
    #define POW10_RANGE (8)
    static double powers[POW10_RANGE*2+1] = {
        0.00000001, 0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1,
        1.0,
        10.0, 100.0, 1000.0, 10000.0, 100000.0, 1000000.0, 10000000.0, 100000000.0
    };

    double res = 1.0;

    if (expo < 0) {
        for (; expo < -POW10_RANGE; expo += POW10_RANGE) {
            res *= powers[0];
        }
        return res * powers[POW10_RANGE+expo];
    }
    else {
        for (; expo > POW10_RANGE; expo -= POW10_RANGE) {
            res *= powers[POW10_RANGE*2];
        }
        return res * powers[POW10_RANGE+expo];
    }
}

/* Return the IEEE-754 single-precision encoding of a floating-point
 * number. See http://www.psc.edu/general/software/packages/ieee/ieee.php
 * for an explanation.
 *
 * The number is provided in the pieces it was parsed in:
 *    [+|-] intv "." fracv "e" [+|-]expo
 *
 * If the magnitude is too large (beyond about 3.4e+38), this returns
 * an infinite value (0x7f800000 or 0xff800000). If the magnitude is too
 * small (below about 1e-45), this returns a zero value (0x00000000 or
 * 0x80000000). If any of the inputs are NaN, this returns NaN (but the
 * lexer should never do that).
 *
 * Note that using a float constant does *not* set the uses_float_features
 * flag (which would cause the game file to be labelled 3.1.2). There's
 * no VM feature here, just an integer. Of course, any use of the float
 * *opcodes* will set the flag.
 *
 * The math functions in this routine require #including <math.h>, but
 * they should not require linking the math library (-lm). At least,
 * they do not on OSX and Linux.
 */
static int32 construct_float(int signbit, double intv, double fracv, int expo)
{
    double absval = (intv + fracv) * pow10_cheap(expo);
    int32 sign = (signbit ? 0x80000000 : 0x0);
    double mant;
    int32 fbits;

    if (isinf(absval)) {
        return sign | 0x7f800000; /* infinity */
    }
    if (isnan(absval)) {
        return sign | 0x7fc00000;
    }

    mant = frexp(absval, &expo);

    /* Normalize mantissa to be in the range [1.0, 2.0) */
    if (0.5 <= mant && mant < 1.0) {
        mant *= 2.0;
        expo--;
    }
    else if (mant == 0.0) {
        expo = 0;
    }
    else {
        return sign | 0x7f800000; /* infinity */
    }

    if (expo >= 128) {
        return sign | 0x7f800000; /* infinity */
    }
    else if (expo < -126) {
        /* Denormalized (very small) number */
        mant = ldexp(mant, 126 + expo);
        expo = 0;
    }
    else if (!(expo == 0 && mant == 0.0)) {
        expo += 127;
        mant -= 1.0; /* Get rid of leading 1 */
    }

    mant *= 8388608.0; /* 2^23 */
    fbits = (int32)(mant + 0.5); /* round mant to nearest int */
    if (fbits >> 23) {
        /* The carry propagated out of a string of 23 1 bits. */
        fbits = 0;
        expo++;
        if (expo >= 255) {
            return sign | 0x7f800000; /* infinity */
        }
    }

    return (sign) | ((int32)(expo << 23)) | (fbits);
}

/* ------------------------------------------------------------------------- */
/*   Characters are read via a "pipeline" of variables, allowing us to look  */
/*       up to three characters ahead of the current position.               */
/*                                                                           */
/*   There are two possible sources: from the source files being loaded in,  */
/*   and from a string inside Inform (which is where the code for veneer     */
/*   routines comes from).  Each source has its own get-next-character       */
/*   routine.                                                                */
/* ------------------------------------------------------------------------- */
/*   Source 1: from files                                                    */
/*                                                                           */
/*   Note that file_load_chars(p, size) loads "size" bytes into buffer "p"   */
/*   from the current input file.  If the file runs out, then if it was      */
/*   the last source file 4 EOF characters are placed in the buffer: if it   */
/*   was only an Include file ending, then a '\n' character is placed there  */
/*   (essentially to force termination of any comment line) followed by      */
/*   three harmless spaces.                                                  */
/*                                                                           */
/*   The routine returns the number of characters it has written, and note   */
/*   that this conveniently ensures that all characters in the buffer come   */
/*   from the same file.                                                     */
/* ------------------------------------------------------------------------- */

#define SOURCE_BUFFER_SIZE 4096                  /*  Typical disc block size */

typedef struct Sourcefile_s
{   char *buffer;                                /*  Input buffer            */
    int   read_pos;                              /*  Read position in buffer */
    int   size;                                  /*  Number of meaningful
                                                     characters in buffer    */
    int   la, la2, la3;                          /*  Three characters of
                                                     lookahead pipeline      */
    int   file_no;                               /*  Internal file number
                                                     (1, 2, 3, ...)          */
    LexicalBlock LB;
} Sourcefile;

static Sourcefile *FileStack;
static int File_sp;                              /*  Stack pointer           */

static Sourcefile *CF;                           /*  Top entry on stack      */

static int last_input_file;

static void begin_buffering_file(int i, int file_no)
{   int j, cnt; uchar *p;

    if (i >= MAX_INCLUSION_DEPTH)
       memoryerror("MAX_INCLUSION_DEPTH",MAX_INCLUSION_DEPTH);

    p = (uchar *) FileStack[i].buffer;

    if (i>0)
    {   FileStack[i-1].la  = lookahead;
        FileStack[i-1].la2 = lookahead2;
        FileStack[i-1].la3 = lookahead3;
    }

    FileStack[i].file_no = file_no;
    FileStack[i].size = file_load_chars(file_no,
        (char *) p, SOURCE_BUFFER_SIZE);
    lookahead  = source_to_iso_grid[p[0]];
    lookahead2 = source_to_iso_grid[p[1]];
    lookahead3 = source_to_iso_grid[p[2]];
    if (LOOKAHEAD_SIZE != 3)
        compiler_error
            ("Lexer lookahead size does not match hard-coded lookahead code");
    FileStack[i].read_pos = LOOKAHEAD_SIZE;

    if (file_no==1) FileStack[i].LB.main_flag = TRUE;
               else FileStack[i].LB.main_flag = FALSE;
    FileStack[i].LB.sys_flag = FALSE;
    FileStack[i].LB.source_line = 1;
    FileStack[i].LB.line_start = LOOKAHEAD_SIZE;
    FileStack[i].LB.chars_read = LOOKAHEAD_SIZE;
    FileStack[i].LB.filename = InputFiles[file_no-1].filename;
    FileStack[i].LB.file_no = file_no;
    FileStack[i].LB.orig_source = NULL; FileStack[i].LB.orig_file = 0;
    FileStack[i].LB.orig_line = 0; FileStack[i].LB.orig_char = 0;

    CurrentLB = &(FileStack[i].LB);
    CF = &(FileStack[i]);

    /* Check for recursive inclusion */
    cnt = 0;
    for (j=0; j<i; j++)
    {   if (!strcmp(FileStack[i].LB.filename, FileStack[j].LB.filename))
            cnt++;
    }
    if (cnt==1)
        warning_named("File included more than once",
            FileStack[j].LB.filename);
}

static void create_char_pipeline(void)
{
    File_sp = 0;
    begin_buffering_file(File_sp++, 1);
    pipeline_made = TRUE;
    last_input_file = current_input_file;
}

static int get_next_char_from_pipeline(void)
{   uchar *p;

    while (last_input_file < current_input_file)
    {
        /*  An "Include" file must have opened since the last character
            was read. Perhaps more than one. We run forward through the
            list and add them to the include stack. But we ignore
            "Origsource" files (which were never actually opened for
            reading). */

        last_input_file++;
        if (!InputFiles[last_input_file-1].is_input)
            continue;

        begin_buffering_file(File_sp++, last_input_file);
    }
    if (last_input_file != current_input_file)
        compiler_error("last_input_file did not match after Include");

    if (File_sp == 0)
    {   lookahead  = 0; lookahead2 = 0; lookahead3 = 0; return 0;
    }

    if (CF->read_pos == CF->size)
    {   CF->size =
            file_load_chars(CF->file_no, CF->buffer, SOURCE_BUFFER_SIZE);
        CF->read_pos = 0;
    }
    else
    if (CF->read_pos == -(CF->size))
    {   set_token_location(get_current_debug_location());
        File_sp--;
        if (File_sp == 0)
        {   lookahead  = 0; lookahead2 = 0; lookahead3 = 0; return 0;
        }
        CF = &(FileStack[File_sp-1]);
        CurrentLB = &(FileStack[File_sp-1].LB);
        lookahead  = CF->la; lookahead2 = CF->la2; lookahead3 = CF->la3;
        if (CF->read_pos == CF->size)
        {   CF->size =
                file_load_chars(CF->file_no, CF->buffer, SOURCE_BUFFER_SIZE);
            CF->read_pos = 0;
        }
        set_token_location(get_current_debug_location());
    }

    p = (uchar *) (CF->buffer);

    current = lookahead;
    lookahead = lookahead2;
    lookahead2 = lookahead3;
    lookahead3 = source_to_iso_grid[p[CF->read_pos++]];

    CurrentLB->chars_read++;
    if (forerrors_pointer < 511)
        forerrors_buff[forerrors_pointer++] = current;
    if (current == '\n') reached_new_line();
    return(current);
}

/* ------------------------------------------------------------------------- */
/*   Source 2: from a string                                                 */
/* ------------------------------------------------------------------------- */

static int source_to_analyse_pointer;            /*  Current read position   */

static int get_next_char_from_string(void)
{   uchar *p = (uchar *) source_to_analyse + source_to_analyse_pointer++;
    current = source_to_iso_grid[p[0]];

    if (current == 0)    lookahead  = 0;
                    else lookahead  = source_to_iso_grid[p[1]];
    if (lookahead == 0)  lookahead2 = 0;
                    else lookahead2 = source_to_iso_grid[p[2]];
    if (lookahead2 == 0) lookahead3 = 0;
                    else lookahead3 = source_to_iso_grid[p[3]];

    CurrentLB->chars_read++;
    if (forerrors_pointer < 511)
        forerrors_buff[forerrors_pointer++] = current;
    if (current == '\n') reached_new_line();
    return(current);
}

/* ========================================================================= */
/*   The interface between the lexer and Inform's higher levels:             */
/*                                                                           */
/*       put_token_back()            (effectively) move the read position    */
/*                                       back by one token                   */
/*                                                                           */
/*       get_next_token()            copy the token at the current read      */
/*                                       position into the triple            */
/*                                   (token_type, token_value, token_text)   */
/*                                       and move the read position forward  */
/*                                       by one                              */
/*                                                                           */
/*       restart_lexer(source, name) if source is NULL, initialise the lexer */
/*                                       to read from source files;          */
/*                                   otherwise, to read from this string.    */
/* ------------------------------------------------------------------------- */

extern void put_token_back(void)
{   tokens_put_back++;

    if (tokens_trace_level > 0)
    {   if (tokens_trace_level == 1) printf("<- ");
        else printf("<-\n");
    }

    /*  The following error, of course, should never happen!                 */

    if (tokens_put_back == CIRCLE_SIZE)
    {   compiler_error("The lexical analyser has collapsed because of a wrong \
assumption inside Inform");
        tokens_put_back--;
        return;
    }
}

extern void get_next_token(void)
{   int d, i, j, k, quoted_size, e, radix, context; int32 n; char *r;
    int returning_a_put_back_token = TRUE;

    context = lexical_context();

    if (tokens_put_back > 0)
    {   i = circle_position - tokens_put_back + 1;
        if (i<0) i += CIRCLE_SIZE;
        tokens_put_back--;
        if (context != token_contexts[i])
        {   j = circle[i].type;
            if ((j==0) || ((j>=100) && (j<200)))
                interpret_identifier(i, FALSE);
        }
        goto ReturnBack;
    }
    returning_a_put_back_token = FALSE;

    if (circle_position == CIRCLE_SIZE-1) circle_position = 0;
    else circle_position++;

    if (lex_p > lexeme_memory + 4*MAX_QTEXT_SIZE)
        lex_p = lexeme_memory;

    circle[circle_position].text = lex_p;
    circle[circle_position].value = 0;
    *lex_p = 0;

    StartTokenAgain:
    d = (*get_next_char)();
    e = tokeniser_grid[d];

    if (next_token_begins_syntax_line)
    {   if ((e != WHITESPACE_CODE) && (e != COMMENT_CODE))
        {   new_syntax_line();
            next_token_begins_syntax_line = FALSE;
        }
    }

    circle[circle_position].location = get_current_debug_location();

    switch(e)
    {   case 0: char_error("Illegal character found in source:", d);
            goto StartTokenAgain;

        case WHITESPACE_CODE:
            while (tokeniser_grid[lookahead] == WHITESPACE_CODE)
                (*get_next_char)();
            goto StartTokenAgain;

        case COMMENT_CODE:
            while ((lookahead != '\n') && (lookahead != 0))
                (*get_next_char)();
            goto StartTokenAgain;

        case EOF_CODE:
            circle[circle_position].type = EOF_TT;
            strcpy(lex_p, "<end of file>");
            lex_p += strlen(lex_p) + 1;
            break;

        case DIGIT_CODE:
            radix = 10;
            ReturnNumber:
            n=0;
            do
            {   n = n*radix + character_digit_value[d];
                *lex_p++ = d;
            } while ((character_digit_value[lookahead] < radix)
                     && (d = (*get_next_char)(), TRUE));

            *lex_p++ = 0;
            circle[circle_position].type = NUMBER_TT;
            circle[circle_position].value = n;
            break;

            FloatNumber:
            {   int expo=0; double intv=0, fracv=0;
                int expocount=0, intcount=0, fraccount=0;
                int signbit = (d == '-');
                *lex_p++ = d;
                while (character_digit_value[lookahead] < 10) {
                    intv = 10.0*intv + character_digit_value[lookahead];
                    intcount++;
                    *lex_p++ = lookahead;
                    (*get_next_char)();
                }
                if (lookahead == '.') {
                    double fracpow = 1.0;
                    *lex_p++ = lookahead;
                    (*get_next_char)();
                    while (character_digit_value[lookahead] < 10) {
                        fracpow *= 0.1;
                        fracv = fracv + fracpow*character_digit_value[lookahead];
                        fraccount++;
                        *lex_p++ = lookahead;
                        (*get_next_char)();
                    }
                }
                if (lookahead == 'e' || lookahead == 'E') {
                    int exposign = 0;
                    *lex_p++ = lookahead;
                    (*get_next_char)();
                    if (lookahead == '+' || lookahead == '-') {
                        exposign = (lookahead == '-');
                        *lex_p++ = lookahead;
                        (*get_next_char)();
                    }
                    while (character_digit_value[lookahead] < 10) {
                        expo = 10*expo + character_digit_value[lookahead];
                        expocount++;
                        *lex_p++ = lookahead;
                        (*get_next_char)();
                    }
                    if (expocount == 0)
                        error("Floating-point literal must have digits after the 'e'");
                    if (exposign) { expo = -expo; }
                }
                if (intcount + fraccount == 0)
                    error("Floating-point literal must have digits");
                n = construct_float(signbit, intv, fracv, expo);
            }
            *lex_p++ = 0;
            circle[circle_position].type = NUMBER_TT;
            circle[circle_position].value = n;
            if (!glulx_mode && dont_enter_into_symbol_table != -2) error("Floating-point literals are not available in Z-code");
            break;

        case RADIX_CODE:
            radix = 16; d = (*get_next_char)();
            if (d == '-' || d == '+') { goto FloatNumber; }
            if (d == '$') { d = (*get_next_char)(); radix = 2; }
            if (character_digit_value[d] >= radix)
            {   if (radix == 2)
                    error("Binary number expected after '$$'");
                else
                    error("Hexadecimal number expected after '$'");
            }
            goto ReturnNumber;

        case QUOTE_CODE:     /* Single-quotes: scan a literal string */
            quoted_size=0;
            do
            {   e = d; d = (*get_next_char)(); *lex_p++ = d;
                if (quoted_size++==64)
                {   error(
                    "Too much text for one pair of quotations '...' to hold");
                    *lex_p='\''; break;
                }
                if ((d == '\'') && (e != '@'))
                {   if (quoted_size == 1)
                    {   d = (*get_next_char)(); *lex_p++ = d;
                        if (d != '\'')
                            error("No text between quotation marks ''");
                    }
                    break;
                }
            } while (d != EOF);
            if (d==EOF) ebf_error("'\''", "end of file");
            *(lex_p-1) = 0;
            circle[circle_position].type = SQ_TT;
            break;

        case DQUOTE_CODE:    /* Double-quotes: scan a literal string */
            quoted_size=0;
            do
            {   d = (*get_next_char)(); *lex_p++ = d;
                if (quoted_size++==MAX_QTEXT_SIZE)
                {   memoryerror("MAX_QTEXT_SIZE", MAX_QTEXT_SIZE);
                    break;
                }
                if (d == '\n')
                {   lex_p--;
                    while (*(lex_p-1) == ' ') lex_p--;
                    if (*(lex_p-1) != '^') *lex_p++ = ' ';
                    while ((lookahead != EOF) &&
                          (tokeniser_grid[lookahead] == WHITESPACE_CODE))
                    (*get_next_char)();
                }
                else if (d == '\\')
                {   int newline_passed = FALSE;
                    lex_p--;
                    while ((lookahead != EOF) &&
                          (tokeniser_grid[lookahead] == WHITESPACE_CODE))
                        if ((d = (*get_next_char)()) == '\n')
                            newline_passed = TRUE;
                    if (!newline_passed)
                    {   char chb[4];
                        chb[0] = '\"'; chb[1] = lookahead;
                        chb[2] = '\"'; chb[3] = 0;
                        ebf_error("empty rest of line after '\\' in string",
                            chb);
                    }
                }
            }   while ((d != EOF) && (d!='\"'));
            if (d==EOF) ebf_error("'\"'", "end of file");
            *(lex_p-1) = 0;
            circle[circle_position].type = DQ_TT;
            break;

        case IDENTIFIER_CODE:    /* Letter or underscore: an identifier */

            *lex_p++ = d; n=1;
            while ((n<=MAX_IDENTIFIER_LENGTH)
                   && ((tokeniser_grid[lookahead] == IDENTIFIER_CODE)
                   || (tokeniser_grid[lookahead] == DIGIT_CODE)))
                n++, *lex_p++ = (*get_next_char)();

            *lex_p++ = 0;

            if (n > MAX_IDENTIFIER_LENGTH)
            {   char bad_length[100];
                sprintf(bad_length,
                    "Name exceeds the maximum length of %d characters:",
                         MAX_IDENTIFIER_LENGTH);
                error_named(bad_length, circle[circle_position].text);
                /* Trim token so that it doesn't violate
                   MAX_IDENTIFIER_LENGTH during error recovery */
                circle[circle_position].text[MAX_IDENTIFIER_LENGTH] = 0;
            }

            if (dont_enter_into_symbol_table)
            {   circle[circle_position].type = DQ_TT;
                circle[circle_position].value = 0;
                if (dont_enter_into_symbol_table == -2)
                    interpret_identifier(circle_position, TRUE);
                break;
            }

            interpret_identifier(circle_position, FALSE);
            break;

        default:

            /*  The character is initial to at least one of the separators  */

            for (j=e>>4, k=j+(e&0x0f); j<k; j++)
            {   r = (char *) separators[j];
                if (r[1]==0)
                {   *lex_p++=d; *lex_p++=0;
                    goto SeparatorMatched;
                }
                else
                if (r[2]==0)
                {   if (*(r+1) == lookahead)
                    {   *lex_p++=d;
                        *lex_p++=(*get_next_char)();
                        *lex_p++=0;
                        goto SeparatorMatched;
                    }
                }
                else
                {   if ((*(r+1) == lookahead) && (*(r+2) == lookahead2))
                    {   *lex_p++=d;
                        *lex_p++=(*get_next_char)();
                        *lex_p++=(*get_next_char)();
                        *lex_p++=0;
                        goto SeparatorMatched;
                    }
                }
            }

            /*  The following contingency never in fact arises with the
                current set of separators, but might in future  */

            *lex_p++ = d; *lex_p++ = lookahead; *lex_p++ = lookahead2;
            *lex_p++ = 0;
            error_named("Unrecognised combination in source:", lex_p);
            goto StartTokenAgain;

            SeparatorMatched:

            circle[circle_position].type = SEP_TT;
            circle[circle_position].value = j;
            switch(j)
            {   case SEMICOLON_SEP: break;
                case HASHNDOLLAR_SEP:
                case HASHWDOLLAR_SEP:
                    if (tokeniser_grid[lookahead] == WHITESPACE_CODE)
                    {   error_named("Character expected after",
                            circle[circle_position].text);
                        break;
                    }
                    lex_p--;
                    *lex_p++ = (*get_next_char)();
                    while ((tokeniser_grid[lookahead] == IDENTIFIER_CODE)
                           || (tokeniser_grid[lookahead] == DIGIT_CODE))
                        *lex_p++ = (*get_next_char)();
                    *lex_p++ = 0;
                    break;
                case HASHADOLLAR_SEP:
                case HASHGDOLLAR_SEP:
                case HASHRDOLLAR_SEP:
                case HASHHASH_SEP:
                    if (tokeniser_grid[lookahead] != IDENTIFIER_CODE)
                    {   error_named("Alphabetic character expected after",
                            circle[circle_position].text);
                        break;
                    }
                    lex_p--;
                    while ((tokeniser_grid[lookahead] == IDENTIFIER_CODE)
                           || (tokeniser_grid[lookahead] == DIGIT_CODE))
                        *lex_p++ = (*get_next_char)();
                    *lex_p++ = 0;
                    break;
            }
            break;
    }

    i = circle_position;

    ReturnBack:
    token_value = circle[i].value;
    token_type = circle[i].type;
    token_text = circle[i].text;
    if (!returning_a_put_back_token)
    {   set_token_location(circle[i].location);
    }
    token_contexts[i] = context;

    if (tokens_trace_level > 0)
    {   if (tokens_trace_level == 1)
            printf("'%s' ", circle[i].text);
        else
        {   printf("-> "); describe_token(circle[i]);
            printf(" ");
            if (tokens_trace_level > 2) print_context(token_contexts[i]);
            printf("\n");
        }
    }
}

static char veneer_error_title[64];

extern void restart_lexer(char *lexical_source, char *name)
{   int i;
    circle_position = 0;
    for (i=0; i<CIRCLE_SIZE; i++)
    {   circle[i].type = 0;
        circle[i].value = 0;
        circle[i].text = "(if this is ever visible, there is a bug)";
        token_contexts[i] = 0;
    }

    lex_p = lexeme_memory;
    tokens_put_back = 0;
    forerrors_pointer = 0;
    dont_enter_into_symbol_table = FALSE;
    return_sp_as_variable = FALSE;
    next_token_begins_syntax_line = TRUE;

    source_to_analyse = lexical_source;

    if (source_to_analyse == NULL)
    {   get_next_char = get_next_char_from_pipeline;
        if (!pipeline_made) create_char_pipeline();
        forerrors_buff[0] = 0; forerrors_pointer = 0;
    }
    else
    {   get_next_char = get_next_char_from_string;
        source_to_analyse_pointer = 0;
        CurrentLB = &StringLB;
        sprintf(veneer_error_title, "<veneer routine '%s'>", name);
        StringLB.filename = veneer_error_title;

        CurrentLB->source_line = 1;
        CurrentLB->line_start  = 0;
        CurrentLB->chars_read  = 0;
    }
}

/* ========================================================================= */
/*   Data structure management routines                                      */
/* ------------------------------------------------------------------------- */

extern void init_lexer_vars(void)
{
    blank_brief_location.file_index = -1;
    blank_brief_location.line_number = 0;
    blank_brief_location.orig_file_index = 0;
    blank_brief_location.orig_line_number = 0;
}

extern void lexer_begin_prepass(void)
{   total_source_line_count = 0;
    CurrentLB = &NoFileOpen;
    report_errors_at_current_line();
}

extern void lexer_begin_pass(void)
{   no_hash_printed_yet = TRUE;
    hash_printed_since_newline = FALSE;

    pipeline_made = FALSE;

    restart_lexer(NULL, NULL);
}

extern void lexer_endpass(void)
{   CurrentLB = &MakingOutput;
    report_errors_at_current_line();
}

extern void lexer_allocate_arrays(void)
{   int i;

    FileStack = my_malloc(MAX_INCLUSION_DEPTH*sizeof(Sourcefile),
        "filestack buffer");

    for (i=0; i<MAX_INCLUSION_DEPTH; i++)
    FileStack[i].buffer = my_malloc(SOURCE_BUFFER_SIZE+4, "source file buffer");

    lexeme_memory = my_malloc(5*MAX_QTEXT_SIZE, "lexeme memory");

    keywords_hash_table = my_calloc(sizeof(int), HASH_TAB_SIZE,
        "keyword hash table");
    keywords_hash_ends_table = my_calloc(sizeof(int), HASH_TAB_SIZE,
        "keyword hash end table");
    keywords_data_table = my_calloc(sizeof(int), 3*MAX_KEYWORDS,
        "keyword hashing linked list");
    local_variable_hash_table = my_calloc(sizeof(int), HASH_TAB_SIZE,
        "local variable hash table");
    local_variable_text_table = my_malloc(
        (MAX_LOCAL_VARIABLES-1)*(MAX_IDENTIFIER_LENGTH+1),
        "text of local variable names");

    local_variable_hash_codes = my_calloc(sizeof(int), MAX_LOCAL_VARIABLES,
        "local variable hash codes");
    local_variable_texts = my_calloc(sizeof(char *), MAX_LOCAL_VARIABLES,
        "local variable text pointers");

    make_tokeniser_grid();
    make_keywords_tables();

    first_token_locations =
        my_malloc(sizeof(debug_locations), "debug locations of recent tokens");
    first_token_locations->location.file_index = 0;
    first_token_locations->location.beginning_byte_index = 0;
    first_token_locations->location.end_byte_index = 0;
    first_token_locations->location.beginning_line_number = 0;
    first_token_locations->location.end_line_number = 0;
    first_token_locations->location.beginning_character_number = 0;
    first_token_locations->location.end_character_number = 0;
    first_token_locations->location.orig_file_index = 0;
    first_token_locations->location.orig_beg_line_number = 0;
    first_token_locations->location.orig_beg_char_number = 0;
    first_token_locations->next = NULL;
    first_token_locations->reference_count = 0;
    last_token_location = first_token_locations;
}

extern void lexer_free_arrays(void)
{   int i; char *p;

    for (i=0; i<MAX_INCLUSION_DEPTH; i++)
    {   p = FileStack[i].buffer;
        my_free(&p, "source file buffer");
    }
    my_free(&FileStack, "filestack buffer");
    my_free(&lexeme_memory, "lexeme memory");

    my_free(&keywords_hash_table, "keyword hash table");
    my_free(&keywords_hash_ends_table, "keyword hash end table");
    my_free(&keywords_data_table, "keyword hashing linked list");
    my_free(&local_variable_hash_table, "local variable hash table");
    my_free(&local_variable_text_table, "text of local variable names");

    my_free(&local_variable_hash_codes, "local variable hash codes");
    my_free(&local_variable_texts, "local variable text pointers");

    cleanup_token_locations(NULL);
}

/* ========================================================================= */