[LoadPreform::] Loading Preform.

To read in structural definitions of natural language written in the
meta-language Preform.

@h Reading Preform syntax from a file or text.
The parser reads source text against a specific language only, if
|primary_Preform_language| is set; or, if it isn't, from any language.

@default NATURAL_LANGUAGE_WORDS_TYPE void

=
NATURAL_LANGUAGE_WORDS_TYPE *primary_Preform_language = NULL;

int LoadPreform::load(filename *F, NATURAL_LANGUAGE_WORDS_TYPE *L) {
	primary_Preform_language = L;
	wording W = LoadPreform::feed_from_Preform_file(F);
	if (Wordings::empty(W)) return 0;
	return LoadPreform::parse(W, L);
}

@ We simply feed the lines one at a time. Preform is parsed with the same
lexer as is used for Inform itself, but using the following set of characters
as word-breaking punctuation marks:

@d PREFORM_PUNCTUATION_MARKS L"{}[]_^?&\\"

=
wording LoadPreform::feed_from_Preform_file(filename *F) {
	feed_t id = Feeds::begin();
	if (TextFiles::read(F, FALSE, NULL, FALSE,
		LoadPreform::load_helper, NULL, NULL) == FALSE)
		return EMPTY_WORDING;
	return Feeds::end(id);
}

void LoadPreform::load_helper(text_stream *item_name,
	text_file_position *tfp, void *unused_state) {
	WRITE_TO(item_name, "\n");
	Feeds::feed_text_punctuated(item_name, PREFORM_PUNCTUATION_MARKS);
}

@ It is also possible to load additional Preform declarations from source
text in Inform, and when that happens, the following is called:

=
int LoadPreform::parse_text(text_stream *wd) {
	wording W = Feeds::feed_text_punctuated(wd, PREFORM_PUNCTUATION_MARKS);
	return LoadPreform::parse(W, primary_Preform_language);
}
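@ For instance, a tool built on //words// could add an extra declaration at
run-time along the following lines. This is only an illustrative sketch, not
part of the module: the nonterminal <competitor> is invented for the example,
and the function |Example::add_extra_grammar| exists nowhere in the Inform
source.
= (text as InC)
	void Example::add_extra_grammar(void) {
		TEMPORARY_TEXT(defn)
		WRITE_TO(defn, "<competitor> ::= the pacemaker | runner no <cardinal-number>\n");
		int n = LoadPreform::parse_text(defn); /* should report 1 declaration read */
		DISCARD_TEXT(defn)
		LOG("%d Preform declaration(s) read\n", n);
	}
=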
@ Either way, then, all that remains is to write //LoadPreform::parse//. But
before we can get to that, we have to create the...

@h Reserved words in Preform.
The ideal tool with which to parse Preform definitions would be Preform, but
then how would we define the grammar required? So we will have to do this by
hand, and in particular, we have to define Preform's syntactic punctuation
marks explicitly. These are, in effect, the reserved words of the Preform
notational language. (Note the absence of the |==>| marker: that's stripped
out by //inweb// and never reaches the |Syntax.preform| file.)

The bare letters K and L are snuck in here for convenience. They aren't
actually used by anything in //words//, but are used for kind variables in
//kinds//.

=
vocabulary_entry *AMPERSAND_V;
vocabulary_entry *BACKSLASH_V;
vocabulary_entry *CARET_V;
vocabulary_entry *COLONCOLONEQUALS_V;
vocabulary_entry *QUESTIONMARK_V;
vocabulary_entry *QUOTEQUOTE_V;
vocabulary_entry *SIXDOTS_V;
vocabulary_entry *THREEASTERISKS_V;
vocabulary_entry *THREEDOTS_V;
vocabulary_entry *THREEHASHES_V;
vocabulary_entry *UNDERSCORE_V;
vocabulary_entry *language_V;
vocabulary_entry *internal_V;
vocabulary_entry *CAPITAL_K_V;
vocabulary_entry *CAPITAL_L_V;

@ =
void LoadPreform::create_punctuation(void) {
	AMPERSAND_V = Vocabulary::entry_for_text(L"&");
	BACKSLASH_V = Vocabulary::entry_for_text(L"\\");
	CARET_V = Vocabulary::entry_for_text(L"^");
	COLONCOLONEQUALS_V = Vocabulary::entry_for_text(L":" ":=");
	QUESTIONMARK_V = Vocabulary::entry_for_text(L"?");
	QUOTEQUOTE_V = Vocabulary::entry_for_text(L"\"\"");
	SIXDOTS_V = Vocabulary::entry_for_text(L"......");
	THREEASTERISKS_V = Vocabulary::entry_for_text(L"***");
	THREEDOTS_V = Vocabulary::entry_for_text(L"...");
	THREEHASHES_V = Vocabulary::entry_for_text(L"###");
	UNDERSCORE_V = Vocabulary::entry_for_text(L"_");

	language_V = Vocabulary::entry_for_text(L"language");
	internal_V = Vocabulary::entry_for_text(L"internal");

	CAPITAL_K_V = Vocabulary::entry_for_text(L"k");
	CAPITAL_L_V = Vocabulary::entry_for_text(L"l");
}

@h Parsing Preform.
The syntax of the |Syntax.preform| file is, fortunately, very simple. At any
given time, we are parsing definitions for a given natural language |L|: for
example, English. Note that Preform can contain comments in square brackets;
but that the Lexer has already removed any such.

=
int LoadPreform::parse(wording W, NATURAL_LANGUAGE_WORDS_TYPE *L) {
	NATURAL_LANGUAGE_WORDS_TYPE *current_natural_language = L;
	int declarations = 0;
	LOOP_THROUGH_WORDING(wn, W) {
		if (Lexer::word(wn) == PARBREAK_V) continue; /* paragraph breaks between declarations are skipped */
		if ((Wordings::last_wn(W) >= wn+1) && (Lexer::word(wn) == language_V))
			@<Switch to a new definition language@>
		else if ((Wordings::last_wn(W) >= wn+1) && (Lexer::word(wn+1) == internal_V))
			@<Parse an internal nonterminal declaration@>
		else if ((Wordings::last_wn(W) >= wn+2) && (Lexer::word(wn+1) == COLONCOLONEQUALS_V))
			@<Parse a regular nonterminal declaration@>
		else PreformUtilities::production_error(NULL, NULL,
			"syntax error in Preform declarations");
	}
	Optimiser::optimise_counts();
	return declarations;
}

@ We either switch to an existing natural language, or create a new one.

@<Switch to a new definition language@> =
	TEMPORARY_TEXT(lname)
	WRITE_TO(lname, "%W", Wordings::one_word(wn+1));
	NATURAL_LANGUAGE_WORDS_TYPE *nl = NULL;
	#ifdef PREFORM_LANGUAGE_FROM_NAME_WORDS_CALLBACK
	nl = PREFORM_LANGUAGE_FROM_NAME_WORDS_CALLBACK(lname);
	#endif
	if (nl == NULL) {
		LOG("Missing: %S\n", lname);
		PreformUtilities::production_error(NULL, NULL,
			"tried to define for missing language");
	}
	DISCARD_TEXT(lname)
	current_natural_language = nl;
	wn++;

@ Internal declarations appear as single lines in |Syntax.preform|.

@<Parse an internal nonterminal declaration@> =
	nonterminal *nt = Nonterminals::find(Lexer::word(wn));
	if (nt->first_pl) PreformUtilities::production_error(nt, NULL,
		"nonterminal internal in one definition and regular in another");
	nt->marked_internal = TRUE;
	wn++;
	declarations++;
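@ Putting these kinds of line together, then, a |Syntax.preform| file consists
of a sequence of declarations shaped like the following. (A made-up
illustration, not a quotation from the real |Syntax.preform|; regular
declarations such as the third line are parsed next.)
= (text as Preform)
	language English
	<cardinal-number> internal
	<competitor> ::= the pacemaker | runner no <cardinal-number>
=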
@ Regular declarations are much longer and continue until the end of the
text, or until we reach a paragraph break. The body of such a declaration is
a list of productions divided by stroke symbols. Note that an empty
production is not recorded: e.g., if there are two consecutive strokes, the
gap between them will not be considered as a production, and the second
stroke will simply be ignored. I suppose this is arguably a syntax error and
could be rejected as such, but it does no harm.

@<Parse a regular nonterminal declaration@> =
	nonterminal *nt = Nonterminals::find(Lexer::word(wn));
	if (nt->marked_internal) PreformUtilities::production_error(nt, NULL,
		"nonterminal internal in one definition and regular in another");
	production_list *pl = LoadPreform::find_list_for_language(nt, current_natural_language);
	wn += 2; /* advance past the ID word and the |::=| word */
	int pc = 0;
	while (TRUE) {
		int x = wn;
		while ((x <= Wordings::last_wn(W)) && (Lexer::word(x) != STROKE_V) &&
			(Lexer::word(x) != PARBREAK_V)) x++;
		if (wn < x) {
			wording PW = Wordings::new(wn, x-1);
			wn = x;
			LoadPreform::add_production(LoadPreform::new_production(PW, nt, pc++), pl);
		}
		if ((wn > Wordings::last_wn(W)) || (Lexer::word(x) == PARBREAK_V))
			break; /* reached end */
		wn++; /* advance past the stroke and continue */
	}
	wn--;
	declarations++;

@h Production lists.
Regular nonterminals are defined by a list of alternative possibilities
divided by vertical stroke characters; these are called "productions", for
reasons going back to computer science history. However, the same nonterminal
can have multiple lists, one for each language where it's defined: for
example, a single nonterminal could have one English and one French
definition both in memory at the same time. Each would be an independent
//production_list// object.

=
typedef struct production_list {
	NATURAL_LANGUAGE_WORDS_TYPE *definition_language;
	struct production_list *next_pl; /* in the list of PLs for a NT */
	struct production *first_pr; /* start of linked list of productions */
	struct match_avinue *as_avinue; /* when compiled to a trie rather than for Preform */
	CLASS_DEFINITION
} production_list;

@ =
production_list *LoadPreform::find_list_for_language(nonterminal *nt,
	NATURAL_LANGUAGE_WORDS_TYPE *L) {
	production_list *pl;
	for (pl = nt->first_pl; pl; pl = pl->next_pl)
		if (pl->definition_language == L)
			break;
	if (pl == NULL) {
		pl = CREATE(production_list);
		pl->definition_language = L;
		pl->first_pr = NULL;
		pl->as_avinue = NULL;
		@<Add the new production list to the nonterminal@>;
	}
	return pl;
}

@<Add the new production list to the nonterminal@> =
	if (nt->first_pl == NULL) nt->first_pl = pl;
	else {
		production_list *p = nt->first_pl;
		while ((p) && (p->next_pl)) p = p->next_pl;
		p->next_pl = pl;
	}

@ It is undeniably clumsy that the linked list of PLs, and also of productions
within each PL, is managed by hand rather than by using //foundation//. But
speed is critical when parsing Inform text from these productions, and we
want to minimise overhead.

=
void LoadPreform::add_production(production *pr, production_list *pl) {
	if (pl->first_pr == NULL) pl->first_pr = pr;
	else {
		production *p = pl->first_pr;
		while ((p) && (p->next_pr)) p = p->next_pr;
		p->next_pr = pr;
	}
}
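@ A client wanting to know how many productions a nonterminal currently has
in a given language could simply walk these lists. The following is only an
illustrative sketch using the structures above: |Example::count_productions|
is a hypothetical function, not part of //words//.
= (text as InC)
	int Example::count_productions(nonterminal *nt, NATURAL_LANGUAGE_WORDS_TYPE *L) {
		int n = 0;
		production_list *pl;
		for (pl = nt->first_pl; pl; pl = pl->next_pl)
			if (pl->definition_language == L) {
				production *pr;
				for (pr = pl->first_pr; pr; pr = pr->next_pr) n++;
			}
		return n;
	}
=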
@h Productions and ptokens.
So now we reach the production, which encodes a typical "row" of grammar: for
example,
= (text as Preform)
	runner no <cardinal-number>
=
is a production. This is implemented as still another list, of "ptokens" (the
"p" is silent): that example has three ptokens. Note that the stroke sign and
the defined-by sign are not ptokens; they divide up productions, but aren't
part of them.

//The Optimiser// calculates data on productions just as it does on
nonterminals. For example, it can see that the above can only match a text if
it has exactly 3 words, so it sets both |pr_extremes.min_words| and
|pr_extremes.max_words| to 3. For the meaning of the remaining data, and for
what "struts" are, see //The Optimiser//: it only confuses the picture here.

@d MAX_STRUTS_PER_PRODUCTION 10

=
typedef struct production {
	struct ptoken *first_pt; /* the linked list of ptokens */
	int match_number; /* 0 for |/a/|, 1 for |/b/| and so on: see //About Preform// */
	int no_ranges; /* actually one more, since range 0 is reserved */
	struct production_optimisation_data opt; /* see //The Optimiser// */
	struct production_instrumentation_data ins; /* see //Instrumentation// */
	struct production *next_pr; /* within its production list */
	CLASS_DEFINITION
} production;

@ And at the bottom of God's great chain, the lowly ptoken. Even this can
spawn another list, though: the token |fried/green/tomatoes| is a list of
three ptokens joined by the |alternative_ptoken| links.

There are really only three kinds of ptoken: wildcards, fixed words, and
nonterminals. But it's fractionally quicker to differentiate the sorts of
wildcard here, so we'll actually divide them into five. The remaining
wildcard, the |......| form of |...|, is represented as
|MULTIPLE_WILDCARD_PTC| but with the |balanced_wildcard| flag set.

@d SINGLE_WILDCARD_PTC 1
@d MULTIPLE_WILDCARD_PTC 2
@d POSSIBLY_EMPTY_WILDCARD_PTC 3
@d FIXED_WORD_PTC 4
@d NONTERMINAL_PTC 5

=
typedef struct ptoken {
	/* How to parse text against this ptoken */
	int ptoken_category; /* one of the |*_PTC| values */
	int balanced_wildcard; /* for |MULTIPLE_WILDCARD_PTC| ptokens: brackets balanced? */
	int negated_ptoken; /* the |^| modifier applies */
	int disallow_unexpected_upper; /* the |_| modifier applies */
	struct nonterminal *nt_pt; /* for |NONTERMINAL_PTC| ptokens */
	struct vocabulary_entry *ve_pt; /* for |FIXED_WORD_PTC| ptokens */
	struct ptoken *alternative_ptoken; /* linked list of other vocabulary ptokens */

	/* What results from a successful match against this ptoken */
	int result_index; /* for |NONTERMINAL_PTC| ptokens: what result number, counting from 1? */
	int range_starts; /* 1, 2, 3, ... if word range 1, 2, 3, ... starts with this */
	int range_ends; /* 1, 2, 3, ... if word range 1, 2, 3, ... ends with this */

	struct ptoken_optimisation_data opt; /* see //The Optimiser// */
	struct ptoken_instrumentation_data ins; /* see //Instrumentation// */
	struct ptoken *next_pt; /* within its production list */
	CLASS_DEFINITION
} ptoken;

@ Each ptoken has a |range_starts| and |range_ends| number. Each is either -1,
or marks that the ptoken occurs as the first or last in a range (or both).
For example, in the production
= (text as Preform)
	make ... from {rice ... onions} and peppers
=
the first |...| ptoken has start and end set to 1; |rice| has start 2;
|onions| has end 2. Note that the second |...|, inside the braces, doesn't
start or end anything; it normally would, but the wider range consumes it.
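@ Code which has hold of a production can then walk its ptoken list directly.
For instance, the number of wildcard ptokens in a production could be counted
like so. (A sketch only: |Example::count_wildcards| is a hypothetical
function, not one defined in //words//.)
= (text as InC)
	int Example::count_wildcards(production *pr) {
		int n = 0;
		ptoken *pt;
		for (pt = pr->first_pt; pt; pt = pt->next_pt)
			if ((pt->ptoken_category == SINGLE_WILDCARD_PTC) ||
				(pt->ptoken_category == MULTIPLE_WILDCARD_PTC) ||
				(pt->ptoken_category == POSSIBLY_EMPTY_WILDCARD_PTC))
				n++;
		return n;
	}
=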
@h Reading productions.
We read the wording |W| of a production for the nonterminal |nt|, with |pc|
or "production count" being 0 for the first one defined, 1 for the second,
and so on.

=
production *LoadPreform::new_production(wording W, nonterminal *nt, int pc) {
	production *pr = CREATE(production);
	pr->match_number = pc; /* "production count": 0 for first in defn, and so on */
	pr->next_pr = NULL;
	pr->no_ranges = 1; /* so that they count from 1; range 0 is unused */
	Optimiser::initialise_production_data(&(pr->opt));
	Instrumentation::initialise_production_data(&(pr->ins));
	ptoken *head = NULL, *tail = NULL;
	@<Parse the row of production tokens@>;
	pr->first_pt = head;
	return pr;
}

@ So, then, we have to turn a wording like:
= (text as Preform)
	make ... from {rice ... onions/shallots} and peppers
=
into a sequence of ptokens, setting |head| to the first and |tail| to the
last. This particular example will produce two word ranges: one for the
initial |...|, one for the matter in the braces. Because of that, we need to
keep track of whether we're in braces or not. (Braces cannot be nested.) The
alternative |onions/shallots| is called a "slashed chain" in the code below.

@d OUTSIDE_PTBRACE 0
@d ABOUT_TO_OPEN_PTBRACE 1
@d INSIDE_PTBRACE 2

@<Parse the row of production tokens@> =
	int result_count = 1;
	int negation_modifier = FALSE, lower_case_modifier = FALSE;
	int unescaped = TRUE;
	int bracing_mode = OUTSIDE_PTBRACE;
	ptoken *bracing_begins_at = NULL;
	int token_count = 0;
	LOOP_THROUGH_WORDING(i, W) {
		if (unescaped) @<Parse the token modifier symbols@>;
		ptoken *pt = LoadPreform::parse_slashed_chain(nt, pr, i, unescaped);
		if (pt) {
			if (pt->ptoken_category == NONTERMINAL_PTC)
				@<Assign the result number for this nonterminal token@>;
			@<Modify the new ptoken according to the current modifiers@>;
			@<Add the new ptoken to the production@>;
		}
	}

@<Parse the token modifier symbols@> =
	if (Lexer::word(i) == CARET_V) { negation_modifier = TRUE; continue; }
	if (Lexer::word(i) == UNDERSCORE_V) { lower_case_modifier = TRUE; continue; }
	if (Lexer::word(i) == BACKSLASH_V) { unescaped = FALSE; continue; }
	switch (bracing_mode) {
		case OUTSIDE_PTBRACE:
			if (Lexer::word(i) == OPENBRACE_V) {
				bracing_mode = ABOUT_TO_OPEN_PTBRACE;
				continue;
			}
			break;
		case INSIDE_PTBRACE:
			if (Lexer::word(i) == CLOSEBRACE_V) {
				if (bracing_begins_at) @<Set the word range to match the bracing@>;
				bracing_mode = OUTSIDE_PTBRACE;
				bracing_begins_at = NULL;
				continue;
			}
			break;
	}

@ A bracing can be followed by |? N| to make it have the number |N|, rather
than the next number counting upwards; see //About Preform//.

@<Set the word range to match the bracing@> =
	int rnum = pr->no_ranges++;
	if ((i+2 <= Wordings::last_wn(W)) && (Lexer::word(i+1) == QUESTIONMARK_V) &&
		(Vocabulary::test_flags(i+2, NUMBER_MC))) {
		rnum = Vocabulary::get_literal_number_value(Lexer::word(i+2));
		i += 2;
	}
	if ((rnum < 1) || (rnum >= MAX_RANGES_PER_PRODUCTION)) {
		PreformUtilities::production_error(nt, pr, "range number out of range");
		rnum = 1;
	}
	bracing_begins_at->range_starts = rnum;
	tail->range_ends = rnum;

@<Modify the new ptoken according to the current modifiers@> =
	if (negation_modifier) pt->negated_ptoken = TRUE;
	if (lower_case_modifier) pt->disallow_unexpected_upper = TRUE;
	unescaped = TRUE;
	negation_modifier = FALSE;
	lower_case_modifier = FALSE;
	switch (bracing_mode) {
		case OUTSIDE_PTBRACE:
			if ((pt->ptoken_category == SINGLE_WILDCARD_PTC) ||
				(pt->ptoken_category == MULTIPLE_WILDCARD_PTC) ||
				(pt->ptoken_category == POSSIBLY_EMPTY_WILDCARD_PTC)) {
				int rnum = pr->no_ranges++;
				if ((rnum < 1) || (rnum >= MAX_RANGES_PER_PRODUCTION)) {
					PreformUtilities::production_error(nt, pr, "range number out of range");
					rnum = 1;
				}
				pt->range_starts = rnum;
				pt->range_ends = rnum;
			}
			break;
		case ABOUT_TO_OPEN_PTBRACE:
			bracing_begins_at = pt;
			bracing_mode = INSIDE_PTBRACE;
			break;
	}
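@ To give a made-up illustration of the modifiers (the nonterminal
<example-request> is invented and not part of the Inform grammar):
= (text as Preform)
	<example-request> ::= switch to _verbose mode | switch to ^quiet mode
=
Here |_verbose| matches the word "verbose" only when it is not unexpectedly
upper-cased, while |^quiet| matches any single word other than "quiet".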
@ A nonterminal can be followed by |? N| to make it have result number |N|,
rather than the next number counting upwards; see //About Preform//.

@<Assign the result number for this nonterminal token@> =
	if (result_count < MAX_RESULTS_PER_PRODUCTION) {
		if ((i+2 <= Wordings::last_wn(W)) && (Lexer::word(i+1) == QUESTIONMARK_V) &&
			(Vocabulary::test_flags(i+2, NUMBER_MC))) {
			pt->result_index = Vocabulary::get_literal_number_value(Lexer::word(i+2));
			if ((pt->result_index < 0) || (pt->result_index >= MAX_RESULTS_PER_PRODUCTION)) {
				PreformUtilities::production_error(nt, pr, "result number out of range");
				pt->result_index = 1;
			}
			i += 2;
		} else {
			pt->result_index = result_count;
		}
		result_count++;
	} else {
		PreformUtilities::production_error(nt, pr,
			"too many nonterminals for one production to hold");
	}

@<Add the new ptoken to the production@> =
	if (token_count++ < MAX_PTOKENS_PER_PRODUCTION) {
		if (head == NULL) head = pt; else tail->next_pt = pt;
		tail = pt;
	} else {
		PreformUtilities::production_error(nt, pr,
			"too many tokens on production for nonterminal");
	}

@ Here we parse what is, to the Lexer, a single word (at word number |wn|),
but which might actually be a row of possibilities divided by slashes: for
example, |onions/shallots|.

=
ptoken *LoadPreform::parse_slashed_chain(nonterminal *nt, production *pr, int wn,
	int unescaped) {
	wording AW = Wordings::one_word(wn);
	@<Re-lex the word using slashes as breaks@>;
	ptoken *pt = NULL;
	@<String the ptokens together into a list@>;
	return pt;
}

@ Should we detect a slash used to indicate alternatives, we have to ask the
Lexer to have another try, but with |/| as a word-breaking character this
time. So, for example, |AW| might then end up as |onions|, |/|, |shallots|.

@<Re-lex the word using slashes as breaks@> =
	wchar_t *p = Lexer::word_raw_text(wn);
	int k, breakme = FALSE;
	if (unescaped) {
		@<Intercept a match number@>;
		for (k=0; (p[k]) && (p[k+1]); k++)
			if ((k > 0) && (p[k] == '/'))
				breakme = TRUE;
	}
	if (breakme) AW = Feeds::feed_C_string_full(p, FALSE, L"/", FALSE); /* break only at slashes */

@ Intercept |/a/| to |/z/| and |/aa/| to |/zz/|, which don't make ptokens at
all, but simply change the production's match number.

@<Intercept a match number@> =
	if ((p[0] == '/') && (Characters::islower(p[1])) && (p[2] == '/') && (p[3] == 0)) {
		pr->match_number = p[1] - 'a';
		return NULL; /* i.e., contribute no token */
	}
	if ((p[0] == '/') && (Characters::islower(p[1])) && (p[2] == p[1]) &&
		(p[3] == '/') && (p[4] == 0)) {
		pr->match_number = p[1] - 'a' + 26;
		return NULL; /* i.e., contribute no token */
	}

@ And then we string together the ptokens made into a linked list with links
provided by |->alternative_ptoken|, and return the head of this list.

@<String the ptokens together into a list@> =
	ptoken *alt = NULL;
	for (; Wordings::nonempty(AW); AW = Wordings::trim_first_word(AW))
		if (Lexer::word(Wordings::first_wn(AW)) != FORWARDSLASH_V) {
			int mode = unescaped;
			if (Wordings::length(AW) > 1) mode = FALSE;
			ptoken *latest = LoadPreform::new_ptoken(Lexer::word(Wordings::first_wn(AW)),
				mode, nt, pr->match_number);
			if (alt == NULL) pt = latest; /* thus making |pt| the head */
			else alt->alternative_ptoken = latest;
			alt = latest;
		}
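@ Both devices can be seen in this made-up production (the nonterminal
<example-vegetable> is invented, not part of the Inform grammar):
= (text as Preform)
	<example-vegetable> ::= /a/ onions/shallots | /b/ peppers
=
The labels |/a/| and |/b/| give the two productions match numbers 0 and 1
respectively, while |onions/shallots| is a slashed chain matching either of
those two words.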
@ Finally, then, the bottom-most function here parses what is definitely a
single word into what will definitely be a single ptoken. In "escaped" mode,
where a backslash has made the text literal, it just becomes a fixed word;
otherwise it could be any of the five categories.

=
ptoken *LoadPreform::new_ptoken(vocabulary_entry *ve, int unescaped,
	nonterminal *nt, int pc) {
	ptoken *pt = CREATE(ptoken);
	@<Begin with a blank ptoken@>;
	wchar_t *p = Vocabulary::get_exemplar(ve, FALSE);
	if ((unescaped) && (p) && (p[0] == '<') && (p[Wide::len(p)-1] == '>'))
		@<This word is a nonterminal name@>
	else
		@<This word is not a nonterminal name@>;
	return pt;
}

@<Begin with a blank ptoken@> =
	pt->next_pt = NULL;
	pt->ptoken_category = FIXED_WORD_PTC;
	pt->balanced_wildcard = FALSE;
	pt->negated_ptoken = FALSE;
	pt->disallow_unexpected_upper = FALSE;
	pt->ve_pt = NULL;
	pt->nt_pt = NULL;
	pt->alternative_ptoken = NULL;
	pt->result_index = 1;
	pt->range_starts = -1;
	pt->range_ends = -1;
	Optimiser::initialise_ptoken_data(&(pt->opt));
	Instrumentation::initialise_ptoken_data(&(pt->ins));

@ If the text refers to a nonterminal which doesn't yet exist, then this
creates it; that's how we deal with forward references. //Nonterminals::find//
never returns |NULL|.

@<This word is a nonterminal name@> =
	pt->nt_pt = Nonterminals::find(ve);
	pt->ptoken_category = NONTERMINAL_PTC;

@<This word is not a nonterminal name@> =
	pt->ve_pt = ve;
	if (unescaped) {
		if (ve == SIXDOTS_V) {
			pt->ptoken_category = MULTIPLE_WILDCARD_PTC;
			pt->balanced_wildcard = TRUE;
		}
		if (ve == THREEDOTS_V) pt->ptoken_category = MULTIPLE_WILDCARD_PTC;
		if (ve == THREEHASHES_V) pt->ptoken_category = SINGLE_WILDCARD_PTC;
		if (ve == THREEASTERISKS_V) pt->ptoken_category = POSSIBLY_EMPTY_WILDCARD_PTC;
	}
	if (pt->ptoken_category == FIXED_WORD_PTC) Nonterminals::note_word(ve, nt, pc);
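@ To illustrate the resulting categories, here is a hypothetical snippet, not
part of //words//: |Example::show_categories| does not exist in the Inform
source, and calling //LoadPreform::new_ptoken// outside the loader is purely
for demonstration.
= (text as InC)
	void Example::show_categories(nonterminal *nt) {
		ptoken *a = LoadPreform::new_ptoken(Vocabulary::entry_for_text(L"mango"), TRUE, nt, 0);
		ptoken *b = LoadPreform::new_ptoken(Vocabulary::entry_for_text(L"..."), TRUE, nt, 0);
		ptoken *c = LoadPreform::new_ptoken(Vocabulary::entry_for_text(L"<competitor>"), TRUE, nt, 0);
		/* a fixed word, a wildcard and a nonterminal, so we would expect to see
			FIXED_WORD_PTC, MULTIPLE_WILDCARD_PTC and NONTERMINAL_PTC here: */
		LOG("%d %d %d\n", a->ptoken_category, b->ptoken_category, c->ptoken_category);
	}
=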