mirror of
https://github.com/ganelson/inform.git
synced 2024-07-18 06:54:26 +03:00
735 lines
29 KiB
OpenEdge ABL
735 lines
29 KiB
OpenEdge ABL
[Sentences::] Sentences.
|
|
|
|
To break up the stream of words produced by the lexer into
|
|
English sentences, and join each to the parse tree.
|
|
|
|
@h Sentence breaking.
|
|
What breaks a sentence? In ordinary English, question marks, exclamation
|
|
marks, in some cases ellipses, but mainly full stops. In Inform source text,
|
|
only full stops are used outside quoted text; but we do have to recognise the
|
|
other cases when they occur at the end of quoted matter. Moreover, we
|
|
actually subdivide a little further, because we also want to break up
|
|
rule "sentences" into their subordinate clauses. Thus, going on
|
|
punctuation, we recognise rules as having the following model:
|
|
|
|
>> Preamble: phrase 1; phrase 2; ...; phrase N.
|
|
|
|
It is even, in certain limited circumstances, possible that a comma can
|
|
divide a sentence:
|
|
|
|
>> Instead of eating, say "You really aren't hungry just now."
|
|
|
|
This means that context is important even here, where it might have been
|
|
expected that all we needed to do was to spot the punctuation marks.
|
|
|
|
@h Finite state machine.
|
|
So we carry out the sentence breaking with a simple finite state machine --
|
|
the last sentence having been a rule preamble tells us that the current one
|
|
is probably a phrase, and so on -- and the following is its state. It is
|
|
inelegant that we have a singleton copy of this object and use a pointer
|
|
to it as a global variable; but it saves an awful lot of parameter-passing
|
|
in Preform grammar functions.
|
|
|
|
@default PROBLEM_REF_SYNTAX_TYPE void
|
|
@default PROJECT_REF_SYNTAX_TYPE void
|
|
|
|
@e NO_EXTENSION_POS from 0
|
|
@e BEFORE_BEGINS_EXTENSION_POS
|
|
@e MIDDLE_EXTENSION_POS
|
|
@e AFTER_ENDS_EXTENSION_POS
|
|
@e PAST_CARING_EXTENSION_POS
|
|
|
|
=
|
|
/* The state of the sentence-breaking finite state machine. A single instance
exists, |the_one_and_only|, and the global |sfsm| points to it. */
typedef struct syntax_fsm_state {
|
|
source_file *sf; /* reading from this source file */
|
|
int ext_pos; /* one of the |*_EXTENSION_POS| values: where we are in an extension */
|
|
int skipping_material_at_level; /* |-1| when not skipping; otherwise the level of the heading whose material is being skipped */
|
|
int main_source_start_wn; /* word number set by |Sentences::set_start_of_source|; not altered by |Sentences::reset| */
|
|
node_type_t nt; /* node type applied to structural sentences; never assigned in this section -- presumably set when |<structural-sentence>| matches (TODO confirm) */
|
|
int inside_rule_mode; /* |TRUE| once a rule preamble has been read and its phrases are still arriving */
|
|
int inside_table_mode; /* |TRUE| while the sentence being scanned counts as a table, which suppresses breaks at quoted punctuation */
|
|
PROBLEM_REF_SYNTAX_TYPE *ref; /* handed to |Sentences::syntax_problem| and to the begins/ends-here callback */
|
|
PROJECT_REF_SYNTAX_TYPE *project_ref; /* handed to |NEW_HEADING_SYNTAX_CALLBACK| */
|
|
} syntax_fsm_state;
|
|
|
|
syntax_fsm_state the_one_and_only;
|
|
syntax_fsm_state *sfsm = &the_one_and_only;
|
|
|
|
@ Note that a reset deliberately leaves |main_source_start_wn| alone;
|
|
that's because we reset each time we begin a round of sentence-breaking, and
|
|
there may be many such rounds on the same Inform project, but there's only
|
|
one source text start position.
|
|
|
|
=
|
|
/* Record |wn| as the word number at which the main source text starts. This
value is what |<if-start-of-source-text>| compares against, and it is not
changed by |Sentences::reset|. */
void Sentences::set_start_of_source(syntax_fsm_state *sfsm, int wn) {
|
|
sfsm->main_source_start_wn = wn;
|
|
}
|
|
|
|
/* Put the state machine into its starting condition for a round of sentence
breaking. |is_extension| chooses between the "before begins here" position and
the "not in an extension" position; |ref| and |project_ref| are stored so that
problem messages and heading callbacks can be given them later.

|main_source_start_wn| is deliberately preserved. |inside_table_mode| is now
cleared as well, so that no table state can leak from one round into the next.
|nt| is left alone, since this section never assigns it. */
void Sentences::reset(syntax_fsm_state *sfsm, int is_extension,
	PROBLEM_REF_SYNTAX_TYPE *ref, PROJECT_REF_SYNTAX_TYPE *project_ref) {
	sfsm->sf = NULL; /* so that the first sentence always registers as a change of file */
	sfsm->inside_rule_mode = FALSE;
	sfsm->inside_table_mode = FALSE;
	sfsm->skipping_material_at_level = -1; /* i.e., not skipping */
	sfsm->ref = ref;
	sfsm->project_ref = project_ref;
	if (is_extension) sfsm->ext_pos = BEFORE_BEGINS_EXTENSION_POS;
	else sfsm->ext_pos = NO_EXTENSION_POS;
}
|
|
|
|
@ These are the syntax errors we will generate.
|
|
|
|
@e UnexpectedSemicolon_SYNERROR from 1
|
|
@e ParaEndsInColon_SYNERROR
|
|
@e SentenceEndsInColon_SYNERROR
|
|
@e SentenceEndsInSemicolon_SYNERROR
|
|
@e SemicolonAfterColon_SYNERROR
|
|
@e SemicolonAfterStop_SYNERROR
|
|
@e ExtSpuriouslyContinues_SYNERROR
|
|
@e ExtNoBeginsHere_SYNERROR
|
|
@e ExtNoEndsHere_SYNERROR
|
|
@e HeadingOverLine_SYNERROR
|
|
@e HeadingStopsBeforeEndOfLine_SYNERROR
|
|
|
|
@ Now for the function itself. We break into bite-sized chunks, each of which is
|
|
despatched to the |Sentences::make_node| function with a note of the punctuation
|
|
which was used to end it.
|
|
|
|
=
|
|
/* Break the wording |W| into sentences attached to the tree |T|, treating it
as non-extension text and supplying no problem or project reference. */
void Sentences::break(parse_node_tree *T, wording W) {
|
|
Sentences::break_inner(T, W, FALSE, NULL, NULL);
|
|
}
|
|
/* As |Sentences::break|, still as non-extension text, but passing on |ref| and
|project_ref|. NOTE(review): |project_ref| is typed |void *| here whereas the
sibling functions use |PROJECT_REF_SYNTAX_TYPE *|; the two agree only while
that type keeps its default of |void|. */
void Sentences::break_into_project_copy(parse_node_tree *T, wording W,
|
|
PROBLEM_REF_SYNTAX_TYPE *ref, void *project_ref) {
|
|
Sentences::break_inner(T, W, FALSE, ref, project_ref);
|
|
}
|
|
/* As |Sentences::break|, but treating |W| as the text of an extension, so that
"begins here" and "ends here" sentences are recognised and their absence is
reported at the end. */
void Sentences::break_into_extension_copy(parse_node_tree *T, wording W,
|
|
PROBLEM_REF_SYNTAX_TYPE *ref, PROJECT_REF_SYNTAX_TYPE *project_ref) {
|
|
Sentences::break_inner(T, W, TRUE, ref, project_ref);
|
|
}
|
|
|
|
/* The common worker behind the three |Sentences::break*| functions. It scans
|W| word by word, and each time a sentence division is found it hands the text
since |sentence_start| to |Sentences::make_node|, together with the stop
character which ended it. The state machine is reset on entry and on exit. */
void Sentences::break_inner(parse_node_tree *T, wording W, int is_extension,
	PROBLEM_REF_SYNTAX_TYPE *ref, PROJECT_REF_SYNTAX_TYPE *project_ref) {
	/* throw away leading paragraph breaks, and do nothing if that leaves no text */
	while ((Wordings::nonempty(W)) && (compare_word(Wordings::first_wn(W), PARBREAK_V)))
		W = Wordings::trim_first_word(W);
	if (Wordings::empty(W)) return;

	int sentence_start = Wordings::first_wn(W);
	SyntaxTree::enable_last_sentence_cache(T);

	Sentences::reset(sfsm, is_extension, ref, project_ref);
	@<Go into table sentence mode if necessary@>;

	LOOP_THROUGH_WORDING(position, W) {
		/* a division is only looked for strictly after the first word of a sentence */
		if (sentence_start >= position) continue;

		int no_stop_words, back_up_one_word;
		int stop_character;

		@<Look for a sentence break, finding the number of stop words and the stop character@>;
		if (no_stop_words <= 0) continue;

		Sentences::make_node(T, Wordings::new(sentence_start, position-1), stop_character);

		/* move to the last stop word, so that the loop increment steps past the run */
		position = position + no_stop_words - 1;

		/* after an X break the current word was not a stop word: it opens the next sentence */
		sentence_start = (back_up_one_word) ? position : (position + 1);

		@<Go into table sentence mode if necessary@>;
	}

	/* any text left over is a final sentence, unless it is a lone paragraph break */
	int final_wn = Wordings::last_wn(W);
	int text_remains = FALSE;
	if (sentence_start < final_wn) text_remains = TRUE;
	else if ((sentence_start == final_wn) && (Lexer::word(final_wn) != PARBREAK_V))
		text_remains = TRUE;
	if (text_remains)
		Sentences::make_node(T, Wordings::from(W, sentence_start), '.');

	SyntaxTree::disable_last_sentence_cache(T);

	if (is_extension)
		@<Issue a problem message if we are missing the begin and end here sentences@>;
	Sentences::reset(sfsm, FALSE, NULL, NULL);
}
|
|
|
|
@ A table is any sentence beginning with the word "Table". (Bad news for
|
|
anyone writing "Table Mountain is a room.", of course, but there are other
|
|
ways to do that, and it seems wise to keep the syntax for tables clear,
|
|
since their entries are governed by different lexical and semantic rules.)
|
|
|
|
@<Go into table sentence mode if necessary@> =
|
|
if ((<structural-sentence>(Wordings::from(W, sentence_start))) &&
|
|
(NodeType::has_flag(sfsm->nt, TABBED_NFLAG)))
|
|
sfsm->inside_table_mode = TRUE;
|
|
else
|
|
sfsm->inside_table_mode = FALSE;
|
|
|
|
@ We now come to the definition of a sentence break, which is more complicated
|
|
than might have been expected.
|
|
|
|
For one thing, a run of sentence divisions is treated as a single division,
|
|
only the last of which is the one which counts. This looks odd at first sight,
|
|
because it means that Inform considers
|
|
|
|
>> The cat is on the table;.
|
|
|
|
to be a valid sentence, equivalent to
|
|
|
|
>> The cat is on the table.
|
|
|
|
But it has the advantage that it enables us to avoid being pointlessly strict
|
|
over the punctuation which precedes a paragraph break. Some people like to
|
|
write paragraphs like this:
|
|
= (text as Inform 7)
|
|
Before going north:
|
|
say "Northward ho!";
|
|
now the compass points north;
|
|
=
|
|
And properly speaking that ends with a semicolon then a paragraph break,
|
|
which is a doubled sentence division. But we forgive it as harmless, and
|
|
that forgiveness is provided by the loop arrangement below.
|
|
|
|
We also avoid the need for empty sentences, because it is not possible
|
|
for the code below to detect them: thus
|
|
|
|
>> say "Look behind you!";;;;; now the Wug is in the Cave
|
|
|
|
is broken as two sentences, not six sentences of which four are empty.
|
|
Perhaps we ought to be stricter, and reject more of these dubious forms,
|
|
but at this point we have too little understanding of the semantics of
|
|
the text to risk annoying the user with problem messages.
|
|
|
|
@ Full stops, semicolons and paragraph breaks (all rendered by the lexer as
|
|
individual words: the stroke word in the case of the last of these) are always
|
|
sentence divisions. The other cases are more complicated: see below.
|
|
|
|
@<Look for a sentence break, finding the number of stop words and the stop character@> =
|
|
int at = position;
|
|
no_stop_words = 0; stop_character = '?'; back_up_one_word = FALSE;
|
|
while (at < Wordings::last_wn(W)) {
|
|
int stopped = FALSE;
|
|
|
|
if (Lexer::word(at) == PARBREAK_V) {
|
|
if (stop_character == ':') @<Issue problem for colon at end of paragraph@>;
|
|
stop_character = '|'; stopped = TRUE;
|
|
}
|
|
if (Lexer::word(at) == FULLSTOP_V) {
|
|
if (stop_character == ':') @<Issue problem for colon at end of sentence@>;
|
|
if (stop_character == ';') @<Issue problem for semicolon at end of sentence@>;
|
|
stop_character = '.'; stopped = TRUE;
|
|
}
|
|
if (Lexer::word(at) == SEMICOLON_V) {
|
|
if (stop_character == ':') @<Issue problem for semicolon after colon@>;
|
|
if (stop_character == '.') @<Issue problem for semicolon after full stop@>;
|
|
stop_character = ';'; stopped = TRUE;
|
|
}
|
|
|
|
@<Consider if a colon divides a sentence@>;
|
|
@<Consider if punctuation within a preceding quoted text divides a sentence, making an X break@>;
|
|
|
|
if (stopped == FALSE) break;
|
|
no_stop_words++; at++;
|
|
}
|
|
if (stop_character == 'X') { /* X breaks are like full stops, but there is no stop word to skip over */
|
|
stop_character = '.'; back_up_one_word = TRUE;
|
|
}
|
|
if (no_stop_words > 0)
|
|
LOGIF(LEXICAL_OUTPUT, "Stop character '%c', no_stop_words %d, sentence_break %d, position %d\n",
|
|
stop_character, no_stop_words, sentence_start, position);
|
|
|
|
@<Issue problem for colon at end of paragraph@> =
|
|
/* the word at |at| is the paragraph break, so the quoted text stops one word earlier */
Sentences::syntax_problem(ParaEndsInColon_SYNERROR, Wordings::new(sentence_start, at-1), sfsm->ref, 0);
|
|
|
|
@<Issue problem for colon at end of sentence@> =
|
|
/* here, and in the three cases below, the quoted text runs up to and including the stop word at |at| */
Sentences::syntax_problem(SentenceEndsInColon_SYNERROR, Wordings::new(sentence_start, at), sfsm->ref, 0);
|
|
|
|
@<Issue problem for semicolon at end of sentence@> =
|
|
/* a full stop has been found directly after a semicolon */
Sentences::syntax_problem(SentenceEndsInSemicolon_SYNERROR, Wordings::new(sentence_start, at), sfsm->ref, 0);
|
|
|
|
@<Issue problem for semicolon after colon@> =
|
|
/* a semicolon has been found directly after a colon */
Sentences::syntax_problem(SemicolonAfterColon_SYNERROR, Wordings::new(sentence_start, at), sfsm->ref, 0);
|
|
|
|
@<Issue problem for semicolon after full stop@> =
|
|
/* a semicolon has been found directly after a full stop */
Sentences::syntax_problem(SemicolonAfterStop_SYNERROR, Wordings::new(sentence_start, at), sfsm->ref, 0);
|
|
|
|
@ Colons are normally dividers, too, but an exception is made if they come
|
|
between two apparently numerical constructions, because this suggests that
|
|
the colon is being used not as punctuation but within a literal pattern.
|
|
(For instance, "He went out at 1:34 PM." is a sentence with just one
|
|
clause, not two clauses divided by the colon; but "He went out at 1 PM:
|
|
the snow was still falling." is indeed divided. Our rule here correctly
|
|
distinguishes these cases, and although it can be fooled by really contrived
|
|
sentences -- "He went out at 1: 22 Company, the Parachute Regiment, was
|
|
marching." -- it's robust enough in practice. The exception is forbidden
|
|
if a line break occurs between the colon and the succeeding numeral, as
|
|
then we might be looking at switch cases in an "if".)
|
|
|
|
Note that here we are at a word position which is strictly within the word
|
|
range being sentence-broken, so that it is safe to examine both the word
|
|
before and the word after the current position.
|
|
|
|
@<Consider if a colon divides a sentence@> =
	#ifdef DIVIDE_AT_COLON_SYNTAX_CALLBACK
	/* the client decides for itself whether the word at |at| is a dividing colon */
	if (DIVIDE_AT_COLON_SYNTAX_CALLBACK(at)) {
		stopped = TRUE; stop_character = ':';
	}
	#else
	/* by default: a colon which is the first divider in its run, and which
	comes from the same source file as the word before it... */
	if ((Lexer::word(at) == COLON_V) &&
		(no_stop_words == 0) &&
		(Lexer::file_of_origin(at-1) == Lexer::file_of_origin(at))) {
		/* ...divides the sentence unless it sits between two words beginning
		with digits, the second of which has no indentation */
		int within_literal = FALSE;
		if ((Characters::isdigit(*(Lexer::word_raw_text(at-1)))) &&
			(Characters::isdigit(*(Lexer::word_raw_text(at+1)))) &&
			(Lexer::indentation_level(at+1) <= 0))
			within_literal = TRUE;
		if (within_literal == FALSE) {
			stopped = TRUE; stop_character = ':';
		}
	}
	#endif
|
|
|
|
@ Inform authors habitually use the punctuation in quoted text to end
|
|
sentences, just as other writers of English do. The text
|
|
|
|
>> "Look out!" The explosion shattered the calm of the hillside.
|
|
|
|
is certainly intended as two sentences, not one.
|
|
|
|
An exception is made for table declarations, because a table needs to be formed as
|
|
one long sentence, and it clearly does not abide by the ordinary punctuation
|
|
rules of English. The point is that in the random line of table entries...
|
|
|
|
>> "Of cabbages and kings."\qquad Walrus\qquad "Carroll"
|
|
|
|
...the full stop after "kings" has no significance: the semantics of the
|
|
table would be no different if it were not there.
|
|
|
|
@<Consider if punctuation within a preceding quoted text divides a sentence, making an X break@> =
	/* only look if we are not already at a division, if this would be the
	first divider of the run (so that two such texts in a row are not elided),
	and if we are not scanning the body of a table */
	if ((stopped == FALSE) && (no_stop_words == 0) && (sfsm->inside_table_mode == FALSE)) {
		/* the current word must begin with a capital letter... */
		if (isupper(*(Lexer::word_raw_text(at)))) {
			/* ...and the preceding one must be quoted text ending in punctuation */
			if (Word::text_ending_sentence(at-1)) {
				stopped = TRUE; stop_character = 'X';
			}
		}
	}
|
|
|
|
@h Making sentence nodes.
|
|
At this point we have established that |Sentences::make_node| is called
|
|
sequentially for every divided-off sentence in the original source text.
|
|
But we need a little machinery to skip past sentences which are being
|
|
excluded for one reason or another.
|
|
|
|
The design of Inform deliberately excludes conditional compilation in the
|
|
traditional C sense of |#ifdef| and |#endif|. This takes us too far from
|
|
what natural language would do, faced with the same basic issue. A book, or
|
|
a government form, would more naturally have a heading making clear that
|
|
the section beneath it is not universal in application. This is what Inform
|
|
does, too: it parses a heading to decide whether to skip the material,
|
|
and if so, the state |sfsm->skipping_material_at_level| is set to the
|
|
level of the heading in question. We then skip all subsequent sentences
|
|
until reaching the next heading of the same or higher status, or until
|
|
reaching the "... ends here." sentence (if we are reading an extension),
|
|
or until reaching the end of the text: whichever comes first.
|
|
|
|
=
|
|
/* Deal with one divided-off sentence |W|, which was ended by |stop_character|.
Depending on the state machine this is silently skipped, turned into a
|HEADING_NT| node, or accepted as one or more sentence nodes grafted on to |T|.
An empty |W| is an internal error. */
void Sentences::make_node(parse_node_tree *T, wording W, int stop_character) {
	int heading_level = 0; /* set to a positive level if this turns out to be a heading */
	int begins_or_ends = 0; /* 1 for "begins here", -1 for "ends here" */
	parse_node *new;

	if (Wordings::empty(W)) internal_error("empty sentence generated");

	Vocabulary::identify_word_range(W); /* a precaution to catch any late unidentified text */

	@<Detect a change of source file, and declare it as an implicit heading@>;
	@<Detect a dividing sentence@>;

	/* skipping ends at "ends here", or at a heading of the same or higher status */
	int skipping_ends = FALSE;
	if (begins_or_ends == -1) skipping_ends = TRUE;
	else if ((heading_level > 0) && (heading_level <= sfsm->skipping_material_at_level))
		skipping_ends = TRUE;
	if (skipping_ends) sfsm->skipping_material_at_level = -1;

	/* and if we are still skipping, this sentence is discarded */
	if (sfsm->skipping_material_at_level >= 0) return;

	if (heading_level <= 0) {
		@<Reject if we have run on past the end of an extension@>;
		@<Accept the new sentence as one or more nodes in the parse tree@>;
		return;
	}

	@<Issue a problem message if the heading incorporates a line break@>;
	@<Issue a problem message if the heading does not end with a line break@>;
	@<Make a new HEADING node, possibly beginning to skip material@>;
}
|
|
|
|
@ For reasons gone into in the section on Headings below, a change of
|
|
source file (e.g., when one extension has been read in and another begins)
|
|
is declared as if it were a super-heading in the text.
|
|
|
|
@<Detect a change of source file, and declare it as an implicit heading@> =
|
|
if (Lexer::file_of_origin(Wordings::first_wn(W)) != sfsm->sf) {
|
|
parse_node *implicit_heading = Node::new(HEADING_NT);
|
|
Node::set_text(implicit_heading, W);
|
|
Annotations::write_int(implicit_heading, heading_level_ANNOT, 0);
|
|
SyntaxTree::graft_sentence(T, implicit_heading);
|
|
#ifdef NEW_HEADING_SYNTAX_CALLBACK
|
|
NEW_HEADING_SYNTAX_CALLBACK(T, implicit_heading, sfsm->project_ref);
|
|
#endif
|
|
sfsm->skipping_material_at_level = -1;
|
|
}
|
|
sfsm->sf = Lexer::file_of_origin(Wordings::first_wn(W));
|
|
|
|
@<Reject if we have run on past the end of an extension@> =
	/* an ordinary sentence found after the "ends here" position is reported, but only once */
	if (begins_or_ends == 0) {
		if (sfsm->ext_pos == AFTER_ENDS_EXTENSION_POS) {
			Sentences::syntax_problem(ExtSpuriouslyContinues_SYNERROR, W, sfsm->ref, 0);
			sfsm->ext_pos = PAST_CARING_EXTENSION_POS; /* to avoid multiply issuing this */
		}
	}
|
|
|
|
@ The client must define a Preform nonterminal called |<dividing-sentence>|
|
|
which returns either a heading level number (1 to 10, with 1 the most
|
|
important), or |-1| to mean that the sentence begins an extension, or
|
|
|-2| that it ends one.
|
|
|
|
@<Detect a dividing sentence@> =
	if (<dividing-sentence>(W)) {
		int outcome = <<r>>;
		/* begins/ends here sentences only count when we are reading an extension */
		int reading_extension = (sfsm->ext_pos != NO_EXTENSION_POS);
		if (outcome == -1) {
			if (reading_extension) begins_or_ends = 1;
		} else if (outcome == -2) {
			if (reading_extension) begins_or_ends = -1;
		} else {
			heading_level = outcome; /* any other result is a heading level */
		}
	}
|
|
|
|
@ We have already looked to see if the sentence could be a heading, and set
|
|
the variable |heading_level| to be its ranking in the hierarchy (with 1,
|
|
for "volume", the highest). But we also want to check that the heading
|
|
does not have a line break in, because this is almost certainly a mistake
|
|
by the designer, and likely to be a difficult one to understand: so we
|
|
should help out if we can. Such a problem is best recovered from by
|
|
continuing regardless.
|
|
|
|
@<Issue a problem message if the heading incorporates a line break@> =
|
|
LOOP_THROUGH_WORDING(k, W)
|
|
if (k > Wordings::first_wn(W))
|
|
if ((Lexer::break_before(k) == '\n') || (Lexer::indentation_level(k) > 0)) {
|
|
Sentences::syntax_problem(HeadingOverLine_SYNERROR, W, sfsm->ref, k);
|
|
break;
|
|
}
|
|
|
|
@ And similarly... Here we take the liberty of looking a little ahead of
|
|
the current word range in order to make the problem message more helpful:
|
|
we check that we are still looking at valid words in the lexer, just to be
|
|
on the safe side, but in fact we cannot run on past the end of the lexer
|
|
feed which fed the malformed heading, because of all of the run-off
|
|
newlines automatically added at the end of the feed of any source file.
|
|
|
|
@<Issue a problem message if the heading does not end with a line break@> =
|
|
if (Lexer::break_before(Wordings::last_wn(W)+1) != '\n') {
|
|
int k;
|
|
for (k = Wordings::last_wn(W)+1;
|
|
(k<=Wordings::last_wn(W)+8) &&
|
|
(k<lexer_wordcount) && (Lexer::break_before(k) != '\n');
|
|
k++) ;
|
|
Sentences::syntax_problem(HeadingStopsBeforeEndOfLine_SYNERROR, W, sfsm->ref, k);
|
|
}
|
|
|
|
@ We now have a genuine heading, and can declare it, calling a routine
|
|
in Headings to determine whether we should include the material.
|
|
|
|
@<Make a new HEADING node, possibly beginning to skip material@> =
|
|
/* create the heading node, annotate it with its level and graft it on to the tree */
new = Node::new(HEADING_NT);
|
|
Node::set_text(new, W);
|
|
Annotations::write_int(new, heading_level_ANNOT, heading_level);
|
|
SyntaxTree::graft_sentence(T, new);
|
|
/* with no callback defined, material under a heading is never skipped */
#ifdef NEW_HEADING_SYNTAX_CALLBACK
|
|
/* a |FALSE| reply from the client means: skip what lies beneath this heading */
if (NEW_HEADING_SYNTAX_CALLBACK(T, new, sfsm->project_ref) == FALSE)
|
|
sfsm->skipping_material_at_level = heading_level;
|
|
#endif
|
|
|
|
@ When we finish scanning all the sentences in a given batch, and if they came
|
|
from an extension, we need to make sure we saw both beginning and end:
|
|
|
|
@<Issue a problem message if we are missing the begin and end here sentences@> =
|
|
switch (sfsm->ext_pos) {
|
|
case BEFORE_BEGINS_EXTENSION_POS:
|
|
Sentences::syntax_problem(ExtNoBeginsHere_SYNERROR, W, sfsm->ref, 0); break;
|
|
case MIDDLE_EXTENSION_POS:
|
|
Sentences::syntax_problem(ExtNoEndsHere_SYNERROR, W, sfsm->ref, 0); break;
|
|
}
|
|
|
|
@h Unskipped material which is not a heading.
|
|
Each of the sentences which are to be included is given its own node on the
|
|
parse tree, which for the time being is a direct child of the root.
|
|
Sentences are classified by their node types, the main identification
|
|
attached to each unit in the tree.
|
|
|
|
(a) "Nonstructural sentences", which will be subject to further parsing
|
|
work, have node type |SENTENCE_NT| (and so will "regular sentences").
|
|
Anything we cannot place into categories (b) or (c) below will go here.
|
|
|
|
(b) "Sentences making up rules". These are sequences of sentences in which
|
|
a preamble (ending with a colon, or in certain cases a comma) of node type
|
|
|IMPERATIVE_NT| is followed by a sequence of phrases (ending with semicolons until
|
|
the last, which ends with a full stop or paragraph break), each of node type
|
|
|INVOCATION_LIST_NT|. For instance, the following produces three nodes:
|
|
|
|
>> To look upwards: say "Look out!"; something else.
|
|
|
|
(c) "Structural sentences". These demarcate the text, call for other text
|
|
or unusual matter to be included, etc.: the types in question are |TRACE_NT|,
|
|
|HEADING_NT|, |INCLUDE_NT|, |INFORM6CODE_NT|, |BEGINHERE_NT|, |ENDHERE_NT|,
|
|
|TABLE_NT|, |EQUATION_NT| and |BIBLIOGRAPHIC_NT|.
|
|
|
|
@ The first sentence in the main source text is construed as containing
|
|
bibliographic data if it begins with a quoted piece of text, perhaps with
|
|
substitutions. For instance,
|
|
|
|
>> "A Dream of Fair to Middling Women" by Samuel Beckett
|
|
|
|
This sentence is at the position matched by <if-start-of-source-text>.
|
|
(It may not be the first sentence read, because implied extension inclusion
|
|
sentences and options-file sentences may have been read already.)
|
|
|
|
=
|
|
<if-start-of-source-text> internal 0 {
	/* Walk backwards from the first word of |W|. We match if we arrive at the
	recorded start of the main source text having passed over nothing but
	paragraph breaks, and fail as soon as any other word lies in between. */
	int w1 = Wordings::first_wn(W);
	while (w1 >= 0) {
		if (w1 == sfsm->main_source_start_wn) return TRUE;
		/* at word 0 there is no previous word, so fail rather than examine word -1 */
		if ((w1 == 0) || (compare_word(w1-1, PARBREAK_V) == FALSE)) { ==> { fail nonterminal }; }
		w1--;
	}
	==> { fail nonterminal };
}
|
|
|
|
@<Accept the new sentence as one or more nodes in the parse tree@> =
|
|
@<Convert comma-divided rule into two sentences, if this is allowed@>;
|
|
@<Otherwise, make a SENTENCE node@>;
|
|
|
|
@<Convert a rule preamble to a ROUTINE node and enter rule mode@>;
|
|
if (sfsm->inside_rule_mode)
|
|
@<Convert to a COMMAND node and exit rule mode unless a semicolon implies more@>
|
|
else if (stop_character == ';') {
|
|
Sentences::syntax_problem(UnexpectedSemicolon_SYNERROR, W, sfsm->ref, 0);
|
|
stop_character = '.';
|
|
}
|
|
|
|
/* at this point we are certainly in assertion mode, not rule mode */
|
|
if (<structural-sentence>(W)) {
|
|
if (<<r>> == -1)
|
|
@<Detect a language definition sentence and sneakily act upon it@>
|
|
else if (<<r>> == -2) {
|
|
@<Detect a Preform grammar inclusion and sneakily act upon it@>
|
|
Node::set_type(new, sfsm->nt); return;
|
|
} else {
|
|
Node::set_type(new, sfsm->nt);
|
|
return;
|
|
}
|
|
}
|
|
|
|
@<Convert a begins here or ends here sentence to a BEGINHERE or ENDHERE node and return@>;
|
|
|
|
/* none of that happened, so we have a SENTENCE node for certain */
|
|
#ifdef NEW_NONSTRUCTURAL_SENTENCE_SYNTAX_CALLBACK
|
|
NEW_NONSTRUCTURAL_SENTENCE_SYNTAX_CALLBACK(new);
|
|
#endif
|
|
|
|
@ We make an exception to the exception for the serial comma used in a list of
|
|
alternatives: thus the comma in "Aeschylus, Sophocles, or Euripides" does
|
|
not trigger this rule. We need this exception because such lists of
|
|
alternatives often occur in rule preambles, where it's the third comma
|
|
which divides rule from preamble:
|
|
|
|
>> Instead of pushing, dropping, or taking the talisman, say "It is cursed."
|
|
|
|
The following is used to detect "or" in such lists.
|
|
|
|
=
|
|
<list-or-division> ::=
|
|
...... , _or ...... |
|
|
...... _or ......
|
|
|
|
@<Convert comma-divided rule into two sentences, if this is allowed@> =
|
|
/* a split is attempted only outside rule mode, only for a sentence ended by a
full stop or a paragraph break, and only if |<comma-divisible-sentence>| matches */
if ((sfsm->inside_rule_mode == FALSE)
|
|
&& ((stop_character == '.') || (stop_character == '|'))
|
|
&& (<comma-divisible-sentence>(W)))
|
|
@<Look for a comma and split the sentence at it, unless in serial list@>;
|
|
|
|
@ In such sentences a comma is read as if it were a colon. (The text up to the
|
|
comma will then be given an |IMPERATIVE_NT| node and the text beyond the comma
|
|
will make an |INVOCATION_LIST_NT| node.)
|
|
|
|
@<Look for a comma and split the sentence at it, unless in serial list@> =
|
|
int earliest_comma_position = Wordings::first_wn(W);
|
|
@<Set earliest comma to position after the or, if there is one@>;
|
|
wording AW = EMPTY_WORDING, BW = EMPTY_WORDING;
|
|
if (<list-comma-division>(Wordings::from(W, earliest_comma_position))) {
|
|
AW = GET_RW(<list-comma-division>, 1);
|
|
BW = GET_RW(<list-comma-division>, 2);
|
|
}
|
|
if (Wordings::nonempty(AW)) {
|
|
Sentences::make_node(T, Wordings::up_to(W, Wordings::last_wn(AW)), ':'); /* rule preamble stopped with a colon */
|
|
Sentences::make_node(T, BW, '.'); /* rule body with one sentence, stopped with a stop */
|
|
return;
|
|
}
|
|
|
|
@<Set earliest comma to position after the or, if there is one@> =
	/* if |W| contains a list-dividing "or", only commas after it are eligible */
	if (<list-or-division>(W)) {
		wording text_after_or = GET_RW(<list-or-division>, 2);
		earliest_comma_position = Wordings::first_wn(text_after_or);
	}
|
|
|
|
@ At this point we know that the text |W| will make one and only
|
|
one sentence node in the parse tree, so we may as well create it and graft it on
|
|
now. There are a number of special cases with variant node types, but the
|
|
commonest outcome is a SENTENCE node, so that's what we shall assume for now.
|
|
|
|
@<Otherwise, make a SENTENCE node@> =
|
|
/* |new| begins as a |SENTENCE_NT| node holding the whole of |W|, grafted on to
the tree straight away; later code may change its type or its text */
new = Node::new(SENTENCE_NT);
|
|
Node::set_text(new, W);
|
|
SyntaxTree::graft_sentence(T, new);
|
|
|
|
@ Rules are sequences of phrases with a preamble in front, which we detect by
|
|
its terminating colon. For instance:
|
|
|
|
>> To look upwards: say "Look out!"; something else.
|
|
|
|
(which arrives at this routine as three separate "sentences") will produce
|
|
nodes with type |IMPERATIVE_NT|, |INVOCATION_LIST_NT| and |INVOCATION_LIST_NT| respectively.
|
|
|
|
This paragraph of code might look as if it should only be used in assertion
|
|
mode, not in rule mode, because how can a rule preamble legally occur in
|
|
the middle of another rule? But in fact it can, in two ways. One is the
|
|
officially sanctioned way to make a definition with a complex phrase:
|
|
|
|
>> Definition: a supporter is wobbly: if the player is on it, decide yes; decide no.
|
|
|
|
This produces four nodes: |IMPERATIVE_NT|, |IMPERATIVE_NT|, |INVOCATION_LIST_NT| and
|
|
|INVOCATION_LIST_NT| respectively.
|
|
|
|
The other arises somewhat less officially when people treat phrases as
|
|
if they were C (or Inform 6) statements, always to be terminated with
|
|
semicolons, and also run two rules together with no skipped paragraph
|
|
between:
|
|
= (text as Inform 7)
|
|
To do one thing: something here;
|
|
To do another thing: something else here;
|
|
=
|
|
A strict reading of our rules would oblige us to consider "To do another
|
|
thing:" as a phrase within the definition of "To do one thing", and
|
|
we would then have to issue a problem message. But this would be pettifogging.
|
|
(People who habitually shuffle phrases about in their editors tend not to
|
|
want to fuss about changing the punctuation of the last to a full stop
|
|
instead of a semicolon. We may lament this, but it is so.)
|
|
|
|
@<Convert a rule preamble to a ROUTINE node and enter rule mode@> =
	#ifdef list_node_type
	if (stop_character == ':') {
		/* a colon met inside a rule, on text which |ControlStructures::detect|
		accepts, makes a list entry; any other colon opens a new list */
		int colon_block = FALSE;
		if ((sfsm->inside_rule_mode) && (ControlStructures::detect(W))) colon_block = TRUE;
		if (colon_block) {
			Node::set_type(new, list_entry_node_type);
			#ifdef CORE_MODULE
			Annotations::write_int(new, colon_block_command_ANNOT, TRUE);
			#endif
		} else {
			Node::set_type(new, list_node_type);
		}
		/* either way we are in rule mode from now on, and this sentence is finished */
		sfsm->inside_rule_mode = TRUE;
		return;
	}
	#endif
|
|
|
|
@ Subsequent commands are divided by semicolons, and any failure of a
|
|
semicolon to appear indicates an end of the rule.
|
|
|
|
@<Convert to a COMMAND node and exit rule mode unless a semicolon implies more@> =
|
|
/* if |list_node_type| is not defined, the node keeps the type it already has */
#ifdef list_node_type
|
|
Node::set_type(new, list_entry_node_type);
|
|
#endif
|
|
/* anything other than a semicolon ends the rule */
if (stop_character != ';') sfsm->inside_rule_mode = FALSE;
|
|
/* the sentence is finished with in either case */
return;
|
|
|
|
@ Finally, we must tidy away the previously detected "begins here" and
|
|
"ends here" sentences into nodes on the tree.
|
|
|
|
@<Convert a begins here or ends here sentence to a BEGINHERE or ENDHERE node and return@> =
	/* both kinds of sentence are treated alike, apart from the node type */
	if ((begins_or_ends == 1) || (begins_or_ends == -1)) {
		if (begins_or_ends == 1) Node::set_type(new, BEGINHERE_NT);
		else Node::set_type(new, ENDHERE_NT);
		/* the node text loses its last two words -- presumably the words
		"begins here" or "ends here" themselves (confirm against the grammar) */
		wording shortened = Wordings::trim_last_word(Wordings::trim_last_word(W));
		Node::set_text(new, shortened);
		#ifdef BEGIN_OR_END_HERE_SYNTAX_CALLBACK
		BEGIN_OR_END_HERE_SYNTAX_CALLBACK(new, sfsm->ref);
		#endif
		return;
	}
|
|
|
|
@ Why are we taking a sneak look at this sentence now? Because it affects
|
|
which headings we read the contents of. If we waited until sentence traverses,
|
|
it would be too late.
|
|
|
|
@<Detect a language definition sentence and sneakily act upon it@> =
|
|
/* make the new node the current sentence before the client is called */
current_sentence = new;
|
|
#ifdef LANGUAGE_ELEMENT_SYNTAX_CALLBACK
|
|
/* the client is passed the first wording matched by |<language-modifying-sentence>|;
presumably |<structural-sentence>| matched by way of that nonterminal -- confirm */
LANGUAGE_ELEMENT_SYNTAX_CALLBACK(GET_RW(<language-modifying-sentence>, 1));
|
|
#endif
|
|
/* the annotation is written whether or not a callback is defined */
Annotations::write_int(new, language_element_ANNOT, TRUE);
|
|
|
|
@ And for similar reasons:
|
|
|
|
@<Detect a Preform grammar inclusion and sneakily act upon it@> =
|
|
current_sentence = new;
|
|
wording W = GET_RW(<language-modifying-sentence>, 1);
|
|
TEMPORARY_TEXT(wd)
|
|
WRITE_TO(wd, "%+W", Wordings::one_word(Wordings::first_wn(W)));
|
|
LoadPreform::parse_text(wd);
|
|
DISCARD_TEXT(wd)
|
|
|
|
@ Some tools using this module will want to push simple error messages out to
|
|
the command line; others will want to translate them into elaborate problem
|
|
texts in HTML. So the client is allowed to define |PROBLEM_SYNTAX_CALLBACK|
|
|
to some routine of her own, gazumping this one.
|
|
|
|
=
|
|
/* Report the syntax error |err_no|, one of the |*_SYNERROR| values, found in
the text |W|. If the client defines |PROBLEM_SYNTAX_CALLBACK|, all four
arguments are simply passed on to it. Otherwise a plain error message is
issued, and |ref| and |k| go unused; an unrecognised |err_no| produces nothing. */
void Sentences::syntax_problem(int err_no, wording W, void *ref, int k) {
	#ifdef PROBLEM_SYNTAX_CALLBACK
	PROBLEM_SYNTAX_CALLBACK(err_no, W, ref, k);
	#else
	TEMPORARY_TEXT(text)
	WRITE_TO(text, "%+W", W);
	switch (err_no) {
		/* misplaced punctuation */
		case UnexpectedSemicolon_SYNERROR: Errors::with_text("unexpected semicolon in sentence: %S", text); break;
		case ParaEndsInColon_SYNERROR: Errors::with_text("paragraph ends with a colon: %S", text); break;
		case SentenceEndsInColon_SYNERROR: Errors::with_text("paragraph ends with a colon and full stop: %S", text); break;
		case SentenceEndsInSemicolon_SYNERROR: Errors::with_text("paragraph ends with a semicolon and full stop: %S", text); break;
		case SemicolonAfterColon_SYNERROR: Errors::with_text("paragraph ends with a colon and semicolon: %S", text); break;
		case SemicolonAfterStop_SYNERROR: Errors::with_text("paragraph ends with a full stop and semicolon: %S", text); break;

		/* malformed headings */
		case HeadingOverLine_SYNERROR: Errors::with_text("heading contains a line break: %S", text); break;
		case HeadingStopsBeforeEndOfLine_SYNERROR: Errors::with_text("heading stops before end of line: %S", text); break;

		/* malformed extensions: the first two of these quote no text */
		case ExtNoBeginsHere_SYNERROR: Errors::nowhere("extension has no beginning"); break;
		case ExtNoEndsHere_SYNERROR: Errors::nowhere("extension has no end"); break;
		case ExtSpuriouslyContinues_SYNERROR: Errors::with_text("extension continues after end: %S", text); break;
	}
	DISCARD_TEXT(text)
	#endif
}
|