Moved sentence-breaking into inbuild

2024-06-26 04:00:43 +03:00 · 2020-03-03 00:59:42 +00:00 · 2020-03-03 00:59:42 +00:00 · ad713cc66a
parent 67fc2ae91c
commit ad713cc66a
14 changed files with 217 additions and 138 deletions
--- a/inbuild/Chapter
+++ b/inbuild/Chapter
@ -29,6 +29,7 @@ linked_list *inbuild_nest_list = NULL;
 int main(int argc, char **argv) {
 	Foundation::start();
 	WordsModule::start();
+	SyntaxModule::start();
 	HTMLModule::start();
 	ArchModule::start();
 	InbuildModule::start();
@ -74,10 +75,10 @@ int main(int argc, char **argv) {
 			}
 		}
 	}
-	WordsModule::end();
 	ArchModule::end();
 	InbuildModule::end();
 	HTMLModule::end();
+	SyntaxModule::end();
 	WordsModule::end();
 	Foundation::end();
 	return 0;
@ -222,3 +223,68 @@ vocabulary_meaning Main::ignore(vocabulary_entry *ve) {
@

@d PREFORM_LANGUAGE_TYPE void
+@d PARSE_TREE_TRAVERSE_TYPE void
+@d SENTENCE_NODE Main::sentence_level
+
+=
+int Main::sentence_level(node_type_t t) {
+	return FALSE; /* the same answer whatever |t| is: the node type is never consulted */
+}
+
+@
+
+@d SYNTAX_PROBLEM_HANDLER Main::syntax_problem_handler
+
+=
+void Main::syntax_problem_handler(int err_no, wording W, void *ref, int k) {
+	TEMPORARY_TEXT(text);
+	WRITE_TO(text, "%+W", W);
+	switch (err_no) {
+		case UnexpectedSemicolon_SYNERROR:
+			Errors::with_text("unexpected semicolon in sentence: %S", text);
+			break;
+		case ParaEndsInColon_SYNERROR:
+			Errors::with_text("paragraph ends with a colon: %S", text);
+			break;
+		case SentenceEndsInColon_SYNERROR:
+			Errors::with_text("paragraph ends with a colon and full stop: %S", text);
+			break;
+		case SentenceEndsInSemicolon_SYNERROR:
+			Errors::with_text("paragraph ends with a semicolon and full stop: %S", text);
+			break;
+		case SemicolonAfterColon_SYNERROR:
+			Errors::with_text("paragraph ends with a colon and semicolon: %S", text);
+			break;
+		case SemicolonAfterStop_SYNERROR:
+			Errors::with_text("paragraph ends with a full stop and semicolon: %S", text);
+			break;
+		case ExtNoBeginsHere_SYNERROR:
+			Errors::nowhere("extension has no beginning");
+			break;
+		case ExtNoEndsHere_SYNERROR:
+			Errors::nowhere("extension has no end");
+			break;
+		case ExtSpuriouslyContinues_SYNERROR:
+			Errors::with_text("extension continues after end: %S", text);
+			break;
+		case HeadingOverLine_SYNERROR:
+			Errors::with_text("heading contains a line break: %S", text);
+			break;
+		case HeadingStopsBeforeEndOfLine_SYNERROR:
+			Errors::with_text("heading stops before end of line: %S", text);
+			break;
+		case ExtMultipleBeginsHere_SYNERROR:
+			Errors::nowhere("extension has multiple 'begins here' sentences");
+			break;
+		case ExtBeginsAfterEndsHere_SYNERROR:
+			Errors::nowhere("extension has a 'begins here' after its 'ends here'");
+			break;
+		case ExtEndsWithoutBegins_SYNERROR:
+			Errors::nowhere("extension has an 'ends here' but no 'begins here'");
+			break;
+		case ExtMultipleEndsHere_SYNERROR:
+			Errors::nowhere("extension has multiple 'ends here' sentences");
+			break;
+	}
+	DISCARD_TEXT(text);
+}
--- a/inbuild/Contents.w
+++ b/inbuild/Contents.w
@ -8,6 +8,7 @@ Version Name: Avignon

 Import: foundation
 Import: inform7/words
+Import: inform7/syntax
 Import: html
 Import: arch
 Import: inbuild
--- a/inbuild/inbuild-module/Chapter
+++ b/inbuild/inbuild-module/Chapter
@ -311,6 +311,8 @@ void Extensions::read_source_text_for(inform_extension *E) {
 		E->read_into_file->your_ref = STORE_POINTER_inbuild_copy(E->as_copy);
 		wording EXW = E->read_into_file->text_read;
 		if (Wordings::nonempty(EXW)) @<Break the extension's text into body and documentation@>;
+		Sentences::break(E->body_text, E);
+		E->body_text_unbroken = FALSE;
 	}
 }

--- a/inbuild/inbuild-module/Chapter
+++ b/inbuild/inbuild-module/Chapter
@ -337,6 +337,7 @@ void Projects::construct_graph(inform_project *project) {

 =
 void Projects::read_source_text_for(inform_project *project) {
+	int wc = lexer_wordcount;
 	TEMPORARY_TEXT(early);
 	Projects::early_source_text(early, project);
 	if (Str::len(early) > 0) Feeds::feed_stream(early);
@ -355,6 +356,13 @@ void Projects::read_source_text_for(inform_project *project) {
 				FALSE, TRUE);
 		}
 	}
+	ParseTree::plant_parse_tree();
+	int l = ParseTree::push_attachment_point(tree_root);
+	Sentences::break(Wordings::new(wc, lexer_wordcount-1), NULL);
+	ParseTree::pop_attachment_point(l);
+	#ifdef CORE_MODULE
+	StructuralSentences::add_inventions_heading();
+	#endif
 }

 int Projects::draws_from_source_file(inform_project *project, source_file *sf) {
--- a/inbuild/inbuild-module/Chapter
+++ b/inbuild/inbuild-module/Chapter
@ -93,3 +93,104 @@ void SourceText::lexer_problem_handler(int err, text_stream *desc, wchar_t *word
    }
 	DISCARD_TEXT(erm);
 }
+
+
+@ Sentences in the source text are of five categories: dividing sentences,
+which divide up the source into segments; structural sentences, which split
+the source into different forms (standard text, tables, equations, I6 matter,
+and so on); nonstructural sentences, which make grammatical definitions and
+give Inform other more or less direct instructions; rule declarations; and
+regular sentences, those which use the standard verbs. Examples:
+
+>> Volume II [dividing]
+>> Include Locksmith by Emily Short [structural]
+>> Release along with a website [nonstructural]
+>> Instead of looking [rule]
+>> The cushion is on the wooden chair [regular]
+
+Dividing sentences are always read, whereas the others may be skipped in
+sections of source not being included for one reason or another. Dividing
+sentences must match the following. Note that the extension end markers are
+only read in extensions, so they can never accidentally match in the main
+source text.
+
+@e ExtMultipleBeginsHere_SYNERROR
+@e ExtBeginsAfterEndsHere_SYNERROR
+@e ExtEndsWithoutBegins_SYNERROR
+@e ExtMultipleEndsHere_SYNERROR
+
+=
+<dividing-sentence> ::=
+	<if-start-of-paragraph> <heading> |	==> R[2]
+	<extension-end-marker-sentence>		==> R[1]
+
+<heading> ::=
+	volume ... |						==> 1
+	book ... |							==> 2
+	part ... |							==> 3
+	chapter ... |						==> 4
+	section ...							==> 5
+
+<extension-end-marker-sentence> ::=
+	... begin/begins here |				==> -1; @<Check we can begin an extension here@>;
+	... end/ends here					==> -2; @<Check we can end an extension here@>;
+
+@<Check we can begin an extension here@> =
+	switch (sfsm_extension_position) { /* no case for any other value, so nothing happens then */
+		case 1: sfsm_extension_position++; break; /* the first "begins here": move on to position 2 */
+		case 2: SYNTAX_PROBLEM_HANDLER(ExtMultipleBeginsHere_SYNERROR, W, sfsm_extension, 0); break; /* position 2 is only reached by an earlier "begins here" */
+		case 3: SYNTAX_PROBLEM_HANDLER(ExtBeginsAfterEndsHere_SYNERROR, W, sfsm_extension, 0); break; /* position 3 is where an "ends here" leaves us */
+	}
+
+@<Check we can end an extension here@> =
+	switch (sfsm_extension_position) { /* no case for any other value, so nothing happens then */
+		case 1: SYNTAX_PROBLEM_HANDLER(ExtEndsWithoutBegins_SYNERROR, W, sfsm_extension, 0); break; /* still at position 1, so no "begins here" has been accepted yet */
+		case 2: sfsm_extension_position++; break; /* the expected "ends here": move on to position 3 */
+		case 3: SYNTAX_PROBLEM_HANDLER(ExtMultipleEndsHere_SYNERROR, W, sfsm_extension, 0); break; /* already at position 3, so this is a further "ends here" */
+	}
+
+@<Detect a dividing sentence@> =
+	if (<dividing-sentence>(W)) {
+		switch (<<r>>) { /* the result set by whichever production of <dividing-sentence> matched */
+			case -1: if (sfsm_extension_position > 0) begins_or_ends = 1; /* -1 is the result for "... begin/begins here" */
+				break;
+			case -2: /* -2 is the result for "... end/ends here" */
+				if (sfsm_extension_position > 0) begins_or_ends = -1; /* NOTE(review): presumably a position of 0 means we are not in an extension - confirm */
+				break;
+			default: /* otherwise a <heading> matched, giving 1 (volume) down to 5 (section) */
+				heading_level = <<r>>;
+				break;
+		}
+	}
+
+@ Structural sentences are defined as follows. (The asterisk notation isn't
+known to most Inform users: it increases output to the debugging log.)
+
+@e BIBLIOGRAPHIC_NT     			/* For the initial title sentence */
+@e ROUTINE_NT           			/* "Instead of taking something, ..." */
+@e INFORM6CODE_NT       			/* "Include (- ... -)" */
+@e TABLE_NT             			/* "Table 1 - Counties of England" */
+@e EQUATION_NT          			/* "Equation 2 - Newton's Second Law" */
+@e TRACE_NT             			/* A sentence consisting of an asterisk and optional quoted text */
+
+=
+<structural-sentence> ::=
+	<if-start-of-source-text> <quoted-text> |				==> 0; ssnt = BIBLIOGRAPHIC_NT;
+	<if-start-of-source-text> <quoted-text> ... |			==> 0; ssnt = BIBLIOGRAPHIC_NT;
+	<language-modifying-sentence> |							==> R[1]
+	* |														==> 0; ssnt = TRACE_NT;
+	* <quoted-text-without-subs> |							==> 0; ssnt = TRACE_NT;
+	<if-start-of-paragraph> table ... |						==> 0; ssnt = TABLE_NT;
+	<if-start-of-paragraph> equation ... |					==> 0; ssnt = EQUATION_NT;
+	include <nounphrase-articled> by <nounphrase> |			==> 0; ssnt = INCLUDE_NT; *XP = RP[1]; ((parse_node *) RP[1])->next = RP[2];
+	include (- ...											==> 0; ssnt = INFORM6CODE_NT;
+
+@ Properly speaking, despite the definition above, language modifying sentences
+are nonstructural. So what are they doing here? The answer is that we need to
+read them early on, because they affect the way that we parse all other
+sentences. Whereas other nonstructural sentences can wait, these can't.
+
+=
+<language-modifying-sentence> ::=
+	include (- ### in the preform grammar |			==> -2; ssnt = INFORM6CODE_NT;
+	use ... language element/elements				==> -1
--- a/inform7/core-module/Chapter
+++ b/inform7/core-module/Chapter
@ -109,14 +109,12 @@ most of these worker functions are in the |core| module, some are not.

@<Perform lexical analysis@> =
 	Task::advance_stage_to(LEXICAL_CSEQ, I"Lexical analysis", 0);
-	BENCH(Task::read_source_text)
 	BENCH(Sentences::RuleSubtrees::create_standard_csps)
+	BENCH(Task::read_source_text)

@<Perform semantic analysis@> =
 	Task::advance_stage_to(SEMANTIC_IA_CSEQ, I"Semantic analysis Ia", 1);
 	BENCH(Task::activate_language_elements)
-	BENCH(ParseTreeUsage::plant_parse_tree)
-	BENCH(StructuralSentences::break_source)
 	BENCH(Extensions::Inclusion::traverse)
 	BENCH(Sentences::Headings::satisfy_dependencies)

--- a/inform7/core-module/Chapter
+++ b/inform7/core-module/Chapter
@ -14,7 +14,6 @@ can simply discard the search results.

 =
 int bundle_scan_made = FALSE;
-int language_scan_top = -1;

 void NaturalLanguages::scan(void) {
 	if (bundle_scan_made == FALSE) {
@ -22,7 +21,6 @@ void NaturalLanguages::scan(void) {
 		inbuild_requirement *req = Requirements::anything_of_genre(language_genre);
 		linked_list *L = NEW_LINKED_LIST(inbuild_search_result);
 		Nests::search_for(req, Inbuild::nest_list(), L);
-		language_scan_top = lexer_wordcount - 1;
 	}
 }

--- a/inform7/core-module/Chapter
+++ b/inform7/core-module/Chapter
@ -213,7 +213,6 @@ void Semantics::read_preform(void) {
 	NaturalLanguages::scan();
 	wording W = NaturalLanguages::load_preform(Projects::get_language_of_syntax(Task::project()));
 	int nonterminals_declared = Preform::parse_preform(W, FALSE);
-	language_definition_top = lexer_wordcount - 1;

 	LOG("%d declarations read (%d words)\n", nonterminals_declared, Wordings::length(W));
 }
--- a/inform7/core-module/Chapter
+++ b/inform7/core-module/Chapter
@ -134,13 +134,6 @@ void ParseTreeUsage::copy_annotations(parse_node_annotation *to, parse_node_anno

@

-@e BIBLIOGRAPHIC_NT     			/* For the initial title sentence */
-@e ROUTINE_NT           			/* "Instead of taking something, ..." */
-@e INFORM6CODE_NT       			/* "Include (- ... -) */
-@e TABLE_NT             			/* "Table 1 - Counties of England" */
-@e EQUATION_NT          			/* "Equation 2 - Newton's Second Law" */
-@e TRACE_NT             			/* A sentence consisting of an asterisk and optional quoted text */
-
@e ALLOWED_NT           			/* "An animal is allowed to have a description" */
@e EVERY_NT             			/* "every container" */
@e COMMON_NOUN_NT       			/* "a container" */
--- a/inform7/core-module/Chapter
+++ b/inform7/core-module/Chapter
@ -64,22 +64,11 @@ void StructuralSentences::new_language(wording W) {
 		"in favour of a new system with Inform kits.");
 }

-@h Sentence breaking.
-The |Sentences::break| routine is used for long stretches of text,
-normally entire files. The following provides a way for the |.i6t|
-interpreter to apply it to the whole text as lexed, which provides the
-original basis for parsing. (This won't be the entire source text,
-though: extensions, including the Standard Rules, have yet to be read.)
+@ This is for invented sentences, such as those creating the understood
+variables.

 =
-void StructuralSentences::break_source(void) {
-	int l = ParseTree::push_attachment_point(tree_root);
-	int n = 0;
-	if (language_definition_top >= n) n = language_definition_top+1;
-	if (doc_references_top >= n) n = doc_references_top+1;
-	if (language_scan_top >= n) n = language_scan_top+1;
-	Sentences::break(Wordings::new(n, lexer_wordcount-1), NULL);
-	ParseTree::pop_attachment_point(l);
+void StructuralSentences::add_inventions_heading(void) {
 	parse_node *implicit_heading = ParseTree::new(HEADING_NT);
 	ParseTree::set_text(implicit_heading, Feeds::feed_text_expanding_strings(L"Invented sentences"));
 	ParseTree::annotate_int(implicit_heading, sentence_unparsed_ANNOT, FALSE);
@ -88,98 +77,6 @@ void StructuralSentences::break_source(void) {
 	Sentences::Headings::declare(implicit_heading);
 }

-@ Sentences in the source text are of five categories: dividing sentences,
-which divide up the source into segments; structural sentences, which split
-the source into different forms (standard text, tables, equations, I6 matter,
-and so on); nonstructural sentences, which make grammatical definitions and
-give Inform other more or less direct instructions; rule declarations; and
-regular sentences, those which use the standard verbs. Examples:
-
->> Volume II [dividing]
->> Include Locksmith by Emily Short [structural]
->> Release along with a website [nonstructural]
->> Instead of looking [rule]
->> The cushion is on the wooden chair [regular]
-
-Dividing sentences are always read, whereas the others may be skipped in
-sections of source not being included for one reason or another. Dividing
-sentences must match the following. Note that the extension end markers are
-only read in extensions, so they can never accidentally match in the main
-source text.
-
-=
-<dividing-sentence> ::=
-	<if-start-of-paragraph> <heading> |	==> R[2]
-	<extension-end-marker-sentence>		==> R[1]
-
-<heading> ::=
-	volume ... |						==> 1
-	book ... |							==> 2
-	part ... |							==> 3
-	chapter ... |						==> 4
-	section ...							==> 5
-
-<extension-end-marker-sentence> ::=
-	... begin/begins here |				==> -1; @<Check we can begin an extension here@>;
-	... end/ends here					==> -2; @<Check we can end an extension here@>;
-
-@<Check we can begin an extension here@> =
-	switch (sfsm_extension_position) {
-		case 1: sfsm_extension_position++; break;
-		case 2: Problems::Issue::extension_problem(_p_(PM_ExtMultipleBeginsHere),
-			sfsm_extension, "has more than one 'begins here' sentence"); break;
-		case 3: Problems::Issue::extension_problem(_p_(PM_ExtBeginsAfterEndsHere),
-			sfsm_extension, "has a further 'begins here' after an 'ends here'"); break;
-	}
-
-@<Check we can end an extension here@> =
-	switch (sfsm_extension_position) {
-		case 1: Problems::Issue::extension_problem(_p_(BelievedImpossible),
-			sfsm_extension, "has an 'ends here' with nothing having begun"); break;
-		case 2: sfsm_extension_position++; break;
-		case 3: Problems::Issue::extension_problem(_p_(PM_ExtMultipleEndsHere),
-			sfsm_extension, "has more than one 'ends here' sentence"); break;
-	}
-
-@<Detect a dividing sentence@> =
-	if (<dividing-sentence>(W)) {
-		switch (<<r>>) {
-			case -1: if (sfsm_extension_position > 0) begins_or_ends = 1;
-				break;
-			case -2:
-				if (sfsm_extension_position > 0) begins_or_ends = -1;
-				break;
-			default:
-				heading_level = <<r>>;
-				break;
-		}
-	}
-
-@ Structural sentences are defined as follows. (The asterisk notation isn't
-known to most Inform users: it increases output to the debugging log.)
-
-=
-<structural-sentence> ::=
-	<if-start-of-source-text> <quoted-text> |				==> 0; ssnt = BIBLIOGRAPHIC_NT;
-	<if-start-of-source-text> <quoted-text> ... |			==> 0; ssnt = BIBLIOGRAPHIC_NT;
-	<language-modifying-sentence> |							==> R[1]
-	* |														==> 0; ssnt = TRACE_NT;
-	* <quoted-text-without-subs> |							==> 0; ssnt = TRACE_NT;
-	<if-start-of-paragraph> table ... |						==> 0; ssnt = TABLE_NT;
-	<if-start-of-paragraph> equation ... |					==> 0; ssnt = EQUATION_NT;
-	include <nounphrase-articled> by <nounphrase> |			==> 0; ssnt = INCLUDE_NT; *XP = RP[1]; ((parse_node *) RP[1])->next = RP[2];
-	include (- ...											==> 0; ssnt = INFORM6CODE_NT;
-
-@ Properly speaking, despite the definition above, language modifying sentences
-are nonstructural. So what are they doing here? The answer is that we need to
-read them early on, because they affect the way that they parse all other
-sentences. Whereas other nonstructural sentences can wait, these can't.
-
-=
-<language-modifying-sentence> ::=
-	include (- ### in the preform grammar |			==> -2; ssnt = INFORM6CODE_NT;
-	use ... language element/elements				==> -1
-
@

@d SYNTAX_PROBLEM_HANDLER StructuralSentences::syntax_problem_handler
@ -288,6 +185,31 @@ void StructuralSentences::syntax_problem_handler(int err_no, wording W, void *re
 				"arises when a decimal point is misread as a full stop.)");
 			Problems::issue_problem_end();
 			break;
+		case ExtMultipleBeginsHere_SYNERROR: { /* in each of these four cases |ref| is treated as the extension at fault */
+			inform_extension *E = (inform_extension *) ref;
+			Problems::Issue::extension_problem(_p_(PM_ExtMultipleBeginsHere),
+				E, "has more than one 'begins here' sentence");
+			break;
+		}
+		case ExtBeginsAfterEndsHere_SYNERROR: {
+			inform_extension *E = (inform_extension *) ref;
+			Problems::Issue::extension_problem(_p_(PM_ExtBeginsAfterEndsHere),
+				E, "has a further 'begins here' after an 'ends here'");
+			break;
+		}
+		case ExtEndsWithoutBegins_SYNERROR: {
+			inform_extension *E = (inform_extension *) ref;
+			Problems::Issue::extension_problem(_p_(BelievedImpossible),
+				E, "has an 'ends here' with nothing having begun");
+			break;
+		}
+		case ExtMultipleEndsHere_SYNERROR: {
+			inform_extension *E = (inform_extension *) ref;
+			Problems::Issue::extension_problem(_p_(PM_ExtMultipleEndsHere),
+				E, "has more than one 'ends here' sentence");
+			break;
+		}
+
 		default: internal_error("unimplemented problem message");
 	}
 }
--- a/inform7/core-module/Chapter
+++ b/inform7/core-module/Chapter
@ -140,15 +140,16 @@ parse tree.
 	DISCARD_TEXT(exft);
 	DISCARD_TEXT(exfa);

+	parse_node *at = current_sentence;
 	inform_extension *E = Extensions::Inclusion::load(req);
 	if (E) {
-		Extensions::set_inclusion_sentence(E, current_sentence);
+		Extensions::set_inclusion_sentence(E, at);
 		Extensions::set_VM_text(E, RW);
 	}
-	if ((E) && (E->body_text_unbroken)) {
-		Sentences::break(E->body_text, E);
-		E->body_text_unbroken = FALSE;
-	}
+//	if ((E) && (E->body_text_unbroken)) {
+//		Sentences::break(E->body_text, E);
+//		E->body_text_unbroken = FALSE;
+//	}

@h Extension loading.
 Extensions are loaded here.
--- a/inform7/index-module/Chapter
+++ b/inform7/index-module/Chapter
@ -48,7 +48,6 @@ void Index::DocReferences::read_xrefs(void) {
 		xrefs_read = TRUE;
 		TextFiles::read(Inbuild::file_from_installation(DOCUMENTATION_XREFS_IRES), TRUE,
 			NULL, FALSE, Index::DocReferences::read_xrefs_helper, NULL, NULL);
-		doc_references_top = lexer_wordcount - 1;
 	}
 }

--- a/inform7/words-module/Chapter
+++ b/inform7/words-module/Chapter
@ -78,18 +78,8 @@ with result 2.
 		yellow | polkadot | green | white

@h Implementation.
-We read the Preform file for English early in Inform's run, and since it
-goes through the standard lexer, it makes words. The following holds the
-word number of the last of these words. (The same is also true for documentation
-cross-references, which are not really anything to do with Preform.)
-
-=
-int language_definition_top = -1;
-int doc_references_top = -1;
-
-@ Now for nonterminals. We must first clarify how word ranges, once matched in
-the parser, will be stored. Within each production, word ranges are numbered
-upwards from 1. Thus:
+We must first clarify how word ranges, once matched in the parser, will be
+stored. Within each production, word ranges are numbered upwards from 1. Thus:

 	|man with ... on his ...|

--- a/scripts/makescript.txt
+++ b/scripts/makescript.txt
@ -105,6 +105,7 @@ INBUILDX = inbuild/Tangled/inbuild
 {tool} INBUILDTOOL inbuild inbuild
 {dep} INBUILDTOOL on FOUNDATION
 {dep} INBUILDTOOL on WORDS
+{dep} INBUILDTOOL on SYNTAX
 {dep} INBUILDTOOL on HTML
 {dep} INBUILDTOOL on ARCH
 {dep} INBUILDTOOL on INBUILD