mirror of
https://github.com/ganelson/inform.git
synced 2024-07-02 23:14:57 +03:00
455 lines
13 KiB
OpenEdge ABL
455 lines
13 KiB
OpenEdge ABL
[Wordings::] Wordings.
|
|
|
|
To manage contiguous word ranges.
|
|
|
|
@h Definitions.
|
|
|
|
@ Wordings are an efficient representation of a multi-word name for
|
|
something, using the fact that almost all names derive from contiguous
|
|
runs of words in the source text.
|
|
|
|
Recall that words are numbered from 0 upwards in order of reading into
|
|
the lexer. The wording |Wordings::new(A, B)| represents both a positional
|
|
marker, at word number |A|, and also some textual content, the text making
|
|
up words |A| to |B| inclusive. Different wordings can represent the
|
|
same text at different positions in the source text, for example if
|
|
"brown spotted owl" occurs multiple times.
|
|
|
|
=
|
|
typedef struct wording {
|
|
int word_A, word_B;
|
|
} wording;
|
|
|
|
@ Note that this applies even to the empty text. A wording holds no text
|
|
if both |A| and |B| are negative, or if |A| is larger than |B|. Thus
|
|
the wording |(17, 16)| represents the empty text at position 17. (Preform
|
|
makes use of this when parsing nonterminals which match conditionally
|
|
but consume no text, for example.) When we are representing no text and
|
|
no position either, we should use the following constant wording:
|
|
|
|
@d EMPTY_WORDING ((wording) { -1, -1 })
|
|
|
|
@ Annoyingly, |gcc| (though not |clang|) rejects this as an initializer, so
|
|
we also need:
|
|
|
|
@d EMPTY_WORDING_INIT { -1, -1 }
|
|
|
|
@ We will frequently want to loop through the words in a wording, so
|
|
the following macro is convenient. Note that the loop body is not executed
|
|
if the wording is empty.
|
|
|
|
@d LOOP_THROUGH_WORDING(i, W)
|
|
if (W.word_A >= 0)
|
|
for (int i=W.word_A; i<=W.word_B; i++)
|
|
|
|
@h Construction.
|
|
|
|
=
|
|
wording Wordings::new(int A, int B) {
|
|
return (wording) { A, B };
|
|
}
|
|
|
|
wording Wordings::one_word(int A) {
|
|
return (wording) { A, A };
|
|
}
|
|
|
|
@ Note that these two are sometimes used to construct empty wordings either
|
|
by moving |A| past |B|, or moving |B| before |A|.
|
|
|
|
=
|
|
wording Wordings::up_to(wording W, int last_wn) {
|
|
if (Wordings::empty(W)) return W;
|
|
W.word_B = last_wn;
|
|
return W;
|
|
}
|
|
|
|
wording Wordings::from(wording W, int first_wn) {
|
|
if (Wordings::empty(W)) return W;
|
|
W.word_A = first_wn;
|
|
return W;
|
|
}
|
|
|
|
@h Reading.
|
|
|
|
=
|
|
int Wordings::length(wording W) {
|
|
if (Wordings::empty(W)) return 0;
|
|
return W.word_B - W.word_A + 1;
|
|
}
|
|
|
|
int Wordings::phrasual_length(wording W) {
|
|
if (Wordings::empty(W)) return 0;
|
|
int bl = 0, n = 0;
|
|
LOOP_THROUGH_WORDING(i, W) {
|
|
if ((Lexer::word(i) == OPENBRACKET_V) || (Lexer::word(i) == OPENBRACE_V)) bl++;
|
|
if ((Lexer::word(i) == CLOSEBRACKET_V) || (Lexer::word(i) == CLOSEBRACE_V)) bl--;
|
|
if (bl == 0) n++;
|
|
}
|
|
return n;
|
|
}
|
|
|
|
int Wordings::first_wn(wording W) {
|
|
return W.word_A;
|
|
}
|
|
|
|
int Wordings::last_wn(wording W) {
|
|
return W.word_B;
|
|
}
|
|
|
|
@h Manipulation.
|
|
Unlike the construction routines above, these never make positional empty
|
|
wordings.
|
|
|
|
=
|
|
wording Wordings::truncate(wording W, int max) {
|
|
if (Wordings::length(W) > max) W.word_B = W.word_A + max - 1;
|
|
return W;
|
|
}
|
|
|
|
wording Wordings::first_word(wording W) {
|
|
if (Wordings::empty(W)) return EMPTY_WORDING;
|
|
W.word_B = W.word_A;
|
|
return W;
|
|
}
|
|
|
|
wording Wordings::last_word(wording W) {
|
|
if (Wordings::empty(W)) return EMPTY_WORDING;
|
|
W.word_A = W.word_B;
|
|
return W;
|
|
}
|
|
|
|
wording Wordings::trim_first_word(wording W) {
|
|
if (Wordings::empty(W)) return EMPTY_WORDING;
|
|
W.word_A++;
|
|
if (W.word_A > W.word_B) return EMPTY_WORDING;
|
|
return W;
|
|
}
|
|
|
|
wording Wordings::trim_last_word(wording W) {
|
|
if (Wordings::empty(W)) return EMPTY_WORDING;
|
|
W.word_B--;
|
|
if (W.word_A > W.word_B) return EMPTY_WORDING;
|
|
return W;
|
|
}
|
|
|
|
wording Wordings::trim_both_ends(wording W) {
|
|
if (Wordings::empty(W)) return EMPTY_WORDING;
|
|
W.word_A++; W.word_B--;
|
|
if (W.word_A > W.word_B) return EMPTY_WORDING;
|
|
return W;
|
|
}
|
|
|
|
@h Widening.
|
|
|
|
=
|
|
wording Wordings::union(wording W1, wording W2) {
|
|
if (Wordings::empty(W1)) return W2;
|
|
if (Wordings::empty(W2)) return W1;
|
|
int w1 = W1.word_A; if (w1 > W2.word_A) w1 = W2.word_A; /* the min */
|
|
int w2 = W1.word_B; if (w2 < W2.word_B) w2 = W2.word_B; /* the max */
|
|
return Wordings::new(w1, w2);
|
|
}
|
|
|
|
@h Position.
|
|
|
|
=
|
|
int Wordings::within(wording SMALL, wording BIG) {
|
|
if ((Wordings::nonempty(SMALL)) && (Wordings::nonempty(BIG)) &&
|
|
(SMALL.word_A >= BIG.word_A) && (SMALL.word_B <= BIG.word_B))
|
|
return TRUE;
|
|
return FALSE;
|
|
}
|
|
|
|
source_location Wordings::location(wording W) {
|
|
return Lexer::word_location(W.word_A);
|
|
}
|
|
|
|
@h Emptiness.
|
|
See above.
|
|
|
|
=
|
|
int Wordings::empty(wording W) {
|
|
if ((W.word_A >= 0) && (W.word_B >= W.word_A)) return FALSE;
|
|
return TRUE;
|
|
}
|
|
|
|
int Wordings::nonempty(wording W) {
|
|
if ((W.word_A >= 0) && (W.word_B >= W.word_A)) return TRUE;
|
|
return FALSE;
|
|
}
|
|
|
|
@h Comparing wordings.
|
|
First, though it's little needed, literal equality: two wordings are the
|
|
same only if they represent the same word numbers in the source.
|
|
|
|
=
|
|
int Wordings::eq(wording W1, wording W2) {
|
|
if ((W1.word_A == W2.word_A) && (W1.word_B == W2.word_B))
|
|
return TRUE;
|
|
return FALSE;
|
|
}
|
|
|
|
@ Two wordings are said to "match" if they are nonempty and contain the same text.
|
|
|
|
The calculation overhead makes it marginally not worth using a hash function
|
|
to speed up the following comparison, which requires two excerpts to be
|
|
absolutely equal.
|
|
|
|
=
|
|
int Wordings::match(wording W1, wording W2) {
|
|
if ((W1.word_A >= 0) && (W1.word_B >= 0) &&
|
|
(Wordings::match_inner(W1.word_A, W1.word_B, W2.word_A, W2.word_B)))
|
|
return TRUE;
|
|
return FALSE;
|
|
}
|
|
|
|
int Wordings::starts_with(wording W, wording S) {
|
|
if ((Wordings::nonempty(W)) && (Wordings::nonempty(S)) &&
|
|
(Wordings::length(W) >= Wordings::length(S)) &&
|
|
(Wordings::match_inner(W.word_A, W.word_A + S.word_B - S.word_A, S.word_A, S.word_B)))
|
|
return TRUE;
|
|
return FALSE;
|
|
}
|
|
|
|
int Wordings::match_inner(int w1, int w2, int w3, int w4) {
|
|
if (w4-w3 != w2-w1) return FALSE;
|
|
if ((w1<0) || (w3<0)) return FALSE;
|
|
for (int j=0; j<=w2-w1; j++)
|
|
if (compare_words(w1+j, w3+j) == FALSE) return FALSE;
|
|
return TRUE;
|
|
}
|
|
|
|
@ Case sensitively:
|
|
|
|
=
|
|
int Wordings::match_cs(wording W1, wording W2) {
|
|
if ((W1.word_A >= 0) && (W1.word_B >= 0) &&
|
|
(Wordings::match_cs_inner(W1.word_A, W1.word_B, W2.word_A, W2.word_B)))
|
|
return TRUE;
|
|
return FALSE;
|
|
}
|
|
|
|
int Wordings::match_cs_inner(int w1, int w2, int w3, int w4) {
|
|
if (w4-w3 != w2-w1) return FALSE;
|
|
if ((w1<0) || (w3<0)) return FALSE;
|
|
for (int j=0; j<=w2-w1; j++)
|
|
if (compare_words_cs(w1+j, w3+j) == FALSE) return FALSE;
|
|
return TRUE;
|
|
}
|
|
|
|
@ This alternative form is slower, but gets the case where one of the words
|
|
holds double-quoted text correctly. (We don't need this often because quoted
|
|
text is in general not allowed in identifier names.)
|
|
|
|
=
|
|
int Wordings::match_perhaps_quoted(wording W1, wording W2) {
|
|
int w1 = W1.word_A, w2 = W1.word_B, w3 = W2.word_A, w4 = W2.word_B;
|
|
if (w4-w3 != w2-w1) return FALSE;
|
|
if ((w1<0) || (w3<0)) return FALSE;
|
|
for (int j=0; j<=w2-w1; j++) {
|
|
if (compare_words(w1+j, w3+j) == FALSE) {
|
|
if ((Vocabulary::test_flags(w1+j, (TEXT_MC+TEXTWITHSUBS_MC))) &&
|
|
(Vocabulary::test_flags(w3+j, (TEXT_MC+TEXTWITHSUBS_MC))) &&
|
|
(Wide::cmp(Lexer::word_raw_text(w1+j), Lexer::word_raw_text(w3+j)) == 0))
|
|
continue;
|
|
return FALSE;
|
|
}
|
|
}
|
|
return TRUE;
|
|
}
|
|
|
|
@ And relatedly, used for sorting into alphabetical order, a direct analogue
|
|
of |strcmp| but for word ranges:
|
|
|
|
=
|
|
int Wordings::strcmp(wording X, wording Y) {
|
|
int x1 = X.word_A, x2 = X.word_B, y1 = Y.word_A, y2 = Y.word_B;
|
|
if (x1 < 0) { if (y1 < 0) return 0; return -1; }
|
|
if (y1 < 0) return 1;
|
|
int n;
|
|
int l1 = x2 - x1 + 1;
|
|
int l2 = y2 - y1 + 1;
|
|
for (n=0; (n<l1) && (n<l2); n++) {
|
|
int delta = Wide::cmp(Lexer::word_text(x1 + n), Lexer::word_text(y1 + n));
|
|
if (delta != 0) return delta;
|
|
}
|
|
return l1 - l2;
|
|
}
|
|
|
|
@h Bracketing.
|
|
We are going to need to look for paired brackets at the outside of an
|
|
excerpt reasonably quickly, and the following routine performs that test.
|
|
|
|
=
|
|
int Wordings::paired_brackets(wording W) {
|
|
if ((Lexer::word(Wordings::first_wn(W)) == OPENBRACKET_V) &&
|
|
(Lexer::word(Wordings::last_wn(W)) == CLOSEBRACKET_V) &&
|
|
(Wordings::mismatched_brackets(Wordings::trim_both_ends(W)) == FALSE))
|
|
return TRUE;
|
|
return FALSE;
|
|
}
|
|
|
|
@ For problem detection:
|
|
|
|
=
|
|
int Wordings::mismatched_brackets(wording W) {
|
|
int bl = 0;
|
|
LOOP_THROUGH_WORDING(i, W) {
|
|
if ((Lexer::word(i) == OPENBRACKET_V) || (Lexer::word(i) == OPENBRACE_V)) bl++;
|
|
if ((Lexer::word(i) == CLOSEBRACKET_V) || (Lexer::word(i) == CLOSEBRACE_V)) bl--;
|
|
if (bl < 0) return TRUE;
|
|
}
|
|
if (bl != 0) return TRUE;
|
|
return FALSE;
|
|
}
|
|
|
|
@ For syntax disambiguation:
|
|
|
|
=
|
|
int Wordings::top_level_comma(wording W) {
|
|
int bl = 0;
|
|
LOOP_THROUGH_WORDING(i, W) {
|
|
if ((Lexer::word(i) == OPENBRACKET_V) || (Lexer::word(i) == OPENBRACE_V)) bl++;
|
|
if ((Lexer::word(i) == CLOSEBRACKET_V) || (Lexer::word(i) == CLOSEBRACE_V)) bl--;
|
|
if ((bl == 0) && (Lexer::word(i) == COMMA_V)) return TRUE;
|
|
}
|
|
return FALSE;
|
|
}
|
|
|
|
@h Searching for unusual spacing in ranges.
|
|
Looking forward to see how far the current column or row extends,
|
|
for formatted tables. The idea is that we have a range |w1| to |w2|,
|
|
and that the current column or row extends from |w1| but may run only
|
|
part-way through: we look for the first point at which there is a
|
|
tab break (for column scanning) or a newline break (for row scanning),
|
|
and return the word position just before that break. If we do not find
|
|
one, it follows that the entire range holds the current column or row,
|
|
and we return |w2|.
|
|
|
|
=
|
|
int Wordings::last_word_of_formatted_text(wording W, int tab_flag) {
|
|
if (Wordings::empty(W)) return -1;
|
|
if (Wordings::length(W) == 1) return Wordings::first_wn(W);
|
|
LOOP_THROUGH_WORDING(i, W)
|
|
if (i > Wordings::first_wn(W))
|
|
if (((tab_flag) && (Lexer::break_before(i) == '\t')) ||
|
|
(Lexer::indentation_level(i) > 0) ||
|
|
(Lexer::break_before(i) == '\n'))
|
|
return i-1;
|
|
return Wordings::last_wn(W);
|
|
}
|
|
|
|
@h The Writer.
|
|
The following implements the |%W| escape, which comes in four varieties:
|
|
|
|
=
|
|
void Wordings::writer(OUTPUT_STREAM, char *format_string, wording W) {
|
|
switch (format_string[0]) {
|
|
case 'W': /* bare |%W| means the same as |%-W|, so fall through to... */
|
|
case '-': @<Write the stream with normalised casing@>; break;
|
|
case '+': @<Write the stream raw@>; break;
|
|
case '<': @<Write the stream in an abbreviated raw form@>; break;
|
|
case '~': @<Write the stream in a raw form suitable for use in an I6 literal string@>; break;
|
|
default: internal_error("bad %W modifier");
|
|
}
|
|
}
|
|
|
|
@ Note that the empty wording causes nothing to be written to the stream.
|
|
|
|
@<Write the stream with normalised casing@> =
|
|
LOOP_THROUGH_WORDING(j, W) {
|
|
WRITE("%N", j);
|
|
if (j<Wordings::last_wn(W)) WRITE(" ");
|
|
}
|
|
|
|
@ Raw, in this context, means that it retains its original case, but not that
|
|
it retains its original spacing. This is sometimes problematic: we need to
|
|
go to some trouble to make punctuation look reasonably nice again, to obtain,
|
|
say,
|
|
|
|
>> The auctioneer said: "I'm not through yet -".
|
|
|
|
in preference to:
|
|
|
|
>> The auctioneer said : "I'm not through yet -" .
|
|
|
|
Note that we are not actually preserving the spacing in the original source --
|
|
that might have line breaks or other curiosities which we don't want: we are
|
|
instead imposing what we think are normal English conventions. While this
|
|
will sometimes be wrong, this is only likely to affect the index and problem
|
|
messages, so the user is not likely to be bothered.
|
|
|
|
@<Write the stream raw@> =
|
|
LOOP_THROUGH_WORDING(j, W) {
|
|
WRITE("%<N", j);
|
|
int space = FALSE;
|
|
if (j<Wordings::last_wn(W)) {
|
|
space = TRUE;
|
|
if (compare_word(j+1, COMMA_V)) space = FALSE;
|
|
if (compare_word(j+1, COLON_V)) space = FALSE;
|
|
if (compare_word(j+1, CLOSEBRACKET_V)) space = FALSE;
|
|
}
|
|
if (compare_word(j, OPENBRACKET_V)) space = FALSE;
|
|
if (space) WRITE(" ");
|
|
}
|
|
|
|
@ A variation on this tries to contain unreasonably long pastes of quoted
|
|
literals, and is used in printing out problem messages, where quoting
|
|
back the offending source text might make unhelpfully vast paragraphs
|
|
in which the actual information is more or less hidden.
|
|
|
|
@d STRING_TOLERANCE_LIMIT 70
|
|
|
|
@<Write the stream in an abbreviated raw form@> =
|
|
if (Wordings::empty(W)) {
|
|
WRITE("<no text>");
|
|
} else {
|
|
LOOP_THROUGH_WORDING(i, W) {
|
|
int space = TRUE;
|
|
if (i == Wordings::first_wn(W)) space = FALSE;
|
|
else {
|
|
if (compare_word(i, COMMA_V)) space = FALSE;
|
|
if (compare_word(i, COLON_V)) space = FALSE;
|
|
if (compare_word(i, SEMICOLON_V)) space = FALSE;
|
|
if (compare_word(i, CLOSEBRACE_V)) space = FALSE;
|
|
if (compare_word(i, CLOSEBRACKET_V)) space = FALSE;
|
|
if (compare_word(i-1, OPENBRACE_V)) space = FALSE;
|
|
if (compare_word(i-1, OPENBRACKET_V)) space = FALSE;
|
|
}
|
|
if (space) WRITE(" ");
|
|
wchar_t *p = Lexer::word_raw_text(i);
|
|
int L = Wide::len(p);
|
|
if (L > STRING_TOLERANCE_LIMIT+5) {
|
|
for (int j=0; j<STRING_TOLERANCE_LIMIT/2; j++) PUT(p[j]);
|
|
WRITE(" [...] ");
|
|
for (int j=STRING_TOLERANCE_LIMIT/2; j>0; j--) PUT(p[L-j]);
|
|
} else {
|
|
WRITE("%w", p);
|
|
}
|
|
if ((i >= Wordings::first_wn(W)+1) && (compare_word(i-1, OPENI6_V))) {
|
|
WRITE("-)");
|
|
}
|
|
}
|
|
}
|
|
|
|
@ Another variation, this time formatted for use in I6 double-quoted text.
|
|
Here we don't care about punctuation spacing, because we are only writing
|
|
in comments and I6 debugging routines, but we do need to use the I6 escape
|
|
|~| for a double-quotation mark.
|
|
|
|
@<Write the stream in a raw form suitable for use in an I6 literal string@> =
|
|
LOOP_THROUGH_WORDING(j, W) {
|
|
wchar_t *str = Lexer::word_raw_text(j);
|
|
if (j>Wordings::first_wn(W)) WRITE(" ");
|
|
for (int k=0; str[k] != 0; k++) {
|
|
int c = str[k];
|
|
switch (c) {
|
|
case '@': WRITE("@@"); break;
|
|
case '"': WRITE("~"); break;
|
|
case '^': WRITE("[cr]"); break;
|
|
case '\\': WRITE("[backslash]"); break;
|
|
default: PUT(c);
|
|
}
|
|
}
|
|
}
|