[Wordings::] Wordings. To manage contiguous word ranges. @h Definitions. @ Wordings are an efficient representation of a multi-word name for something, using the fact that almost all names derive from contiguous runs of words in the source text. Recall that words are numbered from 0 upwards in order of reading into the lexer. The wording |Wordings::new(A, B)| represents both a positional marker, at word number |A|, and also some textual content, the text making up words |A| to |B| inclusive. Different wordings can represent the same text at different positions in the source text, for example if "brown spotted owl" occurs multiple times. = typedef struct wording { int word_A, word_B; } wording; @ Note that this applies even to the empty text. A wording holds no text if both |A| and |B| are negative, or if |A| is larger than |B|. Thus the wording |(17, 16)| represents the empty text at position 17. (Preform makes use of this when parsing nonterminals which match conditionally but consume no text, for example.) When we are representing no text and no position either, we should use the following constant wording: @d EMPTY_WORDING ((wording) { -1, -1 }) @ Annoyingly, |gcc| (though not |clang|) rejects this as an initializer, so we also need: @d EMPTY_WORDING_INIT { -1, -1 } @ We will frequently want to loop through the words in a wording, so the following macro is convenient. Note that the loop body is not executed if the wording is empty. @d LOOP_THROUGH_WORDING(i, W) if (W.word_A >= 0) for (int i=W.word_A; i<=W.word_B; i++) @h Construction. = wording Wordings::new(int A, int B) { return (wording) { A, B }; } wording Wordings::one_word(int A) { return (wording) { A, A }; } @ Note that these two are sometimes used to construct empty wordings either by moving |A| past |B|, or moving |B| before |A|. = wording Wordings::up_to(wording W, int last_wn) { if (Wordings::empty(W)) return W; W.word_B = last_wn; return W; } wording Wordings::from(wording W, int first_wn) { if (Wordings::empty(W)) return W; W.word_A = first_wn; return W; } @h Reading. = int Wordings::length(wording W) { if (Wordings::empty(W)) return 0; return W.word_B - W.word_A + 1; } int Wordings::phrasual_length(wording W) { if (Wordings::empty(W)) return 0; int bl = 0, n = 0; LOOP_THROUGH_WORDING(i, W) { if ((Lexer::word(i) == OPENBRACKET_V) || (Lexer::word(i) == OPENBRACE_V)) bl++; if ((Lexer::word(i) == CLOSEBRACKET_V) || (Lexer::word(i) == CLOSEBRACE_V)) bl--; if (bl == 0) n++; } return n; } int Wordings::first_wn(wording W) { return W.word_A; } int Wordings::last_wn(wording W) { return W.word_B; } @h Manipulation. Unlike the construction routines above, these never make positional empty wordings. = wording Wordings::truncate(wording W, int max) { if (Wordings::length(W) > max) W.word_B = W.word_A + max - 1; return W; } wording Wordings::first_word(wording W) { if (Wordings::empty(W)) return EMPTY_WORDING; W.word_B = W.word_A; return W; } wording Wordings::last_word(wording W) { if (Wordings::empty(W)) return EMPTY_WORDING; W.word_A = W.word_B; return W; } wording Wordings::trim_first_word(wording W) { if (Wordings::empty(W)) return EMPTY_WORDING; W.word_A++; if (W.word_A > W.word_B) return EMPTY_WORDING; return W; } wording Wordings::trim_last_word(wording W) { if (Wordings::empty(W)) return EMPTY_WORDING; W.word_B--; if (W.word_A > W.word_B) return EMPTY_WORDING; return W; } wording Wordings::trim_both_ends(wording W) { if (Wordings::empty(W)) return EMPTY_WORDING; W.word_A++; W.word_B--; if (W.word_A > W.word_B) return EMPTY_WORDING; return W; } @h Widening. = wording Wordings::union(wording W1, wording W2) { if (Wordings::empty(W1)) return W2; if (Wordings::empty(W2)) return W1; int w1 = W1.word_A; if (w1 > W2.word_A) w1 = W2.word_A; /* the min */ int w2 = W1.word_B; if (w2 < W2.word_B) w2 = W2.word_B; /* the max */ return Wordings::new(w1, w2); } @h Position. = int Wordings::within(wording SMALL, wording BIG) { if ((Wordings::nonempty(SMALL)) && (Wordings::nonempty(BIG)) && (SMALL.word_A >= BIG.word_A) && (SMALL.word_B <= BIG.word_B)) return TRUE; return FALSE; } source_location Wordings::location(wording W) { return Lexer::word_location(W.word_A); } @h Emptiness. See above. = int Wordings::empty(wording W) { if ((W.word_A >= 0) && (W.word_B >= W.word_A)) return FALSE; return TRUE; } int Wordings::nonempty(wording W) { if ((W.word_A >= 0) && (W.word_B >= W.word_A)) return TRUE; return FALSE; } @h Comparing wordings. First, though it's little needed, literal equality: two wordings are the same only if they represent the same word numbers in the source. = int Wordings::eq(wording W1, wording W2) { if ((W1.word_A == W2.word_A) && (W1.word_B == W2.word_B)) return TRUE; return FALSE; } @ Two wordings are said to "match" if they are nonempty and contain the same text. The calculation overhead makes it marginally not worth using a hash function to speed up the following comparison, which requires two excerpts to be absolutely equal. = int Wordings::match(wording W1, wording W2) { if ((W1.word_A >= 0) && (W1.word_B >= 0) && (Wordings::match_inner(W1.word_A, W1.word_B, W2.word_A, W2.word_B))) return TRUE; return FALSE; } int Wordings::starts_with(wording W, wording S) { if ((Wordings::nonempty(W)) && (Wordings::nonempty(S)) && (Wordings::length(W) >= Wordings::length(S)) && (Wordings::match_inner(W.word_A, W.word_A + S.word_B - S.word_A, S.word_A, S.word_B))) return TRUE; return FALSE; } int Wordings::match_inner(int w1, int w2, int w3, int w4) { if (w4-w3 != w2-w1) return FALSE; if ((w1<0) || (w3<0)) return FALSE; for (int j=0; j<=w2-w1; j++) if (compare_words(w1+j, w3+j) == FALSE) return FALSE; return TRUE; } @ Case sensitively: = int Wordings::match_cs(wording W1, wording W2) { if ((W1.word_A >= 0) && (W1.word_B >= 0) && (Wordings::match_cs_inner(W1.word_A, W1.word_B, W2.word_A, W2.word_B))) return TRUE; return FALSE; } int Wordings::match_cs_inner(int w1, int w2, int w3, int w4) { if (w4-w3 != w2-w1) return FALSE; if ((w1<0) || (w3<0)) return FALSE; for (int j=0; j<=w2-w1; j++) if (compare_words_cs(w1+j, w3+j) == FALSE) return FALSE; return TRUE; } @ This alternative form is slower, but gets the case where one of the words holds double-quoted text correctly. (We don't need this often because quoted text is in general not allowed in identifier names.) = int Wordings::match_perhaps_quoted(wording W1, wording W2) { int w1 = W1.word_A, w2 = W1.word_B, w3 = W2.word_A, w4 = W2.word_B; if (w4-w3 != w2-w1) return FALSE; if ((w1<0) || (w3<0)) return FALSE; for (int j=0; j<=w2-w1; j++) { if (compare_words(w1+j, w3+j) == FALSE) { if ((Vocabulary::test_flags(w1+j, (TEXT_MC+TEXTWITHSUBS_MC))) && (Vocabulary::test_flags(w3+j, (TEXT_MC+TEXTWITHSUBS_MC))) && (Wide::cmp(Lexer::word_raw_text(w1+j), Lexer::word_raw_text(w3+j)) == 0)) continue; return FALSE; } } return TRUE; } @ And relatedly, used for sorting into alphabetical order, a direct analogue of |strcmp| but for word ranges: = int Wordings::strcmp(wording X, wording Y) { int x1 = X.word_A, x2 = X.word_B, y1 = Y.word_A, y2 = Y.word_B; if (x1 < 0) { if (y1 < 0) return 0; return -1; } if (y1 < 0) return 1; int n; int l1 = x2 - x1 + 1; int l2 = y2 - y1 + 1; for (n=0; (n Wordings::first_wn(W)) if (((tab_flag) && (Lexer::break_before(i) == '\t')) || (Lexer::indentation_level(i) > 0) || (Lexer::break_before(i) == '\n')) return i-1; return Wordings::last_wn(W); } @h The Writer. The following implements the |%W| escape, which comes in four varieties: = void Wordings::writer(OUTPUT_STREAM, char *format_string, wording W) { switch (format_string[0]) { case 'W': /* bare |%W| means the same as |%-W|, so fall through to... */ case '-': @; break; case '+': @; break; case '<': @; break; case '~': @; break; default: internal_error("bad %W modifier"); } } @ Note that the empty wording causes nothing to be written to the stream. @ = LOOP_THROUGH_WORDING(j, W) { WRITE("%N", j); if (j> The auctioneer said: "I'm not through yet -". in preference to: >> The auctioneer said : "I'm not through yet -" . Note that we are not actually preserving the spacing in the original source -- that might have line breaks or other curiosities which we don't want: we are instead imposing what we think are normal English conventions. While this will sometimes be wrong, this is only likely to affect the index and problem messages, so the user is not likely to be bothered. @ = LOOP_THROUGH_WORDING(j, W) { WRITE("% = if (Wordings::empty(W)) { WRITE(""); } else { LOOP_THROUGH_WORDING(i, W) { int space = TRUE; if (i == Wordings::first_wn(W)) space = FALSE; else { if (compare_word(i, COMMA_V)) space = FALSE; if (compare_word(i, COLON_V)) space = FALSE; if (compare_word(i, SEMICOLON_V)) space = FALSE; if (compare_word(i, CLOSEBRACE_V)) space = FALSE; if (compare_word(i, CLOSEBRACKET_V)) space = FALSE; if (compare_word(i-1, OPENBRACE_V)) space = FALSE; if (compare_word(i-1, OPENBRACKET_V)) space = FALSE; } if (space) WRITE(" "); wchar_t *p = Lexer::word_raw_text(i); int L = Wide::len(p); if (L > STRING_TOLERANCE_LIMIT+5) { for (int j=0; j0; j--) PUT(p[L-j]); } else { WRITE("%w", p); } if ((i >= Wordings::first_wn(W)+1) && (compare_word(i-1, OPENI6_V))) { WRITE("-)"); } } } @ Another variation, this time formatted for use in I6 double-quoted text. Here we don't care about punctuation spacing, because we are only writing in comments and I6 debugging routines, but we do need to use the I6 escape |~| for a double-quotation mark. @ = LOOP_THROUGH_WORDING(j, W) { wchar_t *str = Lexer::word_raw_text(j); if (j>Wordings::first_wn(W)) WRITE(" "); for (int k=0; str[k] != 0; k++) { int c = str[k]; switch (c) { case '@': WRITE("@@"); break; case '"': WRITE("~"); break; case '^': WRITE("[cr]"); break; case '\\': WRITE("[backslash]"); break; default: PUT(c); } } }