mirror of
https://github.com/ganelson/inform.git
synced 2024-07-08 01:54:21 +03:00
164 lines
5.4 KiB
OpenEdge ABL
164 lines
5.4 KiB
OpenEdge ABL
[TextFromFiles::] Text From Files.
|
|
|
|
This is where source text is read in, whether from extension files
|
|
or from the main source text file, and fed into the lexer.
|
|
|
|
@h Source files.
|
|
Each separate file of text read into the lexer has its identity docketed
|
|
in a |source_file| structure, as follows.
|
|
|
|
=
|
|
typedef struct source_file {
|
|
struct filename *name;
|
|
int words_of_source; /* word count, omitting comments and verbatim matter */
|
|
struct wording text_read;
|
|
int words_of_quoted_text; /* word count for text in double-quotes */
|
|
FILE *handle; /* file handle while open */
|
|
general_pointer your_ref; /* for the client to attach some meaning */
|
|
CLASS_DEFINITION
|
|
} source_file;
|
|
|
|
@h Feeding whole files into the lexer.
|
|
This is one of the two feeder routines for the lexer, the other being in
|
|
Lexical Writing Back.w: see Lexer.w for its obligations.
|
|
|
|
We feed characters from an open file into the lexer, and continue until there
|
|
is nothing left in it. Inform is used on operating systems which between them
|
|
use all four of the sequences |0a|, |0d|, |0a0d| and |0d0a| to divide lines in
|
|
text files, so each of these is converted to a single |'\n'|. Tabs are treated
|
|
as if spaces in most contexts, but not when parsing formatted tables, for
|
|
instance, so they are not similarly converted.
|
|
|
|
=
|
|
source_file *TextFromFiles::feed_open_file_into_lexer(filename *F, FILE *handle,
|
|
text_stream *leaf, int documentation_only, general_pointer ref) {
|
|
source_file *sf = CREATE(source_file);
|
|
sf->words_of_source = 0;
|
|
sf->words_of_quoted_text = 0;
|
|
sf->your_ref = ref;
|
|
sf->name = F;
|
|
sf->handle = handle;
|
|
source_location top_of_file;
|
|
int cr, last_cr, next_cr, read_cr, newline_char = 0;
|
|
|
|
unicode_file_buffer ufb = TextFiles::create_ufb();
|
|
|
|
top_of_file.file_of_origin = sf;
|
|
top_of_file.line_number = 1;
|
|
|
|
Lexer::feed_begins(top_of_file);
|
|
if (documentation_only) lexer_wait_for_dashes = TRUE;
|
|
|
|
last_cr = ' '; cr = ' '; next_cr = TextFiles::utf8_fgetc(sf->handle, NULL, TRUE, &ufb);
|
|
if (next_cr == 0xFEFF) next_cr = TextFiles::utf8_fgetc(sf->handle, NULL, TRUE, &ufb); /* Unicode BOM code */
|
|
if (next_cr != EOF)
|
|
while (((read_cr = TextFiles::utf8_fgetc(sf->handle, NULL, TRUE, &ufb)), next_cr) != EOF) {
|
|
last_cr = cr; cr = next_cr; next_cr = read_cr;
|
|
switch(cr) {
|
|
case '\x0a':
|
|
if (newline_char == '\x0d') {
|
|
newline_char = 0; continue; /* suppress |0x000A| when it follows |0x000D| */
|
|
}
|
|
newline_char = cr; cr = '\n'; /* and otherwise convert to |'\n'| */
|
|
break;
|
|
case '\x0d':
|
|
if (newline_char == '\x0a') {
|
|
newline_char = 0; continue; /* suppress |0x000D| when it follows |0x000A| */
|
|
}
|
|
newline_char = cr; cr = '\n'; /* and otherwise convert to |'\n'| */
|
|
break;
|
|
default:
|
|
newline_char = 0;
|
|
break;
|
|
}
|
|
Lexer::feed_triplet(last_cr, cr, next_cr);
|
|
}
|
|
|
|
sf->text_read = Lexer::feed_ends(TRUE, leaf);
|
|
|
|
@<Word count the new material@>;
|
|
return sf;
|
|
}
|
|
|
|
@ We word count all source files, both as to their source text and their
|
|
quoted text (i.e., their text within double-quotes).
|
|
|
|
@<Word count the new material@> =
|
|
LOOP_THROUGH_WORDING(wc, sf->text_read)
|
|
sf->words_of_source += TextFromFiles::word_count(wc);
|
|
|
|
@ A much simpler version:
|
|
|
|
=
|
|
source_file *TextFromFiles::feed_into_lexer(filename *F, general_pointer ref) {
|
|
FILE *handle = Filenames::fopen(F, "r");
|
|
if (handle == NULL) return NULL;
|
|
source_file *sf = TextFromFiles::feed_open_file_into_lexer(F, handle,
|
|
Filenames::get_leafname(F), FALSE, ref);
|
|
fclose(handle);
|
|
return sf;
|
|
}
|
|
|
|
@ =
|
|
int TextFromFiles::word_count(int wc) {
|
|
int N = 0;
|
|
wchar_t *p = Lexer::word_text(wc);
|
|
if (*p == '"') {
|
|
/* inside quoted text, each run of non-whitespace counts as 1 word */
|
|
p++; /* skip opening quotation mark */
|
|
while (*p != 0) {
|
|
while ((*p == ' ') || (*p == NEWLINE_IN_STRING)) p++; /* move past white space */
|
|
if ((*p == '"') || (*p == 0)) break; /* stop if this reaches the end */
|
|
N++; /* otherwise we have a word */
|
|
while ((*p != ' ') && (*p != NEWLINE_IN_STRING)
|
|
&& (*p != '"') && (*p != 0)) p++; /* move to white space or end */
|
|
}
|
|
} else {
|
|
/* outside quoted text, each lexer word not wholly composed of punctuation scores 1 */
|
|
if (Lexer::word(wc) != PARBREAK_V)
|
|
for (; *p != 0; p++)
|
|
if ((Lexer::is_punctuation(*p) == FALSE) && (*p != '|')) {
|
|
N++;
|
|
break;
|
|
}
|
|
}
|
|
return N;
|
|
}
|
|
|
|
@ At present, though, the only use made of these two word counts is via
|
|
the following routine, which combines them into one.
|
|
|
|
=
|
|
int TextFromFiles::total_word_count(source_file *sf) {
|
|
if (sf == NULL) return 0;
|
|
return sf->words_of_source + sf->words_of_quoted_text;
|
|
}
|
|
|
|
int TextFromFiles::last_lexed_word(source_file *sf) {
|
|
return Wordings::last_wn(sf->text_read);
|
|
}
|
|
|
|
@ Finally, we translate between the tiresomely many representations of
|
|
files we seem to be stuck with. The method used by |TextFromFiles::filename_to_source_file|
|
|
looks vulnerable to case-insensitive filename issues, but isn't, because
|
|
each filename is present in Inform in only one form.
|
|
|
|
=
|
|
filename *TextFromFiles::get_filename(source_file *sf) {
|
|
if (sf == NULL) internal_error("tried to read filename of null source file");
|
|
return sf->name;
|
|
}
|
|
|
|
source_file *TextFromFiles::filename_to_source_file(text_stream *name2) {
|
|
int l2 = Str::len(name2);
|
|
source_file *sf;
|
|
LOOP_OVER(sf, source_file) {
|
|
TEMPORARY_TEXT(name1)
|
|
WRITE_TO(name1, "%f", sf->name);
|
|
int l1 = Str::len(name1);
|
|
int minl = (l1<l2)?l1:l2;
|
|
if (Str::suffix_eq(name1, name2, minl)) return sf;
|
|
}
|
|
return NULL;
|
|
}
|