HLA Program to Fix Word Index Entries
Posted: Sun Jan 15, 2023 1:38 am
Note: updated source code on 2023-01-16 to fix a few bugs.
I use Adobe FrameMaker when writing my books. I really like the professional-level features, especially for large projects (like books).
I despise Microsoft Word. Unfortunately, the publishing world (and just about everyone else) has standardized on MS Word for documentation exchange. Fortunately, FrameMaker can export documents in an .RTF format, which can be read by Word. FrameMaker does a half-way decent job of translating index entries into .RTF form (which are converted into Word index entries).
I say "half-way decent" because there are some bugs in this translation. For example, FrameMaker allows you to embed multiple index entries in your document by separating the entries with semicolons, e.g., "index item; another item".
This will create two entries in the "I" and "A" sections of the index.
Unfortunately, MS Word doesn't seem to support this. Recently, my publisher (No Starch Press) got tired of fixing all these index entries in my latest book (The Art of ARM Assembly) and asked me to do it; I was completely unaware that this was a problem in Word and that they had been manually fixing these entries (which is a *lot* of work). I took a look at the latest chapter from my editor, which had hundreds of index entries, and rapidly got tired of manually correcting these things. So I got the bright idea of writing a program to do the conversions for me (across 16 chapters).
Now to be honest, I should have used SNOBOL4/Spitbol or Icon (string-oriented/pattern matching languages), but it's been probably 10 years since I've written code in these languages. I also probably should have taken this opportunity to learn the regular expression features in Swift. But I didn't want to spend several days on this project, so I wrote it in assembly language (crazy, right?).
Actually, it's not as bad as it seems. The High-Level Assembler has a really nice pattern-matching library (based on SNOBOL4 and Icon), so I whipped out my editor and started writing some 32-bit assembly code (for the first time in a couple of years; been doing mostly 64-bit x86 stuff with MASM, Gas, and 64-bit ARM assembly lately).
This program is a hack, certainly not very efficient (doesn't need to be, it will only be run a couple dozen times).
But I thought I'd share it here as it was a lot of fun to write.
Cheers,
Randy Hyde
I use Adobe FrameMaker when writing my books. I really like the professional-level features, especially for large projects (like books).
I despise Microsoft Word. Unfortunately, the publishing world (and just about everyone else) has standardized on MS Word for documentation exchange. Fortunately, FrameMaker can export documents in an .RTF format, which can be read by Word. FrameMaker does a half-way decent job of translating index entries into .RTF form (which are converted into Word index entries).
I say "half-way decent" because there are some bugs in this translation. For example, FrameMaker allows you to embed multiple index entries in your document by separating the entries with semicolons, e.g.,
Code: Select all
index item; another item
Unfortunately, MS Word doesn't seem to support this. Recently, my publisher (No Starch Press) got tired of fixing all these index entries in my latest book (The Art of ARM Assembly) and asked me to do it; I was completely unaware that this was a problem in Word and that they had been manually fixing these entries (which is a *lot* of work). I took a look at the latest chapter from my editor, which had hundreds of index entries, and rapidly got tired of manually correcting these things. So I got the bright idea of writing a program to do the conversions for me (across 16 chapters).
Now to be honest, I should have used SNOBOL4/Spitbol or Icon (string-oriented/pattern matching languages), but it's been probably 10 years since I've written code in these languages. I also probably should have taken this opportunity to learn the regular expression features in Swift. But I didn't want to spend several days on this project, so I wrote it in assembly language (crazy, right?).
Actually, it's not as bad as it seems. The High-Level Assembler has a really nice pattern-matching library (based on SNOBOL4 and Icon), so I whipped out my editor and started writing some 32-bit assembly code (for the first time in a couple of years; been doing mostly 64-bit x86 stuff with MASM, Gas, and 64-bit ARM assembly lately).
This program is a hack, certainly not very efficient (doesn't need to be, it will only be run a couple dozen times).
But I thought I'd share it here as it was a lot of fun to write.
Code: Select all
// fixXref-
//
// Fixes bad index entries from
// FrameMaker->.RTF translation.
program fixXref;
#include( "stdlib.hhf" )
#include( "w.hhf" )
type
// Pointer into the raw text of the input file.
txtPtr: pointer to char;
static
// Memory-mapped view of the input file (created and opened by the main program).
input: mmap_t;
// Current read position in the mapped text; the main program loads it from
// input.filePtr and processFile advances it through its by-reference parameter.
inStart: txtPtr;
// End of the mapped text; loaded from input.endFilePtr by the main program.
inEnd: txtPtr;
// Handle of the output file, as returned by fileio.openNew.
outH: dword;
// Input file name, taken from command-line argument 1.
inFile: string;
// Output file name, taken from command-line argument 2.
outFile: string;
// Number of index entries found in the input (maintained by processFile).
inputCnt: int32;
// Number of index entries written to the output (maintained by processIndex).
outputCnt: int32;
// memToStr:
//
//  Builds a newly allocated HLA string from the bytes in the
//  half-open memory range [start, endstr).
//
//  start  - address of the first byte to copy.
//  endstr - address one past the last byte to copy.
//
//  Returns a pointer to the new string in EAX. The caller owns
//  the string and is expected to release it with str.free.
//
//  A reversed range (endstr below start) produces an empty string.
//  Without this clamp the negative difference would be handed to
//  str.alloc and rep.movsb as an enormous unsigned byte count.
//
//  ECX, ESI, and EDI are saved and restored.

procedure memToStr( start:dword; endstr:dword );
begin memToStr;

    push( ecx );
    push( esi );
    push( edi );

    mov( start, esi );
    mov( endstr, ecx );
    sub( esi, ecx );                // ECX = number of bytes to copy.

    // Clamp a reversed range to "no bytes".
    if( (type int32 ecx) < 0 ) then

        mov( 0, ecx );

    endif;

    str.alloc( ecx );
    mov( ecx, (type str.strRec [eax]).length );

    // Copy the bytes into the character data that EAX points at.
    mov( eax, edi );
    cld();
    rep.movsb();

    // EDI now points just past the copied data; zero-terminate there.
    mov( 0, (type char [edi]) );

    pop( edi );
    pop( esi );
    pop( ecx );

end memToStr;
// processIndex:
//
//  Rewrites one RTF index entry and writes the result to outH.
//
//  ndxStart - address of the leading "{" of an entry "{\xe ... }".
//  ndxEnd   - address one past the entry's closing "}".
//
//  If the entry text contains no semicolon it is written back
//  unchanged. Otherwise:
//
//    1. CR/LF characters are removed from a working copy.
//    2. Every "{...}" group inside the entry is collected.
//    3. Groups ending in "\insrsid<digits> }" are treated as
//       non-text ("special") items; the first one is kept and
//       later ones are only compared against it.
//    4. The text of the remaining groups is concatenated.
//    5. The concatenated text is split at semicolons and one
//       "{\xe ...}" entry is written per piece.
//
//  An escaped semicolon ("\;") is reported on stdout and the
//  remaining text is then emitted as one unsplit entry. If no
//  piece can be produced at all, the original entry is written
//  back unchanged so nothing is lost from the document.
//
//  outputCnt is incremented once per entry written, including
//  entries that are passed through unchanged.
//
//  EBX, ECX, EDX, EDI, and ESI are saved and restored.

procedure processIndex( ndxStart:dword; ndxEnd:dword );
const
    maxNested := 1024;              // Number of slots in nested[].

var
    mainStr:        string;         // Working copy (newlines removed, consumed).
    mainStr2:       string;         // Untouched copy for pass-through output.
    startItem:      dword;          // Start address of the item being matched.
    matched:        dword;          // Really Boolean.
    nestedNdx:      dword;          // Number of slots of nested[] in use.
    i:              dword;          // Generic loop index.
    special:        string;         // First non-text item (or NULL).
    special2:       string;         // Later non-text item, compared and freed.
    combined:       string;         // Concatenated text of all text items.
    catToComb:      string;         // Text about to be appended to combined.
    firstComb:      string;         // Start address of a text item's prefix.
    combinedPrefix: string;         // Formatting prefix of the first text item.
    nested:         string[ maxNested ];

begin processIndex;

    push( ebx );
    push( ecx );
    push( edx );
    push( edi );
    push( esi );

    // Create a string from the index entry:

    add( 4, ndxStart );             // Skip "{\xe"
    dec( ndxEnd );                  // Remove trailing "}"
    memToStr( ndxStart, ndxEnd );
    mov( eax, mainStr );
    str.a_cpy( mainStr );
    mov( eax, mainStr2 );

    // See if there are any semicolons in this string.
    // (Output it as-is if there are none.)

    str.chpos2( eax, ';' );
    if( eax <> -1 ) then            // There was a semicolon

        // Go through and delete any newline characters,
        // compacting the string in place:

        mov( mainStr, eax );
        mov( (type str.strRec [eax]).length, ecx );
        mov( eax, esi );
        mov( eax, edi );
        mov( 0, edx );              // EDX counts the deleted characters.
        while( ecx > 0 ) do

            mov( [esi], al );
            if( al = stdio.lf || al = stdio.cr ) then

                inc( edx );

            else

                mov( al, [edi] );
                inc( edi );

            endif;
            inc( esi );
            dec( ecx );

        endwhile;
        mov( 0, (type char [edi]) );
        mov( mainStr, eax );
        mov( (type str.strRec [eax]).length, ecx );
        sub( edx, ecx );
        mov( ecx, (type str.strRec [eax]).length );

        // Collect all the text between braces inside
        // the index entry:

        mov( 0, nestedNdx );
        repeat

            mov( false, matched );

            // nested[] lives on the stack; never store past its end.

            if( nestedNdx >= maxNested ) then

                stdout.put
                (
                    "Too many brace groups in one index entry; "
                    "extra groups ignored" nl
                );
                break;

            endif;

            pat.match( mainStr );

                pat.upToChar( '{' );
                mov( esi, startItem );
                pat.oneChar( '{' );
                pat.upToChar( '}' );
                pat.oneChar( '}' );

                // Save the processed item:

                memToStr( startItem, esi );
                mov( nestedNdx, edx );
                inc( nestedNdx );
                mov( eax, nested[edx*4] );
                mov( true, matched );

                // Delete the processed item (everything up to
                // and including its closing brace):

                mov( esi, ecx );
                sub( mainStr, ecx );
                str.delete3( mainStr, 0, ecx );

              pat.if_failure

                nop();              // Ignore mal-formed index entries.

            pat.endmatch;

        until( !matched );

        mov( NULL, special );
        mov( NULL, combined );
        mov( NULL, combinedPrefix );
        for( mov( 0, i ); mov( i, eax ) < nestedNdx; inc(i) ) do

            // Determine if this is a non-text item.
            // Non-text items have a single space after
            // ...\insrsidxxxxx

            mov( i, ecx );
            mov( nested[ecx*4], edx );
            pat.match( nested[ecx*4] );

                mov( esi, startItem );
                pat.oneChar( '{' );
                pat.matchToStr( "\insrsid" );
                pat.oneOrMoreCset( {'0'..'9'} );
                pat.matchStr( " }" );
                memToStr( startItem, esi );
                if( special == NULL ) then

                    mov( eax, special );

                else

                    // Only the first special item is kept; report
                    // any later one that differs from it.

                    mov( eax, special2 );
                    if( !str.eq( special, eax ) ) then

                        stdout.put
                        (
                            "Not Equal!" nl,
                            "Special: """, special, """" nl,
                            "special2: """, special2, """" nl
                        );

                    endif;
                    str.free( special2 );

                endif;
                mov( true, matched );

              pat.if_failure

                mov( false, matched );

            pat.endmatch;

            // If it was a text item, combine it with all the previous text items.

            if( !matched ) then     // it's a text item.

                mov( i, ecx );
                mov( nested[ecx*4], edx );
                pat.match( nested[ecx*4] );

                    mov( esi, firstComb );
                    pat.oneChar( '{' );
                    pat.matchToStr( "\insrsid" );
                    pat.oneOrMoreCset( {'0'..'9'} );
                    pat.zeroOrOnePat

                        pat.oneChar( '\' );
                        pat.oneOrMoreCset( {'0'..'9', 'a'..'z', 'A'..'Z'} );

                    pat.endZeroOrOnePat;
                    pat.oneChar( ' ' );
                    pat.zeroOrOnePat

                        pat.matchStr( "{\*\oldcprops" );
                        pat.upToChar( '}' );
                        pat.oneChar( '}' );

                    pat.endZeroOrOnePat;
                    mov( esi, startItem );

                    // The prefix of the first text item is reused
                    // for every emitted entry.

                    if( combinedPrefix = NULL ) then

                        memToStr( firstComb, esi );
                        mov( eax, combinedPrefix );

                    endif;
                    pat.upToChar( '}' );
                    if( combined = NULL ) then

                        memToStr( startItem, esi );
                        mov( eax, combined );

                    else

                        memToStr( startItem, esi );
                        mov( eax, catToComb );
                        str.a_cat( combined, catToComb );

                        // Store the new string before freeing the
                        // old ones, so the result does not have to
                        // survive in EAX across the str.free calls.

                        mov( combined, edx );
                        mov( eax, combined );
                        str.free( edx );
                        str.free( catToComb );

                    endif;

                  pat.if_failure

                    mov( i, ecx );
                    mov( nested[ecx*4], edx );
                    stdout.put( "Failed, nested: ", (type string edx), nl );
                    nop();

                pat.endmatch;

            endif;

            // Free allocated storage for this string:

            mov( i, ecx );
            str.free( nested[ecx*4] );

        endfor;

        // Now process the combined entry and break it up into
        // separate entries if there are semicolons present.
        // nested[] is reused to hold the individual pieces.

        mov( 0, nestedNdx );
        if( combined <> NULL ) then     // NULL when no text item matched.

            while( str.length( combined ) > 0 ) do

                if( nestedNdx >= maxNested ) then

                    stdout.put
                    (
                        "Too many entries in one index item; "
                        "remainder dropped" nl
                    );
                    break;

                endif;

                // Special case to provide warning
                // if escape is present:

                pat.match( combined );

                    pat.matchToStr( "\;" );
                    stdout.put
                    (
                        "File contains escaped semicolon!", nl
                        """", combined, """" nl
                    );
                    mov( true, matched );

                  pat.if_failure

                    mov( false, matched );

                pat.endmatch;

                if( !matched ) then

                    // Handle strings that don't have an escaped semicolon:

                    pat.match( combined );

                        mov( esi, startItem);
                        pat.upToChar( ';' );
                        memToStr( startItem, esi );
                        mov( nestedNdx, edx );
                        inc( nestedNdx );
                        str.trim( eax );
                        mov( eax, nested[ edx*4 ] );
                        mov( esi, edx );
                        sub( startItem, edx );
                        inc( edx );     // Remove semicolon.
                        str.delete3( combined, 0, edx );

                      pat.if_failure

                        // No semicolon left: the rest is the last piece.

                        str.a_cpy( combined );
                        mov( nestedNdx, edx );
                        inc( nestedNdx );
                        str.trim( eax );
                        mov( eax, nested[ edx*4 ] );
                        mov( combined, eax );
                        mov( 0, (type str.strRec [eax]).length ); // Force length to 0.

                    pat.endmatch;

                else

                    // An escaped semicolon cannot be split safely.
                    // Keep the remaining text as a single piece and
                    // consume it, so this loop terminates instead
                    // of repeating the warning forever.

                    str.a_cpy( combined );
                    mov( nestedNdx, edx );
                    inc( nestedNdx );
                    str.trim( eax );
                    mov( eax, nested[ edx*4 ] );
                    mov( combined, eax );
                    mov( 0, (type str.strRec [eax]).length ); // Force length to 0.

                endif;

            endwhile;
            str.free( combined );

        endif;

        if( nestedNdx <> 0 ) then

            // Emit the individual index entries:

            for( mov( 0, ecx ); ecx < nestedNdx; inc( ecx )) do

                fileio.put( outH, "{\xe " nl );
                if( special <> NULL ) then

                    fileio.put( outH, special, nl );

                endif;
                mov( nested[ecx*4], eax );
                fileio.put( outH, combinedPrefix, (type string eax), "}" nl );
                if( special <> NULL ) then

                    fileio.put( outH, special, nl );

                endif;
                fileio.put( outH, "}" nl );

                // This piece has been written; release it.

                str.free( nested[ecx*4] );

                // Bump the output index count by one:

                inc( outputCnt );

            endfor;

        else

            // Nothing usable was extracted. Write the entry back
            // unchanged rather than dropping it from the document.

            fileio.put( outH, "{\xe", mainStr2, "}" );
            inc( outputCnt );

        endif;

        if( combinedPrefix <> NULL ) then

            str.free( combinedPrefix );

        endif;
        if( special <> NULL ) then

            str.free( special );

        endif;

    else // Does not have a semiColon

        fileio.put( outH, "{\xe", mainStr2, "}" );

        // A passed-through entry is still an output entry.

        inc( outputCnt );

    endif;
    str.free( mainStr );
    str.free( mainStr2 );

    pop( esi );
    pop( edi );
    pop( edx );
    pop( ecx );
    pop( ebx );

end processIndex;
// processFile-
//
//  Handles the input text up to and including the next index
//  entry.
//
//  start  - (by reference) current position in the input text;
//           on return it holds the position just past the text
//           that was consumed.
//  endptr - address one past the last byte of the input text.
//
//  Text in front of the next "{\xe" is copied to outH unchanged.
//  The entry itself, located by counting braces, is passed to
//  processIndex and counted in inputCnt. If no further entry
//  exists, the rest of the text is copied and the position is set
//  to endptr.
//
//  If the input ends before the entry's closing brace is found,
//  the unterminated tail is copied verbatim instead of being
//  passed to processIndex, which assumes a trailing "}" and would
//  drop the final character and append a brace of its own.
//
//  ESI, EDI, EBX, ECX, and EDX are saved and restored.

procedure processFile( var start:txtPtr; endptr:txtPtr );
begin processFile;

    push( esi );
    push( edi );
    push( ebx );
    push( ecx );
    push( edx );

    mov( start, esi );              // ESI = address of caller's pointer
    mov( [esi], esi );              // ESI = current text position
    mov( endptr, edi );
    if( esi < edi ) then

        pat.match( esi, edi );

            // Match up to the first index entry:

            pat.upToiStr( "{\xe" );

            // Write all the text up to the
            // first index entry to the output file:

            mov( esi, eax );
            sub( ebx, eax );
            fileio.write( outH, val ebx, eax );

            // Find the end of this index entry:

            mov( esi, edx );        // Save ptr to "{"
            mov( 1, ecx );          // Count braces
            inc( esi );             // Skip leading brace
            while( ecx > 0 && esi < edi ) do

                // Keep track of nested braces:

                mov( [esi], al );
                if( al == '{' ) then

                    inc( ecx );

                elseif( al == '}' ) then

                    dec( ecx );

                endif;

                // Move on to the next character

                inc( esi );

            endwhile;

            if( ecx = 0 ) then

                // Braces balanced: EDX..ESI is a complete entry.
                // Bump the input index count by 1 and process it:

                inc( inputCnt );
                processIndex( edx, esi );

            else

                // Ran out of input inside the entry. Copy the
                // unterminated tail to the output as-is.

                mov( esi, eax );
                sub( edx, eax );
                fileio.write( outH, val edx, eax );

            endif;

          pat.if_failure

            // If we didn't match an index entry,
            // then we've hit the end of the file.
            // Write everything from the start of
            // the match to the end of the file
            // to the output file:

            mov( edi, eax );
            sub( ebx, eax );
            fileio.write( outH, val ebx, eax );
            mov( edi, esi );

        pat.endmatch;

    endif;

    // Return pass-by-reference value:

    mov( start, edi );
    mov( esi, [edi] );

    pop( edx );
    pop( ecx );
    pop( ebx );
    pop( edi );
    pop( esi );

end processFile;
begin fixXref;

    // Both statistics counters start at zero.

    mov( 0, outputCnt );
    mov( 0, inputCnt );

    // The program name plus exactly two file names are required.

    if( arg.c() <> 3 ) then

        stderr.put( "Usage fixxref in-file out-file" nl );
        w.ExitProcess(1);

    endif;

    // Fetch the two file names from the command line.

    mov( arg.a_v( 1 ), inFile );
    mov( arg.a_v( 2 ), outFile );

    // Set up the memory-mapped file object, then map the input
    // file for reading.

    input.create();
    try

        input.open( inFile, fileio.r );

      anyexception

        stderr.put( "Could not open ", inFile, nl );
        w.ExitProcess( 2 );

    endtry;

    // Create the output file and remember its handle. On failure
    // the input mapping is released before exiting.

    try

        mov( fileio.openNew( outFile ), outH );

      anyexception

        stderr.put( "Could not open ", outFile, nl );
        input.close();
        input.destroy();
        w.ExitProcess( 3 );

    endtry;

    stdout.put
    (
        "Adjusting index entries in ",
        inFile,
        " and writing output to ",
        outFile,
        nl
    );

    // Record the bounds of the mapped input text.

    mov( input.filePtr, eax );
    mov( eax, inStart );
    mov( input.endFilePtr, eax );
    mov( eax, inEnd );

    // Each processFile call advances inStart; keep calling it
    // until the whole input has been consumed.

    mov( inStart, esi );
    while( esi < inEnd ) do

        processFile( inStart, inEnd );
        mov( inStart, esi );

    endwhile;

    // Report how many entries were read and written.

    stdout.put
    (
        "There were ", inputCnt, " input index entries, " nl
        "and ", outputCnt, " output index entries" nl
    );

    // Release the output file and the input mapping.

    fileio.close( outH );
    input.close();
    input.destroy();

end fixXref;
Randy Hyde