HLA Program to Fix Word Index Entries

Post by **rhyde** » Sun Jan 15, 2023 1:38 am

Note: updated source code on 2023-01-16 to fix a few bugs.

I use Adobe FrameMaker when writing my books. I really like the professional-level features, especially for large projects (like books).

I despise Microsoft Word. Unfortunately, the publishing world (and just about everyone else) has standardized on MS Word for documentation exchange. Fortunately, FrameMaker can export documents in an .RTF format, which can be read by Word. FrameMaker does a half-way decent job of translating index entries into .RTF form (which are converted into Word index entries).

I say "half-way decent" because there are some bugs in this translation. For example, FrameMaker allows you to embed multiple index entries in your document by separating the entries with semicolons, e.g.,

Code: Select all

index item; another item

This will create two index entries: "index item" in the "I" section of the index and "another item" in the "A" section.

Unfortunately, MS Word doesn't seem to support this. Recently, my publisher (No Starch Press) got tired of fixing all these index entries in my latest book (The Art of ARM Assembly) and asked me to do it; I was completely unaware that this was a problem in Word and that they had been manually fixing these entries (which is a *lot* of work). I took a look at the latest chapter from my editor, which had hundreds of index entries, and rapidly got tired of manually correcting these things. So I got the bright idea of writing a program to do the conversions for me (across 16 chapters).

Now to be honest, I should have used SNOBOL4/Spitbol or Icon (string-oriented/pattern matching languages), but it's been probably 10 years since I've written code in these languages. I also probably should have taken this opportunity to learn the regular expression features in Swift. But I didn't want to spend several days on this project, so I wrote it in assembly language (crazy, right?).

Actually, it's not as bad as it seems. The High-Level Assembler has a really nice pattern-matching library (based on SNOBOL4 and Icon), so I whipped out my editor and started writing some 32-bit assembly code (for the first time in a couple of years; been doing mostly 64-bit x86 stuff with MASM, Gas, and 64-bit ARM assembly lately).

This program is a hack and certainly not very efficient (it doesn't need to be; it will only be run a couple dozen times).
But I thought I'd share it here as it was a lot of fun to write.

Code: Select all

// fixXref-
//
// Fixes bad index entries from
// FrameMaker->.RTF translation.

program fixXref;
#include( "stdlib.hhf" )
#include( "w.hhf" )

type
	// Address of a character inside the input text.
	txtPtr:		pointer to char;

static

	// The input file, accessed through a memory-mapped file object.
	// inStart is the position of the next unprocessed character
	// (processFile advances it); inEnd is the end-of-text limit
	// against which the main loop compares inStart.
	input:		mmap_t;
	inStart:	txtPtr;
	inEnd:		txtPtr;
	
	// outH is the handle of the output file; inFile and outFile
	// hold the two file names taken from the command line.
	outH:		dword;
	inFile:		string;
	outFile:	string;
	
	// Statistics printed at the end of the run: index entries
	// found in the input and index entries written to the output.
	inputCnt:	int32;
	outputCnt:	int32;
	
	
	
	// memToStr:
	//
	// Builds a newly allocated HLA string from the bytes in the
	// half-open memory range [start, endstr).
	//
	//	start-	address of the first byte to copy.
	//	endstr-	address one past the last byte to copy.
	//
	// Returns a pointer to the new string in EAX. The string's
	// length field is set to the number of bytes copied and the
	// character data is zero-terminated. ECX, ESI, and EDI are
	// preserved.
	//
	// If endstr lies below start (an inverted range), the range
	// is treated as empty and a zero-length string is returned.
	// Without this clamp the subtraction would produce a negative
	// (i.e., enormous unsigned) byte count for both the allocation
	// and the copy.
	
	procedure memToStr( start:dword; endstr:dword );
	begin memToStr;
	
		push( ecx );
		push( esi );
		push( edi );
		
		mov( start, esi );
		mov( endstr, ecx );
		
		// Clamp an inverted range to an empty one (unsigned compare):
		
		if( ecx < esi ) then
		
			mov( esi, ecx );
			
		endif;
		sub( esi, ecx );			// ECX = number of bytes to copy
		
		// Allocate the string and fill in its length field.
		// (This code relies on str.alloc leaving ECX intact.)
		
		str.alloc( ecx );
		mov( ecx, (type str.strRec [eax]).length );
		
		// Copy the bytes and append the zero terminator:
		
		mov( eax, edi );
		cld();
		rep.movsb();
		mov( 0, (type char [edi]) );
		
		pop( edi );
		pop( esi );
		pop( ecx );
	
	end memToStr;
	
	
	// processIndex:
	//
	// Rewrites one RTF index entry, "{\xe ... }", writing the
	// result to the output file (outH).
	//
	//	ndxStart-	address of the entry's opening "{".
	//	ndxEnd-		address one past the entry's closing "}".
	//
	// If the entry contains no semicolon it is written back
	// unchanged. Otherwise:
	//
	//	1. CR/LF characters are removed from a working copy.
	//	2. Every "{...}" group inside the entry is collected.
	//	3. Groups of the form "{...\insrsidNNNN }" (no text) are
	//	   remembered as the "special" group; the text of all other
	//	   groups is concatenated into "combined", and the leading
	//	   formatting of the first text group is kept as
	//	   "combinedPrefix".
	//	4. "combined" is split at semicolons, each piece is trimmed,
	//	   and one "{\xe ...}" entry is written per non-empty piece.
	//
	// An entry whose text contains an escaped semicolon ("\;") draws
	// a warning on stdout and its remaining text is kept as a single
	// piece rather than being split. An entry from which no text
	// could be recovered is written back unchanged.
	//
	// outputCnt is incremented once for every entry written.
	// EBX, ECX, EDX, EDI, and ESI are preserved; EAX is not.
	//
	// NOTE(review): "nested" holds at most 1024 items and the indexes
	// into it are not range checked.
	
	procedure processIndex( ndxStart:dword; ndxEnd:dword );
	var
		mainStr:		string;			// Working copy of the entry text
		mainStr2:		string;			// Untouched copy, for pass-through
		startItem:		dword;
		matched:		dword;			// Really Boolean.
		
		nestedNdx:		dword;			// Number of nested entries
		i:				dword;			// Generic loop index
		
		special:		string;
		special2:		string;
		combined:		string;
		catToComb:		string;
		firstComb:		string;
		combinedPrefix:	string;
		nested:			string[1024];
		
	begin processIndex;
	
		push( ebx );
		push( ecx );
		push( edx );
		push( edi );
		push( esi );
				
		// Create a string from the index entry, plus a second
		// copy that is never modified:
		
		add( 4, ndxStart );			// Skip "{\xe"
		dec( ndxEnd );				// Removing trailing "}"
		memToStr( ndxStart, ndxEnd );
		mov( eax, mainStr );
		str.a_cpy( mainStr );
		mov( eax, mainStr2 );
		
		// See if there are any semicolons in this string.
		// (Output it as-is if there are none.)
		
		str.chpos2( eax, ';' );
		if( eax <> -1 ) then	// There was a semicolon
		
			// Go through and delete any newline characters.
			// ESI reads, EDI writes, EDX counts deleted characters.

			mov( mainStr, eax );
			mov( (type str.strRec [eax]).length, ecx );		
			mov( eax, esi );
			mov( eax, edi );
			mov( 0, edx );
			while( ecx > 0 ) do
			
				mov( [esi], al );
				if( al = stdio.lf || al = stdio.cr ) then
				
					inc( edx );
					
				else
				
					mov( al, [edi] );
					inc( edi );
					
				endif;
				inc( esi );
				dec( ecx );
				
			endwhile;
			mov( 0, (type char [edi]) );
			mov( mainStr, eax );
			mov( (type str.strRec [eax]).length, ecx );
			sub( edx, ecx );
			mov( ecx, (type str.strRec [eax]).length );
			
			// Collect all the text between braces inside
			// the index entry. Each pass copies the first
			// "{...}" group into nested[] and deletes everything
			// up to the end of that group from mainStr.
			
			mov( 0, nestedNdx );
			repeat
			
				mov( false, matched );
				pat.match( mainStr );
				
					pat.upToChar( '{' );
					mov( esi, startItem );				
					pat.oneChar( '{' );
					pat.upToChar( '}' );
					pat.oneChar( '}' );
					
					// Save the processed item:
					
					memToStr( startItem, esi );
					mov( nestedNdx, edx );
					inc( nestedNdx );
					mov( eax, nested[edx*4] );
					
					mov( true, matched );
					
					// Delete the processed item:
					
					mov( esi, ecx );
					sub( mainStr, ecx );
					str.delete3( mainStr, 0, ecx ); 			
								
					pat.if_failure
					
						nop();	// Ignore mal-formed index entries.

				pat.endmatch;
			
			until( !matched );
			
			mov( NULL, special );
			mov( NULL, combined );
			mov( NULL, combinedPrefix );
			for( mov( 0, i ); mov( i, eax ) < nestedNdx; inc(i) ) do
			
				// Determine if this is a non-text item.
				// Non-text items have a single space after
				// ...\insrsidxxxxx
				
				mov( i, ecx );
				pat.match( nested[ecx*4] );
				
					mov( esi, startItem );
					pat.oneChar( '{' );
					pat.matchToStr( "\insrsid" );
					pat.oneOrMoreCset( {'0'..'9'} );
					pat.matchStr( " }" );
					memToStr( startItem, esi );

					// Keep the first non-text item; any later one
					// is only compared against it (and reported
					// if it differs), then discarded.
					
					if( special == NULL ) then
					
						mov( eax, special );
						
					else
					
						mov( eax, special2 );
						if( !str.eq( special, eax ) ) then
						
							stdout.put
							( 
								"Not Equal!" nl, 
								"Special: """, special, """" nl, 
								"special2: """, special2, """" nl 
							);
							
						endif;
						str.free( special2 );
						
					endif;
					mov( true, matched );
					
					pat.if_failure
					
						mov( false, matched );
						
				pat.endmatch;
				
				// If it was a text item, combine it with all the previous text items.
				
				if( !matched ) then	// it's a text item.	
					
					mov( i, ecx );
					pat.match( nested[ecx*4] );
					
						// Match the formatting that precedes the text:
						
						mov( esi, firstComb );
						pat.oneChar( '{' );
						pat.matchToStr( "\insrsid" );
						pat.oneOrMoreCset( {'0'..'9'} );
						pat.zeroOrOnePat
						
							pat.oneChar( '\' );
							pat.oneOrMoreCset( {'0'..'9', 'a'..'z', 'A'..'Z'} );
							
						pat.endZeroOrOnePat;
						pat.oneChar( ' ' );
						pat.zeroOrOnePat
						
							pat.matchStr( "{\*\oldcprops" );
							pat.upToChar( '}' );
							pat.oneChar( '}' );
							
						pat.endZeroOrOnePat; 
						mov( esi, startItem );
						
						// The first text item's formatting becomes the
						// prefix of every entry written later:
						
						if( combinedPrefix = NULL ) then
						
							memToStr( firstComb, esi );
							mov( eax, combinedPrefix );
							
						endif;
						pat.upToChar( '}' );
						
						// Append this item's text to the combined text:
						
						if( combined = NULL ) then
						
							memToStr( startItem, esi );
							mov( eax, combined );
							
						else
						 
							// (Relies on str.free leaving EAX, the
							// result of str.a_cat, intact.)
							
							memToStr( startItem, esi );
							mov( eax, catToComb );
							str.a_cat( combined, catToComb );
							str.free( combined );
							str.free( catToComb );
							mov( eax, combined );
							
						endif;
						
						pat.if_failure
						
							mov( i, ecx );
							mov( nested[ecx*4], edx );
							stdout.put( "Failed, nested: ", (type string edx), nl );
							nop();
							
					pat.endmatch;
					
				endif;
				
				// Free allocated storage for this string:
				
				mov( i, ecx );
				str.free( nested[ecx*4] );
					
			endfor;
			
			if( combined = NULL ) then
			
				// No text could be recovered from this entry, so
				// there is nothing to split. Write the entry back
				// exactly as it appeared in the input:
				
				fileio.put( outH, "{\xe", mainStr2, "}" );
				inc( outputCnt );
				
			else
			
				// Now process the combined entry and break it up into
				// separate entries if there are semicolons present.
				// nested[] is reused to hold the pieces.
				
				mov( 0, nestedNdx );
				while( str.length( combined ) > 0 ) do
				
					// Isolate the strings separated by semicolons:
					
					pat.match( combined );
					
						// Special case to provide warning
						// if escape is present:
						
						pat.matchToStr( "\;" );
						stdout.put
						( 
							"File contains escaped semicolon!", nl
							"""", combined, """" nl
						);
						mov( true, matched );
						
						pat.if_failure
						
							mov( false, matched );					
							
					pat.endmatch;
					
					// Handle strings that don't have an escaped semicolon:
					
					if( !matched ) then
					
						pat.match( combined );
						
							// Copy the text in front of the next semicolon
							// and delete it (and the semicolon) from combined:
							
							mov( esi, startItem);
							pat.upToChar( ';' );
							memToStr( startItem, esi );
							mov( nestedNdx, edx );
							inc( nestedNdx );
							str.trim( eax );
							mov( eax, nested[ edx*4 ] );
							mov( esi, edx );
							sub( startItem, edx );
							inc( edx );		// Remove semicolon. 
							str.delete3( combined, 0, edx );
												
							pat.if_failure
							
								// No semicolon left: the remainder is
								// the final piece (handled below).
								
								mov( true, matched );
								
						pat.endmatch;
						
					endif;
					
					// At this point "matched" means "keep whatever is left
					// in combined as one piece". That is the case when no
					// semicolon remains and also when an escaped semicolon
					// was found; consuming the remainder guarantees that
					// this loop terminates.
					
					if( matched ) then
					
						str.a_cpy( combined );
						mov( nestedNdx, edx );
						inc( nestedNdx );
						str.trim( eax );
						mov( eax, nested[ edx*4 ] );
						mov( combined, eax );
						mov( 0, (type str.strRec [eax]).length );	// Force length to 0.
						
					endif;
					
				endwhile;
				str.free( combined );
				
				// Emit the individual index entries, releasing each
				// piece once it has been handled. Pieces that trimmed
				// down to nothing (e.g., text after a trailing
				// semicolon) do not produce an entry.
				
				for( mov( 0, ecx ); ecx < nestedNdx; inc( ecx )) do
				
					mov( nested[ecx*4], eax );
					if( (type str.strRec [eax]).length <> 0 ) then
					
						fileio.put( outH, "{\xe " nl );
						if( special <> NULL ) then
						
							fileio.put( outH, special, nl );
							
						endif;
						mov( nested[ecx*4], eax );
						fileio.put( outH, combinedPrefix, (type string eax), "}" nl );
						if( special <> NULL ) then
						
							fileio.put( outH, special, nl );
							
						endif;
						fileio.put( outH, "}" nl );
						
						// Bump the output index count by one:
						
						inc( outputCnt );
						
					endif;
					str.free( nested[ecx*4] );
					
				endfor;
				
			endif;
			
			// Release whatever the collection pass allocated:
			
			if( combinedPrefix <> NULL ) then
			
				str.free( combinedPrefix );
				
			endif;
			if( special <> NULL ) then
			
				str.free( special );
				
			endif;
			
		else // Does not have a semiColon
		
			fileio.put( outH, "{\xe", mainStr2, "}" );
			inc( outputCnt );
		
		endif;
		str.free( mainStr );							
		str.free( mainStr2 );							
		pop( esi );
		pop( edi );
		pop( edx );
		pop( ecx );
		pop( ebx );
		
	end processIndex;
	
	// processFile-
	//
	// Processes the text in the input file. Each call handles at
	// most one index entry:
	//
	//	* If a "{\xe" sequence (matched without regard to case) is
	//	  found at or after the current position, the text in front
	//	  of it is copied to the output file, inputCnt is bumped,
	//	  the end of the entry is located by counting braces, and
	//	  the entry is handed to processIndex.
	//
	//	* Otherwise the remaining text is copied to the output file
	//	  and the position is moved to endptr.
	//
	//	start-	(passed by reference) pointer to the next unprocessed
	//			character. On return it holds the position where the
	//			next call should resume.
	//	endptr-	end-of-text limit for the scan.
	//
	// ESI, EDI, EBX, ECX, and EDX are preserved; EAX is modified.
	
	procedure processFile( var start:txtPtr; endptr:txtPtr );
	begin processFile;
	
		push( esi );
		push( edi );
		push( ebx );
		push( ecx );
		push( edx );
	
		// "start" is a reference parameter, so fetch the address
		// of the caller's variable and then the pointer it holds:
		
		mov( start, esi );
		mov( [esi], esi );
		mov( endptr, edi );
		if( esi < edi ) then
		
			pat.match( esi, edi );
			
				// Match up to the first index entry:
				
				pat.upToiStr( "{\xe" );
				
				// Write all the text up to the
				// first index entry to the output file.
				// NOTE(review): this assumes the pattern library
				// leaves EBX at the start of the matched text and
				// ESI at the "{" -- confirm against the HLA
				// patterns documentation.
				
				mov( esi, eax );
				sub( ebx, eax );
				fileio.write( outH, val ebx, eax );
				
				// Bump the input index count by 1:
				
				inc( inputCnt );
				
				// Find the end of this index entry. The scan stops
				// just past the "}" that balances the opening "{",
				// or at endptr if the braces never balance:
				
				mov( esi, edx );	// Save ptr to "{"
				mov( 1, ecx );		// Count braces
				inc( esi );			// Skip leading brace
				while( ecx > 0 && esi < edi ) do
				
					// Keep track of nested braces:
					
					mov( [esi], al );
					if( al == '{' ) then
					
						inc( ecx );
						
					elseif( al == '}' ) then
					
						dec( ecx );
						
					endif;
					
					// Move on to the next character
					
					inc( esi );
					
				endwhile;
				
				// Process the index string (EDX = opening "{",
				// ESI = one past the end of the entry).
				// NOTE(review): if the loop above stopped at endptr
				// with ECX still nonzero, the entry is unterminated
				// but is passed along as though it ended with "}".
				
				processIndex( edx, esi );
				
			
				pat.if_failure
				
					// If we didn't match an index entry,
					// then we've hit the end of the file.
					// Write everything from the start of
					// the match to the end of the file
					// to the output file.
					// NOTE(review): this uses EBX and EDI as they
					// stand after the failed match; verify that the
					// pattern library leaves them at the start and
					// end of the text in this case.
					
					mov( edi, eax );
					sub( ebx, eax );
					fileio.write( outH, val ebx, eax );
					mov( edi, esi ); 				
			
			pat.endmatch;
		
		endif;
		
		// Return pass-by-reference value (ESI is the position
		// at which the next call should resume):
		
		mov( start, edi );
		mov( esi, [edi] );
		
		pop( edx );
		pop( ecx );
		pop( ebx );
		pop( edi );
		pop( esi );
		
	end processFile;
	
	
begin fixXref;

	// Start with both statistics counters at zero.
	
	mov( 0, outputCnt );
	mov( 0, inputCnt );
	
	// The program needs an argument count of exactly three;
	// anything else earns a usage message and exit code 1.
	
	arg.c();
	if( eax <> 3 ) then
	
		stderr.put( "Usage fixxref in-file out-file" nl );
		w.ExitProcess(1);
		
	endif;
	
	// Fetch the input and output file names:
	
	arg.a_v( 1 );
	mov( eax, inFile );
	arg.a_v( 2 );
	mov( eax, outFile );
	
	// Set up the memory-mapped file object and map the input
	// file for reading. Any failure ends the run with exit code 2.
	
	input.create();
	try
	
		input.open( inFile, fileio.r );
		
	anyexception
	
		stderr.put( "Could not open ", inFile, nl );
		w.ExitProcess( 2 );
		
	endtry;
	
	// Create the output file and remember its handle. On failure,
	// release the input mapping and end the run with exit code 3.
	
	try
	
		fileio.openNew( outFile );
		mov( eax, outH );
		
	anyexception
	
		stderr.put( "Could not open ", outFile, nl );
		input.close();
		input.destroy();
		w.ExitProcess( 3 );
		
	endtry;
	
	// Tell the user what is about to happen:
	
	stdout.put( "Adjusting index entries in ", inFile );
	stdout.put( " and writing output to ", outFile, nl );
	
	// Record the bounds of the mapped input text:
	
	mov( input.filePtr, eax );
	mov( input.endFilePtr, edx );
	mov( eax, inStart );
	mov( edx, inEnd );
	
	// Each processFile call advances inStart; keep calling
	// until it reaches the end of the text.
	
	mov( inStart, esi );
	while( esi < inEnd ) do
	
		processFile( inStart, inEnd );
		mov( inStart, esi );
		
	endwhile;
	
	// Report how many index entries were read and written:
	
	stdout.put( "There were ", inputCnt, " input index entries, " nl );
	stdout.put( "and ", outputCnt, " output index entries" nl );
	
	// Close the output file and release the input mapping:
	
	fileio.close( outH );
	input.close();
	input.destroy(); 

end fixXref;

Cheers,
Randy Hyde