/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
*   u8dump <file
*
* This is a prototype UTF-8 dump program.  The intent is to analyze a file and
* show its contents in the most explanatory way.  This is primarily for debugging
* and trouble-shooting purposes.  This program's output is far too large for most
* routine uses.  But it might be useful if you're debugging problems related to
* how text is encoded.
*
*AUTHOR
*  John Chambers <jc@trillian.mit.edu>
*/

#include "local.h"
#include "sys_stdio.h"
#include "V.h"

// Buffer for holding a short ASCII text string (the raw bytes of one char):
#define BUF 7
char  buf[BUF+1];	// BUF bytes plus a terminating NUL
int   bi = 0;		// Next char in buf[]

// Buffer for holding a UTF-8 hex string:
#define UHEX8 31
char uhex8[UHEX8+1];
char *uhex8p = uhex8;		// Pointer into uhex8[]
char *uhex8z = uhex8+UHEX8;	// Last byte of uhex8[]

// Buffer for holding a Unicode hex string.  Code points above 0xFFFF are
// formatted with at least 6 hex digits, and a 32-bit int can need up to 8,
// so the buffer is sized for 8 digits plus the terminating NUL.
#define UHEX16 8
char uhexbuf[UHEX16+1];
char *uhexbufp = uhexbuf;		// Pointer into uhexbuf[]
char *uhexbufz = uhexbuf+UHEX16;	// Last byte of uhexbuf[]

char *dsc = 0;		// Description of one char
char *txt = 0;		// Text displayed for a char

int   files = 0;	// Number of files read so far
int   ucode = 0;	// Unicode value of char

/*
* Program entry point.
*
*	ac, av	Argument count and vector.  Both are handed to Vinit(), and the
*		count it returns replaces ac (presumably the count left after
*		Vinit() has dealt with its own options -- confirm in V.h).
*	ev	Environment vector; not used.
*
* The arguments are only passed to the V2 diagnostic macro; none of them is
* opened here, so files is still 0 when the loop ends and the input is read
* from stdin.
*
* Returns 0.
*/
int main(ac,av,ev)
	int   ac;
	char**av;
	char**ev;
{	int   a;

	ac = Vinit(ac,av);

	for (a=1; a<ac; a++) {
		V2 "arg%3d=\"%s\"",a,av[a] V;
	}

	unless (files > 0) {
		onefile(stdin);
	}
	// Explicit status: before C99, falling off the end of main() leaves the
	// exit status undefined.
	return 0;
}

/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
* Process one input file.
*
* Reads fp a byte at a time until EOF and prints one tab-separated line per
* loop iteration:
*
*	<bytes>b  <text>  <description>  <Unicode hex>  <UTF-8 hex>
*
* A 2-, 3- or 4-byte UTF-8 lead byte has its extension bytes read at once by
* extchars(), so the whole char is one line.  The lead bytes of the 5-, 6- and
* 7-byte forms only record how many extension bytes should follow (in ext);
* those bytes are then reported one per line by the following iterations.
*
* Globals written: files, buf[], bi, uhex8[], uhexbuf[], ucode, txt, dsc
* (and, through extchars(), uhex8p).
*
* Always returns 0.
*/
int onefile(fp)
	FILE *fp;
{	int   bc, ch, ci;
	int   digits;		// Hex digits used for the Unicode column
	// Extension bytes still expected.  Starts at 0 so that an extension
	// byte at the very start of the input is reported as unexpected
	// instead of testing an uninitialized variable.
	int   ext = 0;
	// Display text and description for 0x00..0x20, indexed by byte value.
	static struct { char *txt, *dsc; } ctlname[0x21] = {
		{"00", "NUL"}, {"__", "SOH"}, {"__", "STX"}, {"__", "ETX"},	// 00-03
		{"__", "EOT"}, {"__", "ENQ"}, {"__", "ACK"}, {"__", "BEL"},	// 04-07
		{"\\b","BS"},  {"\\t","HT"},  {"\\n","LF"},  {"__", "VT"},	// 08-0B
		{"__", "NP"},  {"\\r","CR"},  {"__", "SO"},  {"__", "SI"},	// 0C-0F
		{"__", "DLE"}, {"__", "DC1"}, {"__", "DC2"}, {"__", "DC3"},	// 10-13
		{"__", "DC4"}, {"__", "NAK"}, {"__", "SYN"}, {"__", "ETB"},	// 14-17
		{"__", "CAN"}, {"__", "EM"},  {"__", "SUB"}, {"\\e","ESC"},	// 18-1B
		{"__", "FS"},  {"__", "GS"},  {"__", "RS"},  {"__", "US"},	// 1C-1F
		{" ",  "SP"},							// 20
	};
	Fenter("onefile");
	V3 "onefile: Called." V;
	++files;	// Count the files as we read them.
	while ((ci = getc(fp)) != EOF) {
		ch = ci & 0x0FF;
		V5 "Next byte is %02X %02X.",ci,ch V;
		sprintf(uhex8,"%02X",ch);	// Fill in first char in uhex8[] string
		dsc = txt = "";
		bc = 1;				// Bytes/char
		if (ch <= 0x20) {	// ASCII control char or SP
			txt = ctlname[ch].txt;
			dsc = ctlname[ch].dsc;
			ucode = ch;	// Otherwise the previous char's code would be printed
			ext = 0;
		} elsif (ch < 0x7F) {	// ASCII character
			ucode = ch;
			buf[0] = ch;	// The char is its own display text
			txt = buf;
			bi = 1;
			ext = 0;
		} elsif (ch == 0x7F) {	// ASCII DEL
			txt = "**";
			ucode = ch;
			dsc = "DEL";
			ext = 0;
		} else {	// High bit set (ch is 0x80..0xFF): part of a UTF-8 multi-byte char
			txt = "??";
			if ((ch & 0x0E0) == 0xC0) { 	// UTF-8 2-byte flag
				txt = "_2";
				dsc = "U2";
				ucode = ch & 0x01F;
				extchars(bc=2,fp,ch);	// Sets txt = buf and completes ucode
				ext = 0;	// extchars() has dealt with the extension bytes
			} elsif ((ch & 0x0F0) == 0x0E0) {	//UTF-8 3-byte flag
				txt = "_3";
				dsc = "U3";
				ucode = ch & 0x0F;
				extchars(bc=3,fp,ch);
				ext = 0;
			} elsif ((ch & 0x0F8) == 0x0F0) {	//UTF-8 4-byte flag
				txt = "_4";
				dsc = "U4";
				ucode = ch & 0x07;
				extchars(bc=4,fp,ch);
				ext = 0;
			} elsif ((ch & 0x0FC) == 0x0F8) {	//UTF-8 5-byte flag
				txt = "_5";
				dsc = "U5";
				bc = 5;
				ext = bc - 1;	// The lead byte itself is not an extension byte
				bi = 0;
				ucode = ch & 0x03;
				buf[bi++] = ch;
			} elsif ((ch & 0x0FE) == 0x0FC) {	//UTF-8 6-byte flag
				txt = "_6";
				dsc = "U6";
				bc = 6;
				ext = bc - 1;
				bi = 0;
				ucode = ch & 0x01;
				buf[bi++] = ch;
			} elsif ((ch & 0x0FF) == 0x0FE) {	// Shouldn't happen yet ;-)
				txt = "_7";
				dsc = "U7";
				bc = 7;
				ext = bc - 1;
				bi = 0;
				ucode = 0;
				buf[bi++] = ch;
			} elsif ((ch & 0x0C0) == 0x080) {	// UTF-8 extension byte
				// NOTE(review): ucode is not updated here, so the Unicode
				// column repeats whatever the previous line left in it.
				txt = " .";
				dsc = "U.";
				if (ext > 0) {	// Are we expecting extension bytes?
					if (bi < BUF) {
						buf[bi++] = ch;
						--ext;		// Remaining extension bytes expected
					} else {
						V1 "*** Overflow at byte %d with %d bytes left UTF char ***",bi,ext V;
					}
				} else {
					V1 "*** Unexpected extension byte %02X ***",ch V;
				}
			} else {	// 0xFF: neither a lead byte nor an extension byte
				// txt stays "??".  NOTE(review): ucode is left unchanged
				// here too, as there is no code point to report.
				dsc = "???";
				ext = 0;	// Any sequence in progress is broken
			}
		}
		buf[bi] = 0;
		digits = (ucode > 0x0FFFF) ? 6 : 4;
		// Keep uhexbuf[] filled in as before, but bounded by its size: six
		// or more digits plus the NUL must never run past its end.
		snprintf(uhexbuf,sizeof uhexbuf,"%0*X",digits,(unsigned)ucode);
		// Print the code straight from ucode so the column is complete
		// whatever the size of uhexbuf[].
		printf("%db\t%s\t%s\t%0*X\t%s\n",bc,txt,dsc,digits,(unsigned)ucode,uhex8);
		fflush(stdout);
		if (ext < 1) bi = 0;	// Char complete: start buf[] afresh
	}
	Fexit;
	return 0;
}

/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
* Read the extension bytes of a multi-byte UTF-8 char whose first byte, c0, has
* already been read by the caller.
*
* buf[] is restarted with c0, then bytes are read from fp until buf[] holds n
* bytes.  For every accepted extension byte:
*	- the byte is appended to buf[],
*	- " XX" (its hex value) is appended to the string already in uhex8[],
*	- its low 6 bits are shifted into ucode, which the caller must have
*	  preloaded with the payload bits of c0.
*
* Reading stops early on EOF, on a byte that is not of the form 10xxxxxx, or
* when buf[] is full.  In the last two cases the offending byte is pushed back
* onto fp so that the caller still gets to see it.
*
* On return buf[] is NUL-terminated and txt points at it.
*
* Returns the number of bytes now in buf[], counting c0.  This should be n,
* but may be less in the case of malformed UTF-8 data.
*/
int extchars(n,fp,c0)
	int   n;	// Total number of bytes in the char, including c0
	FILE *fp;	// File to read
	int   c0;	// First byte of multi-byte char
{	char *F="extchars";	// Not referenced below; possibly used by the Fenter/V macros
	int   ch, ci;
	Fenter("extchars");
	V5 "Called with n=%d c0=%02X.",n,c0 V;
	bi = 0;
	V3 "1st byte %d in %d-byte UTF char is %02X ***",bi,n,c0 V;
	buf[bi++] = c0;	// Insert first char in buf;
	uhex8p = uhex8 + strlen(uhex8);	// Append after what the caller put there
	while (bi < n) {
		if ((ci = getc(fp)) == EOF) {
			V1 "*** EOF at byte %d in %d-byte UTF char ***",bi,n V;
			Fail;
		}
		ch = B8(ci);	// Presumably masks ci to its low 8 bits -- see local.h
		V3 "Ext byte %d is %2X %02X.",bi,ci,ch V;
		if (bi >= BUF) {	// Buffer overflow
			V1 "*** Overflow at byte %d %2X in %d-byte UTF char ***",bi,ch,n V;
			ungetc(ch,fp);	// Don't lose the byte; let the caller report it
			Fail;
		} elsif ((ch & 0x0C0) != 0x080) {	// Is it a UTF-8 extension byte?
			V1 "*** Bad byte %d in %d-byte UTF char is %02X ***",bi,n,ch V;
			ungetc(ch,fp);	// It starts something else; let the caller read it
			Fail;
		} else {
			V3 "Ext byte %d in %d-byte UTF char is %02X ***",bi,n,ch V;
			buf[bi++] = ch;
			// Bounded append: never write past the end of uhex8[].
			snprintf(uhex8p,sizeof uhex8 - (uhex8p - uhex8)," %02X",ch);
			uhex8p += strlen(uhex8p);
			ucode = (ucode << 6) + (ch & 0x03F);
		}
	}
fail:
	Fexit;
	buf[bi] = 0;
	txt = buf;
	return bi;
}