/*
* The UTF-FSS (aka UTF-2) encoding of UCS, as described in the following
* quote from Ken Thompson's utf-fss.c:
*
* Bits Hex Min Hex Max Byte Sequence in Binary
* 7 00000000 0000007f 0vvvvvvv
* 11 00000080 000007FF 110vvvvv 10vvvvvv
* 16 00000800 0000FFFF 1110vvvv 10vvvvvv 10vvvvvv
* 21 00010000 001FFFFF 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
* 26 00200000 03FFFFFF 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
* 31 04000000 7FFFFFFF 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
*
* The UCS value is just the concatenation of the v bits in the multibyte
* encoding. When there are multiple ways to encode a value, for example
* UCS 0, only the shortest encoding is legal.
*/
/* This implementation does not enforce the shortest-encoding restriction on input: overlong sequences are decoded as if they were legal. */
#define CODE_ERROR 0x80
Char
utf_getc(f)
reg FILE *f;
{
reg int c;
reg Char wc;
reg int extras;
reg unsigned bit;
if ((c = getc(f)) == EOF)
return EOF;
if ((c & 0x80) == 0) /* ASCII character */
return c;
if ((c & 0xc0) == 0x80) /* unexpected tail character */
return CODE_ERROR;
/* how many extra bytes? */
extras = 1;
for (bit = 0x20; (c & bit) != 0; bit >>= 1)
extras++;
if (extras > 5)
return CODE_ERROR;
/* put all the bits together */
wc = c & (bit-1);
while (extras-- > 0) {
if ((c = getc(f)) == EOF)
return EOF;
if ((c & 0xc0) != 0x80) { /* unexpected head character */
ungetc(c, f);
return CODE_ERROR;
}
wc = (wc<<6) | c&0x3f;
}
return wc;
}