2 * Copyright (c) 2009-2010 Mika Laitio <lamikr@pilppa.org>
4 * This file and library is covered by the LGPL version 3, read LICENSE for details.
7 * - utf8.c Nov 25, 2009: small modifications of the original files to fit into libcharencoding
8 * - Based on the basic UTF-8 manipulation routines
10 * placed in the public domain Fall 2005
12 * This code is designed to provide the utilities you need to manipulate
13 * UTF-8 as an internal string encoding. These functions do not perform the
14 * error checking normally needed when handling UTF-8 data, so if you happen
15 * to be from the Unicode Consortium you will want to flay me alive.
16 * I do this because error checking can be performed at the boundaries (I/O),
17 * with these routines reserved for higher performance on data known to be
/* Per-sequence-length constants, indexed by the number of trailing bytes
   (nb); utf8_toucs subtracts offsetsFromUTF8[nb] from the value it has
   accumulated from the raw bytes.
   NOTE(review): the end of this initializer is not visible in this view. */
28 static const u_int32_t offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 0x03C82080UL, 0xFA082080UL,
/*
 * Number of trailing bytes that follow a UTF-8 lead byte, indexed by the
 * value of that lead byte (cast to unsigned char before indexing).
 *
 * Bytes 0x00-0xBF map to 0, 0xC0-0xDF to 1, 0xE0-0xEF to 2 and 0xF0-0xF7
 * to 3. The entries for 0xF8-0xFF (4 and 5) describe the obsolete 5- and
 * 6-byte forms, which are not valid UTF-8.
 */
static const char trailingBytesForUTF8[256] = {
	/* 0x00-0x0F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x10-0x1F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x20-0x2F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x30-0x3F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x40-0x4F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x50-0x5F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x60-0x6F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x70-0x7F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x80-0x8F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x90-0x9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xA0-0xAF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xB0-0xBF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xC0-0xCF */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xD0-0xDF */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xE0-0xEF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xF0-0xFF */ 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
};
40 /* conversions without error checking
41 only works for valid UTF-8, i.e. no 5- or 6-byte sequences
42 srcsz = source size in bytes, or -1 if 0-terminated
43 sz = dest size in # of wide characters
45 returns # characters converted
46 dest will always be L'\0'-terminated, even if there isn't enough room
47 for all the characters.
48 if sz = srcsz+1 (i.e. 4*srcsz+4 bytes), there will always be enough space.
/* Decodes UTF-8 bytes from src into 32-bit code points written to dest.
   NOTE(review): only fragments of the body are visible in this view (the
   loop, the switch labels and the return are not), so the notes below
   describe the visible statements only. */
50 int utf8_toucs(u_int32_t *dest, int sz, char *src, int srcsz)
/* One past the last source byte. NOTE(review): when srcsz is -1 this is
   src - 1, so it presumably must not be consulted in that mode - confirm. */
53 char *src_end = src + srcsz;
/* nb = table entry for the current lead byte (count of trailing bytes). */
58 nb = trailingBytesForUTF8[(unsigned char) *src];
/* True when the lead byte plus its nb trailing bytes would extend past the
   end of the source buffer. */
64 if (src + nb >= src_end)
69 /* these fall through deliberately */
/* Each of the four statements below adds the next source byte to ch and
   advances src by one. */
71 ch += (unsigned char) *src++;
74 ch += (unsigned char) *src++;
77 ch += (unsigned char) *src++;
80 ch += (unsigned char) *src++;
/* Subtract the constant that offsetsFromUTF8 holds for this value of nb. */
82 ch -= offsetsFromUTF8[nb];
/* Formats fmt/ap with vsnprintf, converts the result with utf8_toucs and
   prints it through printf("%ls").
   NOTE(review): only part of the body is visible in this view. */
89 int utf8_vprintf(char *fmt, va_list ap)
/* Stack-allocated formatting buffer of sz bytes. */
99 buf = (char*) alloca(sz);
/* cnt receives vsnprintf's result for formatting into buf. */
102 cnt = vsnprintf(buf, sz, fmt, ap);
/* NOTE(review): this requests only cnt - sz + 1 bytes and overwrites buf with
   the new pointer; it looks like it relies on the new alloca region being
   adjacent to the first one - confirm, alloca makes no such guarantee. */
104 buf = (char*) alloca(cnt - sz + 1);
/* Room for cnt code points plus one terminating element. */
111 wcs = (u_int32_t*) alloca((cnt + 1) * sizeof(u_int32_t));
/* Decode cnt bytes of buf into wcs; cnt becomes utf8_toucs's return value. */
112 cnt = utf8_toucs(wcs, cnt + 1, buf, cnt);
/* NOTE(review): the cast assumes wchar_t has the same width as u_int32_t -
   confirm for every target platform. */
113 printf("%ls", (wchar_t*) wcs);