/**
 * Copyright (c) 2009-2010 Mika Laitio
 *
 * This file and library is covered by the LGPL version 3, read LICENSE for details.
 *
 * History:
 * - utf8.c Nov 25, 2009: small modifications of original files to fit to libcharencoding
 * - Based on the basic UTF-8 manipulation routines
 *   by Jeff Bezanson
 *   placed in the public domain Fall 2005
 *
 * This code is designed to provide the utilities you need to manipulate
 * UTF-8 as an internal string encoding. These functions do not perform the
 * error checking normally needed when handling UTF-8 data, so if you happen
 * to be from the Unicode Consortium you will want to flay me alive.
 * I do this because error checking can be performed at the boundaries (I/O),
 * with these routines reserved for higher performance on data known to be
 * valid.
 */
#include <stdarg.h>     /* va_list, va_copy, va_end */
#include <stdbool.h>    /* bool */
#include <stdio.h>      /* vsnprintf, printf */
#include <stdlib.h>     /* malloc, free */
#include <sys/types.h>  /* u_int32_t */
#include <wchar.h>      /* wchar_t */

/*
 * Value subtracted from the raw accumulated bytes of a sequence to strip the
 * UTF-8 marker bits. Indexed by the number of trailing bytes (0..5).
 */
static const u_int32_t offsetsFromUTF8[6] = {
    0x00000000UL, 0x00003080UL, 0x000E2080UL,
    0x03C82080UL, 0xFA082080UL, 0x82082080UL
};

/*
 * Number of bytes that follow a given lead byte. Indexed by the lead byte
 * value (0..255).
 */
static const char trailingBytesForUTF8[256] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
};

/*
 * Convert UTF-8 to 32-bit code points without validating the input.
 *
 * Only meaningful for valid UTF-8, i.e. no 5- or 6-byte sequences; such
 * legacy lead bytes are still consumed as a whole sequence so that the
 * decoder always makes progress.
 *
 *   dest  = output array of code points
 *   sz    = dest size in # of wide characters (including the terminator)
 *   src   = UTF-8 input
 *   srcsz = source size in bytes, or -1 if src is 0-terminated
 *
 * Returns the # of characters converted. When sz > 0, dest is always
 * 0-terminated, even if there isn't enough room for all the characters.
 * When sz <= 0 (or dest is NULL) nothing is written and 0 is returned.
 * If sz = srcsz+1 (i.e. 4*srcsz+4 bytes), there will always be enough space.
 *
 * Conversion stops early, without consuming it, at a sequence that is cut
 * short by the end of the input (or by the terminator in 0-terminated mode).
 */
int utf8_toucs(u_int32_t *dest, int sz, char *src, int srcsz)
{
    const unsigned char *p = (const unsigned char *) src;
    const unsigned char *end = NULL;
    const bool zero_terminated = (srcsz == -1);
    int i = 0;

    /* No room even for the terminator: never touch dest. */
    if (dest == NULL || sz <= 0)
        return 0;

    if (p == NULL) {
        dest[0] = 0;
        return 0;
    }

    /* Any other negative size is treated as empty input. */
    if (!zero_terminated)
        end = p + (srcsz > 0 ? srcsz : 0);

    while (i < sz - 1) {
        u_int32_t ch = 0;
        int nb;
        int k;

        /* Check for end of input BEFORE looking at the lead byte. */
        if (zero_terminated) {
            if (*p == 0)
                break;
        } else if (p >= end) {
            break;
        }

        nb = trailingBytesForUTF8[*p];

        if (zero_terminated) {
            /* A terminator inside the sequence means it is truncated;
               stop instead of decoding past the end of the string. */
            for (k = 1; k <= nb; k++) {
                if (p[k] == 0)
                    break;
            }
            if (k <= nb)
                break;
        } else if (nb >= end - p) {
            /* The last byte of the sequence would lie outside the buffer. */
            break;
        }

        /* Accumulate lead and trailing bytes, 6 bits of shift per byte. */
        for (k = 0; k < nb; k++) {
            ch += *p++;
            ch <<= 6;
        }
        ch += *p++;

        /* Remove the marker bits contributed by all bytes at once. */
        ch -= offsetsFromUTF8[nb];
        dest[i++] = ch;
    }

    dest[i] = 0;
    return i;
}

/*
 * Format a UTF-8 string printf-style, convert the result to wide characters
 * and print it to stdout with "%ls".
 *
 *   fmt = printf-style format string (UTF-8)
 *   ap  = arguments for fmt
 *
 * Returns the # of characters converted, or -1 if formatting or memory
 * allocation fails (nothing is printed in that case).
 */
int utf8_vprintf(char *fmt, va_list ap)
{
    enum { STACK_CHARS = 512 };
    char stack_buf[STACK_CHARS];
    u_int32_t stack_wcs[STACK_CHARS];
    char *buf = stack_buf;
    u_int32_t *wcs = stack_wcs;
    va_list probe;
    int cnt;
    int ret = -1;

    /* A va_list may only be traversed once, so the first attempt works on a
       copy and leaves ap untouched for a possible second pass. */
    va_copy(probe, ap);
    cnt = vsnprintf(stack_buf, sizeof stack_buf, fmt, probe);
    va_end(probe);
    if (cnt < 0)
        return -1;

    if (cnt >= STACK_CHARS) {
        /* Output did not fit: allocate the full required size on the heap. */
        buf = malloc((size_t) cnt + 1);
        wcs = malloc(((size_t) cnt + 1) * sizeof *wcs);
        if (buf == NULL || wcs == NULL)
            goto out;
        if (vsnprintf(buf, (size_t) cnt + 1, fmt, ap) != cnt)
            goto out;
    }

    ret = utf8_toucs(wcs, cnt + 1, buf, cnt);
    /* NOTE(review): this cast assumes wchar_t is a 32-bit type holding code
       points; confirm for every target platform. */
    printf("%ls", (wchar_t *) wcs);

out:
    if (buf != stack_buf)
        free(buf);
    if (wcs != stack_wcs)
        free(wcs);
    return ret;
}