+/**
+ * Copyright (c) 2009-2010 Mika Laitio <lamikr@pilppa.org>
+ *
+ * This file and library is covered by the LGPL version 3, read LICENSE for details.
+ *
+ * History:
+ * - utf8.c Nov 25, 2009: small modifications of original files to fit to libcharencoding
+ * - Based on the basic UTF-8 manipulation routines
+ * by Jeff Bezanson
+ * placed in the public domain Fall 2005
+ *
+ * This code is designed to provide the utilities you need to manipulate
+ * UTF-8 as an internal string encoding. These functions do not perform the
+ * error checking normally needed when handling UTF-8 data, so if you happen
+ * to be from the Unicode Consortium you will want to flay me alive.
+ * I do this because error checking can be performed at the boundaries (I/O),
+ * with these routines reserved for higher performance on data known to be
+ * valid.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdarg.h>
+#include <malloc.h>
+#include <stdbool.h>
+
+/*
+ * Correction subtracted by utf8_toucs() once it has summed the raw bytes of
+ * one sequence, indexed by the number of trailing bytes. Each entry is the
+ * combined contribution of the lead-byte prefix bits and of the 0x80 marker
+ * carried by every trailing byte, e.g. 0x00003080 == (0xC0 << 6) + 0x80.
+ */
+static const u_int32_t offsetsFromUTF8[6] = {
+    0x00000000UL, /* 0 trailing bytes */
+    0x00003080UL, /* 1 trailing byte  */
+    0x000E2080UL, /* 2 trailing bytes */
+    0x03C82080UL, /* 3 trailing bytes */
+    0xFA082080UL, /* 4 trailing bytes */
+    0x82082080UL  /* 5 trailing bytes */
+};
+
+/*
+ * Number of bytes that follow a given first byte within one sequence,
+ * indexed by the value of that first byte (16 entries per row).
+ */
+static const char trailingBytesForUTF8[256] = {
+    /* 0x00 - 0xBF */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    /* 0xC0 - 0xDF */
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    /* 0xE0 - 0xEF */
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    /* 0xF0 - 0xFF */
+    3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
+};
+
+/*
+ * Convert UTF-8 text into an array of 32-bit character values.
+ *
+ * The byte values are combined arithmetically without validation, so the
+ * result is only meaningful for well-formed input. Malformed input yields
+ * unspecified character values, but the function never reads beyond the
+ * source data nor writes beyond dest[sz - 1].
+ *
+ * dest   output array
+ * sz     capacity of dest in # of wide characters, terminator included
+ * src    UTF-8 input
+ * srcsz  source size in bytes, or -1 if src is 0-terminated
+ *
+ * returns # characters converted (the terminator is not counted)
+ * dest is always 0-terminated when sz > 0, even if there isn't enough room
+ * for all the characters; when sz <= 0 nothing is written and 0 is returned.
+ * if sz = srcsz+1 (i.e. 4*srcsz+4 bytes), there will always be enough space.
+ *
+ * A sequence that is cut off by the end of the input (by srcsz, or by the
+ * terminator when srcsz is -1) is dropped and ends the conversion.
+ */
+int utf8_toucs(u_int32_t *dest, int sz, char *src, int srcsz)
+{
+    const bool nul_terminated = (srcsz == -1);
+    const char *src_end = NULL;
+    u_int32_t ch;
+    int nb;
+    int k;
+    int i = 0;
+
+    /* Without room for the terminator there is no element we may touch. */
+    if (sz <= 0)
+        return 0;
+
+    /*
+     * Only form an end pointer for a real length: src + (-1) would point
+     * before the buffer. Other negative sizes behave like an empty source.
+     */
+    if (!nul_terminated)
+        src_end = src + (srcsz > 0 ? srcsz : 0);
+
+    while (i < sz - 1) {
+        if (nul_terminated) {
+            if (*src == 0)
+                break;
+            nb = trailingBytesForUTF8[(unsigned char) *src];
+            /*
+             * A terminator inside the sequence means the input is
+             * truncated; consuming it would walk past the end of the string.
+             */
+            for (k = 1; k <= nb && src[k] != 0; k++)
+                ;
+            if (k <= nb)
+                break;
+        }
+        else {
+            /* Test the position before dereferencing it. */
+            if (src >= src_end)
+                break;
+            nb = trailingBytesForUTF8[(unsigned char) *src];
+            /* Same test as src + nb >= src_end, without overshooting. */
+            if (nb >= src_end - src)
+                break;
+        }
+
+        /*
+         * Sum the first byte and every trailing byte, shifting by 6 bits
+         * between them. All lengths from the table (0..5 trailing bytes)
+         * are consumed so that src always advances.
+         */
+        ch = 0;
+        for (k = 0; k < nb; k++) {
+            ch += (unsigned char) *src++;
+            ch <<= 6;
+        }
+        ch += (unsigned char) *src++;
+
+        /* Strip the prefix bits that were summed in along with the payload. */
+        ch -= offsetsFromUTF8[nb];
+        dest[i++] = ch;
+    }
+    dest[i] = 0;
+    return i;
+}
+
+/*
+ * printf-style output of a UTF-8 format string and arguments.
+ *
+ * The text is formatted with vsnprintf(), converted to 32-bit characters
+ * with utf8_toucs() and written to stdout through printf("%ls").
+ *
+ * fmt  printf-style format string
+ * ap   arguments for fmt; as with other v*printf functions the caller must
+ *      not use ap again after this call without re-initialising it
+ *
+ * returns # characters converted by utf8_toucs(), or -1 if formatting or
+ * memory allocation failed (nothing is printed in that case).
+ */
+int utf8_vprintf(char *fmt, va_list ap)
+{
+    enum { STACK_CAP = 512 };
+    char stack_buf[STACK_CAP];
+    u_int32_t stack_wcs[STACK_CAP];
+    char *buf = stack_buf;
+    u_int32_t *wcs = stack_wcs;
+    va_list probe;
+    size_t need;
+    int again;
+    int cnt;
+    int ret = -1;
+
+    /*
+     * Format into the stack buffer using a copy of ap: a va_list that has
+     * been handed to vsnprintf() must not be traversed a second time.
+     */
+    va_copy(probe, ap);
+    cnt = vsnprintf(stack_buf, sizeof stack_buf, fmt, probe);
+    va_end(probe);
+    if (cnt < 0)
+        return -1;
+
+    if (cnt >= STACK_CAP) {
+        /*
+         * Output was truncated; cnt is the full length. Allocate buffers
+         * for the whole text (not just the missing part) and format again.
+         */
+        need = (size_t) cnt + 1;
+        if (need > (size_t) -1 / sizeof *wcs)
+            goto out;
+        buf = malloc(need);
+        wcs = malloc(need * sizeof *wcs);
+        if (buf == NULL || wcs == NULL)
+            goto out;
+        again = vsnprintf(buf, need, fmt, ap);
+        if (again < 0 || (size_t) again >= need)
+            goto out;
+        cnt = again;
+    }
+
+    /* cnt + 1 elements always suffice: at most one character per byte. */
+    ret = utf8_toucs(wcs, cnt + 1, buf, cnt);
+
+    /*
+     * NOTE(review): the cast assumes wchar_t has the same size and
+     * representation as u_int32_t, and "%ls" converts through the current
+     * locale - confirm both hold on every supported platform.
+     */
+    printf("%ls", (wchar_t*) wcs);
+
+out:
+    if (buf != stack_buf)
+        free(buf);
+    if (wcs != stack_wcs)
+        free(wcs);
+    return ret;
+}