/**
  * Copyright (c) 2009-2010 Mika Laitio <lamikr@pilppa.org>
  *
  * This file and library is covered by the LGPL version 3, read LICENSE for details.
  *
  * History:
  * - utf8.c Nov 25, 2009: small modifications of original files to fit to libcharencoding
  * - Based on the basic UTF-8 manipulation routines
  * by Jeff Bezanson
  * placed in the public domain Fall 2005
  *
  * This code is designed to provide the utilities you need to manipulate
  * UTF-8 as an internal string encoding. These functions do not perform the
  * error checking normally needed when handling UTF-8 data, so if you happen
  * to be from the Unicode Consortium you will want to flay me alive.
  * I do this because error checking can be performed at the boundaries (I/O),
  * with these routines reserved for higher performance on data known to be
  * valid.
  */

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <stdarg.h>
#include <malloc.h>
#include <stdbool.h>

/*
 * Correction values indexed by the number of trailing bytes (0-5) of a
 * UTF-8 sequence. utf8_toucs() accumulates the raw bytes of a sequence
 * with shift-by-6-and-add and then subtracts the matching entry, which
 * removes the marker bits of the lead byte and of every continuation
 * byte (0x80) in one step. For example:
 *   [1] = (0xC0 << 6) + 0x80
 *   [2] = (0xE0 << 12) + (0x80 << 6) + 0x80
 * The entries for 4 and 5 trailing bytes are the same sums reduced to
 * 32 bits.
 */
static const u_int32_t offsetsFromUTF8[6] = {
	0x00000000UL,	/* 0 trailing bytes */
	0x00003080UL,	/* 1 trailing byte  */
	0x000E2080UL,	/* 2 trailing bytes */
	0x03C82080UL,	/* 3 trailing bytes */
	0xFA082080UL,	/* 4 trailing bytes */
	0x82082080UL	/* 5 trailing bytes */
};

/*
 * Number of bytes that follow a given first byte of a UTF-8 sequence,
 * indexed by the value of that first byte (one row per 16 byte values).
 * Bytes 0x80-0xBF map to 0 just like the 0x00-0x7F range, so a stray
 * byte from that range is consumed as a single unit.
 */
static const char trailingBytesForUTF8[256] = {
	/* 0x00-0x0F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x10-0x1F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x20-0x2F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x30-0x3F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x40-0x4F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x50-0x5F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x60-0x6F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x70-0x7F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x80-0x8F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x90-0x9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xA0-0xAF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xB0-0xBF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xC0-0xCF */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xD0-0xDF */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xE0-0xEF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xF0-0xFF */ 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
};

/* Convert UTF-8 bytes into an array of 32-bit code points.

 The encoding itself is not validated: every sequence is decoded purely
 arithmetically from the length announced by its first byte. Legacy 5- and
 6-byte forms are decoded the same way so that the input is always
 consumed. Only the bounds of the buffers are respected.

 dest  = output array
 sz    = dest size in # of wide characters (terminator included)
 src   = UTF-8 input
 srcsz = source size in bytes, or -1 if 0-terminated

 returns # characters converted (the terminator is not counted)

 Conversion stops at the end of the input, when dest is full, or at a
 sequence that is cut short by the end of the input.
 If sz > 0, dest will always be 0-terminated, even if there isn't enough
 room for all the characters. If sz <= 0 nothing is written and 0 is
 returned.
 if sz = srcsz+1 (i.e. 4*srcsz+4 bytes), there will always be enough space.
 */
int utf8_toucs(u_int32_t *dest, int sz, char *src, int srcsz)
{
	const bool bounded = (srcsz != -1);
	int remaining = srcsz;	/* unread input bytes, only used when bounded */
	u_int32_t ch;
	int nb;
	int k;
	int i = 0;

	/* Without room for the terminator not even dest[0] may be written. */
	if (sz <= 0)
		return 0;

	while (i < sz - 1) {
		/* Detect the end of the input before reading the first byte. */
		if (bounded ? (remaining <= 0) : (*src == 0))
			break;
		nb = trailingBytesForUTF8[(unsigned char) *src];
		if (bounded) {
			/* The sequence needs nb + 1 bytes; stop if they are not there. */
			if (nb >= remaining)
				break;
		}
		else {
			/* Stop if the terminator appears inside the sequence. */
			for (k = 1; k <= nb && src[k] != 0; k++)
				;
			if (k <= nb)
				break;
		}
		/* Shift-and-add all bytes, then strip the marker bits in one go. */
		ch = 0;
		for (k = 0; k < nb; k++) {
			ch += (unsigned char) *src++;
			ch <<= 6;
		}
		ch += (unsigned char) *src++;
		ch -= offsetsFromUTF8[nb];
		if (bounded)
			remaining -= nb + 1;
		dest[i++] = ch;
	}
	dest[i] = 0;
	return i;
}

/* Format a UTF-8 string printf-style and print it to stdout as wide characters.

 fmt = printf-style format string producing UTF-8 text
 ap  = arguments for fmt; the caller remains responsible for va_end()

 The text is formatted with vsnprintf(), converted with utf8_toucs() and
 written with printf("%ls").

 returns # characters converted by utf8_toucs(), or -1 if formatting or
 memory allocation failed (nothing is printed in that case)
 */
int utf8_vprintf(char *fmt, va_list ap)
{
	char		fixed[512];
	char		*buf;
	u_int32_t	*wcs;
	va_list		probe;
	int		len;
	int		cnt;

	buf	= fixed;
	wcs	= NULL;
	cnt	= -1;

	/* A va_list may be traversed only once, so the first pass uses a copy. */
	va_copy(probe, ap);
	len	= vsnprintf(fixed, sizeof(fixed), fmt, probe);
	va_end(probe);
	if (len < 0)
		return -1;

	if ((size_t) len >= sizeof(fixed)) {
		/* Output was truncated: format again into a buffer of the exact size. */
		buf = (char*) malloc((size_t) len + 1);
		if (buf == NULL)
			return -1;
		if (vsnprintf(buf, (size_t) len + 1, fmt, ap) != len)
			goto out;
	}

	/* len bytes of UTF-8 never yield more than len characters. */
	wcs	= (u_int32_t*) malloc(((size_t) len + 1) * sizeof(*wcs));
	if (wcs == NULL)
		goto out;
	cnt	= utf8_toucs(wcs, len + 1, buf, len);
	/* NOTE: the cast assumes wchar_t has the same size as u_int32_t. */
	printf("%ls", (wchar_t*) wcs);
out:
	free(wcs);
	if (buf != fixed)
		free(buf);
	return cnt;
}