src/internal/utf8.c

   1 /**
   2   * Copyright (c) 2009-2010 Mika Laitio <lamikr@pilppa.org>
   3   *
   4   * This file and library is covered by the LGPL version 3, read LICENSE for details.
   5   *
   6   * History:
   7   * - utf8.c Nov 25, 2009: small modifications of original files to fit to libcharencoding
   8   * - Based on the basic UTF-8 manipulation routines
   9   * by Jeff Bezanson
  10   * placed in the public domain Fall 2005
  11   *
  12   * This code is designed to provide the utilities you need to manipulate
  13   * UTF-8 as an internal string encoding. These functions do not perform the
  14   * error checking normally needed when handling UTF-8 data, so if you happen
  15   * to be from the Unicode Consortium you will want to flay me alive.
  16   * I do this because error checking can be performed at the boundaries (I/O),
  17   * with these routines reserved for higher performance on data known to be
  18   * valid.
  19   */
  20
  21 #include <stdlib.h>
  22 #include <stdio.h>
  23 #include <string.h>
  24 #include <stdarg.h>
  25 #include <malloc.h>
  26 #include <stdbool.h>
  27
  28 static const u_int32_t offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 0x03C82080UL, 0xFA082080UL,
  29                 0x82082080UL };
  30
  31 static const char trailingBytesForUTF8[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  32                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  33                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  34                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  35                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  36                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
  37                 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2,
  38                 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 };
  39
  40 /* conversions without error checking
  41  only works for valid UTF-8, i.e. no 5- or 6-byte sequences
  42  srcsz = source size in bytes, or -1 if 0-terminated
  43  sz = dest size in # of wide characters
  44
  45  returns # characters converted
  46  dest will always be L'\0'-terminated, even if there isn't enough room
  47  for all the characters.
  48  if sz = srcsz+1 (i.e. 4*srcsz+4 bytes), there will always be enough space.
  49  */
  50 int utf8_toucs(u_int32_t *dest, int sz, char *src, int srcsz)
  51 {
  52         u_int32_t ch;
  53         char *src_end = src + srcsz;
  54         int nb;
  55         int i = 0;
  56
  57         while (i < sz - 1) {
  58                 nb = trailingBytesForUTF8[(unsigned char) *src];
  59                 if (srcsz == -1) {
  60                         if (*src == 0)
  61                                 break;
  62                 }
  63                 else {
  64                         if (src + nb >= src_end)
  65                                 break;
  66                 }
  67                 ch = 0;
  68                 switch (nb) {
  69                 /* these fall through deliberately */
  70                 case 3:
  71                         ch += (unsigned char) *src++;
  72                         ch <<= 6;
  73                 case 2:
  74                         ch += (unsigned char) *src++;
  75                         ch <<= 6;
  76                 case 1:
  77                         ch += (unsigned char) *src++;
  78                         ch <<= 6;
  79                 case 0:
  80                         ch += (unsigned char) *src++;
  81                 }
  82                 ch -= offsetsFromUTF8[nb];
  83                 dest[i++] = ch;
  84         }
  85         dest[i] = 0;
  86         return i;
  87 }
  88
  89 int utf8_vprintf(char *fmt, va_list ap)
  90 {
  91         int             cnt;
  92         int             sz;
  93         char            *buf;
  94         u_int32_t       *wcs;
  95         bool            do_loop;
  96
  97         cnt     = 0;
  98         sz      = 512;
  99         buf     = (char*) alloca(sz);
 100         do_loop = true;
 101         while (do_loop) {
 102                 cnt = vsnprintf(buf, sz, fmt, ap);
 103                 if (cnt >= sz) {
 104                         buf = (char*) alloca(cnt - sz + 1);
 105                         sz = cnt + 1;
 106                 }
 107                 else {
 108                         do_loop = false;
 109                 }
 110         }
 111         wcs     = (u_int32_t*) alloca((cnt + 1) * sizeof(u_int32_t));
 112         cnt     = utf8_toucs(wcs, cnt + 1, buf, cnt);
 113         printf("%ls", (wchar_t*) wcs);
 114         return cnt;
 115 }