#include #include #include #include #include #include #include #include #include #include #include #include #include /* builtin charmaps */ #include "charmaps.h" /* only 0-7 are valid as dest charset */ #define UTF_16BE 000 #define UTF_16LE 001 #define UTF_32BE 002 #define UTF_32LE 003 #define WCHAR_T 004 #define UTF_8 005 #define US_ASCII 006 #define LATIN_1 007 /* additional charsets with algorithmic conversion */ #define LATIN_9 010 #define TIS_620 011 #define JIS_0201 012 /* some programs like php need this */ int _libiconv_version = _LIBICONV_VERSION; /* these must match the constants above */ static const unsigned char charsets[] = "\005" "UTF-8" "\0" "\004" "WCHAR_T" "\0" "\000" "UTF-16BE" "\0" "\001" "UTF-16LE" "\0" "\002" "UTF-32BE" "\0" "\003" "UTF-32LE" "\0" "\006" "US-ASCII" "\0" "\007" "ISO-8859-1" "\0" "\007" "LATIN1" "\0" "\010" "ISO-8859-15""\0" "\010" "LATIN9" "\0" "\011" "ISO-8859-11""\0" "\011" "TIS-620" "\0" "\012" "JIS-0201" "\0" "\377"; /* separate identifiers for sbcs/dbcs/etc map type */ #define UCS2_8BIT 000 #define UCS3_8BIT 001 #define EUC 002 #define EUC_TW 003 #define SHIFT_JIS 004 #define BIG5 005 #define GBK 006 /* FIXME: these are not implemented yet // EUC: A1-FE A1-FE // GBK: 81-FE 40-7E,80-FE // Big5: A1-FE 40-7E,A1-FE */ static const unsigned short maplen[] = { [UCS2_8BIT] = 4+ 2* 128, [UCS3_8BIT] = 4+ 3* 128, [EUC] = 4+ 2* 94*94, [SHIFT_JIS] = 4+ 2* 94*94, [BIG5] = 4+ 2* 94*157, [GBK] = 4+ 2* 126*190, [EUC_TW] = 4+ 2* 2*94*94, }; static int find_charmap(const char *name) { int i; for (i = 0; i < (sizeof(charmaps) / sizeof(charmaps[0])); i++) if (!strcasecmp(charmaps[i].name, name)) return i; return -1; } static int find_charset(const char *name) { const unsigned char *s; for (s=charsets; *s<0xff && strcasecmp(s+1, name); s+=strlen(s)+1); return *s; } iconv_t iconv_open(const char *to, const char *from) { unsigned f, t; int m; if ((t = find_charset(to)) >= 8) return -1; if ((f = find_charset(from)) < 255) return 0 | (t<<1) | (f<<4); if ((m = find_charmap(from)) > -1) return 1 | (t<<1) | (m<<4); return -1; } int iconv_close(iconv_t cd) { return 0; } static inline wchar_t get_16(const unsigned char *s, int endian) { endian &= 1; return s[endian]<<8 | s[endian^1]; } static inline void put_16(unsigned char *s, wchar_t c, int endian) { endian &= 1; s[endian] = c>>8; s[endian^1] = c; } size_t iconv(iconv_t cd, char **in, size_t *inb, char **out, size_t *outb) { size_t x=0; unsigned char to = (cd>>1)&7; unsigned char from = 255; const unsigned char *map = 0; mbstate_t st = {0}; char tmp[MB_LEN_MAX]; wchar_t c, d; size_t k, l; int err; if (!in || !*in || !*inb) return 0; if (cd & 1) map = charmaps[cd>>4].map; else from = cd>>4; for (; *inb; *in+=l, *inb-=l) { c = *(unsigned char *)*in; l = 1; if (from >= UTF_8 && c < 0x80) goto charok; switch (from) { case WCHAR_T: l = sizeof(wchar_t); if (*inb < l) goto starved; c = *(wchar_t *)*in; break; case UTF_8: l = mbrtowc(&c, *in, *inb, &st); if (!l) l++; else if (l == (size_t)-1) goto ilseq; else if (l == (size_t)-2) goto starved; break; case US_ASCII: goto ilseq; case LATIN_9: if ((unsigned)c - 0xa4 <= 0xbe - 0xa4) { static const unsigned char map[] = { 0, 0x60, 0, 0x61, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x7d, 0, 0, 0, 0x7e, 0, 0, 0, 0x52, 0x53, 0x78 }; if (c == 0xa4) c = 0x20ac; else if (map[c-0xa5]) c = 0x100 | map[c-0xa5]; } case LATIN_1: goto charok; case TIS_620: if (c >= 0xa1) c += 0x0e01-0xa1; goto charok; case JIS_0201: if (c >= 0xa1) { if (c <= 0xdf) c += 0xff61-0xa1; else goto ilseq; } goto charok; case UTF_16BE: case UTF_16LE: l = 2; if (*inb < 2) goto starved; c = get_16(*in, from); if ((unsigned)(c-0xdc00) < 0x400) goto ilseq; if ((unsigned)(c-0xd800) < 0x400) { l = 4; if (*inb < 4) goto starved; d = get_16(*in + 2, from); if ((unsigned)(c-0xdc00) >= 0x400) goto ilseq; c = ((c-0xd800)<<10) | (d-0xdc00); } break; case UTF_32BE: case UTF_32LE: l = 4; if (*inb < 4) goto starved; // FIXME // c = get_32(*in, from); break; default: /* only support ascii supersets */ if (c < 0x80) break; switch (map[0]) { case UCS2_8BIT: c -= 0x80; break; case EUC: if ((unsigned)c - 0xa1 >= 94) goto ilseq; if ((unsigned)in[0][1] - 0xa1 >= 94) goto ilseq; c = (c-0xa1)*94 + (in[0][1]-0xa1); l = 2; break; case SHIFT_JIS: if ((unsigned)c - 0xa1 <= 0xdf-0xa1) { c += 0xff61-0xa1; goto charok; } // FIXME... l = 2; break; default: goto badf; } c = get_16(map + 4 + 2*c, 0); if (c == 0xffff) goto ilseq; goto charok; } if ((unsigned)c - 0xd800 < 0x800 || (unsigned)c >= 0x110000) goto ilseq; charok: switch (to) { case WCHAR_T: if (*outb < sizeof(wchar_t)) goto toobig; *(wchar_t *)*out = c; *out += sizeof(wchar_t); *outb -= sizeof(wchar_t); break; case UTF_8: if (*outb < 4) { k = wctomb(tmp, c); if (*outb < k) goto toobig; memcpy(*out, tmp, k); } else k = wctomb(*out, c); *out += k; *outb -= k; break; case US_ASCII: if (c > 0x7f) c = 0xfffd; /* fall thru and count replacement in latin1 case */ case LATIN_1: if (!*outb) goto toobig; if (c < 0x100) **out = c; else x++, **out = '*'; //FIXME: translit? ++*out; --*outb; break; case UTF_16BE: case UTF_16LE: if (c < 0x10000) { if (*outb < 2) goto toobig; put_16(*out, c, to); *out += 2; *outb -= 2; break; } if (*outb < 4) goto toobig; put_16(*out, (c>>10)|0xd800, to); put_16(*out + 2, (c&0x3ff)|0xdc00, to); *out += 4; *outb -= 4; break; default: goto badf; } } return x; ilseq: err = EILSEQ; x = -1; goto end; badf: err = EBADF; x = -1; goto end; toobig: err = E2BIG; goto end; starved: err = EINVAL; end: errno = err; return x; }