diff options
Diffstat (limited to 'cscmd/utf8ing.c')
-rw-r--r-- | cscmd/utf8ing.c | 121 |
1 files changed, 121 insertions, 0 deletions
diff --git a/cscmd/utf8ing.c b/cscmd/utf8ing.c new file mode 100644 index 0000000..1d67c79 --- /dev/null +++ b/cscmd/utf8ing.c @@ -0,0 +1,121 @@ + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#include <stdlib.h> +#include <stdio.h> +#include <unistd.h> +#include <string.h> +#include <stdarg.h> +#include <limits.h> +#include <locale.h> +#include <wchar.h> +#include <wctype.h> + +#include <defs.h> +#include <utf8ing.h> + + +static const ucs4_t replacement_char = 0xfffd; +static const ucs4_t maximum_ucs4 = 0x7fffffff; + +static const int half_shift = 10; +static const ucs4_t half_base = 0x0010000; + +static const ucs4_t surrogate_high_start = 0xd800; +static const ucs4_t surrogate_high_end = 0xdbff; +static const ucs4_t surrogate_low_start = 0xdc00; +static const ucs4_t surrogate_low_end = 0xdfff; + +static utf8_t +first_byte_mark[7] = { 0x00, 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc }; + + +/*************************************************************** + static copy_ucs4_to_utf8() + + Переводит строку символов UCS4( src ) в UTF8( dest ). + + Возвращаемое значение: + Количество байт, реально записанное в DEST. + + NOTE: + Выход за пределы памяти, выделенной под указатель DEST + не контролируются. + Подразумевается, что строка SRC имеет null-терминатор. + ***************************************************************/ +int copy_ucs4_to_utf8( utf8_t *dest, const ucs4_t *src ) +{ + utf8_t target[7]; + utf8_t *ptr; + int count = 0; + + while( *src ) + { + ucs4_t c; + int bytes_to_write = 0; + const ucs4_t byte_mask = 0xbf; + const ucs4_t byte_mark = 0x80; + + c = *src++; + + if( c >= surrogate_high_start && + c <= surrogate_high_end && *src ) + { + ucs4_t c2 = *src; + + if( c2 >= surrogate_low_start && + c2 <= surrogate_low_end ) + { + c = ((c - surrogate_high_start) << half_shift) + + (c2 - surrogate_low_start) + half_base; + ++src; + } + } + + if( c < 0x80 ) bytes_to_write = 1; + else if( c < 0x800 ) bytes_to_write = 2; + else if( c < 0x10000 ) bytes_to_write = 3; + else if( c < 0x200000 ) bytes_to_write = 4; + else if( c < 0x4000000 ) bytes_to_write = 5; + else if( c <= maximum_ucs4 ) bytes_to_write = 6; + else + { + bytes_to_write = 2; c = replacement_char; + } + + ptr = &target[0] + bytes_to_write; + + switch( bytes_to_write ) + { + case 6: + *--ptr = (c | byte_mark) & byte_mask; c >>= 6; + case 5: + *--ptr = (c | byte_mark) & byte_mask; c >>= 6; + case 4: + *--ptr = (c | byte_mark) & byte_mask; c >>= 6; + case 3: + *--ptr = (c | byte_mark) & byte_mask; c >>= 6; + case 2: + *--ptr = (c | byte_mark) & byte_mask; c >>= 6; + case 1: + *--ptr = c | first_byte_mark[bytes_to_write]; + } + + ptr = &target[0]; + + while( bytes_to_write > 0 ) + { + *dest++ = *ptr++; /* write byte */ + --bytes_to_write; + ++count; + } + + } /* End while( *src ) */ + + *dest = (utf8_t)0; /* null terminator */ + + return( count ); + +} /* End of static copy_ucs4_to_utf8() */ |