#ifdef HAVE_CONFIG_H #include #endif #include #include #include #include #include #include #include #include #include #include #include static const ucs4_t replacement_char = 0xfffd; static const ucs4_t maximum_ucs4 = 0x7fffffff; static const int half_shift = 10; static const ucs4_t half_base = 0x0010000; static const ucs4_t surrogate_high_start = 0xd800; static const ucs4_t surrogate_high_end = 0xdbff; static const ucs4_t surrogate_low_start = 0xdc00; static const ucs4_t surrogate_low_end = 0xdfff; static utf8_t first_byte_mark[7] = { 0x00, 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc }; /*************************************************************** static copy_ucs4_to_utf8() Переводит строку символов UCS4( src ) в UTF8( dest ). Возвращаемое значение: Количество байт, реально записанное в DEST. NOTE: Выход за пределы памяти, выделенной под указатель DEST не контролируются. Подразумевается, что строка SRC имеет null-терминатор. ***************************************************************/ int copy_ucs4_to_utf8( utf8_t *dest, const ucs4_t *src ) { utf8_t target[7]; utf8_t *ptr; int count = 0; while( *src ) { ucs4_t c; int bytes_to_write = 0; const ucs4_t byte_mask = 0xbf; const ucs4_t byte_mark = 0x80; c = *src++; if( c >= surrogate_high_start && c <= surrogate_high_end && *src ) { ucs4_t c2 = *src; if( c2 >= surrogate_low_start && c2 <= surrogate_low_end ) { c = ((c - surrogate_high_start) << half_shift) + (c2 - surrogate_low_start) + half_base; ++src; } } if( c < 0x80 ) bytes_to_write = 1; else if( c < 0x800 ) bytes_to_write = 2; else if( c < 0x10000 ) bytes_to_write = 3; else if( c < 0x200000 ) bytes_to_write = 4; else if( c < 0x4000000 ) bytes_to_write = 5; else if( c <= maximum_ucs4 ) bytes_to_write = 6; else { bytes_to_write = 2; c = replacement_char; } ptr = &target[0] + bytes_to_write; switch( bytes_to_write ) { case 6: *--ptr = (c | byte_mark) & byte_mask; c >>= 6; case 5: *--ptr = (c | byte_mark) & byte_mask; c >>= 6; case 4: *--ptr = (c | byte_mark) & byte_mask; c >>= 6; case 3: *--ptr = (c | byte_mark) & byte_mask; c >>= 6; case 2: *--ptr = (c | byte_mark) & byte_mask; c >>= 6; case 1: *--ptr = c | first_byte_mark[bytes_to_write]; } ptr = &target[0]; while( bytes_to_write > 0 ) { *dest++ = *ptr++; /* write byte */ --bytes_to_write; ++count; } } /* End while( *src ) */ *dest = (utf8_t)0; /* null terminator */ return( count ); } /* End of static copy_ucs4_to_utf8() */