summaryrefslogtreecommitdiff
path: root/cscmd/utf8ing.c
diff options
context:
space:
mode:
authorkx <kx@radix.pro>2023-03-24 02:53:04 +0300
committerkx <kx@radix.pro>2023-03-24 02:53:04 +0300
commit12c7b1c5658602269da2f5b75835ec0f5fab8890 (patch)
tree93f6f6b85830af69743d5ebda902d4305bf23f4f /cscmd/utf8ing.c
parent4e72ffe940d9aff7c019d37a6459e765902c1fae (diff)
downloadcscm-trunk.tar.xz
Version 0.1.4HEADcscm-0.1.4trunk
Diffstat (limited to 'cscmd/utf8ing.c')
-rw-r--r--cscmd/utf8ing.c121
1 files changed, 121 insertions, 0 deletions
diff --git a/cscmd/utf8ing.c b/cscmd/utf8ing.c
new file mode 100644
index 0000000..1d67c79
--- /dev/null
+++ b/cscmd/utf8ing.c
@@ -0,0 +1,121 @@
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+#include <stdarg.h>
+#include <limits.h>
+#include <locale.h>
+#include <wchar.h>
+#include <wctype.h>
+
+#include <defs.h>
+#include <utf8ing.h>
+
+
+/* Code point substituted for values that cannot be encoded (U+FFFD). */
+static const ucs4_t replacement_char = 0xfffd;
+/* Largest value copy_ucs4_to_utf8() encodes (as a six-byte sequence). */
+static const ucs4_t maximum_ucs4 = 0x7fffffff;
+
+/*
+ * Surrogate pair arithmetic: the high half is shifted left by half_shift
+ * bits, the low half is added, and the result is offset by half_base.
+ */
+static const int half_shift = 10;
+static const ucs4_t half_base = 0x0010000;
+
+/* Inclusive ranges of the high (leading) and low (trailing) surrogates. */
+static const ucs4_t surrogate_high_start = 0xd800;
+static const ucs4_t surrogate_high_end = 0xdbff;
+static const ucs4_t surrogate_low_start = 0xdc00;
+static const ucs4_t surrogate_low_end = 0xdfff;
+
+/*
+ * Bits OR-ed into the first byte of a sequence, indexed by the total
+ * sequence length in bytes (1..6); index 0 is unused.
+ * Read-only lookup table, hence const.
+ */
+static const utf8_t
+first_byte_mark[7] = { 0x00, 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc };
+
+
+/***************************************************************
+  copy_ucs4_to_utf8()
+
+  Converts the null-terminated UCS-4 string SRC to UTF-8 and
+  stores the result, followed by a null terminator, in DEST.
+
+  A high surrogate (0xd800..0xdbff) immediately followed by a
+  low surrogate (0xdc00..0xdfff) is combined into a single code
+  point before encoding; an unpaired surrogate is encoded as is.
+  Values up to maximum_ucs4 are encoded as 1..6 byte sequences;
+  any larger value is replaced by replacement_char (U+FFFD).
+
+  Return value:
+    Number of bytes written to DEST, not counting the null
+    terminator. 0 if DEST or SRC is NULL.
+
+  NOTE:
+    Writes past the end of the memory behind DEST are not
+    detected: the caller must provide room for up to 6 bytes
+    per element of SRC plus one byte for the terminator.
+    SRC is expected to be null-terminated.
+ ***************************************************************/
+int copy_ucs4_to_utf8( utf8_t *dest, const ucs4_t *src )
+{
+  const ucs4_t byte_mask = 0xbf;
+  const ucs4_t byte_mark = 0x80;
+  int count = 0;
+
+  if( !dest )
+  {
+    return( 0 );
+  }
+  if( !src )
+  {
+    *dest = (utf8_t)0; /* leave a valid empty string behind */
+    return( 0 );
+  }
+
+  while( *src )
+  {
+    ucs4_t  c = *src++;
+    utf8_t *ptr;
+    int     bytes_to_write;
+
+    /* Merge a high/low surrogate pair into one code point. */
+    if( c >= surrogate_high_start &&
+        c <= surrogate_high_end && *src )
+    {
+      ucs4_t c2 = *src;
+
+      if( c2 >= surrogate_low_start &&
+          c2 <= surrogate_low_end )
+      {
+        c = ((c - surrogate_high_start) << half_shift) +
+            (c2 - surrogate_low_start) + half_base;
+        ++src;
+      }
+    }
+
+    if( c < 0x80 )               bytes_to_write = 1;
+    else if( c < 0x800 )         bytes_to_write = 2;
+    else if( c < 0x10000 )       bytes_to_write = 3;
+    else if( c < 0x200000 )      bytes_to_write = 4;
+    else if( c < 0x4000000 )     bytes_to_write = 5;
+    else if( c <= maximum_ucs4 ) bytes_to_write = 6;
+    else
+    {
+      /*
+       * U+FFFD lies in 0x800..0xffff and therefore needs three
+       * bytes; encoding it with two produced an invalid sequence.
+       */
+      bytes_to_write = 3; c = replacement_char;
+    }
+
+    /*
+     * Fill the sequence from its last byte backwards: every
+     * continuation byte carries the low six bits of what is left
+     * of C, the first byte carries the rest plus the length mark.
+     */
+    ptr = dest + bytes_to_write;
+
+    switch( bytes_to_write )
+    {
+      case 6:
+        *--ptr = (utf8_t)((c | byte_mark) & byte_mask); c >>= 6;
+        /* fall through */
+      case 5:
+        *--ptr = (utf8_t)((c | byte_mark) & byte_mask); c >>= 6;
+        /* fall through */
+      case 4:
+        *--ptr = (utf8_t)((c | byte_mark) & byte_mask); c >>= 6;
+        /* fall through */
+      case 3:
+        *--ptr = (utf8_t)((c | byte_mark) & byte_mask); c >>= 6;
+        /* fall through */
+      case 2:
+        *--ptr = (utf8_t)((c | byte_mark) & byte_mask); c >>= 6;
+        /* fall through */
+      case 1:
+        *--ptr = (utf8_t)(c | first_byte_mark[bytes_to_write]);
+        break;
+      default:
+        break; /* not reached: bytes_to_write is always 1..6 */
+    }
+
+    dest  += bytes_to_write;
+    count += bytes_to_write;
+
+  } /* End while( *src ) */
+
+  *dest = (utf8_t)0; /* null terminator */
+
+  return( count );
+
+} /* End of copy_ucs4_to_utf8() */