Version 0.1.4HEAD cscm-0.1.4 trunk

author: kx <kx@radix.pro> 2023-03-24 02:53:04 +0300
committer: kx <kx@radix.pro> 2023-03-24 02:53:04 +0300
commit: 12c7b1c5658602269da2f5b75835ec0f5fab8890 (patch)
tree: 93f6f6b85830af69743d5ebda902d4305bf23f4f /cscmd/utf8ing.c
parent: 4e72ffe940d9aff7c019d37a6459e765902c1fae (diff)
download: cscm-trunk.tar.xz
1 files changed, 121 insertions, 0 deletions
diff --git a/cscmd/utf8ing.c b/cscmd/utf8ing.c
new file mode 100644
index 0000000..1d67c79
--- /dev/null
+++ b/cscmd/utf8ing.c
@@ -0,0 +1,121 @@
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+#include <stdarg.h>
+#include <limits.h>
+#include <locale.h>
+#include <wchar.h>
+#include <wctype.h>
+
+#include <defs.h>
+#include <utf8ing.h>
+
+
+/* substitute for values that cannot be encoded; largest encodable value */
+static const ucs4_t replacement_char     = 0xfffd;
+static const ucs4_t maximum_ucs4         = 0x7fffffff;
+/* pair -> (high offset << half_shift) + low offset + half_base */
+static const int    half_shift           = 10;
+static const ucs4_t half_base            = 0x0010000;
+/* UTF-16 surrogate ranges: high (leading) and low (trailing) halves */
+static const ucs4_t surrogate_high_start = 0xd800;
+static const ucs4_t surrogate_high_end   = 0xdbff;
+static const ucs4_t surrogate_low_start  = 0xdc00;
+static const ucs4_t surrogate_low_end    = 0xdfff;
+/* lead-byte marker bits, indexed by sequence length in bytes (1..6) */
+static const utf8_t first_byte_mark[7] = { 0x00, 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc };
+
+
+/***************************************************************
+  copy_ucs4_to_utf8()
+
+     Converts the null-terminated UCS4 string SRC to UTF-8 and
+     stores the result in DEST, followed by a null terminator.
+     A UTF-16 surrogate pair found in SRC is merged into one
+     code point; a surrogate without its pair is encoded as is.
+     A value above maximum_ucs4 is replaced with U+FFFD.
+
+     Return value:
+        Number of bytes actually written to DEST, not counting
+        the null terminator; 0 when DEST or SRC is NULL.
+
+     NOTE:
+        Writes beyond the memory that DEST points to are not
+        checked. The caller has to provide room for up to
+        6 bytes per SRC character plus the null terminator.
+        SRC is expected to be null-terminated.
+ ***************************************************************/
+int copy_ucs4_to_utf8( utf8_t *dest, const ucs4_t *src )
+{
+  const ucs4_t  byte_mask = 0xbf;
+  const ucs4_t  byte_mark = 0x80;
+  int           count     = 0;
+
+  /* without a destination there is nowhere to store anything */
+  if( !dest ) return( 0 );
+
+  if( !src )
+  {
+    *dest = (utf8_t)0; /* missing source yields an empty string */
+    return( 0 );
+  }
+
+  while( *src )
+  {
+    utf8_t  *ptr;
+    int      bytes_to_write;
+    ucs4_t   c = *src++;
+
+    /* merge a high surrogate with the low surrogate that follows */
+    if( c >= surrogate_high_start &&
+        c <= surrogate_high_end   && *src )
+    {
+      ucs4_t c2 = *src;
+
+      if( c2 >= surrogate_low_start &&
+          c2 <= surrogate_low_end      )
+      {
+        c = ((c  - surrogate_high_start) << half_shift) +
+             (c2 - surrogate_low_start) + half_base;
+        ++src;
+      }
+    }
+
+         if( c <          0x80 ) bytes_to_write = 1;
+    else if( c <         0x800 ) bytes_to_write = 2;
+    else if( c <       0x10000 ) bytes_to_write = 3;
+    else if( c <      0x200000 ) bytes_to_write = 4;
+    else if( c <     0x4000000 ) bytes_to_write = 5;
+    else if( c <= maximum_ucs4 ) bytes_to_write = 6;
+    else
+    {
+      /* U+FFFD is in the 0x800..0xffff range: 3 bytes, not 2 */
+      bytes_to_write = 3;   c = replacement_char;
+    }
+
+    /* continuation bytes 10xxxxxx, six bits each, last one first */
+    for( ptr = dest + bytes_to_write - 1; ptr > dest; --ptr )
+    {
+      *ptr = (utf8_t)((c | byte_mark) & byte_mask);
+      c >>= 6;
+    }
+
+    /* lead byte: length marker plus the remaining high bits */
+    *ptr = (utf8_t)(c | first_byte_mark[bytes_to_write]);
+
+    dest  += bytes_to_write;
+    count += bytes_to_write;
+
+  } /* End while( *src ) */
+
+  *dest = (utf8_t)0; /* null terminator */
+
+  return( count );
+
+} /* End of copy_ucs4_to_utf8() */
author	kx <kx@radix.pro>	2023-03-24 02:53:04 +0300
committer	kx <kx@radix.pro>	2023-03-24 02:53:04 +0300
commit	12c7b1c5658602269da2f5b75835ec0f5fab8890 (patch)
tree	93f6f6b85830af69743d5ebda902d4305bf23f4f /cscmd/utf8ing.c
parent	4e72ffe940d9aff7c019d37a6459e765902c1fae (diff)
download	cscm-trunk.tar.xz