/* cscmd/utf8ing.c */

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <string.h>
#include <stdarg.h>
#include <limits.h>
#include <locale.h>
#include <wchar.h>
#include <wctype.h>

#include <defs.h>
#include <utf8ing.h>


/* U+FFFD; substituted for a source value that exceeds maximum_ucs4. */
static const ucs4_t replacement_char     = 0xfffd;
/* Largest source value that is encoded as-is (upper bound of the 6-byte form). */
static const ucs4_t maximum_ucs4         = 0x7fffffff;

/* Surrogate-pair recombination:
   code point = ((high - surrogate_high_start) << half_shift)
                + (low - surrogate_low_start) + half_base        */
static const int    half_shift           = 10;
static const ucs4_t half_base            = 0x0010000;

/* Inclusive bounds of the high (leading) and low (trailing) surrogate ranges. */
static const ucs4_t surrogate_high_start = 0xd800;
static const ucs4_t surrogate_high_end   = 0xdbff;
static const ucs4_t surrogate_low_start  = 0xdc00;
static const ucs4_t surrogate_low_end    = 0xdfff;

/* Bits OR-ed into the lead byte of an encoded sequence, indexed by the
   sequence length in bytes (1..6).  Index 0 is unused padding so that the
   length can be used as the index directly.  The table is read-only. */
static const utf8_t
first_byte_mark[7] = { 0x00, 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc };


/***************************************************************
  copy_ucs4_to_utf8()

     Converts the null-terminated UCS-4 string SRC to UTF-8 and
     stores the result, followed by a null terminator, in DEST.

     A high surrogate (0xd800..0xdbff) immediately followed by a
     low surrogate (0xdc00..0xdfff) is combined into one code
     point before encoding.  An unpaired surrogate is not
     rejected: it is encoded as-is as a three-byte sequence.
     A value greater than maximum_ucs4 is replaced by
     replacement_char (U+FFFD, encoded as three bytes).

     Return value:
        Number of bytes actually written to DEST, not counting
        the null terminator.

     NOTE:
        Writes beyond the memory that DEST points to are NOT
        checked.  Every source element produces at most 6 bytes,
        so the caller must provide room for the worst case plus
        the terminator.
        SRC must be null-terminated.
 ***************************************************************/
int copy_ucs4_to_utf8( utf8_t *dest, const ucs4_t *src )
{
  /* (c | byte_mark) & byte_mask keeps the low 6 bits of c and
     forces the top two bits to "10": a UTF-8 continuation byte. */
  const ucs4_t  byte_mask = 0xbf;
  const ucs4_t  byte_mark = 0x80;
  int           count     = 0;

  while( *src )
  {
    ucs4_t   c = *src++;
    int      bytes_to_write;
    utf8_t  *ptr;

    if( c >= surrogate_high_start && c <= surrogate_high_end )
    {
      /* Peeking at *src is safe: at worst it is the terminator,
         which is never inside the low-surrogate range. */
      ucs4_t c2 = *src;

      if( c2 >= surrogate_low_start && c2 <= surrogate_low_end )
      {
        c = ((c  - surrogate_high_start) << half_shift) +
             (c2 - surrogate_low_start) + half_base;
        ++src;
      }
    }

    /* Length of the encoded sequence for this code point. */
         if( c <          0x80 ) bytes_to_write = 1;
    else if( c <         0x800 ) bytes_to_write = 2;
    else if( c <       0x10000 ) bytes_to_write = 3;
    else if( c <      0x200000 ) bytes_to_write = 4;
    else if( c <     0x4000000 ) bytes_to_write = 5;
    else if( c <= maximum_ucs4 ) bytes_to_write = 6;
    else
    {
      /* U+FFFD lies in 0x800..0xffff and therefore needs three
         bytes (EF BF BD); two bytes would emit an invalid lead byte. */
      c = replacement_char;
      bytes_to_write = 3;
    }

    /* Fill the sequence back to front, straight into DEST:
       continuation bytes first, the lead byte last. */
    ptr = dest + bytes_to_write;

    switch( bytes_to_write )
    {
      case 6:
        *--ptr = (utf8_t)((c | byte_mark) & byte_mask); c >>= 6;
        /* fallthrough */
      case 5:
        *--ptr = (utf8_t)((c | byte_mark) & byte_mask); c >>= 6;
        /* fallthrough */
      case 4:
        *--ptr = (utf8_t)((c | byte_mark) & byte_mask); c >>= 6;
        /* fallthrough */
      case 3:
        *--ptr = (utf8_t)((c | byte_mark) & byte_mask); c >>= 6;
        /* fallthrough */
      case 2:
        *--ptr = (utf8_t)((c | byte_mark) & byte_mask); c >>= 6;
        /* fallthrough */
      case 1:
        *--ptr = (utf8_t)(c | first_byte_mark[bytes_to_write]);
        break;
      default:
        /* not reachable: bytes_to_write is always 1..6 */
        break;
    }

    dest  += bytes_to_write;
    count += bytes_to_write;

  } /* End while( *src ) */

  *dest = (utf8_t)0; /* null terminator */

  return( count );

} /* End of copy_ucs4_to_utf8() */