diff options
Diffstat (limited to 'scripts')
-rw-r--r-- | scripts/Makefile | 1 | ||||
-rw-r--r-- | scripts/mkutf8data.c | 3419 |
2 files changed, 0 insertions, 3420 deletions
diff --git a/scripts/Makefile b/scripts/Makefile index b87e3e0ade4d..9d442ee050bd 100644 --- a/scripts/Makefile +++ b/scripts/Makefile @@ -20,7 +20,6 @@ hostprogs-$(CONFIG_ASN1) += asn1_compiler hostprogs-$(CONFIG_MODULE_SIG) += sign-file hostprogs-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += extract-cert hostprogs-$(CONFIG_SYSTEM_EXTRA_CERTIFICATE) += insert-sys-cert -hostprogs-$(CONFIG_UNICODE) += mkutf8data HOSTCFLAGS_sortextable.o = -I$(srctree)/tools/include HOSTCFLAGS_asn1_compiler.o = -I$(srctree)/include diff --git a/scripts/mkutf8data.c b/scripts/mkutf8data.c deleted file mode 100644 index ff2025ac5a32..000000000000 --- a/scripts/mkutf8data.c +++ /dev/null @@ -1,3419 +0,0 @@ -/* - * Copyright (c) 2014 SGI. - * All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/* Generator for a compact trie for unicode normalization */ - -#include <sys/types.h> -#include <stddef.h> -#include <stdlib.h> -#include <stdio.h> -#include <assert.h> -#include <string.h> -#include <unistd.h> -#include <errno.h> - -/* Default names of the in- and output files. */ - -#define AGE_NAME "DerivedAge.txt" -#define CCC_NAME "DerivedCombiningClass.txt" -#define PROP_NAME "DerivedCoreProperties.txt" -#define DATA_NAME "UnicodeData.txt" -#define FOLD_NAME "CaseFolding.txt" -#define NORM_NAME "NormalizationCorrections.txt" -#define TEST_NAME "NormalizationTest.txt" -#define UTF8_NAME "utf8data.h" - -const char *age_name = AGE_NAME; -const char *ccc_name = CCC_NAME; -const char *prop_name = PROP_NAME; -const char *data_name = DATA_NAME; -const char *fold_name = FOLD_NAME; -const char *norm_name = NORM_NAME; -const char *test_name = TEST_NAME; -const char *utf8_name = UTF8_NAME; - -int verbose = 0; - -/* An arbitrary line size limit on input lines. */ - -#define LINESIZE 1024 -char line[LINESIZE]; -char buf0[LINESIZE]; -char buf1[LINESIZE]; -char buf2[LINESIZE]; -char buf3[LINESIZE]; - -const char *argv0; - -#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) - -/* ------------------------------------------------------------------ */ - -/* - * Unicode version numbers consist of three parts: major, minor, and a - * revision. These numbers are packed into an unsigned int to obtain - * a single version number. - * - * To save space in the generated trie, the unicode version is not - * stored directly, instead we calculate a generation number from the - * unicode versions seen in the DerivedAge file, and use that as an - * index into a table of unicode versions. - */ -#define UNICODE_MAJ_SHIFT (16) -#define UNICODE_MIN_SHIFT (8) - -#define UNICODE_MAJ_MAX ((unsigned short)-1) -#define UNICODE_MIN_MAX ((unsigned char)-1) -#define UNICODE_REV_MAX ((unsigned char)-1) - -#define UNICODE_AGE(MAJ,MIN,REV) \ - (((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) | \ - ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) | \ - ((unsigned int)(REV))) - -unsigned int *ages; -int ages_count; - -unsigned int unicode_maxage; - -static int age_valid(unsigned int major, unsigned int minor, - unsigned int revision) -{ - if (major > UNICODE_MAJ_MAX) - return 0; - if (minor > UNICODE_MIN_MAX) - return 0; - if (revision > UNICODE_REV_MAX) - return 0; - return 1; -} - -/* ------------------------------------------------------------------ */ - -/* - * utf8trie_t - * - * A compact binary tree, used to decode UTF-8 characters. - * - * Internal nodes are one byte for the node itself, and up to three - * bytes for an offset into the tree. The first byte contains the - * following information: - * NEXTBYTE - flag - advance to next byte if set - * BITNUM - 3 bit field - the bit number to tested - * OFFLEN - 2 bit field - number of bytes in the offset - * if offlen == 0 (non-branching node) - * RIGHTPATH - 1 bit field - set if the following node is for the - * right-hand path (tested bit is set) - * TRIENODE - 1 bit field - set if the following node is an internal - * node, otherwise it is a leaf node - * if offlen != 0 (branching node) - * LEFTNODE - 1 bit field - set if the left-hand node is internal - * RIGHTNODE - 1 bit field - set if the right-hand node is internal - * - * Due to the way utf8 works, there cannot be branching nodes with - * NEXTBYTE set, and moreover those nodes always have a righthand - * descendant. - */ -typedef unsigned char utf8trie_t; -#define BITNUM 0x07 -#define NEXTBYTE 0x08 -#define OFFLEN 0x30 -#define OFFLEN_SHIFT 4 -#define RIGHTPATH 0x40 -#define TRIENODE 0x80 -#define RIGHTNODE 0x40 -#define LEFTNODE 0x80 - -/* - * utf8leaf_t - * - * The leaves of the trie are embedded in the trie, and so the same - * underlying datatype, unsigned char. - * - * leaf[0]: The unicode version, stored as a generation number that is - * an index into utf8agetab[]. With this we can filter code - * points based on the unicode version in which they were - * defined. The CCC of a non-defined code point is 0. - * leaf[1]: Canonical Combining Class. During normalization, we need - * to do a stable sort into ascending order of all characters - * with a non-zero CCC that occur between two characters with - * a CCC of 0, or at the begin or end of a string. - * The unicode standard guarantees that all CCC values are - * between 0 and 254 inclusive, which leaves 255 available as - * a special value. - * Code points with CCC 0 are known as stoppers. - * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the - * start of a NUL-terminated string that is the decomposition - * of the character. - * The CCC of a decomposable character is the same as the CCC - * of the first character of its decomposition. - * Some characters decompose as the empty string: these are - * characters with the Default_Ignorable_Code_Point property. - * These do affect normalization, as they all have CCC 0. - * - * The decompositions in the trie have been fully expanded. - * - * Casefolding, if applicable, is also done using decompositions. - */ -typedef unsigned char utf8leaf_t; - -#define LEAF_GEN(LEAF) ((LEAF)[0]) -#define LEAF_CCC(LEAF) ((LEAF)[1]) -#define LEAF_STR(LEAF) ((const char*)((LEAF) + 2)) - -#define MAXGEN (255) - -#define MINCCC (0) -#define MAXCCC (254) -#define STOPPER (0) -#define DECOMPOSE (255) -#define HANGUL ((char)(255)) - -#define UTF8HANGULLEAF (12) - -struct tree; -static utf8leaf_t *utf8nlookup(struct tree *, unsigned char *, - const char *, size_t); -static utf8leaf_t *utf8lookup(struct tree *, unsigned char *, const char *); - -unsigned char *utf8data; -size_t utf8data_size; - -utf8trie_t *nfdi; -utf8trie_t *nfdicf; - -/* ------------------------------------------------------------------ */ - -/* - * UTF8 valid ranges. - * - * The UTF-8 encoding spreads the bits of a 32bit word over several - * bytes. This table gives the ranges that can be held and how they'd - * be represented. - * - * 0x00000000 0x0000007F: 0xxxxxxx - * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx - * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx - * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - * - * There is an additional requirement on UTF-8, in that only the - * shortest representation of a 32bit value is to be used. A decoder - * must not decode sequences that do not satisfy this requirement. - * Thus the allowed ranges have a lower bound. - * - * 0x00000000 0x0000007F: 0xxxxxxx - * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx - * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx - * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - * - * Actual unicode characters are limited to the range 0x0 - 0x10FFFF, - * 17 planes of 65536 values. This limits the sequences actually seen - * even more, to just the following. - * - * 0 - 0x7f: 0 0x7f - * 0x80 - 0x7ff: 0xc2 0x80 0xdf 0xbf - * 0x800 - 0xffff: 0xe0 0xa0 0x80 0xef 0xbf 0xbf - * 0x10000 - 0x10ffff: 0xf0 0x90 0x80 0x80 0xf4 0x8f 0xbf 0xbf - * - * Even within those ranges not all values are allowed: the surrogates - * 0xd800 - 0xdfff should never be seen. - * - * Note that the longest sequence seen with valid usage is 4 bytes, - * the same a single UTF-32 character. This makes the UTF-8 - * representation of Unicode strictly smaller than UTF-32. - * - * The shortest sequence requirement was introduced by: - * Corrigendum #1: UTF-8 Shortest Form - * It can be found here: - * http://www.unicode.org/versions/corrigendum1.html - * - */ - -#define UTF8_2_BITS 0xC0 -#define UTF8_3_BITS 0xE0 -#define UTF8_4_BITS 0xF0 -#define UTF8_N_BITS 0x80 -#define UTF8_2_MASK 0xE0 -#define UTF8_3_MASK 0xF0 -#define UTF8_4_MASK 0xF8 -#define UTF8_N_MASK 0xC0 -#define UTF8_V_MASK 0x3F -#define UTF8_V_SHIFT 6 - -static int utf8encode(char *str, unsigned int val) -{ - int len; - - if (val < 0x80) { - str[0] = val; - len = 1; - } else if (val < 0x800) { - str[1] = val & UTF8_V_MASK; - str[1] |= UTF8_N_BITS; - val >>= UTF8_V_SHIFT; - str[0] = val; - str[0] |= UTF8_2_BITS; - len = 2; - } else if (val < 0x10000) { - str[2] = val & UTF8_V_MASK; - str[2] |= UTF8_N_BITS; - val >>= UTF8_V_SHIFT; - str[1] = val & UTF8_V_MASK; - str[1] |= UTF8_N_BITS; - val >>= UTF8_V_SHIFT; - str[0] = val; - str[0] |= UTF8_3_BITS; - len = 3; - } else if (val < 0x110000) { - str[3] = val & UTF8_V_MASK; - str[3] |= UTF8_N_BITS; - val >>= UTF8_V_SHIFT; - str[2] = val & UTF8_V_MASK; - str[2] |= UTF8_N_BITS; - val >>= UTF8_V_SHIFT; - str[1] = val & UTF8_V_MASK; - str[1] |= UTF8_N_BITS; - val >>= UTF8_V_SHIFT; - str[0] = val; - str[0] |= UTF8_4_BITS; - len = 4; - } else { - printf("%#x: illegal val\n", val); - len = 0; - } - return len; -} - -static unsigned int utf8decode(const char *str) -{ - const unsigned char *s = (const unsigned char*)str; - unsigned int unichar = 0; - - if (*s < 0x80) { - unichar = *s; - } else if (*s < UTF8_3_BITS) { - unichar = *s++ & 0x1F; - unichar <<= UTF8_V_SHIFT; - unichar |= *s & 0x3F; - } else if (*s < UTF8_4_BITS) { - unichar = *s++ & 0x0F; - unichar <<= UTF8_V_SHIFT; - unichar |= *s++ & 0x3F; - unichar <<= UTF8_V_SHIFT; - unichar |= *s & 0x3F; - } else { - unichar = *s++ & 0x0F; - unichar <<= UTF8_V_SHIFT; - unichar |= *s++ & 0x3F; - unichar <<= UTF8_V_SHIFT; - unichar |= *s++ & 0x3F; - unichar <<= UTF8_V_SHIFT; - unichar |= *s & 0x3F; - } - return unichar; -} - -static int utf32valid(unsigned int unichar) -{ - return unichar < 0x110000; -} - -#define HANGUL_SYLLABLE(U) ((U) >= 0xAC00 && (U) <= 0xD7A3) - -#define NODE 1 -#define LEAF 0 - -struct tree { - void *root; - int childnode; - const char *type; - unsigned int maxage; - struct tree *next; - int (*leaf_equal)(void *, void *); - void (*leaf_print)(void *, int); - int (*leaf_mark)(void *); - int (*leaf_size)(void *); - int *(*leaf_index)(struct tree *, void *); - unsigned char *(*leaf_emit)(void *, unsigned char *); - int leafindex[0x110000]; - int index; -}; - -struct node { - int index; - int offset; - int mark; - int size; - struct node *parent; - void *left; - void *right; - unsigned char bitnum; - unsigned char nextbyte; - unsigned char leftnode; - unsigned char rightnode; - unsigned int keybits; - unsigned int keymask; -}; - -/* - * Example lookup function for a tree. - */ -static void *lookup(struct tree *tree, const char *key) -{ - struct node *node; - void *leaf = NULL; - - node = tree->root; - while (!leaf && node) { - if (node->nextbyte) - key++; - if (*key & (1 << (node->bitnum & 7))) { - /* Right leg */ - if (node->rightnode == NODE) { - node = node->right; - } else if (node->rightnode == LEAF) { - leaf = node->right; - } else { - node = NULL; - } - } else { - /* Left leg */ - if (node->leftnode == NODE) { - node = node->left; - } else if (node->leftnode == LEAF) { - leaf = node->left; - } else { - node = NULL; - } - } - } - - return leaf; -} - -/* - * A simple non-recursive tree walker: keep track of visits to the - * left and right branches in the leftmask and rightmask. - */ -static void tree_walk(struct tree *tree) -{ - struct node *node; - unsigned int leftmask; - unsigned int rightmask; - unsigned int bitmask; - int indent = 1; - int nodes, singletons, leaves; - - nodes = singletons = leaves = 0; - - printf("%s_%x root %p\n", tree->type, tree->maxage, tree->root); - if (tree->childnode == LEAF) { - assert(tree->root); - tree->leaf_print(tree->root, indent); - leaves = 1; - } else { - assert(tree->childnode == NODE); - node = tree->root; - leftmask = rightmask = 0; - while (node) { - printf("%*snode @ %p bitnum %d nextbyte %d" - " left %p right %p mask %x bits %x\n", - indent, "", node, - node->bitnum, node->nextbyte, - node->left, node->right, - node->keymask, node->keybits); - nodes += 1; - if (!(node->left && node->right)) - singletons += 1; - - while (node) { - bitmask = 1 << node->bitnum; - if ((leftmask & bitmask) == 0) { - leftmask |= bitmask; - if (node->leftnode == LEAF) { - assert(node->left); - tree->leaf_print(node->left, - indent+1); - leaves += 1; - } else if (node->left) { - assert(node->leftnode == NODE); - indent += 1; - node = node->left; - break; - } - } - if ((rightmask & bitmask) == 0) { - rightmask |= bitmask; - if (node->rightnode == LEAF) { - assert(node->right); - tree->leaf_print(node->right, - indent+1); - leaves += 1; - } else if (node->right) { - assert(node->rightnode == NODE); - indent += 1; - node = node->right; - break; - } - } - leftmask &= ~bitmask; - rightmask &= ~bitmask; - node = node->parent; - indent -= 1; - } - } - } - printf("nodes %d leaves %d singletons %d\n", - nodes, leaves, singletons); -} - -/* - * Allocate an initialize a new internal node. - */ -static struct node *alloc_node(struct node *parent) -{ - struct node *node; - int bitnum; - - node = malloc(sizeof(*node)); - node->left = node->right = NULL; - node->parent = parent; - node->leftnode = NODE; - node->rightnode = NODE; - node->keybits = 0; - node->keymask = 0; - node->mark = 0; - node->index = 0; - node->offset = -1; - node->size = 4; - - if (node->parent) { - bitnum = parent->bitnum; - if ((bitnum & 7) == 0) { - node->bitnum = bitnum + 7 + 8; - node->nextbyte = 1; - } else { - node->bitnum = bitnum - 1; - node->nextbyte = 0; - } - } else { - node->bitnum = 7; - node->nextbyte = 0; - } - - return node; -} - -/* - * Insert a new leaf into the tree, and collapse any subtrees that are - * fully populated and end in identical leaves. A nextbyte tagged - * internal node will not be removed to preserve the tree's integrity. - * Note that due to the structure of utf8, no nextbyte tagged node - * will be a candidate for removal. - */ -static int insert(struct tree *tree, char *key, int keylen, void *leaf) -{ - struct node *node; - struct node *parent; - void **cursor; - int keybits; - - assert(keylen >= 1 && keylen <= 4); - - node = NULL; - cursor = &tree->root; - keybits = 8 * keylen; - - /* Insert, creating path along the way. */ - while (keybits) { - if (!*cursor) - *cursor = alloc_node(node); - node = *cursor; - if (node->nextbyte) - key++; - if (*key & (1 << (node->bitnum & 7))) - cursor = &node->right; - else - cursor = &node->left; - keybits--; - } - *cursor = leaf; - - /* Merge subtrees if possible. */ - while (node) { - if (*key & (1 << (node->bitnum & 7))) - node->rightnode = LEAF; - else - node->leftnode = LEAF; - if (node->nextbyte) - break; - if (node->leftnode == NODE || node->rightnode == NODE) - break; - assert(node->left); - assert(node->right); - /* Compare */ - if (! tree->leaf_equal(node->left, node->right)) - break; - /* Keep left, drop right leaf. */ - leaf = node->left; - /* Check in parent */ - parent = node->parent; - if (!parent) { - /* root of tree! */ - tree->root = leaf; - tree->childnode = LEAF; - } else if (parent->left == node) { - parent->left = leaf; - parent->leftnode = LEAF; - if (parent->right) { - parent->keymask = 0; - parent->keybits = 0; - } else { - parent->keymask |= (1 << node->bitnum); - } - } else if (parent->right == node) { - parent->right = leaf; - parent->rightnode = LEAF; - if (parent->left) { - parent->keymask = 0; - parent->keybits = 0; - } else { - parent->keymask |= (1 << node->bitnum); - parent->keybits |= (1 << node->bitnum); - } - } else { - /* internal tree error */ - assert(0); - } - free(node); - node = parent; - } - - /* Propagate keymasks up along singleton chains. */ - while (node) { - parent = node->parent; - if (!parent) - break; - /* Nix the mask for parents with two children. */ - if (node->keymask == 0) { - parent->keymask = 0; - parent->keybits = 0; - } else if (parent->left && parent->right) { - parent->keymask = 0; - parent->keybits = 0; - } else { - assert((parent->keymask & node->keymask) == 0); - parent->keymask |= node->keymask; - parent->keymask |= (1 << parent->bitnum); - parent->keybits |= node->keybits; - if (parent->right) - parent->keybits |= (1 << parent->bitnum); - } - node = parent; - } - - return 0; -} - -/* - * Prune internal nodes. - * - * Fully populated subtrees that end at the same leaf have already - * been collapsed. There are still internal nodes that have for both - * their left and right branches a sequence of singletons that make - * identical choices and end in identical leaves. The keymask and - * keybits collected in the nodes describe the choices made in these - * singleton chains. When they are identical for the left and right - * branch of a node, and the two leaves comare identical, the node in - * question can be removed. - * - * Note that nodes with the nextbyte tag set will not be removed by - * this to ensure tree integrity. Note as well that the structure of - * utf8 ensures that these nodes would not have been candidates for - * removal in any case. - */ -static void prune(struct tree *tree) -{ - struct node *node; - struct node *left; - struct node *right; - struct node *parent; - void *leftleaf; - void *rightleaf; - unsigned int leftmask; - unsigned int rightmask; - unsigned int bitmask; - int count; - - if (verbose > 0) - printf("Pruning %s_%x\n", tree->type, tree->maxage); - - count = 0; - if (tree->childnode == LEAF) - return; - if (!tree->root) - return; - - leftmask = rightmask = 0; - node = tree->root; - while (node) { - if (node->nextbyte) - goto advance; - if (node->leftnode == LEAF) - goto advance; - if (node->rightnode == LEAF) - goto advance; - if (!node->left) - goto advance; - if (!node->right) - goto advance; - left = node->left; - right = node->right; - if (left->keymask == 0) - goto advance; - if (right->keymask == 0) - goto advance; - if (left->keymask != right->keymask) - goto advance; - if (left->keybits != right->keybits) - goto advance; - leftleaf = NULL; - while (!leftleaf) { - assert(left->left || left->right); - if (left->leftnode == LEAF) - leftleaf = left->left; - else if (left->rightnode == LEAF) - leftleaf = left->right; - else if (left->left) - left = left->left; - else if (left->right) - left = left->right; - else - assert(0); - } - rightleaf = NULL; - while (!rightleaf) { - assert(right->left || right->right); - if (right->leftnode == LEAF) - rightleaf = right->left; - else if (right->rightnode == LEAF) - rightleaf = right->right; - else if (right->left) - right = right->left; - else if (right->right) - right = right->right; - else - assert(0); - } - if (! tree->leaf_equal(leftleaf, rightleaf)) - goto advance; - /* - * This node has identical singleton-only subtrees. - * Remove it. - */ - parent = node->parent; - left = node->left; - right = node->right; - if (parent->left == node) - parent->left = left; - else if (parent->right == node) - parent->right = left; - else - assert(0); - left->parent = parent; - left->keymask |= (1 << node->bitnum); - node->left = NULL; - while (node) { - bitmask = 1 << node->bitnum; - leftmask &= ~bitmask; - rightmask &= ~bitmask; - if (node->leftnode == NODE && node->left) { - left = node->left; - free(node); - count++; - node = left; - } else if (node->rightnode == NODE && node->right) { - right = node->right; - free(node); - count++; - node = right; - } else { - node = NULL; - } - } - /* Propagate keymasks up along singleton chains. */ - node = parent; - /* Force re-check */ - bitmask = 1 << node->bitnum; - leftmask &= ~bitmask; - rightmask &= ~bitmask; - for (;;) { - if (node->left && node->right) - break; - if (node->left) { - left = node->left; - node->keymask |= left->keymask; - node->keybits |= left->keybits; - } - if (node->right) { - right = node->right; - node->keymask |= right->keymask; - node->keybits |= right->keybits; - } - node->keymask |= (1 << node->bitnum); - node = node->parent; - /* Force re-check */ - bitmask = 1 << node->bitnum; - leftmask &= ~bitmask; - rightmask &= ~bitmask; - } - advance: - bitmask = 1 << node->bitnum; - if ((leftmask & bitmask) == 0 && - node->leftnode == NODE && - node->left) { - leftmask |= bitmask; - node = node->left; - } else if ((rightmask & bitmask) == 0 && - node->rightnode == NODE && - node->right) { - rightmask |= bitmask; - node = node->right; - } else { - leftmask &= ~bitmask; - rightmask &= ~bitmask; - node = node->parent; - } - } - if (verbose > 0) - printf("Pruned %d nodes\n", count); -} - -/* - * Mark the nodes in the tree that lead to leaves that must be - * emitted. - */ -static void mark_nodes(struct tree *tree) -{ - struct node *node; - struct node *n; - unsigned int leftmask; - unsigned int rightmask; - unsigned int bitmask; - int marked; - - marked = 0; - if (verbose > 0) - printf("Marking %s_%x\n", tree->type, tree->maxage); - if (tree->childnode == LEAF) - goto done; - - assert(tree->childnode == NODE); - node = tree->root; - leftmask = rightmask = 0; - while (node) { - bitmask = 1 << node->bitnum; - if ((leftmask & bitmask) == 0) { - leftmask |= bitmask; - if (node->leftnode == LEAF) { - assert(node->left); - if (tree->leaf_mark(node->left)) { - n = node; - while (n && !n->mark) { - marked++; - n->mark = 1; - n = n->parent; - } - } - } else if (node->left) { - assert(node->leftnode == NODE); - node = node->left; - continue; - } - } - if ((rightmask & bitmask) == 0) { - rightmask |= bitmask; - if (node->rightnode == LEAF) { - assert(node->right); - if (tree->leaf_mark(node->right)) { - n = node; - while (n && !n->mark) { - marked++; - n->mark = 1; - n = n->parent; - } - } - } else if (node->right) { - assert(node->rightnode == NODE); - node = node->right; - continue; - } - } - leftmask &= ~bitmask; - rightmask &= ~bitmask; - node = node->parent; - } - - /* second pass: left siblings and singletons */ - - assert(tree->childnode == NODE); - node = tree->root; - leftmask = rightmask = 0; - while (node) { - bitmask = 1 << node->bitnum; - if ((leftmask & bitmask) == 0) { - leftmask |= bitmask; - if (node->leftnode == LEAF) { - assert(node->left); - if (tree->leaf_mark(node->left)) { - n = node; - while (n && !n->mark) { - marked++; - n->mark = 1; - n = n->parent; - } - } - } else if (node->left) { - assert(node->leftnode == NODE); - node = node->left; - if (!node->mark && node->parent->mark) { - marked++; - node->mark = 1; - } - continue; - } - } - if ((rightmask & bitmask) == 0) { - rightmask |= bitmask; - if (node->rightnode == LEAF) { - assert(node->right); - if (tree->leaf_mark(node->right)) { - n = node; - while (n && !n->mark) { - marked++; - n->mark = 1; - n = n->parent; - } - } - } else if (node->right) { - assert(node->rightnode == NODE); - node = node->right; - if (!node->mark && node->parent->mark && - !node->parent->left) { - marked++; - node->mark = 1; - } - continue; - } - } - leftmask &= ~bitmask; - rightmask &= ~bitmask; - node = node->parent; - } -done: - if (verbose > 0) - printf("Marked %d nodes\n", marked); -} - -/* - * Compute the index of each node and leaf, which is the offset in the - * emitted trie. These values must be pre-computed because relative - * offsets between nodes are used to navigate the tree. - */ -static int index_nodes(struct tree *tree, int index) -{ - struct node *node; - unsigned int leftmask; - unsigned int rightmask; - unsigned int bitmask; - int count; - int indent; - - /* Align to a cache line (or half a cache line?). */ - while (index % 64) - index++; - tree->index = index; - indent = 1; - count = 0; - - if (verbose > 0) - printf("Indexing %s_%x: %d\n", tree->type, tree->maxage, index); - if (tree->childnode == LEAF) { - index += tree->leaf_size(tree->root); - goto done; - } - - assert(tree->childnode == NODE); - node = tree->root; - leftmask = rightmask = 0; - while (node) { - if (!node->mark) - goto skip; - count++; - if (node->index != index) - node->index = index; - index += node->size; -skip: - while (node) { - bitmask = 1 << node->bitnum; - if (node->mark && (leftmask & bitmask) == 0) { - leftmask |= bitmask; - if (node->leftnode == LEAF) { - assert(node->left); - *tree->leaf_index(tree, node->left) = - index; - index += tree->leaf_size(node->left); - count++; - } else if (node->left) { - assert(node->leftnode == NODE); - indent += 1; - node = node->left; - break; - } - } - if (node->mark && (rightmask & bitmask) == 0) { - rightmask |= bitmask; - if (node->rightnode == LEAF) { - assert(node->right); - *tree->leaf_index(tree, node->right) = index; - index += tree->leaf_size(node->right); - count++; - } else if (node->right) { - assert(node->rightnode == NODE); - indent += 1; - node = node->right; - break; - } - } - leftmask &= ~bitmask; - rightmask &= ~bitmask; - node = node->parent; - indent -= 1; - } - } -done: - /* Round up to a multiple of 16 */ - while (index % 16) - index++; - if (verbose > 0) - printf("Final index %d\n", index); - return index; -} - -/* - * Mark the nodes in a subtree, helper for size_nodes(). - */ -static int mark_subtree(struct node *node) -{ - int changed; - - if (!node || node->mark) - return 0; - node->mark = 1; - node->index = node->parent->index; - changed = 1; - if (node->leftnode == NODE) - changed += mark_subtree(node->left); - if (node->rightnode == NODE) - changed += mark_subtree(node->right); - return changed; -} - -/* - * Compute the size of nodes and leaves. We start by assuming that - * each node needs to store a three-byte offset. The indexes of the - * nodes are calculated based on that, and then this function is - * called to see if the sizes of some nodes can be reduced. This is - * repeated until no more changes are seen. - */ -static int size_nodes(struct tree *tree) -{ - struct tree *next; - struct node *node; - struct node *right; - struct node *n; - unsigned int leftmask; - unsigned int rightmask; - unsigned int bitmask; - unsigned int pathbits; - unsigned int pathmask; - unsigned int nbit; - int changed; - int offset; - int size; - int indent; - - indent = 1; - changed = 0; - size = 0; - - if (verbose > 0) - printf("Sizing %s_%x\n", tree->type, tree->maxage); - if (tree->childnode == LEAF) - goto done; - - assert(tree->childnode == NODE); - pathbits = 0; - pathmask = 0; - node = tree->root; - leftmask = rightmask = 0; - while (node) { - if (!node->mark) - goto skip; - offset = 0; - if (!node->left || !node->right) { - size = 1; - } else { - if (node->rightnode == NODE) { - /* - * If the right node is not marked, - * look for a corresponding node in - * the next tree. Such a node need - * not exist. - */ - right = node->right; - next = tree->next; - while (!right->mark) { - assert(next); - n = next->root; - while (n->bitnum != node->bitnum) { - nbit = 1 << n->bitnum; - if (!(pathmask & nbit)) - break; - if (pathbits & nbit) { - if (n->rightnode == LEAF) - break; - n = n->right; - } else { - if (n->leftnode == LEAF) - break; - n = n->left; - } - } - if (n->bitnum != node->bitnum) - break; - n = n->right; - right = n; - next = next->next; - } - /* Make sure the right node is marked. */ - if (!right->mark) - changed += mark_subtree(right); - offset = right->index - node->index; - } else { - offset = *tree->leaf_index(tree, node->right); - offset -= node->index; - } - assert(offset >= 0); - assert(offset <= 0xffffff); - if (offset <= 0xff) { - size = 2; - } else if (offset <= 0xffff) { - size = 3; - } else { /* offset <= 0xffffff */ - size = 4; - } - } - if (node->size != size || node->offset != offset) { - node->size = size; - node->offset = offset; - changed++; - } -skip: - while (node) { - bitmask = 1 << node->bitnum; - pathmask |= bitmask; - if (node->mark && (leftmask & bitmask) == 0) { - leftmask |= bitmask; - if (node->leftnode == LEAF) { - assert(node->left); - } else if (node->left) { - assert(node->leftnode == NODE); - indent += 1; - node = node->left; - break; - } - } - if (node->mark && (rightmask & bitmask) == 0) { - rightmask |= bitmask; - pathbits |= bitmask; - if (node->rightnode == LEAF) { - assert(node->right); - } else if (node->right) { - assert(node->rightnode == NODE); - indent += 1; - node = node->right; - break; - } - } - leftmask &= ~bitmask; - rightmask &= ~bitmask; - pathmask &= ~bitmask; - pathbits &= ~bitmask; - node = node->parent; - indent -= 1; - } - } -done: - if (verbose > 0) - printf("Found %d changes\n", changed); - return changed; -} - -/* - * Emit a trie for the given tree into the data array. - */ -static void emit(struct tree *tree, unsigned char *data) -{ - struct node *node; - unsigned int leftmask; - unsigned int rightmask; - unsigned int bitmask; - int offlen; - int offset; - int index; - int indent; - int size; - int bytes; - int leaves; - int nodes[4]; - unsigned char byte; - - nodes[0] = nodes[1] = nodes[2] = nodes[3] = 0; - leaves = 0; - bytes = 0; - index = tree->index; - data += index; - indent = 1; - if (verbose > 0) - printf("Emitting %s_%x\n", tree->type, tree->maxage); - if (tree->childnode == LEAF) { - assert(tree->root); - tree->leaf_emit(tree->root, data); - size = tree->leaf_size(tree->root); - index += size; - leaves++; - goto done; - } - - assert(tree->childnode == NODE); - node = tree->root; - leftmask = rightmask = 0; - while (node) { - if (!node->mark) - goto skip; - assert(node->offset != -1); - assert(node->index == index); - - byte = 0; - if (node->nextbyte) - byte |= NEXTBYTE; - byte |= (node->bitnum & BITNUM); - if (node->left && node->right) { - if (node->leftnode == NODE) - byte |= LEFTNODE; - if (node->rightnode == NODE) - byte |= RIGHTNODE; - if (node->offset <= 0xff) - offlen = 1; - else if (node->offset <= 0xffff) - offlen = 2; - else - offlen = 3; - nodes[offlen]++; - offset = node->offset; - byte |= offlen << OFFLEN_SHIFT; - *data++ = byte; - index++; - while (offlen--) { - *data++ = offset & 0xff; - index++; - offset >>= 8; - } - } else if (node->left) { - if (node->leftnode == NODE) - byte |= TRIENODE; - nodes[0]++; - *data++ = byte; - index++; - } else if (node->right) { - byte |= RIGHTNODE; - if (node->rightnode == NODE) - byte |= TRIENODE; - nodes[0]++; - *data++ = byte; - index++; - } else { - assert(0); - } -skip: - while (node) { - bitmask = 1 << node->bitnum; - if (node->mark && (leftmask & bitmask) == 0) { - leftmask |= bitmask; - if (node->leftnode == LEAF) { - assert(node->left); - data = tree->leaf_emit(node->left, - data); - size = tree->leaf_size(node->left); - index += size; - bytes += size; - leaves++; - } else if (node->left) { - assert(node->leftnode == NODE); - indent += 1; - node = node->left; - break; - } - } - if (node->mark && (rightmask & bitmask) == 0) { - rightmask |= bitmask; - if (node->rightnode == LEAF) { - assert(node->right); - data = tree->leaf_emit(node->right, - data); - size = tree->leaf_size(node->right); - index += size; - bytes += size; - leaves++; - } else if (node->right) { - assert(node->rightnode == NODE); - indent += 1; - node = node->right; - break; - } - } - leftmask &= ~bitmask; - rightmask &= ~bitmask; - node = node->parent; - indent -= 1; - } - } -done: - if (verbose > 0) { - printf("Emitted %d (%d) leaves", - leaves, bytes); - printf(" %d (%d+%d+%d+%d) nodes", - nodes[0] + nodes[1] + nodes[2] + nodes[3], - nodes[0], nodes[1], nodes[2], nodes[3]); - printf(" %d total\n", index - tree->index); - } -} - -/* ------------------------------------------------------------------ */ - -/* - * Unicode data. - * - * We need to keep track of the Canonical Combining Class, the Age, - * and decompositions for a code point. - * - * For the Age, we store the index into the ages table. Effectively - * this is a generation number that the table maps to a unicode - * version. - * - * The correction field is used to indicate that this entry is in the - * corrections array, which contains decompositions that were - * corrected in later revisions. The value of the correction field is - * the Unicode version in which the mapping was corrected. - */ -struct unicode_data { - unsigned int code; - int ccc; - int gen; - int correction; - unsigned int *utf32nfdi; - unsigned int *utf32nfdicf; - char *utf8nfdi; - char *utf8nfdicf; -}; - -struct unicode_data unicode_data[0x110000]; -struct unicode_data *corrections; -int corrections_count; - -struct tree *nfdi_tree; -struct tree *nfdicf_tree; - -struct tree *trees; -int trees_count; - -/* - * Check the corrections array to see if this entry was corrected at - * some point. - */ -static struct unicode_data *corrections_lookup(struct unicode_data *u) -{ - int i; - - for (i = 0; i != corrections_count; i++) - if (u->code == corrections[i].code) - return &corrections[i]; - return u; -} - -static int nfdi_equal(void *l, void *r) -{ - struct unicode_data *left = l; - struct unicode_data *right = r; - - if (left->gen != right->gen) - return 0; - if (left->ccc != right->ccc) - return 0; - if (left->utf8nfdi && right->utf8nfdi && - strcmp(left->utf8nfdi, right->utf8nfdi) == 0) - return 1; - if (left->utf8nfdi || right->utf8nfdi) - return 0; - return 1; -} - -static int nfdicf_equal(void *l, void *r) -{ - struct unicode_data *left = l; - struct unicode_data *right = r; - - if (left->gen != right->gen) - return 0; - if (left->ccc != right->ccc) - return 0; - if (left->utf8nfdicf && right->utf8nfdicf && - strcmp(left->utf8nfdicf, right->utf8nfdicf) == 0) - return 1; - if (left->utf8nfdicf && right->utf8nfdicf) - return 0; - if (left->utf8nfdicf || right->utf8nfdicf) - return 0; - if (left->utf8nfdi && right->utf8nfdi && - strcmp(left->utf8nfdi, right->utf8nfdi) == 0) - return 1; - if (left->utf8nfdi || right->utf8nfdi) - return 0; - return 1; -} - -static void nfdi_print(void *l, int indent) -{ - struct unicode_data *leaf = l; - - printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf, - leaf->code, leaf->ccc, leaf->gen); - - if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL) - printf(" nfdi \"%s\"", "HANGUL SYLLABLE"); - else if (leaf->utf8nfdi) - printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi); - - printf("\n"); -} - -static void nfdicf_print(void *l, int indent) -{ - struct unicode_data *leaf = l; - - printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf, - leaf->code, leaf->ccc, leaf->gen); - - if (leaf->utf8nfdicf) - printf(" nfdicf \"%s\"", (const char*)leaf->utf8nfdicf); - else if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL) - printf(" nfdi \"%s\"", "HANGUL SYLLABLE"); - else if (leaf->utf8nfdi) - printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi); - printf("\n"); -} - -static int nfdi_mark(void *l) -{ - return 1; -} - -static int nfdicf_mark(void *l) -{ - struct unicode_data *leaf = l; - - if (leaf->utf8nfdicf) - return 1; - return 0; -} - -static int correction_mark(void *l) -{ - struct unicode_data *leaf = l; - - return leaf->correction; -} - -static int nfdi_size(void *l) -{ - struct unicode_data *leaf = l; - int size = 2; - - if (HANGUL_SYLLABLE(leaf->code)) - size += 1; - else if (leaf->utf8nfdi) - size += strlen(leaf->utf8nfdi) + 1; - return size; -} - -static int nfdicf_size(void *l) -{ - struct unicode_data *leaf = l; - int size = 2; - - if (HANGUL_SYLLABLE(leaf->code)) - size += 1; - else if (leaf->utf8nfdicf) - size += strlen(leaf->utf8nfdicf) + 1; - else if (leaf->utf8nfdi) - size += strlen(leaf->utf8nfdi) + 1; - return size; -} - -static int *nfdi_index(struct tree *tree, void *l) -{ - struct unicode_data *leaf = l; - - return &tree->leafindex[leaf->code]; -} - -static int *nfdicf_index(struct tree *tree, void *l) -{ - struct unicode_data *leaf = l; - - return &tree->leafindex[leaf->code]; -} - -static unsigned char *nfdi_emit(void *l, unsigned char *data) -{ - struct unicode_data *leaf = l; - unsigned char *s; - - *data++ = leaf->gen; - - if (HANGUL_SYLLABLE(leaf->code)) { - *data++ = DECOMPOSE; - *data++ = HANGUL; - } else if (leaf->utf8nfdi) { - *data++ = DECOMPOSE; - s = (unsigned char*)leaf->utf8nfdi; - while ((*data++ = *s++) != 0) - ; - } else { - *data++ = leaf->ccc; - } - return data; -} - -static unsigned char *nfdicf_emit(void *l, unsigned char *data) -{ - struct unicode_data *leaf = l; - unsigned char *s; - - *data++ = leaf->gen; - - if (HANGUL_SYLLABLE(leaf->code)) { - *data++ = DECOMPOSE; - *data++ = HANGUL; - } else if (leaf->utf8nfdicf) { - *data++ = DECOMPOSE; - s = (unsigned char*)leaf->utf8nfdicf; - while ((*data++ = *s++) != 0) - ; - } else if (leaf->utf8nfdi) { - *data++ = DECOMPOSE; - s = (unsigned char*)leaf->utf8nfdi; - while ((*data++ = *s++) != 0) - ; - } else { - *data++ = leaf->ccc; - } - return data; -} - -static void utf8_create(struct unicode_data *data) -{ - char utf[18*4+1]; - char *u; - unsigned int *um; - int i; - - if (data->utf8nfdi) { - assert(data->utf8nfdi[0] == HANGUL); - return; - } - - u = utf; - um = data->utf32nfdi; - if (um) { - for (i = 0; um[i]; i++) - u += utf8encode(u, um[i]); - *u = '\0'; - data->utf8nfdi = strdup(utf); - } - u = utf; - um = data->utf32nfdicf; - if (um) { - for (i = 0; um[i]; i++) - u += utf8encode(u, um[i]); - *u = '\0'; - if (!data->utf8nfdi || strcmp(data->utf8nfdi, utf)) - data->utf8nfdicf = strdup(utf); - } -} - -static void utf8_init(void) -{ - unsigned int unichar; - int i; - - for (unichar = 0; unichar != 0x110000; unichar++) - utf8_create(&unicode_data[unichar]); - - for (i = 0; i != corrections_count; i++) - utf8_create(&corrections[i]); -} - -static void trees_init(void) -{ - struct unicode_data *data; - unsigned int maxage; - unsigned int nextage; - int count; - int i; - int j; - - /* Count the number of different ages. */ - count = 0; - nextage = (unsigned int)-1; - do { - maxage = nextage; - nextage = 0; - for (i = 0; i <= corrections_count; i++) { - data = &corrections[i]; - if (nextage < data->correction && - data->correction < maxage) - nextage = data->correction; - } - count++; - } while (nextage); - - /* Two trees per age: nfdi and nfdicf */ - trees_count = count * 2; - trees = calloc(trees_count, sizeof(struct tree)); - - /* Assign ages to the trees. */ - count = trees_count; - nextage = (unsigned int)-1; - do { - maxage = nextage; - trees[--count].maxage = maxage; - trees[--count].maxage = maxage; - nextage = 0; - for (i = 0; i <= corrections_count; i++) { - data = &corrections[i]; - if (nextage < data->correction && - data->correction < maxage) - nextage = data->correction; - } - } while (nextage); - - /* The ages assigned above are off by one. */ - for (i = 0; i != trees_count; i++) { - j = 0; - while (ages[j] < trees[i].maxage) - j++; - trees[i].maxage = ages[j-1]; - } - - /* Set up the forwarding between trees. */ - trees[trees_count-2].next = &trees[trees_count-1]; - trees[trees_count-1].leaf_mark = nfdi_mark; - trees[trees_count-2].leaf_mark = nfdicf_mark; - for (i = 0; i != trees_count-2; i += 2) { - trees[i].next = &trees[trees_count-2]; - trees[i].leaf_mark = correction_mark; - trees[i+1].next = &trees[trees_count-1]; - trees[i+1].leaf_mark = correction_mark; - } - - /* Assign the callouts. */ - for (i = 0; i != trees_count; i += 2) { - trees[i].type = "nfdicf"; - trees[i].leaf_equal = nfdicf_equal; - trees[i].leaf_print = nfdicf_print; - trees[i].leaf_size = nfdicf_size; - trees[i].leaf_index = nfdicf_index; - trees[i].leaf_emit = nfdicf_emit; - - trees[i+1].type = "nfdi"; - trees[i+1].leaf_equal = nfdi_equal; - trees[i+1].leaf_print = nfdi_print; - trees[i+1].leaf_size = nfdi_size; - trees[i+1].leaf_index = nfdi_index; - trees[i+1].leaf_emit = nfdi_emit; - } - - /* Finish init. */ - for (i = 0; i != trees_count; i++) - trees[i].childnode = NODE; -} - -static void trees_populate(void) -{ - struct unicode_data *data; - unsigned int unichar; - char keyval[4]; - int keylen; - int i; - - for (i = 0; i != trees_count; i++) { - if (verbose > 0) { - printf("Populating %s_%x\n", - trees[i].type, trees[i].maxage); - } - for (unichar = 0; unichar != 0x110000; unichar++) { - if (unicode_data[unichar].gen < 0) - continue; - keylen = utf8encode(keyval, unichar); - data = corrections_lookup(&unicode_data[unichar]); - if (data->correction <= trees[i].maxage) - data = &unicode_data[unichar]; - insert(&trees[i], keyval, keylen, data); - } - } -} - -static void trees_reduce(void) -{ - int i; - int size; - int changed; - - for (i = 0; i != trees_count; i++) - prune(&trees[i]); - for (i = 0; i != trees_count; i++) - mark_nodes(&trees[i]); - do { - size = 0; - for (i = 0; i != trees_count; i++) - size = index_nodes(&trees[i], size); - changed = 0; - for (i = 0; i != trees_count; i++) - changed += size_nodes(&trees[i]); - } while (changed); - - utf8data = calloc(size, 1); - utf8data_size = size; - for (i = 0; i != trees_count; i++) - emit(&trees[i], utf8data); - - if (verbose > 0) { - for (i = 0; i != trees_count; i++) { - printf("%s_%x idx %d\n", - trees[i].type, trees[i].maxage, trees[i].index); - } - } - - nfdi = utf8data + trees[trees_count-1].index; - nfdicf = utf8data + trees[trees_count-2].index; - - nfdi_tree = &trees[trees_count-1]; - nfdicf_tree = &trees[trees_count-2]; -} - -static void verify(struct tree *tree) -{ - struct unicode_data *data; - utf8leaf_t *leaf; - unsigned int unichar; - char key[4]; - unsigned char hangul[UTF8HANGULLEAF]; - int report; - int nocf; - - if (verbose > 0) - printf("Verifying %s_%x\n", tree->type, tree->maxage); - nocf = strcmp(tree->type, "nfdicf"); - - for (unichar = 0; unichar != 0x110000; unichar++) { - report = 0; - data = corrections_lookup(&unicode_data[unichar]); - if (data->correction <= tree->maxage) - data = &unicode_data[unichar]; - utf8encode(key,unichar); - leaf = utf8lookup(tree, hangul, key); - - if (!leaf) { - if (data->gen != -1) - report++; - if (unichar < 0xd800 || unichar > 0xdfff) - report++; - } else { - if (unichar >= 0xd800 && unichar <= 0xdfff) - report++; - if (data->gen == -1) - report++; - if (data->gen != LEAF_GEN(leaf)) - report++; - if (LEAF_CCC(leaf) == DECOMPOSE) { - if (HANGUL_SYLLABLE(data->code)) { - if (data->utf8nfdi[0] != HANGUL) - report++; - } else if (nocf) { - if (!data->utf8nfdi) { - report++; - } else if (strcmp(data->utf8nfdi, - LEAF_STR(leaf))) { - report++; - } - } else { - if (!data->utf8nfdicf && - !data->utf8nfdi) { - report++; - } else if (data->utf8nfdicf) { - if (strcmp(data->utf8nfdicf, - LEAF_STR(leaf))) - report++; - } else if (strcmp(data->utf8nfdi, - LEAF_STR(leaf))) { - report++; - } - } - } else if (data->ccc != LEAF_CCC(leaf)) { - report++; - } - } - if (report) { - printf("%X code %X gen %d ccc %d" - " nfdi -> \"%s\"", - unichar, data->code, data->gen, - data->ccc, - data->utf8nfdi); - if (leaf) { - printf(" gen %d ccc %d" - " nfdi -> \"%s\"", - LEAF_GEN(leaf), - LEAF_CCC(leaf), - LEAF_CCC(leaf) == DECOMPOSE ? - LEAF_STR(leaf) : ""); - } - printf("\n"); - } - } -} - -static void trees_verify(void) -{ - int i; - - for (i = 0; i != trees_count; i++) - verify(&trees[i]); -} - -/* ------------------------------------------------------------------ */ - -static void help(void) -{ - printf("Usage: %s [options]\n", argv0); - printf("\n"); - printf("This program creates an a data trie used for parsing and\n"); - printf("normalization of UTF-8 strings. The trie is derived from\n"); - printf("a set of input files from the Unicode character database\n"); - printf("found at: http://www.unicode.org/Public/UCD/latest/ucd/\n"); - printf("\n"); - printf("The generated tree supports two normalization forms:\n"); - printf("\n"); - printf("\tnfdi:\n"); - printf("\t- Apply unicode normalization form NFD.\n"); - printf("\t- Remove any Default_Ignorable_Code_Point.\n"); - printf("\n"); - printf("\tnfdicf:\n"); - printf("\t- Apply unicode normalization form NFD.\n"); - printf("\t- Remove any Default_Ignorable_Code_Point.\n"); - printf("\t- Apply a full casefold (C + F).\n"); - printf("\n"); - printf("These forms were chosen as being most useful when dealing\n"); - printf("with file names: NFD catches most cases where characters\n"); - printf("should be considered equivalent. The ignorables are mostly\n"); - printf("invisible, making names hard to type.\n"); - printf("\n"); - printf("The options to specify the files to be used are listed\n"); - printf("below with their default values, which are the names used\n"); - printf("by version 11.0.0 of the Unicode Character Database.\n"); - printf("\n"); - printf("The input files:\n"); - printf("\t-a %s\n", AGE_NAME); - printf("\t-c %s\n", CCC_NAME); - printf("\t-p %s\n", PROP_NAME); - printf("\t-d %s\n", DATA_NAME); - printf("\t-f %s\n", FOLD_NAME); - printf("\t-n %s\n", NORM_NAME); - printf("\n"); - printf("Additionally, the generated tables are tested using:\n"); - printf("\t-t %s\n", TEST_NAME); - printf("\n"); - printf("Finally, the output file:\n"); - printf("\t-o %s\n", UTF8_NAME); - printf("\n"); -} - -static void usage(void) -{ - help(); - exit(1); -} - -static void open_fail(const char *name, int error) -{ - printf("Error %d opening %s: %s\n", error, name, strerror(error)); - exit(1); -} - -static void file_fail(const char *filename) -{ - printf("Error parsing %s\n", filename); - exit(1); -} - -static void line_fail(const char *filename, const char *line) -{ - printf("Error parsing %s:%s\n", filename, line); - exit(1); -} - -/* ------------------------------------------------------------------ */ - -static void print_utf32(unsigned int *utf32str) -{ - int i; - - for (i = 0; utf32str[i]; i++) - printf(" %X", utf32str[i]); -} - -static void print_utf32nfdi(unsigned int unichar) -{ - printf(" %X ->", unichar); - print_utf32(unicode_data[unichar].utf32nfdi); - printf("\n"); -} - -static void print_utf32nfdicf(unsigned int unichar) -{ - printf(" %X ->", unichar); - print_utf32(unicode_data[unichar].utf32nfdicf); - printf("\n"); -} - -/* ------------------------------------------------------------------ */ - -static void age_init(void) -{ - FILE *file; - unsigned int first; - unsigned int last; - unsigned int unichar; - unsigned int major; - unsigned int minor; - unsigned int revision; - int gen; - int count; - int ret; - - if (verbose > 0) - printf("Parsing %s\n", age_name); - - file = fopen(age_name, "r"); - if (!file) - open_fail(age_name, errno); - count = 0; - - gen = 0; - while (fgets(line, LINESIZE, file)) { - ret = sscanf(line, "# Age=V%d_%d_%d", - &major, &minor, &revision); - if (ret == 3) { - ages_count++; - if (verbose > 1) - printf(" Age V%d_%d_%d\n", - major, minor, revision); - if (!age_valid(major, minor, revision)) - line_fail(age_name, line); - continue; - } - ret = sscanf(line, "# Age=V%d_%d", &major, &minor); - if (ret == 2) { - ages_count++; - if (verbose > 1) - printf(" Age V%d_%d\n", major, minor); - if (!age_valid(major, minor, 0)) - line_fail(age_name, line); - continue; - } - } - - /* We must have found something above. */ - if (verbose > 1) - printf("%d age entries\n", ages_count); - if (ages_count == 0 || ages_count > MAXGEN) - file_fail(age_name); - - /* There is a 0 entry. */ - ages_count++; - ages = calloc(ages_count + 1, sizeof(*ages)); - /* And a guard entry. */ - ages[ages_count] = (unsigned int)-1; - - rewind(file); - count = 0; - gen = 0; - while (fgets(line, LINESIZE, file)) { - ret = sscanf(line, "# Age=V%d_%d_%d", - &major, &minor, &revision); - if (ret == 3) { - ages[++gen] = - UNICODE_AGE(major, minor, revision); - if (verbose > 1) - printf(" Age V%d_%d_%d = gen %d\n", - major, minor, revision, gen); - if (!age_valid(major, minor, revision)) - line_fail(age_name, line); - continue; - } - ret = sscanf(line, "# Age=V%d_%d", &major, &minor); - if (ret == 2) { - ages[++gen] = UNICODE_AGE(major, minor, 0); - if (verbose > 1) - printf(" Age V%d_%d = %d\n", - major, minor, gen); - if (!age_valid(major, minor, 0)) - line_fail(age_name, line); - continue; - } - ret = sscanf(line, "%X..%X ; %d.%d #", - &first, &last, &major, &minor); - if (ret == 4) { - for (unichar = first; unichar <= last; unichar++) - unicode_data[unichar].gen = gen; - count += 1 + last - first; - if (verbose > 1) - printf(" %X..%X gen %d\n", first, last, gen); - if (!utf32valid(first) || !utf32valid(last)) - line_fail(age_name, line); - continue; - } - ret = sscanf(line, "%X ; %d.%d #", &unichar, &major, &minor); - if (ret == 3) { - unicode_data[unichar].gen = gen; - count++; - if (verbose > 1) - printf(" %X gen %d\n", unichar, gen); - if (!utf32valid(unichar)) - line_fail(age_name, line); - continue; - } - } - unicode_maxage = ages[gen]; - fclose(file); - - /* Nix surrogate block */ - if (verbose > 1) - printf(" Removing surrogate block D800..DFFF\n"); - for (unichar = 0xd800; unichar <= 0xdfff; unichar++) - unicode_data[unichar].gen = -1; - - if (verbose > 0) - printf("Found %d entries\n", count); - if (count == 0) - file_fail(age_name); -} - -static void ccc_init(void) -{ - FILE *file; - unsigned int first; - unsigned int last; - unsigned int unichar; - unsigned int value; - int count; - int ret; - - if (verbose > 0) - printf("Parsing %s\n", ccc_name); - - file = fopen(ccc_name, "r"); - if (!file) - open_fail(ccc_name, errno); - - count = 0; - while (fgets(line, LINESIZE, file)) { - ret = sscanf(line, "%X..%X ; %d #", &first, &last, &value); - if (ret == 3) { - for (unichar = first; unichar <= last; unichar++) { - unicode_data[unichar].ccc = value; - count++; - } - if (verbose > 1) - printf(" %X..%X ccc %d\n", first, last, value); - if (!utf32valid(first) || !utf32valid(last)) - line_fail(ccc_name, line); - continue; - } - ret = sscanf(line, "%X ; %d #", &unichar, &value); - if (ret == 2) { - unicode_data[unichar].ccc = value; - count++; - if (verbose > 1) - printf(" %X ccc %d\n", unichar, value); - if (!utf32valid(unichar)) - line_fail(ccc_name, line); - continue; - } - } - fclose(file); - - if (verbose > 0) - printf("Found %d entries\n", count); - if (count == 0) - file_fail(ccc_name); -} - -static int ignore_compatibility_form(char *type) -{ - int i; - char *ignored_types[] = {"font", "noBreak", "initial", "medial", - "final", "isolated", "circle", "super", - "sub", "vertical", "wide", "narrow", - "small", "square", "fraction", "compat"}; - - for (i = 0 ; i < ARRAY_SIZE(ignored_types); i++) - if (strcmp(type, ignored_types[i]) == 0) - return 1; - return 0; -} - -static void nfdi_init(void) -{ - FILE *file; - unsigned int unichar; - unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */ - char *s; - char *type; - unsigned int *um; - int count; - int i; - int ret; - - if (verbose > 0) - printf("Parsing %s\n", data_name); - file = fopen(data_name, "r"); - if (!file) - open_fail(data_name, errno); - - count = 0; - while (fgets(line, LINESIZE, file)) { - ret = sscanf(line, "%X;%*[^;];%*[^;];%*[^;];%*[^;];%[^;];", - &unichar, buf0); - if (ret != 2) - continue; - if (!utf32valid(unichar)) - line_fail(data_name, line); - - s = buf0; - /* skip over <tag> */ - if (*s == '<') { - type = ++s; - while (*++s != '>'); - *s++ = '\0'; - if(ignore_compatibility_form(type)) - continue; - } - /* decode the decomposition into UTF-32 */ - i = 0; - while (*s) { - mapping[i] = strtoul(s, &s, 16); - if (!utf32valid(mapping[i])) - line_fail(data_name, line); - i++; - } - mapping[i++] = 0; - - um = malloc(i * sizeof(unsigned int)); - memcpy(um, mapping, i * sizeof(unsigned int)); - unicode_data[unichar].utf32nfdi = um; - - if (verbose > 1) - print_utf32nfdi(unichar); - count++; - } - fclose(file); - if (verbose > 0) - printf("Found %d entries\n", count); - if (count == 0) - file_fail(data_name); -} - -static void nfdicf_init(void) -{ - FILE *file; - unsigned int unichar; - unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */ - char status; - char *s; - unsigned int *um; - int i; - int count; - int ret; - - if (verbose > 0) - printf("Parsing %s\n", fold_name); - file = fopen(fold_name, "r"); - if (!file) - open_fail(fold_name, errno); - - count = 0; - while (fgets(line, LINESIZE, file)) { - ret = sscanf(line, "%X; %c; %[^;];", &unichar, &status, buf0); - if (ret != 3) - continue; - if (!utf32valid(unichar)) - line_fail(fold_name, line); - /* Use the C+F casefold. */ - if (status != 'C' && status != 'F') - continue; - s = buf0; - if (*s == '<') - while (*s++ != ' ') - ; - i = 0; - while (*s) { - mapping[i] = strtoul(s, &s, 16); - if (!utf32valid(mapping[i])) - line_fail(fold_name, line); - i++; - } - mapping[i++] = 0; - - um = malloc(i * sizeof(unsigned int)); - memcpy(um, mapping, i * sizeof(unsigned int)); - unicode_data[unichar].utf32nfdicf = um; - - if (verbose > 1) - print_utf32nfdicf(unichar); - count++; - } - fclose(file); - if (verbose > 0) - printf("Found %d entries\n", count); - if (count == 0) - file_fail(fold_name); -} - -static void ignore_init(void) -{ - FILE *file; - unsigned int unichar; - unsigned int first; - unsigned int last; - unsigned int *um; - int count; - int ret; - - if (verbose > 0) - printf("Parsing %s\n", prop_name); - file = fopen(prop_name, "r"); - if (!file) - open_fail(prop_name, errno); - assert(file); - count = 0; - while (fgets(line, LINESIZE, file)) { - ret = sscanf(line, "%X..%X ; %s # ", &first, &last, buf0); - if (ret == 3) { - if (strcmp(buf0, "Default_Ignorable_Code_Point")) - continue; - if (!utf32valid(first) || !utf32valid(last)) - line_fail(prop_name, line); - for (unichar = first; unichar <= last; unichar++) { - free(unicode_data[unichar].utf32nfdi); - um = malloc(sizeof(unsigned int)); - *um = 0; - unicode_data[unichar].utf32nfdi = um; - free(unicode_data[unichar].utf32nfdicf); - um = malloc(sizeof(unsigned int)); - *um = 0; - unicode_data[unichar].utf32nfdicf = um; - count++; - } - if (verbose > 1) - printf(" %X..%X Default_Ignorable_Code_Point\n", - first, last); - continue; - } - ret = sscanf(line, "%X ; %s # ", &unichar, buf0); - if (ret == 2) { - if (strcmp(buf0, "Default_Ignorable_Code_Point")) - continue; - if (!utf32valid(unichar)) - line_fail(prop_name, line); - free(unicode_data[unichar].utf32nfdi); - um = malloc(sizeof(unsigned int)); - *um = 0; - unicode_data[unichar].utf32nfdi = um; - free(unicode_data[unichar].utf32nfdicf); - um = malloc(sizeof(unsigned int)); - *um = 0; - unicode_data[unichar].utf32nfdicf = um; - if (verbose > 1) - printf(" %X Default_Ignorable_Code_Point\n", - unichar); - count++; - continue; - } - } - fclose(file); - - if (verbose > 0) - printf("Found %d entries\n", count); - if (count == 0) - file_fail(prop_name); -} - -static void corrections_init(void) -{ - FILE *file; - unsigned int unichar; - unsigned int major; - unsigned int minor; - unsigned int revision; - unsigned int age; - unsigned int *um; - unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */ - char *s; - int i; - int count; - int ret; - - if (verbose > 0) - printf("Parsing %s\n", norm_name); - file = fopen(norm_name, "r"); - if (!file) - open_fail(norm_name, errno); - - count = 0; - while (fgets(line, LINESIZE, file)) { - ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #", - &unichar, buf0, buf1, - &major, &minor, &revision); - if (ret != 6) - continue; - if (!utf32valid(unichar) || !age_valid(major, minor, revision)) - line_fail(norm_name, line); - count++; - } - corrections = calloc(count, sizeof(struct unicode_data)); - corrections_count = count; - rewind(file); - - count = 0; - while (fgets(line, LINESIZE, file)) { - ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #", - &unichar, buf0, buf1, - &major, &minor, &revision); - if (ret != 6) - continue; - if (!utf32valid(unichar) || !age_valid(major, minor, revision)) - line_fail(norm_name, line); - corrections[count] = unicode_data[unichar]; - assert(corrections[count].code == unichar); - age = UNICODE_AGE(major, minor, revision); - corrections[count].correction = age; - - i = 0; - s = buf0; - while (*s) { - mapping[i] = strtoul(s, &s, 16); - if (!utf32valid(mapping[i])) - line_fail(norm_name, line); - i++; - } - mapping[i++] = 0; - - um = malloc(i * sizeof(unsigned int)); - memcpy(um, mapping, i * sizeof(unsigned int)); - corrections[count].utf32nfdi = um; - - if (verbose > 1) - printf(" %X -> %s -> %s V%d_%d_%d\n", - unichar, buf0, buf1, major, minor, revision); - count++; - } - fclose(file); - - if (verbose > 0) - printf("Found %d entries\n", count); - if (count == 0) - file_fail(norm_name); -} - -/* ------------------------------------------------------------------ */ - -/* - * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0) - * - * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;; - * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;; - * - * SBase = 0xAC00 - * LBase = 0x1100 - * VBase = 0x1161 - * TBase = 0x11A7 - * LCount = 19 - * VCount = 21 - * TCount = 28 - * NCount = 588 (VCount * TCount) - * SCount = 11172 (LCount * NCount) - * - * Decomposition: - * SIndex = s - SBase - * - * LV (Canonical/Full) - * LIndex = SIndex / NCount - * VIndex = (Sindex % NCount) / TCount - * LPart = LBase + LIndex - * VPart = VBase + VIndex - * - * LVT (Canonical) - * LVIndex = (SIndex / TCount) * TCount - * TIndex = (Sindex % TCount) - * LVPart = SBase + LVIndex - * TPart = TBase + TIndex - * - * LVT (Full) - * LIndex = SIndex / NCount - * VIndex = (Sindex % NCount) / TCount - * TIndex = (Sindex % TCount) - * LPart = LBase + LIndex - * VPart = VBase + VIndex - * if (TIndex == 0) { - * d = <LPart, VPart> - * } else { - * TPart = TBase + TIndex - * d = <LPart, VPart, TPart> - * } - * - */ - -static void hangul_decompose(void) -{ - unsigned int sb = 0xAC00; - unsigned int lb = 0x1100; - unsigned int vb = 0x1161; - unsigned int tb = 0x11a7; - /* unsigned int lc = 19; */ - unsigned int vc = 21; - unsigned int tc = 28; - unsigned int nc = (vc * tc); - /* unsigned int sc = (lc * nc); */ - unsigned int unichar; - unsigned int mapping[4]; - unsigned int *um; - int count; - int i; - - if (verbose > 0) - printf("Decomposing hangul\n"); - /* Hangul */ - count = 0; - for (unichar = 0xAC00; unichar <= 0xD7A3; unichar++) { - unsigned int si = unichar - sb; - unsigned int li = si / nc; - unsigned int vi = (si % nc) / tc; - unsigned int ti = si % tc; - - i = 0; - mapping[i++] = lb + li; - mapping[i++] = vb + vi; - if (ti) - mapping[i++] = tb + ti; - mapping[i++] = 0; - - assert(!unicode_data[unichar].utf32nfdi); - um = malloc(i * sizeof(unsigned int)); - memcpy(um, mapping, i * sizeof(unsigned int)); - unicode_data[unichar].utf32nfdi = um; - - assert(!unicode_data[unichar].utf32nfdicf); - um = malloc(i * sizeof(unsigned int)); - memcpy(um, mapping, i * sizeof(unsigned int)); - unicode_data[unichar].utf32nfdicf = um; - - /* - * Add a cookie as a reminder that the hangul syllable - * decompositions must not be stored in the generated - * trie. - */ - unicode_data[unichar].utf8nfdi = malloc(2); - unicode_data[unichar].utf8nfdi[0] = HANGUL; - unicode_data[unichar].utf8nfdi[1] = '\0'; - - if (verbose > 1) - print_utf32nfdi(unichar); - - count++; - } - if (verbose > 0) - printf("Created %d entries\n", count); -} - -static void nfdi_decompose(void) -{ - unsigned int unichar; - unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */ - unsigned int *um; - unsigned int *dc; - int count; - int i; - int j; - int ret; - - if (verbose > 0) - printf("Decomposing nfdi\n"); - - count = 0; - for (unichar = 0; unichar != 0x110000; unichar++) { - if (!unicode_data[unichar].utf32nfdi) - continue; - for (;;) { - ret = 1; - i = 0; - um = unicode_data[unichar].utf32nfdi; - while (*um) { - dc = unicode_data[*um].utf32nfdi; - if (dc) { - for (j = 0; dc[j]; j++) - mapping[i++] = dc[j]; - ret = 0; - } else { - mapping[i++] = *um; - } - um++; - } - mapping[i++] = 0; - if (ret) - break; - free(unicode_data[unichar].utf32nfdi); - um = malloc(i * sizeof(unsigned int)); - memcpy(um, mapping, i * sizeof(unsigned int)); - unicode_data[unichar].utf32nfdi = um; - } - /* Add this decomposition to nfdicf if there is no entry. */ - if (!unicode_data[unichar].utf32nfdicf) { - um = malloc(i * sizeof(unsigned int)); - memcpy(um, mapping, i * sizeof(unsigned int)); - unicode_data[unichar].utf32nfdicf = um; - } - if (verbose > 1) - print_utf32nfdi(unichar); - count++; - } - if (verbose > 0) - printf("Processed %d entries\n", count); -} - -static void nfdicf_decompose(void) -{ - unsigned int unichar; - unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */ - unsigned int *um; - unsigned int *dc; - int count; - int i; - int j; - int ret; - - if (verbose > 0) - printf("Decomposing nfdicf\n"); - count = 0; - for (unichar = 0; unichar != 0x110000; unichar++) { - if (!unicode_data[unichar].utf32nfdicf) - continue; - for (;;) { - ret = 1; - i = 0; - um = unicode_data[unichar].utf32nfdicf; - while (*um) { - dc = unicode_data[*um].utf32nfdicf; - if (dc) { - for (j = 0; dc[j]; j++) - mapping[i++] = dc[j]; - ret = 0; - } else { - mapping[i++] = *um; - } - um++; - } - mapping[i++] = 0; - if (ret) - break; - free(unicode_data[unichar].utf32nfdicf); - um = malloc(i * sizeof(unsigned int)); - memcpy(um, mapping, i * sizeof(unsigned int)); - unicode_data[unichar].utf32nfdicf = um; - } - if (verbose > 1) - print_utf32nfdicf(unichar); - count++; - } - if (verbose > 0) - printf("Processed %d entries\n", count); -} - -/* ------------------------------------------------------------------ */ - -int utf8agemax(struct tree *, const char *); -int utf8nagemax(struct tree *, const char *, size_t); -int utf8agemin(struct tree *, const char *); -int utf8nagemin(struct tree *, const char *, size_t); -ssize_t utf8len(struct tree *, const char *); -ssize_t utf8nlen(struct tree *, const char *, size_t); -struct utf8cursor; -int utf8cursor(struct utf8cursor *, struct tree *, const char *); -int utf8ncursor(struct utf8cursor *, struct tree *, const char *, size_t); -int utf8byte(struct utf8cursor *); - -/* - * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0) - * - * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;; - * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;; - * - * SBase = 0xAC00 - * LBase = 0x1100 - * VBase = 0x1161 - * TBase = 0x11A7 - * LCount = 19 - * VCount = 21 - * TCount = 28 - * NCount = 588 (VCount * TCount) - * SCount = 11172 (LCount * NCount) - * - * Decomposition: - * SIndex = s - SBase - * - * LV (Canonical/Full) - * LIndex = SIndex / NCount - * VIndex = (Sindex % NCount) / TCount - * LPart = LBase + LIndex - * VPart = VBase + VIndex - * - * LVT (Canonical) - * LVIndex = (SIndex / TCount) * TCount - * TIndex = (Sindex % TCount) - * LVPart = SBase + LVIndex - * TPart = TBase + TIndex - * - * LVT (Full) - * LIndex = SIndex / NCount - * VIndex = (Sindex % NCount) / TCount - * TIndex = (Sindex % TCount) - * LPart = LBase + LIndex - * VPart = VBase + VIndex - * if (TIndex == 0) { - * d = <LPart, VPart> - * } else { - * TPart = TBase + TIndex - * d = <LPart, VPart, TPart> - * } - */ - -/* Constants */ -#define SB (0xAC00) -#define LB (0x1100) -#define VB (0x1161) -#define TB (0x11A7) -#define LC (19) -#define VC (21) -#define TC (28) -#define NC (VC * TC) -#define SC (LC * NC) - -/* Algorithmic decomposition of hangul syllable. */ -static utf8leaf_t *utf8hangul(const char *str, unsigned char *hangul) -{ - unsigned int si; - unsigned int li; - unsigned int vi; - unsigned int ti; - unsigned char *h; - - /* Calculate the SI, LI, VI, and TI values. */ - si = utf8decode(str) - SB; - li = si / NC; - vi = (si % NC) / TC; - ti = si % TC; - - /* Fill in base of leaf. */ - h = hangul; - LEAF_GEN(h) = 2; - LEAF_CCC(h) = DECOMPOSE; - h += 2; - - /* Add LPart, a 3-byte UTF-8 sequence. */ - h += utf8encode((char *)h, li + LB); - - /* Add VPart, a 3-byte UTF-8 sequence. */ - h += utf8encode((char *)h, vi + VB); - - /* Add TPart if required, also a 3-byte UTF-8 sequence. */ - if (ti) - h += utf8encode((char *)h, ti + TB); - - /* Terminate string. */ - h[0] = '\0'; - - return hangul; -} - -/* - * Use trie to scan s, touching at most len bytes. - * Returns the leaf if one exists, NULL otherwise. - * - * A non-NULL return guarantees that the UTF-8 sequence starting at s - * is well-formed and corresponds to a known unicode code point. The - * shorthand for this will be "is valid UTF-8 unicode". - */ -static utf8leaf_t *utf8nlookup(struct tree *tree, unsigned char *hangul, - const char *s, size_t len) -{ - utf8trie_t *trie; - int offlen; - int offset; - int mask; - int node; - - if (!tree) - return NULL; - if (len == 0) - return NULL; - node = 1; - trie = utf8data + tree->index; - while (node) { - offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT; - if (*trie & NEXTBYTE) { - if (--len == 0) - return NULL; - s++; - } - mask = 1 << (*trie & BITNUM); - if (*s & mask) { - /* Right leg */ - if (offlen) { - /* Right node at offset of trie */ - node = (*trie & RIGHTNODE); - offset = trie[offlen]; - while (--offlen) { - offset <<= 8; - offset |= trie[offlen]; - } - trie += offset; - } else if (*trie & RIGHTPATH) { - /* Right node after this node */ - node = (*trie & TRIENODE); - trie++; - } else { - /* No right node. */ - return NULL; - } - } else { - /* Left leg */ - if (offlen) { - /* Left node after this node. */ - node = (*trie & LEFTNODE); - trie += offlen + 1; - } else if (*trie & RIGHTPATH) { - /* No left node. */ - return NULL; - } else { - /* Left node after this node */ - node = (*trie & TRIENODE); - trie++; - } - } - } - /* - * Hangul decomposition is done algorithmically. These are the - * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is - * always 3 bytes long, so s has been advanced twice, and the - * start of the sequence is at s-2. - */ - if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL) - trie = utf8hangul(s - 2, hangul); - return trie; -} - -/* - * Use trie to scan s. - * Returns the leaf if one exists, NULL otherwise. - * - * Forwards to trie_nlookup(). - */ -static utf8leaf_t *utf8lookup(struct tree *tree, unsigned char *hangul, - const char *s) -{ - return utf8nlookup(tree, hangul, s, (size_t)-1); -} - -/* - * Return the number of bytes used by the current UTF-8 sequence. - * Assumes the input points to the first byte of a valid UTF-8 - * sequence. - */ -static inline int utf8clen(const char *s) -{ - unsigned char c = *s; - return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0); -} - -/* - * Maximum age of any character in s. - * Return -1 if s is not valid UTF-8 unicode. - * Return 0 if only non-assigned code points are used. - */ -int utf8agemax(struct tree *tree, const char *s) -{ - utf8leaf_t *leaf; - int age = 0; - int leaf_age; - unsigned char hangul[UTF8HANGULLEAF]; - - if (!tree) - return -1; - - while (*s) { - leaf = utf8lookup(tree, hangul, s); - if (!leaf) - return -1; - leaf_age = ages[LEAF_GEN(leaf)]; - if (leaf_age <= tree->maxage && leaf_age > age) - age = leaf_age; - s += utf8clen(s); - } - return age; -} - -/* - * Minimum age of any character in s. - * Return -1 if s is not valid UTF-8 unicode. - * Return 0 if non-assigned code points are used. - */ -int utf8agemin(struct tree *tree, const char *s) -{ - utf8leaf_t *leaf; - int age; - int leaf_age; - unsigned char hangul[UTF8HANGULLEAF]; - - if (!tree) - return -1; - age = tree->maxage; - while (*s) { - leaf = utf8lookup(tree, hangul, s); - if (!leaf) - return -1; - leaf_age = ages[LEAF_GEN(leaf)]; - if (leaf_age <= tree->maxage && leaf_age < age) - age = leaf_age; - s += utf8clen(s); - } - return age; -} - -/* - * Maximum age of any character in s, touch at most len bytes. - * Return -1 if s is not valid UTF-8 unicode. - */ -int utf8nagemax(struct tree *tree, const char *s, size_t len) -{ - utf8leaf_t *leaf; - int age = 0; - int leaf_age; - unsigned char hangul[UTF8HANGULLEAF]; - - if (!tree) - return -1; - - while (len && *s) { - leaf = utf8nlookup(tree, hangul, s, len); - if (!leaf) - return -1; - leaf_age = ages[LEAF_GEN(leaf)]; - if (leaf_age <= tree->maxage && leaf_age > age) - age = leaf_age; - len -= utf8clen(s); - s += utf8clen(s); - } - return age; -} - -/* - * Maximum age of any character in s, touch at most len bytes. - * Return -1 if s is not valid UTF-8 unicode. - */ -int utf8nagemin(struct tree *tree, const char *s, size_t len) -{ - utf8leaf_t *leaf; - int leaf_age; - int age; - unsigned char hangul[UTF8HANGULLEAF]; - - if (!tree) - return -1; - age = tree->maxage; - while (len && *s) { - leaf = utf8nlookup(tree, hangul, s, len); - if (!leaf) - return -1; - leaf_age = ages[LEAF_GEN(leaf)]; - if (leaf_age <= tree->maxage && leaf_age < age) - age = leaf_age; - len -= utf8clen(s); - s += utf8clen(s); - } - return age; -} - -/* - * Length of the normalization of s. - * Return -1 if s is not valid UTF-8 unicode. - * - * A string of Default_Ignorable_Code_Point has length 0. - */ -ssize_t utf8len(struct tree *tree, const char *s) -{ - utf8leaf_t *leaf; - size_t ret = 0; - unsigned char hangul[UTF8HANGULLEAF]; - - if (!tree) - return -1; - while (*s) { - leaf = utf8lookup(tree, hangul, s); - if (!leaf) - return -1; - if (ages[LEAF_GEN(leaf)] > tree->maxage) - ret += utf8clen(s); - else if (LEAF_CCC(leaf) == DECOMPOSE) - ret += strlen(LEAF_STR(leaf)); - else - ret += utf8clen(s); - s += utf8clen(s); - } - return ret; -} - -/* - * Length of the normalization of s, touch at most len bytes. - * Return -1 if s is not valid UTF-8 unicode. - */ -ssize_t utf8nlen(struct tree *tree, const char *s, size_t len) -{ - utf8leaf_t *leaf; - size_t ret = 0; - unsigned char hangul[UTF8HANGULLEAF]; - - if (!tree) - return -1; - while (len && *s) { - leaf = utf8nlookup(tree, hangul, s, len); - if (!leaf) - return -1; - if (ages[LEAF_GEN(leaf)] > tree->maxage) - ret += utf8clen(s); - else if (LEAF_CCC(leaf) == DECOMPOSE) - ret += strlen(LEAF_STR(leaf)); - else - ret += utf8clen(s); - len -= utf8clen(s); - s += utf8clen(s); - } - return ret; -} - -/* - * Cursor structure used by the normalizer. - */ -struct utf8cursor { - struct tree *tree; - const char *s; - const char *p; - const char *ss; - const char *sp; - unsigned int len; - unsigned int slen; - short int ccc; - short int nccc; - unsigned int unichar; - unsigned char hangul[UTF8HANGULLEAF]; -}; - -/* - * Set up an utf8cursor for use by utf8byte(). - * - * s : string. - * len : length of s. - * u8c : pointer to cursor. - * trie : utf8trie_t to use for normalization. - * - * Returns -1 on error, 0 on success. - */ -int utf8ncursor(struct utf8cursor *u8c, struct tree *tree, const char *s, - size_t len) -{ - if (!tree) - return -1; - if (!s) - return -1; - u8c->tree = tree; - u8c->s = s; - u8c->p = NULL; - u8c->ss = NULL; - u8c->sp = NULL; - u8c->len = len; - u8c->slen = 0; - u8c->ccc = STOPPER; - u8c->nccc = STOPPER; - u8c->unichar = 0; - /* Check we didn't clobber the maximum length. */ - if (u8c->len != len) - return -1; - /* The first byte of s may not be an utf8 continuation. */ - if (len > 0 && (*s & 0xC0) == 0x80) - return -1; - return 0; -} - -/* - * Set up an utf8cursor for use by utf8byte(). - * - * s : NUL-terminated string. - * u8c : pointer to cursor. - * trie : utf8trie_t to use for normalization. - * - * Returns -1 on error, 0 on success. - */ -int utf8cursor(struct utf8cursor *u8c, struct tree *tree, const char *s) -{ - return utf8ncursor(u8c, tree, s, (unsigned int)-1); -} - -/* - * Get one byte from the normalized form of the string described by u8c. - * - * Returns the byte cast to an unsigned char on succes, and -1 on failure. - * - * The cursor keeps track of the location in the string in u8c->s. - * When a character is decomposed, the current location is stored in - * u8c->p, and u8c->s is set to the start of the decomposition. Note - * that bytes from a decomposition do not count against u8c->len. - * - * Characters are emitted if they match the current CCC in u8c->ccc. - * Hitting end-of-string while u8c->ccc == STOPPER means we're done, - * and the function returns 0 in that case. - * - * Sorting by CCC is done by repeatedly scanning the string. The - * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at - * the start of the scan. The first pass finds the lowest CCC to be - * emitted and stores it in u8c->nccc, the second pass emits the - * characters with this CCC and finds the next lowest CCC. This limits - * the number of passes to 1 + the number of different CCCs in the - * sequence being scanned. - * - * Therefore: - * u8c->p != NULL -> a decomposition is being scanned. - * u8c->ss != NULL -> this is a repeating scan. - * u8c->ccc == -1 -> this is the first scan of a repeating scan. - */ -int utf8byte(struct utf8cursor *u8c) -{ - utf8leaf_t *leaf; - int ccc; - - for (;;) { - /* Check for the end of a decomposed character. */ - if (u8c->p && *u8c->s == '\0') { - u8c->s = u8c->p; - u8c->p = NULL; - } - - /* Check for end-of-string. */ - if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) { - /* There is no next byte. */ - if (u8c->ccc == STOPPER) - return 0; - /* End-of-string during a scan counts as a stopper. */ - ccc = STOPPER; - goto ccc_mismatch; - } else if ((*u8c->s & 0xC0) == 0x80) { - /* This is a continuation of the current character. */ - if (!u8c->p) - u8c->len--; - return (unsigned char)*u8c->s++; - } - - /* Look up the data for the current character. */ - if (u8c->p) { - leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s); - } else { - leaf = utf8nlookup(u8c->tree, u8c->hangul, - u8c->s, u8c->len); - } - - /* No leaf found implies that the input is a binary blob. */ - if (!leaf) - return -1; - - /* Characters that are too new have CCC 0. */ - if (ages[LEAF_GEN(leaf)] > u8c->tree->maxage) { - ccc = STOPPER; - } else if ((ccc = LEAF_CCC(leaf)) == DECOMPOSE) { - u8c->len -= utf8clen(u8c->s); - u8c->p = u8c->s + utf8clen(u8c->s); - u8c->s = LEAF_STR(leaf); - /* Empty decomposition implies CCC 0. */ - if (*u8c->s == '\0') { - if (u8c->ccc == STOPPER) - continue; - ccc = STOPPER; - goto ccc_mismatch; - } - leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s); - ccc = LEAF_CCC(leaf); - } - u8c->unichar = utf8decode(u8c->s); - - /* - * If this is not a stopper, then see if it updates - * the next canonical class to be emitted. - */ - if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc) - u8c->nccc = ccc; - - /* - * Return the current byte if this is the current - * combining class. - */ - if (ccc == u8c->ccc) { - if (!u8c->p) - u8c->len--; - return (unsigned char)*u8c->s++; - } - - /* Current combining class mismatch. */ - ccc_mismatch: - if (u8c->nccc == STOPPER) { - /* - * Scan forward for the first canonical class - * to be emitted. Save the position from - * which to restart. - */ - assert(u8c->ccc == STOPPER); - u8c->ccc = MINCCC - 1; - u8c->nccc = ccc; - u8c->sp = u8c->p; - u8c->ss = u8c->s; - u8c->slen = u8c->len; - if (!u8c->p) - u8c->len -= utf8clen(u8c->s); - u8c->s += utf8clen(u8c->s); - } else if (ccc != STOPPER) { - /* Not a stopper, and not the ccc we're emitting. */ - if (!u8c->p) - u8c->len -= utf8clen(u8c->s); - u8c->s += utf8clen(u8c->s); - } else if (u8c->nccc != MAXCCC + 1) { - /* At a stopper, restart for next ccc. */ - u8c->ccc = u8c->nccc; - u8c->nccc = MAXCCC + 1; - u8c->s = u8c->ss; - u8c->p = u8c->sp; - u8c->len = u8c->slen; - } else { - /* All done, proceed from here. */ - u8c->ccc = STOPPER; - u8c->nccc = STOPPER; - u8c->sp = NULL; - u8c->ss = NULL; - u8c->slen = 0; - } - } -} - -/* ------------------------------------------------------------------ */ - -static int normalize_line(struct tree *tree) -{ - char *s; - char *t; - int c; - struct utf8cursor u8c; - - /* First test: null-terminated string. */ - s = buf2; - t = buf3; - if (utf8cursor(&u8c, tree, s)) - return -1; - while ((c = utf8byte(&u8c)) > 0) - if (c != (unsigned char)*t++) - return -1; - if (c < 0) - return -1; - if (*t != 0) - return -1; - - /* Second test: length-limited string. */ - s = buf2; - /* Replace NUL with a value that will cause an error if seen. */ - s[strlen(s) + 1] = -1; - t = buf3; - if (utf8cursor(&u8c, tree, s)) - return -1; - while ((c = utf8byte(&u8c)) > 0) - if (c != (unsigned char)*t++) - return -1; - if (c < 0) - return -1; - if (*t != 0) - return -1; - - return 0; -} - -static void normalization_test(void) -{ - FILE *file; - unsigned int unichar; - struct unicode_data *data; - char *s; - char *t; - int ret; - int ignorables; - int tests = 0; - int failures = 0; - - if (verbose > 0) - printf("Parsing %s\n", test_name); - /* Step one, read data from file. */ - file = fopen(test_name, "r"); - if (!file) - open_fail(test_name, errno); - - while (fgets(line, LINESIZE, file)) { - ret = sscanf(line, "%[^;];%*[^;];%[^;];%*[^;];%*[^;];", - buf0, buf1); - if (ret != 2 || *line == '#') - continue; - s = buf0; - t = buf2; - while (*s) { - unichar = strtoul(s, &s, 16); - t += utf8encode(t, unichar); - } - *t = '\0'; - - ignorables = 0; - s = buf1; - t = buf3; - while (*s) { - unichar = strtoul(s, &s, 16); - data = &unicode_data[unichar]; - if (data->utf8nfdi && !*data->utf8nfdi) - ignorables = 1; - else - t += utf8encode(t, unichar); - } - *t = '\0'; - - tests++; - if (normalize_line(nfdi_tree) < 0) { - printf("Line %s -> %s", buf0, buf1); - if (ignorables) - printf(" (ignorables removed)"); - printf(" failure\n"); - failures++; - } - } - fclose(file); - if (verbose > 0) - printf("Ran %d tests with %d failures\n", tests, failures); - if (failures) - file_fail(test_name); -} - -/* ------------------------------------------------------------------ */ - -static void write_file(void) -{ - FILE *file; - int i; - int j; - int t; - int gen; - - if (verbose > 0) - printf("Writing %s\n", utf8_name); - file = fopen(utf8_name, "w"); - if (!file) - open_fail(utf8_name, errno); - - fprintf(file, "/* This file is generated code, do not edit. */\n"); - fprintf(file, "#ifndef __INCLUDED_FROM_UTF8NORM_C__\n"); - fprintf(file, "#error Only nls_utf8-norm.c should include this file.\n"); - fprintf(file, "#endif\n"); - fprintf(file, "\n"); - fprintf(file, "static const unsigned int utf8vers = %#x;\n", - unicode_maxage); - fprintf(file, "\n"); - fprintf(file, "static const unsigned int utf8agetab[] = {\n"); - for (i = 0; i != ages_count; i++) - fprintf(file, "\t%#x%s\n", ages[i], - ages[i] == unicode_maxage ? "" : ","); - fprintf(file, "};\n"); - fprintf(file, "\n"); - fprintf(file, "static const struct utf8data utf8nfdicfdata[] = {\n"); - t = 0; - for (gen = 0; gen < ages_count; gen++) { - fprintf(file, "\t{ %#x, %d }%s\n", - ages[gen], trees[t].index, - ages[gen] == unicode_maxage ? "" : ","); - if (trees[t].maxage == ages[gen]) - t += 2; - } - fprintf(file, "};\n"); - fprintf(file, "\n"); - fprintf(file, "static const struct utf8data utf8nfdidata[] = {\n"); - t = 1; - for (gen = 0; gen < ages_count; gen++) { - fprintf(file, "\t{ %#x, %d }%s\n", - ages[gen], trees[t].index, - ages[gen] == unicode_maxage ? "" : ","); - if (trees[t].maxage == ages[gen]) - t += 2; - } - fprintf(file, "};\n"); - fprintf(file, "\n"); - fprintf(file, "static const unsigned char utf8data[%zd] = {\n", - utf8data_size); - t = 0; - for (i = 0; i != utf8data_size; i += 16) { - if (i == trees[t].index) { - fprintf(file, "\t/* %s_%x */\n", - trees[t].type, trees[t].maxage); - if (t < trees_count-1) - t++; - } - fprintf(file, "\t"); - for (j = i; j != i + 16; j++) - fprintf(file, "0x%.2x%s", utf8data[j], - (j < utf8data_size -1 ? "," : "")); - fprintf(file, "\n"); - } - fprintf(file, "};\n"); - fclose(file); -} - -/* ------------------------------------------------------------------ */ - -int main(int argc, char *argv[]) -{ - unsigned int unichar; - int opt; - - argv0 = argv[0]; - - while ((opt = getopt(argc, argv, "a:c:d:f:hn:o:p:t:v")) != -1) { - switch (opt) { - case 'a': - age_name = optarg; - break; - case 'c': - ccc_name = optarg; - break; - case 'd': - data_name = optarg; - break; - case 'f': - fold_name = optarg; - break; - case 'n': - norm_name = optarg; - break; - case 'o': - utf8_name = optarg; - break; - case 'p': - prop_name = optarg; - break; - case 't': - test_name = optarg; - break; - case 'v': - verbose++; - break; - case 'h': - help(); - exit(0); - default: - usage(); - } - } - - if (verbose > 1) - help(); - for (unichar = 0; unichar != 0x110000; unichar++) - unicode_data[unichar].code = unichar; - age_init(); - ccc_init(); - nfdi_init(); - nfdicf_init(); - ignore_init(); - corrections_init(); - hangul_decompose(); - nfdi_decompose(); - nfdicf_decompose(); - utf8_init(); - trees_init(); - trees_populate(); - trees_reduce(); - trees_verify(); - /* Prevent "unused function" warning. */ - (void)lookup(nfdi_tree, " "); - if (verbose > 2) - tree_walk(nfdi_tree); - if (verbose > 2) - tree_walk(nfdicf_tree); - normalization_test(); - write_file(); - - return 0; -} |