From 3b1fffade1473f20f2558733fbd218f4580fc7c3 Mon Sep 17 00:00:00 2001 From: Rasmus Andersson Date: Tue, 22 Aug 2017 00:05:20 -0700 Subject: Initial public commit --- misc/unicode_util.py | 104 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 misc/unicode_util.py (limited to 'misc/unicode_util.py') diff --git a/misc/unicode_util.py b/misc/unicode_util.py new file mode 100644 index 000000000..18196e87e --- /dev/null +++ b/misc/unicode_util.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python +# encoding: utf8 + +class MainCategories: + Letter = 'Letter' + Mark = 'Mark' + Number = 'Number' + Punctuation = 'Punctuation' + Symbol = 'Symbol' + Separator = 'Separator' + Control = 'Control' + Format = 'Format' + Surrogate = 'Surrogate' + PrivateUse = 'Private_Use' + Unassigned = 'Unassigned' + Other = 'Other' + +GeneralCategories = { + 'Lu': ('Uppercase_Letter', MainCategories.Letter), + 'Ll': ('Lowercase_Letter', MainCategories.Letter), + 'Lt': ('Titlecase_Letter', MainCategories.Letter), + 'LC': ('Cased_Letter', MainCategories.Letter), + 'Lm': ('Modifier_Letter', MainCategories.Letter), + 'Lo': ('Other_Letter', MainCategories.Letter), + 'L': ('Letter', MainCategories.Letter), + 'Mn': ('Nonspacing_Mark', MainCategories.Mark), + 'Mc': ('Spacing_Mark', MainCategories.Mark), + 'Me': ('Enclosing_Mark', MainCategories.Mark), + 'M': ('Mark', MainCategories.Mark), + 'Nd': ('Decimal_Number', MainCategories.Number), + 'Nl': ('Letter_Number', MainCategories.Number), + 'No': ('Other_Number', MainCategories.Number), + 'N': ('Number', MainCategories.Number), + 'Pc': ('Connector_Punctuation', MainCategories.Punctuation), + 'Pd': ('Dash_Punctuation', MainCategories.Punctuation), + 'Ps': ('Open_Punctuation', MainCategories.Punctuation), + 'Pe': ('Close_Punctuation', MainCategories.Punctuation), + 'Pi': ('Initial_Punctuation', MainCategories.Punctuation), + 'Pf': ('Final_Punctuation', MainCategories.Punctuation), + 'Po': ('Other_Punctuation', MainCategories.Punctuation), + 'P': ('Punctuation', MainCategories.Punctuation), + 'Sm': ('Math_Symbol', MainCategories.Symbol), + 'Sc': ('Currency_Symbol', MainCategories.Symbol), + 'Sk': ('Modifier_Symbol', MainCategories.Symbol), + 'So': ('Other_Symbol', MainCategories.Symbol), + 'S': ('Symbol', MainCategories.Symbol), + 'Zs': ('Space_Separator', MainCategories.Separator), + 'Zl': ('Line_Separator', MainCategories.Separator), + 'Zp': ('Paragraph_Separator', MainCategories.Separator), + 'Z': ('Separator', MainCategories.Separator), + 'Cc': ('Control', MainCategories.Control), + 'Cf': ('Format', MainCategories.Format), + 'Cs': ('Surrogate', MainCategories.Surrogate), + 'Co': ('Private_Use', MainCategories.PrivateUse), + 'Cn': ('Unassigned', MainCategories.Unassigned), + 'C': ('Other', MainCategories.Other), +} + + +class Codepoint: + def __init__(self, v): + self.codePoint = int(v[0], 16) + self.name = v[1] + + self.category = v[2] + c = GeneralCategories.get(self.category, ('', MainCategories.Other)) + self.categoryName = c[0] + self.mainCategory = c[1] + + self.decDigitValue = v[6] + self.numValue = v[8] + + def isLetter(self): return self.mainCategory is MainCategories.Letter + def isMark(self): return self.mainCategory is MainCategories.Mark + def isNumber(self): return self.mainCategory is MainCategories.Number + def isPunctuation(self): return self.mainCategory is MainCategories.Punctuation + def isSymbol(self): return self.mainCategory is MainCategories.Symbol + def isSeparator(self): return self.mainCategory is MainCategories.Separator + def isControl(self): return self.mainCategory is MainCategories.Control + def isFormat(self): return self.mainCategory is MainCategories.Format + def isSurrogate(self): return self.mainCategory is MainCategories.Surrogate + def isPrivateUse(self): return self.mainCategory is MainCategories.PrivateUse + def isUnassigned(self): return self.mainCategory is MainCategories.Unassigned + def isOther(self): return self.mainCategory is MainCategories.Other + + +# http://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt +def parseUnicodeDataFile(ucdFile): # { codepoint:int => Codepoint() } + ucd = {} + with open(ucdFile, 'r') as f: + for line in f: + # See http://unicode.org/reports/tr44/#UnicodeData.txt for fields + # e.g. "001D;;Cc;0;B;;;;;N;INFORMATION SEPARATOR THREE;;;;" + if len(line) == 0 or line.startswith('#'): + continue + v = line.split(';') + if len(v) < 10: + continue + try: + cp = Codepoint(v) + ucd[cp.codePoint] = cp + except: + pass + return ucd -- cgit v1.2.3