about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author H. Peter Anvin <hpa@smyrno.hos.anvin.org> 2005-11-29 23:25:06 -0800
committer H. Peter Anvin <hpa@smyrno.hos.anvin.org> 2005-11-29 23:25:06 -0800
commit 86a84dd7743ab0d8140b6a07f81e6b6cf7953b46 (patch)
tree 4dc5ff70d5a3495c8a748633d956cde40b9081ce
parent c76509f7077ced109ed221248c29495d6cbd40e2 (diff)
download libucd-86a84dd7743ab0d8140b6a07f81e6b6cf7953b46.tar.gz
Add enumerations for additional properties; actually generate property array
-rw-r--r-- Makefile 25
-rwxr-xr-x convert_ucd.pl 75
-rw-r--r-- libucd_int.h 16
-rw-r--r-- ucd.h 416
4 files changed, 383 insertions, 149 deletions
diff --git a/Makefile b/Makefile
index 73de55a..4adca95 100644
--- a/Makefile
+++ b/Makefile
@@ -12,7 +12,9 @@ HOST_LIBS =
# These are the files produced by convert_ucd.pl
#
CVT_FILES = gen/jamo.c gen/nameslist.c gen/nametoucs.keys \
- gen/ucstoname.keys gen/propdump.txt
+ gen/ucstoname.keys gen/proparray.c
+
+# -----------------------------------------------------------------------
.SUFFIXES: .c .o .lo .s .ls .i .li .cc .h
.c.o:
@@ -28,7 +30,18 @@ CVT_FILES = gen/jamo.c gen/nameslist.c gen/nametoucs.keys \
.c.li:
$(HOST_CC) $(HOST_CFLAGS) -E -o $@ $<
-all : gen/done perfect/perfect gen/nametoucs_hash.o gen/ucstoname_hash.o
+# -----------------------------------------------------------------------
+
+all : perfect/perfect gen/nametoucs_hash.o gen/ucstoname_hash.o \
+ proparray.o
+
+
+clean:
+ rm -rf gen
+ rm -f *.o *.a *.so *.so.*
+ $(MAKE) -C perfect clean
+
+# -----------------------------------------------------------------------
$(CVT_FILES) : gen/done
@@ -54,10 +67,6 @@ gen/ucstoname_hash.c: gen/ucstoname.keys perfect/perfect
gen/ucstoname_hash.h: gen/ucstoname_hash.c
: Generated by side effect
-clean:
- rm -rf gen
- rm -f *.o *.a *.so *.so.*
- $(MAKE) -C perfect clean
-
-
+# -----------------------------------------------------------------------
+proparray.o: proparray.c ucd.h libucd_int.h gen/proparray.c
diff --git a/convert_ucd.pl b/convert_ucd.pl
index 2da914f..5d48cfe 100755
--- a/convert_ucd.pl
+++ b/convert_ucd.pl
@@ -4,8 +4,6 @@
# into data for libucd.
#
-use POSIX;
-
#
# Internally this file uses a hash with the UCS value as key, and
# as data another hash from property name to value.
@@ -35,6 +33,7 @@ sub read_separated_file($$$) {
my $line, @fields, $c0, $c1, $c;
my $was_first = 0;
+ print STDERR "Reading $filename\n";
open($fh, '<', $filename) or return 0;
while ( defined($line = <$fh>) ) {
chomp $line;
@@ -116,6 +115,7 @@ sub read_boolean_file($) {
my $fh;
my $line, @fields, $c0, $c1, $c;
+ print STDERR "Reading $filename\n";
open($fh, '<', $filename) or return 0;
while ( defined($line = <$fh>) ) {
chomp $line;
@@ -141,27 +141,6 @@ sub read_boolean_file($) {
return 1;
}
-# UCD numeric values are given as 8-significant figures floating-point
-# numbers, but in reality they are all fractions. This converts a
-# floating-point number to a numerator and denominator with just about
-# enough fuzz.
-#
-# Note: the denominator will always be positive, and will always be 1
-# if the number is an integer.
-sub make_fraction($) {
- my ($v) = @_;
- my $n, $d, $minus;
-
- return [0, 1] if ( $v == 0 );
-
- $d = 1;
- while ( 1 ) {
- $n = floor($v*$d + 0.5);
- return [$n, $d] if ( abs($n/$d-$v)/$v < 1e-7 );
- $d++;
- }
-}
-
sub make_jamo_string($) {
my ($s) = @_;
my $i, $c;
@@ -191,6 +170,7 @@ sub make_jamo_tables() {
# None of the syllables exceed 4 bytes, so let's not waste
# pointer space that might have to be relocated...
+ print STDERR "Writing gen/jamo.c\n";
open($fh, '>', 'gen/jamo.c') or die "$0 cannot create gen/jamo.c";
print $fh "#include \"libucd_int.h\"\n\n";
@@ -226,6 +206,7 @@ sub make_names_list() {
my $fh;
my $col;
+ print STDERR "Writing gen/nameslist.c\n";
open($fh, '>', 'gen/nameslist.c') or die "$0: Cannot create gen/nameslist.c";
print $fh "#include \"libucd_int.h\"\n\n";
@@ -297,6 +278,7 @@ sub make_name_keyfile()
my $fh;
my $k;
+ print STDERR "Writing gen/nametoucs.keys\n";
open($fh, '>', 'gen/nametoucs.keys')
or die "$0: cannot write gen/nametoucs.keys\n";
@@ -317,6 +299,7 @@ sub make_named_ucs_keyfile()
my $fh;
my $k;
+ print STDERR "Writing gen/ucstoname.keys\n";
open($fh, '>', 'gen/ucstoname.keys')
or die "$0: cannot write gen/ucstoname.keys\n";
@@ -336,6 +319,7 @@ sub dump_prop_list()
{
my $fh, $c;
+ print STDERR "Writing gen/propdump.txt\n";
open($fh, '>', 'gen/propdump.txt')
or die "$0: cannot write gen/propdump.txt\n";
binmode $fh, ':utf8';
@@ -405,6 +389,7 @@ sub make_properties_array()
'Terminal_Punctuation', 'Unified_Ideograph', 'Variation_Selector',
'White_Space', 'Bidi_Mirrored');
+ print STDERR "Writing gen/proparray.c\n";
open($fh, '>', 'gen/proparray.c') or die;
binmode $fh, ':utf8';
@@ -454,9 +439,9 @@ sub make_properties_array()
$mine .= "\t\tUC_FL_\U$bp\E |\n";
}
}
- my $block = $$cp{'Block'};
+ my $block = $$cp{'Block'} || 'No_Block';
$block =~ tr/ .-/___/;
- $mine .= "\t\t(UC_BLK_$block << 48),\n";
+ $mine .= "\t\t((uint64_t)UC_BLK_$block << 48),\n";
# Simple case mappings
my $sum = ($$cp{'Simple_Uppercase_Mapping'} || $c) - $c;
@@ -472,19 +457,36 @@ sub make_properties_array()
my (@sage) = split(/\./, $age);
$mine .= sprintf("\t\t(%d << 5) + %d, /* $age */\n", $sage[0], $sage[1]);
- # Padding
- $mine .= "\t\t{ 0, 0, },\n";
+ # Canonical Combining Class
+ my $ccc = $$cp{'Canonical_Combining_Class'} || 'NR';
+ if ( $ccc =~ /^[0-9]+$/ ) {
+ $mine .= "\t\t$ccc,\n"; # Numeric CCC
+ } else {
+ $mine .= "\t\tUC_CCC_$ccc,\n";
+ }
+
+ # Sentence Break
+ my $sb = $$cp{'Sentence_Break'} || 'Other';
+ $mine .= "\t\tUC_SB_$sb,\n";
+
+ # Grapheme Cluster Break
+ my $gcb = $$cp{'Grapheme_Cluster_Break'} || 'Other';
+ $mine .= "\t\tUC_GCB_$gcb,\n";
+
+ # Word Break
+ my $wb = $$cp{'Word_Break'} || 'Other';
+ $mine .= "\t\tUC_WB_$wb,\n";
# Arabic Joining Type
- my $ajt = $$cp{'Arabic_Joining_Type'} ||
+ my $ajt = $$cp{'Joining_Type'} ||
($gc eq 'Mn' || $gc eq 'Me' || $gc eq 'Cf') ? 'T' : 'U';
- $mine .= "\t\tUC_AJT_$ajt,\n";
+ $mine .= "\t\tUC_JT_$ajt,\n";
# Arabic Joining Group
- my $ajg = $$cp{'Arabic_Joining_Group'} || 'No_Joining_Group';
+ my $ajg = $$cp{'Joining_Group'} || 'No_Joining_Group';
$ajg =~ tr/ /_/;
$ajg =~ s/([A-Z])([A-Z]+)/$1\L$2\E/g;
- $mine .= "\t\tUC_AJG_$ajg,\n";
+ $mine .= "\t\tUC_JG_$ajg,\n";
# East Asian Width
my $eaw = $$cp{'East_Asian_Width'} || 'N';
@@ -502,13 +504,9 @@ sub make_properties_array()
my $nt = $$cp{'Numeric_Type'} || 'None';
$mine .= "\t\tUC_NT_$nt,\n";
- # Canonical Combining Class
- my $ccc = $$cp{'Canonical_Combining_Class'} || 'NR';
- $mine .= "\t\tUC_CCC_$ccc,\n";
-
# Bidi Class
my $bc = $$cp{'Bidi_Class'} || 'L';
- $mine .= "\t\tUC_BC_$bc,\n";
+ $mine .= "\t\tUC_BIDI_$bc,\n";
# Additional properties...
$mine .= "\t},\n";
@@ -553,6 +551,9 @@ read_separated_file('ucd/Scripts.txt', ['cScript'], ['Common']);
read_separated_file('ucd/SpecialCasing.txt', ['sUppercase_Mapping', 'sLowercase_Mapping',
'sTitlecase_Mapping', 'mSpecial_Case_Condition'], []);
read_separated_file('ucd/Jamo.txt', ['mJamo_Short_Name'], []);
+read_separated_file('ucd/auxilliary/GraphemeBreakProperty.txt', ['eGrapheme_Cluster_Break'], []);
+read_separated_file('ucd/auxilliary/SentenceBreakProperty.txt', ['eSentence_Break'], []);
+read_separated_file('ucd/auxilliary/WordBreakProperty.txt', ['eWord_Break'], []);
read_boolean_file('ucd/DerivedCoreProperties.txt');
read_boolean_file('ucd/PropList.txt');
@@ -564,4 +565,4 @@ make_names_list();
make_name_keyfile();
make_named_ucs_keyfile();
make_properties_array();
-dump_prop_list();
+# dump_prop_list();
diff --git a/libucd_int.h b/libucd_int.h
index 8ab2f06..c23b8ed 100644
--- a/libucd_int.h
+++ b/libucd_int.h
@@ -15,26 +15,30 @@ extern const char _libucd_hangul_jamo_l[][4];
extern const char _libucd_hangul_jamo_v[][4];
extern const char _libucd_hangul_jamo_t[][4];
+/* This structure is exactly 32 bytes long, nice and alignable. */
struct _libucd_property_array {
int32_t ucd; /* Wasteful but fast (used in search) */
uint8_t general_category;
uint8_t script;
uint8_t numeric_value_num;
uint8_t numeric_value_den_exp; /* bit 7 = 1 if exponent */
- uint64_t flags_block; /* Block index is high byte */
+ uint64_t flags_block; /* Block index is byte 6, byte 7 free */
int24 simple_uppercase;
int24 simple_lowercase;
int24 simple_titlecase;
uint8_t age; /* (major << 5) + minor */
- uint8_t pad[2]; /* Do something useful here... */
- unsigned arabic_joining_type :3;
- unsigned arabic_joining_group :6;
+ uint8_t combining_class;
+ unsigned sentence_break :4;
+ unsigned grapheme_cluster_break :4;
+ unsigned word_break :3;
+ unsigned joining_type :3;
+ unsigned joining_group :6;
unsigned east_asian_width :3;
unsigned hangul_syllable_type :3;
- unsigned line_break :5;
+ unsigned line_break :6;
unsigned numeric_type :2;
- unsigned combining_class :5;
unsigned bidi_class :5;
+ unsigned /* unused */ :1;
};
#endif
diff --git a/ucd.h b/ucd.h
index 4c0b898..6c51d72 100644
--- a/ucd.h
+++ b/ucd.h
@@ -90,16 +90,16 @@ enum unicode_east_asian_width {
};
enum unicode_grapheme_cluster_break {
- UC_GCB_XX = 0, /* Other */
- UC_GCB_CN, /* Control */
- UC_GCB_CR, /* CR */
- UC_GCB_EX, /* Extend */
- UC_GCB_L, /* L */
- UC_GCB_LF, /* LF */
- UC_GCB_LV, /* LV */
- UC_GCB_LVT, /* LVT */
- UC_GCB_T, /* T */
- UC_GCB_V, /* V */
+ UC_GCB_Other = 0, /* Other */
+ UC_GCB_Control,
+ UC_GCB_CR,
+ UC_GCB_Extend,
+ UC_GCB_L,
+ UC_GCB_LF,
+ UC_GCB_LV,
+ UC_GCB_LVT,
+ UC_GCB_T,
+ UC_GCB_V,
};
enum unicode_hangul_syllable_type {
@@ -110,66 +110,70 @@ enum unicode_hangul_syllable_type {
UC_HST_T, /* Trailing_Jamo */
UC_HST_V, /* Vowel_Jamo */
};
-
-enum unicode_arabic_joining_group {
- UC_AJC_None = 0,
- UC_AJG_Ain,
- UC_AJC_Alaph,
- UC_AJC_Alef,
- UC_AJC_Beh,
- UC_AJC_Beth,
- UC_AJC_Dal,
- UC_AJC_Dalath_Rish,
- UC_AJC_E,
- UC_AJC_Fe,
- UC_AJC_Feh,
- UC_AJC_Final_Semkath,
- UC_AJC_Gaf,
- UC_AJC_Gamal,
- UC_AJC_Hah,
- UC_AJC_Hamza_On_Heh_Goal,
- UC_AJC_He,
- UC_AJC_Heh,
- UC_AJC_Heh_Goal,
- UC_AJC_Heth,
- UC_AJC_Kaf,
- UC_AJC_Kaph,
- UC_AJC_Khaph,
- UC_AJC_Knotted_Heh,
- UC_AJC_Lam,
- UC_AJC_Lamadh,
- UC_AJC_Meem,
- UC_AJC_Mim,
- UC_AJC_Noon,
- UC_AJC_Nun,
- UC_AJC_Pe,
- UC_AJC_Qaf,
- UC_AJC_Qaph,
- UC_AJC_Reh,
- UC_AJC_Reversed_Pe,
- UC_AJC_Sad,
- UC_AJC_Sadhe,
- UC_AJC_Seen,
- UC_AJC_Semkath,
- UC_AJC_Shin,
- UC_AJC_Swash_Kaf,
- UC_AJC_Syriac_Waw,
- UC_AJC_Tah,
- UC_AJC_Taw,
- UC_AJC_Teh_Marbuta,
- UC_AJC_Teth,
- UC_AJC_Waw,
- UC_AJC_Yeh,
- UC_AJC_Yeh_Barree,
- UC_AJC_Yeh_With_Tail,
- UC_AJC_Yudh,
- UC_AJC_Yudh_He,
- UC_AJC_Zain,
- UC_AJC_Zhain,
+enum unicode_joining_group {
+ UC_JG_No_Joining_Group = 0,
+ UC_JG_Ain,
+ UC_JG_Alaph,
+ UC_JG_Alef,
+ UC_JG_Beh,
+ UC_JG_Beth,
+ UC_JG_Dal,
+ UC_JG_Dalath_Rish,
+ UC_JG_E,
+ UC_JG_Fe,
+ UC_JG_Feh,
+ UC_JG_Final_Semkath,
+ UC_JG_Gaf,
+ UC_JG_Gamal,
+ UC_JG_Hah,
+ UC_JG_Hamza_On_Heh_Goal,
+ UC_JG_He,
+ UC_JG_Heh,
+ UC_JG_Heh_Goal,
+ UC_JG_Heth,
+ UC_JG_Kaf,
+ UC_JG_Kaph,
+ UC_JG_Khaph,
+ UC_JG_Knotted_Heh,
+ UC_JG_Lam,
+ UC_JG_Lamadh,
+ UC_JG_Meem,
+ UC_JG_Mim,
+ UC_JG_Noon,
+ UC_JG_Nun,
+ UC_JG_Pe,
+ UC_JG_Qaf,
+ UC_JG_Qaph,
+ UC_JG_Reh,
+ UC_JG_Reversed_Pe,
+ UC_JG_Sad,
+ UC_JG_Sadhe,
+ UC_JG_Seen,
+ UC_JG_Semkath,
+ UC_JG_Shin,
+ UC_JG_Swash_Kaf,
+ UC_JG_Syriac_Waw,
+ UC_JG_Tah,
+ UC_JG_Taw,
+ UC_JG_Teh_Marbuta,
+ UC_JG_Teth,
+ UC_JG_Waw,
+ UC_JG_Yeh,
+ UC_JG_Yeh_Barree,
+ UC_JG_Yeh_With_Tail,
+ UC_JG_Yudh,
+ UC_JG_Yudh_He,
+ UC_JG_Zain,
+ UC_JG_Zhain,
};
-enum unicode_arabic_joining_type {
- UC_AJT_None,
+enum unicode_joining_type {
+ UC_JT_U = 0,
+ UC_JT_R,
+ UC_JT_L,
+ UC_JT_D,
+ UC_JT_C,
+ UC_JT_T,
};
enum unicode_ternary {
@@ -179,35 +183,35 @@ enum unicode_ternary {
};
enum unicode_numeric_type {
- UC_NT_None, /* Not numeric */
- UC_NT_Nu, /* Numeric */
- UC_NT_Di, /* Digit */
- UC_NT_De, /* Decimal digit */
+ UC_NT_None = 0,
+ UC_NT_Numeric,
+ UC_NT_Digit,
+ UC_NT_Decimal,
};
enum unicode_sentence_break {
- UC_SB_XX = 0,
- UC_SB_AT,
- UC_SB_CL,
- UC_SB_FO,
- UC_SB_LE,
- UC_SB_LO,
- UC_SB_NU,
- UC_SB_SE,
- UC_SB_SP,
- UC_SB_ST,
- UC_SB_UP,
+ UC_SB_Other = 0,
+ UC_SB_Sep,
+ UC_SB_Format,
+ UC_SB_Sp,
+ UC_SB_Lower,
+ UC_SB_Upper,
+ UC_SB_OLetter,
+ UC_SB_Numeric,
+ UC_SB_ATerm,
+ UC_SB_STerm,
+ UC_SB_Close,
};
enum unicode_word_break {
- UC_WB_XX = 0,
- UC_WB_EX,
- UC_WB_FO,
- UC_WB_KA,
- UC_WB_LE,
- UC_WB_ML,
- UC_WB_MN,
- UC_WB_NU,
+ UC_WB_Other = 0,
+ UC_WB_Format,
+ UC_WB_Katakana,
+ UC_WB_ALetter,
+ UC_WB_MidLetter,
+ UC_WB_MidNum,
+ UC_WB_Numeric,
+ UC_WB_ExtendNumLet,
};
enum unicode_line_break {
@@ -250,8 +254,7 @@ enum unicode_line_break {
};
enum unicode_general_category {
- UC_GC_XX = 0,
- UC_GC_Cn,
+ UC_GC_Cn = 0,
UC_GC_Cc,
UC_GC_Cf,
UC_GC_Co,
@@ -278,11 +281,225 @@ enum unicode_general_category {
UC_GC_Sk,
UC_GC_Sm,
UC_GC_So,
- UC_GC_Zl,
UC_GC_Sp,
+ UC_GC_Zl,
+ UC_GC_Zp,
UC_GC_Zs,
};
+enum unicode_script {
+ UC_SCR_Common = 0,
+ UC_SCR_Latin,
+ UC_SCR_Greek,
+ UC_SCR_Cyrillic,
+ UC_SCR_Armenian,
+ UC_SCR_Hebrew,
+ UC_SCR_Arabic,
+ UC_SCR_Syriac,
+ UC_SCR_Thaana,
+ UC_SCR_Devanagari,
+ UC_SCR_Bengali,
+ UC_SCR_Gurmukhi,
+ UC_SCR_Gujarati,
+ UC_SCR_Oriya,
+ UC_SCR_Tamil,
+ UC_SCR_Telugu,
+ UC_SCR_Kannada,
+ UC_SCR_Malayalam,
+ UC_SCR_Sinhala,
+ UC_SCR_Thai,
+ UC_SCR_Lao,
+ UC_SCR_Tibetan,
+ UC_SCR_Myanmar,
+ UC_SCR_Georgian,
+ UC_SCR_Hangul,
+ UC_SCR_Ethiopic,
+ UC_SCR_Cherokee,
+ UC_SCR_Canadian_Aboriginal,
+ UC_SCR_Ogham,
+ UC_SCR_Runic,
+ UC_SCR_Khmer,
+ UC_SCR_Mongolian,
+ UC_SCR_Hiragana,
+ UC_SCR_Katakana,
+ UC_SCR_Bopomofo,
+ UC_SCR_Han,
+ UC_SCR_Yi,
+ UC_SCR_Old_Italic,
+ UC_SCR_Gothic,
+ UC_SCR_Deseret,
+ UC_SCR_Inherited,
+ UC_SCR_Tagalog,
+ UC_SCR_Hanunoo,
+ UC_SCR_Buhid,
+ UC_SCR_Tagbanwa,
+ UC_SCR_Limbu,
+ UC_SCR_Tai_Le,
+ UC_SCR_Linear_B,
+ UC_SCR_Ugaritic,
+ UC_SCR_Shavian,
+ UC_SCR_Osmanya,
+ UC_SCR_Cypriot,
+ UC_SCR_Braille,
+ UC_SCR_Buginese,
+ UC_SCR_Coptic,
+ UC_SCR_New_Tai_Lue,
+ UC_SCR_Glagolitic,
+ UC_SCR_Tifinagh,
+ UC_SCR_Syloti_Nagri,
+ UC_SCR_Old_Persian,
+ UC_SCR_Kharoshthi,
+};
+
+enum unicode_block {
+ UC_BLK_No_Block = 0,
+ UC_BLK_Basic_Latin,
+ UC_BLK_Latin_1_Supplement,
+ UC_BLK_Latin_Extended_A,
+ UC_BLK_Latin_Extended_B,
+ UC_BLK_IPA_Extensions,
+ UC_BLK_Spacing_Modifier_Letters,
+ UC_BLK_Combining_Diacritical_Marks,
+ UC_BLK_Greek_and_Coptic,
+ UC_BLK_Cyrillic,
+ UC_BLK_Cyrillic_Supplement,
+ UC_BLK_Armenian,
+ UC_BLK_Hebrew,
+ UC_BLK_Arabic,
+ UC_BLK_Syriac,
+ UC_BLK_Arabic_Supplement,
+ UC_BLK_Thaana,
+ UC_BLK_Devanagari,
+ UC_BLK_Bengali,
+ UC_BLK_Gurmukhi,
+ UC_BLK_Gujarati,
+ UC_BLK_Oriya,
+ UC_BLK_Tamil,
+ UC_BLK_Telugu,
+ UC_BLK_Kannada,
+ UC_BLK_Malayalam,
+ UC_BLK_Sinhala,
+ UC_BLK_Thai,
+ UC_BLK_Lao,
+ UC_BLK_Tibetan,
+ UC_BLK_Myanmar,
+ UC_BLK_Georgian,
+ UC_BLK_Hangul_Jamo,
+ UC_BLK_Ethiopic,
+ UC_BLK_Ethiopic_Supplement,
+ UC_BLK_Cherokee,
+ UC_BLK_Unified_Canadian_Aboriginal_Syllabics,
+ UC_BLK_Ogham,
+ UC_BLK_Runic,
+ UC_BLK_Tagalog,
+ UC_BLK_Hanunoo,
+ UC_BLK_Buhid,
+ UC_BLK_Tagbanwa,
+ UC_BLK_Khmer,
+ UC_BLK_Mongolian,
+ UC_BLK_Limbu,
+ UC_BLK_Tai_Le,
+ UC_BLK_New_Tai_Lue,
+ UC_BLK_Khmer_Symbols,
+ UC_BLK_Buginese,
+ UC_BLK_Phonetic_Extensions,
+ UC_BLK_Phonetic_Extensions_Supplement,
+ UC_BLK_Combining_Diacritical_Marks_Supplement,
+ UC_BLK_Latin_Extended_Additional,
+ UC_BLK_Greek_Extended,
+ UC_BLK_General_Punctuation,
+ UC_BLK_Superscripts_and_Subscripts,
+ UC_BLK_Currency_Symbols,
+ UC_BLK_Combining_Diacritical_Marks_for_Symbols,
+ UC_BLK_Letterlike_Symbols,
+ UC_BLK_Number_Forms,
+ UC_BLK_Arrows,
+ UC_BLK_Mathematical_Operators,
+ UC_BLK_Miscellaneous_Technical,
+ UC_BLK_Control_Pictures,
+ UC_BLK_Optical_Character_Recognition,
+ UC_BLK_Enclosed_Alphanumerics,
+ UC_BLK_Box_Drawing,
+ UC_BLK_Block_Elements,
+ UC_BLK_Geometric_Shapes,
+ UC_BLK_Miscellaneous_Symbols,
+ UC_BLK_Dingbats,
+ UC_BLK_Miscellaneous_Mathematical_Symbols_A,
+ UC_BLK_Supplemental_Arrows_A,
+ UC_BLK_Braille_Patterns,
+ UC_BLK_Supplemental_Arrows_B,
+ UC_BLK_Miscellaneous_Mathematical_Symbols_B,
+ UC_BLK_Supplemental_Mathematical_Operators,
+ UC_BLK_Miscellaneous_Symbols_and_Arrows,
+ UC_BLK_Glagolitic,
+ UC_BLK_Coptic,
+ UC_BLK_Georgian_Supplement,
+ UC_BLK_Tifinagh,
+ UC_BLK_Ethiopic_Extended,
+ UC_BLK_Supplemental_Punctuation,
+ UC_BLK_CJK_Radicals_Supplement,
+ UC_BLK_Kangxi_Radicals,
+ UC_BLK_Ideographic_Description_Characters,
+ UC_BLK_CJK_Symbols_and_Punctuation,
+ UC_BLK_Hiragana,
+ UC_BLK_Katakana,
+ UC_BLK_Bopomofo,
+ UC_BLK_Hangul_Compatibility_Jamo,
+ UC_BLK_Kanbun,
+ UC_BLK_Bopomofo_Extended,
+ UC_BLK_CJK_Strokes,
+ UC_BLK_Katakana_Phonetic_Extensions,
+ UC_BLK_Enclosed_CJK_Letters_and_Months,
+ UC_BLK_CJK_Compatibility,
+ UC_BLK_CJK_Unified_Ideographs_Extension_A,
+ UC_BLK_Yijing_Hexagram_Symbols,
+ UC_BLK_CJK_Unified_Ideographs,
+ UC_BLK_Yi_Syllables,
+ UC_BLK_Yi_Radicals,
+ UC_BLK_Modifier_Tone_Letters,
+ UC_BLK_Syloti_Nagri,
+ UC_BLK_Hangul_Syllables,
+ UC_BLK_High_Surrogates,
+ UC_BLK_High_Private_Use_Surrogates,
+ UC_BLK_Low_Surrogates,
+ UC_BLK_Private_Use_Area,
+ UC_BLK_CJK_Compatibility_Ideographs,
+ UC_BLK_Alphabetic_Presentation_Forms,
+ UC_BLK_Arabic_Presentation_Forms_A,
+ UC_BLK_Variation_Selectors,
+ UC_BLK_Vertical_Forms,
+ UC_BLK_Combining_Half_Marks,
+ UC_BLK_CJK_Compatibility_Forms,
+ UC_BLK_Small_Form_Variants,
+ UC_BLK_Arabic_Presentation_Forms_B,
+ UC_BLK_Halfwidth_and_Fullwidth_Forms,
+ UC_BLK_Specials,
+ UC_BLK_Linear_B_Syllabary,
+ UC_BLK_Linear_B_Ideograms,
+ UC_BLK_Aegean_Numbers,
+ UC_BLK_Ancient_Greek_Numbers,
+ UC_BLK_Old_Italic,
+ UC_BLK_Gothic,
+ UC_BLK_Ugaritic,
+ UC_BLK_Old_Persian,
+ UC_BLK_Deseret,
+ UC_BLK_Shavian,
+ UC_BLK_Osmanya,
+ UC_BLK_Cypriot_Syllabary,
+ UC_BLK_Kharoshthi,
+ UC_BLK_Byzantine_Musical_Symbols,
+ UC_BLK_Musical_Symbols,
+ UC_BLK_Ancient_Greek_Musical_Notation,
+ UC_BLK_Tai_Xuan_Jing_Symbols,
+ UC_BLK_Mathematical_Alphanumeric_Symbols,
+ UC_BLK_CJK_Unified_Ideographs_Extension_B,
+ UC_BLK_CJK_Compatibility_Ideographs_Supplement,
+ UC_BLK_Tags,
+ UC_BLK_Variation_Selectors_Supplement,
+ UC_BLK_Supplementary_Private_Use_Area_A,
+ UC_BLK_Supplementary_Private_Use_Area_B,
+};
+
#define UC_FLAG(x) (UINT64_C(1) << (x))
#define UC_FL_COMPOSITION_EXCLUSION UC_FLAG(0)
@@ -326,7 +543,6 @@ struct unicode_character_data {
uint64_t fl; /* Flags */
const char *name;
const char *bidi_mirroring_glyph;
- const char *block;
const char *uppercase_mapping;
const char *lowercase_mapping;
const char *titlecase_mapping;
@@ -339,15 +555,19 @@ struct unicode_character_data {
uint8_t numeric_value_den;
uint8_t numeric_value_exp;
enum unicode_general_category general_category;
+ enum unicode_block block;
enum unicode_script script;
- enum unicode_arabic_joining_type arabic_joining_type;
- enum unicode_arabic_joining_group arabic_joining_group;
+ enum unicode_joining_type joining_type;
+ enum unicode_joining_group joining_group;
enum unicode_east_asian_width east_asian_width;
enum unicode_hangul_syllable_type hangul_syllable_type;
- enum unicode_line_break line_break;
enum unicode_numeric_type numeric_type;
enum unicode_combining_class combining_class;
enum unicode_bidi_class bidi_class;
+ enum unicode_grapheme_cluster_break grapheme_cluster_break;
+ enum unicode_sentence_break sentence_break;
+ enum unicode_word_break word_break;
+ enum unicode_line_break line_break;
};
#endif /* UCD_H */