diff options
author | H. Peter Anvin <hpa@smyrno.hos.anvin.org> | 2005-11-29 23:25:06 -0800 |
---|---|---|
committer | H. Peter Anvin <hpa@smyrno.hos.anvin.org> | 2005-11-29 23:25:06 -0800 |
commit | 86a84dd7743ab0d8140b6a07f81e6b6cf7953b46 (patch) | |
tree | 4dc5ff70d5a3495c8a748633d956cde40b9081ce | |
parent | c76509f7077ced109ed221248c29495d6cbd40e2 (diff) | |
download | libucd-86a84dd7743ab0d8140b6a07f81e6b6cf7953b46.tar.gz |
Add enumerations for additional properties; actually generate property array
-rw-r--r-- | Makefile | 25 | ||||
-rwxr-xr-x | convert_ucd.pl | 75 | ||||
-rw-r--r-- | libucd_int.h | 16 | ||||
-rw-r--r-- | ucd.h | 416 |
4 files changed, 383 insertions, 149 deletions
@@ -12,7 +12,9 @@ HOST_LIBS =
 # These are the files produced by convert_ucd.pl
 #
 CVT_FILES = gen/jamo.c gen/nameslist.c gen/nametoucs.keys \
-	gen/ucstoname.keys gen/propdump.txt
+	gen/ucstoname.keys gen/proparray.c
+
+# -----------------------------------------------------------------------
 
 .SUFFIXES: .c .o .lo .s .ls .i .li .cc .h
 .c.o:
@@ -28,7 +30,18 @@ CVT_FILES = gen/jamo.c gen/nameslist.c gen/nametoucs.keys \
 .c.li:
 	$(HOST_CC) $(HOST_CFLAGS) -E -o $@ $<
 
-all : gen/done perfect/perfect gen/nametoucs_hash.o gen/ucstoname_hash.o
+# -----------------------------------------------------------------------
+
+all : perfect/perfect gen/nametoucs_hash.o gen/ucstoname_hash.o \
+	proparray.o
+
+
+clean:
+	rm -rf gen
+	rm -f *.o *.a *.so *.so.*
+	$(MAKE) -C perfect clean
+
+# -----------------------------------------------------------------------
 
 $(CVT_FILES) : gen/done
 
@@ -54,10 +67,6 @@ gen/ucstoname_hash.c: gen/ucstoname.keys perfect/perfect
 gen/ucstoname_hash.h: gen/ucstoname_hash.c
 	: Generated by side effect
 
-clean:
-	rm -rf gen
-	rm -f *.o *.a *.so *.so.*
-	$(MAKE) -C perfect clean
-
-
+# -----------------------------------------------------------------------
 
+proparray.o: proparray.c ucd.h libucd_int.h gen/proparray.c
diff --git a/convert_ucd.pl b/convert_ucd.pl
index 2da914f..5d48cfe 100755
--- a/convert_ucd.pl
+++ b/convert_ucd.pl
@@ -4,8 +4,6 @@
 # into data for libucd.
 #
 
-use POSIX;
-
 #
 # Internally this file uses a hash with the UCS value as key, and
 # as data another hash from property name to value.
@@ -35,6 +33,7 @@ sub read_separated_file($$$) {
     my $line, @fields, $c0, $c1, $c;
     my $was_first = 0;
 
+    print STDERR "Reading $filename\n";
     open($fh, '<', $filename) or return 0;
     while ( defined($line = <$fh>) ) {
         chomp $line;
@@ -116,6 +115,7 @@ sub read_boolean_file($) {
     my $fh;
     my $line, @fields, $c0, $c1, $c;
 
+    print STDERR "Reading $filename\n";
     open($fh, '<', $filename) or return 0;
     while ( defined($line = <$fh>) ) {
         chomp $line;
@@ -141,27 +141,6 @@ sub read_boolean_file($) {
     return 1;
 }
 
-# UCD numeric values are given as 8-significant figures floating-point
-# numbers, but in reality they are all fractions. This converts a
-# floating-point number to a numerator and denominator with just about
-# enough fuzz.
-#
-# Note: the denominator will always be positive, and will always be 1
-# if the number is an integer.
-sub make_fraction($) {
-    my ($v) = @_;
-    my $n, $d, $minus;
-
-    return [0, 1] if ( $v == 0 );
-
-    $d = 1;
-    while ( 1 ) {
-        $n = floor($v*$d + 0.5);
-        return [$n, $d] if ( abs($n/$d-$v)/$v < 1e-7 );
-        $d++;
-    }
-}
-
 sub make_jamo_string($) {
     my ($s) = @_;
     my $i, $c;
@@ -191,6 +170,7 @@ sub make_jamo_tables() {
     # None of the syllables exceed 4 bytes, so let's not waste
     # pointer space that might have to be relocated...
 
+    print STDERR "Writing gen/jamo.c\n";
     open($fh, '>', 'gen/jamo.c')
         or die "$0 cannot create gen/jamo.c";
     print $fh "#include \"libucd_int.h\"\n\n";
@@ -226,6 +206,7 @@ sub make_names_list() {
     my $fh;
     my $col;
 
+    print STDERR "Writing gen/nameslist.c\n";
     open($fh, '>', 'gen/nameslist.c')
         or die "$0: Cannot create gen/nameslist.c";
     print $fh "#include \"libucd_int.h\"\n\n";
@@ -297,6 +278,7 @@ sub make_name_keyfile()
     my $fh;
     my $k;
 
+    print STDERR "Writing gen/nametoucs.keys\n";
     open($fh, '>', 'gen/nametoucs.keys')
         or die "$0: cannot write gen/nametoucs.keys\n";
 
@@ -317,6 +299,7 @@ sub make_named_ucs_keyfile()
     my $fh;
     my $k;
 
+    print STDERR "Writing gen/ucstoname.keys\n";
     open($fh, '>', 'gen/ucstoname.keys')
         or die "$0: cannot write gen/ucstoname.keys\n";
 
@@ -336,6 +319,7 @@ sub dump_prop_list()
 {
     my $fh, $c;
 
+    print STDERR "Writing gen/propdump.txt\n";
     open($fh, '>', 'gen/propdump.txt')
         or die "$0: cannot write gen/propdump.txt\n";
     binmode $fh, ':utf8';
@@ -405,6 +389,7 @@ sub make_properties_array()
                  'Terminal_Punctuation', 'Unified_Ideograph',
                  'Variation_Selector', 'White_Space', 'Bidi_Mirrored');
 
+    print STDERR "Writing gen/proparray.c\n";
     open($fh, '>', 'gen/proparray.c') or die;
     binmode $fh, ':utf8';
 
@@ -454,9 +439,9 @@ sub make_properties_array()
                 $mine .= "\t\tUC_FL_\U$bp\E |\n";
             }
         }
-        my $block = $$cp{'Block'};
+        my $block = $$cp{'Block'} || 'No_Block';
         $block =~ tr/ .-/___/;
-        $mine .= "\t\t(UC_BLK_$block << 48),\n";
+        $mine .= "\t\t((uint64_t)UC_BLK_$block << 48),\n";
 
         # Simple case mappings
         my $sum = ($$cp{'Simple_Uppercase_Mapping'} || $c) - $c;
@@ -472,19 +457,36 @@ sub make_properties_array()
         my (@sage) = split(/\./, $age);
         $mine .= sprintf("\t\t(%d << 5) + %d, /* $age */\n", $sage[0], $sage[1]);
 
-        # Padding
-        $mine .= "\t\t{ 0, 0, },\n";
+        # Canonical Combining Class
+        my $ccc = $$cp{'Canonical_Combining_Class'} || 'NR';
+        if ( $ccc =~ /^[0-9]+$/ ) {
+            $mine .= "\t\t$ccc,\n"; # Numeric CCC
+        } else {
+            $mine .= "\t\tUC_CCC_$ccc,\n";
+        }
+
+        # Sentence Break
+        my $sb = $$cp{'Sentence_Break'} || 'Other';
+        $mine .= "\t\tUC_SB_$sb,\n";
+
+        # Grapheme Cluster Break
+        my $gcb = $$cp{'Grapheme_Cluster_Break'} || 'Other';
+        $mine .= "\t\tUC_GCB_$gcb,\n";
+
+        # Word Break
+        my $wb = $$cp{'Word_Break'} || 'Other';
+        $mine .= "\t\tUC_WB_$wb,\n";
 
         # Arabic Joining Type
-        my $ajt = $$cp{'Arabic_Joining_Type'} ||
+        my $ajt = $$cp{'Joining_Type'} ||
             ($gc eq 'Mn' || $gc eq 'Me' || $gc eq 'Cf') ? 'T' : 'U';
-        $mine .= "\t\tUC_AJT_$ajt,\n";
+        $mine .= "\t\tUC_JT_$ajt,\n";
 
         # Arabic Joining Group
-        my $ajg = $$cp{'Arabic_Joining_Group'} || 'No_Joining_Group';
+        my $ajg = $$cp{'Joining_Group'} || 'No_Joining_Group';
         $ajg =~ tr/ /_/;
         $ajg =~ s/([A-Z])([A-Z]+)/$1\L$2\E/g;
-        $mine .= "\t\tUC_AJG_$ajg,\n";
+        $mine .= "\t\tUC_JG_$ajg,\n";
 
         # East Asian Width
         my $eaw = $$cp{'East_Asian_Width'} || 'N';
@@ -502,13 +504,9 @@ sub make_properties_array()
         my $nt = $$cp{'Numeric_Type'} || 'None';
         $mine .= "\t\tUC_NT_$nt,\n";
 
-        # Canonical Combining Class
-        my $ccc = $$cp{'Canonical_Combining_Class'} || 'NR';
-        $mine .= "\t\tUC_CCC_$ccc,\n";
-
         # Bidi Class
         my $bc = $$cp{'Bidi_Class'} || 'L';
-        $mine .= "\t\tUC_BC_$bc,\n";
+        $mine .= "\t\tUC_BIDI_$bc,\n";
 
         # Additional properties...
         $mine .= "\t},\n";
@@ -553,6 +551,9 @@ read_separated_file('ucd/Scripts.txt', ['cScript'], ['Common']);
 read_separated_file('ucd/SpecialCasing.txt',
                     ['sUppercase_Mapping', 'sLowercase_Mapping',
                      'sTitlecase_Mapping', 'mSpecial_Case_Condition'], []);
 read_separated_file('ucd/Jamo.txt', ['mJamo_Short_Name'], []);
+read_separated_file('ucd/auxilliary/GraphemeBreakProperty.txt', ['eGrapheme_Cluster_Break'], []);
+read_separated_file('ucd/auxilliary/SentenceBreakProperty.txt', ['eSentence_Break'], []);
+read_separated_file('ucd/auxilliary/WordBreakProperty.txt', ['eWord_Break'], []);
 read_boolean_file('ucd/DerivedCoreProperties.txt');
 read_boolean_file('ucd/PropList.txt');
@@ -564,4 +565,4 @@ make_names_list();
 make_name_keyfile();
 make_named_ucs_keyfile();
 make_properties_array();
-dump_prop_list();
+# dump_prop_list();
diff --git a/libucd_int.h b/libucd_int.h
index 8ab2f06..c23b8ed 100644
--- a/libucd_int.h
+++ b/libucd_int.h
@@ -15,26 +15,30 @@ extern const char _libucd_hangul_jamo_l[][4];
 extern const char _libucd_hangul_jamo_v[][4];
 extern const char _libucd_hangul_jamo_t[][4];
 
+/* This structure is exactly 32 bytes long, nice and alignable. */
 struct _libucd_property_array {
 	int32_t ucd; /* Wasteful but fast (used in search) */
 	uint8_t general_category;
 	uint8_t script;
 	uint8_t numeric_value_num;
 	uint8_t numeric_value_den_exp; /* bit 7 = 1 if exponent */
-	uint64_t flags_block; /* Block index is high byte */
+	uint64_t flags_block; /* Block index is byte 6, byte 7 free */
 	int24 simple_uppercase;
 	int24 simple_lowercase;
 	int24 simple_titlecase;
 	uint8_t age; /* (major << 5) + minor */
-	uint8_t pad[2]; /* Do something useful here... */
-	unsigned arabic_joining_type :3;
-	unsigned arabic_joining_group :6;
+	uint8_t combining_class;
+	unsigned sentence_break :4;
+	unsigned grapheme_cluster_break :4;
+	unsigned word_break :3;
+	unsigned joining_type :3;
+	unsigned joining_group :6;
 	unsigned east_asian_width :3;
 	unsigned hangul_syllable_type :3;
-	unsigned line_break :5;
+	unsigned line_break :6;
 	unsigned numeric_type :2;
-	unsigned combining_class :5;
 	unsigned bidi_class :5;
+	unsigned /* unused */ :1;
 };
 
 #endif
@@ -90,16 +90,16 @@ enum unicode_east_asian_width {
 };
 
 enum unicode_grapheme_cluster_break {
-	UC_GCB_XX = 0, /* Other */
-	UC_GCB_CN, /* Control */
-	UC_GCB_CR, /* CR */
-	UC_GCB_EX, /* Extend */
-	UC_GCB_L, /* L */
-	UC_GCB_LF, /* LF */
-	UC_GCB_LV, /* LV */
-	UC_GCB_LVT, /* LVT */
-	UC_GCB_T, /* T */
-	UC_GCB_V, /* V */
+	UC_GCB_Other = 0, /* Other */
+	UC_GCB_Control,
+	UC_GCB_CR,
+	UC_GCB_Extend,
+	UC_GCB_L,
+	UC_GCB_LF,
+	UC_GCB_LV,
+	UC_GCB_LVT,
+	UC_GCB_T,
+	UC_GCB_V,
 };
 
 enum unicode_hangul_syllable_type {
@@ -110,66 +110,70 @@ enum unicode_hangul_syllable_type {
 	UC_HST_T, /* Trailing_Jamo */
 	UC_HST_V, /* Vowel_Jamo */
 };
-
-enum unicode_arabic_joining_group {
-	UC_AJC_None = 0,
-	UC_AJG_Ain,
-	UC_AJC_Alaph,
-	UC_AJC_Alef,
-	UC_AJC_Beh,
-	UC_AJC_Beth,
-	UC_AJC_Dal,
-	UC_AJC_Dalath_Rish,
-	UC_AJC_E,
-	UC_AJC_Fe,
-	UC_AJC_Feh,
-	UC_AJC_Final_Semkath,
-	UC_AJC_Gaf,
-	UC_AJC_Gamal,
-	UC_AJC_Hah,
-	UC_AJC_Hamza_On_Heh_Goal,
-	UC_AJC_He,
-	UC_AJC_Heh,
-	UC_AJC_Heh_Goal,
-	UC_AJC_Heth,
-	UC_AJC_Kaf,
-	UC_AJC_Kaph,
-	UC_AJC_Khaph,
-	UC_AJC_Knotted_Heh,
-	UC_AJC_Lam,
-	UC_AJC_Lamadh,
-	UC_AJC_Meem,
-	UC_AJC_Mim,
-	UC_AJC_Noon,
-	UC_AJC_Nun,
-	UC_AJC_Pe,
-	UC_AJC_Qaf,
-	UC_AJC_Qaph,
-	UC_AJC_Reh,
-	UC_AJC_Reversed_Pe,
-	UC_AJC_Sad,
-	UC_AJC_Sadhe,
-	UC_AJC_Seen,
-	UC_AJC_Semkath,
-	UC_AJC_Shin,
-	UC_AJC_Swash_Kaf,
-	UC_AJC_Syriac_Waw,
-	UC_AJC_Tah,
-	UC_AJC_Taw,
-	UC_AJC_Teh_Marbuta,
-	UC_AJC_Teth,
-	UC_AJC_Waw,
-	UC_AJC_Yeh,
-	UC_AJC_Yeh_Barree,
-	UC_AJC_Yeh_With_Tail,
-	UC_AJC_Yudh,
-	UC_AJC_Yudh_He,
-	UC_AJC_Zain,
-	UC_AJC_Zhain,
+enum unicode_joining_group {
+	UC_JG_No_Joining_Group = 0,
+	UC_JG_Ain,
+	UC_JG_Alaph,
+	UC_JG_Alef,
+	UC_JG_Beh,
+	UC_JG_Beth,
+	UC_JG_Dal,
+	UC_JG_Dalath_Rish,
+	UC_JG_E,
+	UC_JG_Fe,
+	UC_JG_Feh,
+	UC_JG_Final_Semkath,
+	UC_JG_Gaf,
+	UC_JG_Gamal,
+	UC_JG_Hah,
+	UC_JG_Hamza_On_Heh_Goal,
+	UC_JG_He,
+	UC_JG_Heh,
+	UC_JG_Heh_Goal,
+	UC_JG_Heth,
+	UC_JG_Kaf,
+	UC_JG_Kaph,
+	UC_JG_Khaph,
+	UC_JG_Knotted_Heh,
+	UC_JG_Lam,
+	UC_JG_Lamadh,
+	UC_JG_Meem,
+	UC_JG_Mim,
+	UC_JG_Noon,
+	UC_JG_Nun,
+	UC_JG_Pe,
+	UC_JG_Qaf,
+	UC_JG_Qaph,
+	UC_JG_Reh,
+	UC_JG_Reversed_Pe,
+	UC_JG_Sad,
+	UC_JG_Sadhe,
+	UC_JG_Seen,
+	UC_JG_Semkath,
+	UC_JG_Shin,
+	UC_JG_Swash_Kaf,
+	UC_JG_Syriac_Waw,
+	UC_JG_Tah,
+	UC_JG_Taw,
+	UC_JG_Teh_Marbuta,
+	UC_JG_Teth,
+	UC_JG_Waw,
+	UC_JG_Yeh,
+	UC_JG_Yeh_Barree,
+	UC_JG_Yeh_With_Tail,
+	UC_JG_Yudh,
+	UC_JG_Yudh_He,
+	UC_JG_Zain,
+	UC_JG_Zhain,
 };
 
-enum unicode_arabic_joining_type {
-	UC_AJT_None,
+enum unicode_joining_type {
+	UC_JT_U = 0,
+	UC_JT_R,
+	UC_JT_L,
+	UC_JT_D,
+	UC_JT_C,
+	UC_JT_T,
 };
 
 enum unicode_ternary {
@@ -179,35 +183,35 @@ enum unicode_ternary {
 };
 
 enum unicode_numeric_type {
-	UC_NT_None, /* Not numeric */
-	UC_NT_Nu, /* Numeric */
-	UC_NT_Di, /* Digit */
-	UC_NT_De, /* Decimal digit */
+	UC_NT_None = 0,
+	UC_NT_Numeric,
+	UC_NT_Digit,
+	UC_NT_Decimal,
 };
 
 enum unicode_sentence_break {
-	UC_SB_XX = 0,
-	UC_SB_AT,
-	UC_SB_CL,
-	UC_SB_FO,
-	UC_SB_LE,
-	UC_SB_LO,
-	UC_SB_NU,
-	UC_SB_SE,
-	UC_SB_SP,
-	UC_SB_ST,
-	UC_SB_UP,
+	UC_SB_Other = 0,
+	UC_SB_Sep,
+	UC_SB_Format,
+	UC_SB_Sp,
+	UC_SB_Lower,
+	UC_SB_Upper,
+	UC_SB_OLetter,
+	UC_SB_Numeric,
+	UC_SB_ATerm,
+	UC_SB_STerm,
+	UC_SB_Close,
 };
 
 enum unicode_word_break {
-	UC_WB_XX = 0,
-	UC_WB_EX,
-	UC_WB_FO,
-	UC_WB_KA,
-	UC_WB_LE,
-	UC_WB_ML,
-	UC_WB_MN,
-	UC_WB_NU,
+	UC_WB_Other = 0,
+	UC_WB_Format,
+	UC_WB_Katakana,
+	UC_WB_ALetter,
+	UC_WB_MidLetter,
+	UC_WB_MidNum,
+	UC_WB_Numeric,
+	UC_WB_ExtendNumLet,
 };
 
 enum unicode_line_break {
@@ -250,8 +254,7 @@ enum unicode_line_break {
 };
 
 enum unicode_general_category {
-	UC_GC_XX = 0,
-	UC_GC_Cn,
+	UC_GC_Cn = 0,
 	UC_GC_Cc,
 	UC_GC_Cf,
 	UC_GC_Co,
@@ -278,11 +281,225 @@ enum unicode_general_category {
 	UC_GC_Sk,
 	UC_GC_Sm,
 	UC_GC_So,
-	UC_GC_Zl,
 	UC_GC_Sp,
+	UC_GC_Zl,
+	UC_GC_Zp,
 	UC_GC_Zs,
 };
 
+enum unicode_script {
+	UC_SCR_Common = 0,
+	UC_SCR_Latin,
+	UC_SCR_Greek,
+	UC_SCR_Cyrillic,
+	UC_SCR_Armenian,
+	UC_SCR_Hebrew,
+	UC_SCR_Arabic,
+	UC_SCR_Syriac,
+	UC_SCR_Thaana,
+	UC_SCR_Devanagari,
+	UC_SCR_Bengali,
+	UC_SCR_Gurmukhi,
+	UC_SCR_Gujarati,
+	UC_SCR_Oriya,
+	UC_SCR_Tamil,
+	UC_SCR_Telugu,
+	UC_SCR_Kannada,
+	UC_SCR_Malayalam,
+	UC_SCR_Sinhala,
+	UC_SCR_Thai,
+	UC_SCR_Lao,
+	UC_SCR_Tibetan,
+	UC_SCR_Myanmar,
+	UC_SCR_Georgian,
+	UC_SCR_Hangul,
+	UC_SCR_Ethiopic,
+	UC_SCR_Cherokee,
+	UC_SCR_Canadian_Aboriginal,
+	UC_SCR_Ogham,
+	UC_SCR_Runic,
+	UC_SCR_Khmer,
+	UC_SCR_Mongolian,
+	UC_SCR_Hiragana,
+	UC_SCR_Katakana,
+	UC_SCR_Bopomofo,
+	UC_SCR_Han,
+	UC_SCR_Yi,
+	UC_SCR_Old_Italic,
+	UC_SCR_Gothic,
+	UC_SCR_Deseret,
+	UC_SCR_Inherited,
+	UC_SCR_Tagalog,
+	UC_SCR_Hanunoo,
+	UC_SCR_Buhid,
+	UC_SCR_Tagbanwa,
+	UC_SCR_Limbu,
+	UC_SCR_Tai_Le,
+	UC_SCR_Linear_B,
+	UC_SCR_Ugaritic,
+	UC_SCR_Shavian,
+	UC_SCR_Osmanya,
+	UC_SCR_Cypriot,
+	UC_SCR_Braille,
+	UC_SCR_Buginese,
+	UC_SCR_Coptic,
+	UC_SCR_New_Tai_Lue,
+	UC_SCR_Glagolitic,
+	UC_SCR_Tifinagh,
+	UC_SCR_Syloti_Nagri,
+	UC_SCR_Old_Persian,
+	UC_SCR_Kharoshthi,
+};
+
+enum unicode_block {
+	UC_BLK_No_Block = 0,
+	UC_BLK_Basic_Latin,
+	UC_BLK_Latin_1_Supplement,
+	UC_BLK_Latin_Extended_A,
+	UC_BLK_Latin_Extended_B,
+	UC_BLK_IPA_Extensions,
+	UC_BLK_Spacing_Modifier_Letters,
+	UC_BLK_Combining_Diacritical_Marks,
+	UC_BLK_Greek_and_Coptic,
+	UC_BLK_Cyrillic,
+	UC_BLK_Cyrillic_Supplement,
+	UC_BLK_Armenian,
+	UC_BLK_Hebrew,
+	UC_BLK_Arabic,
+	UC_BLK_Syriac,
+	UC_BLK_Arabic_Supplement,
+	UC_BLK_Thaana,
+	UC_BLK_Devanagari,
+	UC_BLK_Bengali,
+	UC_BLK_Gurmukhi,
+	UC_BLK_Gujarati,
+	UC_BLK_Oriya,
+	UC_BLK_Tamil,
+	UC_BLK_Telugu,
+	UC_BLK_Kannada,
+	UC_BLK_Malayalam,
+	UC_BLK_Sinhala,
+	UC_BLK_Thai,
+	UC_BLK_Lao,
+	UC_BLK_Tibetan,
+	UC_BLK_Myanmar,
+	UC_BLK_Georgian,
+	UC_BLK_Hangul_Jamo,
+	UC_BLK_Ethiopic,
+	UC_BLK_Ethiopic_Supplement,
+	UC_BLK_Cherokee,
+	UC_BLK_Unified_Canadian_Aboriginal_Syllabics,
+	UC_BLK_Ogham,
+	UC_BLK_Runic,
+	UC_BLK_Tagalog,
+	UC_BLK_Hanunoo,
+	UC_BLK_Buhid,
+	UC_BLK_Tagbanwa,
+	UC_BLK_Khmer,
+	UC_BLK_Mongolian,
+	UC_BLK_Limbu,
+	UC_BLK_Tai_Le,
+	UC_BLK_New_Tai_Lue,
+	UC_BLK_Khmer_Symbols,
+	UC_BLK_Buginese,
+	UC_BLK_Phonetic_Extensions,
+	UC_BLK_Phonetic_Extensions_Supplement,
+	UC_BLK_Combining_Diacritical_Marks_Supplement,
+	UC_BLK_Latin_Extended_Additional,
+	UC_BLK_Greek_Extended,
+	UC_BLK_General_Punctuation,
+	UC_BLK_Superscripts_and_Subscripts,
+	UC_BLK_Currency_Symbols,
+	UC_BLK_Combining_Diacritical_Marks_for_Symbols,
+	UC_BLK_Letterlike_Symbols,
+	UC_BLK_Number_Forms,
+	UC_BLK_Arrows,
+	UC_BLK_Mathematical_Operators,
+	UC_BLK_Miscellaneous_Technical,
+	UC_BLK_Control_Pictures,
+	UC_BLK_Optical_Character_Recognition,
+	UC_BLK_Enclosed_Alphanumerics,
+	UC_BLK_Box_Drawing,
+	UC_BLK_Block_Elements,
+	UC_BLK_Geometric_Shapes,
+	UC_BLK_Miscellaneous_Symbols,
+	UC_BLK_Dingbats,
+	UC_BLK_Miscellaneous_Mathematical_Symbols_A,
+	UC_BLK_Supplemental_Arrows_A,
+	UC_BLK_Braille_Patterns,
+	UC_BLK_Supplemental_Arrows_B,
+	UC_BLK_Miscellaneous_Mathematical_Symbols_B,
+	UC_BLK_Supplemental_Mathematical_Operators,
+	UC_BLK_Miscellaneous_Symbols_and_Arrows,
+	UC_BLK_Glagolitic,
+	UC_BLK_Coptic,
+	UC_BLK_Georgian_Supplement,
+	UC_BLK_Tifinagh,
+	UC_BLK_Ethiopic_Extended,
+	UC_BLK_Supplemental_Punctuation,
+	UC_BLK_CJK_Radicals_Supplement,
+	UC_BLK_Kangxi_Radicals,
+	UC_BLK_Ideographic_Description_Characters,
+	UC_BLK_CJK_Symbols_and_Punctuation,
+	UC_BLK_Hiragana,
+	UC_BLK_Katakana,
+	UC_BLK_Bopomofo,
+	UC_BLK_Hangul_Compatibility_Jamo,
+	UC_BLK_Kanbun,
+	UC_BLK_Bopomofo_Extended,
+	UC_BLK_CJK_Strokes,
+	UC_BLK_Katakana_Phonetic_Extensions,
+	UC_BLK_Enclosed_CJK_Letters_and_Months,
+	UC_BLK_CJK_Compatibility,
+	UC_BLK_CJK_Unified_Ideographs_Extension_A,
+	UC_BLK_Yijing_Hexagram_Symbols,
+	UC_BLK_CJK_Unified_Ideographs,
+	UC_BLK_Yi_Syllables,
+	UC_BLK_Yi_Radicals,
+	UC_BLK_Modifier_Tone_Letters,
+	UC_BLK_Syloti_Nagri,
+	UC_BLK_Hangul_Syllables,
+	UC_BLK_High_Surrogates,
+	UC_BLK_High_Private_Use_Surrogates,
+	UC_BLK_Low_Surrogates,
+	UC_BLK_Private_Use_Area,
+	UC_BLK_CJK_Compatibility_Ideographs,
+	UC_BLK_Alphabetic_Presentation_Forms,
+	UC_BLK_Arabic_Presentation_Forms_A,
+	UC_BLK_Variation_Selectors,
+	UC_BLK_Vertical_Forms,
+	UC_BLK_Combining_Half_Marks,
+	UC_BLK_CJK_Compatibility_Forms,
+	UC_BLK_Small_Form_Variants,
+	UC_BLK_Arabic_Presentation_Forms_B,
+	UC_BLK_Halfwidth_and_Fullwidth_Forms,
+	UC_BLK_Specials,
+	UC_BLK_Linear_B_Syllabary,
+	UC_BLK_Linear_B_Ideograms,
+	UC_BLK_Aegean_Numbers,
+	UC_BLK_Ancient_Greek_Numbers,
+	UC_BLK_Old_Italic,
+	UC_BLK_Gothic,
+	UC_BLK_Ugaritic,
+	UC_BLK_Old_Persian,
+	UC_BLK_Deseret,
+	UC_BLK_Shavian,
+	UC_BLK_Osmanya,
+	UC_BLK_Cypriot_Syllabary,
+	UC_BLK_Kharoshthi,
+	UC_BLK_Byzantine_Musical_Symbols,
+	UC_BLK_Musical_Symbols,
+	UC_BLK_Ancient_Greek_Musical_Notation,
+	UC_BLK_Tai_Xuan_Jing_Symbols,
+	UC_BLK_Mathematical_Alphanumeric_Symbols,
+	UC_BLK_CJK_Unified_Ideographs_Extension_B,
+	UC_BLK_CJK_Compatibility_Ideographs_Supplement,
+	UC_BLK_Tags,
+	UC_BLK_Variation_Selectors_Supplement,
+	UC_BLK_Supplementary_Private_Use_Area_A,
+	UC_BLK_Supplementary_Private_Use_Area_B,
+};
+
 #define UC_FLAG(x) (UINT64_C(1) << (x))
 
 #define UC_FL_COMPOSITION_EXCLUSION UC_FLAG(0)
@@ -326,7 +543,6 @@ struct unicode_character_data {
 	uint64_t fl; /* Flags */
 	const char *name;
 	const char *bidi_mirroring_glyph;
-	const char *block;
 	const char *uppercase_mapping;
 	const char *lowercase_mapping;
 	const char *titlecase_mapping;
@@ -339,15 +555,19 @@ struct unicode_character_data {
 	uint8_t numeric_value_den;
 	uint8_t numeric_value_exp;
 	enum unicode_general_category general_category;
+	enum unicode_block block;
 	enum unicode_script script;
-	enum unicode_arabic_joining_type arabic_joining_type;
-	enum unicode_arabic_joining_group arabic_joining_group;
+	enum unicode_joining_type joining_type;
+	enum unicode_joining_group joining_group;
 	enum unicode_east_asian_width east_asian_width;
 	enum unicode_hangul_syllable_type hangul_syllable_type;
-	enum unicode_line_break line_break;
 	enum unicode_numeric_type numeric_type;
 	enum unicode_combining_class combining_class;
 	enum unicode_bidi_class bidi_class;
+	enum unicode_grapheme_cluster_break grapheme_cluster_break;
+	enum unicode_sentence_break sentence_break;
+	enum unicode_word_break word_break;
+	enum unicode_line_break line_break;
 };
 
 #endif /* UCD_H */