Add enumerations for additional properties; actually generate property array

author: H. Peter Anvin <hpa@smyrno.hos.anvin.org> 2005-11-29 23:25:06 -0800
committer: H. Peter Anvin <hpa@smyrno.hos.anvin.org> 2005-11-29 23:25:06 -0800
commit: 86a84dd7743ab0d8140b6a07f81e6b6cf7953b46 (patch)
tree: 4dc5ff70d5a3495c8a748633d956cde40b9081ce
parent: c76509f7077ced109ed221248c29495d6cbd40e2 (diff)
download: libucd-86a84dd7743ab0d8140b6a07f81e6b6cf7953b46.tar.gz
4 files changed, 383 insertions, 149 deletions
diff --git a/Makefile b/Makefile
index 73de55a..4adca95 100644
--- a/Makefile
+++ b/Makefile
@@ -12,7 +12,9 @@ HOST_LIBS    =
 # These are the files produced by convert_ucd.pl
 #
 CVT_FILES = gen/jamo.c gen/nameslist.c gen/nametoucs.keys \
-	gen/ucstoname.keys gen/propdump.txt
+	gen/ucstoname.keys gen/proparray.c
+
+# -----------------------------------------------------------------------
 
 .SUFFIXES: .c .o .lo .s .ls .i .li .cc .h
 .c.o:
@@ -28,7 +30,18 @@ CVT_FILES = gen/jamo.c gen/nameslist.c gen/nametoucs.keys \
 .c.li:
 	$(HOST_CC) $(HOST_CFLAGS) -E -o $@ $<
 
-all : gen/done perfect/perfect gen/nametoucs_hash.o gen/ucstoname_hash.o
+# -----------------------------------------------------------------------
+
+all : perfect/perfect gen/nametoucs_hash.o gen/ucstoname_hash.o \
+	proparray.o
+
+
+clean:
+	rm -rf gen
+	rm -f *.o *.a *.so *.so.*
+	$(MAKE) -C perfect clean
+
+# -----------------------------------------------------------------------
 
 $(CVT_FILES) : gen/done
 
@@ -54,10 +67,6 @@ gen/ucstoname_hash.c: gen/ucstoname.keys perfect/perfect
 gen/ucstoname_hash.h: gen/ucstoname_hash.c
 	: Generated by side effect
 
-clean:
-	rm -rf gen
-	rm -f *.o *.a *.so *.so.*
-	$(MAKE) -C perfect clean
-
-
+# -----------------------------------------------------------------------
 
+proparray.o: proparray.c ucd.h libucd_int.h gen/proparray.c
diff --git a/convert_ucd.pl b/convert_ucd.pl
index 2da914f..5d48cfe 100755
--- a/convert_ucd.pl
+++ b/convert_ucd.pl
@@ -4,8 +4,6 @@
 # into data for libucd.
 #
 
-use POSIX;
-
 #
 # Internally this file uses a hash with the UCS value as key, and
 # as data another hash from property name to value.
@@ -35,6 +33,7 @@ sub read_separated_file($$$) {
     my $line, @fields, $c0, $c1, $c;
     my $was_first = 0;
 
+    print STDERR "Reading $filename\n";
     open($fh, '<', $filename) or return 0;
     while ( defined($line = <$fh>) ) {
 	chomp $line;
@@ -116,6 +115,7 @@ sub read_boolean_file($) {
     my $fh;
     my $line, @fields, $c0, $c1, $c;
 
+    print STDERR "Reading $filename\n";
     open($fh, '<', $filename) or return 0;
     while ( defined($line = <$fh>) ) {
 	chomp $line;
@@ -141,27 +141,6 @@ sub read_boolean_file($) {
     return 1;
 }
 
-# UCD numeric values are given as 8-significant figures floating-point
-# numbers, but in reality they are all fractions.  This converts a
-# floating-point number to a numerator and denominator with just about
-# enough fuzz.
-#
-# Note: the denominator will always be positive, and will always be 1
-# if the number is an integer.
-sub make_fraction($) {
-    my ($v) = @_;
-    my $n, $d, $minus;
-
-    return [0, 1] if ( $v == 0 );
-
-    $d = 1;
-    while ( 1 ) {
-	$n = floor($v*$d + 0.5);
-	return [$n, $d] if ( abs($n/$d-$v)/$v < 1e-7 );
-	$d++;
-    }
-}
-
 sub make_jamo_string($) {
     my ($s) = @_;
     my $i, $c;
@@ -191,6 +170,7 @@ sub make_jamo_tables() {
     # None of the syllables exceed 4 bytes, so let's not waste
     # pointer space that might have to be relocated...
 
+    print STDERR "Writing gen/jamo.c\n";
     open($fh, '>', 'gen/jamo.c') or die "$0 cannot create gen/jamo.c";
     print $fh "#include \"libucd_int.h\"\n\n";
 
@@ -226,6 +206,7 @@ sub make_names_list() {
     my $fh;
     my $col;
 
+    print STDERR "Writing gen/nameslist.c\n";
     open($fh, '>', 'gen/nameslist.c') or die "$0: Cannot create gen/nameslist.c";
 
     print $fh "#include \"libucd_int.h\"\n\n";
@@ -297,6 +278,7 @@ sub make_name_keyfile()
     my $fh;
     my $k;
 
+    print STDERR "Writing gen/nametoucs.keys\n";
     open($fh, '>', 'gen/nametoucs.keys')
 	or die "$0: cannot write gen/nametoucs.keys\n";
 
@@ -317,6 +299,7 @@ sub make_named_ucs_keyfile()
     my $fh;
     my $k;
 
+    print STDERR "Writing gen/ucstoname.keys\n";
     open($fh, '>', 'gen/ucstoname.keys')
 	or die "$0: cannot write gen/ucstoname.keys\n";
 
@@ -336,6 +319,7 @@ sub dump_prop_list()
 {
     my $fh, $c;
 
+    print STDERR "Writing gen/propdump.txt\n";
     open($fh, '>', 'gen/propdump.txt')
 	or die "$0: cannot write gen/propdump.txt\n";
     binmode $fh, ':utf8';
@@ -405,6 +389,7 @@ sub make_properties_array()
 			 'Terminal_Punctuation', 'Unified_Ideograph', 'Variation_Selector',
 			 'White_Space', 'Bidi_Mirrored');
 
+    print STDERR "Writing gen/proparray.c\n";
     open($fh, '>', 'gen/proparray.c') or die;
     binmode $fh, ':utf8';
 
@@ -454,9 +439,9 @@ sub make_properties_array()
 		$mine .= "\t\tUC_FL_\U$bp\E |\n";
 	    }
 	}
-	my $block = $$cp{'Block'};
+	my $block = $$cp{'Block'} || 'No_Block';
 	$block =~ tr/ .-/___/;
-	$mine .= "\t\t(UC_BLK_$block << 48),\n";
+	$mine .= "\t\t((uint64_t)UC_BLK_$block << 48),\n";
 	
 	# Simple case mappings
 	my $sum = ($$cp{'Simple_Uppercase_Mapping'} || $c) - $c;
@@ -472,19 +457,36 @@ sub make_properties_array()
 	my (@sage) = split(/\./, $age);
 	$mine .= sprintf("\t\t(%d << 5) + %d, /* $age */\n", $sage[0], $sage[1]);
 
-	# Padding
-	$mine .= "\t\t{ 0, 0, },\n";
+	# Canonical Combining Class
+	my $ccc = $$cp{'Canonical_Combining_Class'} || 'NR';
+	if ( $ccc =~ /^[0-9]+$/ ) {
+	    $mine .= "\t\t$ccc,\n"; # Numeric CCC
+	} else {
+	    $mine .= "\t\tUC_CCC_$ccc,\n";
+	}
+
+	# Sentence Break
+	my $sb = $$cp{'Sentence_Break'} || 'Other';
+	$mine .= "\t\tUC_SB_$sb,\n";
+
+	# Grapheme Cluster Break
+	my $gcb = $$cp{'Grapheme_Cluster_Break'} || 'Other';
+	$mine .= "\t\tUC_GCB_$gcb,\n";
+
+	# Word Break
+	my $wb = $$cp{'Word_Break'} || 'Other';
+	$mine .= "\t\tUC_WB_$wb,\n";
 
 	# Arabic Joining Type
-	my $ajt = $$cp{'Arabic_Joining_Type'} ||
+	my $ajt = $$cp{'Joining_Type'} ||
 	    ($gc eq 'Mn' || $gc eq 'Me' || $gc eq 'Cf') ? 'T' : 'U';
-	$mine .= "\t\tUC_AJT_$ajt,\n";
+	$mine .= "\t\tUC_JT_$ajt,\n";
 
 	# Arabic Joining Group
-	my $ajg = $$cp{'Arabic_Joining_Group'} || 'No_Joining_Group';
+	my $ajg = $$cp{'Joining_Group'} || 'No_Joining_Group';
 	$ajg =~ tr/ /_/;
 	$ajg =~ s/([A-Z])([A-Z]+)/$1\L$2\E/g;
-	$mine .= "\t\tUC_AJG_$ajg,\n";
+	$mine .= "\t\tUC_JG_$ajg,\n";
 
 	# East Asian Width
 	my $eaw = $$cp{'East_Asian_Width'} || 'N';
@@ -502,13 +504,9 @@ sub make_properties_array()
 	my $nt = $$cp{'Numeric_Type'} || 'None';
 	$mine .= "\t\tUC_NT_$nt,\n";
 
-	# Canonical Combining Class
-	my $ccc = $$cp{'Canonical_Combining_Class'} || 'NR';
-	$mine .= "\t\tUC_CCC_$ccc,\n";
-
 	# Bidi Class
 	my $bc = $$cp{'Bidi_Class'} || 'L';
-	$mine .= "\t\tUC_BC_$bc,\n";
+	$mine .= "\t\tUC_BIDI_$bc,\n";
 
 	# Additional properties...
 	$mine .= "\t},\n";
@@ -553,6 +551,9 @@ read_separated_file('ucd/Scripts.txt', ['cScript'], ['Common']);
 read_separated_file('ucd/SpecialCasing.txt', ['sUppercase_Mapping', 'sLowercase_Mapping',
 					  'sTitlecase_Mapping', 'mSpecial_Case_Condition'], []);
 read_separated_file('ucd/Jamo.txt', ['mJamo_Short_Name'], []);
+read_separated_file('ucd/auxilliary/GraphemeBreakProperty.txt', ['eGrapheme_Cluster_Break'], []);
+read_separated_file('ucd/auxilliary/SentenceBreakProperty.txt', ['eSentence_Break'], []);
+read_separated_file('ucd/auxilliary/WordBreakProperty.txt', ['eWord_Break'], []);
 read_boolean_file('ucd/DerivedCoreProperties.txt');
 read_boolean_file('ucd/PropList.txt');
 
@@ -564,4 +565,4 @@ make_names_list();
 make_name_keyfile();
 make_named_ucs_keyfile();
 make_properties_array();
-dump_prop_list();
+# dump_prop_list();
diff --git a/libucd_int.h b/libucd_int.h
index 8ab2f06..c23b8ed 100644
--- a/libucd_int.h
+++ b/libucd_int.h
@@ -15,26 +15,30 @@ extern const char _libucd_hangul_jamo_l[][4];
 extern const char _libucd_hangul_jamo_v[][4];
 extern const char _libucd_hangul_jamo_t[][4];
 
+/* This structure is exactly 32 bytes long, nice and alignable. */
 struct _libucd_property_array {
   int32_t  ucd;			/* Wasteful but fast (used in search) */
   uint8_t  general_category;
   uint8_t  script;
   uint8_t  numeric_value_num;
   uint8_t  numeric_value_den_exp; /* bit 7 = 1 if exponent */
-  uint64_t flags_block;		/* Block index is high byte */
+  uint64_t flags_block;		/* Block index is byte 6, byte 7 free */
   int24    simple_uppercase;
   int24    simple_lowercase;
   int24    simple_titlecase;
   uint8_t  age;			/* (major << 5) + minor */
-  uint8_t  pad[2];		/* Do something useful here... */
-  unsigned arabic_joining_type  :3;
-  unsigned arabic_joining_group :6;
+  uint8_t  combining_class;
+  unsigned sentence_break	:4;
+  unsigned grapheme_cluster_break :4;
+  unsigned word_break		:3;
+  unsigned joining_type  	:3;
+  unsigned joining_group 	:6;
   unsigned east_asian_width     :3;
   unsigned hangul_syllable_type :3;
-  unsigned line_break           :5;
+  unsigned line_break           :6;
   unsigned numeric_type         :2;
-  unsigned combining_class      :5;
   unsigned bidi_class           :5;
+  unsigned /* unused */         :1;
 };
 
 #endif
diff --git a/ucd.h b/ucd.h
index 4c0b898..6c51d72 100644
--- a/ucd.h
+++ b/ucd.h
@@ -90,16 +90,16 @@ enum unicode_east_asian_width {
 };
 
 enum unicode_grapheme_cluster_break {
-  UC_GCB_XX = 0,		/* Other */
-  UC_GCB_CN,			/* Control */
-  UC_GCB_CR,			/* CR */
-  UC_GCB_EX,			/* Extend */
-  UC_GCB_L,			/* L */
-  UC_GCB_LF,			/* LF */
-  UC_GCB_LV,			/* LV */
-  UC_GCB_LVT,			/* LVT */
-  UC_GCB_T,			/* T */
-  UC_GCB_V,			/* V */
+  UC_GCB_Other = 0,		/* Other */
+  UC_GCB_Control,
+  UC_GCB_CR,
+  UC_GCB_Extend,
+  UC_GCB_L,
+  UC_GCB_LF,
+  UC_GCB_LV,
+  UC_GCB_LVT,
+  UC_GCB_T,
+  UC_GCB_V,
 };
 
 enum unicode_hangul_syllable_type {
@@ -110,66 +110,70 @@ enum unicode_hangul_syllable_type {
   UC_HST_T,			/* Trailing_Jamo */
   UC_HST_V,			/* Vowel_Jamo */
 };
-
-enum unicode_arabic_joining_group {
-  UC_AJC_None = 0,
-  UC_AJG_Ain,
-  UC_AJC_Alaph,
-  UC_AJC_Alef,
-  UC_AJC_Beh,
-  UC_AJC_Beth,
-  UC_AJC_Dal,
-  UC_AJC_Dalath_Rish,
-  UC_AJC_E,
-  UC_AJC_Fe,
-  UC_AJC_Feh,
-  UC_AJC_Final_Semkath,
-  UC_AJC_Gaf,
-  UC_AJC_Gamal,
-  UC_AJC_Hah,
-  UC_AJC_Hamza_On_Heh_Goal,
-  UC_AJC_He,
-  UC_AJC_Heh,
-  UC_AJC_Heh_Goal,
-  UC_AJC_Heth,
-  UC_AJC_Kaf,
-  UC_AJC_Kaph,
-  UC_AJC_Khaph,
-  UC_AJC_Knotted_Heh,
-  UC_AJC_Lam,
-  UC_AJC_Lamadh,
-  UC_AJC_Meem,
-  UC_AJC_Mim,
-  UC_AJC_Noon,
-  UC_AJC_Nun,
-  UC_AJC_Pe,
-  UC_AJC_Qaf,
-  UC_AJC_Qaph,
-  UC_AJC_Reh,
-  UC_AJC_Reversed_Pe,
-  UC_AJC_Sad,
-  UC_AJC_Sadhe,
-  UC_AJC_Seen,
-  UC_AJC_Semkath,
-  UC_AJC_Shin,
-  UC_AJC_Swash_Kaf,
-  UC_AJC_Syriac_Waw,
-  UC_AJC_Tah,
-  UC_AJC_Taw,
-  UC_AJC_Teh_Marbuta,
-  UC_AJC_Teth,
-  UC_AJC_Waw,
-  UC_AJC_Yeh,
-  UC_AJC_Yeh_Barree,
-  UC_AJC_Yeh_With_Tail,
-  UC_AJC_Yudh,
-  UC_AJC_Yudh_He,
-  UC_AJC_Zain,
-  UC_AJC_Zhain,
+enum unicode_joining_group {
+  UC_JG_No_Joining_Group = 0,
+  UC_JG_Ain,
+  UC_JG_Alaph,
+  UC_JG_Alef,
+  UC_JG_Beh,
+  UC_JG_Beth,
+  UC_JG_Dal,
+  UC_JG_Dalath_Rish,
+  UC_JG_E,
+  UC_JG_Fe,
+  UC_JG_Feh,
+  UC_JG_Final_Semkath,
+  UC_JG_Gaf,
+  UC_JG_Gamal,
+  UC_JG_Hah,
+  UC_JG_Hamza_On_Heh_Goal,
+  UC_JG_He,
+  UC_JG_Heh,
+  UC_JG_Heh_Goal,
+  UC_JG_Heth,
+  UC_JG_Kaf,
+  UC_JG_Kaph,
+  UC_JG_Khaph,
+  UC_JG_Knotted_Heh,
+  UC_JG_Lam,
+  UC_JG_Lamadh,
+  UC_JG_Meem,
+  UC_JG_Mim,
+  UC_JG_Noon,
+  UC_JG_Nun,
+  UC_JG_Pe,
+  UC_JG_Qaf,
+  UC_JG_Qaph,
+  UC_JG_Reh,
+  UC_JG_Reversed_Pe,
+  UC_JG_Sad,
+  UC_JG_Sadhe,
+  UC_JG_Seen,
+  UC_JG_Semkath,
+  UC_JG_Shin,
+  UC_JG_Swash_Kaf,
+  UC_JG_Syriac_Waw,
+  UC_JG_Tah,
+  UC_JG_Taw,
+  UC_JG_Teh_Marbuta,
+  UC_JG_Teth,
+  UC_JG_Waw,
+  UC_JG_Yeh,
+  UC_JG_Yeh_Barree,
+  UC_JG_Yeh_With_Tail,
+  UC_JG_Yudh,
+  UC_JG_Yudh_He,
+  UC_JG_Zain,
+  UC_JG_Zhain,
 };
 
-enum unicode_arabic_joining_type {
-  UC_AJT_None,
+enum unicode_joining_type {
+  UC_JT_U = 0,			/* Non_Joining */
+  UC_JT_R,			/* Right_Joining */
+  UC_JT_L,			/* Left_Joining */
+  UC_JT_D,			/* Dual_Joining */
+  UC_JT_C,			/* Join_Causing */
+  UC_JT_T,			/* Transparent */
 };
 
 enum unicode_ternary {
@@ -179,35 +183,35 @@ enum unicode_ternary {
 };
 
 enum unicode_numeric_type {
-  UC_NT_None,			/* Not numeric */
-  UC_NT_Nu,			/* Numeric */
-  UC_NT_Di,			/* Digit */
-  UC_NT_De,			/* Decimal digit */
+  UC_NT_None = 0,
+  UC_NT_Numeric,
+  UC_NT_Digit,
+  UC_NT_Decimal,
 };
 
 enum unicode_sentence_break {
-  UC_SB_XX = 0,
-  UC_SB_AT,
-  UC_SB_CL,
-  UC_SB_FO,
-  UC_SB_LE,
-  UC_SB_LO,
-  UC_SB_NU,
-  UC_SB_SE,
-  UC_SB_SP,
-  UC_SB_ST,
-  UC_SB_UP,
+  UC_SB_Other = 0,
+  UC_SB_Sep,
+  UC_SB_Format,
+  UC_SB_Sp,
+  UC_SB_Lower,
+  UC_SB_Upper,
+  UC_SB_OLetter,
+  UC_SB_Numeric,
+  UC_SB_ATerm,
+  UC_SB_STerm,
+  UC_SB_Close,
 };
 
 enum unicode_word_break {
-  UC_WB_XX = 0,
-  UC_WB_EX,
-  UC_WB_FO,
-  UC_WB_KA,
-  UC_WB_LE,
-  UC_WB_ML,
-  UC_WB_MN,
-  UC_WB_NU,
+  UC_WB_Other = 0,
+  UC_WB_Format,
+  UC_WB_Katakana,
+  UC_WB_ALetter,
+  UC_WB_MidLetter,
+  UC_WB_MidNum,
+  UC_WB_Numeric,
+  UC_WB_ExtendNumLet,
 };
 
 enum unicode_line_break {
@@ -250,8 +254,7 @@ enum unicode_line_break {
 };
 
 enum unicode_general_category {
-  UC_GC_XX = 0,
-  UC_GC_Cn,
+  UC_GC_Cn = 0,
   UC_GC_Cc,
   UC_GC_Cf,
   UC_GC_Co,
@@ -278,11 +281,225 @@ enum unicode_general_category {
   UC_GC_Sk,
   UC_GC_Sm,
   UC_GC_So,
-  UC_GC_Zl,
   UC_GC_Sp,
+  UC_GC_Zl,
+  UC_GC_Zp,
   UC_GC_Zs,
 };
 
+enum unicode_script {
+  UC_SCR_Common = 0,
+  UC_SCR_Latin,
+  UC_SCR_Greek,
+  UC_SCR_Cyrillic,
+  UC_SCR_Armenian,
+  UC_SCR_Hebrew,
+  UC_SCR_Arabic,
+  UC_SCR_Syriac,
+  UC_SCR_Thaana,
+  UC_SCR_Devanagari,
+  UC_SCR_Bengali,
+  UC_SCR_Gurmukhi,
+  UC_SCR_Gujarati,
+  UC_SCR_Oriya,
+  UC_SCR_Tamil,
+  UC_SCR_Telugu,
+  UC_SCR_Kannada,
+  UC_SCR_Malayalam,
+  UC_SCR_Sinhala,
+  UC_SCR_Thai,
+  UC_SCR_Lao,
+  UC_SCR_Tibetan,
+  UC_SCR_Myanmar,
+  UC_SCR_Georgian,
+  UC_SCR_Hangul,
+  UC_SCR_Ethiopic,
+  UC_SCR_Cherokee,
+  UC_SCR_Canadian_Aboriginal,
+  UC_SCR_Ogham,
+  UC_SCR_Runic,
+  UC_SCR_Khmer,
+  UC_SCR_Mongolian,
+  UC_SCR_Hiragana,
+  UC_SCR_Katakana,
+  UC_SCR_Bopomofo,
+  UC_SCR_Han,
+  UC_SCR_Yi,
+  UC_SCR_Old_Italic,
+  UC_SCR_Gothic,
+  UC_SCR_Deseret,
+  UC_SCR_Inherited,
+  UC_SCR_Tagalog,
+  UC_SCR_Hanunoo,
+  UC_SCR_Buhid,
+  UC_SCR_Tagbanwa,
+  UC_SCR_Limbu,
+  UC_SCR_Tai_Le,
+  UC_SCR_Linear_B,
+  UC_SCR_Ugaritic,
+  UC_SCR_Shavian,
+  UC_SCR_Osmanya,
+  UC_SCR_Cypriot,
+  UC_SCR_Braille,
+  UC_SCR_Buginese,
+  UC_SCR_Coptic,
+  UC_SCR_New_Tai_Lue,
+  UC_SCR_Glagolitic,
+  UC_SCR_Tifinagh,
+  UC_SCR_Syloti_Nagri,
+  UC_SCR_Old_Persian,
+  UC_SCR_Kharoshthi,
+};
+
+enum unicode_block {
+  UC_BLK_No_Block = 0,
+  UC_BLK_Basic_Latin,
+  UC_BLK_Latin_1_Supplement,
+  UC_BLK_Latin_Extended_A,
+  UC_BLK_Latin_Extended_B,
+  UC_BLK_IPA_Extensions,
+  UC_BLK_Spacing_Modifier_Letters,
+  UC_BLK_Combining_Diacritical_Marks,
+  UC_BLK_Greek_and_Coptic,
+  UC_BLK_Cyrillic,
+  UC_BLK_Cyrillic_Supplement,
+  UC_BLK_Armenian,
+  UC_BLK_Hebrew,
+  UC_BLK_Arabic,
+  UC_BLK_Syriac,
+  UC_BLK_Arabic_Supplement,
+  UC_BLK_Thaana,
+  UC_BLK_Devanagari,
+  UC_BLK_Bengali,
+  UC_BLK_Gurmukhi,
+  UC_BLK_Gujarati,
+  UC_BLK_Oriya,
+  UC_BLK_Tamil,
+  UC_BLK_Telugu,
+  UC_BLK_Kannada,
+  UC_BLK_Malayalam,
+  UC_BLK_Sinhala,
+  UC_BLK_Thai,
+  UC_BLK_Lao,
+  UC_BLK_Tibetan,
+  UC_BLK_Myanmar,
+  UC_BLK_Georgian,
+  UC_BLK_Hangul_Jamo,
+  UC_BLK_Ethiopic,
+  UC_BLK_Ethiopic_Supplement,
+  UC_BLK_Cherokee,
+  UC_BLK_Unified_Canadian_Aboriginal_Syllabics,
+  UC_BLK_Ogham,
+  UC_BLK_Runic,
+  UC_BLK_Tagalog,
+  UC_BLK_Hanunoo,
+  UC_BLK_Buhid,
+  UC_BLK_Tagbanwa,
+  UC_BLK_Khmer,
+  UC_BLK_Mongolian,
+  UC_BLK_Limbu,
+  UC_BLK_Tai_Le,
+  UC_BLK_New_Tai_Lue,
+  UC_BLK_Khmer_Symbols,
+  UC_BLK_Buginese,
+  UC_BLK_Phonetic_Extensions,
+  UC_BLK_Phonetic_Extensions_Supplement,
+  UC_BLK_Combining_Diacritical_Marks_Supplement,
+  UC_BLK_Latin_Extended_Additional,
+  UC_BLK_Greek_Extended,
+  UC_BLK_General_Punctuation,
+  UC_BLK_Superscripts_and_Subscripts,
+  UC_BLK_Currency_Symbols,
+  UC_BLK_Combining_Diacritical_Marks_for_Symbols,
+  UC_BLK_Letterlike_Symbols,
+  UC_BLK_Number_Forms,
+  UC_BLK_Arrows,
+  UC_BLK_Mathematical_Operators,
+  UC_BLK_Miscellaneous_Technical,
+  UC_BLK_Control_Pictures,
+  UC_BLK_Optical_Character_Recognition,
+  UC_BLK_Enclosed_Alphanumerics,
+  UC_BLK_Box_Drawing,
+  UC_BLK_Block_Elements,
+  UC_BLK_Geometric_Shapes,
+  UC_BLK_Miscellaneous_Symbols,
+  UC_BLK_Dingbats,
+  UC_BLK_Miscellaneous_Mathematical_Symbols_A,
+  UC_BLK_Supplemental_Arrows_A,
+  UC_BLK_Braille_Patterns,
+  UC_BLK_Supplemental_Arrows_B,
+  UC_BLK_Miscellaneous_Mathematical_Symbols_B,
+  UC_BLK_Supplemental_Mathematical_Operators,
+  UC_BLK_Miscellaneous_Symbols_and_Arrows,
+  UC_BLK_Glagolitic,
+  UC_BLK_Coptic,
+  UC_BLK_Georgian_Supplement,
+  UC_BLK_Tifinagh,
+  UC_BLK_Ethiopic_Extended,
+  UC_BLK_Supplemental_Punctuation,
+  UC_BLK_CJK_Radicals_Supplement,
+  UC_BLK_Kangxi_Radicals,
+  UC_BLK_Ideographic_Description_Characters,
+  UC_BLK_CJK_Symbols_and_Punctuation,
+  UC_BLK_Hiragana,
+  UC_BLK_Katakana,
+  UC_BLK_Bopomofo,
+  UC_BLK_Hangul_Compatibility_Jamo,
+  UC_BLK_Kanbun,
+  UC_BLK_Bopomofo_Extended,
+  UC_BLK_CJK_Strokes,
+  UC_BLK_Katakana_Phonetic_Extensions,
+  UC_BLK_Enclosed_CJK_Letters_and_Months,
+  UC_BLK_CJK_Compatibility,
+  UC_BLK_CJK_Unified_Ideographs_Extension_A,
+  UC_BLK_Yijing_Hexagram_Symbols,
+  UC_BLK_CJK_Unified_Ideographs,
+  UC_BLK_Yi_Syllables,
+  UC_BLK_Yi_Radicals,
+  UC_BLK_Modifier_Tone_Letters,
+  UC_BLK_Syloti_Nagri,
+  UC_BLK_Hangul_Syllables,
+  UC_BLK_High_Surrogates,
+  UC_BLK_High_Private_Use_Surrogates,
+  UC_BLK_Low_Surrogates,
+  UC_BLK_Private_Use_Area,
+  UC_BLK_CJK_Compatibility_Ideographs,
+  UC_BLK_Alphabetic_Presentation_Forms,
+  UC_BLK_Arabic_Presentation_Forms_A,
+  UC_BLK_Variation_Selectors,
+  UC_BLK_Vertical_Forms,
+  UC_BLK_Combining_Half_Marks,
+  UC_BLK_CJK_Compatibility_Forms,
+  UC_BLK_Small_Form_Variants,
+  UC_BLK_Arabic_Presentation_Forms_B,
+  UC_BLK_Halfwidth_and_Fullwidth_Forms,
+  UC_BLK_Specials,
+  UC_BLK_Linear_B_Syllabary,
+  UC_BLK_Linear_B_Ideograms,
+  UC_BLK_Aegean_Numbers,
+  UC_BLK_Ancient_Greek_Numbers,
+  UC_BLK_Old_Italic,
+  UC_BLK_Gothic,
+  UC_BLK_Ugaritic,
+  UC_BLK_Old_Persian,
+  UC_BLK_Deseret,
+  UC_BLK_Shavian,
+  UC_BLK_Osmanya,
+  UC_BLK_Cypriot_Syllabary,
+  UC_BLK_Kharoshthi,
+  UC_BLK_Byzantine_Musical_Symbols,
+  UC_BLK_Musical_Symbols,
+  UC_BLK_Ancient_Greek_Musical_Notation,
+  UC_BLK_Tai_Xuan_Jing_Symbols,
+  UC_BLK_Mathematical_Alphanumeric_Symbols,
+  UC_BLK_CJK_Unified_Ideographs_Extension_B,
+  UC_BLK_CJK_Compatibility_Ideographs_Supplement,
+  UC_BLK_Tags,
+  UC_BLK_Variation_Selectors_Supplement,
+  UC_BLK_Supplementary_Private_Use_Area_A,
+  UC_BLK_Supplementary_Private_Use_Area_B,
+};
+
 #define UC_FLAG(x) (UINT64_C(1) << (x))
 
 #define UC_FL_COMPOSITION_EXCLUSION     UC_FLAG(0)
@@ -326,7 +543,6 @@ struct unicode_character_data {
   uint64_t fl;			/* Flags */
   const char *name;
   const char *bidi_mirroring_glyph;
-  const char *block;
   const char *uppercase_mapping;
   const char *lowercase_mapping;
   const char *titlecase_mapping;
@@ -339,15 +555,19 @@ struct unicode_character_data {
   uint8_t numeric_value_den;
   uint8_t numeric_value_exp;
   enum unicode_general_category         general_category;
+  enum unicode_block			block;
   enum unicode_script                   script;
-  enum unicode_arabic_joining_type      arabic_joining_type;
-  enum unicode_arabic_joining_group     arabic_joining_group;
+  enum unicode_joining_type      	joining_type;
+  enum unicode_joining_group     	joining_group;
   enum unicode_east_asian_width         east_asian_width;
   enum unicode_hangul_syllable_type     hangul_syllable_type;
-  enum unicode_line_break               line_break;
   enum unicode_numeric_type             numeric_type;
   enum unicode_combining_class	        combining_class;
   enum unicode_bidi_class	    	bidi_class;
+  enum unicode_grapheme_cluster_break	grapheme_cluster_break;
+  enum unicode_sentence_break		sentence_break;
+  enum unicode_word_break		word_break;
+  enum unicode_line_break		line_break;
 };
 
 #endif /* UCD_H */
author	H. Peter Anvin <hpa@smyrno.hos.anvin.org>	2005-11-29 23:25:06 -0800
committer	H. Peter Anvin <hpa@smyrno.hos.anvin.org>	2005-11-29 23:25:06 -0800
commit	86a84dd7743ab0d8140b6a07f81e6b6cf7953b46 (patch)
tree	4dc5ff70d5a3495c8a748633d956cde40b9081ce
parent	c76509f7077ced109ed221248c29495d6cbd40e2 (diff)
download	libucd-86a84dd7743ab0d8140b6a07f81e6b6cf7953b46.tar.gz