aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2012-07-10 16:21:35 -0700
committer Linus Torvalds <torvalds@linux-foundation.org> 2012-07-10 16:21:35 -0700
commit e62cdf04cff63381121364cd6ef077f00d72307a (patch)
tree 54f971d3553ca036daab8bf2f7b7d001e68e9ad0
parent 12e4647debb6016da2063feed7516ad835fffd08 (diff)
download uemacs-e62cdf04cff63381121364cd6ef077f00d72307a.tar.gz
Split up the utf8 helper functions into a file of their own
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- Makefile 8
-rw-r--r-- display.c 47
-rw-r--r-- posix.c 24
-rw-r--r-- utf8.c 98
-rw-r--r-- utf8.h 9
5 files changed, 119 insertions, 67 deletions
diff --git a/Makefile b/Makefile
index eb53a8a..76a8122 100644
--- a/Makefile
+++ b/Makefile
@@ -20,13 +20,13 @@ SRC=ansi.c basic.c bind.c buffer.c crypt.c display.c eval.c exec.c \
file.c fileio.c ibmpc.c input.c isearch.c line.c lock.c main.c \
pklock.c posix.c random.c region.c search.c spawn.c tcap.c \
termio.c vmsvt.c vt52.c window.c word.c names.c globals.c version.c \
- usage.c wrapper.c
+ usage.c wrapper.c utf8.c
OBJ=ansi.o basic.o bind.o buffer.o crypt.o display.o eval.o exec.o \
file.o fileio.o ibmpc.o input.o isearch.o line.o lock.o main.o \
pklock.o posix.o random.o region.o search.o spawn.o tcap.o \
termio.o vmsvt.o vt52.o window.o word.o names.o globals.o version.o \
- usage.o wrapper.o
+ usage.o wrapper.o utf8.o
HDR=ebind.h edef.h efunc.h epath.h estruct.h evar.h util.h version.h
@@ -132,7 +132,7 @@ basic.o: basic.c estruct.h edef.h
bind.o: bind.c estruct.h edef.h epath.h
buffer.o: buffer.c estruct.h edef.h
crypt.o: crypt.c estruct.h edef.h
-display.o: display.c estruct.h edef.h
+display.o: display.c estruct.h edef.h utf8.h
eval.o: eval.c estruct.h edef.h evar.h
exec.o: exec.c estruct.h edef.h
file.o: file.c estruct.h edef.h
@@ -144,12 +144,14 @@ line.o: line.c estruct.h edef.h
lock.o: lock.c estruct.h edef.h
main.o: main.c estruct.h efunc.h edef.h ebind.h
pklock.o: pklock.c estruct.h
+posix.o: posix.c estruct.h utf8.h
random.o: random.c estruct.h edef.h
region.o: region.c estruct.h edef.h
search.o: search.c estruct.h edef.h
spawn.o: spawn.c estruct.h edef.h
tcap.o: tcap.c estruct.h edef.h
termio.o: termio.c estruct.h edef.h
+utf8.o: utf8.c utf8.h
vmsvt.o: vmsvt.c estruct.h edef.h
vt52.o: vt52.c estruct.h edef.h
window.o: window.c estruct.h edef.h
diff --git a/display.c b/display.c
index d5b50f5..82b4f84 100644
--- a/display.c
+++ b/display.c
@@ -19,8 +19,7 @@
#include "line.h"
#include "version.h"
#include "wrapper.h"
-
-typedef unsigned int unicode_t;
+#include "utf8.h"
struct video {
int v_flag; /* Flags */
@@ -434,50 +433,6 @@ static int reframe(struct window *wp)
return TRUE;
}
-static unsigned utf8_to_unicode(char *line, unsigned index, unsigned len, unicode_t *res)
-{
- unsigned value;
- unsigned char c = line[index];
- unsigned bytes, mask, i;
-
- *res = c;
- line += index;
- len -= index;
-
- /*
- * 0xxxxxxx is valid utf8
- * 10xxxxxx is invalid UTF-8, we assume it is Latin1
- */
- if (c < 0xc0)
- return 1;
-
- /* Ok, it's 11xxxxxx, do a stupid decode */
- mask = 0x20;
- bytes = 2;
- while (c & mask) {
- bytes++;
- mask >>= 1;
- }
-
- /* Invalid? Do it as a single byte Latin1 */
- if (bytes > 6)
- return 1;
-
- value = c & (mask-1);
-
- /* Ok, do the bytes */
- for (i = 1; i < bytes; i++) {
- if (i > len)
- return 1;
- c = line[i];
- if ((c & 0xc0) != 0x80)
- return 1;
- value = (value << 6) | (c & 0x3f);
- }
- *res = value;
- return bytes;
-}
-
static void show_line(struct line *lp)
{
unsigned i = 0, len = llength(lp);
diff --git a/posix.c b/posix.c
index c33b286..445724e 100644
--- a/posix.c
+++ b/posix.c
@@ -22,6 +22,7 @@
#include "estruct.h"
#include "edef.h"
#include "efunc.h"
+#include "utf8.h"
/* Since Mac OS X's termios.h doesn't have the following 2 macros, define them.
*/
@@ -106,24 +107,11 @@ void ttclose(void)
*/
int ttputc(int c)
{
- unsigned char utf8[6], *p = utf8+5;
- int bytes = 1;
-
- if (c < 0)
- return 0;
- *p = c;
- if (c > 0x7f) {
- int prefix = 0x40;
- do {
- *p = 0x80 + (c & 0x3f);
- --p;
- bytes++;
- prefix >>= 1;
- c >>= 6;
- } while (c > prefix);
- *p = c - 2*prefix;
- }
- fwrite(p, 1, bytes, stdout);
+	char utf8[6];	/* unicode_to_utf8() output; it gets no size argument */
+
+	if (c < 0)	/* not a character; as unsigned it would be */
+		return 0;	/* far too large to encode into utf8[6] */
+	fwrite(utf8, 1, unicode_to_utf8(c, utf8), stdout);
return 0;
}
diff --git a/utf8.c b/utf8.c
new file mode 100644
index 0000000..6276b13
--- /dev/null
+++ b/utf8.c
@@ -0,0 +1,98 @@
+#include "utf8.h"
+
+/*
+ * utf8_to_unicode()
+ *
+ * Decode the UTF-8 sequence at line[index] ('len' is the total length of
+ * 'line', and index must be below it), store its unicode value in *res,
+ * and return the length of the sequence in bytes.
+ *
+ * NOTE! Invalid UTF-8 - including a sequence cut short by the end of the
+ * line - will be converted to a one-byte sequence, so you can either use
+ * it as-is (ie as Latin1) or you can check for invalid UTF-8 by checking
+ * for a length of 1 and a result > 127.
+ *
+ * NOTE 2! This does *not* verify things like minimality. So overlong forms
+ * are happily accepted and decoded, as are the various "invalid values".
+ */
+unsigned utf8_to_unicode(char *line, unsigned index, unsigned len, unicode_t *res)
+{
+	unsigned value;
+	unsigned char c = line[index];
+	unsigned bytes, mask, i;
+
+	*res = c;
+	line += index;
+	len -= index;
+
+	/*
+	 * 0xxxxxxx is valid utf8
+	 * 10xxxxxx is invalid UTF-8, we assume it is Latin1
+	 */
+	if (c < 0xc0)
+		return 1;
+
+	/* Ok, it's 11xxxxxx: each further leading 1 bit adds one byte */
+	mask = 0x20;
+	bytes = 2;
+	while (c & mask) {
+		bytes++;
+		mask >>= 1;
+	}
+
+	/* Invalid, or cut short by the end of the line? Single byte Latin1 */
+	if (bytes > 6 || bytes > len)
+		return 1;
+
+	value = c & (mask-1);
+
+	/* Ok, do the continuation bytes, which must all be 10xxxxxx */
+	for (i = 1; i < bytes; i++) {
+		c = line[i];
+		if ((c & 0xc0) != 0x80)
+			return 1;
+		value = (value << 6) | (c & 0x3f);
+	}
+	*res = value;
+	return bytes;
+}
+
+/* Swap the bytes of [begin, end] end for end; 'end' is the last byte itself. */
+static void reverse_string(char *begin, char *end)
+{
+	while (begin < end) {
+		char tmp = *begin;
+		*begin++ = *end;
+		*end-- = tmp;
+	}
+}
+
+/*
+ * unicode_to_utf8()
+ *
+ * Convert a unicode value to its canonical (shortest) utf-8 sequence: up to
+ * 6 bytes are stored in 'utf8' (no NUL is added) and their count returned.
+ * Values above 0x7fffffff do not fit: nothing is stored and 0 is returned.
+ * The "invalid" unicode values are not rejected, and utf8_to_unicode() plus
+ * this need not give back the same bytes (Latin1, overlong sequences).
+ */
+unsigned unicode_to_utf8(unsigned int c, char *utf8)
+{
+	unsigned prefix = 0x40;
+	char *p = utf8;
+
+	if (c > 0x7fffffff)
+		return 0;
+	*utf8 = c;
+	if (c <= 0x7f)
+		return 1;
+	/* Low 6 bits first, until the rest fits next to the length marker */
+	do {
+		*p++ = 0x80 + (c & 0x3f);
+		prefix >>= 1;
+		c >>= 6;
+	} while (c >= prefix);	/* '>' cut 0x800, 0x10000, ... one byte short */
+	*p = (0x100 - 2*prefix) | c;	/* lead byte: 110xxxxx, 1110xxxx, ... */
+	reverse_string(utf8, p);
+	return (unsigned)(p - utf8) + 1;
+}
diff --git a/utf8.h b/utf8.h
new file mode 100644
index 0000000..b60ccd2
--- /dev/null
+++ b/utf8.h
@@ -0,0 +1,9 @@
+#ifndef UTF8_H
+#define UTF8_H
+/* Value type that utf8_to_unicode() stores its decoded result in. */
+typedef unsigned int unicode_t;
+/* Converters between UTF-8 byte sequences and unicode values (utf8.c). */
+unsigned utf8_to_unicode(char *line, unsigned index, unsigned len, unicode_t *res);
+unsigned unicode_to_utf8(unsigned int c, char *utf8);
+
+#endif