diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2012-07-10 16:21:35 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-07-10 16:21:35 -0700 |
commit | e62cdf04cff63381121364cd6ef077f00d72307a (patch) | |
tree | 54f971d3553ca036daab8bf2f7b7d001e68e9ad0 | |
parent | 12e4647debb6016da2063feed7516ad835fffd08 (diff) | |
download | uemacs-e62cdf04cff63381121364cd6ef077f00d72307a.tar.gz |
Split up the utf8 helper functions into a file of their own
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | Makefile | 8 | ||||
-rw-r--r-- | display.c | 47 | ||||
-rw-r--r-- | posix.c | 24 | ||||
-rw-r--r-- | utf8.c | 98 | ||||
-rw-r--r-- | utf8.h | 9 |
5 files changed, 119 insertions, 67 deletions
@@ -20,13 +20,13 @@ SRC=ansi.c basic.c bind.c buffer.c crypt.c display.c eval.c exec.c \ file.c fileio.c ibmpc.c input.c isearch.c line.c lock.c main.c \ pklock.c posix.c random.c region.c search.c spawn.c tcap.c \ termio.c vmsvt.c vt52.c window.c word.c names.c globals.c version.c \ - usage.c wrapper.c + usage.c wrapper.c utf8.c OBJ=ansi.o basic.o bind.o buffer.o crypt.o display.o eval.o exec.o \ file.o fileio.o ibmpc.o input.o isearch.o line.o lock.o main.o \ pklock.o posix.o random.o region.o search.o spawn.o tcap.o \ termio.o vmsvt.o vt52.o window.o word.o names.o globals.o version.o \ - usage.o wrapper.o + usage.o wrapper.o utf8.o HDR=ebind.h edef.h efunc.h epath.h estruct.h evar.h util.h version.h @@ -132,7 +132,7 @@ basic.o: basic.c estruct.h edef.h bind.o: bind.c estruct.h edef.h epath.h buffer.o: buffer.c estruct.h edef.h crypt.o: crypt.c estruct.h edef.h -display.o: display.c estruct.h edef.h +display.o: display.c estruct.h edef.h utf8.h eval.o: eval.c estruct.h edef.h evar.h exec.o: exec.c estruct.h edef.h file.o: file.c estruct.h edef.h @@ -144,12 +144,14 @@ line.o: line.c estruct.h edef.h lock.o: lock.c estruct.h edef.h main.o: main.c estruct.h efunc.h edef.h ebind.h pklock.o: pklock.c estruct.h +posix.o: posix.c estruct.h utf8.h random.o: random.c estruct.h edef.h region.o: region.c estruct.h edef.h search.o: search.c estruct.h edef.h spawn.o: spawn.c estruct.h edef.h tcap.o: tcap.c estruct.h edef.h termio.o: termio.c estruct.h edef.h +utf8.o: utf8.c utf8.h vmsvt.o: vmsvt.c estruct.h edef.h vt52.o: vt52.c estruct.h edef.h window.o: window.c estruct.h edef.h @@ -19,8 +19,7 @@ #include "line.h" #include "version.h" #include "wrapper.h" - -typedef unsigned int unicode_t; +#include "utf8.h" struct video { int v_flag; /* Flags */ @@ -434,50 +433,6 @@ static int reframe(struct window *wp) return TRUE; } -static unsigned utf8_to_unicode(char *line, unsigned index, unsigned len, unicode_t *res) -{ - unsigned value; - unsigned char c = line[index]; - unsigned 
bytes, mask, i; - - *res = c; - line += index; - len -= index; - - /* - * 0xxxxxxx is valid utf8 - * 10xxxxxx is invalid UTF-8, we assume it is Latin1 - */ - if (c < 0xc0) - return 1; - - /* Ok, it's 11xxxxxx, do a stupid decode */ - mask = 0x20; - bytes = 2; - while (c & mask) { - bytes++; - mask >>= 1; - } - - /* Invalid? Do it as a single byte Latin1 */ - if (bytes > 6) - return 1; - - value = c & (mask-1); - - /* Ok, do the bytes */ - for (i = 1; i < bytes; i++) { - if (i > len) - return 1; - c = line[i]; - if ((c & 0xc0) != 0x80) - return 1; - value = (value << 6) | (c & 0x3f); - } - *res = value; - return bytes; -} - static void show_line(struct line *lp) { unsigned i = 0, len = llength(lp); @@ -22,6 +22,7 @@ #include "estruct.h" #include "edef.h" #include "efunc.h" +#include "utf8.h" /* Since Mac OS X's termios.h doesn't have the following 2 macros, define them. */ @@ -106,24 +107,11 @@ void ttclose(void) */ int ttputc(int c) { - unsigned char utf8[6], *p = utf8+5; - int bytes = 1; - - if (c < 0) - return 0; - *p = c; - if (c > 0x7f) { - int prefix = 0x40; - do { - *p = 0x80 + (c & 0x3f); - --p; - bytes++; - prefix >>= 1; - c >>= 6; - } while (c > prefix); - *p = c - 2*prefix; - } - fwrite(p, 1, bytes, stdout); + char utf8[6]; + int bytes; + + bytes = unicode_to_utf8(c, utf8); + fwrite(utf8, 1, bytes, stdout); return 0; } @@ -0,0 +1,98 @@ +#include "utf8.h" + +/* + * utf8_to_unicode() + * + * Convert a UTF-8 sequence to its unicode value, and return the length of + * the sequence in bytes. + * + * NOTE! Invalid UTF-8 will be converted to a one-byte sequence, so you can + * either use it as-is (ie as Latin1) or you can check for invalid UTF-8 + * by checking for a length of 1 and a result > 127. + * + * NOTE 2! This does *not* verify things like minimality. So overlong forms + * are happily accepted and decoded, as are the various "invalid values". 
/*
 * utf8_to_unicode()
 *
 * Decode the UTF-8 sequence that starts at line[index] and store the
 * resulting code point in *res.  Returns the number of bytes consumed.
 *
 *   line   buffer holding the text
 *   index  offset of the first byte of the sequence; the caller must
 *          guarantee index < len, because that byte is always read
 *   len    total number of valid bytes in line
 *   res    receives the decoded value
 *
 * Anything that is not a well-formed sequence (a stray continuation byte,
 * a lead byte announcing more than 6 bytes, a sequence cut short by the
 * end of the buffer, a bad continuation byte) is returned as a one-byte
 * sequence whose value is the raw byte, so the caller can treat it as
 * Latin1 or detect it as "length 1 and value > 127".
 *
 * Minimality is not verified: overlong forms and the various "invalid"
 * code points are decoded like anything else.
 */
unsigned utf8_to_unicode(char *line, unsigned index, unsigned len, unicode_t *res)
{
	unsigned value;
	unsigned char c = line[index];
	unsigned bytes, mask, i;

	*res = c;
	line += index;
	len -= index;	/* from here on: bytes available starting at line[0] */

	/*
	 * 0xxxxxxx is plain ASCII
	 * 10xxxxxx is a stray continuation byte, assumed to be Latin1
	 */
	if (c < 0xc0)
		return 1;

	/* 11xxxxxx: every further leading 1 bit adds one byte to the sequence */
	mask = 0x20;
	bytes = 2;
	while (c & mask) {
		bytes++;
		mask >>= 1;
	}

	/* Lead bytes 0xfe/0xff would need 7 or 8 bytes: not UTF-8 */
	if (bytes > 6)
		return 1;

	/*
	 * Truncated sequence.  This has to be checked before the loop below
	 * so that line[len] and beyond are never read.
	 */
	if (bytes > len)
		return 1;

	/* Payload bits of the lead byte are the ones below the first 0 bit */
	value = c & (mask - 1);

	for (i = 1; i < bytes; i++) {
		c = line[i];
		if ((c & 0xc0) != 0x80)
			return 1;
		value = (value << 6) | (c & 0x3f);
	}
	*res = value;
	return bytes;
}

/*
 * Reverse the bytes in the inclusive range [begin, end] in place.
 * Nothing is touched when the range holds fewer than two bytes.
 */
static void reverse_string(char *begin, char *end)
{
	while (begin < end) {
		char tmp = *begin;

		*begin++ = *end;
		*end-- = tmp;
	}
}

/*
 * unicode_to_utf8()
 *
 * Write the shortest UTF-8 encoding of c into utf8 and return its length
 * in bytes (1 to 6).  The output is not NUL terminated; utf8 must have
 * room for 6 bytes.
 *
 * Values above 0x7fffffff cannot be expressed in 6 bytes; for those
 * nothing is written and 0 is returned.
 *
 * NOTE! This does not check for - or care about - the "invalid" unicode
 * values.  Also, converting a utf-8 sequence to unicode and back does
 * *not* guarantee the same sequence, since this generates the shortest
 * possible sequence, while utf8_to_unicode() accepts both Latin1 and
 * overlong utf-8 sequences.
 */
unsigned unicode_to_utf8(unsigned int c, char *utf8)
{
	unsigned bytes = 1;

	if (c > 0x7fffffff)
		return 0;

	*utf8 = (char)c;
	if (c > 0x7f) {
		/* First value that no longer fits in the lead byte's payload */
		unsigned prefix = 0x40;
		char *p = utf8;

		/*
		 * Emit continuation bytes lowest-order first.  Each one carries
		 * 6 bits and costs the lead byte one payload bit, so keep going
		 * as long as the remainder does not fit: that is c >= prefix,
		 * not c > prefix.
		 */
		do {
			*p++ = (char)(0x80 | (c & 0x3f));
			bytes++;
			prefix >>= 1;
			c >>= 6;
		} while (c >= prefix);

		/* -2*prefix in 8 bits is exactly the 11..10 length marker */
		*p = (char)(c - 2 * prefix);
		reverse_string(utf8, p);
	}
	return bytes;
}

/* ---- utf8.h ---- */
#ifndef UTF8_H
#define UTF8_H

typedef unsigned int unicode_t;

unsigned utf8_to_unicode(char *line, unsigned index, unsigned len, unicode_t *res);
unsigned unicode_to_utf8(unsigned int c, char *utf8);

#endif