Show UTF-8 input as UTF-8 output

.. by doing the stupid "convert to unicode value and back" model.

This actually populates the 'struct video' array with the unicode values, so UTF-8 input actually shows correctly. In particular, the nice test-file (UTF-8-demo.txt) shows up not as garbage, but as the UTF-8 it is.

HOWEVER! Since the *editing* doesn't know about UTF-8, and considers it just a stream of bytes, the end result is not actually a usable UTF-8 editor. So don't get too excited yet: this is just a partial step towards "actually edit UTF-8 data".

NOTE NOTE NOTE! If the character buffer contains Latin1, we will transform that Latin1 to unicode, and then output it as UTF-8. And we will edit it correctly as character-by-character data. Also, we still do the "UTF-8 to Latin1" translation on *input*, so with this commit we can actually continue to *edit* Latin1 text.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: Linus Torvalds <torvalds@linux-foundation.org> 2012-07-10 15:08:17 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> 2012-07-10 15:08:17 -0700
commit: cee00b0efb86c583a10f478a1d7d5b4b5a530a88 (patch)
tree: 78ec343fca3369f432bc34319b62aed2cd7a05d7
parent: e8f984a1b0dc4f5160bc8ccf16ae7df55ac7563d (diff)
download: uemacs-cee00b0efb86c583a10f478a1d7d5b4b5a530a88.tar.gz
1 files changed, 59 insertions, 3 deletions
diff --git a/display.c b/display.c
index 0d0326e..d5b50f5 100644
--- a/display.c
+++ b/display.c
@@ -434,11 +434,67 @@ static int reframe(struct window *wp)
 	return TRUE;
 }
 
+/*
+ * Decode the UTF-8 sequence starting at line[index] into *res.
+ * 'len' is the byte count of 'line'; line[index] is read unconditionally.
+ * Returns the number of bytes consumed. Anything malformed or truncated is
+ * treated as a single Latin1 byte: *res is that byte and 1 is returned.
+ */
+static unsigned utf8_to_unicode(char *line, unsigned index, unsigned len, unicode_t *res)
+{
+	unsigned char c = line[index];
+	unsigned value, bytes, mask, i;
+
+	*res = c;
+	line += index;
+	len -= index;	/* bytes available from the new line[0] onwards */
+
+	/* 0xxxxxxx is valid utf8; 10xxxxxx is invalid, we assume it is Latin1 */
+	if (c < 0xc0)
+		return 1;
+
+	/* Ok, it's 11xxxxxx: each further leading one bit adds a byte */
+	mask = 0x20;
+	bytes = 2;
+	while (c & mask) {
+		bytes++;
+		mask >>= 1;
+	}
+
+	/* Invalid, or would run past the end? Do it as a single byte Latin1 */
+	if (bytes > 6 || bytes > len)
+		return 1;
+
+	value = c & (mask-1);
+
+	/* Ok, do the bytes: six payload bits from each 10xxxxxx byte */
+	for (i = 1; i < bytes; i++) {
+		c = line[i];
+		if ((c & 0xc0) != 0x80)
+			return 1;
+		value = (value << 6) | (c & 0x3f);
+	}
+	*res = value;
+	return bytes;
+}
+
 static void show_line(struct line *lp)
 {
-	int i;
-	for (i = 0; i < llength(lp); ++i)
-		vtputc(lgetc(lp, i));
+	unsigned pos, nbytes = llength(lp);
+	struct video *row = vscreen[vtrow];
+
+	/* Decode lp->l_text as UTF-8 and store one decoded value per
+	 * column of the vscreen[vtrow] row, advancing vtcol each time. */
+	for (pos = 0; pos < nbytes; ++vtcol) {
+		unicode_t ch;
+
+		pos += utf8_to_unicode(lp->l_text, pos, nbytes, &ch);
+		/* At or past term.t_ncol: put '$' in the last column instead. */
+		if (vtcol >= term.t_ncol)
+			row->v_text[term.t_ncol - 1] = '$';
+		else if (vtcol >= 0)
+			row->v_text[vtcol] = ch;
+	}
 }
 
 /*
author	Linus Torvalds <torvalds@linux-foundation.org>	2012-07-10 15:08:17 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2012-07-10 15:08:17 -0700
commit	cee00b0efb86c583a10f478a1d7d5b4b5a530a88 (patch)
tree	78ec343fca3369f432bc34319b62aed2cd7a05d7
parent	e8f984a1b0dc4f5160bc8ccf16ae7df55ac7563d (diff)
download	uemacs-cee00b0efb86c583a10f478a1d7d5b4b5a530a88.tar.gz