Fix 'getccol()' and 'getgoal()' functions for multibyte UTF-8 characters

These functions convert a byte offset into a column number (getccol()) and vice versa (getgoal()). Getting this right means that moving up and down the text lands on the right columns, rather than drifting randomly left and right as you move up and down. We also won't end up in the middle of a UTF-8 character, because we're not just moving to some random byte offset, we're moving to a proper column.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: Linus Torvalds <torvalds@linux-foundation.org> 2012-07-15 14:36:38 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> 2012-07-15 14:36:38 -0700
commit: ddd45dbff17f4fc30c66404331f6b8f0c05cdd82 (patch)
tree: f9d1d8156efcac6ae65edd363f88ba90b8ea36cb
parent: 1edeced67c1cacefe83444e5fd403dab73529ebf (diff)
download: uemacs-ddd45dbff17f4fc30c66404331f6b8f0c05cdd82.tar.gz
2 files changed, 22 insertions, 8 deletions
diff --git a/basic.c b/basic.c
index 3a7d6f7..5071047 100644
--- a/basic.c
+++ b/basic.c
@@ -24,25 +24,31 @@
  */
 static int getgoal(struct line *dlp)
 {
-	int c;
 	int col;
 	int newcol;
 	int dbo;
+	int len = llength(dlp);
 
 	col = 0;
 	dbo = 0;
-	while (dbo != llength(dlp)) {
-		c = lgetc(dlp, dbo);
+	while (dbo != len) {
+		unicode_t c;
+		int width = utf8_to_unicode(dlp->l_text, dbo, len, &c);
 		newcol = col;
+
+		/* Tab ORs in tabmask; ^X takes 2 columns, \xx (0x80..0xa0) takes 3 */
 		if (c == '\t')
 			newcol |= tabmask;
 		else if (c < 0x20 || c == 0x7F)
 			++newcol;
+		else if (c >= 0x80 && c <= 0xa0)
+			newcol += 2;
+
 		++newcol;
 		if (newcol > curgoal)
 			break;	/* this character would pass the goal column */
 		col = newcol;
-		++dbo;
+		dbo += width;	/* step over the whole decoded character */
 	}
 	return dbo;
 }
diff --git a/random.c b/random.c
index 240e807..455661d 100644
--- a/random.c
+++ b/random.c
@@ -124,16 +124,24 @@ int getcline(void)
  */
 int getccol(int bflg)
 {
-	int c, i, col;
-	col = 0;
-	for (i = 0; i < curwp->w_doto; ++i) {
-		c = lgetc(curwp->w_dotp, i);
+	int i, col;
+	struct line *dlp = curwp->w_dotp;
+	int byte_offset = curwp->w_doto;
+	int len = llength(dlp);
+
+	col = i = 0;
+	while (i < byte_offset) {
+		unicode_t c;
+
+		i += utf8_to_unicode(dlp->l_text, i, len, &c);
 		if (c != ' ' && c != '\t' && bflg)
 			break;	/* bflg set: stop at the first non-blank */
 		if (c == '\t')
 			col |= tabmask;
 		else if (c < 0x20 || c == 0x7F)
 			++col;	/* shown as ^X: 2 columns in total */
+		else if (c >= 0x80 && c <= 0xa0)	/* same \xx range as getgoal() */
+			col += 2;
 		++col;
 	}
 	return col;
author	Linus Torvalds <torvalds@linux-foundation.org>	2012-07-15 14:36:38 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2012-07-15 14:36:38 -0700
commit	ddd45dbff17f4fc30c66404331f6b8f0c05cdd82 (patch)
tree	f9d1d8156efcac6ae65edd363f88ba90b8ea36cb
parent	1edeced67c1cacefe83444e5fd403dab73529ebf (diff)
download	uemacs-ddd45dbff17f4fc30c66404331f6b8f0c05cdd82.tar.gz