overhaul utf8decode() 51e32d49
this changes the utf8decode function to:

* report when an error occurs
* report how many bytes to advance on error

both changes will be useful in the next commit for rendering invalid UTF-8
sequences.

the new implementation is also shorter and more direct.
NRK · 2024-07-04 21:25 1 file(s) · +30 −44
drw.c +30 −44
9 9
#include "util.h"
10 10
11 11
#define UTF_INVALID 0xFFFD
12 -
#define UTF_SIZ     4
13 12
14 -
static const unsigned char utfbyte[UTF_SIZ + 1] = {0x80,    0, 0xC0, 0xE0, 0xF0};
15 -
static const unsigned char utfmask[UTF_SIZ + 1] = {0xC0, 0x80, 0xE0, 0xF0, 0xF8};
16 -
static const long utfmin[UTF_SIZ + 1] = {       0,    0,  0x80,  0x800,  0x10000};
17 -
static const long utfmax[UTF_SIZ + 1] = {0x10FFFF, 0x7F, 0x7FF, 0xFFFF, 0x10FFFF};
18 -
19 -
static long
20 -
utf8decodebyte(const char c, size_t *i)
21 -
{
22 -
	for (*i = 0; *i < (UTF_SIZ + 1); ++(*i))
23 -
		if (((unsigned char)c & utfmask[*i]) == utfbyte[*i])
24 -
			return (unsigned char)c & ~utfmask[*i];
25 -
	return 0;
26 -
}
27 -
28 -
static size_t
29 -
utf8validate(long *u, size_t i)
13 +
static int
14 +
utf8decode(const char *s_in, long *u, int *err)
30 15
{
31 -
	if (!BETWEEN(*u, utfmin[i], utfmax[i]) || BETWEEN(*u, 0xD800, 0xDFFF))
32 -
		*u = UTF_INVALID;
33 -
	for (i = 1; *u > utfmax[i]; ++i)
34 -
		;
35 -
	return i;
36 -
}
37 -
38 -
static size_t
39 -
utf8decode(const char *c, long *u, size_t clen)
40 -
{
41 -
	size_t i, j, len, type;
42 -
	long udecoded;
16 +
	static const unsigned char lens[] = {
17 +
		/* 0XXXX */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
18 +
		/* 10XXX */ 0, 0, 0, 0, 0, 0, 0, 0,  /* invalid */
19 +
		/* 110XX */ 2, 2, 2, 2,
20 +
		/* 1110X */ 3, 3,
21 +
		/* 11110 */ 4,
22 +
		/* 11111 */ 0,  /* invalid */
23 +
	};
24 +
	static const unsigned char leading_mask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
25 +
	static const unsigned int overlong[] = { 0x0, 0x80, 0x0800, 0x10000 };
43 26
27 +
	const unsigned char *s = (const unsigned char *)s_in;
28 +
	int len = lens[*s >> 3];
44 29
	*u = UTF_INVALID;
45 -
	if (!clen)
46 -
		return 0;
47 -
	udecoded = utf8decodebyte(c[0], &len);
48 -
	if (!BETWEEN(len, 1, UTF_SIZ))
30 +
	*err = 1;
31 +
	if (len == 0)
49 32
		return 1;
50 -
	for (i = 1, j = 1; i < clen && j < len; ++i, ++j) {
51 -
		udecoded = (udecoded << 6) | utf8decodebyte(c[i], &type);
52 -
		if (type)
53 -
			return j;
33 +
34 +
	long cp = s[0] & leading_mask[len - 1];
35 +
	for (int i = 1; i < len; ++i) {
36 +
		if (s[i] == '\0' || (s[i] & 0xC0) != 0x80)
37 +
			return i;
38 +
		cp = (cp << 6) | (s[i] & 0x3F);
54 39
	}
55 -
	if (j < len)
56 -
		return 0;
57 -
	*u = udecoded;
58 -
	utf8validate(u, len);
40 +
	/* out of range, surrogate, overlong encoding */
41 +
	if (cp > 0x10FFFF || (cp >> 11) == 0x1B || cp < overlong[len - 1])
42 +
		return len;
59 43
44 +
	*err = 0;
45 +
	*u = cp;
60 46
	return len;
61 47
}
62 48
242 228
	unsigned int tmpw, ew, ellipsis_w = 0, ellipsis_len, hash, h0, h1;
243 229
	XftDraw *d = NULL;
244 230
	Fnt *usedfont, *curfont, *nextfont;
245 -
	int utf8strlen, utf8charlen, render = x || y || w || h;
231 +
	int utf8strlen, utf8charlen, utf8err, render = x || y || w || h;
246 232
	long utf8codepoint = 0;
247 233
	const char *utf8str;
248 234
	FcCharSet *fccharset;
272 258
	if (!ellipsis_width && render)
273 259
		ellipsis_width = drw_fontset_getwidth(drw, "...");
274 260
	while (1) {
275 -
		ew = ellipsis_len = utf8strlen = 0;
261 +
		ew = ellipsis_len = utf8err = utf8charlen = utf8strlen = 0;
276 262
		utf8str = text;
277 263
		nextfont = NULL;
278 264
		while (*text) {
279 -
			utf8charlen = utf8decode(text, &utf8codepoint, UTF_SIZ);
265 +
			utf8charlen = utf8decode(text, &utf8codepoint, &utf8err);
280 266
			for (curfont = drw->fonts; curfont; curfont = curfont->next) {
281 267
				charexists = charexists || XftCharExists(drw->dpy, curfont->xfont, utf8codepoint);
282 268
				if (charexists) {