git.stevedylan.dev

overhaul utf8decode() 51e32d49

this changes the utf8decode function to:

* report when an error occurs
* report how many bytes to advance on error

these will be useful in the next commit to render invalid utf8
sequences.

the new implementation is also shorter and more direct.

NRK · 2024-07-04 21:25 1 file(s) · +30 −44

drw.c +30 −44

#include "util.h"

#define UTF_INVALID 0xFFFD
#define UTF_SIZ     4

/*
 * Byte classification tables. Index 0 matches a continuation byte
 * (10xxxxxx); indices 1..UTF_SIZ match the leading byte of a sequence
 * of that many bytes. A byte belongs to class k when
 * (byte & utfmask[k]) == utfbyte[k].
 */
static const unsigned char utfbyte[UTF_SIZ + 1] = {0x80,    0, 0xC0, 0xE0, 0xF0};
static const unsigned char utfmask[UTF_SIZ + 1] = {0xC0, 0x80, 0xE0, 0xF0, 0xF8};
/* Smallest and largest code point accepted for each index above. */
static const long utfmin[UTF_SIZ + 1] = {       0,    0,  0x80,  0x800,  0x10000};
static const long utfmax[UTF_SIZ + 1] = {0x10FFFF, 0x7F, 0x7FF, 0xFFFF, 0x10FFFF};

/*
 * Classify a single byte and extract its payload bits.
 *
 * c: the byte to inspect.
 * i: out parameter; receives the index of the first matching class in
 *    utfbyte/utfmask, or UTF_SIZ + 1 when no class matches.
 *
 * Returns the bits of c that lie outside the matching class mask, or 0
 * when no class matches.
 */
static long
utf8decodebyte(const char c, size_t *i)
{
	const unsigned char byte = (unsigned char)c;
	size_t kind = 0;

	while (kind <= UTF_SIZ) {
		if ((byte & utfmask[kind]) == utfbyte[kind]) {
			*i = kind;
			/* strip the class marker, keep only the payload bits */
			return byte & ~utfmask[kind];
		}
		kind++;
	}

	/* no pattern matched: report the out-of-range index to the caller */
	*i = kind;
	return 0;
}

static size_t
utf8validate(long *u, size_t i)
static int
utf8decode(const char *s_in, long *u, int *err)
{
	if (!BETWEEN(*u, utfmin[i], utfmax[i]) || BETWEEN(*u, 0xD800, 0xDFFF))
		*u = UTF_INVALID;
	for (i = 1; *u > utfmax[i]; ++i)
		;
	return i;
}

static size_t
utf8decode(const char *c, long *u, size_t clen)
{
	size_t i, j, len, type;
	long udecoded;
	static const unsigned char lens[] = {
		/* 0XXXX */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
		/* 10XXX */ 0, 0, 0, 0, 0, 0, 0, 0,  /* invalid */
		/* 110XX */ 2, 2, 2, 2,
		/* 1110X */ 3, 3,
		/* 11110 */ 4,
		/* 11111 */ 0,  /* invalid */
	};
	static const unsigned char leading_mask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
	static const unsigned int overlong[] = { 0x0, 0x80, 0x0800, 0x10000 };

	const unsigned char *s = (const unsigned char *)s_in;
	int len = lens[*s >> 3];
	*u = UTF_INVALID;
	if (!clen)
		return 0;
	udecoded = utf8decodebyte(c[0], &len);
	if (!BETWEEN(len, 1, UTF_SIZ))
	*err = 1;
	if (len == 0)
		return 1;
	for (i = 1, j = 1; i < clen && j < len; ++i, ++j) {
		udecoded = (udecoded << 6) | utf8decodebyte(c[i], &type);
		if (type)
			return j;

	long cp = s[0] & leading_mask[len - 1];
	for (int i = 1; i < len; ++i) {
		if (s[i] == '\0' || (s[i] & 0xC0) != 0x80)
			return i;
		cp = (cp << 6) | (s[i] & 0x3F);
	}
	if (j < len)
		return 0;
	*u = udecoded;
	utf8validate(u, len);
	/* out of range, surrogate, overlong encoding */
	if (cp > 0x10FFFF || (cp >> 11) == 0x1B || cp < overlong[len - 1])
		return len;

	*err = 0;
	*u = cp;
	return len;
}


	unsigned int tmpw, ew, ellipsis_w = 0, ellipsis_len, hash, h0, h1;
	XftDraw *d = NULL;
	Fnt *usedfont, *curfont, *nextfont;
	int utf8strlen, utf8charlen, render = x || y || w || h;
	int utf8strlen, utf8charlen, utf8err, render = x || y || w || h;
	long utf8codepoint = 0;
	const char *utf8str;
	FcCharSet *fccharset;

	if (!ellipsis_width && render)
		ellipsis_width = drw_fontset_getwidth(drw, "...");
	while (1) {
		ew = ellipsis_len = utf8strlen = 0;
		ew = ellipsis_len = utf8err = utf8charlen = utf8strlen = 0;
		utf8str = text;
		nextfont = NULL;
		while (*text) {
			utf8charlen = utf8decode(text, &utf8codepoint, UTF_SIZ);
			utf8charlen = utf8decode(text, &utf8codepoint, &utf8err);
			for (curfont = drw->fonts; curfont; curfont = curfont->next) {
				charexists = charexists || XftCharExists(drw->dpy, curfont->xfont, utf8codepoint);
				if (charexists) {

9	9		#include "util.h"
10	10
11	11		#define UTF_INVALID 0xFFFD
12		-	#define UTF_SIZ 4
13	12
14		-	static const unsigned char utfbyte[UTF_SIZ + 1] = {0x80, 0, 0xC0, 0xE0, 0xF0};
15		-	static const unsigned char utfmask[UTF_SIZ + 1] = {0xC0, 0x80, 0xE0, 0xF0, 0xF8};
16		-	static const long utfmin[UTF_SIZ + 1] = { 0, 0, 0x80, 0x800, 0x10000};
17		-	static const long utfmax[UTF_SIZ + 1] = {0x10FFFF, 0x7F, 0x7FF, 0xFFFF, 0x10FFFF};
18		-
19		-	static long
20		-	utf8decodebyte(const char c, size_t *i)
21		-	{
22		-	for (*i = 0; *i < (UTF_SIZ + 1); ++(*i))
23		-	if (((unsigned char)c & utfmask[*i]) == utfbyte[*i])
24		-	return (unsigned char)c & ~utfmask[*i];
25		-	return 0;
26		-	}
27		-
28		-	static size_t
29		-	utf8validate(long *u, size_t i)
	13	+	static int
	14	+	utf8decode(const char *s_in, long *u, int *err)
30	15		{
31		-	if (!BETWEEN(*u, utfmin[i], utfmax[i]) || BETWEEN(*u, 0xD800, 0xDFFF))
32		-	*u = UTF_INVALID;
33		-	for (i = 1; *u > utfmax[i]; ++i)
34		-	;
35		-	return i;
36		-	}
37		-
38		-	static size_t
39		-	utf8decode(const char *c, long *u, size_t clen)
40		-	{
41		-	size_t i, j, len, type;
42		-	long udecoded;
	16	+	static const unsigned char lens[] = {
	17	+	/* 0XXXX */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	18	+	/* 10XXX */ 0, 0, 0, 0, 0, 0, 0, 0, /* invalid */
	19	+	/* 110XX */ 2, 2, 2, 2,
	20	+	/* 1110X */ 3, 3,
	21	+	/* 11110 */ 4,
	22	+	/* 11111 */ 0, /* invalid */
	23	+	};
	24	+	static const unsigned char leading_mask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
	25	+	static const unsigned int overlong[] = { 0x0, 0x80, 0x0800, 0x10000 };
43	26
	27	+	const unsigned char *s = (const unsigned char *)s_in;
	28	+	int len = lens[*s >> 3];
44	29		*u = UTF_INVALID;
45		-	if (!clen)
46		-	return 0;
47		-	udecoded = utf8decodebyte(c[0], &len);
48		-	if (!BETWEEN(len, 1, UTF_SIZ))
	30	+	*err = 1;
	31	+	if (len == 0)
49	32		return 1;
50		-	for (i = 1, j = 1; i < clen && j < len; ++i, ++j) {
51		-	udecoded = (udecoded << 6) | utf8decodebyte(c[i], &type);
52		-	if (type)
53		-	return j;
	33	+
	34	+	long cp = s[0] & leading_mask[len - 1];
	35	+	for (int i = 1; i < len; ++i) {
	36	+	if (s[i] == '\0' || (s[i] & 0xC0) != 0x80)
	37	+	return i;
	38	+	cp = (cp << 6) | (s[i] & 0x3F);
54	39		}
55		-	if (j < len)
56		-	return 0;
57		-	*u = udecoded;
58		-	utf8validate(u, len);
	40	+	/* out of range, surrogate, overlong encoding */
	41	+	if (cp > 0x10FFFF || (cp >> 11) == 0x1B || cp < overlong[len - 1])
	42	+	return len;
59	43
	44	+	*err = 0;
	45	+	*u = cp;
60	46		return len;
61	47		}
62	48

242	228		unsigned int tmpw, ew, ellipsis_w = 0, ellipsis_len, hash, h0, h1;
243	229		XftDraw *d = NULL;
244	230		Fnt *usedfont, *curfont, *nextfont;
245		-	int utf8strlen, utf8charlen, render = x || y || w || h;
	231	+	int utf8strlen, utf8charlen, utf8err, render = x || y || w || h;
246	232		long utf8codepoint = 0;
247	233		const char *utf8str;
248	234		FcCharSet *fccharset;

272	258		if (!ellipsis_width && render)
273	259		ellipsis_width = drw_fontset_getwidth(drw, "...");
274	260		while (1) {
275		-	ew = ellipsis_len = utf8strlen = 0;
	261	+	ew = ellipsis_len = utf8err = utf8charlen = utf8strlen = 0;
276	262		utf8str = text;
277	263		nextfont = NULL;
278	264		while (*text) {
279		-	utf8charlen = utf8decode(text, &utf8codepoint, UTF_SIZ);
	265	+	utf8charlen = utf8decode(text, &utf8codepoint, &utf8err);
280	266		for (curfont = drw->fonts; curfont; curfont = curfont->next) {
281	267		charexists = charexists || XftCharExists(drw->dpy, curfont->xfont, utf8codepoint);
282	268		if (charexists) {