From 3e125830a5446970184c1b70c03c7aee72909883 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kristian=20H=C3=B8gsberg?=
Date: Tue, 13 Aug 2013 17:23:54 -0700
Subject: [PATCH] terminal: Make utf-8 state machine assemble unicode code
 point value

---
Note: get_unicode() in this patch returns U+FFFD for every decode that
does not end in the accept state, so a truncated sequence or a run of
stray continuation bytes cannot leak a partial or unset machine.unicode.

 clients/terminal.c | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/clients/terminal.c b/clients/terminal.c
index 31bcedd42..6701fb62c 100644
--- a/clients/terminal.c
+++ b/clients/terminal.c
@@ -106,6 +106,7 @@ struct utf8_state_machine {
 	enum utf8_state state;
 	int len;
 	union utf8_char s;
+	uint32_t unicode;
 };
 
 static void
@@ -132,6 +133,7 @@ utf8_next_char(struct utf8_state_machine *machine, unsigned char c)
 			/* single byte, accept */
 			machine->s.byte[machine->len++] = c;
 			machine->state = utf8state_accept;
+			machine->unicode = c;
 		} else if((c & 0xC0) == 0x80) {
 			/* parser out of sync, ignore byte */
 			machine->state = utf8state_start;
@@ -139,14 +141,17 @@ utf8_next_char(struct utf8_state_machine *machine, unsigned char c)
 			/* start of two byte sequence */
 			machine->s.byte[machine->len++] = c;
 			machine->state = utf8state_expect1;
+			machine->unicode = c & 0x1f;
 		} else if((c & 0xF0) == 0xE0) {
 			/* start of three byte sequence */
 			machine->s.byte[machine->len++] = c;
 			machine->state = utf8state_expect2;
+			machine->unicode = c & 0x0f;
 		} else if((c & 0xF8) == 0xF0) {
 			/* start of four byte sequence */
 			machine->s.byte[machine->len++] = c;
 			machine->state = utf8state_expect3;
+			machine->unicode = c & 0x07;
 		} else {
 			/* overlong encoding, reject */
 			machine->state = utf8state_reject;
@@ -154,6 +159,7 @@ utf8_next_char(struct utf8_state_machine *machine, unsigned char c)
 		break;
 	case utf8state_expect3:
 		machine->s.byte[machine->len++] = c;
+		machine->unicode = (machine->unicode << 6) | (c & 0x3f);
 		if((c & 0xC0) == 0x80) {
 			/* all good, continue */
 			machine->state = utf8state_expect2;
@@ -164,6 +170,7 @@ utf8_next_char(struct utf8_state_machine *machine, unsigned char c)
 		break;
 	case utf8state_expect2:
 		machine->s.byte[machine->len++] = c;
+		machine->unicode = (machine->unicode << 6) | (c & 0x3f);
 		if((c & 0xC0) == 0x80) {
 			/* all good, continue */
 			machine->state = utf8state_expect1;
@@ -174,6 +181,7 @@ utf8_next_char(struct utf8_state_machine *machine, unsigned char c)
 		break;
 	case utf8state_expect1:
 		machine->s.byte[machine->len++] = c;
+		machine->unicode = (machine->unicode << 6) | (c & 0x3f);
 		if((c & 0xC0) == 0x80) {
 			/* all good, accept */
 			machine->state = utf8state_accept;
@@ -190,6 +198,35 @@ utf8_next_char(struct utf8_state_machine *machine, unsigned char c)
 	return machine->state;
 }
 
+/*
+ * Decode the UTF-8 sequence stored in utf8 and return its code point.
+ *
+ * The bytes are replayed through a scratch state machine.  Only a run
+ * that ends in the accept state has produced a complete value in
+ * machine.unicode; a rejected sequence, a sequence still incomplete
+ * after four bytes, or input made up solely of stray continuation
+ * bytes yields U+FFFD (REPLACEMENT CHARACTER) instead.
+ */
+static uint32_t
+get_unicode(union utf8_char utf8)
+{
+	struct utf8_state_machine machine;
+	int i;
+
+	init_state_machine(&machine);
+	for (i = 0; i < 4; i++) {
+		utf8_next_char(&machine, utf8.byte[i]);
+		if (machine.state == utf8state_accept ||
+		    machine.state == utf8state_reject)
+			break;
+	}
+
+	if (machine.state != utf8state_accept)
+		return 0xfffd;
+
+	return machine.unicode;
+}
+
 struct char_sub {
 	union utf8_char match;
 	union utf8_char replace;