Skip to content

Commit

Permalink
Merge pull request antirez#32 from cxw42/faster-utf8
Browse files Browse the repository at this point in the history
UTF8 isWideChar, isCombiningChar: fast-bail for low-codepoint text
  • Loading branch information
rain-1 authored Jun 8, 2018
2 parents 1e6ffd4 + 38109ad commit 7866206
Showing 1 changed file with 15 additions and 4 deletions.
19 changes: 15 additions & 4 deletions utf8.c
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@

/* ============================ UTF8 utilities ============================== */

static unsigned long wideCharTable[][2] = {
static unsigned long wideCharTable[][2] = { /* list in ascending order */
{ 0x1100, 0x115F },
{ 0x231A, 0x231B },
{ 0x2329, 0x232A },
Expand Down Expand Up @@ -157,7 +157,7 @@ static unsigned long wideCharTable[][2] = {

static size_t wideCharTableSize = sizeof(wideCharTable) / sizeof(wideCharTable[0]);

static unsigned long combiningCharTable[] = {
static unsigned long combiningCharTable[] = { /* list in ascending order */
0x0300,0x0301,0x0302,0x0303,0x0304,0x0305,0x0306,0x0307,
0x0308,0x0309,0x030A,0x030B,0x030C,0x030D,0x030E,0x030F,
0x0310,0x0311,0x0312,0x0313,0x0314,0x0315,0x0316,0x0317,
Expand Down Expand Up @@ -378,17 +378,28 @@ static unsigned long combiningCharTableSize = sizeof(combiningCharTable) / sizeo
*/
static int isWideChar(unsigned long cp) {
    size_t i;

    /* Returns 1 when cp lies inside one of the [first, last] ranges of
     * wideCharTable, 0 otherwise. The ranges are listed in ascending
     * order, which is what makes the early exit below valid. */
    for (i = 0; i < wideCharTableSize; i++) {
        /* Once a range starts above the codepoint we're testing, every
         * remaining range does too => bail early. */
        if (wideCharTable[i][0] > cp) return 0;

        /* The range starts at or below cp (checked just above), so only
         * the upper bound is left to test. */
        if (cp <= wideCharTable[i][1]) return 1;
    }
    return 0;
}

/* Check if the code is a combining character.
 *
 * Returns 1 when cp is equal to an entry of combiningCharTable and 0
 * otherwise. The table is listed in ascending order, so the scan can stop
 * at the first entry greater than cp.
 */
static int isCombiningChar(unsigned long cp) {
    size_t i;

    for (i = 0; i < combiningCharTableSize; i++) {
        /* Passed the codepoint of interest: it cannot appear later in an
         * ascending table, so it's not a combining char. */
        if (combiningCharTable[i] > cp) return 0;
        if (combiningCharTable[i] == cp) return 1;
    }
    return 0;
}

Expand Down

0 comments on commit 7866206

Please sign in to comment.