// This file is part of Jiffy released under the MIT license.
|
|
// See the LICENSE file for more information.
|
|
#include "jiffy.h"
|
|
#include <stdio.h>
|
|
|
|
// Lookup table from a byte value to the hexadecimal digit it spells.
// '0'-'9' map to 0-9, 'A'-'F' and 'a'-'f' map to 10-15; every other
// byte maps to the sentinel 255. One row per 16 byte values.
static const unsigned char hexvals[256] = {
    /* 0x00 */ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    /* 0x10 */ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    /* 0x20 */ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    /* 0x30 */   0,   1,   2,   3,   4,   5,   6,   7,   8,   9, 255, 255, 255, 255, 255, 255,
    /* 0x40 */ 255,  10,  11,  12,  13,  14,  15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    /* 0x50 */ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    /* 0x60 */ 255,  10,  11,  12,  13,  14,  15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    /* 0x70 */ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    /* 0x80 */ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    /* 0x90 */ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    /* 0xA0 */ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    /* 0xB0 */ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    /* 0xC0 */ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    /* 0xD0 */ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    /* 0xE0 */ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    /* 0xF0 */ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
};
|
|
|
|
// Upper-case hexadecimal digit characters, indexed by nibble value
// (0-15). The array holds exactly 16 characters and no terminator.
static const char hexdigits[16] = {
    '0', '1', '2', '3', '4', '5', '6', '7',
    '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
};
|
|
|
|
int
|
|
int_from_hex(const unsigned char* p)
|
|
{
|
|
unsigned char* h = (unsigned char*) p;
|
|
int ret;
|
|
|
|
if(hexvals[*(h+0)] == 255) return -1;
|
|
if(hexvals[*(h+1)] == 255) return -1;
|
|
if(hexvals[*(h+2)] == 255) return -1;
|
|
if(hexvals[*(h+3)] == 255) return -1;
|
|
|
|
ret = (hexvals[*(h+0)] << 12)
|
|
+ (hexvals[*(h+1)] << 8)
|
|
+ (hexvals[*(h+2)] << 4)
|
|
+ (hexvals[*(h+3)] << 0);
|
|
|
|
return ret;
|
|
}
|
|
|
|
int
|
|
int_to_hex(int val, unsigned char* p)
|
|
{
|
|
if(val < 0 || val > 65535)
|
|
return -1;
|
|
|
|
p[0] = hexdigits[(val >> 12) & 0xF];
|
|
p[1] = hexdigits[(val >> 8) & 0xF];
|
|
p[2] = hexdigits[(val >> 4) & 0xF];
|
|
p[3] = hexdigits[val & 0xF];
|
|
|
|
return 1;
|
|
}
|
|
|
|
// Number of bytes needed to encode the code point c as UTF-8.
//
// Returns 1-4 for an encodable code point, or -1 when c is negative,
// lies in the UTF-16 surrogate range (0xD800-0xDFFF) or exceeds
// 0x10FFFF.
int
utf8_len(int c)
{
    // A negative value is not a code point; without this guard it
    // would be reported as a 1 byte sequence.
    if(c < 0) {
        return -1;
    }

    if(c < 0x80) {
        return 1;
    }

    if(c < 0x800) {
        return 2;
    }

    if(c < 0x10000) {
        // Surrogates have no UTF-8 encoding.
        if(c >= 0xD800 && c <= 0xDFFF) {
            return -1;
        }
        return 3;
    }

    if(c <= 0x10FFFF) {
        return 4;
    }

    return -1;
}
|
|
|
|
// Number of bytes needed to write the code point c as \uXXXX escapes.
//
// Returns 6 for values below 0x10000 (one escape), 12 for values up to
// 0x10FFFF (two escapes), and -1 when c is negative or above 0x10FFFF.
int
utf8_esc_len(int c)
{
    // Negative values cannot be escaped (unicode_uescape fails for
    // them), so do not report a length for them either.
    if(c < 0 || c > 0x10FFFF) {
        return -1;
    }

    return (c < 0x10000) ? 6 : 12;
}
|
|
|
|
// Validate the UTF-8 sequence that starts at data[0].
//
// data: bytes to inspect; size: number of readable bytes at data.
//
// Returns the length in bytes (1-4) of the sequence when it is well
// formed. Returns -1 when size is 0, the lead byte is not a valid
// lead byte, the sequence is longer than size, a continuation byte is
// malformed, the encoding is overlong, or the decoded value is a
// UTF-16 surrogate (0xD800-0xDFFF) or greater than 0x10FFFF.
int
utf8_validate(unsigned char* data, size_t size)
{
    int ulen = -1;
    int ui;

    // With no input there is no lead byte; data[0] must not be read.
    if(size == 0) {
        return -1;
    }

    // Derive the sequence length from the lead byte.
    if((data[0] & 0x80) == 0x00) {
        ulen = 1;
    } else if((data[0] & 0xE0) == 0xC0) {
        ulen = 2;
    } else if((data[0] & 0xF0) == 0xE0) {
        ulen = 3;
    } else if((data[0] & 0xF8) == 0xF0) {
        ulen = 4;
    }
    if(ulen < 0 || (size_t) ulen > size) {
        return -1;
    }

    // Check each continuation byte.
    for(ui = 1; ui < ulen; ui++) {
        if((data[ui] & 0xC0) != 0x80) return -1;
    }

    // A UTF-8 encoding must use as few bytes as possible, so
    // that, for example, 't' cannot be encoded in three bytes.
    // To check this all we need to ensure is that for each
    // of the following bit patterns there is at least
    // one 1 bit in any of the x's
    //  1: 0yyyyyyy
    //  2: 110xxxxy 10yyyyyy
    //  3: 1110xxxx 10xyyyyy 10yyyyyy
    //  4: 11110xxx 10xxyyyy 10yyyyyy 10yyyyyy

    // ulen == 1 passes by definition
    if(ulen == 2) {
        if((data[0] & 0x1E) == 0)
            return -1;
    } else if(ulen == 3) {
        if((data[0] & 0x0F) + (data[1] & 0x20) == 0)
            return -1;
    } else if(ulen == 4) {
        if((data[0] & 0x07) + (data[1] & 0x30) == 0)
            return -1;
    }

    // Lastly we need to check some miscellaneous ranges for
    // some of the larger code point values.
    if(ulen >= 3) {
        ui = utf8_to_unicode(data, ulen);
        if(ui < 0) {
            return -1;
        } else if(ui >= 0xD800 && ui <= 0xDFFF) {
            return -1;
        } else if(ui > 0x10FFFF) {
            return -1;
        }
    }

    return ulen;
}
|
|
|
|
// Decode the UTF-8 sequence starting at buf[0] into a code point.
//
// buf: encoded bytes; size: number of readable bytes at buf.
//
// Returns the decoded value, or -1 when size is 0, the lead byte is
// not a valid lead byte, the sequence needs more than size bytes, or a
// three byte sequence decodes to a surrogate (0xD800-0xDFFF).
//
// Only the lead byte is inspected: the marker bits of continuation
// bytes are masked off without being checked, overlong forms are not
// rejected, and four byte results are not range checked.
int
utf8_to_unicode(unsigned char* buf, size_t size)
{
    int ret;

    // Every branch below reads buf[0], so an empty buffer must be
    // rejected before looking at it.
    if(size == 0) {
        return -1;
    }

    if((buf[0] & 0x80) == 0x00) {
        // 0xxxxxxx
        ret = buf[0];
    } else if((buf[0] & 0xE0) == 0xC0 && size >= 2) {
        // 110xxxxy 10yyyyyy
        ret = ((buf[0] & 0x1F) << 6)
            | ((buf[1] & 0x3F));
    } else if((buf[0] & 0xF0) == 0xE0 && size >= 3) {
        // 1110xxxx 10xyyyyy 10yyyyyy
        ret = ((buf[0] & 0x0F) << 12)
            | ((buf[1] & 0x3F) << 6)
            | ((buf[2] & 0x3F));
        if(ret >= 0xD800 && ret <= 0xDFFF) {
            ret = -1;
        }
    } else if((buf[0] & 0xF8) == 0xF0 && size >= 4) {
        // 11110xxx 10xxyyyy 10yyyyyy 10yyyyyy
        ret = ((buf[0] & 0x07) << 18)
            | ((buf[1] & 0x3F) << 12)
            | ((buf[2] & 0x3F) << 6)
            | ((buf[3] & 0x3F));
    } else {
        ret = -1;
    }

    return ret;
}
|
|
|
|
// Encode the code point c as UTF-8 into buf.
//
// buf must have room for the encoded sequence (at most 4 bytes); no
// terminator is written.
//
// Returns the number of bytes written (1-4), or -1 without touching
// buf when c is negative, a surrogate (0xD800-0xDFFF) or greater than
// 0x10FFFF.
int
unicode_to_utf8(int c, unsigned char* buf)
{
    // A negative value is not a code point; without this guard it
    // would be truncated to a single byte and reported as success.
    if(c < 0) {
        return -1;
    }

    if(c < 0x80) {
        buf[0] = (unsigned char) c;
        return 1;
    } else if(c < 0x800) {
        buf[0] = (unsigned char) (0xC0 + (c >> 6));
        buf[1] = (unsigned char) (0x80 + (c & 0x3F));
        return 2;
    } else if(c < 0x10000) {
        // Surrogates have no UTF-8 encoding.
        if(c >= 0xD800 && c <= 0xDFFF) {
            return -1;
        }
        buf[0] = (unsigned char) (0xE0 + (c >> 12));
        buf[1] = (unsigned char) (0x80 + ((c >> 6) & 0x3F));
        buf[2] = (unsigned char) (0x80 + (c & 0x3F));
        return 3;
    } else if(c <= 0x10FFFF) {
        buf[0] = (unsigned char) (0xF0 + (c >> 18));
        buf[1] = (unsigned char) (0x80 + ((c >> 12) & 0x3F));
        buf[2] = (unsigned char) (0x80 + ((c >> 6) & 0x3F));
        buf[3] = (unsigned char) (0x80 + (c & 0x3F));
        return 4;
    }

    return -1;
}
|
|
|
|
// Combine a UTF-16 surrogate pair into the code point it represents.
//
// hi must be in 0xD800-0xDBFF and lo in 0xDC00-0xDFFF; otherwise -1
// is returned. On success the result is in 0x10000-0x10FFFF.
int
unicode_from_pair(int hi, int lo)
{
    int upper;
    int lower;

    if(!(hi >= 0xD800 && hi <= 0xDBFF)) {
        return -1;
    }
    if(!(lo >= 0xDC00 && lo <= 0xDFFF)) {
        return -1;
    }

    // Each surrogate carries 10 payload bits above its range base.
    upper = (hi - 0xD800) << 10;
    lower = lo - 0xDC00;

    return 0x10000 + upper + lower;
}
|
|
|
|
// Write val to p as one or two \uXXXX escape sequences.
//
// Values below 0x10000 produce a single 6 byte escape. Values from
// 0x10000 to 0x10FFFF are split into a surrogate pair and produce two
// escapes, 12 bytes in total. No terminator is written.
//
// Returns the number of bytes written (6 or 12), or -1 when val is
// above 0x10FFFF or int_to_hex rejects the value.
int
unicode_uescape(int val, unsigned char* p)
{
    int offset;
    int lead;
    int trail;

    if(val > 0x10FFFF) {
        return -1;
    }

    if(val < 0x10000) {
        p[0] = '\\';
        p[1] = 'u';
        return (int_to_hex(val, p + 2) < 0) ? -1 : 6;
    }

    // Split the 20 bit offset above 0x10000 into two 10 bit halves.
    offset = val - 0x10000;
    lead = 0xD800 | ((offset >> 10) & 0x03FF);
    trail = 0xDC00 | (offset & 0x03FF);

    p[0] = '\\';
    p[1] = 'u';
    if(int_to_hex(lead, p + 2) < 0) {
        return -1;
    }

    p[6] = '\\';
    p[7] = 'u';
    if(int_to_hex(trail, p + 8) < 0) {
        return -1;
    }

    return 12;
}
|
|
|