From 1e95a84b0ce8d5a86e3431e2095f245272c88662 Mon Sep 17 00:00:00 2001 From: Paul Guyot Date: Tue, 22 Apr 2014 12:07:53 +0200 Subject: [PATCH] Noncharacters U+FFFF and U+FFFE are not invalid. Decode them properly and allow them to be encoded. This was clarified by the Unicode Technical Committee: http://www.unicode.org/versions/corrigendum9.html --- c_src/utf8.c | 6 ++---- test/004-strings.t | 12 ++++-------- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/c_src/utf8.c b/c_src/utf8.c index 3ac65cb..e251bc6 100644 --- a/c_src/utf8.c +++ b/c_src/utf8.c @@ -62,7 +62,7 @@ utf8_len(int c) } else if(c < 0x800) { return 2; } else if(c < 0x10000) { - if(c < 0xD800 || (c > 0xDFFF && c < 0xFFFE)) { + if(c < 0xD800 || (c > 0xDFFF)) { return 3; } else { return -1; @@ -141,8 +141,6 @@ utf8_validate(unsigned char* data, size_t size) return -1; } else if(ui >= 0xD800 && ui <= 0xDFFF) { return -1; - } else if(ui == 0xFFFE || ui == 0xFFFF) { - return -1; } else if(ui > 0x10FFFF) { return -1; } @@ -193,7 +191,7 @@ unicode_to_utf8(int c, unsigned char* buf) buf[1] = (unsigned char) 0x80 + (c & 0x3F); return 2; } else if(c < 0x10000) { - if(c < 0xD800 || (c > 0xDFFF && c < 0xFFFE)) { + if(c < 0xD800 || (c > 0xDFFF)) { buf[0] = (unsigned char) 0xE0 + (c >> 12); buf[1] = (unsigned char) 0x80 + ((c >> 6) & 0x3F); buf[2] = (unsigned char) 0x80 + (c & 0x3F); diff --git a/test/004-strings.t b/test/004-strings.t index d5e5161..d0214d5 100755 --- a/test/004-strings.t +++ b/test/004-strings.t @@ -6,7 +6,7 @@ main([]) -> code:add_pathz("ebin"), code:add_pathz("test"), - etap:plan(119), + etap:plan(115), util:test_good(good()), util:test_good(uescaped(), [uescape]), util:test_errors(errors()), @@ -29,7 +29,9 @@ good() -> <<"\"\\uD834\\uDD1E\"">>, <<240, 157, 132, 158>>, <<34, 240, 157, 132, 158, 34>> - } + }, + {<<"\"\\uFFFF\"">>, <<239,191,191>>, <<34,239,191,191,34>>}, + {<<"\"\\uFFFE\"">>, <<239,191,190>>, <<34,239,191,190,34>>} ]. 
uescaped() -> @@ -54,8 +56,6 @@ errors() -> <<"\"foo">>, <<"\"", 0, "\"">>, <<"\"\\g\"">>, - <<"\"\\uFFFF\"">>, - <<"\"\\uFFFE\"">>, <<"\"\\uD834foo\\uDD1E\"">>, % CouchDB-345 <<34,78,69,73,77,69,78,32,70,216,82,82,32,70,65,69,78,33,34>> @@ -88,10 +88,6 @@ utf8_cases() -> {<<16#C2, 16#81, 16#80>>, <<16#C2, 16#81, 16#EF, 16#BF, 16#BD>>}, {<<"foo", 16#80, "bar">>, <<"foo", 16#EF, 16#BF, 16#BD, "bar">>}, - % Invalid Unicode code points - {<<239, 191, 190>>, <<16#EF, 16#BF, 16#BD>>}, - {<<237, 160, 129>>, <<16#EF, 16#BF, 16#BD>>}, - % Not enough extension bytes {<<16#C0>>, <<16#EF, 16#BF, 16#BD>>},