Browse Source

Noncharacters U+FFFF and U+FFFE are not invalid.

Decode them properly and allow them to be encoded.

This was clarified by the Unicode Technical Committee:
http://www.unicode.org/versions/corrigendum9.html
pull/57/merge 0.9.0
Paul Guyot 11 years ago
committed by Paul J. Davis
parent
commit
1e95a84b0c
2 changed files with 6 additions and 12 deletions
  1. +2
    -4
      c_src/utf8.c
  2. +4
    -8
      test/004-strings.t

+ 2
- 4
c_src/utf8.c View File

@ -62,7 +62,7 @@ utf8_len(int c)
} else if(c < 0x800) {
return 2;
} else if(c < 0x10000) {
if(c < 0xD800 || (c > 0xDFFF && c < 0xFFFE)) {
if(c < 0xD800 || (c > 0xDFFF)) {
return 3;
} else {
return -1;
@ -141,8 +141,6 @@ utf8_validate(unsigned char* data, size_t size)
return -1;
} else if(ui >= 0xD800 && ui <= 0xDFFF) {
return -1;
} else if(ui == 0xFFFE || ui == 0xFFFF) {
return -1;
} else if(ui > 0x10FFFF) {
return -1;
}
@ -193,7 +191,7 @@ unicode_to_utf8(int c, unsigned char* buf)
buf[1] = (unsigned char) 0x80 + (c & 0x3F);
return 2;
} else if(c < 0x10000) {
if(c < 0xD800 || (c > 0xDFFF && c < 0xFFFE)) {
if(c < 0xD800 || (c > 0xDFFF)) {
buf[0] = (unsigned char) 0xE0 + (c >> 12);
buf[1] = (unsigned char) 0x80 + ((c >> 6) & 0x3F);
buf[2] = (unsigned char) 0x80 + (c & 0x3F);

+ 4
- 8
test/004-strings.t View File

@ -6,7 +6,7 @@ main([]) ->
code:add_pathz("ebin"),
code:add_pathz("test"),
etap:plan(119),
etap:plan(115),
util:test_good(good()),
util:test_good(uescaped(), [uescape]),
util:test_errors(errors()),
@ -29,7 +29,9 @@ good() ->
<<"\"\\uD834\\uDD1E\"">>,
<<240, 157, 132, 158>>,
<<34, 240, 157, 132, 158, 34>>
}
},
{<<"\"\\uFFFF\"">>, <<239,191,191>>, <<34,239,191,191,34>>},
{<<"\"\\uFFFE\"">>, <<239,191,190>>, <<34,239,191,190,34>>}
].
uescaped() ->
@ -54,8 +56,6 @@ errors() ->
<<"\"foo">>,
<<"\"", 0, "\"">>,
<<"\"\\g\"">>,
<<"\"\\uFFFF\"">>,
<<"\"\\uFFFE\"">>,
<<"\"\\uD834foo\\uDD1E\"">>,
% CouchDB-345
<<34,78,69,73,77,69,78,32,70,216,82,82,32,70,65,69,78,33,34>>
@ -88,10 +88,6 @@ utf8_cases() ->
{<<16#C2, 16#81, 16#80>>, <<16#C2, 16#81, 16#EF, 16#BF, 16#BD>>},
{<<"foo", 16#80, "bar">>, <<"foo", 16#EF, 16#BF, 16#BD, "bar">>},
% Invalid Unicode code points
{<<239, 191, 190>>, <<16#EF, 16#BF, 16#BD>>},
{<<237, 160, 129>>, <<16#EF, 16#BF, 16#BD>>},
% Not enough extension bytes
{<<16#C0>>, <<16#EF, 16#BF, 16#BD>>},

Loading…
Cancel
Save