Noncharacters U+FFFF and U+FFFE are not invalid.

Decode them properly and allow them to be encoded. This was clarified by the Unicode Technical Committee in Corrigendum #9 (Clarification About Noncharacters): http://www.unicode.org/versions/corrigendum9.html
11 years ago · 1e95a84b0c
--- a/c_src/utf8.c
+++ b/c_src/utf8.c
@ -62,7 +62,7 @@ utf8_len(int c)
    } else if(c < 0x800) {
        return 2;
    } else if(c < 0x10000) {
        if(c < 0xD800 || (c > 0xDFFF && c < 0xFFFE)) {
        if(c < 0xD800 || (c > 0xDFFF)) {
            return 3;
        } else {
            return -1;
@ -141,8 +141,6 @@ utf8_validate(unsigned char* data, size_t size)
            return -1;
        } else if(ui >= 0xD800 && ui <= 0xDFFF) {
            return -1;
        } else if(ui == 0xFFFE || ui == 0xFFFF) {
            return -1;
        } else if(ui > 0x10FFFF) {
            return -1;
        }
@ -193,7 +191,7 @@ unicode_to_utf8(int c, unsigned char* buf)
        buf[1] = (unsigned char) 0x80 + (c & 0x3F);
        return 2;
    } else if(c < 0x10000) {
        if(c < 0xD800 || (c > 0xDFFF && c < 0xFFFE)) {
        if(c < 0xD800 || (c > 0xDFFF)) {
            buf[0] = (unsigned char) 0xE0 + (c >> 12);
            buf[1] = (unsigned char) 0x80 + ((c >> 6) & 0x3F);
            buf[2] = (unsigned char) 0x80 + (c & 0x3F);
--- a/test/004-strings.t
+++ b/test/004-strings.t
@ -6,7 +6,7 @@ main([]) ->
    code:add_pathz("ebin"),
    code:add_pathz("test"),

    etap:plan(119),
    etap:plan(115),
    util:test_good(good()),
    util:test_good(uescaped(), [uescape]),
    util:test_errors(errors()),
@ -29,7 +29,9 @@ good() ->
            <<"\"\\uD834\\uDD1E\"">>,
            <<240, 157, 132, 158>>,
            <<34, 240, 157, 132, 158, 34>>
        }
        },
        {<<"\"\\uFFFF\"">>, <<239,191,191>>, <<34,239,191,191,34>>},
        {<<"\"\\uFFFE\"">>, <<239,191,190>>, <<34,239,191,190,34>>}
    ].

 uescaped() ->
@ -54,8 +56,6 @@ errors() ->
        <<"\"foo">>,
        <<"\"", 0, "\"">>,
        <<"\"\\g\"">>,
        <<"\"\\uFFFF\"">>,
        <<"\"\\uFFFE\"">>,
        <<"\"\\uD834foo\\uDD1E\"">>,
        % CouchDB-345
        <<34,78,69,73,77,69,78,32,70,216,82,82,32,70,65,69,78,33,34>>
@ -88,10 +88,6 @@ utf8_cases() ->
        {<<16#C2, 16#81, 16#80>>, <<16#C2, 16#81, 16#EF, 16#BF, 16#BD>>},
        {<<"foo", 16#80, "bar">>, <<"foo", 16#EF, 16#BF, 16#BD, "bar">>},

        % Invalid Unicode code points
        {<<239, 191, 190>>, <<16#EF, 16#BF, 16#BD>>},
        {<<237, 160, 129>>, <<16#EF, 16#BF, 16#BD>>},

        % Not enough extension bytes
        {<<16#C0>>, <<16#EF, 16#BF, 16#BD>>},