diff --git a/c_src/utf8.c b/c_src/utf8.c index 198b3a8..230f631 100644 --- a/c_src/utf8.c +++ b/c_src/utf8.c @@ -132,6 +132,22 @@ utf8_validate(unsigned char* data, size_t size) if((data[0] & 0x07) + (data[1] & 0x30) == 0) return -1; } + + // Lastly we need to check some miscellaneous ranges for + // some of the larger code point values. + if(ulen >= 3) { + ui = utf8_to_unicode(data, ulen); + if(ui < 0) { + return -1; + } else if(ui >= 0xD800 && ui <= 0xDFFF) { + return -1; + } else if(ui == 0xFFFE || ui == 0xFFFF) { + return -1; + } else if(ui > 0x10FFFF) { + return -1; + } + } + return ulen; } diff --git a/test/004-strings.t b/test/004-strings.t index 6a69586..00d6d77 100755 --- a/test/004-strings.t +++ b/test/004-strings.t @@ -6,7 +6,7 @@ main([]) -> code:add_pathz("ebin"), code:add_pathz("test"), - etap:plan(78), + etap:plan(80), util:test_good(good()), util:test_good(uescaped(), [uescape]), util:test_errors(errors()), @@ -45,6 +45,7 @@ errors() -> <<"\"", 0, "\"">>, <<"\"\\g\"">>, <<"\"\\uFFFF\"">>, + <<"\"\\uFFFE\"">>, <<"\"\\uD834foo\\uDD1E\"">>, % CouchDB-345 <<34,78,69,73,77,69,78,32,70,216,82,82,32,70,65,69,78,33,34>> @@ -71,6 +72,10 @@ utf8_cases() -> % Stray continuation byte <<16#C2, 16#81, 16#80>>, <<"foo", 16#80, "bar">>, + + % Invalid Unicode code points + <<239, 191, 190>>, + <<237, 160, 129>>, % Not enough extension bytes <<16#C0>>,