From de163881bea4d63771d802433c0746c2734d356f Mon Sep 17 00:00:00 2001 From: "Paul J. Davis" Date: Mon, 31 Oct 2011 16:07:39 -0500 Subject: [PATCH] Enforce Unicode constraints more strictly It was possible to pass some types of invalid UTF-8 through Jiffy's encoder. Specifically, if uescaping isn't used, values that would decode from 0xD800 to 0xDFFF, 0xFFFE, 0xFFFF, and values greater than 0x10FFFF would not be flagged as invalid. Now they are. --- c_src/utf8.c | 16 ++++++++++++++++ test/004-strings.t | 7 ++++++- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/c_src/utf8.c b/c_src/utf8.c index 198b3a8..230f631 100644 --- a/c_src/utf8.c +++ b/c_src/utf8.c @@ -132,6 +132,22 @@ utf8_validate(unsigned char* data, size_t size) if((data[0] & 0x07) + (data[1] & 0x30) == 0) return -1; } + + // Lastly we need to check some miscellaneous ranges for + // some of the larger code point values. + if(ulen >= 3) { + ui = utf8_to_unicode(data, ulen); + if(ui < 0) { + return -1; + } else if(ui >= 0xD800 && ui <= 0xDFFF) { + return -1; + } else if(ui == 0xFFFE || ui == 0xFFFF) { + return -1; + } else if(ui > 0x10FFFF) { + return -1; + } + } + return ulen; } diff --git a/test/004-strings.t b/test/004-strings.t index 6a69586..00d6d77 100755 --- a/test/004-strings.t +++ b/test/004-strings.t @@ -6,7 +6,7 @@ main([]) -> code:add_pathz("ebin"), code:add_pathz("test"), - etap:plan(78), + etap:plan(80), util:test_good(good()), util:test_good(uescaped(), [uescape]), util:test_errors(errors()), @@ -45,6 +45,7 @@ errors() -> <<"\"", 0, "\"">>, <<"\"\\g\"">>, <<"\"\\uFFFF\"">>, + <<"\"\\uFFFE\"">>, <<"\"\\uD834foo\\uDD1E\"">>, % CouchDB-345 <<34,78,69,73,77,69,78,32,70,216,82,82,32,70,65,69,78,33,34>> @@ -71,6 +72,10 @@ utf8_cases() -> % Stray continuation byte <<16#C2, 16#81, 16#80>>, <<"foo", 16#80, "bar">>, + + % Invalid Unicode code points + <<239, 191, 190>>, + <<237, 160, 129>>, % Not enough extension bytes <<16#C0>>,