From 706527d100ed8e52068ad4169f5e0a78ed15be3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?John=20H=C3=B6gberg?= Date: Tue, 9 Apr 2019 08:08:23 +0200 Subject: [PATCH] Skip erroneous UTF-8 validation for atoms We requested atoms in latin1 and then handled them as utf-8, erroring out on some valid atoms and performing pointless validation on others. --- c_src/encoder.c | 234 ++++++++++++++++++++------------- test/jiffy_04_string_tests.erl | 6 + 2 files changed, 150 insertions(+), 90 deletions(-) diff --git a/c_src/encoder.c b/c_src/encoder.c index 3c2874e..fdae2d1 100644 --- a/c_src/encoder.c +++ b/c_src/encoder.c @@ -241,36 +241,127 @@ enc_literal(Encoder* e, const char* literal, size_t len) } static inline int -enc_string(Encoder* e, ERL_NIF_TERM val) +enc_special_character(Encoder* e, int val) { + switch(val) { + case '\"': + case '\\': + e->p[e->i++] = '\\'; + e->u[e->i++] = val; + return 1; + case '\b': + e->p[e->i++] = '\\'; + e->p[e->i++] = 'b'; + return 1; + case '\f': + e->p[e->i++] = '\\'; + e->p[e->i++] = 'f'; + return 1; + case '\n': + e->p[e->i++] = '\\'; + e->p[e->i++] = 'n'; + return 1; + case '\r': + e->p[e->i++] = '\\'; + e->p[e->i++] = 'r'; + return 1; + case '\t': + e->p[e->i++] = '\\'; + e->p[e->i++] = 't'; + return 1; + case '/': + if(e->escape_forward_slashes) { + e->p[e->i++] = '\\'; + } + e->u[e->i++] = '/'; + return 1; + default: + if(val < 0x20) { + e->i += unicode_uescape(val, &(e->p[e->i])); + return 1; + } + + return 0; + } +} + +static int +enc_atom(Encoder* e, ERL_NIF_TERM val) { static const int MAX_ESCAPE_LEN = 12; - ErlNifBinary bin; char atom[512]; unsigned char* data; size_t size; + int i; + + if(!enif_get_atom(e->env, val, atom, 512, ERL_NIF_LATIN1)) { + return 0; + } + + data = (unsigned char*) atom; + size = strlen(atom); + + /* Reserve space for the first quotation mark and most of the output. 
*/ + if(!enc_ensure(e, size + MAX_ESCAPE_LEN + 1)) { + return 0; + } + + e->p[e->i++] = '\"'; + + i = 0; + while(i < size) { + if(!enc_ensure(e, MAX_ESCAPE_LEN)) { + return 0; + } + + if(enc_special_character(e, data[i])) { + i++; + } else if(data[i] < 0x80) { + e->u[e->i++] = data[i]; + i++; + } else if(data[i] >= 0x80) { + /* The atom encoding is latin1, so we don't need validation + * as all latin1 characters are valid Unicode codepoints. */ + if (!e->uescape) { + e->i += unicode_to_utf8(data[i], &e->u[e->i]); + } else { + e->i += unicode_uescape(data[i], &e->p[e->i]); + } + + i++; + } + } + + if(!enc_ensure(e, 1)) { + return 0; + } + + e->p[e->i++] = '\"'; + e->count++; + return 1; +} + +static int +enc_string(Encoder* e, ERL_NIF_TERM val) +{ + static const int MAX_ESCAPE_LEN = 12; + ErlNifBinary bin; + + unsigned char* data; + size_t size; int esc_len; int ulen; int uval; int i; - if(enif_is_binary(e->env, val)) { - if(!enif_inspect_binary(e->env, val, &bin)) { - return 0; - } - data = bin.data; - size = bin.size; - } else if(enif_is_atom(e->env, val)) { - if(!enif_get_atom(e->env, val, atom, 512, ERL_NIF_LATIN1)) { - return 0; - } - data = (unsigned char*) atom; - size = strlen(atom); - } else { + if(!enif_inspect_binary(e->env, val, &bin)) { return 0; } + data = bin.data; + size = bin.size; + /* Reserve space for the first quotation mark and most of the output. 
*/ if(!enc_ensure(e, size + MAX_ESCAPE_LEN + 1)) { return 0; @@ -284,80 +375,33 @@ enc_string(Encoder* e, ERL_NIF_TERM val) return 0; } - switch((char) data[i]) { - case '\"': - case '\\': - e->p[e->i++] = '\\'; - e->u[e->i++] = data[i]; - i++; - continue; - case '\b': - e->p[e->i++] = '\\'; - e->p[e->i++] = 'b'; - i++; - continue; - case '\f': - e->p[e->i++] = '\\'; - e->p[e->i++] = 'f'; - i++; - continue; - case '\n': - e->p[e->i++] = '\\'; - e->p[e->i++] = 'n'; - i++; - continue; - case '\r': - e->p[e->i++] = '\\'; - e->p[e->i++] = 'r'; - i++; - continue; - case '\t': - e->p[e->i++] = '\\'; - e->p[e->i++] = 't'; - i++; - continue; - case '/': - if(e->escape_forward_slashes) { - e->p[e->i++] = '\\'; + if(enc_special_character(e, data[i])) { + i++; + } else if(data[i] < 0x80) { + e->u[e->i++] = data[i++]; + } else if(data[i] >= 0x80) { + ulen = utf8_validate(&(data[i]), size - i); + + if (ulen < 0) { + return 0; + } else if (e->uescape) { + uval = utf8_to_unicode(&(data[i]), size-i); + if(uval < 0) { + return 0; } - e->u[e->i++] = '/'; - i++; - continue; - default: - if(data[i] < 0x20) { - ulen = unicode_uescape(data[i], &(e->p[e->i])); - if(ulen < 0) { - return 0; - } - - e->i += ulen; - i++; - } else if(data[i] & 0x80) { - ulen = utf8_validate(&(data[i]), size - i); - - if (ulen < 0) { - return 0; - } else if (e->uescape) { - uval = utf8_to_unicode(&(data[i]), size-i); - if(uval < 0) { - return 0; - } - - esc_len = unicode_uescape(uval, &(e->p[e->i])); - if(esc_len < 0) { - return 0; - } - - e->i += esc_len; - } else { - memcpy(&e->u[e->i], &data[i], ulen); - e->i += ulen; - } - - i += ulen; - } else { - e->u[e->i++] = data[i++]; + + esc_len = unicode_uescape(uval, &(e->p[e->i])); + if(esc_len < 0) { + return 0; } + + e->i += esc_len; + } else { + memcpy(&e->u[e->i], &data[i], ulen); + e->i += ulen; + } + + i += ulen; } } @@ -371,6 +415,16 @@ enc_string(Encoder* e, ERL_NIF_TERM val) return 1; } +static inline int +enc_object_key(ErlNifEnv *env, Encoder* e, 
ERL_NIF_TERM val) +{ + if(enif_is_atom(env, val)) { + return enc_atom(e, val); + } + + return enc_string(e, val); +} + // From https://www.slideshare.net/andreialexandrescu1/three-optimization-tips-for-c-15708507 #define P01 10 @@ -746,7 +800,7 @@ encode_iter(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]) ret = enc_error(e, "internal_error"); goto done; } - if(!enc_string(e, tuple[0])) { + if(!enc_object_key(env, e, tuple[0])) { ret = enc_obj_error(e, "invalid_object_member_key", tuple[0]); goto done; } @@ -802,7 +856,7 @@ encode_iter(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]) goto done; } } else if(enif_is_atom(env, curr)) { - if(!enc_string(e, curr)) { + if(!enc_atom(e, curr)) { ret = enc_obj_error(e, "invalid_string", curr); goto done; } @@ -844,7 +898,7 @@ encode_iter(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]) ret = enc_obj_error(e, "invalid_object_member_arity", item); goto done; } - if(!enc_string(e, tuple[0])) { + if(!enc_object_key(env, e, tuple[0])) { ret = enc_obj_error(e, "invalid_object_member_key", tuple[0]); goto done; } diff --git a/test/jiffy_04_string_tests.erl b/test/jiffy_04_string_tests.erl index 38a0207..9617d85 100644 --- a/test/jiffy_04_string_tests.erl +++ b/test/jiffy_04_string_tests.erl @@ -8,6 +8,12 @@ -include("jiffy_util.hrl"). +latin1_atom_test_() -> + Key = binary_to_atom(<<228>>, latin1), %% `ä` + Expected = <<"{\"", 195, 164, "\":\"bar\"}">>, + ?_assertEqual(Expected, enc(#{ Key => <<"bar">> })). + + string_success_test_() -> [gen(ok, Case) || Case <- cases(ok)].