From 706527d100ed8e52068ad4169f5e0a78ed15be3b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?John=20H=C3=B6gberg?= <john@erlang.org>
Date: Tue, 9 Apr 2019 08:08:23 +0200
Subject: [PATCH] Skip erroneous UTF-8 validation for atoms

We requested atoms in latin1 and then handled them as utf-8,
erroring out on some valid atoms and performing pointless
validation on others.
---
 c_src/encoder.c                | 234 ++++++++++++++++++++-------------
 test/jiffy_04_string_tests.erl |   6 +
 2 files changed, 150 insertions(+), 90 deletions(-)

diff --git a/c_src/encoder.c b/c_src/encoder.c
index 3c2874e..fdae2d1 100644
--- a/c_src/encoder.c
+++ b/c_src/encoder.c
@@ -241,36 +241,127 @@ enc_literal(Encoder* e, const char* literal, size_t len)
 }
 
 static inline int
-enc_string(Encoder* e, ERL_NIF_TERM val)
+enc_special_character(Encoder* e, int val) {
+    /* Emits the JSON escape for val into the output at index e->i and
+     * returns 1, or returns 0 with nothing written when val needs no
+     * escaping. No enc_ensure call is made here. */
+    char letter;
+
+    switch(val) {
+        case '\b': letter = 'b'; break;
+        case '\f': letter = 'f'; break;
+        case '\n': letter = 'n'; break;
+        case '\r': letter = 'r'; break;
+        case '\t': letter = 't'; break;
+
+        case '\"':
+        case '\\':
+            e->p[e->i++] = '\\';
+            e->u[e->i++] = val;
+            return 1;
+
+        case '/':
+            /* The backslash before '/' depends on the encoder option. */
+            if(e->escape_forward_slashes) {
+                e->p[e->i++] = '\\';
+            }
+            e->u[e->i++] = '/';
+            return 1;
+
+        default:
+            if(val >= 0x20) {
+                return 0;
+            }
+
+            /* Every other value below 0x20 goes through unicode_uescape. */
+            e->i += unicode_uescape(val, &(e->p[e->i]));
+            return 1;
+    }
+
+    /* Two-character escapes: a backslash followed by the letter. */
+    e->p[e->i++] = '\\';
+    e->p[e->i++] = letter;
+    return 1;
+}
+
+/* Encode atom `val` as a JSON string. Atoms are read as latin1, where every
+ * byte is a valid codepoint, so no UTF-8 validation is done. Returns 1 on
+ * success, 0 if the atom cannot be read or enc_ensure fails. */
+static int
+enc_atom(Encoder* e, ERL_NIF_TERM val)
 {
     static const int MAX_ESCAPE_LEN = 12;
-    ErlNifBinary bin;
     char atom[512];
 
     unsigned char* data;
     size_t size;
+    size_t i;
+    int written;
+
+    /* enif_get_atom returns the bytes written including the terminating
+     * NUL (0 on failure). Taking the length from it rather than strlen()
+     * keeps atoms with an embedded NUL character from being truncated. */
+    written = enif_get_atom(e->env, val, atom, sizeof(atom), ERL_NIF_LATIN1);
+    if(written <= 0) {
+        return 0;
+    }
+
+    data = (unsigned char*) atom;
+    size = (size_t) written - 1;
+
+    /* Reserve space for the first quotation mark and most of the output. */
+    if(!enc_ensure(e, size + MAX_ESCAPE_LEN + 1)) {
+        return 0;
+    }
+
+    e->p[e->i++] = '\"';
+
+    for(i = 0; i < size; i++) {
+        if(!enc_ensure(e, MAX_ESCAPE_LEN)) {
+            return 0;
+        }
+
+        if(enc_special_character(e, data[i])) {
+            continue; /* escape sequence already written */
+        } else if(data[i] < 0x80) {
+            e->u[e->i++] = data[i];
+        } else if(e->uescape) {
+            e->i += unicode_uescape(data[i], &e->p[e->i]);
+        } else {
+            e->i += unicode_to_utf8(data[i], &e->u[e->i]);
+        }
+    }
+
+    if(!enc_ensure(e, 1)) {
+        return 0;
+    }
+
+    e->p[e->i++] = '\"';
+    e->count++;
 
+    return 1;
+}
+
+static int
+enc_string(Encoder* e, ERL_NIF_TERM val)
+{
+    static const int MAX_ESCAPE_LEN = 12;
+    ErlNifBinary bin;
+
+    unsigned char* data;
+    size_t size;
     int esc_len;
     int ulen;
     int uval;
     int i;
 
-    if(enif_is_binary(e->env, val)) {
-        if(!enif_inspect_binary(e->env, val, &bin)) {
-            return 0;
-        }
-        data = bin.data;
-        size = bin.size;
-    } else if(enif_is_atom(e->env, val)) {
-        if(!enif_get_atom(e->env, val, atom, 512, ERL_NIF_LATIN1)) {
-            return 0;
-        }
-        data = (unsigned char*) atom;
-        size = strlen(atom);
-    } else {
+    if(!enif_inspect_binary(e->env, val, &bin)) {
         return 0;
     }
 
+    data = bin.data;
+    size = bin.size;
+
     /* Reserve space for the first quotation mark and most of the output. */
     if(!enc_ensure(e, size + MAX_ESCAPE_LEN + 1)) {
         return 0;
@@ -284,80 +375,33 @@ enc_string(Encoder* e, ERL_NIF_TERM val)
             return 0;
         }
 
-        switch((char) data[i]) {
-            case '\"':
-            case '\\':
-                e->p[e->i++] = '\\';
-                e->u[e->i++] = data[i];
-                i++;
-                continue;
-            case '\b':
-                e->p[e->i++] = '\\';
-                e->p[e->i++] = 'b';
-                i++;
-                continue;
-            case '\f':
-                e->p[e->i++] = '\\';
-                e->p[e->i++] = 'f';
-                i++;
-                continue;
-            case '\n':
-                e->p[e->i++] = '\\';
-                e->p[e->i++] = 'n';
-                i++;
-                continue;
-            case '\r':
-                e->p[e->i++] = '\\';
-                e->p[e->i++] = 'r';
-                i++;
-                continue;
-            case '\t':
-                e->p[e->i++] = '\\';
-                e->p[e->i++] = 't';
-                i++;
-                continue;
-            case '/':
-                if(e->escape_forward_slashes) {
-                    e->p[e->i++] = '\\';
+        if(enc_special_character(e, data[i])) {
+            i++;
+        } else if(data[i] < 0x80) {
+            e->u[e->i++] = data[i++];
+        } else if(data[i] >= 0x80) {
+            ulen = utf8_validate(&(data[i]), size - i);
+
+            if (ulen < 0) {
+                return 0;
+            } else if (e->uescape) {
+                uval = utf8_to_unicode(&(data[i]), size-i);
+                if(uval < 0) {
+                    return 0;
                 }
-                e->u[e->i++] = '/';
-                i++;
-                continue;
-            default:
-                if(data[i] < 0x20) {
-                    ulen = unicode_uescape(data[i], &(e->p[e->i]));
-                    if(ulen < 0) {
-                        return 0;
-                    }
-
-                    e->i += ulen;
-                    i++;
-                } else if(data[i] & 0x80) {
-                    ulen = utf8_validate(&(data[i]), size - i);
-
-                    if (ulen < 0) {
-                        return 0;
-                    } else if (e->uescape) {
-                        uval = utf8_to_unicode(&(data[i]), size-i);
-                        if(uval < 0) {
-                            return 0;
-                        }
-
-                        esc_len = unicode_uescape(uval, &(e->p[e->i]));
-                        if(esc_len < 0) {
-                            return 0;
-                        }
-
-                        e->i += esc_len;
-                    } else {
-                        memcpy(&e->u[e->i], &data[i], ulen);
-                        e->i += ulen;
-                    }
-
-                    i += ulen;
-                } else {
-                    e->u[e->i++] = data[i++];
+
+                esc_len = unicode_uescape(uval, &(e->p[e->i]));
+                if(esc_len < 0) {
+                    return 0;
                 }
+
+                e->i += esc_len;
+            } else {
+                memcpy(&e->u[e->i], &data[i], ulen);
+                e->i += ulen;
+            }
+
+            i += ulen;
         }
     }
 
@@ -371,6 +415,16 @@ enc_string(Encoder* e, ERL_NIF_TERM val)
     return 1;
 }
 
+/* Object member keys may be atoms or binaries: atoms are routed to
+ * enc_atom, anything else to enc_string. Returns the callee's result. */
+static inline int
+enc_object_key(ErlNifEnv *env, Encoder* e, ERL_NIF_TERM val)
+{
+    int key_is_atom = enif_is_atom(env, val);
+
+    return key_is_atom ? enc_atom(e, val) : enc_string(e, val);
+}
+
 // From https://www.slideshare.net/andreialexandrescu1/three-optimization-tips-for-c-15708507
 
 #define P01 10
@@ -746,7 +800,7 @@ encode_iter(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
                 ret = enc_error(e, "internal_error");
                 goto done;
             }
-            if(!enc_string(e, tuple[0])) {
+            if(!enc_object_key(env, e, tuple[0])) {
                 ret = enc_obj_error(e, "invalid_object_member_key", tuple[0]);
                 goto done;
             }
@@ -802,7 +856,7 @@ encode_iter(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
                 goto done;
             }
         } else if(enif_is_atom(env, curr)) {
-            if(!enc_string(e, curr)) {
+            if(!enc_atom(e, curr)) {
                 ret = enc_obj_error(e, "invalid_string", curr);
                 goto done;
             }
@@ -844,7 +898,7 @@ encode_iter(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
                 ret = enc_obj_error(e, "invalid_object_member_arity", item);
                 goto done;
             }
-            if(!enc_string(e, tuple[0])) {
+            if(!enc_object_key(env, e, tuple[0])) {
                 ret = enc_obj_error(e, "invalid_object_member_key", tuple[0]);
                 goto done;
             }
diff --git a/test/jiffy_04_string_tests.erl b/test/jiffy_04_string_tests.erl
index 38a0207..9617d85 100644
--- a/test/jiffy_04_string_tests.erl
+++ b/test/jiffy_04_string_tests.erl
@@ -8,6 +8,12 @@
 -include("jiffy_util.hrl").
 
 
+latin1_atom_test_() ->
+    %% 228 is `ä` in latin1; its UTF-8 encoding is the bytes 195, 164.
+    AtomKey = binary_to_atom(<<228>>, latin1),
+    ?_assertEqual(<<"{\"", 195, 164, "\":\"bar\"}">>, enc(#{AtomKey => <<"bar">>})).
+
+
 string_success_test_() ->
     [gen(ok, Case) || Case <- cases(ok)].