浏览代码

Add an option to ignore UTF-8 encoding errors

By default Jiffy is quite strict in what it encodes. By default it will
not allow invalid UTF-8 to be produced. This can cause issues when
attempting to encode JSON that was decoded by other libraries as UTF-8
semantics are not uniformly enforced.

This patch adds an option 'force_utf8' to the encoder. If encoding hits
an error for an invalid string it will forcefully mutate the object to
contain only valid UTF-8 and return the resulting encoded JSON.

For the most part this means it will strip any garbage data from
binaries, replacing it with the replacement codepoint U+FFFD. It will
also try to detect the common error of encoding surrogate pairs as
three-byte sequences and re-encode them into UTF-8 properly.
pull/29/head 0.4.4
Paul J. Davis 13 年前
父节点
当前提交
414827d604
共有 6 个文件被更改,包括 148 次插入31 次删除
  1. +2
    -0
      c_src/encoder.c
  2. +1
    -0
      c_src/jiffy.c
  3. +1
    -0
      c_src/jiffy.h
  4. +4
    -0
      src/jiffy.erl
  5. +104
    -0
      src/jiffy_utf8.erl
  6. +36
    -31
      test/004-strings.t

+ 2
- 0
c_src/encoder.c 查看文件

@ -81,6 +81,8 @@ enc_init(Encoder* e, ErlNifEnv* env, ERL_NIF_TERM opts, ErlNifBinary* bin)
e->uescape = 1;
} else if(enif_compare(val, e->atoms->atom_pretty) == 0) {
e->pretty = 1;
} else if(enif_compare(val, e->atoms->atom_force_utf8) == 0) {
// Ignore, handled in Erlang
} else {
return 0;
}

+ 1
- 0
c_src/jiffy.c 查看文件

@ -22,6 +22,7 @@ load(ErlNifEnv* env, void** priv, ERL_NIF_TERM info)
st->atom_partial = make_atom(env, "partial");
st->atom_uescape = make_atom(env, "uescape");
st->atom_pretty = make_atom(env, "pretty");
st->atom_force_utf8 = make_atom(env, "force_utf8");
// Markers used in encoding
st->ref_object = make_atom(env, "$object_ref$");

+ 1
- 0
c_src/jiffy.h 查看文件

@ -18,6 +18,7 @@ typedef struct {
ERL_NIF_TERM atom_partial;
ERL_NIF_TERM atom_uescape;
ERL_NIF_TERM atom_pretty;
ERL_NIF_TERM atom_force_utf8;
ERL_NIF_TERM ref_object;
ERL_NIF_TERM ref_array;

+ 4
- 0
src/jiffy.erl 查看文件

@ -25,7 +25,11 @@ encode(Data) ->
encode(Data, Options) ->
ForceUTF8 = lists:member(force_utf8, Options),
case nif_encode(Data, Options) of
{error, invalid_string} when ForceUTF8 == true ->
FixedData = jiffy_utf8:fix(Data),
encode(FixedData, Options -- [force_utf8]);
{error, _} = Error ->
throw(Error);
{partial, IOData} ->

+ 104
- 0
src/jiffy_utf8.erl 查看文件

@ -0,0 +1,104 @@
% This file is part of Jiffy released under the MIT license.
% See the LICENSE file for more information.
-module(jiffy_utf8).
-export([fix/1]).
% Recursively repair every binary found inside a term.
%
% Dispatch on the shape of the term:
%   - a one-element tuple {Props} is handed to fix_props/2,
%   - a list is handed to fix_array/2,
%   - a binary is handed to fix_bin/1,
%   - anything else is returned unchanged.
fix(Term) ->
    case Term of
        {Props} ->
            fix_props(Props, []);
        _ when is_list(Term) ->
            fix_array(Term, []);
        _ when is_binary(Term) ->
            fix_bin(Term);
        _ ->
            Term
    end.
% Apply fix/1 to both the key and the value of every {Key, Value} pair in
% Props and wrap the resulting list in a one-element tuple.
%
% Acc holds already-fixed pairs in reverse order; they end up in front of
% the newly fixed ones. Callers in this module pass [].
fix_props(Props, Acc) ->
    FixPair = fun({Key, Value}) -> {fix(Key), fix(Value)} end,
    {lists:reverse(Acc, lists:map(FixPair, Props))}.
% Apply fix/1 to every element of a list, preserving order.
%
% Acc0 holds already-fixed elements in reverse order; they end up in front
% of the newly fixed ones. Callers in this module pass [].
fix_array(Values, Acc0) ->
    lists:reverse(Acc0, lists:map(fun fix/1, Values)).
% Turn an arbitrary binary into a UTF-8 binary.
%
% Pipeline:
%   1. loose_decode/3    - lenient decode into a list of integers,
%   2. try_combining/2   - merge adjacent surrogates into one code point,
%   3. replace_garbage/2 - swap anything that is still not a Unicode
%                          character for U+FFFD,
%   4. re-encode the cleaned code points as UTF-8.
fix_bin(Bin) ->
    Loose = loose_decode(Bin, 0, []),
    Paired = try_combining(Loose, []),
    Clean = replace_garbage(Paired, []),
    Utf8Bytes = xmerl_ucs:to_utf8(Clean),
    list_to_binary(Utf8Bytes).
% Leniently decode Bin as UTF-8, starting at byte offset O, and return the
% decoded integers in input order.
%
% Only the bit prefixes of lead and continuation bytes are checked here;
% the decoded value itself is not validated (overlong forms, surrogates
% and out-of-range values pass through as plain integers). Any byte that
% does not start a structurally complete 1-4 byte sequence is replaced,
% together with the continuation bytes that directly follow it, by a
% single U+FFFD.
%
% Acc collects the decoded values in reverse order.
loose_decode(Bin, O, Acc) ->
    case Bin of
        <<_:O/binary>> ->
            % Offset reached the end of the input.
            lists:reverse(Acc);
        <<_:O/binary, 2#0:1, A:7, _/binary>> ->
            % 0xxxxxxx
            loose_decode(Bin, O + 1, [A | Acc]);
        <<_:O/binary, 2#110:3, A:5,
                      2#10:2, B:6, _/binary>> ->
            % 110xxxxx 10xxxxxx
            CodePoint = (A bsl 6) bor B,
            loose_decode(Bin, O + 2, [CodePoint | Acc]);
        <<_:O/binary, 2#1110:4, A:4,
                      2#10:2, B:6,
                      2#10:2, C:6, _/binary>> ->
            % 1110xxxx 10xxxxxx 10xxxxxx
            CodePoint = (A bsl 12) bor (B bsl 6) bor C,
            loose_decode(Bin, O + 3, [CodePoint | Acc]);
        <<_:O/binary, 2#11110:5, A:3,
                      2#10:2, B:6,
                      2#10:2, C:6,
                      2#10:2, D:6, _/binary>> ->
            % 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            CodePoint = (A bsl 18) bor (B bsl 12) bor (C bsl 6) bor D,
            loose_decode(Bin, O + 4, [CodePoint | Acc]);
        <<_:O/binary, _:8, Tail/binary>> ->
            % Broken lead or continuation byte. Discard the first
            % byte and all broken continuations, and replace the
            % whole mess with a replacement code point.
            Skipped = 1 + count_continuation_bytes(Tail, 0),
            loose_decode(Bin, O + Skipped, [16#FFFD | Acc])
    end.
% Count the run of UTF-8 continuation bytes (bit pattern 10xxxxxx) in
% Bytes beginning at byte offset Count.
%
% Returns the offset of the first byte that is not a continuation byte
% (or the offset just past the end of the binary). When called with an
% initial offset of 0 this is the length of the leading run.
count_continuation_bytes(Bytes, Count) ->
    case Bytes of
        <<_:Count/binary, Byte:8, _/binary>> when Byte band 16#C0 =:= 16#80 ->
            count_continuation_bytes(Bytes, Count + 1);
        _ ->
            Count
    end.
% Merge UTF-16 surrogate pairs that appear as two adjacent integers into
% the single code point they encode.
%
% A pair is combined only when a high surrogate (16#D800..16#DBFF) is
% directly followed by a low surrogate (16#DC00..16#DFFF). Every other
% value, including an unpaired surrogate, is copied through unchanged and
% left for a later stage to deal with.
%
% When two neighbouring surrogates do not form a valid pair, only the
% first one is consumed. The second one is looked at again on the next
% step, so that e.g. [16#D800, 16#D800, 16#DC00] still has its trailing
% valid pair combined.
%
% Acc collects the output in reverse order; the result is returned in
% input order.
try_combining([], Acc) ->
    lists:reverse(Acc);
try_combining([Hi, Lo | Rest], Acc) when Hi >= 16#D800, Hi =< 16#DBFF,
                                         Lo >= 16#DC00, Lo =< 16#DFFF ->
    % Standard UTF-16 decoding: 10 payload bits from each half, offset
    % by 16#10000.
    CodePoint = 16#10000 + ((Hi band 16#3FF) bsl 10) + (Lo band 16#3FF),
    try_combining(Rest, [CodePoint | Acc]);
try_combining([C | Rest], Acc) ->
    try_combining(Rest, [C | Acc]).
% Replace every element that xmerl_ucs:is_unicode/1 rejects with the
% replacement character U+FFFD; accepted elements are kept as they are.
%
% Acc holds already-processed elements in reverse order; the result is
% returned in input order with those elements first.
replace_garbage(CodePoints, Acc) ->
    Step = fun(C, Seen) ->
        case xmerl_ucs:is_unicode(C) of
            true -> [C | Seen];
            false -> [16#FFFD | Seen]
        end
    end,
    lists:reverse(lists:foldl(Step, Acc, CodePoints)).

+ 36
- 31
test/004-strings.t 查看文件

@ -6,7 +6,7 @@ main([]) ->
code:add_pathz("ebin"),
code:add_pathz("test"),
etap:plan(87),
etap:plan(116),
util:test_good(good()),
util:test_good(uescaped(), [uescape]),
util:test_errors(errors()),
@ -61,12 +61,17 @@ errors() ->
test_utf8([]) ->
ok;
test_utf8([Case | Rest]) ->
test_utf8([{Case, Fixed} | Rest]) ->
etap:fun_is(
fun({error, invalid_string}) -> true; (Else) -> Else end,
(catch jiffy:encode(Case)),
lists:flatten(io_lib:format("Invalid utf-8: ~p", [Case]))
),
etap:fun_is(
fun(Fixed) -> true; (Else) -> Else end,
jiffy:encode(Case, [force_utf8]),
lists:flatten(io_lib:format("Fixed correctly: ~p", [Fixed]))
),
Case2 = <<34, Case/binary, 34>>,
etap:fun_is(
fun({error, {_, invalid_string}}) -> true; (Else) -> Else end,
@ -78,47 +83,47 @@ test_utf8([Case | Rest]) ->
utf8_cases() ->
[
% Stray continuation byte
<<16#C2, 16#81, 16#80>>,
<<"foo", 16#80, "bar">>,
{<<16#C2, 16#81, 16#80>>, <<16#C2, 16#81, 16#EF, 16#BF, 16#BD>>},
{<<"foo", 16#80, "bar">>, <<"foo", 16#EF, 16#BF, 16#BD, "bar">>},
% Invalid Unicode code points
<<239, 191, 190>>,
<<237, 160, 129>>,
{<<239, 191, 190>>, <<16#EF, 16#BF, 16#BD>>},
{<<237, 160, 129>>, <<16#EF, 16#BF, 16#BD>>},
% Not enough extension bytes
<<16#C0>>,
{<<16#C0>>, <<16#EF, 16#BF, 16#BD>>},
<<16#E0>>,
<<16#E0, 16#80>>,
{<<16#E0>>, <<16#EF, 16#BF, 16#BD>>},
{<<16#E0, 16#80>>, <<16#EF, 16#BF, 16#BD>>},
<<16#F0>>,
<<16#F0, 16#80>>,
<<16#F0, 16#80, 16#80>>,
{<<16#F0>>, <<16#EF, 16#BF, 16#BD>>},
{<<16#F0, 16#80>>, <<16#EF, 16#BF, 16#BD>>},
{<<16#F0, 16#80, 16#80>>, <<16#EF, 16#BF, 16#BD>>},
<<16#F8>>,
<<16#F8, 16#80>>,
<<16#F8, 16#80, 16#80>>,
<<16#F8, 16#80, 16#80, 16#80>>,
{<<16#F8>>, <<16#EF, 16#BF, 16#BD>>},
{<<16#F8, 16#80>>, <<16#EF, 16#BF, 16#BD>>},
{<<16#F8, 16#80, 16#80>>, <<16#EF, 16#BF, 16#BD>>},
{<<16#F8, 16#80, 16#80, 16#80>>, <<16#EF, 16#BF, 16#BD>>},
<<16#FC>>,
<<16#FC, 16#80>>,
<<16#FC, 16#80, 16#80>>,
<<16#FC, 16#80, 16#80, 16#80>>,
<<16#FC, 16#80, 16#80, 16#80, 16#80>>,
{<<16#FC>>, <<16#EF, 16#BF, 16#BD>>},
{<<16#FC, 16#80>>, <<16#EF, 16#BF, 16#BD>>},
{<<16#FC, 16#80, 16#80>>, <<16#EF, 16#BF, 16#BD>>},
{<<16#FC, 16#80, 16#80, 16#80>>, <<16#EF, 16#BF, 16#BD>>},
{<<16#FC, 16#80, 16#80, 16#80, 16#80>>, <<16#EF, 16#BF, 16#BD>>},
% No data in high bits.
<<16#C0, 16#80>>,
<<16#C1, 16#80>>,
{<<16#C0, 16#80>>, <<"\"\\u0000\"">>},
{<<16#C1, 16#80>>, <<"\"\\u0000\"">>},
<<16#E0, 16#80, 16#80>>,
<<16#E0, 16#90, 16#80>>,
{<<16#E0, 16#80, 16#80>>, <<"\"\\u0000\"">>},
{<<16#E0, 16#90, 16#80>>, <<"\"\\u0000\"">>},
<<16#F0, 16#80, 16#80, 16#80>>,
<<16#F0, 16#88, 16#80, 16#80>>,
{<<16#F0, 16#80, 16#80, 16#80>>, <<"\"\\u0000\"">>},
{<<16#F0, 16#88, 16#80, 16#80>>, <<"\"\\u0000\"">>},
<<16#F8, 16#80, 16#80, 16#80, 16#80>>,
<<16#F8, 16#84, 16#80, 16#80, 16#80>>,
{<<16#F8, 16#80, 16#80, 16#80, 16#80>>, <<"\"\\u0000\"">>},
{<<16#F8, 16#84, 16#80, 16#80, 16#80>>, <<"\"\\u0000\"">>},
<<16#FC, 16#80, 16#80, 16#80, 16#80, 16#80>>,
<<16#FC, 16#82, 16#80, 16#80, 16#80, 16#80>>
{<<16#FC, 16#80, 16#80, 16#80, 16#80, 16#80>>, <<"\"\\u0000\"">>},
{<<16#FC, 16#82, 16#80, 16#80, 16#80, 16#80>>, <<"\"\\u0000\"">>}
].

正在加载...
取消
保存