Quellcode durchsuchen

Add an encoder option to escape unicode.

The encoder can now return \u escaped unicode data instead of leaving
it as UTF-8 byte sequences. This is done like so:

    Eshell V5.8.3  (abort with ^G)
    1> jiffy:encode(<<240, 144, 129, 128>>, [uescape]).
    <<"\"\\uD800\\uDC40\"">>
pull/8/merge
Paul J. Davis vor 14 Jahren
Ursprung
Commit
2305ded365
7 geänderte Dateien mit 204 neuen und 120 gelöschten Zeilen
  1. +12
    -47
      c_src/decoder.c
  2. +47
    -41
      c_src/encoder.c
  3. +2
    -1
      c_src/jiffy.c
  4. +7
    -2
      c_src/jiffy.h
  5. +128
    -25
      c_src/utf8.c
  6. +7
    -3
      src/jiffy.erl
  7. +1
    -1
      test/cases/string_invalid_hex_char.erl

+ 12
- 47
c_src/decoder.c Datei anzeigen

@ -196,10 +196,10 @@ dec_string(Decoder* d, ERL_NIF_TERM* value)
return 0;
}
hi = int_from_hex(&(d->u[d->i]));
d->i += 4;
if(hi < 0) {
return 0;
}
d->i += 4;
if(hi >= 0xD800 && hi < 0xDC00) {
if(d->i + 6 >= d->len) {
return 0;
@ -213,7 +213,7 @@ dec_string(Decoder* d, ERL_NIF_TERM* value)
if(lo < 0) {
return 0;
}
hi = utf8_from_pair(hi, lo);
hi = unicode_from_pair(hi, lo);
if(hi < 0) {
return 0;
}
@ -234,52 +234,11 @@ dec_string(Decoder* d, ERL_NIF_TERM* value)
} else if(d->u[d->i] < 0x80) {
d->i++;
} else {
ulen = -1;
if((d->u[d->i] & 0xE0) == 0xC0) {
ulen = 1;
} else if((d->u[d->i] & 0xF0) == 0xE0) {
ulen = 2;
} else if((d->u[d->i] & 0xF8) == 0xF0) {
ulen = 3;
} else if((d->u[d->i] & 0xFC) == 0xF8) {
ulen = 4;
} else if((d->u[d->i] & 0xFE) == 0xFC) {
ulen = 5;
}
ulen = utf8_validate(&(d->u[d->i]), d->len - d->i);
if(ulen < 0) {
return 0;
}
if(d->i + ulen >= d->len) {
return 0;
}
for(ui = 0; ui < ulen; ui++) {
if((d->u[d->i+1+ui] & 0xC0) != 0x80) {
return 0;
}
}
// Wikipedia says I have to check that a UTF-8 encoding
// uses as few bits as possible. This means that we
// can't do things like encode 't' in three bytes.
// To check this all we need to ensure is that for each
// of the following bit patterns that there is at least
// one 1 bit in any of the x's
// 11: 110xxxxy 10yyyyyy
// 16: 1110xxxx 10xyyyyy 10yyyyyy
// 21: 11110xxx 10xxyyyy 10yyyyyy 10yyyyyy
// 26: 111110xx 10xxxyyy 10yyyyyy 10yyyyyy 10yyyyyy
// 31: 1111110x 10xxxxyy 10yyyyyy 10yyyyyy 10yyyyyy 10yyyyyy
if(ulen == 1) {
if((d->u[d->i] & 0x1E) == 0) return 0;
} else if(ulen == 2) {
if((d->u[d->i] & 0x0F) + (d->u[d->i+1] & 0x20) == 0) return 0;
} else if(ulen == 3) {
if((d->u[d->i] & 0x07) + (d->u[d->i+1] & 0x30) == 0) return 0;
} else if(ulen == 4) {
if((d->u[d->i] & 0x03) + (d->u[d->i+1] & 0x38) == 0) return 0;
} else if(ulen == 5) {
if((d->u[d->i] & 0x01) + (d->u[d->i+1] & 0x3C) == 0) return 0;
}
d->i += 1 + ulen;
d->i += ulen;
}
}
@ -336,14 +295,20 @@ parse:
case 'u':
ui++;
hi = int_from_hex(&(d->u[ui]));
if(hi < 0) {
return 0;
}
if(hi >= 0xD800 && hi < 0xDC00) {
lo = int_from_hex(&(d->u[ui+6]));
hi = utf8_from_pair(hi, lo);
if(lo < 0) {
return 0;
}
hi = unicode_from_pair(hi, lo);
ui += 10;
} else {
ui += 4;
}
hi = utf8_to_binary(hi, (unsigned char*) chrbuf+chrpos);
hi = unicode_to_utf8(hi, (unsigned char*) chrbuf+chrpos);
if(hi < 0) {
return 0;
}

+ 47
- 41
c_src/encoder.c Datei anzeigen

@ -14,6 +14,7 @@
typedef struct {
ErlNifEnv* env;
jiffy_st* atoms;
int uescape;
int count;
@ -28,12 +29,26 @@ typedef struct {
} Encoder;
int
enc_init(Encoder* e, ErlNifEnv* env, ErlNifBinary* bin)
enc_init(Encoder* e, ErlNifEnv* env, ERL_NIF_TERM opts, ErlNifBinary* bin)
{
ERL_NIF_TERM val;
e->env = env;
e->atoms = enif_priv_data(env);
e->uescape = 0;
e->count = 0;
if(!enif_is_list(env, opts)) {
return 0;
}
while(enif_get_list_cell(env, opts, &val, &opts)) {
if(enif_compare(val, e->atoms->atom_uescape) == 0) {
e->uescape = 1;
} else {
return 0;
}
}
e->iolen = 0;
e->iolist = enif_make_list(env, 0);
@ -183,7 +198,7 @@ enc_string(Encoder* e, ERL_NIF_TERM val)
int esc_extra = 0;
int ulen;
int ui;
int uval;
int i;
if(enif_is_binary(e->env, val)) {
@ -225,46 +240,21 @@ enc_string(Encoder* e, ERL_NIF_TERM val)
i++;
continue;
}
ulen = -1;
if((data[i] & 0xE0) == 0xC0) {
ulen = 1;
} else if((data[i] & 0xF0) == 0xE0) {
ulen = 2;
} else if((data[i] & 0xF8) == 0xF0) {
ulen = 3;
} else if((data[i] & 0xFC) == 0xF8) {
ulen = 4;
} else if((data[i] & 0xFE) == 0xFC) {
ulen = 5;
}
ulen = utf8_validate(&(data[i]), size - i);
if(ulen < 0) {
return 0;
}
if(i+1+ulen > size) {
return 0;
}
for(ui = 0; ui < ulen; ui++) {
if((data[i+1+ui] & 0xC0) != 0x80) {
if(e->uescape) {
uval = utf8_to_unicode(&(data[i]), ulen);
if(uval < 0) {
return 0;
}
}
if(ulen == 1) {
if((data[i] & 0x1E) == 0)
return 0;
} else if(ulen == 2) {
if((data[i] & 0x0F) + (data[i+1] & 0x20) == 0)
return 0;
} else if(ulen == 3) {
if((data[i] & 0x07) + (data[i+1] & 0x30) == 0)
return 0;
} else if(ulen == 4) {
if((data[i] & 0x03) + (data[i+1] & 0x38) == 0)
return 0;
} else if(ulen == 5) {
if((data[i] & 0x01) + (data[i+1] & 0x3C) == 0)
ulen = utf8_esc_len(uval);
if(ulen < 0) {
return 0;
}
}
i += 1 + ulen;
i += ulen;
}
}
@ -311,13 +301,29 @@ enc_string(Encoder* e, ERL_NIF_TERM val)
continue;
default:
if(data[i] < 0x20) {
e->p[e->i++] = '\\';
e->p[e->i++] = 'u';
if(!int_to_hex(data[i], &(e->p[e->i]))) {
ulen = unicode_uescape(data[i], &(e->p[e->i]));
if(ulen < 0) {
return 0;
}
e->i += 4;
e->i += ulen;
i++;
} else if((data[i] & 0x80) && e->uescape) {
uval = utf8_to_unicode(&(data[i]), size-i);
if(uval < 0) {
return 0;
}
ulen = unicode_uescape(uval, &(e->p[e->i]));
if(ulen < 0) {
return 0;
}
e->i += ulen;
ulen = utf8_len(uval);
if(ulen < 0) {
return 0;
}
i += ulen;
} else {
e->u[e->i++] = data[i++];
}
@ -424,11 +430,11 @@ encode(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
double dval;
long lval;
if(argc != 1) {
if(argc != 2) {
return enif_make_badarg(env);
}
if(!enc_init(e, env, &bin)) {
if(!enc_init(e, env, argv[1], &bin)) {
return enif_make_badarg(env);
}

+ 2
- 1
c_src/jiffy.c Datei anzeigen

@ -20,6 +20,7 @@ load(ErlNifEnv* env, void** priv, ERL_NIF_TERM info)
st->atom_bignum_e = make_atom(env, "bignum_e");
st->atom_bigdbl = make_atom(env, "bigdbl");
st->atom_partial = make_atom(env, "partial");
st->atom_uescape = make_atom(env, "uescape");
// Markers used in encoding
st->ref_object = make_atom(env, "$object_ref$");
@ -53,7 +54,7 @@ unload(ErlNifEnv* env, void* priv)
static ErlNifFunc funcs[] =
{
{"nif_decode", 1, decode},
{"nif_encode", 1, encode}
{"nif_encode", 2, encode}
};
ERL_NIF_INIT(jiffy, funcs, &load, &reload, &upgrade, &unload);

+ 7
- 2
c_src/jiffy.h Datei anzeigen

@ -16,6 +16,7 @@ typedef struct {
ERL_NIF_TERM atom_bignum_e;
ERL_NIF_TERM atom_bigdbl;
ERL_NIF_TERM atom_partial;
ERL_NIF_TERM atom_uescape;
ERL_NIF_TERM ref_object;
ERL_NIF_TERM ref_array;
@ -31,7 +32,11 @@ ERL_NIF_TERM encode(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]);
int int_from_hex(const unsigned char* p);
int int_to_hex(int val, char* p);
int utf8_len(int c);
int utf8_from_pair(int hi, int lo);
int utf8_to_binary(int c, unsigned char* buf);
int utf8_esc_len(int c);
int utf8_validate(unsigned char* data, size_t size);
int utf8_to_unicode(unsigned char* buf, size_t size);
int unicode_to_utf8(int c, unsigned char* buf);
int unicode_from_pair(int hi, int lo);
int unicode_uescape(int c, char* buf);
#endif // Included JIFFY_H

+ 128
- 25
c_src/utf8.c Datei anzeigen

@ -1,5 +1,7 @@
// This file is part of Jiffy released under the MIT license.
// See the LICENSE file for more information.
#include "jiffy.h"
#include <stdio.h>
static const char hexvals[256] = {
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
@ -42,7 +44,7 @@ int
int_to_hex(int val, char* p)
{
if(val < 0 || val > 65535)
return 0;
return -1;
p[0] = hexdigits[(val >> 12) & 0xF];
p[1] = hexdigits[(val >> 8) & 0xF];
@ -65,27 +67,107 @@ utf8_len(int c)
} else {
return -1;
}
} else if(c < 0x200000) {
} else if(c <= 0x10FFFF) {
return 4;
} else if(c < 0x4000000) {
return 5;
} else if(c < 0x80000000) {
} else {
return -1;
}
}
// Report how many bytes the JSON \u escape of code point c occupies.
//
// Returns 6 when c fits a single \uXXXX escape (0 <= c < 0x10000),
// 12 when c needs a surrogate pair escape (0x10000 <= c <= 0x10FFFF),
// and -1 when c is negative or above 0x10FFFF.
int
utf8_esc_len(int c)
{
    if(c < 0) {
        // Negative values are error markers (the conversion routines in
        // this file return -1 on failure), never real code points.
        return -1;
    } else if(c < 0x10000) {
        return 6;
    } else if(c <= 0x10FFFF) {
        return 12;
    } else {
        return -1;
    }
}
int
utf8_from_pair(int hi, int lo)
utf8_validate(unsigned char* data, size_t size)
{
if(hi < 0xD800 || hi >= 0xDC00) return -1;
if(lo < 0xDC00 || lo > 0xDFFF) return -1;
return ((hi & 0x3FF) << 10) + (lo & 0x3FF) + 0x10000;
int ulen = -1;
int ui;
if((data[0] & 0x80) == 0x00) {
ulen = 1;
} if((data[0] & 0xE0) == 0xC0) {
ulen = 2;
} else if((data[0] & 0xF0) == 0xE0) {
ulen = 3;
} else if((data[0] & 0xF8) == 0xF0) {
ulen = 4;
}
if(ulen < 0 || ulen > size) {
return -1;
}
// Check each continuation byte.
for(ui = 1; ui < ulen; ui++) {
if((data[ui] & 0xC0) != 0x80) return -1;
}
// Wikipedia says I have to check that a UTF-8 encoding
// uses as few bits as possible. This means that we
// can't do things like encode 't' in three bytes.
// To check this all we need to ensure is that for each
// of the following bit patterns that there is at least
// one 1 bit in any of the x's
// 1: 0yyyyyyy
// 2: 110xxxxy 10yyyyyy
// 3: 1110xxxx 10xyyyyy 10yyyyyy
// 4: 11110xxx 10xxyyyy 10yyyyyy 10yyyyyy
// ulen == 1 passes by definition
if(ulen == 2) {
if((data[0] & 0x1E) == 0)
return -1;
} else if(ulen == 3) {
if((data[0] & 0x0F) + (data[1] & 0x20) == 0)
return -1;
} else if(ulen == 4) {
if((data[0] & 0x07) + (data[1] & 0x30) == 0)
return -1;
}
return ulen;
}
// Decode the single UTF-8 sequence that starts at buf[0] into a code point.
//
// buf:  pointer to the lead byte of the sequence.
// size: number of readable bytes at buf; bytes after the sequence are
//       ignored.
//
// Returns the decoded code point, or -1 when:
//   - size is 0 or smaller than the length announced by the lead byte,
//   - the lead byte is not a valid 1-4 byte UTF-8 lead byte,
//   - a trailing byte is not a 10xxxxxx continuation byte,
//   - the sequence is an overlong encoding,
//   - the value is a UTF-16 surrogate (0xD800-0xDFFF) or above 0x10FFFF.
int
utf8_to_unicode(unsigned char* buf, size_t size)
{
    int need;   // total bytes in the sequence, lead byte included
    int min;    // smallest code point that legitimately needs `need` bytes
    int ret;
    int i;

    if(size == 0) {
        return -1;
    }

    if((buf[0] & 0x80) == 0x00) {
        // 0xxxxxxx
        return (int) buf[0];
    } else if((buf[0] & 0xE0) == 0xC0) {
        // 110xxxxy 10yyyyyy
        need = 2;
        min = 0x80;
        ret = buf[0] & 0x1F;
    } else if((buf[0] & 0xF0) == 0xE0) {
        // 1110xxxx 10xyyyyy 10yyyyyy
        need = 3;
        min = 0x800;
        ret = buf[0] & 0x0F;
    } else if((buf[0] & 0xF8) == 0xF0) {
        // 11110xxx 10xxyyyy 10yyyyyy 10yyyyyy
        need = 4;
        min = 0x10000;
        ret = buf[0] & 0x07;
    } else {
        return -1;
    }

    if(size < (size_t) need) {
        return -1;
    }

    // Fold in six payload bits per continuation byte, rejecting any byte
    // that does not carry the 10xxxxxx marker.
    for(i = 1; i < need; i++) {
        if((buf[i] & 0xC0) != 0x80) {
            return -1;
        }
        ret = (ret << 6) | (buf[i] & 0x3F);
    }

    if(ret < min) {
        // Overlong: the value would have fit in a shorter sequence.
        return -1;
    }
    if(ret >= 0xD800 && ret <= 0xDFFF) {
        // Surrogates are not valid scalar values in UTF-8.
        return -1;
    }
    if(ret > 0x10FFFF) {
        return -1;
    }

    return ret;
}
int
utf8_to_binary(int c, unsigned char* buf)
unicode_to_utf8(int c, unsigned char* buf)
{
if(c < 0x80) {
buf[0] = (unsigned char) c;
@ -103,27 +185,48 @@ utf8_to_binary(int c, unsigned char* buf)
} else {
return -1;
}
} else if(c < 0x200000) {
} else if(c < 0x10FFFF) {
buf[0] = (unsigned char) 0xF0 + (c >> 18);
buf[1] = (unsigned char) 0x80 + ((c >> 12) & 0x3F);
buf[2] = (unsigned char) 0x80 + ((c >> 6) & 0x3F);
buf[3] = (unsigned char) 0x80 + (c & 0x3F);
return 4;
} else if(c < 0x4000000) {
buf[0] = (unsigned char) 0xF8 + (c >> 24);
buf[1] = (unsigned char) 0x80 + ((c >> 18) & 0x3F);
buf[2] = (unsigned char) 0x80 + ((c >> 12) & 0x3F);
buf[3] = (unsigned char) 0x80 + ((c >> 6) & 0x3F);
buf[4] = (unsigned char) 0x80 + (c & 0x3F);
return 5;
} else if(c < 0x80000000) {
buf[0] = (unsigned char) 0xFC + (c >> 30);
buf[1] = (unsigned char) 0x80 + ((c >> 24) & 0x3F);
buf[2] = (unsigned char) 0x80 + ((c >> 18) & 0x3F);
buf[3] = (unsigned char) 0x80 + ((c >> 12) & 0x3F);
buf[4] = (unsigned char) 0x80 + ((c >> 6) & 0x3F);
buf[5] = (unsigned char) 0x80 + (c & 0x3F);
}
return -1;
}
// Combine a UTF-16 surrogate pair into the code point it encodes.
//
// hi must be a high surrogate (0xD800-0xDBFF) and lo a low surrogate
// (0xDC00-0xDFFF); otherwise -1 is returned. On success the result is
// in the range 0x10000-0x10FFFF.
int
unicode_from_pair(int hi, int lo)
{
    int hi_ok = (hi >= 0xD800 && hi <= 0xDBFF);
    int lo_ok = (lo >= 0xDC00 && lo <= 0xDFFF);

    if(!hi_ok || !lo_ok) {
        return -1;
    }

    // Each surrogate carries ten payload bits above its base value.
    return 0x10000 + (((hi - 0xD800) << 10) | (lo - 0xDC00));
}
// Write the JSON \u escape for code point val into p.
//
// val: code point to escape.
// p:   output buffer; 6 bytes are written for val < 0x10000 and 12 bytes
//      for val in 0x10000-0x10FFFF. No terminator is appended.
//
// Returns the number of bytes written (6 or 12), or -1 when val is above
// 0x10FFFF or int_to_hex() reports failure for the value it is given.
int
unicode_uescape(int val, char* p)
{
    int n;
    if(val < 0x10000) {
        p[0] = '\\';
        p[1] = 'u';
        if(int_to_hex(val, p+2) < 0) {
            return -1;
        }
        return 6;
    } else if (val <= 0x10FFFF) {
        // Supplementary plane: split the 20-bit offset from 0x10000 into
        // two 10-bit halves and emit them as a surrogate pair.
        n = val - 0x10000;
        p[0] = '\\';
        p[1] = 'u';
        // High surrogate carries the TOP ten bits, so shift right.
        // (Shifting left and masking with 0x03FF always yields 0.)
        if(int_to_hex((0xD800 | ((n >> 10) & 0x03FF)), p+2) < 0) {
            return -1;
        }
        p[6] = '\\';
        p[7] = 'u';
        // Low surrogate carries the bottom ten bits.
        if(int_to_hex((0xDC00 | (n & 0x03FF)), p+8) < 0) {
            return -1;
        }
        return 12;
    }
    return -1;
}

+ 7
- 3
src/jiffy.erl Datei anzeigen

@ -2,7 +2,7 @@
% See the LICENSE file for more information.
-module(jiffy).
-export([decode/1, encode/1]).
-export([decode/1, encode/1, encode/2]).
-define(NOT_LOADED, not_loaded(?LINE)).
-on_load(init/0).
@ -19,7 +19,11 @@ decode(Data) ->
encode(Data) ->
case nif_encode(Data) of
encode(Data, []).
encode(Data, Options) ->
case nif_encode(Data, Options) of
{error, _} = Error ->
throw(Error);
{partial, IOData} ->
@ -95,6 +99,6 @@ not_loaded(Line) ->
nif_decode(_Data) ->
?NOT_LOADED.
nif_encode(_Data) ->
nif_encode(_Data, _Options) ->
?NOT_LOADED.

+ 1
- 1
test/cases/string_invalid_hex_char.erl Datei anzeigen

@ -1 +1 @@
{error,{48,invalid_string}}.
{error,{44,invalid_string}}.

Laden…
Abbrechen
Speichern