You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 

934 lines
27 KiB

#include <assert.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "erl_nif.h"
#include "jiffy.h"
#define U(c) ((unsigned char) (c))
#define ERROR(i, msg) make_error(st, env, msg)
#define STACK_SIZE_INC 64
#define NUM_BUF_LEN 256
enum {
st_value=0,
st_object,
st_array,
st_key,
st_colon,
st_comma,
st_done,
st_invalid
} JsonState;
enum {
nst_init=0,
nst_sign,
nst_mantissa,
nst_frac0,
nst_frac1,
nst_frac,
nst_esign,
nst_edigit
} JsonNumState;
typedef struct {
ErlNifEnv* env;
jiffy_st* atoms;
ERL_NIF_TERM arg;
ErlNifBinary bin;
int has_bignum;
char* p;
unsigned char* u;
int i;
int len;
char* st_data;
int st_size;
int st_top;
} Decoder;
void
dec_init(Decoder* d, ErlNifEnv* env, ERL_NIF_TERM arg, ErlNifBinary* bin)
{
int i;
d->env = env;
d->atoms = enif_priv_data(env);
d->arg = arg;
d->has_bignum = 0;
d->p = (char*) bin->data;
d->u = bin->data;
d->len = bin->size;
d->i = 0;
d->st_data = (char*) enif_alloc(STACK_SIZE_INC * sizeof(char));
d->st_size = STACK_SIZE_INC;
d->st_top = 0;
for(i = 0; i < d->st_size; i++) {
d->st_data[i] = st_invalid;
}
d->st_data[0] = st_value;
d->st_top++;
}
void
dec_destroy(Decoder* d)
{
if(d->st_data != NULL) {
enif_free(d->st_data);
}
}
ERL_NIF_TERM
dec_error(Decoder* d, const char* atom)
{
ERL_NIF_TERM pos = enif_make_int(d->env, d->i+1);
ERL_NIF_TERM msg = make_atom(d->env, atom);
ERL_NIF_TERM ret = enif_make_tuple2(d->env, pos, msg);
return enif_make_tuple2(d->env, d->atoms->atom_error, ret);
}
char
dec_curr(Decoder* d)
{
return d->st_data[d->st_top-1];
}
int
dec_top(Decoder* d)
{
return d->st_top;
}
void
dec_push(Decoder* d, char val)
{
char* tmp;
int new_sz;
int i;
if(d->st_top >= d->st_size) {
new_sz = d->st_size + STACK_SIZE_INC;
tmp = (char*) enif_alloc(new_sz * sizeof(char));
memcpy(tmp, d->st_data, d->st_size * sizeof(char));
enif_free(d->st_data);
d->st_data = tmp;
d->st_size = new_sz;
for(i = d->st_top; i < d->st_size; i++) {
d->st_data[i] = st_invalid;
}
}
d->st_data[d->st_top++] = val;
}
void
dec_pop(Decoder* d, char val)
{
assert(d->st_data[d->st_top-1] == val && "popped invalid state.");
d->st_data[d->st_top-1] = st_invalid;
d->st_top--;
}
int
dec_string(Decoder* d, ERL_NIF_TERM* value)
{
int has_escape = 0;
int num_escapes = 0;
int st;
int ulen;
int ui;
int hi;
int lo;
char* chrbuf;
int chrpos;
if(d->p[d->i] != '\"') {
return 0;
}
d->i++;
st = d->i;
while(d->i < d->len) {
if(d->u[d->i] < 0x20) {
return 0;
} else if(d->p[d->i] == '\"') {
d->i++;
goto parse;
} else if(d->p[d->i] == '\\') {
if(d->i+1 >= d->len) {
return 0;
}
has_escape = 1;
num_escapes += 1;
d->i++;
switch(d->p[d->i]) {
case '\"':
case '\\':
case '/':
case 'b':
case 'f':
case 'n':
case 'r':
case 't':
d->i++;
break;
case 'u':
hi = 0;
lo = 0;
d->i++;
if(d->i + 4 >= d->len) {
return 0;
}
hi = int_from_hex(&(d->u[d->i]));
d->i += 4;
if(hi < 0) {
return 0;
}
if(hi >= 0xD800 && hi < 0xDC00) {
if(d->i + 6 >= d->len) {
return 0;
}
if(d->p[d->i++] != '\\') {
return 0;
} else if(d->p[d->i++] != 'u') {
return 0;
}
lo = int_from_hex(&(d->u[d->i]));
if(lo < 0) {
return 0;
}
hi = utf8_from_pair(hi, lo);
if(hi < 0) {
return 0;
}
}
hi = utf8_len(hi);
if(hi < 0) {
return 0;
}
if(lo == 0) {
num_escapes += 5 - hi;
} else {
num_escapes += 11 - hi;
}
break;
default:
return 0;
}
} else if(d->u[d->i] < 0x80) {
d->i++;
} else {
ulen = -1;
if((d->u[d->i] & 0xE0) == 0xC0) {
ulen = 1;
} else if((d->u[d->i] & 0xF0) == 0xE0) {
ulen = 2;
} else if((d->u[d->i] & 0xF8) == 0xF0) {
ulen = 3;
} else if((d->u[d->i] & 0xFC) == 0xF8) {
ulen = 4;
} else if((d->u[d->i] & 0xFE) == 0xFC) {
ulen = 5;
}
if(ulen < 0) {
return 0;
}
if(d->i + ulen >= d->len) {
return 0;
}
for(ui = 0; ui < ulen; ui++) {
if((d->u[d->i+1+ui] & 0xC0) != 0x80) {
return 0;
}
}
// Wikipedia says I have to check that a UTF-8 encoding
// uses as few bits as possible. This means that we
// can't do things like encode 't' in three bytes.
// To check this all we need to ensure is that for each
// of the following bit patterns that there is at least
// one 1 bit in each of the x's
// 11: 110xxxxy 10yyyyyy
// 16: 1110xxxx 10xyyyyy 10yyyyyy
// 21: 11110xxx 10xxyyyy 10yyyyyy 10yyyyyy
// 26: 111110xx 10xxxyyy 10yyyyyy 10yyyyyy 10yyyyyy
// 31: 1111110x 10xxxxyy 10yyyyyy 10yyyyyy 10yyyyyy 10yyyyyy
if(ulen == 1) {
if((d->u[d->i] & 0x1E) == 0) return 0;
} else if(ulen == 2) {
if((d->u[d->i] & 0x0F) + (d->u[d->i+1] & 0x20) == 0) return 0;
} else if(ulen == 3) {
if((d->u[d->i] & 0x07) + (d->u[d->i+1] & 0x30) == 0) return 0;
} else if(ulen == 4) {
if((d->u[d->i] & 0x03) + (d->u[d->i+1] & 0x38) == 0) return 0;
} else if(ulen == 5) {
if((d->u[d->i] & 0x01) + (d->u[d->i+1] & 0x3C) == 0) return 0;
}
d->i += 1 + ulen;
}
}
parse:
if(!has_escape) {
*value = enif_make_sub_binary(d->env, d->arg, st, (d->i - st - 1));
return 1;
}
hi = 0;
lo = 0;
ulen = (d->i - 1) - st - num_escapes;
chrbuf = (char*) enif_make_new_binary(d->env, ulen, value);
chrpos = 0;
ui = st;
while(ui < d->i - 1) {
if(d->p[ui] != '\\') {
chrbuf[chrpos++] = d->p[ui++];
continue;
}
ui++;
switch(d->p[ui]) {
case '\"':
case '\\':
case '/':
chrbuf[chrpos++] = d->p[ui];
ui++;
break;
case 'b':
chrbuf[chrpos++] = '\b';
ui++;
break;
case 'f':
chrbuf[chrpos++] = '\f';
ui++;
break;
case 'n':
chrbuf[chrpos++] = '\n';
ui++;
break;
case 'r':
chrbuf[chrpos++] = '\r';
ui++;
break;
case 't':
chrbuf[chrpos++] = '\t';
ui++;
break;
case 'u':
ui++;
hi = int_from_hex(&(d->u[ui]));
if(hi >= 0xD800 && hi < 0xDC00) {
lo = int_from_hex(&(d->u[ui+6]));
hi = utf8_from_pair(hi, lo);
ui += 10;
} else {
ui += 4;
}
hi = utf8_to_binary(hi, (unsigned char*) chrbuf+chrpos);
if(hi < 0) {
return 0;
}
chrpos += hi;
break;
default:
return 0;
}
}
return 1;
}
int
dec_number(Decoder* d, ERL_NIF_TERM* value)
{
char state = nst_init;
char nbuf[NUM_BUF_LEN];
int st = d->i;
int is_double = 0;
double dval;
long lval;
while(d->i < d->len) {
switch(state) {
case nst_init:
switch(d->p[d->i]) {
case '-':
state = nst_sign;
d->i++;
break;
case '0':
state = nst_frac0;
d->i++;
break;
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
state = nst_mantissa;
d->i++;
break;
default:
return 0;
}
break;
case nst_sign:
switch(d->p[d->i]) {
case '0':
state = nst_frac0;
d->i++;
break;
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
state = nst_mantissa;
d->i++;
break;
default:
return 0;
}
break;
case nst_mantissa:
switch(d->p[d->i]) {
case '.':
state = nst_frac1;
d->i++;
break;
case 'e':
case 'E':
state = nst_esign;
d->i++;
break;
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
d->i++;
break;
default:
goto parse;
}
break;
case nst_frac0:
switch(d->p[d->i]) {
case '.':
state = nst_frac1;
d->i++;
break;
case 'e':
case 'E':
state = nst_esign;
d->i++;
break;
default:
goto parse;
}
break;
case nst_frac1:
is_double = 1;
switch(d->p[d->i]) {
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
state = nst_frac;
d->i++;
break;
default:
goto parse;
}
break;
case nst_frac:
switch(d->p[d->i]) {
case 'e':
case 'E':
state = nst_esign;
d->i++;
break;
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
d->i++;
break;
default:
goto parse;
}
break;
case nst_esign:
is_double = 1;
switch(d->p[d->i]) {
case '-':
case '+':
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
state = nst_edigit;
d->i++;
break;
default:
return 0;
}
break;
case nst_edigit:
switch(d->p[d->i]) {
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
d->i++;
break;
default:
goto parse;
}
break;
default:
return 0;
}
}
parse:
switch(state) {
case nst_init:
case nst_sign:
case nst_frac1:
case nst_esign:
return 0;
default:
break;
}
if(st - d->i > NUM_BUF_LEN && is_double) {
return 0;
} else if(st - d->i > NUM_BUF_LEN) {
d->has_bignum = 1;
*value = enif_make_sub_binary(d->env, d->arg, st, d->i - st);
*value = enif_make_tuple2(d->env, d->atoms->atom_bignum, *value);
return 1;
}
memset(nbuf, 0, NUM_BUF_LEN);
memcpy(nbuf, &(d->p[st]), d->i - st);
errno = 0;
if(is_double) {
dval = strtod(nbuf, NULL);
if(errno == ERANGE) {
return 0;
}
*value = enif_make_double(d->env, dval);
return 1;
}
lval = strtol(nbuf, NULL, 10);
if(errno == ERANGE) {
d->has_bignum = 1;
*value = enif_make_sub_binary(d->env, d->arg, st, d->i - st);
*value = enif_make_tuple2(d->env, d->atoms->atom_bignum, *value);
} else {
*value = enif_make_int64(d->env, lval);
}
return 1;
}
ERL_NIF_TERM
make_object(ErlNifEnv* env, ERL_NIF_TERM pairs)
{
ERL_NIF_TERM ret = enif_make_list(env, 0);
ERL_NIF_TERM key, val;
while(enif_get_list_cell(env, pairs, &val, &pairs)) {
if(!enif_get_list_cell(env, pairs, &key, &pairs)) {
assert(0 == 1 && "Unbalanced object pairs.");
}
val = enif_make_tuple2(env, key, val);
ret = enif_make_list_cell(env, val, ret);
}
return enif_make_tuple1(env, ret);
}
ERL_NIF_TERM
make_array(ErlNifEnv* env, ERL_NIF_TERM list)
{
ERL_NIF_TERM ret = enif_make_list(env, 0);
ERL_NIF_TERM item;
while(enif_get_list_cell(env, list, &item, &list)) {
ret = enif_make_list_cell(env, item, ret);
}
return ret;
}
ERL_NIF_TERM
decode(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
{
Decoder dec;
Decoder* d = &dec;
ErlNifBinary bin;
ERL_NIF_TERM objs = enif_make_list(env, 0);
ERL_NIF_TERM curr;
ERL_NIF_TERM val;
ERL_NIF_TERM ret;
if(argc != 1) {
return enif_make_badarg(env);
} else if(!enif_inspect_binary(env, argv[0], &bin)) {
return enif_make_badarg(env);
}
dec_init(d, env, argv[0], &bin);
//fprintf(stderr, "Parsing:\r\n");
while(d->i < bin.size) {
//fprintf(stderr, "state: %d\r\n", dec_curr(d));
switch(dec_curr(d)) {
case st_value:
switch(d->p[d->i]) {
case ' ':
case '\n':
case '\r':
case '\t':
d->i++;
break;
case 'n':
if(d->i + 3 >= d->len) {
ret = dec_error(d, "invalid_literal");
goto done;
}
if(memcmp(&(d->p[d->i]), "null", 4) != 0) {
ret = dec_error(d, "invalid_literal");
goto done;
}
val = d->atoms->atom_null;
dec_pop(d, st_value);
d->i += 4;
break;
case 't':
if(d->i + 3 >= d->len) {
ret = dec_error(d, "invalid_literal");
goto done;
}
if(memcmp(&(d->p[d->i]), "true", 4) != 0) {
ret = dec_error(d, "invalid_literal");
goto done;
}
val = d->atoms->atom_true;
dec_pop(d, st_value);
d->i += 4;
break;
case 'f':
if(d->i + 4 >= bin.size) {
ret = dec_error(d, "invalid_literal");
goto done;
}
if(memcmp(&(d->p[d->i]), "false", 5) != 0) {
ret = dec_error(d, "invalid_literal");
goto done;
}
val = d->atoms->atom_false;
dec_pop(d, st_value);
d->i += 5;
break;
case '\"':
if(!dec_string(d, &val)) {
ret = dec_error(d, "invalid_string");
goto done;
}
dec_pop(d, st_value);
break;
case '-':
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
if(!dec_number(d, &val)) {
ret = dec_error(d, "invalid_number");
goto done;
}
dec_pop(d, st_value);
break;
case '{':
dec_push(d, st_object);
dec_push(d, st_key);
objs = enif_make_list_cell(env, curr, objs);
curr = enif_make_list(env, 0);
d->i++;
break;
case '[':
dec_push(d, st_array);
dec_push(d, st_value);
objs = enif_make_list_cell(env, curr, objs);
curr = enif_make_list(env, 0);
d->i++;
break;
case ']':
if(!enif_is_empty_list(env, curr)) {
ret = dec_error(d, "invalid_json");
goto done;
}
dec_pop(d, st_value);
if(dec_curr(d) != st_array) {
ret = dec_error(d, "invalid_json");
goto done;
}
dec_pop(d, st_array);
dec_pop(d, st_value);
val = curr; // curr is []
if(!enif_get_list_cell(env, objs, &curr, &objs)) {
ret = dec_error(d, "internal_error");
goto done;
}
d->i++;
break;
default:
ret = dec_error(d, "invalid_json");
goto done;
}
if(dec_top(d) == 0) {
dec_push(d, st_done);
} else if(dec_curr(d) != st_value && dec_curr(d) != st_key) {
dec_push(d, st_comma);
curr = enif_make_list_cell(env, val, curr);
}
break;
case st_key:
switch(d->p[d->i]) {
case ' ':
case '\n':
case '\r':
case '\t':
d->i++;
break;
case '\"':
if(!dec_string(d, &val)) {
ret = dec_error(d, "invalid_string");
goto done;
}
dec_pop(d, st_key);
dec_push(d, st_colon);
curr = enif_make_list_cell(env, val, curr);
break;
case '}':
if(!enif_is_empty_list(env, curr)) {
ret = dec_error(d, "invalid_json");
goto done;
}
dec_pop(d, st_key);
dec_pop(d, st_object);
dec_pop(d, st_value);
val = enif_make_tuple1(env, curr);
if(!enif_get_list_cell(env, objs, &curr, &objs)) {
ret = dec_error(d, "internal_error");
goto done;
}
if(dec_top(d) == 0) {
dec_push(d, st_done);
} else {
dec_push(d, st_comma);
curr = enif_make_list_cell(env, val, curr);
}
d->i++;
break;
default:
ret = dec_error(d, "invalid_json");
goto done;
}
break;
case st_colon:
switch(d->p[d->i]) {
case ' ':
case '\n':
case '\r':
case '\t':
d->i++;
break;
case ':':
dec_pop(d, st_colon);
dec_push(d, st_value);
d->i++;
break;
default:
ret = dec_error(d, "invalid_json");
goto done;
}
break;
case st_comma:
switch(d->p[d->i]) {
case ' ':
case '\n':
case '\r':
case '\t':
d->i++;
break;
case ',':
dec_pop(d, st_comma);
switch(dec_curr(d)) {
case st_object:
dec_push(d, st_key);
break;
case st_array:
dec_push(d, st_value);
break;
default:
ret = dec_error(d, "internal_error");
goto done;
}
d->i++;
break;
case '}':
dec_pop(d, st_comma);
if(dec_curr(d) != st_object) {
ret = dec_error(d, "invalid_json");
goto done;
}
dec_pop(d, st_object);
dec_pop(d, st_value);
val = make_object(env, curr);
if(!enif_get_list_cell(env, objs, &curr, &objs)) {
ret = dec_error(d, "internal_error");
goto done;
}
if(dec_top(d) > 0) {
dec_push(d, st_comma);
curr = enif_make_list_cell(env, val, curr);
} else {
dec_push(d, st_done);
}
d->i++;
break;
case ']':
dec_pop(d, st_comma);
if(dec_curr(d) != st_array) {
ret = dec_error(d, "invalid_json");
goto done;
}
dec_pop(d, st_array);
dec_pop(d, st_value);
val = make_array(env, curr);
if(!enif_get_list_cell(env, objs, &curr, &objs)) {
ret = dec_error(d, "internal_error");
goto done;
}
if(dec_top(d) > 0) {
dec_push(d, st_comma);
curr = enif_make_list_cell(env, val, curr);
} else {
dec_push(d, st_done);
}
d->i++;
break;
default:
ret = dec_error(d, "invalid_json");
goto done;
}
break;
case st_done:
switch(d->p[d->i]) {
case ' ':
case '\n':
case '\r':
case '\t':
d->i++;
break;
default:
ret = dec_error(d, "invalid_trailing_data");
goto done;
}
break;
default:
ret = dec_error(d, "invalid_internal_state");
goto done;
}
}
if(dec_curr(d) != st_done) {
ret = dec_error(d, "truncated_json");
} else if(d->has_bignum) {
ret = enif_make_tuple2(env, d->atoms->atom_bignum, val);
} else {
ret = enif_make_tuple2(env, d->atoms->atom_ok, val);
}
done:
dec_destroy(d);
return ret;
}