#include #include #include #include #include #include "erl_nif.h" #include "jiffy.h" #define U(c) ((unsigned char) (c)) #define ERROR(i, msg) make_error(st, env, msg) #define STACK_SIZE_INC 64 #define NUM_BUF_LEN 256 enum { st_value=0, st_object, st_array, st_key, st_colon, st_comma, st_done, st_invalid } JsonState; enum { nst_init=0, nst_sign, nst_mantissa, nst_frac0, nst_frac1, nst_frac, nst_esign, nst_edigit } JsonNumState; typedef struct { ErlNifEnv* env; jiffy_st* atoms; ERL_NIF_TERM arg; ErlNifBinary bin; int has_bignum; char* p; unsigned char* u; int i; int len; char* st_data; int st_size; int st_top; } Decoder; void dec_init(Decoder* d, ErlNifEnv* env, ERL_NIF_TERM arg, ErlNifBinary* bin) { int i; d->env = env; d->atoms = enif_priv_data(env); d->arg = arg; d->has_bignum = 0; d->p = (char*) bin->data; d->u = bin->data; d->len = bin->size; d->i = 0; d->st_data = (char*) enif_alloc(STACK_SIZE_INC * sizeof(char)); d->st_size = STACK_SIZE_INC; d->st_top = 0; for(i = 0; i < d->st_size; i++) { d->st_data[i] = st_invalid; } d->st_data[0] = st_value; d->st_top++; } void dec_destroy(Decoder* d) { if(d->st_data != NULL) { enif_free(d->st_data); } } ERL_NIF_TERM dec_error(Decoder* d, const char* atom) { ERL_NIF_TERM pos = enif_make_int(d->env, d->i+1); ERL_NIF_TERM msg = make_atom(d->env, atom); ERL_NIF_TERM ret = enif_make_tuple2(d->env, pos, msg); return enif_make_tuple2(d->env, d->atoms->atom_error, ret); } char dec_curr(Decoder* d) { return d->st_data[d->st_top-1]; } int dec_top(Decoder* d) { return d->st_top; } void dec_push(Decoder* d, char val) { char* tmp; int new_sz; int i; if(d->st_top >= d->st_size) { new_sz = d->st_size + STACK_SIZE_INC; tmp = (char*) enif_alloc(new_sz * sizeof(char)); memcpy(tmp, d->st_data, d->st_size * sizeof(char)); enif_free(d->st_data); d->st_data = tmp; d->st_size = new_sz; for(i = d->st_top; i < d->st_size; i++) { d->st_data[i] = st_invalid; } } d->st_data[d->st_top++] = val; } void dec_pop(Decoder* d, char val) { assert(d->st_data[d->st_top-1] == val && "popped invalid state."); d->st_data[d->st_top-1] = st_invalid; d->st_top--; } int dec_string(Decoder* d, ERL_NIF_TERM* value) { int has_escape = 0; int num_escapes = 0; int st; int ulen; int ui; int hi; int lo; char* chrbuf; int chrpos; if(d->p[d->i] != '\"') { return 0; } d->i++; st = d->i; while(d->i < d->len) { if(d->u[d->i] < 0x20) { return 0; } else if(d->p[d->i] == '\"') { d->i++; goto parse; } else if(d->p[d->i] == '\\') { if(d->i+1 >= d->len) { return 0; } has_escape = 1; num_escapes += 1; d->i++; switch(d->p[d->i]) { case '\"': case '\\': case '/': case 'b': case 'f': case 'n': case 'r': case 't': d->i++; break; case 'u': hi = 0; lo = 0; d->i++; if(d->i + 4 >= d->len) { return 0; } hi = int_from_hex(&(d->u[d->i])); d->i += 4; if(hi < 0) { return 0; } if(hi >= 0xD800 && hi < 0xDC00) { if(d->i + 6 >= d->len) { return 0; } if(d->p[d->i++] != '\\') { return 0; } else if(d->p[d->i++] != 'u') { return 0; } lo = int_from_hex(&(d->u[d->i])); if(lo < 0) { return 0; } hi = utf8_from_pair(hi, lo); if(hi < 0) { return 0; } } hi = utf8_len(hi); if(hi < 0) { return 0; } if(lo == 0) { num_escapes += 5 - hi; } else { num_escapes += 11 - hi; } break; default: return 0; } } else if(d->u[d->i] < 0x80) { d->i++; } else { ulen = -1; if((d->u[d->i] & 0xE0) == 0xC0) { ulen = 1; } else if((d->u[d->i] & 0xF0) == 0xE0) { ulen = 2; } else if((d->u[d->i] & 0xF8) == 0xF0) { ulen = 3; } else if((d->u[d->i] & 0xFC) == 0xF8) { ulen = 4; } else if((d->u[d->i] & 0xFE) == 0xFC) { ulen = 5; } if(ulen < 0) { return 0; } if(d->i + ulen >= d->len) { return 0; } for(ui = 0; ui < ulen; ui++) { if((d->u[d->i+1+ui] & 0xC0) != 0x80) { return 0; } } // Wikipedia says I have to check that a UTF-8 encoding // uses as few bits as possible. This means that we // can't do things like encode 't' in three bytes. // To check this all we need to ensure is that for each // of the following bit patterns that there is at least // one 1 bit in any of the x's // 11: 110xxxxy 10yyyyyy // 16: 1110xxxx 10xyyyyy 10yyyyyy // 21: 11110xxx 10xxyyyy 10yyyyyy 10yyyyyy // 26: 111110xx 10xxxyyy 10yyyyyy 10yyyyyy 10yyyyyy // 31: 1111110x 10xxxxyy 10yyyyyy 10yyyyyy 10yyyyyy 10yyyyyy if(ulen == 1) { if((d->u[d->i] & 0x1E) == 0) return 0; } else if(ulen == 2) { if((d->u[d->i] & 0x0F) + (d->u[d->i+1] & 0x20) == 0) return 0; } else if(ulen == 3) { if((d->u[d->i] & 0x07) + (d->u[d->i+1] & 0x30) == 0) return 0; } else if(ulen == 4) { if((d->u[d->i] & 0x03) + (d->u[d->i+1] & 0x38) == 0) return 0; } else if(ulen == 5) { if((d->u[d->i] & 0x01) + (d->u[d->i+1] & 0x3C) == 0) return 0; } d->i += 1 + ulen; } } parse: if(!has_escape) { *value = enif_make_sub_binary(d->env, d->arg, st, (d->i - st - 1)); return 1; } hi = 0; lo = 0; ulen = (d->i - 1) - st - num_escapes; chrbuf = (char*) enif_make_new_binary(d->env, ulen, value); chrpos = 0; ui = st; while(ui < d->i - 1) { if(d->p[ui] != '\\') { chrbuf[chrpos++] = d->p[ui++]; continue; } ui++; switch(d->p[ui]) { case '\"': case '\\': case '/': chrbuf[chrpos++] = d->p[ui]; ui++; break; case 'b': chrbuf[chrpos++] = '\b'; ui++; break; case 'f': chrbuf[chrpos++] = '\f'; ui++; break; case 'n': chrbuf[chrpos++] = '\n'; ui++; break; case 'r': chrbuf[chrpos++] = '\r'; ui++; break; case 't': chrbuf[chrpos++] = '\t'; ui++; break; case 'u': ui++; hi = int_from_hex(&(d->u[ui])); if(hi >= 0xD800 && hi < 0xDC00) { lo = int_from_hex(&(d->u[ui+6])); hi = utf8_from_pair(hi, lo); ui += 10; } else { ui += 4; } hi = utf8_to_binary(hi, (unsigned char*) chrbuf+chrpos); if(hi < 0) { return 0; } chrpos += hi; break; default: return 0; } } return 1; } int dec_number(Decoder* d, ERL_NIF_TERM* value) { char state = nst_init; char nbuf[NUM_BUF_LEN]; int st = d->i; int is_double = 0; double dval; long lval; while(d->i < d->len) { switch(state) { case nst_init: switch(d->p[d->i]) { case '-': state = nst_sign; d->i++; break; case '0': state = nst_frac0; d->i++; break; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': state = nst_mantissa; d->i++; break; default: return 0; } break; case nst_sign: switch(d->p[d->i]) { case '0': state = nst_frac0; d->i++; break; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': state = nst_mantissa; d->i++; break; default: return 0; } break; case nst_mantissa: switch(d->p[d->i]) { case '.': state = nst_frac1; d->i++; break; case 'e': case 'E': state = nst_esign; d->i++; break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': d->i++; break; default: goto parse; } break; case nst_frac0: switch(d->p[d->i]) { case '.': state = nst_frac1; d->i++; break; case 'e': case 'E': state = nst_esign; d->i++; break; default: goto parse; } break; case nst_frac1: is_double = 1; switch(d->p[d->i]) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': state = nst_frac; d->i++; break; default: goto parse; } break; case nst_frac: switch(d->p[d->i]) { case 'e': case 'E': state = nst_esign; d->i++; break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': d->i++; break; default: goto parse; } break; case nst_esign: is_double = 1; switch(d->p[d->i]) { case '-': case '+': case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': state = nst_edigit; d->i++; break; default: return 0; } break; case nst_edigit: switch(d->p[d->i]) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': d->i++; break; default: goto parse; } break; default: return 0; } } parse: switch(state) { case nst_init: case nst_sign: case nst_frac1: case nst_esign: return 0; default: break; } if(st - d->i > NUM_BUF_LEN && is_double) { return 0; } else if(st - d->i > NUM_BUF_LEN) { d->has_bignum = 1; *value = enif_make_sub_binary(d->env, d->arg, st, d->i - st); *value = enif_make_tuple2(d->env, d->atoms->atom_bignum, *value); return 1; } memset(nbuf, 0, NUM_BUF_LEN); memcpy(nbuf, &(d->p[st]), d->i - st); errno = 0; if(is_double) { dval = strtod(nbuf, NULL); if(errno == ERANGE) { return 0; } *value = enif_make_double(d->env, dval); return 1; } lval = strtol(nbuf, NULL, 10); if(errno == ERANGE) { d->has_bignum = 1; *value = enif_make_sub_binary(d->env, d->arg, st, d->i - st); *value = enif_make_tuple2(d->env, d->atoms->atom_bignum, *value); } else { *value = enif_make_int64(d->env, lval); } return 1; } ERL_NIF_TERM make_object(ErlNifEnv* env, ERL_NIF_TERM pairs) { ERL_NIF_TERM ret = enif_make_list(env, 0); ERL_NIF_TERM key, val; while(enif_get_list_cell(env, pairs, &val, &pairs)) { if(!enif_get_list_cell(env, pairs, &key, &pairs)) { assert(0 == 1 && "Unbalanced object pairs."); } val = enif_make_tuple2(env, key, val); ret = enif_make_list_cell(env, val, ret); } return enif_make_tuple1(env, ret); } ERL_NIF_TERM make_array(ErlNifEnv* env, ERL_NIF_TERM list) { ERL_NIF_TERM ret = enif_make_list(env, 0); ERL_NIF_TERM item; while(enif_get_list_cell(env, list, &item, &list)) { ret = enif_make_list_cell(env, item, ret); } return ret; } ERL_NIF_TERM decode(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]) { Decoder dec; Decoder* d = &dec; ErlNifBinary bin; ERL_NIF_TERM objs = enif_make_list(env, 0); ERL_NIF_TERM curr = enif_make_list(env, 0); ERL_NIF_TERM val; ERL_NIF_TERM ret; if(argc != 1) { return enif_make_badarg(env); } else if(!enif_inspect_binary(env, argv[0], &bin)) { return enif_make_badarg(env); } dec_init(d, env, argv[0], &bin); //fprintf(stderr, "Parsing:\r\n"); while(d->i < bin.size) { //fprintf(stderr, "state: %d\r\n", dec_curr(d)); switch(dec_curr(d)) { case st_value: switch(d->p[d->i]) { case ' ': case '\n': case '\r': case '\t': d->i++; break; case 'n': if(d->i + 3 >= d->len) { ret = dec_error(d, "invalid_literal"); goto done; } if(memcmp(&(d->p[d->i]), "null", 4) != 0) { ret = dec_error(d, "invalid_literal"); goto done; } val = d->atoms->atom_null; dec_pop(d, st_value); d->i += 4; break; case 't': if(d->i + 3 >= d->len) { ret = dec_error(d, "invalid_literal"); goto done; } if(memcmp(&(d->p[d->i]), "true", 4) != 0) { ret = dec_error(d, "invalid_literal"); goto done; } val = d->atoms->atom_true; dec_pop(d, st_value); d->i += 4; break; case 'f': if(d->i + 4 >= bin.size) { ret = dec_error(d, "invalid_literal"); goto done; } if(memcmp(&(d->p[d->i]), "false", 5) != 0) { ret = dec_error(d, "invalid_literal"); goto done; } val = d->atoms->atom_false; dec_pop(d, st_value); d->i += 5; break; case '\"': if(!dec_string(d, &val)) { ret = dec_error(d, "invalid_string"); goto done; } dec_pop(d, st_value); break; case '-': case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': if(!dec_number(d, &val)) { ret = dec_error(d, "invalid_number"); goto done; } dec_pop(d, st_value); break; case '{': dec_push(d, st_object); dec_push(d, st_key); objs = enif_make_list_cell(env, curr, objs); curr = enif_make_list(env, 0); d->i++; break; case '[': dec_push(d, st_array); dec_push(d, st_value); objs = enif_make_list_cell(env, curr, objs); curr = enif_make_list(env, 0); d->i++; break; case ']': if(!enif_is_empty_list(env, curr)) { ret = dec_error(d, "invalid_json"); goto done; } dec_pop(d, st_value); if(dec_curr(d) != st_array) { ret = dec_error(d, "invalid_json"); goto done; } dec_pop(d, st_array); dec_pop(d, st_value); val = curr; // curr is [] if(!enif_get_list_cell(env, objs, &curr, &objs)) { ret = dec_error(d, "internal_error"); goto done; } d->i++; break; default: ret = dec_error(d, "invalid_json"); goto done; } if(dec_top(d) == 0) { dec_push(d, st_done); } else if(dec_curr(d) != st_value && dec_curr(d) != st_key) { dec_push(d, st_comma); curr = enif_make_list_cell(env, val, curr); } break; case st_key: switch(d->p[d->i]) { case ' ': case '\n': case '\r': case '\t': d->i++; break; case '\"': if(!dec_string(d, &val)) { ret = dec_error(d, "invalid_string"); goto done; } dec_pop(d, st_key); dec_push(d, st_colon); curr = enif_make_list_cell(env, val, curr); break; case '}': if(!enif_is_empty_list(env, curr)) { ret = dec_error(d, "invalid_json"); goto done; } dec_pop(d, st_key); dec_pop(d, st_object); dec_pop(d, st_value); val = enif_make_tuple1(env, curr); if(!enif_get_list_cell(env, objs, &curr, &objs)) { ret = dec_error(d, "internal_error"); goto done; } if(dec_top(d) == 0) { dec_push(d, st_done); } else { dec_push(d, st_comma); curr = enif_make_list_cell(env, val, curr); } d->i++; break; default: ret = dec_error(d, "invalid_json"); goto done; } break; case st_colon: switch(d->p[d->i]) { case ' ': case '\n': case '\r': case '\t': d->i++; break; case ':': dec_pop(d, st_colon); dec_push(d, st_value); d->i++; break; default: ret = dec_error(d, "invalid_json"); goto done; } break; case st_comma: switch(d->p[d->i]) { case ' ': case '\n': case '\r': case '\t': d->i++; break; case ',': dec_pop(d, st_comma); switch(dec_curr(d)) { case st_object: dec_push(d, st_key); break; case st_array: dec_push(d, st_value); break; default: ret = dec_error(d, "internal_error"); goto done; } d->i++; break; case '}': dec_pop(d, st_comma); if(dec_curr(d) != st_object) { ret = dec_error(d, "invalid_json"); goto done; } dec_pop(d, st_object); dec_pop(d, st_value); val = make_object(env, curr); if(!enif_get_list_cell(env, objs, &curr, &objs)) { ret = dec_error(d, "internal_error"); goto done; } if(dec_top(d) > 0) { dec_push(d, st_comma); curr = enif_make_list_cell(env, val, curr); } else { dec_push(d, st_done); } d->i++; break; case ']': dec_pop(d, st_comma); if(dec_curr(d) != st_array) { ret = dec_error(d, "invalid_json"); goto done; } dec_pop(d, st_array); dec_pop(d, st_value); val = make_array(env, curr); if(!enif_get_list_cell(env, objs, &curr, &objs)) { ret = dec_error(d, "internal_error"); goto done; } if(dec_top(d) > 0) { dec_push(d, st_comma); curr = enif_make_list_cell(env, val, curr); } else { dec_push(d, st_done); } d->i++; break; default: ret = dec_error(d, "invalid_json"); goto done; } break; case st_done: switch(d->p[d->i]) { case ' ': case '\n': case '\r': case '\t': d->i++; break; default: ret = dec_error(d, "invalid_trailing_data"); goto done; } break; default: ret = dec_error(d, "invalid_internal_state"); goto done; } } if(dec_curr(d) != st_done) { ret = dec_error(d, "truncated_json"); } else if(d->has_bignum) { ret = enif_make_tuple2(env, d->atoms->atom_bignum, val); } else { ret = enif_make_tuple2(env, d->atoms->atom_ok, val); } done: dec_destroy(d); return ret; }