You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

914 lines
26 KiB

  1. // This file is part of Jiffy released under the MIT license.
  2. // See the LICENSE file for more information.
  3. #include <assert.h>
  4. #include <errno.h>
  5. #include <stdio.h>
  6. #include <stdlib.h>
  7. #include <string.h>
  8. #include "erl_nif.h"
  9. #include "jiffy.h"
  // View a (possibly signed) char value as an unsigned byte.
  #define U(c) ((unsigned char) (c))

  // Forwards to make_error(); the expansion relies on `st` and `env`
  // being in scope at the call site and does not use its first argument.
  #define ERROR(i, msg) make_error(st, env, msg)

  // Initial capacity of the decoder's state stack, and the number of
  // slots added each time that stack has to grow.
  #define STACK_SIZE_INC 64

  // Size of the scratch buffer a number is copied into before it is
  // handed to strtol()/strtod().
  #define NUM_BUF_LEN 32

  // Windows builds use _snprintf in place of snprintf.
  #if WINDOWS || WIN32
  #define snprintf _snprintf
  #endif
  // States kept on the decoder's state stack (each one is stored in a char).
  //
  // This is a typedef so that `JsonState` names the enum type. The earlier
  // spelling, `enum { ... } JsonState;`, defined a global *variable* called
  // JsonState instead of a type.
  typedef enum {
      st_value = 0,   // a JSON value is expected next
      st_object,      // marks an object that is currently open
      st_array,       // marks an array that is currently open
      st_key,         // an object key (or '}' for an empty object) is expected
      st_colon,       // the ':' after an object key is expected
      st_comma,       // ',' or the closing '}' / ']' is expected
      st_done,        // the top-level value is complete
      st_invalid      // filler for unused stack slots
  } JsonState;
  // States of the number scanner in dec_number().
  //
  // This is a typedef so that `JsonNumState` names the enum type. The earlier
  // spelling, `enum { ... } JsonNumState;`, defined a global *variable*
  // called JsonNumState instead of a type.
  typedef enum {
      nst_init = 0,   // nothing consumed yet
      nst_sign,       // a leading '-' was consumed
      nst_mantissa,   // inside integer digits that began with 1-9
      nst_frac0,      // a leading '0' was consumed
      nst_frac1,      // '.' was consumed; a digit must follow
      nst_frac,       // inside the fraction digits
      nst_esign,      // 'e' / 'E' was consumed
      nst_edigit      // inside the exponent (after its sign or first digit)
  } JsonNumState;
  37. typedef struct {
  38. ErlNifEnv* env;
  39. jiffy_st* atoms;
  40. ERL_NIF_TERM arg;
  41. ErlNifBinary bin;
  42. int is_partial;
  43. char* p;
  44. unsigned char* u;
  45. int i;
  46. int len;
  47. char* st_data;
  48. int st_size;
  49. int st_top;
  50. } Decoder;
  51. void
  52. dec_init(Decoder* d, ErlNifEnv* env, ERL_NIF_TERM arg, ErlNifBinary* bin)
  53. {
  54. int i;
  55. d->env = env;
  56. d->atoms = enif_priv_data(env);
  57. d->arg = arg;
  58. d->is_partial = 0;
  59. d->p = (char*) bin->data;
  60. d->u = bin->data;
  61. d->len = bin->size;
  62. d->i = 0;
  63. d->st_data = (char*) enif_alloc(STACK_SIZE_INC * sizeof(char));
  64. d->st_size = STACK_SIZE_INC;
  65. d->st_top = 0;
  66. for(i = 0; i < d->st_size; i++) {
  67. d->st_data[i] = st_invalid;
  68. }
  69. d->st_data[0] = st_value;
  70. d->st_top++;
  71. }
  72. void
  73. dec_destroy(Decoder* d)
  74. {
  75. if(d->st_data != NULL) {
  76. enif_free(d->st_data);
  77. }
  78. }
  79. ERL_NIF_TERM
  80. dec_error(Decoder* d, const char* atom)
  81. {
  82. ERL_NIF_TERM pos = enif_make_int(d->env, d->i+1);
  83. ERL_NIF_TERM msg = make_atom(d->env, atom);
  84. ERL_NIF_TERM ret = enif_make_tuple2(d->env, pos, msg);
  85. return enif_make_tuple2(d->env, d->atoms->atom_error, ret);
  86. }
  87. char
  88. dec_curr(Decoder* d)
  89. {
  90. return d->st_data[d->st_top-1];
  91. }
  92. int
  93. dec_top(Decoder* d)
  94. {
  95. return d->st_top;
  96. }
  97. void
  98. dec_push(Decoder* d, char val)
  99. {
  100. char* tmp;
  101. int new_sz;
  102. int i;
  103. if(d->st_top >= d->st_size) {
  104. new_sz = d->st_size + STACK_SIZE_INC;
  105. tmp = (char*) enif_alloc(new_sz * sizeof(char));
  106. memcpy(tmp, d->st_data, d->st_size * sizeof(char));
  107. enif_free(d->st_data);
  108. d->st_data = tmp;
  109. d->st_size = new_sz;
  110. for(i = d->st_top; i < d->st_size; i++) {
  111. d->st_data[i] = st_invalid;
  112. }
  113. }
  114. d->st_data[d->st_top++] = val;
  115. }
  116. void
  117. dec_pop(Decoder* d, char val)
  118. {
  119. assert(d->st_data[d->st_top-1] == val && "popped invalid state.");
  120. d->st_data[d->st_top-1] = st_invalid;
  121. d->st_top--;
  122. }
  123. int
  124. dec_string(Decoder* d, ERL_NIF_TERM* value)
  125. {
  126. int has_escape = 0;
  127. int num_escapes = 0;
  128. int st;
  129. int ulen;
  130. int ui;
  131. int hi;
  132. int lo;
  133. char* chrbuf;
  134. int chrpos;
  135. if(d->p[d->i] != '\"') {
  136. return 0;
  137. }
  138. d->i++;
  139. st = d->i;
  140. while(d->i < d->len) {
  141. if(d->u[d->i] < 0x20) {
  142. return 0;
  143. } else if(d->p[d->i] == '\"') {
  144. d->i++;
  145. goto parse;
  146. } else if(d->p[d->i] == '\\') {
  147. if(d->i+1 >= d->len) {
  148. return 0;
  149. }
  150. has_escape = 1;
  151. num_escapes += 1;
  152. d->i++;
  153. switch(d->p[d->i]) {
  154. case '\"':
  155. case '\\':
  156. case '/':
  157. case 'b':
  158. case 'f':
  159. case 'n':
  160. case 'r':
  161. case 't':
  162. d->i++;
  163. break;
  164. case 'u':
  165. hi = 0;
  166. lo = 0;
  167. d->i++;
  168. if(d->i + 4 >= d->len) {
  169. return 0;
  170. }
  171. hi = int_from_hex(&(d->u[d->i]));
  172. if(hi < 0) {
  173. return 0;
  174. }
  175. d->i += 4;
  176. if(hi >= 0xD800 && hi < 0xDC00) {
  177. if(d->i + 6 >= d->len) {
  178. return 0;
  179. }
  180. if(d->p[d->i++] != '\\') {
  181. return 0;
  182. } else if(d->p[d->i++] != 'u') {
  183. return 0;
  184. }
  185. lo = int_from_hex(&(d->u[d->i]));
  186. if(lo < 0) {
  187. return 0;
  188. }
  189. hi = unicode_from_pair(hi, lo);
  190. if(hi < 0) {
  191. return 0;
  192. }
  193. }
  194. hi = utf8_len(hi);
  195. if(hi < 0) {
  196. return 0;
  197. }
  198. if(lo == 0) {
  199. num_escapes += 5 - hi;
  200. } else {
  201. num_escapes += 11 - hi;
  202. }
  203. break;
  204. default:
  205. return 0;
  206. }
  207. } else if(d->u[d->i] < 0x80) {
  208. d->i++;
  209. } else {
  210. ulen = utf8_validate(&(d->u[d->i]), d->len - d->i);
  211. if(ulen < 0) {
  212. return 0;
  213. }
  214. d->i += ulen;
  215. }
  216. }
  217. // The goto above ensures that we only
  218. // hit this when a string is not terminated
  219. // correctly.
  220. return 0;
  221. parse:
  222. if(!has_escape) {
  223. *value = enif_make_sub_binary(d->env, d->arg, st, (d->i - st - 1));
  224. return 1;
  225. }
  226. hi = 0;
  227. lo = 0;
  228. ulen = (d->i - 1) - st - num_escapes;
  229. chrbuf = (char*) enif_make_new_binary(d->env, ulen, value);
  230. chrpos = 0;
  231. ui = st;
  232. while(ui < d->i - 1) {
  233. if(d->p[ui] != '\\') {
  234. chrbuf[chrpos++] = d->p[ui++];
  235. continue;
  236. }
  237. ui++;
  238. switch(d->p[ui]) {
  239. case '\"':
  240. case '\\':
  241. case '/':
  242. chrbuf[chrpos++] = d->p[ui];
  243. ui++;
  244. break;
  245. case 'b':
  246. chrbuf[chrpos++] = '\b';
  247. ui++;
  248. break;
  249. case 'f':
  250. chrbuf[chrpos++] = '\f';
  251. ui++;
  252. break;
  253. case 'n':
  254. chrbuf[chrpos++] = '\n';
  255. ui++;
  256. break;
  257. case 'r':
  258. chrbuf[chrpos++] = '\r';
  259. ui++;
  260. break;
  261. case 't':
  262. chrbuf[chrpos++] = '\t';
  263. ui++;
  264. break;
  265. case 'u':
  266. ui++;
  267. hi = int_from_hex(&(d->u[ui]));
  268. if(hi < 0) {
  269. return 0;
  270. }
  271. if(hi >= 0xD800 && hi < 0xDC00) {
  272. lo = int_from_hex(&(d->u[ui+6]));
  273. if(lo < 0) {
  274. return 0;
  275. }
  276. hi = unicode_from_pair(hi, lo);
  277. ui += 10;
  278. } else {
  279. ui += 4;
  280. }
  281. hi = unicode_to_utf8(hi, (unsigned char*) chrbuf+chrpos);
  282. if(hi < 0) {
  283. return 0;
  284. }
  285. chrpos += hi;
  286. break;
  287. default:
  288. return 0;
  289. }
  290. }
  291. return 1;
  292. }
  293. int
  294. dec_number(Decoder* d, ERL_NIF_TERM* value)
  295. {
  296. ERL_NIF_TERM num_type = d->atoms->atom_error;
  297. char state = nst_init;
  298. char nbuf[NUM_BUF_LEN];
  299. int st = d->i;
  300. int has_frac = 0;
  301. int has_exp = 0;
  302. double dval;
  303. long lval;
  304. while(d->i < d->len) {
  305. switch(state) {
  306. case nst_init:
  307. switch(d->p[d->i]) {
  308. case '-':
  309. state = nst_sign;
  310. d->i++;
  311. break;
  312. case '0':
  313. state = nst_frac0;
  314. d->i++;
  315. break;
  316. case '1':
  317. case '2':
  318. case '3':
  319. case '4':
  320. case '5':
  321. case '6':
  322. case '7':
  323. case '8':
  324. case '9':
  325. state = nst_mantissa;
  326. d->i++;
  327. break;
  328. default:
  329. return 0;
  330. }
  331. break;
  332. case nst_sign:
  333. switch(d->p[d->i]) {
  334. case '0':
  335. state = nst_frac0;
  336. d->i++;
  337. break;
  338. case '1':
  339. case '2':
  340. case '3':
  341. case '4':
  342. case '5':
  343. case '6':
  344. case '7':
  345. case '8':
  346. case '9':
  347. state = nst_mantissa;
  348. d->i++;
  349. break;
  350. default:
  351. return 0;
  352. }
  353. break;
  354. case nst_mantissa:
  355. switch(d->p[d->i]) {
  356. case '.':
  357. state = nst_frac1;
  358. d->i++;
  359. break;
  360. case 'e':
  361. case 'E':
  362. state = nst_esign;
  363. d->i++;
  364. break;
  365. case '0':
  366. case '1':
  367. case '2':
  368. case '3':
  369. case '4':
  370. case '5':
  371. case '6':
  372. case '7':
  373. case '8':
  374. case '9':
  375. d->i++;
  376. break;
  377. default:
  378. goto parse;
  379. }
  380. break;
  381. case nst_frac0:
  382. switch(d->p[d->i]) {
  383. case '.':
  384. state = nst_frac1;
  385. d->i++;
  386. break;
  387. case 'e':
  388. case 'E':
  389. state = nst_esign;
  390. d->i++;
  391. break;
  392. default:
  393. goto parse;
  394. }
  395. break;
  396. case nst_frac1:
  397. has_frac = 1;
  398. switch(d->p[d->i]) {
  399. case '0':
  400. case '1':
  401. case '2':
  402. case '3':
  403. case '4':
  404. case '5':
  405. case '6':
  406. case '7':
  407. case '8':
  408. case '9':
  409. state = nst_frac;
  410. d->i++;
  411. break;
  412. default:
  413. goto parse;
  414. }
  415. break;
  416. case nst_frac:
  417. switch(d->p[d->i]) {
  418. case 'e':
  419. case 'E':
  420. state = nst_esign;
  421. d->i++;
  422. break;
  423. case '0':
  424. case '1':
  425. case '2':
  426. case '3':
  427. case '4':
  428. case '5':
  429. case '6':
  430. case '7':
  431. case '8':
  432. case '9':
  433. d->i++;
  434. break;
  435. default:
  436. goto parse;
  437. }
  438. break;
  439. case nst_esign:
  440. has_exp = 1;
  441. switch(d->p[d->i]) {
  442. case '-':
  443. case '+':
  444. case '0':
  445. case '1':
  446. case '2':
  447. case '3':
  448. case '4':
  449. case '5':
  450. case '6':
  451. case '7':
  452. case '8':
  453. case '9':
  454. state = nst_edigit;
  455. d->i++;
  456. break;
  457. default:
  458. return 0;
  459. }
  460. break;
  461. case nst_edigit:
  462. switch(d->p[d->i]) {
  463. case '0':
  464. case '1':
  465. case '2':
  466. case '3':
  467. case '4':
  468. case '5':
  469. case '6':
  470. case '7':
  471. case '8':
  472. case '9':
  473. d->i++;
  474. break;
  475. default:
  476. goto parse;
  477. }
  478. break;
  479. default:
  480. return 0;
  481. }
  482. }
  483. parse:
  484. switch(state) {
  485. case nst_init:
  486. case nst_sign:
  487. case nst_frac1:
  488. case nst_esign:
  489. return 0;
  490. default:
  491. break;
  492. }
  493. errno = 0;
  494. if(d->i - st < NUM_BUF_LEN) {
  495. memset(nbuf, 0, NUM_BUF_LEN);
  496. memcpy(nbuf, &(d->p[st]), d->i - st);
  497. if(has_frac || has_exp) {
  498. dval = strtod(nbuf, NULL);
  499. if(errno != ERANGE) {
  500. *value = enif_make_double(d->env, dval);
  501. return 1;
  502. }
  503. } else {
  504. lval = strtol(nbuf, NULL, 10);
  505. if(errno != ERANGE) {
  506. *value = enif_make_int64(d->env, lval);
  507. return 1;
  508. }
  509. }
  510. }
  511. if(!has_frac && !has_exp) {
  512. num_type = d->atoms->atom_bignum;
  513. } else if(!has_frac && has_exp) {
  514. num_type = d->atoms->atom_bignum_e;
  515. } else {
  516. num_type = d->atoms->atom_bigdbl;
  517. }
  518. d->is_partial = 1;
  519. *value = enif_make_sub_binary(d->env, d->arg, st, d->i - st);
  520. *value = enif_make_tuple2(d->env, num_type, *value);
  521. return 1;
  522. }
  523. ERL_NIF_TERM
  524. make_object(ErlNifEnv* env, ERL_NIF_TERM pairs)
  525. {
  526. ERL_NIF_TERM ret = enif_make_list(env, 0);
  527. ERL_NIF_TERM key, val;
  528. while(enif_get_list_cell(env, pairs, &val, &pairs)) {
  529. if(!enif_get_list_cell(env, pairs, &key, &pairs)) {
  530. assert(0 == 1 && "Unbalanced object pairs.");
  531. }
  532. val = enif_make_tuple2(env, key, val);
  533. ret = enif_make_list_cell(env, val, ret);
  534. }
  535. return enif_make_tuple1(env, ret);
  536. }
  537. ERL_NIF_TERM
  538. make_array(ErlNifEnv* env, ERL_NIF_TERM list)
  539. {
  540. ERL_NIF_TERM ret = enif_make_list(env, 0);
  541. ERL_NIF_TERM item;
  542. while(enif_get_list_cell(env, list, &item, &list)) {
  543. ret = enif_make_list_cell(env, item, ret);
  544. }
  545. return ret;
  546. }
  547. ERL_NIF_TERM
  548. decode(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
  549. {
  550. Decoder dec;
  551. Decoder* d = &dec;
  552. ErlNifBinary bin;
  553. ERL_NIF_TERM objs = enif_make_list(env, 0);
  554. ERL_NIF_TERM curr = enif_make_list(env, 0);
  555. ERL_NIF_TERM val;
  556. ERL_NIF_TERM ret;
  557. if(argc != 1) {
  558. return enif_make_badarg(env);
  559. } else if(!enif_inspect_binary(env, argv[0], &bin)) {
  560. return enif_make_badarg(env);
  561. }
  562. dec_init(d, env, argv[0], &bin);
  563. //fprintf(stderr, "Parsing:\r\n");
  564. while(d->i < bin.size) {
  565. //fprintf(stderr, "state: %d\r\n", dec_curr(d));
  566. switch(dec_curr(d)) {
  567. case st_value:
  568. switch(d->p[d->i]) {
  569. case ' ':
  570. case '\n':
  571. case '\r':
  572. case '\t':
  573. d->i++;
  574. break;
  575. case 'n':
  576. if(d->i + 3 >= d->len) {
  577. ret = dec_error(d, "invalid_literal");
  578. goto done;
  579. }
  580. if(memcmp(&(d->p[d->i]), "null", 4) != 0) {
  581. ret = dec_error(d, "invalid_literal");
  582. goto done;
  583. }
  584. val = d->atoms->atom_null;
  585. dec_pop(d, st_value);
  586. d->i += 4;
  587. break;
  588. case 't':
  589. if(d->i + 3 >= d->len) {
  590. ret = dec_error(d, "invalid_literal");
  591. goto done;
  592. }
  593. if(memcmp(&(d->p[d->i]), "true", 4) != 0) {
  594. ret = dec_error(d, "invalid_literal");
  595. goto done;
  596. }
  597. val = d->atoms->atom_true;
  598. dec_pop(d, st_value);
  599. d->i += 4;
  600. break;
  601. case 'f':
  602. if(d->i + 4 >= bin.size) {
  603. ret = dec_error(d, "invalid_literal");
  604. goto done;
  605. }
  606. if(memcmp(&(d->p[d->i]), "false", 5) != 0) {
  607. ret = dec_error(d, "invalid_literal");
  608. goto done;
  609. }
  610. val = d->atoms->atom_false;
  611. dec_pop(d, st_value);
  612. d->i += 5;
  613. break;
  614. case '\"':
  615. if(!dec_string(d, &val)) {
  616. ret = dec_error(d, "invalid_string");
  617. goto done;
  618. }
  619. dec_pop(d, st_value);
  620. break;
  621. case '-':
  622. case '0':
  623. case '1':
  624. case '2':
  625. case '3':
  626. case '4':
  627. case '5':
  628. case '6':
  629. case '7':
  630. case '8':
  631. case '9':
  632. if(!dec_number(d, &val)) {
  633. ret = dec_error(d, "invalid_number");
  634. goto done;
  635. }
  636. dec_pop(d, st_value);
  637. break;
  638. case '{':
  639. dec_push(d, st_object);
  640. dec_push(d, st_key);
  641. objs = enif_make_list_cell(env, curr, objs);
  642. curr = enif_make_list(env, 0);
  643. d->i++;
  644. break;
  645. case '[':
  646. dec_push(d, st_array);
  647. dec_push(d, st_value);
  648. objs = enif_make_list_cell(env, curr, objs);
  649. curr = enif_make_list(env, 0);
  650. d->i++;
  651. break;
  652. case ']':
  653. if(!enif_is_empty_list(env, curr)) {
  654. ret = dec_error(d, "invalid_json");
  655. goto done;
  656. }
  657. dec_pop(d, st_value);
  658. if(dec_curr(d) != st_array) {
  659. ret = dec_error(d, "invalid_json");
  660. goto done;
  661. }
  662. dec_pop(d, st_array);
  663. dec_pop(d, st_value);
  664. val = curr; // curr is []
  665. if(!enif_get_list_cell(env, objs, &curr, &objs)) {
  666. ret = dec_error(d, "internal_error");
  667. goto done;
  668. }
  669. d->i++;
  670. break;
  671. default:
  672. ret = dec_error(d, "invalid_json");
  673. goto done;
  674. }
  675. if(dec_top(d) == 0) {
  676. dec_push(d, st_done);
  677. } else if(dec_curr(d) != st_value && dec_curr(d) != st_key) {
  678. dec_push(d, st_comma);
  679. curr = enif_make_list_cell(env, val, curr);
  680. }
  681. break;
  682. case st_key:
  683. switch(d->p[d->i]) {
  684. case ' ':
  685. case '\n':
  686. case '\r':
  687. case '\t':
  688. d->i++;
  689. break;
  690. case '\"':
  691. if(!dec_string(d, &val)) {
  692. ret = dec_error(d, "invalid_string");
  693. goto done;
  694. }
  695. dec_pop(d, st_key);
  696. dec_push(d, st_colon);
  697. curr = enif_make_list_cell(env, val, curr);
  698. break;
  699. case '}':
  700. if(!enif_is_empty_list(env, curr)) {
  701. ret = dec_error(d, "invalid_json");
  702. goto done;
  703. }
  704. dec_pop(d, st_key);
  705. dec_pop(d, st_object);
  706. dec_pop(d, st_value);
  707. val = enif_make_tuple1(env, curr);
  708. if(!enif_get_list_cell(env, objs, &curr, &objs)) {
  709. ret = dec_error(d, "internal_error");
  710. goto done;
  711. }
  712. if(dec_top(d) == 0) {
  713. dec_push(d, st_done);
  714. } else {
  715. dec_push(d, st_comma);
  716. curr = enif_make_list_cell(env, val, curr);
  717. }
  718. d->i++;
  719. break;
  720. default:
  721. ret = dec_error(d, "invalid_json");
  722. goto done;
  723. }
  724. break;
  725. case st_colon:
  726. switch(d->p[d->i]) {
  727. case ' ':
  728. case '\n':
  729. case '\r':
  730. case '\t':
  731. d->i++;
  732. break;
  733. case ':':
  734. dec_pop(d, st_colon);
  735. dec_push(d, st_value);
  736. d->i++;
  737. break;
  738. default:
  739. ret = dec_error(d, "invalid_json");
  740. goto done;
  741. }
  742. break;
  743. case st_comma:
  744. switch(d->p[d->i]) {
  745. case ' ':
  746. case '\n':
  747. case '\r':
  748. case '\t':
  749. d->i++;
  750. break;
  751. case ',':
  752. dec_pop(d, st_comma);
  753. switch(dec_curr(d)) {
  754. case st_object:
  755. dec_push(d, st_key);
  756. break;
  757. case st_array:
  758. dec_push(d, st_value);
  759. break;
  760. default:
  761. ret = dec_error(d, "internal_error");
  762. goto done;
  763. }
  764. d->i++;
  765. break;
  766. case '}':
  767. dec_pop(d, st_comma);
  768. if(dec_curr(d) != st_object) {
  769. ret = dec_error(d, "invalid_json");
  770. goto done;
  771. }
  772. dec_pop(d, st_object);
  773. dec_pop(d, st_value);
  774. val = make_object(env, curr);
  775. if(!enif_get_list_cell(env, objs, &curr, &objs)) {
  776. ret = dec_error(d, "internal_error");
  777. goto done;
  778. }
  779. if(dec_top(d) > 0) {
  780. dec_push(d, st_comma);
  781. curr = enif_make_list_cell(env, val, curr);
  782. } else {
  783. dec_push(d, st_done);
  784. }
  785. d->i++;
  786. break;
  787. case ']':
  788. dec_pop(d, st_comma);
  789. if(dec_curr(d) != st_array) {
  790. ret = dec_error(d, "invalid_json");
  791. goto done;
  792. }
  793. dec_pop(d, st_array);
  794. dec_pop(d, st_value);
  795. val = make_array(env, curr);
  796. if(!enif_get_list_cell(env, objs, &curr, &objs)) {
  797. ret = dec_error(d, "internal_error");
  798. goto done;
  799. }
  800. if(dec_top(d) > 0) {
  801. dec_push(d, st_comma);
  802. curr = enif_make_list_cell(env, val, curr);
  803. } else {
  804. dec_push(d, st_done);
  805. }
  806. d->i++;
  807. break;
  808. default:
  809. ret = dec_error(d, "invalid_json");
  810. goto done;
  811. }
  812. break;
  813. case st_done:
  814. switch(d->p[d->i]) {
  815. case ' ':
  816. case '\n':
  817. case '\r':
  818. case '\t':
  819. d->i++;
  820. break;
  821. default:
  822. ret = dec_error(d, "invalid_trailing_data");
  823. goto done;
  824. }
  825. break;
  826. default:
  827. ret = dec_error(d, "invalid_internal_state");
  828. goto done;
  829. }
  830. }
  831. if(dec_curr(d) != st_done) {
  832. ret = dec_error(d, "truncated_json");
  833. } else if(d->is_partial) {
  834. ret = enif_make_tuple2(env, d->atoms->atom_partial, val);
  835. } else {
  836. ret = val;
  837. }
  838. done:
  839. dec_destroy(d);
  840. return ret;
  841. }