You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

913 lines
26 KiB

  // This file is part of Jiffy released under the MIT license.
  // See the LICENSE file for more information.
  #include <assert.h>
  #include <errno.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>

  #include "erl_nif.h"
  #include "jiffy.h"
  // Reinterpret a (possibly signed) char as an unsigned byte value.
  #define U(c) ((unsigned char) (c))

  // Shorthand for make_error(). The first argument is ignored and the
  // expansion expects variables named `st` and `env` at the call site.
  #define ERROR(i, msg) make_error(st, env, msg)

  // Number of slots the decoder state stack starts with and grows by.
  #define STACK_SIZE_INC 64

  // Size of the scratch buffer handed to strtol()/strtod() in dec_number().
  #define NUM_BUF_LEN 32

  // Map snprintf to _snprintf when WINDOWS or WIN32 is set.
  #if WINDOWS || WIN32
  #define snprintf _snprintf
  #endif
  // States kept on the decoder's state stack (see decode()).
  //
  // NOTE: there is no `typedef` here, so `JsonState` is a variable of an
  // anonymous enum type, not a type name. The enumerators are what the
  // decoder actually uses.
  enum {
      st_value=0,     // expecting a JSON value
      st_object,      // marker: inside an object
      st_array,       // marker: inside an array
      st_key,         // expecting an object key (or '}' for an empty object)
      st_colon,       // expecting ':' after an object key
      st_comma,       // expecting ',' or the closing '}' / ']'
      st_done,        // top-level value complete; only whitespace may follow
      st_invalid      // filler for unused stack slots
  } JsonState;
  // States of the number scanner in dec_number().
  //
  // NOTE: there is no `typedef` here, so `JsonNumState` is a variable of an
  // anonymous enum type, not a type name.
  enum {
      nst_init=0,     // nothing consumed yet
      nst_sign,       // consumed a leading '-'
      nst_mantissa,   // inside integer digits that started with 1-9
      nst_frac0,      // consumed a leading '0'; only '.', 'e' or 'E' may extend it
      nst_frac1,      // consumed '.'; at least one digit must follow
      nst_frac,       // inside fractional digits
      nst_esign,      // consumed 'e' / 'E'; a sign or digit must follow
      nst_edigit      // inside the exponent (after its sign or first digit)
  } JsonNumState;
  37. typedef struct {
  38. ErlNifEnv* env;
  39. jiffy_st* atoms;
  40. ERL_NIF_TERM arg;
  41. ErlNifBinary bin;
  42. int is_partial;
  43. char* p;
  44. unsigned char* u;
  45. int i;
  46. int len;
  47. char* st_data;
  48. int st_size;
  49. int st_top;
  50. } Decoder;
  51. void
  52. dec_init(Decoder* d, ErlNifEnv* env, ERL_NIF_TERM arg, ErlNifBinary* bin)
  53. {
  54. int i;
  55. d->env = env;
  56. d->atoms = enif_priv_data(env);
  57. d->arg = arg;
  58. d->is_partial = 0;
  59. d->p = (char*) bin->data;
  60. d->u = bin->data;
  61. d->len = bin->size;
  62. d->i = 0;
  63. d->st_data = (char*) enif_alloc(STACK_SIZE_INC * sizeof(char));
  64. d->st_size = STACK_SIZE_INC;
  65. d->st_top = 0;
  66. for(i = 0; i < d->st_size; i++) {
  67. d->st_data[i] = st_invalid;
  68. }
  69. d->st_data[0] = st_value;
  70. d->st_top++;
  71. }
  72. void
  73. dec_destroy(Decoder* d)
  74. {
  75. if(d->st_data != NULL) {
  76. enif_free(d->st_data);
  77. }
  78. }
  79. ERL_NIF_TERM
  80. dec_error(Decoder* d, const char* atom)
  81. {
  82. ERL_NIF_TERM pos = enif_make_int(d->env, d->i+1);
  83. ERL_NIF_TERM msg = make_atom(d->env, atom);
  84. ERL_NIF_TERM ret = enif_make_tuple2(d->env, pos, msg);
  85. return enif_make_tuple2(d->env, d->atoms->atom_error, ret);
  86. }
  87. char
  88. dec_curr(Decoder* d)
  89. {
  90. return d->st_data[d->st_top-1];
  91. }
  92. int
  93. dec_top(Decoder* d)
  94. {
  95. return d->st_top;
  96. }
  97. void
  98. dec_push(Decoder* d, char val)
  99. {
  100. char* tmp;
  101. int new_sz;
  102. int i;
  103. if(d->st_top >= d->st_size) {
  104. new_sz = d->st_size + STACK_SIZE_INC;
  105. tmp = (char*) enif_alloc(new_sz * sizeof(char));
  106. memcpy(tmp, d->st_data, d->st_size * sizeof(char));
  107. enif_free(d->st_data);
  108. d->st_data = tmp;
  109. d->st_size = new_sz;
  110. for(i = d->st_top; i < d->st_size; i++) {
  111. d->st_data[i] = st_invalid;
  112. }
  113. }
  114. d->st_data[d->st_top++] = val;
  115. }
  116. void
  117. dec_pop(Decoder* d, char val)
  118. {
  119. assert(d->st_data[d->st_top-1] == val && "popped invalid state.");
  120. d->st_data[d->st_top-1] = st_invalid;
  121. d->st_top--;
  122. }
  123. int
  124. dec_string(Decoder* d, ERL_NIF_TERM* value)
  125. {
  126. int has_escape = 0;
  127. int num_escapes = 0;
  128. int st;
  129. int ulen;
  130. int ui;
  131. int hi;
  132. int lo;
  133. char* chrbuf;
  134. int chrpos;
  135. if(d->p[d->i] != '\"') {
  136. return 0;
  137. }
  138. d->i++;
  139. st = d->i;
  140. while(d->i < d->len) {
  141. if(d->u[d->i] < 0x20) {
  142. return 0;
  143. } else if(d->p[d->i] == '\"') {
  144. d->i++;
  145. goto parse;
  146. } else if(d->p[d->i] == '\\') {
  147. if(d->i+1 >= d->len) {
  148. return 0;
  149. }
  150. has_escape = 1;
  151. num_escapes += 1;
  152. d->i++;
  153. switch(d->p[d->i]) {
  154. case '\"':
  155. case '\\':
  156. case '/':
  157. case 'b':
  158. case 'f':
  159. case 'n':
  160. case 'r':
  161. case 't':
  162. d->i++;
  163. break;
  164. case 'u':
  165. hi = 0;
  166. lo = 0;
  167. d->i++;
  168. if(d->i + 4 >= d->len) {
  169. return 0;
  170. }
  171. hi = int_from_hex(&(d->u[d->i]));
  172. if(hi < 0) {
  173. return 0;
  174. }
  175. d->i += 4;
  176. if(hi >= 0xD800 && hi < 0xDC00) {
  177. if(d->i + 6 >= d->len) {
  178. return 0;
  179. }
  180. if(d->p[d->i++] != '\\') {
  181. return 0;
  182. } else if(d->p[d->i++] != 'u') {
  183. return 0;
  184. }
  185. lo = int_from_hex(&(d->u[d->i]));
  186. if(lo < 0) {
  187. return 0;
  188. }
  189. hi = unicode_from_pair(hi, lo);
  190. if(hi < 0) {
  191. return 0;
  192. }
  193. }
  194. hi = utf8_len(hi);
  195. if(hi < 0) {
  196. return 0;
  197. }
  198. if(lo == 0) {
  199. num_escapes += 5 - hi;
  200. } else {
  201. num_escapes += 11 - hi;
  202. }
  203. break;
  204. default:
  205. return 0;
  206. }
  207. } else if(d->u[d->i] < 0x80) {
  208. d->i++;
  209. } else {
  210. ulen = utf8_validate(&(d->u[d->i]), d->len - d->i);
  211. if(ulen < 0) {
  212. return 0;
  213. }
  214. d->i += ulen;
  215. }
  216. }
  217. parse:
  218. if(d->p[d->i-1] != '\"') {
  219. return 0;
  220. }
  221. if(!has_escape) {
  222. *value = enif_make_sub_binary(d->env, d->arg, st, (d->i - st - 1));
  223. return 1;
  224. }
  225. hi = 0;
  226. lo = 0;
  227. ulen = (d->i - 1) - st - num_escapes;
  228. chrbuf = (char*) enif_make_new_binary(d->env, ulen, value);
  229. chrpos = 0;
  230. ui = st;
  231. while(ui < d->i - 1) {
  232. if(d->p[ui] != '\\') {
  233. chrbuf[chrpos++] = d->p[ui++];
  234. continue;
  235. }
  236. ui++;
  237. switch(d->p[ui]) {
  238. case '\"':
  239. case '\\':
  240. case '/':
  241. chrbuf[chrpos++] = d->p[ui];
  242. ui++;
  243. break;
  244. case 'b':
  245. chrbuf[chrpos++] = '\b';
  246. ui++;
  247. break;
  248. case 'f':
  249. chrbuf[chrpos++] = '\f';
  250. ui++;
  251. break;
  252. case 'n':
  253. chrbuf[chrpos++] = '\n';
  254. ui++;
  255. break;
  256. case 'r':
  257. chrbuf[chrpos++] = '\r';
  258. ui++;
  259. break;
  260. case 't':
  261. chrbuf[chrpos++] = '\t';
  262. ui++;
  263. break;
  264. case 'u':
  265. ui++;
  266. hi = int_from_hex(&(d->u[ui]));
  267. if(hi < 0) {
  268. return 0;
  269. }
  270. if(hi >= 0xD800 && hi < 0xDC00) {
  271. lo = int_from_hex(&(d->u[ui+6]));
  272. if(lo < 0) {
  273. return 0;
  274. }
  275. hi = unicode_from_pair(hi, lo);
  276. ui += 10;
  277. } else {
  278. ui += 4;
  279. }
  280. hi = unicode_to_utf8(hi, (unsigned char*) chrbuf+chrpos);
  281. if(hi < 0) {
  282. return 0;
  283. }
  284. chrpos += hi;
  285. break;
  286. default:
  287. return 0;
  288. }
  289. }
  290. return 1;
  291. }
  292. int
  293. dec_number(Decoder* d, ERL_NIF_TERM* value)
  294. {
  295. ERL_NIF_TERM num_type = d->atoms->atom_error;
  296. char state = nst_init;
  297. char nbuf[NUM_BUF_LEN];
  298. int st = d->i;
  299. int has_frac = 0;
  300. int has_exp = 0;
  301. double dval;
  302. long lval;
  303. while(d->i < d->len) {
  304. switch(state) {
  305. case nst_init:
  306. switch(d->p[d->i]) {
  307. case '-':
  308. state = nst_sign;
  309. d->i++;
  310. break;
  311. case '0':
  312. state = nst_frac0;
  313. d->i++;
  314. break;
  315. case '1':
  316. case '2':
  317. case '3':
  318. case '4':
  319. case '5':
  320. case '6':
  321. case '7':
  322. case '8':
  323. case '9':
  324. state = nst_mantissa;
  325. d->i++;
  326. break;
  327. default:
  328. return 0;
  329. }
  330. break;
  331. case nst_sign:
  332. switch(d->p[d->i]) {
  333. case '0':
  334. state = nst_frac0;
  335. d->i++;
  336. break;
  337. case '1':
  338. case '2':
  339. case '3':
  340. case '4':
  341. case '5':
  342. case '6':
  343. case '7':
  344. case '8':
  345. case '9':
  346. state = nst_mantissa;
  347. d->i++;
  348. break;
  349. default:
  350. return 0;
  351. }
  352. break;
  353. case nst_mantissa:
  354. switch(d->p[d->i]) {
  355. case '.':
  356. state = nst_frac1;
  357. d->i++;
  358. break;
  359. case 'e':
  360. case 'E':
  361. state = nst_esign;
  362. d->i++;
  363. break;
  364. case '0':
  365. case '1':
  366. case '2':
  367. case '3':
  368. case '4':
  369. case '5':
  370. case '6':
  371. case '7':
  372. case '8':
  373. case '9':
  374. d->i++;
  375. break;
  376. default:
  377. goto parse;
  378. }
  379. break;
  380. case nst_frac0:
  381. switch(d->p[d->i]) {
  382. case '.':
  383. state = nst_frac1;
  384. d->i++;
  385. break;
  386. case 'e':
  387. case 'E':
  388. state = nst_esign;
  389. d->i++;
  390. break;
  391. default:
  392. goto parse;
  393. }
  394. break;
  395. case nst_frac1:
  396. has_frac = 1;
  397. switch(d->p[d->i]) {
  398. case '0':
  399. case '1':
  400. case '2':
  401. case '3':
  402. case '4':
  403. case '5':
  404. case '6':
  405. case '7':
  406. case '8':
  407. case '9':
  408. state = nst_frac;
  409. d->i++;
  410. break;
  411. default:
  412. goto parse;
  413. }
  414. break;
  415. case nst_frac:
  416. switch(d->p[d->i]) {
  417. case 'e':
  418. case 'E':
  419. state = nst_esign;
  420. d->i++;
  421. break;
  422. case '0':
  423. case '1':
  424. case '2':
  425. case '3':
  426. case '4':
  427. case '5':
  428. case '6':
  429. case '7':
  430. case '8':
  431. case '9':
  432. d->i++;
  433. break;
  434. default:
  435. goto parse;
  436. }
  437. break;
  438. case nst_esign:
  439. has_exp = 1;
  440. switch(d->p[d->i]) {
  441. case '-':
  442. case '+':
  443. case '0':
  444. case '1':
  445. case '2':
  446. case '3':
  447. case '4':
  448. case '5':
  449. case '6':
  450. case '7':
  451. case '8':
  452. case '9':
  453. state = nst_edigit;
  454. d->i++;
  455. break;
  456. default:
  457. return 0;
  458. }
  459. break;
  460. case nst_edigit:
  461. switch(d->p[d->i]) {
  462. case '0':
  463. case '1':
  464. case '2':
  465. case '3':
  466. case '4':
  467. case '5':
  468. case '6':
  469. case '7':
  470. case '8':
  471. case '9':
  472. d->i++;
  473. break;
  474. default:
  475. goto parse;
  476. }
  477. break;
  478. default:
  479. return 0;
  480. }
  481. }
  482. parse:
  483. switch(state) {
  484. case nst_init:
  485. case nst_sign:
  486. case nst_frac1:
  487. case nst_esign:
  488. return 0;
  489. default:
  490. break;
  491. }
  492. errno = 0;
  493. if(d->i - st < NUM_BUF_LEN) {
  494. memset(nbuf, 0, NUM_BUF_LEN);
  495. memcpy(nbuf, &(d->p[st]), d->i - st);
  496. if(has_frac || has_exp) {
  497. dval = strtod(nbuf, NULL);
  498. if(errno != ERANGE) {
  499. *value = enif_make_double(d->env, dval);
  500. return 1;
  501. }
  502. } else {
  503. lval = strtol(nbuf, NULL, 10);
  504. if(errno != ERANGE) {
  505. *value = enif_make_int64(d->env, lval);
  506. return 1;
  507. }
  508. }
  509. }
  510. if(!has_frac && !has_exp) {
  511. num_type = d->atoms->atom_bignum;
  512. } else if(!has_frac && has_exp) {
  513. num_type = d->atoms->atom_bignum_e;
  514. } else {
  515. num_type = d->atoms->atom_bigdbl;
  516. }
  517. d->is_partial = 1;
  518. *value = enif_make_sub_binary(d->env, d->arg, st, d->i - st);
  519. *value = enif_make_tuple2(d->env, num_type, *value);
  520. return 1;
  521. }
  522. ERL_NIF_TERM
  523. make_object(ErlNifEnv* env, ERL_NIF_TERM pairs)
  524. {
  525. ERL_NIF_TERM ret = enif_make_list(env, 0);
  526. ERL_NIF_TERM key, val;
  527. while(enif_get_list_cell(env, pairs, &val, &pairs)) {
  528. if(!enif_get_list_cell(env, pairs, &key, &pairs)) {
  529. assert(0 == 1 && "Unbalanced object pairs.");
  530. }
  531. val = enif_make_tuple2(env, key, val);
  532. ret = enif_make_list_cell(env, val, ret);
  533. }
  534. return enif_make_tuple1(env, ret);
  535. }
  536. ERL_NIF_TERM
  537. make_array(ErlNifEnv* env, ERL_NIF_TERM list)
  538. {
  539. ERL_NIF_TERM ret = enif_make_list(env, 0);
  540. ERL_NIF_TERM item;
  541. while(enif_get_list_cell(env, list, &item, &list)) {
  542. ret = enif_make_list_cell(env, item, ret);
  543. }
  544. return ret;
  545. }
  546. ERL_NIF_TERM
  547. decode(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
  548. {
  549. Decoder dec;
  550. Decoder* d = &dec;
  551. ErlNifBinary bin;
  552. ERL_NIF_TERM objs = enif_make_list(env, 0);
  553. ERL_NIF_TERM curr = enif_make_list(env, 0);
  554. ERL_NIF_TERM val;
  555. ERL_NIF_TERM ret;
  556. if(argc != 1) {
  557. return enif_make_badarg(env);
  558. } else if(!enif_inspect_binary(env, argv[0], &bin)) {
  559. return enif_make_badarg(env);
  560. }
  561. dec_init(d, env, argv[0], &bin);
  562. //fprintf(stderr, "Parsing:\r\n");
  563. while(d->i < bin.size) {
  564. //fprintf(stderr, "state: %d\r\n", dec_curr(d));
  565. switch(dec_curr(d)) {
  566. case st_value:
  567. switch(d->p[d->i]) {
  568. case ' ':
  569. case '\n':
  570. case '\r':
  571. case '\t':
  572. d->i++;
  573. break;
  574. case 'n':
  575. if(d->i + 3 >= d->len) {
  576. ret = dec_error(d, "invalid_literal");
  577. goto done;
  578. }
  579. if(memcmp(&(d->p[d->i]), "null", 4) != 0) {
  580. ret = dec_error(d, "invalid_literal");
  581. goto done;
  582. }
  583. val = d->atoms->atom_null;
  584. dec_pop(d, st_value);
  585. d->i += 4;
  586. break;
  587. case 't':
  588. if(d->i + 3 >= d->len) {
  589. ret = dec_error(d, "invalid_literal");
  590. goto done;
  591. }
  592. if(memcmp(&(d->p[d->i]), "true", 4) != 0) {
  593. ret = dec_error(d, "invalid_literal");
  594. goto done;
  595. }
  596. val = d->atoms->atom_true;
  597. dec_pop(d, st_value);
  598. d->i += 4;
  599. break;
  600. case 'f':
  601. if(d->i + 4 >= bin.size) {
  602. ret = dec_error(d, "invalid_literal");
  603. goto done;
  604. }
  605. if(memcmp(&(d->p[d->i]), "false", 5) != 0) {
  606. ret = dec_error(d, "invalid_literal");
  607. goto done;
  608. }
  609. val = d->atoms->atom_false;
  610. dec_pop(d, st_value);
  611. d->i += 5;
  612. break;
  613. case '\"':
  614. if(!dec_string(d, &val)) {
  615. ret = dec_error(d, "invalid_string");
  616. goto done;
  617. }
  618. dec_pop(d, st_value);
  619. break;
  620. case '-':
  621. case '0':
  622. case '1':
  623. case '2':
  624. case '3':
  625. case '4':
  626. case '5':
  627. case '6':
  628. case '7':
  629. case '8':
  630. case '9':
  631. if(!dec_number(d, &val)) {
  632. ret = dec_error(d, "invalid_number");
  633. goto done;
  634. }
  635. dec_pop(d, st_value);
  636. break;
  637. case '{':
  638. dec_push(d, st_object);
  639. dec_push(d, st_key);
  640. objs = enif_make_list_cell(env, curr, objs);
  641. curr = enif_make_list(env, 0);
  642. d->i++;
  643. break;
  644. case '[':
  645. dec_push(d, st_array);
  646. dec_push(d, st_value);
  647. objs = enif_make_list_cell(env, curr, objs);
  648. curr = enif_make_list(env, 0);
  649. d->i++;
  650. break;
  651. case ']':
  652. if(!enif_is_empty_list(env, curr)) {
  653. ret = dec_error(d, "invalid_json");
  654. goto done;
  655. }
  656. dec_pop(d, st_value);
  657. if(dec_curr(d) != st_array) {
  658. ret = dec_error(d, "invalid_json");
  659. goto done;
  660. }
  661. dec_pop(d, st_array);
  662. dec_pop(d, st_value);
  663. val = curr; // curr is []
  664. if(!enif_get_list_cell(env, objs, &curr, &objs)) {
  665. ret = dec_error(d, "internal_error");
  666. goto done;
  667. }
  668. d->i++;
  669. break;
  670. default:
  671. ret = dec_error(d, "invalid_json");
  672. goto done;
  673. }
  674. if(dec_top(d) == 0) {
  675. dec_push(d, st_done);
  676. } else if(dec_curr(d) != st_value && dec_curr(d) != st_key) {
  677. dec_push(d, st_comma);
  678. curr = enif_make_list_cell(env, val, curr);
  679. }
  680. break;
  681. case st_key:
  682. switch(d->p[d->i]) {
  683. case ' ':
  684. case '\n':
  685. case '\r':
  686. case '\t':
  687. d->i++;
  688. break;
  689. case '\"':
  690. if(!dec_string(d, &val)) {
  691. ret = dec_error(d, "invalid_string");
  692. goto done;
  693. }
  694. dec_pop(d, st_key);
  695. dec_push(d, st_colon);
  696. curr = enif_make_list_cell(env, val, curr);
  697. break;
  698. case '}':
  699. if(!enif_is_empty_list(env, curr)) {
  700. ret = dec_error(d, "invalid_json");
  701. goto done;
  702. }
  703. dec_pop(d, st_key);
  704. dec_pop(d, st_object);
  705. dec_pop(d, st_value);
  706. val = enif_make_tuple1(env, curr);
  707. if(!enif_get_list_cell(env, objs, &curr, &objs)) {
  708. ret = dec_error(d, "internal_error");
  709. goto done;
  710. }
  711. if(dec_top(d) == 0) {
  712. dec_push(d, st_done);
  713. } else {
  714. dec_push(d, st_comma);
  715. curr = enif_make_list_cell(env, val, curr);
  716. }
  717. d->i++;
  718. break;
  719. default:
  720. ret = dec_error(d, "invalid_json");
  721. goto done;
  722. }
  723. break;
  724. case st_colon:
  725. switch(d->p[d->i]) {
  726. case ' ':
  727. case '\n':
  728. case '\r':
  729. case '\t':
  730. d->i++;
  731. break;
  732. case ':':
  733. dec_pop(d, st_colon);
  734. dec_push(d, st_value);
  735. d->i++;
  736. break;
  737. default:
  738. ret = dec_error(d, "invalid_json");
  739. goto done;
  740. }
  741. break;
  742. case st_comma:
  743. switch(d->p[d->i]) {
  744. case ' ':
  745. case '\n':
  746. case '\r':
  747. case '\t':
  748. d->i++;
  749. break;
  750. case ',':
  751. dec_pop(d, st_comma);
  752. switch(dec_curr(d)) {
  753. case st_object:
  754. dec_push(d, st_key);
  755. break;
  756. case st_array:
  757. dec_push(d, st_value);
  758. break;
  759. default:
  760. ret = dec_error(d, "internal_error");
  761. goto done;
  762. }
  763. d->i++;
  764. break;
  765. case '}':
  766. dec_pop(d, st_comma);
  767. if(dec_curr(d) != st_object) {
  768. ret = dec_error(d, "invalid_json");
  769. goto done;
  770. }
  771. dec_pop(d, st_object);
  772. dec_pop(d, st_value);
  773. val = make_object(env, curr);
  774. if(!enif_get_list_cell(env, objs, &curr, &objs)) {
  775. ret = dec_error(d, "internal_error");
  776. goto done;
  777. }
  778. if(dec_top(d) > 0) {
  779. dec_push(d, st_comma);
  780. curr = enif_make_list_cell(env, val, curr);
  781. } else {
  782. dec_push(d, st_done);
  783. }
  784. d->i++;
  785. break;
  786. case ']':
  787. dec_pop(d, st_comma);
  788. if(dec_curr(d) != st_array) {
  789. ret = dec_error(d, "invalid_json");
  790. goto done;
  791. }
  792. dec_pop(d, st_array);
  793. dec_pop(d, st_value);
  794. val = make_array(env, curr);
  795. if(!enif_get_list_cell(env, objs, &curr, &objs)) {
  796. ret = dec_error(d, "internal_error");
  797. goto done;
  798. }
  799. if(dec_top(d) > 0) {
  800. dec_push(d, st_comma);
  801. curr = enif_make_list_cell(env, val, curr);
  802. } else {
  803. dec_push(d, st_done);
  804. }
  805. d->i++;
  806. break;
  807. default:
  808. ret = dec_error(d, "invalid_json");
  809. goto done;
  810. }
  811. break;
  812. case st_done:
  813. switch(d->p[d->i]) {
  814. case ' ':
  815. case '\n':
  816. case '\r':
  817. case '\t':
  818. d->i++;
  819. break;
  820. default:
  821. ret = dec_error(d, "invalid_trailing_data");
  822. goto done;
  823. }
  824. break;
  825. default:
  826. ret = dec_error(d, "invalid_internal_state");
  827. goto done;
  828. }
  829. }
  830. if(dec_curr(d) != st_done) {
  831. ret = dec_error(d, "truncated_json");
  832. } else if(d->is_partial) {
  833. ret = enif_make_tuple2(env, d->atoms->atom_partial, val);
  834. } else {
  835. ret = val;
  836. }
  837. done:
  838. dec_destroy(d);
  839. return ret;
  840. }