You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

909 lines
26 KiB

преди 13 години
преди 13 години
преди 13 години
преди 13 години
преди 13 години
преди 13 години
преди 13 години
  1. // This file is part of Jiffy released under the MIT license.
  2. // See the LICENSE file for more information.
  3. #include <assert.h>
  4. #include <errno.h>
  5. #include <stdio.h>
  6. #include <stdlib.h>
  7. #include <string.h>
  8. #include "erl_nif.h"
  9. #include "jiffy.h"
  10. #define U(c) ((unsigned char) (c))
  11. #define ERROR(i, msg) make_error(st, env, msg)
  12. #define STACK_SIZE_INC 64
  13. #define NUM_BUF_LEN 32
  /*
   * Parser states stored (as chars) on the decoder's state stack.
   *
   * NOTE(review): there is no `typedef` here, so this statement declares a
   * file-scope variable called JsonState of an anonymous enum type rather
   * than a type name. Only the enumerators are used in this file.
   */
  enum {
      st_value   = 0,   /* expecting a JSON value */
      st_object  = 1,   /* marker: inside an object */
      st_array   = 2,   /* marker: inside an array */
      st_key     = 3,   /* expecting an object key (or '}' for an empty object) */
      st_colon   = 4,   /* expecting the ':' after an object key */
      st_comma   = 5,   /* expecting ',' or the closing '}' / ']' */
      st_done    = 6,   /* top-level value finished; only whitespace may follow */
      st_invalid = 7    /* filler for unused stack slots */
  } JsonState;
  /*
   * States of the number scanner in dec_number().
   *
   * NOTE(review): as with JsonState, the missing `typedef` makes JsonNumState
   * a file-scope variable of an anonymous enum type, not a type name.
   */
  enum {
      nst_init     = 0,   /* nothing consumed yet */
      nst_sign     = 1,   /* leading '-' consumed */
      nst_mantissa = 2,   /* inside integer digits that started with 1-9 */
      nst_frac0    = 3,   /* a leading '0' consumed */
      nst_frac1    = 4,   /* '.' consumed; a digit must follow */
      nst_frac     = 5,   /* inside fraction digits */
      nst_esign    = 6,   /* 'e' / 'E' consumed; sign or digit must follow */
      nst_edigit   = 7    /* inside the exponent (after its sign or first digit) */
  } JsonNumState;
  34. typedef struct {
  35. ErlNifEnv* env;
  36. jiffy_st* atoms;
  37. ERL_NIF_TERM arg;
  38. ErlNifBinary bin;
  39. int is_partial;
  40. char* p;
  41. unsigned char* u;
  42. int i;
  43. int len;
  44. char* st_data;
  45. int st_size;
  46. int st_top;
  47. } Decoder;
  48. void
  49. dec_init(Decoder* d, ErlNifEnv* env, ERL_NIF_TERM arg, ErlNifBinary* bin)
  50. {
  51. int i;
  52. d->env = env;
  53. d->atoms = enif_priv_data(env);
  54. d->arg = arg;
  55. d->is_partial = 0;
  56. d->p = (char*) bin->data;
  57. d->u = bin->data;
  58. d->len = bin->size;
  59. d->i = 0;
  60. d->st_data = (char*) enif_alloc(STACK_SIZE_INC * sizeof(char));
  61. d->st_size = STACK_SIZE_INC;
  62. d->st_top = 0;
  63. for(i = 0; i < d->st_size; i++) {
  64. d->st_data[i] = st_invalid;
  65. }
  66. d->st_data[0] = st_value;
  67. d->st_top++;
  68. }
  69. void
  70. dec_destroy(Decoder* d)
  71. {
  72. if(d->st_data != NULL) {
  73. enif_free(d->st_data);
  74. }
  75. }
  76. ERL_NIF_TERM
  77. dec_error(Decoder* d, const char* atom)
  78. {
  79. ERL_NIF_TERM pos = enif_make_int(d->env, d->i+1);
  80. ERL_NIF_TERM msg = make_atom(d->env, atom);
  81. ERL_NIF_TERM ret = enif_make_tuple2(d->env, pos, msg);
  82. return enif_make_tuple2(d->env, d->atoms->atom_error, ret);
  83. }
  84. char
  85. dec_curr(Decoder* d)
  86. {
  87. return d->st_data[d->st_top-1];
  88. }
  89. int
  90. dec_top(Decoder* d)
  91. {
  92. return d->st_top;
  93. }
  94. void
  95. dec_push(Decoder* d, char val)
  96. {
  97. char* tmp;
  98. int new_sz;
  99. int i;
  100. if(d->st_top >= d->st_size) {
  101. new_sz = d->st_size + STACK_SIZE_INC;
  102. tmp = (char*) enif_alloc(new_sz * sizeof(char));
  103. memcpy(tmp, d->st_data, d->st_size * sizeof(char));
  104. enif_free(d->st_data);
  105. d->st_data = tmp;
  106. d->st_size = new_sz;
  107. for(i = d->st_top; i < d->st_size; i++) {
  108. d->st_data[i] = st_invalid;
  109. }
  110. }
  111. d->st_data[d->st_top++] = val;
  112. }
  113. void
  114. dec_pop(Decoder* d, char val)
  115. {
  116. assert(d->st_data[d->st_top-1] == val && "popped invalid state.");
  117. d->st_data[d->st_top-1] = st_invalid;
  118. d->st_top--;
  119. }
  120. int
  121. dec_string(Decoder* d, ERL_NIF_TERM* value)
  122. {
  123. int has_escape = 0;
  124. int num_escapes = 0;
  125. int st;
  126. int ulen;
  127. int ui;
  128. int hi;
  129. int lo;
  130. char* chrbuf;
  131. int chrpos;
  132. if(d->p[d->i] != '\"') {
  133. return 0;
  134. }
  135. d->i++;
  136. st = d->i;
  137. while(d->i < d->len) {
  138. if(d->u[d->i] < 0x20) {
  139. return 0;
  140. } else if(d->p[d->i] == '\"') {
  141. d->i++;
  142. goto parse;
  143. } else if(d->p[d->i] == '\\') {
  144. if(d->i+1 >= d->len) {
  145. return 0;
  146. }
  147. has_escape = 1;
  148. num_escapes += 1;
  149. d->i++;
  150. switch(d->p[d->i]) {
  151. case '\"':
  152. case '\\':
  153. case '/':
  154. case 'b':
  155. case 'f':
  156. case 'n':
  157. case 'r':
  158. case 't':
  159. d->i++;
  160. break;
  161. case 'u':
  162. hi = 0;
  163. lo = 0;
  164. d->i++;
  165. if(d->i + 4 >= d->len) {
  166. return 0;
  167. }
  168. hi = int_from_hex(&(d->u[d->i]));
  169. if(hi < 0) {
  170. return 0;
  171. }
  172. d->i += 4;
  173. if(hi >= 0xD800 && hi < 0xDC00) {
  174. if(d->i + 6 >= d->len) {
  175. return 0;
  176. }
  177. if(d->p[d->i++] != '\\') {
  178. return 0;
  179. } else if(d->p[d->i++] != 'u') {
  180. return 0;
  181. }
  182. lo = int_from_hex(&(d->u[d->i]));
  183. if(lo < 0) {
  184. return 0;
  185. }
  186. hi = unicode_from_pair(hi, lo);
  187. if(hi < 0) {
  188. return 0;
  189. }
  190. }
  191. hi = utf8_len(hi);
  192. if(hi < 0) {
  193. return 0;
  194. }
  195. if(lo == 0) {
  196. num_escapes += 5 - hi;
  197. } else {
  198. num_escapes += 11 - hi;
  199. }
  200. break;
  201. default:
  202. return 0;
  203. }
  204. } else if(d->u[d->i] < 0x80) {
  205. d->i++;
  206. } else {
  207. ulen = utf8_validate(&(d->u[d->i]), d->len - d->i);
  208. if(ulen < 0) {
  209. return 0;
  210. }
  211. d->i += ulen;
  212. }
  213. }
  214. parse:
  215. if(d->p[d->i-1] != '\"') {
  216. return 0;
  217. }
  218. if(!has_escape) {
  219. *value = enif_make_sub_binary(d->env, d->arg, st, (d->i - st - 1));
  220. return 1;
  221. }
  222. hi = 0;
  223. lo = 0;
  224. ulen = (d->i - 1) - st - num_escapes;
  225. chrbuf = (char*) enif_make_new_binary(d->env, ulen, value);
  226. chrpos = 0;
  227. ui = st;
  228. while(ui < d->i - 1) {
  229. if(d->p[ui] != '\\') {
  230. chrbuf[chrpos++] = d->p[ui++];
  231. continue;
  232. }
  233. ui++;
  234. switch(d->p[ui]) {
  235. case '\"':
  236. case '\\':
  237. case '/':
  238. chrbuf[chrpos++] = d->p[ui];
  239. ui++;
  240. break;
  241. case 'b':
  242. chrbuf[chrpos++] = '\b';
  243. ui++;
  244. break;
  245. case 'f':
  246. chrbuf[chrpos++] = '\f';
  247. ui++;
  248. break;
  249. case 'n':
  250. chrbuf[chrpos++] = '\n';
  251. ui++;
  252. break;
  253. case 'r':
  254. chrbuf[chrpos++] = '\r';
  255. ui++;
  256. break;
  257. case 't':
  258. chrbuf[chrpos++] = '\t';
  259. ui++;
  260. break;
  261. case 'u':
  262. ui++;
  263. hi = int_from_hex(&(d->u[ui]));
  264. if(hi < 0) {
  265. return 0;
  266. }
  267. if(hi >= 0xD800 && hi < 0xDC00) {
  268. lo = int_from_hex(&(d->u[ui+6]));
  269. if(lo < 0) {
  270. return 0;
  271. }
  272. hi = unicode_from_pair(hi, lo);
  273. ui += 10;
  274. } else {
  275. ui += 4;
  276. }
  277. hi = unicode_to_utf8(hi, (unsigned char*) chrbuf+chrpos);
  278. if(hi < 0) {
  279. return 0;
  280. }
  281. chrpos += hi;
  282. break;
  283. default:
  284. return 0;
  285. }
  286. }
  287. return 1;
  288. }
  289. int
  290. dec_number(Decoder* d, ERL_NIF_TERM* value)
  291. {
  292. ERL_NIF_TERM num_type = d->atoms->atom_error;
  293. char state = nst_init;
  294. char nbuf[NUM_BUF_LEN];
  295. int st = d->i;
  296. int has_frac = 0;
  297. int has_exp = 0;
  298. double dval;
  299. long lval;
  300. while(d->i < d->len) {
  301. switch(state) {
  302. case nst_init:
  303. switch(d->p[d->i]) {
  304. case '-':
  305. state = nst_sign;
  306. d->i++;
  307. break;
  308. case '0':
  309. state = nst_frac0;
  310. d->i++;
  311. break;
  312. case '1':
  313. case '2':
  314. case '3':
  315. case '4':
  316. case '5':
  317. case '6':
  318. case '7':
  319. case '8':
  320. case '9':
  321. state = nst_mantissa;
  322. d->i++;
  323. break;
  324. default:
  325. return 0;
  326. }
  327. break;
  328. case nst_sign:
  329. switch(d->p[d->i]) {
  330. case '0':
  331. state = nst_frac0;
  332. d->i++;
  333. break;
  334. case '1':
  335. case '2':
  336. case '3':
  337. case '4':
  338. case '5':
  339. case '6':
  340. case '7':
  341. case '8':
  342. case '9':
  343. state = nst_mantissa;
  344. d->i++;
  345. break;
  346. default:
  347. return 0;
  348. }
  349. break;
  350. case nst_mantissa:
  351. switch(d->p[d->i]) {
  352. case '.':
  353. state = nst_frac1;
  354. d->i++;
  355. break;
  356. case 'e':
  357. case 'E':
  358. state = nst_esign;
  359. d->i++;
  360. break;
  361. case '0':
  362. case '1':
  363. case '2':
  364. case '3':
  365. case '4':
  366. case '5':
  367. case '6':
  368. case '7':
  369. case '8':
  370. case '9':
  371. d->i++;
  372. break;
  373. default:
  374. goto parse;
  375. }
  376. break;
  377. case nst_frac0:
  378. switch(d->p[d->i]) {
  379. case '.':
  380. state = nst_frac1;
  381. d->i++;
  382. break;
  383. case 'e':
  384. case 'E':
  385. state = nst_esign;
  386. d->i++;
  387. break;
  388. default:
  389. goto parse;
  390. }
  391. break;
  392. case nst_frac1:
  393. has_frac = 1;
  394. switch(d->p[d->i]) {
  395. case '0':
  396. case '1':
  397. case '2':
  398. case '3':
  399. case '4':
  400. case '5':
  401. case '6':
  402. case '7':
  403. case '8':
  404. case '9':
  405. state = nst_frac;
  406. d->i++;
  407. break;
  408. default:
  409. goto parse;
  410. }
  411. break;
  412. case nst_frac:
  413. switch(d->p[d->i]) {
  414. case 'e':
  415. case 'E':
  416. state = nst_esign;
  417. d->i++;
  418. break;
  419. case '0':
  420. case '1':
  421. case '2':
  422. case '3':
  423. case '4':
  424. case '5':
  425. case '6':
  426. case '7':
  427. case '8':
  428. case '9':
  429. d->i++;
  430. break;
  431. default:
  432. goto parse;
  433. }
  434. break;
  435. case nst_esign:
  436. has_exp = 1;
  437. switch(d->p[d->i]) {
  438. case '-':
  439. case '+':
  440. case '0':
  441. case '1':
  442. case '2':
  443. case '3':
  444. case '4':
  445. case '5':
  446. case '6':
  447. case '7':
  448. case '8':
  449. case '9':
  450. state = nst_edigit;
  451. d->i++;
  452. break;
  453. default:
  454. return 0;
  455. }
  456. break;
  457. case nst_edigit:
  458. switch(d->p[d->i]) {
  459. case '0':
  460. case '1':
  461. case '2':
  462. case '3':
  463. case '4':
  464. case '5':
  465. case '6':
  466. case '7':
  467. case '8':
  468. case '9':
  469. d->i++;
  470. break;
  471. default:
  472. goto parse;
  473. }
  474. break;
  475. default:
  476. return 0;
  477. }
  478. }
  479. parse:
  480. switch(state) {
  481. case nst_init:
  482. case nst_sign:
  483. case nst_frac1:
  484. case nst_esign:
  485. return 0;
  486. default:
  487. break;
  488. }
  489. errno = 0;
  490. if(d->i - st < NUM_BUF_LEN) {
  491. memset(nbuf, 0, NUM_BUF_LEN);
  492. memcpy(nbuf, &(d->p[st]), d->i - st);
  493. if(has_frac || has_exp) {
  494. dval = strtod(nbuf, NULL);
  495. if(errno != ERANGE) {
  496. *value = enif_make_double(d->env, dval);
  497. return 1;
  498. }
  499. } else {
  500. lval = strtol(nbuf, NULL, 10);
  501. if(errno != ERANGE) {
  502. *value = enif_make_int64(d->env, lval);
  503. return 1;
  504. }
  505. }
  506. }
  507. if(!has_frac && !has_exp) {
  508. num_type = d->atoms->atom_bignum;
  509. } else if(has_exp) {
  510. num_type = d->atoms->atom_bignum_e;
  511. } else {
  512. num_type = d->atoms->atom_bigdbl;
  513. }
  514. d->is_partial = 1;
  515. *value = enif_make_sub_binary(d->env, d->arg, st, d->i - st);
  516. *value = enif_make_tuple2(d->env, num_type, *value);
  517. return 1;
  518. }
  519. ERL_NIF_TERM
  520. make_object(ErlNifEnv* env, ERL_NIF_TERM pairs)
  521. {
  522. ERL_NIF_TERM ret = enif_make_list(env, 0);
  523. ERL_NIF_TERM key, val;
  524. while(enif_get_list_cell(env, pairs, &val, &pairs)) {
  525. if(!enif_get_list_cell(env, pairs, &key, &pairs)) {
  526. assert(0 == 1 && "Unbalanced object pairs.");
  527. }
  528. val = enif_make_tuple2(env, key, val);
  529. ret = enif_make_list_cell(env, val, ret);
  530. }
  531. return enif_make_tuple1(env, ret);
  532. }
  533. ERL_NIF_TERM
  534. make_array(ErlNifEnv* env, ERL_NIF_TERM list)
  535. {
  536. ERL_NIF_TERM ret = enif_make_list(env, 0);
  537. ERL_NIF_TERM item;
  538. while(enif_get_list_cell(env, list, &item, &list)) {
  539. ret = enif_make_list_cell(env, item, ret);
  540. }
  541. return ret;
  542. }
  543. ERL_NIF_TERM
  544. decode(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
  545. {
  546. Decoder dec;
  547. Decoder* d = &dec;
  548. ErlNifBinary bin;
  549. ERL_NIF_TERM objs = enif_make_list(env, 0);
  550. ERL_NIF_TERM curr = enif_make_list(env, 0);
  551. ERL_NIF_TERM val;
  552. ERL_NIF_TERM ret;
  553. if(argc != 1) {
  554. return enif_make_badarg(env);
  555. } else if(!enif_inspect_binary(env, argv[0], &bin)) {
  556. return enif_make_badarg(env);
  557. }
  558. dec_init(d, env, argv[0], &bin);
  559. //fprintf(stderr, "Parsing:\r\n");
  560. while(d->i < bin.size) {
  561. //fprintf(stderr, "state: %d\r\n", dec_curr(d));
  562. switch(dec_curr(d)) {
  563. case st_value:
  564. switch(d->p[d->i]) {
  565. case ' ':
  566. case '\n':
  567. case '\r':
  568. case '\t':
  569. d->i++;
  570. break;
  571. case 'n':
  572. if(d->i + 3 >= d->len) {
  573. ret = dec_error(d, "invalid_literal");
  574. goto done;
  575. }
  576. if(memcmp(&(d->p[d->i]), "null", 4) != 0) {
  577. ret = dec_error(d, "invalid_literal");
  578. goto done;
  579. }
  580. val = d->atoms->atom_null;
  581. dec_pop(d, st_value);
  582. d->i += 4;
  583. break;
  584. case 't':
  585. if(d->i + 3 >= d->len) {
  586. ret = dec_error(d, "invalid_literal");
  587. goto done;
  588. }
  589. if(memcmp(&(d->p[d->i]), "true", 4) != 0) {
  590. ret = dec_error(d, "invalid_literal");
  591. goto done;
  592. }
  593. val = d->atoms->atom_true;
  594. dec_pop(d, st_value);
  595. d->i += 4;
  596. break;
  597. case 'f':
  598. if(d->i + 4 >= bin.size) {
  599. ret = dec_error(d, "invalid_literal");
  600. goto done;
  601. }
  602. if(memcmp(&(d->p[d->i]), "false", 5) != 0) {
  603. ret = dec_error(d, "invalid_literal");
  604. goto done;
  605. }
  606. val = d->atoms->atom_false;
  607. dec_pop(d, st_value);
  608. d->i += 5;
  609. break;
  610. case '\"':
  611. if(!dec_string(d, &val)) {
  612. ret = dec_error(d, "invalid_string");
  613. goto done;
  614. }
  615. dec_pop(d, st_value);
  616. break;
  617. case '-':
  618. case '0':
  619. case '1':
  620. case '2':
  621. case '3':
  622. case '4':
  623. case '5':
  624. case '6':
  625. case '7':
  626. case '8':
  627. case '9':
  628. if(!dec_number(d, &val)) {
  629. ret = dec_error(d, "invalid_number");
  630. goto done;
  631. }
  632. dec_pop(d, st_value);
  633. break;
  634. case '{':
  635. dec_push(d, st_object);
  636. dec_push(d, st_key);
  637. objs = enif_make_list_cell(env, curr, objs);
  638. curr = enif_make_list(env, 0);
  639. d->i++;
  640. break;
  641. case '[':
  642. dec_push(d, st_array);
  643. dec_push(d, st_value);
  644. objs = enif_make_list_cell(env, curr, objs);
  645. curr = enif_make_list(env, 0);
  646. d->i++;
  647. break;
  648. case ']':
  649. if(!enif_is_empty_list(env, curr)) {
  650. ret = dec_error(d, "invalid_json");
  651. goto done;
  652. }
  653. dec_pop(d, st_value);
  654. if(dec_curr(d) != st_array) {
  655. ret = dec_error(d, "invalid_json");
  656. goto done;
  657. }
  658. dec_pop(d, st_array);
  659. dec_pop(d, st_value);
  660. val = curr; // curr is []
  661. if(!enif_get_list_cell(env, objs, &curr, &objs)) {
  662. ret = dec_error(d, "internal_error");
  663. goto done;
  664. }
  665. d->i++;
  666. break;
  667. default:
  668. ret = dec_error(d, "invalid_json");
  669. goto done;
  670. }
  671. if(dec_top(d) == 0) {
  672. dec_push(d, st_done);
  673. } else if(dec_curr(d) != st_value && dec_curr(d) != st_key) {
  674. dec_push(d, st_comma);
  675. curr = enif_make_list_cell(env, val, curr);
  676. }
  677. break;
  678. case st_key:
  679. switch(d->p[d->i]) {
  680. case ' ':
  681. case '\n':
  682. case '\r':
  683. case '\t':
  684. d->i++;
  685. break;
  686. case '\"':
  687. if(!dec_string(d, &val)) {
  688. ret = dec_error(d, "invalid_string");
  689. goto done;
  690. }
  691. dec_pop(d, st_key);
  692. dec_push(d, st_colon);
  693. curr = enif_make_list_cell(env, val, curr);
  694. break;
  695. case '}':
  696. if(!enif_is_empty_list(env, curr)) {
  697. ret = dec_error(d, "invalid_json");
  698. goto done;
  699. }
  700. dec_pop(d, st_key);
  701. dec_pop(d, st_object);
  702. dec_pop(d, st_value);
  703. val = enif_make_tuple1(env, curr);
  704. if(!enif_get_list_cell(env, objs, &curr, &objs)) {
  705. ret = dec_error(d, "internal_error");
  706. goto done;
  707. }
  708. if(dec_top(d) == 0) {
  709. dec_push(d, st_done);
  710. } else {
  711. dec_push(d, st_comma);
  712. curr = enif_make_list_cell(env, val, curr);
  713. }
  714. d->i++;
  715. break;
  716. default:
  717. ret = dec_error(d, "invalid_json");
  718. goto done;
  719. }
  720. break;
  721. case st_colon:
  722. switch(d->p[d->i]) {
  723. case ' ':
  724. case '\n':
  725. case '\r':
  726. case '\t':
  727. d->i++;
  728. break;
  729. case ':':
  730. dec_pop(d, st_colon);
  731. dec_push(d, st_value);
  732. d->i++;
  733. break;
  734. default:
  735. ret = dec_error(d, "invalid_json");
  736. goto done;
  737. }
  738. break;
  739. case st_comma:
  740. switch(d->p[d->i]) {
  741. case ' ':
  742. case '\n':
  743. case '\r':
  744. case '\t':
  745. d->i++;
  746. break;
  747. case ',':
  748. dec_pop(d, st_comma);
  749. switch(dec_curr(d)) {
  750. case st_object:
  751. dec_push(d, st_key);
  752. break;
  753. case st_array:
  754. dec_push(d, st_value);
  755. break;
  756. default:
  757. ret = dec_error(d, "internal_error");
  758. goto done;
  759. }
  760. d->i++;
  761. break;
  762. case '}':
  763. dec_pop(d, st_comma);
  764. if(dec_curr(d) != st_object) {
  765. ret = dec_error(d, "invalid_json");
  766. goto done;
  767. }
  768. dec_pop(d, st_object);
  769. dec_pop(d, st_value);
  770. val = make_object(env, curr);
  771. if(!enif_get_list_cell(env, objs, &curr, &objs)) {
  772. ret = dec_error(d, "internal_error");
  773. goto done;
  774. }
  775. if(dec_top(d) > 0) {
  776. dec_push(d, st_comma);
  777. curr = enif_make_list_cell(env, val, curr);
  778. } else {
  779. dec_push(d, st_done);
  780. }
  781. d->i++;
  782. break;
  783. case ']':
  784. dec_pop(d, st_comma);
  785. if(dec_curr(d) != st_array) {
  786. ret = dec_error(d, "invalid_json");
  787. goto done;
  788. }
  789. dec_pop(d, st_array);
  790. dec_pop(d, st_value);
  791. val = make_array(env, curr);
  792. if(!enif_get_list_cell(env, objs, &curr, &objs)) {
  793. ret = dec_error(d, "internal_error");
  794. goto done;
  795. }
  796. if(dec_top(d) > 0) {
  797. dec_push(d, st_comma);
  798. curr = enif_make_list_cell(env, val, curr);
  799. } else {
  800. dec_push(d, st_done);
  801. }
  802. d->i++;
  803. break;
  804. default:
  805. ret = dec_error(d, "invalid_json");
  806. goto done;
  807. }
  808. break;
  809. case st_done:
  810. switch(d->p[d->i]) {
  811. case ' ':
  812. case '\n':
  813. case '\r':
  814. case '\t':
  815. d->i++;
  816. break;
  817. default:
  818. ret = dec_error(d, "invalid_trailing_data");
  819. goto done;
  820. }
  821. break;
  822. default:
  823. ret = dec_error(d, "invalid_internal_state");
  824. goto done;
  825. }
  826. }
  827. if(dec_curr(d) != st_done) {
  828. ret = dec_error(d, "truncated_json");
  829. } else if(d->is_partial) {
  830. ret = enif_make_tuple2(env, d->atoms->atom_partial, val);
  831. } else {
  832. ret = val;
  833. }
  834. done:
  835. dec_destroy(d);
  836. return ret;
  837. }