You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

247 line
6.1 KiB

  1. // This file is part of Jiffy released under the MIT license.
  2. // See the LICENSE file for more information.
  3. #include "jiffy.h"
  4. #include <stdio.h>
  // Lookup table mapping a byte value to its hexadecimal digit value (0-15),
  // or -1 when the byte is not an ASCII hex digit ('0'-'9', 'A'-'F', 'a'-'f').
  //
  // The element type is explicitly signed so the -1 sentinel compares below
  // zero even where plain char is unsigned. All 256 entries are spelled out:
  // entries left to implicit zero-initialization would read as the valid
  // digit 0, which would make bytes 0x80-0xFF look like hex digits.
  static const signed char hexvals[256] = {
      // 0x00 - 0x2F
      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
      // 0x30 - 0x3F: '0'-'9'
       0,  1,  2,  3,  4,  5,  6,  7,  8,  9, -1, -1, -1, -1, -1, -1,
      // 0x40 - 0x5F: 'A'-'F'
      -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
      // 0x60 - 0x7F: 'a'-'f'
      -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
      // 0x80 - 0xFF: never hex digits
      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
  };
  // Uppercase hexadecimal digit characters, indexed by nibble value (0-15).
  static const char hexdigits[16] = {
      '0', '1', '2', '3', '4', '5', '6', '7',
      '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
  };
  21. int
  22. int_from_hex(const unsigned char* p)
  23. {
  24. unsigned char* h = (unsigned char*) p;
  25. int ret;
  26. if(hexvals[*(h+0)] < 0) return -1;
  27. if(hexvals[*(h+1)] < 0) return -1;
  28. if(hexvals[*(h+2)] < 0) return -1;
  29. if(hexvals[*(h+3)] < 0) return -1;
  30. ret = (hexvals[*(h+0)] << 12)
  31. + (hexvals[*(h+1)] << 8)
  32. + (hexvals[*(h+2)] << 4)
  33. + (hexvals[*(h+3)] << 0);
  34. return ret;
  35. }
  36. int
  37. int_to_hex(int val, char* p)
  38. {
  39. if(val < 0 || val > 65535)
  40. return -1;
  41. p[0] = hexdigits[(val >> 12) & 0xF];
  42. p[1] = hexdigits[(val >> 8) & 0xF];
  43. p[2] = hexdigits[(val >> 4) & 0xF];
  44. p[3] = hexdigits[val & 0xF];
  45. return 1;
  46. }
  // Returns how many bytes the UTF-8 encoding of code point c occupies.
  //
  // Yields 1 for values below 0x80, 2 below 0x800, 3 below 0x10000 and 4 up
  // to 0x10FFFF. Returns -1 for the surrogate range 0xD800-0xDFFF and for
  // anything above 0x10FFFF.
  int
  utf8_len(int c)
  {
      if(c < 0x80) {
          return 1;
      }
      if(c < 0x800) {
          return 2;
      }
      if(c >= 0xD800 && c <= 0xDFFF) {
          return -1;
      }
      if(c < 0x10000) {
          return 3;
      }
      return (c <= 0x10FFFF) ? 4 : -1;
  }
  // Returns the number of characters needed for a \uXXXX style escape of c.
  //
  // Values below 0x10000 take one six-character escape; values up to
  // 0x10FFFF take two escapes (12 characters). Returns -1 above 0x10FFFF.
  int
  utf8_esc_len(int c)
  {
      if(c > 0x10FFFF) {
          return -1;
      }
      return (c >= 0x10000) ? 12 : 6;
  }
  // Validates the single UTF-8 encoded sequence that starts at data[0].
  //
  // data: bytes to inspect.
  // size: number of readable bytes at data.
  //
  // Returns the length in bytes (1-4) of the sequence when it is well
  // formed, or -1 when the buffer is empty, the lead byte is invalid, the
  // sequence is truncated, a continuation byte is malformed, the encoding is
  // overlong, or the decoded value is a surrogate or exceeds 0x10FFFF.
  int
  utf8_validate(unsigned char* data, size_t size)
  {
      int ulen = -1;
      int ui;

      // An empty buffer has no lead byte to inspect.
      if(size == 0) {
          return -1;
      }

      // Classify the lead byte to find the expected sequence length.
      if((data[0] & 0x80) == 0x00) {
          ulen = 1;
      } else if((data[0] & 0xE0) == 0xC0) {
          ulen = 2;
      } else if((data[0] & 0xF0) == 0xE0) {
          ulen = 3;
      } else if((data[0] & 0xF8) == 0xF0) {
          ulen = 4;
      }

      // ulen is known to be non-negative before it is converted to size_t.
      if(ulen < 0 || (size_t) ulen > size) {
          return -1;
      }

      // Check each continuation byte.
      for(ui = 1; ui < ulen; ui++) {
          if((data[ui] & 0xC0) != 0x80) {
              return -1;
          }
      }

      // A UTF-8 encoding must use as few bytes as possible, so that we
      // can't do things like encode 't' in three bytes. To check this we
      // ensure that for each of the following bit patterns there is at
      // least one 1 bit in any of the x's:
      //   1: 0yyyyyyy
      //   2: 110xxxxy 10yyyyyy
      //   3: 1110xxxx 10xyyyyy 10yyyyyy
      //   4: 11110xxx 10xxyyyy 10yyyyyy 10yyyyyy
      // ulen == 1 passes by definition.
      if(ulen == 2) {
          if((data[0] & 0x1E) == 0) {
              return -1;
          }
      } else if(ulen == 3) {
          if((data[0] & 0x0F) + (data[1] & 0x20) == 0) {
              return -1;
          }
      } else if(ulen == 4) {
          if((data[0] & 0x07) + (data[1] & 0x30) == 0) {
              return -1;
          }
      }

      // Lastly we need to check some miscellaneous ranges for
      // some of the larger code point values.
      if(ulen >= 3) {
          ui = utf8_to_unicode(data, ulen);
          if(ui < 0) {
              return -1;
          } else if(ui >= 0xD800 && ui <= 0xDFFF) {
              return -1;
          } else if(ui > 0x10FFFF) {
              return -1;
          }
      }

      return ulen;
  }
  // Decodes the UTF-8 sequence starting at buf[0] into a code point.
  //
  // The lead byte selects the sequence length; size must be large enough to
  // hold that many bytes. Continuation bytes are masked with 0x3F but not
  // otherwise checked here.
  //
  // Returns the decoded value, or -1 when the lead byte is not a valid
  // 1-4 byte lead, when size is too small for the sequence, or when a
  // three byte sequence decodes to a surrogate (0xD800-0xDFFF).
  int
  utf8_to_unicode(unsigned char* buf, size_t size)
  {
      const unsigned char lead = buf[0];
      int cp;

      // 0xxxxxxx
      if((lead & 0x80) == 0x00) {
          return (int) lead;
      }

      // 110xxxxy 10yyyyyy
      if((lead & 0xE0) == 0xC0 && size >= 2) {
          return ((lead & 0x1F) << 6) | (buf[1] & 0x3F);
      }

      // 1110xxxx 10xyyyyy 10yyyyyy
      if((lead & 0xF0) == 0xE0 && size >= 3) {
          cp = ((lead & 0x0F) << 12)
             | ((buf[1] & 0x3F) << 6)
             | (buf[2] & 0x3F);
          return (cp >= 0xD800 && cp <= 0xDFFF) ? -1 : cp;
      }

      // 11110xxx 10xxyyyy 10yyyyyy 10yyyyyy
      if((lead & 0xF8) == 0xF0 && size >= 4) {
          return ((lead & 0x07) << 18)
               | ((buf[1] & 0x3F) << 12)
               | ((buf[2] & 0x3F) << 6)
               | (buf[3] & 0x3F);
      }

      return -1;
  }
  // Encodes code point c as UTF-8 into buf.
  //
  // c:   code point to encode.
  // buf: output buffer; up to four bytes are written, no NUL terminator.
  //
  // Returns the number of bytes written (1-4), or -1 without touching buf
  // when c is negative, is a surrogate (0xD800-0xDFFF), or is above
  // 0x10FFFF. The upper bound is inclusive so that U+10FFFF, the last valid
  // code point, is encodable.
  int
  unicode_to_utf8(int c, unsigned char* buf)
  {
      if(c < 0) {
          return -1;
      } else if(c < 0x80) {
          buf[0] = (unsigned char) c;
          return 1;
      } else if(c < 0x800) {
          buf[0] = (unsigned char) (0xC0 | (c >> 6));
          buf[1] = (unsigned char) (0x80 | (c & 0x3F));
          return 2;
      } else if(c < 0x10000) {
          if(c >= 0xD800 && c <= 0xDFFF) {
              return -1;
          }
          buf[0] = (unsigned char) (0xE0 | (c >> 12));
          buf[1] = (unsigned char) (0x80 | ((c >> 6) & 0x3F));
          buf[2] = (unsigned char) (0x80 | (c & 0x3F));
          return 3;
      } else if(c <= 0x10FFFF) {
          buf[0] = (unsigned char) (0xF0 | (c >> 18));
          buf[1] = (unsigned char) (0x80 | ((c >> 12) & 0x3F));
          buf[2] = (unsigned char) (0x80 | ((c >> 6) & 0x3F));
          buf[3] = (unsigned char) (0x80 | (c & 0x3F));
          return 4;
      }
      return -1;
  }
  // Combines a UTF-16 surrogate pair into a single code point.
  //
  // hi must lie in 0xD800-0xDBFF and lo in 0xDC00-0xDFFF; otherwise -1 is
  // returned. On success the result is in the range 0x10000-0x10FFFF.
  int
  unicode_from_pair(int hi, int lo)
  {
      const int hi_ok = (hi >= 0xD800 && hi < 0xDC00);
      const int lo_ok = (lo >= 0xDC00 && lo <= 0xDFFF);

      if(!hi_ok || !lo_ok) {
          return -1;
      }

      // Ten payload bits from each half, offset past the 16-bit range.
      return 0x10000 + (((hi & 0x3FF) << 10) | (lo & 0x3FF));
  }
  // Writes val into p as one or two \uXXXX escape sequences.
  //
  // Values below 0x10000 produce a single six-character escape. Values from
  // 0x10000 to 0x10FFFF are split into a high (0xD800-based) and low
  // (0xDC00-based) surrogate and written as two escapes, 12 characters in
  // total. No NUL terminator is written.
  //
  // Returns the number of characters written (6 or 12), or -1 when val is
  // above 0x10FFFF or int_to_hex rejects a value. In the latter case the
  // leading "\u" has already been stored in p.
  int
  unicode_uescape(int val, char* p)
  {
      int offset;
      int hi_unit;
      int lo_unit;

      if(val > 0x10FFFF) {
          return -1;
      }

      if(val < 0x10000) {
          p[0] = '\\';
          p[1] = 'u';
          return (int_to_hex(val, p + 2) < 0) ? -1 : 6;
      }

      // Split the supplementary code point into its surrogate halves.
      offset = val - 0x10000;
      hi_unit = 0xD800 | ((offset >> 10) & 0x03FF);
      lo_unit = 0xDC00 | (offset & 0x03FF);

      p[0] = '\\';
      p[1] = 'u';
      if(int_to_hex(hi_unit, p + 2) < 0) {
          return -1;
      }

      p[6] = '\\';
      p[7] = 'u';
      if(int_to_hex(lo_unit, p + 8) < 0) {
          return -1;
      }

      return 12;
  }
  224. }