You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

255 lines
6.3 KiB

  1. // This file is part of Jiffy released under the MIT license.
  2. // See the LICENSE file for more information.
  3. #include "jiffy.h"
  4. #include <stdio.h>
  // Lookup table indexed by byte value. An entry holds the numeric value
  // (0-15) of the ASCII hexadecimal digit that byte spells ('0'-'9',
  // 'A'-'F', 'a'-'f'), or 255 when the byte is not a hex digit.
  //
  // All 256 entries are written out on purpose: any entry omitted from the
  // initializer would be zero-initialized and would therefore read as the
  // valid digit 0 instead of the 255 "invalid" marker.
  static const unsigned char hexvals[256] = {
      // 0x00 - 0x2F
      255, 255, 255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255, 255, 255, 255,
      // 0x30 - 0x3F: '0'-'9'
        0,   1,   2,   3,   4,   5,   6,   7,
        8,   9, 255, 255, 255, 255, 255, 255,
      // 0x40 - 0x5F: 'A'-'F'
      255,  10,  11,  12,  13,  14,  15, 255,
      255, 255, 255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255, 255, 255, 255,
      // 0x60 - 0x7F: 'a'-'f'
      255,  10,  11,  12,  13,  14,  15, 255,
      255, 255, 255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255, 255, 255, 255,
      // 0x80 - 0xFF: never hex digits
      255, 255, 255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255, 255, 255, 255
  };
  // Uppercase ASCII character for each 4-bit value, indexed by that value.
  static const char hexdigits[16] = {
      '0', '1', '2', '3', '4', '5', '6', '7',
      '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
  };
  29. int
  30. int_from_hex(const unsigned char* p)
  31. {
  32. unsigned char* h = (unsigned char*) p;
  33. int ret;
  34. if(hexvals[*(h+0)] == 255) return -1;
  35. if(hexvals[*(h+1)] == 255) return -1;
  36. if(hexvals[*(h+2)] == 255) return -1;
  37. if(hexvals[*(h+3)] == 255) return -1;
  38. ret = (hexvals[*(h+0)] << 12)
  39. + (hexvals[*(h+1)] << 8)
  40. + (hexvals[*(h+2)] << 4)
  41. + (hexvals[*(h+3)] << 0);
  42. return ret;
  43. }
  44. int
  45. int_to_hex(int val, char* p)
  46. {
  47. if(val < 0 || val > 65535)
  48. return -1;
  49. p[0] = hexdigits[(val >> 12) & 0xF];
  50. p[1] = hexdigits[(val >> 8) & 0xF];
  51. p[2] = hexdigits[(val >> 4) & 0xF];
  52. p[3] = hexdigits[val & 0xF];
  53. return 1;
  54. }
  // Returns the number of bytes (1 to 4) needed to encode code point c in
  // UTF-8, or -1 when c cannot be encoded: negative values, the surrogate
  // range 0xD800-0xDFFF, and anything above 0x10FFFF.
  int
  utf8_len(int c)
  {
      if(c < 0) {
          // Not a code point; without this check a negative value would be
          // reported as a one-byte encoding.
          return -1;
      }
      if(c < 0x80) {
          return 1;
      }
      if(c < 0x800) {
          return 2;
      }
      if(c < 0x10000) {
          // Surrogates are reserved for UTF-16 and have no UTF-8 form.
          if(c >= 0xD800 && c <= 0xDFFF) {
              return -1;
          }
          return 3;
      }
      if(c <= 0x10FFFF) {
          return 4;
      }
      return -1;
  }
  // Returns the number of characters needed to write code point c as
  // backslash-u escapes: 6 for a single \uXXXX escape (c below 0x10000),
  // 12 for a pair of them (c up to 0x10FFFF).
  //
  // Returns -1 when c is negative or above 0x10FFFF. Negative values are
  // rejected because unicode_uescape cannot write them either.
  int
  utf8_esc_len(int c)
  {
      if(c < 0 || c > 0x10FFFF) {
          return -1;
      }
      return (c < 0x10000) ? 6 : 12;
  }
  // Checks that data starts with one well-formed UTF-8 sequence.
  //
  // data: bytes to examine.
  // size: number of bytes available at data.
  //
  // Returns the length in bytes (1 to 4) of the sequence that starts at
  // data[0], or -1 when: there are no bytes, the lead byte does not start a
  // sequence, the sequence would run past size, a continuation byte is
  // malformed, the encoding is overlong, or the decoded code point is a
  // surrogate or lies above 0x10FFFF.
  int
  utf8_validate(unsigned char* data, size_t size)
  {
      int ulen = -1;
      int ui;

      // With no bytes available even data[0] must not be read.
      if(size == 0) {
          return -1;
      }

      // The lead byte determines the sequence length.
      if((data[0] & 0x80) == 0x00) {
          ulen = 1;
      } else if((data[0] & 0xE0) == 0xC0) {
          ulen = 2;
      } else if((data[0] & 0xF0) == 0xE0) {
          ulen = 3;
      } else if((data[0] & 0xF8) == 0xF0) {
          ulen = 4;
      }

      // ulen is known to be non-negative before it is converted for the
      // comparison against size.
      if(ulen < 0 || (size_t) ulen > size) {
          return -1;
      }

      // Check each continuation byte.
      for(ui = 1; ui < ulen; ui++) {
          if((data[ui] & 0xC0) != 0x80) return -1;
      }

      // A UTF-8 encoding must use as few bytes as possible, so that
      // e.g. 't' cannot be encoded in three bytes. To enforce this
      // it is enough to require at least one 1 bit among the x's of
      // the matching pattern:
      //   1: 0yyyyyyy
      //   2: 110xxxxy 10yyyyyy
      //   3: 1110xxxx 10xyyyyy 10yyyyyy
      //   4: 11110xxx 10xxyyyy 10yyyyyy 10yyyyyy
      // ulen == 1 passes by definition.
      if(ulen == 2) {
          if((data[0] & 0x1E) == 0)
              return -1;
      } else if(ulen == 3) {
          if((data[0] & 0x0F) + (data[1] & 0x20) == 0)
              return -1;
      } else if(ulen == 4) {
          if((data[0] & 0x07) + (data[1] & 0x30) == 0)
              return -1;
      }

      // Lastly, decode the longer sequences and reject code points in
      // the surrogate range or beyond the end of Unicode.
      if(ulen >= 3) {
          ui = utf8_to_unicode(data, (size_t) ulen);
          if(ui < 0) {
              return -1;
          } else if(ui >= 0xD800 && ui <= 0xDFFF) {
              return -1;
          } else if(ui > 0x10FFFF) {
              return -1;
          }
      }

      return ulen;
  }
  // Decodes the UTF-8 sequence that starts at buf[0] into a code point.
  //
  // buf:  bytes to decode.
  // size: number of bytes available at buf.
  //
  // Returns the code point, or -1 when size is 0, when the lead byte does
  // not start a sequence, when the sequence needs more bytes than size
  // provides, or when a three-byte sequence decodes to a surrogate
  // (0xD800-0xDFFF).
  //
  // Only the low six bits of each continuation byte are used; their top
  // bits are not checked here, and neither are overlong encodings or
  // four-byte values above 0x10FFFF.
  int
  utf8_to_unicode(unsigned char* buf, size_t size)
  {
      int ret;

      // With no bytes available even buf[0] must not be read.
      if(size == 0) {
          return -1;
      }

      if((buf[0] & 0x80) == 0x00) {
          // 0xxxxxxx
          ret = (int) buf[0];
      } else if((buf[0] & 0xE0) == 0xC0 && size >= 2) {
          // 110xxxxy 10yyyyyy
          ret = ((buf[0] & 0x1F) << 6)
              | ((buf[1] & 0x3F));
      } else if((buf[0] & 0xF0) == 0xE0 && size >= 3) {
          // 1110xxxx 10xyyyyy 10yyyyyy
          ret = ((buf[0] & 0x0F) << 12)
              | ((buf[1] & 0x3F) << 6)
              | ((buf[2] & 0x3F));
          if(ret >= 0xD800 && ret <= 0xDFFF) {
              ret = -1;
          }
      } else if((buf[0] & 0xF8) == 0xF0 && size >= 4) {
          // 11110xxx 10xxyyyy 10yyyyyy 10yyyyyy
          ret = ((buf[0] & 0x07) << 18)
              | ((buf[1] & 0x3F) << 12)
              | ((buf[2] & 0x3F) << 6)
              | ((buf[3] & 0x3F));
      } else {
          ret = -1;
      }

      return ret;
  }
  // Encodes code point c as UTF-8 into buf.
  //
  // c:   code point to encode.
  // buf: destination; up to four bytes are written, no terminator.
  //
  // Returns the number of bytes written (1 to 4), or -1 (writing nothing)
  // when c is negative, in the surrogate range 0xD800-0xDFFF, or above
  // 0x10FFFF.
  int
  unicode_to_utf8(int c, unsigned char* buf)
  {
      if(c < 0) {
          // Not a code point; without this check a negative value would be
          // truncated into a single stray byte.
          return -1;
      }

      if(c < 0x80) {
          buf[0] = (unsigned char) c;
          return 1;
      }

      if(c < 0x800) {
          buf[0] = (unsigned char) (0xC0 | (c >> 6));
          buf[1] = (unsigned char) (0x80 | (c & 0x3F));
          return 2;
      }

      if(c < 0x10000) {
          // Surrogates are reserved for UTF-16 and have no UTF-8 form.
          if(c >= 0xD800 && c <= 0xDFFF) {
              return -1;
          }
          buf[0] = (unsigned char) (0xE0 | (c >> 12));
          buf[1] = (unsigned char) (0x80 | ((c >> 6) & 0x3F));
          buf[2] = (unsigned char) (0x80 | (c & 0x3F));
          return 3;
      }

      if(c <= 0x10FFFF) {
          buf[0] = (unsigned char) (0xF0 | (c >> 18));
          buf[1] = (unsigned char) (0x80 | ((c >> 12) & 0x3F));
          buf[2] = (unsigned char) (0x80 | ((c >> 6) & 0x3F));
          buf[3] = (unsigned char) (0x80 | (c & 0x3F));
          return 4;
      }

      return -1;
  }
  // Combines a UTF-16 surrogate pair into the code point it represents.
  //
  // hi must lie in 0xD800-0xDBFF and lo in 0xDC00-0xDFFF; otherwise -1 is
  // returned. On success the result is in 0x10000-0x10FFFF.
  int
  unicode_from_pair(int hi, int lo)
  {
      int hi_ok = (hi >= 0xD800 && hi < 0xDC00);
      int lo_ok = (lo >= 0xDC00 && lo <= 0xDFFF);

      if(!hi_ok || !lo_ok) {
          return -1;
      }

      // Each half carries ten payload bits; the pair is offset by 0x10000.
      return 0x10000 + (((hi & 0x3FF) << 10) | (lo & 0x3FF));
  }
  // Writes val to p as backslash-u escapes with uppercase hex digits.
  //
  // Values below 0x10000 become one \uXXXX escape (6 characters). Values
  // from 0x10000 to 0x10FFFF become a surrogate pair \uXXXX\uXXXX
  // (12 characters). No terminator is written.
  //
  // Returns the number of characters written, or -1 when val is above
  // 0x10FFFF or int_to_hex rejects a value. In the latter case the leading
  // "\u" has already been stored in p.
  int
  unicode_uescape(int val, char* p)
  {
      int offset;
      int hi;
      int lo;

      if(val > 0x10FFFF) {
          return -1;
      }

      if(val < 0x10000) {
          p[0] = '\\';
          p[1] = 'u';
          return (int_to_hex(val, p + 2) < 0) ? -1 : 6;
      }

      // Split the 20-bit offset above 0x10000 into the two surrogate halves.
      offset = val - 0x10000;
      hi = 0xD800 | ((offset >> 10) & 0x03FF);
      lo = 0xDC00 | (offset & 0x03FF);

      p[0] = '\\';
      p[1] = 'u';
      if(int_to_hex(hi, p + 2) < 0) {
          return -1;
      }

      p[6] = '\\';
      p[7] = 'u';
      if(int_to_hex(lo, p + 8) < 0) {
          return -1;
      }

      return 12;
  }