您最多选择25个主题 主题必须以字母或数字开头,可以包含连字符 (-),并且长度不得超过35个字符

272 行
6.8 KiB

  1. // This file is part of Jiffy released under the MIT license.
  2. // See the LICENSE file for more information.
  3. #include "jiffy.h"
  4. #include <stdio.h>
  // Lookup table mapping a byte to the value of the hexadecimal digit
  // it represents. Bytes that are not one of '0'-'9', 'A'-'F' or
  // 'a'-'f' map to 255, which is used as the "not a hex digit" marker.
  // Each row below covers sixteen consecutive byte values.
  static const unsigned char hexvals[256] = {
      // 0x00 - 0x0F
      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
      // 0x10 - 0x1F
      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
      // 0x20 - 0x2F
      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
      // 0x30 - 0x3F: '0' through '9'
        0,   1,   2,   3,   4,   5,   6,   7,   8,   9, 255, 255, 255, 255, 255, 255,
      // 0x40 - 0x4F: 'A' through 'F'
      255,  10,  11,  12,  13,  14,  15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
      // 0x50 - 0x5F
      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
      // 0x60 - 0x6F: 'a' through 'f'
      255,  10,  11,  12,  13,  14,  15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
      // 0x70 - 0x7F
      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
      // 0x80 - 0x8F
      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
      // 0x90 - 0x9F
      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
      // 0xA0 - 0xAF
      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
      // 0xB0 - 0xBF
      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
      // 0xC0 - 0xCF
      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
      // 0xD0 - 0xDF
      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
      // 0xE0 - 0xEF
      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
      // 0xF0 - 0xFF
      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
  };
  // Upper-case hexadecimal digit characters indexed by their value
  // (0..15). The array is not NUL terminated.
  static const char hexdigits[16] = {
      '0', '1', '2', '3', '4', '5', '6', '7',
      '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
  };
  45. int
  46. int_from_hex(const unsigned char* p)
  47. {
  48. unsigned char* h = (unsigned char*) p;
  49. int ret;
  50. if(hexvals[*(h+0)] == 255) return -1;
  51. if(hexvals[*(h+1)] == 255) return -1;
  52. if(hexvals[*(h+2)] == 255) return -1;
  53. if(hexvals[*(h+3)] == 255) return -1;
  54. ret = (hexvals[*(h+0)] << 12)
  55. + (hexvals[*(h+1)] << 8)
  56. + (hexvals[*(h+2)] << 4)
  57. + (hexvals[*(h+3)] << 0);
  58. return ret;
  59. }
  60. int
  61. int_to_hex(int val, unsigned char* p)
  62. {
  63. if(val < 0 || val > 65535)
  64. return -1;
  65. p[0] = hexdigits[(val >> 12) & 0xF];
  66. p[1] = hexdigits[(val >> 8) & 0xF];
  67. p[2] = hexdigits[(val >> 4) & 0xF];
  68. p[3] = hexdigits[val & 0xF];
  69. return 1;
  70. }
  // Number of bytes required to encode code point c in UTF-8.
  // Returns -1 for surrogate values (0xD800..0xDFFF) and for values
  // above 0x10FFFF. Every value below 128, negative values included,
  // reports a length of 1.
  int
  utf8_len(int c)
  {
      if(c > 0x10FFFF) {
          return -1;
      }
      if(c >= 0x10000) {
          return 4;
      }
      // Surrogates are not encodable code points.
      if(c >= 0xD800 && c <= 0xDFFF) {
          return -1;
      }
      if(c >= 0x800) {
          return 3;
      }
      if(c >= 0x80) {
          return 2;
      }
      return 1;
  }
  // Number of bytes needed to write code point c as \uXXXX escapes:
  // 6 for a single escape (any value below 0x10000, with no surrogate
  // or sign check), 12 for the two escapes used for values up to
  // 0x10FFFF, and -1 for anything larger.
  int
  utf8_esc_len(int c)
  {
      if(c > 0x10FFFF) {
          return -1;
      }

      return (c < 0x10000) ? 6 : 12;
  }
  // Validate the single UTF-8 encoded code point starting at data[0].
  //
  // size is the number of bytes available at data. Returns the length
  // of the sequence in bytes (1..4) when it is well formed, or -1 when:
  //   - size is zero,
  //   - the lead byte is not a valid 1-4 byte sequence start,
  //   - the sequence would extend past size,
  //   - a continuation byte is not of the form 10xxxxxx,
  //   - the encoding is overlong (uses more bytes than necessary),
  //   - the decoded value is a surrogate (0xD800..0xDFFF) or is
  //     greater than 0x10FFFF.
  int
  utf8_validate(unsigned char* data, size_t size)
  {
      int ulen = -1;
      int ui;

      // Nothing to inspect; never read data[0] from an empty buffer.
      if(size == 0) {
          return -1;
      }

      // Classify the lead byte to find the expected sequence length.
      if((data[0] & 0x80) == 0x00) {
          ulen = 1;
      } else if((data[0] & 0xE0) == 0xC0) {
          ulen = 2;
      } else if((data[0] & 0xF0) == 0xE0) {
          ulen = 3;
      } else if((data[0] & 0xF8) == 0xF0) {
          ulen = 4;
      }

      // ulen is known to be non-negative before it is compared as size_t.
      if(ulen < 0 || (size_t) ulen > size) {
          return -1;
      }

      // Check each continuation byte.
      for(ui = 1; ui < ulen; ui++) {
          if((data[ui] & 0xC0) != 0x80) return -1;
      }

      // A UTF-8 encoding must use as few bytes as possible, so we
      // can't do things like encode 't' in three bytes.
      // To check this all we need to ensure is that for each
      // of the following bit patterns there is at least
      // one 1 bit in any of the x's
      //  1: 0yyyyyyy
      //  2: 110xxxxy 10yyyyyy
      //  3: 1110xxxx 10xyyyyy 10yyyyyy
      //  4: 11110xxx 10xxyyyy 10yyyyyy 10yyyyyy

      // ulen == 1 passes by definition
      if(ulen == 2) {
          if((data[0] & 0x1E) == 0)
              return -1;
      } else if(ulen == 3) {
          if((data[0] & 0x0F) + (data[1] & 0x20) == 0)
              return -1;
      } else if(ulen == 4) {
          if((data[0] & 0x07) + (data[1] & 0x30) == 0)
              return -1;
      }

      // Lastly we need to check some miscellaneous ranges for
      // some of the larger code point values.
      if(ulen >= 3) {
          ui = utf8_to_unicode(data, ulen);
          if(ui < 0) {
              return -1;
          } else if(ui >= 0xD800 && ui <= 0xDFFF) {
              return -1;
          } else if(ui > 0x10FFFF) {
              return -1;
          }
      }

      return ulen;
  }
  // Decode the UTF-8 sequence starting at buf[0] into a code point.
  //
  // size is the number of bytes available at buf. The sequence length
  // is taken from the lead byte; -1 is returned when size is zero, the
  // lead byte is not a valid sequence start, fewer than the required
  // number of bytes are available, or a three byte sequence decodes to
  // a surrogate (0xD800..0xDFFF).
  //
  // This function only extracts payload bits: continuation bytes are
  // not checked for the 10xxxxxx marker, overlong encodings are not
  // rejected, and four byte sequences may yield values above 0x10FFFF.
  int
  utf8_to_unicode(unsigned char* buf, size_t size)
  {
      int ret;

      // An empty buffer has no lead byte to inspect.
      if(size == 0) {
          return -1;
      }

      if((buf[0] & 0x80) == 0x00) {
          // 0xxxxxxx
          ret = buf[0];
      } else if((buf[0] & 0xE0) == 0xC0 && size >= 2) {
          // 110xxxxy 10yyyyyy
          ret = ((buf[0] & 0x1F) << 6)
              | ((buf[1] & 0x3F));
      } else if((buf[0] & 0xF0) == 0xE0 && size >= 3) {
          // 1110xxxx 10xyyyyy 10yyyyyy
          ret = ((buf[0] & 0x0F) << 12)
              | ((buf[1] & 0x3F) << 6)
              | ((buf[2] & 0x3F));
          if(ret >= 0xD800 && ret <= 0xDFFF) {
              ret = -1;
          }
      } else if((buf[0] & 0xF8) == 0xF0 && size >= 4) {
          // 11110xxx 10xxyyyy 10yyyyyy 10yyyyyy
          ret = ((buf[0] & 0x07) << 18)
              | ((buf[1] & 0x3F) << 12)
              | ((buf[2] & 0x3F) << 6)
              | ((buf[3] & 0x3F));
      } else {
          ret = -1;
      }

      return ret;
  }
  // Encode code point c as UTF-8 into buf and return the number of
  // bytes written (1..4). Returns -1 without writing anything when c
  // is a surrogate (0xD800..0xDFFF) or greater than 0x10FFFF. buf must
  // have room for the encoded sequence. Values below 0x80, negative
  // ones included, are stored as a single byte.
  int
  unicode_to_utf8(int c, unsigned char* buf)
  {
      if(c < 0x80) {
          buf[0] = c;
          return 1;
      }

      if(c < 0x800) {
          buf[0] = 0xC0 | (c >> 6);
          buf[1] = 0x80 | (c & 0x3F);
          return 2;
      }

      // Surrogates fall inside the three byte range but are not
      // encodable.
      if(c >= 0xD800 && c <= 0xDFFF) {
          return -1;
      }

      if(c < 0x10000) {
          buf[0] = 0xE0 | (c >> 12);
          buf[1] = 0x80 | ((c >> 6) & 0x3F);
          buf[2] = 0x80 | (c & 0x3F);
          return 3;
      }

      if(c > 0x10FFFF) {
          return -1;
      }

      buf[0] = 0xF0 | (c >> 18);
      buf[1] = 0x80 | ((c >> 12) & 0x3F);
      buf[2] = 0x80 | ((c >> 6) & 0x3F);
      buf[3] = 0x80 | (c & 0x3F);
      return 4;
  }
  // Combine a UTF-16 surrogate pair into the code point it encodes.
  // hi must be a high surrogate (0xD800..0xDBFF) and lo a low
  // surrogate (0xDC00..0xDFFF); otherwise -1 is returned. The result
  // lies in the range 0x10000..0x10FFFF.
  int
  unicode_from_pair(int hi, int lo)
  {
      int hi_ok = (hi >= 0xD800 && hi <= 0xDBFF);
      int lo_ok = (lo >= 0xDC00 && lo <= 0xDFFF);

      if(!hi_ok || !lo_ok) {
          return -1;
      }

      // Each surrogate contributes ten payload bits.
      return 0x10000 + ((hi - 0xD800) << 10) + (lo - 0xDC00);
  }
  // Write val into p as JSON style \uXXXX escapes using upper-case
  // hex digits. Values below 0x10000 produce one escape (6 bytes);
  // values up to 0x10FFFF produce a surrogate pair of two escapes
  // (12 bytes). Returns the number of bytes written, or -1 when val is
  // greater than 0x10FFFF or when int_to_hex rejects the value (for
  // example a negative val, in which case p[0] and p[1] have already
  // been written). No terminator is written.
  int
  unicode_uescape(int val, unsigned char* p)
  {
      int offset;
      int hi;
      int lo;

      if(val > 0x10FFFF) {
          return -1;
      }

      p[0] = '\\';
      p[1] = 'u';

      if(val < 0x10000) {
          return (int_to_hex(val, p + 2) < 0) ? -1 : 6;
      }

      // Split the 20 bit offset above 0x10000 across two surrogates.
      offset = val - 0x10000;
      hi = 0xD800 | ((offset >> 10) & 0x03FF);
      lo = 0xDC00 | (offset & 0x03FF);

      if(int_to_hex(hi, p + 2) < 0) {
          return -1;
      }

      p[6] = '\\';
      p[7] = 'u';

      if(int_to_hex(lo, p + 8) < 0) {
          return -1;
      }

      return 12;
  }