源战役客户端
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

306 lines
9.0 KiB

  1. -- local utf8 = {}
  2. -- --byte index of the next char after the char at byte index i, followed by a valid flag for the char at byte index i.
  3. -- --nil if not found. invalid characters are iterated as 1-byte chars.
  4. -- function utf8.next_raw(s, i)
  5. -- if not i then
  6. -- if #s == 0 then return nil end
  7. -- return 1, true --fake flag (doesn't matter since this flag is not to be taken as full validation)
  8. -- end
  9. -- if i > #s then return end
  10. -- local c = s:byte(i)
  11. -- if c >= 0x00 and c <= 0x7F then
  12. -- i = i + 1
  13. -- elseif c >= 0xC2 and c <= 0xDF then
  14. -- i = i + 2
  15. -- elseif c >= 0xE0 and c <= 0xEF then
  16. -- i = i + 3
  17. -- elseif c >= 0xF0 and c <= 0xF4 then
  18. -- i = i + 4
  19. -- else --invalid
  20. -- return i + 1, false
  21. -- end
  22. -- if i > #s then return end
  23. -- return i, true
  24. -- end
  25. -- --next() is the generic iterator and can be replaced for different semantics. next_raw() must preserve its semantics.
  26. -- utf8.next = utf8.next_raw
  27. -- --iterate chars, returning the byte index where each char starts
  28. -- function utf8.byte_indices(s, previ)
  29. -- return utf8.next, s, previ
  30. -- end
  31. -- --number of chars in string
  32. -- function utf8.len(s)
  33. -- assert(s, "bad argument #1 to 'len' (string expected, got nil)")
  34. -- local len = 0
  35. -- for _ in utf8.byte_indices(s) do
  36. -- len = len + 1
  37. -- end
  38. -- return len
  39. -- end
  40. -- --byte index given char index. nil if the index is outside the string.
  41. -- function utf8.byte_index(s, target_ci)
  42. -- if target_ci < 1 then return end
  43. -- local ci = 0
  44. -- for i in utf8.byte_indices(s) do
  45. -- ci = ci + 1
  46. -- if ci == target_ci then
  47. -- return i
  48. -- end
  49. -- end
  50. -- assert(target_ci > ci, "invalid index")
  51. -- end
  52. -- --char index given byte index. nil if the index is outside the string.
  53. -- function utf8.char_index(s, target_i)
  54. -- if target_i < 1 or target_i > #s then return end
  55. -- local ci = 0
  56. -- for i in utf8.byte_indices(s) do
  57. -- ci = ci + 1
  58. -- if i == target_i then
  59. -- return ci
  60. -- end
  61. -- end
  62. -- LogError("invalid index")
  63. -- end
  64. -- --byte index of the previous char before the char at byte index nexti, which defaults to #s + 1.
  65. -- --nil if the index is outside the 2..#s+1 range.
  66. -- --NOTE: unlike next(), this is an O(N) operation!
  67. -- function utf8.prev(s, nexti)
  68. -- nexti = nexti or #s + 1
  69. -- if nexti <= 1 or nexti > #s + 1 then return end
  70. -- local lasti, lastvalid = utf8.next(s)
  71. -- for i, valid in utf8.byte_indices(s) do
  72. -- if i == nexti then
  73. -- return lasti, lastvalid
  74. -- end
  75. -- lasti, lastvalid = i, valid
  76. -- end
  77. -- if nexti == #s + 1 then
  78. -- return lasti, lastvalid
  79. -- end
  80. -- LogError("invalid index")
  81. -- end
  82. -- --iterate chars in reverse order, returning the byte index where each char starts.
  83. -- function utf8.byte_indices_reverse(s, nexti)
  84. -- if #s < 200 then
  85. -- --using prev() is an O(N^2/2) operation, ok for small strings (200 chars need about 20,000 iterations)
  86. -- return utf8.prev, s, nexti
  87. -- else
  88. -- --store byte indices in a table and iterate them in reverse.
  89. -- --this is 40x slower than byte_indices() but still fast at 2mil chars/second (but eats RAM and makes garbage).
  90. -- local t = {}
  91. -- for i in utf8.byte_indices(s) do
  92. -- if nexti and i >= nexti then break end
  93. -- table.insert(t, i)
  94. -- end
  95. -- local i = #t + 1
  96. -- return function()
  97. -- i = i - 1
  98. -- return t[i]
  99. -- end
  100. -- end
  101. -- end
  102. -- --sub based on char indices, which, unlike with standard string.sub(), can't be negative.
  103. -- --start_ci can be 1..inf and end_ci can be 0..inf. end_ci can be nil meaning last char.
  104. -- --if start_ci is out of range or end_ci < start_ci, the empty string is returned.
  105. -- --if end_ci is out of range, it is considered to be the last position in the string.
  106. -- function utf8.sub(s, start_ci, end_ci)
  107. -- --assert for positive indices because we might implement negative indices in the future.
  108. -- assert(start_ci >= 1)
  109. -- assert(not end_ci or end_ci >= 0)
  110. -- local ci = 0
  111. -- local start_i, end_i
  112. -- for i in utf8.byte_indices(s) do
  113. -- ci = ci + 1
  114. -- if ci == start_ci then
  115. -- start_i = i
  116. -- end
  117. -- if ci == end_ci then
  118. -- end_i = i
  119. -- end
  120. -- end
  121. -- if not start_i then
  122. -- assert(start_ci > ci, 'invalid index')
  123. -- return ''
  124. -- end
  125. -- if end_ci and not end_i then
  126. -- if end_ci < start_ci then
  127. -- return ''
  128. -- end
  129. -- assert(end_ci > ci, 'invalid index')
  130. -- end
  131. -- return s:sub(start_i, end_i and end_i - 1)
  132. -- end
  133. -- --check if a string contains a substring at byte index i without making garbage.
  134. -- --nil if the index is out of range. true if searching for the empty string.
  135. -- function utf8.contains(s, i, sub)
  136. -- if i < 1 or i > #s then return nil end
  137. -- for si = 1, #sub do
  138. -- if s:byte(i + si - 1) ~= sub:byte(si) then
  139. -- return false
  140. -- end
  141. -- end
  142. -- return true
  143. -- end
  144. -- --count the number of occurrences of a substring in a string. the substring cannot be the empty string.
  145. -- function utf8.count(s, sub)
  146. -- assert(#sub > 0)
  147. -- local count = 0
  148. -- local i = 1
  149. -- while i do
  150. -- if utf8.contains(s, i, sub) then
  151. -- count = count + 1
  152. -- i = i + #sub
  153. -- if i > #s then break end
  154. -- else
  155. -- i = utf8.next(s, i)
  156. -- end
  157. -- end
  158. -- return count
  159. -- end
  160. -- --utf8 validation and sanitization
  161. -- --check if there's a valid utf8 codepoint at byte index i. valid ranges for each utf8 byte are:
  162. -- -- byte 1     byte 2     byte 3     byte 4
  163. -- --------------------------------------------
  164. -- -- 00 - 7F
  165. -- -- C2 - DF    80 - BF
  166. -- -- E0         A0 - BF    80 - BF
  167. -- -- E1 - EC    80 - BF    80 - BF
  168. -- -- ED         80 - 9F    80 - BF
  169. -- -- EE - EF    80 - BF    80 - BF
  170. -- -- F0         90 - BF    80 - BF    80 - BF
  171. -- -- F1 - F3    80 - BF    80 - BF    80 - BF
  172. -- -- F4         80 - 8F    80 - BF    80 - BF
  173. -- function utf8.isvalid(s, i)
  174. -- local c = s:byte(i)
  175. -- if not c then
  176. -- return false
  177. -- elseif c >= 0x00 and c <= 0x7F then
  178. -- return true
  179. -- elseif c >= 0xC2 and c <= 0xDF then
  180. -- local c2 = s:byte(i + 1)
  181. -- return c2 and c2 >= 0x80 and c2 <= 0xBF
  182. -- elseif c >= 0xE0 and c <= 0xEF then
  183. -- local c2 = s:byte(i + 1)
  184. -- local c3 = s:byte(i + 2)
  185. -- if c == 0xE0 then
  186. -- return c2 and c3 and
  187. -- c2 >= 0xA0 and c2 <= 0xBF and
  188. -- c3 >= 0x80 and c3 <= 0xBF
  189. -- elseif c >= 0xE1 and c <= 0xEC then
  190. -- return c2 and c3 and
  191. -- c2 >= 0x80 and c2 <= 0xBF and
  192. -- c3 >= 0x80 and c3 <= 0xBF
  193. -- elseif c == 0xED then
  194. -- return c2 and c3 and
  195. -- c2 >= 0x80 and c2 <= 0x9F and
  196. -- c3 >= 0x80 and c3 <= 0xBF
  197. -- elseif c >= 0xEE and c <= 0xEF then
  198. -- if c == 0xEF and c2 == 0xBF and (c3 == 0xBE or c3 == 0xBF) then
  199. -- return false --uFFFE and uFFFF non-characters
  200. -- end
  201. -- return c2 and c3 and
  202. -- c2 >= 0x80 and c2 <= 0xBF and
  203. -- c3 >= 0x80 and c3 <= 0xBF
  204. -- end
  205. -- elseif c >= 0xF0 and c <= 0xF4 then
  206. -- local c2 = s:byte(i + 1)
  207. -- local c3 = s:byte(i + 2)
  208. -- local c4 = s:byte(i + 3)
  209. -- if c == 0xF0 then
  210. -- return c2 and c3 and c4 and
  211. -- c2 >= 0x90 and c2 <= 0xBF and
  212. -- c3 >= 0x80 and c3 <= 0xBF and
  213. -- c4 >= 0x80 and c4 <= 0xBF
  214. -- elseif c >= 0xF1 and c <= 0xF3 then
  215. -- return c2 and c3 and c4 and
  216. -- c2 >= 0x80 and c2 <= 0xBF and
  217. -- c3 >= 0x80 and c3 <= 0xBF and
  218. -- c4 >= 0x80 and c4 <= 0xBF
  219. -- elseif c == 0xF4 then
  220. -- return c2 and c3 and c4 and
  221. -- c2 >= 0x80 and c2 <= 0x8F and
  222. -- c3 >= 0x80 and c3 <= 0xBF and
  223. -- c4 >= 0x80 and c4 <= 0xBF
  224. -- end
  225. -- end
  226. -- return false
  227. -- end
  228. -- --byte index of the next valid utf8 char after the char at byte index i.
  229. -- --nil if indices go out of range. invalid characters are skipped.
  230. -- function utf8.next_valid(s, i)
  231. -- local valid
  232. -- i, valid = utf8.next_raw(s, i)
  233. -- while i and (not valid or not utf8.isvalid(s, i)) do
  234. -- i, valid = utf8.next(s, i)
  235. -- end
  236. -- return i
  237. -- end
  238. -- --iterate valid chars, returning the byte index where each char starts
  239. -- function utf8.valid_byte_indices(s)
  240. -- return utf8.next_valid, s
  241. -- end
  242. -- --assert that a string only contains valid utf8 characters
  243. -- function utf8.validate(s)
  244. -- for i, valid in utf8.byte_indices(s) do
  245. -- if not valid or not utf8.isvalid(s, i) then
  246. -- LogError(string.format('invalid utf8 char at #%d', i))
  247. -- end
  248. -- end
  249. -- end
  250. -- local function table_lookup(s, i, j, t)
  251. -- return t[s:sub(i, j)]
  252. -- end
  253. -- --replace characters in string based on a function f(s, i, j, ...) -> replacement_string | nil
  254. -- function utf8.replace(s, f, ...)
  255. -- if type(f) == 'table' then
  256. -- return utf8.replace(s, table_lookup, f)
  257. -- end
  258. -- if s == '' then
  259. -- return s
  260. -- end
  261. -- local t = {}
  262. -- local lasti = 1
  263. -- for i in utf8.byte_indices(s) do
  264. -- local nexti = utf8.next(s, i) or #s + 1
  265. -- local repl = f(s, i, nexti - 1, ...)
  266. -- if repl then
  267. -- table.insert(t, s:sub(lasti, i - 1))
  268. -- table.insert(t, repl)
  269. -- lasti = nexti
  270. -- end
  271. -- end
  272. -- table.insert(t, s:sub(lasti))
  273. -- return table.concat(t)
  274. -- end
  275. -- local function replace_invalid(s, i, j, repl_char)
  276. -- if not utf8.isvalid(s, i) then
  277. -- return repl_char
  278. -- end
  279. -- end
  280. -- --replace invalid utf8 chars with a replacement char
  281. -- function utf8.sanitize(s, repl_char)
  282. -- repl_char = repl_char or '�' --\uFFFD
  283. -- return utf8.replace(s, replace_invalid, repl_char)
  284. -- end
  285. -- return utf8