You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

431 lines
10 KiB

  1. import re, sys
  2. from unicodedata import ucd_3_2_0 as unicodedata
  3. if sys.maxunicode == 65535:
  4. raise RuntimeError("need UCS-4 Python")
  5. def gen_category(cats):
  6. for i in range(0, 0x110000):
  7. if unicodedata.category(chr(i)) in cats:
  8. yield(i)
  9. def gen_bidirectional(cats):
  10. for i in range(0, 0x110000):
  11. if unicodedata.bidirectional(chr(i)) in cats:
  12. yield(i)
  13. def compact_set(l):
  14. single = []
  15. tuple = []
  16. prev = None
  17. span = 0
  18. for e in l:
  19. if prev is None:
  20. prev = e
  21. span = 0
  22. continue
  23. if prev+span+1 != e:
  24. if span > 2:
  25. tuple.append((prev,prev+span+1))
  26. else:
  27. for i in range(prev, prev+span+1):
  28. single.append(i)
  29. prev = e
  30. span = 0
  31. else:
  32. span += 1
  33. if span:
  34. tuple.append((prev,prev+span+1))
  35. else:
  36. single.append(prev)
  37. if not single and len(tuple) == 1:
  38. tuple = "range(%d,%d)" % tuple[0]
  39. else:
  40. tuple = " + ".join("list(range(%d,%d))" % t for t in tuple)
  41. if not single:
  42. return "set(%s)" % tuple
  43. if not tuple:
  44. return "set(%r)" % (single,)
  45. return "set(%r + %s)" % (single, tuple)
  46. ############## Read the tables in the RFC #######################
  47. with open("rfc3454.txt") as f:
  48. data = f.readlines()
  49. tables = []
  50. curname = None
  51. for l in data:
  52. l = l.strip()
  53. if not l:
  54. continue
  55. # Skip RFC page breaks
  56. if l.startswith(("Hoffman & Blanchet", "RFC 3454")):
  57. continue
  58. # Find start/end lines
  59. m = re.match("----- (Start|End) Table ([A-Z](.[0-9])+) -----", l)
  60. if m:
  61. if m.group(1) == "Start":
  62. if curname:
  63. raise RuntimeError("Double Start", (curname, l))
  64. curname = m.group(2)
  65. table = {}
  66. tables.append((curname, table))
  67. continue
  68. else:
  69. if not curname:
  70. raise RuntimeError("End without start", l)
  71. if curname != m.group(2):
  72. raise RuntimeError("Unexpected end", l)
  73. curname = None
  74. continue
  75. if not curname:
  76. continue
  77. # Now we are in a table
  78. fields = l.split(";")
  79. if len(fields) > 1:
  80. # Drop comment field
  81. fields = fields[:-1]
  82. if len(fields) == 1:
  83. fields = fields[0].split("-")
  84. if len(fields) > 1:
  85. # range
  86. try:
  87. start, end = fields
  88. except ValueError:
  89. raise RuntimeError("Unpacking problem", l)
  90. else:
  91. start = end = fields[0]
  92. start = int(start, 16)
  93. end = int(end, 16)
  94. for i in range(start, end+1):
  95. table[i] = i
  96. else:
  97. code, value = fields
  98. value = value.strip()
  99. if value:
  100. value = [int(v, 16) for v in value.split(" ")]
  101. else:
  102. # table B.1
  103. value = None
  104. table[int(code, 16)] = value
########### Generate compact Python versions of the tables #############

# Everything printed from here on forms the text of the generated module.
# First the fixed preamble: a do-not-edit banner, the module docstring and
# the import of the Unicode 3.2.0 database.
print("""# This file is generated by mkstringprep.py. DO NOT EDIT.
\"\"\"Library that exposes various tables found in the StringPrep RFC 3454.
There are two kinds of tables: sets, for which a member test is provided,
and mappings, for which a mapping function is provided.
\"\"\"
from unicodedata import ucd_3_2_0 as unicodedata
""")

# The generated module asserts at import time that its database reports the
# same version string as the one seen by this generator.
print("assert unicodedata.unidata_version == %r" % (unicodedata.unidata_version,))
  114. # A.1 is the table of unassigned characters
  115. # XXX Plane 15 PUA is listed as unassigned in Python.
  116. name, table = tables[0]
  117. del tables[0]
  118. assert name == "A.1"
  119. table = set(table.keys())
  120. Cn = set(gen_category(["Cn"]))
  121. # FDD0..FDEF are process internal codes
  122. Cn -= set(range(0xFDD0, 0xFDF0))
  123. # not a character
  124. Cn -= set(range(0xFFFE, 0x110000, 0x10000))
  125. Cn -= set(range(0xFFFF, 0x110000, 0x10000))
  126. # assert table == Cn
  127. print("""
  128. def in_table_a1(code):
  129. if unicodedata.category(code) != 'Cn': return False
  130. c = ord(code)
  131. if 0xFDD0 <= c < 0xFDF0: return False
  132. return (c & 0xFFFF) not in (0xFFFE, 0xFFFF)
  133. """)
  134. # B.1 cannot easily be derived
  135. name, table = tables[0]
  136. del tables[0]
  137. assert name == "B.1"
  138. table = sorted(table.keys())
  139. print("""
  140. b1_set = """ + compact_set(table) + """
  141. def in_table_b1(code):
  142. return ord(code) in b1_set
  143. """)
  144. # B.2 and B.3 is case folding.
  145. # It takes CaseFolding.txt into account, which is
  146. # not available in the Python database. Since
  147. # B.2 is derived from B.3, we process B.3 first.
  148. # B.3 supposedly *is* CaseFolding-3.2.0.txt.
  149. name, table_b2 = tables[0]
  150. del tables[0]
  151. assert name == "B.2"
  152. name, table_b3 = tables[0]
  153. del tables[0]
  154. assert name == "B.3"
  155. # B.3 is mostly Python's .lower, except for a number
  156. # of special cases, e.g. considering canonical forms.
  157. b3_exceptions = {}
  158. for k,v in table_b2.items():
  159. if list(map(ord, chr(k).lower())) != v:
  160. b3_exceptions[k] = "".join(map(chr,v))
  161. b3 = sorted(b3_exceptions.items())
  162. print("""
  163. b3_exceptions = {""")
  164. for i, kv in enumerate(b3):
  165. print("0x%x:%a," % kv, end=' ')
  166. if i % 4 == 3:
  167. print()
  168. print("}")
  169. print("""
  170. def map_table_b3(code):
  171. r = b3_exceptions.get(ord(code))
  172. if r is not None: return r
  173. return code.lower()
  174. """)
  175. def map_table_b3(code):
  176. r = b3_exceptions.get(ord(code))
  177. if r is not None: return r
  178. return code.lower()
  179. # B.2 is case folding for NFKC. This is the same as B.3,
  180. # except where NormalizeWithKC(Fold(a)) !=
  181. # NormalizeWithKC(Fold(NormalizeWithKC(Fold(a))))
  182. def map_table_b2(a):
  183. al = map_table_b3(a)
  184. b = unicodedata.normalize("NFKC", al)
  185. bl = "".join([map_table_b3(ch) for ch in b])
  186. c = unicodedata.normalize("NFKC", bl)
  187. if b != c:
  188. return c
  189. else:
  190. return al
  191. specials = {}
  192. for k,v in table_b2.items():
  193. if list(map(ord, map_table_b2(chr(k)))) != v:
  194. specials[k] = v
  195. # B.3 should not add any additional special cases
  196. assert specials == {}
  197. print("""
  198. def map_table_b2(a):
  199. al = map_table_b3(a)
  200. b = unicodedata.normalize("NFKC", al)
  201. bl = "".join([map_table_b3(ch) for ch in b])
  202. c = unicodedata.normalize("NFKC", bl)
  203. if b != c:
  204. return c
  205. else:
  206. return al
  207. """)
  208. # C.1.1 is a table with a single character
  209. name, table = tables[0]
  210. del tables[0]
  211. assert name == "C.1.1"
  212. assert table == {0x20:0x20}
  213. print("""
  214. def in_table_c11(code):
  215. return code == " "
  216. """)
  217. # C.1.2 is the rest of all space characters
  218. name, table = tables[0]
  219. del tables[0]
  220. assert name == "C.1.2"
  221. # table = set(table.keys())
  222. # Zs = set(gen_category(["Zs"])) - {0x20}
  223. # assert Zs == table
  224. print("""
  225. def in_table_c12(code):
  226. return unicodedata.category(code) == "Zs" and code != " "
  227. def in_table_c11_c12(code):
  228. return unicodedata.category(code) == "Zs"
  229. """)
  230. # C.2.1 ASCII control characters
  231. name, table_c21 = tables[0]
  232. del tables[0]
  233. assert name == "C.2.1"
  234. Cc = set(gen_category(["Cc"]))
  235. Cc_ascii = Cc & set(range(128))
  236. table_c21 = set(table_c21.keys())
  237. assert Cc_ascii == table_c21
  238. print("""
  239. def in_table_c21(code):
  240. return ord(code) < 128 and unicodedata.category(code) == "Cc"
  241. """)
  242. # C.2.2 Non-ASCII control characters. It also includes
  243. # a number of characters in category Cf.
  244. name, table_c22 = tables[0]
  245. del tables[0]
  246. assert name == "C.2.2"
  247. Cc_nonascii = Cc - Cc_ascii
  248. table_c22 = set(table_c22.keys())
  249. assert len(Cc_nonascii - table_c22) == 0
  250. specials = list(table_c22 - Cc_nonascii)
  251. specials.sort()
  252. print("""c22_specials = """ + compact_set(specials) + """
  253. def in_table_c22(code):
  254. c = ord(code)
  255. if c < 128: return False
  256. if unicodedata.category(code) == "Cc": return True
  257. return c in c22_specials
  258. def in_table_c21_c22(code):
  259. return unicodedata.category(code) == "Cc" or \\
  260. ord(code) in c22_specials
  261. """)
  262. # C.3 Private use
  263. name, table = tables[0]
  264. del tables[0]
  265. assert name == "C.3"
  266. Co = set(gen_category(["Co"]))
  267. assert set(table.keys()) == Co
  268. print("""
  269. def in_table_c3(code):
  270. return unicodedata.category(code) == "Co"
  271. """)
  272. # C.4 Non-character code points, xFFFE, xFFFF
  273. # plus process internal codes
  274. name, table = tables[0]
  275. del tables[0]
  276. assert name == "C.4"
  277. nonchar = set(range(0xFDD0,0xFDF0))
  278. nonchar.update(range(0xFFFE,0x110000,0x10000))
  279. nonchar.update(range(0xFFFF,0x110000,0x10000))
  280. table = set(table.keys())
  281. assert table == nonchar
  282. print("""
  283. def in_table_c4(code):
  284. c = ord(code)
  285. if c < 0xFDD0: return False
  286. if c < 0xFDF0: return True
  287. return (ord(code) & 0xFFFF) in (0xFFFE, 0xFFFF)
  288. """)
  289. # C.5 Surrogate codes
  290. name, table = tables[0]
  291. del tables[0]
  292. assert name == "C.5"
  293. Cs = set(gen_category(["Cs"]))
  294. assert set(table.keys()) == Cs
  295. print("""
  296. def in_table_c5(code):
  297. return unicodedata.category(code) == "Cs"
  298. """)
  299. # C.6 Inappropriate for plain text
  300. name, table = tables[0]
  301. del tables[0]
  302. assert name == "C.6"
  303. table = sorted(table.keys())
  304. print("""
  305. c6_set = """ + compact_set(table) + """
  306. def in_table_c6(code):
  307. return ord(code) in c6_set
  308. """)
  309. # C.7 Inappropriate for canonical representation
  310. name, table = tables[0]
  311. del tables[0]
  312. assert name == "C.7"
  313. table = sorted(table.keys())
  314. print("""
  315. c7_set = """ + compact_set(table) + """
  316. def in_table_c7(code):
  317. return ord(code) in c7_set
  318. """)
  319. # C.8 Change display properties or are deprecated
  320. name, table = tables[0]
  321. del tables[0]
  322. assert name == "C.8"
  323. table = sorted(table.keys())
  324. print("""
  325. c8_set = """ + compact_set(table) + """
  326. def in_table_c8(code):
  327. return ord(code) in c8_set
  328. """)
  329. # C.9 Tagging characters
  330. name, table = tables[0]
  331. del tables[0]
  332. assert name == "C.9"
  333. table = sorted(table.keys())
  334. print("""
  335. c9_set = """ + compact_set(table) + """
  336. def in_table_c9(code):
  337. return ord(code) in c9_set
  338. """)
  339. # D.1 Characters with bidirectional property "R" or "AL"
  340. name, table = tables[0]
  341. del tables[0]
  342. assert name == "D.1"
  343. RandAL = set(gen_bidirectional(["R","AL"]))
  344. assert set(table.keys()) == RandAL
  345. print("""
  346. def in_table_d1(code):
  347. return unicodedata.bidirectional(code) in ("R","AL")
  348. """)
  349. # D.2 Characters with bidirectional property "L"
  350. name, table = tables[0]
  351. del tables[0]
  352. assert name == "D.2"
  353. L = set(gen_bidirectional(["L"]))
  354. assert set(table.keys()) == L
  355. print("""
  356. def in_table_d2(code):
  357. return unicodedata.bidirectional(code) == "L"
  358. """)