utf8_gen.py

#!/usr/bin/python3
# -*- coding: utf-8 -*-
# Copyright (C) 2014-2020 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <https://www.gnu.org/licenses/>.
'''glibc/localedata/charmaps/UTF-8 file generator script

This script generates a glibc/localedata/charmaps/UTF-8 file
from Unicode data.

Usage: python3 utf8_gen.py [-u UnicodeData.txt] [-e EastAsianWidth.txt]
       [-p PropList.txt] --unicode_version <version>

It will output the UTF-8 file.
'''

import argparse
import sys
import re
import unicode_utils

# Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
# sections 3.11 and 4.4.

JAMO_INITIAL_SHORT_NAME = (
    'G', 'GG', 'N', 'D', 'DD', 'R', 'M', 'B', 'BB', 'S', 'SS', '', 'J', 'JJ',
    'C', 'K', 'T', 'P', 'H'
)

JAMO_MEDIAL_SHORT_NAME = (
    'A', 'AE', 'YA', 'YAE', 'EO', 'E', 'YEO', 'YE', 'O', 'WA', 'WAE', 'OE',
    'YO', 'U', 'WEO', 'WE', 'WI', 'YU', 'EU', 'YI', 'I'
)

JAMO_FINAL_SHORT_NAME = (
    '', 'G', 'GG', 'GS', 'N', 'NJ', 'NH', 'D', 'L', 'LG', 'LM', 'LB', 'LS',
    'LT', 'LP', 'LH', 'M', 'B', 'BS', 'S', 'SS', 'NG', 'J', 'C', 'K', 'T',
    'P', 'H'
)

def process_range(start, end, outfile, name):
    '''Writes a range of code points into the CHARMAP section of the
    output file.
    '''
    if 'Hangul Syllable' in name:
        # from glibc/localedata/ChangeLog:
        #
        # 2000-09-24  Bruno Haible  <haible@clisp.cons.org>
        # * charmaps/UTF-8: Expand <Hangul Syllable> and <Private Use>
        #   ranges, so they become printable and carry a width. Comment
        #   out surrogate ranges. Add a WIDTH table.
        #
        # So we expand the Hangul Syllables here:
        for i in range(int(start, 16), int(end, 16) + 1):
            index2, index3 = divmod(i - 0xAC00, 28)
            index1, index2 = divmod(index2, 21)
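            # For example, i == 0xAC01 gives divmod(1, 28) == (0, 1)
            # and divmod(0, 21) == (0, 0), i.e. initial 'G', medial
            # 'A', final 'G': HANGUL SYLLABLE GAG.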
            hangul_syllable_name = 'HANGUL SYLLABLE ' \
                                   + JAMO_INITIAL_SHORT_NAME[index1] \
                                   + JAMO_MEDIAL_SHORT_NAME[index2] \
                                   + JAMO_FINAL_SHORT_NAME[index3]
            outfile.write('{:<11s} {:<12s} {:s}\n'.format(
                unicode_utils.ucs_symbol(i), convert_to_hex(i),
                hangul_syllable_name))
        return
    # The UnicodeData.txt file contains code point ranges like this:
    #
    # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    #
    # The glibc UTF-8 file splits ranges like these into shorter
    # ranges of 64 code points each:
    #
    # <U3400>..<U343F> /xe3/x90/x80 <CJK Ideograph Extension A>
    # …
    # <U4D80>..<U4DB5> /xe4/xb6/x80 <CJK Ideograph Extension A>
    for i in range(int(start, 16), int(end, 16), 64):
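        # The last chunk may contain fewer than 64 code points; end
        # the output range at the actual last code point instead of
        # at a multiple of 64.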
        if i > (int(end, 16) - 64):
            outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
                unicode_utils.ucs_symbol(i),
                unicode_utils.ucs_symbol(int(end, 16)),
                convert_to_hex(i),
                name))
            break
        outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
            unicode_utils.ucs_symbol(i),
            unicode_utils.ucs_symbol(i + 63),
            convert_to_hex(i),
            name))

def process_charmap(flines, outfile):
    '''This function takes an array which contains *all* lines
    of UnicodeData.txt and writes lines to outfile as used in the

    CHARMAP
    END CHARMAP

    section of the UTF-8 file in glibc/localedata/charmaps/UTF-8.

    Samples for input lines:

    0010;<control>;Cc;0;BN;;;;;N;DATA LINK ESCAPE;;;;
    3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    D800;<Non Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;
    DB7F;<Non Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;
    100000;<Plane 16 Private Use, First>;Co;0;L;;;;;N;;;;;
    10FFFD;<Plane 16 Private Use, Last>;Co;0;L;;;;;N;;;;;

    Samples for output lines (Unicode-Value UTF-8-HEX Unicode-Char-Name):

    <U0010> /x10 DATA LINK ESCAPE
    <U3400>..<U343F> /xe3/x90/x80 <CJK Ideograph Extension A>
    %<UD800> /xed/xa0/x80 <Non Private Use High Surrogate, First>
    %<UDB7F> /xed/xad/xbf <Non Private Use High Surrogate, Last>
    <U0010FFC0>..<U0010FFFD> /xf4/x8f/xbf/x80 <Plane 16 Private Use>
    '''
    fields_start = []
    for line in flines:
        fields = line.split(";")
        # Some characters have “<control>” as their name. We try to
        # use the “Unicode 1.0 Name” (10th field in
        # UnicodeData.txt) for them.
        #
        # The characters U+0080, U+0081, U+0084 and U+0099 have
        # “<control>” as their name but do not even have a
        # “Unicode 1.0 Name”. We could write code to take their
        # alternate names from NameAliases.txt.
        if fields[1] == "<control>" and fields[10]:
            fields[1] = fields[10]
        # Handling code point ranges like:
        #
        # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
        # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
        if fields[1].endswith(', First>') and not 'Surrogate,' in fields[1]:
            fields_start = fields
            continue
        if fields[1].endswith(', Last>') and not 'Surrogate,' in fields[1]:
            process_range(fields_start[0], fields[0],
                          outfile, fields[1][:-7] + '>')
            fields_start = []
            continue
        fields_start = []
        if 'Surrogate,' in fields[1]:
            # Comment out the surrogates in the UTF-8 file.
            # One could of course skip them completely but
            # the original UTF-8 file in glibc had them as
            # comments, so we keep these comment lines.
            outfile.write('%')
        outfile.write('{:<11s} {:<12s} {:s}\n'.format(
            unicode_utils.ucs_symbol(int(fields[0], 16)),
            convert_to_hex(int(fields[0], 16)),
            fields[1]))

def convert_to_hex(code_point):
    '''Converts a code point to a hexadecimal UTF-8 representation
    like /x**/x**/x**.'''
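    # For example, convert_to_hex(0x00E9) returns '/xc3/xa9', the
    # two-byte UTF-8 encoding of U+00E9, and convert_to_hex(0xD800)
    # returns '/xed/xa0/x80' from the table below.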
    # Getting the UTF-8 encoding of Unicode characters.
    # In Python3, .encode('UTF-8') does not work for
    # surrogates. Therefore, we use this conversion table:
    surrogates = {
        0xD800: '/xed/xa0/x80',
        0xDB7F: '/xed/xad/xbf',
        0xDB80: '/xed/xae/x80',
        0xDBFF: '/xed/xaf/xbf',
        0xDC00: '/xed/xb0/x80',
        0xDFFF: '/xed/xbf/xbf',
    }
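    # Only these six code points are needed: they are the First/Last
    # endpoints of the three surrogate ranges in UnicodeData.txt,
    # which are the only surrogate values this script ever converts.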
    if code_point in surrogates:
        return surrogates[code_point]
    return ''.join([
        '/x{:02x}'.format(c) for c in chr(code_point).encode('UTF-8')
    ])

def write_header_charmap(outfile):
    '''Write the header on top of the CHARMAP section to the output file'''
    outfile.write("<code_set_name> UTF-8\n")
    outfile.write("<comment_char> %\n")
    outfile.write("<escape_char> /\n")
    outfile.write("<mb_cur_min> 1\n")
    outfile.write("<mb_cur_max> 6\n\n")
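    # glibc's UTF-8 charmap historically declares a maximum of 6 bytes
    # per character: the original UTF-8 definition covered code points
    # up to U+7FFFFFFF, although Unicode needs at most 4 bytes.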
    outfile.write("% CHARMAP generated using utf8_gen.py\n")
    outfile.write("% alias ISO-10646/UTF-8\n")
    outfile.write("CHARMAP\n")

def write_header_width(outfile, unicode_version):
    '''Writes the header on top of the WIDTH section to the output file'''
    outfile.write('% Character width according to Unicode '
                  + '{:s}.\n'.format(unicode_version))
    outfile.write('% - Default width is 1.\n')
    outfile.write('% - Double-width characters have width 2; generated from\n')
    outfile.write('% "grep \'^[^;]*;[WF]\' EastAsianWidth.txt"\n')
    outfile.write('% - Non-spacing characters have width 0; '
                  + 'generated from PropList.txt or\n')
    outfile.write('% "grep \'^[^;]*;[^;]*;[^;]*;[^;]*;NSM;\' '
                  + 'UnicodeData.txt"\n')
    outfile.write('% - Format control characters have width 0; '
                  + 'generated from\n')
    outfile.write("% \"grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt\"\n")
    # Not needed; covered by Cf:
    # outfile.write("% - Zero width characters have width 0; generated from\n")
    # outfile.write("% \"grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt\"\n")
    outfile.write("WIDTH\n")

def process_width(outfile, ulines, elines, plines):
    '''ulines are lines from UnicodeData.txt, elines are lines from
    EastAsianWidth.txt containing characters with width “W” or “F”,
    plines are lines from PropList.txt which contain characters
    with the property “Prepended_Concatenation_Mark”.
    '''
    width_dict = {}
    for line in elines:
        fields = line.split(";")
        if not '..' in fields[0]:
            code_points = (fields[0], fields[0])
        else:
            code_points = fields[0].split("..")
        for key in range(int(code_points[0], 16),
                         int(code_points[1], 16) + 1):
            width_dict[key] = 2
    for line in ulines:
        fields = line.split(";")
        if fields[4] == "NSM" or fields[2] in ("Cf", "Me", "Mn"):
            width_dict[int(fields[0], 16)] = 0
    for line in plines:
        # Characters with the property “Prepended_Concatenation_Mark”
        # should have the width 1:
        fields = line.split(";")
        if not '..' in fields[0]:
            code_points = (fields[0], fields[0])
        else:
            code_points = fields[0].split("..")
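        # All Prepended_Concatenation_Mark characters are Cf (format
        # controls), so the loop above gave them width 0; deleting
        # the entry restores the default width of 1.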
        for key in range(int(code_points[0], 16),
                         int(code_points[1], 16) + 1):
            del width_dict[key]  # default width is 1
    # handle special cases for compatibility
    for key in list((0x00AD,)):
        # https://www.cs.tut.fi/~jkorpela/shy.html
        if key in width_dict:
            del width_dict[key]  # default width is 1
    for key in list(range(0x1160, 0x1200)):
        # Hangul jungseong and jongseong:
        if key in unicode_utils.UNICODE_ATTRIBUTES:
            width_dict[key] = 0
    for key in list(range(0xD7B0, 0xD800)):
        # Hangul jungseong and jongseong:
        if key in unicode_utils.UNICODE_ATTRIBUTES:
            width_dict[key] = 0
    for key in list(range(0x3248, 0x3250)):
        # These are “A” which means we can decide whether to treat them
        # as “W” or “N” based on context:
        # http://www.unicode.org/mail-arch/unicode-ml/y2017-m08/0023.html
        # For us, “W” seems better.
        width_dict[key] = 2
    for key in list(range(0x4DC0, 0x4E00)):
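        # U+4DC0..U+4DFF are the Yijing hexagram symbols, also
        # treated as double-width here: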
        width_dict[key] = 2

    same_width_lists = []
    current_width_list = []
    for key in sorted(width_dict):
        if not current_width_list:
            current_width_list = [key]
        elif (key == current_width_list[-1] + 1
              and width_dict[key] == width_dict[current_width_list[0]]):
            current_width_list.append(key)
        else:
            same_width_lists.append(current_width_list)
            current_width_list = [key]
    if current_width_list:
        same_width_lists.append(current_width_list)
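    # Each list in same_width_lists is a run of consecutive code
    # points sharing one width. A run of length one is written as a
    # single code point, longer runs as a range, e.g.
    # <U1160>...<U11FF> with width 0 for the Hangul jungseong and
    # jongseong range handled above.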
    for same_width_list in same_width_lists:
        if len(same_width_list) == 1:
            outfile.write('{:s}\t{:d}\n'.format(
                unicode_utils.ucs_symbol(same_width_list[0]),
                width_dict[same_width_list[0]]))
        else:
            outfile.write('{:s}...{:s}\t{:d}\n'.format(
                unicode_utils.ucs_symbol(same_width_list[0]),
                unicode_utils.ucs_symbol(same_width_list[-1]),
                width_dict[same_width_list[0]]))

if __name__ == "__main__":
    PARSER = argparse.ArgumentParser(
        description='''
        Generate a UTF-8 file from UnicodeData.txt, EastAsianWidth.txt,
        and PropList.txt.
        ''')
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        nargs='?',
        type=str,
        default='UnicodeData.txt',
        help=('The UnicodeData.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-e', '--east_asian_width_file',
        nargs='?',
        type=str,
        default='EastAsianWidth.txt',
        help=('The EastAsianWidth.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-p', '--prop_list_file',
        nargs='?',
        type=str,
        default='PropList.txt',
        help=('The PropList.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '--unicode_version',
        nargs='?',
        required=True,
        type=str,
        help='The Unicode version of the input files used.')
    ARGS = PARSER.parse_args()
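    # Example invocation (the version number is just an example):
    #
    #     python3 utf8_gen.py -u UnicodeData.txt -e EastAsianWidth.txt \
    #             -p PropList.txt --unicode_version 13.0.0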
    unicode_utils.fill_attributes(ARGS.unicode_data_file)
    with open(ARGS.unicode_data_file, mode='r') as UNIDATA_FILE:
        UNICODE_DATA_LINES = UNIDATA_FILE.readlines()
    with open(ARGS.east_asian_width_file, mode='r') as EAST_ASIAN_WIDTH_FILE:
        EAST_ASIAN_WIDTH_LINES = []
        for LINE in EAST_ASIAN_WIDTH_FILE:
            # If characters from EastAsianWidth.txt which are from
            # reserved ranges (i.e. not yet assigned code points)
            # are added to the WIDTH section of the UTF-8 file, then
            # “make check” produces “Unknown Character” errors for
            # these code points because such unassigned code points
            # are not in the CHARMAP section of the UTF-8 file.
            #
            # Therefore, we skip all reserved code points when reading
            # the EastAsianWidth.txt file.
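            # The lines that are kept have the form "XXXX;W" or
            # "XXXX..YYYY;W" (or width class F instead of W): a code
            # point or range followed by its East Asian width class.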
            if re.match(r'.*<reserved-.+>\.\.<reserved-.+>.*', LINE):
                continue
            if re.match(r'^[^;]*;[WF]', LINE):
                EAST_ASIAN_WIDTH_LINES.append(LINE.strip())
    with open(ARGS.prop_list_file, mode='r') as PROP_LIST_FILE:
        PROP_LIST_LINES = []
        for LINE in PROP_LIST_FILE:
            if re.match(r'^[^;]*;[\s]*Prepended_Concatenation_Mark', LINE):
                PROP_LIST_LINES.append(LINE.strip())
    with open('UTF-8', mode='w') as OUTFILE:
        # Process UnicodeData.txt and write the CHARMAP section
        # to the UTF-8 file:
        write_header_charmap(OUTFILE)
        process_charmap(UNICODE_DATA_LINES, OUTFILE)
        OUTFILE.write("END CHARMAP\n\n")
        # Process EastAsianWidth.txt and write the WIDTH section
        # to the UTF-8 file:
        write_header_width(OUTFILE, ARGS.unicode_version)
        process_width(OUTFILE,
                      UNICODE_DATA_LINES,
                      EAST_ASIAN_WIDTH_LINES,
                      PROP_LIST_LINES)
        OUTFILE.write("END WIDTH\n")