dbcs-data.js 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188
  1. "use strict";
  2. // Description of supported double byte encodings and aliases.
  3. // Tables are not require()-d until they are needed to speed up library load.
  4. // require()-s are direct to support Browserify.
  5. module.exports = {
  6. // == Japanese/ShiftJIS ====================================================
  7. // All japanese encodings are based on JIS X set of standards:
  8. // JIS X 0201 - Single-byte encoding of ASCII + ¥ + Kana chars at 0xA1-0xDF.
  9. // JIS X 0208 - Main set of 6879 characters, placed in 94x94 plane, to be encoded by 2 bytes.
  10. // Has several variations in 1978, 1983, 1990 and 1997.
  11. // JIS X 0212 - Supplementary plane of 6067 chars in 94x94 plane. 1990. Effectively dead.
  12. // JIS X 0213 - Extension and modern replacement of 0208 and 0212. Total chars: 11233.
  13. // 2 planes, first is superset of 0208, second - revised 0212.
  14. // Introduced in 2000, revised 2004. Some characters are in Unicode Plane 2 (0x2xxxx)
  15. // Byte encodings are:
  16. // * Shift_JIS: Compatible with 0201, uses not defined chars in top half as lead bytes for double-byte
  17. // encoding of 0208. Lead byte ranges: 0x81-0x9F, 0xE0-0xEF; Trail byte ranges: 0x40-0x7E, 0x80-0x9E, 0x9F-0xFC.
  18. // Windows CP932 is a superset of Shift_JIS. Some companies added more chars, notably KDDI.
  19. // * EUC-JP: Up to 3 bytes per character. Used mostly on *nixes.
  20. // 0x00-0x7F - lower part of 0201
  21. // 0x8E, 0xA1-0xDF - upper part of 0201
  22. // (0xA1-0xFE)x2 - 0208 plane (94x94).
  23. // 0x8F, (0xA1-0xFE)x2 - 0212 plane (94x94).
  24. // * JIS X 208: 7-bit, direct encoding of 0208. Byte ranges: 0x21-0x7E (94 values). Uncommon.
  25. // Used as-is in ISO2022 family.
  26. // * ISO2022-JP: Stateful encoding, with escape sequences to switch between ASCII,
  27. // 0201-1976 Roman, 0208-1978, 0208-1983.
  28. // * ISO2022-JP-1: Adds esc seq for 0212-1990.
  29. // * ISO2022-JP-2: Adds esc seq for GB2313-1980, KSX1001-1992, ISO8859-1, ISO8859-7.
  30. // * ISO2022-JP-3: Adds esc seq for 0201-1976 Kana set, 0213-2000 Planes 1, 2.
  31. // * ISO2022-JP-2004: Adds 0213-2004 Plane 1.
  32. //
  33. // After JIS X 0213 appeared, Shift_JIS-2004, EUC-JISX0213 and ISO2022-JP-2004 followed, with just changing the planes.
  34. //
  35. // Overall, it seems that it's a mess :( http://www8.plala.or.jp/tkubota1/unicode-symbols-map2.html
  36. 'shiftjis': {
  37. type: '_dbcs',
  38. table: function() { return require('./tables/shiftjis.json') },
  39. encodeAdd: {'\u00a5': 0x5C, '\u203E': 0x7E},
  40. encodeSkipVals: [{from: 0xED40, to: 0xF940}],
  41. },
  42. 'csshiftjis': 'shiftjis',
  43. 'mskanji': 'shiftjis',
  44. 'sjis': 'shiftjis',
  45. 'windows31j': 'shiftjis',
  46. 'ms31j': 'shiftjis',
  47. 'xsjis': 'shiftjis',
  48. 'windows932': 'shiftjis',
  49. 'ms932': 'shiftjis',
  50. '932': 'shiftjis',
  51. 'cp932': 'shiftjis',
  52. 'eucjp': {
  53. type: '_dbcs',
  54. table: function() { return require('./tables/eucjp.json') },
  55. encodeAdd: {'\u00a5': 0x5C, '\u203E': 0x7E},
  56. },
  57. // TODO: KDDI extension to Shift_JIS
  58. // TODO: IBM CCSID 942 = CP932, but F0-F9 custom chars and other char changes.
  59. // TODO: IBM CCSID 943 = Shift_JIS = CP932 with original Shift_JIS lower 128 chars.
  60. // == Chinese/GBK ==========================================================
  61. // http://en.wikipedia.org/wiki/GBK
  62. // We mostly implement W3C recommendation: https://www.w3.org/TR/encoding/#gbk-encoder
  63. // Oldest GB2312 (1981, ~7600 chars) is a subset of CP936
  64. 'gb2312': 'cp936',
  65. 'gb231280': 'cp936',
  66. 'gb23121980': 'cp936',
  67. 'csgb2312': 'cp936',
  68. 'csiso58gb231280': 'cp936',
  69. 'euccn': 'cp936',
  70. // Microsoft's CP936 is a subset and approximation of GBK.
  71. 'windows936': 'cp936',
  72. 'ms936': 'cp936',
  73. '936': 'cp936',
  74. 'cp936': {
  75. type: '_dbcs',
  76. table: function() { return require('./tables/cp936.json') },
  77. },
  78. // GBK (~22000 chars) is an extension of CP936 that added user-mapped chars and some other.
  79. 'gbk': {
  80. type: '_dbcs',
  81. table: function() { return require('./tables/cp936.json').concat(require('./tables/gbk-added.json')) },
  82. },
  83. 'xgbk': 'gbk',
  84. 'isoir58': 'gbk',
  85. // GB18030 is an algorithmic extension of GBK.
  86. // Main source: https://www.w3.org/TR/encoding/#gbk-encoder
  87. // http://icu-project.org/docs/papers/gb18030.html
  88. // http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/gb-18030-2000.xml
  89. // http://www.khngai.com/chinese/charmap/tblgbk.php?page=0
  90. 'gb18030': {
  91. type: '_dbcs',
  92. table: function() { return require('./tables/cp936.json').concat(require('./tables/gbk-added.json')) },
  93. gb18030: function() { return require('./tables/gb18030-ranges.json') },
  94. encodeSkipVals: [0x80],
  95. encodeAdd: {'€': 0xA2E3},
  96. },
  97. 'chinese': 'gb18030',
  98. // == Korean ===============================================================
  99. // EUC-KR, KS_C_5601 and KS X 1001 are exactly the same.
  100. 'windows949': 'cp949',
  101. 'ms949': 'cp949',
  102. '949': 'cp949',
  103. 'cp949': {
  104. type: '_dbcs',
  105. table: function() { return require('./tables/cp949.json') },
  106. },
  107. 'cseuckr': 'cp949',
  108. 'csksc56011987': 'cp949',
  109. 'euckr': 'cp949',
  110. 'isoir149': 'cp949',
  111. 'korean': 'cp949',
  112. 'ksc56011987': 'cp949',
  113. 'ksc56011989': 'cp949',
  114. 'ksc5601': 'cp949',
  115. // == Big5/Taiwan/Hong Kong ================================================
  116. // There are lots of tables for Big5 and cp950. Please see the following links for history:
  117. // http://moztw.org/docs/big5/ http://www.haible.de/bruno/charsets/conversion-tables/Big5.html
  118. // Variations, in roughly number of defined chars:
  119. // * Windows CP 950: Microsoft variant of Big5. Canonical: http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP950.TXT
  120. // * Windows CP 951: Microsoft variant of Big5-HKSCS-2001. Seems to be never public. http://me.abelcheung.org/articles/research/what-is-cp951/
  121. // * Big5-2003 (Taiwan standard) almost superset of cp950.
  122. // * Unicode-at-on (UAO) / Mozilla 1.8. Falling out of use on the Web. Not supported by other browsers.
  123. // * Big5-HKSCS (-2001, -2004, -2008). Hong Kong standard.
  124. // many unicode code points moved from PUA to Supplementary plane (U+2XXXX) over the years.
  125. // Plus, it has 4 combining sequences.
  126. // Seems that Mozilla refused to support it for 10 yrs. https://bugzilla.mozilla.org/show_bug.cgi?id=162431 https://bugzilla.mozilla.org/show_bug.cgi?id=310299
  127. // because big5-hkscs is the only encoding to include astral characters in non-algorithmic way.
  128. // Implementations are not consistent within browsers; sometimes labeled as just big5.
  129. // MS Internet Explorer switches from big5 to big5-hkscs when a patch applied.
  130. // Great discussion & recap of what's going on https://bugzilla.mozilla.org/show_bug.cgi?id=912470#c31
  131. // In the encoder, it might make sense to support encoding old PUA mappings to Big5 bytes seq-s.
  132. // Official spec: http://www.ogcio.gov.hk/en/business/tech_promotion/ccli/terms/doc/2003cmp_2008.txt
  133. // http://www.ogcio.gov.hk/tc/business/tech_promotion/ccli/terms/doc/hkscs-2008-big5-iso.txt
  134. //
  135. // Current understanding of how to deal with Big5(-HKSCS) is in the Encoding Standard, http://encoding.spec.whatwg.org/#big5-encoder
  136. // Unicode mapping (http://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/OTHER/BIG5.TXT) is said to be wrong.
  137. 'windows950': 'cp950',
  138. 'ms950': 'cp950',
  139. '950': 'cp950',
  140. 'cp950': {
  141. type: '_dbcs',
  142. table: function() { return require('./tables/cp950.json') },
  143. },
  144. // Big5 has many variations and is an extension of cp950. We use Encoding Standard's as a consensus.
  145. 'big5': 'big5hkscs',
  146. 'big5hkscs': {
  147. type: '_dbcs',
  148. table: function() { return require('./tables/cp950.json').concat(require('./tables/big5-added.json')) },
  149. encodeSkipVals: [
  150. // Although Encoding Standard says we should avoid encoding to HKSCS area (See Step 1 of
  151. // https://encoding.spec.whatwg.org/#index-big5-pointer), we still do it to increase compatibility with ICU.
  152. // But if a single unicode point can be encoded both as HKSCS and regular Big5, we prefer the latter.
  153. 0x8e69, 0x8e6f, 0x8e7e, 0x8eab, 0x8eb4, 0x8ecd, 0x8ed0, 0x8f57, 0x8f69, 0x8f6e, 0x8fcb, 0x8ffe,
  154. 0x906d, 0x907a, 0x90c4, 0x90dc, 0x90f1, 0x91bf, 0x92af, 0x92b0, 0x92b1, 0x92b2, 0x92d1, 0x9447, 0x94ca,
  155. 0x95d9, 0x96fc, 0x9975, 0x9b76, 0x9b78, 0x9b7b, 0x9bc6, 0x9bde, 0x9bec, 0x9bf6, 0x9c42, 0x9c53, 0x9c62,
  156. 0x9c68, 0x9c6b, 0x9c77, 0x9cbc, 0x9cbd, 0x9cd0, 0x9d57, 0x9d5a, 0x9dc4, 0x9def, 0x9dfb, 0x9ea9, 0x9eef,
  157. 0x9efd, 0x9f60, 0x9fcb, 0xa077, 0xa0dc, 0xa0df, 0x8fcc, 0x92c8, 0x9644, 0x96ed,
  158. // Step 2 of https://encoding.spec.whatwg.org/#index-big5-pointer: Use last pointer for U+2550, U+255E, U+2561, U+256A, U+5341, or U+5345
  159. 0xa2a4, 0xa2a5, 0xa2a7, 0xa2a6, 0xa2cc, 0xa2ce,
  160. ],
  161. },
  162. 'cnbig5': 'big5hkscs',
  163. 'csbig5': 'big5hkscs',
  164. 'xxbig5': 'big5hkscs',
  165. };