// unicode.js
  /**
   * Get a regular expression matching characters from a fixed set of
   * symbol-like Unicode blocks (phonetic extensions, currency signs, arrows,
   * math operators, dingbats, private use, ...).
   *
   * Despite the function name, every range in the pattern lies inside the
   * Basic Multilingual Plane (all code points are <= U+F8FF).
   *
   * Reference:
   * - https://github.com/mathiasbynens/regenerate
   * - https://unicode-table.com/
   * - https://mathiasbynens.be/notes/javascript-unicode
   * - http://kourge.net/projects/regexp-unicode-block
   *
   * @returns {RegExp} A new global (`g`) regex on each call, so `lastIndex`
   * state is never shared between callers.
   */
  export function getUnicodeNonBmpRegExp() {
    /**
     * Blocks covered by the character class below:
     * '\u1D00-\u1D7F' Phonetic Extensions
     * '\u1D80-\u1DBF' Phonetic Extensions Supplement
     * '\u1DC0-\u1DFF' Combining Diacritical Marks Supplement
     * '\u20A0-\u20CF' Currency Symbols
     * '\u20D0-\u20FF' Combining Diacritical Marks for Symbols
     * '\u2100-\u214F' Letterlike Symbols
     * '\u2150-\u218F' Number Forms (e.g. Roman numerals)
     * '\u2190-\u21FF' Arrows
     * '\u2200-\u22FF' Mathematical Operators
     * '\u2300-\u23FF' Miscellaneous Technical
     * '\u2400-\u243F' Control Pictures
     * '\u2440-\u245F' Optical Character Recognition
     * '\u2460-\u24FF' Enclosed Alphanumerics
     * '\u2500-\u257F' Box Drawing
     * '\u2580-\u259F' Block Elements
     * '\u25A0-\u25FF' Geometric Shapes
     * '\u2600-\u26FF' Miscellaneous Symbols
     * '\u2700-\u27BF' Dingbats
     * '\uE000-\uF8FF' Private Use Area
     *
     * Note: the General Punctuation block '\u2000-\u206F' is deliberately left
     * out here because getPunctuationRegExp matches it.
     */
    return /[\u1D00-\u1D7F\u1D80-\u1DBF\u1DC0-\u1DFF\u20A0-\u20CF\u20D0-\u20FF\u2100-\u214F\u2150-\u218F\u2190-\u21FF\u2200-\u22FF\u2300-\u23FF\u2400-\u243F\u2440-\u245F\u2460-\u24FF\u2500-\u257F\u2580-\u259F\u25A0-\u25FF\u2600-\u26FF\u2700-\u27BF\uE000-\uF8FF]/g;
  }
  /**
   * Get a regular expression matching punctuation characters.
   *
   * @returns {RegExp} A new global (`g`) regex on each call, so `lastIndex`
   * state is never shared between callers.
   */
  export function getPunctuationRegExp() {
    /**
     * Reference: http://kunststube.net/encoding/
     *
     * The character class is made up of:
     *
     * US-ASCII punctuation
     * -> !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
     *
     * A few additional symbols
     * -> £ ¢ ¥ § € ±
     *
     * General Punctuation block
     * -> \u2000-\u206F
     *
     * Supplemental Punctuation block
     * Reference: https://en.wikipedia.org/wiki/Supplemental_Punctuation
     * -> \u2E00-\u2E7F
     */
    return /[\u2000-\u206F\u2E00-\u2E7F\\'!"#$%&£¢¥§€()*+,\-.\/:;<=>?@\[\]^_`{|}~±]/g;
  }
  /**
   * Get a regular expression matching characters in Supplementary Private Use
   * Area-A (plane 15, U+F0000-U+FFFFF), expressed as UTF-16 surrogate pairs.
   *
   * @returns {RegExp} A new global (`g`) regex on each call, so `lastIndex`
   * state is never shared between callers.
   */
  export function getSupplementaryPrivateUseRegExp() {
    // Supplementary private use area A (https://www.unicode.org/charts/PDF/UF0000.pdf) contains
    // characters between F0000 and FFFFF. Because ES5 doesn't have a syntax for regular expressions
    // of such characters, search instead for the corresponding surrogate pairs.
    //
    // Code points FFFFE and FFFFF are "noncharacters", but the regex still matches them, because
    // its intent is to match things we don't want to check color contrast for. This is why the
    // low surrogate range in the regex ends at DFFF, not DFFD.
    //
    // 1. High surrogate area (https://www.unicode.org/charts/PDF/UD800.pdf)
    // 2. Low surrogate area (https://www.unicode.org/charts/PDF/UDC00.pdf)
    //
    //             1              2
    //      ┏━━━━━━┻━━━━━━┓┏━━━━━━┻━━━━━━┓
    return /[\uDB80-\uDBBF][\uDC00-\uDFFF]/g;
  }