You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

133 lines
3.7 KiB

  1. /* Finding the optimal width of unicode characters in a buffer */
  2. #if STRINGLIB_IS_UNICODE
  3. /* Mask to quickly check whether a C 'long' contains a byte with its
  4. high bit set, i.e. a non-ASCII character in 1-byte-per-char data. */
  5. #if (SIZEOF_LONG == 8)
  6. # define UCS1_ASCII_CHAR_MASK 0x8080808080808080UL
  7. #elif (SIZEOF_LONG == 4)
  8. # define UCS1_ASCII_CHAR_MASK 0x80808080UL
  9. #else
  10. # error C 'long' size should be either 4 or 8!
  11. #endif
  12. #if STRINGLIB_SIZEOF_CHAR == 1
  13. Py_LOCAL_INLINE(Py_UCS4)
  14. STRINGLIB(find_max_char)(const STRINGLIB_CHAR *begin, const STRINGLIB_CHAR *end)
  15. {
  16. const unsigned char *p = (const unsigned char *) begin;
  17. const unsigned char *aligned_end =
  18. (const unsigned char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
  19. while (p < end) {
  20. if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
  21. /* Help register allocation */
  22. const unsigned char *_p = p;
  23. while (_p < aligned_end) {
  24. unsigned long value = *(unsigned long *) _p;
  25. if (value & UCS1_ASCII_CHAR_MASK)
  26. return 255;
  27. _p += SIZEOF_LONG;
  28. }
  29. p = _p;
  30. if (p == end)
  31. break;
  32. }
  33. if (*p++ & 0x80)
  34. return 255;
  35. }
  36. return 127;
  37. }
  38. #undef ASCII_CHAR_MASK
  39. #else /* STRINGLIB_SIZEOF_CHAR == 1 */
  40. #define MASK_ASCII 0xFFFFFF80
  41. #define MASK_UCS1 0xFFFFFF00
  42. #define MASK_UCS2 0xFFFF0000
  43. #define MAX_CHAR_ASCII 0x7f
  44. #define MAX_CHAR_UCS1 0xff
  45. #define MAX_CHAR_UCS2 0xffff
  46. #define MAX_CHAR_UCS4 0x10ffff
  47. Py_LOCAL_INLINE(Py_UCS4)
  48. STRINGLIB(find_max_char)(const STRINGLIB_CHAR *begin, const STRINGLIB_CHAR *end)
  49. {
  50. #if STRINGLIB_SIZEOF_CHAR == 2
  51. const Py_UCS4 mask_limit = MASK_UCS1;
  52. const Py_UCS4 max_char_limit = MAX_CHAR_UCS2;
  53. #elif STRINGLIB_SIZEOF_CHAR == 4
  54. const Py_UCS4 mask_limit = MASK_UCS2;
  55. const Py_UCS4 max_char_limit = MAX_CHAR_UCS4;
  56. #else
  57. #error Invalid STRINGLIB_SIZEOF_CHAR (must be 1, 2 or 4)
  58. #endif
  59. Py_UCS4 mask;
  60. Py_ssize_t n = end - begin;
  61. const STRINGLIB_CHAR *p = begin;
  62. const STRINGLIB_CHAR *unrolled_end = begin + _Py_SIZE_ROUND_DOWN(n, 4);
  63. Py_UCS4 max_char;
  64. max_char = MAX_CHAR_ASCII;
  65. mask = MASK_ASCII;
  66. while (p < unrolled_end) {
  67. STRINGLIB_CHAR bits = p[0] | p[1] | p[2] | p[3];
  68. if (bits & mask) {
  69. if (mask == mask_limit) {
  70. /* Limit reached */
  71. return max_char_limit;
  72. }
  73. if (mask == MASK_ASCII) {
  74. max_char = MAX_CHAR_UCS1;
  75. mask = MASK_UCS1;
  76. }
  77. else {
  78. /* mask can't be MASK_UCS2 because of mask_limit above */
  79. assert(mask == MASK_UCS1);
  80. max_char = MAX_CHAR_UCS2;
  81. mask = MASK_UCS2;
  82. }
  83. /* We check the new mask on the same chars in the next iteration */
  84. continue;
  85. }
  86. p += 4;
  87. }
  88. while (p < end) {
  89. if (p[0] & mask) {
  90. if (mask == mask_limit) {
  91. /* Limit reached */
  92. return max_char_limit;
  93. }
  94. if (mask == MASK_ASCII) {
  95. max_char = MAX_CHAR_UCS1;
  96. mask = MASK_UCS1;
  97. }
  98. else {
  99. /* mask can't be MASK_UCS2 because of mask_limit above */
  100. assert(mask == MASK_UCS1);
  101. max_char = MAX_CHAR_UCS2;
  102. mask = MASK_UCS2;
  103. }
  104. /* We check the new mask on the same chars in the next iteration */
  105. continue;
  106. }
  107. p++;
  108. }
  109. return max_char;
  110. }
  111. #undef MASK_ASCII
  112. #undef MASK_UCS1
  113. #undef MASK_UCS2
  114. #undef MAX_CHAR_ASCII
  115. #undef MAX_CHAR_UCS1
  116. #undef MAX_CHAR_UCS2
  117. #undef MAX_CHAR_UCS4
  118. #endif /* STRINGLIB_SIZEOF_CHAR == 1 */
  119. #endif /* STRINGLIB_IS_UNICODE */