You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

633 lines
21 KiB

  1. /* stringlib: codec implementations */
  2. #if STRINGLIB_IS_UNICODE
  3. /* Mask to check or force alignment of a pointer to C 'long' boundaries */
  4. #define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
  5. /* Mask to quickly check whether a C 'long' contains a
  6. non-ASCII, UTF8-encoded char. */
  7. #if (SIZEOF_LONG == 8)
  8. # define ASCII_CHAR_MASK 0x8080808080808080UL
  9. #elif (SIZEOF_LONG == 4)
  10. # define ASCII_CHAR_MASK 0x80808080UL
  11. #else
  12. # error C 'long' size should be either 4 or 8!
  13. #endif
  14. /* 10xxxxxx */
  15. #define IS_CONTINUATION_BYTE(ch) ((ch) >= 0x80 && (ch) < 0xC0)
  16. Py_LOCAL_INLINE(Py_UCS4)
  17. STRINGLIB(utf8_decode)(const char **inptr, const char *end,
  18. STRINGLIB_CHAR *dest,
  19. Py_ssize_t *outpos)
  20. {
  21. Py_UCS4 ch;
  22. const char *s = *inptr;
  23. const char *aligned_end = (const char *) ((size_t) end & ~LONG_PTR_MASK);
  24. STRINGLIB_CHAR *p = dest + *outpos;
  25. while (s < end) {
  26. ch = (unsigned char)*s;
  27. if (ch < 0x80) {
  28. /* Fast path for runs of ASCII characters. Given that common UTF-8
  29. input will consist of an overwhelming majority of ASCII
  30. characters, we try to optimize for this case by checking
  31. as many characters as a C 'long' can contain.
  32. First, check if we can do an aligned read, as most CPUs have
  33. a penalty for unaligned reads.
  34. */
  35. if (!((size_t) s & LONG_PTR_MASK)) {
  36. /* Help register allocation */
  37. register const char *_s = s;
  38. register STRINGLIB_CHAR *_p = p;
  39. while (_s < aligned_end) {
  40. /* Read a whole long at a time (either 4 or 8 bytes),
  41. and do a fast unrolled copy if it only contains ASCII
  42. characters. */
  43. unsigned long value = *(unsigned long *) _s;
  44. if (value & ASCII_CHAR_MASK)
  45. break;
  46. #ifdef BYTEORDER_IS_LITTLE_ENDIAN
  47. _p[0] = (STRINGLIB_CHAR)(value & 0xFFu);
  48. _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
  49. _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
  50. _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
  51. # if SIZEOF_LONG == 8
  52. _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
  53. _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
  54. _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
  55. _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
  56. # endif
  57. #else
  58. # if SIZEOF_LONG == 8
  59. _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
  60. _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
  61. _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
  62. _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
  63. _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
  64. _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
  65. _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
  66. _p[7] = (STRINGLIB_CHAR)(value & 0xFFu);
  67. # else
  68. _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
  69. _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
  70. _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
  71. _p[3] = (STRINGLIB_CHAR)(value & 0xFFu);
  72. # endif
  73. #endif
  74. _s += SIZEOF_LONG;
  75. _p += SIZEOF_LONG;
  76. }
  77. s = _s;
  78. p = _p;
  79. if (s == end)
  80. break;
  81. ch = (unsigned char)*s;
  82. }
  83. if (ch < 0x80) {
  84. s++;
  85. *p++ = ch;
  86. continue;
  87. }
  88. }
  89. if (ch < 0xC2) {
  90. /* invalid sequence
  91. \x80-\xBF -- continuation byte
  92. \xC0-\xC1 -- fake 0000-007F */
  93. goto InvalidStart;
  94. }
  95. if (ch < 0xE0) {
  96. /* \xC2\x80-\xDF\xBF -- 0080-07FF */
  97. Py_UCS4 ch2;
  98. if (end - s < 2) {
  99. /* unexpected end of data: the caller will decide whether
  100. it's an error or not */
  101. break;
  102. }
  103. ch2 = (unsigned char)s[1];
  104. if (!IS_CONTINUATION_BYTE(ch2))
  105. /* invalid continuation byte */
  106. goto InvalidContinuation;
  107. ch = (ch << 6) + ch2 -
  108. ((0xC0 << 6) + 0x80);
  109. assert ((ch > 0x007F) && (ch <= 0x07FF));
  110. s += 2;
  111. if (STRINGLIB_MAX_CHAR <= 0x007F ||
  112. (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR))
  113. goto Overflow;
  114. *p++ = ch;
  115. continue;
  116. }
  117. if (ch < 0xF0) {
  118. /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
  119. Py_UCS4 ch2, ch3;
  120. if (end - s < 3) {
  121. /* unexpected end of data: the caller will decide whether
  122. it's an error or not */
  123. break;
  124. }
  125. ch2 = (unsigned char)s[1];
  126. ch3 = (unsigned char)s[2];
  127. if (!IS_CONTINUATION_BYTE(ch2) ||
  128. !IS_CONTINUATION_BYTE(ch3)) {
  129. /* invalid continuation byte */
  130. goto InvalidContinuation;
  131. }
  132. if (ch == 0xE0) {
  133. if (ch2 < 0xA0)
  134. /* invalid sequence
  135. \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */
  136. goto InvalidContinuation;
  137. }
  138. else if (ch == 0xED && ch2 > 0x9F) {
  139. /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
  140. will result in surrogates in range D800-DFFF. Surrogates are
  141. not valid UTF-8 so they are rejected.
  142. See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
  143. (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
  144. goto InvalidContinuation;
  145. }
  146. ch = (ch << 12) + (ch2 << 6) + ch3 -
  147. ((0xE0 << 12) + (0x80 << 6) + 0x80);
  148. assert ((ch > 0x07FF) && (ch <= 0xFFFF));
  149. s += 3;
  150. if (STRINGLIB_MAX_CHAR <= 0x07FF ||
  151. (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR))
  152. goto Overflow;
  153. *p++ = ch;
  154. continue;
  155. }
  156. if (ch < 0xF5) {
  157. /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
  158. Py_UCS4 ch2, ch3, ch4;
  159. if (end - s < 4) {
  160. /* unexpected end of data: the caller will decide whether
  161. it's an error or not */
  162. break;
  163. }
  164. ch2 = (unsigned char)s[1];
  165. ch3 = (unsigned char)s[2];
  166. ch4 = (unsigned char)s[3];
  167. if (!IS_CONTINUATION_BYTE(ch2) ||
  168. !IS_CONTINUATION_BYTE(ch3) ||
  169. !IS_CONTINUATION_BYTE(ch4)) {
  170. /* invalid continuation byte */
  171. goto InvalidContinuation;
  172. }
  173. if (ch == 0xF0) {
  174. if (ch2 < 0x90)
  175. /* invalid sequence
  176. \xF0\x80\x80\x80-\xF0\x80\xBF\xBF -- fake 0000-FFFF */
  177. goto InvalidContinuation;
  178. }
  179. else if (ch == 0xF4 && ch2 > 0x8F) {
  180. /* invalid sequence
  181. \xF4\x90\x80\80- -- 110000- overflow */
  182. goto InvalidContinuation;
  183. }
  184. ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 -
  185. ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80);
  186. assert ((ch > 0xFFFF) && (ch <= 0x10FFFF));
  187. s += 4;
  188. if (STRINGLIB_MAX_CHAR <= 0xFFFF ||
  189. (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR))
  190. goto Overflow;
  191. *p++ = ch;
  192. continue;
  193. }
  194. goto InvalidStart;
  195. }
  196. ch = 0;
  197. Overflow:
  198. Return:
  199. *inptr = s;
  200. *outpos = p - dest;
  201. return ch;
  202. InvalidStart:
  203. ch = 1;
  204. goto Return;
  205. InvalidContinuation:
  206. ch = 2;
  207. goto Return;
  208. }
  209. #undef ASCII_CHAR_MASK
  210. #undef IS_CONTINUATION_BYTE
  211. /* UTF-8 encoder specialized for a Unicode kind to avoid the slow
  212. PyUnicode_READ() macro. Delete some parts of the code depending on the kind:
  213. UCS-1 strings don't need to handle surrogates for example. */
  214. Py_LOCAL_INLINE(PyObject *)
  215. STRINGLIB(utf8_encoder)(PyObject *unicode,
  216. STRINGLIB_CHAR *data,
  217. Py_ssize_t size,
  218. const char *errors)
  219. {
  220. #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
  221. Py_ssize_t i; /* index into s of next input byte */
  222. PyObject *result; /* result string object */
  223. char *p; /* next free byte in output buffer */
  224. Py_ssize_t nallocated; /* number of result bytes allocated */
  225. Py_ssize_t nneeded; /* number of result bytes needed */
  226. #if STRINGLIB_SIZEOF_CHAR > 1
  227. PyObject *errorHandler = NULL;
  228. PyObject *exc = NULL;
  229. PyObject *rep = NULL;
  230. #endif
  231. #if STRINGLIB_SIZEOF_CHAR == 1
  232. const Py_ssize_t max_char_size = 2;
  233. char stackbuf[MAX_SHORT_UNICHARS * 2];
  234. #elif STRINGLIB_SIZEOF_CHAR == 2
  235. const Py_ssize_t max_char_size = 3;
  236. char stackbuf[MAX_SHORT_UNICHARS * 3];
  237. #else /* STRINGLIB_SIZEOF_CHAR == 4 */
  238. const Py_ssize_t max_char_size = 4;
  239. char stackbuf[MAX_SHORT_UNICHARS * 4];
  240. #endif
  241. assert(size >= 0);
  242. if (size <= MAX_SHORT_UNICHARS) {
  243. /* Write into the stack buffer; nallocated can't overflow.
  244. * At the end, we'll allocate exactly as much heap space as it
  245. * turns out we need.
  246. */
  247. nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
  248. result = NULL; /* will allocate after we're done */
  249. p = stackbuf;
  250. }
  251. else {
  252. if (size > PY_SSIZE_T_MAX / max_char_size) {
  253. /* integer overflow */
  254. return PyErr_NoMemory();
  255. }
  256. /* Overallocate on the heap, and give the excess back at the end. */
  257. nallocated = size * max_char_size;
  258. result = PyBytes_FromStringAndSize(NULL, nallocated);
  259. if (result == NULL)
  260. return NULL;
  261. p = PyBytes_AS_STRING(result);
  262. }
  263. for (i = 0; i < size;) {
  264. Py_UCS4 ch = data[i++];
  265. if (ch < 0x80) {
  266. /* Encode ASCII */
  267. *p++ = (char) ch;
  268. }
  269. else
  270. #if STRINGLIB_SIZEOF_CHAR > 1
  271. if (ch < 0x0800)
  272. #endif
  273. {
  274. /* Encode Latin-1 */
  275. *p++ = (char)(0xc0 | (ch >> 6));
  276. *p++ = (char)(0x80 | (ch & 0x3f));
  277. }
  278. #if STRINGLIB_SIZEOF_CHAR > 1
  279. else if (Py_UNICODE_IS_SURROGATE(ch)) {
  280. Py_ssize_t newpos;
  281. Py_ssize_t repsize, k, startpos;
  282. startpos = i-1;
  283. rep = unicode_encode_call_errorhandler(
  284. errors, &errorHandler, "utf-8", "surrogates not allowed",
  285. unicode, &exc, startpos, startpos+1, &newpos);
  286. if (!rep)
  287. goto error;
  288. if (PyBytes_Check(rep))
  289. repsize = PyBytes_GET_SIZE(rep);
  290. else
  291. repsize = PyUnicode_GET_LENGTH(rep);
  292. if (repsize > max_char_size) {
  293. Py_ssize_t offset;
  294. if (result == NULL)
  295. offset = p - stackbuf;
  296. else
  297. offset = p - PyBytes_AS_STRING(result);
  298. if (nallocated > PY_SSIZE_T_MAX - repsize + max_char_size) {
  299. /* integer overflow */
  300. PyErr_NoMemory();
  301. goto error;
  302. }
  303. nallocated += repsize - max_char_size;
  304. if (result != NULL) {
  305. if (_PyBytes_Resize(&result, nallocated) < 0)
  306. goto error;
  307. } else {
  308. result = PyBytes_FromStringAndSize(NULL, nallocated);
  309. if (result == NULL)
  310. goto error;
  311. Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
  312. }
  313. p = PyBytes_AS_STRING(result) + offset;
  314. }
  315. if (PyBytes_Check(rep)) {
  316. char *prep = PyBytes_AS_STRING(rep);
  317. for(k = repsize; k > 0; k--)
  318. *p++ = *prep++;
  319. } else /* rep is unicode */ {
  320. enum PyUnicode_Kind repkind;
  321. void *repdata;
  322. if (PyUnicode_READY(rep) < 0)
  323. goto error;
  324. repkind = PyUnicode_KIND(rep);
  325. repdata = PyUnicode_DATA(rep);
  326. for(k=0; k<repsize; k++) {
  327. Py_UCS4 c = PyUnicode_READ(repkind, repdata, k);
  328. if (0x80 <= c) {
  329. raise_encode_exception(&exc, "utf-8",
  330. unicode,
  331. i-1, i,
  332. "surrogates not allowed");
  333. goto error;
  334. }
  335. *p++ = (char)c;
  336. }
  337. }
  338. Py_CLEAR(rep);
  339. }
  340. else
  341. #if STRINGLIB_SIZEOF_CHAR > 2
  342. if (ch < 0x10000)
  343. #endif
  344. {
  345. *p++ = (char)(0xe0 | (ch >> 12));
  346. *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
  347. *p++ = (char)(0x80 | (ch & 0x3f));
  348. }
  349. #if STRINGLIB_SIZEOF_CHAR > 2
  350. else /* ch >= 0x10000 */
  351. {
  352. assert(ch <= MAX_UNICODE);
  353. /* Encode UCS4 Unicode ordinals */
  354. *p++ = (char)(0xf0 | (ch >> 18));
  355. *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
  356. *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
  357. *p++ = (char)(0x80 | (ch & 0x3f));
  358. }
  359. #endif /* STRINGLIB_SIZEOF_CHAR > 2 */
  360. #endif /* STRINGLIB_SIZEOF_CHAR > 1 */
  361. }
  362. if (result == NULL) {
  363. /* This was stack allocated. */
  364. nneeded = p - stackbuf;
  365. assert(nneeded <= nallocated);
  366. result = PyBytes_FromStringAndSize(stackbuf, nneeded);
  367. }
  368. else {
  369. /* Cut back to size actually needed. */
  370. nneeded = p - PyBytes_AS_STRING(result);
  371. assert(nneeded <= nallocated);
  372. _PyBytes_Resize(&result, nneeded);
  373. }
  374. #if STRINGLIB_SIZEOF_CHAR > 1
  375. Py_XDECREF(errorHandler);
  376. Py_XDECREF(exc);
  377. #endif
  378. return result;
  379. #if STRINGLIB_SIZEOF_CHAR > 1
  380. error:
  381. Py_XDECREF(rep);
  382. Py_XDECREF(errorHandler);
  383. Py_XDECREF(exc);
  384. Py_XDECREF(result);
  385. return NULL;
  386. #endif
  387. #undef MAX_SHORT_UNICHARS
  388. }
  389. /* The pattern for constructing UCS2-repeated masks. */
  390. #if SIZEOF_LONG == 8
  391. # define UCS2_REPEAT_MASK 0x0001000100010001ul
  392. #elif SIZEOF_LONG == 4
  393. # define UCS2_REPEAT_MASK 0x00010001ul
  394. #else
  395. # error C 'long' size should be either 4 or 8!
  396. #endif
  397. /* The mask for fast checking. */
  398. #if STRINGLIB_SIZEOF_CHAR == 1
  399. /* The mask for fast checking of whether a C 'long' contains a
  400. non-ASCII or non-Latin1 UTF16-encoded characters. */
  401. # define FAST_CHAR_MASK (UCS2_REPEAT_MASK * (0xFFFFu & ~STRINGLIB_MAX_CHAR))
  402. #else
  403. /* The mask for fast checking of whether a C 'long' may contain
  404. UTF16-encoded surrogate characters. This is an efficient heuristic,
  405. assuming that non-surrogate characters with a code point >= 0x8000 are
  406. rare in most input.
  407. */
  408. # define FAST_CHAR_MASK (UCS2_REPEAT_MASK * 0x8000u)
  409. #endif
  410. /* The mask for fast byte-swapping. */
  411. #define STRIPPED_MASK (UCS2_REPEAT_MASK * 0x00FFu)
  412. /* Swap bytes. */
  413. #define SWAB(value) ((((value) >> 8) & STRIPPED_MASK) | \
  414. (((value) & STRIPPED_MASK) << 8))
  415. Py_LOCAL_INLINE(Py_UCS4)
  416. STRINGLIB(utf16_decode)(const unsigned char **inptr, const unsigned char *e,
  417. STRINGLIB_CHAR *dest, Py_ssize_t *outpos,
  418. int native_ordering)
  419. {
  420. Py_UCS4 ch;
  421. const unsigned char *aligned_end =
  422. (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
  423. const unsigned char *q = *inptr;
  424. STRINGLIB_CHAR *p = dest + *outpos;
  425. /* Offsets from q for retrieving byte pairs in the right order. */
  426. #ifdef BYTEORDER_IS_LITTLE_ENDIAN
  427. int ihi = !!native_ordering, ilo = !native_ordering;
  428. #else
  429. int ihi = !native_ordering, ilo = !!native_ordering;
  430. #endif
  431. --e;
  432. while (q < e) {
  433. Py_UCS4 ch2;
  434. /* First check for possible aligned read of a C 'long'. Unaligned
  435. reads are more expensive, better to defer to another iteration. */
  436. if (!((size_t) q & LONG_PTR_MASK)) {
  437. /* Fast path for runs of in-range non-surrogate chars. */
  438. register const unsigned char *_q = q;
  439. while (_q < aligned_end) {
  440. unsigned long block = * (unsigned long *) _q;
  441. if (native_ordering) {
  442. /* Can use buffer directly */
  443. if (block & FAST_CHAR_MASK)
  444. break;
  445. }
  446. else {
  447. /* Need to byte-swap */
  448. if (block & SWAB(FAST_CHAR_MASK))
  449. break;
  450. #if STRINGLIB_SIZEOF_CHAR == 1
  451. block >>= 8;
  452. #else
  453. block = SWAB(block);
  454. #endif
  455. }
  456. #ifdef BYTEORDER_IS_LITTLE_ENDIAN
  457. # if SIZEOF_LONG == 4
  458. p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
  459. p[1] = (STRINGLIB_CHAR)(block >> 16);
  460. # elif SIZEOF_LONG == 8
  461. p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
  462. p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
  463. p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
  464. p[3] = (STRINGLIB_CHAR)(block >> 48);
  465. # endif
  466. #else
  467. # if SIZEOF_LONG == 4
  468. p[0] = (STRINGLIB_CHAR)(block >> 16);
  469. p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu);
  470. # elif SIZEOF_LONG == 8
  471. p[0] = (STRINGLIB_CHAR)(block >> 48);
  472. p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
  473. p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
  474. p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu);
  475. # endif
  476. #endif
  477. _q += SIZEOF_LONG;
  478. p += SIZEOF_LONG / 2;
  479. }
  480. q = _q;
  481. if (q >= e)
  482. break;
  483. }
  484. ch = (q[ihi] << 8) | q[ilo];
  485. q += 2;
  486. if (!Py_UNICODE_IS_SURROGATE(ch)) {
  487. #if STRINGLIB_SIZEOF_CHAR < 2
  488. if (ch > STRINGLIB_MAX_CHAR)
  489. /* Out-of-range */
  490. goto Return;
  491. #endif
  492. *p++ = (STRINGLIB_CHAR)ch;
  493. continue;
  494. }
  495. /* UTF-16 code pair: */
  496. if (q >= e)
  497. goto UnexpectedEnd;
  498. if (!Py_UNICODE_IS_HIGH_SURROGATE(ch))
  499. goto IllegalEncoding;
  500. ch2 = (q[ihi] << 8) | q[ilo];
  501. q += 2;
  502. if (!Py_UNICODE_IS_LOW_SURROGATE(ch2))
  503. goto IllegalSurrogate;
  504. ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2);
  505. #if STRINGLIB_SIZEOF_CHAR < 4
  506. /* Out-of-range */
  507. goto Return;
  508. #else
  509. *p++ = (STRINGLIB_CHAR)ch;
  510. #endif
  511. }
  512. ch = 0;
  513. Return:
  514. *inptr = q;
  515. *outpos = p - dest;
  516. return ch;
  517. UnexpectedEnd:
  518. ch = 1;
  519. goto Return;
  520. IllegalEncoding:
  521. ch = 2;
  522. goto Return;
  523. IllegalSurrogate:
  524. ch = 3;
  525. goto Return;
  526. }
  527. #undef UCS2_REPEAT_MASK
  528. #undef FAST_CHAR_MASK
  529. #undef STRIPPED_MASK
  530. #undef SWAB
  531. #undef LONG_PTR_MASK
  532. Py_LOCAL_INLINE(void)
  533. STRINGLIB(utf16_encode)(unsigned short *out,
  534. const STRINGLIB_CHAR *in,
  535. Py_ssize_t len,
  536. int native_ordering)
  537. {
  538. const STRINGLIB_CHAR *end = in + len;
  539. #if STRINGLIB_SIZEOF_CHAR == 1
  540. # define SWAB2(CH) ((CH) << 8)
  541. #else
  542. # define SWAB2(CH) (((CH) << 8) | ((CH) >> 8))
  543. #endif
  544. #if STRINGLIB_MAX_CHAR < 0x10000
  545. if (native_ordering) {
  546. # if STRINGLIB_SIZEOF_CHAR == 2
  547. Py_MEMCPY(out, in, 2 * len);
  548. # else
  549. _PyUnicode_CONVERT_BYTES(STRINGLIB_CHAR, unsigned short, in, end, out);
  550. # endif
  551. } else {
  552. const STRINGLIB_CHAR *unrolled_end = in + (len & ~ (Py_ssize_t) 3);
  553. while (in < unrolled_end) {
  554. out[0] = SWAB2(in[0]);
  555. out[1] = SWAB2(in[1]);
  556. out[2] = SWAB2(in[2]);
  557. out[3] = SWAB2(in[3]);
  558. in += 4; out += 4;
  559. }
  560. while (in < end) {
  561. *out++ = SWAB2(*in);
  562. ++in;
  563. }
  564. }
  565. #else
  566. if (native_ordering) {
  567. while (in < end) {
  568. Py_UCS4 ch = *in++;
  569. if (ch < 0x10000)
  570. *out++ = ch;
  571. else {
  572. out[0] = Py_UNICODE_HIGH_SURROGATE(ch);
  573. out[1] = Py_UNICODE_LOW_SURROGATE(ch);
  574. out += 2;
  575. }
  576. }
  577. } else {
  578. while (in < end) {
  579. Py_UCS4 ch = *in++;
  580. if (ch < 0x10000)
  581. *out++ = SWAB2((Py_UCS2)ch);
  582. else {
  583. Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch);
  584. Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
  585. out[0] = SWAB2(ch1);
  586. out[1] = SWAB2(ch2);
  587. out += 2;
  588. }
  589. }
  590. }
  591. #endif
  592. #undef SWAB2
  593. }
  594. #endif /* STRINGLIB_IS_UNICODE */