You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

723 lines
24 KiB

  1. /* stringlib: codec implementations */
  2. #if STRINGLIB_IS_UNICODE
  3. /* Mask to quickly check whether a C 'long' contains a
  4. non-ASCII, UTF8-encoded char. */
  5. #if (SIZEOF_LONG == 8)
  6. # define ASCII_CHAR_MASK 0x8080808080808080UL
  7. #elif (SIZEOF_LONG == 4)
  8. # define ASCII_CHAR_MASK 0x80808080UL
  9. #else
  10. # error C 'long' size should be either 4 or 8!
  11. #endif
  12. /* 10xxxxxx */
  13. #define IS_CONTINUATION_BYTE(ch) ((ch) >= 0x80 && (ch) < 0xC0)
  14. Py_LOCAL_INLINE(Py_UCS4)
  15. STRINGLIB(utf8_decode)(const char **inptr, const char *end,
  16. STRINGLIB_CHAR *dest,
  17. Py_ssize_t *outpos)
  18. {
  19. Py_UCS4 ch;
  20. const char *s = *inptr;
  21. const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
  22. STRINGLIB_CHAR *p = dest + *outpos;
  23. while (s < end) {
  24. ch = (unsigned char)*s;
  25. if (ch < 0x80) {
  26. /* Fast path for runs of ASCII characters. Given that common UTF-8
  27. input will consist of an overwhelming majority of ASCII
  28. characters, we try to optimize for this case by checking
  29. as many characters as a C 'long' can contain.
  30. First, check if we can do an aligned read, as most CPUs have
  31. a penalty for unaligned reads.
  32. */
  33. if (_Py_IS_ALIGNED(s, SIZEOF_LONG)) {
  34. /* Help register allocation */
  35. const char *_s = s;
  36. STRINGLIB_CHAR *_p = p;
  37. while (_s < aligned_end) {
  38. /* Read a whole long at a time (either 4 or 8 bytes),
  39. and do a fast unrolled copy if it only contains ASCII
  40. characters. */
  41. unsigned long value = *(unsigned long *) _s;
  42. if (value & ASCII_CHAR_MASK)
  43. break;
  44. #if PY_LITTLE_ENDIAN
  45. _p[0] = (STRINGLIB_CHAR)(value & 0xFFu);
  46. _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
  47. _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
  48. _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
  49. # if SIZEOF_LONG == 8
  50. _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
  51. _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
  52. _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
  53. _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
  54. # endif
  55. #else
  56. # if SIZEOF_LONG == 8
  57. _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
  58. _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
  59. _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
  60. _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
  61. _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
  62. _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
  63. _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
  64. _p[7] = (STRINGLIB_CHAR)(value & 0xFFu);
  65. # else
  66. _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
  67. _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
  68. _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
  69. _p[3] = (STRINGLIB_CHAR)(value & 0xFFu);
  70. # endif
  71. #endif
  72. _s += SIZEOF_LONG;
  73. _p += SIZEOF_LONG;
  74. }
  75. s = _s;
  76. p = _p;
  77. if (s == end)
  78. break;
  79. ch = (unsigned char)*s;
  80. }
  81. if (ch < 0x80) {
  82. s++;
  83. *p++ = ch;
  84. continue;
  85. }
  86. }
  87. if (ch < 0xE0) {
  88. /* \xC2\x80-\xDF\xBF -- 0080-07FF */
  89. Py_UCS4 ch2;
  90. if (ch < 0xC2) {
  91. /* invalid sequence
  92. \x80-\xBF -- continuation byte
  93. \xC0-\xC1 -- fake 0000-007F */
  94. goto InvalidStart;
  95. }
  96. if (end - s < 2) {
  97. /* unexpected end of data: the caller will decide whether
  98. it's an error or not */
  99. break;
  100. }
  101. ch2 = (unsigned char)s[1];
  102. if (!IS_CONTINUATION_BYTE(ch2))
  103. /* invalid continuation byte */
  104. goto InvalidContinuation1;
  105. ch = (ch << 6) + ch2 -
  106. ((0xC0 << 6) + 0x80);
  107. assert ((ch > 0x007F) && (ch <= 0x07FF));
  108. s += 2;
  109. if (STRINGLIB_MAX_CHAR <= 0x007F ||
  110. (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR))
  111. /* Out-of-range */
  112. goto Return;
  113. *p++ = ch;
  114. continue;
  115. }
  116. if (ch < 0xF0) {
  117. /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
  118. Py_UCS4 ch2, ch3;
  119. if (end - s < 3) {
  120. /* unexpected end of data: the caller will decide whether
  121. it's an error or not */
  122. if (end - s < 2)
  123. break;
  124. ch2 = (unsigned char)s[1];
  125. if (!IS_CONTINUATION_BYTE(ch2) ||
  126. (ch2 < 0xA0 ? ch == 0xE0 : ch == 0xED))
  127. /* for clarification see comments below */
  128. goto InvalidContinuation1;
  129. break;
  130. }
  131. ch2 = (unsigned char)s[1];
  132. ch3 = (unsigned char)s[2];
  133. if (!IS_CONTINUATION_BYTE(ch2)) {
  134. /* invalid continuation byte */
  135. goto InvalidContinuation1;
  136. }
  137. if (ch == 0xE0) {
  138. if (ch2 < 0xA0)
  139. /* invalid sequence
  140. \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */
  141. goto InvalidContinuation1;
  142. } else if (ch == 0xED && ch2 >= 0xA0) {
  143. /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
  144. will result in surrogates in range D800-DFFF. Surrogates are
  145. not valid UTF-8 so they are rejected.
  146. See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
  147. (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
  148. goto InvalidContinuation1;
  149. }
  150. if (!IS_CONTINUATION_BYTE(ch3)) {
  151. /* invalid continuation byte */
  152. goto InvalidContinuation2;
  153. }
  154. ch = (ch << 12) + (ch2 << 6) + ch3 -
  155. ((0xE0 << 12) + (0x80 << 6) + 0x80);
  156. assert ((ch > 0x07FF) && (ch <= 0xFFFF));
  157. s += 3;
  158. if (STRINGLIB_MAX_CHAR <= 0x07FF ||
  159. (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR))
  160. /* Out-of-range */
  161. goto Return;
  162. *p++ = ch;
  163. continue;
  164. }
  165. if (ch < 0xF5) {
  166. /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
  167. Py_UCS4 ch2, ch3, ch4;
  168. if (end - s < 4) {
  169. /* unexpected end of data: the caller will decide whether
  170. it's an error or not */
  171. if (end - s < 2)
  172. break;
  173. ch2 = (unsigned char)s[1];
  174. if (!IS_CONTINUATION_BYTE(ch2) ||
  175. (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4))
  176. /* for clarification see comments below */
  177. goto InvalidContinuation1;
  178. if (end - s < 3)
  179. break;
  180. ch3 = (unsigned char)s[2];
  181. if (!IS_CONTINUATION_BYTE(ch3))
  182. goto InvalidContinuation2;
  183. break;
  184. }
  185. ch2 = (unsigned char)s[1];
  186. ch3 = (unsigned char)s[2];
  187. ch4 = (unsigned char)s[3];
  188. if (!IS_CONTINUATION_BYTE(ch2)) {
  189. /* invalid continuation byte */
  190. goto InvalidContinuation1;
  191. }
  192. if (ch == 0xF0) {
  193. if (ch2 < 0x90)
  194. /* invalid sequence
  195. \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */
  196. goto InvalidContinuation1;
  197. } else if (ch == 0xF4 && ch2 >= 0x90) {
  198. /* invalid sequence
  199. \xF4\x90\x80\80- -- 110000- overflow */
  200. goto InvalidContinuation1;
  201. }
  202. if (!IS_CONTINUATION_BYTE(ch3)) {
  203. /* invalid continuation byte */
  204. goto InvalidContinuation2;
  205. }
  206. if (!IS_CONTINUATION_BYTE(ch4)) {
  207. /* invalid continuation byte */
  208. goto InvalidContinuation3;
  209. }
  210. ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 -
  211. ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80);
  212. assert ((ch > 0xFFFF) && (ch <= 0x10FFFF));
  213. s += 4;
  214. if (STRINGLIB_MAX_CHAR <= 0xFFFF ||
  215. (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR))
  216. /* Out-of-range */
  217. goto Return;
  218. *p++ = ch;
  219. continue;
  220. }
  221. goto InvalidStart;
  222. }
  223. ch = 0;
  224. Return:
  225. *inptr = s;
  226. *outpos = p - dest;
  227. return ch;
  228. InvalidStart:
  229. ch = 1;
  230. goto Return;
  231. InvalidContinuation1:
  232. ch = 2;
  233. goto Return;
  234. InvalidContinuation2:
  235. ch = 3;
  236. goto Return;
  237. InvalidContinuation3:
  238. ch = 4;
  239. goto Return;
  240. }
  241. #undef ASCII_CHAR_MASK
  242. /* UTF-8 encoder specialized for a Unicode kind to avoid the slow
  243. PyUnicode_READ() macro. Delete some parts of the code depending on the kind:
  244. UCS-1 strings don't need to handle surrogates for example. */
  245. Py_LOCAL_INLINE(PyObject *)
  246. STRINGLIB(utf8_encoder)(PyObject *unicode,
  247. STRINGLIB_CHAR *data,
  248. Py_ssize_t size,
  249. const char *errors)
  250. {
  251. #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
  252. Py_ssize_t i; /* index into s of next input byte */
  253. PyObject *result; /* result string object */
  254. char *p; /* next free byte in output buffer */
  255. Py_ssize_t nallocated; /* number of result bytes allocated */
  256. Py_ssize_t nneeded; /* number of result bytes needed */
  257. #if STRINGLIB_SIZEOF_CHAR > 1
  258. PyObject *errorHandler = NULL;
  259. PyObject *exc = NULL;
  260. PyObject *rep = NULL;
  261. #endif
  262. #if STRINGLIB_SIZEOF_CHAR == 1
  263. const Py_ssize_t max_char_size = 2;
  264. char stackbuf[MAX_SHORT_UNICHARS * 2];
  265. #elif STRINGLIB_SIZEOF_CHAR == 2
  266. const Py_ssize_t max_char_size = 3;
  267. char stackbuf[MAX_SHORT_UNICHARS * 3];
  268. #else /* STRINGLIB_SIZEOF_CHAR == 4 */
  269. const Py_ssize_t max_char_size = 4;
  270. char stackbuf[MAX_SHORT_UNICHARS * 4];
  271. #endif
  272. assert(size >= 0);
  273. if (size <= MAX_SHORT_UNICHARS) {
  274. /* Write into the stack buffer; nallocated can't overflow.
  275. * At the end, we'll allocate exactly as much heap space as it
  276. * turns out we need.
  277. */
  278. nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
  279. result = NULL; /* will allocate after we're done */
  280. p = stackbuf;
  281. }
  282. else {
  283. if (size > PY_SSIZE_T_MAX / max_char_size) {
  284. /* integer overflow */
  285. return PyErr_NoMemory();
  286. }
  287. /* Overallocate on the heap, and give the excess back at the end. */
  288. nallocated = size * max_char_size;
  289. result = PyBytes_FromStringAndSize(NULL, nallocated);
  290. if (result == NULL)
  291. return NULL;
  292. p = PyBytes_AS_STRING(result);
  293. }
  294. for (i = 0; i < size;) {
  295. Py_UCS4 ch = data[i++];
  296. if (ch < 0x80) {
  297. /* Encode ASCII */
  298. *p++ = (char) ch;
  299. }
  300. else
  301. #if STRINGLIB_SIZEOF_CHAR > 1
  302. if (ch < 0x0800)
  303. #endif
  304. {
  305. /* Encode Latin-1 */
  306. *p++ = (char)(0xc0 | (ch >> 6));
  307. *p++ = (char)(0x80 | (ch & 0x3f));
  308. }
  309. #if STRINGLIB_SIZEOF_CHAR > 1
  310. else if (Py_UNICODE_IS_SURROGATE(ch)) {
  311. Py_ssize_t newpos;
  312. Py_ssize_t repsize, k, startpos;
  313. startpos = i-1;
  314. rep = unicode_encode_call_errorhandler(
  315. errors, &errorHandler, "utf-8", "surrogates not allowed",
  316. unicode, &exc, startpos, startpos+1, &newpos);
  317. if (!rep)
  318. goto error;
  319. if (PyBytes_Check(rep))
  320. repsize = PyBytes_GET_SIZE(rep);
  321. else
  322. repsize = PyUnicode_GET_LENGTH(rep);
  323. if (repsize > max_char_size) {
  324. Py_ssize_t offset;
  325. if (result == NULL)
  326. offset = p - stackbuf;
  327. else
  328. offset = p - PyBytes_AS_STRING(result);
  329. if (nallocated > PY_SSIZE_T_MAX - repsize + max_char_size) {
  330. /* integer overflow */
  331. PyErr_NoMemory();
  332. goto error;
  333. }
  334. nallocated += repsize - max_char_size;
  335. if (result != NULL) {
  336. if (_PyBytes_Resize(&result, nallocated) < 0)
  337. goto error;
  338. } else {
  339. result = PyBytes_FromStringAndSize(NULL, nallocated);
  340. if (result == NULL)
  341. goto error;
  342. Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
  343. }
  344. p = PyBytes_AS_STRING(result) + offset;
  345. }
  346. if (PyBytes_Check(rep)) {
  347. char *prep = PyBytes_AS_STRING(rep);
  348. for(k = repsize; k > 0; k--)
  349. *p++ = *prep++;
  350. } else /* rep is unicode */ {
  351. enum PyUnicode_Kind repkind;
  352. void *repdata;
  353. if (PyUnicode_READY(rep) < 0)
  354. goto error;
  355. repkind = PyUnicode_KIND(rep);
  356. repdata = PyUnicode_DATA(rep);
  357. for(k=0; k<repsize; k++) {
  358. Py_UCS4 c = PyUnicode_READ(repkind, repdata, k);
  359. if (0x80 <= c) {
  360. raise_encode_exception(&exc, "utf-8",
  361. unicode,
  362. i-1, i,
  363. "surrogates not allowed");
  364. goto error;
  365. }
  366. *p++ = (char)c;
  367. }
  368. }
  369. Py_CLEAR(rep);
  370. }
  371. else
  372. #if STRINGLIB_SIZEOF_CHAR > 2
  373. if (ch < 0x10000)
  374. #endif
  375. {
  376. *p++ = (char)(0xe0 | (ch >> 12));
  377. *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
  378. *p++ = (char)(0x80 | (ch & 0x3f));
  379. }
  380. #if STRINGLIB_SIZEOF_CHAR > 2
  381. else /* ch >= 0x10000 */
  382. {
  383. assert(ch <= MAX_UNICODE);
  384. /* Encode UCS4 Unicode ordinals */
  385. *p++ = (char)(0xf0 | (ch >> 18));
  386. *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
  387. *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
  388. *p++ = (char)(0x80 | (ch & 0x3f));
  389. }
  390. #endif /* STRINGLIB_SIZEOF_CHAR > 2 */
  391. #endif /* STRINGLIB_SIZEOF_CHAR > 1 */
  392. }
  393. if (result == NULL) {
  394. /* This was stack allocated. */
  395. nneeded = p - stackbuf;
  396. assert(nneeded <= nallocated);
  397. result = PyBytes_FromStringAndSize(stackbuf, nneeded);
  398. }
  399. else {
  400. /* Cut back to size actually needed. */
  401. nneeded = p - PyBytes_AS_STRING(result);
  402. assert(nneeded <= nallocated);
  403. _PyBytes_Resize(&result, nneeded);
  404. }
  405. #if STRINGLIB_SIZEOF_CHAR > 1
  406. Py_XDECREF(errorHandler);
  407. Py_XDECREF(exc);
  408. #endif
  409. return result;
  410. #if STRINGLIB_SIZEOF_CHAR > 1
  411. error:
  412. Py_XDECREF(rep);
  413. Py_XDECREF(errorHandler);
  414. Py_XDECREF(exc);
  415. Py_XDECREF(result);
  416. return NULL;
  417. #endif
  418. #undef MAX_SHORT_UNICHARS
  419. }
  420. /* The pattern for constructing UCS2-repeated masks. */
  421. #if SIZEOF_LONG == 8
  422. # define UCS2_REPEAT_MASK 0x0001000100010001ul
  423. #elif SIZEOF_LONG == 4
  424. # define UCS2_REPEAT_MASK 0x00010001ul
  425. #else
  426. # error C 'long' size should be either 4 or 8!
  427. #endif
  428. /* The mask for fast checking. */
  429. #if STRINGLIB_SIZEOF_CHAR == 1
  430. /* The mask for fast checking of whether a C 'long' contains a
  431. non-ASCII or non-Latin1 UTF16-encoded characters. */
  432. # define FAST_CHAR_MASK (UCS2_REPEAT_MASK * (0xFFFFu & ~STRINGLIB_MAX_CHAR))
  433. #else
  434. /* The mask for fast checking of whether a C 'long' may contain
  435. UTF16-encoded surrogate characters. This is an efficient heuristic,
  436. assuming that non-surrogate characters with a code point >= 0x8000 are
  437. rare in most input.
  438. */
  439. # define FAST_CHAR_MASK (UCS2_REPEAT_MASK * 0x8000u)
  440. #endif
  441. /* The mask for fast byte-swapping. */
  442. #define STRIPPED_MASK (UCS2_REPEAT_MASK * 0x00FFu)
  443. /* Swap bytes. */
  444. #define SWAB(value) ((((value) >> 8) & STRIPPED_MASK) | \
  445. (((value) & STRIPPED_MASK) << 8))
  446. Py_LOCAL_INLINE(Py_UCS4)
  447. STRINGLIB(utf16_decode)(const unsigned char **inptr, const unsigned char *e,
  448. STRINGLIB_CHAR *dest, Py_ssize_t *outpos,
  449. int native_ordering)
  450. {
  451. Py_UCS4 ch;
  452. const unsigned char *aligned_end =
  453. (const unsigned char *) _Py_ALIGN_DOWN(e, SIZEOF_LONG);
  454. const unsigned char *q = *inptr;
  455. STRINGLIB_CHAR *p = dest + *outpos;
  456. /* Offsets from q for retrieving byte pairs in the right order. */
  457. #if PY_LITTLE_ENDIAN
  458. int ihi = !!native_ordering, ilo = !native_ordering;
  459. #else
  460. int ihi = !native_ordering, ilo = !!native_ordering;
  461. #endif
  462. --e;
  463. while (q < e) {
  464. Py_UCS4 ch2;
  465. /* First check for possible aligned read of a C 'long'. Unaligned
  466. reads are more expensive, better to defer to another iteration. */
  467. if (_Py_IS_ALIGNED(q, SIZEOF_LONG)) {
  468. /* Fast path for runs of in-range non-surrogate chars. */
  469. const unsigned char *_q = q;
  470. while (_q < aligned_end) {
  471. unsigned long block = * (unsigned long *) _q;
  472. if (native_ordering) {
  473. /* Can use buffer directly */
  474. if (block & FAST_CHAR_MASK)
  475. break;
  476. }
  477. else {
  478. /* Need to byte-swap */
  479. if (block & SWAB(FAST_CHAR_MASK))
  480. break;
  481. #if STRINGLIB_SIZEOF_CHAR == 1
  482. block >>= 8;
  483. #else
  484. block = SWAB(block);
  485. #endif
  486. }
  487. #if PY_LITTLE_ENDIAN
  488. # if SIZEOF_LONG == 4
  489. p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
  490. p[1] = (STRINGLIB_CHAR)(block >> 16);
  491. # elif SIZEOF_LONG == 8
  492. p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
  493. p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
  494. p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
  495. p[3] = (STRINGLIB_CHAR)(block >> 48);
  496. # endif
  497. #else
  498. # if SIZEOF_LONG == 4
  499. p[0] = (STRINGLIB_CHAR)(block >> 16);
  500. p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu);
  501. # elif SIZEOF_LONG == 8
  502. p[0] = (STRINGLIB_CHAR)(block >> 48);
  503. p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
  504. p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
  505. p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu);
  506. # endif
  507. #endif
  508. _q += SIZEOF_LONG;
  509. p += SIZEOF_LONG / 2;
  510. }
  511. q = _q;
  512. if (q >= e)
  513. break;
  514. }
  515. ch = (q[ihi] << 8) | q[ilo];
  516. q += 2;
  517. if (!Py_UNICODE_IS_SURROGATE(ch)) {
  518. #if STRINGLIB_SIZEOF_CHAR < 2
  519. if (ch > STRINGLIB_MAX_CHAR)
  520. /* Out-of-range */
  521. goto Return;
  522. #endif
  523. *p++ = (STRINGLIB_CHAR)ch;
  524. continue;
  525. }
  526. /* UTF-16 code pair: */
  527. if (q >= e)
  528. goto UnexpectedEnd;
  529. if (!Py_UNICODE_IS_HIGH_SURROGATE(ch))
  530. goto IllegalEncoding;
  531. ch2 = (q[ihi] << 8) | q[ilo];
  532. q += 2;
  533. if (!Py_UNICODE_IS_LOW_SURROGATE(ch2))
  534. goto IllegalSurrogate;
  535. ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2);
  536. #if STRINGLIB_SIZEOF_CHAR < 4
  537. /* Out-of-range */
  538. goto Return;
  539. #else
  540. *p++ = (STRINGLIB_CHAR)ch;
  541. #endif
  542. }
  543. ch = 0;
  544. Return:
  545. *inptr = q;
  546. *outpos = p - dest;
  547. return ch;
  548. UnexpectedEnd:
  549. ch = 1;
  550. goto Return;
  551. IllegalEncoding:
  552. ch = 2;
  553. goto Return;
  554. IllegalSurrogate:
  555. ch = 3;
  556. goto Return;
  557. }
  558. #undef UCS2_REPEAT_MASK
  559. #undef FAST_CHAR_MASK
  560. #undef STRIPPED_MASK
  561. #undef SWAB
  562. #if STRINGLIB_MAX_CHAR >= 0x80
  563. Py_LOCAL_INLINE(Py_ssize_t)
  564. STRINGLIB(utf16_encode)(const STRINGLIB_CHAR *in,
  565. Py_ssize_t len,
  566. unsigned short **outptr,
  567. int native_ordering)
  568. {
  569. unsigned short *out = *outptr;
  570. const STRINGLIB_CHAR *end = in + len;
  571. #if STRINGLIB_SIZEOF_CHAR == 1
  572. if (native_ordering) {
  573. const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
  574. while (in < unrolled_end) {
  575. out[0] = in[0];
  576. out[1] = in[1];
  577. out[2] = in[2];
  578. out[3] = in[3];
  579. in += 4; out += 4;
  580. }
  581. while (in < end) {
  582. *out++ = *in++;
  583. }
  584. } else {
  585. # define SWAB2(CH) ((CH) << 8) /* high byte is zero */
  586. const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
  587. while (in < unrolled_end) {
  588. out[0] = SWAB2(in[0]);
  589. out[1] = SWAB2(in[1]);
  590. out[2] = SWAB2(in[2]);
  591. out[3] = SWAB2(in[3]);
  592. in += 4; out += 4;
  593. }
  594. while (in < end) {
  595. Py_UCS4 ch = *in++;
  596. *out++ = SWAB2((Py_UCS2)ch);
  597. }
  598. #undef SWAB2
  599. }
  600. *outptr = out;
  601. return len;
  602. #else
  603. if (native_ordering) {
  604. #if STRINGLIB_MAX_CHAR < 0x10000
  605. const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
  606. while (in < unrolled_end) {
  607. /* check if any character is a surrogate character */
  608. if (((in[0] ^ 0xd800) &
  609. (in[1] ^ 0xd800) &
  610. (in[2] ^ 0xd800) &
  611. (in[3] ^ 0xd800) & 0xf800) == 0)
  612. break;
  613. out[0] = in[0];
  614. out[1] = in[1];
  615. out[2] = in[2];
  616. out[3] = in[3];
  617. in += 4; out += 4;
  618. }
  619. #endif
  620. while (in < end) {
  621. Py_UCS4 ch;
  622. ch = *in++;
  623. if (ch < 0xd800)
  624. *out++ = ch;
  625. else if (ch < 0xe000)
  626. /* reject surrogate characters (U+DC800-U+DFFF) */
  627. goto fail;
  628. #if STRINGLIB_MAX_CHAR >= 0x10000
  629. else if (ch >= 0x10000) {
  630. out[0] = Py_UNICODE_HIGH_SURROGATE(ch);
  631. out[1] = Py_UNICODE_LOW_SURROGATE(ch);
  632. out += 2;
  633. }
  634. #endif
  635. else
  636. *out++ = ch;
  637. }
  638. } else {
  639. #define SWAB2(CH) (((CH) << 8) | ((CH) >> 8))
  640. #if STRINGLIB_MAX_CHAR < 0x10000
  641. const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
  642. while (in < unrolled_end) {
  643. /* check if any character is a surrogate character */
  644. if (((in[0] ^ 0xd800) &
  645. (in[1] ^ 0xd800) &
  646. (in[2] ^ 0xd800) &
  647. (in[3] ^ 0xd800) & 0xf800) == 0)
  648. break;
  649. out[0] = SWAB2(in[0]);
  650. out[1] = SWAB2(in[1]);
  651. out[2] = SWAB2(in[2]);
  652. out[3] = SWAB2(in[3]);
  653. in += 4; out += 4;
  654. }
  655. #endif
  656. while (in < end) {
  657. Py_UCS4 ch = *in++;
  658. if (ch < 0xd800)
  659. *out++ = SWAB2((Py_UCS2)ch);
  660. else if (ch < 0xe000)
  661. /* reject surrogate characters (U+DC800-U+DFFF) */
  662. goto fail;
  663. #if STRINGLIB_MAX_CHAR >= 0x10000
  664. else if (ch >= 0x10000) {
  665. Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch);
  666. Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
  667. out[0] = SWAB2(ch1);
  668. out[1] = SWAB2(ch2);
  669. out += 2;
  670. }
  671. #endif
  672. else
  673. *out++ = SWAB2((Py_UCS2)ch);
  674. }
  675. #undef SWAB2
  676. }
  677. *outptr = out;
  678. return len;
  679. fail:
  680. *outptr = out;
  681. return len - (end - in + 1);
  682. #endif
  683. }
  684. #endif
  685. #endif /* STRINGLIB_IS_UNICODE */