You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

353 lines
11 KiB

  1. /* stringlib: codec implementations */
  2. #if STRINGLIB_IS_UNICODE
  3. /* Mask to check or force alignment of a pointer to C 'long' boundaries */
  4. #define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
  5. /* Mask to quickly check whether a C 'long' contains a
  6. non-ASCII, UTF8-encoded char. */
  7. #if (SIZEOF_LONG == 8)
  8. # define ASCII_CHAR_MASK 0x8080808080808080L
  9. #elif (SIZEOF_LONG == 4)
  10. # define ASCII_CHAR_MASK 0x80808080L
  11. #else
  12. # error C 'long' size should be either 4 or 8!
  13. #endif
  14. Py_LOCAL_INLINE(int)
  15. STRINGLIB(utf8_try_decode)(const char *start, const char *end,
  16. STRINGLIB_CHAR *dest,
  17. const char **src_pos, Py_ssize_t *dest_index)
  18. {
  19. int ret;
  20. Py_ssize_t n;
  21. const char *s = start;
  22. const char *aligned_end = (const char *) ((size_t) end & ~LONG_PTR_MASK);
  23. STRINGLIB_CHAR *p = dest;
  24. while (s < end) {
  25. Py_UCS4 ch = (unsigned char)*s;
  26. if (ch < 0x80) {
  27. /* Fast path for runs of ASCII characters. Given that common UTF-8
  28. input will consist of an overwhelming majority of ASCII
  29. characters, we try to optimize for this case by checking
  30. as many characters as a C 'long' can contain.
  31. First, check if we can do an aligned read, as most CPUs have
  32. a penalty for unaligned reads.
  33. */
  34. if (!((size_t) s & LONG_PTR_MASK)) {
  35. /* Help register allocation */
  36. register const char *_s = s;
  37. register STRINGLIB_CHAR *_p = p;
  38. while (_s < aligned_end) {
  39. /* Read a whole long at a time (either 4 or 8 bytes),
  40. and do a fast unrolled copy if it only contains ASCII
  41. characters. */
  42. unsigned long value = *(unsigned long *) _s;
  43. if (value & ASCII_CHAR_MASK)
  44. break;
  45. _p[0] = _s[0];
  46. _p[1] = _s[1];
  47. _p[2] = _s[2];
  48. _p[3] = _s[3];
  49. #if (SIZEOF_LONG == 8)
  50. _p[4] = _s[4];
  51. _p[5] = _s[5];
  52. _p[6] = _s[6];
  53. _p[7] = _s[7];
  54. #endif
  55. _s += SIZEOF_LONG;
  56. _p += SIZEOF_LONG;
  57. }
  58. s = _s;
  59. p = _p;
  60. if (s == end)
  61. break;
  62. ch = (unsigned char)*s;
  63. }
  64. }
  65. if (ch < 0x80) {
  66. s++;
  67. *p++ = ch;
  68. continue;
  69. }
  70. n = utf8_code_length[ch];
  71. if (s + n > end) {
  72. /* unexpected end of data: the caller will decide whether
  73. it's an error or not */
  74. goto _error;
  75. }
  76. switch (n) {
  77. case 0:
  78. /* invalid start byte */
  79. goto _error;
  80. case 1:
  81. /* internal error */
  82. goto _error;
  83. case 2:
  84. if ((s[1] & 0xc0) != 0x80)
  85. /* invalid continuation byte */
  86. goto _error;
  87. ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
  88. assert ((ch > 0x007F) && (ch <= 0x07FF));
  89. s += 2;
  90. *p++ = ch;
  91. break;
  92. case 3:
  93. /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
  94. will result in surrogates in range d800-dfff. Surrogates are
  95. not valid UTF-8 so they are rejected.
  96. See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
  97. (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
  98. if ((s[1] & 0xc0) != 0x80 ||
  99. (s[2] & 0xc0) != 0x80 ||
  100. ((unsigned char)s[0] == 0xE0 &&
  101. (unsigned char)s[1] < 0xA0) ||
  102. ((unsigned char)s[0] == 0xED &&
  103. (unsigned char)s[1] > 0x9F)) {
  104. /* invalid continuation byte */
  105. goto _error;
  106. }
  107. ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
  108. assert ((ch > 0x07FF) && (ch <= 0xFFFF));
  109. s += 3;
  110. *p++ = ch;
  111. break;
  112. case 4:
  113. if ((s[1] & 0xc0) != 0x80 ||
  114. (s[2] & 0xc0) != 0x80 ||
  115. (s[3] & 0xc0) != 0x80 ||
  116. ((unsigned char)s[0] == 0xF0 &&
  117. (unsigned char)s[1] < 0x90) ||
  118. ((unsigned char)s[0] == 0xF4 &&
  119. (unsigned char)s[1] > 0x8F)) {
  120. /* invalid continuation byte */
  121. goto _error;
  122. }
  123. ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
  124. ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
  125. assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
  126. s += 4;
  127. *p++ = ch;
  128. break;
  129. }
  130. }
  131. ret = 0;
  132. goto _ok;
  133. _error:
  134. ret = -1;
  135. _ok:
  136. *src_pos = s;
  137. *dest_index = p - dest;
  138. return ret;
  139. }
  140. #undef LONG_PTR_MASK
  141. #undef ASCII_CHAR_MASK
  142. /* UTF-8 encoder specialized for a Unicode kind to avoid the slow
  143. PyUnicode_READ() macro. Delete some parts of the code depending on the kind:
  144. UCS-1 strings don't need to handle surrogates for example. */
  145. Py_LOCAL_INLINE(PyObject *)
  146. STRINGLIB(utf8_encoder)(PyObject *unicode,
  147. STRINGLIB_CHAR *data,
  148. Py_ssize_t size,
  149. const char *errors)
  150. {
  151. #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
  152. Py_ssize_t i; /* index into s of next input byte */
  153. PyObject *result; /* result string object */
  154. char *p; /* next free byte in output buffer */
  155. Py_ssize_t nallocated; /* number of result bytes allocated */
  156. Py_ssize_t nneeded; /* number of result bytes needed */
  157. #if STRINGLIB_SIZEOF_CHAR > 1
  158. PyObject *errorHandler = NULL;
  159. PyObject *exc = NULL;
  160. PyObject *rep = NULL;
  161. #endif
  162. #if STRINGLIB_SIZEOF_CHAR == 1
  163. const Py_ssize_t max_char_size = 2;
  164. char stackbuf[MAX_SHORT_UNICHARS * 2];
  165. #elif STRINGLIB_SIZEOF_CHAR == 2
  166. const Py_ssize_t max_char_size = 3;
  167. char stackbuf[MAX_SHORT_UNICHARS * 3];
  168. #else /* STRINGLIB_SIZEOF_CHAR == 4 */
  169. const Py_ssize_t max_char_size = 4;
  170. char stackbuf[MAX_SHORT_UNICHARS * 4];
  171. #endif
  172. assert(size >= 0);
  173. if (size <= MAX_SHORT_UNICHARS) {
  174. /* Write into the stack buffer; nallocated can't overflow.
  175. * At the end, we'll allocate exactly as much heap space as it
  176. * turns out we need.
  177. */
  178. nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
  179. result = NULL; /* will allocate after we're done */
  180. p = stackbuf;
  181. }
  182. else {
  183. if (size > PY_SSIZE_T_MAX / max_char_size) {
  184. /* integer overflow */
  185. return PyErr_NoMemory();
  186. }
  187. /* Overallocate on the heap, and give the excess back at the end. */
  188. nallocated = size * max_char_size;
  189. result = PyBytes_FromStringAndSize(NULL, nallocated);
  190. if (result == NULL)
  191. return NULL;
  192. p = PyBytes_AS_STRING(result);
  193. }
  194. for (i = 0; i < size;) {
  195. Py_UCS4 ch = data[i++];
  196. if (ch < 0x80) {
  197. /* Encode ASCII */
  198. *p++ = (char) ch;
  199. }
  200. else
  201. #if STRINGLIB_SIZEOF_CHAR > 1
  202. if (ch < 0x0800)
  203. #endif
  204. {
  205. /* Encode Latin-1 */
  206. *p++ = (char)(0xc0 | (ch >> 6));
  207. *p++ = (char)(0x80 | (ch & 0x3f));
  208. }
  209. #if STRINGLIB_SIZEOF_CHAR > 1
  210. else if (Py_UNICODE_IS_SURROGATE(ch)) {
  211. Py_ssize_t newpos;
  212. Py_ssize_t repsize, k, startpos;
  213. startpos = i-1;
  214. rep = unicode_encode_call_errorhandler(
  215. errors, &errorHandler, "utf-8", "surrogates not allowed",
  216. unicode, &exc, startpos, startpos+1, &newpos);
  217. if (!rep)
  218. goto error;
  219. if (PyBytes_Check(rep))
  220. repsize = PyBytes_GET_SIZE(rep);
  221. else
  222. repsize = PyUnicode_GET_LENGTH(rep);
  223. if (repsize > max_char_size) {
  224. Py_ssize_t offset;
  225. if (result == NULL)
  226. offset = p - stackbuf;
  227. else
  228. offset = p - PyBytes_AS_STRING(result);
  229. if (nallocated > PY_SSIZE_T_MAX - repsize + max_char_size) {
  230. /* integer overflow */
  231. PyErr_NoMemory();
  232. goto error;
  233. }
  234. nallocated += repsize - max_char_size;
  235. if (result != NULL) {
  236. if (_PyBytes_Resize(&result, nallocated) < 0)
  237. goto error;
  238. } else {
  239. result = PyBytes_FromStringAndSize(NULL, nallocated);
  240. if (result == NULL)
  241. goto error;
  242. Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
  243. }
  244. p = PyBytes_AS_STRING(result) + offset;
  245. }
  246. if (PyBytes_Check(rep)) {
  247. char *prep = PyBytes_AS_STRING(rep);
  248. for(k = repsize; k > 0; k--)
  249. *p++ = *prep++;
  250. } else /* rep is unicode */ {
  251. enum PyUnicode_Kind repkind;
  252. void *repdata;
  253. if (PyUnicode_READY(rep) < 0)
  254. goto error;
  255. repkind = PyUnicode_KIND(rep);
  256. repdata = PyUnicode_DATA(rep);
  257. for(k=0; k<repsize; k++) {
  258. Py_UCS4 c = PyUnicode_READ(repkind, repdata, k);
  259. if (0x80 <= c) {
  260. raise_encode_exception(&exc, "utf-8",
  261. unicode,
  262. i-1, i,
  263. "surrogates not allowed");
  264. goto error;
  265. }
  266. *p++ = (char)c;
  267. }
  268. }
  269. Py_CLEAR(rep);
  270. }
  271. else
  272. #if STRINGLIB_SIZEOF_CHAR > 2
  273. if (ch < 0x10000)
  274. #endif
  275. {
  276. *p++ = (char)(0xe0 | (ch >> 12));
  277. *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
  278. *p++ = (char)(0x80 | (ch & 0x3f));
  279. }
  280. #if STRINGLIB_SIZEOF_CHAR > 2
  281. else /* ch >= 0x10000 */
  282. {
  283. assert(ch <= MAX_UNICODE);
  284. /* Encode UCS4 Unicode ordinals */
  285. *p++ = (char)(0xf0 | (ch >> 18));
  286. *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
  287. *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
  288. *p++ = (char)(0x80 | (ch & 0x3f));
  289. }
  290. #endif /* STRINGLIB_SIZEOF_CHAR > 2 */
  291. #endif /* STRINGLIB_SIZEOF_CHAR > 1 */
  292. }
  293. if (result == NULL) {
  294. /* This was stack allocated. */
  295. nneeded = p - stackbuf;
  296. assert(nneeded <= nallocated);
  297. result = PyBytes_FromStringAndSize(stackbuf, nneeded);
  298. }
  299. else {
  300. /* Cut back to size actually needed. */
  301. nneeded = p - PyBytes_AS_STRING(result);
  302. assert(nneeded <= nallocated);
  303. _PyBytes_Resize(&result, nneeded);
  304. }
  305. #if STRINGLIB_SIZEOF_CHAR > 1
  306. Py_XDECREF(errorHandler);
  307. Py_XDECREF(exc);
  308. #endif
  309. return result;
  310. #if STRINGLIB_SIZEOF_CHAR > 1
  311. error:
  312. Py_XDECREF(rep);
  313. Py_XDECREF(errorHandler);
  314. Py_XDECREF(exc);
  315. Py_XDECREF(result);
  316. return NULL;
  317. #endif
  318. #undef MAX_SHORT_UNICHARS
  319. }
  320. #endif /* STRINGLIB_IS_UNICODE */