You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

680 lines
18 KiB

  1. #include "Python.h"
  2. #ifdef MS_WINDOWS
  3. # include <windows.h>
  4. #endif
  5. #ifdef HAVE_LANGINFO_H
  6. #include <locale.h>
  7. #include <langinfo.h>
  8. #endif
  9. #ifdef __APPLE__
  10. extern wchar_t* _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size);
  11. #endif
  12. #if !defined(__APPLE__) && !defined(MS_WINDOWS)
  13. extern int _Py_normalize_encoding(const char *, char *, size_t);
  14. /* Workaround FreeBSD and OpenIndiana locale encoding issue with the C locale.
  15. On these operating systems, nl_langinfo(CODESET) announces an alias of the
  16. ASCII encoding, whereas mbstowcs() and wcstombs() functions use the
  17. ISO-8859-1 encoding. The problem is that os.fsencode() and os.fsdecode() use
  18. locale.getpreferredencoding() codec. For example, if command line arguments
  19. are decoded by mbstowcs() and encoded back by os.fsencode(), we get a
  20. UnicodeEncodeError instead of retrieving the original byte string.
  21. The workaround is enabled if setlocale(LC_CTYPE, NULL) returns "C",
  22. nl_langinfo(CODESET) announces "ascii" (or an alias to ASCII), and at least
  23. one byte in range 0x80-0xff can be decoded from the locale encoding. The
  24. workaround is also enabled on error, for example if getting the locale
  25. failed.
  26. Values of force_ascii:
  27. 1: the workaround is used: _Py_wchar2char() uses
  28. encode_ascii_surrogateescape() and _Py_char2wchar() uses
  29. decode_ascii_surrogateescape()
  30. 0: the workaround is not used: _Py_wchar2char() uses wcstombs() and
  31. _Py_char2wchar() uses mbstowcs()
  32. -1: unknown, need to call check_force_ascii() to get the value
  33. */
  static int force_ascii = -1;

  /* Decide whether the ASCII workaround has to be used.

     Return 0 if the LC_CTYPE locale is not "C", if nl_langinfo(CODESET) does
     not announce ASCII (or one of its aliases), or if no byte in the range
     0x80-0xff can be decoded by mbstowcs() (the locale really is ASCII).

     Return 1 if at least one byte in the range 0x80-0xff can be decoded by
     mbstowcs() although the announced codeset is ASCII, if nl_langinfo(CODESET)
     is not available at build time, or if any query fails.

     The result is not cached here: callers store it in force_ascii. */
  static int
  check_force_ascii(void)
  {
      char *loc;
  #if defined(HAVE_LANGINFO_H) && defined(CODESET)
      char *codeset;
      const char * const *alias;
      char encoding[100];
      int is_ascii;
      unsigned int i;
      /* Normalized names under which the ASCII codeset may be announced. */
      static const char * const ascii_aliases[] = {
          "ascii",
          "646",
          "ansi-x3.4-1968",
          "ansi-x3-4-1968",
          "ansi-x3.4-1986",
          "cp367",
          "csascii",
          "ibm367",
          "iso646-us",
          "iso-646.irv-1991",
          "iso-ir-6",
          "us",
          "us-ascii",
          NULL
      };
  #endif

      loc = setlocale(LC_CTYPE, NULL);
      if (loc == NULL)
          goto error;
      if (strcmp(loc, "C") != 0) {
          /* the LC_CTYPE locale is different than C */
          return 0;
      }

  #if defined(HAVE_LANGINFO_H) && defined(CODESET)
      codeset = nl_langinfo(CODESET);
      if (!codeset || codeset[0] == '\0') {
          /* CODESET is not set or empty */
          goto error;
      }
      if (!_Py_normalize_encoding(codeset, encoding, sizeof(encoding)))
          goto error;

      is_ascii = 0;
      for (alias = ascii_aliases; *alias != NULL; alias++) {
          if (strcmp(encoding, *alias) == 0) {
              is_ascii = 1;
              break;
          }
      }
      if (!is_ascii) {
          /* nl_langinfo(CODESET) is not "ascii" or an alias of ASCII */
          return 0;
      }

      /* Probe every non-ASCII byte, 0xff included. */
      for (i = 0x80; i <= 0xff; i++) {
          /* mbstowcs() expects a null-terminated string: keep an explicit
             terminator after the probed byte so that it can never read
             past the buffer. */
          char ch[2];
          wchar_t wch;
          size_t res;

          ch[0] = (char)(unsigned char)i;
          ch[1] = '\0';
          res = mbstowcs(&wch, ch, 1);
          if (res != (size_t)-1) {
              /* decoding a non-ASCII byte from the locale encoding
                 succeeded: the locale encoding is not ASCII, force ASCII */
              return 1;
          }
      }
      /* None of the bytes in the range 0x80-0xff can be decoded from the
         locale encoding: the locale encoding is really ASCII */
      return 0;
  #else
      /* nl_langinfo(CODESET) is not available: always force ASCII */
      return 1;
  #endif

  error:
      /* if an error occurred, force the ASCII encoding */
      return 1;
  }
  /* Encode a wide character string to bytes, accepting only ASCII characters
     and the escape range U+DC80..U+DCFF (stored as bytes 0x80..0xff).

     Return a null-terminated buffer allocated with PyMem_Malloc(), or NULL if
     the allocation fails or a character cannot be encoded.

     If error_pos is not NULL, *error_pos is first set to (size_t)-1 and is
     overwritten with the index of the offending character when a character
     cannot be encoded. */
  static char*
  encode_ascii_surrogateescape(const wchar_t *text, size_t *error_pos)
  {
      size_t length, pos;
      char *buffer, *dst;

      if (error_pos != NULL)
          *error_pos = (size_t)-1;

      length = wcslen(text);
      buffer = PyMem_Malloc(length + 1);  /* +1 for NUL byte */
      if (buffer == NULL)
          return NULL;

      dst = buffer;
      for (pos = 0; pos < length; pos++) {
          const wchar_t wc = text[pos];

          if (wc <= 0x7f) {
              /* ASCII character */
              *dst++ = (char)wc;
              continue;
          }
          if (wc >= 0xdc80 && wc <= 0xdcff) {
              /* UTF-8b surrogate */
              *dst++ = (char)(wc - 0xdc00);
              continue;
          }

          /* Anything else cannot be represented: report where it is. */
          if (error_pos != NULL)
              *error_pos = pos;
          PyMem_Free(buffer);
          return NULL;
      }
      *dst = '\0';
      return buffer;
  }
  143. #endif /* !defined(__APPLE__) && !defined(MS_WINDOWS) */
  144. #if !defined(__APPLE__) && (!defined(MS_WINDOWS) || !defined(HAVE_MBRTOWC))
  /* Decode a null-terminated byte string as ASCII: bytes below 128 are copied
     unchanged, any other byte b is stored as 0xdc00 + b.

     Return a null-terminated wide character buffer allocated with
     PyMem_Malloc(), or NULL if the allocation fails. If size is not NULL,
     *size receives the number of wide characters written, excluding the
     terminator. */
  static wchar_t*
  decode_ascii_surrogateescape(const char *arg, size_t *size)
  {
      const unsigned char *src = (const unsigned char *)arg;
      wchar_t *buffer, *dst;

      /* One wide character per input byte, plus the terminator. */
      buffer = PyMem_Malloc((strlen(arg) + 1) * sizeof(wchar_t));
      if (buffer == NULL)
          return NULL;

      for (dst = buffer; *src != 0; src++, dst++) {
          if (*src < 128) {
              *dst = *src;
          }
          else {
              *dst = 0xdc00 + *src;
          }
      }
      *dst = 0;

      if (size != NULL)
          *size = dst - buffer;
      return buffer;
  }
  166. #endif
  167. /* Decode a byte string from the locale encoding with the
  168. surrogateescape error handler (undecodable bytes are decoded as characters
  169. in range U+DC80..U+DCFF). If a byte sequence can be decoded as a surrogate
  170. character, escape the bytes using the surrogateescape error handler instead
  171. of decoding them.
  172. Use _Py_wchar2char() to encode the character string back to a byte string.
  173. Return a pointer to a newly allocated wide character string (use
  174. PyMem_Free() to free the memory) and write the number of written wide
  175. characters excluding the null character into *size if size is not NULL, or
  176. NULL on error (conversion or memory allocation error).
  177. Conversion errors should never happen, unless there is a bug in the C
  178. library. */
  179. wchar_t*
  180. _Py_char2wchar(const char* arg, size_t *size)
  181. {
  182. #ifdef __APPLE__
  183. wchar_t *wstr;
  184. wstr = _Py_DecodeUTF8_surrogateescape(arg, strlen(arg));
  185. if (size != NULL) {
  186. if (wstr != NULL)
  187. *size = wcslen(wstr);
  188. else
  189. *size = (size_t)-1;
  190. }
  191. return wstr;
  192. #else
  193. wchar_t *res;
  194. size_t argsize;
  195. size_t count;
  196. unsigned char *in;
  197. wchar_t *out;
  198. #ifdef HAVE_MBRTOWC
  199. mbstate_t mbs;
  200. #endif
  201. #ifndef MS_WINDOWS
  202. if (force_ascii == -1)
  203. force_ascii = check_force_ascii();
  204. if (force_ascii) {
  205. /* force ASCII encoding to workaround mbstowcs() issue */
  206. res = decode_ascii_surrogateescape(arg, size);
  207. if (res == NULL)
  208. goto oom;
  209. return res;
  210. }
  211. #endif
  212. #ifdef HAVE_BROKEN_MBSTOWCS
  213. /* Some platforms have a broken implementation of
  214. * mbstowcs which does not count the characters that
  215. * would result from conversion. Use an upper bound.
  216. */
  217. argsize = strlen(arg);
  218. #else
  219. argsize = mbstowcs(NULL, arg, 0);
  220. #endif
  221. if (argsize != (size_t)-1) {
  222. res = (wchar_t *)PyMem_Malloc((argsize+1)*sizeof(wchar_t));
  223. if (!res)
  224. goto oom;
  225. count = mbstowcs(res, arg, argsize+1);
  226. if (count != (size_t)-1) {
  227. wchar_t *tmp;
  228. /* Only use the result if it contains no
  229. surrogate characters. */
  230. for (tmp = res; *tmp != 0 &&
  231. (*tmp < 0xd800 || *tmp > 0xdfff); tmp++)
  232. ;
  233. if (*tmp == 0) {
  234. if (size != NULL)
  235. *size = count;
  236. return res;
  237. }
  238. }
  239. PyMem_Free(res);
  240. }
  241. /* Conversion failed. Fall back to escaping with surrogateescape. */
  242. #ifdef HAVE_MBRTOWC
  243. /* Try conversion with mbrtwoc (C99), and escape non-decodable bytes. */
  244. /* Overallocate; as multi-byte characters are in the argument, the
  245. actual output could use less memory. */
  246. argsize = strlen(arg) + 1;
  247. res = (wchar_t*)PyMem_Malloc(argsize*sizeof(wchar_t));
  248. if (!res)
  249. goto oom;
  250. in = (unsigned char*)arg;
  251. out = res;
  252. memset(&mbs, 0, sizeof mbs);
  253. while (argsize) {
  254. size_t converted = mbrtowc(out, (char*)in, argsize, &mbs);
  255. if (converted == 0)
  256. /* Reached end of string; null char stored. */
  257. break;
  258. if (converted == (size_t)-2) {
  259. /* Incomplete character. This should never happen,
  260. since we provide everything that we have -
  261. unless there is a bug in the C library, or I
  262. misunderstood how mbrtowc works. */
  263. fprintf(stderr, "unexpected mbrtowc result -2\n");
  264. PyMem_Free(res);
  265. return NULL;
  266. }
  267. if (converted == (size_t)-1) {
  268. /* Conversion error. Escape as UTF-8b, and start over
  269. in the initial shift state. */
  270. *out++ = 0xdc00 + *in++;
  271. argsize--;
  272. memset(&mbs, 0, sizeof mbs);
  273. continue;
  274. }
  275. if (*out >= 0xd800 && *out <= 0xdfff) {
  276. /* Surrogate character. Escape the original
  277. byte sequence with surrogateescape. */
  278. argsize -= converted;
  279. while (converted--)
  280. *out++ = 0xdc00 + *in++;
  281. continue;
  282. }
  283. /* successfully converted some bytes */
  284. in += converted;
  285. argsize -= converted;
  286. out++;
  287. }
  288. if (size != NULL)
  289. *size = out - res;
  290. #else /* HAVE_MBRTOWC */
  291. /* Cannot use C locale for escaping; manually escape as if charset
  292. is ASCII (i.e. escape all bytes > 128. This will still roundtrip
  293. correctly in the locale's charset, which must be an ASCII superset. */
  294. res = decode_ascii_surrogateescape(arg, size);
  295. if (res == NULL)
  296. goto oom;
  297. #endif /* HAVE_MBRTOWC */
  298. return res;
  299. oom:
  300. fprintf(stderr, "out of memory\n");
  301. return NULL;
  302. #endif /* __APPLE__ */
  303. }
  304. /* Encode a (wide) character string to the locale encoding with the
  305. surrogateescape error handler (characters in range U+DC80..U+DCFF are
  306. converted to bytes 0x80..0xFF).
  307. This function is the reverse of _Py_char2wchar().
  308. Return a pointer to a newly allocated byte string (use PyMem_Free() to free
  309. the memory), or NULL on conversion or memory allocation error.
  310. If error_pos is not NULL: *error_pos is the index of the invalid character
  311. on conversion error, or (size_t)-1 otherwise. */
  312. char*
  313. _Py_wchar2char(const wchar_t *text, size_t *error_pos)
  314. {
  315. #ifdef __APPLE__
  316. Py_ssize_t len;
  317. PyObject *unicode, *bytes = NULL;
  318. char *cpath;
  319. unicode = PyUnicode_FromWideChar(text, wcslen(text));
  320. if (unicode == NULL)
  321. return NULL;
  322. bytes = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
  323. PyUnicode_GET_SIZE(unicode),
  324. "surrogateescape");
  325. Py_DECREF(unicode);
  326. if (bytes == NULL) {
  327. PyErr_Clear();
  328. if (error_pos != NULL)
  329. *error_pos = (size_t)-1;
  330. return NULL;
  331. }
  332. len = PyBytes_GET_SIZE(bytes);
  333. cpath = PyMem_Malloc(len+1);
  334. if (cpath == NULL) {
  335. PyErr_Clear();
  336. Py_DECREF(bytes);
  337. if (error_pos != NULL)
  338. *error_pos = (size_t)-1;
  339. return NULL;
  340. }
  341. memcpy(cpath, PyBytes_AsString(bytes), len + 1);
  342. Py_DECREF(bytes);
  343. return cpath;
  344. #else /* __APPLE__ */
  345. const size_t len = wcslen(text);
  346. char *result = NULL, *bytes = NULL;
  347. size_t i, size, converted;
  348. wchar_t c, buf[2];
  349. #ifndef MS_WINDOWS
  350. if (force_ascii == -1)
  351. force_ascii = check_force_ascii();
  352. if (force_ascii)
  353. return encode_ascii_surrogateescape(text, error_pos);
  354. #endif
  355. /* The function works in two steps:
  356. 1. compute the length of the output buffer in bytes (size)
  357. 2. outputs the bytes */
  358. size = 0;
  359. buf[1] = 0;
  360. while (1) {
  361. for (i=0; i < len; i++) {
  362. c = text[i];
  363. if (c >= 0xdc80 && c <= 0xdcff) {
  364. /* UTF-8b surrogate */
  365. if (bytes != NULL) {
  366. *bytes++ = c - 0xdc00;
  367. size--;
  368. }
  369. else
  370. size++;
  371. continue;
  372. }
  373. else {
  374. buf[0] = c;
  375. if (bytes != NULL)
  376. converted = wcstombs(bytes, buf, size);
  377. else
  378. converted = wcstombs(NULL, buf, 0);
  379. if (converted == (size_t)-1) {
  380. if (result != NULL)
  381. PyMem_Free(result);
  382. if (error_pos != NULL)
  383. *error_pos = i;
  384. return NULL;
  385. }
  386. if (bytes != NULL) {
  387. bytes += converted;
  388. size -= converted;
  389. }
  390. else
  391. size += converted;
  392. }
  393. }
  394. if (result != NULL) {
  395. *bytes = '\0';
  396. break;
  397. }
  398. size += 1; /* nul byte at the end */
  399. result = PyMem_Malloc(size);
  400. if (result == NULL) {
  401. if (error_pos != NULL)
  402. *error_pos = (size_t)-1;
  403. return NULL;
  404. }
  405. bytes = result;
  406. }
  407. return result;
  408. #endif /* __APPLE__ */
  409. }
  410. /* In principle, this should use HAVE__WSTAT, and _wstat
  411. should be detected by autoconf. However, no current
  412. POSIX system provides that function, so testing for
  413. it is pointless.
  414. Not sure whether the MS_WINDOWS guards are necessary:
  415. perhaps for cygwin/mingw builds?
  416. */
  417. #if defined(HAVE_STAT) && !defined(MS_WINDOWS)
  /* Get file status: encode path with _Py_wchar2char() and call stat().

     Return the result of stat(), or -1 with errno set to EINVAL if the path
     cannot be encoded. */
  int
  _Py_wstat(const wchar_t* path, struct stat *buf)
  {
      int status;
      char *encoded = _Py_wchar2char(path, NULL);

      if (encoded == NULL) {
          errno = EINVAL;
          return -1;
      }

      status = stat(encoded, buf);
      PyMem_Free(encoded);
      return status;
  }
  433. #endif
  434. #ifdef HAVE_STAT
  435. /* Call _wstat() on Windows, or encode the path to the filesystem encoding and
  436. call stat() otherwise. Only fill st_mode attribute on Windows.
  437. Return 0 on success, -1 on _wstat() / stat() error or (if PyErr_Occurred())
  438. unicode error. */
  439. int
  440. _Py_stat(PyObject *path, struct stat *statbuf)
  441. {
  442. #ifdef MS_WINDOWS
  443. int err;
  444. struct _stat wstatbuf;
  445. err = _wstat(PyUnicode_AS_UNICODE(path), &wstatbuf);
  446. if (!err)
  447. statbuf->st_mode = wstatbuf.st_mode;
  448. return err;
  449. #else
  450. int ret;
  451. PyObject *bytes = PyUnicode_EncodeFSDefault(path);
  452. if (bytes == NULL)
  453. return -1;
  454. ret = stat(PyBytes_AS_STRING(bytes), statbuf);
  455. Py_DECREF(bytes);
  456. return ret;
  457. #endif
  458. }
  459. #endif
  /* Open a file. Use _wfopen() on Windows, encode the path to the locale
     encoding and use fopen() otherwise.

     Return the stream, or NULL on error. Outside Windows, errno is set to
     EINVAL when the mode cannot be converted or does not fit in the internal
     buffer, and also when the path cannot be encoded, like the other wide
     character wrappers of this file do. */
  FILE *
  _Py_wfopen(const wchar_t *path, const wchar_t *mode)
  {
  #ifndef MS_WINDOWS
      FILE *f;
      char *cpath;
      char cmode[10];
      size_t r;

      r = wcstombs(cmode, mode, sizeof(cmode));
      /* r == sizeof(cmode) means the buffer was filled without leaving room
         for the terminating null byte */
      if (r == (size_t)-1 || r >= sizeof(cmode)) {
          errno = EINVAL;
          return NULL;
      }

      cpath = _Py_wchar2char(path, NULL);
      if (cpath == NULL) {
          errno = EINVAL;
          return NULL;
      }

      f = fopen(cpath, cmode);
      PyMem_Free(cpath);
      return f;
  #else
      return _wfopen(path, mode);
  #endif
  }
  485. /* Call _wfopen() on Windows, or encode the path to the filesystem encoding and
  486. call fopen() otherwise.
  487. Return the new file object on success, or NULL if the file cannot be open or
  488. (if PyErr_Occurred()) on unicode error */
  489. FILE*
  490. _Py_fopen(PyObject *path, const char *mode)
  491. {
  492. #ifdef MS_WINDOWS
  493. wchar_t wmode[10];
  494. int usize;
  495. usize = MultiByteToWideChar(CP_ACP, 0, mode, -1, wmode, sizeof(wmode));
  496. if (usize == 0)
  497. return NULL;
  498. return _wfopen(PyUnicode_AS_UNICODE(path), wmode);
  499. #else
  500. FILE *f;
  501. PyObject *bytes = PyUnicode_EncodeFSDefault(path);
  502. if (bytes == NULL)
  503. return NULL;
  504. f = fopen(PyBytes_AS_STRING(bytes), mode);
  505. Py_DECREF(bytes);
  506. return f;
  507. #endif
  508. }
  509. #ifdef HAVE_READLINK
  510. /* Read value of symbolic link. Encode the path to the locale encoding, decode
  511. the result from the locale encoding. */
  512. int
  513. _Py_wreadlink(const wchar_t *path, wchar_t *buf, size_t bufsiz)
  514. {
  515. char *cpath;
  516. char cbuf[PATH_MAX];
  517. wchar_t *wbuf;
  518. int res;
  519. size_t r1;
  520. cpath = _Py_wchar2char(path, NULL);
  521. if (cpath == NULL) {
  522. errno = EINVAL;
  523. return -1;
  524. }
  525. res = (int)readlink(cpath, cbuf, PATH_MAX);
  526. PyMem_Free(cpath);
  527. if (res == -1)
  528. return -1;
  529. if (res == PATH_MAX) {
  530. errno = EINVAL;
  531. return -1;
  532. }
  533. cbuf[res] = '\0'; /* buf will be null terminated */
  534. wbuf = _Py_char2wchar(cbuf, &r1);
  535. if (wbuf == NULL) {
  536. errno = EINVAL;
  537. return -1;
  538. }
  539. if (bufsiz <= r1) {
  540. PyMem_Free(wbuf);
  541. errno = EINVAL;
  542. return -1;
  543. }
  544. wcsncpy(buf, wbuf, bufsiz);
  545. PyMem_Free(wbuf);
  546. return (int)r1;
  547. }
  548. #endif
  549. #ifdef HAVE_REALPATH
  550. /* Return the canonicalized absolute pathname. Encode path to the locale
  551. encoding, decode the result from the locale encoding. */
  552. wchar_t*
  553. _Py_wrealpath(const wchar_t *path,
  554. wchar_t *resolved_path, size_t resolved_path_size)
  555. {
  556. char *cpath;
  557. char cresolved_path[PATH_MAX];
  558. wchar_t *wresolved_path;
  559. char *res;
  560. size_t r;
  561. cpath = _Py_wchar2char(path, NULL);
  562. if (cpath == NULL) {
  563. errno = EINVAL;
  564. return NULL;
  565. }
  566. res = realpath(cpath, cresolved_path);
  567. PyMem_Free(cpath);
  568. if (res == NULL)
  569. return NULL;
  570. wresolved_path = _Py_char2wchar(cresolved_path, &r);
  571. if (wresolved_path == NULL) {
  572. errno = EINVAL;
  573. return NULL;
  574. }
  575. if (resolved_path_size <= r) {
  576. PyMem_Free(wresolved_path);
  577. errno = EINVAL;
  578. return NULL;
  579. }
  580. wcsncpy(resolved_path, wresolved_path, resolved_path_size);
  581. PyMem_Free(wresolved_path);
  582. return resolved_path;
  583. }
  584. #endif
  585. /* Get the current directory. size is the buffer size in wide characters
  586. including the null character. Decode the path from the locale encoding. */
  587. wchar_t*
  588. _Py_wgetcwd(wchar_t *buf, size_t size)
  589. {
  590. #ifdef MS_WINDOWS
  591. return _wgetcwd(buf, size);
  592. #else
  593. char fname[PATH_MAX];
  594. wchar_t *wname;
  595. size_t len;
  596. if (getcwd(fname, PATH_MAX) == NULL)
  597. return NULL;
  598. wname = _Py_char2wchar(fname, &len);
  599. if (wname == NULL)
  600. return NULL;
  601. if (size <= len) {
  602. PyMem_Free(wname);
  603. return NULL;
  604. }
  605. wcsncpy(buf, wname, size);
  606. PyMem_Free(wname);
  607. return buf;
  608. #endif
  609. }