You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

731 lines
19 KiB

13 years ago
  1. #include "Python.h"
  2. #include "osdefs.h"
  3. #ifdef MS_WINDOWS
  4. # include <windows.h>
  5. #endif
  6. #ifdef HAVE_LANGINFO_H
  7. #include <locale.h>
  8. #include <langinfo.h>
  9. #endif
  10. #ifdef __APPLE__
  11. extern wchar_t* _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size);
  12. #endif
  13. PyObject *
  14. _Py_device_encoding(int fd)
  15. {
  16. #if defined(MS_WINDOWS)
  17. UINT cp;
  18. #endif
  19. if (!_PyVerify_fd(fd) || !isatty(fd)) {
  20. Py_RETURN_NONE;
  21. }
  22. #if defined(MS_WINDOWS)
  23. if (fd == 0)
  24. cp = GetConsoleCP();
  25. else if (fd == 1 || fd == 2)
  26. cp = GetConsoleOutputCP();
  27. else
  28. cp = 0;
  29. /* GetConsoleCP() and GetConsoleOutputCP() return 0 if the application
  30. has no console */
  31. if (cp != 0)
  32. return PyUnicode_FromFormat("cp%u", (unsigned int)cp);
  33. #elif defined(CODESET)
  34. {
  35. char *codeset = nl_langinfo(CODESET);
  36. if (codeset != NULL && codeset[0] != 0)
  37. return PyUnicode_FromString(codeset);
  38. }
  39. #endif
  40. Py_RETURN_NONE;
  41. }
  42. #if !defined(__APPLE__) && !defined(MS_WINDOWS)
  43. extern int _Py_normalize_encoding(const char *, char *, size_t);
  44. /* Workaround FreeBSD and OpenIndiana locale encoding issue with the C locale.
  45. On these operating systems, nl_langinfo(CODESET) announces an alias of the
  46. ASCII encoding, whereas mbstowcs() and wcstombs() functions use the
  47. ISO-8859-1 encoding. The problem is that os.fsencode() and os.fsdecode() use
  48. locale.getpreferredencoding() codec. For example, if command line arguments
  49. are decoded by mbstowcs() and encoded back by os.fsencode(), we get a
  50. UnicodeEncodeError instead of retrieving the original byte string.
  51. The workaround is enabled if setlocale(LC_CTYPE, NULL) returns "C",
  52. nl_langinfo(CODESET) announces "ascii" (or an alias to ASCII), and at least
  53. one byte in range 0x80-0xff can be decoded from the locale encoding. The
  54. workaround is also enabled on error, for example if getting the locale
  55. failed.
  56. Values of force_ascii:
  57. 1: the workaround is used: _Py_wchar2char() uses
  58. encode_ascii_surrogateescape() and _Py_char2wchar() uses
  59. decode_ascii_surrogateescape()
  60. 0: the workaround is not used: _Py_wchar2char() uses wcstombs() and
  61. _Py_char2wchar() uses mbstowcs()
  62. -1: unknown, need to call check_force_ascii() to get the value
  63. */
  64. static int force_ascii = -1;
  65. static int
  66. check_force_ascii(void)
  67. {
  68. char *loc;
  69. #if defined(HAVE_LANGINFO_H) && defined(CODESET)
  70. char *codeset, **alias;
  71. char encoding[100];
  72. int is_ascii;
  73. unsigned int i;
  74. char* ascii_aliases[] = {
  75. "ascii",
  76. "646",
  77. "ansi-x3.4-1968",
  78. "ansi-x3-4-1968",
  79. "ansi-x3.4-1986",
  80. "cp367",
  81. "csascii",
  82. "ibm367",
  83. "iso646-us",
  84. "iso-646.irv-1991",
  85. "iso-ir-6",
  86. "us",
  87. "us-ascii",
  88. NULL
  89. };
  90. #endif
  91. loc = setlocale(LC_CTYPE, NULL);
  92. if (loc == NULL)
  93. goto error;
  94. if (strcmp(loc, "C") != 0) {
  95. /* the LC_CTYPE locale is different than C */
  96. return 0;
  97. }
  98. #if defined(HAVE_LANGINFO_H) && defined(CODESET)
  99. codeset = nl_langinfo(CODESET);
  100. if (!codeset || codeset[0] == '\0') {
  101. /* CODESET is not set or empty */
  102. goto error;
  103. }
  104. if (!_Py_normalize_encoding(codeset, encoding, sizeof(encoding)))
  105. goto error;
  106. is_ascii = 0;
  107. for (alias=ascii_aliases; *alias != NULL; alias++) {
  108. if (strcmp(encoding, *alias) == 0) {
  109. is_ascii = 1;
  110. break;
  111. }
  112. }
  113. if (!is_ascii) {
  114. /* nl_langinfo(CODESET) is not "ascii" or an alias of ASCII */
  115. return 0;
  116. }
  117. for (i=0x80; i<0xff; i++) {
  118. unsigned char ch;
  119. wchar_t wch;
  120. size_t res;
  121. ch = (unsigned char)i;
  122. res = mbstowcs(&wch, (char*)&ch, 1);
  123. if (res != (size_t)-1) {
  124. /* decoding a non-ASCII character from the locale encoding succeed:
  125. the locale encoding is not ASCII, force ASCII */
  126. return 1;
  127. }
  128. }
  129. /* None of the bytes in the range 0x80-0xff can be decoded from the locale
  130. encoding: the locale encoding is really ASCII */
  131. return 0;
  132. #else
  133. /* nl_langinfo(CODESET) is not available: always force ASCII */
  134. return 1;
  135. #endif
  136. error:
  137. /* if an error occured, force the ASCII encoding */
  138. return 1;
  139. }
  140. static char*
  141. encode_ascii_surrogateescape(const wchar_t *text, size_t *error_pos)
  142. {
  143. char *result = NULL, *out;
  144. size_t len, i;
  145. wchar_t ch;
  146. if (error_pos != NULL)
  147. *error_pos = (size_t)-1;
  148. len = wcslen(text);
  149. result = PyMem_Malloc(len + 1); /* +1 for NUL byte */
  150. if (result == NULL)
  151. return NULL;
  152. out = result;
  153. for (i=0; i<len; i++) {
  154. ch = text[i];
  155. if (ch <= 0x7f) {
  156. /* ASCII character */
  157. *out++ = (char)ch;
  158. }
  159. else if (0xdc80 <= ch && ch <= 0xdcff) {
  160. /* UTF-8b surrogate */
  161. *out++ = (char)(ch - 0xdc00);
  162. }
  163. else {
  164. if (error_pos != NULL)
  165. *error_pos = i;
  166. PyMem_Free(result);
  167. return NULL;
  168. }
  169. }
  170. *out = '\0';
  171. return result;
  172. }
  173. #endif /* !defined(__APPLE__) && !defined(MS_WINDOWS) */
  174. #if !defined(__APPLE__) && (!defined(MS_WINDOWS) || !defined(HAVE_MBRTOWC))
  175. static wchar_t*
  176. decode_ascii_surrogateescape(const char *arg, size_t *size)
  177. {
  178. wchar_t *res;
  179. unsigned char *in;
  180. wchar_t *out;
  181. res = PyMem_RawMalloc((strlen(arg)+1)*sizeof(wchar_t));
  182. if (!res)
  183. return NULL;
  184. in = (unsigned char*)arg;
  185. out = res;
  186. while(*in)
  187. if(*in < 128)
  188. *out++ = *in++;
  189. else
  190. *out++ = 0xdc00 + *in++;
  191. *out = 0;
  192. if (size != NULL)
  193. *size = out - res;
  194. return res;
  195. }
  196. #endif
/* Decode a byte string from the locale encoding with the
   surrogateescape error handler (undecodable bytes are decoded as characters
   in range U+DC80..U+DCFF). If a byte sequence can be decoded as a surrogate
   character, escape the bytes using the surrogateescape error handler instead
   of decoding them.

   Use _Py_wchar2char() to encode the character string back to a byte string.

   Return a pointer to a newly allocated wide character string (use
   PyMem_RawFree() to free the memory) and write the number of written wide
   characters excluding the null character into *size if size is not NULL, or
   NULL on error (decoding or memory allocation error). If size is not NULL,
   *size is set to (size_t)-1 on memory error and (size_t)-2 on decoding
   error. On __APPLE__, any failure of the UTF-8 decoder is reported as
   (size_t)-1.

   Conversion errors should never happen, unless there is a bug in the C
   library. */
wchar_t*
_Py_char2wchar(const char* arg, size_t *size)
{
#ifdef __APPLE__
    /* Delegate to _Py_DecodeUTF8_surrogateescape() instead of using the
       C library conversion functions. */
    wchar_t *wstr;
    wstr = _Py_DecodeUTF8_surrogateescape(arg, strlen(arg));
    if (size != NULL) {
        if (wstr != NULL)
            *size = wcslen(wstr);
        else
            *size = (size_t)-1;
    }
    return wstr;
#else
    wchar_t *res;
    size_t argsize;
    size_t count;
#ifdef HAVE_MBRTOWC
    unsigned char *in;
    wchar_t *out;
    mbstate_t mbs;
#endif
#ifndef MS_WINDOWS
    /* Compute force_ascii on first use (-1 means "not computed yet"). */
    if (force_ascii == -1)
        force_ascii = check_force_ascii();
    if (force_ascii) {
        /* force ASCII encoding to workaround mbstowcs() issue */
        res = decode_ascii_surrogateescape(arg, size);
        if (res == NULL)
            goto oom;
        return res;
    }
#endif
    /* First attempt: convert the whole string at once with mbstowcs(). */
#ifdef HAVE_BROKEN_MBSTOWCS
    /* Some platforms have a broken implementation of
     * mbstowcs which does not count the characters that
     * would result from conversion. Use an upper bound.
     */
    argsize = strlen(arg);
#else
    argsize = mbstowcs(NULL, arg, 0);
#endif
    if (argsize != (size_t)-1) {
        res = (wchar_t *)PyMem_RawMalloc((argsize+1)*sizeof(wchar_t));
        if (!res)
            goto oom;
        count = mbstowcs(res, arg, argsize+1);
        if (count != (size_t)-1) {
            wchar_t *tmp;
            /* Only use the result if it contains no
               surrogate characters. */
            for (tmp = res; *tmp != 0 &&
                         !Py_UNICODE_IS_SURROGATE(*tmp); tmp++)
                ;
            if (*tmp == 0) {
                /* the scan reached the terminator: no surrogate found */
                if (size != NULL)
                    *size = count;
                return res;
            }
        }
        /* mbstowcs() failed or produced a surrogate: discard this buffer and
           use the escaping fallback below. */
        PyMem_RawFree(res);
    }
    /* Conversion failed. Fall back to escaping with surrogateescape. */
#ifdef HAVE_MBRTOWC
    /* Try conversion with mbrtowc (C99), and escape non-decodable bytes. */
    /* Overallocate; as multi-byte characters are in the argument, the
       actual output could use less memory. */
    argsize = strlen(arg) + 1;
    res = (wchar_t*)PyMem_RawMalloc(argsize*sizeof(wchar_t));
    if (!res)
        goto oom;
    in = (unsigned char*)arg;
    out = res;
    memset(&mbs, 0, sizeof mbs);
    /* From here on, argsize is the number of input bytes left, including the
       terminating null byte. */
    while (argsize) {
        size_t converted = mbrtowc(out, (char*)in, argsize, &mbs);
        if (converted == 0)
            /* Reached end of string; null char stored. */
            break;
        if (converted == (size_t)-2) {
            /* Incomplete character. This should never happen,
               since we provide everything that we have -
               unless there is a bug in the C library, or I
               misunderstood how mbrtowc works. */
            PyMem_RawFree(res);
            if (size != NULL)
                *size = (size_t)-2;
            return NULL;
        }
        if (converted == (size_t)-1) {
            /* Conversion error. Escape as UTF-8b, and start over
               in the initial shift state. */
            *out++ = 0xdc00 + *in++;
            argsize--;
            memset(&mbs, 0, sizeof mbs);
            continue;
        }
        if (Py_UNICODE_IS_SURROGATE(*out)) {
            /* Surrogate character. Escape the original
               byte sequence with surrogateescape. */
            argsize -= converted;
            while (converted--)
                *out++ = 0xdc00 + *in++;
            continue;
        }
        /* successfully converted some bytes */
        in += converted;
        argsize -= converted;
        out++;
    }
    /* number of wide characters written, excluding the null character */
    if (size != NULL)
        *size = out - res;
#else /* HAVE_MBRTOWC */
    /* Cannot use C locale for escaping; manually escape as if charset
       is ASCII (i.e. escape all bytes >= 128). This will still roundtrip
       correctly in the locale's charset, which must be an ASCII superset. */
    res = decode_ascii_surrogateescape(arg, size);
    if (res == NULL)
        goto oom;
#endif /* HAVE_MBRTOWC */
    return res;
oom:
    /* memory allocation failure */
    if (size != NULL)
        *size = (size_t)-1;
    return NULL;
#endif /* __APPLE__ */
}
  338. /* Encode a (wide) character string to the locale encoding with the
  339. surrogateescape error handler (characters in range U+DC80..U+DCFF are
  340. converted to bytes 0x80..0xFF).
  341. This function is the reverse of _Py_char2wchar().
  342. Return a pointer to a newly allocated byte string (use PyMem_Free() to free
  343. the memory), or NULL on encoding or memory allocation error.
  344. If error_pos is not NULL: *error_pos is the index of the invalid character
  345. on encoding error, or (size_t)-1 otherwise. */
  346. char*
  347. _Py_wchar2char(const wchar_t *text, size_t *error_pos)
  348. {
  349. #ifdef __APPLE__
  350. Py_ssize_t len;
  351. PyObject *unicode, *bytes = NULL;
  352. char *cpath;
  353. unicode = PyUnicode_FromWideChar(text, wcslen(text));
  354. if (unicode == NULL)
  355. return NULL;
  356. bytes = _PyUnicode_AsUTF8String(unicode, "surrogateescape");
  357. Py_DECREF(unicode);
  358. if (bytes == NULL) {
  359. PyErr_Clear();
  360. if (error_pos != NULL)
  361. *error_pos = (size_t)-1;
  362. return NULL;
  363. }
  364. len = PyBytes_GET_SIZE(bytes);
  365. cpath = PyMem_Malloc(len+1);
  366. if (cpath == NULL) {
  367. PyErr_Clear();
  368. Py_DECREF(bytes);
  369. if (error_pos != NULL)
  370. *error_pos = (size_t)-1;
  371. return NULL;
  372. }
  373. memcpy(cpath, PyBytes_AsString(bytes), len + 1);
  374. Py_DECREF(bytes);
  375. return cpath;
  376. #else /* __APPLE__ */
  377. const size_t len = wcslen(text);
  378. char *result = NULL, *bytes = NULL;
  379. size_t i, size, converted;
  380. wchar_t c, buf[2];
  381. #ifndef MS_WINDOWS
  382. if (force_ascii == -1)
  383. force_ascii = check_force_ascii();
  384. if (force_ascii)
  385. return encode_ascii_surrogateescape(text, error_pos);
  386. #endif
  387. /* The function works in two steps:
  388. 1. compute the length of the output buffer in bytes (size)
  389. 2. outputs the bytes */
  390. size = 0;
  391. buf[1] = 0;
  392. while (1) {
  393. for (i=0; i < len; i++) {
  394. c = text[i];
  395. if (c >= 0xdc80 && c <= 0xdcff) {
  396. /* UTF-8b surrogate */
  397. if (bytes != NULL) {
  398. *bytes++ = c - 0xdc00;
  399. size--;
  400. }
  401. else
  402. size++;
  403. continue;
  404. }
  405. else {
  406. buf[0] = c;
  407. if (bytes != NULL)
  408. converted = wcstombs(bytes, buf, size);
  409. else
  410. converted = wcstombs(NULL, buf, 0);
  411. if (converted == (size_t)-1) {
  412. if (result != NULL)
  413. PyMem_Free(result);
  414. if (error_pos != NULL)
  415. *error_pos = i;
  416. return NULL;
  417. }
  418. if (bytes != NULL) {
  419. bytes += converted;
  420. size -= converted;
  421. }
  422. else
  423. size += converted;
  424. }
  425. }
  426. if (result != NULL) {
  427. *bytes = '\0';
  428. break;
  429. }
  430. size += 1; /* nul byte at the end */
  431. result = PyMem_Malloc(size);
  432. if (result == NULL) {
  433. if (error_pos != NULL)
  434. *error_pos = (size_t)-1;
  435. return NULL;
  436. }
  437. bytes = result;
  438. }
  439. return result;
  440. #endif /* __APPLE__ */
  441. }
  442. /* In principle, this should use HAVE__WSTAT, and _wstat
  443. should be detected by autoconf. However, no current
  444. POSIX system provides that function, so testing for
  445. it is pointless.
  446. Not sure whether the MS_WINDOWS guards are necessary:
  447. perhaps for cygwin/mingw builds?
  448. */
  449. #if defined(HAVE_STAT) && !defined(MS_WINDOWS)
  450. /* Get file status. Encode the path to the locale encoding. */
  451. int
  452. _Py_wstat(const wchar_t* path, struct stat *buf)
  453. {
  454. int err;
  455. char *fname;
  456. fname = _Py_wchar2char(path, NULL);
  457. if (fname == NULL) {
  458. errno = EINVAL;
  459. return -1;
  460. }
  461. err = stat(fname, buf);
  462. PyMem_Free(fname);
  463. return err;
  464. }
  465. #endif
  466. #ifdef HAVE_STAT
  467. /* Call _wstat() on Windows, or encode the path to the filesystem encoding and
  468. call stat() otherwise. Only fill st_mode attribute on Windows.
  469. Return 0 on success, -1 on _wstat() / stat() error, -2 if an exception was
  470. raised. */
  471. int
  472. _Py_stat(PyObject *path, struct stat *statbuf)
  473. {
  474. #ifdef MS_WINDOWS
  475. int err;
  476. struct _stat wstatbuf;
  477. wchar_t *wpath;
  478. wpath = PyUnicode_AsUnicode(path);
  479. if (wpath == NULL)
  480. return -2;
  481. err = _wstat(wpath, &wstatbuf);
  482. if (!err)
  483. statbuf->st_mode = wstatbuf.st_mode;
  484. return err;
  485. #else
  486. int ret;
  487. PyObject *bytes = PyUnicode_EncodeFSDefault(path);
  488. if (bytes == NULL)
  489. return -2;
  490. ret = stat(PyBytes_AS_STRING(bytes), statbuf);
  491. Py_DECREF(bytes);
  492. return ret;
  493. #endif
  494. }
  495. #endif
  496. /* Open a file. Use _wfopen() on Windows, encode the path to the locale
  497. encoding and use fopen() otherwise. */
  498. FILE *
  499. _Py_wfopen(const wchar_t *path, const wchar_t *mode)
  500. {
  501. #ifndef MS_WINDOWS
  502. FILE *f;
  503. char *cpath;
  504. char cmode[10];
  505. size_t r;
  506. r = wcstombs(cmode, mode, 10);
  507. if (r == (size_t)-1 || r >= 10) {
  508. errno = EINVAL;
  509. return NULL;
  510. }
  511. cpath = _Py_wchar2char(path, NULL);
  512. if (cpath == NULL)
  513. return NULL;
  514. f = fopen(cpath, cmode);
  515. PyMem_Free(cpath);
  516. return f;
  517. #else
  518. return _wfopen(path, mode);
  519. #endif
  520. }
  521. /* Call _wfopen() on Windows, or encode the path to the filesystem encoding and
  522. call fopen() otherwise.
  523. Return the new file object on success, or NULL if the file cannot be open or
  524. (if PyErr_Occurred()) on unicode error */
  525. FILE*
  526. _Py_fopen(PyObject *path, const char *mode)
  527. {
  528. #ifdef MS_WINDOWS
  529. wchar_t *wpath;
  530. wchar_t wmode[10];
  531. int usize;
  532. if (!PyUnicode_Check(path)) {
  533. PyErr_Format(PyExc_TypeError,
  534. "str file path expected under Windows, got %R",
  535. Py_TYPE(path));
  536. return NULL;
  537. }
  538. wpath = PyUnicode_AsUnicode(path);
  539. if (wpath == NULL)
  540. return NULL;
  541. usize = MultiByteToWideChar(CP_ACP, 0, mode, -1, wmode, sizeof(wmode));
  542. if (usize == 0)
  543. return NULL;
  544. return _wfopen(wpath, wmode);
  545. #else
  546. FILE *f;
  547. PyObject *bytes;
  548. if (!PyUnicode_FSConverter(path, &bytes))
  549. return NULL;
  550. f = fopen(PyBytes_AS_STRING(bytes), mode);
  551. Py_DECREF(bytes);
  552. return f;
  553. #endif
  554. }
  555. #ifdef HAVE_READLINK
  556. /* Read value of symbolic link. Encode the path to the locale encoding, decode
  557. the result from the locale encoding. Return -1 on error. */
  558. int
  559. _Py_wreadlink(const wchar_t *path, wchar_t *buf, size_t bufsiz)
  560. {
  561. char *cpath;
  562. char cbuf[PATH_MAX];
  563. wchar_t *wbuf;
  564. int res;
  565. size_t r1;
  566. cpath = _Py_wchar2char(path, NULL);
  567. if (cpath == NULL) {
  568. errno = EINVAL;
  569. return -1;
  570. }
  571. res = (int)readlink(cpath, cbuf, PATH_MAX);
  572. PyMem_Free(cpath);
  573. if (res == -1)
  574. return -1;
  575. if (res == PATH_MAX) {
  576. errno = EINVAL;
  577. return -1;
  578. }
  579. cbuf[res] = '\0'; /* buf will be null terminated */
  580. wbuf = _Py_char2wchar(cbuf, &r1);
  581. if (wbuf == NULL) {
  582. errno = EINVAL;
  583. return -1;
  584. }
  585. if (bufsiz <= r1) {
  586. PyMem_RawFree(wbuf);
  587. errno = EINVAL;
  588. return -1;
  589. }
  590. wcsncpy(buf, wbuf, bufsiz);
  591. PyMem_RawFree(wbuf);
  592. return (int)r1;
  593. }
  594. #endif
  595. #ifdef HAVE_REALPATH
  596. /* Return the canonicalized absolute pathname. Encode path to the locale
  597. encoding, decode the result from the locale encoding.
  598. Return NULL on error. */
  599. wchar_t*
  600. _Py_wrealpath(const wchar_t *path,
  601. wchar_t *resolved_path, size_t resolved_path_size)
  602. {
  603. char *cpath;
  604. char cresolved_path[PATH_MAX];
  605. wchar_t *wresolved_path;
  606. char *res;
  607. size_t r;
  608. cpath = _Py_wchar2char(path, NULL);
  609. if (cpath == NULL) {
  610. errno = EINVAL;
  611. return NULL;
  612. }
  613. res = realpath(cpath, cresolved_path);
  614. PyMem_Free(cpath);
  615. if (res == NULL)
  616. return NULL;
  617. wresolved_path = _Py_char2wchar(cresolved_path, &r);
  618. if (wresolved_path == NULL) {
  619. errno = EINVAL;
  620. return NULL;
  621. }
  622. if (resolved_path_size <= r) {
  623. PyMem_RawFree(wresolved_path);
  624. errno = EINVAL;
  625. return NULL;
  626. }
  627. wcsncpy(resolved_path, wresolved_path, resolved_path_size);
  628. PyMem_RawFree(wresolved_path);
  629. return resolved_path;
  630. }
  631. #endif
  632. /* Get the current directory. size is the buffer size in wide characters
  633. including the null character. Decode the path from the locale encoding.
  634. Return NULL on error. */
  635. wchar_t*
  636. _Py_wgetcwd(wchar_t *buf, size_t size)
  637. {
  638. #ifdef MS_WINDOWS
  639. int isize = (int)Py_MIN(size, INT_MAX);
  640. return _wgetcwd(buf, isize);
  641. #else
  642. char fname[PATH_MAX];
  643. wchar_t *wname;
  644. size_t len;
  645. if (getcwd(fname, PATH_MAX) == NULL)
  646. return NULL;
  647. wname = _Py_char2wchar(fname, &len);
  648. if (wname == NULL)
  649. return NULL;
  650. if (size <= len) {
  651. PyMem_RawFree(wname);
  652. return NULL;
  653. }
  654. wcsncpy(buf, wname, size);
  655. PyMem_RawFree(wname);
  656. return buf;
  657. #endif
  658. }