You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

481 lines
13 KiB

  1. #include "Python.h"
  2. #ifdef MS_WINDOWS
  3. # include <windows.h>
  4. #endif
  5. #ifdef __APPLE__
  6. extern wchar_t* _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size);
  7. #endif
  8. #ifdef HAVE_STAT
  /* Decode a byte string from the locale encoding with the surrogateescape
     error handler: undecodable bytes are decoded as characters in range
     U+DC80..U+DCFF. If a byte sequence can be decoded as a surrogate
     character, the bytes are escaped using the surrogateescape error handler
     instead of being decoded.

     Use _Py_wchar2char() to encode the character string back to a byte string.

     Return a pointer to a newly allocated wide character string (use
     PyMem_Free() to free the memory). If size is not NULL, the number of wide
     characters written, excluding the null character, is stored into *size.

     Return NULL on error (conversion or memory allocation error). In that
     case, if size is not NULL, *size is set to (size_t)-1.

     Conversion errors should never happen, unless there is a bug in the C
     library. */
  wchar_t*
  _Py_char2wchar(const char* arg, size_t *size)
  {
  #ifdef __APPLE__
      wchar_t *wstr;

      wstr = _Py_DecodeUTF8_surrogateescape(arg, strlen(arg));
      if (size != NULL) {
          if (wstr != NULL)
              *size = wcslen(wstr);
          else
              *size = (size_t)-1;
      }
      return wstr;
  #else
      /* Largest character count N for which (N+1)*sizeof(wchar_t) does not
         wrap around in a size_t. */
      const size_t max_chars = (size_t)-1 / sizeof(wchar_t) - 1;
      wchar_t *res;
  #ifdef HAVE_BROKEN_MBSTOWCS
      /* Some platforms have a broken implementation of
       * mbstowcs which does not count the characters that
       * would result from conversion.  Use an upper bound.
       */
      size_t argsize = strlen(arg);
  #else
      size_t argsize = mbstowcs(NULL, arg, 0);
  #endif
      size_t count;
      unsigned char *in;
      wchar_t *out;
  #ifdef HAVE_MBRTOWC
      mbstate_t mbs;
  #endif

      if (argsize != (size_t)-1) {
          /* Refuse sizes whose byte count would overflow size_t. */
          if (argsize > max_chars)
              goto oom;
          res = (wchar_t *)PyMem_Malloc((argsize+1)*sizeof(wchar_t));
          if (!res)
              goto oom;
          count = mbstowcs(res, arg, argsize+1);
          if (count != (size_t)-1) {
              wchar_t *tmp;
              /* Only use the result if it contains no
                 surrogate characters. */
              for (tmp = res; *tmp != 0 &&
                           (*tmp < 0xd800 || *tmp > 0xdfff); tmp++)
                  ;
              if (*tmp == 0) {
                  if (size != NULL)
                      *size = count;
                  return res;
              }
          }
          PyMem_Free(res);
      }

      /* Conversion failed. Fall back to escaping with surrogateescape. */

      /* Overallocate: one wide character per input byte plus the terminator.
         When multi-byte characters are present the actual output uses less
         memory. */
      argsize = strlen(arg);
      if (argsize > max_chars)
          goto oom;
      argsize += 1;
      res = (wchar_t*)PyMem_Malloc(argsize*sizeof(wchar_t));
      if (!res)
          goto oom;
      in = (unsigned char*)arg;
      out = res;
  #ifdef HAVE_MBRTOWC
      /* Try conversion with mbrtowc (C99), and escape non-decodable bytes. */
      memset(&mbs, 0, sizeof mbs);
      while (argsize) {
          size_t converted = mbrtowc(out, (char*)in, argsize, &mbs);
          if (converted == 0)
              /* Reached end of string; null char stored. */
              break;
          if (converted == (size_t)-2) {
              /* Incomplete character. This should never happen,
                 since we provide everything that we have -
                 unless there is a bug in the C library, or I
                 misunderstood how mbrtowc works. */
              fprintf(stderr, "unexpected mbrtowc result -2\n");
              PyMem_Free(res);
              if (size != NULL)
                  *size = (size_t)-1;
              return NULL;
          }
          if (converted == (size_t)-1) {
              /* Conversion error. Escape as UTF-8b, and start over
                 in the initial shift state. */
              *out++ = 0xdc00 + *in++;
              argsize--;
              memset(&mbs, 0, sizeof mbs);
              continue;
          }
          if (*out >= 0xd800 && *out <= 0xdfff) {
              /* Surrogate character.  Escape the original
                 byte sequence with surrogateescape. */
              argsize -= converted;
              while (converted--)
                  *out++ = 0xdc00 + *in++;
              continue;
          }
          /* successfully converted some bytes */
          in += converted;
          argsize -= converted;
          out++;
      }
  #else   /* HAVE_MBRTOWC */
      /* Cannot use C locale for escaping; manually escape as if charset
         is ASCII (i.e. escape all bytes >= 128). This will still roundtrip
         correctly in the locale's charset, which must be an ASCII superset. */
      while (*in) {
          if (*in < 128)
              *out++ = *in++;
          else
              *out++ = 0xdc00 + *in++;
      }
      *out = 0;
  #endif   /* HAVE_MBRTOWC */
      if (size != NULL)
          *size = out - res;
      return res;

  oom:
      fprintf(stderr, "out of memory\n");
      /* Report the failure through *size as well, like the __APPLE__ path. */
      if (size != NULL)
          *size = (size_t)-1;
      return NULL;
  #endif   /* __APPLE__ */
  }
  /* Encode a (wide) character string to the locale encoding with the
     surrogateescape error handler (characters in range U+DC80..U+DCFF are
     converted to bytes 0x80..0xFF).

     This function is the reverse of _Py_char2wchar().

     Return a pointer to a newly allocated byte string (use PyMem_Free() to free
     the memory), or NULL on conversion or memory allocation error.

     If error_pos is not NULL: *error_pos is the index of the invalid character
     on conversion error, or (size_t)-1 otherwise (success, memory error, or a
     failure for which no index is known). */
  char*
  _Py_wchar2char(const wchar_t *text, size_t *error_pos)
  {
  #ifdef __APPLE__
      Py_ssize_t len;
      PyObject *unicode, *bytes = NULL;
      char *cpath;

      /* Default for every exit that does not report a character index. */
      if (error_pos != NULL)
          *error_pos = (size_t)-1;

      unicode = PyUnicode_FromWideChar(text, wcslen(text));
      if (unicode == NULL) {
          /* Failure is reported through the return value only, as on the
             other error paths below. */
          PyErr_Clear();
          return NULL;
      }

      bytes = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
                                   PyUnicode_GET_SIZE(unicode),
                                   "surrogateescape");
      Py_DECREF(unicode);
      if (bytes == NULL) {
          PyErr_Clear();
          return NULL;
      }

      len = PyBytes_GET_SIZE(bytes);
      cpath = PyMem_Malloc(len+1);
      if (cpath == NULL) {
          PyErr_Clear();
          Py_DECREF(bytes);
          return NULL;
      }
      /* len + 1: copy the trailing null byte as well. */
      memcpy(cpath, PyBytes_AsString(bytes), len + 1);
      Py_DECREF(bytes);
      return cpath;
  #else   /* __APPLE__ */
      const size_t len = wcslen(text);
      char *result = NULL, *bytes = NULL;
      size_t i, size, converted;
      wchar_t c, buf[2];

      /* Default for every exit that does not report a character index. */
      if (error_pos != NULL)
          *error_pos = (size_t)-1;

      /* The function works in two steps:
         1. compute the length of the output buffer in bytes (size)
         2. outputs the bytes
         The loop body below runs once per step; "bytes != NULL" tells the
         second step (writing) from the first one (counting). */
      size = 0;
      buf[1] = 0;
      while (1) {
          for (i=0; i < len; i++) {
              c = text[i];
              if (c >= 0xdc80 && c <= 0xdcff) {
                  /* UTF-8b surrogate */
                  if (bytes != NULL) {
                      *bytes++ = c - 0xdc00;
                      size--;
                  }
                  else
                      size++;
                  continue;
              }
              else {
                  /* Convert one character at a time through a
                     null-terminated one-character buffer. */
                  buf[0] = c;
                  if (bytes != NULL)
                      converted = wcstombs(bytes, buf, size);
                  else
                      converted = wcstombs(NULL, buf, 0);
                  if (converted == (size_t)-1) {
                      if (result != NULL)
                          PyMem_Free(result);
                      if (error_pos != NULL)
                          *error_pos = i;
                      return NULL;
                  }
                  if (bytes != NULL) {
                      bytes += converted;
                      size -= converted;
                  }
                  else
                      size += converted;
              }
          }
          if (result != NULL) {
              /* Second step finished: terminate the output. */
              *bytes = 0;
              break;
          }

          size += 1; /* nul byte at the end */
          result = PyMem_Malloc(size);
          if (result == NULL)
              return NULL;
          bytes = result;
      }
      return result;
  #endif   /* __APPLE__ */
  }
  /* Strictly speaking the guard below should test HAVE__WSTAT, with _wstat
     detected by autoconf. No current POSIX system provides that function
     though, so such a test would be pointless.

     It is unclear whether the MS_WINDOWS guards are needed at all:
     perhaps for cygwin/mingw builds?
  */
  #if defined(HAVE_STAT) && !defined(MS_WINDOWS)

  /* Get file status for a wide character path. The path is encoded to the
     locale encoding before calling stat().

     Return the result of stat(), or -1 with errno set to EINVAL if the path
     cannot be encoded. */
  int
  _Py_wstat(const wchar_t* path, struct stat *buf)
  {
      int status;
      char *encoded = _Py_wchar2char(path, NULL);

      if (encoded != NULL) {
          status = stat(encoded, buf);
          PyMem_Free(encoded);
          return status;
      }

      errno = EINVAL;
      return -1;
  }
  #endif
  265. /* Call _wstat() on Windows, or encode the path to the filesystem encoding and
  266. call stat() otherwise. Only fill st_mode attribute on Windows.
  267. Return 0 on success, -1 on _wstat() / stat() error or (if PyErr_Occurred())
  268. unicode error. */
  269. int
  270. _Py_stat(PyObject *path, struct stat *statbuf)
  271. {
  272. #ifdef MS_WINDOWS
  273. int err;
  274. struct _stat wstatbuf;
  275. err = _wstat(PyUnicode_AS_UNICODE(path), &wstatbuf);
  276. if (!err)
  277. statbuf->st_mode = wstatbuf.st_mode;
  278. return err;
  279. #else
  280. int ret;
  281. PyObject *bytes = PyUnicode_EncodeFSDefault(path);
  282. if (bytes == NULL)
  283. return -1;
  284. ret = stat(PyBytes_AS_STRING(bytes), statbuf);
  285. Py_DECREF(bytes);
  286. return ret;
  287. #endif
  288. }
  /* Open a file. Use _wfopen() on Windows, encode the path to the locale
     encoding and use fopen() otherwise.

     Return the stream on success, or NULL on error. On non-Windows platforms
     errno is set to EINVAL when the mode does not fit in the internal buffer
     or when the mode or the path cannot be encoded; otherwise errno is
     whatever fopen() left. */
  FILE *
  _Py_wfopen(const wchar_t *path, const wchar_t *mode)
  {
  #ifndef MS_WINDOWS
      FILE *f;
      char *cpath;
      char cmode[10];
      size_t r;

      /* r == sizeof(cmode) means the buffer was filled without room for the
         terminating null byte. */
      r = wcstombs(cmode, mode, sizeof(cmode));
      if (r == (size_t)-1 || r >= sizeof(cmode)) {
          errno = EINVAL;
          return NULL;
      }
      cpath = _Py_wchar2char(path, NULL);
      if (cpath == NULL) {
          /* Report the failure through errno, like the other wide character
             wrappers in this file do. */
          errno = EINVAL;
          return NULL;
      }
      f = fopen(cpath, cmode);
      PyMem_Free(cpath);
      return f;
  #else
      return _wfopen(path, mode);
  #endif
  }
  314. /* Call _wfopen() on Windows, or encode the path to the filesystem encoding and
  315. call fopen() otherwise.
  316. Return the new file object on success, or NULL if the file cannot be open or
  317. (if PyErr_Occurred()) on unicode error */
  318. FILE*
  319. _Py_fopen(PyObject *path, const char *mode)
  320. {
  321. #ifdef MS_WINDOWS
  322. wchar_t wmode[10];
  323. int usize;
  324. usize = MultiByteToWideChar(CP_ACP, 0, mode, -1, wmode, sizeof(wmode));
  325. if (usize == 0)
  326. return NULL;
  327. return _wfopen(PyUnicode_AS_UNICODE(path), wmode);
  328. #else
  329. FILE *f;
  330. PyObject *bytes = PyUnicode_EncodeFSDefault(path);
  331. if (bytes == NULL)
  332. return NULL;
  333. f = fopen(PyBytes_AS_STRING(bytes), mode);
  334. Py_DECREF(bytes);
  335. return f;
  336. #endif
  337. }
  #ifdef HAVE_READLINK

  /* Read the value of a symbolic link. The path is encoded to the locale
     encoding and the link target is decoded from the locale encoding.

     bufsiz is the capacity of buf in wide characters. Return the number of
     wide characters stored in buf (excluding the null character), or -1 on
     error. errno is set to EINVAL when the path cannot be encoded, when the
     target fills the whole internal PATH_MAX buffer, when the target cannot
     be decoded, or when it does not fit in buf. */
  int
  _Py_wreadlink(const wchar_t *path, wchar_t *buf, size_t bufsiz)
  {
      char target[PATH_MAX];
      char *encoded;
      wchar_t *decoded;
      size_t nchars;
      int nbytes;

      encoded = _Py_wchar2char(path, NULL);
      if (encoded == NULL) {
          errno = EINVAL;
          return -1;
      }

      nbytes = (int)readlink(encoded, target, PATH_MAX);
      PyMem_Free(encoded);
      if (nbytes == -1)
          return -1;
      if (nbytes == PATH_MAX) {
          /* No room left for the terminating null byte. */
          errno = EINVAL;
          return -1;
      }
      target[nbytes] = '\0'; /* readlink() does not null terminate */

      decoded = _Py_char2wchar(target, &nchars);
      if (decoded != NULL && nchars < bufsiz) {
          wcsncpy(buf, decoded, bufsiz);
          PyMem_Free(decoded);
          return (int)nchars;
      }

      /* Either decoding failed or the result does not fit in buf. */
      if (decoded != NULL)
          PyMem_Free(decoded);
      errno = EINVAL;
      return -1;
  }
  #endif
  #ifdef HAVE_REALPATH

  /* Return the canonicalized absolute pathname. The path is encoded to the
     locale encoding and the result is decoded from the locale encoding.

     resolved_path_size is the capacity of resolved_path in wide characters.
     Return resolved_path on success, or NULL on error. errno is set to EINVAL
     when the path cannot be encoded, when the result cannot be decoded, or
     when it does not fit in resolved_path. */
  wchar_t*
  _Py_wrealpath(const wchar_t *path,
                wchar_t *resolved_path, size_t resolved_path_size)
  {
      char canonical[PATH_MAX];
      char *encoded;
      char *found;
      wchar_t *decoded;
      size_t nchars;

      encoded = _Py_wchar2char(path, NULL);
      if (encoded == NULL) {
          errno = EINVAL;
          return NULL;
      }

      found = realpath(encoded, canonical);
      PyMem_Free(encoded);
      if (found == NULL)
          return NULL;

      decoded = _Py_char2wchar(canonical, &nchars);
      if (decoded != NULL && nchars < resolved_path_size) {
          wcsncpy(resolved_path, decoded, resolved_path_size);
          PyMem_Free(decoded);
          return resolved_path;
      }

      /* Either decoding failed or the result does not fit in the buffer. */
      if (decoded != NULL)
          PyMem_Free(decoded);
      errno = EINVAL;
      return NULL;
  }
  #endif
  414. /* Get the current directory. size is the buffer size in wide characters
  415. including the null character. Decode the path from the locale encoding. */
  416. wchar_t*
  417. _Py_wgetcwd(wchar_t *buf, size_t size)
  418. {
  419. #ifdef MS_WINDOWS
  420. return _wgetcwd(buf, size);
  421. #else
  422. char fname[PATH_MAX];
  423. wchar_t *wname;
  424. size_t len;
  425. if (getcwd(fname, PATH_MAX) == NULL)
  426. return NULL;
  427. wname = _Py_char2wchar(fname, &len);
  428. if (wname == NULL)
  429. return NULL;
  430. if (size <= len) {
  431. PyMem_Free(wname);
  432. return NULL;
  433. }
  434. wcsncpy(buf, wname, size);
  435. PyMem_Free(wname);
  436. return buf;
  437. #endif
  438. }
  439. #endif