You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

453 lines
12 KiB

  1. #include "Python.h"
  2. #ifdef MS_WINDOWS
  3. # include <windows.h>
  4. #endif
  5. #ifdef HAVE_STAT
  6. /* Decode a byte string from the locale encoding with the
  7. surrogateescape error handler (undecodable bytes are decoded as characters
  8. in range U+DC80..U+DCFF). If a byte sequence can be decoded as a surrogate
  9. character, escape the bytes using the surrogateescape error handler instead
  10. of decoding them.
  11. Use _Py_wchar2char() to encode the character string back to a byte string.
  12. Return a pointer to a newly allocated wide character string (use
  13. PyMem_Free() to free the memory) and write the number of written wide
  14. characters excluding the null character into *size if size is not NULL, or
  15. NULL on error (decoding or memory allocation error). If size is not NULL,
  16. *size is set to (size_t)-1 on memory error and (size_t)-2 on decoding
  17. error.
  18. Conversion errors should never happen, unless there is a bug in the C
  19. library. */
  20. wchar_t*
  21. _Py_char2wchar(const char* arg, size_t *size)
  22. {
  23. wchar_t *res;
  24. #ifdef HAVE_BROKEN_MBSTOWCS
  25. /* Some platforms have a broken implementation of
  26. * mbstowcs which does not count the characters that
  27. * would result from conversion. Use an upper bound.
  28. */
  29. size_t argsize = strlen(arg);
  30. #else
  31. size_t argsize = mbstowcs(NULL, arg, 0);
  32. #endif
  33. size_t count;
  34. unsigned char *in;
  35. wchar_t *out;
  36. #ifdef HAVE_MBRTOWC
  37. mbstate_t mbs;
  38. #endif
  39. if (argsize != (size_t)-1) {
  40. res = (wchar_t *)PyMem_Malloc((argsize+1)*sizeof(wchar_t));
  41. if (!res)
  42. goto oom;
  43. count = mbstowcs(res, arg, argsize+1);
  44. if (count != (size_t)-1) {
  45. wchar_t *tmp;
  46. /* Only use the result if it contains no
  47. surrogate characters. */
  48. for (tmp = res; *tmp != 0 &&
  49. (*tmp < 0xd800 || *tmp > 0xdfff); tmp++)
  50. ;
  51. if (*tmp == 0) {
  52. if (size != NULL)
  53. *size = count;
  54. return res;
  55. }
  56. }
  57. PyMem_Free(res);
  58. }
  59. /* Conversion failed. Fall back to escaping with surrogateescape. */
  60. #ifdef HAVE_MBRTOWC
  61. /* Try conversion with mbrtwoc (C99), and escape non-decodable bytes. */
  62. /* Overallocate; as multi-byte characters are in the argument, the
  63. actual output could use less memory. */
  64. argsize = strlen(arg) + 1;
  65. res = (wchar_t*)PyMem_Malloc(argsize*sizeof(wchar_t));
  66. if (!res)
  67. goto oom;
  68. in = (unsigned char*)arg;
  69. out = res;
  70. memset(&mbs, 0, sizeof mbs);
  71. while (argsize) {
  72. size_t converted = mbrtowc(out, (char*)in, argsize, &mbs);
  73. if (converted == 0)
  74. /* Reached end of string; null char stored. */
  75. break;
  76. if (converted == (size_t)-2) {
  77. /* Incomplete character. This should never happen,
  78. since we provide everything that we have -
  79. unless there is a bug in the C library, or I
  80. misunderstood how mbrtowc works. */
  81. PyMem_Free(res);
  82. if (size != NULL)
  83. *size = (size_t)-2;
  84. return NULL;
  85. }
  86. if (converted == (size_t)-1) {
  87. /* Conversion error. Escape as UTF-8b, and start over
  88. in the initial shift state. */
  89. *out++ = 0xdc00 + *in++;
  90. argsize--;
  91. memset(&mbs, 0, sizeof mbs);
  92. continue;
  93. }
  94. if (*out >= 0xd800 && *out <= 0xdfff) {
  95. /* Surrogate character. Escape the original
  96. byte sequence with surrogateescape. */
  97. argsize -= converted;
  98. while (converted--)
  99. *out++ = 0xdc00 + *in++;
  100. continue;
  101. }
  102. /* successfully converted some bytes */
  103. in += converted;
  104. argsize -= converted;
  105. out++;
  106. }
  107. #else
  108. /* Cannot use C locale for escaping; manually escape as if charset
  109. is ASCII (i.e. escape all bytes > 128. This will still roundtrip
  110. correctly in the locale's charset, which must be an ASCII superset. */
  111. res = PyMem_Malloc((strlen(arg)+1)*sizeof(wchar_t));
  112. if (!res)
  113. goto oom;
  114. in = (unsigned char*)arg;
  115. out = res;
  116. while(*in)
  117. if(*in < 128)
  118. *out++ = *in++;
  119. else
  120. *out++ = 0xdc00 + *in++;
  121. *out = 0;
  122. #endif
  123. if (size != NULL)
  124. *size = out - res;
  125. return res;
  126. oom:
  127. if (size != NULL)
  128. *size = (size_t)-1;
  129. return NULL;
  130. }
  /* Encode a wide character string to the locale encoding using the
     surrogateescape error handler: characters in the range U+DC80..U+DCFF
     are turned back into the bytes 0x80..0xFF.

     This is the inverse of _Py_char2wchar().

     On success, return a newly allocated, null-terminated byte string that
     the caller must release with PyMem_Free().  Return NULL on encoding
     error or when memory cannot be allocated.

     If error_pos is not NULL, *error_pos receives the index of the offending
     character on encoding error, and (size_t)-1 in every other case. */
  char*
  _Py_wchar2char(const wchar_t *text, size_t *error_pos)
  {
      const size_t nchars = wcslen(text);
      char *encoded = NULL;     /* output buffer; NULL during the first pass */
      char *cursor = NULL;      /* write position; NULL during the first pass */
      size_t space = 0;         /* pass 1: bytes needed; pass 2: bytes left */
      size_t idx, nbytes;
      wchar_t single[2];        /* one character plus terminator for wcstombs */

      single[1] = 0;
      if (error_pos != NULL)
          *error_pos = (size_t)-1;

      /* Two passes over the text: the first one only measures the output,
         the second one writes into the buffer allocated in between. */
      for (;;) {
          for (idx = 0; idx < nchars; idx++) {
              const wchar_t ch = text[idx];

              if (ch >= 0xdc80 && ch <= 0xdcff) {
                  /* UTF-8b surrogate: maps to exactly one raw byte */
                  if (cursor == NULL) {
                      space++;
                  }
                  else {
                      *cursor++ = ch - 0xdc00;
                      space--;
                  }
                  continue;
              }

              single[0] = ch;
              nbytes = (cursor == NULL)
                       ? wcstombs(NULL, single, 0)
                       : wcstombs(cursor, single, space);
              if (nbytes == (size_t)-1) {
                  /* Character not representable in the locale encoding */
                  if (encoded != NULL)
                      PyMem_Free(encoded);
                  if (error_pos != NULL)
                      *error_pos = idx;
                  return NULL;
              }

              if (cursor == NULL) {
                  space += nbytes;
              }
              else {
                  cursor += nbytes;
                  space -= nbytes;
              }
          }

          if (encoded != NULL) {
              /* Second pass finished: terminate the string and stop. */
              *cursor = 0;
              break;
          }

          /* First pass finished: reserve room for the bytes plus the nul. */
          space += 1;
          encoded = PyMem_Malloc(space);
          if (encoded == NULL)
              return NULL;
          cursor = encoded;
      }
      return encoded;
  }
  /* In principle, this should use HAVE__WSTAT, with _wstat detected by
     autoconf.  However, no current POSIX system provides that function, so
     testing for it is pointless.

     It is unclear whether the MS_WINDOWS guard is necessary: perhaps for
     cygwin/mingw builds?
  */
  #if defined(HAVE_STAT) && !defined(MS_WINDOWS)

  /* Get file status for a wide character path.  The path is encoded to the
     locale encoding with _Py_wchar2char() before calling stat().

     Return the result of stat(), or -1 with errno set to EINVAL if the path
     cannot be encoded. */
  int
  _Py_wstat(const wchar_t* path, struct stat *buf)
  {
      char *encoded = _Py_wchar2char(path, NULL);
      int status;

      if (encoded == NULL) {
          errno = EINVAL;
          return -1;
      }

      status = stat(encoded, buf);
      PyMem_Free(encoded);
      return status;
  }

  #endif
  223. /* Call _wstat() on Windows, or encode the path to the filesystem encoding and
  224. call stat() otherwise. Only fill st_mode attribute on Windows.
  225. Return 0 on success, -1 on _wstat() / stat() error, -2 if an exception was
  226. raised. */
  227. int
  228. _Py_stat(PyObject *path, struct stat *statbuf)
  229. {
  230. #ifdef MS_WINDOWS
  231. int err;
  232. struct _stat wstatbuf;
  233. wchar_t *wpath;
  234. wpath = PyUnicode_AsUnicode(path);
  235. if (wpath == NULL)
  236. return -2;
  237. err = _wstat(wpath, &wstatbuf);
  238. if (!err)
  239. statbuf->st_mode = wstatbuf.st_mode;
  240. return err;
  241. #else
  242. int ret;
  243. PyObject *bytes = PyUnicode_EncodeFSDefault(path);
  244. if (bytes == NULL)
  245. return -2;
  246. ret = stat(PyBytes_AS_STRING(bytes), statbuf);
  247. Py_DECREF(bytes);
  248. return ret;
  249. #endif
  250. }
  /* Open a file given wide character path and mode strings.

     On Windows this is a direct call to _wfopen().  Elsewhere the mode is
     converted with wcstombs(), the path is encoded to the locale encoding
     with _Py_wchar2char(), and fopen() is used.

     Return NULL on failure.  errno is set to EINVAL when the mode cannot be
     converted or does not fit, terminator included, in the 10-byte buffer. */
  FILE *
  _Py_wfopen(const wchar_t *path, const wchar_t *mode)
  {
  #ifdef MS_WINDOWS
      return _wfopen(path, mode);
  #else
      char narrow_mode[10];
      char *narrow_path;
      FILE *stream;
      size_t mode_len = wcstombs(narrow_mode, mode, sizeof(narrow_mode));

      /* A result equal to the buffer size means no room was left for the
         terminating nul byte. */
      if (mode_len == (size_t)-1 || mode_len >= sizeof(narrow_mode)) {
          errno = EINVAL;
          return NULL;
      }

      narrow_path = _Py_wchar2char(path, NULL);
      if (narrow_path == NULL)
          return NULL;

      stream = fopen(narrow_path, narrow_mode);
      PyMem_Free(narrow_path);
      return stream;
  #endif
  }
  276. /* Call _wfopen() on Windows, or encode the path to the filesystem encoding and
  277. call fopen() otherwise.
  278. Return the new file object on success, or NULL if the file cannot be open or
  279. (if PyErr_Occurred()) on unicode error */
  280. FILE*
  281. _Py_fopen(PyObject *path, const char *mode)
  282. {
  283. #ifdef MS_WINDOWS
  284. wchar_t *wpath;
  285. wchar_t wmode[10];
  286. int usize;
  287. if (!PyUnicode_Check(path)) {
  288. PyErr_Format(PyExc_TypeError,
  289. "str file path expected under Windows, got %R",
  290. Py_TYPE(path));
  291. return NULL;
  292. }
  293. wpath = PyUnicode_AsUnicode(path);
  294. if (wpath == NULL)
  295. return NULL;
  296. usize = MultiByteToWideChar(CP_ACP, 0, mode, -1, wmode, sizeof(wmode));
  297. if (usize == 0)
  298. return NULL;
  299. return _wfopen(wpath, wmode);
  300. #else
  301. FILE *f;
  302. PyObject *bytes;
  303. if (!PyUnicode_FSConverter(path, &bytes))
  304. return NULL;
  305. f = fopen(PyBytes_AS_STRING(bytes), mode);
  306. Py_DECREF(bytes);
  307. return f;
  308. #endif
  309. }
  #ifdef HAVE_READLINK

  /* Read the value of a symbolic link.  The path is encoded to the locale
     encoding, and the link target is decoded from the locale encoding into
     buf, which has room for bufsiz wide characters.

     Return the number of wide characters in the target (terminator not
     counted), or -1 on error.  errno is set to EINVAL when the path cannot be
     encoded, when the target fills the whole PATH_MAX byte buffer, when the
     target cannot be decoded, or when it does not fit in buf together with
     its terminator.  When readlink() itself fails, its errno is left as is. */
  int
  _Py_wreadlink(const wchar_t *path, wchar_t *buf, size_t bufsiz)
  {
      char target[PATH_MAX];
      char *encoded_path;
      wchar_t *decoded;
      size_t decoded_len;
      int nread;

      encoded_path = _Py_wchar2char(path, NULL);
      if (encoded_path == NULL)
          goto invalid;

      nread = (int)readlink(encoded_path, target, PATH_MAX);
      PyMem_Free(encoded_path);
      if (nread == -1)
          return -1;
      if (nread == PATH_MAX)
          goto invalid;

      /* readlink() does not terminate the string itself */
      target[nread] = '\0';

      decoded = _Py_char2wchar(target, &decoded_len);
      if (decoded == NULL)
          goto invalid;

      if (bufsiz <= decoded_len) {
          PyMem_Free(decoded);
          goto invalid;
      }

      /* bufsiz > decoded_len, so the copy includes the terminator */
      wcsncpy(buf, decoded, bufsiz);
      PyMem_Free(decoded);
      return (int)decoded_len;

  invalid:
      errno = EINVAL;
      return -1;
  }

  #endif
  #ifdef HAVE_REALPATH

  /* Return the canonicalized absolute pathname of path.  The path is encoded
     to the locale encoding for realpath(), and the result is decoded from the
     locale encoding into resolved_path, which has room for
     resolved_path_size wide characters.

     Return resolved_path on success and NULL on error.  errno is set to
     EINVAL when the path cannot be encoded, when the result cannot be
     decoded, or when it does not fit in resolved_path together with its
     terminator.  When realpath() itself fails, its errno is left as is. */
  wchar_t*
  _Py_wrealpath(const wchar_t *path,
                wchar_t *resolved_path, size_t resolved_path_size)
  {
      char canonical[PATH_MAX];
      char *encoded_path;
      char *found;
      wchar_t *decoded;
      size_t decoded_len;

      encoded_path = _Py_wchar2char(path, NULL);
      if (encoded_path == NULL)
          goto invalid;

      found = realpath(encoded_path, canonical);
      PyMem_Free(encoded_path);
      if (found == NULL)
          return NULL;

      decoded = _Py_char2wchar(canonical, &decoded_len);
      if (decoded == NULL)
          goto invalid;

      if (resolved_path_size <= decoded_len) {
          PyMem_Free(decoded);
          goto invalid;
      }

      /* resolved_path_size > decoded_len, so the terminator is copied too */
      wcsncpy(resolved_path, decoded, resolved_path_size);
      PyMem_Free(decoded);
      return resolved_path;

  invalid:
      errno = EINVAL;
      return NULL;
  }

  #endif
  387. /* Get the current directory. size is the buffer size in wide characters
  388. including the null character. Decode the path from the locale encoding.
  389. Return NULL on error. */
  390. wchar_t*
  391. _Py_wgetcwd(wchar_t *buf, size_t size)
  392. {
  393. #ifdef MS_WINDOWS
  394. return _wgetcwd(buf, size);
  395. #else
  396. char fname[PATH_MAX];
  397. wchar_t *wname;
  398. size_t len;
  399. if (getcwd(fname, PATH_MAX) == NULL)
  400. return NULL;
  401. wname = _Py_char2wchar(fname, &len);
  402. if (wname == NULL)
  403. return NULL;
  404. if (size <= len) {
  405. PyMem_Free(wname);
  406. return NULL;
  407. }
  408. wcsncpy(buf, wname, size);
  409. PyMem_Free(wname);
  410. return buf;
  411. #endif
  412. }
  413. #endif