You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

394 lines
11 KiB

  1. /* stringlib: split implementation */
  2. #ifndef STRINGLIB_SPLIT_H
  3. #define STRINGLIB_SPLIT_H
  4. #ifndef STRINGLIB_FASTSEARCH_H
  5. #error must include "stringlib/fastsearch.h" before including this module
  6. #endif
  /* Overallocate the initial list to reduce the number of reallocs for small
     split sizes.  E.g., "A A A A A A A A A A".split() (10 elements) has three
     resizes, to sizes 4, 8, then 16.  Most observed string splits are for human
     text (roughly 11 words per line) and field-delimited data (usually 1-10
     fields).  For large strings the split algorithms are bandwidth limited,
     so increasing the preallocation likely will not improve things. */
  /* Upper bound on the number of list slots reserved up front. */
  #define MAX_PREALLOC 12

  /* Number of slots to preallocate for a split limited to `maxsplit` cuts:
     maxsplit + 1 pieces, capped at MAX_PREALLOC (5 splits gives 6 elements).
     The parameter is parenthesized so that any expression can be passed
     without precedence surprises; it may still be evaluated twice, so it
     must be free of side effects. */
  #define PREALLOC_SIZE(maxsplit) \
      ((maxsplit) >= MAX_PREALLOC ? MAX_PREALLOC : (maxsplit) + 1)
  /* Append the slice data[left:right] to `list` as a newly created object.

     Relies on names supplied by the enclosing function: `list` (the result
     list), `sub` (a PyObject * scratch variable) and an `onError` label,
     which is jumped to when either creating the object or appending it
     fails.  The local reference held in `sub` is released after the append
     on both the success and the failure path.

     The body is wrapped in do { } while (0) so that the expansion forms a
     single statement and stays correct as the body of an unbraced if/else.
     Invoke it with a trailing semicolon. */
  #define SPLIT_APPEND(data, left, right)                 \
      do {                                                \
          sub = STRINGLIB_NEW((data) + (left),            \
                              (right) - (left));          \
          if (sub == NULL)                                \
              goto onError;                               \
          if (PyList_Append(list, sub)) {                 \
              Py_DECREF(sub);                             \
              goto onError;                               \
          }                                               \
          Py_DECREF(sub);                                 \
      } while (0)
  /* Store the slice data[left:right] as item number `count` of `list`.

     Relies on names supplied by the enclosing function: `list`, `sub`
     (a PyObject * scratch variable), `count` (number of items stored so
     far, incremented here) and an `onError` label used on failure.

     While count < MAX_PREALLOC the new object is written straight into the
     preallocated slot with PyList_SET_ITEM and its reference is handed to
     the list.  Beyond that the list is grown with PyList_Append and the
     local reference is released afterwards.
     NOTE(review): the direct store presumes the list was created with
     enough preallocated slots (see PREALLOC_SIZE) -- confirm for any new
     caller.

     The body is wrapped in do { } while (0) so that the expansion forms a
     single statement and stays correct as the body of an unbraced if/else.
     Invoke it with a trailing semicolon. */
  #define SPLIT_ADD(data, left, right)                    \
      do {                                                \
          sub = STRINGLIB_NEW((data) + (left),            \
                              (right) - (left));          \
          if (sub == NULL)                                \
              goto onError;                               \
          if (count < MAX_PREALLOC) {                     \
              PyList_SET_ITEM(list, count, sub);          \
          }                                               \
          else {                                          \
              if (PyList_Append(list, sub)) {             \
                  Py_DECREF(sub);                         \
                  goto onError;                           \
              }                                           \
              Py_DECREF(sub);                             \
          }                                               \
          count++;                                        \
      } while (0)
  /* Always force the list to the expected size: set its length to `count`,
     a variable of the enclosing function holding the number of items stored.

     Py_SIZE() can no longer be assigned to once it is an inline function
     (Python 3.11+), so Py_SET_SIZE() (available from Python 3.9) is used
     when the headers provide it; older headers keep the direct assignment. */
  #if defined(PY_VERSION_HEX) && PY_VERSION_HEX >= 0x030900A4
  #define FIX_PREALLOC_SIZE(list) Py_SET_SIZE(list, count)
  #else
  #define FIX_PREALLOC_SIZE(list) (Py_SIZE(list) = count)
  #endif
  46. Py_LOCAL_INLINE(PyObject *)
  47. stringlib_split_whitespace(PyObject* str_obj,
  48. const STRINGLIB_CHAR* str, Py_ssize_t str_len,
  49. Py_ssize_t maxcount)
  50. {
  51. Py_ssize_t i, j, count=0;
  52. PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
  53. PyObject *sub;
  54. if (list == NULL)
  55. return NULL;
  56. i = j = 0;
  57. while (maxcount-- > 0) {
  58. while (i < str_len && STRINGLIB_ISSPACE(str[i]))
  59. i++;
  60. if (i == str_len) break;
  61. j = i; i++;
  62. while (i < str_len && !STRINGLIB_ISSPACE(str[i]))
  63. i++;
  64. #ifndef STRINGLIB_MUTABLE
  65. if (j == 0 && i == str_len && STRINGLIB_CHECK_EXACT(str_obj)) {
  66. /* No whitespace in str_obj, so just use it as list[0] */
  67. Py_INCREF(str_obj);
  68. PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
  69. count++;
  70. break;
  71. }
  72. #endif
  73. SPLIT_ADD(str, j, i);
  74. }
  75. if (i < str_len) {
  76. /* Only occurs when maxcount was reached */
  77. /* Skip any remaining whitespace and copy to end of string */
  78. while (i < str_len && STRINGLIB_ISSPACE(str[i]))
  79. i++;
  80. if (i != str_len)
  81. SPLIT_ADD(str, i, str_len);
  82. }
  83. FIX_PREALLOC_SIZE(list);
  84. return list;
  85. onError:
  86. Py_DECREF(list);
  87. return NULL;
  88. }
  89. Py_LOCAL_INLINE(PyObject *)
  90. stringlib_split_char(PyObject* str_obj,
  91. const STRINGLIB_CHAR* str, Py_ssize_t str_len,
  92. const STRINGLIB_CHAR ch,
  93. Py_ssize_t maxcount)
  94. {
  95. Py_ssize_t i, j, count=0;
  96. PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
  97. PyObject *sub;
  98. if (list == NULL)
  99. return NULL;
  100. i = j = 0;
  101. while ((j < str_len) && (maxcount-- > 0)) {
  102. for(; j < str_len; j++) {
  103. /* I found that using memchr makes no difference */
  104. if (str[j] == ch) {
  105. SPLIT_ADD(str, i, j);
  106. i = j = j + 1;
  107. break;
  108. }
  109. }
  110. }
  111. #ifndef STRINGLIB_MUTABLE
  112. if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
  113. /* ch not in str_obj, so just use str_obj as list[0] */
  114. Py_INCREF(str_obj);
  115. PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
  116. count++;
  117. } else
  118. #endif
  119. if (i <= str_len) {
  120. SPLIT_ADD(str, i, str_len);
  121. }
  122. FIX_PREALLOC_SIZE(list);
  123. return list;
  124. onError:
  125. Py_DECREF(list);
  126. return NULL;
  127. }
  128. Py_LOCAL_INLINE(PyObject *)
  129. stringlib_split(PyObject* str_obj,
  130. const STRINGLIB_CHAR* str, Py_ssize_t str_len,
  131. const STRINGLIB_CHAR* sep, Py_ssize_t sep_len,
  132. Py_ssize_t maxcount)
  133. {
  134. Py_ssize_t i, j, pos, count=0;
  135. PyObject *list, *sub;
  136. if (sep_len == 0) {
  137. PyErr_SetString(PyExc_ValueError, "empty separator");
  138. return NULL;
  139. }
  140. else if (sep_len == 1)
  141. return stringlib_split_char(str_obj, str, str_len, sep[0], maxcount);
  142. list = PyList_New(PREALLOC_SIZE(maxcount));
  143. if (list == NULL)
  144. return NULL;
  145. i = j = 0;
  146. while (maxcount-- > 0) {
  147. pos = fastsearch(str+i, str_len-i, sep, sep_len, -1, FAST_SEARCH);
  148. if (pos < 0)
  149. break;
  150. j = i + pos;
  151. SPLIT_ADD(str, i, j);
  152. i = j + sep_len;
  153. }
  154. #ifndef STRINGLIB_MUTABLE
  155. if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
  156. /* No match in str_obj, so just use it as list[0] */
  157. Py_INCREF(str_obj);
  158. PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
  159. count++;
  160. } else
  161. #endif
  162. {
  163. SPLIT_ADD(str, i, str_len);
  164. }
  165. FIX_PREALLOC_SIZE(list);
  166. return list;
  167. onError:
  168. Py_DECREF(list);
  169. return NULL;
  170. }
  171. Py_LOCAL_INLINE(PyObject *)
  172. stringlib_rsplit_whitespace(PyObject* str_obj,
  173. const STRINGLIB_CHAR* str, Py_ssize_t str_len,
  174. Py_ssize_t maxcount)
  175. {
  176. Py_ssize_t i, j, count=0;
  177. PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
  178. PyObject *sub;
  179. if (list == NULL)
  180. return NULL;
  181. i = j = str_len - 1;
  182. while (maxcount-- > 0) {
  183. while (i >= 0 && STRINGLIB_ISSPACE(str[i]))
  184. i--;
  185. if (i < 0) break;
  186. j = i; i--;
  187. while (i >= 0 && !STRINGLIB_ISSPACE(str[i]))
  188. i--;
  189. #ifndef STRINGLIB_MUTABLE
  190. if (j == str_len - 1 && i < 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
  191. /* No whitespace in str_obj, so just use it as list[0] */
  192. Py_INCREF(str_obj);
  193. PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
  194. count++;
  195. break;
  196. }
  197. #endif
  198. SPLIT_ADD(str, i + 1, j + 1);
  199. }
  200. if (i >= 0) {
  201. /* Only occurs when maxcount was reached */
  202. /* Skip any remaining whitespace and copy to beginning of string */
  203. while (i >= 0 && STRINGLIB_ISSPACE(str[i]))
  204. i--;
  205. if (i >= 0)
  206. SPLIT_ADD(str, 0, i + 1);
  207. }
  208. FIX_PREALLOC_SIZE(list);
  209. if (PyList_Reverse(list) < 0)
  210. goto onError;
  211. return list;
  212. onError:
  213. Py_DECREF(list);
  214. return NULL;
  215. }
  216. Py_LOCAL_INLINE(PyObject *)
  217. stringlib_rsplit_char(PyObject* str_obj,
  218. const STRINGLIB_CHAR* str, Py_ssize_t str_len,
  219. const STRINGLIB_CHAR ch,
  220. Py_ssize_t maxcount)
  221. {
  222. Py_ssize_t i, j, count=0;
  223. PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
  224. PyObject *sub;
  225. if (list == NULL)
  226. return NULL;
  227. i = j = str_len - 1;
  228. while ((i >= 0) && (maxcount-- > 0)) {
  229. for(; i >= 0; i--) {
  230. if (str[i] == ch) {
  231. SPLIT_ADD(str, i + 1, j + 1);
  232. j = i = i - 1;
  233. break;
  234. }
  235. }
  236. }
  237. #ifndef STRINGLIB_MUTABLE
  238. if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
  239. /* ch not in str_obj, so just use str_obj as list[0] */
  240. Py_INCREF(str_obj);
  241. PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
  242. count++;
  243. } else
  244. #endif
  245. if (j >= -1) {
  246. SPLIT_ADD(str, 0, j + 1);
  247. }
  248. FIX_PREALLOC_SIZE(list);
  249. if (PyList_Reverse(list) < 0)
  250. goto onError;
  251. return list;
  252. onError:
  253. Py_DECREF(list);
  254. return NULL;
  255. }
  256. Py_LOCAL_INLINE(PyObject *)
  257. stringlib_rsplit(PyObject* str_obj,
  258. const STRINGLIB_CHAR* str, Py_ssize_t str_len,
  259. const STRINGLIB_CHAR* sep, Py_ssize_t sep_len,
  260. Py_ssize_t maxcount)
  261. {
  262. Py_ssize_t j, pos, count=0;
  263. PyObject *list, *sub;
  264. if (sep_len == 0) {
  265. PyErr_SetString(PyExc_ValueError, "empty separator");
  266. return NULL;
  267. }
  268. else if (sep_len == 1)
  269. return stringlib_rsplit_char(str_obj, str, str_len, sep[0], maxcount);
  270. list = PyList_New(PREALLOC_SIZE(maxcount));
  271. if (list == NULL)
  272. return NULL;
  273. j = str_len;
  274. while (maxcount-- > 0) {
  275. pos = fastsearch(str, j, sep, sep_len, -1, FAST_RSEARCH);
  276. if (pos < 0)
  277. break;
  278. SPLIT_ADD(str, pos + sep_len, j);
  279. j = pos;
  280. }
  281. #ifndef STRINGLIB_MUTABLE
  282. if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
  283. /* No match in str_obj, so just use it as list[0] */
  284. Py_INCREF(str_obj);
  285. PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
  286. count++;
  287. } else
  288. #endif
  289. {
  290. SPLIT_ADD(str, 0, j);
  291. }
  292. FIX_PREALLOC_SIZE(list);
  293. if (PyList_Reverse(list) < 0)
  294. goto onError;
  295. return list;
  296. onError:
  297. Py_DECREF(list);
  298. return NULL;
  299. }
  300. Py_LOCAL_INLINE(PyObject *)
  301. stringlib_splitlines(PyObject* str_obj,
  302. const STRINGLIB_CHAR* str, Py_ssize_t str_len,
  303. int keepends)
  304. {
  305. /* This does not use the preallocated list because splitlines is
  306. usually run with hundreds of newlines. The overhead of
  307. switching between PyList_SET_ITEM and append causes about a
  308. 2-3% slowdown for that common case. A smarter implementation
  309. could move the if check out, so the SET_ITEMs are done first
  310. and the appends only done when the prealloc buffer is full.
  311. That's too much work for little gain.*/
  312. register Py_ssize_t i;
  313. register Py_ssize_t j;
  314. PyObject *list = PyList_New(0);
  315. PyObject *sub;
  316. if (list == NULL)
  317. return NULL;
  318. for (i = j = 0; i < str_len; ) {
  319. Py_ssize_t eol;
  320. /* Find a line and append it */
  321. while (i < str_len && !STRINGLIB_ISLINEBREAK(str[i]))
  322. i++;
  323. /* Skip the line break reading CRLF as one line break */
  324. eol = i;
  325. if (i < str_len) {
  326. if (str[i] == '\r' && i + 1 < str_len && str[i+1] == '\n')
  327. i += 2;
  328. else
  329. i++;
  330. if (keepends)
  331. eol = i;
  332. }
  333. #ifndef STRINGLIB_MUTABLE
  334. if (j == 0 && eol == str_len && STRINGLIB_CHECK_EXACT(str_obj)) {
  335. /* No linebreak in str_obj, so just use it as list[0] */
  336. if (PyList_Append(list, str_obj))
  337. goto onError;
  338. break;
  339. }
  340. #endif
  341. SPLIT_APPEND(str, j, eol);
  342. j = i;
  343. }
  344. return list;
  345. onError:
  346. Py_DECREF(list);
  347. return NULL;
  348. }
  349. #endif