You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1469 lines
43 KiB

27 years ago
27 years ago
24 years ago
24 years ago
24 years ago
25 years ago
25 years ago
21 years ago
21 years ago
23 years ago
23 years ago
23 years ago
23 years ago
23 years ago
21 years ago
24 years ago
24 years ago
24 years ago
24 years ago
18 years ago
18 years ago
24 years ago
22 years ago
24 years ago
24 years ago
24 years ago
24 years ago
24 years ago
24 years ago
24 years ago
24 years ago
24 years ago
24 years ago
24 years ago
24 years ago
24 years ago
24 years ago
24 years ago
24 years ago
24 years ago
24 years ago
24 years ago
24 years ago
24 years ago
24 years ago
19 years ago
24 years ago
24 years ago
24 years ago
24 years ago
24 years ago
24 years ago
24 years ago
24 years ago
24 years ago
24 years ago
24 years ago
24 years ago
24 years ago
23 years ago
25 years ago
27 years ago
25 years ago
20 years ago
24 years ago
19 years ago
27 years ago
25 years ago
24 years ago
24 years ago
24 years ago
19 years ago
24 years ago
25 years ago
  1. /*
  2. +----------------------------------------------------------------------+
  3. | PHP Version 5 |
  4. +----------------------------------------------------------------------+
  5. | Copyright (c) 1997-2009 The PHP Group |
  6. +----------------------------------------------------------------------+
  7. | This source file is subject to version 3.01 of the PHP license, |
  8. | that is bundled with this package in the file LICENSE, and is |
  9. | available through the world-wide-web at the following url: |
  10. | http://www.php.net/license/3_01.txt |
  11. | If you did not receive a copy of the PHP license and are unable to |
  12. | obtain it through the world-wide-web, please send a note to |
  13. | license@php.net so we can mail you a copy immediately. |
  14. +----------------------------------------------------------------------+
  15. | Authors: Rasmus Lerdorf <rasmus@php.net> |
  16. | Jaakko Hyvtti <jaakko.hyvatti@iki.fi> |
  17. | Wez Furlong <wez@thebrainroom.com> |
  18. +----------------------------------------------------------------------+
  19. */
  20. /* $Id$ */
  21. /*
  22. * HTML entity resources:
  23. *
  24. * http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset2.asp
  25. * http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset3.asp
  26. * http://www.unicode.org/Public/MAPPINGS/OBSOLETE/UNI2SGML.TXT
  27. *
  28. * http://www.w3.org/TR/2002/REC-xhtml1-20020801/dtds.html#h-A2
  29. *
  30. */
  31. #include "php.h"
  32. #if PHP_WIN32
  33. #include "config.w32.h"
  34. #else
  35. #include <php_config.h>
  36. #endif
  37. #include "html.h"
  38. #include "php_string.h"
  39. #include "SAPI.h"
  40. #if HAVE_LOCALE_H
  41. #include <locale.h>
  42. #endif
  43. #if HAVE_LANGINFO_H
  44. #include <langinfo.h>
  45. #endif
  46. #if HAVE_MBSTRING
  47. # include "ext/mbstring/mbstring.h"
  48. ZEND_EXTERN_MODULE_GLOBALS(mbstring)
  49. #endif
  /* Character sets for which this file carries entity translation tables.
   * cs_terminator is the first enumerator (value 0) and is the charset of
   * the closing entry of entity_map[]. */
  enum entity_charset {
      cs_terminator,
      cs_8859_1,
      cs_cp1252,
      cs_8859_15,
      cs_utf_8,
      cs_big5,
      cs_gb2312,
      cs_big5hkscs,
      cs_sjis,
      cs_eucjp,
      cs_koi8r,
      cs_cp1251,
      cs_8859_5,
      cs_cp866,
      cs_macroman
  };

  /* Element type of an entity table: an immutable pointer to an immutable
   * entity name (stored without '&' and ';'). The tables use NULL for a
   * character code that has no entity. */
  typedef const char *const entity_table_t;
  56. /* codepage 1252 is a Windows extension to iso-8859-1. */
  57. static entity_table_t ent_cp_1252[] = {
  58. "euro", NULL, "sbquo", "fnof", "bdquo", "hellip", "dagger",
  59. "Dagger", "circ", "permil", "Scaron", "lsaquo", "OElig",
  60. NULL, NULL, NULL, NULL, "lsquo", "rsquo", "ldquo", "rdquo",
  61. "bull", "ndash", "mdash", "tilde", "trade", "scaron", "rsaquo",
  62. "oelig", NULL, NULL, "Yuml"
  63. };
  64. static entity_table_t ent_iso_8859_1[] = {
  65. "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar",
  66. "sect", "uml", "copy", "ordf", "laquo", "not", "shy", "reg",
  67. "macr", "deg", "plusmn", "sup2", "sup3", "acute", "micro",
  68. "para", "middot", "cedil", "sup1", "ordm", "raquo", "frac14",
  69. "frac12", "frac34", "iquest", "Agrave", "Aacute", "Acirc",
  70. "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
  71. "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
  72. "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
  73. "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
  74. "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
  75. "atilde", "auml", "aring", "aelig", "ccedil", "egrave",
  76. "eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
  77. "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
  78. "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
  79. "uuml", "yacute", "thorn", "yuml"
  80. };
  81. static entity_table_t ent_iso_8859_15[] = {
  82. "nbsp", "iexcl", "cent", "pound", "euro", "yen", "Scaron",
  83. "sect", "scaron", "copy", "ordf", "laquo", "not", "shy", "reg",
  84. "macr", "deg", "plusmn", "sup2", "sup3", NULL, /* Zcaron */
  85. "micro", "para", "middot", NULL, /* zcaron */ "sup1", "ordm",
  86. "raquo", "OElig", "oelig", "Yuml", "iquest", "Agrave", "Aacute",
  87. "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
  88. "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
  89. "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
  90. "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
  91. "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
  92. "atilde", "auml", "aring", "aelig", "ccedil", "egrave",
  93. "eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
  94. "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
  95. "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
  96. "uuml", "yacute", "thorn", "yuml"
  97. };
  98. static entity_table_t ent_uni_338_402[] = {
  99. /* 338 (0x0152) */
  100. "OElig", "oelig", NULL, NULL, NULL, NULL,
  101. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  102. /* 352 (0x0160) */
  103. "Scaron", "scaron", NULL, NULL, NULL, NULL, NULL, NULL,
  104. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  105. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  106. /* 376 (0x0178) */
  107. "Yuml", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  108. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  109. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  110. /* 400 (0x0190) */
  111. NULL, NULL, "fnof"
  112. };
  113. static entity_table_t ent_uni_spacing[] = {
  114. /* 710 */
  115. "circ",
  116. /* 711 - 730 */
  117. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  118. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  119. /* 731 - 732 */
  120. NULL, "tilde"
  121. };
  122. static entity_table_t ent_uni_greek[] = {
  123. /* 913 */
  124. "Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Zeta", "Eta", "Theta",
  125. "Iota", "Kappa", "Lambda", "Mu", "Nu", "Xi", "Omicron", "Pi", "Rho",
  126. NULL, "Sigma", "Tau", "Upsilon", "Phi", "Chi", "Psi", "Omega",
  127. /* 938 - 944 are not mapped */
  128. NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  129. "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta",
  130. "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho",
  131. "sigmaf", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega",
  132. /* 970 - 976 are not mapped */
  133. NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  134. "thetasym", "upsih",
  135. NULL, NULL, NULL,
  136. "piv"
  137. };
  138. static entity_table_t ent_uni_punct[] = {
  139. /* 8194 */
  140. "ensp", "emsp", NULL, NULL, NULL, NULL, NULL,
  141. "thinsp", NULL, NULL, "zwnj", "zwj", "lrm", "rlm",
  142. NULL, NULL, NULL, "ndash", "mdash", NULL, NULL, NULL,
  143. /* 8216 */
  144. "lsquo", "rsquo", "sbquo", NULL, "ldquo", "rdquo", "bdquo", NULL,
  145. "dagger", "Dagger", "bull", NULL, NULL, NULL, "hellip",
  146. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "permil", NULL,
  147. /* 8242 */
  148. "prime", "Prime", NULL, NULL, NULL, NULL, NULL, "lsaquo", "rsaquo", NULL,
  149. NULL, NULL, "oline", NULL, NULL, NULL, NULL, NULL,
  150. "frasl"
  151. };
  152. static entity_table_t ent_uni_euro[] = {
  153. "euro"
  154. };
  155. static entity_table_t ent_uni_8465_8501[] = {
  156. /* 8465 */
  157. "image", NULL, NULL, NULL, NULL, NULL, NULL,
  158. /* 8472 */
  159. "weierp", NULL, NULL, NULL,
  160. /* 8476 */
  161. "real", NULL, NULL, NULL, NULL, NULL,
  162. /* 8482 */
  163. "trade", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  164. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  165. /* 8501 */
  166. "alefsym",
  167. };
  168. static entity_table_t ent_uni_8592_9002[] = {
  169. /* 8592 (0x2190) */
  170. "larr", "uarr", "rarr", "darr", "harr", NULL, NULL, NULL,
  171. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  172. /* 8608 (0x21a0) */
  173. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  174. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  175. /* 8624 (0x21b0) */
  176. NULL, NULL, NULL, NULL, NULL, "crarr", NULL, NULL,
  177. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  178. /* 8640 (0x21c0) */
  179. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  180. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  181. /* 8656 (0x21d0) */
  182. "lArr", "uArr", "rArr", "dArr", "hArr", "vArr", NULL, NULL,
  183. NULL, NULL, "lAarr", "rAarr", NULL, "rarrw", NULL, NULL,
  184. /* 8672 (0x21e0) */
  185. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  186. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  187. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  188. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  189. /* 8704 (0x2200) */
  190. "forall", "comp", "part", "exist", "nexist", "empty", NULL, "nabla",
  191. "isin", "notin", "epsis", "ni", "notni", "bepsi", NULL, "prod",
  192. /* 8720 (0x2210) */
  193. "coprod", "sum", "minus", "mnplus", "plusdo", NULL, "setmn", "lowast",
  194. "compfn", NULL, "radic", NULL, NULL, "prop", "infin", "ang90",
  195. /* 8736 (0x2220) */
  196. "ang", "angmsd", "angsph", "mid", "nmid", "par", "npar", "and",
  197. "or", "cap", "cup", "int", NULL, NULL, "conint", NULL,
  198. /* 8752 (0x2230) */
  199. NULL, NULL, NULL, NULL, "there4", "becaus", NULL, NULL,
  200. NULL, NULL, NULL, NULL, "sim", "bsim", NULL, NULL,
  201. /* 8768 (0x2240) */
  202. "wreath", "nsim", NULL, "sime", "nsime", "cong", NULL, "ncong",
  203. "asymp", "nap", "ape", NULL, "bcong", "asymp", "bump", "bumpe",
  204. /* 8784 (0x2250) */
  205. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  206. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  207. /* 8800 (0x2260) */
  208. "ne", "equiv", NULL, NULL, "le", "ge", "lE", "gE",
  209. "lnE", "gnE", "Lt", "Gt", "twixt", NULL, "nlt", "ngt",
  210. /* 8816 (0x2270) */
  211. "nles", "nges", "lsim", "gsim", NULL, NULL, "lg", "gl",
  212. NULL, NULL, "pr", "sc", "cupre", "sscue", "prsim", "scsim",
  213. /* 8832 (0x2280) */
  214. "npr", "nsc", "sub", "sup", "nsub", "nsup", "sube", "supe",
  215. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  216. /* 8848 (0x2290) */
  217. NULL, NULL, NULL, NULL, NULL, "oplus", NULL, "otimes",
  218. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  219. /* 8864 (0x22a0) */
  220. NULL, NULL, NULL, NULL, NULL, "perp", NULL, NULL,
  221. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  222. /* 8880 (0x22b0) */
  223. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  224. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  225. /* 8896 (0x22c0) */
  226. NULL, NULL, NULL, NULL, NULL, "sdot", NULL, NULL,
  227. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  228. /* 8912 (0x22d0) */
  229. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  230. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  231. /* 8928 (0x22e0) */
  232. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  233. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  234. /* 8944 (0x22f0) */
  235. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  236. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  237. /* 8960 (0x2300) */
  238. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  239. "lceil", "rceil", "lfloor", "rfloor", NULL, NULL, NULL, NULL,
  240. /* 8976 (0x2310) */
  241. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  242. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  243. /* 8992 (0x2320) */
  244. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  245. NULL, "lang", "rang"
  246. };
  247. static entity_table_t ent_uni_9674[] = {
  248. /* 9674 */
  249. "loz"
  250. };
  251. static entity_table_t ent_uni_9824_9830[] = {
  252. /* 9824 */
  253. "spades", NULL, NULL, "clubs", NULL, "hearts", "diams"
  254. };
  255. static entity_table_t ent_koi8r[] = {
  256. "#1105", /* "jo "*/
  257. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  258. NULL, NULL, NULL, NULL, NULL, "#1025", /* "JO" */
  259. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  260. "#1102", "#1072", "#1073", "#1094", "#1076", "#1077", "#1092",
  261. "#1075", "#1093", "#1080", "#1081", "#1082", "#1083", "#1084",
  262. "#1085", "#1086", "#1087", "#1103", "#1088", "#1089", "#1090",
  263. "#1091", "#1078", "#1074", "#1100", "#1099", "#1079", "#1096",
  264. "#1101", "#1097", "#1095", "#1098", "#1070", "#1040", "#1041",
  265. "#1062", "#1044", "#1045", "#1060", "#1043", "#1061", "#1048",
  266. "#1049", "#1050", "#1051", "#1052", "#1053", "#1054", "#1055",
  267. "#1071", "#1056", "#1057", "#1058", "#1059", "#1046", "#1042",
  268. "#1068", "#1067", "#1047", "#1064", "#1069", "#1065", "#1063",
  269. "#1066"
  270. };
  271. static entity_table_t ent_cp_1251[] = {
  272. "#1026", "#1027", "#8218", "#1107", "#8222", "hellip", "dagger",
  273. "Dagger", "euro", "permil", "#1033", "#8249", "#1034", "#1036",
  274. "#1035", "#1039", "#1106", "#8216", "#8217", "#8219", "#8220",
  275. "bull", "ndash", "mdash", NULL, "trade", "#1113", "#8250",
  276. "#1114", "#1116", "#1115", "#1119", "nbsp", "#1038", "#1118",
  277. "#1032", "curren", "#1168", "brvbar", "sect", "#1025", "copy",
  278. "#1028", "laquo", "not", "shy", "reg", "#1031", "deg", "plusmn",
  279. "#1030", "#1110", "#1169", "micro", "para", "middot", "#1105",
  280. "#8470", "#1108", "raquo", "#1112", "#1029", "#1109", "#1111",
  281. "#1040", "#1041", "#1042", "#1043", "#1044", "#1045", "#1046",
  282. "#1047", "#1048", "#1049", "#1050", "#1051", "#1052", "#1053",
  283. "#1054", "#1055", "#1056", "#1057", "#1058", "#1059", "#1060",
  284. "#1061", "#1062", "#1063", "#1064", "#1065", "#1066", "#1067",
  285. "#1068", "#1069", "#1070", "#1071", "#1072", "#1073", "#1074",
  286. "#1075", "#1076", "#1077", "#1078", "#1079", "#1080", "#1081",
  287. "#1082", "#1083", "#1084", "#1085", "#1086", "#1087", "#1088",
  288. "#1089", "#1090", "#1091", "#1092", "#1093", "#1094", "#1095",
  289. "#1096", "#1097", "#1098", "#1099", "#1100", "#1101", "#1102",
  290. "#1103"
  291. };
  292. static entity_table_t ent_iso_8859_5[] = {
  293. "#1056", "#1057", "#1058", "#1059", "#1060", "#1061", "#1062",
  294. "#1063", "#1064", "#1065", "#1066", "#1067", "#1068", "#1069",
  295. "#1070", "#1071", "#1072", "#1073", "#1074", "#1075", "#1076",
  296. "#1077", "#1078", "#1079", "#1080", "#1081", "#1082", "#1083",
  297. "#1084", "#1085", "#1086", "#1087", "#1088", "#1089", "#1090",
  298. "#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097",
  299. "#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#1104",
  300. "#1105", "#1106", "#1107", "#1108", "#1109", "#1110", "#1111",
  301. "#1112", "#1113", "#1114", "#1115", "#1116", "#1117", "#1118",
  302. "#1119"
  303. };
  304. static entity_table_t ent_cp_866[] = {
  305. "#9492", "#9524", "#9516", "#9500", "#9472", "#9532", "#9566",
  306. "#9567", "#9562", "#9556", "#9577", "#9574", "#9568", "#9552",
  307. "#9580", "#9575", "#9576", "#9572", "#9573", "#9561", "#9560",
  308. "#9554", "#9555", "#9579", "#9578", "#9496", "#9484", "#9608",
  309. "#9604", "#9612", "#9616", "#9600", "#1088", "#1089", "#1090",
  310. "#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097",
  311. "#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#1025",
  312. "#1105", "#1028", "#1108", "#1031", "#1111", "#1038", "#1118",
  313. "#176", "#8729", "#183", "#8730", "#8470", "#164", "#9632",
  314. "#160"
  315. };
  316. /* MacRoman has a couple of low-ascii chars that need mapping too */
  317. /* Vertical tab (ASCII 11) is often used to store line breaks inside */
  318. /* DB exports, this mapping changes it to a space */
  319. static entity_table_t ent_macroman[] = {
  320. "sp", NULL, NULL, NULL,
  321. NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  322. NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  323. NULL, NULL, NULL, NULL, NULL, "quot", NULL,
  324. NULL, NULL, "amp", NULL, NULL, NULL, NULL,
  325. NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  326. NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  327. NULL, NULL, NULL, "lt", NULL, "gt", NULL,
  328. NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  329. NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  330. NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  331. NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  332. NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  333. NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  334. NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  335. NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  336. NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  337. NULL, "Auml", "Aring", "Ccedil", "Eacute", "Ntilde", "Ouml",
  338. "Uuml", "aacute", "agrave", "acirc", "auml", "atilde", "aring",
  339. "ccedil", "eacute", "egrave", "ecirc", "euml", "iacute", "igrave",
  340. "icirc", "iuml", "ntilde", "oacute", "ograve", "ocirc", "ouml",
  341. "otilde", "uacute", "ugrave", "ucirc", "uuml", "dagger", "deg",
  342. "cent", "pound", "sect", "bull", "para", "szlig", "reg",
  343. "copy", "trade", "acute", "uml", "ne", "AElig", "Oslash",
  344. "infin", "plusmn", "le", "ge", "yen", "micro", "part",
  345. "sum", "prod", "pi", "int", "ordf", "ordm", "Omega",
  346. "aelig", "oslash", "iquest", "iexcl", "not", "radic", "fnof",
  347. "asymp", "#8710", "laquo", "raquo", "hellip", "nbsp", "Agrave",
  348. "Atilde", "Otilde", "OElig", "oelig", "ndash", "mdash", "ldquo",
  349. "rdquo", "lsquo", "rsquo", "divide", "loz", "yuml", "Yuml",
  350. "frasl", "euro", "lsaquo", "rsaquo", "#xFB01", "#xFB02", "Dagger",
  351. "middot", "sbquo", "bdquo", "permil", "Acirc", "Ecirc", "Aacute",
  352. "Euml", "Egrave", "Iacute", "Icirc", "Iuml", "Igrave", "Oacute",
  353. "Ocirc", "#xF8FF", "Ograve", "Uacute", "Ucirc", "Ugrave", "#305",
  354. "circ", "tilde", "macr", "#728", "#729", "#730", "cedil",
  355. "#733", "#731", "#711"
  356. };
  357. struct html_entity_map {
  358. enum entity_charset charset; /* charset identifier */
  359. unsigned short basechar; /* char code at start of table */
  360. unsigned short endchar; /* last char code in the table */
  361. entity_table_t *table; /* the table of mappings */
  362. };
  363. static const struct html_entity_map entity_map[] = {
  364. { cs_cp1252, 0x80, 0x9f, ent_cp_1252 },
  365. { cs_cp1252, 0xa0, 0xff, ent_iso_8859_1 },
  366. { cs_8859_1, 0xa0, 0xff, ent_iso_8859_1 },
  367. { cs_8859_15, 0xa0, 0xff, ent_iso_8859_15 },
  368. { cs_utf_8, 0xa0, 0xff, ent_iso_8859_1 },
  369. { cs_utf_8, 338, 402, ent_uni_338_402 },
  370. { cs_utf_8, 710, 732, ent_uni_spacing },
  371. { cs_utf_8, 913, 982, ent_uni_greek },
  372. { cs_utf_8, 8194, 8260, ent_uni_punct },
  373. { cs_utf_8, 8364, 8364, ent_uni_euro },
  374. { cs_utf_8, 8465, 8501, ent_uni_8465_8501 },
  375. { cs_utf_8, 8592, 9002, ent_uni_8592_9002 },
  376. { cs_utf_8, 9674, 9674, ent_uni_9674 },
  377. { cs_utf_8, 9824, 9830, ent_uni_9824_9830 },
  378. { cs_big5, 0xa0, 0xff, ent_iso_8859_1 },
  379. { cs_gb2312, 0xa0, 0xff, ent_iso_8859_1 },
  380. { cs_big5hkscs, 0xa0, 0xff, ent_iso_8859_1 },
  381. { cs_sjis, 0xa0, 0xff, ent_iso_8859_1 },
  382. { cs_eucjp, 0xa0, 0xff, ent_iso_8859_1 },
  383. { cs_koi8r, 0xa3, 0xff, ent_koi8r },
  384. { cs_cp1251, 0x80, 0xff, ent_cp_1251 },
  385. { cs_8859_5, 0xc0, 0xff, ent_iso_8859_5 },
  386. { cs_cp866, 0xc0, 0xff, ent_cp_866 },
  387. { cs_macroman, 0x0b, 0xff, ent_macroman },
  388. { cs_terminator }
  389. };
  390. static const struct {
  391. const char *codeset;
  392. enum entity_charset charset;
  393. } charset_map[] = {
  394. { "ISO-8859-1", cs_8859_1 },
  395. { "ISO8859-1", cs_8859_1 },
  396. { "ISO-8859-15", cs_8859_15 },
  397. { "ISO8859-15", cs_8859_15 },
  398. { "utf-8", cs_utf_8 },
  399. { "cp1252", cs_cp1252 },
  400. { "Windows-1252", cs_cp1252 },
  401. { "1252", cs_cp1252 },
  402. { "BIG5", cs_big5 },
  403. { "950", cs_big5 },
  404. { "GB2312", cs_gb2312 },
  405. { "936", cs_gb2312 },
  406. { "BIG5-HKSCS", cs_big5hkscs },
  407. { "Shift_JIS", cs_sjis },
  408. { "SJIS", cs_sjis },
  409. { "932", cs_sjis },
  410. { "EUCJP", cs_eucjp },
  411. { "EUC-JP", cs_eucjp },
  412. { "KOI8-R", cs_koi8r },
  413. { "koi8-ru", cs_koi8r },
  414. { "koi8r", cs_koi8r },
  415. { "cp1251", cs_cp1251 },
  416. { "Windows-1251", cs_cp1251 },
  417. { "win-1251", cs_cp1251 },
  418. { "iso8859-5", cs_8859_5 },
  419. { "iso-8859-5", cs_8859_5 },
  420. { "cp866", cs_cp866 },
  421. { "866", cs_cp866 },
  422. { "ibm866", cs_cp866 },
  423. { "MacRoman", cs_macroman },
  424. { NULL }
  425. };
  426. static const struct {
  427. unsigned short charcode;
  428. char *entity;
  429. int entitylen;
  430. int flags;
  431. } basic_entities[] = {
  432. { '"', "&quot;", 6, ENT_HTML_QUOTE_DOUBLE },
  433. { '\'', "&#039;", 6, ENT_HTML_QUOTE_SINGLE },
  434. { '\'', "&#39;", 5, ENT_HTML_QUOTE_SINGLE },
  435. { '<', "&lt;", 4, 0 },
  436. { '>', "&gt;", 4, 0 },
  437. { 0, NULL, 0, 0 }
  438. };
  /* A character code paired with entity text held in a fixed 8-byte buffer
   * and that text's length. */
  struct basic_entities_dec {
      unsigned short charcode;    /* character code */
      char entity[8];             /* entity text */
      int entitylen;              /* length of the entity text */
  };
  /* Statement macros used by get_next_char(). They rely on that function's
   * locals and parameters (pos, newpos, mbseq, mbpos, mbspace, mbseqlen,
   * this_char, str_len, status) and can return from the enclosing function.
   *
   * Each one is wrapped in do { } while (0) so that it behaves as exactly one
   * statement and is followed by a semicolon at the point of use; this keeps
   * it safe inside an unbraced if/else. */

  /* Publish the read position, NUL-terminate the collected byte sequence,
   * report its length and return the decoded character. */
  #define MB_RETURN do { \
          *newpos = pos; \
          mbseq[mbpos] = '\0'; \
          *mbseqlen = mbpos; \
          return this_char; \
      } while (0)

  /* Append one byte to mbseq. One slot is always kept free for the NUL that
   * MB_RETURN writes: when the space is used up the function returns early
   * with what has been collected so far. */
  #define MB_WRITE(mbchar) do { \
          mbspace--; \
          if (mbspace == 0) { \
              MB_RETURN; \
          } \
          mbseq[mbpos++] = (mbchar); \
      } while (0)

  /* Fail (status FAILURE, return value 0) unless at least chars_need bytes
   * are left in the input at offset pos. */
  #define CHECK_LEN(pos, chars_need) do { \
          if ((str_len - (pos)) < (chars_need)) { \
              *newpos = (pos); \
              *status = FAILURE; \
              return 0; \
          } \
      } while (0)
  461. /* {{{ get_next_char
  462. */
  463. inline static unsigned short get_next_char(enum entity_charset charset,
  464. unsigned char * str,
  465. int str_len,
  466. int * newpos,
  467. unsigned char * mbseq,
  468. int * mbseqlen,
  469. int *status)
  470. {
  471. int pos = *newpos;
  472. int mbpos = 0;
  473. int mbspace = *mbseqlen;
  474. unsigned short this_char = str[pos++];
  475. unsigned char next_char;
  476. *status = SUCCESS;
  477. if (mbspace <= 0) {
  478. *mbseqlen = 0;
  479. return this_char;
  480. }
  481. MB_WRITE((unsigned char)this_char);
  482. switch (charset) {
  483. case cs_utf_8:
  484. {
  485. unsigned long utf = 0;
  486. int stat = 0;
  487. int more = 1;
  488. /* unpack utf-8 encoding into a wide char.
  489. * Code stolen from the mbstring extension */
  490. do {
  491. if (this_char < 0x80) {
  492. more = 0;
  493. if(stat) {
  494. /* we didn't finish the UTF sequence correctly */
  495. --pos;
  496. *status = FAILURE;
  497. }
  498. break;
  499. } else if (this_char < 0xc0) {
  500. switch (stat) {
  501. case 0x10: /* 2, 2nd */
  502. case 0x21: /* 3, 3rd */
  503. case 0x32: /* 4, 4th */
  504. case 0x43: /* 5, 5th */
  505. case 0x54: /* 6, 6th */
  506. /* last byte in sequence */
  507. more = 0;
  508. utf |= (this_char & 0x3f);
  509. this_char = (unsigned short)utf;
  510. break;
  511. case 0x20: /* 3, 2nd */
  512. case 0x31: /* 4, 3rd */
  513. case 0x42: /* 5, 4th */
  514. case 0x53: /* 6, 5th */
  515. /* penultimate char */
  516. utf |= ((this_char & 0x3f) << 6);
  517. stat++;
  518. break;
  519. case 0x30: /* 4, 2nd */
  520. case 0x41: /* 5, 3rd */
  521. case 0x52: /* 6, 4th */
  522. utf |= ((this_char & 0x3f) << 12);
  523. stat++;
  524. break;
  525. case 0x40: /* 5, 2nd */
  526. case 0x51:
  527. utf |= ((this_char & 0x3f) << 18);
  528. stat++;
  529. break;
  530. case 0x50: /* 6, 2nd */
  531. utf |= ((this_char & 0x3f) << 24);
  532. stat++;
  533. break;
  534. default:
  535. /* invalid */
  536. *status = FAILURE;
  537. more = 0;
  538. }
  539. }
  540. /* lead byte */
  541. else if (this_char < 0xe0) {
  542. stat = 0x10; /* 2 byte */
  543. utf = (this_char & 0x1f) << 6;
  544. CHECK_LEN(pos, 1);
  545. } else if (this_char < 0xf0) {
  546. stat = 0x20; /* 3 byte */
  547. utf = (this_char & 0xf) << 12;
  548. CHECK_LEN(pos, 2);
  549. } else if (this_char < 0xf8) {
  550. stat = 0x30; /* 4 byte */
  551. utf = (this_char & 0x7) << 18;
  552. CHECK_LEN(pos, 3);
  553. } else if (this_char < 0xfc) {
  554. stat = 0x40; /* 5 byte */
  555. utf = (this_char & 0x3) << 24;
  556. CHECK_LEN(pos, 4);
  557. } else if (this_char < 0xfe) {
  558. stat = 0x50; /* 6 byte */
  559. utf = (this_char & 0x1) << 30;
  560. CHECK_LEN(pos, 5);
  561. } else {
  562. /* invalid; bail */
  563. more = 0;
  564. *status = FAILURE;
  565. break;
  566. }
  567. if (more) {
  568. this_char = str[pos++];
  569. MB_WRITE((unsigned char)this_char);
  570. }
  571. } while (more);
  572. }
  573. break;
  574. case cs_big5:
  575. case cs_gb2312:
  576. case cs_big5hkscs:
  577. {
  578. /* check if this is the first of a 2-byte sequence */
  579. if (this_char >= 0xa1 && this_char <= 0xfe) {
  580. /* peek at the next char */
  581. CHECK_LEN(pos, 1);
  582. next_char = str[pos];
  583. if ((next_char >= 0x40 && next_char <= 0x7e) ||
  584. (next_char >= 0xa1 && next_char <= 0xfe)) {
  585. /* yes, this a wide char */
  586. this_char <<= 8;
  587. MB_WRITE(next_char);
  588. this_char |= next_char;
  589. pos++;
  590. }
  591. }
  592. break;
  593. }
  594. case cs_sjis:
  595. {
  596. /* check if this is the first of a 2-byte sequence */
  597. if ( (this_char >= 0x81 && this_char <= 0x9f) ||
  598. (this_char >= 0xe0 && this_char <= 0xef)
  599. ) {
  600. /* peek at the next char */
  601. CHECK_LEN(pos, 1);
  602. next_char = str[pos];
  603. if ((next_char >= 0x40 && next_char <= 0x7e) ||
  604. (next_char >= 0x80 && next_char <= 0xfc))
  605. {
  606. /* yes, this a wide char */
  607. this_char <<= 8;
  608. MB_WRITE(next_char);
  609. this_char |= next_char;
  610. pos++;
  611. }
  612. }
  613. break;
  614. }
  615. case cs_eucjp:
  616. {
  617. /* check if this is the first of a multi-byte sequence */
  618. if (this_char >= 0xa1 && this_char <= 0xfe) {
  619. /* peek at the next char */
  620. CHECK_LEN(pos, 1);
  621. next_char = str[pos];
  622. if (next_char >= 0xa1 && next_char <= 0xfe) {
  623. /* yes, this a jis kanji char */
  624. this_char <<= 8;
  625. MB_WRITE(next_char);
  626. this_char |= next_char;
  627. pos++;
  628. }
  629. } else if (this_char == 0x8e) {
  630. /* peek at the next char */
  631. CHECK_LEN(pos, 1);
  632. next_char = str[pos];
  633. if (next_char >= 0xa1 && next_char <= 0xdf) {
  634. /* JIS X 0201 kana */
  635. this_char <<= 8;
  636. MB_WRITE(next_char);
  637. this_char |= next_char;
  638. pos++;
  639. }
  640. } else if (this_char == 0x8f) {
  641. /* peek at the next two char */
  642. unsigned char next2_char;
  643. CHECK_LEN(pos, 2);
  644. next_char = str[pos];
  645. next2_char = str[pos+1];
  646. if ((next_char >= 0xa1 && next_char <= 0xfe) &&
  647. (next2_char >= 0xa1 && next2_char <= 0xfe)) {
  648. /* JIS X 0212 hojo-kanji */
  649. this_char <<= 8;
  650. MB_WRITE(next_char);
  651. this_char |= next_char;
  652. pos++;
  653. this_char <<= 8;
  654. MB_WRITE(next2_char);
  655. this_char |= next2_char;
  656. pos++;
  657. }
  658. }
  659. break;
  660. }
  661. default:
  662. break;
  663. }
  664. MB_RETURN;
  665. }
  666. /* }}} */
  667. /* {{{ entity_charset determine_charset
  668. * returns the charset identifier based on current locale or a hint.
  669. * defaults to iso-8859-1 */
  670. static enum entity_charset determine_charset(char *charset_hint TSRMLS_DC)
  671. {
  672. int i;
  673. enum entity_charset charset = cs_8859_1;
  674. int len = 0;
  675. zval *uf_result = NULL;
  676. /* Guarantee default behaviour for backwards compatibility */
  677. if (charset_hint == NULL)
  678. return cs_8859_1;
  679. if ((len = strlen(charset_hint)) != 0) {
  680. goto det_charset;
  681. }
  682. #if HAVE_MBSTRING
  683. #if !defined(COMPILE_DL_MBSTRING)
  684. /* XXX: Ugly things. Why don't we look for a more sophisticated way? */
  685. switch (MBSTRG(current_internal_encoding)) {
  686. case mbfl_no_encoding_8859_1:
  687. return cs_8859_1;
  688. case mbfl_no_encoding_utf8:
  689. return cs_utf_8;
  690. case mbfl_no_encoding_euc_jp:
  691. case mbfl_no_encoding_eucjp_win:
  692. return cs_eucjp;
  693. case mbfl_no_encoding_sjis:
  694. case mbfl_no_encoding_sjis_win:
  695. case mbfl_no_encoding_sjis_mac:
  696. return cs_sjis;
  697. case mbfl_no_encoding_cp1252:
  698. return cs_cp1252;
  699. case mbfl_no_encoding_8859_15:
  700. return cs_8859_15;
  701. case mbfl_no_encoding_big5:
  702. return cs_big5;
  703. case mbfl_no_encoding_euc_cn:
  704. case mbfl_no_encoding_hz:
  705. case mbfl_no_encoding_cp936:
  706. return cs_gb2312;
  707. case mbfl_no_encoding_koi8r:
  708. return cs_koi8r;
  709. case mbfl_no_encoding_cp866:
  710. return cs_cp866;
  711. case mbfl_no_encoding_cp1251:
  712. return cs_cp1251;
  713. case mbfl_no_encoding_8859_5:
  714. return cs_8859_5;
  715. default:
  716. ;
  717. }
  718. #else
  719. {
  720. zval nm_mb_internal_encoding;
  721. ZVAL_STRING(&nm_mb_internal_encoding, "mb_internal_encoding", 0);
  722. if (call_user_function_ex(CG(function_table), NULL, &nm_mb_internal_encoding, &uf_result, 0, NULL, 1, NULL TSRMLS_CC) != FAILURE) {
  723. charset_hint = Z_STRVAL_P(uf_result);
  724. len = Z_STRLEN_P(uf_result);
  725. if (len == 4) { /* sizeof(none|auto|pass)-1 */
  726. if (!memcmp("pass", charset_hint, sizeof("pass") - 1) ||
  727. !memcmp("auto", charset_hint, sizeof("auto") - 1) ||
  728. !memcmp("none", charset_hint, sizeof("none") - 1)) {
  729. charset_hint = NULL;
  730. len = 0;
  731. }
  732. }
  733. goto det_charset;
  734. }
  735. }
  736. #endif
  737. #endif
  738. charset_hint = SG(default_charset);
  739. if (charset_hint != NULL && (len=strlen(charset_hint)) != 0) {
  740. goto det_charset;
  741. }
  742. /* try to detect the charset for the locale */
  743. #if HAVE_NL_LANGINFO && HAVE_LOCALE_H && defined(CODESET)
  744. charset_hint = nl_langinfo(CODESET);
  745. if (charset_hint != NULL && (len=strlen(charset_hint)) != 0) {
  746. goto det_charset;
  747. }
  748. #endif
  749. #if HAVE_LOCALE_H
  750. /* try to figure out the charset from the locale */
  751. {
  752. char *localename;
  753. char *dot, *at;
  754. /* lang[_territory][.codeset][@modifier] */
  755. localename = setlocale(LC_CTYPE, NULL);
  756. dot = strchr(localename, '.');
  757. if (dot) {
  758. dot++;
  759. /* locale specifies a codeset */
  760. at = strchr(dot, '@');
  761. if (at)
  762. len = at - dot;
  763. else
  764. len = strlen(dot);
  765. charset_hint = dot;
  766. } else {
  767. /* no explicit name; see if the name itself
  768. * is the charset */
  769. charset_hint = localename;
  770. len = strlen(charset_hint);
  771. }
  772. }
  773. #endif
  774. det_charset:
  775. if (charset_hint) {
  776. int found = 0;
  777. /* now walk the charset map and look for the codeset */
  778. for (i = 0; charset_map[i].codeset; i++) {
  779. if (len == strlen(charset_map[i].codeset) && strncasecmp(charset_hint, charset_map[i].codeset, len) == 0) {
  780. charset = charset_map[i].charset;
  781. found = 1;
  782. break;
  783. }
  784. }
  785. if (!found) {
  786. php_error_docref(NULL TSRMLS_CC, E_WARNING, "charset `%s' not supported, assuming iso-8859-1",
  787. charset_hint);
  788. }
  789. }
  790. if (uf_result != NULL) {
  791. zval_ptr_dtor(&uf_result);
  792. }
  793. return charset;
  794. }
  795. /* }}} */
  796. /* {{{ php_utf32_utf8 */
  /*
   * Encode the code point `k` as a UTF-8 style byte sequence into `buf`.
   *
   * Values below 0x80 (the comparison is signed, so this includes negative
   * values) are stored as a single byte.  Larger values use the 2..6 byte
   * forms, chosen by magnitude.  A NUL byte is always written after the
   * sequence, so `buf` must have room for at least 7 bytes.
   *
   * Returns the number of bytes written, not counting the trailing NUL.
   */
  size_t php_utf32_utf8(unsigned char *buf, int k)
  {
      size_t nbytes;
      size_t pos;
      int lead;

      if (k < 0x80) {
          /* single byte form: the value is stored as-is */
          buf[0] = k;
          buf[1] = '\0';
          return 1;
      }

      /* pick the sequence length and the marker bits of the lead byte */
      if (k < 0x800) {
          nbytes = 2;
          lead = 0xc0;
      } else if (k < 0x10000) {
          nbytes = 3;
          lead = 0xe0;
      } else if (k < 0x200000) {
          nbytes = 4;
          lead = 0xf0;
      } else if (k < 0x4000000) {
          nbytes = 5;
          lead = 0xf8;
      } else {
          nbytes = 6;
          lead = 0xfc;
      }

      /* lead byte carries the topmost bits, each trailing byte 6 more */
      buf[0] = lead | (k >> (6 * (int)(nbytes - 1)));
      for (pos = 1; pos < nbytes; pos++) {
          buf[pos] = 0x80 | ((k >> (6 * (int)(nbytes - 1 - pos))) & 0x3f);
      }

      buf[nbytes] = '\0';
      return nbytes;
  }
  837. /* }}} */
  838. /* {{{ php_unescape_html_entities
  839. */
  840. PHPAPI char *php_unescape_html_entities(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char *hint_charset TSRMLS_DC)
  841. {
  842. int retlen;
  843. int j, k;
  844. char *replaced, *ret, *p, *q, *lim, *next;
  845. enum entity_charset charset = determine_charset(hint_charset TSRMLS_CC);
  846. unsigned char replacement[15];
  847. int replacement_len;
  848. ret = estrndup(old, oldlen);
  849. retlen = oldlen;
  850. if (!retlen) {
  851. goto empty_source;
  852. }
  853. if (all) {
  854. /* look for a match in the maps for this charset */
  855. for (j = 0; entity_map[j].charset != cs_terminator; j++) {
  856. if (entity_map[j].charset != charset)
  857. continue;
  858. for (k = entity_map[j].basechar; k <= entity_map[j].endchar; k++) {
  859. unsigned char entity[32];
  860. int entity_length = 0;
  861. if (entity_map[j].table[k - entity_map[j].basechar] == NULL)
  862. continue;
  863. entity_length = slprintf(entity, sizeof(entity), "&%s;", entity_map[j].table[k - entity_map[j].basechar]);
  864. if (entity_length >= sizeof(entity)) {
  865. continue;
  866. }
  867. /* When we have MBCS entities in the tables above, this will need to handle it */
  868. replacement_len = 0;
  869. switch (charset) {
  870. case cs_8859_1:
  871. case cs_cp1252:
  872. case cs_8859_15:
  873. case cs_cp1251:
  874. case cs_8859_5:
  875. case cs_cp866:
  876. case cs_koi8r:
  877. replacement[0] = k;
  878. replacement[1] = '\0';
  879. replacement_len = 1;
  880. break;
  881. case cs_big5:
  882. case cs_gb2312:
  883. case cs_big5hkscs:
  884. case cs_sjis:
  885. case cs_eucjp:
  886. /* we cannot properly handle those multibyte encodings
  887. * with php_str_to_str. skip it. */
  888. continue;
  889. case cs_utf_8:
  890. replacement_len = php_utf32_utf8(replacement, k);
  891. break;
  892. default:
  893. php_error_docref(NULL TSRMLS_CC, E_WARNING, "cannot yet handle MBCS!");
  894. efree(ret);
  895. return NULL;
  896. }
  897. if (php_memnstr(ret, entity, entity_length, ret+retlen)) {
  898. replaced = php_str_to_str(ret, retlen, entity, entity_length, replacement, replacement_len, &retlen);
  899. efree(ret);
  900. ret = replaced;
  901. }
  902. }
  903. }
  904. }
  905. for (j = 0; basic_entities[j].charcode != 0; j++) {
  906. if (basic_entities[j].flags && (quote_style & basic_entities[j].flags) == 0)
  907. continue;
  908. replacement[0] = (unsigned char)basic_entities[j].charcode;
  909. replacement[1] = '\0';
  910. if (php_memnstr(ret, basic_entities[j].entity, basic_entities[j].entitylen, ret+retlen)) {
  911. replaced = php_str_to_str(ret, retlen, basic_entities[j].entity, basic_entities[j].entitylen, replacement, 1, &retlen);
  912. efree(ret);
  913. ret = replaced;
  914. }
  915. }
  916. /* replace numeric entities & "&amp;" */
  917. lim = ret + retlen;
  918. for (p = ret, q = ret; p < lim;) {
  919. int code;
  920. if (p[0] == '&') {
  921. if (p + 2 < lim) {
  922. if (p[1] == '#') {
  923. int invalid_code = 0;
  924. if (p[2] == 'x' || p[2] == 'X') {
  925. code = strtol(p + 3, &next, 16);
  926. } else {
  927. code = strtol(p + 2, &next, 10);
  928. }
  929. if (next != NULL && *next == ';') {
  930. switch (charset) {
  931. case cs_utf_8:
  932. q += php_utf32_utf8(q, code);
  933. break;
  934. case cs_8859_1:
  935. case cs_8859_5:
  936. case cs_8859_15:
  937. if ((code >= 0x80 && code < 0xa0) || code > 0xff) {
  938. invalid_code = 1;
  939. } else {
  940. if (code == 39 || !quote_style) {
  941. invalid_code = 1;
  942. } else {
  943. *(q++) = code;
  944. }
  945. }
  946. break;
  947. case cs_cp1252:
  948. if (code > 0xff) {
  949. invalid_code = 1;
  950. } else {
  951. *(q++) = code;
  952. }
  953. break;
  954. case cs_cp1251:
  955. case cs_cp866:
  956. case cs_big5:
  957. case cs_big5hkscs:
  958. case cs_sjis:
  959. case cs_eucjp:
  960. if (code >= 0x80) {
  961. invalid_code = 1;
  962. } else {
  963. *(q++) = code;
  964. }
  965. break;
  966. case cs_gb2312:
  967. if (code >= 0x81) {
  968. invalid_code = 1;
  969. } else {
  970. *(q++) = code;
  971. }
  972. break;
  973. default:
  974. /* for backwards compatilibity */
  975. invalid_code = 1;
  976. break;
  977. }
  978. if (invalid_code) {
  979. for (; p <= next; p++) {
  980. *(q++) = *p;
  981. }
  982. }
  983. p = next + 1;
  984. } else {
  985. *(q++) = *(p++);
  986. *(q++) = *(p++);
  987. }
  988. } else if (p + 4 < lim &&
  989. p[1] == 'a' && p[2] == 'm' &&p[3] == 'p' &&
  990. p[4] == ';') {
  991. *(q++) = '&';
  992. p += 5;
  993. } else {
  994. *(q++) = *(p++);
  995. *(q++) = *(p++);
  996. }
  997. } else {
  998. *(q++) = *(p++);
  999. }
  1000. } else {
  1001. *(q++) = *(p++);
  1002. }
  1003. }
  1004. *q = '\0';
  1005. retlen = (size_t)(q - ret);
  1006. empty_source:
  1007. *newlen = retlen;
  1008. return ret;
  1009. }
  1010. /* }}} */
  1011. PHPAPI char *php_escape_html_entities(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char *hint_charset TSRMLS_DC)
  1012. {
  1013. return php_escape_html_entities_ex(old, oldlen, newlen, all, quote_style, hint_charset, 1 TSRMLS_CC);
  1014. }
  1015. /* {{{ php_escape_html_entities
  1016. */
  1017. PHPAPI char *php_escape_html_entities_ex(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char *hint_charset, zend_bool double_encode TSRMLS_DC)
  1018. {
  1019. int i, j, maxlen, len;
  1020. char *replaced;
  1021. enum entity_charset charset = determine_charset(hint_charset TSRMLS_CC);
  1022. int matches_map;
  1023. maxlen = 2 * oldlen;
  1024. if (maxlen < 128)
  1025. maxlen = 128;
  1026. replaced = emalloc (maxlen);
  1027. len = 0;
  1028. i = 0;
  1029. while (i < oldlen) {
  1030. unsigned char mbsequence[16]; /* allow up to 15 characters in a multibyte sequence */
  1031. int mbseqlen = sizeof(mbsequence);
  1032. int status = SUCCESS;
  1033. unsigned short this_char = get_next_char(charset, old, oldlen, &i, mbsequence, &mbseqlen, &status);
  1034. if(status == FAILURE) {
  1035. /* invalid MB sequence */
  1036. if (quote_style & ENT_HTML_IGNORE_ERRORS) {
  1037. continue;
  1038. }
  1039. efree(replaced);
  1040. if(!PG(display_errors)) {
  1041. php_error_docref(NULL TSRMLS_CC, E_WARNING, "Invalid multibyte sequence in argument");
  1042. }
  1043. *newlen = 0;
  1044. return STR_EMPTY_ALLOC();
  1045. }
  1046. matches_map = 0;
  1047. if (len + 16 > maxlen)
  1048. replaced = erealloc (replaced, maxlen += 128);
  1049. if (all) {
  1050. /* look for a match in the maps for this charset */
  1051. unsigned char *rep = NULL;
  1052. for (j = 0; entity_map[j].charset != cs_terminator; j++) {
  1053. if (entity_map[j].charset == charset
  1054. && this_char >= entity_map[j].basechar
  1055. && this_char <= entity_map[j].endchar) {
  1056. rep = (unsigned char*)entity_map[j].table[this_char - entity_map[j].basechar];
  1057. if (rep == NULL) {
  1058. /* there is no entity for this position; fall through and
  1059. * just output the character itself */
  1060. break;
  1061. }
  1062. matches_map = 1;
  1063. break;
  1064. }
  1065. }
  1066. if (matches_map) {
  1067. int l = strlen(rep);
  1068. /* increase the buffer size */
  1069. if (len + 2 + l >= maxlen) {
  1070. replaced = erealloc(replaced, maxlen += 128);
  1071. }
  1072. replaced[len++] = '&';
  1073. strlcpy(replaced + len, rep, maxlen);
  1074. len += l;
  1075. replaced[len++] = ';';
  1076. }
  1077. }
  1078. if (!matches_map) {
  1079. int is_basic = 0;
  1080. if (this_char == '&') {
  1081. if (double_encode) {
  1082. encode_amp:
  1083. memcpy(replaced + len, "&amp;", sizeof("&amp;") - 1);
  1084. len += sizeof("&amp;") - 1;
  1085. } else {
  1086. char *e = memchr(old + i, ';', oldlen - i);
  1087. char *s = old + i;
  1088. if (!e || (e - s) > 10) { /* minor optimization to avoid "entities" over 10 chars in length */
  1089. goto encode_amp;
  1090. } else {
  1091. if (*s == '#') { /* numeric entities */
  1092. s++;
  1093. /* Hex (&#x5A;) */
  1094. if (*s == 'x' || *s == 'X') {
  1095. s++;
  1096. while (s < e) {
  1097. if (!isxdigit((int)*(unsigned char *)s++)) {
  1098. goto encode_amp;
  1099. }
  1100. }
  1101. /* Dec (&#90;)*/
  1102. } else {
  1103. while (s < e) {
  1104. if (!isdigit((int)*(unsigned char *)s++)) {
  1105. goto encode_amp;
  1106. }
  1107. }
  1108. }
  1109. } else { /* text entities */
  1110. while (s < e) {
  1111. if (!isalnum((int)*(unsigned char *)s++)) {
  1112. goto encode_amp;
  1113. }
  1114. }
  1115. }
  1116. replaced[len++] = '&';
  1117. }
  1118. }
  1119. is_basic = 1;
  1120. } else {
  1121. for (j = 0; basic_entities[j].charcode != 0; j++) {
  1122. if ((basic_entities[j].charcode != this_char) ||
  1123. (basic_entities[j].flags &&
  1124. (quote_style & basic_entities[j].flags) == 0)) {
  1125. continue;
  1126. }
  1127. memcpy(replaced + len, basic_entities[j].entity, basic_entities[j].entitylen);
  1128. len += basic_entities[j].entitylen;
  1129. is_basic = 1;
  1130. break;
  1131. }
  1132. }
  1133. if (!is_basic) {
  1134. /* a wide char without a named entity; pass through the original sequence */
  1135. if (mbseqlen > 1) {
  1136. memcpy(replaced + len, mbsequence, mbseqlen);
  1137. len += mbseqlen;
  1138. } else {
  1139. replaced[len++] = (unsigned char)this_char;
  1140. }
  1141. }
  1142. }
  1143. }
  1144. replaced[len] = '\0';
  1145. *newlen = len;
  1146. return replaced;
  1147. }
  1148. /* }}} */
  1149. /* {{{ php_html_entities
  1150. */
  1151. static void php_html_entities(INTERNAL_FUNCTION_PARAMETERS, int all)
  1152. {
  1153. char *str, *hint_charset = NULL;
  1154. int str_len, hint_charset_len = 0;
  1155. int len;
  1156. long quote_style = ENT_COMPAT;
  1157. char *replaced;
  1158. zend_bool double_encode = 1;
  1159. if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|ls!b", &str, &str_len, &quote_style, &hint_charset, &hint_charset_len, &double_encode) == FAILURE) {
  1160. return;
  1161. }
  1162. replaced = php_escape_html_entities_ex(str, str_len, &len, all, quote_style, hint_charset, double_encode TSRMLS_CC);
  1163. RETVAL_STRINGL(replaced, len, 0);
  1164. }
  1165. /* }}} */
  1166. #define HTML_SPECIALCHARS 0
  1167. #define HTML_ENTITIES 1
  1168. /* {{{ register_html_constants
  1169. */
  1170. void register_html_constants(INIT_FUNC_ARGS)
  1171. {
  1172. REGISTER_LONG_CONSTANT("HTML_SPECIALCHARS", HTML_SPECIALCHARS, CONST_PERSISTENT|CONST_CS);
  1173. REGISTER_LONG_CONSTANT("HTML_ENTITIES", HTML_ENTITIES, CONST_PERSISTENT|CONST_CS);
  1174. REGISTER_LONG_CONSTANT("ENT_COMPAT", ENT_COMPAT, CONST_PERSISTENT|CONST_CS);
  1175. REGISTER_LONG_CONSTANT("ENT_QUOTES", ENT_QUOTES, CONST_PERSISTENT|CONST_CS);
  1176. REGISTER_LONG_CONSTANT("ENT_NOQUOTES", ENT_NOQUOTES, CONST_PERSISTENT|CONST_CS);
  1177. REGISTER_LONG_CONSTANT("ENT_IGNORE", ENT_IGNORE, CONST_PERSISTENT|CONST_CS);
  1178. }
  1179. /* }}} */
  1180. /* {{{ proto string htmlspecialchars(string string [, int quote_style[, string charset[, bool double_encode]]])
  1181. Convert special characters to HTML entities */
  1182. PHP_FUNCTION(htmlspecialchars)
  1183. {
  1184. php_html_entities(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0);
  1185. }
  1186. /* }}} */
  1187. /* {{{ proto string htmlspecialchars_decode(string string [, int quote_style])
  1188. Convert special HTML entities back to characters */
  1189. PHP_FUNCTION(htmlspecialchars_decode)
  1190. {
  1191. char *str, *new_str, *e, *p;
  1192. int len, j, i, new_len;
  1193. long quote_style = ENT_COMPAT;
  1194. struct basic_entities_dec basic_entities_dec[8];
  1195. if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|l", &str, &len, &quote_style) == FAILURE) {
  1196. return;
  1197. }
  1198. new_str = estrndup(str, len);
  1199. new_len = len;
  1200. e = new_str + new_len;
  1201. if (!(p = memchr(new_str, '&', new_len))) {
  1202. RETURN_STRINGL(new_str, new_len, 0);
  1203. }
  1204. for (j = 0, i = 0; basic_entities[i].charcode != 0; i++) {
  1205. if (basic_entities[i].flags && !(quote_style & basic_entities[i].flags)) {
  1206. continue;
  1207. }
  1208. basic_entities_dec[j].charcode = basic_entities[i].charcode;
  1209. memcpy(basic_entities_dec[j].entity, basic_entities[i].entity, basic_entities[i].entitylen + 1);
  1210. basic_entities_dec[j].entitylen = basic_entities[i].entitylen;
  1211. j++;
  1212. }
  1213. basic_entities_dec[j].charcode = '&';
  1214. basic_entities_dec[j].entitylen = sizeof("&amp;") - 1;
  1215. memcpy(basic_entities_dec[j].entity, "&amp;", sizeof("&amp;"));
  1216. i = j + 1;
  1217. do {
  1218. int l = e - p;
  1219. for (j = 0; j < i; j++) {
  1220. if (basic_entities_dec[j].entitylen > l) {
  1221. continue;
  1222. }
  1223. if (!memcmp(p, basic_entities_dec[j].entity, basic_entities_dec[j].entitylen)) {
  1224. int e_len = basic_entities_dec[j].entitylen - 1;
  1225. *p++ = basic_entities_dec[j].charcode;
  1226. memmove(p, p + e_len, (e - p - e_len));
  1227. e -= e_len;
  1228. goto done;
  1229. }
  1230. }
  1231. p++;
  1232. done:
  1233. if (p >= e) {
  1234. break;
  1235. }
  1236. } while ((p = memchr(p, '&', (e - p))));
  1237. new_len = e - new_str;
  1238. new_str[new_len] = '\0';
  1239. RETURN_STRINGL(new_str, new_len, 0);
  1240. }
  1241. /* }}} */
  1242. /* {{{ proto string html_entity_decode(string string [, int quote_style][, string charset])
  1243. Convert all HTML entities to their applicable characters */
  1244. PHP_FUNCTION(html_entity_decode)
  1245. {
  1246. char *str, *hint_charset = NULL;
  1247. int str_len, hint_charset_len = 0, len;
  1248. long quote_style = ENT_COMPAT;
  1249. char *replaced;
  1250. if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|ls", &str, &str_len,
  1251. &quote_style, &hint_charset, &hint_charset_len) == FAILURE) {
  1252. return;
  1253. }
  1254. replaced = php_unescape_html_entities(str, str_len, &len, 1, quote_style, hint_charset TSRMLS_CC);
  1255. if (replaced) {
  1256. RETURN_STRINGL(replaced, len, 0);
  1257. }
  1258. RETURN_FALSE;
  1259. }
  1260. /* }}} */
  1261. /* {{{ proto string htmlentities(string string [, int quote_style[, string charset[, bool double_encode]]])
  1262. Convert all applicable characters to HTML entities */
  1263. PHP_FUNCTION(htmlentities)
  1264. {
  1265. php_html_entities(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1);
  1266. }
  1267. /* }}} */
  1268. /* {{{ proto array get_html_translation_table([int table [, int quote_style]])
  1269. Returns the internal translation table used by htmlspecialchars and htmlentities */
  1270. PHP_FUNCTION(get_html_translation_table)
  1271. {
  1272. long which = HTML_SPECIALCHARS, quote_style = ENT_COMPAT;
  1273. int i, j;
  1274. char ind[2];
  1275. enum entity_charset charset = determine_charset(NULL TSRMLS_CC);
  1276. if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "|ll", &which, &quote_style) == FAILURE) {
  1277. return;
  1278. }
  1279. array_init(return_value);
  1280. ind[1] = 0;
  1281. switch (which) {
  1282. case HTML_ENTITIES:
  1283. for (j=0; entity_map[j].charset != cs_terminator; j++) {
  1284. if (entity_map[j].charset != charset)
  1285. continue;
  1286. for (i = 0; i <= entity_map[j].endchar - entity_map[j].basechar; i++) {
  1287. char buffer[16];
  1288. if (entity_map[j].table[i] == NULL)
  1289. continue;
  1290. /* what about wide chars here ?? */
  1291. ind[0] = i + entity_map[j].basechar;
  1292. snprintf(buffer, sizeof(buffer), "&%s;", entity_map[j].table[i]);
  1293. add_assoc_string(return_value, ind, buffer, 1);
  1294. }
  1295. }
  1296. /* break thru */
  1297. case HTML_SPECIALCHARS:
  1298. for (j = 0; basic_entities[j].charcode != 0; j++) {
  1299. if (basic_entities[j].flags && (quote_style & basic_entities[j].flags) == 0)
  1300. continue;
  1301. ind[0] = (unsigned char)basic_entities[j].charcode;
  1302. add_assoc_stringl(return_value, ind, basic_entities[j].entity, basic_entities[j].entitylen, 1);
  1303. }
  1304. add_assoc_stringl(return_value, "&", "&amp;", sizeof("&amp;") - 1, 1);
  1305. break;
  1306. }
  1307. }
  1308. /* }}} */
  1309. /*
  1310. * Local variables:
  1311. * tab-width: 4
  1312. * c-basic-offset: 4
  1313. * End:
  1314. * vim600: sw=4 ts=4 fdm=marker
  1315. * vim<600: sw=4 ts=4
  1316. */