-- Rapid spam filtering system https://rspamd.com/
-- You can not select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
--
-- 113 lines
-- 2.5 KiB
--
-- 10 years ago
-- 10 years ago
-- 11 years ago
-- 11 years ago
-- 10 years ago
-- 10 years ago
-- 10 years ago
  -- Unit tests for HTML text extraction through rspamd_util.parse_html.
  --
  -- Every entry of `cases` is a pair { html_input, expected_text }. One test is
  -- registered per entry; it parses the input, checks that a result was
  -- returned, and compares the stringified result with the expected text.
  context("HTML processing", function()
    local rspamd_util = require("rspamd_util")

    local cases = {
      -- Entities: numeric character references
      {[[<html><body>.&#102;&#105;&#114;&#101;&#98;&#97;&#115;&#101;&#97;&#112;&#112;.&#99;&#111;&#109;</body></html>]],
       [[.firebaseapp.com]]},
      -- XHTML document with an XML prolog and a DOCTYPE
      {[[
<?xml version="1.0" encoding="iso-8859-1"?>
<!DOCTYPE html
PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title>
Wikibooks
</title>
</head>
<body>
<p>
Hello, world!
</p>
</body>
</html>]], 'Hello, world!\n'},
      -- <style> containing an HTML comment and never explicitly closed
      {[[
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>title</title>
<link rel="stylesheet" href="style.css">
<script src="script.js"></script>
<style><!--
- -a -a -a -- --- -
--></head>
<body>
<!-- page content -->
Hello, world!
</body>
</html>
]], 'Hello, world!'},
      -- Line breaks, stray closing tags (</br>, </hr>) and a <div>
      {[[
<html lang="en">
<head>
<meta charset="utf-8">
<title>title</title>
<link rel="stylesheet" href="style.css">
<script src="script.js"></script>
</head>
<body>
<!-- page content -->
Hello, world!<br>test</br><br>content</hr>more content<br>
<div>
content inside div
</div>
</body>
</html>
]], 'Hello, world!\ntest\ncontentmore content\ncontent inside div\n'},
      -- Tables: bare text inside <table>, then header and data rows
      {[[
<html lang="en">
<head>
<meta charset="utf-8">
<title>title</title>
<link rel="stylesheet" href="style.css">
<script src="script.js"></script>
</head>
<body>
<!-- tabular content -->
<table>
content
</table>
<table>
<tr>
<th>heada</th>
<th>headb</th>
</tr>
<tr>
<td>data1</td>
<td>data2</td>
</tr>
</table>
</body>
</html>
]], 'content\nheada headb\ndata1 data2\n'},
      -- Named entities (&nbsp; &gt; &lt; &amp; &apos; &quot;)
      {[[
<html lang="en">
<head>
<meta charset="utf-8">
<title>title</title>
<link rel="stylesheet" href="style.css">
<script src="script.js"></script>
</head>
<body>
<!-- escape content -->
a&nbsp;b a &gt; b a &lt; b a &amp; b &apos;a &quot;a&quot;
</body>
</html>
]], 'a b a > b a < b a & b \'a "a"'},
    }

    for i, case in ipairs(cases) do
      local input, expected = case[1], case[2]

      test("Extract text from HTML " .. tostring(i), function()
        local parsed = rspamd_util.parse_html(input)
        assert_not_nil(parsed)

        -- Stringify once and reuse the value for both the comparison and the
        -- failure message. Handing the raw parse result to "%s" is not
        -- portable: Lua 5.1's string.format does not call tostring() on
        -- non-string values, so building the message itself could raise
        -- before the assertion is evaluated.
        local actual = tostring(parsed)
        assert_equal(expected, actual,
          string.format("'%s' doesn't match with '%s'", expected, actual))
      end)
    end
  end)