Browse Source

bpo-31325: Fix usage of namedtuple in RobotFileParser.parse() (#4529)

pull/4495/merge
Berker Peksag 8 years ago
committed by Raymond Hettinger
parent
commit
3df02dbc8e
  1. 8
      Doc/library/urllib.robotparser.rst
  2. 9
      Lib/test/test_robotparser.py
  3. 9
      Lib/urllib/robotparser.py
  4. 5
      Misc/NEWS.d/next/Library/2017-11-23-22-12-11.bpo-31325.8jAUxN.rst

8
Doc/library/urllib.robotparser.rst

@@ -69,10 +69,10 @@ structure of :file:`robots.txt` files, see http://www.robotstxt.org/orig.html.
.. method:: request_rate(useragent)
Returns the contents of the ``Request-rate`` parameter from
``robots.txt`` in the form of a :func:`~collections.namedtuple`
``(requests, seconds)``. If there is no such parameter or it doesn't
apply to the *useragent* specified or the ``robots.txt`` entry for this
parameter has invalid syntax, return ``None``.
``robots.txt`` as a :term:`named tuple` ``RequestRate(requests, seconds)``.
If there is no such parameter or it doesn't apply to the *useragent*
specified or the ``robots.txt`` entry for this parameter has invalid
syntax, return ``None``.
.. versionadded:: 3.6

9
Lib/test/test_robotparser.py

@@ -3,7 +3,6 @@ import os
import threading
import unittest
import urllib.robotparser
from collections import namedtuple
from test import support
from http.server import BaseHTTPRequestHandler, HTTPServer
@@ -87,6 +86,10 @@ class BaseRequestRateTest(BaseRobotTest):
self.parser.crawl_delay(agent), self.crawl_delay
)
if self.request_rate:
self.assertIsInstance(
self.parser.request_rate(agent),
urllib.robotparser.RequestRate
)
self.assertEqual(
self.parser.request_rate(agent).requests,
self.request_rate.requests
@@ -108,7 +111,7 @@ Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html
"""
agent = 'figtree'
request_rate = namedtuple('req_rate', 'requests seconds')(9, 30)
request_rate = urllib.robotparser.RequestRate(9, 30)
crawl_delay = 3
good = [('figtree', '/foo.html')]
bad = ['/tmp', '/tmp.html', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html',
@@ -237,7 +240,7 @@ Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/
"""
request_rate = namedtuple('req_rate', 'requests seconds')(3, 15)
request_rate = urllib.robotparser.RequestRate(3, 15)
crawl_delay = 1
good = ['/', '/test.html']
bad = ['/cyberworld/map/index.html']

9
Lib/urllib/robotparser.py

@@ -16,6 +16,9 @@ import urllib.request
__all__ = ["RobotFileParser"]
RequestRate = collections.namedtuple("RequestRate", "requests seconds")
class RobotFileParser:
""" This class provides a set of methods to read, parse and answer
questions about a single robots.txt file.
@@ -136,11 +139,7 @@ class RobotFileParser:
# check if all values are sane
if (len(numbers) == 2 and numbers[0].strip().isdigit()
and numbers[1].strip().isdigit()):
req_rate = collections.namedtuple('req_rate',
'requests seconds')
entry.req_rate = req_rate
entry.req_rate.requests = int(numbers[0])
entry.req_rate.seconds = int(numbers[1])
entry.req_rate = RequestRate(int(numbers[0]), int(numbers[1]))
state = 2
if state == 2:
self._add_entry(entry)

5
Misc/NEWS.d/next/Library/2017-11-23-22-12-11.bpo-31325.8jAUxN.rst

@@ -0,0 +1,5 @@
Fix wrong usage of :func:`collections.namedtuple` in
the :meth:`RobotFileParser.parse() <urllib.robotparser.RobotFileParser.parse>`
method.
Initial patch by Robin Wellner.
Loading…
Cancel
Save