mς
­fIc           @   s°   d  Z  d k Z d k Z d g Z d a d   Z d f  d     YZ d f  d     YZ d f  d	     YZ d
 e i	 f d     YZ
 d   Z d   Z e d j o e   n d S(   s<   robotparser.py

    Copyright (C) 2000  Bastian Kleineidam

    You can choose between two licenses when using this package:
    1) GNU GPLv2
    2) PSF license for Python 2.2

    The robots.txt Exclusion Protocol is implemented as specified in
    http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
Nt   RobotFileParseri    c         C   s   t  o	 |  GHn d  S(   N(   t   debugt   msg(   R   (    (    t(   /mit/python/lib/python2.4/robotparser.pyt   _debug   s     c           B   sb   t  Z d  Z d d  Z d   Z d   Z d   Z d   Z d   Z d   Z	 d	   Z
 d
   Z RS(   ss    This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.

    t    c         C   s>   g  |  _ d  |  _ t |  _ t |  _ |  i |  d |  _	 d  S(   Ni    (
   t   selft   entriest   Nonet   default_entryt   Falset   disallow_allt	   allow_allt   set_urlt   urlt   last_checked(   R   R   (    (    R   t   __init__   s    				c         C   s   |  i S(   s·   Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.

        N(   R   R   (   R   (    (    R   t   mtime$   s     c         C   s   d k  } | i    |  _ d S(   sY   Sets the time the robots.txt file was last fetched to the
        current time.

        N(   t   timeR   R   (   R   R   (    (    R   t   modified-   s     	c         C   s/   | |  _  t i |  d d !\ |  _ |  _ d S(   s,   Sets the URL referring to a robots.txt file.i   i   N(   R   R   t   urlparset   hostt   path(   R   R   (    (    R   R   5   s     	c         C   sϋ   t    } | i |  i  } g  } | i   } x+ | o# | i	 | i
    | i   } q0 W| i |  _ |  i d j p |  i d j o t |  _ t d  nZ |  i d j o t |  _ t d  n3 |  i d j o" | o t d  |  i |  n d S(	   s4   Reads the robots.txt URL and feeds it to the parser.i  i  s   disallow alli  s	   allow alliΘ   s   parse linesN(   t	   URLopenert   openert   openR   R   t   ft   linest   readlinet   linet   appendt   stript   errcodet   TrueR   R   R   t   parse(   R   R   R   R   R   (    (    R   t   read:   s&     	  		
c         C   s1   d | i j o | |  _ n |  i i |  d  S(   Nt   *(   t   entryt
   useragentsR   R	   R   R   (   R   R%   (    (    R   t
   _add_entryN   s    c         C   s«  d } d } t   } xZ| D]R} | d } | p_ | d j o! t d |  t   } d } q | d j o  |  i |  t   } d } q n | i	 d  } | d j o | |  } n | i   } | p q n | i d d  } t |  d j o_| d i   i   | d <t i | d i    | d <| d d j oS | d j o( t d |  |  i |  t   } n | i i | d  d } qn| d d	 j oF | d j o t d
 |  qV| i i t | d t   d } qn| d d j o@ | d j o t d
 |  qV| i i t | d t   qnt d | | d f  q t d | | f  q W| d j o |  i i |  n t d t |    d S(   s   parse the input lines from a robots.txt file.
           We allow that a user-agent: line is not preceded by
           one or more blank lines.i    i   s]   line %d: warning: you should insert allow: or disallow: directives below any user-agent: linei   t   #t   :s
   user-agentsP   line %d: warning: you should insert a blank line before any user-agent directivet   disallowsH   line %d: error: you must insert a user-agent: directive before this linet   allows    line %d: warning: unknown key %ss!   line %d: error: malformed line %ss   Parsed rules:
%sN(   t   statet
   linenumbert   EntryR%   R   R   R   R   R'   t   findt   iR   t   splitt   lent   lowert   urllibt   unquoteR&   R   t	   rulelinest   RuleLineR
   R!   R   t   str(   R   R   R0   R-   R,   R%   R   (    (    R   R"   U   s^     	 
	
	

!c         C   s»   t  d | | f  |  i o t Sn |  i o t Sn t i	 t
 i
 t i |   d  p d } x2 |  i D]' } | i |  o | i |  Sqn qn W|  i o |  i i |  Sn t S(   s=   using the parsed robots.txt decide if useragent can fetch urls=   Checking robots.txt allowance for:
  user agent: %s
  url: %si   t   /N(   R   t	   useragentR   R   R   R
   R   R!   R4   t   quoteR   R5   R   R%   t
   applies_tot	   allowanceR	   (   R   R:   R   R%   (    (    R   t	   can_fetch   s     

,
 
c         C   s2   d } x% |  i D] } | t |  d } q W| S(   NR   s   
(   t   retR   R   R%   R8   (   R   R%   R?   (    (    R   t   __str__ͺ   s
    
 (   t   __name__t
   __module__t   __doc__R   R   R   R   R#   R'   R"   R>   R@   (    (    (    R   R       s    							@	R7   c           B   s)   t  Z d  Z d   Z d   Z d   Z RS(   so   A rule line is a single "Allow:" (allowance==True) or "Disallow:"
       (allowance==False) followed by a path.c         C   s>   | d j o | o
 t } n t i |  |  _  | |  _ d  S(   NR   (   R   R=   R!   R4   R;   R   (   R   R   R=   (    (    R   R   ΄   s    
c         C   s    |  i d j p | i |  i  S(   NR$   (   R   R   t   filenamet
   startswith(   R   RD   (    (    R   R<   »   s    c         C   s    |  i o d p d d |  i S(   Nt   Allowt   Disallows   : (   R   R=   R   (   R   (    (    R   R@   Ύ   s    (   RA   RB   RC   R   R<   R@   (    (    (    R   R7   ±   s    		R.   c           B   s2   t  Z d  Z d   Z d   Z d   Z d   Z RS(   s?   An entry has one or more user-agents and zero or more rulelinesc         C   s   g  |  _ g  |  _ d  S(   N(   R   R&   R6   (   R   (    (    R   R   Δ   s    	c         C   sX   d } x# |  i D] } | d | d } q Wx% |  i D] } | t |  d } q6 W| S(   NR   s   User-agent: s   
(   R?   R   R&   t   agentR6   R   R8   (   R   R   RH   R?   (    (    R   R@   Θ   s    
 
 c         C   sg   | i d  d i   } xG |  i D]< } | d j o t Sn | i   } | | j o t Sq# q# Wt S(   s2   check if this entry applies to the specified agentR9   i    R$   N(   R:   R1   R3   R   R&   RH   R!   R
   (   R   R:   RH   (    (    R   R<   Π   s     
 c         C   sO   xH |  i D]= } t | t |  | i f  | i |  o | i Sq
 q
 Wt S(   sZ   Preconditions:
        - our agent applies to this entry
        - filename is URL decodedN(	   R   R6   R   R   RD   R8   R=   R<   R!   (   R   RD   R   (    (    R   R=   έ   s     
 (   RA   RB   RC   R   R@   R<   R=   (    (    (    R   R.   Β   s
    			R   c           B   s   t  Z d   Z d   Z RS(   Nc         G   s    t  i i |  |  d |  _ d  S(   NiΘ   (   R4   t   FancyURLopenerR   R   t   argsR    (   R   RJ   (    (    R   R   θ   s    c         C   s(   | |  _  t i i |  | | | | |  S(   N(	   R    R   R4   RI   t   http_error_defaultR   t   fpt   errmsgt   headers(   R   R   RL   R    RM   RN   (    (    R   RK   μ   s    	(   RA   RB   R   RK   (    (    (    R   R   η   s   	c         C   s;   | p
 d } n d } |  | j o	 d GHn
 d | GHHd  S(   Ns   access denieds   access allowedt   faileds   ok (%s)(   t   bt   act   a(   RR   RP   RQ   (    (    R   t   _checkρ   s    
		c          C   s  t    }  d a |  i d  |  i   t |  i d d  d  t |  i d d  d  t |  i d d  d  t |  i d	 d  d  t |  i d
 d  d  t |  i d d  d  t |  i d d  d  t |  i d d  d  t |  i d d  d  t |  i d d  d  t |  i d d  d  t |  i d d  d  |  i d  |  i   t |  i d d  d  d  S(   Ni   s"   http://www.musi-cal.com/robots.txtR$   s   http://www.musi-cal.com/R   i    t   CherryPickerSEs?   http://www.musi-cal.com/cgi-bin/event-search?city=San+Franciscos   CherryPickerSE/1.0s   CherryPickerSE/1.5t   ExtractorPros   http://www.musi-cal.com/blubbat   extractorpros   toolpak/1.1t   spams   http://www.musi-cal.com/searchs#   http://www.musi-cal.com/Musician/mes   http://www.lycos.com/robots.txtt   Mozillas   http://www.lycos.com/search(   R    t   rpR   R   R#   RS   R>   (   RY   (    (    R   t   _testό   s4     	

t   __main__(   RC   R   R4   t   __all__R   R   R    R7   R.   RI   R   RS   RZ   RA   (
   R   R\   R   RZ   R4   R   RS   R7   R.   R    (    (    R   t   ?   s   		%
		'