author    cinap_lenrek <cinap_lenrek@localhost>  2011-05-03 11:25:13 +0000
committer cinap_lenrek <cinap_lenrek@localhost>  2011-05-03 11:25:13 +0000
commit    458120dd40db6b4df55a4e96b650e16798ef06a0 (patch)
tree      8f82685be24fef97e715c6f5ca4c68d34d5074ee /sys/src/cmd/python/Doc/lib/librobotparser.tex
parent    3a742c699f6806c1145aea5149bf15de15a0afd7 (diff)
add hg and python
Diffstat (limited to 'sys/src/cmd/python/Doc/lib/librobotparser.tex')
-rw-r--r--  sys/src/cmd/python/Doc/lib/librobotparser.tex  66
1 file changed, 66 insertions, 0 deletions
diff --git a/sys/src/cmd/python/Doc/lib/librobotparser.tex b/sys/src/cmd/python/Doc/lib/librobotparser.tex
new file mode 100644
index 000000000..5eac5283e
--- /dev/null
+++ b/sys/src/cmd/python/Doc/lib/librobotparser.tex
@@ -0,0 +1,66 @@
+\section{\module{robotparser} ---
+ Parser for robots.txt}
+
+\declaremodule{standard}{robotparser}
+\modulesynopsis{Loads a \protect\file{robots.txt} file and
+ answers questions about fetchability of other URLs.}
+\sectionauthor{Skip Montanaro}{skip@mojam.com}
+
+\index{WWW}
+\index{World Wide Web}
+\index{URL}
+\index{robots.txt}
+
+This module provides a single class, \class{RobotFileParser}, which answers
+questions about whether or not a particular user agent can fetch a URL on
+the Web site that published the \file{robots.txt} file. For more details on
+the structure of \file{robots.txt} files, see
+\url{http://www.robotstxt.org/wc/norobots.html}.
+
+\begin{classdesc}{RobotFileParser}{}
+
+This class provides a set of methods to read, parse and answer questions
+about a single \file{robots.txt} file.
+
+\begin{methoddesc}{set_url}{url}
+Sets the URL referring to a \file{robots.txt} file.
+\end{methoddesc}
+
+\begin{methoddesc}{read}{}
+Reads the \file{robots.txt} URL and feeds it to the parser.
+\end{methoddesc}
+
+\begin{methoddesc}{parse}{lines}
+Parses the \var{lines} argument, which should be a list of lines from a
+\file{robots.txt} file.
+\end{methoddesc}
+
+\begin{methoddesc}{can_fetch}{useragent, url}
+Returns \code{True} if the \var{useragent} is allowed to fetch the \var{url}
+according to the rules contained in the parsed \file{robots.txt} file.
+\end{methoddesc}
+
+\begin{methoddesc}{mtime}{}
+Returns the time the \file{robots.txt} file was last fetched. This is
+useful for long-running web spiders that need to check for new
+\file{robots.txt} files periodically.
+\end{methoddesc}
+
+\begin{methoddesc}{modified}{}
+Sets the time the \file{robots.txt} file was last fetched to the current
+time.
+\end{methoddesc}
+
+\end{classdesc}
+
+The following example demonstrates basic use of the
+\class{RobotFileParser} class.
+
+\begin{verbatim}
+>>> import robotparser
+>>> rp = robotparser.RobotFileParser()
+>>> rp.set_url("http://www.musi-cal.com/robots.txt")
+>>> rp.read()
+>>> rp.can_fetch("*", "http://www.musi-cal.com/cgi-bin/search?city=San+Francisco")
+False
+>>> rp.can_fetch("*", "http://www.musi-cal.com/")
+True
+\end{verbatim}
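+
+The \method{parse()} method can also be fed lines that have already been
+retrieved by some other means, and \method{modified()} together with
+\method{mtime()} can be used to track when the data was last refreshed.
+The following sketch illustrates this; the rule set and the
+\code{example.com} URLs are purely illustrative and are not taken from a
+real \file{robots.txt} file.
+
+\begin{verbatim}
+>>> import robotparser
+>>> rp = robotparser.RobotFileParser()
+>>> rp.parse(["User-agent: *", "Disallow: /private/"])
+>>> rp.can_fetch("*", "http://www.example.com/private/page.html")
+False
+>>> rp.can_fetch("*", "http://www.example.com/index.html")
+True
+>>> rp.modified()   # record the current time as the last fetch time
+>>> rp.mtime() > 0
+True
+\end{verbatim}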