author    cinap_lenrek <cinap_lenrek@localhost>  2011-05-03 11:25:13 +0000
committer cinap_lenrek <cinap_lenrek@localhost>  2011-05-03 11:25:13 +0000
commit    458120dd40db6b4df55a4e96b650e16798ef06a0 (patch)
tree      8f82685be24fef97e715c6f5ca4c68d34d5074ee /sys/src/cmd/python/Doc/lib/librobotparser.tex
parent    3a742c699f6806c1145aea5149bf15de15a0afd7 (diff)
add hg and python
Diffstat (limited to 'sys/src/cmd/python/Doc/lib/librobotparser.tex')
-rw-r--r--  sys/src/cmd/python/Doc/lib/librobotparser.tex  66
1 file changed, 66 insertions, 0 deletions
diff --git a/sys/src/cmd/python/Doc/lib/librobotparser.tex b/sys/src/cmd/python/Doc/lib/librobotparser.tex
new file mode 100644
index 000000000..5eac5283e
--- /dev/null
+++ b/sys/src/cmd/python/Doc/lib/librobotparser.tex
@@ -0,0 +1,66 @@
+\section{\module{robotparser} ---
+ Parser for robots.txt}
+
+\declaremodule{standard}{robotparser}
+\modulesynopsis{Loads a \protect\file{robots.txt} file and
+ answers questions about fetchability of other URLs.}
+\sectionauthor{Skip Montanaro}{skip@mojam.com}
+
+\index{WWW}
+\index{World Wide Web}
+\index{URL}
+\index{robots.txt}
+
+This module provides a single class, \class{RobotFileParser}, which answers
+questions about whether or not a particular user agent can fetch a URL on
+the Web site that published the \file{robots.txt} file. For more details on
+the structure of \file{robots.txt} files, see
+\url{http://www.robotstxt.org/wc/norobots.html}.
+
+\begin{classdesc}{RobotFileParser}{}
+
+This class provides a set of methods to read, parse and answer questions
+about a single \file{robots.txt} file.
+
+\begin{methoddesc}{set_url}{url}
+Sets the URL referring to a \file{robots.txt} file.
+\end{methoddesc}
+
+\begin{methoddesc}{read}{}
+Reads the \file{robots.txt} URL and feeds it to the parser.
+\end{methoddesc}
+
+\begin{methoddesc}{parse}{lines}
+Parses the \var{lines} argument, which should be a list of lines from a
+\file{robots.txt} file.
+\end{methoddesc}
+
+\begin{methoddesc}{can_fetch}{useragent, url}
+Returns \code{True} if the \var{useragent} is allowed to fetch the \var{url}
+according to the rules contained in the parsed \file{robots.txt} file.
+\end{methoddesc}
+
+\begin{methoddesc}{mtime}{}
+Returns the time the \file{robots.txt} file was last fetched. This is
+useful for long-running web spiders that need to check for new
+\file{robots.txt} files periodically.
+\end{methoddesc}
+
+\begin{methoddesc}{modified}{}
+Sets the time the \file{robots.txt} file was last fetched to the current
+time.
+\end{methoddesc}
+
+\end{classdesc}
+
+The following example demonstrates basic use of the
+\class{RobotFileParser} class.
+
+\begin{verbatim}
+>>> import robotparser
+>>> rp = robotparser.RobotFileParser()
+>>> rp.set_url("http://www.musi-cal.com/robots.txt")
+>>> rp.read()
+>>> rp.can_fetch("*", "http://www.musi-cal.com/cgi-bin/search?city=San+Francisco")
+False
+>>> rp.can_fetch("*", "http://www.musi-cal.com/")
+True
+\end{verbatim}
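+
+The \method{parse()} method can also be fed lines that have already been
+retrieved by some other means, and \method{modified()} together with
+\method{mtime()} can be used to track when the data was last refreshed.
+The following sketch illustrates this; the rule set and the
+\code{example.com} URLs are purely illustrative and are not taken from a
+real \file{robots.txt} file.
+
+\begin{verbatim}
+>>> import robotparser
+>>> rp = robotparser.RobotFileParser()
+>>> rp.parse(["User-agent: *", "Disallow: /private/"])
+>>> rp.can_fetch("*", "http://www.example.com/private/page.html")
+False
+>>> rp.can_fetch("*", "http://www.example.com/index.html")
+True
+>>> rp.modified()   # record the current time as the last fetch time
+>>> rp.mtime() > 0
+True
+\end{verbatim}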