From 458120dd40db6b4df55a4e96b650e16798ef06a0 Mon Sep 17 00:00:00 2001
From: cinap_lenrek
Date: Tue, 3 May 2011 11:25:13 +0000
Subject: add hg and python

---
 sys/src/cmd/python/Doc/lib/librobotparser.tex | 66 +++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)
 create mode 100644 sys/src/cmd/python/Doc/lib/librobotparser.tex

diff --git a/sys/src/cmd/python/Doc/lib/librobotparser.tex b/sys/src/cmd/python/Doc/lib/librobotparser.tex
new file mode 100644
index 000000000..5eac5283e
--- /dev/null
+++ b/sys/src/cmd/python/Doc/lib/librobotparser.tex
@@ -0,0 +1,66 @@
+\section{\module{robotparser} ---
+         Parser for robots.txt}
+
+\declaremodule{standard}{robotparser}
+\modulesynopsis{Loads a \protect\file{robots.txt} file and
+                answers questions about fetchability of other URLs.}
+\sectionauthor{Skip Montanaro}{skip@mojam.com}
+
+\index{WWW}
+\index{World Wide Web}
+\index{URL}
+\index{robots.txt}
+
+This module provides a single class, \class{RobotFileParser}, which answers
+questions about whether or not a particular user agent can fetch a URL on
+the Web site that published the \file{robots.txt} file. For more details on
+the structure of \file{robots.txt} files, see
+\url{http://www.robotstxt.org/wc/norobots.html}.
+
+\begin{classdesc}{RobotFileParser}{}
+
+This class provides a set of methods to read, parse and answer questions
+about a single \file{robots.txt} file.
+
+\begin{methoddesc}{set_url}{url}
+Sets the URL referring to a \file{robots.txt} file.
+\end{methoddesc}
+
+\begin{methoddesc}{read}{}
+Reads the \file{robots.txt} URL and feeds it to the parser.
+\end{methoddesc}
+
+\begin{methoddesc}{parse}{lines}
+Parses the \var{lines} argument, a list of lines from a \file{robots.txt} file.
+\end{methoddesc}
+
+\begin{methoddesc}{can_fetch}{useragent, url}
+Returns \code{True} if the \var{useragent} is allowed to fetch the \var{url}
+according to the rules contained in the parsed \file{robots.txt} file.
+\end{methoddesc}
+
+\begin{methoddesc}{mtime}{}
+Returns the time the \file{robots.txt} file was last fetched. This is
+useful for long-running web spiders that need to check for new
+\file{robots.txt} files periodically.
+\end{methoddesc}
+
+\begin{methoddesc}{modified}{}
+Sets the time the \file{robots.txt} file was last fetched to the current
+time.
+\end{methoddesc}
+
+\end{classdesc}
+
+The following example demonstrates basic use of the \class{RobotFileParser} class.
+
+\begin{verbatim}
+>>> import robotparser
+>>> rp = robotparser.RobotFileParser()
+>>> rp.set_url("http://www.musi-cal.com/robots.txt")
+>>> rp.read()
+>>> rp.can_fetch("*", "http://www.musi-cal.com/cgi-bin/search?city=San+Francisco")
False
+>>> rp.can_fetch("*", "http://www.musi-cal.com/")
+True
+\end{verbatim}
--
cgit v1.2.3
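
The parse() entry in the documentation above is terse, so the following sketch (not part of the patch itself) shows the method being fed lines the caller obtained on its own, for instance from a locally cached copy of robots.txt; the cache file name robots_cache.txt and the example.com URLs are made-up illustrations, not anything defined by the module.

\begin{verbatim}
import robotparser    # Python 2.x module name; urllib.robotparser in Python 3

rp = robotparser.RobotFileParser()
rp.set_url("http://www.example.com/robots.txt")        # hypothetical site
# Instead of rp.read(), hand parse() the lines of a copy we already have.
lines = open("robots_cache.txt").read().splitlines()   # made-up cache file
rp.parse(lines)
print rp.can_fetch("*", "http://www.example.com/some/page.html")
\end{verbatim}

Once parse() has loaded the rules, can_fetch() answers from them exactly as it does after read().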
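
The mtime()/modified() pair is motivated above for long-running spiders but never shown in use. Below is a minimal sketch of that periodic re-checking pattern, again outside the patch: it assumes mtime() reports a Unix timestamp recorded by an explicit modified() call, a made-up MAX_AGE policy of one day, and a hypothetical can_fetch_fresh() helper that is not part of the module.

\begin{verbatim}
import time
import robotparser    # Python 2.x module name; urllib.robotparser in Python 3

MAX_AGE = 24 * 60 * 60   # assumed policy: trust a fetched robots.txt for a day

rp = robotparser.RobotFileParser()
rp.set_url("http://www.example.com/robots.txt")   # hypothetical site
rp.read()        # fetch robots.txt and feed it to the parser
rp.modified()    # stamp the fetch time so mtime() has something to report

def can_fetch_fresh(parser, useragent, url):
    # Hypothetical helper: re-fetch the rules once the cached copy is older
    # than MAX_AGE, then answer from the (possibly refreshed) parser.
    if time.time() - parser.mtime() > MAX_AGE:
        parser.read()
        parser.modified()
    return parser.can_fetch(useragent, url)

print can_fetch_fresh(rp, "*", "http://www.example.com/some/page.html")
\end{verbatim}

Calling modified() explicitly after each read() keeps the sketch independent of whether read() or parse() record the fetch time themselves.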