summaryrefslogtreecommitdiff
path: root/sys/src/cmd/python/Doc/lib/libhtmlparser.tex
diff options
context:
space:
mode:
authorcinap_lenrek <cinap_lenrek@localhost>2011-05-03 11:25:13 +0000
committercinap_lenrek <cinap_lenrek@localhost>2011-05-03 11:25:13 +0000
commit458120dd40db6b4df55a4e96b650e16798ef06a0 (patch)
tree8f82685be24fef97e715c6f5ca4c68d34d5074ee /sys/src/cmd/python/Doc/lib/libhtmlparser.tex
parent3a742c699f6806c1145aea5149bf15de15a0afd7 (diff)
add hg and python
Diffstat (limited to 'sys/src/cmd/python/Doc/lib/libhtmlparser.tex')
-rw-r--r--sys/src/cmd/python/Doc/lib/libhtmlparser.tex169
1 files changed, 169 insertions, 0 deletions
diff --git a/sys/src/cmd/python/Doc/lib/libhtmlparser.tex b/sys/src/cmd/python/Doc/lib/libhtmlparser.tex
new file mode 100644
index 000000000..52f8409a4
--- /dev/null
+++ b/sys/src/cmd/python/Doc/lib/libhtmlparser.tex
@@ -0,0 +1,169 @@
+\section{\module{HTMLParser} ---
+ Simple HTML and XHTML parser}
+
+\declaremodule{standard}{HTMLParser}
+\modulesynopsis{A simple parser that can handle HTML and XHTML.}
+
+\versionadded{2.2}
+
+This module defines a class \class{HTMLParser} which serves as the
+basis for parsing text files formatted in HTML\index{HTML} (HyperText
+Mark-up Language) and XHTML.\index{XHTML} Unlike the parser in
+\refmodule{htmllib}, this parser is not based on the SGML parser in
+\refmodule{sgmllib}.
+
+
+\begin{classdesc}{HTMLParser}{}
+The \class{HTMLParser} class is instantiated without arguments.
+
+An HTMLParser instance is fed HTML data and calls handler functions
+when tags begin and end. The \class{HTMLParser} class is meant to be
+overridden by the user to provide a desired behavior.
+
+Unlike the parser in \refmodule{htmllib}, this parser does not check
+that end tags match start tags or call the end-tag handler for
+elements which are closed implicitly by closing an outer element.
+\end{classdesc}
+
+An exception is defined as well:
+
+\begin{excdesc}{HTMLParseError}
+Exception raised by the \class{HTMLParser} class when it encounters an
+error while parsing. This exception provides three attributes:
+\member{msg} is a brief message explaining the error, \member{lineno}
+is the number of the line on which the broken construct was detected,
+and \member{offset} is the number of characters into the line at which
+the construct starts.
+\end{excdesc}
+
+
+\class{HTMLParser} instances have the following methods:
+
+\begin{methoddesc}{reset}{}
+Reset the instance. Loses all unprocessed data. This is called
+implicitly at instantiation time.
+\end{methoddesc}
+
+\begin{methoddesc}{feed}{data}
+Feed some text to the parser. It is processed insofar as it consists
+of complete elements; incomplete data is buffered until more data is
+fed or \method{close()} is called.
+\end{methoddesc}
+
+\begin{methoddesc}{close}{}
+Force processing of all buffered data as if it were followed by an
+end-of-file mark. This method may be redefined by a derived class to
+define additional processing at the end of the input, but the
+redefined version should always call the \class{HTMLParser} base class
+method \method{close()}.
+\end{methoddesc}
+
+\begin{methoddesc}{getpos}{}
+Return current line number and offset.
+\end{methoddesc}
+
+\begin{methoddesc}{get_starttag_text}{}
+Return the text of the most recently opened start tag. This should
+not normally be needed for structured processing, but may be useful in
+dealing with HTML ``as deployed'' or for re-generating input with
+minimal changes (whitespace between attributes can be preserved,
+etc.).
+\end{methoddesc}
+
+\begin{methoddesc}{handle_starttag}{tag, attrs}
+This method is called to handle the start of a tag. It is intended to
+be overridden by a derived class; the base class implementation does
+nothing.
+
+The \var{tag} argument is the name of the tag converted to
+lower case. The \var{attrs} argument is a list of \code{(\var{name},
+\var{value})} pairs containing the attributes found inside the tag's
+\code{<>} brackets. The \var{name} is translated to lower case,
+and double quotes and backslashes in the \var{value} have been
+interpreted. For instance, for the tag \code{<A
+HREF="http://www.cwi.nl/">}, this method would be called as
+\samp{handle_starttag('a', [('href', 'http://www.cwi.nl/')])}.
+\end{methoddesc}
+
+\begin{methoddesc}{handle_startendtag}{tag, attrs}
+Similar to \method{handle_starttag()}, but called when the parser
+encounters an XHTML-style empty tag (\code{<a .../>}). This method
+may be overridden by subclasses which require this particular lexical
+information; the default implementation simply calls
+\method{handle_starttag()} and \method{handle_endtag()}.
+\end{methoddesc}
+
+\begin{methoddesc}{handle_endtag}{tag}
+This method is called to handle the end tag of an element. It is
+intended to be overridden by a derived class; the base class
+implementation does nothing. The \var{tag} argument is the name of
+the tag converted to lower case.
+\end{methoddesc}
+
+\begin{methoddesc}{handle_data}{data}
+This method is called to process arbitrary data. It is intended to be
+overridden by a derived class; the base class implementation does
+nothing.
+\end{methoddesc}
+
+\begin{methoddesc}{handle_charref}{name} This method is called to
+process a character reference of the form \samp{\&\#\var{name};}. It
+is intended to be overridden by a derived class; the base class
+implementation does nothing.
+\end{methoddesc}
+
+\begin{methoddesc}{handle_entityref}{name}
+This method is called to process a general entity reference of the
+form \samp{\&\var{name};} where \var{name} is a general entity
+reference. It is intended to be overridden by a derived class; the
+base class implementation does nothing.
+\end{methoddesc}
+
+\begin{methoddesc}{handle_comment}{data}
+This method is called when a comment is encountered. The
+\var{data} argument is a string containing the text between the
+\samp{--} and \samp{--} delimiters, but not the delimiters
+themselves. For example, the comment \samp{<!--text-->} will
+cause this method to be called with the argument \code{'text'}. It is
+intended to be overridden by a derived class; the base class
+implementation does nothing.
+\end{methoddesc}
+
+\begin{methoddesc}{handle_decl}{decl}
+Method called when an SGML declaration is read by the parser. The
+\var{decl} parameter will be the entire contents of the declaration
+inside the \code{<!}...\code{>} markup. It is intended to be overridden
+by a derived class; the base class implementation does nothing.
+\end{methoddesc}
+
+\begin{methoddesc}{handle_pi}{data}
+Method called when a processing instruction is encountered. The
+\var{data} parameter will contain the entire processing instruction.
+For example, for the processing instruction \code{<?proc color='red'>},
+this method would be called as \code{handle_pi("proc color='red'")}. It
+is intended to be overridden by a derived class; the base class
+implementation does nothing.
+
+\note{The \class{HTMLParser} class uses the SGML syntactic rules for
+processing instructions. An XHTML processing instruction using the
+trailing \character{?} will cause the \character{?} to be included in
+\var{data}.}
+\end{methoddesc}
+
+
+\subsection{Example HTML Parser Application \label{htmlparser-example}}
+
+As a basic example, below is a simple HTML parser that uses the
+\class{HTMLParser} class to print out tags as they are encountered:
+
+\begin{verbatim}
+from HTMLParser import HTMLParser
+
+class MyHTMLParser(HTMLParser):
+
+    def handle_starttag(self, tag, attrs):
+        print "Encountered the beginning of a %s tag" % tag
+
+    def handle_endtag(self, tag):
+        print "Encountered the end of a %s tag" % tag
+\end{verbatim}