diff options
author | cinap_lenrek <cinap_lenrek@localhost> | 2011-05-03 11:25:13 +0000 |
---|---|---|
committer | cinap_lenrek <cinap_lenrek@localhost> | 2011-05-03 11:25:13 +0000 |
commit | 458120dd40db6b4df55a4e96b650e16798ef06a0 (patch) | |
tree | 8f82685be24fef97e715c6f5ca4c68d34d5074ee /sys/src/cmd/python/Doc/lib/libhtmlparser.tex | |
parent | 3a742c699f6806c1145aea5149bf15de15a0afd7 (diff) |
add hg and python
Diffstat (limited to 'sys/src/cmd/python/Doc/lib/libhtmlparser.tex')
-rw-r--r-- | sys/src/cmd/python/Doc/lib/libhtmlparser.tex | 169 |
1 files changed, 169 insertions, 0 deletions
diff --git a/sys/src/cmd/python/Doc/lib/libhtmlparser.tex b/sys/src/cmd/python/Doc/lib/libhtmlparser.tex new file mode 100644 index 000000000..52f8409a4 --- /dev/null +++ b/sys/src/cmd/python/Doc/lib/libhtmlparser.tex @@ -0,0 +1,169 @@ +\section{\module{HTMLParser} --- + Simple HTML and XHTML parser} + +\declaremodule{standard}{HTMLParser} +\modulesynopsis{A simple parser that can handle HTML and XHTML.} + +\versionadded{2.2} + +This module defines a class \class{HTMLParser} which serves as the +basis for parsing text files formatted in HTML\index{HTML} (HyperText +Mark-up Language) and XHTML.\index{XHTML} Unlike the parser in +\refmodule{htmllib}, this parser is not based on the SGML parser in +\refmodule{sgmllib}. + + +\begin{classdesc}{HTMLParser}{} +The \class{HTMLParser} class is instantiated without arguments. + +An HTMLParser instance is fed HTML data and calls handler functions +when tags begin and end. The \class{HTMLParser} class is meant to be +overridden by the user to provide a desired behavior. + +Unlike the parser in \refmodule{htmllib}, this parser does not check +that end tags match start tags or call the end-tag handler for +elements which are closed implicitly by closing an outer element. +\end{classdesc} + +An exception is defined as well: + +\begin{excdesc}{HTMLParseError} +Exception raised by the \class{HTMLParser} class when it encounters an +error while parsing. This exception provides three attributes: +\member{msg} is a brief message explaining the error, \member{lineno} +is the number of the line on which the broken construct was detected, +and \member{offset} is the number of characters into the line at which +the construct starts. +\end{excdesc} + + +\class{HTMLParser} instances have the following methods: + +\begin{methoddesc}{reset}{} +Reset the instance. Loses all unprocessed data. This is called +implicitly at instantiation time. +\end{methoddesc} + +\begin{methoddesc}{feed}{data} +Feed some text to the parser. 
It is processed insofar as it consists +of complete elements; incomplete data is buffered until more data is +fed or \method{close()} is called. +\end{methoddesc} + +\begin{methoddesc}{close}{} +Force processing of all buffered data as if it were followed by an +end-of-file mark. This method may be redefined by a derived class to +define additional processing at the end of the input, but the +redefined version should always call the \class{HTMLParser} base class +method \method{close()}. +\end{methoddesc} + +\begin{methoddesc}{getpos}{} +Return current line number and offset. +\end{methoddesc} + +\begin{methoddesc}{get_starttag_text}{} +Return the text of the most recently opened start tag. This should +not normally be needed for structured processing, but may be useful in +dealing with HTML ``as deployed'' or for re-generating input with +minimal changes (whitespace between attributes can be preserved, +etc.). +\end{methoddesc} + +\begin{methoddesc}{handle_starttag}{tag, attrs} +This method is called to handle the start of a tag. It is intended to +be overridden by a derived class; the base class implementation does +nothing. + +The \var{tag} argument is the name of the tag converted to +lower case. The \var{attrs} argument is a list of \code{(\var{name}, +\var{value})} pairs containing the attributes found inside the tag's +\code{<>} brackets. The \var{name} will be translated to lower case +and double quotes and backslashes in the \var{value} have been +interpreted. For instance, for the tag \code{<A +HREF="http://www.cwi.nl/">}, this method would be called as +\samp{handle_starttag('a', [('href', 'http://www.cwi.nl/')])}. +\end{methoddesc} + +\begin{methoddesc}{handle_startendtag}{tag, attrs} +Similar to \method{handle_starttag()}, but called when the parser +encounters an XHTML-style empty tag (\code{<a .../>}). 
This method +may be overridden by subclasses which require this particular lexical +information; the default implementation simply calls +\method{handle_starttag()} and \method{handle_endtag()}. +\end{methoddesc} + +\begin{methoddesc}{handle_endtag}{tag} +This method is called to handle the end tag of an element. It is +intended to be overridden by a derived class; the base class +implementation does nothing. The \var{tag} argument is the name of +the tag converted to lower case. +\end{methoddesc} + +\begin{methoddesc}{handle_data}{data} +This method is called to process arbitrary data. It is intended to be +overridden by a derived class; the base class implementation does +nothing. +\end{methoddesc} + +\begin{methoddesc}{handle_charref}{name} This method is called to +process a character reference of the form \samp{\&\#\var{name};}. It +is intended to be overridden by a derived class; the base class +implementation does nothing. +\end{methoddesc} + +\begin{methoddesc}{handle_entityref}{name} +This method is called to process a general entity reference of the +form \samp{\&\var{name};} where \var{name} is a general entity +reference. It is intended to be overridden by a derived class; the +base class implementation does nothing. +\end{methoddesc} + +\begin{methoddesc}{handle_comment}{data} +This method is called when a comment is encountered. The +\var{data} argument is a string containing the text between the +\samp{<!--} and \samp{-->} delimiters, but not the delimiters +themselves. For example, the comment \samp{<!--text-->} will +cause this method to be called with the argument \code{'text'}. It is +intended to be overridden by a derived class; the base class +implementation does nothing. +\end{methoddesc} + +\begin{methoddesc}{handle_decl}{decl} +Method called when an SGML declaration is read by the parser. The +\var{decl} parameter will be the entire contents of the declaration +inside the \code{<!}...\code{>} markup. 
It is intended to be overridden +by a derived class; the base class implementation does nothing. +\end{methoddesc} + +\begin{methoddesc}{handle_pi}{data} +Method called when a processing instruction is encountered. The +\var{data} parameter will contain the entire processing instruction. +For example, for the processing instruction \code{<?proc color='red'>}, +this method would be called as \code{handle_pi("proc color='red'")}. It +is intended to be overridden by a derived class; the base class +implementation does nothing. + +\note{The \class{HTMLParser} class uses the SGML syntactic rules for +processing instructions. An XHTML processing instruction using the +trailing \character{?} will cause the \character{?} to be included in +\var{data}.} +\end{methoddesc} + + +\subsection{Example HTML Parser Application \label{htmlparser-example}} + +As a basic example, below is a very basic HTML parser that uses the +\class{HTMLParser} class to print out tags as they are encountered: + +\begin{verbatim} +from HTMLParser import HTMLParser + +class MyHTMLParser(HTMLParser): + + def handle_starttag(self, tag, attrs): + print "Encountered the beginning of a %s tag" % tag + + def handle_endtag(self, tag): + print "Encountered the end of a %s tag" % tag +\end{verbatim} |