"""A parser for SGML, using the derived class as a static DTD.""" # Stolen from the Python 2.0 distribution and tweaked by JMT # XXX This only supports those SGML features used by HTML. # XXX There should be a way to distinguish between PCDATA (parsed # character data -- the normal case), RCDATA (replaceable character # data -- only char and entity references and end tags are special) # and CDATA (character data -- only end tags are special). import re import string # Regular expressions used for parsing interesting = re.compile('[&<]') incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|' '<([a-zA-Z][^<>]*|' '/([a-zA-Z][^<>]*)?|' '![^<>]*)?') entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*);') charref = re.compile('&#([0-9]+)[^0-9];') starttagopen = re.compile('<[>a-zA-Z]') shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/') shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/') piopen = re.compile('<\?') piclose = re.compile('>') endtagopen = re.compile('a-zA-Z]') endbracket = re.compile('[<>]') special = re.compile(']*>') commentopen = re.compile('