A PyQT GUI application for converting InfoLease report outputs into Excel files. Handles parsing and summarizing. Learns where files are meant to be store and compiles monthly and yearly summaries.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
InfoLeaseExtract/venv/Lib/site-packages/pip/_vendor/html5lib/html5parser.py

2795 lines
114 KiB

from __future__ import absolute_import, division, unicode_literals
from pip._vendor.six import with_metaclass, viewkeys
import types
from . import _inputstream
from . import _tokenizer
from . import treebuilders
from .treebuilders.base import Marker
from . import _utils
from .constants import (
spaceCharacters, asciiUpper2Lower,
specialElements, headingElements, cdataElements, rcdataElements,
tokenTypes, tagTokenTypes,
namespaces,
htmlIntegrationPointElements, mathmlTextIntegrationPointElements,
adjustForeignAttributes as adjustForeignAttributesMap,
adjustMathMLAttributes, adjustSVGAttributes,
E,
_ReparseException
)
def parse(doc, treebuilder="etree", namespaceHTMLElements=True, **kwargs):
"""Parse an HTML document as a string or file-like object into a tree
:arg doc: the document to parse as a string or file-like object
:arg treebuilder: the treebuilder to use when parsing
:arg namespaceHTMLElements: whether or not to namespace HTML elements
:returns: parsed tree
Example:
>>> from html5lib.html5parser import parse
>>> parse('<html><body><p>This is a doc</p></body></html>')
<Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>
"""
tb = treebuilders.getTreeBuilder(treebuilder)
p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
return p.parse(doc, **kwargs)
def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElements=True, **kwargs):
"""Parse an HTML fragment as a string or file-like object into a tree
:arg doc: the fragment to parse as a string or file-like object
:arg container: the container context to parse the fragment in
:arg treebuilder: the treebuilder to use when parsing
:arg namespaceHTMLElements: whether or not to namespace HTML elements
:returns: parsed tree
Example:
>>> from html5lib.html5libparser import parseFragment
>>> parseFragment('<b>this is a fragment</b>')
<Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>
"""
tb = treebuilders.getTreeBuilder(treebuilder)
p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
return p.parseFragment(doc, container=container, **kwargs)
def method_decorator_metaclass(function):
class Decorated(type):
def __new__(meta, classname, bases, classDict):
for attributeName, attribute in classDict.items():
if isinstance(attribute, types.FunctionType):
attribute = function(attribute)
classDict[attributeName] = attribute
return type.__new__(meta, classname, bases, classDict)
return Decorated
class HTMLParser(object):
"""HTML parser
Generates a tree structure from a stream of (possibly malformed) HTML.
"""
def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=False):
"""
:arg tree: a treebuilder class controlling the type of tree that will be
returned. Built in treebuilders can be accessed through
html5lib.treebuilders.getTreeBuilder(treeType)
:arg strict: raise an exception when a parse error is encountered
:arg namespaceHTMLElements: whether or not to namespace HTML elements
:arg debug: whether or not to enable debug mode which logs things
Example:
>>> from html5lib.html5parser import HTMLParser
>>> parser = HTMLParser() # generates parser with etree builder
>>> parser = HTMLParser('lxml', strict=True) # generates parser with lxml builder which is strict
"""
# Raise an exception on the first error encountered
self.strict = strict
if tree is None:
tree = treebuilders.getTreeBuilder("etree")
self.tree = tree(namespaceHTMLElements)
self.errors = []
self.phases = {name: cls(self, self.tree) for name, cls in
getPhases(debug).items()}
def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kwargs):
self.innerHTMLMode = innerHTML
self.container = container
self.scripting = scripting
self.tokenizer = _tokenizer.HTMLTokenizer(stream, parser=self, **kwargs)
self.reset()
try:
self.mainLoop()
except _ReparseException:
self.reset()
self.mainLoop()
def reset(self):
self.tree.reset()
self.firstStartTag = False
self.errors = []
self.log = [] # only used with debug mode
# "quirks" / "limited quirks" / "no quirks"
self.compatMode = "no quirks"
if self.innerHTMLMode:
self.innerHTML = self.container.lower()
if self.innerHTML in cdataElements:
self.tokenizer.state = self.tokenizer.rcdataState
elif self.innerHTML in rcdataElements:
self.tokenizer.state = self.tokenizer.rawtextState
elif self.innerHTML == 'plaintext':
self.tokenizer.state = self.tokenizer.plaintextState
else:
# state already is data state
# self.tokenizer.state = self.tokenizer.dataState
pass
self.phase = self.phases["beforeHtml"]
self.phase.insertHtmlElement()
self.resetInsertionMode()
else:
self.innerHTML = False # pylint:disable=redefined-variable-type
self.phase = self.phases["initial"]
self.lastPhase = None
self.beforeRCDataPhase = None
self.framesetOK = True
@property
def documentEncoding(self):
"""Name of the character encoding that was used to decode the input stream, or
:obj:`None` if that is not determined yet
"""
if not hasattr(self, 'tokenizer'):
return None
return self.tokenizer.stream.charEncoding[0].name
def isHTMLIntegrationPoint(self, element):
if (element.name == "annotation-xml" and
element.namespace == namespaces["mathml"]):
return ("encoding" in element.attributes and
element.attributes["encoding"].translate(
asciiUpper2Lower) in
("text/html", "application/xhtml+xml"))
else:
return (element.namespace, element.name) in htmlIntegrationPointElements
def isMathMLTextIntegrationPoint(self, element):
return (element.namespace, element.name) in mathmlTextIntegrationPointElements
def mainLoop(self):
CharactersToken = tokenTypes["Characters"]
SpaceCharactersToken = tokenTypes["SpaceCharacters"]
StartTagToken = tokenTypes["StartTag"]
EndTagToken = tokenTypes["EndTag"]
CommentToken = tokenTypes["Comment"]
DoctypeToken = tokenTypes["Doctype"]
ParseErrorToken = tokenTypes["ParseError"]
for token in self.tokenizer:
prev_token = None
new_token = token
while new_token is not None:
prev_token = new_token
currentNode = self.tree.openElements[-1] if self.tree.openElements else None
currentNodeNamespace = currentNode.namespace if currentNode else None
currentNodeName = currentNode.name if currentNode else None
type = new_token["type"]
if type == ParseErrorToken:
self.parseError(new_token["data"], new_token.get("datavars", {}))
new_token = None
else:
if (len(self.tree.openElements) == 0 or
currentNodeNamespace == self.tree.defaultNamespace or
(self.isMathMLTextIntegrationPoint(currentNode) and
((type == StartTagToken and
token["name"] not in frozenset(["mglyph", "malignmark"])) or
type in (CharactersToken, SpaceCharactersToken))) or
(currentNodeNamespace == namespaces["mathml"] and
currentNodeName == "annotation-xml" and
type == StartTagToken and
token["name"] == "svg") or
(self.isHTMLIntegrationPoint(currentNode) and
type in (StartTagToken, CharactersToken, SpaceCharactersToken))):
phase = self.phase
else:
phase = self.phases["inForeignContent"]
if type == CharactersToken:
new_token = phase.processCharacters(new_token)
elif type == SpaceCharactersToken:
new_token = phase.processSpaceCharacters(new_token)
elif type == StartTagToken:
new_token = phase.processStartTag(new_token)
elif type == EndTagToken:
new_token = phase.processEndTag(new_token)
elif type == CommentToken:
new_token = phase.processComment(new_token)
elif type == DoctypeToken:
new_token = phase.processDoctype(new_token)
if (type == StartTagToken and prev_token["selfClosing"] and
not prev_token["selfClosingAcknowledged"]):
self.parseError("non-void-element-with-trailing-solidus",
{"name": prev_token["name"]})
# When the loop finishes it's EOF
reprocess = True
phases = []
while reprocess:
phases.append(self.phase)
reprocess = self.phase.processEOF()
if reprocess:
assert self.phase not in phases
def parse(self, stream, *args, **kwargs):
"""Parse a HTML document into a well-formed tree
:arg stream: a file-like object or string containing the HTML to be parsed
The optional encoding parameter must be a string that indicates
the encoding. If specified, that encoding will be used,
regardless of any BOM or later declaration (such as in a meta
element).
:arg scripting: treat noscript elements as if JavaScript was turned on
:returns: parsed tree
Example:
>>> from html5lib.html5parser import HTMLParser
>>> parser = HTMLParser()
>>> parser.parse('<html><body><p>This is a doc</p></body></html>')
<Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>
"""
self._parse(stream, False, None, *args, **kwargs)
return self.tree.getDocument()
def parseFragment(self, stream, *args, **kwargs):
"""Parse a HTML fragment into a well-formed tree fragment
:arg container: name of the element we're setting the innerHTML
property if set to None, default to 'div'
:arg stream: a file-like object or string containing the HTML to be parsed
The optional encoding parameter must be a string that indicates
the encoding. If specified, that encoding will be used,
regardless of any BOM or later declaration (such as in a meta
element)
:arg scripting: treat noscript elements as if JavaScript was turned on
:returns: parsed tree
Example:
>>> from html5lib.html5libparser import HTMLParser
>>> parser = HTMLParser()
>>> parser.parseFragment('<b>this is a fragment</b>')
<Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>
"""
self._parse(stream, True, *args, **kwargs)
return self.tree.getFragment()
def parseError(self, errorcode="XXX-undefined-error", datavars=None):
# XXX The idea is to make errorcode mandatory.
if datavars is None:
datavars = {}
self.errors.append((self.tokenizer.stream.position(), errorcode, datavars))
if self.strict:
raise ParseError(E[errorcode] % datavars)
def adjustMathMLAttributes(self, token):
adjust_attributes(token, adjustMathMLAttributes)
def adjustSVGAttributes(self, token):
adjust_attributes(token, adjustSVGAttributes)
def adjustForeignAttributes(self, token):
adjust_attributes(token, adjustForeignAttributesMap)
def reparseTokenNormal(self, token):
# pylint:disable=unused-argument
self.parser.phase()
def resetInsertionMode(self):
# The name of this method is mostly historical. (It's also used in the
# specification.)
last = False
newModes = {
"select": "inSelect",
"td": "inCell",
"th": "inCell",
"tr": "inRow",
"tbody": "inTableBody",
"thead": "inTableBody",
"tfoot": "inTableBody",
"caption": "inCaption",
"colgroup": "inColumnGroup",
"table": "inTable",
"head": "inBody",
"body": "inBody",
"frameset": "inFrameset",
"html": "beforeHead"
}
for node in self.tree.openElements[::-1]:
nodeName = node.name
new_phase = None
if node == self.tree.openElements[0]:
assert self.innerHTML
last = True
nodeName = self.innerHTML
# Check for conditions that should only happen in the innerHTML
# case
if nodeName in ("select", "colgroup", "head", "html"):
assert self.innerHTML
if not last and node.namespace != self.tree.defaultNamespace:
continue
if nodeName in newModes:
new_phase = self.phases[newModes[nodeName]]
break
elif last:
new_phase = self.phases["inBody"]
break
self.phase = new_phase
def parseRCDataRawtext(self, token, contentType):
# Generic RCDATA/RAWTEXT Parsing algorithm
assert contentType in ("RAWTEXT", "RCDATA")
self.tree.insertElement(token)
if contentType == "RAWTEXT":
self.tokenizer.state = self.tokenizer.rawtextState
else:
self.tokenizer.state = self.tokenizer.rcdataState
self.originalPhase = self.phase
self.phase = self.phases["text"]
@_utils.memoize
def getPhases(debug):
def log(function):
"""Logger that records which phase processes each token"""
type_names = {value: key for key, value in tokenTypes.items()}
def wrapped(self, *args, **kwargs):
if function.__name__.startswith("process") and len(args) > 0:
token = args[0]
info = {"type": type_names[token['type']]}
if token['type'] in tagTokenTypes:
info["name"] = token['name']
self.parser.log.append((self.parser.tokenizer.state.__name__,
self.parser.phase.__class__.__name__,
self.__class__.__name__,
function.__name__,
info))
return function(self, *args, **kwargs)
else:
return function(self, *args, **kwargs)
return wrapped
def getMetaclass(use_metaclass, metaclass_func):
if use_metaclass:
return method_decorator_metaclass(metaclass_func)
else:
return type
# pylint:disable=unused-argument
class Phase(with_metaclass(getMetaclass(debug, log))):
"""Base class for helper object that implements each phase of processing
"""
__slots__ = ("parser", "tree", "__startTagCache", "__endTagCache")
def __init__(self, parser, tree):
self.parser = parser
self.tree = tree
self.__startTagCache = {}
self.__endTagCache = {}
def processEOF(self):
raise NotImplementedError
def processComment(self, token):
# For most phases the following is correct. Where it's not it will be
# overridden.
self.tree.insertComment(token, self.tree.openElements[-1])
def processDoctype(self, token):
self.parser.parseError("unexpected-doctype")
def processCharacters(self, token):
self.tree.insertText(token["data"])
def processSpaceCharacters(self, token):
self.tree.insertText(token["data"])
def processStartTag(self, token):
# Note the caching is done here rather than BoundMethodDispatcher as doing it there
# requires a circular reference to the Phase, and this ends up with a significant
# (CPython 2.7, 3.8) GC cost when parsing many short inputs
name = token["name"]
# In Py2, using `in` is quicker in general than try/except KeyError
# In Py3, `in` is quicker when there are few cache hits (typically short inputs)
if name in self.__startTagCache:
func = self.__startTagCache[name]
else:
func = self.__startTagCache[name] = self.startTagHandler[name]
# bound the cache size in case we get loads of unknown tags
while len(self.__startTagCache) > len(self.startTagHandler) * 1.1:
# this makes the eviction policy random on Py < 3.7 and FIFO >= 3.7
self.__startTagCache.pop(next(iter(self.__startTagCache)))
return func(token)
def startTagHtml(self, token):
if not self.parser.firstStartTag and token["name"] == "html":
self.parser.parseError("non-html-root")
# XXX Need a check here to see if the first start tag token emitted is
# this token... If it's not, invoke self.parser.parseError().
for attr, value in token["data"].items():
if attr not in self.tree.openElements[0].attributes:
self.tree.openElements[0].attributes[attr] = value
self.parser.firstStartTag = False
def processEndTag(self, token):
# Note the caching is done here rather than BoundMethodDispatcher as doing it there
# requires a circular reference to the Phase, and this ends up with a significant
# (CPython 2.7, 3.8) GC cost when parsing many short inputs
name = token["name"]
# In Py2, using `in` is quicker in general than try/except KeyError
# In Py3, `in` is quicker when there are few cache hits (typically short inputs)
if name in self.__endTagCache:
func = self.__endTagCache[name]
else:
func = self.__endTagCache[name] = self.endTagHandler[name]
# bound the cache size in case we get loads of unknown tags
while len(self.__endTagCache) > len(self.endTagHandler) * 1.1:
# this makes the eviction policy random on Py < 3.7 and FIFO >= 3.7
self.__endTagCache.pop(next(iter(self.__endTagCache)))
return func(token)
class InitialPhase(Phase):
__slots__ = tuple()
def processSpaceCharacters(self, token):
pass
def processComment(self, token):
self.tree.insertComment(token, self.tree.document)
def processDoctype(self, token):
name = token["name"]
publicId = token["publicId"]
systemId = token["systemId"]
correct = token["correct"]
if (name != "html" or publicId is not None or
systemId is not None and systemId != "about:legacy-compat"):
self.parser.parseError("unknown-doctype")
if publicId is None:
publicId = ""
self.tree.insertDoctype(token)
if publicId != "":
publicId = publicId.translate(asciiUpper2Lower)
if (not correct or token["name"] != "html" or
publicId.startswith(
("+//silmaril//dtd html pro v0r11 19970101//",
"-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
"-//as//dtd html 3.0 aswedit + extensions//",
"-//ietf//dtd html 2.0 level 1//",
"-//ietf//dtd html 2.0 level 2//",
"-//ietf//dtd html 2.0 strict level 1//",
"-//ietf//dtd html 2.0 strict level 2//",
"-//ietf//dtd html 2.0 strict//",
"-//ietf//dtd html 2.0//",
"-//ietf//dtd html 2.1e//",
"-//ietf//dtd html 3.0//",
"-//ietf//dtd html 3.2 final//",
"-//ietf//dtd html 3.2//",
"-//ietf//dtd html 3//",
"-//ietf//dtd html level 0//",
"-//ietf//dtd html level 1//",
"-//ietf//dtd html level 2//",
"-//ietf//dtd html level 3//",
"-//ietf//dtd html strict level 0//",
"-//ietf//dtd html strict level 1//",
"-//ietf//dtd html strict level 2//",
"-//ietf//dtd html strict level 3//",
"-//ietf//dtd html strict//",
"-//ietf//dtd html//",
"-//metrius//dtd metrius presentational//",
"-//microsoft//dtd internet explorer 2.0 html strict//",
"-//microsoft//dtd internet explorer 2.0 html//",
"-//microsoft//dtd internet explorer 2.0 tables//",
"-//microsoft//dtd internet explorer 3.0 html strict//",
"-//microsoft//dtd internet explorer 3.0 html//",
"-//microsoft//dtd internet explorer 3.0 tables//",
"-//netscape comm. corp.//dtd html//",
"-//netscape comm. corp.//dtd strict html//",
"-//o'reilly and associates//dtd html 2.0//",
"-//o'reilly and associates//dtd html extended 1.0//",
"-//o'reilly and associates//dtd html extended relaxed 1.0//",
"-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
"-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
"-//spyglass//dtd html 2.0 extended//",
"-//sq//dtd html 2.0 hotmetal + extensions//",
"-//sun microsystems corp.//dtd hotjava html//",
"-//sun microsystems corp.//dtd hotjava strict html//",
"-//w3c//dtd html 3 1995-03-24//",
"-//w3c//dtd html 3.2 draft//",
"-//w3c//dtd html 3.2 final//",
"-//w3c//dtd html 3.2//",
"-//w3c//dtd html 3.2s draft//",
"-//w3c//dtd html 4.0 frameset//",
"-//w3c//dtd html 4.0 transitional//",
"-//w3c//dtd html experimental 19960712//",
"-//w3c//dtd html experimental 970421//",
"-//w3c//dtd w3 html//",
"-//w3o//dtd w3 html 3.0//",
"-//webtechs//dtd mozilla html 2.0//",
"-//webtechs//dtd mozilla html//")) or
publicId in ("-//w3o//dtd w3 html strict 3.0//en//",
"-/w3c/dtd html 4.0 transitional/en",
"html") or
publicId.startswith(
("-//w3c//dtd html 4.01 frameset//",
"-//w3c//dtd html 4.01 transitional//")) and
systemId is None or
systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"):
self.parser.compatMode = "quirks"
elif (publicId.startswith(
("-//w3c//dtd xhtml 1.0 frameset//",
"-//w3c//dtd xhtml 1.0 transitional//")) or
publicId.startswith(
("-//w3c//dtd html 4.01 frameset//",
"-//w3c//dtd html 4.01 transitional//")) and
systemId is not None):
self.parser.compatMode = "limited quirks"
self.parser.phase = self.parser.phases["beforeHtml"]
def anythingElse(self):
self.parser.compatMode = "quirks"
self.parser.phase = self.parser.phases["beforeHtml"]
def processCharacters(self, token):
self.parser.parseError("expected-doctype-but-got-chars")
self.anythingElse()
return token
def processStartTag(self, token):
self.parser.parseError("expected-doctype-but-got-start-tag",
{"name": token["name"]})
self.anythingElse()
return token
def processEndTag(self, token):
self.parser.parseError("expected-doctype-but-got-end-tag",
{"name": token["name"]})
self.anythingElse()
return token
def processEOF(self):
self.parser.parseError("expected-doctype-but-got-eof")
self.anythingElse()
return True
class BeforeHtmlPhase(Phase):
__slots__ = tuple()
# helper methods
def insertHtmlElement(self):
self.tree.insertRoot(impliedTagToken("html", "StartTag"))
self.parser.phase = self.parser.phases["beforeHead"]
# other
def processEOF(self):
self.insertHtmlElement()
return True
def processComment(self, token):
self.tree.insertComment(token, self.tree.document)
def processSpaceCharacters(self, token):
pass
def processCharacters(self, token):
self.insertHtmlElement()
return token
def processStartTag(self, token):
if token["name"] == "html":
self.parser.firstStartTag = True
self.insertHtmlElement()
return token
def processEndTag(self, token):
if token["name"] not in ("head", "body", "html", "br"):
self.parser.parseError("unexpected-end-tag-before-html",
{"name": token["name"]})
else:
self.insertHtmlElement()
return token
class BeforeHeadPhase(Phase):
__slots__ = tuple()
def processEOF(self):
self.startTagHead(impliedTagToken("head", "StartTag"))
return True
def processSpaceCharacters(self, token):
pass
def processCharacters(self, token):
self.startTagHead(impliedTagToken("head", "StartTag"))
return token
def startTagHtml(self, token):
return self.parser.phases["inBody"].processStartTag(token)
def startTagHead(self, token):
self.tree.insertElement(token)
self.tree.headPointer = self.tree.openElements[-1]
self.parser.phase = self.parser.phases["inHead"]
def startTagOther(self, token):
self.startTagHead(impliedTagToken("head", "StartTag"))
return token
def endTagImplyHead(self, token):
self.startTagHead(impliedTagToken("head", "StartTag"))
return token
def endTagOther(self, token):
self.parser.parseError("end-tag-after-implied-root",
{"name": token["name"]})
startTagHandler = _utils.MethodDispatcher([
("html", startTagHtml),
("head", startTagHead)
])
startTagHandler.default = startTagOther
endTagHandler = _utils.MethodDispatcher([
(("head", "body", "html", "br"), endTagImplyHead)
])
endTagHandler.default = endTagOther
class InHeadPhase(Phase):
__slots__ = tuple()
# the real thing
def processEOF(self):
self.anythingElse()
return True
def processCharacters(self, token):
self.anythingElse()
return token
def startTagHtml(self, token):
return self.parser.phases["inBody"].processStartTag(token)
def startTagHead(self, token):
self.parser.parseError("two-heads-are-not-better-than-one")
def startTagBaseLinkCommand(self, token):
self.tree.insertElement(token)
self.tree.openElements.pop()
token["selfClosingAcknowledged"] = True
def startTagMeta(self, token):
self.tree.insertElement(token)
self.tree.openElements.pop()
token["selfClosingAcknowledged"] = True
attributes = token["data"]
if self.parser.tokenizer.stream.charEncoding[1] == "tentative":
if "charset" in attributes:
self.parser.tokenizer.stream.changeEncoding(attributes["charset"])
elif ("content" in attributes and
"http-equiv" in attributes and
attributes["http-equiv"].lower() == "content-type"):
# Encoding it as UTF-8 here is a hack, as really we should pass
# the abstract Unicode string, and just use the
# ContentAttrParser on that, but using UTF-8 allows all chars
# to be encoded and as a ASCII-superset works.
data = _inputstream.EncodingBytes(attributes["content"].encode("utf-8"))
parser = _inputstream.ContentAttrParser(data)
codec = parser.parse()
self.parser.tokenizer.stream.changeEncoding(codec)
def startTagTitle(self, token):
self.parser.parseRCDataRawtext(token, "RCDATA")
def startTagNoFramesStyle(self, token):
# Need to decide whether to implement the scripting-disabled case
self.parser.parseRCDataRawtext(token, "RAWTEXT")
def startTagNoscript(self, token):
if self.parser.scripting:
self.parser.parseRCDataRawtext(token, "RAWTEXT")
else:
self.tree.insertElement(token)
self.parser.phase = self.parser.phases["inHeadNoscript"]
def startTagScript(self, token):
self.tree.insertElement(token)
self.parser.tokenizer.state = self.parser.tokenizer.scriptDataState
self.parser.originalPhase = self.parser.phase
self.parser.phase = self.parser.phases["text"]
def startTagOther(self, token):
self.anythingElse()
return token
def endTagHead(self, token):
node = self.parser.tree.openElements.pop()
assert node.name == "head", "Expected head got %s" % node.name
self.parser.phase = self.parser.phases["afterHead"]
def endTagHtmlBodyBr(self, token):
self.anythingElse()
return token
def endTagOther(self, token):
self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
def anythingElse(self):
self.endTagHead(impliedTagToken("head"))
startTagHandler = _utils.MethodDispatcher([
("html", startTagHtml),
("title", startTagTitle),
(("noframes", "style"), startTagNoFramesStyle),
("noscript", startTagNoscript),
("script", startTagScript),
(("base", "basefont", "bgsound", "command", "link"),
startTagBaseLinkCommand),
("meta", startTagMeta),
("head", startTagHead)
])
startTagHandler.default = startTagOther
endTagHandler = _utils.MethodDispatcher([
("head", endTagHead),
(("br", "html", "body"), endTagHtmlBodyBr)
])
endTagHandler.default = endTagOther
class InHeadNoscriptPhase(Phase):
__slots__ = tuple()
def processEOF(self):
self.parser.parseError("eof-in-head-noscript")
self.anythingElse()
return True
def processComment(self, token):
return self.parser.phases["inHead"].processComment(token)
def processCharacters(self, token):
self.parser.parseError("char-in-head-noscript")
self.anythingElse()
return token
def processSpaceCharacters(self, token):
return self.parser.phases["inHead"].processSpaceCharacters(token)
def startTagHtml(self, token):
return self.parser.phases["inBody"].processStartTag(token)
def startTagBaseLinkCommand(self, token):
return self.parser.phases["inHead"].processStartTag(token)
def startTagHeadNoscript(self, token):
self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
def startTagOther(self, token):
self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]})
self.anythingElse()
return token
def endTagNoscript(self, token):
node = self.parser.tree.openElements.pop()
assert node.name == "noscript", "Expected noscript got %s" % node.name
self.parser.phase = self.parser.phases["inHead"]
def endTagBr(self, token):
self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]})
self.anythingElse()
return token
def endTagOther(self, token):
self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
def anythingElse(self):
# Caller must raise parse error first!
self.endTagNoscript(impliedTagToken("noscript"))
startTagHandler = _utils.MethodDispatcher([
("html", startTagHtml),
(("basefont", "bgsound", "link", "meta", "noframes", "style"), startTagBaseLinkCommand),
(("head", "noscript"), startTagHeadNoscript),
])
startTagHandler.default = startTagOther
endTagHandler = _utils.MethodDispatcher([
("noscript", endTagNoscript),
("br", endTagBr),
])
endTagHandler.default = endTagOther
class AfterHeadPhase(Phase):
__slots__ = tuple()
def processEOF(self):
self.anythingElse()
return True
def processCharacters(self, token):
self.anythingElse()
return token
def startTagHtml(self, token):
return self.parser.phases["inBody"].processStartTag(token)
def startTagBody(self, token):
self.parser.framesetOK = False
self.tree.insertElement(token)
self.parser.phase = self.parser.phases["inBody"]
def startTagFrameset(self, token):
self.tree.insertElement(token)
self.parser.phase = self.parser.phases["inFrameset"]
def startTagFromHead(self, token):
self.parser.parseError("unexpected-start-tag-out-of-my-head",
{"name": token["name"]})
self.tree.openElements.append(self.tree.headPointer)
self.parser.phases["inHead"].processStartTag(token)
for node in self.tree.openElements[::-1]:
if node.name == "head":
self.tree.openElements.remove(node)
break
def startTagHead(self, token):
self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
def startTagOther(self, token):
self.anythingElse()
return token
def endTagHtmlBodyBr(self, token):
self.anythingElse()
return token
def endTagOther(self, token):
self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
def anythingElse(self):
self.tree.insertElement(impliedTagToken("body", "StartTag"))
self.parser.phase = self.parser.phases["inBody"]
self.parser.framesetOK = True
startTagHandler = _utils.MethodDispatcher([
("html", startTagHtml),
("body", startTagBody),
("frameset", startTagFrameset),
(("base", "basefont", "bgsound", "link", "meta", "noframes", "script",
"style", "title"),
startTagFromHead),
("head", startTagHead)
])
startTagHandler.default = startTagOther
endTagHandler = _utils.MethodDispatcher([(("body", "html", "br"),
endTagHtmlBodyBr)])
endTagHandler.default = endTagOther
class InBodyPhase(Phase):
# http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody
# the really-really-really-very crazy mode
__slots__ = ("processSpaceCharacters",)
def __init__(self, *args, **kwargs):
super(InBodyPhase, self).__init__(*args, **kwargs)
# Set this to the default handler
self.processSpaceCharacters = self.processSpaceCharactersNonPre
def isMatchingFormattingElement(self, node1, node2):
return (node1.name == node2.name and
node1.namespace == node2.namespace and
node1.attributes == node2.attributes)
# helper
def addFormattingElement(self, token):
self.tree.insertElement(token)
element = self.tree.openElements[-1]
matchingElements = []
for node in self.tree.activeFormattingElements[::-1]:
if node is Marker:
break
elif self.isMatchingFormattingElement(node, element):
matchingElements.append(node)
assert len(matchingElements) <= 3
if len(matchingElements) == 3:
self.tree.activeFormattingElements.remove(matchingElements[-1])
self.tree.activeFormattingElements.append(element)
# the real deal
def processEOF(self):
allowed_elements = frozenset(("dd", "dt", "li", "p", "tbody", "td",
"tfoot", "th", "thead", "tr", "body",
"html"))
for node in self.tree.openElements[::-1]:
if node.name not in allowed_elements:
self.parser.parseError("expected-closing-tag-but-got-eof")
break
# Stop parsing
def processSpaceCharactersDropNewline(self, token):
# Sometimes (start of <pre>, <listing>, and <textarea> blocks) we
# want to drop leading newlines
data = token["data"]
self.processSpaceCharacters = self.processSpaceCharactersNonPre
if (data.startswith("\n") and
self.tree.openElements[-1].name in ("pre", "listing", "textarea") and
not self.tree.openElements[-1].hasContent()):
data = data[1:]
if data:
self.tree.reconstructActiveFormattingElements()
self.tree.insertText(data)
def processCharacters(self, token):
if token["data"] == "\u0000":
# The tokenizer should always emit null on its own
return
self.tree.reconstructActiveFormattingElements()
self.tree.insertText(token["data"])
# This must be bad for performance
if (self.parser.framesetOK and
any([char not in spaceCharacters
for char in token["data"]])):
self.parser.framesetOK = False
def processSpaceCharactersNonPre(self, token):
self.tree.reconstructActiveFormattingElements()
self.tree.insertText(token["data"])
def startTagProcessInHead(self, token):
return self.parser.phases["inHead"].processStartTag(token)
def startTagBody(self, token):
self.parser.parseError("unexpected-start-tag", {"name": "body"})
if (len(self.tree.openElements) == 1 or
self.tree.openElements[1].name != "body"):
assert self.parser.innerHTML
else:
self.parser.framesetOK = False
for attr, value in token["data"].items():
if attr not in self.tree.openElements[1].attributes:
self.tree.openElements[1].attributes[attr] = value
def startTagFrameset(self, token):
self.parser.parseError("unexpected-start-tag", {"name": "frameset"})
if (len(self.tree.openElements) == 1 or self.tree.openElements[1].name != "body"):
assert self.parser.innerHTML
elif not self.parser.framesetOK:
pass
else:
if self.tree.openElements[1].parent:
self.tree.openElements[1].parent.removeChild(self.tree.openElements[1])
while self.tree.openElements[-1].name != "html":
self.tree.openElements.pop()
self.tree.insertElement(token)
self.parser.phase = self.parser.phases["inFrameset"]
def startTagCloseP(self, token):
if self.tree.elementInScope("p", variant="button"):
self.endTagP(impliedTagToken("p"))
self.tree.insertElement(token)
def startTagPreListing(self, token):
if self.tree.elementInScope("p", variant="button"):
self.endTagP(impliedTagToken("p"))
self.tree.insertElement(token)
self.parser.framesetOK = False
self.processSpaceCharacters = self.processSpaceCharactersDropNewline
def startTagForm(self, token):
if self.tree.formPointer:
self.parser.parseError("unexpected-start-tag", {"name": "form"})
else:
if self.tree.elementInScope("p", variant="button"):
self.endTagP(impliedTagToken("p"))
self.tree.insertElement(token)
self.tree.formPointer = self.tree.openElements[-1]
def startTagListItem(self, token):
self.parser.framesetOK = False
stopNamesMap = {"li": ["li"],
"dt": ["dt", "dd"],
"dd": ["dt", "dd"]}
stopNames = stopNamesMap[token["name"]]
for node in reversed(self.tree.openElements):
if node.name in stopNames:
self.parser.phase.processEndTag(
impliedTagToken(node.name, "EndTag"))
break
if (node.nameTuple in specialElements and
node.name not in ("address", "div", "p")):
break
if self.tree.elementInScope("p", variant="button"):
self.parser.phase.processEndTag(
impliedTagToken("p", "EndTag"))
self.tree.insertElement(token)
def startTagPlaintext(self, token):
if self.tree.elementInScope("p", variant="button"):
self.endTagP(impliedTagToken("p"))
self.tree.insertElement(token)
self.parser.tokenizer.state = self.parser.tokenizer.plaintextState
def startTagHeading(self, token):
if self.tree.elementInScope("p", variant="button"):
self.endTagP(impliedTagToken("p"))
if self.tree.openElements[-1].name in headingElements:
self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
self.tree.openElements.pop()
self.tree.insertElement(token)
def startTagA(self, token):
afeAElement = self.tree.elementInActiveFormattingElements("a")
if afeAElement:
self.parser.parseError("unexpected-start-tag-implies-end-tag",
{"startName": "a", "endName": "a"})
self.endTagFormatting(impliedTagToken("a"))
if afeAElement in self.tree.openElements:
self.tree.openElements.remove(afeAElement)
if afeAElement in self.tree.activeFormattingElements:
self.tree.activeFormattingElements.remove(afeAElement)
self.tree.reconstructActiveFormattingElements()
self.addFormattingElement(token)
def startTagFormatting(self, token):
self.tree.reconstructActiveFormattingElements()
self.addFormattingElement(token)
def startTagNobr(self, token):
self.tree.reconstructActiveFormattingElements()
if self.tree.elementInScope("nobr"):
self.parser.parseError("unexpected-start-tag-implies-end-tag",
{"startName": "nobr", "endName": "nobr"})
self.processEndTag(impliedTagToken("nobr"))
# XXX Need tests that trigger the following
self.tree.reconstructActiveFormattingElements()
self.addFormattingElement(token)
def startTagButton(self, token):
if self.tree.elementInScope("button"):
self.parser.parseError("unexpected-start-tag-implies-end-tag",
{"startName": "button", "endName": "button"})
self.processEndTag(impliedTagToken("button"))
return token
else:
self.tree.reconstructActiveFormattingElements()
self.tree.insertElement(token)
self.parser.framesetOK = False
def startTagAppletMarqueeObject(self, token):
self.tree.reconstructActiveFormattingElements()
self.tree.insertElement(token)
self.tree.activeFormattingElements.append(Marker)
self.parser.framesetOK = False
def startTagXmp(self, token):
if self.tree.elementInScope("p", variant="button"):
self.endTagP(impliedTagToken("p"))
self.tree.reconstructActiveFormattingElements()
self.parser.framesetOK = False
self.parser.parseRCDataRawtext(token, "RAWTEXT")
def startTagTable(self, token):
if self.parser.compatMode != "quirks":
if self.tree.elementInScope("p", variant="button"):
self.processEndTag(impliedTagToken("p"))
self.tree.insertElement(token)
self.parser.framesetOK = False
self.parser.phase = self.parser.phases["inTable"]
def startTagVoidFormatting(self, token):
self.tree.reconstructActiveFormattingElements()
self.tree.insertElement(token)
self.tree.openElements.pop()
token["selfClosingAcknowledged"] = True
self.parser.framesetOK = False
def startTagInput(self, token):
framesetOK = self.parser.framesetOK
self.startTagVoidFormatting(token)
if ("type" in token["data"] and
token["data"]["type"].translate(asciiUpper2Lower) == "hidden"):
# input type=hidden doesn't change framesetOK
self.parser.framesetOK = framesetOK
def startTagParamSource(self, token):
self.tree.insertElement(token)
self.tree.openElements.pop()
token["selfClosingAcknowledged"] = True
def startTagHr(self, token):
if self.tree.elementInScope("p", variant="button"):
self.endTagP(impliedTagToken("p"))
self.tree.insertElement(token)
self.tree.openElements.pop()
token["selfClosingAcknowledged"] = True
self.parser.framesetOK = False
def startTagImage(self, token):
# No really...
self.parser.parseError("unexpected-start-tag-treated-as",
{"originalName": "image", "newName": "img"})
self.processStartTag(impliedTagToken("img", "StartTag",
attributes=token["data"],
selfClosing=token["selfClosing"]))
def startTagIsIndex(self, token):
self.parser.parseError("deprecated-tag", {"name": "isindex"})
if self.tree.formPointer:
return
form_attrs = {}
if "action" in token["data"]:
form_attrs["action"] = token["data"]["action"]
self.processStartTag(impliedTagToken("form", "StartTag",
attributes=form_attrs))
self.processStartTag(impliedTagToken("hr", "StartTag"))
self.processStartTag(impliedTagToken("label", "StartTag"))
# XXX Localization ...
if "prompt" in token["data"]:
prompt = token["data"]["prompt"]
else:
prompt = "This is a searchable index. Enter search keywords: "
self.processCharacters(
{"type": tokenTypes["Characters"], "data": prompt})
attributes = token["data"].copy()
if "action" in attributes:
del attributes["action"]
if "prompt" in attributes:
del attributes["prompt"]
attributes["name"] = "isindex"
self.processStartTag(impliedTagToken("input", "StartTag",
attributes=attributes,
selfClosing=token["selfClosing"]))
self.processEndTag(impliedTagToken("label"))
self.processStartTag(impliedTagToken("hr", "StartTag"))
self.processEndTag(impliedTagToken("form"))
def startTagTextarea(self, token):
self.tree.insertElement(token)
self.parser.tokenizer.state = self.parser.tokenizer.rcdataState
self.processSpaceCharacters = self.processSpaceCharactersDropNewline
self.parser.framesetOK = False
def startTagIFrame(self, token):
self.parser.framesetOK = False
self.startTagRawtext(token)
def startTagNoscript(self, token):
if self.parser.scripting:
self.startTagRawtext(token)
else:
self.startTagOther(token)
def startTagRawtext(self, token):
"""iframe, noembed noframes, noscript(if scripting enabled)"""
self.parser.parseRCDataRawtext(token, "RAWTEXT")
def startTagOpt(self, token):
if self.tree.openElements[-1].name == "option":
self.parser.phase.processEndTag(impliedTagToken("option"))
self.tree.reconstructActiveFormattingElements()
self.parser.tree.insertElement(token)
def startTagSelect(self, token):
self.tree.reconstructActiveFormattingElements()
self.tree.insertElement(token)
self.parser.framesetOK = False
if self.parser.phase in (self.parser.phases["inTable"],
self.parser.phases["inCaption"],
self.parser.phases["inColumnGroup"],
self.parser.phases["inTableBody"],
self.parser.phases["inRow"],
self.parser.phases["inCell"]):
self.parser.phase = self.parser.phases["inSelectInTable"]
else:
self.parser.phase = self.parser.phases["inSelect"]
def startTagRpRt(self, token):
if self.tree.elementInScope("ruby"):
self.tree.generateImpliedEndTags()
if self.tree.openElements[-1].name != "ruby":
self.parser.parseError()
self.tree.insertElement(token)
def startTagMath(self, token):
self.tree.reconstructActiveFormattingElements()
self.parser.adjustMathMLAttributes(token)
self.parser.adjustForeignAttributes(token)
token["namespace"] = namespaces["mathml"]
self.tree.insertElement(token)
# Need to get the parse error right for the case where the token
# has a namespace not equal to the xmlns attribute
if token["selfClosing"]:
self.tree.openElements.pop()
token["selfClosingAcknowledged"] = True
def startTagSvg(self, token):
self.tree.reconstructActiveFormattingElements()
self.parser.adjustSVGAttributes(token)
self.parser.adjustForeignAttributes(token)
token["namespace"] = namespaces["svg"]
self.tree.insertElement(token)
# Need to get the parse error right for the case where the token
# has a namespace not equal to the xmlns attribute
if token["selfClosing"]:
self.tree.openElements.pop()
token["selfClosingAcknowledged"] = True
def startTagMisplaced(self, token):
""" Elements that should be children of other elements that have a
different insertion mode; here they are ignored
"caption", "col", "colgroup", "frame", "frameset", "head",
"option", "optgroup", "tbody", "td", "tfoot", "th", "thead",
"tr", "noscript"
"""
self.parser.parseError("unexpected-start-tag-ignored", {"name": token["name"]})
def startTagOther(self, token):
self.tree.reconstructActiveFormattingElements()
self.tree.insertElement(token)
def endTagP(self, token):
if not self.tree.elementInScope("p", variant="button"):
self.startTagCloseP(impliedTagToken("p", "StartTag"))
self.parser.parseError("unexpected-end-tag", {"name": "p"})
self.endTagP(impliedTagToken("p", "EndTag"))
else:
self.tree.generateImpliedEndTags("p")
if self.tree.openElements[-1].name != "p":
self.parser.parseError("unexpected-end-tag", {"name": "p"})
node = self.tree.openElements.pop()
while node.name != "p":
node = self.tree.openElements.pop()
def endTagBody(self, token):
if not self.tree.elementInScope("body"):
self.parser.parseError()
return
elif self.tree.openElements[-1].name != "body":
for node in self.tree.openElements[2:]:
if node.name not in frozenset(("dd", "dt", "li", "optgroup",
"option", "p", "rp", "rt",
"tbody", "td", "tfoot",
"th", "thead", "tr", "body",
"html")):
# Not sure this is the correct name for the parse error
self.parser.parseError(
"expected-one-end-tag-but-got-another",
{"gotName": "body", "expectedName": node.name})
break
self.parser.phase = self.parser.phases["afterBody"]
def endTagHtml(self, token):
# We repeat the test for the body end tag token being ignored here
if self.tree.elementInScope("body"):
self.endTagBody(impliedTagToken("body"))
return token
def endTagBlock(self, token):
# Put us back in the right whitespace handling mode
if token["name"] == "pre":
self.processSpaceCharacters = self.processSpaceCharactersNonPre
inScope = self.tree.elementInScope(token["name"])
if inScope:
self.tree.generateImpliedEndTags()
if self.tree.openElements[-1].name != token["name"]:
self.parser.parseError("end-tag-too-early", {"name": token["name"]})
if inScope:
node = self.tree.openElements.pop()
while node.name != token["name"]:
node = self.tree.openElements.pop()
def endTagForm(self, token):
node = self.tree.formPointer
self.tree.formPointer = None
if node is None or not self.tree.elementInScope(node):
self.parser.parseError("unexpected-end-tag",
{"name": "form"})
else:
self.tree.generateImpliedEndTags()
if self.tree.openElements[-1] != node:
self.parser.parseError("end-tag-too-early-ignored",
{"name": "form"})
self.tree.openElements.remove(node)
def endTagListItem(self, token):
if token["name"] == "li":
variant = "list"
else:
variant = None
if not self.tree.elementInScope(token["name"], variant=variant):
self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
else:
self.tree.generateImpliedEndTags(exclude=token["name"])
if self.tree.openElements[-1].name != token["name"]:
self.parser.parseError(
"end-tag-too-early",
{"name": token["name"]})
node = self.tree.openElements.pop()
while node.name != token["name"]:
node = self.tree.openElements.pop()
def endTagHeading(self, token):
for item in headingElements:
if self.tree.elementInScope(item):
self.tree.generateImpliedEndTags()
break
if self.tree.openElements[-1].name != token["name"]:
self.parser.parseError("end-tag-too-early", {"name": token["name"]})
for item in headingElements:
if self.tree.elementInScope(item):
item = self.tree.openElements.pop()
while item.name not in headingElements:
item = self.tree.openElements.pop()
break
def endTagFormatting(self, token):
"""The much-feared adoption agency algorithm"""
# http://svn.whatwg.org/webapps/complete.html#adoptionAgency revision 7867
# XXX Better parseError messages appreciated.
# Step 1
outerLoopCounter = 0
# Step 2
while outerLoopCounter < 8:
# Step 3
outerLoopCounter += 1
# Step 4:
# Let the formatting element be the last element in
# the list of active formatting elements that:
# - is between the end of the list and the last scope
# marker in the list, if any, or the start of the list
# otherwise, and
# - has the same tag name as the token.
formattingElement = self.tree.elementInActiveFormattingElements(
token["name"])
if (not formattingElement or
(formattingElement in self.tree.openElements and
not self.tree.elementInScope(formattingElement.name))):
# If there is no such node, then abort these steps
# and instead act as described in the "any other
# end tag" entry below.
self.endTagOther(token)
return
# Otherwise, if there is such a node, but that node is
# not in the stack of open elements, then this is a
# parse error; remove the element from the list, and
# abort these steps.
elif formattingElement not in self.tree.openElements:
self.parser.parseError("adoption-agency-1.2", {"name": token["name"]})
self.tree.activeFormattingElements.remove(formattingElement)
return
# Otherwise, if there is such a node, and that node is
# also in the stack of open elements, but the element
# is not in scope, then this is a parse error; ignore
# the token, and abort these steps.
elif not self.tree.elementInScope(formattingElement.name):
self.parser.parseError("adoption-agency-4.4", {"name": token["name"]})
return
# Otherwise, there is a formatting element and that
# element is in the stack and is in scope. If the
# element is not the current node, this is a parse
# error. In any case, proceed with the algorithm as
# written in the following steps.
else:
if formattingElement != self.tree.openElements[-1]:
self.parser.parseError("adoption-agency-1.3", {"name": token["name"]})
# Step 5:
# Let the furthest block be the topmost node in the
# stack of open elements that is lower in the stack
# than the formatting element, and is an element in
# the special category. There might not be one.
afeIndex = self.tree.openElements.index(formattingElement)
furthestBlock = None
for element in self.tree.openElements[afeIndex:]:
if element.nameTuple in specialElements:
furthestBlock = element
break
# Step 6:
# If there is no furthest block, then the UA must
# first pop all the nodes from the bottom of the stack
# of open elements, from the current node up to and
# including the formatting element, then remove the
# formatting element from the list of active
# formatting elements, and finally abort these steps.
if furthestBlock is None:
element = self.tree.openElements.pop()
while element != formattingElement:
element = self.tree.openElements.pop()
self.tree.activeFormattingElements.remove(element)
return
# Step 7
commonAncestor = self.tree.openElements[afeIndex - 1]
# Step 8:
# The bookmark is supposed to help us identify where to reinsert
# nodes in step 15. We have to ensure that we reinsert nodes after
# the node before the active formatting element. Note the bookmark
# can move in step 9.7
bookmark = self.tree.activeFormattingElements.index(formattingElement)
# Step 9
lastNode = node = furthestBlock
innerLoopCounter = 0
index = self.tree.openElements.index(node)
while innerLoopCounter < 3:
innerLoopCounter += 1
# Node is element before node in open elements
index -= 1
node = self.tree.openElements[index]
if node not in self.tree.activeFormattingElements:
self.tree.openElements.remove(node)
continue
# Step 9.6
if node == formattingElement:
break
# Step 9.7
if lastNode == furthestBlock:
bookmark = self.tree.activeFormattingElements.index(node) + 1
# Step 9.8
clone = node.cloneNode()
# Replace node with clone
self.tree.activeFormattingElements[
self.tree.activeFormattingElements.index(node)] = clone
self.tree.openElements[
self.tree.openElements.index(node)] = clone
node = clone
# Step 9.9
# Remove lastNode from its parents, if any
if lastNode.parent:
lastNode.parent.removeChild(lastNode)
node.appendChild(lastNode)
# Step 9.10
lastNode = node
# Step 10
# Foster parent lastNode if commonAncestor is a
# table, tbody, tfoot, thead, or tr we need to foster
# parent the lastNode
if lastNode.parent:
lastNode.parent.removeChild(lastNode)
if commonAncestor.name in frozenset(("table", "tbody", "tfoot", "thead", "tr")):
parent, insertBefore = self.tree.getTableMisnestedNodePosition()
parent.insertBefore(lastNode, insertBefore)
else:
commonAncestor.appendChild(lastNode)
# Step 11
clone = formattingElement.cloneNode()
# Step 12
furthestBlock.reparentChildren(clone)
# Step 13
furthestBlock.appendChild(clone)
# Step 14
self.tree.activeFormattingElements.remove(formattingElement)
self.tree.activeFormattingElements.insert(bookmark, clone)
# Step 15
self.tree.openElements.remove(formattingElement)
self.tree.openElements.insert(
self.tree.openElements.index(furthestBlock) + 1, clone)
def endTagAppletMarqueeObject(self, token):
if self.tree.elementInScope(token["name"]):
self.tree.generateImpliedEndTags()
if self.tree.openElements[-1].name != token["name"]:
self.parser.parseError("end-tag-too-early", {"name": token["name"]})
if self.tree.elementInScope(token["name"]):
element = self.tree.openElements.pop()
while element.name != token["name"]:
element = self.tree.openElements.pop()
self.tree.clearActiveFormattingElements()
def endTagBr(self, token):
self.parser.parseError("unexpected-end-tag-treated-as",
{"originalName": "br", "newName": "br element"})
self.tree.reconstructActiveFormattingElements()
self.tree.insertElement(impliedTagToken("br", "StartTag"))
self.tree.openElements.pop()
def endTagOther(self, token):
for node in self.tree.openElements[::-1]:
if node.name == token["name"]:
self.tree.generateImpliedEndTags(exclude=token["name"])
if self.tree.openElements[-1].name != token["name"]:
self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
while self.tree.openElements.pop() != node:
pass
break
else:
if node.nameTuple in specialElements:
self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
break
startTagHandler = _utils.MethodDispatcher([
("html", Phase.startTagHtml),
(("base", "basefont", "bgsound", "command", "link", "meta",
"script", "style", "title"),
startTagProcessInHead),
("body", startTagBody),
("frameset", startTagFrameset),
(("address", "article", "aside", "blockquote", "center", "details",
"dir", "div", "dl", "fieldset", "figcaption", "figure",
"footer", "header", "hgroup", "main", "menu", "nav", "ol", "p",
"section", "summary", "ul"),
startTagCloseP),
(headingElements, startTagHeading),
(("pre", "listing"), startTagPreListing),
("form", startTagForm),
(("li", "dd", "dt"), startTagListItem),
("plaintext", startTagPlaintext),
("a", startTagA),
(("b", "big", "code", "em", "font", "i", "s", "small", "strike",
"strong", "tt", "u"), startTagFormatting),
("nobr", startTagNobr),
("button", startTagButton),
(("applet", "marquee", "object"), startTagAppletMarqueeObject),
("xmp", startTagXmp),
("table", startTagTable),
(("area", "br", "embed", "img", "keygen", "wbr"),
startTagVoidFormatting),
(("param", "source", "track"), startTagParamSource),
("input", startTagInput),
("hr", startTagHr),
("image", startTagImage),
("isindex", startTagIsIndex),
("textarea", startTagTextarea),
("iframe", startTagIFrame),
("noscript", startTagNoscript),
(("noembed", "noframes"), startTagRawtext),
("select", startTagSelect),
(("rp", "rt"), startTagRpRt),
(("option", "optgroup"), startTagOpt),
(("math"), startTagMath),
(("svg"), startTagSvg),
(("caption", "col", "colgroup", "frame", "head",
"tbody", "td", "tfoot", "th", "thead",
"tr"), startTagMisplaced)
])
startTagHandler.default = startTagOther
endTagHandler = _utils.MethodDispatcher([
("body", endTagBody),
("html", endTagHtml),
(("address", "article", "aside", "blockquote", "button", "center",
"details", "dialog", "dir", "div", "dl", "fieldset", "figcaption", "figure",
"footer", "header", "hgroup", "listing", "main", "menu", "nav", "ol", "pre",
"section", "summary", "ul"), endTagBlock),
("form", endTagForm),
("p", endTagP),
(("dd", "dt", "li"), endTagListItem),
(headingElements, endTagHeading),
(("a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small",
"strike", "strong", "tt", "u"), endTagFormatting),
(("applet", "marquee", "object"), endTagAppletMarqueeObject),
("br", endTagBr),
])
endTagHandler.default = endTagOther
class TextPhase(Phase):
__slots__ = tuple()
def processCharacters(self, token):
self.tree.insertText(token["data"])
def processEOF(self):
self.parser.parseError("expected-named-closing-tag-but-got-eof",
{"name": self.tree.openElements[-1].name})
self.tree.openElements.pop()
self.parser.phase = self.parser.originalPhase
return True
def startTagOther(self, token):
assert False, "Tried to process start tag %s in RCDATA/RAWTEXT mode" % token['name']
def endTagScript(self, token):
node = self.tree.openElements.pop()
assert node.name == "script"
self.parser.phase = self.parser.originalPhase
# The rest of this method is all stuff that only happens if
# document.write works
def endTagOther(self, token):
self.tree.openElements.pop()
self.parser.phase = self.parser.originalPhase
startTagHandler = _utils.MethodDispatcher([])
startTagHandler.default = startTagOther
endTagHandler = _utils.MethodDispatcher([
("script", endTagScript)])
endTagHandler.default = endTagOther
class InTablePhase(Phase):
# http://www.whatwg.org/specs/web-apps/current-work/#in-table
__slots__ = tuple()
# helper methods
def clearStackToTableContext(self):
# "clear the stack back to a table context"
while self.tree.openElements[-1].name not in ("table", "html"):
# self.parser.parseError("unexpected-implied-end-tag-in-table",
# {"name": self.tree.openElements[-1].name})
self.tree.openElements.pop()
# When the current node is <html> it's an innerHTML case
# processing methods
def processEOF(self):
if self.tree.openElements[-1].name != "html":
self.parser.parseError("eof-in-table")
else:
assert self.parser.innerHTML
# Stop parsing
def processSpaceCharacters(self, token):
originalPhase = self.parser.phase
self.parser.phase = self.parser.phases["inTableText"]
self.parser.phase.originalPhase = originalPhase
self.parser.phase.processSpaceCharacters(token)
def processCharacters(self, token):
originalPhase = self.parser.phase
self.parser.phase = self.parser.phases["inTableText"]
self.parser.phase.originalPhase = originalPhase
self.parser.phase.processCharacters(token)
def insertText(self, token):
# If we get here there must be at least one non-whitespace character
# Do the table magic!
self.tree.insertFromTable = True
self.parser.phases["inBody"].processCharacters(token)
self.tree.insertFromTable = False
def startTagCaption(self, token):
self.clearStackToTableContext()
self.tree.activeFormattingElements.append(Marker)
self.tree.insertElement(token)
self.parser.phase = self.parser.phases["inCaption"]
def startTagColgroup(self, token):
self.clearStackToTableContext()
self.tree.insertElement(token)
self.parser.phase = self.parser.phases["inColumnGroup"]
def startTagCol(self, token):
self.startTagColgroup(impliedTagToken("colgroup", "StartTag"))
return token
def startTagRowGroup(self, token):
self.clearStackToTableContext()
self.tree.insertElement(token)
self.parser.phase = self.parser.phases["inTableBody"]
def startTagImplyTbody(self, token):
self.startTagRowGroup(impliedTagToken("tbody", "StartTag"))
return token
def startTagTable(self, token):
self.parser.parseError("unexpected-start-tag-implies-end-tag",
{"startName": "table", "endName": "table"})
self.parser.phase.processEndTag(impliedTagToken("table"))
if not self.parser.innerHTML:
return token
def startTagStyleScript(self, token):
return self.parser.phases["inHead"].processStartTag(token)
def startTagInput(self, token):
if ("type" in token["data"] and
token["data"]["type"].translate(asciiUpper2Lower) == "hidden"):
self.parser.parseError("unexpected-hidden-input-in-table")
self.tree.insertElement(token)
# XXX associate with form
self.tree.openElements.pop()
else:
self.startTagOther(token)
def startTagForm(self, token):
self.parser.parseError("unexpected-form-in-table")
if self.tree.formPointer is None:
self.tree.insertElement(token)
self.tree.formPointer = self.tree.openElements[-1]
self.tree.openElements.pop()
def startTagOther(self, token):
self.parser.parseError("unexpected-start-tag-implies-table-voodoo", {"name": token["name"]})
# Do the table magic!
self.tree.insertFromTable = True
self.parser.phases["inBody"].processStartTag(token)
self.tree.insertFromTable = False
def endTagTable(self, token):
if self.tree.elementInScope("table", variant="table"):
self.tree.generateImpliedEndTags()
if self.tree.openElements[-1].name != "table":
self.parser.parseError("end-tag-too-early-named",
{"gotName": "table",
"expectedName": self.tree.openElements[-1].name})
while self.tree.openElements[-1].name != "table":
self.tree.openElements.pop()
self.tree.openElements.pop()
self.parser.resetInsertionMode()
else:
# innerHTML case
assert self.parser.innerHTML
self.parser.parseError()
def endTagIgnore(self, token):
self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
def endTagOther(self, token):
self.parser.parseError("unexpected-end-tag-implies-table-voodoo", {"name": token["name"]})
# Do the table magic!
self.tree.insertFromTable = True
self.parser.phases["inBody"].processEndTag(token)
self.tree.insertFromTable = False
startTagHandler = _utils.MethodDispatcher([
("html", Phase.startTagHtml),
("caption", startTagCaption),
("colgroup", startTagColgroup),
("col", startTagCol),
(("tbody", "tfoot", "thead"), startTagRowGroup),
(("td", "th", "tr"), startTagImplyTbody),
("table", startTagTable),
(("style", "script"), startTagStyleScript),
("input", startTagInput),
("form", startTagForm)
])
startTagHandler.default = startTagOther
endTagHandler = _utils.MethodDispatcher([
("table", endTagTable),
(("body", "caption", "col", "colgroup", "html", "tbody", "td",
"tfoot", "th", "thead", "tr"), endTagIgnore)
])
endTagHandler.default = endTagOther
class InTableTextPhase(Phase):
__slots__ = ("originalPhase", "characterTokens")
def __init__(self, *args, **kwargs):
super(InTableTextPhase, self).__init__(*args, **kwargs)
self.originalPhase = None
self.characterTokens = []
def flushCharacters(self):
data = "".join([item["data"] for item in self.characterTokens])
if any([item not in spaceCharacters for item in data]):
token = {"type": tokenTypes["Characters"], "data": data}
self.parser.phases["inTable"].insertText(token)
elif data:
self.tree.insertText(data)
self.characterTokens = []
def processComment(self, token):
self.flushCharacters()
self.parser.phase = self.originalPhase
return token
def processEOF(self):
self.flushCharacters()
self.parser.phase = self.originalPhase
return True
def processCharacters(self, token):
if token["data"] == "\u0000":
return
self.characterTokens.append(token)
def processSpaceCharacters(self, token):
# pretty sure we should never reach here
self.characterTokens.append(token)
# assert False
def processStartTag(self, token):
self.flushCharacters()
self.parser.phase = self.originalPhase
return token
def processEndTag(self, token):
self.flushCharacters()
self.parser.phase = self.originalPhase
return token
class InCaptionPhase(Phase):
# http://www.whatwg.org/specs/web-apps/current-work/#in-caption
__slots__ = tuple()
def ignoreEndTagCaption(self):
return not self.tree.elementInScope("caption", variant="table")
def processEOF(self):
self.parser.phases["inBody"].processEOF()
def processCharacters(self, token):
return self.parser.phases["inBody"].processCharacters(token)
def startTagTableElement(self, token):
self.parser.parseError()
# XXX Have to duplicate logic here to find out if the tag is ignored
ignoreEndTag = self.ignoreEndTagCaption()
self.parser.phase.processEndTag(impliedTagToken("caption"))
if not ignoreEndTag:
return token
def startTagOther(self, token):
return self.parser.phases["inBody"].processStartTag(token)
def endTagCaption(self, token):
if not self.ignoreEndTagCaption():
# AT this code is quite similar to endTagTable in "InTable"
self.tree.generateImpliedEndTags()
if self.tree.openElements[-1].name != "caption":
self.parser.parseError("expected-one-end-tag-but-got-another",
{"gotName": "caption",
"expectedName": self.tree.openElements[-1].name})
while self.tree.openElements[-1].name != "caption":
self.tree.openElements.pop()
self.tree.openElements.pop()
self.tree.clearActiveFormattingElements()
self.parser.phase = self.parser.phases["inTable"]
else:
# innerHTML case
assert self.parser.innerHTML
self.parser.parseError()
def endTagTable(self, token):
self.parser.parseError()
ignoreEndTag = self.ignoreEndTagCaption()
self.parser.phase.processEndTag(impliedTagToken("caption"))
if not ignoreEndTag:
return token
def endTagIgnore(self, token):
self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
def endTagOther(self, token):
return self.parser.phases["inBody"].processEndTag(token)
startTagHandler = _utils.MethodDispatcher([
("html", Phase.startTagHtml),
(("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
"thead", "tr"), startTagTableElement)
])
startTagHandler.default = startTagOther
endTagHandler = _utils.MethodDispatcher([
("caption", endTagCaption),
("table", endTagTable),
(("body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th",
"thead", "tr"), endTagIgnore)
])
endTagHandler.default = endTagOther
class InColumnGroupPhase(Phase):
# http://www.whatwg.org/specs/web-apps/current-work/#in-column
__slots__ = tuple()
def ignoreEndTagColgroup(self):
return self.tree.openElements[-1].name == "html"
def processEOF(self):
if self.tree.openElements[-1].name == "html":
assert self.parser.innerHTML
return
else:
ignoreEndTag = self.ignoreEndTagColgroup()
self.endTagColgroup(impliedTagToken("colgroup"))
if not ignoreEndTag:
return True
def processCharacters(self, token):
ignoreEndTag = self.ignoreEndTagColgroup()
self.endTagColgroup(impliedTagToken("colgroup"))
if not ignoreEndTag:
return token
def startTagCol(self, token):
self.tree.insertElement(token)
self.tree.openElements.pop()
token["selfClosingAcknowledged"] = True
def startTagOther(self, token):
ignoreEndTag = self.ignoreEndTagColgroup()
self.endTagColgroup(impliedTagToken("colgroup"))
if not ignoreEndTag:
return token
def endTagColgroup(self, token):
if self.ignoreEndTagColgroup():
# innerHTML case
assert self.parser.innerHTML
self.parser.parseError()
else:
self.tree.openElements.pop()
self.parser.phase = self.parser.phases["inTable"]
def endTagCol(self, token):
self.parser.parseError("no-end-tag", {"name": "col"})
def endTagOther(self, token):
ignoreEndTag = self.ignoreEndTagColgroup()
self.endTagColgroup(impliedTagToken("colgroup"))
if not ignoreEndTag:
return token
startTagHandler = _utils.MethodDispatcher([
("html", Phase.startTagHtml),
("col", startTagCol)
])
startTagHandler.default = startTagOther
endTagHandler = _utils.MethodDispatcher([
("colgroup", endTagColgroup),
("col", endTagCol)
])
endTagHandler.default = endTagOther
class InTableBodyPhase(Phase):
# http://www.whatwg.org/specs/web-apps/current-work/#in-table0
__slots__ = tuple()
# helper methods
def clearStackToTableBodyContext(self):
while self.tree.openElements[-1].name not in ("tbody", "tfoot",
"thead", "html"):
# self.parser.parseError("unexpected-implied-end-tag-in-table",
# {"name": self.tree.openElements[-1].name})
self.tree.openElements.pop()
if self.tree.openElements[-1].name == "html":
assert self.parser.innerHTML
# the rest
def processEOF(self):
self.parser.phases["inTable"].processEOF()
def processSpaceCharacters(self, token):
return self.parser.phases["inTable"].processSpaceCharacters(token)
def processCharacters(self, token):
return self.parser.phases["inTable"].processCharacters(token)
def startTagTr(self, token):
self.clearStackToTableBodyContext()
self.tree.insertElement(token)
self.parser.phase = self.parser.phases["inRow"]
def startTagTableCell(self, token):
self.parser.parseError("unexpected-cell-in-table-body",
{"name": token["name"]})
self.startTagTr(impliedTagToken("tr", "StartTag"))
return token
def startTagTableOther(self, token):
# XXX AT Any ideas on how to share this with endTagTable?
if (self.tree.elementInScope("tbody", variant="table") or
self.tree.elementInScope("thead", variant="table") or
self.tree.elementInScope("tfoot", variant="table")):
self.clearStackToTableBodyContext()
self.endTagTableRowGroup(
impliedTagToken(self.tree.openElements[-1].name))
return token
else:
# innerHTML case
assert self.parser.innerHTML
self.parser.parseError()
def startTagOther(self, token):
return self.parser.phases["inTable"].processStartTag(token)
def endTagTableRowGroup(self, token):
if self.tree.elementInScope(token["name"], variant="table"):
self.clearStackToTableBodyContext()
self.tree.openElements.pop()
self.parser.phase = self.parser.phases["inTable"]
else:
self.parser.parseError("unexpected-end-tag-in-table-body",
{"name": token["name"]})
def endTagTable(self, token):
if (self.tree.elementInScope("tbody", variant="table") or
self.tree.elementInScope("thead", variant="table") or
self.tree.elementInScope("tfoot", variant="table")):
self.clearStackToTableBodyContext()
self.endTagTableRowGroup(
impliedTagToken(self.tree.openElements[-1].name))
return token
else:
# innerHTML case
assert self.parser.innerHTML
self.parser.parseError()
def endTagIgnore(self, token):
self.parser.parseError("unexpected-end-tag-in-table-body",
{"name": token["name"]})
def endTagOther(self, token):
return self.parser.phases["inTable"].processEndTag(token)
startTagHandler = _utils.MethodDispatcher([
("html", Phase.startTagHtml),
("tr", startTagTr),
(("td", "th"), startTagTableCell),
(("caption", "col", "colgroup", "tbody", "tfoot", "thead"),
startTagTableOther)
])
startTagHandler.default = startTagOther
endTagHandler = _utils.MethodDispatcher([
(("tbody", "tfoot", "thead"), endTagTableRowGroup),
("table", endTagTable),
(("body", "caption", "col", "colgroup", "html", "td", "th",
"tr"), endTagIgnore)
])
endTagHandler.default = endTagOther
class InRowPhase(Phase):
# http://www.whatwg.org/specs/web-apps/current-work/#in-row
__slots__ = tuple()
# helper methods (XXX unify this with other table helper methods)
def clearStackToTableRowContext(self):
while self.tree.openElements[-1].name not in ("tr", "html"):
self.parser.parseError("unexpected-implied-end-tag-in-table-row",
{"name": self.tree.openElements[-1].name})
self.tree.openElements.pop()
def ignoreEndTagTr(self):
return not self.tree.elementInScope("tr", variant="table")
# the rest
def processEOF(self):
self.parser.phases["inTable"].processEOF()
def processSpaceCharacters(self, token):
return self.parser.phases["inTable"].processSpaceCharacters(token)
def processCharacters(self, token):
return self.parser.phases["inTable"].processCharacters(token)
def startTagTableCell(self, token):
self.clearStackToTableRowContext()
self.tree.insertElement(token)
self.parser.phase = self.parser.phases["inCell"]
self.tree.activeFormattingElements.append(Marker)
def startTagTableOther(self, token):
ignoreEndTag = self.ignoreEndTagTr()
self.endTagTr(impliedTagToken("tr"))
# XXX how are we sure it's always ignored in the innerHTML case?
if not ignoreEndTag:
return token
def startTagOther(self, token):
return self.parser.phases["inTable"].processStartTag(token)
def endTagTr(self, token):
if not self.ignoreEndTagTr():
self.clearStackToTableRowContext()
self.tree.openElements.pop()
self.parser.phase = self.parser.phases["inTableBody"]
else:
# innerHTML case
assert self.parser.innerHTML
self.parser.parseError()
def endTagTable(self, token):
ignoreEndTag = self.ignoreEndTagTr()
self.endTagTr(impliedTagToken("tr"))
# Reprocess the current tag if the tr end tag was not ignored
# XXX how are we sure it's always ignored in the innerHTML case?
if not ignoreEndTag:
return token
def endTagTableRowGroup(self, token):
if self.tree.elementInScope(token["name"], variant="table"):
self.endTagTr(impliedTagToken("tr"))
return token
else:
self.parser.parseError()
def endTagIgnore(self, token):
self.parser.parseError("unexpected-end-tag-in-table-row",
{"name": token["name"]})
def endTagOther(self, token):
return self.parser.phases["inTable"].processEndTag(token)
startTagHandler = _utils.MethodDispatcher([
("html", Phase.startTagHtml),
(("td", "th"), startTagTableCell),
(("caption", "col", "colgroup", "tbody", "tfoot", "thead",
"tr"), startTagTableOther)
])
startTagHandler.default = startTagOther
endTagHandler = _utils.MethodDispatcher([
("tr", endTagTr),
("table", endTagTable),
(("tbody", "tfoot", "thead"), endTagTableRowGroup),
(("body", "caption", "col", "colgroup", "html", "td", "th"),
endTagIgnore)
])
endTagHandler.default = endTagOther
class InCellPhase(Phase):
# http://www.whatwg.org/specs/web-apps/current-work/#in-cell
__slots__ = tuple()
# helper
def closeCell(self):
if self.tree.elementInScope("td", variant="table"):
self.endTagTableCell(impliedTagToken("td"))
elif self.tree.elementInScope("th", variant="table"):
self.endTagTableCell(impliedTagToken("th"))
# the rest
def processEOF(self):
self.parser.phases["inBody"].processEOF()
def processCharacters(self, token):
return self.parser.phases["inBody"].processCharacters(token)
def startTagTableOther(self, token):
if (self.tree.elementInScope("td", variant="table") or
self.tree.elementInScope("th", variant="table")):
self.closeCell()
return token
else:
# innerHTML case
assert self.parser.innerHTML
self.parser.parseError()
def startTagOther(self, token):
return self.parser.phases["inBody"].processStartTag(token)
def endTagTableCell(self, token):
if self.tree.elementInScope(token["name"], variant="table"):
self.tree.generateImpliedEndTags(token["name"])
if self.tree.openElements[-1].name != token["name"]:
self.parser.parseError("unexpected-cell-end-tag",
{"name": token["name"]})
while True:
node = self.tree.openElements.pop()
if node.name == token["name"]:
break
else:
self.tree.openElements.pop()
self.tree.clearActiveFormattingElements()
self.parser.phase = self.parser.phases["inRow"]
else:
self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
def endTagIgnore(self, token):
self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
def endTagImply(self, token):
if self.tree.elementInScope(token["name"], variant="table"):
self.closeCell()
return token
else:
# sometimes innerHTML case
self.parser.parseError()
def endTagOther(self, token):
return self.parser.phases["inBody"].processEndTag(token)
startTagHandler = _utils.MethodDispatcher([
("html", Phase.startTagHtml),
(("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
"thead", "tr"), startTagTableOther)
])
startTagHandler.default = startTagOther
endTagHandler = _utils.MethodDispatcher([
(("td", "th"), endTagTableCell),
(("body", "caption", "col", "colgroup", "html"), endTagIgnore),
(("table", "tbody", "tfoot", "thead", "tr"), endTagImply)
])
endTagHandler.default = endTagOther
class InSelectPhase(Phase):
__slots__ = tuple()
# http://www.whatwg.org/specs/web-apps/current-work/#in-select
def processEOF(self):
if self.tree.openElements[-1].name != "html":
self.parser.parseError("eof-in-select")
else:
assert self.parser.innerHTML
def processCharacters(self, token):
if token["data"] == "\u0000":
return
self.tree.insertText(token["data"])
def startTagOption(self, token):
# We need to imply </option> if <option> is the current node.
if self.tree.openElements[-1].name == "option":
self.tree.openElements.pop()
self.tree.insertElement(token)
def startTagOptgroup(self, token):
if self.tree.openElements[-1].name == "option":
self.tree.openElements.pop()
if self.tree.openElements[-1].name == "optgroup":
self.tree.openElements.pop()
self.tree.insertElement(token)
def startTagSelect(self, token):
self.parser.parseError("unexpected-select-in-select")
self.endTagSelect(impliedTagToken("select"))
def startTagInput(self, token):
self.parser.parseError("unexpected-input-in-select")
if self.tree.elementInScope("select", variant="select"):
self.endTagSelect(impliedTagToken("select"))
return token
else:
assert self.parser.innerHTML
def startTagScript(self, token):
return self.parser.phases["inHead"].processStartTag(token)
def startTagOther(self, token):
self.parser.parseError("unexpected-start-tag-in-select",
{"name": token["name"]})
def endTagOption(self, token):
if self.tree.openElements[-1].name == "option":
self.tree.openElements.pop()
else:
self.parser.parseError("unexpected-end-tag-in-select",
{"name": "option"})
def endTagOptgroup(self, token):
# </optgroup> implicitly closes <option>
if (self.tree.openElements[-1].name == "option" and
self.tree.openElements[-2].name == "optgroup"):
self.tree.openElements.pop()
# It also closes </optgroup>
if self.tree.openElements[-1].name == "optgroup":
self.tree.openElements.pop()
# But nothing else
else:
self.parser.parseError("unexpected-end-tag-in-select",
{"name": "optgroup"})
def endTagSelect(self, token):
if self.tree.elementInScope("select", variant="select"):
node = self.tree.openElements.pop()
while node.name != "select":
node = self.tree.openElements.pop()
self.parser.resetInsertionMode()
else:
# innerHTML case
assert self.parser.innerHTML
self.parser.parseError()
def endTagOther(self, token):
self.parser.parseError("unexpected-end-tag-in-select",
{"name": token["name"]})
startTagHandler = _utils.MethodDispatcher([
("html", Phase.startTagHtml),
("option", startTagOption),
("optgroup", startTagOptgroup),
("select", startTagSelect),
(("input", "keygen", "textarea"), startTagInput),
("script", startTagScript)
])
startTagHandler.default = startTagOther
endTagHandler = _utils.MethodDispatcher([
("option", endTagOption),
("optgroup", endTagOptgroup),
("select", endTagSelect)
])
endTagHandler.default = endTagOther
class InSelectInTablePhase(Phase):
__slots__ = tuple()
def processEOF(self):
self.parser.phases["inSelect"].processEOF()
def processCharacters(self, token):
return self.parser.phases["inSelect"].processCharacters(token)
def startTagTable(self, token):
self.parser.parseError("unexpected-table-element-start-tag-in-select-in-table", {"name": token["name"]})
self.endTagOther(impliedTagToken("select"))
return token
def startTagOther(self, token):
return self.parser.phases["inSelect"].processStartTag(token)
def endTagTable(self, token):
self.parser.parseError("unexpected-table-element-end-tag-in-select-in-table", {"name": token["name"]})
if self.tree.elementInScope(token["name"], variant="table"):
self.endTagOther(impliedTagToken("select"))
return token
def endTagOther(self, token):
return self.parser.phases["inSelect"].processEndTag(token)
startTagHandler = _utils.MethodDispatcher([
(("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
startTagTable)
])
startTagHandler.default = startTagOther
endTagHandler = _utils.MethodDispatcher([
(("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
endTagTable)
])
endTagHandler.default = endTagOther
class InForeignContentPhase(Phase):
__slots__ = tuple()
breakoutElements = frozenset(["b", "big", "blockquote", "body", "br",
"center", "code", "dd", "div", "dl", "dt",
"em", "embed", "h1", "h2", "h3",
"h4", "h5", "h6", "head", "hr", "i", "img",
"li", "listing", "menu", "meta", "nobr",
"ol", "p", "pre", "ruby", "s", "small",
"span", "strong", "strike", "sub", "sup",
"table", "tt", "u", "ul", "var"])
def adjustSVGTagNames(self, token):
replacements = {"altglyph": "altGlyph",
"altglyphdef": "altGlyphDef",
"altglyphitem": "altGlyphItem",
"animatecolor": "animateColor",
"animatemotion": "animateMotion",
"animatetransform": "animateTransform",
"clippath": "clipPath",
"feblend": "feBlend",
"fecolormatrix": "feColorMatrix",
"fecomponenttransfer": "feComponentTransfer",
"fecomposite": "feComposite",
"feconvolvematrix": "feConvolveMatrix",
"fediffuselighting": "feDiffuseLighting",
"fedisplacementmap": "feDisplacementMap",
"fedistantlight": "feDistantLight",
"feflood": "feFlood",
"fefunca": "feFuncA",
"fefuncb": "feFuncB",
"fefuncg": "feFuncG",
"fefuncr": "feFuncR",
"fegaussianblur": "feGaussianBlur",
"feimage": "feImage",
"femerge": "feMerge",
"femergenode": "feMergeNode",
"femorphology": "feMorphology",
"feoffset": "feOffset",
"fepointlight": "fePointLight",
"fespecularlighting": "feSpecularLighting",
"fespotlight": "feSpotLight",
"fetile": "feTile",
"feturbulence": "feTurbulence",
"foreignobject": "foreignObject",
"glyphref": "glyphRef",
"lineargradient": "linearGradient",
"radialgradient": "radialGradient",
"textpath": "textPath"}
if token["name"] in replacements:
token["name"] = replacements[token["name"]]
def processCharacters(self, token):
if token["data"] == "\u0000":
token["data"] = "\uFFFD"
elif (self.parser.framesetOK and
any(char not in spaceCharacters for char in token["data"])):
self.parser.framesetOK = False
Phase.processCharacters(self, token)
def processStartTag(self, token):
currentNode = self.tree.openElements[-1]
if (token["name"] in self.breakoutElements or
(token["name"] == "font" and
set(token["data"].keys()) & {"color", "face", "size"})):
self.parser.parseError("unexpected-html-element-in-foreign-content",
{"name": token["name"]})
while (self.tree.openElements[-1].namespace !=
self.tree.defaultNamespace and
not self.parser.isHTMLIntegrationPoint(self.tree.openElements[-1]) and
not self.parser.isMathMLTextIntegrationPoint(self.tree.openElements[-1])):
self.tree.openElements.pop()
return token
else:
if currentNode.namespace == namespaces["mathml"]:
self.parser.adjustMathMLAttributes(token)
elif currentNode.namespace == namespaces["svg"]:
self.adjustSVGTagNames(token)
self.parser.adjustSVGAttributes(token)
self.parser.adjustForeignAttributes(token)
token["namespace"] = currentNode.namespace
self.tree.insertElement(token)
if token["selfClosing"]:
self.tree.openElements.pop()
token["selfClosingAcknowledged"] = True
def processEndTag(self, token):
nodeIndex = len(self.tree.openElements) - 1
node = self.tree.openElements[-1]
if node.name.translate(asciiUpper2Lower) != token["name"]:
self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
while True:
if node.name.translate(asciiUpper2Lower) == token["name"]:
# XXX this isn't in the spec but it seems necessary
if self.parser.phase == self.parser.phases["inTableText"]:
self.parser.phase.flushCharacters()
self.parser.phase = self.parser.phase.originalPhase
while self.tree.openElements.pop() != node:
assert self.tree.openElements
new_token = None
break
nodeIndex -= 1
node = self.tree.openElements[nodeIndex]
if node.namespace != self.tree.defaultNamespace:
continue
else:
new_token = self.parser.phase.processEndTag(token)
break
return new_token
class AfterBodyPhase(Phase):
__slots__ = tuple()
def processEOF(self):
# Stop parsing
pass
def processComment(self, token):
# This is needed because data is to be appended to the <html> element
# here and not to whatever is currently open.
self.tree.insertComment(token, self.tree.openElements[0])
def processCharacters(self, token):
self.parser.parseError("unexpected-char-after-body")
self.parser.phase = self.parser.phases["inBody"]
return token
def startTagHtml(self, token):
return self.parser.phases["inBody"].processStartTag(token)
def startTagOther(self, token):
self.parser.parseError("unexpected-start-tag-after-body",
{"name": token["name"]})
self.parser.phase = self.parser.phases["inBody"]
return token
def endTagHtml(self, name):
if self.parser.innerHTML:
self.parser.parseError("unexpected-end-tag-after-body-innerhtml")
else:
self.parser.phase = self.parser.phases["afterAfterBody"]
def endTagOther(self, token):
self.parser.parseError("unexpected-end-tag-after-body",
{"name": token["name"]})
self.parser.phase = self.parser.phases["inBody"]
return token
startTagHandler = _utils.MethodDispatcher([
("html", startTagHtml)
])
startTagHandler.default = startTagOther
endTagHandler = _utils.MethodDispatcher([("html", endTagHtml)])
endTagHandler.default = endTagOther
class InFramesetPhase(Phase):
# http://www.whatwg.org/specs/web-apps/current-work/#in-frameset
__slots__ = tuple()
def processEOF(self):
if self.tree.openElements[-1].name != "html":
self.parser.parseError("eof-in-frameset")
else:
assert self.parser.innerHTML
def processCharacters(self, token):
self.parser.parseError("unexpected-char-in-frameset")
def startTagFrameset(self, token):
self.tree.insertElement(token)
def startTagFrame(self, token):
self.tree.insertElement(token)
self.tree.openElements.pop()
def startTagNoframes(self, token):
return self.parser.phases["inBody"].processStartTag(token)
def startTagOther(self, token):
self.parser.parseError("unexpected-start-tag-in-frameset",
{"name": token["name"]})
def endTagFrameset(self, token):
if self.tree.openElements[-1].name == "html":
# innerHTML case
self.parser.parseError("unexpected-frameset-in-frameset-innerhtml")
else:
self.tree.openElements.pop()
if (not self.parser.innerHTML and
self.tree.openElements[-1].name != "frameset"):
# If we're not in innerHTML mode and the current node is not a
# "frameset" element (anymore) then switch.
self.parser.phase = self.parser.phases["afterFrameset"]
def endTagOther(self, token):
self.parser.parseError("unexpected-end-tag-in-frameset",
{"name": token["name"]})
startTagHandler = _utils.MethodDispatcher([
("html", Phase.startTagHtml),
("frameset", startTagFrameset),
("frame", startTagFrame),
("noframes", startTagNoframes)
])
startTagHandler.default = startTagOther
endTagHandler = _utils.MethodDispatcher([
("frameset", endTagFrameset)
])
endTagHandler.default = endTagOther
class AfterFramesetPhase(Phase):
# http://www.whatwg.org/specs/web-apps/current-work/#after3
__slots__ = tuple()
def processEOF(self):
# Stop parsing
pass
def processCharacters(self, token):
self.parser.parseError("unexpected-char-after-frameset")
def startTagNoframes(self, token):
return self.parser.phases["inHead"].processStartTag(token)
def startTagOther(self, token):
self.parser.parseError("unexpected-start-tag-after-frameset",
{"name": token["name"]})
def endTagHtml(self, token):
self.parser.phase = self.parser.phases["afterAfterFrameset"]
def endTagOther(self, token):
self.parser.parseError("unexpected-end-tag-after-frameset",
{"name": token["name"]})
startTagHandler = _utils.MethodDispatcher([
("html", Phase.startTagHtml),
("noframes", startTagNoframes)
])
startTagHandler.default = startTagOther
endTagHandler = _utils.MethodDispatcher([
("html", endTagHtml)
])
endTagHandler.default = endTagOther
class AfterAfterBodyPhase(Phase):
__slots__ = tuple()
def processEOF(self):
pass
def processComment(self, token):
self.tree.insertComment(token, self.tree.document)
def processSpaceCharacters(self, token):
return self.parser.phases["inBody"].processSpaceCharacters(token)
def processCharacters(self, token):
self.parser.parseError("expected-eof-but-got-char")
self.parser.phase = self.parser.phases["inBody"]
return token
def startTagHtml(self, token):
return self.parser.phases["inBody"].processStartTag(token)
def startTagOther(self, token):
self.parser.parseError("expected-eof-but-got-start-tag",
{"name": token["name"]})
self.parser.phase = self.parser.phases["inBody"]
return token
def processEndTag(self, token):
self.parser.parseError("expected-eof-but-got-end-tag",
{"name": token["name"]})
self.parser.phase = self.parser.phases["inBody"]
return token
startTagHandler = _utils.MethodDispatcher([
("html", startTagHtml)
])
startTagHandler.default = startTagOther
class AfterAfterFramesetPhase(Phase):
__slots__ = tuple()
def processEOF(self):
pass
def processComment(self, token):
self.tree.insertComment(token, self.tree.document)
def processSpaceCharacters(self, token):
return self.parser.phases["inBody"].processSpaceCharacters(token)
def processCharacters(self, token):
self.parser.parseError("expected-eof-but-got-char")
def startTagHtml(self, token):
return self.parser.phases["inBody"].processStartTag(token)
def startTagNoFrames(self, token):
return self.parser.phases["inHead"].processStartTag(token)
def startTagOther(self, token):
self.parser.parseError("expected-eof-but-got-start-tag",
{"name": token["name"]})
def processEndTag(self, token):
self.parser.parseError("expected-eof-but-got-end-tag",
{"name": token["name"]})
startTagHandler = _utils.MethodDispatcher([
("html", startTagHtml),
("noframes", startTagNoFrames)
])
startTagHandler.default = startTagOther
# pylint:enable=unused-argument
return {
"initial": InitialPhase,
"beforeHtml": BeforeHtmlPhase,
"beforeHead": BeforeHeadPhase,
"inHead": InHeadPhase,
"inHeadNoscript": InHeadNoscriptPhase,
"afterHead": AfterHeadPhase,
"inBody": InBodyPhase,
"text": TextPhase,
"inTable": InTablePhase,
"inTableText": InTableTextPhase,
"inCaption": InCaptionPhase,
"inColumnGroup": InColumnGroupPhase,
"inTableBody": InTableBodyPhase,
"inRow": InRowPhase,
"inCell": InCellPhase,
"inSelect": InSelectPhase,
"inSelectInTable": InSelectInTablePhase,
"inForeignContent": InForeignContentPhase,
"afterBody": AfterBodyPhase,
"inFrameset": InFramesetPhase,
"afterFrameset": AfterFramesetPhase,
"afterAfterBody": AfterAfterBodyPhase,
"afterAfterFrameset": AfterAfterFramesetPhase,
# XXX after after frameset
}
def adjust_attributes(token, replacements):
needs_adjustment = viewkeys(token['data']) & viewkeys(replacements)
if needs_adjustment:
token['data'] = type(token['data'])((replacements.get(k, k), v)
for k, v in token['data'].items())
def impliedTagToken(name, type="EndTag", attributes=None,
selfClosing=False):
if attributes is None:
attributes = {}
return {"type": tokenTypes[type], "name": name, "data": attributes,
"selfClosing": selfClosing}
class ParseError(Exception):
"""Error in parsed document"""
pass