A PyQT GUI application for converting InfoLease report outputs into Excel files. Handles parsing and summarizing. Learns where files are meant to be store and compiles monthly and yearly summaries.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
InfoLeaseExtract/venv/Lib/site-packages/pip/_vendor/html5lib/serializer.py

409 lines
15 KiB

from __future__ import absolute_import, division, unicode_literals
from pip._vendor.six import text_type
import re
from codecs import register_error, xmlcharrefreplace_errors
from .constants import voidElements, booleanAttributes, spaceCharacters
from .constants import rcdataElements, entities, xmlEntities
from . import treewalkers, _utils
from xml.sax.saxutils import escape
_quoteAttributeSpecChars = "".join(spaceCharacters) + "\"'=<>`"
_quoteAttributeSpec = re.compile("[" + _quoteAttributeSpecChars + "]")
_quoteAttributeLegacy = re.compile("[" + _quoteAttributeSpecChars +
"\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n"
"\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
"\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
"\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000"
"\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
"\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
"\u3000]")
_encode_entity_map = {}
_is_ucs4 = len("\U0010FFFF") == 1
for k, v in list(entities.items()):
# skip multi-character entities
if ((_is_ucs4 and len(v) > 1) or
(not _is_ucs4 and len(v) > 2)):
continue
if v != "&":
if len(v) == 2:
v = _utils.surrogatePairToCodepoint(v)
else:
v = ord(v)
if v not in _encode_entity_map or k.islower():
# prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
_encode_entity_map[v] = k
def htmlentityreplace_errors(exc):
if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
res = []
codepoints = []
skip = False
for i, c in enumerate(exc.object[exc.start:exc.end]):
if skip:
skip = False
continue
index = i + exc.start
if _utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]):
codepoint = _utils.surrogatePairToCodepoint(exc.object[index:index + 2])
skip = True
else:
codepoint = ord(c)
codepoints.append(codepoint)
for cp in codepoints:
e = _encode_entity_map.get(cp)
if e:
res.append("&")
res.append(e)
if not e.endswith(";"):
res.append(";")
else:
res.append("&#x%s;" % (hex(cp)[2:]))
return ("".join(res), exc.end)
else:
return xmlcharrefreplace_errors(exc)
register_error("htmlentityreplace", htmlentityreplace_errors)
def serialize(input, tree="etree", encoding=None, **serializer_opts):
"""Serializes the input token stream using the specified treewalker
:arg input: the token stream to serialize
:arg tree: the treewalker to use
:arg encoding: the encoding to use
:arg serializer_opts: any options to pass to the
:py:class:`html5lib.serializer.HTMLSerializer` that gets created
:returns: the tree serialized as a string
Example:
>>> from html5lib.html5parser import parse
>>> from html5lib.serializer import serialize
>>> token_stream = parse('<html><body><p>Hi!</p></body></html>')
>>> serialize(token_stream, omit_optional_tags=False)
'<html><head></head><body><p>Hi!</p></body></html>'
"""
# XXX: Should we cache this?
walker = treewalkers.getTreeWalker(tree)
s = HTMLSerializer(**serializer_opts)
return s.render(walker(input), encoding)
class HTMLSerializer(object):
# attribute quoting options
quote_attr_values = "legacy" # be secure by default
quote_char = '"'
use_best_quote_char = True
# tag syntax options
omit_optional_tags = True
minimize_boolean_attributes = True
use_trailing_solidus = False
space_before_trailing_solidus = True
# escaping options
escape_lt_in_attrs = False
escape_rcdata = False
resolve_entities = True
# miscellaneous options
alphabetical_attributes = False
inject_meta_charset = True
strip_whitespace = False
sanitize = False
options = ("quote_attr_values", "quote_char", "use_best_quote_char",
"omit_optional_tags", "minimize_boolean_attributes",
"use_trailing_solidus", "space_before_trailing_solidus",
"escape_lt_in_attrs", "escape_rcdata", "resolve_entities",
"alphabetical_attributes", "inject_meta_charset",
"strip_whitespace", "sanitize")
def __init__(self, **kwargs):
"""Initialize HTMLSerializer
:arg inject_meta_charset: Whether or not to inject the meta charset.
Defaults to ``True``.
:arg quote_attr_values: Whether to quote attribute values that don't
require quoting per legacy browser behavior (``"legacy"``), when
required by the standard (``"spec"``), or always (``"always"``).
Defaults to ``"legacy"``.
:arg quote_char: Use given quote character for attribute quoting.
Defaults to ``"`` which will use double quotes unless attribute
value contains a double quote, in which case single quotes are
used.
:arg escape_lt_in_attrs: Whether or not to escape ``<`` in attribute
values.
Defaults to ``False``.
:arg escape_rcdata: Whether to escape characters that need to be
escaped within normal elements within rcdata elements such as
style.
Defaults to ``False``.
:arg resolve_entities: Whether to resolve named character entities that
appear in the source tree. The XML predefined entities &lt; &gt;
&amp; &quot; &apos; are unaffected by this setting.
Defaults to ``True``.
:arg strip_whitespace: Whether to remove semantically meaningless
whitespace. (This compresses all whitespace to a single space
except within ``pre``.)
Defaults to ``False``.
:arg minimize_boolean_attributes: Shortens boolean attributes to give
just the attribute value, for example::
<input disabled="disabled">
becomes::
<input disabled>
Defaults to ``True``.
:arg use_trailing_solidus: Includes a close-tag slash at the end of the
start tag of void elements (empty elements whose end tag is
forbidden). E.g. ``<hr/>``.
Defaults to ``False``.
:arg space_before_trailing_solidus: Places a space immediately before
the closing slash in a tag using a trailing solidus. E.g.
``<hr />``. Requires ``use_trailing_solidus=True``.
Defaults to ``True``.
:arg sanitize: Strip all unsafe or unknown constructs from output.
See :py:class:`html5lib.filters.sanitizer.Filter`.
Defaults to ``False``.
:arg omit_optional_tags: Omit start/end tags that are optional.
Defaults to ``True``.
:arg alphabetical_attributes: Reorder attributes to be in alphabetical order.
Defaults to ``False``.
"""
unexpected_args = frozenset(kwargs) - frozenset(self.options)
if len(unexpected_args) > 0:
raise TypeError("__init__() got an unexpected keyword argument '%s'" % next(iter(unexpected_args)))
if 'quote_char' in kwargs:
self.use_best_quote_char = False
for attr in self.options:
setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
self.errors = []
self.strict = False
def encode(self, string):
assert(isinstance(string, text_type))
if self.encoding:
return string.encode(self.encoding, "htmlentityreplace")
else:
return string
def encodeStrict(self, string):
assert(isinstance(string, text_type))
if self.encoding:
return string.encode(self.encoding, "strict")
else:
return string
def serialize(self, treewalker, encoding=None):
# pylint:disable=too-many-nested-blocks
self.encoding = encoding
in_cdata = False
self.errors = []
if encoding and self.inject_meta_charset:
from .filters.inject_meta_charset import Filter
treewalker = Filter(treewalker, encoding)
# Alphabetical attributes is here under the assumption that none of
# the later filters add or change order of attributes; it needs to be
# before the sanitizer so escaped elements come out correctly
if self.alphabetical_attributes:
from .filters.alphabeticalattributes import Filter
treewalker = Filter(treewalker)
# WhitespaceFilter should be used before OptionalTagFilter
# for maximum efficiently of this latter filter
if self.strip_whitespace:
from .filters.whitespace import Filter
treewalker = Filter(treewalker)
if self.sanitize:
from .filters.sanitizer import Filter
treewalker = Filter(treewalker)
if self.omit_optional_tags:
from .filters.optionaltags import Filter
treewalker = Filter(treewalker)
for token in treewalker:
type = token["type"]
if type == "Doctype":
doctype = "<!DOCTYPE %s" % token["name"]
if token["publicId"]:
doctype += ' PUBLIC "%s"' % token["publicId"]
elif token["systemId"]:
doctype += " SYSTEM"
if token["systemId"]:
if token["systemId"].find('"') >= 0:
if token["systemId"].find("'") >= 0:
self.serializeError("System identifier contains both single and double quote characters")
quote_char = "'"
else:
quote_char = '"'
doctype += " %s%s%s" % (quote_char, token["systemId"], quote_char)
doctype += ">"
yield self.encodeStrict(doctype)
elif type in ("Characters", "SpaceCharacters"):
if type == "SpaceCharacters" or in_cdata:
if in_cdata and token["data"].find("</") >= 0:
self.serializeError("Unexpected </ in CDATA")
yield self.encode(token["data"])
else:
yield self.encode(escape(token["data"]))
elif type in ("StartTag", "EmptyTag"):
name = token["name"]
yield self.encodeStrict("<%s" % name)
if name in rcdataElements and not self.escape_rcdata:
in_cdata = True
elif in_cdata:
self.serializeError("Unexpected child element of a CDATA element")
for (_, attr_name), attr_value in token["data"].items():
# TODO: Add namespace support here
k = attr_name
v = attr_value
yield self.encodeStrict(' ')
yield self.encodeStrict(k)
if not self.minimize_boolean_attributes or \
(k not in booleanAttributes.get(name, tuple()) and
k not in booleanAttributes.get("", tuple())):
yield self.encodeStrict("=")
if self.quote_attr_values == "always" or len(v) == 0:
quote_attr = True
elif self.quote_attr_values == "spec":
quote_attr = _quoteAttributeSpec.search(v) is not None
elif self.quote_attr_values == "legacy":
quote_attr = _quoteAttributeLegacy.search(v) is not None
else:
raise ValueError("quote_attr_values must be one of: "
"'always', 'spec', or 'legacy'")
v = v.replace("&", "&amp;")
if self.escape_lt_in_attrs:
v = v.replace("<", "&lt;")
if quote_attr:
quote_char = self.quote_char
if self.use_best_quote_char:
if "'" in v and '"' not in v:
quote_char = '"'
elif '"' in v and "'" not in v:
quote_char = "'"
if quote_char == "'":
v = v.replace("'", "&#39;")
else:
v = v.replace('"', "&quot;")
yield self.encodeStrict(quote_char)
yield self.encode(v)
yield self.encodeStrict(quote_char)
else:
yield self.encode(v)
if name in voidElements and self.use_trailing_solidus:
if self.space_before_trailing_solidus:
yield self.encodeStrict(" /")
else:
yield self.encodeStrict("/")
yield self.encode(">")
elif type == "EndTag":
name = token["name"]
if name in rcdataElements:
in_cdata = False
elif in_cdata:
self.serializeError("Unexpected child element of a CDATA element")
yield self.encodeStrict("</%s>" % name)
elif type == "Comment":
data = token["data"]
if data.find("--") >= 0:
self.serializeError("Comment contains --")
yield self.encodeStrict("<!--%s-->" % token["data"])
elif type == "Entity":
name = token["name"]
key = name + ";"
if key not in entities:
self.serializeError("Entity %s not recognized" % name)
if self.resolve_entities and key not in xmlEntities:
data = entities[key]
else:
data = "&%s;" % name
yield self.encodeStrict(data)
else:
self.serializeError(token["data"])
def render(self, treewalker, encoding=None):
"""Serializes the stream from the treewalker into a string
:arg treewalker: the treewalker to serialize
:arg encoding: the string encoding to use
:returns: the serialized tree
Example:
>>> from html5lib import parse, getTreeWalker
>>> from html5lib.serializer import HTMLSerializer
>>> token_stream = parse('<html><body>Hi!</body></html>')
>>> walker = getTreeWalker('etree')
>>> serializer = HTMLSerializer(omit_optional_tags=False)
>>> serializer.render(walker(token_stream))
'<html><head></head><body>Hi!</body></html>'
"""
if encoding:
return b"".join(list(self.serialize(treewalker, encoding)))
else:
return "".join(list(self.serialize(treewalker)))
def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
# XXX The idea is to make data mandatory.
self.errors.append(data)
if self.strict:
raise SerializeError
class SerializeError(Exception):
"""Error in serialized tree"""
pass