Skip to content

gh-95472: [xml.etree.ElementTree] Add fine-grained formatting classes #95476

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 64 additions & 22 deletions Lib/xml/etree/ElementTree.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,23 +72,13 @@

__all__ = [
# public symbols
"Comment",
"dump",
"Element", "ElementTree",
"fromstring", "fromstringlist",
"indent", "iselement", "iterparse",
"parse", "ParseError",
"PI", "ProcessingInstruction",
"QName",
"SubElement",
"tostring", "tostringlist",
"TreeBuilder",
"VERSION",
"XML", "XMLID",
"XMLParser", "XMLPullParser",
"register_namespace",
"canonicalize", "C14NWriterTarget",
]
"Comment", "dump", "Element", "ElementTree", "fromstring",
"fromstringlist", "indent", "iselement", "iterparse", "parse",
"ParseError", "PI", "ProcessingInstruction", "QName", "SubElement",
"tostring", "tostringlist", "TreeBuilder", "VERSION", "XML", "XMLID",
"XMLParser", "XMLPullParser", "register_namespace", "canonicalize",
"C14NWriterTarget", "ShortEmptyElements"
]

VERSION = "1.3.0"

Expand All @@ -99,6 +89,7 @@
import collections
import collections.abc
import contextlib
import enum

from . import ElementPath

Expand Down Expand Up @@ -509,6 +500,46 @@ def __eq__(self, other):
# --------------------------------------------------------------------


class ShortEmptyElements(enum.Enum):
    """Formatting choices for elements that have no content.

    The enum stays compatible with the boolean values historically
    accepted for *short_empty_elements*: ``ShortEmptyElements(True)`` is
    *SPACE* and ``ShortEmptyElements(False)`` is *NONE*.

    For an empty element ``q`` the members produce:

    *SPACE* (what ``True`` maps to): ``<q />``
    *NOSPACE*: ``<q/>``
    *NONE*: ``<q></q>``
    """

    # The value of SPACE / NOSPACE is the text emitted before "/>".
    SPACE = " "
    NOSPACE = ""
    # NONE reuses the legacy ``False`` so that a plain value lookup finds it.
    NONE = False

    def __bool__(self):
        """Return False for *NONE* and True for every other member.

        Members of a plain Enum are always truthy; this override lets a
        member be tested exactly like the legacy boolean flag.
        """
        # Members are singletons, so identity is the right comparison.
        return self is not ShortEmptyElements.NONE

    @classmethod
    def _missing_(cls, value):
        """Map legacy values that are not member values onto members.

        Returns a member for booleans (and for the ``enum.no_arg`` marker
        when the running ``enum`` module provides one); otherwise defers
        to the base class, which leads to the usual ValueError.
        """
        # ``enum.no_arg`` is not guaranteed to exist.  Look it up
        # defensively so that a missing attribute cannot turn a lookup
        # such as ``ShortEmptyElements(True)`` into an AttributeError.
        no_arg = getattr(enum, "no_arg", None)
        if no_arg is not None and value is no_arg:
            return cls.SPACE
        if isinstance(value, bool):
            # False is NONE's own value and is resolved before this hook
            # runs, so in practice only True arrives here.
            return cls.SPACE if value else cls.NONE
        return super()._missing_(value)

@classmethod
def tag_defaultdict(cls, short_empty_elements):
if not isinstance(short_empty_elements, collections.defaultdict):
if isinstance(short_empty_elements, ShortEmptyElements):
return collections.defaultdict(lambda: short_empty_elements)
Comment on lines +532 to +534
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would also allow mappings that implement __missing__. Therefore, I suggest checking for everything else and assuming that what remains is probably some kind of acceptable dict. Or check for collections.abc.Mapping instead.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry I don't quite understand. Should I just check for hasattr('__missing__') and then try to call it and make sure its result is a ShortEmptyElements?

Copy link
Contributor

@scoder scoder Aug 6, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suggested to accept any Mapping, basically. You can either check isinstance(..., collections.abc.Mapping), or you could also ducktypingly check for the enums first and simply assume that, otherwise, users probably passed some kind of acceptable mapping, and will have to live with a runtime error if the object they provided misbehaves somehow.

There are really different ways for users to make an item lookup (subscripting) return a reasonable value that we can use for our processing here. A defaultdict is just one in many.

elif short_empty_elements:
return collections.defaultdict(lambda: ShortEmptyElements.SPACE)
else:
return collections.defaultdict(lambda: ShortEmptyElements.NONE)
else:
return short_empty_elements


class ElementTree:
"""An XML element hierarchy.

Expand Down Expand Up @@ -680,6 +711,7 @@ def iterfind(self, path, namespaces=None):
def write(self, file_or_filename,
encoding=None,
xml_declaration=None,
xml_declaration_quotes="'",
default_namespace=None,
method=None, *,
short_empty_elements=True):
Expand All @@ -695,6 +727,9 @@ def write(self, file_or_filename,
is added if encoding IS NOT either of:
US-ASCII, UTF-8, or Unicode

*xml_declaration_quotes* -- quote character used around the values in
the XML declaration; must be either ' (default) or ", otherwise
ValueError is raised.

*default_namespace* -- sets the default XML namespace (for "xmlns")

*method* -- either "xml" (default), "html", "text", or "c14n"
Expand All @@ -703,8 +738,12 @@ def write(self, file_or_filename,
that contain no content. If True (default)
they are emitted as a single self-closed
tag, otherwise they are emitted as a pair
of start/end tags
of start/end tags.

For more control, can be a
*ShortEmptyElements* object, or a
defaultdict keyed by tags as strings and
valued with such objects.
"""
if not method:
method = "xml"
Expand All @@ -715,18 +754,21 @@ def write(self, file_or_filename,
encoding = "utf-8"
else:
encoding = "us-ascii"
if not xml_declaration_quotes in ['"', "'"]:
raise ValueError("xml_declaration_quotes must be either ' or \"")
with _get_writer(file_or_filename, encoding) as (write, declared_encoding):
if method == "xml" and (xml_declaration or
(xml_declaration is None and
encoding.lower() != "unicode" and
declared_encoding.lower() not in ("utf-8", "us-ascii"))):
write("<?xml version='1.0' encoding='%s'?>\n" % (
declared_encoding,))
write("<?xml version={0}1.0{0} encoding={0}{1}{0}?>\n"
.format(xml_declaration_quotes, declared_encoding))
if method == "text":
_serialize_text(write, self._root)
else:
qnames, namespaces = _namespaces(self._root, default_namespace)
serialize = _serialize[method]
short_empty_elements = ShortEmptyElements.tag_defaultdict(short_empty_elements)
serialize(write, self._root, qnames, namespaces,
short_empty_elements=short_empty_elements)

Expand Down Expand Up @@ -885,7 +927,7 @@ def _serialize_xml(write, elem, qnames, namespaces,
else:
v = _escape_attrib(v)
write(" %s=\"%s\"" % (qnames[k], v))
if text or len(elem) or not short_empty_elements:
if text or len(elem) or not short_empty_elements[tag]:
write(">")
if text:
write(_escape_cdata(text))
Expand All @@ -894,7 +936,7 @@ def _serialize_xml(write, elem, qnames, namespaces,
short_empty_elements=short_empty_elements)
write("</" + tag + ">")
else:
write(" />")
write(short_empty_elements[tag].value+"/>")
if elem.tail:
write(_escape_cdata(elem.tail))

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Added two fine-grained formatting options to ``xml.etree.ElementTree`` so that its output can match that of other XML libraries byte for byte, avoiding spurious diffs in version control. The space in self-closing tags such as ``<q />`` can now be omitted (``<q/>``) by passing a ``ShortEmptyElements`` value as *short_empty_elements*, and double quotes can be used in the XML declaration through the new *xml_declaration_quotes* argument of ``ElementTree.write()``.