# SPDX-FileCopyrightText: Copyright © 2026 BBC
#
# SPDX-License-Identifier: BSD-3-Clause
from .xmlCheck import XmlCheck
from xml.etree.ElementTree import Element
from src.validationLogging.validationLogger import ValidationLogger
from src.validationLogging.validationCodes import ValidationCode
[docs]
def get_namespace(tag: str) -> str:
    """Return the namespace part of a ``{namespace}name`` style tag.

    An empty string is returned when *tag* is empty or does not begin
    with ``{``. Otherwise the result is the text between the leading
    ``{`` and the first ``}``.

    Raises:
        ValueError: If *tag* begins with ``{`` but contains no ``}``.
    """
    if not tag.startswith('{'):
        return ''
    closing = tag.find('}')
    if closing < 0:
        raise ValueError('No closing brace found')
    return tag[1:closing]
[docs]
def get_unqualified_name(tag: str) -> str:
    """Return the part of *tag* that follows its first ``}``.

    A tag containing no ``}`` is returned unchanged.
    """
    _, brace, local_name = tag.partition('}')
    return local_name if brace else tag
[docs]
class Pruner(XmlCheck):
    """Strip elements and attributes that are outside an allowed vocabulary.

    Child elements whose namespace is not in the allowed set are removed
    together with their descendants. Attributes are removed when they are
    in a namespace that is not allowed, or when they have no namespace and
    their name is not in the allowed no-namespace attribute set. Every
    removal is counted, and ``run`` reports one summary per namespace.
    """

    def __init__(
            self,
            no_prune_namespaces: set[str] = set(),
            no_prune_no_namespace_attributes: set[str] = set()
            ) -> None:
        """Configure which vocabulary survives pruning.

        Args:
            no_prune_namespaces: Namespaces whose elements and attributes
                are kept. Elements with no namespace are kept only if the
                empty string is a member.
            no_prune_no_namespace_attributes: Names of attributes without
                a namespace that are kept.
        """
        # Take copies so that the shared default objects (and any set the
        # caller goes on to modify) are never aliased by this instance.
        self._no_prune_namespaces = set(no_prune_namespaces)
        self._no_prune_no_namespace_attributes = set(
            no_prune_no_namespace_attributes)

    def run(
            self,
            input: Element,
            context: dict,
            validation_results: ValidationLogger
            ) -> bool:
        """Prune *input* in place and log a summary for each namespace.

        Args:
            input: Root of the tree to prune. The root element itself is
                never removed; only its attributes and descendants are
                considered.
            context: Not used by this check.
            validation_results: Receives one ``info`` entry per namespace
                in which something was pruned.

        Returns:
            Always ``True``.
        """
        # pruned:
        #   dict($namespace:
        #        dict('els': dict($tag: count),
        #             'attrs': dict($name: count)))
        pruned: dict = {}
        self.prune_unrecognised_vocabulary(el=input, pruned=pruned)

        for ns, dicts in pruned.items():
            els_dict = dicts.get('els', {})
            attrs_dict = dicts.get('attrs', {})
            # The leading numbers are counts of distinct names; the
            # per-name occurrence counts are in the bracketed lists.
            msg_str = \
                'Pruned {} elements {}and {} attributes {}in namespace "{}"' \
                .format(
                    len(els_dict),
                    self._describe_pruned_counts(els_dict),
                    len(attrs_dict),
                    self._describe_pruned_counts(attrs_dict),
                    ns)
            validation_results.info(
                location='Document',
                message=msg_str,
                code=ValidationCode.xml_prune
            )

        return True

    def prune_unrecognised_vocabulary(self, el: Element, pruned: dict):
        """Recursively prune the children and attributes of *el* in place.

        Args:
            el: Element whose children and attributes are examined. *el*
                itself is never removed.
            pruned: Tally of removals, updated in place (see ``run`` for
                its layout).

        Returns:
            The same *el* that was passed in.
        """
        # Most recent child that was kept, or None if there is none yet;
        # it is where the tail text of a pruned child is re-attached.
        last_kept = None
        # Iterate over a snapshot because children are removed in the loop.
        for child in list(el):
            child_ns = get_namespace(child.tag)
            if child_ns in self._no_prune_namespaces:
                self.prune_unrecognised_vocabulary(el=child, pruned=pruned)
                last_kept = child
                continue

            self.log_pruned_el(
                pruned=pruned,
                ns=child_ns,
                tag=get_unqualified_name(child.tag))
            # The text that follows an element is stored on that element
            # as its tail, so it has to be moved before the element goes
            # or that character data would vanish from the parent.
            if child.tail:
                if last_kept is None:
                    el.text = (el.text or '') + child.tail
                else:
                    last_kept.tail = (last_kept.tail or '') + child.tail
            el.remove(child)

        # Snapshot the keys: keys() may be a live view of the attribute
        # dict, which must not change size while it is being iterated.
        for attr_key in list(el.keys()):
            attr_ns = get_namespace(attr_key)
            attr_name = get_unqualified_name(attr_key)
            if attr_ns:
                keep = attr_ns in self._no_prune_namespaces
            else:
                keep = attr_name in self._no_prune_no_namespace_attributes
            if keep:
                continue

            self.log_pruned_attr(
                pruned=pruned,
                ns=attr_ns,
                attr_name=attr_name)
            el.attrib.pop(attr_key)

        return el

    def log_pruned_el(self, pruned: dict, ns: str, tag: str):
        """Count one pruned element named *tag* in namespace *ns*."""
        self._increment(pruned, ns, 'els', tag)

    def log_pruned_attr(self, pruned: dict, ns: str, attr_name: str):
        """Count one pruned attribute named *attr_name* in namespace *ns*."""
        self._increment(pruned, ns, 'attrs', attr_name)

    @staticmethod
    def _increment(pruned: dict, ns: str, kind: str, name: str) -> None:
        """Add one to ``pruned[ns][kind][name]``, creating levels as needed."""
        counts = pruned.setdefault(ns, {}).setdefault(kind, {})
        counts[name] = counts.get(name, 0) + 1

    @staticmethod
    def _describe_pruned_counts(counts: dict) -> str:
        """Format name/count pairs as ``("a" 1 time, "b" 2 times) ``.

        Returns an empty string when *counts* is empty; otherwise the
        result ends with a single space so it can be dropped straight
        into the summary message.
        """
        if not counts:
            return ''
        listing = ', '.join(
            '"{}" {} time{}'.format(name, count, 's' if count > 1 else '')
            for name, count in counts.items())
        return '(' + listing + ') '