Source code for src.xmlChecks.pruner

# SPDX-FileCopyrightText: Copyright © 2026 BBC
#
# SPDX-License-Identifier: BSD-3-Clause

from typing import Optional
from xml.etree.ElementTree import Element

from src.validationLogging.validationCodes import ValidationCode
from src.validationLogging.validationLogger import ValidationLogger

from .xmlCheck import XmlCheck


def get_namespace(tag: str) -> str:
    """Return the namespace part of a ``{namespace}name`` style tag.

    :param tag: An element tag or attribute key, optionally prefixed with
        a namespace wrapped in braces.
    :returns: The text between the leading ``{`` and the first ``}``, or
        an empty string when ``tag`` is empty or does not begin with ``{``.
    :raises ValueError: If ``tag`` begins with ``{`` but contains no ``}``.
    """
    if not tag.startswith('{'):
        return ''
    # Everything after the opening brace, split once at the first '}'.
    namespace, closing_brace, _ = tag[1:].partition('}')
    if not closing_brace:
        raise ValueError('No closing brace found')
    return namespace
def get_unqualified_name(tag: str) -> str:
    """Return the part of ``tag`` that follows the first ``}``.

    :param tag: An element tag or attribute key, optionally prefixed with
        a namespace wrapped in braces.
    :returns: The text after the first ``}``; ``tag`` itself, unchanged,
        when it contains no ``}``.
    """
    _, closing_brace, local_name = tag.partition('}')
    # partition() yields an empty separator when no '}' was found.
    return local_name if closing_brace else tag
class Pruner(XmlCheck):
    """XML check that removes unrecognised vocabulary from an element tree.

    Child elements whose namespace is not in ``no_prune_namespaces`` are
    removed. Attributes are removed when they are in a namespace that is
    not in ``no_prune_namespaces``, or when they are in no namespace and
    their name is not in ``no_prune_no_namespace_attributes``.
    """

    def __init__(
            self,
            no_prune_namespaces: Optional[set[str]] = None,
            no_prune_no_namespace_attributes: Optional[set[str]] = None
            ) -> None:
        """Configure which vocabulary is kept.

        :param no_prune_namespaces: Namespaces whose elements and
            attributes are kept. Defaults to an empty set.
        :param no_prune_no_namespace_attributes: Names of attributes in
            no namespace that are kept. Defaults to an empty set.
        """
        # None sentinels instead of ``set()`` defaults: a default set
        # object would be shared by every instance created without
        # arguments. Caller-supplied sets are stored by reference.
        self._no_prune_namespaces = \
            no_prune_namespaces if no_prune_namespaces is not None else set()
        self._no_prune_no_namespace_attributes = \
            no_prune_no_namespace_attributes \
            if no_prune_no_namespace_attributes is not None else set()

    def run(
            self,
            input: Element,
            context: dict,
            validation_results: ValidationLogger
            ) -> bool:
        """Prune ``input`` in place and log one summary per namespace.

        :param input: Root element to prune. Its own tag is not checked;
            only its attributes and descendants are.
        :param context: Not used by this check.
        :param validation_results: Logger that receives one ``info``
            message for each namespace in which something was pruned.
        :returns: Always ``True``.
        """
        # pruned:
        #   dict($namespace:
        #        dict('els': dict($tag: count),
        #             'attrs': dict($name: count)))
        pruned = {}
        self.prune_unrecognised_vocabulary(el=input, pruned=pruned)

        for ns, dicts in pruned.items():
            els_dict = dicts.get('els', {})
            attrs_dict = dicts.get('attrs', {})
            # The leading numbers are the counts of distinct names; the
            # per-name occurrence counts are in the bracketed lists.
            msg_str = \
                'Pruned {} elements {}and {} attributes {}in namespace "{}"' \
                .format(
                    len(els_dict),
                    self._describe_counts(els_dict),
                    len(attrs_dict),
                    self._describe_counts(attrs_dict),
                    ns)
            validation_results.info(
                location='Document',
                message=msg_str,
                code=ValidationCode.xml_prune
            )

        return True

    def prune_unrecognised_vocabulary(
            self, el: Element, pruned: dict) -> Element:
        """Recursively remove unrecognised children and attributes of ``el``.

        :param el: Element to prune in place.
        :param pruned: Tally of pruned names, updated in place (see
            ``run`` for its layout).
        :returns: ``el``, after pruning.
        """
        # Collect first and remove afterwards so that the child list is
        # not modified while it is being iterated.
        to_remove = []
        for child in el:
            child_ns = get_namespace(child.tag)
            if child_ns not in self._no_prune_namespaces:
                to_remove.append(child)
                self.log_pruned_el(
                    pruned=pruned,
                    ns=child_ns,
                    tag=get_unqualified_name(child.tag))
            else:
                # Kept children are themselves pruned; removed ones are
                # dropped without being descended into.
                self.prune_unrecognised_vocabulary(el=child, pruned=pruned)

        for e in to_remove:
            el.remove(e)

        # Iterate over a snapshot of the keys: attributes are deleted in
        # the loop, and iterating a live dict view while deleting from the
        # dict raises RuntimeError.
        for attr_key in list(el.attrib):
            attr_ns = get_namespace(attr_key)
            attr_name = get_unqualified_name(attr_key)
            if attr_ns:
                prune = attr_ns not in self._no_prune_namespaces
            else:
                prune = \
                    attr_name not in self._no_prune_no_namespace_attributes
            if prune:
                self.log_pruned_attr(
                    pruned=pruned,
                    ns=attr_ns,
                    attr_name=attr_name)
                del el.attrib[attr_key]

        return el

    def log_pruned_el(self, pruned: dict, ns: str, tag: str) -> None:
        """Count one pruned element named ``tag`` in namespace ``ns``.

        :param pruned: Tally to update in place.
        :param ns: Namespace of the pruned element ('' for none).
        :param tag: Unqualified name of the pruned element.
        """
        self._count_pruned(pruned, ns, 'els', tag)

    def log_pruned_attr(
            self, pruned: dict, ns: str, attr_name: str) -> None:
        """Count one pruned attribute named ``attr_name`` in ``ns``.

        :param pruned: Tally to update in place.
        :param ns: Namespace of the pruned attribute ('' for none).
        :param attr_name: Unqualified name of the pruned attribute.
        """
        self._count_pruned(pruned, ns, 'attrs', attr_name)

    @staticmethod
    def _count_pruned(pruned: dict, ns: str, kind: str, name: str) -> None:
        """Increment ``pruned[ns][kind][name]``, creating levels as needed."""
        counts = pruned.setdefault(ns, {}).setdefault(kind, {})
        counts[name] = counts.get(name, 0) + 1

    @staticmethod
    def _describe_counts(counts: dict) -> str:
        """Format ``counts`` as '("name" N times, ...) ', or '' if empty."""
        if not counts:
            return ''
        listed = ', '.join(
            '"{}" {} time{}'.format(name, count, 's' if count > 1 else '')
            for name, count in counts.items())
        return '(' + listed + ') '