Source code for src.preParseChecks.xmlStructureCheck

# SPDX-FileCopyrightText: Copyright © 2026 BBC
#
# SPDX-License-Identifier: BSD-3-Clause

from .preParseCheck import PreParseCheck
from src.validationLogging.validationLogger import ValidationLogger
from src.validationLogging.validationCodes import ValidationCode
from xml.parsers.expat import ParserCreate, ExpatError, errors
import logging


class XmlStructureCheck(PreParseCheck):
    """
    Check for entity declarations and non-UTF-8 declared encodings.

    This is a PreParseCheck because when we parse the XML using
    ElementTree later it processes all the declarations and replaces
    the entity references and there's no way to stop that.

    Only expat seems to allow the entity declarations to be observed,
    so using it directly even though the documentation says that doing
    so is deprecated (although it doesn't offer an alternative!).
    """

    def run(
            self,
            input: bytes,
            validation_results: ValidationLogger) -> tuple[bool, bytes]:
        """
        Scan the raw document with expat and report on its prolog.

        The document is parsed once with handlers attached for the XML
        declaration, the document type declaration and entity
        declarations. The findings are then written to
        ``validation_results``.

        :param input: the complete XML document as bytes.
        :param validation_results: logger that receives the good, warn
            and error results produced by this check.
        :return: a tuple of ``(valid, input)``. ``valid`` is False if
            expat raised a parse error, if an encoding other than
            UTF-8 is declared, if a document type declaration is
            present, or if any entity declaration was found. ``input``
            is returned unmodified.
        """
        encodingDecl = None
        entityDeclarationsFound = False
        xmlDeclFound = False
        doctypeFound = False
        externalDtd = None

        def entityDeclHandler(
                entityName: str,
                is_parameter_entity: bool,
                value: str | None,
                base: str | None,
                systemId: str,
                publicId: str | None,
                notationName: str | None) -> None:
            # Only the presence of a declaration matters, not its content.
            nonlocal entityDeclarationsFound
            logging.debug('Found an entity declaration. Name: "{}"'
                          .format(entityName))
            entityDeclarationsFound = True

        def xmlDeclHandler(
                version: str,
                encoding: str | None,
                standalone: int) -> None:
            nonlocal encodingDecl, xmlDeclFound
            logging.debug(
                'XML Declaration version {} encoding {} standalone {}'
                .format(version, encoding, standalone))
            encodingDecl = encoding
            xmlDeclFound = True

        def doctypeHandler(
                doctypeName: str,
                systemId: str | None,
                publicId: str | None,
                has_internal_subset: bool) -> None:
            nonlocal doctypeFound, externalDtd
            logging.info(
                'Doctype Declaration doctypeName {} '
                'systemId {} '
                'publicId {} '
                'has_internal_subset {}'
                .format(doctypeName, systemId, publicId, has_internal_subset))
            doctypeFound = True
            # Remembered so the error message can name the external DTD.
            externalDtd = systemId

        valid = True

        parser = ParserCreate()
        parser.StartDoctypeDeclHandler = doctypeHandler
        parser.EntityDeclHandler = entityDeclHandler
        parser.XmlDeclHandler = xmlDeclHandler

        try:
            # The whole document is supplied in a single call, so it is
            # also the final call: isfinal must be true, otherwise expat
            # treats the data as a fragment and never reports errors that
            # are only detectable at end of input (empty or truncated
            # documents, unclosed elements).
            parser.Parse(input, True)
        except ExpatError as xe:
            valid = False
            validation_results.error(
                location='XML Document line {} position {}'
                         .format(xe.lineno, xe.offset),
                # Fall back to the exception text rather than raising a
                # KeyError from inside the error handler.
                message=errors.messages.get(xe.code, str(xe)),
                code=ValidationCode.xml_document_validity
            )

        # Any handler results gathered before a parse error are still
        # reported below.
        if encodingDecl is not None and encodingDecl.upper() != 'UTF-8':
            valid = False
            validation_results.error(
                location='XML prolog',
                message='Non-UTF-8 encoding declaration: {}, UTF-8 required'
                        .format(encodingDecl),
                code=ValidationCode.xml_encoding_decl
            )
        elif xmlDeclFound and encodingDecl is not None:
            validation_results.good(
                location='XML prolog',
                message='XML Prolog declares UTF-8 encoding',
                code=ValidationCode.xml_encoding_decl
            )
        elif xmlDeclFound and encodingDecl is None:
            validation_results.warn(
                location='XML prolog',
                message='XML Prolog found with no encoding declaration, '
                        'assuming UTF-8',
                code=ValidationCode.xml_encoding_decl
            )
        else:
            validation_results.warn(
                location='XML prolog',
                message='No XML Prolog present, assuming XML document '
                        'with UTF-8 encoding',
                code=ValidationCode.xml_encoding_decl
            )

        if doctypeFound:
            valid = False
            validation_results.error(
                location='XML Document Type Declaration',
                message='Prohibited Document Type Declaration present{}'
                        .format(
                            ' referencing external DTD {}'.format(externalDtd)
                            if externalDtd else ''),
                code=ValidationCode.xml_dtd,
            )
        else:
            validation_results.good(
                location='XML Document Type Declaration',
                message='No Document Type Declarations found',
                code=ValidationCode.xml_dtd
            )

        if entityDeclarationsFound:
            valid = False
            validation_results.error(
                location='XML Document Type Declaration',
                message='XML Entity declaration found - '
                        'these are not permitted',
                code=ValidationCode.xml_entity_decl
            )
        else:
            validation_results.good(
                location='XML Document Type',
                message='No XML Entity declarations found',
                code=ValidationCode.xml_entity_decl
            )

        return (valid, input)