Skip to content

CWE-611: XML External Entity (XXE) Injection - Python

Overview

Python XML parsing is risky for untrusted input because parser behavior depends on the library, Python build, and underlying Expat/libxml2 version. Standard-library parsers do not fetch local files or open network connections through Expat by default, but untrusted XML can still create entity-expansion and resource-exhaustion risk, and third-party parsers can reintroduce external entity handling when unsafe options are enabled.

Primary Defence: Use defusedxml for standard-library XML parsing of untrusted data, or configure third-party parsers to disable DTD loading, external entity resolution, network access, and support for oversized trees before parsing.

Common Vulnerable Patterns

xml.etree.ElementTree for Hostile Input

# RISKY - Standard parser used directly on hostile XML

import xml.etree.ElementTree as ET

def parse_xml(xml_string):
    """Return the text of the root element's first <name> child.

    This is the risky pattern under discussion: the stock ElementTree
    parser is applied to the input with no hardening at all.
    """
    name_node = ET.fromstring(xml_string).find('name')
    return name_node.text

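# Attacker sends (an unsafe parser would inline /etc/passwd; the standard-library parser is expected to reject &xxe; rather than resolve it):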
# <?xml version="1.0"?>
# <!DOCTYPE foo [<!ENTITY xxe SYSTEM "file:///etc/passwd">]>
# <root><name>&xxe;</name></root>

Why this is risky:

  • Security depends on Python's bundled or system Expat version and parser limits.
  • Safer alternatives (defusedxml) exist for hostile or unauthenticated inputs.

lxml.etree with Unsafe Configuration

# VULNERABLE - lxml with entity resolution and DTD loading enabled

from lxml import etree

def parse_lxml(xml_string):
    """Return the text of the first <name> descendant, parsed with lxml.

    Intentionally insecure: every option below is the unsafe setting
    that this example exists to demonstrate.
    """
    unsafe_parser = etree.XMLParser(
        resolve_entities=True,  # entity references are resolved
        load_dtd=True,          # DTDs are loaded
        no_network=False        # network lookups are not blocked
    )
    document_root = etree.fromstring(xml_string, parser=unsafe_parser)
    name_node = document_root.find('.//name')
    return name_node.text

Why this is vulnerable:

  • DTD loading and entity resolution are explicitly enabled.
  • Enables file disclosure, SSRF, and entity expansion DoS.

xml.dom.minidom

# VULNERABLE - minidom with default parsing

import xml.dom.minidom

def parse_minidom(xml_string):
    """Return the data of the first child node of the first <name> element.

    Shown as the vulnerable pattern: minidom's default parsing is used as-is.
    """
    document = xml.dom.minidom.parseString(xml_string)  # DANGEROUS: default parsing, no hardening
    first_name = document.getElementsByTagName('name')[0]
    return first_name.firstChild.data

Why this is vulnerable:

  • DTD handling can enable entity expansion and DoS.
  • External entity behavior depends on the underlying parser.

xml.sax

# VULNERABLE - SAX parser used without explicitly disabling external-entity features

import xml.sax
from xml.sax.handler import ContentHandler

class MyHandler(ContentHandler):
    """Content handler that ignores every start-tag event."""

    def startElement(self, name, attrs):
        """Receive a start-tag event and do nothing with it."""
        return None

def parse_sax(xml_string):
    """Feed xml_string through xml.sax with a fresh MyHandler.

    Shown as the vulnerable pattern: no parser features are configured
    before parsing.
    """
    xml.sax.parseString(xml_string, MyHandler())

Why this is vulnerable:

  • External entity and DTD handling are parser-dependent across SAX implementations.
  • Direct standard-library parsing is not the preferred boundary for hostile XML when defusedxml is available.

Secure Patterns

# SECURE - defusedxml library blocks XXE by default

import defusedxml.ElementTree as ET

def parse_xml_secure(xml_string):
    """Parse XML safely with defusedxml and return the root's <name> text.

    Args:
        xml_string: The XML document to parse.

    Returns:
        The text of the root element's first <name> child (None when the
        element is present but has no text).

    Raises:
        ValueError: If the document is not well-formed or has no <name>
            child. Per the defusedxml documentation, the errors it raises
            for forbidden constructs are ValueError subclasses, so they
            reach the caller as ValueError too.
    """
    try:
        root = ET.fromstring(xml_string)
    except ET.ParseError as e:
        # Chain the parser error so its position details stay available.
        raise ValueError(f"Invalid XML: {e}") from e

    name = root.find('name')
    if name is None:
        # Without this check a missing element surfaces as AttributeError
        # instead of the ValueError this function reports for bad input.
        raise ValueError("Invalid XML: missing <name> element")
    return name.text


# Installation: pip install defusedxml

Why this works:

  • Rejects entity declarations and external references by default for the supported standard-library APIs; pass forbid_dtd=True to also reject any DOCTYPE.
  • Drop-in replacement removes the need for parser-specific flags.

lxml with Secure Configuration

# SECURE - lxml with no_network and resolve_entities=False

from lxml import etree

def parse_lxml_secure(xml_string):
    """Parse XML with a hardened lxml parser and return the first <name> text.

    Args:
        xml_string: The XML document, as str or bytes.

    Returns:
        The text of the first <name> descendant of the root (None when the
        element is present but has no text).

    Raises:
        ValueError: If the document is not well-formed or contains no
            <name> element.
    """
    parser = etree.XMLParser(
        no_network=True,           # Block network access
        resolve_entities=False,    # Don't resolve entities
        load_dtd=False,            # Don't load external DTDs
        dtd_validation=False       # Don't validate against DTD
    )

    # str input is encoded to UTF-8 bytes as before; bytes input is passed
    # through untouched instead of failing on the missing .encode().
    if isinstance(xml_string, str):
        data = xml_string.encode('utf-8')
    else:
        data = xml_string

    try:
        root = etree.fromstring(data, parser=parser)
    except etree.XMLSyntaxError as e:
        raise ValueError(f"Invalid XML: {e}") from e

    name = root.find('.//name')
    if name is None:
        # Report a missing element as bad input rather than AttributeError.
        raise ValueError("Invalid XML: missing <name> element")
    return name.text

Why this works:

  • Disables network access, DTDs, and entity resolution.
  • Safe when you must use lxml for performance/features.

xml.etree with Manual Entity Prevention

# DEFENSE IN DEPTH - Check for entities before parsing

import xml.etree.ElementTree as ET
import re

def parse_xml_with_validation(xml_string):
    """Reject XML that mentions DTDs or custom entities, then parse it.

    Returns the parsed root element. Raises ValueError when a DOCTYPE or
    ENTITY declaration is present, when a named entity reference other
    than the five predefined ones is present, or when parsing fails.
    """
    # Markers that introduce a DTD or an entity declaration
    for marker in ('<!DOCTYPE', '<!ENTITY'):
        if marker in xml_string:
            raise ValueError("XML contains DTD/entities - rejected")

    # Any named reference apart from amp, lt, gt, quot and apos
    custom_reference = re.search(
        r'&(?!amp;|lt;|gt;|quot;|apos;)[a-zA-Z0-9_]+;',
        xml_string,
    )
    if custom_reference is not None:
        raise ValueError("XML contains entity references - rejected")

    try:
        return ET.fromstring(xml_string)
    except ET.ParseError as parse_error:
        raise ValueError(f"Invalid XML: {parse_error}")

# Note: defusedxml is still preferred over this brittle boundary check

Why this works:

  • Rejects obvious DOCTYPE/ENTITY and custom entity refs up front.
  • Useful as a boundary check, but not a replacement for defusedxml or secure parser settings.

Framework-Specific Guidance

Django

# SECURE - Django views with defusedxml

from django.http import JsonResponse, HttpResponseBadRequest
from django.views.decorators.csrf import csrf_exempt
import defusedxml.ElementTree as ET

@csrf_exempt
def process_xml(request):
    """Process an XML upload securely and create a user from it.

    Expects a POST whose body is a UTF-8 XML document with <name> and
    <email> children directly under the root element.

    Returns:
        JsonResponse with the created user's id, name and email, or
        HttpResponseBadRequest when the method is not POST, the body is not
        acceptable XML, or a field is missing or invalid.

    Errors unrelated to the request content (for example database
    failures) are not caught here, so they are neither reported as a
    client error nor echoed back in the response body.
    """
    # NOTE(review): csrf_exempt disables Django's CSRF check for this view;
    # confirm the endpoint is protected by other means.
    if request.method != 'POST':
        return HttpResponseBadRequest('POST required')

    try:
        xml_data = request.body.decode('utf-8')

        # Parse with defusedxml
        root = ET.fromstring(xml_data)
    except ET.ParseError as e:
        return HttpResponseBadRequest(f'Invalid XML: {e}')
    except ValueError:
        # UnicodeDecodeError is a ValueError, and per the defusedxml
        # documentation so are its errors for forbidden constructs.
        return HttpResponseBadRequest('Invalid XML')

    # Extract data; findtext() yields None for a missing element, so a
    # missing field is handled by the checks below, not by AttributeError.
    name = root.findtext('name')
    email = root.findtext('email')

    # Validate
    if not name or len(name) > 100:
        return HttpResponseBadRequest('Invalid name')

    if not email or '@' not in email:
        return HttpResponseBadRequest('Invalid email')

    # Process data
    # NOTE(review): User is not defined in this snippet; presumably the
    # application's user model is imported elsewhere - confirm.
    user = User.objects.create(name=name, email=email)

    return JsonResponse({
        'id': user.id,
        'name': user.name,
        'email': user.email
    })

Flask

# SECURE - Flask API with defusedxml

from flask import Flask, request, jsonify
import defusedxml.ElementTree as ET

# Flask application object that the route decorators in this example attach to.
app = Flask(__name__)

@app.route('/api/users', methods=['POST'])
def create_user():
    """Create user from XML.

    Expects an application/xml request body that is a UTF-8 XML document
    with <name> and <email> children directly under the root element.

    Returns:
        A (JSON response, status) pair: the user dict with 201 on success,
        or an {'error': ...} dict with 400 when the content type, the XML
        or a field is not acceptable.

    Unexpected errors are not caught here, so their text is not echoed
    back to the client.
    """
    # mimetype is the content type without parameters, so a header such as
    # "application/xml; charset=utf-8" is accepted.
    if request.mimetype != 'application/xml':
        return jsonify({'error': 'Content-Type must be application/xml'}), 400

    try:
        xml_data = request.data.decode('utf-8')

        # Parse securely
        root = ET.fromstring(xml_data)
    except ET.ParseError as e:
        return jsonify({'error': f'Invalid XML: {str(e)}'}), 400
    except ValueError:
        # UnicodeDecodeError is a ValueError, and per the defusedxml
        # documentation so are its errors for forbidden constructs; both
        # are client errors, not server faults.
        return jsonify({'error': 'Invalid XML'}), 400

    # Extract and validate
    name = root.find('name')
    email = root.find('email')

    if name is None or not name.text:
        return jsonify({'error': 'Name is required'}), 400

    if email is None or not email.text or '@' not in email.text:
        return jsonify({'error': 'Valid email is required'}), 400

    # Create user
    user = {
        'name': name.text,
        'email': email.text
    }

    # Save to database...

    return jsonify(user), 201

# Start the development server only when this file is executed directly.
if __name__ == '__main__':
    # The debugger is explicitly switched off.
    app.run(debug=False)

FastAPI

# SECURE - FastAPI with defusedxml

from fastapi import FastAPI, Request, HTTPException
from pydantic import BaseModel, EmailStr
import defusedxml.ElementTree as ET

# FastAPI application object that the route decorators in this example attach to.
app = FastAPI()

# Pydantic model used as the response model of the /users endpoint.
class User(BaseModel):
    # User's name as a plain string.
    name: str
    # Email address, declared with Pydantic's EmailStr type.
    email: EmailStr

@app.post("/users", response_model=User)
async def create_user(request: Request):
    """Create user from XML request.

    Expects an application/xml request body that is a UTF-8 XML document
    with <name> and <email> children directly under the root element.

    Returns:
        The validated User model.

    Raises:
        HTTPException: With status 400 when the content type, the XML
            document or the extracted fields are not acceptable.
    """

    # Check content type: compare only the media type so that parameters
    # such as "; charset=utf-8" do not cause a rejection.
    content_type = request.headers.get('content-type', '')
    media_type = content_type.split(';', 1)[0].strip().lower()
    if media_type != 'application/xml':
        raise HTTPException(400, 'Content-Type must be application/xml')

    try:
        # Read XML body
        xml_data = await request.body()
        xml_string = xml_data.decode('utf-8')

        # Parse securely with defusedxml
        root = ET.fromstring(xml_string)
    except ET.ParseError as e:
        raise HTTPException(400, f'Invalid XML: {str(e)}') from e
    except ValueError as e:
        # UnicodeDecodeError is a ValueError, and per the defusedxml
        # documentation so are its errors for forbidden constructs.
        raise HTTPException(400, 'Invalid XML') from e

    # Extract data
    name_elem = root.find('name')
    email_elem = root.find('email')

    if name_elem is None or not name_elem.text:
        raise HTTPException(400, 'Name is required')

    if email_elem is None or not email_elem.text:
        raise HTTPException(400, 'Email is required')

    # Create User (Pydantic validates)
    try:
        user = User(
            name=name_elem.text,
            email=email_elem.text
        )
    except ValueError as e:
        # Pydantic's ValidationError derives from ValueError; report bad
        # field values as a client error instead of an unhandled exception.
        raise HTTPException(400, 'Invalid user data') from e

    # Save to database...

    return user

RSS/Atom Feed Parsing

# SECURE - Parse RSS feeds with defusedxml

import defusedxml.ElementTree as ET
import requests

def parse_rss_feed(feed_url):
    """Download an RSS feed and return its items as title/link dicts.

    Items lacking a <title> or a <link> element are left out. The request
    uses a 10 second timeout and raise_for_status() is called on the
    response before parsing.
    """
    reply = requests.get(feed_url, timeout=10)
    reply.raise_for_status()

    # The downloaded bytes are parsed by defusedxml
    feed_root = ET.fromstring(reply.content)

    candidates = (
        (node.find('title'), node.find('link'))
        for node in feed_root.findall('.//item')
    )
    return [
        {'title': title_node.text, 'link': link_node.text}
        for title_node, link_node in candidates
        if title_node is not None and link_node is not None
    ]

# Alternative: Use a maintained feed parser and keep it updated

import feedparser

def parse_feed_with_feedparser(feed_url):
    """Parse a feed with feedparser and return title/link dicts per entry."""
    parsed = feedparser.parse(feed_url)

    results = []
    for entry in parsed.entries:
        results.append({
            'title': entry.title,
            'link': entry.link
        })
    return results

SOAP/XML-RPC

# SECURE - SOAP response parsing with hardened lxml

from lxml import etree
import requests

def call_soap_service(endpoint, xml_request, timeout=10, soap_action='urn:action'):
    """Call a SOAP service and return the <Body> element of its response.

    Args:
        endpoint: URL the request is posted to.
        xml_request: Request payload sent as the POST body.
        timeout: Timeout in seconds handed to requests, so an unresponsive
            service cannot block the caller indefinitely.
        soap_action: Value sent in the SOAPAction header.

    Returns:
        The first element named Body in the
        http://schemas.xmlsoap.org/soap/envelope/ namespace, or None when
        the response contains no such element.

    Raises:
        requests.HTTPError: If raise_for_status() rejects the response.
        etree.XMLSyntaxError: If the response body is not well-formed XML.
    """

    headers = {
        'Content-Type': 'text/xml; charset=utf-8',
        'SOAPAction': soap_action
    }

    # Send request
    response = requests.post(
        endpoint,
        data=xml_request,
        headers=headers,
        timeout=timeout
    )
    response.raise_for_status()

    # The response comes from a remote party, so it is parsed with the
    # hardened lxml settings.
    parser = etree.XMLParser(
        no_network=True,
        resolve_entities=False,
        load_dtd=False,
        dtd_validation=False
    )
    root = etree.fromstring(response.content, parser=parser)

    # Extract data from SOAP envelope
    body = root.find('.//{http://schemas.xmlsoap.org/soap/envelope/}Body')
    return body

Input Validation

# Validate XML structure after parsing

import defusedxml.ElementTree as ET

def parse_and_validate_user_xml(xml_string):
    """Parse and validate user XML.

    Args:
        xml_string: XML document whose root element is <user> with <name>
            and <email> children and an optional <age> child.

    Returns:
        A dict with 'name' and 'email' as strings and 'age' as an int, or
        None for 'age' when the element is absent or empty.

    Raises:
        ValueError: If the root element is not <user>, a required field is
            missing or empty, the name exceeds 100 characters, the email
            has no '@', or the age is not an integer between 0 and 150.
    """

    # Parse securely
    root = ET.fromstring(xml_string)

    # Validate root element
    if root.tag != 'user':
        raise ValueError('Root element must be <user>')

    # Extract required fields
    name = root.find('name')
    email = root.find('email')
    age = root.find('age')

    # Validate presence
    if name is None or not name.text:
        raise ValueError('Name is required')

    if email is None or not email.text:
        raise ValueError('Email is required')

    # Validate content
    if len(name.text) > 100:
        raise ValueError('Name too long')

    if '@' not in email.text:
        raise ValueError('Invalid email format')

    age_value = None
    if age is not None and age.text:
        try:
            age_value = int(age.text)
        except ValueError:
            raise ValueError('Age must be an integer') from None
        # The range check sits outside the try block so that its error is
        # not caught and rewritten as "Age must be an integer".
        if age_value < 0 or age_value > 150:
            raise ValueError('Age out of range')

    return {
        'name': name.text,
        'email': email.text,
        'age': age_value
    }

Additional Resources