CWE-611: XML External Entity (XXE) Injection - Python
Overview
Python XML parsing is risky for untrusted input because parser behavior depends on the library, Python build, and underlying Expat/libxml2 version. Standard-library parsers do not fetch local files or open network connections through Expat by default, but untrusted XML can still create entity-expansion and resource-exhaustion risk, and third-party parsers can reintroduce external entity handling when unsafe options are enabled.
Primary Defence: Use defusedxml for standard-library XML parsing of untrusted data, or configure third-party parsers to disable DTD loading, external entities, network access, and oversized trees before parsing.
Common Vulnerable Patterns
xml.etree.ElementTree for Hostile Input
# RISKY - Standard parser used directly on hostile XML
import xml.etree.ElementTree as ET
def parse_xml(xml_string):
    """Return the text of the first <name> child of the document root.

    Deliberately unhardened: the input goes straight to the standard
    ElementTree parser with no DTD/entity screening.
    """
    return ET.fromstring(xml_string).find('name').text
# Attacker sends:
# <?xml version="1.0"?>
# <!DOCTYPE foo [<!ENTITY xxe SYSTEM "file:///etc/passwd">]>
# <root><name>&xxe;</name></root>
Why this is risky:
- Security depends on Python's bundled or system Expat version and parser limits.
- Safer alternatives (defusedxml) exist for hostile or unauthenticated inputs.
lxml.etree with Unsafe Configuration
# VULNERABLE - lxml with entity resolution and DTD loading enabled
from lxml import etree
def parse_lxml(xml_string):
    """Return the text of the first <name> descendant, parsed with lxml.

    Deliberately unsafe: the parser is built with DTD loading, entity
    resolution and network access all switched on.
    """
    unsafe_options = {
        'load_dtd': True,
        'resolve_entities': True,
        'no_network': False,
    }
    tree_root = etree.fromstring(
        xml_string, parser=etree.XMLParser(**unsafe_options)
    )
    name_node = tree_root.find('.//name')
    return name_node.text
Why this is vulnerable:
- DTD loading and entity resolution are explicitly enabled.
- Enables file disclosure, SSRF, and entity expansion DoS.
xml.dom.minidom
# VULNERABLE - minidom with default parsing
import xml.dom.minidom
def parse_minidom(xml_string):
    """Return the character data of the first <name> element's first child.

    Deliberately unhardened: minidom parses the input with its defaults.
    """
    document = xml.dom.minidom.parseString(xml_string)  # DANGEROUS!
    first_name_node = document.getElementsByTagName('name')[0]
    return first_name_node.firstChild.data
Why this is vulnerable:
- DTD handling can enable entity expansion and DoS.
- External entity behavior depends on the underlying parser.
xml.sax
# VULNERABLE - SAX parser without features
import xml.sax
from xml.sax.handler import ContentHandler
class MyHandler(ContentHandler):
    """SAX content handler that ignores every element it is told about."""

    def startElement(self, name, attrs):
        """Accept the start-of-element event and do nothing with it."""
        return None
def parse_sax(xml_string):
    """Feed *xml_string* through the SAX parser using a fresh MyHandler.

    Deliberately unhardened: no parser features are configured before
    parsing. Nothing is returned; the handler discards all events.
    """
    xml.sax.parseString(xml_string, MyHandler())
Why this is vulnerable:
- External entity and DTD handling are parser-dependent across SAX implementations.
- Direct standard-library parsing is not the preferred boundary for hostile XML when defusedxml is available.
Secure Patterns
defusedxml (Recommended)
# SECURE - defusedxml library blocks XXE by default
import defusedxml.ElementTree as ET
def parse_xml_secure(xml_string):
    """Parse XML safely with defusedxml and return the root's <name> text.

    Args:
        xml_string: XML document to parse.

    Returns:
        The text of the first <name> child of the root element (None when
        the element is present but empty).

    Raises:
        ValueError: If the document is not well-formed or has no <name>
            child under the root.
    """
    try:
        root = ET.fromstring(xml_string)
    except ET.ParseError as e:
        # Keep the parser error as __cause__ for debugging.
        raise ValueError(f"Invalid XML: {e}") from e
    name = root.find('name')
    if name is None:
        # Report a structurally wrong document the same way as a malformed
        # one instead of leaking an AttributeError to the caller.
        raise ValueError("Invalid XML: missing <name> element")
    return name.text
# Installation: pip install defusedxml
Why this works:
- Blocks DTDs, entities, and external references for the supported standard-library APIs.
- Drop-in replacement removes the need for parser-specific flags.
lxml with Secure Configuration
# SECURE - lxml with no_network and resolve_entities=False
from lxml import etree
def parse_lxml_secure(xml_string):
    """Parse XML with lxml securely and return the first <name> text.

    Args:
        xml_string: XML document as str or bytes.

    Returns:
        The text of the first <name> descendant of the root.

    Raises:
        ValueError: If the document is not well-formed or contains no
            <name> element.
    """
    parser = etree.XMLParser(
        no_network=True,         # Block network access
        resolve_entities=False,  # Don't resolve entities
        load_dtd=False,          # Don't load external DTDs
        dtd_validation=False,    # Don't validate against DTD
    )
    # The parser is handed bytes; input that is already bytes is passed
    # through untouched instead of failing on a missing .encode().
    if isinstance(xml_string, str):
        data = xml_string.encode('utf-8')
    else:
        data = xml_string
    try:
        root = etree.fromstring(data, parser=parser)
    except etree.XMLSyntaxError as e:
        raise ValueError(f"Invalid XML: {e}") from e
    name = root.find('.//name')
    if name is None:
        # Surface a missing element as a validation error, not AttributeError.
        raise ValueError("Invalid XML: missing <name> element")
    return name.text
Why this works:
- Disables network access, DTDs, and entity resolution.
- Safe when you must use lxml for performance/features.
xml.etree with Manual Entity Prevention
# DEFENSE IN DEPTH - Check for entities before parsing
import xml.etree.ElementTree as ET
import re
def parse_xml_with_validation(xml_string):
    """Parse XML after validating that no DTD or entity usage is present.

    This is a textual pre-check in front of the standard ElementTree parser.
    It is intentionally conservative: a custom-looking entity reference is
    rejected wherever it appears in the text, including inside CDATA.

    Args:
        xml_string: XML document as a str.

    Returns:
        The parsed root element.

    Raises:
        ValueError: If the text contains a DOCTYPE/ENTITY declaration, an
            entity reference other than the five predefined ones or a
            numeric character reference, or is not well-formed XML.
    """
    # Block if contains DOCTYPE or entity declarations
    if '<!DOCTYPE' in xml_string or '<!ENTITY' in xml_string:
        raise ValueError("XML contains DTD/entities - rejected")
    # Block entity references. XML names may contain '.', '-', ':' and
    # non-ASCII characters, so match anything up to the ';' rather than only
    # [a-zA-Z0-9_]. Predefined entities and numeric references (&#...;) pass.
    if re.search(r'&(?!(?:amp|lt|gt|quot|apos);|#)[^\s&;<>]+;', xml_string):
        raise ValueError("XML contains entity references - rejected")
    try:
        return ET.fromstring(xml_string)
    except ET.ParseError as e:
        raise ValueError(f"Invalid XML: {e}") from e
# Note: defusedxml is still preferred over this brittle boundary check
Why this works:
- Rejects obvious DOCTYPE/ENTITY and custom entity refs up front.
- Useful as a boundary check, but not a replacement for defusedxml or secure parser settings.
Framework-Specific Guidance
Django
# SECURE - Django views with defusedxml
from django.http import JsonResponse, HttpResponseBadRequest
from django.views.decorators.csrf import csrf_exempt
import defusedxml.ElementTree as ET
@csrf_exempt
def process_xml(request):
    """Process XML upload securely.

    Expects a POST whose body is an XML document with <name> and <email>
    children under the root. Returns a JSON description of the created user,
    or a 400 response describing what was wrong with the request.

    Errors raised while creating the user are not caught here, so they are
    not reported to the client as a bad request and their text is not echoed
    back.
    """
    if request.method != 'POST':
        return HttpResponseBadRequest('POST required')
    try:
        xml_data = request.body.decode('utf-8')
        # Parse with defusedxml
        root = ET.fromstring(xml_data)
    except (ET.ParseError, ValueError) as e:
        # ValueError covers a body that is not valid UTF-8 and the
        # DTD/entity rejections raised by defusedxml.
        return HttpResponseBadRequest(f'Invalid XML: {e}')
    # Extract data; a missing element is treated like an empty one
    name_elem = root.find('name')
    email_elem = root.find('email')
    name = name_elem.text if name_elem is not None else None
    email = email_elem.text if email_elem is not None else None
    # Validate
    if not name or len(name) > 100:
        return HttpResponseBadRequest('Invalid name')
    if not email or '@' not in email:
        return HttpResponseBadRequest('Invalid email')
    # Process data
    user = User.objects.create(name=name, email=email)
    return JsonResponse({
        'id': user.id,
        'name': user.name,
        'email': user.email
    })
Flask
# SECURE - Flask API with defusedxml
from flask import Flask, request, jsonify
import defusedxml.ElementTree as ET
app = Flask(__name__)
@app.route('/api/users', methods=['POST'])
def create_user():
    """Create user from XML.

    Expects an application/xml body with <name> and <email> children under
    the root. Returns the user as JSON with status 201, or a JSON error with
    status 400 when the request is not acceptable.
    """
    # mimetype is the media type without parameters, so a header such as
    # "application/xml; charset=utf-8" is accepted.
    if request.mimetype != 'application/xml':
        return jsonify({'error': 'Content-Type must be application/xml'}), 400
    try:
        xml_data = request.data.decode('utf-8')
        # Parse securely
        root = ET.fromstring(xml_data)
    except (ET.ParseError, ValueError) as e:
        # ValueError covers a body that is not valid UTF-8 and the
        # DTD/entity rejections raised by defusedxml; both are client errors.
        return jsonify({'error': f'Invalid XML: {str(e)}'}), 400
    # Extract and validate
    name = root.find('name')
    email = root.find('email')
    if name is None or not name.text:
        return jsonify({'error': 'Name is required'}), 400
    if email is None or not email.text or '@' not in email.text:
        return jsonify({'error': 'Valid email is required'}), 400
    # Create user
    user = {
        'name': name.text,
        'email': email.text
    }
    # Save to database...
    return jsonify(user), 201
if __name__ == '__main__':
app.run(debug=False)
FastAPI
# SECURE - FastAPI with defusedxml
from fastapi import FastAPI, Request, HTTPException
from pydantic import BaseModel, EmailStr
import defusedxml.ElementTree as ET
app = FastAPI()
# Pydantic model used as the response model of the /users endpoint.
class User(BaseModel):
    # Display name taken from the <name> element
    name: str
    # Address taken from the <email> element; typed as EmailStr
    email: EmailStr
@app.post("/users", response_model=User)
async def create_user(request: Request):
    """Create user from XML request.

    Expects an application/xml body with <name> and <email> children under
    the root.

    Raises:
        HTTPException: 400 when the Content-Type is wrong, the body is not
            acceptable XML, or a required field is missing or invalid.
    """
    # Check content type, ignoring parameters such as "; charset=utf-8"
    content_type = request.headers.get('content-type', '')
    media_type = content_type.split(';', 1)[0].strip().lower()
    if media_type != 'application/xml':
        raise HTTPException(400, 'Content-Type must be application/xml')
    # Read XML body
    xml_data = await request.body()
    try:
        xml_string = xml_data.decode('utf-8')
        # Parse securely with defusedxml
        root = ET.fromstring(xml_string)
    except (ET.ParseError, ValueError) as e:
        # ValueError covers a body that is not valid UTF-8 and the
        # DTD/entity rejections raised by defusedxml.
        raise HTTPException(400, f'Invalid XML: {str(e)}')
    # Extract data
    name_elem = root.find('name')
    email_elem = root.find('email')
    if name_elem is None or not name_elem.text:
        raise HTTPException(400, 'Name is required')
    if email_elem is None or not email_elem.text:
        raise HTTPException(400, 'Email is required')
    # Create User (Pydantic validates)
    try:
        user = User(
            name=name_elem.text,
            email=email_elem.text
        )
    except ValueError as e:
        # Pydantic's validation error is a ValueError; report bad field
        # values as a client error instead of an unhandled exception.
        raise HTTPException(400, f'Invalid user data: {e}')
    # Save to database...
    return user
RSS/Atom Feed Parsing
# SECURE - Parse RSS feeds with defusedxml
import defusedxml.ElementTree as ET
import requests
def parse_rss_feed(feed_url):
    """Download an RSS document and return its items as title/link dicts.

    Items that lack either a <title> or a <link> element are skipped.
    Raises whatever requests raises for transport errors or a non-success
    HTTP status.
    """
    # Fetch feed
    reply = requests.get(feed_url, timeout=10)
    reply.raise_for_status()
    # Parse with defusedxml
    tree_root = ET.fromstring(reply.content)
    candidates = (
        (node.find('title'), node.find('link'))
        for node in tree_root.findall('.//item')
    )
    return [
        {'title': heading.text, 'link': url.text}
        for heading, url in candidates
        if heading is not None and url is not None
    ]
# Alternative: Use a maintained feed parser and keep it updated
import feedparser
def parse_feed_with_feedparser(feed_url):
    """Return title/link dicts for every entry feedparser finds at *feed_url*."""
    parsed = feedparser.parse(feed_url)
    results = []
    for entry in parsed.entries:
        results.append({
            'title': entry.title,
            'link': entry.link
        })
    return results
SOAP/XML-RPC
# SECURE - SOAP response parsing with hardened lxml
from lxml import etree
import requests
def call_soap_service(endpoint, xml_request, timeout=10):
    """Call SOAP service securely and return the response's SOAP Body.

    Args:
        endpoint: URL of the SOAP service.
        xml_request: Request envelope to POST.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout the request can block indefinitely on an unresponsive
            service.

    Returns:
        The SOAP 1.1 <Body> element of the response, or None if the
        response has no such element.

    Raises:
        requests.RequestException: On transport errors, timeout, or a
            non-success HTTP status.
        lxml.etree.XMLSyntaxError: If the response is not well-formed XML.
    """
    headers = {
        'Content-Type': 'text/xml; charset=utf-8',
        'SOAPAction': 'urn:action'
    }
    # Send request
    response = requests.post(
        endpoint, data=xml_request, headers=headers, timeout=timeout
    )
    response.raise_for_status()
    # The response is remote-controlled input, so parse it hardened
    parser = etree.XMLParser(
        no_network=True,
        resolve_entities=False,
        load_dtd=False,
        dtd_validation=False
    )
    root = etree.fromstring(response.content, parser=parser)
    # Extract data from SOAP envelope
    body = root.find('.//{http://schemas.xmlsoap.org/soap/envelope/}Body')
    return body
Input Validation
# Validate XML structure after parsing
import defusedxml.ElementTree as ET
def parse_and_validate_user_xml(xml_string):
    """Parse and validate user XML.

    Args:
        xml_string: XML document whose root is <user> with <name>, <email>
            and an optional <age> child.

    Returns:
        A dict with keys 'name', 'email' and 'age'; 'age' is an int, or
        None when the element is absent or empty.

    Raises:
        ValueError: If the root tag is wrong, a required field is missing,
            the name exceeds 100 characters, the email lacks '@', or the
            age is not an integer in the range 0-150.
    """
    # Parse securely
    root = ET.fromstring(xml_string)
    # Validate root element
    if root.tag != 'user':
        raise ValueError('Root element must be <user>')
    # Extract required fields
    name = root.find('name')
    email = root.find('email')
    age = root.find('age')
    # Validate presence
    if name is None or not name.text:
        raise ValueError('Name is required')
    if email is None or not email.text:
        raise ValueError('Email is required')
    # Validate content
    if len(name.text) > 100:
        raise ValueError('Name too long')
    if '@' not in email.text:
        raise ValueError('Invalid email format')
    age_int = None
    if age is not None and age.text:
        try:
            age_int = int(age.text)
        except ValueError as exc:
            raise ValueError('Age must be an integer') from exc
        # The range check sits outside the try so its ValueError is not
        # caught and rewritten as 'Age must be an integer'.
        if age_int < 0 or age_int > 150:
            raise ValueError('Age out of range')
    return {
        'name': name.text,
        'email': email.text,
        'age': age_int
    }