CWE-611: XML External Entity (XXE) Injection - Python
Overview
Python XML parsing is risky for untrusted input because some parsers allow DTDs, entities, or entity expansion. Use defusedxml or configure parsers securely to prevent file disclosure, SSRF, and denial of service attacks.
Primary Defence: Use the defusedxml library as a drop-in replacement for the standard XML parsers. By default it rejects entity declarations and external references; pass forbid_dtd=True to reject any DOCTYPE as well.
Common Vulnerable Patterns
xml.etree.ElementTree (Default)
# VULNERABLE - Default ElementTree parsing for untrusted XML
import xml.etree.ElementTree as ET
def parse_xml(xml_string):
    """Return the text of the root's <name> child (intentionally unsafe demo).

    Parses ``xml_string`` with the stock ``xml.etree.ElementTree`` parser and
    no hardening, which is the vulnerable pattern this section illustrates.

    Raises:
        AttributeError: if the root element has no ``<name>`` child.
    """
    root = ET.fromstring(xml_string)  # DANGEROUS!
    return root.find('name').text
# Attacker sends:
# <?xml version="1.0"?>
# <!DOCTYPE foo [<!ENTITY xxe SYSTEM "file:///etc/passwd">]>
# <root><name>&xxe;</name></root>
Why this is vulnerable:
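- `xml.etree.ElementTree` does not fetch external entities (a reference to one raises `ParseError`), but internal entity expansion (billion laughs, quadratic blowup) can exhaust memory and CPU when Python is linked against Expat older than 2.4.1.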
- Safer alternatives (defusedxml) exist for hostile inputs.
lxml.etree (Default Configuration)
# VULNERABLE - lxml with default settings
from lxml import etree
def parse_lxml(xml_string):
    """Return the text of the first <name> descendant (intentionally unsafe demo).

    Uses lxml's default parser with no hardening options.

    Raises:
        AttributeError: if the document contains no ``<name>`` element.
    """
    # Allows external entities! (resolve_entities defaults to True before lxml 5.0)
    root = etree.fromstring(xml_string)
    return root.find('.//name').text
Why this is vulnerable:
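- Entity resolution is enabled by default: before lxml 5.0 `resolve_entities` defaults to `True`, so external entities declared in an inline DTD are expanded (lxml 5.0+ defaults to `'internal'`, which still expands internal entities).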
- Enables file disclosure, SSRF, and entity expansion DoS.
xml.dom.minidom
# VULNERABLE - minidom with default parsing
import xml.dom.minidom
def parse_minidom(xml_string):
    """Return the text of the first <name> element (intentionally unsafe demo).

    Parses ``xml_string`` with ``xml.dom.minidom`` and no hardening.

    Raises:
        IndexError: if the document contains no ``<name>`` element.
        AttributeError: if the first ``<name>`` element is empty.
    """
    doc = xml.dom.minidom.parseString(xml_string)  # DANGEROUS!
    return doc.getElementsByTagName('name')[0].firstChild.data
Why this is vulnerable:
- DTD handling can enable entity expansion and DoS.
- External entity behavior depends on the underlying parser.
xml.sax
# VULNERABLE - SAX parser without features
import xml.sax
from xml.sax.handler import ContentHandler
class MyHandler(ContentHandler):
    """SAX content handler that ignores every start tag (placeholder for the demo)."""

    def startElement(self, name, attrs):
        """Do nothing for each opening element."""
        pass
def parse_sax(xml_string):
    """Feed ``xml_string`` through the default SAX parser (intentionally unsafe demo).

    No parser features are configured, so behaviour depends entirely on the
    interpreter's defaults. Returns ``None``; events go to ``MyHandler``.
    """
    handler = MyHandler()
    # No hardening: external general entities were processed by default
    # before Python 3.7.1, and entity expansion is not limited here.
    xml.sax.parseString(xml_string, handler)
Why this is vulnerable:
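- External general entities were processed by default before Python 3.7.1; newer versions disable them, but this code does nothing to enforce that and applies no limit on entity expansion.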
- Can lead to SSRF, file disclosure, or parser crashes.
Secure Patterns
defusedxml (Recommended)
# SECURE - defusedxml library blocks XXE by default
import defusedxml.ElementTree as ET
def parse_xml_secure(xml_string):
    """Parse XML safely with defusedxml and return the root's <name> text.

    Raises:
        ValueError: if the document is not well-formed or the root element
            has no ``<name>`` child. defusedxml's own rejection exceptions
            derive from ``ValueError`` and propagate unchanged.
    """
    # Keep the try body to the one call that can raise ParseError.
    try:
        root = ET.fromstring(xml_string)
    except ET.ParseError as e:
        raise ValueError(f"Invalid XML: {e}") from e

    name = root.find('name')
    if name is None:
        # Previously surfaced as an AttributeError on ``None.text``.
        raise ValueError("Invalid XML: missing <name> element")
    return name.text
# Installation: pip install defusedxml
Why this works:
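- Rejects entity declarations and external references by default (raising EntitiesForbidden / ExternalReferenceForbidden); DTDs themselves are only rejected when forbid_dtd=True is passed.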
- Drop-in replacement removes the need for parser-specific flags.
lxml with Secure Configuration
# SECURE - lxml with no_network and resolve_entities=False
from lxml import etree
def parse_lxml_secure(xml_string):
    """Parse XML with lxml securely and return the first <name> text.

    Args:
        xml_string: The document as ``str`` or as already-encoded ``bytes``.

    Raises:
        ValueError: if lxml reports a syntax error.
        AttributeError: if the document contains no ``<name>`` element.
    """
    parser = etree.XMLParser(
        no_network=True,         # Block network access
        resolve_entities=False,  # Don't resolve entities
        load_dtd=False,          # Don't load external DTDs
        dtd_validation=False,    # Don't validate against DTD
    )
    # lxml refuses str input that carries an encoding declaration, so it is
    # always given bytes. Bytes callers are passed through untouched instead
    # of crashing on ``.encode``.
    if isinstance(xml_string, str):
        data = xml_string.encode('utf-8')
    else:
        data = xml_string
    try:
        root = etree.fromstring(data, parser=parser)
    except etree.XMLSyntaxError as e:
        raise ValueError(f"Invalid XML: {e}") from e
    return root.find('.//name').text
Why this works:
- Disables network access, DTDs, and entity resolution.
- Safe when you must use lxml for performance/features.
xml.etree with Manual Entity Prevention
# SECURE - Check for entities before parsing
import xml.etree.ElementTree as ET
import re
def parse_xml_with_validation(xml_string):
    """Parse XML after validating that no DTD or custom entities are present.

    Args:
        xml_string: The document as ``str``. Bytes are refused because the
            textual pre-checks below cannot be trusted on encoded data.

    Returns:
        The parsed root ``Element``.

    Raises:
        TypeError: if ``xml_string`` is not a ``str``.
        ValueError: if the text contains a DOCTYPE/ENTITY declaration, a
            non-predefined entity reference, or is not well-formed.
    """
    if not isinstance(xml_string, str):
        raise TypeError("xml_string must be str so the DTD/entity checks apply")

    # Block if contains DOCTYPE or entity declarations
    if '<!DOCTYPE' in xml_string or '<!ENTITY' in xml_string:
        raise ValueError("XML contains DTD/entities - rejected")

    # Block entity references other than the five predefined XML entities
    if re.search(r'&(?!amp;|lt;|gt;|quot;|apos;)[a-zA-Z0-9_]+;', xml_string):
        raise ValueError("XML contains entity references - rejected")

    try:
        return ET.fromstring(xml_string)
    except ET.ParseError as e:
        raise ValueError(f"Invalid XML: {e}") from e
# Note: defusedxml is still preferred over this approach
Why this works:
- Rejects DOCTYPE/ENTITY and custom entity refs up front.
- Defense-in-depth when defusedxml is unavailable.
Framework-Specific Guidance
Django
# SECURE - Django views with defusedxml
from django.http import JsonResponse, HttpResponseBadRequest
from django.views.decorators.csrf import csrf_exempt
import defusedxml.ElementTree as ET
@csrf_exempt
def process_xml(request):
    """Process an XML upload securely and create a user from it.

    Expects a POST whose body is UTF-8 XML with ``<name>`` and ``<email>``
    children directly under the root element.

    Returns:
        ``JsonResponse`` with the created user's id, name and email, or
        ``HttpResponseBadRequest`` for a non-POST request, an unparsable or
        rejected document, or a missing/invalid field.
    """
    if request.method != 'POST':
        return HttpResponseBadRequest('POST required')

    # Only decoding and parsing belong in the try. ValueError covers both an
    # undecodable body and defusedxml's rejection exceptions (ValueError
    # subclasses), so hostile documents are answered with a 400.
    try:
        xml_data = request.body.decode('utf-8')
        # Parse with defusedxml
        root = ET.fromstring(xml_data)
    except (ET.ParseError, ValueError) as e:
        return HttpResponseBadRequest(f'Invalid XML: {e}')

    # Extract data; findtext yields None for a missing element instead of
    # raising AttributeError on ``None.text``.
    name = root.findtext('name')
    email = root.findtext('email')

    # Validate
    if not name or len(name) > 100:
        return HttpResponseBadRequest('Invalid name')
    if not email or '@' not in email:
        return HttpResponseBadRequest('Invalid email')

    # Process data. Unexpected failures here (e.g. database errors) are left
    # to Django's 500 handling rather than echoed back to the client.
    # NOTE(review): ``User`` is not imported in this snippet - presumably the
    # project's user model; confirm the import.
    user = User.objects.create(name=name, email=email)
    return JsonResponse({
        'id': user.id,
        'name': user.name,
        'email': user.email
    })
Flask
# SECURE - Flask API with defusedxml
from flask import Flask, request, jsonify
import defusedxml.ElementTree as ET
app = Flask(__name__)
@app.route('/api/users', methods=['POST'])
def create_user():
    """Create user from XML.

    Returns:
        ``(json, 201)`` with the user's name and email on success, or
        ``(json, 400)`` with an ``error`` message for a wrong Content-Type,
        an unparsable or rejected document, or a missing/invalid field.
    """
    # Compare only the media type so "application/xml; charset=utf-8" is
    # accepted; a missing header still fails the check.
    media_type = (request.content_type or '').split(';', 1)[0].strip().lower()
    if media_type != 'application/xml':
        return jsonify({'error': 'Content-Type must be application/xml'}), 400

    # ValueError covers an undecodable body and defusedxml's rejection
    # exceptions (ValueError subclasses): a hostile payload is a client
    # error, not a 500.
    try:
        xml_data = request.data.decode('utf-8')
        # Parse securely
        root = ET.fromstring(xml_data)
    except (ET.ParseError, ValueError) as e:
        return jsonify({'error': f'Invalid XML: {str(e)}'}), 400

    # Extract and validate
    name = root.find('name')
    email = root.find('email')
    if name is None or not name.text:
        return jsonify({'error': 'Name is required'}), 400
    if email is None or not email.text or '@' not in email.text:
        return jsonify({'error': 'Valid email is required'}), 400

    # Create user
    user = {
        'name': name.text,
        'email': email.text
    }
    # Save to database...
    # Unexpected errors are left to Flask's own 500 handling so exception
    # text is not returned to the client.
    return jsonify(user), 201
# Script entry point: run the development server with the debugger disabled.
if __name__ == '__main__':
    app.run(debug=False)
FastAPI
# SECURE - FastAPI with defusedxml
from fastapi import FastAPI, Request, HTTPException
from pydantic import BaseModel, EmailStr
import defusedxml.ElementTree as ET
app = FastAPI()
class User(BaseModel):
    """User payload: a display name plus an address typed as ``EmailStr``."""

    name: str
    email: EmailStr
@app.post("/users", response_model=User)
async def create_user(request: Request):
    """Create user from XML request.

    Raises:
        HTTPException: status 400 for a wrong Content-Type, an unparsable or
            rejected document, a missing field, or data the ``User`` model
            refuses.
    """
    # Check content type, ignoring parameters such as "; charset=utf-8"
    content_type = request.headers.get('content-type', '')
    if content_type.split(';', 1)[0].strip().lower() != 'application/xml':
        raise HTTPException(400, 'Content-Type must be application/xml')

    # Read XML body
    xml_data = await request.body()

    # ValueError covers an undecodable body and defusedxml's rejection
    # exceptions (ValueError subclasses), which previously escaped as a 500.
    try:
        xml_string = xml_data.decode('utf-8')
        # Parse securely with defusedxml
        root = ET.fromstring(xml_string)
    except (ET.ParseError, ValueError) as e:
        raise HTTPException(400, f'Invalid XML: {str(e)}') from e

    # Extract data
    name_elem = root.find('name')
    email_elem = root.find('email')
    if name_elem is None or not name_elem.text:
        raise HTTPException(400, 'Name is required')
    if email_elem is None or not email_elem.text:
        raise HTTPException(400, 'Email is required')

    # Create User (Pydantic validates). A model built by hand raises
    # ValidationError (a ValueError) inside the handler, so it is mapped to a
    # client error explicitly.
    try:
        user = User(
            name=name_elem.text,
            email=email_elem.text
        )
    except ValueError as e:
        raise HTTPException(400, 'Invalid user data') from e

    # Save to database...
    return user
RSS/Atom Feed Parsing
# SECURE - Parse RSS feeds with defusedxml
import defusedxml.ElementTree as ET
import requests
def parse_rss_feed(feed_url):
    """Safely parse an RSS feed and return its items.

    Args:
        feed_url: URL fetched with a 10-second timeout.

    Returns:
        A list of ``{'title': ..., 'link': ...}`` dicts, one per ``<item>``
        that has both a ``<title>`` and a ``<link>`` child.

    Raises:
        requests.HTTPError: if the response status indicates an error.
    """
    # Fetch feed
    response = requests.get(feed_url, timeout=10)
    response.raise_for_status()

    # Parse with defusedxml; raw bytes are passed so the parser honours the
    # document's own encoding declaration.
    root = ET.fromstring(response.content)

    items = []
    for item in root.findall('.//item'):
        title = item.find('title')
        link = item.find('link')
        if title is not None and link is not None:
            items.append({
                'title': title.text,
                'link': link.text
            })
    return items
# Alternative: Use feedparser library (handles XXE internally)
import feedparser
def parse_feed_with_feedparser(feed_url):
    """Parse feed with feedparser library.

    Returns:
        A list of ``{'title': ..., 'link': ...}`` dicts, one per entry that
        carries both fields. Entries missing either are skipped, mirroring
        ``parse_rss_feed``, instead of raising ``AttributeError``.
    """
    feed = feedparser.parse(feed_url)
    return [
        {
            'title': entry.title,
            'link': entry.link
        }
        for entry in feed.entries
        if 'title' in entry and 'link' in entry
    ]
SOAP/XML-RPC
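# SECURE - SOAP with defusedxml (note: defusedxml.lxml is deprecated upstream; for new code prefer defusedxml.ElementTree or an explicitly hardened lxml parser)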
from defusedxml.lxml import fromstring
import requests
def call_soap_service(endpoint, xml_request, timeout=10):
    """Call a SOAP service securely and return the response's Body element.

    Args:
        endpoint: URL of the SOAP service.
        xml_request: Serialized SOAP envelope to send as the request body.
        timeout: Seconds to wait for the service. Without it, ``requests``
            would wait indefinitely on an unresponsive endpoint.

    Returns:
        The SOAP 1.1 ``Body`` element, or ``None`` if the response has none.

    Raises:
        requests.HTTPError: if the response status indicates an error.
    """
    headers = {
        'Content-Type': 'text/xml; charset=utf-8',
        'SOAPAction': 'urn:action'
    }

    # Send request
    response = requests.post(
        endpoint, data=xml_request, headers=headers, timeout=timeout
    )
    response.raise_for_status()

    # Parse response with defusedxml
    root = fromstring(response.content)

    # Extract data from SOAP envelope
    body = root.find('.//{http://schemas.xmlsoap.org/soap/envelope/}Body')
    return body
Input Validation
# Validate XML structure after parsing
import defusedxml.ElementTree as ET
def parse_and_validate_user_xml(xml_string):
    """Parse and validate user XML.

    Expects a ``<user>`` root with required ``<name>`` and ``<email>``
    children and an optional ``<age>`` child.

    Returns:
        ``{'name': str, 'email': str, 'age': int | None}``.

    Raises:
        ValueError: if the root tag is wrong, a required field is missing or
            empty, the name exceeds 100 characters, the email lacks ``@``,
            or the age is not an integer in the range 0-150.
    """
    # Parse securely
    root = ET.fromstring(xml_string)

    # Validate root element
    if root.tag != 'user':
        raise ValueError('Root element must be <user>')

    # Extract required fields
    name = root.find('name')
    email = root.find('email')
    age = root.find('age')

    # Validate presence
    if name is None or not name.text:
        raise ValueError('Name is required')
    if email is None or not email.text:
        raise ValueError('Email is required')

    # Validate content
    if len(name.text) > 100:
        raise ValueError('Name too long')
    if '@' not in email.text:
        raise ValueError('Invalid email format')

    age_int = None
    if age is not None and age.text:
        # Only the conversion sits inside the try. The range check must stay
        # outside, otherwise its ValueError is caught and misreported as
        # "Age must be an integer".
        try:
            age_int = int(age.text)
        except ValueError:
            raise ValueError('Age must be an integer') from None
        if age_int < 0 or age_int > 150:
            raise ValueError('Age out of range')

    return {
        'name': name.text,
        'email': email.text,
        'age': age_int
    }
Verification
After implementing the recommended secure patterns, verify the fix through multiple approaches:
- Manual testing: Submit malicious payloads relevant to this vulnerability and confirm they're handled safely without executing unintended operations
- Code review: Confirm every place that parses untrusted XML uses the secure pattern (defusedxml, or a parser explicitly configured to disable DTDs, entity resolution, and network access), with no remaining default-configured parsers on untrusted input
- Static analysis: Use security scanners to verify no new vulnerabilities exist and the original finding is resolved
- Regression testing: Ensure legitimate user inputs and application workflows continue to function correctly
- Edge case validation: Test with special characters, boundary conditions, and unusual inputs to verify proper handling
- Framework verification: If using a framework or library, confirm the recommended APIs are used correctly according to documentation
- Authentication/session testing: Verify security controls remain effective and cannot be bypassed (if applicable to the vulnerability type)
- Rescan: Run the security scanner again to confirm the finding is resolved and no new issues were introduced