Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 38 additions & 5 deletions api/factories/file_factory.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import mimetypes
import os
import re
import urllib.parse
import uuid
from collections.abc import Callable, Mapping, Sequence
Expand Down Expand Up @@ -249,15 +250,47 @@ def _build_from_remote_url(


def _extract_filename(url_path: str, content_disposition: str | None) -> str | None:
filename = None
filename: str | None = None
# Try to extract from Content-Disposition header first
if content_disposition:
_, params = parse_options_header(content_disposition)
# RFC 5987 https://datatracker.ietf.org/doc/html/rfc5987: filename* takes precedence over filename
filename = params.get("filename*") or params.get("filename")
# Manually extract filename* parameter since parse_options_header doesn't support it
filename_star_match = re.search(r"filename\*=([^;]+)", content_disposition)
if filename_star_match:
raw_star = filename_star_match.group(1).strip()
# Remove trailing quotes if present
raw_star = raw_star.removesuffix('"')
# format: charset'lang'value
try:
parts = raw_star.split("'", 2)
charset = (parts[0] or "utf-8").lower() if len(parts) >= 1 else "utf-8"
value = parts[2] if len(parts) == 3 else parts[-1]
filename = urllib.parse.unquote(value, encoding=charset, errors="replace")
except Exception:
# Fallback: try to extract value after the last single quote
if "''" in raw_star:
filename = urllib.parse.unquote(raw_star.split("''")[-1])
else:
filename = urllib.parse.unquote(raw_star)

if not filename:
# Fallback to regular filename parameter
_, params = parse_options_header(content_disposition)
raw = params.get("filename")
if raw:
# Strip surrounding quotes and percent-decode if present
if len(raw) >= 2 and raw[0] == raw[-1] == '"':
raw = raw[1:-1]
filename = urllib.parse.unquote(raw)
# Fallback to URL path if no filename from header
if not filename:
filename = os.path.basename(url_path)
candidate = os.path.basename(url_path)
filename = urllib.parse.unquote(candidate) if candidate else None
# Defense-in-depth: ensure basename only
if filename:
filename = os.path.basename(filename)
# Return None if filename is empty or only whitespace
if not filename or not filename.strip():
filename = None
return filename or None


Expand Down
119 changes: 118 additions & 1 deletion api/tests/unit_tests/factories/test_file_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import pytest

from factories.file_factory import _get_remote_file_info
from factories.file_factory import _extract_filename, _get_remote_file_info


class _FakeResponse:
Expand Down Expand Up @@ -113,3 +113,120 @@ def test_no_filename_in_url_or_header_generates_uuid_bin(self, monkeypatch: pyte
# Should generate a random hex filename with .bin extension
assert re.match(r"^[0-9a-f]{32}\.bin$", filename) is not None
assert mime_type == "application/octet-stream"


class TestExtractFilename:
"""Tests for _extract_filename function focusing on RFC5987 parsing and security."""

def test_no_content_disposition_uses_url_basename(self):
"""Test that URL basename is used when no Content-Disposition header."""
result = _extract_filename("http://example.com/path/file.txt", None)
assert result == "file.txt"

def test_no_content_disposition_with_percent_encoded_url(self):
"""Test that percent-encoded URL basename is decoded."""
result = _extract_filename("http://example.com/path/file%20name.txt", None)
assert result == "file name.txt"

def test_no_content_disposition_empty_url_path(self):
"""Test that empty URL path returns None."""
result = _extract_filename("http://example.com/", None)
assert result is None

def test_simple_filename_header(self):
"""Test basic filename extraction from Content-Disposition."""
result = _extract_filename("http://example.com/", 'attachment; filename="test.txt"')
assert result == "test.txt"

def test_quoted_filename_with_spaces(self):
"""Test filename with spaces in quotes."""
result = _extract_filename("http://example.com/", 'attachment; filename="my file.txt"')
assert result == "my file.txt"

def test_unquoted_filename(self):
"""Test unquoted filename."""
result = _extract_filename("http://example.com/", "attachment; filename=test.txt")
assert result == "test.txt"

def test_percent_encoded_filename(self):
"""Test percent-encoded filename."""
result = _extract_filename("http://example.com/", 'attachment; filename="file%20name.txt"')
assert result == "file name.txt"

def test_rfc5987_filename_star_utf8(self):
"""Test RFC5987 filename* with UTF-8 encoding."""
result = _extract_filename("http://example.com/", "attachment; filename*=UTF-8''file%20name.txt")
assert result == "file name.txt"

def test_rfc5987_filename_star_chinese(self):
"""Test RFC5987 filename* with Chinese characters."""
result = _extract_filename(
"http://example.com/", "attachment; filename*=UTF-8''%E6%B5%8B%E8%AF%95%E6%96%87%E4%BB%B6.txt"
)
assert result == "测试文件.txt"

def test_rfc5987_filename_star_with_language(self):
"""Test RFC5987 filename* with language tag."""
result = _extract_filename("http://example.com/", "attachment; filename*=UTF-8'en'file%20name.txt")
assert result == "file name.txt"

def test_rfc5987_filename_star_fallback_charset(self):
"""Test RFC5987 filename* with fallback charset."""
result = _extract_filename("http://example.com/", "attachment; filename*=''file%20name.txt")
assert result == "file name.txt"

def test_rfc5987_filename_star_malformed_fallback(self):
"""Test RFC5987 filename* with malformed format falls back to simple unquote."""
result = _extract_filename("http://example.com/", "attachment; filename*=malformed%20filename.txt")
assert result == "malformed filename.txt"

def test_filename_star_takes_precedence_over_filename(self):
"""Test that filename* takes precedence over filename."""
test_string = 'attachment; filename="old.txt"; filename*=UTF-8\'\'new.txt"'
result = _extract_filename("http://example.com/", test_string)
assert result == "new.txt"

def test_path_injection_protection(self):
"""Test that path injection attempts are blocked by os.path.basename."""
result = _extract_filename("http://example.com/", 'attachment; filename="../../../etc/passwd"')
assert result == "passwd"

def test_path_injection_protection_rfc5987(self):
"""Test that path injection attempts in RFC5987 are blocked."""
result = _extract_filename("http://example.com/", "attachment; filename*=UTF-8''..%2F..%2F..%2Fetc%2Fpasswd")
assert result == "passwd"

def test_empty_filename_returns_none(self):
"""Test that empty filename returns None."""
result = _extract_filename("http://example.com/", 'attachment; filename=""')
assert result is None

def test_whitespace_only_filename_returns_none(self):
"""Test that whitespace-only filename returns None."""
result = _extract_filename("http://example.com/", 'attachment; filename=" "')
assert result is None

def test_complex_rfc5987_encoding(self):
"""Test complex RFC5987 encoding with special characters."""
result = _extract_filename(
"http://example.com/",
"attachment; filename*=UTF-8''%E4%B8%AD%E6%96%87%E6%96%87%E4%BB%B6%20%28%E5%89%AF%E6%9C%AC%29.pdf",
)
assert result == "中文文件 (副本).pdf"

def test_iso8859_1_encoding(self):
"""Test ISO-8859-1 encoding in RFC5987."""
result = _extract_filename("http://example.com/", "attachment; filename*=ISO-8859-1''file%20name.txt")
assert result == "file name.txt"

def test_encoding_error_fallback(self):
"""Test that encoding errors fall back to safe ASCII filename."""
result = _extract_filename("http://example.com/", "attachment; filename*=INVALID-CHARSET''file%20name.txt")
assert result == "file name.txt"

def test_mixed_quotes_and_encoding(self):
"""Test filename with mixed quotes and percent encoding."""
result = _extract_filename(
"http://example.com/", 'attachment; filename="file%20with%20quotes%20%26%20encoding.txt"'
)
assert result == "file with quotes & encoding.txt"