diff --git a/changelog.d/safe-census-zip-extraction.fixed.md b/changelog.d/safe-census-zip-extraction.fixed.md
new file mode 100644
index 00000000000..065b66543e5
--- /dev/null
+++ b/changelog.d/safe-census-zip-extraction.fixed.md
@@ -0,0 +1 @@
+Validate extracted census archive paths before unpacking downloaded state block archives.
diff --git a/policyengine_us/tests/utilities/test_download_50_state_census_block_data.py b/policyengine_us/tests/utilities/test_download_50_state_census_block_data.py
new file mode 100644
index 00000000000..4b0f0ac7bfb
--- /dev/null
+++ b/policyengine_us/tests/utilities/test_download_50_state_census_block_data.py
@@ -0,0 +1,79 @@
+import zipfile
+from pathlib import Path
+
+import pytest
+
+from policyengine_us.tools.geography.download_50_state_census_block_data import (
+    MAX_ARCHIVE_DOWNLOAD_BYTES,
+    MAX_ARCHIVE_UNCOMPRESSED_BYTES,
+    _download_with_limits,
+    _safe_extract,
+)
+
+
+def test_safe_extract_rejects_zip_slip(tmp_path):
+    archive_path = tmp_path / "malicious.zip"
+    extract_dir = tmp_path / "extract"
+
+    with zipfile.ZipFile(archive_path, "w") as zip_ref:
+        zip_ref.writestr("../evil.txt", "pwned")
+
+    with zipfile.ZipFile(archive_path, "r") as zip_ref:
+        with pytest.raises(ValueError, match="Unsafe path"):
+            _safe_extract(zip_ref, extract_dir)
+
+    assert not (tmp_path / "evil.txt").exists()
+    assert not (extract_dir / "evil.txt").exists()
+
+
+def test_safe_extract_rejects_oversized_archives(tmp_path):
+    class FakeMember:
+        def __init__(self, filename: str, file_size: int):
+            self.filename = filename
+            self.file_size = file_size
+
+    class FakeZipFile:
+        def __init__(self):
+            self.extracted = False
+
+        def infolist(self):
+            return [FakeMember("data.txt", MAX_ARCHIVE_UNCOMPRESSED_BYTES + 1)]
+
+        def extractall(self, destination):
+            self.extracted = True
+
+    extract_dir = tmp_path / "extract"
+    fake_zip = FakeZipFile()
+
+    with pytest.raises(ValueError, match="uncompressed size limit"):
+        _safe_extract(fake_zip, extract_dir)
+
+    assert fake_zip.extracted is False
+
+
+def test_download_with_limits_rejects_oversized_responses(tmp_path, monkeypatch):
+    class FakeResponse:
+        def __enter__(self):
+            return self
+
+        def __exit__(self, exc_type, exc, tb):
+            return False
+
+        def raise_for_status(self):
+            return None
+
+        def iter_content(self, chunk_size):
+            yield b"x" * MAX_ARCHIVE_DOWNLOAD_BYTES
+            yield b"x"
+
+    monkeypatch.setattr(
+        "requests.get",
+        lambda *args, **kwargs: FakeResponse(),
+    )
+
+    destination = Path(tmp_path) / "archive.zip"
+
+    with pytest.raises(ValueError, match="exceeds"):
+        _download_with_limits("https://example.com/archive.zip", destination)
+
+    assert not destination.exists()
diff --git a/policyengine_us/tools/geography/download_50_state_census_block_data.py b/policyengine_us/tools/geography/download_50_state_census_block_data.py
index 35244cc2c33..4fd50559500 100644
--- a/policyengine_us/tools/geography/download_50_state_census_block_data.py
+++ b/policyengine_us/tools/geography/download_50_state_census_block_data.py
@@ -1,10 +1,11 @@
+import os
+import shutil
+import zipfile
+from pathlib import Path
+
 import pandas as pd
 import requests
 from tqdm import tqdm
-from pathlib import Path
-import os
-import zipfile
-import shutil
 
 STATE_NAMES = [
     "Alabama",
@@ -112,31 +113,103 @@
 ]
 STATE_CODES = [x.lower() for x in STATE_CODES]
 DATA_FOLDER = Path("data")
-DATA_FOLDER.mkdir(exist_ok=True)
-
-dfs = []
-for state_name, state_code in tqdm(
-    zip(STATE_NAMES, STATE_CODES), desc="Downloading Census data"
-):
-    data_url = f"https://www2.census.gov/programs-surveys/decennial/2020/data/01-Redistricting_File--PL_94-171/{state_name}/{state_code}2020.pl.zip"
-    # Download the file and save to a folder called "block_level_population_data_by_state/"
-    r = requests.get(data_url)
-    with open(DATA_FOLDER / f"{state_code}2020.pl.zip", "wb") as f:
-        f.write(r.content)
-    # Unzip the file
-    with zipfile.ZipFile(DATA_FOLDER / f"{state_code}2020.pl.zip", "r") as zip_ref:
-        zip_ref.extractall(DATA_FOLDER / f"{state_code}2020.pl")
-    # Delete the zip file
-    os.remove(DATA_FOLDER / f"{state_code}2020.pl.zip")
-    # Read the file
-    df = pd.read_csv(
-        DATA_FOLDER / f"{state_code}2020.pl/{state_code}geo2020.pl",
-        sep="|",
-        low_memory=False,
-        encoding="ISO-8859-1",
-    )
-    df["state"] = state_code
-    dfs += [df]
-full_df = pd.concat(dfs)
-full_df.to_csv(DATA_FOLDER / "50_state_block_data.csv", index=False)
-shutil.rmtree(DATA_FOLDER / f"{state_code}2020.pl")
+DOWNLOAD_TIMEOUT_SECONDS = 60
+DOWNLOAD_CHUNK_SIZE = 1024 * 1024
+MAX_ARCHIVE_DOWNLOAD_BYTES = 100 * 1024 * 1024
+MAX_ARCHIVE_MEMBER_COUNT = 64
+MAX_ARCHIVE_UNCOMPRESSED_BYTES = 512 * 1024 * 1024
+
+
+def _download_with_limits(url: str, destination: Path) -> None:
+    downloaded_bytes = 0
+
+    try:
+        with requests.get(
+            url, stream=True, timeout=DOWNLOAD_TIMEOUT_SECONDS
+        ) as response:
+            response.raise_for_status()
+            with Path(destination).open("wb") as file:
+                for chunk in response.iter_content(chunk_size=DOWNLOAD_CHUNK_SIZE):
+                    if not chunk:
+                        continue
+                    downloaded_bytes += len(chunk)
+                    if downloaded_bytes > MAX_ARCHIVE_DOWNLOAD_BYTES:
+                        raise ValueError(
+                            f"Downloaded archive exceeds {MAX_ARCHIVE_DOWNLOAD_BYTES} bytes"
+                        )
+                    file.write(chunk)
+    except Exception:
+        Path(destination).unlink(missing_ok=True)
+        raise
+
+
+def _safe_extract(zip_ref: zipfile.ZipFile, destination: Path) -> None:
+    """Extract an archive only if every member stays within the destination."""
+
+    destination = Path(destination).resolve()
+    destination.mkdir(parents=True, exist_ok=True)
+    members = zip_ref.infolist()
+    if len(members) > MAX_ARCHIVE_MEMBER_COUNT:
+        raise ValueError(
+            f"Archive contains {len(members)} files, exceeding the limit of "
+            f"{MAX_ARCHIVE_MEMBER_COUNT}"
+        )
+
+    total_uncompressed_bytes = 0
+    for member in members:
+        total_uncompressed_bytes += member.file_size
+        if total_uncompressed_bytes > MAX_ARCHIVE_UNCOMPRESSED_BYTES:
+            raise ValueError(
+                "Archive exceeds the allowed uncompressed size limit of "
+                f"{MAX_ARCHIVE_UNCOMPRESSED_BYTES} bytes"
+            )
+        target_path = (destination / member.filename).resolve()
+        if destination != target_path and destination not in target_path.parents:
+            raise ValueError(f"Unsafe path in zip archive: {member.filename}")
+    zip_ref.extractall(destination)
+
+
+def download_state_block_data(data_folder: Path = DATA_FOLDER) -> pd.DataFrame:
+    data_folder = Path(data_folder)
+    data_folder.mkdir(parents=True, exist_ok=True)
+
+    dfs = []
+    for state_name, state_code in tqdm(
+        zip(STATE_NAMES, STATE_CODES), desc="Downloading Census data"
+    ):
+        data_url = (
+            "https://www2.census.gov/programs-surveys/decennial/2020/data/"
+            f"01-Redistricting_File--PL_94-171/{state_name}/{state_code}2020.pl.zip"
+        )
+        zip_path = data_folder / f"{state_code}2020.pl.zip"
+        extract_dir = data_folder / f"{state_code}2020.pl"
+
+        _download_with_limits(data_url, zip_path)
+
+        with zipfile.ZipFile(zip_path, "r") as zip_ref:
+            _safe_extract(zip_ref, extract_dir)
+
+        os.remove(zip_path)
+
+        df = pd.read_csv(
+            extract_dir / f"{state_code}geo2020.pl",
+            sep="|",
+            low_memory=False,
+            encoding="ISO-8859-1",
+        )
+        df["state"] = state_code
+        dfs.append(df)
+        shutil.rmtree(extract_dir)
+
+    full_df = pd.concat(dfs)
+    full_df.to_csv(data_folder / "50_state_block_data.csv", index=False)
+
+    return full_df
+
+
+def main() -> None:
+    download_state_block_data(DATA_FOLDER)
+
+
+if __name__ == "__main__":
+    main()