Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/safe-census-zip-extraction.fixed.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Validate extracted census archive paths before unpacking downloaded state block archives.
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import zipfile
from pathlib import Path

import pytest

from policyengine_us.tools.geography.download_50_state_census_block_data import (
MAX_ARCHIVE_DOWNLOAD_BYTES,
MAX_ARCHIVE_UNCOMPRESSED_BYTES,
_download_with_limits,
_safe_extract,
)


def test_safe_extract_rejects_zip_slip(tmp_path):
    """A member path that climbs out of the target directory must be refused."""
    malicious_archive = tmp_path / "malicious.zip"
    target_dir = tmp_path / "extract"

    with zipfile.ZipFile(malicious_archive, "w") as writer:
        writer.writestr("../evil.txt", "pwned")

    with zipfile.ZipFile(malicious_archive, "r") as reader:
        with pytest.raises(ValueError, match="Unsafe path"):
            _safe_extract(reader, target_dir)

    # Neither the escape location nor the extraction directory may contain
    # the malicious file.
    for escaped_path in (tmp_path / "evil.txt", target_dir / "evil.txt"):
        assert not escaped_path.exists()


def test_safe_extract_rejects_oversized_archives(tmp_path):
    """An archive whose declared uncompressed size exceeds the budget is
    rejected before any extraction takes place."""

    class OversizedMember:
        def __init__(self, filename: str, file_size: int):
            self.filename = filename
            self.file_size = file_size

    class StubZipFile:
        def __init__(self):
            self.extracted = False

        def infolist(self):
            # A single member one byte over the uncompressed-size budget.
            return [
                OversizedMember("data.txt", MAX_ARCHIVE_UNCOMPRESSED_BYTES + 1)
            ]

        def extractall(self, destination):
            self.extracted = True

    stub_zip = StubZipFile()

    with pytest.raises(ValueError, match="uncompressed size limit"):
        _safe_extract(stub_zip, tmp_path / "extract")

    # extractall must never have been reached.
    assert stub_zip.extracted is False


def test_download_with_limits_rejects_oversized_responses(tmp_path, monkeypatch):
    """A response that streams more than the byte budget aborts the download
    and removes the partially written file."""

    class StubResponse:
        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc, tb):
            return False

        def raise_for_status(self):
            return None

        def iter_content(self, chunk_size):
            # One chunk exactly at the limit, then a single byte over it.
            yield b"x" * MAX_ARCHIVE_DOWNLOAD_BYTES
            yield b"x"

    monkeypatch.setattr(
        "requests.get",
        lambda *args, **kwargs: StubResponse(),
    )

    target = Path(tmp_path) / "archive.zip"

    with pytest.raises(ValueError, match="exceeds"):
        _download_with_limits("https://example.com/archive.zip", target)

    # The partial download must have been cleaned up.
    assert not target.exists()
137 changes: 105 additions & 32 deletions policyengine_us/tools/geography/download_50_state_census_block_data.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import os
import shutil
import zipfile
from pathlib import Path

import pandas as pd
import requests
from tqdm import tqdm
from pathlib import Path
import os
import zipfile
import shutil

STATE_NAMES = [
"Alabama",
Expand Down Expand Up @@ -112,31 +113,103 @@
]
STATE_CODES = [x.lower() for x in STATE_CODES]
DATA_FOLDER = Path("data")
DATA_FOLDER.mkdir(exist_ok=True)

# NOTE(review): legacy script body removed by this change. It fetches each
# state archive with no timeout or size limit and extracts it without
# validating member paths — the hazards the replacement functions address.
dfs = []
for state_name, state_code in tqdm(
    zip(STATE_NAMES, STATE_CODES), desc="Downloading Census data"
):
    data_url = f"https://www2.census.gov/programs-surveys/decennial/2020/data/01-Redistricting_File--PL_94-171/{state_name}/{state_code}2020.pl.zip"
    # Download the file and save to a folder called "block_level_population_data_by_state/"
    # No timeout and the full body is buffered in memory via r.content.
    r = requests.get(data_url)
    with open(DATA_FOLDER / f"{state_code}2020.pl.zip", "wb") as f:
        f.write(r.content)
    # Unzip the file
    # extractall with no member-path checks (zip-slip risk).
    with zipfile.ZipFile(DATA_FOLDER / f"{state_code}2020.pl.zip", "r") as zip_ref:
        zip_ref.extractall(DATA_FOLDER / f"{state_code}2020.pl")
    # Delete the zip file
    os.remove(DATA_FOLDER / f"{state_code}2020.pl.zip")
    # Read the file
    df = pd.read_csv(
        DATA_FOLDER / f"{state_code}2020.pl/{state_code}geo2020.pl",
        sep="|",
        low_memory=False,
        encoding="ISO-8859-1",
    )
    df["state"] = state_code
    dfs += [df]
full_df = pd.concat(dfs)
full_df.to_csv(DATA_FOLDER / "50_state_block_data.csv", index=False)
# NOTE(review): runs after the loop, so only the final state's extraction
# directory is removed; the other states' directories are left on disk.
shutil.rmtree(DATA_FOLDER / f"{state_code}2020.pl")
# Guard rails applied when downloading and unpacking Census block archives.
DOWNLOAD_TIMEOUT_SECONDS = 60  # per-request timeout passed to requests.get
DOWNLOAD_CHUNK_SIZE = 1024 * 1024  # stream downloads in 1 MiB chunks
MAX_ARCHIVE_DOWNLOAD_BYTES = 100 * 1024 * 1024  # abort downloads above 100 MiB
MAX_ARCHIVE_MEMBER_COUNT = 64  # reject archives containing more files than this
MAX_ARCHIVE_UNCOMPRESSED_BYTES = 512 * 1024 * 1024  # total uncompressed budget (512 MiB)


def _download_with_limits(url: str, destination: Path) -> None:
    """Stream *url* into *destination*, enforcing the download byte budget.

    Raises ValueError once more than MAX_ARCHIVE_DOWNLOAD_BYTES have been
    received. On any failure the partially written file is removed before
    the exception propagates.
    """
    destination = Path(destination)
    received = 0

    try:
        with requests.get(
            url, stream=True, timeout=DOWNLOAD_TIMEOUT_SECONDS
        ) as response:
            response.raise_for_status()
            with destination.open("wb") as out:
                for chunk in response.iter_content(chunk_size=DOWNLOAD_CHUNK_SIZE):
                    # Skip keep-alive chunks, which arrive empty.
                    if chunk:
                        received += len(chunk)
                        if received > MAX_ARCHIVE_DOWNLOAD_BYTES:
                            raise ValueError(
                                f"Downloaded archive exceeds {MAX_ARCHIVE_DOWNLOAD_BYTES} bytes"
                            )
                        out.write(chunk)
    except Exception:
        # Never leave a truncated archive on disk.
        destination.unlink(missing_ok=True)
        raise


def _safe_extract(zip_ref: zipfile.ZipFile, destination: Path) -> None:
    """Extract an archive only if every member stays within the destination.

    Raises ValueError when the archive has too many members, exceeds the
    uncompressed-size budget, or contains a member whose resolved path
    escapes *destination* (zip-slip). All validation runs before any
    filesystem write, so a rejected archive leaves no artifacts behind
    (the original created the destination directory before validating).
    """

    destination = Path(destination).resolve()
    members = zip_ref.infolist()
    if len(members) > MAX_ARCHIVE_MEMBER_COUNT:
        raise ValueError(
            f"Archive contains {len(members)} files, exceeding the limit of "
            f"{MAX_ARCHIVE_MEMBER_COUNT}"
        )

    total_uncompressed_bytes = 0
    for member in members:
        total_uncompressed_bytes += member.file_size
        if total_uncompressed_bytes > MAX_ARCHIVE_UNCOMPRESSED_BYTES:
            raise ValueError(
                "Archive exceeds the allowed uncompressed size limit of "
                f"{MAX_ARCHIVE_UNCOMPRESSED_BYTES} bytes"
            )
        # Zip-slip guard: the member must resolve to the destination itself
        # or to a path strictly beneath it.
        target_path = (destination / member.filename).resolve()
        if destination != target_path and destination not in target_path.parents:
            raise ValueError(f"Unsafe path in zip archive: {member.filename}")

    # Every member validated — only now touch the filesystem.
    destination.mkdir(parents=True, exist_ok=True)
    zip_ref.extractall(destination)


def download_state_block_data(data_folder: Path = DATA_FOLDER) -> pd.DataFrame:
    """Download, validate, and combine 2020 Census PL 94-171 block data.

    For each state, the archive is downloaded with size limits, safely
    extracted, and its geo header file read; per-state frames are then
    concatenated and written to ``50_state_block_data.csv`` inside
    *data_folder*.

    Args:
        data_folder: Directory for downloads and the combined CSV output.

    Returns:
        The combined DataFrame across all states.
    """
    data_folder = Path(data_folder)
    data_folder.mkdir(parents=True, exist_ok=True)

    dfs = []
    for state_name, state_code in tqdm(
        zip(STATE_NAMES, STATE_CODES), desc="Downloading Census data"
    ):
        data_url = (
            "https://www2.census.gov/programs-surveys/decennial/2020/data/"
            f"01-Redistricting_File--PL_94-171/{state_name}/{state_code}2020.pl.zip"
        )
        zip_path = data_folder / f"{state_code}2020.pl.zip"
        extract_dir = data_folder / f"{state_code}2020.pl"

        _download_with_limits(data_url, zip_path)

        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            _safe_extract(zip_ref, extract_dir)

        # The archive is no longer needed once extracted.
        os.remove(zip_path)

        df = pd.read_csv(
            extract_dir / f"{state_code}geo2020.pl",
            sep="|",
            low_memory=False,
            encoding="ISO-8859-1",
        )
        df["state"] = state_code
        dfs.append(df)

        # Fix: clean up each state's extracted files inside the loop.
        # Previously rmtree ran once after the loop, removing only the
        # final state's directory and leaving the other 49 on disk.
        shutil.rmtree(extract_dir)

    full_df = pd.concat(dfs)
    full_df.to_csv(data_folder / "50_state_block_data.csv", index=False)

    return full_df


def main() -> None:
    """Script entry point: run the full 50-state download-and-combine pipeline."""
    download_state_block_data(DATA_FOLDER)


if __name__ == "__main__":
    main()
Loading