diff --git a/CHANGES/7157.bugfix.rst b/CHANGES/7157.bugfix.rst new file mode 100644 index 00000000000..60f06d8da9f --- /dev/null +++ b/CHANGES/7157.bugfix.rst @@ -0,0 +1,6 @@ +Fixed ``ZLibDecompressor`` silently dropping data past the first +member when decompressing concatenated gzip/deflate streams. Each subsequent +member is now handed to a fresh decompressor, matching the behaviour already +implemented for ZSTD multi-frame streams. + +-- by :user:`Ashutosh-177` diff --git a/CONTRIBUTORS.txt b/CONTRIBUTORS.txt index 3cee31d4d65..d13a189129a 100644 --- a/CONTRIBUTORS.txt +++ b/CONTRIBUTORS.txt @@ -57,6 +57,7 @@ Arie Bovenberg Arseny Timoniq Artem Yushkovskiy Arthur Darcet +Ashutosh Kumar Singh Austin Scola Bai Haoran Ben Bader diff --git a/aiohttp/compression_utils.py b/aiohttp/compression_utils.py index 9373344e65c..400df141470 100644 --- a/aiohttp/compression_utils.py +++ b/aiohttp/compression_utils.py @@ -55,6 +55,9 @@ def eof(self) -> bool: ... @property def unconsumed_tail(self) -> bytes: ... + @property + def unused_data(self) -> bytes: ... + class ZLibBackendProtocol(Protocol): MAX_WBITS: int @@ -275,15 +278,42 @@ def __init__( self._zlib_backend: Final = ZLibBackendWrapper(ZLibBackend._zlib_backend) self._decompressor = self._zlib_backend.decompressobj(wbits=self._mode) self._last_empty = False + self._pending_unused_data: bytes | None = None def decompress_sync( self, data: Buffer, max_length: int = ZLIB_MAX_LENGTH_UNLIMITED ) -> bytes: + if self._pending_unused_data is not None: + data = self._pending_unused_data + bytes(data) + self._pending_unused_data = None result = self._decompressor.decompress( self._decompressor.unconsumed_tail + data, max_length ) # Only way to know that isal has no further data is checking we get no output self._last_empty = result == b"" + + # Handle concatenated gzip/deflate streams (multi-member). + # After a member ends, unused_data holds the start of the next member. + # Create a fresh decompressor for each subsequent member. + while self._decompressor.eof and self._decompressor.unused_data: + unused = self._decompressor.unused_data + self._decompressor = self._zlib_backend.decompressobj(wbits=self._mode) + if max_length != ZLIB_MAX_LENGTH_UNLIMITED: + max_length -= len(result) + if max_length <= 0: + self._pending_unused_data = unused + break + chunk = self._decompressor.decompress(unused, max_length) + self._last_empty = chunk == b"" + result += chunk + + # Member ended exactly at chunk boundary — no unused_data, but the + # next feed_data() call would fail on the spent decompressor. + # Only reset for gzip; deflate's feed_eof() relies on eof=True to + # confirm the stream is complete. + if self._decompressor.eof and self._mode > self._zlib_backend.MAX_WBITS: + self._decompressor = self._zlib_backend.decompressobj(wbits=self._mode) + return result def flush(self, length: int = 0) -> bytes: @@ -295,7 +325,11 @@ def flush(self, length: int = 0) -> bytes: @property def data_available(self) -> bool: - return bool(self._decompressor.unconsumed_tail) or not self._last_empty + return ( + bool(self._decompressor.unconsumed_tail) + or not self._last_empty + or self._pending_unused_data is not None + ) @property def eof(self) -> bool: diff --git a/docs/spelling_wordlist.txt b/docs/spelling_wordlist.txt index 185d9ebfdf5..75d3d0c8323 100644 --- a/docs/spelling_wordlist.txt +++ b/docs/spelling_wordlist.txt @@ -99,6 +99,7 @@ Cython Cythonize cythonized de +decompressor deduplicate defs Dependabot @@ -145,6 +146,7 @@ github google gunicorn gunicorn’s +gzip gzipped hackish highlevel diff --git a/requirements/base-ft.txt b/requirements/base-ft.txt index 95b2c278df2..1eeed867279 100644 --- a/requirements/base-ft.txt +++ b/requirements/base-ft.txt @@ -24,7 +24,7 @@ frozenlist==1.8.0 # aiosignal gunicorn==26.0.0 # via -r requirements/base-ft.in -idna==3.16 +idna==3.17 # via yarl multidict==6.7.1 # via diff --git a/requirements/base.txt b/requirements/base.txt index 4f8536ea416..159c65a0ce2 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -24,7 +24,7 @@ frozenlist==1.8.0 # aiosignal gunicorn==26.0.0 # via -r requirements/base.in -idna==3.16 +idna==3.17 # via yarl multidict==6.7.1 # via diff --git a/requirements/constraints.txt b/requirements/constraints.txt index db292a829a2..366ec29f17a 100644 --- a/requirements/constraints.txt +++ b/requirements/constraints.txt @@ -100,7 +100,7 @@ gunicorn==26.0.0 # via -r requirements/base.in identify==2.6.19 # via pre-commit -idna==3.16 +idna==3.17 # via # requests # trustme diff --git a/requirements/dev.txt b/requirements/dev.txt index d1536836b77..115647a9095 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -98,7 +98,7 @@ gunicorn==26.0.0 # via -r requirements/base.in identify==2.6.19 # via pre-commit -idna==3.16 +idna==3.17 # via # requests # trustme diff --git a/requirements/doc-spelling.txt b/requirements/doc-spelling.txt index 7ce9ae9e976..2878edd94f6 100644 --- a/requirements/doc-spelling.txt +++ b/requirements/doc-spelling.txt @@ -20,7 +20,7 @@ docutils==0.21.2 # via # myst-parser # sphinx -idna==3.16 +idna==3.17 # via requests imagesize==2.0.0 # via sphinx diff --git a/requirements/doc.txt b/requirements/doc.txt index ec2f8e6326d..55d30f206cb 100644 --- a/requirements/doc.txt +++ b/requirements/doc.txt @@ -20,7 +20,7 @@ docutils==0.21.2 # via # myst-parser # sphinx -idna==3.16 +idna==3.17 # via requests imagesize==2.0.0 # via sphinx diff --git a/requirements/lint.txt b/requirements/lint.txt index 331a8b8855d..fa4d1fba9b3 100644 --- a/requirements/lint.txt +++ b/requirements/lint.txt @@ -54,7 +54,7 @@ frozenlist==1.8.0 # aiosignal identify==2.6.19 # via pre-commit -idna==3.16 +idna==3.17 # via # trustme # yarl diff --git a/requirements/runtime-deps.txt b/requirements/runtime-deps.txt index b553d1fe24b..f7080a0e341 100644 --- a/requirements/runtime-deps.txt +++ b/requirements/runtime-deps.txt @@ -22,7 +22,7 @@ frozenlist==1.8.0 # via # -r requirements/runtime-deps.in # aiosignal -idna==3.16 +idna==3.17 # via yarl multidict==6.7.1 # via diff --git a/requirements/test-common-base.txt b/requirements/test-common-base.txt index b581f3f9153..ac93663fef7 100644 --- a/requirements/test-common-base.txt +++ b/requirements/test-common-base.txt @@ -26,7 +26,7 @@ frozenlist==1.8.0 # via # aiohttp # aiosignal -idna==3.15 +idna==3.17 # via yarl iniconfig==2.3.0 # via pytest diff --git a/requirements/test-common.txt b/requirements/test-common.txt index ed2b1fcf010..cd82228ae08 100644 --- a/requirements/test-common.txt +++ b/requirements/test-common.txt @@ -42,7 +42,7 @@ frozenlist==1.8.0 # via # aiohttp # aiosignal -idna==3.16 +idna==3.17 # via # trustme # yarl diff --git a/requirements/test-ft.txt b/requirements/test-ft.txt index 90838cc1d6d..c89f4423c89 100644 --- a/requirements/test-ft.txt +++ b/requirements/test-ft.txt @@ -59,7 +59,7 @@ frozenlist==1.8.0 # aiosignal gunicorn==26.0.0 # via -r requirements/base-ft.in -idna==3.16 +idna==3.17 # via # trustme # yarl diff --git a/requirements/test-mobile.txt b/requirements/test-mobile.txt index c3504d693da..edce1810ac0 100644 --- a/requirements/test-mobile.txt +++ b/requirements/test-mobile.txt @@ -47,7 +47,7 @@ frozenlist==1.8.0 # aiosignal gunicorn==26.0.0 # via -r requirements/base-ft.in -idna==3.15 +idna==3.17 # via yarl iniconfig==2.3.0 # via pytest diff --git a/requirements/test.txt b/requirements/test.txt index 039262e7526..27414c43592 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -59,7 +59,7 @@ frozenlist==1.8.0 # aiosignal gunicorn==26.0.0 # via -r requirements/base.in -idna==3.16 +idna==3.17 # via # trustme # yarl diff --git a/tests/test_compression_utils.py b/tests/test_compression_utils.py index 3362b8feed0..5deebc8470d 100644 --- a/tests/test_compression_utils.py +++ b/tests/test_compression_utils.py @@ -1,5 +1,6 @@ """Tests for compression utils.""" +import gzip import sys import pytest @@ -87,3 +88,38 @@ def test_zstd_multi_frame_max_length_exhausted_preserves_unused_data() -> None: assert result1 == b"AAAA" result2 = d.decompress_sync(frame3) assert result2 == b"BBBBCCCC" + + +def test_zlib_gzip_multi_member_unlimited() -> None: + d = ZLibDecompressor(encoding="gzip") + member1 = gzip.compress(b"AAAA") + member2 = gzip.compress(b"BBBB") + result = d.decompress_sync(member1 + member2) + assert result == b"AAAABBBB" + + +def test_zlib_gzip_multi_member_max_length_partial() -> None: + d = ZLibDecompressor(encoding="gzip") + member1 = gzip.compress(b"AAAA") + member2 = gzip.compress(b"BBBB") + result = d.decompress_sync(member1 + member2, max_length=6) + assert result == b"AAAABB" + + +def test_zlib_gzip_multi_member_max_length_exhausted() -> None: + d = ZLibDecompressor(encoding="gzip") + member1 = gzip.compress(b"AAAA") + member2 = gzip.compress(b"BBBB") + result = d.decompress_sync(member1 + member2, max_length=4) + assert result == b"AAAA" + + +def test_zlib_gzip_multi_member_max_length_exhausted_preserves_unused_data() -> None: + d = ZLibDecompressor(encoding="gzip") + member1 = gzip.compress(b"AAAA") + member2 = gzip.compress(b"BBBB") + member3 = gzip.compress(b"CCCC") + result1 = d.decompress_sync(member1 + member2, max_length=4) + assert result1 == b"AAAA" + result2 = d.decompress_sync(member3) + assert result2 == b"BBBBCCCC" diff --git a/tests/test_http_parser.py b/tests/test_http_parser.py index 118103a5b7f..f0e19325005 100644 --- a/tests/test_http_parser.py +++ b/tests/test_http_parser.py @@ -1,6 +1,7 @@ # Tests for aiohttp/protocol.py import asyncio +import gzip import platform import re import sys @@ -2647,6 +2648,81 @@ async def test_http_payload_zstandard_many_small_frames( assert b"".join(parts) == b"".join(out._buffer) assert out.is_eof() + async def test_http_payload_gzip_multi_member(self, protocol: BaseProtocol) -> None: + member1 = gzip.compress(b"first") + member2 = gzip.compress(b"second") + payload = member1 + member2 + out = aiohttp.StreamReader( + protocol, DEFAULT_CHUNK_SIZE, loop=asyncio.get_running_loop() + ) + p = HttpPayloadParser( + out, + length=len(payload), + compression="gzip", + headers_parser=HeadersParser(), + ) + p.feed_data(payload) + assert b"firstsecond" == b"".join(out._buffer) + assert out.is_eof() + + async def test_http_payload_gzip_multi_member_chunked( + self, protocol: BaseProtocol + ) -> None: + member1 = gzip.compress(b"chunk1") + member2 = gzip.compress(b"chunk2") + out = aiohttp.StreamReader( + protocol, DEFAULT_CHUNK_SIZE, loop=asyncio.get_running_loop() + ) + p = HttpPayloadParser( + out, + length=len(member1) + len(member2), + compression="gzip", + headers_parser=HeadersParser(), + ) + p.feed_data(member1) + p.feed_data(member2) + assert b"chunk1chunk2" == b"".join(out._buffer) + assert out.is_eof() + + async def test_http_payload_gzip_member_split_mid_chunk( + self, protocol: BaseProtocol + ) -> None: + member1 = gzip.compress(b"AAAA") + member2 = gzip.compress(b"BBBB") + combined = member1 + member2 + split_point = len(member1) + 3 # 3 bytes into member2 + out = aiohttp.StreamReader( + protocol, DEFAULT_CHUNK_SIZE, loop=asyncio.get_running_loop() + ) + p = HttpPayloadParser( + out, + length=len(combined), + compression="gzip", + headers_parser=HeadersParser(), + ) + p.feed_data(combined[:split_point]) + p.feed_data(combined[split_point:]) + assert b"AAAABBBB" == b"".join(out._buffer) + assert out.is_eof() + + async def test_http_payload_gzip_many_small_members( + self, protocol: BaseProtocol + ) -> None: + parts = [f"part{i}".encode() for i in range(10)] + payload = b"".join(gzip.compress(p) for p in parts) + out = aiohttp.StreamReader( + protocol, DEFAULT_CHUNK_SIZE, loop=asyncio.get_running_loop() + ) + p = HttpPayloadParser( + out, + length=len(payload), + compression="gzip", + headers_parser=HeadersParser(), + ) + p.feed_data(payload) + assert b"".join(parts) == b"".join(out._buffer) + assert out.is_eof() + class TestDeflateBuffer: async def test_feed_data(self, protocol: BaseProtocol) -> None: