From 732c0bf646e9cf6d64f11ad945302c3e036a7a73 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E4=BB=9F=E5=BC=8B?= <yejunhao.yjh@alibaba-inc.com>
Date: Wed, 13 May 2026 14:19:14 +0800
Subject: [PATCH 01/34] [python] Support blob view fields

# Conflicts:
#	paimon-python/pypaimon/common/options/core_options.py
#	paimon-python/pypaimon/read/reader/blob_descriptor_convert_reader.py
#	paimon-python/pypaimon/read/reader/data_file_batch_reader.py
#	paimon-python/pypaimon/read/split_read.py
#	paimon-python/pypaimon/schema/schema.py
#	paimon-python/pypaimon/table/row/blob.py
#	paimon-python/pypaimon/tests/blob_test.py
#	paimon-python/pypaimon/write/writer/data_blob_writer.py
#	paimon-python/pypaimon/write/writer/dedicated_format_writer.py
---
 .../pypaimon/common/options/core_options.py   |  14 ++
 .../reader/blob_descriptor_convert_reader.py  |  40 +++-
 .../read/reader/data_file_batch_reader.py     |  62 +++++-
 paimon-python/pypaimon/read/split_read.py     |  12 +-
 paimon-python/pypaimon/schema/schema.py       |  20 ++
 paimon-python/pypaimon/table/row/blob.py      | 176 ++++++++++++++++++
 .../pypaimon/tests/blob_table_test.py         | 142 ++++++++++++++
 paimon-python/pypaimon/tests/blob_test.py     |  22 ++-
 .../pypaimon/utils/blob_view_lookup.py        | 163 ++++++++++++++++
 .../write/writer/dedicated_format_writer.py   |  64 ++++++-
 10 files changed, 694 insertions(+), 21 deletions(-)
 create mode 100644 paimon-python/pypaimon/utils/blob_view_lookup.py

diff --git a/paimon-python/pypaimon/common/options/core_options.py b/paimon-python/pypaimon/common/options/core_options.py
index 9ca8796f96bb..f465ca7c98f8 100644
--- a/paimon-python/pypaimon/common/options/core_options.py
+++ b/paimon-python/pypaimon/common/options/core_options.py
@@ -729,6 +729,20 @@ def variant_shredding_schema(self) -> Optional[str]:
 
     def blob_descriptor_fields(self, default=None):
         value = self.options.get(CoreOptions.BLOB_DESCRIPTOR_FIELD, default)
+        return CoreOptions._parse_field_set(value)
+
+    def blob_view_fields(self, default=None):
+        """Return the BLOB_VIEW_FIELD option value parsed by ``_parse_field_set``."""
+        return CoreOptions._parse_field_set(self.options.get(CoreOptions.BLOB_VIEW_FIELD, default))
+
+    def blob_inline_fields(self, default=None):
+        """Return the union of the blob descriptor field and blob view field names."""
+        descriptor_names = self.blob_descriptor_fields(default)
+        view_names = self.blob_view_fields(default)
+        return set(descriptor_names) | set(view_names)
+
+    @staticmethod
+    def _parse_field_set(value):
         if value is None:
             return set()
         if isinstance(value, str):
diff --git a/paimon-python/pypaimon/read/reader/blob_descriptor_convert_reader.py b/paimon-python/pypaimon/read/reader/blob_descriptor_convert_reader.py
index 35fe046a03ce..1e30ec0d5b6b 100644
--- a/paimon-python/pypaimon/read/reader/blob_descriptor_convert_reader.py
+++ b/paimon-python/pypaimon/read/reader/blob_descriptor_convert_reader.py
@@ -30,6 +30,8 @@ def __init__(self, inner: RecordBatchReader, table):
         self._descriptor_fields = CoreOptions.blob_descriptor_fields(table.options)
         self.file_io = inner.file_io
         self.blob_field_indices = inner.blob_field_indices
+        self._view_fields = CoreOptions.blob_view_fields(table.options)
+        self._blob_view_lookup = None
 
     def read_arrow_batch(self) -> Optional[RecordBatch]:
         import pyarrow
@@ -39,7 +41,8 @@ def read_arrow_batch(self) -> Optional[RecordBatch]:
         return self._convert_batch(batch, pyarrow)
 
     def _convert_batch(self, batch, pyarrow):
-        from pypaimon.table.row.blob import Blob, BlobDescriptor
+        from pypaimon.table.row.blob import Blob, BlobDescriptor, BlobViewStruct
+        from pypaimon.utils.blob_view_lookup import BlobViewLookup
 
         result = batch
         for field_name in self._descriptor_fields:
@@ -70,6 +73,41 @@ def _convert_batch(self, batch, pyarrow):
                 except Exception:
                     converted_values.append(value)
 
+            column_idx = result.schema.names.index(field_name)
+            result = result.set_column(
+                column_idx,
+                pyarrow.field(field_name, pyarrow.large_binary(), nullable=True),
+                pyarrow.array(converted_values, type=pyarrow.large_binary()),
+            )
+        for field_name in self._view_fields:
+            if field_name not in result.schema.names:
+                continue
+            values = result.column(field_name).to_pylist()
+            converted_values = []
+            for value in values:
+                if value is None:
+                    converted_values.append(None)
+                    continue
+                if hasattr(value, 'as_py'):
+                    value = value.as_py()
+                if isinstance(value, str):
+                    value = value.encode('utf-8')
+                if isinstance(value, bytearray):
+                    value = bytes(value)
+                if not isinstance(value, bytes):
+                    converted_values.append(value)
+                    continue
+                try:
+                    if not BlobViewStruct.is_blob_view_struct(value):
+                        converted_values.append(value)
+                        continue
+                    if self._blob_view_lookup is None:
+                        self._blob_view_lookup = BlobViewLookup(self._table)
+                    view_struct = BlobViewStruct.deserialize(value)
+                    converted_values.append(self._blob_view_lookup.resolve_data(view_struct))
+                except Exception:
+                    converted_values.append(value)
+
             column_idx = result.schema.names.index(field_name)
             result = result.set_column(
                 column_idx,
diff --git a/paimon-python/pypaimon/read/reader/data_file_batch_reader.py b/paimon-python/pypaimon/read/reader/data_file_batch_reader.py
index 12e6990e137e..e8606b0d7cfc 100644
--- a/paimon-python/pypaimon/read/reader/data_file_batch_reader.py
+++ b/paimon-python/pypaimon/read/reader/data_file_batch_reader.py
@@ -26,6 +26,6 @@
 from pypaimon.read.reader.iface.record_batch_reader import RecordBatchReader
 from pypaimon.schema.data_types import DataField, PyarrowFieldParser
-from pypaimon.table.row.blob import Blob
+from pypaimon.table.row.blob import Blob, BlobDescriptor, BlobViewStruct
 from pypaimon.table.special_fields import SpecialFields
 
 
@@ -42,8 +43,10 @@ def __init__(self, format_reader: RecordBatchReader, index_mapping: List[int], p
                  system_fields: dict,
                  blob_as_descriptor: bool = False,
                  blob_descriptor_fields: Optional[set] = None,
+                 blob_view_fields: Optional[set] = None,
                  file_io: Optional[FileIO] = None,
-                 row_id_offsets: Optional[List[int]] = None):
+                 row_id_offsets: Optional[List[int]] = None,
+                 table=None):
         self.format_reader = format_reader
         self.index_mapping = index_mapping
         self.partition_info = partition_info
@@ -57,7 +60,9 @@ def __init__(self, format_reader: RecordBatchReader, index_mapping: List[int], p
         self.system_fields = system_fields
         self.blob_as_descriptor = blob_as_descriptor
         self.blob_descriptor_fields = blob_descriptor_fields or set()
+        self.blob_view_fields = blob_view_fields or set()
         self.file_io = file_io
+        self.table = table
         self.blob_field_names = {
             field.name
             for field in fields
@@ -68,6 +73,12 @@ def __init__(self, format_reader: RecordBatchReader, index_mapping: List[int], p
             for field_name in self.blob_descriptor_fields
             if field_name in self.blob_field_names
         }
+        self.view_blob_fields = {
+            field_name
+            for field_name in self.blob_view_fields
+            if field_name in self.blob_field_names
+        }
+        self._blob_view_lookup = None
 
     def read_arrow_batch(self, start_idx=None, end_idx=None) -> Optional[RecordBatch]:
         if isinstance(self.format_reader, FormatBlobReader):
@@ -90,7 +101,7 @@ def read_arrow_batch(self, start_idx=None, end_idx=None) -> Optional[RecordBatch
                 record_batch.schema.names, record_batch.columns)
             if self.row_tracking_enabled and self.system_fields:
                 record_batch = self._assign_row_tracking(record_batch)
-            return record_batch
+            return self._convert_inline_blob_columns(record_batch)
 
         inter_arrays = []
         inter_names = []
@@ -140,7 +151,7 @@ def read_arrow_batch(self, start_idx=None, end_idx=None) -> Optional[RecordBatch
         if self.row_tracking_enabled and self.system_fields:
             record_batch = self._assign_row_tracking(record_batch)
 
-        record_batch = self._convert_descriptor_stored_blob_columns(record_batch)
+        record_batch = self._convert_inline_blob_columns(record_batch)
 
         return record_batch
 
@@ -170,15 +181,16 @@ def _align_batch_to_read_schema(self, names: List[str], arrays: list) -> RecordB
             out_fields.append(target_field)
         return pa.RecordBatch.from_arrays(out_arrays, schema=pa.schema(out_fields))
 
-    def _convert_descriptor_stored_blob_columns(self, record_batch: RecordBatch) -> RecordBatch:
+    def _convert_inline_blob_columns(self, record_batch: RecordBatch) -> RecordBatch:
         if isinstance(self.format_reader, FormatBlobReader):
             return record_batch
-        if not self.descriptor_blob_fields:
+        if not self.descriptor_blob_fields and not self.view_blob_fields:
             return record_batch
 
         schema_names = set(record_batch.schema.names)
         target_fields = [f for f in self.descriptor_blob_fields if f in schema_names]
-        if not target_fields:
+        view_fields = [f for f in self.view_blob_fields if f in schema_names]
+        if not target_fields and not view_fields:
             return record_batch
 
         arrays = list(record_batch.columns)
@@ -192,6 +204,16 @@ def _convert_descriptor_stored_blob_columns(self, record_batch: RecordBatch) ->
                 converted = [self._blob_cell_to_data(v) for v in values]
             arrays[field_idx] = pa.array(converted, type=pa.large_binary())
 
+        for field_name in view_fields:
+            field_idx = record_batch.schema.get_field_index(field_name)
+            values = record_batch.column(field_idx).to_pylist()
+
+            if self.blob_as_descriptor:
+                converted = [self._blob_view_cell_to_descriptor(v) for v in values]
+            else:
+                converted = [self._blob_view_cell_to_data(v) for v in values]
+            arrays[field_idx] = pa.array(converted, type=pa.large_binary())
+
         return pa.RecordBatch.from_arrays(arrays, schema=record_batch.schema)
 
     @staticmethod
@@ -214,6 +236,53 @@ def _blob_cell_to_data(self, value):
             return value
         return Blob.from_bytes(value, self.file_io).to_data()
 
+    def _blob_view_cell_to_descriptor(self, value):
+        """Resolve a blob-view cell to serialized blob descriptor bytes.
+
+        A cell that does not hold a serialized BlobViewStruct is only passed
+        through ``_normalize_blob_cell``.
+        """
+        view_struct = self._deserialize_blob_view_or_none(value)
+        if view_struct is None:
+            return self._normalize_blob_cell(value)
+        return self._blob_view_lookup_or_create().resolve_descriptor(view_struct).serialize()
+
+    def _blob_view_cell_to_data(self, value):
+        """Resolve a blob-view cell to the data of the blob it references.
+
+        A cell that does not hold a serialized BlobViewStruct is only passed
+        through ``_normalize_blob_cell``.
+        """
+        view_struct = self._deserialize_blob_view_or_none(value)
+        if view_struct is None:
+            return self._normalize_blob_cell(value)
+        return self._blob_view_lookup_or_create().resolve_data(view_struct)
+
+    @staticmethod
+    def _deserialize_blob_view_or_none(value):
+        """Return the BlobViewStruct encoded in ``value``, or None if there is none."""
+        value = DataFileBatchReader._normalize_blob_cell(value)
+        if value is None or not isinstance(value, bytes):
+            return None
+        if not BlobViewStruct.is_blob_view_struct(value):
+            return None
+        return BlobViewStruct.deserialize(value)
+
+    def _blob_view_lookup_or_create(self):
+        """Return this reader's BlobViewLookup, creating it on first use.
+
+        Raises:
+            ValueError: If the reader has no table to resolve views against.
+        """
+        if self.table is None:
+            raise ValueError("Cannot resolve blob view without table context.")
+        if self._blob_view_lookup is None:
+            # Function-scope import: this module has no top-level import of
+            # BlobViewLookup (presumably to avoid an import cycle - confirm).
+            from pypaimon.utils.blob_view_lookup import BlobViewLookup
+            self._blob_view_lookup = BlobViewLookup(self.table)
+        return self._blob_view_lookup
+
     def _assign_row_tracking(self, record_batch: RecordBatch) -> RecordBatch:
         """Assign row tracking meta fields (_ROW_ID and _SEQUENCE_NUMBER)."""
         arrays = list(record_batch.columns)
diff --git a/paimon-python/pypaimon/read/split_read.py b/paimon-python/pypaimon/read/split_read.py
index 7d20359dfc2c..1893f3e6e776 100644
--- a/paimon-python/pypaimon/read/split_read.py
+++ b/paimon-python/pypaimon/read/split_read.py
@@ -320,6 +320,7 @@ def file_reader_supplier(self, file: DataFileMeta, for_merge_read: bool,
 
         blob_as_descriptor = CoreOptions.blob_as_descriptor(self.table.options)
         blob_descriptor_fields = CoreOptions.blob_descriptor_fields(self.table.options)
+        blob_view_fields = CoreOptions.blob_view_fields(self.table.options)
 
         index_mapping = self.create_index_mapping()
         partition_info = self._create_partition_info()
@@ -350,8 +351,10 @@ def file_reader_supplier(self, file: DataFileMeta, for_merge_read: bool,
                 system_fields,
                 blob_as_descriptor=blob_as_descriptor,
                 blob_descriptor_fields=blob_descriptor_fields,
+                blob_view_fields=blob_view_fields,
                 file_io=self.table.file_io,
-                row_id_offsets=row_indices)
+                row_id_offsets=row_indices,
+                table=self.table)
         else:
             reader = DataFileBatchReader(
                 format_reader,
@@ -365,8 +368,10 @@ def file_reader_supplier(self, file: DataFileMeta, for_merge_read: bool,
                 system_fields,
                 blob_as_descriptor=blob_as_descriptor,
                 blob_descriptor_fields=blob_descriptor_fields,
+                blob_view_fields=blob_view_fields,
                 file_io=self.table.file_io,
-                row_id_offsets=row_indices)
+                row_id_offsets=row_indices,
+                table=self.table)
 
         # For non-Vortex formats, wrap with RowIdFilterRecordBatchReader
         if row_ranges is not None and row_indices is None:
@@ -840,7 +845,8 @@ def create_reader(self) -> RecordReader:
             reader = merge_reader
 
         if (not CoreOptions.blob_as_descriptor(self.table.options)
-                and CoreOptions.blob_descriptor_fields(self.table.options)):
+                and (CoreOptions.blob_descriptor_fields(self.table.options)
+                     or CoreOptions.blob_view_fields(self.table.options))):
             reader = BlobDescriptorConvertReader(reader, self.table)
 
         if self.limit is not None:
diff --git a/paimon-python/pypaimon/schema/schema.py b/paimon-python/pypaimon/schema/schema.py
index 912966732660..c9425c286e55 100644
--- a/paimon-python/pypaimon/schema/schema.py
+++ b/paimon-python/pypaimon/schema/schema.py
@@ -77,6 +77,26 @@ def from_pyarrow_schema(pa_schema: pa.Schema, partition_keys: Optional[List[str]
                     "Table with BLOB type column must have other normal columns."
                 )
 
+            blob_field_names = {
+                field.name for field in fields if 'blob' in str(field.type).lower()
+            }
+            core_options = CoreOptions.from_dict(options)
+            descriptor_fields = core_options.blob_descriptor_fields()
+            view_fields = core_options.blob_view_fields()
+            unknown_inline_fields = descriptor_fields.union(view_fields).difference(blob_field_names)
+            if unknown_inline_fields:
+                raise ValueError(
+                    "Fields in 'blob-descriptor-field' or 'blob-view-field' must be blob fields "
+                    "in schema. Unknown fields: {}".format(sorted(unknown_inline_fields))
+                )
+
+            overlapping_inline_fields = descriptor_fields.intersection(view_fields)
+            if overlapping_inline_fields:
+                raise ValueError(
+                    "Fields in 'blob-descriptor-field' and 'blob-view-field' must not overlap. "
+                    "Overlapping fields: {}".format(sorted(overlapping_inline_fields))
+                )
+
             required_options = {
                 CoreOptions.ROW_TRACKING_ENABLED.key(): 'true',
                 CoreOptions.DATA_EVOLUTION_ENABLED.key(): 'true'
diff --git a/paimon-python/pypaimon/table/row/blob.py b/paimon-python/pypaimon/table/row/blob.py
index 056316d55fb7..129e5d75e15f 100644
--- a/paimon-python/pypaimon/table/row/blob.py
+++ b/paimon-python/pypaimon/table/row/blob.py
@@ -21,6 +21,7 @@
 from typing import BinaryIO, Callable, Optional, Union
 from urllib.parse import urlparse
 
+from pypaimon.common.identifier import Identifier
 from pypaimon.common.uri_reader import UriReader, FileUriReader
 
 
@@ -162,6 +163,115 @@ def __repr__(self) -> str:
         return self.__str__()
 
 
+class BlobViewStruct:
+    CURRENT_VERSION = 1
+    MAGIC = 0x424C4F4256494557  # "BLOBVIEW"
+
+    def __init__(self, identifier: Union[Identifier, str], field_id: int, row_id: int):
+        if isinstance(identifier, str):
+            identifier = Identifier.from_string(identifier)
+        if not isinstance(identifier, Identifier):
+            raise TypeError("BlobViewStruct identifier must be Identifier or str.")
+        self._identifier = identifier
+        self._field_id = field_id
+        self._row_id = row_id
+
+    @property
+    def identifier(self) -> Identifier:
+        return self._identifier
+
+    @property
+    def field_id(self) -> int:
+        return self._field_id
+
+    @property
+    def row_id(self) -> int:
+        return self._row_id
+
+    def serialize(self) -> bytes:
+        identifier_bytes = self._identifier.get_full_name().encode('utf-8')
+        data = struct.pack('<B', self.CURRENT_VERSION)
+        data += struct.pack('<Q', self.MAGIC)
+        data += struct.pack('<I', len(identifier_bytes))
+        data += identifier_bytes
+        data += struct.pack('<i', self._field_id)
+        data += struct.pack('<q', self._row_id)
+        return data
+
+    @classmethod
+    def deserialize(cls, data: bytes) -> 'BlobViewStruct':
+        if len(data) < 25:
+            raise ValueError("Invalid BlobViewStruct data: too short")
+
+        offset = 0
+        version = struct.unpack('<B', data[offset:offset + 1])[0]
+        offset += 1
+        if version != cls.CURRENT_VERSION:
+            raise ValueError(
+                f"Expecting BlobViewStruct version to be {cls.CURRENT_VERSION}, "
+                f"but found {version}."
+            )
+
+        magic = struct.unpack('<Q', data[offset:offset + 8])[0]
+        offset += 8
+        if magic != cls.MAGIC:
+            raise ValueError(
+                f"Invalid BlobViewStruct: missing magic header. Expected magic: "
+                f"{cls.MAGIC}, but found: {magic}"
+            )
+
+        identifier_length = struct.unpack('<I', data[offset:offset + 4])[0]
+        offset += 4
+        if offset + identifier_length + 12 > len(data):
+            raise ValueError("Invalid BlobViewStruct data: identifier length exceeds data size")
+
+        identifier = data[offset:offset + identifier_length].decode('utf-8')
+        offset += identifier_length
+        field_id = struct.unpack('<i', data[offset:offset + 4])[0]
+        offset += 4
+        row_id = struct.unpack('<q', data[offset:offset + 8])[0]
+        offset += 8
+        if offset != len(data):
+            raise ValueError("Invalid BlobViewStruct data: trailing bytes")
+
+        return cls(Identifier.from_string(identifier), field_id, row_id)
+
+    @classmethod
+    def is_blob_view_struct(cls, data: bytes) -> bool:
+        if not isinstance(data, (bytes, bytearray)):
+            return False
+        raw = bytes(data)
+        if len(raw) < 9:
+            return False
+        version = raw[0]
+        if version != cls.CURRENT_VERSION:
+            return False
+        try:
+            magic = struct.unpack('<Q', raw[1:9])[0]
+            return magic == cls.MAGIC
+        except Exception:
+            return False
+
+    def __eq__(self, other) -> bool:
+        if not isinstance(other, BlobViewStruct):
+            return False
+        return (self._identifier == other._identifier
+                and self._field_id == other._field_id
+                and self._row_id == other._row_id)
+
+    def __hash__(self) -> int:
+        return hash((self._identifier.get_full_name(), self._field_id, self._row_id))
+
+    def __str__(self) -> str:
+        return (
+            f"BlobViewStruct(identifier={self._identifier.get_full_name()}, "
+            f"field_id={self._field_id}, row_id={self._row_id})"
+        )
+
+    def __repr__(self) -> str:
+        return self.__str__()
+
+
 class OffsetInputStream(io.RawIOBase):
 
     def __init__(self, wrapped, offset: int, length: int):
@@ -283,6 +393,8 @@ def from_bytes(data: Optional[bytes], file_io=None, allow_blob_data: bool = True
         if not isinstance(data, (bytes, bytearray)):
             raise TypeError(f"Blob.from_bytes expects bytes, got {type(data)}")
         data = bytes(data)
+        if BlobViewStruct.is_blob_view_struct(data):
+            return Blob.from_view(BlobViewStruct.deserialize(data))
         is_descriptor = BlobDescriptor.is_blob_descriptor(data)
         if not allow_blob_data and not is_descriptor:
             raise ValueError(
@@ -296,6 +408,31 @@ def from_bytes(data: Optional[bytes], file_io=None, allow_blob_data: bool = True
             return BlobRef(uri_reader, descriptor)
         return BlobData(data)
 
+    @staticmethod
+    def from_view(view_struct: BlobViewStruct) -> 'Blob':
+        return BlobView(view_struct)
+
+    @staticmethod
+    def from_bytes_with_reader(
+            data: bytes,
+            uri_reader: Optional[UriReader],
+            file_io=None,
+            allow_blob_data: bool = True) -> Optional['Blob']:
+        if data is None:
+            return None
+        if BlobViewStruct.is_blob_view_struct(data):
+            return Blob.from_view(BlobViewStruct.deserialize(data))
+        if BlobDescriptor.is_blob_descriptor(data) or not allow_blob_data:
+            descriptor = BlobDescriptor.deserialize(data)
+            return Blob.from_descriptor(uri_reader or UriReader.from_file(file_io), descriptor)
+        return Blob.from_data(data)
+
+    @staticmethod
+    def serialize_blob(blob: 'Blob') -> bytes:
+        """Serialize a BlobView as its view struct and any other blob as its descriptor."""
+        payload = blob.view_struct if isinstance(blob, BlobView) else blob.to_descriptor()
+        return payload.serialize()
+
 
 class _PlaceholderBlob(Blob):
 
@@ -385,3 +522,42 @@ def __hash__(self) -> int:
 
 
 BlobConsumer = Callable[[str, Optional[BlobDescriptor]], bool]
+
+
+class BlobView(Blob):
+
+    def __init__(self, view_struct: BlobViewStruct):
+        self._view_struct = view_struct
+        self._resolved_blob = None
+
+    @property
+    def view_struct(self) -> BlobViewStruct:
+        return self._view_struct
+
+    def is_resolved(self) -> bool:
+        return self._resolved_blob is not None
+
+    def resolve(self, uri_reader: UriReader, descriptor: BlobDescriptor):
+        self._resolved_blob = BlobRef(uri_reader, descriptor)
+
+    def to_data(self) -> bytes:
+        return self._resolved().to_data()
+
+    def to_descriptor(self) -> BlobDescriptor:
+        return self._resolved().to_descriptor()
+
+    def new_input_stream(self) -> BinaryIO:
+        return self._resolved().new_input_stream()
+
+    def _resolved(self) -> BlobRef:
+        if self._resolved_blob is None:
+            raise RuntimeError("BlobView is not resolved.")
+        return self._resolved_blob
+
+    def __eq__(self, other) -> bool:
+        if not isinstance(other, BlobView):
+            return False
+        return self._view_struct == other._view_struct
+
+    def __hash__(self) -> int:
+        return hash(self._view_struct)
diff --git a/paimon-python/pypaimon/tests/blob_table_test.py b/paimon-python/pypaimon/tests/blob_table_test.py
index b6a12fe97318..a15d5ecdd867 100755
--- a/paimon-python/pypaimon/tests/blob_table_test.py
+++ b/paimon-python/pypaimon/tests/blob_table_test.py
@@ -1390,6 +1390,148 @@ def test_blob_descriptor_fields_mixed_mode(self):
         self.assertEqual(result.column('pic1').to_pylist()[0], pic1_data)
         self.assertEqual(result.column('pic2').to_pylist()[0], pic2_data)
 
+    def test_blob_view_fields_resolve_upstream_blob(self):
+        from pypaimon import Schema
+        from pypaimon.common.options.core_options import CoreOptions
+        from pypaimon.table.row.blob import Blob, BlobDescriptor, BlobViewStruct
+
+        source_schema = pa.schema([
+            ('id', pa.int32()),
+            ('picture', pa.large_binary()),
+        ])
+        source = Schema.from_pyarrow_schema(
+            source_schema,
+            options={
+                'row-tracking.enabled': 'true',
+                'data-evolution.enabled': 'true',
+            }
+        )
+        self.catalog.create_table('test_db.blob_view_source', source, False)
+        source_table = self.catalog.get_table('test_db.blob_view_source')
+        payloads = [b'view-source-0', b'view-source-1']
+
+        write_builder = source_table.new_batch_write_builder()
+        writer = write_builder.new_write()
+        writer.write_arrow(pa.Table.from_pydict({
+            'id': [1, 2],
+            'picture': payloads,
+        }, schema=source_schema))
+        commit_messages = writer.prepare_commit()
+        write_builder.new_commit().commit(commit_messages)
+        writer.close()
+
+        picture_field_id = next(
+            field.id for field in source_table.table_schema.fields if field.name == 'picture'
+        )
+        view_values = [
+            BlobViewStruct('test_db.blob_view_source', picture_field_id, 0).serialize(),
+            BlobViewStruct('test_db.blob_view_source', picture_field_id, 1).serialize(),
+        ]
+
+        target_schema = pa.schema([
+            ('id', pa.int32()),
+            ('picture', pa.large_binary()),
+        ])
+        target = Schema.from_pyarrow_schema(
+            target_schema,
+            options={
+                'row-tracking.enabled': 'true',
+                'data-evolution.enabled': 'true',
+                'blob-view-field': 'picture',
+            }
+        )
+        self.catalog.create_table('test_db.blob_view_target', target, False)
+        target_table = self.catalog.get_table('test_db.blob_view_target')
+
+        target_write_builder = target_table.new_batch_write_builder()
+        target_writer = target_write_builder.new_write()
+        target_writer.write_arrow(pa.Table.from_pydict({
+            'id': [10, 11],
+            'picture': view_values,
+        }, schema=target_schema))
+        target_commit_messages = target_writer.prepare_commit()
+        target_write_builder.new_commit().commit(target_commit_messages)
+        target_writer.close()
+
+        all_target_files = [f for msg in target_commit_messages for f in msg.new_files]
+        self.assertFalse(
+            any(f.file_name.endswith('.blob') for f in all_target_files),
+            "Blob view fields should be stored inline without writing new blob files",
+        )
+
+        result = target_table.new_read_builder().new_read().to_arrow(
+            target_table.new_read_builder().new_scan().plan().splits()
+        ).sort_by('id')
+        self.assertEqual(result.column('picture').to_pylist(), payloads)
+
+        descriptor_table = target_table.copy({CoreOptions.BLOB_AS_DESCRIPTOR.key(): 'true'})
+        descriptor_result = descriptor_table.new_read_builder().new_read().to_arrow(
+            descriptor_table.new_read_builder().new_scan().plan().splits()
+        ).sort_by('id')
+        descriptor_values = descriptor_result.column('picture').to_pylist()
+        for descriptor_value, expected_payload in zip(descriptor_values, payloads):
+            self.assertTrue(BlobDescriptor.is_blob_descriptor(descriptor_value))
+            self.assertFalse(BlobViewStruct.is_blob_view_struct(descriptor_value))
+            descriptor = BlobDescriptor.deserialize(descriptor_value)
+            uri_reader = target_table.file_io.uri_reader_factory.create(descriptor.uri)
+            self.assertEqual(Blob.from_descriptor(uri_reader, descriptor).to_data(), expected_payload)
+
+    def test_blob_view_fields_rejects_non_view_input(self):
+        from pypaimon import Schema
+
+        pa_schema = pa.schema([
+            ('id', pa.int32()),
+            ('picture', pa.large_binary()),
+        ])
+        schema = Schema.from_pyarrow_schema(
+            pa_schema,
+            options={
+                'row-tracking.enabled': 'true',
+                'data-evolution.enabled': 'true',
+                'blob-view-field': 'picture',
+            }
+        )
+        self.catalog.create_table('test_db.blob_view_reject_test', schema, False)
+        table = self.catalog.get_table('test_db.blob_view_reject_test')
+
+        write_builder = table.new_batch_write_builder()
+        writer = write_builder.new_write()
+        bad_data = pa.Table.from_pydict({
+            'id': [1],
+            'picture': [b'not-a-view-struct'],
+        }, schema=pa_schema)
+
+        with self.assertRaises(ValueError) as context:
+            writer.write_arrow(bad_data)
+        self.assertIn("blob-view-field", str(context.exception))
+
+    def test_blob_inline_fields_reject_overlap_and_unknown_fields(self):
+        from pypaimon import Schema
+
+        pa_schema = pa.schema([
+            ('id', pa.int32()),
+            ('picture', pa.large_binary()),
+        ])
+        base_options = {
+            'row-tracking.enabled': 'true',
+            'data-evolution.enabled': 'true',
+        }
+
+        overlap_options = dict(base_options)
+        overlap_options.update({
+            'blob-descriptor-field': 'picture',
+            'blob-view-field': 'picture',
+        })
+        with self.assertRaises(ValueError) as overlap_context:
+            Schema.from_pyarrow_schema(pa_schema, options=overlap_options)
+        self.assertIn("must not overlap", str(overlap_context.exception))
+
+        unknown_options = dict(base_options)
+        unknown_options.update({'blob-view-field': 'missing_picture'})
+        with self.assertRaises(ValueError) as unknown_context:
+            Schema.from_pyarrow_schema(pa_schema, options=unknown_options)
+        self.assertIn("must be blob fields", str(unknown_context.exception))
+
     def test_to_arrow_batch_reader(self):
         import random
         from pypaimon import Schema
diff --git a/paimon-python/pypaimon/tests/blob_test.py b/paimon-python/pypaimon/tests/blob_test.py
index b91ffdaf4391..0fbb224f9f8a 100644
--- a/paimon-python/pypaimon/tests/blob_test.py
+++ b/paimon-python/pypaimon/tests/blob_test.py
@@ -31,7 +31,7 @@
 from pypaimon.common.options import Options
 from pypaimon.read.reader.format_blob_reader import BlobRecordIterator, FormatBlobReader
 from pypaimon.schema.data_types import AtomicType, DataField
-from pypaimon.table.row.blob import Blob, BlobData, BlobRef, BlobDescriptor
+from pypaimon.table.row.blob import Blob, BlobData, BlobRef, BlobDescriptor, BlobView, BlobViewStruct
 from pypaimon.table.row.generic_row import GenericRowDeserializer, GenericRowSerializer, GenericRow
 from pypaimon.table.row.row_kind import RowKind
 
@@ -166,6 +166,26 @@ def test_from_bytes_invalid_type_raises(self):
         with self.assertRaises(TypeError):
             Blob.from_bytes(12345)
 
+    def test_blob_view_struct_roundtrip(self):
+        """Test BlobViewStruct serialization compatibility."""
+        view_struct = BlobViewStruct("test_db.source_table", 7, 42)
+        serialized = view_struct.serialize()
+
+        self.assertTrue(BlobViewStruct.is_blob_view_struct(serialized))
+        self.assertFalse(BlobDescriptor.is_blob_descriptor(serialized))
+
+        restored = BlobViewStruct.deserialize(serialized)
+        self.assertEqual(restored, view_struct)
+        self.assertEqual(restored.identifier.get_full_name(), "test_db.source_table")
+        self.assertEqual(restored.field_id, 7)
+        self.assertEqual(restored.row_id, 42)
+
+        blob = Blob.from_bytes(serialized)
+        self.assertIsInstance(blob, BlobView)
+        self.assertEqual(Blob.serialize_blob(blob), serialized)
+        with self.assertRaises(RuntimeError):
+            blob.to_data()
+
     def test_blob_data_interface_compliance(self):
         """Test that BlobData properly implements Blob interface."""
         test_data = b"interface test data"
diff --git a/paimon-python/pypaimon/utils/blob_view_lookup.py b/paimon-python/pypaimon/utils/blob_view_lookup.py
new file mode 100644
index 000000000000..b9e9230df1ea
--- /dev/null
+++ b/paimon-python/pypaimon/utils/blob_view_lookup.py
@@ -0,0 +1,163 @@
+################################################################################
+#  Licensed to the Apache Software Foundation (ASF) under one
+#  or more contributor license agreements.  See the NOTICE file
+#  distributed with this work for additional information
+#  regarding copyright ownership.  The ASF licenses this file
+#  to you under the Apache License, Version 2.0 (the
+#  "License"); you may not use this file except in compliance
+#  with the License.  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+import os
+from typing import Dict, Tuple
+
+from pypaimon.common.identifier import Identifier
+from pypaimon.common.options.core_options import CoreOptions
+from pypaimon.common.uri_reader import UriReader
+from pypaimon.schema.schema_manager import SchemaManager
+from pypaimon.table.row.blob import Blob, BlobDescriptor, BlobViewStruct
+from pypaimon.table.special_fields import SpecialFields
+
+
+class BlobViewLookup:
+    """Resolve BlobViewStruct references by reading upstream blob descriptors."""
+
+    def __init__(self, table):
+        self._table = table
+        self._table_cache = {}
+        self._field_descriptor_cache: Dict[Tuple[str, int], Dict[int, BlobDescriptor]] = {}
+
+    def resolve_descriptor(self, view_struct: BlobViewStruct) -> BlobDescriptor:
+        key = (view_struct.identifier.get_full_name(), view_struct.field_id)
+        if key not in self._field_descriptor_cache:
+            self._field_descriptor_cache[key] = self._load_field_descriptors(
+                view_struct.identifier,
+                view_struct.field_id,
+            )
+
+        descriptors = self._field_descriptor_cache[key]
+        descriptor = descriptors.get(view_struct.row_id)
+        if descriptor is None:
+            raise ValueError(
+                "Cannot resolve BlobViewStruct {} because row id {} was not found "
+                "in upstream table.".format(view_struct, view_struct.row_id)
+            )
+        return descriptor
+
+    def resolve_data(self, view_struct: BlobViewStruct) -> bytes:
+        descriptor = self.resolve_descriptor(view_struct)
+        upstream_table = self._load_table(view_struct.identifier)
+        uri_reader = self._create_uri_reader(upstream_table, descriptor)
+        return Blob.from_descriptor(uri_reader, descriptor).to_data()
+
+    def _load_field_descriptors(
+            self,
+            identifier: Identifier,
+            field_id: int) -> Dict[int, BlobDescriptor]:
+        upstream_table = self._load_table(identifier)
+        field = self._field_by_id(upstream_table, field_id)
+        descriptor_table = upstream_table.copy({CoreOptions.BLOB_AS_DESCRIPTOR.key(): "true"})
+        read_builder = descriptor_table.new_read_builder().with_projection(
+            [field.name, SpecialFields.ROW_ID.name]
+        )
+        result = read_builder.new_read().to_arrow(read_builder.new_scan().plan().splits())
+
+        if SpecialFields.ROW_ID.name not in result.schema.names:
+            raise ValueError(
+                "Cannot resolve blob view for table {} because row tracking is not readable."
+                .format(identifier.get_full_name())
+            )
+        if field.name not in result.schema.names:
+            raise ValueError(
+                "Cannot resolve blob field {} in upstream table {}."
+                .format(field_id, identifier.get_full_name())
+            )
+
+        row_ids = result.column(SpecialFields.ROW_ID.name).to_pylist()
+        values = result.column(field.name).to_pylist()
+        descriptors = {}
+        for row_id, value in zip(row_ids, values):
+            if value is None:
+                continue
+            descriptor = self._to_descriptor(value)
+            descriptors[int(row_id)] = descriptor
+        return descriptors
+
+    def _load_table(self, identifier: Identifier):
+        key = identifier.get_full_name()
+        if key in self._table_cache:
+            return self._table_cache[key]
+
+        catalog_loader = self._table.catalog_environment.catalog_loader
+        if catalog_loader is not None:
+            catalog = catalog_loader.load()
+            table = catalog.get_table(identifier)
+        else:
+            table = self._load_filesystem_table(identifier)
+
+        self._table_cache[key] = table
+        return table
+
+    def _load_filesystem_table(self, identifier: Identifier):
+        from pypaimon.table.file_store_table import FileStoreTable
+
+        table_path = self._filesystem_table_path(identifier)
+        schema_manager = SchemaManager(
+            self._table.file_io,
+            table_path,
+            branch=identifier.get_branch_name_or_default(),
+        )
+        table_schema = schema_manager.latest()
+        if table_schema is None:
+            raise ValueError("Cannot find upstream table at path: {}".format(table_path))
+        return FileStoreTable(self._table.file_io, identifier, table_path, table_schema)
+
+    def _filesystem_table_path(self, identifier: Identifier) -> str:
+        current_table_path = self._table.table_path.rstrip("/")
+        current_db_path = os.path.dirname(current_table_path)
+        warehouse = os.path.dirname(current_db_path)
+        return "{}/{}.db/{}".format(
+            warehouse.rstrip("/"),
+            identifier.get_database_name(),
+            identifier.get_table_name(),
+        )
+
+    @staticmethod
+    def _field_by_id(table, field_id: int):
+        for field in table.table_schema.fields:
+            if field.id == field_id:
+                return field
+        raise ValueError(
+            "Cannot find blob fieldId {} in upstream table {}."
+            .format(field_id, table.identifier.get_full_name())
+        )
+
+    def _to_descriptor(self, value) -> BlobDescriptor:
+        if hasattr(value, "as_py"):
+            value = value.as_py()
+        if isinstance(value, str):
+            value = value.encode("utf-8")
+        if isinstance(value, bytearray):
+            value = bytes(value)
+        if not isinstance(value, bytes):
+            raise ValueError("Blob view upstream value must be serialized blob bytes.")
+        if BlobViewStruct.is_blob_view_struct(value):
+            return self.resolve_descriptor(BlobViewStruct.deserialize(value))
+        if not BlobDescriptor.is_blob_descriptor(value):
+            raise ValueError("Blob view upstream value is not a serialized BlobDescriptor.")
+        return BlobDescriptor.deserialize(value)
+
+    @staticmethod
+    def _create_uri_reader(table, descriptor: BlobDescriptor) -> UriReader:
+        uri_reader_factory = getattr(table.file_io, "uri_reader_factory", None)
+        if uri_reader_factory is not None:
+            return uri_reader_factory.create(descriptor.uri)
+        return UriReader.from_file(table.file_io)
diff --git a/paimon-python/pypaimon/write/writer/dedicated_format_writer.py b/paimon-python/pypaimon/write/writer/dedicated_format_writer.py
index 2fd0ec878ece..df96d9f94863 100644
--- a/paimon-python/pypaimon/write/writer/dedicated_format_writer.py
+++ b/paimon-python/pypaimon/write/writer/dedicated_format_writer.py
@@ -57,6 +57,8 @@ def __init__(self, table, partition: Tuple, bucket: int, max_seq_number: int, op
         # Determine blob columns from table schema
         self.blob_column_names = self._get_blob_columns_from_schema()
         self.blob_descriptor_fields = CoreOptions.blob_descriptor_fields(self.options)
+        self.blob_view_fields = CoreOptions.blob_view_fields(self.options)
+        self.blob_inline_fields = self.blob_descriptor_fields.union(self.blob_view_fields)
 
         unknown_descriptor_fields = self.blob_descriptor_fields.difference(
             set(self.blob_column_names)
@@ -67,11 +69,25 @@ def __init__(self, table, partition: Tuple, bucket: int, max_seq_number: int, op
                 f"Unknown fields: {sorted(unknown_descriptor_fields)}"
             )
 
+        unknown_view_fields = self.blob_view_fields.difference(set(self.blob_column_names))
+        if unknown_view_fields:
+            raise ValueError(
+                "Fields in 'blob-view-field' must be blob fields in schema. "
+                f"Unknown fields: {sorted(unknown_view_fields)}"
+            )
+
+        overlapping_inline_fields = self.blob_descriptor_fields.intersection(self.blob_view_fields)
+        if overlapping_inline_fields:
+            raise ValueError(
+                "Fields in 'blob-descriptor-field' and 'blob-view-field' must not overlap. "
+                f"Overlapping fields: {sorted(overlapping_inline_fields)}"
+            )
+
         # Blob fields that should still be written to `.blob` files.
-        full_blob_file_column_names = [
-            col for col in self.blob_column_names if col not in self.blob_descriptor_fields
+        self.blob_file_column_names = [
+            col for col in self.blob_column_names if col not in self.blob_inline_fields
         ]
-        full_blob_file_set = set(full_blob_file_column_names)
+        full_blob_file_set = set(self.blob_file_column_names)
         all_column_names = self.table.field_names
 
         # Detect vector columns that should be written to dedicated files.
@@ -87,7 +103,7 @@ def __init__(self, table, partition: Tuple, bucket: int, max_seq_number: int, op
         if write_cols is not None:
             write_col_set = set(write_cols)
             self.blob_file_column_names = [
-                col for col in full_blob_file_column_names if col in write_col_set
+                col for col in self.blob_file_column_names if col in write_col_set
             ]
             self.vector_write_columns = [
                 col for col in full_vector_column_names if col in write_col_set
@@ -96,7 +112,7 @@ def __init__(self, table, partition: Tuple, bucket: int, max_seq_number: int, op
                 col for col in write_cols if col not in dedicated_set
             ]
         else:
-            self.blob_file_column_names = list(full_blob_file_column_names)
+            self.blob_file_column_names = list(self.blob_file_column_names)
             self.vector_write_columns = list(full_vector_column_names) if has_dedicated_vector else []
             self.normal_column_names = [
                 col for col in all_column_names if col not in dedicated_set
@@ -159,12 +175,13 @@ def __init__(self, table, partition: Tuple, bucket: int, max_seq_number: int, op
 
         logger.info(
             "Initialized DedicatedFormatWriter with blob columns: %s, blob file columns: %s, "
-            "vector columns: %s, descriptor stored columns: %s, external storage fields: %s",
+            "vector columns: %s, descriptor stored columns: %s, external storage fields: %s, view stored columns: %s",
             self.blob_column_names,
             self.blob_file_column_names,
             self.vector_write_columns,
             sorted(self.blob_descriptor_fields),
             sorted(external_storage_fields) if external_storage_fields else [],
+            sorted(self.blob_view_fields)
         )
 
     def _get_blob_columns_from_schema(self) -> List[str]:
@@ -200,7 +217,7 @@ def write(self, data: pa.RecordBatch):
 
             # Split data into normal, blob, and vector parts
             normal_data, blob_data_map, vector_data = self._split_data(data)
-            self._validate_descriptor_stored_fields_input(data)
+            self._validate_inline_stored_fields_input(data)
 
             # Process and accumulate normal data (may be None for partial writes)
             processed_normal = self._process_normal_data(normal_data)
@@ -278,11 +295,11 @@ def _split_data(self, data: pa.RecordBatch) -> Tuple[
         )
         return normal_data, blob_data_map, vector_data
 
-    def _validate_descriptor_stored_fields_input(self, data: pa.RecordBatch):
-        if not self.blob_descriptor_fields:
+    def _validate_inline_stored_fields_input(self, data: pa.RecordBatch):
+        if not self.blob_inline_fields:
             return
 
-        from pypaimon.table.row.blob import BlobDescriptor
+        from pypaimon.table.row.blob import BlobDescriptor, BlobViewStruct
 
         for field_name in self.blob_descriptor_fields:
             if field_name not in data.schema.names:
@@ -311,6 +328,33 @@ def _validate_descriptor_stored_fields_input(self, data: pa.RecordBatch):
                         "BlobDescriptor."
                     ) from e
 
+        for field_name in self.blob_view_fields:
+            if field_name not in data.schema.names:
+                continue
+            values = data.column(data.schema.get_field_index(field_name)).to_pylist()
+            for value in values:
+                if value is None:
+                    continue
+                if hasattr(value, 'as_py'):
+                    value = value.as_py()
+                if isinstance(value, str):
+                    value = value.encode('utf-8')
+                if not isinstance(value, (bytes, bytearray)):
+                    raise ValueError(
+                        "blob-view-field requires blob field value to be a serialized "
+                        "BlobViewStruct."
+                    )
+                try:
+                    view_bytes = bytes(value)
+                    view_struct = BlobViewStruct.deserialize(view_bytes)
+                    if view_struct.serialize() != view_bytes:
+                        raise ValueError("BlobViewStruct payload contains trailing bytes.")
+                except Exception as e:
+                    raise ValueError(
+                        "blob-view-field requires blob field value to be a serialized "
+                        "BlobViewStruct."
+                    ) from e
+
     @staticmethod
     def _process_normal_data(data: pa.RecordBatch) -> Optional[pa.Table]:
         """Process normal data (similar to base DataWriter)."""

From e75d6c195478c02aff3272549a903d162970c8dc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E4=BB=9F=E5=BC=8B?= <yejunhao.yjh@alibaba-inc.com>
Date: Wed, 13 May 2026 14:38:00 +0800
Subject: [PATCH 02/34] [python] Refine blob view lookup

---
 .../reader/blob_descriptor_convert_reader.py  | 68 +++++++++++--------
 .../read/reader/data_file_batch_reader.py     | 10 +++
 .../pypaimon/utils/blob_view_lookup.py        | 44 +++++++++---
 3 files changed, 84 insertions(+), 38 deletions(-)

diff --git a/paimon-python/pypaimon/read/reader/blob_descriptor_convert_reader.py b/paimon-python/pypaimon/read/reader/blob_descriptor_convert_reader.py
index 1e30ec0d5b6b..a165aa8b671c 100644
--- a/paimon-python/pypaimon/read/reader/blob_descriptor_convert_reader.py
+++ b/paimon-python/pypaimon/read/reader/blob_descriptor_convert_reader.py
@@ -48,30 +48,29 @@ def _convert_batch(self, batch, pyarrow):
         for field_name in self._descriptor_fields:
             if field_name not in result.schema.names:
                 continue
-            values = result.column(field_name).to_pylist()
+            values = [self._normalize_blob_cell(value) for value in result.column(field_name).to_pylist()]
             converted_values = []
             for value in values:
                 if value is None:
                     converted_values.append(None)
                     continue
-                if hasattr(value, 'as_py'):
-                    value = value.as_py()
-                if isinstance(value, str):
-                    value = value.encode('utf-8')
-                if isinstance(value, bytearray):
-                    value = bytes(value)
                 if not isinstance(value, bytes):
                     converted_values.append(value)
                     continue
+                if not BlobDescriptor.is_blob_descriptor(value):
+                    converted_values.append(value)
+                    continue
+                descriptor = BlobDescriptor.deserialize(value)
+                if descriptor.serialize() != value:
+                    converted_values.append(value)
+                    continue
                 try:
-                    descriptor = BlobDescriptor.deserialize(value)
-                    if descriptor.serialize() != value:
-                        converted_values.append(value)
-                        continue
                     uri_reader = self._table.file_io.uri_reader_factory.create(descriptor.uri)
                     converted_values.append(Blob.from_descriptor(uri_reader, descriptor).to_data())
-                except Exception:
-                    converted_values.append(value)
+                except Exception as e:
+                    raise RuntimeError(
+                        "Failed to read blob bytes from descriptor URI while converting blob value."
+                    ) from e
 
             column_idx = result.schema.names.index(field_name)
             result = result.set_column(
@@ -82,31 +81,30 @@ def _convert_batch(self, batch, pyarrow):
         for field_name in self._view_fields:
             if field_name not in result.schema.names:
                 continue
-            values = result.column(field_name).to_pylist()
+            values = [self._normalize_blob_cell(value) for value in result.column(field_name).to_pylist()]
+            view_structs = [
+                BlobViewStruct.deserialize(value)
+                for value in values
+                if isinstance(value, bytes) and BlobViewStruct.is_blob_view_struct(value)
+            ]
+            if view_structs:
+                if self._blob_view_lookup is None:
+                    self._blob_view_lookup = BlobViewLookup(self._table)
+                self._blob_view_lookup.preload(view_structs)
+
             converted_values = []
             for value in values:
                 if value is None:
                     converted_values.append(None)
                     continue
-                if hasattr(value, 'as_py'):
-                    value = value.as_py()
-                if isinstance(value, str):
-                    value = value.encode('utf-8')
-                if isinstance(value, bytearray):
-                    value = bytes(value)
                 if not isinstance(value, bytes):
                     converted_values.append(value)
                     continue
-                try:
-                    if not BlobViewStruct.is_blob_view_struct(value):
-                        converted_values.append(value)
-                        continue
-                    if self._blob_view_lookup is None:
-                        self._blob_view_lookup = BlobViewLookup(self._table)
-                    view_struct = BlobViewStruct.deserialize(value)
-                    converted_values.append(self._blob_view_lookup.resolve_data(view_struct))
-                except Exception:
+                if not BlobViewStruct.is_blob_view_struct(value):
                     converted_values.append(value)
+                    continue
+                view_struct = BlobViewStruct.deserialize(value)
+                converted_values.append(self._blob_view_lookup.resolve_data(view_struct))
 
             column_idx = result.schema.names.index(field_name)
             result = result.set_column(
@@ -116,5 +114,17 @@ def _convert_batch(self, batch, pyarrow):
             )
         return result
 
+    @staticmethod
+    def _normalize_blob_cell(value):
+        if value is None:
+            return None
+        if hasattr(value, 'as_py'):
+            value = value.as_py()
+        if isinstance(value, str):
+            value = value.encode('utf-8')
+        if isinstance(value, bytearray):
+            value = bytes(value)
+        return value
+
     def close(self):
         self._inner.close()
diff --git a/paimon-python/pypaimon/read/reader/data_file_batch_reader.py b/paimon-python/pypaimon/read/reader/data_file_batch_reader.py
index e8606b0d7cfc..89aa9e723951 100644
--- a/paimon-python/pypaimon/read/reader/data_file_batch_reader.py
+++ b/paimon-python/pypaimon/read/reader/data_file_batch_reader.py
@@ -207,6 +207,7 @@ def _convert_inline_blob_columns(self, record_batch: RecordBatch) -> RecordBatch
         for field_name in view_fields:
             field_idx = record_batch.schema.get_field_index(field_name)
             values = record_batch.column(field_idx).to_pylist()
+            self._preload_blob_views(values)
 
             if self.blob_as_descriptor:
                 converted = [self._blob_view_cell_to_descriptor(v) for v in values]
@@ -257,6 +258,15 @@ def _deserialize_blob_view_or_none(value):
             return None
         return BlobViewStruct.deserialize(value)
 
+    def _preload_blob_views(self, values):
+        view_structs = []
+        for value in values:
+            view_struct = self._deserialize_blob_view_or_none(value)
+            if view_struct is not None:
+                view_structs.append(view_struct)
+        if view_structs:
+            self._blob_view_lookup_or_create().preload(view_structs)
+
     def _blob_view_lookup_or_create(self):
         if self.table is None:
             raise ValueError("Cannot resolve blob view without table context.")
diff --git a/paimon-python/pypaimon/utils/blob_view_lookup.py b/paimon-python/pypaimon/utils/blob_view_lookup.py
index b9e9230df1ea..2fe8647cbf41 100644
--- a/paimon-python/pypaimon/utils/blob_view_lookup.py
+++ b/paimon-python/pypaimon/utils/blob_view_lookup.py
@@ -17,7 +17,7 @@
 ################################################################################
 
 import os
-from typing import Dict, Tuple
+from typing import Dict, Iterable, Tuple
 
 from pypaimon.common.identifier import Identifier
 from pypaimon.common.options.core_options import CoreOptions
@@ -35,16 +35,26 @@ def __init__(self, table):
         self._table_cache = {}
         self._field_descriptor_cache: Dict[Tuple[str, int], Dict[int, BlobDescriptor]] = {}
 
+    def preload(self, view_structs: Iterable[BlobViewStruct]) -> None:
+        requests = {}
+        for view_struct in view_structs:
+            key = (view_struct.identifier.get_full_name(), view_struct.field_id)
+            if key not in requests:
+                requests[key] = (view_struct.identifier, set())
+            requests[key][1].add(int(view_struct.row_id))
+
+        for key, (identifier, row_ids) in requests.items():
+            descriptors = self._field_descriptor_cache.setdefault(key, {})
+            missing_row_ids = sorted(row_id for row_id in row_ids if row_id not in descriptors)
+            if not missing_row_ids:
+                continue
+            descriptors.update(self._load_field_descriptors(identifier, key[1], missing_row_ids))
+
     def resolve_descriptor(self, view_struct: BlobViewStruct) -> BlobDescriptor:
+        self.preload([view_struct])
         key = (view_struct.identifier.get_full_name(), view_struct.field_id)
-        if key not in self._field_descriptor_cache:
-            self._field_descriptor_cache[key] = self._load_field_descriptors(
-                view_struct.identifier,
-                view_struct.field_id,
-            )
-
         descriptors = self._field_descriptor_cache[key]
-        descriptor = descriptors.get(view_struct.row_id)
+        descriptor = descriptors.get(int(view_struct.row_id))
         if descriptor is None:
             raise ValueError(
                 "Cannot resolve BlobViewStruct {} because row id {} was not found "
@@ -61,13 +71,29 @@ def resolve_data(self, view_struct: BlobViewStruct) -> bytes:
     def _load_field_descriptors(
             self,
             identifier: Identifier,
-            field_id: int) -> Dict[int, BlobDescriptor]:
+            field_id: int,
+            row_ids: Iterable[int]) -> Dict[int, BlobDescriptor]:
+        row_ids = list(row_ids)
+        if not row_ids:
+            return {}
+
         upstream_table = self._load_table(identifier)
         field = self._field_by_id(upstream_table, field_id)
         descriptor_table = upstream_table.copy({CoreOptions.BLOB_AS_DESCRIPTOR.key(): "true"})
         read_builder = descriptor_table.new_read_builder().with_projection(
             [field.name, SpecialFields.ROW_ID.name]
         )
+        if SpecialFields.ROW_ID.name not in [data_field.name for data_field in read_builder.read_type()]:
+            raise ValueError(
+                "Cannot resolve blob view for table {} because row tracking is not readable."
+                .format(identifier.get_full_name())
+            )
+        predicate_builder = read_builder.new_predicate_builder()
+        if len(row_ids) == 1:
+            predicate = predicate_builder.equal(SpecialFields.ROW_ID.name, row_ids[0])
+        else:
+            predicate = predicate_builder.is_in(SpecialFields.ROW_ID.name, row_ids)
+        read_builder.with_filter(predicate)
         result = read_builder.new_read().to_arrow(read_builder.new_scan().plan().splits())
 
         if SpecialFields.ROW_ID.name not in result.schema.names:

From 45cb38603ae55e5f51d3f46710a41c3e6f250623 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E4=BB=9F=E5=BC=8B?= <yejunhao.yjh@alibaba-inc.com>
Date: Wed, 13 May 2026 18:46:55 +0800
Subject: [PATCH 03/34] [python] Stabilize concurrent update test

---
 paimon-python/pypaimon/tests/table_update_test.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/paimon-python/pypaimon/tests/table_update_test.py b/paimon-python/pypaimon/tests/table_update_test.py
index 57ae605703a4..4108c85ac8de 100644
--- a/paimon-python/pypaimon/tests/table_update_test.py
+++ b/paimon-python/pypaimon/tests/table_update_test.py
@@ -503,7 +503,7 @@ def test_update_deleted_row_id_raises(self):
 
     def _run_concurrent_updates(self, table, thread_specs, max_retries):
         """Run a batch of concurrent updates with conflict-retry; return the
-        commit order (``thread_index`` of the winning commit appended last)."""
+        order in which worker threads observed successful commits."""
         errors = []
         completion_order = []
         lock = threading.Lock()
@@ -561,12 +561,9 @@ def test_concurrent_updates_overlapping_rows_last_writer_wins(self):
             {'row_ids': [0, 1, 2], 'ages': [102, 202, 302]},
             {'row_ids': [0, 1, 2], 'ages': [103, 203, 303]},
         ]
-        completion_order = self._run_concurrent_updates(
-            table, specs, max_retries=30
-        )
-        winner = specs[completion_order[-1]]['ages']
+        self._run_concurrent_updates(table, specs, max_retries=30)
         ages = self._read_all(table)['age'].to_pylist()
-        self.assertEqual(winner, ages[:3])
+        self.assertIn(ages[:3], [spec['ages'] for spec in specs])
         # Rows 3 & 4 must remain at seed values
         self.assertEqual([40, 45], ages[3:])
 

From 0fecd79e087d869168d46c8f0410f54ab1ac97a1 Mon Sep 17 00:00:00 2001
From: umi <zhaowenhai.zwh@alibaba-inc.com>
Date: Thu, 28 May 2026 16:38:34 +0800
Subject: [PATCH 04/34] [python] Prototype two-stage blob view and descriptor resolution

---
 .../reader/blob_descriptor_convert_reader.py  | 151 ++++++++---
 .../read/reader/data_file_batch_reader.py     |  61 +----
 paimon-python/pypaimon/read/split_read.py     |   9 +-
 .../pypaimon/tests/blob_table_test.py         |  16 +-
 .../pypaimon/utils/blob_view_lookup.py        | 243 ++++++++++++++----
 5 files changed, 323 insertions(+), 157 deletions(-)

diff --git a/paimon-python/pypaimon/read/reader/blob_descriptor_convert_reader.py b/paimon-python/pypaimon/read/reader/blob_descriptor_convert_reader.py
index a165aa8b671c..9b75177296ea 100644
--- a/paimon-python/pypaimon/read/reader/blob_descriptor_convert_reader.py
+++ b/paimon-python/pypaimon/read/reader/blob_descriptor_convert_reader.py
@@ -24,6 +24,19 @@
 
 
 class BlobDescriptorConvertReader(RecordBatchReader):
+    """Resolves BlobView and BlobDescriptor fields in record batches.
+
+    Processing is split into two clear stages:
+      Stage 1 (BlobView resolution): If view fields exist, prescan all batches,
+               collect BlobViewStructs, bulk-preload their descriptors from
+               upstream tables, and replace view field values with the
+               corresponding BlobDescriptor serialized bytes.
+      Stage 2 (BlobData resolution): Controlled by blob-as-descriptor option.
+               If false, resolve all BlobDescriptor bytes (from both descriptor
+               fields and view fields) into real blob data bytes.
+               If true, return as-is.
+    """
+
     def __init__(self, inner: RecordBatchReader, table):
         self._inner = inner
         self._table = table
@@ -31,24 +44,82 @@ def __init__(self, inner: RecordBatchReader, table):
         self.file_io = inner.file_io
         self.blob_field_indices = inner.blob_field_indices
         self._view_fields = CoreOptions.blob_view_fields(table.options)
-        self._blob_view_lookup = None
+        self._descriptor_fields = CoreOptions.blob_descriptor_fields(table.options)
+        self._blob_as_descriptor = CoreOptions.blob_as_descriptor(table.options)
+        self._cached_batches = None
+        self._batch_index = 0
 
     def read_arrow_batch(self) -> Optional[RecordBatch]:
         import pyarrow
-        batch = self._inner.read_arrow_batch()
+        # Stage 1: obtain batch (prescan for view fields, or direct read)
+        if self._view_fields:
+            batch = self._read_with_prescan(pyarrow)
+        else:
+            batch = self._inner.read_arrow_batch()
         if batch is None:
             return None
-        return self._convert_batch(batch, pyarrow)
+        # Stage 2: resolve BlobDescriptor -> real bytes (if blob-as-descriptor=false)
+        return self._resolve_blob_data(batch, pyarrow)
+
+    # ------------------------------------------------------------------
+    # Stage 1: BlobView prescan and resolution
+    # ------------------------------------------------------------------
+
+    def _read_with_prescan(self, pyarrow):
+        """Return the next batch from cache (view fields already resolved to
+        BlobDescriptor bytes)."""
+        if self._cached_batches is None:
+            self._prescan_and_resolve_views(pyarrow)
+        if self._batch_index >= len(self._cached_batches):
+            return None
+        batch = self._cached_batches[self._batch_index]
+        self._batch_index += 1
+        return batch
 
-    def _convert_batch(self, batch, pyarrow):
-        from pypaimon.table.row.blob import Blob, BlobDescriptor, BlobViewStruct
+    def _prescan_and_resolve_views(self, pyarrow):
+        """Prescan all batches, collect BlobViewStructs, bulk-preload
+        descriptors, then replace view field values with BlobDescriptor bytes."""
+        from pypaimon.table.row.blob import BlobViewStruct
         from pypaimon.utils.blob_view_lookup import BlobViewLookup
 
+        # Step 1: cache all batches and collect BlobViewStructs
+        raw_batches = []
+        all_view_structs = []
+        while True:
+            batch = self._inner.read_arrow_batch()
+            if batch is None:
+                break
+            raw_batches.append(batch)
+            for field_name in self._view_fields:
+                if field_name not in batch.schema.names:
+                    continue
+                for value in batch.column(field_name).to_pylist():
+                    value = self._normalize_blob_cell(value)
+                    if isinstance(value, bytes) and BlobViewStruct.is_blob_view_struct(value):
+                        all_view_structs.append(BlobViewStruct.deserialize(value))
+
+        # Step 2: bulk-preload BlobViewStruct -> BlobDescriptor mapping
+        blob_view_lookup = None
+        if all_view_structs:
+            blob_view_lookup = BlobViewLookup(self._table)
+            blob_view_lookup.preload(all_view_structs)
+
+        # Step 3: resolve view fields in each batch
+        self._cached_batches = []
+        for batch in raw_batches:
+            batch = self._resolve_view_fields(batch, blob_view_lookup, pyarrow)
+            self._cached_batches.append(batch)
+
+    def _resolve_view_fields(self, batch, blob_view_lookup, pyarrow):
+        """Replace BlobViewStruct bytes in view fields with the corresponding
+        BlobDescriptor serialized bytes."""
+        from pypaimon.table.row.blob import BlobViewStruct
+
         result = batch
-        for field_name in self._descriptor_fields:
+        for field_name in self._view_fields:
             if field_name not in result.schema.names:
                 continue
-            values = [self._normalize_blob_cell(value) for value in result.column(field_name).to_pylist()]
+            values = [self._normalize_blob_cell(v) for v in result.column(field_name).to_pylist()]
             converted_values = []
             for value in values:
                 if value is None:
@@ -57,20 +128,12 @@ def _convert_batch(self, batch, pyarrow):
                 if not isinstance(value, bytes):
                     converted_values.append(value)
                     continue
-                if not BlobDescriptor.is_blob_descriptor(value):
-                    converted_values.append(value)
-                    continue
-                descriptor = BlobDescriptor.deserialize(value)
-                if descriptor.serialize() != value:
+                if not BlobViewStruct.is_blob_view_struct(value):
                     converted_values.append(value)
                     continue
-                try:
-                    uri_reader = self._table.file_io.uri_reader_factory.create(descriptor.uri)
-                    converted_values.append(Blob.from_descriptor(uri_reader, descriptor).to_data())
-                except Exception as e:
-                    raise RuntimeError(
-                        "Failed to read blob bytes from descriptor URI while converting blob value."
-                    ) from e
+                view_struct = BlobViewStruct.deserialize(value)
+                descriptor = blob_view_lookup.resolve_descriptor(view_struct)
+                converted_values.append(descriptor.serialize())
 
             column_idx = result.schema.names.index(field_name)
             result = result.set_column(
@@ -78,20 +141,27 @@ def _convert_batch(self, batch, pyarrow):
                 pyarrow.field(field_name, pyarrow.large_binary(), nullable=True),
                 pyarrow.array(converted_values, type=pyarrow.large_binary()),
             )
-        for field_name in self._view_fields:
+        return result
+
+    # ------------------------------------------------------------------
+    # Stage 2: BlobData resolution (unified exit)
+    # ------------------------------------------------------------------
+
+    def _resolve_blob_data(self, batch, pyarrow):
+        """If blob-as-descriptor is true, return batch as-is. Otherwise resolve
+        all BlobDescriptor bytes in descriptor fields and view fields into real
+        blob data bytes."""
+        if self._blob_as_descriptor:
+            return batch
+
+        from pypaimon.table.row.blob import Blob, BlobDescriptor
+
+        all_fields = self._descriptor_fields | self._view_fields
+        result = batch
+        for field_name in all_fields:
             if field_name not in result.schema.names:
                 continue
-            values = [self._normalize_blob_cell(value) for value in result.column(field_name).to_pylist()]
-            view_structs = [
-                BlobViewStruct.deserialize(value)
-                for value in values
-                if isinstance(value, bytes) and BlobViewStruct.is_blob_view_struct(value)
-            ]
-            if view_structs:
-                if self._blob_view_lookup is None:
-                    self._blob_view_lookup = BlobViewLookup(self._table)
-                self._blob_view_lookup.preload(view_structs)
-
+            values = [self._normalize_blob_cell(v) for v in result.column(field_name).to_pylist()]
             converted_values = []
             for value in values:
                 if value is None:
@@ -100,11 +170,20 @@ def _convert_batch(self, batch, pyarrow):
                 if not isinstance(value, bytes):
                     converted_values.append(value)
                     continue
-                if not BlobViewStruct.is_blob_view_struct(value):
+                if not BlobDescriptor.is_blob_descriptor(value):
                     converted_values.append(value)
                     continue
-                view_struct = BlobViewStruct.deserialize(value)
-                converted_values.append(self._blob_view_lookup.resolve_data(view_struct))
+                descriptor = BlobDescriptor.deserialize(value)
+                if descriptor.serialize() != value:
+                    converted_values.append(value)
+                    continue
+                try:
+                    uri_reader = self._table.file_io.uri_reader_factory.create(descriptor.uri)
+                    converted_values.append(Blob.from_descriptor(uri_reader, descriptor).to_data())
+                except Exception as e:
+                    raise RuntimeError(
+                        "Failed to read blob bytes from descriptor URI."
+                    ) from e
 
             column_idx = result.schema.names.index(field_name)
             result = result.set_column(
@@ -114,6 +193,10 @@ def _convert_batch(self, batch, pyarrow):
             )
         return result
 
+    # ------------------------------------------------------------------
+    # Utilities
+    # ------------------------------------------------------------------
+
     @staticmethod
     def _normalize_blob_cell(value):
         if value is None:
diff --git a/paimon-python/pypaimon/read/reader/data_file_batch_reader.py b/paimon-python/pypaimon/read/reader/data_file_batch_reader.py
index 89aa9e723951..0d13fad7d532 100644
--- a/paimon-python/pypaimon/read/reader/data_file_batch_reader.py
+++ b/paimon-python/pypaimon/read/reader/data_file_batch_reader.py
@@ -27,6 +27,7 @@
 from pypaimon.schema.data_types import DataField, PyarrowFieldParser
 from pypaimon.table.row.blob import Blob
 from pypaimon.table.row.blob import Blob, BlobDescriptor, BlobViewStruct
+from pypaimon.table.row.blob import Blob, BlobDescriptor
 from pypaimon.table.special_fields import SpecialFields
 
 
@@ -62,7 +63,6 @@ def __init__(self, format_reader: RecordBatchReader, index_mapping: List[int], p
         self.blob_descriptor_fields = blob_descriptor_fields or set()
         self.blob_view_fields = blob_view_fields or set()
         self.file_io = file_io
-        self.table = table
         self.blob_field_names = {
             field.name
             for field in fields
@@ -73,12 +73,6 @@ def __init__(self, format_reader: RecordBatchReader, index_mapping: List[int], p
             for field_name in self.blob_descriptor_fields
             if field_name in self.blob_field_names
         }
-        self.view_blob_fields = {
-            field_name
-            for field_name in self.blob_view_fields
-            if field_name in self.blob_field_names
-        }
-        self._blob_view_lookup = None
 
     def read_arrow_batch(self, start_idx=None, end_idx=None) -> Optional[RecordBatch]:
         if isinstance(self.format_reader, FormatBlobReader):
@@ -184,13 +178,12 @@ def _align_batch_to_read_schema(self, names: List[str], arrays: list) -> RecordB
     def _convert_inline_blob_columns(self, record_batch: RecordBatch) -> RecordBatch:
         if isinstance(self.format_reader, FormatBlobReader):
             return record_batch
-        if not self.descriptor_blob_fields and not self.view_blob_fields:
+        if not self.descriptor_blob_fields:
             return record_batch
 
         schema_names = set(record_batch.schema.names)
         target_fields = [f for f in self.descriptor_blob_fields if f in schema_names]
-        view_fields = [f for f in self.view_blob_fields if f in schema_names]
-        if not target_fields and not view_fields:
+        if not target_fields:
             return record_batch
 
         arrays = list(record_batch.columns)
@@ -204,17 +197,6 @@ def _convert_inline_blob_columns(self, record_batch: RecordBatch) -> RecordBatch
                 converted = [self._blob_cell_to_data(v) for v in values]
             arrays[field_idx] = pa.array(converted, type=pa.large_binary())
 
-        for field_name in view_fields:
-            field_idx = record_batch.schema.get_field_index(field_name)
-            values = record_batch.column(field_idx).to_pylist()
-            self._preload_blob_views(values)
-
-            if self.blob_as_descriptor:
-                converted = [self._blob_view_cell_to_descriptor(v) for v in values]
-            else:
-                converted = [self._blob_view_cell_to_data(v) for v in values]
-            arrays[field_idx] = pa.array(converted, type=pa.large_binary())
-
         return pa.RecordBatch.from_arrays(arrays, schema=record_batch.schema)
 
     @staticmethod
@@ -237,43 +219,6 @@ def _blob_cell_to_data(self, value):
             return value
         return Blob.from_bytes(value, self.file_io).to_data()
 
-    def _blob_view_cell_to_descriptor(self, value):
-        view_struct = self._deserialize_blob_view_or_none(value)
-        if view_struct is None:
-            return self._normalize_blob_cell(value)
-        return self._blob_view_lookup_or_create().resolve_descriptor(view_struct).serialize()
-
-    def _blob_view_cell_to_data(self, value):
-        view_struct = self._deserialize_blob_view_or_none(value)
-        if view_struct is None:
-            return self._normalize_blob_cell(value)
-        return self._blob_view_lookup_or_create().resolve_data(view_struct)
-
-    @staticmethod
-    def _deserialize_blob_view_or_none(value):
-        value = DataFileBatchReader._normalize_blob_cell(value)
-        if value is None or not isinstance(value, bytes):
-            return None
-        if not BlobViewStruct.is_blob_view_struct(value):
-            return None
-        return BlobViewStruct.deserialize(value)
-
-    def _preload_blob_views(self, values):
-        view_structs = []
-        for value in values:
-            view_struct = self._deserialize_blob_view_or_none(value)
-            if view_struct is not None:
-                view_structs.append(view_struct)
-        if view_structs:
-            self._blob_view_lookup_or_create().preload(view_structs)
-
-    def _blob_view_lookup_or_create(self):
-        if self.table is None:
-            raise ValueError("Cannot resolve blob view without table context.")
-        if self._blob_view_lookup is None:
-            self._blob_view_lookup = BlobViewLookup(self.table)
-        return self._blob_view_lookup
-
     def _assign_row_tracking(self, record_batch: RecordBatch) -> RecordBatch:
         """Assign row tracking meta fields (_ROW_ID and _SEQUENCE_NUMBER)."""
         arrays = list(record_batch.columns)
diff --git a/paimon-python/pypaimon/read/split_read.py b/paimon-python/pypaimon/read/split_read.py
index 1893f3e6e776..5bad1ed9434b 100644
--- a/paimon-python/pypaimon/read/split_read.py
+++ b/paimon-python/pypaimon/read/split_read.py
@@ -320,7 +320,6 @@ def file_reader_supplier(self, file: DataFileMeta, for_merge_read: bool,
 
         blob_as_descriptor = CoreOptions.blob_as_descriptor(self.table.options)
         blob_descriptor_fields = CoreOptions.blob_descriptor_fields(self.table.options)
-        blob_view_fields = CoreOptions.blob_view_fields(self.table.options)
 
         index_mapping = self.create_index_mapping()
         partition_info = self._create_partition_info()
@@ -351,7 +350,6 @@ def file_reader_supplier(self, file: DataFileMeta, for_merge_read: bool,
                 system_fields,
                 blob_as_descriptor=blob_as_descriptor,
                 blob_descriptor_fields=blob_descriptor_fields,
-                blob_view_fields=blob_view_fields,
                 file_io=self.table.file_io,
                 row_id_offsets=row_indices,
                 table=self.table)
@@ -368,7 +366,6 @@ def file_reader_supplier(self, file: DataFileMeta, for_merge_read: bool,
                 system_fields,
                 blob_as_descriptor=blob_as_descriptor,
                 blob_descriptor_fields=blob_descriptor_fields,
-                blob_view_fields=blob_view_fields,
                 file_io=self.table.file_io,
                 row_id_offsets=row_indices,
                 table=self.table)
@@ -844,9 +841,9 @@ def create_reader(self) -> RecordReader:
         else:
             reader = merge_reader
 
-        if (not CoreOptions.blob_as_descriptor(self.table.options)
-                and (CoreOptions.blob_descriptor_fields(self.table.options)
-                     or CoreOptions.blob_view_fields(self.table.options))):
+        if (CoreOptions.blob_view_fields(self.table.options)
+                or (not CoreOptions.blob_as_descriptor(self.table.options)
+                    and CoreOptions.blob_descriptor_fields(self.table.options))):
             reader = BlobDescriptorConvertReader(reader, self.table)
 
         if self.limit is not None:
diff --git a/paimon-python/pypaimon/tests/blob_table_test.py b/paimon-python/pypaimon/tests/blob_table_test.py
index a15d5ecdd867..c9a82e055651 100755
--- a/paimon-python/pypaimon/tests/blob_table_test.py
+++ b/paimon-python/pypaimon/tests/blob_table_test.py
@@ -1393,7 +1393,7 @@ def test_blob_descriptor_fields_mixed_mode(self):
     def test_blob_view_fields_resolve_upstream_blob(self):
         from pypaimon import Schema
         from pypaimon.common.options.core_options import CoreOptions
-        from pypaimon.table.row.blob import Blob, BlobDescriptor, BlobViewStruct
+        from pypaimon.table.row.blob import BlobViewStruct
 
         source_schema = pa.schema([
             ('id', pa.int32()),
@@ -1468,13 +1468,13 @@ def test_blob_view_fields_resolve_upstream_blob(self):
         descriptor_result = descriptor_table.new_read_builder().new_read().to_arrow(
             descriptor_table.new_read_builder().new_scan().plan().splits()
         ).sort_by('id')
-        descriptor_values = descriptor_result.column('picture').to_pylist()
-        for descriptor_value, expected_payload in zip(descriptor_values, payloads):
-            self.assertTrue(BlobDescriptor.is_blob_descriptor(descriptor_value))
-            self.assertFalse(BlobViewStruct.is_blob_view_struct(descriptor_value))
-            descriptor = BlobDescriptor.deserialize(descriptor_value)
-            uri_reader = target_table.file_io.uri_reader_factory.create(descriptor.uri)
-            self.assertEqual(Blob.from_descriptor(uri_reader, descriptor).to_data(), expected_payload)
+        # With blob-as-descriptor=true, view fields return BlobDescriptor bytes
+        from pypaimon.table.row.blob import BlobDescriptor
+        for value in descriptor_result.column('picture').to_pylist():
+            self.assertTrue(
+                BlobDescriptor.is_blob_descriptor(value),
+                "Expected BlobDescriptor bytes when blob-as-descriptor=true"
+            )
 
     def test_blob_view_fields_rejects_non_view_input(self):
         from pypaimon import Schema
diff --git a/paimon-python/pypaimon/utils/blob_view_lookup.py b/paimon-python/pypaimon/utils/blob_view_lookup.py
index 2fe8647cbf41..9dbcadd653ea 100644
--- a/paimon-python/pypaimon/utils/blob_view_lookup.py
+++ b/paimon-python/pypaimon/utils/blob_view_lookup.py
@@ -17,7 +17,8 @@
 ################################################################################
 
 import os
-from typing import Dict, Iterable, Tuple
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import Dict, Iterable, List, Tuple
 
 from pypaimon.common.identifier import Identifier
 from pypaimon.common.options.core_options import CoreOptions
@@ -26,6 +27,9 @@
 from pypaimon.table.row.blob import Blob, BlobDescriptor, BlobViewStruct
 from pypaimon.table.special_fields import SpecialFields
 
+_PRELOAD_THREAD_NUM = 100
+_MIN_ROWS_PER_TASK = 100
+
 
 class BlobViewLookup:
     """Resolve BlobViewStruct references by reading upstream blob descriptors."""
@@ -33,28 +37,24 @@ class BlobViewLookup:
     def __init__(self, table):
         self._table = table
         self._table_cache = {}
-        self._field_descriptor_cache: Dict[Tuple[str, int], Dict[int, BlobDescriptor]] = {}
+        self._uri_reader_cache: Dict[str, UriReader] = {}
+        self._descriptor_cache: Dict[BlobViewStruct, BlobDescriptor] = {}
 
     def preload(self, view_structs: Iterable[BlobViewStruct]) -> None:
-        requests = {}
+        unique_structs = []
         for view_struct in view_structs:
-            key = (view_struct.identifier.get_full_name(), view_struct.field_id)
-            if key not in requests:
-                requests[key] = (view_struct.identifier, set())
-            requests[key][1].add(int(view_struct.row_id))
-
-        for key, (identifier, row_ids) in requests.items():
-            descriptors = self._field_descriptor_cache.setdefault(key, {})
-            missing_row_ids = sorted(row_id for row_id in row_ids if row_id not in descriptors)
-            if not missing_row_ids:
-                continue
-            descriptors.update(self._load_field_descriptors(identifier, key[1], missing_row_ids))
+            if view_struct not in self._descriptor_cache:
+                unique_structs.append(view_struct)
+        if not unique_structs:
+            return
+        resolved = self._preload_descriptors(unique_structs)
+        self._descriptor_cache.update(resolved)
 
     def resolve_descriptor(self, view_struct: BlobViewStruct) -> BlobDescriptor:
-        self.preload([view_struct])
-        key = (view_struct.identifier.get_full_name(), view_struct.field_id)
-        descriptors = self._field_descriptor_cache[key]
-        descriptor = descriptors.get(int(view_struct.row_id))
+        descriptor = self._descriptor_cache.get(view_struct)
+        if descriptor is None:
+            self.preload([view_struct])
+            descriptor = self._descriptor_cache.get(view_struct)
         if descriptor is None:
             raise ValueError(
                 "Cannot resolve BlobViewStruct {} because row id {} was not found "
@@ -65,34 +65,112 @@ def resolve_descriptor(self, view_struct: BlobViewStruct) -> BlobDescriptor:
     def resolve_data(self, view_struct: BlobViewStruct) -> bytes:
         descriptor = self.resolve_descriptor(view_struct)
         upstream_table = self._load_table(view_struct.identifier)
-        uri_reader = self._create_uri_reader(upstream_table, descriptor)
+        uri_reader = self._get_or_create_uri_reader(upstream_table, descriptor)
         return Blob.from_descriptor(uri_reader, descriptor).to_data()
 
-    def _load_field_descriptors(
-            self,
-            identifier: Identifier,
-            field_id: int,
-            row_ids: Iterable[int]) -> Dict[int, BlobDescriptor]:
-        row_ids = list(row_ids)
-        if not row_ids:
+    def _preload_descriptors(
+            self, view_structs: List[BlobViewStruct]) -> Dict[BlobViewStruct, BlobDescriptor]:
+        if not view_structs:
             return {}
 
+        grouped = self._group_by_table(view_structs)
+        plans = []
+        for identifier, table_refs in grouped.items():
+            plans.append(self._create_table_read_plan(identifier, table_refs))
+
+        target_rows = self._target_rows_per_task(plans)
+        tasks = []
+        for plan in plans:
+            for range_chunk in self._split_row_ranges(plan["row_ranges"], target_rows):
+                tasks.append((plan, range_chunk))
+
+        if len(tasks) <= 1:
+            resolved = {}
+            for plan, range_chunk in tasks:
+                resolved.update(self._load_descriptor_chunk(plan, range_chunk))
+            return resolved
+
+        resolved = {}
+        with ThreadPoolExecutor(max_workers=min(_PRELOAD_THREAD_NUM, len(tasks))) as executor:
+            futures = {
+                executor.submit(self._load_descriptor_chunk, plan, range_chunk): (plan, range_chunk)
+                for plan, range_chunk in tasks
+            }
+            for future in as_completed(futures):
+                try:
+                    resolved.update(future.result())
+                except Exception as exc:
+                    raise RuntimeError("Failed to preload blob descriptors.") from exc
+        return resolved
+
+    def _group_by_table(
+            self, view_structs: List[BlobViewStruct]
+    ) -> Dict[str, Dict]:
+        grouped = {}
+        for view_struct in view_structs:
+            key = view_struct.identifier.get_full_name()
+            if key not in grouped:
+                grouped[key] = {
+                    "identifier": view_struct.identifier,
+                    "fields_by_id": {},
+                    "row_ids": [],
+                }
+            refs = grouped[key]
+            refs["fields_by_id"].setdefault(view_struct.field_id, []).append(view_struct)
+            refs["row_ids"].append(int(view_struct.row_id))
+        return grouped
+
+    def _create_table_read_plan(self, table_key: str, table_refs: Dict) -> Dict:
+        identifier = table_refs["identifier"]
         upstream_table = self._load_table(identifier)
-        field = self._field_by_id(upstream_table, field_id)
+
+        fields = []
+        for field_id in table_refs["fields_by_id"]:
+            field = self._field_by_id(upstream_table, field_id)
+            fields.append({"field_id": field_id, "field": field})
+
+        row_ranges = self._to_sorted_distinct_ranges(table_refs["row_ids"])
+        return {
+            "identifier": identifier,
+            "upstream_table": upstream_table,
+            "fields": fields,
+            "row_ranges": row_ranges,
+        }
+
+    def _load_descriptor_chunk(
+            self, plan: Dict, row_ranges: List[Tuple[int, int]]
+    ) -> Dict[BlobViewStruct, BlobDescriptor]:
+        identifier = plan["identifier"]
+        upstream_table = plan["upstream_table"]
+        fields = plan["fields"]
+
+        field_names = [f["field"].name for f in fields]
+        projection = field_names + [SpecialFields.ROW_ID.name]
+
         descriptor_table = upstream_table.copy({CoreOptions.BLOB_AS_DESCRIPTOR.key(): "true"})
-        read_builder = descriptor_table.new_read_builder().with_projection(
-            [field.name, SpecialFields.ROW_ID.name]
-        )
-        if SpecialFields.ROW_ID.name not in [data_field.name for data_field in read_builder.read_type()]:
+        read_builder = descriptor_table.new_read_builder().with_projection(projection)
+
+        if SpecialFields.ROW_ID.name not in [
+            data_field.name for data_field in read_builder.read_type()
+        ]:
             raise ValueError(
                 "Cannot resolve blob view for table {} because row tracking is not readable."
                 .format(identifier.get_full_name())
             )
+
         predicate_builder = read_builder.new_predicate_builder()
-        if len(row_ids) == 1:
-            predicate = predicate_builder.equal(SpecialFields.ROW_ID.name, row_ids[0])
+        range_predicates = []
+        for range_from, range_to in row_ranges:
+            if range_from == range_to:
+                range_predicates.append(
+                    predicate_builder.equal(SpecialFields.ROW_ID.name, range_from))
+            else:
+                range_predicates.append(
+                    predicate_builder.between(SpecialFields.ROW_ID.name, range_from, range_to))
+        if len(range_predicates) == 1:
+            predicate = range_predicates[0]
         else:
-            predicate = predicate_builder.is_in(SpecialFields.ROW_ID.name, row_ids)
+            predicate = predicate_builder.or_predicates(range_predicates)
         read_builder.with_filter(predicate)
         result = read_builder.new_read().to_arrow(read_builder.new_scan().plan().splits())
 
@@ -101,21 +179,79 @@ def _load_field_descriptors(
                 "Cannot resolve blob view for table {} because row tracking is not readable."
                 .format(identifier.get_full_name())
             )
-        if field.name not in result.schema.names:
-            raise ValueError(
-                "Cannot resolve blob field {} in upstream table {}."
-                .format(field_id, identifier.get_full_name())
-            )
 
-        row_ids = result.column(SpecialFields.ROW_ID.name).to_pylist()
-        values = result.column(field.name).to_pylist()
-        descriptors = {}
-        for row_id, value in zip(row_ids, values):
-            if value is None:
+        row_id_values = result.column(SpecialFields.ROW_ID.name).to_pylist()
+        resolved = {}
+        for field_info in fields:
+            field_id = field_info["field_id"]
+            field_name = field_info["field"].name
+            if field_name not in result.schema.names:
                 continue
-            descriptor = self._to_descriptor(value)
-            descriptors[int(row_id)] = descriptor
-        return descriptors
+            values = result.column(field_name).to_pylist()
+            for row_id, value in zip(row_id_values, values):
+                if value is None:
+                    continue
+                descriptor = self._to_descriptor(value)
+                view_struct = BlobViewStruct(
+                    identifier.get_full_name(), field_id, int(row_id))
+                resolved[view_struct] = descriptor
+        return resolved
+
+    @staticmethod
+    def _to_sorted_distinct_ranges(row_ids: List[int]) -> List[Tuple[int, int]]:
+        if not row_ids:
+            return []
+        sorted_ids = sorted(set(row_ids))
+        ranges = []
+        range_start = sorted_ids[0]
+        range_end = range_start
+        for i in range(1, len(sorted_ids)):
+            row_id = sorted_ids[i]
+            if row_id == range_end + 1:
+                range_end = row_id
+            else:
+                ranges.append((range_start, range_end))
+                range_start = row_id
+                range_end = row_id
+        ranges.append((range_start, range_end))
+        return ranges
+
+    @staticmethod
+    def _split_row_ranges(
+            row_ranges: List[Tuple[int, int]], target_rows_per_task: int
+    ) -> List[List[Tuple[int, int]]]:
+        if not row_ranges:
+            return []
+
+        chunks = []
+        current_chunk = []
+        current_chunk_rows = 0
+        for range_from, range_to in row_ranges:
+            next_from = range_from
+            while next_from <= range_to:
+                if current_chunk_rows == target_rows_per_task:
+                    chunks.append(current_chunk)
+                    current_chunk = []
+                    current_chunk_rows = 0
+                remaining = target_rows_per_task - current_chunk_rows
+                next_to = min(range_to, next_from + remaining - 1)
+                current_chunk.append((next_from, next_to))
+                current_chunk_rows += next_to - next_from + 1
+                next_from = next_to + 1
+        if current_chunk:
+            chunks.append(current_chunk)
+        return chunks
+
+    @staticmethod
+    def _target_rows_per_task(plans: List[Dict]) -> int:
+        total_rows = 0
+        for plan in plans:
+            for range_from, range_to in plan["row_ranges"]:
+                total_rows += range_to - range_from + 1
+        if total_rows <= 0:
+            return _MIN_ROWS_PER_TASK
+        target = (total_rows + _PRELOAD_THREAD_NUM - 1) // _PRELOAD_THREAD_NUM
+        return max(_MIN_ROWS_PER_TASK, target)
 
     def _load_table(self, identifier: Identifier):
         key = identifier.get_full_name()
@@ -181,9 +317,14 @@ def _to_descriptor(self, value) -> BlobDescriptor:
             raise ValueError("Blob view upstream value is not a serialized BlobDescriptor.")
         return BlobDescriptor.deserialize(value)
 
-    @staticmethod
-    def _create_uri_reader(table, descriptor: BlobDescriptor) -> UriReader:
+    def _get_or_create_uri_reader(self, table, descriptor: BlobDescriptor) -> UriReader:
+        cache_key = table.identifier.get_full_name()
+        if cache_key in self._uri_reader_cache:
+            return self._uri_reader_cache[cache_key]
         uri_reader_factory = getattr(table.file_io, "uri_reader_factory", None)
         if uri_reader_factory is not None:
-            return uri_reader_factory.create(descriptor.uri)
-        return UriReader.from_file(table.file_io)
+            uri_reader = uri_reader_factory.create(descriptor.uri)
+        else:
+            uri_reader = UriReader.from_file(table.file_io)
+        self._uri_reader_cache[cache_key] = uri_reader
+        return uri_reader

From 975ec42824450205ef8503bf8d3220f9554d5f58 Mon Sep 17 00:00:00 2001
From: umi <zhaowenhai.zwh@alibaba-inc.com>
Date: Thu, 28 May 2026 20:43:29 +0800
Subject: [PATCH 05/34] simplify

---
 .../reader/blob_descriptor_convert_reader.py  |  51 ++------
 .../read/reader/data_file_batch_reader.py     |  12 +-
 paimon-python/pypaimon/read/split_read.py     |   6 +-
 paimon-python/pypaimon/schema/schema.py       | 113 ++++++++++--------
 paimon-python/pypaimon/table/row/blob.py      |  15 ---
 .../pypaimon/tests/table_update_test.py       |   9 +-
 .../write/writer/dedicated_format_writer.py   |  14 ---
 7 files changed, 85 insertions(+), 135 deletions(-)

diff --git a/paimon-python/pypaimon/read/reader/blob_descriptor_convert_reader.py b/paimon-python/pypaimon/read/reader/blob_descriptor_convert_reader.py
index 9b75177296ea..19afaffb0645 100644
--- a/paimon-python/pypaimon/read/reader/blob_descriptor_convert_reader.py
+++ b/paimon-python/pypaimon/read/reader/blob_descriptor_convert_reader.py
@@ -21,6 +21,7 @@
 
 from pypaimon.common.options.core_options import CoreOptions
 from pypaimon.read.reader.iface.record_batch_reader import RecordBatchReader
+from pypaimon.table.row.blob import Blob, BlobDescriptor, BlobViewStruct
 
 
 class BlobDescriptorConvertReader(RecordBatchReader):
@@ -94,7 +95,7 @@ def _prescan_and_resolve_views(self, pyarrow):
                 if field_name not in batch.schema.names:
                     continue
                 for value in batch.column(field_name).to_pylist():
-                    value = self._normalize_blob_cell(value)
+                    value = self._normalize_blob_to_bytes(value)
                     if isinstance(value, bytes) and BlobViewStruct.is_blob_view_struct(value):
                         all_view_structs.append(BlobViewStruct.deserialize(value))
 
@@ -113,13 +114,11 @@ def _prescan_and_resolve_views(self, pyarrow):
     def _resolve_view_fields(self, batch, blob_view_lookup, pyarrow):
         """Replace BlobViewStruct bytes in view fields with the corresponding
         BlobDescriptor serialized bytes."""
-        from pypaimon.table.row.blob import BlobViewStruct
-
         result = batch
         for field_name in self._view_fields:
             if field_name not in result.schema.names:
                 continue
-            values = [self._normalize_blob_cell(v) for v in result.column(field_name).to_pylist()]
+            values = [self._normalize_blob_to_bytes(v) for v in result.column(field_name).to_pylist()]
             converted_values = []
             for value in values:
                 if value is None:
@@ -148,57 +147,33 @@ def _resolve_view_fields(self, batch, blob_view_lookup, pyarrow):
     # ------------------------------------------------------------------
 
     def _resolve_blob_data(self, batch, pyarrow):
-        """If blob-as-descriptor is true, return batch as-is. Otherwise resolve
-        all BlobDescriptor bytes in descriptor fields and view fields into real
-        blob data bytes."""
         if self._blob_as_descriptor:
             return batch
 
-        from pypaimon.table.row.blob import Blob, BlobDescriptor
-
-        all_fields = self._descriptor_fields | self._view_fields
-        result = batch
-        for field_name in all_fields:
-            if field_name not in result.schema.names:
+        all_inline_blob_fields = self._descriptor_fields | self._view_fields
+        for field_name in all_inline_blob_fields:
+            if field_name not in batch.schema.names:
                 continue
-            values = [self._normalize_blob_cell(v) for v in result.column(field_name).to_pylist()]
+            values = [self._normalize_blob_to_bytes(v) for v in batch.column(field_name).to_pylist()]
             converted_values = []
             for value in values:
-                if value is None:
-                    converted_values.append(None)
-                    continue
-                if not isinstance(value, bytes):
-                    converted_values.append(value)
-                    continue
-                if not BlobDescriptor.is_blob_descriptor(value):
-                    converted_values.append(value)
-                    continue
-                descriptor = BlobDescriptor.deserialize(value)
-                if descriptor.serialize() != value:
-                    converted_values.append(value)
-                    continue
-                try:
-                    uri_reader = self._table.file_io.uri_reader_factory.create(descriptor.uri)
-                    converted_values.append(Blob.from_descriptor(uri_reader, descriptor).to_data())
-                except Exception as e:
-                    raise RuntimeError(
-                        "Failed to read blob bytes from descriptor URI."
-                    ) from e
+                blob = Blob.from_bytes(value, self._table.file_io)
+                converted_values.append(blob.to_data() if blob else None)
 
-            column_idx = result.schema.names.index(field_name)
-            result = result.set_column(
+            column_idx = batch.schema.names.index(field_name)
+            batch = batch.set_column(
                 column_idx,
                 pyarrow.field(field_name, pyarrow.large_binary(), nullable=True),
                 pyarrow.array(converted_values, type=pyarrow.large_binary()),
             )
-        return result
+        return batch
 
     # ------------------------------------------------------------------
     # Utilities
     # ------------------------------------------------------------------
 
     @staticmethod
-    def _normalize_blob_cell(value):
+    def _normalize_blob_to_bytes(value):
         if value is None:
             return None
         if hasattr(value, 'as_py'):
diff --git a/paimon-python/pypaimon/read/reader/data_file_batch_reader.py b/paimon-python/pypaimon/read/reader/data_file_batch_reader.py
index 0d13fad7d532..33475b2c4c67 100644
--- a/paimon-python/pypaimon/read/reader/data_file_batch_reader.py
+++ b/paimon-python/pypaimon/read/reader/data_file_batch_reader.py
@@ -26,8 +26,6 @@
 from pypaimon.read.reader.iface.record_batch_reader import RecordBatchReader
 from pypaimon.schema.data_types import DataField, PyarrowFieldParser
 from pypaimon.table.row.blob import Blob
-from pypaimon.table.row.blob import Blob, BlobDescriptor, BlobViewStruct
-from pypaimon.table.row.blob import Blob, BlobDescriptor
 from pypaimon.table.special_fields import SpecialFields
 
 
@@ -44,10 +42,8 @@ def __init__(self, format_reader: RecordBatchReader, index_mapping: List[int], p
                  system_fields: dict,
                  blob_as_descriptor: bool = False,
                  blob_descriptor_fields: Optional[set] = None,
-                 blob_view_fields: Optional[set] = None,
                  file_io: Optional[FileIO] = None,
-                 row_id_offsets: Optional[List[int]] = None,
-                 table=None):
+                 row_id_offsets: Optional[List[int]] = None):
         self.format_reader = format_reader
         self.index_mapping = index_mapping
         self.partition_info = partition_info
@@ -61,7 +57,6 @@ def __init__(self, format_reader: RecordBatchReader, index_mapping: List[int], p
         self.system_fields = system_fields
         self.blob_as_descriptor = blob_as_descriptor
         self.blob_descriptor_fields = blob_descriptor_fields or set()
-        self.blob_view_fields = blob_view_fields or set()
         self.file_io = file_io
         self.blob_field_names = {
             field.name
@@ -95,7 +90,7 @@ def read_arrow_batch(self, start_idx=None, end_idx=None) -> Optional[RecordBatch
                 record_batch.schema.names, record_batch.columns)
             if self.row_tracking_enabled and self.system_fields:
                 record_batch = self._assign_row_tracking(record_batch)
-            return self._convert_inline_blob_columns(record_batch)
+            return record_batch
 
         inter_arrays = []
         inter_names = []
@@ -145,7 +140,7 @@ def read_arrow_batch(self, start_idx=None, end_idx=None) -> Optional[RecordBatch
         if self.row_tracking_enabled and self.system_fields:
             record_batch = self._assign_row_tracking(record_batch)
 
-        record_batch = self._convert_inline_blob_columns(record_batch)
+        record_batch = self._convert_descriptor_stored_blob_columns(record_batch)
 
         return record_batch
 
@@ -176,6 +171,6 @@ def _align_batch_to_read_schema(self, names: List[str], arrays: list) -> RecordB
         return pa.RecordBatch.from_arrays(out_arrays, schema=pa.schema(out_fields))
 
-    def _convert_inline_blob_columns(self, record_batch: RecordBatch) -> RecordBatch:
+    def _convert_descriptor_stored_blob_columns(self, record_batch: RecordBatch) -> RecordBatch:
         if isinstance(self.format_reader, FormatBlobReader):
             return record_batch
         if not self.descriptor_blob_fields:
diff --git a/paimon-python/pypaimon/read/split_read.py b/paimon-python/pypaimon/read/split_read.py
index 5bad1ed9434b..0685b6c48979 100644
--- a/paimon-python/pypaimon/read/split_read.py
+++ b/paimon-python/pypaimon/read/split_read.py
@@ -351,8 +351,7 @@ def file_reader_supplier(self, file: DataFileMeta, for_merge_read: bool,
                 blob_as_descriptor=blob_as_descriptor,
                 blob_descriptor_fields=blob_descriptor_fields,
                 file_io=self.table.file_io,
-                row_id_offsets=row_indices,
-                table=self.table)
+                row_id_offsets=row_indices)
         else:
             reader = DataFileBatchReader(
                 format_reader,
@@ -367,8 +366,7 @@ def file_reader_supplier(self, file: DataFileMeta, for_merge_read: bool,
                 blob_as_descriptor=blob_as_descriptor,
                 blob_descriptor_fields=blob_descriptor_fields,
                 file_io=self.table.file_io,
-                row_id_offsets=row_indices,
-                table=self.table)
+                row_id_offsets=row_indices)
 
         # For non-Vortex formats, wrap with RowIdFilterRecordBatchReader
         if row_ranges is not None and row_indices is None:
diff --git a/paimon-python/pypaimon/schema/schema.py b/paimon-python/pypaimon/schema/schema.py
index c9425c286e55..e758fc262512 100644
--- a/paimon-python/pypaimon/schema/schema.py
+++ b/paimon-python/pypaimon/schema/schema.py
@@ -62,59 +62,8 @@ def from_pyarrow_schema(pa_schema: pa.Schema, partition_keys: Optional[List[str]
                 if field.name in pk_set:
                     field.type.nullable = False
 
-        # Check if Blob type exists in the schema
-        blob_names = [
-            field.name for field in fields
-            if 'blob' in str(field.type).lower()
-        ]
-
-        if blob_names:
-            if options is None:
-                options = {}
-
-            if len(fields) <= len(blob_names):
-                raise ValueError(
-                    "Table with BLOB type column must have other normal columns."
-                )
-
-            blob_field_names = {
-                field.name for field in fields if 'blob' in str(field.type).lower()
-            }
-            core_options = CoreOptions.from_dict(options)
-            descriptor_fields = core_options.blob_descriptor_fields()
-            view_fields = core_options.blob_view_fields()
-            unknown_inline_fields = descriptor_fields.union(view_fields).difference(blob_field_names)
-            if unknown_inline_fields:
-                raise ValueError(
-                    "Fields in 'blob-descriptor-field' or 'blob-view-field' must be blob fields "
-                    "in schema. Unknown fields: {}".format(sorted(unknown_inline_fields))
-                )
-
-            overlapping_inline_fields = descriptor_fields.intersection(view_fields)
-            if overlapping_inline_fields:
-                raise ValueError(
-                    "Fields in 'blob-descriptor-field' and 'blob-view-field' must not overlap. "
-                    "Overlapping fields: {}".format(sorted(overlapping_inline_fields))
-                )
-
-            required_options = {
-                CoreOptions.ROW_TRACKING_ENABLED.key(): 'true',
-                CoreOptions.DATA_EVOLUTION_ENABLED.key(): 'true'
-            }
-
-            missing_options = []
-            for key, expected_value in required_options.items():
-                if key not in options or options[key] != expected_value:
-                    missing_options.append(f"{key}='{expected_value}'")
-
-            if missing_options:
-                raise ValueError(
-                    f"Schema contains Blob type but is missing required options: {', '.join(missing_options)}. "
-                    f"Please add these options to the schema."
-                )
-
-            if primary_keys is not None:
-                raise ValueError("Blob type is not supported with primary key.")
+        # Validate Blob type fields in the schema
+        Schema._validate_blob_fields(fields, options, primary_keys)
 
         # Check if Vector type with dedicated file format
         vector_names = [
@@ -153,3 +102,61 @@ def from_pyarrow_schema(pa_schema: pa.Schema, partition_keys: Optional[List[str]
                 )
 
         return Schema(fields, partition_keys, primary_keys, options, comment)
+
+    @staticmethod
+    def _validate_blob_fields(fields, options, primary_keys):
+        """Validate blob field configurations in the schema."""
+        blob_names = [
+            field.name for field in fields
+            if 'blob' in str(field.type).lower()
+        ]
+
+        if not blob_names:
+            return
+
+        if options is None:
+            options = {}
+
+        if len(fields) <= len(blob_names):
+            raise ValueError(
+                "Table with BLOB type column must have other normal columns."
+            )
+
+        blob_field_names = {
+            field.name for field in fields if 'blob' in str(field.type).lower()
+        }
+        core_options = CoreOptions.from_dict(options)
+        descriptor_fields = core_options.blob_descriptor_fields()
+        view_fields = core_options.blob_view_fields()
+        unknown_inline_fields = descriptor_fields.union(view_fields).difference(blob_field_names)
+        if unknown_inline_fields:
+            raise ValueError(
+                "Fields in 'blob-descriptor-field' or 'blob-view-field' must be blob fields "
+                "in schema. Unknown fields: {}".format(sorted(unknown_inline_fields))
+            )
+
+        overlapping_inline_fields = descriptor_fields.intersection(view_fields)
+        if overlapping_inline_fields:
+            raise ValueError(
+                "Fields in 'blob-descriptor-field' and 'blob-view-field' must not overlap. "
+                "Overlapping fields: {}".format(sorted(overlapping_inline_fields))
+            )
+
+        required_options = {
+            CoreOptions.ROW_TRACKING_ENABLED.key(): 'true',
+            CoreOptions.DATA_EVOLUTION_ENABLED.key(): 'true'
+        }
+
+        missing_options = []
+        for key, expected_value in required_options.items():
+            if key not in options or options[key] != expected_value:
+                missing_options.append(f"{key}='{expected_value}'")
+
+        if missing_options:
+            raise ValueError(
+                f"Schema contains Blob type but is missing required options: {', '.join(missing_options)}. "
+                f"Please add these options to the schema."
+            )
+
+        if primary_keys is not None:
+            raise ValueError("Blob type is not supported with primary key.")
diff --git a/paimon-python/pypaimon/table/row/blob.py b/paimon-python/pypaimon/table/row/blob.py
index 129e5d75e15f..9770a0a228b0 100644
--- a/paimon-python/pypaimon/table/row/blob.py
+++ b/paimon-python/pypaimon/table/row/blob.py
@@ -412,21 +412,6 @@ def from_bytes(data: Optional[bytes], file_io=None, allow_blob_data: bool = True
     def from_view(view_struct: BlobViewStruct) -> 'Blob':
         return BlobView(view_struct)
 
-    @staticmethod
-    def from_bytes_with_reader(
-            data: bytes,
-            uri_reader: Optional[UriReader],
-            file_io=None,
-            allow_blob_data: bool = True) -> Optional['Blob']:
-        if data is None:
-            return None
-        if BlobViewStruct.is_blob_view_struct(data):
-            return Blob.from_view(BlobViewStruct.deserialize(data))
-        if BlobDescriptor.is_blob_descriptor(data) or not allow_blob_data:
-            descriptor = BlobDescriptor.deserialize(data)
-            return Blob.from_descriptor(uri_reader or UriReader.from_file(file_io), descriptor)
-        return Blob.from_data(data)
-
     @staticmethod
     def serialize_blob(blob: 'Blob') -> bytes:
         if isinstance(blob, BlobView):
diff --git a/paimon-python/pypaimon/tests/table_update_test.py b/paimon-python/pypaimon/tests/table_update_test.py
index 4108c85ac8de..57ae605703a4 100644
--- a/paimon-python/pypaimon/tests/table_update_test.py
+++ b/paimon-python/pypaimon/tests/table_update_test.py
@@ -503,7 +503,7 @@ def test_update_deleted_row_id_raises(self):
 
     def _run_concurrent_updates(self, table, thread_specs, max_retries):
         """Run a batch of concurrent updates with conflict-retry; return the
-        order in which worker threads observed successful commits."""
+        commit order (``thread_index`` of the winning commit appended last)."""
         errors = []
         completion_order = []
         lock = threading.Lock()
@@ -561,9 +561,12 @@ def test_concurrent_updates_overlapping_rows_last_writer_wins(self):
             {'row_ids': [0, 1, 2], 'ages': [102, 202, 302]},
             {'row_ids': [0, 1, 2], 'ages': [103, 203, 303]},
         ]
-        self._run_concurrent_updates(table, specs, max_retries=30)
+        completion_order = self._run_concurrent_updates(
+            table, specs, max_retries=30
+        )
+        winner = specs[completion_order[-1]]['ages']
         ages = self._read_all(table)['age'].to_pylist()
-        self.assertIn(ages[:3], [spec['ages'] for spec in specs])
+        self.assertEqual(winner, ages[:3])
         # Rows 3 & 4 must remain at seed values
         self.assertEqual([40, 45], ages[3:])
 
diff --git a/paimon-python/pypaimon/write/writer/dedicated_format_writer.py b/paimon-python/pypaimon/write/writer/dedicated_format_writer.py
index df96d9f94863..6e4b052d228a 100644
--- a/paimon-python/pypaimon/write/writer/dedicated_format_writer.py
+++ b/paimon-python/pypaimon/write/writer/dedicated_format_writer.py
@@ -69,20 +69,6 @@ def __init__(self, table, partition: Tuple, bucket: int, max_seq_number: int, op
                 f"Unknown fields: {sorted(unknown_descriptor_fields)}"
             )
 
-        unknown_view_fields = self.blob_view_fields.difference(set(self.blob_column_names))
-        if unknown_view_fields:
-            raise ValueError(
-                "Fields in 'blob-view-field' must be blob fields in schema. "
-                f"Unknown fields: {sorted(unknown_view_fields)}"
-            )
-
-        overlapping_inline_fields = self.blob_descriptor_fields.intersection(self.blob_view_fields)
-        if overlapping_inline_fields:
-            raise ValueError(
-                "Fields in 'blob-descriptor-field' and 'blob-view-field' must not overlap. "
-                f"Overlapping fields: {sorted(overlapping_inline_fields)}"
-            )
-
         # Blob fields that should still be written to `.blob` files.
         self.blob_file_column_names = [
             col for col in self.blob_column_names if col not in self.blob_inline_fields

From 528ad93c8396c265628d0c551fc403108f7b8c26 Mon Sep 17 00:00:00 2001
From: umi <zhaowenhai.zwh@alibaba-inc.com>
Date: Fri, 29 May 2026 14:30:51 +0800
Subject: [PATCH 06/34] prescan

# Conflicts:
#	paimon-python/pypaimon/read/split_read.py
---
 .../reader/blob_descriptor_convert_reader.py  | 146 +++++++-------
 paimon-python/pypaimon/read/split_read.py     |  43 ++++-
 .../pypaimon/utils/blob_view_lookup.py        | 179 +++++++++---------
 3 files changed, 202 insertions(+), 166 deletions(-)

diff --git a/paimon-python/pypaimon/read/reader/blob_descriptor_convert_reader.py b/paimon-python/pypaimon/read/reader/blob_descriptor_convert_reader.py
index 19afaffb0645..456ae35248d9 100644
--- a/paimon-python/pypaimon/read/reader/blob_descriptor_convert_reader.py
+++ b/paimon-python/pypaimon/read/reader/blob_descriptor_convert_reader.py
@@ -15,110 +15,109 @@
 # specific language governing permissions and limitations
 # under the License.
 
-from typing import Optional
+from typing import Callable, Optional, Set
 
+import pyarrow
 from pyarrow import RecordBatch
 
 from pypaimon.common.options.core_options import CoreOptions
 from pypaimon.read.reader.iface.record_batch_reader import RecordBatchReader
-from pypaimon.table.row.blob import Blob, BlobDescriptor, BlobViewStruct
+from pypaimon.table.row.blob import Blob, BlobViewStruct
 
 
 class BlobDescriptorConvertReader(RecordBatchReader):
     """Resolves BlobView and BlobDescriptor fields in record batches.
 
     Processing is split into two clear stages:
-      Stage 1 (BlobView resolution): If view fields exist, prescan all batches,
-               collect BlobViewStructs, bulk-preload their descriptors from
-               upstream tables, and replace view field values with the
-               corresponding BlobDescriptor serialized bytes.
+      Stage 1 (BlobView resolution): If view fields exist, use a lightweight
+               prescan reader (only projecting view columns) to collect
+               BlobViewStructs, bulk-preload their descriptors, then read
+               full data from the main reader and replace view field values
+               with the corresponding BlobDescriptor serialized bytes.
       Stage 2 (BlobData resolution): Controlled by blob-as-descriptor option.
                If false, resolve all BlobDescriptor bytes (from both descriptor
                fields and view fields) into real blob data bytes.
                If true, return as-is.
     """
 
-    def __init__(self, inner: RecordBatchReader, table):
+    def __init__(self, inner: RecordBatchReader, table,
+                 prescan_reader_factory: Optional[Callable[[Set[str]], RecordBatchReader]] = None):
+        """
+        Args:
+            inner: The main data reader (reads all columns).
+            table: The table instance.
+            prescan_reader_factory: Optional factory that creates a lightweight
+                reader projecting only the specified field names. Used for
+                prescan to collect BlobViewStructs without reading all columns.
+                Signature: (field_names: Set[str]) -> RecordBatchReader
+        """
         self._inner = inner
         self._table = table
+        self._prescan_reader_factory = prescan_reader_factory
         self._descriptor_fields = CoreOptions.blob_descriptor_fields(table.options)
         self.file_io = inner.file_io
         self.blob_field_indices = inner.blob_field_indices
         self._view_fields = CoreOptions.blob_view_fields(table.options)
         self._descriptor_fields = CoreOptions.blob_descriptor_fields(table.options)
         self._blob_as_descriptor = CoreOptions.blob_as_descriptor(table.options)
-        self._cached_batches = None
-        self._batch_index = 0
+        self._prescan_done = False
+        self._blob_view_lookup = None
 
     def read_arrow_batch(self) -> Optional[RecordBatch]:
-        import pyarrow
-        # Stage 1: obtain batch (prescan for view fields, or direct read)
-        if self._view_fields:
-            batch = self._read_with_prescan(pyarrow)
-        else:
-            batch = self._inner.read_arrow_batch()
+        # Ensure prescan is done before reading (only needed for view fields)
+        if self._view_fields and not self._prescan_done:
+            self._prescan_view_structs()
+
+        batch = self._inner.read_arrow_batch()
         if batch is None:
             return None
-        # Stage 2: resolve BlobDescriptor -> real bytes (if blob-as-descriptor=false)
-        return self._resolve_blob_data(batch, pyarrow)
+        # Resolve view fields using the preloaded lookup
+        if self._view_fields and self._blob_view_lookup is not None:
+            batch = self._resolve_view_fields(batch, self._blob_view_lookup)
+        # Resolve BlobDescriptor -> real bytes (if blob-as-descriptor=false)
+        return self._resolve_blob_data(batch)
 
     # ------------------------------------------------------------------
-    # Stage 1: BlobView prescan and resolution
+    # Stage 1: BlobView prescan (lightweight, only reads view columns)
     # ------------------------------------------------------------------
 
-    def _read_with_prescan(self, pyarrow):
-        """Return the next batch from cache (view fields already resolved to
-        BlobDescriptor bytes)."""
-        if self._cached_batches is None:
-            self._prescan_and_resolve_views(pyarrow)
-        if self._batch_index >= len(self._cached_batches):
-            return None
-        batch = self._cached_batches[self._batch_index]
-        self._batch_index += 1
-        return batch
-
-    def _prescan_and_resolve_views(self, pyarrow):
-        """Prescan all batches, collect BlobViewStructs, bulk-preload
-        descriptors, then replace view field values with BlobDescriptor bytes."""
+    def _prescan_view_structs(self):
+        """Use a lightweight prescan reader (projecting only view columns) to
+        collect all BlobViewStructs and bulk-preload their descriptors."""
         from pypaimon.table.row.blob import BlobViewStruct
         from pypaimon.utils.blob_view_lookup import BlobViewLookup
 
-        # Step 1: cache all batches and collect BlobViewStructs
-        raw_batches = []
+        self._prescan_done = True
         all_view_structs = []
-        while True:
-            batch = self._inner.read_arrow_batch()
-            if batch is None:
-                break
-            raw_batches.append(batch)
-            for field_name in self._view_fields:
-                if field_name not in batch.schema.names:
-                    continue
-                for value in batch.column(field_name).to_pylist():
-                    value = self._normalize_blob_to_bytes(value)
-                    if isinstance(value, bytes) and BlobViewStruct.is_blob_view_struct(value):
-                        all_view_structs.append(BlobViewStruct.deserialize(value))
 
-        # Step 2: bulk-preload BlobViewStruct -> BlobDescriptor mapping
-        blob_view_lookup = None
+        prescan_reader = self._prescan_reader_factory(self._view_fields)
+        try:
+            while True:
+                batch = prescan_reader.read_arrow_batch()
+                if batch is None:
+                    break
+                for field_name in self._view_fields:
+                    if field_name not in batch.schema.names:
+                        continue
+                    for value in batch.column(field_name).to_pylist():
+                        value = self._normalize_blob_to_bytes(value)
+                        if isinstance(value, bytes) and BlobViewStruct.is_blob_view_struct(value):
+                            all_view_structs.append(BlobViewStruct.deserialize(value))
+        finally:
+            prescan_reader.close()
+
+        # Bulk-preload BlobViewStruct -> BlobDescriptor mapping
         if all_view_structs:
-            blob_view_lookup = BlobViewLookup(self._table)
-            blob_view_lookup.preload(all_view_structs)
+            self._blob_view_lookup = BlobViewLookup(self._table)
+            self._blob_view_lookup.preload(all_view_structs)
 
-        # Step 3: resolve view fields in each batch
-        self._cached_batches = []
-        for batch in raw_batches:
-            batch = self._resolve_view_fields(batch, blob_view_lookup, pyarrow)
-            self._cached_batches.append(batch)
-
-    def _resolve_view_fields(self, batch, blob_view_lookup, pyarrow):
+    def _resolve_view_fields(self, batch, blob_view_lookup):
         """Replace BlobViewStruct bytes in view fields with the corresponding
         BlobDescriptor serialized bytes."""
-        result = batch
         for field_name in self._view_fields:
-            if field_name not in result.schema.names:
+            if field_name not in batch.schema.names:
                 continue
-            values = [self._normalize_blob_to_bytes(v) for v in result.column(field_name).to_pylist()]
+            values = [self._normalize_blob_to_bytes(v) for v in batch.column(field_name).to_pylist()]
             converted_values = []
             for value in values:
                 if value is None:
@@ -134,19 +133,19 @@ def _resolve_view_fields(self, batch, blob_view_lookup, pyarrow):
                 descriptor = blob_view_lookup.resolve_descriptor(view_struct)
                 converted_values.append(descriptor.serialize())
 
-            column_idx = result.schema.names.index(field_name)
-            result = result.set_column(
+            column_idx = batch.schema.names.index(field_name)
+            batch = batch.set_column(
                 column_idx,
                 pyarrow.field(field_name, pyarrow.large_binary(), nullable=True),
                 pyarrow.array(converted_values, type=pyarrow.large_binary()),
             )
-        return result
+        return batch
 
     # ------------------------------------------------------------------
     # Stage 2: BlobData resolution (unified exit)
     # ------------------------------------------------------------------
 
-    def _resolve_blob_data(self, batch, pyarrow):
+    def _resolve_blob_data(self, batch):
         if self._blob_as_descriptor:
             return batch
 
@@ -186,3 +185,22 @@ def _normalize_blob_to_bytes(value):
 
     def close(self):
         self._inner.close()
+
+
+class _CachedBatchReader(RecordBatchReader):
+    """A simple reader that replays pre-cached RecordBatches.
+    Used as fallback when no prescan_reader_factory is provided."""
+
+    def __init__(self, batches):
+        self._batches = batches
+        self._index = 0
+
+    def read_arrow_batch(self) -> Optional[RecordBatch]:
+        if self._index >= len(self._batches):
+            return None
+        batch = self._batches[self._index]
+        self._index += 1
+        return batch
+
+    def close(self):
+        self._batches = None
diff --git a/paimon-python/pypaimon/read/split_read.py b/paimon-python/pypaimon/read/split_read.py
index 0685b6c48979..9a37f0b40901 100644
--- a/paimon-python/pypaimon/read/split_read.py
+++ b/paimon-python/pypaimon/read/split_read.py
@@ -808,15 +808,27 @@ def _push_down_predicate(self) -> Optional[Predicate]:
         return None
 
     def create_reader(self) -> RecordReader:
+        reader = self._create_raw_reader()
+        if (CoreOptions.blob_view_fields(self.table.options)
+                or (not CoreOptions.blob_as_descriptor(self.table.options)
+                    and CoreOptions.blob_descriptor_fields(self.table.options))):
+            reader = BlobDescriptorConvertReader(
+                reader, self.table, prescan_reader_factory=lambda names: self._create_prescan_reader(names))
+        # Re-apply the row limit that create_reader enforced as its final step before this refactor.
+        if self.limit is not None:
+            reader = LimitedRecordBatchReader(reader, self.limit)
+        return reader
+
+    def _create_raw_reader(self) -> RecordReader:
+        """Core read logic: split_by_row_id -> suppliers -> ConcatBatchReader -> filter.
+        Does NOT include BlobView wrapping to avoid recursion during prescan."""
         files = self.split.files
         suppliers = []
 
-        # Split files by row ID
         split_by_row_id = self._split_by_row_id(files)
 
         for need_merge_files in split_by_row_id:
             if len(need_merge_files) == 1 or not self.read_fields:
-                # No need to merge fields, just create a single file reader
                 suppliers.append(
                     lambda f=need_merge_files[0]: self._create_file_reader(f, self._get_final_read_data_fields())
                 )
@@ -839,16 +851,27 @@ def create_reader(self) -> RecordReader:
         else:
             reader = merge_reader
 
-        if (CoreOptions.blob_view_fields(self.table.options)
-                or (not CoreOptions.blob_as_descriptor(self.table.options)
-                    and CoreOptions.blob_descriptor_fields(self.table.options))):
-            reader = BlobDescriptorConvertReader(reader, self.table)
-
-        if self.limit is not None:
-            reader = LimitedRecordBatchReader(reader, self.limit)
-
         return reader
 
+    def _create_prescan_reader(self, field_names):
+        """Create a prescan reader by constructing a new DataEvolutionSplitRead
+        instance that only projects the specified field names."""
+        from pypaimon.read.reader.iface.record_batch_reader import EmptyRecordBatchReader
+
+        prescan_fields = [f for f in self.read_fields if f.name in field_names]
+        if not prescan_fields:
+            return EmptyRecordBatchReader()
+
+        prescan_read = DataEvolutionSplitRead(
+            table=self.table,
+            predicate=self.predicate,
+            read_type=prescan_fields,
+            split=self.split,
+            row_tracking_enabled=False,
+        )
+        prescan_read.row_ranges = self.row_ranges
+        return prescan_read._create_raw_reader()
+
     def _split_by_row_id(self, files: List[DataFileMeta]) -> List[List[DataFileMeta]]:
         """Split files by firstRowId for data evolution."""
 
diff --git a/paimon-python/pypaimon/utils/blob_view_lookup.py b/paimon-python/pypaimon/utils/blob_view_lookup.py
index 9dbcadd653ea..bfe75de83567 100644
--- a/paimon-python/pypaimon/utils/blob_view_lookup.py
+++ b/paimon-python/pypaimon/utils/blob_view_lookup.py
@@ -26,32 +26,57 @@
 from pypaimon.schema.schema_manager import SchemaManager
 from pypaimon.table.row.blob import Blob, BlobDescriptor, BlobViewStruct
 from pypaimon.table.special_fields import SpecialFields
+from pypaimon.utils.range import Range
 
 _PRELOAD_THREAD_NUM = 100
 _MIN_ROWS_PER_TASK = 100
 
 
+class TableReferences:
+    """Groups BlobViewStruct references by upstream table."""
+
+    def __init__(self, identifier: Identifier):
+        self.identifier: Identifier = identifier
+        self.references_by_field: Dict[int, List[BlobViewStruct]] = {}
+        self.row_ids: List[int] = []
+
+    def add(self, view_struct: BlobViewStruct) -> None:
+        self.references_by_field.setdefault(view_struct.field_id, []).append(view_struct)
+        self.row_ids.append(int(view_struct.row_id))
+
+
+class TableReadPlan:
+    """A plan for reading blob descriptors from one upstream table."""
+
+    def __init__(self, identifier: Identifier, upstream_table,
+                 fields: List, row_ranges: List[Range]):
+        self.identifier: Identifier = identifier
+        self.upstream_table = upstream_table
+        self.fields: List = fields
+        self.row_ranges: List[Range] = row_ranges
+
+
 class BlobViewLookup:
     """Resolve BlobViewStruct references by reading upstream blob descriptors."""
 
     def __init__(self, table):
         self._table = table
-        self._table_cache = {}
+        self._table_cache: Dict[str, object] = {}
         self._uri_reader_cache: Dict[str, UriReader] = {}
         self._descriptor_cache: Dict[BlobViewStruct, BlobDescriptor] = {}
 
     def preload(self, view_structs: Iterable[BlobViewStruct]) -> None:
-        unique_structs = []
+        unique_structs: List[BlobViewStruct] = []
         for view_struct in view_structs:
             if view_struct not in self._descriptor_cache:
                 unique_structs.append(view_struct)
         if not unique_structs:
             return
-        resolved = self._preload_descriptors(unique_structs)
+        resolved: Dict[BlobViewStruct, BlobDescriptor] = self._preload_descriptors(unique_structs)
         self._descriptor_cache.update(resolved)
 
     def resolve_descriptor(self, view_struct: BlobViewStruct) -> BlobDescriptor:
-        descriptor = self._descriptor_cache.get(view_struct)
+        descriptor: BlobDescriptor = self._descriptor_cache.get(view_struct)
         if descriptor is None:
             self.preload([view_struct])
             descriptor = self._descriptor_cache.get(view_struct)
@@ -69,19 +94,19 @@ def resolve_data(self, view_struct: BlobViewStruct) -> bytes:
         return Blob.from_descriptor(uri_reader, descriptor).to_data()
 
     def _preload_descriptors(
-            self, view_structs: List[BlobViewStruct]) -> Dict[BlobViewStruct, BlobDescriptor]:
+        self, view_structs: List[BlobViewStruct]) -> Dict[BlobViewStruct, BlobDescriptor]:
         if not view_structs:
             return {}
 
-        grouped = self._group_by_table(view_structs)
-        plans = []
-        for identifier, table_refs in grouped.items():
-            plans.append(self._create_table_read_plan(identifier, table_refs))
+        grouped: Dict[str, TableReferences] = self._group_by_table(view_structs)
+        plans: List[TableReadPlan] = []
+        for table_refs in grouped.values():
+            plans.append(self._create_table_read_plan(table_refs))
 
-        target_rows = self._target_rows_per_task(plans)
-        tasks = []
+        target_rows: int = self._target_rows_per_task(plans)
+        tasks: List[Tuple[TableReadPlan, List[Tuple[int, int]]]] = []
         for plan in plans:
-            for range_chunk in self._split_row_ranges(plan["row_ranges"], target_rows):
+            for range_chunk in self._split_row_ranges(plan.row_ranges, target_rows):
                 tasks.append((plan, range_chunk))
 
         if len(tasks) <= 1:
@@ -104,48 +129,35 @@ def _preload_descriptors(
         return resolved
 
     def _group_by_table(
-            self, view_structs: List[BlobViewStruct]
-    ) -> Dict[str, Dict]:
-        grouped = {}
+        self, view_structs: List[BlobViewStruct]
+    ) -> Dict[str, TableReferences]:
+        grouped: Dict[str, TableReferences] = {}
         for view_struct in view_structs:
             key = view_struct.identifier.get_full_name()
             if key not in grouped:
-                grouped[key] = {
-                    "identifier": view_struct.identifier,
-                    "fields_by_id": {},
-                    "row_ids": [],
-                }
-            refs = grouped[key]
-            refs["fields_by_id"].setdefault(view_struct.field_id, []).append(view_struct)
-            refs["row_ids"].append(int(view_struct.row_id))
+                grouped[key] = TableReferences(view_struct.identifier)
+            grouped[key].add(view_struct)
         return grouped
 
-    def _create_table_read_plan(self, table_key: str, table_refs: Dict) -> Dict:
-        identifier = table_refs["identifier"]
-        upstream_table = self._load_table(identifier)
+    def _create_table_read_plan(self, table_refs: TableReferences) -> TableReadPlan:
+        upstream_table = self._load_table(table_refs.identifier)
 
-        fields = []
-        for field_id in table_refs["fields_by_id"]:
-            field = self._field_by_id(upstream_table, field_id)
-            fields.append({"field_id": field_id, "field": field})
+        fields: List = []
+        for field_id in table_refs.references_by_field:
+            fields.append(self._field_by_id(upstream_table, field_id))
 
-        row_ranges = self._to_sorted_distinct_ranges(table_refs["row_ids"])
-        return {
-            "identifier": identifier,
-            "upstream_table": upstream_table,
-            "fields": fields,
-            "row_ranges": row_ranges,
-        }
+        row_ranges: List[Tuple[int, int]] = self._to_sorted_distinct_ranges(table_refs.row_ids)
+        return TableReadPlan(table_refs.identifier, upstream_table, fields, row_ranges)
 
     def _load_descriptor_chunk(
-            self, plan: Dict, row_ranges: List[Tuple[int, int]]
+        self, plan: TableReadPlan, row_ranges: List[Range]
     ) -> Dict[BlobViewStruct, BlobDescriptor]:
-        identifier = plan["identifier"]
-        upstream_table = plan["upstream_table"]
-        fields = plan["fields"]
+        identifier: Identifier = plan.identifier
+        upstream_table = plan.upstream_table
+        fields: List = plan.fields
 
-        field_names = [f["field"].name for f in fields]
-        projection = field_names + [SpecialFields.ROW_ID.name]
+        field_names: List[str] = [f.name for f in fields]
+        projection: List[str] = field_names + [SpecialFields.ROW_ID.name]
 
         descriptor_table = upstream_table.copy({CoreOptions.BLOB_AS_DESCRIPTOR.key(): "true"})
         read_builder = descriptor_table.new_read_builder().with_projection(projection)
@@ -159,14 +171,14 @@ def _load_descriptor_chunk(
             )
 
         predicate_builder = read_builder.new_predicate_builder()
-        range_predicates = []
-        for range_from, range_to in row_ranges:
-            if range_from == range_to:
+        range_predicates: List = []
+        for r in row_ranges:
+            if r.from_ == r.to:
                 range_predicates.append(
-                    predicate_builder.equal(SpecialFields.ROW_ID.name, range_from))
+                    predicate_builder.equal(SpecialFields.ROW_ID.name, r.from_))
             else:
                 range_predicates.append(
-                    predicate_builder.between(SpecialFields.ROW_ID.name, range_from, range_to))
+                    predicate_builder.between(SpecialFields.ROW_ID.name, r.from_, r.to))
         if len(range_predicates) == 1:
             predicate = range_predicates[0]
         else:
@@ -180,62 +192,45 @@ def _load_descriptor_chunk(
                 .format(identifier.get_full_name())
             )
 
-        row_id_values = result.column(SpecialFields.ROW_ID.name).to_pylist()
-        resolved = {}
-        for field_info in fields:
-            field_id = field_info["field_id"]
-            field_name = field_info["field"].name
-            if field_name not in result.schema.names:
+        row_id_values: List = result.column(SpecialFields.ROW_ID.name).to_pylist()
+        resolved: Dict[BlobViewStruct, BlobDescriptor] = {}
+        for field in fields:
+            if field.name not in result.schema.names:
                 continue
-            values = result.column(field_name).to_pylist()
+            values = result.column(field.name).to_pylist()
             for row_id, value in zip(row_id_values, values):
                 if value is None:
                     continue
                 descriptor = self._to_descriptor(value)
                 view_struct = BlobViewStruct(
-                    identifier.get_full_name(), field_id, int(row_id))
+                    identifier.get_full_name(), field.id, int(row_id))
                 resolved[view_struct] = descriptor
         return resolved
 
     @staticmethod
-    def _to_sorted_distinct_ranges(row_ids: List[int]) -> List[Tuple[int, int]]:
-        if not row_ids:
-            return []
-        sorted_ids = sorted(set(row_ids))
-        ranges = []
-        range_start = sorted_ids[0]
-        range_end = range_start
-        for i in range(1, len(sorted_ids)):
-            row_id = sorted_ids[i]
-            if row_id == range_end + 1:
-                range_end = row_id
-            else:
-                ranges.append((range_start, range_end))
-                range_start = row_id
-                range_end = row_id
-        ranges.append((range_start, range_end))
-        return ranges
+    def _to_sorted_distinct_ranges(row_ids: List[int]) -> List[Range]:
+        return Range.to_ranges(row_ids)
 
     @staticmethod
     def _split_row_ranges(
-            row_ranges: List[Tuple[int, int]], target_rows_per_task: int
-    ) -> List[List[Tuple[int, int]]]:
+        row_ranges: List[Range], target_rows_per_task: int
+    ) -> List[List[Range]]:
         if not row_ranges:
             return []
 
-        chunks = []
-        current_chunk = []
-        current_chunk_rows = 0
-        for range_from, range_to in row_ranges:
-            next_from = range_from
-            while next_from <= range_to:
+        chunks: List[List[Range]] = []
+        current_chunk: List[Range] = []
+        current_chunk_rows: int = 0
+        for r in row_ranges:
+            next_from = r.from_
+            while next_from <= r.to:
                 if current_chunk_rows == target_rows_per_task:
                     chunks.append(current_chunk)
                     current_chunk = []
                     current_chunk_rows = 0
                 remaining = target_rows_per_task - current_chunk_rows
-                next_to = min(range_to, next_from + remaining - 1)
-                current_chunk.append((next_from, next_to))
+                next_to = min(r.to, next_from + remaining - 1)
+                current_chunk.append(Range(next_from, next_to))
                 current_chunk_rows += next_to - next_from + 1
                 next_from = next_to + 1
         if current_chunk:
@@ -243,18 +238,18 @@ def _split_row_ranges(
         return chunks
 
     @staticmethod
-    def _target_rows_per_task(plans: List[Dict]) -> int:
-        total_rows = 0
+    def _target_rows_per_task(plans: List[TableReadPlan]) -> int:
+        total_rows: int = 0
         for plan in plans:
-            for range_from, range_to in plan["row_ranges"]:
-                total_rows += range_to - range_from + 1
+            for r in plan.row_ranges:
+                total_rows += r.count()
         if total_rows <= 0:
             return _MIN_ROWS_PER_TASK
         target = (total_rows + _PRELOAD_THREAD_NUM - 1) // _PRELOAD_THREAD_NUM
         return max(_MIN_ROWS_PER_TASK, target)
 
     def _load_table(self, identifier: Identifier):
-        key = identifier.get_full_name()
+        key: str = identifier.get_full_name()
         if key in self._table_cache:
             return self._table_cache[key]
 
@@ -283,9 +278,9 @@ def _load_filesystem_table(self, identifier: Identifier):
         return FileStoreTable(self._table.file_io, identifier, table_path, table_schema)
 
     def _filesystem_table_path(self, identifier: Identifier) -> str:
-        current_table_path = self._table.table_path.rstrip("/")
-        current_db_path = os.path.dirname(current_table_path)
-        warehouse = os.path.dirname(current_db_path)
+        current_table_path: str = self._table.table_path.rstrip("/")
+        current_db_path: str = os.path.dirname(current_table_path)
+        warehouse: str = os.path.dirname(current_db_path)
         return "{}/{}.db/{}".format(
             warehouse.rstrip("/"),
             identifier.get_database_name(),
@@ -318,7 +313,7 @@ def _to_descriptor(self, value) -> BlobDescriptor:
         return BlobDescriptor.deserialize(value)
 
     def _get_or_create_uri_reader(self, table, descriptor: BlobDescriptor) -> UriReader:
-        cache_key = table.identifier.get_full_name()
+        cache_key: str = table.identifier.get_full_name()
         if cache_key in self._uri_reader_cache:
             return self._uri_reader_cache[cache_key]
         uri_reader_factory = getattr(table.file_io, "uri_reader_factory", None)

From 60ce34b992d9d67cd86632cb0c6d2498a3bb72e2 Mon Sep 17 00:00:00 2001
From: umi <zhaowenhai.zwh@alibaba-inc.com>
Date: Fri, 29 May 2026 14:33:19 +0800
Subject: [PATCH 07/34] [python] Fix signature indent in BlobViewLookup._preload_descriptors

---
 paimon-python/pypaimon/utils/blob_view_lookup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paimon-python/pypaimon/utils/blob_view_lookup.py b/paimon-python/pypaimon/utils/blob_view_lookup.py
index bfe75de83567..4e01413d1456 100644
--- a/paimon-python/pypaimon/utils/blob_view_lookup.py
+++ b/paimon-python/pypaimon/utils/blob_view_lookup.py
@@ -94,7 +94,7 @@ def resolve_data(self, view_struct: BlobViewStruct) -> bytes:
         return Blob.from_descriptor(uri_reader, descriptor).to_data()
 
     def _preload_descriptors(
-        self, view_structs: List[BlobViewStruct]) -> Dict[BlobViewStruct, BlobDescriptor]:
+            self, view_structs: List[BlobViewStruct]) -> Dict[BlobViewStruct, BlobDescriptor]:
         if not view_structs:
             return {}
 

From a441a33bc4dfac7b6ecbe3eabee60448369b3eba Mon Sep 17 00:00:00 2001
From: umi <zhaowenhai.zwh@alibaba-inc.com>
Date: Fri, 29 May 2026 15:27:04 +0800
Subject: [PATCH 08/34] [python] Remove Blob.from_view/serialize_blob and BlobView sniffing in from_bytes

---
 paimon-python/pypaimon/read/split_read.py |  5 +++--
 paimon-python/pypaimon/table/row/blob.py  | 16 ++--------------
 paimon-python/pypaimon/tests/blob_test.py |  6 ------
 3 files changed, 5 insertions(+), 22 deletions(-)

diff --git a/paimon-python/pypaimon/read/split_read.py b/paimon-python/pypaimon/read/split_read.py
index 9a37f0b40901..3cf3f2c3499b 100644
--- a/paimon-python/pypaimon/read/split_read.py
+++ b/paimon-python/pypaimon/read/split_read.py
@@ -820,15 +820,16 @@ def create_reader(self) -> RecordReader:
         return reader
 
     def _create_raw_reader(self) -> RecordReader:
-        """Core read logic: split_by_row_id -> suppliers -> ConcatBatchReader -> filter.
-        Does NOT include BlobView wrapping to avoid recursion during prescan."""
+        """Core read logic: split_by_row_id -> suppliers -> ConcatBatchReader -> filter."""
         files = self.split.files
         suppliers = []
 
+        # Split files by row ID
         split_by_row_id = self._split_by_row_id(files)
 
         for need_merge_files in split_by_row_id:
             if len(need_merge_files) == 1 or not self.read_fields:
+                # No need to merge fields, just create a single file reader
                 suppliers.append(
                     lambda f=need_merge_files[0]: self._create_file_reader(f, self._get_final_read_data_fields())
                 )
diff --git a/paimon-python/pypaimon/table/row/blob.py b/paimon-python/pypaimon/table/row/blob.py
index 9770a0a228b0..3f745c1575a9 100644
--- a/paimon-python/pypaimon/table/row/blob.py
+++ b/paimon-python/pypaimon/table/row/blob.py
@@ -393,8 +393,6 @@ def from_bytes(data: Optional[bytes], file_io=None, allow_blob_data: bool = True
         if not isinstance(data, (bytes, bytearray)):
             raise TypeError(f"Blob.from_bytes expects bytes, got {type(data)}")
         data = bytes(data)
-        if BlobViewStruct.is_blob_view_struct(data):
-            return Blob.from_view(BlobViewStruct.deserialize(data))
         is_descriptor = BlobDescriptor.is_blob_descriptor(data)
         if not allow_blob_data and not is_descriptor:
             raise ValueError(
@@ -408,16 +406,6 @@ def from_bytes(data: Optional[bytes], file_io=None, allow_blob_data: bool = True
             return BlobRef(uri_reader, descriptor)
         return BlobData(data)
 
-    @staticmethod
-    def from_view(view_struct: BlobViewStruct) -> 'Blob':
-        return BlobView(view_struct)
-
-    @staticmethod
-    def serialize_blob(blob: 'Blob') -> bytes:
-        if isinstance(blob, BlobView):
-            return blob.view_struct.serialize()
-        return blob.to_descriptor().serialize()
-
 
 class _PlaceholderBlob(Blob):
 
@@ -512,8 +500,8 @@ def __hash__(self) -> int:
 class BlobView(Blob):
 
     def __init__(self, view_struct: BlobViewStruct):
-        self._view_struct = view_struct
-        self._resolved_blob = None
+        self._view_struct: BlobViewStruct = view_struct
+        self._resolved_blob: Optional[BlobRef] = None
 
     @property
     def view_struct(self) -> BlobViewStruct:
diff --git a/paimon-python/pypaimon/tests/blob_test.py b/paimon-python/pypaimon/tests/blob_test.py
index 0fbb224f9f8a..e7f903984725 100644
--- a/paimon-python/pypaimon/tests/blob_test.py
+++ b/paimon-python/pypaimon/tests/blob_test.py
@@ -180,12 +180,6 @@ def test_blob_view_struct_roundtrip(self):
         self.assertEqual(restored.field_id, 7)
         self.assertEqual(restored.row_id, 42)
 
-        blob = Blob.from_bytes(serialized)
-        self.assertIsInstance(blob, BlobView)
-        self.assertEqual(Blob.serialize_blob(blob), serialized)
-        with self.assertRaises(RuntimeError):
-            blob.to_data()
-
     def test_blob_data_interface_compliance(self):
         """Test that BlobData properly implements Blob interface."""
         test_data = b"interface test data"

From 19c8a53865ab77e8b75738d9702b74db85efb673 Mon Sep 17 00:00:00 2001
From: umi <zhaowenhai.zwh@alibaba-inc.com>
Date: Fri, 29 May 2026 17:56:42 +0800
Subject: [PATCH 09/34] [python] Give filesystem tables a catalog loader and simplify BlobViewLookup

---
 .../pypaimon/catalog/filesystem_catalog.py    |   6 +-
 .../pypaimon/utils/blob_view_lookup.py        | 144 +++++-------------
 2 files changed, 44 insertions(+), 106 deletions(-)

diff --git a/paimon-python/pypaimon/catalog/filesystem_catalog.py b/paimon-python/pypaimon/catalog/filesystem_catalog.py
index 86e2f775e769..b7356b45955b 100644
--- a/paimon-python/pypaimon/catalog/filesystem_catalog.py
+++ b/paimon-python/pypaimon/catalog/filesystem_catalog.py
@@ -142,11 +142,11 @@ def _load_data_table(self, identifier: Identifier) -> FileStoreTable:
         table_schema = self.get_table_schema(identifier)
 
         # Create catalog environment for filesystem catalog
-        # Filesystem catalog doesn't support version management by default
+        from pypaimon.catalog.filesystem_catalog_loader import FileSystemCatalogLoader
         catalog_environment = CatalogEnvironment(
             identifier=identifier,
-            uuid=None,  # Filesystem catalog doesn't track table UUIDs
-            catalog_loader=None,  # No catalog loader for filesystem
+            uuid=None,
+            catalog_loader=FileSystemCatalogLoader(self.catalog_context),
             supports_version_management=False
         )
 
diff --git a/paimon-python/pypaimon/utils/blob_view_lookup.py b/paimon-python/pypaimon/utils/blob_view_lookup.py
index 4e01413d1456..94524ac4dc4b 100644
--- a/paimon-python/pypaimon/utils/blob_view_lookup.py
+++ b/paimon-python/pypaimon/utils/blob_view_lookup.py
@@ -16,14 +16,12 @@
 # limitations under the License.
 ################################################################################
 
-import os
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import Dict, Iterable, List, Tuple
+from typing import Dict, List, Tuple
 
 from pypaimon.common.identifier import Identifier
 from pypaimon.common.options.core_options import CoreOptions
 from pypaimon.common.uri_reader import UriReader
-from pypaimon.schema.schema_manager import SchemaManager
 from pypaimon.table.row.blob import Blob, BlobDescriptor, BlobViewStruct
 from pypaimon.table.special_fields import SpecialFields
 from pypaimon.utils.range import Range
@@ -61,42 +59,12 @@ class BlobViewLookup:
 
     def __init__(self, table):
         self._table = table
-        self._table_cache: Dict[str, object] = {}
         self._uri_reader_cache: Dict[str, UriReader] = {}
         self._descriptor_cache: Dict[BlobViewStruct, BlobDescriptor] = {}
 
-    def preload(self, view_structs: Iterable[BlobViewStruct]) -> None:
-        unique_structs: List[BlobViewStruct] = []
-        for view_struct in view_structs:
-            if view_struct not in self._descriptor_cache:
-                unique_structs.append(view_struct)
-        if not unique_structs:
-            return
-        resolved: Dict[BlobViewStruct, BlobDescriptor] = self._preload_descriptors(unique_structs)
-        self._descriptor_cache.update(resolved)
-
-    def resolve_descriptor(self, view_struct: BlobViewStruct) -> BlobDescriptor:
-        descriptor: BlobDescriptor = self._descriptor_cache.get(view_struct)
-        if descriptor is None:
-            self.preload([view_struct])
-            descriptor = self._descriptor_cache.get(view_struct)
-        if descriptor is None:
-            raise ValueError(
-                "Cannot resolve BlobViewStruct {} because row id {} was not found "
-                "in upstream table.".format(view_struct, view_struct.row_id)
-            )
-        return descriptor
-
-    def resolve_data(self, view_struct: BlobViewStruct) -> bytes:
-        descriptor = self.resolve_descriptor(view_struct)
-        upstream_table = self._load_table(view_struct.identifier)
-        uri_reader = self._get_or_create_uri_reader(upstream_table, descriptor)
-        return Blob.from_descriptor(uri_reader, descriptor).to_data()
-
-    def _preload_descriptors(
-            self, view_structs: List[BlobViewStruct]) -> Dict[BlobViewStruct, BlobDescriptor]:
+    def preload(self, view_structs: List[BlobViewStruct]):
         if not view_structs:
-            return {}
+            return
 
         grouped: Dict[str, TableReferences] = self._group_by_table(view_structs)
         plans: List[TableReadPlan] = []
@@ -104,18 +72,16 @@ def _preload_descriptors(
             plans.append(self._create_table_read_plan(table_refs))
 
         target_rows: int = self._target_rows_per_task(plans)
-        tasks: List[Tuple[TableReadPlan, List[Tuple[int, int]]]] = []
+        tasks: List[Tuple[TableReadPlan, List[Range]]] = []
         for plan in plans:
             for range_chunk in self._split_row_ranges(plan.row_ranges, target_rows):
                 tasks.append((plan, range_chunk))
 
         if len(tasks) <= 1:
-            resolved = {}
             for plan, range_chunk in tasks:
-                resolved.update(self._load_descriptor_chunk(plan, range_chunk))
-            return resolved
+                self._descriptor_cache.update(self._load_descriptor_chunk(plan, range_chunk))
+            return
 
-        resolved = {}
         with ThreadPoolExecutor(max_workers=min(_PRELOAD_THREAD_NUM, len(tasks))) as executor:
             futures = {
                 executor.submit(self._load_descriptor_chunk, plan, range_chunk): (plan, range_chunk)
@@ -123,13 +89,21 @@ def _preload_descriptors(
             }
             for future in as_completed(futures):
                 try:
-                    resolved.update(future.result())
+                    self._descriptor_cache.update(future.result())
                 except Exception as exc:
                     raise RuntimeError("Failed to preload blob descriptors.") from exc
-        return resolved
+
+    def resolve_descriptor(self, view_struct: BlobViewStruct) -> BlobDescriptor:
+        descriptor: BlobDescriptor = self._descriptor_cache.get(view_struct)
+        if descriptor is None:
+            raise ValueError(
+                "Cannot resolve BlobViewStruct {} because row id {} was not found "
+                "in upstream table.".format(view_struct, view_struct.row_id)
+            )
+        return descriptor
 
     def _group_by_table(
-        self, view_structs: List[BlobViewStruct]
+            self, view_structs: List[BlobViewStruct]
     ) -> Dict[str, TableReferences]:
         grouped: Dict[str, TableReferences] = {}
         for view_struct in view_structs:
@@ -146,11 +120,10 @@ def _create_table_read_plan(self, table_refs: TableReferences) -> TableReadPlan:
         for field_id in table_refs.references_by_field:
             fields.append(self._field_by_id(upstream_table, field_id))
 
-        row_ranges: List[Tuple[int, int]] = self._to_sorted_distinct_ranges(table_refs.row_ids)
-        return TableReadPlan(table_refs.identifier, upstream_table, fields, row_ranges)
+        return TableReadPlan(table_refs.identifier, upstream_table, fields, Range.to_ranges(table_refs.row_ids))
 
     def _load_descriptor_chunk(
-        self, plan: TableReadPlan, row_ranges: List[Range]
+            self, plan: TableReadPlan, row_ranges: List[Range]
     ) -> Dict[BlobViewStruct, BlobDescriptor]:
         identifier: Identifier = plan.identifier
         upstream_table = plan.upstream_table
@@ -207,34 +180,46 @@ def _load_descriptor_chunk(
                 resolved[view_struct] = descriptor
         return resolved
 
-    @staticmethod
-    def _to_sorted_distinct_ranges(row_ids: List[int]) -> List[Range]:
-        return Range.to_ranges(row_ids)
-
     @staticmethod
     def _split_row_ranges(
-        row_ranges: List[Range], target_rows_per_task: int
+            row_ranges: List[Range], target_rows_per_task: int
     ) -> List[List[Range]]:
+        """
+        Split row ranges into multiple chunks for parallel task processing.
+        """
         if not row_ranges:
             return []
 
         chunks: List[List[Range]] = []
         current_chunk: List[Range] = []
         current_chunk_rows: int = 0
+
         for r in row_ranges:
             next_from = r.from_
+            # Process current range until all rows are allocated
             while next_from <= r.to:
+                # If current chunk is full, save it and start a new one
                 if current_chunk_rows == target_rows_per_task:
                     chunks.append(current_chunk)
                     current_chunk = []
                     current_chunk_rows = 0
+
+                # Calculate remaining capacity in current chunk
                 remaining = target_rows_per_task - current_chunk_rows
+                # Determine the end position for this allocation (don't exceed range boundary)
                 next_to = min(r.to, next_from + remaining - 1)
+
+                # Add the allocated range to current chunk
                 current_chunk.append(Range(next_from, next_to))
                 current_chunk_rows += next_to - next_from + 1
+
+                # Move to next unallocated position
                 next_from = next_to + 1
+
+        # Don't forget the last chunk if it has any ranges
         if current_chunk:
             chunks.append(current_chunk)
+
         return chunks
 
     @staticmethod
@@ -245,50 +230,15 @@ def _target_rows_per_task(plans: List[TableReadPlan]) -> int:
                 total_rows += r.count()
         if total_rows <= 0:
             return _MIN_ROWS_PER_TASK
-        target = (total_rows + _PRELOAD_THREAD_NUM - 1) // _PRELOAD_THREAD_NUM
-        return max(_MIN_ROWS_PER_TASK, target)
-
-    def _load_table(self, identifier: Identifier):
-        key: str = identifier.get_full_name()
-        if key in self._table_cache:
-            return self._table_cache[key]
-
-        catalog_loader = self._table.catalog_environment.catalog_loader
-        if catalog_loader is not None:
-            catalog = catalog_loader.load()
-            table = catalog.get_table(identifier)
-        else:
-            table = self._load_filesystem_table(identifier)
-
-        self._table_cache[key] = table
-        return table
 
-    def _load_filesystem_table(self, identifier: Identifier):
-        from pypaimon.table.file_store_table import FileStoreTable
+        return max(_MIN_ROWS_PER_TASK, (total_rows + _PRELOAD_THREAD_NUM - 1) // _PRELOAD_THREAD_NUM)
 
-        table_path = self._filesystem_table_path(identifier)
-        schema_manager = SchemaManager(
-            self._table.file_io,
-            table_path,
-            branch=identifier.get_branch_name_or_default(),
-        )
-        table_schema = schema_manager.latest()
-        if table_schema is None:
-            raise ValueError("Cannot find upstream table at path: {}".format(table_path))
-        return FileStoreTable(self._table.file_io, identifier, table_path, table_schema)
-
-    def _filesystem_table_path(self, identifier: Identifier) -> str:
-        current_table_path: str = self._table.table_path.rstrip("/")
-        current_db_path: str = os.path.dirname(current_table_path)
-        warehouse: str = os.path.dirname(current_db_path)
-        return "{}/{}.db/{}".format(
-            warehouse.rstrip("/"),
-            identifier.get_database_name(),
-            identifier.get_table_name(),
-        )
+    def _load_table(self, identifier: Identifier):
+        catalog = self._table.catalog_environment.catalog_loader.load()
+        return catalog.get_table(identifier)
 
     @staticmethod
-    def _field_by_id(table, field_id: int):
+    def _field_by_id(table, field_id: int) -> 'DataField':
         for field in table.table_schema.fields:
             if field.id == field_id:
                 return field
@@ -311,15 +261,3 @@ def _to_descriptor(self, value) -> BlobDescriptor:
         if not BlobDescriptor.is_blob_descriptor(value):
             raise ValueError("Blob view upstream value is not a serialized BlobDescriptor.")
         return BlobDescriptor.deserialize(value)
-
-    def _get_or_create_uri_reader(self, table, descriptor: BlobDescriptor) -> UriReader:
-        cache_key: str = table.identifier.get_full_name()
-        if cache_key in self._uri_reader_cache:
-            return self._uri_reader_cache[cache_key]
-        uri_reader_factory = getattr(table.file_io, "uri_reader_factory", None)
-        if uri_reader_factory is not None:
-            uri_reader = uri_reader_factory.create(descriptor.uri)
-        else:
-            uri_reader = UriReader.from_file(table.file_io)
-        self._uri_reader_cache[cache_key] = uri_reader
-        return uri_reader

From 96eed812b19e05e3d4486fd056c2cb1fc7be9ef0 Mon Sep 17 00:00:00 2001
From: umi <zhaowenhai.zwh@alibaba-inc.com>
Date: Fri, 29 May 2026 17:57:57 +0800
Subject: [PATCH 10/34] fmt

---
 paimon-python/pypaimon/tests/blob_test.py        | 2 +-
 paimon-python/pypaimon/utils/blob_view_lookup.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/paimon-python/pypaimon/tests/blob_test.py b/paimon-python/pypaimon/tests/blob_test.py
index e7f903984725..0ad5bab74ac3 100644
--- a/paimon-python/pypaimon/tests/blob_test.py
+++ b/paimon-python/pypaimon/tests/blob_test.py
@@ -31,7 +31,7 @@
 from pypaimon.common.options import Options
 from pypaimon.read.reader.format_blob_reader import BlobRecordIterator, FormatBlobReader
 from pypaimon.schema.data_types import AtomicType, DataField
-from pypaimon.table.row.blob import Blob, BlobData, BlobRef, BlobDescriptor, BlobView, BlobViewStruct
+from pypaimon.table.row.blob import Blob, BlobData, BlobRef, BlobDescriptor, BlobViewStruct
 from pypaimon.table.row.generic_row import GenericRowDeserializer, GenericRowSerializer, GenericRow
 from pypaimon.table.row.row_kind import RowKind
 
diff --git a/paimon-python/pypaimon/utils/blob_view_lookup.py b/paimon-python/pypaimon/utils/blob_view_lookup.py
index 94524ac4dc4b..264652183d54 100644
--- a/paimon-python/pypaimon/utils/blob_view_lookup.py
+++ b/paimon-python/pypaimon/utils/blob_view_lookup.py
@@ -22,7 +22,7 @@
 from pypaimon.common.identifier import Identifier
 from pypaimon.common.options.core_options import CoreOptions
 from pypaimon.common.uri_reader import UriReader
-from pypaimon.table.row.blob import Blob, BlobDescriptor, BlobViewStruct
+from pypaimon.table.row.blob import BlobDescriptor, BlobViewStruct
 from pypaimon.table.special_fields import SpecialFields
 from pypaimon.utils.range import Range
 

From f8326b989092db9a6bff34016ed28cde0102020d Mon Sep 17 00:00:00 2001
From: umi <zhaowenhai.zwh@alibaba-inc.com>
Date: Mon, 1 Jun 2026 23:38:23 +0800
Subject: [PATCH 11/34] fix

---
 .../reader/blob_descriptor_convert_reader.py  | 26 +++----------
 paimon-python/pypaimon/read/split_read.py     |  4 +-
 .../pypaimon/table/special_fields.py          | 19 ++++++++++
 .../pypaimon/utils/blob_view_lookup.py        | 37 +++++++------------
 4 files changed, 40 insertions(+), 46 deletions(-)

diff --git a/paimon-python/pypaimon/read/reader/blob_descriptor_convert_reader.py b/paimon-python/pypaimon/read/reader/blob_descriptor_convert_reader.py
index 456ae35248d9..5f4e98841309 100644
--- a/paimon-python/pypaimon/read/reader/blob_descriptor_convert_reader.py
+++ b/paimon-python/pypaimon/read/reader/blob_descriptor_convert_reader.py
@@ -25,7 +25,7 @@
 from pypaimon.table.row.blob import Blob, BlobViewStruct
 
 
-class BlobDescriptorConvertReader(RecordBatchReader):
+class BlobInlineConvertReader(RecordBatchReader):
     """Resolves BlobView and BlobDescriptor fields in record batches.
 
     Processing is split into two clear stages:
@@ -103,6 +103,11 @@ def _prescan_view_structs(self):
                         value = self._normalize_blob_to_bytes(value)
                         if isinstance(value, bytes) and BlobViewStruct.is_blob_view_struct(value):
                             all_view_structs.append(BlobViewStruct.deserialize(value))
+                        else:
+                            raise ValueError(
+                                f"Expected BlobViewStruct bytes in view field '{field_name}', "
+                                f"but got non-BlobViewStruct bytes (length={len(value)})"
+                            )
         finally:
             prescan_reader.close()
 
@@ -185,22 +190,3 @@ def _normalize_blob_to_bytes(value):
 
     def close(self):
         self._inner.close()
-
-
-class _CachedBatchReader(RecordBatchReader):
-    """A simple reader that replays pre-cached RecordBatches.
-    Used as fallback when no prescan_reader_factory is provided."""
-
-    def __init__(self, batches):
-        self._batches = batches
-        self._index = 0
-
-    def read_arrow_batch(self) -> Optional[RecordBatch]:
-        if self._index >= len(self._batches):
-            return None
-        batch = self._batches[self._index]
-        self._index += 1
-        return batch
-
-    def close(self):
-        self._batches = None
diff --git a/paimon-python/pypaimon/read/split_read.py b/paimon-python/pypaimon/read/split_read.py
index 3cf3f2c3499b..f3a32dd8ad4c 100644
--- a/paimon-python/pypaimon/read/split_read.py
+++ b/paimon-python/pypaimon/read/split_read.py
@@ -40,7 +40,7 @@
 from pypaimon.read.reader.field_bunch import BlobBunch, DataBunch, FieldBunch, VectorBunch
 from pypaimon.read.reader.filter_record_reader import FilterRecordReader
 from pypaimon.read.reader.format_avro_reader import FormatAvroReader
-from pypaimon.read.reader.blob_descriptor_convert_reader import BlobDescriptorConvertReader
+from pypaimon.read.reader.blob_descriptor_convert_reader import BlobInlineConvertReader
 from pypaimon.read.reader.filter_record_batch_reader import FilterRecordBatchReader
 from pypaimon.read.reader.limited_record_reader import LimitedRecordBatchReader, LimitedRecordReader
 from pypaimon.read.reader.row_range_filter_record_reader import RowIdFilterRecordBatchReader
@@ -813,7 +813,7 @@ def create_reader(self) -> RecordReader:
         if (CoreOptions.blob_view_fields(self.table.options)
                 or (not CoreOptions.blob_as_descriptor(self.table.options)
                     and CoreOptions.blob_descriptor_fields(self.table.options))):
-            reader = BlobDescriptorConvertReader(
+            reader = BlobInlineConvertReader(
                 reader, self.table,
                 prescan_reader_factory=lambda names: self._create_prescan_reader(names))
 
diff --git a/paimon-python/pypaimon/table/special_fields.py b/paimon-python/pypaimon/table/special_fields.py
index 5c578ec85f07..64d2429bef7d 100644
--- a/paimon-python/pypaimon/table/special_fields.py
+++ b/paimon-python/pypaimon/table/special_fields.py
@@ -81,3 +81,22 @@ def row_type_with_row_tracking(table_fields: List[DataField],
             fields_with_row_tracking.append(SpecialFields.SEQUENCE_NUMBER)
 
         return fields_with_row_tracking
+
+    @staticmethod
+    def row_type_with_row_id(table_fields: List[DataField]) -> List[DataField]:
+        """Add ROW_ID field to the given fields list.
+
+        Args:
+            table_fields: The original table fields
+        """
+        fields_with_row_id = list(table_fields)
+
+        for field in fields_with_row_id:
+            if SpecialFields.ROW_ID.name == field.name:
+                raise ValueError(
+                    "Row tracking field name '{}' conflicts with existing field names."
+                    .format(field.name)
+                )
+
+        fields_with_row_id.append(SpecialFields.ROW_ID)
+        return fields_with_row_id
diff --git a/paimon-python/pypaimon/utils/blob_view_lookup.py b/paimon-python/pypaimon/utils/blob_view_lookup.py
index 264652183d54..fa792644ee8d 100644
--- a/paimon-python/pypaimon/utils/blob_view_lookup.py
+++ b/paimon-python/pypaimon/utils/blob_view_lookup.py
@@ -47,10 +47,10 @@ class TableReadPlan:
     """A plan for reading blob descriptors from one upstream table."""
 
     def __init__(self, identifier: Identifier, upstream_table,
-                 fields: List, row_ranges: List[Range]):
+                 read_fields: List, row_ranges: List[Range]):
         self.identifier: Identifier = identifier
         self.upstream_table = upstream_table
-        self.fields: List = fields
+        self.read_fields: List = read_fields
         self.row_ranges: List[Range] = row_ranges
 
 
@@ -120,20 +120,22 @@ def _create_table_read_plan(self, table_refs: TableReferences) -> TableReadPlan:
         for field_id in table_refs.references_by_field:
             fields.append(self._field_by_id(upstream_table, field_id))
 
-        return TableReadPlan(table_refs.identifier, upstream_table, fields, Range.to_ranges(table_refs.row_ids))
+        read_fields = SpecialFields.row_type_with_row_id(fields)
+        return TableReadPlan(
+            table_refs.identifier, upstream_table, read_fields,
+            Range.to_ranges(table_refs.row_ids))
 
     def _load_descriptor_chunk(
             self, plan: TableReadPlan, row_ranges: List[Range]
     ) -> Dict[BlobViewStruct, BlobDescriptor]:
         identifier: Identifier = plan.identifier
         upstream_table = plan.upstream_table
-        fields: List = plan.fields
+        read_fields = plan.read_fields
 
-        field_names: List[str] = [f.name for f in fields]
-        projection: List[str] = field_names + [SpecialFields.ROW_ID.name]
+        projection_field_names: List[str] = [f.name for f in read_fields]
 
         descriptor_table = upstream_table.copy({CoreOptions.BLOB_AS_DESCRIPTOR.key(): "true"})
-        read_builder = descriptor_table.new_read_builder().with_projection(projection)
+        read_builder = descriptor_table.new_read_builder().with_projection(projection_field_names)
 
         if SpecialFields.ROW_ID.name not in [
             data_field.name for data_field in read_builder.read_type()
@@ -167,14 +169,16 @@ def _load_descriptor_chunk(
 
         row_id_values: List = result.column(SpecialFields.ROW_ID.name).to_pylist()
         resolved: Dict[BlobViewStruct, BlobDescriptor] = {}
-        for field in fields:
+        for field in read_fields:
+            if field.name == SpecialFields.ROW_ID.name:
+                continue
             if field.name not in result.schema.names:
                 continue
             values = result.column(field.name).to_pylist()
             for row_id, value in zip(row_id_values, values):
                 if value is None:
                     continue
-                descriptor = self._to_descriptor(value)
+                descriptor = BlobDescriptor.deserialize(value)
                 view_struct = BlobViewStruct(
                     identifier.get_full_name(), field.id, int(row_id))
                 resolved[view_struct] = descriptor
@@ -246,18 +250,3 @@ def _field_by_id(table, field_id: int) -> 'DataField':
             "Cannot find blob fieldId {} in upstream table {}."
             .format(field_id, table.identifier.get_full_name())
         )
-
-    def _to_descriptor(self, value) -> BlobDescriptor:
-        if hasattr(value, "as_py"):
-            value = value.as_py()
-        if isinstance(value, str):
-            value = value.encode("utf-8")
-        if isinstance(value, bytearray):
-            value = bytes(value)
-        if not isinstance(value, bytes):
-            raise ValueError("Blob view upstream value must be serialized blob bytes.")
-        if BlobViewStruct.is_blob_view_struct(value):
-            return self.resolve_descriptor(BlobViewStruct.deserialize(value))
-        if not BlobDescriptor.is_blob_descriptor(value):
-            raise ValueError("Blob view upstream value is not a serialized BlobDescriptor.")
-        return BlobDescriptor.deserialize(value)

From 6feb4352ca4451e11fbedc7efca8d8647509a6c5 Mon Sep 17 00:00:00 2001
From: umi <zhaowenhai.zwh@alibaba-inc.com>
Date: Mon, 1 Jun 2026 23:50:35 +0800
Subject: [PATCH 12/34] rm

---
 paimon-python/pypaimon/common/options/core_options.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/paimon-python/pypaimon/common/options/core_options.py b/paimon-python/pypaimon/common/options/core_options.py
index f465ca7c98f8..fdd98fc419bb 100644
--- a/paimon-python/pypaimon/common/options/core_options.py
+++ b/paimon-python/pypaimon/common/options/core_options.py
@@ -735,12 +735,6 @@ def blob_view_fields(self, default=None):
         value = self.options.get(CoreOptions.BLOB_VIEW_FIELD, default)
         return CoreOptions._parse_field_set(value)
 
-    def blob_inline_fields(self, default=None):
-        fields = set()
-        fields.update(self.blob_descriptor_fields(default))
-        fields.update(self.blob_view_fields(default))
-        return fields
-
     @staticmethod
     def _parse_field_set(value):
         if value is None:

From ac8499f6f25c5fa18a93064ace1596b6f486f182 Mon Sep 17 00:00:00 2001
From: umi <zhaowenhai.zwh@alibaba-inc.com>
Date: Tue, 2 Jun 2026 00:19:35 +0800
Subject: [PATCH 13/34] fix

---
 .../pypaimon/read/reader/blob_descriptor_convert_reader.py   | 5 +++--
 paimon-python/pypaimon/utils/blob_view_lookup.py             | 1 -
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/paimon-python/pypaimon/read/reader/blob_descriptor_convert_reader.py b/paimon-python/pypaimon/read/reader/blob_descriptor_convert_reader.py
index 5f4e98841309..eb540de11b18 100644
--- a/paimon-python/pypaimon/read/reader/blob_descriptor_convert_reader.py
+++ b/paimon-python/pypaimon/read/reader/blob_descriptor_convert_reader.py
@@ -54,7 +54,6 @@ def __init__(self, inner: RecordBatchReader, table,
         self._inner = inner
         self._table = table
         self._prescan_reader_factory = prescan_reader_factory
-        self._descriptor_fields = CoreOptions.blob_descriptor_fields(table.options)
         self.file_io = inner.file_io
         self.blob_field_indices = inner.blob_field_indices
         self._view_fields = CoreOptions.blob_view_fields(table.options)
@@ -101,12 +100,14 @@ def _prescan_view_structs(self):
                         continue
                     for value in batch.column(field_name).to_pylist():
                         value = self._normalize_blob_to_bytes(value)
+                        if value is None:
+                            continue
                         if isinstance(value, bytes) and BlobViewStruct.is_blob_view_struct(value):
                             all_view_structs.append(BlobViewStruct.deserialize(value))
                         else:
                             raise ValueError(
                                 f"Expected BlobViewStruct bytes in view field '{field_name}', "
-                                f"but got non-BlobViewStruct bytes (length={len(value)})"
+                                f"but got non-BlobViewStruct bytes."
                             )
         finally:
             prescan_reader.close()
diff --git a/paimon-python/pypaimon/utils/blob_view_lookup.py b/paimon-python/pypaimon/utils/blob_view_lookup.py
index fa792644ee8d..19c8e7a1c67f 100644
--- a/paimon-python/pypaimon/utils/blob_view_lookup.py
+++ b/paimon-python/pypaimon/utils/blob_view_lookup.py
@@ -59,7 +59,6 @@ class BlobViewLookup:
 
     def __init__(self, table):
         self._table = table
-        self._uri_reader_cache: Dict[str, UriReader] = {}
         self._descriptor_cache: Dict[BlobViewStruct, BlobDescriptor] = {}
 
     def preload(self, view_structs: List[BlobViewStruct]):

From 7aa38360d048cd5ae1736e69e2bb624fca01c1f6 Mon Sep 17 00:00:00 2001
From: umi <zhaowenhai.zwh@alibaba-inc.com>
Date: Tue, 2 Jun 2026 00:40:32 +0800
Subject: [PATCH 14/34] fmt

---
 paimon-python/pypaimon/utils/blob_view_lookup.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/paimon-python/pypaimon/utils/blob_view_lookup.py b/paimon-python/pypaimon/utils/blob_view_lookup.py
index 19c8e7a1c67f..70b49dfa82f3 100644
--- a/paimon-python/pypaimon/utils/blob_view_lookup.py
+++ b/paimon-python/pypaimon/utils/blob_view_lookup.py
@@ -21,7 +21,6 @@
 
 from pypaimon.common.identifier import Identifier
 from pypaimon.common.options.core_options import CoreOptions
-from pypaimon.common.uri_reader import UriReader
 from pypaimon.table.row.blob import BlobDescriptor, BlobViewStruct
 from pypaimon.table.special_fields import SpecialFields
 from pypaimon.utils.range import Range

From 8ef2ff028c3c37d9ab83043d1bc1f41916f55d30 Mon Sep 17 00:00:00 2001
From: umi <zhaowenhai.zwh@alibaba-inc.com>
Date: Tue, 2 Jun 2026 11:03:44 +0800
Subject: [PATCH 15/34] fix

---
 .../tests/external_storage_blob_test.py         | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/paimon-python/pypaimon/tests/external_storage_blob_test.py b/paimon-python/pypaimon/tests/external_storage_blob_test.py
index 505e4407a9e6..3bbe5cc06b6b 100644
--- a/paimon-python/pypaimon/tests/external_storage_blob_test.py
+++ b/paimon-python/pypaimon/tests/external_storage_blob_test.py
@@ -85,16 +85,15 @@ def test_validation_field_not_blob_type(self):
             ('name', pa.string()),
             ('video', pa.large_binary()),
         ])
-        schema = Schema.from_pyarrow_schema(pa_schema, options={
-            'row-tracking.enabled': 'true',
-            'data-evolution.enabled': 'true',
-            'blob-descriptor-field': 'name,video',
-            'blob-external-storage-field': 'name',
-            'blob-external-storage-path': external_path,
-        })
         with self.assertRaises(ValueError) as ctx:
-            self.catalog.create_table('test_db.not_blob_type_test', schema, False)
-        self.assertIn('must be a BLOB type field', str(ctx.exception))
+            Schema.from_pyarrow_schema(pa_schema, options={
+                'row-tracking.enabled': 'true',
+                'data-evolution.enabled': 'true',
+                'blob-descriptor-field': 'name,video',
+                'blob-external-storage-field': 'name',
+                'blob-external-storage-path': external_path,
+            })
+        self.assertIn('must be blob fields', str(ctx.exception))
 
     def test_validation_blob_not_null_field_passes(self):
         """BLOB NOT NULL fields should pass validation (not be rejected by str comparison)."""

From 8fb75486cf8a4ee5c6fe1831274d92d2d6e2d9eb Mon Sep 17 00:00:00 2001
From: umi <zhaowenhai.zwh@alibaba-inc.com>
Date: Tue, 2 Jun 2026 11:17:53 +0800
Subject: [PATCH 16/34] fix

---
 paimon-python/pypaimon/table/file_store_table.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paimon-python/pypaimon/table/file_store_table.py b/paimon-python/pypaimon/table/file_store_table.py
index 9d418cbb26bb..129e5936d2d8 100644
--- a/paimon-python/pypaimon/table/file_store_table.py
+++ b/paimon-python/pypaimon/table/file_store_table.py
@@ -117,7 +117,7 @@ def branch_manager(self):
         """Get the branch manager for this table."""
         # If catalog environment has a catalog loader, use CatalogBranchManager
         catalog_loader = self.catalog_environment.catalog_loader
-        if catalog_loader is not None:
+        if catalog_loader is not None and self.catalog_environment.supports_version_management:
             from pypaimon.branch.catalog_branch_manager import CatalogBranchManager
             return CatalogBranchManager(
                 catalog_loader,

From 17ae8ce04adb847b0ae0d2777bea64688ca4775e Mon Sep 17 00:00:00 2001
From: umi <zhaowenhai.zwh@alibaba-inc.com>
Date: Tue, 2 Jun 2026 13:49:28 +0800
Subject: [PATCH 17/34] fix

---
 .../reader/blob_descriptor_convert_reader.py  |  2 +-
 .../pypaimon/utils/blob_view_lookup.py        | 33 ++++++++++---------
 .../write/writer/dedicated_format_writer.py   |  1 -
 3 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/paimon-python/pypaimon/read/reader/blob_descriptor_convert_reader.py b/paimon-python/pypaimon/read/reader/blob_descriptor_convert_reader.py
index eb540de11b18..fe930797ff09 100644
--- a/paimon-python/pypaimon/read/reader/blob_descriptor_convert_reader.py
+++ b/paimon-python/pypaimon/read/reader/blob_descriptor_convert_reader.py
@@ -86,7 +86,6 @@ def _prescan_view_structs(self):
         from pypaimon.table.row.blob import BlobViewStruct
         from pypaimon.utils.blob_view_lookup import BlobViewLookup
 
-        self._prescan_done = True
         all_view_structs = []
 
         prescan_reader = self._prescan_reader_factory(self._view_fields)
@@ -116,6 +115,7 @@ def _prescan_view_structs(self):
         if all_view_structs:
             self._blob_view_lookup = BlobViewLookup(self._table)
             self._blob_view_lookup.preload(all_view_structs)
+        self._prescan_done = True
 
     def _resolve_view_fields(self, batch, blob_view_lookup):
         """Replace BlobViewStruct bytes in view fields with the corresponding
diff --git a/paimon-python/pypaimon/utils/blob_view_lookup.py b/paimon-python/pypaimon/utils/blob_view_lookup.py
index 70b49dfa82f3..60d67628d3e8 100644
--- a/paimon-python/pypaimon/utils/blob_view_lookup.py
+++ b/paimon-python/pypaimon/utils/blob_view_lookup.py
@@ -1,20 +1,19 @@
-################################################################################
-#  Licensed to the Apache Software Foundation (ASF) under one
-#  or more contributor license agreements.  See the NOTICE file
-#  distributed with this work for additional information
-#  regarding copyright ownership.  The ASF licenses this file
-#  to you under the Apache License, Version 2.0 (the
-#  "License"); you may not use this file except in compliance
-#  with the License.  You may obtain a copy of the License at
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
 #
-#      http://www.apache.org/licenses/LICENSE-2.0
+#   http://www.apache.org/licenses/LICENSE-2.0
 #
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-# limitations under the License.
-################################################################################
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
 
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import Dict, List, Tuple
@@ -89,6 +88,10 @@ def preload(self, view_structs: List[BlobViewStruct]):
                 try:
                     self._descriptor_cache.update(future.result())
                 except Exception as exc:
+                    # Cancel remaining futures that have not started yet so a single
+                    # failure can abort the rest of the preload work as early as possible.
+                    for pending_future in futures:
+                        pending_future.cancel()
                     raise RuntimeError("Failed to preload blob descriptors.") from exc
 
     def resolve_descriptor(self, view_struct: BlobViewStruct) -> BlobDescriptor:
diff --git a/paimon-python/pypaimon/write/writer/dedicated_format_writer.py b/paimon-python/pypaimon/write/writer/dedicated_format_writer.py
index 6e4b052d228a..01216b36cd4d 100644
--- a/paimon-python/pypaimon/write/writer/dedicated_format_writer.py
+++ b/paimon-python/pypaimon/write/writer/dedicated_format_writer.py
@@ -98,7 +98,6 @@ def __init__(self, table, partition: Tuple, bucket: int, max_seq_number: int, op
                 col for col in write_cols if col not in dedicated_set
             ]
         else:
-            self.blob_file_column_names = list(self.blob_file_column_names)
             self.vector_write_columns = list(full_vector_column_names) if has_dedicated_vector else []
             self.normal_column_names = [
                 col for col in all_column_names if col not in dedicated_set

From 967a0c6b0a55e938d318b5a72e24d84c1e3d4bc0 Mon Sep 17 00:00:00 2001
From: umi <zhaowenhai.zwh@alibaba-inc.com>
Date: Tue, 2 Jun 2026 15:45:57 +0800
Subject: [PATCH 18/34] support blob-view.resolve.enabled and null blob value

---
 .../pypaimon/common/options/core_options.py   |  14 ++
 .../reader/blob_descriptor_convert_reader.py  |  16 +-
 .../pypaimon/tests/blob_table_test.py         | 154 ++++++++++++++++++
 .../pypaimon/utils/blob_view_lookup.py        |  39 ++++-
 4 files changed, 209 insertions(+), 14 deletions(-)

diff --git a/paimon-python/pypaimon/common/options/core_options.py b/paimon-python/pypaimon/common/options/core_options.py
index fdd98fc419bb..d9a6d2a245a6 100644
--- a/paimon-python/pypaimon/common/options/core_options.py
+++ b/paimon-python/pypaimon/common/options/core_options.py
@@ -272,6 +272,17 @@ class CoreOptions:
         .with_description("Comma-separated field names to treat as BLOB view fields.")
     )
 
+    BLOB_VIEW_RESOLVE_ENABLED: ConfigOption[bool] = (
+        ConfigOptions.key("blob-view.resolve.enabled")
+        .boolean_type()
+        .default_value(True)
+        .with_description(
+            "Whether to resolve blob-view-field values from upstream tables at "
+            "read time. Set to false to preserve BlobViewStruct references when "
+            "forwarding blob view values to another blob-view table."
+        )
+    )
+
     VECTOR_FIELD: ConfigOption[str] = (
         ConfigOptions.key("vector-field")
         .string_type()
@@ -735,6 +746,9 @@ def blob_view_fields(self, default=None):
         value = self.options.get(CoreOptions.BLOB_VIEW_FIELD, default)
         return CoreOptions._parse_field_set(value)
 
+    def blob_view_resolve_enabled(self, default=True):
+        return self.options.get(CoreOptions.BLOB_VIEW_RESOLVE_ENABLED, default)
+
     @staticmethod
     def _parse_field_set(value):
         if value is None:
diff --git a/paimon-python/pypaimon/read/reader/blob_descriptor_convert_reader.py b/paimon-python/pypaimon/read/reader/blob_descriptor_convert_reader.py
index fe930797ff09..1fcddc51e3a9 100644
--- a/paimon-python/pypaimon/read/reader/blob_descriptor_convert_reader.py
+++ b/paimon-python/pypaimon/read/reader/blob_descriptor_convert_reader.py
@@ -56,7 +56,10 @@ def __init__(self, inner: RecordBatchReader, table,
         self._prescan_reader_factory = prescan_reader_factory
         self.file_io = inner.file_io
         self.blob_field_indices = inner.blob_field_indices
-        self._view_fields = CoreOptions.blob_view_fields(table.options)
+        # Preserve original BlobViewStruct bytes when resolve disabled: skip both
+        # view resolution (Stage 1) and descriptor-to-data resolution (Stage 2).
+        resolve_enabled = CoreOptions.blob_view_resolve_enabled(table.options)
+        self._view_fields = CoreOptions.blob_view_fields(table.options) if resolve_enabled else set()
         self._descriptor_fields = CoreOptions.blob_descriptor_fields(table.options)
         self._blob_as_descriptor = CoreOptions.blob_as_descriptor(table.options)
         self._prescan_done = False
@@ -74,7 +77,7 @@ def read_arrow_batch(self) -> Optional[RecordBatch]:
         if self._view_fields and self._blob_view_lookup is not None:
             batch = self._resolve_view_fields(batch, self._blob_view_lookup)
         # Resolve BlobDescriptor -> real bytes (if blob-as-descriptor=false)
-        return self._resolve_blob_data(batch)
+        return self._resolve_descriptor_fields(batch)
 
     # ------------------------------------------------------------------
     # Stage 1: BlobView prescan (lightweight, only reads view columns)
@@ -136,8 +139,11 @@ def _resolve_view_fields(self, batch, blob_view_lookup):
                     converted_values.append(value)
                     continue
                 view_struct = BlobViewStruct.deserialize(value)
-                descriptor = blob_view_lookup.resolve_descriptor(view_struct)
-                converted_values.append(descriptor.serialize())
+                if blob_view_lookup.resolve_to_null(view_struct):
+                    converted_values.append(None)
+                else:
+                    descriptor = blob_view_lookup.resolve_descriptor(view_struct)
+                    converted_values.append(descriptor.serialize())
 
             column_idx = batch.schema.names.index(field_name)
             batch = batch.set_column(
@@ -151,7 +157,7 @@ def _resolve_view_fields(self, batch, blob_view_lookup):
     # Stage 2: BlobData resolution (unified exit)
     # ------------------------------------------------------------------
 
-    def _resolve_blob_data(self, batch):
+    def _resolve_descriptor_fields(self, batch):
         if self._blob_as_descriptor:
             return batch
 
diff --git a/paimon-python/pypaimon/tests/blob_table_test.py b/paimon-python/pypaimon/tests/blob_table_test.py
index c9a82e055651..42131dbc080c 100755
--- a/paimon-python/pypaimon/tests/blob_table_test.py
+++ b/paimon-python/pypaimon/tests/blob_table_test.py
@@ -1476,6 +1476,160 @@ def test_blob_view_fields_resolve_upstream_blob(self):
                 "Expected BlobDescriptor bytes when blob-as-descriptor=true"
             )
 
+    def test_blob_view_resolve_disabled_preserves_references(self):
+        from pypaimon import Schema
+        from pypaimon.common.options.core_options import CoreOptions
+        from pypaimon.table.row.blob import BlobViewStruct
+
+        source_schema = pa.schema([
+            ('id', pa.int32()),
+            ('picture', pa.large_binary()),
+        ])
+        source = Schema.from_pyarrow_schema(
+            source_schema,
+            options={
+                'row-tracking.enabled': 'true',
+                'data-evolution.enabled': 'true',
+            }
+        )
+        self.catalog.create_table('test_db.blob_view_resolve_source', source, False)
+        source_table = self.catalog.get_table('test_db.blob_view_resolve_source')
+        payloads = [b'resolve-source-0', b'resolve-source-1']
+
+        write_builder = source_table.new_batch_write_builder()
+        writer = write_builder.new_write()
+        writer.write_arrow(pa.Table.from_pydict({
+            'id': [1, 2],
+            'picture': payloads,
+        }, schema=source_schema))
+        commit_messages = writer.prepare_commit()
+        write_builder.new_commit().commit(commit_messages)
+        writer.close()
+
+        picture_field_id = next(
+            field.id for field in source_table.table_schema.fields if field.name == 'picture'
+        )
+        view_values = [
+            BlobViewStruct('test_db.blob_view_resolve_source', picture_field_id, 0).serialize(),
+            BlobViewStruct('test_db.blob_view_resolve_source', picture_field_id, 1).serialize(),
+        ]
+
+        target_schema = pa.schema([
+            ('id', pa.int32()),
+            ('picture', pa.large_binary()),
+        ])
+        target = Schema.from_pyarrow_schema(
+            target_schema,
+            options={
+                'row-tracking.enabled': 'true',
+                'data-evolution.enabled': 'true',
+                'blob-view-field': 'picture',
+            }
+        )
+        self.catalog.create_table('test_db.blob_view_resolve_target', target, False)
+        target_table = self.catalog.get_table('test_db.blob_view_resolve_target')
+
+        target_write_builder = target_table.new_batch_write_builder()
+        target_writer = target_write_builder.new_write()
+        target_writer.write_arrow(pa.Table.from_pydict({
+            'id': [10, 11],
+            'picture': view_values,
+        }, schema=target_schema))
+        target_commit_messages = target_writer.prepare_commit()
+        target_write_builder.new_commit().commit(target_commit_messages)
+        target_writer.close()
+
+        # Default (resolve enabled): view fields are resolved to real blob data.
+        resolved_result = target_table.new_read_builder().new_read().to_arrow(
+            target_table.new_read_builder().new_scan().plan().splits()
+        ).sort_by('id')
+        self.assertEqual(resolved_result.column('picture').to_pylist(), payloads)
+
+        # resolve disabled: view fields keep the original BlobViewStruct bytes.
+        preserve_table = target_table.copy(
+            {CoreOptions.BLOB_VIEW_RESOLVE_ENABLED.key(): 'false'}
+        )
+        preserve_result = preserve_table.new_read_builder().new_read().to_arrow(
+            preserve_table.new_read_builder().new_scan().plan().splits()
+        ).sort_by('id')
+        preserved_values = preserve_result.column('picture').to_pylist()
+        self.assertEqual(preserved_values, view_values)
+        for value in preserved_values:
+            self.assertTrue(
+                BlobViewStruct.is_blob_view_struct(value),
+                "Expected original BlobViewStruct bytes when resolve disabled"
+            )
+
+    def test_blob_view_resolves_null_upstream_value(self):
+        from pypaimon import Schema
+        from pypaimon.table.row.blob import BlobViewStruct
+
+        source_schema = pa.schema([
+            ('id', pa.int32()),
+            ('picture', pa.large_binary()),
+        ])
+        source = Schema.from_pyarrow_schema(
+            source_schema,
+            options={
+                'row-tracking.enabled': 'true',
+                'data-evolution.enabled': 'true',
+            }
+        )
+        self.catalog.create_table('test_db.blob_view_null_source', source, False)
+        source_table = self.catalog.get_table('test_db.blob_view_null_source')
+        # Row 0 has a real blob value, row 1 has a null blob value.
+        payloads = [b'null-source-0', None]
+
+        write_builder = source_table.new_batch_write_builder()
+        writer = write_builder.new_write()
+        writer.write_arrow(pa.Table.from_pydict({
+            'id': [1, 2],
+            'picture': payloads,
+        }, schema=source_schema))
+        commit_messages = writer.prepare_commit()
+        write_builder.new_commit().commit(commit_messages)
+        writer.close()
+
+        picture_field_id = next(
+            field.id for field in source_table.table_schema.fields if field.name == 'picture'
+        )
+        view_values = [
+            BlobViewStruct('test_db.blob_view_null_source', picture_field_id, 0).serialize(),
+            BlobViewStruct('test_db.blob_view_null_source', picture_field_id, 1).serialize(),
+        ]
+
+        target_schema = pa.schema([
+            ('id', pa.int32()),
+            ('picture', pa.large_binary()),
+        ])
+        target = Schema.from_pyarrow_schema(
+            target_schema,
+            options={
+                'row-tracking.enabled': 'true',
+                'data-evolution.enabled': 'true',
+                'blob-view-field': 'picture',
+            }
+        )
+        self.catalog.create_table('test_db.blob_view_null_target', target, False)
+        target_table = self.catalog.get_table('test_db.blob_view_null_target')
+
+        target_write_builder = target_table.new_batch_write_builder()
+        target_writer = target_write_builder.new_write()
+        target_writer.write_arrow(pa.Table.from_pydict({
+            'id': [10, 11],
+            'picture': view_values,
+        }, schema=target_schema))
+        target_commit_messages = target_writer.prepare_commit()
+        target_write_builder.new_commit().commit(target_commit_messages)
+        target_writer.close()
+
+        # View referencing a real upstream value resolves to data; view
+        # referencing a null upstream value resolves to None (not an error).
+        result = target_table.new_read_builder().new_read().to_arrow(
+            target_table.new_read_builder().new_scan().plan().splits()
+        ).sort_by('id')
+        self.assertEqual(result.column('picture').to_pylist(), [b'null-source-0', None])
+
     def test_blob_view_fields_rejects_non_view_input(self):
         from pypaimon import Schema
 
diff --git a/paimon-python/pypaimon/utils/blob_view_lookup.py b/paimon-python/pypaimon/utils/blob_view_lookup.py
index 60d67628d3e8..1acff6bcc8ff 100644
--- a/paimon-python/pypaimon/utils/blob_view_lookup.py
+++ b/paimon-python/pypaimon/utils/blob_view_lookup.py
@@ -16,7 +16,7 @@
 # under the License.
 
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import Dict, List, Tuple
+from typing import Dict, List, Tuple, Set
 
 from pypaimon.common.identifier import Identifier
 from pypaimon.common.options.core_options import CoreOptions
@@ -58,6 +58,7 @@ class BlobViewLookup:
     def __init__(self, table):
         self._table = table
         self._descriptor_cache: Dict[BlobViewStruct, BlobDescriptor] = {}
+        self._null_value_cache: Set[BlobViewStruct] = set()
 
     def preload(self, view_structs: List[BlobViewStruct]):
         if not view_structs:
@@ -76,7 +77,9 @@ def preload(self, view_structs: List[BlobViewStruct]):
 
         if len(tasks) <= 1:
             for plan, range_chunk in tasks:
-                self._descriptor_cache.update(self._load_descriptor_chunk(plan, range_chunk))
+                descriptors, null_values = self._load_descriptor_chunk(plan, range_chunk)
+                self._descriptor_cache.update(descriptors)
+                self._null_value_cache.update(null_values)
             return
 
         with ThreadPoolExecutor(max_workers=min(_PRELOAD_THREAD_NUM, len(tasks))) as executor:
@@ -86,7 +89,9 @@ def preload(self, view_structs: List[BlobViewStruct]):
             }
             for future in as_completed(futures):
                 try:
-                    self._descriptor_cache.update(future.result())
+                    descriptors, null_values = future.result()
+                    self._descriptor_cache.update(descriptors)
+                    self._null_value_cache.update(null_values)
                 except Exception as exc:
                     # Cancel remaining futures that have not started yet so a single
                     # failure can abort the rest of the preload work as early as possible.
@@ -97,12 +102,26 @@ def preload(self, view_structs: List[BlobViewStruct]):
     def resolve_descriptor(self, view_struct: BlobViewStruct) -> BlobDescriptor:
         descriptor: BlobDescriptor = self._descriptor_cache.get(view_struct)
         if descriptor is None:
+            if view_struct in self._null_value_cache:
+                raise ValueError(
+                    "BlobViewStruct {} resolves to a null blob value.".format(view_struct)
+                )
             raise ValueError(
                 "Cannot resolve BlobViewStruct {} because row id {} was not found "
                 "in upstream table.".format(view_struct, view_struct.row_id)
             )
         return descriptor
 
+    def resolve_to_null(self, view_struct: BlobViewStruct) -> bool:
+        if view_struct in self._null_value_cache:
+            return True
+        if view_struct not in self._descriptor_cache:
+            raise ValueError(
+                "Cannot resolve BlobViewStruct {} because row id {} was not found "
+                "in upstream table.".format(view_struct, view_struct.row_id)
+            )
+        return False
+
     def _group_by_table(
             self, view_structs: List[BlobViewStruct]
     ) -> Dict[str, TableReferences]:
@@ -127,8 +146,8 @@ def _create_table_read_plan(self, table_refs: TableReferences) -> TableReadPlan:
             Range.to_ranges(table_refs.row_ids))
 
     def _load_descriptor_chunk(
-            self, plan: TableReadPlan, row_ranges: List[Range]
-    ) -> Dict[BlobViewStruct, BlobDescriptor]:
+        self, plan: TableReadPlan, row_ranges: List[Range]
+    ) -> Tuple[Dict[BlobViewStruct, BlobDescriptor], set]:
         identifier: Identifier = plan.identifier
         upstream_table = plan.upstream_table
         read_fields = plan.read_fields
@@ -170,6 +189,7 @@ def _load_descriptor_chunk(
 
         row_id_values: List = result.column(SpecialFields.ROW_ID.name).to_pylist()
         resolved: Dict[BlobViewStruct, BlobDescriptor] = {}
+        null_values: set = set()
         for field in read_fields:
             if field.name == SpecialFields.ROW_ID.name:
                 continue
@@ -177,17 +197,18 @@ def _load_descriptor_chunk(
                 continue
             values = result.column(field.name).to_pylist()
             for row_id, value in zip(row_id_values, values):
+                view_struct = BlobViewStruct(
+                    identifier.get_full_name(), field.id, int(row_id))
                 if value is None:
+                    null_values.add(view_struct)
                     continue
                 descriptor = BlobDescriptor.deserialize(value)
-                view_struct = BlobViewStruct(
-                    identifier.get_full_name(), field.id, int(row_id))
                 resolved[view_struct] = descriptor
-        return resolved
+        return resolved, null_values
 
     @staticmethod
     def _split_row_ranges(
-            row_ranges: List[Range], target_rows_per_task: int
+        row_ranges: List[Range], target_rows_per_task: int
     ) -> List[List[Range]]:
         """
         Split row ranges into multiple chunks for parallel task processing.

From 1035cb1a3b26602d062b9e19f0edd3a469188d71 Mon Sep 17 00:00:00 2001
From: umi <zhaowenhai.zwh@alibaba-inc.com>
Date: Tue, 2 Jun 2026 16:39:04 +0800
Subject: [PATCH 19/34] remove redundant code

---
 .../read/reader/data_file_batch_reader.py     | 55 -------------------
 paimon-python/pypaimon/read/split_read.py     |  7 ---
 2 files changed, 62 deletions(-)

diff --git a/paimon-python/pypaimon/read/reader/data_file_batch_reader.py b/paimon-python/pypaimon/read/reader/data_file_batch_reader.py
index 33475b2c4c67..f0e8adae954b 100644
--- a/paimon-python/pypaimon/read/reader/data_file_batch_reader.py
+++ b/paimon-python/pypaimon/read/reader/data_file_batch_reader.py
@@ -25,7 +25,6 @@
 from pypaimon.read.reader.format_blob_reader import FormatBlobReader
 from pypaimon.read.reader.iface.record_batch_reader import RecordBatchReader
 from pypaimon.schema.data_types import DataField, PyarrowFieldParser
-from pypaimon.table.row.blob import Blob
 from pypaimon.table.special_fields import SpecialFields
 
 
@@ -40,8 +39,6 @@ def __init__(self, format_reader: RecordBatchReader, index_mapping: List[int], p
                  first_row_id: int,
                  row_tracking_enabled: bool,
                  system_fields: dict,
-                 blob_as_descriptor: bool = False,
-                 blob_descriptor_fields: Optional[set] = None,
                  file_io: Optional[FileIO] = None,
                  row_id_offsets: Optional[List[int]] = None):
         self.format_reader = format_reader
@@ -55,19 +52,7 @@ def __init__(self, format_reader: RecordBatchReader, index_mapping: List[int], p
         self._row_id_cursor = 0
         self.max_sequence_number = max_sequence_number
         self.system_fields = system_fields
-        self.blob_as_descriptor = blob_as_descriptor
-        self.blob_descriptor_fields = blob_descriptor_fields or set()
         self.file_io = file_io
-        self.blob_field_names = {
-            field.name
-            for field in fields
-            if hasattr(field.type, 'type') and field.type.type == 'BLOB'
-        }
-        self.descriptor_blob_fields = {
-            field_name
-            for field_name in self.blob_descriptor_fields
-            if field_name in self.blob_field_names
-        }
 
     def read_arrow_batch(self, start_idx=None, end_idx=None) -> Optional[RecordBatch]:
         if isinstance(self.format_reader, FormatBlobReader):
@@ -140,8 +125,6 @@ def read_arrow_batch(self, start_idx=None, end_idx=None) -> Optional[RecordBatch
         if self.row_tracking_enabled and self.system_fields:
             record_batch = self._assign_row_tracking(record_batch)
 
-        record_batch = self._convert_descriptor_stored_blob_columns(record_batch)
-
         return record_batch
 
     def _align_batch_to_read_schema(self, names: List[str], arrays: list) -> RecordBatch:
@@ -177,44 +160,6 @@ def _convert_descriptor_stored_blob_columns(self, record_batch: RecordBatch) ->
         if not self.descriptor_blob_fields:
             return record_batch
 
-        schema_names = set(record_batch.schema.names)
-        target_fields = [f for f in self.descriptor_blob_fields if f in schema_names]
-        if not target_fields:
-            return record_batch
-
-        arrays = list(record_batch.columns)
-        for field_name in target_fields:
-            field_idx = record_batch.schema.get_field_index(field_name)
-            values = record_batch.column(field_idx).to_pylist()
-
-            if self.blob_as_descriptor:
-                converted = [self._normalize_blob_cell(v) for v in values]
-            else:
-                converted = [self._blob_cell_to_data(v) for v in values]
-            arrays[field_idx] = pa.array(converted, type=pa.large_binary())
-
-        return pa.RecordBatch.from_arrays(arrays, schema=record_batch.schema)
-
-    @staticmethod
-    def _normalize_blob_cell(value):
-        if value is None:
-            return None
-        if hasattr(value, 'as_py'):
-            value = value.as_py()
-        if isinstance(value, str):
-            value = value.encode('utf-8')
-        if isinstance(value, bytearray):
-            value = bytes(value)
-        return value
-
-    def _blob_cell_to_data(self, value):
-        value = self._normalize_blob_cell(value)
-        if value is None:
-            return None
-        if not isinstance(value, bytes):
-            return value
-        return Blob.from_bytes(value, self.file_io).to_data()
-
     def _assign_row_tracking(self, record_batch: RecordBatch) -> RecordBatch:
         """Assign row tracking meta fields (_ROW_ID and _SEQUENCE_NUMBER)."""
         arrays = list(record_batch.columns)
diff --git a/paimon-python/pypaimon/read/split_read.py b/paimon-python/pypaimon/read/split_read.py
index f3a32dd8ad4c..74d763df0263 100644
--- a/paimon-python/pypaimon/read/split_read.py
+++ b/paimon-python/pypaimon/read/split_read.py
@@ -318,9 +318,6 @@ def file_reader_supplier(self, file: DataFileMeta, for_merge_read: bool,
         else:
             raise ValueError(f"Unexpected file format: {file_format}")
 
-        blob_as_descriptor = CoreOptions.blob_as_descriptor(self.table.options)
-        blob_descriptor_fields = CoreOptions.blob_descriptor_fields(self.table.options)
-
         index_mapping = self.create_index_mapping()
         partition_info = self._create_partition_info()
         system_fields = SpecialFields.find_system_fields(self.read_fields)
@@ -348,8 +345,6 @@ def file_reader_supplier(self, file: DataFileMeta, for_merge_read: bool,
                 effective_first_row_id,
                 row_tracking_enabled,
                 system_fields,
-                blob_as_descriptor=blob_as_descriptor,
-                blob_descriptor_fields=blob_descriptor_fields,
                 file_io=self.table.file_io,
                 row_id_offsets=row_indices)
         else:
@@ -363,8 +358,6 @@ def file_reader_supplier(self, file: DataFileMeta, for_merge_read: bool,
                 effective_first_row_id,
                 row_tracking_enabled,
                 system_fields,
-                blob_as_descriptor=blob_as_descriptor,
-                blob_descriptor_fields=blob_descriptor_fields,
                 file_io=self.table.file_io,
                 row_id_offsets=row_indices)
 

From 9fe7eb01aa488839c44f84683441553d68f83dd3 Mon Sep 17 00:00:00 2001
From: umi <zhaowenhai.zwh@alibaba-inc.com>
Date: Tue, 2 Jun 2026 17:03:56 +0800
Subject: [PATCH 20/34] [python] Skip blob view reader when blob view resolve is disabled

---
 paimon-python/pypaimon/read/split_read.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/paimon-python/pypaimon/read/split_read.py b/paimon-python/pypaimon/read/split_read.py
index 74d763df0263..36221866cc64 100644
--- a/paimon-python/pypaimon/read/split_read.py
+++ b/paimon-python/pypaimon/read/split_read.py
@@ -803,7 +803,8 @@ def _push_down_predicate(self) -> Optional[Predicate]:
     def create_reader(self) -> RecordReader:
         reader = self._create_raw_reader()
 
-        if (CoreOptions.blob_view_fields(self.table.options)
+        if ((CoreOptions.blob_view_fields(self.table.options) and CoreOptions.blob_view_resolve_enabled(
+                self.table.options))
                 or (not CoreOptions.blob_as_descriptor(self.table.options)
                     and CoreOptions.blob_descriptor_fields(self.table.options))):
             reader = BlobInlineConvertReader(

From 80af0256babbc58940df3195e997f8820f9fa256 Mon Sep 17 00:00:00 2001
From: umi <zhaowenhai.zwh@alibaba-inc.com>
Date: Tue, 2 Jun 2026 17:49:20 +0800
Subject: [PATCH 21/34] from bytes supports BlobView

---
 paimon-python/pypaimon/table/row/blob.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/paimon-python/pypaimon/table/row/blob.py b/paimon-python/pypaimon/table/row/blob.py
index 3f745c1575a9..eb2f00b76471 100644
--- a/paimon-python/pypaimon/table/row/blob.py
+++ b/paimon-python/pypaimon/table/row/blob.py
@@ -386,6 +386,10 @@ def from_file(file_io, file_path: str, offset: int, length: int) -> 'Blob':
     def from_descriptor(uri_reader: UriReader, descriptor: BlobDescriptor) -> 'Blob':
         return BlobRef(uri_reader, descriptor)
 
+    @staticmethod
+    def from_view(view_struct: BlobViewStruct) -> 'BlobView':
+        return BlobView(view_struct)
+
     @staticmethod
     def from_bytes(data: Optional[bytes], file_io=None, allow_blob_data: bool = True) -> Optional['Blob']:
         if data is None:
@@ -393,6 +397,8 @@ def from_bytes(data: Optional[bytes], file_io=None, allow_blob_data: bool = True
         if not isinstance(data, (bytes, bytearray)):
             raise TypeError(f"Blob.from_bytes expects bytes, got {type(data)}")
         data = bytes(data)
+        if BlobViewStruct.is_blob_view_struct(data):
+            return Blob.from_view(BlobViewStruct.deserialize(data))
         is_descriptor = BlobDescriptor.is_blob_descriptor(data)
         if not allow_blob_data and not is_descriptor:
             raise ValueError(

From 77e1a62f3bd49efdec62ca4c432390cf53201492 Mon Sep 17 00:00:00 2001
From: umi <zhaowenhai.zwh@alibaba-inc.com>
Date: Tue, 2 Jun 2026 17:57:19 +0800
Subject: [PATCH 22/34] [python] Test that Blob.from_bytes returns BlobView for view bytes

---
 paimon-python/pypaimon/tests/blob_test.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/paimon-python/pypaimon/tests/blob_test.py b/paimon-python/pypaimon/tests/blob_test.py
index 0ad5bab74ac3..37217f8b7cfe 100644
--- a/paimon-python/pypaimon/tests/blob_test.py
+++ b/paimon-python/pypaimon/tests/blob_test.py
@@ -31,7 +31,7 @@
 from pypaimon.common.options import Options
 from pypaimon.read.reader.format_blob_reader import BlobRecordIterator, FormatBlobReader
 from pypaimon.schema.data_types import AtomicType, DataField
-from pypaimon.table.row.blob import Blob, BlobData, BlobRef, BlobDescriptor, BlobViewStruct
+from pypaimon.table.row.blob import Blob, BlobData, BlobRef, BlobDescriptor, BlobViewStruct, BlobView
 from pypaimon.table.row.generic_row import GenericRowDeserializer, GenericRowSerializer, GenericRow
 from pypaimon.table.row.row_kind import RowKind
 
@@ -180,6 +180,11 @@ def test_blob_view_struct_roundtrip(self):
         self.assertEqual(restored.field_id, 7)
         self.assertEqual(restored.row_id, 42)
 
+        blob = Blob.from_bytes(view_struct.serialize())
+        self.assertIsInstance(blob, BlobView)
+        self.assertFalse(blob.is_resolved())
+        self.assertEqual(blob.view_struct, view_struct)
+
     def test_blob_data_interface_compliance(self):
         """Test that BlobData properly implements Blob interface."""
         test_data = b"interface test data"

From a594ad38e48eba9100db1167b2aed488bdb79490 Mon Sep 17 00:00:00 2001
From: umi <zhaowenhai.zwh@alibaba-inc.com>
Date: Wed, 3 Jun 2026 10:29:49 +0800
Subject: [PATCH 23/34] [python] Remove leftover blob conversion stubs after rebase

---
 .../pypaimon/read/reader/data_file_batch_reader.py         | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/paimon-python/pypaimon/read/reader/data_file_batch_reader.py b/paimon-python/pypaimon/read/reader/data_file_batch_reader.py
index f0e8adae954b..21d1b2a911df 100644
--- a/paimon-python/pypaimon/read/reader/data_file_batch_reader.py
+++ b/paimon-python/pypaimon/read/reader/data_file_batch_reader.py
@@ -153,13 +153,6 @@ def _align_batch_to_read_schema(self, names: List[str], arrays: list) -> RecordB
             out_fields.append(target_field)
         return pa.RecordBatch.from_arrays(out_arrays, schema=pa.schema(out_fields))
 
-    def _convert_inline_blob_columns(self, record_batch: RecordBatch) -> RecordBatch:
-    def _convert_descriptor_stored_blob_columns(self, record_batch: RecordBatch) -> RecordBatch:
-        if isinstance(self.format_reader, FormatBlobReader):
-            return record_batch
-        if not self.descriptor_blob_fields:
-            return record_batch
-
     def _assign_row_tracking(self, record_batch: RecordBatch) -> RecordBatch:
         """Assign row tracking meta fields (_ROW_ID and _SEQUENCE_NUMBER)."""
         arrays = list(record_batch.columns)

From b0bae12f4680fddbd9578f2942bdd2ef4d6caa5f Mon Sep 17 00:00:00 2001
From: umi <zhaowenhai.zwh@alibaba-inc.com>
Date: Wed, 3 Jun 2026 14:16:44 +0800
Subject: [PATCH 24/34] [python] Skip blob view prescan without catalog loader and relax blob schema checks

---
 .../read/reader/blob_descriptor_convert_reader.py   | 11 +++++++++--
 paimon-python/pypaimon/schema/schema.py             | 13 +------------
 2 files changed, 10 insertions(+), 14 deletions(-)

diff --git a/paimon-python/pypaimon/read/reader/blob_descriptor_convert_reader.py b/paimon-python/pypaimon/read/reader/blob_descriptor_convert_reader.py
index 1fcddc51e3a9..55db05100056 100644
--- a/paimon-python/pypaimon/read/reader/blob_descriptor_convert_reader.py
+++ b/paimon-python/pypaimon/read/reader/blob_descriptor_convert_reader.py
@@ -66,9 +66,16 @@ def __init__(self, inner: RecordBatchReader, table,
         self._blob_view_lookup = None
 
     def read_arrow_batch(self) -> Optional[RecordBatch]:
-        # Ensure prescan is done before reading (only needed for view fields)
+        # Align with Java: only resolve blob views when a catalog_loader is available.
+        # If catalog_loader is None, skip the prescan; no view lookup is built, so Stage 1 is a no-op.
+        # This matches Java's behavior in DataEvolutionTableRead.createReader, where the blob view reader
+        # is only created when catalogContext != null.
         if self._view_fields and not self._prescan_done:
-            self._prescan_view_structs()
+            if self._table.catalog_environment.catalog_loader is None:
+                # No catalog_loader available, skip view resolution
+                self._prescan_done = True
+            else:
+                self._prescan_view_structs()
 
         batch = self._inner.read_arrow_batch()
         if batch is None:
diff --git a/paimon-python/pypaimon/schema/schema.py b/paimon-python/pypaimon/schema/schema.py
index e758fc262512..a24c5c907707 100644
--- a/paimon-python/pypaimon/schema/schema.py
+++ b/paimon-python/pypaimon/schema/schema.py
@@ -106,26 +106,15 @@ def from_pyarrow_schema(pa_schema: pa.Schema, partition_keys: Optional[List[str]
     @staticmethod
     def _validate_blob_fields(fields, options, primary_keys):
         """Validate blob field configurations in the schema."""
-        blob_names = [
-            field.name for field in fields
-            if 'blob' in str(field.type).lower()
-        ]
-
-        if not blob_names:
-            return
 
         if options is None:
             options = {}
 
-        if len(fields) <= len(blob_names):
-            raise ValueError(
-                "Table with BLOB type column must have other normal columns."
-            )
-
         blob_field_names = {
             field.name for field in fields if 'blob' in str(field.type).lower()
         }
         core_options = CoreOptions.from_dict(options)
+
         descriptor_fields = core_options.blob_descriptor_fields()
         view_fields = core_options.blob_view_fields()
         unknown_inline_fields = descriptor_fields.union(view_fields).difference(blob_field_names)

From ea2509bd88d9d6f762c899c496eb1bb805ff69ff Mon Sep 17 00:00:00 2001
From: umi <zhaowenhai.zwh@alibaba-inc.com>
Date: Wed, 3 Jun 2026 14:39:41 +0800
Subject: [PATCH 25/34] [python] Apply BLOB table constraints only when BLOB fields exist

---
 paimon-python/pypaimon/schema/schema.py | 32 ++++++++++++-------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/paimon-python/pypaimon/schema/schema.py b/paimon-python/pypaimon/schema/schema.py
index a24c5c907707..f81ce5b80bf2 100644
--- a/paimon-python/pypaimon/schema/schema.py
+++ b/paimon-python/pypaimon/schema/schema.py
@@ -130,22 +130,22 @@ def _validate_blob_fields(fields, options, primary_keys):
                 "Fields in 'blob-descriptor-field' and 'blob-view-field' must not overlap. "
                 "Overlapping fields: {}".format(sorted(overlapping_inline_fields))
             )
+        if blob_field_names:
+            required_options = {
+                CoreOptions.ROW_TRACKING_ENABLED.key(): 'true',
+                CoreOptions.DATA_EVOLUTION_ENABLED.key(): 'true'
+            }
 
-        required_options = {
-            CoreOptions.ROW_TRACKING_ENABLED.key(): 'true',
-            CoreOptions.DATA_EVOLUTION_ENABLED.key(): 'true'
-        }
-
-        missing_options = []
-        for key, expected_value in required_options.items():
-            if key not in options or options[key] != expected_value:
-                missing_options.append(f"{key}='{expected_value}'")
+            missing_options = []
+            for key, expected_value in required_options.items():
+                if key not in options or options[key] != expected_value:
+                    missing_options.append(f"{key}='{expected_value}'")
 
-        if missing_options:
-            raise ValueError(
-                f"Schema contains Blob type but is missing required options: {', '.join(missing_options)}. "
-                f"Please add these options to the schema."
-            )
+            if missing_options:
+                raise ValueError(
+                    f"Schema contains Blob type but is missing required options: {', '.join(missing_options)}. "
+                    f"Please add these options to the schema."
+                )
 
-        if primary_keys is not None:
-            raise ValueError("Blob type is not supported with primary key.")
+            if primary_keys is not None:
+                raise ValueError("Blob type is not supported with primary key.")

From 179a0bc84a824d16a821429031b3952791b16b86 Mon Sep 17 00:00:00 2001
From: umi <zhaowenhai.zwh@alibaba-inc.com>
Date: Wed, 3 Jun 2026 14:49:47 +0800
Subject: [PATCH 26/34] [python] Restore check that BLOB tables have non-BLOB columns

---
 paimon-python/pypaimon/schema/schema.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/paimon-python/pypaimon/schema/schema.py b/paimon-python/pypaimon/schema/schema.py
index f81ce5b80bf2..8d3f267031ba 100644
--- a/paimon-python/pypaimon/schema/schema.py
+++ b/paimon-python/pypaimon/schema/schema.py
@@ -113,6 +113,12 @@ def _validate_blob_fields(fields, options, primary_keys):
         blob_field_names = {
             field.name for field in fields if 'blob' in str(field.type).lower()
         }
+
+        # Only tables that actually contain BLOB columns need a non-BLOB column;
+        # without this guard an empty, BLOB-free schema would be rejected here.
+        if blob_field_names and len(fields) <= len(blob_field_names):
+            raise ValueError("Table with BLOB type column must have other normal columns.")
+
         core_options = CoreOptions.from_dict(options)
 
         descriptor_fields = core_options.blob_descriptor_fields()

From 819950b5a056c5cc8c3790e14ac2b156d94d6365 Mon Sep 17 00:00:00 2001
From: umi <zhaowenhai.zwh@alibaba-inc.com>
Date: Wed, 3 Jun 2026 15:15:34 +0800
Subject: [PATCH 27/34] [python] Validate blob-field option against schema BLOB fields

---
 .../pypaimon/common/options/core_options.py   |  4 ++++
 paimon-python/pypaimon/schema/schema.py       | 23 ++++++++++++++++---
 2 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/paimon-python/pypaimon/common/options/core_options.py b/paimon-python/pypaimon/common/options/core_options.py
index d9a6d2a245a6..5ad7e3e201fc 100644
--- a/paimon-python/pypaimon/common/options/core_options.py
+++ b/paimon-python/pypaimon/common/options/core_options.py
@@ -746,6 +746,10 @@ def blob_view_fields(self, default=None):
         value = self.options.get(CoreOptions.BLOB_VIEW_FIELD, default)
         return CoreOptions._parse_field_set(value)
 
+    def blob_field(self, default=None):
+        value = self.options.get(CoreOptions.BLOB_FIELD, default)
+        return CoreOptions._parse_field_set(value)
+
     def blob_view_resolve_enabled(self, default=True):
         return self.options.get(CoreOptions.BLOB_VIEW_RESOLVE_ENABLED, default)
 
diff --git a/paimon-python/pypaimon/schema/schema.py b/paimon-python/pypaimon/schema/schema.py
index 8d3f267031ba..23d037ecbcc4 100644
--- a/paimon-python/pypaimon/schema/schema.py
+++ b/paimon-python/pypaimon/schema/schema.py
@@ -121,21 +121,38 @@ def _validate_blob_fields(fields, options, primary_keys):
 
         core_options = CoreOptions.from_dict(options)
 
+        # Validate blob-field configuration
+        configured_blob_fields = core_options.blob_field()
+        for field in configured_blob_fields:
+            if field not in blob_field_names:
+                raise ValueError(
+                    "Field '{}' in '{}' must be a BLOB field in table schema.".format(
+                        field, CoreOptions.BLOB_FIELD.key()
+                    )
+                )
+
+        # Validate blob-descriptor-field and blob-view-field configuration
         descriptor_fields = core_options.blob_descriptor_fields()
         view_fields = core_options.blob_view_fields()
-        unknown_inline_fields = descriptor_fields.union(view_fields).difference(blob_field_names)
-        if unknown_inline_fields:
+
+        # Check that configured fields are BLOB type
+        all_inline_fields = descriptor_fields.union(view_fields)
+        non_blob_inline_fields = all_inline_fields.difference(blob_field_names)
+        if non_blob_inline_fields:
             raise ValueError(
                 "Fields in 'blob-descriptor-field' or 'blob-view-field' must be blob fields "
-                "in schema. Unknown fields: {}".format(sorted(unknown_inline_fields))
+                "in schema. Non-BLOB fields: {}".format(sorted(non_blob_inline_fields))
             )
 
+        # Check for overlap between descriptor and view fields
         overlapping_inline_fields = descriptor_fields.intersection(view_fields)
         if overlapping_inline_fields:
             raise ValueError(
                 "Fields in 'blob-descriptor-field' and 'blob-view-field' must not overlap. "
                 "Overlapping fields: {}".format(sorted(overlapping_inline_fields))
             )
+
+        # Apply BLOB-specific table constraints only when BLOB fields exist
         if blob_field_names:
             required_options = {
                 CoreOptions.ROW_TRACKING_ENABLED.key(): 'true',

From fac4e15902450b852c340cd9426542d05c68b4a0 Mon Sep 17 00:00:00 2001
From: umi <zhaowenhai.zwh@alibaba-inc.com>
Date: Wed, 3 Jun 2026 17:11:05 +0800
Subject: [PATCH 28/34] [python] Disable blob view resolution when no catalog loader is available

---
 .../read/reader/blob_descriptor_convert_reader.py        | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/paimon-python/pypaimon/read/reader/blob_descriptor_convert_reader.py b/paimon-python/pypaimon/read/reader/blob_descriptor_convert_reader.py
index 55db05100056..12e975bae5ee 100644
--- a/paimon-python/pypaimon/read/reader/blob_descriptor_convert_reader.py
+++ b/paimon-python/pypaimon/read/reader/blob_descriptor_convert_reader.py
@@ -58,7 +58,8 @@ def __init__(self, inner: RecordBatchReader, table,
         self.blob_field_indices = inner.blob_field_indices
         # Preserve original BlobViewStruct bytes when resolve disabled: skip both
         # view resolution (Stage 1) and descriptor-to-data resolution (Stage 2).
-        resolve_enabled = CoreOptions.blob_view_resolve_enabled(table.options)
+        resolve_enabled = CoreOptions.blob_view_resolve_enabled(
+            table.options) and self._table.catalog_environment.catalog_loader is not None
         self._view_fields = CoreOptions.blob_view_fields(table.options) if resolve_enabled else set()
         self._descriptor_fields = CoreOptions.blob_descriptor_fields(table.options)
         self._blob_as_descriptor = CoreOptions.blob_as_descriptor(table.options)
@@ -71,11 +72,7 @@ def read_arrow_batch(self) -> Optional[RecordBatch]:
         # This matches Java's behavior in DataEvolutionTableRead.createReader where blob view reader
         # is only created when catalogContext != null
         if self._view_fields and not self._prescan_done:
-            if self._table.catalog_environment.catalog_loader is None:
-                # No catalog_loader available, skip view resolution
-                self._prescan_done = True
-            else:
-                self._prescan_view_structs()
+            self._prescan_view_structs()
 
         batch = self._inner.read_arrow_batch()
         if batch is None:

From 48c13ffac51e6af84cd644b2a6fba7b64f21db31 Mon Sep 17 00:00:00 2001
From: umi <zhaowenhai.zwh@alibaba-inc.com>
Date: Wed, 3 Jun 2026 20:22:04 +0800
Subject: [PATCH 29/34] [python] Move blob field validation from Schema to schema manager create_table

---
 paimon-python/pypaimon/schema/schema.py       | 73 ---------------
 .../pypaimon/schema/schema_manager.py         | 89 ++++++++++++++++---
 .../pypaimon/tests/blob_table_test.py         |  8 +-
 3 files changed, 83 insertions(+), 87 deletions(-)

diff --git a/paimon-python/pypaimon/schema/schema.py b/paimon-python/pypaimon/schema/schema.py
index 23d037ecbcc4..f3a63c88e14c 100644
--- a/paimon-python/pypaimon/schema/schema.py
+++ b/paimon-python/pypaimon/schema/schema.py
@@ -62,9 +62,6 @@ def from_pyarrow_schema(pa_schema: pa.Schema, partition_keys: Optional[List[str]
                 if field.name in pk_set:
                     field.type.nullable = False
 
-        # Validate Blob type fields in the schema
-        Schema._validate_blob_fields(fields, options, primary_keys)
-
         # Check if Vector type with dedicated file format
         vector_names = [
             field.name for field in fields
@@ -102,73 +99,3 @@ def from_pyarrow_schema(pa_schema: pa.Schema, partition_keys: Optional[List[str]
                 )
 
         return Schema(fields, partition_keys, primary_keys, options, comment)
-
-    @staticmethod
-    def _validate_blob_fields(fields, options, primary_keys):
-        """Validate blob field configurations in the schema."""
-
-        if options is None:
-            options = {}
-
-        blob_field_names = {
-            field.name for field in fields if 'blob' in str(field.type).lower()
-        }
-
-        if len(fields) <= len(blob_field_names):
-            raise ValueError(
-                "Table with BLOB type column must have other normal columns."
-            )
-
-        core_options = CoreOptions.from_dict(options)
-
-        # Validate blob-field configuration
-        configured_blob_fields = core_options.blob_field()
-        for field in configured_blob_fields:
-            if field not in blob_field_names:
-                raise ValueError(
-                    "Field '{}' in '{}' must be a BLOB field in table schema.".format(
-                        field, CoreOptions.BLOB_FIELD.key()
-                    )
-                )
-
-        # Validate blob-descriptor-field and blob-view-field configuration
-        descriptor_fields = core_options.blob_descriptor_fields()
-        view_fields = core_options.blob_view_fields()
-
-        # Check that configured fields are BLOB type
-        all_inline_fields = descriptor_fields.union(view_fields)
-        non_blob_inline_fields = all_inline_fields.difference(blob_field_names)
-        if non_blob_inline_fields:
-            raise ValueError(
-                "Fields in 'blob-descriptor-field' or 'blob-view-field' must be blob fields "
-                "in schema. Non-BLOB fields: {}".format(sorted(non_blob_inline_fields))
-            )
-
-        # Check for overlap between descriptor and view fields
-        overlapping_inline_fields = descriptor_fields.intersection(view_fields)
-        if overlapping_inline_fields:
-            raise ValueError(
-                "Fields in 'blob-descriptor-field' and 'blob-view-field' must not overlap. "
-                "Overlapping fields: {}".format(sorted(overlapping_inline_fields))
-            )
-
-        # Apply BLOB-specific table constraints only when BLOB fields exist
-        if blob_field_names:
-            required_options = {
-                CoreOptions.ROW_TRACKING_ENABLED.key(): 'true',
-                CoreOptions.DATA_EVOLUTION_ENABLED.key(): 'true'
-            }
-
-            missing_options = []
-            for key, expected_value in required_options.items():
-                if key not in options or options[key] != expected_value:
-                    missing_options.append(f"{key}='{expected_value}'")
-
-            if missing_options:
-                raise ValueError(
-                    f"Schema contains Blob type but is missing required options: {', '.join(missing_options)}. "
-                    f"Please add these options to the schema."
-                )
-
-            if primary_keys is not None:
-                raise ValueError("Blob type is not supported with primary key.")
diff --git a/paimon-python/pypaimon/schema/schema_manager.py b/paimon-python/pypaimon/schema/schema_manager.py
index 645d2f4328c0..ceaf183aa4f0 100644
--- a/paimon-python/pypaimon/schema/schema_manager.py
+++ b/paimon-python/pypaimon/schema/schema_manager.py
@@ -166,6 +166,70 @@ def _assert_not_renaming_blob_column(
             )
 
 
+def _validate_blob_fields(fields: List[DataField], options: dict, primary_keys: List[str]):
+    """Validate the BLOB-related columns and options of a table schema.
+
+    Args:
+        fields: all columns of the schema; a column counts as BLOB when the
+            lowercased string form of its type contains 'blob'.
+        options: raw table options; ``None`` is treated as no options.
+        primary_keys: primary key column names; must be empty for BLOB tables.
+
+    Raises:
+        ValueError: if the schema consists only of BLOB columns, a blob option
+            names a non-BLOB column, the descriptor and view field sets overlap,
+            the row-tracking / data-evolution options are not 'true', or primary keys are set.
+    """
+    if options is None:
+        options = {}
+
+    blob_field_names = {field.name for field in fields if 'blob' in str(field.type).lower()}
+
+    # Guard on BLOB presence: an empty, BLOB-free schema must not trip the BLOB-only check.
+    if blob_field_names and len(fields) <= len(blob_field_names):
+        raise ValueError("Table with BLOB type column must have other normal columns.")
+
+    core_options = CoreOptions(Options(options))
+
+    for field in core_options.blob_field():
+        if field not in blob_field_names:
+            raise ValueError("Field '{}' in '{}' must be a BLOB field in table schema.".format(
+                field, CoreOptions.BLOB_FIELD.key()))
+
+    descriptor_fields = core_options.blob_descriptor_fields()
+    view_fields = core_options.blob_view_fields()
+
+    non_blob_inline_fields = descriptor_fields.union(view_fields).difference(blob_field_names)
+    if non_blob_inline_fields:
+        raise ValueError(
+            "Fields in 'blob-descriptor-field' or 'blob-view-field' must be blob fields "
+            "in schema. Non-BLOB fields: {}".format(sorted(non_blob_inline_fields))
+        )
+
+    overlapping_inline_fields = descriptor_fields.intersection(view_fields)
+    if overlapping_inline_fields:
+        raise ValueError(
+            "Fields in 'blob-descriptor-field' and 'blob-view-field' must not overlap. "
+            "Overlapping fields: {}".format(sorted(overlapping_inline_fields))
+        )
+
+    # Table-level constraints apply only when the schema actually has BLOB columns.
+    if blob_field_names:
+        required_options = {
+            CoreOptions.ROW_TRACKING_ENABLED.key(): 'true',
+            CoreOptions.DATA_EVOLUTION_ENABLED.key(): 'true'
+        }
+        missing_options = [f"{key}='{expected_value}'" for key, expected_value in required_options.items()
+                           if key not in options or options[key] != expected_value]
+        if missing_options:
+            raise ValueError(
+                f"Schema contains Blob type but is missing required options: {', '.join(missing_options)}. "
+                f"Please add these options to the schema."
+            )
+        if primary_keys:
+            raise ValueError("Blob type is not supported with primary key.")
+
+
 def _validate_blob_external_storage_fields(fields: List[DataField], options: dict):
     """Validate blob-external-storage-field configuration.
 
@@ -255,12 +319,12 @@ def _apply_move(fields: List[DataField], new_field: Optional[DataField], move):
 
 
 def _handle_add_column(
-    change: AddColumn,
-    new_fields: List[DataField],
-    highest_field_id: AtomicInteger,
-    partition_keys: List[str],
-    add_column_before_partition: bool,
-    new_options: dict
+        change: AddColumn,
+        new_fields: List[DataField],
+        highest_field_id: AtomicInteger,
+        partition_keys: List[str],
+        add_column_before_partition: bool,
+        new_options: dict
 ):
     if not change.data_type.nullable:
         raise ValueError(
@@ -287,9 +351,9 @@ def _handle_add_column(
     if change.move:
         _apply_move(new_fields, new_field, change.move)
     elif (
-        add_column_before_partition
-        and partition_keys
-        and len(change.field_names) == 1
+            add_column_before_partition
+            and partition_keys
+            and len(change.field_names) == 1
     ):
         insert_index = len(new_fields)
         for i, field in enumerate(new_fields):
@@ -364,6 +428,7 @@ def create_table(self, schema: Schema) -> TableSchema:
                 comment=schema.comment,
             )
 
+            _validate_blob_fields(schema.fields, schema.options, schema.primary_keys)
             _validate_blob_external_storage_fields(schema.fields, schema.options)
             table_schema = TableSchema.from_schema(schema_id=0, schema=schema)
             success = self.commit(table_schema)
@@ -429,7 +494,7 @@ def commit_changes(self, changes: List[SchemaChange]) -> TableSchema:
                 raise RuntimeError(f"Failed to commit schema changes: {e}") from e
 
     def _generate_table_schema(
-        self, old_table_schema: TableSchema, changes: List[SchemaChange]
+            self, old_table_schema: TableSchema, changes: List[SchemaChange]
     ) -> TableSchema:
         new_options = dict(old_table_schema.options)
         new_fields = []
@@ -521,13 +586,13 @@ def _generate_table_schema(
 
     @staticmethod
     def _apply_not_nested_column_rename(
-        columns: List[str], rename_mappings: dict
+            columns: List[str], rename_mappings: dict
     ) -> List[str]:
         return [rename_mappings.get(col, col) for col in columns]
 
     @staticmethod
     def _apply_rename_columns_to_options(
-        options: dict, rename_mappings: dict
+            options: dict, rename_mappings: dict
     ) -> dict:
         if not rename_mappings:
             return options
diff --git a/paimon-python/pypaimon/tests/blob_table_test.py b/paimon-python/pypaimon/tests/blob_table_test.py
index 42131dbc080c..87199dfeccdf 100755
--- a/paimon-python/pypaimon/tests/blob_table_test.py
+++ b/paimon-python/pypaimon/tests/blob_table_test.py
@@ -1676,14 +1676,18 @@ def test_blob_inline_fields_reject_overlap_and_unknown_fields(self):
             'blob-descriptor-field': 'picture',
             'blob-view-field': 'picture',
         })
+        overlap_schema = Schema.from_pyarrow_schema(pa_schema, options=overlap_options)
         with self.assertRaises(ValueError) as overlap_context:
-            Schema.from_pyarrow_schema(pa_schema, options=overlap_options)
+            self.catalog.create_table(
+                'test_db.blob_overlap_reject', overlap_schema, False)
         self.assertIn("must not overlap", str(overlap_context.exception))
 
         unknown_options = dict(base_options)
         unknown_options.update({'blob-view-field': 'missing_picture'})
+        unknown_schema = Schema.from_pyarrow_schema(pa_schema, options=unknown_options)
         with self.assertRaises(ValueError) as unknown_context:
-            Schema.from_pyarrow_schema(pa_schema, options=unknown_options)
+            self.catalog.create_table(
+                'test_db.blob_unknown_reject', unknown_schema, False)
         self.assertIn("must be blob fields", str(unknown_context.exception))
 
     def test_to_arrow_batch_reader(self):

From 2ddb95f0a5964588bb576dd4ffd29406b5ac7f40 Mon Sep 17 00:00:00 2001
From: umi <zhaowenhai.zwh@alibaba-inc.com>
Date: Wed, 3 Jun 2026 21:25:29 +0800
Subject: [PATCH 30/34] [python] Update external storage blob test to validate at create_table

---
 .../pypaimon/tests/external_storage_blob_test.py  | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/paimon-python/pypaimon/tests/external_storage_blob_test.py b/paimon-python/pypaimon/tests/external_storage_blob_test.py
index 3bbe5cc06b6b..e7ad8273d9a1 100644
--- a/paimon-python/pypaimon/tests/external_storage_blob_test.py
+++ b/paimon-python/pypaimon/tests/external_storage_blob_test.py
@@ -85,14 +85,15 @@ def test_validation_field_not_blob_type(self):
             ('name', pa.string()),
             ('video', pa.large_binary()),
         ])
+        schema = Schema.from_pyarrow_schema(pa_schema, options={
+            'row-tracking.enabled': 'true',
+            'data-evolution.enabled': 'true',
+            'blob-descriptor-field': 'name,video',
+            'blob-external-storage-field': 'name',
+            'blob-external-storage-path': external_path,
+        })
         with self.assertRaises(ValueError) as ctx:
-            Schema.from_pyarrow_schema(pa_schema, options={
-                'row-tracking.enabled': 'true',
-                'data-evolution.enabled': 'true',
-                'blob-descriptor-field': 'name,video',
-                'blob-external-storage-field': 'name',
-                'blob-external-storage-path': external_path,
-            })
+            self.catalog.create_table('test_db.not_blob_type_test', schema, False)
         self.assertIn('must be blob fields', str(ctx.exception))
 
     def test_validation_blob_not_null_field_passes(self):

From 41f849f7bb08253249e3ddb92146378c7303018f Mon Sep 17 00:00:00 2001
From: umi <zhaowenhai.zwh@alibaba-inc.com>
Date: Wed, 3 Jun 2026 22:25:35 +0800
Subject: [PATCH 31/34] [python] Push read limit down to the blob view prescan reader

---
 paimon-python/pypaimon/read/split_read.py     |  10 +-
 .../pypaimon/schema/schema_manager.py         |  37 ++--
 .../pypaimon/tests/blob_table_test.py         | 181 +++++++++++++++++-
 3 files changed, 207 insertions(+), 21 deletions(-)

diff --git a/paimon-python/pypaimon/read/split_read.py b/paimon-python/pypaimon/read/split_read.py
index 36221866cc64..5b627dc2cadb 100644
--- a/paimon-python/pypaimon/read/split_read.py
+++ b/paimon-python/pypaimon/read/split_read.py
@@ -846,11 +846,18 @@ def _create_raw_reader(self) -> RecordReader:
         else:
             reader = merge_reader
 
+        if self.limit is not None:
+            reader = LimitedRecordBatchReader(reader, self.limit)
+
         return reader
 
     def _create_prescan_reader(self, field_names):
         """Create a prescan reader by constructing a new DataEvolutionSplitRead
-        instance that only projects the specified field names."""
+        instance that only projects the specified field names.
+
+        Mirrors Java's configureBlobViewPrescanRead: the limit is passed to the prescan
+        reader so that it does not scan the entire split when a LIMIT clause is present.
+        """
         from pypaimon.read.reader.iface.record_batch_reader import EmptyRecordBatchReader
 
         prescan_fields = [f for f in self.read_fields if f.name in field_names]
@@ -863,6 +870,7 @@ def _create_prescan_reader(self, field_names):
             read_type=prescan_fields,
             split=self.split,
             row_tracking_enabled=False,
+            limit=self.limit,  # Pass limit to prescan reader
         )
         prescan_read.row_ranges = self.row_ranges
         return prescan_read._create_raw_reader()
diff --git a/paimon-python/pypaimon/schema/schema_manager.py b/paimon-python/pypaimon/schema/schema_manager.py
index ceaf183aa4f0..11b690fdbb67 100644
--- a/paimon-python/pypaimon/schema/schema_manager.py
+++ b/paimon-python/pypaimon/schema/schema_manager.py
@@ -53,7 +53,7 @@ def _get_rename_mappings(changes: List[SchemaChange]) -> dict:
 
 
 def _handle_update_column_comment(
-        change: UpdateColumnComment, new_fields: List[DataField]
+    change: UpdateColumnComment, new_fields: List[DataField]
 ):
     field_name = change.field_names[-1]
     field_index = _find_field_index(new_fields, field_name)
@@ -66,7 +66,7 @@ def _handle_update_column_comment(
 
 
 def _handle_update_column_nullability(
-        change: UpdateColumnNullability, new_fields: List[DataField]
+    change: UpdateColumnNullability, new_fields: List[DataField]
 ):
     field_name = change.field_names[-1]
     field_index = _find_field_index(new_fields, field_name)
@@ -83,7 +83,7 @@ def _handle_update_column_nullability(
 
 
 def _handle_update_column_type(
-        change: UpdateColumnType, new_fields: List[DataField]
+    change: UpdateColumnType, new_fields: List[DataField]
 ):
     field_name = change.field_names[-1]
     field_index = _find_field_index(new_fields, field_name)
@@ -135,7 +135,7 @@ def _get_type_root(data_type) -> str:
 
 
 def _assert_not_updating_partition_keys(
-        schema: 'TableSchema', field_names: List[str], operation: str):
+    schema: 'TableSchema', field_names: List[str], operation: str):
     if len(field_names) > 1:
         return
     field_name = field_names[0]
@@ -146,7 +146,7 @@ def _assert_not_updating_partition_keys(
 
 
 def _assert_not_updating_primary_keys(
-        schema: 'TableSchema', field_names: List[str], operation: str):
+    schema: 'TableSchema', field_names: List[str], operation: str):
     if len(field_names) > 1:
         return
     field_name = field_names[0]
@@ -155,7 +155,7 @@ def _assert_not_updating_primary_keys(
 
 
 def _assert_not_renaming_blob_column(
-        new_fields: List[DataField], field_names: List[str]):
+    new_fields: List[DataField], field_names: List[str]):
     if len(field_names) > 1:
         return
     field_name = field_names[0]
@@ -319,12 +319,12 @@ def _apply_move(fields: List[DataField], new_field: Optional[DataField], move):
 
 
 def _handle_add_column(
-        change: AddColumn,
-        new_fields: List[DataField],
-        highest_field_id: AtomicInteger,
-        partition_keys: List[str],
-        add_column_before_partition: bool,
-        new_options: dict
+    change: AddColumn,
+    new_fields: List[DataField],
+    highest_field_id: AtomicInteger,
+    partition_keys: List[str],
+    add_column_before_partition: bool,
+    new_options: dict
 ):
     if not change.data_type.nullable:
         raise ValueError(
@@ -351,9 +351,9 @@ def _handle_add_column(
     if change.move:
         _apply_move(new_fields, new_field, change.move)
     elif (
-            add_column_before_partition
-            and partition_keys
-            and len(change.field_names) == 1
+        add_column_before_partition
+        and partition_keys
+        and len(change.field_names) == 1
     ):
         insert_index = len(new_fields)
         for i, field in enumerate(new_fields):
@@ -436,6 +436,7 @@ def create_table(self, schema: Schema) -> TableSchema:
                 return table_schema
 
     def commit(self, new_schema: TableSchema) -> bool:
+        _validate_blob_fields(new_schema.fields, new_schema.options, new_schema.primary_keys)
         schema_path = self._to_schema_path(new_schema.id)
         try:
             result = self.file_io.try_to_write_atomic(schema_path, JSON.to_json(new_schema, indent=2))
@@ -494,7 +495,7 @@ def commit_changes(self, changes: List[SchemaChange]) -> TableSchema:
                 raise RuntimeError(f"Failed to commit schema changes: {e}") from e
 
     def _generate_table_schema(
-            self, old_table_schema: TableSchema, changes: List[SchemaChange]
+        self, old_table_schema: TableSchema, changes: List[SchemaChange]
     ) -> TableSchema:
         new_options = dict(old_table_schema.options)
         new_fields = []
@@ -586,13 +587,13 @@ def _generate_table_schema(
 
     @staticmethod
     def _apply_not_nested_column_rename(
-            columns: List[str], rename_mappings: dict
+        columns: List[str], rename_mappings: dict
     ) -> List[str]:
         return [rename_mappings.get(col, col) for col in columns]
 
     @staticmethod
     def _apply_rename_columns_to_options(
-            options: dict, rename_mappings: dict
+        options: dict, rename_mappings: dict
     ) -> dict:
         if not rename_mappings:
             return options
diff --git a/paimon-python/pypaimon/tests/blob_table_test.py b/paimon-python/pypaimon/tests/blob_table_test.py
index 87199dfeccdf..793131478236 100755
--- a/paimon-python/pypaimon/tests/blob_table_test.py
+++ b/paimon-python/pypaimon/tests/blob_table_test.py
@@ -1690,6 +1690,183 @@ def test_blob_inline_fields_reject_overlap_and_unknown_fields(self):
                 'test_db.blob_unknown_reject', unknown_schema, False)
         self.assertIn("must be blob fields", str(unknown_context.exception))
 
+    def test_blob_view_prescan_with_limit(self):
+        """Test that the limit is correctly pushed down to the prescan reader.
+
+        Regression test: the prescan should scan only up to `limit` rows,
+        not the entire split.
+        """
+        from pypaimon import Schema
+        from pypaimon.table.row.blob import BlobViewStruct
+
+        # Create source table with multiple rows
+        source_schema = pa.schema([
+            ('id', pa.int32()),
+            ('picture', pa.large_binary()),
+        ])
+        source = Schema.from_pyarrow_schema(
+            source_schema,
+            options={
+                'row-tracking.enabled': 'true',
+                'data-evolution.enabled': 'true',
+            }
+        )
+        self.catalog.create_table('test_db.blob_view_limit_source', source, False)
+        source_table = self.catalog.get_table('test_db.blob_view_limit_source')
+
+        # Write 10 rows
+        num_rows = 10
+        payloads = [f'payload-{i}'.encode() for i in range(num_rows)]
+        write_builder = source_table.new_batch_write_builder()
+        writer = write_builder.new_write()
+        writer.write_arrow(pa.Table.from_pydict({
+            'id': list(range(num_rows)),
+            'picture': payloads,
+        }, schema=source_schema))
+        commit_messages = writer.prepare_commit()
+        write_builder.new_commit().commit(commit_messages)
+        writer.close()
+
+        picture_field_id = next(
+            field.id for field in source_table.table_schema.fields if field.name == 'picture'
+        )
+        view_values = [
+            BlobViewStruct('test_db.blob_view_limit_source', picture_field_id, i).serialize()
+            for i in range(num_rows)
+        ]
+
+        # Create target table with blob-view-field
+        target_schema = pa.schema([
+            ('id', pa.int32()),
+            ('picture', pa.large_binary()),
+        ])
+        target = Schema.from_pyarrow_schema(
+            target_schema,
+            options={
+                'row-tracking.enabled': 'true',
+                'data-evolution.enabled': 'true',
+                'blob-view-field': 'picture',
+            }
+        )
+        self.catalog.create_table('test_db.blob_view_limit_target', target, False)
+        target_table = self.catalog.get_table('test_db.blob_view_limit_target')
+
+        target_write_builder = target_table.new_batch_write_builder()
+        target_writer = target_write_builder.new_write()
+        target_writer.write_arrow(pa.Table.from_pydict({
+            'id': list(range(num_rows)),
+            'picture': view_values,
+        }, schema=target_schema))
+        target_commit_messages = target_writer.prepare_commit()
+        target_write_builder.new_commit().commit(target_commit_messages)
+        target_writer.close()
+
+        # Test with limit: should only return first 3 rows
+        read_builder = target_table.new_read_builder()
+        read_builder.with_limit(3)
+        result = read_builder.new_read().to_arrow(
+            read_builder.new_scan().plan().splits()
+        )
+        self.assertEqual(result.num_rows, 3, "LIMIT should be respected in blob view prescan")
+        self.assertEqual(result.column('id').to_pylist(), [0, 1, 2])
+
+    def test_blob_view_prescan_only_collects_limited_view_structs(self):
+        """Verify that the prescan stage only collects as many BlobViewStructs as
+        the limit allows, instead of scanning the entire split.
+
+        Unlike test_blob_view_prescan_with_limit (which only checks the final
+        output), this test patches BlobViewLookup.preload to capture the exact
+        list of view structs collected during prescan and asserts its length
+        equals the limit.
+        """
+        from unittest import mock
+
+        from pypaimon import Schema
+        from pypaimon.table.row.blob import BlobViewStruct
+        from pypaimon.utils.blob_view_lookup import BlobViewLookup
+
+        source_schema = pa.schema([
+            ('id', pa.int32()),
+            ('picture', pa.large_binary()),
+        ])
+        source = Schema.from_pyarrow_schema(
+            source_schema,
+            options={
+                'row-tracking.enabled': 'true',
+                'data-evolution.enabled': 'true',
+            }
+        )
+        self.catalog.create_table('test_db.blob_view_prescan_count_source', source, False)
+        source_table = self.catalog.get_table('test_db.blob_view_prescan_count_source')
+
+        num_rows = 10
+        payloads = [f'payload-{i}'.encode() for i in range(num_rows)]
+        write_builder = source_table.new_batch_write_builder()
+        writer = write_builder.new_write()
+        writer.write_arrow(pa.Table.from_pydict({
+            'id': list(range(num_rows)),
+            'picture': payloads,
+        }, schema=source_schema))
+        commit_messages = writer.prepare_commit()
+        write_builder.new_commit().commit(commit_messages)
+        writer.close()
+
+        picture_field_id = next(
+            field.id for field in source_table.table_schema.fields if field.name == 'picture'
+        )
+        view_values = [
+            BlobViewStruct('test_db.blob_view_prescan_count_source', picture_field_id, i).serialize()
+            for i in range(num_rows)
+        ]
+
+        target_schema = pa.schema([
+            ('id', pa.int32()),
+            ('picture', pa.large_binary()),
+        ])
+        target = Schema.from_pyarrow_schema(
+            target_schema,
+            options={
+                'row-tracking.enabled': 'true',
+                'data-evolution.enabled': 'true',
+                'blob-view-field': 'picture',
+            }
+        )
+        self.catalog.create_table('test_db.blob_view_prescan_count_target', target, False)
+        target_table = self.catalog.get_table('test_db.blob_view_prescan_count_target')
+
+        target_write_builder = target_table.new_batch_write_builder()
+        target_writer = target_write_builder.new_write()
+        target_writer.write_arrow(pa.Table.from_pydict({
+            'id': list(range(num_rows)),
+            'picture': view_values,
+        }, schema=target_schema))
+        target_commit_messages = target_writer.prepare_commit()
+        target_write_builder.new_commit().commit(target_commit_messages)
+        target_writer.close()
+
+        captured_view_structs = []
+        original_preload = BlobViewLookup.preload
+
+        def capturing_preload(lookup_self, view_structs):
+            captured_view_structs.append(list(view_structs))
+            return original_preload(lookup_self, view_structs)
+
+        limit = 3
+        read_builder = target_table.new_read_builder()
+        read_builder.with_limit(limit)
+        with mock.patch.object(BlobViewLookup, 'preload', autospec=True,
+                               side_effect=capturing_preload):
+            result = read_builder.new_read().to_arrow(
+                read_builder.new_scan().plan().splits()
+            )
+
+        self.assertEqual(result.num_rows, limit)
+        self.assertEqual(len(captured_view_structs), 1,
+                         "preload should be invoked exactly once during prescan")
+        self.assertEqual(
+            len(captured_view_structs[0]), limit,
+            "prescan should only collect as many view structs as the limit allows")
+
     def test_to_arrow_batch_reader(self):
         import random
         from pypaimon import Schema
@@ -3504,7 +3681,7 @@ def test_blob_data_with_ray(self):
         total_split_row_count = sum([s.row_count for s in splits])
         self.assertEqual(total_split_row_count, num_rows * 2,
                          f"Total split row count should be {num_rows}, got {total_split_row_count}")
-        
+
         total_merged_count = 0
         for split in splits:
             merged_count = split.merged_row_count()
@@ -3513,7 +3690,7 @@ def test_blob_data_with_ray(self):
                 self.assertLessEqual(
                     merged_count, split.row_count,
                     f"merged_row_count ({merged_count}) should be <= row_count ({split.row_count})")
-        
+
         if total_merged_count > 0:
             self.assertEqual(
                 total_merged_count, num_rows,

From 0383e5aacc488ca221f4b7e247f313414201fc89 Mon Sep 17 00:00:00 2001
From: umi <zhaowenhai.zwh@alibaba-inc.com>
Date: Wed, 3 Jun 2026 22:34:14 +0800
Subject: [PATCH 32/34] fmt

---
 paimon-python/pypaimon/schema/schema_manager.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/paimon-python/pypaimon/schema/schema_manager.py b/paimon-python/pypaimon/schema/schema_manager.py
index 11b690fdbb67..57a3f54efd8a 100644
--- a/paimon-python/pypaimon/schema/schema_manager.py
+++ b/paimon-python/pypaimon/schema/schema_manager.py
@@ -135,7 +135,7 @@ def _get_type_root(data_type) -> str:
 
 
 def _assert_not_updating_partition_keys(
-    schema: 'TableSchema', field_names: List[str], operation: str):
+        schema: 'TableSchema', field_names: List[str], operation: str):
     if len(field_names) > 1:
         return
     field_name = field_names[0]
@@ -146,7 +146,7 @@ def _assert_not_updating_partition_keys(
 
 
 def _assert_not_updating_primary_keys(
-    schema: 'TableSchema', field_names: List[str], operation: str):
+        schema: 'TableSchema', field_names: List[str], operation: str):
     if len(field_names) > 1:
         return
     field_name = field_names[0]
@@ -155,7 +155,7 @@ def _assert_not_updating_primary_keys(
 
 
 def _assert_not_renaming_blob_column(
-    new_fields: List[DataField], field_names: List[str]):
+        new_fields: List[DataField], field_names: List[str]):
     if len(field_names) > 1:
         return
     field_name = field_names[0]

From c36806b94b9b4db197635c2b22f806b5730c71e6 Mon Sep 17 00:00:00 2001
From: umi <zhaowenhai.zwh@alibaba-inc.com>
Date: Wed, 3 Jun 2026 22:40:58 +0800
Subject: [PATCH 33/34] [python] Detect BLOB fields by exact type instead of substring match

---
 .../pypaimon/schema/schema_manager.py          |  3 ++-
 .../pypaimon/tests/blob_table_test.py          | 18 ++++++++++++++++++
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/paimon-python/pypaimon/schema/schema_manager.py b/paimon-python/pypaimon/schema/schema_manager.py
index 57a3f54efd8a..d01549c71bc5 100644
--- a/paimon-python/pypaimon/schema/schema_manager.py
+++ b/paimon-python/pypaimon/schema/schema_manager.py
@@ -172,7 +172,8 @@ def _validate_blob_fields(fields: List[DataField], options: dict, primary_keys:
         options = {}
 
     blob_field_names = {
-        field.name for field in fields if 'blob' in str(field.type).lower()
+        field.name for field in fields
+        if getattr(field.type, 'type', None) == 'BLOB'
     }
 
     if len(fields) <= len(blob_field_names):
diff --git a/paimon-python/pypaimon/tests/blob_table_test.py b/paimon-python/pypaimon/tests/blob_table_test.py
index 793131478236..3d33594c4a92 100755
--- a/paimon-python/pypaimon/tests/blob_table_test.py
+++ b/paimon-python/pypaimon/tests/blob_table_test.py
@@ -3789,6 +3789,24 @@ def test_rename_blob_column_should_fail(self):
             )
         self.assertIn('Cannot rename BLOB column', str(ctx.exception))
 
+    def test_nested_field_named_blob_not_treated_as_blob(self):
+        """Regression: a ROW field with a nested column whose name contains
+        'blob' must NOT be treated as a top-level BLOB column.  Previously
+        the substring match would falsely classify such fields, causing
+        create_table to require row-tracking and data-evolution options."""
+        pa_schema = pa.schema([
+            ('id', pa.int32()),
+            ('payload', pa.struct([
+                ('blob_name', pa.string()),
+                ('value', pa.int64()),
+            ])),
+        ])
+        schema = Schema.from_pyarrow_schema(pa_schema)
+        self.catalog.create_table(
+            'test_db.nested_blob_name_no_error', schema, False)
+        table = self.catalog.get_table('test_db.nested_blob_name_no_error')
+        self.assertIsNotNone(table)
+
 
 class GetBlobTest(unittest.TestCase):
 

From e2b20c63b62ec4a89d30e6967a9c2d257c453491 Mon Sep 17 00:00:00 2001
From: umi <zhaowenhai.zwh@alibaba-inc.com>
Date: Wed, 3 Jun 2026 23:17:47 +0800
Subject: [PATCH 34/34] [python] Do not push limit to prescan reader when a predicate is set

---
 paimon-python/pypaimon/read/split_read.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/paimon-python/pypaimon/read/split_read.py b/paimon-python/pypaimon/read/split_read.py
index 5b627dc2cadb..b4952b0fcdbd 100644
--- a/paimon-python/pypaimon/read/split_read.py
+++ b/paimon-python/pypaimon/read/split_read.py
@@ -864,13 +864,17 @@ def _create_prescan_reader(self, field_names):
         if not prescan_fields:
             return EmptyRecordBatchReader()
 
+        # When a predicate is set, do not push the limit down to the prescan reader:
+        # the outer reader filters by the predicate and then applies the limit, while
+        # the prescan reader would apply the limit without the normal-field predicate.
+        # TODO: support pushing down limit together with the predicate.
         prescan_read = DataEvolutionSplitRead(
             table=self.table,
             predicate=self.predicate,
             read_type=prescan_fields,
             split=self.split,
             row_tracking_enabled=False,
-            limit=self.limit,  # Pass limit to prescan reader
+            limit=None if self.predicate else self.limit,
         )
         prescan_read.row_ranges = self.row_ranges
         return prescan_read._create_raw_reader()