-
Notifications
You must be signed in to change notification settings - Fork 50
(improvement) Optimize custom type parsing with LRU caching #690
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,135 @@ | ||
#!/usr/bin/env python3
"""
Simple benchmark demonstrating LRU cache benefit for repeated type lookups.

Run this file directly as a script. The parent of this file's directory is
put on ``sys.path`` before ``cassandra.cqltypes`` is imported, so the in-tree
driver is the one being measured. Importing this module has no side effects;
all work happens in ``main()``.
"""
import os
import sys
import time

# Number of lookups per type in the single-type scenario.
ITERATIONS = 100000
# Number of simulated query executions in the mixed-schema scenario.
QUERIES = 10000

RULE = "=" * 80

# (label, Cassandra type string) pairs for the single-type scenario.
TYPES_TO_TEST = [
    ("Simple", "org.apache.cassandra.db.marshal.UTF8Type"),
    ("List", "org.apache.cassandra.db.marshal.ListType(org.apache.cassandra.db.marshal.UTF8Type)"),
    ("Map", "org.apache.cassandra.db.marshal.MapType(org.apache.cassandra.db.marshal.UTF8Type,org.apache.cassandra.db.marshal.Int32Type)"),
    ("Nested", "org.apache.cassandra.db.marshal.MapType(org.apache.cassandra.db.marshal.UTF8Type,org.apache.cassandra.db.marshal.ListType(org.apache.cassandra.db.marshal.Int32Type))"),
]

# Column types of the mixed schema: (id: UUID, name: text, age: int, tags: list<text>).
SCHEMA_TYPES = [
    "org.apache.cassandra.db.marshal.UUIDType",
    "org.apache.cassandra.db.marshal.UTF8Type",
    "org.apache.cassandra.db.marshal.Int32Type",
    "org.apache.cassandra.db.marshal.ListType(org.apache.cassandra.db.marshal.UTF8Type)",
]


def pin_to_single_cpu():
    """Try to restrict this process to one CPU.

    Returns the CPU id the process was pinned to, or ``None`` when pinning is
    unavailable or fails.
    """
    if not hasattr(os, 'sched_setaffinity'):
        return None
    try:
        cpu = min(os.sched_getaffinity(0))
        os.sched_setaffinity(0, {cpu})
    except Exception:
        # Best-effort CPU pinning: ignore failures (unsupported platform, permissions, etc.).
        return None
    return cpu


def clear_caches(*funcs):
    """Call ``cache_clear()`` on every given callable that has one."""
    for func in funcs:
        clear = getattr(func, 'cache_clear', None)
        if clear is not None:
            clear()


def cache_summary(func):
    """Return ``"hits=H misses=M"`` for a cached callable, else ``"no cache"``."""
    cache_info = getattr(func, 'cache_info', None)
    if cache_info is None:
        return "no cache"
    stats = cache_info()
    return f"hits={stats.hits} misses={stats.misses}"


def hit_rate_percent(hits, misses):
    """Return the cache hit rate in percent; 0.0 when there were no calls."""
    total = hits + misses
    if not total:
        return 0.0
    return hits / total * 100


def time_repeated(func, arg, repeat):
    """Call ``func(arg)`` ``repeat`` times and return the elapsed seconds."""
    start = time.perf_counter()
    for _ in range(repeat):
        func(arg)
    return time.perf_counter() - start


def time_round_robin(func, args, repeat):
    """Call ``func`` on every item of ``args``, ``repeat`` times over.

    Returns the elapsed seconds for all ``repeat * len(args)`` calls.
    """
    start = time.perf_counter()
    for _ in range(repeat):
        for arg in args:
            func(arg)
    return time.perf_counter() - start


def main():
    """Run both benchmark scenarios and print the report to stdout."""
    cpu = pin_to_single_cpu()
    if cpu is not None:
        print(f"Benchmark pinned to CPU {cpu}\n")

    repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
    if repo_root not in sys.path:
        sys.path.insert(0, repo_root)

    # Imported here, not at module top, because it relies on the sys.path
    # adjustment made just above.
    from cassandra.cqltypes import lookup_casstype, parse_casstype_args, lookup_casstype_simple

    print(RULE)
    print("LRU CACHE BENEFIT DEMONSTRATION")
    print(RULE)
    print("\nShowing performance improvement from caching parse_casstype_args()")
    print(f"Scenario: Same complex type string looked up {ITERATIONS:,} times")
    print("(simulates repeated query execution with same schema)\n")

    time_header = f"Time ({ITERATIONS // 1000}k calls)"
    print(f"{'Type':<15} {time_header:<20} {'Per call':<15} {'Cache Info'}")
    print("-" * 80)

    for name, type_str in TYPES_TO_TEST:
        # Start every measurement from cold caches so the rows are comparable.
        clear_caches(parse_casstype_args, lookup_casstype_simple)

        elapsed = time_repeated(lookup_casstype, type_str, ITERATIONS)
        per_call = (elapsed / ITERATIONS) * 1_000_000  # Convert to microseconds

        # Pad each cell to the header's column width so the table lines up.
        time_cell = f"{elapsed * 1000:>8.2f} ms"
        per_call_cell = f"{per_call:>6.3f} μs"
        cache_str = cache_summary(parse_casstype_args)
        print(f"{name:<15} {time_cell:<20} {per_call_cell:<15} {cache_str}")

    print("\n" + RULE)
    print("INTERPRETATION")
    print(RULE)
    print("\nFor complex types (List, Map, Nested):")
    print(" • First call (miss=1): Parses with regex scanner")
    print(f" • Subsequent calls (hits={ITERATIONS - 1:,}): Return cached result")
    print(" • Speedup: Nearly instant lookups after first parse\n")

    print("For simple types:")
    print(" • Fast path: Never calls parse_casstype_args")
    print(" • Direct dict lookup in _casstypes")
    print(" • No cache needed (already optimized)\n")

    # Demonstrate cache effectiveness with mixed types
    print("\n" + RULE)
    print(f"REAL-WORLD SCENARIO: Mixed Schema with {len(SCHEMA_TYPES)} Columns")
    print(RULE)
    print("\nSchema: (id: UUID, name: text, age: int, tags: list<text>)")
    print(f"Simulating {QUERIES:,} query executions...\n")

    clear_caches(parse_casstype_args, lookup_casstype_simple)

    elapsed = time_round_robin(lookup_casstype, SCHEMA_TYPES, QUERIES)

    total_lookups = QUERIES * len(SCHEMA_TYPES)
    per_query = (elapsed / QUERIES) * 1_000_000  # μs
    per_lookup = (elapsed / total_lookups) * 1_000_000  # μs

    print(f"Total queries: {QUERIES:>10,}")
    print(f"Total lookups: {total_lookups:>10,}")
    print(f"Total time:    {elapsed * 1000:>10.2f} ms")
    print(f"Per query:     {per_query:>10.3f} μs")
    print(f"Per lookup:    {per_lookup:>10.3f} μs")

    if hasattr(parse_casstype_args, 'cache_info'):
        stats = parse_casstype_args.cache_info()
        print("\nCache statistics:")
        print(f"  Hits:     {stats.hits:>10,}")
        print(f"  Misses:   {stats.misses:>10,}")
        print(f"  Hit rate: {hit_rate_percent(stats.hits, stats.misses):>10.1f}%")

    print("\n" + RULE)
    print("KEY FINDING")
    print(RULE)
    print("\nThe LRU cache on parse_casstype_args() provides:")
    print(" ✓ Fast repeated lookups of complex types (List, Map, Set, Tuple)")
    print(" ✓ Natural fast path for simple types (no parentheses = no parsing)")
    print(" ✓ Significant performance improvement for real-world query workloads")
    print()


if __name__ == "__main__":
    main()
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -33,6 +33,7 @@ | |
| import calendar | ||
| from collections import namedtuple | ||
| from decimal import Decimal | ||
| import functools | ||
| import io | ||
| from itertools import chain | ||
| import logging | ||
|
|
@@ -56,7 +57,6 @@ | |
|
|
||
| apache_cassandra_type_prefix = 'org.apache.cassandra.db.marshal.' | ||
|
|
||
| cassandra_empty_type = 'org.apache.cassandra.db.marshal.EmptyType' | ||
| cql_empty_type = 'empty' | ||
mykaul marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| log = logging.getLogger(__name__) | ||
|
|
@@ -185,6 +185,7 @@ def strip_frozen(cql): | |
| return cql | ||
|
|
||
|
|
||
| @functools.lru_cache(maxsize=256) | ||
mykaul marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| def lookup_casstype_simple(casstype): | ||
| """ | ||
| Given a Cassandra type name (either fully distinguished or not), hand | ||
|
|
@@ -203,6 +204,7 @@ def lookup_casstype_simple(casstype): | |
| return typeclass | ||
|
|
||
|
|
||
| @functools.lru_cache(maxsize=256) | ||
|
||
| def parse_casstype_args(typestring): | ||
| tokens, remainder = casstype_scanner.scan(typestring) | ||
| if remainder: | ||
|
|
@@ -215,7 +217,7 @@ def parse_casstype_args(typestring): | |
| args.append(([], [])) | ||
| elif tok == ')': | ||
| types, names = args.pop() | ||
| prev_types, prev_names = args[-1] | ||
| prev_types, _ = args[-1] | ||
mykaul marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| prev_types[-1] = prev_types[-1].apply_parameters(types, names) | ||
| else: | ||
| types, names = args[-1] | ||
|
|
@@ -235,6 +237,7 @@ def parse_casstype_args(typestring): | |
| # return the first (outer) type, which will have all parameters applied | ||
| return args[0][0][0] | ||
|
|
||
|
|
||
| def lookup_casstype(casstype): | ||
| """ | ||
| Given a Cassandra type as a string (possibly including parameters), hand | ||
|
|
@@ -247,12 +250,17 @@ def lookup_casstype(casstype): | |
| <class 'cassandra.cqltypes.MapType(UTF8Type, Int32Type)'> | ||
|
|
||
| """ | ||
| # Fast path: already a type object | ||
| if isinstance(casstype, (CassandraType, CassandraTypeType)): | ||
| return casstype | ||
| try: | ||
| return parse_casstype_args(casstype) | ||
| except (ValueError, AssertionError, IndexError) as e: | ||
| raise ValueError("Don't know how to parse type string %r: %s" % (casstype, e)) | ||
|
|
||
| # Fast path: simple type without parameters (no parentheses) | ||
| # This avoids regex scanning for the most common case | ||
| if '(' not in casstype: | ||
| return lookup_casstype_simple(casstype) | ||
|
|
||
| # Complex type with parameters: use cached parser | ||
| return parse_casstype_args(casstype) | ||
mykaul marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
|
|
||
| def is_reversed_casstype(data_type): | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.