Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions api/features/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -443,6 +443,7 @@ def get_last_modified_in_current_environment(
class FeatureSerializerWithMetadata(MetadataSerializerMixin, CreateFeatureSerializer):
metadata = MetadataSerializer(required=False, many=True)

# NOTE: This field is populated by `projects.code_references.services.annotate_feature_queryset_with_code_references_summary`.
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need this comment?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've added it to hint at where this is materialised, because I personally find it useful, though one could also find it by searching. Weakly held — let me know if you prefer the 🔪

code_references_counts = FeatureFlagCodeReferencesRepositoryCountSerializer(
many=True,
read_only=True,
Expand Down
16 changes: 1 addition & 15 deletions api/features/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,11 @@
from common.core.utils import is_database_replica_setup, using_database_replica
from common.projects.permissions import VIEW_PROJECT
from django.conf import settings
from django.contrib.postgres.fields import ArrayField
from django.core.cache import caches
from django.db.models import (
BooleanField,
Case,
Exists,
JSONField,
Max,
OuterRef,
Q,
Expand Down Expand Up @@ -62,7 +60,6 @@
NestedEnvironmentPermissions,
)
from features.value_types import BOOLEAN, INTEGER, STRING
from integrations.flagsmith.client import get_openfeature_client
from projects.code_references.services import (
annotate_feature_queryset_with_code_references_summary,
)
Expand Down Expand Up @@ -219,18 +216,7 @@ def get_queryset(self): # type: ignore[no-untyped-def]
query_serializer.is_valid(raise_exception=True)
query_data = query_serializer.validated_data

# TODO: Delete this after https://github.com/flagsmith/flagsmith/issues/6832 is resolved
organisation = project.organisation
if get_openfeature_client().get_boolean_value(
"code_references_ui_stats",
default_value=False,
evaluation_context=organisation.openfeature_evaluation_context,
):
queryset = annotate_feature_queryset_with_code_references_summary(queryset)
else:
queryset = queryset.annotate(
code_references_counts=Value([], output_field=ArrayField(JSONField()))
)
queryset = annotate_feature_queryset_with_code_references_summary(queryset)

queryset = self._filter_queryset(queryset, query_serializer)

Expand Down
13 changes: 0 additions & 13 deletions api/integrations/flagsmith/data/environment.json
Original file line number Diff line number Diff line change
Expand Up @@ -92,19 +92,6 @@
"featurestate_uuid": "e0d380a6-bdbc-4ad6-ae6f-b8b77d8beae6",
"multivariate_feature_state_values": []
},
{
Comment thread
gagantrivedi marked this conversation as resolved.
"django_id": 1212320,
"enabled": false,
"feature": {
"id": 192793,
"name": "code_references_ui_stats",
"type": "STANDARD"
},
"feature_segment": null,
"feature_state_value": null,
"featurestate_uuid": "f976df2f-2341-4623-8425-d6eda23a2ebc",
"multivariate_feature_state_values": []
},
{
"django_id": 1229327,
"enabled": false,
Expand Down
3 changes: 0 additions & 3 deletions api/projects/code_references/constants.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,2 @@
# TODO: Implement history cleanup?
FEATURE_FLAG_CODE_REFERENCES_RETENTION_DAYS = 30
Comment thread
gagantrivedi marked this conversation as resolved.

# Linux maximum file path length, as per limits.h/PATH_MAX
MAX_FILE_PATH_LENGTH = 4096
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
import hashlib
import json
Comment thread
gagantrivedi marked this conversation as resolved.
from itertools import groupby
from operator import attrgetter
from typing import TypedDict

import django.db.models.deletion
from django.apps.registry import Apps
from django.db import migrations, models
from django.db.models import Max


class LegacyCodeReference(TypedDict):
feature_name: str
file_path: str
line_number: int


class StoredCodeReference(TypedDict):
file_path: str
line_number: int


def _hash_references(references: list[StoredCodeReference]) -> str:
return hashlib.md5(
json.dumps(references, sort_keys=True).encode(),
usedforsecurity=False,
).hexdigest()


def migrate_scans_forward(apps: Apps, _: object) -> None:
    """Split each legacy scan into new cardinality (per-repository and per-feature)"""

    LegacyScan = apps.get_model("code_references", "FeatureFlagCodeReferencesScan")
    PerFeatureScan = apps.get_model("code_references", "ScannedCodeReferences")
    Repository = apps.get_model("code_references", "VCSRepository")
    Feature = apps.get_model("features", "Feature")

    # One repository row per distinct (project, url, provider) combination
    # seen in the legacy scans; last_scanned_at is the newest scan timestamp.
    legacy_scans_summaries = LegacyScan.objects.values(
        "project_id",
        "repository_url",
        "vcs_provider",
    ).annotate(last_scanned_at=Max("created_at"))

    repositories = {
        (summary["project_id"], summary["repository_url"]): Repository.objects.create(
            project_id=summary["project_id"],
            url=summary["repository_url"],
            vcs_provider=summary["vcs_provider"],
            last_scanned_at=summary["last_scanned_at"],
        )
        for summary in legacy_scans_summaries
    }

    # Oldest-first per project so the newest scan wins on hash collisions
    legacy_scans = LegacyScan.objects.order_by("project_id", "created_at").iterator()
    # groupby relies on the project_id ordering established just above.
    grouped_scans = groupby(legacy_scans, key=attrgetter("project_id"))
    for project_id, project_scans in grouped_scans:
        features = {
            (feature.project_id, feature.name): feature
            for feature in Feature.objects.filter(
                project_id=project_id,
                deleted_at__isnull=True,  # Historical models drop SoftDeleteManager
            )
        }
        for legacy_scan in project_scans:
            repository_url = legacy_scan.repository_url
            repository = repositories[project_id, repository_url]

            # Re-key the flat legacy reference list by feature name, dropping
            # the embedded feature_name field from each stored entry.
            references_by_feature: dict[str, list[StoredCodeReference]] = {}
            for reference in legacy_scan.code_references:
                feature_name = reference["feature_name"]
                references_by_feature.setdefault(feature_name, []).append(
                    StoredCodeReference(
                        file_path=reference["file_path"],
                        line_number=reference["line_number"],
                    )
                )

            for feature_name, references in references_by_feature.items():
                # Skip references to names with no matching live feature.
                if not (feature := features.get((project_id, feature_name))):
                    continue
                # update_or_create: a later (newer) scan with the same hash
                # overwrites revision/created_at instead of duplicating rows.
                PerFeatureScan.objects.update_or_create(
                    feature=feature,
                    repository=repository,
                    code_references_hash=_hash_references(references),
                    defaults={
                        "revision": legacy_scan.revision,
                        "code_references": references,
                        "created_at": legacy_scan.created_at,
                    },
                )


def migrate_scans_backward(apps: Apps, _: object) -> None:
    """Mirror each per-feature row back into the legacy single-table layout."""
    LegacyScan = apps.get_model("code_references", "FeatureFlagCodeReferencesScan")
    PerFeatureScan = apps.get_model("code_references", "ScannedCodeReferences")
    # Disable auto_now_add so historical timestamps can be written back verbatim.
    LegacyScan._meta.get_field("created_at").auto_now_add = False

    per_feature_scans = PerFeatureScan.objects.select_related(
        "repository",
        "feature",
    ).iterator(chunk_size=200)

    for per_feature_scan in per_feature_scans:
        repository = per_feature_scan.repository
        feature_name = per_feature_scan.feature.name
        LegacyScan.objects.create(
            project_id=repository.project_id,
            repository_url=repository.url,
            vcs_provider=repository.vcs_provider,
            revision=per_feature_scan.revision,
            # Restore the flat legacy shape: feature_name re-embedded in each entry.
            code_references=[
                {"feature_name": feature_name, **reference}
                for reference in per_feature_scan.code_references
            ],
            created_at=per_feature_scan.created_at,
        )


class Migration(migrations.Migration):
    # Replaces the single FeatureFlagCodeReferencesScan table with two tables:
    # VCSRepository (one per scanned repo) and ScannedCodeReferences (one per
    # feature/repository/unique-reference-set), migrating existing data across.
    dependencies = [
        ("code_references", "0002_add_project_repo_created_index"),
        ("features", "0066_constrain_feature_type"),
        ("projects", "0029_bump_default_project_limits"),
    ]

    operations = [
        migrations.CreateModel(
            name="VCSRepository",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("created_at", models.DateTimeField(auto_now_add=True)),
                ("url", models.URLField()),
                (
                    "vcs_provider",
                    models.CharField(
                        choices=[("github", "GitHub")],
                        max_length=50,
                    ),
                ),
                ("last_scanned_at", models.DateTimeField(null=True)),
                (
                    "project",
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name="vcs_repositories",
                        to="projects.project",
                    ),
                ),
            ],
        ),
        migrations.AddConstraint(
            model_name="vcsrepository",
            constraint=models.UniqueConstraint(
                fields=("project", "url"),
                name="unique_vcs_repository",
            ),
        ),
        migrations.CreateModel(
            name="ScannedCodeReferences",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                # Plain DateTimeField (no auto_now_add) so the data migration
                # can set historical scan timestamps explicitly.
                ("created_at", models.DateTimeField()),
                ("revision", models.CharField(max_length=100)),
                ("code_references", models.JSONField(default=list)),
                # 32 chars: MD5 hex digest of the serialised reference list.
                ("code_references_hash", models.CharField(max_length=32)),
                (
                    "feature",
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name="scanned_code_references",
                        to="features.feature",
                    ),
                ),
                (
                    "repository",
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name="scanned_code_references",
                        to="code_references.vcsrepository",
                    ),
                ),
            ],
        ),
        migrations.AddConstraint(
            model_name="scannedcodereferences",
            constraint=models.UniqueConstraint(
                fields=("feature", "repository", "code_references_hash"),
                name="unique_scanned_code_references",
            ),
        ),
        migrations.AddIndex(
            model_name="scannedcodereferences",
            index=models.Index(
                fields=("feature", "repository", "created_at"),
                name="cr_feature_repo_created_idx",
            ),
        ),
        # Copy legacy rows into the new layout before dropping the old table;
        # the reverse function mirrors new rows back into the legacy shape.
        migrations.RunPython(
            code=migrate_scans_forward,
            reverse_code=migrate_scans_backward,
        ),
        migrations.DeleteModel(
            name="FeatureFlagCodeReferencesScan",
        ),
    ]
62 changes: 50 additions & 12 deletions api/projects/code_references/models.py
Original file line number Diff line number Diff line change
@@ -1,37 +1,75 @@
from django.db import models

from projects.code_references.types import JSONCodeReference, VCSProvider
from projects.code_references.types import StoredCodeReference, VCSProvider


class FeatureFlagCodeReferencesScan(models.Model):
class VCSRepository(models.Model):
"""
A scan of feature flag code references in a repository
A VCS repository that is scanned for feature flag code references
"""

created_at = models.DateTimeField(auto_now_add=True)

project = models.ForeignKey(
"projects.Project",
on_delete=models.CASCADE,
related_name="code_references",
related_name="vcs_repositories",
)

# Provider-agnostic URL to the web UI of the repository, e.g. https://github.flagsmith.com/backend/
repository_url = models.URLField()
url = models.URLField()

vcs_provider = models.CharField(
max_length=50,
choices=VCSProvider.choices,
default=VCSProvider.GITHUB, # TODO: Remove when adding other providers
)

last_scanned_at = models.DateTimeField(null=True)

class Meta:
constraints = [
models.UniqueConstraint(
fields=["project", "url"],
name="unique_vcs_repository",
),
]


class ScannedCodeReferences(models.Model):
"""
A list of code references for a feature scanned from a VCS repository
"""

created_at = models.DateTimeField()

feature = models.ForeignKey(
"features.Feature",
on_delete=models.CASCADE,
related_name="scanned_code_references",
)

repository = models.ForeignKey(
VCSRepository,
on_delete=models.CASCADE,
related_name="scanned_code_references",
)

revision = models.CharField(max_length=100)
code_references = models.JSONField[list[JSONCodeReference]](default=list)

created_at = models.DateTimeField(auto_now_add=True, db_index=True)
code_references = models.JSONField[list[StoredCodeReference]](default=list)

code_references_hash = models.CharField(max_length=32)

class Meta:
ordering = ["-created_at"]
constraints = [
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did you see this being used in your query plan? Mine isn't using it, which brings me to another important question — do you think we should test the query on the staging DB at least? The production DB is very different from a MacBook, and the query still looks complex enough to warrant testing on a prod-like DB

Copy link
Copy Markdown
Contributor Author

@emyller emyller May 14, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did you see this being used in your query plan?

Yes! The constraint index was used heavily in my local tests to help narrowing down row search (feature, repository). But seemingly not enough, so thanks for flagging.

do you think we should test the query on the staging DB at least?

I ran this scenario in staging, via direct database access, using temporary tables matching the ones created in this PR: "a project with 400 features, 350 are present in code, 10 merges / day (mostly dupes), over 6 months".

Results revealed slowness would bite us again in the future for big customers running microservices, as unique_scanned_code_references would still lead to N items to sort. Luckily this was an easy fix: 29f3276 — annotation is down to sub-10ms in most common cases, and sub-100ms in big-bad cases like the benchmark.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry forgot to add this in my response above. Real benchmarking, formatted by LLM:

Scenario 1, common: small project, steady scanning

40 features, 2 repos, 5 unique scans/repo/week, 6 months retained, 10,400 rows in bench_code_references_scannedcodereferences.

query time
list endpoint (full history) 52 ms
list endpoint (3-month window) 27 ms
detail endpoint × 100 features 110 ms total (≈1.1 ms each)

All three queries use cr_feature_repo_created_idx (introduced by this PR).

EXPLAIN ANALYZE full output
--- LIST ANNOTATION (full project) ---
                                                                                                                      QUERY PLAN                                                                                                                      
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 Sort  (cost=58538.5..58538.5 rows=8 width=36) (actual time=52.4..52.4 rows=40 loops=1)
   Output: f.id, ((SubPlan 2))
   Sort Key: f.id
   Sort Method: quicksort  Memory: 46kB
   Buffers: shared hit=52292
   ->  Index Scan using features_feature_project_id_72859830 on public.features_feature f  (cost=0.3..58538.4 rows=8 width=36) (actual time=1.6..52.3 rows=40 loops=1)
         Output: f.id, (SubPlan 2)
         Index Cond: (f.project_id = 25969)
         Filter: (f.deleted_at IS NULL)
         Buffers: shared hit=52289
         SubPlan 2
           ->  Aggregate  (cost=7315.6..7315.6 rows=1 width=32) (actual time=1.3..1.3 rows=1 loops=40)
                 Output: array_agg((jsonb_build_object('repository_url', r.url, 'last_successful_repository_scanned_at', r.last_scanned_at, 'last_feature_found_at', scr.created_at, 'count', COALESCE((SubPlan 1), 0))))
                 Buffers: shared hit=52285
                 ->  Unique  (cost=7311.3..7315.5 rows=12 width=86) (actual time=1.3..1.3 rows=2 loops=40)
                       Output: (jsonb_build_object('repository_url', r.url, 'last_successful_repository_scanned_at', r.last_scanned_at, 'last_feature_found_at', scr.created_at, 'count', COALESCE((SubPlan 1), 0))), r.url, scr.created_at
                       Buffers: shared hit=52285
                       ->  Sort  (cost=7311.3..7313.4 rows=843 width=86) (actual time=1.3..1.3 rows=260 loops=40)
                             Output: (jsonb_build_object('repository_url', r.url, 'last_successful_repository_scanned_at', r.last_scanned_at, 'last_feature_found_at', scr.created_at, 'count', COALESCE((SubPlan 1), 0))), r.url, scr.created_at
                             Sort Key: r.url, scr.created_at DESC
                             Sort Method: quicksort  Memory: 110kB
                             Buffers: shared hit=52285
                             ->  Hash Join  (cost=1.7..7270.3 rows=843 width=86) (actual time=0.0..1.2 rows=260 loops=40)
                                   Output: jsonb_build_object('repository_url', r.url, 'last_successful_repository_scanned_at', r.last_scanned_at, 'last_feature_found_at', scr.created_at, 'count', COALESCE((SubPlan 1), 0)), r.url, scr.created_at
                                   Inner Unique: true
                                   Hash Cond: (scr.repository_id = r.id)
                                   Buffers: shared hit=52279
                                   ->  Index Only Scan using bench_cr_feature_repo_created_idx on public.bench_code_references_scannedcodereferences scr  (cost=0.4..142.7 rows=843 width=16) (actual time=0.0..0.2 rows=260 loops=40)
                                         Output: scr.feature_id, scr.repository_id, scr.created_at
                                         Index Cond: (scr.feature_id = f.id)
                                         Heap Fetches: 10400
                                         Buffers: shared hit=10678
                                   ->  Hash  (cost=1.1..1.1 rows=12 width=58) (actual time=0.0..0.0 rows=12 loops=1)
                                         Output: r.url, r.last_scanned_at, r.id
                                         Buckets: 1024  Batches: 1  Memory Usage: 10kB
                                         Buffers: shared hit=1
                                         ->  Seq Scan on public.bench_code_references_vcsrepository r  (cost=0.0..1.1 rows=12 width=58) (actual time=0.0..0.0 rows=12 loops=1)
                                               Output: r.url, r.last_scanned_at, r.id
                                               Buffers: shared hit=1
                                   SubPlan 1
                                     ->  Limit  (cost=0.4..8.4 rows=1 width=12) (actual time=0.0..0.0 rows=1 loops=10400)
                                           Output: (jsonb_array_length(inner_scr.code_references)), inner_scr.created_at
                                           Buffers: shared hit=41600
                                           ->  Index Scan using bench_cr_feature_repo_created_idx on public.bench_code_references_scannedcodereferences inner_scr  (cost=0.4..8.4 rows=1 width=12) (actual time=0.0..0.0 rows=1 loops=10400)
                                                 Output: jsonb_array_length(inner_scr.code_references), inner_scr.created_at
                                                 Index Cond: ((inner_scr.feature_id = scr.feature_id) AND (inner_scr.repository_id = scr.repository_id) AND (inner_scr.created_at = r.last_scanned_at))
                                                 Buffers: shared hit=41600
 Query Identifier: -4966899296074004523
 Planning:
   Buffers: shared hit=411
 Planning Time: 1.0 ms
 Execution Time: 52.5 ms
(52 rows)

Time: 314.6 ms

--- LIST ANNOTATION with 3-month window ---
QUERY PLAN

Sort (cost=29054.8..29054.8 rows=8 width=36) (actual time=26.6..26.6 rows=40 loops=1)
Output: f.id, ((SubPlan 2))
Sort Key: f.id
Sort Method: quicksort Memory: 46kB
Buffers: shared hit=26305
-> Index Scan using features_feature_project_id_72859830 on public.features_feature f (cost=0.3..29054.7 rows=8 width=36) (actual time=0.8..26.6 rows=40 loops=1)
Output: f.id, (SubPlan 2)
Index Cond: (f.project_id = 25969)
Filter: (f.deleted_at IS NULL)
Buffers: shared hit=26305
SubPlan 2
-> Aggregate (cost=3630.2..3630.2 rows=1 width=32) (actual time=0.7..0.7 rows=1 loops=40)
Output: array_agg((jsonb_build_object('repository_url', r.url, 'last_successful_repository_scanned_at', r.last_scanned_at, 'last_feature_found_at', scr.created_at, 'count', COALESCE((SubPlan 1), 0))))
Buffers: shared hit=26301
-> Unique (cost=3627.9..3630.0 rows=12 width=86) (actual time=0.6..0.7 rows=2 loops=40)
Output: (jsonb_build_object('repository_url', r.url, 'last_successful_repository_scanned_at', r.last_scanned_at, 'last_feature_found_at', scr.created_at, 'count', COALESCE((SubPlan 1), 0))), r.url, scr.created_at
Buffers: shared hit=26301
-> Sort (cost=3627.9..3629.0 rows=416 width=86) (actual time=0.6..0.7 rows=130 loops=40)
Output: (jsonb_build_object('repository_url', r.url, 'last_successful_repository_scanned_at', r.last_scanned_at, 'last_feature_found_at', scr.created_at, 'count', COALESCE((SubPlan 1), 0))), r.url, scr.created_at
Sort Key: r.url, scr.created_at DESC
Sort Method: quicksort Memory: 67kB
Buffers: shared hit=26301
-> Hash Join (cost=1.7..3609.8 rows=416 width=86) (actual time=0.0..0.6 rows=130 loops=40)
Output: jsonb_build_object('repository_url', r.url, 'last_successful_repository_scanned_at', r.last_scanned_at, 'last_feature_found_at', scr.created_at, 'count', COALESCE((SubPlan 1), 0)), r.url, scr.created_at
Inner Unique: true
Hash Cond: (scr.repository_id = r.id)
Buffers: shared hit=26301
-> Index Only Scan using bench_cr_feature_repo_created_idx on public.bench_code_references_scannedcodereferences scr (cost=0.4..88.8 rows=416 width=16) (actual time=0.0..0.1 rows=130 loops=40)
Output: scr.feature_id, scr.repository_id, scr.created_at
Index Cond: ((scr.feature_id = f.id) AND (scr.created_at >= (now() - '3 mons'::interval)))
Heap Fetches: 5200
Buffers: shared hit=5500
-> Hash (cost=1.1..1.1 rows=12 width=58) (actual time=0.0..0.0 rows=12 loops=1)
Output: r.url, r.last_scanned_at, r.id
Buckets: 1024 Batches: 1 Memory Usage: 10kB
Buffers: shared hit=1
-> Seq Scan on public.bench_code_references_vcsrepository r (cost=0.0..1.1 rows=12 width=58) (actual time=0.0..0.0 rows=12 loops=1)
Output: r.url, r.last_scanned_at, r.id
Buffers: shared hit=1
SubPlan 1
-> Limit (cost=0.4..8.5 rows=1 width=12) (actual time=0.0..0.0 rows=1 loops=5200)
Output: (jsonb_array_length(inner_scr.code_references)), inner_scr.created_at
Buffers: shared hit=20800
-> Index Scan using bench_cr_feature_repo_created_idx on public.bench_code_references_scannedcodereferences inner_scr (cost=0.4..8.5 rows=1 width=12) (actual time=0.0..0.0 rows=1 loops=5200)
Output: jsonb_array_length(inner_scr.code_references), inner_scr.created_at
Index Cond: ((inner_scr.feature_id = scr.feature_id) AND (inner_scr.repository_id = scr.repository_id) AND (inner_scr.created_at >= (now() - '3 mons'::interval)) AND (inner_scr.created_at = r.last_scanned_at))
Buffers: shared hit=20800
Query Identifier: 5778658752241488958
Planning:
Buffers: shared hit=12
Planning Time: 0.3 ms
Execution Time: 26.7 ms
(52 rows)

Time: 300.2 ms

--- DETAIL QUERY across 100 features (single plan, 100 loops on the inner scan) ---
QUERY PLAN

Incremental Sort (cost=1041.5..16824.9 rows=729 width=1575) (actual time=109.8..110.4 rows=80 loops=1)
Output: s.feature_id, scr.id, scr.created_at, scr.revision, scr.code_references, r.url, r.vcs_provider, r.last_scanned_at
Sort Key: s.feature_id, r.url
Presorted Key: s.feature_id
Full-sort Groups: 3 Sort Method: quicksort Average Memory: 76kB Peak Memory: 76kB
Buffers: shared hit=27195
-> Nested Loop (cost=1001.0..16799.0 rows=729 width=1575) (actual time=109.3..110.3 rows=80 loops=1)
Output: s.feature_id, scr.id, scr.created_at, scr.revision, scr.code_references, r.url, r.vcs_provider, r.last_scanned_at
Buffers: shared hit=27195
-> Limit (cost=1000.6..6528.0 rows=100 width=4) (actual time=109.3..109.4 rows=40 loops=1)
Output: s.feature_id
Buffers: shared hit=25635
-> Unique (cost=1000.6..22557.4 rows=390 width=4) (actual time=109.3..109.4 rows=40 loops=1)
Output: s.feature_id
Buffers: shared hit=25635
-> Gather Merge (cost=1000.6..22556.4 rows=396 width=4) (actual time=109.3..109.4 rows=43 loops=1)
Output: s.feature_id
Workers Planned: 2
Workers Launched: 2
Buffers: shared hit=25635
-> Unique (cost=0.6..21510.7 rows=198 width=4) (actual time=83.3..90.0 rows=14 loops=3)
Output: s.feature_id
Buffers: shared hit=25635
Worker 0: actual time=75.1..83.3 rows=40 loops=1
Buffers: shared hit=15030
Worker 1: actual time=76.5..88.2 rows=3 loops=1
Buffers: shared hit=5502
-> Nested Loop (cost=0.6..21510.2 rows=198 width=4) (actual time=83.3..90.0 rows=27 loops=3)
Output: s.feature_id
Inner Unique: true
Buffers: shared hit=25635
Worker 0: actual time=75.0..83.3 rows=77 loops=1
Buffers: shared hit=15030
Worker 1: actual time=76.5..88.2 rows=3 loops=1
Buffers: shared hit=5502
-> Parallel Index Only Scan using bench_cr_feature_repo_created_idx on public.bench_code_references_scannedcodereferences s (cost=0.4..17668.6 rows=137042 width=16) (actual time=0.0..40.2 rows=109633 loops=3)
Output: s.feature_id, s.repository_id, s.created_at
Heap Fetches: 10400
Buffers: shared hit=19133
Worker 0: actual time=0.0..23.5 rows=104455 loops=1
Buffers: shared hit=12689
Worker 1: actual time=0.0..42.8 rows=102664 loops=1
Buffers: shared hit=3161
-> Memoize (cost=0.1..0.2 rows=1 width=12) (actual time=0.0..0.0 rows=0 loops=328900)
Output: r_1.id, r_1.last_scanned_at
Cache Key: s.repository_id, s.created_at
Cache Mode: logical
Hits: 120871 Misses: 910 Evictions: 0 Overflows: 0 Memory Usage: 72kB
Buffers: shared hit=6502
Worker 0: actual time=0.0..0.0 rows=0 loops=104455
Hits: 103285 Misses: 1170 Evictions: 0 Overflows: 0 Memory Usage: 92kB
Buffers: shared hit=2341
Worker 1: actual time=0.0..0.0 rows=0 loops=102664
Hits: 101494 Misses: 1170 Evictions: 0 Overflows: 0 Memory Usage: 92kB
Buffers: shared hit=2341
-> Index Scan using bench_code_references_vcsrepository_pkey on public.bench_code_references_vcsrepository r_1 (cost=0.1..0.2 rows=1 width=12) (actual time=0.0..0.0 rows=0 loops=3250)
Output: r_1.id, r_1.last_scanned_at
Index Cond: (r_1.id = s.repository_id)
Filter: ((r_1.project_id = 25969) AND (s.created_at = r_1.last_scanned_at))
Rows Removed by Filter: 1
Buffers: shared hit=6502
Worker 0: actual time=0.0..0.0 rows=0 loops=1170
Buffers: shared hit=2341
Worker 1: actual time=0.0..0.0 rows=0 loops=1170
Buffers: shared hit=2341
-> Nested Loop (cost=0.4..102.6 rows=12 width=1575) (actual time=0.0..0.0 rows=2 loops=40)
Output: scr.id, scr.created_at, scr.revision, scr.code_references, scr.feature_id, r.url, r.vcs_provider, r.last_scanned_at
Buffers: shared hit=1560
-> Seq Scan on public.bench_code_references_vcsrepository r (cost=0.0..1.1 rows=12 width=65) (actual time=0.0..0.0 rows=12 loops=40)
Output: r.url, r.vcs_provider, r.last_scanned_at, r.id
Buffers: shared hit=40
-> Index Scan using bench_cr_feature_repo_created_idx on public.bench_code_references_scannedcodereferences scr (cost=0.4..8.4 rows=1 width=1518) (actual time=0.0..0.0 rows=0 loops=480)
Output: scr.id, scr.created_at, scr.revision, scr.code_references, scr.code_references_hash, scr.feature_id, scr.repository_id
Index Cond: ((scr.feature_id = s.feature_id) AND (scr.repository_id = r.id) AND (scr.created_at = r.last_scanned_at))
Buffers: shared hit=1520
Query Identifier: 180893212974011606
Planning:
Buffers: shared hit=36
Planning Time: 1.2 ms
Execution Time: 110.5 ms
(80 rows)

Time: 364.8 ms

Scenario 2, exaggerated bad case but plausible: large project, many repos

400 features, 10 repos, ~3.5 unique scans/repo/week, 6 months retained, 318,500 rows in bench_code_references_scannedcodereferences.

query time
list endpoint (full history) 1,805 ms
list endpoint (3-month window) 1,488 ms (~18% faster)
detail endpoint × 100 features 85 ms total (≈0.85 ms each)

The list endpoint stays above 1 s at this scale even with the covering index, because the inner subplan still loops once per (feature, repository) pair (318,500 loops, 1 row each). The 3-month window helps less than expected because the seed distributes scans uniformly over 6 months.

EXPLAIN ANALYZE full output
--- LIST ANNOTATION (full project) ---
                                                                                                                   QUERY PLAN                                                                                                                   
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 Index Scan using features_feature_pkey on public.features_feature f  (cost=0.3..564849.7 rows=77 width=36) (actual time=12.8..1804.5 rows=400 loops=1)
   Output: f.id, (SubPlan 2)
   Filter: ((f.deleted_at IS NULL) AND (f.project_id = 25968))
   Rows Removed by Filter: 32295
   Buffers: shared hit=1297014
   SubPlan 2
     ->  Aggregate  (cost=7315.6..7315.6 rows=1 width=32) (actual time=4.5..4.5 rows=1 loops=400)
           Output: array_agg((jsonb_build_object('repository_url', r.url, 'last_successful_repository_scanned_at', r.last_scanned_at, 'last_feature_found_at', scr.created_at, 'count', COALESCE((SubPlan 1), 0))))
           Buffers: shared hit=1283893
           ->  Unique  (cost=7311.3..7315.5 rows=12 width=86) (actual time=4.4..4.5 rows=9 loops=400)
                 Output: (jsonb_build_object('repository_url', r.url, 'last_successful_repository_scanned_at', r.last_scanned_at, 'last_feature_found_at', scr.created_at, 'count', COALESCE((SubPlan 1), 0))), r.url, scr.created_at
                 Buffers: shared hit=1283893
                 ->  Sort  (cost=7311.3..7313.4 rows=843 width=86) (actual time=4.4..4.4 rows=796 loops=400)
                       Output: (jsonb_build_object('repository_url', r.url, 'last_successful_repository_scanned_at', r.last_scanned_at, 'last_feature_found_at', scr.created_at, 'count', COALESCE((SubPlan 1), 0))), r.url, scr.created_at
                       Sort Key: r.url, scr.created_at DESC
                       Sort Method: quicksort  Memory: 25kB
                       Buffers: shared hit=1283893
                       ->  Hash Join  (cost=1.7..7270.3 rows=843 width=86) (actual time=0.0..3.7 rows=796 loops=400)
                             Output: jsonb_build_object('repository_url', r.url, 'last_successful_repository_scanned_at', r.last_scanned_at, 'last_feature_found_at', scr.created_at, 'count', COALESCE((SubPlan 1), 0)), r.url, scr.created_at
                             Inner Unique: true
                             Hash Cond: (scr.repository_id = r.id)
                             Buffers: shared hit=1283887
                             ->  Index Only Scan using bench_cr_feature_repo_created_idx on public.bench_code_references_scannedcodereferences scr  (cost=0.4..142.7 rows=843 width=16) (actual time=0.0..0.1 rows=796 loops=400)
                                   Output: scr.feature_id, scr.repository_id, scr.created_at
                                   Index Cond: (scr.feature_id = f.id)
                                   Heap Fetches: 0
                                   Buffers: shared hit=9886
                             ->  Hash  (cost=1.1..1.1 rows=12 width=58) (actual time=0.0..0.0 rows=12 loops=1)
                                   Output: r.url, r.last_scanned_at, r.id
                                   Buckets: 1024  Batches: 1  Memory Usage: 10kB
                                   Buffers: shared hit=1
                                   ->  Seq Scan on public.bench_code_references_vcsrepository r  (cost=0.0..1.1 rows=12 width=58) (actual time=0.0..0.0 rows=12 loops=1)
                                         Output: r.url, r.last_scanned_at, r.id
                                         Buffers: shared hit=1
                             SubPlan 1
                               ->  Limit  (cost=0.4..8.4 rows=1 width=12) (actual time=0.0..0.0 rows=1 loops=318500)
                                     Output: (jsonb_array_length(inner_scr.code_references)), inner_scr.created_at
                                     Buffers: shared hit=1274000
                                     ->  Index Scan using bench_cr_feature_repo_created_idx on public.bench_code_references_scannedcodereferences inner_scr  (cost=0.4..8.4 rows=1 width=12) (actual time=0.0..0.0 rows=1 loops=318500)
                                           Output: jsonb_array_length(inner_scr.code_references), inner_scr.created_at
                                           Index Cond: ((inner_scr.feature_id = scr.feature_id) AND (inner_scr.repository_id = scr.repository_id) AND (inner_scr.created_at = r.last_scanned_at))
                                           Buffers: shared hit=1274000
 Query Identifier: -4966899296074004523
 Planning:
   Buffers: shared hit=411
 Planning Time: 1.1 ms
 Execution Time: 1804.6 ms
(47 rows)

Time: 2152.9 ms (00:2.2)

--- LIST ANNOTATION with 3-month window ---
QUERY PLAN

Index Scan using features_feature_pkey on public.features_feature f (cost=0.3..281069.1 rows=77 width=36) (actual time=9.6..1488.2 rows=400 loops=1)
Output: f.id, (SubPlan 2)
Filter: ((f.deleted_at IS NULL) AND (f.project_id = 25968))
Rows Removed by Filter: 32295
Buffers: shared hit=650730
SubPlan 2
-> Aggregate (cost=3630.2..3630.2 rows=1 width=32) (actual time=3.7..3.7 rows=1 loops=400)
Output: array_agg((jsonb_build_object('repository_url', r.url, 'last_successful_repository_scanned_at', r.last_scanned_at, 'last_feature_found_at', scr.created_at, 'count', COALESCE((SubPlan 1), 0))))
Buffers: shared hit=637609
-> Unique (cost=3627.9..3630.0 rows=12 width=86) (actual time=3.6..3.7 rows=9 loops=400)
Output: (jsonb_build_object('repository_url', r.url, 'last_successful_repository_scanned_at', r.last_scanned_at, 'last_feature_found_at', scr.created_at, 'count', COALESCE((SubPlan 1), 0))), r.url, scr.created_at
Buffers: shared hit=637609
-> Sort (cost=3627.9..3629.0 rows=416 width=86) (actual time=3.6..3.7 rows=394 loops=400)
Output: (jsonb_build_object('repository_url', r.url, 'last_successful_repository_scanned_at', r.last_scanned_at, 'last_feature_found_at', scr.created_at, 'count', COALESCE((SubPlan 1), 0))), r.url, scr.created_at
Sort Key: r.url, scr.created_at DESC
Sort Method: quicksort Memory: 25kB
Buffers: shared hit=637609
-> Hash Join (cost=1.7..3609.8 rows=416 width=86) (actual time=0.0..3.3 rows=394 loops=400)
Output: jsonb_build_object('repository_url', r.url, 'last_successful_repository_scanned_at', r.last_scanned_at, 'last_feature_found_at', scr.created_at, 'count', COALESCE((SubPlan 1), 0)), r.url, scr.created_at
Inner Unique: true
Hash Cond: (scr.repository_id = r.id)
Buffers: shared hit=637609
-> Index Only Scan using bench_cr_feature_repo_created_idx on public.bench_code_references_scannedcodereferences scr (cost=0.4..88.8 rows=416 width=16) (actual time=0.0..0.1 rows=394 loops=400)
Output: scr.feature_id, scr.repository_id, scr.created_at
Index Cond: ((scr.feature_id = f.id) AND (scr.created_at >= (now() - '3 mons'::interval)))
Heap Fetches: 0
Buffers: shared hit=7608
-> Hash (cost=1.1..1.1 rows=12 width=58) (actual time=0.0..0.0 rows=12 loops=1)
Output: r.url, r.last_scanned_at, r.id
Buckets: 1024 Batches: 1 Memory Usage: 10kB
Buffers: shared hit=1
-> Seq Scan on public.bench_code_references_vcsrepository r (cost=0.0..1.1 rows=12 width=58) (actual time=0.0..0.0 rows=12 loops=1)
Output: r.url, r.last_scanned_at, r.id
Buffers: shared hit=1
SubPlan 1
-> Limit (cost=0.4..8.5 rows=1 width=12) (actual time=0.0..0.0 rows=1 loops=157500)
Output: (jsonb_array_length(inner_scr.code_references)), inner_scr.created_at
Buffers: shared hit=630000
-> Index Scan using bench_cr_feature_repo_created_idx on public.bench_code_references_scannedcodereferences inner_scr (cost=0.4..8.5 rows=1 width=12) (actual time=0.0..0.0 rows=1 loops=157500)
Output: jsonb_array_length(inner_scr.code_references), inner_scr.created_at
Index Cond: ((inner_scr.feature_id = scr.feature_id) AND (inner_scr.repository_id = scr.repository_id) AND (inner_scr.created_at >= (now() - '3 mons'::interval)) AND (inner_scr.created_at = r.last_scanned_at))
Buffers: shared hit=630000
Query Identifier: 5778658752241488958
Planning:
Buffers: shared hit=12
Planning Time: 0.3 ms
Execution Time: 1488.4 ms
(47 rows)

Time: 1744.8 ms (00:1.7)

--- DETAIL QUERY across 100 features (single plan, 100 loops on the inner scan) ---
QUERY PLAN

Incremental Sort (cost=1041.6..16836.9 rows=729 width=1575) (actual time=78.2..84.4 rows=1000 loops=1)
Output: s.feature_id, scr.id, scr.created_at, scr.revision, scr.code_references, r.url, r.vcs_provider, r.last_scanned_at
Sort Key: s.feature_id, r.url
Presorted Key: s.feature_id
Full-sort Groups: 25 Sort Method: quicksort Average Memory: 88kB Peak Memory: 88kB
Buffers: shared hit=28571
-> Nested Loop (cost=1001.0..16810.9 rows=729 width=1575) (actual time=77.8..81.6 rows=1000 loops=1)
Output: s.feature_id, scr.id, scr.created_at, scr.revision, scr.code_references, r.url, r.vcs_provider, r.last_scanned_at
Buffers: shared hit=28571
-> Limit (cost=1000.6..6539.9 rows=100 width=4) (actual time=77.7..78.0 rows=100 loops=1)
Output: s.feature_id
Buffers: shared hit=23871
-> Unique (cost=1000.6..22603.9 rows=390 width=4) (actual time=77.7..78.0 rows=100 loops=1)
Output: s.feature_id
Buffers: shared hit=23871
-> Gather Merge (cost=1000.6..22602.0 rows=780 width=4) (actual time=77.7..78.0 rows=200 loops=1)
Output: s.feature_id
Workers Planned: 2
Workers Launched: 2
Buffers: shared hit=23871
-> Unique (cost=0.6..21511.9 rows=390 width=4) (actual time=0.2..45.3 rows=234 loops=3)
Output: s.feature_id
Buffers: shared hit=23871
Worker 0: actual time=0.2..66.3 rows=350 loops=1
Buffers: shared hit=17134
Worker 1: actual time=0.1..69.4 rows=350 loops=1
Buffers: shared hit=6440
-> Nested Loop (cost=0.6..21509.5 rows=988 width=4) (actual time=0.2..45.2 rows=1167 loops=3)
Output: s.feature_id
Inner Unique: true
Buffers: shared hit=23871
Worker 0: actual time=0.2..66.1 rows=1810 loops=1
Buffers: shared hit=17134
Worker 1: actual time=0.1..69.3 rows=1689 loops=1
Buffers: shared hit=6440
-> Parallel Index Only Scan using bench_cr_feature_repo_created_idx on public.bench_code_references_scannedcodereferences s (cost=0.4..17668.6 rows=137042 width=16) (actual time=0.0..12.1 rows=109633 loops=3)
Output: s.feature_id, s.repository_id, s.created_at
Heap Fetches: 10400
Buffers: shared hit=19127
Worker 0: actual time=0.0..20.4 rows=173049 loops=1
Buffers: shared hit=14793
Worker 1: actual time=0.0..16.0 rows=155706 loops=1
Buffers: shared hit=4327
-> Memoize (cost=0.1..0.2 rows=1 width=12) (actual time=0.0..0.0 rows=0 loops=328900)
Output: r_1.id, r_1.last_scanned_at
Cache Key: s.repository_id, s.created_at
Cache Mode: logical
Hits: 0 Misses: 145 Evictions: 0 Overflows: 0 Memory Usage: 12kB
Buffers: shared hit=4744
Worker 0: actual time=0.0..0.0 rows=0 loops=173049
Hits: 171879 Misses: 1170 Evictions: 0 Overflows: 0 Memory Usage: 92kB
Buffers: shared hit=2341
Worker 1: actual time=0.0..0.0 rows=0 loops=155706
Hits: 154650 Misses: 1056 Evictions: 0 Overflows: 0 Memory Usage: 83kB
Buffers: shared hit=2113
-> Index Scan using bench_code_references_vcsrepository_pkey on public.bench_code_references_vcsrepository r_1 (cost=0.1..0.2 rows=1 width=12) (actual time=0.0..0.0 rows=0 loops=2371)
Output: r_1.id, r_1.last_scanned_at
Index Cond: (r_1.id = s.repository_id)
Filter: ((r_1.project_id = 25968) AND (s.created_at = r_1.last_scanned_at))
Rows Removed by Filter: 1
Buffers: shared hit=4744
Worker 0: actual time=0.0..0.0 rows=0 loops=1170
Buffers: shared hit=2341
Worker 1: actual time=0.0..0.0 rows=0 loops=1056
Buffers: shared hit=2113
-> Nested Loop (cost=0.4..102.6 rows=12 width=1575) (actual time=0.0..0.0 rows=10 loops=100)
Output: scr.id, scr.created_at, scr.revision, scr.code_references, scr.feature_id, r.url, r.vcs_provider, r.last_scanned_at
Buffers: shared hit=4700
-> Seq Scan on public.bench_code_references_vcsrepository r (cost=0.0..1.1 rows=12 width=65) (actual time=0.0..0.0 rows=12 loops=100)
Output: r.url, r.vcs_provider, r.last_scanned_at, r.id
Buffers: shared hit=100
-> Index Scan using bench_cr_feature_repo_created_idx on public.bench_code_references_scannedcodereferences scr (cost=0.4..8.4 rows=1 width=1518) (actual time=0.0..0.0 rows=1 loops=1200)
Output: scr.id, scr.created_at, scr.revision, scr.code_references, scr.code_references_hash, scr.feature_id, scr.repository_id
Index Cond: ((scr.feature_id = s.feature_id) AND (scr.repository_id = r.id) AND (scr.created_at = r.last_scanned_at))
Buffers: shared hit=4600
Query Identifier: 180893212974011606
Planning:
Buffers: shared hit=32
Planning Time: 0.4 ms
Execution Time: 84.5 ms
(80 rows)

Time: 343.6 ms

models.UniqueConstraint( # Supports batch-insert with ignore-conflicts
fields=["feature", "repository", "code_references_hash"],
name="unique_scanned_code_references",
),
]
indexes = [
models.Index(
fields=["project", "repository_url", "-created_at"],
name="code_ref_proj_repo_created_idx",
),
models.Index( # Supports finding the latest scan for a feature/repository
fields=["feature", "repository", "created_at"],
name="cr_feature_repo_created_idx",
),
]
Loading
Loading