Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 19 additions & 5 deletions data/darktableconfig.xml.in
Original file line number Diff line number Diff line change
Expand Up @@ -3558,11 +3558,25 @@
<longdescription>radius of structuring element for morphological open+close cleanup. removes small protrusions and fills small holes in the mask. 0 = disabled, 1-3 = typical values.</longdescription>
</dtconfig>
<dtconfig>
<name>plugins/darkroom/masks/object/edge_refine</name>
<type min="0.0" max="0.5">float</type>
<default>0.2</default>
<shortdescription>AI mask edge refinement</shortdescription>
<longdescription>strength of edge-aware threshold boost. near strong image edges the binarization threshold is raised, snapping the mask boundary to object edges. 0 = disabled, 0.1-0.3 = typical values.</longdescription>
<name>plugins/darkroom/masks/object/guided_radius</name>
<type min="0" max="20">int</type>
<default>5</default>
<shortdescription>AI mask guided filter radius</shortdescription>
<longdescription>radius of the guided filter used to snap the mask boundary to image edges. larger values produce smoother boundaries but lose fine detail. 0 = disabled, 3-8 = typical values.</longdescription>
</dtconfig>
<dtconfig>
<name>plugins/darkroom/masks/object/guided_eps</name>
<type min="0.001" max="1.0">float</type>
<default>0.01</default>
<shortdescription>AI mask guided filter edge sensitivity</shortdescription>
<longdescription>edge sensitivity for the guided filter. smaller values preserve finer edges. 0.001 = very sharp edges, 0.1 = soft edges, 1.0 = almost no edge preservation.</longdescription>
</dtconfig>
<dtconfig>
<name>plugins/darkroom/masks/object/render_size</name>
<type min="1024">int</type>
<default>1024</default>
<shortdescription>AI mask render resolution</shortdescription>
<longdescription>target resolution (longest side in pixels) for rendering the image before AI mask encoding. higher values improve edge accuracy but increase processing time. the AI encoder always works at 1024px internally.</longdescription>
</dtconfig>
<dtconfig>
<name>plugins/darkroom/masks/object/brush_size</name>
Expand Down
68 changes: 45 additions & 23 deletions src/common/ai/segmentation.c
Original file line number Diff line number Diff line change
Expand Up @@ -557,14 +557,20 @@ void dt_seg_warmup_decoder(dt_seg_context_t *ctx)
{
int64_t iou_shape[2] = {1, nm};
int64_t lr_shape[4] = {1, nm, pm_dim, pm_dim};
const int dec_outputs = dt_ai_get_output_count(ctx->decoder);

outputs[0] = (dt_ai_tensor_t){
.data = masks, .type = DT_AI_FLOAT, .shape = masks_shape, .ndim = 4};
outputs[1] = (dt_ai_tensor_t){
.data = iou_buf, .type = DT_AI_FLOAT, .shape = iou_shape, .ndim = 2};
outputs[2] = (dt_ai_tensor_t){
.data = low_res, .type = DT_AI_FLOAT, .shape = lr_shape, .ndim = 4};
n_out = 3;
n_out = 2;
// low_res_masks output is optional (absent in 256x256 decoders)
if(dec_outputs >= 3)
{
outputs[2] = (dt_ai_tensor_t){
.data = low_res, .type = DT_AI_FLOAT, .shape = lr_shape, .ndim = 4};
n_out = 3;
}
}
else
{
Expand Down Expand Up @@ -825,27 +831,33 @@ float *dt_seg_compute_mask(dt_seg_context_t *ctx,

if(is_sam)
{
// SAM: 3 outputs -- masks [1,N,H,W], iou [1,N], low_res [1,N,pm_dim,pm_dim]
const size_t low_res_per = (size_t)pm_dim * pm_dim;
low_res = g_try_malloc((size_t)nm * low_res_per * sizeof(float));
if(!low_res)
{
g_free(point_coords);
g_free(point_labels);
g_free(masks);
return NULL;
}

// SAM: masks [1,N,H,W] + iou [1,N], optionally low_res [1,N,pm,pm]
int64_t iou_shape[2] = {1, nm};
int64_t low_res_shape[4] = {1, nm, pm_dim, pm_dim};
const int dec_out_count = dt_ai_get_output_count(ctx->decoder);

dec_outputs[0] = (dt_ai_tensor_t){
.data = masks, .type = DT_AI_FLOAT, .shape = masks_shape, .ndim = 4};
dec_outputs[1] = (dt_ai_tensor_t){
.data = iou_pred, .type = DT_AI_FLOAT, .shape = iou_shape, .ndim = 2};
dec_outputs[2] = (dt_ai_tensor_t){
.data = low_res, .type = DT_AI_FLOAT, .shape = low_res_shape, .ndim = 4};
n_dec_out = 3;
n_dec_out = 2;

// low_res_masks output is optional (absent in 256x256 decoders)
if(dec_out_count >= 3)
{
const size_t low_res_per = (size_t)pm_dim * pm_dim;
low_res = g_try_malloc((size_t)nm * low_res_per * sizeof(float));
if(!low_res)
{
g_free(point_coords);
g_free(point_labels);
g_free(masks);
return NULL;
}
int64_t low_res_shape[4] = {1, nm, pm_dim, pm_dim};
dec_outputs[2] = (dt_ai_tensor_t){
.data = low_res, .type = DT_AI_FLOAT, .shape = low_res_shape, .ndim = 4};
n_dec_out = 3;
}
}
else
{
Expand Down Expand Up @@ -897,11 +909,21 @@ float *dt_seg_compute_mask(dt_seg_context_t *ctx,
"[segmentation] mask computed (%.3fs), best=%d/%d IoU=%.3f",
dec_elapsed, best, nm, iou_pred[best]);

// cache the best low-res mask for iterative refinement
const size_t low_res_per = (size_t)pm_dim * pm_dim;
memcpy(ctx->prev_mask, low_res + (size_t)best * low_res_per,
low_res_per * sizeof(float));
g_free(low_res);
// cache the best mask for iterative refinement
if(low_res)
{
// use dedicated low_res output (1024x1024 decoder)
const size_t low_res_per = (size_t)pm_dim * pm_dim;
memcpy(ctx->prev_mask, low_res + (size_t)best * low_res_per,
low_res_per * sizeof(float));
g_free(low_res);
}
else
{
// masks output is already at prev_mask resolution (256x256 decoder)
memcpy(ctx->prev_mask, masks + (size_t)best * per_mask,
per_mask * sizeof(float));
}
}
else
{
Expand Down
121 changes: 57 additions & 64 deletions src/develop/masks/object.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include "common/ai_models.h"
#include "common/colorspaces.h"
#include "common/debug.h"
#include "common/guided_filter.h"
#include "common/mipmap_cache.h"
#include "common/ras2vect.h"
#include "control/conf.h"
Expand All @@ -40,16 +41,21 @@
#define CONF_OBJECT_THRESHOLD_KEY "plugins/darkroom/masks/object/threshold"
#define CONF_OBJECT_REFINE_KEY "plugins/darkroom/masks/object/refine_passes"
#define CONF_OBJECT_MORPH_KEY "plugins/darkroom/masks/object/morph_radius"
#define CONF_OBJECT_EDGE_REFINE_KEY "plugins/darkroom/masks/object/edge_refine"
#define CONF_OBJECT_GUIDED_RADIUS_KEY "plugins/darkroom/masks/object/guided_radius"
#define CONF_OBJECT_GUIDED_EPS_KEY "plugins/darkroom/masks/object/guided_eps"
#define CONF_OBJECT_CLEANUP_KEY "plugins/darkroom/masks/object/cleanup"
#define CONF_OBJECT_SMOOTHING_KEY "plugins/darkroom/masks/object/smoothing"
#define CONF_OBJECT_FEATHER_KEY "plugins/darkroom/masks/object/feather"
#define CONF_OBJECT_PERSIST_KEY "plugins/darkroom/masks/object/persist_model"
#define CONF_OBJECT_PATH_PREVIEW_KEY "plugins/darkroom/masks/object/path_preview"

// target resolution for segmentation encoding (longest side in pixels),
// matches the encoder input size (1024) -- rendering higher just to
// downscale in preprocessing wastes pipeline time with no quality gain
#define SEG_ENCODE_TARGET 1024
// default render target (longest side in pixels).
// the SAM encoder internally downscales to 1024 so encoding quality
// is the same, but higher render resolution gives the guided filter
// and vectorizer more detail for edge refinement.
// configurable via plugins/darkroom/masks/object/render_size
#define SEG_RENDER_DEFAULT 1024
#define CONF_OBJECT_RENDER_SIZE_KEY "plugins/darkroom/masks/object/render_size"

// --- per-session segmentation state (stored in gui->scratchpad) ---

Expand Down Expand Up @@ -309,8 +315,11 @@ static gpointer _encode_thread_func(gpointer data)
dt_dev_pixelpipe_get_dimensions(&pipe, &dev, pipe.iwidth, pipe.iheight,
&pipe.processed_width, &pipe.processed_height);

const double scale = fmin((double)SEG_ENCODE_TARGET / (double)pipe.processed_width,
(double)SEG_ENCODE_TARGET / (double)pipe.processed_height);
const int render_target = dt_conf_key_exists(CONF_OBJECT_RENDER_SIZE_KEY)
? MAX(dt_conf_get_int(CONF_OBJECT_RENDER_SIZE_KEY), 1024)
: SEG_RENDER_DEFAULT;
const double scale = fmin((double)render_target / (double)pipe.processed_width,
(double)render_target / (double)pipe.processed_height);
const double final_scale = fmin(scale, 1.0); // don't upscale
const int out_w = (int)(final_scale * pipe.processed_width);
const int out_h = (int)(final_scale * pipe.processed_height);
Expand Down Expand Up @@ -599,73 +608,51 @@ static void _morph_open_close(float *mask, int w, int h, float threshold, int ra
g_free(tmp);
}

// edge-aware threshold refinement: near strong image edges the binarization
// threshold is raised by up to edge_boost, snapping the mask boundary to
// actual object contours - uses Scharr gradient of the stored RGB image
static void _edge_refine_threshold(float *mask, int mw, int mh,
const uint8_t *rgb, int rgb_w, int rgb_h,
float base_threshold, float edge_boost)
// edge-aware mask refinement using guided filter: smooths the mask in
// flat regions while preserving sharp transitions at image edges.
// the stored RGB image is used as the guide
static void _guided_filter_refine(float *mask,
const int mw,
const int mh,
const uint8_t *rgb,
const int rgb_w,
const int rgb_h,
const int radius,
const float sqrt_eps)
{
if(edge_boost <= 0.0f || !rgb || rgb_w < 3 || rgb_h < 3)
if(!rgb || rgb_w < 3 || rgb_h < 3)
return;
if(mw != rgb_w || mh != rgb_h)
return;

const size_t npix = (size_t)mw * mh;

// step 1: convert uint8 RGB to float luminance (Rec.601)
float *lum = g_try_malloc(npix * sizeof(float));
if(!lum) return;
// convert uint8 RGB to float RGBA guide (guided_filter expects 4ch)
float *guide = dt_alloc_align_float(npix * 4);
if(!guide) return;

for(size_t i = 0; i < npix; i++)
lum[i] = (0.299f * (float)rgb[i * 3]
+ 0.587f * (float)rgb[i * 3 + 1]
+ 0.114f * (float)rgb[i * 3 + 2]) / 255.0f;

// step 2: compute Scharr gradient magnitude, track max for normalization
float *grad = g_try_malloc(npix * sizeof(float));
if(!grad)
{
g_free(lum);
return;
guide[i * 4 + 0] = (float)rgb[i * 3 + 0] / 255.0f;
guide[i * 4 + 1] = (float)rgb[i * 3 + 1] / 255.0f;
guide[i * 4 + 2] = (float)rgb[i * 3 + 2] / 255.0f;
guide[i * 4 + 3] = 0.0f;
}

float grad_max = 0.0f;

for(int y = 0; y < mh; y++)
// run guided filter: smooths mask but preserves edges from the guide
float *mask_bak = dt_alloc_align_float(npix);
if(!mask_bak)
{
for(int x = 0; x < mw; x++)
{
float g = 0.0f;
if(y >= 1 && y < mh - 1 && x >= 1 && x < mw - 1)
{
const float *p = &lum[y * mw + x];
const float gx = (47.0f / 255.0f) * (p[-mw - 1] - p[-mw + 1]
+ p[mw - 1] - p[mw + 1])
+ (162.0f / 255.0f) * (p[-1] - p[1]);
const float gy = (47.0f / 255.0f) * (p[-mw - 1] - p[mw - 1]
+ p[-mw + 1] - p[mw + 1])
+ (162.0f / 255.0f) * (p[-mw] - p[mw]);
g = sqrtf(gx * gx + gy * gy);
}
grad[y * mw + x] = g;
if(g > grad_max) grad_max = g;
}
dt_free_align(guide);
return;
}

g_free(lum);
memcpy(mask_bak, mask, npix * sizeof(float));
guided_filter(guide, mask_bak, mask, mw, mh, 4,
radius, sqrt_eps, 1.0f, 0.0f, 1.0f);

// step 3: normalize and apply spatially-varying threshold
const float inv_max = (grad_max > 1e-6f) ? 1.0f / grad_max : 0.0f;

for(size_t i = 0; i < npix; i++)
{
const float g_norm = grad[i] * inv_max;
const float effective_thresh = base_threshold + edge_boost * g_norm;
mask[i] = (mask[i] > effective_thresh) ? 1.0f : 0.0f;
}

g_free(grad);
dt_free_align(mask_bak);
dt_free_align(guide);
}

// run the decoder with accumulated points and update the cached mask
Expand Down Expand Up @@ -741,12 +728,13 @@ static void _run_decoder(dt_masks_form_gui_t *gui)
seed_y = CLAMP(seed_y, 0, mh - 1);
const float threshold = CLAMP(dt_conf_get_float(CONF_OBJECT_THRESHOLD_KEY), 0.3f, 0.9f);

// edge-aware threshold refinement: snap mask boundary to image edges
const float edge_boost = CLAMP(dt_conf_get_float(CONF_OBJECT_EDGE_REFINE_KEY), 0.0f, 0.5f);
if(edge_boost > 0.0f && d->encode_rgb)
_edge_refine_threshold(mask, mw, mh,
d->encode_rgb, d->encode_rgb_w, d->encode_rgb_h,
threshold, edge_boost);
// guided filter edge refinement: snap mask boundary to image edges
const int gf_radius = CLAMP(dt_conf_get_int(CONF_OBJECT_GUIDED_RADIUS_KEY), 0, 20);
const float gf_eps = CLAMP(dt_conf_get_float(CONF_OBJECT_GUIDED_EPS_KEY), 0.001f, 1.0f);
if(gf_radius > 0 && d->encode_rgb)
_guided_filter_refine(mask, mw, mh,
d->encode_rgb, d->encode_rgb_w, d->encode_rgb_h,
gf_radius, sqrtf(gf_eps));

_keep_seed_component(mask, mw, mh, threshold, seed_x, seed_y);

Expand All @@ -770,6 +758,11 @@ static void _update_preview(_object_data_t *d)
if(!d->mask || d->mask_w <= 0 || d->mask_h <= 0)
return;

// skip vectorization when path preview is disabled
if(dt_conf_key_exists(CONF_OBJECT_PATH_PREVIEW_KEY)
&& !dt_conf_get_bool(CONF_OBJECT_PATH_PREVIEW_KEY))
return;

const size_t n = (size_t)d->mask_w * d->mask_h;
float *inv_mask = g_try_malloc(n * sizeof(float));
if(!inv_mask) return;
Expand Down
Loading