dual use text encoder/kv fast edits #13853

kwal559 · 2026-06-01T19:39:49Z

kwal559
Jun 1, 2026

This script explores FLUX.2 klein 9B-KV. A prompt is passed directly to the text encoder to be enhanced: the encoder is allowed to think, and its final output is captured. We then create variation prompts and feed them back to the same text encoder to get embeddings. One component with a dual use saves memory; quantize it if you want to save time. We include the initial image and pile on the variation prompts, and batch generation is used for speed. The result is a grid of consistent characters in different poses/image challenges. If you want to see magic, load an SVDQ or similarly small transformer and set the image count to 100: on an RTX 4090, 100 pictures (128x128) generate in less than 10 seconds, each image is unique, and the character remains.

import torch,diffusers,gc,time,psutil,random
from PIL import Image

def flush():
    """Free cached memory and print a short memory report.

    Runs the garbage collector, empties the CUDA caching allocator, then
    prints (in GiB) the memory PyTorch still has reserved, the device memory
    that is not reported as free, and the host RAM in use.
    """
    gib = 1024**3  # bytes per GiB
    gc.collect()
    torch.cuda.empty_cache()
    print(f"🧹✂️ {torch.cuda.memory_reserved()/gib:.1f}GB")
    # mem_get_info() returns (free, total) in bytes; using the reported total
    # instead of a fixed 24 keeps the figure right on cards of any size.
    free_bytes, total_bytes = torch.cuda.mem_get_info()
    print(f"VRAM: {(total_bytes - free_bytes)/gib:.2f}GB | RAM: {psutil.virtual_memory().used/gib:.1f}GB")

# Hugging Face repo ids: `model_id` is what the pipelines are loaded from,
# `kv_tran` is the repo the transformer weights are loaded from.
model_id = "black-forest-labs/FLUX.2-klein-9B"
kv_tran = "black-forest-labs/FLUX.2-klein-9b-kv"

def _split_reasoning(raw_response):
    """Split a decoded generation into ``(reasoning, final_prompt)``.

    The text before the first ``</think>`` (or, failing that, the first
    ``<tool_call>``) is treated as reasoning and the rest as the prompt.
    When neither marker is present the reasoning is an empty string and the
    whole response is the prompt.
    """
    reasoning, answer = "", raw_response
    for marker in ("</think>", "<tool_call>"):
        if marker in raw_response:
            # partition() splits on the first occurrence only, so a repeated
            # marker can no longer break a two-target unpack.
            reasoning, _, answer = raw_response.partition(marker)
            answer = answer.replace(marker, "")
            break
    return reasoning, answer.replace("<|im_end|>", "").strip()


def _build_variant_prompts(final_prompt, num_prompts):
    """Return ``num_prompts`` sentences mixing random attributes into ``final_prompt``."""
    color = ["white", "black", "brown", "neon blue", "pastel green", "red", "orange", "neon yellow"]
    feels = ["holds a cat", "holds a pig", "teeth braces big smile", "growls", "laughs", "waves at viewer", "doubles in size", "drools", "shrinks in size"]
    extra = ["eye patch", "sunglasses", "devil horns", "baseball cap", "western cowboy hat", "giant wings", "balloon", "swimsuit"]
    where = ["storm clouds", "spotlight", "holding a pencil drawing of this image", "holding a charcoal sketch of the character"]
    style = ["4bit game", "comic book", "claymation", "sand sculpture", "80's cartoon", "cosmic horror graphic novel"]
    looks = ["looks to the left", "looks to the right", "looks up", "looks down", "obtains the rear view", "looks at reflection in mirror"]

    prompts = []
    for _ in range(num_prompts):
        # Draw order (color, feels, extra, where, style, looks) is kept so a
        # seeded `random` yields the same sentences as before.
        c = random.choice(color)
        f = random.choice(feels)
        e = random.choice(extra)
        h = random.choice(where)
        s = random.choice(style)
        l = random.choice(looks)
        prompts.append(
            f"Enhance this concept: {final_prompt}. Add a {c} {e} and this character {f}.. {l}, {h} in a {s} style."
        )
    return prompts


def enhance_and_embed(user_concept, num_prompts=20):
    """Enhance a concept with the text encoder, then embed it and its variations.

    Loads the pipeline from ``model_id`` without transformer, VAE or
    scheduler, asks the text encoder to write a detailed prompt for
    ``user_concept``, builds ``num_prompts`` randomised variation prompts
    around that result, and encodes everything with ``pipe.encode_prompt``.
    The pipeline is dropped and ``flush()`` is called before returning.

    Args:
        user_concept: Short description to be expanded into a full prompt.
        num_prompts: Number of variation prompts to create and embed.

    Returns:
        ``(init_embeddings, prompt_embeddings)``, both on the CPU: the
        embedding of the enhanced prompt, and the variation embeddings
        concatenated along dim 0 (assumed to be the batch axis - TODO confirm
        against ``encode_prompt``).
    """
    time_1 = time.time()
    print("🧠 Text Encode + Enhance")
    pipe = diffusers.DiffusionPipeline.from_pretrained(
        model_id, transformer=None, vae=None, scheduler=None, torch_dtype=torch.bfloat16
    ).to("cuda")

    system_prompt = "You are a creative assistant. Take the user's simple concept and write a highly detailed, descriptive prompt for an image generator. Wrap your final, ready to input enhanced prompt in quotation marks."

    messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_concept}]
    text_input = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = pipe.tokenizer(text_input, return_tensors="pt").to("cuda")

    print(f"\nStarting enhancement for: '{user_concept}'")
    flush()

    with torch.no_grad():
        outputs = pipe.text_encoder.generate(
            **inputs,
            top_p=0.95,
            top_k=20,
            repetition_penalty=1.0,
            temperature=1.0,
            max_new_tokens=1024,
            pad_token_id=pipe.tokenizer.pad_token_id,
            eos_token_id=pipe.tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens; special tokens are kept so the
    # reasoning/answer markers survive for _split_reasoning.
    raw_response = pipe.tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=False)
    thinking, final_prompt = _split_reasoning(raw_response)

    if thinking:
        print(f"hmmm..\n{thinking}\n")
    print(f"✨ ENHANCED PROMPT (took {time.time() - time_1:.2f}s):\n{final_prompt}\n")

    print("🎨 Mixing variation prompts...")
    variant_prompts = _build_variant_prompts(final_prompt, num_prompts)

    print("🔠 Embedding prompts...")
    time2 = time.time()
    with torch.no_grad():
        init_embeddings = pipe.encode_prompt(prompt=final_prompt)[0].to("cpu")
        # Each embedding is moved to the CPU as soon as it is produced.
        prompt_embeddings = torch.cat(
            [pipe.encode_prompt(prompt=p)[0].to("cpu") for p in variant_prompts], dim=0
        )
    print(f"Embeddings done.. {time.time() - time2:.1f} sec")

    # Drop every reference to the encoder so flush() can actually release it.
    pipe.text_encoder = None
    pipe.tokenizer = None
    del pipe
    flush()

    return init_embeddings, prompt_embeddings

def _make_grid(images, max_cols):
    """Paste equally sized images row by row into one RGB sheet of at most ``max_cols`` columns."""
    tile_w, tile_h = images[0].size
    count = len(images)
    cols = min(max_cols, count)
    rows = (count + max_cols - 1) // max_cols  # ceiling division
    grid = Image.new('RGB', (cols * tile_w, rows * tile_h))
    for idx, img in enumerate(images):
        row, col = divmod(idx, cols)
        grid.paste(img, (col * tile_w, row * tile_h))
    return grid


def generate_images(init_embeddings, prompt_embeddings, num_prompts=20, *, base_size=1024, variant_size=256, max_cols=4):
    """Render a base image, then a batch of variations, and show them as a grid.

    Loads the VAE, the transformer from ``kv_tran`` and a ``Flux2Pipeline``
    without text encoder or tokenizer. The base image is generated from
    ``init_embeddings``, downscaled, and passed as ``image`` together with
    ``prompt_embeddings`` in a single pipeline call. Both the base image and
    the final grid are shown with ``display`` (not defined in this file;
    presumably the notebook-provided helper - confirm before running as a
    plain script).

    Args:
        init_embeddings: Prompt embedding for the base image.
        prompt_embeddings: Embeddings for the variations.
        num_prompts: Variation count; used in the progress messages only.
        base_size: Width and height of the base image in pixels.
        variant_size: Width and height of each variation, and the size the
            base image is resized to before being used as reference.
        max_cols: Maximum number of columns in the output grid.
    """
    print("\n🚀 Loading Image Generation Models...")
    vae = diffusers.AutoencoderKLFlux2.from_pretrained("black-forest-labs/FLUX.2-small-decoder", torch_dtype=torch.bfloat16)
    transformer = diffusers.AutoModel.from_pretrained(kv_tran, subfolder="transformer", torch_dtype=torch.bfloat16)
    transformer.enable_layerwise_casting(storage_dtype=torch.float8_e4m3fn)

    pipe = diffusers.Flux2Pipeline.from_pretrained(
        model_id, transformer=transformer, text_encoder=None, tokenizer=None, vae=vae, torch_dtype=torch.bfloat16
    ).to("cuda")
    pipe.transformer.to(memory_format=torch.channels_last)
    pipe.vae.to(memory_format=torch.channels_last)
    flush()

    print("\n🖼️ Generating Base Image...")
    init_embeddings = init_embeddings.to("cuda", dtype=torch.bfloat16)
    time_base = time.time()

    with torch.inference_mode():
        kvimg = pipe(
            prompt_embeds=init_embeddings,
            width=base_size,
            height=base_size,
            num_inference_steps=4,
            guidance_scale=1,
        ).images[0]

    display(kvimg)
    print(f"Base image generated in: {time.time() - time_base:.1f} sec")
    flush()

    # Only the downscaled copy is needed from here on.
    init_image = kvimg.resize((variant_size, variant_size), Image.LANCZOS)
    del kvimg
    flush()

    print(f"\n🧬 Generating {num_prompts} Variations...")
    prompt_embeddings = prompt_embeddings.to("cuda", dtype=torch.bfloat16)
    time_vars = time.time()

    with torch.inference_mode():
        images = pipe(
            prompt_embeds=prompt_embeddings,
            image=init_image,
            width=variant_size,
            height=variant_size,
            num_inference_steps=4,
            guidance_scale=1,
        ).images

    print(f"{num_prompts} variant images generated in: {time.time() - time_vars:.1f} sec")
    flush()

    display(_make_grid(images, max_cols))

# EXECUTE PIPELINE

if __name__ == "__main__":
    # Concept to enhance and the number of variations to render.
    USER_CONCEPT = "Portrait of a ghoul"
    NUM_VARIATIONS = 20

    flush()

    # Stage 1 uses only the text encoder; stage 2 only the image models.
    init_emb, prompt_emb = enhance_and_embed(USER_CONCEPT, num_prompts=NUM_VARIATIONS)
    generate_images(init_emb, prompt_emb, num_prompts=NUM_VARIATIONS)

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

dual use text encoder/kv fast edits #13853

Uh oh!

{{title}}

Uh oh!

Replies: 0 comments

Select a reply

Uh oh!

dual use text encoder/kv fast edits #13853

Uh oh!

kwal559 Jun 1, 2026

EXECUTE PIPELINE

Replies: 0 comments

kwal559
Jun 1, 2026