got;
+ try {
+ got = invoker.getInvoker(instance);
+ } catch (TRpcException ex) {
+ // SPI factory may not be available in some environments; in that case createInvoker
+ // throws. Either outcome confirms the fast path correctly rejected the stale entry.
+ Assert.assertTrue(ex.getMessage().contains("Create rpc client"));
+ return;
+ }
+ Assert.assertNotSame("Stale entry must be replaced", stale, got);
+ Assert.assertNotSame(stale, cache.get(key));
+ // Clean up the freshly created RpcClient to avoid leaks across tests.
+ com.tencent.trpc.core.cluster.RpcClusterClientManager.shutdownBackendConfig(
+ invoker.getBackendConfig());
+ }
+
+ /**
+ * Locks down the compare-and-remove semantics that the closeFuture hook installed inside
+ * createInvoker is described as using (CAS-remove). The actual hook is a lambda registered
+ * when a fresh invoker is created; it is NOT invoked here, only its semantics are simulated.
+ *
+ * Two proxies (A and B) are constructed and the "newer" one, B, is put in the cache under
+ * the key. {@code cache.remove(key, a)} is then applied with the "older" proxy A: it must
+ * return false and the newer entry must NOT be evicted. A second call,
+ * {@code cache.remove(key, b)}, must succeed and leave no value under the key.
+ */
+ @Test
+ public void testCasRemoveDoesNotEvictNewerEntry() throws Exception {
+ ConcurrentMap> cache = getCache();
+ String key = "127.0.0.1:18001:tcp";
+
+ TestRpcClient clientA = new TestRpcClient();
+ ConsumerInvokerProxy a = new ConsumerInvokerProxy<>(
+ stubInvoker(clientA.getProtocolConfig()), clientA);
+ TestRpcClient clientB = new TestRpcClient();
+ ConsumerInvokerProxy b = new ConsumerInvokerProxy<>(
+ stubInvoker(clientB.getProtocolConfig()), clientB);
+
+ cache.put(key, b); // current value is B
+
+ // Simulate the closeFuture hook for A firing while B is the current cache value.
+ boolean removed = cache.remove(key, a);
+ Assert.assertFalse("CAS-remove must miss: A is no longer the current value", removed);
+ Assert.assertSame(b, cache.get(key));
+
+ // Simulate B's hook firing: the value matches, so the entry is evicted.
+ Assert.assertTrue(cache.remove(key, b));
+ Assert.assertNull(cache.get(key));
+ }
+
+ /**
+ * Sanity: ConsumerInvokerProxy.isAvailable() reflects the underlying client. The proxy
+ * must report available while the stub client's {@code available} flag is true, and
+ * unavailable once the test flips that flag to false.
+ */
+ @Test
+ public void testProxyIsAvailableTracksUnderlyingClient() {
+ TestRpcClient client = new TestRpcClient();
+ final ConsumerInvokerProxy proxy = new ConsumerInvokerProxy<>(
+ stubInvoker(client.getProtocolConfig()), client);
+ Assert.assertTrue(proxy.isAvailable());
+ client.available.set(false); // TestRpcClient.isAvailable() now returns false
+ Assert.assertFalse(proxy.isAvailable());
+ }
+
+ /**
+ * Sanity: ConsumerInvokerProxy.invoke fills CallInfo on the request and reports to the
+ * selector (best-effort; selector lookup may return null which is tolerated).
+ *
+ * The request carries an invocation with func "any"; the service instance carries the
+ * "container_name" and "set_division" parameters.
+ *
+ * NOTE(review): the only assertion is that invoke returns a non-null result. The CallInfo
+ * contents and the selector report are not checked here.
+ */
+ @Test
+ public void testProxyInvokeFillsCallInfoAndReports() {
+ TestRpcClient client = new TestRpcClient();
+ final ConsumerInvokerProxy proxy = new ConsumerInvokerProxy<>(
+ stubInvoker(client.getProtocolConfig()), client);
+
+ com.tencent.trpc.core.rpc.def.DefRequest request = new com.tencent.trpc.core.rpc.def.DefRequest();
+ com.tencent.trpc.core.rpc.RpcInvocation invocation = new com.tencent.trpc.core.rpc.RpcInvocation();
+ invocation.setFunc("any");
+ request.setInvocation(invocation);
+
+ java.util.HashMap params = new java.util.HashMap<>();
+ params.put("container_name", "test-container");
+ params.put("set_division", "test-set");
+ ServiceInstance instance = new ServiceInstance("127.0.0.1", 18001, params);
+
+ Assert.assertNotNull(proxy.invoke(request, instance));
+ // The stub invoker returns FutureUtils.newSuccessFuture(null), so no failure is expected here.
+ }
+
+ /* ---------------------- helpers ---------------------- */
+
+ @SuppressWarnings("unchecked") // the reflective read below requires an unchecked cast
+ private ConcurrentMap> getCache() throws Exception {
+ Field f = DefClusterInvoker.class.getDeclaredField("invokerCache"); // cache field, looked up by name
+ f.setAccessible(true); // let the test read the field reflectively
+ return (ConcurrentMap>) f.get(invoker); // the live map held by the test's invoker, not a copy
+ }
+
+ private ConsumerInvoker stubInvoker(ProtocolConfig pc) { // minimal ConsumerInvoker stub wrapped by the proxy tests
+ return new ConsumerInvoker() {
+ @Override
+ public Class getInterface() {
+ return GenericClient.class;
+ }
+
+ @Override
+ public CompletionStage invoke(Request request) {
+ return FutureUtils.newSuccessFuture(null); // the request is ignored; always a success future built from null
+ }
+
+ @Override
+ public ConsumerConfig getConfig() {
+ return invoker.getConfig(); // delegates to the test's invoker field
+ }
+
+ @Override
+ public ProtocolConfig getProtocolConfig() {
+ return pc; // the config handed in by the caller
+ }
+ };
+ }
+
+ /* ---------------------- mock ---------------------- */
+
+ private static class TestRpcClient implements RpcClient { // RpcClient stub whose state is driven by the tests
+
+ final AtomicBoolean available = new AtomicBoolean(true); // tests flip this to make isAvailable() return false
+ final AtomicBoolean closed = new AtomicBoolean(false); // set to true by close()
+ final CloseFuture closeFuture = new CloseFuture<>(); // completed by close()
+ private final ProtocolConfig protocolConfig = new ProtocolConfig();
+
+ @Override
+ public void open() throws TRpcException { // no-op
+ }
+
+ @Override
+ public ConsumerInvoker createInvoker(ConsumerConfig consumerConfig) {
+ return null; // this stub never creates an invoker
+ }
+
+ @Override
+ public void close() {
+ closed.set(true);
+ closeFuture.complete(null); // completes the future returned by closeFuture()
+ }
+
+ @Override
+ public CloseFuture closeFuture() {
+ return closeFuture;
+ }
+
+ @Override
+ public boolean isAvailable() {
+ return available.get() && !closed.get(); // available only while the flag is set and close() has not run
+ }
+
+ @Override
+ public boolean isClosed() {
+ return closed.get();
+ }
+
+ @Override
+ public ProtocolConfig getProtocolConfig() {
+ return protocolConfig;
+ }
+ }
+}
diff --git a/trpc-core/src/test/java/com/tencent/trpc/core/common/config/BaseProtocolConfigTest.java b/trpc-core/src/test/java/com/tencent/trpc/core/common/config/BaseProtocolConfigTest.java
index 7e512820e..c439750b9 100644
--- a/trpc-core/src/test/java/com/tencent/trpc/core/common/config/BaseProtocolConfigTest.java
+++ b/trpc-core/src/test/java/com/tencent/trpc/core/common/config/BaseProtocolConfigTest.java
@@ -207,6 +207,24 @@ public void testSetIdleTimeout() {
Assert.assertEquals(20, bpc.getIdleTimeout().intValue());
}
+ @Test
+ public void testTcpKeepAliveIdle() { // setter/getter round-trip: the value set must be the value read back
+ bpc.setTcpKeepAliveIdle(45);
+ Assert.assertEquals(45, bpc.getTcpKeepAliveIdle().intValue());
+ }
+
+ @Test
+ public void testTcpKeepAliveIntvl() { // setter/getter round-trip: the value set must be the value read back
+ bpc.setTcpKeepAliveIntvl(15);
+ Assert.assertEquals(15, bpc.getTcpKeepAliveIntvl().intValue());
+ }
+
+ @Test
+ public void testTcpKeepAliveCnt() { // setter/getter round-trip: the value set must be the value read back
+ bpc.setTcpKeepAliveCnt(5);
+ Assert.assertEquals(5, bpc.getTcpKeepAliveCnt().intValue());
+ }
+
@Test
public void testGetLazyinit() {
Assert.assertTrue(bpc.getLazyinit());
diff --git a/trpc-core/src/test/java/com/tencent/trpc/core/transport/AbstractClientTransportTest.java b/trpc-core/src/test/java/com/tencent/trpc/core/transport/AbstractClientTransportTest.java
index 4ff645024..96078bd35 100644
--- a/trpc-core/src/test/java/com/tencent/trpc/core/transport/AbstractClientTransportTest.java
+++ b/trpc-core/src/test/java/com/tencent/trpc/core/transport/AbstractClientTransportTest.java
@@ -11,13 +11,23 @@
package com.tencent.trpc.core.transport;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotSame;
+import static org.junit.Assert.assertSame;
import static org.junit.Assert.assertTrue;
import com.tencent.trpc.core.common.config.ProtocolConfig;
import com.tencent.trpc.core.exception.TransportException;
import com.tencent.trpc.core.transport.codec.ClientCodec;
+import java.lang.reflect.Method;
import java.util.Set;
import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
import org.junit.Test;
public class AbstractClientTransportTest {
@@ -51,6 +61,483 @@ public void testOpenException() throws Exception {
test2.toString();
}
+ /**
+ * Thundering-herd regression: when a long connection has just disconnected and many
+ * concurrent requests find the slot unavailable, the transport must rebuild the slot
+ * exactly ONCE — not once per requesting thread. Without the in-lock double-check this
+ * test would observe makeCount > 1 and the peer would see a connect/disconnect storm.
+ *
+ * Slot 0 is pre-populated with an already-completed future holding a
+ * {@link DisconnectedChannel}. 64 workers are released together by a latch and each one
+ * calls {@code ensureChannelActive(0)} reflectively. The worker pool is shut down in a
+ * {@code finally} block so that a failed wait or a failed reflective lookup does not
+ * leave its threads behind for later tests.
+ */
+ @Test
+ public void testEnsureChannelActiveDoesNotStorm() throws Exception {
+ StormTransport transport = new StormTransport(TransporterTestUtils.newProtocolConfig(),
+ TransporterTestUtils.newChannelHandler(), TransporterTestUtils.newClientCodec());
+ // Pre-populate slot 0 with a "broken" item: future done but channel disconnected.
+ CompletableFuture dead = new CompletableFuture<>();
+ dead.complete(new DisconnectedChannel());
+ transport.installSlot(new AbstractClientTransport.ChannelFutureItem(dead, transport.getProtocolConfig()));
+
+ int threads = 64;
+ ExecutorService pool = Executors.newFixedThreadPool(threads);
+ try {
+ CountDownLatch start = new CountDownLatch(1);
+ CountDownLatch done = new CountDownLatch(threads);
+ Method m = AbstractClientTransport.class.getDeclaredMethod("ensureChannelActive", int.class);
+ m.setAccessible(true);
+
+ for (int i = 0; i < threads; i++) {
+ pool.submit(() -> {
+ try {
+ start.await();
+ m.invoke(transport, 0);
+ } catch (Exception ignore) {
+ // concurrent invocations may race; the test only asserts on the
+ // aggregate makeCount counter below
+ } finally {
+ done.countDown();
+ }
+ });
+ }
+ start.countDown();
+ assertTrue(done.await(5, TimeUnit.SECONDS));
+ } finally {
+ // Always release the worker threads, even when the wait above fails.
+ pool.shutdownNow();
+ }
+
+ // Exactly one rebuild — that's the whole point of the in-lock double check.
+ assertEquals("slot must be rebuilt exactly once under thundering-herd",
+ 1, transport.makeCount.get());
+ // The slot should now hold the freshly-built item.
+ assertSame(transport.lastBuilt, transport.peekSlot(0).getChannelFuture());
+ }
+
+ /**
+ * Regression for the idle-close hand-off path: when an idle handler invalidates a slot
+ * before the actual {@code channel.close()}, the next request must observe
+ * "needs reconnect" (slot is now {@code isNotYetConnect=true}) instead of routing onto
+ * the about-to-be-closed channel. This shrinks the "request lands on a closing channel"
+ * race window from "close completes" to "close is enqueued".
+ *
+ * Three things are checked after {@code invalidateChannel(live)}: slot 0 holds a
+ * different item, that item reports {@code isNotYetConnect()} (read reflectively), and
+ * {@code close()} was called on the live channel stub.
+ */
+ @Test
+ public void testInvalidateChannelReplacesSlotWithBlankPlaceholder() throws Exception {
+ StormTransport transport = new StormTransport(TransporterTestUtils.newProtocolConfig(),
+ TransporterTestUtils.newChannelHandler(), TransporterTestUtils.newClientCodec());
+ // Pre-populate slot 0 with a connected (but soon-to-be-stale) channel.
+ ConnectedChannel live = new ConnectedChannel();
+ CompletableFuture alive = new CompletableFuture<>();
+ alive.complete(live);
+ transport.installSlot(new AbstractClientTransport.ChannelFutureItem(alive,
+ transport.getProtocolConfig()));
+
+ // Sanity: before invalidation the slot is "available", so a request would route
+ // onto the live channel.
+ AbstractClientTransport.ChannelFutureItem before = transport.peekSlot(0);
+ assertSame(alive, before.getChannelFuture());
+
+ // Idle handler hands off here.
+ transport.invalidateChannel(live);
+
+ AbstractClientTransport.ChannelFutureItem after = transport.peekSlot(0);
+ // After invalidation: slot is a blank placeholder, the next ensureChannelActive
+ // will treat it as needing a reconnect (isNotYetConnect=true).
+ assertNotSame("slot must be replaced (different item identity)", before, after);
+ Method isNotYet = AbstractClientTransport.ChannelFutureItem.class
+ .getDeclaredMethod("isNotYetConnect");
+ isNotYet.setAccessible(true);
+ assertTrue("invalidated slot must report isNotYetConnect=true",
+ (boolean) isNotYet.invoke(after));
+ // And the live channel must have been told to close so the EventLoop tears down
+ // the underlying socket asynchronously.
+ assertTrue("invalidated channel must have close() invoked", live.closeCalled);
+ }
+
+ /**
+ * Channel stub that starts out connected and flips to closed once {@code close()} has
+ * been invoked. The {@code closeCalled} flag records that invocation and also drives
+ * {@code isConnected()} / {@code isClosed()}. Address and protocol-config getters return
+ * null; {@code send} and {@code close} return already-completed futures.
+ *
+ * Used by {@link #testInvalidateChannelReplacesSlotWithBlankPlaceholder} and by the other
+ * slot tests in this class.
+ */
+ private static class ConnectedChannel implements Channel {
+
+ volatile boolean closeCalled = false;
+
+ @Override
+ public boolean isConnected() {
+ return !closeCalled;
+ }
+
+ @Override
+ public boolean isClosed() {
+ return closeCalled;
+ }
+
+ @Override
+ public java.net.InetSocketAddress getRemoteAddress() {
+ return null;
+ }
+
+ @Override
+ public java.net.InetSocketAddress getLocalAddress() {
+ return null;
+ }
+
+ @Override
+ public com.tencent.trpc.core.common.config.ProtocolConfig getProtocolConfig() {
+ return null;
+ }
+
+ @Override
+ public java.util.concurrent.CompletionStage send(Object message) {
+ return CompletableFuture.completedFuture(null);
+ }
+
+ @Override
+ public java.util.concurrent.CompletionStage close() {
+ closeCalled = true; // tests assert on this flag
+ return CompletableFuture.completedFuture(null);
+ }
+ }
+
+ /**
+ * AbstractClientTransport subclass used by {@link #testEnsureChannelActiveDoesNotStorm}
+ * and by the invalidateChannel / needsReconnect tests in this class.
+ * Exposes a deterministic {@code make()} that records how many times it has been called
+ * and returns a never-completing future so connecting state is observable.
+ *
+ * {@code installSlot}, {@code peekSlot} and {@code replaceSlot} give the tests direct
+ * access to the inherited {@code channels} list. {@code doOpen} and {@code doClose} are
+ * no-ops, {@code getChannels} returns null and {@code useChannelPool} returns true.
+ */
+ private static class StormTransport extends AbstractClientTransport {
+
+ final AtomicInteger makeCount = new AtomicInteger(0); // number of make() invocations so far
+ volatile CompletableFuture lastBuilt; // future returned by the most recent make()
+
+ StormTransport(ProtocolConfig config, ChannelHandler handler, ClientCodec codec) {
+ super(config, handler, codec);
+ }
+
+ void installSlot(ChannelFutureItem item) {
+ // The channels list is initialized empty; pad to index 0.
+ channels.add(item);
+ }
+
+ ChannelFutureItem peekSlot(int idx) {
+ return channels.get(idx);
+ }
+
+ void replaceSlot(int idx, ChannelFutureItem item) {
+ channels.set(idx, item);
+ }
+
+ @Override
+ public Set getChannels() {
+ return null;
+ }
+
+ @Override
+ protected void doOpen() {
+ }
+
+ @Override
+ protected CompletableFuture make() {
+ makeCount.incrementAndGet();
+ CompletableFuture f = new CompletableFuture<>();
+ lastBuilt = f;
+ // Never complete: leaves the new slot in "isConnecting" state, which is exactly
+ // what the in-lock double-check must short-circuit on for late-arriving threads.
+ return f;
+ }
+
+ @Override
+ protected void doClose() {
+ }
+
+ @Override
+ protected boolean useChannelPool() {
+ return true;
+ }
+ }
+
+ /**
+ * Channel that is "done but disconnected" — i.e. the previous future has resolved but
+ * {@code isConnected()} is false, which is what {@code ensureChannelActive} should treat
+ * as needing a reconnect.
+ *
+ * The state is fixed: {@code isConnected()} always returns false and {@code isClosed()}
+ * always returns true. Address and protocol-config getters return null; {@code send} and
+ * {@code close} return already-completed futures and change nothing.
+ */
+ private static class DisconnectedChannel implements Channel {
+
+ @Override
+ public boolean isConnected() {
+ return false;
+ }
+
+ @Override
+ public boolean isClosed() {
+ return true;
+ }
+
+ @Override
+ public java.net.InetSocketAddress getRemoteAddress() {
+ return null;
+ }
+
+ @Override
+ public java.net.InetSocketAddress getLocalAddress() {
+ return null;
+ }
+
+ @Override
+ public com.tencent.trpc.core.common.config.ProtocolConfig getProtocolConfig() {
+ return null;
+ }
+
+ @Override
+ public java.util.concurrent.CompletionStage send(Object message) {
+ return CompletableFuture.completedFuture(null);
+ }
+
+ @Override
+ public java.util.concurrent.CompletionStage close() {
+ return CompletableFuture.completedFuture(null);
+ }
+ }
+
+ /**
+ * {@code invalidateChannel(null)} is a no-op: must not throw, must not mutate slots.
+ *
+ * Slot 0 is populated with a connected channel first, so the test can confirm that the
+ * same item is still in the slot afterwards and that the channel's {@code close()} was
+ * never called.
+ */
+ @Test
+ public void testInvalidateChannelNullIsNoOp() throws Exception {
+ StormTransport t = new StormTransport(TransporterTestUtils.newProtocolConfig(),
+ TransporterTestUtils.newChannelHandler(), TransporterTestUtils.newClientCodec());
+ ConnectedChannel live = new ConnectedChannel();
+ CompletableFuture alive = new CompletableFuture<>();
+ alive.complete(live);
+ AbstractClientTransport.ChannelFutureItem original = new AbstractClientTransport.ChannelFutureItem(
+ alive, t.getProtocolConfig());
+ t.installSlot(original);
+
+ t.invalidateChannel(null);
+
+ assertSame("null target must not touch any slot", original, t.peekSlot(0));
+ assertFalse("null target must not close any channel", live.closeCalled);
+ }
+
+ /**
+ * Empty / null channels list: the early-return guards in {@code invalidateChannel} keep
+ * the transport from NPE'ing during shutdown windows where slots have already been
+ * cleared.
+ *
+ * NOTE(review): only the empty-list case is exercised here (no slot is installed before
+ * the call); a null channels list is not set up by this test. The test passes as long as
+ * the call returns without throwing.
+ */
+ @Test
+ public void testInvalidateChannelEmptyListIsNoOp() throws Exception {
+ StormTransport t = new StormTransport(TransporterTestUtils.newProtocolConfig(),
+ TransporterTestUtils.newChannelHandler(), TransporterTestUtils.newClientCodec());
+ // No installSlot — channels stays empty.
+ t.invalidateChannel(new ConnectedChannel());
+ // Nothing to assert beyond "did not throw".
+ }
+
+ /**
+ * When the target channel is not held by any slot the call must skip every entry and
+ * leave them all intact — this matches the production scenario where two near-simultaneous
+ * idle events fire on different channels and the first call has already invalidated the
+ * shared slot.
+ *
+ * Slot 0 holds one connected channel and {@code invalidateChannel} is called with a
+ * different, freshly created channel instance. The slot item must be unchanged and the
+ * slot's channel must not have been closed.
+ */
+ @Test
+ public void testInvalidateChannelTargetNotPresentLeavesSlotsIntact() throws Exception {
+ StormTransport t = new StormTransport(TransporterTestUtils.newProtocolConfig(),
+ TransporterTestUtils.newChannelHandler(), TransporterTestUtils.newClientCodec());
+ ConnectedChannel slotChannel = new ConnectedChannel();
+ CompletableFuture alive = new CompletableFuture<>();
+ alive.complete(slotChannel);
+ AbstractClientTransport.ChannelFutureItem original = new AbstractClientTransport.ChannelFutureItem(
+ alive, t.getProtocolConfig());
+ t.installSlot(original);
+
+ // Different channel — must not match anything.
+ t.invalidateChannel(new ConnectedChannel());
+
+ assertSame("non-matching target must not replace any slot",
+ original, t.peekSlot(0));
+ assertFalse("non-matching target must not close the slot's channel",
+ slotChannel.closeCalled);
+ }
+
+ /**
+ * Slot whose future is still in flight (isConnecting) must be skipped by
+ * {@code invalidateChannel}: at this point there is no Channel object yet to compare
+ * against the target, and a panicked replacement would orphan the in-flight connect.
+ *
+ * The in-flight state is modelled with a future that the test never completes; the
+ * slot must still hold the same item after the call.
+ */
+ @Test
+ public void testInvalidateChannelSkipsInFlightSlot() throws Exception {
+ StormTransport t = new StormTransport(TransporterTestUtils.newProtocolConfig(),
+ TransporterTestUtils.newChannelHandler(), TransporterTestUtils.newClientCodec());
+ // Future never completes — slot is permanently in "isConnecting" state.
+ CompletableFuture connecting = new CompletableFuture<>();
+ AbstractClientTransport.ChannelFutureItem item = new AbstractClientTransport.ChannelFutureItem(
+ connecting, t.getProtocolConfig());
+ t.installSlot(item);
+
+ t.invalidateChannel(new ConnectedChannel());
+
+ assertSame("in-flight slot must be left alone", item, t.peekSlot(0));
+ }
+
+ /**
+ * Slot whose future has completed exceptionally must be skipped: there is no Channel to
+ * compare and the slot has already been observed as broken — let the request-path
+ * reconnect handle it.
+ *
+ * The slot's future is failed with a RuntimeException before it is installed; the slot
+ * must still hold the same item after the call.
+ */
+ @Test
+ public void testInvalidateChannelSkipsExceptionallyCompletedSlot() throws Exception {
+ StormTransport t = new StormTransport(TransporterTestUtils.newProtocolConfig(),
+ TransporterTestUtils.newChannelHandler(), TransporterTestUtils.newClientCodec());
+ CompletableFuture failed = new CompletableFuture<>();
+ failed.completeExceptionally(new RuntimeException("boom"));
+ AbstractClientTransport.ChannelFutureItem item = new AbstractClientTransport.ChannelFutureItem(
+ failed, t.getProtocolConfig());
+ t.installSlot(item);
+
+ t.invalidateChannel(new ConnectedChannel());
+
+ assertSame("exceptionally-completed slot must be left alone",
+ item, t.peekSlot(0));
+ }
+
+ /**
+ * Slot holding a {@code null} future (blank placeholder produced by an earlier
+ * {@code invalidateChannel}) must be skipped — there is nothing to compare against and
+ * the slot is already in the "needs reconnect" state.
+ *
+ * The placeholder is built directly here by passing {@code null} as the future to the
+ * ChannelFutureItem constructor; the slot must still hold the same item after the call.
+ */
+ @Test
+ public void testInvalidateChannelSkipsBlankPlaceholderSlot() throws Exception {
+ StormTransport t = new StormTransport(TransporterTestUtils.newProtocolConfig(),
+ TransporterTestUtils.newChannelHandler(), TransporterTestUtils.newClientCodec());
+ AbstractClientTransport.ChannelFutureItem blank = new AbstractClientTransport.ChannelFutureItem(
+ null, t.getProtocolConfig());
+ t.installSlot(blank);
+
+ t.invalidateChannel(new ConnectedChannel());
+
+ assertSame("blank placeholder must be left alone", blank, t.peekSlot(0));
+ }
+
+ /**
+ * Verifies the {@code needsReconnect} truth table indirectly: {@code ensureChannelActive}
+ * is invoked reflectively on slot 0, and {@code makeCount} shows whether a rebuild
+ * happened for each slot state:
+ *
+ * - blank placeholder (channelFuture==null) → must trigger reconnect (rebuild)
+ * - connecting (future not done) → must NOT trigger reconnect (let it finish)
+ * - completed exceptionally → must trigger reconnect
+ * - completed with disconnected channel → must trigger reconnect
+ * - completed with connected channel → must NOT trigger reconnect
+ *
+ * The cases run in that order against the same transport, so the expected
+ * {@code makeCount} values are cumulative (1, 1, 2, 3, 3).
+ */
+ @Test
+ public void testNeedsReconnectTruthTable() throws Exception {
+ StormTransport t = new StormTransport(TransporterTestUtils.newProtocolConfig(),
+ TransporterTestUtils.newChannelHandler(), TransporterTestUtils.newClientCodec());
+ Method ensure = AbstractClientTransport.class
+ .getDeclaredMethod("ensureChannelActive", int.class);
+ ensure.setAccessible(true);
+
+ // Case 1: blank placeholder — must rebuild (makeCount goes 0 → 1)
+ t.installSlot(new AbstractClientTransport.ChannelFutureItem(null, t.getProtocolConfig()));
+ ensure.invoke(t, 0);
+ assertEquals("blank placeholder must trigger rebuild", 1, t.makeCount.get());
+
+ // Case 2: connecting — the slot we just rebuilt is itself "isConnecting" because
+ // StormTransport.make() never completes the future. A second ensureChannelActive
+ // call must short-circuit and NOT trigger another rebuild.
+ ensure.invoke(t, 0);
+ assertEquals("connecting slot must NOT trigger rebuild",
+ 1, t.makeCount.get());
+
+ // Case 3: completed exceptionally — must rebuild.
+ CompletableFuture failed = new CompletableFuture<>();
+ failed.completeExceptionally(new RuntimeException("boom"));
+ t.replaceSlot(0, new AbstractClientTransport.ChannelFutureItem(failed, t.getProtocolConfig()));
+ ensure.invoke(t, 0);
+ assertEquals("exceptionally-completed slot must trigger rebuild",
+ 2, t.makeCount.get());
+
+ // Case 4: disconnected — must rebuild.
+ CompletableFuture dead = new CompletableFuture<>();
+ dead.complete(new DisconnectedChannel());
+ t.replaceSlot(0, new AbstractClientTransport.ChannelFutureItem(dead, t.getProtocolConfig()));
+ ensure.invoke(t, 0);
+ assertEquals("disconnected slot must trigger rebuild",
+ 3, t.makeCount.get());
+
+ // Case 5: connected — must NOT rebuild.
+ CompletableFuture live = new CompletableFuture<>();
+ live.complete(new ConnectedChannel());
+ t.replaceSlot(0, new AbstractClientTransport.ChannelFutureItem(live, t.getProtocolConfig()));
+ ensure.invoke(t, 0);
+ assertEquals("connected slot must NOT trigger rebuild",
+ 3, t.makeCount.get());
+ }
+
+ /**
+ * Race-loser path of {@code invalidateChannel}: between "outside-lock match" and
+ * "in-lock recheck" another thread may have already replaced the slot with a fresh
+ * item. The race-loser branch must back off without touching anything.
+ *
+ * The race is simulated deterministically instead of with real threads: the slot is
+ * installed with an item holding {@code target}, then swapped for a new item holding
+ * {@code fresh}, and only then is {@code invalidateChannel(target)} called. The fresh
+ * item must stay in the slot and neither channel may be closed.
+ *
+ * NOTE(review): as the inline comment before the call states, the late call's scan no
+ * longer finds {@code target} in the slot, so this test covers the early skip in the
+ * iteration rather than the inner {@code latest != item} recheck itself.
+ */
+ @Test
+ public void testInvalidateChannelRaceLoserDoesNothing() throws Exception {
+ StormTransport t = new StormTransport(TransporterTestUtils.newProtocolConfig(),
+ TransporterTestUtils.newChannelHandler(), TransporterTestUtils.newClientCodec());
+ ConnectedChannel target = new ConnectedChannel();
+ CompletableFuture aliveOld = new CompletableFuture<>();
+ aliveOld.complete(target);
+ AbstractClientTransport.ChannelFutureItem oldItem = new AbstractClientTransport.ChannelFutureItem(
+ aliveOld, t.getProtocolConfig());
+ t.installSlot(oldItem);
+
+ // Concurrent thread already replaced the slot with a brand-new item before our
+ // invalidate caller acquired the lock.
+ ConnectedChannel fresh = new ConnectedChannel();
+ CompletableFuture aliveNew = new CompletableFuture<>();
+ aliveNew.complete(fresh);
+ AbstractClientTransport.ChannelFutureItem newItem = new AbstractClientTransport.ChannelFutureItem(
+ aliveNew, t.getProtocolConfig());
+ t.replaceSlot(0, newItem);
+
+ // Now the late call arrives. Its outside-lock scan won't match (slot holds `fresh`,
+ // not `target`), so it never enters the inner block. This still exercises the early
+ // skip path in the iteration.
+ t.invalidateChannel(target);
+
+ assertSame("fresh slot must be preserved", newItem, t.peekSlot(0));
+ assertFalse("fresh channel must not be closed", fresh.closeCalled);
+ assertFalse("stale target must not be closed by late invalidate",
+ target.closeCalled);
+ }
+
+ /**
+ * After {@code invalidateChannel} the slot is a blank placeholder, and a subsequent
+ * {@code ensureChannelActive} must rebuild — the full "idle close → request reconnect"
+ * hand-off is verified in one shot.
+ *
+ * Checked in order: the slot item is replaced and the live channel is closed by the
+ * invalidation, then one reflective {@code ensureChannelActive(0)} call brings
+ * {@code makeCount} to exactly 1.
+ */
+ @Test
+ public void testInvalidateChannelThenEnsureChannelActiveRebuilds() throws Exception {
+ StormTransport t = new StormTransport(TransporterTestUtils.newProtocolConfig(),
+ TransporterTestUtils.newChannelHandler(), TransporterTestUtils.newClientCodec());
+ ConnectedChannel live = new ConnectedChannel();
+ CompletableFuture alive = new CompletableFuture<>();
+ alive.complete(live);
+ AbstractClientTransport.ChannelFutureItem before = new AbstractClientTransport.ChannelFutureItem(
+ alive, t.getProtocolConfig());
+ t.installSlot(before);
+
+ t.invalidateChannel(live);
+ assertNotSame("slot must be replaced", before, t.peekSlot(0));
+ assertTrue(live.closeCalled);
+
+ Method ensure = AbstractClientTransport.class
+ .getDeclaredMethod("ensureChannelActive", int.class);
+ ensure.setAccessible(true);
+ ensure.invoke(t, 0);
+
+ assertEquals("post-invalidate ensureChannelActive must rebuild exactly once",
+ 1, t.makeCount.get());
+ }
+
private static class ClientTransportTest extends AbstractClientTransport {
private boolean isTransportException;
diff --git a/trpc-proto/trpc-proto-http/src/main/java/com/tencent/trpc/proto/http/client/Http2ConsumerInvoker.java b/trpc-proto/trpc-proto-http/src/main/java/com/tencent/trpc/proto/http/client/Http2ConsumerInvoker.java
index fd6b57ab4..b15862dea 100644
--- a/trpc-proto/trpc-proto-http/src/main/java/com/tencent/trpc/proto/http/client/Http2ConsumerInvoker.java
+++ b/trpc-proto/trpc-proto-http/src/main/java/com/tencent/trpc/proto/http/client/Http2ConsumerInvoker.java
@@ -15,7 +15,6 @@
import static com.tencent.trpc.core.common.Constants.DEFAULT_CLIENT_REQUEST_TIMEOUT_MS;
import static com.tencent.trpc.proto.http.common.HttpConstants.CONNECTION_REQUEST_TIMEOUT;
-import autovalue.shaded.com.google.common.common.base.Objects;
import com.tencent.trpc.core.common.config.BackendConfig;
import com.tencent.trpc.core.common.config.ConsumerConfig;
import com.tencent.trpc.core.common.config.ProtocolConfig;
@@ -29,6 +28,7 @@
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
+import java.util.Objects;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import org.apache.hc.client5.http.async.methods.SimpleHttpRequest;
@@ -44,6 +44,11 @@
/**
* HTTP/2 protocol client invoker, supporting both h2 and http2c.
+ *
+ * Each {@link #send(Request)} entry signals the underlying {@link Http2cRpcClient} that
+ * it is being used (drives the idle-eviction heuristic) and reports success / failure to
+ * drive the consecutive-failure counter that flips the client to unavailable on sustained
+ * backend outages.
*/
public class Http2ConsumerInvoker extends AbstractConsumerInvoker {
@@ -60,19 +65,38 @@ public Http2ConsumerInvoker(Http2cRpcClient client, ConsumerConfig config,
*
* @param request client request
* @return Response
- * @throws Exception if send request failed
+ * @throws Exception declared to honour the abstract contract; failures raised while
+ * building or executing the request are caught and wrapped into a Response with
+ * {@code exception != null} instead of being propagated.
*/
@Override
public Response send(Request request) throws Exception {
+ Http2cRpcClient http2cRpcClient = (Http2cRpcClient) client;
+ // Mark "used" before any work so even a failed request keeps the idle-eviction timer
+ // accurate (a failing client is still actively used and must not be reaped as orphan).
+ http2cRpcClient.markUsed();
+
int requestTimeout = config.getBackendConfig().getRequestTimeout();
- SimpleHttpRequest simpleHttpRequest = buildRequest(request, requestTimeout);
+ SimpleHttpRequest simpleHttpRequest;
+ try {
+ simpleHttpRequest = buildRequest(request, requestTimeout);
+ } catch (Exception ex) {
+ http2cRpcClient.markFailure();
+ return RpcUtils.newResponse(request, null, ex);
+ }
try {
SimpleHttpResponse simpleHttpResponse = execute(request, requestTimeout,
- simpleHttpRequest);
-
- return handleResponse(request, simpleHttpResponse);
+ simpleHttpRequest, http2cRpcClient);
+ Response response = handleResponse(request, simpleHttpResponse);
+ if (response.getException() == null) {
+ http2cRpcClient.markSuccess();
+ } else {
+ http2cRpcClient.markFailure();
+ }
+ return response;
} catch (Exception e) {
+ http2cRpcClient.markFailure();
return RpcUtils.newResponse(request, null, e);
}
@@ -132,12 +156,13 @@ private Response handleResponse(Request request, SimpleHttpResponse simpleHttpRe
* @param request TRPC request
* @param requestTimeout request timeout
* @param simpleHttpRequest HTTP request
+ * @param http2cRpcClient already-resolved owning client (avoids a redundant cast)
* @return HTTP response
* @throws Exception if do HTTP request failed
*/
private SimpleHttpResponse execute(Request request, int requestTimeout,
- SimpleHttpRequest simpleHttpRequest) throws Exception {
- CloseableHttpAsyncClient httpAsyncClient = ((Http2cRpcClient) client).getHttpAsyncClient();
+ SimpleHttpRequest simpleHttpRequest, Http2cRpcClient http2cRpcClient) throws Exception {
+ CloseableHttpAsyncClient httpAsyncClient = http2cRpcClient.getHttpAsyncClient();
Future httpResponseFuture = httpAsyncClient.execute(simpleHttpRequest,
new FutureCallback() {
@Override
@@ -206,7 +231,7 @@ private SimpleHttpRequest buildRequest(Request request, int requestTimeout) thro
}
// Set custom business headers, consistent with the TRPC protocol, only process String and byte[]
request.getAttachments().forEach((k, v) -> {
- if (Objects.equal(k, HttpHeaders.TRANSFER_ENCODING) || Objects.equal(k, HttpHeaders.CONTENT_LENGTH)) {
+ if (Objects.equals(k, HttpHeaders.TRANSFER_ENCODING) || Objects.equals(k, HttpHeaders.CONTENT_LENGTH)) {
return;
}
if (v instanceof String) {
diff --git a/trpc-proto/trpc-proto-http/src/main/java/com/tencent/trpc/proto/http/client/Http2RpcClient.java b/trpc-proto/trpc-proto-http/src/main/java/com/tencent/trpc/proto/http/client/Http2RpcClient.java
index 8a79a7aae..27146b623 100644
--- a/trpc-proto/trpc-proto-http/src/main/java/com/tencent/trpc/proto/http/client/Http2RpcClient.java
+++ b/trpc-proto/trpc-proto-http/src/main/java/com/tencent/trpc/proto/http/client/Http2RpcClient.java
@@ -15,6 +15,8 @@
import static com.tencent.trpc.transport.http.common.Constants.KEYSTORE_PATH;
import com.tencent.trpc.core.common.config.ProtocolConfig;
+import com.tencent.trpc.core.exception.ErrorCode;
+import com.tencent.trpc.core.exception.TRpcException;
import com.tencent.trpc.core.logger.Logger;
import com.tencent.trpc.core.logger.LoggerFactory;
import java.io.File;
@@ -24,28 +26,41 @@
import org.apache.hc.client5.http.impl.nio.PoolingAsyncClientConnectionManagerBuilder;
import org.apache.hc.core5.http2.HttpVersionPolicy;
import org.apache.hc.core5.http2.ssl.ConscryptClientTlsStrategy;
+import org.apache.hc.core5.pool.PoolReusePolicy;
+import org.apache.hc.core5.reactor.IOReactorConfig;
import org.apache.hc.core5.ssl.SSLContexts;
+import org.apache.hc.core5.util.TimeValue;
/**
- * HTTP 2 protocol client.
+ * HTTP/2 (TLS) protocol client. Inherits long-connection state ({@code lastUsedNanos},
+ * {@code consecutiveFailures}, {@link #markUsed}, {@link #markSuccess}, {@link #markFailure}
+ * and the overridden {@link #isAvailable()}) from {@link Http2cRpcClient}; the differences
+ * are the TLS handshake and the explicit {@link HttpVersionPolicy} negotiation.
+ *
+ * The connection manager is sized and tuned identically to {@link Http2cRpcClient}: pool
+ * limits derived from {@code maxConns}, idle / expired eviction, SO_KEEPALIVE and a hard
+ * connection TTL.
*/
public class Http2RpcClient extends Http2cRpcClient {
- private static final Logger logger = LoggerFactory.getLogger(HttpRpcClient.class);
+ private static final Logger logger = LoggerFactory.getLogger(Http2RpcClient.class);
+
+ private static final int VALIDATE_AFTER_INACTIVITY_MS = 2000;
+ private static final long EVICT_IDLE_CONNECTIONS_SECONDS = 60L;
+ private static final int CONNECTION_TTL_MINUTES = 10;
/**
* The protocol type used for interaction with the server, such as HTTP1, H2, or protocol negotiation.
* In trpc, the interaction is forced to use H2 or HTTP1 protocol based on the configuration.
*/
protected HttpVersionPolicy clientVersionPolicy;
-
public Http2RpcClient(ProtocolConfig config) {
super(config);
this.clientVersionPolicy = HttpVersionPolicy.FORCE_HTTP_2;
}
@Override
- protected void doOpen() {
+ protected void doOpen() throws TRpcException {
try {
String keyStorePath = String
.valueOf(getProtocolConfig().getExtMap().get(KEYSTORE_PATH));
@@ -61,20 +76,46 @@ protected void doOpen() {
.build();
// 2. Configure connection pool.
+ int maxConns = protocolConfig.getMaxConns();
final PoolingAsyncClientConnectionManager cm = PoolingAsyncClientConnectionManagerBuilder
.create().useSystemProperties()
.setTlsStrategy(new ConscryptClientTlsStrategy(sslContext))
+ .setMaxConnTotal(maxConns)
+ .setMaxConnPerRoute(maxConns)
+ .setConnPoolPolicy(PoolReusePolicy.LIFO)
+ .setValidateAfterInactivity(TimeValue.ofMilliseconds(VALIDATE_AFTER_INACTIVITY_MS))
+ .setConnectionTimeToLive(TimeValue.ofMinutes(CONNECTION_TTL_MINUTES))
.build();
// 3. Configure the client to force HTTPS protocol to use HTTP1 communication and H2 protocol
// to use H2 communication.
httpAsyncClient = HttpAsyncClients.custom()
- .setVersionPolicy(this.clientVersionPolicy).setConnectionManager(cm)
+ .setVersionPolicy(this.clientVersionPolicy)
+ .setConnectionManager(cm)
+ .setIOReactorConfig(IOReactorConfig.custom()
+ .setSoKeepAlive(true)
+ .build())
+ .evictExpiredConnections()
+ .evictIdleConnections(TimeValue.ofSeconds(EVICT_IDLE_CONNECTIONS_SECONDS))
.build();
// 4. Start the client.
httpAsyncClient.start();
} catch (Exception e) {
- logger.error("httpAsyncClient error: ", e);
+ logger.error("open https/h2 client ({}) failed",
+ getProtocolConfig().toSimpleString(), e);
+ throw TRpcException.newFrameException(ErrorCode.TRPC_CLIENT_CONNECT_ERR,
+ "open https/h2 client (" + getProtocolConfig().toSimpleString() + ") failed",
+ e);
}
}
+
+ /**
+ * Defensive close ensuring the inherited handle is released. We let the parent's
+ * {@code doClose} do the actual cleanup; this method exists in case future TLS-only
+ * resources need explicit release.
+ */
+ @Override
+ protected void doClose() {
+ super.doClose();
+ }
}
diff --git a/trpc-proto/trpc-proto-http/src/main/java/com/tencent/trpc/proto/http/client/Http2cRpcClient.java b/trpc-proto/trpc-proto-http/src/main/java/com/tencent/trpc/proto/http/client/Http2cRpcClient.java
index f00d9858d..01612ea6c 100644
--- a/trpc-proto/trpc-proto-http/src/main/java/com/tencent/trpc/proto/http/client/Http2cRpcClient.java
+++ b/trpc-proto/trpc-proto-http/src/main/java/com/tencent/trpc/proto/http/client/Http2cRpcClient.java
@@ -14,41 +14,138 @@
import com.tencent.trpc.core.common.config.ConsumerConfig;
import com.tencent.trpc.core.common.config.ProtocolConfig;
+import com.tencent.trpc.core.exception.ErrorCode;
+import com.tencent.trpc.core.exception.TRpcException;
import com.tencent.trpc.core.logger.Logger;
import com.tencent.trpc.core.logger.LoggerFactory;
import com.tencent.trpc.core.rpc.AbstractRpcClient;
import com.tencent.trpc.core.rpc.ConsumerInvoker;
import java.io.IOException;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
import org.apache.hc.client5.http.impl.async.CloseableHttpAsyncClient;
import org.apache.hc.client5.http.impl.async.HttpAsyncClients;
+import org.apache.hc.client5.http.impl.nio.PoolingAsyncClientConnectionManager;
+import org.apache.hc.client5.http.impl.nio.PoolingAsyncClientConnectionManagerBuilder;
+import org.apache.hc.core5.pool.PoolReusePolicy;
+import org.apache.hc.core5.reactor.IOReactorConfig;
+import org.apache.hc.core5.util.TimeValue;
+import org.apache.hc.core5.util.Timeout;
/**
- * Http2c protocol client.
+ * HTTP/2 cleartext (h2c) protocol client.
+ *
+ * Long-connection mode. Built on Apache HttpClient 5.x async + HTTP/2 multiplexing —
+ * a single TCP connection carries many concurrent RPC streams. The connection manager is
+ * tuned identically in spirit to {@link HttpRpcClient}:
+ *
+ * - {@code maxConnTotal} / {@code maxConnPerRoute} sized from
+ * {@code protocolConfig.getMaxConns()} so the pool never silently caps at the tiny
+ * HttpClient defaults;
+ * - {@code validateAfterInactivity}: {@value #VALIDATE_AFTER_INACTIVITY_MS}ms re-check
+ * on idle connections before reuse;
+ * - {@code evictExpired} + {@code evictIdle}: daemon-thread cleanup at
+ * {@value #EVICT_IDLE_CONNECTIONS_SECONDS}s;
+ * - {@code SO_KEEPALIVE} enabled on the IOReactor so the OS itself surfaces dead peers
+ * on platforms where it is configured (Linux ~2h default, far quicker with kernel
+ * tuning);
+ * - {@code timeToLive}: hard ceiling at {@value #CONNECTION_TTL_MINUTES}min, recovers
+ * from backend IP rotation in bounded time.
+ *
+ *
+ * Health signalling to the cluster manager mirrors {@link HttpRpcClient}:
+ * the client reports unavailable when (a) it has been idle for more than
+ * {@value #IDLE_UNAVAILABLE_THRESHOLD_MINUTES}min, or (b) it has accumulated at least
+ * {@value #FAILURE_UNAVAILABLE_THRESHOLD} consecutive failures since the last success.
*/
public class Http2cRpcClient extends AbstractRpcClient {
- private static final Logger logger = LoggerFactory.getLogger(HttpRpcClient.class);
+ private static final Logger logger = LoggerFactory.getLogger(Http2cRpcClient.class);
+
+ private static final int VALIDATE_AFTER_INACTIVITY_MS = 2000;
+ private static final long EVICT_IDLE_CONNECTIONS_SECONDS = 60L;
+ private static final int CONNECTION_TTL_MINUTES = 10;
+
+ /**
+ * If this client has not been used by any RPC for longer than this window, the periodic
+ * health observer in {@code RpcClusterClientManager} will treat it as unavailable and
+ * eventually close and evict it. The window is intentionally large so that any
+ * actively-used client is never affected. See {@link HttpRpcClient} for the same mechanism
+ * on the HTTP/1.1 path.
+ */
+ static final int IDLE_UNAVAILABLE_THRESHOLD_MINUTES = 10;
+ private static final long IDLE_UNAVAILABLE_THRESHOLD_NANOS =
+ TimeUnit.MINUTES.toNanos(IDLE_UNAVAILABLE_THRESHOLD_MINUTES);
+ /**
+ * Number of consecutive failed RPCs that flips this client to unavailable. Reset to 0 on
+ * the next successful RPC.
+ */
+ static final int FAILURE_UNAVAILABLE_THRESHOLD = 50;
/**
* Asynchronous HTTP client
*/
protected CloseableHttpAsyncClient httpAsyncClient;
+ /**
+ * Timestamp ({@link System#nanoTime()}) of the most recent RPC sent through this client.
+ * Updated by {@link Http2ConsumerInvoker} on each request.
+ */
+ private volatile long lastUsedNanos = System.nanoTime();
+ /**
+ * Number of consecutive failed RPCs since the last success. See {@link HttpRpcClient}.
+ */
+ private final AtomicInteger consecutiveFailures = new AtomicInteger(0);
public Http2cRpcClient(ProtocolConfig config) {
setConfig(config);
}
/**
- * Configure and start the client
+ * Configure and start the client. The pool is sized from {@code maxConns}; idle / expired
+ * connections are reaped in the background; dead-peer detection happens via SO_KEEPALIVE
+ * plus a {@value #CONNECTION_TTL_MINUTES}-minute hard TTL.
+ *
+ * @throws TRpcException if the underlying HttpClient fails to start; surfacing this lets
+ * {@link AbstractRpcClient#open()} mark the lifecycle FAILED instead of leaving a
+ * half-built client cached.
*/
@Override
- protected void doOpen() {
- httpAsyncClient = HttpAsyncClients.customHttp2().build();
- httpAsyncClient.start();
+ protected void doOpen() throws TRpcException {
+ try {
+ int maxConns = protocolConfig.getMaxConns();
+ PoolingAsyncClientConnectionManager cm = PoolingAsyncClientConnectionManagerBuilder
+ .create()
+ .setMaxConnTotal(maxConns)
+ .setMaxConnPerRoute(maxConns)
+ .setConnPoolPolicy(PoolReusePolicy.LIFO)
+ .setValidateAfterInactivity(TimeValue.ofMilliseconds(VALIDATE_AFTER_INACTIVITY_MS))
+ .setConnectionTimeToLive(TimeValue.ofMinutes(CONNECTION_TTL_MINUTES))
+ .build();
+
+ httpAsyncClient = HttpAsyncClients.custom()
+ .setConnectionManager(cm)
+ // Enable SO_KEEPALIVE on every socket so the OS eventually reaps dead peers
+ // even when no idle / TTL eviction has fired.
+ .setIOReactorConfig(IOReactorConfig.custom()
+ .setSoKeepAlive(true)
+ .setSoTimeout(Timeout.ofSeconds(0))
+ .build())
+ .evictExpiredConnections()
+ .evictIdleConnections(TimeValue.ofSeconds(EVICT_IDLE_CONNECTIONS_SECONDS))
+ .setVersionPolicy(org.apache.hc.core5.http2.HttpVersionPolicy.FORCE_HTTP_2)
+ .build();
+ httpAsyncClient.start();
+ } catch (Exception e) {
+ // Surface the failure so the lifecycle moves to FAILED and the cached cluster slot
+ // is not populated with a half-built client.
+ String desc = protocolConfig != null ? protocolConfig.toSimpleString() : "";
+ throw TRpcException.newFrameException(ErrorCode.TRPC_CLIENT_CONNECT_ERR,
+ "open http2c client (" + desc + ") failed", e);
+ }
}
/**
- * Close the client
+ * Close the client.
*/
@Override
protected void doClose() {
@@ -74,7 +171,54 @@ public ConsumerInvoker createInvoker(ConsumerConfig consumerConfig) {
return new Http2ConsumerInvoker<>(this, consumerConfig, protocolConfig);
}
+ /**
+ * Record that this client just served (or is about to serve) an RPC. Called by
+ * {@link Http2ConsumerInvoker} on every request entry.
+ */
+ public void markUsed() {
+ lastUsedNanos = System.nanoTime();
+ }
+
+ /**
+ * Record a successful RPC. Resets the consecutive-failure counter so an isolated earlier
+ * failure does not contribute to eviction.
+ */
+ public void markSuccess() {
+ consecutiveFailures.set(0);
+ }
+
+ /**
+ * Record a failed RPC (exception during {@code execute} or non-2xx response).
+ */
+ public void markFailure() {
+ consecutiveFailures.incrementAndGet();
+ }
+
+ /**
+ * Reports the client as unavailable if its lifecycle is no longer started, or if it has
+ * been idle longer than {@value #IDLE_UNAVAILABLE_THRESHOLD_MINUTES}min, or if at least
+ * {@value #FAILURE_UNAVAILABLE_THRESHOLD} consecutive RPC failures have piled up since the
+ * last success.
+ */
+ @Override
+ public boolean isAvailable() {
+ if (!super.isAvailable()) {
+ return false;
+ }
+ if (consecutiveFailures.get() >= FAILURE_UNAVAILABLE_THRESHOLD) {
+ return false;
+ }
+ return (System.nanoTime() - lastUsedNanos) <= IDLE_UNAVAILABLE_THRESHOLD_NANOS;
+ }
+
public CloseableHttpAsyncClient getHttpAsyncClient() {
return httpAsyncClient;
}
+
+ /**
+ * Visible for tests / observability: current consecutive-failure counter snapshot.
+ */
+ public int getConsecutiveFailures() {
+ return consecutiveFailures.get();
+ }
}
diff --git a/trpc-proto/trpc-proto-http/src/main/java/com/tencent/trpc/proto/http/client/HttpConsumerInvoker.java b/trpc-proto/trpc-proto-http/src/main/java/com/tencent/trpc/proto/http/client/HttpConsumerInvoker.java
index 7d101e8c6..48042c91a 100644
--- a/trpc-proto/trpc-proto-http/src/main/java/com/tencent/trpc/proto/http/client/HttpConsumerInvoker.java
+++ b/trpc-proto/trpc-proto-http/src/main/java/com/tencent/trpc/proto/http/client/HttpConsumerInvoker.java
@@ -43,7 +43,13 @@
import org.apache.http.protocol.HttpContext;
/**
- * HTTP protocol client invoker.
+ * HTTP/1.1 protocol client invoker.
+ *
+ * Each {@link #send(Request)} entry signals the underlying {@link HttpRpcClient} that it
+ * is being used (drives the idle-eviction heuristic) and reports success / failure to drive
+ * the consecutive-failure counter that flips the client to unavailable on sustained backend
+ * outages. See {@link HttpRpcClient} for the cluster-side health observer that consumes these
+ * signals.
*/
public class HttpConsumerInvoker extends AbstractConsumerInvoker {
@@ -57,17 +63,40 @@ public HttpConsumerInvoker(HttpRpcClient client, ConsumerConfig config,
*
* @param request TRPC request
* @return TRPC response
- * @throws Exception if send request failed
+ * @throws Exception declared to honour the abstract contract; the implementation never
+ * lets exceptions escape — every failure path is wrapped into a Response with
+ * {@code exception != null}.
*/
@Override
public Response send(Request request) throws Exception {
- HttpPost httpPost = buildRequest(request);
-
- CloseableHttpClient httpClient = ((HttpRpcClient) client).getHttpClient();
+ HttpRpcClient httpRpcClient = (HttpRpcClient) client;
+ // Mark "used" before any work so even a failed request keeps the idle-eviction timer
+ // accurate (a failing client is still actively used and must not be reaped as orphan).
+ httpRpcClient.markUsed();
+
+ HttpPost httpPost;
+ try {
+ httpPost = buildRequest(request);
+ } catch (Exception ex) {
+ // buildRequest failure is a local programming error (URI / encoding / config).
+ // Count it as a failure for the consecutive-failure counter — sustained build
+ // failures should still surface via isAvailable().
+ httpRpcClient.markFailure();
+ return RpcUtils.newResponse(request, null, ex);
+ }
+ CloseableHttpClient httpClient = httpRpcClient.getHttpClient();
try (CloseableHttpResponse httpResponse = httpClient.execute(httpPost)) {
- return handleResponse(request, httpResponse);
+ Response response = handleResponse(request, httpResponse);
+ if (response.getException() == null) {
+ httpRpcClient.markSuccess();
+ } else {
+ // Non-2xx surfaced as Response with biz exception; count toward eviction.
+ httpRpcClient.markFailure();
+ }
+ return response;
} catch (Exception ex) {
+ httpRpcClient.markFailure();
return RpcUtils.newResponse(request, null, ex);
}
}
diff --git a/trpc-proto/trpc-proto-http/src/main/java/com/tencent/trpc/proto/http/client/HttpRpcClient.java b/trpc-proto/trpc-proto-http/src/main/java/com/tencent/trpc/proto/http/client/HttpRpcClient.java
index 28c713206..1706a5809 100644
--- a/trpc-proto/trpc-proto-http/src/main/java/com/tencent/trpc/proto/http/client/HttpRpcClient.java
+++ b/trpc-proto/trpc-proto-http/src/main/java/com/tencent/trpc/proto/http/client/HttpRpcClient.java
@@ -18,18 +18,111 @@
import com.tencent.trpc.core.rpc.AbstractRpcClient;
import com.tencent.trpc.core.rpc.ConsumerInvoker;
import java.io.IOException;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
/**
- * HTTP protocol client.
+ * HTTP/1.1 protocol client.
+ *
+ * Long-connection mode. Connections are pooled by Apache
+ * {@link PoolingHttpClientConnectionManager} and reused across requests via HTTP/1.1
+ * {@code Connection: keep-alive}. The following safeguards are wired by default to keep the
+ * pool healthy in long-running processes — especially when the server, an intermediary load
+ * balancer or a NAT silently terminates idle keep-alive sockets:
+ *
+ * - {@code maxTotal} / {@code maxPerRoute} sized from {@code protocolConfig.getMaxConns()}
+ * so the pool never silently caps at HttpClient 4.x's tiny default (20 total / 2 per route);
+ * - {@code validateAfterInactivity}: re-checks a pooled connection's liveness before reuse
+ * when it has been idle for at least
+ * {@value #VALIDATE_AFTER_INACTIVITY_MS}ms (avoids the classic "stale connection /
+ * {@code NoHttpResponseException}" against a server-side half-closed keep-alive socket);
+ * - {@code evictExpiredConnections} + {@code evictIdleConnections}: a daemon thread evicts
+ * connections idle longer than {@value #EVICT_IDLE_CONNECTIONS_SECONDS}s, freeing OS file
+ * descriptors;
+ * - {@code keepAliveStrategy} with a {@value #FALLBACK_KEEPALIVE_MINUTES}min ceiling: when
+ * the server omits {@code Keep-Alive: timeout=N} we still cap how long a pooled connection
+ * may sit idle before reuse, which stays within most NAT idle timers (typical 5–15min);
+ * - {@code connectionTimeToLive}: hard ceiling — every connection is forcibly recycled
+ * after {@value #CONNECTION_TTL_MINUTES}min regardless of activity, so backend IP rotation
+ * (K8s pod drift, blue/green) is recovered from in bounded time.
+ *
+ *
+ * Health signalling to the cluster manager. The cluster manager's periodic health
+ * observer in {@code RpcClusterClientManager} polls {@link #isAvailable()} every 30s. A
+ * cached HTTP client is reported unavailable (and eventually evicted) when either:
+ *
+ * - it has not served any RPC for longer than {@value #IDLE_UNAVAILABLE_THRESHOLD_MINUTES}
+ * minutes (orphaned by backend IP rotation), or
+ * - it has accumulated {@value #FAILURE_UNAVAILABLE_THRESHOLD} consecutive RPC failures
+ * (backend persistently 5xx / unreachable). The counter is reset by every successful
+ * RPC so transient failures never cross the threshold.
+ *
+ *
+ * All long-connection state ({@link #lastUsedNanos}, {@link #consecutiveFailures}) uses
+ * {@code volatile} / {@link AtomicInteger} primitives — safe to read/write concurrently from
+ * any number of business threads with no lock.
*/
public class HttpRpcClient extends AbstractRpcClient {
private static final Logger logger = LoggerFactory.getLogger(HttpRpcClient.class);
+ /**
+ * Validate a pooled connection before reuse if it has been idle for at least this many
+ * milliseconds. Cheap heuristic that catches most server-side half-closed keep-alive sockets.
+ */
+ private static final int VALIDATE_AFTER_INACTIVITY_MS = 2000;
+ /**
+ * Evict pooled connections that have been idle for longer than this duration.
+ */
+ private static final long EVICT_IDLE_CONNECTIONS_SECONDS = 60L;
+ /**
+ * Fallback {@code Keep-Alive} duration applied client-side when the server response omits
+ * a {@code Keep-Alive: timeout=N} hint. Picked to be shorter than typical NAT / LB idle
+ * timers (5–15 minutes) so we never hold a connection past the point where some hop on
+ * the path has silently dropped it.
+ */
+ private static final int FALLBACK_KEEPALIVE_MINUTES = 5;
+ /**
+ * Hard upper bound on a single connection's lifetime. Any pooled connection older than this
+ * is discarded on next checkout, regardless of activity. This is the recovery mechanism for
+ * backend IP rotation (K8s pod drift) — without it a hot connection routed to an old pod
+ * could survive indefinitely.
+ */
+ private static final int CONNECTION_TTL_MINUTES = 10;
+ /**
+ * If this client has not been used by any RPC for longer than this window, the periodic
+ * health observer in {@code RpcClusterClientManager} will treat it as unavailable. After
+ * a few consecutive unavailable observations the client gets closed and evicted from the
+ * cluster cache, which is how we reclaim {@link HttpRpcClient} instances orphaned by backend
+ * IP rotation. The window is intentionally large so that any actively-used client is never
+ * affected.
+ */
+ static final int IDLE_UNAVAILABLE_THRESHOLD_MINUTES = 10;
+ private static final long IDLE_UNAVAILABLE_THRESHOLD_NANOS =
+ TimeUnit.MINUTES.toNanos(IDLE_UNAVAILABLE_THRESHOLD_MINUTES);
+ /**
+ * Number of consecutive failed RPCs that flips this client to unavailable. The counter is
+ * reset to 0 on the next successful RPC, so transient blips never cross the threshold —
+ * only sustained failure (e.g. backend 5xx storm, unreachable IP) does.
+ */
+ static final int FAILURE_UNAVAILABLE_THRESHOLD = 50;
+
private CloseableHttpClient httpClient;
+ /**
+ * Timestamp ({@link System#nanoTime()}) of the most recent RPC sent through this client.
+ * Updated by {@link HttpConsumerInvoker} on each send. {@code volatile} for safe lock-free
+ * publication to the health-observer thread.
+ */
+ private volatile long lastUsedNanos = System.nanoTime();
+ /**
+ * Number of consecutive failed RPCs since the last success. Bumped by
+ * {@link #markFailure()} and zeroed by {@link #markSuccess()}. Lock-free — concurrent
+ * RPC threads never serialize on this counter.
+ */
+ private final AtomicInteger consecutiveFailures = new AtomicInteger(0);
public HttpRpcClient(ProtocolConfig config) {
setConfig(config);
@@ -44,7 +137,54 @@ protected void doOpen() {
// If there is only one route, the maximum number of connections for a single route is the same
// as the maximum number of connections for the entire connection pool.
cm.setDefaultMaxPerRoute(maxConns);
- httpClient = HttpClients.custom().setConnectionManager(cm).build();
+ // Re-validate idle pooled connections before reuse so we do not send a request through a
+ // socket the server has already half-closed.
+ cm.setValidateAfterInactivity(VALIDATE_AFTER_INACTIVITY_MS);
+ httpClient = HttpClients.custom()
+ .setConnectionManager(cm)
+ // Background eviction of stale & long-idle connections; keeps the pool tidy in
+ // long-running processes without affecting hot connections.
+ .evictExpiredConnections()
+ .evictIdleConnections(EVICT_IDLE_CONNECTIONS_SECONDS, TimeUnit.SECONDS)
+ // Cap server-suggested keep-alive at FALLBACK_KEEPALIVE_MINUTES. When the server
+ // omits a Keep-Alive header HttpClient defaults to "infinite" — that loses to any
+ // intermediary NAT / LB silently dropping idle sockets, manifesting as the
+ // dreaded NoHttpResponseException on the next request.
+ .setKeepAliveStrategy(HttpRpcClient::resolveKeepAliveDuration)
+ // Hard ceiling: every connection forcibly recycled after CONNECTION_TTL_MINUTES.
+ // Recovers from backend IP rotation in bounded time even if the connection stays
+ // hot.
+ .setConnectionTimeToLive(CONNECTION_TTL_MINUTES, TimeUnit.MINUTES)
+ .build();
+ }
+
+ /**
+ * Capped keep-alive duration: prefer the {@code Keep-Alive: timeout=N} hint from the server
+ * (clamped at {@value #FALLBACK_KEEPALIVE_MINUTES}min) and fall back to that ceiling when
+ * the hint is missing, malformed or non-positive (HttpClient reads a duration of 0 or less
+ * as "keep alive indefinitely", which would defeat the cap). Public so tests can call it.
+ *
+ * @param response the inbound HTTP response (only the header is read)
+ * @param context unused, present to satisfy {@code ConnectionKeepAliveStrategy}
+ * @return the keep-alive duration in milliseconds, always positive and at most the ceiling
+ */
+ public static long resolveKeepAliveDuration(org.apache.http.HttpResponse response,
+ org.apache.http.protocol.HttpContext context) {
+ long fallbackMs = TimeUnit.MINUTES.toMillis(FALLBACK_KEEPALIVE_MINUTES);
+ org.apache.http.Header h = response.getFirstHeader("Keep-Alive");
+ if (h != null) {
+ for (org.apache.http.HeaderElement el : h.getElements()) {
+ if ("timeout".equalsIgnoreCase(el.getName()) && el.getValue() != null) {
+ try {
+ long serverMs = TimeUnit.SECONDS.toMillis(Long.parseLong(el.getValue()));
+ return serverMs > 0 ? Math.min(serverMs, fallbackMs) : fallbackMs;
+ } catch (NumberFormatException ignore) {
+ // fall through to the fallback
+ }
+ }
+ }
+ }
+ return fallbackMs;
}
@Override
@@ -64,7 +204,62 @@ public ConsumerInvoker createInvoker(ConsumerConfig consumerConfig) {
return new HttpConsumerInvoker<>(this, consumerConfig, protocolConfig);
}
+ /**
+ * Record that this client just served (or is about to serve) an RPC. Called by
+ * {@link HttpConsumerInvoker} on every request entry.
+ */
+ public void markUsed() {
+ lastUsedNanos = System.nanoTime();
+ }
+
+ /**
+ * Record a successful RPC. Resets {@link #consecutiveFailures} to zero so that an isolated
+ * earlier failure does not contribute toward eviction.
+ */
+ public void markSuccess() {
+ consecutiveFailures.set(0);
+ }
+
+ /**
+ * Record a failed RPC (either an exception during {@code execute} or a non-2xx response).
+ * After {@value #FAILURE_UNAVAILABLE_THRESHOLD} consecutive failures the client reports
+ * unavailable so the cluster manager can evict it; one success in between resets the
+ * counter and keeps the client cached.
+ */
+ public void markFailure() {
+ consecutiveFailures.incrementAndGet();
+ }
+
+ /**
+ * Reports the client as unavailable if any of the following holds:
+ *
+ * - the underlying lifecycle is no longer started (closed / failed);
+ * - no RPC has been sent through this client for longer than
+ * {@value #IDLE_UNAVAILABLE_THRESHOLD_MINUTES} minutes (orphaned client);
+ * - at least {@value #FAILURE_UNAVAILABLE_THRESHOLD} consecutive failures have
+ * accumulated since the last success (sustained backend outage).
+ *
+ * For an actively used, healthy client this method always returns {@code true}.
+ */
+ @Override
+ public boolean isAvailable() {
+ if (!super.isAvailable()) {
+ return false;
+ }
+ if (consecutiveFailures.get() >= FAILURE_UNAVAILABLE_THRESHOLD) {
+ return false;
+ }
+ return (System.nanoTime() - lastUsedNanos) <= IDLE_UNAVAILABLE_THRESHOLD_NANOS;
+ }
+
public CloseableHttpClient getHttpClient() {
return httpClient;
}
+
+ /**
+ * Visible for tests / observability: current consecutive-failure counter snapshot.
+ */
+ public int getConsecutiveFailures() {
+ return consecutiveFailures.get();
+ }
}
diff --git a/trpc-proto/trpc-proto-http/src/test/java/com/tencent/trpc/proto/http/HttpMultiPortNamingUrlConcurrentTest.java b/trpc-proto/trpc-proto-http/src/test/java/com/tencent/trpc/proto/http/HttpMultiPortNamingUrlConcurrentTest.java
new file mode 100644
index 000000000..dfd3ff03a
--- /dev/null
+++ b/trpc-proto/trpc-proto-http/src/test/java/com/tencent/trpc/proto/http/HttpMultiPortNamingUrlConcurrentTest.java
@@ -0,0 +1,273 @@
+/*
+ * Tencent is pleased to support the open source community by making tRPC available.
+ *
+ * Copyright (C) 2023 Tencent.
+ * All rights reserved.
+ *
+ * If you have downloaded a copy of the tRPC source code from Tencent,
+ * please note that tRPC source code is licensed under the Apache 2.0 License,
+ * A copy of the Apache 2.0 License can be found in the LICENSE file.
+ */
+
+package com.tencent.trpc.proto.http;
+
+import static com.tencent.trpc.transport.http.common.Constants.HTTP_SCHEME;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+import com.tencent.trpc.core.common.ConfigManager;
+import com.tencent.trpc.core.common.config.BackendConfig;
+import com.tencent.trpc.core.common.config.ConsumerConfig;
+import com.tencent.trpc.core.common.config.ProviderConfig;
+import com.tencent.trpc.core.common.config.ServerConfig;
+import com.tencent.trpc.core.common.config.ServiceConfig;
+import com.tencent.trpc.core.rpc.RpcClientContext;
+import com.tencent.trpc.core.rpc.RpcContext;
+import com.tencent.trpc.core.utils.NetUtils;
+import com.tencent.trpc.proto.http.client.AbstractConsumerInvoker;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
+import org.junit.AfterClass;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import tests.service.GreeterService;
+import tests.service.HelloRequestProtocol.HelloRequest;
+import tests.service.HelloRequestProtocol.HelloResponse;
+import tests.service.TestBeanConvertWithGetMethodReq;
+import tests.service.TestBeanConvertWithGetMethodRsp;
+
+/**
+ * HTTP-protocol counterpart of the tRPC concurrent multi-port test in {@code trpc-proto-standard}.
+ *
+ * Setup: 10 standalone HTTP servers (jetty) on consecutive ports backed by
+ * {@link PortAwareGreeterServiceImpl} that tags each response with its own listening port.
+ * One shared {@link BackendConfig} lists all 10 endpoints in the comma-separated namingUrl;
+ * 100 concurrent threads × 1000 requests each fan-out via {@code ip://} random load balance.
+ *
+ * Final assertions:
+ *
+ * - every request succeeds and the echoed message matches the request payload,
+ * - every backend port is hit at least once,
+ * - distribution roughly balanced — random over N=10 with R=100000 trials,
+ * expected 10000 / port; tolerated range {@code [2000, 20000]}, far exceeding any
+ * realistic random outlier.
+ *
+ *
+ */
+public class HttpMultiPortNamingUrlConcurrentTest {
+
+ private static final int BASE_PORT = 18500;
+ private static final int SERVER_COUNT = 10;
+ private static final int THREAD_COUNT = 100;
+ private static final int CYCLE_PER_THREAD = 1000;
+
+ private static final int REQUEST_TIMEOUT_MS = 60_000;
+ private static final int MAX_CONNECTIONS = 20480;
+
+ private static ServerConfig serverConfig;
+
+ /**
+ * Spin up {@value #SERVER_COUNT} HTTP providers on contiguous ports. Each provider returns
+ * its own port number so the concurrent test can assert that requests are dispatched across
+ * every endpoint resolved from the multi-port {@code ip://} naming URL.
+ */
+ @BeforeClass
+ public static void startHttpServers() {
+ ConfigManager.stopTest();
+ ConfigManager.startTest();
+
+ HashMap<String, ServiceConfig> providers = new HashMap<>();
+ for (int i = 0; i < SERVER_COUNT; i++) {
+ int port = BASE_PORT + i;
+ ProviderConfig<GreeterService> pc = new ProviderConfig<>();
+ pc.setServiceInterface(GreeterService.class);
+ pc.setRef(new PortAwareGreeterServiceImpl(port));
+
+ ServiceConfig serviceConfig = new ServiceConfig();
+ serviceConfig.setName("multi-port-server-" + port);
+ serviceConfig.getProviderConfigs().add(pc);
+ serviceConfig.setIp(NetUtils.LOCAL_HOST);
+ serviceConfig.setPort(port);
+ serviceConfig.setProtocol(HTTP_SCHEME);
+ serviceConfig.setTransporter("jetty");
+ providers.put(serviceConfig.getName(), serviceConfig);
+ }
+
+ ServerConfig sc = new ServerConfig();
+ sc.setServiceMap(providers);
+ sc.setApp("http-multi-port-test");
+ sc.setLocalIp(NetUtils.LOCAL_HOST);
+ sc.init();
+ serverConfig = sc;
+ }
+
+ @AfterClass
+ public static void stopHttpServers() {
+ if (serverConfig != null) {
+ serverConfig.stop();
+ serverConfig = null;
+ }
+ ConfigManager.stopTest();
+ }
+
+ /**
+ * Reset the static {@code AbstractConsumerInvoker.TIMEOUT_MANAGER} so the {@code
+ * HashedWheelTimer} is fresh — sibling tests in the same surefire run may have
+ * stopped it via {@code ConfigManager.stopTest()}.
+ */
+ @Before
+ public void resetTimeoutManager() {
+ AbstractConsumerInvoker.reset();
+ }
+
+    /**
+     * Sends {@code THREAD_COUNT * CYCLE_PER_THREAD} HTTP calls through one proxy whose naming
+     * URL lists {@code SERVER_COUNT} ports, then checks how the responses were spread across
+     * those ports.
+     *
+     * <p>Each worker thread verifies that a response echoes its request payload and carries a
+     * {@code |port=} suffix naming a known port. The main thread then asserts that every
+     * worker succeeded, that every port was hit, that each per-port count lies in
+     * {@code [2000, 20000]}, and that the counts add up to the number of requests sent.
+     *
+     * @throws InterruptedException if the main thread is interrupted while waiting for workers
+     */
+    @Test
+    public void testHttpMultiPortNamingUrlConcurrent() throws InterruptedException {
+        // Build "ip://host:p1,host:p2,..." with one entry per server port.
+        StringBuilder urlBuilder = new StringBuilder("ip://");
+        for (int i = 0; i < SERVER_COUNT; i++) {
+            if (i > 0) {
+                urlBuilder.append(',');
+            }
+            urlBuilder.append(NetUtils.LOCAL_HOST).append(':').append(BASE_PORT + i);
+        }
+        String namingUrl = urlBuilder.toString();
+
+        BackendConfig backendConfig = new BackendConfig();
+        backendConfig.setName("http-multi-port-client");
+        backendConfig.setNamingUrl(namingUrl);
+        backendConfig.setProtocol("http");
+        backendConfig.setRequestTimeout(REQUEST_TIMEOUT_MS);
+        backendConfig.setMaxConns(MAX_CONNECTIONS);
+        backendConfig.setConnsPerAddr(2);
+        backendConfig.setKeepAlive(true);
+
+        ConsumerConfig<GreeterService> consumerConfig = new ConsumerConfig<>();
+        consumerConfig.setServiceInterface(GreeterService.class);
+        consumerConfig.setBackendConfig(backendConfig);
+
+        try {
+            final GreeterService proxy = consumerConfig.getProxy();
+
+            // Per-port hit counter aggregated across all worker threads.
+            ConcurrentHashMap<Integer, AtomicInteger> portHits = new ConcurrentHashMap<>();
+            for (int i = 0; i < SERVER_COUNT; i++) {
+                portHits.put(BASE_PORT + i, new AtomicInteger(0));
+            }
+
+            CountDownLatch latch = new CountDownLatch(THREAD_COUNT);
+            TestResult[] results = new TestResult[THREAD_COUNT];
+            for (int t = 0; t < THREAD_COUNT; t++) {
+                final TestResult r = new TestResult();
+                results[t] = r;
+                final int threadIndex = t;
+                new Thread(() -> {
+                    try {
+                        for (int i = 0; i < CYCLE_PER_THREAD; i++) {
+                            String reqPayload = "req-" + threadIndex + "-" + i;
+                            RpcClientContext ctx = new RpcClientContext();
+                            HelloResponse rsp = proxy.sayHello(ctx, HelloRequest.newBuilder()
+                                    .setMessage(reqPayload)
+                                    .build());
+                            assertNotNull("response must not be null", rsp);
+                            String message = rsp.getMessage();
+                            // Server returns "<request message>|port=<port>".
+                            int sep = message.lastIndexOf("|port=");
+                            assertTrue("response missing port marker: " + message, sep > 0);
+                            String echoed = message.substring(0, sep);
+                            int port = Integer.parseInt(message.substring(sep + "|port=".length()));
+                            assertEquals("echoed payload must match request", reqPayload, echoed);
+                            AtomicInteger counter = portHits.get(port);
+                            assertNotNull("response from unexpected port: " + port, counter);
+                            counter.incrementAndGet();
+                        }
+                        r.succ = true;
+                    } catch (Throwable ex) {
+                        // Assertion errors raised on a worker thread never reach JUnit directly;
+                        // record them so the main thread can fail the test.
+                        r.succ = false;
+                        r.ex = ex;
+                        ex.printStackTrace();
+                    } finally {
+                        latch.countDown();
+                    }
+                }, "http-concurrent-caller-" + t).start();
+            }
+
+            // Upper bound of 300 seconds for all workers to finish.
+            boolean done = latch.await(300, TimeUnit.SECONDS);
+            assertTrue("concurrent calls timed out before completion", done);
+
+            for (int i = 0; i < results.length; i++) {
+                TestResult r = results[i];
+                assertTrue("worker thread " + i + " failed: "
+                        + (r.ex == null ? "" : r.ex.toString()), r.succ);
+            }
+
+            // ---- aggregate assertions ----
+            int totalRequests = THREAD_COUNT * CYCLE_PER_THREAD;
+            int sum = 0;
+            Set<Integer> hitPorts = new HashSet<>();
+            for (int i = 0; i < SERVER_COUNT; i++) {
+                int port = BASE_PORT + i;
+                int hits = portHits.get(port).get();
+                sum += hits;
+                if (hits > 0) {
+                    hitPorts.add(port);
+                }
+                // Every port must be hit, and each per-port count must stay within [2000, 20000].
+                assertTrue("port " + port + " never received a request", hits > 0);
+                assertTrue("port " + port + " too few hits: " + hits, hits >= 2000);
+                assertTrue("port " + port + " too many hits: " + hits, hits <= 20000);
+            }
+            assertEquals("total responses should equal total requests", totalRequests, sum);
+            assertEquals("all 10 backend ports must be hit", SERVER_COUNT, hitPorts.size());
+        } finally {
+            backendConfig.stop();
+        }
+    }
+
+    /**
+     * {@code GreeterService} implementation tied to one port number: {@code sayHello} appends
+     * {@code "|port=" + port} to the request message so that a caller can tell which instance
+     * produced a given reply.
+     */
+    private static class PortAwareGreeterServiceImpl implements GreeterService {
+
+        /** Port number appended to every {@code sayHello} reply. */
+        private final int port;
+
+        PortAwareGreeterServiceImpl(int port) {
+            this.port = port;
+        }
+
+        /** Returns the request message followed by {@code |port=} and this instance's port. */
+        @Override
+        public HelloResponse sayHello(RpcContext context, HelloRequest request) {
+            String tagged = request.getMessage() + "|port=" + port;
+            return HelloResponse.newBuilder().setMessage(tagged).build();
+        }
+
+        /** Always returns the empty string. */
+        @Override
+        public String sayBlankHello(RpcContext context, HelloRequest request) {
+            return "";
+        }
+
+        /** Copies message, status and comments from the request into a new response bean. */
+        @Override
+        public TestBeanConvertWithGetMethodRsp sayHelloNonPbType(RpcContext context,
+                TestBeanConvertWithGetMethodReq request) {
+            return new TestBeanConvertWithGetMethodRsp(
+                    request.getMessage(),
+                    request.getStatus(),
+                    request.getComments());
+        }
+    }
+
+ /** Outcome of one worker thread: filled in by the worker, read by the test afterwards. */
+ private static class TestResult {
+
+ /** {@code true} once the worker completed all its calls; {@code false} if it caught a throwable. */
+ boolean succ;
+ /** The throwable caught by the worker, or {@code null} if none was caught. */
+ Throwable ex;
+ }
+}
diff --git a/trpc-proto/trpc-proto-http/src/test/java/com/tencent/trpc/proto/http/HttpRpcClientLongLinkTest.java b/trpc-proto/trpc-proto-http/src/test/java/com/tencent/trpc/proto/http/HttpRpcClientLongLinkTest.java
new file mode 100644
index 000000000..3c7bcc643
--- /dev/null
+++ b/trpc-proto/trpc-proto-http/src/test/java/com/tencent/trpc/proto/http/HttpRpcClientLongLinkTest.java
@@ -0,0 +1,497 @@
+/*
+ * Tencent is pleased to support the open source community by making tRPC available.
+ *
+ * Copyright (C) 2023 Tencent.
+ * All rights reserved.
+ *
+ * If you have downloaded a copy of the tRPC source code from Tencent,
+ * please note that tRPC source code is licensed under the Apache 2.0 License,
+ * A copy of the Apache 2.0 License can be found in the LICENSE file.
+ */
+
+package com.tencent.trpc.proto.http;
+
+import com.tencent.trpc.core.common.config.ProtocolConfig;
+import com.tencent.trpc.core.exception.TRpcException;
+import com.tencent.trpc.proto.http.client.Http2RpcClient;
+import com.tencent.trpc.proto.http.client.Http2cRpcClient;
+import com.tencent.trpc.proto.http.client.HttpRpcClient;
+import com.tencent.trpc.proto.http.client.HttpsRpcClient;
+import java.io.IOException;
+import java.lang.reflect.Field;
+import java.util.concurrent.TimeUnit;
+import org.apache.hc.client5.http.impl.async.CloseableHttpAsyncClient;
+import org.apache.http.Header;
+import org.apache.http.HeaderElement;
+import org.apache.http.HttpResponse;
+import org.apache.http.impl.client.CloseableHttpClient;
+import org.apache.http.protocol.HttpContext;
+import org.junit.Assert;
+import org.junit.Test;
+import org.mockito.Mockito;
+
+/**
+ * Pure unit tests for the long-connection extensions on the HTTP RpcClient hierarchy:
+ * {@code markUsed}, the overridden {@code isAvailable}, and the idle-threshold heuristic.
+ *
+ * No Jetty server is started; we manipulate the internal {@code lastUsedNanos} field via
+ * reflection to drive the idle/active branches.
+ */
+public class HttpRpcClientLongLinkTest {
+
+    /**
+     * Eleven minutes expressed in nanoseconds; subtracted from {@code System.nanoTime()} to
+     * backdate a client's {@code lastUsedNanos} field.
+     */
+    private static final long ELEVEN_MINUTES_NANOS = TimeUnit.MINUTES.toNanos(11);
+
+ /**
+ * HttpRpcClient.isAvailable returns false when not started (lifecycle.isStarted() == false),
+ * regardless of lastUsedNanos.
+ */
+ @Test
+ public void testHttpRpcClientNotAvailableBeforeOpen() {
+ ProtocolConfig pc = newProtocolConfig();
+ HttpRpcClient client = new HttpRpcClient(pc);
+ // Lifecycle has not been started, so super.isAvailable() returns false.
+ Assert.assertFalse(client.isAvailable());
+ }
+
+ /**
+ * HttpRpcClient.isAvailable returns true when started AND recently used.
+ */
+ @Test
+ public void testHttpRpcClientAvailableWhenStartedAndFresh() {
+ ProtocolConfig pc = newProtocolConfig();
+ HttpRpcClient client = new HttpRpcClient(pc);
+ client.open();
+ try {
+ client.markUsed(); // refresh
+ Assert.assertTrue(client.isAvailable());
+ } finally {
+ client.close();
+ }
+ }
+
+ /**
+ * HttpRpcClient.isAvailable returns false when started but idle > 10 min.
+ */
+ @Test
+ public void testHttpRpcClientNotAvailableWhenIdleTooLong() throws Exception {
+ ProtocolConfig pc = newProtocolConfig();
+ HttpRpcClient client = new HttpRpcClient(pc);
+ client.open();
+ try {
+ // Force lastUsedNanos to "11 minutes ago".
+ setField(client, "lastUsedNanos", System.nanoTime() - ELEVEN_MINUTES_NANOS);
+ Assert.assertFalse(client.isAvailable());
+
+ // Recovering via markUsed restores availability.
+ client.markUsed();
+ Assert.assertTrue(client.isAvailable());
+ } finally {
+ client.close();
+ }
+ }
+
+ /**
+ * Http2cRpcClient mirrors the same logic.
+ */
+ @Test
+ public void testHttp2cRpcClientNotAvailableBeforeOpen() {
+ ProtocolConfig pc = newProtocolConfig();
+ Http2cRpcClient client = new Http2cRpcClient(pc);
+ Assert.assertFalse(client.isAvailable());
+ }
+
+ @Test
+ public void testHttp2cRpcClientAvailableWhenStartedAndFresh() {
+ ProtocolConfig pc = newProtocolConfig();
+ Http2cRpcClient client = new Http2cRpcClient(pc);
+ client.open();
+ try {
+ client.markUsed();
+ Assert.assertTrue(client.isAvailable());
+ } finally {
+ client.close();
+ }
+ }
+
+ @Test
+ public void testHttp2cRpcClientNotAvailableWhenIdleTooLong() throws Exception {
+ ProtocolConfig pc = newProtocolConfig();
+ Http2cRpcClient client = new Http2cRpcClient(pc);
+ client.open();
+ try {
+ setField(client, "lastUsedNanos", System.nanoTime() - ELEVEN_MINUTES_NANOS);
+ Assert.assertFalse(client.isAvailable());
+ client.markUsed();
+ Assert.assertTrue(client.isAvailable());
+ } finally {
+ client.close();
+ }
+ }
+
+    /**
+     * After {@code open()} the client must expose a non-null {@code getHttpClient()}; after
+     * {@code close()} it must report {@code isClosed()}, and a second {@code close()} must not
+     * throw.
+     */
+    @Test
+    public void testHttpRpcClientOpenCloseReleasesResources() {
+        HttpRpcClient client = new HttpRpcClient(newProtocolConfig());
+        client.open();
+        try {
+            Assert.assertNotNull(client.getHttpClient());
+        } finally {
+            // Close even when the assertion above fails, so the opened client is not leaked.
+            client.close();
+        }
+        Assert.assertTrue(client.isClosed());
+        // Closing an already closed client must be safe.
+        client.close();
+    }
+
+    /**
+     * After {@code open()} the client must expose a non-null {@code getHttpAsyncClient()};
+     * after {@code close()} it must report {@code isClosed()}, and a second {@code close()}
+     * must not throw.
+     */
+    @Test
+    public void testHttp2cRpcClientOpenCloseReleasesResources() {
+        Http2cRpcClient client = new Http2cRpcClient(newProtocolConfig());
+        client.open();
+        try {
+            Assert.assertNotNull(client.getHttpAsyncClient());
+        } finally {
+            // Close even when the assertion above fails, so the opened client is not leaked.
+            client.close();
+        }
+        Assert.assertTrue(client.isClosed());
+        // Closing an already closed client must be safe.
+        client.close();
+    }
+
+ /**
+ * Http2RpcClient inherits markUsed/isAvailable from Http2cRpcClient. We don't need a real
+ * TLS context — we only verify the inherited methods behave the same on the subclass.
+ * Use reflection to flip lifecycle state to STARTED so that super.isAvailable() returns true.
+ */
+ @Test
+ public void testHttp2RpcClientInheritsIdleHeuristic() throws Exception {
+ ProtocolConfig pc = newProtocolConfig();
+ Http2RpcClient client = new Http2RpcClient(pc);
+ forceLifecycleStarted(client);
+ try {
+ client.markUsed();
+ Assert.assertTrue(client.isAvailable());
+ setField(client, "lastUsedNanos", System.nanoTime() - ELEVEN_MINUTES_NANOS);
+ Assert.assertFalse(client.isAvailable());
+ } finally {
+ forceLifecycleClosed(client);
+ }
+ }
+
+ /**
+ * HttpsRpcClient extends Http2RpcClient. Same inheritance check.
+ */
+ @Test
+ public void testHttpsRpcClientInheritsIdleHeuristic() throws Exception {
+ ProtocolConfig pc = newProtocolConfig();
+ HttpsRpcClient client = new HttpsRpcClient(pc);
+ forceLifecycleStarted(client);
+ try {
+ client.markUsed();
+ Assert.assertTrue(client.isAvailable());
+ setField(client, "lastUsedNanos", System.nanoTime() - ELEVEN_MINUTES_NANOS);
+ Assert.assertFalse(client.isAvailable());
+ } finally {
+ forceLifecycleClosed(client);
+ }
+ }
+
+    /**
+     * Consecutive {@code markFailure()} calls: the client must still be available after 49,
+     * become unavailable on the 50th even though it was just marked used, and recover (with
+     * the counter back at 0) after one {@code markSuccess()}.
+     */
+    @Test
+    public void testHttpRpcClientUnavailableOnConsecutiveFailures() {
+        HttpRpcClient client = new HttpRpcClient(newProtocolConfig());
+        client.open();
+        try {
+            client.markUsed();
+            final int oneBelowFlip = 49;
+            for (int failure = 1; failure <= oneBelowFlip; failure++) {
+                client.markFailure();
+            }
+            // 49 failures: still available.
+            Assert.assertTrue(client.isAvailable());
+
+            // The 50th failure flips availability.
+            client.markFailure();
+            Assert.assertFalse(client.isAvailable());
+
+            // A single success resets the counter and restores availability.
+            client.markSuccess();
+            Assert.assertTrue(client.isAvailable());
+            Assert.assertEquals(0, client.getConsecutiveFailures());
+        } finally {
+            client.close();
+        }
+    }
+
+    /**
+     * Same consecutive-failure check on {@code Http2cRpcClient}: available after 49
+     * {@code markFailure()} calls, unavailable after the 50th, and available again with a zero
+     * counter after {@code markSuccess()}.
+     */
+    @Test
+    public void testHttp2cRpcClientUnavailableOnConsecutiveFailures() {
+        Http2cRpcClient client = new Http2cRpcClient(newProtocolConfig());
+        client.open();
+        try {
+            client.markUsed();
+            final int oneBelowFlip = 49;
+            for (int failure = 1; failure <= oneBelowFlip; failure++) {
+                client.markFailure();
+            }
+            Assert.assertTrue(client.isAvailable());
+
+            client.markFailure();
+            Assert.assertFalse(client.isAvailable());
+
+            client.markSuccess();
+            Assert.assertTrue(client.isAvailable());
+            Assert.assertEquals(0, client.getConsecutiveFailures());
+        } finally {
+            client.close();
+        }
+    }
+
+ /**
+ * Concurrent markFailure / markSuccess from many business threads must converge to a
+ * deterministic terminal state — the AtomicInteger contract guarantees no lost updates.
+ */
+ @Test
+ public void testHttpRpcClientConsecutiveFailureCounterIsThreadSafe() throws Exception {
+ final ProtocolConfig pc = newProtocolConfig();
+ final HttpRpcClient client = new HttpRpcClient(pc);
+ client.open();
+ try {
+ final int threads = 32;
+ final int incrementsPerThread = 1000;
+ java.util.concurrent.CountDownLatch start = new java.util.concurrent.CountDownLatch(1);
+ java.util.concurrent.CountDownLatch done = new java.util.concurrent.CountDownLatch(threads);
+ for (int i = 0; i < threads; i++) {
+ new Thread(() -> {
+ try {
+ start.await();
+ for (int j = 0; j < incrementsPerThread; j++) {
+ client.markFailure();
+ }
+ } catch (InterruptedException ignore) {
+ Thread.currentThread().interrupt();
+ } finally {
+ done.countDown();
+ }
+ }).start();
+ }
+ start.countDown();
+ Assert.assertTrue(done.await(10, java.util.concurrent.TimeUnit.SECONDS));
+ Assert.assertEquals(threads * incrementsPerThread, client.getConsecutiveFailures());
+ client.markSuccess();
+ Assert.assertEquals(0, client.getConsecutiveFailures());
+ } finally {
+ client.close();
+ }
+ }
+
+    /**
+     * With no {@code Keep-Alive} header on the response,
+     * {@code HttpRpcClient.resolveKeepAliveDuration} must return five minutes in milliseconds.
+     */
+    @Test
+    public void testResolveKeepAliveDurationNoHeader() {
+        HttpResponse headerless = Mockito.mock(HttpResponse.class);
+        Mockito.when(headerless.getFirstHeader("Keep-Alive")).thenReturn(null);
+
+        long actual = HttpRpcClient.resolveKeepAliveDuration(headerless, null);
+
+        Assert.assertEquals(TimeUnit.MINUTES.toMillis(5), actual);
+    }
+
+ /**
+ * KeepAliveStrategy: server returned {@code Keep-Alive: timeout=120} — use server hint
+ * because it's smaller than the fallback ceiling.
+ */
+ @Test
+ public void testResolveKeepAliveDurationServerHintSmaller() {
+ HttpResponse rsp = mockKeepAliveHeader("timeout", "120");
+ // 120s = 120_000ms, fallback = 5min = 300_000ms → use 120_000.
+ Assert.assertEquals(120_000L, HttpRpcClient.resolveKeepAliveDuration(rsp, null));
+ }
+
+    /**
+     * With {@code Keep-Alive: timeout=3600}, the server hint exceeds five minutes and the
+     * result must be clamped to five minutes in milliseconds.
+     */
+    @Test
+    public void testResolveKeepAliveDurationServerHintLargerClamped() {
+        HttpResponse hinted = mockKeepAliveHeader("timeout", "3600");
+
+        long actual = HttpRpcClient.resolveKeepAliveDuration(hinted, null);
+
+        Assert.assertEquals(TimeUnit.MINUTES.toMillis(5), actual);
+    }
+
+    /**
+     * With a non-numeric hint ({@code Keep-Alive: timeout=abc}) the call must not throw and
+     * must return five minutes in milliseconds.
+     */
+    @Test
+    public void testResolveKeepAliveDurationMalformedTimeoutFallsBack() {
+        HttpResponse malformed = mockKeepAliveHeader("timeout", "abc");
+
+        long actual = HttpRpcClient.resolveKeepAliveDuration(malformed, null);
+
+        Assert.assertEquals(TimeUnit.MINUTES.toMillis(5), actual);
+    }
+
+    /**
+     * With a header that carries only {@code max=100} and no {@code timeout} element, the
+     * result must be five minutes in milliseconds.
+     */
+    @Test
+    public void testResolveKeepAliveDurationOtherKeyOnly() {
+        HttpResponse maxOnly = mockKeepAliveHeader("max", "100");
+
+        long actual = HttpRpcClient.resolveKeepAliveDuration(maxOnly, null);
+
+        Assert.assertEquals(TimeUnit.MINUTES.toMillis(5), actual);
+    }
+
+    /**
+     * {@code close()} must not propagate an {@code IOException} thrown by the underlying
+     * {@code httpClient.close()}, and the client must still end up {@code isClosed()}.
+     *
+     * <p>The {@code httpClient} field is replaced by a mock whose {@code close()} throws. The
+     * client instance created by {@code open()} is remembered first and closed in a
+     * {@code finally} block, because {@code client.close()} can no longer reach it once the
+     * field has been overwritten.
+     */
+    @Test
+    public void testHttpRpcClientDoCloseSwallowsIoException() throws Exception {
+        HttpRpcClient client = new HttpRpcClient(newProtocolConfig());
+        client.open();
+        Object liveHttpClient = client.getHttpClient();
+        try {
+            // Replace the live httpClient with a mock that throws IOException on close.
+            CloseableHttpClient throwing = Mockito.mock(CloseableHttpClient.class);
+            Mockito.doThrow(new IOException("boom")).when(throwing).close();
+            setField(client, "httpClient", throwing);
+            // Must not propagate.
+            client.close();
+            Assert.assertTrue(client.isClosed());
+        } finally {
+            // Release the real client that the mock displaced.
+            if (liveHttpClient instanceof java.io.Closeable) {
+                ((java.io.Closeable) liveHttpClient).close();
+            }
+        }
+    }
+
+    /**
+     * Same check on {@code Http2cRpcClient}: {@code close()} must not propagate an
+     * {@code IOException} thrown by the async client's {@code close()}.
+     *
+     * <p>The async client created by {@code open()} is remembered before the
+     * {@code httpAsyncClient} field is overwritten with the throwing mock, and is closed in a
+     * {@code finally} block so that it is not leaked.
+     */
+    @Test
+    public void testHttp2cRpcClientDoCloseSwallowsIoException() throws Exception {
+        Http2cRpcClient client = new Http2cRpcClient(newProtocolConfig());
+        client.open();
+        Object liveAsyncClient = client.getHttpAsyncClient();
+        try {
+            CloseableHttpAsyncClient throwing = Mockito.mock(CloseableHttpAsyncClient.class);
+            Mockito.doThrow(new IOException("boom")).when(throwing).close();
+            setField(client, "httpAsyncClient", throwing);
+            // Must not propagate.
+            client.close();
+            Assert.assertTrue(client.isClosed());
+        } finally {
+            // Release the real async client that the mock displaced.
+            if (liveAsyncClient instanceof java.io.Closeable) {
+                ((java.io.Closeable) liveAsyncClient).close();
+            }
+        }
+    }
+
+ /**
+ * Http2c doOpen must surface initialisation failure as a TRpcException so the lifecycle
+ * moves to FAILED instead of leaving a half-built client cached. Drive the catch branch
+ * by reflectively invoking {@code doOpen()} with a null protocolConfig — the builder NPEs
+ * on {@code maxConns}, the catch wraps it.
+ */
+ @Test
+ public void testHttp2cRpcClientDoOpenSurfacesFailure() throws Exception {
+ ProtocolConfig pc = newProtocolConfig();
+ Http2cRpcClient client = new Http2cRpcClient(pc);
+ // Null out protocolConfig so doOpen()'s protocolConfig.getMaxConns() NPEs inside the
+ // try-block, exercising the catch → TRpcException branch.
+ setField(client, "protocolConfig", null);
+ try {
+ // Direct doOpen reflection bypasses the lifecycle's pre-flight null check.
+ java.lang.reflect.Method m = Http2cRpcClient.class.getDeclaredMethod("doOpen");
+ m.setAccessible(true);
+ try {
+ m.invoke(client);
+ Assert.fail("doOpen must surface failure");
+ } catch (java.lang.reflect.InvocationTargetException ite) {
+ Throwable cause = ite.getCause();
+ Assert.assertTrue("expected TRpcException, got " + cause,
+ cause instanceof TRpcException);
+ }
+ } finally {
+ // Restore so close() can run cleanly via lifecycle.
+ setField(client, "protocolConfig", pc);
+ }
+ }
+
+ /**
+ * Http2/HTTPS doOpen surfaces failure when the keystore path is missing/invalid.
+ */
+ @Test
+ public void testHttp2RpcClientDoOpenSurfacesFailure() throws Exception {
+ ProtocolConfig pc = newProtocolConfig();
+ // No keystore configured at all → SSLContexts.loadTrustMaterial throws.
+ pc.getExtMap().put("keyStorePath", "/no/such/path/keystore.jks");
+ pc.getExtMap().put("keyStorePass", "wrong");
+ Http2RpcClient client = new Http2RpcClient(pc);
+ java.lang.reflect.Method m = Http2RpcClient.class.getDeclaredMethod("doOpen");
+ m.setAccessible(true);
+ try {
+ m.invoke(client);
+ Assert.fail("doOpen must surface failure");
+ } catch (java.lang.reflect.InvocationTargetException ite) {
+ Throwable cause = ite.getCause();
+ Assert.assertTrue("expected TRpcException, got " + cause,
+ cause instanceof TRpcException);
+ }
+ }
+
+ /* ---------------------- helpers ---------------------- */
+
+    /**
+     * Builds a mocked {@link HttpResponse} whose first {@code Keep-Alive} header has a single
+     * element {@code <key>=<value>}.
+     *
+     * @param key element name returned by {@code HeaderElement.getName()}
+     * @param value element value returned by {@code HeaderElement.getValue()}
+     * @return the mocked response
+     */
+    private static HttpResponse mockKeepAliveHeader(String key, String value) {
+        HeaderElement element = Mockito.mock(HeaderElement.class);
+        Mockito.when(element.getName()).thenReturn(key);
+        Mockito.when(element.getValue()).thenReturn(value);
+
+        Header keepAlive = Mockito.mock(Header.class);
+        Mockito.when(keepAlive.getElements()).thenReturn(new HeaderElement[]{element});
+
+        HttpResponse response = Mockito.mock(HttpResponse.class);
+        Mockito.when(response.getFirstHeader("Keep-Alive")).thenReturn(keepAlive);
+        return response;
+    }
+
+    /**
+     * Creates the {@code ProtocolConfig} shared by these tests: ip {@code 127.0.0.1}, port 0,
+     * protocol {@code http}, network {@code tcp}, then {@code setDefault()}.
+     */
+    private static ProtocolConfig newProtocolConfig() {
+        ProtocolConfig config = new ProtocolConfig();
+        config.setIp("127.0.0.1");
+        config.setPort(0);
+        config.setProtocol("http");
+        config.setNetwork("tcp");
+        // Called last, after the explicit settings above.
+        config.setDefault();
+        return config;
+    }
+
+    /**
+     * Switches the client's lifecycle state to {@code STARTED} by reflection, without calling
+     * {@code open()}, so that availability checks can be exercised without building any real
+     * HTTP infrastructure.
+     *
+     * @param client object that has a {@code lifecycleObj} field somewhere in its class hierarchy
+     * @throws Exception if a reflective lookup or assignment fails
+     */
+    private static void forceLifecycleStarted(Object client) throws Exception {
+        forceLifecycleState(client, "STARTED");
+    }
+
+    /**
+     * Switches the client's lifecycle state to {@code STOPPED} by reflection.
+     *
+     * @param client object that has a {@code lifecycleObj} field somewhere in its class hierarchy
+     * @throws Exception if a reflective lookup or assignment fails
+     */
+    private static void forceLifecycleClosed(Object client) throws Exception {
+        forceLifecycleState(client, "STOPPED");
+    }
+
+    /**
+     * Writes the named constant into the {@code state} field of the client's
+     * {@code lifecycleObj}. The constant is resolved by name through the static
+     * {@code valueOf(String)} of the field's declared type, so this test needs no compile-time
+     * dependency on that type.
+     */
+    private static void forceLifecycleState(Object client, String stateName) throws Exception {
+        Field lifecycleField = findField(client.getClass(), "lifecycleObj");
+        lifecycleField.setAccessible(true);
+        Object lifecycle = lifecycleField.get(client);
+
+        Field stateField = findField(lifecycle.getClass(), "state");
+        stateField.setAccessible(true);
+        Class<?> stateEnum = stateField.getType();
+        Object constant = stateEnum.getMethod("valueOf", String.class).invoke(null, stateName);
+        stateField.set(lifecycle, constant);
+    }
+
+    /**
+     * Reflectively assigns {@code value} to the field {@code fieldName} of {@code target},
+     * looking the field up on the target's class and then on its superclasses.
+     *
+     * @throws Exception if the field cannot be found or assigned
+     */
+    private static void setField(Object target, String fieldName, Object value) throws Exception {
+        Field field = findField(target.getClass(), fieldName);
+        field.setAccessible(true);
+        field.set(target, value);
+    }
+
+    /**
+     * Finds a declared field by name on {@code clazz} or the nearest superclass declaring it.
+     *
+     * @param clazz class at which the search starts
+     * @param name field name to look for
+     * @return the first matching field found while walking up the superclass chain
+     * @throws NoSuchFieldException if no class in the chain declares a field with that name
+     */
+    private static Field findField(Class<?> clazz, String name) throws NoSuchFieldException {
+        for (Class<?> current = clazz; current != null; current = current.getSuperclass()) {
+            try {
+                return current.getDeclaredField(name);
+            } catch (NoSuchFieldException ignored) {
+                // Not declared at this level; continue with the superclass.
+            }
+        }
+        throw new NoSuchFieldException(name);
+    }
+}
diff --git a/trpc-proto/trpc-proto-standard/src/test/java/com/tencent/trpc/proto/standard/concurrenttest/MultiPortNamingUrlConcurrentTest.java b/trpc-proto/trpc-proto-standard/src/test/java/com/tencent/trpc/proto/standard/concurrenttest/MultiPortNamingUrlConcurrentTest.java
new file mode 100644
index 000000000..250f0c4b5
--- /dev/null
+++ b/trpc-proto/trpc-proto-standard/src/test/java/com/tencent/trpc/proto/standard/concurrenttest/MultiPortNamingUrlConcurrentTest.java
@@ -0,0 +1,487 @@
+/*
+ * Tencent is pleased to support the open source community by making tRPC available.
+ *
+ * Copyright (C) 2023 Tencent.
+ * All rights reserved.
+ *
+ * If you have downloaded a copy of the tRPC source code from Tencent,
+ * please note that tRPC source code is licensed under the Apache 2.0 License,
+ * A copy of the Apache 2.0 License can be found in the LICENSE file.
+ */
+
+package com.tencent.trpc.proto.standard.concurrenttest;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import com.google.protobuf.ByteString;
+import com.tencent.trpc.core.cluster.RpcClusterClientManager;
+import com.tencent.trpc.core.common.ConfigManager;
+import com.tencent.trpc.core.common.config.BackendConfig;
+import com.tencent.trpc.core.common.config.ProviderConfig;
+import com.tencent.trpc.core.common.config.ServiceConfig;
+import com.tencent.trpc.core.rpc.RpcClientContext;
+import com.tencent.trpc.core.rpc.RpcServerContext;
+import com.tencent.trpc.core.transport.AbstractClientTransport;
+import com.tencent.trpc.core.transport.Channel;
+import com.tencent.trpc.core.transport.ClientTransport;
+import com.tencent.trpc.proto.standard.common.HelloRequestProtocol.HelloRequest;
+import com.tencent.trpc.proto.standard.common.HelloRequestProtocol.HelloResponse;
+import com.tencent.trpc.proto.support.DefResponseFutureManager;
+import java.lang.reflect.Field;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.IdentityHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Verifies that {@link BackendConfig#setNamingUrl(String)} configured with a comma-separated list
+ * of {@code ip:port} entries fans out requests to all backends (random load balance) under heavy
+ * concurrency.
+ *
+ * Setup: 10 standalone tRPC servers on consecutive ports; one shared {@link BackendConfig}
+ * whose namingUrl lists all 10 endpoints; 100 concurrent threads × 1000 requests each.
+ *
+ * Each server impl echoes back its own listening port so the client can group responses by the
+ * actually-served port. Final assertions:
+ *
+ * - every request succeeds with the exact echoed payload,
+ * - all 10 backend ports get hit at least once (proving namingUrl actually distributes
+ * traffic, not pinning to one endpoint),
+ * - distribution is roughly balanced — random selector over N=10 with R=100000 requests
+ * gives an expected 10000 per server; we tolerate {@code [2000, 20000]} per server which
+ * is a generous bound far above any realistic random outlier.
+ *
+ */
+public class MultiPortNamingUrlConcurrentTest {
+
+    /** First server port; server {@code i} is addressed at {@code BASE_TCP_PORT + i}. */
+    private static final int BASE_TCP_PORT = 12500;
+    /** Number of backend endpoints listed in the naming URL. */
+    private static final int SERVER_COUNT = 10;
+    /** Number of caller threads in {@link #testMultiPortNamingUrlConcurrent()}. */
+    private static final int THREAD_COUNT = 100;
+    /** Number of requests each caller thread sends. */
+    private static final int CYCLE_PER_THREAD = 1000;
+    /**
+     * Concurrency profile for {@link #testIdleTimeoutChannelRecycle()}: {@value #IDLE_THREAD_COUNT}
+     * threads × {@value #IDLE_CYCLE_PER_THREAD} requests per round, matching the stress
+     * profile of {@link #testMultiPortNamingUrlConcurrent()}. The total wall-clock is
+     * dominated by the 20s idle-wait between the two rounds.
+     */
+    private static final int IDLE_THREAD_COUNT = 100;
+    private static final int IDLE_CYCLE_PER_THREAD = 1000;
+    /** {@code connsPerAddr} value used by {@link #testIdleTimeoutChannelRecycle()}. */
+    private static final int IDLE_CONNS_PER_ADDR = 5;
+
+    /** Service configs un-exported one by one in {@link #stop()}. */
+    private final List<ServiceConfig> serviceConfigs = new ArrayList<>(SERVER_COUNT);
+
+ /**
+ * Runs before every test: restarts the test configuration ({@code ConfigManager.stopTest()}
+ * followed by {@code ConfigManager.startTest()}) and then calls {@code startServers()}.
+ */
+ @Before
+ public void before() {
+ ConfigManager.stopTest();
+ ConfigManager.startTest();
+ startServers();
+ }
+
+ @After
+ public void stop() {
+ for (ServiceConfig serviceConfig : serviceConfigs) {
+ try {
+ serviceConfig.unExport();
+ } catch (Exception ignore) {
+ // ignore
+ }
+ }
+ serviceConfigs.clear();
+ ConfigManager.stopTest();
+ }
+
+    /**
+     * Sends {@code THREAD_COUNT * CYCLE_PER_THREAD} calls through one proxy whose naming URL
+     * lists {@code SERVER_COUNT} consecutive ports on 127.0.0.1, then checks how the responses
+     * were spread across those ports.
+     *
+     * <p>Each worker thread verifies that a response echoes its request payload and carries a
+     * {@code |port=} suffix naming a known port. The main thread then asserts that every
+     * worker succeeded, that every port was hit, that each per-port count lies in
+     * {@code [2000, 20000]}, and that the counts add up to the number of requests sent. The
+     * backend config is stopped in a {@code finally} block.
+     *
+     * @throws InterruptedException if the main thread is interrupted while waiting for workers
+     */
+    @Test
+    public void testMultiPortNamingUrlConcurrent() throws InterruptedException {
+        // Build the comma-separated namingUrl: "ip://127.0.0.1:p1,127.0.0.1:p2,..."
+        StringBuilder urlBuilder = new StringBuilder("ip://");
+        for (int i = 0; i < SERVER_COUNT; i++) {
+            if (i > 0) {
+                urlBuilder.append(',');
+            }
+            urlBuilder.append("127.0.0.1:").append(BASE_TCP_PORT + i);
+        }
+        String namingUrl = urlBuilder.toString();
+
+        BackendConfig backendConfig = new BackendConfig();
+        DefResponseFutureManager.reset();
+        backendConfig.setNamingUrl(namingUrl);
+        // Five connections per backend address.
+        backendConfig.setConnsPerAddr(5);
+        backendConfig.setNetwork("tcp");
+        // Generous client-side timeout so a slow JIT warm-up can't fail individual calls.
+        backendConfig.setRequestTimeout(60_000);
+
+        try {
+            final ConcurrentTestServiceApi proxy =
+                    backendConfig.getProxy(ConcurrentTestServiceApi.class);
+
+            // Per-port hit counter aggregated across all threads.
+            ConcurrentHashMap<Integer, AtomicInteger> portHits = new ConcurrentHashMap<>();
+            for (int i = 0; i < SERVER_COUNT; i++) {
+                portHits.put(BASE_TCP_PORT + i, new AtomicInteger(0));
+            }
+
+            CountDownLatch latch = new CountDownLatch(THREAD_COUNT);
+            List<TestResult> results = new ArrayList<>(THREAD_COUNT);
+            for (int t = 0; t < THREAD_COUNT; t++) {
+                final TestResult r = new TestResult();
+                results.add(r);
+                final int threadIndex = t;
+                new Thread(() -> {
+                    try {
+                        for (int i = 0; i < CYCLE_PER_THREAD; i++) {
+                            String reqPayload = "req-" + threadIndex + "-" + i;
+                            RpcClientContext context = new RpcClientContext();
+                            HelloResponse response = proxy.sayHello(context,
+                                    HelloRequest.newBuilder()
+                                            .setMessage(ByteString.copyFromUtf8(reqPayload))
+                                            .build());
+                            // The reply is expected to look like "<request payload>|port=<port>".
+                            String message = response.getMessage().toStringUtf8();
+                            int sep = message.lastIndexOf("|port=");
+                            assertTrue("response missing port marker: " + message, sep > 0);
+                            String echoed = message.substring(0, sep);
+                            int port = Integer.parseInt(message.substring(sep + "|port=".length()));
+                            assertEquals("echoed payload must match request", reqPayload, echoed);
+                            AtomicInteger counter = portHits.get(port);
+                            assertTrue("response from unexpected port: " + port, counter != null);
+                            counter.incrementAndGet();
+                        }
+                        r.succ = true;
+                    } catch (Throwable ex) {
+                        // Assertion errors raised on a worker thread never reach JUnit directly;
+                        // record them so the main thread can fail the test.
+                        r.succ = false;
+                        r.ex = ex;
+                        ex.printStackTrace();
+                    } finally {
+                        latch.countDown();
+                    }
+                }, "concurrent-caller-" + t).start();
+            }
+            // Upper bound of 200 seconds for all workers to finish.
+            boolean done = latch.await(200, TimeUnit.SECONDS);
+            assertTrue("concurrent calls timed out before completion", done);
+
+            for (int i = 0; i < results.size(); i++) {
+                TestResult r = results.get(i);
+                assertTrue("worker thread " + i + " failed: "
+                        + (r.ex == null ? "" : r.ex.toString()), r.succ);
+            }
+
+            // ---- final aggregate assertions ----
+            int totalRequests = THREAD_COUNT * CYCLE_PER_THREAD;
+            int sum = 0;
+            Set<Integer> hitPorts = new HashSet<>();
+            for (int i = 0; i < SERVER_COUNT; i++) {
+                int port = BASE_TCP_PORT + i;
+                int hits = portHits.get(port).get();
+                sum += hits;
+                if (hits > 0) {
+                    hitPorts.add(port);
+                }
+                // 100 threads × 1000 requests over 10 servers → 10000 per server on average.
+                // The [2000, 20000] window is deliberately wide to stay CI-safe.
+                assertTrue("port " + port + " never received a request", hits > 0);
+                assertTrue("port " + port + " too few hits: " + hits, hits >= 2000);
+                assertTrue("port " + port + " too many hits: " + hits, hits <= 20000);
+            }
+            assertEquals("total responses should equal total requests", totalRequests, sum);
+            assertEquals("all 10 backend ports must be hit", SERVER_COUNT, hitPorts.size());
+        } finally {
+            // Release the client-side resources held by this backend config.
+            backendConfig.stop();
+        }
+    }
+
+ /**
+ * Verifies the long-connection idle-recycle hand-off end-to-end under concurrency:
+ *
+ * - configure {@code idleTimeout=10000} (10s) and {@code connsPerAddr=5} on the
+ * BackendConfig — each backend gets 5 long connections,
+ * - fire round 1 with {@value #IDLE_THREAD_COUNT} threads × {@value #IDLE_CYCLE_PER_THREAD}
+ * requests each, so every slot gets warmed up to a live channel,
+ * - snapshot the underlying netty channels for every cached transport,
+ * - sleep 20s — well past idleTimeout — so READ_IDLE must fire on every channel,
+ * {@code IdleCloseHandler} must invalidate the slot and close the channel,
+ * - assert every snapshotted channel has flipped to {@code !isConnected()},
+ * - fire round 2 with the same concurrency profile — slots are blank placeholders so
+ * {@code ensureChannelActive} must rebuild fresh connections concurrently without
+ * a thundering-herd storm and every request must succeed,
+ * - assert the post-second-round channels are entirely fresh identities not
+ * present in the original snapshot.
+ *
+ *
+ * {@code connsPerAddr=5} gives 10 backends × 5 conns = 50 long connections. With
+ * {@value #IDLE_THREAD_COUNT} concurrent threads firing requests, the test also
+ * exercises the lock-internal double-check in
+ * {@code AbstractClientTransport.ensureChannelActive}: after the idle handler invalidated
+ * a slot, multiple threads may try to rebuild it simultaneously — the double-check
+ * must collapse them onto exactly one physical reconnect per slot.
+ */
+    @Test
+    public void testIdleTimeoutChannelRecycle() throws Exception {
+        // Build the namingUrl from the server farm started in {@link #before()}.
+        StringBuilder urlBuilder = new StringBuilder("ip://");
+        for (int i = 0; i < SERVER_COUNT; i++) {
+            if (i > 0) {
+                urlBuilder.append(',');
+            }
+            urlBuilder.append("127.0.0.1:").append(BASE_TCP_PORT + i);
+        }
+
+        BackendConfig backendConfig = new BackendConfig();
+        DefResponseFutureManager.reset();
+        backendConfig.setNamingUrl(urlBuilder.toString());
+        backendConfig.setNetwork("tcp");
+        // 5 long connections per backend ⇒ 10 × 5 = 50 channels in the cluster cache.
+        backendConfig.setConnsPerAddr(IDLE_CONNS_PER_ADDR);
+        backendConfig.setRequestTimeout(60_000);
+        // 10s — comfortably above EventLoop scheduling jitter, well below the 23s wait
+        // budget used further down.
+        backendConfig.setIdleTimeout(10_000);
+
+        try {
+            ConcurrentTestServiceApi proxy = backendConfig.getProxy(ConcurrentTestServiceApi.class);
+
+            // ---- round 1: warm up every backend so each slot has a live long connection ----
+            runConcurrentRequests(proxy, "warmup", IDLE_THREAD_COUNT, IDLE_CYCLE_PER_THREAD);
+
+            // Snapshot every netty channel currently held in the cluster cache for this backend.
+            // Expectation: exactly SERVER_COUNT × connsPerAddr live channels.
+            Set<Channel> beforeIdleChannels = collectLiveChannels(backendConfig);
+            int expectedConns = SERVER_COUNT * IDLE_CONNS_PER_ADDR;
+            assertEquals("warm-up must produce exactly SERVER_COUNT × connsPerAddr live channels",
+                    expectedConns, beforeIdleChannels.size());
+            for (Channel ch : beforeIdleChannels) {
+                assertTrue("warm-up channel must be live before sleep, ch=" + ch,
+                        ch.isConnected());
+            }
+
+            // ---- wait past idleTimeout: READ_IDLE must fire on every live channel ----
+            // Poll instead of sleeping a fixed 20s so the test moves on as soon as the idle
+            // handler has closed every snapshotted channel. The 23s budget preserves the
+            // previous headroom: 2 × idleTimeout for slow CI EventLoop scheduling plus 3s
+            // because close() completes asynchronously on the EventLoop.
+            long deadlineNanos = System.nanoTime() + TimeUnit.SECONDS.toNanos(23);
+            while (System.nanoTime() - deadlineNanos < 0) {
+                boolean allClosed = true;
+                for (Channel ch : beforeIdleChannels) {
+                    if (ch.isConnected()) {
+                        allClosed = false;
+                        break;
+                    }
+                }
+                if (allClosed) {
+                    break;
+                }
+                Thread.sleep(100);
+            }
+
+            // ---- assert every snapshotted channel has been recycled by the idle handler ----
+            for (Channel ch : beforeIdleChannels) {
+                assertFalse("channel should have been closed by idle handler, but still active: "
+                        + ch, ch.isConnected());
+            }
+
+            // ---- round 2: concurrent requests must all succeed via lazy reconnect ----
+            // This also stresses the lock-internal double-check in ensureChannelActive: after
+            // the idle handler invalidated every slot, IDLE_THREAD_COUNT threads compete to
+            // rebuild them — the double-check must collapse them onto exactly one physical
+            // reconnect per slot (no thundering-herd).
+            runConcurrentRequests(proxy, "post-idle", IDLE_THREAD_COUNT, IDLE_CYCLE_PER_THREAD);
+
+            // ---- the second round must have produced fresh channels distinct from the snapshot ----
+            Set<Channel> afterReconnectChannels = collectLiveChannels(backendConfig);
+            assertEquals("post-idle round must rebuild SERVER_COUNT × connsPerAddr live channels",
+                    expectedConns, afterReconnectChannels.size());
+            // Every post-idle channel must be a fresh identity — the original snapshot was
+            // entirely closed by the idle handler so there should be zero overlap. Identity
+            // (not equals) comparison is deliberate: a Channel implementation may define
+            // equals() by address, which a reconnect to the same backend would satisfy.
+            IdentityHashMap<Channel, Boolean> before = new IdentityHashMap<>();
+            for (Channel ch : beforeIdleChannels) {
+                before.put(ch, Boolean.TRUE);
+            }
+            for (Channel ch : afterReconnectChannels) {
+                assertFalse("post-idle channel must be a fresh identity, but matched a closed "
+                        + "one from the warm-up snapshot: " + ch, before.containsKey(ch));
+            }
+        } finally {
+            try {
+                RpcClusterClientManager.shutdownBackendConfig(backendConfig);
+            } catch (Throwable ignore) {
+                // Best-effort cleanup: a shutdown failure must not mask the test outcome.
+            }
+        }
+    }
+
+    /**
+     * Runs {@code threads × cyclesPerThread} concurrent requests through {@code proxy} and
+     * asserts that every response echoes its request payload followed by a {@code |port=}
+     * tag. The label is mixed into the payload and the thread names so logs and failures
+     * from different rounds are easy to tell apart.
+     *
+     * @param proxy client stub used to issue the calls
+     * @param label round name, embedded in payloads, thread names and failure messages
+     * @param threads number of worker threads to start
+     * @param cyclesPerThread number of sequential calls each worker performs
+     * @throws InterruptedException if the calling thread is interrupted while waiting
+     */
+    private static void runConcurrentRequests(ConcurrentTestServiceApi proxy, String label,
+            int threads, int cyclesPerThread) throws InterruptedException {
+        CountDownLatch latch = new CountDownLatch(threads);
+        List<TestResult> results = new ArrayList<>(threads);
+        for (int t = 0; t < threads; t++) {
+            final TestResult r = new TestResult();
+            results.add(r);
+            final int threadIndex = t;
+            Thread worker = new Thread(() -> {
+                try {
+                    for (int i = 0; i < cyclesPerThread; i++) {
+                        String reqPayload = label + "-" + threadIndex + "-" + i;
+                        HelloResponse response = proxy.sayHello(new RpcClientContext(),
+                                HelloRequest.newBuilder()
+                                        .setMessage(ByteString.copyFromUtf8(reqPayload))
+                                        .build());
+                        String message = response.getMessage().toStringUtf8();
+                        // The server appends "|port=<n>"; everything before it must be the
+                        // exact request payload.
+                        int sep = message.lastIndexOf("|port=");
+                        if (sep <= 0 || !reqPayload.equals(message.substring(0, sep))) {
+                            throw new AssertionError(
+                                    "unexpected response payload, expected=" + reqPayload
+                                            + ", got=" + message);
+                        }
+                    }
+                    r.succ = true;
+                } catch (Throwable ex) {
+                    r.succ = false;
+                    r.ex = ex;
+                    // Kept on purpose: only the first failing worker is rethrown below, the
+                    // console output still shows every worker's failure.
+                    ex.printStackTrace();
+                } finally {
+                    latch.countDown();
+                }
+            }, "idle-test-" + label + "-" + t);
+            // Daemon: a worker stuck in a long RPC must not keep the JVM alive once the
+            // latch wait below has timed out and failed the test.
+            worker.setDaemon(true);
+            worker.start();
+        }
+        boolean done = latch.await(120, TimeUnit.SECONDS);
+        assertTrue("[" + label + "] concurrent calls timed out before completion", done);
+        // Safe to read the plain fields here: latch.await() returning true establishes a
+        // happens-before edge with every worker's countDown().
+        for (int i = 0; i < results.size(); i++) {
+            TestResult r = results.get(i);
+            if (!r.succ) {
+                // Same message as before, but with the worker's exception chained as the
+                // cause so its stack trace appears in the test report.
+                throw new AssertionError("[" + label + "] worker thread " + i + " failed: "
+                        + (r.ex == null ? "" : r.ex.toString()), r.ex);
+            }
+        }
+    }
+
+    /**
+     * Walks {@link RpcClusterClientManager}'s cache for {@code backendConfig}, drills down
+     * through {@code RpcClientProxy → DefRpcClient → ClientTransport → channels[]} via
+     * reflection and returns every {@code Channel} whose slot future has completed
+     * normally with a non-null value.
+     *
+     * @param backendConfig the backend whose cached clients are inspected
+     * @return the published channels; empty when the backend has no cache entry
+     * @throws Exception if one of the reflected fields is missing or inaccessible
+     */
+    @SuppressWarnings({"unchecked", "rawtypes"})
+    private static Set<Channel> collectLiveChannels(BackendConfig backendConfig) throws Exception {
+        Set<Channel> live = new HashSet<>();
+        Field clusterMapField = RpcClusterClientManager.class.getDeclaredField("CLUSTER_MAP");
+        clusterMapField.setAccessible(true);
+        Map<?, ? extends Map<?, ?>> clusterMap =
+                (Map<?, ? extends Map<?, ?>>) clusterMapField.get(null);
+        Map<?, ?> proxyMap = clusterMap.get(backendConfig);
+        if (proxyMap == null) {
+            return live;
+        }
+        // Loop-invariant lookups, resolved once. A missing field propagates (like
+        // CLUSTER_MAP above) instead of silently producing an empty result.
+        // AbstractClientTransport.channels (list of ChannelFutureItem slots)
+        Field channelsField = AbstractClientTransport.class.getDeclaredField("channels");
+        channelsField.setAccessible(true);
+        Field futureField = AbstractClientTransport.ChannelFutureItem.class
+                .getDeclaredField("channelFuture");
+        futureField.setAccessible(true);
+
+        Field delegateField = null;
+        for (Object proxy : proxyMap.values()) {
+            if (delegateField == null) {
+                delegateField = proxy.getClass().getDeclaredField("delegate");
+                delegateField.setAccessible(true);
+            }
+            Object delegate = delegateField.get(proxy);
+            if (delegate == null) {
+                continue;
+            }
+            // DefRpcClient.transport (private final ClientTransport); delegates of other
+            // types without such a field are skipped.
+            Field transportField;
+            try {
+                transportField = delegate.getClass().getDeclaredField("transport");
+            } catch (NoSuchFieldException ignore) {
+                continue;
+            }
+            transportField.setAccessible(true);
+            Object transport = transportField.get(delegate);
+            if (!(transport instanceof ClientTransport)) {
+                continue;
+            }
+            List<?> slots = (List<?>) channelsField.get(transport);
+            if (slots == null) {
+                continue;
+            }
+            for (Object slot : slots) {
+                if (slot == null) {
+                    continue;
+                }
+                Object cf = futureField.get(slot);
+                if (cf == null) {
+                    continue;
+                }
+                java.util.concurrent.CompletableFuture<Channel> future =
+                        (java.util.concurrent.CompletableFuture<Channel>) cf;
+                // Only futures that already completed normally are read, so join() below
+                // never blocks and never throws.
+                if (!future.isDone() || future.isCompletedExceptionally()) {
+                    continue;
+                }
+                Channel ch = future.join();
+                if (ch != null) {
+                    live.add(ch);
+                }
+            }
+        }
+        return live;
+    }
+
+    /**
+     * Exports {@code SERVER_COUNT} TCP echo services on {@code 127.0.0.1}, one per
+     * consecutive port starting at {@code BASE_TCP_PORT}, and records each exported
+     * {@code ServiceConfig} in {@code serviceConfigs}.
+     */
+    private void startServers() {
+        for (int i = 0; i < SERVER_COUNT; i++) {
+            int port = BASE_TCP_PORT + i;
+            ProviderConfig<ConcurrentTestService> providerConfig = new ProviderConfig<>();
+            providerConfig.setRef(new PortAwareEchoServiceImpl(port));
+
+            ServiceConfig serviceConfig = new ServiceConfig();
+            serviceConfig.setIp("127.0.0.1");
+            serviceConfig.setNetwork("tcp");
+            serviceConfig.setPort(port);
+            serviceConfig.setEnableLinkTimeout(true);
+            // Generous server-side timeout to avoid spurious timeouts on slow CI.
+            serviceConfig.setRequestTimeout(60_000);
+            serviceConfig.addProviderConfig(providerConfig);
+            serviceConfig.export();
+            serviceConfigs.add(serviceConfig);
+        }
+    }
+
+    /**
+     * Echo service that appends {@code |port=<listening port>} to every reply, letting the
+     * caller see which server instance handled a given request.
+     */
+    private static class PortAwareEchoServiceImpl implements ConcurrentTestService {
+
+        private final int listenPort;
+
+        PortAwareEchoServiceImpl(int port) {
+            this.listenPort = port;
+        }
+
+        @Override
+        public HelloResponse sayHello(RpcServerContext context, HelloRequest request) {
+            StringBuilder reply = new StringBuilder(request.getMessage().toStringUtf8());
+            reply.append("|port=").append(listenPort);
+            HelloResponse.Builder builder = HelloResponse.newBuilder();
+            builder.setMessage(ByteString.copyFromUtf8(reply.toString()));
+            return builder.build();
+        }
+    }
+
+    /**
+     * Outcome of one worker thread: {@code succ} is set once all of its calls passed,
+     * {@code ex} holds the failure otherwise.
+     */
+    private static class TestResult {
+
+        Throwable ex;
+        boolean succ;
+    }
+}
diff --git a/trpc-proto/trpc-rpc-support/src/main/java/com/tencent/trpc/proto/support/DefRpcClient.java b/trpc-proto/trpc-rpc-support/src/main/java/com/tencent/trpc/proto/support/DefRpcClient.java
index 0a488da22..0f0451777 100644
--- a/trpc-proto/trpc-rpc-support/src/main/java/com/tencent/trpc/proto/support/DefRpcClient.java
+++ b/trpc-proto/trpc-rpc-support/src/main/java/com/tencent/trpc/proto/support/DefRpcClient.java
@@ -45,6 +45,7 @@
public class DefRpcClient extends AbstractRpcClient {
private static final Logger LOG = LoggerFactory.getLogger(DefRpcClient.class);
+
/**
* ClientTransport
*/
diff --git a/trpc-spring-boot-starters/trpc-spring-boot-starter/src/test/java/com/tencent/trpc/spring/boot/starters/context/BindTest2.java b/trpc-spring-boot-starters/trpc-spring-boot-starter/src/test/java/com/tencent/trpc/spring/boot/starters/context/BindTest2.java
index f4deb2fe2..76826ae4e 100644
--- a/trpc-spring-boot-starters/trpc-spring-boot-starter/src/test/java/com/tencent/trpc/spring/boot/starters/context/BindTest2.java
+++ b/trpc-spring-boot-starters/trpc-spring-boot-starter/src/test/java/com/tencent/trpc/spring/boot/starters/context/BindTest2.java
@@ -121,6 +121,9 @@ private void assertClient() {
Assert.assertEquals(properties.getClient().getSendBuffer(), Integer.valueOf(10));
Assert.assertEquals(properties.getClient().getReceiveBuffer(), Integer.valueOf(20));
Assert.assertEquals(properties.getClient().getIdleTimeout(), Integer.valueOf(200));
+ Assert.assertEquals(properties.getClient().getTcpKeepAliveIdle(), Integer.valueOf(25));
+ Assert.assertEquals(properties.getClient().getTcpKeepAliveIntvl(), Integer.valueOf(8));
+ Assert.assertEquals(properties.getClient().getTcpKeepAliveCnt(), Integer.valueOf(4));
Assert.assertEquals(properties.getClient().getLazyinit(), false);
Assert.assertEquals(properties.getClient().getConnsPerAddr(), Integer.valueOf(5));
Assert.assertEquals(properties.getClient().getConnTimeout(), Integer.valueOf(2000));
diff --git a/trpc-spring-boot-starters/trpc-spring-boot-starter/src/test/resources/application-bind-test2.yml b/trpc-spring-boot-starters/trpc-spring-boot-starter/src/test/resources/application-bind-test2.yml
index ddb47bb4b..cdd3088fd 100644
--- a/trpc-spring-boot-starters/trpc-spring-boot-starter/src/test/resources/application-bind-test2.yml
+++ b/trpc-spring-boot-starters/trpc-spring-boot-starter/src/test/resources/application-bind-test2.yml
@@ -82,6 +82,9 @@ trpc:
send_buffer: 10
receive_buffer: 20
idle_timeout: 200
+ tcp_keep_alive_idle: 25
+ tcp_keep_alive_intvl: 8
+ tcp_keep_alive_cnt: 4
lazyinit: false
conns_per_addr: 5
conn_timeout: 2000
diff --git a/trpc-spring-support/trpc-spring/src/main/java/com/tencent/trpc/spring/context/configuration/schema/AbstractProtocolSchema.java b/trpc-spring-support/trpc-spring/src/main/java/com/tencent/trpc/spring/context/configuration/schema/AbstractProtocolSchema.java
index 5b514f0bc..01293cbcc 100644
--- a/trpc-spring-support/trpc-spring/src/main/java/com/tencent/trpc/spring/context/configuration/schema/AbstractProtocolSchema.java
+++ b/trpc-spring-support/trpc-spring/src/main/java/com/tencent/trpc/spring/context/configuration/schema/AbstractProtocolSchema.java
@@ -101,6 +101,27 @@ public abstract class AbstractProtocolSchema {
*/
private Integer idleTimeout;
+ /**
+ * TCP keepalive idle in seconds (Linux {@code TCP_KEEPIDLE}). Effective only when
+ * {@code ioMode=epoll} on Linux. Maps to yaml key {@code tcp_keep_alive_idle}.
+ * Value 0 leaves the OS default in place.
+ */
+ private Integer tcpKeepAliveIdle;
+
+ /**
+ * TCP keepalive probe interval in seconds (Linux {@code TCP_KEEPINTVL}). Effective only
+ * when {@code ioMode=epoll} on Linux. Maps to yaml key {@code tcp_keep_alive_intvl}.
+ * Value 0 leaves the OS default in place.
+ */
+ private Integer tcpKeepAliveIntvl;
+
+ /**
+ * TCP keepalive probe count (Linux {@code TCP_KEEPCNT}). Effective only when
+ * {@code ioMode=epoll} on Linux. Maps to yaml key {@code tcp_keep_alive_cnt}. Value 0
+ * leaves the OS default in place.
+ */
+ private Integer tcpKeepAliveCnt;
+
/**
* Lazy-initialization
*/
@@ -269,6 +290,30 @@ public void setIdleTimeout(Integer idleTimeout) {
this.idleTimeout = idleTimeout;
}
+ /**
+ * Returns the configured TCP keepalive idle time in seconds, or {@code null} when unset.
+ */
+ public Integer getTcpKeepAliveIdle() {
+ return tcpKeepAliveIdle;
+ }
+
+ /**
+ * Sets the TCP keepalive idle time in seconds (yaml key {@code tcp_keep_alive_idle}).
+ */
+ public void setTcpKeepAliveIdle(Integer tcpKeepAliveIdle) {
+ this.tcpKeepAliveIdle = tcpKeepAliveIdle;
+ }
+
+ /**
+ * Returns the configured TCP keepalive probe interval in seconds, or {@code null} when
+ * unset.
+ */
+ public Integer getTcpKeepAliveIntvl() {
+ return tcpKeepAliveIntvl;
+ }
+
+ /**
+ * Sets the TCP keepalive probe interval in seconds (yaml key
+ * {@code tcp_keep_alive_intvl}).
+ */
+ public void setTcpKeepAliveIntvl(Integer tcpKeepAliveIntvl) {
+ this.tcpKeepAliveIntvl = tcpKeepAliveIntvl;
+ }
+
+ /**
+ * Returns the configured TCP keepalive probe count, or {@code null} when unset.
+ */
+ public Integer getTcpKeepAliveCnt() {
+ return tcpKeepAliveCnt;
+ }
+
+ /**
+ * Sets the TCP keepalive probe count (yaml key {@code tcp_keep_alive_cnt}).
+ */
+ public void setTcpKeepAliveCnt(Integer tcpKeepAliveCnt) {
+ this.tcpKeepAliveCnt = tcpKeepAliveCnt;
+ }
+
public Boolean getLazyinit() {
return lazyinit;
}
diff --git a/trpc-spring-support/trpc-spring/src/test/java/com/tencent/trpc/spring/context/configuration/schema/AbstractProtocolSchemaTest.java b/trpc-spring-support/trpc-spring/src/test/java/com/tencent/trpc/spring/context/configuration/schema/AbstractProtocolSchemaTest.java
index ccc1958a8..06a401d02 100644
--- a/trpc-spring-support/trpc-spring/src/test/java/com/tencent/trpc/spring/context/configuration/schema/AbstractProtocolSchemaTest.java
+++ b/trpc-spring-support/trpc-spring/src/test/java/com/tencent/trpc/spring/context/configuration/schema/AbstractProtocolSchemaTest.java
@@ -132,6 +132,24 @@ public void testIdleTimeout() {
assertEquals(Integer.valueOf(180000), schema.getIdleTimeout());
}
+ @Test
+ public void testTcpKeepAliveIdle() {
+ schema.setTcpKeepAliveIdle(30);
+ assertEquals(Integer.valueOf(30), schema.getTcpKeepAliveIdle());
+ }
+
+ @Test
+ public void testTcpKeepAliveIntvl() {
+ schema.setTcpKeepAliveIntvl(10);
+ assertEquals(Integer.valueOf(10), schema.getTcpKeepAliveIntvl());
+ }
+
+ @Test
+ public void testTcpKeepAliveCnt() {
+ schema.setTcpKeepAliveCnt(3);
+ assertEquals(Integer.valueOf(3), schema.getTcpKeepAliveCnt());
+ }
+
@Test
public void testLazyinit() {
schema.setLazyinit(Boolean.TRUE);
diff --git a/trpc-test/trpc-test-integration/src/integration-test/java/com/tencent/trpc/integration/test/transport/TransportIntegrationTest.java b/trpc-test/trpc-test-integration/src/integration-test/java/com/tencent/trpc/integration/test/transport/TransportIntegrationTest.java
index 29460e42e..feb5497e9 100644
--- a/trpc-test/trpc-test-integration/src/integration-test/java/com/tencent/trpc/integration/test/transport/TransportIntegrationTest.java
+++ b/trpc-test/trpc-test-integration/src/integration-test/java/com/tencent/trpc/integration/test/transport/TransportIntegrationTest.java
@@ -102,15 +102,14 @@ public void testUdpToTcpNettyTransport() {
}
/**
- * Test for server-side idle-timeout
+ * Test for server-side idle-timeout.
+ * Long-connection mode: the server no longer closes a connection when it goes idle
+ * (the idle-close hook was removed from {@code NettyServerHandler}), so this case is
+ * intentionally left empty as a placeholder for the historical behaviour. Client-side
+ * read-idle recycling in {@code NettyTcpClientTransport} is a separate mechanism.
*/
@Test
public void testIdleTimeout() {
- assertThrows(RuntimeException.class, () ->
- tcpEchoAPI.delayedEcho(new RpcClientContext(), DelayedEchoRequest.newBuilder()
- .setMessage("timeout")
- .setDelaySeconds(2)
- .build()));
+ // No-op under long-connection mode: the server-side handler no longer closes idle channels.
}
/**
diff --git a/trpc-transport/trpc-transport-netty/src/main/java/com/tencent/trpc/transport/netty/NettyAbstractClientTransport.java b/trpc-transport/trpc-transport-netty/src/main/java/com/tencent/trpc/transport/netty/NettyAbstractClientTransport.java
index abcfc3619..68a951989 100644
--- a/trpc-transport/trpc-transport-netty/src/main/java/com/tencent/trpc/transport/netty/NettyAbstractClientTransport.java
+++ b/trpc-transport/trpc-transport-netty/src/main/java/com/tencent/trpc/transport/netty/NettyAbstractClientTransport.java
@@ -18,22 +18,64 @@
import com.tencent.trpc.core.transport.codec.ClientCodec;
import com.tencent.trpc.core.utils.ConcurrentHashSet;
import io.netty.bootstrap.Bootstrap;
+import io.netty.channel.EventLoopGroup;
+import io.netty.channel.epoll.Epoll;
+import io.netty.channel.epoll.EpollEventLoopGroup;
import io.netty.channel.nio.NioEventLoopGroup;
import io.netty.util.concurrent.DefaultThreadFactory;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
+/**
+ * Common base for Netty-backed client transports.
+ *
+ * Owns the shared {@link EventLoopGroup} pool model. The pool comes in two flavours —
+ * a {@link NioEventLoopGroup} and an {@link EpollEventLoopGroup} — each backed by its own
+ * reference counter so {@code ioThreadGroupShare=true} stays compatible with the
+ * Linux/epoll path. The variant a transport joins is selected per-instance from
+ * {@code Epoll.isAvailable() && config.useEpoll()}, decoupling the share-IO-pool decision
+ * from the NIO/Epoll decision.
+ */
public abstract class NettyAbstractClientTransport extends AbstractClientTransport {
- private static final Object LOCK = new Object();
+ /**
+ * Shared-pool variant. {@link #NIO} is always available; {@link #EPOLL} is only used
+ * on Linux when the user opts into {@code ioMode=epoll} and the netty-epoll native
+ * library is loadable.
+ */
+ protected enum SharedGroupKind {
+ // Backed by the shared NioEventLoopGroup.
+ NIO,
+ // Backed by the shared EpollEventLoopGroup.
+ EPOLL
+ }
+
+ private static final Object NIO_LOCK = new Object();
+ private static final Object EPOLL_LOCK = new Object();
+
+ /**
+ * Reference counter for the shared {@link NioEventLoopGroup}.
+ */
+ protected static final AtomicInteger SHARE_NIO_USED_NUMS = new AtomicInteger(0);
+ /**
+ * Reference counter for the shared {@link EpollEventLoopGroup}.
+ */
+ protected static final AtomicInteger SHARE_EPOLL_USED_NUMS = new AtomicInteger(0);
+ /**
+ * Shared {@link NioEventLoopGroup}, lazily created.
+ */
+ protected static volatile NioEventLoopGroup SHARE_NIO_GROUP;
+ /**
+ * Shared {@link EpollEventLoopGroup}, lazily created.
+ */
+ protected static volatile EpollEventLoopGroup SHARE_EPOLL_GROUP;
/**
- * Hold the number of shared NioEventLoopGroup
+ * Backwards-compatible alias for the NIO shared counter, retained for any external
+ * test that reflects on the field.
*/
- protected static final AtomicInteger SHARE_EVENT_LOOP_GROUP_USED_NUMS = new AtomicInteger(0);
+ protected static final AtomicInteger SHARE_EVENT_LOOP_GROUP_USED_NUMS = SHARE_NIO_USED_NUMS;
/**
- * Shared NioEventLoopGroup
+ * Backwards-compatible alias for the NIO shared group.
*/
protected static volatile NioEventLoopGroup SHARE_EVENT_LOOP_GROUP;
@@ -41,28 +83,46 @@ public abstract class NettyAbstractClientTransport extends AbstractClientTranspo
protected ConcurrentHashSet channelSet = new ConcurrentHashSet<>();
+ /**
+ * The shared variant this transport joined, or {@code null} if it owns an independent
+ * {@link EventLoopGroup}. Driven by {@code config.isIoThreadGroupShare()} together
+ * with {@link #wantsEpoll(ProtocolConfig)}.
+ */
+ private final SharedGroupKind sharedKind;
+
+ /**
+ * Whether this transport has acquired a reference to the shared event loop group. Used to
+ * guarantee idempotent release in {@link #doClose()} even if {@link #doOpen()} failed mid-way
+ * or {@code close()} is invoked twice.
+ */
+ private volatile boolean shareGroupAcquired;
+
public NettyAbstractClientTransport(ProtocolConfig config, ChannelHandler handler,
ClientCodec clientCodec, String defaultThreadPoolName) {
super(config, handler, clientCodec);
- if (SHARE_EVENT_LOOP_GROUP == null) {
- synchronized (LOCK) {
- if (SHARE_EVENT_LOOP_GROUP == null) {
- SHARE_EVENT_LOOP_GROUP = new NioEventLoopGroup(
- config.getIoThreads(), new DefaultThreadFactory(defaultThreadPoolName)
- );
- }
- }
+ // Acquire the shared event loop group eagerly when this transport will use it. The
+ // acquisition is paired with release in doClose(). This ensures the reference counter
+ // never goes negative and the group is never reclaimed while a not-yet-opened transport
+ // still has a reference outstanding.
+ if (Boolean.TRUE.equals(config.isIoThreadGroupShare())) {
+ this.sharedKind = wantsEpoll(config) ? SharedGroupKind.EPOLL : SharedGroupKind.NIO;
+ acquireSharedGroup(this.sharedKind, config.getIoThreads(), defaultThreadPoolName);
+ this.shareGroupAcquired = true;
+ } else {
+ this.sharedKind = null;
}
}
@Override
protected void doClose() {
- if (bootstrap != null) {
- if (!config.isIoThreadGroupShare()) {
- bootstrap.config().group().shutdownGracefully();
- } else {
- closeShareEventLoopGroup();
- }
+        if (bootstrap != null && !Boolean.TRUE.equals(config.isIoThreadGroupShare())) {
+            // Independent group owned by this transport, shut it down here. The group is
+            // still null when doOpen() failed after creating the Bootstrap but before
+            // wiring a group into it, so guard against that instead of throwing an NPE.
+            EventLoopGroup ownGroup = bootstrap.config().group();
+            if (ownGroup != null) {
+                ownGroup.shutdownGracefully();
+            }
+        }
+        // Release the shared group reference at most once per acquisition. The flag is
+        // read and cleared under a lock so that two concurrent close() calls cannot both
+        // observe it set and decrement the shared reference counter twice.
+        boolean release;
+        synchronized (this) {
+            release = shareGroupAcquired;
+            shareGroupAcquired = false;
+        }
+        if (release) {
+            releaseSharedGroup(sharedKind);
+        }
}
@@ -77,14 +137,95 @@ public Set getChannels() {
return channels;
}
- private void closeShareEventLoopGroup() {
- if (SHARE_EVENT_LOOP_GROUP_USED_NUMS.decrementAndGet() <= 0 && SHARE_EVENT_LOOP_GROUP != null) {
- synchronized (LOCK) {
- if (SHARE_EVENT_LOOP_GROUP_USED_NUMS.get() <= 0 && SHARE_EVENT_LOOP_GROUP != null) {
- SHARE_EVENT_LOOP_GROUP.shutdownGracefully();
- SHARE_EVENT_LOOP_GROUP = null;
+    /**
+     * Tells whether a transport built from {@code config} should run on epoll: the
+     * netty-epoll native library must be usable in this JVM and the config must opt in.
+     * Subclasses apply the same predicate when picking the channel class.
+     *
+     * @param config protocol config to inspect, may be {@code null}
+     * @return {@code true} only when epoll is available and {@code config.useEpoll()} holds
+     */
+    protected static boolean wantsEpoll(ProtocolConfig config) {
+        if (!Epoll.isAvailable()) {
+            return false;
+        }
+        if (config == null) {
+            return false;
+        }
+        return config.useEpoll();
+    }
+
+    /**
+     * Looks up the shared event loop group matching the variant this transport joined.
+     * Subclasses call this from {@code doOpen} to wire the bootstrap to the right group.
+     *
+     * @return the shared group of the joined variant, or {@code null} when this
+     *         transport does not share its IO threads
+     */
+    protected EventLoopGroup getSharedEventLoopGroup() {
+        if (sharedKind == SharedGroupKind.EPOLL) {
+            return SHARE_EPOLL_GROUP;
+        }
+        if (sharedKind != null) {
+            return SHARE_NIO_GROUP;
+        }
+        return null;
+    }
+
+ /**
+ * Returns the shared variant this transport joined, or {@code null} for independent.
+ *
+ * @return the value of {@code sharedKind} assigned in the constructor
+ */
+ protected SharedGroupKind getSharedGroupKind() {
+ return sharedKind;
+ }
+
+    /**
+     * Takes one reference on the shared group of the requested variant, creating the
+     * group on first use. Creation and counting both happen under the variant's lock.
+     * Every call must be balanced by {@link #releaseSharedGroup(SharedGroupKind)}.
+     *
+     * @param kind variant to join; anything other than {@code EPOLL} selects the NIO group
+     * @param ioThreads thread count used only if the group has to be created
+     * @param threadPoolName thread name prefix used only if the group has to be created
+     */
+    private static void acquireSharedGroup(SharedGroupKind kind, int ioThreads, String threadPoolName) {
+        if (kind != SharedGroupKind.EPOLL) {
+            synchronized (NIO_LOCK) {
+                if (SHARE_NIO_GROUP == null) {
+                    DefaultThreadFactory nioThreads = new DefaultThreadFactory(threadPoolName);
+                    SHARE_NIO_GROUP = new NioEventLoopGroup(ioThreads, nioThreads);
+                    // Keep the legacy alias pointing at the same instance.
+                    SHARE_EVENT_LOOP_GROUP = SHARE_NIO_GROUP;
+                }
+                SHARE_NIO_USED_NUMS.incrementAndGet();
+            }
+        } else {
+            synchronized (EPOLL_LOCK) {
+                if (SHARE_EPOLL_GROUP == null) {
+                    DefaultThreadFactory epollThreads =
+                            new DefaultThreadFactory(threadPoolName + "-Epoll");
+                    SHARE_EPOLL_GROUP = new EpollEventLoopGroup(ioThreads, epollThreads);
+                }
+                SHARE_EPOLL_USED_NUMS.incrementAndGet();
+            }
+        }
+    }
+
+ /**
+ * Release one reference to the shared group of the given variant. The group is shut
+ * down only when the reference counter reaches zero. The whole
+ * check-decrement-shutdown-nullify sequence is performed under the per-variant lock so
+ * concurrent acquire/release calls cannot leak or double-shutdown the group.
+ */
+ private static void releaseSharedGroup(SharedGroupKind kind) {
+ if (kind == null) {
+ return;
+ }
+ if (kind == SharedGroupKind.EPOLL) {
+ synchronized (EPOLL_LOCK) {
+ int remaining = SHARE_EPOLL_USED_NUMS.decrementAndGet();
+ if (remaining < 0) {
+ SHARE_EPOLL_USED_NUMS.set(0);
+ remaining = 0;
+ }
+ if (remaining == 0 && SHARE_EPOLL_GROUP != null) {
+ SHARE_EPOLL_GROUP.shutdownGracefully();
+ SHARE_EPOLL_GROUP = null;
}
}
+ return;
+ }
+ synchronized (NIO_LOCK) {
+ int remaining = SHARE_NIO_USED_NUMS.decrementAndGet();
+ if (remaining < 0) {
+ SHARE_NIO_USED_NUMS.set(0);
+ remaining = 0;
+ }
+ if (remaining == 0 && SHARE_NIO_GROUP != null) {
+ SHARE_NIO_GROUP.shutdownGracefully();
+ SHARE_NIO_GROUP = null;
+ SHARE_EVENT_LOOP_GROUP = null;
+ }
}
}
diff --git a/trpc-transport/trpc-transport-netty/src/main/java/com/tencent/trpc/transport/netty/NettyClientHandler.java b/trpc-transport/trpc-transport-netty/src/main/java/com/tencent/trpc/transport/netty/NettyClientHandler.java
index 26dce0165..cb847535e 100644
--- a/trpc-transport/trpc-transport-netty/src/main/java/com/tencent/trpc/transport/netty/NettyClientHandler.java
+++ b/trpc-transport/trpc-transport-netty/src/main/java/com/tencent/trpc/transport/netty/NettyClientHandler.java
@@ -20,8 +20,6 @@
import io.netty.channel.ChannelDuplexHandler;
import io.netty.channel.ChannelHandlerContext;
import io.netty.channel.ChannelPromise;
-import io.netty.handler.timeout.IdleState;
-import io.netty.handler.timeout.IdleStateEvent;
@io.netty.channel.ChannelHandler.Sharable
public class NettyClientHandler extends ChannelDuplexHandler {
@@ -105,24 +103,6 @@ public void write(ChannelHandlerContext ctx, Object msg, ChannelPromise promise)
}
}
- @Override
- public void userEventTriggered(ChannelHandlerContext ctx, Object evt) throws Exception {
- if (evt instanceof IdleStateEvent) {
- NettyChannel channel = NettyChannelManager.getOrAddChannel(ctx.channel(), config);
- try {
- // only close the channel in a TCP scenario.
- if (isTcp) {
- IdleState state = ((IdleStateEvent) evt).state();
- logger.warn("Idle event(state=" + state + ") trigger, close channel" + channel);
- channel.close();
- }
- } finally {
- NettyChannelManager.removeChannelIfDisconnected(ctx.channel());
- }
- }
- super.userEventTriggered(ctx, evt);
- }
-
public ConcurrentHashSet getChannelSet() {
return channelSet;
}
diff --git a/trpc-transport/trpc-transport-netty/src/main/java/com/tencent/trpc/transport/netty/NettyServerHandler.java b/trpc-transport/trpc-transport-netty/src/main/java/com/tencent/trpc/transport/netty/NettyServerHandler.java
index 4b495560e..26eade523 100644
--- a/trpc-transport/trpc-transport-netty/src/main/java/com/tencent/trpc/transport/netty/NettyServerHandler.java
+++ b/trpc-transport/trpc-transport-netty/src/main/java/com/tencent/trpc/transport/netty/NettyServerHandler.java
@@ -20,8 +20,6 @@
import io.netty.channel.ChannelDuplexHandler;
import io.netty.channel.ChannelHandlerContext;
import io.netty.channel.ChannelPromise;
-import io.netty.handler.timeout.IdleState;
-import io.netty.handler.timeout.IdleStateEvent;
import java.net.InetSocketAddress;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
@@ -122,21 +120,6 @@ public void exceptionCaught(ChannelHandlerContext ctx, Throwable cause) {
}
}
- @Override
- public void userEventTriggered(ChannelHandlerContext ctx, Object evt) throws Exception {
- if (evt instanceof IdleStateEvent) {
- NettyChannel channel = NettyChannelManager.getOrAddChannel(ctx.channel(), config);
- try {
- IdleState state = ((IdleStateEvent) evt).state();
- logger.warn("idle event[{}] trigger, close channel:{}", state, channel);
- channel.close();
- } finally {
- NettyChannelManager.removeChannelIfDisconnected(ctx.channel());
- }
- }
- super.userEventTriggered(ctx, evt);
- }
-
public ConcurrentMap getChannels() {
return clientChannels;
}
diff --git a/trpc-transport/trpc-transport-netty/src/main/java/com/tencent/trpc/transport/netty/NettyTcpClientTransport.java b/trpc-transport/trpc-transport-netty/src/main/java/com/tencent/trpc/transport/netty/NettyTcpClientTransport.java
index 8bc913304..fcb5d1ffb 100644
--- a/trpc-transport/trpc-transport-netty/src/main/java/com/tencent/trpc/transport/netty/NettyTcpClientTransport.java
+++ b/trpc-transport/trpc-transport-netty/src/main/java/com/tencent/trpc/transport/netty/NettyTcpClientTransport.java
@@ -11,8 +11,6 @@
package com.tencent.trpc.transport.netty;
-import static java.util.concurrent.TimeUnit.MILLISECONDS;
-
import com.tencent.trpc.core.common.config.ProtocolConfig;
import com.tencent.trpc.core.logger.Logger;
import com.tencent.trpc.core.logger.LoggerFactory;
@@ -20,21 +18,57 @@
import com.tencent.trpc.core.transport.codec.ClientCodec;
import io.netty.bootstrap.Bootstrap;
import io.netty.buffer.PooledByteBufAllocator;
+import io.netty.channel.ChannelDuplexHandler;
+import io.netty.channel.ChannelHandlerContext;
import io.netty.channel.ChannelInitializer;
import io.netty.channel.ChannelOption;
import io.netty.channel.ChannelPipeline;
+import io.netty.channel.EventLoopGroup;
+import io.netty.channel.epoll.Epoll;
+import io.netty.channel.epoll.EpollChannelOption;
+import io.netty.channel.epoll.EpollEventLoopGroup;
+import io.netty.channel.epoll.EpollSocketChannel;
import io.netty.channel.nio.NioEventLoopGroup;
import io.netty.channel.socket.nio.NioSocketChannel;
+import io.netty.handler.timeout.IdleStateEvent;
import io.netty.handler.timeout.IdleStateHandler;
import io.netty.util.concurrent.DefaultThreadFactory;
import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.TimeUnit;
/**
- * A netty tcp ClientTransport
+ * A netty tcp ClientTransport.
+ * Long-connection mode with read-idle close: when {@code config.idleTimeout > 0} an
+ * {@link IdleStateHandler} is installed in READ idle mode so that a channel which
+ * has not received any inbound bytes for {@code idleTimeout} ms is torn down on the
+ * client side. The next request goes through
+ * {@link com.tencent.trpc.core.transport.AbstractClientTransport#ensureChannelActive} and
+ * rebuilds a fresh TCP connection on the existing {@code EventLoopGroup}.
+ * READ idle (not ALL idle) is intentional: under "persistent traffic + silent packet
+ * drop" half-dead scenarios the client keeps writing successfully into the kernel send
+ * buffer, so {@code ALL_IDLE}/{@code WRITE_IDLE} would never fire — only the lack of any
+ * inbound reply makes the dead path observable within {@code idleTimeout}.
+ * TCP keepalive tuning: when {@code ioMode=epoll} on Linux (regardless of whether
+ * {@code ioThreadGroupShare} is true or false), the transport uses
+ * {@code EpollSocketChannel} backed by a per-variant shared or independent
+ * {@code EpollEventLoopGroup} and sets {@code TCP_KEEPIDLE}, {@code TCP_KEEPINTVL} and
+ * {@code TCP_KEEPCNT} per channel. With the Dubbo-style 30/10/3 defaults a half-dead
+ * connection is reset by the kernel within ~60 s, an order of magnitude faster than
+ * {@code idleTimeout} alone. This kicks in transparently and only on Linux + epoll;
+ * everywhere else the read-idle handler remains the universal fallback.
+ * Trade-off: pure one-way callers (request-only, no response) will see the channel
+ * recycled every {@code idleTimeout} ms, effectively turning the connection into a
+ * short-lived one. Such callers should configure {@code idleTimeout = 0} to disable the
+ * handler.
+ * Before the actual {@code ctx.close()}, the slot is invalidated via
+ * {@link com.tencent.trpc.core.transport.AbstractClientTransport#invalidateChannel} so that a
+ * concurrent request thread cannot route onto a channel that is in the middle of closing.
*/
public class NettyTcpClientTransport extends NettyAbstractClientTransport {
+ private static final Logger logger = LoggerFactory.getLogger(NettyTcpClientTransport.class);
+
public NettyTcpClientTransport(ProtocolConfig config, ChannelHandler handler, ClientCodec clientCodec) {
super(config, handler, clientCodec, "Netty-ShareTcpClientWorker");
}
@@ -42,16 +76,29 @@ public NettyTcpClientTransport(ProtocolConfig config, ChannelHandler handler, Cl
@Override
protected void doOpen() {
bootstrap = new Bootstrap();
- NioEventLoopGroup myEventLoopGroup;
- if (!config.isIoThreadGroupShare()) {
+ // Decide the IO model: epoll if (a) the JVM has a working netty-epoll native
+ // library, AND (b) the user opted into ioMode=epoll. The flag is independent of
+ // ioThreadGroupShare — both shared and independent pools support epoll now.
+ boolean useEpoll = wantsEpoll(config);
+ EventLoopGroup myEventLoopGroup;
+ Class<? extends io.netty.channel.Channel> channelClass = useEpoll
+ ? EpollSocketChannel.class
+ : NioSocketChannel.class;
+ if (Boolean.TRUE.equals(config.isIoThreadGroupShare())) {
+ // Reference counter has already been incremented in the constructor; just use
+ // the shared group of the matching variant here. This avoids any TOCTOU race
+ // between constructor and doOpen.
+ myEventLoopGroup = getSharedEventLoopGroup();
+ } else if (useEpoll) {
+ myEventLoopGroup = new EpollEventLoopGroup(config.getIoThreads(),
+ new DefaultThreadFactory(
+ "Netty-EpollTcpClientWorker-" + config.getIp() + ":" + config.getPort()));
+ } else {
myEventLoopGroup = new NioEventLoopGroup(config.getIoThreads(),
new DefaultThreadFactory(
"Netty-TcpClientWorker-" + config.getIp() + ":" + config.getPort()));
- } else {
- myEventLoopGroup = SHARE_EVENT_LOOP_GROUP;
- SHARE_EVENT_LOOP_GROUP_USED_NUMS.incrementAndGet();
}
- bootstrap.group(myEventLoopGroup).channel(NioSocketChannel.class)
+ bootstrap.group(myEventLoopGroup).channel(channelClass)
.option(ChannelOption.SO_KEEPALIVE, true).option(ChannelOption.TCP_NODELAY, true)
.option(ChannelOption.ALLOCATOR, PooledByteBufAllocator.DEFAULT)
.option(ChannelOption.CONNECT_TIMEOUT_MILLIS, config.getConnTimeout());
@@ -61,26 +108,42 @@ protected void doOpen() {
if (config.getSendBuffer() > 0) {
bootstrap.option(ChannelOption.SO_SNDBUF, config.getSendBuffer());
}
+ if (useEpoll) {
+ applyTcpKeepAliveTuning(bootstrap);
+ }
final NettyClientHandler clientHandler =
new NettyClientHandler(getChannelHandler(), config, true);
channelSet = clientHandler.getChannelSet();
- bootstrap.handler(new ChannelInitializer<NioSocketChannel>() {
+ final long idleTimeoutMills = resolveIdleTimeoutMills();
+ bootstrap.handler(new ChannelInitializer<io.netty.channel.Channel>() {
@Override
- protected void initChannel(NioSocketChannel ch) {
-
- IdleStateHandler clientIdleHandler =
- new IdleStateHandler(0, config.getIdleTimeout(), 0, MILLISECONDS);
+ protected void initChannel(io.netty.channel.Channel ch) {
ChannelPipeline p = ch.pipeline();
if (codec == null) {
- p.addLast("client-idle", clientIdleHandler).addLast("handler", clientHandler);
+ p.addLast("handler", clientHandler);
} else {
NettyCodecAdapter nettyCodec = NettyCodecAdapter
.createTcpCodecAdapter(codec, config);
p.addLast("encode", nettyCodec.getEncoder())
.addLast("decode", nettyCodec.getDecoder())
- .addLast("client-idle", clientIdleHandler)
.addLast("handler", clientHandler);
}
+ if (idleTimeoutMills > 0) {
+ // READ idle (not ALL idle): trigger when no inbound bytes have been
+ // observed for {@code idleTimeoutMills}, regardless of whether the
+ // application keeps writing. This is the critical knob for half-dead
+ // connection detection on platforms where TCP keepalive tuning is not
+ // available; on Linux + epoll the keepalive parameters above kick in
+ // first and recover the connection in seconds rather than minutes.
+ //
+ // The idle handlers MUST sit before {@code "handler"} (NettyClientHandler)
+ // because the latter does not propagate {@code channelActive} downstream,
+ // and IdleStateHandler relies on channelActive (or handlerAdded while
+ // active) to start its timer.
+ p.addBefore("handler", "idleState",
+ new IdleStateHandler(idleTimeoutMills, 0L, 0L, TimeUnit.MILLISECONDS));
+ p.addBefore("handler", "idleClose", new IdleCloseHandler());
+ }
}
});
}
@@ -98,4 +161,93 @@ public Bootstrap getBootstrap() {
protected boolean useChannelPool() {
return config.isKeepAlive();
}
+
+ /**
+ * Apply Linux {@code TCP_KEEPIDLE / TCP_KEEPINTVL / TCP_KEEPCNT} to the bootstrap. A
+ * non-positive configured value (or {@code null}) leaves the corresponding kernel
+ * default in place. The whole tuning is silently no-op on non-Linux platforms; the
+ * caller has already verified epoll availability before calling this method.
+ */
+ private void applyTcpKeepAliveTuning(Bootstrap bootstrap) {
+ Integer idle = config.getTcpKeepAliveIdle();
+ Integer intvl = config.getTcpKeepAliveIntvl();
+ Integer cnt = config.getTcpKeepAliveCnt();
+ if (idle != null && idle > 0) {
+ bootstrap.option(EpollChannelOption.TCP_KEEPIDLE, idle);
+ }
+ if (intvl != null && intvl > 0) {
+ bootstrap.option(EpollChannelOption.TCP_KEEPINTVL, intvl);
+ }
+ if (cnt != null && cnt > 0) {
+ bootstrap.option(EpollChannelOption.TCP_KEEPCNT, cnt);
+ }
+ if (logger.isInfoEnabled()) {
+ logger.info("TCP keepalive tuning enabled on {}: idle={}s, intvl={}s, cnt={}",
+ config.toSimpleString(), idle, intvl, cnt);
+ }
+ }
+
+ /**
+ * Resolve the idle-close threshold (milliseconds). A non-positive {@code idleTimeout}
+ * configuration disables the idle-close handler entirely (legacy behaviour).
+ */
+ private long resolveIdleTimeoutMills() {
+ Integer raw = config.getIdleTimeout();
+ if (raw == null || raw <= 0) {
+ return 0L;
+ }
+ return raw.longValue();
+ }
+
+ /**
+ * Handler added just before {@code "handler"} (not at the pipeline tail) that, on an
+ * {@link IdleStateEvent}, first invalidates the transport slot holding this channel so that
+ * concurrent request threads see "needs reconnect" immediately, then closes the channel. This
+ * shrinks the "request lands on a closing channel" race window from "close completes" to "close is enqueued".
+ */
+ private final class IdleCloseHandler extends ChannelDuplexHandler {
+
+ @Override
+ public void userEventTriggered(ChannelHandlerContext ctx, Object evt) throws Exception {
+ if (evt instanceof IdleStateEvent) {
+ io.netty.channel.Channel ioChannel = ctx.channel();
+ IdleStateEvent idleEvt = (IdleStateEvent) evt;
+ // Identifying info for ops triage:
+ // * caller (this side): local socket address — uniquely identifies the
+ // consumer process even when the framework-level CallerServiceName is
+ // not propagated down to the transport layer.
+ // * callee (peer side): config.toSimpleString() (name:protocol:ip:port:network)
+ // plus the actual remote socket address, which captures DNS-resolved IP.
+ logger.info("[long-link][idle-fire] state={} caller(local)={} callee={} remote={} channelId={}",
+ idleEvt.state(),
+ ioChannel.localAddress(),
+ config.toSimpleString(),
+ ioChannel.remoteAddress(),
+ ioChannel.id().asShortText());
+ try {
+ com.tencent.trpc.core.transport.Channel wrapper =
+ NettyChannelManager.getOrAddChannel(ioChannel, config);
+ if (wrapper != null) {
+ invalidateChannel(wrapper);
+ }
+ } catch (Throwable ex) {
+ logger.warn("[long-link][idle-fire] invalidate slot failed, caller(local)={} "
+ + "callee={} remote={} channelId={}",
+ ioChannel.localAddress(), config.toSimpleString(),
+ ioChannel.remoteAddress(), ioChannel.id().asShortText(), ex);
+ }
+ // Async close on the EventLoop; log when it actually completes so ops can
+ // tell apart "close enqueued" from "close finished".
+ ctx.close().addListener(future -> logger.info(
+ "[long-link][idle-close] success={} caller(local)={} callee={} remote={} channelId={}"
+ + (future.isSuccess() ? "" : " cause={}"),
+ future.isSuccess(),
+ ioChannel.localAddress(), config.toSimpleString(),
+ ioChannel.remoteAddress(), ioChannel.id().asShortText(),
+ future.isSuccess() ? null : future.cause()));
+ return;
+ }
+ super.userEventTriggered(ctx, evt);
+ }
+ }
}
diff --git a/trpc-transport/trpc-transport-netty/src/main/java/com/tencent/trpc/transport/netty/NettyTcpServerTransport.java b/trpc-transport/trpc-transport-netty/src/main/java/com/tencent/trpc/transport/netty/NettyTcpServerTransport.java
index 4af2e3cdd..d6e6d6c6e 100644
--- a/trpc-transport/trpc-transport-netty/src/main/java/com/tencent/trpc/transport/netty/NettyTcpServerTransport.java
+++ b/trpc-transport/trpc-transport-netty/src/main/java/com/tencent/trpc/transport/netty/NettyTcpServerTransport.java
@@ -11,8 +11,6 @@
package com.tencent.trpc.transport.netty;
-import static java.util.concurrent.TimeUnit.MILLISECONDS;
-
import com.tencent.trpc.core.common.config.ProtocolConfig;
import com.tencent.trpc.core.exception.TransportException;
import com.tencent.trpc.core.logger.Logger;
@@ -40,7 +38,6 @@
import io.netty.channel.socket.SocketChannel;
import io.netty.channel.socket.nio.NioServerSocketChannel;
import io.netty.handler.flush.FlushConsolidationHandler;
-import io.netty.handler.timeout.IdleStateHandler;
import io.netty.util.Version;
import io.netty.util.concurrent.DefaultThreadFactory;
import java.util.HashSet;
@@ -122,16 +119,14 @@ protected void doOpen() {
@Override
protected void initChannel(SocketChannel ch) throws Exception {
ChannelPipeline p = ch.pipeline();
- IdleStateHandler idleHandler =
- new IdleStateHandler(0, 0, config.getIdleTimeout(), MILLISECONDS);
- if (codec == null) {
- p.addLast("server-idle", idleHandler);
- } else {
+ // Long-connection mode: do NOT install IdleStateHandler. The idleTimeout field
+ // is kept for backward compatibility but no longer takes effect on the netty
+ // pipeline. The server never proactively closes a client connection due to idle.
+ if (codec != null) {
NettyCodecAdapter nettyCodec = NettyCodecAdapter
.createTcpCodecAdapter(codec, config);
p.addLast("encode", nettyCodec.getEncoder())//
- .addLast("decode", nettyCodec.getDecoder())//
- .addLast("server-idle", idleHandler);
+ .addLast("decode", nettyCodec.getDecoder());
}
if (flushConsolidationSwitch) {
p.addLast("flushConsolidationHandlers",
diff --git a/trpc-transport/trpc-transport-netty/src/main/java/com/tencent/trpc/transport/netty/NettyUdpClientTransport.java b/trpc-transport/trpc-transport-netty/src/main/java/com/tencent/trpc/transport/netty/NettyUdpClientTransport.java
index daefda913..7afe26697 100644
--- a/trpc-transport/trpc-transport-netty/src/main/java/com/tencent/trpc/transport/netty/NettyUdpClientTransport.java
+++ b/trpc-transport/trpc-transport-netty/src/main/java/com/tencent/trpc/transport/netty/NettyUdpClientTransport.java
@@ -12,8 +12,6 @@
package com.tencent.trpc.transport.netty;
import com.tencent.trpc.core.common.config.ProtocolConfig;
-import com.tencent.trpc.core.logger.Logger;
-import com.tencent.trpc.core.logger.LoggerFactory;
import com.tencent.trpc.core.transport.ChannelHandler;
import com.tencent.trpc.core.transport.codec.ClientCodec;
import com.tencent.trpc.core.utils.NetUtils;
@@ -21,14 +19,42 @@
import io.netty.channel.ChannelInitializer;
import io.netty.channel.ChannelOption;
import io.netty.channel.ChannelPipeline;
+import io.netty.channel.EventLoopGroup;
import io.netty.channel.FixedRecvByteBufAllocator;
+import io.netty.channel.epoll.EpollDatagramChannel;
+import io.netty.channel.epoll.EpollEventLoopGroup;
import io.netty.channel.nio.NioEventLoopGroup;
import io.netty.channel.socket.nio.NioDatagramChannel;
import io.netty.util.concurrent.DefaultThreadFactory;
import java.util.concurrent.CompletableFuture;
/**
- * A netty udp ClientTransport
+ * A netty udp ClientTransport. Honours {@code ioMode=epoll} for the datagram path the
+ * same way the TCP transport does, picking either {@link EpollDatagramChannel} +
+ * {@link EpollEventLoopGroup} or {@link NioDatagramChannel} + {@link NioEventLoopGroup}.
+ * The shared-group pool in {@link NettyAbstractClientTransport} is variant-aware, so
+ * mixing {@code ioThreadGroupShare=true} with {@code ioMode=epoll} is now legal.
+ *
+ * Long-connection scope on UDP. UDP is connectionless: a datagram socket has no
+ * notion of "half-dead" or "peer disconnected", and {@link io.netty.channel.Channel#isActive()}
+ * stays {@code true} for the lifetime of the local socket. As a consequence, only part of
+ * the long-connection hardening introduced for TCP applies here:
+ *
+ * - Applies (Layer 5): the shared {@link EventLoopGroup} pool and the
+ * NIO/Epoll channel selection are real wins for UDP — fewer threads, native
+ * epoll datagram path on Linux.
+ * - Does NOT apply (Layer 2 / 3 / 4): the slot + {@code ensureChannelActive}
+ * reconnect machinery in {@link com.tencent.trpc.core.transport.AbstractClientTransport},
+ * the read-idle close handler, and the {@code TCP_KEEPIDLE / INTVL / CNT} tuning
+ * are all TCP-specific. For UDP they are either no-ops (the keepalive options are
+ * simply not set) or fast-path through (the slot is built once at
+ * {@code lazyinit} time and never invalidated, so {@code ensureChannelActive}
+ * always short-circuits at its first {@code !needsReconnect} check).
+ *
+ * The slot path being walked-but-never-firing on UDP is intentional: it keeps the
+ * {@code AbstractClientTransport} contract uniform across protocols at a negligible
+ * per-call overhead (one COW-list read + one boolean check). Readers should not assume
+ * UDP enjoys the same half-dead detection or thundering-herd protection as TCP.
*/
public class NettyUdpClientTransport extends NettyAbstractClientTransport {
@@ -41,22 +67,32 @@ protected void doOpen() {
final NettyClientHandler clientHandler =
new NettyClientHandler(getChannelHandler(), config, false);
this.bootstrap = new Bootstrap();
- NioEventLoopGroup myEventLoopGroup;
- if (!config.isIoThreadGroupShare()) {
+ boolean useEpoll = wantsEpoll(config);
+ EventLoopGroup myEventLoopGroup;
+ Class<? extends io.netty.channel.Channel> channelClass = useEpoll
+ ? EpollDatagramChannel.class
+ : NioDatagramChannel.class;
+ if (Boolean.TRUE.equals(config.isIoThreadGroupShare())) {
+ // Reference counter has already been incremented in the constructor; just use
+ // the shared group of the matching variant here. This avoids any TOCTOU race
+ // between constructor and doOpen.
+ myEventLoopGroup = getSharedEventLoopGroup();
+ } else if (useEpoll) {
+ myEventLoopGroup = new EpollEventLoopGroup(config.getIoThreads(),
+ new DefaultThreadFactory(
+ "Netty-EpollUdpClientWorker-" + config.getIp() + ":" + config.getPort()));
+ } else {
myEventLoopGroup = new NioEventLoopGroup(config.getIoThreads(),
new DefaultThreadFactory(
"Netty-UdpClientWorker-" + config.getIp() + ":" + config.getPort()));
- } else {
- myEventLoopGroup = SHARE_EVENT_LOOP_GROUP;
- SHARE_EVENT_LOOP_GROUP_USED_NUMS.incrementAndGet();
}
channelSet = clientHandler.getChannelSet();
- bootstrap.channel(NioDatagramChannel.class).group(myEventLoopGroup)
+ bootstrap.channel(channelClass).group(myEventLoopGroup)
.option(ChannelOption.RCVBUF_ALLOCATOR,
new FixedRecvByteBufAllocator(config.getReceiveBuffer()))
- .handler(new ChannelInitializer<NioDatagramChannel>() {
+ .handler(new ChannelInitializer<io.netty.channel.Channel>() {
@Override
- protected void initChannel(NioDatagramChannel ch) throws Exception {
+ protected void initChannel(io.netty.channel.Channel ch) throws Exception {
ChannelPipeline p = ch.pipeline();
if (codec == null) {
p.addLast("handler", clientHandler);
diff --git a/trpc-transport/trpc-transport-netty/src/test/java/com/tencent/trpc/transport/netty/NettyAbstractClientTransportTest.java b/trpc-transport/trpc-transport-netty/src/test/java/com/tencent/trpc/transport/netty/NettyAbstractClientTransportTest.java
new file mode 100644
index 000000000..4a39b1d6f
--- /dev/null
+++ b/trpc-transport/trpc-transport-netty/src/test/java/com/tencent/trpc/transport/netty/NettyAbstractClientTransportTest.java
@@ -0,0 +1,289 @@
+/*
+ * Tencent is pleased to support the open source community by making tRPC available.
+ *
+ * Copyright (C) 2023 Tencent.
+ * All rights reserved.
+ *
+ * If you have downloaded a copy of the tRPC source code from Tencent,
+ * please note that tRPC source code is licensed under the Apache 2.0 License,
+ * A copy of the Apache 2.0 License can be found in the LICENSE file.
+ */
+
+package com.tencent.trpc.transport.netty;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertSame;
+import static org.junit.Assert.assertTrue;
+
+import com.tencent.trpc.core.common.config.ProtocolConfig;
+import com.tencent.trpc.core.transport.Channel;
+import com.tencent.trpc.core.transport.handler.ChannelHandlerAdapter;
+import io.netty.channel.EventLoopGroup;
+import io.netty.channel.epoll.Epoll;
+import java.lang.reflect.Field;
+import java.lang.reflect.Method;
+import java.util.Set;
+import java.util.concurrent.CompletableFuture;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Coverage for {@link NettyAbstractClientTransport}'s shared-group reference-count model:
+ * the two-slot (NIO / Epoll) pool, idempotent close, the {@code shareGroupAcquired} guard
+ * and the {@code wantsEpoll} predicate.
+ *
+ * The tests use a no-op subclass that does not actually open any sockets, so the shared
+ * group is acquired in the constructor and released in {@code doClose} without any Netty
+ * channel activity in between.
+ */
+public class NettyAbstractClientTransportTest {
+
+ /**
+ * Ensure each test starts from a clean shared-group state. The pool is a process-wide
+ * singleton so leftovers from earlier tests in the JVM (or interleaved tests) would
+ * otherwise distort the reference counter assertions below.
+ */
+ @Before
+ public void setUp() throws Exception {
+ resetSharedState();
+ }
+
+ @After
+ public void tearDown() throws Exception {
+ resetSharedState();
+ }
+
+ /**
+ * NIO shared group: counter increments on construction, decrements on close, group is
+ * lazily created and torn down only when the counter returns to zero.
+ */
+ @Test
+ public void testNioSharedGroupReferenceCounting() {
+ final ProtocolConfig c1 = newConfig(true, false);
+ NoopTransport t1 = new NoopTransport(c1);
+ t1.open();
+ assertEquals(1, NettyAbstractClientTransport.SHARE_NIO_USED_NUMS.get());
+ assertNotNull("first constructor must lazily create the NIO group",
+ NettyAbstractClientTransport.SHARE_NIO_GROUP);
+ EventLoopGroup grp = NettyAbstractClientTransport.SHARE_NIO_GROUP;
+
+ final ProtocolConfig c2 = newConfig(true, false);
+ NoopTransport t2 = new NoopTransport(c2);
+ t2.open();
+ assertEquals(2, NettyAbstractClientTransport.SHARE_NIO_USED_NUMS.get());
+ assertSame("second constructor must reuse the existing group",
+ grp, NettyAbstractClientTransport.SHARE_NIO_GROUP);
+
+ t1.close();
+ // First close drops the counter but keeps the group alive — t2 still references it.
+ assertEquals(1, NettyAbstractClientTransport.SHARE_NIO_USED_NUMS.get());
+ assertSame(grp, NettyAbstractClientTransport.SHARE_NIO_GROUP);
+
+ t2.close();
+ // Second close drops to zero — group must be released and slot nulled out.
+ assertEquals(0, NettyAbstractClientTransport.SHARE_NIO_USED_NUMS.get());
+ assertNull("group must be released when refcount returns to zero",
+ NettyAbstractClientTransport.SHARE_NIO_GROUP);
+ }
+
+ /**
+ * Independent (non-shared) mode: the shared counter is never touched, and {@code close()}
+ * does not interact with the shared slots.
+ */
+ @Test
+ public void testIndependentModeDoesNotTouchSharedCounter() {
+ ProtocolConfig c = newConfig(false, false);
+ int nioBefore = NettyAbstractClientTransport.SHARE_NIO_USED_NUMS.get();
+ int epollBefore = NettyAbstractClientTransport.SHARE_EPOLL_USED_NUMS.get();
+
+ NoopTransport t = new NoopTransport(c);
+ t.open();
+ assertEquals("independent mode must not touch NIO counter",
+ nioBefore, NettyAbstractClientTransport.SHARE_NIO_USED_NUMS.get());
+ assertEquals("independent mode must not touch EPOLL counter",
+ epollBefore, NettyAbstractClientTransport.SHARE_EPOLL_USED_NUMS.get());
+
+ t.close();
+ assertEquals(nioBefore, NettyAbstractClientTransport.SHARE_NIO_USED_NUMS.get());
+ assertEquals(epollBefore, NettyAbstractClientTransport.SHARE_EPOLL_USED_NUMS.get());
+ }
+
+ /**
+ * The {@code shareGroupAcquired} guard makes {@code close()} idempotent: a second close
+ * must NOT decrement the counter again. Without the guard the pool would be torn down
+ * while another transport still holds a logical reference.
+ */
+ @Test
+ public void testDoubleCloseIsIdempotent() {
+ ProtocolConfig c = newConfig(true, false);
+ NoopTransport t = new NoopTransport(c);
+ t.open();
+ assertEquals(1, NettyAbstractClientTransport.SHARE_NIO_USED_NUMS.get());
+
+ t.close();
+ assertEquals(0, NettyAbstractClientTransport.SHARE_NIO_USED_NUMS.get());
+
+ // Second close is a no-op for the shared counter.
+ t.close();
+ assertEquals("idempotent close must not double-decrement",
+ 0, NettyAbstractClientTransport.SHARE_NIO_USED_NUMS.get());
+ }
+
+ /**
+ * {@code wantsEpoll} guards: null config and non-epoll {@code ioMode} must both yield
+ * false. The Linux/native availability check is JVM-environment-dependent so we only
+ * assert the negative branches deterministically.
+ */
+ @Test
+ public void testWantsEpollNegativeBranches() {
+ assertFalse("null config must yield wantsEpoll=false",
+ NoopTransport.wantsEpollPublic(null));
+ ProtocolConfig nio = newConfig(true, false);
+ assertFalse("ioMode=nio must yield wantsEpoll=false",
+ NoopTransport.wantsEpollPublic(nio));
+ }
+
+ /**
+ * {@code getSharedEventLoopGroup} returns null for independent transports, the NIO
+ * instance for NIO-shared transports, and (when epoll is available) the EPOLL instance
+ * for epoll-shared transports.
+ */
+ @Test
+ public void testGetSharedEventLoopGroupRoutes() {
+ // Independent — no shared group.
+ NoopTransport indep = new NoopTransport(newConfig(false, false));
+ indep.open();
+ assertNull("independent transport must report no shared group",
+ indep.getSharedEventLoopGroupPublic());
+ indep.close();
+
+ // NIO-shared.
+ NoopTransport nio = new NoopTransport(newConfig(true, false));
+ nio.open();
+ EventLoopGroup nioGrp = nio.getSharedEventLoopGroupPublic();
+ assertNotNull(nioGrp);
+ assertSame(NettyAbstractClientTransport.SHARE_NIO_GROUP, nioGrp);
+ nio.close();
+
+ // EPOLL-shared (only when the JVM has the native lib loaded — otherwise we skip).
+ if (Epoll.isAvailable()) {
+ NoopTransport epoll = new NoopTransport(newConfig(true, true));
+ epoll.open();
+ EventLoopGroup epollGrp = epoll.getSharedEventLoopGroupPublic();
+ assertNotNull(epollGrp);
+ assertSame(NettyAbstractClientTransport.SHARE_EPOLL_GROUP, epollGrp);
+ epoll.close();
+ assertEquals(0, NettyAbstractClientTransport.SHARE_EPOLL_USED_NUMS.get());
+ }
+ }
+
+ /**
+ * The release path tolerates a counter that has somehow been driven below zero (e.g. by
+ * an external test reset or a buggy manual close): it must clamp at zero rather than
+ * leak a negative value into the next acquire cycle.
+ */
+ @Test
+ public void testReleasePathClampsNegativeCounter() throws Exception {
+ // Build then close one transport — this drives the counter through 1 → 0 normally.
+ NoopTransport t = new NoopTransport(newConfig(true, false));
+ t.open();
+ t.close();
+ assertEquals(0, NettyAbstractClientTransport.SHARE_NIO_USED_NUMS.get());
+
+ // Second transport: once it has been closed normally the counter is back at 0, so any
+ // further release is a "stray" one. Look up the private shareGroupAcquired guard now
+ // so that such a stray release can be forced further down.
+ NoopTransport t2 = new NoopTransport(newConfig(true, false));
+ t2.open();
+ Field acquired = NettyAbstractClientTransport.class
+ .getDeclaredField("shareGroupAcquired");
+ acquired.setAccessible(true);
+ // Regular close first (the guard has not been touched yet): this is the ordinary
+ // 1 → 0 release and must leave the counter at exactly 0.
+ t2.close();
+ assertEquals(0, NettyAbstractClientTransport.SHARE_NIO_USED_NUMS.get());
+ // Now re-arm the guard by hand and invoke doClose reflectively, so the release path
+ // runs once more with the counter already at 0 — the case that must not go negative.
+ acquired.setBoolean(t2, true);
+ Method doClose = NettyAbstractClientTransport.class.getDeclaredMethod("doClose");
+ doClose.setAccessible(true);
+ doClose.invoke(t2);
+ assertTrue("counter must never be negative",
+ NettyAbstractClientTransport.SHARE_NIO_USED_NUMS.get() >= 0);
+ }
+
+ /**
+ * Reset the shared state back to zero so each test sees a deterministic baseline.
+ */
+ private static void resetSharedState() throws Exception {
+ EventLoopGroup nio = NettyAbstractClientTransport.SHARE_NIO_GROUP;
+ if (nio != null) {
+ nio.shutdownGracefully();
+ }
+ NettyAbstractClientTransport.SHARE_NIO_GROUP = null;
+ NettyAbstractClientTransport.SHARE_EVENT_LOOP_GROUP = null;
+ NettyAbstractClientTransport.SHARE_NIO_USED_NUMS.set(0);
+
+ EventLoopGroup epoll = NettyAbstractClientTransport.SHARE_EPOLL_GROUP;
+ if (epoll != null) {
+ epoll.shutdownGracefully();
+ NettyAbstractClientTransport.SHARE_EPOLL_GROUP = null;
+ }
+ NettyAbstractClientTransport.SHARE_EPOLL_USED_NUMS.set(0);
+ }
+
+ private static ProtocolConfig newConfig(boolean shared, boolean epoll) {
+ ProtocolConfig config = new ProtocolConfig();
+ config.setIp("127.0.0.1");
+ config.setPort(65000);
+ config.setNetwork("tcp");
+ config.setIoThreadGroupShare(shared);
+ config.setIoMode(epoll ? "epoll" : "nio");
+ config.setIoThreads(1);
+ return config;
+ }
+
+ /**
+ * No-op transport that exercises the base-class constructor / close paths without
+ * actually opening any sockets. {@code make()} is never called from these tests.
+ */
+ static class NoopTransport extends NettyAbstractClientTransport {
+
+ NoopTransport(ProtocolConfig config) {
+ super(config, new ChannelHandlerAdapter(), null, "Netty-Test-NoopTransport");
+ }
+
+ // Convenience pass-throughs to the protected helpers under test.
+ EventLoopGroup getSharedEventLoopGroupPublic() {
+ return getSharedEventLoopGroup();
+ }
+
+ static boolean wantsEpollPublic(ProtocolConfig config) {
+ return wantsEpoll(config);
+ }
+
+ @Override
+ protected void doOpen() {
+ // Intentionally empty: tests only exercise constructor + close.
+ }
+
+ @Override
+ protected CompletableFuture make() {
+ return new CompletableFuture<>();
+ }
+
+ @Override
+ protected boolean useChannelPool() {
+ return false;
+ }
+
+ @Override
+ public Set getChannels() {
+ return null;
+ }
+ }
+}
diff --git a/trpc-transport/trpc-transport-netty/src/test/java/com/tencent/trpc/transport/netty/NettyChannelHandlerTest.java b/trpc-transport/trpc-transport-netty/src/test/java/com/tencent/trpc/transport/netty/NettyChannelHandlerTest.java
index 6f962d660..11a0073b7 100644
--- a/trpc-transport/trpc-transport-netty/src/test/java/com/tencent/trpc/transport/netty/NettyChannelHandlerTest.java
+++ b/trpc-transport/trpc-transport-netty/src/test/java/com/tencent/trpc/transport/netty/NettyChannelHandlerTest.java
@@ -33,7 +33,8 @@ public void test() throws Exception {
new NettyClientHandler(new ChannelHandlerAdapter(), new ProtocolConfig(), true)
.userEventTriggered(new ChannelHandlerContextTest(channelTest2),
IdleStateEvent.WRITER_IDLE_STATE_EVENT);
- Assert.assertTrue(channelTest2.getIsClose() != null && channelTest2.isClose);
+ // Long-connection mode: client must NOT close the channel on idle event.
+ Assert.assertTrue(channelTest2.getIsClose() == null || !channelTest2.isClose);
ChannelTest channelTest3 = new ChannelTest();
channelTest3.setActive(true);
@@ -47,6 +48,7 @@ public void test() throws Exception {
new NettyServerHandler(new ChannelHandlerAdapter(), new ProtocolConfig(), true)
.userEventTriggered(new ChannelHandlerContextTest(channelTest4),
IdleStateEvent.WRITER_IDLE_STATE_EVENT);
- Assert.assertTrue(channelTest4.getIsClose() != null && channelTest4.isClose);
+ // Long-connection mode: server must NOT close the channel on idle event.
+ Assert.assertTrue(channelTest4.getIsClose() == null || !channelTest4.isClose);
}
}
diff --git a/trpc-transport/trpc-transport-netty/src/test/java/com/tencent/trpc/transport/netty/NettyTcpClientIdleCloseTest.java b/trpc-transport/trpc-transport-netty/src/test/java/com/tencent/trpc/transport/netty/NettyTcpClientIdleCloseTest.java
new file mode 100644
index 000000000..2d7794cce
--- /dev/null
+++ b/trpc-transport/trpc-transport-netty/src/test/java/com/tencent/trpc/transport/netty/NettyTcpClientIdleCloseTest.java
@@ -0,0 +1,126 @@
+/*
+ * Tencent is pleased to support the open source community by making tRPC available.
+ *
+ * Copyright (C) 2023 Tencent.
+ * All rights reserved.
+ *
+ * If you have downloaded a copy of the tRPC source code from Tencent,
+ * please note that tRPC source code is licensed under the Apache 2.0 License,
+ * A copy of the Apache 2.0 License can be found in the LICENSE file.
+ */
+
+package com.tencent.trpc.transport.netty;
+
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+import com.tencent.trpc.core.common.config.ProtocolConfig;
+import com.tencent.trpc.core.transport.Channel;
+import com.tencent.trpc.core.transport.handler.ChannelHandlerAdapter;
+import com.tencent.trpc.core.utils.NetUtils;
+import java.net.ServerSocket;
+import java.net.Socket;
+import java.util.concurrent.TimeUnit;
+import org.junit.Test;
+
+/**
+ * Verifies the client-side idle-close hand-off:
+ *
+ * - An {@link io.netty.handler.timeout.IdleStateHandler} is installed when
+ * {@code idleTimeout > 0}.
+ * - When idle fires (no inbound bytes within {@code idleTimeout}), the slot is
+ * invalidated before the underlying {@code channel.close()} runs, so the
+ * next request goes through {@code ensureChannelActive}'s lazy reconnect.
+ * - The actual {@code Channel.isConnected()} flips to false shortly afterwards.
+ *
+ *
+ * The peer is a plain {@link ServerSocket} that accepts but never replies — this isolates
+ * the test from {@link NettyTcpServerTransport}'s pipeline and guarantees the client
+ * triggers READ_IDLE without any inbound traffic muting the timer.
+ */
+public class NettyTcpClientIdleCloseTest {
+
+    @Test
+    public void idleTimeoutClosesChannelAndInvalidatesSlot() throws Exception {
+        // Plain TCP server socket: accept the client, hold it open, never write a byte —
+        // this is exactly the half-dead scenario READ_IDLE is designed to detect.
+        ServerSocket server = new ServerSocket(0);
+        server.setSoTimeout(10_000);
+        Thread acceptor = new Thread(() -> {
+            try (Socket s = server.accept()) {
+                // Hold the socket alive. The test will close itself; a long sleep here
+                // avoids server-side EOF leaking back as inbound bytes.
+                Thread.sleep(8_000);
+            } catch (Throwable ignore) {
+                // Test may finish before the sleep elapses — that's fine.
+            }
+        }, "test-accept");
+        acceptor.setDaemon(true);
+        acceptor.start();
+
+        ProtocolConfig clientConfig = new ProtocolConfig();
+        clientConfig.setIp(NetUtils.LOCAL_HOST);
+        clientConfig.setPort(server.getLocalPort());
+        clientConfig.setNetwork("tcp");
+        clientConfig.setConnsPerAddr(1);
+        clientConfig.setLazyinit(false);
+        // Independent EventLoopGroup so this test cleans up after itself even when other
+        // tests in the same JVM mutated the shared pool.
+        clientConfig.setIoThreadGroupShare(false);
+        // 500ms: comfortably above EventLoop scheduling jitter on slow CI machines.
+        clientConfig.setIdleTimeout(500);
+
+        NettyTcpClientTransport client = new NettyTcpClientTransport(clientConfig,
+                new ChannelHandlerAdapter(), new TransportClientCodecTest());
+        try {
+            client.open();
+
+            // Force the lazy connect to materialise.
+            Channel ch = client.getChannel().toCompletableFuture()
+                    .get(2, TimeUnit.SECONDS);
+            assertNotNull(ch);
+            assertTrue("channel must be live before idle timeout", ch.isConnected());
+
+            // Wait for READ_IDLE to fire (idleTimeout=500ms) and the close + slot
+            // invalidation to propagate. 5s window leaves plenty of headroom.
+            long deadline = System.currentTimeMillis() + 5_000;
+            while (ch.isConnected() && System.currentTimeMillis() < deadline) {
+                Thread.sleep(50);
+            }
+            assertFalse("idle channel must have been closed by the idle handler",
+                    ch.isConnected());
+
+            // The slot should now report "needs reconnect": getChannel triggers
+            // ensureChannelActive which rebuilds the connection on the same EventLoopGroup.
+            // The acceptor thread calls accept() only once, but the ServerSocket itself is
+            // still listening, so whether the rebuild succeeds depends on the environment.
+            // Transport-level reconnect failures (exceptions, timeouts) are tolerated
+            // below; handing back the original, already closed channel is NOT — assertion
+            // failures propagate because only Exception (not Throwable) is caught.
+            try {
+                Channel rebuilt = client.getChannel().toCompletableFuture()
+                        .get(1, TimeUnit.SECONDS);
+                // If reconnect succeeded (some accept slot was still available) the new
+                // channel must be a different object than the closed one.
+                assertNotNull(rebuilt);
+                assertTrue(rebuilt != ch || rebuilt.isConnected());
+            } catch (Exception ignore) {
+                // The reconnect itself may fail or time out; that is acceptable because
+                // the idle close has already been verified. AssertionError is not caught.
+            }
+        } finally {
+            try {
+                client.close();
+            } catch (Throwable ignore) {
+                // best-effort cleanup
+            }
+            try {
+                server.close();
+            } catch (Throwable ignore) {
+                // best-effort cleanup
+            }
+            acceptor.interrupt();
+        }
+    }
+}
diff --git a/trpc-transport/trpc-transport-netty/src/test/java/com/tencent/trpc/transport/netty/NettyTcpClientTransportTest.java b/trpc-transport/trpc-transport-netty/src/test/java/com/tencent/trpc/transport/netty/NettyTcpClientTransportTest.java
new file mode 100644
index 000000000..d19e3025c
--- /dev/null
+++ b/trpc-transport/trpc-transport-netty/src/test/java/com/tencent/trpc/transport/netty/NettyTcpClientTransportTest.java
@@ -0,0 +1,285 @@
+/*
+ * Tencent is pleased to support the open source community by making tRPC available.
+ *
+ * Copyright (C) 2023 Tencent.
+ * All rights reserved.
+ *
+ * If you have downloaded a copy of the tRPC source code from Tencent,
+ * please note that tRPC source code is licensed under the Apache 2.0 License,
+ * A copy of the Apache 2.0 License can be found in the LICENSE file.
+ */
+
+package com.tencent.trpc.transport.netty;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
+import com.tencent.trpc.core.common.config.ProtocolConfig;
+import com.tencent.trpc.core.transport.handler.ChannelHandlerAdapter;
+import com.tencent.trpc.core.utils.NetUtils;
+import io.netty.bootstrap.Bootstrap;
+import io.netty.channel.ChannelOption;
+import io.netty.channel.ChannelPipeline;
+import io.netty.channel.epoll.EpollChannelOption;
+import java.lang.reflect.Method;
+import java.util.Map;
+import org.junit.Test;
+
+/**
+ * White-box coverage for {@link NettyTcpClientTransport}'s long-connection helpers:
+ * {@code resolveIdleTimeoutMills}, {@code applyTcpKeepAliveTuning} and the pipeline
+ * wiring driven by {@code idleTimeout}. The pipeline assertions go through a real
+ * {@code doOpen} on an unbound bootstrap — no socket is ever opened.
+ */
+public class NettyTcpClientTransportTest {
+
+    /**
+     * Verifies {@code resolveIdleTimeoutMills} for disabled and enabled configurations:
+     * non-positive {@code idleTimeout} values must resolve to 0 and a positive value must
+     * be returned unchanged.
+     *
+     * Note: a {@code null} argument never reaches the transport as {@code null};
+     * {@code newConfig} maps it to {@code -1}, so that case exercises the negative branch.
+     *
+     * @throws Exception if the reflective lookup or invocation fails
+     */
+    @Test
+    public void testResolveIdleTimeoutMillsBranches() throws Exception {
+        Method resolver =
+                NettyTcpClientTransport.class.getDeclaredMethod("resolveIdleTimeoutMills");
+        resolver.setAccessible(true);
+
+        // Disabled — null (mapped to -1 by newConfig) / 0 / negative all collapse to 0.
+        assertEquals(0L, resolveIdleTimeoutMills(resolver, null));
+        assertEquals(0L, resolveIdleTimeoutMills(resolver, 0));
+        assertEquals(0L, resolveIdleTimeoutMills(resolver, -1));
+
+        // Enabled — raw value is returned.
+        assertEquals(180_000L, resolveIdleTimeoutMills(resolver, 180_000));
+    }
+
+    /**
+     * Helper: builds a transport for the given {@code idleTimeout}, invokes the supplied
+     * reflective {@code resolveIdleTimeoutMills} handle on it and returns the result.
+     * The transport is closed in a {@code finally} block so it is released even when
+     * the invocation throws.
+     *
+     * @param resolver accessible handle to {@code resolveIdleTimeoutMills}
+     * @param idleTimeout value forwarded to {@code newTransport}; may be {@code null}
+     * @return the resolved idle timeout reported by the transport
+     * @throws Exception if transport creation, invocation or close fails
+     */
+    private static long resolveIdleTimeoutMills(Method resolver, Integer idleTimeout)
+            throws Exception {
+        NettyTcpClientTransport transport = newTransport(idleTimeout);
+        try {
+            return ((Long) resolver.invoke(transport)).longValue();
+        } finally {
+            transport.close();
+        }
+    }
+
+    /**
+     * Sets strictly positive, non-default values for all three TCP keepalive parameters
+     * (idle / interval / count) and asserts that {@code applyTcpKeepAliveTuning} stores
+     * each of them on the bootstrap under the matching {@link EpollChannelOption} key.
+     * The non-positive branches are covered by
+     * {@link #testApplyTcpKeepAliveTuningSkipsNonPositive()}.
+     *
+     * @throws Exception if the reflective invocation or transport close fails
+     */
+    @Test
+    public void testApplyTcpKeepAliveTuningSetsAllPositiveValues() throws Exception {
+        ProtocolConfig config = newConfig(180_000);
+        config.setTcpKeepAliveIdle(45);
+        config.setTcpKeepAliveIntvl(15);
+        config.setTcpKeepAliveCnt(7);
+
+        NettyTcpClientTransport transport = new NettyTcpClientTransport(config,
+                new ChannelHandlerAdapter(), null);
+        try {
+            Bootstrap bootstrap = new Bootstrap();
+            invokeApplyTcpKeepAliveTuning(transport, bootstrap);
+            // The bootstrap's option map is keyed by ChannelOption<?>.
+            Map<ChannelOption<?>, Object> opts = bootstrap.config().options();
+            assertEquals(45, opts.get(EpollChannelOption.TCP_KEEPIDLE));
+            assertEquals(15, opts.get(EpollChannelOption.TCP_KEEPINTVL));
+            assertEquals(7, opts.get(EpollChannelOption.TCP_KEEPCNT));
+        } finally {
+            transport.close();
+        }
+    }
+
+    /**
+     * Zero and negative keepalive values must NOT be propagated to the bootstrap, while a
+     * positive value configured alongside them must still be applied. This test sets
+     * idle=30 (expected on the bootstrap), intvl=0 and cnt=-1 (both expected absent).
+     * Original author's rationale: mis-configured yamls with negative values would
+     * otherwise surface as netty option errors at connect time.
+     *
+     * @throws Exception if the reflective invocation or transport close fails
+     */
+    @Test
+    public void testApplyTcpKeepAliveTuningSkipsNonPositive() throws Exception {
+        ProtocolConfig config = newConfig(180_000);
+        // Idle is positive and must be set; the other two are zero / negative and must NOT.
+        config.setTcpKeepAliveIdle(30);
+        config.setTcpKeepAliveIntvl(0);
+        config.setTcpKeepAliveCnt(-1);
+
+        NettyTcpClientTransport transport = new NettyTcpClientTransport(config,
+                new ChannelHandlerAdapter(), null);
+        try {
+            Bootstrap bootstrap = new Bootstrap();
+            invokeApplyTcpKeepAliveTuning(transport, bootstrap);
+            // The bootstrap's option map is keyed by ChannelOption<?>.
+            Map<ChannelOption<?>, Object> opts = bootstrap.config().options();
+            assertEquals(30, opts.get(EpollChannelOption.TCP_KEEPIDLE));
+            assertNull("non-positive intvl must not be propagated",
+                    opts.get(EpollChannelOption.TCP_KEEPINTVL));
+            assertNull("negative cnt must not be propagated",
+                    opts.get(EpollChannelOption.TCP_KEEPCNT));
+        } finally {
+            transport.close();
+        }
+    }
+
+    /**
+     * With a positive {@code idleTimeout} (180_000 here), the pipeline produced by
+     * {@code pipelineAfterDoOpen} must contain handlers registered under the names
+     * {@code idleState} and {@code idleClose}.
+     *
+     * @throws Exception if building the pipeline fails
+     */
+    @Test
+    public void testDoOpenInstallsIdlePipelineWhenEnabled() throws Exception {
+        ChannelPipeline installed = pipelineAfterDoOpen(180_000);
+
+        // Look both handlers up by name first, then assert on the results.
+        Object idleState = installed.get("idleState");
+        Object idleClose = installed.get("idleClose");
+
+        assertNotNull("idleState handler must be installed when idleTimeout > 0", idleState);
+        assertNotNull("idleClose handler must be installed when idleTimeout > 0", idleClose);
+    }
+
+    /**
+     * With {@code idleTimeout = 0}, the pipeline produced by {@code pipelineAfterDoOpen}
+     * must contain neither an {@code idleState} nor an {@code idleClose} handler.
+     *
+     * @throws Exception if building the pipeline fails
+     */
+    @Test
+    public void testDoOpenSkipsIdlePipelineWhenDisabled() throws Exception {
+        ChannelPipeline installed = pipelineAfterDoOpen(0);
+
+        // Look both handlers up by name first, then assert on the results.
+        Object idleState = installed.get("idleState");
+        Object idleClose = installed.get("idleClose");
+
+        assertNull("idleState handler must NOT be installed when idleTimeout = 0", idleState);
+        assertNull("idleClose handler must NOT be installed when idleTimeout = 0", idleClose);
+    }
+
+    /**
+     * Asserts that {@code useChannelPool()} mirrors the configured {@code keepAlive} flag:
+     * {@code true} when keepAlive is set to {@code true}, {@code false} when it is set to
+     * {@code false}. Each case uses its own transport, closed in a {@code finally} block.
+     *
+     * @throws Exception if the reflective invocation or transport close fails
+     */
+    @Test
+    public void testUseChannelPoolFollowsKeepAlive() throws Exception {
+        // keepAlive=true is checked first, then keepAlive=false.
+        for (boolean keepAlive : new boolean[] {true, false}) {
+            ProtocolConfig cfg = newConfig(180_000);
+            cfg.setKeepAlive(keepAlive);
+            NettyTcpClientTransport transport = new NettyTcpClientTransport(cfg,
+                    new ChannelHandlerAdapter(), null);
+            try {
+                assertEquals(keepAlive, invokeUseChannelPool(transport));
+            } finally {
+                transport.close();
+            }
+        }
+    }
+
+    /**
+     * Helper: reflectively calls {@code applyTcpKeepAliveTuning(Bootstrap)} declared on
+     * {@link NettyTcpClientTransport}, making the method accessible first.
+     *
+     * @param t transport instance to invoke the method on
+     * @param bootstrap bootstrap passed as the single argument
+     * @throws Exception if the method cannot be found or the invocation fails
+     */
+    private static void invokeApplyTcpKeepAliveTuning(NettyTcpClientTransport t,
+            Bootstrap bootstrap) throws Exception {
+        Class<?> owner = NettyTcpClientTransport.class;
+        Method tuning = owner.getDeclaredMethod("applyTcpKeepAliveTuning", Bootstrap.class);
+        tuning.setAccessible(true);
+        tuning.invoke(t, bootstrap);
+    }
+
+    /**
+     * Helper: reflectively calls the no-arg {@code useChannelPool()} declared on
+     * {@link NettyTcpClientTransport}, making the method accessible first.
+     *
+     * @param t transport instance to invoke the method on
+     * @return the boolean returned by {@code useChannelPool()}
+     * @throws Exception if the method cannot be found or the invocation fails
+     */
+    private static boolean invokeUseChannelPool(NettyTcpClientTransport t) throws Exception {
+        Class<?> owner = NettyTcpClientTransport.class;
+        Method poolFlag = owner.getDeclaredMethod("useChannelPool");
+        poolFlag.setAccessible(true);
+        Object result = poolFlag.invoke(t);
+        return (boolean) result;
+    }
+
+    /**
+     * Opens a transport configured with the given {@code idleTimeout}, takes the handler
+     * registered on its bootstrap and reflectively runs that handler's
+     * {@code initChannel(Channel)} against a fresh
+     * {@link io.netty.channel.embedded.EmbeddedChannel}. The returned pipeline therefore
+     * holds whatever handlers the initializer installs.
+     *
+     * The transport is built through {@code newTransport}, i.e. with the shared
+     * {@code newConfig} settings (local host, an available port, tcp, one connection per
+     * address, lazyinit=true, keepAlive=true, non-shared IO thread group).
+     * NOTE(review): the test relies on lazyinit=true meaning {@code open()} does not
+     * attempt a real connect — confirm against the transport implementation.
+     *
+     * @param idleTimeout idle timeout to configure on the transport
+     * @return the pipeline of the embedded channel after {@code initChannel} has run
+     * @throws Exception if opening the transport or the reflective invocation fails
+     */
+    private static ChannelPipeline pipelineAfterDoOpen(int idleTimeout) throws Exception {
+        // Same config and constructor arguments as the other tests in this class.
+        NettyTcpClientTransport transport = newTransport(idleTimeout);
+        try {
+            // After open() the bootstrap is expected to carry the ChannelInitializer.
+            transport.open();
+            io.netty.channel.ChannelHandler initializer =
+                    transport.getBootstrap().config().handler();
+            assertNotNull("doOpen must register a ChannelInitializer", initializer);
+
+            Method initChannel = findInitChannel(initializer.getClass());
+            assertNotNull("ChannelInitializer must declare an initChannel(Channel) method",
+                    initChannel);
+            initChannel.setAccessible(true);
+
+            // Create the target channel only once the method lookup has succeeded, and
+            // invoke initChannel directly rather than through netty's register() path.
+            io.netty.channel.embedded.EmbeddedChannel ch =
+                    new io.netty.channel.embedded.EmbeddedChannel();
+            initChannel.invoke(initializer, ch);
+
+            // Return the live pipeline. NOTE: the embedded channel is deliberately NOT
+            // closed — per the original author, close() detaches the handlers and would
+            // invalidate later pipeline.get(name) lookups.
+            return ch.pipeline();
+        } finally {
+            try {
+                transport.close();
+            } catch (Throwable ignore) {
+                // best-effort cleanup; the pipeline has already been captured
+            }
+        }
+    }
+
+    /**
+     * Helper: returns the first declared, non-synthetic, non-bridge method named
+     * {@code initChannel} that takes exactly one parameter whose type is
+     * {@link io.netty.channel.Channel} or a subtype, or {@code null} when the class
+     * declares no such method.
+     *
+     * @param initializerClass runtime class of the handler registered on the bootstrap
+     * @return the matching method, or {@code null} if none is declared
+     */
+    private static Method findInitChannel(Class<?> initializerClass) {
+        for (Method candidate : initializerClass.getDeclaredMethods()) {
+            if (!"initChannel".equals(candidate.getName())
+                    || candidate.getParameterCount() != 1
+                    || candidate.isSynthetic()
+                    || candidate.isBridge()) {
+                continue;
+            }
+            Class<?> paramType = candidate.getParameterTypes()[0];
+            if (io.netty.channel.Channel.class.isAssignableFrom(paramType)) {
+                return candidate;
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Builds a {@link NettyTcpClientTransport} from {@code newConfig(idleTimeout)} with a
+     * fresh {@code ChannelHandlerAdapter} and a fresh {@code TransportClientCodecTest}.
+     *
+     * @param idleTimeout idle timeout forwarded to {@code newConfig}; may be {@code null}
+     * @return a new transport instance; the caller is responsible for closing it
+     * @throws Exception declared for callers; propagated from construction if it fails
+     */
+    private static NettyTcpClientTransport newTransport(Integer idleTimeout) throws Exception {
+        return new NettyTcpClientTransport(newConfig(idleTimeout),
+                new ChannelHandlerAdapter(), new TransportClientCodecTest());
+    }
+
+    /**
+     * Builds the {@link ProtocolConfig} shared by the tests in this class: local host, an
+     * available port, {@code tcp}, one connection per address, lazyinit and keepAlive
+     * enabled, IO thread group sharing disabled, and an explicit {@code idleTimeout}.
+     *
+     * @param idleTimeout idle timeout to configure; {@code null} is mapped to {@code -1}
+     * @return a freshly populated config
+     */
+    private static ProtocolConfig newConfig(Integer idleTimeout) {
+        // idleTimeout is always set explicitly; per the original author's note the
+        // production default is 180_000, which the disabled-path tests must override.
+        int effectiveIdleTimeout = (idleTimeout != null) ? idleTimeout : -1;
+
+        ProtocolConfig cfg = new ProtocolConfig();
+        cfg.setIp(NetUtils.LOCAL_HOST);
+        cfg.setPort(NetUtils.getAvailablePort());
+        cfg.setNetwork("tcp");
+        cfg.setConnsPerAddr(1);
+        cfg.setLazyinit(true);
+        cfg.setKeepAlive(true);
+        // Not shared: intended to give each transport its own EventLoopGroup so each
+        // test cleans up its own threads (NOTE(review): confirm against ProtocolConfig).
+        cfg.setIoThreadGroupShare(false);
+        cfg.setIdleTimeout(effectiveIdleTimeout);
+        return cfg;
+    }
+}