1. 请求分发的简单思路
2. es中search的多节点分发收集
我们以search的分发与收集为出发点,看看es是如何办成这件事的。原因是search在es中最为普遍与经典,虽说不见得每个地方的实现都一样,但至少参考意义还是有的,故以search为切入点。search的框架工作流程,我们之前已经研究过,本节就直接从核心部分开始讲解,它位于 TransportSearchAction.executeRequest() 中。
// Routes a search request once its source has been rewritten: a purely local search when no
// remote cluster indices remain, otherwise either ccsRemoteReduce (when shouldMinimizeRoundtrips
// holds) or collectSearchShards followed by executeSearch.
// NOTE(review): this excerpt appears abridged; several statements and closing braces are missing.
private void executeRequest(Task task, SearchRequest searchRequest,
SearchAsyncActionProvider searchAsyncActionProvider, ActionListener<SearchResponse> listener) {
final long relativeStartNanos = System.nanoTime();
final SearchTimeProvider timeProvider =
new SearchTimeProvider(searchRequest.getOrCreateAbsoluteStartMillis(), relativeStartNanos, System::nanoTime);
// Listener invoked with the rewritten source; the dispatch logic lives inside it.
ActionListener<SearchSourceBuilder> rewriteListener = ActionListener.wrap(source -> {
if (source != searchRequest.source()) {
// only set it if it changed - we don't allow null values to be set but it might be already null. this way we catch
// situations when source is rewritten to null due to a bug
final ClusterState clusterState = clusterService.state();
final SearchContextId searchContext;
final Map<String, OriginalIndices> remoteClusterIndices;
// With a point-in-time builder, the search context and its indices are decoded from the PIT id.
if (searchRequest.pointInTimeBuilder() != null) {
searchContext = SearchContextId.decode(namedWriteableRegistry, searchRequest.pointInTimeBuilder().getId());
remoteClusterIndices = getIndicesFromSearchContexts(searchContext, searchRequest.indicesOptions());
} else {
searchContext = null;
remoteClusterIndices = remoteClusterService.groupIndices(searchRequest.indicesOptions(),
searchRequest.indices(), idx -> indexNameExpressionResolver.hasIndexAbstraction(idx, clusterState));
// Split off the local-cluster group; what remains in the map are remote clusters only.
OriginalIndices localIndices = remoteClusterIndices.remove(RemoteClusterAware.LOCAL_CLUSTER_GROUP_KEY);
if (remoteClusterIndices.isEmpty()) {
task, timeProvider, searchRequest, localIndices, clusterState, listener, searchContext, searchAsyncActionProvider);
} else {
// At least one remote cluster is involved in this request
if (shouldMinimizeRoundtrips(searchRequest)) {
// parentTaskId is passed along so that the child tasks can be associated with this task
final TaskId parentTaskId = task.taskInfo(clusterService.localNode().getId(), false).getTaskId();
ccsRemoteReduce(parentTaskId, searchRequest, localIndices, remoteClusterIndices, timeProvider,
remoteClusterService, threadPool, listener,
(r, l) -> executeLocalSearch(
task, timeProvider, r, localIndices, clusterState, l, searchContext, searchAsyncActionProvider));
} else {
AtomicInteger skippedClusters = new AtomicInteger(0);
// Otherwise ask every remote cluster for its search shards
collectSearchShards(searchRequest.indicesOptions(), searchRequest.preference(), searchRequest.routing(),
skippedClusters, remoteClusterIndices, remoteClusterService, threadPool,
searchShardsResponses -> {
// Follow-up listener: runs once all remote clusters have responded
final BiFunction<String, String, DiscoveryNode> clusterNodeLookup =
final Map<String, AliasFilter> remoteAliasFilters;
final List<SearchShardIterator> remoteShardIterators;
if (searchContext != null) {
remoteAliasFilters = searchContext.aliasFilter();
remoteShardIterators = getRemoteShardsIteratorFromPointInTime(searchShardsResponses,
searchContext, searchRequest.pointInTimeBuilder().getKeepAlive(), remoteClusterIndices);
} else {
remoteAliasFilters = getRemoteAliasFilters(searchShardsResponses);
remoteShardIterators = getRemoteShardsIterator(searchShardsResponses, remoteClusterIndices,
// The local cluster counts as one cluster only when local indices are present.
int localClusters = localIndices == null ? 0 : 1;
int totalClusters = remoteClusterIndices.size() + localClusters;
int successfulClusters = searchShardsResponses.size() + localClusters;
// How the search itself is executed afterwards is outside the scope of this excerpt
executeSearch((SearchTask) task, timeProvider, searchRequest, localIndices, remoteShardIterators,
clusterNodeLookup, clusterState, remoteAliasFilters, listener,
new SearchResponse.Clusters(totalClusters, successfulClusters, skippedClusters.get()),
searchContext, searchAsyncActionProvider);
}, listener::onFailure);
// A non-null source is rewritten first via rewriteAndFetch; the null-source branch is elided here.
if (searchRequest.source() == null) {
} else {
Rewriteable.rewriteAndFetch(searchRequest.source(), searchService.getRewriteContext(timeProvider::getAbsoluteStartMillis),
可以看到,es的search功能,会被划分为几种类型,有的会走集群分发,而有的则不需要。我们自然是希望走集群分发的,所以,只需看 collectSearchShards() 即可。这里面其实就是对多个远程集群(remoteIndicesByCluster 中的每个 clusterAlias)依次发起请求,当然还有结果收集。
// Sends one ClusterSearchShardsRequest per entry of remoteIndicesByCluster and gathers the
// responses in a map keyed by cluster alias; the map is the final response of the listener.
// NOTE(review): this excerpt is abridged -- the call that actually submits searchShardsRequest
// and several closing braces are missing.
static void collectSearchShards(IndicesOptions indicesOptions, String preference, String routing, AtomicInteger skippedClusters,
Map<String, OriginalIndices> remoteIndicesByCluster, RemoteClusterService remoteClusterService,
ThreadPool threadPool, ActionListener<Map<String, ClusterSearchShardsResponse>> listener) {
// Countdown (one count per remote cluster) used to detect when every cluster has answered
final CountDown responsesCountDown = new CountDown(remoteIndicesByCluster.size());
final Map<String, ClusterSearchShardsResponse> searchShardsResponses = new ConcurrentHashMap<>();
final AtomicReference<Exception> exceptions = new AtomicReference<>();
// Iterate over the remote clusters and send one request to each
for (Map.Entry<String, OriginalIndices> entry : remoteIndicesByCluster.entrySet()) {
final String clusterAlias = entry.getKey();
boolean skipUnavailable = remoteClusterService.isSkipUnavailable(clusterAlias);
Client clusterClient = remoteClusterService.getRemoteClusterClient(threadPool, clusterAlias);
final String[] indices = entry.getValue().indices();
ClusterSearchShardsRequest searchShardsRequest = new ClusterSearchShardsRequest(indices)
// Asynchronously issue the search-shards request to the cluster identified by clusterAlias
new CCSActionListener<ClusterSearchShardsResponse, Map<String, ClusterSearchShardsResponse>>(
clusterAlias, skipUnavailable, responsesCountDown, skippedClusters, exceptions, listener) {
void innerOnResponse(ClusterSearchShardsResponse clusterSearchShardsResponse) {
// Each per-cluster response is stored in searchShardsResponses
searchShardsResponses.put(clusterAlias, clusterSearchShardsResponse);
} @Override
Map<String, ClusterSearchShardsResponse> createFinalResponse() {
// Once every cluster has responded, hand back the collected map
return searchShardsResponses;
// Client-side entry point: executes ClusterSearchShardsAction with the given request and listener.
public void searchShards(final ClusterSearchShardsRequest request, final ActionListener<ClusterSearchShardsResponse> listener) {
// Issues the action indices:admin/shards/search_shards, which is handled by TransportClusterSearchShardsAction
execute(ClusterSearchShardsAction.INSTANCE, request, listener);
2.1. 多节点响应结果处理
这是本文讨论的重点。前面我们看到es已经把请求异步发送出去了(且不论其如何发送),所以如何收集结果也很关键。而es中的做法很简单:使用一个 ConcurrentHashMap 收集每个结果,再用一个 CountDown 标识是否已全部处理完成。
// Per-cluster listener. The constructor only stores its collaborators: the cluster alias, the
// skip-unavailable flag, the CountDown, the skipped-cluster counter, the exception holder and the
// original listener that receives the final response.
// NOTE(review): the bodies of onResponse and maybeFinish are abridged in this excerpt.
CCSActionListener(String clusterAlias, boolean skipUnavailable, CountDown countDown, AtomicInteger skippedClusters,
AtomicReference<Exception> exceptions, ActionListener<FinalResponse> originalListener) {
this.clusterAlias = clusterAlias;
this.skipUnavailable = skipUnavailable;
this.countDown = countDown;
this.skippedClusters = skippedClusters;
this.exceptions = exceptions;
this.originalListener = originalListener;
} // Called on a successful response
public final void onResponse(Response response) {
// The inner response handler stores the result in searchShardsResponses
// maybeFinish then checks whether everything has completed and, if so, invokes the callback that builds the result
} private void maybeFinish() {
// Completion is gated by the shared counter (described in this article as AtomicInteger-based)
if (countDown.countDown()) {
Exception exception = exceptions.get();
if (exception == null) {
FinalResponse response;
try {
// Build the final response; for search this is searchShardsResponses
response = createFinalResponse();
} catch(Exception e) {
// Success callback: lets the caller continue processing once the results are collected
} else {
// CountDown is simple: only the call that reaches zero returns true and every other call returns
// false, so the completion path runs at most once.
// NOTE(review): the Javadoc delimiters and closing braces are missing from this excerpt.
* Decrements the count-down and returns <code>true</code> iff this call
* reached zero otherwise <code>false</code>
public boolean countDown() {
assert originalCount > 0;
// Retry loop: re-read the current value until the compare-and-set succeeds or zero is observed.
for (;;) {
final int current = countDown.get();
assert current >= 0;
// Already at zero: an earlier call finished the count-down, so this call reports false.
if (current == 0) {
return false;
if (countDown.compareAndSet(current, current - 1)) {
// True only for the call that moved the counter from 1 to 0.
return current == 1;
可见,ES中的结果收集,是以一个 AtomicInteger 实现的CountDown来处理的:每个节点响应时,都会先把该节点的数据放入ConcurrentHashMap中暂存起来;当最后一个节点也响应(计数归零)时,才处理最终结果。
而请求的提交,则是通过Client通用的异步调用框架,实现多节点的异步提交;各节点的响应统一以 CCSActionListener 作为接收者。可以说是比较简洁的了,好像也没有我们前面讨论的那些复杂性。因为:大道至简。
2.2. 异步提交请求实现
具体样例大致如下:因最终的处理器是以 TransportClusterSearchShardsAction 进行处理的,所以直接转到 TransportClusterSearchShardsAction。
// org.elasticsearch.action.admin.cluster.shards.TransportClusterSearchShardsAction
// Transport action that answers a ClusterSearchShardsRequest with the shard groups, nodes and
// per-index alias filters, packaged in a ClusterSearchShardsResponse.
// NOTE(review): this excerpt is abridged; several closing braces and statements are missing.
public class TransportClusterSearchShardsAction extends
TransportMasterNodeReadAction<ClusterSearchShardsRequest, ClusterSearchShardsResponse> { private final IndicesService indicesService; @Inject
public TransportClusterSearchShardsAction(TransportService transportService, ClusterService clusterService,
IndicesService indicesService, ThreadPool threadPool, ActionFilters actionFilters,
IndexNameExpressionResolver indexNameExpressionResolver) {
super(ClusterSearchShardsAction.NAME, transportService, clusterService, threadPool, actionFilters,
ClusterSearchShardsRequest::new, indexNameExpressionResolver, ClusterSearchShardsResponse::new, ThreadPool.Names.SAME);
this.indicesService = indicesService;
} @Override
// Checks for a METADATA_READ block on the concrete indices of the request.
protected ClusterBlockException checkBlock(ClusterSearchShardsRequest request, ClusterState state) {
return state.blocks().indicesBlockedException(ClusterBlockLevel.METADATA_READ,
indexNameExpressionResolver.concreteIndexNames(state, request));
} @Override
// Resolves indices, routing and alias filters, walks the shard iterators and replies to the listener.
protected void masterOperation(final ClusterSearchShardsRequest request, final ClusterState state,
final ActionListener<ClusterSearchShardsResponse> listener) {
ClusterState clusterState = clusterService.state();
String[] concreteIndices = indexNameExpressionResolver.concreteIndexNames(clusterState, request);
Map<String, Set<String>> routingMap = indexNameExpressionResolver.resolveSearchRouting(state, request.routing(), request.indices());
Map<String, AliasFilter> indicesAndFilters = new HashMap<>();
Set<String> indicesAndAliases = indexNameExpressionResolver.resolveExpressions(clusterState, request.indices());
// Build one AliasFilter per concrete index.
for (String index : concreteIndices) {
final AliasFilter aliasFilter = indicesService.buildAliasFilter(clusterState, index, indicesAndAliases);
final String[] aliases = indexNameExpressionResolver.indexAliases(clusterState, index, aliasMetadata -> true, true,
indicesAndFilters.put(index, new AliasFilter(aliasFilter.getQueryBuilder(), aliases));
} Set<String> nodeIds = new HashSet<>();
// NOTE(review): nodeIds is never added to in this excerpt, so the nodes array built below
// would be empty -- presumably the line that records each shard's node id was dropped; confirm.
GroupShardsIterator<ShardIterator> groupShardsIterator = clusterService.operationRouting()
.searchShards(clusterState, concreteIndices, routingMap, request.preference());
ShardRouting shard;
ClusterSearchShardsGroup[] groupResponses = new ClusterSearchShardsGroup[groupShardsIterator.size()];
int currentGroup = 0;
// One ClusterSearchShardsGroup per shard iterator, holding every routing the iterator yields.
for (ShardIterator shardIt : groupShardsIterator) {
ShardId shardId = shardIt.shardId();
ShardRouting[] shardRoutings = new ShardRouting[shardIt.size()];
int currentShard = 0;
while ((shard = shardIt.nextOrNull()) != null) {
shardRoutings[currentShard++] = shard;
groupResponses[currentGroup++] = new ClusterSearchShardsGroup(shardId, shardRoutings);
// Translate the collected node ids into DiscoveryNode objects from the cluster state.
DiscoveryNode[] nodes = new DiscoveryNode[nodeIds.size()];
int currentNode = 0;
for (String nodeId : nodeIds) {
nodes[currentNode++] = clusterState.getNodes().get(nodeId);
listener.onResponse(new ClusterSearchShardsResponse(groupResponses, nodes, indicesAndFilters));
// doExecute is implemented in the parent class
// It reads the current cluster state, sets the parent task on the request when a task is present,
// and starts an AsyncSingleAction, whose doStart either runs the operation locally or forwards
// the request to the master node.
// NOTE(review): this excerpt is abridged; several statements and closing braces are missing.
protected void doExecute(Task task, final Request request, ActionListener<Response> listener) {
ClusterState state = clusterService.state();
logger.trace("starting processing request [{}] with cluster state version [{}]", request, state.version());
if (task != null) {
request.setParentTask(clusterService.localNode().getId(), task.getId());
new AsyncSingleAction(task, request, listener).doStart(state);
} //
AsyncSingleAction(Task task, Request request, ActionListener<Response> listener) {
this.task = task;
this.request = request;
this.listener = listener;
this.startTime = threadPool.relativeTimeInMillis();
} protected void doStart(ClusterState clusterState) {
try {
final DiscoveryNodes nodes = clusterState.nodes();
// Local path: this node is the elected master, or localExecute(request) allows local execution.
if (nodes.isLocalNodeElectedMaster() || localExecute(request)) {
// check for block, if blocked, retry, else, execute locally
final ClusterBlockException blockException = checkBlock(request, clusterState);
if (blockException != null) {
if (!blockException.retryable()) {
} else {
logger.debug("can't execute due to a cluster block, retrying", blockException);
// Retry handling
retry(clusterState, blockException, newState -> {
try {
ClusterBlockException newException = checkBlock(request, newState);
return (newException == null || !newException.retryable());
} catch (Exception e) {
// accept state as block will be rechecked by doStart() and listener.onFailure() then called
logger.trace("exception occurred during cluster block checking, accepting state", e);
return true;
} else {
ActionListener<Response> delegate = ActionListener.delegateResponse(listener, (delegatedListener, t) -> {
if (t instanceof FailedToCommitClusterStateException || t instanceof NotMasterException) {
logger.debug(() -> new ParameterizedMessage("master could not publish cluster state or " +
"stepped down before publishing action [{}], scheduling a retry", actionName), t);
retryOnMasterChange(clusterState, t);
} else {
// Executed on the local node: simply hand the work to an executor thread asynchronously
.execute(ActionRunnable.wrap(delegate, l -> masterOperation(task, request, clusterState, l)));
} else {
if (nodes.getMasterNode() == null) {
logger.debug("no known master node, scheduling a retry");
retryOnMasterChange(clusterState, null);
} else {
DiscoveryNode masterNode = nodes.getMasterNode();
final String actionName = getMasterActionName(masterNode);
// Forward to the master node (netty is the transport); the current listener is called back on completion
transportService.sendRequest(masterNode, actionName, request,
new ActionListenerResponseHandler<Response>(listener, responseReader) {
public void handleException(final TransportException exp) {
Throwable cause = exp.unwrapCause();
if (cause instanceof ConnectTransportException ||
(exp instanceof RemoteTransportException && cause instanceof NodeClosedException)) {
// we want to retry here a bit to see if a new master is elected
logger.debug("connection exception while trying to forward request with action name [{}] to " +
"master node [{}], scheduling a retry. Error: [{}]",
actionName, nodes.getMasterNode(), exp.getDetailedMessage());
retryOnMasterChange(clusterState, cause);
} else {
} catch (Exception e) {
// org.elasticsearch.transport.TransportService#sendRequest
// Node-based overload: looks up a connection for the node and delegates to the connection-based
// sendRequest overload.
// NOTE(review): the body of the NodeNotConnectedException handler is elided in this excerpt.
public final <T extends TransportResponse> void sendRequest(final DiscoveryNode node, final String action,
final TransportRequest request,
final TransportRequestOptions options,
TransportResponseHandler<T> handler) {
final Transport.Connection connection;
try {
// Assuming the target is not the local node, obtain a connection (channel) to the remote node
connection = getConnection(node);
} catch (final NodeNotConnectedException ex) {
// the caller might not handle this so we invoke the handler
sendRequest(connection, action, request, options, handler);
// org.elasticsearch.transport.TransportService#getConnection
// Returns localNodeConnection when the node is the local node, otherwise asks the
// connectionManager for a connection to it.
* Returns either a real transport connection or a local node connection if we are using the local node optimization.
* @throws NodeNotConnectedException if the given node is not connected
public Transport.Connection getConnection(DiscoveryNode node) {
if (isLocalNode(node)) {
return localNodeConnection;
} else {
return connectionManager.getConnection(node);
} // org.elasticsearch.transport.TransportService#sendRequest
// Connection-based overload: when the request has a parent task, the handler is wrapped in a
// delegate and the unwrapped connection is registered with the task manager; the request is then
// handed to asyncSender. A non-TransportException failure is wrapped in a TransportException.
// NOTE(review): the delegate method bodies and the final handler notification are elided here.
* Sends a request on the specified connection. If there is a failure sending the request, the specified handler is invoked.
* @param connection the connection to send the request on
* @param action the name of the action
* @param request the request
* @param options the options for this request
* @param handler the response handler
* @param <T> the type of the transport response
public final <T extends TransportResponse> void sendRequest(final Transport.Connection connection, final String action,
final TransportRequest request,
final TransportRequestOptions options,
final TransportResponseHandler<T> handler) {
try {
final TransportResponseHandler<T> delegate;
if (request.getParentTask().isSet()) {
// If the connection is a proxy connection, then we will create a cancellable proxy task on the proxy node and an actual
// child task on the target node of the remote cluster.
// ----> a parent task on the local cluster
// |
// ----> a proxy task on the proxy node on the remote cluster
// |
// ----> an actual child task on the target node on the remote cluster
// To cancel the child task on the remote cluster, we must send a cancel request to the proxy node instead of the target
// node as the parent task of the child task is the proxy task not the parent task on the local cluster. Hence, here we
// unwrap the connection and keep track of the connection to the proxy node instead of the proxy connection.
final Transport.Connection unwrappedConn = unwrapConnection(connection);
final Releasable unregisterChildNode = taskManager.registerChildConnection(request.getParentTask().getId(), unwrappedConn);
delegate = new TransportResponseHandler<T>() {
public void handleResponse(T response) {
} @Override
public void handleException(TransportException exp) {
} @Override
public String executor() {
return handler.executor();
} @Override
public T read(StreamInput in) throws IOException {
} @Override
public String toString() {
return getClass().getName() + "/[" + action + "]:" + handler.toString();
} else {
// No parent task: the caller's handler is used unchanged.
delegate = handler;
asyncSender.sendRequest(connection, action, request, options, delegate);
} catch (final Exception ex) {
// the caller might not handle this so we invoke the handler
final TransportException te;
if (ex instanceof TransportException) {
te = (TransportException) ex;
} else {
te = new TransportException("failure to send", ex);
} // org.elasticsearch.transport.TransportService#sendRequestInternal
// Registers a context-restoring response handler under a new request id, optionally creates a
// timeout handler, and sends the request on the connection. If sending throws, the registered
// handler is removed again and, when it was still present, notified on an executor thread.
// NOTE(review): this excerpt is abridged; several statements and closing braces are missing.
private <T extends TransportResponse> void sendRequestInternal(final Transport.Connection connection, final String action,
final TransportRequest request,
final TransportRequestOptions options,
TransportResponseHandler<T> handler) {
if (connection == null) {
throw new IllegalStateException("can't send request to a null connection");
DiscoveryNode node = connection.getNode(); Supplier<ThreadContext.StoredContext> storedContextSupplier = threadPool.getThreadContext().newRestorableContext(true);
ContextRestoreResponseHandler<T> responseHandler = new ContextRestoreResponseHandler<>(storedContextSupplier, handler);
// TODO we can probably fold this entire request ID dance into connection.sendReqeust but it will be a bigger refactoring
final long requestId = responseHandlers.add(new Transport.ResponseContext<>(responseHandler, connection, action));
final TimeoutHandler timeoutHandler;
// A timeout handler exists only when the request options carry a timeout.
if (options.timeout() != null) {
timeoutHandler = new TimeoutHandler(requestId, connection.getNode(), action);
} else {
timeoutHandler = null;
try {
if (lifecycle.stoppedOrClosed()) {
* If we are not started the exception handling will remove the request holder again and calls the handler to notify the
* caller. It will only notify if toStop hasn't done the work yet.
throw new NodeClosedException(localNode);
if (timeoutHandler != null) {
assert options.timeout() != null;
connection.sendRequest(requestId, action, request, options); // local node optimization happens upstream
} catch (final Exception e) {
// usually happen either because we failed to connect to the node
// or because we failed serializing the message
final Transport.ResponseContext<? extends TransportResponse> contextToNotify = responseHandlers.remove(requestId);
// If holderToNotify == null then handler has already been taken care of.
if (contextToNotify != null) {
if (timeoutHandler != null) {
// callback that an exception happened, but on a different thread since we don't
// want handlers to worry about stack overflows. In the special case of running into a closing node we run on the current
// thread on a best effort basis though.
final SendRequestTransportException sendRequestException = new SendRequestTransportException(node, action, e);
final String executor = lifecycle.stoppedOrClosed() ? ThreadPool.Names.SAME : ThreadPool.Names.GENERIC;
threadPool.executor(executor).execute(new AbstractRunnable() {
public void onRejection(Exception e) {
// if we get rejected during node shutdown we don't wanna bubble it up
() -> new ParameterizedMessage(
"failed to notify response handler on rejection, action: {}",
public void onFailure(Exception e) {
() -> new ParameterizedMessage(
"failed to notify response handler on exception, action: {}",
protected void doRun() throws Exception {
} else {
logger.debug("Exception while sending request, handler likely already notified due to timeout", e);
// org.elasticsearch.transport.RemoteConnectionManager.ProxyConnection#sendRequest
// Sends the request over the wrapped connection, using the proxy action name and a request
// wrapped for targetNode.
public void sendRequest(long requestId, String action, TransportRequest request, TransportRequestOptions options)
throws IOException, TransportException {
connection.sendRequest(requestId, TransportActionProxy.getProxyAction(action),
TransportActionProxy.wrapRequest(targetNode, request), options);
// org.elasticsearch.transport.TcpTransport.NodeChannels#sendRequest
// Rejects the call when the connection is closing; otherwise picks a channel for the request
// type and hands the request to the outbound handler.
public void sendRequest(long requestId, String action, TransportRequest request, TransportRequestOptions options)
throws IOException, TransportException {
if (isClosing.get()) {
throw new NodeNotConnectedException(node, "connection already closed");
TcpChannel channel = channel(options.type());
outboundHandler.sendRequest(node, channel, requestId, action, request, options, getVersion(), compress, false);
// org.elasticsearch.transport.OutboundHandler#sendRequest
// Builds an OutboundMessage.Request at the lower of this handler's version and the channel
// version, and sends it with a listener that reports onRequestSent to messageListener.
* Sends the request to the given channel. This method should be used to send {@link TransportRequest}
* objects back to the caller.
void sendRequest(final DiscoveryNode node, final TcpChannel channel, final long requestId, final String action,
final TransportRequest request, final TransportRequestOptions options, final Version channelVersion,
final boolean compressRequest, final boolean isHandshake) throws IOException, TransportException {
Version version = Version.min(this.version, channelVersion);
OutboundMessage.Request message = new OutboundMessage.Request(threadPool.getThreadContext(), features, request, version, action,
requestId, isHandshake, compressRequest);
ActionListener<Void> listener = ActionListener.wrap(() ->
messageListener.onRequestSent(node, requestId, action, request, options));
sendMessage(channel, message, listener);
// org.elasticsearch.transport.OutboundHandler#sendMessage
// Wraps the message in a MessageSerializer and a SendContext, then passes both to internalSend.
private void sendMessage(TcpChannel channel, OutboundMessage networkMessage, ActionListener<Void> listener) throws IOException {
MessageSerializer serializer = new MessageSerializer(networkMessage, bigArrays);
// The serializer is passed twice; NOTE(review): the role of the fourth argument is not visible here.
SendContext sendContext = new SendContext(channel, serializer, listener, serializer);
internalSend(channel, sendContext);
// Obtains the bytes to send from the SendContext and writes them to the channel, with the
// SendContext itself passed as the listener argument; a RuntimeException is rethrown.
// NOTE(review): the catch body looks abridged in this excerpt.
private void internalSend(TcpChannel channel, SendContext sendContext) throws IOException {
BytesReference reference = sendContext.get();
// stash thread context so that channel event loop is not polluted by thread context
try (ThreadContext.StoredContext existing = threadPool.getThreadContext().stashContext()) {
channel.sendMessage(reference, sendContext);
} catch (RuntimeException ex) {
throw ex;
// org.elasticsearch.transport.netty4.Netty4TcpChannel#sendMessage
// Writes and flushes the bytes on the netty channel, then fails the listener when the channel's
// event loop is shutting down.
public void sendMessage(BytesReference reference, ActionListener<Void> listener) {
// Netty sends the data; the listener is called back asynchronously, which completes the async request
channel.writeAndFlush(Netty4Utils.toByteBuf(reference), addPromise(listener, channel)); if (channel.eventLoop().isShutdown()) {
listener.onFailure(new TransportException("Cannot send message, event loop is shutting down."));
