WebRuntimeMonitor

 

.GET("/jobs/:jobid/vertices/:vertexid/metrics", handler(new JobVertexMetricsHandler(metricFetcher)))
.GET("/jobs/:jobid/metrics", handler(new JobMetricsHandler(metricFetcher)))
.GET("/taskmanagers/:" + TaskManagersHandler.TASK_MANAGER_ID_KEY + "/metrics", handler(new TaskManagerMetricsHandler(metricFetcher)))
.GET("/jobmanager/metrics", handler(new JobManagerMetricsHandler(metricFetcher)))
 

JobVertexMetricsHandler

 

AbstractMetricsHandler

 

MetricFetcher

核心就是fetchMetrics函数,会从JobManager获取数据,

private void fetchMetrics() {
try {
Option<scala.Tuple2<ActorGateway, Integer>> jobManagerGatewayAndWebPort = retriever.getJobManagerGatewayAndWebPort();
if (jobManagerGatewayAndWebPort.isDefined()) {
ActorGateway jobManager = jobManagerGatewayAndWebPort.get()._1(); //得到JobManager的ActorGateway /**
* Remove all metrics that belong to a job that is not running and no longer archived.
*/
Future<Object> jobDetailsFuture = jobManager.ask(new RequestJobDetails(true, true), timeout); //生成request获取job状态
jobDetailsFuture
.onSuccess(new OnSuccess<Object>() {
@Override
public void onSuccess(Object result) throws Throwable {
MultipleJobsDetails details = (MultipleJobsDetails) result;
ArrayList<String> toRetain = new ArrayList<>();
for (JobDetails job : details.getRunningJobs()) {
toRetain.add(job.getJobId().toString());
}
for (JobDetails job : details.getFinishedJobs()) {
toRetain.add(job.getJobId().toString());
}
synchronized (metrics) {
metrics.jobs.keySet().retainAll(toRetain); //只保留Runing和Finished的job,即不正常的都删掉
}
}
}, ctx);
logErrorOnFailure(jobDetailsFuture, "Fetching of JobDetails failed."); String jobManagerPath = jobManager.path();
String queryServicePath = jobManagerPath.substring(0, jobManagerPath.lastIndexOf('/') + 1) + MetricQueryService.METRIC_QUERY_SERVICE_NAME;
ActorRef jobManagerQueryService = actorSystem.actorFor(queryServicePath); queryMetrics(jobManagerQueryService); //查询jobManager的Metrics /**
* We first request the list of all registered task managers from the job manager, and then
* request the respective metric dump from each task manager.
*
* All stored metrics that do not belong to a registered task manager will be removed.
*/
Future<Object> registeredTaskManagersFuture = jobManager.ask(JobManagerMessages.getRequestRegisteredTaskManagers(), timeout); //查询所有taskManager
registeredTaskManagersFuture
.onSuccess(new OnSuccess<Object>() {
@Override
public void onSuccess(Object result) throws Throwable {
Iterable<Instance> taskManagers = ((JobManagerMessages.RegisteredTaskManagers) result).asJavaIterable();
List<String> activeTaskManagers = new ArrayList<>();
for (Instance taskManager : taskManagers) { //遍历taskManager
activeTaskManagers.add(taskManager.getId().toString()); String taskManagerPath = taskManager.getTaskManagerGateway().getAddress();
String queryServicePath = taskManagerPath.substring(0, taskManagerPath.lastIndexOf('/') + 1) + MetricQueryService.METRIC_QUERY_SERVICE_NAME + "_" + taskManager.getTaskManagerID().getResourceIdString();
ActorRef taskManagerQueryService = actorSystem.actorFor(queryServicePath); queryMetrics(taskManagerQueryService); //查询每个taskMananger的metrics
}
synchronized (metrics) { // remove all metrics belonging to unregistered task managers
metrics.taskManagers.keySet().retainAll(activeTaskManagers); //删除所有的未注册的TaskManager
}
}
}, ctx);
logErrorOnFailure(registeredTaskManagersFuture, "Fetchin list of registered TaskManagers failed.");
}
} catch (Exception e) {
LOG.warn("Exception while fetching metrics.", e);
}
}

 

queryMetrics

/**
 * Asks the given actor for a dump of its metrics and, once the reply arrives, adds the dumped
 * metrics to the metric store. A failed request is reported through {@code logErrorOnFailure}.
 *
 * @param actor ActorRef to request the dump from
 */
private void queryMetrics(ActorRef actor) {
    final BasicGateway gateway = new BasicGateway(actor);
    final Future<Object> dumpFuture = gateway.ask(MetricQueryService.getCreateDump(), timeout);
    dumpFuture.onSuccess(new OnSuccess<Object>() {
        @Override
        public void onSuccess(Object result) throws Throwable {
            addMetrics(result);
        }
    }, ctx);
    logErrorOnFailure(dumpFuture, "Fetching metrics failed.");
}

/**
 * Deserializes a metric dump and adds every contained metric to the metric store.
 *
 * @param result reply of the query service; cast to {@code byte[]} before deserialization
 * @throws IOException declared for failures while deserializing the dump
 */
private void addMetrics(Object result) throws IOException {
    final List<MetricDump> dumps = deserializer.deserialize((byte[]) result);
    for (MetricDump dump : dumps) {
        metrics.add(dump);
    }
}

 

MetricStore

用嵌套的hashmap来存储metrics,瞬时值

// Metric store of the JobManager.
final JobManagerMetricStore jobManager = new JobManagerMetricStore();
// One metric store per TaskManager; when pruning, keys are matched against TaskManager instance id strings.
final Map<String, TaskManagerMetricStore> taskManagers = new HashMap<>();
// One metric store per job; when pruning, keys are matched against job id strings.
final Map<String, JobMetricStore> jobs = new HashMap<>();

 

public static class JobManagerMetricStore extends ComponentMetricStore {
} private static abstract class ComponentMetricStore {
public final Map<String, String> metrics = new HashMap<>(); //store就是一个map public String getMetric(String name, String defaultValue) {
String value = this.metrics.get(name);
return value != null
? value
: defaultValue;
}
}

 

MetricQueryService

public class MetricQueryService extends UntypedActor {
private static final Logger LOG = LoggerFactory.getLogger(MetricQueryService.class); public static final String METRIC_QUERY_SERVICE_NAME = "MetricQueryService"; private static final CharacterFilter FILTER = new CharacterFilter() {
@Override
public String filterCharacters(String input) {
return replaceInvalidChars(input);
}
}; private final MetricDumpSerializer serializer = new MetricDumpSerializer(); private final Map<Gauge<?>, Tuple2<QueryScopeInfo, String>> gauges = new HashMap<>();
private final Map<Counter, Tuple2<QueryScopeInfo, String>> counters = new HashMap<>();
private final Map<Histogram, Tuple2<QueryScopeInfo, String>> histograms = new HashMap<>();
private final Map<Meter, Tuple2<QueryScopeInfo, String>> meters = new HashMap<>();

 

收到CreateDump请求,

} else if (message instanceof CreateDump) {
byte[] dump = serializer.serialize(counters, gauges, histograms, meters);
getSender().tell(dump, getSelf());

 

Start

   /**
    * Creates the MetricQueryService actor inside the given actor system.
    *
    * <p>Without a resource ID the actor is named {@code METRIC_QUERY_SERVICE_NAME}; otherwise the
    * resource ID string is appended after an underscore.
    *
    * @param actorSystem The actor system running the MetricQueryService
    * @param resourceID resource ID to disambiguate the actor name, may be {@code null}
    * @return actor reference to the MetricQueryService
    */
   public static ActorRef startMetricQueryService(ActorSystem actorSystem, ResourceID resourceID) {
       final String actorName;
       if (resourceID != null) {
           actorName = METRIC_QUERY_SERVICE_NAME + "_" + resourceID.getResourceIdString();
       } else {
           actorName = METRIC_QUERY_SERVICE_NAME;
       }
       return actorSystem.actorOf(Props.create(MetricQueryService.class), actorName);
   }

 

在MetricRegistry中把metrics注册到QueryService中,

if (queryService != null) {
MetricQueryService.notifyOfAddedMetric(queryService, metric, metricName, group);
}

 

采集点

numRecordsIn

StreamInputProcessor –> processInput

    @SuppressWarnings("SynchronizationOnLocalVariableOrMethodParameter")
public boolean processInput(OneInputStreamOperator<IN, ?> streamOperator, final Object lock) throws Exception {
if (numRecordsIn == null) {
numRecordsIn = ((OperatorMetricGroup) streamOperator.getMetricGroup()).getIOMetricGroup().getNumRecordsInCounter();
}
//...... // now we can do the actual processing
StreamRecord<IN> record = recordOrMark.asRecord();
synchronized (lock) {
numRecordsIn.inc(); //执行processElement前加一
streamOperator.setKeyContextElement1(record);
streamOperator.processElement(record);
}
return true;

如果是chaining,

ChainingOutput

private static class ChainingOutput<T> implements Output<StreamRecord<T>> {

    protected final OneInputStreamOperator<T, ?> operator;
protected final Counter numRecordsIn; public ChainingOutput(OneInputStreamOperator<T, ?> operator) {
this.operator = operator;
this.numRecordsIn = ((OperatorMetricGroup) operator.getMetricGroup()).getIOMetricGroup().getNumRecordsInCounter(); //初始化
} @Override
public void collect(StreamRecord<T> record) {
try {
numRecordsIn.inc(); //对于chain,在output时调用processElement
operator.setKeyContextElement1(record);
operator.processElement(record);
}
catch (Exception e) {
throw new ExceptionInChainedOperatorException(e);
}
}

 

numRecordsOut

在AbstractStreamOperator初始化时,

生成CountingOutput

    @Override
public void setup(StreamTask<?, ?> containingTask, StreamConfig config, Output<StreamRecord<OUT>> output) {
this.container = containingTask;
this.config = config; this.metrics = container.getEnvironment().getMetricGroup().addOperator(config.getOperatorName());
this.output = new CountingOutput(output, ((OperatorMetricGroup) this.metrics).getIOMetricGroup().getNumRecordsOutCounter()); //生成CountingOutput

这个output,

在processWatermark,processElement中会用于emit数据

output.emitWatermark(mark);

 

    /**
     * Output wrapper that increments a counter for every collected record before handing the
     * record to the wrapped output. Watermarks, latency markers and {@code close()} are passed
     * through without touching the counter.
     */
    public class CountingOutput implements Output<StreamRecord<OUT>> {

        /** The wrapped output that receives everything. */
        private final Output<StreamRecord<OUT>> output;

        /** Counter incremented once per collected record. */
        private final Counter numRecordsOut;

        public CountingOutput(Output<StreamRecord<OUT>> output, Counter counter) {
            this.output = output;
            this.numRecordsOut = counter;
        }

        @Override
        public void emitWatermark(Watermark mark) {
            this.output.emitWatermark(mark);
        }

        @Override
        public void emitLatencyMarker(LatencyMarker latencyMarker) {
            this.output.emitLatencyMarker(latencyMarker);
        }

        @Override
        public void collect(StreamRecord<OUT> record) {
            // count first, then emit
            this.numRecordsOut.inc();
            this.output.collect(record);
        }

        @Override
        public void close() {
            this.output.close();
        }
    }

 

注意numRecordsOut和numRecordsIn,除了会统计operator级别的,还会统计task级别的,逻辑在

AbstractStreamOperator
    public void setup(StreamTask<?, ?> containingTask, StreamConfig config, Output<StreamRecord<OUT>> output) {
this.container = containingTask;
this.config = config; this.metrics = container.getEnvironment().getMetricGroup().addOperator(config.getOperatorName());
this.output = new CountingOutput(output, ((OperatorMetricGroup) this.metrics).getIOMetricGroup().getNumRecordsOutCounter());
if (config.isChainStart()) {
((OperatorMetricGroup) this.metrics).getIOMetricGroup().reuseInputMetricsForTask();
}
if (config.isChainEnd()) {
((OperatorMetricGroup) this.metrics).getIOMetricGroup().reuseOutputMetricsForTask();
}
OperatorIOMetricGroup
    /**
     * Hands this operator's {@code numRecordsIn} counter to the IO metric group of the parent
     * task via {@code reuseRecordsInputCounter}.
     */
    public void reuseInputMetricsForTask() {
        parentMetricGroup.parent().getIOMetricGroup().reuseRecordsInputCounter(this.numRecordsIn);
    }

    /**
     * Hands this operator's {@code numRecordsOut} counter to the IO metric group of the parent
     * task via {@code reuseRecordsOutputCounter}.
     */
    public void reuseOutputMetricsForTask() {
        parentMetricGroup.parent().getIOMetricGroup().reuseRecordsOutputCounter(this.numRecordsOut);
    }

可以看到,会将ChainHead的numRecordsIn,set到task的TaskIOMetricGroup

而将ChainEnd的numRecordsOut,set到task的TaskIOMetricGroup

看起来很合理

 

numRecordsInPerSecond,numRecordsOutPerSecond

在OperatorIOMetricGroup
public OperatorIOMetricGroup(OperatorMetricGroup parentMetricGroup) {
super(parentMetricGroup);
numRecordsIn = parentMetricGroup.counter(MetricNames.IO_NUM_RECORDS_IN);
numRecordsOut = parentMetricGroup.counter(MetricNames.IO_NUM_RECORDS_OUT);
numRecordsInRate = parentMetricGroup.meter(MetricNames.IO_NUM_RECORDS_IN_RATE, new MeterView(numRecordsIn, 60));
numRecordsOutRate = parentMetricGroup.meter(MetricNames.IO_NUM_RECORDS_OUT_RATE, new MeterView(numRecordsOut, 60));
}

可以看到numRecordsInRate和numRecordsOutRate,只是numRecordsIn和numRecordsOut的MeterView

/**
 * A {@link Meter} that derives its rate from an existing {@link Counter}.
 *
 * <p>{@link #update()} is expected to be called once every {@code UPDATE_INTERVAL_SECONDS}. Each
 * call stores the current count in a circular array; the rate is the difference between the
 * newest and the oldest stored count divided by the time span.
 */
public class MeterView implements Meter, View {
    /** The underlying counter maintaining the count */
    private final Counter counter;
    /** The time-span over which the average is calculated */
    private final int timeSpanInSeconds;
    /** Circular array containing the history of values */
    private final long[] values;
    /** The index in the array for the current time */
    private int time = 0;
    /** The last rate we computed */
    private double currentRate = 0;

    /**
     * @param counter counter whose count is sampled
     * @param timeSpanInSeconds requested averaging window; rounded down to a multiple of
     *        {@code UPDATE_INTERVAL_SECONDS} and never smaller than one interval
     */
    public MeterView(Counter counter, int timeSpanInSeconds) {
        this.counter = counter;
        // The span must be a multiple of the update interval. Clamp to one interval so that a
        // span below the interval cannot round down to 0 (division by zero -> NaN rate) and a
        // negative span cannot produce an empty history array.
        this.timeSpanInSeconds = Math.max(
            timeSpanInSeconds - (timeSpanInSeconds % UPDATE_INTERVAL_SECONDS),
            UPDATE_INTERVAL_SECONDS);
        // One slot per interval plus one, so the oldest slot is exactly one time span old;
        // e.g. a 60 s span with a 5 s interval needs 60 / 5 + 1 = 13 slots.
        this.values = new long[this.timeSpanInSeconds / UPDATE_INTERVAL_SECONDS + 1];
    }

    @Override
    public void markEvent() {
        this.counter.inc();
    }

    @Override
    public void markEvent(long n) {
        this.counter.inc(n);
    }

    @Override
    public long getCount() {
        return counter.getCount();
    }

    /** Returns the rate computed by the most recent {@link #update()} call. */
    @Override
    public double getRate() {
        return currentRate;
    }

    /** Samples the counter and recomputes the rate. */
    @Override
    public void update() {
        time = (time + 1) % values.length;
        values[time] = counter.getCount();
        // The slot after the newest one is the oldest sample in the circular array, so
        // newest - oldest is the count change over the whole time span.
        currentRate = ((double) (values[time] - values[(time + 1) % values.length]) / timeSpanInSeconds);
    }
}

这个实现真是tricky,不好的设计

在MetricRegistry中,会创建

ViewUpdater
    /**
     * Announces a newly added metric to every configured reporter, to the query service (if one
     * is set) and, for metrics implementing {@code View}, to the view updater, which is created
     * on first use. Any exception raised along the way is logged and not rethrown.
     *
     * @param metric the metric that was added
     * @param metricName name of the metric
     * @param group group the metric belongs to
     */
    public void register(Metric metric, String metricName, AbstractMetricGroup group) {
        try {
            if (reporters != null) {
                for (int index = 0; index < reporters.size(); index++) {
                    final MetricReporter current = reporters.get(index);
                    if (current == null) {
                        continue;
                    }
                    // each reporter gets a front group built from its own index
                    FrontMetricGroup front = new FrontMetricGroup<AbstractMetricGroup<?>>(index, group);
                    current.notifyOfAddedMetric(metric, metricName, front);
                }
            }

            if (queryService != null) {
                MetricQueryService.notifyOfAddedMetric(queryService, metric, metricName, group);
            }

            if (metric instanceof View) {
                if (viewUpdater == null) {
                    viewUpdater = new ViewUpdater(executor);
                }
                viewUpdater.notifyOfAddedView((View) metric);
            }
        } catch (Exception e) {
            LOG.error("Error while registering metric.", e);
        }
    }

并且在register metrics的时候,除了注册到reporter,MetricQueryService

如果是view的子类还要,注册到ViewUpdater

    public ViewUpdater(ScheduledExecutorService executor) {
executor.scheduleWithFixedDelay(new ViewUpdaterTask(lock, toAdd, toRemove), 5, UPDATE_INTERVAL_SECONDS, TimeUnit.SECONDS);
}

ViewUpdater会定期执行ViewUpdaterTask,task中就会调用view的update

 

numBytesInLocal, numBytesInRemote

在RemoteInputChannel和LocalInputChannel中,

    public LocalInputChannel(
SingleInputGate inputGate,
int channelIndex,
ResultPartitionID partitionId,
ResultPartitionManager partitionManager,
TaskEventDispatcher taskEventDispatcher,
int initialBackoff,
int maxBackoff,
TaskIOMetricGroup metrics) { super(inputGate, channelIndex, partitionId, initialBackoff, maxBackoff, metrics.getNumBytesInLocalCounter()); //metrics.getNumBytesInLocalCounter() public RemoteInputChannel(
SingleInputGate inputGate,
int channelIndex,
ResultPartitionID partitionId,
ConnectionID connectionId,
ConnectionManager connectionManager,
int initialBackOff,
int maxBackoff,
TaskIOMetricGroup metrics) { super(inputGate, channelIndex, partitionId, initialBackOff, maxBackoff, metrics.getNumBytesInRemoteCounter()); // metrics.getNumBytesInRemoteCounter()

并且都会在

BufferAndAvailability getNextBuffer()

会调用,

numBytesIn.inc(next.getSize());

 

numBytesOut

RecordWriter
public class RecordWriter<T extends IOReadableWritable> {
private Counter numBytesOut = new SimpleCounter(); public void emit(T record) throws IOException, InterruptedException {
for (int targetChannel : channelSelector.selectChannels(record, numChannels)) {
sendToTarget(record, targetChannel);
}
} private void sendToTarget(T record, int targetChannel) throws IOException, InterruptedException {
RecordSerializer<T> serializer = serializers[targetChannel]; synchronized (serializer) {
SerializationResult result = serializer.addRecord(record); while (result.isFullBuffer()) {
Buffer buffer = serializer.getCurrentBuffer(); if (buffer != null) {
numBytesOut.inc(buffer.getSize()); //计数numBytesOut
writeAndClearBuffer(buffer, targetChannel, serializer); // If this was a full record, we are done. Not breaking
// out of the loop at this point will lead to another
// buffer request before breaking out (that would not be
// a problem per se, but it can lead to stalls in the
// pipeline).
if (result.isFullRecord()) {
break;
}
} else {
buffer = targetPartition.getBufferProvider().requestBufferBlocking();
result = serializer.setNextBuffer(buffer);
}
}
}
}
RecordWriterOutput.collect –> StreamRecordWriter.emit –> RecordWriter.emit
 

inputQueueLength, outputQueueLength, inPoolUsage, outPoolUsage

TaskIOMetricGroup
   /**
    * Registers the buffer gauges of a task in a sub-group named "buffers": queue lengths and
    * buffer pool usage, each for the input and the output side.
    *
    * @param task task handed to each of the gauges
    */
   public void initializeBufferMetrics(Task task) {
       final MetricGroup bufferGroup = addGroup("buffers");

       // queue lengths
       bufferGroup.gauge("inputQueueLength", new InputBuffersGauge(task));
       bufferGroup.gauge("outputQueueLength", new OutputBuffersGauge(task));

       // pool usage
       bufferGroup.gauge("inPoolUsage", new InputBufferPoolUsageGauge(task));
       bufferGroup.gauge("outPoolUsage", new OutputBufferPoolUsageGauge(task));
   }
 

inputQueueLength

for (SingleInputGate inputGate : task.getAllInputGates()) {
totalBuffers += inputGate.getNumberOfQueuedBuffers();
}
inputGate.getNumberOfQueuedBuffers
for (InputChannel channel : inputChannels.values()) {
if (channel instanceof RemoteInputChannel) { // 只统计RemoteInputChannel
totalBuffers += ((RemoteInputChannel) channel).getNumberOfQueuedBuffers();
}
}
getNumberOfQueuedBuffers
/**
 * The received buffers. Received buffers are enqueued by the network I/O thread and the queue
 * is consumed by the receiving task thread.
 */
private final Queue<Buffer> receivedBuffers = new ArrayDeque<>();

/**
 * Returns the number of buffers currently held in {@code receivedBuffers}; the size is read
 * while holding the queue's monitor.
 */
public int getNumberOfQueuedBuffers() {
    final int queued;
    synchronized (receivedBuffers) {
        queued = receivedBuffers.size();
    }
    return queued;
}
 
outputQueueLength
for (ResultPartition producedPartition : task.getProducedPartitions()) {
totalBuffers += producedPartition.getNumberOfQueuedBuffers();
}
ResultPartition getNumberOfQueuedBuffers
for (ResultSubpartition subpartition : subpartitions) {
totalBuffers += subpartition.getNumberOfQueuedBuffers();
}

SpillableSubpartition getNumberOfQueuedBuffers

class SpillableSubpartition extends ResultSubpartition {
/** Buffers are kept in this queue as long as we weren't ask to release any. */
private final ArrayDeque<Buffer> buffers = new ArrayDeque<>(); @Override
public int getNumberOfQueuedBuffers() {
return buffers.size();
}

inputQueueLength, outputQueueLength

指标的含义是,inputchannel和resultpartition持有的buffer个数;这些buffer被读完后会release,所以链路通畅的话,length应该会很小

 

inPoolUsage
int usedBuffers = 0;
int bufferPoolSize = 0; for (SingleInputGate inputGate : task.getAllInputGates()) {
usedBuffers += inputGate.getBufferPool().bestEffortGetNumOfUsedBuffers();
bufferPoolSize += inputGate.getBufferPool().getNumBuffers();
} if (bufferPoolSize != 0) {
return ((float) usedBuffers) / bufferPoolSize;
} else {
return 0.0f;
}

bestEffortGetNumOfUsedBuffers()

/**
 * Returns the number of requested memory segments minus the number of currently available
 * ones, floored at zero.
 */
@Override
public int bestEffortGetNumOfUsedBuffers() {
    final int inUse = numberOfRequestedMemorySegments - availableMemorySegments.size();
    return inUse > 0 ? inUse : 0;
}

numberOfRequestedMemorySegments,从bufferpool申请多少

availableMemorySegments,可用的

所以相减就是使用多少

 
outPoolUsage
int usedBuffers = 0;
int bufferPoolSize = 0; for (ResultPartition resultPartition : task.getProducedPartitions()) {
usedBuffers += resultPartition.getBufferPool().bestEffortGetNumOfUsedBuffers();
bufferPoolSize += resultPartition.getBufferPool().getNumBuffers();
} if (bufferPoolSize != 0) {
return ((float) usedBuffers) / bufferPoolSize;
} else {
return 0.0f;
}
和inPoolUsage类似,也是看bufferPool的情况
所以inPoolUsage,outPoolUsage表示的是inputgate和resultpartition中bufferpool的使用情况
这个bufferpool是inputgate初始化的时候,注册到NetworkEnvironment创建的,
// Setup the buffer pool for each buffer reader
final SingleInputGate[] inputGates = task.getAllInputGates(); for (SingleInputGate gate : inputGates) {
BufferPool bufferPool = null; try {
bufferPool = networkBufferPool.createBufferPool(gate.getNumberOfInputChannels(), false);
gate.setBufferPool(bufferPool);
}

可以看到默认大小是,inputchannels的size

如果pool用完了,那么inputGate和ResultPartition就无法继续读取新的数据

 

latency

在AbstractStreamOperator中,

setup,

protected LatencyGauge latencyGauge;
latencyGauge = this.metrics.gauge("latency", new LatencyGauge(historySize));
 
注意,这里metrics是OperatorMetricGroup
this.metrics = container.getEnvironment().getMetricGroup().addOperator(config.getOperatorName());

TaskMetricGroup

    /**
     * Returns the operator metric group registered under the given name, registering a freshly
     * created one if the name is not taken yet.
     *
     * <p>The new group is always constructed up front; if the name turns out to be taken, the
     * earlier group is put back into the map and returned instead.
     *
     * @param name operator name
     * @return the group now registered under {@code name}
     */
    public OperatorMetricGroup addOperator(String name) {
        final OperatorMetricGroup candidate = new OperatorMetricGroup(this.registry, this, name);

        synchronized (this) {
            final OperatorMetricGroup existing = operators.put(name, candidate);
            if (existing != null) {
                // already had an operator group. restore that one.
                operators.put(name, existing);
                return existing;
            }
            // no operator group so far
            return candidate;
        }
    }

 

 

LatencyGauge的定义,

/**
 * Gauge exposing latency statistics per latency source.
 *
 * <p>The gauge uses a HashMap internally to avoid classloading issues when accessing
 * the values using JMX.
 */
protected static class LatencyGauge implements Gauge<Map<String, HashMap<String, Double>>> {

    /**
     * Statistics per source. The key is a LatencySourceDescriptor (built from the marker's
     * vertex id and subtask index), the value is the statistics window for that source.
     */
    private final Map<LatencySourceDescriptor, DescriptiveStatistics> latencyStats = new HashMap<>();

    /** Window size handed to every newly created DescriptiveStatistics. */
    private final int historySize;

    LatencyGauge(int historySize) {
        this.historySize = historySize;
    }

    /**
     * Records the latency carried by a marker: the current wall-clock time minus the time at
     * which the marker was marked.
     *
     * @param marker the received latency marker
     * @param isSink when {@code false}, the descriptor is built with the subtask index ignored
     */
    public void reportLatency(LatencyMarker marker, boolean isSink) {
        final LatencySourceDescriptor key = LatencySourceDescriptor.of(marker, !isSink);
        DescriptiveStatistics window = latencyStats.get(key);
        if (window == null) {
            // first marker from this source: create its statistics window
            window = new DescriptiveStatistics(this.historySize);
            latencyStats.put(key, window);
        }
        final long now = System.currentTimeMillis();
        window.addValue(now - marker.getMarkedTime());
    }

    /**
     * Builds a snapshot mapping each source descriptor's string form to its max, mean, min,
     * p50, p95 and p99 values.
     */
    @Override
    public Map<String, HashMap<String, Double>> getValue() {
        for (;;) {
            try {
                final Map<String, HashMap<String, Double>> snapshot = new HashMap<>();
                for (Map.Entry<LatencySourceDescriptor, DescriptiveStatistics> entry : latencyStats.entrySet()) {
                    final DescriptiveStatistics stats = entry.getValue();
                    final HashMap<String, Double> summary = new HashMap<>(6);
                    summary.put("max", stats.getMax());
                    summary.put("mean", stats.getMean());
                    summary.put("min", stats.getMin());
                    summary.put("p50", stats.getPercentile(50));
                    summary.put("p95", stats.getPercentile(95));
                    summary.put("p99", stats.getPercentile(99));
                    snapshot.put(entry.getKey().toString(), summary);
                }
                return snapshot;
            } catch (ConcurrentModificationException ignore) {
                // Concurrent access onto the "latencyStats" map could cause
                // ConcurrentModificationExceptions. To avoid unnecessary blocking
                // of the reportLatency() method, we retry this operation until
                // it succeeds.
                LOG.debug("Unable to report latency statistics", ignore);
            }
        }
    }
}

    

这个Gauge.getValue返回的是个map,太奇葩

latencyStats里面有多少entry,取决于有多少source,以及每个source有几个并发

因为他要记录,每个source operator的某个subtask,到当前operator的该subtask的延迟

        /**
         * Builds the descriptor for the source of a latency marker.
         *
         * @param marker marker providing the vertex id and the subtask index
         * @param ignoreSubtaskIndex if {@code true}, {@code -1} is used in place of the marker's
         *        subtask index
         * @return descriptor made of the vertex id and the chosen subtask index
         */
        public static LatencySourceDescriptor of(LatencyMarker marker, boolean ignoreSubtaskIndex) {
            return new LatencySourceDescriptor(
                marker.getVertexID(),
                ignoreSubtaskIndex ? -1 : marker.getSubtaskIndex());
        }

LatencySourceDescriptor构造函数,由vertexid,和subtaskIndex组成

如果忽略subtaskindex,置为-1

 

 

流程

StreamSource

定义LatencyMarksEmitter

private static class LatencyMarksEmitter<OUT> {
private final ScheduledFuture<?> latencyMarkTimer; public LatencyMarksEmitter(
final ProcessingTimeService processingTimeService,
final Output<StreamRecord<OUT>> output,
long latencyTrackingInterval,
final int vertexID,
final int subtaskIndex) { latencyMarkTimer = processingTimeService.scheduleAtFixedRate( //根据processingTime定期发送latencyMarker
new ProcessingTimeCallback() {
@Override
public void onProcessingTime(long timestamp) throws Exception {
try {
// ProcessingTimeService callbacks are executed under the checkpointing lock
output.emitLatencyMarker(new LatencyMarker(timestamp, vertexID, subtaskIndex)); //emitLatencyMarker,以processTime为初始时间
} catch (Throwable t) {
// we catch the Throwables here so that we don't trigger the processing
// timer services async exception handler
LOG.warn("Error while emitting latency marker.", t);
}
}
},
0L,
latencyTrackingInterval);
}

 

source.run,当isLatencyTrackingEnabled,schedule latency marker

public void run(final Object lockingObject, final Output<StreamRecord<OUT>> collector) throws Exception {
final TimeCharacteristic timeCharacteristic = getOperatorConfig().getTimeCharacteristic(); LatencyMarksEmitter latencyEmitter = null;
if(getExecutionConfig().isLatencyTrackingEnabled()) {
latencyEmitter = new LatencyMarksEmitter<>(
getProcessingTimeService(),
collector,
getExecutionConfig().getLatencyTrackingInterval(),
getOperatorConfig().getVertexID(),
getRuntimeContext().getIndexOfThisSubtask());
}

 

 

StreamInputProcessor –> processInput

如果是isLatencyMarker

else if(recordOrMark.isLatencyMarker()) {
// handle latency marker
synchronized (lock) {
streamOperator.processLatencyMarker(recordOrMark.asLatencyMarker());
}
continue;
}

对于,chaining, ChainingOutput

private static class ChainingOutput<T> implements Output<StreamRecord<T>> {

    protected final OneInputStreamOperator<T, ?> operator;
protected final Counter numRecordsIn; @Override
public void emitLatencyMarker(LatencyMarker latencyMarker) {
try {
operator.processLatencyMarker(latencyMarker);
}
catch (Exception e) {
throw new ExceptionInChainedOperatorException(e);
}
}

 

AbstractStreamOperator

public void processLatencyMarker(LatencyMarker latencyMarker) throws Exception {
reportOrForwardLatencyMarker(latencyMarker);
}

 

protected void reportOrForwardLatencyMarker(LatencyMarker marker) {
// all operators are tracking latencies
this.latencyGauge.reportLatency(marker, false); // everything except sinks forwards latency markers
this.output.emitLatencyMarker(marker);
}

调用到latencyGauge.reportLatency,逻辑如上

后续继续emitLatencyMarker

 

currentLowWatermark, checkpointAlignmentTime

OneInputStreamTask
/**
 * Creates the input processor over all input gates and attaches the task's IO metric group
 * to it. Does nothing when the task has no inputs.
 */
@Override
public void init() throws Exception {
    if (numberOfInputs <= 0) {
        return;
    }

    final InputGate[] gates = getEnvironment().getAllInputGates();
    inputProcessor = new StreamInputProcessor<IN>(
        gates,
        inSerializer,
        this,
        configuration.getCheckpointMode(),
        getEnvironment().getIOManager(),
        getEnvironment().getTaskManagerInfo().getConfiguration());

    // make sure that stream tasks report their I/O statistics
    inputProcessor.setMetricGroup(getEnvironment().getMetricGroup().getIOMetricGroup());
}

 

StreamInputProcessor
    /**
     * Registers two gauges on the given group: "currentLowWatermark", which reports
     * {@code lastEmittedWatermark}, and "checkpointAlignmentTime", which reports the barrier
     * handler's alignment duration in nanoseconds.
     *
     * @param metrics metric group the gauges are registered on
     */
    public void setMetricGroup(TaskIOMetricGroup metrics) {
        final Gauge<Long> lowWatermarkGauge = new Gauge<Long>() {
            @Override
            public Long getValue() {
                return lastEmittedWatermark;
            }
        };
        metrics.gauge("currentLowWatermark", lowWatermarkGauge);

        final Gauge<Long> alignmentGauge = new Gauge<Long>() {
            @Override
            public Long getValue() {
                return barrierHandler.getAlignmentDurationNanos();
            }
        };
        metrics.gauge("checkpointAlignmentTime", alignmentGauge);
    }

 

currentLowWatermark,即lastEmittedWatermark

默认值是,

lastEmittedWatermark = Long.MIN_VALUE;

所以如果没有assignTimestampsAndWatermarks,那么currentLowWatermark会是一个极大的负数

    public boolean processInput(OneInputStreamOperator<IN, ?> streamOperator, final Object lock) throws Exception {
while (true) {
if (currentRecordDeserializer != null) { if (result.isFullRecord()) {
StreamElement recordOrMark = deserializationDelegate.getInstance(); if (recordOrMark.isWatermark()) {
long watermarkMillis = recordOrMark.asWatermark().getTimestamp();
if (watermarkMillis > watermarks[currentChannel]) { // 更新每个channel对应的waterMark
watermarks[currentChannel] = watermarkMillis;
long newMinWatermark = Long.MAX_VALUE;
for (long watermark: watermarks) { // 找出所有channel最小的watermark,以最小的为准
newMinWatermark = Math.min(watermark, newMinWatermark);
}
if (newMinWatermark > lastEmittedWatermark) {
lastEmittedWatermark = newMinWatermark; // 将最小的watermark设为lastEmittedWatermark
synchronized (lock) {
streamOperator.processWatermark(new Watermark(lastEmittedWatermark));
}
}
}
continue;
}

 

checkpointAlignmentTime

barrierHandler.getAlignmentDurationNanos

    /**
     * Returns the alignment duration in nanoseconds: the time elapsed since
     * {@code startOfAlignmentTimestamp} when that timestamp is positive, otherwise the stored
     * {@code latestAlignmentDurationNanos}.
     */
    @Override
    public long getAlignmentDurationNanos() {
        final long alignmentStart = this.startOfAlignmentTimestamp;
        if (alignmentStart > 0) {
            return System.nanoTime() - alignmentStart;
        }
        return latestAlignmentDurationNanos;
    }

startOfAlignmentTimestamp是在这次checkpoint开始的时候打的时间戳,即beginNewAlignment

    /**
     * Starts the alignment for a new checkpoint: records the checkpoint id, handles the barrier
     * on the given channel via {@code onBarrier}, then stamps the start of the alignment.
     *
     * @param checkpointId id of the checkpoint whose alignment begins
     * @param channelIndex index of the channel the barrier arrived on
     */
    private void beginNewAlignment(long checkpointId, int channelIndex) throws IOException {
        this.currentCheckpointId = checkpointId;
        onBarrier(channelIndex);

        this.startOfAlignmentTimestamp = System.nanoTime();
    }

beginNewAlignment在

processBarrier中被调用,
        // Decide what to do with a received barrier, depending on whether an alignment is
        // already in progress and how the barrier id compares with the current checkpoint id.
        if (numBarriersReceived > 0) {
            // this is only true if some alignment is already progress and was not canceled

            if (barrierId == currentCheckpointId) {
                // regular case
                onBarrier(channelIndex);
            }
            else if (barrierId > currentCheckpointId) {
                // a barrier with a newer checkpoint id arrived, so the current id is outdated
                // and a new alignment has to be started
                // we did not complete the current checkpoint, another started before
                LOG.warn("Received checkpoint barrier for checkpoint {} before completing current checkpoint {}. " +
                    "Skipping current checkpoint.", barrierId, currentCheckpointId);

                // let the task know we are not completing this
                notifyAbort(currentCheckpointId, new CheckpointDeclineSubsumedException(barrierId));

                // abort the current checkpoint
                releaseBlocksAndResetBarriers();

                // begin a the new checkpoint
                beginNewAlignment(barrierId, channelIndex);
            }
            else {
                // ignore trailing barrier from an earlier checkpoint (obsolete now)
                return;
            }
        }
        else if (barrierId > currentCheckpointId) {
            // first barrier of a new checkpoint
            beginNewAlignment(barrierId, channelIndex);
        }

所以checkpointAlignmentTime的意思是,当前的checkpoint已经等待多久,因为要等到所有input channel的barrier,checkpoint才会触发

单位是纳秒,所以billion级别代表秒

如果比较大,说明各个并发之间的延迟差异较大,或延迟较高

Flink – metrics V1.2的更多相关文章

  1. Flink - metrics

      Metrics是以MetricsGroup来组织的 MetricGroup MetricGroup 这就是个metric容器,里面可以放subGroup,或者各种metric 所以主要的接口就是注 ...

  2. Flink Metrics 源码解析

    Flink Metrics 有如下模块: Flink Metrics 源码解析 -- Flink-metrics-core Flink Metrics 源码解析 -- Flink-metrics-da ...

  3. 深入理解Flink ---- Metrics的内部结构

    从Metrics的使用说起 Flink的Metrics种类有四种Counters, Gauges, Histograms和Meters. 如何使用Metrics呢? 以Counter为例, publi ...

  4. Apache Flink 进阶(八):详解 Metrics 原理与实战

    本文由 Apache Flink Contributor 刘彪分享,本文对两大问题进行了详细的介绍,即什么是 Metrics.如何使用 Metrics,并对 Metrics 监控实战进行解释说明. 什 ...

  5. Flink写入kafka时,只写入kafka的部分Partitioner,无法写所有的Partitioner问题

    1. 写在前面 在利用flink实时计算的时候,往往会从kafka读取数据写入数据到kafka,但会发现当kafka多个Partitioner时,特别在P量级数据为了kafka的性能kafka的节点有 ...

  6. flink metric库的使用和自定义metric-reporter

    简单介绍 flink内部实现了一套metric数据收集库. 同时flink自身系统有一些固定的metric数据, 包括系统的一些指标,CPU,内存, IO 或者各个task运行的一些指标.具体包含那些 ...

  7. Flink知识点

    1. Flink.Storm.Sparkstreaming对比 Storm只支持流处理任务,数据是一条一条的源源不断地处理,而MapReduce.spark只支持批处理任务,spark-streami ...

  8. Flink 灵魂两百问,这谁顶得住?

    Flink 学习 https://github.com/zhisheng17/flink-learning 麻烦路过的各位亲给这个项目点个 star,太不易了,写了这么多,算是对我坚持下来的一种鼓励吧 ...

  9. Flink 从0到1学习 —— Flink 中如何管理配置?

    前言 如果你了解 Apache Flink 的话,那么你应该熟悉该如何像 Flink 发送数据或者如何从 Flink 获取数据.但是在某些情况下,我们需要将配置数据发送到 Flink 集群并从中接收一 ...

随机推荐

  1. 【Linux高级驱动】I2C驱动框架分析

    1.i2c-dev.c(i2c设备驱动组件层) 功能:1)给用户提供接口 i2c_dev_init  //入口函数 /*申请主设备号*/ register_chrdev(I2C_MAJOR(), &q ...

  2. 【Java】接口(interface)VS抽象类

    接口(interface)可以说成是抽象类的一种特例,接口中的所有方法都必须是抽象的.接口中的方法定义默认为public abstract类型,接口中的成员变量类型默认为public static f ...

  3. Python3自定义json逐层解析器

    [本文出自天外归云的博客园] 用python3对json内容逐层进行解析,拿中国天气网的接口返回数据测试,代码如下: # -*- coding: utf-8 -*- import operator a ...

  4. Android开发(十五)——ListView中Items的间距margin

    ListView中Items没有margin 参考:http://www.cnblogs.com/xitang/p/3677528.html

  5. 【原】使用Json作为Python和C#混合编程时对象转换的中间文件

    一.Python中自定义类对象json字符串化的步骤[1]   1. 用 json 或者simplejson 就可以: 2.定义转换函数: 3. 定义类 4. 生成对象 5.dumps执行,引入转换函 ...

  6. 从git上check out指定的文件夹至本地

    当项目过大时,从服务器上拉取项目是件很头疼的事情,那么就说说怎么只拉区某个或几个文件夹至本地. git clone -n git@172.0.0.10:test/test_platform.git c ...

  7. 【转帖】39个让你受益的HTML5教程

    39个让你受益的HTML5教程                    闲话少说,本文作者为大家收集了网上学习HTML5的资源,期望它们可以帮助大家更好地学习HTML5. 好人啊! 不过,作者原来说的4 ...

  8. circRNA 在人和小鼠脑组织中的表达

    circRNA 是一类动物体内的内源性的RNA,尽管circRNA的种类丰富,但是其在神经系统中的 功能,并不清楚.科学家通过对人和小鼠的不同脑部组织的RNA 测序,发现了上千种circRNA,经过分 ...

  9. EasyRadius 动态域名DDNS设置工具,支持WayOS三代,完美解决近段时间3322和每步不稳定问题

    以下软件只适合拥有公网IP的用户哦,要是您没有公网IP,只能和我们联系,获取VPN了 EasyRadius从1.65开始就提供DDNS,中途由于我们升级了安全性,导致DDNS更新失败 这段时间由于33 ...

  10. [Tensorflow] RNN - 04. Work with CNN for Text Classification

    Ref: Combining CNN and RNN for spoken language identification Ref: Convolutional Methods for Text [1 ...