1 ==> 接受消息,org.apache.spark.executor.CoarseGrainedExecutorBackend#receive

    case LaunchTask(data) =>
if (executor == null) {
exitExecutor(1, "Received LaunchTask command but executor was null")
} else {
val taskDesc = TaskDescription.decode(data.value)
logInfo("Got assigned task " + taskDesc.taskId)
executor.launchTask(this, taskDesc)
}

2. ==> org.apache.spark.executor.Executor#launchTask

  // Maintains the list of running tasks.
private val runningTasks = new ConcurrentHashMap[Long, TaskRunner] def launchTask(context: ExecutorBackend, taskDescription: TaskDescription): Unit = {
val tr = new TaskRunner(context, taskDescription)
runningTasks.put(taskDescription.taskId, tr)
threadPool.execute(tr)
}

3. ==>org.apache.spark.executor.Executor.TaskRunner#run

override def run(): Unit = {
threadId = Thread.currentThread.getId
Thread.currentThread.setName(threadName)
val threadMXBean = ManagementFactory.getThreadMXBean
val taskMemoryManager = new TaskMemoryManager(env.memoryManager, taskId) //下载依赖
updateDependencies(taskDescription.addedFiles, taskDescription.addedJars)
//反序列化得到真正的 task
task = ser.deserialize[Task[Any]](taskDescription.serializedTask, Thread.currentThread.getContextClassLoader)
task.localProperties = taskDescription.properties
task.setTaskMemoryManager(taskMemoryManager) val value = Utils.tryWithSafeFinally {
val res = task.run(
taskAttemptId = taskId,
attemptNumber = taskDescription.attemptNumber,
metricsSystem = env.metricsSystem)
threwException = false
res
} {
val releasedLocks = env.blockManager.releaseAllLocksForTask(taskId)
val freedMemory = taskMemoryManager.cleanUpAllAllocatedMemory()
}
//处理执行结果
val resultSer = env.serializer.newInstance()
val beforeSerialization = System.currentTimeMillis()
val valueBytes = resultSer.serialize(value)
val afterSerialization = System.currentTimeMillis() // Note: accumulator updates must be collected after TaskMetrics is updated
val accumUpdates = task.collectAccumulatorUpdates()
// TODO: do not serialize value twice
val directResult = new DirectTaskResult(valueBytes, accumUpdates)
val serializedDirectResult = ser.serialize(directResult)
val resultSize = serializedDirectResult.limit() // directSend = sending directly back to the driver
val serializedResult: ByteBuffer = {
if (maxResultSize > 0 && resultSize > maxResultSize) {
logWarning(s"Finished $taskName (TID $taskId). Result is larger than maxResultSize " +
s"(${Utils.bytesToString(resultSize)} > ${Utils.bytesToString(maxResultSize)}), " +
s"dropping it.")
ser.serialize(new IndirectTaskResult[Any](TaskResultBlockId(taskId), resultSize))
} else if (resultSize > maxDirectResultSize) {
val blockId = TaskResultBlockId(taskId)
env.blockManager.putBytes(
blockId,
new ChunkedByteBuffer(serializedDirectResult.duplicate()),
StorageLevel.MEMORY_AND_DISK_SER)
logInfo(
s"Finished $taskName (TID $taskId). $resultSize bytes result sent via BlockManager)")
ser.serialize(new IndirectTaskResult[Any](blockId, resultSize))
} else {
logInfo(s"Finished $taskName (TID $taskId). $resultSize bytes result sent to driver")
serializedDirectResult
}
} setTaskFinishedAndClearInterruptStatus()
execBackend.statusUpdate(taskId, TaskState.FINISHED, serializedResult)
}

==> org.apache.spark.executor.Executor#updateDependencies

  /**
* Download any missing dependencies if we receive a new set of files and JARs from the
* SparkContext. Also adds any new JARs we fetched to the class loader.
*/
private def updateDependencies(newFiles: Map[String, Long], newJars: Map[String, Long]) {
lazy val hadoopConf = SparkHadoopUtil.get.newConfiguration(conf)
synchronized {
// Fetch missing dependencies
for ((name, timestamp) <- newFiles if currentFiles.getOrElse(name, -1L) < timestamp) {
logInfo("Fetching " + name + " with timestamp " + timestamp)
// Fetch file with useCache mode, close cache for local mode.
Utils.fetchFile(name, new File(SparkFiles.getRootDirectory()), conf,
env.securityManager, hadoopConf, timestamp, useCache = !isLocal)
currentFiles(name) = timestamp
}
for ((name, timestamp) <- newJars) {
val localName = new URI(name).getPath.split("/").last
val currentTimeStamp = currentJars.get(name)
.orElse(currentJars.get(localName))
.getOrElse(-1L)
if (currentTimeStamp < timestamp) {
logInfo("Fetching " + name + " with timestamp " + timestamp)
// Fetch file with useCache mode, close cache for local mode.
Utils.fetchFile(name, new File(SparkFiles.getRootDirectory()), conf,
env.securityManager, hadoopConf, timestamp, useCache = !isLocal)
currentJars(name) = timestamp
// Add it to our class loader
val url = new File(SparkFiles.getRootDirectory(), localName).toURI.toURL
if (!urlClassLoader.getURLs().contains(url)) {
logInfo("Adding " + url + " to class loader")
urlClassLoader.addURL(url)
}
}
}
}
}

==> org.apache.spark.scheduler.Task#run

 final def run(
taskAttemptId: Long,
attemptNumber: Int,
metricsSystem: MetricsSystem): T = {
SparkEnv.get.blockManager.registerTask(taskAttemptId) val taskContext = new TaskContextImpl(
stageId,
stageAttemptId, // stageAttemptId and stageAttemptNumber are semantically equal
partitionId,
taskAttemptId,
attemptNumber,
taskMemoryManager,
localProperties,
metricsSystem,
metrics) context = if (isBarrier) {
new BarrierTaskContext(taskContext)
} else {
taskContext
} TaskContext.setTaskContext(context)
taskThread = Thread.currentThread() if (_reasonIfKilled != null) {
kill(interruptThread = false, _reasonIfKilled)
} new CallerContext(
"TASK",
SparkEnv.get.conf.get(APP_CALLER_CONTEXT),
appId,
appAttemptId,
jobId,
Option(stageId),
Option(stageAttemptId),
Option(taskAttemptId),
Option(attemptNumber)).setCurrentContext() try {
//这个类只是一个模板类或者抽象类, 具体实现类分为ResultTask, ShuffleMapTask 两种
runTask(context)
}
}

==>org.apache.spark.scheduler.ShuffleMapTask#runTask

ShuffleMapTask将rdd的元素,切分为多个bucket, 基于ShuffleDependency指定的partitioner,默认就是HashPartitioner

ShuffleMapTask 核心方法是 RDD.iterator[底层调用 compute 方法(fn(context,index,partition))],

执行完成rdd之后,rdd或返回处理过后的partition数据,这些数据通过shuffleWriter在经过HashPartitioner写入对应的分区中

// ShuffleMapTask将rdd的元素,切分为多个bucket
// 基于ShuffleDependency指定的partitioner,默认就是HashPartitioner
private[spark] class ShuffleMapTask(
...
// ShuffleMapTask的 runTask 有 MapStatus返回值
override def runTask(context: TaskContext): MapStatus = {
// Deserialize the RDD using the broadcast variable.
val threadMXBean = ManagementFactory.getThreadMXBean
val deserializeStartTime = System.currentTimeMillis()
val deserializeStartCpuTime = if (threadMXBean.isCurrentThreadCpuTimeSupported) {
threadMXBean.getCurrentThreadCpuTime
} else 0L // 对task要处理的数据,做反序列化操作 val ser = SparkEnv.get.closureSerializer.newInstance()
//获得 RDD
val (rdd, dep) = ser.deserialize[(RDD[_], ShuffleDependency[_, _, _])](
ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader)
_executorDeserializeTime = System.currentTimeMillis() - deserializeStartTime
_executorDeserializeCpuTime = if (threadMXBean.isCurrentThreadCpuTimeSupported) {
threadMXBean.getCurrentThreadCpuTime - deserializeStartCpuTime
} else 0L var writer: ShuffleWriter[Any, Any] = null
try {
// 拿到shuffleManager
val manager = SparkEnv.get.shuffleManager
// 拿到shuffleWriter
writer = manager.getWriter[Any, Any](dep.shuffleHandle, partitionId, context) // 核心逻辑,调用rdd的iterator方法,并且传入了当前要处理的partition
// 执行完成rdd之后,rdd或返回处理过后的partition数据,这些数据通过shuffleWriter
// 在经过HashPartitioner写入对应的分区中 writer.write(rdd.iterator(partition, context).asInstanceOf[Iterator[_ <: Product2[Any, Any]]]) // 返回结果 MapStatus ,里面封装了ShuffleMapTask存储在哪里,其实就是BlockManager相关信息
writer.stop(success = true).get
}
}
...
}

==> org.apache.spark.scheduler.ResultTask#runTask

  override def runTask(context: TaskContext): U = {
// Deserialize the RDD and the func using the broadcast variables.
val threadMXBean = ManagementFactory.getThreadMXBean
val deserializeStartTime = System.currentTimeMillis()
val deserializeStartCpuTime = if (threadMXBean.isCurrentThreadCpuTimeSupported) {
threadMXBean.getCurrentThreadCpuTime
} else 0L
val ser = SparkEnv.get.closureSerializer.newInstance()
val (rdd, func) = ser.deserialize[(RDD[T], (TaskContext, Iterator[T]) => U)](
ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader)
_executorDeserializeTime = System.currentTimeMillis() - deserializeStartTime
_executorDeserializeCpuTime = if (threadMXBean.isCurrentThreadCpuTimeSupported) {
threadMXBean.getCurrentThreadCpuTime - deserializeStartCpuTime
} else 0L //直接调用用户自定义函数
func(context, rdd.iterator(partition, context))
}

==> org.apache.spark.rdd.RDD#iterator

 final def iterator(split: Partition, context: TaskContext): Iterator[T] = {
   //结果不需要存储
if (storageLevel != StorageLevel.NONE) {
getOrCompute(split, context)
} else {
computeOrReadCheckpoint(split, context)
}
}

==> org.apache.spark.rdd.RDD#computeOrReadCheckpoint

  /**
* Compute an RDD partition or read it from a checkpoint if the RDD is checkpointing.
*/
private[spark] def computeOrReadCheckpoint(split: Partition, context: TaskContext): Iterator[T] =
{
if (isCheckpointedAndMaterialized) {
firstParent[T].iterator(split, context)
} else {
//核心方法, 此方法为虚方法,具体实现由具体 RDD 子类实现,如 MapPartitionsRDD,JdbcRDD等
compute(split, context)
}
}

demo:

class MapPartitionsRDD[U: ClassTag, T: ClassTag](
var prev: RDD[T],
f: (TaskContext, Int, Iterator[T]) => Iterator[U], // (TaskContext, partition index, iterator)
preservesPartitioning: Boolean = false,
isFromBarrier: Boolean = false,
isOrderSensitive: Boolean = false)
extends RDD[U](prev) { override def compute(split: Partition, context: TaskContext): Iterator[U] =
f(context, split.index, firstParent[T].iterator(split, context)) } class JdbcRDD[T: ClassTag](
sc: SparkContext,
getConnection: () => Connection,
sql: String,
lowerBound: Long,
upperBound: Long,
numPartitions: Int,
mapRow: (ResultSet) => T = JdbcRDD.resultSetToObjectArray _)
extends RDD[T](sc, Nil) with Logging {
override def getPartitions: Array[Partition] = {
// bounds are inclusive, hence the + 1 here and - 1 on end
val length = BigInt(1) + upperBound - lowerBound
(0 until numPartitions).map { i =>
val start = lowerBound + ((i * length) / numPartitions)
val end = lowerBound + (((i + 1) * length) / numPartitions) - 1
new JdbcPartition(i, start.toLong, end.toLong)
}.toArray
} override def compute(thePart: Partition, context: TaskContext): Iterator[T] = new NextIterator[T]
{
context.addTaskCompletionListener[Unit]{ context => closeIfNeeded() }
val part = thePart.asInstanceOf[JdbcPartition]
val conn = getConnection()
val stmt = conn.prepareStatement(sql, ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY) val url = conn.getMetaData.getURL val rs = stmt.executeQuery() override def getNext(): T = {
if (rs.next()) {
mapRow(rs)
} else {
finished = true
null.asInstanceOf[T]
}
} override def close() { }
}
}

spark源码分析, 任务反序列化及执行的更多相关文章

  1. spark 源码分析之二十一 -- Task的执行流程

    引言 在上两篇文章 spark 源码分析之十九 -- DAG的生成和Stage的划分 和 spark 源码分析之二十 -- Stage的提交 中剖析了Spark的DAG的生成,Stage的划分以及St ...

  2. Spark 源码分析 -- task实际执行过程

    Spark源码分析 – SparkContext 中的例子, 只分析到sc.runJob 那么最终是怎么执行的? 通过DAGScheduler切分成Stage, 封装成taskset, 提交给Task ...

  3. Spark源码分析之九:内存管理模型

    Spark是现在很流行的一个基于内存的分布式计算框架,既然是基于内存,那么自然而然的,内存的管理就是Spark存储管理的重中之重了.那么,Spark究竟采用什么样的内存管理模型呢?本文就为大家揭开Sp ...

  4. Spark源码分析之八:Task运行(二)

    在<Spark源码分析之七:Task运行(一)>一文中,我们详细叙述了Task运行的整体流程,最终Task被传输到Executor上,启动一个对应的TaskRunner线程,并且在线程池中 ...

  5. Spark源码分析之七:Task运行(一)

    在Task调度相关的两篇文章<Spark源码分析之五:Task调度(一)>与<Spark源码分析之六:Task调度(二)>中,我们大致了解了Task调度相关的主要逻辑,并且在T ...

  6. Spark源码分析之四:Stage提交

    各位看官,上一篇<Spark源码分析之Stage划分>详细讲述了Spark中Stage的划分,下面,我们进入第三个阶段--Stage提交. Stage提交阶段的主要目的就一个,就是将每个S ...

  7. Spark源码分析之三:Stage划分

    继上篇<Spark源码分析之Job的调度模型与运行反馈>之后,我们继续来看第二阶段--Stage划分. Stage划分的大体流程如下图所示: 前面提到,对于JobSubmitted事件,我 ...

  8. spark 源码分析之十六 -- Spark内存存储剖析

    上篇spark 源码分析之十五 -- Spark内存管理剖析 讲解了Spark的内存管理机制,主要是MemoryManager的内容.跟Spark的内存管理机制最密切相关的就是内存存储,本篇文章主要介 ...

  9. spark 源码分析之十八 -- Spark存储体系剖析

    本篇文章主要剖析BlockManager相关的类以及总结Spark底层存储体系. 总述 先看 BlockManager相关类之间的关系如下: 我们从NettyRpcEnv 开始,做一下简单说明. Ne ...

随机推荐

  1. 键盘敲入 A 字母时,操作系统期间发生了什么

    前言 键盘可以说是我们最常使用的输入硬件设备了,但身为程序员的你,你知道「键盘敲入A 字母时,操作系统期间发生了什么吗」? 那要想知道这个发生的过程,我们得先了解了解「操作系统是如何管理多种多样的的输 ...

  2. 牛客网PAT练习场-有几个PAT

    题目地址:https://www.nowcoder.com/pat/6/problem/4066 题意:求pat->求pa->求p /** * *作者:YCute *时间:2019-12- ...

  3. Tensorflow2(二)tf.data输入模块

    代码和其他资料在 github 一.tf.data模块 数据分割 import tensorflow as tf dataset = tf.data.Dataset.from_tensor_slice ...

  4. 微信小程序自动化

    解析微信小程序 注意:若上面方法不行就使用下面的 小程序对应的chrome驱动版本包,2.4版本的

  5. 基于laravel的有偿开源流程引擎

    系统主要文档已经编写完成,具体请前往查看[系统文档](https://www.kancloud.cn/lijianlin/jishullin_workflow_engine/1894424 " ...

  6. JS深浅拷贝及其实现

    基本数据类型和引用数据类型 JS数据分为基本数据类型和引用数据类型.基本数据类型的变量存储在栈中,引用数据类型则存储在堆中,引用数据类型的存储地址则保存在栈中. 下面来看一个小例子 // 基本数据类型 ...

  7. 攻防世界——web新手练习区解题记录<1>(1-4题)

    web新手练习区一至四题 第一题view_source: 题目说右键不管用了,我们先获取在线场景来看一看,我们看到这样一个网页,并且右键确实点了没什么反应,而用到右键一般就是查看网页源码 用快捷键(F ...

  8. java初探(1)之登录初解

    初识登录 登录的应用场景 登录比较常见,大多数网站都有登录的操作.然后登录本身也从简单到复杂有着漫长的发展历史.本文记录博主对登录的应用场景的剖析,深究不在于学习如何实现,主要关注其编码思想,过程中用 ...

  9. Azure Blob (三)参数设置说明

    一,引言 上一篇将 Azure Blob 存储的时候,有使用到一个 .NET  Core Web 项目,通过代码的方式进行操作 Azure Blob 的数据,接着上一篇的内容,今天继续看一下代码,具体 ...

  10. Django模型验证器详解和源码分析

    转发请注明来源 在Django的模型字段参数中,有一个参数叫做validators,这个参数是用来指定当前字段需要使用的验证器,也就是对字段数据的合法性进行验证,比如大小.类型等. Django的验证 ...